Update Linux to v5.4.2

Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index abb6660..b44b1c3 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 menuconfig INFINIBAND
 	tristate "InfiniBand support"
 	depends on HAS_IOMEM && HAS_DMA
@@ -6,6 +7,7 @@
 	depends on m || IPV6 != m
 	depends on !ALPHA
 	select IRQ_POLL
+	select DIMLIB
 	---help---
 	  Core support for InfiniBand (IB).  Make sure to also select
 	  any protocols you wish to use as well as drivers for your
@@ -25,7 +27,7 @@
 
 config INFINIBAND_USER_ACCESS
 	tristate "InfiniBand userspace access (verbs and CM)"
-	select ANON_INODES
+	depends on MMU
 	---help---
 	  Userspace InfiniBand access support.  This enables the
 	  kernel side of userspace verbs and the userspace
@@ -35,17 +37,6 @@
 	  libibverbs, libibcm and a hardware driver library from
 	  rdma-core <https://github.com/linux-rdma/rdma-core>.
 
-config INFINIBAND_USER_ACCESS_UCM
-	tristate "Userspace CM (UCM, DEPRECATED)"
-	depends on BROKEN || COMPILE_TEST
-	depends on INFINIBAND_USER_ACCESS
-	help
-	  The UCM module has known security flaws, which no one is
-	  interested to fix. The user-space part of this code was
-	  dropped from the upstream a long time ago.
-
-	  This option is DEPRECATED and planned to be removed.
-
 config INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI
 	bool "Allow experimental legacy verbs in new ioctl uAPI  (EXPERIMENTAL)"
 	depends on INFINIBAND_USER_ACCESS
@@ -64,6 +55,7 @@
 	bool "InfiniBand on-demand paging support"
 	depends on INFINIBAND_USER_MEM
 	select MMU_NOTIFIER
+	select INTERVAL_TREE
 	default y
 	---help---
 	  On demand paging support for the InfiniBand subsystem.
@@ -88,18 +80,26 @@
 	  This allows the user to config the default GID type that the CM
 	  uses for each device, when initiating new connections.
 
+if INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS
 source "drivers/infiniband/hw/mthca/Kconfig"
 source "drivers/infiniband/hw/qib/Kconfig"
 source "drivers/infiniband/hw/cxgb3/Kconfig"
 source "drivers/infiniband/hw/cxgb4/Kconfig"
+source "drivers/infiniband/hw/efa/Kconfig"
 source "drivers/infiniband/hw/i40iw/Kconfig"
 source "drivers/infiniband/hw/mlx4/Kconfig"
 source "drivers/infiniband/hw/mlx5/Kconfig"
-source "drivers/infiniband/hw/nes/Kconfig"
 source "drivers/infiniband/hw/ocrdma/Kconfig"
 source "drivers/infiniband/hw/vmw_pvrdma/Kconfig"
 source "drivers/infiniband/hw/usnic/Kconfig"
 source "drivers/infiniband/hw/hns/Kconfig"
+source "drivers/infiniband/hw/bnxt_re/Kconfig"
+source "drivers/infiniband/hw/hfi1/Kconfig"
+source "drivers/infiniband/hw/qedr/Kconfig"
+source "drivers/infiniband/sw/rdmavt/Kconfig"
+source "drivers/infiniband/sw/rxe/Kconfig"
+source "drivers/infiniband/sw/siw/Kconfig"
+endif
 
 source "drivers/infiniband/ulp/ipoib/Kconfig"
 
@@ -110,13 +110,5 @@
 source "drivers/infiniband/ulp/isert/Kconfig"
 
 source "drivers/infiniband/ulp/opa_vnic/Kconfig"
-source "drivers/infiniband/sw/rdmavt/Kconfig"
-source "drivers/infiniband/sw/rxe/Kconfig"
-
-source "drivers/infiniband/hw/hfi1/Kconfig"
-
-source "drivers/infiniband/hw/qedr/Kconfig"
-
-source "drivers/infiniband/hw/bnxt_re/Kconfig"
 
 endif # INFINIBAND
diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile
index fad0b44..8603cdf 100644
--- a/drivers/infiniband/Makefile
+++ b/drivers/infiniband/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_INFINIBAND)		+= core/
 obj-$(CONFIG_INFINIBAND)		+= hw/
 obj-$(CONFIG_INFINIBAND)		+= ulp/
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 867cee5..09881bd 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -6,17 +6,14 @@
 					$(infiniband-y)
 obj-$(CONFIG_INFINIBAND_USER_MAD) +=	ib_umad.o
 obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o $(user_access-y)
-obj-$(CONFIG_INFINIBAND_USER_ACCESS_UCM) += ib_ucm.o $(user_access-y)
 
 ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
 				device.o fmr_pool.o cache.o netlink.o \
 				roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
 				multicast.o mad.o smi.o agent.o mad_rmpp.o \
-				nldev.o restrack.o
+				nldev.o restrack.o counters.o
 
 ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
-ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
-ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
 ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
 
 ib_cm-y :=			cm.o
@@ -31,11 +28,11 @@
 
 ib_umad-y :=			user_mad.o
 
-ib_ucm-y :=			ucm.o
-
 ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
 				rdma_core.o uverbs_std_types.o uverbs_ioctl.o \
 				uverbs_std_types_cq.o \
 				uverbs_std_types_flow_action.o uverbs_std_types_dm.o \
 				uverbs_std_types_mr.o uverbs_std_types_counters.o \
-				uverbs_uapi.o
+				uverbs_uapi.o uverbs_std_types_device.o
+ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 46b855a..6d7ec37 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -42,9 +42,11 @@
 #include <net/neighbour.h>
 #include <net/route.h>
 #include <net/netevent.h>
-#include <net/addrconf.h>
+#include <net/ipv6_stubs.h>
 #include <net/ip6_route.h>
 #include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_sa.h>
 #include <rdma/ib.h>
 #include <rdma/rdma_netlink.h>
 #include <net/netlink.h>
@@ -61,6 +63,7 @@
 			 struct rdma_dev_addr *addr, void *context);
 	unsigned long timeout;
 	struct delayed_work work;
+	bool resolve_by_gid_attr;	/* Consider gid attr in resolve phase */
 	int status;
 	u32 seq;
 };
@@ -84,8 +87,8 @@
 	if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
 		return false;
 
-	ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
-			nlmsg_len(nlh), ib_nl_addr_policy, NULL);
+	ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+				   nlmsg_len(nlh), ib_nl_addr_policy, NULL);
 	if (ret)
 		return false;
 
@@ -180,7 +183,7 @@
 
 	/* Repair the nlmsg header length */
 	nlmsg_end(skb, nlh);
-	rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, GFP_KERNEL);
+	rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_LS, GFP_KERNEL);
 
 	/* Make the request retry, so when we get the response from userspace
 	 * we will have something.
@@ -219,18 +222,54 @@
 }
 EXPORT_SYMBOL(rdma_addr_size_kss);
 
-void rdma_copy_addr(struct rdma_dev_addr *dev_addr,
-		    const struct net_device *dev,
-		    const unsigned char *dst_dev_addr)
+/**
+ * rdma_copy_src_l2_addr - Copy netdevice source addresses
+ * @dev_addr:	Destination address pointer where to copy the addresses
+ * @dev:	Netdevice whose source addresses to copy
+ *
+ * rdma_copy_src_l2_addr() copies source addresses from the specified netdevice.
+ * This includes unicast address, broadcast address, device type and
+ * interface index.
+ */
+void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
+			   const struct net_device *dev)
 {
 	dev_addr->dev_type = dev->type;
 	memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
 	memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
-	if (dst_dev_addr)
-		memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
 	dev_addr->bound_dev_if = dev->ifindex;
 }
-EXPORT_SYMBOL(rdma_copy_addr);
+EXPORT_SYMBOL(rdma_copy_src_l2_addr);
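
/*
 * Illustrative usage sketch (hypothetical helper, not part of this patch):
 * a caller that already knows the bound ifindex copies the L2 source
 * addresses the same way rdma_translate_ip() does below.
 */
#include <linux/netdevice.h>
#include <rdma/ib_addr.h>

static int example_fill_src_l2(struct rdma_dev_addr *dev_addr)
{
	struct net_device *dev;

	dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
	if (!dev)
		return -ENODEV;

	/* Copies unicast/broadcast address, dev_type and ifindex. */
	rdma_copy_src_l2_addr(dev_addr, dev);
	dev_put(dev);
	return 0;
}
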
+
+static struct net_device *
+rdma_find_ndev_for_src_ip_rcu(struct net *net, const struct sockaddr *src_in)
+{
+	struct net_device *dev = NULL;
+	int ret = -EADDRNOTAVAIL;
+
+	switch (src_in->sa_family) {
+	case AF_INET:
+		dev = __ip_dev_find(net,
+				    ((const struct sockaddr_in *)src_in)->sin_addr.s_addr,
+				    false);
+		if (dev)
+			ret = 0;
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		for_each_netdev_rcu(net, dev) {
+			if (ipv6_chk_addr(net,
+					  &((const struct sockaddr_in6 *)src_in)->sin6_addr,
+					  dev, 1)) {
+				ret = 0;
+				break;
+			}
+		}
+		break;
+#endif
+	}
+	return ret ? ERR_PTR(ret) : dev;
+}
 
 int rdma_translate_ip(const struct sockaddr *addr,
 		      struct rdma_dev_addr *dev_addr)
@@ -241,38 +280,17 @@
 		dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
 		if (!dev)
 			return -ENODEV;
-		rdma_copy_addr(dev_addr, dev, NULL);
+		rdma_copy_src_l2_addr(dev_addr, dev);
 		dev_put(dev);
 		return 0;
 	}
 
-	switch (addr->sa_family) {
-	case AF_INET:
-		dev = ip_dev_find(dev_addr->net,
-			((const struct sockaddr_in *)addr)->sin_addr.s_addr);
-
-		if (!dev)
-			return -EADDRNOTAVAIL;
-
-		rdma_copy_addr(dev_addr, dev, NULL);
-		dev_put(dev);
-		break;
-#if IS_ENABLED(CONFIG_IPV6)
-	case AF_INET6:
-		rcu_read_lock();
-		for_each_netdev_rcu(dev_addr->net, dev) {
-			if (ipv6_chk_addr(dev_addr->net,
-					  &((const struct sockaddr_in6 *)addr)->sin6_addr,
-					  dev, 1)) {
-				rdma_copy_addr(dev_addr, dev, NULL);
-				break;
-			}
-		}
-		rcu_read_unlock();
-		break;
-#endif
-	}
-	return 0;
+	rcu_read_lock();
+	dev = rdma_find_ndev_for_src_ip_rcu(dev_addr->net, addr);
+	if (!IS_ERR(dev))
+		rdma_copy_src_l2_addr(dev_addr, dev);
+	rcu_read_unlock();
+	return PTR_ERR_OR_ZERO(dev);
 }
 EXPORT_SYMBOL(rdma_translate_ip);
 
@@ -295,15 +313,12 @@
 	spin_unlock_bh(&lock);
 }
 
-static int ib_nl_fetch_ha(const struct dst_entry *dst,
-			  struct rdma_dev_addr *dev_addr,
+static int ib_nl_fetch_ha(struct rdma_dev_addr *dev_addr,
 			  const void *daddr, u32 seq, u16 family)
 {
-	if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS))
+	if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS))
 		return -EADDRNOTAVAIL;
 
-	/* We fill in what we can, the response will fill the rest */
-	rdma_copy_addr(dev_addr, dst->dev, NULL);
 	return ib_nl_ip_send_msg(dev_addr, daddr, seq, family);
 }
 
@@ -322,7 +337,7 @@
 		neigh_event_send(n, NULL);
 		ret = -ENODATA;
 	} else {
-		rdma_copy_addr(dev_addr, dst->dev, n->ha);
+		neigh_ha_snapshot(dev_addr->dst_dev_addr, n, dst->dev);
 	}
 
 	neigh_release(n);
@@ -356,18 +371,22 @@
 		(const void *)&dst_in6->sin6_addr;
 	sa_family_t family = dst_in->sa_family;
 
-	/* Gateway + ARPHRD_INFINIBAND -> IB router */
-	if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND)
-		return ib_nl_fetch_ha(dst, dev_addr, daddr, seq, family);
+	/* If we have a gateway in IB mode then it must be an IB network */
+	if (has_gateway(dst, family) && dev_addr->network == RDMA_NETWORK_IB)
+		return ib_nl_fetch_ha(dev_addr, daddr, seq, family);
 	else
 		return dst_fetch_ha(dst, dev_addr, daddr);
 }
 
-static int addr4_resolve(struct sockaddr_in *src_in,
-			 const struct sockaddr_in *dst_in,
+static int addr4_resolve(struct sockaddr *src_sock,
+			 const struct sockaddr *dst_sock,
 			 struct rdma_dev_addr *addr,
 			 struct rtable **prt)
 {
+	struct sockaddr_in *src_in = (struct sockaddr_in *)src_sock;
+	const struct sockaddr_in *dst_in =
+			(const struct sockaddr_in *)dst_sock;
+
 	__be32 src_ip = src_in->sin_addr.s_addr;
 	__be32 dst_ip = dst_in->sin_addr.s_addr;
 	struct rtable *rt;
@@ -383,16 +402,8 @@
 	if (ret)
 		return ret;
 
-	src_in->sin_family = AF_INET;
 	src_in->sin_addr.s_addr = fl4.saddr;
 
-	/* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're
-	 * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network
-	 * type accordingly.
-	 */
-	if (rt->rt_uses_gateway && rt->dst.dev->type != ARPHRD_INFINIBAND)
-		addr->network = RDMA_NETWORK_IPV4;
-
 	addr->hoplimit = ip4_dst_hoplimit(&rt->dst);
 
 	*prt = rt;
@@ -400,14 +411,16 @@
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static int addr6_resolve(struct sockaddr_in6 *src_in,
-			 const struct sockaddr_in6 *dst_in,
+static int addr6_resolve(struct sockaddr *src_sock,
+			 const struct sockaddr *dst_sock,
 			 struct rdma_dev_addr *addr,
 			 struct dst_entry **pdst)
 {
+	struct sockaddr_in6 *src_in = (struct sockaddr_in6 *)src_sock;
+	const struct sockaddr_in6 *dst_in =
+				(const struct sockaddr_in6 *)dst_sock;
 	struct flowi6 fl6;
 	struct dst_entry *dst;
-	struct rt6_info *rt;
 	int ret;
 
 	memset(&fl6, 0, sizeof fl6);
@@ -419,19 +432,8 @@
 	if (ret < 0)
 		return ret;
 
-	rt = (struct rt6_info *)dst;
-	if (ipv6_addr_any(&src_in->sin6_addr)) {
-		src_in->sin6_family = AF_INET6;
+	if (ipv6_addr_any(&src_in->sin6_addr))
 		src_in->sin6_addr = fl6.saddr;
-	}
-
-	/* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're
-	 * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network
-	 * type accordingly.
-	 */
-	if (rt->rt6i_flags & RTF_GATEWAY &&
-	    ip6_dst_idev(dst)->dev->type != ARPHRD_INFINIBAND)
-		addr->network = RDMA_NETWORK_IPV6;
 
 	addr->hoplimit = ip6_dst_hoplimit(dst);
 
@@ -439,8 +441,8 @@
 	return 0;
 }
 #else
-static int addr6_resolve(struct sockaddr_in6 *src_in,
-			 const struct sockaddr_in6 *dst_in,
+static int addr6_resolve(struct sockaddr *src_sock,
+			 const struct sockaddr *dst_sock,
 			 struct rdma_dev_addr *addr,
 			 struct dst_entry **pdst)
 {
@@ -451,36 +453,110 @@
 static int addr_resolve_neigh(const struct dst_entry *dst,
 			      const struct sockaddr *dst_in,
 			      struct rdma_dev_addr *addr,
+			      unsigned int ndev_flags,
 			      u32 seq)
 {
-	if (dst->dev->flags & IFF_LOOPBACK) {
-		int ret;
+	int ret = 0;
 
-		ret = rdma_translate_ip(dst_in, addr);
-		if (!ret)
-			memcpy(addr->dst_dev_addr, addr->src_dev_addr,
-			       MAX_ADDR_LEN);
+	if (ndev_flags & IFF_LOOPBACK) {
+		memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
+	} else {
+		if (!(ndev_flags & IFF_NOARP)) {
+			/* If the device doesn't do ARP internally */
+			ret = fetch_ha(dst, addr, dst_in, seq);
+		}
+	}
+	return ret;
+}
 
-		return ret;
+static int copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
+			    const struct sockaddr *dst_in,
+			    const struct dst_entry *dst,
+			    const struct net_device *ndev)
+{
+	int ret = 0;
+
+	if (dst->dev->flags & IFF_LOOPBACK)
+		ret = rdma_translate_ip(dst_in, dev_addr);
+	else
+		rdma_copy_src_l2_addr(dev_addr, dst->dev);
+
+	/*
+	 * If there's a gateway and type of device not ARPHRD_INFINIBAND,
+	 * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the
+	 * network type accordingly.
+	 */
+	if (has_gateway(dst, dst_in->sa_family) &&
+	    ndev->type != ARPHRD_INFINIBAND)
+		dev_addr->network = dst_in->sa_family == AF_INET ?
+						RDMA_NETWORK_IPV4 :
+						RDMA_NETWORK_IPV6;
+	else
+		dev_addr->network = RDMA_NETWORK_IB;
+
+	return ret;
+}
+
+static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr,
+				 unsigned int *ndev_flags,
+				 const struct sockaddr *dst_in,
+				 const struct dst_entry *dst)
+{
+	struct net_device *ndev = READ_ONCE(dst->dev);
+
+	*ndev_flags = ndev->flags;
+	/* A physical device must be the RDMA device to use */
+	if (ndev->flags & IFF_LOOPBACK) {
+		/*
+		 * RDMA (IB/RoCE, iWarp) doesn't run on lo interface or
+		 * loopback IP address. So if route is resolved to loopback
+		 * interface, translate that to a real ndev based on non
+		 * loopback IP address.
+		 */
+		ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev), dst_in);
+		if (IS_ERR(ndev))
+			return -ENODEV;
 	}
 
-	/* If the device doesn't do ARP internally */
-	if (!(dst->dev->flags & IFF_NOARP))
-		return fetch_ha(dst, addr, dst_in, seq);
+	return copy_src_l2_addr(dev_addr, dst_in, dst, ndev);
+}
 
-	rdma_copy_addr(addr, dst->dev, NULL);
+static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr)
+{
+	struct net_device *ndev;
 
+	ndev = rdma_read_gid_attr_ndev_rcu(addr->sgid_attr);
+	if (IS_ERR(ndev))
+		return PTR_ERR(ndev);
+
+	/*
+	 * Since we are holding the rcu, reading net and ifindex
+	 * are safe without any additional reference; because
+	 * change_net_namespace() in net/core/dev.c does rcu sync
+	 * after it changes the state to IFF_DOWN and before
+	 * updating netdev fields {net, ifindex}.
+	 */
+	addr->net = dev_net(ndev);
+	addr->bound_dev_if = ndev->ifindex;
 	return 0;
 }
 
+static void rdma_addr_set_net_defaults(struct rdma_dev_addr *addr)
+{
+	addr->net = &init_net;
+	addr->bound_dev_if = 0;
+}
+
 static int addr_resolve(struct sockaddr *src_in,
 			const struct sockaddr *dst_in,
 			struct rdma_dev_addr *addr,
 			bool resolve_neigh,
+			bool resolve_by_gid_attr,
 			u32 seq)
 {
-	struct net_device *ndev;
-	struct dst_entry *dst;
+	struct dst_entry *dst = NULL;
+	unsigned int ndev_flags = 0;
+	struct rtable *rt = NULL;
 	int ret;
 
 	if (!addr->net) {
@@ -488,58 +564,55 @@
 		return -EINVAL;
 	}
 
+	rcu_read_lock();
+	if (resolve_by_gid_attr) {
+		if (!addr->sgid_attr) {
+			rcu_read_unlock();
+			pr_warn_ratelimited("%s: missing gid_attr\n", __func__);
+			return -EINVAL;
+		}
+		/*
+		 * If the request is for a specific gid attribute of the
+		 * rdma_dev_addr, derive net from the netdevice of the
+		 * GID attribute.
+		 */
+		ret = set_addr_netns_by_gid_rcu(addr);
+		if (ret) {
+			rcu_read_unlock();
+			return ret;
+		}
+	}
 	if (src_in->sa_family == AF_INET) {
-		struct rtable *rt = NULL;
-		const struct sockaddr_in *dst_in4 =
-			(const struct sockaddr_in *)dst_in;
-
-		ret = addr4_resolve((struct sockaddr_in *)src_in,
-				    dst_in4, addr, &rt);
-		if (ret)
-			return ret;
-
-		if (resolve_neigh)
-			ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq);
-
-		if (addr->bound_dev_if) {
-			ndev = dev_get_by_index(addr->net, addr->bound_dev_if);
-		} else {
-			ndev = rt->dst.dev;
-			dev_hold(ndev);
-		}
-
-		ip_rt_put(rt);
+		ret = addr4_resolve(src_in, dst_in, addr, &rt);
+		dst = &rt->dst;
 	} else {
-		const struct sockaddr_in6 *dst_in6 =
-			(const struct sockaddr_in6 *)dst_in;
+		ret = addr6_resolve(src_in, dst_in, addr, &dst);
+	}
+	if (ret) {
+		rcu_read_unlock();
+		goto done;
+	}
+	ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst);
+	rcu_read_unlock();
 
-		ret = addr6_resolve((struct sockaddr_in6 *)src_in,
-				    dst_in6, addr,
-				    &dst);
-		if (ret)
-			return ret;
+	/*
+	 * Resolve neighbor destination address if requested and
+	 * only if src addr translation didn't fail.
+	 */
+	if (!ret && resolve_neigh)
+		ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq);
 
-		if (resolve_neigh)
-			ret = addr_resolve_neigh(dst, dst_in, addr, seq);
-
-		if (addr->bound_dev_if) {
-			ndev = dev_get_by_index(addr->net, addr->bound_dev_if);
-		} else {
-			ndev = dst->dev;
-			dev_hold(ndev);
-		}
-
+	if (src_in->sa_family == AF_INET)
+		ip_rt_put(rt);
+	else
 		dst_release(dst);
-	}
-
-	if (ndev) {
-		if (ndev->flags & IFF_LOOPBACK)
-			ret = rdma_translate_ip(dst_in, addr);
-		else
-			addr->bound_dev_if = ndev->ifindex;
-		dev_put(ndev);
-	}
-
+done:
+	/*
+	 * Clear the addr net to go back to its original state, only if it was
+	 * derived from GID attribute in this context.
+	 */
+	if (resolve_by_gid_attr)
+		rdma_addr_set_net_defaults(addr);
 	return ret;
 }
 
@@ -554,7 +627,8 @@
 		src_in = (struct sockaddr *)&req->src_addr;
 		dst_in = (struct sockaddr *)&req->dst_addr;
 		req->status = addr_resolve(src_in, dst_in, req->addr,
-					   true, req->seq);
+					   true, req->resolve_by_gid_attr,
+					   req->seq);
 		if (req->status && time_after_eq(jiffies, req->timeout)) {
 			req->status = -ETIMEDOUT;
 		} else if (req->status == -ENODATA) {
@@ -586,10 +660,10 @@
 }
 
 int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr,
-		    struct rdma_dev_addr *addr, int timeout_ms,
+		    struct rdma_dev_addr *addr, unsigned long timeout_ms,
 		    void (*callback)(int status, struct sockaddr *src_addr,
 				     struct rdma_dev_addr *addr, void *context),
-		    void *context)
+		    bool resolve_by_gid_attr, void *context)
 {
 	struct sockaddr *src_in, *dst_in;
 	struct addr_req *req;
@@ -617,10 +691,12 @@
 	req->addr = addr;
 	req->callback = callback;
 	req->context = context;
+	req->resolve_by_gid_attr = resolve_by_gid_attr;
 	INIT_DELAYED_WORK(&req->work, process_one_req);
 	req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq);
 
-	req->status = addr_resolve(src_in, dst_in, addr, true, req->seq);
+	req->status = addr_resolve(src_in, dst_in, addr, true,
+				   req->resolve_by_gid_attr, req->seq);
 	switch (req->status) {
 	case 0:
 		req->timeout = jiffies;
@@ -641,25 +717,53 @@
 }
 EXPORT_SYMBOL(rdma_resolve_ip);
 
-int rdma_resolve_ip_route(struct sockaddr *src_addr,
-			  const struct sockaddr *dst_addr,
-			  struct rdma_dev_addr *addr)
+int roce_resolve_route_from_path(struct sa_path_rec *rec,
+				 const struct ib_gid_attr *attr)
 {
-	struct sockaddr_storage ssrc_addr = {};
-	struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr;
+	union {
+		struct sockaddr     _sockaddr;
+		struct sockaddr_in  _sockaddr_in;
+		struct sockaddr_in6 _sockaddr_in6;
+	} sgid, dgid;
+	struct rdma_dev_addr dev_addr = {};
+	int ret;
 
-	if (src_addr) {
-		if (src_addr->sa_family != dst_addr->sa_family)
-			return -EINVAL;
+	if (rec->roce.route_resolved)
+		return 0;
 
-		memcpy(src_in, src_addr, rdma_addr_size(src_addr));
-	} else {
-		src_in->sa_family = dst_addr->sa_family;
-	}
+	rdma_gid2ip((struct sockaddr *)&sgid, &rec->sgid);
+	rdma_gid2ip((struct sockaddr *)&dgid, &rec->dgid);
 
-	return addr_resolve(src_in, dst_addr, addr, false, 0);
+	if (sgid._sockaddr.sa_family != dgid._sockaddr.sa_family)
+		return -EINVAL;
+
+	if (!attr || !attr->ndev)
+		return -EINVAL;
+
+	dev_addr.net = &init_net;
+	dev_addr.sgid_attr = attr;
+
+	ret = addr_resolve((struct sockaddr *)&sgid, (struct sockaddr *)&dgid,
+			   &dev_addr, false, true, 0);
+	if (ret)
+		return ret;
+
+	if ((dev_addr.network == RDMA_NETWORK_IPV4 ||
+	     dev_addr.network == RDMA_NETWORK_IPV6) &&
+	    rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2)
+		return -EINVAL;
+
+	rec->roce.route_resolved = true;
+	return 0;
 }
 
+/**
+ * rdma_addr_cancel - Cancel resolve ip request
+ * @addr:	Pointer to address structure given previously
+ *		during rdma_resolve_ip().
+ * rdma_addr_cancel() is a synchronous function that cancels any pending
+ * request, if there is one.
+ */
 void rdma_addr_cancel(struct rdma_dev_addr *addr)
 {
 	struct addr_req *req, *temp_req;
@@ -687,11 +791,6 @@
 	 * guarantees no work is running and none will be started.
 	 */
 	cancel_delayed_work_sync(&found->work);
-
-	if (found->callback)
-		found->callback(-ECANCELED, (struct sockaddr *)&found->src_addr,
-			      found->addr, found->context);
-
 	kfree(found);
 }
 EXPORT_SYMBOL(rdma_addr_cancel);
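
/*
 * Illustrative sketch (hypothetical names, not part of this patch) of the
 * updated rdma_resolve_ip()/rdma_addr_cancel() pairing: the extra bool
 * selects resolution via the GID attribute's netdevice, and a cancelled
 * request no longer invokes the callback.
 */
#include <linux/socket.h>
#include <rdma/ib_addr.h>

static void example_resolve_cb(int status, struct sockaddr *src_addr,
			       struct rdma_dev_addr *addr, void *context)
{
	*(int *)context = status;	/* record the resolution status */
}

static void example_resolve_then_cancel(struct sockaddr *src,
					const struct sockaddr *dst,
					struct rdma_dev_addr *dev_addr,
					int *status)
{
	/* resolve_by_gid_attr = false: use dev_addr->net as provided */
	if (rdma_resolve_ip(src, dst, dev_addr, 1000, example_resolve_cb,
			    false, status))
		return;

	/* Synchronous; any still-pending request for dev_addr is dropped. */
	rdma_addr_cancel(dev_addr);
}
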
@@ -710,28 +809,28 @@
 
 int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
 				 const union ib_gid *dgid,
-				 u8 *dmac, const struct net_device *ndev,
+				 u8 *dmac, const struct ib_gid_attr *sgid_attr,
 				 int *hoplimit)
 {
 	struct rdma_dev_addr dev_addr;
 	struct resolve_cb_context ctx;
 	union {
-		struct sockaddr     _sockaddr;
 		struct sockaddr_in  _sockaddr_in;
 		struct sockaddr_in6 _sockaddr_in6;
 	} sgid_addr, dgid_addr;
 	int ret;
 
-	rdma_gid2ip(&sgid_addr._sockaddr, sgid);
-	rdma_gid2ip(&dgid_addr._sockaddr, dgid);
+	rdma_gid2ip((struct sockaddr *)&sgid_addr, sgid);
+	rdma_gid2ip((struct sockaddr *)&dgid_addr, dgid);
 
 	memset(&dev_addr, 0, sizeof(dev_addr));
-	dev_addr.bound_dev_if = ndev->ifindex;
 	dev_addr.net = &init_net;
+	dev_addr.sgid_attr = sgid_attr;
 
 	init_completion(&ctx.comp);
-	ret = rdma_resolve_ip(&sgid_addr._sockaddr, &dgid_addr._sockaddr,
-			      &dev_addr, 1000, resolve_cb, &ctx);
+	ret = rdma_resolve_ip((struct sockaddr *)&sgid_addr,
+			      (struct sockaddr *)&dgid_addr, &dev_addr, 1000,
+			      resolve_cb, true, &ctx);
 	if (ret)
 		return ret;
 
diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c
index 324ef85..f82b426 100644
--- a/drivers/infiniband/core/agent.c
+++ b/drivers/infiniband/core/agent.c
@@ -137,13 +137,13 @@
 err2:
 	ib_free_send_mad(send_buf);
 err1:
-	rdma_destroy_ah(ah);
+	rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE);
 }
 
 static void agent_send_handler(struct ib_mad_agent *mad_agent,
 			       struct ib_mad_send_wc *mad_send_wc)
 {
-	rdma_destroy_ah(mad_send_wc->send_buf->ah);
+	rdma_destroy_ah(mad_send_wc->send_buf->ah, RDMA_DESTROY_AH_SLEEPABLE);
 	ib_free_send_mad(mad_send_wc->send_buf);
 }
 
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 3208ad6..00fb3ea 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -78,11 +78,22 @@
 	GID_TABLE_ENTRY_PENDING_DEL	= 3,
 };
 
+struct roce_gid_ndev_storage {
+	struct rcu_head rcu_head;
+	struct net_device *ndev;
+};
+
 struct ib_gid_table_entry {
 	struct kref			kref;
 	struct work_struct		del_work;
 	struct ib_gid_attr		attr;
 	void				*context;
+	/* Store the ndev pointer to release reference later on in
+	 * call_rcu context because by that time gid_table_entry
+	 * and attr might be already freed. So keep a copy of it.
+	 * ndev_storage is freed by rcu callback.
+	 */
+	struct roce_gid_ndev_storage	*ndev_storage;
 	enum gid_table_entry_state	state;
 };
 
@@ -185,7 +196,7 @@
 
 static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u8 port)
 {
-	return device->cache.ports[port - rdma_start_port(device)].gid;
+	return device->port_data[port].cache.gid;
 }
 
 static bool is_gid_entry_free(const struct ib_gid_table_entry *entry)
@@ -206,19 +217,28 @@
 	queue_work(ib_wq, &entry->del_work);
 }
 
+static void put_gid_ndev(struct rcu_head *head)
+{
+	struct roce_gid_ndev_storage *storage =
+		container_of(head, struct roce_gid_ndev_storage, rcu_head);
+
+	WARN_ON(!storage->ndev);
+	/* At this point it's safe to release the netdev reference,
+	 * as all callers working on gid_attr->ndev are done
+	 * using this netdev.
+	 */
+	dev_put(storage->ndev);
+	kfree(storage);
+}
+
 static void free_gid_entry_locked(struct ib_gid_table_entry *entry)
 {
 	struct ib_device *device = entry->attr.device;
 	u8 port_num = entry->attr.port_num;
 	struct ib_gid_table *table = rdma_gid_table(device, port_num);
 
-	pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
-		 device->name, port_num, entry->attr.index,
-		 entry->attr.gid.raw);
-
-	if (rdma_cap_roce_gid_table(device, port_num) &&
-	    entry->state != GID_TABLE_ENTRY_INVALID)
-		device->del_gid(&entry->attr, &entry->context);
+	dev_dbg(&device->dev, "%s port=%d index=%d gid %pI6\n", __func__,
+		port_num, entry->attr.index, entry->attr.gid.raw);
 
 	write_lock_irq(&table->rwlock);
 
@@ -233,8 +253,8 @@
 	/* Now this index is ready to be allocated */
 	write_unlock_irq(&table->rwlock);
 
-	if (entry->attr.ndev)
-		dev_put(entry->attr.ndev);
+	if (entry->ndev_storage)
+		call_rcu(&entry->ndev_storage->rcu_head, put_gid_ndev);
 	kfree(entry);
 }
 
@@ -271,14 +291,25 @@
 alloc_gid_entry(const struct ib_gid_attr *attr)
 {
 	struct ib_gid_table_entry *entry;
+	struct net_device *ndev;
 
 	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
 	if (!entry)
 		return NULL;
+
+	ndev = rcu_dereference_protected(attr->ndev, 1);
+	if (ndev) {
+		entry->ndev_storage = kzalloc(sizeof(*entry->ndev_storage),
+					      GFP_KERNEL);
+		if (!entry->ndev_storage) {
+			kfree(entry);
+			return NULL;
+		}
+		dev_hold(ndev);
+		entry->ndev_storage->ndev = ndev;
+	}
 	kref_init(&entry->kref);
 	memcpy(&entry->attr, attr, sizeof(*attr));
-	if (entry->attr.ndev)
-		dev_hold(entry->attr.ndev);
 	INIT_WORK(&entry->del_work, free_gid_work);
 	entry->state = GID_TABLE_ENTRY_INVALID;
 	return entry;
@@ -289,9 +320,9 @@
 {
 	entry->state = GID_TABLE_ENTRY_VALID;
 
-	pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
-		 entry->attr.device->name, entry->attr.port_num,
-		 entry->attr.index, entry->attr.gid.raw);
+	dev_dbg(&entry->attr.device->dev, "%s port=%d index=%d gid %pI6\n",
+		__func__, entry->attr.port_num, entry->attr.index,
+		entry->attr.gid.raw);
 
 	lockdep_assert_held(&table->lock);
 	write_lock_irq(&table->rwlock);
@@ -320,17 +351,16 @@
 	int ret;
 
 	if (!attr->ndev) {
-		pr_err("%s NULL netdev device=%s port=%d index=%d\n",
-		       __func__, attr->device->name, attr->port_num,
-		       attr->index);
+		dev_err(&attr->device->dev, "%s NULL netdev port=%d index=%d\n",
+			__func__, attr->port_num, attr->index);
 		return -EINVAL;
 	}
 	if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) {
-		ret = attr->device->add_gid(attr, &entry->context);
+		ret = attr->device->ops.add_gid(attr, &entry->context);
 		if (ret) {
-			pr_err("%s GID add failed device=%s port=%d index=%d\n",
-			       __func__, attr->device->name, attr->port_num,
-			       attr->index);
+			dev_err(&attr->device->dev,
+				"%s GID add failed port=%d index=%d\n",
+				__func__, attr->port_num, attr->index);
 			return ret;
 		}
 	}
@@ -349,13 +379,13 @@
 static void del_gid(struct ib_device *ib_dev, u8 port,
 		    struct ib_gid_table *table, int ix)
 {
+	struct roce_gid_ndev_storage *ndev_storage;
 	struct ib_gid_table_entry *entry;
 
 	lockdep_assert_held(&table->lock);
 
-	pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
-		 ib_dev->name, port, ix,
-		 table->data_vec[ix]->attr.gid.raw);
+	dev_dbg(&ib_dev->dev, "%s port=%d index=%d gid %pI6\n", __func__, port,
+		ix, table->data_vec[ix]->attr.gid.raw);
 
 	write_lock_irq(&table->rwlock);
 	entry = table->data_vec[ix];
@@ -367,6 +397,16 @@
 		table->data_vec[ix] = NULL;
 	write_unlock_irq(&table->rwlock);
 
+	ndev_storage = entry->ndev_storage;
+	if (ndev_storage) {
+		entry->ndev_storage = NULL;
+		rcu_assign_pointer(entry->attr.ndev, NULL);
+		call_rcu(&ndev_storage->rcu_head, put_gid_ndev);
+	}
+
+	if (rdma_cap_roce_gid_table(ib_dev, port))
+		ib_dev->ops.del_gid(&entry->attr, &entry->context);
+
 	put_gid_entry_locked(entry);
 }
 
@@ -547,32 +587,11 @@
 int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
 		     union ib_gid *gid, struct ib_gid_attr *attr)
 {
-	struct net_device *idev;
-	unsigned long mask;
-	int ret;
+	unsigned long mask = GID_ATTR_FIND_MASK_GID |
+			     GID_ATTR_FIND_MASK_GID_TYPE |
+			     GID_ATTR_FIND_MASK_NETDEV;
 
-	if (ib_dev->get_netdev) {
-		idev = ib_dev->get_netdev(ib_dev, port);
-		if (idev && attr->ndev != idev) {
-			union ib_gid default_gid;
-
-			/* Adding default GIDs in not permitted */
-			make_default_gid(idev, &default_gid);
-			if (!memcmp(gid, &default_gid, sizeof(*gid))) {
-				dev_put(idev);
-				return -EPERM;
-			}
-		}
-		if (idev)
-			dev_put(idev);
-	}
-
-	mask = GID_ATTR_FIND_MASK_GID |
-	       GID_ATTR_FIND_MASK_GID_TYPE |
-	       GID_ATTR_FIND_MASK_NETDEV;
-
-	ret = __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false);
-	return ret;
+	return __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false);
 }
 
 static int
@@ -769,7 +788,7 @@
 	return NULL;
 }
 
-static void release_gid_table(struct ib_device *device, u8 port,
+static void release_gid_table(struct ib_device *device,
 			      struct ib_gid_table *table)
 {
 	bool leak = false;
@@ -782,15 +801,16 @@
 		if (is_gid_entry_free(table->data_vec[i]))
 			continue;
 		if (kref_read(&table->data_vec[i]->kref) > 1) {
-			pr_err("GID entry ref leak for %s (index %d) ref=%d\n",
-			       device->name, i,
-			       kref_read(&table->data_vec[i]->kref));
+			dev_err(&device->dev,
+				"GID entry ref leak for index %d ref=%d\n", i,
+				kref_read(&table->data_vec[i]->kref));
 			leak = true;
 		}
 	}
 	if (leak)
 		return;
 
+	mutex_destroy(&table->lock);
 	kfree(table->data_vec);
 	kfree(table);
 }
@@ -867,31 +887,27 @@
 
 static void gid_table_release_one(struct ib_device *ib_dev)
 {
-	struct ib_gid_table *table;
-	u8 port;
+	unsigned int p;
 
-	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
-		table = ib_dev->cache.ports[port].gid;
-		release_gid_table(ib_dev, port, table);
-		ib_dev->cache.ports[port].gid = NULL;
+	rdma_for_each_port (ib_dev, p) {
+		release_gid_table(ib_dev, ib_dev->port_data[p].cache.gid);
+		ib_dev->port_data[p].cache.gid = NULL;
 	}
 }
 
 static int _gid_table_setup_one(struct ib_device *ib_dev)
 {
-	u8 port;
 	struct ib_gid_table *table;
+	unsigned int rdma_port;
 
-	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
-		u8 rdma_port = port + rdma_start_port(ib_dev);
-
-		table =	alloc_gid_table(
-				ib_dev->port_immutable[rdma_port].gid_tbl_len);
+	rdma_for_each_port (ib_dev, rdma_port) {
+		table = alloc_gid_table(
+			ib_dev->port_data[rdma_port].immutable.gid_tbl_len);
 		if (!table)
 			goto rollback_table_setup;
 
 		gid_table_reserve_default(ib_dev, rdma_port, table);
-		ib_dev->cache.ports[port].gid = table;
+		ib_dev->port_data[rdma_port].cache.gid = table;
 	}
 	return 0;
 
@@ -902,14 +918,11 @@
 
 static void gid_table_cleanup_one(struct ib_device *ib_dev)
 {
-	struct ib_gid_table *table;
-	u8 port;
+	unsigned int p;
 
-	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
-		table = ib_dev->cache.ports[port].gid;
-		cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
-				       table);
-	}
+	rdma_for_each_port (ib_dev, p)
+		cleanup_gid_table_port(ib_dev, p,
+				       ib_dev->port_data[p].cache.gid);
 }
 
 static int gid_table_setup_one(struct ib_device *ib_dev)
@@ -987,17 +1000,17 @@
 	unsigned long mask = GID_ATTR_FIND_MASK_GID |
 			     GID_ATTR_FIND_MASK_GID_TYPE;
 	struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type};
-	u8 p;
+	unsigned int p;
 
 	if (ndev)
 		mask |= GID_ATTR_FIND_MASK_NETDEV;
 
-	for (p = 0; p < device->phys_port_cnt; p++) {
+	rdma_for_each_port(device, p) {
 		struct ib_gid_table *table;
 		unsigned long flags;
 		int index;
 
-		table = device->cache.ports[p].gid;
+		table = device->port_data[p].cache.gid;
 		read_lock_irqsave(&table->rwlock, flags);
 		index = find_gid(table, gid, &gid_attr_val, false, mask, NULL);
 		if (index >= 0) {
@@ -1029,7 +1042,7 @@
 
 	read_lock_irqsave(&device->cache.lock, flags);
 
-	cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
+	cache = device->port_data[port_num].cache.pkey;
 
 	if (index < 0 || index >= cache->table_len)
 		ret = -EINVAL;
@@ -1047,14 +1060,12 @@
 				u64              *sn_pfx)
 {
 	unsigned long flags;
-	int p;
 
 	if (!rdma_is_port_valid(device, port_num))
 		return -EINVAL;
 
-	p = port_num - rdma_start_port(device);
 	read_lock_irqsave(&device->cache.lock, flags);
-	*sn_pfx = device->cache.ports[p].subnet_prefix;
+	*sn_pfx = device->port_data[port_num].cache.subnet_prefix;
 	read_unlock_irqrestore(&device->cache.lock, flags);
 
 	return 0;
@@ -1077,7 +1088,7 @@
 
 	read_lock_irqsave(&device->cache.lock, flags);
 
-	cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
+	cache = device->port_data[port_num].cache.pkey;
 
 	*index = -1;
 
@@ -1117,7 +1128,7 @@
 
 	read_lock_irqsave(&device->cache.lock, flags);
 
-	cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
+	cache = device->port_data[port_num].cache.pkey;
 
 	*index = -1;
 
@@ -1145,7 +1156,7 @@
 		return -EINVAL;
 
 	read_lock_irqsave(&device->cache.lock, flags);
-	*lmc = device->cache.ports[port_num - rdma_start_port(device)].lmc;
+	*lmc = device->port_data[port_num].cache.lmc;
 	read_unlock_irqrestore(&device->cache.lock, flags);
 
 	return ret;
@@ -1163,8 +1174,7 @@
 		return -EINVAL;
 
 	read_lock_irqsave(&device->cache.lock, flags);
-	*port_state = device->cache.ports[port_num
-		- rdma_start_port(device)].port_state;
+	*port_state = device->port_data[port_num].cache.port_state;
 	read_unlock_irqrestore(&device->cache.lock, flags);
 
 	return ret;
@@ -1252,6 +1262,100 @@
 }
 EXPORT_SYMBOL(rdma_hold_gid_attr);
 
+/**
+ * rdma_read_gid_attr_ndev_rcu - Read GID attribute netdevice
+ * which must be in UP state.
+ *
+ * @attr:Pointer to the GID attribute
+ *
+ * Returns a pointer to the netdevice if the netdevice was attached to the GID
+ * and the netdevice is in UP state. The caller must hold the RCU lock, as this
+ * API reads the netdev flags, which can change while the netdevice migrates to
+ * a different net namespace. Returns ERR_PTR with an error code otherwise.
+ *
+ */
+struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr)
+{
+	struct ib_gid_table_entry *entry =
+			container_of(attr, struct ib_gid_table_entry, attr);
+	struct ib_device *device = entry->attr.device;
+	struct net_device *ndev = ERR_PTR(-ENODEV);
+	u8 port_num = entry->attr.port_num;
+	struct ib_gid_table *table;
+	unsigned long flags;
+	bool valid;
+
+	table = rdma_gid_table(device, port_num);
+
+	read_lock_irqsave(&table->rwlock, flags);
+	valid = is_gid_entry_valid(table->data_vec[attr->index]);
+	if (valid) {
+		ndev = rcu_dereference(attr->ndev);
+		if (!ndev ||
+		    (ndev && ((READ_ONCE(ndev->flags) & IFF_UP) == 0)))
+			ndev = ERR_PTR(-ENODEV);
+	}
+	read_unlock_irqrestore(&table->rwlock, flags);
+	return ndev;
+}
+EXPORT_SYMBOL(rdma_read_gid_attr_ndev_rcu);
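
/*
 * Illustrative sketch (hypothetical helper, not part of this patch): reading
 * the ifindex behind a GID entry under RCU, mirroring how addr.c's
 * set_addr_netns_by_gid_rcu() consumes this API.
 */
#include <linux/err.h>
#include <linux/rcupdate.h>
#include <rdma/ib_cache.h>

static int example_gid_attr_ifindex(const struct ib_gid_attr *attr,
				    int *ifindex)
{
	struct net_device *ndev;
	int ret = 0;

	rcu_read_lock();	/* the API requires the RCU read lock */
	ndev = rdma_read_gid_attr_ndev_rcu(attr);
	if (IS_ERR(ndev))
		ret = PTR_ERR(ndev);	/* entry invalid, detached or !IFF_UP */
	else
		*ifindex = ndev->ifindex;
	rcu_read_unlock();
	return ret;
}
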
+
+static int get_lower_dev_vlan(struct net_device *lower_dev, void *data)
+{
+	u16 *vlan_id = data;
+
+	if (is_vlan_dev(lower_dev))
+		*vlan_id = vlan_dev_vlan_id(lower_dev);
+
+	/* We are interested only in first level vlan device, so
+	 * always return 1 to stop iterating over next level devices.
+	 */
+	return 1;
+}
+
+/**
+ * rdma_read_gid_l2_fields - Read the vlan ID and source MAC address
+ *			     of a GID entry.
+ *
+ * @attr:	GID attribute pointer whose L2 fields to be read
+ * @vlan_id:	Pointer to vlan id to fill up if the GID entry has
+ *		vlan id. It is optional.
+ * @smac:	Pointer to smac to fill up for a GID entry. It is optional.
+ *
+ * rdma_read_gid_l2_fields() returns 0 on success, filling in the vlan id
+ * (if the gid entry has one) and the source MAC, or returns an error.
+ */
+int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr,
+			    u16 *vlan_id, u8 *smac)
+{
+	struct net_device *ndev;
+
+	rcu_read_lock();
+	ndev = rcu_dereference(attr->ndev);
+	if (!ndev) {
+		rcu_read_unlock();
+		return -ENODEV;
+	}
+	if (smac)
+		ether_addr_copy(smac, ndev->dev_addr);
+	if (vlan_id) {
+		*vlan_id = 0xffff;
+		if (is_vlan_dev(ndev)) {
+			*vlan_id = vlan_dev_vlan_id(ndev);
+		} else {
+			/* If the netdev is an upper device and its lower
+			 * device is a vlan device, consider the vlan id of
+			 * the lower vlan device for this gid entry.
+			 */
+			netdev_walk_all_lower_dev_rcu(attr->ndev,
+					get_lower_dev_vlan, vlan_id);
+		}
+	}
+	rcu_read_unlock();
+	return 0;
+}
+EXPORT_SYMBOL(rdma_read_gid_l2_fields);
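
/*
 * Illustrative sketch (hypothetical helper, not part of this patch): a RoCE
 * consumer pulling the vlan id and source MAC for a GID entry.
 */
#include <linux/if_ether.h>
#include <linux/printk.h>
#include <rdma/ib_cache.h>

static int example_show_gid_l2(const struct ib_gid_attr *attr)
{
	u16 vlan_id;
	u8 smac[ETH_ALEN];
	int ret;

	ret = rdma_read_gid_l2_fields(attr, &vlan_id, smac);
	if (ret)
		return ret;		/* no netdev attached to this GID */

	/* 0xffff means the GID entry is not on a vlan device */
	pr_debug("gid smac=%pM vlan=0x%x\n", smac, vlan_id);
	return 0;
}
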
+
 static int config_non_roce_gid_cache(struct ib_device *device,
 				     u8 port, int gid_tbl_len)
 {
@@ -1266,12 +1370,13 @@
 
 	mutex_lock(&table->lock);
 	for (i = 0; i < gid_tbl_len; ++i) {
-		if (!device->query_gid)
+		if (!device->ops.query_gid)
 			continue;
-		ret = device->query_gid(device, port, i, &gid_attr.gid);
+		ret = device->ops.query_gid(device, port, i, &gid_attr.gid);
 		if (ret) {
-			pr_warn("query_gid failed (%d) for %s (index %d)\n",
-				ret, device->name, i);
+			dev_warn(&device->dev,
+				 "query_gid failed (%d) for index %d\n", ret,
+				 i);
 			goto err;
 		}
 		gid_attr.index = i;
@@ -1300,8 +1405,7 @@
 
 	ret = ib_query_port(device, port, tprops);
 	if (ret) {
-		pr_warn("ib_query_port failed (%d) for %s\n",
-			ret, device->name);
+		dev_warn(&device->dev, "ib_query_port failed (%d)\n", ret);
 		goto err;
 	}
 
@@ -1323,24 +1427,22 @@
 	for (i = 0; i < pkey_cache->table_len; ++i) {
 		ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
 		if (ret) {
-			pr_warn("ib_query_pkey failed (%d) for %s (index %d)\n",
-				ret, device->name, i);
+			dev_warn(&device->dev,
+				 "ib_query_pkey failed (%d) for index %d\n",
+				 ret, i);
 			goto err;
 		}
 	}
 
 	write_lock_irq(&device->cache.lock);
 
-	old_pkey_cache = device->cache.ports[port -
-		rdma_start_port(device)].pkey;
+	old_pkey_cache = device->port_data[port].cache.pkey;
 
-	device->cache.ports[port - rdma_start_port(device)].pkey = pkey_cache;
-	device->cache.ports[port - rdma_start_port(device)].lmc = tprops->lmc;
-	device->cache.ports[port - rdma_start_port(device)].port_state =
-		tprops->state;
+	device->port_data[port].cache.pkey = pkey_cache;
+	device->port_data[port].cache.lmc = tprops->lmc;
+	device->port_data[port].cache.port_state = tprops->state;
 
-	device->cache.ports[port - rdma_start_port(device)].subnet_prefix =
-							tprops->subnet_prefix;
+	device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix;
 	write_unlock_irq(&device->cache.lock);
 
 	if (enforce_security)
@@ -1377,7 +1479,6 @@
 	    event->event == IB_EVENT_PORT_ACTIVE ||
 	    event->event == IB_EVENT_LID_CHANGE  ||
 	    event->event == IB_EVENT_PKEY_CHANGE ||
-	    event->event == IB_EVENT_SM_CHANGE   ||
 	    event->event == IB_EVENT_CLIENT_REREGISTER ||
 	    event->event == IB_EVENT_GID_CHANGE) {
 		work = kmalloc(sizeof *work, GFP_ATOMIC);
@@ -1398,27 +1499,17 @@
 
 int ib_cache_setup_one(struct ib_device *device)
 {
-	int p;
+	unsigned int p;
 	int err;
 
 	rwlock_init(&device->cache.lock);
 
-	device->cache.ports =
-		kcalloc(rdma_end_port(device) - rdma_start_port(device) + 1,
-			sizeof(*device->cache.ports),
-			GFP_KERNEL);
-	if (!device->cache.ports)
-		return -ENOMEM;
-
 	err = gid_table_setup_one(device);
-	if (err) {
-		kfree(device->cache.ports);
-		device->cache.ports = NULL;
+	if (err)
 		return err;
-	}
 
-	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
-		ib_cache_update(device, p + rdma_start_port(device), true);
+	rdma_for_each_port (device, p)
+		ib_cache_update(device, p, true);
 
 	INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
 			      device, ib_cache_event);
@@ -1428,7 +1519,7 @@
 
 void ib_cache_release_one(struct ib_device *device)
 {
-	int p;
+	unsigned int p;
 
 	/*
 	 * The release function frees all the cache elements.
@@ -1436,11 +1527,10 @@
 	 * all the device's resources when the cache could no
 	 * longer be accessed.
 	 */
-	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
-		kfree(device->cache.ports[p].pkey);
+	rdma_for_each_port (device, p)
+		kfree(device->port_data[p].cache.pkey);
 
 	gid_table_release_one(device);
-	kfree(device->cache.ports);
 }
 
 void ib_cache_cleanup_one(struct ib_device *device)
diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c
index 126ac5f..1f037fe 100644
--- a/drivers/infiniband/core/cgroup.c
+++ b/drivers/infiniband/core/cgroup.c
@@ -1,14 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #include "core_priv.h"
@@ -21,12 +13,11 @@
  * Register with the rdma cgroup. Should be called before
  * exposing rdma device to user space applications to avoid
  * resource accounting leak.
- * Returns 0 on success or otherwise failure code.
  */
-int ib_device_register_rdmacg(struct ib_device *device)
+void ib_device_register_rdmacg(struct ib_device *device)
 {
 	device->cg_device.name = device->name;
-	return rdmacg_register_device(&device->cg_device);
+	rdmacg_register_device(&device->cg_device);
 }
 
 /**
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 4c53327..5920c00 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -52,6 +52,7 @@
 #include <rdma/ib_cache.h>
 #include <rdma/ib_cm.h>
 #include "cm_msgs.h"
+#include "core_priv.h"
 
 MODULE_AUTHOR("Sean Hefty");
 MODULE_DESCRIPTION("InfiniBand CM");
@@ -124,7 +125,8 @@
 	struct rb_root remote_qp_table;
 	struct rb_root remote_id_table;
 	struct rb_root remote_sidr_table;
-	struct idr local_id_table;
+	struct xarray local_id_table;
+	u32 local_id_next;
 	__be32 random_id_operand;
 	struct list_head timewait_list;
 	struct workqueue_struct *wq;
@@ -219,7 +221,6 @@
 struct cm_device {
 	struct list_head list;
 	struct ib_device *ib_device;
-	struct device *device;
 	u8 ack_delay;
 	int going_down;
 	struct cm_port *port[0];
@@ -343,7 +344,7 @@
 		ret = -ENODEV;
 		goto out;
 	}
-	ah = rdma_create_ah(mad_agent->qp->pd, &av->ah_attr);
+	ah = rdma_create_ah(mad_agent->qp->pd, &av->ah_attr, 0);
 	if (IS_ERR(ah)) {
 		ret = PTR_ERR(ah);
 		goto out;
@@ -355,7 +356,7 @@
 			       GFP_ATOMIC,
 			       IB_MGMT_BASE_VERSION);
 	if (IS_ERR(m)) {
-		rdma_destroy_ah(ah);
+		rdma_destroy_ah(ah, 0);
 		ret = PTR_ERR(m);
 		goto out;
 	}
@@ -400,7 +401,7 @@
 static void cm_free_msg(struct ib_mad_send_buf *msg)
 {
 	if (msg->ah)
-		rdma_destroy_ah(msg->ah);
+		rdma_destroy_ah(msg->ah, 0);
 	if (msg->context[0])
 		cm_deref_id(msg->context[0]);
 	ib_free_send_mad(msg);
@@ -598,35 +599,31 @@
 
 static int cm_alloc_id(struct cm_id_private *cm_id_priv)
 {
-	unsigned long flags;
-	int id;
+	int err;
+	u32 id;
 
-	idr_preload(GFP_KERNEL);
-	spin_lock_irqsave(&cm.lock, flags);
-
-	id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT);
-
-	spin_unlock_irqrestore(&cm.lock, flags);
-	idr_preload_end();
+	err = xa_alloc_cyclic_irq(&cm.local_id_table, &id, cm_id_priv,
+			xa_limit_32b, &cm.local_id_next, GFP_KERNEL);
 
 	cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand;
-	return id < 0 ? id : 0;
+	return err;
+}
+
+static u32 cm_local_id(__be32 local_id)
+{
+	return (__force u32) (local_id ^ cm.random_id_operand);
 }
 
 static void cm_free_id(__be32 local_id)
 {
-	spin_lock_irq(&cm.lock);
-	idr_remove(&cm.local_id_table,
-		   (__force int) (local_id ^ cm.random_id_operand));
-	spin_unlock_irq(&cm.lock);
+	xa_erase_irq(&cm.local_id_table, cm_local_id(local_id));
 }
 
 static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id)
 {
 	struct cm_id_private *cm_id_priv;
 
-	cm_id_priv = idr_find(&cm.local_id_table,
-			      (__force int) (local_id ^ cm.random_id_operand));
+	cm_id_priv = xa_load(&cm.local_id_table, cm_local_id(local_id));
 	if (cm_id_priv) {
 		if (cm_id_priv->id.remote_id == remote_id)
 			atomic_inc(&cm_id_priv->refcount);
@@ -1988,11 +1985,12 @@
 	grh = rdma_ah_read_grh(&cm_id_priv->av.ah_attr);
 	gid_attr = grh->sgid_attr;
 
-	if (gid_attr && gid_attr->ndev) {
+	if (gid_attr &&
+	    rdma_protocol_roce(work->port->cm_dev->ib_device,
+			       work->port->port_num)) {
 		work->path[0].rec_type =
 			sa_conv_gid_to_pathrec_type(gid_attr->gid_type);
 	} else {
-		/* If no GID attribute or ndev is null, it is not RoCE. */
 		cm_path_set_rec_type(work->port->cm_dev->ib_device,
 				     work->port->port_num,
 				     &work->path[0],
@@ -2824,9 +2822,8 @@
 			spin_unlock_irq(&cm.lock);
 			return NULL;
 		}
-		cm_id_priv = idr_find(&cm.local_id_table, (__force int)
-				      (timewait_info->work.local_id ^
-				       cm.random_id_operand));
+		cm_id_priv = xa_load(&cm.local_id_table,
+				cm_local_id(timewait_info->work.local_id));
 		if (cm_id_priv) {
 			if (cm_id_priv->id.remote_id == remote_id)
 				atomic_inc(&cm_id_priv->refcount);
@@ -4052,8 +4049,7 @@
 	atomic_long_inc(&port->counter_group[CM_RECV].
 			counter[attr_id - CM_ATTR_ID_OFFSET]);
 
-	work = kmalloc(sizeof(*work) + sizeof(struct sa_path_rec) * paths,
-		       GFP_KERNEL);
+	work = kmalloc(struct_size(work, path, paths), GFP_KERNEL);
 	if (!work) {
 		ib_free_recv_mad(mad_recv_wc);
 		return;
@@ -4277,18 +4273,6 @@
 	.default_attrs = cm_counter_default_attrs
 };
 
-static void cm_release_port_obj(struct kobject *obj)
-{
-	struct cm_port *cm_port;
-
-	cm_port = container_of(obj, struct cm_port, port_obj);
-	kfree(cm_port);
-}
-
-static struct kobj_type cm_port_obj_type = {
-	.release = cm_release_port_obj
-};
-
 static char *cm_devnode(struct device *dev, umode_t *mode)
 {
 	if (mode)
@@ -4307,19 +4291,12 @@
 {
 	int i, ret;
 
-	ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type,
-				   &port->cm_dev->device->kobj,
-				   "%d", port->port_num);
-	if (ret) {
-		kfree(port);
-		return ret;
-	}
-
 	for (i = 0; i < CM_COUNTER_GROUPS; i++) {
-		ret = kobject_init_and_add(&port->counter_group[i].obj,
-					   &cm_counter_obj_type,
-					   &port->port_obj,
-					   "%s", counter_group_names[i]);
+		ret = ib_port_register_module_stat(port->cm_dev->ib_device,
+						   port->port_num,
+						   &port->counter_group[i].obj,
+						   &cm_counter_obj_type,
+						   counter_group_names[i]);
 		if (ret)
 			goto error;
 	}
@@ -4328,8 +4305,7 @@
 
 error:
 	while (i--)
-		kobject_put(&port->counter_group[i].obj);
-	kobject_put(&port->port_obj);
+		ib_port_unregister_module_stat(&port->counter_group[i].obj);
 	return ret;
 
 }
@@ -4339,9 +4315,8 @@
 	int i;
 
 	for (i = 0; i < CM_COUNTER_GROUPS; i++)
-		kobject_put(&port->counter_group[i].obj);
+		ib_port_unregister_module_stat(&port->counter_group[i].obj);
 
-	kobject_put(&port->port_obj);
 }
 
 static void cm_add_one(struct ib_device *ib_device)
@@ -4368,13 +4343,6 @@
 	cm_dev->ib_device = ib_device;
 	cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
 	cm_dev->going_down = 0;
-	cm_dev->device = device_create(&cm_class, &ib_device->dev,
-				       MKDEV(0, 0), NULL,
-				       "%s", ib_device->name);
-	if (IS_ERR(cm_dev->device)) {
-		kfree(cm_dev);
-		return;
-	}
 
 	set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
 	for (i = 1; i <= ib_device->phys_port_cnt; i++) {
@@ -4431,6 +4399,7 @@
 error1:
 	port_modify.set_port_cap_mask = 0;
 	port_modify.clr_port_cap_mask = IB_PORT_CM_SUP;
+	kfree(port);
 	while (--i) {
 		if (!rdma_cap_ib_cm(ib_device, i))
 			continue;
@@ -4439,9 +4408,9 @@
 		ib_modify_port(ib_device, port->port_num, 0, &port_modify);
 		ib_unregister_mad_agent(port->mad_agent);
 		cm_remove_port_fs(port);
+		kfree(port);
 	}
 free:
-	device_unregister(cm_dev->device);
 	kfree(cm_dev);
 }
 
@@ -4493,9 +4462,9 @@
 		spin_unlock_irq(&cm.state_lock);
 		ib_unregister_mad_agent(cur_mad_agent);
 		cm_remove_port_fs(port);
+		kfree(port);
 	}
 
-	device_unregister(cm_dev->device);
 	kfree(cm_dev);
 }
 
@@ -4503,7 +4472,6 @@
 {
 	int ret;
 
-	memset(&cm, 0, sizeof cm);
 	INIT_LIST_HEAD(&cm.device_list);
 	rwlock_init(&cm.device_lock);
 	spin_lock_init(&cm.lock);
@@ -4513,7 +4481,7 @@
 	cm.remote_id_table = RB_ROOT;
 	cm.remote_qp_table = RB_ROOT;
 	cm.remote_sidr_table = RB_ROOT;
-	idr_init(&cm.local_id_table);
+	xa_init_flags(&cm.local_id_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
 	get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
 	INIT_LIST_HEAD(&cm.timewait_list);
 
@@ -4539,7 +4507,6 @@
 error2:
 	class_unregister(&cm_class);
 error1:
-	idr_destroy(&cm.local_id_table);
 	return ret;
 }
 
@@ -4561,9 +4528,8 @@
 	}
 
 	class_unregister(&cm_class);
-	idr_destroy(&cm.local_id_table);
+	WARN_ON(!xa_empty(&cm.local_id_table));
 }
 
 module_init(ib_cm_init);
 module_exit(ib_cm_cleanup);
-
diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h
index 476d430..3d16d61 100644
--- a/drivers/infiniband/core/cm_msgs.h
+++ b/drivers/infiniband/core/cm_msgs.h
@@ -98,7 +98,7 @@
 
 	u32 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE / sizeof(u32)];
 
-} __attribute__ ((packed));
+} __packed;
 
 static inline __be32 cm_req_get_local_qpn(struct cm_req_msg *req_msg)
 {
@@ -423,7 +423,7 @@
 
 	u8 private_data[IB_CM_MRA_PRIVATE_DATA_SIZE];
 
-} __attribute__ ((packed));
+} __packed;
 
 static inline u8 cm_mra_get_msg_mraed(struct cm_mra_msg *mra_msg)
 {
@@ -461,7 +461,7 @@
 
 	u8 private_data[IB_CM_REJ_PRIVATE_DATA_SIZE];
 
-} __attribute__ ((packed));
+} __packed;
 
 static inline u8 cm_rej_get_msg_rejected(struct cm_rej_msg *rej_msg)
 {
@@ -506,7 +506,7 @@
 
 	u8 private_data[IB_CM_REP_PRIVATE_DATA_SIZE];
 
-} __attribute__ ((packed));
+} __packed;
 
 static inline __be32 cm_rep_get_local_qpn(struct cm_rep_msg *rep_msg)
 {
@@ -614,7 +614,7 @@
 
 	u8 private_data[IB_CM_RTU_PRIVATE_DATA_SIZE];
 
-} __attribute__ ((packed));
+} __packed;
 
 struct cm_dreq_msg {
 	struct ib_mad_hdr hdr;
@@ -626,7 +626,7 @@
 
 	u8 private_data[IB_CM_DREQ_PRIVATE_DATA_SIZE];
 
-} __attribute__ ((packed));
+} __packed;
 
 static inline __be32 cm_dreq_get_remote_qpn(struct cm_dreq_msg *dreq_msg)
 {
@@ -647,7 +647,7 @@
 
 	u8 private_data[IB_CM_DREP_PRIVATE_DATA_SIZE];
 
-} __attribute__ ((packed));
+} __packed;
 
 struct cm_lap_msg {
 	struct ib_mad_hdr hdr;
@@ -675,7 +675,7 @@
 	u8 offset63;
 
 	u8 private_data[IB_CM_LAP_PRIVATE_DATA_SIZE];
-} __attribute__  ((packed));
+} __packed;
 
 static inline __be32 cm_lap_get_remote_qpn(struct cm_lap_msg *lap_msg)
 {
@@ -784,7 +784,7 @@
 	u8 info[IB_CM_APR_INFO_LENGTH];
 
 	u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE];
-} __attribute__ ((packed));
+} __packed;
 
 struct cm_sidr_req_msg {
 	struct ib_mad_hdr hdr;
@@ -795,7 +795,7 @@
 	__be64 service_id;
 
 	u32 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE / sizeof(u32)];
-} __attribute__ ((packed));
+} __packed;
 
 struct cm_sidr_rep_msg {
 	struct ib_mad_hdr hdr;
@@ -811,7 +811,7 @@
 	u8 info[IB_CM_SIDR_REP_INFO_LENGTH];
 
 	u8 private_data[IB_CM_SIDR_REP_PRIVATE_DATA_SIZE];
-} __attribute__ ((packed));
+} __packed;
 
 static inline __be32 cm_sidr_rep_get_qpn(struct cm_sidr_rep_msg *sidr_rep_msg)
 {
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index a36c949..d78f676 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -39,7 +39,7 @@
 #include <linux/mutex.h>
 #include <linux/random.h>
 #include <linux/igmp.h>
-#include <linux/idr.h>
+#include <linux/xarray.h>
 #include <linux/inetdevice.h>
 #include <linux/slab.h>
 #include <linux/module.h>
@@ -191,10 +191,10 @@
 static unsigned int cma_pernet_id;
 
 struct cma_pernet {
-	struct idr tcp_ps;
-	struct idr udp_ps;
-	struct idr ipoib_ps;
-	struct idr ib_ps;
+	struct xarray tcp_ps;
+	struct xarray udp_ps;
+	struct xarray ipoib_ps;
+	struct xarray ib_ps;
 };
 
 static struct cma_pernet *cma_pernet(struct net *net)
@@ -202,7 +202,8 @@
 	return net_generic(net, cma_pernet_id);
 }
 
-static struct idr *cma_pernet_idr(struct net *net, enum rdma_ucm_port_space ps)
+static
+struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps)
 {
 	struct cma_pernet *pernet = cma_pernet(net);
 
@@ -247,25 +248,25 @@
 static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps,
 			struct rdma_bind_list *bind_list, int snum)
 {
-	struct idr *idr = cma_pernet_idr(net, ps);
+	struct xarray *xa = cma_pernet_xa(net, ps);
 
-	return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL);
+	return xa_insert(xa, snum, bind_list, GFP_KERNEL);
 }
 
 static struct rdma_bind_list *cma_ps_find(struct net *net,
 					  enum rdma_ucm_port_space ps, int snum)
 {
-	struct idr *idr = cma_pernet_idr(net, ps);
+	struct xarray *xa = cma_pernet_xa(net, ps);
 
-	return idr_find(idr, snum);
+	return xa_load(xa, snum);
 }
 
 static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps,
 			  int snum)
 {
-	struct idr *idr = cma_pernet_idr(net, ps);
+	struct xarray *xa = cma_pernet_xa(net, ps);
 
-	idr_remove(idr, snum);
+	xa_erase(xa, snum);
 }
 
 enum {
@@ -494,7 +495,10 @@
 	id_priv->id.route.addr.dev_addr.transport =
 		rdma_node_get_transport(cma_dev->device->node_type);
 	list_add_tail(&id_priv->list, &cma_dev->id_list);
-	rdma_restrack_add(&id_priv->res);
+	if (id_priv->res.kern_name)
+		rdma_restrack_kadd(&id_priv->res);
+	else
+		rdma_restrack_uadd(&id_priv->res);
 }
 
 static void cma_attach_to_dev(struct rdma_id_private *id_priv,
@@ -612,6 +616,9 @@
 	int dev_type = dev_addr->dev_type;
 	struct net_device *ndev = NULL;
 
+	if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net))
+		return ERR_PTR(-ENODEV);
+
 	if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port))
 		return ERR_PTR(-ENODEV);
 
@@ -639,56 +646,147 @@
 	id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr;
 }
 
-static int cma_acquire_dev(struct rdma_id_private *id_priv,
-			   const struct rdma_id_private *listen_id_priv)
+/**
+ * cma_acquire_dev_by_src_ip - Acquire cma device, port, gid attribute
+ * based on source ip address.
+ * @id_priv:	cm_id which should be bound to cma device
+ *
+ * cma_acquire_dev_by_src_ip() binds cm id to cma device, port and GID attribute
+ * based on source IP address. It returns 0 on success or error code otherwise.
+ * It is applicable to active and passive side cm_id.
+ */
+static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv)
+{
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	const struct ib_gid_attr *sgid_attr;
+	union ib_gid gid, iboe_gid, *gidp;
+	struct cma_device *cma_dev;
+	enum ib_gid_type gid_type;
+	int ret = -ENODEV;
+	unsigned int port;
+
+	if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
+	    id_priv->id.ps == RDMA_PS_IPOIB)
+		return -EINVAL;
+
+	rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+		    &iboe_gid);
+
+	memcpy(&gid, dev_addr->src_dev_addr +
+	       rdma_addr_gid_offset(dev_addr), sizeof(gid));
+
+	mutex_lock(&lock);
+	list_for_each_entry(cma_dev, &dev_list, list) {
+		rdma_for_each_port (cma_dev->device, port) {
+			gidp = rdma_protocol_roce(cma_dev->device, port) ?
+			       &iboe_gid : &gid;
+			gid_type = cma_dev->default_gid_type[port - 1];
+			sgid_attr = cma_validate_port(cma_dev->device, port,
+						      gid_type, gidp, id_priv);
+			if (!IS_ERR(sgid_attr)) {
+				id_priv->id.port_num = port;
+				cma_bind_sgid_attr(id_priv, sgid_attr);
+				cma_attach_to_dev(id_priv, cma_dev);
+				ret = 0;
+				goto out;
+			}
+		}
+	}
+out:
+	mutex_unlock(&lock);
+	return ret;
+}
+
+/**
+ * cma_ib_acquire_dev - Acquire cma device, port and SGID attribute
+ * @id_priv:		cm id to bind to cma device
+ * @listen_id_priv:	listener cm id to match against
+ * @req:		Pointer to req structure containing incoming
+ *			request information
+ * cma_ib_acquire_dev() acquires cma device, port and SGID attribute when
+ * rdma device matches for listen_id and incoming request. It also verifies
+ * that a GID table entry is present for the source address.
+ * Returns 0 on success, or an error code otherwise.
+ */
+static int cma_ib_acquire_dev(struct rdma_id_private *id_priv,
+			      const struct rdma_id_private *listen_id_priv,
+			      struct cma_req_info *req)
+{
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	const struct ib_gid_attr *sgid_attr;
+	enum ib_gid_type gid_type;
+	union ib_gid gid;
+
+	if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
+	    id_priv->id.ps == RDMA_PS_IPOIB)
+		return -EINVAL;
+
+	if (rdma_protocol_roce(req->device, req->port))
+		rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+			    &gid);
+	else
+		memcpy(&gid, dev_addr->src_dev_addr +
+		       rdma_addr_gid_offset(dev_addr), sizeof(gid));
+
+	gid_type = listen_id_priv->cma_dev->default_gid_type[req->port - 1];
+	sgid_attr = cma_validate_port(req->device, req->port,
+				      gid_type, &gid, id_priv);
+	if (IS_ERR(sgid_attr))
+		return PTR_ERR(sgid_attr);
+
+	id_priv->id.port_num = req->port;
+	cma_bind_sgid_attr(id_priv, sgid_attr);
+	/* Need to acquire lock to protect against readers
+	 * of cma_dev->id_list such as cma_netdev_callback() and
+	 * cma_process_remove().
+	 */
+	mutex_lock(&lock);
+	cma_attach_to_dev(id_priv, listen_id_priv->cma_dev);
+	mutex_unlock(&lock);
+	return 0;
+}
+
+static int cma_iw_acquire_dev(struct rdma_id_private *id_priv,
+			      const struct rdma_id_private *listen_id_priv)
 {
 	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
 	const struct ib_gid_attr *sgid_attr;
 	struct cma_device *cma_dev;
-	union ib_gid gid, iboe_gid, *gidp;
 	enum ib_gid_type gid_type;
 	int ret = -ENODEV;
+	union ib_gid gid;
 	u8 port;
 
 	if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
 	    id_priv->id.ps == RDMA_PS_IPOIB)
 		return -EINVAL;
 
-	mutex_lock(&lock);
-	rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
-		    &iboe_gid);
-
 	memcpy(&gid, dev_addr->src_dev_addr +
-	       rdma_addr_gid_offset(dev_addr), sizeof gid);
+	       rdma_addr_gid_offset(dev_addr), sizeof(gid));
 
-	if (listen_id_priv) {
-		cma_dev = listen_id_priv->cma_dev;
-		port = listen_id_priv->id.port_num;
-		gidp = rdma_protocol_roce(cma_dev->device, port) ?
-		       &iboe_gid : &gid;
-		gid_type = listen_id_priv->gid_type;
-		sgid_attr = cma_validate_port(cma_dev->device, port,
-					      gid_type, gidp, id_priv);
-		if (!IS_ERR(sgid_attr)) {
-			id_priv->id.port_num = port;
-			cma_bind_sgid_attr(id_priv, sgid_attr);
-			ret = 0;
-			goto out;
-		}
+	mutex_lock(&lock);
+
+	cma_dev = listen_id_priv->cma_dev;
+	port = listen_id_priv->id.port_num;
+	gid_type = listen_id_priv->gid_type;
+	sgid_attr = cma_validate_port(cma_dev->device, port,
+				      gid_type, &gid, id_priv);
+	if (!IS_ERR(sgid_attr)) {
+		id_priv->id.port_num = port;
+		cma_bind_sgid_attr(id_priv, sgid_attr);
+		ret = 0;
+		goto out;
 	}
 
 	list_for_each_entry(cma_dev, &dev_list, list) {
 		for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) {
-			if (listen_id_priv &&
-			    listen_id_priv->cma_dev == cma_dev &&
+			if (listen_id_priv->cma_dev == cma_dev &&
 			    listen_id_priv->id.port_num == port)
 				continue;
 
-			gidp = rdma_protocol_roce(cma_dev->device, port) ?
-			       &iboe_gid : &gid;
 			gid_type = cma_dev->default_gid_type[port - 1];
 			sgid_attr = cma_validate_port(cma_dev->device, port,
-						      gid_type, gidp, id_priv);
+						      gid_type, &gid, id_priv);
 			if (!IS_ERR(sgid_attr)) {
 				id_priv->id.port_num = port;
 				cma_bind_sgid_attr(id_priv, sgid_attr);
@@ -785,10 +883,7 @@
 	if (!id_priv)
 		return ERR_PTR(-ENOMEM);
 
-	if (caller)
-		id_priv->res.kern_name = caller;
-	else
-		rdma_restrack_set_task(&id_priv->res, current);
+	rdma_restrack_set_task(&id_priv->res, caller);
 	id_priv->res.type = RDMA_RESTRACK_CM_ID;
 	id_priv->state = RDMA_CM_IDLE;
 	id_priv->id.context = context;
@@ -796,6 +891,7 @@
 	id_priv->id.ps = ps;
 	id_priv->id.qp_type = qp_type;
 	id_priv->tos_set = false;
+	id_priv->timeout_set = false;
 	id_priv->gid_type = IB_GID_TYPE_IB;
 	spin_lock_init(&id_priv->lock);
 	mutex_init(&id_priv->qp_mutex);
@@ -1038,6 +1134,9 @@
 	} else
 		ret = -ENOSYS;
 
+	if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set)
+		qp_attr->timeout = id_priv->timeout;
+
 	return ret;
 }
 EXPORT_SYMBOL(rdma_init_qp_attr);
@@ -1078,18 +1177,31 @@
 	return cma_zero_addr(addr) || cma_loopback_addr(addr);
 }
 
-static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst)
+static int cma_addr_cmp(const struct sockaddr *src, const struct sockaddr *dst)
 {
 	if (src->sa_family != dst->sa_family)
 		return -1;
 
 	switch (src->sa_family) {
 	case AF_INET:
-		return ((struct sockaddr_in *) src)->sin_addr.s_addr !=
-		       ((struct sockaddr_in *) dst)->sin_addr.s_addr;
-	case AF_INET6:
-		return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr,
-				     &((struct sockaddr_in6 *) dst)->sin6_addr);
+		return ((struct sockaddr_in *)src)->sin_addr.s_addr !=
+		       ((struct sockaddr_in *)dst)->sin_addr.s_addr;
+	case AF_INET6: {
+		struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)src;
+		struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst;
+		bool link_local;
+
+		if (ipv6_addr_cmp(&src_addr6->sin6_addr,
+					  &dst_addr6->sin6_addr))
+			return 1;
+		link_local = ipv6_addr_type(&dst_addr6->sin6_addr) &
+			     IPV6_ADDR_LINKLOCAL;
+		/* Link local must match their scope_ids */
+		return link_local ? (src_addr6->sin6_scope_id !=
+				     dst_addr6->sin6_scope_id) :
+				    0;
+	}
+
 	default:
 		return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr,
 				   &((struct sockaddr_ib *) dst)->sib_addr);
@@ -1374,6 +1486,7 @@
 roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event)
 {
 	const struct ib_gid_attr *sgid_attr = NULL;
+	struct net_device *ndev;
 
 	if (ib_event->event == IB_CM_REQ_RECEIVED)
 		sgid_attr = ib_event->param.req_rcvd.ppath_sgid_attr;
@@ -1382,8 +1495,15 @@
 
 	if (!sgid_attr)
 		return NULL;
-	dev_hold(sgid_attr->ndev);
-	return sgid_attr->ndev;
+
+	rcu_read_lock();
+	ndev = rdma_read_gid_attr_ndev_rcu(sgid_attr);
+	if (IS_ERR(ndev))
+		ndev = NULL;
+	else
+		dev_hold(ndev);
+	rcu_read_unlock();
+	return ndev;
 }
 
 static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event,
@@ -1462,18 +1582,35 @@
 	return rdma_protocol_roce(device, port_num);
 }
 
+static bool cma_is_req_ipv6_ll(const struct cma_req_info *req)
+{
+	const struct sockaddr *daddr =
+			(const struct sockaddr *)&req->listen_addr_storage;
+	const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr;
+
+	/* Returns true if the req is for IPv6 link local */
+	return (daddr->sa_family == AF_INET6 &&
+		(ipv6_addr_type(&daddr6->sin6_addr) & IPV6_ADDR_LINKLOCAL));
+}
+
 static bool cma_match_net_dev(const struct rdma_cm_id *id,
 			      const struct net_device *net_dev,
-			      u8 port_num)
+			      const struct cma_req_info *req)
 {
 	const struct rdma_addr *addr = &id->route.addr;
 
 	if (!net_dev)
 		/* This request is an AF_IB request */
-		return (!id->port_num || id->port_num == port_num) &&
+		return (!id->port_num || id->port_num == req->port) &&
 		       (addr->src_addr.ss_family == AF_IB);
 
 	/*
+	 * If the request is not for IPv6 link local, allow matching the
+	 * request to any netdevice of a single or multi-port rdma device.
+	 */
+	if (!cma_is_req_ipv6_ll(req))
+		return true;
+	/*
 	 * Net namespaces must match, and if the listener is listening
 	 * on a specific netdevice then the netdevice must match as well.
 	 */
@@ -1500,13 +1637,14 @@
 	hlist_for_each_entry(id_priv, &bind_list->owners, node) {
 		if (cma_match_private_data(id_priv, ib_event->private_data)) {
 			if (id_priv->id.device == cm_id->device &&
-			    cma_match_net_dev(&id_priv->id, net_dev, req->port))
+			    cma_match_net_dev(&id_priv->id, net_dev, req))
 				return id_priv;
 			list_for_each_entry(id_priv_dev,
 					    &id_priv->listen_list,
 					    listen_list) {
 				if (id_priv_dev->id.device == cm_id->device &&
-				    cma_match_net_dev(&id_priv_dev->id, net_dev, req->port))
+				    cma_match_net_dev(&id_priv_dev->id,
+						      net_dev, req))
 					return id_priv_dev;
 			}
 		}
@@ -1518,18 +1656,18 @@
 static struct rdma_id_private *
 cma_ib_id_from_event(struct ib_cm_id *cm_id,
 		     const struct ib_cm_event *ib_event,
+		     struct cma_req_info *req,
 		     struct net_device **net_dev)
 {
-	struct cma_req_info req;
 	struct rdma_bind_list *bind_list;
 	struct rdma_id_private *id_priv;
 	int err;
 
-	err = cma_save_req_info(ib_event, &req);
+	err = cma_save_req_info(ib_event, req);
 	if (err)
 		return ERR_PTR(err);
 
-	*net_dev = cma_get_net_dev(ib_event, &req);
+	*net_dev = cma_get_net_dev(ib_event, req);
 	if (IS_ERR(*net_dev)) {
 		if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) {
 			/* Assuming the protocol is AF_IB */
@@ -1567,17 +1705,17 @@
 		}
 
 		if (!validate_net_dev(*net_dev,
-				 (struct sockaddr *)&req.listen_addr_storage,
-				 (struct sockaddr *)&req.src_addr_storage)) {
+				 (struct sockaddr *)&req->listen_addr_storage,
+				 (struct sockaddr *)&req->src_addr_storage)) {
 			id_priv = ERR_PTR(-EHOSTUNREACH);
 			goto err;
 		}
 	}
 
 	bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net,
-				rdma_ps_from_service_id(req.service_id),
-				cma_port_from_service_id(req.service_id));
-	id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev);
+				rdma_ps_from_service_id(req->service_id),
+				cma_port_from_service_id(req->service_id));
+	id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev);
 err:
 	rcu_read_unlock();
 	if (IS_ERR(id_priv) && *net_dev) {
@@ -1710,8 +1848,8 @@
 	mutex_lock(&id_priv->handler_mutex);
 	mutex_unlock(&id_priv->handler_mutex);
 
+	rdma_restrack_del(&id_priv->res);
 	if (id_priv->cma_dev) {
-		rdma_restrack_del(&id_priv->res);
 		if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
 			if (id_priv->cm_id.ib)
 				ib_destroy_cm_id(id_priv->cm_id.ib);
@@ -1902,7 +2040,7 @@
 		rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
 
 	if (net_dev) {
-		rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL);
+		rdma_copy_src_l2_addr(&rt->addr.dev_addr, net_dev);
 	} else {
 		if (!cma_protocol_roce(listen_id) &&
 		    cma_any_addr(cma_src_addr(id_priv))) {
@@ -1952,7 +2090,7 @@
 		goto err;
 
 	if (net_dev) {
-		rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL);
+		rdma_copy_src_l2_addr(&id->route.addr.dev_addr, net_dev);
 	} else {
 		if (!cma_any_addr(cma_src_addr(id_priv))) {
 			ret = cma_translate_addr(cma_src_addr(id_priv),
@@ -1999,11 +2137,12 @@
 {
 	struct rdma_id_private *listen_id, *conn_id = NULL;
 	struct rdma_cm_event event = {};
+	struct cma_req_info req = {};
 	struct net_device *net_dev;
 	u8 offset;
 	int ret;
 
-	listen_id = cma_ib_id_from_event(cm_id, ib_event, &net_dev);
+	listen_id = cma_ib_id_from_event(cm_id, ib_event, &req, &net_dev);
 	if (IS_ERR(listen_id))
 		return PTR_ERR(listen_id);
 
@@ -2036,7 +2175,7 @@
 	}
 
 	mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
-	ret = cma_acquire_dev(conn_id, listen_id);
+	ret = cma_ib_acquire_dev(conn_id, listen_id, &req);
 	if (ret)
 		goto err2;
 
@@ -2232,7 +2371,7 @@
 		goto out;
 	}
 
-	ret = cma_acquire_dev(conn_id, listen_id);
+	ret = cma_iw_acquire_dev(conn_id, listen_id);
 	if (ret) {
 		mutex_unlock(&conn_id->handler_mutex);
 		rdma_destroy_id(new_cm_id);
@@ -2257,9 +2396,10 @@
 		conn_id->cm_id.iw = NULL;
 		cma_exch(conn_id, RDMA_CM_DESTROYING);
 		mutex_unlock(&conn_id->handler_mutex);
+		mutex_unlock(&listen_id->handler_mutex);
 		cma_deref_id(conn_id);
 		rdma_destroy_id(&conn_id->id);
-		goto out;
+		return ret;
 	}
 
 	mutex_unlock(&conn_id->handler_mutex);
@@ -2299,6 +2439,7 @@
 		return PTR_ERR(id);
 
 	id->tos = id_priv->tos;
+	id->tos_set = id_priv->tos_set;
 	id_priv->cm_id.iw = id;
 
 	memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv),
@@ -2351,11 +2492,13 @@
 	atomic_inc(&id_priv->refcount);
 	dev_id_priv->internal_id = 1;
 	dev_id_priv->afonly = id_priv->afonly;
+	dev_id_priv->tos_set = id_priv->tos_set;
+	dev_id_priv->tos = id_priv->tos;
 
 	ret = rdma_listen(id, id_priv->backlog);
 	if (ret)
-		pr_warn("RDMA CMA: cma_listen_on_dev, error %d, listening on device %s\n",
-			ret, cma_dev->device->name);
+		dev_warn(&cma_dev->device->dev,
+			 "RDMA CMA: cma_listen_on_dev, error %d\n", ret);
 }
 
 static void cma_listen_on_all(struct rdma_id_private *id_priv)
@@ -2379,6 +2522,34 @@
 }
 EXPORT_SYMBOL(rdma_set_service_type);
 
+/**
+ * rdma_set_ack_timeout() - Set the ack timeout of QP associated
+ *                          with a connection identifier.
+ * @id: Communication identifier associated with the QP.
+ * @timeout: Ack timeout to set on the QP, expressed as 4.096 * 2^(timeout) usec.
+ *
+ * This function should be called before rdma_connect() on the active side,
+ * and before rdma_accept() on the passive side. It is applicable to the
+ * primary path only. The timeout affects only the local side of the QP; it
+ * is not negotiated with the remote side, and zero disables the timer.
+ *
+ * Return: 0 for success
+ */
+int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout)
+{
+	struct rdma_id_private *id_priv;
+
+	if (id->qp_type != IB_QPT_RC)
+		return -EINVAL;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	id_priv->timeout = timeout;
+	id_priv->timeout_set = true;
+
+	return 0;
+}
+EXPORT_SYMBOL(rdma_set_ack_timeout);
+
 static void cma_query_handler(int status, struct sa_path_rec *path_rec,
 			      void *context)
 {
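As a rough usage sketch (hypothetical ULP code, not taken from this patch), a consumer of an RC connection would pick the ack timeout before initiating the connect; the value is the IB timeout exponent, so 14 means roughly 4.096 usec * 2^14, about 67 ms:

    #include <rdma/rdma_cm.h>

    static int demo_connect_with_timeout(struct rdma_cm_id *id,
                                         struct rdma_conn_param *param)
    {
            int ret;

            /* Only valid for IB_QPT_RC ids; -EINVAL otherwise. */
            ret = rdma_set_ack_timeout(id, 14);
            if (ret)
                    return ret;

            return rdma_connect(id, param);
    }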
@@ -2402,8 +2573,8 @@
 	queue_work(cma_wq, &work->work);
 }
 
-static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
-			      struct cma_work *work)
+static int cma_query_ib_route(struct rdma_id_private *id_priv,
+			      unsigned long timeout_ms, struct cma_work *work)
 {
 	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
 	struct sa_path_rec path_rec;
@@ -2521,7 +2692,8 @@
 	work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
 }
 
-static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
+static int cma_resolve_ib_route(struct rdma_id_private *id_priv,
+				unsigned long timeout_ms)
 {
 	struct rdma_route *route = &id_priv->id.route;
 	struct cma_work *work;
@@ -2643,7 +2815,7 @@
 }
 EXPORT_SYMBOL(rdma_set_ib_path);
 
-static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
+static int cma_resolve_iw_route(struct rdma_id_private *id_priv)
 {
 	struct cma_work *work;
 
@@ -2744,7 +2916,7 @@
 	return ret;
 }
 
-int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
+int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms)
 {
 	struct rdma_id_private *id_priv;
 	int ret;
@@ -2759,7 +2931,7 @@
 	else if (rdma_protocol_roce(id->device, id->port_num))
 		ret = cma_resolve_iboe_route(id_priv);
 	else if (rdma_protocol_iwarp(id->device, id->port_num))
-		ret = cma_resolve_iw_route(id_priv, timeout_ms);
+		ret = cma_resolve_iw_route(id_priv);
 	else
 		ret = -ENOSYS;
 
@@ -2854,23 +3026,34 @@
 {
 	struct rdma_id_private *id_priv = context;
 	struct rdma_cm_event event = {};
+	struct sockaddr *addr;
+	struct sockaddr_storage old_addr;
 
 	mutex_lock(&id_priv->handler_mutex);
 	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY,
 			   RDMA_CM_ADDR_RESOLVED))
 		goto out;
 
-	memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr));
+	/*
+	 * Store the previous src address, so that if we fail to acquire a
+	 * matching rdma device, the old address can be restored, which helps
+	 * cancel the cma listen operation correctly.
+	 */
+	addr = cma_src_addr(id_priv);
+	memcpy(&old_addr, addr, rdma_addr_size(addr));
+	memcpy(addr, src_addr, rdma_addr_size(src_addr));
 	if (!status && !id_priv->cma_dev) {
-		status = cma_acquire_dev(id_priv, NULL);
+		status = cma_acquire_dev_by_src_ip(id_priv);
 		if (status)
 			pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n",
 					     status);
-	} else {
+	} else if (status) {
 		pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. status %d\n", status);
 	}
 
 	if (status) {
+		memcpy(addr, &old_addr,
+		       rdma_addr_size((struct sockaddr *)&old_addr));
 		if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
 				   RDMA_CM_ADDR_BOUND))
 			goto out;
@@ -2882,13 +3065,11 @@
 	if (id_priv->id.event_handler(&id_priv->id, &event)) {
 		cma_exch(id_priv, RDMA_CM_DESTROYING);
 		mutex_unlock(&id_priv->handler_mutex);
-		cma_deref_id(id_priv);
 		rdma_destroy_id(&id_priv->id);
 		return;
 	}
 out:
 	mutex_unlock(&id_priv->handler_mutex);
-	cma_deref_id(id_priv);
 }
 
 static int cma_resolve_loopback(struct rdma_id_private *id_priv)
@@ -2966,7 +3147,7 @@
 }
 
 int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
-		      const struct sockaddr *dst_addr, int timeout_ms)
+		      const struct sockaddr *dst_addr, unsigned long timeout_ms)
 {
 	struct rdma_id_private *id_priv;
 	int ret;
@@ -2985,16 +3166,16 @@
 		return -EINVAL;
 
 	memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr));
-	atomic_inc(&id_priv->refcount);
 	if (cma_any_addr(dst_addr)) {
 		ret = cma_resolve_loopback(id_priv);
 	} else {
 		if (dst_addr->sa_family == AF_IB) {
 			ret = cma_resolve_ib_addr(id_priv);
 		} else {
-			ret = rdma_resolve_ip(cma_src_addr(id_priv),
-					      dst_addr, &id->route.addr.dev_addr,
-					      timeout_ms, addr_handler, id_priv);
+			ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr,
+					      &id->route.addr.dev_addr,
+					      timeout_ms, addr_handler,
+					      false, id_priv);
 		}
 	}
 	if (ret)
@@ -3003,7 +3184,6 @@
 	return 0;
 err:
 	cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND);
-	cma_deref_id(id_priv);
 	return ret;
 }
 EXPORT_SYMBOL(rdma_resolve_addr);
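With the timeout now an unsigned long count of milliseconds, a caller sketch (hypothetical names) looks like the following; resolution remains asynchronous, and RDMA_CM_EVENT_ADDR_RESOLVED or RDMA_CM_EVENT_ADDR_ERROR is delivered to the cm_id's event handler:

    #include <rdma/rdma_cm.h>

    static int demo_start_resolve(struct rdma_cm_id *id,
                                  struct sockaddr *src,
                                  struct sockaddr *dst)
    {
            /* 2000 ms budget; the result arrives via the event handler,
             * which then typically calls rdma_resolve_route(). */
            return rdma_resolve_addr(id, src, dst, 2000);
    }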
@@ -3093,7 +3273,7 @@
 		goto err;
 
 	bind_list->ps = ps;
-	bind_list->port = (unsigned short)ret;
+	bind_list->port = snum;
 	cma_bind_port(bind_list, id_priv);
 	return 0;
 err:
@@ -3414,7 +3594,7 @@
 		if (ret)
 			goto err1;
 
-		ret = cma_acquire_dev(id_priv, NULL);
+		ret = cma_acquire_dev_by_src_ip(id_priv);
 		if (ret)
 			goto err1;
 	}
@@ -3439,10 +3619,9 @@
 
 	return 0;
 err2:
-	if (id_priv->cma_dev) {
-		rdma_restrack_del(&id_priv->res);
+	rdma_restrack_del(&id_priv->res);
+	if (id_priv->cma_dev)
 		cma_release_dev(id_priv);
-	}
 err1:
 	cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE);
 	return ret;
@@ -3690,6 +3869,7 @@
 		return PTR_ERR(cm_id);
 
 	cm_id->tos = id_priv->tos;
+	cm_id->tos_set = id_priv->tos_set;
 	id_priv->cm_id.iw = cm_id;
 
 	memcpy(&cm_id->local_addr, cma_src_addr(id_priv),
@@ -3839,10 +4019,7 @@
 
 	id_priv = container_of(id, struct rdma_id_private, id);
 
-	if (caller)
-		id_priv->res.kern_name = caller;
-	else
-		rdma_restrack_set_task(&id_priv->res, current);
+	rdma_restrack_set_task(&id_priv->res, caller);
 
 	if (!cma_comp(id_priv, RDMA_CM_CONNECT))
 		return -EINVAL;
@@ -4087,9 +4264,10 @@
 	    (!ib_sa_sendonly_fullmem_support(&sa_client,
 					     id_priv->id.device,
 					     id_priv->id.port_num))) {
-		pr_warn("RDMA CM: %s port %u Unable to multicast join\n"
-			"RDMA CM: SM doesn't support Send Only Full Member option\n",
-			id_priv->id.device->name, id_priv->id.port_num);
+		dev_warn(
+			&id_priv->id.device->dev,
+			"RDMA CM: port %u Unable to multicast join: SM doesn't support Send Only Full Member option\n",
+			id_priv->id.port_num);
 		return -EOPNOTSUPP;
 	}
 
@@ -4395,7 +4573,7 @@
 	if (!cma_dev->default_roce_tos)
 		goto free_gid_type;
 
-	for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
+	rdma_for_each_port (device, i) {
 		supported_gids = roce_gid_type_mask_support(device, i);
 		WARN_ON(!supported_gids);
 		if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE))
@@ -4499,93 +4677,14 @@
 	kfree(cma_dev);
 }
 
-static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb)
-{
-	struct nlmsghdr *nlh;
-	struct rdma_cm_id_stats *id_stats;
-	struct rdma_id_private *id_priv;
-	struct rdma_cm_id *id = NULL;
-	struct cma_device *cma_dev;
-	int i_dev = 0, i_id = 0;
-
-	/*
-	 * We export all of the IDs as a sequence of messages.  Each
-	 * ID gets its own netlink message.
-	 */
-	mutex_lock(&lock);
-
-	list_for_each_entry(cma_dev, &dev_list, list) {
-		if (i_dev < cb->args[0]) {
-			i_dev++;
-			continue;
-		}
-
-		i_id = 0;
-		list_for_each_entry(id_priv, &cma_dev->id_list, list) {
-			if (i_id < cb->args[1]) {
-				i_id++;
-				continue;
-			}
-
-			id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq,
-						sizeof *id_stats, RDMA_NL_RDMA_CM,
-						RDMA_NL_RDMA_CM_ID_STATS,
-						NLM_F_MULTI);
-			if (!id_stats)
-				goto out;
-
-			memset(id_stats, 0, sizeof *id_stats);
-			id = &id_priv->id;
-			id_stats->node_type = id->route.addr.dev_addr.dev_type;
-			id_stats->port_num = id->port_num;
-			id_stats->bound_dev_if =
-				id->route.addr.dev_addr.bound_dev_if;
-
-			if (ibnl_put_attr(skb, nlh,
-					  rdma_addr_size(cma_src_addr(id_priv)),
-					  cma_src_addr(id_priv),
-					  RDMA_NL_RDMA_CM_ATTR_SRC_ADDR))
-				goto out;
-			if (ibnl_put_attr(skb, nlh,
-					  rdma_addr_size(cma_dst_addr(id_priv)),
-					  cma_dst_addr(id_priv),
-					  RDMA_NL_RDMA_CM_ATTR_DST_ADDR))
-				goto out;
-
-			id_stats->pid	= task_pid_vnr(id_priv->res.task);
-			id_stats->port_space	= id->ps;
-			id_stats->cm_state	= id_priv->state;
-			id_stats->qp_num	= id_priv->qp_num;
-			id_stats->qp_type	= id->qp_type;
-
-			i_id++;
-			nlmsg_end(skb, nlh);
-		}
-
-		cb->args[1] = 0;
-		i_dev++;
-	}
-
-out:
-	mutex_unlock(&lock);
-	cb->args[0] = i_dev;
-	cb->args[1] = i_id;
-
-	return skb->len;
-}
-
-static const struct rdma_nl_cbs cma_cb_table[RDMA_NL_RDMA_CM_NUM_OPS] = {
-	[RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats},
-};
-
 static int cma_init_net(struct net *net)
 {
 	struct cma_pernet *pernet = cma_pernet(net);
 
-	idr_init(&pernet->tcp_ps);
-	idr_init(&pernet->udp_ps);
-	idr_init(&pernet->ipoib_ps);
-	idr_init(&pernet->ib_ps);
+	xa_init(&pernet->tcp_ps);
+	xa_init(&pernet->udp_ps);
+	xa_init(&pernet->ipoib_ps);
+	xa_init(&pernet->ib_ps);
 
 	return 0;
 }
@@ -4594,10 +4693,10 @@
 {
 	struct cma_pernet *pernet = cma_pernet(net);
 
-	idr_destroy(&pernet->tcp_ps);
-	idr_destroy(&pernet->udp_ps);
-	idr_destroy(&pernet->ipoib_ps);
-	idr_destroy(&pernet->ib_ps);
+	WARN_ON(!xa_empty(&pernet->tcp_ps));
+	WARN_ON(!xa_empty(&pernet->udp_ps));
+	WARN_ON(!xa_empty(&pernet->ipoib_ps));
+	WARN_ON(!xa_empty(&pernet->ib_ps));
 }
 
 static struct pernet_operations cma_pernet_operations = {
@@ -4626,11 +4725,14 @@
 	if (ret)
 		goto err;
 
-	rdma_nl_register(RDMA_NL_RDMA_CM, cma_cb_table);
-	cma_configfs_init();
+	ret = cma_configfs_init();
+	if (ret)
+		goto err_ib;
 
 	return 0;
 
+err_ib:
+	ib_unregister_client(&cma_client);
 err:
 	unregister_netdevice_notifier(&cma_nb);
 	ib_sa_unregister_client(&sa_client);
@@ -4642,7 +4744,6 @@
 static void __exit cma_cleanup(void)
 {
 	cma_configfs_exit();
-	rdma_nl_unregister(RDMA_NL_RDMA_CM);
 	ib_unregister_client(&cma_client);
 	unregister_netdevice_notifier(&cma_nb);
 	ib_sa_unregister_client(&sa_client);
@@ -4650,7 +4751,5 @@
 	destroy_workqueue(cma_wq);
 }
 
-MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_RDMA_CM, 1);
-
 module_init(cma_init);
 module_exit(cma_cleanup);
diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c
index eee38b4..8b0b5ae 100644
--- a/drivers/infiniband/core/cma_configfs.c
+++ b/drivers/infiniband/core/cma_configfs.c
@@ -33,7 +33,10 @@
 #include <linux/module.h>
 #include <linux/configfs.h>
 #include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+
 #include "core_priv.h"
+#include "cma_priv.h"
 
 struct cma_device;
 
@@ -65,7 +68,7 @@
 
 static bool filter_by_name(struct ib_device *ib_dev, void *cookie)
 {
-	return !strcmp(ib_dev->name, cookie);
+	return !strcmp(dev_name(&ib_dev->dev), cookie);
 }
 
 static int cma_configfs_params_get(struct config_item *item,
@@ -339,12 +342,18 @@
 
 int __init cma_configfs_init(void)
 {
+	int ret;
+
 	config_group_init(&cma_subsys.su_group);
 	mutex_init(&cma_subsys.su_mutex);
-	return configfs_register_subsystem(&cma_subsys);
+	ret = configfs_register_subsystem(&cma_subsys);
+	if (ret)
+		mutex_destroy(&cma_subsys.su_mutex);
+	return ret;
 }
 
 void __exit cma_configfs_exit(void)
 {
 	configfs_unregister_subsystem(&cma_subsys);
+	mutex_destroy(&cma_subsys.su_mutex);
 }
diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h
index 194cfe7..ca73072 100644
--- a/drivers/infiniband/core/cma_priv.h
+++ b/drivers/infiniband/core/cma_priv.h
@@ -84,9 +84,11 @@
 	u32			options;
 	u8			srq;
 	u8			tos;
-	bool			tos_set;
+	u8			tos_set:1;
+	u8			timeout_set:1;
 	u8			reuseaddr;
 	u8			afonly;
+	u8			timeout;
 	enum ib_gid_type	gid_type;
 
 	/*
@@ -94,4 +96,32 @@
 	 */
 	struct rdma_restrack_entry     res;
 };
+
+#if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS)
+int cma_configfs_init(void);
+void cma_configfs_exit(void);
+#else
+static inline int cma_configfs_init(void)
+{
+	return 0;
+}
+
+static inline void cma_configfs_exit(void)
+{
+}
+#endif
+
+void cma_ref_dev(struct cma_device *dev);
+void cma_deref_dev(struct cma_device *dev);
+typedef bool (*cma_device_filter)(struct ib_device *, void *);
+struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter,
+					     void *cookie);
+int cma_get_default_gid_type(struct cma_device *dev, unsigned int port);
+int cma_set_default_gid_type(struct cma_device *dev, unsigned int port,
+			     enum ib_gid_type default_gid_type);
+int cma_get_default_roce_tos(struct cma_device *dev, unsigned int port);
+int cma_set_default_roce_tos(struct cma_device *dev, unsigned int port,
+			     u8 default_roce_tos);
+struct ib_device *cma_get_ib_dev(struct cma_device *dev);
+
 #endif /* _CMA_PRIV_H */
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 77c7005..9d07378 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -36,6 +36,8 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/cgroup_rdma.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
 
 #include <rdma/ib_verbs.h>
 #include <rdma/opa_addr.h>
@@ -44,7 +46,7 @@
 #include "mad_priv.h"
 
 /* Total number of ports combined across all struct ib_devices's */
-#define RDMA_MAX_PORTS 1024
+#define RDMA_MAX_PORTS 8192
 
 struct pkey_index_qp_list {
 	struct list_head    pkey_index_list;
@@ -54,39 +56,31 @@
 	struct list_head    qp_list;
 };
 
-#if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS)
-int cma_configfs_init(void);
-void cma_configfs_exit(void);
-#else
-static inline int cma_configfs_init(void)
+/**
+ * struct rdma_dev_net - rdma net namespace metadata for a net
+ * @nl_sock:	Pointer to netlink socket
+ * @net:	Pointer to owner net namespace
+ * @id:		xarray id to identify the net namespace.
+ */
+struct rdma_dev_net {
+	struct sock *nl_sock;
+	possible_net_t net;
+	u32 id;
+};
+
+extern const struct attribute_group ib_dev_attr_group;
+extern bool ib_devices_shared_netns;
+extern unsigned int rdma_dev_net_id;
+
+static inline struct rdma_dev_net *rdma_net_to_dev_net(struct net *net)
 {
-	return 0;
+	return net_generic(net, rdma_dev_net_id);
 }
 
-static inline void cma_configfs_exit(void)
-{
-}
-#endif
-struct cma_device;
-void cma_ref_dev(struct cma_device *cma_dev);
-void cma_deref_dev(struct cma_device *cma_dev);
-typedef bool (*cma_device_filter)(struct ib_device *, void *);
-struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter	filter,
-					     void		*cookie);
-int cma_get_default_gid_type(struct cma_device *cma_dev,
-			     unsigned int port);
-int cma_set_default_gid_type(struct cma_device *cma_dev,
-			     unsigned int port,
-			     enum ib_gid_type default_gid_type);
-int cma_get_default_roce_tos(struct cma_device *cma_dev, unsigned int port);
-int cma_set_default_roce_tos(struct cma_device *a_dev, unsigned int port,
-			     u8 default_roce_tos);
-struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev);
-
-int  ib_device_register_sysfs(struct ib_device *device,
-			      int (*port_callback)(struct ib_device *,
-						   u8, struct kobject *));
+int ib_device_register_sysfs(struct ib_device *device);
 void ib_device_unregister_sysfs(struct ib_device *device);
+int ib_device_rename(struct ib_device *ibdev, const char *name);
+int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim);
 
 typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
 	      struct net_device *idev, void *cookie);
@@ -94,6 +88,9 @@
 typedef bool (*roce_netdev_filter)(struct ib_device *device, u8 port,
 				   struct net_device *idev, void *cookie);
 
+struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
+					unsigned int port);
+
 void ib_enum_roce_netdev(struct ib_device *ib_dev,
 			 roce_netdev_filter filter,
 			 void *filter_cookie,
@@ -112,6 +109,15 @@
 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
 		     struct netlink_callback *cb);
 
+struct ib_client_nl_info {
+	struct sk_buff *nl_msg;
+	struct device *cdev;
+	unsigned int port;
+	u64 abi;
+};
+int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name,
+			  struct ib_client_nl_info *res);
+
 enum ib_cache_gid_default_mode {
 	IB_CACHE_GID_DEFAULT_MODE_SET,
 	IB_CACHE_GID_DEFAULT_MODE_DELETE
@@ -145,7 +151,7 @@
 void ib_cache_release_one(struct ib_device *device);
 
 #ifdef CONFIG_CGROUP_RDMA
-int ib_device_register_rdmacg(struct ib_device *device);
+void ib_device_register_rdmacg(struct ib_device *device);
 void ib_device_unregister_rdmacg(struct ib_device *device);
 
 int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
@@ -156,21 +162,26 @@
 			struct ib_device *device,
 			enum rdmacg_resource_type resource_index);
 #else
-static inline int ib_device_register_rdmacg(struct ib_device *device)
-{ return 0; }
+static inline void ib_device_register_rdmacg(struct ib_device *device)
+{
+}
 
 static inline void ib_device_unregister_rdmacg(struct ib_device *device)
-{ }
+{
+}
 
 static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
 				       struct ib_device *device,
 				       enum rdmacg_resource_type resource_index)
-{ return 0; }
+{
+	return 0;
+}
 
 static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
 				      struct ib_device *device,
 				      enum rdmacg_resource_type resource_index)
-{ }
+{
+}
 #endif
 
 static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
@@ -188,7 +199,7 @@
 int ib_sa_init(void);
 void ib_sa_cleanup(void);
 
-int rdma_nl_init(void);
+void rdma_nl_init(void);
 void rdma_nl_exit(void);
 
 int ib_nl_handle_resolve_resp(struct sk_buff *skb,
@@ -206,7 +217,7 @@
 				u64              *sn_pfx);
 
 #ifdef CONFIG_SECURITY_INFINIBAND
-void ib_security_destroy_port_pkey_list(struct ib_device *device);
+void ib_security_release_port_pkey_list(struct ib_device *device);
 
 void ib_security_cache_change(struct ib_device *device,
 			      u8 port_num,
@@ -227,8 +238,9 @@
 				enum ib_qp_type qp_type);
 void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent);
 int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index);
+void ib_mad_agent_security_change(void);
 #else
-static inline void ib_security_destroy_port_pkey_list(struct ib_device *device)
+static inline void ib_security_release_port_pkey_list(struct ib_device *device)
 {
 }
 
@@ -243,10 +255,10 @@
 					int qp_attr_mask,
 					struct ib_udata *udata)
 {
-	return qp->device->modify_qp(qp->real_qp,
-				     qp_attr,
-				     qp_attr_mask,
-				     udata);
+	return qp->device->ops.modify_qp(qp->real_qp,
+					 qp_attr,
+					 qp_attr_mask,
+					 udata);
 }
 
 static inline int ib_create_qp_security(struct ib_qp *qp,
@@ -292,9 +304,14 @@
 {
 	return 0;
 }
+
+static inline void ib_mad_agent_security_change(void)
+{
+}
 #endif
 
-struct ib_device *ib_device_get_by_index(u32 ifindex);
+struct ib_device *ib_device_get_by_index(const struct net *net, u32 index);
+
 /* RDMA device netlink */
 void nldev_init(void);
 void nldev_exit(void);
@@ -305,26 +322,33 @@
 					  struct ib_udata *udata,
 					  struct ib_uobject *uobj)
 {
+	enum ib_qp_type qp_type = attr->qp_type;
 	struct ib_qp *qp;
+	bool is_xrc;
 
-	if (!dev->create_qp)
+	if (!dev->ops.create_qp)
 		return ERR_PTR(-EOPNOTSUPP);
 
-	qp = dev->create_qp(pd, attr, udata);
+	qp = dev->ops.create_qp(pd, attr, udata);
 	if (IS_ERR(qp))
 		return qp;
 
 	qp->device = dev;
 	qp->pd = pd;
 	qp->uobject = uobj;
+	qp->real_qp = qp;
 	/*
 	 * We don't track XRC QPs for now, because they don't have PD
 	 * and more importantly they are created internally by the driver,
 	 * see mlx5 create_dev_resources() as an example.
 	 */
-	if (attr->qp_type < IB_QPT_XRC_INI) {
+	is_xrc = qp_type == IB_QPT_XRC_INI || qp_type == IB_QPT_XRC_TGT;
+	if ((qp_type < IB_QPT_MAX && !is_xrc) || qp_type == IB_QPT_DRIVER) {
 		qp->res.type = RDMA_RESTRACK_QP;
-		rdma_restrack_add(&qp->res);
+		if (uobj)
+			rdma_restrack_uadd(&qp->res);
+		else
+			rdma_restrack_kadd(&qp->res);
 	} else
 		qp->res.valid = false;
 
@@ -338,7 +362,30 @@
 
 int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
 				 const union ib_gid *dgid,
-				 u8 *dmac, const struct net_device *ndev,
+				 u8 *dmac, const struct ib_gid_attr *sgid_attr,
 				 int *hoplimit);
+void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
+			   const struct net_device *dev);
 
+struct sa_path_rec;
+int roce_resolve_route_from_path(struct sa_path_rec *rec,
+				 const struct ib_gid_attr *attr);
+
+struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr);
+
+void ib_free_port_attrs(struct ib_core_device *coredev);
+int ib_setup_port_attrs(struct ib_core_device *coredev);
+
+int rdma_compatdev_set(u8 enable);
+
+int ib_port_register_module_stat(struct ib_device *device, u8 port_num,
+				 struct kobject *kobj, struct kobj_type *ktype,
+				 const char *name);
+void ib_port_unregister_module_stat(struct kobject *kobj);
+
+int ib_device_set_netns_put(struct sk_buff *skb,
+			    struct ib_device *dev, u32 ns_fd);
+
+int rdma_nl_net_init(struct rdma_dev_net *rnet);
+void rdma_nl_net_exit(struct rdma_dev_net *rnet);
 #endif /* _CORE_PRIV_H */
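The rdma_net_to_dev_net() helper added above is the usual net_generic() idiom; a minimal, self-contained sketch of that idiom with illustrative names (not the rdma core's actual registration code) is:

    #include <linux/module.h>
    #include <net/net_namespace.h>
    #include <net/netns/generic.h>

    struct demo_net {
            u32 cookie;
    };

    static unsigned int demo_net_id;

    static struct pernet_operations demo_net_ops = {
            .id   = &demo_net_id,
            .size = sizeof(struct demo_net),    /* allocated per namespace */
    };

    static struct demo_net *demo_net(struct net *net)
    {
            return net_generic(net, demo_net_id);
    }

    static int __init demo_init(void)
    {
            return register_pernet_device(&demo_net_ops);
    }

    static void __exit demo_exit(void)
    {
            unregister_pernet_device(&demo_net_ops);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");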
diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
new file mode 100644
index 0000000..680ad27
--- /dev/null
+++ b/drivers/infiniband/core/counters.c
@@ -0,0 +1,641 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
+ */
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_counter.h>
+
+#include "core_priv.h"
+#include "restrack.h"
+
+#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE)
+
+static int __counter_set_mode(struct rdma_counter_mode *curr,
+			      enum rdma_nl_counter_mode new_mode,
+			      enum rdma_nl_counter_mask new_mask)
+{
+	if ((new_mode == RDMA_COUNTER_MODE_AUTO) &&
+	    ((new_mask & (~ALL_AUTO_MODE_MASKS)) ||
+	     (curr->mode != RDMA_COUNTER_MODE_NONE)))
+		return -EINVAL;
+
+	curr->mode = new_mode;
+	curr->mask = new_mask;
+	return 0;
+}
+
+/**
+ * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode
+ *
+ * When @on is true, @mask must be set; when @on is false, the port goes
+ * into manual mode if any counters exist, so that the user is still able
+ * to access them manually.
+ */
+int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
+			       bool on, enum rdma_nl_counter_mask mask)
+{
+	struct rdma_port_counter *port_counter;
+	int ret;
+
+	port_counter = &dev->port_data[port].port_counter;
+	if (!port_counter->hstats)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&port_counter->lock);
+	if (on) {
+		ret = __counter_set_mode(&port_counter->mode,
+					 RDMA_COUNTER_MODE_AUTO, mask);
+	} else {
+		if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		if (port_counter->num_counters)
+			ret = __counter_set_mode(&port_counter->mode,
+						 RDMA_COUNTER_MODE_MANUAL, 0);
+		else
+			ret = __counter_set_mode(&port_counter->mode,
+						 RDMA_COUNTER_MODE_NONE, 0);
+	}
+
+out:
+	mutex_unlock(&port_counter->lock);
+	return ret;
+}
+
+static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port,
+					       enum rdma_nl_counter_mode mode)
+{
+	struct rdma_port_counter *port_counter;
+	struct rdma_counter *counter;
+	int ret;
+
+	if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats)
+		return NULL;
+
+	counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+	if (!counter)
+		return NULL;
+
+	counter->device    = dev;
+	counter->port      = port;
+	counter->res.type  = RDMA_RESTRACK_COUNTER;
+	counter->stats     = dev->ops.counter_alloc_stats(counter);
+	if (!counter->stats)
+		goto err_stats;
+
+	port_counter = &dev->port_data[port].port_counter;
+	mutex_lock(&port_counter->lock);
+	if (mode == RDMA_COUNTER_MODE_MANUAL) {
+		ret = __counter_set_mode(&port_counter->mode,
+					 RDMA_COUNTER_MODE_MANUAL, 0);
+		if (ret)
+			goto err_mode;
+	}
+
+	port_counter->num_counters++;
+	mutex_unlock(&port_counter->lock);
+
+	counter->mode.mode = mode;
+	kref_init(&counter->kref);
+	mutex_init(&counter->lock);
+
+	return counter;
+
+err_mode:
+	mutex_unlock(&port_counter->lock);
+	kfree(counter->stats);
+err_stats:
+	kfree(counter);
+	return NULL;
+}
+
+static void rdma_counter_free(struct rdma_counter *counter)
+{
+	struct rdma_port_counter *port_counter;
+
+	port_counter = &counter->device->port_data[counter->port].port_counter;
+	mutex_lock(&port_counter->lock);
+	port_counter->num_counters--;
+	if (!port_counter->num_counters &&
+	    (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL))
+		__counter_set_mode(&port_counter->mode, RDMA_COUNTER_MODE_NONE,
+				   0);
+
+	mutex_unlock(&port_counter->lock);
+
+	rdma_restrack_del(&counter->res);
+	kfree(counter->stats);
+	kfree(counter);
+}
+
+static void auto_mode_init_counter(struct rdma_counter *counter,
+				   const struct ib_qp *qp,
+				   enum rdma_nl_counter_mask new_mask)
+{
+	struct auto_mode_param *param = &counter->mode.param;
+
+	counter->mode.mode = RDMA_COUNTER_MODE_AUTO;
+	counter->mode.mask = new_mask;
+
+	if (new_mask & RDMA_COUNTER_MASK_QP_TYPE)
+		param->qp_type = qp->qp_type;
+}
+
+static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter,
+			    enum rdma_nl_counter_mask auto_mask)
+{
+	struct auto_mode_param *param = &counter->mode.param;
+	bool match = true;
+
+	if (!rdma_is_visible_in_pid_ns(&qp->res))
+		return false;
+
+	/* Ensure that counter belongs to the right PID */
+	if (task_pid_nr(counter->res.task) != task_pid_nr(qp->res.task))
+		return false;
+
+	if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE)
+		match &= (param->qp_type == qp->qp_type);
+
+	return match;
+}
+
+static int __rdma_counter_bind_qp(struct rdma_counter *counter,
+				  struct ib_qp *qp)
+{
+	int ret;
+
+	if (qp->counter)
+		return -EINVAL;
+
+	if (!qp->device->ops.counter_bind_qp)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&counter->lock);
+	ret = qp->device->ops.counter_bind_qp(counter, qp);
+	mutex_unlock(&counter->lock);
+
+	return ret;
+}
+
+static int __rdma_counter_unbind_qp(struct ib_qp *qp)
+{
+	struct rdma_counter *counter = qp->counter;
+	int ret;
+
+	if (!qp->device->ops.counter_unbind_qp)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&counter->lock);
+	ret = qp->device->ops.counter_unbind_qp(qp);
+	mutex_unlock(&counter->lock);
+
+	return ret;
+}
+
+static void counter_history_stat_update(const struct rdma_counter *counter)
+{
+	struct ib_device *dev = counter->device;
+	struct rdma_port_counter *port_counter;
+	int i;
+
+	port_counter = &dev->port_data[counter->port].port_counter;
+	if (!port_counter->hstats)
+		return;
+
+	for (i = 0; i < counter->stats->num_counters; i++)
+		port_counter->hstats->value[i] += counter->stats->value[i];
+}
+
+/**
+ * rdma_get_counter_auto_mode - Find the counter that @qp should be bound
+ *     with in auto mode
+ *
+ * Return: The counter (with ref-count increased) if found
+ */
+static struct rdma_counter *rdma_get_counter_auto_mode(struct ib_qp *qp,
+						       u8 port)
+{
+	struct rdma_port_counter *port_counter;
+	struct rdma_counter *counter = NULL;
+	struct ib_device *dev = qp->device;
+	struct rdma_restrack_entry *res;
+	struct rdma_restrack_root *rt;
+	unsigned long id = 0;
+
+	port_counter = &dev->port_data[port].port_counter;
+	rt = &dev->res[RDMA_RESTRACK_COUNTER];
+	xa_lock(&rt->xa);
+	xa_for_each(&rt->xa, id, res) {
+		if (!rdma_is_visible_in_pid_ns(res))
+			continue;
+
+		counter = container_of(res, struct rdma_counter, res);
+		if ((counter->device != qp->device) || (counter->port != port))
+			goto next;
+
+		if (auto_mode_match(qp, counter, port_counter->mode.mask))
+			break;
+next:
+		counter = NULL;
+	}
+
+	if (counter && !kref_get_unless_zero(&counter->kref))
+		counter = NULL;
+
+	xa_unlock(&rt->xa);
+	return counter;
+}
+
+static void rdma_counter_res_add(struct rdma_counter *counter,
+				 struct ib_qp *qp)
+{
+	if (rdma_is_kernel_res(&qp->res)) {
+		rdma_restrack_set_task(&counter->res, qp->res.kern_name);
+		rdma_restrack_kadd(&counter->res);
+	} else {
+		rdma_restrack_attach_task(&counter->res, qp->res.task);
+		rdma_restrack_uadd(&counter->res);
+	}
+}
+
+static void counter_release(struct kref *kref)
+{
+	struct rdma_counter *counter;
+
+	counter = container_of(kref, struct rdma_counter, kref);
+	counter_history_stat_update(counter);
+	counter->device->ops.counter_dealloc(counter);
+	rdma_counter_free(counter);
+}
+
+/**
+ * rdma_counter_bind_qp_auto - Check and bind the QP to a counter based on
+ *   the auto-mode rule
+ */
+int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port)
+{
+	struct rdma_port_counter *port_counter;
+	struct ib_device *dev = qp->device;
+	struct rdma_counter *counter;
+	int ret;
+
+	if (!rdma_is_port_valid(dev, port))
+		return -EINVAL;
+
+	port_counter = &dev->port_data[port].port_counter;
+	if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO)
+		return 0;
+
+	counter = rdma_get_counter_auto_mode(qp, port);
+	if (counter) {
+		ret = __rdma_counter_bind_qp(counter, qp);
+		if (ret) {
+			kref_put(&counter->kref, counter_release);
+			return ret;
+		}
+	} else {
+		counter = rdma_counter_alloc(dev, port, RDMA_COUNTER_MODE_AUTO);
+		if (!counter)
+			return -ENOMEM;
+
+		auto_mode_init_counter(counter, qp, port_counter->mode.mask);
+
+		ret = __rdma_counter_bind_qp(counter, qp);
+		if (ret) {
+			rdma_counter_free(counter);
+			return ret;
+		}
+
+		rdma_counter_res_add(counter, qp);
+	}
+
+	return 0;
+}
+
+/**
+ * rdma_counter_unbind_qp - Unbind a qp from a counter
+ * @force:
+ *   true - Decrease the counter ref-count anyway (e.g., qp destroy)
+ */
+int rdma_counter_unbind_qp(struct ib_qp *qp, bool force)
+{
+	struct rdma_counter *counter = qp->counter;
+	int ret;
+
+	if (!counter)
+		return -EINVAL;
+
+	ret = __rdma_counter_unbind_qp(qp);
+	if (ret && !force)
+		return ret;
+
+	kref_put(&counter->kref, counter_release);
+	return 0;
+}
+
+int rdma_counter_query_stats(struct rdma_counter *counter)
+{
+	struct ib_device *dev = counter->device;
+	int ret;
+
+	if (!dev->ops.counter_update_stats)
+		return -EINVAL;
+
+	mutex_lock(&counter->lock);
+	ret = dev->ops.counter_update_stats(counter);
+	mutex_unlock(&counter->lock);
+
+	return ret;
+}
+
+static u64 get_running_counters_hwstat_sum(struct ib_device *dev,
+					   u8 port, u32 index)
+{
+	struct rdma_restrack_entry *res;
+	struct rdma_restrack_root *rt;
+	struct rdma_counter *counter;
+	unsigned long id = 0;
+	u64 sum = 0;
+
+	rt = &dev->res[RDMA_RESTRACK_COUNTER];
+	xa_lock(&rt->xa);
+	xa_for_each(&rt->xa, id, res) {
+		if (!rdma_restrack_get(res))
+			continue;
+
+		xa_unlock(&rt->xa);
+
+		counter = container_of(res, struct rdma_counter, res);
+		if ((counter->device != dev) || (counter->port != port) ||
+		    rdma_counter_query_stats(counter))
+			goto next;
+
+		sum += counter->stats->value[index];
+
+next:
+		xa_lock(&rt->xa);
+		rdma_restrack_put(res);
+	}
+
+	xa_unlock(&rt->xa);
+	return sum;
+}
+
+/**
+ * rdma_counter_get_hwstat_value() - Get the sum value of all counters on a
+ *   specific port, including the running ones and history data
+ */
+u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index)
+{
+	struct rdma_port_counter *port_counter;
+	u64 sum;
+
+	port_counter = &dev->port_data[port].port_counter;
+	if (!port_counter->hstats)
+		return 0;
+
+	sum = get_running_counters_hwstat_sum(dev, port, index);
+	sum += port_counter->hstats->value[index];
+
+	return sum;
+}
+
+static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num)
+{
+	struct rdma_restrack_entry *res = NULL;
+	struct ib_qp *qp = NULL;
+
+	res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_QP, qp_num);
+	if (IS_ERR(res))
+		return NULL;
+
+	if (!rdma_is_visible_in_pid_ns(res))
+		goto err;
+
+	qp = container_of(res, struct ib_qp, res);
+	if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+		goto err;
+
+	return qp;
+
+err:
+	rdma_restrack_put(res);
+	return NULL;
+}
+
+static int rdma_counter_bind_qp_manual(struct rdma_counter *counter,
+				       struct ib_qp *qp)
+{
+	if ((counter->device != qp->device) || (counter->port != qp->port))
+		return -EINVAL;
+
+	return __rdma_counter_bind_qp(counter, qp);
+}
+
+static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev,
+						   u32 counter_id)
+{
+	struct rdma_restrack_entry *res;
+	struct rdma_counter *counter;
+
+	res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_COUNTER, counter_id);
+	if (IS_ERR(res))
+		return NULL;
+
+	if (!rdma_is_visible_in_pid_ns(res)) {
+		rdma_restrack_put(res);
+		return NULL;
+	}
+
+	counter = container_of(res, struct rdma_counter, res);
+	kref_get(&counter->kref);
+	rdma_restrack_put(res);
+
+	return counter;
+}
+
+/**
+ * rdma_counter_bind_qpn() - Bind QP @qp_num to counter @counter_id
+ */
+int rdma_counter_bind_qpn(struct ib_device *dev, u8 port,
+			  u32 qp_num, u32 counter_id)
+{
+	struct rdma_counter *counter;
+	struct ib_qp *qp;
+	int ret;
+
+	qp = rdma_counter_get_qp(dev, qp_num);
+	if (!qp)
+		return -ENOENT;
+
+	counter = rdma_get_counter_by_id(dev, counter_id);
+	if (!counter) {
+		ret = -ENOENT;
+		goto err;
+	}
+
+	if (counter->res.task != qp->res.task) {
+		ret = -EINVAL;
+		goto err_task;
+	}
+
+	ret = rdma_counter_bind_qp_manual(counter, qp);
+	if (ret)
+		goto err_task;
+
+	rdma_restrack_put(&qp->res);
+	return 0;
+
+err_task:
+	kref_put(&counter->kref, counter_release);
+err:
+	rdma_restrack_put(&qp->res);
+	return ret;
+}
+
+/**
+ * rdma_counter_bind_qpn_alloc() - Alloc a counter and bind QP @qp_num to it
+ *   The id of the new counter is returned in @counter_id
+ */
+int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port,
+				u32 qp_num, u32 *counter_id)
+{
+	struct rdma_counter *counter;
+	struct ib_qp *qp;
+	int ret;
+
+	if (!rdma_is_port_valid(dev, port))
+		return -EINVAL;
+
+	if (!dev->port_data[port].port_counter.hstats)
+		return -EOPNOTSUPP;
+
+	qp = rdma_counter_get_qp(dev, qp_num);
+	if (!qp)
+		return -ENOENT;
+
+	if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	counter = rdma_counter_alloc(dev, port, RDMA_COUNTER_MODE_MANUAL);
+	if (!counter) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	ret = rdma_counter_bind_qp_manual(counter, qp);
+	if (ret)
+		goto err_bind;
+
+	if (counter_id)
+		*counter_id = counter->id;
+
+	rdma_counter_res_add(counter, qp);
+
+	rdma_restrack_put(&qp->res);
+	return ret;
+
+err_bind:
+	rdma_counter_free(counter);
+err:
+	rdma_restrack_put(&qp->res);
+	return ret;
+}
+
+/**
+ * rdma_counter_unbind_qpn() - Unbind QP @qp_num from a counter
+ */
+int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port,
+			    u32 qp_num, u32 counter_id)
+{
+	struct rdma_port_counter *port_counter;
+	struct ib_qp *qp;
+	int ret;
+
+	if (!rdma_is_port_valid(dev, port))
+		return -EINVAL;
+
+	qp = rdma_counter_get_qp(dev, qp_num);
+	if (!qp)
+		return -ENOENT;
+
+	if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	port_counter = &dev->port_data[port].port_counter;
+	if (!qp->counter || qp->counter->id != counter_id ||
+	    port_counter->mode.mode != RDMA_COUNTER_MODE_MANUAL) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = rdma_counter_unbind_qp(qp, false);
+
+out:
+	rdma_restrack_put(&qp->res);
+	return ret;
+}
+
+int rdma_counter_get_mode(struct ib_device *dev, u8 port,
+			  enum rdma_nl_counter_mode *mode,
+			  enum rdma_nl_counter_mask *mask)
+{
+	struct rdma_port_counter *port_counter;
+
+	port_counter = &dev->port_data[port].port_counter;
+	*mode = port_counter->mode.mode;
+	*mask = port_counter->mode.mask;
+
+	return 0;
+}
+
+void rdma_counter_init(struct ib_device *dev)
+{
+	struct rdma_port_counter *port_counter;
+	u32 port, i;
+
+	if (!dev->port_data)
+		return;
+
+	rdma_for_each_port(dev, port) {
+		port_counter = &dev->port_data[port].port_counter;
+		port_counter->mode.mode = RDMA_COUNTER_MODE_NONE;
+		mutex_init(&port_counter->lock);
+
+		if (!dev->ops.alloc_hw_stats)
+			continue;
+
+		port_counter->hstats = dev->ops.alloc_hw_stats(dev, port);
+		if (!port_counter->hstats)
+			goto fail;
+	}
+
+	return;
+
+fail:
+	for (i = port; i >= rdma_start_port(dev); i--) {
+		port_counter = &dev->port_data[i].port_counter;
+		kfree(port_counter->hstats);
+		port_counter->hstats = NULL;
+		mutex_destroy(&port_counter->lock);
+	}
+}
+
+void rdma_counter_release(struct ib_device *dev)
+{
+	struct rdma_port_counter *port_counter;
+	u32 port;
+
+	rdma_for_each_port(dev, port) {
+		port_counter = &dev->port_data[port].port_counter;
+		kfree(port_counter->hstats);
+		mutex_destroy(&port_counter->lock);
+	}
+}
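The auto-mode lookup in counters.c leans on a common lifetime pattern: walk an xarray under its spinlock and only take objects whose refcount has not already dropped to zero. A stripped-down sketch of that pattern (illustrative types, not the restrack implementation):

    #include <linux/kref.h>
    #include <linux/xarray.h>

    struct demo_obj {
            struct kref kref;
            u8 port;
    };

    static struct demo_obj *demo_find_by_port(struct xarray *xa, u8 port)
    {
            struct demo_obj *obj, *found = NULL;
            unsigned long id;

            xa_lock(xa);
            xa_for_each(xa, id, obj) {
                    if (obj->port != port)
                            continue;
                    /* Skip objects already on their way to being freed. */
                    if (kref_get_unless_zero(&obj->kref))
                            found = obj;
                    break;
            }
            xa_unlock(xa);
            return found;   /* caller releases it with kref_put() */
    }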
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index af5ad6a..bbfded6 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -1,14 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2015 HGST, a Western Digital Company.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 #include <linux/module.h>
 #include <linux/err.h>
@@ -26,6 +18,53 @@
 #define IB_POLL_FLAGS \
 	(IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
 
+static const struct dim_cq_moder
+rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
+	{1,   0, 1,  0},
+	{1,   0, 4,  0},
+	{2,   0, 4,  0},
+	{2,   0, 8,  0},
+	{4,   0, 8,  0},
+	{16,  0, 8,  0},
+	{16,  0, 16, 0},
+	{32,  0, 16, 0},
+	{32,  0, 32, 0},
+};
+
+static void ib_cq_rdma_dim_work(struct work_struct *w)
+{
+	struct dim *dim = container_of(w, struct dim, work);
+	struct ib_cq *cq = dim->priv;
+
+	u16 usec = rdma_dim_prof[dim->profile_ix].usec;
+	u16 comps = rdma_dim_prof[dim->profile_ix].comps;
+
+	dim->state = DIM_START_MEASURE;
+
+	cq->device->ops.modify_cq(cq, comps, usec);
+}
+
+static void rdma_dim_init(struct ib_cq *cq)
+{
+	struct dim *dim;
+
+	if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim ||
+	    cq->poll_ctx == IB_POLL_DIRECT)
+		return;
+
+	dim = kzalloc(sizeof(struct dim), GFP_KERNEL);
+	if (!dim)
+		return;
+
+	dim->state = DIM_START_MEASURE;
+	dim->tune_state = DIM_GOING_RIGHT;
+	dim->profile_ix = RDMA_DIM_START_PROFILE;
+	dim->priv = cq;
+	cq->dim = dim;
+
+	INIT_WORK(&dim->work, ib_cq_rdma_dim_work);
+}
+
 static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
 			   int batch)
 {
@@ -86,6 +125,7 @@
 static int ib_poll_handler(struct irq_poll *iop, int budget)
 {
 	struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
+	struct dim *dim = cq->dim;
 	int completed;
 
 	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
@@ -95,6 +135,9 @@
 			irq_poll_sched(&cq->iop);
 	}
 
+	if (dim)
+		rdma_dim(dim, completed);
+
 	return completed;
 }
 
@@ -112,31 +155,35 @@
 				    IB_POLL_BATCH);
 	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
 	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
-		queue_work(ib_comp_wq, &cq->work);
+		queue_work(cq->comp_wq, &cq->work);
+	else if (cq->dim)
+		rdma_dim(cq->dim, completed);
 }
 
 static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
 {
-	queue_work(ib_comp_wq, &cq->work);
+	queue_work(cq->comp_wq, &cq->work);
 }
 
 /**
- * __ib_alloc_cq - allocate a completion queue
+ * __ib_alloc_cq_user - allocate a completion queue
  * @dev:		device to allocate the CQ for
  * @private:		driver private data, accessible from cq->cq_context
  * @nr_cqe:		number of CQEs to allocate
  * @comp_vector:	HCA completion vectors for this CQ
  * @poll_ctx:		context to poll the CQ from.
  * @caller:		module owner name.
+ * @udata:		Valid user data or NULL for kernel object
  *
  * This is the proper interface to allocate a CQ for in-kernel users. A
  * CQ allocated with this interface will automatically be polled from the
  * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id
  * to use this CQ abstraction.
  */
-struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private,
-			    int nr_cqe, int comp_vector,
-			    enum ib_poll_context poll_ctx, const char *caller)
+struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
+				 int nr_cqe, int comp_vector,
+				 enum ib_poll_context poll_ctx,
+				 const char *caller, struct ib_udata *udata)
 {
 	struct ib_cq_init_attr cq_attr = {
 		.cqe		= nr_cqe,
@@ -145,24 +192,29 @@
 	struct ib_cq *cq;
 	int ret = -ENOMEM;
 
-	cq = dev->create_cq(dev, &cq_attr, NULL, NULL);
-	if (IS_ERR(cq))
-		return cq;
+	cq = rdma_zalloc_drv_obj(dev, ib_cq);
+	if (!cq)
+		return ERR_PTR(ret);
 
 	cq->device = dev;
-	cq->uobject = NULL;
-	cq->event_handler = NULL;
 	cq->cq_context = private;
 	cq->poll_ctx = poll_ctx;
 	atomic_set(&cq->usecnt, 0);
 
 	cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
 	if (!cq->wc)
-		goto out_destroy_cq;
+		goto out_free_cq;
 
 	cq->res.type = RDMA_RESTRACK_CQ;
-	cq->res.kern_name = caller;
-	rdma_restrack_add(&cq->res);
+	rdma_restrack_set_task(&cq->res, caller);
+
+	ret = dev->ops.create_cq(cq, &cq_attr, NULL);
+	if (ret)
+		goto out_free_wc;
+
+	rdma_restrack_kadd(&cq->res);
+
+	rdma_dim_init(cq);
 
 	switch (cq->poll_ctx) {
 	case IB_POLL_DIRECT:
@@ -175,34 +227,66 @@
 		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
 		break;
 	case IB_POLL_WORKQUEUE:
+	case IB_POLL_UNBOUND_WORKQUEUE:
 		cq->comp_handler = ib_cq_completion_workqueue;
 		INIT_WORK(&cq->work, ib_cq_poll_work);
 		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+		cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ?
+				ib_comp_wq : ib_comp_unbound_wq;
 		break;
 	default:
 		ret = -EINVAL;
-		goto out_free_wc;
+		goto out_destroy_cq;
 	}
 
 	return cq;
 
+out_destroy_cq:
+	rdma_restrack_del(&cq->res);
+	cq->device->ops.destroy_cq(cq, udata);
 out_free_wc:
 	kfree(cq->wc);
-	rdma_restrack_del(&cq->res);
-out_destroy_cq:
-	cq->device->destroy_cq(cq);
+out_free_cq:
+	kfree(cq);
 	return ERR_PTR(ret);
 }
-EXPORT_SYMBOL(__ib_alloc_cq);
+EXPORT_SYMBOL(__ib_alloc_cq_user);
 
 /**
- * ib_free_cq - free a completion queue
- * @cq:		completion queue to free.
+ * __ib_alloc_cq_any - allocate a completion queue
+ * @dev:		device to allocate the CQ for
+ * @private:		driver private data, accessible from cq->cq_context
+ * @nr_cqe:		number of CQEs to allocate
+ * @poll_ctx:		context to poll the CQ from
+ * @caller:		module owner name
+ *
+ * Attempt to spread ULP Completion Queues over each device's interrupt
+ * vectors. A simple best-effort mechanism is used.
  */
-void ib_free_cq(struct ib_cq *cq)
+struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private,
+				int nr_cqe, enum ib_poll_context poll_ctx,
+				const char *caller)
 {
-	int ret;
+	static atomic_t counter;
+	int comp_vector = 0;
 
+	if (dev->num_comp_vectors > 1)
+		comp_vector =
+			atomic_inc_return(&counter) %
+			min_t(int, dev->num_comp_vectors, num_online_cpus());
+
+	return __ib_alloc_cq_user(dev, private, nr_cqe, comp_vector, poll_ctx,
+				  caller, NULL);
+}
+EXPORT_SYMBOL(__ib_alloc_cq_any);
+
+/**
+ * ib_free_cq_user - free a completion queue
+ * @cq:		completion queue to free.
+ * @udata:	User data or NULL for kernel object
+ */
+void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
+{
 	if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
 		return;
 
@@ -213,15 +297,19 @@
 		irq_poll_disable(&cq->iop);
 		break;
 	case IB_POLL_WORKQUEUE:
+	case IB_POLL_UNBOUND_WORKQUEUE:
 		cancel_work_sync(&cq->work);
 		break;
 	default:
 		WARN_ON_ONCE(1);
 	}
 
-	kfree(cq->wc);
 	rdma_restrack_del(&cq->res);
-	ret = cq->device->destroy_cq(cq);
-	WARN_ON_ONCE(ret);
+	cq->device->ops.destroy_cq(cq, udata);
+	if (cq->dim)
+		cancel_work_sync(&cq->dim->work);
+	kfree(cq->dim);
+	kfree(cq->wc);
+	kfree(cq);
 }
-EXPORT_SYMBOL(ib_free_cq);
+EXPORT_SYMBOL(ib_free_cq_user);
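/*
 * A minimal usage sketch for the reworked CQ API above, assuming the usual
 * ib_alloc_cq()/ib_free_cq() wrappers that pass a NULL udata to the *_user
 * variants; the my_ctx/my_done names are hypothetical:
 *
 *	struct my_ctx {
 *		struct ib_cqe cqe;
 *	};
 *
 *	static void my_done(struct ib_cq *cq, struct ib_wc *wc)
 *	{
 *		struct my_ctx *ctx =
 *			container_of(wc->wr_cqe, struct my_ctx, cqe);
 *		pr_debug("completion status %d\n", wc->status);
 *	}
 *
 *	cq = ib_alloc_cq(dev, priv, 128, 0, IB_POLL_WORKQUEUE);
 *	if (IS_ERR(cq))
 *		return PTR_ERR(cq);
 *	ctx->cqe.done = my_done;
 *	wr.wr_cqe = &ctx->cqe;		(wr_cqe, never wr_id, with this API)
 *	...
 *	ib_free_cq(cq);
 */
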
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index db3b627..50a9244 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -37,65 +37,238 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/init.h>
-#include <linux/mutex.h>
 #include <linux/netdevice.h>
+#include <net/net_namespace.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
+#include <linux/hashtable.h>
 #include <rdma/rdma_netlink.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_cache.h>
+#include <rdma/rdma_counter.h>
 
 #include "core_priv.h"
+#include "restrack.h"
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("core kernel InfiniBand API");
 MODULE_LICENSE("Dual BSD/GPL");
 
-struct ib_client_data {
-	struct list_head  list;
-	struct ib_client *client;
-	void *            data;
-	/* The device or client is going down. Do not call client or device
-	 * callbacks other than remove(). */
-	bool		  going_down;
-};
-
 struct workqueue_struct *ib_comp_wq;
+struct workqueue_struct *ib_comp_unbound_wq;
 struct workqueue_struct *ib_wq;
 EXPORT_SYMBOL_GPL(ib_wq);
 
-/* The device_list and client_list contain devices and clients after their
- * registration has completed, and the devices and clients are removed
- * during unregistration. */
-static LIST_HEAD(device_list);
-static LIST_HEAD(client_list);
+/*
+ * Each of the three rwsem locks (devices, clients, client_data) protects the
+ * xarray of the same name. Specifically it allows the caller to assert that
+ * the MARK will/will not be changing under the lock, and for devices and
+ * clients, that the value in the xarray is still a valid pointer. Change of
+ * the MARK is linked to the object state, so holding the lock and testing the
+ * MARK also asserts that the contained object is in a certain state.
+ *
+ * This is used to build a two stage register/unregister flow where objects
+ * can continue to be in the xarray even though they are still in progress to
+ * register/unregister.
+ *
+ * The xarray itself provides additional locking, and restartable iteration,
+ * which is also relied on.
+ *
+ * Locks should not be nested, with the exception of client_data, which is
+ * allowed to nest under the read side of the other two locks.
+ *
+ * The devices_rwsem also protects the device name list; any change or
+ * assignment of device name must also hold the write side to guarantee unique
+ * names.
+ */
 
 /*
- * device_mutex and lists_rwsem protect access to both device_list and
- * client_list.  device_mutex protects writer access by device and client
- * registration / de-registration.  lists_rwsem protects reader access to
- * these lists.  Iterators of these lists must lock it for read, while updates
- * to the lists must be done with a write lock. A special case is when the
- * device_mutex is locked. In this case locking the lists for read access is
- * not necessary as the device_mutex implies it.
+ * devices contains devices that have had their names assigned. The
+ * devices may not be registered. Users that care about the registration
+ * status need to call ib_device_try_get() on the device to ensure it is
+ * registered, and keep it registered, for the required duration.
  *
- * lists_rwsem also protects access to the client data list.
  */
-static DEFINE_MUTEX(device_mutex);
-static DECLARE_RWSEM(lists_rwsem);
+static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
+static DECLARE_RWSEM(devices_rwsem);
+#define DEVICE_REGISTERED XA_MARK_1
 
+static u32 highest_client_id;
+#define CLIENT_REGISTERED XA_MARK_1
+static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
+static DECLARE_RWSEM(clients_rwsem);
+
+static void ib_client_put(struct ib_client *client)
+{
+	if (refcount_dec_and_test(&client->uses))
+		complete(&client->uses_zero);
+}
+
+/*
+ * If client_data is registered then the corresponding client must also still
+ * be registered.
+ */
+#define CLIENT_DATA_REGISTERED XA_MARK_1
+
+unsigned int rdma_dev_net_id;
+
+/*
+ * A list of net namespaces is maintained in an xarray. This is necessary
+ * because we can't get the locking right using the existing net ns list. We
+ * would require an init_net callback after the list is updated.
+ */
+static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
+/*
+ * rwsem to protect accessing the rdma_nets xarray entries.
+ */
+static DECLARE_RWSEM(rdma_nets_rwsem);
+
+bool ib_devices_shared_netns = true;
+module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
+MODULE_PARM_DESC(netns_mode,
+		 "Share device among net namespaces; default=1 (shared)");
+/**
+ * rdma_dev_access_netns() - Return whether an rdma device can be accessed
+ *			     from a specified net namespace or not.
+ * @dev:	Pointer to the rdma device which needs to be checked
+ * @net:	Pointer to the net namespace for which access is to be checked
+ *
+ * When the rdma device is in shared mode, it ignores the net namespace.
+ * When the rdma device is exclusive to a net namespace, the device's net
+ * namespace is checked against the specified one.
+ */
+bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
+{
+	return (ib_devices_shared_netns ||
+		net_eq(read_pnet(&dev->coredev.rdma_net), net));
+}
+EXPORT_SYMBOL(rdma_dev_access_netns);
+
+/*
+ * xarray has this behavior where it won't iterate over NULL values stored in
+ * allocated arrays.  So we need our own iterator to see all values stored in
+ * the array. This does the same thing as xa_for_each except that it also
+ * returns NULL valued entries if the array is allocating. Simplified to only
+ * work on simple xarrays.
+ */
+static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
+			     xa_mark_t filter)
+{
+	XA_STATE(xas, xa, *indexp);
+	void *entry;
+
+	rcu_read_lock();
+	do {
+		entry = xas_find_marked(&xas, ULONG_MAX, filter);
+		if (xa_is_zero(entry))
+			break;
+	} while (xas_retry(&xas, entry));
+	rcu_read_unlock();
+
+	if (entry) {
+		*indexp = xas.xa_index;
+		if (xa_is_zero(entry))
+			return NULL;
+		return entry;
+	}
+	return XA_ERROR(-ENOENT);
+}
+#define xan_for_each_marked(xa, index, entry, filter)                          \
+	for (index = 0, entry = xan_find_marked(xa, &(index), filter);         \
+	     !xa_is_err(entry);                                                \
+	     (index)++, entry = xan_find_marked(xa, &(index), filter))
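/*
 * Illustrative use of the helper above (this is how client_data is walked
 * later in this file): unlike xa_for_each_marked(), entries stored as NULL
 * in an allocating xarray are still visited, so the loop body must cope
 * with a NULL value.
 *
 *	unsigned long index;
 *	void *client_data;
 *
 *	xan_for_each_marked(&device->client_data, index, client_data,
 *			    CLIENT_DATA_REGISTERED) {
 *		...	client_data may legitimately be NULL here
 *	}
 */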
+
+/* RCU hash table mapping netdevice pointers to struct ib_port_data */
+static DEFINE_SPINLOCK(ndev_hash_lock);
+static DECLARE_HASHTABLE(ndev_hash, 5);
+
+static void free_netdevs(struct ib_device *ib_dev);
+static void ib_unregister_work(struct work_struct *work);
+static void __ib_unregister_device(struct ib_device *device);
 static int ib_security_change(struct notifier_block *nb, unsigned long event,
 			      void *lsm_data);
 static void ib_policy_change_task(struct work_struct *work);
 static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task);
 
+static void __ibdev_printk(const char *level, const struct ib_device *ibdev,
+			   struct va_format *vaf)
+{
+	if (ibdev && ibdev->dev.parent)
+		dev_printk_emit(level[1] - '0',
+				ibdev->dev.parent,
+				"%s %s %s: %pV",
+				dev_driver_string(ibdev->dev.parent),
+				dev_name(ibdev->dev.parent),
+				dev_name(&ibdev->dev),
+				vaf);
+	else if (ibdev)
+		printk("%s%s: %pV",
+		       level, dev_name(&ibdev->dev), vaf);
+	else
+		printk("%s(NULL ib_device): %pV", level, vaf);
+}
+
+void ibdev_printk(const char *level, const struct ib_device *ibdev,
+		  const char *format, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, format);
+
+	vaf.fmt = format;
+	vaf.va = &args;
+
+	__ibdev_printk(level, ibdev, &vaf);
+
+	va_end(args);
+}
+EXPORT_SYMBOL(ibdev_printk);
+
+#define define_ibdev_printk_level(func, level)                  \
+void func(const struct ib_device *ibdev, const char *fmt, ...)  \
+{                                                               \
+	struct va_format vaf;                                   \
+	va_list args;                                           \
+								\
+	va_start(args, fmt);                                    \
+								\
+	vaf.fmt = fmt;                                          \
+	vaf.va = &args;                                         \
+								\
+	__ibdev_printk(level, ibdev, &vaf);                     \
+								\
+	va_end(args);                                           \
+}                                                               \
+EXPORT_SYMBOL(func);
+
+define_ibdev_printk_level(ibdev_emerg, KERN_EMERG);
+define_ibdev_printk_level(ibdev_alert, KERN_ALERT);
+define_ibdev_printk_level(ibdev_crit, KERN_CRIT);
+define_ibdev_printk_level(ibdev_err, KERN_ERR);
+define_ibdev_printk_level(ibdev_warn, KERN_WARNING);
+define_ibdev_printk_level(ibdev_notice, KERN_NOTICE);
+define_ibdev_printk_level(ibdev_info, KERN_INFO);
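/*
 * Example of the logging helpers defined above; they prefix the parent
 * device and the IB device name automatically, e.g.:
 *
 *	ibdev_warn(ibdev, "port %u: link flap detected\n", port_num);
 *	ibdev_err(ibdev, "failed to create CQ: %d\n", ret);
 */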
+
 static struct notifier_block ibdev_lsm_nb = {
 	.notifier_call = ib_security_change,
 };
 
-static int ib_device_check_mandatory(struct ib_device *device)
+static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
+				 struct net *net);
+
+/* Pointer to the RCU head at the start of the ib_port_data array */
+struct ib_port_data_rcu {
+	struct rcu_head rcu_head;
+	struct ib_port_data pdata[];
+};
+
+static void ib_device_check_mandatory(struct ib_device *device)
 {
-#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x }
+#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
 	static const struct {
 		size_t offset;
 		char  *name;
@@ -120,110 +293,228 @@
 	};
 	int i;
 
+	device->kverbs_provider = true;
 	for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
-		if (!*(void **) ((void *) device + mandatory_table[i].offset)) {
-			pr_warn("Device %s is missing mandatory function %s\n",
-				device->name, mandatory_table[i].name);
-			return -EINVAL;
+		if (!*(void **) ((void *) &device->ops +
+				 mandatory_table[i].offset)) {
+			device->kverbs_provider = false;
+			break;
 		}
 	}
-
-	return 0;
-}
-
-static struct ib_device *__ib_device_get_by_index(u32 index)
-{
-	struct ib_device *device;
-
-	list_for_each_entry(device, &device_list, core_list)
-		if (device->index == index)
-			return device;
-
-	return NULL;
 }
 
 /*
- * Caller is responsible to return refrerence count by calling put_device()
+ * Caller must perform ib_device_put() to return the device reference count
+ * when ib_device_get_by_index() returns a valid device pointer.
  */
-struct ib_device *ib_device_get_by_index(u32 index)
+struct ib_device *ib_device_get_by_index(const struct net *net, u32 index)
 {
 	struct ib_device *device;
 
-	down_read(&lists_rwsem);
-	device = __ib_device_get_by_index(index);
-	if (device)
-		get_device(&device->dev);
+	down_read(&devices_rwsem);
+	device = xa_load(&devices, index);
+	if (device) {
+		if (!rdma_dev_access_netns(device, net)) {
+			device = NULL;
+			goto out;
+		}
 
-	up_read(&lists_rwsem);
+		if (!ib_device_try_get(device))
+			device = NULL;
+	}
+out:
+	up_read(&devices_rwsem);
 	return device;
 }
 
+/**
+ * ib_device_put - Release IB device reference
+ * @device: device whose reference to be released
+ *
+ * ib_device_put() releases a reference to the IB device to allow it to be
+ * unregistered and eventually freed.
+ */
+void ib_device_put(struct ib_device *device)
+{
+	if (refcount_dec_and_test(&device->refcount))
+		complete(&device->unreg_completion);
+}
+EXPORT_SYMBOL(ib_device_put);
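/*
 * Sketch of the expected get/put pattern around ib_device_get_by_index()
 * and ib_device_put(); the netlink-style caller shown here is hypothetical:
 *
 *	device = ib_device_get_by_index(sock_net(skb->sk), index);
 *	if (!device)
 *		return -EINVAL;
 *	... the device cannot complete unregistration while it is held ...
 *	ib_device_put(device);
 */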
+
 static struct ib_device *__ib_device_get_by_name(const char *name)
 {
 	struct ib_device *device;
+	unsigned long index;
 
-	list_for_each_entry(device, &device_list, core_list)
-		if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX))
+	xa_for_each (&devices, index, device)
+		if (!strcmp(name, dev_name(&device->dev)))
 			return device;
 
 	return NULL;
 }
 
-static int alloc_name(char *name)
+/**
+ * ib_device_get_by_name - Find an IB device by name
+ * @name: The name to look for
+ * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
+ *
+ * Find and hold an ib_device by its name. The caller must call
+ * ib_device_put() on the returned pointer.
+ */
+struct ib_device *ib_device_get_by_name(const char *name,
+					enum rdma_driver_id driver_id)
 {
-	unsigned long *inuse;
-	char buf[IB_DEVICE_NAME_MAX];
 	struct ib_device *device;
-	int i;
 
-	inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL);
-	if (!inuse)
-		return -ENOMEM;
+	down_read(&devices_rwsem);
+	device = __ib_device_get_by_name(name);
+	if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
+	    device->ops.driver_id != driver_id)
+		device = NULL;
 
-	list_for_each_entry(device, &device_list, core_list) {
-		if (!sscanf(device->name, name, &i))
-			continue;
-		if (i < 0 || i >= PAGE_SIZE * 8)
-			continue;
-		snprintf(buf, sizeof buf, name, i);
-		if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX))
-			set_bit(i, inuse);
+	if (device) {
+		if (!ib_device_try_get(device))
+			device = NULL;
+	}
+	up_read(&devices_rwsem);
+	return device;
+}
+EXPORT_SYMBOL(ib_device_get_by_name);
+
+static int rename_compat_devs(struct ib_device *device)
+{
+	struct ib_core_device *cdev;
+	unsigned long index;
+	int ret = 0;
+
+	mutex_lock(&device->compat_devs_mutex);
+	xa_for_each (&device->compat_devs, index, cdev) {
+		ret = device_rename(&cdev->dev, dev_name(&device->dev));
+		if (ret) {
+			dev_warn(&cdev->dev,
+				 "Fail to rename compatdev to new name %s\n",
+				 dev_name(&device->dev));
+			break;
+		}
+	}
+	mutex_unlock(&device->compat_devs_mutex);
+	return ret;
+}
+
+int ib_device_rename(struct ib_device *ibdev, const char *name)
+{
+	unsigned long index;
+	void *client_data;
+	int ret;
+
+	down_write(&devices_rwsem);
+	if (!strcmp(name, dev_name(&ibdev->dev))) {
+		up_write(&devices_rwsem);
+		return 0;
 	}
 
-	i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
-	free_page((unsigned long) inuse);
-	snprintf(buf, sizeof buf, name, i);
+	if (__ib_device_get_by_name(name)) {
+		up_write(&devices_rwsem);
+		return -EEXIST;
+	}
 
-	if (__ib_device_get_by_name(buf))
-		return -ENFILE;
+	ret = device_rename(&ibdev->dev, name);
+	if (ret) {
+		up_write(&devices_rwsem);
+		return ret;
+	}
 
-	strlcpy(name, buf, IB_DEVICE_NAME_MAX);
+	strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
+	ret = rename_compat_devs(ibdev);
+
+	downgrade_write(&devices_rwsem);
+	down_read(&ibdev->client_data_rwsem);
+	xan_for_each_marked(&ibdev->client_data, index, client_data,
+			    CLIENT_DATA_REGISTERED) {
+		struct ib_client *client = xa_load(&clients, index);
+
+		if (!client || !client->rename)
+			continue;
+
+		client->rename(ibdev, client_data);
+	}
+	up_read(&ibdev->client_data_rwsem);
+	up_read(&devices_rwsem);
 	return 0;
 }
 
+int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim)
+{
+	if (use_dim > 1)
+		return -EINVAL;
+	ibdev->use_cq_dim = use_dim;
+
+	return 0;
+}
+
+static int alloc_name(struct ib_device *ibdev, const char *name)
+{
+	struct ib_device *device;
+	unsigned long index;
+	struct ida inuse;
+	int rc;
+	int i;
+
+	lockdep_assert_held_write(&devices_rwsem);
+	ida_init(&inuse);
+	xa_for_each (&devices, index, device) {
+		char buf[IB_DEVICE_NAME_MAX];
+
+		if (sscanf(dev_name(&device->dev), name, &i) != 1)
+			continue;
+		if (i < 0 || i >= INT_MAX)
+			continue;
+		snprintf(buf, sizeof buf, name, i);
+		if (strcmp(buf, dev_name(&device->dev)) != 0)
+			continue;
+
+		rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL);
+		if (rc < 0)
+			goto out;
+	}
+
+	rc = ida_alloc(&inuse, GFP_KERNEL);
+	if (rc < 0)
+		goto out;
+
+	rc = dev_set_name(&ibdev->dev, name, rc);
+out:
+	ida_destroy(&inuse);
+	return rc;
+}
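/*
 * alloc_name() backs the printf-style name templates that drivers may pass
 * to ib_register_device(); for illustration (hypothetical driver name):
 *
 *	ib_register_device(dev, "foo%d");   assigns "foo0", then "foo1", ...
 *
 * reusing the lowest index that is currently free.
 */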
+
 static void ib_device_release(struct device *device)
 {
 	struct ib_device *dev = container_of(device, struct ib_device, dev);
 
-	WARN_ON(dev->reg_state == IB_DEV_REGISTERED);
-	if (dev->reg_state == IB_DEV_UNREGISTERED) {
-		/*
-		 * In IB_DEV_UNINITIALIZED state, cache or port table
-		 * is not even created. Free cache and port table only when
-		 * device reaches UNREGISTERED state.
-		 */
+	free_netdevs(dev);
+	WARN_ON(refcount_read(&dev->refcount));
+	if (dev->port_data) {
 		ib_cache_release_one(dev);
-		kfree(dev->port_immutable);
+		ib_security_release_port_pkey_list(dev);
+		rdma_counter_release(dev);
+		kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
+				       pdata[0]),
+			  rcu_head);
 	}
-	kfree(dev);
+
+	mutex_destroy(&dev->unregistration_lock);
+	mutex_destroy(&dev->compat_devs_mutex);
+
+	xa_destroy(&dev->compat_devs);
+	xa_destroy(&dev->client_data);
+	kfree_rcu(dev, rcu_head);
 }
 
 static int ib_device_uevent(struct device *device,
 			    struct kobj_uevent_env *env)
 {
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
-
-	if (add_uevent_var(env, "NAME=%s", dev->name))
+	if (add_uevent_var(env, "NAME=%s", dev_name(device)))
 		return -ENOMEM;
 
 	/*
@@ -233,14 +524,44 @@
 	return 0;
 }
 
+static const void *net_namespace(struct device *d)
+{
+	struct ib_core_device *coredev =
+			container_of(d, struct ib_core_device, dev);
+
+	return read_pnet(&coredev->rdma_net);
+}
+
 static struct class ib_class = {
 	.name    = "infiniband",
 	.dev_release = ib_device_release,
 	.dev_uevent = ib_device_uevent,
+	.ns_type = &net_ns_type_operations,
+	.namespace = net_namespace,
 };
 
+static void rdma_init_coredev(struct ib_core_device *coredev,
+			      struct ib_device *dev, struct net *net)
+{
+	/* This BUILD_BUG_ON is intended to catch layout change
+	 * of union of ib_core_device and device.
+	 * dev must be the first element as ib_core and provider
+	 * drivers use it. Adding anything in ib_core_device before
+	 * device will break this assumption.
+	 */
+	BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) !=
+		     offsetof(struct ib_device, dev));
+
+	coredev->dev.class = &ib_class;
+	coredev->dev.groups = dev->groups;
+	device_initialize(&coredev->dev);
+	coredev->owner = dev;
+	INIT_LIST_HEAD(&coredev->port_list);
+	write_pnet(&coredev->rdma_net, net);
+}
+
 /**
- * ib_alloc_device - allocate an IB device struct
+ * _ib_alloc_device - allocate an IB device struct
  * @size:size of structure to allocate
  *
  * Low-level drivers should use ib_alloc_device() to allocate &struct
@@ -249,7 +570,7 @@
  * ib_dealloc_device() must be used to free structures allocated with
  * ib_alloc_device().
  */
-struct ib_device *ib_alloc_device(size_t size)
+struct ib_device *_ib_alloc_device(size_t size)
 {
 	struct ib_device *device;
 
@@ -260,22 +581,31 @@
 	if (!device)
 		return NULL;
 
-	rdma_restrack_init(&device->res);
+	if (rdma_restrack_init(device)) {
+		kfree(device);
+		return NULL;
+	}
 
-	device->dev.class = &ib_class;
-	device_initialize(&device->dev);
-
-	dev_set_drvdata(&device->dev, device);
+	device->groups[0] = &ib_dev_attr_group;
+	rdma_init_coredev(&device->coredev, device, &init_net);
 
 	INIT_LIST_HEAD(&device->event_handler_list);
 	spin_lock_init(&device->event_handler_lock);
-	spin_lock_init(&device->client_data_lock);
-	INIT_LIST_HEAD(&device->client_data_list);
-	INIT_LIST_HEAD(&device->port_list);
+	mutex_init(&device->unregistration_lock);
+	/*
+	 * client_data needs to be alloc because we don't want our mark to be
+	 * destroyed if the user stores NULL in the client data.
+	 */
+	xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
+	init_rwsem(&device->client_data_rwsem);
+	xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC);
+	mutex_init(&device->compat_devs_mutex);
+	init_completion(&device->unreg_completion);
+	INIT_WORK(&device->unregistration_work, ib_unregister_work);
 
 	return device;
 }
-EXPORT_SYMBOL(ib_alloc_device);
+EXPORT_SYMBOL(_ib_alloc_device);
 
 /**
  * ib_dealloc_device - free an IB device struct
@@ -285,32 +615,161 @@
  */
 void ib_dealloc_device(struct ib_device *device)
 {
-	WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
-		device->reg_state != IB_DEV_UNINITIALIZED);
-	rdma_restrack_clean(&device->res);
+	if (device->ops.dealloc_driver)
+		device->ops.dealloc_driver(device);
+
+	/*
+	 * ib_unregister_driver() requires all devices to remain in the xarray
+	 * while their ops are callable. The last op we call is dealloc_driver
+	 * above.  This is needed to create a fence on op callbacks prior to
+	 * allowing the driver module to unload.
+	 */
+	down_write(&devices_rwsem);
+	if (xa_load(&devices, device->index) == device)
+		xa_erase(&devices, device->index);
+	up_write(&devices_rwsem);
+
+	/* Expedite releasing netdev references */
+	free_netdevs(device);
+
+	WARN_ON(!xa_empty(&device->compat_devs));
+	WARN_ON(!xa_empty(&device->client_data));
+	WARN_ON(refcount_read(&device->refcount));
+	rdma_restrack_clean(device);
+	/* Balances with device_initialize */
 	put_device(&device->dev);
 }
 EXPORT_SYMBOL(ib_dealloc_device);
 
-static int add_client_context(struct ib_device *device, struct ib_client *client)
+/*
+ * add_client_context() and remove_client_context() must be safe against
+ * parallel calls on the same device - registration/unregistration of both the
+ * device and client can be occurring in parallel.
+ *
+ * The routines need to be a fence, any caller must not return until the add
+ * or remove is fully completed.
+ */
+static int add_client_context(struct ib_device *device,
+			      struct ib_client *client)
 {
-	struct ib_client_data *context;
-	unsigned long flags;
+	int ret = 0;
 
-	context = kmalloc(sizeof *context, GFP_KERNEL);
-	if (!context)
+	if (!device->kverbs_provider && !client->no_kverbs_req)
+		return 0;
+
+	down_write(&device->client_data_rwsem);
+	/*
+	 * So long as the client is registered hold both the client and device
+	 * unregistration locks.
+	 */
+	if (!refcount_inc_not_zero(&client->uses))
+		goto out_unlock;
+	refcount_inc(&device->refcount);
+
+	/*
+	 * Another caller to add_client_context got here first and has already
+	 * completely initialized context.
+	 */
+	if (xa_get_mark(&device->client_data, client->client_id,
+		    CLIENT_DATA_REGISTERED))
+		goto out;
+
+	ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
+			      GFP_KERNEL));
+	if (ret)
+		goto out;
+	downgrade_write(&device->client_data_rwsem);
+	if (client->add)
+		client->add(device);
+
+	/* Readers shall not see a client until add has been completed */
+	xa_set_mark(&device->client_data, client->client_id,
+		    CLIENT_DATA_REGISTERED);
+	up_read(&device->client_data_rwsem);
+	return 0;
+
+out:
+	ib_device_put(device);
+	ib_client_put(client);
+out_unlock:
+	up_write(&device->client_data_rwsem);
+	return ret;
+}
+
+static void remove_client_context(struct ib_device *device,
+				  unsigned int client_id)
+{
+	struct ib_client *client;
+	void *client_data;
+
+	down_write(&device->client_data_rwsem);
+	if (!xa_get_mark(&device->client_data, client_id,
+			 CLIENT_DATA_REGISTERED)) {
+		up_write(&device->client_data_rwsem);
+		return;
+	}
+	client_data = xa_load(&device->client_data, client_id);
+	xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
+	client = xa_load(&clients, client_id);
+	up_write(&device->client_data_rwsem);
+
+	/*
+	 * Notice we cannot be holding any exclusive locks when calling the
+	 * remove callback as the remove callback can recurse back into any
+	 * public functions in this module and thus try for any locks those
+	 * functions take.
+	 *
+	 * For this reason clients and drivers should not call the
+	 * unregistration functions while holding any locks.
+	 */
+	if (client->remove)
+		client->remove(device, client_data);
+
+	xa_erase(&device->client_data, client_id);
+	ib_device_put(device);
+	ib_client_put(client);
+}
+
+static int alloc_port_data(struct ib_device *device)
+{
+	struct ib_port_data_rcu *pdata_rcu;
+	unsigned int port;
+
+	if (device->port_data)
+		return 0;
+
+	/* This can only be called once the physical port range is defined */
+	if (WARN_ON(!device->phys_port_cnt))
+		return -EINVAL;
+
+	/*
+	 * device->port_data is indexed directly by the port number to make
+	 * access to this data as efficient as possible.
+	 *
+	 * Therefore port_data is declared as a 1 based array with potential
+	 * empty slots at the beginning.
+	 */
+	pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
+					rdma_end_port(device) + 1),
+			    GFP_KERNEL);
+	if (!pdata_rcu)
 		return -ENOMEM;
+	/*
+	 * The rcu_head is put in front of the port data array and the stored
+	 * pointer is adjusted since we never need to see that member until
+	 * kfree_rcu.
+	 */
+	device->port_data = pdata_rcu->pdata;
 
-	context->client = client;
-	context->data   = NULL;
-	context->going_down = false;
+	rdma_for_each_port (device, port) {
+		struct ib_port_data *pdata = &device->port_data[port];
 
-	down_write(&lists_rwsem);
-	spin_lock_irqsave(&device->client_data_lock, flags);
-	list_add(&context->list, &device->client_data_list);
-	spin_unlock_irqrestore(&device->client_data_lock, flags);
-	up_write(&lists_rwsem);
-
+		pdata->ib_dev = device;
+		spin_lock_init(&pdata->pkey_list_lock);
+		INIT_LIST_HEAD(&pdata->pkey_list);
+		spin_lock_init(&pdata->netdev_lock);
+		INIT_HLIST_NODE(&pdata->ndev_hash_link);
+	}
 	return 0;
 }
 
@@ -320,29 +779,20 @@
 			    rdma_max_mad_size(dev, port) != 0);
 }
 
-static int read_port_immutable(struct ib_device *device)
+static int setup_port_data(struct ib_device *device)
 {
+	unsigned int port;
 	int ret;
-	u8 start_port = rdma_start_port(device);
-	u8 end_port = rdma_end_port(device);
-	u8 port;
 
-	/**
-	 * device->port_immutable is indexed directly by the port number to make
-	 * access to this data as efficient as possible.
-	 *
-	 * Therefore port_immutable is declared as a 1 based array with
-	 * potential empty slots at the beginning.
-	 */
-	device->port_immutable = kcalloc(end_port + 1,
-					 sizeof(*device->port_immutable),
-					 GFP_KERNEL);
-	if (!device->port_immutable)
-		return -ENOMEM;
+	ret = alloc_port_data(device);
+	if (ret)
+		return ret;
 
-	for (port = start_port; port <= end_port; ++port) {
-		ret = device->get_port_immutable(device, port,
-						 &device->port_immutable[port]);
+	rdma_for_each_port (device, port) {
+		struct ib_port_data *pdata = &device->port_data[port];
+
+		ret = device->ops.get_port_immutable(device, port,
+						     &pdata->immutable);
 		if (ret)
 			return ret;
 
@@ -354,46 +804,23 @@
 
 void ib_get_device_fw_str(struct ib_device *dev, char *str)
 {
-	if (dev->get_dev_fw_str)
-		dev->get_dev_fw_str(dev, str);
+	if (dev->ops.get_dev_fw_str)
+		dev->ops.get_dev_fw_str(dev, str);
 	else
 		str[0] = '\0';
 }
 EXPORT_SYMBOL(ib_get_device_fw_str);
 
-static int setup_port_pkey_list(struct ib_device *device)
-{
-	int i;
-
-	/**
-	 * device->port_pkey_list is indexed directly by the port number,
-	 * Therefore it is declared as a 1 based array with potential empty
-	 * slots at the beginning.
-	 */
-	device->port_pkey_list = kcalloc(rdma_end_port(device) + 1,
-					 sizeof(*device->port_pkey_list),
-					 GFP_KERNEL);
-
-	if (!device->port_pkey_list)
-		return -ENOMEM;
-
-	for (i = 0; i < (rdma_end_port(device) + 1); i++) {
-		spin_lock_init(&device->port_pkey_list[i].list_lock);
-		INIT_LIST_HEAD(&device->port_pkey_list[i].pkey_list);
-	}
-
-	return 0;
-}
-
 static void ib_policy_change_task(struct work_struct *work)
 {
 	struct ib_device *dev;
+	unsigned long index;
 
-	down_read(&lists_rwsem);
-	list_for_each_entry(dev, &device_list, core_list) {
-		int i;
+	down_read(&devices_rwsem);
+	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+		unsigned int i;
 
-		for (i = rdma_start_port(dev); i <= rdma_end_port(dev); i++) {
+		rdma_for_each_port (dev, i) {
 			u64 sp;
 			int ret = ib_get_cached_subnet_prefix(dev,
 							      i,
@@ -406,7 +833,7 @@
 				ib_security_cache_change(dev, i, sp);
 		}
 	}
-	up_read(&lists_rwsem);
+	up_read(&devices_rwsem);
 }
 
 static int ib_security_change(struct notifier_block *nb, unsigned long event,
@@ -416,50 +843,331 @@
 		return NOTIFY_DONE;
 
 	schedule_work(&ib_policy_change_work);
+	ib_mad_agent_security_change();
 
 	return NOTIFY_OK;
 }
 
-/**
- *	__dev_new_index	-	allocate an device index
- *
- *	Returns a suitable unique value for a new device interface
- *	number.  It assumes that there are less than 2^32-1 ib devices
- *	will be present in the system.
- */
-static u32 __dev_new_index(void)
+static void compatdev_release(struct device *dev)
 {
+	struct ib_core_device *cdev =
+		container_of(dev, struct ib_core_device, dev);
+
+	kfree(cdev);
+}
+
+static int add_one_compat_dev(struct ib_device *device,
+			      struct rdma_dev_net *rnet)
+{
+	struct ib_core_device *cdev;
+	int ret;
+
+	lockdep_assert_held(&rdma_nets_rwsem);
+	if (!ib_devices_shared_netns)
+		return 0;
+
 	/*
-	 * The device index to allow stable naming.
-	 * Similar to struct net -> ifindex.
+	 * Create and add compat device in all namespaces other than where it
+	 * is currently bound to.
 	 */
-	static u32 index;
+	if (net_eq(read_pnet(&rnet->net),
+		   read_pnet(&device->coredev.rdma_net)))
+		return 0;
 
-	for (;;) {
-		if (!(++index))
-			index = 1;
+	/*
+	 * The first of init_net() or ib_register_device() to take the
+	 * compat_devs_mutex wins and gets to add the device. Others will wait
+	 * for completion here.
+	 */
+	mutex_lock(&device->compat_devs_mutex);
+	cdev = xa_load(&device->compat_devs, rnet->id);
+	if (cdev) {
+		ret = 0;
+		goto done;
+	}
+	ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL);
+	if (ret)
+		goto done;
 
-		if (!__ib_device_get_by_index(index))
-			return index;
+	cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
+	if (!cdev) {
+		ret = -ENOMEM;
+		goto cdev_err;
+	}
+
+	cdev->dev.parent = device->dev.parent;
+	rdma_init_coredev(cdev, device, read_pnet(&rnet->net));
+	cdev->dev.release = compatdev_release;
+	dev_set_name(&cdev->dev, "%s", dev_name(&device->dev));
+
+	ret = device_add(&cdev->dev);
+	if (ret)
+		goto add_err;
+	ret = ib_setup_port_attrs(cdev);
+	if (ret)
+		goto port_err;
+
+	ret = xa_err(xa_store(&device->compat_devs, rnet->id,
+			      cdev, GFP_KERNEL));
+	if (ret)
+		goto insert_err;
+
+	mutex_unlock(&device->compat_devs_mutex);
+	return 0;
+
+insert_err:
+	ib_free_port_attrs(cdev);
+port_err:
+	device_del(&cdev->dev);
+add_err:
+	put_device(&cdev->dev);
+cdev_err:
+	xa_release(&device->compat_devs, rnet->id);
+done:
+	mutex_unlock(&device->compat_devs_mutex);
+	return ret;
+}
+
+static void remove_one_compat_dev(struct ib_device *device, u32 id)
+{
+	struct ib_core_device *cdev;
+
+	mutex_lock(&device->compat_devs_mutex);
+	cdev = xa_erase(&device->compat_devs, id);
+	mutex_unlock(&device->compat_devs_mutex);
+	if (cdev) {
+		ib_free_port_attrs(cdev);
+		device_del(&cdev->dev);
+		put_device(&cdev->dev);
 	}
 }
 
-/**
- * ib_register_device - Register an IB device with IB core
- * @device:Device to register
- *
- * Low-level drivers use ib_register_device() to register their
- * devices with the IB core.  All registered clients will receive a
- * callback for each device that is added. @device must be allocated
- * with ib_alloc_device().
- */
-int ib_register_device(struct ib_device *device,
-		       int (*port_callback)(struct ib_device *,
-					    u8, struct kobject *))
+static void remove_compat_devs(struct ib_device *device)
 {
+	struct ib_core_device *cdev;
+	unsigned long index;
+
+	xa_for_each (&device->compat_devs, index, cdev)
+		remove_one_compat_dev(device, index);
+}
+
+static int add_compat_devs(struct ib_device *device)
+{
+	struct rdma_dev_net *rnet;
+	unsigned long index;
+	int ret = 0;
+
+	lockdep_assert_held(&devices_rwsem);
+
+	down_read(&rdma_nets_rwsem);
+	xa_for_each (&rdma_nets, index, rnet) {
+		ret = add_one_compat_dev(device, rnet);
+		if (ret)
+			break;
+	}
+	up_read(&rdma_nets_rwsem);
+	return ret;
+}
+
+static void remove_all_compat_devs(void)
+{
+	struct ib_compat_device *cdev;
+	struct ib_device *dev;
+	unsigned long index;
+
+	down_read(&devices_rwsem);
+	xa_for_each (&devices, index, dev) {
+		unsigned long c_index = 0;
+
+		/* Hold nets_rwsem so that any other thread modifying this
+		 * system param can sync with this thread.
+		 */
+		down_read(&rdma_nets_rwsem);
+		xa_for_each (&dev->compat_devs, c_index, cdev)
+			remove_one_compat_dev(dev, c_index);
+		up_read(&rdma_nets_rwsem);
+	}
+	up_read(&devices_rwsem);
+}
+
+static int add_all_compat_devs(void)
+{
+	struct rdma_dev_net *rnet;
+	struct ib_device *dev;
+	unsigned long index;
+	int ret = 0;
+
+	down_read(&devices_rwsem);
+	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+		unsigned long net_index = 0;
+
+		/* Hold nets_rwsem so that any other thread modifying this
+		 * system param can sync with this thread.
+		 */
+		down_read(&rdma_nets_rwsem);
+		xa_for_each (&rdma_nets, net_index, rnet) {
+			ret = add_one_compat_dev(dev, rnet);
+			if (ret)
+				break;
+		}
+		up_read(&rdma_nets_rwsem);
+	}
+	up_read(&devices_rwsem);
+	if (ret)
+		remove_all_compat_devs();
+	return ret;
+}
+
+int rdma_compatdev_set(u8 enable)
+{
+	struct rdma_dev_net *rnet;
+	unsigned long index;
+	int ret = 0;
+
+	down_write(&rdma_nets_rwsem);
+	if (ib_devices_shared_netns == enable) {
+		up_write(&rdma_nets_rwsem);
+		return 0;
+	}
+
+	/* enable/disable of compat devices is not supported
+	 * when net namespaces other than the default init_net exist.
+	 */
+	xa_for_each (&rdma_nets, index, rnet) {
+		ret++;
+		break;
+	}
+	if (!ret)
+		ib_devices_shared_netns = enable;
+	up_write(&rdma_nets_rwsem);
+	if (ret)
+		return -EBUSY;
+
+	if (enable)
+		ret = add_all_compat_devs();
+	else
+		remove_all_compat_devs();
+	return ret;
+}
+
+static void rdma_dev_exit_net(struct net *net)
+{
+	struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
+	struct ib_device *dev;
+	unsigned long index;
 	int ret;
-	struct ib_client *client;
-	struct ib_udata uhw = {.outlen = 0, .inlen = 0};
+
+	down_write(&rdma_nets_rwsem);
+	/*
+	 * Prevent the ID from being re-used and hide the id from xa_for_each.
+	 */
+	ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL));
+	WARN_ON(ret);
+	up_write(&rdma_nets_rwsem);
+
+	down_read(&devices_rwsem);
+	xa_for_each (&devices, index, dev) {
+		get_device(&dev->dev);
+		/*
+		 * Release the devices_rwsem so that the potentially blocking
+		 * device_del() doesn't hold the devices_rwsem for too long.
+		 */
+		up_read(&devices_rwsem);
+
+		remove_one_compat_dev(dev, rnet->id);
+
+		/*
+		 * If the real device is in the NS then move it back to init.
+		 */
+		rdma_dev_change_netns(dev, net, &init_net);
+
+		put_device(&dev->dev);
+		down_read(&devices_rwsem);
+	}
+	up_read(&devices_rwsem);
+
+	rdma_nl_net_exit(rnet);
+	xa_erase(&rdma_nets, rnet->id);
+}
+
+static __net_init int rdma_dev_init_net(struct net *net)
+{
+	struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
+	unsigned long index;
+	struct ib_device *dev;
+	int ret;
+
+	write_pnet(&rnet->net, net);
+
+	ret = rdma_nl_net_init(rnet);
+	if (ret)
+		return ret;
+
+	/* No need to create any compat devices in default init_net. */
+	if (net_eq(net, &init_net))
+		return 0;
+
+	ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL);
+	if (ret) {
+		rdma_nl_net_exit(rnet);
+		return ret;
+	}
+
+	down_read(&devices_rwsem);
+	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+		/* Hold nets_rwsem so that netlink command cannot change
+		 * system configuration for device sharing mode.
+		 */
+		down_read(&rdma_nets_rwsem);
+		ret = add_one_compat_dev(dev, rnet);
+		up_read(&rdma_nets_rwsem);
+		if (ret)
+			break;
+	}
+	up_read(&devices_rwsem);
+
+	if (ret)
+		rdma_dev_exit_net(net);
+
+	return ret;
+}
+
+/*
+ * Assign the unique string device name and the unique device index. This is
+ * undone by ib_dealloc_device.
+ */
+static int assign_name(struct ib_device *device, const char *name)
+{
+	static u32 last_id;
+	int ret;
+
+	down_write(&devices_rwsem);
+	/* Assign a unique name to the device */
+	if (strchr(name, '%'))
+		ret = alloc_name(device, name);
+	else
+		ret = dev_set_name(&device->dev, name);
+	if (ret)
+		goto out;
+
+	if (__ib_device_get_by_name(dev_name(&device->dev))) {
+		ret = -ENFILE;
+		goto out;
+	}
+	strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
+
+	ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b,
+			&last_id, GFP_KERNEL);
+	if (ret > 0)
+		ret = 0;
+
+out:
+	up_write(&devices_rwsem);
+	return ret;
+}
+
+static void setup_dma_device(struct ib_device *device)
+{
 	struct device *parent = device->dev.parent;
 
 	WARN_ON_ONCE(device->dma_device);
@@ -491,135 +1199,516 @@
 		WARN_ON_ONCE(!parent);
 		device->dma_device = parent;
 	}
+	/* Setup default max segment size for all IB devices */
+	dma_set_max_seg_size(device->dma_device, SZ_2G);
 
-	mutex_lock(&device_mutex);
+}
 
-	if (strchr(device->name, '%')) {
-		ret = alloc_name(device->name);
+/*
+ * setup_device() allocates memory and sets up data that requires calling the
+ * device ops, this is the only reason these actions are not done during
+ * ib_alloc_device. It is undone by ib_dealloc_device().
+ */
+static int setup_device(struct ib_device *device)
+{
+	struct ib_udata uhw = {.outlen = 0, .inlen = 0};
+	int ret;
+
+	setup_dma_device(device);
+	ib_device_check_mandatory(device);
+
+	ret = setup_port_data(device);
+	if (ret) {
+		dev_warn(&device->dev, "Couldn't create per-port data\n");
+		return ret;
+	}
+
+	memset(&device->attrs, 0, sizeof(device->attrs));
+	ret = device->ops.query_device(device, &device->attrs, &uhw);
+	if (ret) {
+		dev_warn(&device->dev,
+			 "Couldn't query the device attributes\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static void disable_device(struct ib_device *device)
+{
+	u32 cid;
+
+	WARN_ON(!refcount_read(&device->refcount));
+
+	down_write(&devices_rwsem);
+	xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
+	up_write(&devices_rwsem);
+
+	/*
+	 * Remove clients in LIFO order, see assign_client_id. This could be
+	 * more efficient if xarray learns to reverse iterate. Since no new
+	 * clients can be added to this ib_device past this point we only need
+	 * the maximum possible client_id value here.
+	 */
+	down_read(&clients_rwsem);
+	cid = highest_client_id;
+	up_read(&clients_rwsem);
+	while (cid) {
+		cid--;
+		remove_client_context(device, cid);
+	}
+
+	/* Pairs with refcount_set in enable_device */
+	ib_device_put(device);
+	wait_for_completion(&device->unreg_completion);
+
+	/*
+	 * compat devices must be removed after device refcount drops to zero.
+	 * Otherwise init_net() may add more compatdevs after removing compat
+	 * devices and before device is disabled.
+	 */
+	remove_compat_devs(device);
+}
+
+/*
+ * An enabled device is visible to all clients and to all the public facing
+ * APIs that return a device pointer. This always returns with a new get, even
+ * if it fails.
+ */
+static int enable_device_and_get(struct ib_device *device)
+{
+	struct ib_client *client;
+	unsigned long index;
+	int ret = 0;
+
+	/*
+	 * One ref belongs to the xa and the other belongs to this
+	 * thread. This is needed to guard against parallel unregistration.
+	 */
+	refcount_set(&device->refcount, 2);
+	down_write(&devices_rwsem);
+	xa_set_mark(&devices, device->index, DEVICE_REGISTERED);
+
+	/*
+	 * By using downgrade_write() we ensure that no other thread can clear
+	 * DEVICE_REGISTERED while we are completing the client setup.
+	 */
+	downgrade_write(&devices_rwsem);
+
+	if (device->ops.enable_driver) {
+		ret = device->ops.enable_driver(device);
 		if (ret)
 			goto out;
 	}
 
-	if (ib_device_check_mandatory(device)) {
-		ret = -EINVAL;
-		goto out;
+	down_read(&clients_rwsem);
+	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
+		ret = add_client_context(device, client);
+		if (ret)
+			break;
 	}
+	up_read(&clients_rwsem);
+	if (!ret)
+		ret = add_compat_devs(device);
+out:
+	up_read(&devices_rwsem);
+	return ret;
+}
 
-	ret = read_port_immutable(device);
-	if (ret) {
-		pr_warn("Couldn't create per port immutable data %s\n",
-			device->name);
-		goto out;
-	}
+/**
+ * ib_register_device - Register an IB device with IB core
+ * @device:Device to register
+ *
+ * Low-level drivers use ib_register_device() to register their
+ * devices with the IB core.  All registered clients will receive a
+ * callback for each device that is added. @device must be allocated
+ * with ib_alloc_device().
+ *
+ * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
+ * asynchronously then the device pointer may be freed as soon as this
+ * function returns.
+ */
+int ib_register_device(struct ib_device *device, const char *name)
+{
+	int ret;
 
-	ret = setup_port_pkey_list(device);
-	if (ret) {
-		pr_warn("Couldn't create per port_pkey_list\n");
-		goto out;
-	}
+	ret = assign_name(device, name);
+	if (ret)
+		return ret;
+
+	ret = setup_device(device);
+	if (ret)
+		return ret;
 
 	ret = ib_cache_setup_one(device);
 	if (ret) {
-		pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n");
-		goto port_cleanup;
+		dev_warn(&device->dev,
+			 "Couldn't set up InfiniBand P_Key/GID cache\n");
+		return ret;
 	}
 
-	ret = ib_device_register_rdmacg(device);
-	if (ret) {
-		pr_warn("Couldn't register device with rdma cgroup\n");
-		goto cache_cleanup;
-	}
+	ib_device_register_rdmacg(device);
 
-	memset(&device->attrs, 0, sizeof(device->attrs));
-	ret = device->query_device(device, &device->attrs, &uhw);
-	if (ret) {
-		pr_warn("Couldn't query the device attributes\n");
+	rdma_counter_init(device);
+
+	/*
+	 * Ensure that ADD uevent is not fired because it
+	 * is too early amd device is not initialized yet.
+	 */
+	dev_set_uevent_suppress(&device->dev, true);
+	ret = device_add(&device->dev);
+	if (ret)
 		goto cg_cleanup;
-	}
 
-	ret = ib_device_register_sysfs(device, port_callback);
+	ret = ib_device_register_sysfs(device);
 	if (ret) {
-		pr_warn("Couldn't register device %s with driver model\n",
-			device->name);
-		goto cg_cleanup;
+		dev_warn(&device->dev,
+			 "Couldn't register device with driver model\n");
+		goto dev_cleanup;
 	}
 
-	device->reg_state = IB_DEV_REGISTERED;
+	ret = enable_device_and_get(device);
+	dev_set_uevent_suppress(&device->dev, false);
+	/* Mark for userspace that device is ready */
+	kobject_uevent(&device->dev.kobj, KOBJ_ADD);
+	if (ret) {
+		void (*dealloc_fn)(struct ib_device *);
 
-	list_for_each_entry(client, &client_list, list)
-		if (!add_client_context(device, client) && client->add)
-			client->add(device);
+		/*
+		 * If we hit this error flow then we don't want to
+		 * automatically dealloc the device since the caller is
+		 * expected to call ib_dealloc_device() after
+		 * ib_register_device() fails. This is tricky due to the
+		 * possibility for a parallel unregistration along with this
+		 * error flow. Since we have a refcount here we know any
+		 * parallel flow is stopped in disable_device and will see the
+		 * NULL pointers, causing the responsibility to
+		 * ib_dealloc_device() to revert back to this thread.
+		 */
+		dealloc_fn = device->ops.dealloc_driver;
+		device->ops.dealloc_driver = NULL;
+		ib_device_put(device);
+		__ib_unregister_device(device);
+		device->ops.dealloc_driver = dealloc_fn;
+		return ret;
+	}
+	ib_device_put(device);
 
-	device->index = __dev_new_index();
-	down_write(&lists_rwsem);
-	list_add_tail(&device->core_list, &device_list);
-	up_write(&lists_rwsem);
-	mutex_unlock(&device_mutex);
 	return 0;
 
+dev_cleanup:
+	device_del(&device->dev);
 cg_cleanup:
+	dev_set_uevent_suppress(&device->dev, false);
 	ib_device_unregister_rdmacg(device);
-cache_cleanup:
 	ib_cache_cleanup_one(device);
-	ib_cache_release_one(device);
-port_cleanup:
-	kfree(device->port_immutable);
-out:
-	mutex_unlock(&device_mutex);
 	return ret;
 }
 EXPORT_SYMBOL(ib_register_device);
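/*
 * Sketch of the registration sequence a low-level driver follows with this
 * API, assuming the ib_alloc_device()/ib_set_device_ops() helpers; the
 * my_dev/my_dev_ops names are hypothetical:
 *
 *	struct my_dev *mdev = ib_alloc_device(my_dev, ibdev);
 *
 *	if (!mdev)
 *		return -ENOMEM;
 *	ib_set_device_ops(&mdev->ibdev, &my_dev_ops);
 *	ret = ib_register_device(&mdev->ibdev, "my%d");
 *	if (ret)
 *		ib_dealloc_device(&mdev->ibdev);
 */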
 
+/* Callers must hold a get on the device. */
+static void __ib_unregister_device(struct ib_device *ib_dev)
+{
+	/*
+	 * We have a registration lock so that all the calls to unregister are
+	 * fully fenced, once any unregister returns the device is truly
+	 * unregistered even if multiple callers are unregistering it at the
+	 * same time. This also interacts with the registration flow and
+	 * provides sane semantics if register and unregister are racing.
+	 */
+	mutex_lock(&ib_dev->unregistration_lock);
+	if (!refcount_read(&ib_dev->refcount))
+		goto out;
+
+	disable_device(ib_dev);
+
+	/* Expedite removing unregistered pointers from the hash table */
+	free_netdevs(ib_dev);
+
+	ib_device_unregister_sysfs(ib_dev);
+	device_del(&ib_dev->dev);
+	ib_device_unregister_rdmacg(ib_dev);
+	ib_cache_cleanup_one(ib_dev);
+
+	/*
+	 * Drivers using the new flow may not call ib_dealloc_device except
+	 * in error unwind prior to registration success.
+	 */
+	if (ib_dev->ops.dealloc_driver) {
+		WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
+		ib_dealloc_device(ib_dev);
+	}
+out:
+	mutex_unlock(&ib_dev->unregistration_lock);
+}
+
 /**
  * ib_unregister_device - Unregister an IB device
- * @device:Device to unregister
+ * @device: The device to unregister
  *
  * Unregister an IB device.  All clients will receive a remove callback.
+ *
+ * Callers should call this routine only once, and protect against races with
+ * registration. Typically it should only be called as part of a remove
+ * callback in an implementation of driver core's struct device_driver and
+ * related.
+ *
+ * If ops.dealloc_driver is used then ib_dev will be freed upon return from
+ * this function.
  */
-void ib_unregister_device(struct ib_device *device)
+void ib_unregister_device(struct ib_device *ib_dev)
 {
-	struct ib_client_data *context, *tmp;
-	unsigned long flags;
-
-	mutex_lock(&device_mutex);
-
-	down_write(&lists_rwsem);
-	list_del(&device->core_list);
-	spin_lock_irqsave(&device->client_data_lock, flags);
-	list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
-		context->going_down = true;
-	spin_unlock_irqrestore(&device->client_data_lock, flags);
-	downgrade_write(&lists_rwsem);
-
-	list_for_each_entry_safe(context, tmp, &device->client_data_list,
-				 list) {
-		if (context->client->remove)
-			context->client->remove(device, context->data);
-	}
-	up_read(&lists_rwsem);
-
-	ib_device_unregister_rdmacg(device);
-	ib_device_unregister_sysfs(device);
-
-	mutex_unlock(&device_mutex);
-
-	ib_cache_cleanup_one(device);
-
-	ib_security_destroy_port_pkey_list(device);
-	kfree(device->port_pkey_list);
-
-	down_write(&lists_rwsem);
-	spin_lock_irqsave(&device->client_data_lock, flags);
-	list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
-		kfree(context);
-	spin_unlock_irqrestore(&device->client_data_lock, flags);
-	up_write(&lists_rwsem);
-
-	device->reg_state = IB_DEV_UNREGISTERED;
+	get_device(&ib_dev->dev);
+	__ib_unregister_device(ib_dev);
+	put_device(&ib_dev->dev);
 }
 EXPORT_SYMBOL(ib_unregister_device);
 
 /**
+ * ib_unregister_device_and_put - Unregister a device while holding a 'get'
+ * @ib_dev: The device to unregister
+ *
+ * This is the same as ib_unregister_device(), except it includes an internal
+ * ib_device_put() that should match a 'get' obtained by the caller.
+ *
+ * It is safe to call this routine concurrently from multiple threads while
+ * holding the 'get'. When the function returns the device is fully
+ * unregistered.
+ *
+ * Drivers using this flow MUST use the driver_unregister callback to clean up
+ * their resources associated with the device and dealloc it.
+ */
+void ib_unregister_device_and_put(struct ib_device *ib_dev)
+{
+	WARN_ON(!ib_dev->ops.dealloc_driver);
+	get_device(&ib_dev->dev);
+	ib_device_put(ib_dev);
+	__ib_unregister_device(ib_dev);
+	put_device(&ib_dev->dev);
+}
+EXPORT_SYMBOL(ib_unregister_device_and_put);
+
+/**
+ * ib_unregister_driver - Unregister all IB devices for a driver
+ * @driver_id: The driver to unregister
+ *
+ * This implements a fence for device unregistration. It only returns once all
+ * devices associated with the driver_id have fully completed their
+ * unregistration and returned from ib_unregister_device*().
+ *
+ * If devices are not yet unregistered it goes ahead and starts unregistering
+ * them.
+ *
+ * This does not block creation of new devices with the given driver_id, that
+ * is the responsibility of the caller.
+ */
+void ib_unregister_driver(enum rdma_driver_id driver_id)
+{
+	struct ib_device *ib_dev;
+	unsigned long index;
+
+	down_read(&devices_rwsem);
+	xa_for_each (&devices, index, ib_dev) {
+		if (ib_dev->ops.driver_id != driver_id)
+			continue;
+
+		get_device(&ib_dev->dev);
+		up_read(&devices_rwsem);
+
+		WARN_ON(!ib_dev->ops.dealloc_driver);
+		__ib_unregister_device(ib_dev);
+
+		put_device(&ib_dev->dev);
+		down_read(&devices_rwsem);
+	}
+	up_read(&devices_rwsem);
+}
+EXPORT_SYMBOL(ib_unregister_driver);
+
+static void ib_unregister_work(struct work_struct *work)
+{
+	struct ib_device *ib_dev =
+		container_of(work, struct ib_device, unregistration_work);
+
+	__ib_unregister_device(ib_dev);
+	put_device(&ib_dev->dev);
+}
+
+/**
+ * ib_unregister_device_queued - Unregister a device using a work queue
+ * @ib_dev: The device to unregister
+ *
+ * This schedules an asynchronous unregistration using a WQ for the device. A
+ * driver should use this to avoid holding locks while doing unregistration,
+ * such as holding the RTNL lock.
+ *
+ * Drivers using this API must use ib_unregister_driver before module unload
+ * to ensure that all scheduled unregistrations have completed.
+ */
+void ib_unregister_device_queued(struct ib_device *ib_dev)
+{
+	WARN_ON(!refcount_read(&ib_dev->refcount));
+	WARN_ON(!ib_dev->ops.dealloc_driver);
+	get_device(&ib_dev->dev);
+	if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work))
+		put_device(&ib_dev->dev);
+}
+EXPORT_SYMBOL(ib_unregister_device_queued);
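/*
 * Sketch of how the queued flow is combined with ib_unregister_driver() at
 * module unload, per the requirement above; RDMA_DRIVER_MY is a placeholder
 * for the driver's real rdma_driver_id:
 *
 *	ib_unregister_device_queued(&mdev->ibdev);	e.g. from a notifier
 *
 *	static void __exit my_exit(void)
 *	{
 *		ib_unregister_driver(RDMA_DRIVER_MY);
 *	}
 */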
+
+/*
+ * The caller must pass in a device that has the kref held and the refcount
+ * released. If the device is in cur_net and still registered then it is moved
+ * into net.
+ */
+static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
+				 struct net *net)
+{
+	int ret2 = -EINVAL;
+	int ret;
+
+	mutex_lock(&device->unregistration_lock);
+
+	/*
+	 * If a device is not under ib_device_get() or if the unregistration_lock
+	 * is not held, the namespace can be changed or the device unregistered.
+	 * Check again under the lock.
+	 */
+	if (refcount_read(&device->refcount) == 0 ||
+	    !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	kobject_uevent(&device->dev.kobj, KOBJ_REMOVE);
+	disable_device(device);
+
+	/*
+	 * At this point no one can be using the device, so it is safe to
+	 * change the namespace.
+	 */
+	write_pnet(&device->coredev.rdma_net, net);
+
+	down_read(&devices_rwsem);
+	/*
+	 * Currently rdma devices are system wide unique. So the device name
+	 * is guaranteed free in the new namespace. Publish the new namespace
+	 * at the sysfs level.
+	 */
+	ret = device_rename(&device->dev, dev_name(&device->dev));
+	up_read(&devices_rwsem);
+	if (ret) {
+		dev_warn(&device->dev,
+			 "%s: Couldn't rename device after namespace change\n",
+			 __func__);
+		/* Try and put things back and re-enable the device */
+		write_pnet(&device->coredev.rdma_net, cur_net);
+	}
+
+	ret2 = enable_device_and_get(device);
+	if (ret2) {
+		/*
+		 * This shouldn't really happen, but if it does, let the user
+		 * retry at later point. So don't disable the device.
+		 */
+		dev_warn(&device->dev,
+			 "%s: Couldn't re-enable device after namespace change\n",
+			 __func__);
+	}
+	kobject_uevent(&device->dev.kobj, KOBJ_ADD);
+
+	ib_device_put(device);
+out:
+	mutex_unlock(&device->unregistration_lock);
+	if (ret)
+		return ret;
+	return ret2;
+}
+
+int ib_device_set_netns_put(struct sk_buff *skb,
+			    struct ib_device *dev, u32 ns_fd)
+{
+	struct net *net;
+	int ret;
+
+	net = get_net_ns_by_fd(ns_fd);
+	if (IS_ERR(net)) {
+		ret = PTR_ERR(net);
+		goto net_err;
+	}
+
+	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
+		ret = -EPERM;
+		goto ns_err;
+	}
+
+	/*
+	 * Currently supported only for those providers which support
+	 * disassociation and don't do port specific sysfs init. Once a
+	 * port_cleanup infrastructure is implemented, this limitation will be
+	 * removed.
+	 */
+	if (!dev->ops.disassociate_ucontext || dev->ops.init_port ||
+	    ib_devices_shared_netns) {
+		ret = -EOPNOTSUPP;
+		goto ns_err;
+	}
+
+	get_device(&dev->dev);
+	ib_device_put(dev);
+	ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net);
+	put_device(&dev->dev);
+
+	put_net(net);
+	return ret;
+
+ns_err:
+	put_net(net);
+net_err:
+	ib_device_put(dev);
+	return ret;
+}
+
+static struct pernet_operations rdma_dev_net_ops = {
+	.init = rdma_dev_init_net,
+	.exit = rdma_dev_exit_net,
+	.id = &rdma_dev_net_id,
+	.size = sizeof(struct rdma_dev_net),
+};
+
+static int assign_client_id(struct ib_client *client)
+{
+	int ret;
+
+	down_write(&clients_rwsem);
+	/*
+	 * The add/remove callbacks must be called in FIFO/LIFO order. To
+	 * achieve this we assign client_ids so they are sorted in
+	 * registration order.
+	 */
+	client->client_id = highest_client_id;
+	ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL);
+	if (ret)
+		goto out;
+
+	highest_client_id++;
+	xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
+
+out:
+	up_write(&clients_rwsem);
+	return ret;
+}
+
+static void remove_client_id(struct ib_client *client)
+{
+	down_write(&clients_rwsem);
+	xa_erase(&clients, client->client_id);
+	for (; highest_client_id; highest_client_id--)
+		if (xa_load(&clients, highest_client_id - 1))
+			break;
+	up_write(&clients_rwsem);
+}
+
+/**
  * ib_register_client - Register an IB client
  * @client:Client to register
  *
@@ -635,19 +1724,25 @@
 int ib_register_client(struct ib_client *client)
 {
 	struct ib_device *device;
+	unsigned long index;
+	int ret;
 
-	mutex_lock(&device_mutex);
+	refcount_set(&client->uses, 1);
+	init_completion(&client->uses_zero);
+	ret = assign_client_id(client);
+	if (ret)
+		return ret;
 
-	list_for_each_entry(device, &device_list, core_list)
-		if (!add_client_context(device, client) && client->add)
-			client->add(device);
-
-	down_write(&lists_rwsem);
-	list_add_tail(&client->list, &client_list);
-	up_write(&lists_rwsem);
-
-	mutex_unlock(&device_mutex);
-
+	down_read(&devices_rwsem);
+	xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
+		ret = add_client_context(device, client);
+		if (ret) {
+			up_read(&devices_rwsem);
+			ib_unregister_client(client);
+			return ret;
+		}
+	}
+	up_read(&devices_rwsem);
 	return 0;
 }
 EXPORT_SYMBOL(ib_register_client);
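/*
 * For reference, the client-side pattern this serves is unchanged: a kernel
 * client declares add/remove callbacks and registers once, e.g.:
 *
 *	static void my_add_one(struct ib_device *device)
 *	{
 *		...
 *	}
 *
 *	static void my_remove_one(struct ib_device *device, void *client_data)
 *	{
 *		...
 *	}
 *
 *	static struct ib_client my_client = {
 *		.name   = "my_client",
 *		.add    = my_add_one,
 *		.remove = my_remove_one,
 *	};
 *
 *	ret = ib_register_client(&my_client);
 *	...
 *	ib_unregister_client(&my_client);
 */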
@@ -659,80 +1754,140 @@
  * Upper level users use ib_unregister_client() to remove their client
  * registration.  When ib_unregister_client() is called, the client
  * will receive a remove callback for each IB device still registered.
+ *
+ * This is a full fence, once it returns no client callbacks will be called,
+ * or are running in another thread.
  */
 void ib_unregister_client(struct ib_client *client)
 {
-	struct ib_client_data *context, *tmp;
 	struct ib_device *device;
-	unsigned long flags;
+	unsigned long index;
 
-	mutex_lock(&device_mutex);
+	down_write(&clients_rwsem);
+	ib_client_put(client);
+	xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
+	up_write(&clients_rwsem);
 
-	down_write(&lists_rwsem);
-	list_del(&client->list);
-	up_write(&lists_rwsem);
-
-	list_for_each_entry(device, &device_list, core_list) {
-		struct ib_client_data *found_context = NULL;
-
-		down_write(&lists_rwsem);
-		spin_lock_irqsave(&device->client_data_lock, flags);
-		list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
-			if (context->client == client) {
-				context->going_down = true;
-				found_context = context;
-				break;
-			}
-		spin_unlock_irqrestore(&device->client_data_lock, flags);
-		up_write(&lists_rwsem);
-
-		if (client->remove)
-			client->remove(device, found_context ?
-					       found_context->data : NULL);
-
-		if (!found_context) {
-			pr_warn("No client context found for %s/%s\n",
-				device->name, client->name);
+	/* We do not want to have locks while calling client->remove() */
+	rcu_read_lock();
+	xa_for_each (&devices, index, device) {
+		if (!ib_device_try_get(device))
 			continue;
-		}
+		rcu_read_unlock();
 
-		down_write(&lists_rwsem);
-		spin_lock_irqsave(&device->client_data_lock, flags);
-		list_del(&found_context->list);
-		kfree(found_context);
-		spin_unlock_irqrestore(&device->client_data_lock, flags);
-		up_write(&lists_rwsem);
+		remove_client_context(device, client->client_id);
+
+		ib_device_put(device);
+		rcu_read_lock();
 	}
+	rcu_read_unlock();
 
-	mutex_unlock(&device_mutex);
+	/*
+	 * remove_client_context() is not a fence, it can return even though a
+	 * removal is ongoing. Wait until all removals are completed.
+	 */
+	wait_for_completion(&client->uses_zero);
+	remove_client_id(client);
 }
 EXPORT_SYMBOL(ib_unregister_client);
 
-/**
- * ib_get_client_data - Get IB client context
- * @device:Device to get context for
- * @client:Client to get context for
- *
- * ib_get_client_data() returns client context set with
- * ib_set_client_data().
- */
-void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
+static int __ib_get_global_client_nl_info(const char *client_name,
+					  struct ib_client_nl_info *res)
 {
-	struct ib_client_data *context;
-	void *ret = NULL;
-	unsigned long flags;
+	struct ib_client *client;
+	unsigned long index;
+	int ret = -ENOENT;
 
-	spin_lock_irqsave(&device->client_data_lock, flags);
-	list_for_each_entry(context, &device->client_data_list, list)
-		if (context->client == client) {
-			ret = context->data;
+	down_read(&clients_rwsem);
+	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
+		if (strcmp(client->name, client_name) != 0)
+			continue;
+		if (!client->get_global_nl_info) {
+			ret = -EOPNOTSUPP;
 			break;
 		}
-	spin_unlock_irqrestore(&device->client_data_lock, flags);
+		ret = client->get_global_nl_info(res);
+		if (WARN_ON(ret == -ENOENT))
+			ret = -EINVAL;
+		if (!ret && res->cdev)
+			get_device(res->cdev);
+		break;
+	}
+	up_read(&clients_rwsem);
+	return ret;
+}
+
+static int __ib_get_client_nl_info(struct ib_device *ibdev,
+				   const char *client_name,
+				   struct ib_client_nl_info *res)
+{
+	unsigned long index;
+	void *client_data;
+	int ret = -ENOENT;
+
+	down_read(&ibdev->client_data_rwsem);
+	xan_for_each_marked (&ibdev->client_data, index, client_data,
+			     CLIENT_DATA_REGISTERED) {
+		struct ib_client *client = xa_load(&clients, index);
+
+		if (!client || strcmp(client->name, client_name) != 0)
+			continue;
+		if (!client->get_nl_info) {
+			ret = -EOPNOTSUPP;
+			break;
+		}
+		ret = client->get_nl_info(ibdev, client_data, res);
+		if (WARN_ON(ret == -ENOENT))
+			ret = -EINVAL;
+
+		/*
+		 * The cdev is guaranteed valid as long as we are inside the
+		 * client_data_rwsem as remove_one can't be called. Keep it
+		 * valid for the caller.
+		 */
+		if (!ret && res->cdev)
+			get_device(res->cdev);
+		break;
+	}
+	up_read(&ibdev->client_data_rwsem);
 
 	return ret;
 }
-EXPORT_SYMBOL(ib_get_client_data);
+
+/**
+ * ib_get_client_nl_info - Fetch the nl_info from a client
+ * @ibdev: IB device to query, or NULL to query the global list of clients
+ * @client_name: Name of the client
+ * @res: Result of the query
+ */
+int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name,
+			  struct ib_client_nl_info *res)
+{
+	int ret;
+
+	if (ibdev)
+		ret = __ib_get_client_nl_info(ibdev, client_name, res);
+	else
+		ret = __ib_get_global_client_nl_info(client_name, res);
+#ifdef CONFIG_MODULES
+	if (ret == -ENOENT) {
+		request_module("rdma-client-%s", client_name);
+		if (ibdev)
+			ret = __ib_get_client_nl_info(ibdev, client_name, res);
+		else
+			ret = __ib_get_global_client_nl_info(client_name, res);
+	}
+#endif
+	if (ret) {
+		if (ret == -ENOENT)
+			return -EOPNOTSUPP;
+		return ret;
+	}
+
+	if (WARN_ON(!res->cdev))
+		return -EINVAL;
+	return 0;
+}
 
 /**
  * ib_set_client_data - Set IB client context
@@ -740,27 +1895,22 @@
  * @client:Client to set context for
  * @data:Context to set
  *
- * ib_set_client_data() sets client context that can be retrieved with
- * ib_get_client_data().
+ * ib_set_client_data() sets client context data that can be retrieved with
+ * ib_get_client_data(). This can only be called while the client is
+ * registered to the device, once the ib_client remove() callback returns this
+ * cannot be called.
  */
 void ib_set_client_data(struct ib_device *device, struct ib_client *client,
 			void *data)
 {
-	struct ib_client_data *context;
-	unsigned long flags;
+	void *rc;
 
-	spin_lock_irqsave(&device->client_data_lock, flags);
-	list_for_each_entry(context, &device->client_data_list, list)
-		if (context->client == client) {
-			context->data = data;
-			goto out;
-		}
+	if (WARN_ON(IS_ERR(data)))
+		data = NULL;
 
-	pr_warn("No client context found for %s/%s\n",
-		device->name, client->name);
-
-out:
-	spin_unlock_irqrestore(&device->client_data_lock, flags);
+	rc = xa_store(&device->client_data, client->client_id, data,
+		      GFP_KERNEL);
+	WARN_ON(xa_is_err(rc));
 }
 EXPORT_SYMBOL(ib_set_client_data);
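A client typically allocates per-device state in its add() callback, stores it
with ib_set_client_data(), and gets the same pointer back in remove() (and via
ib_get_client_data() elsewhere). A sketch reusing the hypothetical my_client
above, with struct my_ctx standing in for whatever the client tracks:

	static void my_add_one(struct ib_device *device)
	{
		struct my_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

		if (!ctx)
			return;
		ib_set_client_data(device, &my_client, ctx);
	}

	static void my_remove_one(struct ib_device *device, void *client_data)
	{
		/* client_data is the pointer last stored by ib_set_client_data() */
		kfree(client_data);
	}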
 
@@ -823,6 +1973,75 @@
 }
 EXPORT_SYMBOL(ib_dispatch_event);
 
+static int iw_query_port(struct ib_device *device,
+			   u8 port_num,
+			   struct ib_port_attr *port_attr)
+{
+	struct in_device *inetdev;
+	struct net_device *netdev;
+	int err;
+
+	memset(port_attr, 0, sizeof(*port_attr));
+
+	netdev = ib_device_get_netdev(device, port_num);
+	if (!netdev)
+		return -ENODEV;
+
+	port_attr->max_mtu = IB_MTU_4096;
+	port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu);
+
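+	/*
+	 * Derive the IB port state from the netdev: no carrier reports the
+	 * port as DOWN; with carrier, an assigned IPv4 address reports
+	 * ACTIVE, otherwise INIT.
+	 */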
+	if (!netif_carrier_ok(netdev)) {
+		port_attr->state = IB_PORT_DOWN;
+		port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
+	} else {
+		rcu_read_lock();
+		inetdev = __in_dev_get_rcu(netdev);
+
+		if (inetdev && inetdev->ifa_list) {
+			port_attr->state = IB_PORT_ACTIVE;
+			port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
+		} else {
+			port_attr->state = IB_PORT_INIT;
+			port_attr->phys_state =
+				IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING;
+		}
+
+		rcu_read_unlock();
+	}
+
+	dev_put(netdev);
+	err = device->ops.query_port(device, port_num, port_attr);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static int __ib_query_port(struct ib_device *device,
+			   u8 port_num,
+			   struct ib_port_attr *port_attr)
+{
+	union ib_gid gid = {};
+	int err;
+
+	memset(port_attr, 0, sizeof(*port_attr));
+
+	err = device->ops.query_port(device, port_num, port_attr);
+	if (err || port_attr->subnet_prefix)
+		return err;
+
+	if (rdma_port_get_link_layer(device, port_num) !=
+	    IB_LINK_LAYER_INFINIBAND)
+		return 0;
+
+	err = device->ops.query_gid(device, port_num, 0, &gid);
+	if (err)
+		return err;
+
+	port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix);
+	return 0;
+}
+
 /**
  * ib_query_port - Query IB port attributes
  * @device:Device to query
@@ -836,29 +2055,198 @@
 		  u8 port_num,
 		  struct ib_port_attr *port_attr)
 {
-	union ib_gid gid;
-	int err;
-
 	if (!rdma_is_port_valid(device, port_num))
 		return -EINVAL;
 
-	memset(port_attr, 0, sizeof(*port_attr));
-	err = device->query_port(device, port_num, port_attr);
-	if (err || port_attr->subnet_prefix)
-		return err;
-
-	if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND)
-		return 0;
-
-	err = device->query_gid(device, port_num, 0, &gid);
-	if (err)
-		return err;
-
-	port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix);
-	return 0;
+	if (rdma_protocol_iwarp(device, port_num))
+		return iw_query_port(device, port_num, port_attr);
+	else
+		return __ib_query_port(device, port_num, port_attr);
 }
 EXPORT_SYMBOL(ib_query_port);
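Callers see a single interface whichever path is taken above. A minimal query
of port 1 (ports are numbered from 1) might look like:

	struct ib_port_attr attr;
	int ret;

	ret = ib_query_port(device, 1, &attr);
	if (ret)
		return ret;
	if (attr.state == IB_PORT_ACTIVE)
		pr_info("port 1 is active, MTU enum %d\n", attr.active_mtu);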
 
+static void add_ndev_hash(struct ib_port_data *pdata)
+{
+	unsigned long flags;
+
+	might_sleep();
+
+	spin_lock_irqsave(&ndev_hash_lock, flags);
+	if (hash_hashed(&pdata->ndev_hash_link)) {
+		hash_del_rcu(&pdata->ndev_hash_link);
+		spin_unlock_irqrestore(&ndev_hash_lock, flags);
+		/*
+		 * We cannot do hash_add_rcu after a hash_del_rcu until the
+		 * grace period
+		 */
+		synchronize_rcu();
+		spin_lock_irqsave(&ndev_hash_lock, flags);
+	}
+	if (pdata->netdev)
+		hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
+			     (uintptr_t)pdata->netdev);
+	spin_unlock_irqrestore(&ndev_hash_lock, flags);
+}
+
+/**
+ * ib_device_set_netdev - Associate the ib_dev with an underlying net_device
+ * @ib_dev: Device to modify
+ * @ndev: net_device to affiliate, may be NULL
+ * @port: IB port the net_device is connected to
+ *
+ * Drivers should use this to link the ib_device to a netdev so the netdev
+ * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be
+ * affiliated with any port.
+ *
+ * The caller must ensure that the given ndev is not unregistered or
+ * unregistering, and that either the ib_device is unregistered or
+ * ib_device_set_netdev() is called with NULL when the ndev sends a
+ * NETDEV_UNREGISTER event.
+ */
+int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
+			 unsigned int port)
+{
+	struct net_device *old_ndev;
+	struct ib_port_data *pdata;
+	unsigned long flags;
+	int ret;
+
+	/*
+	 * Drivers may wish to call this before ib_register_device(), so we
+	 * have to set up the port data early.
+	 */
+	ret = alloc_port_data(ib_dev);
+	if (ret)
+		return ret;
+
+	if (!rdma_is_port_valid(ib_dev, port))
+		return -EINVAL;
+
+	pdata = &ib_dev->port_data[port];
+	spin_lock_irqsave(&pdata->netdev_lock, flags);
+	old_ndev = rcu_dereference_protected(
+		pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
+	if (old_ndev == ndev) {
+		spin_unlock_irqrestore(&pdata->netdev_lock, flags);
+		return 0;
+	}
+
+	if (ndev)
+		dev_hold(ndev);
+	rcu_assign_pointer(pdata->netdev, ndev);
+	spin_unlock_irqrestore(&pdata->netdev_lock, flags);
+
+	add_ndev_hash(pdata);
+	if (old_ndev)
+		dev_put(old_ndev);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_device_set_netdev);
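A RoCE-style provider would typically pair the calls around the lifetime of
the backing Ethernet device; a sketch with a hypothetical ibdev/ndev pair for
IB port 1:

	/* when the netdev backing port 1 is brought up / registered */
	ret = ib_device_set_netdev(ibdev, ndev, 1);

	/* on NETDEV_UNREGISTER for that netdev, drop the association */
	ib_device_set_netdev(ibdev, NULL, 1);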
+
+static void free_netdevs(struct ib_device *ib_dev)
+{
+	unsigned long flags;
+	unsigned int port;
+
+	if (!ib_dev->port_data)
+		return;
+
+	rdma_for_each_port (ib_dev, port) {
+		struct ib_port_data *pdata = &ib_dev->port_data[port];
+		struct net_device *ndev;
+
+		spin_lock_irqsave(&pdata->netdev_lock, flags);
+		ndev = rcu_dereference_protected(
+			pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
+		if (ndev) {
+			spin_lock(&ndev_hash_lock);
+			hash_del_rcu(&pdata->ndev_hash_link);
+			spin_unlock(&ndev_hash_lock);
+
+			/*
+			 * If this is the last dev_put there is still a
+			 * synchronize_rcu before the netdev is kfreed, so we
+			 * can continue to rely on unlocked pointer
+			 * comparisons after the put
+			 */
+			rcu_assign_pointer(pdata->netdev, NULL);
+			dev_put(ndev);
+		}
+		spin_unlock_irqrestore(&pdata->netdev_lock, flags);
+	}
+}
+
+struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
+					unsigned int port)
+{
+	struct ib_port_data *pdata;
+	struct net_device *res;
+
+	if (!rdma_is_port_valid(ib_dev, port))
+		return NULL;
+
+	pdata = &ib_dev->port_data[port];
+
+	/*
+	 * New drivers should use ib_device_set_netdev() not the legacy
+	 * get_netdev().
+	 */
+	if (ib_dev->ops.get_netdev)
+		res = ib_dev->ops.get_netdev(ib_dev, port);
+	else {
+		spin_lock(&pdata->netdev_lock);
+		res = rcu_dereference_protected(
+			pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
+		if (res)
+			dev_hold(res);
+		spin_unlock(&pdata->netdev_lock);
+	}
+
+	/*
+	 * If we are starting to unregister, expedite things by preventing
+	 * propagation of an unregistering netdev.
+	 */
+	if (res && res->reg_state != NETREG_REGISTERED) {
+		dev_put(res);
+		return NULL;
+	}
+
+	return res;
+}
+
+/**
+ * ib_device_get_by_netdev - Find an IB device associated with a netdev
+ * @ndev: netdev to locate
+ * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
+ *
+ * Find and hold an ib_device that is associated with a netdev via
+ * ib_device_set_netdev(). The caller must call ib_device_put() on the
+ * returned pointer.
+ */
+struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
+					  enum rdma_driver_id driver_id)
+{
+	struct ib_device *res = NULL;
+	struct ib_port_data *cur;
+
+	rcu_read_lock();
+	hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
+				    (uintptr_t)ndev) {
+		if (rcu_access_pointer(cur->netdev) == ndev &&
+		    (driver_id == RDMA_DRIVER_UNKNOWN ||
+		     cur->ib_dev->ops.driver_id == driver_id) &&
+		    ib_device_try_get(cur->ib_dev)) {
+			res = cur->ib_dev;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return res;
+}
+EXPORT_SYMBOL(ib_device_get_by_netdev);
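The lookup side must release the reference it receives, for example:

	struct ib_device *ibdev;

	ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
	if (ibdev) {
		/* ... use ibdev ... */
		ib_device_put(ibdev);
	}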
+
 /**
  * ib_enum_roce_netdev - enumerate all RoCE ports
  * @ib_dev : IB device we want to query
@@ -877,21 +2265,12 @@
 			 roce_netdev_callback cb,
 			 void *cookie)
 {
-	u8 port;
+	unsigned int port;
 
-	for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev);
-	     port++)
+	rdma_for_each_port (ib_dev, port)
 		if (rdma_protocol_roce(ib_dev, port)) {
-			struct net_device *idev = NULL;
-
-			if (ib_dev->get_netdev)
-				idev = ib_dev->get_netdev(ib_dev, port);
-
-			if (idev &&
-			    idev->reg_state >= NETREG_UNREGISTERED) {
-				dev_put(idev);
-				idev = NULL;
-			}
+			struct net_device *idev =
+				ib_device_get_netdev(ib_dev, port);
 
 			if (filter(ib_dev, port, idev, filter_cookie))
 				cb(ib_dev, port, idev, cookie);
@@ -918,11 +2297,12 @@
 			      void *cookie)
 {
 	struct ib_device *dev;
+	unsigned long index;
 
-	down_read(&lists_rwsem);
-	list_for_each_entry(dev, &device_list, core_list)
+	down_read(&devices_rwsem);
+	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
 		ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
-	up_read(&lists_rwsem);
+	up_read(&devices_rwsem);
 }
 
 /**
@@ -934,19 +2314,22 @@
 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
 		     struct netlink_callback *cb)
 {
+	unsigned long index;
 	struct ib_device *dev;
 	unsigned int idx = 0;
 	int ret = 0;
 
-	down_read(&lists_rwsem);
-	list_for_each_entry(dev, &device_list, core_list) {
+	down_read(&devices_rwsem);
+	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+		if (!rdma_dev_access_netns(dev, sock_net(skb->sk)))
+			continue;
+
 		ret = nldev_cb(dev, skb, cb, idx);
 		if (ret)
 			break;
 		idx++;
 	}
-
-	up_read(&lists_rwsem);
+	up_read(&devices_rwsem);
 	return ret;
 }
 
@@ -962,7 +2345,10 @@
 int ib_query_pkey(struct ib_device *device,
 		  u8 port_num, u16 index, u16 *pkey)
 {
-	return device->query_pkey(device, port_num, index, pkey);
+	if (!rdma_is_port_valid(device, port_num))
+		return -EINVAL;
+
+	return device->ops.query_pkey(device, port_num, index, pkey);
 }
 EXPORT_SYMBOL(ib_query_pkey);
 
@@ -979,11 +2365,11 @@
 		     int device_modify_mask,
 		     struct ib_device_modify *device_modify)
 {
-	if (!device->modify_device)
+	if (!device->ops.modify_device)
 		return -ENOSYS;
 
-	return device->modify_device(device, device_modify_mask,
-				     device_modify);
+	return device->ops.modify_device(device, device_modify_mask,
+					 device_modify);
 }
 EXPORT_SYMBOL(ib_modify_device);
 
@@ -1007,9 +2393,10 @@
 	if (!rdma_is_port_valid(device, port_num))
 		return -EINVAL;
 
-	if (device->modify_port)
-		rc = device->modify_port(device, port_num, port_modify_mask,
-					   port_modify);
+	if (device->ops.modify_port)
+		rc = device->ops.modify_port(device, port_num,
+					     port_modify_mask,
+					     port_modify);
 	else
 		rc = rdma_protocol_roce(device, port_num) ? 0 : -ENOSYS;
 	return rc;
@@ -1029,13 +2416,15 @@
 		u8 *port_num, u16 *index)
 {
 	union ib_gid tmp_gid;
-	int ret, port, i;
+	unsigned int port;
+	int ret, i;
 
-	for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
+	rdma_for_each_port (device, port) {
 		if (!rdma_protocol_ib(device, port))
 			continue;
 
-		for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
+		for (i = 0; i < device->port_data[port].immutable.gid_tbl_len;
+		     ++i) {
 			ret = rdma_query_gid(device, port, i, &tmp_gid);
 			if (ret)
 				return ret;
@@ -1067,7 +2456,8 @@
 	u16 tmp_pkey;
 	int partial_ix = -1;
 
-	for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) {
+	for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len;
+	     ++i) {
 		ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
 		if (ret)
 			return ret;
@@ -1100,6 +2490,7 @@
  * @gid:	A GID that the net_dev uses to communicate.
  * @addr:	Contains the IP address that the request specified as its
  *		destination.
+ *
  */
 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
 					    u8 port,
@@ -1108,34 +2499,178 @@
 					    const struct sockaddr *addr)
 {
 	struct net_device *net_dev = NULL;
-	struct ib_client_data *context;
+	unsigned long index;
+	void *client_data;
 
 	if (!rdma_protocol_ib(dev, port))
 		return NULL;
 
-	down_read(&lists_rwsem);
+	/*
+	 * Holding the read side guarantees that the client will not become
+	 * unregistered while we are calling get_net_dev_by_params()
+	 */
+	down_read(&dev->client_data_rwsem);
+	xan_for_each_marked (&dev->client_data, index, client_data,
+			     CLIENT_DATA_REGISTERED) {
+		struct ib_client *client = xa_load(&clients, index);
 
-	list_for_each_entry(context, &dev->client_data_list, list) {
-		struct ib_client *client = context->client;
-
-		if (context->going_down)
+		if (!client || !client->get_net_dev_by_params)
 			continue;
 
-		if (client->get_net_dev_by_params) {
-			net_dev = client->get_net_dev_by_params(dev, port, pkey,
-								gid, addr,
-								context->data);
-			if (net_dev)
-				break;
-		}
+		net_dev = client->get_net_dev_by_params(dev, port, pkey, gid,
+							addr, client_data);
+		if (net_dev)
+			break;
 	}
-
-	up_read(&lists_rwsem);
+	up_read(&dev->client_data_rwsem);
 
 	return net_dev;
 }
 EXPORT_SYMBOL(ib_get_net_dev_by_params);
 
+void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
+{
+	struct ib_device_ops *dev_ops = &dev->ops;
+#define SET_DEVICE_OP(ptr, name)                                               \
+	do {                                                                   \
+		if (ops->name)                                                 \
+			if (!((ptr)->name))				       \
+				(ptr)->name = ops->name;                       \
+	} while (0)
+
+#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)
+
+	if (ops->driver_id != RDMA_DRIVER_UNKNOWN) {
+		WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN &&
+			dev_ops->driver_id != ops->driver_id);
+		dev_ops->driver_id = ops->driver_id;
+	}
+	if (ops->owner) {
+		WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner);
+		dev_ops->owner = ops->owner;
+	}
+	if (ops->uverbs_abi_ver)
+		dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver;
+
+	dev_ops->uverbs_no_driver_id_binding |=
+		ops->uverbs_no_driver_id_binding;
+
+	SET_DEVICE_OP(dev_ops, add_gid);
+	SET_DEVICE_OP(dev_ops, advise_mr);
+	SET_DEVICE_OP(dev_ops, alloc_dm);
+	SET_DEVICE_OP(dev_ops, alloc_fmr);
+	SET_DEVICE_OP(dev_ops, alloc_hw_stats);
+	SET_DEVICE_OP(dev_ops, alloc_mr);
+	SET_DEVICE_OP(dev_ops, alloc_mr_integrity);
+	SET_DEVICE_OP(dev_ops, alloc_mw);
+	SET_DEVICE_OP(dev_ops, alloc_pd);
+	SET_DEVICE_OP(dev_ops, alloc_rdma_netdev);
+	SET_DEVICE_OP(dev_ops, alloc_ucontext);
+	SET_DEVICE_OP(dev_ops, alloc_xrcd);
+	SET_DEVICE_OP(dev_ops, attach_mcast);
+	SET_DEVICE_OP(dev_ops, check_mr_status);
+	SET_DEVICE_OP(dev_ops, counter_alloc_stats);
+	SET_DEVICE_OP(dev_ops, counter_bind_qp);
+	SET_DEVICE_OP(dev_ops, counter_dealloc);
+	SET_DEVICE_OP(dev_ops, counter_unbind_qp);
+	SET_DEVICE_OP(dev_ops, counter_update_stats);
+	SET_DEVICE_OP(dev_ops, create_ah);
+	SET_DEVICE_OP(dev_ops, create_counters);
+	SET_DEVICE_OP(dev_ops, create_cq);
+	SET_DEVICE_OP(dev_ops, create_flow);
+	SET_DEVICE_OP(dev_ops, create_flow_action_esp);
+	SET_DEVICE_OP(dev_ops, create_qp);
+	SET_DEVICE_OP(dev_ops, create_rwq_ind_table);
+	SET_DEVICE_OP(dev_ops, create_srq);
+	SET_DEVICE_OP(dev_ops, create_wq);
+	SET_DEVICE_OP(dev_ops, dealloc_dm);
+	SET_DEVICE_OP(dev_ops, dealloc_driver);
+	SET_DEVICE_OP(dev_ops, dealloc_fmr);
+	SET_DEVICE_OP(dev_ops, dealloc_mw);
+	SET_DEVICE_OP(dev_ops, dealloc_pd);
+	SET_DEVICE_OP(dev_ops, dealloc_ucontext);
+	SET_DEVICE_OP(dev_ops, dealloc_xrcd);
+	SET_DEVICE_OP(dev_ops, del_gid);
+	SET_DEVICE_OP(dev_ops, dereg_mr);
+	SET_DEVICE_OP(dev_ops, destroy_ah);
+	SET_DEVICE_OP(dev_ops, destroy_counters);
+	SET_DEVICE_OP(dev_ops, destroy_cq);
+	SET_DEVICE_OP(dev_ops, destroy_flow);
+	SET_DEVICE_OP(dev_ops, destroy_flow_action);
+	SET_DEVICE_OP(dev_ops, destroy_qp);
+	SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table);
+	SET_DEVICE_OP(dev_ops, destroy_srq);
+	SET_DEVICE_OP(dev_ops, destroy_wq);
+	SET_DEVICE_OP(dev_ops, detach_mcast);
+	SET_DEVICE_OP(dev_ops, disassociate_ucontext);
+	SET_DEVICE_OP(dev_ops, drain_rq);
+	SET_DEVICE_OP(dev_ops, drain_sq);
+	SET_DEVICE_OP(dev_ops, enable_driver);
+	SET_DEVICE_OP(dev_ops, fill_res_entry);
+	SET_DEVICE_OP(dev_ops, get_dev_fw_str);
+	SET_DEVICE_OP(dev_ops, get_dma_mr);
+	SET_DEVICE_OP(dev_ops, get_hw_stats);
+	SET_DEVICE_OP(dev_ops, get_link_layer);
+	SET_DEVICE_OP(dev_ops, get_netdev);
+	SET_DEVICE_OP(dev_ops, get_port_immutable);
+	SET_DEVICE_OP(dev_ops, get_vector_affinity);
+	SET_DEVICE_OP(dev_ops, get_vf_config);
+	SET_DEVICE_OP(dev_ops, get_vf_stats);
+	SET_DEVICE_OP(dev_ops, init_port);
+	SET_DEVICE_OP(dev_ops, invalidate_range);
+	SET_DEVICE_OP(dev_ops, iw_accept);
+	SET_DEVICE_OP(dev_ops, iw_add_ref);
+	SET_DEVICE_OP(dev_ops, iw_connect);
+	SET_DEVICE_OP(dev_ops, iw_create_listen);
+	SET_DEVICE_OP(dev_ops, iw_destroy_listen);
+	SET_DEVICE_OP(dev_ops, iw_get_qp);
+	SET_DEVICE_OP(dev_ops, iw_reject);
+	SET_DEVICE_OP(dev_ops, iw_rem_ref);
+	SET_DEVICE_OP(dev_ops, map_mr_sg);
+	SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
+	SET_DEVICE_OP(dev_ops, map_phys_fmr);
+	SET_DEVICE_OP(dev_ops, mmap);
+	SET_DEVICE_OP(dev_ops, modify_ah);
+	SET_DEVICE_OP(dev_ops, modify_cq);
+	SET_DEVICE_OP(dev_ops, modify_device);
+	SET_DEVICE_OP(dev_ops, modify_flow_action_esp);
+	SET_DEVICE_OP(dev_ops, modify_port);
+	SET_DEVICE_OP(dev_ops, modify_qp);
+	SET_DEVICE_OP(dev_ops, modify_srq);
+	SET_DEVICE_OP(dev_ops, modify_wq);
+	SET_DEVICE_OP(dev_ops, peek_cq);
+	SET_DEVICE_OP(dev_ops, poll_cq);
+	SET_DEVICE_OP(dev_ops, post_recv);
+	SET_DEVICE_OP(dev_ops, post_send);
+	SET_DEVICE_OP(dev_ops, post_srq_recv);
+	SET_DEVICE_OP(dev_ops, process_mad);
+	SET_DEVICE_OP(dev_ops, query_ah);
+	SET_DEVICE_OP(dev_ops, query_device);
+	SET_DEVICE_OP(dev_ops, query_gid);
+	SET_DEVICE_OP(dev_ops, query_pkey);
+	SET_DEVICE_OP(dev_ops, query_port);
+	SET_DEVICE_OP(dev_ops, query_qp);
+	SET_DEVICE_OP(dev_ops, query_srq);
+	SET_DEVICE_OP(dev_ops, rdma_netdev_get_params);
+	SET_DEVICE_OP(dev_ops, read_counters);
+	SET_DEVICE_OP(dev_ops, reg_dm_mr);
+	SET_DEVICE_OP(dev_ops, reg_user_mr);
+	SET_DEVICE_OP(dev_ops, req_ncomp_notif);
+	SET_DEVICE_OP(dev_ops, req_notify_cq);
+	SET_DEVICE_OP(dev_ops, rereg_user_mr);
+	SET_DEVICE_OP(dev_ops, resize_cq);
+	SET_DEVICE_OP(dev_ops, set_vf_guid);
+	SET_DEVICE_OP(dev_ops, set_vf_link_state);
+	SET_DEVICE_OP(dev_ops, unmap_fmr);
+
+	SET_OBJ_SIZE(dev_ops, ib_ah);
+	SET_OBJ_SIZE(dev_ops, ib_cq);
+	SET_OBJ_SIZE(dev_ops, ib_pd);
+	SET_OBJ_SIZE(dev_ops, ib_srq);
+	SET_OBJ_SIZE(dev_ops, ib_ucontext);
+}
+EXPORT_SYMBOL(ib_set_device_ops);
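A provider driver fills in only the ops it implements and hands the structure
over once; a sketch with hypothetical callbacks (a real driver also sets
.driver_id to its entry in enum rdma_driver_id):

	static const struct ib_device_ops my_dev_ops = {
		.owner		= THIS_MODULE,
		.uverbs_abi_ver	= 1,
		.alloc_pd	= my_alloc_pd,
		.dealloc_pd	= my_dealloc_pd,
		.query_port	= my_query_port,
		.query_pkey	= my_query_pkey,
	};

	/* during device setup, before ib_register_device() */
	ib_set_device_ops(ibdev, &my_dev_ops);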
+
 static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
 	[RDMA_NL_LS_OP_RESOLVE] = {
 		.doit = ib_nl_handle_resolve_resp,
@@ -1166,18 +2701,23 @@
 		goto err;
 	}
 
-	ret = class_register(&ib_class);
-	if (ret) {
-		pr_warn("Couldn't create InfiniBand device class\n");
+	ib_comp_unbound_wq =
+		alloc_workqueue("ib-comp-unb-wq",
+				WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM |
+				WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE);
+	if (!ib_comp_unbound_wq) {
+		ret = -ENOMEM;
 		goto err_comp;
 	}
 
-	ret = rdma_nl_init();
+	ret = class_register(&ib_class);
 	if (ret) {
-		pr_warn("Couldn't init IB netlink interface: err %d\n", ret);
-		goto err_sysfs;
+		pr_warn("Couldn't create InfiniBand device class\n");
+		goto err_comp_unbound;
 	}
 
+	rdma_nl_init();
+
 	ret = addr_init();
 	if (ret) {
 		pr_warn("Could't init IB address resolution\n");
@@ -1196,18 +2736,26 @@
 		goto err_mad;
 	}
 
-	ret = register_lsm_notifier(&ibdev_lsm_nb);
+	ret = register_blocking_lsm_notifier(&ibdev_lsm_nb);
 	if (ret) {
 		pr_warn("Couldn't register LSM notifier. ret %d\n", ret);
 		goto err_sa;
 	}
 
+	ret = register_pernet_device(&rdma_dev_net_ops);
+	if (ret) {
+		pr_warn("Couldn't init compat dev. ret %d\n", ret);
+		goto err_compat;
+	}
+
 	nldev_init();
 	rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
 	roce_gid_mgmt_init();
 
 	return 0;
 
+err_compat:
+	unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
 err_sa:
 	ib_sa_cleanup();
 err_mad:
@@ -1215,9 +2763,9 @@
 err_addr:
 	addr_cleanup();
 err_ibnl:
-	rdma_nl_exit();
-err_sysfs:
 	class_unregister(&ib_class);
+err_comp_unbound:
+	destroy_workqueue(ib_comp_unbound_wq);
 err_comp:
 	destroy_workqueue(ib_comp_wq);
 err:
@@ -1230,18 +2778,26 @@
 	roce_gid_mgmt_cleanup();
 	nldev_exit();
 	rdma_nl_unregister(RDMA_NL_LS);
-	unregister_lsm_notifier(&ibdev_lsm_nb);
+	unregister_pernet_device(&rdma_dev_net_ops);
+	unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
 	ib_sa_cleanup();
 	ib_mad_cleanup();
 	addr_cleanup();
 	rdma_nl_exit();
 	class_unregister(&ib_class);
+	destroy_workqueue(ib_comp_unbound_wq);
 	destroy_workqueue(ib_comp_wq);
 	/* Make sure that any pending umem accounting work is done. */
 	destroy_workqueue(ib_wq);
+	flush_workqueue(system_unbound_wq);
+	WARN_ON(!xa_empty(&clients));
+	WARN_ON(!xa_empty(&devices));
 }
 
 MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);
 
-subsys_initcall(ib_core_init);
+/* ib core relies on netdev stack to first register net_ns_type_operations
+ * ns kobject type before ib_core initialization.
+ */
+fs_initcall(ib_core_init);
 module_exit(ib_core_cleanup);
diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c
index a077500..e08aec4 100644
--- a/drivers/infiniband/core/fmr_pool.c
+++ b/drivers/infiniband/core/fmr_pool.c
@@ -148,13 +148,6 @@
 		hlist_del_init(&fmr->cache_node);
 		fmr->remap_count = 0;
 		list_add_tail(&fmr->fmr->list, &fmr_list);
-
-#ifdef DEBUG
-		if (fmr->ref_count !=0) {
-			pr_warn(PFX "Unmapping FMR 0x%08x with ref count %d\n",
-				fmr, fmr->ref_count);
-		}
-#endif
 	}
 
 	list_splice_init(&pool->dirty_list, &unmap_list);
@@ -211,9 +204,9 @@
 		return ERR_PTR(-EINVAL);
 
 	device = pd->device;
-	if (!device->alloc_fmr    || !device->dealloc_fmr  ||
-	    !device->map_phys_fmr || !device->unmap_fmr) {
-		pr_info(PFX "Device %s does not support FMRs\n", device->name);
+	if (!device->ops.alloc_fmr    || !device->ops.dealloc_fmr  ||
+	    !device->ops.map_phys_fmr || !device->ops.unmap_fmr) {
+		dev_info(&device->dev, "Device does not support FMRs\n");
 		return ERR_PTR(-ENOSYS);
 	}
 
@@ -257,7 +250,8 @@
 	atomic_set(&pool->flush_ser, 0);
 	init_waitqueue_head(&pool->force_wait);
 
-	pool->worker = kthread_create_worker(0, "ib_fmr(%s)", device->name);
+	pool->worker =
+		kthread_create_worker(0, "ib_fmr(%s)", dev_name(&device->dev));
 	if (IS_ERR(pool->worker)) {
 		pr_warn(PFX "couldn't start cleanup kthread worker\n");
 		ret = PTR_ERR(pool->worker);
@@ -473,7 +467,7 @@
  * Unmap an FMR.  The FMR mapping may remain valid until the FMR is
  * reused (or until ib_flush_fmr_pool() is called).
  */
-int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
+void ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
 {
 	struct ib_fmr_pool *pool;
 	unsigned long flags;
@@ -495,14 +489,6 @@
 		}
 	}
 
-#ifdef DEBUG
-	if (fmr->ref_count < 0)
-		pr_warn(PFX "FMR %p has ref count %d < 0\n",
-			fmr, fmr->ref_count);
-#endif
-
 	spin_unlock_irqrestore(&pool->pool_lock, flags);
-
-	return 0;
 }
 EXPORT_SYMBOL(ib_fmr_pool_unmap);
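With the void return, a typical map/unmap cycle (a sketch; pool creation via
ib_create_fmr_pool() is omitted) has no error to check on the unmap side:

	struct ib_pool_fmr *fmr;

	fmr = ib_fmr_pool_map_phys(pool, page_list, npages, io_virtual_address);
	if (IS_ERR(fmr))
		return PTR_ERR(fmr);
	/* ... post work requests using fmr->fmr->lkey / ->rkey ... */
	ib_fmr_pool_unmap(fmr);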
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index 5d676cf..ade7182 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -87,7 +87,8 @@
 	[RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb},
 	[RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb},
 	[RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb},
-	[RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}
+	[RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb},
+	[RDMA_NL_IWPM_HELLO] = {.dump = iwpm_hello_cb}
 };
 
 static struct workqueue_struct *iwcm_wq;
@@ -371,6 +372,7 @@
 static void destroy_cm_id(struct iw_cm_id *cm_id)
 {
 	struct iwcm_id_private *cm_id_priv;
+	struct ib_qp *qp;
 	unsigned long flags;
 
 	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
@@ -388,19 +390,22 @@
 	set_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags);
 
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	qp = cm_id_priv->qp;
+	cm_id_priv->qp = NULL;
+
 	switch (cm_id_priv->state) {
 	case IW_CM_STATE_LISTEN:
 		cm_id_priv->state = IW_CM_STATE_DESTROYING;
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		/* destroy the listening endpoint */
-		cm_id->device->iwcm->destroy_listen(cm_id);
+		cm_id->device->ops.iw_destroy_listen(cm_id);
 		spin_lock_irqsave(&cm_id_priv->lock, flags);
 		break;
 	case IW_CM_STATE_ESTABLISHED:
 		cm_id_priv->state = IW_CM_STATE_DESTROYING;
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		/* Abrupt close of the connection */
-		(void)iwcm_modify_qp_err(cm_id_priv->qp);
+		(void)iwcm_modify_qp_err(qp);
 		spin_lock_irqsave(&cm_id_priv->lock, flags);
 		break;
 	case IW_CM_STATE_IDLE:
@@ -416,7 +421,7 @@
 		 */
 		cm_id_priv->state = IW_CM_STATE_DESTROYING;
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
-		cm_id->device->iwcm->reject(cm_id, NULL, 0);
+		cm_id->device->ops.iw_reject(cm_id, NULL, 0);
 		spin_lock_irqsave(&cm_id_priv->lock, flags);
 		break;
 	case IW_CM_STATE_CONN_SENT:
@@ -425,11 +430,9 @@
 		BUG();
 		break;
 	}
-	if (cm_id_priv->qp) {
-		cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
-		cm_id_priv->qp = NULL;
-	}
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	if (qp)
+		cm_id_priv->id.device->ops.iw_rem_ref(qp);
 
 	if (cm_id->mapped) {
 		iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr);
@@ -502,17 +505,21 @@
  */
 static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
 {
-	struct iwpm_dev_data pm_reg_msg;
+	const char *devname = dev_name(&cm_id->device->dev);
+	const char *ifname = cm_id->device->iw_ifname;
+	struct iwpm_dev_data pm_reg_msg = {};
 	struct iwpm_sa_data pm_msg;
 	int status;
 
+	if (strlen(devname) >= sizeof(pm_reg_msg.dev_name) ||
+	    strlen(ifname) >= sizeof(pm_reg_msg.if_name))
+		return -EINVAL;
+
 	cm_id->m_local_addr = cm_id->local_addr;
 	cm_id->m_remote_addr = cm_id->remote_addr;
 
-	memcpy(pm_reg_msg.dev_name, cm_id->device->name,
-	       sizeof(pm_reg_msg.dev_name));
-	memcpy(pm_reg_msg.if_name, cm_id->device->iwcm->ifname,
-	       sizeof(pm_reg_msg.if_name));
+	strcpy(pm_reg_msg.dev_name, devname);
+	strcpy(pm_reg_msg.if_name, ifname);
 
 	if (iwpm_register_pid(&pm_reg_msg, RDMA_NL_IWCM) ||
 	    !iwpm_valid_pid())
@@ -521,6 +528,8 @@
 	cm_id->mapped = true;
 	pm_msg.loc_addr = cm_id->local_addr;
 	pm_msg.rem_addr = cm_id->remote_addr;
+	pm_msg.flags = (cm_id->device->iw_driver_flags & IW_F_NO_PORT_MAP) ?
+		       IWPM_FLAGS_NO_PORT_MAP : 0;
 	if (active)
 		status = iwpm_add_and_query_mapping(&pm_msg,
 						    RDMA_NL_IWCM);
@@ -539,7 +548,7 @@
 
 	return iwpm_create_mapinfo(&cm_id->local_addr,
 				   &cm_id->m_local_addr,
-				   RDMA_NL_IWCM);
+				   RDMA_NL_IWCM, pm_msg.flags);
 }
 
 /*
@@ -570,7 +579,8 @@
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		ret = iw_cm_map(cm_id, false);
 		if (!ret)
-			ret = cm_id->device->iwcm->create_listen(cm_id, backlog);
+			ret = cm_id->device->ops.iw_create_listen(cm_id,
+								  backlog);
 		if (ret)
 			cm_id_priv->state = IW_CM_STATE_IDLE;
 		spin_lock_irqsave(&cm_id_priv->lock, flags);
@@ -610,7 +620,7 @@
 	cm_id_priv->state = IW_CM_STATE_IDLE;
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
-	ret = cm_id->device->iwcm->reject(cm_id, private_data,
+	ret = cm_id->device->ops.iw_reject(cm_id, private_data,
 					  private_data_len);
 
 	clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
@@ -646,28 +656,28 @@
 		return -EINVAL;
 	}
 	/* Get the ib_qp given the QPN */
-	qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+	qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn);
 	if (!qp) {
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
 		wake_up_all(&cm_id_priv->connect_wait);
 		return -EINVAL;
 	}
-	cm_id->device->iwcm->add_ref(qp);
+	cm_id->device->ops.iw_add_ref(qp);
 	cm_id_priv->qp = qp;
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
-	ret = cm_id->device->iwcm->accept(cm_id, iw_param);
+	ret = cm_id->device->ops.iw_accept(cm_id, iw_param);
 	if (ret) {
 		/* An error on accept precludes provider events */
 		BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
 		cm_id_priv->state = IW_CM_STATE_IDLE;
 		spin_lock_irqsave(&cm_id_priv->lock, flags);
-		if (cm_id_priv->qp) {
-			cm_id->device->iwcm->rem_ref(qp);
-			cm_id_priv->qp = NULL;
-		}
+		qp = cm_id_priv->qp;
+		cm_id_priv->qp = NULL;
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		if (qp)
+			cm_id->device->ops.iw_rem_ref(qp);
 		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
 		wake_up_all(&cm_id_priv->connect_wait);
 	}
@@ -688,7 +698,7 @@
 	struct iwcm_id_private *cm_id_priv;
 	int ret;
 	unsigned long flags;
-	struct ib_qp *qp;
+	struct ib_qp *qp = NULL;
 
 	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
 
@@ -705,30 +715,30 @@
 	}
 
 	/* Get the ib_qp given the QPN */
-	qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+	qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn);
 	if (!qp) {
 		ret = -EINVAL;
 		goto err;
 	}
-	cm_id->device->iwcm->add_ref(qp);
+	cm_id->device->ops.iw_add_ref(qp);
 	cm_id_priv->qp = qp;
 	cm_id_priv->state = IW_CM_STATE_CONN_SENT;
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
 	ret = iw_cm_map(cm_id, true);
 	if (!ret)
-		ret = cm_id->device->iwcm->connect(cm_id, iw_param);
+		ret = cm_id->device->ops.iw_connect(cm_id, iw_param);
 	if (!ret)
 		return 0;	/* success */
 
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
-	if (cm_id_priv->qp) {
-		cm_id->device->iwcm->rem_ref(qp);
-		cm_id_priv->qp = NULL;
-	}
+	qp = cm_id_priv->qp;
+	cm_id_priv->qp = NULL;
 	cm_id_priv->state = IW_CM_STATE_IDLE;
 err:
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	if (qp)
+		cm_id->device->ops.iw_rem_ref(qp);
 	clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
 	wake_up_all(&cm_id_priv->connect_wait);
 	return ret;
@@ -870,6 +880,7 @@
 static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
 			       struct iw_cm_event *iw_event)
 {
+	struct ib_qp *qp = NULL;
 	unsigned long flags;
 	int ret;
 
@@ -888,11 +899,13 @@
 		cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
 	} else {
 		/* REJECTED or RESET */
-		cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+		qp = cm_id_priv->qp;
 		cm_id_priv->qp = NULL;
 		cm_id_priv->state = IW_CM_STATE_IDLE;
 	}
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	if (qp)
+		cm_id_priv->id.device->ops.iw_rem_ref(qp);
 	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
 
 	if (iw_event->private_data_len)
@@ -934,21 +947,18 @@
 static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
 				  struct iw_cm_event *iw_event)
 {
+	struct ib_qp *qp;
 	unsigned long flags;
-	int ret = 0;
+	int ret = 0, notify_event = 0;
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	qp = cm_id_priv->qp;
+	cm_id_priv->qp = NULL;
 
-	if (cm_id_priv->qp) {
-		cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
-		cm_id_priv->qp = NULL;
-	}
 	switch (cm_id_priv->state) {
 	case IW_CM_STATE_ESTABLISHED:
 	case IW_CM_STATE_CLOSING:
 		cm_id_priv->state = IW_CM_STATE_IDLE;
-		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
-		ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
-		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		notify_event = 1;
 		break;
 	case IW_CM_STATE_DESTROYING:
 		break;
@@ -957,6 +967,10 @@
 	}
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
+	if (qp)
+		cm_id_priv->id.device->ops.iw_rem_ref(qp);
+	if (notify_event)
+		ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
 	return ret;
 }
 
diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c
index 8861c05..4668699 100644
--- a/drivers/infiniband/core/iwpm_msg.c
+++ b/drivers/infiniband/core/iwpm_msg.c
@@ -34,18 +34,25 @@
 #include "iwpm_util.h"
 
 static const char iwpm_ulib_name[IWPM_ULIBNAME_SIZE] = "iWarpPortMapperUser";
-static int iwpm_ulib_version = 3;
+u16 iwpm_ulib_version = IWPM_UABI_VERSION_MIN;
 static int iwpm_user_pid = IWPM_PID_UNDEFINED;
 static atomic_t echo_nlmsg_seq;
 
+/**
+ * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid
+ *
+ * Returns true if the pid is greater than zero, otherwise returns false
+ */
 int iwpm_valid_pid(void)
 {
 	return iwpm_user_pid > 0;
 }
 
-/*
- * iwpm_register_pid - Send a netlink query to user space
- *                     for the iwarp port mapper pid
+/**
+ * iwpm_register_pid - Send a netlink query to userspace
+ *                     to get the iwarp port mapper pid
+ * @pm_msg: Contains driver info to send to the userspace port mapper
+ * @nl_client: The index of the netlink client
  *
  * nlmsg attributes:
  *	[IWPM_NLA_REG_PID_SEQ]
@@ -105,7 +112,7 @@
 	pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n",
 		__func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name);
 
-	ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL);
+	ret = rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL);
 	if (ret) {
 		skb = NULL; /* skb is freed in the netlink send-op handling */
 		iwpm_user_pid = IWPM_PID_UNAVAILABLE;
@@ -117,19 +124,25 @@
 	return ret;
 pid_query_error:
 	pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
-	if (skb)
-		dev_kfree_skb(skb);
+	dev_kfree_skb(skb);
 	if (nlmsg_request)
 		iwpm_free_nlmsg_request(&nlmsg_request->kref);
 	return ret;
 }
 
-/*
- * iwpm_add_mapping - Send a netlink add mapping message
- *                    to the port mapper
+/**
+ * iwpm_add_mapping - Send a netlink add mapping request to
+ *                    the userspace port mapper
+ * @pm_msg: Contains the local ip/tcp address info to send
+ * @nl_client: The index of the netlink client
+ *
  * nlmsg attributes:
  *	[IWPM_NLA_MANAGE_MAPPING_SEQ]
  *	[IWPM_NLA_MANAGE_ADDR]
+ *	[IWPM_NLA_MANAGE_FLAGS]
+ *
+ * If the request is successful, the pm_msg stores
+ * the port mapper response (mapped address info)
  */
 int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 {
@@ -173,10 +186,22 @@
 	if (ret)
 		goto add_mapping_error;
 
+	/* If flags are required and we're not V4, then return a quiet error */
+	if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) {
+		ret = -EINVAL;
+		goto add_mapping_error_nowarn;
+	}
+	if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) {
+		ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags,
+				IWPM_NLA_MANAGE_FLAGS);
+		if (ret)
+			goto add_mapping_error;
+	}
+
 	nlmsg_end(skb, nlh);
 	nlmsg_request->req_buffer = pm_msg;
 
-	ret = rdma_nl_unicast_wait(skb, iwpm_user_pid);
+	ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid);
 	if (ret) {
 		skb = NULL; /* skb is freed in the netlink send-op handling */
 		iwpm_user_pid = IWPM_PID_UNDEFINED;
@@ -187,20 +212,24 @@
 	return ret;
 add_mapping_error:
 	pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
-	if (skb)
-		dev_kfree_skb(skb);
+add_mapping_error_nowarn:
+	dev_kfree_skb(skb);
 	if (nlmsg_request)
 		iwpm_free_nlmsg_request(&nlmsg_request->kref);
 	return ret;
 }
 
-/*
- * iwpm_add_and_query_mapping - Send a netlink add and query
- *                              mapping message to the port mapper
+/**
+ * iwpm_add_and_query_mapping - Send a netlink add and query mapping request
+ *                              to the userspace port mapper
+ * @pm_msg: Contains the local ip/tcp address info to send
+ * @nl_client: The index of the netlink client
+ *
  * nlmsg attributes:
  *	[IWPM_NLA_QUERY_MAPPING_SEQ]
  *	[IWPM_NLA_QUERY_LOCAL_ADDR]
  *	[IWPM_NLA_QUERY_REMOTE_ADDR]
+ *	[IWPM_NLA_QUERY_FLAGS]
  */
 int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 {
@@ -251,10 +280,22 @@
 	if (ret)
 		goto query_mapping_error;
 
+	/* If flags are required and we're not V4, then return a quiet error */
+	if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) {
+		ret = -EINVAL;
+		goto query_mapping_error_nowarn;
+	}
+	if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) {
+		ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags,
+				IWPM_NLA_QUERY_FLAGS);
+		if (ret)
+			goto query_mapping_error;
+	}
+
 	nlmsg_end(skb, nlh);
 	nlmsg_request->req_buffer = pm_msg;
 
-	ret = rdma_nl_unicast_wait(skb, iwpm_user_pid);
+	ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid);
 	if (ret) {
 		skb = NULL; /* skb is freed in the netlink send-op handling */
 		err_str = "Unable to send a nlmsg";
@@ -264,16 +305,20 @@
 	return ret;
 query_mapping_error:
 	pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
-	if (skb)
-		dev_kfree_skb(skb);
+query_mapping_error_nowarn:
+	dev_kfree_skb(skb);
 	if (nlmsg_request)
 		iwpm_free_nlmsg_request(&nlmsg_request->kref);
 	return ret;
 }
 
-/*
- * iwpm_remove_mapping - Send a netlink remove mapping message
- *                       to the port mapper
+/**
+ * iwpm_remove_mapping - Send a netlink remove mapping request
+ *                       to the userspace port mapper
+ *
+ * @local_addr: Local ip/tcp address to remove
+ * @nl_client: The index of the netlink client
+ *
  * nlmsg attributes:
  *	[IWPM_NLA_MANAGE_MAPPING_SEQ]
  *	[IWPM_NLA_MANAGE_ADDR]
@@ -316,7 +361,7 @@
 
 	nlmsg_end(skb, nlh);
 
-	ret = rdma_nl_unicast_wait(skb, iwpm_user_pid);
+	ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid);
 	if (ret) {
 		skb = NULL; /* skb is freed in the netlink send-op handling */
 		iwpm_user_pid = IWPM_PID_UNDEFINED;
@@ -344,9 +389,14 @@
 	[IWPM_NLA_RREG_PID_ERR]     = { .type = NLA_U16 }
 };
 
-/*
- * iwpm_register_pid_cb - Process a port mapper response to
- *                        iwpm_register_pid()
+/**
+ * iwpm_register_pid_cb - Process the port mapper response to
+ *                        iwpm_register_pid query
+ * @skb: The socket buffer
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * If successful, the function receives the userspace port mapper pid
+ * which is used in future communication with the port mapper
  */
 int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
 {
@@ -379,7 +429,7 @@
 	/* check device name, ulib name and version */
 	if (strcmp(pm_msg->dev_name, dev_name) ||
 			strcmp(iwpm_ulib_name, iwpm_name) ||
-			iwpm_version != iwpm_ulib_version) {
+			iwpm_version < IWPM_UABI_VERSION_MIN) {
 
 		pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n",
 				__func__, dev_name, iwpm_name, iwpm_version);
@@ -387,6 +437,10 @@
 		goto register_pid_response_exit;
 	}
 	iwpm_user_pid = cb->nlh->nlmsg_pid;
+	iwpm_ulib_version = iwpm_version;
+	if (iwpm_ulib_version < IWPM_UABI_VERSION)
+		pr_warn_once("%s: Down level iwpmd/pid %u.  Continuing...",
+			__func__, iwpm_user_pid);
 	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
 	pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
 			__func__, iwpm_user_pid);
@@ -403,15 +457,19 @@
 
 /* netlink attribute policy for the received response to add mapping request */
 static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = {
-	[IWPM_NLA_MANAGE_MAPPING_SEQ]     = { .type = NLA_U32 },
-	[IWPM_NLA_MANAGE_ADDR]            = { .len = sizeof(struct sockaddr_storage) },
-	[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) },
-	[IWPM_NLA_RMANAGE_MAPPING_ERR]	  = { .type = NLA_U16 }
+	[IWPM_NLA_RMANAGE_MAPPING_SEQ]     = { .type = NLA_U32 },
+	[IWPM_NLA_RMANAGE_ADDR]            = {
+				.len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR] = {
+				.len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RMANAGE_MAPPING_ERR]	   = { .type = NLA_U16 }
 };
 
-/*
- * iwpm_add_mapping_cb - Process a port mapper response to
- *                       iwpm_add_mapping()
+/**
+ * iwpm_add_mapping_cb - Process the port mapper response to
+ *                       iwpm_add_mapping request
+ * @skb: The socket buffer
+ * @cb: Contains the received message (payload and netlink header)
  */
 int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb)
 {
@@ -430,7 +488,7 @@
 
 	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
 
-	msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]);
+	msg_seq = nla_get_u32(nltb[IWPM_NLA_RMANAGE_MAPPING_SEQ]);
 	nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
 	if (!nlmsg_request) {
 		pr_info("%s: Could not find a matching request (seq = %u)\n",
@@ -439,9 +497,9 @@
 	}
 	pm_msg = nlmsg_request->req_buffer;
 	local_sockaddr = (struct sockaddr_storage *)
-			nla_data(nltb[IWPM_NLA_MANAGE_ADDR]);
+			nla_data(nltb[IWPM_NLA_RMANAGE_ADDR]);
 	mapped_sockaddr = (struct sockaddr_storage *)
-			nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]);
+			nla_data(nltb[IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR]);
 
 	if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) {
 		nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
@@ -472,17 +530,23 @@
 /* netlink attribute policy for the response to add and query mapping request
  * and response with remote address info */
 static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = {
-	[IWPM_NLA_QUERY_MAPPING_SEQ]      = { .type = NLA_U32 },
-	[IWPM_NLA_QUERY_LOCAL_ADDR]       = { .len = sizeof(struct sockaddr_storage) },
-	[IWPM_NLA_QUERY_REMOTE_ADDR]      = { .len = sizeof(struct sockaddr_storage) },
-	[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) },
-	[IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RQUERY_MAPPING_SEQ]     = { .type = NLA_U32 },
+	[IWPM_NLA_RQUERY_LOCAL_ADDR]      = {
+				.len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RQUERY_REMOTE_ADDR]     = {
+				.len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = {
+				.len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = {
+				.len = sizeof(struct sockaddr_storage) },
 	[IWPM_NLA_RQUERY_MAPPING_ERR]	  = { .type = NLA_U16 }
 };
 
-/*
- * iwpm_add_and_query_mapping_cb - Process a port mapper response to
- *                                 iwpm_add_and_query_mapping()
+/**
+ * iwpm_add_and_query_mapping_cb - Process the port mapper response to
+ *                                 iwpm_add_and_query_mapping request
+ * @skb: The socket buffer
+ * @cb: Contains the received message (payload and netlink header)
  */
 int iwpm_add_and_query_mapping_cb(struct sk_buff *skb,
 				struct netlink_callback *cb)
@@ -502,7 +566,7 @@
 		return -EINVAL;
 	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
 
-	msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]);
+	msg_seq = nla_get_u32(nltb[IWPM_NLA_RQUERY_MAPPING_SEQ]);
 	nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
 	if (!nlmsg_request) {
 		pr_info("%s: Could not find a matching request (seq = %u)\n",
@@ -511,9 +575,9 @@
 	}
 	pm_msg = nlmsg_request->req_buffer;
 	local_sockaddr = (struct sockaddr_storage *)
-			nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]);
+			nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]);
 	remote_sockaddr = (struct sockaddr_storage *)
-			nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]);
+			nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]);
 	mapped_loc_sockaddr = (struct sockaddr_storage *)
 			nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
 	mapped_rem_sockaddr = (struct sockaddr_storage *)
@@ -560,9 +624,13 @@
 	return 0;
 }
 
-/*
- * iwpm_remote_info_cb - Process a port mapper message, containing
- *			  the remote connecting peer address info
+/**
+ * iwpm_remote_info_cb - Process remote connecting peer address info, which
+ *                       the port mapper has received from the connecting peer
+ * @skb: The socket buffer
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * Stores the IPv4/IPv6 address info in a hash table
  */
 int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
 {
@@ -588,9 +656,9 @@
 	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
 
 	local_sockaddr = (struct sockaddr_storage *)
-			nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]);
+			nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]);
 	remote_sockaddr = (struct sockaddr_storage *)
-			nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]);
+			nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]);
 	mapped_loc_sockaddr = (struct sockaddr_storage *)
 			nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
 	mapped_rem_sockaddr = (struct sockaddr_storage *)
@@ -635,8 +703,14 @@
 	[IWPM_NLA_MAPINFO_ULIB_VER]  = { .type = NLA_U16 }
 };
 
-/*
- * iwpm_mapping_info_cb - Process a port mapper request for mapping info
+/**
+ * iwpm_mapping_info_cb - Process a notification that the userspace
+ *                        port mapper daemon is started
+ * @skb: The socket buffer
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * Using the received port mapper pid, send all the local mapping
+ * info records to the userspace port mapper
  */
 int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
 {
@@ -655,7 +729,7 @@
 	iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]);
 	iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]);
 	if (strcmp(iwpm_ulib_name, iwpm_name) ||
-			iwpm_version != iwpm_ulib_version) {
+			iwpm_version < IWPM_UABI_VERSION_MIN) {
 		pr_info("%s: Invalid port mapper name = %s version = %d\n",
 				__func__, iwpm_name, iwpm_version);
 		return ret;
@@ -669,6 +743,11 @@
 	iwpm_set_registration(nl_client, IWPM_REG_INCOMPL);
 	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
 	iwpm_user_pid = cb->nlh->nlmsg_pid;
+
+	if (iwpm_ulib_version < IWPM_UABI_VERSION)
+		pr_warn_once("%s: Down level iwpmd/pid %u.  Continuing...",
+			__func__, iwpm_user_pid);
+
 	if (!iwpm_mapinfo_available())
 		return 0;
 	pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
@@ -684,9 +763,11 @@
 	[IWPM_NLA_MAPINFO_ACK_NUM] =  { .type = NLA_U32 }
 };
 
-/*
- * iwpm_ack_mapping_info_cb - Process a port mapper ack for
- *                            the provided mapping info records
+/**
+ * iwpm_ack_mapping_info_cb - Process the port mapper ack for
+ *                            the provided local mapping info records
+ * @skb: The socket buffer
+ * @cb: Contains the received message (payload and netlink header)
  */
 int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
 {
@@ -712,8 +793,11 @@
 	[IWPM_NLA_ERR_CODE]       = { .type = NLA_U16 },
 };
 
-/*
- * iwpm_mapping_error_cb - Process a port mapper error message
+/**
+ * iwpm_mapping_error_cb - Process port mapper notification for error
+ *
+ * @skb: The socket buffer
+ * @cb: Contains the received message (payload and netlink header)
  */
 int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb)
 {
@@ -748,3 +832,46 @@
 	up(&nlmsg_request->sem);
 	return 0;
 }
+
+/* netlink attribute policy for the received hello request */
+static const struct nla_policy hello_policy[IWPM_NLA_HELLO_MAX] = {
+	[IWPM_NLA_HELLO_ABI_VERSION]     = { .type = NLA_U16 }
+};
+
+/**
+ * iwpm_hello_cb - Process a hello message from iwpmd
+ *
+ * @skb: The socket buffer
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * Using the received port mapper pid, send the kernel's abi_version
+ * after adjusting it to support the iwpmd version.
+ */
+int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nlattr *nltb[IWPM_NLA_HELLO_MAX];
+	const char *msg_type = "Hello request";
+	u8 nl_client;
+	u16 abi_version;
+	int ret = -EINVAL;
+
+	if (iwpm_parse_nlmsg(cb, IWPM_NLA_HELLO_MAX, hello_policy, nltb,
+			     msg_type)) {
+		pr_info("%s: Unable to parse nlmsg\n", __func__);
+		return ret;
+	}
+	abi_version = nla_get_u16(nltb[IWPM_NLA_HELLO_ABI_VERSION]);
+	nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+	if (!iwpm_valid_client(nl_client)) {
+		pr_info("%s: Invalid port mapper client = %d\n",
+				__func__, nl_client);
+		return ret;
+	}
+	iwpm_set_registration(nl_client, IWPM_REG_INCOMPL);
+	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+	iwpm_ulib_version = min_t(u16, IWPM_UABI_VERSION, abi_version);
+	pr_debug("Using ABI version %u\n", iwpm_ulib_version);
+	iwpm_user_pid = cb->nlh->nlmsg_pid;
+	ret = iwpm_send_hello(nl_client, iwpm_user_pid, iwpm_ulib_version);
+	return ret;
+}
diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c
index cdb63f3..13495b4 100644
--- a/drivers/infiniband/core/iwpm_util.c
+++ b/drivers/infiniband/core/iwpm_util.c
@@ -51,6 +51,12 @@
 static DEFINE_MUTEX(iwpm_admin_lock);
 static struct iwpm_admin_data iwpm_admin;
 
+/**
+ * iwpm_init - Allocate resources for the iwarp port mapper
+ * @nl_client: The index of the netlink client
+ *
+ * Should be called when network interface goes up.
+ */
 int iwpm_init(u8 nl_client)
 {
 	int ret = 0;
@@ -87,6 +93,12 @@
 static void free_hash_bucket(void);
 static void free_reminfo_bucket(void);
 
+/**
+ * iwpm_exit - Deallocate resources for the iwarp port mapper
+ * @nl_client: The index of the netlink client
+ *
+ * Should be called when network interface goes down.
+ */
 int iwpm_exit(u8 nl_client)
 {
 
@@ -112,9 +124,17 @@
 static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *,
 					       struct sockaddr_storage *);
 
+/**
+ * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address
+ *                       info in a hash table
+ * @local_sockaddr: Local ip/tcp address
+ * @mapped_sockaddr: Mapped local ip/tcp address
+ * @nl_client: The index of the netlink client
+ * @map_flags: IWPM mapping flags
+ */
 int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
 			struct sockaddr_storage *mapped_sockaddr,
-			u8 nl_client)
+			u8 nl_client, u32 map_flags)
 {
 	struct hlist_head *hash_bucket_head = NULL;
 	struct iwpm_mapping_info *map_info;
@@ -132,6 +152,7 @@
 	memcpy(&map_info->mapped_sockaddr, mapped_sockaddr,
 	       sizeof(struct sockaddr_storage));
 	map_info->nl_client = nl_client;
+	map_info->map_flags = map_flags;
 
 	spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
 	if (iwpm_hash_bucket) {
@@ -150,6 +171,15 @@
 	return ret;
 }
 
+/**
+ * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address
+ *                       info from the hash table
+ * @local_sockaddr: Local ip/tcp address
+ * @mapped_local_addr: Mapped local ip/tcp address
+ *
+ * Returns err code if mapping info is not found in the hash table,
+ * otherwise returns 0
+ */
 int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr,
 			struct sockaddr_storage *mapped_local_addr)
 {
@@ -250,6 +280,17 @@
 	spin_unlock_irqrestore(&iwpm_reminfo_lock, flags);
 }
 
+/**
+ * iwpm_get_remote_info - Get the remote connecting peer address info
+ *
+ * @mapped_loc_addr: Mapped local address of the listening peer
+ * @mapped_rem_addr: Mapped remote address of the connecting peer
+ * @remote_addr: To store the remote address of the connecting peer
+ * @nl_client: The index of the netlink client
+ *
+ * The remote address info is retrieved and provided to the client in
+ * the remote_addr. After that it is removed from the hash table
+ */
 int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr,
 			 struct sockaddr_storage *mapped_rem_addr,
 			 struct sockaddr_storage *remote_addr,
@@ -465,14 +506,14 @@
 	int ret;
 	const char *err_str = "";
 
-	ret = nlmsg_validate(cb->nlh, nlh_len, policy_max - 1, nlmsg_policy,
-			     NULL);
+	ret = nlmsg_validate_deprecated(cb->nlh, nlh_len, policy_max - 1,
+					nlmsg_policy, NULL);
 	if (ret) {
 		err_str = "Invalid attribute";
 		goto parse_nlmsg_error;
 	}
-	ret = nlmsg_parse(cb->nlh, nlh_len, nltb, policy_max - 1,
-			  nlmsg_policy, NULL);
+	ret = nlmsg_parse_deprecated(cb->nlh, nlh_len, nltb, policy_max - 1,
+				     nlmsg_policy, NULL);
 	if (ret) {
 		err_str = "Unable to parse the nlmsg";
 		goto parse_nlmsg_error;
@@ -604,7 +645,7 @@
 
 	nlmsg_end(skb, nlh);
 
-	ret = rdma_nl_unicast(skb, iwpm_pid);
+	ret = rdma_nl_unicast(&init_net, skb, iwpm_pid);
 	if (ret) {
 		skb = NULL;
 		err_str = "Unable to send a nlmsg";
@@ -614,8 +655,7 @@
 	return 0;
 mapinfo_num_error:
 	pr_info("%s: %s\n", __func__, err_str);
-	if (skb)
-		dev_kfree_skb(skb);
+	dev_kfree_skb(skb);
 	return ret;
 }
 
@@ -633,7 +673,7 @@
 		return -ENOMEM;
 	}
 	nlh->nlmsg_type = NLMSG_DONE;
-	ret = rdma_nl_unicast(skb, iwpm_pid);
+	ret = rdma_nl_unicast(&init_net, skb, iwpm_pid);
 	if (ret)
 		pr_warn("%s Unable to send a nlmsg\n", __func__);
 	return ret;
@@ -686,6 +726,14 @@
 			if (ret)
 				goto send_mapping_info_unlock;
 
+			if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) {
+				ret = ibnl_put_attr(skb, nlh, sizeof(u32),
+						&map_info->map_flags,
+						IWPM_NLA_MAPINFO_FLAGS);
+				if (ret)
+					goto send_mapping_info_unlock;
+			}
+
 			nlmsg_end(skb, nlh);
 
 			iwpm_print_sockaddr(&map_info->local_sockaddr,
@@ -729,8 +777,7 @@
 send_mapping_info_exit:
 	if (ret) {
 		pr_warn("%s: %s (ret = %d)\n", __func__, err_str, ret);
-		if (skb)
-			dev_kfree_skb(skb);
+		dev_kfree_skb(skb);
 		return ret;
 	}
 	send_nlmsg_done(skb, nl_client, iwpm_pid);
@@ -754,3 +801,37 @@
 	spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
 	return full_bucket;
 }
+
+int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version)
+{
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	const char *err_str = "";
+	int ret = -EINVAL;
+
+	skb = iwpm_create_nlmsg(RDMA_NL_IWPM_HELLO, &nlh, nl_client);
+	if (!skb) {
+		err_str = "Unable to create a nlmsg";
+		goto hello_num_error;
+	}
+	nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+	err_str = "Unable to put attribute of abi_version into nlmsg";
+	ret = ibnl_put_attr(skb, nlh, sizeof(u16), &abi_version,
+			    IWPM_NLA_HELLO_ABI_VERSION);
+	if (ret)
+		goto hello_num_error;
+	nlmsg_end(skb, nlh);
+
+	ret = rdma_nl_unicast(&init_net, skb, iwpm_pid);
+	if (ret) {
+		skb = NULL;
+		err_str = "Unable to send a nlmsg";
+		goto hello_num_error;
+	}
+	pr_debug("%s: Sent hello abi_version = %u\n", __func__, abi_version);
+	return 0;
+hello_num_error:
+	pr_info("%s: %s\n", __func__, err_str);
+	dev_kfree_skb(skb);
+	return ret;
+}
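iwpm_create_mapinfo() now records a per-mapping map_flags value, and send_mapping_info() above forwards it to iwpmd as IWPM_NLA_MAPINFO_FLAGS when the negotiated ABI is new enough. A hedged sketch of a driver-side caller passing the flag (IWPM_FLAGS_NO_PORT_MAP is assumed to be the flag this series defines in rdma/iw_portmap.h; record_mapping() is illustrative only):

#include <rdma/iw_portmap.h>

/* Illustrative only: remember a local/mapped address pair for nl_client. */
static int record_mapping(struct sockaddr_storage *local,
			  struct sockaddr_storage *mapped,
			  u8 nl_client, bool no_port_map)
{
	u32 flags = no_port_map ? IWPM_FLAGS_NO_PORT_MAP : 0;

	return iwpm_create_mapinfo(local, mapped, nl_client, flags);
}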
diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h
index af1fc14..7e2bcc7 100644
--- a/drivers/infiniband/core/iwpm_util.h
+++ b/drivers/infiniband/core/iwpm_util.h
@@ -78,6 +78,7 @@
 	struct sockaddr_storage local_sockaddr;
 	struct sockaddr_storage mapped_sockaddr;
 	u8     nl_client;
+	u32    map_flags;
 };
 
 struct iwpm_remote_info {
@@ -266,4 +267,15 @@
  * @msg: Message to print
  */
 void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg);
+
+/**
+ * iwpm_send_hello - Send hello response to iwpmd
+ *
+ * @nl_client: The index of the netlink client
+ * @abi_version: The kernel's abi_version
+ *
+ * Returns 0 on success or a negative error code
+ */
+int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version);
+extern u16 iwpm_ulib_version;
 #endif
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index ef459f2..9947d16 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -3,7 +3,7 @@
  * Copyright (c) 2005 Intel Corporation.  All rights reserved.
  * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
  * Copyright (c) 2009 HNR Consulting. All rights reserved.
- * Copyright (c) 2014 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2014,2018 Intel Corporation.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -38,10 +38,10 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/dma-mapping.h>
-#include <linux/idr.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/security.h>
+#include <linux/xarray.h>
 #include <rdma/ib_cache.h>
 
 #include "mad_priv.h"
@@ -51,6 +51,32 @@
 #include "opa_smi.h"
 #include "agent.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/ib_mad.h>
+
+#ifdef CONFIG_TRACEPOINTS
+static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr,
+			  struct ib_mad_qp_info *qp_info,
+			  struct trace_event_raw_ib_mad_send_template *entry)
+{
+	u16 pkey;
+	struct ib_device *dev = qp_info->port_priv->device;
+	u8 pnum = qp_info->port_priv->port_num;
+	struct ib_ud_wr *wr = &mad_send_wr->send_wr;
+	struct rdma_ah_attr attr = {};
+
+	rdma_query_ah(wr->ah, &attr);
+
+	/* These are common */
+	entry->sl = attr.sl;
+	ib_query_pkey(dev, pnum, wr->pkey_index, &pkey);
+	entry->pkey = pkey;
+	entry->rqpn = wr->remote_qpn;
+	entry->rqkey = wr->remote_qkey;
+	entry->dlid = rdma_ah_get_dlid(&attr);
+}
+#endif
+
 static int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
 static int mad_recvq_size = IB_MAD_QP_RECV_SIZE;
 
@@ -59,12 +85,9 @@
 module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
 MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
 
-/*
- * The mlx4 driver uses the top byte to distinguish which virtual function
- * generated the MAD, so we must avoid using it.
- */
-#define AGENT_ID_LIMIT		(1 << 24)
-static DEFINE_IDR(ib_mad_clients);
+/* Client ID 0 is used for snoop-only clients */
+static DEFINE_XARRAY_ALLOC1(ib_mad_clients);
+static u32 ib_mad_client_next;
 static struct list_head ib_mad_port_list;
 
 /* Port list lock */
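The MAD client table switches from an IDR to an allocating XArray: DEFINE_XARRAY_ALLOC1() hands out IDs starting at 1, which is how client ID 0 stays reserved for snoop-only clients now that the explicit idr_alloc() placeholder at the end of this file is gone. A self-contained sketch of the same allocate/erase pattern, with the cyclic limit kept below 1 << 24 as in the registration path further down (all example_* names are illustrative):

#include <linux/xarray.h>

static DEFINE_XARRAY_ALLOC1(example_clients);	/* IDs are handed out from 1 */
static u32 example_next;

/* Cyclic allocation avoids quickly reusing a just-freed ID. */
static int example_register(void *client, u32 *id)
{
	int ret = xa_alloc_cyclic(&example_clients, id, client,
				  XA_LIMIT(0, (1 << 24) - 1),
				  &example_next, GFP_KERNEL);

	return ret < 0 ? ret : 0;	/* ret == 1 only means the ID space wrapped */
}

static void example_unregister(u32 id)
{
	xa_erase(&example_clients, id);
}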
@@ -220,33 +243,37 @@
 	int ret2, qpn;
 	u8 mgmt_class, vclass;
 
+	if ((qp_type == IB_QPT_SMI && !rdma_cap_ib_smi(device, port_num)) ||
+	    (qp_type == IB_QPT_GSI && !rdma_cap_ib_cm(device, port_num)))
+		return ERR_PTR(-EPROTONOSUPPORT);
+
 	/* Validate parameters */
 	qpn = get_spl_qp_index(qp_type);
 	if (qpn == -1) {
-		dev_notice(&device->dev,
-			   "ib_register_mad_agent: invalid QP Type %d\n",
-			   qp_type);
+		dev_dbg_ratelimited(&device->dev, "%s: invalid QP Type %d\n",
+				    __func__, qp_type);
 		goto error1;
 	}
 
 	if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) {
-		dev_notice(&device->dev,
-			   "ib_register_mad_agent: invalid RMPP Version %u\n",
-			   rmpp_version);
+		dev_dbg_ratelimited(&device->dev,
+				    "%s: invalid RMPP Version %u\n",
+				    __func__, rmpp_version);
 		goto error1;
 	}
 
 	/* Validate MAD registration request if supplied */
 	if (mad_reg_req) {
 		if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) {
-			dev_notice(&device->dev,
-				   "ib_register_mad_agent: invalid Class Version %u\n",
-				   mad_reg_req->mgmt_class_version);
+			dev_dbg_ratelimited(&device->dev,
+					    "%s: invalid Class Version %u\n",
+					    __func__,
+					    mad_reg_req->mgmt_class_version);
 			goto error1;
 		}
 		if (!recv_handler) {
-			dev_notice(&device->dev,
-				   "ib_register_mad_agent: no recv_handler\n");
+			dev_dbg_ratelimited(&device->dev,
+					    "%s: no recv_handler\n", __func__);
 			goto error1;
 		}
 		if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) {
@@ -256,9 +283,9 @@
 			 */
 			if (mad_reg_req->mgmt_class !=
 			    IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
-				dev_notice(&device->dev,
-					   "ib_register_mad_agent: Invalid Mgmt Class 0x%x\n",
-					   mad_reg_req->mgmt_class);
+				dev_dbg_ratelimited(&device->dev,
+					"%s: Invalid Mgmt Class 0x%x\n",
+					__func__, mad_reg_req->mgmt_class);
 				goto error1;
 			}
 		} else if (mad_reg_req->mgmt_class == 0) {
@@ -266,8 +293,9 @@
 			 * Class 0 is reserved in IBA and is used for
 			 * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
 			 */
-			dev_notice(&device->dev,
-				   "ib_register_mad_agent: Invalid Mgmt Class 0\n");
+			dev_dbg_ratelimited(&device->dev,
+					    "%s: Invalid Mgmt Class 0\n",
+					    __func__);
 			goto error1;
 		} else if (is_vendor_class(mad_reg_req->mgmt_class)) {
 			/*
@@ -275,18 +303,19 @@
 			 * ensure supplied OUI is not zero
 			 */
 			if (!is_vendor_oui(mad_reg_req->oui)) {
-				dev_notice(&device->dev,
-					   "ib_register_mad_agent: No OUI specified for class 0x%x\n",
-					   mad_reg_req->mgmt_class);
+				dev_dbg_ratelimited(&device->dev,
+					"%s: No OUI specified for class 0x%x\n",
+					__func__,
+					mad_reg_req->mgmt_class);
 				goto error1;
 			}
 		}
 		/* Make sure class supplied is consistent with RMPP */
 		if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) {
 			if (rmpp_version) {
-				dev_notice(&device->dev,
-					   "ib_register_mad_agent: RMPP version for non-RMPP class 0x%x\n",
-					   mad_reg_req->mgmt_class);
+				dev_dbg_ratelimited(&device->dev,
+					"%s: RMPP version for non-RMPP class 0x%x\n",
+					__func__, mad_reg_req->mgmt_class);
 				goto error1;
 			}
 		}
@@ -297,9 +326,9 @@
 					IB_MGMT_CLASS_SUBN_LID_ROUTED) &&
 			    (mad_reg_req->mgmt_class !=
 					IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
-				dev_notice(&device->dev,
-					   "ib_register_mad_agent: Invalid SM QP type: class 0x%x\n",
-					   mad_reg_req->mgmt_class);
+				dev_dbg_ratelimited(&device->dev,
+					"%s: Invalid SM QP type: class 0x%x\n",
+					__func__, mad_reg_req->mgmt_class);
 				goto error1;
 			}
 		} else {
@@ -307,9 +336,9 @@
 					IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
 			    (mad_reg_req->mgmt_class ==
 					IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
-				dev_notice(&device->dev,
-					   "ib_register_mad_agent: Invalid GS QP type: class 0x%x\n",
-					   mad_reg_req->mgmt_class);
+				dev_dbg_ratelimited(&device->dev,
+					"%s: Invalid GS QP type: class 0x%x\n",
+					__func__, mad_reg_req->mgmt_class);
 				goto error1;
 			}
 		}
@@ -324,18 +353,18 @@
 	/* Validate device and port */
 	port_priv = ib_get_mad_port(device, port_num);
 	if (!port_priv) {
-		dev_notice(&device->dev,
-			   "ib_register_mad_agent: Invalid port %d\n",
-			   port_num);
+		dev_dbg_ratelimited(&device->dev, "%s: Invalid port %d\n",
+				    __func__, port_num);
 		ret = ERR_PTR(-ENODEV);
 		goto error1;
 	}
 
-	/* Verify the QP requested is supported.  For example, Ethernet devices
-	 * will not have QP0 */
+	/* Verify the QP requested is supported. For example, Ethernet devices
+	 * will not have QP0.
+	 */
 	if (!port_priv->qp_info[qpn].qp) {
-		dev_notice(&device->dev,
-			   "ib_register_mad_agent: QP %d not supported\n", qpn);
+		dev_dbg_ratelimited(&device->dev, "%s: QP %d not supported\n",
+				    __func__, qpn);
 		ret = ERR_PTR(-EPROTONOSUPPORT);
 		goto error1;
 	}
@@ -383,18 +412,17 @@
 		goto error4;
 	}
 
-	idr_preload(GFP_KERNEL);
-	idr_lock(&ib_mad_clients);
-	ret2 = idr_alloc_cyclic(&ib_mad_clients, mad_agent_priv, 0,
-			AGENT_ID_LIMIT, GFP_ATOMIC);
-	idr_unlock(&ib_mad_clients);
-	idr_preload_end();
-
+	/*
+	 * The mlx4 driver uses the top byte to distinguish which virtual
+	 * function generated the MAD, so we must avoid using it.
+	 */
+	ret2 = xa_alloc_cyclic(&ib_mad_clients, &mad_agent_priv->agent.hi_tid,
+			mad_agent_priv, XA_LIMIT(0, (1 << 24) - 1),
+			&ib_mad_client_next, GFP_KERNEL);
 	if (ret2 < 0) {
 		ret = ERR_PTR(ret2);
 		goto error5;
 	}
-	mad_agent_priv->agent.hi_tid = ret2;
 
 	/*
 	 * Make sure MAD registration (if supplied)
@@ -439,12 +467,11 @@
 	}
 	spin_unlock_irq(&port_priv->reg_lock);
 
+	trace_ib_mad_create_agent(mad_agent_priv);
 	return &mad_agent_priv->agent;
 error6:
 	spin_unlock_irq(&port_priv->reg_lock);
-	idr_lock(&ib_mad_clients);
-	idr_remove(&ib_mad_clients, mad_agent_priv->agent.hi_tid);
-	idr_unlock(&ib_mad_clients);
+	xa_erase(&ib_mad_clients, mad_agent_priv->agent.hi_tid);
 error5:
 	ib_mad_agent_security_cleanup(&mad_agent_priv->agent);
 error4:
@@ -596,6 +623,7 @@
 	struct ib_mad_port_private *port_priv;
 
 	/* Note that we could still be handling received MADs */
+	trace_ib_mad_unregister_agent(mad_agent_priv);
 
 	/*
 	 * Canceling all sends results in dropping received response
@@ -608,9 +636,7 @@
 	spin_lock_irq(&port_priv->reg_lock);
 	remove_mad_reg_req(mad_agent_priv);
 	spin_unlock_irq(&port_priv->reg_lock);
-	idr_lock(&ib_mad_clients);
-	idr_remove(&ib_mad_clients, mad_agent_priv->agent.hi_tid);
-	idr_unlock(&ib_mad_clients);
+	xa_erase(&ib_mad_clients, mad_agent_priv->agent.hi_tid);
 
 	flush_workqueue(port_priv->wq);
 	ib_cancel_rmpp_recvs(mad_agent_priv);
@@ -815,6 +841,8 @@
 	if (opa && smp->class_version == OPA_SM_CLASS_VERSION) {
 		u32 opa_drslid;
 
+		trace_ib_mad_handle_out_opa_smi(opa_smp);
+
 		if ((opa_get_smp_direction(opa_smp)
 		     ? opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) ==
 		     OPA_LID_PERMISSIVE &&
@@ -840,6 +868,8 @@
 		    opa_smi_check_local_returning_smp(opa_smp, device) == IB_SMI_DISCARD)
 			goto out;
 	} else {
+		trace_ib_mad_handle_out_ib_smi(smp);
+
 		if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) ==
 		     IB_LID_PERMISSIVE &&
 		     smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) ==
@@ -882,10 +912,10 @@
 	}
 
 	/* No GRH for DR SMP */
-	ret = device->process_mad(device, 0, port_num, &mad_wc, NULL,
-				  (const struct ib_mad_hdr *)smp, mad_size,
-				  (struct ib_mad_hdr *)mad_priv->mad,
-				  &mad_size, &out_mad_pkey_index);
+	ret = device->ops.process_mad(device, 0, port_num, &mad_wc, NULL,
+				      (const struct ib_mad_hdr *)smp, mad_size,
+				      (struct ib_mad_hdr *)mad_priv->mad,
+				      &mad_size, &out_mad_pkey_index);
 	switch (ret)
 	{
 	case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY:
@@ -1217,6 +1247,7 @@
 
 	spin_lock_irqsave(&qp_info->send_queue.lock, flags);
 	if (qp_info->send_queue.count < qp_info->send_queue.max_active) {
+		trace_ib_mad_ib_send_mad(mad_send_wr, qp_info);
 		ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr.wr,
 				   NULL);
 		list = &qp_info->send_queue.list;
@@ -1750,7 +1781,7 @@
 		 */
 		hi_tid = be64_to_cpu(mad_hdr->tid) >> 32;
 		rcu_read_lock();
-		mad_agent = idr_find(&ib_mad_clients, hi_tid);
+		mad_agent = xa_load(&ib_mad_clients, hi_tid);
 		if (mad_agent && !atomic_inc_not_zero(&mad_agent->refcount))
 			mad_agent = NULL;
 		rcu_read_unlock();
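idr_find() becomes xa_load() here, but the surrounding idiom is unchanged: the lookup runs under rcu_read_lock() and the agent only leaves the critical section if atomic_inc_not_zero() can still take a reference, so a reader racing with deregistration sees "not found" instead of a stale pointer. A generic sketch of that lookup-and-get idiom (struct example_obj is illustrative):

#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/xarray.h>

struct example_obj {
	atomic_t refcount;
	/* ... payload ... */
};

static struct example_obj *example_get(struct xarray *xa, unsigned long id)
{
	struct example_obj *obj;

	rcu_read_lock();
	obj = xa_load(xa, id);
	/* Hand the object out only if it is still alive. */
	if (obj && !atomic_inc_not_zero(&obj->refcount))
		obj = NULL;
	rcu_read_unlock();

	return obj;	/* the caller drops the reference when done */
}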
@@ -2071,6 +2102,8 @@
 	enum smi_forward_action retsmi;
 	struct ib_smp *smp = (struct ib_smp *)recv->mad;
 
+	trace_ib_mad_handle_ib_smi(smp);
+
 	if (smi_handle_dr_smp_recv(smp,
 				   rdma_cap_ib_switch(port_priv->device),
 				   port_num,
@@ -2156,6 +2189,8 @@
 	enum smi_forward_action retsmi;
 	struct opa_smp *smp = (struct opa_smp *)recv->mad;
 
+	trace_ib_mad_handle_opa_smi(smp);
+
 	if (opa_smi_handle_dr_smp_recv(smp,
 				   rdma_cap_ib_switch(port_priv->device),
 				   port_num,
@@ -2280,6 +2315,9 @@
 	if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa))
 		goto out;
 
+	trace_ib_mad_recv_done_handler(qp_info, wc,
+				       (struct ib_mad_hdr *)recv->mad);
+
 	mad_size = recv->mad_size;
 	response = alloc_mad_private(mad_size, GFP_KERNEL);
 	if (!response)
@@ -2299,14 +2337,12 @@
 	}
 
 	/* Give driver "right of first refusal" on incoming MAD */
-	if (port_priv->device->process_mad) {
-		ret = port_priv->device->process_mad(port_priv->device, 0,
-						     port_priv->port_num,
-						     wc, &recv->grh,
-						     (const struct ib_mad_hdr *)recv->mad,
-						     recv->mad_size,
-						     (struct ib_mad_hdr *)response->mad,
-						     &mad_size, &resp_mad_pkey_index);
+	if (port_priv->device->ops.process_mad) {
+		ret = port_priv->device->ops.process_mad(
+			port_priv->device, 0, port_priv->port_num, wc,
+			&recv->grh, (const struct ib_mad_hdr *)recv->mad,
+			recv->mad_size, (struct ib_mad_hdr *)response->mad,
+			&mad_size, &resp_mad_pkey_index);
 
 		if (opa)
 			wc->pkey_index = resp_mad_pkey_index;
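Both process_mad call sites (here and in handle_outgoing_dr_smp() earlier) move from the old per-device function pointer to the consolidated struct ib_device_ops table, so the core now checks and calls device->ops.process_mad. On the provider side the hook is installed with ib_set_device_ops(); a hedged sketch for a hypothetical driver, with the callback prototype inferred from the call sites above (all mydrv_* names are illustrative):

static int mydrv_process_mad(struct ib_device *ibdev, int mad_flags,
			     u8 port_num, const struct ib_wc *in_wc,
			     const struct ib_grh *in_grh,
			     const struct ib_mad_hdr *in_mad, size_t in_mad_size,
			     struct ib_mad_hdr *out_mad, size_t *out_mad_size,
			     u16 *out_mad_pkey_index)
{
	/* Right of first refusal: let the core continue normal processing. */
	return IB_MAD_RESULT_SUCCESS;
}

static const struct ib_device_ops mydrv_dev_ops = {
	.process_mad = mydrv_process_mad,
	/* ... other ops ... */
};

static void mydrv_set_ops(struct ib_device *ibdev)
{
	ib_set_device_ops(ibdev, &mydrv_dev_ops);
}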
@@ -2328,6 +2364,7 @@
 
 	mad_agent = find_mad_agent(port_priv, (const struct ib_mad_hdr *)recv->mad);
 	if (mad_agent) {
+		trace_ib_mad_recv_done_agent(mad_agent);
 		ib_mad_complete_recv(mad_agent, &recv->header.recv_wc);
 		/*
 		 * recv is freed up in error cases in ib_mad_complete_recv
@@ -2408,7 +2445,7 @@
 }
 
 void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
-			  int timeout_ms)
+			  unsigned long timeout_ms)
 {
 	mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
 	wait_for_response(mad_send_wr);
@@ -2492,6 +2529,9 @@
 	send_queue = mad_list->mad_queue;
 	qp_info = send_queue->qp_info;
 
+	trace_ib_mad_send_done_agent(mad_send_wr->mad_agent_priv);
+	trace_ib_mad_send_done_handler(mad_send_wr, wc);
+
 retry:
 	ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
 			    mad_send_wr->header_mapping,
@@ -2523,6 +2563,7 @@
 	ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
 
 	if (queued_send_wr) {
+		trace_ib_mad_send_done_resend(queued_send_wr, qp_info);
 		ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr.wr,
 				   NULL);
 		if (ret) {
@@ -2570,6 +2611,7 @@
 		if (mad_send_wr->retry) {
 			/* Repost send */
 			mad_send_wr->retry = 0;
+			trace_ib_mad_error_handler(mad_send_wr, qp_info);
 			ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr,
 					   NULL);
 			if (!ret)
@@ -3182,18 +3224,18 @@
 	if (has_smi)
 		cq_size *= 2;
 
-	port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0,
-			IB_POLL_WORKQUEUE);
-	if (IS_ERR(port_priv->cq)) {
-		dev_err(&device->dev, "Couldn't create ib_mad CQ\n");
-		ret = PTR_ERR(port_priv->cq);
-		goto error3;
-	}
-
 	port_priv->pd = ib_alloc_pd(device, 0);
 	if (IS_ERR(port_priv->pd)) {
 		dev_err(&device->dev, "Couldn't create ib_mad PD\n");
 		ret = PTR_ERR(port_priv->pd);
+		goto error3;
+	}
+
+	port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0,
+			IB_POLL_UNBOUND_WORKQUEUE);
+	if (IS_ERR(port_priv->cq)) {
+		dev_err(&device->dev, "Couldn't create ib_mad CQ\n");
+		ret = PTR_ERR(port_priv->cq);
 		goto error4;
 	}
 
@@ -3236,11 +3278,11 @@
 error7:
 	destroy_mad_qp(&port_priv->qp_info[0]);
 error6:
-	ib_dealloc_pd(port_priv->pd);
-error4:
 	ib_free_cq(port_priv->cq);
 	cleanup_recv_queue(&port_priv->qp_info[1]);
 	cleanup_recv_queue(&port_priv->qp_info[0]);
+error4:
+	ib_dealloc_pd(port_priv->pd);
 error3:
 	kfree(port_priv);
 
@@ -3270,8 +3312,8 @@
 	destroy_workqueue(port_priv->wq);
 	destroy_mad_qp(&port_priv->qp_info[1]);
 	destroy_mad_qp(&port_priv->qp_info[0]);
-	ib_dealloc_pd(port_priv->pd);
 	ib_free_cq(port_priv->cq);
+	ib_dealloc_pd(port_priv->pd);
 	cleanup_recv_queue(&port_priv->qp_info[1]);
 	cleanup_recv_queue(&port_priv->qp_info[0]);
 	/* XXX: Handle deallocation of MAD registration tables */
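The open and close hunks above make resource setup and teardown symmetric: the PD is now allocated before the CQ and released after it, and the MAD CQ switches to IB_POLL_UNBOUND_WORKQUEUE so completion processing is not pinned to the interrupted CPU. A condensed sketch of the resulting pairing (error paths trimmed; example_* names are illustrative):

static int example_port_open(struct ib_device *device, int cq_size,
			     struct ib_pd **pd, struct ib_cq **cq)
{
	*pd = ib_alloc_pd(device, 0);
	if (IS_ERR(*pd))
		return PTR_ERR(*pd);

	*cq = ib_alloc_cq(device, NULL, cq_size, 0, IB_POLL_UNBOUND_WORKQUEUE);
	if (IS_ERR(*cq)) {
		ib_dealloc_pd(*pd);
		return PTR_ERR(*cq);
	}
	return 0;
}

static void example_port_close(struct ib_pd *pd, struct ib_cq *cq)
{
	/* Tear down in reverse order of allocation. */
	ib_free_cq(cq);
	ib_dealloc_pd(pd);
}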
@@ -3322,9 +3364,9 @@
 
 static void ib_mad_remove_device(struct ib_device *device, void *client_data)
 {
-	int i;
+	unsigned int i;
 
-	for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
+	rdma_for_each_port (device, i) {
 		if (!rdma_cap_ib_mad(device, i))
 			continue;
 
@@ -3352,9 +3394,6 @@
 
 	INIT_LIST_HEAD(&ib_mad_port_list);
 
-	/* Client ID 0 is used for snoop-only clients */
-	idr_alloc(&ib_mad_clients, NULL, 0, 0, GFP_KERNEL);
-
 	if (ib_register_client(&mad_client)) {
 		pr_err("Couldn't register ib_mad client\n");
 		return -EINVAL;
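ib_mad_remove_device() above now walks ports with the rdma_for_each_port() iterator instead of an open-coded rdma_start_port()/rdma_end_port() loop, which is also why the iterator variable becomes an unsigned int. A brief illustrative use of the macro:

static void example_scan_ports(struct ib_device *device)
{
	unsigned int port;

	rdma_for_each_port(device, port) {
		if (!rdma_cap_ib_mad(device, port))
			continue;
		pr_debug("port %u supports MAD\n", port);
	}
}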
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index d84ae16..956b3a7 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -73,14 +73,14 @@
 	struct ib_mad_recv_wc recv_wc;
 	struct ib_wc wc;
 	u64 mapping;
-} __attribute__ ((packed));
+} __packed;
 
 struct ib_mad_private {
 	struct ib_mad_private_header header;
 	size_t mad_size;
 	struct ib_grh grh;
 	u8 mad[0];
-} __attribute__ ((packed));
+} __packed;
 
 struct ib_rmpp_segment {
 	struct list_head list;
@@ -221,6 +221,6 @@
 void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr);
 
 void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
-			  int timeout_ms);
+			  unsigned long timeout_ms);
 
 #endif	/* __IB_MAD_PRIV_H__ */
diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
index e5cf09c..5ec57ab 100644
--- a/drivers/infiniband/core/mad_rmpp.c
+++ b/drivers/infiniband/core/mad_rmpp.c
@@ -81,7 +81,7 @@
 {
 	deref_rmpp_recv(rmpp_recv);
 	wait_for_completion(&rmpp_recv->comp);
-	rdma_destroy_ah(rmpp_recv->ah);
+	rdma_destroy_ah(rmpp_recv->ah, RDMA_DESTROY_AH_SLEEPABLE);
 	kfree(rmpp_recv);
 }
 
@@ -171,7 +171,7 @@
 				 hdr_len, 0, GFP_KERNEL,
 				 IB_MGMT_BASE_VERSION);
 	if (IS_ERR(msg))
-		rdma_destroy_ah(ah);
+		rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE);
 	else {
 		msg->ah = ah;
 		msg->context[0] = ah;
@@ -201,7 +201,7 @@
 
 	ret = ib_post_send_mad(msg, NULL);
 	if (ret) {
-		rdma_destroy_ah(msg->ah);
+		rdma_destroy_ah(msg->ah, RDMA_DESTROY_AH_SLEEPABLE);
 		ib_free_send_mad(msg);
 	}
 }
@@ -209,7 +209,8 @@
 void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc)
 {
 	if (mad_send_wc->send_buf->context[0] == mad_send_wc->send_buf->ah)
-		rdma_destroy_ah(mad_send_wc->send_buf->ah);
+		rdma_destroy_ah(mad_send_wc->send_buf->ah,
+				RDMA_DESTROY_AH_SLEEPABLE);
 	ib_free_send_mad(mad_send_wc->send_buf);
 }
 
@@ -237,7 +238,7 @@
 
 	ret = ib_post_send_mad(msg, NULL);
 	if (ret) {
-		rdma_destroy_ah(msg->ah);
+		rdma_destroy_ah(msg->ah, RDMA_DESTROY_AH_SLEEPABLE);
 		ib_free_send_mad(msg);
 	}
 }
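rdma_destroy_ah() now takes a flags argument; every mad_rmpp.c call site runs in a context that may sleep, so they all pass RDMA_DESTROY_AH_SLEEPABLE to let the driver block if it needs to. A hedged sketch of the matching create/destroy pairing (example_ah_roundtrip() is illustrative and assumes rdma_create_ah() with the corresponding RDMA_CREATE_AH_SLEEPABLE flag):

static int example_ah_roundtrip(struct ib_pd *pd, struct rdma_ah_attr *attr)
{
	struct ib_ah *ah;

	ah = rdma_create_ah(pd, attr, RDMA_CREATE_AH_SLEEPABLE);
	if (IS_ERR(ah))
		return PTR_ERR(ah);

	/* ... post sends that reference the AH ... */

	return rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE);
}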
diff --git a/drivers/infiniband/core/mr_pool.c b/drivers/infiniband/core/mr_pool.c
index 49d478b..c0e2df1 100644
--- a/drivers/infiniband/core/mr_pool.c
+++ b/drivers/infiniband/core/mr_pool.c
@@ -1,14 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2016 HGST, a Western Digital Company.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 #include <rdma/ib_verbs.h>
 #include <rdma/mr_pool.h>
@@ -42,14 +34,18 @@
 EXPORT_SYMBOL(ib_mr_pool_put);
 
 int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr,
-		enum ib_mr_type type, u32 max_num_sg)
+		enum ib_mr_type type, u32 max_num_sg, u32 max_num_meta_sg)
 {
 	struct ib_mr *mr;
 	unsigned long flags;
 	int ret, i;
 
 	for (i = 0; i < nr; i++) {
-		mr = ib_alloc_mr(qp->pd, type, max_num_sg);
+		if (type == IB_MR_TYPE_INTEGRITY)
+			mr = ib_alloc_mr_integrity(qp->pd, max_num_sg,
+						   max_num_meta_sg);
+		else
+			mr = ib_alloc_mr(qp->pd, type, max_num_sg);
 		if (IS_ERR(mr)) {
 			ret = PTR_ERR(mr);
 			goto out;
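ib_mr_pool_init() grows a max_num_meta_sg argument and allocates IB_MR_TYPE_INTEGRITY MRs through ib_alloc_mr_integrity() when asked. A hedged sketch of how a ULP might size the two kinds of pools (example_* names are illustrative):

#include <rdma/mr_pool.h>

/* Integrity/signature MRs: data and metadata SGEs are sized separately. */
static int example_init_sig_mrs(struct ib_qp *qp, struct list_head *pool,
				int nr, u32 max_sg, u32 max_meta_sg)
{
	return ib_mr_pool_init(qp, pool, nr, IB_MR_TYPE_INTEGRITY,
			       max_sg, max_meta_sg);
}

/* Plain registration MRs do not need metadata SGEs. */
static int example_init_reg_mrs(struct ib_qp *qp, struct list_head *pool,
				int nr, u32 max_sg)
{
	return ib_mr_pool_init(qp, pool, nr, IB_MR_TYPE_MEM_REG, max_sg, 0);
}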
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index d50ff70..cd338dd 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -804,7 +804,6 @@
 	switch (event->event) {
 	case IB_EVENT_PORT_ERR:
 	case IB_EVENT_LID_CHANGE:
-	case IB_EVENT_SM_CHANGE:
 	case IB_EVENT_CLIENT_REREGISTER:
 		mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR);
 		break;
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index 3ccaae1..8cd31ef 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -36,27 +36,31 @@
 #include <linux/export.h>
 #include <net/netlink.h>
 #include <net/net_namespace.h>
+#include <net/netns/generic.h>
 #include <net/sock.h>
 #include <rdma/rdma_netlink.h>
 #include <linux/module.h>
 #include "core_priv.h"
 
-static DEFINE_MUTEX(rdma_nl_mutex);
-static struct sock *nls;
 static struct {
-	const struct rdma_nl_cbs   *cb_table;
+	const struct rdma_nl_cbs *cb_table;
+	/* Synchronizes between ongoing netlink commands and netlink client
+	 * unregistration.
+	 */
+	struct rw_semaphore sem;
 } rdma_nl_types[RDMA_NL_NUM_CLIENTS];
 
-int rdma_nl_chk_listeners(unsigned int group)
+bool rdma_nl_chk_listeners(unsigned int group)
 {
-	return (netlink_has_listeners(nls, group)) ? 0 : -1;
+	struct rdma_dev_net *rnet = rdma_net_to_dev_net(&init_net);
+
+	return netlink_has_listeners(rnet->nl_sock, group);
 }
 EXPORT_SYMBOL(rdma_nl_chk_listeners);
 
 static bool is_nl_msg_valid(unsigned int type, unsigned int op)
 {
 	static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS] = {
-		[RDMA_NL_RDMA_CM] = RDMA_NL_RDMA_CM_NUM_OPS,
 		[RDMA_NL_IWCM] = RDMA_NL_IWPM_NUM_OPS,
 		[RDMA_NL_LS] = RDMA_NL_LS_NUM_OPS,
 		[RDMA_NL_NLDEV] = RDMA_NLDEV_NUM_OPS,
@@ -74,62 +78,53 @@
 	return (op < max_num_ops[type]) ? true : false;
 }
 
-static bool is_nl_valid(unsigned int type, unsigned int op)
+static const struct rdma_nl_cbs *
+get_cb_table(const struct sk_buff *skb, unsigned int type, unsigned int op)
 {
 	const struct rdma_nl_cbs *cb_table;
 
-	if (!is_nl_msg_valid(type, op))
-		return false;
+	/*
+	 * Currently only the NLDEV client supports netlink commands in
+	 * a non-init_net network namespace.
+	 */
+	if (sock_net(skb->sk) != &init_net && type != RDMA_NL_NLDEV)
+		return NULL;
 
-	if (!rdma_nl_types[type].cb_table) {
-		mutex_unlock(&rdma_nl_mutex);
+	cb_table = READ_ONCE(rdma_nl_types[type].cb_table);
+	if (!cb_table) {
+		/*
+		 * Didn't get valid reference of the table, attempt module
+		 * load once.
+		 */
+		up_read(&rdma_nl_types[type].sem);
+
 		request_module("rdma-netlink-subsys-%d", type);
-		mutex_lock(&rdma_nl_mutex);
+
+		down_read(&rdma_nl_types[type].sem);
+		cb_table = READ_ONCE(rdma_nl_types[type].cb_table);
 	}
-
-	cb_table = rdma_nl_types[type].cb_table;
-
 	if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit))
-		return false;
-	return true;
+		return NULL;
+	return cb_table;
 }
 
 void rdma_nl_register(unsigned int index,
 		      const struct rdma_nl_cbs cb_table[])
 {
-	mutex_lock(&rdma_nl_mutex);
-	if (!is_nl_msg_valid(index, 0)) {
-		/*
-		 * All clients are not interesting in success/failure of
-		 * this call. They want to see the print to error log and
-		 * continue their initialization. Print warning for them,
-		 * because it is programmer's error to be here.
-		 */
-		mutex_unlock(&rdma_nl_mutex);
-		WARN(true,
-		     "The not-valid %u index was supplied to RDMA netlink\n",
-		     index);
+	if (WARN_ON(!is_nl_msg_valid(index, 0)) ||
+	    WARN_ON(READ_ONCE(rdma_nl_types[index].cb_table)))
 		return;
-	}
 
-	if (rdma_nl_types[index].cb_table) {
-		mutex_unlock(&rdma_nl_mutex);
-		WARN(true,
-		     "The %u index is already registered in RDMA netlink\n",
-		     index);
-		return;
-	}
-
-	rdma_nl_types[index].cb_table = cb_table;
-	mutex_unlock(&rdma_nl_mutex);
+	/* Pairs with the READ_ONCE in is_nl_valid() */
+	smp_store_release(&rdma_nl_types[index].cb_table, cb_table);
 }
 EXPORT_SYMBOL(rdma_nl_register);
 
 void rdma_nl_unregister(unsigned int index)
 {
-	mutex_lock(&rdma_nl_mutex);
+	down_write(&rdma_nl_types[index].sem);
 	rdma_nl_types[index].cb_table = NULL;
-	mutex_unlock(&rdma_nl_mutex);
+	up_write(&rdma_nl_types[index].sem);
 }
 EXPORT_SYMBOL(rdma_nl_unregister);
 
@@ -161,15 +156,21 @@
 	unsigned int index = RDMA_NL_GET_CLIENT(type);
 	unsigned int op = RDMA_NL_GET_OP(type);
 	const struct rdma_nl_cbs *cb_table;
+	int err = -EINVAL;
 
-	if (!is_nl_valid(index, op))
+	if (!is_nl_msg_valid(index, op))
 		return -EINVAL;
 
-	cb_table = rdma_nl_types[index].cb_table;
+	down_read(&rdma_nl_types[index].sem);
+	cb_table = get_cb_table(skb, index, op);
+	if (!cb_table)
+		goto done;
 
 	if ((cb_table[op].flags & RDMA_NL_ADMIN_PERM) &&
-	    !netlink_capable(skb, CAP_NET_ADMIN))
-		return -EPERM;
+	    !netlink_capable(skb, CAP_NET_ADMIN)) {
+		err = -EPERM;
+		goto done;
+	}
 
 	/*
 	 * LS responses overload the 0x100 (NLM_F_ROOT) flag.  Don't
@@ -177,24 +178,24 @@
 	 */
 	if (index == RDMA_NL_LS) {
 		if (cb_table[op].doit)
-			return cb_table[op].doit(skb, nlh, extack);
-		return -EINVAL;
+			err = cb_table[op].doit(skb, nlh, extack);
+		goto done;
 	}
 	/* FIXME: Convert IWCM to properly handle doit callbacks */
-	if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_RDMA_CM ||
-	    index == RDMA_NL_IWCM) {
+	if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_IWCM) {
 		struct netlink_dump_control c = {
 			.dump = cb_table[op].dump,
 		};
 		if (c.dump)
-			return netlink_dump_start(nls, skb, nlh, &c);
-		return -EINVAL;
+			err = netlink_dump_start(skb->sk, skb, nlh, &c);
+		goto done;
 	}
 
 	if (cb_table[op].doit)
-		return cb_table[op].doit(skb, nlh, extack);
-
-	return 0;
+		err = cb_table[op].doit(skb, nlh, extack);
+done:
+	up_read(&rdma_nl_types[index].sem);
+	return err;
 }
 
 /*
@@ -255,47 +256,44 @@
 
 static void rdma_nl_rcv(struct sk_buff *skb)
 {
-	mutex_lock(&rdma_nl_mutex);
 	rdma_nl_rcv_skb(skb, &rdma_nl_rcv_msg);
-	mutex_unlock(&rdma_nl_mutex);
 }
 
-int rdma_nl_unicast(struct sk_buff *skb, u32 pid)
+int rdma_nl_unicast(struct net *net, struct sk_buff *skb, u32 pid)
 {
+	struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
 	int err;
 
-	err = netlink_unicast(nls, skb, pid, MSG_DONTWAIT);
+	err = netlink_unicast(rnet->nl_sock, skb, pid, MSG_DONTWAIT);
 	return (err < 0) ? err : 0;
 }
 EXPORT_SYMBOL(rdma_nl_unicast);
 
-int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid)
+int rdma_nl_unicast_wait(struct net *net, struct sk_buff *skb, __u32 pid)
 {
+	struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
 	int err;
 
-	err = netlink_unicast(nls, skb, pid, 0);
+	err = netlink_unicast(rnet->nl_sock, skb, pid, 0);
 	return (err < 0) ? err : 0;
 }
 EXPORT_SYMBOL(rdma_nl_unicast_wait);
 
-int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags)
+int rdma_nl_multicast(struct net *net, struct sk_buff *skb,
+		      unsigned int group, gfp_t flags)
 {
-	return nlmsg_multicast(nls, skb, 0, group, flags);
+	struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
+
+	return nlmsg_multicast(rnet->nl_sock, skb, 0, group, flags);
 }
 EXPORT_SYMBOL(rdma_nl_multicast);
 
-int __init rdma_nl_init(void)
+void rdma_nl_init(void)
 {
-	struct netlink_kernel_cfg cfg = {
-		.input	= rdma_nl_rcv,
-	};
+	int idx;
 
-	nls = netlink_kernel_create(&init_net, NETLINK_RDMA, &cfg);
-	if (!nls)
-		return -ENOMEM;
-
-	nls->sk_sndtimeo = 10 * HZ;
-	return 0;
+	for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++)
+		init_rwsem(&rdma_nl_types[idx].sem);
 }
 
 void rdma_nl_exit(void)
@@ -303,9 +301,31 @@
 	int idx;
 
 	for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++)
-		rdma_nl_unregister(idx);
+		WARN(rdma_nl_types[idx].cb_table,
+		     "Netlink client %d wasn't released prior to unloading %s\n",
+		     idx, KBUILD_MODNAME);
+}
 
-	netlink_kernel_release(nls);
+int rdma_nl_net_init(struct rdma_dev_net *rnet)
+{
+	struct net *net = read_pnet(&rnet->net);
+	struct netlink_kernel_cfg cfg = {
+		.input	= rdma_nl_rcv,
+	};
+	struct sock *nls;
+
+	nls = netlink_kernel_create(net, NETLINK_RDMA, &cfg);
+	if (!nls)
+		return -ENOMEM;
+
+	nls->sk_sndtimeo = 10 * HZ;
+	rnet->nl_sock = nls;
+	return 0;
+}
+
+void rdma_nl_net_exit(struct rdma_dev_net *rnet)
+{
+	netlink_kernel_release(rnet->nl_sock);
 }
 
 MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_RDMA);
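The single rdma_nl_mutex is gone: each client slot gets its own rw_semaphore, command dispatch takes it for read (so handlers for different clients can run concurrently and may even drop the lock around request_module()), and rdma_nl_unregister() takes it for write so it waits out any in-flight command before clearing the callback table. The kernel netlink socket itself also becomes per network namespace via rdma_nl_net_init()/rdma_nl_net_exit(). A condensed sketch of the read/write pairing (example_* names are illustrative):

#include <linux/rwsem.h>

struct example_client {
	const void *cb_table;		/* published with smp_store_release() */
	struct rw_semaphore sem;	/* readers: dispatch, writer: unregister */
};

static int example_dispatch(struct example_client *c)
{
	int err = -EINVAL;

	down_read(&c->sem);
	if (READ_ONCE(c->cb_table)) {
		/* ... invoke the handler; unregistration cannot race with us ... */
		err = 0;
	}
	up_read(&c->sem);
	return err;
}

static void example_unregister(struct example_client *c)
{
	down_write(&c->sem);	/* waits for every in-flight dispatch */
	c->cb_table = NULL;
	up_write(&c->sem);
}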
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 0385ab4..c03af08 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -33,80 +33,114 @@
 #include <linux/module.h>
 #include <linux/pid.h>
 #include <linux/pid_namespace.h>
+#include <linux/mutex.h>
 #include <net/netlink.h>
 #include <rdma/rdma_cm.h>
 #include <rdma/rdma_netlink.h>
 
 #include "core_priv.h"
 #include "cma_priv.h"
+#include "restrack.h"
 
+/*
+ * Sort array elements by the netlink attribute name
+ */
 static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
-	[RDMA_NLDEV_ATTR_DEV_INDEX]     = { .type = NLA_U32 },
-	[RDMA_NLDEV_ATTR_DEV_NAME]	= { .type = NLA_NUL_STRING,
-					    .len = IB_DEVICE_NAME_MAX - 1},
-	[RDMA_NLDEV_ATTR_PORT_INDEX]	= { .type = NLA_U32 },
-	[RDMA_NLDEV_ATTR_FW_VERSION]	= { .type = NLA_NUL_STRING,
-					    .len = IB_FW_VERSION_NAME_MAX - 1},
-	[RDMA_NLDEV_ATTR_NODE_GUID]	= { .type = NLA_U64 },
-	[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 },
-	[RDMA_NLDEV_ATTR_SUBNET_PREFIX]	= { .type = NLA_U64 },
-	[RDMA_NLDEV_ATTR_LID]		= { .type = NLA_U32 },
-	[RDMA_NLDEV_ATTR_SM_LID]	= { .type = NLA_U32 },
-	[RDMA_NLDEV_ATTR_LMC]		= { .type = NLA_U8 },
-	[RDMA_NLDEV_ATTR_PORT_STATE]	= { .type = NLA_U8 },
-	[RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 },
-	[RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 },
-	[RDMA_NLDEV_ATTR_RES_SUMMARY]	= { .type = NLA_NESTED },
-	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY]	= { .type = NLA_NESTED },
-	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME] = { .type = NLA_NUL_STRING,
-					     .len = 16 },
-	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR] = { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_CHARDEV]		= { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_CHARDEV_ABI]		= { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_CHARDEV_NAME]		= { .type = NLA_NUL_STRING,
+					.len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+	[RDMA_NLDEV_ATTR_CHARDEV_TYPE]		= { .type = NLA_NUL_STRING,
+					.len = RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE },
+	[RDMA_NLDEV_ATTR_DEV_DIM]               = { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_DEV_INDEX]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_DEV_NAME]		= { .type = NLA_NUL_STRING,
+					.len = IB_DEVICE_NAME_MAX },
+	[RDMA_NLDEV_ATTR_DEV_NODE_TYPE]		= { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_DEV_PROTOCOL]		= { .type = NLA_NUL_STRING,
+					.len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+	[RDMA_NLDEV_ATTR_DRIVER]		= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_DRIVER_ENTRY]		= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE]	= { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_DRIVER_STRING]		= { .type = NLA_NUL_STRING,
+					.len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+	[RDMA_NLDEV_ATTR_DRIVER_S32]		= { .type = NLA_S32 },
+	[RDMA_NLDEV_ATTR_DRIVER_S64]		= { .type = NLA_S64 },
+	[RDMA_NLDEV_ATTR_DRIVER_U32]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_DRIVER_U64]		= { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_FW_VERSION]		= { .type = NLA_NUL_STRING,
+					.len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+	[RDMA_NLDEV_ATTR_LID]			= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_LINK_TYPE]		= { .type = NLA_NUL_STRING,
+					.len = IFNAMSIZ },
+	[RDMA_NLDEV_ATTR_LMC]			= { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_NDEV_INDEX]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_NDEV_NAME]		= { .type = NLA_NUL_STRING,
+					.len = IFNAMSIZ },
+	[RDMA_NLDEV_ATTR_NODE_GUID]		= { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_PORT_INDEX]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_PORT_PHYS_STATE]	= { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_PORT_STATE]		= { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_RES_CM_ID]		= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_RES_CM_IDN]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY]	= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_RES_CQ]		= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_RES_CQE]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_CQN]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_CQ_ENTRY]		= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_RES_CTXN]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_DST_ADDR]		= {
+			.len = sizeof(struct __kernel_sockaddr_storage) },
+	[RDMA_NLDEV_ATTR_RES_IOVA]		= { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_RES_KERN_NAME]		= { .type = NLA_NUL_STRING,
+					.len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+	[RDMA_NLDEV_ATTR_RES_LKEY]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY]	= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_LQPN]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_MR]		= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_RES_MRLEN]		= { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_RES_MRN]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_MR_ENTRY]		= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE]	= { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_RES_PD]		= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_RES_PDN]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_PD_ENTRY]		= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_RES_PID]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_POLL_CTX]		= { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_RES_PS]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_QP]		= { .type = NLA_NESTED },
 	[RDMA_NLDEV_ATTR_RES_QP_ENTRY]		= { .type = NLA_NESTED },
-	[RDMA_NLDEV_ATTR_RES_LQPN]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_RKEY]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_RQPN]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_RQ_PSN]		= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_SQ_PSN]		= { .type = NLA_U32 },
-	[RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 },
-	[RDMA_NLDEV_ATTR_RES_TYPE]		= { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_RES_SRC_ADDR]		= {
+			.len = sizeof(struct __kernel_sockaddr_storage) },
 	[RDMA_NLDEV_ATTR_RES_STATE]		= { .type = NLA_U8 },
-	[RDMA_NLDEV_ATTR_RES_PID]		= { .type = NLA_U32 },
-	[RDMA_NLDEV_ATTR_RES_KERN_NAME]		= { .type = NLA_NUL_STRING,
-						    .len = TASK_COMM_LEN },
-	[RDMA_NLDEV_ATTR_RES_CM_ID]		= { .type = NLA_NESTED },
-	[RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY]	= { .type = NLA_NESTED },
-	[RDMA_NLDEV_ATTR_RES_PS]		= { .type = NLA_U32 },
-	[RDMA_NLDEV_ATTR_RES_SRC_ADDR]	= {
-			.len = sizeof(struct __kernel_sockaddr_storage) },
-	[RDMA_NLDEV_ATTR_RES_DST_ADDR]	= {
-			.len = sizeof(struct __kernel_sockaddr_storage) },
-	[RDMA_NLDEV_ATTR_RES_CQ]		= { .type = NLA_NESTED },
-	[RDMA_NLDEV_ATTR_RES_CQ_ENTRY]		= { .type = NLA_NESTED },
-	[RDMA_NLDEV_ATTR_RES_CQE]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_SUMMARY]		= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY]	= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR]= { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING,
+					.len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+	[RDMA_NLDEV_ATTR_RES_TYPE]		= { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_USECNT]		= { .type = NLA_U64 },
-	[RDMA_NLDEV_ATTR_RES_POLL_CTX]		= { .type = NLA_U8 },
-	[RDMA_NLDEV_ATTR_RES_MR]		= { .type = NLA_NESTED },
-	[RDMA_NLDEV_ATTR_RES_MR_ENTRY]		= { .type = NLA_NESTED },
-	[RDMA_NLDEV_ATTR_RES_RKEY]		= { .type = NLA_U32 },
-	[RDMA_NLDEV_ATTR_RES_LKEY]		= { .type = NLA_U32 },
-	[RDMA_NLDEV_ATTR_RES_IOVA]		= { .type = NLA_U64 },
-	[RDMA_NLDEV_ATTR_RES_MRLEN]		= { .type = NLA_U64 },
-	[RDMA_NLDEV_ATTR_RES_PD]		= { .type = NLA_NESTED },
-	[RDMA_NLDEV_ATTR_RES_PD_ENTRY]		= { .type = NLA_NESTED },
-	[RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY]	= { .type = NLA_U32 },
-	[RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY] = { .type = NLA_U32 },
-	[RDMA_NLDEV_ATTR_NDEV_INDEX]		= { .type = NLA_U32 },
-	[RDMA_NLDEV_ATTR_NDEV_NAME]		= { .type = NLA_NUL_STRING,
-						    .len = IFNAMSIZ },
-	[RDMA_NLDEV_ATTR_DRIVER]		= { .type = NLA_NESTED },
-	[RDMA_NLDEV_ATTR_DRIVER_ENTRY]		= { .type = NLA_NESTED },
-	[RDMA_NLDEV_ATTR_DRIVER_STRING]		= { .type = NLA_NUL_STRING,
-				    .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
-	[RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE]	= { .type = NLA_U8 },
-	[RDMA_NLDEV_ATTR_DRIVER_S32]		= { .type = NLA_S32 },
-	[RDMA_NLDEV_ATTR_DRIVER_U32]		= { .type = NLA_U32 },
-	[RDMA_NLDEV_ATTR_DRIVER_S64]		= { .type = NLA_S64 },
-	[RDMA_NLDEV_ATTR_DRIVER_U64]		= { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_SM_LID]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_SUBNET_PREFIX]		= { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]	= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_STAT_MODE]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_STAT_RES]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_STAT_COUNTER]		= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY]	= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]       = { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]       = { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY]  = { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME] = { .type = NLA_NUL_STRING },
+	[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE] = { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID]	= { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID]	= { .type = NLA_U32 },
+	[RDMA_NLDEV_NET_NS_FD]			= { .type = NLA_U32 },
+	[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]	= { .type = NLA_U8 },
 };
 
 static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
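The policy table is re-sorted by attribute name and picks up the new chardev, DIM, statistics and namespace attributes; every handler below validates requests against it through nlmsg_parse_deprecated(), which keeps the pre-strict-validation behaviour for this existing uAPI. A minimal sketch of a handler consuming one of the new attributes (example_handle_dim() is illustrative):

static int example_handle_dim(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
	int err;

	err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
				     nldev_policy, extack);
	if (err || !tb[RDMA_NLDEV_ATTR_DEV_DIM])
		return -EINVAL;

	pr_debug("requested DIM setting: %u\n",
		 nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_DIM]));
	return 0;
}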
@@ -179,7 +213,8 @@
 {
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index))
 		return -EMSGSIZE;
-	if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name))
+	if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME,
+			   dev_name(&device->dev)))
 		return -EMSGSIZE;
 
 	return 0;
@@ -188,6 +223,8 @@
 static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)
 {
 	char fw[IB_FW_VERSION_NAME_MAX];
+	int ret = 0;
+	u8 port;
 
 	if (fill_nldev_handle(msg, device))
 		return -EMSGSIZE;
@@ -216,7 +253,27 @@
 		return -EMSGSIZE;
 	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type))
 		return -EMSGSIZE;
-	return 0;
+	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, device->use_cq_dim))
+		return -EMSGSIZE;
+
+	/*
+	 * The link type is determined from the first port. An mlx4 device,
+	 * which can potentially expose two different link types on the same
+	 * IB device, is an exception that is better avoided in the future.
+	 */
+	port = rdma_start_port(device);
+	if (rdma_cap_opa_mad(device, port))
+		ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "opa");
+	else if (rdma_protocol_ib(device, port))
+		ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "ib");
+	else if (rdma_protocol_iwarp(device, port))
+		ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "iw");
+	else if (rdma_protocol_roce(device, port))
+		ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "roce");
+	else if (rdma_protocol_usnic(device, port))
+		ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL,
+				     "usnic");
+	return ret;
 }
 
 static int fill_port_info(struct sk_buff *msg,
@@ -226,6 +283,7 @@
 	struct net_device *netdev = NULL;
 	struct ib_port_attr attr;
 	int ret;
+	u64 cap_flags = 0;
 
 	if (fill_nldev_handle(msg, device))
 		return -EMSGSIZE;
@@ -238,10 +296,12 @@
 		return ret;
 
 	if (rdma_protocol_ib(device, port)) {
-		BUILD_BUG_ON(sizeof(attr.port_cap_flags) > sizeof(u64));
+		BUILD_BUG_ON((sizeof(attr.port_cap_flags) +
+				sizeof(attr.port_cap_flags2)) > sizeof(u64));
+		cap_flags = attr.port_cap_flags |
+			((u64)attr.port_cap_flags2 << 32);
 		if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS,
-				      (u64)attr.port_cap_flags,
-				      RDMA_NLDEV_ATTR_PAD))
+				      cap_flags, RDMA_NLDEV_ATTR_PAD))
 			return -EMSGSIZE;
 		if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX,
 				      attr.subnet_prefix, RDMA_NLDEV_ATTR_PAD))
@@ -258,9 +318,7 @@
 	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state))
 		return -EMSGSIZE;
 
-	if (device->get_netdev)
-		netdev = device->get_netdev(device, port);
-
+	netdev = ib_device_get_netdev(device, port);
 	if (netdev && net_eq(dev_net(netdev), net)) {
 		ret = nla_put_u32(msg,
 				  RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex);
@@ -281,7 +339,8 @@
 {
 	struct nlattr *entry_attr;
 
-	entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY);
+	entry_attr = nla_nest_start_noflag(msg,
+					   RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY);
 	if (!entry_attr)
 		return -EMSGSIZE;
 
@@ -307,23 +366,23 @@
 		[RDMA_RESTRACK_QP] = "qp",
 		[RDMA_RESTRACK_CM_ID] = "cm_id",
 		[RDMA_RESTRACK_MR] = "mr",
+		[RDMA_RESTRACK_CTX] = "ctx",
 	};
 
-	struct rdma_restrack_root *res = &device->res;
 	struct nlattr *table_attr;
 	int ret, i, curr;
 
 	if (fill_nldev_handle(msg, device))
 		return -EMSGSIZE;
 
-	table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY);
+	table_attr = nla_nest_start_noflag(msg, RDMA_NLDEV_ATTR_RES_SUMMARY);
 	if (!table_attr)
 		return -EMSGSIZE;
 
 	for (i = 0; i < RDMA_RESTRACK_MAX; i++) {
 		if (!names[i])
 			continue;
-		curr = rdma_restrack_count(res, i, task_active_pid_ns(current));
+		curr = rdma_restrack_count(device, i);
 		ret = fill_res_info_entry(msg, names[i], curr);
 		if (ret)
 			goto err;
@@ -356,13 +415,20 @@
 	return 0;
 }
 
-static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb,
+static bool fill_res_entry(struct ib_device *dev, struct sk_buff *msg,
+			   struct rdma_restrack_entry *res)
+{
+	if (!dev->ops.fill_res_entry)
+		return false;
+	return dev->ops.fill_res_entry(msg, res);
+}
+
+static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin,
 			     struct rdma_restrack_entry *res, uint32_t port)
 {
 	struct ib_qp *qp = container_of(res, struct ib_qp, res);
-	struct rdma_restrack_root *resroot = &qp->device->res;
+	struct ib_device *dev = qp->device;
 	struct ib_qp_init_attr qp_init_attr;
-	struct nlattr *entry_attr;
 	struct ib_qp_attr qp_attr;
 	int ret;
 
@@ -371,11 +437,7 @@
 		return ret;
 
 	if (port && port != qp_attr.port_num)
-		return 0;
-
-	entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY);
-	if (!entry_attr)
-		goto out;
+		return -EAGAIN;
 
 	/* In create_qp() port is not set yet */
 	if (qp_attr.port_num &&
@@ -407,38 +469,32 @@
 	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state))
 		goto err;
 
+	if (!rdma_is_kernel_res(res) &&
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id))
+		goto err;
+
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
-	if (resroot->fill_res_entry(msg, res))
+	if (fill_res_entry(dev, msg, res))
 		goto err;
 
-	nla_nest_end(msg, entry_attr);
 	return 0;
 
-err:
-	nla_nest_cancel(msg, entry_attr);
-out:
-	return -EMSGSIZE;
+err:	return -EMSGSIZE;
 }
 
-static int fill_res_cm_id_entry(struct sk_buff *msg,
-				struct netlink_callback *cb,
+static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin,
 				struct rdma_restrack_entry *res, uint32_t port)
 {
 	struct rdma_id_private *id_priv =
 				container_of(res, struct rdma_id_private, res);
-	struct rdma_restrack_root *resroot = &id_priv->id.device->res;
+	struct ib_device *dev = id_priv->id.device;
 	struct rdma_cm_id *cm_id = &id_priv->id;
-	struct nlattr *entry_attr;
 
 	if (port && port != cm_id->port_num)
 		return 0;
 
-	entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY);
-	if (!entry_attr)
-		goto out;
-
 	if (cm_id->port_num &&
 	    nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num))
 		goto err;
@@ -467,31 +523,25 @@
 		    &cm_id->route.addr.dst_addr))
 		goto err;
 
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CM_IDN, res->id))
+		goto err;
+
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
-	if (resroot->fill_res_entry(msg, res))
+	if (fill_res_entry(dev, msg, res))
 		goto err;
 
-	nla_nest_end(msg, entry_attr);
 	return 0;
 
-err:
-	nla_nest_cancel(msg, entry_attr);
-out:
-	return -EMSGSIZE;
+err: return -EMSGSIZE;
 }
 
-static int fill_res_cq_entry(struct sk_buff *msg, struct netlink_callback *cb,
+static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin,
 			     struct rdma_restrack_entry *res, uint32_t port)
 {
 	struct ib_cq *cq = container_of(res, struct ib_cq, res);
-	struct rdma_restrack_root *resroot = &cq->device->res;
-	struct nlattr *entry_attr;
-
-	entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CQ_ENTRY);
-	if (!entry_attr)
-		goto out;
+	struct ib_device *dev = cq->device;
 
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe))
 		goto err;
@@ -504,33 +554,34 @@
 	    nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx))
 		goto err;
 
+	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, (cq->dim != NULL)))
+		goto err;
+
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id))
+		goto err;
+	if (!rdma_is_kernel_res(res) &&
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN,
+			cq->uobject->context->res.id))
+		goto err;
+
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
-	if (resroot->fill_res_entry(msg, res))
+	if (fill_res_entry(dev, msg, res))
 		goto err;
 
-	nla_nest_end(msg, entry_attr);
 	return 0;
 
-err:
-	nla_nest_cancel(msg, entry_attr);
-out:
-	return -EMSGSIZE;
+err:	return -EMSGSIZE;
 }
 
-static int fill_res_mr_entry(struct sk_buff *msg, struct netlink_callback *cb,
+static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin,
 			     struct rdma_restrack_entry *res, uint32_t port)
 {
 	struct ib_mr *mr = container_of(res, struct ib_mr, res);
-	struct rdma_restrack_root *resroot = &mr->pd->device->res;
-	struct nlattr *entry_attr;
+	struct ib_device *dev = mr->pd->device;
 
-	entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_MR_ENTRY);
-	if (!entry_attr)
-		goto out;
-
-	if (netlink_capable(cb->skb, CAP_NET_ADMIN)) {
+	if (has_cap_net_admin) {
 		if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RKEY, mr->rkey))
 			goto err;
 		if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey))
@@ -541,33 +592,31 @@
 			      RDMA_NLDEV_ATTR_PAD))
 		goto err;
 
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id))
+		goto err;
+
+	if (!rdma_is_kernel_res(res) &&
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id))
+		goto err;
+
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
-	if (resroot->fill_res_entry(msg, res))
+	if (fill_res_entry(dev, msg, res))
 		goto err;
 
-	nla_nest_end(msg, entry_attr);
 	return 0;
 
-err:
-	nla_nest_cancel(msg, entry_attr);
-out:
-	return -EMSGSIZE;
+err:	return -EMSGSIZE;
 }
 
-static int fill_res_pd_entry(struct sk_buff *msg, struct netlink_callback *cb,
+static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin,
 			     struct rdma_restrack_entry *res, uint32_t port)
 {
 	struct ib_pd *pd = container_of(res, struct ib_pd, res);
-	struct rdma_restrack_root *resroot = &pd->device->res;
-	struct nlattr *entry_attr;
+	struct ib_device *dev = pd->device;
 
-	entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_PD_ENTRY);
-	if (!entry_attr)
-		goto out;
-
-	if (netlink_capable(cb->skb, CAP_NET_ADMIN)) {
+	if (has_cap_net_admin) {
 		if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY,
 				pd->local_dma_lkey))
 			goto err;
@@ -579,15 +628,51 @@
 	if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT,
 			      atomic_read(&pd->usecnt), RDMA_NLDEV_ATTR_PAD))
 		goto err;
-	if ((pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) &&
-	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY,
-			pd->unsafe_global_rkey))
+
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id))
+		goto err;
+
+	if (!rdma_is_kernel_res(res) &&
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN,
+			pd->uobject->context->res.id))
 		goto err;
 
 	if (fill_res_name_pid(msg, res))
 		goto err;
 
-	if (resroot->fill_res_entry(msg, res))
+	if (fill_res_entry(dev, msg, res))
+		goto err;
+
+	return 0;
+
+err:	return -EMSGSIZE;
+}
+
+static int fill_stat_counter_mode(struct sk_buff *msg,
+				  struct rdma_counter *counter)
+{
+	struct rdma_counter_mode *m = &counter->mode;
+
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, m->mode))
+		return -EMSGSIZE;
+
+	if (m->mode == RDMA_COUNTER_MODE_AUTO)
+		if ((m->mask & RDMA_COUNTER_MASK_QP_TYPE) &&
+		    nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, m->param.qp_type))
+			return -EMSGSIZE;
+
+	return 0;
+}
+
+static int fill_stat_counter_qp_entry(struct sk_buff *msg, u32 qpn)
+{
+	struct nlattr *entry_attr;
+
+	entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY);
+	if (!entry_attr)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn))
 		goto err;
 
 	nla_nest_end(msg, entry_attr);
@@ -595,10 +680,120 @@
 
 err:
 	nla_nest_cancel(msg, entry_attr);
-out:
 	return -EMSGSIZE;
 }
 
+static int fill_stat_counter_qps(struct sk_buff *msg,
+				 struct rdma_counter *counter)
+{
+	struct rdma_restrack_entry *res;
+	struct rdma_restrack_root *rt;
+	struct nlattr *table_attr;
+	struct ib_qp *qp = NULL;
+	unsigned long id = 0;
+	int ret = 0;
+
+	table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP);
+
+	rt = &counter->device->res[RDMA_RESTRACK_QP];
+	xa_lock(&rt->xa);
+	xa_for_each(&rt->xa, id, res) {
+		if (!rdma_is_visible_in_pid_ns(res))
+			continue;
+
+		qp = container_of(res, struct ib_qp, res);
+		if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+			continue;
+
+		if (!qp->counter || (qp->counter->id != counter->id))
+			continue;
+
+		ret = fill_stat_counter_qp_entry(msg, qp->qp_num);
+		if (ret)
+			goto err;
+	}
+
+	xa_unlock(&rt->xa);
+	nla_nest_end(msg, table_attr);
+	return 0;
+
+err:
+	xa_unlock(&rt->xa);
+	nla_nest_cancel(msg, table_attr);
+	return ret;
+}
+
+static int fill_stat_hwcounter_entry(struct sk_buff *msg,
+				     const char *name, u64 value)
+{
+	struct nlattr *entry_attr;
+
+	entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY);
+	if (!entry_attr)
+		return -EMSGSIZE;
+
+	if (nla_put_string(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME,
+			   name))
+		goto err;
+	if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE,
+			      value, RDMA_NLDEV_ATTR_PAD))
+		goto err;
+
+	nla_nest_end(msg, entry_attr);
+	return 0;
+
+err:
+	nla_nest_cancel(msg, entry_attr);
+	return -EMSGSIZE;
+}
+
+static int fill_stat_counter_hwcounters(struct sk_buff *msg,
+					struct rdma_counter *counter)
+{
+	struct rdma_hw_stats *st = counter->stats;
+	struct nlattr *table_attr;
+	int i;
+
+	table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS);
+	if (!table_attr)
+		return -EMSGSIZE;
+
+	for (i = 0; i < st->num_counters; i++)
+		if (fill_stat_hwcounter_entry(msg, st->names[i], st->value[i]))
+			goto err;
+
+	nla_nest_end(msg, table_attr);
+	return 0;
+
+err:
+	nla_nest_cancel(msg, table_attr);
+	return -EMSGSIZE;
+}
+
+static int fill_res_counter_entry(struct sk_buff *msg, bool has_cap_net_admin,
+				  struct rdma_restrack_entry *res,
+				  uint32_t port)
+{
+	struct rdma_counter *counter =
+		container_of(res, struct rdma_counter, res);
+
+	if (port && port != counter->port)
+		return -EAGAIN;
+
+	/* Dump it even if the query failed */
+	rdma_counter_query_stats(counter);
+
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, counter->port) ||
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, counter->id) ||
+	    fill_res_name_pid(msg, &counter->res) ||
+	    fill_stat_counter_mode(msg, counter) ||
+	    fill_stat_counter_qps(msg, counter) ||
+	    fill_stat_counter_hwcounters(msg, counter))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
 static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 			  struct netlink_ext_ack *extack)
 {
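The new fill_stat_*() helpers above all follow the same nest/put/cancel idiom: open a nested attribute, add its members, then either close the nest or cancel it so a partially built entry never reaches userspace. Nests introduced here use nla_nest_start() (which sets NLA_F_NESTED), while the pre-existing uAPI nests in this file keep their wire format through nla_nest_start_noflag(). A compact sketch of the idiom (example_put_named_u64() is illustrative):

static int example_put_named_u64(struct sk_buff *msg, const char *name, u64 val)
{
	struct nlattr *entry;

	entry = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY);
	if (!entry)
		return -EMSGSIZE;

	if (nla_put_string(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, name) ||
	    nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE,
			      val, RDMA_NLDEV_ATTR_PAD)) {
		nla_nest_cancel(msg, entry);	/* roll back the partial entry */
		return -EMSGSIZE;
	}

	nla_nest_end(msg, entry);
	return 0;
}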
@@ -608,14 +803,14 @@
 	u32 index;
 	int err;
 
-	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
-			  nldev_policy, extack);
+	err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+				     nldev_policy, extack);
 	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
 		return -EINVAL;
 
 	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
 
-	device = ib_device_get_by_index(index);
+	device = ib_device_get_by_index(sock_net(skb->sk), index);
 	if (!device)
 		return -EINVAL;
 
@@ -635,13 +830,62 @@
 
 	nlmsg_end(msg, nlh);
 
-	put_device(&device->dev);
-	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+	ib_device_put(device);
+	return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
 
 err_free:
 	nlmsg_free(msg);
 err:
-	put_device(&device->dev);
+	ib_device_put(device);
+	return err;
+}
+
+static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+			  struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct ib_device *device;
+	u32 index;
+	int err;
+
+	err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+				     nldev_policy, extack);
+	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(sock_net(skb->sk), index);
+	if (!device)
+		return -EINVAL;
+
+	if (tb[RDMA_NLDEV_ATTR_DEV_NAME]) {
+		char name[IB_DEVICE_NAME_MAX] = {};
+
+		nla_strlcpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
+			    IB_DEVICE_NAME_MAX);
+		err = ib_device_rename(device, name);
+		goto done;
+	}
+
+	if (tb[RDMA_NLDEV_NET_NS_FD]) {
+		u32 ns_fd;
+
+		ns_fd = nla_get_u32(tb[RDMA_NLDEV_NET_NS_FD]);
+		err = ib_device_set_netns_put(skb, device, ns_fd);
+		goto put_done;
+	}
+
+	if (tb[RDMA_NLDEV_ATTR_DEV_DIM]) {
+		u8 use_dim;
+
+		use_dim = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_DIM]);
+		err = ib_device_set_dim(device,  use_dim);
+		goto done;
+	}
+
+done:
+	ib_device_put(device);
+put_done:
 	return err;
 }
 
@@ -677,7 +921,7 @@
 {
 	/*
 	 * There is no need to take lock, because
-	 * we are relying on ib_core's lists_rwsem
+	 * we are relying on ib_core's locking.
 	 */
 	return ib_enum_all_devs(_nldev_get_dumpit, skb, cb);
 }
@@ -692,15 +936,15 @@
 	u32 port;
 	int err;
 
-	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
-			  nldev_policy, extack);
+	err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+				     nldev_policy, extack);
 	if (err ||
 	    !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
 	    !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
 		return -EINVAL;
 
 	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
-	device = ib_device_get_by_index(index);
+	device = ib_device_get_by_index(sock_net(skb->sk), index);
 	if (!device)
 		return -EINVAL;
 
@@ -725,14 +969,14 @@
 		goto err_free;
 
 	nlmsg_end(msg, nlh);
-	put_device(&device->dev);
+	ib_device_put(device);
 
-	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+	return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
 
 err_free:
 	nlmsg_free(msg);
 err:
-	put_device(&device->dev);
+	ib_device_put(device);
 	return err;
 }
 
@@ -746,19 +990,19 @@
 	u32 idx = 0;
 	u32 ifindex;
 	int err;
-	u32 p;
+	unsigned int p;
 
-	err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
-			  nldev_policy, NULL);
+	err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+				     nldev_policy, NULL);
 	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
 		return -EINVAL;
 
 	ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
-	device = ib_device_get_by_index(ifindex);
+	device = ib_device_get_by_index(sock_net(skb->sk), ifindex);
 	if (!device)
 		return -EINVAL;
 
-	for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
+	rdma_for_each_port (device, p) {
 		/*
 		 * The dumpit function returns all information from specific
 		 * index. This specific index is taken from the netlink
@@ -789,7 +1033,7 @@
 	}
 
 out:
-	put_device(&device->dev);
+	ib_device_put(device);
 	cb->args[0] = idx;
 	return skb->len;
 }
@@ -803,13 +1047,13 @@
 	u32 index;
 	int ret;
 
-	ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
-			  nldev_policy, extack);
+	ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+				     nldev_policy, extack);
 	if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
 		return -EINVAL;
 
 	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
-	device = ib_device_get_by_index(index);
+	device = ib_device_get_by_index(sock_net(skb->sk), index);
 	if (!device)
 		return -EINVAL;
 
@@ -828,13 +1072,13 @@
 		goto err_free;
 
 	nlmsg_end(msg, nlh);
-	put_device(&device->dev);
-	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+	ib_device_put(device);
+	return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
 
 err_free:
 	nlmsg_free(msg);
 err:
-	put_device(&device->dev);
+	ib_device_put(device);
 	return ret;
 }
 
@@ -857,7 +1101,6 @@
 		nlmsg_cancel(skb, nlh);
 		goto out;
 	}
-
 	nlmsg_end(skb, nlh);
 
 	idx++;
@@ -874,10 +1117,17 @@
 }
 
 struct nldev_fill_res_entry {
-	int (*fill_res_func)(struct sk_buff *msg, struct netlink_callback *cb,
+	int (*fill_res_func)(struct sk_buff *msg, bool has_cap_net_admin,
 			     struct rdma_restrack_entry *res, u32 port);
 	enum rdma_nldev_attr nldev_attr;
 	enum rdma_nldev_command nldev_cmd;
+	u8 flags;
+	u32 entry;
+	u32 id;
+};
+
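+/* NLDEV_PER_DEV: the resource is addressed per device; a port index must not be given for it */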
+enum nldev_res_flags {
+	NLDEV_PER_DEV = 1 << 0,
 };
 
 static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
@@ -885,29 +1135,132 @@
 		.fill_res_func = fill_res_qp_entry,
 		.nldev_cmd = RDMA_NLDEV_CMD_RES_QP_GET,
 		.nldev_attr = RDMA_NLDEV_ATTR_RES_QP,
+		.entry = RDMA_NLDEV_ATTR_RES_QP_ENTRY,
+		.id = RDMA_NLDEV_ATTR_RES_LQPN,
 	},
 	[RDMA_RESTRACK_CM_ID] = {
 		.fill_res_func = fill_res_cm_id_entry,
 		.nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET,
 		.nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID,
+		.entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY,
+		.id = RDMA_NLDEV_ATTR_RES_CM_IDN,
 	},
 	[RDMA_RESTRACK_CQ] = {
 		.fill_res_func = fill_res_cq_entry,
 		.nldev_cmd = RDMA_NLDEV_CMD_RES_CQ_GET,
 		.nldev_attr = RDMA_NLDEV_ATTR_RES_CQ,
+		.flags = NLDEV_PER_DEV,
+		.entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY,
+		.id = RDMA_NLDEV_ATTR_RES_CQN,
 	},
 	[RDMA_RESTRACK_MR] = {
 		.fill_res_func = fill_res_mr_entry,
 		.nldev_cmd = RDMA_NLDEV_CMD_RES_MR_GET,
 		.nldev_attr = RDMA_NLDEV_ATTR_RES_MR,
+		.flags = NLDEV_PER_DEV,
+		.entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY,
+		.id = RDMA_NLDEV_ATTR_RES_MRN,
 	},
 	[RDMA_RESTRACK_PD] = {
 		.fill_res_func = fill_res_pd_entry,
 		.nldev_cmd = RDMA_NLDEV_CMD_RES_PD_GET,
 		.nldev_attr = RDMA_NLDEV_ATTR_RES_PD,
+		.flags = NLDEV_PER_DEV,
+		.entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY,
+		.id = RDMA_NLDEV_ATTR_RES_PDN,
+	},
+	[RDMA_RESTRACK_COUNTER] = {
+		.fill_res_func = fill_res_counter_entry,
+		.nldev_cmd = RDMA_NLDEV_CMD_STAT_GET,
+		.nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER,
+		.entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY,
+		.id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID,
 	},
 };
 
+static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+			       struct netlink_ext_ack *extack,
+			       enum rdma_restrack_type res_type)
+{
+	const struct nldev_fill_res_entry *fe = &fill_entries[res_type];
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct rdma_restrack_entry *res;
+	struct ib_device *device;
+	u32 index, id, port = 0;
+	bool has_cap_net_admin;
+	struct sk_buff *msg;
+	int ret;
+
+	ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+				     nldev_policy, extack);
+	if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !fe->id || !tb[fe->id])
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(sock_net(skb->sk), index);
+	if (!device)
+		return -EINVAL;
+
+	if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) {
+		port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+		if (!rdma_is_port_valid(device, port)) {
+			ret = -EINVAL;
+			goto err;
+		}
+	}
+
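+	/* Per-device resources must be queried without a port, per-port ones require one */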
+	if ((port && fe->flags & NLDEV_PER_DEV) ||
+	    (!port && ~fe->flags & NLDEV_PER_DEV)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	id = nla_get_u32(tb[fe->id]);
+	res = rdma_restrack_get_byid(device, res_type, id);
+	if (IS_ERR(res)) {
+		ret = PTR_ERR(res);
+		goto err;
+	}
+
+	if (!rdma_is_visible_in_pid_ns(res)) {
+		ret = -ENOENT;
+		goto err_get;
+	}
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg) {
+		ret = -ENOMEM;
+		goto err_get;
+	}
+
+	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, fe->nldev_cmd),
+			0, 0);
+
+	if (fill_nldev_handle(msg, device)) {
+		ret = -EMSGSIZE;
+		goto err_free;
+	}
+
+	has_cap_net_admin = netlink_capable(skb, CAP_NET_ADMIN);
+	ret = fe->fill_res_func(msg, has_cap_net_admin, res, port);
+	rdma_restrack_put(res);
+	if (ret)
+		goto err_free;
+
+	nlmsg_end(msg, nlh);
+	ib_device_put(device);
+	return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+
+err_free:
+	nlmsg_free(msg);
+err_get:
+	rdma_restrack_put(res);
+err:
+	ib_device_put(device);
+	return ret;
+}
+
 static int res_get_common_dumpit(struct sk_buff *skb,
 				 struct netlink_callback *cb,
 				 enum rdma_restrack_type res_type)
@@ -915,16 +1268,20 @@
 	const struct nldev_fill_res_entry *fe = &fill_entries[res_type];
 	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
 	struct rdma_restrack_entry *res;
+	struct rdma_restrack_root *rt;
 	int err, ret = 0, idx = 0;
 	struct nlattr *table_attr;
+	struct nlattr *entry_attr;
 	struct ib_device *device;
 	int start = cb->args[0];
+	bool has_cap_net_admin;
 	struct nlmsghdr *nlh;
+	unsigned long id;
 	u32 index, port = 0;
 	bool filled = false;
 
-	err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
-			  nldev_policy, NULL);
+	err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+				     nldev_policy, NULL);
 	/*
 	 * Right now, we are expecting the device index to get res information,
 	 * but it is possible to extend this code to return all devices in
@@ -937,7 +1294,7 @@
 		return -EINVAL;
 
 	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
-	device = ib_device_get_by_index(index);
+	device = ib_device_get_by_index(sock_net(skb->sk), index);
 	if (!device)
 		return -EINVAL;
 
@@ -961,61 +1318,57 @@
 		goto err;
 	}
 
-	table_attr = nla_nest_start(skb, fe->nldev_attr);
+	table_attr = nla_nest_start_noflag(skb, fe->nldev_attr);
 	if (!table_attr) {
 		ret = -EMSGSIZE;
 		goto err;
 	}
 
-	down_read(&device->res.rwsem);
-	hash_for_each_possible(device->res.hash, res, node, res_type) {
-		if (idx < start)
+	has_cap_net_admin = netlink_capable(cb->skb, CAP_NET_ADMIN);
+
+	rt = &device->res[res_type];
+	xa_lock(&rt->xa);
+	/*
+	 * FIXME: if skipping ahead turns out to be common, this loop should
+	 * use xas_for_each & xas_pause to optimize, since we can have a lot
+	 * of objects.
+	 */
+	xa_for_each(&rt->xa, id, res) {
+		if (!rdma_is_visible_in_pid_ns(res))
+			continue;
+
+		if (idx < start || !rdma_restrack_get(res))
 			goto next;
 
-		if ((rdma_is_kernel_res(res) &&
-		     task_active_pid_ns(current) != &init_pid_ns) ||
-		    (!rdma_is_kernel_res(res) && task_active_pid_ns(current) !=
-		     task_active_pid_ns(res->task)))
-			/*
-			 * 1. Kern resources should be visible in init
-			 *    namspace only
-			 * 2. Present only resources visible in the current
-			 *    namespace
-			 */
-			goto next;
-
-		if (!rdma_restrack_get(res))
-			/*
-			 * Resource is under release now, but we are not
-			 * relesing lock now, so it will be released in
-			 * our next pass, once we will get ->next pointer.
-			 */
-			goto next;
+		xa_unlock(&rt->xa);
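+		/* The fill callbacks may sleep, so the xarray spinlock stays dropped here and is re-taken at "again" */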
 
 		filled = true;
 
-		up_read(&device->res.rwsem);
-		ret = fe->fill_res_func(skb, cb, res, port);
-		down_read(&device->res.rwsem);
-		/*
-		 * Return resource back, but it won't be released till
-		 * the &device->res.rwsem will be released for write.
-		 */
+		entry_attr = nla_nest_start_noflag(skb, fe->entry);
+		if (!entry_attr) {
+			ret = -EMSGSIZE;
+			rdma_restrack_put(res);
+			goto msg_full;
+		}
+
+		ret = fe->fill_res_func(skb, has_cap_net_admin, res, port);
 		rdma_restrack_put(res);
 
-		if (ret == -EMSGSIZE)
-			/*
-			 * There is a chance to optimize here.
-			 * It can be done by using list_prepare_entry
-			 * and list_for_each_entry_continue afterwards.
-			 */
-			break;
-		if (ret)
+		if (ret) {
+			nla_nest_cancel(skb, entry_attr);
+			if (ret == -EMSGSIZE)
+				goto msg_full;
+			if (ret == -EAGAIN)
+				goto again;
 			goto res_err;
+		}
+		nla_nest_end(skb, entry_attr);
+again:		xa_lock(&rt->xa);
 next:		idx++;
 	}
-	up_read(&device->res.rwsem);
+	xa_unlock(&rt->xa);
 
+msg_full:
 	nla_nest_end(skb, table_attr);
 	nlmsg_end(skb, nlh);
 	cb->args[0] = idx;
@@ -1027,49 +1380,646 @@
 	if (!filled)
 		goto err;
 
-	put_device(&device->dev);
+	ib_device_put(device);
 	return skb->len;
 
 res_err:
 	nla_nest_cancel(skb, table_attr);
-	up_read(&device->res.rwsem);
 
 err:
 	nlmsg_cancel(skb, nlh);
 
 err_index:
-	put_device(&device->dev);
+	ib_device_put(device);
 	return ret;
 }
 
-static int nldev_res_get_qp_dumpit(struct sk_buff *skb,
-				   struct netlink_callback *cb)
+#define RES_GET_FUNCS(name, type)                                              \
+	static int nldev_res_get_##name##_dumpit(struct sk_buff *skb,          \
+						 struct netlink_callback *cb)  \
+	{                                                                      \
+		return res_get_common_dumpit(skb, cb, type);                   \
+	}                                                                      \
+	static int nldev_res_get_##name##_doit(struct sk_buff *skb,            \
+					       struct nlmsghdr *nlh,           \
+					       struct netlink_ext_ack *extack) \
+	{                                                                      \
+		return res_get_common_doit(skb, nlh, extack, type);            \
+	}
+
+RES_GET_FUNCS(qp, RDMA_RESTRACK_QP);
+RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID);
+RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ);
+RES_GET_FUNCS(pd, RDMA_RESTRACK_PD);
+RES_GET_FUNCS(mr, RDMA_RESTRACK_MR);
+RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER);
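+
+/*
+ * The RES_GET_FUNCS() expansions above provide the per-resource doit/dumpit
+ * wrappers referenced from nldev_cb_table below.
+ */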
+
+static LIST_HEAD(link_ops);
+static DECLARE_RWSEM(link_ops_rwsem);
+
+static const struct rdma_link_ops *link_ops_get(const char *type)
 {
-	return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_QP);
+	const struct rdma_link_ops *ops;
+
+	list_for_each_entry(ops, &link_ops, list) {
+		if (!strcmp(ops->type, type))
+			goto out;
+	}
+	ops = NULL;
+out:
+	return ops;
 }
 
-static int nldev_res_get_cm_id_dumpit(struct sk_buff *skb,
-				      struct netlink_callback *cb)
+void rdma_link_register(struct rdma_link_ops *ops)
 {
-	return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CM_ID);
+	down_write(&link_ops_rwsem);
+	if (WARN_ON_ONCE(link_ops_get(ops->type)))
+		goto out;
+	list_add(&ops->list, &link_ops);
+out:
+	up_write(&link_ops_rwsem);
+}
+EXPORT_SYMBOL(rdma_link_register);
+
+void rdma_link_unregister(struct rdma_link_ops *ops)
+{
+	down_write(&link_ops_rwsem);
+	list_del(&ops->list);
+	up_write(&link_ops_rwsem);
+}
+EXPORT_SYMBOL(rdma_link_unregister);
+
+static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+			  struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	char ibdev_name[IB_DEVICE_NAME_MAX];
+	const struct rdma_link_ops *ops;
+	char ndev_name[IFNAMSIZ];
+	struct net_device *ndev;
+	char type[IFNAMSIZ];
+	int err;
+
+	err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+				     nldev_policy, extack);
+	if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
+	    !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME])
+		return -EINVAL;
+
+	nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
+		    sizeof(ibdev_name));
+	if (strchr(ibdev_name, '%'))
+		return -EINVAL;
+
+	nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type));
+	nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME],
+		    sizeof(ndev_name));
+
+	ndev = dev_get_by_name(sock_net(skb->sk), ndev_name);
+	if (!ndev)
+		return -ENODEV;
+
+	down_read(&link_ops_rwsem);
+	ops = link_ops_get(type);
+#ifdef CONFIG_MODULES
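+	/* Unknown type: drop the lock, try to load a "rdma-link-<type>" module, then retry the lookup */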
+	if (!ops) {
+		up_read(&link_ops_rwsem);
+		request_module("rdma-link-%s", type);
+		down_read(&link_ops_rwsem);
+		ops = link_ops_get(type);
+	}
+#endif
+	err = ops ? ops->newlink(ibdev_name, ndev) : -EINVAL;
+	up_read(&link_ops_rwsem);
+	dev_put(ndev);
+
+	return err;
 }
 
-static int nldev_res_get_cq_dumpit(struct sk_buff *skb,
-				   struct netlink_callback *cb)
+static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
+			  struct netlink_ext_ack *extack)
 {
-	return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CQ);
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct ib_device *device;
+	u32 index;
+	int err;
+
+	err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+				     nldev_policy, extack);
+	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(sock_net(skb->sk), index);
+	if (!device)
+		return -EINVAL;
+
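+	/* Only devices whose driver opted in to user-requested unregistration may be removed */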
+	if (!(device->attrs.device_cap_flags & IB_DEVICE_ALLOW_USER_UNREG)) {
+		ib_device_put(device);
+		return -EINVAL;
+	}
+
+	ib_unregister_device_and_put(device);
+	return 0;
 }
 
-static int nldev_res_get_mr_dumpit(struct sk_buff *skb,
-				   struct netlink_callback *cb)
+static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh,
+			     struct netlink_ext_ack *extack)
 {
-	return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_MR);
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	char client_name[RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE];
+	struct ib_client_nl_info data = {};
+	struct ib_device *ibdev = NULL;
+	struct sk_buff *msg;
+	u32 index;
+	int err;
+
+	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy,
+			  extack);
+	if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE])
+		return -EINVAL;
+
+	nla_strlcpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE],
+		    sizeof(client_name));
+
+	if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) {
+		index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+		ibdev = ib_device_get_by_index(sock_net(skb->sk), index);
+		if (!ibdev)
+			return -EINVAL;
+
+		if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) {
+			data.port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+			if (!rdma_is_port_valid(ibdev, data.port)) {
+				err = -EINVAL;
+				goto out_put;
+			}
+		} else {
+			data.port = -1;
+		}
+	} else if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) {
+		return -EINVAL;
+	}
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg) {
+		err = -ENOMEM;
+		goto out_put;
+	}
+	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+					 RDMA_NLDEV_CMD_GET_CHARDEV),
+			0, 0);
+
+	data.nl_msg = msg;
+	err = ib_get_client_nl_info(ibdev, client_name, &data);
+	if (err)
+		goto out_nlmsg;
+
+	err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV,
+				huge_encode_dev(data.cdev->devt),
+				RDMA_NLDEV_ATTR_PAD);
+	if (err)
+		goto out_data;
+	err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV_ABI, data.abi,
+				RDMA_NLDEV_ATTR_PAD);
+	if (err)
+		goto out_data;
+	if (nla_put_string(msg, RDMA_NLDEV_ATTR_CHARDEV_NAME,
+			   dev_name(data.cdev))) {
+		err = -EMSGSIZE;
+		goto out_data;
+	}
+
+	nlmsg_end(msg, nlh);
+	put_device(data.cdev);
+	if (ibdev)
+		ib_device_put(ibdev);
+	return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+
+out_data:
+	put_device(data.cdev);
+out_nlmsg:
+	nlmsg_free(msg);
+out_put:
+	if (ibdev)
+		ib_device_put(ibdev);
+	return err;
 }
 
-static int nldev_res_get_pd_dumpit(struct sk_buff *skb,
-				   struct netlink_callback *cb)
+static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+			      struct netlink_ext_ack *extack)
 {
-	return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD);
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct sk_buff *msg;
+	int err;
+
+	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	if (err)
+		return err;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+					 RDMA_NLDEV_CMD_SYS_GET),
+			0, 0);
+
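+	/* Report whether RDMA devices are shared across net namespaces (1) or per-namespace (0) */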
+	err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_NETNS_MODE,
+			 (u8)ib_devices_shared_netns);
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+	nlmsg_end(msg, nlh);
+	return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+}
+
+static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+				  struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	u8 enable;
+	int err;
+
+	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	if (err || !tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE])
+		return -EINVAL;
+
+	enable = nla_get_u8(tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]);
+	/* Only 0 and 1 are supported */
+	if (enable > 1)
+		return -EINVAL;
+
+	err = rdma_compatdev_set(enable);
+	return err;
+}
+
+static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+			       struct netlink_ext_ack *extack)
+{
+	u32 index, port, mode, mask = 0, qpn, cntn = 0;
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct ib_device *device;
+	struct sk_buff *msg;
+	int ret;
+
+	ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	/* Currently only QP counters are supported */
+	if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] ||
+	    !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+	    !tb[RDMA_NLDEV_ATTR_PORT_INDEX] || !tb[RDMA_NLDEV_ATTR_STAT_MODE])
+		return -EINVAL;
+
+	if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP)
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(sock_net(skb->sk), index);
+	if (!device)
+		return -EINVAL;
+
+	port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+	if (!rdma_is_port_valid(device, port)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+					 RDMA_NLDEV_CMD_STAT_SET),
+			0, 0);
+
+	mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]);
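+	/* In auto mode QPs are bound to counters automatically by the given mask; otherwise bind the given QP manually */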
+	if (mode == RDMA_COUNTER_MODE_AUTO) {
+		if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK])
+			mask = nla_get_u32(
+				tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]);
+
+		ret = rdma_counter_set_auto_mode(device, port,
+						 mask ? true : false, mask);
+		if (ret)
+			goto err_msg;
+	} else {
+		if (!tb[RDMA_NLDEV_ATTR_RES_LQPN]) {
+			ret = -EINVAL;
+			goto err_msg;
+		}
+		qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]);
+		if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) {
+			cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
+			ret = rdma_counter_bind_qpn(device, port, qpn, cntn);
+		} else {
+			ret = rdma_counter_bind_qpn_alloc(device, port,
+							  qpn, &cntn);
+		}
+		if (ret)
+			goto err_msg;
+
+		if (fill_nldev_handle(msg, device) ||
+		    nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
+		    nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) ||
+		    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) {
+			ret = -EMSGSIZE;
+			goto err_fill;
+		}
+	}
+
+	nlmsg_end(msg, nlh);
+	ib_device_put(device);
+	return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+
+err_fill:
+	rdma_counter_unbind_qpn(device, port, qpn, cntn);
+err_msg:
+	nlmsg_free(msg);
+err:
+	ib_device_put(device);
+	return ret;
+}
+
+static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+			       struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct ib_device *device;
+	struct sk_buff *msg;
+	u32 index, port, qpn, cntn;
+	int ret;
+
+	ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] ||
+	    !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX] ||
+	    !tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID] ||
+	    !tb[RDMA_NLDEV_ATTR_RES_LQPN])
+		return -EINVAL;
+
+	if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP)
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(sock_net(skb->sk), index);
+	if (!device)
+		return -EINVAL;
+
+	port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+	if (!rdma_is_port_valid(device, port)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+					 RDMA_NLDEV_CMD_STAT_SET),
+			0, 0);
+
+	cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
+	qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]);
+	if (fill_nldev_handle(msg, device) ||
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) ||
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) {
+		ret = -EMSGSIZE;
+		goto err_fill;
+	}
+
+	ret = rdma_counter_unbind_qpn(device, port, qpn, cntn);
+	if (ret)
+		goto err_fill;
+
+	nlmsg_end(msg, nlh);
+	ib_device_put(device);
+	return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+
+err_fill:
+	nlmsg_free(msg);
+err:
+	ib_device_put(device);
+	return ret;
+}
+
+static int stat_get_doit_default_counter(struct sk_buff *skb,
+					 struct nlmsghdr *nlh,
+					 struct netlink_ext_ack *extack,
+					 struct nlattr *tb[])
+{
+	struct rdma_hw_stats *stats;
+	struct nlattr *table_attr;
+	struct ib_device *device;
+	int ret, num_cnts, i;
+	struct sk_buff *msg;
+	u32 index, port;
+	u64 v;
+
+	if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(sock_net(skb->sk), index);
+	if (!device)
+		return -EINVAL;
+
+	if (!device->ops.alloc_hw_stats || !device->ops.get_hw_stats) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+	if (!rdma_is_port_valid(device, port)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+					 RDMA_NLDEV_CMD_STAT_GET),
+			0, 0);
+
+	if (fill_nldev_handle(msg, device) ||
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) {
+		ret = -EMSGSIZE;
+		goto err_msg;
+	}
+
+	stats = device->port_data ? device->port_data[port].hw_stats : NULL;
+	if (stats == NULL) {
+		ret = -EINVAL;
+		goto err_msg;
+	}
+	mutex_lock(&stats->lock);
+
+	num_cnts = device->ops.get_hw_stats(device, stats, port, 0);
+	if (num_cnts < 0) {
+		ret = -EINVAL;
+		goto err_stats;
+	}
+
+	table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS);
+	if (!table_attr) {
+		ret = -EMSGSIZE;
+		goto err_stats;
+	}
+	for (i = 0; i < num_cnts; i++) {
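+		/* Each value is the port-wide counter plus the totals held by counters bound on this port */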
+		v = stats->value[i] +
+			rdma_counter_get_hwstat_value(device, port, i);
+		if (fill_stat_hwcounter_entry(msg, stats->names[i], v)) {
+			ret = -EMSGSIZE;
+			goto err_table;
+		}
+	}
+	nla_nest_end(msg, table_attr);
+
+	mutex_unlock(&stats->lock);
+	nlmsg_end(msg, nlh);
+	ib_device_put(device);
+	return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+
+err_table:
+	nla_nest_cancel(msg, table_attr);
+err_stats:
+	mutex_unlock(&stats->lock);
+err_msg:
+	nlmsg_free(msg);
+err:
+	ib_device_put(device);
+	return ret;
+}
+
+static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
+			    struct netlink_ext_ack *extack, struct nlattr *tb[])
+{
+	enum rdma_nl_counter_mode mode;
+	enum rdma_nl_counter_mask mask;
+	struct ib_device *device;
+	struct sk_buff *msg;
+	u32 index, port;
+	int ret;
+
+	if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID])
+		return nldev_res_get_counter_doit(skb, nlh, extack);
+
+	if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] ||
+	    !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(sock_net(skb->sk), index);
+	if (!device)
+		return -EINVAL;
+
+	port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+	if (!rdma_is_port_valid(device, port)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+					 RDMA_NLDEV_CMD_STAT_GET),
+			0, 0);
+
+	ret = rdma_counter_get_mode(device, port, &mode, &mask);
+	if (ret)
+		goto err_msg;
+
+	if (fill_nldev_handle(msg, device) ||
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode)) {
+		ret = -EMSGSIZE;
+		goto err_msg;
+	}
+
+	if ((mode == RDMA_COUNTER_MODE_AUTO) &&
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) {
+		ret = -EMSGSIZE;
+		goto err_msg;
+	}
+
+	nlmsg_end(msg, nlh);
+	ib_device_put(device);
+	return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+
+err_msg:
+	nlmsg_free(msg);
+err:
+	ib_device_put(device);
+	return ret;
+}
+
+static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+			       struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	int ret;
+
+	ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	if (ret)
+		return -EINVAL;
+
+	if (!tb[RDMA_NLDEV_ATTR_STAT_RES])
+		return stat_get_doit_default_counter(skb, nlh, extack, tb);
+
+	switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) {
+	case RDMA_NLDEV_ATTR_RES_QP:
+		ret = stat_get_doit_qp(skb, nlh, extack, tb);
+		break;
+
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static int nldev_stat_get_dumpit(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	int ret;
+
+	ret = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, NULL);
+	if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES])
+		return -EINVAL;
+
+	switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) {
+	case RDMA_NLDEV_ATTR_RES_QP:
+		ret = nldev_res_get_counter_dumpit(skb, cb);
+		break;
+
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
 }
 
 static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
@@ -1077,6 +2027,21 @@
 		.doit = nldev_get_doit,
 		.dump = nldev_get_dumpit,
 	},
+	[RDMA_NLDEV_CMD_GET_CHARDEV] = {
+		.doit = nldev_get_chardev,
+	},
+	[RDMA_NLDEV_CMD_SET] = {
+		.doit = nldev_set_doit,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
+	[RDMA_NLDEV_CMD_NEWLINK] = {
+		.doit = nldev_newlink,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
+	[RDMA_NLDEV_CMD_DELLINK] = {
+		.doit = nldev_dellink,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
 	[RDMA_NLDEV_CMD_PORT_GET] = {
 		.doit = nldev_port_get_doit,
 		.dump = nldev_port_get_dumpit,
@@ -1086,30 +2051,43 @@
 		.dump = nldev_res_get_dumpit,
 	},
 	[RDMA_NLDEV_CMD_RES_QP_GET] = {
+		.doit = nldev_res_get_qp_doit,
 		.dump = nldev_res_get_qp_dumpit,
-		/*
-		 * .doit is not implemented yet for two reasons:
-		 * 1. It is not needed yet.
-		 * 2. There is a need to provide identifier, while it is easy
-		 * for the QPs (device index + port index + LQPN), it is not
-		 * the case for the rest of resources (PD and CQ). Because it
-		 * is better to provide similar interface for all resources,
-		 * let's wait till we will have other resources implemented
-		 * too.
-		 */
 	},
 	[RDMA_NLDEV_CMD_RES_CM_ID_GET] = {
+		.doit = nldev_res_get_cm_id_doit,
 		.dump = nldev_res_get_cm_id_dumpit,
 	},
 	[RDMA_NLDEV_CMD_RES_CQ_GET] = {
+		.doit = nldev_res_get_cq_doit,
 		.dump = nldev_res_get_cq_dumpit,
 	},
 	[RDMA_NLDEV_CMD_RES_MR_GET] = {
+		.doit = nldev_res_get_mr_doit,
 		.dump = nldev_res_get_mr_dumpit,
 	},
 	[RDMA_NLDEV_CMD_RES_PD_GET] = {
+		.doit = nldev_res_get_pd_doit,
 		.dump = nldev_res_get_pd_dumpit,
 	},
+	[RDMA_NLDEV_CMD_SYS_GET] = {
+		.doit = nldev_sys_get_doit,
+	},
+	[RDMA_NLDEV_CMD_SYS_SET] = {
+		.doit = nldev_set_sys_set_doit,
+	},
+	[RDMA_NLDEV_CMD_STAT_SET] = {
+		.doit = nldev_stat_set_doit,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
+	[RDMA_NLDEV_CMD_STAT_GET] = {
+		.doit = nldev_stat_get_doit,
+		.dump = nldev_stat_get_dumpit,
+	},
+	[RDMA_NLDEV_CMD_STAT_DEL] = {
+		.doit = nldev_stat_del_doit,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
 };
 
 void __init nldev_init(void)
diff --git a/drivers/infiniband/core/opa_smi.h b/drivers/infiniband/core/opa_smi.h
index 3bfab35..af4879b 100644
--- a/drivers/infiniband/core/opa_smi.h
+++ b/drivers/infiniband/core/opa_smi.h
@@ -55,7 +55,7 @@
 {
 	/* C14-9:3 -- We're at the end of the DR segment of path */
 	/* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */
-	return (device->process_mad &&
+	return (device->ops.process_mad &&
 		!opa_get_smp_direction(smp) &&
 		(smp->hop_ptr == smp->hop_cnt + 1)) ?
 		IB_SMI_HANDLE : IB_SMI_DISCARD;
@@ -70,7 +70,7 @@
 {
 	/* C14-13:3 -- We're at the end of the DR segment of path */
 	/* C14-13:4 -- Hop Pointer == 0 -> give to SM */
-	return (device->process_mad &&
+	return (device->ops.process_mad &&
 		opa_get_smp_direction(smp) &&
 		!smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD;
 }
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index c4118bc..ccf4d06 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -125,9 +125,10 @@
  * and consumes the kref on the uobj.
  */
 static int uverbs_destroy_uobject(struct ib_uobject *uobj,
-				  enum rdma_remove_reason reason)
+				  enum rdma_remove_reason reason,
+				  struct uverbs_attr_bundle *attrs)
 {
-	struct ib_uverbs_file *ufile = uobj->ufile;
+	struct ib_uverbs_file *ufile = attrs->ufile;
 	unsigned long flags;
 	int ret;
 
@@ -135,7 +136,8 @@
 	assert_uverbs_usecnt(uobj, UVERBS_LOOKUP_WRITE);
 
 	if (uobj->object) {
-		ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason);
+		ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason,
+								attrs);
 		if (ret) {
 			if (ib_is_destroy_retryable(ret, reason, uobj))
 				return ret;
@@ -196,9 +198,9 @@
  * version requires the caller to have already obtained an
  * LOOKUP_DESTROY uobject kref.
  */
-int uobj_destroy(struct ib_uobject *uobj)
+int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs)
 {
-	struct ib_uverbs_file *ufile = uobj->ufile;
+	struct ib_uverbs_file *ufile = attrs->ufile;
 	int ret;
 
 	down_read(&ufile->hw_destroy_rwsem);
@@ -207,7 +209,7 @@
 	if (ret)
 		goto out_unlock;
 
-	ret = uverbs_destroy_uobject(uobj, RDMA_REMOVE_DESTROY);
+	ret = uverbs_destroy_uobject(uobj, RDMA_REMOVE_DESTROY, attrs);
 	if (ret) {
 		atomic_set(&uobj->usecnt, 0);
 		goto out_unlock;
@@ -224,16 +226,17 @@
  * uverbs_put_destroy.
  */
 struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj,
-				      u32 id, struct ib_uverbs_file *ufile)
+				      u32 id, struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uobject *uobj;
 	int ret;
 
-	uobj = rdma_lookup_get_uobject(obj, ufile, id, UVERBS_LOOKUP_DESTROY);
+	uobj = rdma_lookup_get_uobject(obj, attrs->ufile, id,
+				       UVERBS_LOOKUP_DESTROY, attrs);
 	if (IS_ERR(uobj))
 		return uobj;
 
-	ret = uobj_destroy(uobj);
+	ret = uobj_destroy(uobj, attrs);
 	if (ret) {
 		rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_DESTROY);
 		return ERR_PTR(ret);
@@ -243,21 +246,20 @@
 }
 
 /*
- * Does both uobj_get_destroy() and uobj_put_destroy().  Returns success_res
- * on success (negative errno on failure). For use by callers that do not need
- * the uobj.
+ * Does both uobj_get_destroy() and uobj_put_destroy().  Returns 0 on success
+ * (negative errno on failure). For use by callers that do not need the uobj.
  */
 int __uobj_perform_destroy(const struct uverbs_api_object *obj, u32 id,
-			   struct ib_uverbs_file *ufile, int success_res)
+			   struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uobject *uobj;
 
-	uobj = __uobj_get_destroy(obj, id, ufile);
+	uobj = __uobj_get_destroy(obj, id, attrs);
 	if (IS_ERR(uobj))
 		return PTR_ERR(uobj);
 
 	rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_WRITE);
-	return success_res;
+	return 0;
 }
 
 /* alloc_uobj must be undone by uverbs_destroy_uobject() */
@@ -267,7 +269,7 @@
 	struct ib_uobject *uobj;
 	struct ib_ucontext *ucontext;
 
-	ucontext = ib_uverbs_get_ucontext(ufile);
+	ucontext = ib_uverbs_get_ucontext_file(ufile);
 	if (IS_ERR(ucontext))
 		return ERR_CAST(ucontext);
 
@@ -295,25 +297,13 @@
 
 static int idr_add_uobj(struct ib_uobject *uobj)
 {
-	int ret;
-
-	idr_preload(GFP_KERNEL);
-	spin_lock(&uobj->ufile->idr_lock);
-
-	/*
-	 * We start with allocating an idr pointing to NULL. This represents an
-	 * object which isn't initialized yet. We'll replace it later on with
-	 * the real object once we commit.
-	 */
-	ret = idr_alloc(&uobj->ufile->idr, NULL, 0,
-			min_t(unsigned long, U32_MAX - 1, INT_MAX), GFP_NOWAIT);
-	if (ret >= 0)
-		uobj->id = ret;
-
-	spin_unlock(&uobj->ufile->idr_lock);
-	idr_preload_end();
-
-	return ret < 0 ? ret : 0;
+	/*
+	 * We start by allocating an entry that points to NULL. This represents
+	 * an object which isn't initialized yet. We'll replace it later on
+	 * with the real object once we commit.
+	 */
+	return xa_alloc(&uobj->ufile->idr, &uobj->id, NULL, xa_limit_32b,
+			GFP_KERNEL);
 }
 
 /* Returns the ib_uobject or an error. The caller should check for IS_ERR. */
@@ -323,29 +313,20 @@
 		       enum rdma_lookup_mode mode)
 {
 	struct ib_uobject *uobj;
-	unsigned long idrno = id;
 
 	if (id < 0 || id > ULONG_MAX)
 		return ERR_PTR(-EINVAL);
 
 	rcu_read_lock();
-	/* object won't be released as we're protected in rcu */
-	uobj = idr_find(&ufile->idr, idrno);
-	if (!uobj) {
-		uobj = ERR_PTR(-ENOENT);
-		goto free;
-	}
-
 	/*
 	 * The idr_find is guaranteed to return a pointer to something that
 	 * isn't freed yet, or NULL, as the free after idr_remove goes through
 	 * kfree_rcu(). However the object may still have been released and
 	 * kfree() could be called at any time.
 	 */
-	if (!kref_get_unless_zero(&uobj->ref))
+	uobj = xa_load(&ufile->idr, id);
+	if (!uobj || !kref_get_unless_zero(&uobj->ref))
 		uobj = ERR_PTR(-ENOENT);
-
-free:
 	rcu_read_unlock();
 	return uobj;
 }
@@ -392,21 +373,29 @@
 
 struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj,
 					   struct ib_uverbs_file *ufile, s64 id,
-					   enum rdma_lookup_mode mode)
+					   enum rdma_lookup_mode mode,
+					   struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uobject *uobj;
 	int ret;
 
-	if (!obj)
-		return ERR_PTR(-EINVAL);
+	if (obj == ERR_PTR(-ENOMSG)) {
+		/* must be UVERBS_IDR_ANY_OBJECT, see uapi_get_object() */
+		uobj = lookup_get_idr_uobject(NULL, ufile, id, mode);
+		if (IS_ERR(uobj))
+			return uobj;
+	} else {
+		if (IS_ERR(obj))
+			return ERR_PTR(-EINVAL);
 
-	uobj = obj->type_class->lookup_get(obj, ufile, id, mode);
-	if (IS_ERR(uobj))
-		return uobj;
+		uobj = obj->type_class->lookup_get(obj, ufile, id, mode);
+		if (IS_ERR(uobj))
+			return uobj;
 
-	if (uobj->uapi_object != obj) {
-		ret = -EINVAL;
-		goto free;
+		if (uobj->uapi_object != obj) {
+			ret = -EINVAL;
+			goto free;
+		}
 	}
 
 	/*
@@ -423,10 +412,12 @@
 	ret = uverbs_try_lock_object(uobj, mode);
 	if (ret)
 		goto free;
+	if (attrs)
+		attrs->context = uobj->context;
 
 	return uobj;
 free:
-	obj->type_class->lookup_put(uobj, mode);
+	uobj->uapi_object->type_class->lookup_put(uobj, mode);
 	uverbs_uobject_put(uobj);
 	return ERR_PTR(ret);
 }
@@ -449,14 +440,12 @@
 	ret = ib_rdmacg_try_charge(&uobj->cg_obj, uobj->context->device,
 				   RDMACG_RESOURCE_HCA_OBJECT);
 	if (ret)
-		goto idr_remove;
+		goto remove;
 
 	return uobj;
 
-idr_remove:
-	spin_lock(&ufile->idr_lock);
-	idr_remove(&ufile->idr, uobj->id);
-	spin_unlock(&ufile->idr_lock);
+remove:
+	xa_erase(&ufile->idr, uobj->id);
 uobj_put:
 	uverbs_uobject_put(uobj);
 	return ERR_PTR(ret);
@@ -486,11 +475,12 @@
 }
 
 struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj,
-					    struct ib_uverbs_file *ufile)
+					    struct ib_uverbs_file *ufile,
+					    struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uobject *ret;
 
-	if (!obj)
+	if (IS_ERR(obj))
 		return ERR_PTR(-EINVAL);
 
 	/*
@@ -506,6 +496,8 @@
 		up_read(&ufile->hw_destroy_rwsem);
 		return ret;
 	}
+	if (attrs)
+		attrs->context = ret->context;
 	return ret;
 }
 
@@ -514,18 +506,17 @@
 	ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,
 			   RDMACG_RESOURCE_HCA_OBJECT);
 
-	spin_lock(&uobj->ufile->idr_lock);
-	idr_remove(&uobj->ufile->idr, uobj->id);
-	spin_unlock(&uobj->ufile->idr_lock);
+	xa_erase(&uobj->ufile->idr, uobj->id);
 }
 
 static int __must_check destroy_hw_idr_uobject(struct ib_uobject *uobj,
-					       enum rdma_remove_reason why)
+					       enum rdma_remove_reason why,
+					       struct uverbs_attr_bundle *attrs)
 {
 	const struct uverbs_obj_idr_type *idr_type =
 		container_of(uobj->uapi_object->type_attrs,
 			     struct uverbs_obj_idr_type, type);
-	int ret = idr_type->destroy_object(uobj, why);
+	int ret = idr_type->destroy_object(uobj, why, attrs);
 
 	/*
 	 * We can only fail gracefully if the user requested to destroy the
@@ -546,9 +537,7 @@
 
 static void remove_handle_idr_uobject(struct ib_uobject *uobj)
 {
-	spin_lock(&uobj->ufile->idr_lock);
-	idr_remove(&uobj->ufile->idr, uobj->id);
-	spin_unlock(&uobj->ufile->idr_lock);
+	xa_erase(&uobj->ufile->idr, uobj->id);
 	/* Matches the kref in alloc_commit_idr_uobject */
 	uverbs_uobject_put(uobj);
 }
@@ -559,7 +548,8 @@
 }
 
 static int __must_check destroy_hw_fd_uobject(struct ib_uobject *uobj,
-					      enum rdma_remove_reason why)
+					      enum rdma_remove_reason why,
+					      struct uverbs_attr_bundle *attrs)
 {
 	const struct uverbs_obj_fd_type *fd_type = container_of(
 		uobj->uapi_object->type_attrs, struct uverbs_obj_fd_type, type);
@@ -578,17 +568,17 @@
 static int alloc_commit_idr_uobject(struct ib_uobject *uobj)
 {
 	struct ib_uverbs_file *ufile = uobj->ufile;
+	void *old;
 
-	spin_lock(&ufile->idr_lock);
 	/*
 	 * We already allocated this IDR with a NULL object, so
 	 * this shouldn't fail.
 	 *
-	 * NOTE: Once we set the IDR we loose ownership of our kref on uobj.
+	 * NOTE: Storing the uobj transfers our kref on uobj to the XArray.
 	 * It will be put by remove_commit_idr_uobject()
 	 */
-	WARN_ON(idr_replace(&ufile->idr, uobj, uobj->id));
-	spin_unlock(&ufile->idr_lock);
+	old = xa_store(&ufile->idr, uobj->id, uobj, GFP_KERNEL);
+	WARN_ON(old != NULL);
 
 	return 0;
 }
@@ -635,15 +625,16 @@
  * caller can no longer assume uobj is valid. If this function fails it
  * destroys the uobject, including the attached HW object.
  */
-int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj)
+int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj,
+					   struct uverbs_attr_bundle *attrs)
 {
-	struct ib_uverbs_file *ufile = uobj->ufile;
+	struct ib_uverbs_file *ufile = attrs->ufile;
 	int ret;
 
 	/* alloc_commit consumes the uobj kref */
 	ret = uobj->uapi_object->type_class->alloc_commit(uobj);
 	if (ret) {
-		uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT);
+		uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT, attrs);
 		up_read(&ufile->hw_destroy_rwsem);
 		return ret;
 	}
@@ -667,12 +658,13 @@
  * This consumes the kref for uobj. It is up to the caller to unwind the HW
  * object and anything else connected to uobj before calling this.
  */
-void rdma_alloc_abort_uobject(struct ib_uobject *uobj)
+void rdma_alloc_abort_uobject(struct ib_uobject *uobj,
+			      struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_file *ufile = uobj->ufile;
 
 	uobj->object = NULL;
-	uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT);
+	uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT, attrs);
 
 	/* Matches the down_read in rdma_alloc_begin_uobject */
 	up_read(&ufile->hw_destroy_rwsem);
@@ -720,29 +712,28 @@
 
 void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile)
 {
-	spin_lock_init(&ufile->idr_lock);
-	idr_init(&ufile->idr);
+	xa_init_flags(&ufile->idr, XA_FLAGS_ALLOC);
 }
 
 void release_ufile_idr_uobject(struct ib_uverbs_file *ufile)
 {
 	struct ib_uobject *entry;
-	int id;
+	unsigned long id;
 
 	/*
 	 * At this point uverbs_cleanup_ufile() is guaranteed to have run, and
-	 * there are no HW objects left, however the IDR is still populated
+	 * there are no HW objects left; however, the xarray is still populated
 	 * with anything that has not been cleaned up by userspace. Since the
 	 * kref on ufile is 0, nothing is allowed to call lookup_get.
 	 *
 	 * This is an optimized equivalent to remove_handle_idr_uobject
 	 */
-	idr_for_each_entry(&ufile->idr, entry, id) {
+	xa_for_each(&ufile->idr, id, entry) {
 		WARN_ON(entry->object);
 		uverbs_uobject_put(entry);
 	}
 
-	idr_destroy(&ufile->idr);
+	xa_destroy(&ufile->idr);
 }
 
 const struct uverbs_obj_type_class uverbs_idr_class = {
@@ -774,6 +765,10 @@
 {
 	struct ib_uobject *uobj = f->private_data;
 	struct ib_uverbs_file *ufile = uobj->ufile;
+	struct uverbs_attr_bundle attrs = {
+		.context = uobj->context,
+		.ufile = ufile,
+	};
 
 	if (down_read_trylock(&ufile->hw_destroy_rwsem)) {
 		/*
@@ -783,7 +778,7 @@
 		 * write lock here, or we have a kernel bug.
 		 */
 		WARN_ON(uverbs_try_lock_object(uobj, UVERBS_LOOKUP_WRITE));
-		uverbs_destroy_uobject(uobj, RDMA_REMOVE_CLOSE);
+		uverbs_destroy_uobject(uobj, RDMA_REMOVE_CLOSE, &attrs);
 		up_read(&ufile->hw_destroy_rwsem);
 	}
 
@@ -793,44 +788,7 @@
 	/* Pairs with filp->private_data in alloc_begin_fd_uobject */
 	uverbs_uobject_put(uobj);
 }
-
-static void ufile_disassociate_ucontext(struct ib_ucontext *ibcontext)
-{
-	struct ib_device *ib_dev = ibcontext->device;
-	struct task_struct *owning_process  = NULL;
-	struct mm_struct   *owning_mm       = NULL;
-
-	owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID);
-	if (!owning_process)
-		return;
-
-	owning_mm = get_task_mm(owning_process);
-	if (!owning_mm) {
-		pr_info("no mm, disassociate ucontext is pending task termination\n");
-		while (1) {
-			put_task_struct(owning_process);
-			usleep_range(1000, 2000);
-			owning_process = get_pid_task(ibcontext->tgid,
-						      PIDTYPE_PID);
-			if (!owning_process ||
-			    owning_process->state == TASK_DEAD) {
-				pr_info("disassociate ucontext done, task was terminated\n");
-				/* in case task was dead need to release the
-				 * task struct.
-				 */
-				if (owning_process)
-					put_task_struct(owning_process);
-				return;
-			}
-		}
-	}
-
-	down_write(&owning_mm->mmap_sem);
-	ib_dev->disassociate_ucontext(ibcontext);
-	up_write(&owning_mm->mmap_sem);
-	mmput(owning_mm);
-	put_task_struct(owning_process);
-}
+EXPORT_SYMBOL(uverbs_close_fd);
 
 /*
  * Drop the ucontext off the ufile and completely disconnect it from the
@@ -840,21 +798,26 @@
 				   enum rdma_remove_reason reason)
 {
 	struct ib_ucontext *ucontext = ufile->ucontext;
-	int ret;
-
-	if (reason == RDMA_REMOVE_DRIVER_REMOVE)
-		ufile_disassociate_ucontext(ucontext);
-
-	put_pid(ucontext->tgid);
-	ib_rdmacg_uncharge(&ucontext->cg_obj, ucontext->device,
-			   RDMACG_RESOURCE_HCA_HANDLE);
+	struct ib_device *ib_dev = ucontext->device;
 
 	/*
-	 * FIXME: Drivers are not permitted to fail dealloc_ucontext, remove
-	 * the error return.
+	 * If we are closing the FD then the user mmap VMAs must have
+	 * already been destroyed as they hold on to the filep, otherwise
+	 * they need to be zap'd.
 	 */
-	ret = ucontext->device->dealloc_ucontext(ucontext);
-	WARN_ON(ret);
+	if (reason == RDMA_REMOVE_DRIVER_REMOVE) {
+		uverbs_user_mmap_disassociate(ufile);
+		if (ib_dev->ops.disassociate_ucontext)
+			ib_dev->ops.disassociate_ucontext(ucontext);
+	}
+
+	ib_rdmacg_uncharge(&ucontext->cg_obj, ib_dev,
+			   RDMACG_RESOURCE_HCA_HANDLE);
+
+	rdma_restrack_del(&ucontext->res);
+
+	ib_dev->ops.dealloc_ucontext(ucontext);
+	kfree(ucontext);
 
 	ufile->ucontext = NULL;
 }
@@ -864,6 +827,7 @@
 {
 	struct ib_uobject *obj, *next_obj;
 	int ret = -EINVAL;
+	struct uverbs_attr_bundle attrs = { .ufile = ufile };
 
 	/*
 	 * This shouldn't run while executing other commands on this
@@ -875,12 +839,13 @@
 	 * other threads (which might still use the FDs) chance to run.
 	 */
 	list_for_each_entry_safe(obj, next_obj, &ufile->uobjects, list) {
+		attrs.context = obj->context;
 		/*
 		 * if we hit this WARN_ON, that means we are
 		 * racing with a lookup_get.
 		 */
 		WARN_ON(uverbs_try_lock_object(obj, UVERBS_LOOKUP_WRITE));
-		if (!uverbs_destroy_uobject(obj, reason))
+		if (!uverbs_destroy_uobject(obj, reason, &attrs))
 			ret = 0;
 		else
 			atomic_set(&obj->usecnt, 0);
@@ -959,26 +924,25 @@
 EXPORT_SYMBOL(uverbs_fd_class);
 
 struct ib_uobject *
-uverbs_get_uobject_from_file(u16 object_id,
-			     struct ib_uverbs_file *ufile,
-			     enum uverbs_obj_access access, s64 id)
+uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access,
+			     s64 id, struct uverbs_attr_bundle *attrs)
 {
 	const struct uverbs_api_object *obj =
-		uapi_get_object(ufile->device->uapi, object_id);
+		uapi_get_object(attrs->ufile->device->uapi, object_id);
 
 	switch (access) {
 	case UVERBS_ACCESS_READ:
-		return rdma_lookup_get_uobject(obj, ufile, id,
-					       UVERBS_LOOKUP_READ);
+		return rdma_lookup_get_uobject(obj, attrs->ufile, id,
+					       UVERBS_LOOKUP_READ, attrs);
 	case UVERBS_ACCESS_DESTROY:
 		/* Actual destruction is done inside uverbs_handle_method */
-		return rdma_lookup_get_uobject(obj, ufile, id,
-					       UVERBS_LOOKUP_DESTROY);
+		return rdma_lookup_get_uobject(obj, attrs->ufile, id,
+					       UVERBS_LOOKUP_DESTROY, attrs);
 	case UVERBS_ACCESS_WRITE:
-		return rdma_lookup_get_uobject(obj, ufile, id,
-					       UVERBS_LOOKUP_WRITE);
+		return rdma_lookup_get_uobject(obj, attrs->ufile, id,
+					       UVERBS_LOOKUP_WRITE, attrs);
 	case UVERBS_ACCESS_NEW:
-		return rdma_alloc_begin_uobject(obj, ufile);
+		return rdma_alloc_begin_uobject(obj, attrs->ufile, attrs);
 	default:
 		WARN_ON(true);
 		return ERR_PTR(-EOPNOTSUPP);
@@ -986,8 +950,8 @@
 }
 
 int uverbs_finalize_object(struct ib_uobject *uobj,
-			   enum uverbs_obj_access access,
-			   bool commit)
+			   enum uverbs_obj_access access, bool commit,
+			   struct uverbs_attr_bundle *attrs)
 {
 	int ret = 0;
 
@@ -1010,9 +974,9 @@
 		break;
 	case UVERBS_ACCESS_NEW:
 		if (commit)
-			ret = rdma_alloc_commit_uobject(uobj);
+			ret = rdma_alloc_commit_uobject(uobj, attrs);
 		else
-			rdma_alloc_abort_uobject(uobj);
+			rdma_alloc_abort_uobject(uobj, attrs);
 		break;
 	default:
 		WARN_ON(true);
diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h
index f962f2a..e63fbda 100644
--- a/drivers/infiniband/core/rdma_core.h
+++ b/drivers/infiniband/core/rdma_core.h
@@ -48,7 +48,7 @@
 void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile,
 			     enum rdma_remove_reason reason);
 
-int uobj_destroy(struct ib_uobject *uobj);
+int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs);
 
 /*
  * uverbs_uobject_get is called in order to increase the reference count on
@@ -83,9 +83,8 @@
  * uverbs_finalize_objects are called.
  */
 struct ib_uobject *
-uverbs_get_uobject_from_file(u16 object_id,
-			     struct ib_uverbs_file *ufile,
-			     enum uverbs_obj_access access, s64 id);
+uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access,
+			     s64 id, struct uverbs_attr_bundle *attrs);
 
 /*
  * Note that certain finalize stages could return a status:
@@ -103,12 +102,16 @@
  * object.
  */
 int uverbs_finalize_object(struct ib_uobject *uobj,
-			   enum uverbs_obj_access access,
-			   bool commit);
+			   enum uverbs_obj_access access, bool commit,
+			   struct uverbs_attr_bundle *attrs);
+
+int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx);
 
 void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile);
 void release_ufile_idr_uobject(struct ib_uverbs_file *ufile);
 
+struct ib_udata *uverbs_get_cleared_udata(struct uverbs_attr_bundle *attrs);
+
 /*
  * This is the runtime description of the uverbs API, used by the syscall
  * machinery to validate and dispatch calls.
@@ -118,47 +121,105 @@
  * Depending on ID the slot pointer in the radix tree points at one of these
  * structs.
  */
-struct uverbs_api_object {
-	const struct uverbs_obj_type *type_attrs;
-	const struct uverbs_obj_type_class *type_class;
-};
 
 struct uverbs_api_ioctl_method {
-	int (__rcu *handler)(struct ib_uverbs_file *ufile,
-			     struct uverbs_attr_bundle *ctx);
+	int(__rcu *handler)(struct uverbs_attr_bundle *attrs);
 	DECLARE_BITMAP(attr_mandatory, UVERBS_API_ATTR_BKEY_LEN);
 	u16 bundle_size;
 	u8 use_stack:1;
 	u8 driver_method:1;
+	u8 disabled:1;
+	u8 has_udata:1;
 	u8 key_bitmap_len;
 	u8 destroy_bkey;
 };
 
+struct uverbs_api_write_method {
+	int (*handler)(struct uverbs_attr_bundle *attrs);
+	u8 disabled:1;
+	u8 is_ex:1;
+	u8 has_udata:1;
+	u8 has_resp:1;
+	u8 req_size;
+	u8 resp_size;
+};
+
 struct uverbs_api_attr {
 	struct uverbs_attr_spec spec;
 };
 
-struct uverbs_api_object;
 struct uverbs_api {
 	/* radix tree contains struct uverbs_api_* pointers */
 	struct radix_tree_root radix;
 	enum rdma_driver_id driver_id;
+
+	unsigned int num_write;
+	unsigned int num_write_ex;
+	struct uverbs_api_write_method notsupp_method;
+	const struct uverbs_api_write_method **write_methods;
+	const struct uverbs_api_write_method **write_ex_methods;
 };
 
+/*
+ * Get an uverbs_api_object that corresponds to the given object_id.
+ * Note:
+ * -ENOMSG means that any object is allowed to match during lookup.
+ */
 static inline const struct uverbs_api_object *
 uapi_get_object(struct uverbs_api *uapi, u16 object_id)
 {
-	return radix_tree_lookup(&uapi->radix, uapi_key_obj(object_id));
+	const struct uverbs_api_object *res;
+
+	if (object_id == UVERBS_IDR_ANY_OBJECT)
+		return ERR_PTR(-ENOMSG);
+
+	res = radix_tree_lookup(&uapi->radix, uapi_key_obj(object_id));
+	if (!res)
+		return ERR_PTR(-ENOENT);
+
+	return res;
 }
 
 char *uapi_key_format(char *S, unsigned int key);
-struct uverbs_api *uverbs_alloc_api(
-	const struct uverbs_object_tree_def *const *driver_specs,
-	enum rdma_driver_id driver_id);
+struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev);
 void uverbs_disassociate_api_pre(struct ib_uverbs_device *uverbs_dev);
 void uverbs_disassociate_api(struct uverbs_api *uapi);
 void uverbs_destroy_api(struct uverbs_api *uapi);
 void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm,
 			      unsigned int num_attrs);
+void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile);
+
+extern const struct uapi_definition uverbs_def_obj_counters[];
+extern const struct uapi_definition uverbs_def_obj_cq[];
+extern const struct uapi_definition uverbs_def_obj_device[];
+extern const struct uapi_definition uverbs_def_obj_dm[];
+extern const struct uapi_definition uverbs_def_obj_flow_action[];
+extern const struct uapi_definition uverbs_def_obj_intf[];
+extern const struct uapi_definition uverbs_def_obj_mr[];
+extern const struct uapi_definition uverbs_def_write_intf[];
+
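+/* Resolve a write() command number to its method; extended commands use a separate table */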
+static inline const struct uverbs_api_write_method *
+uapi_get_method(const struct uverbs_api *uapi, u32 command)
+{
+	u32 cmd_idx = command & IB_USER_VERBS_CMD_COMMAND_MASK;
+
+	if (command & ~(u32)(IB_USER_VERBS_CMD_FLAG_EXTENDED |
+			     IB_USER_VERBS_CMD_COMMAND_MASK))
+		return ERR_PTR(-EINVAL);
+
+	if (command & IB_USER_VERBS_CMD_FLAG_EXTENDED) {
+		if (cmd_idx >= uapi->num_write_ex)
+			return ERR_PTR(-EOPNOTSUPP);
+		return uapi->write_ex_methods[cmd_idx];
+	}
+
+	if (cmd_idx >= uapi->num_write)
+		return ERR_PTR(-EOPNOTSUPP);
+	return uapi->write_methods[cmd_idx];
+}
+
+void uverbs_fill_udata(struct uverbs_attr_bundle *bundle,
+		       struct ib_udata *udata, unsigned int attr_in,
+		       unsigned int attr_out);
 
 #endif /* RDMA_CORE_H */
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index 3b7fa0c..a07665f 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -6,24 +6,37 @@
 #include <rdma/rdma_cm.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/restrack.h>
+#include <rdma/rdma_counter.h>
 #include <linux/mutex.h>
 #include <linux/sched/task.h>
 #include <linux/pid_namespace.h>
 
 #include "cma_priv.h"
+#include "restrack.h"
 
-static int fill_res_noop(struct sk_buff *msg,
-			 struct rdma_restrack_entry *entry)
+/**
+ * rdma_restrack_init() - initialize and allocate resource tracking
+ * @dev:  IB device
+ *
+ * Return: 0 on success
+ */
+int rdma_restrack_init(struct ib_device *dev)
 {
+	struct rdma_restrack_root *rt;
+	int i;
+
+	dev->res = kcalloc(RDMA_RESTRACK_MAX, sizeof(*rt), GFP_KERNEL);
+	if (!dev->res)
+		return -ENOMEM;
+
+	rt = dev->res;
+
+	for (i = 0; i < RDMA_RESTRACK_MAX; i++)
+		xa_init_flags(&rt[i].xa, XA_FLAGS_ALLOC);
+
 	return 0;
 }
 
-void rdma_restrack_init(struct rdma_restrack_root *res)
-{
-	init_rwsem(&res->rwsem);
-	res->fill_res_entry = fill_res_noop;
-}
-
 static const char *type2str(enum rdma_restrack_type type)
 {
 	static const char * const names[RDMA_RESTRACK_MAX] = {
@@ -32,61 +45,83 @@
 		[RDMA_RESTRACK_QP] = "QP",
 		[RDMA_RESTRACK_CM_ID] = "CM_ID",
 		[RDMA_RESTRACK_MR] = "MR",
+		[RDMA_RESTRACK_CTX] = "CTX",
+		[RDMA_RESTRACK_COUNTER] = "COUNTER",
 	};
 
 	return names[type];
 };
 
-void rdma_restrack_clean(struct rdma_restrack_root *res)
+/**
+ * rdma_restrack_clean() - clean resource tracking
+ * @dev:  IB device
+ */
+void rdma_restrack_clean(struct ib_device *dev)
 {
+	struct rdma_restrack_root *rt = dev->res;
 	struct rdma_restrack_entry *e;
 	char buf[TASK_COMM_LEN];
-	struct ib_device *dev;
+	bool found = false;
 	const char *owner;
-	int bkt;
+	int i;
 
-	if (hash_empty(res->hash))
-		return;
+	for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) {
+		struct xarray *xa = &dev->res[i].xa;
 
-	dev = container_of(res, struct ib_device, res);
-	pr_err("restrack: %s", CUT_HERE);
-	pr_err("restrack: BUG: RESTRACK detected leak of resources on %s\n",
-	       dev->name);
-	hash_for_each(res->hash, bkt, e, node) {
-		if (rdma_is_kernel_res(e)) {
-			owner = e->kern_name;
-		} else {
-			/*
-			 * There is no need to call get_task_struct here,
-			 * because we can be here only if there are more
-			 * get_task_struct() call than put_task_struct().
-			 */
-			get_task_comm(buf, e->task);
-			owner = buf;
+		if (!xa_empty(xa)) {
+			unsigned long index;
+
+			if (!found) {
+				pr_err("restrack: %s", CUT_HERE);
+				dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n");
+			}
+			xa_for_each(xa, index, e) {
+				if (rdma_is_kernel_res(e)) {
+					owner = e->kern_name;
+				} else {
+					/*
+					 * There is no need to call get_task_struct here,
+					 * because we can only be here if there are more
+					 * get_task_struct() calls than put_task_struct() calls.
+					 */
+					get_task_comm(buf, e->task);
+					owner = buf;
+				}
+
+				pr_err("restrack: %s %s object allocated by %s is not freed\n",
+				       rdma_is_kernel_res(e) ? "Kernel" :
+							       "User",
+				       type2str(e->type), owner);
+			}
+			found = true;
 		}
-
-		pr_err("restrack: %s %s object allocated by %s is not freed\n",
-		       rdma_is_kernel_res(e) ? "Kernel" : "User",
-		       type2str(e->type), owner);
+		xa_destroy(xa);
 	}
-	pr_err("restrack: %s", CUT_HERE);
+	if (found)
+		pr_err("restrack: %s", CUT_HERE);
+
+	kfree(rt);
 }
 
-int rdma_restrack_count(struct rdma_restrack_root *res,
-			enum rdma_restrack_type type,
-			struct pid_namespace *ns)
+/**
+ * rdma_restrack_count() - return the current usage count of a specific object type
+ * @dev:  IB device
+ * @type: actual type of the objects to count
+ */
+int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type)
 {
+	struct rdma_restrack_root *rt = &dev->res[type];
 	struct rdma_restrack_entry *e;
+	XA_STATE(xas, &rt->xa, 0);
 	u32 cnt = 0;
 
-	down_read(&res->rwsem);
-	hash_for_each_possible(res->hash, e, node, type) {
-		if (ns == &init_pid_ns ||
-		    (!rdma_is_kernel_res(e) &&
-		     ns == task_active_pid_ns(e->task)))
-			cnt++;
+	xa_lock(&rt->xa);
+	xas_for_each(&xas, e, U32_MAX) {
+		if (!rdma_is_visible_in_pid_ns(e))
+			continue;
+		cnt++;
 	}
-	up_read(&res->rwsem);
+	xa_unlock(&rt->xa);
 	return cnt;
 }
 EXPORT_SYMBOL(rdma_restrack_count);
@@ -131,58 +166,111 @@
 				    res)->id.device;
 	case RDMA_RESTRACK_MR:
 		return container_of(res, struct ib_mr, res)->device;
+	case RDMA_RESTRACK_CTX:
+		return container_of(res, struct ib_ucontext, res)->device;
+	case RDMA_RESTRACK_COUNTER:
+		return container_of(res, struct rdma_counter, res)->device;
 	default:
 		WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type);
 		return NULL;
 	}
 }
 
-static bool res_is_user(struct rdma_restrack_entry *res)
+void rdma_restrack_set_task(struct rdma_restrack_entry *res,
+			    const char *caller)
 {
-	switch (res->type) {
-	case RDMA_RESTRACK_PD:
-		return container_of(res, struct ib_pd, res)->uobject;
-	case RDMA_RESTRACK_CQ:
-		return container_of(res, struct ib_cq, res)->uobject;
-	case RDMA_RESTRACK_QP:
-		return container_of(res, struct ib_qp, res)->uobject;
-	case RDMA_RESTRACK_CM_ID:
-		return !res->kern_name;
-	case RDMA_RESTRACK_MR:
-		return container_of(res, struct ib_mr, res)->pd->uobject;
-	default:
-		WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type);
-		return false;
+	if (caller) {
+		res->kern_name = caller;
+		return;
 	}
+
+	if (res->task)
+		put_task_struct(res->task);
+	get_task_struct(current);
+	res->task = current;
+}
+EXPORT_SYMBOL(rdma_restrack_set_task);
+
+/**
+ * rdma_restrack_attach_task() - attach the task to this resource
+ * @res:  resource entry
+ * @task: the task to attach; the current task will be used if it is NULL
+ */
+void rdma_restrack_attach_task(struct rdma_restrack_entry *res,
+			       struct task_struct *task)
+{
+	if (res->task)
+		put_task_struct(res->task);
+	get_task_struct(task);
+	res->task = task;
 }
 
-void rdma_restrack_add(struct rdma_restrack_entry *res)
+static void rdma_restrack_add(struct rdma_restrack_entry *res)
 {
 	struct ib_device *dev = res_to_dev(res);
+	struct rdma_restrack_root *rt;
+	int ret;
 
 	if (!dev)
 		return;
 
-	if (res->type != RDMA_RESTRACK_CM_ID || !res_is_user(res))
-		res->task = NULL;
-
-	if (res_is_user(res)) {
-		if (!res->task)
-			rdma_restrack_set_task(res, current);
-		res->kern_name = NULL;
-	} else {
-		set_kern_name(res);
-	}
+	rt = &dev->res[res->type];
 
 	kref_init(&res->kref);
 	init_completion(&res->comp);
-	res->valid = true;
+	if (res->type == RDMA_RESTRACK_QP) {
+		/* Special case to ensure that LQPN points to the right QP */
+		struct ib_qp *qp = container_of(res, struct ib_qp, res);
 
-	down_write(&dev->res.rwsem);
-	hash_add(dev->res.hash, &res->node, res->type);
-	up_write(&dev->res.rwsem);
+		ret = xa_insert(&rt->xa, qp->qp_num, res, GFP_KERNEL);
+		res->id = ret ? 0 : qp->qp_num;
+	} else if (res->type == RDMA_RESTRACK_COUNTER) {
+		/* Special case to ensure that cntn points to the right counter */
+		struct rdma_counter *counter;
+
+		counter = container_of(res, struct rdma_counter, res);
+		ret = xa_insert(&rt->xa, counter->id, res, GFP_KERNEL);
+		res->id = ret ? 0 : counter->id;
+	} else {
+		ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b,
+				      &rt->next_id, GFP_KERNEL);
+	}
+
+	if (!ret)
+		res->valid = true;
 }
-EXPORT_SYMBOL(rdma_restrack_add);
+
+/**
+ * rdma_restrack_kadd() - add kernel object to the resource tracking database
+ * @res:  resource entry
+ */
+void rdma_restrack_kadd(struct rdma_restrack_entry *res)
+{
+	res->task = NULL;
+	set_kern_name(res);
+	res->user = false;
+	rdma_restrack_add(res);
+}
+EXPORT_SYMBOL(rdma_restrack_kadd);
+
+/**
+ * rdma_restrack_uadd() - add user object to the resource tracking database
+ * @res:  resource entry
+ */
+void rdma_restrack_uadd(struct rdma_restrack_entry *res)
+{
+	if ((res->type != RDMA_RESTRACK_CM_ID) &&
+	    (res->type != RDMA_RESTRACK_COUNTER))
+		res->task = NULL;
+
+	if (!res->task)
+		rdma_restrack_set_task(res, NULL);
+	res->kern_name = NULL;
+
+	res->user = true;
+	rdma_restrack_add(res);
+}
+EXPORT_SYMBOL(rdma_restrack_uadd);
 
 int __must_check rdma_restrack_get(struct rdma_restrack_entry *res)
 {
@@ -190,6 +278,31 @@
 }
 EXPORT_SYMBOL(rdma_restrack_get);
 
+/**
+ * rdma_restrack_get_byid() - translate from ID to restrack object
+ * @dev: IB device
+ * @type: resource track type
+ * @id: ID to look up
+ *
+ * Return: Pointer to the restrack entry or ERR_PTR(-ENOENT) if not found.
+ */
+struct rdma_restrack_entry *
+rdma_restrack_get_byid(struct ib_device *dev,
+		       enum rdma_restrack_type type, u32 id)
+{
+	struct rdma_restrack_root *rt = &dev->res[type];
+	struct rdma_restrack_entry *res;
+
+	xa_lock(&rt->xa);
+	res = xa_load(&rt->xa, id);
+	if (!res || !rdma_restrack_get(res))
+		res = ERR_PTR(-ENOENT);
+	xa_unlock(&rt->xa);
+
+	return res;
+}
+EXPORT_SYMBOL(rdma_restrack_get_byid);
+
 static void restrack_release(struct kref *kref)
 {
 	struct rdma_restrack_entry *res;
@@ -206,24 +319,45 @@
 
 void rdma_restrack_del(struct rdma_restrack_entry *res)
 {
+	struct rdma_restrack_entry *old;
+	struct rdma_restrack_root *rt;
 	struct ib_device *dev;
 
 	if (!res->valid)
-		return;
+		goto out;
 
 	dev = res_to_dev(res);
-	if (!dev)
+	if (WARN_ON(!dev))
 		return;
 
-	rdma_restrack_put(res);
+	rt = &dev->res[res->type];
 
+	old = xa_erase(&rt->xa, res->id);
+	WARN_ON(old != res);
+	res->valid = false;
+
+	rdma_restrack_put(res);
 	wait_for_completion(&res->comp);
 
-	down_write(&dev->res.rwsem);
-	hash_del(&res->node);
-	res->valid = false;
-	if (res->task)
+out:
+	if (res->task) {
 		put_task_struct(res->task);
-	up_write(&dev->res.rwsem);
+		res->task = NULL;
+	}
 }
 EXPORT_SYMBOL(rdma_restrack_del);
+
+bool rdma_is_visible_in_pid_ns(struct rdma_restrack_entry *res)
+{
+	/*
+	 * 1. Kernel resources should be visible only in the init
+	 *    namespace
+	 * 2. Present only resources visible in the current
+	 *    namespace
+	 */
+	if (rdma_is_kernel_res(res))
+		return task_active_pid_ns(current) == &init_pid_ns;
+
+	/* PID 0 means that resource is not found in current namespace */
+	return task_pid_vnr(res->task);
+}
diff --git a/drivers/infiniband/core/restrack.h b/drivers/infiniband/core/restrack.h
new file mode 100644
index 0000000..7bd177c
--- /dev/null
+++ b/drivers/infiniband/core/restrack.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2017-2019 Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef _RDMA_CORE_RESTRACK_H_
+#define _RDMA_CORE_RESTRACK_H_
+
+#include <linux/mutex.h>
+
+/**
+ * struct rdma_restrack_root - main resource tracking management
+ * entity, per-device
+ */
+struct rdma_restrack_root {
+	/**
+	 * @xa: XArray holding the restrack entries for this resource type.
+	 */
+	struct xarray xa;
+	/**
+	 * @next_id: Next ID to support cyclic allocation
+	 */
+	u32 next_id;
+};
+
+int rdma_restrack_init(struct ib_device *dev);
+void rdma_restrack_clean(struct ib_device *dev);
+void rdma_restrack_attach_task(struct rdma_restrack_entry *res,
+			       struct task_struct *task);
+bool rdma_is_visible_in_pid_ns(struct rdma_restrack_entry *res);
+#endif /* _RDMA_CORE_RESTRACK_H_ */
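
For orientation only (not part of the patch): with the single hash table replaced by one XArray per resource type, a typical kernel-side lookup resolves an entry from its user-visible ID and takes a reference on it. The helper below is a hypothetical sketch of that pattern; only rdma_restrack_get_byid(), rdma_restrack_put() and the restrack types come from the kernel.

/*
 * Illustrative sketch -- example_lookup_qp() is hypothetical.
 * QP entries are inserted with their QPN as the XArray index, so a
 * user-supplied QPN can be translated directly into the ib_qp.
 */
static struct ib_qp *example_lookup_qp(struct ib_device *dev, u32 qpn)
{
	struct rdma_restrack_entry *res;

	res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_QP, qpn);
	if (IS_ERR(res))
		return NULL;

	/* holds a reference; release it with rdma_restrack_put() when done */
	return container_of(res, struct ib_qp, res);
}
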
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
index 25d43c8..2860def 100644
--- a/drivers/infiniband/core/roce_gid_mgmt.c
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -267,6 +267,9 @@
 	struct net_device *cookie_ndev = cookie;
 	bool match = false;
 
+	if (!rdma_ndev)
+		return false;
+
 	rcu_read_lock();
 	if (netif_is_bond_master(cookie_ndev) &&
 	    rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))
@@ -327,6 +330,7 @@
 static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
 				 u8 port, struct net_device *ndev)
 {
+	const struct in_ifaddr *ifa;
 	struct in_device *in_dev;
 	struct sin_list {
 		struct list_head	list;
@@ -346,7 +350,7 @@
 		return;
 	}
 
-	for_ifa(in_dev) {
+	in_dev_for_each_ifa_rcu(ifa, in_dev) {
 		struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
 
 		if (!entry)
@@ -356,7 +360,7 @@
 		entry->ip.sin_addr.s_addr = ifa->ifa_address;
 		list_add_tail(&entry->list, &sin_list);
 	}
-	endfor_ifa(in_dev);
+
 	rcu_read_unlock();
 
 	list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) {
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 683e6d1..5337393 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -1,17 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2016 HGST, a Western Digital Company.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 #include <linux/moduleparam.h>
 #include <linux/slab.h>
+#include <linux/pci-p2pdma.h>
 #include <rdma/mr_pool.h>
 #include <rdma/rw.h>
 
@@ -58,24 +51,23 @@
 	return false;
 }
 
-static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev)
+static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev,
+					   bool pi_support)
 {
+	u32 max_pages;
+
+	if (pi_support)
+		max_pages = dev->attrs.max_pi_fast_reg_page_list_len;
+	else
+		max_pages = dev->attrs.max_fast_reg_page_list_len;
+
 	/* arbitrary limit to avoid allocating gigantic resources */
-	return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256);
+	return min_t(u32, max_pages, 256);
 }
 
-/* Caller must have zero-initialized *reg. */
-static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
-		struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
-		u32 sg_cnt, u32 offset)
+static inline int rdma_rw_inv_key(struct rdma_rw_reg_ctx *reg)
 {
-	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
-	u32 nents = min(sg_cnt, pages_per_mr);
-	int count = 0, ret;
-
-	reg->mr = ib_mr_pool_get(qp, &qp->rdma_mrs);
-	if (!reg->mr)
-		return -EAGAIN;
+	int count = 0;
 
 	if (reg->mr->need_inval) {
 		reg->inv_wr.opcode = IB_WR_LOCAL_INV;
@@ -86,6 +78,25 @@
 		reg->inv_wr.next = NULL;
 	}
 
+	return count;
+}
+
+/* Caller must have zero-initialized *reg. */
+static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
+		struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
+		u32 sg_cnt, u32 offset)
+{
+	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
+						    qp->integrity_en);
+	u32 nents = min(sg_cnt, pages_per_mr);
+	int count = 0, ret;
+
+	reg->mr = ib_mr_pool_get(qp, &qp->rdma_mrs);
+	if (!reg->mr)
+		return -EAGAIN;
+
+	count += rdma_rw_inv_key(reg);
+
 	ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE);
 	if (ret < 0 || ret < nents) {
 		ib_mr_pool_put(qp, &qp->rdma_mrs, reg->mr);
@@ -109,7 +120,8 @@
 		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
 {
 	struct rdma_rw_reg_ctx *prev = NULL;
-	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
+						    qp->integrity_en);
 	int i, j, ret = 0, count = 0;
 
 	ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr;
@@ -178,7 +190,6 @@
 		struct scatterlist *sg, u32 sg_cnt, u32 offset,
 		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
 {
-	struct ib_device *dev = qp->pd->device;
 	u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
 		      qp->max_read_sge;
 	struct ib_sge *sge;
@@ -208,8 +219,8 @@
 		rdma_wr->wr.sg_list = sge;
 
 		for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) {
-			sge->addr = ib_sg_dma_address(dev, sg) + offset;
-			sge->length = ib_sg_dma_len(dev, sg) - offset;
+			sge->addr = sg_dma_address(sg) + offset;
+			sge->length = sg_dma_len(sg) - offset;
 			sge->lkey = qp->pd->local_dma_lkey;
 
 			total_len += sge->length;
@@ -235,14 +246,13 @@
 		struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey,
 		enum dma_data_direction dir)
 {
-	struct ib_device *dev = qp->pd->device;
 	struct ib_rdma_wr *rdma_wr = &ctx->single.wr;
 
 	ctx->nr_ops = 1;
 
 	ctx->single.sge.lkey = qp->pd->local_dma_lkey;
-	ctx->single.sge.addr = ib_sg_dma_address(dev, sg) + offset;
-	ctx->single.sge.length = ib_sg_dma_len(dev, sg) - offset;
+	ctx->single.sge.addr = sg_dma_address(sg) + offset;
+	ctx->single.sge.length = sg_dma_len(sg) - offset;
 
 	memset(rdma_wr, 0, sizeof(*rdma_wr));
 	if (dir == DMA_TO_DEVICE)
@@ -280,7 +290,11 @@
 	struct ib_device *dev = qp->pd->device;
 	int ret;
 
-	ret = ib_dma_map_sg(dev, sg, sg_cnt, dir);
+	if (is_pci_p2pdma_page(sg_page(sg)))
+		ret = pci_p2pdma_map_sg(dev->dma_device, sg, sg_cnt, dir);
+	else
+		ret = ib_dma_map_sg(dev, sg, sg_cnt, dir);
+
 	if (!ret)
 		return -ENOMEM;
 	sg_cnt = ret;
@@ -289,7 +303,7 @@
 	 * Skip to the S/G entry that sg_offset falls into:
 	 */
 	for (;;) {
-		u32 len = ib_sg_dma_len(dev, sg);
+		u32 len = sg_dma_len(sg);
 
 		if (sg_offset < len)
 			break;
@@ -348,13 +362,14 @@
 		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
 {
 	struct ib_device *dev = qp->pd->device;
-	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+	u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
+						    qp->integrity_en);
 	struct ib_rdma_wr *rdma_wr;
-	struct ib_send_wr *prev_wr = NULL;
 	int count = 0, ret;
 
 	if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) {
-		pr_err("SG count too large\n");
+		pr_err("SG count too large: sg_cnt=%d, prot_sg_cnt=%d, pages_per_mr=%d\n",
+		       sg_cnt, prot_sg_cnt, pages_per_mr);
 		return -EINVAL;
 	}
 
@@ -363,75 +378,58 @@
 		return -ENOMEM;
 	sg_cnt = ret;
 
-	ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir);
-	if (!ret) {
-		ret = -ENOMEM;
-		goto out_unmap_sg;
+	if (prot_sg_cnt) {
+		ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir);
+		if (!ret) {
+			ret = -ENOMEM;
+			goto out_unmap_sg;
+		}
+		prot_sg_cnt = ret;
 	}
-	prot_sg_cnt = ret;
 
 	ctx->type = RDMA_RW_SIG_MR;
 	ctx->nr_ops = 1;
-	ctx->sig = kcalloc(1, sizeof(*ctx->sig), GFP_KERNEL);
-	if (!ctx->sig) {
+	ctx->reg = kcalloc(1, sizeof(*ctx->reg), GFP_KERNEL);
+	if (!ctx->reg) {
 		ret = -ENOMEM;
 		goto out_unmap_prot_sg;
 	}
 
-	ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->data, sg, sg_cnt, 0);
-	if (ret < 0)
-		goto out_free_ctx;
-	count += ret;
-	prev_wr = &ctx->sig->data.reg_wr.wr;
-
-	ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->prot,
-				  prot_sg, prot_sg_cnt, 0);
-	if (ret < 0)
-		goto out_destroy_data_mr;
-	count += ret;
-
-	if (ctx->sig->prot.inv_wr.next)
-		prev_wr->next = &ctx->sig->prot.inv_wr;
-	else
-		prev_wr->next = &ctx->sig->prot.reg_wr.wr;
-	prev_wr = &ctx->sig->prot.reg_wr.wr;
-
-	ctx->sig->sig_mr = ib_mr_pool_get(qp, &qp->sig_mrs);
-	if (!ctx->sig->sig_mr) {
+	ctx->reg->mr = ib_mr_pool_get(qp, &qp->sig_mrs);
+	if (!ctx->reg->mr) {
 		ret = -EAGAIN;
-		goto out_destroy_prot_mr;
+		goto out_free_ctx;
 	}
 
-	if (ctx->sig->sig_mr->need_inval) {
-		memset(&ctx->sig->sig_inv_wr, 0, sizeof(ctx->sig->sig_inv_wr));
+	count += rdma_rw_inv_key(ctx->reg);
 
-		ctx->sig->sig_inv_wr.opcode = IB_WR_LOCAL_INV;
-		ctx->sig->sig_inv_wr.ex.invalidate_rkey = ctx->sig->sig_mr->rkey;
+	memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs));
 
-		prev_wr->next = &ctx->sig->sig_inv_wr;
-		prev_wr = &ctx->sig->sig_inv_wr;
+	ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sg_cnt, NULL, prot_sg,
+			      prot_sg_cnt, NULL, SZ_4K);
+	if (unlikely(ret)) {
+		pr_err("failed to map PI sg (%d)\n", sg_cnt + prot_sg_cnt);
+		goto out_destroy_sig_mr;
 	}
 
-	ctx->sig->sig_wr.wr.opcode = IB_WR_REG_SIG_MR;
-	ctx->sig->sig_wr.wr.wr_cqe = NULL;
-	ctx->sig->sig_wr.wr.sg_list = &ctx->sig->data.sge;
-	ctx->sig->sig_wr.wr.num_sge = 1;
-	ctx->sig->sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE;
-	ctx->sig->sig_wr.sig_attrs = sig_attrs;
-	ctx->sig->sig_wr.sig_mr = ctx->sig->sig_mr;
-	if (prot_sg_cnt)
-		ctx->sig->sig_wr.prot = &ctx->sig->prot.sge;
-	prev_wr->next = &ctx->sig->sig_wr.wr;
-	prev_wr = &ctx->sig->sig_wr.wr;
+	ctx->reg->reg_wr.wr.opcode = IB_WR_REG_MR_INTEGRITY;
+	ctx->reg->reg_wr.wr.wr_cqe = NULL;
+	ctx->reg->reg_wr.wr.num_sge = 0;
+	ctx->reg->reg_wr.wr.send_flags = 0;
+	ctx->reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
+	if (rdma_protocol_iwarp(qp->device, port_num))
+		ctx->reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
+	ctx->reg->reg_wr.mr = ctx->reg->mr;
+	ctx->reg->reg_wr.key = ctx->reg->mr->lkey;
 	count++;
 
-	ctx->sig->sig_sge.addr = 0;
-	ctx->sig->sig_sge.length = ctx->sig->data.sge.length;
-	if (sig_attrs->wire.sig_type != IB_SIG_TYPE_NONE)
-		ctx->sig->sig_sge.length += ctx->sig->prot.sge.length;
+	ctx->reg->sge.addr = ctx->reg->mr->iova;
+	ctx->reg->sge.length = ctx->reg->mr->length;
+	if (sig_attrs->wire.sig_type == IB_SIG_TYPE_NONE)
+		ctx->reg->sge.length -= ctx->reg->mr->sig_attrs->meta_length;
 
-	rdma_wr = &ctx->sig->data.wr;
-	rdma_wr->wr.sg_list = &ctx->sig->sig_sge;
+	rdma_wr = &ctx->reg->wr;
+	rdma_wr->wr.sg_list = &ctx->reg->sge;
 	rdma_wr->wr.num_sge = 1;
 	rdma_wr->remote_addr = remote_addr;
 	rdma_wr->rkey = rkey;
@@ -439,21 +437,18 @@
 		rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
 	else
 		rdma_wr->wr.opcode = IB_WR_RDMA_READ;
-	prev_wr->next = &rdma_wr->wr;
-	prev_wr = &rdma_wr->wr;
+	ctx->reg->reg_wr.wr.next = &rdma_wr->wr;
 	count++;
 
 	return count;
 
-out_destroy_prot_mr:
-	if (prot_sg_cnt)
-		ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
-out_destroy_data_mr:
-	ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
+out_destroy_sig_mr:
+	ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
 out_free_ctx:
-	kfree(ctx->sig);
+	kfree(ctx->reg);
 out_unmap_prot_sg:
-	ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
+	if (prot_sg_cnt)
+		ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
 out_unmap_sg:
 	ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
 	return ret;
@@ -496,22 +491,8 @@
 
 	switch (ctx->type) {
 	case RDMA_RW_SIG_MR:
-		rdma_rw_update_lkey(&ctx->sig->data, true);
-		if (ctx->sig->prot.mr)
-			rdma_rw_update_lkey(&ctx->sig->prot, true);
-	
-		ctx->sig->sig_mr->need_inval = true;
-		ib_update_fast_reg_key(ctx->sig->sig_mr,
-			ib_inc_rkey(ctx->sig->sig_mr->lkey));
-		ctx->sig->sig_sge.lkey = ctx->sig->sig_mr->lkey;
-
-		if (ctx->sig->data.inv_wr.next)
-			first_wr = &ctx->sig->data.inv_wr;
-		else
-			first_wr = &ctx->sig->data.reg_wr.wr;
-		last_wr = &ctx->sig->data.wr.wr;
-		break;
 	case RDMA_RW_MR:
+		/* fallthrough */
 		for (i = 0; i < ctx->nr_ops; i++) {
 			rdma_rw_update_lkey(&ctx->reg[i],
 				ctx->reg[i].wr.wr.opcode !=
@@ -602,13 +583,17 @@
 		break;
 	}
 
-	ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+	if (is_pci_p2pdma_page(sg_page(sg)))
+		pci_p2pdma_unmap_sg(qp->pd->device->dma_device, sg,
+				    sg_cnt, dir);
+	else
+		ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
 }
 EXPORT_SYMBOL(rdma_rw_ctx_destroy);
 
 /**
  * rdma_rw_ctx_destroy_signature - release all resources allocated by
- *	rdma_rw_ctx_init_signature
+ *	rdma_rw_ctx_signature_init
  * @ctx:	context to release
  * @qp:		queue pair to operate on
  * @port_num:	port num to which the connection is bound
@@ -626,16 +611,12 @@
 	if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR))
 		return;
 
-	ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
+	ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
+	kfree(ctx->reg);
+
 	ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
-
-	if (ctx->sig->prot.mr) {
-		ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
+	if (prot_sg_cnt)
 		ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
-	}
-
-	ib_mr_pool_put(qp, &qp->sig_mrs, ctx->sig->sig_mr);
-	kfree(ctx->sig);
 }
 EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature);
 
@@ -656,7 +637,7 @@
 	unsigned int mr_pages;
 
 	if (rdma_rw_can_use_mr(device, port_num))
-		mr_pages = rdma_rw_fr_page_list_len(device);
+		mr_pages = rdma_rw_fr_page_list_len(device, false);
 	else
 		mr_pages = device->attrs.max_sge_rd;
 	return DIV_ROUND_UP(maxpages, mr_pages);
@@ -682,9 +663,8 @@
 	 * we'll need two additional MRs for the registrations and the
 	 * invalidation.
 	 */
-	if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN)
-		factor += 6;	/* (inv + reg) * (data + prot + sig) */
-	else if (rdma_rw_can_use_mr(dev, attr->port_num))
+	if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN ||
+	    rdma_rw_can_use_mr(dev, attr->port_num))
 		factor += 2;	/* inv + reg */
 
 	attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs;
@@ -700,20 +680,22 @@
 int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
 {
 	struct ib_device *dev = qp->pd->device;
-	u32 nr_mrs = 0, nr_sig_mrs = 0;
+	u32 nr_mrs = 0, nr_sig_mrs = 0, max_num_sg = 0;
 	int ret = 0;
 
-	if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) {
+	if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) {
 		nr_sig_mrs = attr->cap.max_rdma_ctxs;
-		nr_mrs = attr->cap.max_rdma_ctxs * 2;
+		nr_mrs = attr->cap.max_rdma_ctxs;
+		max_num_sg = rdma_rw_fr_page_list_len(dev, true);
 	} else if (rdma_rw_can_use_mr(dev, attr->port_num)) {
 		nr_mrs = attr->cap.max_rdma_ctxs;
+		max_num_sg = rdma_rw_fr_page_list_len(dev, false);
 	}
 
 	if (nr_mrs) {
 		ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs,
 				IB_MR_TYPE_MEM_REG,
-				rdma_rw_fr_page_list_len(dev));
+				max_num_sg, 0);
 		if (ret) {
 			pr_err("%s: failed to allocated %d MRs\n",
 				__func__, nr_mrs);
@@ -723,10 +705,10 @@
 
 	if (nr_sig_mrs) {
 		ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs,
-				IB_MR_TYPE_SIGNATURE, 2);
+				IB_MR_TYPE_INTEGRITY, max_num_sg, max_num_sg);
 		if (ret) {
 			pr_err("%s: failed to allocated %d SIG MRs\n",
-				__func__, nr_mrs);
+				__func__, nr_sig_mrs);
 			goto out_free_rdma_mrs;
 		}
 	}
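
As a sketch of how the renamed flag is consumed (the attribute values below are made up for illustration): a ULP that wants integrity-protected R/W contexts sets IB_QP_CREATE_INTEGRITY_EN at QP creation time, and rdma_rw_init_mrs() then sizes both the rdma_mrs and sig_mrs pools from max_rdma_ctxs using the PI-aware page-list length.

/* Hypothetical ULP snippet -- only the fields relevant here are shown. */
struct ib_qp_init_attr init_attr = {
	.create_flags	= IB_QP_CREATE_INTEGRITY_EN,
	.cap		= {
		.max_rdma_ctxs	= 16,	/* one IB_MR_TYPE_INTEGRITY MR per ctx */
	},
	/* qp_type, send/recv CQs, etc. as usual */
};
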
diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h
index b1d4bbf..cbaaaa9 100644
--- a/drivers/infiniband/core/sa.h
+++ b/drivers/infiniband/core/sa.h
@@ -49,16 +49,14 @@
 }
 
 int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
-			     struct ib_device *device, u8 port_num,
-			     u8 method,
+			     struct ib_device *device, u8 port_num, u8 method,
 			     struct ib_sa_mcmember_rec *rec,
 			     ib_sa_comp_mask comp_mask,
-			     int timeout_ms, gfp_t gfp_mask,
+			     unsigned long timeout_ms, gfp_t gfp_mask,
 			     void (*callback)(int status,
 					      struct ib_sa_mcmember_rec *resp,
 					      void *context),
-			     void *context,
-			     struct ib_sa_query **sa_query);
+			     void *context, struct ib_sa_query **sa_query);
 
 int mcast_init(void);
 void mcast_cleanup(void);
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 7b794a1..17fc293 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -40,7 +40,7 @@
 #include <linux/slab.h>
 #include <linux/dma-mapping.h>
 #include <linux/kref.h>
-#include <linux/idr.h>
+#include <linux/xarray.h>
 #include <linux/workqueue.h>
 #include <uapi/linux/if_ether.h>
 #include <rdma/ib_pack.h>
@@ -183,8 +183,7 @@
 	.remove = ib_sa_remove_one
 };
 
-static DEFINE_SPINLOCK(idr_lock);
-static DEFINE_IDR(query_idr);
+static DEFINE_XARRAY_FLAGS(queries, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
 
 static DEFINE_SPINLOCK(tid_lock);
 static u32 tid;
@@ -761,7 +760,7 @@
 
 	/* Construct the family header first */
 	header = skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
-	memcpy(header->device_name, query->port->agent->device->name,
+	memcpy(header->device_name, dev_name(&query->port->agent->device->dev),
 	       LS_DEVICE_NAME_MAX);
 	header->port_num = query->port->port_num;
 
@@ -835,7 +834,6 @@
 	struct sk_buff *skb = NULL;
 	struct nlmsghdr *nlh;
 	void *data;
-	int ret = 0;
 	struct ib_sa_mad *mad;
 	int len;
 
@@ -862,13 +860,7 @@
 	/* Repair the nlmsg header length */
 	nlmsg_end(skb, nlh);
 
-	ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask);
-	if (!ret)
-		ret = len;
-	else
-		ret = 0;
-
-	return ret;
+	return rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_LS, gfp_mask);
 }
 
 static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask)
@@ -891,14 +883,12 @@
 	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
 
 	ret = ib_nl_send_msg(query, gfp_mask);
-	if (ret <= 0) {
+	if (ret) {
 		ret = -EIO;
 		/* Remove the request */
 		spin_lock_irqsave(&ib_nl_request_lock, flags);
 		list_del(&query->list);
 		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
-	} else {
-		ret = 0;
 	}
 
 	return ret;
@@ -1037,8 +1027,8 @@
 	    !(NETLINK_CB(skb).sk))
 		return -EPERM;
 
-	ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
-			nlmsg_len(nlh), ib_nl_policy, NULL);
+	ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+				   nlmsg_len(nlh), ib_nl_policy, NULL);
 	attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT];
 	if (ret || !attr)
 		goto settimeout_out;
@@ -1089,8 +1079,8 @@
 	if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
 		return 0;
 
-	ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
-			nlmsg_len(nlh), ib_nl_policy, NULL);
+	ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+				   nlmsg_len(nlh), ib_nl_policy, NULL);
 	if (ret)
 		return 0;
 
@@ -1156,7 +1146,7 @@
 {
 	struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
 
-	rdma_destroy_ah(sm_ah->ah);
+	rdma_destroy_ah(sm_ah->ah, 0);
 	kfree(sm_ah);
 }
 
@@ -1189,14 +1179,14 @@
 	struct ib_mad_agent *agent;
 	struct ib_mad_send_buf *mad_buf;
 
-	spin_lock_irqsave(&idr_lock, flags);
-	if (idr_find(&query_idr, id) != query) {
-		spin_unlock_irqrestore(&idr_lock, flags);
+	xa_lock_irqsave(&queries, flags);
+	if (xa_load(&queries, id) != query) {
+		xa_unlock_irqrestore(&queries, flags);
 		return;
 	}
 	agent = query->port->agent;
 	mad_buf = query->mad_buf;
-	spin_unlock_irqrestore(&idr_lock, flags);
+	xa_unlock_irqrestore(&queries, flags);
 
 	/*
 	 * If the query is still on the netlink request list, schedule
@@ -1227,46 +1217,6 @@
 	return src_path_mask;
 }
 
-static int roce_resolve_route_from_path(struct sa_path_rec *rec,
-					const struct ib_gid_attr *attr)
-{
-	struct rdma_dev_addr dev_addr = {};
-	union {
-		struct sockaddr     _sockaddr;
-		struct sockaddr_in  _sockaddr_in;
-		struct sockaddr_in6 _sockaddr_in6;
-	} sgid_addr, dgid_addr;
-	int ret;
-
-	if (rec->roce.route_resolved)
-		return 0;
-	if (!attr || !attr->ndev)
-		return -EINVAL;
-
-	dev_addr.bound_dev_if = attr->ndev->ifindex;
-	/* TODO: Use net from the ib_gid_attr once it is added to it,
-	 * until than, limit itself to init_net.
-	 */
-	dev_addr.net = &init_net;
-
-	rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid);
-	rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid);
-
-	/* validate the route */
-	ret = rdma_resolve_ip_route(&sgid_addr._sockaddr,
-				    &dgid_addr._sockaddr, &dev_addr);
-	if (ret)
-		return ret;
-
-	if ((dev_addr.network == RDMA_NETWORK_IPV4 ||
-	     dev_addr.network == RDMA_NETWORK_IPV6) &&
-	    rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2)
-		return -EINVAL;
-
-	rec->roce.route_resolved = true;
-	return 0;
-}
-
 static int init_ah_attr_grh_fields(struct ib_device *device, u8 port_num,
 				   struct sa_path_rec *rec,
 				   struct rdma_ah_attr *ah_attr,
@@ -1409,23 +1359,17 @@
 	spin_unlock_irqrestore(&tid_lock, flags);
 }
 
-static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
+static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms,
+		    gfp_t gfp_mask)
 {
-	bool preload = gfpflags_allow_blocking(gfp_mask);
 	unsigned long flags;
 	int ret, id;
 
-	if (preload)
-		idr_preload(gfp_mask);
-	spin_lock_irqsave(&idr_lock, flags);
-
-	id = idr_alloc(&query_idr, query, 0, 0, GFP_NOWAIT);
-
-	spin_unlock_irqrestore(&idr_lock, flags);
-	if (preload)
-		idr_preload_end();
-	if (id < 0)
-		return id;
+	xa_lock_irqsave(&queries, flags);
+	ret = __xa_alloc(&queries, &id, query, xa_limit_32b, gfp_mask);
+	xa_unlock_irqrestore(&queries, flags);
+	if (ret < 0)
+		return ret;
 
 	query->mad_buf->timeout_ms  = timeout_ms;
 	query->mad_buf->context[0] = query;
@@ -1433,7 +1377,7 @@
 
 	if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) &&
 	    (!(query->flags & IB_SA_QUERY_OPA))) {
-		if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) {
+		if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) {
 			if (!ib_nl_make_request(query, gfp_mask))
 				return id;
 		}
@@ -1442,9 +1386,9 @@
 
 	ret = ib_post_send_mad(query->mad_buf, NULL);
 	if (ret) {
-		spin_lock_irqsave(&idr_lock, flags);
-		idr_remove(&query_idr, id);
-		spin_unlock_irqrestore(&idr_lock, flags);
+		xa_lock_irqsave(&queries, flags);
+		__xa_erase(&queries, id);
+		xa_unlock_irqrestore(&queries, flags);
 	}
 
 	/*
@@ -1599,7 +1543,7 @@
 		       struct ib_device *device, u8 port_num,
 		       struct sa_path_rec *rec,
 		       ib_sa_comp_mask comp_mask,
-		       int timeout_ms, gfp_t gfp_mask,
+		       unsigned long timeout_ms, gfp_t gfp_mask,
 		       void (*callback)(int status,
 					struct sa_path_rec *resp,
 					void *context),
@@ -1753,7 +1697,7 @@
 			    struct ib_device *device, u8 port_num, u8 method,
 			    struct ib_sa_service_rec *rec,
 			    ib_sa_comp_mask comp_mask,
-			    int timeout_ms, gfp_t gfp_mask,
+			    unsigned long timeout_ms, gfp_t gfp_mask,
 			    void (*callback)(int status,
 					     struct ib_sa_service_rec *resp,
 					     void *context),
@@ -1850,7 +1794,7 @@
 			     u8 method,
 			     struct ib_sa_mcmember_rec *rec,
 			     ib_sa_comp_mask comp_mask,
-			     int timeout_ms, gfp_t gfp_mask,
+			     unsigned long timeout_ms, gfp_t gfp_mask,
 			     void (*callback)(int status,
 					      struct ib_sa_mcmember_rec *resp,
 					      void *context),
@@ -1941,7 +1885,7 @@
 			      struct ib_device *device, u8 port_num,
 			      struct ib_sa_guidinfo_rec *rec,
 			      ib_sa_comp_mask comp_mask, u8 method,
-			      int timeout_ms, gfp_t gfp_mask,
+			      unsigned long timeout_ms, gfp_t gfp_mask,
 			      void (*callback)(int status,
 					       struct ib_sa_guidinfo_rec *resp,
 					       void *context),
@@ -2108,7 +2052,7 @@
 }
 
 static int ib_sa_classport_info_rec_query(struct ib_sa_port *port,
-					  int timeout_ms,
+					  unsigned long timeout_ms,
 					  void (*callback)(void *context),
 					  void *context,
 					  struct ib_sa_query **sa_query)
@@ -2236,9 +2180,9 @@
 			break;
 		}
 
-	spin_lock_irqsave(&idr_lock, flags);
-	idr_remove(&query_idr, query->id);
-	spin_unlock_irqrestore(&idr_lock, flags);
+	xa_lock_irqsave(&queries, flags);
+	__xa_erase(&queries, query->id);
+	xa_unlock_irqrestore(&queries, flags);
 
 	free_mad(query);
 	if (query->client)
@@ -2324,7 +2268,8 @@
 					 cpu_to_be64(IB_SA_WELL_KNOWN_GUID));
 	}
 
-	new_ah->ah = rdma_create_ah(port->agent->qp->pd, &ah_attr);
+	new_ah->ah = rdma_create_ah(port->agent->qp->pd, &ah_attr,
+				    RDMA_CREATE_AH_SLEEPABLE);
 	if (IS_ERR(new_ah->ah)) {
 		pr_warn("Couldn't create new SM AH\n");
 		kfree(new_ah);
@@ -2389,9 +2334,7 @@
 	s = rdma_start_port(device);
 	e = rdma_end_port(device);
 
-	sa_dev = kzalloc(sizeof *sa_dev +
-			 (e - s + 1) * sizeof (struct ib_sa_port),
-			 GFP_KERNEL);
+	sa_dev = kzalloc(struct_size(sa_dev, port, e - s + 1), GFP_KERNEL);
 	if (!sa_dev)
 		return;
 
@@ -2524,5 +2467,5 @@
 	destroy_workqueue(ib_nl_wq);
 	mcast_cleanup();
 	ib_unregister_client(&sa_client);
-	idr_destroy(&query_idr);
+	WARN_ON(!xa_empty(&queries));
 }
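
The conversion above follows the usual IDR-to-XArray recipe. A minimal standalone sketch of the same locking pattern is shown below; the names (example_xa, example_store, example_erase) are illustrative, not from the patch.

#include <linux/xarray.h>

/* Sketch only: IRQ-safe ID allocation and erase with an allocating XArray. */
static DEFINE_XARRAY_FLAGS(example_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);

static int example_store(void *entry, gfp_t gfp)
{
	unsigned long flags;
	u32 id;
	int ret;

	xa_lock_irqsave(&example_xa, flags);
	ret = __xa_alloc(&example_xa, &id, entry, xa_limit_32b, gfp);
	xa_unlock_irqrestore(&example_xa, flags);

	return ret ? ret : id;
}

static void example_erase(u32 id)
{
	unsigned long flags;

	xa_lock_irqsave(&example_xa, flags);
	__xa_erase(&example_xa, id);
	xa_unlock_irqrestore(&example_xa, flags);
}
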
diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
index 9b0bea8..6eb6d27 100644
--- a/drivers/infiniband/core/security.c
+++ b/drivers/infiniband/core/security.c
@@ -39,22 +39,25 @@
 #include "core_priv.h"
 #include "mad_priv.h"
 
+static LIST_HEAD(mad_agent_list);
+/* Lock to protect mad_agent_list */
+static DEFINE_SPINLOCK(mad_agent_list_lock);
+
 static struct pkey_index_qp_list *get_pkey_idx_qp_list(struct ib_port_pkey *pp)
 {
 	struct pkey_index_qp_list *pkey = NULL;
 	struct pkey_index_qp_list *tmp_pkey;
 	struct ib_device *dev = pp->sec->dev;
 
-	spin_lock(&dev->port_pkey_list[pp->port_num].list_lock);
-	list_for_each_entry(tmp_pkey,
-			    &dev->port_pkey_list[pp->port_num].pkey_list,
-			    pkey_index_list) {
+	spin_lock(&dev->port_data[pp->port_num].pkey_list_lock);
+	list_for_each_entry(tmp_pkey, &dev->port_data[pp->port_num].pkey_list,
+			    pkey_index_list) {
 		if (tmp_pkey->pkey_index == pp->pkey_index) {
 			pkey = tmp_pkey;
 			break;
 		}
 	}
-	spin_unlock(&dev->port_pkey_list[pp->port_num].list_lock);
+	spin_unlock(&dev->port_data[pp->port_num].pkey_list_lock);
 	return pkey;
 }
 
@@ -259,12 +262,12 @@
 		if (!pkey)
 			return -ENOMEM;
 
-		spin_lock(&dev->port_pkey_list[port_num].list_lock);
+		spin_lock(&dev->port_data[port_num].pkey_list_lock);
 		/* Check for the PKey again.  A racing process may
 		 * have created it.
 		 */
 		list_for_each_entry(tmp_pkey,
-				    &dev->port_pkey_list[port_num].pkey_list,
+				    &dev->port_data[port_num].pkey_list,
 				    pkey_index_list) {
 			if (tmp_pkey->pkey_index == pp->pkey_index) {
 				kfree(pkey);
@@ -279,9 +282,9 @@
 			spin_lock_init(&pkey->qp_list_lock);
 			INIT_LIST_HEAD(&pkey->qp_list);
 			list_add(&pkey->pkey_index_list,
-				 &dev->port_pkey_list[port_num].pkey_list);
+				 &dev->port_data[port_num].pkey_list);
 		}
-		spin_unlock(&dev->port_pkey_list[port_num].list_lock);
+		spin_unlock(&dev->port_data[port_num].pkey_list_lock);
 	}
 
 	spin_lock(&pkey->qp_list_lock);
@@ -418,12 +421,15 @@
 
 int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev)
 {
-	u8 i = rdma_start_port(dev);
+	unsigned int i;
 	bool is_ib = false;
 	int ret;
 
-	while (i <= rdma_end_port(dev) && !is_ib)
-		is_ib = rdma_protocol_ib(dev, i++);
+	rdma_for_each_port(dev, i) {
+		is_ib = rdma_protocol_ib(dev, i);
+		if (is_ib)
+			break;
+	}
 
 	/* If this isn't an IB device don't create the security context */
 	if (!is_ib)
@@ -544,9 +550,8 @@
 {
 	struct pkey_index_qp_list *pkey;
 
-	list_for_each_entry(pkey,
-			    &device->port_pkey_list[port_num].pkey_list,
-			    pkey_index_list) {
+	list_for_each_entry(pkey, &device->port_data[port_num].pkey_list,
+			    pkey_index_list) {
 		check_pkey_qps(pkey,
 			       device,
 			       port_num,
@@ -554,21 +559,19 @@
 	}
 }
 
-void ib_security_destroy_port_pkey_list(struct ib_device *device)
+void ib_security_release_port_pkey_list(struct ib_device *device)
 {
 	struct pkey_index_qp_list *pkey, *tmp_pkey;
-	int i;
+	unsigned int i;
 
-	for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
-		spin_lock(&device->port_pkey_list[i].list_lock);
+	rdma_for_each_port(device, i) {
 		list_for_each_entry_safe(pkey,
 					 tmp_pkey,
-					 &device->port_pkey_list[i].pkey_list,
+					 &device->port_data[i].pkey_list,
 					 pkey_index_list) {
 			list_del(&pkey->pkey_index_list);
 			kfree(pkey);
 		}
-		spin_unlock(&device->port_pkey_list[i].list_lock);
 	}
 }
 
@@ -626,10 +629,10 @@
 	}
 
 	if (!ret)
-		ret = real_qp->device->modify_qp(real_qp,
-						 qp_attr,
-						 qp_attr_mask,
-						 udata);
+		ret = real_qp->device->ops.modify_qp(real_qp,
+						     qp_attr,
+						     qp_attr_mask,
+						     udata);
 
 	if (new_pps) {
 		/* Clean up the lists and free the appropriate
@@ -676,20 +679,18 @@
 	return security_ib_pkey_access(sec, subnet_prefix, pkey);
 }
 
-static int ib_mad_agent_security_change(struct notifier_block *nb,
-					unsigned long event,
-					void *data)
+void ib_mad_agent_security_change(void)
 {
-	struct ib_mad_agent *ag = container_of(nb, struct ib_mad_agent, lsm_nb);
+	struct ib_mad_agent *ag;
 
-	if (event != LSM_POLICY_CHANGE)
-		return NOTIFY_DONE;
-
-	ag->smp_allowed = !security_ib_endport_manage_subnet(ag->security,
-							     ag->device->name,
-							     ag->port_num);
-
-	return NOTIFY_OK;
+	spin_lock(&mad_agent_list_lock);
+	list_for_each_entry(ag,
+			    &mad_agent_list,
+			    mad_agent_sec_list)
+		WRITE_ONCE(ag->smp_allowed,
+			   !security_ib_endport_manage_subnet(ag->security,
+				dev_name(&ag->device->dev), ag->port_num));
+	spin_unlock(&mad_agent_list_lock);
 }
 
 int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
@@ -700,6 +701,8 @@
 	if (!rdma_protocol_ib(agent->device, agent->port_num))
 		return 0;
 
+	INIT_LIST_HEAD(&agent->mad_agent_sec_list);
+
 	ret = security_ib_alloc_security(&agent->security);
 	if (ret)
 		return ret;
@@ -707,20 +710,22 @@
 	if (qp_type != IB_QPT_SMI)
 		return 0;
 
+	spin_lock(&mad_agent_list_lock);
 	ret = security_ib_endport_manage_subnet(agent->security,
-						agent->device->name,
+						dev_name(&agent->device->dev),
 						agent->port_num);
 	if (ret)
-		return ret;
+		goto free_security;
 
-	agent->lsm_nb.notifier_call = ib_mad_agent_security_change;
-	ret = register_lsm_notifier(&agent->lsm_nb);
-	if (ret)
-		return ret;
-
-	agent->smp_allowed = true;
-	agent->lsm_nb_reg = true;
+	WRITE_ONCE(agent->smp_allowed, true);
+	list_add(&agent->mad_agent_sec_list, &mad_agent_list);
+	spin_unlock(&mad_agent_list_lock);
 	return 0;
+
+free_security:
+	spin_unlock(&mad_agent_list_lock);
+	security_ib_free_security(agent->security);
+	return ret;
 }
 
 void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent)
@@ -728,9 +733,13 @@
 	if (!rdma_protocol_ib(agent->device, agent->port_num))
 		return;
 
+	if (agent->qp->qp_type == IB_QPT_SMI) {
+		spin_lock(&mad_agent_list_lock);
+		list_del(&agent->mad_agent_sec_list);
+		spin_unlock(&mad_agent_list_lock);
+	}
+
 	security_ib_free_security(agent->security);
-	if (agent->lsm_nb_reg)
-		unregister_lsm_notifier(&agent->lsm_nb);
 }
 
 int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index)
@@ -739,7 +748,7 @@
 		return 0;
 
 	if (map->agent.qp->qp_type == IB_QPT_SMI) {
-		if (!map->agent.smp_allowed)
+		if (!READ_ONCE(map->agent.smp_allowed))
 			return -EACCES;
 		return 0;
 	}
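
With the per-agent LSM notifier gone, a single policy-change notification can fan out to every SMI agent through the new global list. The notifier itself is registered outside this file (e.g. in device.c); the sketch below only illustrates the expected call shape, and the handler name is an assumption.

/* Sketch only -- illustrative notifier callback, not part of this hunk. */
static int example_lsm_policy_notify(struct notifier_block *nb,
				     unsigned long event, void *lsm_data)
{
	if (event != LSM_POLICY_CHANGE)
		return NOTIFY_DONE;

	/* re-evaluates smp_allowed for every agent on mad_agent_list */
	ib_mad_agent_security_change();
	return NOTIFY_OK;
}
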
diff --git a/drivers/infiniband/core/smi.h b/drivers/infiniband/core/smi.h
index 33c91c8..91d9b35 100644
--- a/drivers/infiniband/core/smi.h
+++ b/drivers/infiniband/core/smi.h
@@ -67,7 +67,7 @@
 {
 	/* C14-9:3 -- We're at the end of the DR segment of path */
 	/* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */
-	return ((device->process_mad &&
+	return ((device->ops.process_mad &&
 		!ib_get_smp_direction(smp) &&
 		(smp->hop_ptr == smp->hop_cnt + 1)) ?
 		IB_SMI_HANDLE : IB_SMI_DISCARD);
@@ -82,7 +82,7 @@
 {
 	/* C14-13:3 -- We're at the end of the DR segment of path */
 	/* C14-13:4 -- Hop Pointer == 0 -> give to SM */
-	return ((device->process_mad &&
+	return ((device->ops.process_mad &&
 		ib_get_smp_direction(smp) &&
 		!smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD);
 }
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index ace40bb..7a50ced 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -43,6 +43,7 @@
 #include <rdma/ib_mad.h>
 #include <rdma/ib_pma.h>
 #include <rdma/ib_cache.h>
+#include <rdma/rdma_counter.h>
 
 struct ib_port;
 
@@ -288,6 +289,24 @@
 		       ib_width_enum_to_int(attr.active_width), speed);
 }
 
+static const char *phys_state_to_str(enum ib_port_phys_state phys_state)
+{
+	static const char * phys_state_str[] = {
+		"<unknown>",
+		"Sleep",
+		"Polling",
+		"Disabled",
+		"PortConfigurationTraining",
+		"LinkUp",
+		"LinkErrorRecovery",
+		"Phy Test",
+	};
+
+	if (phys_state < ARRAY_SIZE(phys_state_str))
+		return phys_state_str[phys_state];
+	return "<unknown>";
+}
+
 static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused,
 			       char *buf)
 {
@@ -299,16 +318,8 @@
 	if (ret)
 		return ret;
 
-	switch (attr.phys_state) {
-	case 1:  return sprintf(buf, "1: Sleep\n");
-	case 2:  return sprintf(buf, "2: Polling\n");
-	case 3:  return sprintf(buf, "3: Disabled\n");
-	case 4:  return sprintf(buf, "4: PortConfigurationTraining\n");
-	case 5:  return sprintf(buf, "5: LinkUp\n");
-	case 6:  return sprintf(buf, "6: LinkErrorRecovery\n");
-	case 7:  return sprintf(buf, "7: Phy Test\n");
-	default: return sprintf(buf, "%d: <unknown>\n", attr.phys_state);
-	}
+	return sprintf(buf, "%d: %s\n", attr.phys_state,
+		       phys_state_to_str(attr.phys_state));
 }
 
 static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused,
@@ -349,10 +360,15 @@
 
 static size_t print_ndev(const struct ib_gid_attr *gid_attr, char *buf)
 {
-	if (!gid_attr->ndev)
-		return -EINVAL;
+	struct net_device *ndev;
+	size_t ret = -EINVAL;
 
-	return sprintf(buf, "%s\n", gid_attr->ndev->name);
+	rcu_read_lock();
+	ndev = rcu_dereference(gid_attr->ndev);
+	if (ndev)
+		ret = sprintf(buf, "%s\n", ndev->name);
+	rcu_read_unlock();
+	return ret;
 }
 
 static size_t print_gid_type(const struct ib_gid_attr *gid_attr, char *buf)
@@ -462,7 +478,7 @@
 	u16 out_mad_pkey_index = 0;
 	ssize_t ret;
 
-	if (!dev->process_mad)
+	if (!dev->ops.process_mad)
 		return -ENOSYS;
 
 	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
@@ -481,11 +497,11 @@
 	if (attr != IB_PMA_CLASS_PORT_INFO)
 		in_mad->data[41] = port_num;	/* PortSelect field */
 
-	if ((dev->process_mad(dev, IB_MAD_IGNORE_MKEY,
-		 port_num, NULL, NULL,
-		 (const struct ib_mad_hdr *)in_mad, mad_size,
-		 (struct ib_mad_hdr *)out_mad, &mad_size,
-		 &out_mad_pkey_index) &
+	if ((dev->ops.process_mad(dev, IB_MAD_IGNORE_MKEY,
+				  port_num, NULL, NULL,
+				  (const struct ib_mad_hdr *)in_mad, mad_size,
+				  (struct ib_mad_hdr *)out_mad, &mad_size,
+				  &out_mad_pkey_index) &
 	     (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) !=
 	    (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) {
 		ret = -EINVAL;
@@ -786,7 +802,7 @@
 
 	if (time_is_after_eq_jiffies(stats->timestamp + stats->lifespan))
 		return 0;
-	ret = dev->get_hw_stats(dev, stats, port_num, index);
+	ret = dev->ops.get_hw_stats(dev, stats, port_num, index);
 	if (ret < 0)
 		return ret;
 	if (ret == stats->num_counters)
@@ -795,9 +811,12 @@
 	return 0;
 }
 
-static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf)
+static ssize_t print_hw_stat(struct ib_device *dev, int port_num,
+			     struct rdma_hw_stats *stats, int index, char *buf)
 {
-	return sprintf(buf, "%llu\n", stats->value[index]);
+	u64 v = rdma_counter_get_hwstat_value(dev, port_num, index);
+
+	return sprintf(buf, "%llu\n", stats->value[index] + v);
 }
 
 static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr,
@@ -823,7 +842,7 @@
 	ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index);
 	if (ret)
 		goto unlock;
-	ret = print_hw_stat(stats, hsa->index, buf);
+	ret = print_hw_stat(dev, hsa->port_num, stats, hsa->index, buf);
 unlock:
 	mutex_unlock(&stats->lock);
 
@@ -946,7 +965,7 @@
 	struct rdma_hw_stats *stats;
 	int i, ret;
 
-	stats = device->alloc_hw_stats(device, port_num);
+	stats = device->ops.alloc_hw_stats(device, port_num);
 
 	if (!stats)
 		return;
@@ -964,8 +983,8 @@
 	if (!hsag)
 		goto err_free_stats;
 
-	ret = device->get_hw_stats(device, stats, port_num,
-				   stats->num_counters);
+	ret = device->ops.get_hw_stats(device, stats, port_num,
+				       stats->num_counters);
 	if (ret != stats->num_counters)
 		goto err_free_hsag;
 
@@ -994,6 +1013,8 @@
 			goto err;
 		port->hw_stats_ag = hsag;
 		port->hw_stats = stats;
+		if (device->port_data)
+			device->port_data[port_num].hw_stats = stats;
 	} else {
 		struct kobject *kobj = &device->dev.kobj;
 		ret = sysfs_create_group(kobj, hsag);
@@ -1015,10 +1036,10 @@
 	return;
 }
 
-static int add_port(struct ib_device *device, int port_num,
-		    int (*port_callback)(struct ib_device *,
-					 u8, struct kobject *))
+static int add_port(struct ib_core_device *coredev, int port_num)
 {
+	struct ib_device *device = rdma_device_to_ibdev(&coredev->dev);
+	bool is_full_dev = &device->coredev == coredev;
 	struct ib_port *p;
 	struct ib_port_attr attr;
 	int i;
@@ -1036,7 +1057,7 @@
 	p->port_num   = port_num;
 
 	ret = kobject_init_and_add(&p->kobj, &port_type,
-				   device->ports_parent,
+				   coredev->ports_kobj,
 				   "%d", port_num);
 	if (ret) {
 		kfree(p);
@@ -1057,7 +1078,7 @@
 		goto err_put;
 	}
 
-	if (device->process_mad) {
+	if (device->ops.process_mad && is_full_dev) {
 		p->pma_table = get_counter_table(device, port_num);
 		ret = sysfs_create_group(&p->kobj, p->pma_table);
 		if (ret)
@@ -1113,21 +1134,21 @@
 	if (ret)
 		goto err_free_pkey;
 
-	if (port_callback) {
-		ret = port_callback(device, port_num, &p->kobj);
+	if (device->ops.init_port && is_full_dev) {
+		ret = device->ops.init_port(device, port_num, &p->kobj);
 		if (ret)
 			goto err_remove_pkey;
 	}
 
 	/*
-	 * If port == 0, it means we have only one port and the parent
-	 * device, not this port device, should be the holder of the
-	 * hw_counters
+	 * If port == 0, the hw_counters are per device and not per
+	 * port, so the device should be their holder. Therefore skip
+	 * the per-port counter initialization.
 	 */
-	if (device->alloc_hw_stats && port_num)
+	if (device->ops.alloc_hw_stats && port_num && is_full_dev)
 		setup_hw_stats(device, p, port_num);
 
-	list_add_tail(&p->kobj.entry, &device->port_list);
+	list_add_tail(&p->kobj.entry, &coredev->port_list);
 
 	kobject_uevent(&p->kobj, KOBJ_ADD);
 	return 0;
@@ -1186,26 +1207,28 @@
 	return ret;
 }
 
-static ssize_t show_node_type(struct device *device,
+static ssize_t node_type_show(struct device *device,
 			      struct device_attribute *attr, char *buf)
 {
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	struct ib_device *dev = rdma_device_to_ibdev(device);
 
 	switch (dev->node_type) {
 	case RDMA_NODE_IB_CA:	  return sprintf(buf, "%d: CA\n", dev->node_type);
 	case RDMA_NODE_RNIC:	  return sprintf(buf, "%d: RNIC\n", dev->node_type);
 	case RDMA_NODE_USNIC:	  return sprintf(buf, "%d: usNIC\n", dev->node_type);
 	case RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type);
+	case RDMA_NODE_UNSPECIFIED: return sprintf(buf, "%d: unspecified\n", dev->node_type);
 	case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type);
 	case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type);
 	default:		  return sprintf(buf, "%d: <unknown>\n", dev->node_type);
 	}
 }
+static DEVICE_ATTR_RO(node_type);
 
-static ssize_t show_sys_image_guid(struct device *device,
+static ssize_t sys_image_guid_show(struct device *device,
 				   struct device_attribute *dev_attr, char *buf)
 {
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	struct ib_device *dev = rdma_device_to_ibdev(device);
 
 	return sprintf(buf, "%04x:%04x:%04x:%04x\n",
 		       be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]),
@@ -1213,11 +1236,12 @@
 		       be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]),
 		       be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3]));
 }
+static DEVICE_ATTR_RO(sys_image_guid);
 
-static ssize_t show_node_guid(struct device *device,
+static ssize_t node_guid_show(struct device *device,
 			      struct device_attribute *attr, char *buf)
 {
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	struct ib_device *dev = rdma_device_to_ibdev(device);
 
 	return sprintf(buf, "%04x:%04x:%04x:%04x\n",
 		       be16_to_cpu(((__be16 *) &dev->node_guid)[0]),
@@ -1225,24 +1249,25 @@
 		       be16_to_cpu(((__be16 *) &dev->node_guid)[2]),
 		       be16_to_cpu(((__be16 *) &dev->node_guid)[3]));
 }
+static DEVICE_ATTR_RO(node_guid);
 
-static ssize_t show_node_desc(struct device *device,
+static ssize_t node_desc_show(struct device *device,
 			      struct device_attribute *attr, char *buf)
 {
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	struct ib_device *dev = rdma_device_to_ibdev(device);
 
 	return sprintf(buf, "%.64s\n", dev->node_desc);
 }
 
-static ssize_t set_node_desc(struct device *device,
-			     struct device_attribute *attr,
-			     const char *buf, size_t count)
+static ssize_t node_desc_store(struct device *device,
+			       struct device_attribute *attr,
+			       const char *buf, size_t count)
 {
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	struct ib_device *dev = rdma_device_to_ibdev(device);
 	struct ib_device_modify desc = {};
 	int ret;
 
-	if (!dev->modify_device)
+	if (!dev->ops.modify_device)
 		return -EIO;
 
 	memcpy(desc.node_desc, buf, min_t(int, count, IB_DEVICE_NODE_DESC_MAX));
@@ -1252,42 +1277,47 @@
 
 	return count;
 }
+static DEVICE_ATTR_RW(node_desc);
 
-static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr,
 			   char *buf)
 {
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	struct ib_device *dev = rdma_device_to_ibdev(device);
 
 	ib_get_device_fw_str(dev, buf);
 	strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX);
 	return strlen(buf);
 }
+static DEVICE_ATTR_RO(fw_ver);
 
-static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
-static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL);
-static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL);
-static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
-
-static struct device_attribute *ib_class_attributes[] = {
-	&dev_attr_node_type,
-	&dev_attr_sys_image_guid,
-	&dev_attr_node_guid,
-	&dev_attr_node_desc,
-	&dev_attr_fw_ver,
+static struct attribute *ib_dev_attrs[] = {
+	&dev_attr_node_type.attr,
+	&dev_attr_node_guid.attr,
+	&dev_attr_sys_image_guid.attr,
+	&dev_attr_fw_ver.attr,
+	&dev_attr_node_desc.attr,
+	NULL,
 };
 
-static void free_port_list_attributes(struct ib_device *device)
+const struct attribute_group ib_dev_attr_group = {
+	.attrs = ib_dev_attrs,
+};
+
+void ib_free_port_attrs(struct ib_core_device *coredev)
 {
+	struct ib_device *device = rdma_device_to_ibdev(&coredev->dev);
+	bool is_full_dev = &device->coredev == coredev;
 	struct kobject *p, *t;
 
-	list_for_each_entry_safe(p, t, &device->port_list, entry) {
+	list_for_each_entry_safe(p, t, &coredev->port_list, entry) {
 		struct ib_port *port = container_of(p, struct ib_port, kobj);
+
 		list_del(&p->entry);
-		if (port->hw_stats) {
-			kfree(port->hw_stats);
+		if (port->hw_stats_ag)
 			free_hsag(&port->kobj, port->hw_stats_ag);
-		}
+		kfree(port->hw_stats);
+		if (device->port_data && is_full_dev)
+			device->port_data[port->port_num].hw_stats = NULL;
 
 		if (port->pma_table)
 			sysfs_remove_group(p, port->pma_table);
@@ -1301,81 +1331,95 @@
 		kobject_put(p);
 	}
 
-	kobject_put(device->ports_parent);
+	kobject_put(coredev->ports_kobj);
 }
 
-int ib_device_register_sysfs(struct ib_device *device,
-			     int (*port_callback)(struct ib_device *,
-						  u8, struct kobject *))
+int ib_setup_port_attrs(struct ib_core_device *coredev)
 {
-	struct device *class_dev = &device->dev;
+	struct ib_device *device = rdma_device_to_ibdev(&coredev->dev);
+	unsigned int port;
 	int ret;
-	int i;
 
-	ret = dev_set_name(class_dev, "%s", device->name);
-	if (ret)
-		return ret;
+	coredev->ports_kobj = kobject_create_and_add("ports",
+						     &coredev->dev.kobj);
+	if (!coredev->ports_kobj)
+		return -ENOMEM;
 
-	ret = device_add(class_dev);
-	if (ret)
-		goto err;
-
-	for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) {
-		ret = device_create_file(class_dev, ib_class_attributes[i]);
-		if (ret)
-			goto err_unregister;
-	}
-
-	device->ports_parent = kobject_create_and_add("ports",
-						      &class_dev->kobj);
-	if (!device->ports_parent) {
-		ret = -ENOMEM;
-		goto err_put;
-	}
-
-	if (rdma_cap_ib_switch(device)) {
-		ret = add_port(device, 0, port_callback);
+	rdma_for_each_port(device, port) {
+		ret = add_port(coredev, port);
 		if (ret)
 			goto err_put;
-	} else {
-		for (i = 1; i <= device->phys_port_cnt; ++i) {
-			ret = add_port(device, i, port_callback);
-			if (ret)
-				goto err_put;
-		}
 	}
 
-	if (device->alloc_hw_stats)
-		setup_hw_stats(device, NULL, 0);
-
 	return 0;
 
 err_put:
-	free_port_list_attributes(device);
-
-err_unregister:
-	device_del(class_dev);
-
-err:
+	ib_free_port_attrs(coredev);
 	return ret;
 }
 
+int ib_device_register_sysfs(struct ib_device *device)
+{
+	int ret;
+
+	ret = ib_setup_port_attrs(&device->coredev);
+	if (ret)
+		return ret;
+
+	if (device->ops.alloc_hw_stats)
+		setup_hw_stats(device, NULL, 0);
+
+	return 0;
+}
+
 void ib_device_unregister_sysfs(struct ib_device *device)
 {
-	int i;
-
-	/* Hold kobject until ib_dealloc_device() */
-	kobject_get(&device->dev.kobj);
-
-	free_port_list_attributes(device);
-
-	if (device->hw_stats) {
-		kfree(device->hw_stats);
+	if (device->hw_stats_ag)
 		free_hsag(&device->dev.kobj, device->hw_stats_ag);
+	kfree(device->hw_stats);
+
+	ib_free_port_attrs(&device->coredev);
+}
+
+/**
+ * ib_port_register_module_stat - add module counters under the relevant port
+ *  of the IB device.
+ *
+ * @device: IB device to add counters
+ * @port_num: valid port number
+ * @kobj: pointer to the kobject to initialize
+ * @ktype: pointer to the ktype for this kobject.
+ * @name: the name of the kobject
+ */
+int ib_port_register_module_stat(struct ib_device *device, u8 port_num,
+				 struct kobject *kobj, struct kobj_type *ktype,
+				 const char *name)
+{
+	struct kobject *p, *t;
+	int ret;
+
+	list_for_each_entry_safe(p, t, &device->coredev.port_list, entry) {
+		struct ib_port *port = container_of(p, struct ib_port, kobj);
+
+		if (port->port_num != port_num)
+			continue;
+
+		ret = kobject_init_and_add(kobj, ktype, &port->kobj, "%s",
+					   name);
+		if (ret)
+			return ret;
 	}
 
-	for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i)
-		device_remove_file(&device->dev, ib_class_attributes[i]);
-
-	device_unregister(&device->dev);
+	return 0;
 }
+EXPORT_SYMBOL(ib_port_register_module_stat);
+
+/**
+ * ib_port_unregister_module_stat - release module counters
+ * @kobj: pointer to the kobject to release
+ */
+void ib_port_unregister_module_stat(struct kobject *kobj)
+{
+	kobject_put(kobj);
+}
+EXPORT_SYMBOL(ib_port_unregister_module_stat);
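
A hypothetical consumer of the new module-stat helpers might wire a per-port kobject as below; the ktype, structure, and names are assumptions for illustration and are not defined by this patch.

/* Sketch only: hypothetical per-port counter registration by a ULP/module. */
static struct kobj_type example_stat_ktype;	/* .sysfs_ops/.release supplied elsewhere */

struct example_port_stats {
	struct kobject kobj;
	/* counters ... */
};

static int example_register_port_stats(struct ib_device *dev, u8 port_num,
					struct example_port_stats *stats)
{
	return ib_port_register_module_stat(dev, port_num, &stats->kobj,
					    &example_stat_ktype, "example_stats");
}

static void example_unregister_port_stats(struct example_port_stats *stats)
{
	ib_port_unregister_module_stat(&stats->kobj);
}
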
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
deleted file mode 100644
index 73332b9..0000000
--- a/drivers/infiniband/core/ucm.c
+++ /dev/null
@@ -1,1359 +0,0 @@
-/*
- * Copyright (c) 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *	copyright notice, this list of conditions and the following
- *	disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *	copyright notice, this list of conditions and the following
- *	disclaimer in the documentation and/or other materials
- *	provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/completion.h>
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/err.h>
-#include <linux/poll.h>
-#include <linux/sched.h>
-#include <linux/file.h>
-#include <linux/mount.h>
-#include <linux/cdev.h>
-#include <linux/idr.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-
-#include <linux/nospec.h>
-
-#include <linux/uaccess.h>
-
-#include <rdma/ib.h>
-#include <rdma/ib_cm.h>
-#include <rdma/ib_user_cm.h>
-#include <rdma/ib_marshall.h>
-
-#include "core_priv.h"
-
-MODULE_AUTHOR("Libor Michalek");
-MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access");
-MODULE_LICENSE("Dual BSD/GPL");
-
-struct ib_ucm_device {
-	int			devnum;
-	struct cdev		cdev;
-	struct device		dev;
-	struct ib_device	*ib_dev;
-};
-
-struct ib_ucm_file {
-	struct mutex file_mutex;
-	struct file *filp;
-	struct ib_ucm_device *device;
-
-	struct list_head  ctxs;
-	struct list_head  events;
-	wait_queue_head_t poll_wait;
-};
-
-struct ib_ucm_context {
-	int                 id;
-	struct completion   comp;
-	atomic_t            ref;
-	int		    events_reported;
-
-	struct ib_ucm_file *file;
-	struct ib_cm_id    *cm_id;
-	__u64		   uid;
-
-	struct list_head    events;    /* list of pending events. */
-	struct list_head    file_list; /* member in file ctx list */
-};
-
-struct ib_ucm_event {
-	struct ib_ucm_context *ctx;
-	struct list_head file_list; /* member in file event list */
-	struct list_head ctx_list;  /* member in ctx event list */
-
-	struct ib_cm_id *cm_id;
-	struct ib_ucm_event_resp resp;
-	void *data;
-	void *info;
-	int data_len;
-	int info_len;
-};
-
-enum {
-	IB_UCM_MAJOR = 231,
-	IB_UCM_BASE_MINOR = 224,
-	IB_UCM_MAX_DEVICES = RDMA_MAX_PORTS,
-	IB_UCM_NUM_FIXED_MINOR = 32,
-	IB_UCM_NUM_DYNAMIC_MINOR = IB_UCM_MAX_DEVICES - IB_UCM_NUM_FIXED_MINOR,
-};
-
-#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
-static dev_t dynamic_ucm_dev;
-
-static void ib_ucm_add_one(struct ib_device *device);
-static void ib_ucm_remove_one(struct ib_device *device, void *client_data);
-
-static struct ib_client ucm_client = {
-	.name   = "ucm",
-	.add    = ib_ucm_add_one,
-	.remove = ib_ucm_remove_one
-};
-
-static DEFINE_MUTEX(ctx_id_mutex);
-static DEFINE_IDR(ctx_id_table);
-static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES);
-
-static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id)
-{
-	struct ib_ucm_context *ctx;
-
-	mutex_lock(&ctx_id_mutex);
-	ctx = idr_find(&ctx_id_table, id);
-	if (!ctx)
-		ctx = ERR_PTR(-ENOENT);
-	else if (ctx->file != file)
-		ctx = ERR_PTR(-EINVAL);
-	else
-		atomic_inc(&ctx->ref);
-	mutex_unlock(&ctx_id_mutex);
-
-	return ctx;
-}
-
-static void ib_ucm_ctx_put(struct ib_ucm_context *ctx)
-{
-	if (atomic_dec_and_test(&ctx->ref))
-		complete(&ctx->comp);
-}
-
-static inline int ib_ucm_new_cm_id(int event)
-{
-	return event == IB_CM_REQ_RECEIVED || event == IB_CM_SIDR_REQ_RECEIVED;
-}
-
-static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx)
-{
-	struct ib_ucm_event *uevent;
-
-	mutex_lock(&ctx->file->file_mutex);
-	list_del(&ctx->file_list);
-	while (!list_empty(&ctx->events)) {
-
-		uevent = list_entry(ctx->events.next,
-				    struct ib_ucm_event, ctx_list);
-		list_del(&uevent->file_list);
-		list_del(&uevent->ctx_list);
-		mutex_unlock(&ctx->file->file_mutex);
-
-		/* clear incoming connections. */
-		if (ib_ucm_new_cm_id(uevent->resp.event))
-			ib_destroy_cm_id(uevent->cm_id);
-
-		kfree(uevent);
-		mutex_lock(&ctx->file->file_mutex);
-	}
-	mutex_unlock(&ctx->file->file_mutex);
-}
-
-static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file)
-{
-	struct ib_ucm_context *ctx;
-
-	ctx = kzalloc(sizeof *ctx, GFP_KERNEL);
-	if (!ctx)
-		return NULL;
-
-	atomic_set(&ctx->ref, 1);
-	init_completion(&ctx->comp);
-	ctx->file = file;
-	INIT_LIST_HEAD(&ctx->events);
-
-	mutex_lock(&ctx_id_mutex);
-	ctx->id = idr_alloc(&ctx_id_table, ctx, 0, 0, GFP_KERNEL);
-	mutex_unlock(&ctx_id_mutex);
-	if (ctx->id < 0)
-		goto error;
-
-	list_add_tail(&ctx->file_list, &file->ctxs);
-	return ctx;
-
-error:
-	kfree(ctx);
-	return NULL;
-}
-
-static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq,
-				 const struct ib_cm_req_event_param *kreq)
-{
-	ureq->remote_ca_guid             = kreq->remote_ca_guid;
-	ureq->remote_qkey                = kreq->remote_qkey;
-	ureq->remote_qpn                 = kreq->remote_qpn;
-	ureq->qp_type                    = kreq->qp_type;
-	ureq->starting_psn               = kreq->starting_psn;
-	ureq->responder_resources        = kreq->responder_resources;
-	ureq->initiator_depth            = kreq->initiator_depth;
-	ureq->local_cm_response_timeout  = kreq->local_cm_response_timeout;
-	ureq->flow_control               = kreq->flow_control;
-	ureq->remote_cm_response_timeout = kreq->remote_cm_response_timeout;
-	ureq->retry_count                = kreq->retry_count;
-	ureq->rnr_retry_count            = kreq->rnr_retry_count;
-	ureq->srq                        = kreq->srq;
-	ureq->port			 = kreq->port;
-
-	ib_copy_path_rec_to_user(&ureq->primary_path, kreq->primary_path);
-	if (kreq->alternate_path)
-		ib_copy_path_rec_to_user(&ureq->alternate_path,
-					 kreq->alternate_path);
-}
-
-static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep,
-				 const struct ib_cm_rep_event_param *krep)
-{
-	urep->remote_ca_guid      = krep->remote_ca_guid;
-	urep->remote_qkey         = krep->remote_qkey;
-	urep->remote_qpn          = krep->remote_qpn;
-	urep->starting_psn        = krep->starting_psn;
-	urep->responder_resources = krep->responder_resources;
-	urep->initiator_depth     = krep->initiator_depth;
-	urep->target_ack_delay    = krep->target_ack_delay;
-	urep->failover_accepted   = krep->failover_accepted;
-	urep->flow_control        = krep->flow_control;
-	urep->rnr_retry_count     = krep->rnr_retry_count;
-	urep->srq                 = krep->srq;
-}
-
-static void ib_ucm_event_sidr_rep_get(struct ib_ucm_sidr_rep_event_resp *urep,
-				      const struct ib_cm_sidr_rep_event_param *krep)
-{
-	urep->status = krep->status;
-	urep->qkey   = krep->qkey;
-	urep->qpn    = krep->qpn;
-};
-
-static int ib_ucm_event_process(const struct ib_cm_event *evt,
-				struct ib_ucm_event *uvt)
-{
-	void *info = NULL;
-
-	switch (evt->event) {
-	case IB_CM_REQ_RECEIVED:
-		ib_ucm_event_req_get(&uvt->resp.u.req_resp,
-				     &evt->param.req_rcvd);
-		uvt->data_len      = IB_CM_REQ_PRIVATE_DATA_SIZE;
-		uvt->resp.present  = IB_UCM_PRES_PRIMARY;
-		uvt->resp.present |= (evt->param.req_rcvd.alternate_path ?
-				      IB_UCM_PRES_ALTERNATE : 0);
-		break;
-	case IB_CM_REP_RECEIVED:
-		ib_ucm_event_rep_get(&uvt->resp.u.rep_resp,
-				     &evt->param.rep_rcvd);
-		uvt->data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
-		break;
-	case IB_CM_RTU_RECEIVED:
-		uvt->data_len = IB_CM_RTU_PRIVATE_DATA_SIZE;
-		uvt->resp.u.send_status = evt->param.send_status;
-		break;
-	case IB_CM_DREQ_RECEIVED:
-		uvt->data_len = IB_CM_DREQ_PRIVATE_DATA_SIZE;
-		uvt->resp.u.send_status = evt->param.send_status;
-		break;
-	case IB_CM_DREP_RECEIVED:
-		uvt->data_len = IB_CM_DREP_PRIVATE_DATA_SIZE;
-		uvt->resp.u.send_status = evt->param.send_status;
-		break;
-	case IB_CM_MRA_RECEIVED:
-		uvt->resp.u.mra_resp.timeout =
-					evt->param.mra_rcvd.service_timeout;
-		uvt->data_len = IB_CM_MRA_PRIVATE_DATA_SIZE;
-		break;
-	case IB_CM_REJ_RECEIVED:
-		uvt->resp.u.rej_resp.reason = evt->param.rej_rcvd.reason;
-		uvt->data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
-		uvt->info_len = evt->param.rej_rcvd.ari_length;
-		info	      = evt->param.rej_rcvd.ari;
-		break;
-	case IB_CM_LAP_RECEIVED:
-		ib_copy_path_rec_to_user(&uvt->resp.u.lap_resp.path,
-					 evt->param.lap_rcvd.alternate_path);
-		uvt->data_len = IB_CM_LAP_PRIVATE_DATA_SIZE;
-		uvt->resp.present = IB_UCM_PRES_ALTERNATE;
-		break;
-	case IB_CM_APR_RECEIVED:
-		uvt->resp.u.apr_resp.status = evt->param.apr_rcvd.ap_status;
-		uvt->data_len = IB_CM_APR_PRIVATE_DATA_SIZE;
-		uvt->info_len = evt->param.apr_rcvd.info_len;
-		info	      = evt->param.apr_rcvd.apr_info;
-		break;
-	case IB_CM_SIDR_REQ_RECEIVED:
-		uvt->resp.u.sidr_req_resp.pkey =
-					evt->param.sidr_req_rcvd.pkey;
-		uvt->resp.u.sidr_req_resp.port =
-					evt->param.sidr_req_rcvd.port;
-		uvt->data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE;
-		break;
-	case IB_CM_SIDR_REP_RECEIVED:
-		ib_ucm_event_sidr_rep_get(&uvt->resp.u.sidr_rep_resp,
-					  &evt->param.sidr_rep_rcvd);
-		uvt->data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
-		uvt->info_len = evt->param.sidr_rep_rcvd.info_len;
-		info	      = evt->param.sidr_rep_rcvd.info;
-		break;
-	default:
-		uvt->resp.u.send_status = evt->param.send_status;
-		break;
-	}
-
-	if (uvt->data_len) {
-		uvt->data = kmemdup(evt->private_data, uvt->data_len, GFP_KERNEL);
-		if (!uvt->data)
-			goto err1;
-
-		uvt->resp.present |= IB_UCM_PRES_DATA;
-	}
-
-	if (uvt->info_len) {
-		uvt->info = kmemdup(info, uvt->info_len, GFP_KERNEL);
-		if (!uvt->info)
-			goto err2;
-
-		uvt->resp.present |= IB_UCM_PRES_INFO;
-	}
-	return 0;
-
-err2:
-	kfree(uvt->data);
-err1:
-	return -ENOMEM;
-}
-
-static int ib_ucm_event_handler(struct ib_cm_id *cm_id,
-				const struct ib_cm_event *event)
-{
-	struct ib_ucm_event *uevent;
-	struct ib_ucm_context *ctx;
-	int result = 0;
-
-	ctx = cm_id->context;
-
-	uevent = kzalloc(sizeof *uevent, GFP_KERNEL);
-	if (!uevent)
-		goto err1;
-
-	uevent->ctx = ctx;
-	uevent->cm_id = cm_id;
-	uevent->resp.uid = ctx->uid;
-	uevent->resp.id = ctx->id;
-	uevent->resp.event = event->event;
-
-	result = ib_ucm_event_process(event, uevent);
-	if (result)
-		goto err2;
-
-	mutex_lock(&ctx->file->file_mutex);
-	list_add_tail(&uevent->file_list, &ctx->file->events);
-	list_add_tail(&uevent->ctx_list, &ctx->events);
-	wake_up_interruptible(&ctx->file->poll_wait);
-	mutex_unlock(&ctx->file->file_mutex);
-	return 0;
-
-err2:
-	kfree(uevent);
-err1:
-	/* Destroy new cm_id's */
-	return ib_ucm_new_cm_id(event->event);
-}
-
-static ssize_t ib_ucm_event(struct ib_ucm_file *file,
-			    const char __user *inbuf,
-			    int in_len, int out_len)
-{
-	struct ib_ucm_context *ctx;
-	struct ib_ucm_event_get cmd;
-	struct ib_ucm_event *uevent;
-	int result = 0;
-
-	if (out_len < sizeof(struct ib_ucm_event_resp))
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	mutex_lock(&file->file_mutex);
-	while (list_empty(&file->events)) {
-		mutex_unlock(&file->file_mutex);
-
-		if (file->filp->f_flags & O_NONBLOCK)
-			return -EAGAIN;
-
-		if (wait_event_interruptible(file->poll_wait,
-					     !list_empty(&file->events)))
-			return -ERESTARTSYS;
-
-		mutex_lock(&file->file_mutex);
-	}
-
-	uevent = list_entry(file->events.next, struct ib_ucm_event, file_list);
-
-	if (ib_ucm_new_cm_id(uevent->resp.event)) {
-		ctx = ib_ucm_ctx_alloc(file);
-		if (!ctx) {
-			result = -ENOMEM;
-			goto done;
-		}
-
-		ctx->cm_id = uevent->cm_id;
-		ctx->cm_id->context = ctx;
-		uevent->resp.id = ctx->id;
-	}
-
-	if (copy_to_user(u64_to_user_ptr(cmd.response),
-			 &uevent->resp, sizeof(uevent->resp))) {
-		result = -EFAULT;
-		goto done;
-	}
-
-	if (uevent->data) {
-		if (cmd.data_len < uevent->data_len) {
-			result = -ENOMEM;
-			goto done;
-		}
-		if (copy_to_user(u64_to_user_ptr(cmd.data),
-				 uevent->data, uevent->data_len)) {
-			result = -EFAULT;
-			goto done;
-		}
-	}
-
-	if (uevent->info) {
-		if (cmd.info_len < uevent->info_len) {
-			result = -ENOMEM;
-			goto done;
-		}
-		if (copy_to_user(u64_to_user_ptr(cmd.info),
-				 uevent->info, uevent->info_len)) {
-			result = -EFAULT;
-			goto done;
-		}
-	}
-
-	list_del(&uevent->file_list);
-	list_del(&uevent->ctx_list);
-	uevent->ctx->events_reported++;
-
-	kfree(uevent->data);
-	kfree(uevent->info);
-	kfree(uevent);
-done:
-	mutex_unlock(&file->file_mutex);
-	return result;
-}
-
-static ssize_t ib_ucm_create_id(struct ib_ucm_file *file,
-				const char __user *inbuf,
-				int in_len, int out_len)
-{
-	struct ib_ucm_create_id cmd;
-	struct ib_ucm_create_id_resp resp;
-	struct ib_ucm_context *ctx;
-	int result;
-
-	if (out_len < sizeof(resp))
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	mutex_lock(&file->file_mutex);
-	ctx = ib_ucm_ctx_alloc(file);
-	mutex_unlock(&file->file_mutex);
-	if (!ctx)
-		return -ENOMEM;
-
-	ctx->uid = cmd.uid;
-	ctx->cm_id = ib_create_cm_id(file->device->ib_dev,
-				     ib_ucm_event_handler, ctx);
-	if (IS_ERR(ctx->cm_id)) {
-		result = PTR_ERR(ctx->cm_id);
-		goto err1;
-	}
-
-	resp.id = ctx->id;
-	if (copy_to_user(u64_to_user_ptr(cmd.response),
-			 &resp, sizeof(resp))) {
-		result = -EFAULT;
-		goto err2;
-	}
-	return 0;
-
-err2:
-	ib_destroy_cm_id(ctx->cm_id);
-err1:
-	mutex_lock(&ctx_id_mutex);
-	idr_remove(&ctx_id_table, ctx->id);
-	mutex_unlock(&ctx_id_mutex);
-	kfree(ctx);
-	return result;
-}
-
-static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file,
-				 const char __user *inbuf,
-				 int in_len, int out_len)
-{
-	struct ib_ucm_destroy_id cmd;
-	struct ib_ucm_destroy_id_resp resp;
-	struct ib_ucm_context *ctx;
-	int result = 0;
-
-	if (out_len < sizeof(resp))
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	mutex_lock(&ctx_id_mutex);
-	ctx = idr_find(&ctx_id_table, cmd.id);
-	if (!ctx)
-		ctx = ERR_PTR(-ENOENT);
-	else if (ctx->file != file)
-		ctx = ERR_PTR(-EINVAL);
-	else
-		idr_remove(&ctx_id_table, ctx->id);
-	mutex_unlock(&ctx_id_mutex);
-
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	ib_ucm_ctx_put(ctx);
-	wait_for_completion(&ctx->comp);
-
-	/* No new events will be generated after destroying the cm_id. */
-	ib_destroy_cm_id(ctx->cm_id);
-	/* Cleanup events not yet reported to the user. */
-	ib_ucm_cleanup_events(ctx);
-
-	resp.events_reported = ctx->events_reported;
-	if (copy_to_user(u64_to_user_ptr(cmd.response),
-			 &resp, sizeof(resp)))
-		result = -EFAULT;
-
-	kfree(ctx);
-	return result;
-}
-
-static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file,
-			      const char __user *inbuf,
-			      int in_len, int out_len)
-{
-	struct ib_ucm_attr_id_resp resp;
-	struct ib_ucm_attr_id cmd;
-	struct ib_ucm_context *ctx;
-	int result = 0;
-
-	if (out_len < sizeof(resp))
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	resp.service_id   = ctx->cm_id->service_id;
-	resp.service_mask = ctx->cm_id->service_mask;
-	resp.local_id     = ctx->cm_id->local_id;
-	resp.remote_id    = ctx->cm_id->remote_id;
-
-	if (copy_to_user(u64_to_user_ptr(cmd.response),
-			 &resp, sizeof(resp)))
-		result = -EFAULT;
-
-	ib_ucm_ctx_put(ctx);
-	return result;
-}
-
-static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file,
-				   const char __user *inbuf,
-				   int in_len, int out_len)
-{
-	struct ib_uverbs_qp_attr resp;
-	struct ib_ucm_init_qp_attr cmd;
-	struct ib_ucm_context *ctx;
-	struct ib_qp_attr qp_attr;
-	int result = 0;
-
-	if (out_len < sizeof(resp))
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	resp.qp_attr_mask = 0;
-	memset(&qp_attr, 0, sizeof qp_attr);
-	qp_attr.qp_state = cmd.qp_state;
-	result = ib_cm_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
-	if (result)
-		goto out;
-
-	ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr);
-
-	if (copy_to_user(u64_to_user_ptr(cmd.response),
-			 &resp, sizeof(resp)))
-		result = -EFAULT;
-
-out:
-	ib_ucm_ctx_put(ctx);
-	return result;
-}
-
-static int ucm_validate_listen(__be64 service_id, __be64 service_mask)
-{
-	service_id &= service_mask;
-
-	if (((service_id & IB_CMA_SERVICE_ID_MASK) == IB_CMA_SERVICE_ID) ||
-	    ((service_id & IB_SDP_SERVICE_ID_MASK) == IB_SDP_SERVICE_ID))
-		return -EINVAL;
-
-	return 0;
-}
-
-static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
-			     const char __user *inbuf,
-			     int in_len, int out_len)
-{
-	struct ib_ucm_listen cmd;
-	struct ib_ucm_context *ctx;
-	int result;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	result = ucm_validate_listen(cmd.service_id, cmd.service_mask);
-	if (result)
-		goto out;
-
-	result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask);
-out:
-	ib_ucm_ctx_put(ctx);
-	return result;
-}
-
-static ssize_t ib_ucm_notify(struct ib_ucm_file *file,
-			     const char __user *inbuf,
-			     int in_len, int out_len)
-{
-	struct ib_ucm_notify cmd;
-	struct ib_ucm_context *ctx;
-	int result;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	result = ib_cm_notify(ctx->cm_id, (enum ib_event_type) cmd.event);
-	ib_ucm_ctx_put(ctx);
-	return result;
-}
-
-static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len)
-{
-	void *data;
-
-	*dest = NULL;
-
-	if (!len)
-		return 0;
-
-	data = memdup_user(u64_to_user_ptr(src), len);
-	if (IS_ERR(data))
-		return PTR_ERR(data);
-
-	*dest = data;
-	return 0;
-}
-
-static int ib_ucm_path_get(struct sa_path_rec **path, u64 src)
-{
-	struct ib_user_path_rec upath;
-	struct sa_path_rec  *sa_path;
-
-	*path = NULL;
-
-	if (!src)
-		return 0;
-
-	sa_path = kmalloc(sizeof(*sa_path), GFP_KERNEL);
-	if (!sa_path)
-		return -ENOMEM;
-
-	if (copy_from_user(&upath, u64_to_user_ptr(src),
-			   sizeof(upath))) {
-
-		kfree(sa_path);
-		return -EFAULT;
-	}
-
-	ib_copy_path_rec_from_user(sa_path, &upath);
-	*path = sa_path;
-	return 0;
-}
-
-static ssize_t ib_ucm_send_req(struct ib_ucm_file *file,
-			       const char __user *inbuf,
-			       int in_len, int out_len)
-{
-	struct ib_cm_req_param param;
-	struct ib_ucm_context *ctx;
-	struct ib_ucm_req cmd;
-	int result;
-
-	param.private_data   = NULL;
-	param.primary_path   = NULL;
-	param.alternate_path = NULL;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
-	if (result)
-		goto done;
-
-	result = ib_ucm_path_get(&param.primary_path, cmd.primary_path);
-	if (result)
-		goto done;
-
-	result = ib_ucm_path_get(&param.alternate_path, cmd.alternate_path);
-	if (result)
-		goto done;
-
-	param.private_data_len           = cmd.len;
-	param.service_id                 = cmd.sid;
-	param.qp_num                     = cmd.qpn;
-	param.qp_type                    = cmd.qp_type;
-	param.starting_psn               = cmd.psn;
-	param.peer_to_peer               = cmd.peer_to_peer;
-	param.responder_resources        = cmd.responder_resources;
-	param.initiator_depth            = cmd.initiator_depth;
-	param.remote_cm_response_timeout = cmd.remote_cm_response_timeout;
-	param.flow_control               = cmd.flow_control;
-	param.local_cm_response_timeout  = cmd.local_cm_response_timeout;
-	param.retry_count                = cmd.retry_count;
-	param.rnr_retry_count            = cmd.rnr_retry_count;
-	param.max_cm_retries             = cmd.max_cm_retries;
-	param.srq                        = cmd.srq;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (!IS_ERR(ctx)) {
-		result = ib_send_cm_req(ctx->cm_id, &param);
-		ib_ucm_ctx_put(ctx);
-	} else
-		result = PTR_ERR(ctx);
-
-done:
-	kfree(param.private_data);
-	kfree(param.primary_path);
-	kfree(param.alternate_path);
-	return result;
-}
-
-static ssize_t ib_ucm_send_rep(struct ib_ucm_file *file,
-			       const char __user *inbuf,
-			       int in_len, int out_len)
-{
-	struct ib_cm_rep_param param;
-	struct ib_ucm_context *ctx;
-	struct ib_ucm_rep cmd;
-	int result;
-
-	param.private_data = NULL;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
-	if (result)
-		return result;
-
-	param.qp_num              = cmd.qpn;
-	param.starting_psn        = cmd.psn;
-	param.private_data_len    = cmd.len;
-	param.responder_resources = cmd.responder_resources;
-	param.initiator_depth     = cmd.initiator_depth;
-	param.failover_accepted   = cmd.failover_accepted;
-	param.flow_control        = cmd.flow_control;
-	param.rnr_retry_count     = cmd.rnr_retry_count;
-	param.srq                 = cmd.srq;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (!IS_ERR(ctx)) {
-		ctx->uid = cmd.uid;
-		result = ib_send_cm_rep(ctx->cm_id, &param);
-		ib_ucm_ctx_put(ctx);
-	} else
-		result = PTR_ERR(ctx);
-
-	kfree(param.private_data);
-	return result;
-}
-
-static ssize_t ib_ucm_send_private_data(struct ib_ucm_file *file,
-					const char __user *inbuf, int in_len,
-					int (*func)(struct ib_cm_id *cm_id,
-						    const void *private_data,
-						    u8 private_data_len))
-{
-	struct ib_ucm_private_data cmd;
-	struct ib_ucm_context *ctx;
-	const void *private_data = NULL;
-	int result;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	result = ib_ucm_alloc_data(&private_data, cmd.data, cmd.len);
-	if (result)
-		return result;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (!IS_ERR(ctx)) {
-		result = func(ctx->cm_id, private_data, cmd.len);
-		ib_ucm_ctx_put(ctx);
-	} else
-		result = PTR_ERR(ctx);
-
-	kfree(private_data);
-	return result;
-}
-
-static ssize_t ib_ucm_send_rtu(struct ib_ucm_file *file,
-			       const char __user *inbuf,
-			       int in_len, int out_len)
-{
-	return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_rtu);
-}
-
-static ssize_t ib_ucm_send_dreq(struct ib_ucm_file *file,
-				const char __user *inbuf,
-				int in_len, int out_len)
-{
-	return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_dreq);
-}
-
-static ssize_t ib_ucm_send_drep(struct ib_ucm_file *file,
-				const char __user *inbuf,
-				int in_len, int out_len)
-{
-	return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_drep);
-}
-
-static ssize_t ib_ucm_send_info(struct ib_ucm_file *file,
-				const char __user *inbuf, int in_len,
-				int (*func)(struct ib_cm_id *cm_id,
-					    int status,
-					    const void *info,
-					    u8 info_len,
-					    const void *data,
-					    u8 data_len))
-{
-	struct ib_ucm_context *ctx;
-	struct ib_ucm_info cmd;
-	const void *data = NULL;
-	const void *info = NULL;
-	int result;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	result = ib_ucm_alloc_data(&data, cmd.data, cmd.data_len);
-	if (result)
-		goto done;
-
-	result = ib_ucm_alloc_data(&info, cmd.info, cmd.info_len);
-	if (result)
-		goto done;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (!IS_ERR(ctx)) {
-		result = func(ctx->cm_id, cmd.status, info, cmd.info_len,
-			      data, cmd.data_len);
-		ib_ucm_ctx_put(ctx);
-	} else
-		result = PTR_ERR(ctx);
-
-done:
-	kfree(data);
-	kfree(info);
-	return result;
-}
-
-static ssize_t ib_ucm_send_rej(struct ib_ucm_file *file,
-			       const char __user *inbuf,
-			       int in_len, int out_len)
-{
-	return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_rej);
-}
-
-static ssize_t ib_ucm_send_apr(struct ib_ucm_file *file,
-			       const char __user *inbuf,
-			       int in_len, int out_len)
-{
-	return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_apr);
-}
-
-static ssize_t ib_ucm_send_mra(struct ib_ucm_file *file,
-			       const char __user *inbuf,
-			       int in_len, int out_len)
-{
-	struct ib_ucm_context *ctx;
-	struct ib_ucm_mra cmd;
-	const void *data = NULL;
-	int result;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	result = ib_ucm_alloc_data(&data, cmd.data, cmd.len);
-	if (result)
-		return result;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (!IS_ERR(ctx)) {
-		result = ib_send_cm_mra(ctx->cm_id, cmd.timeout, data, cmd.len);
-		ib_ucm_ctx_put(ctx);
-	} else
-		result = PTR_ERR(ctx);
-
-	kfree(data);
-	return result;
-}
-
-static ssize_t ib_ucm_send_lap(struct ib_ucm_file *file,
-			       const char __user *inbuf,
-			       int in_len, int out_len)
-{
-	struct ib_ucm_context *ctx;
-	struct sa_path_rec *path = NULL;
-	struct ib_ucm_lap cmd;
-	const void *data = NULL;
-	int result;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	result = ib_ucm_alloc_data(&data, cmd.data, cmd.len);
-	if (result)
-		goto done;
-
-	result = ib_ucm_path_get(&path, cmd.path);
-	if (result)
-		goto done;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (!IS_ERR(ctx)) {
-		result = ib_send_cm_lap(ctx->cm_id, path, data, cmd.len);
-		ib_ucm_ctx_put(ctx);
-	} else
-		result = PTR_ERR(ctx);
-
-done:
-	kfree(data);
-	kfree(path);
-	return result;
-}
-
-static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file,
-				    const char __user *inbuf,
-				    int in_len, int out_len)
-{
-	struct ib_cm_sidr_req_param param = {};
-	struct ib_ucm_context *ctx;
-	struct ib_ucm_sidr_req cmd;
-	int result;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
-	if (result)
-		goto done;
-
-	result = ib_ucm_path_get(&param.path, cmd.path);
-	if (result)
-		goto done;
-
-	param.private_data_len = cmd.len;
-	param.service_id       = cmd.sid;
-	param.timeout_ms       = cmd.timeout;
-	param.max_cm_retries   = cmd.max_cm_retries;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (!IS_ERR(ctx)) {
-		result = ib_send_cm_sidr_req(ctx->cm_id, &param);
-		ib_ucm_ctx_put(ctx);
-	} else
-		result = PTR_ERR(ctx);
-
-done:
-	kfree(param.private_data);
-	kfree(param.path);
-	return result;
-}
-
-static ssize_t ib_ucm_send_sidr_rep(struct ib_ucm_file *file,
-				    const char __user *inbuf,
-				    int in_len, int out_len)
-{
-	struct ib_cm_sidr_rep_param param;
-	struct ib_ucm_sidr_rep cmd;
-	struct ib_ucm_context *ctx;
-	int result;
-
-	param.info = NULL;
-
-	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-		return -EFAULT;
-
-	result = ib_ucm_alloc_data(&param.private_data,
-				   cmd.data, cmd.data_len);
-	if (result)
-		goto done;
-
-	result = ib_ucm_alloc_data(&param.info, cmd.info, cmd.info_len);
-	if (result)
-		goto done;
-
-	param.qp_num		= cmd.qpn;
-	param.qkey		= cmd.qkey;
-	param.status		= cmd.status;
-	param.info_length	= cmd.info_len;
-	param.private_data_len	= cmd.data_len;
-
-	ctx = ib_ucm_ctx_get(file, cmd.id);
-	if (!IS_ERR(ctx)) {
-		result = ib_send_cm_sidr_rep(ctx->cm_id, &param);
-		ib_ucm_ctx_put(ctx);
-	} else
-		result = PTR_ERR(ctx);
-
-done:
-	kfree(param.private_data);
-	kfree(param.info);
-	return result;
-}
-
-static ssize_t (*ucm_cmd_table[])(struct ib_ucm_file *file,
-				  const char __user *inbuf,
-				  int in_len, int out_len) = {
-	[IB_USER_CM_CMD_CREATE_ID]     = ib_ucm_create_id,
-	[IB_USER_CM_CMD_DESTROY_ID]    = ib_ucm_destroy_id,
-	[IB_USER_CM_CMD_ATTR_ID]       = ib_ucm_attr_id,
-	[IB_USER_CM_CMD_LISTEN]        = ib_ucm_listen,
-	[IB_USER_CM_CMD_NOTIFY]        = ib_ucm_notify,
-	[IB_USER_CM_CMD_SEND_REQ]      = ib_ucm_send_req,
-	[IB_USER_CM_CMD_SEND_REP]      = ib_ucm_send_rep,
-	[IB_USER_CM_CMD_SEND_RTU]      = ib_ucm_send_rtu,
-	[IB_USER_CM_CMD_SEND_DREQ]     = ib_ucm_send_dreq,
-	[IB_USER_CM_CMD_SEND_DREP]     = ib_ucm_send_drep,
-	[IB_USER_CM_CMD_SEND_REJ]      = ib_ucm_send_rej,
-	[IB_USER_CM_CMD_SEND_MRA]      = ib_ucm_send_mra,
-	[IB_USER_CM_CMD_SEND_LAP]      = ib_ucm_send_lap,
-	[IB_USER_CM_CMD_SEND_APR]      = ib_ucm_send_apr,
-	[IB_USER_CM_CMD_SEND_SIDR_REQ] = ib_ucm_send_sidr_req,
-	[IB_USER_CM_CMD_SEND_SIDR_REP] = ib_ucm_send_sidr_rep,
-	[IB_USER_CM_CMD_EVENT]	       = ib_ucm_event,
-	[IB_USER_CM_CMD_INIT_QP_ATTR]  = ib_ucm_init_qp_attr,
-};
-
-static ssize_t ib_ucm_write(struct file *filp, const char __user *buf,
-			    size_t len, loff_t *pos)
-{
-	struct ib_ucm_file *file = filp->private_data;
-	struct ib_ucm_cmd_hdr hdr;
-	ssize_t result;
-
-	if (!ib_safe_file_access(filp)) {
-		pr_err_once("ucm_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n",
-			    task_tgid_vnr(current), current->comm);
-		return -EACCES;
-	}
-
-	if (len < sizeof(hdr))
-		return -EINVAL;
-
-	if (copy_from_user(&hdr, buf, sizeof(hdr)))
-		return -EFAULT;
-
-	if (hdr.cmd >= ARRAY_SIZE(ucm_cmd_table))
-		return -EINVAL;
-	hdr.cmd = array_index_nospec(hdr.cmd, ARRAY_SIZE(ucm_cmd_table));
-
-	if (hdr.in + sizeof(hdr) > len)
-		return -EINVAL;
-
-	result = ucm_cmd_table[hdr.cmd](file, buf + sizeof(hdr),
-					hdr.in, hdr.out);
-	if (!result)
-		result = len;
-
-	return result;
-}
-
-static __poll_t ib_ucm_poll(struct file *filp,
-				struct poll_table_struct *wait)
-{
-	struct ib_ucm_file *file = filp->private_data;
-	__poll_t mask = 0;
-
-	poll_wait(filp, &file->poll_wait, wait);
-
-	if (!list_empty(&file->events))
-		mask = EPOLLIN | EPOLLRDNORM;
-
-	return mask;
-}
-
-/*
- * ib_ucm_open() does not need the BKL:
- *
- *  - no global state is referred to;
- *  - there is no ioctl method to race against;
- *  - no further module initialization is required for open to work
- *    after the device is registered.
- */
-static int ib_ucm_open(struct inode *inode, struct file *filp)
-{
-	struct ib_ucm_file *file;
-
-	file = kmalloc(sizeof(*file), GFP_KERNEL);
-	if (!file)
-		return -ENOMEM;
-
-	INIT_LIST_HEAD(&file->events);
-	INIT_LIST_HEAD(&file->ctxs);
-	init_waitqueue_head(&file->poll_wait);
-
-	mutex_init(&file->file_mutex);
-
-	filp->private_data = file;
-	file->filp = filp;
-	file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev);
-
-	return nonseekable_open(inode, filp);
-}
-
-static int ib_ucm_close(struct inode *inode, struct file *filp)
-{
-	struct ib_ucm_file *file = filp->private_data;
-	struct ib_ucm_context *ctx;
-
-	mutex_lock(&file->file_mutex);
-	while (!list_empty(&file->ctxs)) {
-		ctx = list_entry(file->ctxs.next,
-				 struct ib_ucm_context, file_list);
-		mutex_unlock(&file->file_mutex);
-
-		mutex_lock(&ctx_id_mutex);
-		idr_remove(&ctx_id_table, ctx->id);
-		mutex_unlock(&ctx_id_mutex);
-
-		ib_destroy_cm_id(ctx->cm_id);
-		ib_ucm_cleanup_events(ctx);
-		kfree(ctx);
-
-		mutex_lock(&file->file_mutex);
-	}
-	mutex_unlock(&file->file_mutex);
-	kfree(file);
-	return 0;
-}
-
-static void ib_ucm_release_dev(struct device *dev)
-{
-	struct ib_ucm_device *ucm_dev;
-
-	ucm_dev = container_of(dev, struct ib_ucm_device, dev);
-	kfree(ucm_dev);
-}
-
-static void ib_ucm_free_dev(struct ib_ucm_device *ucm_dev)
-{
-	clear_bit(ucm_dev->devnum, dev_map);
-}
-
-static const struct file_operations ucm_fops = {
-	.owner	 = THIS_MODULE,
-	.open	 = ib_ucm_open,
-	.release = ib_ucm_close,
-	.write	 = ib_ucm_write,
-	.poll    = ib_ucm_poll,
-	.llseek	 = no_llseek,
-};
-
-static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
-			  char *buf)
-{
-	struct ib_ucm_device *ucm_dev;
-
-	ucm_dev = container_of(dev, struct ib_ucm_device, dev);
-	return sprintf(buf, "%s\n", ucm_dev->ib_dev->name);
-}
-static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
-
-static void ib_ucm_add_one(struct ib_device *device)
-{
-	int devnum;
-	dev_t base;
-	struct ib_ucm_device *ucm_dev;
-
-	if (!device->alloc_ucontext || !rdma_cap_ib_cm(device, 1))
-		return;
-
-	ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL);
-	if (!ucm_dev)
-		return;
-
-	device_initialize(&ucm_dev->dev);
-	ucm_dev->ib_dev = device;
-	ucm_dev->dev.release = ib_ucm_release_dev;
-
-	devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES);
-	if (devnum >= IB_UCM_MAX_DEVICES)
-		goto err;
-	ucm_dev->devnum = devnum;
-	set_bit(devnum, dev_map);
-	if (devnum >= IB_UCM_NUM_FIXED_MINOR)
-		base = dynamic_ucm_dev + devnum - IB_UCM_NUM_FIXED_MINOR;
-	else
-		base = IB_UCM_BASE_DEV + devnum;
-
-	cdev_init(&ucm_dev->cdev, &ucm_fops);
-	ucm_dev->cdev.owner = THIS_MODULE;
-	kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum);
-
-	ucm_dev->dev.class = &cm_class;
-	ucm_dev->dev.parent = device->dev.parent;
-	ucm_dev->dev.devt = base;
-
-	dev_set_name(&ucm_dev->dev, "ucm%d", ucm_dev->devnum);
-	if (cdev_device_add(&ucm_dev->cdev, &ucm_dev->dev))
-		goto err_devnum;
-
-	if (device_create_file(&ucm_dev->dev, &dev_attr_ibdev))
-		goto err_dev;
-
-	ib_set_client_data(device, &ucm_client, ucm_dev);
-	return;
-
-err_dev:
-	cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev);
-err_devnum:
-	ib_ucm_free_dev(ucm_dev);
-err:
-	put_device(&ucm_dev->dev);
-	return;
-}
-
-static void ib_ucm_remove_one(struct ib_device *device, void *client_data)
-{
-	struct ib_ucm_device *ucm_dev = client_data;
-
-	if (!ucm_dev)
-		return;
-
-	cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev);
-	ib_ucm_free_dev(ucm_dev);
-	put_device(&ucm_dev->dev);
-}
-
-static CLASS_ATTR_STRING(abi_version, S_IRUGO,
-			 __stringify(IB_USER_CM_ABI_VERSION));
-
-static int __init ib_ucm_init(void)
-{
-	int ret;
-
-	ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR,
-				     "infiniband_cm");
-	if (ret) {
-		pr_err("ucm: couldn't register device number\n");
-		goto error1;
-	}
-
-	ret = alloc_chrdev_region(&dynamic_ucm_dev, 0, IB_UCM_NUM_DYNAMIC_MINOR,
-				  "infiniband_cm");
-	if (ret) {
-		pr_err("ucm: couldn't register dynamic device number\n");
-		goto err_alloc;
-	}
-
-	ret = class_create_file(&cm_class, &class_attr_abi_version.attr);
-	if (ret) {
-		pr_err("ucm: couldn't create abi_version attribute\n");
-		goto error2;
-	}
-
-	ret = ib_register_client(&ucm_client);
-	if (ret) {
-		pr_err("ucm: couldn't register client\n");
-		goto error3;
-	}
-	return 0;
-
-error3:
-	class_remove_file(&cm_class, &class_attr_abi_version.attr);
-error2:
-	unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR);
-err_alloc:
-	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR);
-error1:
-	return ret;
-}
-
-static void __exit ib_ucm_cleanup(void)
-{
-	ib_unregister_client(&ucm_client);
-	class_remove_file(&cm_class, &class_attr_abi_version.attr);
-	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR);
-	unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR);
-	idr_destroy(&ctx_id_table);
-}
-
-module_init(ib_ucm_init);
-module_exit(ib_ucm_cleanup);
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 01d68ed..0274e9b 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -52,6 +52,8 @@
 #include <rdma/rdma_cm_ib.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib.h>
+#include <rdma/rdma_netlink.h>
+#include "core_priv.h"
 
 MODULE_AUTHOR("Sean Hefty");
 MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
@@ -81,7 +83,7 @@
 };
 
 struct ucma_context {
-	int			id;
+	u32			id;
 	struct completion	comp;
 	atomic_t		ref;
 	int			events_reported;
@@ -94,7 +96,7 @@
 	struct list_head	list;
 	struct list_head	mc_list;
 	/* mark that device is in process of destroying the internal HW
-	 * resources, protected by the global mut
+	 * resources, protected by the ctx_table lock
 	 */
 	int			closing;
 	/* sync between removal event and id destroy, protected by file mut */
@@ -104,7 +106,7 @@
 
 struct ucma_multicast {
 	struct ucma_context	*ctx;
-	int			id;
+	u32			id;
 	int			events_reported;
 
 	u64			uid;
@@ -122,9 +124,8 @@
 	struct work_struct	close_work;
 };
 
-static DEFINE_MUTEX(mut);
-static DEFINE_IDR(ctx_idr);
-static DEFINE_IDR(multicast_idr);
+static DEFINE_XARRAY_ALLOC(ctx_table);
+static DEFINE_XARRAY_ALLOC(multicast_table);
 
 static const struct file_operations ucma_fops;
 
@@ -133,7 +134,7 @@
 {
 	struct ucma_context *ctx;
 
-	ctx = idr_find(&ctx_idr, id);
+	ctx = xa_load(&ctx_table, id);
 	if (!ctx)
 		ctx = ERR_PTR(-ENOENT);
 	else if (ctx->file != file || !ctx->cm_id)
@@ -145,7 +146,7 @@
 {
 	struct ucma_context *ctx;
 
-	mutex_lock(&mut);
+	xa_lock(&ctx_table);
 	ctx = _ucma_find_context(id, file);
 	if (!IS_ERR(ctx)) {
 		if (ctx->closing)
@@ -153,7 +154,7 @@
 		else
 			atomic_inc(&ctx->ref);
 	}
-	mutex_unlock(&mut);
+	xa_unlock(&ctx_table);
 	return ctx;
 }
 
@@ -216,10 +217,7 @@
 	INIT_LIST_HEAD(&ctx->mc_list);
 	ctx->file = file;
 
-	mutex_lock(&mut);
-	ctx->id = idr_alloc(&ctx_idr, ctx, 0, 0, GFP_KERNEL);
-	mutex_unlock(&mut);
-	if (ctx->id < 0)
+	if (xa_alloc(&ctx_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL))
 		goto error;
 
 	list_add_tail(&ctx->list, &file->ctx_list);
@@ -238,13 +236,10 @@
 	if (!mc)
 		return NULL;
 
-	mutex_lock(&mut);
-	mc->id = idr_alloc(&multicast_idr, NULL, 0, 0, GFP_KERNEL);
-	mutex_unlock(&mut);
-	if (mc->id < 0)
+	mc->ctx = ctx;
+	if (xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, GFP_KERNEL))
 		goto error;
 
-	mc->ctx = ctx;
 	list_add_tail(&mc->list, &ctx->mc_list);
 	return mc;
 
@@ -319,9 +314,9 @@
 	 * handled separately below.
 	 */
 	if (ctx->cm_id == cm_id) {
-		mutex_lock(&mut);
+		xa_lock(&ctx_table);
 		ctx->closing = 1;
-		mutex_unlock(&mut);
+		xa_unlock(&ctx_table);
 		queue_work(ctx->file->close_wq, &ctx->close_work);
 		return;
 	}
@@ -523,9 +518,7 @@
 err2:
 	rdma_destroy_id(cm_id);
 err1:
-	mutex_lock(&mut);
-	idr_remove(&ctx_idr, ctx->id);
-	mutex_unlock(&mut);
+	xa_erase(&ctx_table, ctx->id);
 	mutex_lock(&file->mut);
 	list_del(&ctx->list);
 	mutex_unlock(&file->mut);
@@ -537,13 +530,13 @@
 {
 	struct ucma_multicast *mc, *tmp;
 
-	mutex_lock(&mut);
+	mutex_lock(&ctx->file->mut);
 	list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) {
 		list_del(&mc->list);
-		idr_remove(&multicast_idr, mc->id);
+		xa_erase(&multicast_table, mc->id);
 		kfree(mc);
 	}
-	mutex_unlock(&mut);
+	mutex_unlock(&ctx->file->mut);
 }
 
 static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
@@ -614,11 +607,11 @@
 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
 		return -EFAULT;
 
-	mutex_lock(&mut);
+	xa_lock(&ctx_table);
 	ctx = _ucma_find_context(cmd.id, file);
 	if (!IS_ERR(ctx))
-		idr_remove(&ctx_idr, ctx->id);
-	mutex_unlock(&mut);
+		__xa_erase(&ctx_table, ctx->id);
+	xa_unlock(&ctx_table);
 
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
@@ -630,14 +623,14 @@
 	flush_workqueue(ctx->file->close_wq);
 	/* At this point it's guaranteed that there is no inflight
 	 * closing task */
-	mutex_lock(&mut);
+	xa_lock(&ctx_table);
 	if (!ctx->closing) {
-		mutex_unlock(&mut);
+		xa_unlock(&ctx_table);
 		ucma_put_ctx(ctx);
 		wait_for_completion(&ctx->comp);
 		rdma_destroy_id(ctx->cm_id);
 	} else {
-		mutex_unlock(&mut);
+		xa_unlock(&ctx_table);
 	}
 
 	resp.events_reported = ucma_free_ctx(ctx);
@@ -951,8 +944,7 @@
 		}
 	}
 
-	if (copy_to_user(response, resp,
-			 sizeof(*resp) + (i * sizeof(struct ib_path_rec_data))))
+	if (copy_to_user(response, resp, struct_size(resp, path_data, i)))
 		ret = -EFAULT;
 
 	kfree(resp);
@@ -1236,6 +1228,13 @@
 		}
 		ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0);
 		break;
+	case RDMA_OPTION_ID_ACK_TIMEOUT:
+		if (optlen != sizeof(u8)) {
+			ret = -EINVAL;
+			break;
+		}
+		ret = rdma_set_ack_timeout(ctx->cm_id, *((u8 *)optval));
+		break;
 	default:
 		ret = -ENOSYS;
 	}
@@ -1425,9 +1424,7 @@
 		goto err3;
 	}
 
-	mutex_lock(&mut);
-	idr_replace(&multicast_idr, mc, mc->id);
-	mutex_unlock(&mut);
+	xa_store(&multicast_table, mc->id, mc, 0);
 
 	mutex_unlock(&file->mut);
 	ucma_put_ctx(ctx);
@@ -1437,9 +1434,7 @@
 	rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr);
 	ucma_cleanup_mc_events(mc);
 err2:
-	mutex_lock(&mut);
-	idr_remove(&multicast_idr, mc->id);
-	mutex_unlock(&mut);
+	xa_erase(&multicast_table, mc->id);
 	list_del(&mc->list);
 	kfree(mc);
 err1:
@@ -1501,8 +1496,8 @@
 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
 		return -EFAULT;
 
-	mutex_lock(&mut);
-	mc = idr_find(&multicast_idr, cmd.id);
+	xa_lock(&multicast_table);
+	mc = xa_load(&multicast_table, cmd.id);
 	if (!mc)
 		mc = ERR_PTR(-ENOENT);
 	else if (mc->ctx->file != file)
@@ -1510,8 +1505,8 @@
 	else if (!atomic_inc_not_zero(&mc->ctx->ref))
 		mc = ERR_PTR(-ENXIO);
 	else
-		idr_remove(&multicast_idr, mc->id);
-	mutex_unlock(&mut);
+		__xa_erase(&multicast_table, mc->id);
+	xa_unlock(&multicast_table);
 
 	if (IS_ERR(mc)) {
 		ret = PTR_ERR(mc);
@@ -1608,14 +1603,14 @@
 	 * events being added before existing events.
 	 */
 	ucma_lock_files(cur_file, new_file);
-	mutex_lock(&mut);
+	xa_lock(&ctx_table);
 
 	list_move_tail(&ctx->list, &new_file->ctx_list);
 	ucma_move_events(ctx, new_file);
 	ctx->file = new_file;
 	resp.events_reported = ctx->events_reported;
 
-	mutex_unlock(&mut);
+	xa_unlock(&ctx_table);
 	ucma_unlock_files(cur_file, new_file);
 
 response:
@@ -1737,7 +1732,7 @@
 	filp->private_data = file;
 	file->filp = filp;
 
-	return nonseekable_open(inode, filp);
+	return stream_open(inode, filp);
 }
 
 static int ucma_close(struct inode *inode, struct file *filp)
@@ -1750,18 +1745,15 @@
 		ctx->destroying = 1;
 		mutex_unlock(&file->mut);
 
-		mutex_lock(&mut);
-		idr_remove(&ctx_idr, ctx->id);
-		mutex_unlock(&mut);
-
+		xa_erase(&ctx_table, ctx->id);
 		flush_workqueue(file->close_wq);
 		/* At that step once ctx was marked as destroying and workqueue
 		 * was flushed we are safe from any inflights handlers that
 		 * might put other closing task.
 		 */
-		mutex_lock(&mut);
+		xa_lock(&ctx_table);
 		if (!ctx->closing) {
-			mutex_unlock(&mut);
+			xa_unlock(&ctx_table);
 			ucma_put_ctx(ctx);
 			wait_for_completion(&ctx->comp);
 			/* rdma_destroy_id ensures that no event handlers are
@@ -1769,7 +1761,7 @@
 			 */
 			rdma_destroy_id(ctx->cm_id);
 		} else {
-			mutex_unlock(&mut);
+			xa_unlock(&ctx_table);
 		}
 
 		ucma_free_ctx(ctx);
@@ -1798,6 +1790,19 @@
 	.fops		= &ucma_fops,
 };
 
+static int ucma_get_global_nl_info(struct ib_client_nl_info *res)
+{
+	res->abi = RDMA_USER_CM_ABI_VERSION;
+	res->cdev = ucma_misc.this_device;
+	return 0;
+}
+
+static struct ib_client rdma_cma_client = {
+	.name = "rdma_cm",
+	.get_global_nl_info = ucma_get_global_nl_info,
+};
+MODULE_ALIAS_RDMA_CLIENT("rdma_cm");
+
 static ssize_t show_abi_version(struct device *dev,
 				struct device_attribute *attr,
 				char *buf)
@@ -1826,7 +1831,14 @@
 		ret = -ENOMEM;
 		goto err2;
 	}
+
+	ret = ib_register_client(&rdma_cma_client);
+	if (ret)
+		goto err3;
+
 	return 0;
+err3:
+	unregister_net_sysctl_table(ucma_ctl_table_hdr);
 err2:
 	device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
 err1:
@@ -1836,11 +1848,10 @@
 
 static void __exit ucma_cleanup(void)
 {
+	ib_unregister_client(&rdma_cma_client);
 	unregister_net_sysctl_table(ucma_ctl_table_hdr);
 	device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
 	misc_deregister(&ucma_misc);
-	idr_destroy(&ctx_idr);
-	idr_destroy(&multicast_idr);
 }
 
 module_init(ucma_init);
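Reviewer note, not part of the patch: the ucma.c hunks above drop the global `mut` mutex and the two IDRs in favour of self-locking allocating XArrays. A hedged sketch of that pattern in isolation, with made-up identifiers:

#include <linux/xarray.h>

static DEFINE_XARRAY_ALLOC(obj_table);	/* allocating XArray, ids start at 0 */

struct obj {
	u32 id;
};

static int obj_register(struct obj *o)
{
	/* assigns a free 32-bit id and stores o; locking is internal */
	return xa_alloc(&obj_table, &o->id, o, xa_limit_32b, GFP_KERNEL);
}

static struct obj *obj_lookup(u32 id)
{
	/* no external mutex needed; xa_load() is RCU-safe */
	return xa_load(&obj_table, id);
}

static void obj_unregister(struct obj *o)
{
	xa_erase(&obj_table, o->id);
}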
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index a41792d..24244a2 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -37,62 +37,180 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/export.h>
-#include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
 #include <rdma/ib_umem_odp.h>
 
 #include "uverbs.h"
 
-
 static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 {
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	struct page *page;
-	int i;
 
 	if (umem->nmap > 0)
-		ib_dma_unmap_sg(dev, umem->sg_head.sgl,
-				umem->npages,
+		ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
 				DMA_BIDIRECTIONAL);
 
-	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
-
-		page = sg_page(sg);
-		if (!PageDirty(page) && umem->writable && dirty)
-			set_page_dirty_lock(page);
-		put_page(page);
+	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
+		page = sg_page_iter_page(&sg_iter);
+		put_user_pages_dirty_lock(&page, 1, umem->writable && dirty);
 	}
 
 	sg_free_table(&umem->sg_head);
 }
 
+/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
+ *
+ * sg: current scatterlist entry
+ * page_list: array of npage struct page pointers
+ * npages: number of pages in page_list
+ * max_seg_sz: maximum segment size in bytes
+ * nents: [out] number of entries in the scatterlist
+ *
+ * Return new end of scatterlist
+ */
+static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
+						struct page **page_list,
+						unsigned long npages,
+						unsigned int max_seg_sz,
+						int *nents)
+{
+	unsigned long first_pfn;
+	unsigned long i = 0;
+	bool update_cur_sg = false;
+	bool first = !sg_page(sg);
+
+	/* Check if new page_list is contiguous with end of previous page_list.
+	 * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0.
+	 */
+	if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
+		       page_to_pfn(page_list[0])))
+		update_cur_sg = true;
+
+	while (i != npages) {
+		unsigned long len;
+		struct page *first_page = page_list[i];
+
+		first_pfn = page_to_pfn(first_page);
+
+		/* Compute the number of contiguous pages we have starting
+		 * at i
+		 */
+		for (len = 0; i != npages &&
+			      first_pfn + len == page_to_pfn(page_list[i]) &&
+			      len < (max_seg_sz >> PAGE_SHIFT);
+		     len++)
+			i++;
+
+		/* Squash N contiguous pages from page_list into current sge */
+		if (update_cur_sg) {
+			if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT)) {
+				sg_set_page(sg, sg_page(sg),
+					    sg->length + (len << PAGE_SHIFT),
+					    0);
+				update_cur_sg = false;
+				continue;
+			}
+			update_cur_sg = false;
+		}
+
+		/* Squash N contiguous pages into next sge or first sge */
+		if (!first)
+			sg = sg_next(sg);
+
+		(*nents)++;
+		sg_set_page(sg, first_page, len << PAGE_SHIFT, 0);
+		first = false;
+	}
+
+	return sg;
+}
+
+/**
+ * ib_umem_find_best_pgsz - Find best HW page size to use for this MR
+ *
+ * @umem: umem struct
+ * @pgsz_bitmap: bitmap of HW supported page sizes
+ * @virt: IOVA
+ *
+ * This helper is intended for HW that support multiple page
+ * sizes but can do only a single page size in an MR.
+ *
+ * Returns 0 if the umem requires page sizes not supported by
+ * the driver to be mapped. Drivers always supporting PAGE_SIZE
+ * or smaller will never see a 0 result.
+ */
+unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
+				     unsigned long pgsz_bitmap,
+				     unsigned long virt)
+{
+	struct scatterlist *sg;
+	unsigned int best_pg_bit;
+	unsigned long va, pgoff;
+	dma_addr_t mask;
+	int i;
+
+	/* At minimum, drivers must support PAGE_SIZE or smaller */
+	if (WARN_ON(!(pgsz_bitmap & GENMASK(PAGE_SHIFT, 0))))
+		return 0;
+
+	va = virt;
+	/* max page size not to exceed MR length */
+	mask = roundup_pow_of_two(umem->length);
+	/* offset into first SGL */
+	pgoff = umem->address & ~PAGE_MASK;
+
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
+		/* Walk SGL and reduce max page size if VA/PA bits differ
+		 * for any address.
+		 */
+		mask |= (sg_dma_address(sg) + pgoff) ^ va;
+		if (i && i != (umem->nmap - 1))
+			/* restrict by length as well for interior SGEs */
+			mask |= sg_dma_len(sg);
+		va += sg_dma_len(sg) - pgoff;
+		pgoff = 0;
+	}
+	best_pg_bit = rdma_find_pg_bit(mask, pgsz_bitmap);
+
+	return BIT_ULL(best_pg_bit);
+}
+EXPORT_SYMBOL(ib_umem_find_best_pgsz);
+
 /**
  * ib_umem_get - Pin and DMA map userspace memory.
  *
- * If access flags indicate ODP memory, avoid pinning. Instead, stores
- * the mm for future page fault handling in conjunction with MMU notifiers.
- *
- * @context: userspace context to pin memory for
+ * @udata: userspace context to pin memory for
  * @addr: userspace virtual address to start at
  * @size: length of region to pin
  * @access: IB_ACCESS_xxx flags for memory being pinned
  * @dmasync: flush in-flight DMA when the memory region is written
  */
-struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 			    size_t size, int access, int dmasync)
 {
+	struct ib_ucontext *context;
 	struct ib_umem *umem;
 	struct page **page_list;
-	struct vm_area_struct **vma_list;
 	unsigned long lock_limit;
+	unsigned long new_pinned;
 	unsigned long cur_base;
+	struct mm_struct *mm;
 	unsigned long npages;
 	int ret;
-	int i;
 	unsigned long dma_attrs = 0;
-	struct scatterlist *sg, *sg_list_start;
+	struct scatterlist *sg;
 	unsigned int gup_flags = FOLL_WRITE;
 
+	if (!udata)
+		return ERR_PTR(-EIO);
+
+	context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
+			  ->context;
+	if (!context)
+		return ERR_PTR(-EIO);
+
 	if (dmasync)
 		dma_attrs |= DMA_ATTR_WRITE_BARRIER;
 
@@ -107,27 +225,18 @@
 	if (!can_do_mlock())
 		return ERR_PTR(-EPERM);
 
-	umem = kzalloc(sizeof *umem, GFP_KERNEL);
+	if (access & IB_ACCESS_ON_DEMAND)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
 	if (!umem)
 		return ERR_PTR(-ENOMEM);
-
-	umem->context    = context;
+	umem->ibdev = context->device;
 	umem->length     = size;
 	umem->address    = addr;
-	umem->page_shift = PAGE_SHIFT;
 	umem->writable   = ib_access_writable(access);
-
-	if (access & IB_ACCESS_ON_DEMAND) {
-		ret = ib_umem_odp_get(context, umem, access);
-		if (ret)
-			goto umem_kfree;
-		return umem;
-	}
-
-	umem->odp_data = NULL;
-
-	/* We assume the memory is from hugetlb until proved otherwise */
-	umem->hugetlb   = 1;
+	umem->owning_mm = mm = current->mm;
+	mmgrab(mm);
 
 	page_list = (struct page **) __get_free_page(GFP_KERNEL);
 	if (!page_list) {
@@ -135,34 +244,23 @@
 		goto umem_kfree;
 	}
 
-	/*
-	 * if we can't alloc the vma_list, it's not so bad;
-	 * just assume the memory is not hugetlb memory
-	 */
-	vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
-	if (!vma_list)
-		umem->hugetlb = 0;
-
 	npages = ib_umem_num_pages(umem);
+	if (npages == 0 || npages > UINT_MAX) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 
-	down_write(&current->mm->mmap_sem);
-	current->mm->pinned_vm += npages;
-	if ((current->mm->pinned_vm > lock_limit) && !capable(CAP_IPC_LOCK)) {
-		up_write(&current->mm->mmap_sem);
+	new_pinned = atomic64_add_return(npages, &mm->pinned_vm);
+	if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
+		atomic64_sub(npages, &mm->pinned_vm);
 		ret = -ENOMEM;
-		goto vma;
+		goto out;
 	}
-	up_write(&current->mm->mmap_sem);
 
 	cur_base = addr & PAGE_MASK;
 
-	if (npages == 0 || npages > UINT_MAX) {
-		ret = -EINVAL;
-		goto vma;
-	}
-
 	ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
 	if (ret)
 		goto vma;
@@ -170,38 +268,35 @@
 	if (!umem->writable)
 		gup_flags |= FOLL_FORCE;
 
-	sg_list_start = umem->sg_head.sgl;
+	sg = umem->sg_head.sgl;
 
-	down_read(&current->mm->mmap_sem);
 	while (npages) {
-		ret = get_user_pages_longterm(cur_base,
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(cur_base,
 				     min_t(unsigned long, npages,
 					   PAGE_SIZE / sizeof (struct page *)),
-				     gup_flags, page_list, vma_list);
+				     gup_flags | FOLL_LONGTERM,
+				     page_list, NULL);
 		if (ret < 0) {
-			up_read(&current->mm->mmap_sem);
+			up_read(&mm->mmap_sem);
 			goto umem_release;
 		}
 
-		umem->npages += ret;
 		cur_base += ret * PAGE_SIZE;
 		npages   -= ret;
 
-		for_each_sg(sg_list_start, sg, ret, i) {
-			if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
-				umem->hugetlb = 0;
+		sg = ib_umem_add_sg_table(sg, page_list, ret,
+			dma_get_max_seg_size(context->device->dma_device),
+			&umem->sg_nents);
 
-			sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
-		}
-
-		/* preparing for next loop */
-		sg_list_start = sg;
+		up_read(&mm->mmap_sem);
 	}
-	up_read(&current->mm->mmap_sem);
+
+	sg_mark_end(sg);
 
 	umem->nmap = ib_dma_map_sg_attrs(context->device,
 				  umem->sg_head.sgl,
-				  umem->npages,
+				  umem->sg_nents,
 				  DMA_BIDIRECTIONAL,
 				  dma_attrs);
 
@@ -216,99 +311,44 @@
 umem_release:
 	__ib_umem_release(context->device, umem, 0);
 vma:
-	down_write(&current->mm->mmap_sem);
-	current->mm->pinned_vm -= ib_umem_num_pages(umem);
-	up_write(&current->mm->mmap_sem);
+	atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
 out:
-	if (vma_list)
-		free_page((unsigned long) vma_list);
 	free_page((unsigned long) page_list);
 umem_kfree:
-	if (ret)
+	if (ret) {
+		mmdrop(umem->owning_mm);
 		kfree(umem);
+	}
 	return ret ? ERR_PTR(ret) : umem;
 }
 EXPORT_SYMBOL(ib_umem_get);
 
-static void ib_umem_account(struct work_struct *work)
-{
-	struct ib_umem *umem = container_of(work, struct ib_umem, work);
-
-	down_write(&umem->mm->mmap_sem);
-	umem->mm->pinned_vm -= umem->diff;
-	up_write(&umem->mm->mmap_sem);
-	mmput(umem->mm);
-	kfree(umem);
-}
-
 /**
  * ib_umem_release - release memory pinned with ib_umem_get
  * @umem: umem struct to release
  */
 void ib_umem_release(struct ib_umem *umem)
 {
-	struct ib_ucontext *context = umem->context;
-	struct mm_struct *mm;
-	struct task_struct *task;
-	unsigned long diff;
-
-	if (umem->odp_data) {
-		ib_umem_odp_release(umem);
+	if (!umem)
 		return;
-	}
+	if (umem->is_odp)
+		return ib_umem_odp_release(to_ib_umem_odp(umem));
 
-	__ib_umem_release(umem->context->device, umem, 1);
+	__ib_umem_release(umem->ibdev, umem, 1);
 
-	task = get_pid_task(umem->context->tgid, PIDTYPE_PID);
-	if (!task)
-		goto out;
-	mm = get_task_mm(task);
-	put_task_struct(task);
-	if (!mm)
-		goto out;
-
-	diff = ib_umem_num_pages(umem);
-
-	/*
-	 * We may be called with the mm's mmap_sem already held.  This
-	 * can happen when a userspace munmap() is the call that drops
-	 * the last reference to our file and calls our release
-	 * method.  If there are memory regions to destroy, we'll end
-	 * up here and not be able to take the mmap_sem.  In that case
-	 * we defer the vm_locked accounting to the system workqueue.
-	 */
-	if (context->closing) {
-		if (!down_write_trylock(&mm->mmap_sem)) {
-			INIT_WORK(&umem->work, ib_umem_account);
-			umem->mm   = mm;
-			umem->diff = diff;
-
-			queue_work(ib_wq, &umem->work);
-			return;
-		}
-	} else
-		down_write(&mm->mmap_sem);
-
-	mm->pinned_vm -= diff;
-	up_write(&mm->mmap_sem);
-	mmput(mm);
-out:
+	atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
+	mmdrop(umem->owning_mm);
 	kfree(umem);
 }
 EXPORT_SYMBOL(ib_umem_release);
 
 int ib_umem_page_count(struct ib_umem *umem)
 {
-	int i;
-	int n;
+	int i, n = 0;
 	struct scatterlist *sg;
 
-	if (umem->odp_data)
-		return ib_umem_num_pages(umem);
-
-	n = 0;
 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
-		n += sg_dma_len(sg) >> umem->page_shift;
+		n += sg_dma_len(sg) >> PAGE_SHIFT;
 
 	return n;
 }
@@ -336,7 +376,7 @@
 		return -EINVAL;
 	}
 
-	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length,
+	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->sg_nents, dst, length,
 				 offset + ib_umem_offset(umem));
 
 	if (ret < 0)
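Reviewer note, not part of the patch: a hedged sketch of how a driver's MR registration path might use the ib_umem_find_best_pgsz() export added above. The function name, the 4K/2M/1G bitmap and the error handling are illustrative assumptions; the umem API calls themselves match the signatures in this file.

#include <linux/sizes.h>
#include <rdma/ib_umem.h>

/* Pin the user buffer and pick a single HW page size covering it. */
static struct ib_umem *my_pin_mr(struct ib_udata *udata, unsigned long addr,
				 size_t length, int access, u64 iova,
				 unsigned long *page_size)
{
	struct ib_umem *umem;

	umem = ib_umem_get(udata, addr, length, access, 0);
	if (IS_ERR(umem))
		return umem;

	/* assume the HW can map 4K, 2M and 1G pages, one size per MR */
	*page_size = ib_umem_find_best_pgsz(umem, SZ_4K | SZ_2M | SZ_1G, iova);
	if (!*page_size) {
		ib_umem_release(umem);
		return ERR_PTR(-EINVAL);
	}
	return umem;
}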
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 6ec748e..163ff7b 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -39,203 +39,113 @@
 #include <linux/export.h>
 #include <linux/vmalloc.h>
 #include <linux/hugetlb.h>
-#include <linux/interval_tree_generic.h>
+#include <linux/interval_tree.h>
+#include <linux/pagemap.h>
 
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_umem.h>
 #include <rdma/ib_umem_odp.h>
 
-/*
- * The ib_umem list keeps track of memory regions for which the HW
- * device request to receive notification when the related memory
- * mapping is changed.
- *
- * ib_umem_lock protects the list.
- */
+#include "uverbs.h"
 
-static u64 node_start(struct umem_odp_node *n)
+static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
 {
-	struct ib_umem_odp *umem_odp =
-			container_of(n, struct ib_umem_odp, interval_tree);
-
-	return ib_umem_start(umem_odp->umem);
-}
-
-/* Note that the representation of the intervals in the interval tree
- * considers the ending point as contained in the interval, while the
- * function ib_umem_end returns the first address which is not contained
- * in the umem.
- */
-static u64 node_last(struct umem_odp_node *n)
-{
-	struct ib_umem_odp *umem_odp =
-			container_of(n, struct ib_umem_odp, interval_tree);
-
-	return ib_umem_end(umem_odp->umem) - 1;
-}
-
-INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
-		     node_start, node_last, static, rbt_ib_umem)
-
-static void ib_umem_notifier_start_account(struct ib_umem *item)
-{
-	mutex_lock(&item->odp_data->umem_mutex);
-
-	/* Only update private counters for this umem if it has them.
-	 * Otherwise skip it. All page faults will be delayed for this umem. */
-	if (item->odp_data->mn_counters_active) {
-		int notifiers_count = item->odp_data->notifiers_count++;
-
-		if (notifiers_count == 0)
-			/* Initialize the completion object for waiting on
-			 * notifiers. Since notifier_count is zero, no one
-			 * should be waiting right now. */
-			reinit_completion(&item->odp_data->notifier_completion);
-	}
-	mutex_unlock(&item->odp_data->umem_mutex);
-}
-
-static void ib_umem_notifier_end_account(struct ib_umem *item)
-{
-	mutex_lock(&item->odp_data->umem_mutex);
-
-	/* Only update private counters for this umem if it has them.
-	 * Otherwise skip it. All page faults will be delayed for this umem. */
-	if (item->odp_data->mn_counters_active) {
+	mutex_lock(&umem_odp->umem_mutex);
+	if (umem_odp->notifiers_count++ == 0)
 		/*
-		 * This sequence increase will notify the QP page fault that
-		 * the page that is going to be mapped in the spte could have
-		 * been freed.
+		 * Initialize the completion object for waiting on
+		 * notifiers. Since notifier_count is zero, no one should be
+		 * waiting right now.
 		 */
-		++item->odp_data->notifiers_seq;
-		if (--item->odp_data->notifiers_count == 0)
-			complete_all(&item->odp_data->notifier_completion);
-	}
-	mutex_unlock(&item->odp_data->umem_mutex);
+		reinit_completion(&umem_odp->notifier_completion);
+	mutex_unlock(&umem_odp->umem_mutex);
 }
 
-/* Account for a new mmu notifier in an ib_ucontext. */
-static void ib_ucontext_notifier_start_account(struct ib_ucontext *context)
+static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
 {
-	atomic_inc(&context->notifier_count);
-}
-
-/* Account for a terminating mmu notifier in an ib_ucontext.
- *
- * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
- * the function takes the semaphore itself. */
-static void ib_ucontext_notifier_end_account(struct ib_ucontext *context)
-{
-	int zero_notifiers = atomic_dec_and_test(&context->notifier_count);
-
-	if (zero_notifiers &&
-	    !list_empty(&context->no_private_counters)) {
-		/* No currently running mmu notifiers. Now is the chance to
-		 * add private accounting to all previously added umems. */
-		struct ib_umem_odp *odp_data, *next;
-
-		/* Prevent concurrent mmu notifiers from working on the
-		 * no_private_counters list. */
-		down_write(&context->umem_rwsem);
-
-		/* Read the notifier_count again, with the umem_rwsem
-		 * semaphore taken for write. */
-		if (!atomic_read(&context->notifier_count)) {
-			list_for_each_entry_safe(odp_data, next,
-						 &context->no_private_counters,
-						 no_private_counters) {
-				mutex_lock(&odp_data->umem_mutex);
-				odp_data->mn_counters_active = true;
-				list_del(&odp_data->no_private_counters);
-				complete_all(&odp_data->notifier_completion);
-				mutex_unlock(&odp_data->umem_mutex);
-			}
-		}
-
-		up_write(&context->umem_rwsem);
-	}
-}
-
-static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start,
-					       u64 end, void *cookie) {
+	mutex_lock(&umem_odp->umem_mutex);
 	/*
-	 * Increase the number of notifiers running, to
-	 * prevent any further fault handling on this MR.
+	 * This sequence increase will notify the QP page fault that the page
+	 * that is going to be mapped in the spte could have been freed.
 	 */
-	ib_umem_notifier_start_account(item);
-	item->odp_data->dying = 1;
-	/* Make sure that the fact the umem is dying is out before we release
-	 * all pending page faults. */
-	smp_wmb();
-	complete_all(&item->odp_data->notifier_completion);
-	item->context->invalidate_range(item, ib_umem_start(item),
-					ib_umem_end(item));
-	return 0;
+	++umem_odp->notifiers_seq;
+	if (--umem_odp->notifiers_count == 0)
+		complete_all(&umem_odp->notifier_completion);
+	mutex_unlock(&umem_odp->umem_mutex);
 }
 
 static void ib_umem_notifier_release(struct mmu_notifier *mn,
 				     struct mm_struct *mm)
 {
-	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+	struct ib_ucontext_per_mm *per_mm =
+		container_of(mn, struct ib_ucontext_per_mm, mn);
+	struct rb_node *node;
 
-	if (!context->invalidate_range)
-		return;
+	down_read(&per_mm->umem_rwsem);
+	if (!per_mm->mn.users)
+		goto out;
 
-	ib_ucontext_notifier_start_account(context);
-	down_read(&context->umem_rwsem);
-	rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
-				      ULLONG_MAX,
-				      ib_umem_notifier_release_trampoline,
-				      true,
-				      NULL);
-	up_read(&context->umem_rwsem);
+	for (node = rb_first_cached(&per_mm->umem_tree); node;
+	     node = rb_next(node)) {
+		struct ib_umem_odp *umem_odp =
+			rb_entry(node, struct ib_umem_odp, interval_tree.rb);
+
+		/*
+		 * Increase the number of notifiers running, to prevent any
+		 * further fault handling on this MR.
+		 */
+		ib_umem_notifier_start_account(umem_odp);
+		complete_all(&umem_odp->notifier_completion);
+		umem_odp->umem.ibdev->ops.invalidate_range(
+			umem_odp, ib_umem_start(umem_odp),
+			ib_umem_end(umem_odp));
+	}
+
+out:
+	up_read(&per_mm->umem_rwsem);
 }
 
-static int invalidate_page_trampoline(struct ib_umem *item, u64 start,
-				      u64 end, void *cookie)
+static int invalidate_range_start_trampoline(struct ib_umem_odp *item,
+					     u64 start, u64 end, void *cookie)
 {
 	ib_umem_notifier_start_account(item);
-	item->context->invalidate_range(item, start, start + PAGE_SIZE);
-	ib_umem_notifier_end_account(item);
-	return 0;
-}
-
-static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
-					     u64 end, void *cookie)
-{
-	ib_umem_notifier_start_account(item);
-	item->context->invalidate_range(item, start, end);
+	item->umem.ibdev->ops.invalidate_range(item, start, end);
 	return 0;
 }
 
 static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
-						    struct mm_struct *mm,
-						    unsigned long start,
-						    unsigned long end,
-						    bool blockable)
+				const struct mmu_notifier_range *range)
 {
-	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
-	int ret;
+	struct ib_ucontext_per_mm *per_mm =
+		container_of(mn, struct ib_ucontext_per_mm, mn);
+	int rc;
 
-	if (!context->invalidate_range)
-		return 0;
-
-	if (blockable)
-		down_read(&context->umem_rwsem);
-	else if (!down_read_trylock(&context->umem_rwsem))
+	if (mmu_notifier_range_blockable(range))
+		down_read(&per_mm->umem_rwsem);
+	else if (!down_read_trylock(&per_mm->umem_rwsem))
 		return -EAGAIN;
 
-	ib_ucontext_notifier_start_account(context);
-	ret = rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
-				      end,
-				      invalidate_range_start_trampoline,
-				      blockable, NULL);
-	up_read(&context->umem_rwsem);
+	if (!per_mm->mn.users) {
+		up_read(&per_mm->umem_rwsem);
+		/*
+		 * At this point users is permanently zero and visible to this
+		 * CPU without a lock; that fact is relied on to skip the unlock
+		 * in range_end.
+		 */
+		return 0;
+	}
 
-	return ret;
+	rc = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
+					   range->end,
+					   invalidate_range_start_trampoline,
+					   mmu_notifier_range_blockable(range),
+					   NULL);
+	if (rc)
+		up_read(&per_mm->umem_rwsem);
+	return rc;
 }
 
-static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
+static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
 					   u64 end, void *cookie)
 {
 	ib_umem_notifier_end_account(item);
@@ -243,234 +153,296 @@
 }
 
 static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
-						  struct mm_struct *mm,
-						  unsigned long start,
-						  unsigned long end)
+				const struct mmu_notifier_range *range)
 {
-	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+	struct ib_ucontext_per_mm *per_mm =
+		container_of(mn, struct ib_ucontext_per_mm, mn);
 
-	if (!context->invalidate_range)
+	if (unlikely(!per_mm->mn.users))
 		return;
 
-	/*
-	 * TODO: we currently bail out if there is any sleepable work to be done
-	 * in ib_umem_notifier_invalidate_range_start so we shouldn't really block
-	 * here. But this is ugly and fragile.
-	 */
-	down_read(&context->umem_rwsem);
-	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
-				      end,
+	rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
+				      range->end,
 				      invalidate_range_end_trampoline, true, NULL);
-	up_read(&context->umem_rwsem);
-	ib_ucontext_notifier_end_account(context);
+	up_read(&per_mm->umem_rwsem);
+}
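
Note the asymmetric locking: the read-side rwsem is intentionally left held when range_start succeeds and is only dropped here in range_end. A sketch of the pairing this relies on, assuming the mmu_notifier core always issues a matching range_end after a range_start that returned 0:

/* Illustration only: the lock spans the two notifier callbacks. */
static void example_notifier_pairing(struct rw_semaphore *umem_rwsem)
{
	down_read(umem_rwsem);	/* taken in ..._invalidate_range_start() */
	/* invalidation work runs between the callbacks with the lock held */
	up_read(umem_rwsem);	/* dropped in ..._invalidate_range_end() */
}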
+
+static struct mmu_notifier *ib_umem_alloc_notifier(struct mm_struct *mm)
+{
+	struct ib_ucontext_per_mm *per_mm;
+
+	per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL);
+	if (!per_mm)
+		return ERR_PTR(-ENOMEM);
+
+	per_mm->umem_tree = RB_ROOT_CACHED;
+	init_rwsem(&per_mm->umem_rwsem);
+
+	WARN_ON(mm != current->mm);
+	rcu_read_lock();
+	per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
+	rcu_read_unlock();
+	return &per_mm->mn;
+}
+
+static void ib_umem_free_notifier(struct mmu_notifier *mn)
+{
+	struct ib_ucontext_per_mm *per_mm =
+		container_of(mn, struct ib_ucontext_per_mm, mn);
+
+	WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root));
+
+	put_pid(per_mm->tgid);
+	kfree(per_mm);
 }
 
 static const struct mmu_notifier_ops ib_umem_notifiers = {
 	.release                    = ib_umem_notifier_release,
 	.invalidate_range_start     = ib_umem_notifier_invalidate_range_start,
 	.invalidate_range_end       = ib_umem_notifier_invalidate_range_end,
+	.alloc_notifier		    = ib_umem_alloc_notifier,
+	.free_notifier		    = ib_umem_free_notifier,
 };
 
-struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context,
-				  unsigned long addr,
-				  size_t size)
+static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp)
 {
-	struct ib_umem *umem;
-	struct ib_umem_odp *odp_data;
-	int pages = size >> PAGE_SHIFT;
+	struct ib_ucontext_per_mm *per_mm;
+	struct mmu_notifier *mn;
 	int ret;
 
-	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
-	if (!umem)
-		return ERR_PTR(-ENOMEM);
+	umem_odp->umem.is_odp = 1;
+	if (!umem_odp->is_implicit_odp) {
+		size_t page_size = 1UL << umem_odp->page_shift;
+		size_t pages;
 
-	umem->context    = context;
-	umem->length     = size;
-	umem->address    = addr;
-	umem->page_shift = PAGE_SHIFT;
-	umem->writable   = 1;
+		umem_odp->interval_tree.start =
+			ALIGN_DOWN(umem_odp->umem.address, page_size);
+		if (check_add_overflow(umem_odp->umem.address,
+				       (unsigned long)umem_odp->umem.length,
+				       &umem_odp->interval_tree.last))
+			return -EOVERFLOW;
+		umem_odp->interval_tree.last =
+			ALIGN(umem_odp->interval_tree.last, page_size);
+		if (unlikely(umem_odp->interval_tree.last < page_size))
+			return -EOVERFLOW;
+
+		pages = (umem_odp->interval_tree.last -
+			 umem_odp->interval_tree.start) >>
+			umem_odp->page_shift;
+		if (!pages)
+			return -EINVAL;
+
+		/*
+		 * Note that the representation of the intervals in the
+		 * interval tree considers the ending point as contained in
+		 * the interval.
+		 */
+		umem_odp->interval_tree.last--;
+
+		umem_odp->page_list = kvcalloc(
+			pages, sizeof(*umem_odp->page_list), GFP_KERNEL);
+		if (!umem_odp->page_list)
+			return -ENOMEM;
+
+		umem_odp->dma_list = kvcalloc(
+			pages, sizeof(*umem_odp->dma_list), GFP_KERNEL);
+		if (!umem_odp->dma_list) {
+			ret = -ENOMEM;
+			goto out_page_list;
+		}
+	}
+
+	mn = mmu_notifier_get(&ib_umem_notifiers, umem_odp->umem.owning_mm);
+	if (IS_ERR(mn)) {
+		ret = PTR_ERR(mn);
+		goto out_dma_list;
+	}
+	umem_odp->per_mm = per_mm =
+		container_of(mn, struct ib_ucontext_per_mm, mn);
+
+	mutex_init(&umem_odp->umem_mutex);
+	init_completion(&umem_odp->notifier_completion);
+
+	if (!umem_odp->is_implicit_odp) {
+		down_write(&per_mm->umem_rwsem);
+		interval_tree_insert(&umem_odp->interval_tree,
+				     &per_mm->umem_tree);
+		up_write(&per_mm->umem_rwsem);
+	}
+	mmgrab(umem_odp->umem.owning_mm);
+
+	return 0;
+
+out_dma_list:
+	kvfree(umem_odp->dma_list);
+out_page_list:
+	kvfree(umem_odp->page_list);
+	return ret;
+}
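
A standalone illustration of the alignment and inclusive-end arithmetic performed by ib_init_umem_odp() above (the address, length and 4 KiB page size are made-up values, and the overflow checks are elided):

#include <stdint.h>
#include <stdio.h>

#define EX_ALIGN_DOWN(x, a) ((x) & ~((uint64_t)(a) - 1))
#define EX_ALIGN(x, a)      (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
	uint64_t addr = 0x7f0000001230, length = 0x3000, page_size = 4096;
	uint64_t start = EX_ALIGN_DOWN(addr, page_size);
	uint64_t last = EX_ALIGN(addr + length, page_size);
	uint64_t pages = (last - start) >> 12;

	/* the interval tree stores an inclusive end, hence the decrement */
	last--;
	printf("start=%#llx last=%#llx pages=%llu\n",
	       (unsigned long long)start, (unsigned long long)last,
	       (unsigned long long)pages);
	/* prints: start=0x7f0000001000 last=0x7f0000004fff pages=4 */
	return 0;
}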
+
+/**
+ * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem
+ *
+ * Implicit ODP umems do not have a VA range and do not have any page lists.
+ * They exist only to hold the per_mm reference to help the driver create
+ * children umems.
+ *
+ * @udata: udata from the syscall being used to create the umem
+ * @access: ib_reg_mr access flags
+ */
+struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
+					       int access)
+{
+	struct ib_ucontext *context =
+		container_of(udata, struct uverbs_attr_bundle, driver_udata)
+			->context;
+	struct ib_umem *umem;
+	struct ib_umem_odp *umem_odp;
+	int ret;
+
+	if (access & IB_ACCESS_HUGETLB)
+		return ERR_PTR(-EINVAL);
+
+	if (!context)
+		return ERR_PTR(-EIO);
+	if (WARN_ON_ONCE(!context->device->ops.invalidate_range))
+		return ERR_PTR(-EINVAL);
+
+	umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
+	if (!umem_odp)
+		return ERR_PTR(-ENOMEM);
+	umem = &umem_odp->umem;
+	umem->ibdev = context->device;
+	umem->writable = ib_access_writable(access);
+	umem->owning_mm = current->mm;
+	umem_odp->is_implicit_odp = 1;
+	umem_odp->page_shift = PAGE_SHIFT;
+
+	ret = ib_init_umem_odp(umem_odp);
+	if (ret) {
+		kfree(umem_odp);
+		return ERR_PTR(ret);
+	}
+	return umem_odp;
+}
+EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
+
+/**
+ * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit
+ *                           parent ODP umem
+ *
+ * @root: The parent umem enclosing the child. This must be allocated using
+ *        ib_umem_odp_alloc_implicit()
+ * @addr: The starting userspace VA
+ * @size: The length of the userspace VA
+ */
+struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root,
+					    unsigned long addr, size_t size)
+{
+	/*
+	 * Caller must ensure that root cannot be freed during the call to
+	 * ib_umem_odp_alloc_child().
+	 */
+	struct ib_umem_odp *odp_data;
+	struct ib_umem *umem;
+	int ret;
+
+	if (WARN_ON(!root->is_implicit_odp))
+		return ERR_PTR(-EINVAL);
 
 	odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
-	if (!odp_data) {
-		ret = -ENOMEM;
-		goto out_umem;
+	if (!odp_data)
+		return ERR_PTR(-ENOMEM);
+	umem = &odp_data->umem;
+	umem->ibdev = root->umem.ibdev;
+	umem->length     = size;
+	umem->address    = addr;
+	umem->writable   = root->umem.writable;
+	umem->owning_mm  = root->umem.owning_mm;
+	odp_data->page_shift = PAGE_SHIFT;
+
+	ret = ib_init_umem_odp(odp_data);
+	if (ret) {
+		kfree(odp_data);
+		return ERR_PTR(ret);
 	}
-	odp_data->umem = umem;
-
-	mutex_init(&odp_data->umem_mutex);
-	init_completion(&odp_data->notifier_completion);
-
-	odp_data->page_list =
-		vzalloc(array_size(pages, sizeof(*odp_data->page_list)));
-	if (!odp_data->page_list) {
-		ret = -ENOMEM;
-		goto out_odp_data;
-	}
-
-	odp_data->dma_list =
-		vzalloc(array_size(pages, sizeof(*odp_data->dma_list)));
-	if (!odp_data->dma_list) {
-		ret = -ENOMEM;
-		goto out_page_list;
-	}
-
-	down_write(&context->umem_rwsem);
-	context->odp_mrs_count++;
-	rbt_ib_umem_insert(&odp_data->interval_tree, &context->umem_tree);
-	if (likely(!atomic_read(&context->notifier_count)))
-		odp_data->mn_counters_active = true;
-	else
-		list_add(&odp_data->no_private_counters,
-			 &context->no_private_counters);
-	up_write(&context->umem_rwsem);
-
-	umem->odp_data = odp_data;
-
-	return umem;
-
-out_page_list:
-	vfree(odp_data->page_list);
-out_odp_data:
-	kfree(odp_data);
-out_umem:
-	kfree(umem);
-	return ERR_PTR(ret);
+	return odp_data;
 }
-EXPORT_SYMBOL(ib_alloc_odp_umem);
+EXPORT_SYMBOL(ib_umem_odp_alloc_child);
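
A hedged sketch of how a driver might pair the implicit parent with child umems; the helper below is hypothetical and not part of this patch:

static struct ib_umem_odp *example_get_child_range(struct ib_umem_odp *parent,
						   unsigned long addr,
						   size_t len)
{
	struct ib_umem_odp *child;

	if (!parent->is_implicit_odp)
		return ERR_PTR(-EINVAL);

	child = ib_umem_odp_alloc_child(parent, addr, len);
	if (IS_ERR(child))
		return child;

	/* The driver would now point its HW translation tables at
	 * child->dma_list and fault pages in on demand. */
	return child;
}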
 
-int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem,
-		    int access)
+/**
+ * ib_umem_odp_get - Create a umem_odp for a userspace va
+ *
+ * @udata: userspace context to pin memory for
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ *
+ * The driver should use this function when the access flags indicate ODP
+ * memory. It avoids pinning; instead, it stores the mm for future page fault
+ * handling in conjunction with MMU notifiers.
+ */
+struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
+				    size_t size, int access)
 {
-	int ret_val;
-	struct pid *our_pid;
-	struct mm_struct *mm = get_task_mm(current);
+	struct ib_umem_odp *umem_odp;
+	struct ib_ucontext *context;
+	struct mm_struct *mm;
+	int ret;
 
-	if (!mm)
-		return -EINVAL;
+	if (!udata)
+		return ERR_PTR(-EIO);
 
+	context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
+			  ->context;
+	if (!context)
+		return ERR_PTR(-EIO);
+
+	if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)) ||
+	    WARN_ON_ONCE(!context->device->ops.invalidate_range))
+		return ERR_PTR(-EINVAL);
+
+	umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
+	if (!umem_odp)
+		return ERR_PTR(-ENOMEM);
+
+	umem_odp->umem.ibdev = context->device;
+	umem_odp->umem.length = size;
+	umem_odp->umem.address = addr;
+	umem_odp->umem.writable = ib_access_writable(access);
+	umem_odp->umem.owning_mm = mm = current->mm;
+
+	umem_odp->page_shift = PAGE_SHIFT;
 	if (access & IB_ACCESS_HUGETLB) {
 		struct vm_area_struct *vma;
 		struct hstate *h;
 
 		down_read(&mm->mmap_sem);
-		vma = find_vma(mm, ib_umem_start(umem));
+		vma = find_vma(mm, ib_umem_start(umem_odp));
 		if (!vma || !is_vm_hugetlb_page(vma)) {
 			up_read(&mm->mmap_sem);
-			return -EINVAL;
+			ret = -EINVAL;
+			goto err_free;
 		}
 		h = hstate_vma(vma);
-		umem->page_shift = huge_page_shift(h);
+		umem_odp->page_shift = huge_page_shift(h);
 		up_read(&mm->mmap_sem);
-		umem->hugetlb = 1;
-	} else {
-		umem->hugetlb = 0;
 	}
 
-	/* Prevent creating ODP MRs in child processes */
-	rcu_read_lock();
-	our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
-	rcu_read_unlock();
-	put_pid(our_pid);
-	if (context->tgid != our_pid) {
-		ret_val = -EINVAL;
-		goto out_mm;
-	}
+	ret = ib_init_umem_odp(umem_odp);
+	if (ret)
+		goto err_free;
+	return umem_odp;
 
-	umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL);
-	if (!umem->odp_data) {
-		ret_val = -ENOMEM;
-		goto out_mm;
-	}
-	umem->odp_data->umem = umem;
-
-	mutex_init(&umem->odp_data->umem_mutex);
-
-	init_completion(&umem->odp_data->notifier_completion);
-
-	if (ib_umem_num_pages(umem)) {
-		umem->odp_data->page_list =
-			vzalloc(array_size(sizeof(*umem->odp_data->page_list),
-					   ib_umem_num_pages(umem)));
-		if (!umem->odp_data->page_list) {
-			ret_val = -ENOMEM;
-			goto out_odp_data;
-		}
-
-		umem->odp_data->dma_list =
-			vzalloc(array_size(sizeof(*umem->odp_data->dma_list),
-					   ib_umem_num_pages(umem)));
-		if (!umem->odp_data->dma_list) {
-			ret_val = -ENOMEM;
-			goto out_page_list;
-		}
-	}
-
-	/*
-	 * When using MMU notifiers, we will get a
-	 * notification before the "current" task (and MM) is
-	 * destroyed. We use the umem_rwsem semaphore to synchronize.
-	 */
-	down_write(&context->umem_rwsem);
-	context->odp_mrs_count++;
-	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
-		rbt_ib_umem_insert(&umem->odp_data->interval_tree,
-				   &context->umem_tree);
-	if (likely(!atomic_read(&context->notifier_count)) ||
-	    context->odp_mrs_count == 1)
-		umem->odp_data->mn_counters_active = true;
-	else
-		list_add(&umem->odp_data->no_private_counters,
-			 &context->no_private_counters);
-	downgrade_write(&context->umem_rwsem);
-
-	if (context->odp_mrs_count == 1) {
-		/*
-		 * Note that at this point, no MMU notifier is running
-		 * for this context!
-		 */
-		atomic_set(&context->notifier_count, 0);
-		INIT_HLIST_NODE(&context->mn.hlist);
-		context->mn.ops = &ib_umem_notifiers;
-		/*
-		 * Lock-dep detects a false positive for mmap_sem vs.
-		 * umem_rwsem, due to not grasping downgrade_write correctly.
-		 */
-		lockdep_off();
-		ret_val = mmu_notifier_register(&context->mn, mm);
-		lockdep_on();
-		if (ret_val) {
-			pr_err("Failed to register mmu_notifier %d\n", ret_val);
-			ret_val = -EBUSY;
-			goto out_mutex;
-		}
-	}
-
-	up_read(&context->umem_rwsem);
-
-	/*
-	 * Note that doing an mmput can cause a notifier for the relevant mm.
-	 * If the notifier is called while we hold the umem_rwsem, this will
-	 * cause a deadlock. Therefore, we release the reference only after we
-	 * released the semaphore.
-	 */
-	mmput(mm);
-	return 0;
-
-out_mutex:
-	up_read(&context->umem_rwsem);
-	vfree(umem->odp_data->dma_list);
-out_page_list:
-	vfree(umem->odp_data->page_list);
-out_odp_data:
-	kfree(umem->odp_data);
-out_mm:
-	mmput(mm);
-	return ret_val;
+err_free:
+	kfree(umem_odp);
+	return ERR_PTR(ret);
 }
+EXPORT_SYMBOL(ib_umem_odp_get);
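
A sketch of the expected driver-side usage (hypothetical helper, not taken from this patch): the registration path checks for IB_ACCESS_ON_DEMAND and defers all pinning to the fault path.

static struct ib_umem_odp *example_reg_odp_mr(struct ib_udata *udata,
					      u64 start, u64 length,
					      int access)
{
	struct ib_umem_odp *odp;

	if (!(access & IB_ACCESS_ON_DEMAND))
		return ERR_PTR(-EOPNOTSUPP);

	odp = ib_umem_odp_get(udata, start, length, access);
	if (IS_ERR(odp))
		return odp;

	/* No pages are pinned yet; they are brought in later via
	 * ib_umem_odp_map_dma_pages() from the page-fault handler. */
	return odp;
}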
 
-void ib_umem_odp_release(struct ib_umem *umem)
+void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
 {
-	struct ib_ucontext *context = umem->context;
+	struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
 
 	/*
 	 * Ensure that no more pages are mapped in the umem.
@@ -478,62 +450,36 @@
 	 * It is the driver's responsibility to ensure, before calling us,
 	 * that the hardware will not attempt to access the MR any more.
 	 */
-	ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem),
-				    ib_umem_end(umem));
-
-	down_write(&context->umem_rwsem);
-	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
-		rbt_ib_umem_remove(&umem->odp_data->interval_tree,
-				   &context->umem_tree);
-	context->odp_mrs_count--;
-	if (!umem->odp_data->mn_counters_active) {
-		list_del(&umem->odp_data->no_private_counters);
-		complete_all(&umem->odp_data->notifier_completion);
+	if (!umem_odp->is_implicit_odp) {
+		mutex_lock(&umem_odp->umem_mutex);
+		ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
+					    ib_umem_end(umem_odp));
+		mutex_unlock(&umem_odp->umem_mutex);
+		kvfree(umem_odp->dma_list);
+		kvfree(umem_odp->page_list);
 	}
 
+	down_write(&per_mm->umem_rwsem);
+	if (!umem_odp->is_implicit_odp) {
+		interval_tree_remove(&umem_odp->interval_tree,
+				     &per_mm->umem_tree);
+		complete_all(&umem_odp->notifier_completion);
+	}
 	/*
-	 * Downgrade the lock to a read lock. This ensures that the notifiers
-	 * (who lock the mutex for reading) will be able to finish, and we
-	 * will be able to enventually obtain the mmu notifiers SRCU. Note
-	 * that since we are doing it atomically, no other user could register
-	 * and unregister while we do the check.
+	 * NOTE! mmu_notifier_unregister() can happen between a start/end
+	 * callback, resulting in a missing end, and thus an unbalanced
+	 * the memory that holds the lock; however, LOCKDEP doesn't like this.
+	 * the memory that holds the lock, however LOCKDEP doesn't like this.
+	 * Thus we call the mmu_notifier_put under the rwsem and test the
+	 * internal users count to reliably see if we are past this point.
 	 */
-	downgrade_write(&context->umem_rwsem);
-	if (!context->odp_mrs_count) {
-		struct task_struct *owning_process = NULL;
-		struct mm_struct *owning_mm        = NULL;
+	mmu_notifier_put(&per_mm->mn);
+	up_write(&per_mm->umem_rwsem);
 
-		owning_process = get_pid_task(context->tgid,
-					      PIDTYPE_PID);
-		if (owning_process == NULL)
-			/*
-			 * The process is already dead, notifier were removed
-			 * already.
-			 */
-			goto out;
-
-		owning_mm = get_task_mm(owning_process);
-		if (owning_mm == NULL)
-			/*
-			 * The process' mm is already dead, notifier were
-			 * removed already.
-			 */
-			goto out_put_task;
-		mmu_notifier_unregister(&context->mn, owning_mm);
-
-		mmput(owning_mm);
-
-out_put_task:
-		put_task_struct(owning_process);
-	}
-out:
-	up_read(&context->umem_rwsem);
-
-	vfree(umem->odp_data->dma_list);
-	vfree(umem->odp_data->page_list);
-	kfree(umem->odp_data);
-	kfree(umem);
+	mmdrop(umem_odp->umem.owning_mm);
+	kfree(umem_odp);
 }
+EXPORT_SYMBOL(ib_umem_odp_release);
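
And the matching teardown, sketched for a hypothetical driver dereg path: the driver must first fence all HW access, then hand the ODP umem back to the core.

static void example_dereg_odp_mr(struct ib_umem_odp *odp)
{
	/* 1. Destroy or invalidate the HW MKEY so no further DMA occurs. */
	/* 2. Let the core unmap any remaining pages and drop the per-mm
	 *    notifier reference. */
	ib_umem_odp_release(odp);
}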
 
 /*
  * Map for DMA and insert a single page into the on-demand paging page tables.
@@ -544,25 +490,24 @@
  * @access_mask: access permissions needed for this page.
  * @current_seq: sequence number for synchronization with invalidations.
  *               the sequence number is taken from
- *               umem->odp_data->notifiers_seq.
+ *               umem_odp->notifiers_seq.
  *
  * The function returns -EFAULT if the DMA mapping operation fails. It returns
  * -EAGAIN if a concurrent invalidation prevents us from updating the page.
  *
- * The page is released via put_page even if the operation failed. For
+ * The page is released via put_user_page even if the operation failed. For
  * on-demand pinning, the page is released whenever it isn't stored in the
  * umem.
  */
 static int ib_umem_odp_map_dma_single_page(
-		struct ib_umem *umem,
+		struct ib_umem_odp *umem_odp,
 		int page_index,
 		struct page *page,
 		u64 access_mask,
 		unsigned long current_seq)
 {
-	struct ib_device *dev = umem->context->device;
+	struct ib_device *dev = umem_odp->umem.ibdev;
 	dma_addr_t dma_addr;
-	int stored_page = 0;
 	int remove_existing_mapping = 0;
 	int ret = 0;
 
@@ -571,45 +516,43 @@
 	 * handle case of a racing notifier. This check also allows us to bail
 	 * early if we have a notifier running in parallel with us.
 	 */
-	if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
+	if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) {
 		ret = -EAGAIN;
 		goto out;
 	}
-	if (!(umem->odp_data->dma_list[page_index])) {
-		dma_addr = ib_dma_map_page(dev,
-					   page,
-					   0, BIT(umem->page_shift),
-					   DMA_BIDIRECTIONAL);
+	if (!(umem_odp->dma_list[page_index])) {
+		dma_addr =
+			ib_dma_map_page(dev, page, 0, BIT(umem_odp->page_shift),
+					DMA_BIDIRECTIONAL);
 		if (ib_dma_mapping_error(dev, dma_addr)) {
 			ret = -EFAULT;
 			goto out;
 		}
-		umem->odp_data->dma_list[page_index] = dma_addr | access_mask;
-		umem->odp_data->page_list[page_index] = page;
-		umem->npages++;
-		stored_page = 1;
-	} else if (umem->odp_data->page_list[page_index] == page) {
-		umem->odp_data->dma_list[page_index] |= access_mask;
+		umem_odp->dma_list[page_index] = dma_addr | access_mask;
+		umem_odp->page_list[page_index] = page;
+		umem_odp->npages++;
+	} else if (umem_odp->page_list[page_index] == page) {
+		umem_odp->dma_list[page_index] |= access_mask;
 	} else {
 		pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
-		       umem->odp_data->page_list[page_index], page);
+		       umem_odp->page_list[page_index], page);
 		/* Better remove the mapping now, to prevent any further
 		 * damage. */
 		remove_existing_mapping = 1;
 	}
 
 out:
-	/* On Demand Paging - avoid pinning the page */
-	if (umem->context->invalidate_range || !stored_page)
-		put_page(page);
+	put_user_page(page);
 
-	if (remove_existing_mapping && umem->context->invalidate_range) {
-		invalidate_page_trampoline(
-			umem,
-			ib_umem_start(umem) + (page_index >> umem->page_shift),
-			ib_umem_start(umem) + ((page_index + 1) >>
-					       umem->page_shift),
-			NULL);
+	if (remove_existing_mapping) {
+		ib_umem_notifier_start_account(umem_odp);
+		dev->ops.invalidate_range(
+			umem_odp,
+			ib_umem_start(umem_odp) +
+				(page_index << umem_odp->page_shift),
+			ib_umem_start(umem_odp) +
+				((page_index + 1) << umem_odp->page_shift));
+		ib_umem_notifier_end_account(umem_odp);
 		ret = -EAGAIN;
 	}
 
@@ -621,7 +564,7 @@
  *
  * Pins the range of pages passed in the argument, and maps them to
  * DMA addresses. The DMA addresses of the mapped pages is updated in
- * umem->odp_data->dma_list.
+ * umem_odp->dma_list.
  *
  * Returns the number of pages mapped in success, negative error code
  * for failure.
@@ -629,7 +572,7 @@
  * the function from completing its task.
 * An -ENOENT error code indicates that the userspace process is being terminated
  * and mm was already destroyed.
- * @umem: the umem to map and pin
+ * @umem_odp: the umem to map and pin
  * @user_virt: the address from which we need to map.
  * @bcnt: the minimal number of bytes to pin and map. The mapping might be
  *        bigger due to alignment, and may also be smaller in case of an error
@@ -639,52 +582,52 @@
  *               range.
 * @current_seq: the MMU notifiers sequence value for synchronization with
 *               invalidations. The sequence number is read from
- *               umem->odp_data->notifiers_seq before calling this function
+ *               umem_odp->notifiers_seq before calling this function
  */
-int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
-			      u64 access_mask, unsigned long current_seq)
+int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
+			      u64 bcnt, u64 access_mask,
+			      unsigned long current_seq)
 {
 	struct task_struct *owning_process  = NULL;
-	struct mm_struct   *owning_mm       = NULL;
+	struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
 	struct page       **local_page_list = NULL;
 	u64 page_mask, off;
-	int j, k, ret = 0, start_idx, npages = 0, page_shift;
-	unsigned int flags = 0;
+	int j, k, ret = 0, start_idx, npages = 0;
+	unsigned int flags = 0, page_shift;
 	phys_addr_t p = 0;
 
 	if (access_mask == 0)
 		return -EINVAL;
 
-	if (user_virt < ib_umem_start(umem) ||
-	    user_virt + bcnt > ib_umem_end(umem))
+	if (user_virt < ib_umem_start(umem_odp) ||
+	    user_virt + bcnt > ib_umem_end(umem_odp))
 		return -EFAULT;
 
 	local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
 	if (!local_page_list)
 		return -ENOMEM;
 
-	page_shift = umem->page_shift;
+	page_shift = umem_odp->page_shift;
 	page_mask = ~(BIT(page_shift) - 1);
 	off = user_virt & (~page_mask);
 	user_virt = user_virt & page_mask;
 	bcnt += off; /* Charge for the first page offset as well. */
 
-	owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID);
-	if (owning_process == NULL) {
+	/*
+	 * owning_process is allowed to be NULL; this means the mm somehow
+	 * outlives the originating process. Presumably mmget_not_zero() will
+	 * fail in this case.
+	 */
+	owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID);
+	if (!owning_process || !mmget_not_zero(owning_mm)) {
 		ret = -EINVAL;
-		goto out_no_task;
-	}
-
-	owning_mm = get_task_mm(owning_process);
-	if (owning_mm == NULL) {
-		ret = -ENOENT;
 		goto out_put_task;
 	}
 
 	if (access_mask & ODP_WRITE_ALLOWED_BIT)
 		flags |= FOLL_WRITE;
 
-	start_idx = (user_virt - ib_umem_start(umem)) >> page_shift;
+	start_idx = (user_virt - ib_umem_start(umem_odp)) >> page_shift;
 	k = start_idx;
 
 	while (bcnt > 0) {
@@ -705,11 +648,16 @@
 				flags, local_page_list, NULL, NULL);
 		up_read(&owning_mm->mmap_sem);
 
-		if (npages < 0)
+		if (npages < 0) {
+			if (npages != -EAGAIN)
+				pr_warn("fail to get %zu user pages with error %d\n", gup_num_pages, npages);
+			else
+				pr_debug("fail to get %zu user pages with error %d\n", gup_num_pages, npages);
 			break;
+		}
 
 		bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
-		mutex_lock(&umem->odp_data->umem_mutex);
+		mutex_lock(&umem_odp->umem_mutex);
 		for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) {
 			if (user_virt & ~page_mask) {
 				p += PAGE_SIZE;
@@ -717,25 +665,35 @@
 					ret = -EFAULT;
 					break;
 				}
-				put_page(local_page_list[j]);
+				put_user_page(local_page_list[j]);
 				continue;
 			}
 
 			ret = ib_umem_odp_map_dma_single_page(
-					umem, k, local_page_list[j],
+					umem_odp, k, local_page_list[j],
 					access_mask, current_seq);
-			if (ret < 0)
+			if (ret < 0) {
+				if (ret != -EAGAIN)
+					pr_warn("ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
+				else
+					pr_debug("ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
 				break;
+			}
 
 			p = page_to_phys(local_page_list[j]);
 			k++;
 		}
-		mutex_unlock(&umem->odp_data->umem_mutex);
+		mutex_unlock(&umem_odp->umem_mutex);
 
 		if (ret < 0) {
-			/* Release left over pages when handling errors. */
-			for (++j; j < npages; ++j)
-				put_page(local_page_list[j]);
+			/*
+			 * Release pages, remembering that the first page
+			 * to hit an error was already released by
+			 * ib_umem_odp_map_dma_single_page().
+			 */
+			if (npages - (j + 1) > 0)
+				put_user_pages(&local_page_list[j+1],
+					       npages - (j + 1));
 			break;
 		}
 	}
@@ -749,38 +707,40 @@
 
 	mmput(owning_mm);
 out_put_task:
-	put_task_struct(owning_process);
-out_no_task:
+	if (owning_process)
+		put_task_struct(owning_process);
 	free_page((unsigned long)local_page_list);
 	return ret;
 }
 EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);
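
A sketch of the retry loop a driver fault handler might wrap around this function (hypothetical helper; real drivers additionally re-check ib_umem_mmu_notifier_retry() under their own locks before programming the HW):

static int example_fault_range(struct ib_umem_odp *odp, u64 va, u64 len,
			       u64 access_mask)
{
	int npages;

	do {
		unsigned long seq = READ_ONCE(odp->notifiers_seq);

		npages = ib_umem_odp_map_dma_pages(odp, va, len, access_mask,
						   seq);
		/* -EAGAIN means an invalidation raced with us; try again. */
	} while (npages == -EAGAIN);

	return npages < 0 ? npages : 0;
}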
 
-void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
+void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 				 u64 bound)
 {
 	int idx;
 	u64 addr;
-	struct ib_device *dev = umem->context->device;
+	struct ib_device *dev = umem_odp->umem.ibdev;
 
-	virt  = max_t(u64, virt,  ib_umem_start(umem));
-	bound = min_t(u64, bound, ib_umem_end(umem));
+	lockdep_assert_held(&umem_odp->umem_mutex);
+
+	virt = max_t(u64, virt, ib_umem_start(umem_odp));
+	bound = min_t(u64, bound, ib_umem_end(umem_odp));
 	/* Note that during the run of this function, the
 	 * notifiers_count of the MR is > 0, preventing any racing
 	 * faults from completing. We might be racing with other
 	 * invalidations, so we must make sure we free each page only
 	 * once. */
-	mutex_lock(&umem->odp_data->umem_mutex);
-	for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) {
-		idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
-		if (umem->odp_data->page_list[idx]) {
-			struct page *page = umem->odp_data->page_list[idx];
-			dma_addr_t dma = umem->odp_data->dma_list[idx];
+	for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
+		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
+		if (umem_odp->page_list[idx]) {
+			struct page *page = umem_odp->page_list[idx];
+			dma_addr_t dma = umem_odp->dma_list[idx];
 			dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;
 
 			WARN_ON(!dma_addr);
 
-			ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
+			ib_dma_unmap_page(dev, dma_addr,
+					  BIT(umem_odp->page_shift),
 					  DMA_BIDIRECTIONAL);
 			if (dma & ODP_WRITE_ALLOWED_BIT) {
 				struct page *head_page = compound_head(page);
@@ -795,15 +755,11 @@
 				 */
 				set_page_dirty(head_page);
 			}
-			/* on demand pinning support */
-			if (!umem->context->invalidate_range)
-				put_page(page);
-			umem->odp_data->page_list[idx] = NULL;
-			umem->odp_data->dma_list[idx] = 0;
-			umem->npages--;
+			umem_odp->page_list[idx] = NULL;
+			umem_odp->dma_list[idx] = 0;
+			umem_odp->npages--;
 		}
 	}
-	mutex_unlock(&umem->odp_data->umem_mutex);
 }
 EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
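
With the mutex now asserted inside ib_umem_odp_unmap_dma_pages(), the locking falls to the callers. A sketch of what a driver's invalidate_range callback might look like (hypothetical; the HW-specific teardown is only described in comments):

static void example_invalidate_range(struct ib_umem_odp *umem_odp,
				     unsigned long start, unsigned long end)
{
	mutex_lock(&umem_odp->umem_mutex);
	/* 1. Quiesce HW access to [start, end) in the driver's page tables. */
	/* 2. Drop the core's DMA mappings for the same range. */
	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
	mutex_unlock(&umem_odp->umem_mutex);
}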
 
@@ -817,35 +773,21 @@
 				  void *cookie)
 {
 	int ret_val = 0;
-	struct umem_odp_node *node, *next;
+	struct interval_tree_node *node, *next;
 	struct ib_umem_odp *umem;
 
 	if (unlikely(start == last))
 		return ret_val;
 
-	for (node = rbt_ib_umem_iter_first(root, start, last - 1);
+	for (node = interval_tree_iter_first(root, start, last - 1);
 			node; node = next) {
 		/* TODO move the blockable decision up to the callback */
 		if (!blockable)
 			return -EAGAIN;
-		next = rbt_ib_umem_iter_next(node, start, last - 1);
+		next = interval_tree_iter_next(node, start, last - 1);
 		umem = container_of(node, struct ib_umem_odp, interval_tree);
-		ret_val = cb(umem->umem, start, last, cookie) || ret_val;
+		ret_val = cb(umem, start, last, cookie) || ret_val;
 	}
 
 	return ret_val;
 }
-EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range);
-
-struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root,
-				       u64 addr, u64 length)
-{
-	struct umem_odp_node *node;
-
-	node = rbt_ib_umem_iter_first(root, addr, addr + length - 1);
-	if (node)
-		return container_of(node, struct ib_umem_odp, interval_tree);
-	return NULL;
-
-}
-EXPORT_SYMBOL(rbt_ib_umem_lookup);
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index c34a685..d1407fa 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -49,11 +49,13 @@
 #include <linux/sched.h>
 #include <linux/semaphore.h>
 #include <linux/slab.h>
+#include <linux/nospec.h>
 
 #include <linux/uaccess.h>
 
 #include <rdma/ib_mad.h>
 #include <rdma/ib_user_mad.h>
+#include <rdma/rdma_netlink.h>
 
 #include "core_priv.h"
 
@@ -88,10 +90,9 @@
 
 struct ib_umad_port {
 	struct cdev           cdev;
-	struct device	      *dev;
-
+	struct device	      dev;
 	struct cdev           sm_cdev;
-	struct device	      *sm_dev;
+	struct device	      sm_dev;
 	struct semaphore       sm_sem;
 
 	struct mutex	       file_mutex;
@@ -104,8 +105,8 @@
 };
 
 struct ib_umad_device {
-	struct kobject       kobj;
-	struct ib_umad_port  port[0];
+	struct kref kref;
+	struct ib_umad_port ports[];
 };
 
 struct ib_umad_file {
@@ -130,7 +131,8 @@
 	struct ib_user_mad mad;
 };
 
-static struct class *umad_class;
+#define CREATE_TRACE_POINTS
+#include <trace/events/ib_umad.h>
 
 static const dev_t base_umad_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
 static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) +
@@ -138,22 +140,28 @@
 static dev_t dynamic_umad_dev;
 static dev_t dynamic_issm_dev;
 
-static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
+static DEFINE_IDA(umad_ida);
 
 static void ib_umad_add_one(struct ib_device *device);
 static void ib_umad_remove_one(struct ib_device *device, void *client_data);
 
-static void ib_umad_release_dev(struct kobject *kobj)
+static void ib_umad_dev_free(struct kref *kref)
 {
 	struct ib_umad_device *dev =
-		container_of(kobj, struct ib_umad_device, kobj);
+		container_of(kref, struct ib_umad_device, kref);
 
 	kfree(dev);
 }
 
-static struct kobj_type ib_umad_dev_ktype = {
-	.release = ib_umad_release_dev,
-};
+static void ib_umad_dev_get(struct ib_umad_device *dev)
+{
+	kref_get(&dev->kref);
+}
+
+static void ib_umad_dev_put(struct ib_umad_device *dev)
+{
+	kref_put(&dev->kref, ib_umad_dev_free);
+}
 
 static int hdr_size(struct ib_umad_file *file)
 {
@@ -205,7 +213,7 @@
 	struct ib_umad_packet *packet = send_wc->send_buf->context[0];
 
 	dequeue_send(file, packet);
-	rdma_destroy_ah(packet->msg->ah);
+	rdma_destroy_ah(packet->msg->ah, RDMA_DESTROY_AH_SLEEPABLE);
 	ib_free_send_mad(packet->msg);
 
 	if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
@@ -331,6 +339,9 @@
 				return -EFAULT;
 		}
 	}
+
+	trace_ib_umad_read_recv(file, &packet->mad.hdr, &recv_buf->mad->mad_hdr);
+
 	return hdr_size(file) + packet->length;
 }
 
@@ -350,6 +361,9 @@
 	if (copy_to_user(buf, packet->mad.data, packet->length))
 		return -EFAULT;
 
+	trace_ib_umad_read_send(file, &packet->mad.hdr,
+				(struct ib_mad_hdr *)&packet->mad.data);
+
 	return size;
 }
 
@@ -505,6 +519,9 @@
 
 	mutex_lock(&file->mutex);
 
+	trace_ib_umad_write(file, &packet->mad.hdr,
+			    (struct ib_mad_hdr *)&packet->mad.data);
+
 	agent = __get_agent(file, packet->mad.hdr.id);
 	if (!agent) {
 		ret = -EINVAL;
@@ -621,7 +638,7 @@
 err_msg:
 	ib_free_send_mad(packet->msg);
 err_ah:
-	rdma_destroy_ah(ah);
+	rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE);
 err_up:
 	mutex_unlock(&file->mutex);
 err:
@@ -657,7 +674,7 @@
 	mutex_lock(&file->mutex);
 
 	if (!file->port->ib_dev) {
-		dev_notice(file->port->dev,
+		dev_notice(&file->port->dev,
 			   "ib_umad_reg_agent: invalid device\n");
 		ret = -EPIPE;
 		goto out;
@@ -669,7 +686,7 @@
 	}
 
 	if (ureq.qpn != 0 && ureq.qpn != 1) {
-		dev_notice(file->port->dev,
+		dev_notice(&file->port->dev,
 			   "ib_umad_reg_agent: invalid QPN %d specified\n",
 			   ureq.qpn);
 		ret = -EINVAL;
@@ -680,7 +697,7 @@
 		if (!__get_agent(file, agent_id))
 			goto found;
 
-	dev_notice(file->port->dev,
+	dev_notice(&file->port->dev,
 		   "ib_umad_reg_agent: Max Agents (%u) reached\n",
 		   IB_UMAD_MAX_AGENTS);
 	ret = -ENOMEM;
@@ -725,11 +742,11 @@
 	if (!file->already_used) {
 		file->already_used = 1;
 		if (!file->use_pkey_index) {
-			dev_warn(file->port->dev,
+			dev_warn(&file->port->dev,
 				"process %s did not enable P_Key index support.\n",
 				current->comm);
-			dev_warn(file->port->dev,
-				"   Documentation/infiniband/user_mad.txt has info on the new ABI.\n");
+			dev_warn(&file->port->dev,
+				"   Documentation/infiniband/user_mad.rst has info on the new ABI.\n");
 		}
 	}
 
@@ -759,7 +776,7 @@
 	mutex_lock(&file->mutex);
 
 	if (!file->port->ib_dev) {
-		dev_notice(file->port->dev,
+		dev_notice(&file->port->dev,
 			   "ib_umad_reg_agent2: invalid device\n");
 		ret = -EPIPE;
 		goto out;
@@ -771,7 +788,7 @@
 	}
 
 	if (ureq.qpn != 0 && ureq.qpn != 1) {
-		dev_notice(file->port->dev,
+		dev_notice(&file->port->dev,
 			   "ib_umad_reg_agent2: invalid QPN %d specified\n",
 			   ureq.qpn);
 		ret = -EINVAL;
@@ -779,7 +796,7 @@
 	}
 
 	if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) {
-		dev_notice(file->port->dev,
+		dev_notice(&file->port->dev,
 			   "ib_umad_reg_agent2 failed: invalid registration flags specified 0x%x; supported 0x%x\n",
 			   ureq.flags, IB_USER_MAD_REG_FLAGS_CAP);
 		ret = -EINVAL;
@@ -796,7 +813,7 @@
 		if (!__get_agent(file, agent_id))
 			goto found;
 
-	dev_notice(file->port->dev,
+	dev_notice(&file->port->dev,
 		   "ib_umad_reg_agent2: Max Agents (%u) reached\n",
 		   IB_UMAD_MAX_AGENTS);
 	ret = -ENOMEM;
@@ -808,7 +825,7 @@
 		req.mgmt_class         = ureq.mgmt_class;
 		req.mgmt_class_version = ureq.mgmt_class_version;
 		if (ureq.oui & 0xff000000) {
-			dev_notice(file->port->dev,
+			dev_notice(&file->port->dev,
 				   "ib_umad_reg_agent2 failed: oui invalid 0x%08x\n",
 				   ureq.oui);
 			ret = -EINVAL;
@@ -868,11 +885,14 @@
 
 	if (get_user(id, arg))
 		return -EFAULT;
+	if (id >= IB_UMAD_MAX_AGENTS)
+		return -EINVAL;
 
 	mutex_lock(&file->port->file_mutex);
 	mutex_lock(&file->mutex);
 
-	if (id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) {
+	id = array_index_nospec(id, IB_UMAD_MAX_AGENTS);
+	if (!__get_agent(file, id)) {
 		ret = -EINVAL;
 		goto out;
 	}
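
The two-step check above follows the standard Spectre-v1 mitigation: validate the user-controlled index first, then clamp it with array_index_nospec() before it is used to index kernel memory. A generic sketch (the table is hypothetical):

static void *example_lookup(void **table, u32 id, u32 nr_entries)
{
	if (id >= nr_entries)
		return NULL;
	id = array_index_nospec(id, nr_entries);
	return table[id];
}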
@@ -954,19 +974,27 @@
 {
 	struct ib_umad_port *port;
 	struct ib_umad_file *file;
-	int ret = -ENXIO;
+	int ret = 0;
 
 	port = container_of(inode->i_cdev, struct ib_umad_port, cdev);
 
 	mutex_lock(&port->file_mutex);
 
-	if (!port->ib_dev)
+	if (!port->ib_dev) {
+		ret = -ENXIO;
 		goto out;
+	}
 
-	ret = -ENOMEM;
-	file = kzalloc(sizeof *file, GFP_KERNEL);
-	if (!file)
+	if (!rdma_dev_access_netns(port->ib_dev, current->nsproxy->net_ns)) {
+		ret = -EPERM;
 		goto out;
+	}
+
+	file = kzalloc(sizeof(*file), GFP_KERNEL);
+	if (!file) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	mutex_init(&file->mutex);
 	spin_lock_init(&file->send_lock);
@@ -979,15 +1007,7 @@
 
 	list_add_tail(&file->port_list, &port->file_list);
 
-	ret = nonseekable_open(inode, filp);
-	if (ret) {
-		list_del(&file->port_list);
-		kfree(file);
-		goto out;
-	}
-
-	kobject_get(&port->umad_dev->kobj);
-
+	stream_open(inode, filp);
 out:
 	mutex_unlock(&port->file_mutex);
 	return ret;
@@ -996,7 +1016,6 @@
 static int ib_umad_close(struct inode *inode, struct file *filp)
 {
 	struct ib_umad_file *file = filp->private_data;
-	struct ib_umad_device *dev = file->port->umad_dev;
 	struct ib_umad_packet *packet, *tmp;
 	int already_dead;
 	int i;
@@ -1023,10 +1042,8 @@
 				ib_unregister_mad_agent(file->agent[i]);
 
 	mutex_unlock(&file->port->file_mutex);
-
+	mutex_destroy(&file->mutex);
 	kfree(file);
-	kobject_put(&dev->kobj);
-
 	return 0;
 }
 
@@ -1066,24 +1083,20 @@
 		}
 	}
 
+	if (!rdma_dev_access_netns(port->ib_dev, current->nsproxy->net_ns)) {
+		ret = -EPERM;
+		goto err_up_sem;
+	}
+
 	ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
 	if (ret)
 		goto err_up_sem;
 
 	filp->private_data = port;
 
-	ret = nonseekable_open(inode, filp);
-	if (ret)
-		goto err_clr_sm_cap;
-
-	kobject_get(&port->umad_dev->kobj);
-
+	nonseekable_open(inode, filp);
 	return 0;
 
-err_clr_sm_cap:
-	swap(props.set_port_cap_mask, props.clr_port_cap_mask);
-	ib_modify_port(port->ib_dev, port->port_num, 0, &props);
-
 err_up_sem:
 	up(&port->sm_sem);
 
@@ -1106,8 +1119,6 @@
 
 	up(&port->sm_sem);
 
-	kobject_put(&port->umad_dev->kobj);
-
 	return ret;
 }
 
@@ -1118,13 +1129,50 @@
 	.llseek	 = no_llseek,
 };
 
+static int ib_umad_get_nl_info(struct ib_device *ibdev, void *client_data,
+			       struct ib_client_nl_info *res)
+{
+	struct ib_umad_device *umad_dev = client_data;
+
+	if (!rdma_is_port_valid(ibdev, res->port))
+		return -EINVAL;
+
+	res->abi = IB_USER_MAD_ABI_VERSION;
+	res->cdev = &umad_dev->ports[res->port - rdma_start_port(ibdev)].dev;
+
+	return 0;
+}
+
 static struct ib_client umad_client = {
 	.name   = "umad",
 	.add    = ib_umad_add_one,
-	.remove = ib_umad_remove_one
+	.remove = ib_umad_remove_one,
+	.get_nl_info = ib_umad_get_nl_info,
 };
+MODULE_ALIAS_RDMA_CLIENT("umad");
 
-static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+static int ib_issm_get_nl_info(struct ib_device *ibdev, void *client_data,
+			       struct ib_client_nl_info *res)
+{
+	struct ib_umad_device *umad_dev =
+		ib_get_client_data(ibdev, &umad_client);
+
+	if (!rdma_is_port_valid(ibdev, res->port))
+		return -EINVAL;
+
+	res->abi = IB_USER_MAD_ABI_VERSION;
+	res->cdev = &umad_dev->ports[res->port - rdma_start_port(ibdev)].sm_dev;
+
+	return 0;
+}
+
+static struct ib_client issm_client = {
+	.name = "issm",
+	.get_nl_info = ib_issm_get_nl_info,
+};
+MODULE_ALIAS_RDMA_CLIENT("issm");
+
+static ssize_t ibdev_show(struct device *dev, struct device_attribute *attr,
 			  char *buf)
 {
 	struct ib_umad_port *port = dev_get_drvdata(dev);
@@ -1132,11 +1180,11 @@
 	if (!port)
 		return -ENODEV;
 
-	return sprintf(buf, "%s\n", port->ib_dev->name);
+	return sprintf(buf, "%s\n", dev_name(&port->ib_dev->dev));
 }
-static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+static DEVICE_ATTR_RO(ibdev);
 
-static ssize_t show_port(struct device *dev, struct device_attribute *attr,
+static ssize_t port_show(struct device *dev, struct device_attribute *attr,
 			 char *buf)
 {
 	struct ib_umad_port *port = dev_get_drvdata(dev);
@@ -1146,10 +1194,59 @@
 
 	return sprintf(buf, "%d\n", port->port_num);
 }
-static DEVICE_ATTR(port, S_IRUGO, show_port, NULL);
+static DEVICE_ATTR_RO(port);
 
-static CLASS_ATTR_STRING(abi_version, S_IRUGO,
-			 __stringify(IB_USER_MAD_ABI_VERSION));
+static struct attribute *umad_class_dev_attrs[] = {
+	&dev_attr_ibdev.attr,
+	&dev_attr_port.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(umad_class_dev);
+
+static char *umad_devnode(struct device *dev, umode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
+static ssize_t abi_version_show(struct class *class,
+				struct class_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", IB_USER_MAD_ABI_VERSION);
+}
+static CLASS_ATTR_RO(abi_version);
+
+static struct attribute *umad_class_attrs[] = {
+	&class_attr_abi_version.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(umad_class);
+
+static struct class umad_class = {
+	.name		= "infiniband_mad",
+	.devnode	= umad_devnode,
+	.class_groups	= umad_class_groups,
+	.dev_groups	= umad_class_dev_groups,
+};
+
+static void ib_umad_release_port(struct device *device)
+{
+	struct ib_umad_port *port = dev_get_drvdata(device);
+	struct ib_umad_device *umad_dev = port->umad_dev;
+
+	ib_umad_dev_put(umad_dev);
+}
+
+static void ib_umad_init_port_dev(struct device *dev,
+				  struct ib_umad_port *port,
+				  const struct ib_device *device)
+{
+	device_initialize(dev);
+	ib_umad_dev_get(port->umad_dev);
+	dev->class = &umad_class;
+	dev->parent = device->dev.parent;
+	dev_set_drvdata(dev, port);
+	dev->release = ib_umad_release_port;
+}
 
 static int ib_umad_init_port(struct ib_device *device, int port_num,
 			     struct ib_umad_device *umad_dev,
@@ -1158,12 +1255,12 @@
 	int devnum;
 	dev_t base_umad;
 	dev_t base_issm;
+	int ret;
 
-	devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
-	if (devnum >= IB_UMAD_MAX_PORTS)
+	devnum = ida_alloc_max(&umad_ida, IB_UMAD_MAX_PORTS - 1, GFP_KERNEL);
+	if (devnum < 0)
 		return -1;
 	port->dev_num = devnum;
-	set_bit(devnum, dev_map);
 	if (devnum >= IB_UMAD_NUM_FIXED_MINOR) {
 		base_umad = dynamic_umad_dev + devnum - IB_UMAD_NUM_FIXED_MINOR;
 		base_issm = dynamic_issm_dev + devnum - IB_UMAD_NUM_FIXED_MINOR;
@@ -1173,63 +1270,41 @@
 	}
 
 	port->ib_dev   = device;
+	port->umad_dev = umad_dev;
 	port->port_num = port_num;
 	sema_init(&port->sm_sem, 1);
 	mutex_init(&port->file_mutex);
 	INIT_LIST_HEAD(&port->file_list);
 
+	ib_umad_init_port_dev(&port->dev, port, device);
+	port->dev.devt = base_umad;
+	dev_set_name(&port->dev, "umad%d", port->dev_num);
 	cdev_init(&port->cdev, &umad_fops);
 	port->cdev.owner = THIS_MODULE;
-	cdev_set_parent(&port->cdev, &umad_dev->kobj);
-	kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num);
-	if (cdev_add(&port->cdev, base_umad, 1))
+
+	ret = cdev_device_add(&port->cdev, &port->dev);
+	if (ret)
 		goto err_cdev;
 
-	port->dev = device_create(umad_class, device->dev.parent,
-				  port->cdev.dev, port,
-				  "umad%d", port->dev_num);
-	if (IS_ERR(port->dev))
-		goto err_cdev;
-
-	if (device_create_file(port->dev, &dev_attr_ibdev))
-		goto err_dev;
-	if (device_create_file(port->dev, &dev_attr_port))
-		goto err_dev;
-
+	ib_umad_init_port_dev(&port->sm_dev, port, device);
+	port->sm_dev.devt = base_issm;
+	dev_set_name(&port->sm_dev, "issm%d", port->dev_num);
 	cdev_init(&port->sm_cdev, &umad_sm_fops);
 	port->sm_cdev.owner = THIS_MODULE;
-	cdev_set_parent(&port->sm_cdev, &umad_dev->kobj);
-	kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num);
-	if (cdev_add(&port->sm_cdev, base_issm, 1))
-		goto err_sm_cdev;
 
-	port->sm_dev = device_create(umad_class, device->dev.parent,
-				     port->sm_cdev.dev, port,
-				     "issm%d", port->dev_num);
-	if (IS_ERR(port->sm_dev))
-		goto err_sm_cdev;
-
-	if (device_create_file(port->sm_dev, &dev_attr_ibdev))
-		goto err_sm_dev;
-	if (device_create_file(port->sm_dev, &dev_attr_port))
-		goto err_sm_dev;
+	ret = cdev_device_add(&port->sm_cdev, &port->sm_dev);
+	if (ret)
+		goto err_dev;
 
 	return 0;
 
-err_sm_dev:
-	device_destroy(umad_class, port->sm_cdev.dev);
-
-err_sm_cdev:
-	cdev_del(&port->sm_cdev);
-
 err_dev:
-	device_destroy(umad_class, port->cdev.dev);
-
+	put_device(&port->sm_dev);
+	cdev_device_del(&port->cdev, &port->dev);
 err_cdev:
-	cdev_del(&port->cdev);
-	clear_bit(devnum, dev_map);
-
-	return -1;
+	put_device(&port->dev);
+	ida_free(&umad_ida, devnum);
+	return ret;
 }
 
 static void ib_umad_kill_port(struct ib_umad_port *port)
@@ -1237,17 +1312,11 @@
 	struct ib_umad_file *file;
 	int id;
 
-	dev_set_drvdata(port->dev,    NULL);
-	dev_set_drvdata(port->sm_dev, NULL);
-
-	device_destroy(umad_class, port->cdev.dev);
-	device_destroy(umad_class, port->sm_cdev.dev);
-
-	cdev_del(&port->cdev);
-	cdev_del(&port->sm_cdev);
-
 	mutex_lock(&port->file_mutex);
 
+	/* Mark ib_dev NULL and block ioctl or other file ops from progressing
+	 * further.
+	 */
 	port->ib_dev = NULL;
 
 	list_for_each_entry(file, &port->file_list, port_list) {
@@ -1261,7 +1330,14 @@
 	}
 
 	mutex_unlock(&port->file_mutex);
-	clear_bit(port->dev_num, dev_map);
+
+	cdev_device_del(&port->sm_cdev, &port->sm_dev);
+	cdev_device_del(&port->cdev, &port->dev);
+	ida_free(&umad_ida, port->dev_num);
+
+	/* balances device_initialize() */
+	put_device(&port->sm_dev);
+	put_device(&port->dev);
 }
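
The port devices now follow the usual embedded-struct-device lifetime pattern. A condensed sketch of that idiom (generic kernel usage, not specific to this driver):

static int example_add_chardev(struct device *dev, struct cdev *cdev,
			       const struct file_operations *fops, dev_t devt)
{
	int ret;

	device_initialize(dev);		/* refcount starts at 1 */
	dev->devt = devt;
	cdev_init(cdev, fops);
	ret = cdev_device_add(cdev, dev);
	if (ret)
		put_device(dev);	/* drop the initial reference on error */

	/* Teardown later pairs cdev_device_del() with a final put_device(). */
	return ret;
}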
 
 static void ib_umad_add_one(struct ib_device *device)
@@ -1273,22 +1349,17 @@
 	s = rdma_start_port(device);
 	e = rdma_end_port(device);
 
-	umad_dev = kzalloc(sizeof *umad_dev +
-			   (e - s + 1) * sizeof (struct ib_umad_port),
-			   GFP_KERNEL);
+	umad_dev = kzalloc(struct_size(umad_dev, ports, e - s + 1), GFP_KERNEL);
 	if (!umad_dev)
 		return;
 
-	kobject_init(&umad_dev->kobj, &ib_umad_dev_ktype);
-
+	kref_init(&umad_dev->kref);
 	for (i = s; i <= e; ++i) {
 		if (!rdma_cap_ib_mad(device, i))
 			continue;
 
-		umad_dev->port[i - s].umad_dev = umad_dev;
-
 		if (ib_umad_init_port(device, i, umad_dev,
-				      &umad_dev->port[i - s]))
+				      &umad_dev->ports[i - s]))
 			goto err;
 
 		count++;
@@ -1306,31 +1377,28 @@
 		if (!rdma_cap_ib_mad(device, i))
 			continue;
 
-		ib_umad_kill_port(&umad_dev->port[i - s]);
+		ib_umad_kill_port(&umad_dev->ports[i - s]);
 	}
 free:
-	kobject_put(&umad_dev->kobj);
+	/* balances kref_init */
+	ib_umad_dev_put(umad_dev);
 }
 
 static void ib_umad_remove_one(struct ib_device *device, void *client_data)
 {
 	struct ib_umad_device *umad_dev = client_data;
-	int i;
+	unsigned int i;
 
 	if (!umad_dev)
 		return;
 
-	for (i = 0; i <= rdma_end_port(device) - rdma_start_port(device); ++i) {
-		if (rdma_cap_ib_mad(device, i + rdma_start_port(device)))
-			ib_umad_kill_port(&umad_dev->port[i]);
+	rdma_for_each_port (device, i) {
+		if (rdma_cap_ib_mad(device, i))
+			ib_umad_kill_port(
+				&umad_dev->ports[i - rdma_start_port(device)]);
 	}
-
-	kobject_put(&umad_dev->kobj);
-}
-
-static char *umad_devnode(struct device *dev, umode_t *mode)
-{
-	return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+	/* balances kref_init() */
+	ib_umad_dev_put(umad_dev);
 }
 
 static int __init ib_umad_init(void)
@@ -1339,7 +1407,7 @@
 
 	ret = register_chrdev_region(base_umad_dev,
 				     IB_UMAD_NUM_FIXED_MINOR * 2,
-				     "infiniband_mad");
+				     umad_class.name);
 	if (ret) {
 		pr_err("couldn't register device number\n");
 		goto out;
@@ -1347,38 +1415,33 @@
 
 	ret = alloc_chrdev_region(&dynamic_umad_dev, 0,
 				  IB_UMAD_NUM_DYNAMIC_MINOR * 2,
-				  "infiniband_mad");
+				  umad_class.name);
 	if (ret) {
 		pr_err("couldn't register dynamic device number\n");
 		goto out_alloc;
 	}
 	dynamic_issm_dev = dynamic_umad_dev + IB_UMAD_NUM_DYNAMIC_MINOR;
 
-	umad_class = class_create(THIS_MODULE, "infiniband_mad");
-	if (IS_ERR(umad_class)) {
-		ret = PTR_ERR(umad_class);
+	ret = class_register(&umad_class);
+	if (ret) {
 		pr_err("couldn't create class infiniband_mad\n");
 		goto out_chrdev;
 	}
 
-	umad_class->devnode = umad_devnode;
-
-	ret = class_create_file(umad_class, &class_attr_abi_version.attr);
-	if (ret) {
-		pr_err("couldn't create abi_version attribute\n");
-		goto out_class;
-	}
-
 	ret = ib_register_client(&umad_client);
-	if (ret) {
-		pr_err("couldn't register ib_umad client\n");
+	if (ret)
 		goto out_class;
-	}
+
+	ret = ib_register_client(&issm_client);
+	if (ret)
+		goto out_client;
 
 	return 0;
 
+out_client:
+	ib_unregister_client(&umad_client);
 out_class:
-	class_destroy(umad_class);
+	class_unregister(&umad_class);
 
 out_chrdev:
 	unregister_chrdev_region(dynamic_umad_dev,
@@ -1394,8 +1457,9 @@
 
 static void __exit ib_umad_cleanup(void)
 {
+	ib_unregister_client(&issm_client);
 	ib_unregister_client(&umad_client);
-	class_destroy(umad_class);
+	class_unregister(&umad_class);
 	unregister_chrdev_region(base_umad_dev,
 				 IB_UMAD_NUM_FIXED_MINOR * 2);
 	unregister_chrdev_region(dynamic_umad_dev,
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 5df8e54..63f7f7d 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -98,15 +98,16 @@
 
 struct ib_uverbs_device {
 	atomic_t				refcount;
-	int					num_comp_vectors;
+	u32					num_comp_vectors;
 	struct completion			comp;
-	struct device			       *dev;
+	struct device				dev;
+	/* First group for device attributes, NULL terminated array */
+	const struct attribute_group		*groups[2];
 	struct ib_device	__rcu	       *ib_dev;
 	int					devnum;
 	struct cdev			        cdev;
 	struct rb_root				xrcd_tree;
 	struct mutex				xrcd_tree_mutex;
-	struct kobject				kobj;
 	struct srcu_struct			disassociate_srcu;
 	struct mutex				lists_mutex; /* protect lists */
 	struct list_head			uverbs_file_list;
@@ -146,7 +147,6 @@
 	struct ib_event_handler			event_handler;
 	struct ib_uverbs_async_event_file       *async_file;
 	struct list_head			list;
-	int					is_closed;
 
 	/*
 	 * To access the uobjects list hw_destroy_rwsem must be held for write
@@ -158,12 +158,11 @@
 	spinlock_t		uobjects_lock;
 	struct list_head	uobjects;
 
-	u64 uverbs_cmd_mask;
-	u64 uverbs_ex_cmd_mask;
+	struct mutex umap_lock;
+	struct list_head umaps;
+	struct page *disassociate_page;
 
-	struct idr		idr;
-	/* spinlock protects write access to idr */
-	spinlock_t		idr_lock;
+	struct xarray		idr;
 };
 
 struct ib_uverbs_event {
@@ -218,12 +217,6 @@
 	u32			async_events_reported;
 };
 
-struct ib_uflow_resources;
-struct ib_uflow_object {
-	struct ib_uobject		uobject;
-	struct ib_uflow_resources	*resources;
-};
-
 extern const struct file_operations uverbs_event_fops;
 void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue);
 struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file,
@@ -246,13 +239,13 @@
 void ib_uverbs_event_handler(struct ib_event_handler *handler,
 			     struct ib_event *event);
 int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd,
-			   enum rdma_remove_reason why);
+			   enum rdma_remove_reason why,
+			   struct uverbs_attr_bundle *attrs);
 
 int uverbs_dealloc_mw(struct ib_mw *mw);
 void ib_uverbs_detach_umcast(struct ib_qp *qp,
 			     struct ib_uqp_object *uobj);
 
-void create_udata(struct uverbs_attr_bundle *ctx, struct ib_udata *udata);
 long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 
 struct ib_uverbs_flow_spec {
@@ -300,63 +293,29 @@
 extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_DM);
 extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_COUNTERS);
 
-#define IB_UVERBS_DECLARE_CMD(name)					\
-	ssize_t ib_uverbs_##name(struct ib_uverbs_file *file,		\
-				 const char __user *buf, int in_len,	\
-				 int out_len)
+/*
+ * ib_uverbs_query_port_resp.port_cap_flags started out as just a copy of the
+ * PortInfo CapabilityMask, but was extended with unique bits.
+ */
+static inline u32 make_port_cap_flags(const struct ib_port_attr *attr)
+{
+	u32 res;
 
-IB_UVERBS_DECLARE_CMD(get_context);
-IB_UVERBS_DECLARE_CMD(query_device);
-IB_UVERBS_DECLARE_CMD(query_port);
-IB_UVERBS_DECLARE_CMD(alloc_pd);
-IB_UVERBS_DECLARE_CMD(dealloc_pd);
-IB_UVERBS_DECLARE_CMD(reg_mr);
-IB_UVERBS_DECLARE_CMD(rereg_mr);
-IB_UVERBS_DECLARE_CMD(dereg_mr);
-IB_UVERBS_DECLARE_CMD(alloc_mw);
-IB_UVERBS_DECLARE_CMD(dealloc_mw);
-IB_UVERBS_DECLARE_CMD(create_comp_channel);
-IB_UVERBS_DECLARE_CMD(create_cq);
-IB_UVERBS_DECLARE_CMD(resize_cq);
-IB_UVERBS_DECLARE_CMD(poll_cq);
-IB_UVERBS_DECLARE_CMD(req_notify_cq);
-IB_UVERBS_DECLARE_CMD(destroy_cq);
-IB_UVERBS_DECLARE_CMD(create_qp);
-IB_UVERBS_DECLARE_CMD(open_qp);
-IB_UVERBS_DECLARE_CMD(query_qp);
-IB_UVERBS_DECLARE_CMD(modify_qp);
-IB_UVERBS_DECLARE_CMD(destroy_qp);
-IB_UVERBS_DECLARE_CMD(post_send);
-IB_UVERBS_DECLARE_CMD(post_recv);
-IB_UVERBS_DECLARE_CMD(post_srq_recv);
-IB_UVERBS_DECLARE_CMD(create_ah);
-IB_UVERBS_DECLARE_CMD(destroy_ah);
-IB_UVERBS_DECLARE_CMD(attach_mcast);
-IB_UVERBS_DECLARE_CMD(detach_mcast);
-IB_UVERBS_DECLARE_CMD(create_srq);
-IB_UVERBS_DECLARE_CMD(modify_srq);
-IB_UVERBS_DECLARE_CMD(query_srq);
-IB_UVERBS_DECLARE_CMD(destroy_srq);
-IB_UVERBS_DECLARE_CMD(create_xsrq);
-IB_UVERBS_DECLARE_CMD(open_xrcd);
-IB_UVERBS_DECLARE_CMD(close_xrcd);
+	/* All IBA CapabilityMask bits are passed through here, except bit 26,
+	 * which is overridden with IP_BASED_GIDS. This is due to a historical
+	 * mistake in the implementation of IP_BASED_GIDS. Otherwise all other
+	 * bits match the IBA definition across all kernel versions.
+	 */
+	res = attr->port_cap_flags & ~(u32)IB_UVERBS_PCF_IP_BASED_GIDS;
 
-#define IB_UVERBS_DECLARE_EX_CMD(name)				\
-	int ib_uverbs_ex_##name(struct ib_uverbs_file *file,	\
-				struct ib_udata *ucore,		\
-				struct ib_udata *uhw)
+	if (attr->ip_gids)
+		res |= IB_UVERBS_PCF_IP_BASED_GIDS;
 
-IB_UVERBS_DECLARE_EX_CMD(create_flow);
-IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
-IB_UVERBS_DECLARE_EX_CMD(query_device);
-IB_UVERBS_DECLARE_EX_CMD(create_cq);
-IB_UVERBS_DECLARE_EX_CMD(create_qp);
-IB_UVERBS_DECLARE_EX_CMD(create_wq);
-IB_UVERBS_DECLARE_EX_CMD(modify_wq);
-IB_UVERBS_DECLARE_EX_CMD(destroy_wq);
-IB_UVERBS_DECLARE_EX_CMD(create_rwq_ind_table);
-IB_UVERBS_DECLARE_EX_CMD(destroy_rwq_ind_table);
-IB_UVERBS_DECLARE_EX_CMD(modify_qp);
-IB_UVERBS_DECLARE_EX_CMD(modify_cq);
+	return res;
+}
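+/*
+ * Illustrative sketch, not part of this change: assuming
+ * IB_UVERBS_PCF_IP_BASED_GIDS is bit 26, the helper above reports bit 26
+ * purely from attr->ip_gids, regardless of what the raw CapabilityMask
+ * said:
+ *
+ *	struct ib_port_attr attr = { .port_cap_flags = 1 << 26 };
+ *
+ *	make_port_cap_flags(&attr);	// bit 26 clear: no IP-based GIDs
+ *	attr.ip_gids = 1;
+ *	make_port_cap_flags(&attr);	// bit 26 set: means IP_BASED_GIDS
+ */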
 
+void copy_port_attr_to_resp(struct ib_port_attr *attr,
+			    struct ib_uverbs_query_port_resp *resp,
+			    struct ib_device *ib_dev, u8 port_num);
 #endif /* UVERBS_H */
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index e012ca8..14a80fd 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -47,11 +47,149 @@
 #include "uverbs.h"
 #include "core_priv.h"
 
+/*
+ * Copy a response to userspace. If the provided 'resp' is larger than the
+ * user buffer it is silently truncated. If the user provided a larger buffer
+ * then the trailing portion is zero filled.
+ *
+ * These semantics are intended to support future extension of the output
+ * structures.
+ */
+static int uverbs_response(struct uverbs_attr_bundle *attrs, const void *resp,
+			   size_t resp_len)
+{
+	int ret;
+
+	if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CORE_OUT))
+		return uverbs_copy_to_struct_or_zero(
+			attrs, UVERBS_ATTR_CORE_OUT, resp, resp_len);
+
+	if (copy_to_user(attrs->ucore.outbuf, resp,
+			 min(attrs->ucore.outlen, resp_len)))
+		return -EFAULT;
+
+	if (resp_len < attrs->ucore.outlen) {
+		/*
+		 * Zero fill any extra memory that user
+		 * space might have provided.
+		 */
+		ret = clear_user(attrs->ucore.outbuf + resp_len,
+				 attrs->ucore.outlen - resp_len);
+		if (ret)
+			return -EFAULT;
+	}
+
+	return 0;
+}
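+/*
+ * Usage sketch (hypothetical 'foo' command, not part of this change): a
+ * handler fills its full response and lets uverbs_response() reconcile it
+ * with whatever buffer size userspace passed. Old userspace with a short
+ * buffer sees a truncated (but valid) prefix; newer userspace with a
+ * larger buffer gets the trailing bytes cleared:
+ *
+ *	struct ib_uverbs_foo_resp resp = {};
+ *
+ *	resp.handle = uobj->id;
+ *	return uverbs_response(attrs, &resp, sizeof(resp));
+ */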
+
+/*
+ * Copy a request from userspace. If the provided 'req' is larger than the
+ * user buffer then the user buffer is zero extended into the 'req'. If 'req'
+ * is smaller than the user buffer then the uncopied bytes in the user buffer
+ * must be zero.
+ */
+static int uverbs_request(struct uverbs_attr_bundle *attrs, void *req,
+			  size_t req_len)
+{
+	if (copy_from_user(req, attrs->ucore.inbuf,
+			   min(attrs->ucore.inlen, req_len)))
+		return -EFAULT;
+
+	if (attrs->ucore.inlen < req_len) {
+		memset(req + attrs->ucore.inlen, 0,
+		       req_len - attrs->ucore.inlen);
+	} else if (attrs->ucore.inlen > req_len) {
+		if (!ib_is_buffer_cleared(attrs->ucore.inbuf + req_len,
+					  attrs->ucore.inlen - req_len))
+			return -EOPNOTSUPP;
+	}
+	return 0;
+}
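+/*
+ * Usage sketch (hypothetical 'foo' command): if userspace predates a new
+ * trailing member of the command struct, that member arrives as zero; if
+ * userspace sends trailing bytes this kernel does not understand, they
+ * must be zero or the command fails with -EOPNOTSUPP:
+ *
+ *	struct ib_uverbs_foo cmd;
+ *	int ret;
+ *
+ *	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ *	if (ret)
+ *		return ret;
+ */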
+
+/*
+ * Generate the value for the 'response_length' protocol used by write_ex.
+ * This is the number of bytes the kernel actually wrote. Userspace can use
+ * this to detect what structure members in the response the kernel
+ * understood.
+ */
+static u32 uverbs_response_length(struct uverbs_attr_bundle *attrs,
+				  size_t resp_len)
+{
+	return min_t(size_t, attrs->ucore.outlen, resp_len);
+}
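+/*
+ * Worked example with hypothetical sizes: if the kernel's response struct
+ * is 32 bytes but userspace only provided 24 bytes of output space,
+ * response_length is reported as 24, so userspace knows none of the newer
+ * trailing members were written for it.
+ */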
+
+/*
+ * The iterator version of the request interface is for handlers that need to
+ * step over a flex array at the end of a command header.
+ */
+struct uverbs_req_iter {
+	const void __user *cur;
+	const void __user *end;
+};
+
+static int uverbs_request_start(struct uverbs_attr_bundle *attrs,
+				struct uverbs_req_iter *iter,
+				void *req,
+				size_t req_len)
+{
+	if (attrs->ucore.inlen < req_len)
+		return -ENOSPC;
+
+	if (copy_from_user(req, attrs->ucore.inbuf, req_len))
+		return -EFAULT;
+
+	iter->cur = attrs->ucore.inbuf + req_len;
+	iter->end = attrs->ucore.inbuf + attrs->ucore.inlen;
+	return 0;
+}
+
+static int uverbs_request_next(struct uverbs_req_iter *iter, void *val,
+			       size_t len)
+{
+	if (iter->cur + len > iter->end)
+		return -ENOSPC;
+
+	if (copy_from_user(val, iter->cur, len))
+		return -EFAULT;
+
+	iter->cur += len;
+	return 0;
+}
+
+static const void __user *uverbs_request_next_ptr(struct uverbs_req_iter *iter,
+						  size_t len)
+{
+	const void __user *res = iter->cur;
+
+	if (iter->cur + len > iter->end)
+		return (void __force __user *)ERR_PTR(-ENOSPC);
+	iter->cur += len;
+	return res;
+}
+
+static int uverbs_request_finish(struct uverbs_req_iter *iter)
+{
+	if (!ib_is_buffer_cleared(iter->cur, iter->end - iter->cur))
+		return -EOPNOTSUPP;
+	return 0;
+}
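+/*
+ * Usage sketch (hypothetical command carrying 'count' fixed-size elements
+ * after its header): the iterator consumes the header, then each element,
+ * and uverbs_request_finish() enforces that any unclaimed tail is zero:
+ *
+ *	struct uverbs_req_iter iter;
+ *	struct ib_uverbs_foo cmd;
+ *	struct ib_uverbs_foo_elem elem;
+ *	int i, ret;
+ *
+ *	ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
+ *	if (ret)
+ *		return ret;
+ *	for (i = 0; i < cmd.count; i++) {
+ *		ret = uverbs_request_next(&iter, &elem, sizeof(elem));
+ *		if (ret)
+ *			return ret;
+ *	}
+ *	return uverbs_request_finish(&iter);
+ */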
+
+/*
+ * When calling a destroy function during an error unwind we need to pass in
+ * the udata that is sanitized of all user arguments, i.e. from the
+ * driver's perspective it looks like no udata was passed.
+ */
+struct ib_udata *uverbs_get_cleared_udata(struct uverbs_attr_bundle *attrs)
+{
+	attrs->driver_udata = (struct ib_udata){};
+	return &attrs->driver_udata;
+}
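+/*
+ * Error-unwind sketch, mirroring how the handlers below use it: destroy
+ * the half-created object as if the original command had carried no
+ * driver-private input.
+ *
+ *	err_copy:
+ *		ib_dereg_mr_user(mr, uverbs_get_cleared_udata(attrs));
+ */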
+
 static struct ib_uverbs_completion_event_file *
-_ib_uverbs_lookup_comp_file(s32 fd, struct ib_uverbs_file *ufile)
+_ib_uverbs_lookup_comp_file(s32 fd, struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uobject *uobj = ufd_get_read(UVERBS_OBJECT_COMP_CHANNEL,
-					       fd, ufile);
+					       fd, attrs);
 
 	if (IS_ERR(uobj))
 		return (void *)uobj;
@@ -65,24 +203,20 @@
 #define ib_uverbs_lookup_comp_file(_fd, _ufile)                                \
 	_ib_uverbs_lookup_comp_file((_fd)*typecheck(s32, _fd), _ufile)
 
-ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
-			      const char __user *buf,
-			      int in_len, int out_len)
+static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
 {
+	struct ib_uverbs_file *file = attrs->ufile;
 	struct ib_uverbs_get_context      cmd;
 	struct ib_uverbs_get_context_resp resp;
-	struct ib_udata                   udata;
 	struct ib_ucontext		 *ucontext;
 	struct file			 *filp;
 	struct ib_rdmacg_object		 cg_obj;
 	struct ib_device *ib_dev;
 	int ret;
 
-	if (out_len < sizeof resp)
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
 	mutex_lock(&file->ucontext_lock);
 	ib_dev = srcu_dereference(file->device->ib_dev,
@@ -97,45 +231,27 @@
 		goto err;
 	}
 
-	ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
-		   u64_to_user_ptr(cmd.response) + sizeof(resp),
-		   in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
-		   out_len - sizeof(resp));
-
 	ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
 	if (ret)
 		goto err;
 
-	ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
-	if (IS_ERR(ucontext)) {
-		ret = PTR_ERR(ucontext);
+	ucontext = rdma_zalloc_drv_obj(ib_dev, ib_ucontext);
+	if (!ucontext) {
+		ret = -ENOMEM;
 		goto err_alloc;
 	}
 
+	attrs->context = ucontext;
+
+	ucontext->res.type = RDMA_RESTRACK_CTX;
 	ucontext->device = ib_dev;
 	ucontext->cg_obj = cg_obj;
 	/* ufile is required when some objects are released */
 	ucontext->ufile = file;
 
-	rcu_read_lock();
-	ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
-	rcu_read_unlock();
-	ucontext->closing = 0;
+	ucontext->closing = false;
 	ucontext->cleanup_retryable = false;
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	ucontext->umem_tree = RB_ROOT_CACHED;
-	init_rwsem(&ucontext->umem_rwsem);
-	ucontext->odp_mrs_count = 0;
-	INIT_LIST_HEAD(&ucontext->no_private_counters);
-
-	if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
-		ucontext->invalidate_range = NULL;
-
-#endif
-
-	resp.num_comp_vectors = file->device->num_comp_vectors;
-
 	ret = get_unused_fd_flags(O_CLOEXEC);
 	if (ret < 0)
 		goto err_free;
@@ -147,10 +263,17 @@
 		goto err_fd;
 	}
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
-		ret = -EFAULT;
+	resp.num_comp_vectors = file->device->num_comp_vectors;
+
+	ret = uverbs_response(attrs, &resp, sizeof(resp));
+	if (ret)
 		goto err_file;
-	}
+
+	ret = ib_dev->ops.alloc_ucontext(ucontext, &attrs->driver_udata);
+	if (ret)
+		goto err_file;
+
+	rdma_restrack_uadd(&ucontext->res);
 
 	fd_install(resp.async_fd, filp);
 
@@ -162,7 +285,7 @@
 
 	mutex_unlock(&file->ucontext_lock);
 
-	return in_len;
+	return 0;
 
 err_file:
 	ib_uverbs_free_async_event_file(file);
@@ -172,8 +295,7 @@
 	put_unused_fd(resp.async_fd);
 
 err_free:
-	put_pid(ucontext->tgid);
-	ib_dev->dealloc_ucontext(ucontext);
+	kfree(ucontext);
 
 err_alloc:
 	ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
@@ -231,57 +353,28 @@
 	resp->phys_port_cnt		= ib_dev->phys_port_cnt;
 }
 
-ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
-			       const char __user *buf,
-			       int in_len, int out_len)
+static int ib_uverbs_query_device(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_query_device      cmd;
 	struct ib_uverbs_query_device_resp resp;
 	struct ib_ucontext *ucontext;
+	int ret;
 
-	ucontext = ib_uverbs_get_ucontext(file);
+	ucontext = ib_uverbs_get_ucontext(attrs);
 	if (IS_ERR(ucontext))
 		return PTR_ERR(ucontext);
 
-	if (out_len < sizeof resp)
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
 	memset(&resp, 0, sizeof resp);
 	copy_query_dev_fields(ucontext, &resp, &ucontext->device->attrs);
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
-		return -EFAULT;
-
-	return in_len;
+	return uverbs_response(attrs, &resp, sizeof(resp));
 }
 
-/*
- * ib_uverbs_query_port_resp.port_cap_flags started out as just a copy of the
- * PortInfo CapabilityMask, but was extended with unique bits.
- */
-static u32 make_port_cap_flags(const struct ib_port_attr *attr)
-{
-	u32 res;
-
-	/* All IBA CapabilityMask bits are passed through here, except bit 26,
-	 * which is overridden with IP_BASED_GIDS. This is due to a historical
-	 * mistake in the implementation of IP_BASED_GIDS. Otherwise all other
-	 * bits match the IBA definition across all kernel versions.
-	 */
-	res = attr->port_cap_flags & ~(u32)IB_UVERBS_PCF_IP_BASED_GIDS;
-
-	if (attr->ip_gids)
-		res |= IB_UVERBS_PCF_IP_BASED_GIDS;
-
-	return res;
-}
-
-ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
-			     const char __user *buf,
-			     int in_len, int out_len)
+static int ib_uverbs_query_port(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_query_port      cmd;
 	struct ib_uverbs_query_port_resp resp;
@@ -290,90 +383,45 @@
 	struct ib_ucontext *ucontext;
 	struct ib_device *ib_dev;
 
-	ucontext = ib_uverbs_get_ucontext(file);
+	ucontext = ib_uverbs_get_ucontext(attrs);
 	if (IS_ERR(ucontext))
 		return PTR_ERR(ucontext);
 	ib_dev = ucontext->device;
 
-	if (out_len < sizeof resp)
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
 	ret = ib_query_port(ib_dev, cmd.port_num, &attr);
 	if (ret)
 		return ret;
 
 	memset(&resp, 0, sizeof resp);
+	copy_port_attr_to_resp(&attr, &resp, ib_dev, cmd.port_num);
 
-	resp.state 	     = attr.state;
-	resp.max_mtu 	     = attr.max_mtu;
-	resp.active_mtu      = attr.active_mtu;
-	resp.gid_tbl_len     = attr.gid_tbl_len;
-	resp.port_cap_flags  = make_port_cap_flags(&attr);
-	resp.max_msg_sz      = attr.max_msg_sz;
-	resp.bad_pkey_cntr   = attr.bad_pkey_cntr;
-	resp.qkey_viol_cntr  = attr.qkey_viol_cntr;
-	resp.pkey_tbl_len    = attr.pkey_tbl_len;
-
-	if (rdma_is_grh_required(ib_dev, cmd.port_num))
-		resp.flags |= IB_UVERBS_QPF_GRH_REQUIRED;
-
-	if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) {
-		resp.lid     = OPA_TO_IB_UCAST_LID(attr.lid);
-		resp.sm_lid  = OPA_TO_IB_UCAST_LID(attr.sm_lid);
-	} else {
-		resp.lid     = ib_lid_cpu16(attr.lid);
-		resp.sm_lid  = ib_lid_cpu16(attr.sm_lid);
-	}
-	resp.lmc 	     = attr.lmc;
-	resp.max_vl_num      = attr.max_vl_num;
-	resp.sm_sl 	     = attr.sm_sl;
-	resp.subnet_timeout  = attr.subnet_timeout;
-	resp.init_type_reply = attr.init_type_reply;
-	resp.active_width    = attr.active_width;
-	resp.active_speed    = attr.active_speed;
-	resp.phys_state      = attr.phys_state;
-	resp.link_layer      = rdma_port_get_link_layer(ib_dev,
-							cmd.port_num);
-
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
-		return -EFAULT;
-
-	return in_len;
+	return uverbs_response(attrs, &resp, sizeof(resp));
 }
 
-ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
-			   const char __user *buf,
-			   int in_len, int out_len)
+static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_alloc_pd      cmd;
 	struct ib_uverbs_alloc_pd_resp resp;
-	struct ib_udata                udata;
 	struct ib_uobject             *uobj;
 	struct ib_pd                  *pd;
 	int                            ret;
 	struct ib_device *ib_dev;
 
-	if (out_len < sizeof resp)
-		return -ENOSPC;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
-
-	ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
-		   u64_to_user_ptr(cmd.response) + sizeof(resp),
-                   in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
-                   out_len - sizeof(resp));
-
-	uobj = uobj_alloc(UVERBS_OBJECT_PD, file, &ib_dev);
+	uobj = uobj_alloc(UVERBS_OBJECT_PD, attrs, &ib_dev);
 	if (IS_ERR(uobj))
 		return PTR_ERR(uobj);
 
-	pd = ib_dev->alloc_pd(ib_dev, uobj->context, &udata);
-	if (IS_ERR(pd)) {
-		ret = PTR_ERR(pd);
+	pd = rdma_zalloc_drv_obj(ib_dev, ib_pd);
+	if (!pd) {
+		ret = -ENOMEM;
 		goto err;
 	}
 
@@ -381,39 +429,43 @@
 	pd->uobject = uobj;
 	pd->__internal_mr = NULL;
 	atomic_set(&pd->usecnt, 0);
+	pd->res.type = RDMA_RESTRACK_PD;
+
+	ret = ib_dev->ops.alloc_pd(pd, &attrs->driver_udata);
+	if (ret)
+		goto err_alloc;
 
 	uobj->object = pd;
 	memset(&resp, 0, sizeof resp);
 	resp.pd_handle = uobj->id;
-	pd->res.type = RDMA_RESTRACK_PD;
-	rdma_restrack_add(&pd->res);
+	rdma_restrack_uadd(&pd->res);
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
-		ret = -EFAULT;
+	ret = uverbs_response(attrs, &resp, sizeof(resp));
+	if (ret)
 		goto err_copy;
-	}
 
-	return uobj_alloc_commit(uobj, in_len);
+	return uobj_alloc_commit(uobj, attrs);
 
 err_copy:
-	ib_dealloc_pd(pd);
-
+	ib_dealloc_pd_user(pd, uverbs_get_cleared_udata(attrs));
+	pd = NULL;
+err_alloc:
+	kfree(pd);
 err:
-	uobj_alloc_abort(uobj);
+	uobj_alloc_abort(uobj, attrs);
 	return ret;
 }
 
-ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
-			     const char __user *buf,
-			     int in_len, int out_len)
+static int ib_uverbs_dealloc_pd(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_dealloc_pd cmd;
+	int ret;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	return uobj_perform_destroy(UVERBS_OBJECT_PD, cmd.pd_handle, file,
-				    in_len);
+	return uobj_perform_destroy(UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
 }
 
 struct xrcd_table_entry {
@@ -501,13 +553,11 @@
 	}
 }
 
-ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
-			    const char __user *buf, int in_len,
-			    int out_len)
+static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
 {
+	struct ib_uverbs_device *ibudev = attrs->ufile->device;
 	struct ib_uverbs_open_xrcd	cmd;
 	struct ib_uverbs_open_xrcd_resp	resp;
-	struct ib_udata			udata;
 	struct ib_uxrcd_object         *obj;
 	struct ib_xrcd                 *xrcd = NULL;
 	struct fd			f = {NULL, 0};
@@ -516,18 +566,11 @@
 	int				new_xrcd = 0;
 	struct ib_device *ib_dev;
 
-	if (out_len < sizeof resp)
-		return -ENOSPC;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
-
-	ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
-		   u64_to_user_ptr(cmd.response) + sizeof(resp),
-                   in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
-                   out_len - sizeof(resp));
-
-	mutex_lock(&file->device->xrcd_tree_mutex);
+	mutex_lock(&ibudev->xrcd_tree_mutex);
 
 	if (cmd.fd != -1) {
 		/* search for file descriptor */
@@ -538,7 +581,7 @@
 		}
 
 		inode = file_inode(f.file);
-		xrcd = find_xrcd(file->device, inode);
+		xrcd = find_xrcd(ibudev, inode);
 		if (!xrcd && !(cmd.oflags & O_CREAT)) {
 			/* no file descriptor. Need CREATE flag */
 			ret = -EAGAIN;
@@ -551,7 +594,7 @@
 		}
 	}
 
-	obj = (struct ib_uxrcd_object *)uobj_alloc(UVERBS_OBJECT_XRCD, file,
+	obj = (struct ib_uxrcd_object *)uobj_alloc(UVERBS_OBJECT_XRCD, attrs,
 						   &ib_dev);
 	if (IS_ERR(obj)) {
 		ret = PTR_ERR(obj);
@@ -559,7 +602,7 @@
 	}
 
 	if (!xrcd) {
-		xrcd = ib_dev->alloc_xrcd(ib_dev, obj->uobject.context, &udata);
+		xrcd = ib_dev->ops.alloc_xrcd(ib_dev, &attrs->driver_udata);
 		if (IS_ERR(xrcd)) {
 			ret = PTR_ERR(xrcd);
 			goto err;
@@ -581,73 +624,71 @@
 	if (inode) {
 		if (new_xrcd) {
 			/* create new inode/xrcd table entry */
-			ret = xrcd_table_insert(file->device, inode, xrcd);
+			ret = xrcd_table_insert(ibudev, inode, xrcd);
 			if (ret)
 				goto err_dealloc_xrcd;
 		}
 		atomic_inc(&xrcd->usecnt);
 	}
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
-		ret = -EFAULT;
+	ret = uverbs_response(attrs, &resp, sizeof(resp));
+	if (ret)
 		goto err_copy;
-	}
 
 	if (f.file)
 		fdput(f);
 
-	mutex_unlock(&file->device->xrcd_tree_mutex);
+	mutex_unlock(&ibudev->xrcd_tree_mutex);
 
-	return uobj_alloc_commit(&obj->uobject, in_len);
+	return uobj_alloc_commit(&obj->uobject, attrs);
 
 err_copy:
 	if (inode) {
 		if (new_xrcd)
-			xrcd_table_delete(file->device, inode);
+			xrcd_table_delete(ibudev, inode);
 		atomic_dec(&xrcd->usecnt);
 	}
 
 err_dealloc_xrcd:
-	ib_dealloc_xrcd(xrcd);
+	ib_dealloc_xrcd(xrcd, uverbs_get_cleared_udata(attrs));
 
 err:
-	uobj_alloc_abort(&obj->uobject);
+	uobj_alloc_abort(&obj->uobject, attrs);
 
 err_tree_mutex_unlock:
 	if (f.file)
 		fdput(f);
 
-	mutex_unlock(&file->device->xrcd_tree_mutex);
+	mutex_unlock(&ibudev->xrcd_tree_mutex);
 
 	return ret;
 }
 
-ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
-			     const char __user *buf, int in_len,
-			     int out_len)
+static int ib_uverbs_close_xrcd(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_close_xrcd cmd;
+	int ret;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	return uobj_perform_destroy(UVERBS_OBJECT_XRCD, cmd.xrcd_handle, file,
-				    in_len);
+	return uobj_perform_destroy(UVERBS_OBJECT_XRCD, cmd.xrcd_handle, attrs);
 }
 
-int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject,
-			   struct ib_xrcd *xrcd,
-			   enum rdma_remove_reason why)
+int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd,
+			   enum rdma_remove_reason why,
+			   struct uverbs_attr_bundle *attrs)
 {
 	struct inode *inode;
 	int ret;
-	struct ib_uverbs_device *dev = uobject->context->ufile->device;
+	struct ib_uverbs_device *dev = attrs->ufile->device;
 
 	inode = xrcd->inode;
 	if (inode && !atomic_dec_and_test(&xrcd->usecnt))
 		return 0;
 
-	ret = ib_dealloc_xrcd(xrcd);
+	ret = ib_dealloc_xrcd(xrcd, &attrs->driver_udata);
 
 	if (ib_is_destroy_retryable(ret, why, uobject)) {
 		atomic_inc(&xrcd->usecnt);
@@ -660,29 +701,19 @@
 	return ret;
 }
 
-ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
-			 const char __user *buf, int in_len,
-			 int out_len)
+static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_reg_mr      cmd;
 	struct ib_uverbs_reg_mr_resp resp;
-	struct ib_udata              udata;
 	struct ib_uobject           *uobj;
 	struct ib_pd                *pd;
 	struct ib_mr                *mr;
 	int                          ret;
 	struct ib_device *ib_dev;
 
-	if (out_len < sizeof resp)
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
-
-	ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
-		   u64_to_user_ptr(cmd.response) + sizeof(resp),
-                   in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
-                   out_len - sizeof(resp));
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
 	if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
 		return -EINVAL;
@@ -691,11 +722,11 @@
 	if (ret)
 		return ret;
 
-	uobj = uobj_alloc(UVERBS_OBJECT_MR, file, &ib_dev);
+	uobj = uobj_alloc(UVERBS_OBJECT_MR, attrs, &ib_dev);
 	if (IS_ERR(uobj))
 		return PTR_ERR(uobj);
 
-	pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file);
+	pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
 	if (!pd) {
 		ret = -EINVAL;
 		goto err_free;
@@ -710,8 +741,9 @@
 		}
 	}
 
-	mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
-				     cmd.access_flags, &udata);
+	mr = pd->device->ops.reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
+					 cmd.access_flags,
+					 &attrs->driver_udata);
 	if (IS_ERR(mr)) {
 		ret = PTR_ERR(mr);
 		goto err_put;
@@ -719,11 +751,13 @@
 
 	mr->device  = pd->device;
 	mr->pd      = pd;
+	mr->type    = IB_MR_TYPE_USER;
 	mr->dm	    = NULL;
+	mr->sig_attrs = NULL;
 	mr->uobject = uobj;
 	atomic_inc(&pd->usecnt);
 	mr->res.type = RDMA_RESTRACK_MR;
-	rdma_restrack_add(&mr->res);
+	rdma_restrack_uadd(&mr->res);
 
 	uobj->object = mr;
 
@@ -732,49 +766,38 @@
 	resp.rkey      = mr->rkey;
 	resp.mr_handle = uobj->id;
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
-		ret = -EFAULT;
+	ret = uverbs_response(attrs, &resp, sizeof(resp));
+	if (ret)
 		goto err_copy;
-	}
 
 	uobj_put_obj_read(pd);
 
-	return uobj_alloc_commit(uobj, in_len);
+	return uobj_alloc_commit(uobj, attrs);
 
 err_copy:
-	ib_dereg_mr(mr);
+	ib_dereg_mr_user(mr, uverbs_get_cleared_udata(attrs));
 
 err_put:
 	uobj_put_obj_read(pd);
 
 err_free:
-	uobj_alloc_abort(uobj);
+	uobj_alloc_abort(uobj, attrs);
 	return ret;
 }
 
-ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
-			   const char __user *buf, int in_len,
-			   int out_len)
+static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_rereg_mr      cmd;
 	struct ib_uverbs_rereg_mr_resp resp;
-	struct ib_udata              udata;
 	struct ib_pd                *pd = NULL;
 	struct ib_mr                *mr;
 	struct ib_pd		    *old_pd;
 	int                          ret;
 	struct ib_uobject	    *uobj;
 
-	if (out_len < sizeof(resp))
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, buf, sizeof(cmd)))
-		return -EFAULT;
-
-	ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
-		   u64_to_user_ptr(cmd.response) + sizeof(resp),
-                   in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
-                   out_len - sizeof(resp));
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
 	if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags)
 		return -EINVAL;
@@ -784,7 +807,7 @@
 	     (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)))
 			return -EINVAL;
 
-	uobj = uobj_get_write(UVERBS_OBJECT_MR, cmd.mr_handle, file);
+	uobj = uobj_get_write(UVERBS_OBJECT_MR, cmd.mr_handle, attrs);
 	if (IS_ERR(uobj))
 		return PTR_ERR(uobj);
 
@@ -803,7 +826,7 @@
 
 	if (cmd.flags & IB_MR_REREG_PD) {
 		pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle,
-				       file);
+				       attrs);
 		if (!pd) {
 			ret = -EINVAL;
 			goto put_uobjs;
@@ -811,27 +834,24 @@
 	}
 
 	old_pd = mr->pd;
-	ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start,
-					cmd.length, cmd.hca_va,
-					cmd.access_flags, pd, &udata);
-	if (!ret) {
-		if (cmd.flags & IB_MR_REREG_PD) {
-			atomic_inc(&pd->usecnt);
-			mr->pd = pd;
-			atomic_dec(&old_pd->usecnt);
-		}
-	} else {
+	ret = mr->device->ops.rereg_user_mr(mr, cmd.flags, cmd.start,
+					    cmd.length, cmd.hca_va,
+					    cmd.access_flags, pd,
+					    &attrs->driver_udata);
+	if (ret)
 		goto put_uobj_pd;
+
+	if (cmd.flags & IB_MR_REREG_PD) {
+		atomic_inc(&pd->usecnt);
+		mr->pd = pd;
+		atomic_dec(&old_pd->usecnt);
 	}
 
 	memset(&resp, 0, sizeof(resp));
 	resp.lkey      = mr->lkey;
 	resp.rkey      = mr->rkey;
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp)))
-		ret = -EFAULT;
-	else
-		ret = in_len;
+	ret = uverbs_response(attrs, &resp, sizeof(resp));
 
 put_uobj_pd:
 	if (cmd.flags & IB_MR_REREG_PD)
@@ -843,54 +863,48 @@
 	return ret;
 }
 
-ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
-			   const char __user *buf, int in_len,
-			   int out_len)
+static int ib_uverbs_dereg_mr(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_dereg_mr cmd;
+	int ret;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	return uobj_perform_destroy(UVERBS_OBJECT_MR, cmd.mr_handle, file,
-				    in_len);
+	return uobj_perform_destroy(UVERBS_OBJECT_MR, cmd.mr_handle, attrs);
 }
 
-ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
-			   const char __user *buf, int in_len,
-			   int out_len)
+static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_alloc_mw      cmd;
 	struct ib_uverbs_alloc_mw_resp resp;
 	struct ib_uobject             *uobj;
 	struct ib_pd                  *pd;
 	struct ib_mw                  *mw;
-	struct ib_udata		       udata;
 	int                            ret;
 	struct ib_device *ib_dev;
 
-	if (out_len < sizeof(resp))
-		return -ENOSPC;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	if (copy_from_user(&cmd, buf, sizeof(cmd)))
-		return -EFAULT;
-
-	uobj = uobj_alloc(UVERBS_OBJECT_MW, file, &ib_dev);
+	uobj = uobj_alloc(UVERBS_OBJECT_MW, attrs, &ib_dev);
 	if (IS_ERR(uobj))
 		return PTR_ERR(uobj);
 
-	pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file);
+	pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
 	if (!pd) {
 		ret = -EINVAL;
 		goto err_free;
 	}
 
-	ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
-		   u64_to_user_ptr(cmd.response) + sizeof(resp),
-		   in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
-		   out_len - sizeof(resp));
+	if (cmd.mw_type != IB_MW_TYPE_1 && cmd.mw_type != IB_MW_TYPE_2) {
+		ret = -EINVAL;
+		goto err_put;
+	}
 
-	mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata);
+	mw = pd->device->ops.alloc_mw(pd, cmd.mw_type, &attrs->driver_udata);
 	if (IS_ERR(mw)) {
 		ret = PTR_ERR(mw);
 		goto err_put;
@@ -907,53 +921,48 @@
 	resp.rkey      = mw->rkey;
 	resp.mw_handle = uobj->id;
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) {
-		ret = -EFAULT;
+	ret = uverbs_response(attrs, &resp, sizeof(resp));
+	if (ret)
 		goto err_copy;
-	}
 
 	uobj_put_obj_read(pd);
-	return uobj_alloc_commit(uobj, in_len);
+	return uobj_alloc_commit(uobj, attrs);
 
 err_copy:
 	uverbs_dealloc_mw(mw);
 err_put:
 	uobj_put_obj_read(pd);
 err_free:
-	uobj_alloc_abort(uobj);
+	uobj_alloc_abort(uobj, attrs);
 	return ret;
 }
 
-ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
-			     const char __user *buf, int in_len,
-			     int out_len)
+static int ib_uverbs_dealloc_mw(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_dealloc_mw cmd;
+	int ret;
 
-	if (copy_from_user(&cmd, buf, sizeof(cmd)))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	return uobj_perform_destroy(UVERBS_OBJECT_MW, cmd.mw_handle, file,
-				    in_len);
+	return uobj_perform_destroy(UVERBS_OBJECT_MW, cmd.mw_handle, attrs);
 }
 
-ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
-				      const char __user *buf, int in_len,
-				      int out_len)
+static int ib_uverbs_create_comp_channel(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_create_comp_channel	   cmd;
 	struct ib_uverbs_create_comp_channel_resp  resp;
 	struct ib_uobject			  *uobj;
 	struct ib_uverbs_completion_event_file	  *ev_file;
 	struct ib_device *ib_dev;
+	int ret;
 
-	if (out_len < sizeof resp)
-		return -ENOSPC;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
-
-	uobj = uobj_alloc(UVERBS_OBJECT_COMP_CHANNEL, file, &ib_dev);
+	uobj = uobj_alloc(UVERBS_OBJECT_COMP_CHANNEL, attrs, &ib_dev);
 	if (IS_ERR(uobj))
 		return PTR_ERR(uobj);
 
@@ -963,25 +972,17 @@
 			       uobj);
 	ib_uverbs_init_event_queue(&ev_file->ev_queue);
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
-		uobj_alloc_abort(uobj);
-		return -EFAULT;
+	ret = uverbs_response(attrs, &resp, sizeof(resp));
+	if (ret) {
+		uobj_alloc_abort(uobj, attrs);
+		return ret;
 	}
 
-	return uobj_alloc_commit(uobj, in_len);
+	return uobj_alloc_commit(uobj, attrs);
 }
 
-static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
-				       struct ib_udata *ucore,
-				       struct ib_udata *uhw,
-				       struct ib_uverbs_ex_create_cq *cmd,
-				       size_t cmd_sz,
-				       int (*cb)(struct ib_uverbs_file *file,
-						 struct ib_ucq_object *obj,
-						 struct ib_uverbs_ex_create_cq_resp *resp,
-						 struct ib_udata *udata,
-						 void *context),
-				       void *context)
+static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,
+				       struct ib_uverbs_ex_create_cq *cmd)
 {
 	struct ib_ucq_object           *obj;
 	struct ib_uverbs_completion_event_file    *ev_file = NULL;
@@ -991,21 +992,16 @@
 	struct ib_cq_init_attr attr = {};
 	struct ib_device *ib_dev;
 
-	if (cmd->comp_vector >= file->device->num_comp_vectors)
+	if (cmd->comp_vector >= attrs->ufile->device->num_comp_vectors)
 		return ERR_PTR(-EINVAL);
 
-	obj = (struct ib_ucq_object *)uobj_alloc(UVERBS_OBJECT_CQ, file,
+	obj = (struct ib_ucq_object *)uobj_alloc(UVERBS_OBJECT_CQ, attrs,
 						 &ib_dev);
 	if (IS_ERR(obj))
 		return obj;
 
-	if (!ib_dev->create_cq) {
-		ret = -EOPNOTSUPP;
-		goto err;
-	}
-
 	if (cmd->comp_channel >= 0) {
-		ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel, file);
+		ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel, attrs);
 		if (IS_ERR(ev_file)) {
 			ret = PTR_ERR(ev_file);
 			goto err;
@@ -1020,16 +1016,13 @@
 
 	attr.cqe = cmd->cqe;
 	attr.comp_vector = cmd->comp_vector;
+	attr.flags = cmd->flags;
 
-	if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
-		attr.flags = cmd->flags;
-
-	cq = ib_dev->create_cq(ib_dev, &attr, obj->uobject.context, uhw);
-	if (IS_ERR(cq)) {
-		ret = PTR_ERR(cq);
+	cq = rdma_zalloc_drv_obj(ib_dev, ib_cq);
+	if (!cq) {
+		ret = -ENOMEM;
 		goto err_file;
 	}
-
 	cq->device        = ib_dev;
 	cq->uobject       = &obj->uobject;
 	cq->comp_handler  = ib_uverbs_comp_handler;
@@ -1037,74 +1030,53 @@
 	cq->cq_context    = ev_file ? &ev_file->ev_queue : NULL;
 	atomic_set(&cq->usecnt, 0);
 
+	ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata);
+	if (ret)
+		goto err_free;
+
 	obj->uobject.object = cq;
 	memset(&resp, 0, sizeof resp);
 	resp.base.cq_handle = obj->uobject.id;
 	resp.base.cqe       = cq->cqe;
-
-	resp.response_length = offsetof(typeof(resp), response_length) +
-		sizeof(resp.response_length);
+	resp.response_length = uverbs_response_length(attrs, sizeof(resp));
 
 	cq->res.type = RDMA_RESTRACK_CQ;
-	rdma_restrack_add(&cq->res);
+	rdma_restrack_uadd(&cq->res);
 
-	ret = cb(file, obj, &resp, ucore, context);
+	ret = uverbs_response(attrs, &resp, sizeof(resp));
 	if (ret)
 		goto err_cb;
 
-	ret = uobj_alloc_commit(&obj->uobject, 0);
+	ret = uobj_alloc_commit(&obj->uobject, attrs);
 	if (ret)
 		return ERR_PTR(ret);
 	return obj;
 
 err_cb:
-	ib_destroy_cq(cq);
-
+	ib_destroy_cq_user(cq, uverbs_get_cleared_udata(attrs));
+	cq = NULL;
+err_free:
+	kfree(cq);
 err_file:
 	if (ev_file)
-		ib_uverbs_release_ucq(file, ev_file, obj);
+		ib_uverbs_release_ucq(attrs->ufile, ev_file, obj);
 
 err:
-	uobj_alloc_abort(&obj->uobject);
+	uobj_alloc_abort(&obj->uobject, attrs);
 
 	return ERR_PTR(ret);
 }
 
-static int ib_uverbs_create_cq_cb(struct ib_uverbs_file *file,
-				  struct ib_ucq_object *obj,
-				  struct ib_uverbs_ex_create_cq_resp *resp,
-				  struct ib_udata *ucore, void *context)
-{
-	if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base)))
-		return -EFAULT;
-
-	return 0;
-}
-
-ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
-			    const char __user *buf, int in_len,
-			    int out_len)
+static int ib_uverbs_create_cq(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_create_cq      cmd;
 	struct ib_uverbs_ex_create_cq	cmd_ex;
-	struct ib_uverbs_create_cq_resp resp;
-	struct ib_udata                 ucore;
-	struct ib_udata                 uhw;
 	struct ib_ucq_object           *obj;
+	int ret;
 
-	if (out_len < sizeof(resp))
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, buf, sizeof(cmd)))
-		return -EFAULT;
-
-	ib_uverbs_init_udata(&ucore, buf, u64_to_user_ptr(cmd.response),
-			     sizeof(cmd), sizeof(resp));
-
-	ib_uverbs_init_udata(&uhw, buf + sizeof(cmd),
-		   u64_to_user_ptr(cmd.response) + sizeof(resp),
-		   in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
-		   out_len - sizeof(resp));
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
 	memset(&cmd_ex, 0, sizeof(cmd_ex));
 	cmd_ex.user_handle = cmd.user_handle;
@@ -1112,43 +1084,19 @@
 	cmd_ex.comp_vector = cmd.comp_vector;
 	cmd_ex.comp_channel = cmd.comp_channel;
 
-	obj = create_cq(file, &ucore, &uhw, &cmd_ex,
-			offsetof(typeof(cmd_ex), comp_channel) +
-			sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb,
-			NULL);
-
-	if (IS_ERR(obj))
-		return PTR_ERR(obj);
-
-	return in_len;
+	obj = create_cq(attrs, &cmd_ex);
+	return PTR_ERR_OR_ZERO(obj);
 }
 
-static int ib_uverbs_ex_create_cq_cb(struct ib_uverbs_file *file,
-				     struct ib_ucq_object *obj,
-				     struct ib_uverbs_ex_create_cq_resp *resp,
-				     struct ib_udata *ucore, void *context)
+static int ib_uverbs_ex_create_cq(struct uverbs_attr_bundle *attrs)
 {
-	if (ib_copy_to_udata(ucore, resp, resp->response_length))
-		return -EFAULT;
-
-	return 0;
-}
-
-int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
-			   struct ib_udata *ucore,
-			   struct ib_udata *uhw)
-{
-	struct ib_uverbs_ex_create_cq_resp resp;
 	struct ib_uverbs_ex_create_cq  cmd;
 	struct ib_ucq_object           *obj;
-	int err;
+	int ret;
 
-	if (ucore->inlen < sizeof(cmd))
-		return -EINVAL;
-
-	err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
-	if (err)
-		return err;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
 	if (cmd.comp_mask)
 		return -EINVAL;
@@ -1156,52 +1104,36 @@
 	if (cmd.reserved)
 		return -EINVAL;
 
-	if (ucore->outlen < (offsetof(typeof(resp), response_length) +
-			     sizeof(resp.response_length)))
-		return -ENOSPC;
-
-	obj = create_cq(file, ucore, uhw, &cmd,
-			min(ucore->inlen, sizeof(cmd)),
-			ib_uverbs_ex_create_cq_cb, NULL);
-
+	obj = create_cq(attrs, &cmd);
 	return PTR_ERR_OR_ZERO(obj);
 }
 
-ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
-			    const char __user *buf, int in_len,
-			    int out_len)
+static int ib_uverbs_resize_cq(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_resize_cq	cmd;
 	struct ib_uverbs_resize_cq_resp	resp = {};
-	struct ib_udata                 udata;
 	struct ib_cq			*cq;
 	int				ret = -EINVAL;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
-		   u64_to_user_ptr(cmd.response) + sizeof(resp),
-		   in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
-		   out_len - sizeof(resp));
-
-	cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file);
+	cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
 	if (!cq)
 		return -EINVAL;
 
-	ret = cq->device->resize_cq(cq, cmd.cqe, &udata);
+	ret = cq->device->ops.resize_cq(cq, cmd.cqe, &attrs->driver_udata);
 	if (ret)
 		goto out;
 
 	resp.cqe = cq->cqe;
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp.cqe))
-		ret = -EFAULT;
-
+	ret = uverbs_response(attrs, &resp, sizeof(resp));
 out:
 	uobj_put_obj_read(cq);
 
-	return ret ? ret : in_len;
+	return ret;
 }
 
 static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest,
@@ -1234,9 +1166,7 @@
 	return 0;
 }
 
-ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
-			  const char __user *buf, int in_len,
-			  int out_len)
+static int ib_uverbs_poll_cq(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_poll_cq       cmd;
 	struct ib_uverbs_poll_cq_resp  resp;
@@ -1246,15 +1176,16 @@
 	struct ib_wc                   wc;
 	int                            ret;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file);
+	cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
 	if (!cq)
 		return -EINVAL;
 
 	/* we copy a struct ib_uverbs_poll_cq_resp to user space */
-	header_ptr = u64_to_user_ptr(cmd.response);
+	header_ptr = attrs->ucore.outbuf;
 	data_ptr = header_ptr + sizeof resp;
 
 	memset(&resp, 0, sizeof resp);
@@ -1277,25 +1208,27 @@
 		ret = -EFAULT;
 		goto out_put;
 	}
+	ret = 0;
 
-	ret = in_len;
+	if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CORE_OUT))
+		ret = uverbs_output_written(attrs, UVERBS_ATTR_CORE_OUT);
 
 out_put:
 	uobj_put_obj_read(cq);
 	return ret;
 }
 
-ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
-				const char __user *buf, int in_len,
-				int out_len)
+static int ib_uverbs_req_notify_cq(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_req_notify_cq cmd;
 	struct ib_cq                  *cq;
+	int ret;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file);
+	cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
 	if (!cq)
 		return -EINVAL;
 
@@ -1304,22 +1237,22 @@
 
 	uobj_put_obj_read(cq);
 
-	return in_len;
+	return 0;
 }
 
-ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
-			     const char __user *buf, int in_len,
-			     int out_len)
+static int ib_uverbs_destroy_cq(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_destroy_cq      cmd;
 	struct ib_uverbs_destroy_cq_resp resp;
 	struct ib_uobject		*uobj;
 	struct ib_ucq_object        	*obj;
+	int ret;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	uobj = uobj_get_destroy(UVERBS_OBJECT_CQ, cmd.cq_handle, file);
+	uobj = uobj_get_destroy(UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
 	if (IS_ERR(uobj))
 		return PTR_ERR(uobj);
 
@@ -1330,21 +1263,11 @@
 
 	uobj_put_destroy(uobj);
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
-		return -EFAULT;
-
-	return in_len;
+	return uverbs_response(attrs, &resp, sizeof(resp));
 }
 
-static int create_qp(struct ib_uverbs_file *file,
-		     struct ib_udata *ucore,
-		     struct ib_udata *uhw,
-		     struct ib_uverbs_ex_create_qp *cmd,
-		     size_t cmd_sz,
-		     int (*cb)(struct ib_uverbs_file *file,
-			       struct ib_uverbs_ex_create_qp_resp *resp,
-			       struct ib_udata *udata),
-		     void *context)
+static int create_qp(struct uverbs_attr_bundle *attrs,
+		     struct ib_uverbs_ex_create_qp *cmd)
 {
 	struct ib_uqp_object		*obj;
 	struct ib_device		*device;
@@ -1354,7 +1277,6 @@
 	struct ib_cq			*scq = NULL, *rcq = NULL;
 	struct ib_srq			*srq = NULL;
 	struct ib_qp			*qp;
-	char				*buf;
 	struct ib_qp_init_attr		attr = {};
 	struct ib_uverbs_ex_create_qp_resp resp;
 	int				ret;
@@ -1365,7 +1287,7 @@
 	if (cmd->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
 		return -EPERM;
 
-	obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, file,
+	obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, attrs,
 						 &ib_dev);
 	if (IS_ERR(obj))
 		return PTR_ERR(obj);
@@ -1373,12 +1295,10 @@
 	obj->uevent.uobject.user_handle = cmd->user_handle;
 	mutex_init(&obj->mcast_lock);
 
-	if (cmd_sz >= offsetof(typeof(*cmd), rwq_ind_tbl_handle) +
-		      sizeof(cmd->rwq_ind_tbl_handle) &&
-		      (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE)) {
+	if (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE) {
 		ind_tbl = uobj_get_obj_read(rwq_ind_table,
 					    UVERBS_OBJECT_RWQ_IND_TBL,
-					    cmd->rwq_ind_tbl_handle, file);
+					    cmd->rwq_ind_tbl_handle, attrs);
 		if (!ind_tbl) {
 			ret = -EINVAL;
 			goto err_put;
@@ -1387,13 +1307,6 @@
 		attr.rwq_ind_tbl = ind_tbl;
 	}
 
-	if (cmd_sz > sizeof(*cmd) &&
-	    !ib_is_udata_cleared(ucore, sizeof(*cmd),
-				 cmd_sz - sizeof(*cmd))) {
-		ret = -EOPNOTSUPP;
-		goto err_put;
-	}
-
 	if (ind_tbl && (cmd->max_recv_wr || cmd->max_recv_sge || cmd->is_srq)) {
 		ret = -EINVAL;
 		goto err_put;
@@ -1404,7 +1317,7 @@
 
 	if (cmd->qp_type == IB_QPT_XRC_TGT) {
 		xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd->pd_handle,
-					  file);
+					  attrs);
 
 		if (IS_ERR(xrcd_uobj)) {
 			ret = -EINVAL;
@@ -1424,7 +1337,7 @@
 		} else {
 			if (cmd->is_srq) {
 				srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ,
-							cmd->srq_handle, file);
+							cmd->srq_handle, attrs);
 				if (!srq || srq->srq_type == IB_SRQT_XRC) {
 					ret = -EINVAL;
 					goto err_put;
@@ -1435,7 +1348,7 @@
 				if (cmd->recv_cq_handle != cmd->send_cq_handle) {
 					rcq = uobj_get_obj_read(
 						cq, UVERBS_OBJECT_CQ,
-						cmd->recv_cq_handle, file);
+						cmd->recv_cq_handle, attrs);
 					if (!rcq) {
 						ret = -EINVAL;
 						goto err_put;
@@ -1446,11 +1359,11 @@
 
 		if (has_sq)
 			scq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ,
-						cmd->send_cq_handle, file);
+						cmd->send_cq_handle, attrs);
 		if (!ind_tbl)
 			rcq = rcq ?: scq;
 		pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle,
-				       file);
+				       attrs);
 		if (!pd || (!scq && has_sq)) {
 			ret = -EINVAL;
 			goto err_put;
@@ -1460,7 +1373,7 @@
 	}
 
 	attr.event_handler = ib_uverbs_qp_event_handler;
-	attr.qp_context    = file;
+	attr.qp_context    = attrs->ufile;
 	attr.send_cq       = scq;
 	attr.recv_cq       = rcq;
 	attr.srq           = srq;
@@ -1480,10 +1393,7 @@
 	INIT_LIST_HEAD(&obj->uevent.event_list);
 	INIT_LIST_HEAD(&obj->mcast_list);
 
-	if (cmd_sz >= offsetof(typeof(*cmd), create_flags) +
-		      sizeof(cmd->create_flags))
-		attr.create_flags = cmd->create_flags;
-
+	attr.create_flags = cmd->create_flags;
 	if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
 				IB_QP_CREATE_CROSS_CHANNEL |
 				IB_QP_CREATE_MANAGED_SEND |
@@ -1505,18 +1415,10 @@
 		attr.source_qpn = cmd->source_qpn;
 	}
 
-	buf = (void *)cmd + sizeof(*cmd);
-	if (cmd_sz > sizeof(*cmd))
-		if (!(buf[0] == 0 && !memcmp(buf, buf + 1,
-					     cmd_sz - sizeof(*cmd) - 1))) {
-			ret = -EINVAL;
-			goto err_put;
-		}
-
 	if (cmd->qp_type == IB_QPT_XRC_TGT)
 		qp = ib_create_qp(pd, &attr);
 	else
-		qp = _ib_create_qp(device, pd, &attr, uhw,
+		qp = _ib_create_qp(device, pd, &attr, &attrs->driver_udata,
 				   &obj->uevent.uobject);
 
 	if (IS_ERR(qp)) {
@@ -1529,7 +1431,6 @@
 		if (ret)
 			goto err_cb;
 
-		qp->real_qp	  = qp;
 		qp->pd		  = pd;
 		qp->send_cq	  = attr.send_cq;
 		qp->recv_cq	  = attr.recv_cq;
@@ -1564,11 +1465,9 @@
 	resp.base.max_recv_wr     = attr.cap.max_recv_wr;
 	resp.base.max_send_wr     = attr.cap.max_send_wr;
 	resp.base.max_inline_data = attr.cap.max_inline_data;
+	resp.response_length = uverbs_response_length(attrs, sizeof(resp));
 
-	resp.response_length = offsetof(typeof(resp), response_length) +
-			       sizeof(resp.response_length);
-
-	ret = cb(file, &resp, ucore);
+	ret = uverbs_response(attrs, &resp, sizeof(resp));
 	if (ret)
 		goto err_cb;
 
@@ -1590,9 +1489,9 @@
 	if (ind_tbl)
 		uobj_put_obj_read(ind_tbl);
 
-	return uobj_alloc_commit(&obj->uevent.uobject, 0);
+	return uobj_alloc_commit(&obj->uevent.uobject, attrs);
 err_cb:
-	ib_destroy_qp(qp);
+	ib_destroy_qp_user(qp, uverbs_get_cleared_udata(attrs));
 
 err_put:
 	if (!IS_ERR(xrcd_uobj))
@@ -1608,43 +1507,19 @@
 	if (ind_tbl)
 		uobj_put_obj_read(ind_tbl);
 
-	uobj_alloc_abort(&obj->uevent.uobject);
+	uobj_alloc_abort(&obj->uevent.uobject, attrs);
 	return ret;
 }
 
-static int ib_uverbs_create_qp_cb(struct ib_uverbs_file *file,
-				  struct ib_uverbs_ex_create_qp_resp *resp,
-				  struct ib_udata *ucore)
-{
-	if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base)))
-		return -EFAULT;
-
-	return 0;
-}
-
-ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
-			    const char __user *buf, int in_len,
-			    int out_len)
+static int ib_uverbs_create_qp(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_create_qp      cmd;
 	struct ib_uverbs_ex_create_qp	cmd_ex;
-	struct ib_udata			ucore;
-	struct ib_udata			uhw;
-	ssize_t resp_size = sizeof(struct ib_uverbs_create_qp_resp);
-	int				err;
+	int ret;
 
-	if (out_len < resp_size)
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, buf, sizeof(cmd)))
-		return -EFAULT;
-
-	ib_uverbs_init_udata(&ucore, buf, u64_to_user_ptr(cmd.response),
-		   sizeof(cmd), resp_size);
-	ib_uverbs_init_udata(&uhw, buf + sizeof(cmd),
-		   u64_to_user_ptr(cmd.response) + resp_size,
-		   in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
-		   out_len - resp_size);
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
 	memset(&cmd_ex, 0, sizeof(cmd_ex));
 	cmd_ex.user_handle = cmd.user_handle;
@@ -1661,42 +1536,17 @@
 	cmd_ex.qp_type = cmd.qp_type;
 	cmd_ex.is_srq = cmd.is_srq;
 
-	err = create_qp(file, &ucore, &uhw, &cmd_ex,
-			offsetof(typeof(cmd_ex), is_srq) +
-			sizeof(cmd.is_srq), ib_uverbs_create_qp_cb,
-			NULL);
-
-	if (err)
-		return err;
-
-	return in_len;
+	return create_qp(attrs, &cmd_ex);
 }
 
-static int ib_uverbs_ex_create_qp_cb(struct ib_uverbs_file *file,
-				     struct ib_uverbs_ex_create_qp_resp *resp,
-				     struct ib_udata *ucore)
+static int ib_uverbs_ex_create_qp(struct uverbs_attr_bundle *attrs)
 {
-	if (ib_copy_to_udata(ucore, resp, resp->response_length))
-		return -EFAULT;
+	struct ib_uverbs_ex_create_qp cmd;
+	int ret;
 
-	return 0;
-}
-
-int ib_uverbs_ex_create_qp(struct ib_uverbs_file *file,
-			   struct ib_udata *ucore,
-			   struct ib_udata *uhw)
-{
-	struct ib_uverbs_ex_create_qp_resp resp;
-	struct ib_uverbs_ex_create_qp cmd = {0};
-	int err;
-
-	if (ucore->inlen < (offsetof(typeof(cmd), comp_mask) +
-			    sizeof(cmd.comp_mask)))
-		return -EINVAL;
-
-	err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
-	if (err)
-		return err;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
 	if (cmd.comp_mask & ~IB_UVERBS_CREATE_QP_SUP_COMP_MASK)
 		return -EINVAL;
@@ -1704,26 +1554,13 @@
 	if (cmd.reserved)
 		return -EINVAL;
 
-	if (ucore->outlen < (offsetof(typeof(resp), response_length) +
-			     sizeof(resp.response_length)))
-		return -ENOSPC;
-
-	err = create_qp(file, ucore, uhw, &cmd,
-			min(ucore->inlen, sizeof(cmd)),
-			ib_uverbs_ex_create_qp_cb, NULL);
-
-	if (err)
-		return err;
-
-	return 0;
+	return create_qp(attrs, &cmd);
 }
 
-ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
-			  const char __user *buf, int in_len, int out_len)
+static int ib_uverbs_open_qp(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_open_qp        cmd;
 	struct ib_uverbs_create_qp_resp resp;
-	struct ib_udata                 udata;
 	struct ib_uqp_object           *obj;
 	struct ib_xrcd		       *xrcd;
 	struct ib_uobject	       *uninitialized_var(xrcd_uobj);
@@ -1732,23 +1569,16 @@
 	int ret;
 	struct ib_device *ib_dev;
 
-	if (out_len < sizeof resp)
-		return -ENOSPC;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
-
-	ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
-		   u64_to_user_ptr(cmd.response) + sizeof(resp),
-		   in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
-		   out_len - sizeof(resp));
-
-	obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, file,
+	obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, attrs,
 						 &ib_dev);
 	if (IS_ERR(obj))
 		return PTR_ERR(obj);
 
-	xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd.pd_handle, file);
+	xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd.pd_handle, attrs);
 	if (IS_ERR(xrcd_uobj)) {
 		ret = -EINVAL;
 		goto err_put;
@@ -1761,7 +1591,7 @@
 	}
 
 	attr.event_handler = ib_uverbs_qp_event_handler;
-	attr.qp_context    = file;
+	attr.qp_context    = attrs->ufile;
 	attr.qp_num        = cmd.qpn;
 	attr.qp_type       = cmd.qp_type;
 
@@ -1782,24 +1612,23 @@
 	resp.qpn       = qp->qp_num;
 	resp.qp_handle = obj->uevent.uobject.id;
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
-		ret = -EFAULT;
+	ret = uverbs_response(attrs, &resp, sizeof(resp));
+	if (ret)
 		goto err_destroy;
-	}
 
 	obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
 	atomic_inc(&obj->uxrcd->refcnt);
 	qp->uobject = &obj->uevent.uobject;
 	uobj_put_read(xrcd_uobj);
 
-	return uobj_alloc_commit(&obj->uevent.uobject, in_len);
+	return uobj_alloc_commit(&obj->uevent.uobject, attrs);
 
 err_destroy:
-	ib_destroy_qp(qp);
+	ib_destroy_qp_user(qp, uverbs_get_cleared_udata(attrs));
 err_xrcd:
 	uobj_put_read(xrcd_uobj);
 err_put:
-	uobj_alloc_abort(&obj->uevent.uobject);
+	uobj_alloc_abort(&obj->uevent.uobject, attrs);
 	return ret;
 }
 
@@ -1825,9 +1654,7 @@
 	uverb_attr->port_num          = rdma_ah_get_port_num(rdma_attr);
 }
 
-ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
-			   const char __user *buf, int in_len,
-			   int out_len)
+static int ib_uverbs_query_qp(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_query_qp      cmd;
 	struct ib_uverbs_query_qp_resp resp;
@@ -1836,8 +1663,9 @@
 	struct ib_qp_init_attr         *init_attr;
 	int                            ret;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
 	attr      = kmalloc(sizeof *attr, GFP_KERNEL);
 	init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL);
@@ -1846,7 +1674,7 @@
 		goto out;
 	}
 
-	qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file);
+	qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
 	if (!qp) {
 		ret = -EINVAL;
 		goto out;
@@ -1893,14 +1721,13 @@
 	resp.max_inline_data        = init_attr->cap.max_inline_data;
 	resp.sq_sig_all             = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR;
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
-		ret = -EFAULT;
+	ret = uverbs_response(attrs, &resp, sizeof(resp));
 
 out:
 	kfree(attr);
 	kfree(init_attr);
 
-	return ret ? ret : in_len;
+	return ret;
 }
 
 /* Remove ignored fields set in the attribute mask */
@@ -1940,8 +1767,8 @@
 	rdma_ah_set_make_grd(rdma_attr, false);
 }
 
-static int modify_qp(struct ib_uverbs_file *file,
-		     struct ib_uverbs_ex_modify_qp *cmd, struct ib_udata *udata)
+static int modify_qp(struct uverbs_attr_bundle *attrs,
+		     struct ib_uverbs_ex_modify_qp *cmd)
 {
 	struct ib_qp_attr *attr;
 	struct ib_qp *qp;
@@ -1951,7 +1778,8 @@
 	if (!attr)
 		return -ENOMEM;
 
-	qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd->base.qp_handle, file);
+	qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd->base.qp_handle,
+			       attrs);
 	if (!qp) {
 		ret = -EINVAL;
 		goto out;
@@ -2088,7 +1916,7 @@
 	ret = ib_modify_qp_with_udata(qp, attr,
 				      modify_qp_mask(qp->qp_type,
 						     cmd->base.attr_mask),
-				      udata);
+				      &attrs->driver_udata);
 
 release_qp:
 	uobj_put_obj_read(qp);
@@ -2098,80 +1926,64 @@
 	return ret;
 }
 
-ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
-			    const char __user *buf, int in_len,
-			    int out_len)
+static int ib_uverbs_modify_qp(struct uverbs_attr_bundle *attrs)
 {
-	struct ib_uverbs_ex_modify_qp cmd = {};
-	struct ib_udata udata;
+	struct ib_uverbs_ex_modify_qp cmd;
 	int ret;
 
-	if (copy_from_user(&cmd.base, buf, sizeof(cmd.base)))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd.base, sizeof(cmd.base));
+	if (ret)
+		return ret;
 
 	if (cmd.base.attr_mask &
 	    ~((IB_USER_LEGACY_LAST_QP_ATTR_MASK << 1) - 1))
 		return -EOPNOTSUPP;
 
-	ib_uverbs_init_udata(&udata, buf + sizeof(cmd.base), NULL,
-		   in_len - sizeof(cmd.base) - sizeof(struct ib_uverbs_cmd_hdr),
-		   out_len);
-
-	ret = modify_qp(file, &cmd, &udata);
-	if (ret)
-		return ret;
-
-	return in_len;
+	return modify_qp(attrs, &cmd);
 }
 
-int ib_uverbs_ex_modify_qp(struct ib_uverbs_file *file,
-			   struct ib_udata *ucore,
-			   struct ib_udata *uhw)
+static int ib_uverbs_ex_modify_qp(struct uverbs_attr_bundle *attrs)
 {
-	struct ib_uverbs_ex_modify_qp cmd = {};
+	struct ib_uverbs_ex_modify_qp cmd;
+	struct ib_uverbs_ex_modify_qp_resp resp = {
+		.response_length = uverbs_response_length(attrs, sizeof(resp))
+	};
 	int ret;
 
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
+
 	/*
 	 * Last bit is reserved for extending the attr_mask by
 	 * using another field.
 	 */
 	BUILD_BUG_ON(IB_USER_LAST_QP_ATTR_MASK == (1 << 31));
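+	/*
+	 * Worked example, assuming IB_USER_LAST_QP_ATTR_MASK is bit 25
+	 * (IB_QP_RATE_LIMIT): ((1 << 25) << 1) - 1 covers bits 0..25, so
+	 * any attr_mask bit above the last one this kernel knows about is
+	 * rejected with -EOPNOTSUPP below, while bit 31 stays free so
+	 * attr_mask can one day be extended via another field.
+	 */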
 
-	if (ucore->inlen < sizeof(cmd.base))
-		return -EINVAL;
-
-	ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
-	if (ret)
-		return ret;
-
 	if (cmd.base.attr_mask &
 	    ~((IB_USER_LAST_QP_ATTR_MASK << 1) - 1))
 		return -EOPNOTSUPP;
 
-	if (ucore->inlen > sizeof(cmd)) {
-		if (!ib_is_udata_cleared(ucore, sizeof(cmd),
-					 ucore->inlen - sizeof(cmd)))
-			return -EOPNOTSUPP;
-	}
+	ret = modify_qp(attrs, &cmd);
+	if (ret)
+		return ret;
 
-	ret = modify_qp(file, &cmd, uhw);
-
-	return ret;
+	return uverbs_response(attrs, &resp, sizeof(resp));
 }
 
-ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
-			     const char __user *buf, int in_len,
-			     int out_len)
+static int ib_uverbs_destroy_qp(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_destroy_qp      cmd;
 	struct ib_uverbs_destroy_qp_resp resp;
 	struct ib_uobject		*uobj;
 	struct ib_uqp_object        	*obj;
+	int ret;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	uobj = uobj_get_destroy(UVERBS_OBJECT_QP, cmd.qp_handle, file);
+	uobj = uobj_get_destroy(UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
 	if (IS_ERR(uobj))
 		return PTR_ERR(uobj);
 
@@ -2181,10 +1993,7 @@
 
 	uobj_put_destroy(uobj);
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
-		return -EFAULT;
-
-	return in_len;
+	return uverbs_response(attrs, &resp, sizeof(resp));
 }
 
 static void *alloc_wr(size_t wr_size, __u32 num_sge)
@@ -2197,9 +2006,7 @@
 			 num_sge * sizeof (struct ib_sge), GFP_KERNEL);
 }
 
-ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
-			    const char __user *buf, int in_len,
-			    int out_len)
+static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_post_send      cmd;
 	struct ib_uverbs_post_send_resp resp;
@@ -2209,33 +2016,41 @@
 	struct ib_qp                   *qp;
 	int                             i, sg_ind;
 	int				is_ud;
-	ssize_t                         ret = -EINVAL;
+	int ret, ret2;
 	size_t                          next_size;
+	const struct ib_sge __user *sgls;
+	const void __user *wqes;
+	struct uverbs_req_iter iter;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
-
-	if (in_len < sizeof cmd + cmd.wqe_size * cmd.wr_count +
-	    cmd.sge_count * sizeof (struct ib_uverbs_sge))
-		return -EINVAL;
-
-	if (cmd.wqe_size < sizeof (struct ib_uverbs_send_wr))
-		return -EINVAL;
+	ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
+	wqes = uverbs_request_next_ptr(&iter, cmd.wqe_size * cmd.wr_count);
+	if (IS_ERR(wqes))
+		return PTR_ERR(wqes);
+	sgls = uverbs_request_next_ptr(
+		&iter, cmd.sge_count * sizeof(struct ib_uverbs_sge));
+	if (IS_ERR(sgls))
+		return PTR_ERR(sgls);
+	ret = uverbs_request_finish(&iter);
+	if (ret)
+		return ret;
 
 	user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL);
 	if (!user_wr)
 		return -ENOMEM;
 
-	qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file);
-	if (!qp)
+	qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
+	if (!qp) {
+		ret = -EINVAL;
 		goto out;
+	}
 
 	is_ud = qp->qp_type == IB_QPT_UD;
 	sg_ind = 0;
 	last = NULL;
 	for (i = 0; i < cmd.wr_count; ++i) {
-		if (copy_from_user(user_wr,
-				   buf + sizeof cmd + i * cmd.wqe_size,
+		if (copy_from_user(user_wr, wqes + i * cmd.wqe_size,
 				   cmd.wqe_size)) {
 			ret = -EFAULT;
 			goto out_put;
@@ -2263,7 +2078,7 @@
 			}
 
 			ud->ah = uobj_get_obj_read(ah, UVERBS_OBJECT_AH,
-						   user_wr->wr.ud.ah, file);
+						   user_wr->wr.ud.ah, attrs);
 			if (!ud->ah) {
 				kfree(ud);
 				ret = -EINVAL;
@@ -2343,11 +2158,9 @@
 		if (next->num_sge) {
 			next->sg_list = (void *) next +
 				ALIGN(next_size, sizeof(struct ib_sge));
-			if (copy_from_user(next->sg_list,
-					   buf + sizeof cmd +
-					   cmd.wr_count * cmd.wqe_size +
-					   sg_ind * sizeof (struct ib_sge),
-					   next->num_sge * sizeof (struct ib_sge))) {
+			if (copy_from_user(next->sg_list, sgls + sg_ind,
+					   next->num_sge *
+						   sizeof(struct ib_sge))) {
 				ret = -EFAULT;
 				goto out_put;
 			}
@@ -2357,7 +2170,7 @@
 	}
 
 	resp.bad_wr = 0;
-	ret = qp->device->post_send(qp->real_qp, wr, &bad_wr);
+	ret = qp->device->ops.post_send(qp->real_qp, wr, &bad_wr);
 	if (ret)
 		for (next = wr; next; next = next->next) {
 			++resp.bad_wr;
@@ -2365,8 +2178,9 @@
 				break;
 		}
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
-		ret = -EFAULT;
+	ret2 = uverbs_response(attrs, &resp, sizeof(resp));
+	if (ret2)
+		ret = ret2;
 
 out_put:
 	uobj_put_obj_read(qp);
@@ -2382,28 +2196,35 @@
 out:
 	kfree(user_wr);
 
-	return ret ? ret : in_len;
+	return ret;
 }
 
-static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
-						    int in_len,
-						    u32 wr_count,
-						    u32 sge_count,
-						    u32 wqe_size)
+static struct ib_recv_wr *
+ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count,
+			  u32 wqe_size, u32 sge_count)
 {
 	struct ib_uverbs_recv_wr *user_wr;
 	struct ib_recv_wr        *wr = NULL, *last, *next;
 	int                       sg_ind;
 	int                       i;
 	int                       ret;
-
-	if (in_len < wqe_size * wr_count +
-	    sge_count * sizeof (struct ib_uverbs_sge))
-		return ERR_PTR(-EINVAL);
+	const struct ib_sge __user *sgls;
+	const void __user *wqes;
 
 	if (wqe_size < sizeof (struct ib_uverbs_recv_wr))
 		return ERR_PTR(-EINVAL);
 
+	wqes = uverbs_request_next_ptr(iter, wqe_size * wr_count);
+	if (IS_ERR(wqes))
+		return ERR_CAST(wqes);
+	sgls = uverbs_request_next_ptr(
+		iter, sge_count * sizeof(struct ib_uverbs_sge));
+	if (IS_ERR(sgls))
+		return ERR_CAST(sgls);
+	ret = uverbs_request_finish(iter);
+	if (ret)
+		return ERR_PTR(ret);
+
 	user_wr = kmalloc(wqe_size, GFP_KERNEL);
 	if (!user_wr)
 		return ERR_PTR(-ENOMEM);
@@ -2411,7 +2232,7 @@
 	sg_ind = 0;
 	last = NULL;
 	for (i = 0; i < wr_count; ++i) {
-		if (copy_from_user(user_wr, buf + i * wqe_size,
+		if (copy_from_user(user_wr, wqes + i * wqe_size,
 				   wqe_size)) {
 			ret = -EFAULT;
 			goto err;
@@ -2450,10 +2271,9 @@
 		if (next->num_sge) {
 			next->sg_list = (void *) next +
 				ALIGN(sizeof *next, sizeof (struct ib_sge));
-			if (copy_from_user(next->sg_list,
-					   buf + wr_count * wqe_size +
-					   sg_ind * sizeof (struct ib_sge),
-					   next->num_sge * sizeof (struct ib_sge))) {
+			if (copy_from_user(next->sg_list, sgls + sg_ind,
+					   next->num_sge *
+						   sizeof(struct ib_sge))) {
 				ret = -EFAULT;
 				goto err;
 			}
@@ -2477,32 +2297,33 @@
 	return ERR_PTR(ret);
 }
 
-ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
-			    const char __user *buf, int in_len,
-			    int out_len)
+static int ib_uverbs_post_recv(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_post_recv      cmd;
 	struct ib_uverbs_post_recv_resp resp;
 	struct ib_recv_wr              *wr, *next;
 	const struct ib_recv_wr	       *bad_wr;
 	struct ib_qp                   *qp;
-	ssize_t                         ret = -EINVAL;
+	int ret, ret2;
+	struct uverbs_req_iter iter;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
-				       in_len - sizeof cmd, cmd.wr_count,
-				       cmd.sge_count, cmd.wqe_size);
+	wr = ib_uverbs_unmarshall_recv(&iter, cmd.wr_count, cmd.wqe_size,
+				       cmd.sge_count);
 	if (IS_ERR(wr))
 		return PTR_ERR(wr);
 
-	qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file);
-	if (!qp)
+	qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
+	if (!qp) {
+		ret = -EINVAL;
 		goto out;
+	}
 
 	resp.bad_wr = 0;
-	ret = qp->device->post_recv(qp->real_qp, wr, &bad_wr);
+	ret = qp->device->ops.post_recv(qp->real_qp, wr, &bad_wr);
 
 	uobj_put_obj_read(qp);
 	if (ret) {
@@ -2513,9 +2334,9 @@
 		}
 	}
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
-		ret = -EFAULT;
-
+	ret2 = uverbs_response(attrs, &resp, sizeof(resp));
+	if (ret2)
+		ret = ret2;
 out:
 	while (wr) {
 		next = wr->next;
@@ -2523,36 +2344,36 @@
 		wr = next;
 	}
 
-	return ret ? ret : in_len;
+	return ret;
 }
 
-ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
-				const char __user *buf, int in_len,
-				int out_len)
+static int ib_uverbs_post_srq_recv(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_post_srq_recv      cmd;
 	struct ib_uverbs_post_srq_recv_resp resp;
 	struct ib_recv_wr                  *wr, *next;
 	const struct ib_recv_wr		   *bad_wr;
 	struct ib_srq                      *srq;
-	ssize_t                             ret = -EINVAL;
+	int ret, ret2;
+	struct uverbs_req_iter iter;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
-				       in_len - sizeof cmd, cmd.wr_count,
-				       cmd.sge_count, cmd.wqe_size);
+	wr = ib_uverbs_unmarshall_recv(&iter, cmd.wr_count, cmd.wqe_size,
+				       cmd.sge_count);
 	if (IS_ERR(wr))
 		return PTR_ERR(wr);
 
-	srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file);
-	if (!srq)
+	srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
+	if (!srq) {
+		ret = -EINVAL;
 		goto out;
+	}
 
 	resp.bad_wr = 0;
-	ret = srq->device->post_srq_recv ?
-		srq->device->post_srq_recv(srq, wr, &bad_wr) : -EOPNOTSUPP;
+	ret = srq->device->ops.post_srq_recv(srq, wr, &bad_wr);
 
 	uobj_put_obj_read(srq);
 
@@ -2563,8 +2384,9 @@
 				break;
 		}
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
-		ret = -EFAULT;
+	ret2 = uverbs_response(attrs, &resp, sizeof(resp));
+	if (ret2)
+		ret = ret2;
 
 out:
 	while (wr) {
@@ -2573,12 +2395,10 @@
 		wr = next;
 	}
 
-	return ret ? ret : in_len;
+	return ret;
 }
 
-ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
-			    const char __user *buf, int in_len,
-			    int out_len)
+static int ib_uverbs_create_ah(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_create_ah	 cmd;
 	struct ib_uverbs_create_ah_resp	 resp;
@@ -2587,21 +2407,13 @@
 	struct ib_ah			*ah;
 	struct rdma_ah_attr		attr = {};
 	int ret;
-	struct ib_udata                   udata;
 	struct ib_device *ib_dev;
 
-	if (out_len < sizeof resp)
-		return -ENOSPC;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
-
-	ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
-		   u64_to_user_ptr(cmd.response) + sizeof(resp),
-		   in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
-		   out_len - sizeof(resp));
-
-	uobj = uobj_alloc(UVERBS_OBJECT_AH, file, &ib_dev);
+	uobj = uobj_alloc(UVERBS_OBJECT_AH, attrs, &ib_dev);
 	if (IS_ERR(uobj))
 		return PTR_ERR(uobj);
 
@@ -2610,7 +2422,7 @@
 		goto err;
 	}
 
-	pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file);
+	pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
 	if (!pd) {
 		ret = -EINVAL;
 		goto err;
@@ -2634,7 +2446,7 @@
 		rdma_ah_set_ah_flags(&attr, 0);
 	}
 
-	ah = rdma_create_user_ah(pd, &attr, &udata);
+	ah = rdma_create_user_ah(pd, &attr, &attrs->driver_udata);
 	if (IS_ERR(ah)) {
 		ret = PTR_ERR(ah);
 		goto err_put;
@@ -2646,40 +2458,38 @@
 
 	resp.ah_handle = uobj->id;
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
-		ret = -EFAULT;
+	ret = uverbs_response(attrs, &resp, sizeof(resp));
+	if (ret)
 		goto err_copy;
-	}
 
 	uobj_put_obj_read(pd);
-	return uobj_alloc_commit(uobj, in_len);
+	return uobj_alloc_commit(uobj, attrs);
 
 err_copy:
-	rdma_destroy_ah(ah);
+	rdma_destroy_ah_user(ah, RDMA_DESTROY_AH_SLEEPABLE,
+			     uverbs_get_cleared_udata(attrs));
 
 err_put:
 	uobj_put_obj_read(pd);
 
 err:
-	uobj_alloc_abort(uobj);
+	uobj_alloc_abort(uobj, attrs);
 	return ret;
 }
 
-ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
-			     const char __user *buf, int in_len, int out_len)
+static int ib_uverbs_destroy_ah(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_destroy_ah cmd;
+	int ret;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	return uobj_perform_destroy(UVERBS_OBJECT_AH, cmd.ah_handle, file,
-				    in_len);
+	return uobj_perform_destroy(UVERBS_OBJECT_AH, cmd.ah_handle, attrs);
 }
 
-ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
-			       const char __user *buf, int in_len,
-			       int out_len)
+static int ib_uverbs_attach_mcast(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_attach_mcast cmd;
 	struct ib_qp                 *qp;
@@ -2687,10 +2497,11 @@
 	struct ib_uverbs_mcast_entry *mcast;
 	int                           ret;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file);
+	qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
 	if (!qp)
 		return -EINVAL;
 
@@ -2723,24 +2534,23 @@
 	mutex_unlock(&obj->mcast_lock);
 	uobj_put_obj_read(qp);
 
-	return ret ? ret : in_len;
+	return ret;
 }
 
-ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
-			       const char __user *buf, int in_len,
-			       int out_len)
+static int ib_uverbs_detach_mcast(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_detach_mcast cmd;
 	struct ib_uqp_object         *obj;
 	struct ib_qp                 *qp;
 	struct ib_uverbs_mcast_entry *mcast;
-	int                           ret = -EINVAL;
+	int                           ret;
 	bool                          found = false;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file);
+	qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
 	if (!qp)
 		return -EINVAL;
 
@@ -2766,19 +2576,10 @@
 out_put:
 	mutex_unlock(&obj->mcast_lock);
 	uobj_put_obj_read(qp);
-	return ret ? ret : in_len;
+	return ret;
 }
 
-struct ib_uflow_resources {
-	size_t			max;
-	size_t			num;
-	size_t			collection_num;
-	size_t			counters_num;
-	struct ib_counters	**counters;
-	struct ib_flow_action	**collection;
-};
-
-static struct ib_uflow_resources *flow_resources_alloc(size_t num_specs)
+struct ib_uflow_resources *flow_resources_alloc(size_t num_specs)
 {
 	struct ib_uflow_resources *resources;
 
@@ -2808,6 +2609,7 @@
 
 	return NULL;
 }
+EXPORT_SYMBOL(flow_resources_alloc);
 
 void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res)
 {
@@ -2826,10 +2628,11 @@
 	kfree(uflow_res->counters);
 	kfree(uflow_res);
 }
+EXPORT_SYMBOL(ib_uverbs_flow_resources_free);
 
-static void flow_resources_add(struct ib_uflow_resources *uflow_res,
-			       enum ib_flow_spec_type type,
-			       void *ibobj)
+void flow_resources_add(struct ib_uflow_resources *uflow_res,
+			enum ib_flow_spec_type type,
+			void *ibobj)
 {
 	WARN_ON(uflow_res->num >= uflow_res->max);
 
@@ -2850,8 +2653,9 @@
 
 	uflow_res->num++;
 }
+EXPORT_SYMBOL(flow_resources_add);
 
-static int kern_spec_to_ib_spec_action(struct ib_uverbs_file *ufile,
+static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs,
 				       struct ib_uverbs_flow_spec *kern_spec,
 				       union ib_flow_spec *ib_spec,
 				       struct ib_uflow_resources *uflow_res)
@@ -2880,7 +2684,7 @@
 		ib_spec->action.act = uobj_get_obj_read(flow_action,
 							UVERBS_OBJECT_FLOW_ACTION,
 							kern_spec->action.handle,
-							ufile);
+							attrs);
 		if (!ib_spec->action.act)
 			return -EINVAL;
 		ib_spec->action.size =
@@ -2898,7 +2702,7 @@
 			uobj_get_obj_read(counters,
 					  UVERBS_OBJECT_COUNTERS,
 					  kern_spec->flow_count.handle,
-					  ufile);
+					  attrs);
 		if (!ib_spec->flow_count.counters)
 			return -EINVAL;
 		ib_spec->flow_count.size =
@@ -3079,7 +2883,7 @@
 						     kern_filter_sz, ib_spec);
 }
 
-static int kern_spec_to_ib_spec(struct ib_uverbs_file *ufile,
+static int kern_spec_to_ib_spec(struct uverbs_attr_bundle *attrs,
 				struct ib_uverbs_flow_spec *kern_spec,
 				union ib_flow_spec *ib_spec,
 				struct ib_uflow_resources *uflow_res)
@@ -3088,17 +2892,15 @@
 		return -EINVAL;
 
 	if (kern_spec->type >= IB_FLOW_SPEC_ACTION_TAG)
-		return kern_spec_to_ib_spec_action(ufile, kern_spec, ib_spec,
+		return kern_spec_to_ib_spec_action(attrs, kern_spec, ib_spec,
 						   uflow_res);
 	else
 		return kern_spec_to_ib_spec_filter(kern_spec, ib_spec);
 }
 
-int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file,
-			   struct ib_udata *ucore,
-			   struct ib_udata *uhw)
+static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs)
 {
-	struct ib_uverbs_ex_create_wq	  cmd = {};
+	struct ib_uverbs_ex_create_wq cmd;
 	struct ib_uverbs_ex_create_wq_resp resp = {};
 	struct ib_uwq_object           *obj;
 	int err = 0;
@@ -3106,43 +2908,27 @@
 	struct ib_pd *pd;
 	struct ib_wq *wq;
 	struct ib_wq_init_attr wq_init_attr = {};
-	size_t required_cmd_sz;
-	size_t required_resp_len;
 	struct ib_device *ib_dev;
 
-	required_cmd_sz = offsetof(typeof(cmd), max_sge) + sizeof(cmd.max_sge);
-	required_resp_len = offsetof(typeof(resp), wqn) + sizeof(resp.wqn);
-
-	if (ucore->inlen < required_cmd_sz)
-		return -EINVAL;
-
-	if (ucore->outlen < required_resp_len)
-		return -ENOSPC;
-
-	if (ucore->inlen > sizeof(cmd) &&
-	    !ib_is_udata_cleared(ucore, sizeof(cmd),
-				 ucore->inlen - sizeof(cmd)))
-		return -EOPNOTSUPP;
-
-	err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+	err = uverbs_request(attrs, &cmd, sizeof(cmd));
 	if (err)
 		return err;
 
 	if (cmd.comp_mask)
 		return -EOPNOTSUPP;
 
-	obj = (struct ib_uwq_object *)uobj_alloc(UVERBS_OBJECT_WQ, file,
+	obj = (struct ib_uwq_object *)uobj_alloc(UVERBS_OBJECT_WQ, attrs,
 						 &ib_dev);
 	if (IS_ERR(obj))
 		return PTR_ERR(obj);
 
-	pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file);
+	pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
 	if (!pd) {
 		err = -EINVAL;
 		goto err_uobj;
 	}
 
-	cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file);
+	cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
 	if (!cq) {
 		err = -EINVAL;
 		goto err_put_pd;
@@ -3151,20 +2937,14 @@
 	wq_init_attr.cq = cq;
 	wq_init_attr.max_sge = cmd.max_sge;
 	wq_init_attr.max_wr = cmd.max_wr;
-	wq_init_attr.wq_context = file;
+	wq_init_attr.wq_context = attrs->ufile;
 	wq_init_attr.wq_type = cmd.wq_type;
 	wq_init_attr.event_handler = ib_uverbs_wq_event_handler;
-	if (ucore->inlen >= (offsetof(typeof(cmd), create_flags) +
-			     sizeof(cmd.create_flags)))
-		wq_init_attr.create_flags = cmd.create_flags;
+	wq_init_attr.create_flags = cmd.create_flags;
 	obj->uevent.events_reported = 0;
 	INIT_LIST_HEAD(&obj->uevent.event_list);
 
-	if (!pd->device->create_wq) {
-		err = -EOPNOTSUPP;
-		goto err_put_cq;
-	}
-	wq = pd->device->create_wq(pd, &wq_init_attr, uhw);
+	wq = pd->device->ops.create_wq(pd, &wq_init_attr, &attrs->driver_udata);
 	if (IS_ERR(wq)) {
 		err = PTR_ERR(wq);
 		goto err_put_cq;
@@ -3188,63 +2968,44 @@
 	resp.max_sge = wq_init_attr.max_sge;
 	resp.max_wr = wq_init_attr.max_wr;
 	resp.wqn = wq->wq_num;
-	resp.response_length = required_resp_len;
-	err = ib_copy_to_udata(ucore,
-			       &resp, resp.response_length);
+	resp.response_length = uverbs_response_length(attrs, sizeof(resp));
+	err = uverbs_response(attrs, &resp, sizeof(resp));
 	if (err)
 		goto err_copy;
 
 	uobj_put_obj_read(pd);
 	uobj_put_obj_read(cq);
-	return uobj_alloc_commit(&obj->uevent.uobject, 0);
+	return uobj_alloc_commit(&obj->uevent.uobject, attrs);
 
 err_copy:
-	ib_destroy_wq(wq);
+	ib_destroy_wq(wq, uverbs_get_cleared_udata(attrs));
 err_put_cq:
 	uobj_put_obj_read(cq);
 err_put_pd:
 	uobj_put_obj_read(pd);
 err_uobj:
-	uobj_alloc_abort(&obj->uevent.uobject);
+	uobj_alloc_abort(&obj->uevent.uobject, attrs);
 
 	return err;
 }
 
-int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file,
-			    struct ib_udata *ucore,
-			    struct ib_udata *uhw)
+static int ib_uverbs_ex_destroy_wq(struct uverbs_attr_bundle *attrs)
 {
-	struct ib_uverbs_ex_destroy_wq	cmd = {};
+	struct ib_uverbs_ex_destroy_wq	cmd;
 	struct ib_uverbs_ex_destroy_wq_resp	resp = {};
 	struct ib_uobject		*uobj;
 	struct ib_uwq_object		*obj;
-	size_t required_cmd_sz;
-	size_t required_resp_len;
 	int				ret;
 
-	required_cmd_sz = offsetof(typeof(cmd), wq_handle) + sizeof(cmd.wq_handle);
-	required_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
-
-	if (ucore->inlen < required_cmd_sz)
-		return -EINVAL;
-
-	if (ucore->outlen < required_resp_len)
-		return -ENOSPC;
-
-	if (ucore->inlen > sizeof(cmd) &&
-	    !ib_is_udata_cleared(ucore, sizeof(cmd),
-				 ucore->inlen - sizeof(cmd)))
-		return -EOPNOTSUPP;
-
-	ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
 	if (ret)
 		return ret;
 
 	if (cmd.comp_mask)
 		return -EOPNOTSUPP;
 
-	resp.response_length = required_resp_len;
-	uobj = uobj_get_destroy(UVERBS_OBJECT_WQ, cmd.wq_handle, file);
+	resp.response_length = uverbs_response_length(attrs, sizeof(resp));
+	uobj = uobj_get_destroy(UVERBS_OBJECT_WQ, cmd.wq_handle, attrs);
 	if (IS_ERR(uobj))
 		return PTR_ERR(uobj);
 
@@ -3253,29 +3014,17 @@
 
 	uobj_put_destroy(uobj);
 
-	return ib_copy_to_udata(ucore, &resp, resp.response_length);
+	return uverbs_response(attrs, &resp, sizeof(resp));
 }
 
-int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file,
-			   struct ib_udata *ucore,
-			   struct ib_udata *uhw)
+static int ib_uverbs_ex_modify_wq(struct uverbs_attr_bundle *attrs)
 {
-	struct ib_uverbs_ex_modify_wq cmd = {};
+	struct ib_uverbs_ex_modify_wq cmd;
 	struct ib_wq *wq;
 	struct ib_wq_attr wq_attr = {};
-	size_t required_cmd_sz;
 	int ret;
 
-	required_cmd_sz = offsetof(typeof(cmd), curr_wq_state) + sizeof(cmd.curr_wq_state);
-	if (ucore->inlen < required_cmd_sz)
-		return -EINVAL;
-
-	if (ucore->inlen > sizeof(cmd) &&
-	    !ib_is_udata_cleared(ucore, sizeof(cmd),
-				 ucore->inlen - sizeof(cmd)))
-		return -EOPNOTSUPP;
-
-	ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
 	if (ret)
 		return ret;
 
@@ -3285,7 +3034,7 @@
 	if (cmd.attr_mask > (IB_WQ_STATE | IB_WQ_CUR_STATE | IB_WQ_FLAGS))
 		return -EINVAL;
 
-	wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, cmd.wq_handle, file);
+	wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, cmd.wq_handle, attrs);
 	if (!wq)
 		return -EINVAL;
 
@@ -3295,24 +3044,18 @@
 		wq_attr.flags = cmd.flags;
 		wq_attr.flags_mask = cmd.flags_mask;
 	}
-	if (!wq->device->modify_wq) {
-		ret = -EOPNOTSUPP;
-		goto out;
-	}
-	ret = wq->device->modify_wq(wq, &wq_attr, cmd.attr_mask, uhw);
-out:
+	ret = wq->device->ops.modify_wq(wq, &wq_attr, cmd.attr_mask,
+					&attrs->driver_udata);
 	uobj_put_obj_read(wq);
 	return ret;
 }
 
-int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file,
-				      struct ib_udata *ucore,
-				      struct ib_udata *uhw)
+static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs)
 {
-	struct ib_uverbs_ex_create_rwq_ind_table	  cmd = {};
+	struct ib_uverbs_ex_create_rwq_ind_table cmd;
 	struct ib_uverbs_ex_create_rwq_ind_table_resp  resp = {};
 	struct ib_uobject		  *uobj;
-	int err = 0;
+	int err;
 	struct ib_rwq_ind_table_init_attr init_attr = {};
 	struct ib_rwq_ind_table *rwq_ind_tbl;
 	struct ib_wq	**wqs = NULL;
@@ -3320,27 +3063,13 @@
 	struct ib_wq	*wq = NULL;
 	int i, j, num_read_wqs;
 	u32 num_wq_handles;
-	u32 expected_in_size;
-	size_t required_cmd_sz_header;
-	size_t required_resp_len;
+	struct uverbs_req_iter iter;
 	struct ib_device *ib_dev;
 
-	required_cmd_sz_header = offsetof(typeof(cmd), log_ind_tbl_size) + sizeof(cmd.log_ind_tbl_size);
-	required_resp_len = offsetof(typeof(resp), ind_tbl_num) + sizeof(resp.ind_tbl_num);
-
-	if (ucore->inlen < required_cmd_sz_header)
-		return -EINVAL;
-
-	if (ucore->outlen < required_resp_len)
-		return -ENOSPC;
-
-	err = ib_copy_from_udata(&cmd, ucore, required_cmd_sz_header);
+	err = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
 	if (err)
 		return err;
 
-	ucore->inbuf += required_cmd_sz_header;
-	ucore->inlen -= required_cmd_sz_header;
-
 	if (cmd.comp_mask)
 		return -EOPNOTSUPP;
 
@@ -3348,26 +3077,17 @@
 		return -EINVAL;
 
 	num_wq_handles = 1 << cmd.log_ind_tbl_size;
-	expected_in_size = num_wq_handles * sizeof(__u32);
-	if (num_wq_handles == 1)
-		/* input size for wq handles is u64 aligned */
-		expected_in_size += sizeof(__u32);
-
-	if (ucore->inlen < expected_in_size)
-		return -EINVAL;
-
-	if (ucore->inlen > expected_in_size &&
-	    !ib_is_udata_cleared(ucore, expected_in_size,
-				 ucore->inlen - expected_in_size))
-		return -EOPNOTSUPP;
-
 	wqs_handles = kcalloc(num_wq_handles, sizeof(*wqs_handles),
 			      GFP_KERNEL);
 	if (!wqs_handles)
 		return -ENOMEM;
 
-	err = ib_copy_from_udata(wqs_handles, ucore,
-				 num_wq_handles * sizeof(__u32));
+	err = uverbs_request_next(&iter, wqs_handles,
+				  num_wq_handles * sizeof(__u32));
+	if (err)
+		goto err_free;
+
+	err = uverbs_request_finish(&iter);
 	if (err)
 		goto err_free;
 
@@ -3380,7 +3100,7 @@
 	for (num_read_wqs = 0; num_read_wqs < num_wq_handles;
 			num_read_wqs++) {
 		wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ,
-				       wqs_handles[num_read_wqs], file);
+				       wqs_handles[num_read_wqs], attrs);
 		if (!wq) {
 			err = -EINVAL;
 			goto put_wqs;
@@ -3389,7 +3109,7 @@
 		wqs[num_read_wqs] = wq;
 	}
 
-	uobj = uobj_alloc(UVERBS_OBJECT_RWQ_IND_TBL, file, &ib_dev);
+	uobj = uobj_alloc(UVERBS_OBJECT_RWQ_IND_TBL, attrs, &ib_dev);
 	if (IS_ERR(uobj)) {
 		err = PTR_ERR(uobj);
 		goto put_wqs;
@@ -3398,11 +3118,8 @@
 	init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size;
 	init_attr.ind_tbl = wqs;
 
-	if (!ib_dev->create_rwq_ind_table) {
-		err = -EOPNOTSUPP;
-		goto err_uobj;
-	}
-	rwq_ind_tbl = ib_dev->create_rwq_ind_table(ib_dev, &init_attr, uhw);
+	rwq_ind_tbl = ib_dev->ops.create_rwq_ind_table(ib_dev, &init_attr,
+						       &attrs->driver_udata);
 
 	if (IS_ERR(rwq_ind_tbl)) {
 		err = PTR_ERR(rwq_ind_tbl);
@@ -3421,10 +3138,9 @@
 
 	resp.ind_tbl_handle = uobj->id;
 	resp.ind_tbl_num = rwq_ind_tbl->ind_tbl_num;
-	resp.response_length = required_resp_len;
+	resp.response_length = uverbs_response_length(attrs, sizeof(resp));
 
-	err = ib_copy_to_udata(ucore,
-			       &resp, resp.response_length);
+	err = uverbs_response(attrs, &resp, sizeof(resp));
 	if (err)
 		goto err_copy;
 
@@ -3433,12 +3149,12 @@
 	for (j = 0; j < num_read_wqs; j++)
 		uobj_put_obj_read(wqs[j]);
 
-	return uobj_alloc_commit(uobj, 0);
+	return uobj_alloc_commit(uobj, attrs);
 
 err_copy:
 	ib_destroy_rwq_ind_table(rwq_ind_tbl);
 err_uobj:
-	uobj_alloc_abort(uobj);
+	uobj_alloc_abort(uobj, attrs);
 put_wqs:
 	for (j = 0; j < num_read_wqs; j++)
 		uobj_put_obj_read(wqs[j]);
@@ -3448,25 +3164,12 @@
 	return err;
 }
 
-int ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file,
-				       struct ib_udata *ucore,
-				       struct ib_udata *uhw)
+static int ib_uverbs_ex_destroy_rwq_ind_table(struct uverbs_attr_bundle *attrs)
 {
-	struct ib_uverbs_ex_destroy_rwq_ind_table	cmd = {};
-	int			ret;
-	size_t required_cmd_sz;
+	struct ib_uverbs_ex_destroy_rwq_ind_table cmd;
+	int ret;
 
-	required_cmd_sz = offsetof(typeof(cmd), ind_tbl_handle) + sizeof(cmd.ind_tbl_handle);
-
-	if (ucore->inlen < required_cmd_sz)
-		return -EINVAL;
-
-	if (ucore->inlen > sizeof(cmd) &&
-	    !ib_is_udata_cleared(ucore, sizeof(cmd),
-				 ucore->inlen - sizeof(cmd)))
-		return -EOPNOTSUPP;
-
-	ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
 	if (ret)
 		return ret;
 
@@ -3474,41 +3177,30 @@
 		return -EOPNOTSUPP;
 
 	return uobj_perform_destroy(UVERBS_OBJECT_RWQ_IND_TBL,
-				    cmd.ind_tbl_handle, file, 0);
+				    cmd.ind_tbl_handle, attrs);
 }
 
-int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
-			     struct ib_udata *ucore,
-			     struct ib_udata *uhw)
+static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_create_flow	  cmd;
 	struct ib_uverbs_create_flow_resp resp;
 	struct ib_uobject		  *uobj;
-	struct ib_uflow_object		  *uflow;
 	struct ib_flow			  *flow_id;
 	struct ib_uverbs_flow_attr	  *kern_flow_attr;
 	struct ib_flow_attr		  *flow_attr;
 	struct ib_qp			  *qp;
 	struct ib_uflow_resources	  *uflow_res;
 	struct ib_uverbs_flow_spec_hdr	  *kern_spec;
-	int err = 0;
+	struct uverbs_req_iter iter;
+	int err;
 	void *ib_spec;
 	int i;
 	struct ib_device *ib_dev;
 
-	if (ucore->inlen < sizeof(cmd))
-		return -EINVAL;
-
-	if (ucore->outlen < sizeof(resp))
-		return -ENOSPC;
-
-	err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+	err = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
 	if (err)
 		return err;
 
-	ucore->inbuf += sizeof(cmd);
-	ucore->inlen -= sizeof(cmd);
-
 	if (cmd.comp_mask)
 		return -EINVAL;
 
@@ -3526,8 +3218,7 @@
 	if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS)
 		return -EINVAL;
 
-	if (cmd.flow_attr.size > ucore->inlen ||
-	    cmd.flow_attr.size >
+	if (cmd.flow_attr.size >
 	    (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec)))
 		return -EINVAL;
 
@@ -3542,21 +3233,25 @@
 			return -ENOMEM;
 
 		*kern_flow_attr = cmd.flow_attr;
-		err = ib_copy_from_udata(&kern_flow_attr->flow_specs, ucore,
-					 cmd.flow_attr.size);
+		err = uverbs_request_next(&iter, &kern_flow_attr->flow_specs,
+					  cmd.flow_attr.size);
 		if (err)
 			goto err_free_attr;
 	} else {
 		kern_flow_attr = &cmd.flow_attr;
 	}
 
-	uobj = uobj_alloc(UVERBS_OBJECT_FLOW, file, &ib_dev);
+	err = uverbs_request_finish(&iter);
+	if (err)
+		goto err_free_attr;
+
+	uobj = uobj_alloc(UVERBS_OBJECT_FLOW, attrs, &ib_dev);
 	if (IS_ERR(uobj)) {
 		err = PTR_ERR(uobj);
 		goto err_free_attr;
 	}
 
-	qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file);
+	qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
 	if (!qp) {
 		err = -EINVAL;
 		goto err_uobj;
@@ -3567,11 +3262,6 @@
 		goto err_put;
 	}
 
-	if (!qp->device->create_flow) {
-		err = -EOPNOTSUPP;
-		goto err_put;
-	}
-
 	flow_attr = kzalloc(struct_size(flow_attr, flows,
 				cmd.flow_attr.num_of_specs), GFP_KERNEL);
 	if (!flow_attr) {
@@ -3598,7 +3288,7 @@
 			cmd.flow_attr.size >= kern_spec->size;
 	     i++) {
 		err = kern_spec_to_ib_spec(
-				file, (struct ib_uverbs_flow_spec *)kern_spec,
+				attrs, (struct ib_uverbs_flow_spec *)kern_spec,
 				ib_spec, uflow_res);
 		if (err)
 			goto err_free;
@@ -3616,26 +3306,20 @@
 		goto err_free;
 	}
 
-	flow_id = qp->device->create_flow(qp, flow_attr,
-					  IB_FLOW_DOMAIN_USER, uhw);
+	flow_id = qp->device->ops.create_flow(
+		qp, flow_attr, IB_FLOW_DOMAIN_USER, &attrs->driver_udata);
 
 	if (IS_ERR(flow_id)) {
 		err = PTR_ERR(flow_id);
 		goto err_free;
 	}
-	atomic_inc(&qp->usecnt);
-	flow_id->qp = qp;
-	flow_id->device = qp->device;
-	flow_id->uobject = uobj;
-	uobj->object = flow_id;
-	uflow = container_of(uobj, typeof(*uflow), uobject);
-	uflow->resources = uflow_res;
+
+	ib_set_flow(uobj, flow_id, qp, qp->device, uflow_res);
 
 	memset(&resp, 0, sizeof(resp));
 	resp.flow_handle = uobj->id;
 
-	err = ib_copy_to_udata(ucore,
-			       &resp, sizeof(resp));
+	err = uverbs_response(attrs, &resp, sizeof(resp));
 	if (err)
 		goto err_copy;
 
@@ -3643,9 +3327,9 @@
 	kfree(flow_attr);
 	if (cmd.flow_attr.num_of_specs)
 		kfree(kern_flow_attr);
-	return uobj_alloc_commit(uobj, 0);
+	return uobj_alloc_commit(uobj, attrs);
 err_copy:
-	if (!qp->device->destroy_flow(flow_id))
+	if (!qp->device->ops.destroy_flow(flow_id))
 		atomic_dec(&qp->usecnt);
 err_free:
 	ib_uverbs_flow_resources_free(uflow_res);
@@ -3654,35 +3338,29 @@
 err_put:
 	uobj_put_obj_read(qp);
 err_uobj:
-	uobj_alloc_abort(uobj);
+	uobj_alloc_abort(uobj, attrs);
 err_free_attr:
 	if (cmd.flow_attr.num_of_specs)
 		kfree(kern_flow_attr);
 	return err;
 }
 
-int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
-			      struct ib_udata *ucore,
-			      struct ib_udata *uhw)
+static int ib_uverbs_ex_destroy_flow(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_destroy_flow	cmd;
 	int				ret;
 
-	if (ucore->inlen < sizeof(cmd))
-		return -EINVAL;
-
-	ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
 	if (ret)
 		return ret;
 
 	if (cmd.comp_mask)
 		return -EINVAL;
 
-	return uobj_perform_destroy(UVERBS_OBJECT_FLOW, cmd.flow_handle, file,
-				    0);
+	return uobj_perform_destroy(UVERBS_OBJECT_FLOW, cmd.flow_handle, attrs);
 }
 
-static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
+static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs,
 				struct ib_uverbs_create_xsrq *cmd,
 				struct ib_udata *udata)
 {
@@ -3695,7 +3373,7 @@
 	int ret;
 	struct ib_device *ib_dev;
 
-	obj = (struct ib_usrq_object *)uobj_alloc(UVERBS_OBJECT_SRQ, file,
+	obj = (struct ib_usrq_object *)uobj_alloc(UVERBS_OBJECT_SRQ, attrs,
 						  &ib_dev);
 	if (IS_ERR(obj))
 		return PTR_ERR(obj);
@@ -3705,7 +3383,7 @@
 
 	if (cmd->srq_type == IB_SRQT_XRC) {
 		xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd->xrcd_handle,
-					  file);
+					  attrs);
 		if (IS_ERR(xrcd_uobj)) {
 			ret = -EINVAL;
 			goto err;
@@ -3723,21 +3401,21 @@
 
 	if (ib_srq_has_cq(cmd->srq_type)) {
 		attr.ext.cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ,
-						cmd->cq_handle, file);
+						cmd->cq_handle, attrs);
 		if (!attr.ext.cq) {
 			ret = -EINVAL;
 			goto err_put_xrcd;
 		}
 	}
 
-	pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, file);
+	pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, attrs);
 	if (!pd) {
 		ret = -EINVAL;
 		goto err_put_cq;
 	}
 
 	attr.event_handler  = ib_uverbs_srq_event_handler;
-	attr.srq_context    = file;
+	attr.srq_context    = attrs->ufile;
 	attr.srq_type       = cmd->srq_type;
 	attr.attr.max_wr    = cmd->max_wr;
 	attr.attr.max_sge   = cmd->max_sge;
@@ -3746,9 +3424,9 @@
 	obj->uevent.events_reported = 0;
 	INIT_LIST_HEAD(&obj->uevent.event_list);
 
-	srq = pd->device->create_srq(pd, &attr, udata);
-	if (IS_ERR(srq)) {
-		ret = PTR_ERR(srq);
+	srq = rdma_zalloc_drv_obj(ib_dev, ib_srq);
+	if (!srq) {
+		ret = -ENOMEM;
 		goto err_put;
 	}
 
@@ -3759,6 +3437,10 @@
 	srq->event_handler = attr.event_handler;
 	srq->srq_context   = attr.srq_context;
 
+	ret = pd->device->ops.create_srq(srq, &attr, udata);
+	if (ret)
+		goto err_free;
+
 	if (ib_srq_has_cq(cmd->srq_type)) {
 		srq->ext.cq       = attr.ext.cq;
 		atomic_inc(&attr.ext.cq->usecnt);
@@ -3782,11 +3464,9 @@
 	if (cmd->srq_type == IB_SRQT_XRC)
 		resp.srqn = srq->ext.xrc.srq_num;
 
-	if (copy_to_user(u64_to_user_ptr(cmd->response),
-			 &resp, sizeof resp)) {
-		ret = -EFAULT;
+	ret = uverbs_response(attrs, &resp, sizeof(resp));
+	if (ret)
 		goto err_copy;
-	}
 
 	if (cmd->srq_type == IB_SRQT_XRC)
 		uobj_put_read(xrcd_uobj);
@@ -3795,11 +3475,14 @@
 		uobj_put_obj_read(attr.ext.cq);
 
 	uobj_put_obj_read(pd);
-	return uobj_alloc_commit(&obj->uevent.uobject, 0);
+	return uobj_alloc_commit(&obj->uevent.uobject, attrs);
 
 err_copy:
-	ib_destroy_srq(srq);
-
+	ib_destroy_srq_user(srq, uverbs_get_cleared_udata(attrs));
+	/* It was released in ib_destroy_srq_user */
+	srq = NULL;
+err_free:
+	kfree(srq);
 err_put:
 	uobj_put_obj_read(pd);
 
@@ -3814,25 +3497,19 @@
 	}
 
 err:
-	uobj_alloc_abort(&obj->uevent.uobject);
+	uobj_alloc_abort(&obj->uevent.uobject, attrs);
 	return ret;
 }
 
-ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
-			     const char __user *buf, int in_len,
-			     int out_len)
+static int ib_uverbs_create_srq(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_create_srq      cmd;
 	struct ib_uverbs_create_xsrq     xcmd;
-	struct ib_uverbs_create_srq_resp resp;
-	struct ib_udata                  udata;
 	int ret;
 
-	if (out_len < sizeof resp)
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
 	memset(&xcmd, 0, sizeof(xcmd));
 	xcmd.response	 = cmd.response;
@@ -3843,77 +3520,48 @@
 	xcmd.max_sge	 = cmd.max_sge;
 	xcmd.srq_limit	 = cmd.srq_limit;
 
-	ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
-		   u64_to_user_ptr(cmd.response) + sizeof(resp),
-		   in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
-		   out_len - sizeof(resp));
-
-	ret = __uverbs_create_xsrq(file, &xcmd, &udata);
-	if (ret)
-		return ret;
-
-	return in_len;
+	return __uverbs_create_xsrq(attrs, &xcmd, &attrs->driver_udata);
 }
 
-ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
-			      const char __user *buf, int in_len, int out_len)
+static int ib_uverbs_create_xsrq(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_create_xsrq     cmd;
-	struct ib_uverbs_create_srq_resp resp;
-	struct ib_udata                  udata;
 	int ret;
 
-	if (out_len < sizeof resp)
-		return -ENOSPC;
-
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
-
-	ib_uverbs_init_udata(&udata, buf + sizeof(cmd),
-		   u64_to_user_ptr(cmd.response) + sizeof(resp),
-		   in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
-		   out_len - sizeof(resp));
-
-	ret = __uverbs_create_xsrq(file, &cmd, &udata);
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
 	if (ret)
 		return ret;
 
-	return in_len;
+	return __uverbs_create_xsrq(attrs, &cmd, &attrs->driver_udata);
 }
 
-ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
-			     const char __user *buf, int in_len,
-			     int out_len)
+static int ib_uverbs_modify_srq(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_modify_srq cmd;
-	struct ib_udata             udata;
 	struct ib_srq              *srq;
 	struct ib_srq_attr          attr;
 	int                         ret;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	ib_uverbs_init_udata(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
-		   out_len);
-
-	srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file);
+	srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
 	if (!srq)
 		return -EINVAL;
 
 	attr.max_wr    = cmd.max_wr;
 	attr.srq_limit = cmd.srq_limit;
 
-	ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata);
+	ret = srq->device->ops.modify_srq(srq, &attr, cmd.attr_mask,
+					  &attrs->driver_udata);
 
 	uobj_put_obj_read(srq);
 
-	return ret ? ret : in_len;
+	return ret;
 }
 
-ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
-			    const char __user *buf,
-			    int in_len, int out_len)
+static int ib_uverbs_query_srq(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_query_srq      cmd;
 	struct ib_uverbs_query_srq_resp resp;
@@ -3921,13 +3569,11 @@
 	struct ib_srq                   *srq;
 	int                             ret;
 
-	if (out_len < sizeof resp)
-		return -ENOSPC;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
-
-	srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file);
+	srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
 	if (!srq)
 		return -EINVAL;
 
@@ -3944,25 +3590,22 @@
 	resp.max_sge   = attr.max_sge;
 	resp.srq_limit = attr.srq_limit;
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp))
-		return -EFAULT;
-
-	return in_len;
+	return uverbs_response(attrs, &resp, sizeof(resp));
 }
 
-ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
-			      const char __user *buf, int in_len,
-			      int out_len)
+static int ib_uverbs_destroy_srq(struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uverbs_destroy_srq      cmd;
 	struct ib_uverbs_destroy_srq_resp resp;
 	struct ib_uobject		 *uobj;
 	struct ib_uevent_object        	 *obj;
+	int ret;
 
-	if (copy_from_user(&cmd, buf, sizeof cmd))
-		return -EFAULT;
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
 
-	uobj = uobj_get_destroy(UVERBS_OBJECT_SRQ, cmd.srq_handle, file);
+	uobj = uobj_get_destroy(UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
 	if (IS_ERR(uobj))
 		return PTR_ERR(uobj);
 
@@ -3972,35 +3615,24 @@
 
 	uobj_put_destroy(uobj);
 
-	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp)))
-		return -EFAULT;
-
-	return in_len;
+	return uverbs_response(attrs, &resp, sizeof(resp));
 }
 
-int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
-			      struct ib_udata *ucore,
-			      struct ib_udata *uhw)
+static int ib_uverbs_ex_query_device(struct uverbs_attr_bundle *attrs)
 {
-	struct ib_uverbs_ex_query_device_resp resp = { {0} };
+	struct ib_uverbs_ex_query_device_resp resp = {};
 	struct ib_uverbs_ex_query_device  cmd;
 	struct ib_device_attr attr = {0};
 	struct ib_ucontext *ucontext;
 	struct ib_device *ib_dev;
 	int err;
 
-	ucontext = ib_uverbs_get_ucontext(file);
+	ucontext = ib_uverbs_get_ucontext(attrs);
 	if (IS_ERR(ucontext))
 		return PTR_ERR(ucontext);
 	ib_dev = ucontext->device;
 
-	if (!ib_dev->query_device)
-		return -EOPNOTSUPP;
-
-	if (ucore->inlen < sizeof(cmd))
-		return -EINVAL;
-
-	err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+	err = uverbs_request(attrs, &cmd, sizeof(cmd));
 	if (err)
 		return err;
 
@@ -4010,21 +3642,12 @@
 	if (cmd.reserved)
 		return -EINVAL;
 
-	resp.response_length = offsetof(typeof(resp), odp_caps);
-
-	if (ucore->outlen < resp.response_length)
-		return -ENOSPC;
-
-	err = ib_dev->query_device(ib_dev, &attr, uhw);
+	err = ib_dev->ops.query_device(ib_dev, &attr, &attrs->driver_udata);
 	if (err)
 		return err;
 
 	copy_query_dev_fields(ucontext, &resp.base, &attr);
 
-	if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps))
-		goto end;
-
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 	resp.odp_caps.general_caps = attr.odp_caps.general_caps;
 	resp.odp_caps.per_transport_caps.rc_odp_caps =
 		attr.odp_caps.per_transport_caps.rc_odp_caps;
@@ -4032,100 +3655,40 @@
 		attr.odp_caps.per_transport_caps.uc_odp_caps;
 	resp.odp_caps.per_transport_caps.ud_odp_caps =
 		attr.odp_caps.per_transport_caps.ud_odp_caps;
-#endif
-	resp.response_length += sizeof(resp.odp_caps);
-
-	if (ucore->outlen < resp.response_length + sizeof(resp.timestamp_mask))
-		goto end;
+	resp.xrc_odp_caps = attr.odp_caps.per_transport_caps.xrc_odp_caps;
 
 	resp.timestamp_mask = attr.timestamp_mask;
-	resp.response_length += sizeof(resp.timestamp_mask);
-
-	if (ucore->outlen < resp.response_length + sizeof(resp.hca_core_clock))
-		goto end;
-
 	resp.hca_core_clock = attr.hca_core_clock;
-	resp.response_length += sizeof(resp.hca_core_clock);
-
-	if (ucore->outlen < resp.response_length + sizeof(resp.device_cap_flags_ex))
-		goto end;
-
 	resp.device_cap_flags_ex = attr.device_cap_flags;
-	resp.response_length += sizeof(resp.device_cap_flags_ex);
-
-	if (ucore->outlen < resp.response_length + sizeof(resp.rss_caps))
-		goto end;
-
 	resp.rss_caps.supported_qpts = attr.rss_caps.supported_qpts;
 	resp.rss_caps.max_rwq_indirection_tables =
 		attr.rss_caps.max_rwq_indirection_tables;
 	resp.rss_caps.max_rwq_indirection_table_size =
 		attr.rss_caps.max_rwq_indirection_table_size;
-
-	resp.response_length += sizeof(resp.rss_caps);
-
-	if (ucore->outlen < resp.response_length + sizeof(resp.max_wq_type_rq))
-		goto end;
-
 	resp.max_wq_type_rq = attr.max_wq_type_rq;
-	resp.response_length += sizeof(resp.max_wq_type_rq);
-
-	if (ucore->outlen < resp.response_length + sizeof(resp.raw_packet_caps))
-		goto end;
-
 	resp.raw_packet_caps = attr.raw_packet_caps;
-	resp.response_length += sizeof(resp.raw_packet_caps);
-
-	if (ucore->outlen < resp.response_length + sizeof(resp.tm_caps))
-		goto end;
-
 	resp.tm_caps.max_rndv_hdr_size	= attr.tm_caps.max_rndv_hdr_size;
 	resp.tm_caps.max_num_tags	= attr.tm_caps.max_num_tags;
 	resp.tm_caps.max_ops		= attr.tm_caps.max_ops;
 	resp.tm_caps.max_sge		= attr.tm_caps.max_sge;
 	resp.tm_caps.flags		= attr.tm_caps.flags;
-	resp.response_length += sizeof(resp.tm_caps);
-
-	if (ucore->outlen < resp.response_length + sizeof(resp.cq_moderation_caps))
-		goto end;
-
 	resp.cq_moderation_caps.max_cq_moderation_count  =
 		attr.cq_caps.max_cq_moderation_count;
 	resp.cq_moderation_caps.max_cq_moderation_period =
 		attr.cq_caps.max_cq_moderation_period;
-	resp.response_length += sizeof(resp.cq_moderation_caps);
-
-	if (ucore->outlen < resp.response_length + sizeof(resp.max_dm_size))
-		goto end;
-
 	resp.max_dm_size = attr.max_dm_size;
-	resp.response_length += sizeof(resp.max_dm_size);
-end:
-	err = ib_copy_to_udata(ucore, &resp, resp.response_length);
-	return err;
+	resp.response_length = uverbs_response_length(attrs, sizeof(resp));
+
+	return uverbs_response(attrs, &resp, sizeof(resp));
 }
 
-int ib_uverbs_ex_modify_cq(struct ib_uverbs_file *file,
-			   struct ib_udata *ucore,
-			   struct ib_udata *uhw)
+static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs)
 {
-	struct ib_uverbs_ex_modify_cq cmd = {};
+	struct ib_uverbs_ex_modify_cq cmd;
 	struct ib_cq *cq;
-	size_t required_cmd_sz;
 	int ret;
 
-	required_cmd_sz = offsetof(typeof(cmd), reserved) +
-				sizeof(cmd.reserved);
-	if (ucore->inlen < required_cmd_sz)
-		return -EINVAL;
-
-	/* sanity checks */
-	if (ucore->inlen > sizeof(cmd) &&
-	    !ib_is_udata_cleared(ucore, sizeof(cmd),
-				 ucore->inlen - sizeof(cmd)))
-		return -EOPNOTSUPP;
-
-	ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
 	if (ret)
 		return ret;
 
@@ -4135,7 +3698,7 @@
 	if (cmd.attr_mask > IB_CQ_MODERATE)
 		return -EOPNOTSUPP;
 
-	cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file);
+	cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
 	if (!cq)
 		return -EINVAL;
 
@@ -4145,3 +3708,378 @@
 
 	return ret;
 }
+
+/*
+ * Describe the input structs for write(). Some write methods have an input
+ * only struct, most have an input and output. If the struct has an output then
+ * the 'response' u64 must be the first field in the request structure.
+ *
+ * If udata is present then both the request and response structs have a
+ * trailing driver_data flex array. In this case the size of the base struct
+ * cannot be changed.
+ */
+#define UAPI_DEF_WRITE_IO(req, resp)                                           \
+	.write.has_resp = 1 +                                                  \
+			  BUILD_BUG_ON_ZERO(offsetof(req, response) != 0) +    \
+			  BUILD_BUG_ON_ZERO(sizeof(((req *)0)->response) !=    \
+					    sizeof(u64)),                      \
+	.write.req_size = sizeof(req), .write.resp_size = sizeof(resp)
+
+#define UAPI_DEF_WRITE_I(req) .write.req_size = sizeof(req)
+
+#define UAPI_DEF_WRITE_UDATA_IO(req, resp)                                     \
+	UAPI_DEF_WRITE_IO(req, resp),                                          \
+		.write.has_udata =                                             \
+			1 +                                                    \
+			BUILD_BUG_ON_ZERO(offsetof(req, driver_data) !=        \
+					  sizeof(req)) +                       \
+			BUILD_BUG_ON_ZERO(offsetof(resp, driver_data) !=       \
+					  sizeof(resp))
+
+#define UAPI_DEF_WRITE_UDATA_I(req)                                            \
+	UAPI_DEF_WRITE_I(req),                                                 \
+		.write.has_udata =                                             \
+			1 + BUILD_BUG_ON_ZERO(offsetof(req, driver_data) !=    \
+					      sizeof(req))
+
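Editor's sketch (not part of the patch): the struct names below are invented purely to illustrate the layout that the BUILD_BUG_ON_ZERO() checks above enforce for a command carrying udata, namely a 64-bit 'response' field first in the request and a driver_data flex array landing exactly at the end of each base struct.

#include <linux/types.h>

/* Hypothetical request/response pair; neither struct exists in the kernel. */
struct ib_uverbs_example_cmd {
	__aligned_u64 response;		/* must be field 0 and 8 bytes wide */
	__u32 handle;
	__u32 reserved;
	__aligned_u64 driver_data[];	/* offsetof() == sizeof(base struct) */
};

struct ib_uverbs_example_resp {
	__u32 value;
	__u32 reserved;
	__aligned_u64 driver_data[];
};

/*
 * A pair like this could then be declared with
 *	UAPI_DEF_WRITE_UDATA_IO(struct ib_uverbs_example_cmd,
 *				struct ib_uverbs_example_resp)
 * which is what the uverbs_def_write_intf[] table further down does for the
 * real commands.
 */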
+/*
+ * The _EX versions are for use with WRITE_EX and allow the last struct member
+ * to be specified. Buffers that do not include that member will be rejected.
+ */
+#define UAPI_DEF_WRITE_IO_EX(req, req_last_member, resp, resp_last_member)     \
+	.write.has_resp = 1,                                                   \
+	.write.req_size = offsetofend(req, req_last_member),                   \
+	.write.resp_size = offsetofend(resp, resp_last_member)
+
+#define UAPI_DEF_WRITE_I_EX(req, req_last_member)                              \
+	.write.req_size = offsetofend(req, req_last_member)
+
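A similar hedged sketch for the _EX sizing, again with an invented struct: offsetofend() resolves to the offset of the named member plus that member's size, so the resulting req_size is the smallest buffer an extended command will accept, while members added after that point remain optional for older userspace.

/* Hypothetical extended command; only used to show the offsetofend() math. */
struct ib_uverbs_ex_example_cmd {
	__u32 handle;
	__u32 comp_mask;		/* req_last_member in this sketch */
	__aligned_u64 newer_field;	/* added later; old userspace may omit it */
};

/*
 * UAPI_DEF_WRITE_I_EX(struct ib_uverbs_ex_example_cmd, comp_mask) would set
 * .write.req_size to offsetofend(..., comp_mask) == 8, so an 8-byte request
 * is still accepted even though sizeof() the full struct is 16; the common
 * request parsing is expected to zero-fill the tail the user did not supply.
 */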
+const struct uapi_definition uverbs_def_write_intf[] = {
+	DECLARE_UVERBS_OBJECT(
+		UVERBS_OBJECT_AH,
+		DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_AH,
+				     ib_uverbs_create_ah,
+				     UAPI_DEF_WRITE_UDATA_IO(
+					     struct ib_uverbs_create_ah,
+					     struct ib_uverbs_create_ah_resp),
+				     UAPI_DEF_METHOD_NEEDS_FN(create_ah)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_DESTROY_AH,
+			ib_uverbs_destroy_ah,
+			UAPI_DEF_WRITE_I(struct ib_uverbs_destroy_ah),
+			UAPI_DEF_METHOD_NEEDS_FN(destroy_ah))),
+
+	DECLARE_UVERBS_OBJECT(
+		UVERBS_OBJECT_COMP_CHANNEL,
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL,
+			ib_uverbs_create_comp_channel,
+			UAPI_DEF_WRITE_IO(
+				struct ib_uverbs_create_comp_channel,
+				struct ib_uverbs_create_comp_channel_resp))),
+
+	DECLARE_UVERBS_OBJECT(
+		UVERBS_OBJECT_CQ,
+		DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_CQ,
+				     ib_uverbs_create_cq,
+				     UAPI_DEF_WRITE_UDATA_IO(
+					     struct ib_uverbs_create_cq,
+					     struct ib_uverbs_create_cq_resp),
+				     UAPI_DEF_METHOD_NEEDS_FN(create_cq)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_DESTROY_CQ,
+			ib_uverbs_destroy_cq,
+			UAPI_DEF_WRITE_IO(struct ib_uverbs_destroy_cq,
+					  struct ib_uverbs_destroy_cq_resp),
+			UAPI_DEF_METHOD_NEEDS_FN(destroy_cq)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_POLL_CQ,
+			ib_uverbs_poll_cq,
+			UAPI_DEF_WRITE_IO(struct ib_uverbs_poll_cq,
+					  struct ib_uverbs_poll_cq_resp),
+			UAPI_DEF_METHOD_NEEDS_FN(poll_cq)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_REQ_NOTIFY_CQ,
+			ib_uverbs_req_notify_cq,
+			UAPI_DEF_WRITE_I(struct ib_uverbs_req_notify_cq),
+			UAPI_DEF_METHOD_NEEDS_FN(req_notify_cq)),
+		DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_RESIZE_CQ,
+				     ib_uverbs_resize_cq,
+				     UAPI_DEF_WRITE_UDATA_IO(
+					     struct ib_uverbs_resize_cq,
+					     struct ib_uverbs_resize_cq_resp),
+				     UAPI_DEF_METHOD_NEEDS_FN(resize_cq)),
+		DECLARE_UVERBS_WRITE_EX(
+			IB_USER_VERBS_EX_CMD_CREATE_CQ,
+			ib_uverbs_ex_create_cq,
+			UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_create_cq,
+					     reserved,
+					     struct ib_uverbs_ex_create_cq_resp,
+					     response_length),
+			UAPI_DEF_METHOD_NEEDS_FN(create_cq)),
+		DECLARE_UVERBS_WRITE_EX(
+			IB_USER_VERBS_EX_CMD_MODIFY_CQ,
+			ib_uverbs_ex_modify_cq,
+			UAPI_DEF_WRITE_I(struct ib_uverbs_ex_modify_cq),
+			UAPI_DEF_METHOD_NEEDS_FN(create_cq))),
+
+	DECLARE_UVERBS_OBJECT(
+		UVERBS_OBJECT_DEVICE,
+		DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_GET_CONTEXT,
+				     ib_uverbs_get_context,
+				     UAPI_DEF_WRITE_UDATA_IO(
+					     struct ib_uverbs_get_context,
+					     struct ib_uverbs_get_context_resp)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_QUERY_DEVICE,
+			ib_uverbs_query_device,
+			UAPI_DEF_WRITE_IO(struct ib_uverbs_query_device,
+					  struct ib_uverbs_query_device_resp)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_QUERY_PORT,
+			ib_uverbs_query_port,
+			UAPI_DEF_WRITE_IO(struct ib_uverbs_query_port,
+					  struct ib_uverbs_query_port_resp),
+			UAPI_DEF_METHOD_NEEDS_FN(query_port)),
+		DECLARE_UVERBS_WRITE_EX(
+			IB_USER_VERBS_EX_CMD_QUERY_DEVICE,
+			ib_uverbs_ex_query_device,
+			UAPI_DEF_WRITE_IO_EX(
+				struct ib_uverbs_ex_query_device,
+				reserved,
+				struct ib_uverbs_ex_query_device_resp,
+				response_length),
+			UAPI_DEF_METHOD_NEEDS_FN(query_device)),
+		UAPI_DEF_OBJ_NEEDS_FN(alloc_ucontext),
+		UAPI_DEF_OBJ_NEEDS_FN(dealloc_ucontext)),
+
+	DECLARE_UVERBS_OBJECT(
+		UVERBS_OBJECT_FLOW,
+		DECLARE_UVERBS_WRITE_EX(
+			IB_USER_VERBS_EX_CMD_CREATE_FLOW,
+			ib_uverbs_ex_create_flow,
+			UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_create_flow,
+					     flow_attr,
+					     struct ib_uverbs_create_flow_resp,
+					     flow_handle),
+			UAPI_DEF_METHOD_NEEDS_FN(create_flow)),
+		DECLARE_UVERBS_WRITE_EX(
+			IB_USER_VERBS_EX_CMD_DESTROY_FLOW,
+			ib_uverbs_ex_destroy_flow,
+			UAPI_DEF_WRITE_I(struct ib_uverbs_destroy_flow),
+			UAPI_DEF_METHOD_NEEDS_FN(destroy_flow))),
+
+	DECLARE_UVERBS_OBJECT(
+		UVERBS_OBJECT_MR,
+		DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_DEREG_MR,
+				     ib_uverbs_dereg_mr,
+				     UAPI_DEF_WRITE_I(struct ib_uverbs_dereg_mr),
+				     UAPI_DEF_METHOD_NEEDS_FN(dereg_mr)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_REG_MR,
+			ib_uverbs_reg_mr,
+			UAPI_DEF_WRITE_UDATA_IO(struct ib_uverbs_reg_mr,
+						struct ib_uverbs_reg_mr_resp),
+			UAPI_DEF_METHOD_NEEDS_FN(reg_user_mr)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_REREG_MR,
+			ib_uverbs_rereg_mr,
+			UAPI_DEF_WRITE_UDATA_IO(struct ib_uverbs_rereg_mr,
+						struct ib_uverbs_rereg_mr_resp),
+			UAPI_DEF_METHOD_NEEDS_FN(rereg_user_mr))),
+
+	DECLARE_UVERBS_OBJECT(
+		UVERBS_OBJECT_MW,
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_ALLOC_MW,
+			ib_uverbs_alloc_mw,
+			UAPI_DEF_WRITE_UDATA_IO(struct ib_uverbs_alloc_mw,
+						struct ib_uverbs_alloc_mw_resp),
+			UAPI_DEF_METHOD_NEEDS_FN(alloc_mw)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_DEALLOC_MW,
+			ib_uverbs_dealloc_mw,
+			UAPI_DEF_WRITE_I(struct ib_uverbs_dealloc_mw),
+			UAPI_DEF_METHOD_NEEDS_FN(dealloc_mw))),
+
+	DECLARE_UVERBS_OBJECT(
+		UVERBS_OBJECT_PD,
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_ALLOC_PD,
+			ib_uverbs_alloc_pd,
+			UAPI_DEF_WRITE_UDATA_IO(struct ib_uverbs_alloc_pd,
+						struct ib_uverbs_alloc_pd_resp),
+			UAPI_DEF_METHOD_NEEDS_FN(alloc_pd)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_DEALLOC_PD,
+			ib_uverbs_dealloc_pd,
+			UAPI_DEF_WRITE_I(struct ib_uverbs_dealloc_pd),
+			UAPI_DEF_METHOD_NEEDS_FN(dealloc_pd))),
+
+	DECLARE_UVERBS_OBJECT(
+		UVERBS_OBJECT_QP,
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_ATTACH_MCAST,
+			ib_uverbs_attach_mcast,
+			UAPI_DEF_WRITE_I(struct ib_uverbs_attach_mcast),
+			UAPI_DEF_METHOD_NEEDS_FN(attach_mcast),
+			UAPI_DEF_METHOD_NEEDS_FN(detach_mcast)),
+		DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_QP,
+				     ib_uverbs_create_qp,
+				     UAPI_DEF_WRITE_UDATA_IO(
+					     struct ib_uverbs_create_qp,
+					     struct ib_uverbs_create_qp_resp),
+				     UAPI_DEF_METHOD_NEEDS_FN(create_qp)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_DESTROY_QP,
+			ib_uverbs_destroy_qp,
+			UAPI_DEF_WRITE_IO(struct ib_uverbs_destroy_qp,
+					  struct ib_uverbs_destroy_qp_resp),
+			UAPI_DEF_METHOD_NEEDS_FN(destroy_qp)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_DETACH_MCAST,
+			ib_uverbs_detach_mcast,
+			UAPI_DEF_WRITE_I(struct ib_uverbs_detach_mcast),
+			UAPI_DEF_METHOD_NEEDS_FN(detach_mcast)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_MODIFY_QP,
+			ib_uverbs_modify_qp,
+			UAPI_DEF_WRITE_I(struct ib_uverbs_modify_qp),
+			UAPI_DEF_METHOD_NEEDS_FN(modify_qp)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_POST_RECV,
+			ib_uverbs_post_recv,
+			UAPI_DEF_WRITE_IO(struct ib_uverbs_post_recv,
+					  struct ib_uverbs_post_recv_resp),
+			UAPI_DEF_METHOD_NEEDS_FN(post_recv)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_POST_SEND,
+			ib_uverbs_post_send,
+			UAPI_DEF_WRITE_IO(struct ib_uverbs_post_send,
+					  struct ib_uverbs_post_send_resp),
+			UAPI_DEF_METHOD_NEEDS_FN(post_send)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_QUERY_QP,
+			ib_uverbs_query_qp,
+			UAPI_DEF_WRITE_IO(struct ib_uverbs_query_qp,
+					  struct ib_uverbs_query_qp_resp),
+			UAPI_DEF_METHOD_NEEDS_FN(query_qp)),
+		DECLARE_UVERBS_WRITE_EX(
+			IB_USER_VERBS_EX_CMD_CREATE_QP,
+			ib_uverbs_ex_create_qp,
+			UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_create_qp,
+					     comp_mask,
+					     struct ib_uverbs_ex_create_qp_resp,
+					     response_length),
+			UAPI_DEF_METHOD_NEEDS_FN(create_qp)),
+		DECLARE_UVERBS_WRITE_EX(
+			IB_USER_VERBS_EX_CMD_MODIFY_QP,
+			ib_uverbs_ex_modify_qp,
+			UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_modify_qp,
+					     base,
+					     struct ib_uverbs_ex_modify_qp_resp,
+					     response_length),
+			UAPI_DEF_METHOD_NEEDS_FN(modify_qp))),
+
+	DECLARE_UVERBS_OBJECT(
+		UVERBS_OBJECT_RWQ_IND_TBL,
+		DECLARE_UVERBS_WRITE_EX(
+			IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL,
+			ib_uverbs_ex_create_rwq_ind_table,
+			UAPI_DEF_WRITE_IO_EX(
+				struct ib_uverbs_ex_create_rwq_ind_table,
+				log_ind_tbl_size,
+				struct ib_uverbs_ex_create_rwq_ind_table_resp,
+				ind_tbl_num),
+			UAPI_DEF_METHOD_NEEDS_FN(create_rwq_ind_table)),
+		DECLARE_UVERBS_WRITE_EX(
+			IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL,
+			ib_uverbs_ex_destroy_rwq_ind_table,
+			UAPI_DEF_WRITE_I(
+				struct ib_uverbs_ex_destroy_rwq_ind_table),
+			UAPI_DEF_METHOD_NEEDS_FN(destroy_rwq_ind_table))),
+
+	DECLARE_UVERBS_OBJECT(
+		UVERBS_OBJECT_WQ,
+		DECLARE_UVERBS_WRITE_EX(
+			IB_USER_VERBS_EX_CMD_CREATE_WQ,
+			ib_uverbs_ex_create_wq,
+			UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_create_wq,
+					     max_sge,
+					     struct ib_uverbs_ex_create_wq_resp,
+					     wqn),
+			UAPI_DEF_METHOD_NEEDS_FN(create_wq)),
+		DECLARE_UVERBS_WRITE_EX(
+			IB_USER_VERBS_EX_CMD_DESTROY_WQ,
+			ib_uverbs_ex_destroy_wq,
+			UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_destroy_wq,
+					     wq_handle,
+					     struct ib_uverbs_ex_destroy_wq_resp,
+					     reserved),
+			UAPI_DEF_METHOD_NEEDS_FN(destroy_wq)),
+		DECLARE_UVERBS_WRITE_EX(
+			IB_USER_VERBS_EX_CMD_MODIFY_WQ,
+			ib_uverbs_ex_modify_wq,
+			UAPI_DEF_WRITE_I_EX(struct ib_uverbs_ex_modify_wq,
+					    curr_wq_state),
+			UAPI_DEF_METHOD_NEEDS_FN(modify_wq))),
+
+	DECLARE_UVERBS_OBJECT(
+		UVERBS_OBJECT_SRQ,
+		DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_SRQ,
+				     ib_uverbs_create_srq,
+				     UAPI_DEF_WRITE_UDATA_IO(
+					     struct ib_uverbs_create_srq,
+					     struct ib_uverbs_create_srq_resp),
+				     UAPI_DEF_METHOD_NEEDS_FN(create_srq)),
+		DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_XSRQ,
+				     ib_uverbs_create_xsrq,
+				     UAPI_DEF_WRITE_UDATA_IO(
+					     struct ib_uverbs_create_xsrq,
+					     struct ib_uverbs_create_srq_resp),
+				     UAPI_DEF_METHOD_NEEDS_FN(create_srq)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_DESTROY_SRQ,
+			ib_uverbs_destroy_srq,
+			UAPI_DEF_WRITE_IO(struct ib_uverbs_destroy_srq,
+					  struct ib_uverbs_destroy_srq_resp),
+			UAPI_DEF_METHOD_NEEDS_FN(destroy_srq)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_MODIFY_SRQ,
+			ib_uverbs_modify_srq,
+			UAPI_DEF_WRITE_UDATA_I(struct ib_uverbs_modify_srq),
+			UAPI_DEF_METHOD_NEEDS_FN(modify_srq)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_POST_SRQ_RECV,
+			ib_uverbs_post_srq_recv,
+			UAPI_DEF_WRITE_IO(struct ib_uverbs_post_srq_recv,
+					  struct ib_uverbs_post_srq_recv_resp),
+			UAPI_DEF_METHOD_NEEDS_FN(post_srq_recv)),
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_QUERY_SRQ,
+			ib_uverbs_query_srq,
+			UAPI_DEF_WRITE_IO(struct ib_uverbs_query_srq,
+					  struct ib_uverbs_query_srq_resp),
+			UAPI_DEF_METHOD_NEEDS_FN(query_srq))),
+
+	DECLARE_UVERBS_OBJECT(
+		UVERBS_OBJECT_XRCD,
+		DECLARE_UVERBS_WRITE(
+			IB_USER_VERBS_CMD_CLOSE_XRCD,
+			ib_uverbs_close_xrcd,
+			UAPI_DEF_WRITE_I(struct ib_uverbs_close_xrcd),
+			UAPI_DEF_METHOD_NEEDS_FN(dealloc_xrcd)),
+		DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_OPEN_QP,
+				     ib_uverbs_open_qp,
+				     UAPI_DEF_WRITE_UDATA_IO(
+					     struct ib_uverbs_open_qp,
+					     struct ib_uverbs_create_qp_resp)),
+		DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_OPEN_XRCD,
+				     ib_uverbs_open_xrcd,
+				     UAPI_DEF_WRITE_UDATA_IO(
+					     struct ib_uverbs_open_xrcd,
+					     struct ib_uverbs_open_xrcd_resp),
+				     UAPI_DEF_METHOD_NEEDS_FN(alloc_xrcd))),
+
+	{},
+};
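The table above binds each legacy write/write_ex command to its handler and
declares the request/response wire layout, letting the core size-check and
dispatch commands without per-command boilerplate. As a minimal sketch of the
pattern (every FOO name below is invented for illustration and is not part of
this patch), one more object entry would look like:

	DECLARE_UVERBS_OBJECT(
		UVERBS_OBJECT_FOO,			/* hypothetical object id */
		DECLARE_UVERBS_WRITE(
			IB_USER_VERBS_CMD_CREATE_FOO,	/* hypothetical command */
			ib_uverbs_create_foo,		/* hypothetical handler */
			UAPI_DEF_WRITE_UDATA_IO(struct ib_uverbs_create_foo,
						struct ib_uverbs_create_foo_resp),
			UAPI_DEF_METHOD_NEEDS_FN(create_foo))),

UAPI_DEF_METHOD_NEEDS_FN() drops the method from the parse tree when the
driver does not supply the named op, so unsupported commands are rejected at
dispatch instead of reaching a NULL driver function.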
diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c
index 1a6b229..6175820 100644
--- a/drivers/infiniband/core/uverbs_ioctl.c
+++ b/drivers/infiniband/core/uverbs_ioctl.c
@@ -57,6 +57,7 @@
 	struct ib_uverbs_attr *uattrs;
 
 	DECLARE_BITMAP(uobj_finalize, UVERBS_API_ATTR_BKEY_LEN);
+	DECLARE_BITMAP(spec_finalize, UVERBS_API_ATTR_BKEY_LEN);
 
 	/*
 	 * Must be last. bundle ends in a flex array which overlaps
@@ -126,7 +127,7 @@
 	res = (void *)pbundle->internal_buffer + pbundle->internal_used;
 	pbundle->internal_used =
 		ALIGN(new_used, sizeof(*pbundle->internal_buffer));
-	if (flags & __GFP_ZERO)
+	if (want_init_on_alloc(flags))
 		memset(res, 0, size);
 	return res;
 }
@@ -143,6 +144,102 @@
 			   0, uattr->len - len);
 }
 
+static int uverbs_set_output(const struct uverbs_attr_bundle *bundle,
+			     const struct uverbs_attr *attr)
+{
+	struct bundle_priv *pbundle =
+		container_of(bundle, struct bundle_priv, bundle);
+	u16 flags;
+
+	flags = pbundle->uattrs[attr->ptr_attr.uattr_idx].flags |
+		UVERBS_ATTR_F_VALID_OUTPUT;
+	if (put_user(flags,
+		     &pbundle->user_attrs[attr->ptr_attr.uattr_idx].flags))
+		return -EFAULT;
+	return 0;
+}
+
+static int uverbs_process_idrs_array(struct bundle_priv *pbundle,
+				     const struct uverbs_api_attr *attr_uapi,
+				     struct uverbs_objs_arr_attr *attr,
+				     struct ib_uverbs_attr *uattr,
+				     u32 attr_bkey)
+{
+	const struct uverbs_attr_spec *spec = &attr_uapi->spec;
+	size_t array_len;
+	u32 *idr_vals;
+	int ret = 0;
+	size_t i;
+
+	if (uattr->attr_data.reserved)
+		return -EINVAL;
+
+	if (uattr->len % sizeof(u32))
+		return -EINVAL;
+
+	array_len = uattr->len / sizeof(u32);
+	if (array_len < spec->u2.objs_arr.min_len ||
+	    array_len > spec->u2.objs_arr.max_len)
+		return -EINVAL;
+
+	attr->uobjects =
+		uverbs_alloc(&pbundle->bundle,
+			     array_size(array_len, sizeof(*attr->uobjects)));
+	if (IS_ERR(attr->uobjects))
+		return PTR_ERR(attr->uobjects);
+
+	/*
+	 * Since an idr is 4B and *uobjects is >= 4B, we can use attr->uobjects
+	 * to store the idr array and avoid an additional memory allocation.
+	 * The idr array is offset to the end of the uobjects array, so we can
+	 * read each idr before overwriting its slot with a pointer.
+	 */
+	idr_vals = (u32 *)(attr->uobjects + array_len) - array_len;
+
+	if (uattr->len > sizeof(uattr->data)) {
+		ret = copy_from_user(idr_vals, u64_to_user_ptr(uattr->data),
+				     uattr->len);
+		if (ret)
+			return -EFAULT;
+	} else {
+		memcpy(idr_vals, &uattr->data, uattr->len);
+	}
+
+	for (i = 0; i != array_len; i++) {
+		attr->uobjects[i] = uverbs_get_uobject_from_file(
+			spec->u2.objs_arr.obj_type, spec->u2.objs_arr.access,
+			idr_vals[i], &pbundle->bundle);
+		if (IS_ERR(attr->uobjects[i])) {
+			ret = PTR_ERR(attr->uobjects[i]);
+			break;
+		}
+	}
+
+	attr->len = i;
+	__set_bit(attr_bkey, pbundle->spec_finalize);
+	return ret;
+}
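
The overlap trick described in the comment deserves spelling out: with 4-byte
ids staged in the tail of the pointer array, a front-to-back walk always reads
an id before the pointer store that would clobber it. A self-contained
userspace illustration of the same arithmetic (illustrative only, assuming
8-byte pointers; this is not kernel code):

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		void *uobjects[4];	/* stands in for attr->uobjects */
		size_t array_len = sizeof(uobjects) / sizeof(uobjects[0]);
		/* the ids occupy the last array_len * 4 bytes of the array */
		uint32_t *idr_vals = (uint32_t *)(uobjects + array_len) - array_len;
		size_t i;

		for (i = 0; i != array_len; i++)
			idr_vals[i] = 100 + (uint32_t)i; /* pretend user ids */

		for (i = 0; i != array_len; i++) {
			uint32_t id = idr_vals[i];	/* read the id first */
			/* slots j > i survive: 8 * (i + 1) never exceeds
			 * 4 * array_len + 4 * (i + 1) while i < array_len */
			uobjects[i] = (void *)(uintptr_t)id; /* fake lookup */
		}

		for (i = 0; i != array_len; i++)
			printf("slot %zu -> %p\n", i, uobjects[i]);
		return 0;
	}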
+
+static int uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi,
+				  struct uverbs_objs_arr_attr *attr,
+				  bool commit, struct uverbs_attr_bundle *attrs)
+{
+	const struct uverbs_attr_spec *spec = &attr_uapi->spec;
+	int current_ret;
+	int ret = 0;
+	size_t i;
+
+	for (i = 0; i != attr->len; i++) {
+		current_ret = uverbs_finalize_object(attr->uobjects[i],
+						     spec->u2.objs_arr.access,
+						     commit, attrs);
+		if (!ret)
+			ret = current_ret;
+	}
+
+	return ret;
+}
+
 static int uverbs_process_attr(struct bundle_priv *pbundle,
 			       const struct uverbs_api_attr *attr_uapi,
 			       struct ib_uverbs_attr *uattr, u32 attr_bkey)
@@ -228,10 +325,8 @@
 		 * IDR implementation today rejects negative IDs
 		 */
 		o_attr->uobject = uverbs_get_uobject_from_file(
-					spec->u.obj.obj_type,
-					pbundle->bundle.ufile,
-					spec->u.obj.access,
-					uattr->data_s64);
+			spec->u.obj.obj_type, spec->u.obj.access,
+			uattr->data_s64, &pbundle->bundle);
 		if (IS_ERR(o_attr->uobject))
 			return PTR_ERR(o_attr->uobject);
 		__set_bit(attr_bkey, pbundle->uobj_finalize);
@@ -246,6 +341,11 @@
 		}
 
 		break;
+
+	case UVERBS_ATTR_TYPE_IDRS_ARRAY:
+		return uverbs_process_idrs_array(pbundle, attr_uapi,
+						 &e->objs_arr_attr, uattr,
+						 attr_bkey);
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -300,8 +400,7 @@
 			return -EPROTONOSUPPORT;
 		return 0;
 	}
-	attr = srcu_dereference(
-		*slot, &pbundle->bundle.ufile->device->disassociate_srcu);
+	attr = rcu_dereference_protected(*slot, true);
 
 	/* Reject duplicate attributes from user-space */
 	if (test_bit(attr_bkey, pbundle->bundle.attr_present))
@@ -319,8 +418,7 @@
 static int ib_uverbs_run_method(struct bundle_priv *pbundle,
 				unsigned int num_attrs)
 {
-	int (*handler)(struct ib_uverbs_file *ufile,
-		       struct uverbs_attr_bundle *ctx);
+	int (*handler)(struct uverbs_attr_bundle *attrs);
 	size_t uattrs_size = array_size(sizeof(*pbundle->uattrs), num_attrs);
 	unsigned int destroy_bkey = pbundle->method_elm->destroy_bkey;
 	unsigned int i;
@@ -351,19 +449,39 @@
 				    pbundle->method_elm->key_bitmap_len)))
 		return -EINVAL;
 
+	if (pbundle->method_elm->has_udata)
+		uverbs_fill_udata(&pbundle->bundle,
+				  &pbundle->bundle.driver_udata,
+				  UVERBS_ATTR_UHW_IN, UVERBS_ATTR_UHW_OUT);
+	else
+		pbundle->bundle.driver_udata = (struct ib_udata){};
+
 	if (destroy_bkey != UVERBS_API_ATTR_BKEY_LEN) {
 		struct uverbs_obj_attr *destroy_attr =
 			&pbundle->bundle.attrs[destroy_bkey].obj_attr;
 
-		ret = uobj_destroy(destroy_attr->uobject);
+		ret = uobj_destroy(destroy_attr->uobject, &pbundle->bundle);
 		if (ret)
 			return ret;
 		__clear_bit(destroy_bkey, pbundle->uobj_finalize);
 
-		ret = handler(pbundle->bundle.ufile, &pbundle->bundle);
+		ret = handler(&pbundle->bundle);
 		uobj_put_destroy(destroy_attr->uobject);
 	} else {
-		ret = handler(pbundle->bundle.ufile, &pbundle->bundle);
+		ret = handler(&pbundle->bundle);
+	}
+
+	/*
+	 * Until the drivers are revised to use the bundle directly we have to
+	 * assume that the driver wrote to its UHW_OUT and flag userspace
+	 * appropriately.
+	 */
+	if (!ret && pbundle->method_elm->has_udata) {
+		const struct uverbs_attr *attr =
+			uverbs_attr_get(&pbundle->bundle, UVERBS_ATTR_UHW_OUT);
+
+		if (!IS_ERR(attr))
+			ret = uverbs_set_output(&pbundle->bundle, attr);
 	}
 
 	/*
@@ -384,6 +502,7 @@
 	unsigned int i;
 	int ret = 0;
 
+	/* fast path for simple uobjects */
 	i = -1;
 	while ((i = find_next_bit(pbundle->uobj_finalize, key_bitmap_len,
 				  i + 1)) < key_bitmap_len) {
@@ -392,11 +511,37 @@
 
 		current_ret = uverbs_finalize_object(
 			attr->obj_attr.uobject,
-			attr->obj_attr.attr_elm->spec.u.obj.access, commit);
+			attr->obj_attr.attr_elm->spec.u.obj.access, commit,
+			&pbundle->bundle);
 		if (!ret)
 			ret = current_ret;
 	}
 
+	i = -1;
+	while ((i = find_next_bit(pbundle->spec_finalize, key_bitmap_len,
+				  i + 1)) < key_bitmap_len) {
+		struct uverbs_attr *attr = &pbundle->bundle.attrs[i];
+		const struct uverbs_api_attr *attr_uapi;
+		void __rcu **slot;
+		int current_ret;
+
+		slot = uapi_get_attr_for_method(
+			pbundle,
+			pbundle->method_key | uapi_bkey_to_key_attr(i));
+		if (WARN_ON(!slot))
+			continue;
+
+		attr_uapi = rcu_dereference_protected(*slot, true);
+
+		if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) {
+			current_ret = uverbs_free_idrs_array(
+				attr_uapi, &attr->objs_arr_attr, commit,
+				&pbundle->bundle);
+			if (!ret)
+				ret = current_ret;
+		}
+	}
+
 	for (memblock = pbundle->allocated_mem; memblock;) {
 		struct bundle_alloc_head *tmp = memblock;
 
@@ -429,7 +574,7 @@
 			uapi_key_ioctl_method(hdr->method_id));
 	if (unlikely(!slot))
 		return -EPROTONOSUPPORT;
-	method_elm = srcu_dereference(*slot, &ufile->device->disassociate_srcu);
+	method_elm = rcu_dereference_protected(*slot, true);
 
 	if (!method_elm->use_stack) {
 		pbundle = kmalloc(method_elm->bundle_size, GFP_KERNEL);
@@ -450,6 +595,7 @@
 	pbundle->method_elm = method_elm;
 	pbundle->method_key = attrs_iter.index;
 	pbundle->bundle.ufile = ufile;
+	pbundle->bundle.context = NULL; /* only valid if bundle has uobject */
 	pbundle->radix = &uapi->radix;
 	pbundle->radix_slots = slot;
 	pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter);
@@ -461,6 +607,7 @@
 	memset(pbundle->bundle.attr_present, 0,
 	       sizeof(pbundle->bundle.attr_present));
 	memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize));
+	memset(pbundle->spec_finalize, 0, sizeof(pbundle->spec_finalize));
 
 	ret = ib_uverbs_run_method(pbundle, hdr->num_attrs);
 	destroy_ret = bundle_destroy(pbundle, ret == 0);
@@ -551,35 +698,37 @@
 EXPORT_SYMBOL(uverbs_get_flags32);
 
 /*
- * This is for ease of conversion. The purpose is to convert all drivers to
- * use uverbs_attr_bundle instead of ib_udata.  Assume attr == 0 is input and
- * attr == 1 is output.
+ * Fill an ib_udata struct (core or uhw) using the given attribute IDs.
+ * This is primarily used to convert the UVERBS_ATTR_UHW() into the
+ * ib_udata format used by the drivers.
  */
-void create_udata(struct uverbs_attr_bundle *bundle, struct ib_udata *udata)
+void uverbs_fill_udata(struct uverbs_attr_bundle *bundle,
+		       struct ib_udata *udata, unsigned int attr_in,
+		       unsigned int attr_out)
 {
 	struct bundle_priv *pbundle =
 		container_of(bundle, struct bundle_priv, bundle);
-	const struct uverbs_attr *uhw_in =
-		uverbs_attr_get(bundle, UVERBS_ATTR_UHW_IN);
-	const struct uverbs_attr *uhw_out =
-		uverbs_attr_get(bundle, UVERBS_ATTR_UHW_OUT);
+	const struct uverbs_attr *in =
+		uverbs_attr_get(&pbundle->bundle, attr_in);
+	const struct uverbs_attr *out =
+		uverbs_attr_get(&pbundle->bundle, attr_out);
 
-	if (!IS_ERR(uhw_in)) {
-		udata->inlen = uhw_in->ptr_attr.len;
-		if (uverbs_attr_ptr_is_inline(uhw_in))
+	if (!IS_ERR(in)) {
+		udata->inlen = in->ptr_attr.len;
+		if (uverbs_attr_ptr_is_inline(in))
 			udata->inbuf =
-				&pbundle->user_attrs[uhw_in->ptr_attr.uattr_idx]
+				&pbundle->user_attrs[in->ptr_attr.uattr_idx]
 					 .data;
 		else
-			udata->inbuf = u64_to_user_ptr(uhw_in->ptr_attr.data);
+			udata->inbuf = u64_to_user_ptr(in->ptr_attr.data);
 	} else {
 		udata->inbuf = NULL;
 		udata->inlen = 0;
 	}
 
-	if (!IS_ERR(uhw_out)) {
-		udata->outbuf = u64_to_user_ptr(uhw_out->ptr_attr.data);
-		udata->outlen = uhw_out->ptr_attr.len;
+	if (!IS_ERR(out)) {
+		udata->outbuf = u64_to_user_ptr(out->ptr_attr.data);
+		udata->outlen = out->ptr_attr.len;
 	} else {
 		udata->outbuf = NULL;
 		udata->outlen = 0;
@@ -589,10 +738,7 @@
 int uverbs_copy_to(const struct uverbs_attr_bundle *bundle, size_t idx,
 		   const void *from, size_t size)
 {
-	struct bundle_priv *pbundle =
-		container_of(bundle, struct bundle_priv, bundle);
 	const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx);
-	u16 flags;
 	size_t min_size;
 
 	if (IS_ERR(attr))
@@ -602,12 +748,57 @@
 	if (copy_to_user(u64_to_user_ptr(attr->ptr_attr.data), from, min_size))
 		return -EFAULT;
 
-	flags = pbundle->uattrs[attr->ptr_attr.uattr_idx].flags |
-		UVERBS_ATTR_F_VALID_OUTPUT;
-	if (put_user(flags,
-		     &pbundle->user_attrs[attr->ptr_attr.uattr_idx].flags))
-		return -EFAULT;
+	return uverbs_set_output(bundle, attr);
+}
+EXPORT_SYMBOL(uverbs_copy_to);
+
+/*
+ * This is only used if the caller has directly used copy_to_user() to write
+ * the data.  It signals to user space that the buffer is filled in.
+ */
+int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx)
+{
+	const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx);
+
+	if (IS_ERR(attr))
+		return PTR_ERR(attr);
+
+	return uverbs_set_output(bundle, attr);
+}
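
A hedged usage sketch for uverbs_output_written() (the FOO method, attribute
id, response struct, and helper are all invented): a handler that fills its
output attribute with copy_to_user() directly must still mark it written, or
userspace never sees UVERBS_ATTR_F_VALID_OUTPUT:

	static int UVERBS_HANDLER(UVERBS_METHOD_FOO_QUERY)(
		struct uverbs_attr_bundle *attrs)
	{
		const struct uverbs_attr *attr =
			uverbs_attr_get(attrs, UVERBS_ATTR_FOO_QUERY_RESP);
		struct foo_query_resp resp = {};	/* hypothetical */

		if (IS_ERR(attr))
			return PTR_ERR(attr);

		fill_foo_resp(&resp);			/* hypothetical */
		if (copy_to_user(u64_to_user_ptr(attr->ptr_attr.data), &resp,
				 min_t(size_t, attr->ptr_attr.len, sizeof(resp))))
			return -EFAULT;

		return uverbs_output_written(attrs, UVERBS_ATTR_FOO_QUERY_RESP);
	}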
+
+int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle,
+		      size_t idx, s64 lower_bound, u64 upper_bound,
+		      s64  *def_val)
+{
+	const struct uverbs_attr *attr;
+
+	attr = uverbs_attr_get(attrs_bundle, idx);
+	if (IS_ERR(attr)) {
+		if ((PTR_ERR(attr) != -ENOENT) || !def_val)
+			return PTR_ERR(attr);
+
+		*to = *def_val;
+	} else {
+		*to = attr->ptr_attr.data;
+	}
+
+	if (*to < lower_bound || (*to > 0 && (u64)*to > upper_bound))
+		return -EINVAL;
 
 	return 0;
 }
-EXPORT_SYMBOL(uverbs_copy_to);
+EXPORT_SYMBOL(_uverbs_get_const);
+
+int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle,
+				  size_t idx, const void *from, size_t size)
+{
+	const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx);
+
+	if (size < attr->ptr_attr.len) {
+		if (clear_user(u64_to_user_ptr(attr->ptr_attr.data) + size,
+			       attr->ptr_attr.len - size))
+			return -EFAULT;
+	}
+	return uverbs_copy_to(bundle, idx, from, size);
+}
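
A worked example of the zero-fill contract above (sizes hypothetical): if the
kernel response struct is 24 bytes but userspace supplied a 32-byte attribute
buffer, the call is effectively

	clear_user(u64_to_user_ptr(attr->ptr_attr.data) + 24, 32 - 24);
	uverbs_copy_to(bundle, idx, from, 24);

so a newer userspace probing for extended fields reads zeros rather than
whatever happened to be in its buffer. Note that uverbs_attr_get() is not
error-checked here; the attribute is expected to be present, which holds for
the mandatory attributes this helper is used with.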
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 50152c1..db98111 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -45,11 +45,13 @@
 #include <linux/cdev.h>
 #include <linux/anon_inodes.h>
 #include <linux/slab.h>
+#include <linux/sched/mm.h>
 
 #include <linux/uaccess.h>
 
 #include <rdma/ib.h>
 #include <rdma/uverbs_std_types.h>
+#include <rdma/rdma_netlink.h>
 
 #include "uverbs.h"
 #include "core_priv.h"
@@ -72,65 +74,7 @@
 static dev_t dynamic_uverbs_dev;
 static struct class *uverbs_class;
 
-static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
-
-static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
-				     const char __user *buf, int in_len,
-				     int out_len) = {
-	[IB_USER_VERBS_CMD_GET_CONTEXT]		= ib_uverbs_get_context,
-	[IB_USER_VERBS_CMD_QUERY_DEVICE]	= ib_uverbs_query_device,
-	[IB_USER_VERBS_CMD_QUERY_PORT]		= ib_uverbs_query_port,
-	[IB_USER_VERBS_CMD_ALLOC_PD]		= ib_uverbs_alloc_pd,
-	[IB_USER_VERBS_CMD_DEALLOC_PD]		= ib_uverbs_dealloc_pd,
-	[IB_USER_VERBS_CMD_REG_MR]		= ib_uverbs_reg_mr,
-	[IB_USER_VERBS_CMD_REREG_MR]		= ib_uverbs_rereg_mr,
-	[IB_USER_VERBS_CMD_DEREG_MR]		= ib_uverbs_dereg_mr,
-	[IB_USER_VERBS_CMD_ALLOC_MW]		= ib_uverbs_alloc_mw,
-	[IB_USER_VERBS_CMD_DEALLOC_MW]		= ib_uverbs_dealloc_mw,
-	[IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel,
-	[IB_USER_VERBS_CMD_CREATE_CQ]		= ib_uverbs_create_cq,
-	[IB_USER_VERBS_CMD_RESIZE_CQ]		= ib_uverbs_resize_cq,
-	[IB_USER_VERBS_CMD_POLL_CQ]		= ib_uverbs_poll_cq,
-	[IB_USER_VERBS_CMD_REQ_NOTIFY_CQ]	= ib_uverbs_req_notify_cq,
-	[IB_USER_VERBS_CMD_DESTROY_CQ]		= ib_uverbs_destroy_cq,
-	[IB_USER_VERBS_CMD_CREATE_QP]		= ib_uverbs_create_qp,
-	[IB_USER_VERBS_CMD_QUERY_QP]		= ib_uverbs_query_qp,
-	[IB_USER_VERBS_CMD_MODIFY_QP]		= ib_uverbs_modify_qp,
-	[IB_USER_VERBS_CMD_DESTROY_QP]		= ib_uverbs_destroy_qp,
-	[IB_USER_VERBS_CMD_POST_SEND]		= ib_uverbs_post_send,
-	[IB_USER_VERBS_CMD_POST_RECV]		= ib_uverbs_post_recv,
-	[IB_USER_VERBS_CMD_POST_SRQ_RECV]	= ib_uverbs_post_srq_recv,
-	[IB_USER_VERBS_CMD_CREATE_AH]		= ib_uverbs_create_ah,
-	[IB_USER_VERBS_CMD_DESTROY_AH]		= ib_uverbs_destroy_ah,
-	[IB_USER_VERBS_CMD_ATTACH_MCAST]	= ib_uverbs_attach_mcast,
-	[IB_USER_VERBS_CMD_DETACH_MCAST]	= ib_uverbs_detach_mcast,
-	[IB_USER_VERBS_CMD_CREATE_SRQ]		= ib_uverbs_create_srq,
-	[IB_USER_VERBS_CMD_MODIFY_SRQ]		= ib_uverbs_modify_srq,
-	[IB_USER_VERBS_CMD_QUERY_SRQ]		= ib_uverbs_query_srq,
-	[IB_USER_VERBS_CMD_DESTROY_SRQ]		= ib_uverbs_destroy_srq,
-	[IB_USER_VERBS_CMD_OPEN_XRCD]		= ib_uverbs_open_xrcd,
-	[IB_USER_VERBS_CMD_CLOSE_XRCD]		= ib_uverbs_close_xrcd,
-	[IB_USER_VERBS_CMD_CREATE_XSRQ]		= ib_uverbs_create_xsrq,
-	[IB_USER_VERBS_CMD_OPEN_QP]		= ib_uverbs_open_qp,
-};
-
-static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
-				    struct ib_udata *ucore,
-				    struct ib_udata *uhw) = {
-	[IB_USER_VERBS_EX_CMD_CREATE_FLOW]	= ib_uverbs_ex_create_flow,
-	[IB_USER_VERBS_EX_CMD_DESTROY_FLOW]	= ib_uverbs_ex_destroy_flow,
-	[IB_USER_VERBS_EX_CMD_QUERY_DEVICE]	= ib_uverbs_ex_query_device,
-	[IB_USER_VERBS_EX_CMD_CREATE_CQ]	= ib_uverbs_ex_create_cq,
-	[IB_USER_VERBS_EX_CMD_CREATE_QP]        = ib_uverbs_ex_create_qp,
-	[IB_USER_VERBS_EX_CMD_CREATE_WQ]        = ib_uverbs_ex_create_wq,
-	[IB_USER_VERBS_EX_CMD_MODIFY_WQ]        = ib_uverbs_ex_modify_wq,
-	[IB_USER_VERBS_EX_CMD_DESTROY_WQ]       = ib_uverbs_ex_destroy_wq,
-	[IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL] = ib_uverbs_ex_create_rwq_ind_table,
-	[IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL] = ib_uverbs_ex_destroy_rwq_ind_table,
-	[IB_USER_VERBS_EX_CMD_MODIFY_QP]        = ib_uverbs_ex_modify_qp,
-	[IB_USER_VERBS_EX_CMD_MODIFY_CQ]        = ib_uverbs_ex_modify_cq,
-};
-
+static DEFINE_IDA(uverbs_ida);
 static void ib_uverbs_add_one(struct ib_device *device);
 static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
 
@@ -138,7 +82,7 @@
  * Must be called with the ufile->device->disassociate_srcu held, and the lock
  * must be held until use of the ucontext is finished.
  */
-struct ib_ucontext *ib_uverbs_get_ucontext(struct ib_uverbs_file *ufile)
+struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile)
 {
 	/*
 	 * We do not hold the hw_destroy_rwsem lock for this flow, instead
@@ -156,33 +100,31 @@
 
 	return ucontext;
 }
-EXPORT_SYMBOL(ib_uverbs_get_ucontext);
+EXPORT_SYMBOL(ib_uverbs_get_ucontext_file);
 
 int uverbs_dealloc_mw(struct ib_mw *mw)
 {
 	struct ib_pd *pd = mw->pd;
 	int ret;
 
-	ret = mw->device->dealloc_mw(mw);
+	ret = mw->device->ops.dealloc_mw(mw);
 	if (!ret)
 		atomic_dec(&pd->usecnt);
 	return ret;
 }
 
-static void ib_uverbs_release_dev(struct kobject *kobj)
+static void ib_uverbs_release_dev(struct device *device)
 {
 	struct ib_uverbs_device *dev =
-		container_of(kobj, struct ib_uverbs_device, kobj);
+			container_of(device, struct ib_uverbs_device, dev);
 
 	uverbs_destroy_api(dev->uapi);
 	cleanup_srcu_struct(&dev->disassociate_srcu);
+	mutex_destroy(&dev->lists_mutex);
+	mutex_destroy(&dev->xrcd_tree_mutex);
 	kfree(dev);
 }
 
-static struct kobj_type ib_uverbs_dev_ktype = {
-	.release = ib_uverbs_release_dev,
-};
-
 static void ib_uverbs_release_async_event_file(struct kref *ref)
 {
 	struct ib_uverbs_async_event_file *file =
@@ -258,14 +200,22 @@
 	srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
 	ib_dev = srcu_dereference(file->device->ib_dev,
 				  &file->device->disassociate_srcu);
-	if (ib_dev && !ib_dev->disassociate_ucontext)
-		module_put(ib_dev->owner);
+	if (ib_dev && !ib_dev->ops.disassociate_ucontext)
+		module_put(ib_dev->ops.owner);
 	srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
 
 	if (atomic_dec_and_test(&file->device->refcount))
 		ib_uverbs_comp_dev(file->device);
 
-	kobject_put(&file->device->kobj);
+	if (file->async_file)
+		kref_put(&file->async_file->ref,
+			 ib_uverbs_release_async_event_file);
+	put_device(&file->device->dev);
+
+	if (file->disassociate_page)
+		__free_pages(file->disassociate_page, 0);
+	mutex_destroy(&file->umap_lock);
+	mutex_destroy(&file->ucontext_lock);
 	kfree(file);
 }
 
@@ -649,51 +599,19 @@
 	return filp;
 }
 
-static bool verify_command_mask(struct ib_uverbs_file *ufile, u32 command,
-				bool extended)
-{
-	if (!extended)
-		return ufile->uverbs_cmd_mask & BIT_ULL(command);
-
-	return ufile->uverbs_ex_cmd_mask & BIT_ULL(command);
-}
-
-static bool verify_command_idx(u32 command, bool extended)
-{
-	if (extended)
-		return command < ARRAY_SIZE(uverbs_ex_cmd_table) &&
-		       uverbs_ex_cmd_table[command];
-
-	return command < ARRAY_SIZE(uverbs_cmd_table) &&
-	       uverbs_cmd_table[command];
-}
-
-static ssize_t process_hdr(struct ib_uverbs_cmd_hdr *hdr,
-			   u32 *command, bool *extended)
-{
-	if (hdr->command & ~(u32)(IB_USER_VERBS_CMD_FLAG_EXTENDED |
-				   IB_USER_VERBS_CMD_COMMAND_MASK))
-		return -EINVAL;
-
-	*command = hdr->command & IB_USER_VERBS_CMD_COMMAND_MASK;
-	*extended = hdr->command & IB_USER_VERBS_CMD_FLAG_EXTENDED;
-
-	if (!verify_command_idx(*command, *extended))
-		return -EOPNOTSUPP;
-
-	return 0;
-}
-
 static ssize_t verify_hdr(struct ib_uverbs_cmd_hdr *hdr,
-			  struct ib_uverbs_ex_cmd_hdr *ex_hdr,
-			  size_t count, bool extended)
+			  struct ib_uverbs_ex_cmd_hdr *ex_hdr, size_t count,
+			  const struct uverbs_api_write_method *method_elm)
 {
-	if (extended) {
+	if (method_elm->is_ex) {
 		count -= sizeof(*hdr) + sizeof(*ex_hdr);
 
 		if ((hdr->in_words + ex_hdr->provider_in_words) * 8 != count)
 			return -EINVAL;
 
+		if (hdr->in_words * 8 < method_elm->req_size)
+			return -ENOSPC;
+
 		if (ex_hdr->cmd_hdr_reserved)
 			return -EINVAL;
 
@@ -701,8 +619,10 @@
 			if (!hdr->out_words && !ex_hdr->provider_out_words)
 				return -EINVAL;
 
-			if (!access_ok(VERIFY_WRITE,
-				       u64_to_user_ptr(ex_hdr->response),
+			if (hdr->out_words * 8 < method_elm->resp_size)
+				return -ENOSPC;
+
+			if (!access_ok(u64_to_user_ptr(ex_hdr->response),
 				       (hdr->out_words + ex_hdr->provider_out_words) * 8))
 				return -EFAULT;
 		} else {
@@ -717,6 +637,24 @@
 	if (hdr->in_words * 4 != count)
 		return -EINVAL;
 
+	if (count < method_elm->req_size + sizeof(hdr)) {
+		/*
+		 * rdma-core v18 and v19 have a bug where they send DESTROY_CQ
+		 * with a 16 byte write instead of 24. Old kernels didn't
+		 * check the size so they allowed this. Now that the size is
+		 * checked provide a compatibility work around to not break
+		 * those userspaces.
+		 */
+		if (hdr->command == IB_USER_VERBS_CMD_DESTROY_CQ &&
+		    count == 16) {
+			hdr->in_words = 6;
+			return 0;
+		}
+		return -ENOSPC;
+	}
+	if (hdr->out_words * 4 < method_elm->resp_size)
+		return -ENOSPC;
+
 	return 0;
 }
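
For concreteness, the sizes behind the DESTROY_CQ workaround (taken from the
existing uAPI headers): sizeof(struct ib_uverbs_cmd_hdr) is 8 and
sizeof(struct ib_uverbs_destroy_cq) is 16 (an __aligned_u64 response pointer,
cq_handle, and reserved), so a well-formed write is 8 + 16 = 24 bytes and
hdr.in_words = 24 / 4 = 6. The buggy rdma-core releases passed count == 16,
so the workaround rewrites in_words to 6 and continues, relying on the
caller's buffer actually containing the full request structure.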
 
@@ -724,11 +662,12 @@
 			     size_t count, loff_t *pos)
 {
 	struct ib_uverbs_file *file = filp->private_data;
+	const struct uverbs_api_write_method *method_elm;
+	struct uverbs_api *uapi = file->device->uapi;
 	struct ib_uverbs_ex_cmd_hdr ex_hdr;
 	struct ib_uverbs_cmd_hdr hdr;
-	bool extended;
+	struct uverbs_attr_bundle bundle;
 	int srcu_key;
-	u32 command;
 	ssize_t ret;
 
 	if (!ib_safe_file_access(filp)) {
@@ -743,57 +682,94 @@
 	if (copy_from_user(&hdr, buf, sizeof(hdr)))
 		return -EFAULT;
 
-	ret = process_hdr(&hdr, &command, &extended);
-	if (ret)
-		return ret;
+	method_elm = uapi_get_method(uapi, hdr.command);
+	if (IS_ERR(method_elm))
+		return PTR_ERR(method_elm);
 
-	if (extended) {
+	if (method_elm->is_ex) {
 		if (count < (sizeof(hdr) + sizeof(ex_hdr)))
 			return -EINVAL;
 		if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
 			return -EFAULT;
 	}
 
-	ret = verify_hdr(&hdr, &ex_hdr, count, extended);
+	ret = verify_hdr(&hdr, &ex_hdr, count, method_elm);
 	if (ret)
 		return ret;
 
 	srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
 
-	if (!verify_command_mask(file, command, extended)) {
-		ret = -EOPNOTSUPP;
-		goto out;
-	}
-
 	buf += sizeof(hdr);
 
-	if (!extended) {
-		ret = uverbs_cmd_table[command](file, buf,
-						hdr.in_words * 4,
-						hdr.out_words * 4);
-	} else {
-		struct ib_udata ucore;
-		struct ib_udata uhw;
+	memset(bundle.attr_present, 0, sizeof(bundle.attr_present));
+	bundle.ufile = file;
+	bundle.context = NULL; /* only valid if bundle has uobject */
+	if (!method_elm->is_ex) {
+		size_t in_len = hdr.in_words * 4 - sizeof(hdr);
+		size_t out_len = hdr.out_words * 4;
+		u64 response = 0;
 
+		if (method_elm->has_udata) {
+			bundle.driver_udata.inlen =
+				in_len - method_elm->req_size;
+			in_len = method_elm->req_size;
+			if (bundle.driver_udata.inlen)
+				bundle.driver_udata.inbuf = buf + in_len;
+			else
+				bundle.driver_udata.inbuf = NULL;
+		} else {
+			memset(&bundle.driver_udata, 0,
+			       sizeof(bundle.driver_udata));
+		}
+
+		if (method_elm->has_resp) {
+			/*
+			 * The macros check that if has_resp is set
+			 * then the command request structure starts
+			 * with an '__aligned_u64 response' member.
+			 */
+			ret = get_user(response, (const u64 __user *)buf);
+			if (ret)
+				goto out_unlock;
+
+			if (method_elm->has_udata) {
+				bundle.driver_udata.outlen =
+					out_len - method_elm->resp_size;
+				out_len = method_elm->resp_size;
+				if (bundle.driver_udata.outlen)
+					bundle.driver_udata.outbuf =
+						u64_to_user_ptr(response +
+								out_len);
+				else
+					bundle.driver_udata.outbuf = NULL;
+			}
+		} else {
+			bundle.driver_udata.outlen = 0;
+			bundle.driver_udata.outbuf = NULL;
+		}
+
+		ib_uverbs_init_udata_buf_or_null(
+			&bundle.ucore, buf, u64_to_user_ptr(response),
+			in_len, out_len);
+	} else {
 		buf += sizeof(ex_hdr);
 
-		ib_uverbs_init_udata_buf_or_null(&ucore, buf,
+		ib_uverbs_init_udata_buf_or_null(&bundle.ucore, buf,
 					u64_to_user_ptr(ex_hdr.response),
 					hdr.in_words * 8, hdr.out_words * 8);
 
-		ib_uverbs_init_udata_buf_or_null(&uhw,
-					buf + ucore.inlen,
-					u64_to_user_ptr(ex_hdr.response) + ucore.outlen,
-					ex_hdr.provider_in_words * 8,
-					ex_hdr.provider_out_words * 8);
+		ib_uverbs_init_udata_buf_or_null(
+			&bundle.driver_udata, buf + bundle.ucore.inlen,
+			u64_to_user_ptr(ex_hdr.response) + bundle.ucore.outlen,
+			ex_hdr.provider_in_words * 8,
+			ex_hdr.provider_out_words * 8);
 
-		ret = uverbs_ex_cmd_table[command](file, &ucore, &uhw);
-		ret = (ret) ? : count;
 	}
 
-out:
+	ret = method_elm->handler(&bundle);
+out_unlock:
 	srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
-	return ret;
+	return (ret) ? : count;
 }
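
The has_resp convention used above is visible in the existing uAPI; for
example, struct ib_uverbs_destroy_cq from include/uapi/rdma/ib_user_verbs.h:

	struct ib_uverbs_destroy_cq {
		__aligned_u64 response;	/* user VA for ib_uverbs_destroy_cq_resp */
		__u32 cq_handle;
		__u32 reserved;
	};

Because every has_resp request starts with that aligned u64, a single
get_user() on the first eight bytes of the request body recovers the response
pointer for any such command.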
 
 static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
@@ -804,19 +780,253 @@
 	int srcu_key;
 
 	srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
-	ucontext = ib_uverbs_get_ucontext(file);
+	ucontext = ib_uverbs_get_ucontext_file(file);
 	if (IS_ERR(ucontext)) {
 		ret = PTR_ERR(ucontext);
 		goto out;
 	}
 
-	ret = ucontext->device->mmap(ucontext, vma);
+	ret = ucontext->device->ops.mmap(ucontext, vma);
 out:
 	srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
 	return ret;
 }
 
 /*
+ * Each time we map IO memory into user space this keeps track of the mapping.
+ * When the device is hot-unplugged we 'zap' the mmaps in user space to point
+ * to the zero page and allow the hot unplug to proceed.
+ *
+ * This is necessary for cases like PCI physical hot unplug, as the actual BAR
+ * memory may vanish afterwards and a userspace access to it could raise an MCE.
+ *
+ * RDMA drivers supporting disassociation must have their user space designed
+ * to cope in some way with their IO pages going to the zero page.
+ */
+struct rdma_umap_priv {
+	struct vm_area_struct *vma;
+	struct list_head list;
+};
+
+static const struct vm_operations_struct rdma_umap_ops;
+
+static void rdma_umap_priv_init(struct rdma_umap_priv *priv,
+				struct vm_area_struct *vma)
+{
+	struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+
+	priv->vma = vma;
+	vma->vm_private_data = priv;
+	vma->vm_ops = &rdma_umap_ops;
+
+	mutex_lock(&ufile->umap_lock);
+	list_add(&priv->list, &ufile->umaps);
+	mutex_unlock(&ufile->umap_lock);
+}
+
+/*
+ * The VMA has been dup'd; initialize the vm_private_data with a new tracking
+ * struct.
+ */
+static void rdma_umap_open(struct vm_area_struct *vma)
+{
+	struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+	struct rdma_umap_priv *opriv = vma->vm_private_data;
+	struct rdma_umap_priv *priv;
+
+	if (!opriv)
+		return;
+
+	/* We are racing with disassociation */
+	if (!down_read_trylock(&ufile->hw_destroy_rwsem))
+		goto out_zap;
+	/*
+	 * Disassociation already completed, the VMA should already be zapped.
+	 */
+	if (!ufile->ucontext)
+		goto out_unlock;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		goto out_unlock;
+	rdma_umap_priv_init(priv, vma);
+
+	up_read(&ufile->hw_destroy_rwsem);
+	return;
+
+out_unlock:
+	up_read(&ufile->hw_destroy_rwsem);
+out_zap:
+	/*
+	 * We can't allow the VMA to be created with the actual IO pages, that
+	 * would break our API contract, and it can't be stopped at this
+	 * point, so zap it.
+	 */
+	vma->vm_private_data = NULL;
+	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+}
+
+static void rdma_umap_close(struct vm_area_struct *vma)
+{
+	struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+	struct rdma_umap_priv *priv = vma->vm_private_data;
+
+	if (!priv)
+		return;
+
+	/*
+	 * The vma holds a reference on the struct file that created it, which
+	 * in turn means that the ib_uverbs_file is guaranteed to exist at
+	 * this point.
+	 */
+	mutex_lock(&ufile->umap_lock);
+	list_del(&priv->list);
+	mutex_unlock(&ufile->umap_lock);
+	kfree(priv);
+}
+
+/*
+ * Once zap_vma_ptes() has been called, touches to the VMA will come here and
+ * we return a dummy writable zero page for all the pfns.
+ */
+static vm_fault_t rdma_umap_fault(struct vm_fault *vmf)
+{
+	struct ib_uverbs_file *ufile = vmf->vma->vm_file->private_data;
+	struct rdma_umap_priv *priv = vmf->vma->vm_private_data;
+	vm_fault_t ret = 0;
+
+	if (!priv)
+		return VM_FAULT_SIGBUS;
+
+	/* Read only pages can just use the system zero page. */
+	if (!(vmf->vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) {
+		vmf->page = ZERO_PAGE(vmf->address);
+		get_page(vmf->page);
+		return 0;
+	}
+
+	mutex_lock(&ufile->umap_lock);
+	if (!ufile->disassociate_page)
+		ufile->disassociate_page =
+			alloc_pages(vmf->gfp_mask | __GFP_ZERO, 0);
+
+	if (ufile->disassociate_page) {
+		/*
+		 * This VMA is forced to always be shared so this doesn't have
+		 * to worry about COW.
+		 */
+		vmf->page = ufile->disassociate_page;
+		get_page(vmf->page);
+	} else {
+		ret = VM_FAULT_SIGBUS;
+	}
+	mutex_unlock(&ufile->umap_lock);
+
+	return ret;
+}
+
+static const struct vm_operations_struct rdma_umap_ops = {
+	.open = rdma_umap_open,
+	.close = rdma_umap_close,
+	.fault = rdma_umap_fault,
+};
+
+/*
+ * Map IO memory into a process. This is to be called by drivers as part of
+ * their mmap() functions if they wish to send something like PCI-E BAR memory
+ * to userspace.
+ */
+int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
+		      unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+	struct ib_uverbs_file *ufile = ucontext->ufile;
+	struct rdma_umap_priv *priv;
+
+	if (!(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	if (vma->vm_end - vma->vm_start != size)
+		return -EINVAL;
+
+	/* Driver is using this wrong, must be called by ib_uverbs_mmap */
+	if (WARN_ON(!vma->vm_file ||
+		    vma->vm_file->private_data != ufile))
+		return -EINVAL;
+	lockdep_assert_held(&ufile->device->disassociate_srcu);
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	vma->vm_page_prot = prot;
+	if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) {
+		kfree(priv);
+		return -EAGAIN;
+	}
+
+	rdma_umap_priv_init(priv, vma);
+	return 0;
+}
+EXPORT_SYMBOL(rdma_user_mmap_io);
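
A hedged driver-side sketch (the foo names and the doorbell field are
invented): a typical ops.mmap implementation forwards one BAR page through
rdma_user_mmap_io() so the core can track and later zap the mapping:

	static int foo_mmap(struct ib_ucontext *ucontext,
			    struct vm_area_struct *vma)
	{
		struct foo_ucontext *uctx = to_foo_uctx(ucontext); /* hypothetical */
		unsigned long pfn = uctx->db_bar_addr >> PAGE_SHIFT; /* hypothetical */

		if (vma->vm_pgoff != 0)
			return -EINVAL;

		/* VM_SHARED and exact-size checks are enforced by the helper */
		return rdma_user_mmap_io(ucontext, vma, pfn, PAGE_SIZE,
					 pgprot_noncached(vma->vm_page_prot));
	}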
+
+void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
+{
+	struct rdma_umap_priv *priv, *next_priv;
+
+	lockdep_assert_held(&ufile->hw_destroy_rwsem);
+
+	while (1) {
+		struct mm_struct *mm = NULL;
+
+		/* Get an arbitrary mm pointer that hasn't been cleaned yet */
+		mutex_lock(&ufile->umap_lock);
+		while (!list_empty(&ufile->umaps)) {
+			int ret;
+
+			priv = list_first_entry(&ufile->umaps,
+						struct rdma_umap_priv, list);
+			mm = priv->vma->vm_mm;
+			ret = mmget_not_zero(mm);
+			if (!ret) {
+				list_del_init(&priv->list);
+				mm = NULL;
+				continue;
+			}
+			break;
+		}
+		mutex_unlock(&ufile->umap_lock);
+		if (!mm)
+			return;
+
+		/*
+		 * The umap_lock is nested under mmap_sem since it is used within
+		 * the vma_ops callbacks, so we have to clean the list one mm
+		 * at a time to get the lock ordering right. Typically there
+		 * will only be one mm, so no big deal.
+		 */
+		down_read(&mm->mmap_sem);
+		if (!mmget_still_valid(mm))
+			goto skip_mm;
+		mutex_lock(&ufile->umap_lock);
+		list_for_each_entry_safe (priv, next_priv, &ufile->umaps,
+					  list) {
+			struct vm_area_struct *vma = priv->vma;
+
+			if (vma->vm_mm != mm)
+				continue;
+			list_del_init(&priv->list);
+
+			zap_vma_ptes(vma, vma->vm_start,
+				     vma->vm_end - vma->vm_start);
+		}
+		mutex_unlock(&ufile->umap_lock);
+	skip_mm:
+		up_read(&mm->mmap_sem);
+		mmput(mm);
+	}
+}
+
+/*
  * ib_uverbs_open() does not need the BKL:
  *
  *  - the ib_uverbs_device structures are properly reference counted and
@@ -839,6 +1049,7 @@
 	if (!atomic_inc_not_zero(&dev->refcount))
 		return -ENXIO;
 
+	get_device(&dev->dev);
 	srcu_key = srcu_read_lock(&dev->disassociate_srcu);
 	mutex_lock(&dev->lists_mutex);
 	ib_dev = srcu_dereference(dev->ib_dev,
@@ -848,13 +1059,18 @@
 		goto err;
 	}
 
+	if (!rdma_dev_access_netns(ib_dev, current->nsproxy->net_ns)) {
+		ret = -EPERM;
+		goto err;
+	}
+
 	/* In case IB device supports disassociate ucontext, there is no hard
 	 * dependency between uverbs device and its low level device.
 	 */
-	module_dependent = !(ib_dev->disassociate_ucontext);
+	module_dependent = !(ib_dev->ops.disassociate_ucontext);
 
 	if (module_dependent) {
-		if (!try_module_get(ib_dev->owner)) {
+		if (!try_module_get(ib_dev->ops.owner)) {
 			ret = -ENODEV;
 			goto err;
 		}
@@ -876,22 +1092,20 @@
 	spin_lock_init(&file->uobjects_lock);
 	INIT_LIST_HEAD(&file->uobjects);
 	init_rwsem(&file->hw_destroy_rwsem);
+	mutex_init(&file->umap_lock);
+	INIT_LIST_HEAD(&file->umaps);
 
 	filp->private_data = file;
-	kobject_get(&dev->kobj);
 	list_add_tail(&file->list, &dev->uverbs_file_list);
 	mutex_unlock(&dev->lists_mutex);
 	srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
 
-	file->uverbs_cmd_mask = ib_dev->uverbs_cmd_mask;
-	file->uverbs_ex_cmd_mask = ib_dev->uverbs_ex_cmd_mask;
-
 	setup_ufile_idr_uobject(file);
 
-	return nonseekable_open(inode, filp);
+	return stream_open(inode, filp);
 
 err_module:
-	module_put(ib_dev->owner);
+	module_put(ib_dev->ops.owner);
 
 err:
 	mutex_unlock(&dev->lists_mutex);
@@ -899,6 +1113,7 @@
 	if (atomic_dec_and_test(&dev->refcount))
 		ib_uverbs_comp_dev(dev);
 
+	put_device(&dev->dev);
 	return ret;
 }
 
@@ -909,16 +1124,9 @@
 	uverbs_destroy_ufile_hw(file, RDMA_REMOVE_CLOSE);
 
 	mutex_lock(&file->device->lists_mutex);
-	if (!file->is_closed) {
-		list_del(&file->list);
-		file->is_closed = 1;
-	}
+	list_del_init(&file->list);
 	mutex_unlock(&file->device->lists_mutex);
 
-	if (file->async_file)
-		kref_put(&file->async_file->ref,
-			 ib_uverbs_release_async_event_file);
-
 	kref_put(&file->ref, ib_uverbs_release_file);
 
 	return 0;
@@ -945,52 +1153,89 @@
 	.compat_ioctl = ib_uverbs_ioctl,
 };
 
+static int ib_uverbs_get_nl_info(struct ib_device *ibdev, void *client_data,
+				 struct ib_client_nl_info *res)
+{
+	struct ib_uverbs_device *uverbs_dev = client_data;
+	int ret;
+
+	if (res->port != -1)
+		return -EINVAL;
+
+	res->abi = ibdev->ops.uverbs_abi_ver;
+	res->cdev = &uverbs_dev->dev;
+
+	/*
+	 * To support DRIVER_ID binding in userspace some of the drivers need
+	 * upgrading to expose their PCI-dependent revision information
+	 * through get_context instead of relying on modalias matching. When
+	 * the drivers are fixed they can drop this flag.
+	 */
+	if (!ibdev->ops.uverbs_no_driver_id_binding) {
+		ret = nla_put_u32(res->nl_msg, RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID,
+				  ibdev->ops.driver_id);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
 static struct ib_client uverbs_client = {
 	.name   = "uverbs",
+	.no_kverbs_req = true,
 	.add    = ib_uverbs_add_one,
-	.remove = ib_uverbs_remove_one
+	.remove = ib_uverbs_remove_one,
+	.get_nl_info = ib_uverbs_get_nl_info,
 };
+MODULE_ALIAS_RDMA_CLIENT("uverbs");
 
-static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
+static ssize_t ibdev_show(struct device *device, struct device_attribute *attr,
 			  char *buf)
 {
+	struct ib_uverbs_device *dev =
+			container_of(device, struct ib_uverbs_device, dev);
 	int ret = -ENODEV;
 	int srcu_key;
-	struct ib_uverbs_device *dev = dev_get_drvdata(device);
 	struct ib_device *ib_dev;
 
-	if (!dev)
-		return -ENODEV;
-
 	srcu_key = srcu_read_lock(&dev->disassociate_srcu);
 	ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
 	if (ib_dev)
-		ret = sprintf(buf, "%s\n", ib_dev->name);
+		ret = sprintf(buf, "%s\n", dev_name(&ib_dev->dev));
 	srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
 
 	return ret;
 }
-static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+static DEVICE_ATTR_RO(ibdev);
 
-static ssize_t show_dev_abi_version(struct device *device,
-				    struct device_attribute *attr, char *buf)
+static ssize_t abi_version_show(struct device *device,
+				struct device_attribute *attr, char *buf)
 {
-	struct ib_uverbs_device *dev = dev_get_drvdata(device);
+	struct ib_uverbs_device *dev =
+			container_of(device, struct ib_uverbs_device, dev);
 	int ret = -ENODEV;
 	int srcu_key;
 	struct ib_device *ib_dev;
 
-	if (!dev)
-		return -ENODEV;
 	srcu_key = srcu_read_lock(&dev->disassociate_srcu);
 	ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
 	if (ib_dev)
-		ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver);
+		ret = sprintf(buf, "%u\n", ib_dev->ops.uverbs_abi_ver);
 	srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
 
 	return ret;
 }
-static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
+static DEVICE_ATTR_RO(abi_version);
+
+static struct attribute *ib_dev_attrs[] = {
+	&dev_attr_abi_version.attr,
+	&dev_attr_ibdev.attr,
+	NULL,
+};
+
+static const struct attribute_group dev_attr_group = {
+	.attrs = ib_dev_attrs,
+};
 
 static CLASS_ATTR_STRING(abi_version, S_IRUGO,
 			 __stringify(IB_USER_VERBS_ABI_VERSION));
@@ -1000,7 +1245,7 @@
 {
 	struct uverbs_api *uapi;
 
-	uapi = uverbs_alloc_api(device->driver_specs, device->driver_id);
+	uapi = uverbs_alloc_api(device);
 	if (IS_ERR(uapi))
 		return PTR_ERR(uapi);
 
@@ -1015,7 +1260,7 @@
 	struct ib_uverbs_device *uverbs_dev;
 	int ret;
 
-	if (!device->alloc_ucontext)
+	if (!device->ops.alloc_ucontext)
 		return;
 
 	uverbs_dev = kzalloc(sizeof(*uverbs_dev), GFP_KERNEL);
@@ -1028,65 +1273,56 @@
 		return;
 	}
 
+	device_initialize(&uverbs_dev->dev);
+	uverbs_dev->dev.class = uverbs_class;
+	uverbs_dev->dev.parent = device->dev.parent;
+	uverbs_dev->dev.release = ib_uverbs_release_dev;
+	uverbs_dev->groups[0] = &dev_attr_group;
+	uverbs_dev->dev.groups = uverbs_dev->groups;
 	atomic_set(&uverbs_dev->refcount, 1);
 	init_completion(&uverbs_dev->comp);
 	uverbs_dev->xrcd_tree = RB_ROOT;
 	mutex_init(&uverbs_dev->xrcd_tree_mutex);
-	kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype);
 	mutex_init(&uverbs_dev->lists_mutex);
 	INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list);
 	INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list);
+	rcu_assign_pointer(uverbs_dev->ib_dev, device);
+	uverbs_dev->num_comp_vectors = device->num_comp_vectors;
 
-	devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
-	if (devnum >= IB_UVERBS_MAX_DEVICES)
+	devnum = ida_alloc_max(&uverbs_ida, IB_UVERBS_MAX_DEVICES - 1,
+			       GFP_KERNEL);
+	if (devnum < 0)
 		goto err;
 	uverbs_dev->devnum = devnum;
-	set_bit(devnum, dev_map);
 	if (devnum >= IB_UVERBS_NUM_FIXED_MINOR)
 		base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR;
 	else
 		base = IB_UVERBS_BASE_DEV + devnum;
 
-	rcu_assign_pointer(uverbs_dev->ib_dev, device);
-	uverbs_dev->num_comp_vectors = device->num_comp_vectors;
-
 	if (ib_uverbs_create_uapi(device, uverbs_dev))
 		goto err_uapi;
 
-	cdev_init(&uverbs_dev->cdev, NULL);
+	uverbs_dev->dev.devt = base;
+	dev_set_name(&uverbs_dev->dev, "uverbs%d", uverbs_dev->devnum);
+
+	cdev_init(&uverbs_dev->cdev,
+		  device->ops.mmap ? &uverbs_mmap_fops : &uverbs_fops);
 	uverbs_dev->cdev.owner = THIS_MODULE;
-	uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
-	cdev_set_parent(&uverbs_dev->cdev, &uverbs_dev->kobj);
-	kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
-	if (cdev_add(&uverbs_dev->cdev, base, 1))
-		goto err_cdev;
 
-	uverbs_dev->dev = device_create(uverbs_class, device->dev.parent,
-					uverbs_dev->cdev.dev, uverbs_dev,
-					"uverbs%d", uverbs_dev->devnum);
-	if (IS_ERR(uverbs_dev->dev))
-		goto err_cdev;
-
-	if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev))
-		goto err_class;
-	if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version))
-		goto err_class;
+	ret = cdev_device_add(&uverbs_dev->cdev, &uverbs_dev->dev);
+	if (ret)
+		goto err_uapi;
 
 	ib_set_client_data(device, &uverbs_client, uverbs_dev);
-
 	return;
 
-err_class:
-	device_destroy(uverbs_class, uverbs_dev->cdev.dev);
-err_cdev:
-	cdev_del(&uverbs_dev->cdev);
 err_uapi:
-	clear_bit(devnum, dev_map);
+	ida_free(&uverbs_ida, devnum);
 err:
 	if (atomic_dec_and_test(&uverbs_dev->refcount))
 		ib_uverbs_comp_dev(uverbs_dev);
 	wait_for_completion(&uverbs_dev->comp);
-	kobject_put(&uverbs_dev->kobj);
+	put_device(&uverbs_dev->dev);
 	return;
 }
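
The registration path above moves from a bare kobject to the embedded struct
device idiom. A minimal standalone sketch of that pattern (all foo names are
invented):

	static void foo_release(struct device *device)
	{
		struct foo_dev *fdev = container_of(device, struct foo_dev, dev);

		kfree(fdev);	/* nothing frees fdev until the final put_device() */
	}

	static int foo_register(struct foo_dev *fdev, dev_t devt)
	{
		device_initialize(&fdev->dev);	/* refcount starts at 1 */
		fdev->dev.devt = devt;
		fdev->dev.release = foo_release;
		dev_set_name(&fdev->dev, "foo%d", fdev->id);

		cdev_init(&fdev->cdev, &foo_fops);	/* hypothetical fops */
		fdev->cdev.owner = THIS_MODULE;

		/* registers the cdev and device together, tying their lifetimes */
		return cdev_device_add(&fdev->cdev, &fdev->dev);
	}

On failure the caller simply drops its reference with put_device(), which
funnels all cleanup through foo_release(), mirroring the err_uapi/err paths
above.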
 
@@ -1107,8 +1343,7 @@
 	while (!list_empty(&uverbs_dev->uverbs_file_list)) {
 		file = list_first_entry(&uverbs_dev->uverbs_file_list,
 					struct ib_uverbs_file, list);
-		file->is_closed = 1;
-		list_del(&file->list);
+		list_del_init(&file->list);
 		kref_get(&file->ref);
 
 		/* We must release the mutex before going ahead and calling
@@ -1156,12 +1391,10 @@
 	if (!uverbs_dev)
 		return;
 
-	dev_set_drvdata(uverbs_dev->dev, NULL);
-	device_destroy(uverbs_class, uverbs_dev->cdev.dev);
-	cdev_del(&uverbs_dev->cdev);
-	clear_bit(uverbs_dev->devnum, dev_map);
+	cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev);
+	ida_free(&uverbs_ida, uverbs_dev->devnum);
 
-	if (device->disassociate_ucontext) {
+	if (device->ops.disassociate_ucontext) {
 		/* We disassociate HW resources and immediately return.
 		 * Userspace will see a EIO errno for all future access.
 		 * Upon returning, ib_device may be freed internally and is not
@@ -1182,7 +1415,7 @@
 	if (wait_clients)
 		wait_for_completion(&uverbs_dev->comp);
 
-	kobject_put(&uverbs_dev->kobj);
+	put_device(&uverbs_dev->dev);
 }
 
 static char *uverbs_devnode(struct device *dev, umode_t *mode)
@@ -1258,6 +1491,7 @@
 				 IB_UVERBS_NUM_FIXED_MINOR);
 	unregister_chrdev_region(dynamic_uverbs_dev,
 				 IB_UVERBS_NUM_DYNAMIC_MINOR);
+	mmu_notifier_synchronize();
 }
 
 module_init(ib_uverbs_init);
diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c
index 203cc96..35b2e2c 100644
--- a/drivers/infiniband/core/uverbs_std_types.c
+++ b/drivers/infiniband/core/uverbs_std_types.c
@@ -40,13 +40,17 @@
 #include "uverbs.h"
 
 static int uverbs_free_ah(struct ib_uobject *uobject,
-			  enum rdma_remove_reason why)
+			  enum rdma_remove_reason why,
+			  struct uverbs_attr_bundle *attrs)
 {
-	return rdma_destroy_ah((struct ib_ah *)uobject->object);
+	return rdma_destroy_ah_user((struct ib_ah *)uobject->object,
+				    RDMA_DESTROY_AH_SLEEPABLE,
+				    &attrs->driver_udata);
 }
 
 static int uverbs_free_flow(struct ib_uobject *uobject,
-			    enum rdma_remove_reason why)
+			    enum rdma_remove_reason why,
+			    struct uverbs_attr_bundle *attrs)
 {
 	struct ib_flow *flow = (struct ib_flow *)uobject->object;
 	struct ib_uflow_object *uflow =
@@ -54,7 +58,7 @@
 	struct ib_qp *qp = flow->qp;
 	int ret;
 
-	ret = flow->device->destroy_flow(flow);
+	ret = flow->device->ops.destroy_flow(flow);
 	if (!ret) {
 		if (qp)
 			atomic_dec(&qp->usecnt);
@@ -65,13 +69,15 @@
 }
 
 static int uverbs_free_mw(struct ib_uobject *uobject,
-			  enum rdma_remove_reason why)
+			  enum rdma_remove_reason why,
+			  struct uverbs_attr_bundle *attrs)
 {
 	return uverbs_dealloc_mw((struct ib_mw *)uobject->object);
 }
 
 static int uverbs_free_qp(struct ib_uobject *uobject,
-			  enum rdma_remove_reason why)
+			  enum rdma_remove_reason why,
+			  struct uverbs_attr_bundle *attrs)
 {
 	struct ib_qp *qp = uobject->object;
 	struct ib_uqp_object *uqp =
@@ -92,19 +98,20 @@
 		ib_uverbs_detach_umcast(qp, uqp);
 	}
 
-	ret = ib_destroy_qp(qp);
+	ret = ib_destroy_qp_user(qp, &attrs->driver_udata);
 	if (ib_is_destroy_retryable(ret, why, uobject))
 		return ret;
 
 	if (uqp->uxrcd)
 		atomic_dec(&uqp->uxrcd->refcnt);
 
-	ib_uverbs_release_uevent(uobject->context->ufile, &uqp->uevent);
+	ib_uverbs_release_uevent(attrs->ufile, &uqp->uevent);
 	return ret;
 }
 
 static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject,
-				   enum rdma_remove_reason why)
+				   enum rdma_remove_reason why,
+				   struct uverbs_attr_bundle *attrs)
 {
 	struct ib_rwq_ind_table *rwq_ind_tbl = uobject->object;
 	struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl;
@@ -119,23 +126,25 @@
 }
 
 static int uverbs_free_wq(struct ib_uobject *uobject,
-			  enum rdma_remove_reason why)
+			  enum rdma_remove_reason why,
+			  struct uverbs_attr_bundle *attrs)
 {
 	struct ib_wq *wq = uobject->object;
 	struct ib_uwq_object *uwq =
 		container_of(uobject, struct ib_uwq_object, uevent.uobject);
 	int ret;
 
-	ret = ib_destroy_wq(wq);
+	ret = ib_destroy_wq(wq, &attrs->driver_udata);
 	if (ib_is_destroy_retryable(ret, why, uobject))
 		return ret;
 
-	ib_uverbs_release_uevent(uobject->context->ufile, &uwq->uevent);
+	ib_uverbs_release_uevent(attrs->ufile, &uwq->uevent);
 	return ret;
 }
 
 static int uverbs_free_srq(struct ib_uobject *uobject,
-			   enum rdma_remove_reason why)
+			   enum rdma_remove_reason why,
+			   struct uverbs_attr_bundle *attrs)
 {
 	struct ib_srq *srq = uobject->object;
 	struct ib_uevent_object *uevent =
@@ -143,7 +152,7 @@
 	enum ib_srq_type  srq_type = srq->srq_type;
 	int ret;
 
-	ret = ib_destroy_srq(srq);
+	ret = ib_destroy_srq_user(srq, &attrs->driver_udata);
 	if (ib_is_destroy_retryable(ret, why, uobject))
 		return ret;
 
@@ -154,12 +163,13 @@
 		atomic_dec(&us->uxrcd->refcnt);
 	}
 
-	ib_uverbs_release_uevent(uobject->context->ufile, uevent);
+	ib_uverbs_release_uevent(attrs->ufile, uevent);
 	return ret;
 }
 
 static int uverbs_free_xrcd(struct ib_uobject *uobject,
-			    enum rdma_remove_reason why)
+			    enum rdma_remove_reason why,
+			    struct uverbs_attr_bundle *attrs)
 {
 	struct ib_xrcd *xrcd = uobject->object;
 	struct ib_uxrcd_object *uxrcd =
@@ -170,15 +180,16 @@
 	if (ret)
 		return ret;
 
-	mutex_lock(&uobject->context->ufile->device->xrcd_tree_mutex);
-	ret = ib_uverbs_dealloc_xrcd(uobject, xrcd, why);
-	mutex_unlock(&uobject->context->ufile->device->xrcd_tree_mutex);
+	mutex_lock(&attrs->ufile->device->xrcd_tree_mutex);
+	ret = ib_uverbs_dealloc_xrcd(uobject, xrcd, why, attrs);
+	mutex_unlock(&attrs->ufile->device->xrcd_tree_mutex);
 
 	return ret;
 }
 
 static int uverbs_free_pd(struct ib_uobject *uobject,
-			  enum rdma_remove_reason why)
+			  enum rdma_remove_reason why,
+			  struct uverbs_attr_bundle *attrs)
 {
 	struct ib_pd *pd = uobject->object;
 	int ret;
@@ -187,7 +198,7 @@
 	if (ret)
 		return ret;
 
-	ib_dealloc_pd((struct ib_pd *)uobject->object);
+	ib_dealloc_pd_user(pd, &attrs->driver_udata);
 	return 0;
 }
 
@@ -210,8 +221,7 @@
 	return 0;
 };
 
-int uverbs_destroy_def_handler(struct ib_uverbs_file *file,
-			       struct uverbs_attr_bundle *attrs)
+int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs)
 {
 	return 0;
 }
@@ -229,58 +239,106 @@
 	UVERBS_OBJECT_QP,
 	UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), uverbs_free_qp));
 
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	UVERBS_METHOD_MW_DESTROY,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MW_HANDLE,
+			UVERBS_OBJECT_MW,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MW,
-			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_mw));
+			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_mw),
+			    &UVERBS_METHOD(UVERBS_METHOD_MW_DESTROY));
 
 DECLARE_UVERBS_NAMED_OBJECT(
 	UVERBS_OBJECT_SRQ,
 	UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object),
 				 uverbs_free_srq));
 
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	UVERBS_METHOD_AH_DESTROY,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_AH_HANDLE,
+			UVERBS_OBJECT_AH,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_AH,
-			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_ah));
+			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_ah),
+			    &UVERBS_METHOD(UVERBS_METHOD_AH_DESTROY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	UVERBS_METHOD_FLOW_DESTROY,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_FLOW_HANDLE,
+			UVERBS_OBJECT_FLOW,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
 
 DECLARE_UVERBS_NAMED_OBJECT(
 	UVERBS_OBJECT_FLOW,
 	UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uflow_object),
-				 uverbs_free_flow));
+				 uverbs_free_flow),
+			    &UVERBS_METHOD(UVERBS_METHOD_FLOW_DESTROY));
 
 DECLARE_UVERBS_NAMED_OBJECT(
 	UVERBS_OBJECT_WQ,
 	UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), uverbs_free_wq));
 
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	UVERBS_METHOD_RWQ_IND_TBL_DESTROY,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_RWQ_IND_TBL_HANDLE,
+			UVERBS_OBJECT_RWQ_IND_TBL,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL,
-			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_rwq_ind_tbl));
+			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_rwq_ind_tbl),
+			    &UVERBS_METHOD(UVERBS_METHOD_RWQ_IND_TBL_DESTROY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	UVERBS_METHOD_XRCD_DESTROY,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_XRCD_HANDLE,
+			UVERBS_OBJECT_XRCD,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
 
 DECLARE_UVERBS_NAMED_OBJECT(
 	UVERBS_OBJECT_XRCD,
 	UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object),
-				 uverbs_free_xrcd));
+				 uverbs_free_xrcd),
+			    &UVERBS_METHOD(UVERBS_METHOD_XRCD_DESTROY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	UVERBS_METHOD_PD_DESTROY,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_PD_HANDLE,
+			UVERBS_OBJECT_PD,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
 
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_PD,
-			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_pd));
+			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_pd),
+			    &UVERBS_METHOD(UVERBS_METHOD_PD_DESTROY));
 
-DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE);
-
-DECLARE_UVERBS_OBJECT_TREE(uverbs_default_objects,
-			   &UVERBS_OBJECT(UVERBS_OBJECT_DEVICE),
-			   &UVERBS_OBJECT(UVERBS_OBJECT_PD),
-			   &UVERBS_OBJECT(UVERBS_OBJECT_MR),
-			   &UVERBS_OBJECT(UVERBS_OBJECT_COMP_CHANNEL),
-			   &UVERBS_OBJECT(UVERBS_OBJECT_CQ),
-			   &UVERBS_OBJECT(UVERBS_OBJECT_QP),
-			   &UVERBS_OBJECT(UVERBS_OBJECT_AH),
-			   &UVERBS_OBJECT(UVERBS_OBJECT_MW),
-			   &UVERBS_OBJECT(UVERBS_OBJECT_SRQ),
-			   &UVERBS_OBJECT(UVERBS_OBJECT_FLOW),
-			   &UVERBS_OBJECT(UVERBS_OBJECT_WQ),
-			   &UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL),
-			   &UVERBS_OBJECT(UVERBS_OBJECT_XRCD),
-			   &UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION),
-			   &UVERBS_OBJECT(UVERBS_OBJECT_DM),
-			   &UVERBS_OBJECT(UVERBS_OBJECT_COUNTERS));
-
-const struct uverbs_object_tree_def *uverbs_default_get_objects(void)
-{
-	return &uverbs_default_objects;
-}
+const struct uapi_definition uverbs_def_obj_intf[] = {
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_PD,
+				      UAPI_DEF_OBJ_NEEDS_FN(dealloc_pd)),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_COMP_CHANNEL,
+				      UAPI_DEF_OBJ_NEEDS_FN(dealloc_pd)),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_QP,
+				      UAPI_DEF_OBJ_NEEDS_FN(destroy_qp)),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_AH,
+				      UAPI_DEF_OBJ_NEEDS_FN(destroy_ah)),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MW,
+				      UAPI_DEF_OBJ_NEEDS_FN(dealloc_mw)),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_SRQ,
+				      UAPI_DEF_OBJ_NEEDS_FN(destroy_srq)),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_FLOW,
+				      UAPI_DEF_OBJ_NEEDS_FN(destroy_flow)),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_WQ,
+				      UAPI_DEF_OBJ_NEEDS_FN(destroy_wq)),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+		UVERBS_OBJECT_RWQ_IND_TBL,
+		UAPI_DEF_OBJ_NEEDS_FN(destroy_rwq_ind_table)),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_XRCD,
+				      UAPI_DEF_OBJ_NEEDS_FN(dealloc_xrcd)),
+	{}
+};
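
Each UAPI_DEF_CHAIN_OBJ_TREE_NAMED() entry pulls a named object subtree into
the parse tree only when the driver implements the listed op. Wiring in one
more object follows the same shape (FOO names invented for illustration):

	const struct uapi_definition uverbs_def_obj_foo[] = {
		UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_FOO, /* hypothetical */
					      UAPI_DEF_OBJ_NEEDS_FN(destroy_foo)),
		{}
	};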
diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c b/drivers/infiniband/core/uverbs_std_types_counters.c
index a0ffdcf..9f01330 100644
--- a/drivers/infiniband/core/uverbs_std_types_counters.c
+++ b/drivers/infiniband/core/uverbs_std_types_counters.c
@@ -31,11 +31,13 @@
  * SOFTWARE.
  */
 
+#include "rdma_core.h"
 #include "uverbs.h"
 #include <rdma/uverbs_std_types.h>
 
 static int uverbs_free_counters(struct ib_uobject *uobject,
-				enum rdma_remove_reason why)
+				enum rdma_remove_reason why,
+				struct uverbs_attr_bundle *attrs)
 {
 	struct ib_counters *counters = uobject->object;
 	int ret;
@@ -44,15 +46,15 @@
 	if (ret)
 		return ret;
 
-	return counters->device->destroy_counters(counters);
+	return counters->device->ops.destroy_counters(counters);
 }
 
 static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)(
-	struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+	struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uobject *uobj = uverbs_attr_get_uobject(
 		attrs, UVERBS_ATTR_CREATE_COUNTERS_HANDLE);
-	struct ib_device *ib_dev = uobj->context->device;
+	struct ib_device *ib_dev = attrs->context->device;
 	struct ib_counters *counters;
 	int ret;
 
@@ -61,10 +63,10 @@
 	 * have the ability to remove methods from parse tree once
 	 * such condition is met.
 	 */
-	if (!ib_dev->create_counters)
+	if (!ib_dev->ops.create_counters)
 		return -EOPNOTSUPP;
 
-	counters = ib_dev->create_counters(ib_dev, attrs);
+	counters = ib_dev->ops.create_counters(ib_dev, attrs);
 	if (IS_ERR(counters)) {
 		ret = PTR_ERR(counters);
 		goto err_create_counters;
@@ -82,7 +84,7 @@
 }
 
 static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_READ)(
-	struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+	struct uverbs_attr_bundle *attrs)
 {
 	struct ib_counters_read_attr read_attr = {};
 	const struct uverbs_attr *uattr;
@@ -90,7 +92,7 @@
 		uverbs_attr_get_obj(attrs, UVERBS_ATTR_READ_COUNTERS_HANDLE);
 	int ret;
 
-	if (!counters->device->read_counters)
+	if (!counters->device->ops.read_counters)
 		return -EOPNOTSUPP;
 
 	if (!atomic_read(&counters->usecnt))
@@ -109,7 +111,7 @@
 	if (IS_ERR(read_attr.counters_buff))
 		return PTR_ERR(read_attr.counters_buff);
 
-	ret = counters->device->read_counters(counters, &read_attr, attrs);
+	ret = counters->device->ops.read_counters(counters, &read_attr, attrs);
 	if (ret)
 		return ret;
 
@@ -149,3 +151,9 @@
 			    &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_CREATE),
 			    &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_DESTROY),
 			    &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_READ));
+
+const struct uapi_definition uverbs_def_obj_counters[] = {
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_COUNTERS,
+				      UAPI_DEF_OBJ_NEEDS_FN(destroy_counters)),
+	{}
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c
index 5b5f205..e39fe6a 100644
--- a/drivers/infiniband/core/uverbs_std_types_cq.c
+++ b/drivers/infiniband/core/uverbs_std_types_cq.c
@@ -35,7 +35,8 @@
 #include "uverbs.h"
 
 static int uverbs_free_cq(struct ib_uobject *uobject,
-			  enum rdma_remove_reason why)
+			  enum rdma_remove_reason why,
+			  struct uverbs_attr_bundle *attrs)
 {
 	struct ib_cq *cq = uobject->object;
 	struct ib_uverbs_event_queue *ev_queue = cq->cq_context;
@@ -43,12 +44,12 @@
 		container_of(uobject, struct ib_ucq_object, uobject);
 	int ret;
 
-	ret = ib_destroy_cq(cq);
+	ret = ib_destroy_cq_user(cq, &attrs->driver_udata);
 	if (ib_is_destroy_retryable(ret, why, uobject))
 		return ret;
 
 	ib_uverbs_release_ucq(
-		uobject->context->ufile,
+		attrs->ufile,
 		ev_queue ? container_of(ev_queue,
 					struct ib_uverbs_completion_event_file,
 					ev_queue) :
@@ -58,13 +59,12 @@
 }
 
 static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
-	struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+	struct uverbs_attr_bundle *attrs)
 {
 	struct ib_ucq_object *obj = container_of(
 		uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE),
 		typeof(*obj), uobject);
-	struct ib_device *ib_dev = obj->uobject.context->device;
-	struct ib_udata uhw;
+	struct ib_device *ib_dev = attrs->context->device;
 	int ret;
 	u64 user_handle;
 	struct ib_cq_init_attr attr = {};
@@ -72,7 +72,7 @@
 	struct ib_uverbs_completion_event_file    *ev_file = NULL;
 	struct ib_uobject *ev_file_uobj;
 
-	if (!ib_dev->create_cq || !ib_dev->destroy_cq)
+	if (!ib_dev->ops.create_cq || !ib_dev->ops.destroy_cq)
 		return -EOPNOTSUPP;
 
 	ret = uverbs_copy_from(&attr.comp_vector, attrs,
@@ -101,7 +101,7 @@
 		uverbs_uobject_get(ev_file_uobj);
 	}
 
-	if (attr.comp_vector >= file->device->num_comp_vectors) {
+	if (attr.comp_vector >= attrs->ufile->device->num_comp_vectors) {
 		ret = -EINVAL;
 		goto err_event_file;
 	}
@@ -111,12 +111,9 @@
 	INIT_LIST_HEAD(&obj->comp_list);
 	INIT_LIST_HEAD(&obj->async_list);
 
-	/* Temporary, only until drivers get the new uverbs_attr_bundle */
-	create_udata(attrs, &uhw);
-
-	cq = ib_dev->create_cq(ib_dev, &attr, obj->uobject.context, &uhw);
-	if (IS_ERR(cq)) {
-		ret = PTR_ERR(cq);
+	cq = rdma_zalloc_drv_obj(ib_dev, ib_cq);
+	if (!cq) {
+		ret = -ENOMEM;
 		goto err_event_file;
 	}
 
@@ -125,11 +122,16 @@
 	cq->comp_handler  = ib_uverbs_comp_handler;
 	cq->event_handler = ib_uverbs_cq_event_handler;
 	cq->cq_context    = ev_file ? &ev_file->ev_queue : NULL;
-	obj->uobject.object = cq;
-	obj->uobject.user_handle = user_handle;
 	atomic_set(&cq->usecnt, 0);
 	cq->res.type = RDMA_RESTRACK_CQ;
-	rdma_restrack_add(&cq->res);
+
+	ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata);
+	if (ret)
+		goto err_free;
+
+	obj->uobject.object = cq;
+	obj->uobject.user_handle = user_handle;
+	rdma_restrack_uadd(&cq->res);
 
 	ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe,
 			     sizeof(cq->cqe));
@@ -138,8 +140,10 @@
 
 	return 0;
 err_cq:
-	ib_destroy_cq(cq);
-
+	ib_destroy_cq_user(cq, uverbs_get_cleared_udata(attrs));
+	cq = NULL;
+err_free:
+	kfree(cq);
 err_event_file:
 	if (ev_file)
 		uverbs_uobject_put(ev_file_uobj);
@@ -173,7 +177,7 @@
 	UVERBS_ATTR_UHW());
 
 static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)(
-	struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+	struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uobject *uobj =
 		uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_CQ_HANDLE);
@@ -207,3 +211,9 @@
 	&UVERBS_METHOD(UVERBS_METHOD_CQ_DESTROY)
 #endif
 );
+
+const struct uapi_definition uverbs_def_obj_cq[] = {
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_CQ,
+				      UAPI_DEF_OBJ_NEEDS_FN(destroy_cq)),
+	{}
+};
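
The CQ rework above shows the allocation pattern used throughout this update:
the core allocates the ib_cq container with rdma_zalloc_drv_obj(), the
driver's ops.create_cq() only initializes it and returns an errno, and
ops.destroy_cq() tears down hardware state without freeing memory. Below is a
hedged sketch of a driver pair under that contract; example_cq and its fields
are hypothetical, and a real driver would also declare
INIT_RDMA_OBJ_SIZE(ib_cq, example_cq, ibcq) in its ib_device_ops so the core
knows how much to allocate:

/* The embedded ib_cq must be the first member: the core allocates the
 * whole container and hands the driver the ib_cq pointer.
 */
struct example_cq {
	struct ib_cq ibcq;
	u32 cqn;		/* would come from device firmware */
};

static int example_create_cq(struct ib_cq *ibcq,
			     const struct ib_cq_init_attr *attr,
			     struct ib_udata *udata)
{
	struct example_cq *cq = container_of(ibcq, struct example_cq, ibcq);

	if (attr->cqe > 1024)	/* illustrative hardware limit */
		return -EINVAL;

	cq->cqn = 0;		/* program the hardware queue here */
	ibcq->cqe = attr->cqe;
	return 0;		/* uobject/restrack handling stays in the core */
}

static void example_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
{
	/* release hardware resources only; the core kfree()s the object */
}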
diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c
new file mode 100644
index 0000000..2a3f2f0
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_device.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <rdma/uverbs_std_types.h>
+#include "rdma_core.h"
+#include "uverbs.h"
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/opa_addr.h>
+
+/*
+ * This ioctl method allows calling any defined write or write_ex
+ * handler. This essentially replaces the hdr/ex_hdr system with the ioctl
+ * marshalling, and brings the non-ex path into the same marshalling as the ex
+ * path.
+ */
+static int UVERBS_HANDLER(UVERBS_METHOD_INVOKE_WRITE)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct uverbs_api *uapi = attrs->ufile->device->uapi;
+	const struct uverbs_api_write_method *method_elm;
+	u32 cmd;
+	int rc;
+
+	rc = uverbs_get_const(&cmd, attrs, UVERBS_ATTR_WRITE_CMD);
+	if (rc)
+		return rc;
+
+	method_elm = uapi_get_method(uapi, cmd);
+	if (IS_ERR(method_elm))
+		return PTR_ERR(method_elm);
+
+	uverbs_fill_udata(attrs, &attrs->ucore, UVERBS_ATTR_CORE_IN,
+			  UVERBS_ATTR_CORE_OUT);
+
+	if (attrs->ucore.inlen < method_elm->req_size ||
+	    attrs->ucore.outlen < method_elm->resp_size)
+		return -ENOSPC;
+
+	return method_elm->handler(attrs);
+}
+
+DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_INVOKE_WRITE,
+			    UVERBS_ATTR_CONST_IN(UVERBS_ATTR_WRITE_CMD,
+						 enum ib_uverbs_write_cmds,
+						 UA_MANDATORY),
+			    UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CORE_IN,
+					       UVERBS_ATTR_MIN_SIZE(sizeof(u32)),
+					       UA_OPTIONAL),
+			    UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CORE_OUT,
+						UVERBS_ATTR_MIN_SIZE(0),
+						UA_OPTIONAL),
+			    UVERBS_ATTR_UHW());
+
+static uint32_t *
+gather_objects_handle(struct ib_uverbs_file *ufile,
+		      const struct uverbs_api_object *uapi_object,
+		      struct uverbs_attr_bundle *attrs,
+		      ssize_t out_len,
+		      u64 *total)
+{
+	u64 max_count = out_len / sizeof(u32);
+	struct ib_uobject *obj;
+	u64 count = 0;
+	u32 *handles;
+
+	/* Allocate memory that cannot be paged out, in which we gather
+	 * all object ids under a spin_lock.
+	 */
+	handles = uverbs_zalloc(attrs, out_len);
+	if (IS_ERR(handles))
+		return handles;
+
+	spin_lock_irq(&ufile->uobjects_lock);
+	list_for_each_entry(obj, &ufile->uobjects, list) {
+		u32 obj_id = obj->id;
+
+		if (obj->uapi_object != uapi_object)
+			continue;
+
+		if (count >= max_count)
+			break;
+
+		handles[count] = obj_id;
+		count++;
+	}
+	spin_unlock_irq(&ufile->uobjects_lock);
+
+	*total = count;
+	return handles;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_INFO_HANDLES)(
+	struct uverbs_attr_bundle *attrs)
+{
+	const struct uverbs_api_object *uapi_object;
+	ssize_t out_len;
+	u64 total = 0;
+	u16 object_id;
+	u32 *handles;
+	int ret;
+
+	out_len = uverbs_attr_get_len(attrs, UVERBS_ATTR_INFO_HANDLES_LIST);
+	if (out_len <= 0 || (out_len % sizeof(u32) != 0))
+		return -EINVAL;
+
+	ret = uverbs_get_const(&object_id, attrs, UVERBS_ATTR_INFO_OBJECT_ID);
+	if (ret)
+		return ret;
+
+	uapi_object = uapi_get_object(attrs->ufile->device->uapi, object_id);
+	if (!uapi_object)
+		return -EINVAL;
+
+	handles = gather_objects_handle(attrs->ufile, uapi_object, attrs,
+					out_len, &total);
+	if (IS_ERR(handles))
+		return PTR_ERR(handles);
+
+	ret = uverbs_copy_to(attrs, UVERBS_ATTR_INFO_HANDLES_LIST, handles,
+			     sizeof(u32) * total);
+	if (ret)
+		goto err;
+
+	ret = uverbs_copy_to(attrs, UVERBS_ATTR_INFO_TOTAL_HANDLES, &total,
+			     sizeof(total));
+err:
+	return ret;
+}
+
+void copy_port_attr_to_resp(struct ib_port_attr *attr,
+			    struct ib_uverbs_query_port_resp *resp,
+			    struct ib_device *ib_dev, u8 port_num)
+{
+	resp->state = attr->state;
+	resp->max_mtu = attr->max_mtu;
+	resp->active_mtu = attr->active_mtu;
+	resp->gid_tbl_len = attr->gid_tbl_len;
+	resp->port_cap_flags = make_port_cap_flags(attr);
+	resp->max_msg_sz = attr->max_msg_sz;
+	resp->bad_pkey_cntr = attr->bad_pkey_cntr;
+	resp->qkey_viol_cntr = attr->qkey_viol_cntr;
+	resp->pkey_tbl_len = attr->pkey_tbl_len;
+
+	if (rdma_is_grh_required(ib_dev, port_num))
+		resp->flags |= IB_UVERBS_QPF_GRH_REQUIRED;
+
+	if (rdma_cap_opa_ah(ib_dev, port_num)) {
+		resp->lid = OPA_TO_IB_UCAST_LID(attr->lid);
+		resp->sm_lid = OPA_TO_IB_UCAST_LID(attr->sm_lid);
+	} else {
+		resp->lid = ib_lid_cpu16(attr->lid);
+		resp->sm_lid = ib_lid_cpu16(attr->sm_lid);
+	}
+
+	resp->lmc = attr->lmc;
+	resp->max_vl_num = attr->max_vl_num;
+	resp->sm_sl = attr->sm_sl;
+	resp->subnet_timeout = attr->subnet_timeout;
+	resp->init_type_reply = attr->init_type_reply;
+	resp->active_width = attr->active_width;
+	resp->active_speed = attr->active_speed;
+	resp->phys_state = attr->phys_state;
+	resp->link_layer = rdma_port_get_link_layer(ib_dev, port_num);
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_device *ib_dev;
+	struct ib_port_attr attr = {};
+	struct ib_uverbs_query_port_resp_ex resp = {};
+	struct ib_ucontext *ucontext;
+	int ret;
+	u8 port_num;
+
+	ucontext = ib_uverbs_get_ucontext(attrs);
+	if (IS_ERR(ucontext))
+		return PTR_ERR(ucontext);
+	ib_dev = ucontext->device;
+
+	/* FIXME: Extend the UAPI_DEF_OBJ_NEEDS_FN stuff.. */
+	if (!ib_dev->ops.query_port)
+		return -EOPNOTSUPP;
+
+	ret = uverbs_get_const(&port_num, attrs,
+			       UVERBS_ATTR_QUERY_PORT_PORT_NUM);
+	if (ret)
+		return ret;
+
+	ret = ib_query_port(ib_dev, port_num, &attr);
+	if (ret)
+		return ret;
+
+	copy_port_attr_to_resp(&attr, &resp.legacy_resp, ib_dev, port_num);
+	resp.port_cap_flags2 = attr.port_cap_flags2;
+
+	return uverbs_copy_to_struct_or_zero(attrs, UVERBS_ATTR_QUERY_PORT_RESP,
+					     &resp, sizeof(resp));
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+	UVERBS_METHOD_INFO_HANDLES,
+	/* Also includes any device specific object ids */
+	UVERBS_ATTR_CONST_IN(UVERBS_ATTR_INFO_OBJECT_ID,
+			     enum uverbs_default_objects, UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_INFO_TOTAL_HANDLES,
+			    UVERBS_ATTR_TYPE(u32), UA_OPTIONAL),
+	UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_INFO_HANDLES_LIST,
+			    UVERBS_ATTR_MIN_SIZE(sizeof(u32)), UA_OPTIONAL));
+
+DECLARE_UVERBS_NAMED_METHOD(
+	UVERBS_METHOD_QUERY_PORT,
+	UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_PORT_PORT_NUM, u8, UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(
+		UVERBS_ATTR_QUERY_PORT_RESP,
+		UVERBS_ATTR_STRUCT(struct ib_uverbs_query_port_resp_ex,
+				   reserved),
+		UA_MANDATORY));
+
+DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE,
+			      &UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE),
+			      &UVERBS_METHOD(UVERBS_METHOD_INFO_HANDLES),
+			      &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT));
+
+const struct uapi_definition uverbs_def_obj_device[] = {
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DEVICE),
+	{},
+};
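
UVERBS_METHOD_QUERY_PORT in this new file is the ioctl-path backend for the
ordinary port query verb; with an rdma-core build that routes the call
through the ioctl uAPI, the plain libibverbs program below ends up in this
handler. This is a minimal userspace sketch with error handling trimmed;
port 1 and the first listed device are assumptions:

#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
	struct ibv_device **devs = ibv_get_device_list(NULL);
	struct ibv_context *ctx;
	struct ibv_port_attr attr;

	if (!devs || !devs[0])
		return 1;

	ctx = ibv_open_device(devs[0]);
	if (ctx && !ibv_query_port(ctx, 1, &attr))
		printf("port 1: state=%d active_mtu=%d lid=%u\n",
		       attr.state, attr.active_mtu, (unsigned int)attr.lid);

	if (ctx)
		ibv_close_device(ctx);
	ibv_free_device_list(devs);
	return 0;
}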
diff --git a/drivers/infiniband/core/uverbs_std_types_dm.c b/drivers/infiniband/core/uverbs_std_types_dm.c
index edc3ff7..d5a1de3 100644
--- a/drivers/infiniband/core/uverbs_std_types_dm.c
+++ b/drivers/infiniband/core/uverbs_std_types_dm.c
@@ -30,11 +30,13 @@
  * SOFTWARE.
  */
 
+#include "rdma_core.h"
 #include "uverbs.h"
 #include <rdma/uverbs_std_types.h>
 
 static int uverbs_free_dm(struct ib_uobject *uobject,
-			  enum rdma_remove_reason why)
+			  enum rdma_remove_reason why,
+			  struct uverbs_attr_bundle *attrs)
 {
 	struct ib_dm *dm = uobject->object;
 	int ret;
@@ -43,22 +45,21 @@
 	if (ret)
 		return ret;
 
-	return dm->device->dealloc_dm(dm);
+	return dm->device->ops.dealloc_dm(dm, attrs);
 }
 
-static int
-UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(struct ib_uverbs_file *file,
-				       struct uverbs_attr_bundle *attrs)
+static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(
+	struct uverbs_attr_bundle *attrs)
 {
 	struct ib_dm_alloc_attr attr = {};
 	struct ib_uobject *uobj =
 		uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DM_HANDLE)
 			->obj_attr.uobject;
-	struct ib_device *ib_dev = uobj->context->device;
+	struct ib_device *ib_dev = attrs->context->device;
 	struct ib_dm *dm;
 	int ret;
 
-	if (!ib_dev->alloc_dm)
+	if (!ib_dev->ops.alloc_dm)
 		return -EOPNOTSUPP;
 
 	ret = uverbs_copy_from(&attr.length, attrs,
@@ -71,7 +72,7 @@
 	if (ret)
 		return ret;
 
-	dm = ib_dev->alloc_dm(ib_dev, uobj->context, &attr, attrs);
+	dm = ib_dev->ops.alloc_dm(ib_dev, attrs->context, &attr, attrs);
 	if (IS_ERR(dm))
 		return PTR_ERR(dm);
 
@@ -109,3 +110,9 @@
 			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_dm),
 			    &UVERBS_METHOD(UVERBS_METHOD_DM_ALLOC),
 			    &UVERBS_METHOD(UVERBS_METHOD_DM_FREE));
+
+const struct uapi_definition uverbs_def_obj_dm[] = {
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DM,
+				      UAPI_DEF_OBJ_NEEDS_FN(dealloc_dm)),
+	{}
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c
index d8cfafe..459cf16 100644
--- a/drivers/infiniband/core/uverbs_std_types_flow_action.c
+++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c
@@ -30,11 +30,13 @@
  * SOFTWARE.
  */
 
+#include "rdma_core.h"
 #include "uverbs.h"
 #include <rdma/uverbs_std_types.h>
 
 static int uverbs_free_flow_action(struct ib_uobject *uobject,
-				   enum rdma_remove_reason why)
+				   enum rdma_remove_reason why,
+				   struct uverbs_attr_bundle *attrs)
 {
 	struct ib_flow_action *action = uobject->object;
 	int ret;
@@ -43,7 +45,7 @@
 	if (ret)
 		return ret;
 
-	return action->device->destroy_flow_action(action);
+	return action->device->ops.destroy_flow_action(action);
 }
 
 static u64 esp_flags_uverbs_to_verbs(struct uverbs_attr_bundle *attrs,
@@ -223,7 +225,6 @@
 
 #define ESP_LAST_SUPPORTED_FLAG		IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW
 static int parse_flow_action_esp(struct ib_device *ib_dev,
-				 struct ib_uverbs_file *file,
 				 struct uverbs_attr_bundle *attrs,
 				 struct ib_flow_action_esp_attr *esp_attr,
 				 bool is_modify)
@@ -305,38 +306,36 @@
 }
 
 static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(
-	struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+	struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uobject *uobj = uverbs_attr_get_uobject(
 		attrs, UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE);
-	struct ib_device *ib_dev = uobj->context->device;
+	struct ib_device *ib_dev = attrs->context->device;
 	int				  ret;
 	struct ib_flow_action		  *action;
 	struct ib_flow_action_esp_attr	  esp_attr = {};
 
-	if (!ib_dev->create_flow_action_esp)
+	if (!ib_dev->ops.create_flow_action_esp)
 		return -EOPNOTSUPP;
 
-	ret = parse_flow_action_esp(ib_dev, file, attrs, &esp_attr, false);
+	ret = parse_flow_action_esp(ib_dev, attrs, &esp_attr, false);
 	if (ret)
 		return ret;
 
 	/* No need to check as this attribute is marked as MANDATORY */
-	action = ib_dev->create_flow_action_esp(ib_dev, &esp_attr.hdr, attrs);
+	action = ib_dev->ops.create_flow_action_esp(ib_dev, &esp_attr.hdr,
+						    attrs);
 	if (IS_ERR(action))
 		return PTR_ERR(action);
 
-	atomic_set(&action->usecnt, 0);
-	action->device = ib_dev;
-	action->type = IB_FLOW_ACTION_ESP;
-	action->uobject = uobj;
-	uobj->object = action;
+	uverbs_flow_action_fill_action(action, uobj, ib_dev,
+				       IB_FLOW_ACTION_ESP);
 
 	return 0;
 }
 
 static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)(
-	struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+	struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uobject *uobj = uverbs_attr_get_uobject(
 		attrs, UVERBS_ATTR_MODIFY_FLOW_ACTION_ESP_HANDLE);
@@ -344,19 +343,19 @@
 	int				  ret;
 	struct ib_flow_action_esp_attr	  esp_attr = {};
 
-	if (!action->device->modify_flow_action_esp)
+	if (!action->device->ops.modify_flow_action_esp)
 		return -EOPNOTSUPP;
 
-	ret = parse_flow_action_esp(action->device, file, attrs, &esp_attr,
-				    true);
+	ret = parse_flow_action_esp(action->device, attrs, &esp_attr, true);
 	if (ret)
 		return ret;
 
 	if (action->type != IB_FLOW_ACTION_ESP)
 		return -EINVAL;
 
-	return action->device->modify_flow_action_esp(action, &esp_attr.hdr,
-						      attrs);
+	return action->device->ops.modify_flow_action_esp(action,
+							  &esp_attr.hdr,
+							  attrs);
 }
 
 static const struct uverbs_attr_spec uverbs_flow_action_esp_keymat[] = {
@@ -441,3 +440,10 @@
 	&UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE),
 	&UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_DESTROY),
 	&UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY));
+
+const struct uapi_definition uverbs_def_obj_flow_action[] = {
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+		UVERBS_OBJECT_FLOW_ACTION,
+		UAPI_DEF_OBJ_NEEDS_FN(destroy_flow_action)),
+	{}
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c
index cf02e77..c1286a5 100644
--- a/drivers/infiniband/core/uverbs_std_types_mr.c
+++ b/drivers/infiniband/core/uverbs_std_types_mr.c
@@ -30,17 +30,56 @@
  * SOFTWARE.
  */
 
+#include "rdma_core.h"
 #include "uverbs.h"
 #include <rdma/uverbs_std_types.h>
 
 static int uverbs_free_mr(struct ib_uobject *uobject,
-			  enum rdma_remove_reason why)
+			  enum rdma_remove_reason why,
+			  struct uverbs_attr_bundle *attrs)
 {
-	return ib_dereg_mr((struct ib_mr *)uobject->object);
+	return ib_dereg_mr_user((struct ib_mr *)uobject->object,
+				&attrs->driver_udata);
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_ADVISE_MR)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_pd *pd =
+		uverbs_attr_get_obj(attrs, UVERBS_ATTR_ADVISE_MR_PD_HANDLE);
+	enum ib_uverbs_advise_mr_advice advice;
+	struct ib_device *ib_dev = pd->device;
+	struct ib_sge *sg_list;
+	int num_sge;
+	u32 flags;
+	int ret;
+
+	/* FIXME: Extend the UAPI_DEF_OBJ_NEEDS_FN stuff.. */
+	if (!ib_dev->ops.advise_mr)
+		return -EOPNOTSUPP;
+
+	ret = uverbs_get_const(&advice, attrs, UVERBS_ATTR_ADVISE_MR_ADVICE);
+	if (ret)
+		return ret;
+
+	ret = uverbs_get_flags32(&flags, attrs, UVERBS_ATTR_ADVISE_MR_FLAGS,
+				 IB_UVERBS_ADVISE_MR_FLAG_FLUSH);
+	if (ret)
+		return ret;
+
+	num_sge = uverbs_attr_ptr_get_array_size(
+		attrs, UVERBS_ATTR_ADVISE_MR_SGE_LIST, sizeof(struct ib_sge));
+	if (num_sge < 0)
+		return num_sge;
+
+	sg_list = uverbs_attr_get_alloced_ptr(attrs,
+					      UVERBS_ATTR_ADVISE_MR_SGE_LIST);
+	return ib_dev->ops.advise_mr(pd, advice, flags, sg_list, num_sge,
+				     attrs);
 }
 
 static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(
-	struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+	struct uverbs_attr_bundle *attrs)
 {
 	struct ib_dm_mr_attr attr = {};
 	struct ib_uobject *uobj =
@@ -54,7 +93,7 @@
 	struct ib_mr *mr;
 	int ret;
 
-	if (!ib_dev->reg_dm_mr)
+	if (!ib_dev->ops.reg_dm_mr)
 		return -EOPNOTSUPP;
 
 	ret = uverbs_copy_from(&attr.offset, attrs, UVERBS_ATTR_REG_DM_MR_OFFSET);
@@ -83,12 +122,13 @@
 	    attr.length > dm->length - attr.offset)
 		return -EINVAL;
 
-	mr = pd->device->reg_dm_mr(pd, dm, &attr, attrs);
+	mr = pd->device->ops.reg_dm_mr(pd, dm, &attr, attrs);
 	if (IS_ERR(mr))
 		return PTR_ERR(mr);
 
 	mr->device  = pd->device;
 	mr->pd      = pd;
+	mr->type    = IB_MR_TYPE_DM;
 	mr->dm      = dm;
 	mr->uobject = uobj;
 	atomic_inc(&pd->usecnt);
@@ -109,12 +149,29 @@
 	return 0;
 
 err_dereg:
-	ib_dereg_mr(mr);
+	ib_dereg_mr_user(mr, uverbs_get_cleared_udata(attrs));
 
 	return ret;
 }
 
 DECLARE_UVERBS_NAMED_METHOD(
+	UVERBS_METHOD_ADVISE_MR,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_ADVISE_MR_PD_HANDLE,
+			UVERBS_OBJECT_PD,
+			UVERBS_ACCESS_READ,
+			UA_MANDATORY),
+	UVERBS_ATTR_CONST_IN(UVERBS_ATTR_ADVISE_MR_ADVICE,
+			     enum ib_uverbs_advise_mr_advice,
+			     UA_MANDATORY),
+	UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_ADVISE_MR_FLAGS,
+			     enum ib_uverbs_advise_mr_flag,
+			     UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ADVISE_MR_SGE_LIST,
+			   UVERBS_ATTR_MIN_SIZE(sizeof(struct ib_uverbs_sge)),
+			   UA_MANDATORY,
+			   UA_ALLOC_AND_COPY));
+
+DECLARE_UVERBS_NAMED_METHOD(
 	UVERBS_METHOD_DM_MR_REG,
 	UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_HANDLE,
 			UVERBS_OBJECT_MR,
@@ -143,7 +200,22 @@
 			    UVERBS_ATTR_TYPE(u32),
 			    UA_MANDATORY));
 
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	UVERBS_METHOD_MR_DESTROY,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MR_HANDLE,
+			UVERBS_OBJECT_MR,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
 DECLARE_UVERBS_NAMED_OBJECT(
 	UVERBS_OBJECT_MR,
 	UVERBS_TYPE_ALLOC_IDR(uverbs_free_mr),
-	&UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG));
+	&UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG),
+	&UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY),
+	&UVERBS_METHOD(UVERBS_METHOD_ADVISE_MR));
+
+const struct uapi_definition uverbs_def_obj_mr[] = {
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR,
+				      UAPI_DEF_OBJ_NEEDS_FN(dereg_mr)),
+	{}
+};
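
UVERBS_METHOD_ADVISE_MR added above is the kernel side of the memory advice
verb that rdma-core exposes as ibv_advise_mr(). The sketch below issues a
prefetch hint over part of an already registered on-demand-paging MR; the
pd/mr/addr/len arguments are assumed to exist, the device must support ODP
plus the advise verb, and flags is left at 0 (a flush flag exists for
synchronous behaviour):

#include <stdint.h>
#include <infiniband/verbs.h>

/* Ask the provider to pre-fault part of an ODP-registered MR so the first
 * RDMA access does not take the page-fault slow path.
 */
static int prefetch_region(struct ibv_pd *pd, struct ibv_mr *mr,
			   void *addr, uint32_t len)
{
	struct ibv_sge sge = {
		.addr = (uintptr_t)addr,
		.length = len,
		.lkey = mr->lkey,
	};

	return ibv_advise_mr(pd, IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE,
			     0, &sge, 1);
}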
diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c
index be85462..00c5478 100644
--- a/drivers/infiniband/core/uverbs_uapi.c
+++ b/drivers/infiniband/core/uverbs_uapi.c
@@ -8,6 +8,11 @@
 #include "rdma_core.h"
 #include "uverbs.h"
 
+static int ib_uverbs_notsupp(struct uverbs_attr_bundle *attrs)
+{
+	return -EOPNOTSUPP;
+}
+
 static void *uapi_add_elm(struct uverbs_api *uapi, u32 key, size_t alloc_size)
 {
 	void *elm;
@@ -17,6 +22,8 @@
 		return ERR_PTR(-EOVERFLOW);
 
 	elm = kzalloc(alloc_size, GFP_KERNEL);
+	if (!elm)
+		return ERR_PTR(-ENOMEM);
 	rc = radix_tree_insert(&uapi->radix, key, elm);
 	if (rc) {
 		kfree(elm);
@@ -26,6 +33,70 @@
 	return elm;
 }
 
+static void *uapi_add_get_elm(struct uverbs_api *uapi, u32 key,
+			      size_t alloc_size, bool *exists)
+{
+	void *elm;
+
+	elm = uapi_add_elm(uapi, key, alloc_size);
+	if (!IS_ERR(elm)) {
+		*exists = false;
+		return elm;
+	}
+
+	if (elm != ERR_PTR(-EEXIST))
+		return elm;
+
+	elm = radix_tree_lookup(&uapi->radix, key);
+	if (WARN_ON(!elm))
+		return ERR_PTR(-EINVAL);
+	*exists = true;
+	return elm;
+}
+
+static int uapi_create_write(struct uverbs_api *uapi,
+			     struct ib_device *ibdev,
+			     const struct uapi_definition *def,
+			     u32 obj_key,
+			     u32 *cur_method_key)
+{
+	struct uverbs_api_write_method *method_elm;
+	u32 method_key = obj_key;
+	bool exists;
+
+	if (def->write.is_ex)
+		method_key |= uapi_key_write_ex_method(def->write.command_num);
+	else
+		method_key |= uapi_key_write_method(def->write.command_num);
+
+	method_elm = uapi_add_get_elm(uapi, method_key, sizeof(*method_elm),
+				      &exists);
+	if (IS_ERR(method_elm))
+		return PTR_ERR(method_elm);
+
+	if (WARN_ON(exists && (def->write.is_ex != method_elm->is_ex)))
+		return -EINVAL;
+
+	method_elm->is_ex = def->write.is_ex;
+	method_elm->handler = def->func_write;
+	if (def->write.is_ex)
+		method_elm->disabled = !(ibdev->uverbs_ex_cmd_mask &
+					 BIT_ULL(def->write.command_num));
+	else
+		method_elm->disabled = !(ibdev->uverbs_cmd_mask &
+					 BIT_ULL(def->write.command_num));
+
+	if (!def->write.is_ex && def->func_write) {
+		method_elm->has_udata = def->write.has_udata;
+		method_elm->has_resp = def->write.has_resp;
+		method_elm->req_size = def->write.req_size;
+		method_elm->resp_size = def->write.resp_size;
+	}
+
+	*cur_method_key = method_key;
+	return 0;
+}
+
 static int uapi_merge_method(struct uverbs_api *uapi,
 			     struct uverbs_api_object *obj_elm, u32 obj_key,
 			     const struct uverbs_method_def *method,
@@ -34,23 +105,21 @@
 	u32 method_key = obj_key | uapi_key_ioctl_method(method->id);
 	struct uverbs_api_ioctl_method *method_elm;
 	unsigned int i;
+	bool exists;
 
 	if (!method->attrs)
 		return 0;
 
-	method_elm = uapi_add_elm(uapi, method_key, sizeof(*method_elm));
-	if (IS_ERR(method_elm)) {
-		if (method_elm != ERR_PTR(-EEXIST))
-			return PTR_ERR(method_elm);
-
+	method_elm = uapi_add_get_elm(uapi, method_key, sizeof(*method_elm),
+				      &exists);
+	if (IS_ERR(method_elm))
+		return PTR_ERR(method_elm);
+	if (exists) {
 		/*
 		 * This occurs when a driver uses ADD_UVERBS_ATTRIBUTES_SIMPLE
 		 */
 		if (WARN_ON(method->handler))
 			return -EINVAL;
-		method_elm = radix_tree_lookup(&uapi->radix, method_key);
-		if (WARN_ON(!method_elm))
-			return -EINVAL;
 	} else {
 		WARN_ON(!method->handler);
 		rcu_assign_pointer(method_elm->handler, method->handler);
@@ -73,6 +142,18 @@
 		if (attr->attr.type == UVERBS_ATTR_TYPE_ENUM_IN)
 			method_elm->driver_method |= is_driver;
 
+		/*
+		 * Like other uobject based things we only support a single
+		 * uobject being NEW'd or DESTROY'd
+		 */
+		if (attr->attr.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) {
+			u8 access = attr->attr.u2.objs_arr.access;
+
+			if (WARN_ON(access == UVERBS_ACCESS_NEW ||
+				    access == UVERBS_ACCESS_DESTROY))
+				return -EINVAL;
+		}
+
 		attr_slot =
 			uapi_add_elm(uapi, method_key | uapi_key_attr(attr->id),
 				     sizeof(*attr_slot));
@@ -86,74 +167,190 @@
 	return 0;
 }
 
-static int uapi_merge_tree(struct uverbs_api *uapi,
-			   const struct uverbs_object_tree_def *tree,
-			   bool is_driver)
+static int uapi_merge_obj_tree(struct uverbs_api *uapi,
+			       const struct uverbs_object_def *obj,
+			       bool is_driver)
 {
-	unsigned int i, j;
+	struct uverbs_api_object *obj_elm;
+	unsigned int i;
+	u32 obj_key;
+	bool exists;
 	int rc;
 
-	if (!tree->objects)
+	obj_key = uapi_key_obj(obj->id);
+	obj_elm = uapi_add_get_elm(uapi, obj_key, sizeof(*obj_elm), &exists);
+	if (IS_ERR(obj_elm))
+		return PTR_ERR(obj_elm);
+
+	if (obj->type_attrs) {
+		if (WARN_ON(obj_elm->type_attrs))
+			return -EINVAL;
+
+		obj_elm->id = obj->id;
+		obj_elm->type_attrs = obj->type_attrs;
+		obj_elm->type_class = obj->type_attrs->type_class;
+		/*
+		 * Today drivers are only permitted to use idr_class and
+		 * fd_class types. We can revoke the IDR types during
+		 * disassociation, and the FD types require the driver to use
+		 * struct file_operations.owner to prevent the driver module
+		 * code from unloading while the file is open. This provides
+		 * enough safety that uverbs_close_fd() will continue to work.
+		 * Drivers using FD are responsible to handle disassociation of
+		 * the device on their own.
+		 */
+		if (WARN_ON(is_driver &&
+			    obj->type_attrs->type_class != &uverbs_idr_class &&
+			    obj->type_attrs->type_class != &uverbs_fd_class))
+			return -EINVAL;
+	}
+
+	if (!obj->methods)
 		return 0;
 
-	for (i = 0; i != tree->num_objects; i++) {
-		const struct uverbs_object_def *obj = (*tree->objects)[i];
-		struct uverbs_api_object *obj_elm;
-		u32 obj_key;
+	for (i = 0; i != obj->num_methods; i++) {
+		const struct uverbs_method_def *method = (*obj->methods)[i];
 
-		if (!obj)
+		if (!method)
 			continue;
 
-		obj_key = uapi_key_obj(obj->id);
-		obj_elm = uapi_add_elm(uapi, obj_key, sizeof(*obj_elm));
-		if (IS_ERR(obj_elm)) {
-			if (obj_elm != ERR_PTR(-EEXIST))
-				return PTR_ERR(obj_elm);
-
-			/* This occurs when a driver uses ADD_UVERBS_METHODS */
-			if (WARN_ON(obj->type_attrs))
-				return -EINVAL;
-			obj_elm = radix_tree_lookup(&uapi->radix, obj_key);
-			if (WARN_ON(!obj_elm))
-				return -EINVAL;
-		} else {
-			obj_elm->type_attrs = obj->type_attrs;
-			if (obj->type_attrs) {
-				obj_elm->type_class =
-					obj->type_attrs->type_class;
-				/*
-				 * Today drivers are only permitted to use
-				 * idr_class types. They cannot use FD types
-				 * because we currently have no way to revoke
-				 * the fops pointer after device
-				 * disassociation.
-				 */
-				if (WARN_ON(is_driver &&
-					    obj->type_attrs->type_class !=
-						    &uverbs_idr_class))
-					return -EINVAL;
-			}
-		}
-
-		if (!obj->methods)
-			continue;
-
-		for (j = 0; j != obj->num_methods; j++) {
-			const struct uverbs_method_def *method =
-				(*obj->methods)[j];
-			if (!method)
-				continue;
-
-			rc = uapi_merge_method(uapi, obj_elm, obj_key, method,
-					       is_driver);
-			if (rc)
-				return rc;
-		}
+		rc = uapi_merge_method(uapi, obj_elm, obj_key, method,
+				       is_driver);
+		if (rc)
+			return rc;
 	}
 
 	return 0;
 }
 
+static int uapi_disable_elm(struct uverbs_api *uapi,
+			    const struct uapi_definition *def,
+			    u32 obj_key,
+			    u32 method_key)
+{
+	bool exists;
+
+	if (def->scope == UAPI_SCOPE_OBJECT) {
+		struct uverbs_api_object *obj_elm;
+
+		obj_elm = uapi_add_get_elm(
+			uapi, obj_key, sizeof(*obj_elm), &exists);
+		if (IS_ERR(obj_elm))
+			return PTR_ERR(obj_elm);
+		obj_elm->disabled = 1;
+		return 0;
+	}
+
+	if (def->scope == UAPI_SCOPE_METHOD &&
+	    uapi_key_is_ioctl_method(method_key)) {
+		struct uverbs_api_ioctl_method *method_elm;
+
+		method_elm = uapi_add_get_elm(uapi, method_key,
+					      sizeof(*method_elm), &exists);
+		if (IS_ERR(method_elm))
+			return PTR_ERR(method_elm);
+		method_elm->disabled = 1;
+		return 0;
+	}
+
+	if (def->scope == UAPI_SCOPE_METHOD &&
+	    (uapi_key_is_write_method(method_key) ||
+	     uapi_key_is_write_ex_method(method_key))) {
+		struct uverbs_api_write_method *write_elm;
+
+		write_elm = uapi_add_get_elm(uapi, method_key,
+					     sizeof(*write_elm), &exists);
+		if (IS_ERR(write_elm))
+			return PTR_ERR(write_elm);
+		write_elm->disabled = 1;
+		return 0;
+	}
+
+	WARN_ON(true);
+	return -EINVAL;
+}
+
+static int uapi_merge_def(struct uverbs_api *uapi, struct ib_device *ibdev,
+			  const struct uapi_definition *def_list,
+			  bool is_driver)
+{
+	const struct uapi_definition *def = def_list;
+	u32 cur_obj_key = UVERBS_API_KEY_ERR;
+	u32 cur_method_key = UVERBS_API_KEY_ERR;
+	bool exists;
+	int rc;
+
+	if (!def_list)
+		return 0;
+
+	for (;; def++) {
+		switch ((enum uapi_definition_kind)def->kind) {
+		case UAPI_DEF_CHAIN:
+			rc = uapi_merge_def(uapi, ibdev, def->chain, is_driver);
+			if (rc)
+				return rc;
+			continue;
+
+		case UAPI_DEF_CHAIN_OBJ_TREE:
+			if (WARN_ON(def->object_start.object_id !=
+				    def->chain_obj_tree->id))
+				return -EINVAL;
+
+			cur_obj_key = uapi_key_obj(def->object_start.object_id);
+			rc = uapi_merge_obj_tree(uapi, def->chain_obj_tree,
+						 is_driver);
+			if (rc)
+				return rc;
+			continue;
+
+		case UAPI_DEF_END:
+			return 0;
+
+		case UAPI_DEF_IS_SUPPORTED_DEV_FN: {
+			void **ibdev_fn =
+				(void *)(&ibdev->ops) + def->needs_fn_offset;
+
+			if (*ibdev_fn)
+				continue;
+			rc = uapi_disable_elm(
+				uapi, def, cur_obj_key, cur_method_key);
+			if (rc)
+				return rc;
+			continue;
+		}
+
+		case UAPI_DEF_IS_SUPPORTED_FUNC:
+			if (def->func_is_supported(ibdev))
+				continue;
+			rc = uapi_disable_elm(
+				uapi, def, cur_obj_key, cur_method_key);
+			if (rc)
+				return rc;
+			continue;
+
+		case UAPI_DEF_OBJECT_START: {
+			struct uverbs_api_object *obj_elm;
+
+			cur_obj_key = uapi_key_obj(def->object_start.object_id);
+			obj_elm = uapi_add_get_elm(uapi, cur_obj_key,
+						   sizeof(*obj_elm), &exists);
+			if (IS_ERR(obj_elm))
+				return PTR_ERR(obj_elm);
+			continue;
+		}
+
+		case UAPI_DEF_WRITE:
+			rc = uapi_create_write(
+				uapi, ibdev, def, cur_obj_key, &cur_method_key);
+			if (rc)
+				return rc;
+			continue;
+		}
+		WARN_ON(true);
+		return -EINVAL;
+	}
+}
+
 static int
 uapi_finalize_ioctl_method(struct uverbs_api *uapi,
 			   struct uverbs_api_ioctl_method *method_elm,
@@ -174,13 +371,16 @@
 		u32 attr_bkey = uapi_bkey_attr(attr_key);
 		u8 type = elm->spec.type;
 
-		if (uapi_key_attr_to_method(iter.index) !=
-		    uapi_key_attr_to_method(method_key))
+		if (uapi_key_attr_to_ioctl_method(iter.index) !=
+		    uapi_key_attr_to_ioctl_method(method_key))
 			break;
 
 		if (elm->spec.mandatory)
 			__set_bit(attr_bkey, method_elm->attr_mandatory);
 
+		if (elm->spec.is_udata)
+			method_elm->has_udata = true;
+
 		if (type == UVERBS_ATTR_TYPE_IDR ||
 		    type == UVERBS_ATTR_TYPE_FD) {
 			u8 access = elm->spec.u.obj.access;
@@ -217,9 +417,13 @@
 
 static int uapi_finalize(struct uverbs_api *uapi)
 {
+	const struct uverbs_api_write_method **data;
+	unsigned long max_write_ex = 0;
+	unsigned long max_write = 0;
 	struct radix_tree_iter iter;
 	void __rcu **slot;
 	int rc;
+	int i;
 
 	radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) {
 		struct uverbs_api_ioctl_method *method_elm =
@@ -231,29 +435,209 @@
 			if (rc)
 				return rc;
 		}
+
+		if (uapi_key_is_write_method(iter.index))
+			max_write = max(max_write,
+					iter.index & UVERBS_API_ATTR_KEY_MASK);
+		if (uapi_key_is_write_ex_method(iter.index))
+			max_write_ex =
+				max(max_write_ex,
+				    iter.index & UVERBS_API_ATTR_KEY_MASK);
+	}
+
+	uapi->notsupp_method.handler = ib_uverbs_notsupp;
+	uapi->num_write = max_write + 1;
+	uapi->num_write_ex = max_write_ex + 1;
+	data = kmalloc_array(uapi->num_write + uapi->num_write_ex,
+			     sizeof(*uapi->write_methods), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	for (i = 0; i != uapi->num_write + uapi->num_write_ex; i++)
+		data[i] = &uapi->notsupp_method;
+	uapi->write_methods = data;
+	uapi->write_ex_methods = data + uapi->num_write;
+
+	radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) {
+		if (uapi_key_is_write_method(iter.index))
+			uapi->write_methods[iter.index &
+					    UVERBS_API_ATTR_KEY_MASK] =
+				rcu_dereference_protected(*slot, true);
+		if (uapi_key_is_write_ex_method(iter.index))
+			uapi->write_ex_methods[iter.index &
+					       UVERBS_API_ATTR_KEY_MASK] =
+				rcu_dereference_protected(*slot, true);
 	}
 
 	return 0;
 }
 
-void uverbs_destroy_api(struct uverbs_api *uapi)
+static void uapi_remove_range(struct uverbs_api *uapi, u32 start, u32 last)
 {
 	struct radix_tree_iter iter;
 	void __rcu **slot;
 
-	if (!uapi)
-		return;
-
-	radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) {
+	radix_tree_for_each_slot (slot, &uapi->radix, &iter, start) {
+		if (iter.index > last)
+			return;
 		kfree(rcu_dereference_protected(*slot, true));
 		radix_tree_iter_delete(&uapi->radix, &iter, slot);
 	}
+}
+
+static void uapi_remove_object(struct uverbs_api *uapi, u32 obj_key)
+{
+	uapi_remove_range(uapi, obj_key,
+			  obj_key | UVERBS_API_METHOD_KEY_MASK |
+				  UVERBS_API_ATTR_KEY_MASK);
+}
+
+static void uapi_remove_method(struct uverbs_api *uapi, u32 method_key)
+{
+	uapi_remove_range(uapi, method_key,
+			  method_key | UVERBS_API_ATTR_KEY_MASK);
+}
+
+static u32 uapi_get_obj_id(struct uverbs_attr_spec *spec)
+{
+	if (spec->type == UVERBS_ATTR_TYPE_IDR ||
+	    spec->type == UVERBS_ATTR_TYPE_FD)
+		return spec->u.obj.obj_type;
+	if (spec->type == UVERBS_ATTR_TYPE_IDRS_ARRAY)
+		return spec->u2.objs_arr.obj_type;
+	return UVERBS_API_KEY_ERR;
+}
+
+static void uapi_key_okay(u32 key)
+{
+	unsigned int count = 0;
+
+	if (uapi_key_is_object(key))
+		count++;
+	if (uapi_key_is_ioctl_method(key))
+		count++;
+	if (uapi_key_is_write_method(key))
+		count++;
+	if (uapi_key_is_write_ex_method(key))
+		count++;
+	if (uapi_key_is_attr(key))
+		count++;
+	WARN(count != 1, "Bad count %d key=%x", count, key);
+}
+
+static void uapi_finalize_disable(struct uverbs_api *uapi)
+{
+	struct radix_tree_iter iter;
+	u32 starting_key = 0;
+	bool scan_again = false;
+	void __rcu **slot;
+
+again:
+	radix_tree_for_each_slot (slot, &uapi->radix, &iter, starting_key) {
+		uapi_key_okay(iter.index);
+
+		if (uapi_key_is_object(iter.index)) {
+			struct uverbs_api_object *obj_elm =
+				rcu_dereference_protected(*slot, true);
+
+			if (obj_elm->disabled) {
+				/* Have to check all the attrs again */
+				scan_again = true;
+				starting_key = iter.index;
+				uapi_remove_object(uapi, iter.index);
+				goto again;
+			}
+			continue;
+		}
+
+		if (uapi_key_is_ioctl_method(iter.index)) {
+			struct uverbs_api_ioctl_method *method_elm =
+				rcu_dereference_protected(*slot, true);
+
+			if (method_elm->disabled) {
+				starting_key = iter.index;
+				uapi_remove_method(uapi, iter.index);
+				goto again;
+			}
+			continue;
+		}
+
+		if (uapi_key_is_write_method(iter.index) ||
+		    uapi_key_is_write_ex_method(iter.index)) {
+			struct uverbs_api_write_method *method_elm =
+				rcu_dereference_protected(*slot, true);
+
+			if (method_elm->disabled) {
+				kfree(method_elm);
+				radix_tree_iter_delete(&uapi->radix, &iter, slot);
+			}
+			continue;
+		}
+
+		if (uapi_key_is_attr(iter.index)) {
+			struct uverbs_api_attr *attr_elm =
+				rcu_dereference_protected(*slot, true);
+			const struct uverbs_api_object *tmp_obj;
+			u32 obj_key;
+
+			/*
+			 * If the method has a mandatory object handle
+			 * attribute which relies on an object which is not
+			 * present then the entire method is uncallable.
+			 */
+			if (!attr_elm->spec.mandatory)
+				continue;
+			obj_key = uapi_get_obj_id(&attr_elm->spec);
+			if (obj_key == UVERBS_API_KEY_ERR)
+				continue;
+			tmp_obj = uapi_get_object(uapi, obj_key);
+			if (IS_ERR(tmp_obj)) {
+				if (PTR_ERR(tmp_obj) == -ENOMSG)
+					continue;
+			} else {
+				if (!tmp_obj->disabled)
+					continue;
+			}
+
+			starting_key = iter.index;
+			uapi_remove_method(
+				uapi,
+				iter.index & (UVERBS_API_OBJ_KEY_MASK |
+					      UVERBS_API_METHOD_KEY_MASK));
+			goto again;
+		}
+
+		WARN_ON(false);
+	}
+
+	if (!scan_again)
+		return;
+	scan_again = false;
+	starting_key = 0;
+	goto again;
+}
+
+void uverbs_destroy_api(struct uverbs_api *uapi)
+{
+	if (!uapi)
+		return;
+
+	uapi_remove_range(uapi, 0, U32_MAX);
+	kfree(uapi->write_methods);
 	kfree(uapi);
 }
 
-struct uverbs_api *uverbs_alloc_api(
-	const struct uverbs_object_tree_def *const *driver_specs,
-	enum rdma_driver_id driver_id)
+static const struct uapi_definition uverbs_core_api[] = {
+	UAPI_DEF_CHAIN(uverbs_def_obj_counters),
+	UAPI_DEF_CHAIN(uverbs_def_obj_cq),
+	UAPI_DEF_CHAIN(uverbs_def_obj_device),
+	UAPI_DEF_CHAIN(uverbs_def_obj_dm),
+	UAPI_DEF_CHAIN(uverbs_def_obj_flow_action),
+	UAPI_DEF_CHAIN(uverbs_def_obj_intf),
+	UAPI_DEF_CHAIN(uverbs_def_obj_mr),
+	UAPI_DEF_CHAIN(uverbs_def_write_intf),
+	{},
+};
+
+struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev)
 {
 	struct uverbs_api *uapi;
 	int rc;
@@ -263,18 +647,16 @@
 		return ERR_PTR(-ENOMEM);
 
 	INIT_RADIX_TREE(&uapi->radix, GFP_KERNEL);
-	uapi->driver_id = driver_id;
+	uapi->driver_id = ibdev->ops.driver_id;
 
-	rc = uapi_merge_tree(uapi, uverbs_default_get_objects(), false);
+	rc = uapi_merge_def(uapi, ibdev, uverbs_core_api, false);
+	if (rc)
+		goto err;
+	rc = uapi_merge_def(uapi, ibdev, ibdev->driver_def, true);
 	if (rc)
 		goto err;
 
-	for (; driver_specs && *driver_specs; driver_specs++) {
-		rc = uapi_merge_tree(uapi, *driver_specs, true);
-		if (rc)
-			goto err;
-	}
-
+	uapi_finalize_disable(uapi);
 	rc = uapi_finalize(uapi);
 	if (rc)
 		goto err;
@@ -282,8 +664,9 @@
 	return uapi;
 err:
 	if (rc != -ENOMEM)
-		pr_err("Setup of uverbs_api failed, kernel parsing tree description is not valid (%d)??\n",
-		       rc);
+		dev_err(&ibdev->dev,
+			"Setup of uverbs_api failed, kernel parsing tree description is not valid (%d)??\n",
+			rc);
 
 	uverbs_destroy_api(uapi);
 	return ERR_PTR(rc);
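
uapi_finalize() above flattens the per-command radix entries into two dense
tables indexed by command number, with every unused slot pointing at the
shared "not supported" handler so the write dispatcher never has to check
for NULL. Below is a standalone toy model of that table-plus-fallback
design; the names and the table size are illustrative only:

#include <stdio.h>
#include <errno.h>

#define MAX_WRITE_CMDS 8

typedef int (*write_handler)(void);

static int handler_notsupp(void)   { return -EOPNOTSUPP; }
static int handler_create_cq(void) { return 0; }

static write_handler write_methods[MAX_WRITE_CMDS];

int main(void)
{
	int i;

	/* Every slot starts out as the shared fallback... */
	for (i = 0; i < MAX_WRITE_CMDS; i++)
		write_methods[i] = handler_notsupp;

	/* ...and registered commands simply overwrite their own slot. */
	write_methods[3] = handler_create_cq;

	printf("cmd 3 -> %d, cmd 5 -> %d\n",
	       write_methods[3](), write_methods[5]());
	return 0;
}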
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 6ee03d6..35c2841 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -141,6 +141,10 @@
 	case IB_RATE_100_GBPS: return  40;
 	case IB_RATE_200_GBPS: return  80;
 	case IB_RATE_300_GBPS: return 120;
+	case IB_RATE_28_GBPS:  return  11;
+	case IB_RATE_50_GBPS:  return  20;
+	case IB_RATE_400_GBPS: return 160;
+	case IB_RATE_600_GBPS: return 240;
 	default:	       return  -1;
 	}
 }
@@ -166,6 +170,10 @@
 	case 40:  return IB_RATE_100_GBPS;
 	case 80:  return IB_RATE_200_GBPS;
 	case 120: return IB_RATE_300_GBPS;
+	case 11:  return IB_RATE_28_GBPS;
+	case 20:  return IB_RATE_50_GBPS;
+	case 160: return IB_RATE_400_GBPS;
+	case 240: return IB_RATE_600_GBPS;
 	default:  return IB_RATE_PORT_CURRENT;
 	}
 }
@@ -191,13 +199,17 @@
 	case IB_RATE_100_GBPS: return 103125;
 	case IB_RATE_200_GBPS: return 206250;
 	case IB_RATE_300_GBPS: return 309375;
+	case IB_RATE_28_GBPS:  return 28125;
+	case IB_RATE_50_GBPS:  return 53125;
+	case IB_RATE_400_GBPS: return 425000;
+	case IB_RATE_600_GBPS: return 637500;
 	default:	       return -1;
 	}
 }
 EXPORT_SYMBOL(ib_rate_to_mbps);
 
 __attribute_const__ enum rdma_transport_type
-rdma_node_get_transport(enum rdma_node_type node_type)
+rdma_node_get_transport(unsigned int node_type)
 {
 
 	if (node_type == RDMA_NODE_USNIC)
@@ -206,6 +218,8 @@
 		return RDMA_TRANSPORT_USNIC_UDP;
 	if (node_type == RDMA_NODE_RNIC)
 		return RDMA_TRANSPORT_IWARP;
+	if (node_type == RDMA_NODE_UNSPECIFIED)
+		return RDMA_TRANSPORT_UNSPECIFIED;
 
 	return RDMA_TRANSPORT_IB;
 }
@@ -214,8 +228,8 @@
 enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num)
 {
 	enum rdma_transport_type lt;
-	if (device->get_link_layer)
-		return device->get_link_layer(device, port_num);
+	if (device->ops.get_link_layer)
+		return device->ops.get_link_layer(device, port_num);
 
 	lt = rdma_node_get_transport(device->node_type);
 	if (lt == RDMA_TRANSPORT_IB)
@@ -242,10 +256,11 @@
 {
 	struct ib_pd *pd;
 	int mr_access_flags = 0;
+	int ret;
 
-	pd = device->alloc_pd(device, NULL, NULL);
-	if (IS_ERR(pd))
-		return pd;
+	pd = rdma_zalloc_drv_obj(device, ib_pd);
+	if (!pd)
+		return ERR_PTR(-ENOMEM);
 
 	pd->device = device;
 	pd->uobject = NULL;
@@ -253,6 +268,16 @@
 	atomic_set(&pd->usecnt, 0);
 	pd->flags = flags;
 
+	pd->res.type = RDMA_RESTRACK_PD;
+	rdma_restrack_set_task(&pd->res, caller);
+
+	ret = device->ops.alloc_pd(pd, NULL);
+	if (ret) {
+		kfree(pd);
+		return ERR_PTR(ret);
+	}
+	rdma_restrack_kadd(&pd->res);
+
 	if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
 		pd->local_dma_lkey = device->local_dma_lkey;
 	else
@@ -263,14 +288,10 @@
 		mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE;
 	}
 
-	pd->res.type = RDMA_RESTRACK_PD;
-	pd->res.kern_name = caller;
-	rdma_restrack_add(&pd->res);
-
 	if (mr_access_flags) {
 		struct ib_mr *mr;
 
-		mr = pd->device->get_dma_mr(pd, mr_access_flags);
+		mr = pd->device->ops.get_dma_mr(pd, mr_access_flags);
 		if (IS_ERR(mr)) {
 			ib_dealloc_pd(pd);
 			return ERR_CAST(mr);
@@ -278,6 +299,7 @@
 
 		mr->device	= pd->device;
 		mr->pd		= pd;
+		mr->type        = IB_MR_TYPE_DMA;
 		mr->uobject	= NULL;
 		mr->need_inval	= false;
 
@@ -295,19 +317,20 @@
 EXPORT_SYMBOL(__ib_alloc_pd);
 
 /**
- * ib_dealloc_pd - Deallocates a protection domain.
+ * ib_dealloc_pd_user - Deallocates a protection domain.
  * @pd: The protection domain to deallocate.
+ * @udata: Valid user data or NULL for kernel object
  *
  * It is an error to call this function while any resources in the pd still
  * exist.  The caller is responsible to synchronously destroy them and
  * guarantee no new allocations will happen.
  */
-void ib_dealloc_pd(struct ib_pd *pd)
+void ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata)
 {
 	int ret;
 
 	if (pd->__internal_mr) {
-		ret = pd->device->dereg_mr(pd->__internal_mr);
+		ret = pd->device->ops.dereg_mr(pd->__internal_mr, NULL);
 		WARN_ON(ret);
 		pd->__internal_mr = NULL;
 	}
@@ -317,12 +340,10 @@
 	WARN_ON(atomic_read(&pd->usecnt));
 
 	rdma_restrack_del(&pd->res);
-	/* Making delalloc_pd a void return is a WIP, no driver should return
-	   an error here. */
-	ret = pd->device->dealloc_pd(pd);
-	WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
+	pd->device->ops.dealloc_pd(pd, udata);
+	kfree(pd);
 }
-EXPORT_SYMBOL(ib_dealloc_pd);
+EXPORT_SYMBOL(ib_dealloc_pd_user);
 
 /* Address handles */
 
@@ -475,25 +496,36 @@
 
 static struct ib_ah *_rdma_create_ah(struct ib_pd *pd,
 				     struct rdma_ah_attr *ah_attr,
+				     u32 flags,
 				     struct ib_udata *udata)
 {
+	struct ib_device *device = pd->device;
 	struct ib_ah *ah;
+	int ret;
 
-	if (!pd->device->create_ah)
+	might_sleep_if(flags & RDMA_CREATE_AH_SLEEPABLE);
+
+	if (!device->ops.create_ah)
 		return ERR_PTR(-EOPNOTSUPP);
 
-	ah = pd->device->create_ah(pd, ah_attr, udata);
+	ah = rdma_zalloc_drv_obj_gfp(
+		device, ib_ah,
+		(flags & RDMA_CREATE_AH_SLEEPABLE) ? GFP_KERNEL : GFP_ATOMIC);
+	if (!ah)
+		return ERR_PTR(-ENOMEM);
 
-	if (!IS_ERR(ah)) {
-		ah->device  = pd->device;
-		ah->pd      = pd;
-		ah->uobject = NULL;
-		ah->type    = ah_attr->type;
-		ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL);
+	ah->device = device;
+	ah->pd = pd;
+	ah->type = ah_attr->type;
+	ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL);
 
-		atomic_inc(&pd->usecnt);
+	ret = device->ops.create_ah(ah, ah_attr, flags, udata);
+	if (ret) {
+		kfree(ah);
+		return ERR_PTR(ret);
 	}
 
+	atomic_inc(&pd->usecnt);
 	return ah;
 }
 
@@ -502,12 +534,14 @@
  * given address vector.
  * @pd: The protection domain associated with the address handle.
  * @ah_attr: The attributes of the address vector.
+ * @flags: Create address handle flags (see enum rdma_create_ah_flags).
  *
  * It returns 0 on success and returns appropriate error code on error.
  * The address handle is used to reference a local or global destination
  * in all UD QP post sends.
  */
-struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr)
+struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
+			     u32 flags)
 {
 	const struct ib_gid_attr *old_sgid_attr;
 	struct ib_ah *ah;
@@ -517,7 +551,7 @@
 	if (ret)
 		return ERR_PTR(ret);
 
-	ah = _rdma_create_ah(pd, ah_attr, NULL);
+	ah = _rdma_create_ah(pd, ah_attr, flags, NULL);
 
 	rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
 	return ah;
@@ -557,7 +591,7 @@
 		}
 	}
 
-	ah = _rdma_create_ah(pd, ah_attr, udata);
+	ah = _rdma_create_ah(pd, ah_attr, RDMA_CREATE_AH_SLEEPABLE, udata);
 
 out:
 	rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
@@ -628,16 +662,17 @@
 			   void *context)
 {
 	struct find_gid_index_context *ctx = context;
+	u16 vlan_id = 0xffff;
+	int ret;
 
 	if (ctx->gid_type != gid_attr->gid_type)
 		return false;
 
-	if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) ||
-	    (is_vlan_dev(gid_attr->ndev) &&
-	     vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id))
+	ret = rdma_read_gid_l2_fields(gid_attr, &vlan_id, NULL);
+	if (ret)
 		return false;
 
-	return true;
+	return ctx->vlan_id == vlan_id;
 }
 
 static const struct ib_gid_attr *
@@ -710,7 +745,7 @@
 
 	ret = rdma_addr_find_l2_eth_by_grh(&sgid_attr->gid, &grh->dgid,
 					   ah_attr->roce.dmac,
-					   sgid_attr->ndev, &hop_limit);
+					   sgid_attr, &hop_limit);
 
 	grh->hop_limit = hop_limit;
 	return ret;
@@ -869,7 +904,7 @@
 	if (ret)
 		return ERR_PTR(ret);
 
-	ah = rdma_create_ah(pd, &ah_attr);
+	ah = rdma_create_ah(pd, &ah_attr, RDMA_CREATE_AH_SLEEPABLE);
 
 	rdma_destroy_ah_attr(&ah_attr);
 	return ah;
@@ -888,8 +923,8 @@
 	if (ret)
 		return ret;
 
-	ret = ah->device->modify_ah ?
-		ah->device->modify_ah(ah, ah_attr) :
+	ret = ah->device->ops.modify_ah ?
+		ah->device->ops.modify_ah(ah, ah_attr) :
 		-EOPNOTSUPP;
 
 	ah->sgid_attr = rdma_update_sgid_attr(ah_attr, ah->sgid_attr);
@@ -902,29 +937,30 @@
 {
 	ah_attr->grh.sgid_attr = NULL;
 
-	return ah->device->query_ah ?
-		ah->device->query_ah(ah, ah_attr) :
+	return ah->device->ops.query_ah ?
+		ah->device->ops.query_ah(ah, ah_attr) :
 		-EOPNOTSUPP;
 }
 EXPORT_SYMBOL(rdma_query_ah);
 
-int rdma_destroy_ah(struct ib_ah *ah)
+int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata)
 {
 	const struct ib_gid_attr *sgid_attr = ah->sgid_attr;
 	struct ib_pd *pd;
-	int ret;
+
+	might_sleep_if(flags & RDMA_DESTROY_AH_SLEEPABLE);
 
 	pd = ah->pd;
-	ret = ah->device->destroy_ah(ah);
-	if (!ret) {
-		atomic_dec(&pd->usecnt);
-		if (sgid_attr)
-			rdma_put_gid_attr(sgid_attr);
-	}
 
-	return ret;
+	ah->device->ops.destroy_ah(ah, flags);
+	atomic_dec(&pd->usecnt);
+	if (sgid_attr)
+		rdma_put_gid_attr(sgid_attr);
+
+	kfree(ah);
+	return 0;
 }
-EXPORT_SYMBOL(rdma_destroy_ah);
+EXPORT_SYMBOL(rdma_destroy_ah_user);
 
 /* Shared receive queues */
 
@@ -932,29 +968,40 @@
 			     struct ib_srq_init_attr *srq_init_attr)
 {
 	struct ib_srq *srq;
+	int ret;
 
-	if (!pd->device->create_srq)
+	if (!pd->device->ops.create_srq)
 		return ERR_PTR(-EOPNOTSUPP);
 
-	srq = pd->device->create_srq(pd, srq_init_attr, NULL);
+	srq = rdma_zalloc_drv_obj(pd->device, ib_srq);
+	if (!srq)
+		return ERR_PTR(-ENOMEM);
 
-	if (!IS_ERR(srq)) {
-		srq->device    	   = pd->device;
-		srq->pd        	   = pd;
-		srq->uobject       = NULL;
-		srq->event_handler = srq_init_attr->event_handler;
-		srq->srq_context   = srq_init_attr->srq_context;
-		srq->srq_type      = srq_init_attr->srq_type;
-		if (ib_srq_has_cq(srq->srq_type)) {
-			srq->ext.cq   = srq_init_attr->ext.cq;
-			atomic_inc(&srq->ext.cq->usecnt);
-		}
-		if (srq->srq_type == IB_SRQT_XRC) {
-			srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
-			atomic_inc(&srq->ext.xrc.xrcd->usecnt);
-		}
-		atomic_inc(&pd->usecnt);
-		atomic_set(&srq->usecnt, 0);
+	srq->device = pd->device;
+	srq->pd = pd;
+	srq->event_handler = srq_init_attr->event_handler;
+	srq->srq_context = srq_init_attr->srq_context;
+	srq->srq_type = srq_init_attr->srq_type;
+
+	if (ib_srq_has_cq(srq->srq_type)) {
+		srq->ext.cq = srq_init_attr->ext.cq;
+		atomic_inc(&srq->ext.cq->usecnt);
+	}
+	if (srq->srq_type == IB_SRQT_XRC) {
+		srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
+		atomic_inc(&srq->ext.xrc.xrcd->usecnt);
+	}
+	atomic_inc(&pd->usecnt);
+
+	ret = pd->device->ops.create_srq(srq, srq_init_attr, NULL);
+	if (ret) {
+		atomic_dec(&srq->pd->usecnt);
+		if (srq->srq_type == IB_SRQT_XRC)
+			atomic_dec(&srq->ext.xrc.xrcd->usecnt);
+		if (ib_srq_has_cq(srq->srq_type))
+			atomic_dec(&srq->ext.cq->usecnt);
+		kfree(srq);
+		return ERR_PTR(ret);
 	}
 
 	return srq;
@@ -965,50 +1012,37 @@
 		  struct ib_srq_attr *srq_attr,
 		  enum ib_srq_attr_mask srq_attr_mask)
 {
-	return srq->device->modify_srq ?
-		srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) :
-		-EOPNOTSUPP;
+	return srq->device->ops.modify_srq ?
+		srq->device->ops.modify_srq(srq, srq_attr, srq_attr_mask,
+					    NULL) : -EOPNOTSUPP;
 }
 EXPORT_SYMBOL(ib_modify_srq);
 
 int ib_query_srq(struct ib_srq *srq,
 		 struct ib_srq_attr *srq_attr)
 {
-	return srq->device->query_srq ?
-		srq->device->query_srq(srq, srq_attr) : -EOPNOTSUPP;
+	return srq->device->ops.query_srq ?
+		srq->device->ops.query_srq(srq, srq_attr) : -EOPNOTSUPP;
 }
 EXPORT_SYMBOL(ib_query_srq);
 
-int ib_destroy_srq(struct ib_srq *srq)
+int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata)
 {
-	struct ib_pd *pd;
-	enum ib_srq_type srq_type;
-	struct ib_xrcd *uninitialized_var(xrcd);
-	struct ib_cq *uninitialized_var(cq);
-	int ret;
-
 	if (atomic_read(&srq->usecnt))
 		return -EBUSY;
 
-	pd = srq->pd;
-	srq_type = srq->srq_type;
-	if (ib_srq_has_cq(srq_type))
-		cq = srq->ext.cq;
-	if (srq_type == IB_SRQT_XRC)
-		xrcd = srq->ext.xrc.xrcd;
+	srq->device->ops.destroy_srq(srq, udata);
 
-	ret = srq->device->destroy_srq(srq);
-	if (!ret) {
-		atomic_dec(&pd->usecnt);
-		if (srq_type == IB_SRQT_XRC)
-			atomic_dec(&xrcd->usecnt);
-		if (ib_srq_has_cq(srq_type))
-			atomic_dec(&cq->usecnt);
-	}
+	atomic_dec(&srq->pd->usecnt);
+	if (srq->srq_type == IB_SRQT_XRC)
+		atomic_dec(&srq->ext.xrc.xrcd->usecnt);
+	if (ib_srq_has_cq(srq->srq_type))
+		atomic_dec(&srq->ext.cq->usecnt);
+	kfree(srq);
 
-	return ret;
+	return 0;
 }
-EXPORT_SYMBOL(ib_destroy_srq);
+EXPORT_SYMBOL(ib_destroy_srq_user);
 
 /* Queue pairs */
 
@@ -1087,8 +1121,9 @@
 }
 EXPORT_SYMBOL(ib_open_qp);
 
-static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp,
-		struct ib_qp_init_attr *qp_init_attr)
+static struct ib_qp *create_xrc_qp_user(struct ib_qp *qp,
+					struct ib_qp_init_attr *qp_init_attr,
+					struct ib_udata *udata)
 {
 	struct ib_qp *real_qp = qp;
 
@@ -1103,15 +1138,16 @@
 
 	qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
 			  qp_init_attr->qp_context);
-	if (!IS_ERR(qp))
-		__ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
-	else
-		real_qp->device->destroy_qp(real_qp);
+	if (IS_ERR(qp))
+		return qp;
+
+	__ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
 	return qp;
 }
 
-struct ib_qp *ib_create_qp(struct ib_pd *pd,
-			   struct ib_qp_init_attr *qp_init_attr)
+struct ib_qp *ib_create_qp_user(struct ib_pd *pd,
+				struct ib_qp_init_attr *qp_init_attr,
+				struct ib_udata *udata)
 {
 	struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device;
 	struct ib_qp *qp;
@@ -1123,6 +1159,10 @@
 	    qp_init_attr->cap.max_recv_sge))
 		return ERR_PTR(-EINVAL);
 
+	if ((qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) &&
+	    !(device->attrs.device_cap_flags & IB_DEVICE_INTEGRITY_HANDOVER))
+		return ERR_PTR(-EINVAL);
+
 	/*
 	 * If the callers is using the RDMA API calculate the resources
 	 * needed for the RDMA READ/WRITE operations.
@@ -1137,12 +1177,9 @@
 		return qp;
 
 	ret = ib_create_qp_security(qp, device);
-	if (ret) {
-		ib_destroy_qp(qp);
-		return ERR_PTR(ret);
-	}
+	if (ret)
+		goto err;
 
-	qp->real_qp    = qp;
 	qp->qp_type    = qp_init_attr->qp_type;
 	qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl;
 
@@ -1153,8 +1190,16 @@
 	INIT_LIST_HEAD(&qp->sig_mrs);
 	qp->port = 0;
 
-	if (qp_init_attr->qp_type == IB_QPT_XRC_TGT)
-		return ib_create_xrc_qp(qp, qp_init_attr);
+	if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
+		struct ib_qp *xrc_qp =
+			create_xrc_qp_user(qp, qp_init_attr, udata);
+
+		if (IS_ERR(xrc_qp)) {
+			ret = PTR_ERR(xrc_qp);
+			goto err;
+		}
+		return xrc_qp;
+	}
 
 	qp->event_handler = qp_init_attr->event_handler;
 	qp->qp_context = qp_init_attr->qp_context;
@@ -1181,11 +1226,8 @@
 
 	if (qp_init_attr->cap.max_rdma_ctxs) {
 		ret = rdma_rw_init_mrs(qp, qp_init_attr);
-		if (ret) {
-			pr_err("failed to init MR pool ret= %d\n", ret);
-			ib_destroy_qp(qp);
-			return ERR_PTR(ret);
-		}
+		if (ret)
+			goto err;
 	}
 
 	/*
@@ -1196,10 +1238,17 @@
 	qp->max_write_sge = qp_init_attr->cap.max_send_sge;
 	qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge,
 				 device->attrs.max_sge_rd);
+	if (qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN)
+		qp->integrity_en = true;
 
 	return qp;
+
+err:
+	ib_destroy_qp(qp);
+	return ERR_PTR(ret);
+
 }
-EXPORT_SYMBOL(ib_create_qp);
+EXPORT_SYMBOL(ib_create_qp_user);
 
 static const struct {
 	int			valid;
@@ -1509,8 +1558,7 @@
 };
 
 bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
-			enum ib_qp_type type, enum ib_qp_attr_mask mask,
-			enum rdma_link_layer ll)
+			enum ib_qp_type type, enum ib_qp_attr_mask mask)
 {
 	enum ib_qp_attr_mask req_param, opt_param;
 
@@ -1629,18 +1677,28 @@
 
 	if (rdma_ib_or_roce(qp->device, port)) {
 		if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) {
-			pr_warn("%s: %s rq_psn overflow, masking to 24 bits\n",
-				__func__, qp->device->name);
+			dev_warn(&qp->device->dev,
+				 "%s rq_psn overflow, masking to 24 bits\n",
+				 __func__);
 			attr->rq_psn &= 0xffffff;
 		}
 
 		if (attr_mask & IB_QP_SQ_PSN && attr->sq_psn & ~0xffffff) {
-			pr_warn("%s: %s sq_psn overflow, masking to 24 bits\n",
-				__func__, qp->device->name);
+			dev_warn(&qp->device->dev,
+				 " %s sq_psn overflow, masking to 24 bits\n",
+				 __func__);
 			attr->sq_psn &= 0xffffff;
 		}
 	}
 
+	/*
+	 * Bind this qp to a counter automatically based on the rdma counter
+	 * rules. This is only set in RST2INIT when a port is specified.
+	 */
+	if (!qp->counter && (attr_mask & IB_QP_PORT) &&
+	    ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT))
+		rdma_counter_bind_qp_auto(qp, attr->port_num);
+
 	ret = ib_security_modify_qp(qp, attr, attr_mask, udata);
 	if (ret)
 		goto out;
@@ -1691,10 +1749,7 @@
 	if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET)
 		return -EINVAL;
 
-	if (!dev->get_netdev)
-		return -EOPNOTSUPP;
-
-	netdev = dev->get_netdev(dev, port_num);
+	netdev = ib_device_get_netdev(dev, port_num);
 	if (!netdev)
 		return -ENODEV;
 
@@ -1752,9 +1807,9 @@
 	qp_attr->ah_attr.grh.sgid_attr = NULL;
 	qp_attr->alt_ah_attr.grh.sgid_attr = NULL;
 
-	return qp->device->query_qp ?
-		qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) :
-		-EOPNOTSUPP;
+	return qp->device->ops.query_qp ?
+		qp->device->ops.query_qp(qp->real_qp, qp_attr, qp_attr_mask,
+					 qp_init_attr) : -EOPNOTSUPP;
 }
 EXPORT_SYMBOL(ib_query_qp);
 
@@ -1808,7 +1863,7 @@
 	return 0;
 }
 
-int ib_destroy_qp(struct ib_qp *qp)
+int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata)
 {
 	const struct ib_gid_attr *alt_path_sgid_attr = qp->alt_path_sgid_attr;
 	const struct ib_gid_attr *av_sgid_attr = qp->av_sgid_attr;
@@ -1839,8 +1894,9 @@
 	if (!qp->uobject)
 		rdma_rw_cleanup_mrs(qp);
 
+	rdma_counter_unbind_qp(qp, true);
 	rdma_restrack_del(&qp->res);
-	ret = qp->device->destroy_qp(qp);
+	ret = qp->device->ops.destroy_qp(qp, udata);
 	if (!ret) {
 		if (alt_path_sgid_attr)
 			rdma_put_gid_attr(alt_path_sgid_attr);
@@ -1865,7 +1921,7 @@
 
 	return ret;
 }
-EXPORT_SYMBOL(ib_destroy_qp);
+EXPORT_SYMBOL(ib_destroy_qp_user);
 
 /* Completion queues */
 
@@ -1877,74 +1933,87 @@
 			     const char *caller)
 {
 	struct ib_cq *cq;
+	int ret;
 
-	cq = device->create_cq(device, cq_attr, NULL, NULL);
+	cq = rdma_zalloc_drv_obj(device, ib_cq);
+	if (!cq)
+		return ERR_PTR(-ENOMEM);
 
-	if (!IS_ERR(cq)) {
-		cq->device        = device;
-		cq->uobject       = NULL;
-		cq->comp_handler  = comp_handler;
-		cq->event_handler = event_handler;
-		cq->cq_context    = cq_context;
-		atomic_set(&cq->usecnt, 0);
-		cq->res.type = RDMA_RESTRACK_CQ;
-		cq->res.kern_name = caller;
-		rdma_restrack_add(&cq->res);
+	cq->device = device;
+	cq->uobject = NULL;
+	cq->comp_handler = comp_handler;
+	cq->event_handler = event_handler;
+	cq->cq_context = cq_context;
+	atomic_set(&cq->usecnt, 0);
+	cq->res.type = RDMA_RESTRACK_CQ;
+	rdma_restrack_set_task(&cq->res, caller);
+
+	ret = device->ops.create_cq(cq, cq_attr, NULL);
+	if (ret) {
+		kfree(cq);
+		return ERR_PTR(ret);
 	}
 
+	rdma_restrack_kadd(&cq->res);
 	return cq;
 }
 EXPORT_SYMBOL(__ib_create_cq);
 
 int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period)
 {
-	return cq->device->modify_cq ?
-		cq->device->modify_cq(cq, cq_count, cq_period) : -EOPNOTSUPP;
+	return cq->device->ops.modify_cq ?
+		cq->device->ops.modify_cq(cq, cq_count,
+					  cq_period) : -EOPNOTSUPP;
 }
 EXPORT_SYMBOL(rdma_set_cq_moderation);
 
-int ib_destroy_cq(struct ib_cq *cq)
+int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata)
 {
 	if (atomic_read(&cq->usecnt))
 		return -EBUSY;
 
 	rdma_restrack_del(&cq->res);
-	return cq->device->destroy_cq(cq);
+	cq->device->ops.destroy_cq(cq, udata);
+	kfree(cq);
+	return 0;
 }
-EXPORT_SYMBOL(ib_destroy_cq);
+EXPORT_SYMBOL(ib_destroy_cq_user);
 
 int ib_resize_cq(struct ib_cq *cq, int cqe)
 {
-	return cq->device->resize_cq ?
-		cq->device->resize_cq(cq, cqe, NULL) : -EOPNOTSUPP;
+	return cq->device->ops.resize_cq ?
+		cq->device->ops.resize_cq(cq, cqe, NULL) : -EOPNOTSUPP;
 }
 EXPORT_SYMBOL(ib_resize_cq);
 
 /* Memory regions */
 
-int ib_dereg_mr(struct ib_mr *mr)
+int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
 {
 	struct ib_pd *pd = mr->pd;
 	struct ib_dm *dm = mr->dm;
+	struct ib_sig_attrs *sig_attrs = mr->sig_attrs;
 	int ret;
 
 	rdma_restrack_del(&mr->res);
-	ret = mr->device->dereg_mr(mr);
+	ret = mr->device->ops.dereg_mr(mr, udata);
 	if (!ret) {
 		atomic_dec(&pd->usecnt);
 		if (dm)
 			atomic_dec(&dm->usecnt);
+		kfree(sig_attrs);
 	}
 
 	return ret;
 }
-EXPORT_SYMBOL(ib_dereg_mr);
+EXPORT_SYMBOL(ib_dereg_mr_user);
 
 /**
- * ib_alloc_mr() - Allocates a memory region
+ * ib_alloc_mr_user() - Allocates a memory region
  * @pd:            protection domain associated with the region
  * @mr_type:       memory region type
  * @max_num_sg:    maximum sg entries available for registration.
+ * @udata:	   user data or NULL for kernel objects
  *
  * Notes:
 * Memory registration page/sg lists must not exceed max_num_sg.
@@ -1952,16 +2021,18 @@
  * max_num_sg * used_page_size.
  *
  */
-struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
-			  enum ib_mr_type mr_type,
-			  u32 max_num_sg)
+struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type,
+			       u32 max_num_sg, struct ib_udata *udata)
 {
 	struct ib_mr *mr;
 
-	if (!pd->device->alloc_mr)
+	if (!pd->device->ops.alloc_mr)
 		return ERR_PTR(-EOPNOTSUPP);
 
-	mr = pd->device->alloc_mr(pd, mr_type, max_num_sg);
+	if (WARN_ON_ONCE(mr_type == IB_MR_TYPE_INTEGRITY))
+		return ERR_PTR(-EINVAL);
+
+	mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg, udata);
 	if (!IS_ERR(mr)) {
 		mr->device  = pd->device;
 		mr->pd      = pd;
@@ -1970,12 +2041,66 @@
 		atomic_inc(&pd->usecnt);
 		mr->need_inval = false;
 		mr->res.type = RDMA_RESTRACK_MR;
-		rdma_restrack_add(&mr->res);
+		rdma_restrack_kadd(&mr->res);
+		mr->type = mr_type;
+		mr->sig_attrs = NULL;
 	}
 
 	return mr;
 }
-EXPORT_SYMBOL(ib_alloc_mr);
+EXPORT_SYMBOL(ib_alloc_mr_user);
+
+/**
+ * ib_alloc_mr_integrity() - Allocates an integrity memory region
+ * @pd:                      protection domain associated with the region
+ * @max_num_data_sg:         maximum data sg entries available for registration
+ * @max_num_meta_sg:         maximum metadata sg entries available for
+ *                           registration
+ *
+ * Notes:
+ * Memory registration page/sg lists must not exceed max_num_sg,
+ * also the integrity page/sg lists must not exceed max_num_meta_sg.
+ *
+ */
+struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd,
+				    u32 max_num_data_sg,
+				    u32 max_num_meta_sg)
+{
+	struct ib_mr *mr;
+	struct ib_sig_attrs *sig_attrs;
+
+	if (!pd->device->ops.alloc_mr_integrity ||
+	    !pd->device->ops.map_mr_sg_pi)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (!max_num_meta_sg)
+		return ERR_PTR(-EINVAL);
+
+	sig_attrs = kzalloc(sizeof(struct ib_sig_attrs), GFP_KERNEL);
+	if (!sig_attrs)
+		return ERR_PTR(-ENOMEM);
+
+	mr = pd->device->ops.alloc_mr_integrity(pd, max_num_data_sg,
+						max_num_meta_sg);
+	if (IS_ERR(mr)) {
+		kfree(sig_attrs);
+		return mr;
+	}
+
+	mr->device = pd->device;
+	mr->pd = pd;
+	mr->dm = NULL;
+	mr->uobject = NULL;
+	atomic_inc(&pd->usecnt);
+	mr->need_inval = false;
+	mr->res.type = RDMA_RESTRACK_MR;
+	rdma_restrack_kadd(&mr->res);
+	mr->type = IB_MR_TYPE_INTEGRITY;
+	mr->sig_attrs = sig_attrs;
+
+	return mr;
+}
+EXPORT_SYMBOL(ib_alloc_mr_integrity);
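
A sketch of ULP-side usage of the new allocator, assuming a valid PD on a
device that implements both alloc_mr_integrity and map_mr_sg_pi; error
handling trimmed for brevity:

struct ib_mr *mr;

mr = ib_alloc_mr_integrity(pd, 256 /* data SGEs */, 8 /* metadata SGEs */);
if (IS_ERR(mr))
	return PTR_ERR(mr);

/* mr->sig_attrs was kzalloc'ed by the core above; fill it before use,
 * e.g. mr->sig_attrs->mem.sig_type = IB_SIG_TYPE_T10_DIF. Per the
 * ib_dereg_mr_user() hunk above, the core frees it again on dereg.
 */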
 
 /* "Fast" memory regions */
 
@@ -1985,10 +2110,10 @@
 {
 	struct ib_fmr *fmr;
 
-	if (!pd->device->alloc_fmr)
+	if (!pd->device->ops.alloc_fmr)
 		return ERR_PTR(-EOPNOTSUPP);
 
-	fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr);
+	fmr = pd->device->ops.alloc_fmr(pd, mr_access_flags, fmr_attr);
 	if (!IS_ERR(fmr)) {
 		fmr->device = pd->device;
 		fmr->pd     = pd;
@@ -2007,7 +2132,7 @@
 		return 0;
 
 	fmr = list_entry(fmr_list->next, struct ib_fmr, list);
-	return fmr->device->unmap_fmr(fmr_list);
+	return fmr->device->ops.unmap_fmr(fmr_list);
 }
 EXPORT_SYMBOL(ib_unmap_fmr);
 
@@ -2017,7 +2142,7 @@
 	int ret;
 
 	pd = fmr->pd;
-	ret = fmr->device->dealloc_fmr(fmr);
+	ret = fmr->device->ops.dealloc_fmr(fmr);
 	if (!ret)
 		atomic_dec(&pd->usecnt);
 
@@ -2069,14 +2194,14 @@
 {
 	int ret;
 
-	if (!qp->device->attach_mcast)
+	if (!qp->device->ops.attach_mcast)
 		return -EOPNOTSUPP;
 
 	if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) ||
 	    qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid))
 		return -EINVAL;
 
-	ret = qp->device->attach_mcast(qp, gid, lid);
+	ret = qp->device->ops.attach_mcast(qp, gid, lid);
 	if (!ret)
 		atomic_inc(&qp->usecnt);
 	return ret;
@@ -2087,14 +2212,14 @@
 {
 	int ret;
 
-	if (!qp->device->detach_mcast)
+	if (!qp->device->ops.detach_mcast)
 		return -EOPNOTSUPP;
 
 	if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) ||
 	    qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid))
 		return -EINVAL;
 
-	ret = qp->device->detach_mcast(qp, gid, lid);
+	ret = qp->device->ops.detach_mcast(qp, gid, lid);
 	if (!ret)
 		atomic_dec(&qp->usecnt);
 	return ret;
@@ -2105,10 +2230,10 @@
 {
 	struct ib_xrcd *xrcd;
 
-	if (!device->alloc_xrcd)
+	if (!device->ops.alloc_xrcd)
 		return ERR_PTR(-EOPNOTSUPP);
 
-	xrcd = device->alloc_xrcd(device, NULL, NULL);
+	xrcd = device->ops.alloc_xrcd(device, NULL);
 	if (!IS_ERR(xrcd)) {
 		xrcd->device = device;
 		xrcd->inode = NULL;
@@ -2121,7 +2246,7 @@
 }
 EXPORT_SYMBOL(__ib_alloc_xrcd);
 
-int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+int ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata)
 {
 	struct ib_qp *qp;
 	int ret;
@@ -2135,8 +2260,9 @@
 		if (ret)
 			return ret;
 	}
+	mutex_destroy(&xrcd->tgt_qp_mutex);
 
-	return xrcd->device->dealloc_xrcd(xrcd);
+	return xrcd->device->ops.dealloc_xrcd(xrcd, udata);
 }
 EXPORT_SYMBOL(ib_dealloc_xrcd);
 
@@ -2159,10 +2285,10 @@
 {
 	struct ib_wq *wq;
 
-	if (!pd->device->create_wq)
+	if (!pd->device->ops.create_wq)
 		return ERR_PTR(-EOPNOTSUPP);
 
-	wq = pd->device->create_wq(pd, wq_attr, NULL);
+	wq = pd->device->ops.create_wq(pd, wq_attr, NULL);
 	if (!IS_ERR(wq)) {
 		wq->event_handler = wq_attr->event_handler;
 		wq->wq_context = wq_attr->wq_context;
@@ -2180,24 +2306,23 @@
 EXPORT_SYMBOL(ib_create_wq);
 
 /**
- * ib_destroy_wq - Destroys the specified WQ.
+ * ib_destroy_wq - Destroys the specified user WQ.
  * @wq: The WQ to destroy.
+ * @udata: Valid user data
  */
-int ib_destroy_wq(struct ib_wq *wq)
+int ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)
 {
-	int err;
 	struct ib_cq *cq = wq->cq;
 	struct ib_pd *pd = wq->pd;
 
 	if (atomic_read(&wq->usecnt))
 		return -EBUSY;
 
-	err = wq->device->destroy_wq(wq);
-	if (!err) {
-		atomic_dec(&pd->usecnt);
-		atomic_dec(&cq->usecnt);
-	}
-	return err;
+	wq->device->ops.destroy_wq(wq, udata);
+	atomic_dec(&pd->usecnt);
+	atomic_dec(&cq->usecnt);
+
+	return 0;
 }
 EXPORT_SYMBOL(ib_destroy_wq);
 
@@ -2214,10 +2339,10 @@
 {
 	int err;
 
-	if (!wq->device->modify_wq)
+	if (!wq->device->ops.modify_wq)
 		return -EOPNOTSUPP;
 
-	err = wq->device->modify_wq(wq, wq_attr, wq_attr_mask, NULL);
+	err = wq->device->ops.modify_wq(wq, wq_attr, wq_attr_mask, NULL);
 	return err;
 }
 EXPORT_SYMBOL(ib_modify_wq);
@@ -2239,12 +2364,12 @@
 	int i;
 	u32 table_size;
 
-	if (!device->create_rwq_ind_table)
+	if (!device->ops.create_rwq_ind_table)
 		return ERR_PTR(-EOPNOTSUPP);
 
 	table_size = (1 << init_attr->log_ind_tbl_size);
-	rwq_ind_table = device->create_rwq_ind_table(device,
-				init_attr, NULL);
+	rwq_ind_table = device->ops.create_rwq_ind_table(device,
+							 init_attr, NULL);
 	if (IS_ERR(rwq_ind_table))
 		return rwq_ind_table;
 
@@ -2274,7 +2399,7 @@
 	if (atomic_read(&rwq_ind_table->usecnt))
 		return -EBUSY;
 
-	err = rwq_ind_table->device->destroy_rwq_ind_table(rwq_ind_table);
+	err = rwq_ind_table->device->ops.destroy_rwq_ind_table(rwq_ind_table);
 	if (!err) {
 		for (i = 0; i < table_size; i++)
 			atomic_dec(&ind_tbl[i]->usecnt);
@@ -2287,52 +2412,91 @@
 int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
 		       struct ib_mr_status *mr_status)
 {
-	return mr->device->check_mr_status ?
-		mr->device->check_mr_status(mr, check_mask, mr_status) : -EOPNOTSUPP;
+	if (!mr->device->ops.check_mr_status)
+		return -EOPNOTSUPP;
+
+	return mr->device->ops.check_mr_status(mr, check_mask, mr_status);
 }
 EXPORT_SYMBOL(ib_check_mr_status);
 
 int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port,
 			 int state)
 {
-	if (!device->set_vf_link_state)
+	if (!device->ops.set_vf_link_state)
 		return -EOPNOTSUPP;
 
-	return device->set_vf_link_state(device, vf, port, state);
+	return device->ops.set_vf_link_state(device, vf, port, state);
 }
 EXPORT_SYMBOL(ib_set_vf_link_state);
 
 int ib_get_vf_config(struct ib_device *device, int vf, u8 port,
 		     struct ifla_vf_info *info)
 {
-	if (!device->get_vf_config)
+	if (!device->ops.get_vf_config)
 		return -EOPNOTSUPP;
 
-	return device->get_vf_config(device, vf, port, info);
+	return device->ops.get_vf_config(device, vf, port, info);
 }
 EXPORT_SYMBOL(ib_get_vf_config);
 
 int ib_get_vf_stats(struct ib_device *device, int vf, u8 port,
 		    struct ifla_vf_stats *stats)
 {
-	if (!device->get_vf_stats)
+	if (!device->ops.get_vf_stats)
 		return -EOPNOTSUPP;
 
-	return device->get_vf_stats(device, vf, port, stats);
+	return device->ops.get_vf_stats(device, vf, port, stats);
 }
 EXPORT_SYMBOL(ib_get_vf_stats);
 
 int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid,
 		   int type)
 {
-	if (!device->set_vf_guid)
+	if (!device->ops.set_vf_guid)
 		return -EOPNOTSUPP;
 
-	return device->set_vf_guid(device, vf, port, guid, type);
+	return device->ops.set_vf_guid(device, vf, port, guid, type);
 }
 EXPORT_SYMBOL(ib_set_vf_guid);
 
 /**
+ * ib_map_mr_sg_pi() - Map the dma mapped SG lists for PI (protection
+ *     information) and set an appropriate memory region for registration.
+ * @mr:             memory region
+ * @data_sg:        dma mapped scatterlist for data
+ * @data_sg_nents:  number of entries in data_sg
+ * @data_sg_offset: offset in bytes into data_sg
+ * @meta_sg:        dma mapped scatterlist for metadata
+ * @meta_sg_nents:  number of entries in meta_sg
+ * @meta_sg_offset: offset in bytes into meta_sg
+ * @page_size:      page vector desired page size
+ *
+ * Constraints:
+ * - The MR must be allocated with type IB_MR_TYPE_INTEGRITY.
+ *
+ * Return: 0 on success.
+ *
+ * After this completes successfully, the memory region
+ * is ready for registration.
+ */
+int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg,
+		    int data_sg_nents, unsigned int *data_sg_offset,
+		    struct scatterlist *meta_sg, int meta_sg_nents,
+		    unsigned int *meta_sg_offset, unsigned int page_size)
+{
+	if (unlikely(!mr->device->ops.map_mr_sg_pi ||
+		     WARN_ON_ONCE(mr->type != IB_MR_TYPE_INTEGRITY)))
+		return -EOPNOTSUPP;
+
+	mr->page_size = page_size;
+
+	return mr->device->ops.map_mr_sg_pi(mr, data_sg, data_sg_nents,
+					    data_sg_offset, meta_sg,
+					    meta_sg_nents, meta_sg_offset);
+}
+EXPORT_SYMBOL(ib_map_mr_sg_pi);
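
A usage sketch, assuming data_sg/meta_sg were already DMA mapped with
ib_dma_map_sg() and mr came from ib_alloc_mr_integrity(); per the kernel-doc
above, zero means success:

int ret;

ret = ib_map_mr_sg_pi(mr, data_sg, data_sg_nents, NULL,
		      meta_sg, meta_sg_nents, NULL, SZ_4K);
if (unlikely(ret))
	return ret;	/* -EOPNOTSUPP unless mr is IB_MR_TYPE_INTEGRITY */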
+
+/**
  * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list
 *     and set it as the memory region.
  * @mr:            memory region
@@ -2360,12 +2524,12 @@
 int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
 		 unsigned int *sg_offset, unsigned int page_size)
 {
-	if (unlikely(!mr->device->map_mr_sg))
+	if (unlikely(!mr->device->ops.map_mr_sg))
 		return -EOPNOTSUPP;
 
 	mr->page_size = page_size;
 
-	return mr->device->map_mr_sg(mr, sg, sg_nents, sg_offset);
+	return mr->device->ops.map_mr_sg(mr, sg, sg_nents, sg_offset);
 }
 EXPORT_SYMBOL(ib_map_mr_sg);
 
@@ -2564,8 +2728,8 @@
  */
 void ib_drain_sq(struct ib_qp *qp)
 {
-	if (qp->device->drain_sq)
-		qp->device->drain_sq(qp);
+	if (qp->device->ops.drain_sq)
+		qp->device->ops.drain_sq(qp);
 	else
 		__ib_drain_sq(qp);
 }
@@ -2592,8 +2756,8 @@
  */
 void ib_drain_rq(struct ib_qp *qp)
 {
-	if (qp->device->drain_rq)
-		qp->device->drain_rq(qp);
+	if (qp->device->ops.drain_rq)
+		qp->device->ops.drain_rq(qp);
 	else
 		__ib_drain_rq(qp);
 }
@@ -2621,3 +2785,85 @@
 		ib_drain_rq(qp);
 }
 EXPORT_SYMBOL(ib_drain_qp);
+
+struct net_device *rdma_alloc_netdev(struct ib_device *device, u8 port_num,
+				     enum rdma_netdev_t type, const char *name,
+				     unsigned char name_assign_type,
+				     void (*setup)(struct net_device *))
+{
+	struct rdma_netdev_alloc_params params;
+	struct net_device *netdev;
+	int rc;
+
+	if (!device->ops.rdma_netdev_get_params)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	rc = device->ops.rdma_netdev_get_params(device, port_num, type,
+						&params);
+	if (rc)
+		return ERR_PTR(rc);
+
+	netdev = alloc_netdev_mqs(params.sizeof_priv, name, name_assign_type,
+				  setup, params.txqs, params.rxqs);
+	if (!netdev)
+		return ERR_PTR(-ENOMEM);
+
+	return netdev;
+}
+EXPORT_SYMBOL(rdma_alloc_netdev);
+
+int rdma_init_netdev(struct ib_device *device, u8 port_num,
+		     enum rdma_netdev_t type, const char *name,
+		     unsigned char name_assign_type,
+		     void (*setup)(struct net_device *),
+		     struct net_device *netdev)
+{
+	struct rdma_netdev_alloc_params params;
+	int rc;
+
+	if (!device->ops.rdma_netdev_get_params)
+		return -EOPNOTSUPP;
+
+	rc = device->ops.rdma_netdev_get_params(device, port_num, type,
+						&params);
+	if (rc)
+		return rc;
+
+	return params.initialize_rdma_netdev(device, port_num,
+					     netdev, params.param);
+}
+EXPORT_SYMBOL(rdma_init_netdev);
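
These two helpers split netdev allocation from rdma_netdev initialization so
a caller can own the alloc/free lifecycle. A sketch of the combined flow for
an IPoIB-type rdma_netdev; my_setup is a hypothetical net_device setup hook:

struct net_device *ndev;
int rc;

ndev = rdma_alloc_netdev(device, port_num, RDMA_NETDEV_IPOIB,
			 "ib%d", NET_NAME_UNKNOWN, my_setup);
if (IS_ERR(ndev))
	return PTR_ERR(ndev);

rc = rdma_init_netdev(device, port_num, RDMA_NETDEV_IPOIB,
		      "ib%d", NET_NAME_UNKNOWN, my_setup, ndev);
if (rc) {
	free_netdev(ndev);
	return rc;
}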
+
+void __rdma_block_iter_start(struct ib_block_iter *biter,
+			     struct scatterlist *sglist, unsigned int nents,
+			     unsigned long pgsz)
+{
+	memset(biter, 0, sizeof(struct ib_block_iter));
+	biter->__sg = sglist;
+	biter->__sg_nents = nents;
+
+	/* Driver provides best block size to use */
+	biter->__pg_bit = __fls(pgsz);
+}
+EXPORT_SYMBOL(__rdma_block_iter_start);
+
+bool __rdma_block_iter_next(struct ib_block_iter *biter)
+{
+	unsigned int block_offset;
+
+	if (!biter->__sg_nents || !biter->__sg)
+		return false;
+
+	biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance;
+	block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1);
+	biter->__sg_advance += BIT_ULL(biter->__pg_bit) - block_offset;
+
+	if (biter->__sg_advance >= sg_dma_len(biter->__sg)) {
+		biter->__sg_advance = 0;
+		biter->__sg = sg_next(biter->__sg);
+		biter->__sg_nents--;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL(__rdma_block_iter_next);
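
Drivers normally consume this pair through the rdma_for_each_block() wrapper
in include/rdma/ib_verbs.h rather than calling it directly; the bnxt_re
fill_umem_pbl_tbl() hunk further down is a real conversion to it. A minimal
sketch, assuming a DMA-mapped umem and a driver PBL array:

struct ib_block_iter biter;
u64 *pbl = pbl_tbl;	/* assumed driver page-buffer list */

rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap, PAGE_SIZE)
	*pbl++ = rdma_block_iter_dma_address(&biter);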
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile
index e4f31c1..433fca5 100644
--- a/drivers/infiniband/hw/Makefile
+++ b/drivers/infiniband/hw/Makefile
@@ -3,10 +3,10 @@
 obj-$(CONFIG_INFINIBAND_QIB)		+= qib/
 obj-$(CONFIG_INFINIBAND_CXGB3)		+= cxgb3/
 obj-$(CONFIG_INFINIBAND_CXGB4)		+= cxgb4/
+obj-$(CONFIG_INFINIBAND_EFA)		+= efa/
 obj-$(CONFIG_INFINIBAND_I40IW)		+= i40iw/
 obj-$(CONFIG_MLX4_INFINIBAND)		+= mlx4/
 obj-$(CONFIG_MLX5_INFINIBAND)		+= mlx5/
-obj-$(CONFIG_INFINIBAND_NES)		+= nes/
 obj-$(CONFIG_INFINIBAND_OCRDMA)		+= ocrdma/
 obj-$(CONFIG_INFINIBAND_VMWARE_PVRDMA)	+= vmw_pvrdma/
 obj-$(CONFIG_INFINIBAND_USNIC)		+= usnic/
diff --git a/drivers/infiniband/hw/bnxt_re/Kconfig b/drivers/infiniband/hw/bnxt_re/Kconfig
index 18f5ed0..ab8779d 100644
--- a/drivers/infiniband/hw/bnxt_re/Kconfig
+++ b/drivers/infiniband/hw/bnxt_re/Kconfig
@@ -1,10 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_BNXT_RE
-    tristate "Broadcom Netxtreme HCA support"
-    depends on ETHERNET && NETDEVICES && PCI && INET && DCB
-    depends on MAY_USE_DEVLINK
-    select NET_VENDOR_BROADCOM
-    select BNXT
-    ---help---
+        tristate "Broadcom Netxtreme HCA support"
+        depends on 64BIT
+        depends on ETHERNET && NETDEVICES && PCI && INET && DCB
+        select NET_VENDOR_BROADCOM
+        select BNXT
+        ---help---
 	  This driver supports Broadcom NetXtreme-E 10/25/40/50 gigabit
 	  RoCE HCAs.  To compile this driver as a module, choose M here:
 	  the module will be called bnxt_re.
diff --git a/drivers/infiniband/hw/bnxt_re/Makefile b/drivers/infiniband/hw/bnxt_re/Makefile
index 6e3bc25..ee9bb1b 100644
--- a/drivers/infiniband/hw/bnxt_re/Makefile
+++ b/drivers/infiniband/hw/bnxt_re/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
-ccflags-y := -Idrivers/net/ethernet/broadcom/bnxt
+ccflags-y := -I $(srctree)/drivers/net/ethernet/broadcom/bnxt
 obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re.o
 bnxt_re-y := main.o ib_verbs.o \
 	     qplib_res.o qplib_rcfw.o	\
diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index 96f7689..e55a166 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -40,7 +40,6 @@
 #ifndef __BNXT_RE_H__
 #define __BNXT_RE_H__
 #define ROCE_DRV_MODULE_NAME		"bnxt_re"
-#define ROCE_DRV_MODULE_VERSION		"1.0.0"
 
 #define BNXT_RE_DESC	"Broadcom NetXtreme-C/E RoCE Driver"
 #define BNXT_RE_PAGE_SHIFT_4K		(12)
@@ -120,9 +119,12 @@
 #define BNXT_RE_FLAG_HAVE_L2_REF		3
 #define BNXT_RE_FLAG_RCFW_CHANNEL_EN		4
 #define BNXT_RE_FLAG_QOS_WORK_REG		5
+#define BNXT_RE_FLAG_RESOURCES_ALLOCATED	7
+#define BNXT_RE_FLAG_RESOURCES_INITIALIZED	8
 #define BNXT_RE_FLAG_ISSUE_ROCE_STATS          29
 	struct net_device		*netdev;
 	unsigned int			version, major, minor;
+	struct bnxt_qplib_chip_ctx	chip_ctx;
 	struct bnxt_en_dev		*en_dev;
 	struct bnxt_msix_entry		msix_entries[BNXT_RE_MAX_MSIX];
 	int				num_msix;
diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c
index 77416bc..3421a0b 100644
--- a/drivers/infiniband/hw/bnxt_re/hw_counters.c
+++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c
@@ -68,11 +68,13 @@
 	[BNXT_RE_TX_PKTS]		=  "tx_pkts",
 	[BNXT_RE_TX_BYTES]		=  "tx_bytes",
 	[BNXT_RE_RECOVERABLE_ERRORS]	=  "recoverable_errors",
+	[BNXT_RE_RX_DROPS]		=  "rx_roce_drops",
+	[BNXT_RE_RX_DISCARDS]		=  "rx_roce_discards",
 	[BNXT_RE_TO_RETRANSMITS]        = "to_retransmits",
 	[BNXT_RE_SEQ_ERR_NAKS_RCVD]     = "seq_err_naks_rcvd",
 	[BNXT_RE_MAX_RETRY_EXCEEDED]    = "max_retry_exceeded",
 	[BNXT_RE_RNR_NAKS_RCVD]         = "rnr_naks_rcvd",
-	[BNXT_RE_MISSING_RESP]          = "missin_resp",
+	[BNXT_RE_MISSING_RESP]          = "missing_resp",
 	[BNXT_RE_UNRECOVERABLE_ERR]     = "unrecoverable_err",
 	[BNXT_RE_BAD_RESP_ERR]          = "bad_resp_err",
 	[BNXT_RE_LOCAL_QP_OP_ERR]       = "local_qp_op_err",
@@ -106,7 +108,8 @@
 	[BNXT_RE_RES_CQ_LOAD_ERR]       = "res_cq_load_err",
 	[BNXT_RE_RES_SRQ_LOAD_ERR]      = "res_srq_load_err",
 	[BNXT_RE_RES_TX_PCI_ERR]        = "res_tx_pci_err",
-	[BNXT_RE_RES_RX_PCI_ERR]        = "res_rx_pci_err"
+	[BNXT_RE_RES_RX_PCI_ERR]        = "res_rx_pci_err",
+	[BNXT_RE_OUT_OF_SEQ_ERR]        = "oos_drop_count"
 };
 
 int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev,
@@ -128,6 +131,10 @@
 	if (bnxt_re_stats) {
 		stats->value[BNXT_RE_RECOVERABLE_ERRORS] =
 			le64_to_cpu(bnxt_re_stats->tx_bcast_pkts);
+		stats->value[BNXT_RE_RX_DROPS] =
+			le64_to_cpu(bnxt_re_stats->rx_drop_pkts);
+		stats->value[BNXT_RE_RX_DISCARDS] =
+			le64_to_cpu(bnxt_re_stats->rx_discard_pkts);
 		stats->value[BNXT_RE_RX_PKTS] =
 			le64_to_cpu(bnxt_re_stats->rx_ucast_pkts);
 		stats->value[BNXT_RE_RX_BYTES] =
@@ -220,6 +227,8 @@
 				rdev->stats.res_tx_pci_err;
 		stats->value[BNXT_RE_RES_RX_PCI_ERR]    =
 				rdev->stats.res_rx_pci_err;
+		stats->value[BNXT_RE_OUT_OF_SEQ_ERR]    =
+				rdev->stats.res_oos_drop_count;
 	}
 
 	return ARRAY_SIZE(bnxt_re_stat_name);
diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.h b/drivers/infiniband/hw/bnxt_re/hw_counters.h
index a01a922..76399f4 100644
--- a/drivers/infiniband/hw/bnxt_re/hw_counters.h
+++ b/drivers/infiniband/hw/bnxt_re/hw_counters.h
@@ -51,6 +51,8 @@
 	BNXT_RE_TX_PKTS,
 	BNXT_RE_TX_BYTES,
 	BNXT_RE_RECOVERABLE_ERRORS,
+	BNXT_RE_RX_DROPS,
+	BNXT_RE_RX_DISCARDS,
 	BNXT_RE_TO_RETRANSMITS,
 	BNXT_RE_SEQ_ERR_NAKS_RCVD,
 	BNXT_RE_MAX_RETRY_EXCEEDED,
@@ -90,6 +92,7 @@
 	BNXT_RE_RES_SRQ_LOAD_ERR,
 	BNXT_RE_RES_TX_PCI_ERR,
 	BNXT_RE_RES_RX_PCI_ERR,
+	BNXT_RE_OUT_OF_SEQ_ERR,
 	BNXT_RE_NUM_COUNTERS
 };
 
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index bc2b9e0..b4149dc 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -48,6 +48,7 @@
 #include <rdma/ib_addr.h>
 #include <rdma/ib_mad.h>
 #include <rdma/ib_cache.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "bnxt_ulp.h"
 
@@ -118,21 +119,6 @@
 }
 
 /* Device */
-struct net_device *bnxt_re_get_netdev(struct ib_device *ibdev, u8 port_num)
-{
-	struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
-	struct net_device *netdev = NULL;
-
-	rcu_read_lock();
-	if (rdev)
-		netdev = rdev->netdev;
-	if (netdev)
-		dev_hold(netdev);
-
-	rcu_read_unlock();
-	return netdev;
-}
-
 int bnxt_re_query_device(struct ib_device *ibdev,
 			 struct ib_device_attr *ib_attr,
 			 struct ib_udata *udata)
@@ -234,10 +220,10 @@
 
 	if (netif_running(rdev->netdev) && netif_carrier_ok(rdev->netdev)) {
 		port_attr->state = IB_PORT_ACTIVE;
-		port_attr->phys_state = 5;
+		port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
 	} else {
 		port_attr->state = IB_PORT_DOWN;
-		port_attr->phys_state = 3;
+		port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
 	}
 	port_attr->max_mtu = IB_MTU_4096;
 	port_attr->active_mtu = iboe_get_mtu(rdev->netdev->mtu);
@@ -322,6 +308,7 @@
 	struct bnxt_re_dev *rdev = to_bnxt_re_dev(attr->device, ibdev);
 	struct bnxt_qplib_sgid_tbl *sgid_tbl = &rdev->qplib_res.sgid_tbl;
 	struct bnxt_qplib_gid *gid_to_del;
+	u16 vlan_id = 0xFFFF;
 
 	/* Delete the entry from the hardware */
 	ctx = *context;
@@ -331,7 +318,8 @@
 	if (sgid_tbl && sgid_tbl->active) {
 		if (ctx->idx >= sgid_tbl->max)
 			return -EINVAL;
-		gid_to_del = &sgid_tbl->tbl[ctx->idx];
+		gid_to_del = &sgid_tbl->tbl[ctx->idx].gid;
+		vlan_id = sgid_tbl->tbl[ctx->idx].vlan_id;
 		/* DEL_GID is called in WQ context(netdevice_event_work_handler)
 		 * or via the ib_unregister_device path. In the former case QP1
 		 * may not be destroyed yet, in which case just return as FW
@@ -349,7 +337,8 @@
 		}
 		ctx->refcnt--;
 		if (!ctx->refcnt) {
-			rc = bnxt_qplib_del_sgid(sgid_tbl, gid_to_del, true);
+			rc = bnxt_qplib_del_sgid(sgid_tbl, gid_to_del,
+						 vlan_id, true);
 			if (rc) {
 				dev_err(rdev_to_dev(rdev),
 					"Failed to remove GID: %#x", rc);
@@ -374,8 +363,9 @@
 	struct bnxt_re_dev *rdev = to_bnxt_re_dev(attr->device, ibdev);
 	struct bnxt_qplib_sgid_tbl *sgid_tbl = &rdev->qplib_res.sgid_tbl;
 
-	if ((attr->ndev) && is_vlan_dev(attr->ndev))
-		vlan_id = vlan_dev_vlan_id(attr->ndev);
+	rc = rdma_read_gid_l2_fields(attr, &vlan_id, NULL);
+	if (rc)
+		return rc;
 
 	rc = bnxt_qplib_add_sgid(sgid_tbl, (struct bnxt_qplib_gid *)&attr->gid,
 				 rdev->qplib_res.netdev->dev_addr,
@@ -563,41 +553,27 @@
 }
 
 /* Protection Domains */
-int bnxt_re_dealloc_pd(struct ib_pd *ib_pd)
+void bnxt_re_dealloc_pd(struct ib_pd *ib_pd, struct ib_udata *udata)
 {
 	struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
 	struct bnxt_re_dev *rdev = pd->rdev;
-	int rc;
 
 	bnxt_re_destroy_fence_mr(pd);
 
-	if (pd->qplib_pd.id) {
-		rc = bnxt_qplib_dealloc_pd(&rdev->qplib_res,
-					   &rdev->qplib_res.pd_tbl,
-					   &pd->qplib_pd);
-		if (rc)
-			dev_err(rdev_to_dev(rdev), "Failed to deallocate HW PD");
-	}
-
-	kfree(pd);
-	return 0;
+	if (pd->qplib_pd.id)
+		bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl,
+				      &pd->qplib_pd);
 }
 
-struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev,
-			       struct ib_ucontext *ucontext,
-			       struct ib_udata *udata)
+int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibpd->device;
 	struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
-	struct bnxt_re_ucontext *ucntx = container_of(ucontext,
-						      struct bnxt_re_ucontext,
-						      ib_uctx);
-	struct bnxt_re_pd *pd;
+	struct bnxt_re_ucontext *ucntx = rdma_udata_to_drv_context(
+		udata, struct bnxt_re_ucontext, ib_uctx);
+	struct bnxt_re_pd *pd = container_of(ibpd, struct bnxt_re_pd, ib_pd);
 	int rc;
 
-	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd)
-		return ERR_PTR(-ENOMEM);
-
 	pd->rdev = rdev;
 	if (bnxt_qplib_alloc_pd(&rdev->qplib_res.pd_tbl, &pd->qplib_pd)) {
 		dev_err(rdev_to_dev(rdev), "Failed to allocate HW PD");
@@ -637,49 +613,58 @@
 		if (bnxt_re_create_fence_mr(pd))
 			dev_warn(rdev_to_dev(rdev),
 				 "Failed to create Fence-MR\n");
-	return &pd->ib_pd;
+	return 0;
 dbfail:
-	(void)bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl,
-				    &pd->qplib_pd);
+	bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl,
+			      &pd->qplib_pd);
 fail:
-	kfree(pd);
-	return ERR_PTR(rc);
+	return rc;
 }
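
The rdma_udata_to_drv_context() pattern introduced here (and repeated for the
AH, QP, SRQ and CQ conversions below) replaces dereferencing
ib_pd->uobject->context, which is NULL for kernel consumers; the macro derives
the driver ucontext from the udata and itself yields NULL when udata is NULL.
A sketch of the shape, with get_user_dpi() a hypothetical helper:

static struct bnxt_qplib_dpi *get_user_dpi(struct ib_udata *udata)
{
	struct bnxt_re_ucontext *uctx = rdma_udata_to_drv_context(
		udata, struct bnxt_re_ucontext, ib_uctx);

	return uctx ? &uctx->dpi : NULL;	/* NULL for kernel callers */
}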
 
 /* Address Handles */
-int bnxt_re_destroy_ah(struct ib_ah *ib_ah)
+void bnxt_re_destroy_ah(struct ib_ah *ib_ah, u32 flags)
 {
 	struct bnxt_re_ah *ah = container_of(ib_ah, struct bnxt_re_ah, ib_ah);
 	struct bnxt_re_dev *rdev = ah->rdev;
-	int rc;
 
-	rc = bnxt_qplib_destroy_ah(&rdev->qplib_res, &ah->qplib_ah);
-	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to destroy HW AH");
-		return rc;
-	}
-	kfree(ah);
-	return 0;
+	bnxt_qplib_destroy_ah(&rdev->qplib_res, &ah->qplib_ah,
+			      !(flags & RDMA_DESTROY_AH_SLEEPABLE));
 }
 
-struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd,
-				struct rdma_ah_attr *ah_attr,
-				struct ib_udata *udata)
+static u8 bnxt_re_stack_to_dev_nw_type(enum rdma_network_type ntype)
 {
-	struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
-	struct bnxt_re_dev *rdev = pd->rdev;
-	struct bnxt_re_ah *ah;
-	const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr);
-	int rc;
 	u8 nw_type;
 
+	switch (ntype) {
+	case RDMA_NETWORK_IPV4:
+		nw_type = CMDQ_CREATE_AH_TYPE_V2IPV4;
+		break;
+	case RDMA_NETWORK_IPV6:
+		nw_type = CMDQ_CREATE_AH_TYPE_V2IPV6;
+		break;
+	default:
+		nw_type = CMDQ_CREATE_AH_TYPE_V1;
+		break;
+	}
+	return nw_type;
+}
+
+int bnxt_re_create_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr,
+		      u32 flags, struct ib_udata *udata)
+{
+	struct ib_pd *ib_pd = ib_ah->pd;
+	struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
+	const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr);
+	struct bnxt_re_dev *rdev = pd->rdev;
+	const struct ib_gid_attr *sgid_attr;
+	struct bnxt_re_ah *ah = container_of(ib_ah, struct bnxt_re_ah, ib_ah);
+	u8 nw_type;
+	int rc;
+
 	if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) {
 		dev_err(rdev_to_dev(rdev), "Failed to alloc AH: GRH not set");
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
-	ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
-	if (!ah)
-		return ERR_PTR(-ENOMEM);
 
 	ah->rdev = rdev;
 	ah->qplib_ah.pd = &pd->qplib_pd;
@@ -698,44 +683,27 @@
 	ah->qplib_ah.flow_label = grh->flow_label;
 	ah->qplib_ah.hop_limit = grh->hop_limit;
 	ah->qplib_ah.sl = rdma_ah_get_sl(ah_attr);
-	if (ib_pd->uobject &&
-	    !rdma_is_multicast_addr((struct in6_addr *)
-				    grh->dgid.raw) &&
-	    !rdma_link_local_addr((struct in6_addr *)
-				  grh->dgid.raw)) {
-		const struct ib_gid_attr *sgid_attr;
 
-		sgid_attr = grh->sgid_attr;
-		/* Get network header type for this GID */
-		nw_type = rdma_gid_attr_network_type(sgid_attr);
-		switch (nw_type) {
-		case RDMA_NETWORK_IPV4:
-			ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V2IPV4;
-			break;
-		case RDMA_NETWORK_IPV6:
-			ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V2IPV6;
-			break;
-		default:
-			ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V1;
-			break;
-		}
-	}
+	sgid_attr = grh->sgid_attr;
+	/* Get network header type for this GID */
+	nw_type = rdma_gid_attr_network_type(sgid_attr);
+	ah->qplib_ah.nw_type = bnxt_re_stack_to_dev_nw_type(nw_type);
 
 	memcpy(ah->qplib_ah.dmac, ah_attr->roce.dmac, ETH_ALEN);
-	rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah);
+	rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah,
+				  !(flags & RDMA_CREATE_AH_SLEEPABLE));
 	if (rc) {
 		dev_err(rdev_to_dev(rdev), "Failed to allocate HW AH");
-		goto fail;
+		return rc;
 	}
 
 	/* Write AVID to shared page. */
-	if (ib_pd->uobject) {
-		struct ib_ucontext *ib_uctx = ib_pd->uobject->context;
-		struct bnxt_re_ucontext *uctx;
+	if (udata) {
+		struct bnxt_re_ucontext *uctx = rdma_udata_to_drv_context(
+			udata, struct bnxt_re_ucontext, ib_uctx);
 		unsigned long flag;
 		u32 *wrptr;
 
-		uctx = container_of(ib_uctx, struct bnxt_re_ucontext, ib_uctx);
 		spin_lock_irqsave(&uctx->sh_lock, flag);
 		wrptr = (u32 *)(uctx->shpg + BNXT_RE_AVID_OFFT);
 		*wrptr = ah->qplib_ah.id;
@@ -743,11 +711,7 @@
 		spin_unlock_irqrestore(&uctx->sh_lock, flag);
 	}
 
-	return &ah->ib_ah;
-
-fail:
-	kfree(ah);
-	return ERR_PTR(rc);
+	return 0;
 }
 
 int bnxt_re_modify_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr)
@@ -797,12 +761,12 @@
 }
 
 /* Queue Pairs */
-int bnxt_re_destroy_qp(struct ib_qp *ib_qp)
+int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
 {
 	struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp);
 	struct bnxt_re_dev *rdev = qp->rdev;
-	int rc;
 	unsigned int flags;
+	int rc;
 
 	bnxt_qplib_flush_cqn_wq(&qp->qplib_qp);
 	rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp);
@@ -811,19 +775,17 @@
 		return rc;
 	}
 
-	flags = bnxt_re_lock_cqs(qp);
-	bnxt_qplib_clean_qp(&qp->qplib_qp);
-	bnxt_re_unlock_cqs(qp, flags);
+	if (rdma_is_kernel_res(&qp->ib_qp.res)) {
+		flags = bnxt_re_lock_cqs(qp);
+		bnxt_qplib_clean_qp(&qp->qplib_qp);
+		bnxt_re_unlock_cqs(qp, flags);
+	}
+
 	bnxt_qplib_free_qp_res(&rdev->qplib_res, &qp->qplib_qp);
 
 	if (ib_qp->qp_type == IB_QPT_GSI && rdev->qp1_sqp) {
-		rc = bnxt_qplib_destroy_ah(&rdev->qplib_res,
-					   &rdev->sqp_ah->qplib_ah);
-		if (rc) {
-			dev_err(rdev_to_dev(rdev),
-				"Failed to destroy HW AH for shadow QP");
-			return rc;
-		}
+		bnxt_qplib_destroy_ah(&rdev->qplib_res, &rdev->sqp_ah->qplib_ah,
+				      false);
 
 		bnxt_qplib_clean_qp(&qp->qplib_qp);
 		rc = bnxt_qplib_destroy_qp(&rdev->qplib_res,
@@ -846,10 +808,8 @@
 		rdev->sqp_ah = NULL;
 	}
 
-	if (!IS_ERR_OR_NULL(qp->rumem))
-		ib_umem_release(qp->rumem);
-	if (!IS_ERR_OR_NULL(qp->sumem))
-		ib_umem_release(qp->sumem);
+	ib_umem_release(qp->rumem);
+	ib_umem_release(qp->sumem);
 
 	mutex_lock(&rdev->qp_lock);
 	list_del(&qp->list);
@@ -879,39 +839,43 @@
 	struct bnxt_re_qp_req ureq;
 	struct bnxt_qplib_qp *qplib_qp = &qp->qplib_qp;
 	struct ib_umem *umem;
-	int bytes = 0;
-	struct ib_ucontext *context = pd->ib_pd.uobject->context;
-	struct bnxt_re_ucontext *cntx = container_of(context,
-						     struct bnxt_re_ucontext,
-						     ib_uctx);
+	int bytes = 0, psn_sz;
+	struct bnxt_re_ucontext *cntx = rdma_udata_to_drv_context(
+		udata, struct bnxt_re_ucontext, ib_uctx);
+
 	if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
 		return -EFAULT;
 
 	bytes = (qplib_qp->sq.max_wqe * BNXT_QPLIB_MAX_SQE_ENTRY_SIZE);
 	/* Consider mapping PSN search memory only for RC QPs. */
-	if (qplib_qp->type == CMDQ_CREATE_QP_TYPE_RC)
-		bytes += (qplib_qp->sq.max_wqe * sizeof(struct sq_psn_search));
+	if (qplib_qp->type == CMDQ_CREATE_QP_TYPE_RC) {
+		psn_sz = bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx) ?
+					sizeof(struct sq_psn_search_ext) :
+					sizeof(struct sq_psn_search);
+		bytes += (qplib_qp->sq.max_wqe * psn_sz);
+	}
 	bytes = PAGE_ALIGN(bytes);
-	umem = ib_umem_get(context, ureq.qpsva, bytes,
-			   IB_ACCESS_LOCAL_WRITE, 1);
+	umem = ib_umem_get(udata, ureq.qpsva, bytes, IB_ACCESS_LOCAL_WRITE, 1);
 	if (IS_ERR(umem))
 		return PTR_ERR(umem);
 
 	qp->sumem = umem;
-	qplib_qp->sq.sglist = umem->sg_head.sgl;
-	qplib_qp->sq.nmap = umem->nmap;
+	qplib_qp->sq.sg_info.sglist = umem->sg_head.sgl;
+	qplib_qp->sq.sg_info.npages = ib_umem_num_pages(umem);
+	qplib_qp->sq.sg_info.nmap = umem->nmap;
 	qplib_qp->qp_handle = ureq.qp_handle;
 
 	if (!qp->qplib_qp.srq) {
 		bytes = (qplib_qp->rq.max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
 		bytes = PAGE_ALIGN(bytes);
-		umem = ib_umem_get(context, ureq.qprva, bytes,
+		umem = ib_umem_get(udata, ureq.qprva, bytes,
 				   IB_ACCESS_LOCAL_WRITE, 1);
 		if (IS_ERR(umem))
 			goto rqfail;
 		qp->rumem = umem;
-		qplib_qp->rq.sglist = umem->sg_head.sgl;
-		qplib_qp->rq.nmap = umem->nmap;
+		qplib_qp->rq.sg_info.sglist = umem->sg_head.sgl;
+		qplib_qp->rq.sg_info.npages = ib_umem_num_pages(umem);
+		qplib_qp->rq.sg_info.nmap = umem->nmap;
 	}
 
 	qplib_qp->dpi = &cntx->dpi;
@@ -919,8 +883,7 @@
 rqfail:
 	ib_umem_release(qp->sumem);
 	qp->sumem = NULL;
-	qplib_qp->sq.sglist = NULL;
-	qplib_qp->sq.nmap = 0;
+	memset(&qplib_qp->sq.sg_info, 0, sizeof(qplib_qp->sq.sg_info));
 
 	return PTR_ERR(umem);
 }
@@ -958,7 +921,7 @@
 	/* Have DMAC same as SMAC */
 	ether_addr_copy(ah->qplib_ah.dmac, rdev->netdev->dev_addr);
 
-	rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah);
+	rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah, false);
 	if (rc) {
 		dev_err(rdev_to_dev(rdev),
 			"Failed to allocate HW AH for Shadow QP");
@@ -1063,12 +1026,17 @@
 	qp->qplib_qp.pd = &pd->qplib_pd;
 	qp->qplib_qp.qp_handle = (u64)(unsigned long)(&qp->qplib_qp);
 	qp->qplib_qp.type = __from_ib_qp_type(qp_init_attr->qp_type);
+
+	if (qp_init_attr->qp_type == IB_QPT_GSI &&
+	    bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx))
+		qp->qplib_qp.type = CMDQ_CREATE_QP_TYPE_GSI;
 	if (qp->qplib_qp.type == IB_QPT_MAX) {
 		dev_err(rdev_to_dev(rdev), "QP type 0x%x not supported",
 			qp->qplib_qp.type);
 		rc = -EINVAL;
 		goto fail;
 	}
+
 	qp->qplib_qp.max_inline_data = qp_init_attr->cap.max_inline_data;
 	qp->qplib_qp.sig_type = ((qp_init_attr->sq_sig_type ==
 				  IB_SIGNAL_ALL_WR) ? true : false);
@@ -1129,7 +1097,8 @@
 
 	qp->qplib_qp.mtu = ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu));
 
-	if (qp_init_attr->qp_type == IB_QPT_GSI) {
+	if (qp_init_attr->qp_type == IB_QPT_GSI &&
+	    !(bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx))) {
 		/* Allocate 1 more than what's provided */
 		entries = roundup_pow_of_two(qp_init_attr->cap.max_send_wr + 1);
 		qp->qplib_qp.sq.max_wqe = min_t(u32, entries,
@@ -1233,12 +1202,8 @@
 qp_destroy:
 	bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp);
 free_umem:
-	if (udata) {
-		if (qp->rumem)
-			ib_umem_release(qp->rumem);
-		if (qp->sumem)
-			ib_umem_release(qp->sumem);
-	}
+	ib_umem_release(qp->rumem);
+	ib_umem_release(qp->sumem);
 fail:
 	kfree(qp);
 	return ERR_PTR(rc);
@@ -1323,30 +1288,21 @@
 }
 
 /* Shared Receive Queues */
-int bnxt_re_destroy_srq(struct ib_srq *ib_srq)
+void bnxt_re_destroy_srq(struct ib_srq *ib_srq, struct ib_udata *udata)
 {
 	struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq,
 					       ib_srq);
 	struct bnxt_re_dev *rdev = srq->rdev;
 	struct bnxt_qplib_srq *qplib_srq = &srq->qplib_srq;
 	struct bnxt_qplib_nq *nq = NULL;
-	int rc;
 
 	if (qplib_srq->cq)
 		nq = qplib_srq->cq->nq;
-	rc = bnxt_qplib_destroy_srq(&rdev->qplib_res, qplib_srq);
-	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Destroy HW SRQ failed!");
-		return rc;
-	}
-
-	if (srq->umem)
-		ib_umem_release(srq->umem);
-	kfree(srq);
+	bnxt_qplib_destroy_srq(&rdev->qplib_res, qplib_srq);
+	ib_umem_release(srq->umem);
 	atomic_dec(&rdev->srq_count);
 	if (nq)
 		nq->budget--;
-	return 0;
 }
 
 static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev,
@@ -1358,37 +1314,38 @@
 	struct bnxt_qplib_srq *qplib_srq = &srq->qplib_srq;
 	struct ib_umem *umem;
 	int bytes = 0;
-	struct ib_ucontext *context = pd->ib_pd.uobject->context;
-	struct bnxt_re_ucontext *cntx = container_of(context,
-						     struct bnxt_re_ucontext,
-						     ib_uctx);
+	struct bnxt_re_ucontext *cntx = rdma_udata_to_drv_context(
+		udata, struct bnxt_re_ucontext, ib_uctx);
+
 	if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
 		return -EFAULT;
 
 	bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
 	bytes = PAGE_ALIGN(bytes);
-	umem = ib_umem_get(context, ureq.srqva, bytes,
-			   IB_ACCESS_LOCAL_WRITE, 1);
+	umem = ib_umem_get(udata, ureq.srqva, bytes, IB_ACCESS_LOCAL_WRITE, 1);
 	if (IS_ERR(umem))
 		return PTR_ERR(umem);
 
 	srq->umem = umem;
-	qplib_srq->nmap = umem->nmap;
-	qplib_srq->sglist = umem->sg_head.sgl;
+	qplib_srq->sg_info.sglist = umem->sg_head.sgl;
+	qplib_srq->sg_info.npages = ib_umem_num_pages(umem);
+	qplib_srq->sg_info.nmap = umem->nmap;
 	qplib_srq->srq_handle = ureq.srq_handle;
 	qplib_srq->dpi = &cntx->dpi;
 
 	return 0;
 }
 
-struct ib_srq *bnxt_re_create_srq(struct ib_pd *ib_pd,
-				  struct ib_srq_init_attr *srq_init_attr,
-				  struct ib_udata *udata)
+int bnxt_re_create_srq(struct ib_srq *ib_srq,
+		       struct ib_srq_init_attr *srq_init_attr,
+		       struct ib_udata *udata)
 {
+	struct ib_pd *ib_pd = ib_srq->pd;
 	struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
 	struct bnxt_re_dev *rdev = pd->rdev;
 	struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
-	struct bnxt_re_srq *srq;
+	struct bnxt_re_srq *srq =
+		container_of(ib_srq, struct bnxt_re_srq, ib_srq);
 	struct bnxt_qplib_nq *nq = NULL;
 	int rc, entries;
 
@@ -1403,11 +1360,6 @@
 		goto exit;
 	}
 
-	srq = kzalloc(sizeof(*srq), GFP_KERNEL);
-	if (!srq) {
-		rc = -ENOMEM;
-		goto exit;
-	}
 	srq->rdev = rdev;
 	srq->qplib_srq.pd = &pd->qplib_pd;
 	srq->qplib_srq.dpi = &rdev->dpi_privileged;
@@ -1446,21 +1398,19 @@
 			dev_err(rdev_to_dev(rdev), "SRQ copy to udata failed!");
 			bnxt_qplib_destroy_srq(&rdev->qplib_res,
 					       &srq->qplib_srq);
-			goto exit;
+			goto fail;
 		}
 	}
 	if (nq)
 		nq->budget++;
 	atomic_inc(&rdev->srq_count);
 
-	return &srq->ib_srq;
+	return 0;
 
 fail:
-	if (srq->umem)
-		ib_umem_release(srq->umem);
-	kfree(srq);
+	ib_umem_release(srq->umem);
 exit:
-	return ERR_PTR(rc);
+	return rc;
 }
 
 int bnxt_re_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr,
@@ -1598,8 +1548,7 @@
 		curr_qp_state = __to_ib_qp_state(qp->qplib_qp.cur_qp_state);
 		new_qp_state = qp_attr->qp_state;
 		if (!ib_modify_qp_is_ok(curr_qp_state, new_qp_state,
-					ib_qp->qp_type, qp_attr_mask,
-					IB_LINK_LAYER_ETHERNET)) {
+					ib_qp->qp_type, qp_attr_mask)) {
 			dev_err(rdev_to_dev(rdev),
 				"Invalid attribute mask: %#x specified ",
 				qp_attr_mask);
@@ -1644,6 +1593,9 @@
 			__from_ib_access_flags(qp_attr->qp_access_flags);
 		/* LOCAL_WRITE access must be set to allow RC receive */
 		qp->qplib_qp.access |= BNXT_QPLIB_ACCESS_LOCAL_WRITE;
+		/* Temp: Set all params on QP as of now */
+		qp->qplib_qp.access |= CMDQ_MODIFY_QP_ACCESS_REMOTE_WRITE;
+		qp->qplib_qp.access |= CMDQ_MODIFY_QP_ACCESS_REMOTE_READ;
 	}
 	if (qp_attr_mask & IB_QP_PKEY_INDEX) {
 		qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PKEY;
@@ -1681,8 +1633,11 @@
 				qp_attr->ah_attr.roce.dmac);
 
 		sgid_attr = qp_attr->ah_attr.grh.sgid_attr;
-		memcpy(qp->qplib_qp.smac, sgid_attr->ndev->dev_addr,
-		       ETH_ALEN);
+		rc = rdma_read_gid_l2_fields(sgid_attr, NULL,
+					     &qp->qplib_qp.smac[0]);
+		if (rc)
+			return rc;
+
 		nw_type = rdma_gid_attr_network_type(sgid_attr);
 		switch (nw_type) {
 		case RDMA_NETWORK_IPV4:
@@ -1901,8 +1856,10 @@
 
 	memset(&qp->qp1_hdr, 0, sizeof(qp->qp1_hdr));
 
-	if (is_vlan_dev(sgid_attr->ndev))
-		vlan_id = vlan_dev_vlan_id(sgid_attr->ndev);
+	rc = rdma_read_gid_l2_fields(sgid_attr, &vlan_id, NULL);
+	if (rc)
+		return rc;
+
 	/* Get network header type for this GID */
 	nw_type = rdma_gid_attr_network_type(sgid_attr);
 	switch (nw_type) {
@@ -2091,7 +2048,8 @@
 
 static int is_ud_qp(struct bnxt_re_qp *qp)
 {
-	return qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_UD;
+	return (qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_UD ||
+		qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_GSI);
 }
 
 static int bnxt_re_build_send_wqe(struct bnxt_re_qp *qp,
@@ -2395,7 +2353,7 @@
 		switch (wr->opcode) {
 		case IB_WR_SEND:
 		case IB_WR_SEND_WITH_IMM:
-			if (ib_qp->qp_type == IB_QPT_GSI) {
+			if (qp->qplib_qp.type == CMDQ_CREATE_QP1_TYPE_GSI) {
 				rc = bnxt_re_build_qp1_send_v2(qp, wr, &wqe,
 							       payload_sz);
 				if (rc)
@@ -2525,7 +2483,8 @@
 		wqe.wr_id = wr->wr_id;
 		wqe.type = BNXT_QPLIB_SWQE_TYPE_RECV;
 
-		if (ib_qp->qp_type == IB_QPT_GSI)
+		if (ib_qp->qp_type == IB_QPT_GSI &&
+		    qp->qplib_qp.type != CMDQ_CREATE_QP_TYPE_GSI)
 			rc = bnxt_re_build_qp1_shadow_qp_recv(qp, wr, &wqe,
 							      payload_sz);
 		if (!rc)
@@ -2553,9 +2512,8 @@
 }
 
 /* Completion Queues */
-int bnxt_re_destroy_cq(struct ib_cq *ib_cq)
+void bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
-	int rc;
 	struct bnxt_re_cq *cq;
 	struct bnxt_qplib_nq *nq;
 	struct bnxt_re_dev *rdev;
@@ -2564,30 +2522,20 @@
 	rdev = cq->rdev;
 	nq = cq->qplib_cq.nq;
 
-	rc = bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq);
-	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to destroy HW CQ");
-		return rc;
-	}
-	if (!IS_ERR_OR_NULL(cq->umem))
-		ib_umem_release(cq->umem);
+	bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq);
+	ib_umem_release(cq->umem);
 
 	atomic_dec(&rdev->cq_count);
 	nq->budget--;
 	kfree(cq->cql);
-	kfree(cq);
-
-	return 0;
 }
 
-struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
-				const struct ib_cq_init_attr *attr,
-				struct ib_ucontext *context,
-				struct ib_udata *udata)
+int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		      struct ib_udata *udata)
 {
-	struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
+	struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibcq->device, ibdev);
 	struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
-	struct bnxt_re_cq *cq = NULL;
+	struct bnxt_re_cq *cq = container_of(ibcq, struct bnxt_re_cq, ib_cq);
 	int rc, entries;
 	int cqe = attr->cqe;
 	struct bnxt_qplib_nq *nq = NULL;
@@ -2596,11 +2544,8 @@
 	/* Validate CQ fields */
 	if (cqe < 1 || cqe > dev_attr->max_cq_wqes) {
 		dev_err(rdev_to_dev(rdev), "Failed to create CQ -max exceeded");
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
-	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-	if (!cq)
-		return ERR_PTR(-ENOMEM);
 
 	cq->rdev = rdev;
 	cq->qplib_cq.cq_handle = (u64)(unsigned long)(&cq->qplib_cq);
@@ -2609,26 +2554,25 @@
 	if (entries > dev_attr->max_cq_wqes + 1)
 		entries = dev_attr->max_cq_wqes + 1;
 
-	if (context) {
+	if (udata) {
 		struct bnxt_re_cq_req req;
-		struct bnxt_re_ucontext *uctx = container_of
-						(context,
-						 struct bnxt_re_ucontext,
-						 ib_uctx);
+		struct bnxt_re_ucontext *uctx = rdma_udata_to_drv_context(
+			udata, struct bnxt_re_ucontext, ib_uctx);
 		if (ib_copy_from_udata(&req, udata, sizeof(req))) {
 			rc = -EFAULT;
 			goto fail;
 		}
 
-		cq->umem = ib_umem_get(context, req.cq_va,
+		cq->umem = ib_umem_get(udata, req.cq_va,
 				       entries * sizeof(struct cq_base),
 				       IB_ACCESS_LOCAL_WRITE, 1);
 		if (IS_ERR(cq->umem)) {
 			rc = PTR_ERR(cq->umem);
 			goto fail;
 		}
-		cq->qplib_cq.sghead = cq->umem->sg_head.sgl;
-		cq->qplib_cq.nmap = cq->umem->nmap;
+		cq->qplib_cq.sg_info.sglist = cq->umem->sg_head.sgl;
+		cq->qplib_cq.sg_info.npages = ib_umem_num_pages(cq->umem);
+		cq->qplib_cq.sg_info.nmap = cq->umem->nmap;
 		cq->qplib_cq.dpi = &uctx->dpi;
 	} else {
 		cq->max_cql = min_t(u32, entries, MAX_CQL_PER_POLL);
@@ -2640,8 +2584,6 @@
 		}
 
 		cq->qplib_cq.dpi = &rdev->dpi_privileged;
-		cq->qplib_cq.sghead = NULL;
-		cq->qplib_cq.nmap = 0;
 	}
 	/*
 	 * Allocating the NQ in a round robin fashion. nq_alloc_cnt is a
@@ -2664,8 +2606,9 @@
 	nq->budget++;
 
 	atomic_inc(&rdev->cq_count);
+	spin_lock_init(&cq->cq_lock);
 
-	if (context) {
+	if (udata) {
 		struct bnxt_re_cq_resp resp;
 
 		resp.cqid = cq->qplib_cq.id;
@@ -2680,15 +2623,13 @@
 		}
 	}
 
-	return &cq->ib_cq;
+	return 0;
 
 c2fail:
-	if (context)
-		ib_umem_release(cq->umem);
+	ib_umem_release(cq->umem);
 fail:
 	kfree(cq->cql);
-	kfree(cq);
-	return ERR_PTR(rc);
+	return rc;
 }
 
 static u8 __req_to_ib_wc_status(u8 qstatus)
@@ -3119,19 +3060,33 @@
 	}
 }
 
-static void bnxt_re_process_res_ud_wc(struct ib_wc *wc,
+static void bnxt_re_process_res_ud_wc(struct bnxt_re_qp *qp,
+				      struct ib_wc *wc,
 				      struct bnxt_qplib_cqe *cqe)
 {
+	u8 nw_type;
+
 	wc->opcode = IB_WC_RECV;
 	wc->status = __rc_to_ib_wc_status(cqe->status);
 
-	if (cqe->flags & CQ_RES_RC_FLAGS_IMM)
+	if (cqe->flags & CQ_RES_UD_FLAGS_IMM)
 		wc->wc_flags |= IB_WC_WITH_IMM;
-	if (cqe->flags & CQ_RES_RC_FLAGS_INV)
-		wc->wc_flags |= IB_WC_WITH_INVALIDATE;
-	if ((cqe->flags & (CQ_RES_RC_FLAGS_RDMA | CQ_RES_RC_FLAGS_IMM)) ==
-	    (CQ_RES_RC_FLAGS_RDMA | CQ_RES_RC_FLAGS_IMM))
-		wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+	/* report SMAC/VLAN/network header only for the GSI QP on Thor */
+	if (qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_GSI) {
+		wc->wc_flags |= IB_WC_GRH;
+		memcpy(wc->smac, cqe->smac, ETH_ALEN);
+		wc->wc_flags |= IB_WC_WITH_SMAC;
+		if (cqe->flags & CQ_RES_UD_FLAGS_META_FORMAT_VLAN) {
+			wc->vlan_id = (cqe->cfa_meta & 0xFFF);
+			if (wc->vlan_id < 0x1000)
+				wc->wc_flags |= IB_WC_WITH_VLAN;
+		}
+		nw_type = (cqe->flags & CQ_RES_UD_FLAGS_ROCE_IP_VER_MASK) >>
+			   CQ_RES_UD_FLAGS_ROCE_IP_VER_SFT;
+		wc->network_hdr_type = bnxt_re_to_ib_nw_type(nw_type);
+		wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE;
+	}
 }
 
 static int send_phantom_wqe(struct bnxt_re_qp *qp)
@@ -3223,7 +3178,7 @@
 
 			switch (cqe->opcode) {
 			case CQ_BASE_CQE_TYPE_REQ:
-				if (qp->qplib_qp.id ==
+				if (qp->rdev->qp1_sqp && qp->qplib_qp.id ==
 				    qp->rdev->qp1_sqp->qplib_qp.id) {
 					/* Handle this completion with
 					 * the stored completion
@@ -3258,7 +3213,7 @@
 				bnxt_re_process_res_rc_wc(wc, cqe);
 				break;
 			case CQ_BASE_CQE_TYPE_RES_UD:
-				if (qp->qplib_qp.id ==
+				if (qp->rdev->qp1_sqp && qp->qplib_qp.id ==
 				    qp->rdev->qp1_sqp->qplib_qp.id) {
 					/* Handle this completion with
 					 * the stored completion
@@ -3271,7 +3226,7 @@
 						break;
 					}
 				}
-				bnxt_re_process_res_ud_wc(wc, cqe);
+				bnxt_re_process_res_ud_wc(qp, wc, cqe);
 				break;
 			default:
 				dev_err(rdev_to_dev(cq->rdev),
@@ -3298,10 +3253,10 @@
 	spin_lock_irqsave(&cq->cq_lock, flags);
 	/* Trigger on the very next completion */
 	if (ib_cqn_flags & IB_CQ_NEXT_COMP)
-		type = DBR_DBR_TYPE_CQ_ARMALL;
+		type = DBC_DBC_TYPE_CQ_ARMALL;
 	/* Trigger on the next solicited completion */
 	else if (ib_cqn_flags & IB_CQ_SOLICITED)
-		type = DBR_DBR_TYPE_CQ_ARMSE;
+		type = DBC_DBC_TYPE_CQ_ARMSE;
 
 	/* Poll to see if there are missed events */
 	if ((ib_cqn_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
@@ -3361,7 +3316,7 @@
 	return ERR_PTR(rc);
 }
 
-int bnxt_re_dereg_mr(struct ib_mr *ib_mr)
+int bnxt_re_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
 {
 	struct bnxt_re_mr *mr = container_of(ib_mr, struct bnxt_re_mr, ib_mr);
 	struct bnxt_re_dev *rdev = mr->rdev;
@@ -3378,8 +3333,7 @@
 		mr->npages = 0;
 		mr->pages = NULL;
 	}
-	if (!IS_ERR_OR_NULL(mr->ib_umem))
-		ib_umem_release(mr->ib_umem);
+	ib_umem_release(mr->ib_umem);
 
 	kfree(mr);
 	atomic_dec(&rdev->mr_count);
@@ -3407,7 +3361,7 @@
 }
 
 struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type,
-			       u32 max_num_sg)
+			       u32 max_num_sg, struct ib_udata *udata)
 {
 	struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
 	struct bnxt_re_dev *rdev = pd->rdev;
@@ -3532,22 +3486,12 @@
 			     int page_shift)
 {
 	u64 *pbl_tbl = pbl_tbl_orig;
-	u64 paddr;
-	u64 page_mask = (1ULL << page_shift) - 1;
-	int i, pages;
-	struct scatterlist *sg;
-	int entry;
+	u64 page_size =  BIT_ULL(page_shift);
+	struct ib_block_iter biter;
 
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		pages = sg_dma_len(sg) >> PAGE_SHIFT;
-		for (i = 0; i < pages; i++) {
-			paddr = sg_dma_address(sg) + (i << PAGE_SHIFT);
-			if (pbl_tbl == pbl_tbl_orig)
-				*pbl_tbl++ = paddr & ~page_mask;
-			else if ((paddr & page_mask) == 0)
-				*pbl_tbl++ = paddr;
-		}
-	}
+	rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap, page_size)
+		*pbl_tbl++ = rdma_block_iter_dma_address(&biter);
+
 	return pbl_tbl - pbl_tbl_orig;
 }
 
@@ -3586,8 +3530,7 @@
 	/* The fixed portion of the rkey is the same as the lkey */
 	mr->ib_mr.rkey = mr->qplib_mr.rkey;
 
-	umem = ib_umem_get(ib_pd->uobject->context, start, length,
-			   mr_access_flags, 0);
+	umem = ib_umem_get(udata, start, length, mr_access_flags, 0);
 	if (IS_ERR(umem)) {
 		dev_err(rdev_to_dev(rdev), "Failed to get umem");
 		rc = -EFAULT;
@@ -3610,7 +3553,9 @@
 		goto free_umem;
 	}
 
-	page_shift = umem->page_shift;
+	page_shift = __ffs(ib_umem_find_best_pgsz(umem,
+				BNXT_RE_PAGE_SIZE_4K | BNXT_RE_PAGE_SIZE_2M,
+				virt_addr));
 
 	if (!bnxt_re_page_size_ok(page_shift)) {
 		dev_err(rdev_to_dev(rdev), "umem page size unsupported!");
@@ -3618,17 +3563,13 @@
 		goto fail;
 	}
 
-	if (!umem->hugetlb && length > BNXT_RE_MAX_MR_SIZE_LOW) {
+	if (page_shift == BNXT_RE_PAGE_SHIFT_4K &&
+	    length > BNXT_RE_MAX_MR_SIZE_LOW) {
 		dev_err(rdev_to_dev(rdev), "Requested MR Sz:%llu Max sup:%llu",
 			length,	(u64)BNXT_RE_MAX_MR_SIZE_LOW);
 		rc = -EINVAL;
 		goto fail;
 	}
-	if (umem->hugetlb && length > BNXT_RE_PAGE_SIZE_2M) {
-		page_shift = BNXT_RE_PAGE_SHIFT_2M;
-		dev_warn(rdev_to_dev(rdev), "umem hugetlb set page_size %x",
-			 1 << page_shift);
-	}
 
 	/* Map umem buf ptrs to the PBL */
 	umem_pgs = fill_umem_pbl_tbl(umem, pbl_tbl, page_shift);
@@ -3657,28 +3598,26 @@
 	return ERR_PTR(rc);
 }
 
-struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
-					   struct ib_udata *udata)
+int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ctx->device;
+	struct bnxt_re_ucontext *uctx =
+		container_of(ctx, struct bnxt_re_ucontext, ib_uctx);
 	struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
-	struct bnxt_re_uctx_resp resp;
-	struct bnxt_re_ucontext *uctx;
 	struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
+	struct bnxt_re_uctx_resp resp;
+	u32 chip_met_rev_num = 0;
 	int rc;
 
-	dev_dbg(rdev_to_dev(rdev), "ABI version requested %d",
-		ibdev->uverbs_abi_ver);
+	dev_dbg(rdev_to_dev(rdev), "ABI version requested %u",
+		ibdev->ops.uverbs_abi_ver);
 
-	if (ibdev->uverbs_abi_ver != BNXT_RE_ABI_VERSION) {
+	if (ibdev->ops.uverbs_abi_ver != BNXT_RE_ABI_VERSION) {
 		dev_dbg(rdev_to_dev(rdev), " is different from the device %d ",
 			BNXT_RE_ABI_VERSION);
-		return ERR_PTR(-EPERM);
+		return -EPERM;
 	}
 
-	uctx = kzalloc(sizeof(*uctx), GFP_KERNEL);
-	if (!uctx)
-		return ERR_PTR(-ENOMEM);
-
 	uctx->rdev = rdev;
 
 	uctx->shpg = (void *)__get_free_page(GFP_KERNEL);
@@ -3688,37 +3627,45 @@
 	}
 	spin_lock_init(&uctx->sh_lock);
 
-	resp.dev_id = rdev->en_dev->pdev->devfn; /*Temp, Use idr_alloc instead*/
+	resp.comp_mask = BNXT_RE_UCNTX_CMASK_HAVE_CCTX;
+	chip_met_rev_num = rdev->chip_ctx.chip_num;
+	chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_rev & 0xFF) <<
+			     BNXT_RE_CHIP_ID0_CHIP_REV_SFT;
+	chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_metal & 0xFF) <<
+			     BNXT_RE_CHIP_ID0_CHIP_MET_SFT;
+	resp.chip_id0 = chip_met_rev_num;
+	/* Future extension of chip info */
+	resp.chip_id1 = 0;
+	/* Temp, use xa_alloc instead */
+	resp.dev_id = rdev->en_dev->pdev->devfn;
 	resp.max_qp = rdev->qplib_ctx.qpc_count;
 	resp.pg_size = PAGE_SIZE;
 	resp.cqe_sz = sizeof(struct cq_base);
 	resp.max_cqd = dev_attr->max_cq_wqes;
 	resp.rsvd    = 0;
 
-	rc = ib_copy_to_udata(udata, &resp, sizeof(resp));
+	rc = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp)));
 	if (rc) {
 		dev_err(rdev_to_dev(rdev), "Failed to copy user context");
 		rc = -EFAULT;
 		goto cfail;
 	}
 
-	return &uctx->ib_uctx;
+	return 0;
 cfail:
 	free_page((unsigned long)uctx->shpg);
 	uctx->shpg = NULL;
 fail:
-	kfree(uctx);
-	return ERR_PTR(rc);
+	return rc;
 }
 
-int bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx)
+void bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx)
 {
 	struct bnxt_re_ucontext *uctx = container_of(ib_uctx,
 						   struct bnxt_re_ucontext,
 						   ib_uctx);
 
 	struct bnxt_re_dev *rdev = uctx->rdev;
-	int rc = 0;
 
 	if (uctx->shpg)
 		free_page((unsigned long)uctx->shpg);
@@ -3727,17 +3674,10 @@
 		/* Free DPI only if this is the first PD allocated by the
 		 * application and mark the context dpi as NULL
 		 */
-		rc = bnxt_qplib_dealloc_dpi(&rdev->qplib_res,
-					    &rdev->qplib_res.dpi_tbl,
-					    &uctx->dpi);
-		if (rc)
-			dev_err(rdev_to_dev(rdev), "Deallocate HW DPI failed!");
-			/* Don't fail, continue*/
+		bnxt_qplib_dealloc_dpi(&rdev->qplib_res,
+				       &rdev->qplib_res.dpi_tbl, &uctx->dpi);
 		uctx->dpi.dbr = NULL;
 	}
-
-	kfree(uctx);
-	return 0;
 }
 
 /* Helper function to mmap the virtual memory from user app */
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
index aa33e7b..31662b1 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
@@ -56,22 +56,22 @@
 };
 
 struct bnxt_re_pd {
+	struct ib_pd            ib_pd;
 	struct bnxt_re_dev	*rdev;
-	struct ib_pd		ib_pd;
 	struct bnxt_qplib_pd	qplib_pd;
 	struct bnxt_re_fence_data fence;
 };
 
 struct bnxt_re_ah {
-	struct bnxt_re_dev	*rdev;
 	struct ib_ah		ib_ah;
+	struct bnxt_re_dev	*rdev;
 	struct bnxt_qplib_ah	qplib_ah;
 };
 
 struct bnxt_re_srq {
+	struct ib_srq		ib_srq;
 	struct bnxt_re_dev	*rdev;
 	u32			srq_limit;
-	struct ib_srq		ib_srq;
 	struct bnxt_qplib_srq	qplib_srq;
 	struct ib_umem		*umem;
 	spinlock_t		lock;		/* protect srq */
@@ -94,11 +94,11 @@
 };
 
 struct bnxt_re_cq {
+	struct ib_cq		ib_cq;
 	struct bnxt_re_dev	*rdev;
 	spinlock_t              cq_lock;	/* protect cq */
 	u16			cq_count;
 	u16			cq_period;
-	struct ib_cq		ib_cq;
 	struct bnxt_qplib_cq	qplib_cq;
 	struct bnxt_qplib_cqe	*cql;
 #define MAX_CQL_PER_POLL	1024
@@ -135,15 +135,13 @@
 };
 
 struct bnxt_re_ucontext {
+	struct ib_ucontext      ib_uctx;
 	struct bnxt_re_dev	*rdev;
-	struct ib_ucontext	ib_uctx;
 	struct bnxt_qplib_dpi	dpi;
 	void			*shpg;
 	spinlock_t		sh_lock;	/* protect shpg */
 };
 
-struct net_device *bnxt_re_get_netdev(struct ib_device *ibdev, u8 port_num);
-
 int bnxt_re_query_device(struct ib_device *ibdev,
 			 struct ib_device_attr *ib_attr,
 			 struct ib_udata *udata);
@@ -163,24 +161,21 @@
 		      int index, union ib_gid *gid);
 enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev,
 					    u8 port_num);
-struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev,
-			       struct ib_ucontext *context,
-			       struct ib_udata *udata);
-int bnxt_re_dealloc_pd(struct ib_pd *pd);
-struct ib_ah *bnxt_re_create_ah(struct ib_pd *pd,
-				struct rdma_ah_attr *ah_attr,
-				struct ib_udata *udata);
+int bnxt_re_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+void bnxt_re_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+int bnxt_re_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags,
+		      struct ib_udata *udata);
 int bnxt_re_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
 int bnxt_re_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
-int bnxt_re_destroy_ah(struct ib_ah *ah);
-struct ib_srq *bnxt_re_create_srq(struct ib_pd *pd,
-				  struct ib_srq_init_attr *srq_init_attr,
-				  struct ib_udata *udata);
+void bnxt_re_destroy_ah(struct ib_ah *ah, u32 flags);
+int bnxt_re_create_srq(struct ib_srq *srq,
+		       struct ib_srq_init_attr *srq_init_attr,
+		       struct ib_udata *udata);
 int bnxt_re_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr,
 		       enum ib_srq_attr_mask srq_attr_mask,
 		       struct ib_udata *udata);
 int bnxt_re_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
-int bnxt_re_destroy_srq(struct ib_srq *srq);
+void bnxt_re_destroy_srq(struct ib_srq *srq, struct ib_udata *udata);
 int bnxt_re_post_srq_recv(struct ib_srq *srq, const struct ib_recv_wr *recv_wr,
 			  const struct ib_recv_wr **bad_recv_wr);
 struct ib_qp *bnxt_re_create_qp(struct ib_pd *pd,
@@ -190,16 +185,14 @@
 		      int qp_attr_mask, struct ib_udata *udata);
 int bnxt_re_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
 		     int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
-int bnxt_re_destroy_qp(struct ib_qp *qp);
+int bnxt_re_destroy_qp(struct ib_qp *qp, struct ib_udata *udata);
 int bnxt_re_post_send(struct ib_qp *qp, const struct ib_send_wr *send_wr,
 		      const struct ib_send_wr **bad_send_wr);
 int bnxt_re_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr,
 		      const struct ib_recv_wr **bad_recv_wr);
-struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
-				const struct ib_cq_init_attr *attr,
-				struct ib_ucontext *context,
-				struct ib_udata *udata);
-int bnxt_re_destroy_cq(struct ib_cq *cq);
+int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		      struct ib_udata *udata);
+void bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
 int bnxt_re_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
 struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
@@ -207,17 +200,16 @@
 int bnxt_re_map_mr_sg(struct ib_mr *ib_mr, struct scatterlist *sg, int sg_nents,
 		      unsigned int *sg_offset);
 struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type mr_type,
-			       u32 max_num_sg);
-int bnxt_re_dereg_mr(struct ib_mr *mr);
+			       u32 max_num_sg, struct ib_udata *udata);
+int bnxt_re_dereg_mr(struct ib_mr *mr, struct ib_udata *udata);
 struct ib_mw *bnxt_re_alloc_mw(struct ib_pd *ib_pd, enum ib_mw_type type,
 			       struct ib_udata *udata);
 int bnxt_re_dealloc_mw(struct ib_mw *mw);
 struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 virt_addr, int mr_access_flags,
 				  struct ib_udata *udata);
-struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev,
-					   struct ib_udata *udata);
-int bnxt_re_dealloc_ucontext(struct ib_ucontext *context);
+int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata);
+void bnxt_re_dealloc_ucontext(struct ib_ucontext *context);
 int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
 
 unsigned long bnxt_re_lock_cqs(struct bnxt_re_qp *qp);
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 22bd978..30a54f8 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -67,7 +67,7 @@
 #include "hw_counters.h"
 
 static char version[] =
-		BNXT_RE_DESC " v" ROCE_DRV_MODULE_VERSION "\n";
+		BNXT_RE_DESC "\n";
 
 MODULE_AUTHOR("Eddie Wai <eddie.wai@broadcom.com>");
 MODULE_DESCRIPTION(BNXT_RE_DESC " Driver");
@@ -80,6 +80,29 @@
 static struct workqueue_struct *bnxt_re_wq;
 static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev);
 
+static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev)
+{
+	rdev->rcfw.res = NULL;
+	rdev->qplib_res.cctx = NULL;
+}
+
+static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev)
+{
+	struct bnxt_en_dev *en_dev;
+	struct bnxt *bp;
+
+	en_dev = rdev->en_dev;
+	bp = netdev_priv(en_dev->net);
+
+	rdev->chip_ctx.chip_num = bp->chip_num;
+	/* rest of the members to follow eventually */
+
+	rdev->qplib_res.cctx = &rdev->chip_ctx;
+	rdev->rcfw.res = &rdev->qplib_res;
+
+	return 0;
+}
+
 /* SR-IOV helper functions */
 
 static void bnxt_re_get_sriov_func_type(struct bnxt_re_dev *rdev)
@@ -278,6 +301,7 @@
 
 	rc = en_dev->en_ops->bnxt_register_device(en_dev, BNXT_ROCE_ULP,
 						  &bnxt_re_ulp_ops, rdev);
+	rdev->qplib_res.pdev = rdev->en_dev->pdev;
 	return rc;
 }
 
@@ -345,7 +369,8 @@
 	fw_msg->timeout = timeout;
 }
 
-static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id)
+static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev,
+				 u16 fw_ring_id, int type)
 {
 	struct bnxt_en_dev *en_dev = rdev->en_dev;
 	struct hwrm_ring_free_input req = {0};
@@ -359,7 +384,7 @@
 	memset(&fw_msg, 0, sizeof(fw_msg));
 
 	bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_FREE, -1, -1);
-	req.ring_type = RING_ALLOC_REQ_RING_TYPE_L2_CMPL;
+	req.ring_type = type;
 	req.ring_id = cpu_to_le16(fw_ring_id);
 	bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
 			    sizeof(resp), DFLT_HWRM_CMD_TIMEOUT);
@@ -396,7 +421,7 @@
 	/* Association of ring index with doorbell index and MSIX number */
 	req.logical_id = cpu_to_le16(map_index);
 	req.length = cpu_to_le32(ring_mask + 1);
-	req.ring_type = RING_ALLOC_REQ_RING_TYPE_L2_CMPL;
+	req.ring_type = type;
 	req.int_mode = RING_ALLOC_REQ_INT_MODE_MSIX;
 	bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
 			    sizeof(resp), DFLT_HWRM_CMD_TIMEOUT);
@@ -535,19 +560,100 @@
 	return en_dev;
 }
 
+static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr,
+			   char *buf)
+{
+	struct bnxt_re_dev *rdev =
+		rdma_device_to_drv_device(device, struct bnxt_re_dev, ibdev);
+
+	return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor);
+}
+static DEVICE_ATTR_RO(hw_rev);
+
+static ssize_t hca_type_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
+{
+	struct bnxt_re_dev *rdev =
+		rdma_device_to_drv_device(device, struct bnxt_re_dev, ibdev);
+
+	return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->ibdev.node_desc);
+}
+static DEVICE_ATTR_RO(hca_type);
+
+static struct attribute *bnxt_re_attributes[] = {
+	&dev_attr_hw_rev.attr,
+	&dev_attr_hca_type.attr,
+	NULL
+};
+
+static const struct attribute_group bnxt_re_dev_attr_group = {
+	.attrs = bnxt_re_attributes,
+};
+
 static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev)
 {
 	ib_unregister_device(&rdev->ibdev);
 }
 
+static const struct ib_device_ops bnxt_re_dev_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_BNXT_RE,
+	.uverbs_abi_ver = BNXT_RE_ABI_VERSION,
+
+	.add_gid = bnxt_re_add_gid,
+	.alloc_hw_stats = bnxt_re_ib_alloc_hw_stats,
+	.alloc_mr = bnxt_re_alloc_mr,
+	.alloc_pd = bnxt_re_alloc_pd,
+	.alloc_ucontext = bnxt_re_alloc_ucontext,
+	.create_ah = bnxt_re_create_ah,
+	.create_cq = bnxt_re_create_cq,
+	.create_qp = bnxt_re_create_qp,
+	.create_srq = bnxt_re_create_srq,
+	.dealloc_pd = bnxt_re_dealloc_pd,
+	.dealloc_ucontext = bnxt_re_dealloc_ucontext,
+	.del_gid = bnxt_re_del_gid,
+	.dereg_mr = bnxt_re_dereg_mr,
+	.destroy_ah = bnxt_re_destroy_ah,
+	.destroy_cq = bnxt_re_destroy_cq,
+	.destroy_qp = bnxt_re_destroy_qp,
+	.destroy_srq = bnxt_re_destroy_srq,
+	.get_dev_fw_str = bnxt_re_query_fw_str,
+	.get_dma_mr = bnxt_re_get_dma_mr,
+	.get_hw_stats = bnxt_re_ib_get_hw_stats,
+	.get_link_layer = bnxt_re_get_link_layer,
+	.get_port_immutable = bnxt_re_get_port_immutable,
+	.map_mr_sg = bnxt_re_map_mr_sg,
+	.mmap = bnxt_re_mmap,
+	.modify_ah = bnxt_re_modify_ah,
+	.modify_device = bnxt_re_modify_device,
+	.modify_qp = bnxt_re_modify_qp,
+	.modify_srq = bnxt_re_modify_srq,
+	.poll_cq = bnxt_re_poll_cq,
+	.post_recv = bnxt_re_post_recv,
+	.post_send = bnxt_re_post_send,
+	.post_srq_recv = bnxt_re_post_srq_recv,
+	.query_ah = bnxt_re_query_ah,
+	.query_device = bnxt_re_query_device,
+	.query_pkey = bnxt_re_query_pkey,
+	.query_port = bnxt_re_query_port,
+	.query_qp = bnxt_re_query_qp,
+	.query_srq = bnxt_re_query_srq,
+	.reg_user_mr = bnxt_re_reg_user_mr,
+	.req_notify_cq = bnxt_re_req_notify_cq,
+	INIT_RDMA_OBJ_SIZE(ib_ah, bnxt_re_ah, ib_ah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, bnxt_re_cq, ib_cq),
+	INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd),
+	INIT_RDMA_OBJ_SIZE(ib_srq, bnxt_re_srq, ib_srq),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, bnxt_re_ucontext, ib_uctx),
+};
+
 static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
 {
 	struct ib_device *ibdev = &rdev->ibdev;
+	int ret;
 
 	/* ib device init */
-	ibdev->owner = THIS_MODULE;
 	ibdev->node_type = RDMA_NODE_IB_CA;
-	strlcpy(ibdev->name, "bnxt_re%d", IB_DEVICE_NAME_MAX);
 	strlcpy(ibdev->node_desc, BNXT_RE_DESC " HCA",
 		strlen(BNXT_RE_DESC) + 5);
 	ibdev->phys_port_cnt = 1;
@@ -559,7 +665,6 @@
 	ibdev->local_dma_lkey = BNXT_QPLIB_RSVD_LKEY;
 
 	/* User space */
-	ibdev->uverbs_abi_ver = BNXT_RE_ABI_VERSION;
 	ibdev->uverbs_cmd_mask =
 			(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
 			(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
@@ -587,86 +692,16 @@
 			(1ull << IB_USER_VERBS_CMD_DESTROY_AH);
 	/* POLL_CQ and REQ_NOTIFY_CQ is directly handled in libbnxt_re */
 
-	/* Kernel verbs */
-	ibdev->query_device		= bnxt_re_query_device;
-	ibdev->modify_device		= bnxt_re_modify_device;
 
-	ibdev->query_port		= bnxt_re_query_port;
-	ibdev->get_port_immutable	= bnxt_re_get_port_immutable;
-	ibdev->get_dev_fw_str           = bnxt_re_query_fw_str;
-	ibdev->query_pkey		= bnxt_re_query_pkey;
-	ibdev->get_netdev		= bnxt_re_get_netdev;
-	ibdev->add_gid			= bnxt_re_add_gid;
-	ibdev->del_gid			= bnxt_re_del_gid;
-	ibdev->get_link_layer		= bnxt_re_get_link_layer;
+	rdma_set_device_sysfs_group(ibdev, &bnxt_re_dev_attr_group);
+	ib_set_device_ops(ibdev, &bnxt_re_dev_ops);
+	ret = ib_device_set_netdev(&rdev->ibdev, rdev->netdev, 1);
+	if (ret)
+		return ret;
 
-	ibdev->alloc_pd			= bnxt_re_alloc_pd;
-	ibdev->dealloc_pd		= bnxt_re_dealloc_pd;
-
-	ibdev->create_ah		= bnxt_re_create_ah;
-	ibdev->modify_ah		= bnxt_re_modify_ah;
-	ibdev->query_ah			= bnxt_re_query_ah;
-	ibdev->destroy_ah		= bnxt_re_destroy_ah;
-
-	ibdev->create_srq		= bnxt_re_create_srq;
-	ibdev->modify_srq		= bnxt_re_modify_srq;
-	ibdev->query_srq		= bnxt_re_query_srq;
-	ibdev->destroy_srq		= bnxt_re_destroy_srq;
-	ibdev->post_srq_recv		= bnxt_re_post_srq_recv;
-
-	ibdev->create_qp		= bnxt_re_create_qp;
-	ibdev->modify_qp		= bnxt_re_modify_qp;
-	ibdev->query_qp			= bnxt_re_query_qp;
-	ibdev->destroy_qp		= bnxt_re_destroy_qp;
-
-	ibdev->post_send		= bnxt_re_post_send;
-	ibdev->post_recv		= bnxt_re_post_recv;
-
-	ibdev->create_cq		= bnxt_re_create_cq;
-	ibdev->destroy_cq		= bnxt_re_destroy_cq;
-	ibdev->poll_cq			= bnxt_re_poll_cq;
-	ibdev->req_notify_cq		= bnxt_re_req_notify_cq;
-
-	ibdev->get_dma_mr		= bnxt_re_get_dma_mr;
-	ibdev->dereg_mr			= bnxt_re_dereg_mr;
-	ibdev->alloc_mr			= bnxt_re_alloc_mr;
-	ibdev->map_mr_sg		= bnxt_re_map_mr_sg;
-
-	ibdev->reg_user_mr		= bnxt_re_reg_user_mr;
-	ibdev->alloc_ucontext		= bnxt_re_alloc_ucontext;
-	ibdev->dealloc_ucontext		= bnxt_re_dealloc_ucontext;
-	ibdev->mmap			= bnxt_re_mmap;
-	ibdev->get_hw_stats             = bnxt_re_ib_get_hw_stats;
-	ibdev->alloc_hw_stats           = bnxt_re_ib_alloc_hw_stats;
-
-	ibdev->driver_id = RDMA_DRIVER_BNXT_RE;
-	return ib_register_device(ibdev, NULL);
+	return ib_register_device(ibdev, "bnxt_re%d");
 }
 
-static ssize_t show_rev(struct device *device, struct device_attribute *attr,
-			char *buf)
-{
-	struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev);
-
-	return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor);
-}
-
-static ssize_t show_hca(struct device *device, struct device_attribute *attr,
-			char *buf)
-{
-	struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev);
-
-	return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->ibdev.node_desc);
-}
-
-static DEVICE_ATTR(hw_rev, 0444, show_rev, NULL);
-static DEVICE_ATTR(hca_type, 0444, show_hca, NULL);
-
-static struct device_attribute *bnxt_re_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_hca_type
-};
-
 static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev)
 {
 	dev_put(rdev->netdev);
@@ -688,7 +723,7 @@
 	struct bnxt_re_dev *rdev;
 
 	/* Allocate bnxt_re_dev instance here */
-	rdev = (struct bnxt_re_dev *)ib_alloc_device(sizeof(*rdev));
+	rdev = ib_alloc_device(bnxt_re_dev, ibdev);
 	if (!rdev) {
 		dev_err(NULL, "%s: bnxt_re_dev allocation failure!",
 			ROCE_DRV_MODULE_NAME);
@@ -860,14 +895,18 @@
 	return 0;
 }
 
+static u32 bnxt_re_get_nqdb_offset(struct bnxt_re_dev *rdev, u16 indx)
+{
+	return bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx) ?
+				0x10000 : rdev->msix_entries[indx].db_offset;
+}
+
 static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev)
 {
 	int i;
 
-	if (rdev->nq[0].hwq.max_elements) {
-		for (i = 1; i < rdev->num_msix; i++)
-			bnxt_qplib_disable_nq(&rdev->nq[i - 1]);
-	}
+	for (i = 1; i < rdev->num_msix; i++)
+		bnxt_qplib_disable_nq(&rdev->nq[i - 1]);
 
 	if (rdev->qplib_res.rcfw)
 		bnxt_qplib_cleanup_res(&rdev->qplib_res);
@@ -875,34 +914,41 @@
 
 static int bnxt_re_init_res(struct bnxt_re_dev *rdev)
 {
+	int num_vec_enabled = 0;
 	int rc = 0, i;
+	u32 db_offt;
 
 	bnxt_qplib_init_res(&rdev->qplib_res);
 
 	for (i = 1; i < rdev->num_msix ; i++) {
+		db_offt = bnxt_re_get_nqdb_offset(rdev, i);
 		rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nq[i - 1],
 					  i - 1, rdev->msix_entries[i].vector,
-					  rdev->msix_entries[i].db_offset,
-					  &bnxt_re_cqn_handler,
+					  db_offt, &bnxt_re_cqn_handler,
 					  &bnxt_re_srqn_handler);
-
 		if (rc) {
 			dev_err(rdev_to_dev(rdev),
 				"Failed to enable NQ with rc = 0x%x", rc);
 			goto fail;
 		}
+		num_vec_enabled++;
 	}
 	return 0;
 fail:
+	for (i = num_vec_enabled; i >= 0; i--)
+		bnxt_qplib_disable_nq(&rdev->nq[i]);
 	return rc;
 }
 
 static void bnxt_re_free_nq_res(struct bnxt_re_dev *rdev)
 {
+	u8 type;
 	int i;
 
 	for (i = 0; i < rdev->num_msix - 1; i++) {
-		bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id);
+		type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+		bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type);
+		rdev->nq[i].res = NULL;
 		bnxt_qplib_free_nq(&rdev->nq[i]);
 	}
 }
@@ -924,7 +970,11 @@
 
 static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
 {
+	int num_vec_created = 0;
+	dma_addr_t *pg_map;
 	int rc = 0, i;
+	int pages;
+	u8 type;
 
 	/* Configure and allocate resources for qplib */
 	rdev->qplib_res.rcfw = &rdev->rcfw;
@@ -945,33 +995,38 @@
 		goto dealloc_res;
 
 	for (i = 0; i < rdev->num_msix - 1; i++) {
+		rdev->nq[i].res = &rdev->qplib_res;
 		rdev->nq[i].hwq.max_elements = BNXT_RE_MAX_CQ_COUNT +
 			BNXT_RE_MAX_SRQC_COUNT + 2;
 		rc = bnxt_qplib_alloc_nq(rdev->en_dev->pdev, &rdev->nq[i]);
 		if (rc) {
 			dev_err(rdev_to_dev(rdev), "Alloc Failed NQ%d rc:%#x",
 				i, rc);
-			goto dealloc_dpi;
+			goto free_nq;
 		}
-		rc = bnxt_re_net_ring_alloc
-			(rdev, rdev->nq[i].hwq.pbl[PBL_LVL_0].pg_map_arr,
-			 rdev->nq[i].hwq.pbl[rdev->nq[i].hwq.level].pg_count,
-			 HWRM_RING_ALLOC_CMPL,
-			 BNXT_QPLIB_NQE_MAX_CNT - 1,
-			 rdev->msix_entries[i + 1].ring_idx,
-			 &rdev->nq[i].ring_id);
+		type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+		pg_map = rdev->nq[i].hwq.pbl[PBL_LVL_0].pg_map_arr;
+		pages = rdev->nq[i].hwq.pbl[rdev->nq[i].hwq.level].pg_count;
+		rc = bnxt_re_net_ring_alloc(rdev, pg_map, pages, type,
+					    BNXT_QPLIB_NQE_MAX_CNT - 1,
+					    rdev->msix_entries[i + 1].ring_idx,
+					    &rdev->nq[i].ring_id);
 		if (rc) {
 			dev_err(rdev_to_dev(rdev),
 				"Failed to allocate NQ fw id with rc = 0x%x",
 				rc);
+			bnxt_qplib_free_nq(&rdev->nq[i]);
 			goto free_nq;
 		}
+		num_vec_created++;
 	}
 	return 0;
 free_nq:
-	for (i = 0; i < rdev->num_msix - 1; i++)
+	for (i = num_vec_created; i >= 0; i--) {
+		type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+		bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type);
 		bnxt_qplib_free_nq(&rdev->nq[i]);
-dealloc_dpi:
+	}
 	bnxt_qplib_dealloc_dpi(&rdev->qplib_res,
 			       &rdev->qplib_res.dpi_tbl,
 			       &rdev->dpi_privileged);
@@ -989,12 +1044,17 @@
 	struct ib_event ib_event;
 
 	ib_event.device = ibdev;
-	if (qp)
+	if (qp) {
 		ib_event.element.qp = qp;
-	else
+		ib_event.event = event;
+		if (qp->event_handler)
+			qp->event_handler(&ib_event, qp->qp_context);
+
+	} else {
 		ib_event.element.port_num = port_num;
-	ib_event.event = event;
-	ib_dispatch_event(&ib_event);
+		ib_event.event = event;
+		ib_dispatch_event(&ib_event);
+	}
 }
 
 #define HWRM_QUEUE_PRI2COS_QCFG_INPUT_FLAGS_IVLAN      0x02
@@ -1187,22 +1247,52 @@
 	return 0;
 }
 
+static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev)
+{
+	struct bnxt_en_dev *en_dev = rdev->en_dev;
+	struct hwrm_ver_get_output resp = {0};
+	struct hwrm_ver_get_input req = {0};
+	struct bnxt_fw_msg fw_msg;
+	int rc = 0;
+
+	memset(&fw_msg, 0, sizeof(fw_msg));
+	bnxt_re_init_hwrm_hdr(rdev, (void *)&req,
+			      HWRM_VER_GET, -1, -1);
+	req.hwrm_intf_maj = HWRM_VERSION_MAJOR;
+	req.hwrm_intf_min = HWRM_VERSION_MINOR;
+	req.hwrm_intf_upd = HWRM_VERSION_UPDATE;
+	bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
+			    sizeof(resp), DFLT_HWRM_CMD_TIMEOUT);
+	rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg);
+	if (rc) {
+		dev_err(rdev_to_dev(rdev),
+			"Failed to query HW version, rc = 0x%x", rc);
+		return;
+	}
+	rdev->qplib_ctx.hwrm_intf_ver =
+		(u64)resp.hwrm_intf_major << 48 |
+		(u64)resp.hwrm_intf_minor << 32 |
+		(u64)resp.hwrm_intf_build << 16 |
+		resp.hwrm_intf_patch;
+}
+
 static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev)
 {
-	int i, rc;
+	u8 type;
+	int rc;
 
 	if (test_and_clear_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) {
-		for (i = 0; i < ARRAY_SIZE(bnxt_re_attributes); i++)
-			device_remove_file(&rdev->ibdev.dev,
-					   bnxt_re_attributes[i]);
 		/* Cleanup ib dev */
 		bnxt_re_unregister_ib(rdev);
 	}
 	if (test_and_clear_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags))
-		cancel_delayed_work(&rdev->worker);
+		cancel_delayed_work_sync(&rdev->worker);
 
-	bnxt_re_cleanup_res(rdev);
-	bnxt_re_free_res(rdev);
+	if (test_and_clear_bit(BNXT_RE_FLAG_RESOURCES_INITIALIZED,
+			       &rdev->flags))
+		bnxt_re_cleanup_res(rdev);
+	if (test_and_clear_bit(BNXT_RE_FLAG_RESOURCES_ALLOCATED, &rdev->flags))
+		bnxt_re_free_res(rdev);
 
 	if (test_and_clear_bit(BNXT_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags)) {
 		rc = bnxt_qplib_deinit_rcfw(&rdev->rcfw);
@@ -1212,7 +1302,8 @@
 		bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id);
 		bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx);
 		bnxt_qplib_disable_rcfw_channel(&rdev->rcfw);
-		bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id);
+		type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+		bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, type);
 		bnxt_qplib_free_rcfw_channel(&rdev->rcfw);
 	}
 	if (test_and_clear_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags)) {
@@ -1221,6 +1312,8 @@
 			dev_warn(rdev_to_dev(rdev),
 				 "Failed to free MSI-X vectors: %#x", rc);
 	}
+
+	bnxt_re_destroy_chip_ctx(rdev);
 	if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) {
 		rc = bnxt_re_unregister_netdev(rdev);
 		if (rc)
@@ -1241,9 +1334,12 @@
 
 static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 {
-	int i, j, rc;
-
+	dma_addr_t *pg_map;
+	u32 db_offt, ridx;
+	int pages, vid;
 	bool locked;
+	u8 type;
+	int rc;
 
 	/* Acquire rtnl lock through out this function */
 	rtnl_lock();
@@ -1258,6 +1354,12 @@
 	}
 	set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
 
+	rc = bnxt_re_setup_chip_ctx(rdev);
+	if (rc) {
+		dev_err(rdev_to_dev(rdev), "Failed to get chip context\n");
+		return -EINVAL;
+	}
+
 	/* Check whether VF or PF */
 	bnxt_re_get_sriov_func_type(rdev);
 
@@ -1269,30 +1371,34 @@
 	}
 	set_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags);
 
+	bnxt_re_query_hwrm_intf_version(rdev);
+
 	/* Establish RCFW Communication Channel to initialize the context
 	 * memory for the function and all child VFs
 	 */
 	rc = bnxt_qplib_alloc_rcfw_channel(rdev->en_dev->pdev, &rdev->rcfw,
+					   &rdev->qplib_ctx,
 					   BNXT_RE_MAX_QPC_COUNT);
 	if (rc) {
 		pr_err("Failed to allocate RCFW Channel: %#x\n", rc);
 		goto fail;
 	}
-	rc = bnxt_re_net_ring_alloc
-			(rdev, rdev->rcfw.creq.pbl[PBL_LVL_0].pg_map_arr,
-			 rdev->rcfw.creq.pbl[rdev->rcfw.creq.level].pg_count,
-			 HWRM_RING_ALLOC_CMPL, BNXT_QPLIB_CREQE_MAX_CNT - 1,
-			 rdev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx,
-			 &rdev->rcfw.creq_ring_id);
+	type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+	pg_map = rdev->rcfw.creq.pbl[PBL_LVL_0].pg_map_arr;
+	pages = rdev->rcfw.creq.pbl[rdev->rcfw.creq.level].pg_count;
+	ridx = rdev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx;
+	rc = bnxt_re_net_ring_alloc(rdev, pg_map, pages, type,
+				    BNXT_QPLIB_CREQE_MAX_CNT - 1,
+				    ridx, &rdev->rcfw.creq_ring_id);
 	if (rc) {
 		pr_err("Failed to allocate CREQ: %#x\n", rc);
 		goto free_rcfw;
 	}
-	rc = bnxt_qplib_enable_rcfw_channel
-				(rdev->en_dev->pdev, &rdev->rcfw,
-				 rdev->msix_entries[BNXT_RE_AEQ_IDX].vector,
-				 rdev->msix_entries[BNXT_RE_AEQ_IDX].db_offset,
-				 rdev->is_virtfn, &bnxt_re_aeq_handler);
+	db_offt = bnxt_re_get_nqdb_offset(rdev, BNXT_RE_AEQ_IDX);
+	vid = rdev->msix_entries[BNXT_RE_AEQ_IDX].vector;
+	rc = bnxt_qplib_enable_rcfw_channel(rdev->en_dev->pdev, &rdev->rcfw,
+					    vid, db_offt, rdev->is_virtfn,
+					    &bnxt_re_aeq_handler);
 	if (rc) {
 		pr_err("Failed to enable RCFW channel: %#x\n", rc);
 		goto free_ring;
@@ -1305,7 +1411,8 @@
 	if (!rdev->is_virtfn)
 		bnxt_re_set_resource_limits(rdev);
 
-	rc = bnxt_qplib_alloc_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx, 0);
+	rc = bnxt_qplib_alloc_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx, 0,
+				  bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx));
 	if (rc) {
 		pr_err("Failed to allocate QPLIB context: %#x\n", rc);
 		goto disable_rcfw;
@@ -1332,12 +1439,15 @@
 		pr_err("Failed to allocate resources: %#x\n", rc);
 		goto fail;
 	}
+	set_bit(BNXT_RE_FLAG_RESOURCES_ALLOCATED, &rdev->flags);
 	rc = bnxt_re_init_res(rdev);
 	if (rc) {
 		pr_err("Failed to initialize resources: %#x\n", rc);
 		goto fail;
 	}
 
+	set_bit(BNXT_RE_FLAG_RESOURCES_INITIALIZED, &rdev->flags);
+
 	if (!rdev->is_virtfn) {
 		rc = bnxt_re_setup_qos(rdev);
 		if (rc)
@@ -1359,25 +1469,10 @@
 	}
 	set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags);
 	dev_info(rdev_to_dev(rdev), "Device registered successfully");
-	for (i = 0; i < ARRAY_SIZE(bnxt_re_attributes); i++) {
-		rc = device_create_file(&rdev->ibdev.dev,
-					bnxt_re_attributes[i]);
-		if (rc) {
-			dev_err(rdev_to_dev(rdev),
-				"Failed to create IB sysfs: %#x", rc);
-			/* Must clean up all created device files */
-			for (j = 0; j < i; j++)
-				device_remove_file(&rdev->ibdev.dev,
-						   bnxt_re_attributes[j]);
-			bnxt_re_unregister_ib(rdev);
-			goto fail;
-		}
-	}
 	ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed,
 			 &rdev->active_width);
 	set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags);
 	bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_PORT_ACTIVE);
-	bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_GID_CHANGE);
 
 	return 0;
 free_sctx:
@@ -1387,7 +1482,8 @@
 disable_rcfw:
 	bnxt_qplib_disable_rcfw_channel(&rdev->rcfw);
 free_ring:
-	bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id);
+	type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+	bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, type);
 free_rcfw:
 	bnxt_qplib_free_rcfw_channel(&rdev->rcfw);
 fail:
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index 249efa0..958c1ff 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -36,12 +36,15 @@
  * Description: Fast Path Operators
  */
 
+#define dev_fmt(fmt) "QPLIB: " fmt
+
 #include <linux/interrupt.h>
 #include <linux/spinlock.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/pci.h>
 #include <linux/prefetch.h>
+#include <linux/if_ether.h>
 
 #include "roce_hsi.h"
 
@@ -71,8 +74,7 @@
 
 	if (!qp->sq.flushed) {
 		dev_dbg(&scq->hwq.pdev->dev,
-			"QPLIB: FP: Adding to SQ Flush list = %p",
-			qp);
+			"FP: Adding to SQ Flush list = %p\n", qp);
 		bnxt_qplib_cancel_phantom_processing(qp);
 		list_add_tail(&qp->sq_flush, &scq->sqf_head);
 		qp->sq.flushed = true;
@@ -80,8 +82,7 @@
 	if (!qp->srq) {
 		if (!qp->rq.flushed) {
 			dev_dbg(&rcq->hwq.pdev->dev,
-				"QPLIB: FP: Adding to RQ Flush list = %p",
-				qp);
+				"FP: Adding to RQ Flush list = %p\n", qp);
 			list_add_tail(&qp->rq_flush, &rcq->rqf_head);
 			qp->rq.flushed = true;
 		}
@@ -207,7 +208,7 @@
 		if (!qp->sq_hdr_buf) {
 			rc = -ENOMEM;
 			dev_err(&res->pdev->dev,
-				"QPLIB: Failed to create sq_hdr_buf");
+				"Failed to create sq_hdr_buf\n");
 			goto fail;
 		}
 	}
@@ -221,7 +222,7 @@
 		if (!qp->rq_hdr_buf) {
 			rc = -ENOMEM;
 			dev_err(&res->pdev->dev,
-				"QPLIB: Failed to create rq_hdr_buf");
+				"Failed to create rq_hdr_buf\n");
 			goto fail;
 		}
 	}
@@ -244,6 +245,7 @@
 	u16 type;
 	int budget = nq->budget;
 	uintptr_t q_handle;
+	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx);
 
 	/* Service the NQ until empty */
 	raw_cons = hwq->cons;
@@ -277,8 +279,7 @@
 				num_cqne_processed++;
 			else
 				dev_warn(&nq->pdev->dev,
-					 "QPLIB: cqn - type 0x%x not handled",
-					 type);
+					 "cqn - type 0x%x not handled\n", type);
 			spin_unlock_bh(&cq->compl_lock);
 			break;
 		}
@@ -291,14 +292,14 @@
 			q_handle |= (u64)le32_to_cpu(nqsrqe->srq_handle_high)
 				     << 32;
 			bnxt_qplib_arm_srq((struct bnxt_qplib_srq *)q_handle,
-					   DBR_DBR_TYPE_SRQ_ARMENA);
+					   DBC_DBC_TYPE_SRQ_ARMENA);
 			if (!nq->srqn_handler(nq,
 					      (struct bnxt_qplib_srq *)q_handle,
 					      nqsrqe->event))
 				num_srqne_processed++;
 			else
 				dev_warn(&nq->pdev->dev,
-					 "QPLIB: SRQ event 0x%x not handled",
+					 "SRQ event 0x%x not handled\n",
 					 nqsrqe->event);
 			break;
 		}
@@ -306,15 +307,16 @@
 			break;
 		default:
 			dev_warn(&nq->pdev->dev,
-				 "QPLIB: nqe with type = 0x%x not handled",
-				 type);
+				 "nqe with type = 0x%x not handled\n", type);
 			break;
 		}
 		raw_cons++;
 	}
 	if (hwq->cons != raw_cons) {
 		hwq->cons = raw_cons;
-		NQ_DB_REARM(nq->bar_reg_iomem, hwq->cons, hwq->max_elements);
+		bnxt_qplib_ring_nq_db_rearm(nq->bar_reg_iomem, hwq->cons,
+					    hwq->max_elements, nq->ring_id,
+					    gen_p5);
 	}
 }
 
@@ -338,9 +340,11 @@
 
 void bnxt_qplib_nq_stop_irq(struct bnxt_qplib_nq *nq, bool kill)
 {
+	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx);
 	tasklet_disable(&nq->worker);
 	/* Mask h/w interrupt */
-	NQ_DB(nq->bar_reg_iomem, nq->hwq.cons, nq->hwq.max_elements);
+	bnxt_qplib_ring_nq_db(nq->bar_reg_iomem, nq->hwq.cons,
+			      nq->hwq.max_elements, nq->ring_id, gen_p5);
 	/* Sync with last running IRQ handler */
 	synchronize_irq(nq->vector);
 	if (kill)
@@ -375,6 +379,7 @@
 int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx,
 			    int msix_vector, bool need_init)
 {
+	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx);
 	int rc;
 
 	if (nq->requested)
@@ -397,11 +402,12 @@
 	rc = irq_set_affinity_hint(nq->vector, &nq->mask);
 	if (rc) {
 		dev_warn(&nq->pdev->dev,
-			 "QPLIB: set affinity failed; vector: %d nq_idx: %d\n",
+			 "set affinity failed; vector: %d nq_idx: %d\n",
 			 nq->vector, nq_indx);
 	}
 	nq->requested = true;
-	NQ_DB_REARM(nq->bar_reg_iomem, nq->hwq.cons, nq->hwq.max_elements);
+	bnxt_qplib_ring_nq_db_rearm(nq->bar_reg_iomem, nq->hwq.cons,
+				    nq->hwq.max_elements, nq->ring_id, gen_p5);
 
 	return rc;
 }
@@ -435,7 +441,8 @@
 		rc = -ENOMEM;
 		goto fail;
 	}
-	nq->bar_reg_iomem = ioremap_nocache(nq_base + nq->bar_reg_off, 4);
+	/* Unconditionally map 8 bytes to support 57500 series */
+	nq->bar_reg_iomem = ioremap_nocache(nq_base + nq->bar_reg_off, 8);
 	if (!nq->bar_reg_iomem) {
 		rc = -ENOMEM;
 		goto fail;
@@ -444,7 +451,7 @@
 	rc = bnxt_qplib_nq_start_irq(nq, nq_idx, msix_vector, true);
 	if (rc) {
 		dev_err(&nq->pdev->dev,
-			"QPLIB: Failed to request irq for nq-idx %d", nq_idx);
+			"Failed to request irq for nq-idx %d\n", nq_idx);
 		goto fail;
 	}
 
@@ -464,15 +471,17 @@
 
 int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq)
 {
+	u8 hwq_type;
+
 	nq->pdev = pdev;
 	if (!nq->hwq.max_elements ||
 	    nq->hwq.max_elements > BNXT_QPLIB_NQE_MAX_CNT)
 		nq->hwq.max_elements = BNXT_QPLIB_NQE_MAX_CNT;
-
-	if (bnxt_qplib_alloc_init_hwq(nq->pdev, &nq->hwq, NULL, 0,
+	hwq_type = bnxt_qplib_get_hwq_type(nq->res);
+	if (bnxt_qplib_alloc_init_hwq(nq->pdev, &nq->hwq, NULL,
 				      &nq->hwq.max_elements,
 				      BNXT_QPLIB_MAX_NQE_ENTRY_SIZE, 0,
-				      PAGE_SIZE, HWQ_TYPE_L2_CMPL))
+				      PAGE_SIZE, hwq_type))
 		return -ENOMEM;
 
 	nq->budget = 8;
@@ -483,24 +492,22 @@
 static void bnxt_qplib_arm_srq(struct bnxt_qplib_srq *srq, u32 arm_type)
 {
 	struct bnxt_qplib_hwq *srq_hwq = &srq->hwq;
-	struct dbr_dbr db_msg = { 0 };
 	void __iomem *db;
-	u32 sw_prod = 0;
+	u32 sw_prod;
+	u64 val = 0;
 
 	/* Ring DB */
-	sw_prod = (arm_type == DBR_DBR_TYPE_SRQ_ARM) ? srq->threshold :
-		   HWQ_CMP(srq_hwq->prod, srq_hwq);
-	db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) &
-				   DBR_DBR_INDEX_MASK);
-	db_msg.type_xid = cpu_to_le32(((srq->id << DBR_DBR_XID_SFT) &
-					DBR_DBR_XID_MASK) | arm_type);
-	db = (arm_type == DBR_DBR_TYPE_SRQ_ARMENA) ?
-		srq->dbr_base : srq->dpi->dbr;
-	wmb(); /* barrier before db ring */
-	__iowrite64_copy(db, &db_msg, sizeof(db_msg) / sizeof(u64));
+	sw_prod = (arm_type == DBC_DBC_TYPE_SRQ_ARM) ?
+		   srq->threshold : HWQ_CMP(srq_hwq->prod, srq_hwq);
+	db = (arm_type == DBC_DBC_TYPE_SRQ_ARMENA) ? srq->dbr_base :
+						     srq->dpi->dbr;
+	val = ((srq->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) | arm_type;
+	val <<= 32;
+	val |= (sw_prod << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
+	writeq(val, db);
 }
 
-int bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res,
+void bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res,
 			   struct bnxt_qplib_srq *srq)
 {
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
@@ -514,14 +521,12 @@
 	/* Configure the request */
 	req.srq_cid = cpu_to_le32(srq->id);
 
-	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
-					  (void *)&resp, NULL, 0);
-	if (rc)
-		return rc;
-
-	bnxt_qplib_free_hwq(res->pdev, &srq->hwq);
+	rc = bnxt_qplib_rcfw_send_message(rcfw, (struct cmdq_base *)&req,
+					  (struct creq_base *)&resp, NULL, 0);
 	kfree(srq->swq);
-	return 0;
+	if (rc)
+		return;
+	bnxt_qplib_free_hwq(res->pdev, &srq->hwq);
 }
 
 int bnxt_qplib_create_srq(struct bnxt_qplib_res *res,
@@ -535,8 +540,8 @@
 	int rc, idx;
 
 	srq->hwq.max_elements = srq->max_wqe;
-	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &srq->hwq, srq->sglist,
-				       srq->nmap, &srq->hwq.max_elements,
+	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &srq->hwq, &srq->sg_info,
+				       &srq->hwq.max_elements,
 				       BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0,
 				       PAGE_SIZE, HWQ_TYPE_QUEUE);
 	if (rc)
@@ -592,7 +597,7 @@
 	srq->id = le32_to_cpu(resp.xid);
 	srq->dbr_base = res->dpi_tbl.dbr_bar_reg_iomem;
 	if (srq->threshold)
-		bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARMENA);
+		bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ_ARMENA);
 	srq->arm_req = false;
 
 	return 0;
@@ -616,7 +621,7 @@
 				    srq_hwq->max_elements - sw_cons + sw_prod;
 	if (count > srq->threshold) {
 		srq->arm_req = false;
-		bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARM);
+		bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ_ARM);
 	} else {
 		/* Deferred arming */
 		srq->arm_req = true;
@@ -663,8 +668,8 @@
 
 	spin_lock(&srq_hwq->lock);
 	if (srq->start_idx == srq->last_idx) {
-		dev_err(&srq_hwq->pdev->dev, "QPLIB: FP: SRQ (0x%x) is full!",
-			srq->id);
+		dev_err(&srq_hwq->pdev->dev,
+			"FP: SRQ (0x%x) is full!\n", srq->id);
 		rc = -EINVAL;
 		spin_unlock(&srq_hwq->lock);
 		goto done;
@@ -704,10 +709,10 @@
 				    srq_hwq->max_elements - sw_cons + sw_prod;
 	spin_unlock(&srq_hwq->lock);
 	/* Ring DB */
-	bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ);
+	bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ);
 	if (srq->arm_req == true && count > srq->threshold) {
 		srq->arm_req = false;
-		bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARM);
+		bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ_ARM);
 	}
 done:
 	return rc;
@@ -735,7 +740,7 @@
 
 	/* SQ */
 	sq->hwq.max_elements = sq->max_wqe;
-	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, NULL, 0,
+	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, NULL,
 				       &sq->hwq.max_elements,
 				       BNXT_QPLIB_MAX_SQE_ENTRY_SIZE, 0,
 				       PAGE_SIZE, HWQ_TYPE_QUEUE);
@@ -774,7 +779,7 @@
 	/* RQ */
 	if (rq->max_wqe) {
 		rq->hwq.max_elements = qp->rq.max_wqe;
-		rc = bnxt_qplib_alloc_init_hwq(res->pdev, &rq->hwq, NULL, 0,
+		rc = bnxt_qplib_alloc_init_hwq(res->pdev, &rq->hwq, NULL,
 					       &rq->hwq.max_elements,
 					       BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0,
 					       PAGE_SIZE, HWQ_TYPE_QUEUE);
@@ -855,18 +860,19 @@
 int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 {
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
-	struct sq_send *hw_sq_send_hdr, **hw_sq_send_ptr;
-	struct cmdq_create_qp req;
-	struct creq_create_qp_resp resp;
-	struct bnxt_qplib_pbl *pbl;
-	struct sq_psn_search **psn_search_ptr;
 	unsigned long int psn_search, poff = 0;
+	struct sq_psn_search **psn_search_ptr;
 	struct bnxt_qplib_q *sq = &qp->sq;
 	struct bnxt_qplib_q *rq = &qp->rq;
+	int i, rc, req_size, psn_sz = 0;
+	struct sq_send **hw_sq_send_ptr;
+	struct creq_create_qp_resp resp;
 	struct bnxt_qplib_hwq *xrrq;
-	int i, rc, req_size, psn_sz;
 	u16 cmd_flags = 0, max_ssge;
-	u32 sw_prod, qp_flags = 0;
+	struct cmdq_create_qp req;
+	struct bnxt_qplib_pbl *pbl;
+	u32 qp_flags = 0;
+	u16 max_rsge;
 
 	RCFW_CMD_PREP(req, CREATE_QP, cmd_flags);
 
@@ -876,11 +882,14 @@
 	req.qp_handle = cpu_to_le64(qp->qp_handle);
 
 	/* SQ */
-	psn_sz = (qp->type == CMDQ_CREATE_QP_TYPE_RC) ?
-		 sizeof(struct sq_psn_search) : 0;
+	if (qp->type == CMDQ_CREATE_QP_TYPE_RC) {
+		psn_sz = bnxt_qplib_is_chip_gen_p5(res->cctx) ?
+			 sizeof(struct sq_psn_search_ext) :
+			 sizeof(struct sq_psn_search);
+	}
 	sq->hwq.max_elements = sq->max_wqe;
-	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, sq->sglist,
-				       sq->nmap, &sq->hwq.max_elements,
+	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, &sq->sg_info,
+				       &sq->hwq.max_elements,
 				       BNXT_QPLIB_MAX_SQE_ENTRY_SIZE,
 				       psn_sz,
 				       PAGE_SIZE, HWQ_TYPE_QUEUE);
@@ -907,10 +916,16 @@
 			poff = (psn_search & ~PAGE_MASK) /
 				BNXT_QPLIB_MAX_PSNE_ENTRY_SIZE;
 		}
-		for (i = 0; i < sq->hwq.max_elements; i++)
+		for (i = 0; i < sq->hwq.max_elements; i++) {
 			sq->swq[i].psn_search =
 				&psn_search_ptr[get_psne_pg(i + poff)]
 					       [get_psne_idx(i + poff)];
+			/* psn_ext will be used only for P5 chips. */
+			sq->swq[i].psn_ext =
+				(struct sq_psn_search_ext *)
+				&psn_search_ptr[get_psne_pg(i + poff)]
+					       [get_psne_idx(i + poff)];
+		}
 	}
 	pbl = &sq->hwq.pbl[PBL_LVL_0];
 	req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]);
@@ -931,14 +946,6 @@
 				CMDQ_CREATE_QP_SQ_PG_SIZE_PG_1G :
 		 CMDQ_CREATE_QP_SQ_PG_SIZE_PG_4K);
 
-	/* initialize all SQ WQEs to LOCAL_INVALID (sq prep for hw fetch) */
-	hw_sq_send_ptr = (struct sq_send **)sq->hwq.pbl_ptr;
-	for (sw_prod = 0; sw_prod < sq->hwq.max_elements; sw_prod++) {
-		hw_sq_send_hdr = &hw_sq_send_ptr[get_sqe_pg(sw_prod)]
-						[get_sqe_idx(sw_prod)];
-		hw_sq_send_hdr->wqe_type = SQ_BASE_WQE_TYPE_LOCAL_INVALID;
-	}
-
 	if (qp->scq)
 		req.scq_cid = cpu_to_le32(qp->scq->id);
 
@@ -950,8 +957,9 @@
 	/* RQ */
 	if (rq->max_wqe) {
 		rq->hwq.max_elements = rq->max_wqe;
-		rc = bnxt_qplib_alloc_init_hwq(res->pdev, &rq->hwq, rq->sglist,
-					       rq->nmap, &rq->hwq.max_elements,
+		rc = bnxt_qplib_alloc_init_hwq(res->pdev, &rq->hwq,
+					       &rq->sg_info,
+					       &rq->hwq.max_elements,
 					       BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0,
 					       PAGE_SIZE, HWQ_TYPE_QUEUE);
 		if (rc)
@@ -1009,8 +1017,9 @@
 	req.sq_fwo_sq_sge = cpu_to_le16(
 				((max_ssge & CMDQ_CREATE_QP_SQ_SGE_MASK)
 				 << CMDQ_CREATE_QP_SQ_SGE_SFT) | 0);
+	max_rsge = bnxt_qplib_is_chip_gen_p5(res->cctx) ? 6 : rq->max_sge;
 	req.rq_fwo_rq_sge = cpu_to_le16(
-				((rq->max_sge & CMDQ_CREATE_QP_RQ_SGE_MASK)
+				((max_rsge & CMDQ_CREATE_QP_RQ_SGE_MASK)
 				 << CMDQ_CREATE_QP_RQ_SGE_SFT) | 0);
 	/* ORRQ and IRRQ */
 	if (psn_sz) {
@@ -1020,7 +1029,7 @@
 		req_size = xrrq->max_elements *
 			   BNXT_QPLIB_MAX_ORRQE_ENTRY_SIZE + PAGE_SIZE - 1;
 		req_size &= ~(PAGE_SIZE - 1);
-		rc = bnxt_qplib_alloc_init_hwq(res->pdev, xrrq, NULL, 0,
+		rc = bnxt_qplib_alloc_init_hwq(res->pdev, xrrq, NULL,
 					       &xrrq->max_elements,
 					       BNXT_QPLIB_MAX_ORRQE_ENTRY_SIZE,
 					       0, req_size, HWQ_TYPE_CTX);
@@ -1036,7 +1045,7 @@
 			   BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE + PAGE_SIZE - 1;
 		req_size &= ~(PAGE_SIZE - 1);
 
-		rc = bnxt_qplib_alloc_init_hwq(res->pdev, xrrq, NULL, 0,
+		rc = bnxt_qplib_alloc_init_hwq(res->pdev, xrrq, NULL,
 					       &xrrq->max_elements,
 					       BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE,
 					       0, req_size, HWQ_TYPE_CTX);
@@ -1055,6 +1064,7 @@
 
 	qp->id = le32_to_cpu(resp.xid);
 	qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET;
+	qp->cctx = res->cctx;
 	INIT_LIST_HEAD(&qp->sq_flush);
 	INIT_LIST_HEAD(&qp->rq_flush);
 	rcfw->qp_tbl[qp->id].qp_id = qp->id;
@@ -1325,7 +1335,7 @@
 		}
 	}
 	if (i == res->sgid_tbl.max)
-		dev_warn(&res->pdev->dev, "QPLIB: SGID not found??");
+		dev_warn(&res->pdev->dev, "SGID not found??\n");
 
 	qp->ah.hop_limit = sb->hop_limit;
 	qp->ah.traffic_class = sb->traffic_class;
@@ -1496,19 +1506,16 @@
 void bnxt_qplib_post_send_db(struct bnxt_qplib_qp *qp)
 {
 	struct bnxt_qplib_q *sq = &qp->sq;
-	struct dbr_dbr db_msg = { 0 };
 	u32 sw_prod;
+	u64 val = 0;
 
+	val = (((qp->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) |
+	       DBC_DBC_TYPE_SQ);
+	val <<= 32;
 	sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
-
-	db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) &
-				   DBR_DBR_INDEX_MASK);
-	db_msg.type_xid =
-		cpu_to_le32(((qp->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) |
-			    DBR_DBR_TYPE_SQ);
+	val |= (sw_prod << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
 	/* Flush all the WQE writes to HW */
-	wmb();
-	__iowrite64_copy(qp->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64));
+	writeq(val, qp->dpi->dbr);
 }
 
 int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
@@ -1537,7 +1544,7 @@
 
 	if (bnxt_qplib_queue_full(sq)) {
 		dev_err(&sq->hwq.pdev->dev,
-			"QPLIB: prod = %#x cons = %#x qdepth = %#x delta = %#x",
+			"prod = %#x cons = %#x qdepth = %#x delta = %#x\n",
 			sq->hwq.prod, sq->hwq.cons, sq->hwq.max_elements,
 			sq->q_full_delta);
 		rc = -ENOMEM;
@@ -1562,7 +1569,7 @@
 		/* Copy the inline data */
 		if (wqe->inline_len > BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH) {
 			dev_warn(&sq->hwq.pdev->dev,
-				 "QPLIB: Inline data length > 96 detected");
+				 "Inline data length > 96 detected\n");
 			data_len = BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH;
 		} else {
 			data_len = wqe->inline_len;
@@ -1619,7 +1626,8 @@
 				((offsetof(typeof(*sqe), data) + 15) >> 4);
 		sqe->inv_key_or_imm_data = cpu_to_le32(
 						wqe->send.inv_key);
-		if (qp->type == CMDQ_CREATE_QP_TYPE_UD) {
+		if (qp->type == CMDQ_CREATE_QP_TYPE_UD ||
+		    qp->type == CMDQ_CREATE_QP_TYPE_GSI) {
 			sqe->q_key = cpu_to_le32(wqe->send.q_key);
 			sqe->dst_qp = cpu_to_le32(
 					wqe->send.dst_qp & SQ_SEND_DST_QP_MASK);
@@ -1743,14 +1751,26 @@
 	}
 	swq->next_psn = sq->psn & BTH_PSN_MASK;
 	if (swq->psn_search) {
-		swq->psn_search->opcode_start_psn = cpu_to_le32(
-			((swq->start_psn << SQ_PSN_SEARCH_START_PSN_SFT) &
-			 SQ_PSN_SEARCH_START_PSN_MASK) |
-			((wqe->type << SQ_PSN_SEARCH_OPCODE_SFT) &
-			 SQ_PSN_SEARCH_OPCODE_MASK));
-		swq->psn_search->flags_next_psn = cpu_to_le32(
-			((swq->next_psn << SQ_PSN_SEARCH_NEXT_PSN_SFT) &
-			 SQ_PSN_SEARCH_NEXT_PSN_MASK));
+		u32 opcd_spsn;
+		u32 flg_npsn;
+
+		opcd_spsn = ((swq->start_psn << SQ_PSN_SEARCH_START_PSN_SFT) &
+			      SQ_PSN_SEARCH_START_PSN_MASK);
+		opcd_spsn |= ((wqe->type << SQ_PSN_SEARCH_OPCODE_SFT) &
+			       SQ_PSN_SEARCH_OPCODE_MASK);
+		flg_npsn = ((swq->next_psn << SQ_PSN_SEARCH_NEXT_PSN_SFT) &
+			     SQ_PSN_SEARCH_NEXT_PSN_MASK);
+		if (bnxt_qplib_is_chip_gen_p5(qp->cctx)) {
+			swq->psn_ext->opcode_start_psn =
+						cpu_to_le32(opcd_spsn);
+			swq->psn_ext->flags_next_psn =
+						cpu_to_le32(flg_npsn);
+		} else {
+			swq->psn_search->opcode_start_psn =
+						cpu_to_le32(opcd_spsn);
+			swq->psn_search->flags_next_psn =
+						cpu_to_le32(flg_npsn);
+		}
 	}
 queue_err:
 	if (sch_handler) {
@@ -1777,7 +1797,7 @@
 			queue_work(qp->scq->nq->cqn_wq, &nq_work->work);
 		} else {
 			dev_err(&sq->hwq.pdev->dev,
-				"QPLIB: FP: Failed to allocate SQ nq_work!");
+				"FP: Failed to allocate SQ nq_work!\n");
 			rc = -ENOMEM;
 		}
 	}
@@ -1787,19 +1807,16 @@
 void bnxt_qplib_post_recv_db(struct bnxt_qplib_qp *qp)
 {
 	struct bnxt_qplib_q *rq = &qp->rq;
-	struct dbr_dbr db_msg = { 0 };
 	u32 sw_prod;
+	u64 val = 0;
 
+	val = (((qp->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) |
+	       DBC_DBC_TYPE_RQ);
+	val <<= 32;
 	sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
-	db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) &
-				   DBR_DBR_INDEX_MASK);
-	db_msg.type_xid =
-		cpu_to_le32(((qp->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) |
-			    DBR_DBR_TYPE_RQ);
-
+	val |= (sw_prod << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
 	/* Flush the writes to HW Rx WQE before the ringing Rx DB */
-	wmb();
-	__iowrite64_copy(qp->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64));
+	writeq(val, qp->dpi->dbr);
 }
 
 int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp,
@@ -1816,13 +1833,12 @@
 	if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) {
 		sch_handler = true;
 		dev_dbg(&rq->hwq.pdev->dev,
-			"%s Error QP. Scheduling for poll_cq\n",
-			__func__);
+			"%s: Error QP. Scheduling for poll_cq\n", __func__);
 		goto queue_err;
 	}
 	if (bnxt_qplib_queue_full(rq)) {
 		dev_err(&rq->hwq.pdev->dev,
-			"QPLIB: FP: QP (0x%x) RQ is full!", qp->id);
+			"FP: QP (0x%x) RQ is full!\n", qp->id);
 		rc = -EINVAL;
 		goto done;
 	}
@@ -1871,7 +1887,7 @@
 			queue_work(qp->rcq->nq->cqn_wq, &nq_work->work);
 		} else {
 			dev_err(&rq->hwq.pdev->dev,
-				"QPLIB: FP: Failed to allocate RQ nq_work!");
+				"FP: Failed to allocate RQ nq_work!\n");
 			rc = -ENOMEM;
 		}
 	}
@@ -1884,32 +1900,28 @@
 /* Spinlock must be held */
 static void bnxt_qplib_arm_cq_enable(struct bnxt_qplib_cq *cq)
 {
-	struct dbr_dbr db_msg = { 0 };
+	u64 val = 0;
 
-	db_msg.type_xid =
-		cpu_to_le32(((cq->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) |
-			    DBR_DBR_TYPE_CQ_ARMENA);
+	val = ((cq->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) |
+	       DBC_DBC_TYPE_CQ_ARMENA;
+	val <<= 32;
 	/* Flush memory writes before enabling the CQ */
-	wmb();
-	__iowrite64_copy(cq->dbr_base, &db_msg, sizeof(db_msg) / sizeof(u64));
+	writeq(val, cq->dbr_base);
 }
 
 static void bnxt_qplib_arm_cq(struct bnxt_qplib_cq *cq, u32 arm_type)
 {
 	struct bnxt_qplib_hwq *cq_hwq = &cq->hwq;
-	struct dbr_dbr db_msg = { 0 };
 	u32 sw_cons;
+	u64 val = 0;
 
 	/* Ring DB */
+	val = ((cq->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) | arm_type;
+	val <<= 32;
 	sw_cons = HWQ_CMP(cq_hwq->cons, cq_hwq);
-	db_msg.index = cpu_to_le32((sw_cons << DBR_DBR_INDEX_SFT) &
-				    DBR_DBR_INDEX_MASK);
-	db_msg.type_xid =
-		cpu_to_le32(((cq->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) |
-			    arm_type);
+	val |= (sw_cons << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
 	/* flush memory writes before arming the CQ */
-	wmb();
-	__iowrite64_copy(cq->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64));
+	writeq(val, cq->dpi->dbr);
 }
 
 int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
@@ -1922,8 +1934,8 @@
 	int rc;
 
 	cq->hwq.max_elements = cq->max_wqe;
-	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &cq->hwq, cq->sghead,
-				       cq->nmap, &cq->hwq.max_elements,
+	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &cq->hwq, &cq->sg_info,
+				       &cq->hwq.max_elements,
 				       BNXT_QPLIB_MAX_CQE_ENTRY_SIZE, 0,
 				       PAGE_SIZE, HWQ_TYPE_QUEUE);
 	if (rc)
@@ -1933,7 +1945,7 @@
 
 	if (!cq->dpi) {
 		dev_err(&rcfw->pdev->dev,
-			"QPLIB: FP: CREATE_CQ failed due to NULL DPI");
+			"FP: CREATE_CQ failed due to NULL DPI\n");
 		return -EINVAL;
 	}
 	req.dpi = cpu_to_le32(cq->dpi->dpi);
@@ -1970,6 +1982,7 @@
 	INIT_LIST_HEAD(&cq->sqf_head);
 	INIT_LIST_HEAD(&cq->rqf_head);
 	spin_lock_init(&cq->compl_lock);
+	spin_lock_init(&cq->flush_lock);
 
 	bnxt_qplib_arm_cq_enable(cq);
 	return 0;
@@ -2055,6 +2068,7 @@
 		opcode = CQ_BASE_CQE_TYPE_RES_RC;
 		break;
 	case CMDQ_CREATE_QP_TYPE_UD:
+	case CMDQ_CREATE_QP_TYPE_GSI:
 		opcode = CQ_BASE_CQE_TYPE_RES_UD;
 		break;
 	}
@@ -2127,7 +2141,7 @@
 		sq->send_phantom = true;
 
 		/* TODO: Only ARM if the previous SQE is ARMALL */
-		bnxt_qplib_arm_cq(cq, DBR_DBR_TYPE_CQ_ARMALL);
+		bnxt_qplib_arm_cq(cq, DBC_DBC_TYPE_CQ_ARMALL);
 
 		rc = -EAGAIN;
 		goto out;
@@ -2173,7 +2187,7 @@
 						 *  comes back
 						 */
 						dev_dbg(&cq->hwq.pdev->dev,
-							"FP:Got Phantom CQE");
+							"FP: Got Phantom CQE\n");
 						sq->condition = false;
 						sq->single = true;
 						rc = 0;
@@ -2190,7 +2204,7 @@
 			peek_raw_cq_cons++;
 		}
 		dev_err(&cq->hwq.pdev->dev,
-			"Should not have come here! cq_cons=0x%x qp=0x%x sq cons sw=0x%x hw=0x%x",
+			"Should not have come here! cq_cons=0x%x qp=0x%x sq cons sw=0x%x hw=0x%x\n",
 			cq_cons, qp->id, sw_sq_cons, cqe_sq_cons);
 		rc = -EINVAL;
 	}
@@ -2214,7 +2228,7 @@
 				      le64_to_cpu(hwcqe->qp_handle));
 	if (!qp) {
 		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: FP: Process Req qp is NULL");
+			"FP: Process Req qp is NULL\n");
 		return -EINVAL;
 	}
 	sq = &qp->sq;
@@ -2222,16 +2236,14 @@
 	cqe_sq_cons = HWQ_CMP(le16_to_cpu(hwcqe->sq_cons_idx), &sq->hwq);
 	if (cqe_sq_cons > sq->hwq.max_elements) {
 		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: FP: CQ Process req reported ");
-		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: sq_cons_idx 0x%x which exceeded max 0x%x",
+			"FP: CQ Process req reported sq_cons_idx 0x%x which exceeded max 0x%x\n",
 			cqe_sq_cons, sq->hwq.max_elements);
 		return -EINVAL;
 	}
 
 	if (qp->sq.flushed) {
 		dev_dbg(&cq->hwq.pdev->dev,
-			"%s: QPLIB: QP in Flush QP = %p\n", __func__, qp);
+			"%s: QP in Flush QP = %p\n", __func__, qp);
 		goto done;
 	}
 	/* Require to walk the sq's swq to fabricate CQEs for all previously
@@ -2263,9 +2275,7 @@
 		    hwcqe->status != CQ_REQ_STATUS_OK) {
 			cqe->status = hwcqe->status;
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: FP: CQ Processed Req ");
-			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: wr_id[%d] = 0x%llx with status 0x%x",
+				"FP: CQ Processed Req wr_id[%d] = 0x%llx with status 0x%x\n",
 				sw_sq_cons, cqe->wr_id, cqe->status);
 			cqe++;
 			(*budget)--;
@@ -2331,12 +2341,12 @@
 	qp = (struct bnxt_qplib_qp *)((unsigned long)
 				      le64_to_cpu(hwcqe->qp_handle));
 	if (!qp) {
-		dev_err(&cq->hwq.pdev->dev, "QPLIB: process_cq RC qp is NULL");
+		dev_err(&cq->hwq.pdev->dev, "process_cq RC qp is NULL\n");
 		return -EINVAL;
 	}
 	if (qp->rq.flushed) {
 		dev_dbg(&cq->hwq.pdev->dev,
-			"%s: QPLIB: QP in Flush QP = %p\n", __func__, qp);
+			"%s: QP in Flush QP = %p\n", __func__, qp);
 		goto done;
 	}
 
@@ -2357,9 +2367,7 @@
 			return -EINVAL;
 		if (wr_id_idx >= srq->hwq.max_elements) {
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: FP: CQ Process RC ");
-			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x",
+				"FP: CQ Process RC wr_id idx 0x%x exceeded SRQ max 0x%x\n",
 				wr_id_idx, srq->hwq.max_elements);
 			return -EINVAL;
 		}
@@ -2372,9 +2380,7 @@
 		rq = &qp->rq;
 		if (wr_id_idx >= rq->hwq.max_elements) {
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: FP: CQ Process RC ");
-			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x",
+				"FP: CQ Process RC wr_id idx 0x%x exceeded RQ max 0x%x\n",
 				wr_id_idx, rq->hwq.max_elements);
 			return -EINVAL;
 		}
@@ -2410,22 +2416,24 @@
 	qp = (struct bnxt_qplib_qp *)((unsigned long)
 				      le64_to_cpu(hwcqe->qp_handle));
 	if (!qp) {
-		dev_err(&cq->hwq.pdev->dev, "QPLIB: process_cq UD qp is NULL");
+		dev_err(&cq->hwq.pdev->dev, "process_cq UD qp is NULL\n");
 		return -EINVAL;
 	}
 	if (qp->rq.flushed) {
 		dev_dbg(&cq->hwq.pdev->dev,
-			"%s: QPLIB: QP in Flush QP = %p\n", __func__, qp);
+			"%s: QP in Flush QP = %p\n", __func__, qp);
 		goto done;
 	}
 	cqe = *pcqe;
 	cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK;
-	cqe->length = le32_to_cpu(hwcqe->length);
+	cqe->length = (u32)le16_to_cpu(hwcqe->length);
+	cqe->cfa_meta = le16_to_cpu(hwcqe->cfa_metadata);
 	cqe->invrkey = le32_to_cpu(hwcqe->imm_data);
 	cqe->flags = le16_to_cpu(hwcqe->flags);
 	cqe->status = hwcqe->status;
 	cqe->qp_handle = (u64)(unsigned long)qp;
-	memcpy(cqe->smac, hwcqe->src_mac, 6);
+	/* FIXME: Endianness fix needed for smac */
+	memcpy(cqe->smac, hwcqe->src_mac, ETH_ALEN);
 	wr_id_idx = le32_to_cpu(hwcqe->src_qp_high_srq_or_rq_wr_id)
 				& CQ_RES_UD_SRQ_OR_RQ_WR_ID_MASK;
 	cqe->src_qp = le16_to_cpu(hwcqe->src_qp_low) |
@@ -2440,9 +2448,7 @@
 
 		if (wr_id_idx >= srq->hwq.max_elements) {
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: FP: CQ Process UD ");
-			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x",
+				"FP: CQ Process UD wr_id idx 0x%x exceeded SRQ max 0x%x\n",
 				wr_id_idx, srq->hwq.max_elements);
 			return -EINVAL;
 		}
@@ -2455,9 +2461,7 @@
 		rq = &qp->rq;
 		if (wr_id_idx >= rq->hwq.max_elements) {
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: FP: CQ Process UD ");
-			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x",
+				"FP: CQ Process UD wr_id idx 0x%x exceeded RQ max 0x%x\n",
 				wr_id_idx, rq->hwq.max_elements);
 			return -EINVAL;
 		}
@@ -2509,13 +2513,12 @@
 	qp = (struct bnxt_qplib_qp *)((unsigned long)
 				      le64_to_cpu(hwcqe->qp_handle));
 	if (!qp) {
-		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: process_cq Raw/QP1 qp is NULL");
+		dev_err(&cq->hwq.pdev->dev, "process_cq Raw/QP1 qp is NULL\n");
 		return -EINVAL;
 	}
 	if (qp->rq.flushed) {
 		dev_dbg(&cq->hwq.pdev->dev,
-			"%s: QPLIB: QP in Flush QP = %p\n", __func__, qp);
+			"%s: QP in Flush QP = %p\n", __func__, qp);
 		goto done;
 	}
 	cqe = *pcqe;
@@ -2544,14 +2547,12 @@
 		srq = qp->srq;
 		if (!srq) {
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: FP: SRQ used but not defined??");
+				"FP: SRQ used but not defined??\n");
 			return -EINVAL;
 		}
 		if (wr_id_idx >= srq->hwq.max_elements) {
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: FP: CQ Process Raw/QP1 ");
-			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x",
+				"FP: CQ Process Raw/QP1 wr_id idx 0x%x exceeded SRQ max 0x%x\n",
 				wr_id_idx, srq->hwq.max_elements);
 			return -EINVAL;
 		}
@@ -2564,9 +2565,7 @@
 		rq = &qp->rq;
 		if (wr_id_idx >= rq->hwq.max_elements) {
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: FP: CQ Process Raw/QP1 RQ wr_id ");
-			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: ix 0x%x exceeded RQ max 0x%x",
+				"FP: CQ Process Raw/QP1 RQ wr_id idx 0x%x exceeded RQ max 0x%x\n",
 				wr_id_idx, rq->hwq.max_elements);
 			return -EINVAL;
 		}
@@ -2601,14 +2600,14 @@
 	/* Check the Status */
 	if (hwcqe->status != CQ_TERMINAL_STATUS_OK)
 		dev_warn(&cq->hwq.pdev->dev,
-			 "QPLIB: FP: CQ Process Terminal Error status = 0x%x",
+			 "FP: CQ Process Terminal Error status = 0x%x\n",
 			 hwcqe->status);
 
 	qp = (struct bnxt_qplib_qp *)((unsigned long)
 				      le64_to_cpu(hwcqe->qp_handle));
 	if (!qp) {
 		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: FP: CQ Process terminal qp is NULL");
+			"FP: CQ Process terminal qp is NULL\n");
 		return -EINVAL;
 	}
 
@@ -2624,16 +2623,14 @@
 
 	if (cqe_cons > sq->hwq.max_elements) {
 		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: FP: CQ Process terminal reported ");
-		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: sq_cons_idx 0x%x which exceeded max 0x%x",
+			"FP: CQ Process terminal reported sq_cons_idx 0x%x which exceeded max 0x%x\n",
 			cqe_cons, sq->hwq.max_elements);
 		goto do_rq;
 	}
 
 	if (qp->sq.flushed) {
 		dev_dbg(&cq->hwq.pdev->dev,
-			"%s: QPLIB: QP in Flush QP = %p\n", __func__, qp);
+			"%s: QP in Flush QP = %p\n", __func__, qp);
 		goto sq_done;
 	}
 
@@ -2674,16 +2671,14 @@
 		goto done;
 	} else if (cqe_cons > rq->hwq.max_elements) {
 		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: FP: CQ Processed terminal ");
-		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: reported rq_cons_idx 0x%x exceeds max 0x%x",
+			"FP: CQ Processed terminal reported rq_cons_idx 0x%x exceeds max 0x%x\n",
 			cqe_cons, rq->hwq.max_elements);
 		goto done;
 	}
 
 	if (qp->rq.flushed) {
 		dev_dbg(&cq->hwq.pdev->dev,
-			"%s: QPLIB: QP in Flush QP = %p\n", __func__, qp);
+			"%s: QP in Flush QP = %p\n", __func__, qp);
 		rc = 0;
 		goto done;
 	}
@@ -2705,7 +2700,7 @@
 	/* Check the Status */
 	if (hwcqe->status != CQ_CUTOFF_STATUS_OK) {
 		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: FP: CQ Process Cutoff Error status = 0x%x",
+			"FP: CQ Process Cutoff Error status = 0x%x\n",
 			hwcqe->status);
 		return -EINVAL;
 	}
@@ -2725,16 +2720,12 @@
 
 	spin_lock_irqsave(&cq->flush_lock, flags);
 	list_for_each_entry(qp, &cq->sqf_head, sq_flush) {
-		dev_dbg(&cq->hwq.pdev->dev,
-			"QPLIB: FP: Flushing SQ QP= %p",
-			qp);
+		dev_dbg(&cq->hwq.pdev->dev, "FP: Flushing SQ QP= %p\n", qp);
 		__flush_sq(&qp->sq, qp, &cqe, &budget);
 	}
 
 	list_for_each_entry(qp, &cq->rqf_head, rq_flush) {
-		dev_dbg(&cq->hwq.pdev->dev,
-			"QPLIB: FP: Flushing RQ QP= %p",
-			qp);
+		dev_dbg(&cq->hwq.pdev->dev, "FP: Flushing RQ QP= %p\n", qp);
 		__flush_rq(&qp->rq, qp, &cqe, &budget);
 	}
 	spin_unlock_irqrestore(&cq->flush_lock, flags);
@@ -2802,7 +2793,7 @@
 			goto exit;
 		default:
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: process_cq unknown type 0x%lx",
+				"process_cq unknown type 0x%lx\n",
 				hw_cqe->cqe_type_toggle &
 				CQ_BASE_CQE_TYPE_MASK);
 			rc = -EINVAL;
@@ -2815,13 +2806,13 @@
 			 * next one
 			 */
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: process_cqe error rc = 0x%x", rc);
+				"process_cqe error rc = 0x%x\n", rc);
 		}
 		raw_cons++;
 	}
 	if (cq->hwq.cons != raw_cons) {
 		cq->hwq.cons = raw_cons;
-		bnxt_qplib_arm_cq(cq, DBR_DBR_TYPE_CQ);
+		bnxt_qplib_arm_cq(cq, DBC_DBC_TYPE_CQ);
 	}
 exit:
 	return num_cqes - budget;
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
index 72352ca..99e0a13 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
@@ -52,10 +52,9 @@
 	struct bnxt_qplib_cq		*cq;
 	struct bnxt_qplib_hwq		hwq;
 	struct bnxt_qplib_swq		*swq;
-	struct scatterlist		*sglist;
 	int				start_idx;
 	int				last_idx;
-	u32				nmap;
+	struct bnxt_qplib_sg_info	sg_info;
 	u16				eventq_hw_ring_id;
 	spinlock_t			lock; /* protect SRQE link list */
 };
@@ -106,6 +105,7 @@
 	u32				start_psn;
 	u32				next_psn;
 	struct sq_psn_search		*psn_search;
+	struct sq_psn_search_ext	*psn_ext;
 };
 
 struct bnxt_qplib_swqe {
@@ -236,8 +236,7 @@
 struct bnxt_qplib_q {
 	struct bnxt_qplib_hwq		hwq;
 	struct bnxt_qplib_swq		*swq;
-	struct scatterlist		*sglist;
-	u32				nmap;
+	struct bnxt_qplib_sg_info	sg_info;
 	u32				max_wqe;
 	u16				q_full_delta;
 	u16				max_sge;
@@ -254,6 +253,7 @@
 struct bnxt_qplib_qp {
 	struct bnxt_qplib_pd		*pd;
 	struct bnxt_qplib_dpi		*dpi;
+	struct bnxt_qplib_chip_ctx	*cctx;
 	u64				qp_handle;
 #define        BNXT_QPLIB_QP_ID_INVALID        0xFFFFFFFF
 	u32				id;
@@ -347,6 +347,7 @@
 	u8				type;
 	u8				opcode;
 	u32				length;
+	u16				cfa_meta;
 	u64				wr_id;
 	union {
 		__be32			immdata;
@@ -378,8 +379,7 @@
 	u32				cnq_hw_ring_id;
 	struct bnxt_qplib_nq		*nq;
 	bool				resize_in_progress;
-	struct scatterlist		*sghead;
-	u32				nmap;
+	struct bnxt_qplib_sg_info	sg_info;
 	u64				cq_handle;
 
 #define CQ_RESIZE_WAIT_TIME_MS		500
@@ -432,13 +432,47 @@
 #define NQ_DB_CP_FLAGS			(NQ_DB_KEY_CP    |	\
 					 NQ_DB_IDX_VALID |	\
 					 NQ_DB_IRQ_DIS)
-#define NQ_DB_REARM(db, raw_cons, cp_bit)			\
-	writel(NQ_DB_CP_FLAGS_REARM | ((raw_cons) & ((cp_bit) - 1)), db)
-#define NQ_DB(db, raw_cons, cp_bit)				\
-	writel(NQ_DB_CP_FLAGS | ((raw_cons) & ((cp_bit) - 1)), db)
+
+static inline void bnxt_qplib_ring_nq_db64(void __iomem *db, u32 index,
+					   u32 xid, bool arm)
+{
+	u64 val;
+
+	val = xid & DBC_DBC_XID_MASK;
+	val |= DBC_DBC_PATH_ROCE;
+	val |= arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ;
+	val <<= 32;
+	val |= index & DBC_DBC_INDEX_MASK;
+	writeq(val, db);
+}
+
+static inline void bnxt_qplib_ring_nq_db_rearm(void __iomem *db, u32 raw_cons,
+					       u32 max_elements, u32 xid,
+					       bool gen_p5)
+{
+	u32 index = raw_cons & (max_elements - 1);
+
+	if (gen_p5)
+		bnxt_qplib_ring_nq_db64(db, index, xid, true);
+	else
+		writel(NQ_DB_CP_FLAGS_REARM | (index & DBC_DBC32_XID_MASK), db);
+}
+
+static inline void bnxt_qplib_ring_nq_db(void __iomem *db, u32 raw_cons,
+					 u32 max_elements, u32 xid,
+					 bool gen_p5)
+{
+	u32 index = raw_cons & (max_elements - 1);
+
+	if (gen_p5)
+		bnxt_qplib_ring_nq_db64(db, index, xid, false);
+	else
+		writel(NQ_DB_CP_FLAGS | (index & DBC_DBC32_XID_MASK), db);
+}
 
 struct bnxt_qplib_nq {
 	struct pci_dev		*pdev;
+	struct bnxt_qplib_res	*res;
 
 	int			vector;
 	cpumask_t		mask;
@@ -448,7 +482,7 @@
 	struct bnxt_qplib_hwq	hwq;
 
 	u16			bar_reg;
-	u16			bar_reg_off;
+	u32			bar_reg_off;
 	u16			ring_id;
 	void __iomem		*bar_reg_iomem;
 
@@ -484,8 +518,8 @@
 			  struct bnxt_qplib_srq *srq);
 int bnxt_qplib_query_srq(struct bnxt_qplib_res *res,
 			 struct bnxt_qplib_srq *srq);
-int bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res,
-			   struct bnxt_qplib_srq *srq);
+void bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res,
+			    struct bnxt_qplib_srq *srq);
 int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq,
 			     struct bnxt_qplib_swqe *wqe);
 int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp);
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
index 6637df7..60c8f76 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
@@ -35,6 +35,9 @@
  *
  * Description: RDMA Controller HW interface
  */
+
+#define dev_fmt(fmt) "QPLIB: " fmt
+
 #include <linux/interrupt.h>
 #include <linux/spinlock.h>
 #include <linux/pci.h>
@@ -55,7 +58,7 @@
 	u16 cbit;
 	int rc;
 
-	cbit = cookie % RCFW_MAX_OUTSTANDING_CMD;
+	cbit = cookie % rcfw->cmdq_depth;
 	rc = wait_event_timeout(rcfw->waitq,
 				!test_bit(cbit, rcfw->cmdq_bitmap),
 				msecs_to_jiffies(RCFW_CMD_WAIT_TIME_MS));
@@ -67,7 +70,7 @@
 	u32 count = RCFW_BLOCKED_CMD_WAIT_COUNT;
 	u16 cbit;
 
-	cbit = cookie % RCFW_MAX_OUTSTANDING_CMD;
+	cbit = cookie % rcfw->cmdq_depth;
 	if (!test_bit(cbit, rcfw->cmdq_bitmap))
 		goto done;
 	do {
@@ -83,6 +86,7 @@
 {
 	struct bnxt_qplib_cmdqe *cmdqe, **cmdq_ptr;
 	struct bnxt_qplib_hwq *cmdq = &rcfw->cmdq;
+	u32 cmdq_depth = rcfw->cmdq_depth;
 	struct bnxt_qplib_crsq *crsqe;
 	u32 sw_prod, cmdq_prod;
 	unsigned long flags;
@@ -96,14 +100,13 @@
 	     opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW &&
 	     opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) {
 		dev_err(&rcfw->pdev->dev,
-			"QPLIB: RCFW not initialized, reject opcode 0x%x",
-			opcode);
+			"RCFW not initialized, reject opcode 0x%x\n", opcode);
 		return -EINVAL;
 	}
 
 	if (test_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags) &&
 	    opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) {
-		dev_err(&rcfw->pdev->dev, "QPLIB: RCFW already initialized!");
+		dev_err(&rcfw->pdev->dev, "RCFW already initialized!\n");
 		return -EINVAL;
 	}
 
@@ -115,14 +118,14 @@
 	 */
 	spin_lock_irqsave(&cmdq->lock, flags);
 	if (req->cmd_size >= HWQ_FREE_SLOTS(cmdq)) {
-		dev_err(&rcfw->pdev->dev, "QPLIB: RCFW: CMDQ is full!");
+		dev_err(&rcfw->pdev->dev, "RCFW: CMDQ is full!\n");
 		spin_unlock_irqrestore(&cmdq->lock, flags);
 		return -EAGAIN;
 	}
 
 
 	cookie = rcfw->seq_num & RCFW_MAX_COOKIE_VALUE;
-	cbit = cookie % RCFW_MAX_OUTSTANDING_CMD;
+	cbit = cookie % rcfw->cmdq_depth;
 	if (is_block)
 		cookie |= RCFW_CMD_IS_BLOCKING;
 
@@ -133,6 +136,13 @@
 		spin_unlock_irqrestore(&cmdq->lock, flags);
 		return -EBUSY;
 	}
+
+	size = req->cmd_size;
+	/* Change the cmd_size to the number of 16-byte cmdq units.
+	 * req->cmd_size is modified here.
+	 */
+	bnxt_qplib_set_cmd_slots(req);
+
 	memset(resp, 0, sizeof(*resp));
 	crsqe->resp = (struct creq_qp_event *)resp;
 	crsqe->resp->cookie = req->cookie;
@@ -147,14 +157,14 @@
 
 	cmdq_ptr = (struct bnxt_qplib_cmdqe **)cmdq->pbl_ptr;
 	preq = (u8 *)req;
-	size = req->cmd_size * BNXT_QPLIB_CMDQE_UNITS;
 	do {
 		/* Locate the next cmdq slot */
 		sw_prod = HWQ_CMP(cmdq->prod, cmdq);
-		cmdqe = &cmdq_ptr[get_cmdq_pg(sw_prod)][get_cmdq_idx(sw_prod)];
+		cmdqe = &cmdq_ptr[get_cmdq_pg(sw_prod, cmdq_depth)]
+				[get_cmdq_idx(sw_prod, cmdq_depth)];
 		if (!cmdqe) {
 			dev_err(&rcfw->pdev->dev,
-				"QPLIB: RCFW request failed with no cmdqe!");
+				"RCFW request failed with no cmdqe!\n");
 			goto done;
 		}
 		/* Copy a segment of the req cmd to the cmdq */
@@ -210,7 +220,7 @@
 
 		if (!retry_cnt || (rc != -EAGAIN && rc != -EBUSY)) {
 			/* send failed */
-			dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x send failed",
+			dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x send failed\n",
 				cookie, opcode);
 			return rc;
 		}
@@ -224,7 +234,7 @@
 		rc = __wait_for_resp(rcfw, cookie);
 	if (rc) {
 		/* timed out */
-		dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x timedout (%d)msec",
+		dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x timedout (%d)msec\n",
 			cookie, opcode, RCFW_CMD_WAIT_TIME_MS);
 		set_bit(FIRMWARE_TIMED_OUT, &rcfw->flags);
 		return rc;
@@ -232,7 +242,7 @@
 
 	if (evnt->status) {
 		/* failed with status */
-		dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x status %#x",
+		dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x status %#x\n",
 			cookie, opcode, evnt->status);
 		rc = -EFAULT;
 	}
@@ -298,9 +308,9 @@
 		qp_id = le32_to_cpu(err_event->xid);
 		qp = rcfw->qp_tbl[qp_id].qp_handle;
 		dev_dbg(&rcfw->pdev->dev,
-			"QPLIB: Received QP error notification");
+			"Received QP error notification\n");
 		dev_dbg(&rcfw->pdev->dev,
-			"QPLIB: qpid 0x%x, req_err=0x%x, resp_err=0x%x\n",
+			"qpid 0x%x, req_err=0x%x, resp_err=0x%x\n",
 			qp_id, err_event->req_err_state_reason,
 			err_event->res_err_state_reason);
 		if (!qp)
@@ -324,21 +334,23 @@
 		mcookie = qp_event->cookie;
 		blocked = cookie & RCFW_CMD_IS_BLOCKING;
 		cookie &= RCFW_MAX_COOKIE_VALUE;
-		cbit = cookie % RCFW_MAX_OUTSTANDING_CMD;
+		cbit = cookie % rcfw->cmdq_depth;
 		crsqe = &rcfw->crsqe_tbl[cbit];
 		if (crsqe->resp &&
 		    crsqe->resp->cookie  == mcookie) {
 			memcpy(crsqe->resp, qp_event, sizeof(*qp_event));
 			crsqe->resp = NULL;
 		} else {
-			dev_err(&rcfw->pdev->dev,
-				"QPLIB: CMD %s resp->cookie = %#x, evnt->cookie = %#x",
-				crsqe->resp ? "mismatch" : "collision",
-				crsqe->resp ? crsqe->resp->cookie : 0, mcookie);
+			if (crsqe->resp && crsqe->resp->cookie)
+				dev_err(&rcfw->pdev->dev,
+					"CMD %s cookie sent=%#x, recd=%#x\n",
+					crsqe->resp ? "mismatch" : "collision",
+					crsqe->resp ? crsqe->resp->cookie : 0,
+					mcookie);
 		}
 		if (!test_and_clear_bit(cbit, rcfw->cmdq_bitmap))
 			dev_warn(&rcfw->pdev->dev,
-				 "QPLIB: CMD bit %d was not requested", cbit);
+				 "CMD bit %d was not requested\n", cbit);
 		cmdq->cons += crsqe->req_size;
 		crsqe->req_size = 0;
 
@@ -353,11 +365,12 @@
 static void bnxt_qplib_service_creq(unsigned long data)
 {
 	struct bnxt_qplib_rcfw *rcfw = (struct bnxt_qplib_rcfw *)data;
+	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx);
 	struct bnxt_qplib_hwq *creq = &rcfw->creq;
+	u32 type, budget = CREQ_ENTRY_POLL_BUDGET;
 	struct creq_base *creqe, **creq_ptr;
 	u32 sw_cons, raw_cons;
 	unsigned long flags;
-	u32 type, budget = CREQ_ENTRY_POLL_BUDGET;
 
 	/* Service the CREQ until budget is over */
 	spin_lock_irqsave(&creq->lock, flags);
@@ -385,14 +398,14 @@
 			    (rcfw, (struct creq_func_event *)creqe))
 				rcfw->creq_func_event_processed++;
 			else
-				dev_warn
-				(&rcfw->pdev->dev, "QPLIB:aeqe:%#x Not handled",
-				 type);
+				dev_warn(&rcfw->pdev->dev,
+					 "aeqe:%#x Not handled\n", type);
 			break;
 		default:
-			dev_warn(&rcfw->pdev->dev, "QPLIB: creqe with ");
-			dev_warn(&rcfw->pdev->dev,
-				 "QPLIB: op_event = 0x%x not handled", type);
+			if (type != ASYNC_EVENT_CMPL_TYPE_HWRM_ASYNC_EVENT)
+				dev_warn(&rcfw->pdev->dev,
+					 "creqe with event 0x%x not handled\n",
+					 type);
 			break;
 		}
 		raw_cons++;
@@ -401,8 +414,9 @@
 
 	if (creq->cons != raw_cons) {
 		creq->cons = raw_cons;
-		CREQ_DB_REARM(rcfw->creq_bar_reg_iomem, raw_cons,
-			      creq->max_elements);
+		bnxt_qplib_ring_creq_db_rearm(rcfw->creq_bar_reg_iomem,
+					      raw_cons, creq->max_elements,
+					      rcfw->creq_ring_id, gen_p5);
 	}
 	spin_unlock_irqrestore(&creq->lock, flags);
 }
@@ -474,11 +488,13 @@
 	req.log2_dbr_pg_size = cpu_to_le16(PAGE_SHIFT -
 					   RCFW_DBR_BASE_PAGE_SHIFT);
 	/*
-	 * VFs need not setup the HW context area, PF
+	 * Gen P5 devices don't require this allocation
+	 * as the L2 driver does the same for RoCE also.
+	 * Also, VFs need not setup the HW context area, PF
 	 * shall setup this area for VF. Skipping the
 	 * HW programming
 	 */
-	if (is_virtfn)
+	if (is_virtfn || bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx))
 		goto skip_ctx_setup;
 
 	level = ctx->qpc_tbl.level;
@@ -551,25 +567,36 @@
 
 int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
 				  struct bnxt_qplib_rcfw *rcfw,
+				  struct bnxt_qplib_ctx *ctx,
 				  int qp_tbl_sz)
 {
+	u8 hwq_type;
+
 	rcfw->pdev = pdev;
 	rcfw->creq.max_elements = BNXT_QPLIB_CREQE_MAX_CNT;
-	if (bnxt_qplib_alloc_init_hwq(rcfw->pdev, &rcfw->creq, NULL, 0,
+	hwq_type = bnxt_qplib_get_hwq_type(rcfw->res);
+	if (bnxt_qplib_alloc_init_hwq(rcfw->pdev, &rcfw->creq, NULL,
 				      &rcfw->creq.max_elements,
-				      BNXT_QPLIB_CREQE_UNITS, 0, PAGE_SIZE,
-				      HWQ_TYPE_L2_CMPL)) {
+				      BNXT_QPLIB_CREQE_UNITS,
+				      0, PAGE_SIZE, hwq_type)) {
 		dev_err(&rcfw->pdev->dev,
-			"QPLIB: HW channel CREQ allocation failed");
+			"HW channel CREQ allocation failed\n");
 		goto fail;
 	}
-	rcfw->cmdq.max_elements = BNXT_QPLIB_CMDQE_MAX_CNT;
-	if (bnxt_qplib_alloc_init_hwq(rcfw->pdev, &rcfw->cmdq, NULL, 0,
-				      &rcfw->cmdq.max_elements,
-				      BNXT_QPLIB_CMDQE_UNITS, 0, PAGE_SIZE,
-				      HWQ_TYPE_CTX)) {
+	if (ctx->hwrm_intf_ver < HWRM_VERSION_RCFW_CMDQ_DEPTH_CHECK)
+		rcfw->cmdq_depth = BNXT_QPLIB_CMDQE_MAX_CNT_256;
+	else
+		rcfw->cmdq_depth = BNXT_QPLIB_CMDQE_MAX_CNT_8192;
+
+	rcfw->cmdq.max_elements = rcfw->cmdq_depth;
+	if (bnxt_qplib_alloc_init_hwq
+			(rcfw->pdev, &rcfw->cmdq, NULL,
+			 &rcfw->cmdq.max_elements,
+			 BNXT_QPLIB_CMDQE_UNITS, 0,
+			 bnxt_qplib_cmdqe_page_size(rcfw->cmdq_depth),
+			 HWQ_TYPE_CTX)) {
 		dev_err(&rcfw->pdev->dev,
-			"QPLIB: HW channel CMDQ allocation failed");
+			"HW channel CMDQ allocation failed\n");
 		goto fail;
 	}
 
@@ -593,10 +620,13 @@
 
 void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill)
 {
+	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx);
+
 	tasklet_disable(&rcfw->worker);
 	/* Mask h/w interrupts */
-	CREQ_DB(rcfw->creq_bar_reg_iomem, rcfw->creq.cons,
-		rcfw->creq.max_elements);
+	bnxt_qplib_ring_creq_db(rcfw->creq_bar_reg_iomem, rcfw->creq.cons,
+				rcfw->creq.max_elements, rcfw->creq_ring_id,
+				gen_p5);
 	/* Sync with last running IRQ-handler */
 	synchronize_irq(rcfw->vector);
 	if (kill)
@@ -614,21 +644,18 @@
 
 	bnxt_qplib_rcfw_stop_irq(rcfw, true);
 
-	if (rcfw->cmdq_bar_reg_iomem)
-		iounmap(rcfw->cmdq_bar_reg_iomem);
-	rcfw->cmdq_bar_reg_iomem = NULL;
-
-	if (rcfw->creq_bar_reg_iomem)
-		iounmap(rcfw->creq_bar_reg_iomem);
-	rcfw->creq_bar_reg_iomem = NULL;
+	iounmap(rcfw->cmdq_bar_reg_iomem);
+	iounmap(rcfw->creq_bar_reg_iomem);
 
 	indx = find_first_bit(rcfw->cmdq_bitmap, rcfw->bmap_size);
 	if (indx != rcfw->bmap_size)
 		dev_err(&rcfw->pdev->dev,
-			"QPLIB: disabling RCFW with pending cmd-bit %lx", indx);
+			"disabling RCFW with pending cmd-bit %lx\n", indx);
 	kfree(rcfw->cmdq_bitmap);
 	rcfw->bmap_size = 0;
 
+	rcfw->cmdq_bar_reg_iomem = NULL;
+	rcfw->creq_bar_reg_iomem = NULL;
 	rcfw->aeq_handler = NULL;
 	rcfw->vector = 0;
 }
@@ -636,6 +663,7 @@
 int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector,
 			      bool need_init)
 {
+	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx);
 	int rc;
 
 	if (rcfw->requested)
@@ -652,8 +680,9 @@
 	if (rc)
 		return rc;
 	rcfw->requested = true;
-	CREQ_DB_REARM(rcfw->creq_bar_reg_iomem, rcfw->creq.cons,
-		      rcfw->creq.max_elements);
+	bnxt_qplib_ring_creq_db_rearm(rcfw->creq_bar_reg_iomem,
+				      rcfw->creq.cons, rcfw->creq.max_elements,
+				      rcfw->creq_ring_id, gen_p5);
 
 	return 0;
 }
@@ -673,8 +702,7 @@
 	/* General */
 	rcfw->seq_num = 0;
 	set_bit(FIRMWARE_FIRST_FLAG, &rcfw->flags);
-	bmap_size = BITS_TO_LONGS(RCFW_MAX_OUTSTANDING_CMD *
-				  sizeof(unsigned long));
+	bmap_size = BITS_TO_LONGS(rcfw->cmdq_depth) * sizeof(unsigned long);
 	rcfw->cmdq_bitmap = kzalloc(bmap_size, GFP_KERNEL);
 	if (!rcfw->cmdq_bitmap)
 		return -ENOMEM;
@@ -690,8 +718,7 @@
 					      RCFW_COMM_BASE_OFFSET,
 					      RCFW_COMM_SIZE);
 	if (!rcfw->cmdq_bar_reg_iomem) {
-		dev_err(&rcfw->pdev->dev,
-			"QPLIB: CMDQ BAR region %d mapping failed",
+		dev_err(&rcfw->pdev->dev, "CMDQ BAR region %d mapping failed\n",
 			rcfw->cmdq_bar_reg);
 		return -ENOMEM;
 	}
@@ -706,14 +733,16 @@
 	res_base = pci_resource_start(pdev, rcfw->creq_bar_reg);
 	if (!res_base)
 		dev_err(&rcfw->pdev->dev,
-			"QPLIB: CREQ BAR region %d resc start is 0!",
+			"CREQ BAR region %d resc start is 0!\n",
 			rcfw->creq_bar_reg);
+	/* Unconditionally map 8 bytes to support 57500 series */
 	rcfw->creq_bar_reg_iomem = ioremap_nocache(res_base + cp_bar_reg_off,
-						   4);
+						   8);
 	if (!rcfw->creq_bar_reg_iomem) {
-		dev_err(&rcfw->pdev->dev,
-			"QPLIB: CREQ BAR region %d mapping failed",
+		dev_err(&rcfw->pdev->dev, "CREQ BAR region %d mapping failed\n",
 			rcfw->creq_bar_reg);
+		iounmap(rcfw->cmdq_bar_reg_iomem);
+		rcfw->cmdq_bar_reg_iomem = NULL;
 		return -ENOMEM;
 	}
 	rcfw->creq_qp_event_processed = 0;
@@ -726,14 +755,14 @@
 	rc = bnxt_qplib_rcfw_start_irq(rcfw, msix_vector, true);
 	if (rc) {
 		dev_err(&rcfw->pdev->dev,
-			"QPLIB: Failed to request IRQ for CREQ rc = 0x%x", rc);
+			"Failed to request IRQ for CREQ rc = 0x%x\n", rc);
 		bnxt_qplib_disable_rcfw_channel(rcfw);
 		return rc;
 	}
 
 	init.cmdq_pbl = cpu_to_le64(rcfw->cmdq.pbl[PBL_LVL_0].pg_map_arr[0]);
 	init.cmdq_size_cmdq_lvl = cpu_to_le16(
-		((BNXT_QPLIB_CMDQE_MAX_CNT << CMDQ_INIT_CMDQ_SIZE_SFT) &
+		((rcfw->cmdq_depth << CMDQ_INIT_CMDQ_SIZE_SFT) &
 		 CMDQ_INIT_CMDQ_SIZE_MASK) |
 		((rcfw->cmdq.level << CMDQ_INIT_CMDQ_LVL_SFT) &
 		 CMDQ_INIT_CMDQ_LVL_MASK));
@@ -755,8 +784,8 @@
 		return NULL;
 
 	sbuf->size = size;
-	sbuf->sb = dma_zalloc_coherent(&rcfw->pdev->dev, sbuf->size,
-				       &sbuf->dma_addr, GFP_ATOMIC);
+	sbuf->sb = dma_alloc_coherent(&rcfw->pdev->dev, sbuf->size,
+				      &sbuf->dma_addr, GFP_ATOMIC);
 	if (!sbuf->sb)
 		goto bail;
 
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
index 46416df..dfeadc1 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
@@ -55,40 +55,73 @@
 	do {								\
 		memset(&(req), 0, sizeof((req)));			\
 		(req).opcode = CMDQ_BASE_OPCODE_##CMD;			\
-		(req).cmd_size = (sizeof((req)) +			\
-				BNXT_QPLIB_CMDQE_UNITS - 1) /		\
-				BNXT_QPLIB_CMDQE_UNITS;			\
+		(req).cmd_size = sizeof((req));				\
 		(req).flags = cpu_to_le16(cmd_flags);			\
 	} while (0)
 
 #define RCFW_CMD_WAIT_TIME_MS		20000 /* 20 Seconds timeout */
 
-/* CMDQ elements */
-#define BNXT_QPLIB_CMDQE_MAX_CNT	256
-#define BNXT_QPLIB_CMDQE_UNITS		sizeof(struct bnxt_qplib_cmdqe)
-#define BNXT_QPLIB_CMDQE_CNT_PER_PG	(PAGE_SIZE / BNXT_QPLIB_CMDQE_UNITS)
-
-#define MAX_CMDQ_IDX			(BNXT_QPLIB_CMDQE_MAX_CNT - 1)
-#define MAX_CMDQ_IDX_PER_PG		(BNXT_QPLIB_CMDQE_CNT_PER_PG - 1)
-
-#define RCFW_MAX_OUTSTANDING_CMD	BNXT_QPLIB_CMDQE_MAX_CNT
-#define RCFW_MAX_COOKIE_VALUE		0x7FFF
-#define RCFW_CMD_IS_BLOCKING		0x8000
-#define RCFW_BLOCKED_CMD_WAIT_COUNT	0x4E20
-
 /* Cmdq contains a fix number of a 16-Byte slots */
 struct bnxt_qplib_cmdqe {
 	u8		data[16];
 };
 
-static inline u32 get_cmdq_pg(u32 val)
+/* CMDQ elements */
+#define BNXT_QPLIB_CMDQE_MAX_CNT_256	256
+#define BNXT_QPLIB_CMDQE_MAX_CNT_8192	8192
+#define BNXT_QPLIB_CMDQE_UNITS		sizeof(struct bnxt_qplib_cmdqe)
+#define BNXT_QPLIB_CMDQE_BYTES(depth)	((depth) * BNXT_QPLIB_CMDQE_UNITS)
+
+static inline u32 bnxt_qplib_cmdqe_npages(u32 depth)
 {
-	return (val & ~MAX_CMDQ_IDX_PER_PG) / BNXT_QPLIB_CMDQE_CNT_PER_PG;
+	u32 npages;
+
+	npages = BNXT_QPLIB_CMDQE_BYTES(depth) / PAGE_SIZE;
+	if (BNXT_QPLIB_CMDQE_BYTES(depth) % PAGE_SIZE)
+		npages++;
+	return npages;
 }
 
-static inline u32 get_cmdq_idx(u32 val)
+static inline u32 bnxt_qplib_cmdqe_page_size(u32 depth)
 {
-	return val & MAX_CMDQ_IDX_PER_PG;
+	return (bnxt_qplib_cmdqe_npages(depth) * PAGE_SIZE);
+}
+
+static inline u32 bnxt_qplib_cmdqe_cnt_per_pg(u32 depth)
+{
+	return (bnxt_qplib_cmdqe_page_size(depth) /
+		 BNXT_QPLIB_CMDQE_UNITS);
+}
+
+/* Convert cmd_size from bytes to the number of CMDQE slots */
+static inline void bnxt_qplib_set_cmd_slots(struct cmdq_base *req)
+{
+	req->cmd_size = (req->cmd_size + BNXT_QPLIB_CMDQE_UNITS - 1) /
+			 BNXT_QPLIB_CMDQE_UNITS;
+}
+
+#define MAX_CMDQ_IDX(depth)		((depth) - 1)
+
+static inline u32 bnxt_qplib_max_cmdq_idx_per_pg(u32 depth)
+{
+	return (bnxt_qplib_cmdqe_cnt_per_pg(depth) - 1);
+}
+
+#define RCFW_MAX_COOKIE_VALUE		0x7FFF
+#define RCFW_CMD_IS_BLOCKING		0x8000
+#define RCFW_BLOCKED_CMD_WAIT_COUNT	0x4E20
+
+#define HWRM_VERSION_RCFW_CMDQ_DEPTH_CHECK 0x1000900020011ULL
+
+static inline u32 get_cmdq_pg(u32 val, u32 depth)
+{
+	return (val & ~(bnxt_qplib_max_cmdq_idx_per_pg(depth))) /
+		(bnxt_qplib_cmdqe_cnt_per_pg(depth));
+}
+
+static inline u32 get_cmdq_idx(u32 val, u32 depth)
+{
+	return val & (bnxt_qplib_max_cmdq_idx_per_pg(depth));
 }
 
 /* Crsq buf is 1024-Byte */
@@ -129,10 +162,46 @@
 #define CREQ_DB_CP_FLAGS		(CREQ_DB_KEY_CP |	\
 					 CREQ_DB_IDX_VALID |	\
 					 CREQ_DB_IRQ_DIS)
-#define CREQ_DB_REARM(db, raw_cons, cp_bit)			\
-	writel(CREQ_DB_CP_FLAGS_REARM | ((raw_cons) & ((cp_bit) - 1)), db)
-#define CREQ_DB(db, raw_cons, cp_bit)				\
-	writel(CREQ_DB_CP_FLAGS | ((raw_cons) & ((cp_bit) - 1)), db)
+
+static inline void bnxt_qplib_ring_creq_db64(void __iomem *db, u32 index,
+					     u32 xid, bool arm)
+{
+	u64 val = 0;
+
+	val = xid & DBC_DBC_XID_MASK;
+	val |= DBC_DBC_PATH_ROCE;
+	val |= arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ;
+	val <<= 32;
+	val |= index & DBC_DBC_INDEX_MASK;
+
+	writeq(val, db);
+}
+
+static inline void bnxt_qplib_ring_creq_db_rearm(void __iomem *db, u32 raw_cons,
+						 u32 max_elements, u32 xid,
+						 bool gen_p5)
+{
+	u32 index = raw_cons & (max_elements - 1);
+
+	if (gen_p5)
+		bnxt_qplib_ring_creq_db64(db, index, xid, true);
+	else
+		writel(CREQ_DB_CP_FLAGS_REARM | (index & DBC_DBC32_XID_MASK),
+		       db);
+}
+
+static inline void bnxt_qplib_ring_creq_db(void __iomem *db, u32 raw_cons,
+					   u32 max_elements, u32 xid,
+					   bool gen_p5)
+{
+	u32 index = raw_cons & (max_elements - 1);
+
+	if (gen_p5)
+		bnxt_qplib_ring_creq_db64(db, index, xid, true);
+	else
+		writel(CREQ_DB_CP_FLAGS | (index & DBC_DBC32_XID_MASK),
+		       db);
+}
 
 #define CREQ_ENTRY_POLL_BUDGET		0x100
 
@@ -154,9 +223,12 @@
 	void *qp_handle;        /* ptr to qplib_qp */
 };
 
+#define BNXT_QPLIB_OOS_COUNT_MASK 0xFFFFFFFF
+
 /* RCFW Communication Channels */
 struct bnxt_qplib_rcfw {
 	struct pci_dev		*pdev;
+	struct bnxt_qplib_res	*res;
 	int			vector;
 	struct tasklet_struct	worker;
 	bool			requested;
@@ -190,11 +262,16 @@
 	struct bnxt_qplib_crsq	*crsqe_tbl;
 	int qp_tbl_size;
 	struct bnxt_qplib_qp_node *qp_tbl;
+	u64 oos_prev;
+	u32 init_oos_stats;
+	u32 cmdq_depth;
 };
 
 void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw);
 int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
-				  struct bnxt_qplib_rcfw *rcfw, int qp_tbl_sz);
+				  struct bnxt_qplib_rcfw *rcfw,
+				  struct bnxt_qplib_ctx *ctx,
+				  int qp_tbl_sz);
 void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill);
 void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw);
 int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector,
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c
index 539a5d4..bdbde8e 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c
@@ -36,6 +36,8 @@
  * Description: QPLib resource manager
  */
 
+#define dev_fmt(fmt) "QPLIB: " fmt
+
 #include <linux/spinlock.h>
 #include <linux/pci.h>
 #include <linux/interrupt.h>
@@ -68,8 +70,7 @@
 						  pbl->pg_map_arr[i]);
 			else
 				dev_warn(&pdev->dev,
-					 "QPLIB: PBL free pg_arr[%d] empty?!",
-					 i);
+					 "PBL free pg_arr[%d] empty?!\n", i);
 			pbl->pg_arr[i] = NULL;
 		}
 	}
@@ -82,9 +83,10 @@
 }
 
 static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
-		       struct scatterlist *sghead, u32 pages, u32 pg_size)
+		       struct scatterlist *sghead, u32 pages,
+		       u32 nmaps, u32 pg_size)
 {
-	struct scatterlist *sg;
+	struct sg_dma_page_iter sg_iter;
 	bool is_umem = false;
 	int i;
 
@@ -104,10 +106,10 @@
 
 	if (!sghead) {
 		for (i = 0; i < pages; i++) {
-			pbl->pg_arr[i] = dma_zalloc_coherent(&pdev->dev,
-							     pbl->pg_size,
-							     &pbl->pg_map_arr[i],
-							     GFP_KERNEL);
+			pbl->pg_arr[i] = dma_alloc_coherent(&pdev->dev,
+							    pbl->pg_size,
+							    &pbl->pg_map_arr[i],
+							    GFP_KERNEL);
 			if (!pbl->pg_arr[i])
 				goto fail;
 			pbl->pg_count++;
@@ -115,13 +117,11 @@
 	} else {
 		i = 0;
 		is_umem = true;
-		for_each_sg(sghead, sg, pages, i) {
-			pbl->pg_map_arr[i] = sg_dma_address(sg);
-			pbl->pg_arr[i] = sg_virt(sg);
-			if (!pbl->pg_arr[i])
-				goto fail;
-
+		for_each_sg_dma_page(sghead, &sg_iter, nmaps, 0) {
+			pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter);
+			pbl->pg_arr[i] = NULL;
 			pbl->pg_count++;
+			i++;
 		}
 	}
 
@@ -159,12 +159,13 @@
 
 /* All HWQs are power of 2 in size */
 int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq,
-			      struct scatterlist *sghead, int nmap,
+			      struct bnxt_qplib_sg_info *sg_info,
 			      u32 *elements, u32 element_size, u32 aux,
 			      u32 pg_size, enum bnxt_qplib_hwq_type hwq_type)
 {
-	u32 pages, slots, size, aux_pages = 0, aux_size = 0;
+	u32 pages, maps, slots, size, aux_pages = 0, aux_size = 0;
 	dma_addr_t *src_phys_ptr, **dst_virt_ptr;
+	struct scatterlist *sghead = NULL;
 	int i, rc;
 
 	hwq->level = PBL_LVL_MAX;
@@ -178,6 +179,9 @@
 	}
 	size = roundup_pow_of_two(element_size);
 
+	if (sg_info)
+		sghead = sg_info->sglist;
+
 	if (!sghead) {
 		hwq->is_user = false;
 		pages = (slots * size) / pg_size + aux_pages;
@@ -185,17 +189,20 @@
 			pages++;
 		if (!pages)
 			return -EINVAL;
+		maps = 0;
 	} else {
 		hwq->is_user = true;
-		pages = nmap;
+		pages = sg_info->npages;
+		maps = sg_info->nmap;
 	}
 
 	/* Alloc the 1st memory block; can be a PDL/PTL/PBL */
 	if (sghead && (pages == MAX_PBL_LVL_0_PGS))
 		rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_0], sghead,
-				 pages, pg_size);
+				 pages, maps, pg_size);
 	else
-		rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_0], NULL, 1, pg_size);
+		rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_0], NULL,
+				 1, 0, pg_size);
 	if (rc)
 		goto fail;
 
@@ -205,7 +212,8 @@
 		if (pages > MAX_PBL_LVL_1_PGS) {
 			/* 2 levels of indirection */
 			rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_1], NULL,
-					 MAX_PBL_LVL_1_PGS_FOR_LVL_2, pg_size);
+					 MAX_PBL_LVL_1_PGS_FOR_LVL_2,
+					 0, pg_size);
 			if (rc)
 				goto fail;
 			/* Fill in lvl0 PBL */
@@ -218,7 +226,7 @@
 			hwq->level = PBL_LVL_1;
 
 			rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_2], sghead,
-					 pages, pg_size);
+					 pages, maps, pg_size);
 			if (rc)
 				goto fail;
 
@@ -247,7 +255,7 @@
 
 			/* 1 level of indirection */
 			rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_1], sghead,
-					 pages, pg_size);
+					 pages, maps, pg_size);
 			if (rc)
 				goto fail;
 			/* Fill in lvl0 PBL */
@@ -329,18 +337,18 @@
  */
 int bnxt_qplib_alloc_ctx(struct pci_dev *pdev,
 			 struct bnxt_qplib_ctx *ctx,
-			 bool virt_fn)
+			 bool virt_fn, bool is_p5)
 {
 	int i, j, k, rc = 0;
 	int fnz_idx = -1;
 	__le64 **pbl_ptr;
 
-	if (virt_fn)
+	if (virt_fn || is_p5)
 		goto stats_alloc;
 
 	/* QPC Tables */
 	ctx->qpc_tbl.max_elements = ctx->qpc_count;
-	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->qpc_tbl, NULL, 0,
+	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->qpc_tbl, NULL,
 				       &ctx->qpc_tbl.max_elements,
 				       BNXT_QPLIB_MAX_QP_CTX_ENTRY_SIZE, 0,
 				       PAGE_SIZE, HWQ_TYPE_CTX);
@@ -349,7 +357,7 @@
 
 	/* MRW Tables */
 	ctx->mrw_tbl.max_elements = ctx->mrw_count;
-	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->mrw_tbl, NULL, 0,
+	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->mrw_tbl, NULL,
 				       &ctx->mrw_tbl.max_elements,
 				       BNXT_QPLIB_MAX_MRW_CTX_ENTRY_SIZE, 0,
 				       PAGE_SIZE, HWQ_TYPE_CTX);
@@ -358,7 +366,7 @@
 
 	/* SRQ Tables */
 	ctx->srqc_tbl.max_elements = ctx->srqc_count;
-	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->srqc_tbl, NULL, 0,
+	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->srqc_tbl, NULL,
 				       &ctx->srqc_tbl.max_elements,
 				       BNXT_QPLIB_MAX_SRQ_CTX_ENTRY_SIZE, 0,
 				       PAGE_SIZE, HWQ_TYPE_CTX);
@@ -367,7 +375,7 @@
 
 	/* CQ Tables */
 	ctx->cq_tbl.max_elements = ctx->cq_count;
-	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->cq_tbl, NULL, 0,
+	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->cq_tbl, NULL,
 				       &ctx->cq_tbl.max_elements,
 				       BNXT_QPLIB_MAX_CQ_CTX_ENTRY_SIZE, 0,
 				       PAGE_SIZE, HWQ_TYPE_CTX);
@@ -376,7 +384,7 @@
 
 	/* TQM Buffer */
 	ctx->tqm_pde.max_elements = 512;
-	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tqm_pde, NULL, 0,
+	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tqm_pde, NULL,
 				       &ctx->tqm_pde.max_elements, sizeof(u64),
 				       0, PAGE_SIZE, HWQ_TYPE_CTX);
 	if (rc)
@@ -387,7 +395,7 @@
 			continue;
 		ctx->tqm_tbl[i].max_elements = ctx->qpc_count *
 					       ctx->tqm_count[i];
-		rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tqm_tbl[i], NULL, 0,
+		rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tqm_tbl[i], NULL,
 					       &ctx->tqm_tbl[i].max_elements, 1,
 					       0, PAGE_SIZE, HWQ_TYPE_CTX);
 		if (rc)
@@ -425,7 +433,7 @@
 
 	/* TIM Buffer */
 	ctx->tim_tbl.max_elements = ctx->qpc_count * 16;
-	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tim_tbl, NULL, 0,
+	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tim_tbl, NULL,
 				       &ctx->tim_tbl.max_elements, 1,
 				       0, PAGE_SIZE, HWQ_TYPE_CTX);
 	if (rc)
@@ -480,7 +488,7 @@
 				     struct bnxt_qplib_sgid_tbl *sgid_tbl,
 				     u16 max)
 {
-	sgid_tbl->tbl = kcalloc(max, sizeof(struct bnxt_qplib_gid), GFP_KERNEL);
+	sgid_tbl->tbl = kcalloc(max, sizeof(*sgid_tbl->tbl), GFP_KERNEL);
 	if (!sgid_tbl->tbl)
 		return -ENOMEM;
 
@@ -518,9 +526,10 @@
 	for (i = 0; i < sgid_tbl->max; i++) {
 		if (memcmp(&sgid_tbl->tbl[i], &bnxt_qplib_gid_zero,
 			   sizeof(bnxt_qplib_gid_zero)))
-			bnxt_qplib_del_sgid(sgid_tbl, &sgid_tbl->tbl[i], true);
+			bnxt_qplib_del_sgid(sgid_tbl, &sgid_tbl->tbl[i].gid,
+					    sgid_tbl->tbl[i].vlan_id, true);
 	}
-	memset(sgid_tbl->tbl, 0, sizeof(struct bnxt_qplib_gid) * sgid_tbl->max);
+	memset(sgid_tbl->tbl, 0, sizeof(*sgid_tbl->tbl) * sgid_tbl->max);
 	memset(sgid_tbl->hw_id, -1, sizeof(u16) * sgid_tbl->max);
 	memset(sgid_tbl->vlan, 0, sizeof(u8) * sgid_tbl->max);
 	sgid_tbl->active = 0;
@@ -529,7 +538,11 @@
 static void bnxt_qplib_init_sgid_tbl(struct bnxt_qplib_sgid_tbl *sgid_tbl,
 				     struct net_device *netdev)
 {
-	memset(sgid_tbl->tbl, 0, sizeof(struct bnxt_qplib_gid) * sgid_tbl->max);
+	u32 i;
+
+	for (i = 0; i < sgid_tbl->max; i++)
+		sgid_tbl->tbl[i].vlan_id = 0xffff;
+
 	memset(sgid_tbl->hw_id, -1, sizeof(u16) * sgid_tbl->max);
 }
 
@@ -537,7 +550,7 @@
 				     struct bnxt_qplib_pkey_tbl *pkey_tbl)
 {
 	if (!pkey_tbl->tbl)
-		dev_dbg(&res->pdev->dev, "QPLIB: PKEY tbl not present");
+		dev_dbg(&res->pdev->dev, "PKEY tbl not present\n");
 	else
 		kfree(pkey_tbl->tbl);
 
@@ -578,7 +591,7 @@
 			  struct bnxt_qplib_pd *pd)
 {
 	if (test_and_set_bit(pd->id, pdt->tbl)) {
-		dev_warn(&res->pdev->dev, "Freeing an unused PD? pdn = %d",
+		dev_warn(&res->pdev->dev, "Freeing an unused PD? pdn = %d\n",
 			 pd->id);
 		return -EINVAL;
 	}
@@ -639,11 +652,11 @@
 			   struct bnxt_qplib_dpi     *dpi)
 {
 	if (dpi->dpi >= dpit->max) {
-		dev_warn(&res->pdev->dev, "Invalid DPI? dpi = %d", dpi->dpi);
+		dev_warn(&res->pdev->dev, "Invalid DPI? dpi = %d\n", dpi->dpi);
 		return -EINVAL;
 	}
 	if (test_and_set_bit(dpi->dpi, dpit->tbl)) {
-		dev_warn(&res->pdev->dev, "Freeing an unused DPI? dpi = %d",
+		dev_warn(&res->pdev->dev, "Freeing an unused DPI? dpi = %d\n",
 			 dpi->dpi);
 		return -EINVAL;
 	}
@@ -673,22 +686,21 @@
 	u32 dbr_len, bytes;
 
 	if (dpit->dbr_bar_reg_iomem) {
-		dev_err(&res->pdev->dev,
-			"QPLIB: DBR BAR region %d already mapped", dbr_bar_reg);
+		dev_err(&res->pdev->dev, "DBR BAR region %d already mapped\n",
+			dbr_bar_reg);
 		return -EALREADY;
 	}
 
 	bar_reg_base = pci_resource_start(res->pdev, dbr_bar_reg);
 	if (!bar_reg_base) {
-		dev_err(&res->pdev->dev,
-			"QPLIB: BAR region %d resc start failed", dbr_bar_reg);
+		dev_err(&res->pdev->dev, "BAR region %d resc start failed\n",
+			dbr_bar_reg);
 		return -ENOMEM;
 	}
 
 	dbr_len = pci_resource_len(res->pdev, dbr_bar_reg) - dbr_offset;
 	if (!dbr_len || ((dbr_len & (PAGE_SIZE - 1)) != 0)) {
-		dev_err(&res->pdev->dev, "QPLIB: Invalid DBR length %d",
-			dbr_len);
+		dev_err(&res->pdev->dev, "Invalid DBR length %d\n", dbr_len);
 		return -ENOMEM;
 	}
 
@@ -696,8 +708,7 @@
 						  dbr_len);
 	if (!dpit->dbr_bar_reg_iomem) {
 		dev_err(&res->pdev->dev,
-			"QPLIB: FP: DBR BAR region %d mapping failed",
-			dbr_bar_reg);
+			"FP: DBR BAR region %d mapping failed\n", dbr_bar_reg);
 		return -ENOMEM;
 	}
 
@@ -763,11 +774,15 @@
 {
 	memset(stats, 0, sizeof(*stats));
 	stats->fw_id = -1;
-	stats->size = sizeof(struct ctx_hw_stats);
+	/* 128-byte aligned context memory is required only for 57500.
+	 * However, making this unconditional does not harm previous
+	 * generations.
+	 */
+	stats->size = ALIGN(sizeof(struct ctx_hw_stats), 128);
 	stats->dma = dma_alloc_coherent(&pdev->dev, stats->size,
 					&stats->dma_map, GFP_KERNEL);
 	if (!stats->dma) {
-		dev_err(&pdev->dev, "QPLIB: Stats DMA allocation failed");
+		dev_err(&pdev->dev, "Stats DMA allocation failed\n");
 		return -ENOMEM;
 	}
 	return 0;
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h
index 2e5c052..fbda11a 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h
@@ -111,7 +111,7 @@
 };
 
 struct bnxt_qplib_sgid_tbl {
-	struct bnxt_qplib_gid		*tbl;
+	struct bnxt_qplib_gid_info	*tbl;
 	u16				*hw_id;
 	u16				max;
 	u16				active;
@@ -177,14 +177,23 @@
 	struct bnxt_qplib_hwq		tqm_tbl[MAX_TQM_ALLOC_REQ];
 	struct bnxt_qplib_stats		stats;
 	struct bnxt_qplib_vf_res	vf_res;
+	u64				hwrm_intf_ver;
 };
 
+struct bnxt_qplib_chip_ctx {
+	u16	chip_num;
+	u8	chip_rev;
+	u8	chip_metal;
+};
+
+#define CHIP_NUM_57500          0x1750
+
 struct bnxt_qplib_res {
 	struct pci_dev			*pdev;
+	struct bnxt_qplib_chip_ctx	*cctx;
 	struct net_device		*netdev;
 
 	struct bnxt_qplib_rcfw		*rcfw;
-
 	struct bnxt_qplib_pd_tbl	pd_tbl;
 	struct bnxt_qplib_sgid_tbl	sgid_tbl;
 	struct bnxt_qplib_pkey_tbl	pkey_tbl;
@@ -192,6 +201,30 @@
 	bool				prio;
 };
 
+static inline bool bnxt_qplib_is_chip_gen_p5(struct bnxt_qplib_chip_ctx *cctx)
+{
+	return (cctx->chip_num == CHIP_NUM_57500);
+}
+
+static inline u8 bnxt_qplib_get_hwq_type(struct bnxt_qplib_res *res)
+{
+	return bnxt_qplib_is_chip_gen_p5(res->cctx) ?
+					HWQ_TYPE_QUEUE : HWQ_TYPE_L2_CMPL;
+}
+
+static inline u8 bnxt_qplib_get_ring_type(struct bnxt_qplib_chip_ctx *cctx)
+{
+	return bnxt_qplib_is_chip_gen_p5(cctx) ?
+	       RING_ALLOC_REQ_RING_TYPE_NQ :
+	       RING_ALLOC_REQ_RING_TYPE_ROCE_CMPL;
+}
+
+struct bnxt_qplib_sg_info {
+	struct scatterlist		*sglist;
+	u32				nmap;
+	u32				npages;
+};
+
 #define to_bnxt_qplib(ptr, type, member)	\
 	container_of(ptr, type, member)
 
@@ -200,7 +233,7 @@
 
 void bnxt_qplib_free_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq);
 int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq,
-			      struct scatterlist *sl, int nmap, u32 *elements,
+			      struct bnxt_qplib_sg_info *sg_info, u32 *elements,
 			      u32 elements_per_page, u32 aux, u32 pg_size,
 			      enum bnxt_qplib_hwq_type hwq_type);
 void bnxt_qplib_get_guid(u8 *dev_addr, u8 *guid);
@@ -225,5 +258,5 @@
 			 struct bnxt_qplib_ctx *ctx);
 int bnxt_qplib_alloc_ctx(struct pci_dev *pdev,
 			 struct bnxt_qplib_ctx *ctx,
-			 bool virt_fn);
+			 bool virt_fn, bool is_p5);
 #endif /* __BNXT_QPLIB_RES_H__ */
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
index 4097f3f..40296b9 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
@@ -36,6 +36,8 @@
  * Description: Slow Path Operators
  */
 
+#define dev_fmt(fmt) "QPLIB: " fmt
+
 #include <linux/interrupt.h>
 #include <linux/spinlock.h>
 #include <linux/sched.h>
@@ -89,7 +91,7 @@
 	sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb));
 	if (!sbuf) {
 		dev_err(&rcfw->pdev->dev,
-			"QPLIB: SP: QUERY_FUNC alloc side buffer failed");
+			"SP: QUERY_FUNC alloc side buffer failed\n");
 		return -ENOMEM;
 	}
 
@@ -117,7 +119,8 @@
 	 * reporting the max number
 	 */
 	attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS;
-	attr->max_qp_sges = sb->max_sge;
+	attr->max_qp_sges = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx) ?
+			    6 : sb->max_sge;
 	attr->max_cq = le32_to_cpu(sb->max_cq);
 	attr->max_cq_wqes = le32_to_cpu(sb->max_cqe);
 	attr->max_cq_sges = attr->max_qp_sges;
@@ -135,8 +138,16 @@
 	attr->max_srq = le16_to_cpu(sb->max_srq);
 	attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1;
 	attr->max_srq_sges = sb->max_srq_sge;
-	/* Bono only reports 1 PKEY for now, but it can support > 1 */
 	attr->max_pkey = le32_to_cpu(sb->max_pkeys);
+	/*
+	 * Some versions of FW report more than 0xFFFF.
+	 * Restrict it for now to 0xFFFF to avoid
+	 * reporting a truncated value.
+	 */
+	if (attr->max_pkey > 0xFFFF) {
+		/* ib_port_attr::pkey_tbl_len is u16 */
+		attr->max_pkey = 0xFFFF;
+	}
 
 	attr->max_inline_data = le32_to_cpu(sb->max_inline_data);
 	attr->l2_db_size = (sb->l2_db_space_size + 1) *
@@ -186,8 +197,7 @@
 					  (void *)&resp,
 					  NULL, 0);
 	if (rc) {
-		dev_err(&res->pdev->dev,
-			"QPLIB: Failed to set function resources");
+		dev_err(&res->pdev->dev, "Failed to set function resources\n");
 	}
 	return rc;
 }
@@ -199,16 +209,16 @@
 {
 	if (index >= sgid_tbl->max) {
 		dev_err(&res->pdev->dev,
-			"QPLIB: Index %d exceeded SGID table max (%d)",
+			"Index %d exceeded SGID table max (%d)\n",
 			index, sgid_tbl->max);
 		return -EINVAL;
 	}
-	memcpy(gid, &sgid_tbl->tbl[index], sizeof(*gid));
+	memcpy(gid, &sgid_tbl->tbl[index].gid, sizeof(*gid));
 	return 0;
 }
 
 int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
-			struct bnxt_qplib_gid *gid, bool update)
+			struct bnxt_qplib_gid *gid, u16 vlan_id, bool update)
 {
 	struct bnxt_qplib_res *res = to_bnxt_qplib(sgid_tbl,
 						   struct bnxt_qplib_res,
@@ -217,21 +227,21 @@
 	int index;
 
 	if (!sgid_tbl) {
-		dev_err(&res->pdev->dev, "QPLIB: SGID table not allocated");
+		dev_err(&res->pdev->dev, "SGID table not allocated\n");
 		return -EINVAL;
 	}
 	/* Do we need a sgid_lock here? */
 	if (!sgid_tbl->active) {
-		dev_err(&res->pdev->dev,
-			"QPLIB: SGID table has no active entries");
+		dev_err(&res->pdev->dev, "SGID table has no active entries\n");
 		return -ENOMEM;
 	}
 	for (index = 0; index < sgid_tbl->max; index++) {
-		if (!memcmp(&sgid_tbl->tbl[index], gid, sizeof(*gid)))
+		if (!memcmp(&sgid_tbl->tbl[index].gid, gid, sizeof(*gid)) &&
+		    vlan_id == sgid_tbl->tbl[index].vlan_id)
 			break;
 	}
 	if (index == sgid_tbl->max) {
-		dev_warn(&res->pdev->dev, "GID not found in the SGID table");
+		dev_warn(&res->pdev->dev, "GID not found in the SGID table\n");
 		return 0;
 	}
 	/* Remove GID from the SGID table */
@@ -244,7 +254,7 @@
 		RCFW_CMD_PREP(req, DELETE_GID, cmd_flags);
 		if (sgid_tbl->hw_id[index] == 0xFFFF) {
 			dev_err(&res->pdev->dev,
-				"QPLIB: GID entry contains an invalid HW id");
+				"GID entry contains an invalid HW id\n");
 			return -EINVAL;
 		}
 		req.gid_index = cpu_to_le16(sgid_tbl->hw_id[index]);
@@ -253,12 +263,13 @@
 		if (rc)
 			return rc;
 	}
-	memcpy(&sgid_tbl->tbl[index], &bnxt_qplib_gid_zero,
+	memcpy(&sgid_tbl->tbl[index].gid, &bnxt_qplib_gid_zero,
 	       sizeof(bnxt_qplib_gid_zero));
+	sgid_tbl->tbl[index].vlan_id = 0xFFFF;
 	sgid_tbl->vlan[index] = 0;
 	sgid_tbl->active--;
 	dev_dbg(&res->pdev->dev,
-		"QPLIB: SGID deleted hw_id[0x%x] = 0x%x active = 0x%x",
+		"SGID deleted hw_id[0x%x] = 0x%x active = 0x%x\n",
 		 index, sgid_tbl->hw_id[index], sgid_tbl->active);
 	sgid_tbl->hw_id[index] = (u16)-1;
 
@@ -277,20 +288,20 @@
 	int i, free_idx;
 
 	if (!sgid_tbl) {
-		dev_err(&res->pdev->dev, "QPLIB: SGID table not allocated");
+		dev_err(&res->pdev->dev, "SGID table not allocated\n");
 		return -EINVAL;
 	}
 	/* Do we need a sgid_lock here? */
 	if (sgid_tbl->active == sgid_tbl->max) {
-		dev_err(&res->pdev->dev, "QPLIB: SGID table is full");
+		dev_err(&res->pdev->dev, "SGID table is full\n");
 		return -ENOMEM;
 	}
 	free_idx = sgid_tbl->max;
 	for (i = 0; i < sgid_tbl->max; i++) {
-		if (!memcmp(&sgid_tbl->tbl[i], gid, sizeof(*gid))) {
+		if (!memcmp(&sgid_tbl->tbl[i], gid, sizeof(*gid)) &&
+		    sgid_tbl->tbl[i].vlan_id == vlan_id) {
 			dev_dbg(&res->pdev->dev,
-				"QPLIB: SGID entry already exist in entry %d!",
-				i);
+				"SGID entry already exist in entry %d!\n", i);
 			*index = i;
 			return -EALREADY;
 		} else if (!memcmp(&sgid_tbl->tbl[i], &bnxt_qplib_gid_zero,
@@ -301,7 +312,7 @@
 	}
 	if (free_idx == sgid_tbl->max) {
 		dev_err(&res->pdev->dev,
-			"QPLIB: SGID table is FULL but count is not MAX??");
+			"SGID table is FULL but count is not MAX??\n");
 		return -ENOMEM;
 	}
 	if (update) {
@@ -343,12 +354,13 @@
 	}
 	/* Add GID to the sgid_tbl */
 	memcpy(&sgid_tbl->tbl[free_idx], gid, sizeof(*gid));
+	sgid_tbl->tbl[free_idx].vlan_id = vlan_id;
 	sgid_tbl->active++;
 	if (vlan_id != 0xFFFF)
 		sgid_tbl->vlan[free_idx] = 1;
 
 	dev_dbg(&res->pdev->dev,
-		"QPLIB: SGID added hw_id[0x%x] = 0x%x active = 0x%x",
+		"SGID added hw_id[0x%x] = 0x%x active = 0x%x\n",
 		 free_idx, sgid_tbl->hw_id[free_idx], sgid_tbl->active);
 
 	*index = free_idx;
@@ -404,7 +416,7 @@
 	}
 	if (index >= pkey_tbl->max) {
 		dev_err(&res->pdev->dev,
-			"QPLIB: Index %d exceeded PKEY table max (%d)",
+			"Index %d exceeded PKEY table max (%d)\n",
 			index, pkey_tbl->max);
 		return -EINVAL;
 	}
@@ -419,14 +431,13 @@
 	int i, rc = 0;
 
 	if (!pkey_tbl) {
-		dev_err(&res->pdev->dev, "QPLIB: PKEY table not allocated");
+		dev_err(&res->pdev->dev, "PKEY table not allocated\n");
 		return -EINVAL;
 	}
 
 	/* Do we need a pkey_lock here? */
 	if (!pkey_tbl->active) {
-		dev_err(&res->pdev->dev,
-			"QPLIB: PKEY table has no active entries");
+		dev_err(&res->pdev->dev, "PKEY table has no active entries\n");
 		return -ENOMEM;
 	}
 	for (i = 0; i < pkey_tbl->max; i++) {
@@ -435,8 +446,7 @@
 	}
 	if (i == pkey_tbl->max) {
 		dev_err(&res->pdev->dev,
-			"QPLIB: PKEY 0x%04x not found in the pkey table",
-			*pkey);
+			"PKEY 0x%04x not found in the pkey table\n", *pkey);
 		return -ENOMEM;
 	}
 	memset(&pkey_tbl->tbl[i], 0, sizeof(*pkey));
@@ -453,13 +463,13 @@
 	int i, free_idx, rc = 0;
 
 	if (!pkey_tbl) {
-		dev_err(&res->pdev->dev, "QPLIB: PKEY table not allocated");
+		dev_err(&res->pdev->dev, "PKEY table not allocated\n");
 		return -EINVAL;
 	}
 
 	/* Do we need a pkey_lock here? */
 	if (pkey_tbl->active == pkey_tbl->max) {
-		dev_err(&res->pdev->dev, "QPLIB: PKEY table is full");
+		dev_err(&res->pdev->dev, "PKEY table is full\n");
 		return -ENOMEM;
 	}
 	free_idx = pkey_tbl->max;
@@ -471,7 +481,7 @@
 	}
 	if (free_idx == pkey_tbl->max) {
 		dev_err(&res->pdev->dev,
-			"QPLIB: PKEY table is FULL but count is not MAX??");
+			"PKEY table is FULL but count is not MAX??\n");
 		return -ENOMEM;
 	}
 	/* Add PKEY to the pkey_tbl */
@@ -483,7 +493,8 @@
 }
 
 /* AH */
-int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah)
+int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah,
+			 bool block)
 {
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
 	struct cmdq_create_ah req;
@@ -517,7 +528,7 @@
 	req.dest_mac[2] = cpu_to_le16(temp16[2]);
 
 	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp,
-					  NULL, 1);
+					  NULL, block);
 	if (rc)
 		return rc;
 
@@ -525,24 +536,21 @@
 	return 0;
 }
 
-int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah)
+void bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah,
+			   bool block)
 {
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
 	struct cmdq_destroy_ah req;
 	struct creq_destroy_ah_resp resp;
 	u16 cmd_flags = 0;
-	int rc;
 
 	/* Clean up the AH table in the device */
 	RCFW_CMD_PREP(req, DESTROY_AH, cmd_flags);
 
 	req.ah_cid = cpu_to_le32(ah->id);
 
-	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp,
-					  NULL, 1);
-	if (rc)
-		return rc;
-	return 0;
+	bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, NULL,
+				     block);
 }
 
 /* MRW */
@@ -555,8 +563,7 @@
 	int rc;
 
 	if (mrw->lkey == 0xFFFFFFFF) {
-		dev_info(&res->pdev->dev,
-			 "QPLIB: SP: Free a reserved lkey MRW");
+		dev_info(&res->pdev->dev, "SP: Free a reserved lkey MRW\n");
 		return 0;
 	}
 
@@ -666,9 +673,8 @@
 			pages++;
 
 		if (pages > MAX_PBL_LVL_1_PGS) {
-			dev_err(&res->pdev->dev, "QPLIB: SP: Reg MR pages ");
 			dev_err(&res->pdev->dev,
-				"requested (0x%x) exceeded max (0x%x)",
+				"SP: Reg MR pages requested (0x%x) exceeded max (0x%x)\n",
 				pages, MAX_PBL_LVL_1_PGS);
 			return -ENOMEM;
 		}
@@ -678,13 +684,13 @@
 
 		mr->hwq.max_elements = pages;
 		/* Use system PAGE_SIZE */
-		rc = bnxt_qplib_alloc_init_hwq(res->pdev, &mr->hwq, NULL, 0,
+		rc = bnxt_qplib_alloc_init_hwq(res->pdev, &mr->hwq, NULL,
 					       &mr->hwq.max_elements,
 					       PAGE_SIZE, 0, PAGE_SIZE,
 					       HWQ_TYPE_CTX);
 		if (rc) {
 			dev_err(&res->pdev->dev,
-				"SP: Reg MR memory allocation failed");
+				"SP: Reg MR memory allocation failed\n");
 			return -ENOMEM;
 		}
 		/* Write to the hwq */
@@ -748,7 +754,7 @@
 		return -ENOMEM;
 
 	frpl->hwq.max_elements = pages;
-	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &frpl->hwq, NULL, 0,
+	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &frpl->hwq, NULL,
 				       &frpl->hwq.max_elements, PAGE_SIZE, 0,
 				       PAGE_SIZE, HWQ_TYPE_CTX);
 	if (!rc)
@@ -775,9 +781,8 @@
 	req.cos0 = cpu_to_le16(cids[0]);
 	req.cos1 = cpu_to_le16(cids[1]);
 
-	bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, NULL,
-				     0);
-	return 0;
+	return bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp,
+						NULL, 0);
 }
 
 int bnxt_qplib_get_roce_stats(struct bnxt_qplib_rcfw *rcfw,
@@ -795,7 +800,7 @@
 	sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb));
 	if (!sbuf) {
 		dev_err(&rcfw->pdev->dev,
-			"QPLIB: SP: QUERY_ROCE_STATS alloc side buffer failed");
+			"SP: QUERY_ROCE_STATS alloc side buffer failed\n");
 		return -ENOMEM;
 	}
 
@@ -845,6 +850,16 @@
 	stats->res_srq_load_err = le64_to_cpu(sb->res_srq_load_err);
 	stats->res_tx_pci_err = le64_to_cpu(sb->res_tx_pci_err);
 	stats->res_rx_pci_err = le64_to_cpu(sb->res_rx_pci_err);
+	if (!rcfw->init_oos_stats) {
+		rcfw->oos_prev = le64_to_cpu(sb->res_oos_drop_count);
+		rcfw->init_oos_stats = 1;
+	} else {
+		stats->res_oos_drop_count +=
+				(le64_to_cpu(sb->res_oos_drop_count) -
+				 rcfw->oos_prev) & BNXT_QPLIB_OOS_COUNT_MASK;
+		rcfw->oos_prev = le64_to_cpu(sb->res_oos_drop_count);
+	}
+
 bail:
 	bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf);
 	return rc;
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h
index 9d3e8b9..13d9432 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h
@@ -84,6 +84,11 @@
 	u8				data[16];
 };
 
+struct bnxt_qplib_gid_info {
+	struct bnxt_qplib_gid gid;
+	u16 vlan_id;
+};
+
 struct bnxt_qplib_ah {
 	struct bnxt_qplib_gid		dgid;
 	struct bnxt_qplib_pd		*pd;
@@ -205,13 +210,23 @@
 	/* res_tx_pci_err is 64 b */
 	u64 res_rx_pci_err;
 	/* res_rx_pci_err is 64 b */
+	u64 res_oos_drop_count;
+	/* res_oos_drop_count */
+	u64     active_qp_count_p0;
+	/* port 0 active qps */
+	u64     active_qp_count_p1;
+	/* port 1 active qps */
+	u64     active_qp_count_p2;
+	/* port 2 active qps */
+	u64     active_qp_count_p3;
+	/* port 3 active qps */
 };
 
 int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res,
 			struct bnxt_qplib_sgid_tbl *sgid_tbl, int index,
 			struct bnxt_qplib_gid *gid);
 int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
-			struct bnxt_qplib_gid *gid, bool update);
+			struct bnxt_qplib_gid *gid, u16 vlan_id, bool update);
 int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
 			struct bnxt_qplib_gid *gid, u8 *mac, u16 vlan_id,
 			bool update, u32 *index);
@@ -231,8 +246,10 @@
 int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res,
 				  struct bnxt_qplib_rcfw *rcfw,
 				  struct bnxt_qplib_ctx *ctx);
-int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah);
-int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah);
+int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah,
+			 bool block);
+void bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah,
+			   bool block);
 int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res,
 			 struct bnxt_qplib_mrw *mrw);
 int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw,
diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h
index 3e5a4f7..e4b09e7 100644
--- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h
+++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h
@@ -49,11 +49,11 @@
 	#define CMPL_DOORBELL_IDX_SFT				    0
 	#define CMPL_DOORBELL_RESERVED_MASK			    0x3000000UL
 	#define CMPL_DOORBELL_RESERVED_SFT			    24
-	#define CMPL_DOORBELL_IDX_VALID			    0x4000000UL
+	#define CMPL_DOORBELL_IDX_VALID				    0x4000000UL
 	#define CMPL_DOORBELL_MASK				    0x8000000UL
 	#define CMPL_DOORBELL_KEY_MASK				    0xf0000000UL
 	#define CMPL_DOORBELL_KEY_SFT				    28
-	#define CMPL_DOORBELL_KEY_CMPL				   (0x2UL << 28)
+	#define CMPL_DOORBELL_KEY_CMPL				(0x2UL << 28)
 };
 
 /* Status Door Bell Format (4 bytes) */
@@ -71,46 +71,56 @@
 /* RoCE Host Structures */
 
 /* Doorbell Structures */
-/* 64b Doorbell Format (8 bytes) */
-struct dbr_dbr {
-	__le32 index;
-	#define DBR_DBR_INDEX_MASK				    0xfffffUL
-	#define DBR_DBR_INDEX_SFT				    0
-	#define DBR_DBR_RESERVED12_MASK			    0xfff00000UL
-	#define DBR_DBR_RESERVED12_SFT				    20
-	__le32 type_xid;
-	#define DBR_DBR_XID_MASK				    0xfffffUL
-	#define DBR_DBR_XID_SFT				    0
-	#define DBR_DBR_RESERVED8_MASK				    0xff00000UL
-	#define DBR_DBR_RESERVED8_SFT				    20
-	#define DBR_DBR_TYPE_MASK				    0xf0000000UL
-	#define DBR_DBR_TYPE_SFT				    28
-	#define DBR_DBR_TYPE_SQ				   (0x0UL << 28)
-	#define DBR_DBR_TYPE_RQ				   (0x1UL << 28)
-	#define DBR_DBR_TYPE_SRQ				   (0x2UL << 28)
-	#define DBR_DBR_TYPE_SRQ_ARM				   (0x3UL << 28)
-	#define DBR_DBR_TYPE_CQ				   (0x4UL << 28)
-	#define DBR_DBR_TYPE_CQ_ARMSE				   (0x5UL << 28)
-	#define DBR_DBR_TYPE_CQ_ARMALL				   (0x6UL << 28)
-	#define DBR_DBR_TYPE_CQ_ARMENA				   (0x7UL << 28)
-	#define DBR_DBR_TYPE_SRQ_ARMENA			   (0x8UL << 28)
-	#define DBR_DBR_TYPE_CQ_CUTOFF_ACK			   (0x9UL << 28)
-	#define DBR_DBR_TYPE_NULL				   (0xfUL << 28)
+/* dbc_dbc (size:64b/8B) */
+struct dbc_dbc {
+	__le32  index;
+	#define DBC_DBC_INDEX_MASK		0xffffffUL
+	#define DBC_DBC_INDEX_SFT		0
+	__le32  type_path_xid;
+	#define DBC_DBC_XID_MASK		0xfffffUL
+	#define DBC_DBC_XID_SFT			0
+	#define DBC_DBC_PATH_MASK		0x3000000UL
+	#define DBC_DBC_PATH_SFT		24
+	#define DBC_DBC_PATH_ROCE		(0x0UL << 24)
+	#define DBC_DBC_PATH_L2			(0x1UL << 24)
+	#define DBC_DBC_PATH_ENGINE		(0x2UL << 24)
+	#define DBC_DBC_PATH_LAST		DBC_DBC_PATH_ENGINE
+	#define DBC_DBC_DEBUG_TRACE		0x8000000UL
+	#define DBC_DBC_TYPE_MASK		0xf0000000UL
+	#define DBC_DBC_TYPE_SFT		28
+	#define DBC_DBC_TYPE_SQ			(0x0UL << 28)
+	#define DBC_DBC_TYPE_RQ			(0x1UL << 28)
+	#define DBC_DBC_TYPE_SRQ		(0x2UL << 28)
+	#define DBC_DBC_TYPE_SRQ_ARM		(0x3UL << 28)
+	#define DBC_DBC_TYPE_CQ			(0x4UL << 28)
+	#define DBC_DBC_TYPE_CQ_ARMSE		(0x5UL << 28)
+	#define DBC_DBC_TYPE_CQ_ARMALL		(0x6UL << 28)
+	#define DBC_DBC_TYPE_CQ_ARMENA		(0x7UL << 28)
+	#define DBC_DBC_TYPE_SRQ_ARMENA		(0x8UL << 28)
+	#define DBC_DBC_TYPE_CQ_CUTOFF_ACK	(0x9UL << 28)
+	#define DBC_DBC_TYPE_NQ			(0xaUL << 28)
+	#define DBC_DBC_TYPE_NQ_ARM		(0xbUL << 28)
+	#define DBC_DBC_TYPE_NULL		(0xfUL << 28)
+	#define DBC_DBC_TYPE_LAST		DBC_DBC_TYPE_NULL
 };
 
-/* 32b Doorbell Format (4 bytes) */
-struct dbr_dbr32 {
-	__le32 type_abs_incr_xid;
-	#define DBR_DBR32_XID_MASK				    0xfffffUL
-	#define DBR_DBR32_XID_SFT				    0
-	#define DBR_DBR32_RESERVED4_MASK			    0xf00000UL
-	#define DBR_DBR32_RESERVED4_SFT			    20
-	#define DBR_DBR32_INCR_MASK				    0xf000000UL
-	#define DBR_DBR32_INCR_SFT				    24
-	#define DBR_DBR32_ABS					    0x10000000UL
-	#define DBR_DBR32_TYPE_MASK				    0xe0000000UL
-	#define DBR_DBR32_TYPE_SFT				    29
-	#define DBR_DBR32_TYPE_SQ				   (0x0UL << 29)
+/* dbc_dbc32 (size:32b/4B) */
+struct dbc_dbc32 {
+	__le32  type_abs_incr_xid;
+	#define DBC_DBC32_XID_MASK		0xfffffUL
+	#define DBC_DBC32_XID_SFT		0
+	#define DBC_DBC32_PATH_MASK		0xc00000UL
+	#define DBC_DBC32_PATH_SFT		22
+	#define DBC_DBC32_PATH_ROCE		(0x0UL << 22)
+	#define DBC_DBC32_PATH_L2		(0x1UL << 22)
+	#define DBC_DBC32_PATH_LAST		DBC_DBC32_PATH_L2
+	#define DBC_DBC32_INCR_MASK		0xf000000UL
+	#define DBC_DBC32_INCR_SFT		24
+	#define DBC_DBC32_ABS			0x10000000UL
+	#define DBC_DBC32_TYPE_MASK		0xe0000000UL
+	#define DBC_DBC32_TYPE_SFT		29
+	#define DBC_DBC32_TYPE_SQ		(0x0UL << 29)
+	#define DBC_DBC32_TYPE_LAST		DBC_DBC32_TYPE_SQ
 };
 
 /* SQ WQE Structures */
@@ -149,7 +159,24 @@
 	#define SQ_PSN_SEARCH_NEXT_PSN_MASK			    0xffffffUL
 	#define SQ_PSN_SEARCH_NEXT_PSN_SFT			    0
 	#define SQ_PSN_SEARCH_FLAGS_MASK			    0xff000000UL
-	#define SQ_PSN_SEARCH_FLAGS_SFT			    24
+	#define SQ_PSN_SEARCH_FLAGS_SFT				    24
+};
+
+/* sq_psn_search_ext (size:128b/16B) */
+struct sq_psn_search_ext {
+	__le32  opcode_start_psn;
+	#define SQ_PSN_SEARCH_EXT_START_PSN_MASK		    0xffffffUL
+	#define SQ_PSN_SEARCH_EXT_START_PSN_SFT			    0
+	#define SQ_PSN_SEARCH_EXT_OPCODE_MASK			    0xff000000UL
+	#define SQ_PSN_SEARCH_EXT_OPCODE_SFT			    24
+	__le32  flags_next_psn;
+	#define SQ_PSN_SEARCH_EXT_NEXT_PSN_MASK			    0xffffffUL
+	#define SQ_PSN_SEARCH_EXT_NEXT_PSN_SFT			    0
+	#define SQ_PSN_SEARCH_EXT_FLAGS_MASK			    0xff000000UL
+	#define SQ_PSN_SEARCH_EXT_FLAGS_SFT			    24
+	__le16  start_slot_idx;
+	__le16  reserved16;
+	__le32  reserved32;
 };
 
 /* Send SQ WQE (40 bytes) */
@@ -505,22 +532,24 @@
 
 /* Responder UD CQE (32 bytes) */
 struct cq_res_ud {
-	__le32 length;
+	__le16 length;
 	#define CQ_RES_UD_LENGTH_MASK				    0x3fffUL
 	#define CQ_RES_UD_LENGTH_SFT				    0
-	#define CQ_RES_UD_RESERVED18_MASK			    0xffffc000UL
-	#define CQ_RES_UD_RESERVED18_SFT			    14
+	__le16 cfa_metadata;
+	#define CQ_RES_UD_CFA_METADATA_VID_MASK			0xfffUL
+	#define CQ_RES_UD_CFA_METADATA_VID_SFT			0
+	#define CQ_RES_UD_CFA_METADATA_DE			0x1000UL
+	#define CQ_RES_UD_CFA_METADATA_PRI_MASK			0xe000UL
+	#define CQ_RES_UD_CFA_METADATA_PRI_SFT			13
 	__le32 imm_data;
 	__le64 qp_handle;
 	__le16 src_mac[3];
 	__le16 src_qp_low;
 	u8 cqe_type_toggle;
-	#define CQ_RES_UD_TOGGLE				    0x1UL
-	#define CQ_RES_UD_CQE_TYPE_MASK			    0x1eUL
-	#define CQ_RES_UD_CQE_TYPE_SFT				    1
+	#define CQ_RES_UD_TOGGLE				   0x1UL
+	#define CQ_RES_UD_CQE_TYPE_MASK				   0x1eUL
+	#define CQ_RES_UD_CQE_TYPE_SFT				   1
 	#define CQ_RES_UD_CQE_TYPE_RES_UD			   (0x2UL << 1)
-	#define CQ_RES_UD_RESERVED3_MASK			    0xe0UL
-	#define CQ_RES_UD_RESERVED3_SFT			    5
 	u8 status;
 	#define CQ_RES_UD_STATUS_OK				   0x0UL
 	#define CQ_RES_UD_STATUS_LOCAL_ACCESS_ERROR		   0x1UL
@@ -536,18 +565,30 @@
 	#define CQ_RES_UD_FLAGS_SRQ_SRQ			   (0x1UL << 0)
 	#define CQ_RES_UD_FLAGS_SRQ_LAST    CQ_RES_UD_FLAGS_SRQ_SRQ
 	#define CQ_RES_UD_FLAGS_IMM				    0x2UL
-	#define CQ_RES_UD_FLAGS_ROCE_IP_VER_MASK		    0xcUL
-	#define CQ_RES_UD_FLAGS_ROCE_IP_VER_SFT		    2
-	#define CQ_RES_UD_FLAGS_ROCE_IP_VER_V1			   (0x0UL << 2)
-	#define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV4		   (0x2UL << 2)
-	#define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV6		   (0x3UL << 2)
+	#define CQ_RES_UD_FLAGS_UNUSED_MASK			0xcUL
+	#define CQ_RES_UD_FLAGS_UNUSED_SFT			2
+	#define CQ_RES_UD_FLAGS_ROCE_IP_VER_MASK		0x30UL
+	#define CQ_RES_UD_FLAGS_ROCE_IP_VER_SFT			4
+	#define CQ_RES_UD_FLAGS_ROCE_IP_VER_V1			(0x0UL << 4)
+	#define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV4		(0x2UL << 4)
+	#define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV6		(0x3UL << 4)
 	#define CQ_RES_UD_FLAGS_ROCE_IP_VER_LAST		\
 					CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV6
+	#define CQ_RES_UD_FLAGS_META_FORMAT_MASK		0x3c0UL
+	#define CQ_RES_UD_FLAGS_META_FORMAT_SFT			6
+	#define CQ_RES_UD_FLAGS_META_FORMAT_NONE		(0x0UL << 6)
+	#define CQ_RES_UD_FLAGS_META_FORMAT_VLAN		(0x1UL << 6)
+	#define CQ_RES_UD_FLAGS_META_FORMAT_TUNNEL_ID		(0x2UL << 6)
+	#define CQ_RES_UD_FLAGS_META_FORMAT_CHDR_DATA		(0x3UL << 6)
+	#define CQ_RES_UD_FLAGS_META_FORMAT_HDR_OFFSET		(0x4UL << 6)
+	#define CQ_RES_UD_FLAGS_META_FORMAT_LAST		\
+					CQ_RES_UD_FLAGS_META_FORMAT_HDR_OFFSET
+	#define CQ_RES_UD_FLAGS_EXT_META_FORMAT_MASK		0xc00UL
+	#define CQ_RES_UD_FLAGS_EXT_META_FORMAT_SFT		10
+
 	__le32 src_qp_high_srq_or_rq_wr_id;
 	#define CQ_RES_UD_SRQ_OR_RQ_WR_ID_MASK			    0xfffffUL
 	#define CQ_RES_UD_SRQ_OR_RQ_WR_ID_SFT			    0
-	#define CQ_RES_UD_RESERVED4_MASK			    0xf00000UL
-	#define CQ_RES_UD_RESERVED4_SFT			    20
 	#define CQ_RES_UD_SRC_QP_HIGH_MASK			    0xff000000UL
 	#define CQ_RES_UD_SRC_QP_HIGH_SFT			    24
 };
@@ -983,6 +1024,7 @@
 	#define CMDQ_CREATE_QP_TYPE_RC				   0x2UL
 	#define CMDQ_CREATE_QP_TYPE_UD				   0x4UL
 	#define CMDQ_CREATE_QP_TYPE_RAW_ETHERTYPE		   0x6UL
+	#define CMDQ_CREATE_QP_TYPE_GSI				   0x7UL
 	u8 sq_pg_size_sq_lvl;
 	#define CMDQ_CREATE_QP_SQ_LVL_MASK			    0xfUL
 	#define CMDQ_CREATE_QP_SQ_LVL_SFT			    0
@@ -2719,6 +2761,8 @@
 	__le16 max_srq;
 	__le32 max_gid;
 	__le32 tqm_alloc_reqs[12];
+	__le32 max_dpi;
+	__le32 reserved_32;
 };
 
 /* Set resources command response (16 bytes) */
@@ -2929,6 +2973,11 @@
 	__le64	res_srq_load_err;
 	__le64	res_tx_pci_err;
 	__le64	res_rx_pci_err;
+	__le64  res_oos_drop_count;
+	__le64  active_qp_count_p0;
+	__le64  active_qp_count_p1;
+	__le64  active_qp_count_p2;
+	__le64  active_qp_count_p3;
 };
 
 /* QP error notification event (16 bytes) */
diff --git a/drivers/infiniband/hw/cxgb3/Kconfig b/drivers/infiniband/hw/cxgb3/Kconfig
index a7b77cb..8c1a72b 100644
--- a/drivers/infiniband/hw/cxgb3/Kconfig
+++ b/drivers/infiniband/hw/cxgb3/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_CXGB3
 	tristate "Chelsio RDMA Driver"
 	depends on CHELSIO_T3
diff --git a/drivers/infiniband/hw/cxgb3/Makefile b/drivers/infiniband/hw/cxgb3/Makefile
index 66fe091..34bb86a 100644
--- a/drivers/infiniband/hw/cxgb3/Makefile
+++ b/drivers/infiniband/hw/cxgb3/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb3
+ccflags-y := -I $(srctree)/drivers/net/ethernet/chelsio/cxgb3
 
 obj-$(CONFIG_INFINIBAND_CXGB3) += iw_cxgb3.o
 
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c
index dcb4bba..95b22a6 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.c
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c
@@ -174,7 +174,6 @@
 		return -ENOMEM;
 	}
 	dma_unmap_addr_set(cq, mapping, cq->dma_addr);
-	memset(cq->queue, 0, size);
 	setup.id = cq->cqid;
 	setup.base_addr = (u64) (cq->dma_addr);
 	setup.size = 1UL << cq->size_log2;
@@ -187,20 +186,6 @@
 	return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
 }
 
-#ifdef notyet
-int cxio_resize_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
-{
-	struct rdma_cq_setup setup;
-	setup.id = cq->cqid;
-	setup.base_addr = (u64) (cq->dma_addr);
-	setup.size = 1UL << cq->size_log2;
-	setup.credits = setup.size;
-	setup.credit_thres = setup.size;	/* TBD: overflow recovery */
-	setup.ovfl_mode = 1;
-	return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
-}
-#endif
-
 static u32 get_qpid(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx)
 {
 	struct cxio_qpid_list *entry;
@@ -219,7 +204,7 @@
 		if (!qpid)
 			goto out;
 		for (i = qpid+1; i & rdev_p->qpmask; i++) {
-			entry = kmalloc(sizeof *entry, GFP_KERNEL);
+			entry = kmalloc(sizeof(*entry), GFP_KERNEL);
 			if (!entry)
 				break;
 			entry->qpid = i;
@@ -237,7 +222,7 @@
 {
 	struct cxio_qpid_list *entry;
 
-	entry = kmalloc(sizeof *entry, GFP_KERNEL);
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
 	if (!entry)
 		return;
 	pr_debug("%s qpid 0x%x\n", __func__, qpid);
@@ -292,12 +277,11 @@
 		goto err3;
 
 	wq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev),
-					     depth * sizeof(union t3_wr),
-					     &(wq->dma_addr), GFP_KERNEL);
+				       depth * sizeof(union t3_wr),
+				       &(wq->dma_addr), GFP_KERNEL);
 	if (!wq->queue)
 		goto err4;
 
-	memset(wq->queue, 0, depth * sizeof(union t3_wr));
 	dma_unmap_addr_set(wq, mapping, wq->dma_addr);
 	wq->doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr;
 	if (!kernel_domain)
@@ -318,17 +302,15 @@
 	return -ENOMEM;
 }
 
-int cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
+void cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
 {
-	int err;
-	err = cxio_hal_clear_cq_ctx(rdev_p, cq->cqid);
+	cxio_hal_clear_cq_ctx(rdev_p, cq->cqid);
 	kfree(cq->sw_queue);
 	dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
 			  (1UL << (cq->size_log2))
 			  * sizeof(struct t3_cqe) + 1, cq->queue,
 			  dma_unmap_addr(cq, mapping));
 	cxio_hal_put_cqid(rdev_p->rscp, cq->cqid);
-	return err;
 }
 
 int cxio_destroy_qp(struct cxio_rdev *rdev_p, struct t3_wq *wq,
@@ -539,8 +521,6 @@
 	dma_unmap_addr_set(&rdev_p->ctrl_qp, mapping,
 			   rdev_p->ctrl_qp.dma_addr);
 	rdev_p->ctrl_qp.doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr;
-	memset(rdev_p->ctrl_qp.workq, 0,
-	       (1 << T3_CTRL_QP_SIZE_LOG2) * sizeof(union t3_wr));
 
 	mutex_init(&rdev_p->ctrl_qp.lock);
 	init_waitqueue_head(&rdev_p->ctrl_qp.waitq);
@@ -566,9 +546,9 @@
 	wqe->sge_cmd = cpu_to_be64(sge_cmd);
 	wqe->ctx1 = cpu_to_be64(ctx1);
 	wqe->ctx0 = cpu_to_be64(ctx0);
-	pr_debug("CtrlQP dma_addr 0x%llx workq %p size %d\n",
-		 (unsigned long long)rdev_p->ctrl_qp.dma_addr,
-		 rdev_p->ctrl_qp.workq, 1 << T3_CTRL_QP_SIZE_LOG2);
+	pr_debug("CtrlQP dma_addr %pad workq %p size %d\n",
+		 &rdev_p->ctrl_qp.dma_addr, rdev_p->ctrl_qp.workq,
+		 1 << T3_CTRL_QP_SIZE_LOG2);
 	skb->priority = CPL_PRIORITY_CONTROL;
 	return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb);
 err:
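
The two memset() removals above lean on dma_alloc_coherent() handing back zeroed memory on these kernels, and the pr_debug() switch to %pad prints the dma_addr_t portably regardless of its width. A minimal sketch of that allocation pattern, with a hypothetical helper name and caller-supplied size (not taken from the patch):

	#include <linux/dma-mapping.h>
	#include <linux/printk.h>

	/* Illustrative only: allocate a zeroed, DMA-coherent queue buffer. */
	static void *alloc_hw_queue(struct device *dev, size_t size,
				    dma_addr_t *dma_handle)
	{
		/* dma_alloc_coherent() returns zeroed memory; no memset() needed */
		void *q = dma_alloc_coherent(dev, size, dma_handle, GFP_KERNEL);

		if (q)
			/* %pad takes a pointer to the dma_addr_t */
			pr_debug("queue %p dma_addr %pad size %zu\n",
				 q, dma_handle, size);
		return q;
	}
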
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h
index c64e50b..40c029f 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.h
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h
@@ -158,8 +158,7 @@
 int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq,
 		   enum t3_cq_opcode op, u32 credit);
 int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq, int kernel);
-int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
-int cxio_resize_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
+void cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
 void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx);
 void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx);
 int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq,
diff --git a/drivers/infiniband/hw/cxgb3/cxio_wr.h b/drivers/infiniband/hw/cxgb3/cxio_wr.h
index 83d2e19..53aa5c3 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_wr.h
+++ b/drivers/infiniband/hw/cxgb3/cxio_wr.h
@@ -64,7 +64,7 @@
 	T3_SOLICITED_EVENT_FLAG = 0x04,
 	T3_READ_FENCE_FLAG = 0x08,
 	T3_LOCAL_FENCE_FLAG = 0x10
-} __attribute__ ((packed));
+} __packed;
 
 enum t3_wr_opcode {
 	T3_WR_BP = FW_WROPCODE_RI_BYPASS,
@@ -77,7 +77,7 @@
 	T3_WR_INIT = FW_WROPCODE_RI_RDMA_INIT,
 	T3_WR_QP_MOD = FW_WROPCODE_RI_MODIFY_QP,
 	T3_WR_FASTREG = FW_WROPCODE_RI_FASTREGISTER_MR
-} __attribute__ ((packed));
+} __packed;
 
 enum t3_rdma_opcode {
 	T3_RDMA_WRITE,		/* IETF RDMAP v1.0 ... */
@@ -95,7 +95,7 @@
 	T3_QP_MOD,
 	T3_BYPASS,
 	T3_RDMA_READ_REQ_WITH_INV,
-} __attribute__ ((packed));
+} __packed;
 
 static inline enum t3_rdma_opcode wr2opcode(enum t3_wr_opcode wrop)
 {
@@ -306,7 +306,7 @@
 	uP_RI_MPA_TX_MARKER_ENABLE = 0x2,
 	uP_RI_MPA_CRC_ENABLE = 0x4,
 	uP_RI_MPA_IETF_ENABLE = 0x8
-} __attribute__ ((packed));
+} __packed;
 
 enum t3_qp_caps {
 	uP_RI_QP_RDMA_READ_ENABLE = 0x01,
@@ -314,7 +314,7 @@
 	uP_RI_QP_BIND_ENABLE = 0x04,
 	uP_RI_QP_FAST_REGISTER_ENABLE = 0x08,
 	uP_RI_QP_STAG0_ENABLE = 0x10
-} __attribute__ ((packed));
+} __packed;
 
 enum rdma_init_rtr_types {
 	RTR_READ = 1,
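
__packed is the kernel's shorthand for __attribute__((packed)) (defined in <linux/compiler_attributes.h>, which the kernel build pulls in everywhere), so the enum layouts above are unchanged; the annotation just follows the preferred spelling. The enum below is purely illustrative:

	/* Same in-memory layout as the __attribute__((packed)) form. */
	enum example_flags {
		EXAMPLE_FLAG_A = 0x01,
		EXAMPLE_FLAG_B = 0x02,
	} __packed;
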
diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c
index 591de31..56a8ab6 100644
--- a/drivers/infiniband/hw/cxgb3/iwch.c
+++ b/drivers/infiniband/hw/cxgb3/iwch.c
@@ -62,37 +62,30 @@
 static LIST_HEAD(dev_list);
 static DEFINE_MUTEX(dev_mutex);
 
-static int disable_qp_db(int id, void *p, void *data)
-{
-	struct iwch_qp *qhp = p;
-
-	cxio_disable_wq_db(&qhp->wq);
-	return 0;
-}
-
-static int enable_qp_db(int id, void *p, void *data)
-{
-	struct iwch_qp *qhp = p;
-
-	if (data)
-		ring_doorbell(qhp->rhp->rdev.ctrl_qp.doorbell, qhp->wq.qpid);
-	cxio_enable_wq_db(&qhp->wq);
-	return 0;
-}
-
 static void disable_dbs(struct iwch_dev *rnicp)
 {
-	spin_lock_irq(&rnicp->lock);
-	idr_for_each(&rnicp->qpidr, disable_qp_db, NULL);
-	spin_unlock_irq(&rnicp->lock);
+	unsigned long index;
+	struct iwch_qp *qhp;
+
+	xa_lock_irq(&rnicp->qps);
+	xa_for_each(&rnicp->qps, index, qhp)
+		cxio_disable_wq_db(&qhp->wq);
+	xa_unlock_irq(&rnicp->qps);
 }
 
 static void enable_dbs(struct iwch_dev *rnicp, int ring_db)
 {
-	spin_lock_irq(&rnicp->lock);
-	idr_for_each(&rnicp->qpidr, enable_qp_db,
-		     (void *)(unsigned long)ring_db);
-	spin_unlock_irq(&rnicp->lock);
+	unsigned long index;
+	struct iwch_qp *qhp;
+
+	xa_lock_irq(&rnicp->qps);
+	xa_for_each(&rnicp->qps, index, qhp) {
+		if (ring_db)
+			ring_doorbell(qhp->rhp->rdev.ctrl_qp.doorbell,
+					qhp->wq.qpid);
+		cxio_enable_wq_db(&qhp->wq);
+	}
+	xa_unlock_irq(&rnicp->qps);
 }
 
 static void iwch_db_drop_task(struct work_struct *work)
@@ -105,10 +98,9 @@
 static void rnic_init(struct iwch_dev *rnicp)
 {
 	pr_debug("%s iwch_dev %p\n", __func__,  rnicp);
-	idr_init(&rnicp->cqidr);
-	idr_init(&rnicp->qpidr);
-	idr_init(&rnicp->mmidr);
-	spin_lock_init(&rnicp->lock);
+	xa_init_flags(&rnicp->cqs, XA_FLAGS_LOCK_IRQ);
+	xa_init_flags(&rnicp->qps, XA_FLAGS_LOCK_IRQ);
+	xa_init_flags(&rnicp->mrs, XA_FLAGS_LOCK_IRQ);
 	INIT_DELAYED_WORK(&rnicp->db_drop_task, iwch_db_drop_task);
 
 	rnicp->attr.max_qps = T3_MAX_NUM_QP - 32;
@@ -146,7 +138,7 @@
 
 	pr_debug("%s t3cdev %p\n", __func__,  tdev);
 	pr_info_once("Chelsio T3 RDMA Driver - version %s\n", DRV_VERSION);
-	rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp));
+	rnicp = ib_alloc_device(iwch_dev, ibdev);
 	if (!rnicp) {
 		pr_err("Cannot allocate ib device\n");
 		return;
@@ -190,9 +182,9 @@
 			list_del(&dev->entry);
 			iwch_unregister_device(dev);
 			cxio_rdev_close(&dev->rdev);
-			idr_destroy(&dev->cqidr);
-			idr_destroy(&dev->qpidr);
-			idr_destroy(&dev->mmidr);
+			WARN_ON(!xa_empty(&dev->cqs));
+			WARN_ON(!xa_empty(&dev->qps));
+			WARN_ON(!xa_empty(&dev->mrs));
 			ib_dealloc_device(&dev->ibdev);
 			break;
 		}
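
disable_dbs()/enable_dbs() above now walk the new qps XArray under its IRQ-safe lock instead of going through idr_for_each() callbacks. A self-contained sketch of that init-and-iterate pattern, using hypothetical my_dev/my_obj names in place of the driver structures:

	#include <linux/xarray.h>
	#include <linux/printk.h>

	struct my_obj { int id; };

	struct my_dev {
		struct xarray objs;
	};

	static void my_dev_init(struct my_dev *dev)
	{
		/* entries protected by xa_lock_irq()/xa_unlock_irq() */
		xa_init_flags(&dev->objs, XA_FLAGS_LOCK_IRQ);
	}

	static void walk_objs(struct my_dev *dev)
	{
		struct my_obj *obj;
		unsigned long index;

		xa_lock_irq(&dev->objs);
		xa_for_each(&dev->objs, index, obj)
			pr_debug("obj %p at index %lu\n", obj, index);
		xa_unlock_irq(&dev->objs);
	}

The same shape covers both helpers: the loop body either rings or disables the per-QP doorbell.
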
diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.h
index c69bc4f..310a937 100644
--- a/drivers/infiniband/hw/cxgb3/iwch.h
+++ b/drivers/infiniband/hw/cxgb3/iwch.h
@@ -35,7 +35,7 @@
 #include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
-#include <linux/idr.h>
+#include <linux/xarray.h>
 #include <linux/workqueue.h>
 
 #include <rdma/ib_verbs.h>
@@ -106,10 +106,9 @@
 	struct cxio_rdev rdev;
 	u32 device_cap_flags;
 	struct iwch_rnic_attributes attr;
-	struct idr cqidr;
-	struct idr qpidr;
-	struct idr mmidr;
-	spinlock_t lock;
+	struct xarray cqs;
+	struct xarray qps;
+	struct xarray mrs;
 	struct list_head entry;
 	struct delayed_work db_drop_task;
 };
@@ -136,40 +135,17 @@
 
 static inline struct iwch_cq *get_chp(struct iwch_dev *rhp, u32 cqid)
 {
-	return idr_find(&rhp->cqidr, cqid);
+	return xa_load(&rhp->cqs, cqid);
 }
 
 static inline struct iwch_qp *get_qhp(struct iwch_dev *rhp, u32 qpid)
 {
-	return idr_find(&rhp->qpidr, qpid);
+	return xa_load(&rhp->qps, qpid);
 }
 
 static inline struct iwch_mr *get_mhp(struct iwch_dev *rhp, u32 mmid)
 {
-	return idr_find(&rhp->mmidr, mmid);
-}
-
-static inline int insert_handle(struct iwch_dev *rhp, struct idr *idr,
-				void *handle, u32 id)
-{
-	int ret;
-
-	idr_preload(GFP_KERNEL);
-	spin_lock_irq(&rhp->lock);
-
-	ret = idr_alloc(idr, handle, id, id + 1, GFP_NOWAIT);
-
-	spin_unlock_irq(&rhp->lock);
-	idr_preload_end();
-
-	return ret < 0 ? ret : 0;
-}
-
-static inline void remove_handle(struct iwch_dev *rhp, struct idr *idr, u32 id)
-{
-	spin_lock_irq(&rhp->lock);
-	idr_remove(idr, id);
-	spin_unlock_irq(&rhp->lock);
+	return xa_load(&rhp->mrs, mmid);
 }
 
 extern struct cxgb3_client t3c_client;
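
With insert_handle()/remove_handle() gone, the driver calls the XArray API directly: xa_insert_irq() to claim an id, xa_load() for lockless lookup, xa_erase_irq() to drop it. A small sketch of those three calls together (the id and object are placeholders):

	#include <linux/xarray.h>
	#include <linux/bug.h>

	static int example_track(struct xarray *xa, u32 id, void *obj)
	{
		/* returns -EBUSY if the id is already present */
		int err = xa_insert_irq(xa, id, obj, GFP_KERNEL);

		if (err)
			return err;

		WARN_ON(xa_load(xa, id) != obj);	/* lookup is lockless (RCU) */
		xa_erase_irq(xa, id);			/* remove the entry again */
		return 0;
	}
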
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
index 1c90c86..0bca72c 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c
@@ -170,7 +170,7 @@
 {
 	struct cpl_tid_release *req;
 
-	skb = get_skb(skb, sizeof *req, GFP_KERNEL);
+	skb = get_skb(skb, sizeof(*req), GFP_KERNEL);
 	if (!skb)
 		return;
 	req = skb_put(skb, sizeof(*req));
diff --git a/drivers/infiniband/hw/cxgb3/iwch_ev.c b/drivers/infiniband/hw/cxgb3/iwch_ev.c
index 4a0c82a..9d356c1 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_ev.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_ev.c
@@ -48,14 +48,14 @@
 	struct iwch_qp *qhp;
 	unsigned long flag;
 
-	spin_lock(&rnicp->lock);
-	qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe));
+	xa_lock(&rnicp->qps);
+	qhp = xa_load(&rnicp->qps, CQE_QPID(rsp_msg->cqe));
 
 	if (!qhp) {
 		pr_err("%s unaffiliated error 0x%x qpid 0x%x\n",
 		       __func__, CQE_STATUS(rsp_msg->cqe),
 		       CQE_QPID(rsp_msg->cqe));
-		spin_unlock(&rnicp->lock);
+		xa_unlock(&rnicp->qps);
 		return;
 	}
 
@@ -65,7 +65,7 @@
 			 __func__,
 			 qhp->attr.state, qhp->wq.qpid,
 			 CQE_STATUS(rsp_msg->cqe));
-		spin_unlock(&rnicp->lock);
+		xa_unlock(&rnicp->qps);
 		return;
 	}
 
@@ -76,7 +76,7 @@
 	       CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe));
 
 	atomic_inc(&qhp->refcnt);
-	spin_unlock(&rnicp->lock);
+	xa_unlock(&rnicp->qps);
 
 	if (qhp->attr.state == IWCH_QP_STATE_RTS) {
 		attrs.next_state = IWCH_QP_STATE_TERMINATE;
@@ -114,21 +114,21 @@
 	unsigned long flag;
 
 	rnicp = (struct iwch_dev *) rdev_p->ulp;
-	spin_lock(&rnicp->lock);
+	xa_lock(&rnicp->qps);
 	chp = get_chp(rnicp, cqid);
-	qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe));
+	qhp = xa_load(&rnicp->qps, CQE_QPID(rsp_msg->cqe));
 	if (!chp || !qhp) {
 		pr_err("BAD AE cqid 0x%x qpid 0x%x opcode %d status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n",
 		       cqid, CQE_QPID(rsp_msg->cqe),
 		       CQE_OPCODE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe),
 		       CQE_TYPE(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe),
 		       CQE_WRID_LOW(rsp_msg->cqe));
-		spin_unlock(&rnicp->lock);
+		xa_unlock(&rnicp->qps);
 		goto out;
 	}
 	iwch_qp_add_ref(&qhp->ibqp);
 	atomic_inc(&chp->refcnt);
-	spin_unlock(&rnicp->lock);
+	xa_unlock(&rnicp->qps);
 
 	/*
 	 * 1) completion of our sending a TERMINATE.
diff --git a/drivers/infiniband/hw/cxgb3/iwch_mem.c b/drivers/infiniband/hw/cxgb3/iwch_mem.c
index 12886b1..ce0f274 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_mem.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_mem.c
@@ -49,7 +49,7 @@
 	mmid = stag >> 8;
 	mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
 	pr_debug("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp);
-	return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid);
+	return xa_insert_irq(&mhp->rhp->mrs, mmid, mhp, GFP_KERNEL);
 }
 
 int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php,
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 1b9ff21..dcf02ec 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -53,6 +53,7 @@
 #include <rdma/ib_smi.h>
 #include <rdma/ib_umem.h>
 #include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "cxio_hal.h"
 #include "iwch.h"
@@ -61,7 +62,7 @@
 #include <rdma/cxgb3-abi.h>
 #include "common.h"
 
-static int iwch_dealloc_ucontext(struct ib_ucontext *context)
+static void iwch_dealloc_ucontext(struct ib_ucontext *context)
 {
 	struct iwch_dev *rhp = to_iwch_dev(context->device);
 	struct iwch_ucontext *ucontext = to_iwch_ucontext(context);
@@ -71,72 +72,58 @@
 	list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry)
 		kfree(mm);
 	cxio_release_ucontext(&rhp->rdev, &ucontext->uctx);
-	kfree(ucontext);
-	return 0;
 }
 
-static struct ib_ucontext *iwch_alloc_ucontext(struct ib_device *ibdev,
-					struct ib_udata *udata)
+static int iwch_alloc_ucontext(struct ib_ucontext *ucontext,
+			       struct ib_udata *udata)
 {
-	struct iwch_ucontext *context;
+	struct ib_device *ibdev = ucontext->device;
+	struct iwch_ucontext *context = to_iwch_ucontext(ucontext);
 	struct iwch_dev *rhp = to_iwch_dev(ibdev);
 
 	pr_debug("%s ibdev %p\n", __func__, ibdev);
-	context = kzalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return ERR_PTR(-ENOMEM);
 	cxio_init_ucontext(&rhp->rdev, &context->uctx);
 	INIT_LIST_HEAD(&context->mmaps);
 	spin_lock_init(&context->mmap_lock);
-	return &context->ibucontext;
+	return 0;
 }
 
-static int iwch_destroy_cq(struct ib_cq *ib_cq)
+static void iwch_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
 	struct iwch_cq *chp;
 
 	pr_debug("%s ib_cq %p\n", __func__, ib_cq);
 	chp = to_iwch_cq(ib_cq);
 
-	remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid);
+	xa_erase_irq(&chp->rhp->cqs, chp->cq.cqid);
 	atomic_dec(&chp->refcnt);
 	wait_event(chp->wait, !atomic_read(&chp->refcnt));
 
 	cxio_destroy_cq(&chp->rhp->rdev, &chp->cq);
-	kfree(chp);
-	return 0;
 }
 
-static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
-				    const struct ib_cq_init_attr *attr,
-				    struct ib_ucontext *ib_context,
-				    struct ib_udata *udata)
+static int iwch_create_cq(struct ib_cq *ibcq,
+			  const struct ib_cq_init_attr *attr,
+			  struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	int entries = attr->cqe;
-	struct iwch_dev *rhp;
-	struct iwch_cq *chp;
+	struct iwch_dev *rhp = to_iwch_dev(ibcq->device);
+	struct iwch_cq *chp = to_iwch_cq(ibcq);
 	struct iwch_create_cq_resp uresp;
 	struct iwch_create_cq_req ureq;
-	struct iwch_ucontext *ucontext = NULL;
 	static int warned;
 	size_t resplen;
 
 	pr_debug("%s ib_dev %p entries %d\n", __func__, ibdev, entries);
 	if (attr->flags)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
-	rhp = to_iwch_dev(ibdev);
-	chp = kzalloc(sizeof(*chp), GFP_KERNEL);
-	if (!chp)
-		return ERR_PTR(-ENOMEM);
-
-	if (ib_context) {
-		ucontext = to_iwch_ucontext(ib_context);
+	if (udata) {
 		if (!t3a_device(rhp)) {
-			if (ib_copy_from_udata(&ureq, udata, sizeof (ureq))) {
-				kfree(chp);
-				return ERR_PTR(-EFAULT);
-			}
+			if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
+				return  -EFAULT;
+
 			chp->user_rptr_addr = (u32 __user *)(unsigned long)ureq.user_rptr_addr;
 		}
 	}
@@ -157,29 +144,29 @@
 	entries = roundup_pow_of_two(entries);
 	chp->cq.size_log2 = ilog2(entries);
 
-	if (cxio_create_cq(&rhp->rdev, &chp->cq, !ucontext)) {
-		kfree(chp);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (cxio_create_cq(&rhp->rdev, &chp->cq, !udata))
+		return -ENOMEM;
+
 	chp->rhp = rhp;
 	chp->ibcq.cqe = 1 << chp->cq.size_log2;
 	spin_lock_init(&chp->lock);
 	spin_lock_init(&chp->comp_handler_lock);
 	atomic_set(&chp->refcnt, 1);
 	init_waitqueue_head(&chp->wait);
-	if (insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid)) {
+	if (xa_store_irq(&rhp->cqs, chp->cq.cqid, chp, GFP_KERNEL)) {
 		cxio_destroy_cq(&chp->rhp->rdev, &chp->cq);
-		kfree(chp);
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	}
 
-	if (ucontext) {
+	if (udata) {
 		struct iwch_mm_entry *mm;
+		struct iwch_ucontext *ucontext = rdma_udata_to_drv_context(
+			udata, struct iwch_ucontext, ibucontext);
 
-		mm = kmalloc(sizeof *mm, GFP_KERNEL);
+		mm = kmalloc(sizeof(*mm), GFP_KERNEL);
 		if (!mm) {
-			iwch_destroy_cq(&chp->ibcq);
-			return ERR_PTR(-ENOMEM);
+			iwch_destroy_cq(&chp->ibcq, udata);
+			return -ENOMEM;
 		}
 		uresp.cqid = chp->cq.cqid;
 		uresp.size_log2 = chp->cq.size_log2;
@@ -189,7 +176,7 @@
 		spin_unlock(&ucontext->mmap_lock);
 		mm->key = uresp.key;
 		mm->addr = virt_to_phys(chp->cq.queue);
-		if (udata->outlen < sizeof uresp) {
+		if (udata->outlen < sizeof(uresp)) {
 			if (!warned++)
 				pr_warn("Warning - downlevel libcxgb3 (non-fatal)\n");
 			mm->len = PAGE_ALIGN((1UL << uresp.size_log2) *
@@ -200,86 +187,19 @@
 					     sizeof(struct t3_cqe));
 			uresp.memsize = mm->len;
 			uresp.reserved = 0;
-			resplen = sizeof uresp;
+			resplen = sizeof(uresp);
 		}
 		if (ib_copy_to_udata(udata, &uresp, resplen)) {
 			kfree(mm);
-			iwch_destroy_cq(&chp->ibcq);
-			return ERR_PTR(-EFAULT);
+			iwch_destroy_cq(&chp->ibcq, udata);
+			return -EFAULT;
 		}
 		insert_mmap(ucontext, mm);
 	}
-	pr_debug("created cqid 0x%0x chp %p size 0x%0x, dma_addr 0x%0llx\n",
+	pr_debug("created cqid 0x%0x chp %p size 0x%0x, dma_addr %pad\n",
 		 chp->cq.cqid, chp, (1 << chp->cq.size_log2),
-		 (unsigned long long)chp->cq.dma_addr);
-	return &chp->ibcq;
-}
-
-static int iwch_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
-{
-#ifdef notyet
-	struct iwch_cq *chp = to_iwch_cq(cq);
-	struct t3_cq oldcq, newcq;
-	int ret;
-
-	pr_debug("%s ib_cq %p cqe %d\n", __func__, cq, cqe);
-
-	/* We don't downsize... */
-	if (cqe <= cq->cqe)
-		return 0;
-
-	/* create new t3_cq with new size */
-	cqe = roundup_pow_of_two(cqe+1);
-	newcq.size_log2 = ilog2(cqe);
-
-	/* Dont allow resize to less than the current wce count */
-	if (cqe < Q_COUNT(chp->cq.rptr, chp->cq.wptr)) {
-		return -ENOMEM;
-	}
-
-	/* Quiesce all QPs using this CQ */
-	ret = iwch_quiesce_qps(chp);
-	if (ret) {
-		return ret;
-	}
-
-	ret = cxio_create_cq(&chp->rhp->rdev, &newcq);
-	if (ret) {
-		return ret;
-	}
-
-	/* copy CQEs */
-	memcpy(newcq.queue, chp->cq.queue, (1 << chp->cq.size_log2) *
-				        sizeof(struct t3_cqe));
-
-	/* old iwch_qp gets new t3_cq but keeps old cqid */
-	oldcq = chp->cq;
-	chp->cq = newcq;
-	chp->cq.cqid = oldcq.cqid;
-
-	/* resize new t3_cq to update the HW context */
-	ret = cxio_resize_cq(&chp->rhp->rdev, &chp->cq);
-	if (ret) {
-		chp->cq = oldcq;
-		return ret;
-	}
-	chp->ibcq.cqe = (1<<chp->cq.size_log2) - 1;
-
-	/* destroy old t3_cq */
-	oldcq.cqid = newcq.cqid;
-	ret = cxio_destroy_cq(&chp->rhp->rdev, &oldcq);
-	if (ret) {
-		pr_err("%s - cxio_destroy_cq failed %d\n", __func__, ret);
-	}
-
-	/* add user hooks here */
-
-	/* resume qps */
-	ret = iwch_resume_qps(chp);
-	return ret;
-#else
-	return -ENOSYS;
-#endif
+		 &chp->cq.dma_addr);
+	return 0;
 }
 
 static int iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
@@ -370,7 +290,7 @@
 	return ret;
 }
 
-static int iwch_deallocate_pd(struct ib_pd *pd)
+static void iwch_deallocate_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
 	struct iwch_dev *rhp;
 	struct iwch_pd *php;
@@ -379,15 +299,12 @@
 	rhp = php->rhp;
 	pr_debug("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid);
 	cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid);
-	kfree(php);
-	return 0;
 }
 
-static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev,
-			       struct ib_ucontext *context,
-			       struct ib_udata *udata)
+static int iwch_allocate_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
-	struct iwch_pd *php;
+	struct iwch_pd *php = to_iwch_pd(pd);
+	struct ib_device *ibdev = pd->device;
 	u32 pdid;
 	struct iwch_dev *rhp;
 
@@ -395,27 +312,23 @@
 	rhp = (struct iwch_dev *) ibdev;
 	pdid = cxio_hal_get_pdid(rhp->rdev.rscp);
 	if (!pdid)
-		return ERR_PTR(-EINVAL);
-	php = kzalloc(sizeof(*php), GFP_KERNEL);
-	if (!php) {
-		cxio_hal_put_pdid(rhp->rdev.rscp, pdid);
-		return ERR_PTR(-ENOMEM);
-	}
+		return -EINVAL;
+
 	php->pdid = pdid;
 	php->rhp = rhp;
-	if (context) {
+	if (udata) {
 		struct iwch_alloc_pd_resp resp = {.pdid = php->pdid};
 
 		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
-			iwch_deallocate_pd(&php->ibpd);
-			return ERR_PTR(-EFAULT);
+			iwch_deallocate_pd(&php->ibpd, udata);
+			return -EFAULT;
 		}
 	}
 	pr_debug("%s pdid 0x%0x ptr 0x%p\n", __func__, pdid, php);
-	return &php->ibpd;
+	return 0;
 }
 
-static int iwch_dereg_mr(struct ib_mr *ib_mr)
+static int iwch_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
 {
 	struct iwch_dev *rhp;
 	struct iwch_mr *mhp;
@@ -430,11 +343,10 @@
 	cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
 		       mhp->attr.pbl_addr);
 	iwch_free_pbl(mhp);
-	remove_handle(rhp, &rhp->mmidr, mmid);
+	xa_erase_irq(&rhp->mrs, mmid);
 	if (mhp->kva)
 		kfree((void *) (unsigned long) mhp->kva);
-	if (mhp->umem)
-		ib_umem_release(mhp->umem);
+	ib_umem_release(mhp->umem);
 	pr_debug("%s mmid 0x%x ptr %p\n", __func__, mmid, mhp);
 	kfree(mhp);
 	return 0;
@@ -522,14 +434,13 @@
 				      u64 virt, int acc, struct ib_udata *udata)
 {
 	__be64 *pages;
-	int shift, n, len;
-	int i, k, entry;
+	int shift, n, i;
 	int err = 0;
 	struct iwch_dev *rhp;
 	struct iwch_pd *php;
 	struct iwch_mr *mhp;
 	struct iwch_reg_user_mr_resp uresp;
-	struct scatterlist *sg;
+	struct sg_dma_page_iter sg_iter;
 	pr_debug("%s ib_pd %p\n", __func__, pd);
 
 	php = to_iwch_pd(pd);
@@ -540,16 +451,16 @@
 
 	mhp->rhp = rhp;
 
-	mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+	mhp->umem = ib_umem_get(udata, start, length, acc, 0);
 	if (IS_ERR(mhp->umem)) {
 		err = PTR_ERR(mhp->umem);
 		kfree(mhp);
 		return ERR_PTR(err);
 	}
 
-	shift = mhp->umem->page_shift;
+	shift = PAGE_SHIFT;
 
-	n = mhp->umem->nmap;
+	n = ib_umem_num_pages(mhp->umem);
 
 	err = iwch_alloc_pbl(mhp, n);
 	if (err)
@@ -563,19 +474,15 @@
 
 	i = n = 0;
 
-	for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) {
-			len = sg_dma_len(sg) >> shift;
-			for (k = 0; k < len; ++k) {
-				pages[i++] = cpu_to_be64(sg_dma_address(sg) +
-							 (k << shift));
-				if (i == PAGE_SIZE / sizeof *pages) {
-					err = iwch_write_pbl(mhp, pages, i, n);
-					if (err)
-						goto pbl_done;
-					n += i;
-					i = 0;
-				}
-			}
+	for_each_sg_dma_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) {
+		pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter));
+		if (i == PAGE_SIZE / sizeof(*pages)) {
+			err = iwch_write_pbl(mhp, pages, i, n);
+			if (err)
+				goto pbl_done;
+			n += i;
+			i = 0;
+		}
 	}
 
 	if (i)
@@ -603,8 +510,8 @@
 		pr_debug("%s user resp pbl_addr 0x%x\n", __func__,
 			 uresp.pbl_addr);
 
-		if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) {
-			iwch_dereg_mr(&mhp->ibmr);
+		if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
+			iwch_dereg_mr(&mhp->ibmr, udata);
 			err = -EFAULT;
 			goto err;
 		}
@@ -650,7 +557,7 @@
 	mhp->attr.stag = stag;
 	mmid = (stag) >> 8;
 	mhp->ibmw.rkey = stag;
-	if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) {
+	if (xa_insert_irq(&rhp->mrs, mmid, mhp, GFP_KERNEL)) {
 		cxio_deallocate_window(&rhp->rdev, mhp->attr.stag);
 		kfree(mhp);
 		return ERR_PTR(-ENOMEM);
@@ -669,15 +576,14 @@
 	rhp = mhp->rhp;
 	mmid = (mw->rkey) >> 8;
 	cxio_deallocate_window(&rhp->rdev, mhp->attr.stag);
-	remove_handle(rhp, &rhp->mmidr, mmid);
+	xa_erase_irq(&rhp->mrs, mmid);
 	pr_debug("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp);
 	kfree(mhp);
 	return 0;
 }
 
-static struct ib_mr *iwch_alloc_mr(struct ib_pd *pd,
-				   enum ib_mr_type mr_type,
-				   u32 max_num_sg)
+static struct ib_mr *iwch_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+				   u32 max_num_sg, struct ib_udata *udata)
 {
 	struct iwch_dev *rhp;
 	struct iwch_pd *php;
@@ -715,7 +621,7 @@
 	mhp->attr.state = 1;
 	mmid = (stag) >> 8;
 	mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
-	ret = insert_handle(rhp, &rhp->mmidr, mhp, mmid);
+	ret = xa_insert_irq(&rhp->mrs, mmid, mhp, GFP_KERNEL);
 	if (ret)
 		goto err3;
 
@@ -756,7 +662,7 @@
 	return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, iwch_set_page);
 }
 
-static int iwch_destroy_qp(struct ib_qp *ib_qp)
+static int iwch_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
 {
 	struct iwch_dev *rhp;
 	struct iwch_qp *qhp;
@@ -770,13 +676,13 @@
 	iwch_modify_qp(rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 0);
 	wait_event(qhp->wait, !qhp->ep);
 
-	remove_handle(rhp, &rhp->qpidr, qhp->wq.qpid);
+	xa_erase_irq(&rhp->qps, qhp->wq.qpid);
 
 	atomic_dec(&qhp->refcnt);
 	wait_event(qhp->wait, !atomic_read(&qhp->refcnt));
 
-	ucontext = ib_qp->uobject ? to_iwch_ucontext(ib_qp->uobject->context)
-				  : NULL;
+	ucontext = rdma_udata_to_drv_context(udata, struct iwch_ucontext,
+					     ibucontext);
 	cxio_destroy_qp(&rhp->rdev, &qhp->wq,
 			ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
 
@@ -836,7 +742,8 @@
 	 * Kernel users need more wq space for fastreg WRs which can take
 	 * 2 WR fragments.
 	 */
-	ucontext = pd->uobject ? to_iwch_ucontext(pd->uobject->context) : NULL;
+	ucontext = rdma_udata_to_drv_context(udata, struct iwch_ucontext,
+					     ibucontext);
 	if (!ucontext && wqsize < (rqsize + (2 * sqsize)))
 		wqsize = roundup_pow_of_two(rqsize +
 				roundup_pow_of_two(attrs->cap.max_send_wr * 2));
@@ -885,7 +792,7 @@
 	init_waitqueue_head(&qhp->wait);
 	atomic_set(&qhp->refcnt, 1);
 
-	if (insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid)) {
+	if (xa_store_irq(&rhp->qps, qhp->wq.qpid, qhp, GFP_KERNEL)) {
 		cxio_destroy_qp(&rhp->rdev, &qhp->wq,
 			ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
 		kfree(qhp);
@@ -896,16 +803,16 @@
 
 		struct iwch_mm_entry *mm1, *mm2;
 
-		mm1 = kmalloc(sizeof *mm1, GFP_KERNEL);
+		mm1 = kmalloc(sizeof(*mm1), GFP_KERNEL);
 		if (!mm1) {
-			iwch_destroy_qp(&qhp->ibqp);
+			iwch_destroy_qp(&qhp->ibqp, udata);
 			return ERR_PTR(-ENOMEM);
 		}
 
-		mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
+		mm2 = kmalloc(sizeof(*mm2), GFP_KERNEL);
 		if (!mm2) {
 			kfree(mm1);
-			iwch_destroy_qp(&qhp->ibqp);
+			iwch_destroy_qp(&qhp->ibqp, udata);
 			return ERR_PTR(-ENOMEM);
 		}
 
@@ -919,15 +826,15 @@
 		uresp.db_key = ucontext->key;
 		ucontext->key += PAGE_SIZE;
 		spin_unlock(&ucontext->mmap_lock);
-		if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) {
+		if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
 			kfree(mm1);
 			kfree(mm2);
-			iwch_destroy_qp(&qhp->ibqp);
+			iwch_destroy_qp(&qhp->ibqp, udata);
 			return ERR_PTR(-EFAULT);
 		}
 		mm1->key = uresp.key;
 		mm1->addr = virt_to_phys(qhp->wq.queue);
-		mm1->len = PAGE_ALIGN(wqsize * sizeof (union t3_wr));
+		mm1->len = PAGE_ALIGN(wqsize * sizeof(union t3_wr));
 		insert_mmap(ucontext, mm1);
 		mm2->key = uresp.db_key;
 		mm2->addr = qhp->wq.udb & PAGE_MASK;
@@ -935,10 +842,11 @@
 		insert_mmap(ucontext, mm2);
 	}
 	qhp->ibqp.qp_num = qhp->wq.qpid;
-	pr_debug("%s sq_num_entries %d, rq_num_entries %d qpid 0x%0x qhp %p dma_addr 0x%llx size %d rq_addr 0x%x\n",
-		 __func__, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries,
-		 qhp->wq.qpid, qhp, (unsigned long long)qhp->wq.dma_addr,
-		 1 << qhp->wq.size_log2, qhp->wq.rq_addr);
+	pr_debug(
+		"%s sq_num_entries %d, rq_num_entries %d qpid 0x%0x qhp %p dma_addr %pad size %d rq_addr 0x%x\n",
+		__func__, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries,
+		qhp->wq.qpid, qhp, &qhp->wq.dma_addr, 1 << qhp->wq.size_log2,
+		qhp->wq.rq_addr);
 	return &qhp->ibqp;
 }
 
@@ -948,7 +856,7 @@
 	struct iwch_dev *rhp;
 	struct iwch_qp *qhp;
 	enum iwch_qp_attr_mask mask = 0;
-	struct iwch_qp_attributes attrs;
+	struct iwch_qp_attributes attrs = {};
 
 	pr_debug("%s ib_qp %p\n", __func__, ibqp);
 
@@ -960,7 +868,6 @@
 	if (!attr_mask)
 		return 0;
 
-	memset(&attrs, 0, sizeof attrs);
 	qhp = to_iwch_qp(ibqp);
 	rhp = qhp->rhp;
 
@@ -1056,7 +963,6 @@
 		return -EINVAL;
 
 	dev = to_iwch_dev(ibdev);
-	memset(props, 0, sizeof *props);
 	memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
 	props->hw_ver = dev->rdev.t3cdev_p->type;
 	props->fw_ver = fw_vers_string_to_u64(dev);
@@ -1085,33 +991,8 @@
 static int iwch_query_port(struct ib_device *ibdev,
 			   u8 port, struct ib_port_attr *props)
 {
-	struct iwch_dev *dev;
-	struct net_device *netdev;
-	struct in_device *inetdev;
-
 	pr_debug("%s ibdev %p\n", __func__, ibdev);
 
-	dev = to_iwch_dev(ibdev);
-	netdev = dev->rdev.port_info.lldevs[port-1];
-
-	/* props being zeroed by the caller, avoid zeroing it here */
-	props->max_mtu = IB_MTU_4096;
-	props->active_mtu = ib_mtu_int_to_enum(netdev->mtu);
-
-	if (!netif_carrier_ok(netdev))
-		props->state = IB_PORT_DOWN;
-	else {
-		inetdev = in_dev_get(netdev);
-		if (inetdev) {
-			if (inetdev->ifa_list)
-				props->state = IB_PORT_ACTIVE;
-			else
-				props->state = IB_PORT_INIT;
-			in_dev_put(inetdev);
-		} else
-			props->state = IB_PORT_INIT;
-	}
-
 	props->port_cap_flags =
 	    IB_PORT_CM_SUP |
 	    IB_PORT_SNMP_TUNNEL_SUP |
@@ -1127,20 +1008,22 @@
 	return 0;
 }
 
-static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
-			char *buf)
+static ssize_t hw_rev_show(struct device *dev,
+			   struct device_attribute *attr, char *buf)
 {
-	struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
-						 ibdev.dev);
+	struct iwch_dev *iwch_dev =
+			rdma_device_to_drv_device(dev, struct iwch_dev, ibdev);
+
 	pr_debug("%s dev 0x%p\n", __func__, dev);
 	return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type);
 }
+static DEVICE_ATTR_RO(hw_rev);
 
-static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
-			char *buf)
+static ssize_t hca_type_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
 {
-	struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
-						 ibdev.dev);
+	struct iwch_dev *iwch_dev =
+			rdma_device_to_drv_device(dev, struct iwch_dev, ibdev);
 	struct ethtool_drvinfo info;
 	struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
 
@@ -1148,16 +1031,19 @@
 	lldev->ethtool_ops->get_drvinfo(lldev, &info);
 	return sprintf(buf, "%s\n", info.driver);
 }
+static DEVICE_ATTR_RO(hca_type);
 
-static ssize_t show_board(struct device *dev, struct device_attribute *attr,
-			  char *buf)
+static ssize_t board_id_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
 {
-	struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
-						 ibdev.dev);
+	struct iwch_dev *iwch_dev =
+			rdma_device_to_drv_device(dev, struct iwch_dev, ibdev);
+
 	pr_debug("%s dev 0x%p\n", __func__, dev);
 	return sprintf(buf, "%x.%x\n", iwch_dev->rdev.rnic_info.pdev->vendor,
 		       iwch_dev->rdev.rnic_info.pdev->device);
 }
+static DEVICE_ATTR_RO(board_id);
 
 enum counters {
 	IPINRECEIVES,
@@ -1274,14 +1160,15 @@
 	return stats->num_counters;
 }
 
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+static struct attribute *iwch_class_attributes[] = {
+	&dev_attr_hw_rev.attr,
+	&dev_attr_hca_type.attr,
+	&dev_attr_board_id.attr,
+	NULL
+};
 
-static struct device_attribute *iwch_class_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_hca_type,
-	&dev_attr_board_id,
+static const struct attribute_group iwch_attr_group = {
+	.attrs = iwch_class_attributes,
 };
 
 static int iwch_port_immutable(struct ib_device *ibdev, u8 port_num,
@@ -1313,16 +1200,75 @@
 	snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", info.fw_version);
 }
 
-int iwch_register_device(struct iwch_dev *dev)
+static const struct ib_device_ops iwch_dev_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_CXGB3,
+	.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION,
+	.uverbs_no_driver_id_binding = 1,
+
+	.alloc_hw_stats	= iwch_alloc_stats,
+	.alloc_mr = iwch_alloc_mr,
+	.alloc_mw = iwch_alloc_mw,
+	.alloc_pd = iwch_allocate_pd,
+	.alloc_ucontext = iwch_alloc_ucontext,
+	.create_cq = iwch_create_cq,
+	.create_qp = iwch_create_qp,
+	.dealloc_mw = iwch_dealloc_mw,
+	.dealloc_pd = iwch_deallocate_pd,
+	.dealloc_ucontext = iwch_dealloc_ucontext,
+	.dereg_mr = iwch_dereg_mr,
+	.destroy_cq = iwch_destroy_cq,
+	.destroy_qp = iwch_destroy_qp,
+	.get_dev_fw_str = get_dev_fw_ver_str,
+	.get_dma_mr = iwch_get_dma_mr,
+	.get_hw_stats = iwch_get_mib,
+	.get_port_immutable = iwch_port_immutable,
+	.iw_accept = iwch_accept_cr,
+	.iw_add_ref = iwch_qp_add_ref,
+	.iw_connect = iwch_connect,
+	.iw_create_listen = iwch_create_listen,
+	.iw_destroy_listen = iwch_destroy_listen,
+	.iw_get_qp = iwch_get_qp,
+	.iw_reject = iwch_reject_cr,
+	.iw_rem_ref = iwch_qp_rem_ref,
+	.map_mr_sg = iwch_map_mr_sg,
+	.mmap = iwch_mmap,
+	.modify_qp = iwch_ib_modify_qp,
+	.poll_cq = iwch_poll_cq,
+	.post_recv = iwch_post_receive,
+	.post_send = iwch_post_send,
+	.query_device = iwch_query_device,
+	.query_gid = iwch_query_gid,
+	.query_pkey = iwch_query_pkey,
+	.query_port = iwch_query_port,
+	.reg_user_mr = iwch_reg_user_mr,
+	.req_notify_cq = iwch_arm_cq,
+	INIT_RDMA_OBJ_SIZE(ib_pd, iwch_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_cq, iwch_cq, ibcq),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, iwch_ucontext, ibucontext),
+};
+
+static int set_netdevs(struct ib_device *ib_dev, struct cxio_rdev *rdev)
 {
 	int ret;
 	int i;
 
+	for (i = 0; i < rdev->port_info.nports; i++) {
+		ret = ib_device_set_netdev(ib_dev, rdev->port_info.lldevs[i],
+					   i + 1);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+int iwch_register_device(struct iwch_dev *dev)
+{
+	int err;
+
 	pr_debug("%s iwch_dev %p\n", __func__, dev);
-	strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX);
 	memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
 	memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
-	dev->ibdev.owner = THIS_MODULE;
 	dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY |
 				IB_DEVICE_MEM_WINDOW |
 				IB_DEVICE_MEM_MGT_EXTENSIONS;
@@ -1354,82 +1300,22 @@
 	dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports;
 	dev->ibdev.num_comp_vectors = 1;
 	dev->ibdev.dev.parent = &dev->rdev.rnic_info.pdev->dev;
-	dev->ibdev.query_device = iwch_query_device;
-	dev->ibdev.query_port = iwch_query_port;
-	dev->ibdev.query_pkey = iwch_query_pkey;
-	dev->ibdev.query_gid = iwch_query_gid;
-	dev->ibdev.alloc_ucontext = iwch_alloc_ucontext;
-	dev->ibdev.dealloc_ucontext = iwch_dealloc_ucontext;
-	dev->ibdev.mmap = iwch_mmap;
-	dev->ibdev.alloc_pd = iwch_allocate_pd;
-	dev->ibdev.dealloc_pd = iwch_deallocate_pd;
-	dev->ibdev.create_qp = iwch_create_qp;
-	dev->ibdev.modify_qp = iwch_ib_modify_qp;
-	dev->ibdev.destroy_qp = iwch_destroy_qp;
-	dev->ibdev.create_cq = iwch_create_cq;
-	dev->ibdev.destroy_cq = iwch_destroy_cq;
-	dev->ibdev.resize_cq = iwch_resize_cq;
-	dev->ibdev.poll_cq = iwch_poll_cq;
-	dev->ibdev.get_dma_mr = iwch_get_dma_mr;
-	dev->ibdev.reg_user_mr = iwch_reg_user_mr;
-	dev->ibdev.dereg_mr = iwch_dereg_mr;
-	dev->ibdev.alloc_mw = iwch_alloc_mw;
-	dev->ibdev.dealloc_mw = iwch_dealloc_mw;
-	dev->ibdev.alloc_mr = iwch_alloc_mr;
-	dev->ibdev.map_mr_sg = iwch_map_mr_sg;
-	dev->ibdev.req_notify_cq = iwch_arm_cq;
-	dev->ibdev.post_send = iwch_post_send;
-	dev->ibdev.post_recv = iwch_post_receive;
-	dev->ibdev.alloc_hw_stats = iwch_alloc_stats;
-	dev->ibdev.get_hw_stats = iwch_get_mib;
-	dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION;
-	dev->ibdev.get_port_immutable = iwch_port_immutable;
-	dev->ibdev.get_dev_fw_str = get_dev_fw_ver_str;
 
-	dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
-	if (!dev->ibdev.iwcm)
-		return -ENOMEM;
+	memcpy(dev->ibdev.iw_ifname, dev->rdev.t3cdev_p->lldev->name,
+	       sizeof(dev->ibdev.iw_ifname));
 
-	dev->ibdev.iwcm->connect = iwch_connect;
-	dev->ibdev.iwcm->accept = iwch_accept_cr;
-	dev->ibdev.iwcm->reject = iwch_reject_cr;
-	dev->ibdev.iwcm->create_listen = iwch_create_listen;
-	dev->ibdev.iwcm->destroy_listen = iwch_destroy_listen;
-	dev->ibdev.iwcm->add_ref = iwch_qp_add_ref;
-	dev->ibdev.iwcm->rem_ref = iwch_qp_rem_ref;
-	dev->ibdev.iwcm->get_qp = iwch_get_qp;
-	memcpy(dev->ibdev.iwcm->ifname, dev->rdev.t3cdev_p->lldev->name,
-	       sizeof(dev->ibdev.iwcm->ifname));
+	rdma_set_device_sysfs_group(&dev->ibdev, &iwch_attr_group);
+	ib_set_device_ops(&dev->ibdev, &iwch_dev_ops);
+	err = set_netdevs(&dev->ibdev, &dev->rdev);
+	if (err)
+		return err;
 
-	dev->ibdev.driver_id = RDMA_DRIVER_CXGB3;
-	ret = ib_register_device(&dev->ibdev, NULL);
-	if (ret)
-		goto bail1;
-
-	for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) {
-		ret = device_create_file(&dev->ibdev.dev,
-					 iwch_class_attributes[i]);
-		if (ret) {
-			goto bail2;
-		}
-	}
-	return 0;
-bail2:
-	ib_unregister_device(&dev->ibdev);
-bail1:
-	kfree(dev->ibdev.iwcm);
-	return ret;
+	return ib_register_device(&dev->ibdev, "cxgb3_%d");
 }
 
 void iwch_unregister_device(struct iwch_dev *dev)
 {
-	int i;
-
 	pr_debug("%s iwch_dev %p\n", __func__, dev);
-	for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i)
-		device_remove_file(&dev->ibdev.dev,
-				   iwch_class_attributes[i]);
 	ib_unregister_device(&dev->ibdev);
-	kfree(dev->ibdev.iwcm);
 	return;
 }
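
Instead of creating each sysfs file by hand in iwch_register_device(), the attributes are now declared with DEVICE_ATTR_RO() and collected into an attribute_group that rdma_set_device_sysfs_group() hands to the IB core, which creates and removes the files with the device. A minimal sketch of that pattern with a placeholder attribute:

	#include <linux/device.h>
	#include <linux/sysfs.h>

	static ssize_t example_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
	{
		return sprintf(buf, "%d\n", 42);
	}
	/* DEVICE_ATTR_RO(example) expects a function named example_show() */
	static DEVICE_ATTR_RO(example);

	static struct attribute *example_attrs[] = {
		&dev_attr_example.attr,
		NULL
	};

	static const struct attribute_group example_attr_group = {
		.attrs = example_attrs,
	};
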
diff --git a/drivers/infiniband/hw/cxgb4/Kconfig b/drivers/infiniband/hw/cxgb4/Kconfig
index e0522a5..b49e8d4 100644
--- a/drivers/infiniband/hw/cxgb4/Kconfig
+++ b/drivers/infiniband/hw/cxgb4/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_CXGB4
 	tristate "Chelsio T4/T5 RDMA Driver"
 	depends on CHELSIO_T4 && INET
diff --git a/drivers/infiniband/hw/cxgb4/Makefile b/drivers/infiniband/hw/cxgb4/Makefile
index 9edd920..291d259 100644
--- a/drivers/infiniband/hw/cxgb4/Makefile
+++ b/drivers/infiniband/hw/cxgb4/Makefile
@@ -1,5 +1,6 @@
-ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb4
-ccflags-y += -Idrivers/net/ethernet/chelsio/libcxgb
+# SPDX-License-Identifier: GPL-2.0-only
+ccflags-y := -I $(srctree)/drivers/net/ethernet/chelsio/cxgb4
+ccflags-y += -I $(srctree)/drivers/net/ethernet/chelsio/libcxgb
 
 obj-$(CONFIG_INFINIBAND_CXGB4) += iw_cxgb4.o
 
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 0f83cbe..347dc24 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -331,20 +331,23 @@
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&ep->com.dev->lock, flags);
-	_remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid, 0);
-	if (idr_is_empty(&ep->com.dev->hwtid_idr))
+	xa_lock_irqsave(&ep->com.dev->hwtids, flags);
+	__xa_erase(&ep->com.dev->hwtids, ep->hwtid);
+	if (xa_empty(&ep->com.dev->hwtids))
 		wake_up(&ep->com.dev->wait);
-	spin_unlock_irqrestore(&ep->com.dev->lock, flags);
+	xa_unlock_irqrestore(&ep->com.dev->hwtids, flags);
 }
 
-static void insert_ep_tid(struct c4iw_ep *ep)
+static int insert_ep_tid(struct c4iw_ep *ep)
 {
 	unsigned long flags;
+	int err;
 
-	spin_lock_irqsave(&ep->com.dev->lock, flags);
-	_insert_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep, ep->hwtid, 0);
-	spin_unlock_irqrestore(&ep->com.dev->lock, flags);
+	xa_lock_irqsave(&ep->com.dev->hwtids, flags);
+	err = __xa_insert(&ep->com.dev->hwtids, ep->hwtid, ep, GFP_KERNEL);
+	xa_unlock_irqrestore(&ep->com.dev->hwtids, flags);
+
+	return err;
 }
 
 /*
@@ -355,11 +358,11 @@
 	struct c4iw_ep *ep;
 	unsigned long flags;
 
-	spin_lock_irqsave(&dev->lock, flags);
-	ep = idr_find(&dev->hwtid_idr, tid);
+	xa_lock_irqsave(&dev->hwtids, flags);
+	ep = xa_load(&dev->hwtids, tid);
 	if (ep)
 		c4iw_get_ep(&ep->com);
-	spin_unlock_irqrestore(&dev->lock, flags);
+	xa_unlock_irqrestore(&dev->hwtids, flags);
 	return ep;
 }
 
@@ -372,11 +375,11 @@
 	struct c4iw_listen_ep *ep;
 	unsigned long flags;
 
-	spin_lock_irqsave(&dev->lock, flags);
-	ep = idr_find(&dev->stid_idr, stid);
+	xa_lock_irqsave(&dev->stids, flags);
+	ep = xa_load(&dev->stids, stid);
 	if (ep)
 		c4iw_get_ep(&ep->com);
-	spin_unlock_irqrestore(&dev->lock, flags);
+	xa_unlock_irqrestore(&dev->stids, flags);
 	return ep;
 }
 
@@ -403,8 +406,7 @@
 				 ep->com.local_addr.ss_family);
 		dst_release(ep->dst);
 		cxgb4_l2t_release(ep->l2t);
-		if (ep->mpa_skb)
-			kfree_skb(ep->mpa_skb);
+		kfree_skb(ep->mpa_skb);
 	}
 	if (!skb_queue_empty(&ep->com.ep_skb_list))
 		skb_queue_purge(&ep->com.ep_skb_list);
@@ -458,6 +460,8 @@
 		skb_reset_transport_header(skb);
 	} else {
 		skb = alloc_skb(len, gfp);
+		if (!skb)
+			return NULL;
 	}
 	t4_set_arp_err_handler(skb, NULL, NULL);
 	return skb;
@@ -491,7 +495,6 @@
 
 	ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *)));
 	release_ep_resources(ep);
-	kfree_skb(skb);
 	return 0;
 }
 
@@ -502,7 +505,6 @@
 	ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *)));
 	c4iw_put_ep(&ep->parent_ep->com);
 	release_ep_resources(ep);
-	kfree_skb(skb);
 	return 0;
 }
 
@@ -556,7 +558,7 @@
 		cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
 				   (const u32 *)&sin6->sin6_addr.s6_addr, 1);
 	}
-	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
+	xa_erase_irq(&ep->com.dev->atids, ep->atid);
 	cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
 	queue_arp_failure_cpl(ep, skb, FAKE_CPL_PUT_EP_SAFE);
 }
@@ -656,7 +658,33 @@
 	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
 }
 
-static int send_abort(struct c4iw_ep *ep)
+static void read_tcb(struct c4iw_ep *ep)
+{
+	struct sk_buff *skb;
+	struct cpl_get_tcb *req;
+	int wrlen = roundup(sizeof(*req), 16);
+
+	skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+	if (WARN_ON(!skb))
+		return;
+
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, ep->ctrlq_idx);
+	req = (struct cpl_get_tcb *) skb_put(skb, wrlen);
+	memset(req, 0, wrlen);
+	INIT_TP_WR(req, ep->hwtid);
+	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_GET_TCB, ep->hwtid));
+	req->reply_ctrl = htons(REPLY_CHAN_V(0) | QUEUENO_V(ep->rss_qid));
+
+	/*
+	 * keep a ref on the ep so the tcb is not unlocked before this
+	 * cpl completes. The ref is released in read_tcb_rpl().
+	 */
+	c4iw_get_ep(&ep->com);
+	if (WARN_ON(c4iw_ofld_send(&ep->com.dev->rdev, skb)))
+		c4iw_put_ep(&ep->com);
+}
+
+static int send_abort_req(struct c4iw_ep *ep)
 {
 	u32 wrlen = roundup(sizeof(struct cpl_abort_req), 16);
 	struct sk_buff *req_skb = skb_dequeue(&ep->com.ep_skb_list);
@@ -671,6 +699,17 @@
 	return c4iw_l2t_send(&ep->com.dev->rdev, req_skb, ep->l2t);
 }
 
+static int send_abort(struct c4iw_ep *ep)
+{
+	if (!ep->com.qp || !ep->com.qp->srq) {
+		send_abort_req(ep);
+		return 0;
+	}
+	set_bit(ABORT_REQ_IN_PROGRESS, &ep->com.flags);
+	read_tcb(ep);
+	return 0;
+}
+
 static int send_connect(struct c4iw_ep *ep)
 {
 	struct cpl_act_open_req *req = NULL;
@@ -912,7 +951,7 @@
 	mpalen = sizeof(*mpa) + ep->plen;
 	if (mpa_rev_to_use == 2)
 		mpalen += sizeof(struct mpa_v2_conn_params);
-	wrlen = roundup(mpalen + sizeof *req, 16);
+	wrlen = roundup(mpalen + sizeof(*req), 16);
 	skb = get_skb(skb, wrlen, GFP_KERNEL);
 	if (!skb) {
 		connect_reply_upcall(ep, -ENOMEM);
@@ -956,8 +995,9 @@
 	}
 
 	if (mpa_rev_to_use == 2) {
-		mpa->private_data_size = htons(ntohs(mpa->private_data_size) +
-					       sizeof (struct mpa_v2_conn_params));
+		mpa->private_data_size =
+			htons(ntohs(mpa->private_data_size) +
+			      sizeof(struct mpa_v2_conn_params));
 		pr_debug("initiator ird %u ord %u\n", ep->ird,
 			 ep->ord);
 		mpa_v2_params.ird = htons((u16)ep->ird);
@@ -1016,7 +1056,7 @@
 	mpalen = sizeof(*mpa) + plen;
 	if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn)
 		mpalen += sizeof(struct mpa_v2_conn_params);
-	wrlen = roundup(mpalen + sizeof *req, 16);
+	wrlen = roundup(mpalen + sizeof(*req), 16);
 
 	skb = get_skb(NULL, wrlen, GFP_KERNEL);
 	if (!skb) {
@@ -1047,8 +1087,9 @@
 
 	if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) {
 		mpa->flags |= MPA_ENHANCED_RDMA_CONN;
-		mpa->private_data_size = htons(ntohs(mpa->private_data_size) +
-					       sizeof (struct mpa_v2_conn_params));
+		mpa->private_data_size =
+			htons(ntohs(mpa->private_data_size) +
+			      sizeof(struct mpa_v2_conn_params));
 		mpa_v2_params.ird = htons(((u16)ep->ird) |
 					  (peer2peer ? MPA_V2_PEER2PEER_MODEL :
 					   0));
@@ -1095,7 +1136,7 @@
 	mpalen = sizeof(*mpa) + plen;
 	if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn)
 		mpalen += sizeof(struct mpa_v2_conn_params);
-	wrlen = roundup(mpalen + sizeof *req, 16);
+	wrlen = roundup(mpalen + sizeof(*req), 16);
 
 	skb = get_skb(NULL, wrlen, GFP_KERNEL);
 	if (!skb) {
@@ -1130,8 +1171,9 @@
 
 	if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) {
 		mpa->flags |= MPA_ENHANCED_RDMA_CONN;
-		mpa->private_data_size = htons(ntohs(mpa->private_data_size) +
-					       sizeof (struct mpa_v2_conn_params));
+		mpa->private_data_size =
+			htons(ntohs(mpa->private_data_size) +
+			      sizeof(struct mpa_v2_conn_params));
 		mpa_v2_params.ird = htons((u16)ep->ird);
 		mpa_v2_params.ord = htons((u16)ep->ord);
 		if (peer2peer && (ep->mpa_attr.p2p_type !=
@@ -1199,7 +1241,7 @@
 	set_emss(ep, tcp_opt);
 
 	/* dealloc the atid */
-	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid);
+	xa_erase_irq(&ep->com.dev->atids, atid);
 	cxgb4_free_atid(t, atid);
 	set_bit(ACT_ESTAB, &ep->com.history);
 
@@ -1852,14 +1894,11 @@
 	return 0;
 }
 
-static void complete_cached_srq_buffers(struct c4iw_ep *ep,
-					__be32 srqidx_status)
+static void complete_cached_srq_buffers(struct c4iw_ep *ep, u32 srqidx)
 {
 	enum chip_type adapter_type;
-	u32 srqidx;
 
 	adapter_type = ep->com.dev->rdev.lldi.adapter_type;
-	srqidx = ABORT_RSS_SRQIDX_G(be32_to_cpu(srqidx_status));
 
 	/*
 	 * If this TCB had a srq buffer cached, then we must complete
@@ -1877,6 +1916,7 @@
 
 static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
 {
+	u32 srqidx;
 	struct c4iw_ep *ep;
 	struct cpl_abort_rpl_rss6 *rpl = cplhdr(skb);
 	int release = 0;
@@ -1888,7 +1928,10 @@
 		return 0;
 	}
 
-	complete_cached_srq_buffers(ep, rpl->srqidx_status);
+	if (ep->com.qp && ep->com.qp->srq) {
+		srqidx = ABORT_RSS_SRQIDX_G(be32_to_cpu(rpl->srqidx_status));
+		complete_cached_srq_buffers(ep, srqidx ? srqidx : ep->srqe_idx);
+	}
 
 	pr_debug("ep %p tid %u\n", ep, ep->hwtid);
 	mutex_lock(&ep->com.mutex);
@@ -1904,8 +1947,10 @@
 	}
 	mutex_unlock(&ep->com.mutex);
 
-	if (release)
+	if (release) {
+		close_complete_upcall(ep, -ECONNRESET);
 		release_ep_resources(ep);
+	}
 	c4iw_put_ep(&ep->com);
 	return 0;
 }
@@ -2059,8 +2104,7 @@
 		}
 		ep->mtu = pdev->mtu;
 		ep->tx_chan = cxgb4_port_chan(pdev);
-		ep->smac_idx = cxgb4_tp_smt_idx(adapter_type,
-						cxgb4_port_viid(pdev));
+		ep->smac_idx = ((struct port_info *)netdev_priv(pdev))->smt_idx;
 		step = cdev->rdev.lldi.ntxq /
 			cdev->rdev.lldi.nchan;
 		ep->txq_idx = cxgb4_port_idx(pdev) * step;
@@ -2074,13 +2118,12 @@
 	} else {
 		pdev = get_real_dev(n->dev);
 		ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t,
-					n, pdev, 0);
+					n, pdev, rt_tos2priority(tos));
 		if (!ep->l2t)
 			goto out;
 		ep->mtu = dst_mtu(dst);
 		ep->tx_chan = cxgb4_port_chan(pdev);
-		ep->smac_idx = cxgb4_tp_smt_idx(adapter_type,
-						cxgb4_port_viid(pdev));
+		ep->smac_idx = ((struct port_info *)netdev_priv(pdev))->smt_idx;
 		step = cdev->rdev.lldi.ntxq /
 			cdev->rdev.lldi.nchan;
 		ep->txq_idx = cxgb4_port_idx(pdev) * step;
@@ -2147,7 +2190,9 @@
 		err = -ENOMEM;
 		goto fail2;
 	}
-	insert_handle(ep->com.dev, &ep->com.dev->atid_idr, ep, ep->atid);
+	err = xa_insert_irq(&ep->com.dev->atids, ep->atid, ep, GFP_KERNEL);
+	if (err)
+		goto fail2a;
 
 	/* find a route */
 	if (ep->com.cm_id->m_local_addr.ss_family == AF_INET) {
@@ -2164,7 +2209,8 @@
 					   laddr6->sin6_addr.s6_addr,
 					   raddr6->sin6_addr.s6_addr,
 					   laddr6->sin6_port,
-					   raddr6->sin6_port, 0,
+					   raddr6->sin6_port,
+					   ep->com.cm_id->tos,
 					   raddr6->sin6_scope_id);
 		iptype = 6;
 		ra = (__u8 *)&raddr6->sin6_addr;
@@ -2198,7 +2244,8 @@
 fail4:
 	dst_release(ep->dst);
 fail3:
-	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
+	xa_erase_irq(&ep->com.dev->atids, ep->atid);
+fail2a:
 	cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
 fail2:
 	/*
@@ -2281,8 +2328,7 @@
 						(const u32 *)
 						&sin6->sin6_addr.s6_addr, 1);
 			}
-			remove_handle(ep->com.dev, &ep->com.dev->atid_idr,
-					atid);
+			xa_erase_irq(&ep->com.dev->atids, atid);
 			cxgb4_free_atid(t, atid);
 			dst_release(ep->dst);
 			cxgb4_l2t_release(ep->l2t);
@@ -2319,7 +2365,7 @@
 		cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, GET_TID(rpl),
 				 ep->com.local_addr.ss_family);
 
-	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid);
+	xa_erase_irq(&ep->com.dev->atids, atid);
 	cxgb4_free_atid(t, atid);
 	dst_release(ep->dst);
 	cxgb4_l2t_release(ep->l2t);
@@ -2376,20 +2422,6 @@
 	enum chip_type adapter_type = ep->com.dev->rdev.lldi.adapter_type;
 
 	pr_debug("ep %p tid %u\n", ep, ep->hwtid);
-
-	skb_get(skb);
-	rpl = cplhdr(skb);
-	if (!is_t4(adapter_type)) {
-		skb_trim(skb, roundup(sizeof(*rpl5), 16));
-		rpl5 = (void *)rpl;
-		INIT_TP_WR(rpl5, ep->hwtid);
-	} else {
-		skb_trim(skb, sizeof(*rpl));
-		INIT_TP_WR(rpl, ep->hwtid);
-	}
-	OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL,
-						    ep->hwtid));
-
 	cxgb_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx,
 		      enable_tcp_timestamps && req->tcpopt.tstamp,
 		      (ep->com.remote_addr.ss_family == AF_INET) ? 0 : 1);
@@ -2435,6 +2467,20 @@
 		if (tcph->ece && tcph->cwr)
 			opt2 |= CCTRL_ECN_V(1);
 	}
+
+	skb_get(skb);
+	rpl = cplhdr(skb);
+	if (!is_t4(adapter_type)) {
+		skb_trim(skb, roundup(sizeof(*rpl5), 16));
+		rpl5 = (void *)rpl;
+		INIT_TP_WR(rpl5, ep->hwtid);
+	} else {
+		skb_trim(skb, sizeof(*rpl));
+		INIT_TP_WR(rpl, ep->hwtid);
+	}
+	OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL,
+						    ep->hwtid));
+
 	if (CHELSIO_CHIP_VERSION(adapter_type) > CHELSIO_T4) {
 		u32 isn = (prandom_u32() & ~7UL) - 1;
 		opt2 |= T5_OPT_2_VALID_F;
@@ -2479,7 +2525,7 @@
 	u16 peer_mss = ntohs(req->tcpopt.mss);
 	int iptype;
 	unsigned short hdrs;
-	u8 tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid));
+	u8 tos;
 
 	parent_ep = (struct c4iw_ep *)get_ep_from_stid(dev, stid);
 	if (!parent_ep) {
@@ -2493,6 +2539,11 @@
 		goto reject;
 	}
 
+	if (parent_ep->com.cm_id->tos_set)
+		tos = parent_ep->com.cm_id->tos;
+	else
+		tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid));
+
 	cxgb_get_4tuple(req, parent_ep->com.dev->rdev.lldi.adapter_type,
 			&iptype, local_ip, peer_ip, &local_port, &peer_port);
 
@@ -2512,7 +2563,7 @@
 			 ntohs(peer_port), peer_mss);
 		dst = cxgb_find_route6(&dev->rdev.lldi, get_real_dev,
 				local_ip, peer_ip, local_port, peer_port,
-				PASS_OPEN_TOS_G(ntohl(req->tos_stid)),
+				tos,
 				((struct sockaddr_in6 *)
 				 &parent_ep->com.local_addr)->sin6_scope_id);
 	}
@@ -2743,6 +2794,21 @@
 	return 0;
 }
 
+static void finish_peer_abort(struct c4iw_dev *dev, struct c4iw_ep *ep)
+{
+	complete_cached_srq_buffers(ep, ep->srqe_idx);
+	if (ep->com.cm_id && ep->com.qp) {
+		struct c4iw_qp_attributes attrs;
+
+		attrs.next_state = C4IW_QP_STATE_ERROR;
+		c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+			       C4IW_QP_ATTR_NEXT_STATE,	&attrs, 1);
+	}
+	peer_abort_upcall(ep);
+	release_ep_resources(ep);
+	c4iw_put_ep(&ep->com);
+}
+
 static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
 {
 	struct cpl_abort_req_rss6 *req = cplhdr(skb);
@@ -2753,6 +2819,7 @@
 	int release = 0;
 	unsigned int tid = GET_TID(req);
 	u8 status;
+	u32 srqidx;
 
 	u32 len = roundup(sizeof(struct cpl_abort_rpl), 16);
 
@@ -2772,8 +2839,6 @@
 		goto deref_ep;
 	}
 
-	complete_cached_srq_buffers(ep, req->srqidx_status);
-
 	pr_debug("ep %p tid %u state %u\n", ep, ep->hwtid,
 		 ep->com.state);
 	set_bit(PEER_ABORT, &ep->com.history);
@@ -2796,7 +2861,8 @@
 		break;
 	case MPA_REQ_SENT:
 		(void)stop_ep_timer(ep);
-		if (mpa_rev == 1 || (mpa_rev == 2 && ep->tried_with_mpa_v1))
+		if (status != CPL_ERR_CONN_RESET || mpa_rev == 1 ||
+		    (mpa_rev == 2 && ep->tried_with_mpa_v1))
 			connect_reply_upcall(ep, -ECONNRESET);
 		else {
 			/*
@@ -2821,6 +2887,23 @@
 		stop_ep_timer(ep);
 		/*FALLTHROUGH*/
 	case FPDU_MODE:
+		if (ep->com.qp && ep->com.qp->srq) {
+			srqidx = ABORT_RSS_SRQIDX_G(
+					be32_to_cpu(req->srqidx_status));
+			if (srqidx) {
+				complete_cached_srq_buffers(ep,
+							    req->srqidx_status);
+			} else {
+				/* Hold ep ref until finish_peer_abort() */
+				c4iw_get_ep(&ep->com);
+				__state_set(&ep->com, ABORTING);
+				set_bit(PEER_ABORT_IN_PROGRESS, &ep->com.flags);
+				read_tcb(ep);
+				break;
+
+			}
+		}
+
 		if (ep->com.cm_id && ep->com.qp) {
 			attrs.next_state = C4IW_QP_STATE_ERROR;
 			ret = c4iw_modify_qp(ep->com.qp->rhp,
@@ -2872,7 +2955,7 @@
 					(const u32 *)&sin6->sin6_addr.s6_addr,
 					1);
 		}
-		remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid);
+		xa_erase_irq(&ep->com.dev->hwtids, ep->hwtid);
 		cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid,
 				 ep->com.local_addr.ss_family);
 		dst_release(ep->dst);
@@ -2944,15 +3027,18 @@
 
 	ep = get_ep_from_tid(dev, tid);
 
-	if (ep && ep->com.qp) {
-		pr_warn("TERM received tid %u qpid %u\n",
-			tid, ep->com.qp->wq.sq.qid);
-		attrs.next_state = C4IW_QP_STATE_TERMINATE;
-		c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
-			       C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+	if (ep) {
+		if (ep->com.qp) {
+			pr_warn("TERM received tid %u qpid %u\n", tid,
+				ep->com.qp->wq.sq.qid);
+			attrs.next_state = C4IW_QP_STATE_TERMINATE;
+			c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+				       C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+		}
+
+		c4iw_put_ep(&ep->com);
 	} else
 		pr_warn("TERM received tid %u no ep/qp\n", tid);
-	c4iw_put_ep(&ep->com);
 
 	return 0;
 }
@@ -3145,17 +3231,22 @@
 	int found = 0;
 	struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr;
 	struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr;
+	const struct in_ifaddr *ifa;
 
 	ind = in_dev_get(dev->rdev.lldi.ports[0]);
 	if (!ind)
 		return -EADDRNOTAVAIL;
-	for_primary_ifa(ind) {
+	rcu_read_lock();
+	in_dev_for_each_ifa_rcu(ifa, ind) {
+		if (ifa->ifa_flags & IFA_F_SECONDARY)
+			continue;
 		laddr->sin_addr.s_addr = ifa->ifa_address;
 		raddr->sin_addr.s_addr = ifa->ifa_address;
 		found = 1;
 		break;
 	}
-	endfor_ifa(ind);
+	rcu_read_unlock();
+
 	in_dev_put(ind);
 	return found ? 0 : -EADDRNOTAVAIL;
 }
@@ -3264,7 +3355,9 @@
 		err = -ENOMEM;
 		goto fail2;
 	}
-	insert_handle(dev, &dev->atid_idr, ep, ep->atid);
+	err = xa_insert_irq(&dev->atids, ep->atid, ep, GFP_KERNEL);
+	if (err)
+		goto fail5;
 
 	memcpy(&ep->com.local_addr, &cm_id->m_local_addr,
 	       sizeof(ep->com.local_addr));
@@ -3320,7 +3413,7 @@
 					   laddr6->sin6_addr.s6_addr,
 					   raddr6->sin6_addr.s6_addr,
 					   laddr6->sin6_port,
-					   raddr6->sin6_port, 0,
+					   raddr6->sin6_port, cm_id->tos,
 					   raddr6->sin6_scope_id);
 	}
 	if (!ep->dst) {
@@ -3352,7 +3445,8 @@
 fail4:
 	dst_release(ep->dst);
 fail3:
-	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
+	xa_erase_irq(&ep->com.dev->atids, ep->atid);
+fail5:
 	cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
 fail2:
 	skb_queue_purge(&ep->com.ep_skb_list);
@@ -3475,7 +3569,9 @@
 		err = -ENOMEM;
 		goto fail2;
 	}
-	insert_handle(dev, &dev->stid_idr, ep, ep->stid);
+	err = xa_insert_irq(&dev->stids, ep->stid, ep, GFP_KERNEL);
+	if (err)
+		goto fail3;
 
 	state_set(&ep->com, LISTEN);
 	if (ep->com.local_addr.ss_family == AF_INET)
@@ -3486,7 +3582,8 @@
 		cm_id->provider_data = ep;
 		goto out;
 	}
-	remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid);
+	xa_erase_irq(&ep->com.dev->stids, ep->stid);
+fail3:
 	cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid,
 			ep->com.local_addr.ss_family);
 fail2:
@@ -3525,7 +3622,7 @@
 		cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
 				   (const u32 *)&sin6->sin6_addr.s6_addr, 1);
 	}
-	remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid);
+	xa_erase_irq(&ep->com.dev->stids, ep->stid);
 	cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid,
 			ep->com.local_addr.ss_family);
 done:
@@ -3608,7 +3705,6 @@
 	if (close) {
 		if (abrupt) {
 			set_bit(EP_DISC_ABORT, &ep->com.history);
-			close_complete_upcall(ep, -ECONNRESET);
 			ret = send_abort(ep);
 		} else {
 			set_bit(EP_DISC_CLOSE, &ep->com.history);
@@ -3686,7 +3782,7 @@
 		cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
 				   (const u32 *)&sin6->sin6_addr.s6_addr, 1);
 	}
-	remove_handle(dev, &dev->atid_idr, atid);
+	xa_erase_irq(&dev->atids, atid);
 	cxgb4_free_atid(dev->rdev.lldi.tids, atid);
 	dst_release(ep->dst);
 	cxgb4_l2t_release(ep->l2t);
@@ -3719,6 +3815,80 @@
 	return;
 }
 
+static inline u64 t4_tcb_get_field64(__be64 *tcb, u16 word)
+{
+	u64 tlo = be64_to_cpu(tcb[((31 - word) / 2)]);
+	u64 thi = be64_to_cpu(tcb[((31 - word) / 2) - 1]);
+	u64 t;
+	u32 shift = 32;
+
+	t = (thi << shift) | (tlo >> shift);
+
+	return t;
+}
+
+static inline u32 t4_tcb_get_field32(__be64 *tcb, u16 word, u32 mask, u32 shift)
+{
+	u32 v;
+	u64 t = be64_to_cpu(tcb[(31 - word) / 2]);
+
+	if (word & 0x1)
+		shift += 32;
+	v = (t >> shift) & mask;
+	return v;
+}
+
+static int read_tcb_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct cpl_get_tcb_rpl *rpl = cplhdr(skb);
+	__be64 *tcb = (__be64 *)(rpl + 1);
+	unsigned int tid = GET_TID(rpl);
+	struct c4iw_ep *ep;
+	u64 t_flags_64;
+	u32 rx_pdu_out;
+
+	ep = get_ep_from_tid(dev, tid);
+	if (!ep)
+		return 0;
+	/* Examine the TF_RX_PDU_OUT (bit 49 of the t_flags) in order to
+	 * determine if there's a rx PDU feedback event pending.
+	 *
+	 * If that bit is set, it means we'll need to re-read the TCB's
+	 * rq_start value. The final value is the one present in a TCB
+	 * with the TF_RX_PDU_OUT bit cleared.
+	 */
+
+	t_flags_64 = t4_tcb_get_field64(tcb, TCB_T_FLAGS_W);
+	rx_pdu_out = (t_flags_64 & TF_RX_PDU_OUT_V(1)) >> TF_RX_PDU_OUT_S;
+
+	c4iw_put_ep(&ep->com); /* from get_ep_from_tid() */
+	c4iw_put_ep(&ep->com); /* from read_tcb() */
+
+	/* If TF_RX_PDU_OUT bit is set, re-read the TCB */
+	if (rx_pdu_out) {
+		if (++ep->rx_pdu_out_cnt >= 2) {
+			WARN_ONCE(1, "tcb re-read() reached the guard limit, finishing the cleanup\n");
+			goto cleanup;
+		}
+		read_tcb(ep);
+		return 0;
+	}
+
+	ep->srqe_idx = t4_tcb_get_field32(tcb, TCB_RQ_START_W, TCB_RQ_START_W,
+			TCB_RQ_START_S);
+cleanup:
+	pr_debug("ep %p tid %u %016x\n", ep, ep->hwtid, ep->srqe_idx);
+
+	if (test_bit(PEER_ABORT_IN_PROGRESS, &ep->com.flags))
+		finish_peer_abort(dev, ep);
+	else if (test_bit(ABORT_REQ_IN_PROGRESS, &ep->com.flags))
+		send_abort_req(ep);
+	else
+		WARN_ONCE(1, "unexpected state!");
+
+	return 0;
+}
+
 static int deferred_fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb)
 {
 	struct cpl_fw6_msg *rpl = cplhdr(skb);
@@ -3945,7 +4115,7 @@
 	} else {
 		vlan_eh = (struct vlan_ethhdr *)(req + 1);
 		iph = (struct iphdr *)(vlan_eh + 1);
-		skb->vlan_tci = ntohs(cpl->vlan);
+		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(cpl->vlan));
 	}
 
 	if (iph->version != 0x4)
@@ -4039,6 +4209,7 @@
 	[CPL_CLOSE_CON_RPL] = close_con_rpl,
 	[CPL_RDMA_TERMINATE] = terminate,
 	[CPL_FW4_ACK] = fw4_ack,
+	[CPL_GET_TCB_RPL] = read_tcb_rpl,
 	[CPL_FW6_MSG] = deferred_fw6_msg,
 	[CPL_RX_PKT] = rx_pkt,
 	[FAKE_CPL_PUT_EP_SAFE] = _put_ep_safe,
@@ -4270,6 +4441,7 @@
 	[CPL_RDMA_TERMINATE] = sched,
 	[CPL_FW4_ACK] = sched,
 	[CPL_SET_TCB_RPL] = set_tcb_rpl,
+	[CPL_GET_TCB_RPL] = sched,
 	[CPL_FW6_MSG] = fw6_msg,
 	[CPL_RX_PKT] = sched
 };
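
The cm.c hunks above drop the driver's IDR-based id tables (insert_handle()/remove_handle()) in favour of XArrays keyed by hwtid, atid and stid. A minimal, self-contained sketch of the three XArray calls involved is shown below; the obj struct and table name are illustrative only and are not part of the patch.

#include <linux/xarray.h>

struct obj {
	unsigned long id;	/* e.g. an hwtid, atid or stid */
};

/* XA_FLAGS_LOCK_IRQ makes the XArray's internal lock IRQ-safe, matching
 * the xa_*_irq() accessors used throughout the patch. */
static DEFINE_XARRAY_FLAGS(obj_table, XA_FLAGS_LOCK_IRQ);

static int obj_track(struct obj *o)
{
	/* Replaces insert_handle(): fails with -EBUSY if the id is taken. */
	return xa_insert_irq(&obj_table, o->id, o, GFP_KERNEL);
}

static struct obj *obj_lookup(unsigned long id)
{
	/* Replaces idr_find(); a plain load needs no external lock. */
	return xa_load(&obj_table, id);
}

static void obj_untrack(struct obj *o)
{
	/* Replaces remove_handle(). */
	xa_erase_irq(&obj_table, o->id);
}
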
diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c
index 6d30427..b1bb61c 100644
--- a/drivers/infiniband/hw/cxgb4/cq.c
+++ b/drivers/infiniband/hw/cxgb4/cq.c
@@ -30,18 +30,19 @@
  * SOFTWARE.
  */
 
+#include <rdma/uverbs_ioctl.h>
+
 #include "iw_cxgb4.h"
 
-static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
-		      struct c4iw_dev_ucontext *uctx, struct sk_buff *skb,
-		      struct c4iw_wr_wait *wr_waitp)
+static void destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
+		       struct c4iw_dev_ucontext *uctx, struct sk_buff *skb,
+		       struct c4iw_wr_wait *wr_waitp)
 {
 	struct fw_ri_res_wr *res_wr;
 	struct fw_ri_res *res;
 	int wr_len;
-	int ret;
 
-	wr_len = sizeof *res_wr + sizeof *res;
+	wr_len = sizeof(*res_wr) + sizeof(*res);
 	set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
 
 	res_wr = __skb_put_zero(skb, wr_len);
@@ -57,14 +58,13 @@
 	res->u.cq.iqid = cpu_to_be32(cq->cqid);
 
 	c4iw_init_wr_wait(wr_waitp);
-	ret = c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, 0, __func__);
+	c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, 0, __func__);
 
 	kfree(cq->sw_queue);
 	dma_free_coherent(&(rdev->lldi.pdev->dev),
 			  cq->memsize, cq->queue,
 			  dma_unmap_addr(cq, mapping));
 	c4iw_put_cqid(rdev, cq->cqid, uctx);
-	return ret;
 }
 
 static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
@@ -102,7 +102,6 @@
 		goto err3;
 	}
 	dma_unmap_addr_set(cq, mapping, cq->dma_addr);
-	memset(cq->queue, 0, cq->memsize);
 
 	if (user && ucontext->is_32b_cqe) {
 		cq->qp_errp = &((struct t4_status_page *)
@@ -115,7 +114,7 @@
 	}
 
 	/* build fw_ri_res_wr */
-	wr_len = sizeof *res_wr + sizeof *res;
+	wr_len = sizeof(*res_wr) + sizeof(*res);
 
 	skb = alloc_skb(wr_len, GFP_KERNEL);
 	if (!skb) {
@@ -161,7 +160,7 @@
 	cq->gts = rdev->lldi.gts_reg;
 	cq->rdev = rdev;
 
-	cq->bar2_va = c4iw_bar2_addrs(rdev, cq->cqid, T4_BAR2_QTYPE_INGRESS,
+	cq->bar2_va = c4iw_bar2_addrs(rdev, cq->cqid, CXGB4_BAR2_QTYPE_INGRESS,
 				      &cq->bar2_qid,
 				      user ? &cq->bar2_pa : NULL);
 	if (user && !cq->bar2_pa) {
@@ -968,7 +967,7 @@
 	return !err || err == -ENODATA ? npolled : err;
 }
 
-int c4iw_destroy_cq(struct ib_cq *ib_cq)
+void c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
 	struct c4iw_cq *chp;
 	struct c4iw_ucontext *ucontext;
@@ -976,55 +975,46 @@
 	pr_debug("ib_cq %p\n", ib_cq);
 	chp = to_c4iw_cq(ib_cq);
 
-	remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid);
+	xa_erase_irq(&chp->rhp->cqs, chp->cq.cqid);
 	atomic_dec(&chp->refcnt);
 	wait_event(chp->wait, !atomic_read(&chp->refcnt));
 
-	ucontext = ib_cq->uobject ? to_c4iw_ucontext(ib_cq->uobject->context)
-				  : NULL;
+	ucontext = rdma_udata_to_drv_context(udata, struct c4iw_ucontext,
+					     ibucontext);
 	destroy_cq(&chp->rhp->rdev, &chp->cq,
 		   ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx,
 		   chp->destroy_skb, chp->wr_waitp);
 	c4iw_put_wr_wait(chp->wr_waitp);
-	kfree(chp);
-	return 0;
 }
 
-struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
-			     const struct ib_cq_init_attr *attr,
-			     struct ib_ucontext *ib_context,
-			     struct ib_udata *udata)
+int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		   struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	int entries = attr->cqe;
 	int vector = attr->comp_vector;
-	struct c4iw_dev *rhp;
-	struct c4iw_cq *chp;
+	struct c4iw_dev *rhp = to_c4iw_dev(ibcq->device);
+	struct c4iw_cq *chp = to_c4iw_cq(ibcq);
 	struct c4iw_create_cq ucmd;
 	struct c4iw_create_cq_resp uresp;
-	struct c4iw_ucontext *ucontext = NULL;
 	int ret, wr_len;
 	size_t memsize, hwentries;
 	struct c4iw_mm_entry *mm, *mm2;
+	struct c4iw_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct c4iw_ucontext, ibucontext);
 
 	pr_debug("ib_dev %p entries %d\n", ibdev, entries);
 	if (attr->flags)
-		return ERR_PTR(-EINVAL);
-
-	rhp = to_c4iw_dev(ibdev);
+		return -EINVAL;
 
 	if (vector >= rhp->rdev.lldi.nciq)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
-	if (ib_context) {
-		ucontext = to_c4iw_ucontext(ib_context);
+	if (udata) {
 		if (udata->inlen < sizeof(ucmd))
 			ucontext->is_32b_cqe = 1;
 	}
 
-	chp = kzalloc(sizeof(*chp), GFP_KERNEL);
-	if (!chp)
-		return ERR_PTR(-ENOMEM);
-
 	chp->wr_waitp = c4iw_alloc_wr_wait(GFP_KERNEL);
 	if (!chp->wr_waitp) {
 		ret = -ENOMEM;
@@ -1068,7 +1058,7 @@
 	/*
 	 * memsize must be a multiple of the page size if its a user cq.
 	 */
-	if (ucontext)
+	if (udata)
 		memsize = roundup(memsize, PAGE_SIZE);
 
 	chp->cq.size = hwentries;
@@ -1088,16 +1078,16 @@
 	spin_lock_init(&chp->comp_handler_lock);
 	atomic_set(&chp->refcnt, 1);
 	init_waitqueue_head(&chp->wait);
-	ret = insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid);
+	ret = xa_insert_irq(&rhp->cqs, chp->cq.cqid, chp, GFP_KERNEL);
 	if (ret)
 		goto err_destroy_cq;
 
 	if (ucontext) {
 		ret = -ENOMEM;
-		mm = kmalloc(sizeof *mm, GFP_KERNEL);
+		mm = kmalloc(sizeof(*mm), GFP_KERNEL);
 		if (!mm)
 			goto err_remove_handle;
-		mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
+		mm2 = kmalloc(sizeof(*mm2), GFP_KERNEL);
 		if (!mm2)
 			goto err_free_mm;
 
@@ -1134,16 +1124,17 @@
 		mm2->len = PAGE_SIZE;
 		insert_mmap(ucontext, mm2);
 	}
-	pr_debug("cqid 0x%0x chp %p size %u memsize %zu, dma_addr 0x%0llx\n",
-		 chp->cq.cqid, chp, chp->cq.size,
-		 chp->cq.memsize, (unsigned long long)chp->cq.dma_addr);
-	return &chp->ibcq;
+
+	pr_debug("cqid 0x%0x chp %p size %u memsize %zu, dma_addr %pad\n",
+		 chp->cq.cqid, chp, chp->cq.size, chp->cq.memsize,
+		 &chp->cq.dma_addr);
+	return 0;
 err_free_mm2:
 	kfree(mm2);
 err_free_mm:
 	kfree(mm);
 err_remove_handle:
-	remove_handle(rhp, &rhp->cqidr, chp->cq.cqid);
+	xa_erase_irq(&rhp->cqs, chp->cq.cqid);
 err_destroy_cq:
 	destroy_cq(&chp->rhp->rdev, &chp->cq,
 		   ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
@@ -1153,8 +1144,7 @@
 err_free_wr_wait:
 	c4iw_put_wr_wait(chp->wr_waitp);
 err_free_chp:
-	kfree(chp);
-	return ERR_PTR(ret);
+	return ret;
 }
 
 int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
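
The cq.c conversion above follows the RDMA core's embedded-object model: the core allocates struct c4iw_cq around the ib_cq (see the INIT_RDMA_OBJ_SIZE entries added in provider.c further down), so create_cq now returns an int, destroy_cq returns void, and the user context is recovered from the udata rather than from ib_cq->uobject. A generic sketch of that calling convention, using made-up my_* names rather than the driver's, looks roughly like this:

#include <rdma/ib_verbs.h>
#include <rdma/uverbs_ioctl.h>

struct my_ucontext {
	struct ib_ucontext ibucontext;
};

struct my_cq {
	struct ib_cq ibcq;	/* allocated by the core; driver state follows */
};

static int my_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
			struct ib_udata *udata)
{
	struct my_cq *cq = container_of(ibcq, struct my_cq, ibcq);
	/* NULL for kernel consumers, the caller's context for userspace. */
	struct my_ucontext *uctx = rdma_udata_to_drv_context(
		udata, struct my_ucontext, ibucontext);

	if (attr->flags)
		return -EINVAL;
	pr_debug("cq %p uctx %p\n", cq, uctx);
	/* ... set up hardware state in cq here ... */
	return 0;
}

static void my_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
{
	struct my_cq *cq = container_of(ibcq, struct my_cq, ibcq);

	pr_debug("destroying cq %p\n", cq);
	/* ... tear down hardware state; the core frees cq afterwards ... */
}
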
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index c13c0ba..599340c 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -81,14 +81,6 @@
 	int pos;
 };
 
-static int count_idrs(int id, void *p, void *data)
-{
-	int *countp = data;
-
-	*countp = *countp + 1;
-	return 0;
-}
-
 static ssize_t debugfs_read(struct file *file, char __user *buf, size_t count,
 			    loff_t *ppos)
 {
@@ -250,13 +242,11 @@
 	}
 }
 
-static int dump_qp(int id, void *p, void *data)
+static int dump_qp(unsigned long id, struct c4iw_qp *qp,
+		   struct c4iw_debugfs_data *qpd)
 {
-	struct c4iw_qp *qp = p;
-	struct c4iw_debugfs_data *qpd = data;
 	int space;
 	int cc;
-
 	if (id != qp->wq.sq.qid)
 		return 0;
 
@@ -335,19 +325,24 @@
 
 static int qp_open(struct inode *inode, struct file *file)
 {
+	struct c4iw_qp *qp;
 	struct c4iw_debugfs_data *qpd;
+	unsigned long index;
 	int count = 1;
 
-	qpd = kmalloc(sizeof *qpd, GFP_KERNEL);
+	qpd = kmalloc(sizeof(*qpd), GFP_KERNEL);
 	if (!qpd)
 		return -ENOMEM;
 
 	qpd->devp = inode->i_private;
 	qpd->pos = 0;
 
-	spin_lock_irq(&qpd->devp->lock);
-	idr_for_each(&qpd->devp->qpidr, count_idrs, &count);
-	spin_unlock_irq(&qpd->devp->lock);
+	/*
+	 * No need to lock; we drop the lock to call vmalloc so it's racy
+	 * anyway.  Someone who cares should switch this over to seq_file
+	 */
+	xa_for_each(&qpd->devp->qps, index, qp)
+		count++;
 
 	qpd->bufsize = count * 180;
 	qpd->buf = vmalloc(qpd->bufsize);
@@ -356,9 +351,10 @@
 		return -ENOMEM;
 	}
 
-	spin_lock_irq(&qpd->devp->lock);
-	idr_for_each(&qpd->devp->qpidr, dump_qp, qpd);
-	spin_unlock_irq(&qpd->devp->lock);
+	xa_lock_irq(&qpd->devp->qps);
+	xa_for_each(&qpd->devp->qps, index, qp)
+		dump_qp(index, qp, qpd);
+	xa_unlock_irq(&qpd->devp->qps);
 
 	qpd->buf[qpd->pos++] = 0;
 	file->private_data = qpd;
@@ -373,9 +369,8 @@
 	.llseek  = default_llseek,
 };
 
-static int dump_stag(int id, void *p, void *data)
+static int dump_stag(unsigned long id, struct c4iw_debugfs_data *stagd)
 {
-	struct c4iw_debugfs_data *stagd = data;
 	int space;
 	int cc;
 	struct fw_ri_tpte tpte;
@@ -424,10 +419,12 @@
 static int stag_open(struct inode *inode, struct file *file)
 {
 	struct c4iw_debugfs_data *stagd;
+	void *p;
+	unsigned long index;
 	int ret = 0;
 	int count = 1;
 
-	stagd = kmalloc(sizeof *stagd, GFP_KERNEL);
+	stagd = kmalloc(sizeof(*stagd), GFP_KERNEL);
 	if (!stagd) {
 		ret = -ENOMEM;
 		goto out;
@@ -435,9 +432,8 @@
 	stagd->devp = inode->i_private;
 	stagd->pos = 0;
 
-	spin_lock_irq(&stagd->devp->lock);
-	idr_for_each(&stagd->devp->mmidr, count_idrs, &count);
-	spin_unlock_irq(&stagd->devp->lock);
+	xa_for_each(&stagd->devp->mrs, index, p)
+		count++;
 
 	stagd->bufsize = count * 256;
 	stagd->buf = vmalloc(stagd->bufsize);
@@ -446,9 +442,10 @@
 		goto err1;
 	}
 
-	spin_lock_irq(&stagd->devp->lock);
-	idr_for_each(&stagd->devp->mmidr, dump_stag, stagd);
-	spin_unlock_irq(&stagd->devp->lock);
+	xa_lock_irq(&stagd->devp->mrs);
+	xa_for_each(&stagd->devp->mrs, index, p)
+		dump_stag(index, stagd);
+	xa_unlock_irq(&stagd->devp->mrs);
 
 	stagd->buf[stagd->pos++] = 0;
 	file->private_data = stagd;
@@ -558,10 +555,8 @@
 	.write   = stats_clear,
 };
 
-static int dump_ep(int id, void *p, void *data)
+static int dump_ep(struct c4iw_ep *ep, struct c4iw_debugfs_data *epd)
 {
-	struct c4iw_ep *ep = p;
-	struct c4iw_debugfs_data *epd = data;
 	int space;
 	int cc;
 
@@ -617,10 +612,9 @@
 	return 0;
 }
 
-static int dump_listen_ep(int id, void *p, void *data)
+static
+int dump_listen_ep(struct c4iw_listen_ep *ep, struct c4iw_debugfs_data *epd)
 {
-	struct c4iw_listen_ep *ep = p;
-	struct c4iw_debugfs_data *epd = data;
 	int space;
 	int cc;
 
@@ -674,6 +668,9 @@
 
 static int ep_open(struct inode *inode, struct file *file)
 {
+	struct c4iw_ep *ep;
+	struct c4iw_listen_ep *lep;
+	unsigned long index;
 	struct c4iw_debugfs_data *epd;
 	int ret = 0;
 	int count = 1;
@@ -686,11 +683,12 @@
 	epd->devp = inode->i_private;
 	epd->pos = 0;
 
-	spin_lock_irq(&epd->devp->lock);
-	idr_for_each(&epd->devp->hwtid_idr, count_idrs, &count);
-	idr_for_each(&epd->devp->atid_idr, count_idrs, &count);
-	idr_for_each(&epd->devp->stid_idr, count_idrs, &count);
-	spin_unlock_irq(&epd->devp->lock);
+	xa_for_each(&epd->devp->hwtids, index, ep)
+		count++;
+	xa_for_each(&epd->devp->atids, index, ep)
+		count++;
+	xa_for_each(&epd->devp->stids, index, lep)
+		count++;
 
 	epd->bufsize = count * 240;
 	epd->buf = vmalloc(epd->bufsize);
@@ -699,11 +697,18 @@
 		goto err1;
 	}
 
-	spin_lock_irq(&epd->devp->lock);
-	idr_for_each(&epd->devp->hwtid_idr, dump_ep, epd);
-	idr_for_each(&epd->devp->atid_idr, dump_ep, epd);
-	idr_for_each(&epd->devp->stid_idr, dump_listen_ep, epd);
-	spin_unlock_irq(&epd->devp->lock);
+	xa_lock_irq(&epd->devp->hwtids);
+	xa_for_each(&epd->devp->hwtids, index, ep)
+		dump_ep(ep, epd);
+	xa_unlock_irq(&epd->devp->hwtids);
+	xa_lock_irq(&epd->devp->atids);
+	xa_for_each(&epd->devp->atids, index, ep)
+		dump_ep(ep, epd);
+	xa_unlock_irq(&epd->devp->atids);
+	xa_lock_irq(&epd->devp->stids);
+	xa_for_each(&epd->devp->stids, index, lep)
+		dump_listen_ep(lep, epd);
+	xa_unlock_irq(&epd->devp->stids);
 
 	file->private_data = epd;
 	goto out;
@@ -720,11 +725,8 @@
 	.read    = debugfs_read,
 };
 
-static int setup_debugfs(struct c4iw_dev *devp)
+static void setup_debugfs(struct c4iw_dev *devp)
 {
-	if (!devp->debugfs_root)
-		return -1;
-
 	debugfs_create_file_size("qps", S_IWUSR, devp->debugfs_root,
 				 (void *)devp, &qp_debugfs_fops, 4096);
 
@@ -740,7 +742,6 @@
 	if (c4iw_wr_log)
 		debugfs_create_file_size("wr_log", S_IWUSR, devp->debugfs_root,
 					 (void *)devp, &wr_log_debugfs_fops, 4096);
-	return 0;
 }
 
 void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev,
@@ -783,6 +784,7 @@
 static int c4iw_rdev_open(struct c4iw_rdev *rdev)
 {
 	int err;
+	unsigned int factor;
 
 	c4iw_init_dev_ucontext(rdev, &rdev->uctx);
 
@@ -806,8 +808,18 @@
 		return -EINVAL;
 	}
 
-	rdev->qpmask = rdev->lldi.udb_density - 1;
-	rdev->cqmask = rdev->lldi.ucq_density - 1;
+	/* This implementation requires a sge_host_page_size <= PAGE_SIZE. */
+	if (rdev->lldi.sge_host_page_size > PAGE_SIZE) {
+		pr_err("%s: unsupported sge host page size %u\n",
+		       pci_name(rdev->lldi.pdev),
+		       rdev->lldi.sge_host_page_size);
+		return -EINVAL;
+	}
+
+	factor = PAGE_SIZE / rdev->lldi.sge_host_page_size;
+	rdev->qpmask = (rdev->lldi.udb_density * factor) - 1;
+	rdev->cqmask = (rdev->lldi.ucq_density * factor) - 1;
+
 	pr_debug("dev %s stag start 0x%0x size 0x%0x num stags %d pbl start 0x%0x size 0x%0x rq start 0x%0x size 0x%0x qp qid start %u size %u cq qid start %u size %u srq size %u\n",
 		 pci_name(rdev->lldi.pdev), rdev->lldi.vr->stag.start,
 		 rdev->lldi.vr->stag.size, c4iw_num_stags(rdev),
@@ -924,16 +936,12 @@
 void c4iw_dealloc(struct uld_ctx *ctx)
 {
 	c4iw_rdev_close(&ctx->dev->rdev);
-	WARN_ON_ONCE(!idr_is_empty(&ctx->dev->cqidr));
-	idr_destroy(&ctx->dev->cqidr);
-	WARN_ON_ONCE(!idr_is_empty(&ctx->dev->qpidr));
-	idr_destroy(&ctx->dev->qpidr);
-	WARN_ON_ONCE(!idr_is_empty(&ctx->dev->mmidr));
-	idr_destroy(&ctx->dev->mmidr);
-	wait_event(ctx->dev->wait, idr_is_empty(&ctx->dev->hwtid_idr));
-	idr_destroy(&ctx->dev->hwtid_idr);
-	idr_destroy(&ctx->dev->stid_idr);
-	idr_destroy(&ctx->dev->atid_idr);
+	WARN_ON(!xa_empty(&ctx->dev->cqs));
+	WARN_ON(!xa_empty(&ctx->dev->qps));
+	WARN_ON(!xa_empty(&ctx->dev->mrs));
+	wait_event(ctx->dev->wait, xa_empty(&ctx->dev->hwtids));
+	WARN_ON(!xa_empty(&ctx->dev->stids));
+	WARN_ON(!xa_empty(&ctx->dev->atids));
 	if (ctx->dev->rdev.bar2_kva)
 		iounmap(ctx->dev->rdev.bar2_kva);
 	if (ctx->dev->rdev.oc_mw_kva)
@@ -970,7 +978,7 @@
 		pr_info("%s: On-Chip Queues not supported on this device\n",
 			pci_name(infop->pdev));
 
-	devp = (struct c4iw_dev *)ib_alloc_device(sizeof(*devp));
+	devp = ib_alloc_device(c4iw_dev, ibdev);
 	if (!devp) {
 		pr_err("Cannot allocate ib device\n");
 		return ERR_PTR(-ENOMEM);
@@ -1037,13 +1045,12 @@
 		return ERR_PTR(ret);
 	}
 
-	idr_init(&devp->cqidr);
-	idr_init(&devp->qpidr);
-	idr_init(&devp->mmidr);
-	idr_init(&devp->hwtid_idr);
-	idr_init(&devp->stid_idr);
-	idr_init(&devp->atid_idr);
-	spin_lock_init(&devp->lock);
+	xa_init_flags(&devp->cqs, XA_FLAGS_LOCK_IRQ);
+	xa_init_flags(&devp->qps, XA_FLAGS_LOCK_IRQ);
+	xa_init_flags(&devp->mrs, XA_FLAGS_LOCK_IRQ);
+	xa_init_flags(&devp->hwtids, XA_FLAGS_LOCK_IRQ);
+	xa_init_flags(&devp->atids, XA_FLAGS_LOCK_IRQ);
+	xa_init_flags(&devp->stids, XA_FLAGS_LOCK_IRQ);
 	mutex_init(&devp->rdev.stats.lock);
 	mutex_init(&devp->db_mutex);
 	INIT_LIST_HEAD(&devp->db_fc_list);
@@ -1071,7 +1078,7 @@
 		pr_info("Chelsio T4/T5 RDMA Driver - version %s\n",
 			DRV_VERSION);
 
-	ctx = kzalloc(sizeof *ctx, GFP_KERNEL);
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx) {
 		ctx = ERR_PTR(-ENOMEM);
 		goto out;
@@ -1239,10 +1246,9 @@
 	case CXGB4_STATE_START_RECOVERY:
 		pr_info("%s: Fatal Error\n", pci_name(ctx->lldi.pdev));
 		if (ctx->dev) {
-			struct ib_event event;
+			struct ib_event event = {};
 
 			ctx->dev->rdev.flags |= T4_FATAL_ERROR;
-			memset(&event, 0, sizeof event);
 			event.event  = IB_EVENT_DEVICE_FATAL;
 			event.device = &ctx->dev->ibdev;
 			ib_dispatch_event(&event);
@@ -1258,34 +1264,21 @@
 	return 0;
 }
 
-static int disable_qp_db(int id, void *p, void *data)
-{
-	struct c4iw_qp *qp = p;
-
-	t4_disable_wq_db(&qp->wq);
-	return 0;
-}
-
 static void stop_queues(struct uld_ctx *ctx)
 {
-	unsigned long flags;
+	struct c4iw_qp *qp;
+	unsigned long index, flags;
 
-	spin_lock_irqsave(&ctx->dev->lock, flags);
+	xa_lock_irqsave(&ctx->dev->qps, flags);
 	ctx->dev->rdev.stats.db_state_transitions++;
 	ctx->dev->db_state = STOPPED;
-	if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED)
-		idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL);
-	else
+	if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED) {
+		xa_for_each(&ctx->dev->qps, index, qp)
+			t4_disable_wq_db(&qp->wq);
+	} else {
 		ctx->dev->rdev.status_page->db_off = 1;
-	spin_unlock_irqrestore(&ctx->dev->lock, flags);
-}
-
-static int enable_qp_db(int id, void *p, void *data)
-{
-	struct c4iw_qp *qp = p;
-
-	t4_enable_wq_db(&qp->wq);
-	return 0;
+	}
+	xa_unlock_irqrestore(&ctx->dev->qps, flags);
 }
 
 static void resume_rc_qp(struct c4iw_qp *qp)
@@ -1315,18 +1308,21 @@
 
 static void resume_queues(struct uld_ctx *ctx)
 {
-	spin_lock_irq(&ctx->dev->lock);
+	xa_lock_irq(&ctx->dev->qps);
 	if (ctx->dev->db_state != STOPPED)
 		goto out;
 	ctx->dev->db_state = FLOW_CONTROL;
 	while (1) {
 		if (list_empty(&ctx->dev->db_fc_list)) {
+			struct c4iw_qp *qp;
+			unsigned long index;
+
 			WARN_ON(ctx->dev->db_state != FLOW_CONTROL);
 			ctx->dev->db_state = NORMAL;
 			ctx->dev->rdev.stats.db_state_transitions++;
 			if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED) {
-				idr_for_each(&ctx->dev->qpidr, enable_qp_db,
-					     NULL);
+				xa_for_each(&ctx->dev->qps, index, qp)
+					t4_enable_wq_db(&qp->wq);
 			} else {
 				ctx->dev->rdev.status_page->db_off = 0;
 			}
@@ -1338,12 +1334,12 @@
 				resume_a_chunk(ctx);
 			}
 			if (!list_empty(&ctx->dev->db_fc_list)) {
-				spin_unlock_irq(&ctx->dev->lock);
+				xa_unlock_irq(&ctx->dev->qps);
 				if (DB_FC_RESUME_DELAY) {
 					set_current_state(TASK_UNINTERRUPTIBLE);
 					schedule_timeout(DB_FC_RESUME_DELAY);
 				}
-				spin_lock_irq(&ctx->dev->lock);
+				xa_lock_irq(&ctx->dev->qps);
 				if (ctx->dev->db_state != FLOW_CONTROL)
 					break;
 			}
@@ -1352,7 +1348,7 @@
 out:
 	if (ctx->dev->db_state != NORMAL)
 		ctx->dev->rdev.stats.db_fc_interruptions++;
-	spin_unlock_irq(&ctx->dev->lock);
+	xa_unlock_irq(&ctx->dev->qps);
 }
 
 struct qp_list {
@@ -1360,23 +1356,6 @@
 	struct c4iw_qp **qps;
 };
 
-static int add_and_ref_qp(int id, void *p, void *data)
-{
-	struct qp_list *qp_listp = data;
-	struct c4iw_qp *qp = p;
-
-	c4iw_qp_add_ref(&qp->ibqp);
-	qp_listp->qps[qp_listp->idx++] = qp;
-	return 0;
-}
-
-static int count_qps(int id, void *p, void *data)
-{
-	unsigned *countp = data;
-	(*countp)++;
-	return 0;
-}
-
 static void deref_qps(struct qp_list *qp_list)
 {
 	int idx;
@@ -1393,7 +1372,7 @@
 	for (idx = 0; idx < qp_list->idx; idx++) {
 		struct c4iw_qp *qp = qp_list->qps[idx];
 
-		spin_lock_irq(&qp->rhp->lock);
+		xa_lock_irq(&qp->rhp->qps);
 		spin_lock(&qp->lock);
 		ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0],
 					  qp->wq.sq.qid,
@@ -1403,7 +1382,7 @@
 			pr_err("%s: Fatal error - DB overflow recovery failed - error syncing SQ qid %u\n",
 			       pci_name(ctx->lldi.pdev), qp->wq.sq.qid);
 			spin_unlock(&qp->lock);
-			spin_unlock_irq(&qp->rhp->lock);
+			xa_unlock_irq(&qp->rhp->qps);
 			return;
 		}
 		qp->wq.sq.wq_pidx_inc = 0;
@@ -1417,12 +1396,12 @@
 			pr_err("%s: Fatal error - DB overflow recovery failed - error syncing RQ qid %u\n",
 			       pci_name(ctx->lldi.pdev), qp->wq.rq.qid);
 			spin_unlock(&qp->lock);
-			spin_unlock_irq(&qp->rhp->lock);
+			xa_unlock_irq(&qp->rhp->qps);
 			return;
 		}
 		qp->wq.rq.wq_pidx_inc = 0;
 		spin_unlock(&qp->lock);
-		spin_unlock_irq(&qp->rhp->lock);
+		xa_unlock_irq(&qp->rhp->qps);
 
 		/* Wait for the dbfifo to drain */
 		while (cxgb4_dbfifo_count(qp->rhp->rdev.lldi.ports[0], 1) > 0) {
@@ -1434,6 +1413,8 @@
 
 static void recover_queues(struct uld_ctx *ctx)
 {
+	struct c4iw_qp *qp;
+	unsigned long index;
 	int count = 0;
 	struct qp_list qp_list;
 	int ret;
@@ -1451,22 +1432,26 @@
 	}
 
 	/* Count active queues so we can build a list of queues to recover */
-	spin_lock_irq(&ctx->dev->lock);
+	xa_lock_irq(&ctx->dev->qps);
 	WARN_ON(ctx->dev->db_state != STOPPED);
 	ctx->dev->db_state = RECOVERY;
-	idr_for_each(&ctx->dev->qpidr, count_qps, &count);
+	xa_for_each(&ctx->dev->qps, index, qp)
+		count++;
 
 	qp_list.qps = kcalloc(count, sizeof(*qp_list.qps), GFP_ATOMIC);
 	if (!qp_list.qps) {
-		spin_unlock_irq(&ctx->dev->lock);
+		xa_unlock_irq(&ctx->dev->qps);
 		return;
 	}
 	qp_list.idx = 0;
 
 	/* add and ref each qp so it doesn't get freed */
-	idr_for_each(&ctx->dev->qpidr, add_and_ref_qp, &qp_list);
+	xa_for_each(&ctx->dev->qps, index, qp) {
+		c4iw_qp_add_ref(&qp->ibqp);
+		qp_list.qps[qp_list.idx++] = qp;
+	}
 
-	spin_unlock_irq(&ctx->dev->lock);
+	xa_unlock_irq(&ctx->dev->qps);
 
 	/* now traverse the list in a safe context to recover the db state*/
 	recover_lost_dbs(ctx, &qp_list);
@@ -1475,10 +1460,10 @@
 	deref_qps(&qp_list);
 	kfree(qp_list.qps);
 
-	spin_lock_irq(&ctx->dev->lock);
+	xa_lock_irq(&ctx->dev->qps);
 	WARN_ON(ctx->dev->db_state != RECOVERY);
 	ctx->dev->db_state = STOPPED;
-	spin_unlock_irq(&ctx->dev->lock);
+	xa_unlock_irq(&ctx->dev->qps);
 }
 
 static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...)
@@ -1553,8 +1538,6 @@
 		return err;
 
 	c4iw_debugfs_root = debugfs_create_dir(DRV_NAME, NULL);
-	if (!c4iw_debugfs_root)
-		pr_warn("could not create debugfs entry, continuing\n");
 
 	reg_workq = create_singlethread_workqueue("Register_iWARP_device");
 	if (!reg_workq) {
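
The debugfs and doorbell-recovery paths above replace the per-device spinlock plus idr_for_each() callbacks with open-coded xa_for_each() loops, taking the XArray's own lock only where the walk must exclude concurrent insert/erase. A generic sketch of that locked-walk pattern (an illustrative helper, not driver code):

#include <linux/xarray.h>

/* Walk every entry in @xa under its IRQ-safe lock, as the dump loops in
 * qp_open()/stag_open()/ep_open() do; counting is just for illustration. */
static unsigned int xa_count_locked(struct xarray *xa)
{
	unsigned long index;
	unsigned int count = 0;
	void *entry;

	xa_lock_irq(xa);
	xa_for_each(xa, index, entry)
		count++;
	xa_unlock_irq(xa);

	return count;
}
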
diff --git a/drivers/infiniband/hw/cxgb4/ev.c b/drivers/infiniband/hw/cxgb4/ev.c
index 8741d23..4cd877b 100644
--- a/drivers/infiniband/hw/cxgb4/ev.c
+++ b/drivers/infiniband/hw/cxgb4/ev.c
@@ -123,15 +123,15 @@
 	struct c4iw_qp *qhp;
 	u32 cqid;
 
-	spin_lock_irq(&dev->lock);
-	qhp = get_qhp(dev, CQE_QPID(err_cqe));
+	xa_lock_irq(&dev->qps);
+	qhp = xa_load(&dev->qps, CQE_QPID(err_cqe));
 	if (!qhp) {
 		pr_err("BAD AE qpid 0x%x opcode %d status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n",
 		       CQE_QPID(err_cqe),
 		       CQE_OPCODE(err_cqe), CQE_STATUS(err_cqe),
 		       CQE_TYPE(err_cqe), CQE_WRID_HI(err_cqe),
 		       CQE_WRID_LOW(err_cqe));
-		spin_unlock_irq(&dev->lock);
+		xa_unlock_irq(&dev->qps);
 		goto out;
 	}
 
@@ -146,13 +146,13 @@
 		       CQE_OPCODE(err_cqe), CQE_STATUS(err_cqe),
 		       CQE_TYPE(err_cqe), CQE_WRID_HI(err_cqe),
 		       CQE_WRID_LOW(err_cqe));
-		spin_unlock_irq(&dev->lock);
+		xa_unlock_irq(&dev->qps);
 		goto out;
 	}
 
 	c4iw_qp_add_ref(&qhp->ibqp);
 	atomic_inc(&chp->refcnt);
-	spin_unlock_irq(&dev->lock);
+	xa_unlock_irq(&dev->qps);
 
 	/* Bad incoming write */
 	if (RQ_TYPE(err_cqe) &&
@@ -225,11 +225,11 @@
 	struct c4iw_cq *chp;
 	unsigned long flag;
 
-	spin_lock_irqsave(&dev->lock, flag);
-	chp = get_chp(dev, qid);
+	xa_lock_irqsave(&dev->cqs, flag);
+	chp = xa_load(&dev->cqs, qid);
 	if (chp) {
 		atomic_inc(&chp->refcnt);
-		spin_unlock_irqrestore(&dev->lock, flag);
+		xa_unlock_irqrestore(&dev->cqs, flag);
 		t4_clear_cq_armed(&chp->cq);
 		spin_lock_irqsave(&chp->comp_handler_lock, flag);
 		(*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
@@ -238,7 +238,7 @@
 			wake_up(&chp->wait);
 	} else {
 		pr_debug("unknown cqid 0x%x\n", qid);
-		spin_unlock_irqrestore(&dev->lock, flag);
+		xa_unlock_irqrestore(&dev->cqs, flag);
 	}
 	return 0;
 }
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index f0fcead..7d06b0f 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -34,7 +34,7 @@
 #include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
-#include <linux/idr.h>
+#include <linux/xarray.h>
 #include <linux/completion.h>
 #include <linux/netdevice.h>
 #include <linux/sched/mm.h>
@@ -315,16 +315,15 @@
 	struct ib_device ibdev;
 	struct c4iw_rdev rdev;
 	u32 device_cap_flags;
-	struct idr cqidr;
-	struct idr qpidr;
-	struct idr mmidr;
-	spinlock_t lock;
+	struct xarray cqs;
+	struct xarray qps;
+	struct xarray mrs;
 	struct mutex db_mutex;
 	struct dentry *debugfs_root;
 	enum db_state db_state;
-	struct idr hwtid_idr;
-	struct idr atid_idr;
-	struct idr stid_idr;
+	struct xarray hwtids;
+	struct xarray atids;
+	struct xarray stids;
 	struct list_head db_fc_list;
 	u32 avail_ird;
 	wait_queue_head_t wait;
@@ -349,70 +348,12 @@
 
 static inline struct c4iw_cq *get_chp(struct c4iw_dev *rhp, u32 cqid)
 {
-	return idr_find(&rhp->cqidr, cqid);
+	return xa_load(&rhp->cqs, cqid);
 }
 
 static inline struct c4iw_qp *get_qhp(struct c4iw_dev *rhp, u32 qpid)
 {
-	return idr_find(&rhp->qpidr, qpid);
-}
-
-static inline struct c4iw_mr *get_mhp(struct c4iw_dev *rhp, u32 mmid)
-{
-	return idr_find(&rhp->mmidr, mmid);
-}
-
-static inline int _insert_handle(struct c4iw_dev *rhp, struct idr *idr,
-				 void *handle, u32 id, int lock)
-{
-	int ret;
-
-	if (lock) {
-		idr_preload(GFP_KERNEL);
-		spin_lock_irq(&rhp->lock);
-	}
-
-	ret = idr_alloc(idr, handle, id, id + 1, GFP_ATOMIC);
-
-	if (lock) {
-		spin_unlock_irq(&rhp->lock);
-		idr_preload_end();
-	}
-
-	return ret < 0 ? ret : 0;
-}
-
-static inline int insert_handle(struct c4iw_dev *rhp, struct idr *idr,
-				void *handle, u32 id)
-{
-	return _insert_handle(rhp, idr, handle, id, 1);
-}
-
-static inline int insert_handle_nolock(struct c4iw_dev *rhp, struct idr *idr,
-				       void *handle, u32 id)
-{
-	return _insert_handle(rhp, idr, handle, id, 0);
-}
-
-static inline void _remove_handle(struct c4iw_dev *rhp, struct idr *idr,
-				   u32 id, int lock)
-{
-	if (lock)
-		spin_lock_irq(&rhp->lock);
-	idr_remove(idr, id);
-	if (lock)
-		spin_unlock_irq(&rhp->lock);
-}
-
-static inline void remove_handle(struct c4iw_dev *rhp, struct idr *idr, u32 id)
-{
-	_remove_handle(rhp, idr, id, 1);
-}
-
-static inline void remove_handle_nolock(struct c4iw_dev *rhp,
-					 struct idr *idr, u32 id)
-{
-	_remove_handle(rhp, idr, id, 0);
+	return xa_load(&rhp->qps, qpid);
 }
 
 extern uint c4iw_max_read_depth;
@@ -549,13 +490,13 @@
 	struct t4_wq wq;
 	spinlock_t lock;
 	struct mutex mutex;
-	struct kref kref;
 	wait_queue_head_t wait;
 	int sq_sig_all;
 	struct c4iw_srq *srq;
-	struct work_struct free_work;
 	struct c4iw_ucontext *ucontext;
 	struct c4iw_wr_wait *wr_waitp;
+	struct completion qp_rel_comp;
+	refcount_t qp_refcnt;
 };
 
 static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp)
@@ -589,7 +530,6 @@
 	u32 key;
 	spinlock_t mmap_lock;
 	struct list_head mmaps;
-	struct kref kref;
 	bool is_32b_cqe;
 };
 
@@ -598,18 +538,6 @@
 	return container_of(c, struct c4iw_ucontext, ibucontext);
 }
 
-void _c4iw_free_ucontext(struct kref *kref);
-
-static inline void c4iw_put_ucontext(struct c4iw_ucontext *ucontext)
-{
-	kref_put(&ucontext->kref, _c4iw_free_ucontext);
-}
-
-static inline void c4iw_get_ucontext(struct c4iw_ucontext *ucontext)
-{
-	kref_get(&ucontext->kref);
-}
-
 struct c4iw_mm_entry {
 	struct list_head entry;
 	u64 addr;
@@ -982,6 +910,9 @@
 	int rcv_win;
 	u32 snd_wscale;
 	struct c4iw_ep_stats stats;
+	u32 srqe_idx;
+	u32 rx_pdu_out_cnt;
+	struct sk_buff *peer_abort_skb;
 };
 
 static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id)
@@ -1048,9 +979,8 @@
 int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len);
 void c4iw_qp_add_ref(struct ib_qp *qp);
 void c4iw_qp_rem_ref(struct ib_qp *qp);
-struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd,
-			    enum ib_mr_type mr_type,
-			    u32 max_num_sg);
+struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+			    u32 max_num_sg, struct ib_udata *udata);
 int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 		   unsigned int *sg_offset);
 int c4iw_dealloc_mw(struct ib_mw *mw);
@@ -1061,21 +991,18 @@
 					   u64 length, u64 virt, int acc,
 					   struct ib_udata *udata);
 struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc);
-int c4iw_dereg_mr(struct ib_mr *ib_mr);
-int c4iw_destroy_cq(struct ib_cq *ib_cq);
-struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
-			     const struct ib_cq_init_attr *attr,
-			     struct ib_ucontext *ib_context,
-			     struct ib_udata *udata);
+int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata);
+void c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
+int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		   struct ib_udata *udata);
 int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
 int c4iw_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *attr,
 		    enum ib_srq_attr_mask srq_attr_mask,
 		    struct ib_udata *udata);
-int c4iw_destroy_srq(struct ib_srq *ib_srq);
-struct ib_srq *c4iw_create_srq(struct ib_pd *pd,
-			       struct ib_srq_init_attr *attrs,
-			       struct ib_udata *udata);
-int c4iw_destroy_qp(struct ib_qp *ib_qp);
+void c4iw_destroy_srq(struct ib_srq *ib_srq, struct ib_udata *udata);
+int c4iw_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *attrs,
+		    struct ib_udata *udata);
+int c4iw_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata);
 struct ib_qp *c4iw_create_qp(struct ib_pd *pd,
 			     struct ib_qp_init_attr *attrs,
 			     struct ib_udata *udata);
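
struct c4iw_qp above trades its kref and free_work for a refcount_t plus a completion; the qp.c hunks further down complete() on the final put, letting the teardown path wait for the last reference instead of deferring the free to a workqueue. A standalone sketch of that lifetime scheme, with placeholder names:

#include <linux/refcount.h>
#include <linux/completion.h>

struct obj {
	refcount_t refcnt;
	struct completion rel_comp;
};

static void obj_init(struct obj *o)
{
	refcount_set(&o->refcnt, 1);
	init_completion(&o->rel_comp);
}

static void obj_get(struct obj *o)
{
	refcount_inc(&o->refcnt);
}

static void obj_put(struct obj *o)
{
	if (refcount_dec_and_test(&o->refcnt))
		complete(&o->rel_comp);
}

static void obj_destroy(struct obj *o)
{
	obj_put(o);				/* drop the creator's reference */
	wait_for_completion(&o->rel_comp);	/* wait for remaining users */
	/* safe to release hardware resources for o here */
}
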
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index 7b76e6f..35c284a 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -130,8 +130,9 @@
 
 		copy_len = len > C4IW_MAX_INLINE_SIZE ? C4IW_MAX_INLINE_SIZE :
 			   len;
-		wr_len = roundup(sizeof *req + sizeof *sc +
-				 roundup(copy_len, T4_ULPTX_MIN_IO), 16);
+		wr_len = roundup(sizeof(*req) + sizeof(*sc) +
+					 roundup(copy_len, T4_ULPTX_MIN_IO),
+				 16);
 
 		if (!skb) {
 			skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL);
@@ -274,13 +275,17 @@
 			   struct sk_buff *skb, struct c4iw_wr_wait *wr_waitp)
 {
 	int err;
-	struct fw_ri_tpte tpt;
+	struct fw_ri_tpte *tpt;
 	u32 stag_idx;
 	static atomic_t key;
 
 	if (c4iw_fatal_error(rdev))
 		return -EIO;
 
+	tpt = kmalloc(sizeof(*tpt), GFP_KERNEL);
+	if (!tpt)
+		return -ENOMEM;
+
 	stag_state = stag_state > 0;
 	stag_idx = (*stag) >> 8;
 
@@ -290,6 +295,7 @@
 			mutex_lock(&rdev->stats.lock);
 			rdev->stats.stag.fail++;
 			mutex_unlock(&rdev->stats.lock);
+			kfree(tpt);
 			return -ENOMEM;
 		}
 		mutex_lock(&rdev->stats.lock);
@@ -304,28 +310,28 @@
 
 	/* write TPT entry */
 	if (reset_tpt_entry)
-		memset(&tpt, 0, sizeof(tpt));
+		memset(tpt, 0, sizeof(*tpt));
 	else {
-		tpt.valid_to_pdid = cpu_to_be32(FW_RI_TPTE_VALID_F |
+		tpt->valid_to_pdid = cpu_to_be32(FW_RI_TPTE_VALID_F |
 			FW_RI_TPTE_STAGKEY_V((*stag & FW_RI_TPTE_STAGKEY_M)) |
 			FW_RI_TPTE_STAGSTATE_V(stag_state) |
 			FW_RI_TPTE_STAGTYPE_V(type) | FW_RI_TPTE_PDID_V(pdid));
-		tpt.locread_to_qpid = cpu_to_be32(FW_RI_TPTE_PERM_V(perm) |
+		tpt->locread_to_qpid = cpu_to_be32(FW_RI_TPTE_PERM_V(perm) |
 			(bind_enabled ? FW_RI_TPTE_MWBINDEN_F : 0) |
 			FW_RI_TPTE_ADDRTYPE_V((zbva ? FW_RI_ZERO_BASED_TO :
 						      FW_RI_VA_BASED_TO))|
 			FW_RI_TPTE_PS_V(page_size));
-		tpt.nosnoop_pbladdr = !pbl_size ? 0 : cpu_to_be32(
+		tpt->nosnoop_pbladdr = !pbl_size ? 0 : cpu_to_be32(
 			FW_RI_TPTE_PBLADDR_V(PBL_OFF(rdev, pbl_addr)>>3));
-		tpt.len_lo = cpu_to_be32((u32)(len & 0xffffffffUL));
-		tpt.va_hi = cpu_to_be32((u32)(to >> 32));
-		tpt.va_lo_fbo = cpu_to_be32((u32)(to & 0xffffffffUL));
-		tpt.dca_mwbcnt_pstag = cpu_to_be32(0);
-		tpt.len_hi = cpu_to_be32((u32)(len >> 32));
+		tpt->len_lo = cpu_to_be32((u32)(len & 0xffffffffUL));
+		tpt->va_hi = cpu_to_be32((u32)(to >> 32));
+		tpt->va_lo_fbo = cpu_to_be32((u32)(to & 0xffffffffUL));
+		tpt->dca_mwbcnt_pstag = cpu_to_be32(0);
+		tpt->len_hi = cpu_to_be32((u32)(len >> 32));
 	}
 	err = write_adapter_mem(rdev, stag_idx +
 				(rdev->lldi.vr->stag.start >> 5),
-				sizeof(tpt), &tpt, skb, wr_waitp);
+				sizeof(*tpt), tpt, skb, wr_waitp);
 
 	if (reset_tpt_entry) {
 		c4iw_put_resource(&rdev->resource.tpt_table, stag_idx);
@@ -333,6 +339,7 @@
 		rdev->stats.stag.cur -= 32;
 		mutex_unlock(&rdev->stats.lock);
 	}
+	kfree(tpt);
 	return err;
 }
 
@@ -395,7 +402,7 @@
 	mhp->ibmr.iova = mhp->attr.va_fbo;
 	mhp->ibmr.page_size = 1U << (mhp->attr.page_size + 12);
 	pr_debug("mmid 0x%x mhp %p\n", mmid, mhp);
-	return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid);
+	return xa_insert_irq(&mhp->rhp->mrs, mmid, mhp, GFP_KERNEL);
 }
 
 static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
@@ -502,10 +509,9 @@
 			       u64 virt, int acc, struct ib_udata *udata)
 {
 	__be64 *pages;
-	int shift, n, len;
-	int i, k, entry;
+	int shift, n, i;
 	int err = -ENOMEM;
-	struct scatterlist *sg;
+	struct sg_dma_page_iter sg_iter;
 	struct c4iw_dev *rhp;
 	struct c4iw_pd *php;
 	struct c4iw_mr *mhp;
@@ -537,13 +543,13 @@
 
 	mhp->rhp = rhp;
 
-	mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+	mhp->umem = ib_umem_get(udata, start, length, acc, 0);
 	if (IS_ERR(mhp->umem))
 		goto err_free_skb;
 
-	shift = mhp->umem->page_shift;
+	shift = PAGE_SHIFT;
 
-	n = mhp->umem->nmap;
+	n = ib_umem_num_pages(mhp->umem);
 	err = alloc_pbl(mhp, n);
 	if (err)
 		goto err_umem_release;
@@ -556,21 +562,16 @@
 
 	i = n = 0;
 
-	for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) {
-		len = sg_dma_len(sg) >> shift;
-		for (k = 0; k < len; ++k) {
-			pages[i++] = cpu_to_be64(sg_dma_address(sg) +
-						 (k << shift));
-			if (i == PAGE_SIZE / sizeof *pages) {
-				err = write_pbl(&mhp->rhp->rdev,
-				      pages,
-				      mhp->attr.pbl_addr + (n << 3), i,
-				      mhp->wr_waitp);
-				if (err)
-					goto pbl_done;
-				n += i;
-				i = 0;
-			}
+	for_each_sg_dma_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) {
+		pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter));
+		if (i == PAGE_SIZE / sizeof(*pages)) {
+			err = write_pbl(&mhp->rhp->rdev, pages,
+					mhp->attr.pbl_addr + (n << 3), i,
+					mhp->wr_waitp);
+			if (err)
+				goto pbl_done;
+			n += i;
+			i = 0;
 		}
 	}
 
@@ -651,7 +652,7 @@
 	mhp->attr.stag = stag;
 	mmid = (stag) >> 8;
 	mhp->ibmw.rkey = stag;
-	if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) {
+	if (xa_insert_irq(&rhp->mrs, mmid, mhp, GFP_KERNEL)) {
 		ret = -ENOMEM;
 		goto dealloc_win;
 	}
@@ -679,19 +680,18 @@
 	mhp = to_c4iw_mw(mw);
 	rhp = mhp->rhp;
 	mmid = (mw->rkey) >> 8;
-	remove_handle(rhp, &rhp->mmidr, mmid);
+	xa_erase_irq(&rhp->mrs, mmid);
 	deallocate_window(&rhp->rdev, mhp->attr.stag, mhp->dereg_skb,
 			  mhp->wr_waitp);
 	kfree_skb(mhp->dereg_skb);
 	c4iw_put_wr_wait(mhp->wr_waitp);
-	kfree(mhp);
 	pr_debug("ib_mw %p mmid 0x%x ptr %p\n", mw, mmid, mhp);
+	kfree(mhp);
 	return 0;
 }
 
-struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd,
-			    enum ib_mr_type mr_type,
-			    u32 max_num_sg)
+struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+			    u32 max_num_sg, struct ib_udata *udata)
 {
 	struct c4iw_dev *rhp;
 	struct c4iw_pd *php;
@@ -746,7 +746,7 @@
 	mhp->attr.state = 0;
 	mmid = (stag) >> 8;
 	mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
-	if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) {
+	if (xa_insert_irq(&rhp->mrs, mmid, mhp, GFP_KERNEL)) {
 		ret = -ENOMEM;
 		goto err_dereg;
 	}
@@ -792,7 +792,7 @@
 	return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, c4iw_set_page);
 }
 
-int c4iw_dereg_mr(struct ib_mr *ib_mr)
+int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
 {
 	struct c4iw_dev *rhp;
 	struct c4iw_mr *mhp;
@@ -803,7 +803,7 @@
 	mhp = to_c4iw_mr(ib_mr);
 	rhp = mhp->rhp;
 	mmid = mhp->attr.stag >> 8;
-	remove_handle(rhp, &rhp->mmidr, mmid);
+	xa_erase_irq(&rhp->mrs, mmid);
 	if (mhp->mpl)
 		dma_free_coherent(&mhp->rhp->rdev.lldi.pdev->dev,
 				  mhp->max_mpl_len, mhp->mpl, mhp->mpl_addr);
@@ -814,8 +814,7 @@
 				  mhp->attr.pbl_size << 3);
 	if (mhp->kva)
 		kfree((void *) (unsigned long) mhp->kva);
-	if (mhp->umem)
-		ib_umem_release(mhp->umem);
+	ib_umem_release(mhp->umem);
 	pr_debug("mmid 0x%x ptr %p\n", mmid, mhp);
 	c4iw_put_wr_wait(mhp->wr_waitp);
 	kfree(mhp);
@@ -827,9 +826,9 @@
 	struct c4iw_mr *mhp;
 	unsigned long flags;
 
-	spin_lock_irqsave(&rhp->lock, flags);
-	mhp = get_mhp(rhp, rkey >> 8);
+	xa_lock_irqsave(&rhp->mrs, flags);
+	mhp = xa_load(&rhp->mrs, rkey >> 8);
 	if (mhp)
 		mhp->attr.state = 0;
-	spin_unlock_irqrestore(&rhp->lock, flags);
+	xa_unlock_irqrestore(&rhp->mrs, flags);
 }
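
The c4iw_reg_user_mr hunk above replaces the hand-rolled for_each_sg() walk and page-shift arithmetic with for_each_sg_dma_page(), which yields one PAGE_SIZE-aligned DMA address per iteration. A minimal sketch of that walk (fill_pbl() is a hypothetical helper; it assumes the pages array is large enough for every page of the mapping):

#include <linux/scatterlist.h>
#include <asm/byteorder.h>

static void fill_pbl(struct scatterlist *sgl, int dma_nents, __be64 *pages)
{
	struct sg_dma_page_iter sg_iter;
	int i = 0;

	/* One iteration per PAGE_SIZE chunk of the DMA-mapped region. */
	for_each_sg_dma_page(sgl, &sg_iter, dma_nents, 0)
		pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter));
}
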
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index 4eda687..d373ac0 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -58,51 +58,34 @@
 module_param(fastreg_support, int, 0644);
 MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default=1)");
 
-void _c4iw_free_ucontext(struct kref *kref)
+static void c4iw_dealloc_ucontext(struct ib_ucontext *context)
 {
-	struct c4iw_ucontext *ucontext;
+	struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context);
 	struct c4iw_dev *rhp;
 	struct c4iw_mm_entry *mm, *tmp;
 
-	ucontext = container_of(kref, struct c4iw_ucontext, kref);
+	pr_debug("context %p\n", context);
 	rhp = to_c4iw_dev(ucontext->ibucontext.device);
 
-	pr_debug("ucontext %p\n", ucontext);
 	list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry)
 		kfree(mm);
 	c4iw_release_dev_ucontext(&rhp->rdev, &ucontext->uctx);
-	kfree(ucontext);
 }
 
-static int c4iw_dealloc_ucontext(struct ib_ucontext *context)
+static int c4iw_alloc_ucontext(struct ib_ucontext *ucontext,
+			       struct ib_udata *udata)
 {
-	struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context);
-
-	pr_debug("context %p\n", context);
-	c4iw_put_ucontext(ucontext);
-	return 0;
-}
-
-static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev,
-					       struct ib_udata *udata)
-{
-	struct c4iw_ucontext *context;
+	struct ib_device *ibdev = ucontext->device;
+	struct c4iw_ucontext *context = to_c4iw_ucontext(ucontext);
 	struct c4iw_dev *rhp = to_c4iw_dev(ibdev);
 	struct c4iw_alloc_ucontext_resp uresp;
 	int ret = 0;
 	struct c4iw_mm_entry *mm = NULL;
 
 	pr_debug("ibdev %p\n", ibdev);
-	context = kzalloc(sizeof(*context), GFP_KERNEL);
-	if (!context) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
 	c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx);
 	INIT_LIST_HEAD(&context->mmaps);
 	spin_lock_init(&context->mmap_lock);
-	kref_init(&context->kref);
 
 	if (udata->outlen < sizeof(uresp) - sizeof(uresp.reserved)) {
 		pr_err_once("Warning - downlevel libcxgb4 (non-fatal), device status page disabled\n");
@@ -111,7 +94,7 @@
 		mm = kmalloc(sizeof(*mm), GFP_KERNEL);
 		if (!mm) {
 			ret = -ENOMEM;
-			goto err_free;
+			goto err;
 		}
 
 		uresp.status_page_size = PAGE_SIZE;
@@ -131,13 +114,11 @@
 		mm->len = PAGE_SIZE;
 		insert_mmap(context, mm);
 	}
-	return &context->ibucontext;
+	return 0;
 err_mm:
 	kfree(mm);
-err_free:
-	kfree(context);
 err:
-	return ERR_PTR(ret);
+	return ret;
 }
 
 static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
@@ -209,7 +190,7 @@
 	return ret;
 }
 
-static int c4iw_deallocate_pd(struct ib_pd *pd)
+static void c4iw_deallocate_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
 	struct c4iw_dev *rhp;
 	struct c4iw_pd *php;
@@ -221,15 +202,12 @@
 	mutex_lock(&rhp->rdev.stats.lock);
 	rhp->rdev.stats.pd.cur--;
 	mutex_unlock(&rhp->rdev.stats.lock);
-	kfree(php);
-	return 0;
 }
 
-static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev,
-				      struct ib_ucontext *context,
-				      struct ib_udata *udata)
+static int c4iw_allocate_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
-	struct c4iw_pd *php;
+	struct c4iw_pd *php = to_c4iw_pd(pd);
+	struct ib_device *ibdev = pd->device;
 	u32 pdid;
 	struct c4iw_dev *rhp;
 
@@ -237,20 +215,16 @@
 	rhp = (struct c4iw_dev *) ibdev;
 	pdid =  c4iw_get_resource(&rhp->rdev.resource.pdid_table);
 	if (!pdid)
-		return ERR_PTR(-EINVAL);
-	php = kzalloc(sizeof(*php), GFP_KERNEL);
-	if (!php) {
-		c4iw_put_resource(&rhp->rdev.resource.pdid_table, pdid);
-		return ERR_PTR(-ENOMEM);
-	}
+		return -EINVAL;
+
 	php->pdid = pdid;
 	php->rhp = rhp;
-	if (context) {
+	if (udata) {
 		struct c4iw_alloc_pd_resp uresp = {.pdid = php->pdid};
 
 		if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
-			c4iw_deallocate_pd(&php->ibpd);
-			return ERR_PTR(-EFAULT);
+			c4iw_deallocate_pd(&php->ibpd, udata);
+			return -EFAULT;
 		}
 	}
 	mutex_lock(&rhp->rdev.stats.lock);
@@ -259,7 +233,7 @@
 		rhp->rdev.stats.pd.max = rhp->rdev.stats.pd.cur;
 	mutex_unlock(&rhp->rdev.stats.lock);
 	pr_debug("pdid 0x%0x ptr 0x%p\n", pdid, php);
-	return &php->ibpd;
+	return 0;
 }
 
 static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
@@ -297,7 +271,6 @@
 		return -EINVAL;
 
 	dev = to_c4iw_dev(ibdev);
-	memset(props, 0, sizeof *props);
 	memcpy(&props->sys_image_guid, dev->rdev.lldi.ports[0]->dev_addr, 6);
 	props->hw_ver = CHELSIO_CHIP_RELEASE(dev->rdev.lldi.adapter_type);
 	props->fw_ver = dev->rdev.lldi.fw_vers;
@@ -332,32 +305,8 @@
 static int c4iw_query_port(struct ib_device *ibdev, u8 port,
 			   struct ib_port_attr *props)
 {
-	struct c4iw_dev *dev;
-	struct net_device *netdev;
-	struct in_device *inetdev;
-
 	pr_debug("ibdev %p\n", ibdev);
 
-	dev = to_c4iw_dev(ibdev);
-	netdev = dev->rdev.lldi.ports[port-1];
-	/* props being zeroed by the caller, avoid zeroing it here */
-	props->max_mtu = IB_MTU_4096;
-	props->active_mtu = ib_mtu_int_to_enum(netdev->mtu);
-
-	if (!netif_carrier_ok(netdev))
-		props->state = IB_PORT_DOWN;
-	else {
-		inetdev = in_dev_get(netdev);
-		if (inetdev) {
-			if (inetdev->ifa_list)
-				props->state = IB_PORT_ACTIVE;
-			else
-				props->state = IB_PORT_INIT;
-			in_dev_put(inetdev);
-		} else
-			props->state = IB_PORT_INIT;
-	}
-
 	props->port_cap_flags =
 	    IB_PORT_CM_SUP |
 	    IB_PORT_SNMP_TUNNEL_SUP |
@@ -373,21 +322,23 @@
 	return 0;
 }
 
-static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
-			char *buf)
+static ssize_t hw_rev_show(struct device *dev,
+			   struct device_attribute *attr, char *buf)
 {
-	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
-						 ibdev.dev);
+	struct c4iw_dev *c4iw_dev =
+			rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev);
+
 	pr_debug("dev 0x%p\n", dev);
 	return sprintf(buf, "%d\n",
 		       CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type));
 }
+static DEVICE_ATTR_RO(hw_rev);
 
-static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
-			char *buf)
+static ssize_t hca_type_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
 {
-	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
-						 ibdev.dev);
+	struct c4iw_dev *c4iw_dev =
+			rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev);
 	struct ethtool_drvinfo info;
 	struct net_device *lldev = c4iw_dev->rdev.lldi.ports[0];
 
@@ -395,16 +346,19 @@
 	lldev->ethtool_ops->get_drvinfo(lldev, &info);
 	return sprintf(buf, "%s\n", info.driver);
 }
+static DEVICE_ATTR_RO(hca_type);
 
-static ssize_t show_board(struct device *dev, struct device_attribute *attr,
-			  char *buf)
+static ssize_t board_id_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
 {
-	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
-						 ibdev.dev);
+	struct c4iw_dev *c4iw_dev =
+			rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev);
+
 	pr_debug("dev 0x%p\n", dev);
 	return sprintf(buf, "%x.%x\n", c4iw_dev->rdev.lldi.pdev->vendor,
 		       c4iw_dev->rdev.lldi.pdev->device);
 }
+static DEVICE_ATTR_RO(board_id);
 
 enum counters {
 	IP4INSEGS,
@@ -461,14 +415,15 @@
 	return stats->num_counters;
 }
 
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+static struct attribute *c4iw_class_attributes[] = {
+	&dev_attr_hw_rev.attr,
+	&dev_attr_hca_type.attr,
+	&dev_attr_board_id.attr,
+	NULL
+};
 
-static struct device_attribute *c4iw_class_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_hca_type,
-	&dev_attr_board_id,
+static const struct attribute_group c4iw_attr_group = {
+	.attrs = c4iw_class_attributes,
 };
 
 static int c4iw_port_immutable(struct ib_device *ibdev, u8 port_num,
@@ -502,24 +457,6 @@
 		 FW_HDR_FW_VER_BUILD_G(c4iw_dev->rdev.lldi.fw_vers));
 }
 
-static struct net_device *get_netdev(struct ib_device *dev, u8 port)
-{
-	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, ibdev);
-	struct c4iw_rdev *rdev = &c4iw_dev->rdev;
-	struct net_device *ndev;
-
-	if (!port || port > rdev->lldi.nports)
-		return NULL;
-
-	rcu_read_lock();
-	ndev = rdev->lldi.ports[port - 1];
-	if (ndev)
-		dev_hold(ndev);
-	rcu_read_unlock();
-
-	return ndev;
-}
-
 static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res)
 {
 	return (res->type < ARRAY_SIZE(c4iw_restrack_funcs) &&
@@ -527,18 +464,83 @@
 		c4iw_restrack_funcs[res->type](msg, res) : 0;
 }
 
-void c4iw_register_device(struct work_struct *work)
+static const struct ib_device_ops c4iw_dev_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_CXGB4,
+	.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION,
+
+	.alloc_hw_stats = c4iw_alloc_stats,
+	.alloc_mr = c4iw_alloc_mr,
+	.alloc_mw = c4iw_alloc_mw,
+	.alloc_pd = c4iw_allocate_pd,
+	.alloc_ucontext = c4iw_alloc_ucontext,
+	.create_cq = c4iw_create_cq,
+	.create_qp = c4iw_create_qp,
+	.create_srq = c4iw_create_srq,
+	.dealloc_mw = c4iw_dealloc_mw,
+	.dealloc_pd = c4iw_deallocate_pd,
+	.dealloc_ucontext = c4iw_dealloc_ucontext,
+	.dereg_mr = c4iw_dereg_mr,
+	.destroy_cq = c4iw_destroy_cq,
+	.destroy_qp = c4iw_destroy_qp,
+	.destroy_srq = c4iw_destroy_srq,
+	.fill_res_entry = fill_res_entry,
+	.get_dev_fw_str = get_dev_fw_str,
+	.get_dma_mr = c4iw_get_dma_mr,
+	.get_hw_stats = c4iw_get_mib,
+	.get_port_immutable = c4iw_port_immutable,
+	.iw_accept = c4iw_accept_cr,
+	.iw_add_ref = c4iw_qp_add_ref,
+	.iw_connect = c4iw_connect,
+	.iw_create_listen = c4iw_create_listen,
+	.iw_destroy_listen = c4iw_destroy_listen,
+	.iw_get_qp = c4iw_get_qp,
+	.iw_reject = c4iw_reject_cr,
+	.iw_rem_ref = c4iw_qp_rem_ref,
+	.map_mr_sg = c4iw_map_mr_sg,
+	.mmap = c4iw_mmap,
+	.modify_qp = c4iw_ib_modify_qp,
+	.modify_srq = c4iw_modify_srq,
+	.poll_cq = c4iw_poll_cq,
+	.post_recv = c4iw_post_receive,
+	.post_send = c4iw_post_send,
+	.post_srq_recv = c4iw_post_srq_recv,
+	.query_device = c4iw_query_device,
+	.query_gid = c4iw_query_gid,
+	.query_pkey = c4iw_query_pkey,
+	.query_port = c4iw_query_port,
+	.query_qp = c4iw_ib_query_qp,
+	.reg_user_mr = c4iw_reg_user_mr,
+	.req_notify_cq = c4iw_arm_cq,
+	INIT_RDMA_OBJ_SIZE(ib_pd, c4iw_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_cq, c4iw_cq, ibcq),
+	INIT_RDMA_OBJ_SIZE(ib_srq, c4iw_srq, ibsrq),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, c4iw_ucontext, ibucontext),
+};
+
+static int set_netdevs(struct ib_device *ib_dev, struct c4iw_rdev *rdev)
 {
 	int ret;
 	int i;
+
+	for (i = 0; i < rdev->lldi.nports; i++) {
+		ret = ib_device_set_netdev(ib_dev, rdev->lldi.ports[i],
+					   i + 1);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+void c4iw_register_device(struct work_struct *work)
+{
+	int ret;
 	struct uld_ctx *ctx = container_of(work, struct uld_ctx, reg_work);
 	struct c4iw_dev *dev = ctx->dev;
 
 	pr_debug("c4iw_dev %p\n", dev);
-	strlcpy(dev->ibdev.name, "cxgb4_%d", IB_DEVICE_NAME_MAX);
 	memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
 	memcpy(&dev->ibdev.node_guid, dev->rdev.lldi.ports[0]->dev_addr, 6);
-	dev->ibdev.owner = THIS_MODULE;
 	dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW;
 	if (fastreg_support)
 		dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
@@ -571,77 +573,20 @@
 	dev->ibdev.phys_port_cnt = dev->rdev.lldi.nports;
 	dev->ibdev.num_comp_vectors =  dev->rdev.lldi.nciq;
 	dev->ibdev.dev.parent = &dev->rdev.lldi.pdev->dev;
-	dev->ibdev.query_device = c4iw_query_device;
-	dev->ibdev.query_port = c4iw_query_port;
-	dev->ibdev.query_pkey = c4iw_query_pkey;
-	dev->ibdev.query_gid = c4iw_query_gid;
-	dev->ibdev.alloc_ucontext = c4iw_alloc_ucontext;
-	dev->ibdev.dealloc_ucontext = c4iw_dealloc_ucontext;
-	dev->ibdev.mmap = c4iw_mmap;
-	dev->ibdev.alloc_pd = c4iw_allocate_pd;
-	dev->ibdev.dealloc_pd = c4iw_deallocate_pd;
-	dev->ibdev.create_qp = c4iw_create_qp;
-	dev->ibdev.modify_qp = c4iw_ib_modify_qp;
-	dev->ibdev.query_qp = c4iw_ib_query_qp;
-	dev->ibdev.destroy_qp = c4iw_destroy_qp;
-	dev->ibdev.create_srq = c4iw_create_srq;
-	dev->ibdev.modify_srq = c4iw_modify_srq;
-	dev->ibdev.destroy_srq = c4iw_destroy_srq;
-	dev->ibdev.create_cq = c4iw_create_cq;
-	dev->ibdev.destroy_cq = c4iw_destroy_cq;
-	dev->ibdev.poll_cq = c4iw_poll_cq;
-	dev->ibdev.get_dma_mr = c4iw_get_dma_mr;
-	dev->ibdev.reg_user_mr = c4iw_reg_user_mr;
-	dev->ibdev.dereg_mr = c4iw_dereg_mr;
-	dev->ibdev.alloc_mw = c4iw_alloc_mw;
-	dev->ibdev.dealloc_mw = c4iw_dealloc_mw;
-	dev->ibdev.alloc_mr = c4iw_alloc_mr;
-	dev->ibdev.map_mr_sg = c4iw_map_mr_sg;
-	dev->ibdev.req_notify_cq = c4iw_arm_cq;
-	dev->ibdev.post_send = c4iw_post_send;
-	dev->ibdev.post_recv = c4iw_post_receive;
-	dev->ibdev.post_srq_recv = c4iw_post_srq_recv;
-	dev->ibdev.alloc_hw_stats = c4iw_alloc_stats;
-	dev->ibdev.get_hw_stats = c4iw_get_mib;
-	dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION;
-	dev->ibdev.get_port_immutable = c4iw_port_immutable;
-	dev->ibdev.get_dev_fw_str = get_dev_fw_str;
-	dev->ibdev.get_netdev = get_netdev;
 
-	dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
-	if (!dev->ibdev.iwcm) {
-		ret = -ENOMEM;
-		goto err_dealloc_ctx;
-	}
+	memcpy(dev->ibdev.iw_ifname, dev->rdev.lldi.ports[0]->name,
+	       sizeof(dev->ibdev.iw_ifname));
 
-	dev->ibdev.iwcm->connect = c4iw_connect;
-	dev->ibdev.iwcm->accept = c4iw_accept_cr;
-	dev->ibdev.iwcm->reject = c4iw_reject_cr;
-	dev->ibdev.iwcm->create_listen = c4iw_create_listen;
-	dev->ibdev.iwcm->destroy_listen = c4iw_destroy_listen;
-	dev->ibdev.iwcm->add_ref = c4iw_qp_add_ref;
-	dev->ibdev.iwcm->rem_ref = c4iw_qp_rem_ref;
-	dev->ibdev.iwcm->get_qp = c4iw_get_qp;
-	dev->ibdev.res.fill_res_entry = fill_res_entry;
-	memcpy(dev->ibdev.iwcm->ifname, dev->rdev.lldi.ports[0]->name,
-	       sizeof(dev->ibdev.iwcm->ifname));
-
-	dev->ibdev.driver_id = RDMA_DRIVER_CXGB4;
-	ret = ib_register_device(&dev->ibdev, NULL);
+	rdma_set_device_sysfs_group(&dev->ibdev, &c4iw_attr_group);
+	ib_set_device_ops(&dev->ibdev, &c4iw_dev_ops);
+	ret = set_netdevs(&dev->ibdev, &dev->rdev);
 	if (ret)
-		goto err_kfree_iwcm;
-
-	for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i) {
-		ret = device_create_file(&dev->ibdev.dev,
-					 c4iw_class_attributes[i]);
-		if (ret)
-			goto err_unregister_device;
-	}
+		goto err_dealloc_ctx;
+	ret = ib_register_device(&dev->ibdev, "cxgb4_%d");
+	if (ret)
+		goto err_dealloc_ctx;
 	return;
-err_unregister_device:
-	ib_unregister_device(&dev->ibdev);
-err_kfree_iwcm:
-	kfree(dev->ibdev.iwcm);
+
 err_dealloc_ctx:
 	pr_err("%s - Failed registering iwarp device: %d\n",
 	       pci_name(ctx->lldi.pdev), ret);
@@ -651,13 +596,7 @@
 
 void c4iw_unregister_device(struct c4iw_dev *dev)
 {
-	int i;
-
 	pr_debug("c4iw_dev %p\n", dev);
-	for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i)
-		device_remove_file(&dev->ibdev.dev,
-				   c4iw_class_attributes[i]);
 	ib_unregister_device(&dev->ibdev);
-	kfree(dev->ibdev.iwcm);
 	return;
 }
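
provider.c above moves every verb out of per-field ibdev assignments (and the separately allocated iwcm struct) into a single const ib_device_ops table applied with ib_set_device_ops(), with the sysfs attributes grouped via rdma_set_device_sysfs_group() and the device name passed to ib_register_device(). A reduced sketch of that registration shape; sample_* names are invented and only a couple of ops fields are shown:

#include <linux/module.h>
#include <rdma/ib_verbs.h>

struct sample_ucontext {
	struct ib_ucontext ibucontext;
};

static const struct ib_device_ops sample_dev_ops = {
	.owner = THIS_MODULE,
	.driver_id = RDMA_DRIVER_CXGB4,
	.uverbs_abi_ver = 1,
	/* size entries let the RDMA core allocate the driver structs */
	INIT_RDMA_OBJ_SIZE(ib_ucontext, sample_ucontext, ibucontext),
};

static int sample_register(struct ib_device *ibdev)
{
	ib_set_device_ops(ibdev, &sample_dev_ops);
	return ib_register_device(ibdev, "cxgb4_%d");
}
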
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index 347fe18..bbcac53 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -31,6 +31,7 @@
  */
 
 #include <linux/module.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "iw_cxgb4.h"
 
@@ -56,18 +57,18 @@
 
 static int max_fr_immd = T4_MAX_FR_IMMD;
 module_param(max_fr_immd, int, 0644);
-MODULE_PARM_DESC(max_fr_immd, "fastreg threshold for using DSGL instead of immedate");
+MODULE_PARM_DESC(max_fr_immd, "fastreg threshold for using DSGL instead of immediate");
 
 static int alloc_ird(struct c4iw_dev *dev, u32 ird)
 {
 	int ret = 0;
 
-	spin_lock_irq(&dev->lock);
+	xa_lock_irq(&dev->qps);
 	if (ird <= dev->avail_ird)
 		dev->avail_ird -= ird;
 	else
 		ret = -ENOMEM;
-	spin_unlock_irq(&dev->lock);
+	xa_unlock_irq(&dev->qps);
 
 	if (ret)
 		dev_warn(&dev->rdev.lldi.pdev->dev,
@@ -78,9 +79,9 @@
 
 static void free_ird(struct c4iw_dev *dev, int ird)
 {
-	spin_lock_irq(&dev->lock);
+	xa_lock_irq(&dev->qps);
 	dev->avail_ird += ird;
-	spin_unlock_irq(&dev->lock);
+	xa_unlock_irq(&dev->qps);
 }
 
 static void set_state(struct c4iw_qp *qhp, enum c4iw_qp_state state)
@@ -99,7 +100,7 @@
 static void dealloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq)
 {
 	dma_free_coherent(&(rdev->lldi.pdev->dev), sq->memsize, sq->queue,
-			  pci_unmap_addr(sq, mapping));
+			  dma_unmap_addr(sq, mapping));
 }
 
 static void dealloc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq)
@@ -132,7 +133,7 @@
 	if (!sq->queue)
 		return -ENOMEM;
 	sq->phys_addr = virt_to_phys(sq->queue);
-	pci_unmap_addr_set(sq, mapping, sq->dma_addr);
+	dma_unmap_addr_set(sq, mapping, sq->dma_addr);
 	return 0;
 }
 
@@ -273,18 +274,18 @@
 			 (unsigned long long)virt_to_phys(wq->sq.queue),
 			 wq->rq.queue,
 			 (unsigned long long)virt_to_phys(wq->rq.queue));
-		memset(wq->rq.queue, 0, wq->rq.memsize);
 		dma_unmap_addr_set(&wq->rq, mapping, wq->rq.dma_addr);
 	}
 
 	wq->db = rdev->lldi.db_reg;
 
-	wq->sq.bar2_va = c4iw_bar2_addrs(rdev, wq->sq.qid, T4_BAR2_QTYPE_EGRESS,
+	wq->sq.bar2_va = c4iw_bar2_addrs(rdev, wq->sq.qid,
+					 CXGB4_BAR2_QTYPE_EGRESS,
 					 &wq->sq.bar2_qid,
 					 user ? &wq->sq.bar2_pa : NULL);
 	if (need_rq)
 		wq->rq.bar2_va = c4iw_bar2_addrs(rdev, wq->rq.qid,
-						 T4_BAR2_QTYPE_EGRESS,
+						 CXGB4_BAR2_QTYPE_EGRESS,
 						 &wq->rq.bar2_qid,
 						 user ? &wq->rq.bar2_pa : NULL);
 
@@ -301,7 +302,7 @@
 	wq->rq.msn = 1;
 
 	/* build fw_ri_res_wr */
-	wr_len = sizeof *res_wr + 2 * sizeof *res;
+	wr_len = sizeof(*res_wr) + 2 * sizeof(*res);
 	if (need_rq)
 		wr_len += sizeof(*res);
 	skb = alloc_skb(wr_len, GFP_KERNEL);
@@ -437,7 +438,7 @@
 			rem -= len;
 		}
 	}
-	len = roundup(plen + sizeof *immdp, 16) - (plen + sizeof *immdp);
+	len = roundup(plen + sizeof(*immdp), 16) - (plen + sizeof(*immdp));
 	if (len)
 		memset(dstp, 0, len);
 	immdp->op = FW_RI_DATA_IMMD;
@@ -526,7 +527,7 @@
 					 T4_MAX_SEND_INLINE, &plen);
 			if (ret)
 				return ret;
-			size = sizeof wqe->send + sizeof(struct fw_ri_immd) +
+			size = sizeof(wqe->send) + sizeof(struct fw_ri_immd) +
 			       plen;
 		} else {
 			ret = build_isgl((__be64 *)sq->queue,
@@ -535,7 +536,7 @@
 					 wr->sg_list, wr->num_sge, &plen);
 			if (ret)
 				return ret;
-			size = sizeof wqe->send + sizeof(struct fw_ri_isgl) +
+			size = sizeof(wqe->send) + sizeof(struct fw_ri_isgl) +
 			       wr->num_sge * sizeof(struct fw_ri_sge);
 		}
 	} else {
@@ -543,7 +544,7 @@
 		wqe->send.u.immd_src[0].r1 = 0;
 		wqe->send.u.immd_src[0].r2 = 0;
 		wqe->send.u.immd_src[0].immdlen = 0;
-		size = sizeof wqe->send + sizeof(struct fw_ri_immd);
+		size = sizeof(wqe->send) + sizeof(struct fw_ri_immd);
 		plen = 0;
 	}
 	*len16 = DIV_ROUND_UP(size, 16);
@@ -577,7 +578,7 @@
 					 T4_MAX_WRITE_INLINE, &plen);
 			if (ret)
 				return ret;
-			size = sizeof wqe->write + sizeof(struct fw_ri_immd) +
+			size = sizeof(wqe->write) + sizeof(struct fw_ri_immd) +
 			       plen;
 		} else {
 			ret = build_isgl((__be64 *)sq->queue,
@@ -586,7 +587,7 @@
 					 wr->sg_list, wr->num_sge, &plen);
 			if (ret)
 				return ret;
-			size = sizeof wqe->write + sizeof(struct fw_ri_isgl) +
+			size = sizeof(wqe->write) + sizeof(struct fw_ri_isgl) +
 			       wr->num_sge * sizeof(struct fw_ri_sge);
 		}
 	} else {
@@ -594,7 +595,7 @@
 		wqe->write.u.immd_src[0].r1 = 0;
 		wqe->write.u.immd_src[0].r2 = 0;
 		wqe->write.u.immd_src[0].immdlen = 0;
-		size = sizeof wqe->write + sizeof(struct fw_ri_immd);
+		size = sizeof(wqe->write) + sizeof(struct fw_ri_immd);
 		plen = 0;
 	}
 	*len16 = DIV_ROUND_UP(size, 16);
@@ -631,7 +632,10 @@
 
 	wcwr->stag_sink = cpu_to_be32(rdma_wr(wr)->rkey);
 	wcwr->to_sink = cpu_to_be64(rdma_wr(wr)->remote_addr);
-	wcwr->stag_inv = cpu_to_be32(wr->next->ex.invalidate_rkey);
+	if (wr->next->opcode == IB_WR_SEND)
+		wcwr->stag_inv = 0;
+	else
+		wcwr->stag_inv = cpu_to_be32(wr->next->ex.invalidate_rkey);
 	wcwr->r2 = 0;
 	wcwr->r3 = 0;
 
@@ -678,7 +682,7 @@
 	}
 	wqe->read.r2 = 0;
 	wqe->read.r5 = 0;
-	*len16 = DIV_ROUND_UP(sizeof wqe->read, 16);
+	*len16 = DIV_ROUND_UP(sizeof(wqe->read), 16);
 	return 0;
 }
 
@@ -725,7 +729,10 @@
 
 	/* SEND_WITH_INV swsqe */
 	swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx];
-	swsqe->opcode = FW_RI_SEND_WITH_INV;
+	if (wr->next->opcode == IB_WR_SEND)
+		swsqe->opcode = FW_RI_SEND;
+	else
+		swsqe->opcode = FW_RI_SEND_WITH_INV;
 	swsqe->idx = qhp->wq.sq.pidx;
 	swsqe->complete = 0;
 	swsqe->signaled = send_signaled;
@@ -758,8 +765,8 @@
 			 &wqe->recv.isgl, wr->sg_list, wr->num_sge, NULL);
 	if (ret)
 		return ret;
-	*len16 = DIV_ROUND_UP(sizeof wqe->recv +
-			      wr->num_sge * sizeof(struct fw_ri_sge), 16);
+	*len16 = DIV_ROUND_UP(
+		sizeof(wqe->recv) + wr->num_sge * sizeof(struct fw_ri_sge), 16);
 	return 0;
 }
 
@@ -878,49 +885,21 @@
 {
 	wqe->inv.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey);
 	wqe->inv.r2 = 0;
-	*len16 = DIV_ROUND_UP(sizeof wqe->inv, 16);
+	*len16 = DIV_ROUND_UP(sizeof(wqe->inv), 16);
 	return 0;
 }
 
-static void free_qp_work(struct work_struct *work)
-{
-	struct c4iw_ucontext *ucontext;
-	struct c4iw_qp *qhp;
-	struct c4iw_dev *rhp;
-
-	qhp = container_of(work, struct c4iw_qp, free_work);
-	ucontext = qhp->ucontext;
-	rhp = qhp->rhp;
-
-	pr_debug("qhp %p ucontext %p\n", qhp, ucontext);
-	destroy_qp(&rhp->rdev, &qhp->wq,
-		   ucontext ? &ucontext->uctx : &rhp->rdev.uctx, !qhp->srq);
-
-	if (ucontext)
-		c4iw_put_ucontext(ucontext);
-	c4iw_put_wr_wait(qhp->wr_waitp);
-	kfree(qhp);
-}
-
-static void queue_qp_free(struct kref *kref)
-{
-	struct c4iw_qp *qhp;
-
-	qhp = container_of(kref, struct c4iw_qp, kref);
-	pr_debug("qhp %p\n", qhp);
-	queue_work(qhp->rhp->rdev.free_workq, &qhp->free_work);
-}
-
 void c4iw_qp_add_ref(struct ib_qp *qp)
 {
 	pr_debug("ib_qp %p\n", qp);
-	kref_get(&to_c4iw_qp(qp)->kref);
+	refcount_inc(&to_c4iw_qp(qp)->qp_refcnt);
 }
 
 void c4iw_qp_rem_ref(struct ib_qp *qp)
 {
 	pr_debug("ib_qp %p\n", qp);
-	kref_put(&to_c4iw_qp(qp)->kref, queue_qp_free);
+	if (refcount_dec_and_test(&to_c4iw_qp(qp)->qp_refcnt))
+		complete(&to_c4iw_qp(qp)->qp_rel_comp);
 }
 
 static void add_to_fc_list(struct list_head *head, struct list_head *entry)
@@ -933,7 +912,7 @@
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&qhp->rhp->lock, flags);
+	xa_lock_irqsave(&qhp->rhp->qps, flags);
 	spin_lock(&qhp->lock);
 	if (qhp->rhp->db_state == NORMAL)
 		t4_ring_sq_db(&qhp->wq, inc, NULL);
@@ -942,7 +921,7 @@
 		qhp->wq.sq.wq_pidx_inc += inc;
 	}
 	spin_unlock(&qhp->lock);
-	spin_unlock_irqrestore(&qhp->rhp->lock, flags);
+	xa_unlock_irqrestore(&qhp->rhp->qps, flags);
 	return 0;
 }
 
@@ -950,7 +929,7 @@
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&qhp->rhp->lock, flags);
+	xa_lock_irqsave(&qhp->rhp->qps, flags);
 	spin_lock(&qhp->lock);
 	if (qhp->rhp->db_state == NORMAL)
 		t4_ring_rq_db(&qhp->wq, inc, NULL);
@@ -959,7 +938,7 @@
 		qhp->wq.rq.wq_pidx_inc += inc;
 	}
 	spin_unlock(&qhp->lock);
-	spin_unlock_irqrestore(&qhp->rhp->lock, flags);
+	xa_unlock_irqrestore(&qhp->rhp->qps, flags);
 	return 0;
 }
 
@@ -1132,9 +1111,9 @@
 	/*
 	 * Fastpath for NVMe-oF target WRITE + SEND_WITH_INV wr chain which is
 	 * the response for small NVMEe-oF READ requests.  If the chain is
-	 * exactly a WRITE->SEND_WITH_INV and the sgl depths and lengths
-	 * meet the requirements of the fw_ri_write_cmpl_wr work request,
-	 * then build and post the write_cmpl WR.  If any of the tests
+	 * exactly a WRITE->SEND_WITH_INV or a WRITE->SEND and the sgl depths
+	 * and lengths meet the requirements of the fw_ri_write_cmpl_wr work
+	 * request, then build and post the write_cmpl WR. If any of the tests
 	 * below are not true, then we continue on with the tradtional WRITE
 	 * and SEND WRs.
 	 */
@@ -1144,7 +1123,8 @@
 	    wr && wr->next && !wr->next->next &&
 	    wr->opcode == IB_WR_RDMA_WRITE &&
 	    wr->sg_list[0].length && wr->num_sge <= T4_WRITE_CMPL_MAX_SGL &&
-	    wr->next->opcode == IB_WR_SEND_WITH_INV &&
+	    (wr->next->opcode == IB_WR_SEND ||
+	    wr->next->opcode == IB_WR_SEND_WITH_INV) &&
 	    wr->next->sg_list[0].length == T4_WRITE_CMPL_MAX_CQE &&
 	    wr->next->num_sge == 1 && num_wrs >= 2) {
 		post_write_cmpl(qhp, wr);
@@ -1599,7 +1579,7 @@
 		FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*wqe), 16)));
 
 	wqe->u.terminate.type = FW_RI_TYPE_TERMINATE;
-	wqe->u.terminate.immdlen = cpu_to_be32(sizeof *term);
+	wqe->u.terminate.immdlen = cpu_to_be32(sizeof(*term));
 	term = (struct terminate_message *)wqe->u.terminate.termmsg;
 	if (qhp->attr.layer_etype == (LAYER_MPA|DDP_LLP)) {
 		term->layer_etype = qhp->attr.layer_etype;
@@ -1744,16 +1724,15 @@
 static void build_rtr_msg(u8 p2p_type, struct fw_ri_init *init)
 {
 	pr_debug("p2p_type = %d\n", p2p_type);
-	memset(&init->u, 0, sizeof init->u);
+	memset(&init->u, 0, sizeof(init->u));
 	switch (p2p_type) {
 	case FW_RI_INIT_P2PTYPE_RDMA_WRITE:
 		init->u.write.opcode = FW_RI_RDMA_WRITE_WR;
 		init->u.write.stag_sink = cpu_to_be32(1);
 		init->u.write.to_sink = cpu_to_be64(1);
 		init->u.write.u.immd_src[0].op = FW_RI_DATA_IMMD;
-		init->u.write.len16 = DIV_ROUND_UP(sizeof init->u.write +
-						   sizeof(struct fw_ri_immd),
-						   16);
+		init->u.write.len16 = DIV_ROUND_UP(
+			sizeof(init->u.write) + sizeof(struct fw_ri_immd), 16);
 		break;
 	case FW_RI_INIT_P2PTYPE_READ_REQ:
 		init->u.write.opcode = FW_RI_RDMA_READ_WR;
@@ -1761,7 +1740,7 @@
 		init->u.read.to_src_lo = cpu_to_be32(1);
 		init->u.read.stag_sink = cpu_to_be32(1);
 		init->u.read.to_sink_lo = cpu_to_be32(1);
-		init->u.read.len16 = DIV_ROUND_UP(sizeof init->u.read, 16);
+		init->u.read.len16 = DIV_ROUND_UP(sizeof(init->u.read), 16);
 		break;
 	}
 }
@@ -1775,7 +1754,7 @@
 	pr_debug("qhp %p qid 0x%x tid %u ird %u ord %u\n", qhp,
 		 qhp->wq.sq.qid, qhp->ep->hwtid, qhp->ep->ird, qhp->ep->ord);
 
-	skb = alloc_skb(sizeof *wqe, GFP_KERNEL);
+	skb = alloc_skb(sizeof(*wqe), GFP_KERNEL);
 	if (!skb) {
 		ret = -ENOMEM;
 		goto out;
@@ -1969,10 +1948,10 @@
 			qhp->attr.layer_etype = attrs->layer_etype;
 			qhp->attr.ecode = attrs->ecode;
 			ep = qhp->ep;
+			c4iw_get_ep(&ep->com);
+			disconnect = 1;
 			if (!internal) {
-				c4iw_get_ep(&qhp->ep->com);
 				terminate = 1;
-				disconnect = 1;
 			} else {
 				terminate = qhp->attr.send_term;
 				ret = rdma_fini(rhp, qhp, ep);
@@ -2088,14 +2067,16 @@
 	return ret;
 }
 
-int c4iw_destroy_qp(struct ib_qp *ib_qp)
+int c4iw_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
 {
 	struct c4iw_dev *rhp;
 	struct c4iw_qp *qhp;
+	struct c4iw_ucontext *ucontext;
 	struct c4iw_qp_attributes attrs;
 
 	qhp = to_c4iw_qp(ib_qp);
 	rhp = qhp->rhp;
+	ucontext = qhp->ucontext;
 
 	attrs.next_state = C4IW_QP_STATE_ERROR;
 	if (qhp->attr.state == C4IW_QP_STATE_TERMINATE)
@@ -2104,17 +2085,26 @@
 		c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0);
 	wait_event(qhp->wait, !qhp->ep);
 
-	remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid);
-
-	spin_lock_irq(&rhp->lock);
+	xa_lock_irq(&rhp->qps);
+	__xa_erase(&rhp->qps, qhp->wq.sq.qid);
 	if (!list_empty(&qhp->db_fc_entry))
 		list_del_init(&qhp->db_fc_entry);
-	spin_unlock_irq(&rhp->lock);
+	xa_unlock_irq(&rhp->qps);
 	free_ird(rhp, qhp->attr.max_ird);
 
 	c4iw_qp_rem_ref(ib_qp);
 
+	wait_for_completion(&qhp->qp_rel_comp);
+
 	pr_debug("ib_qp %p qpid 0x%0x\n", ib_qp, qhp->wq.sq.qid);
+	pr_debug("qhp %p ucontext %p\n", qhp, ucontext);
+
+	destroy_qp(&rhp->rdev, &qhp->wq,
+		   ucontext ? &ucontext->uctx : &rhp->rdev.uctx, !qhp->srq);
+
+	c4iw_put_wr_wait(qhp->wr_waitp);
+
+	kfree(qhp);
 	return 0;
 }
 
@@ -2128,7 +2118,8 @@
 	struct c4iw_cq *rchp;
 	struct c4iw_create_qp_resp uresp;
 	unsigned int sqsize, rqsize = 0;
-	struct c4iw_ucontext *ucontext;
+	struct c4iw_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct c4iw_ucontext, ibucontext);
 	int ret;
 	struct c4iw_mm_entry *sq_key_mm, *rq_key_mm = NULL, *sq_db_key_mm;
 	struct c4iw_mm_entry *rq_db_key_mm = NULL, *ma_sync_key_mm = NULL;
@@ -2162,8 +2153,6 @@
 	if (sqsize < 8)
 		sqsize = 8;
 
-	ucontext = pd->uobject ? to_c4iw_ucontext(pd->uobject->context) : NULL;
-
 	qhp = kzalloc(sizeof(*qhp), GFP_KERNEL);
 	if (!qhp)
 		return ERR_PTR(-ENOMEM);
@@ -2225,10 +2214,10 @@
 	spin_lock_init(&qhp->lock);
 	mutex_init(&qhp->mutex);
 	init_waitqueue_head(&qhp->wait);
-	kref_init(&qhp->kref);
-	INIT_WORK(&qhp->free_work, free_qp_work);
+	init_completion(&qhp->qp_rel_comp);
+	refcount_set(&qhp->qp_refcnt, 1);
 
-	ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid);
+	ret = xa_insert_irq(&rhp->qps, qhp->wq.sq.qid, qhp, GFP_KERNEL);
 	if (ret)
 		goto err_destroy_qp;
 
@@ -2297,7 +2286,7 @@
 			ucontext->key += PAGE_SIZE;
 		}
 		spin_unlock(&ucontext->mmap_lock);
-		ret = ib_copy_to_udata(udata, &uresp, sizeof uresp);
+		ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
 		if (ret)
 			goto err_free_ma_sync_key;
 		sq_key_mm->key = uresp.sq_key;
@@ -2330,7 +2319,6 @@
 			insert_mmap(ucontext, ma_sync_key_mm);
 		}
 
-		c4iw_get_ucontext(ucontext);
 		qhp->ucontext = ucontext;
 	}
 	if (!attrs->srq) {
@@ -2365,7 +2353,7 @@
 err_free_sq_key:
 	kfree(sq_key_mm);
 err_remove_handle:
-	remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid);
+	xa_erase_irq(&rhp->qps, qhp->wq.sq.qid);
 err_destroy_qp:
 	destroy_qp(&rhp->rdev, &qhp->wq,
 		   ucontext ? &ucontext->uctx : &rhp->rdev.uctx, !attrs->srq);
@@ -2382,7 +2370,7 @@
 	struct c4iw_dev *rhp;
 	struct c4iw_qp *qhp;
 	enum c4iw_qp_attr_mask mask = 0;
-	struct c4iw_qp_attributes attrs;
+	struct c4iw_qp_attributes attrs = {};
 
 	pr_debug("ib_qp %p\n", ibqp);
 
@@ -2394,7 +2382,6 @@
 	if (!attr_mask)
 		return 0;
 
-	memset(&attrs, 0, sizeof attrs);
 	qhp = to_c4iw_qp(ibqp);
 	rhp = qhp->rhp;
 
@@ -2478,8 +2465,8 @@
 {
 	struct c4iw_qp *qhp = to_c4iw_qp(ibqp);
 
-	memset(attr, 0, sizeof *attr);
-	memset(init_attr, 0, sizeof *init_attr);
+	memset(attr, 0, sizeof(*attr));
+	memset(init_attr, 0, sizeof(*init_attr));
 	attr->qp_state = to_ib_qp_state(qhp->attr.state);
 	init_attr->cap.max_send_wr = qhp->attr.sq_num_entries;
 	init_attr->cap.max_recv_wr = qhp->attr.rq_num_entries;
@@ -2521,7 +2508,7 @@
 
 	dma_free_coherent(&rdev->lldi.pdev->dev,
 			  wq->memsize, wq->queue,
-			pci_unmap_addr(wq, mapping));
+			dma_unmap_addr(wq, mapping));
 	c4iw_rqtpool_free(rdev, wq->rqt_hwaddr, wq->rqt_size);
 	kfree(wq->sw_rq);
 	c4iw_put_qpid(rdev, wq->qid, uctx);
@@ -2563,16 +2550,14 @@
 	wq->rqt_abs_idx = (wq->rqt_hwaddr - rdev->lldi.vr->rq.start) >>
 		T4_RQT_ENTRY_SHIFT;
 
-	wq->queue = dma_alloc_coherent(&rdev->lldi.pdev->dev,
-				       wq->memsize, &wq->dma_addr,
-			GFP_KERNEL);
+	wq->queue = dma_alloc_coherent(&rdev->lldi.pdev->dev, wq->memsize,
+				       &wq->dma_addr, GFP_KERNEL);
 	if (!wq->queue)
 		goto err_free_rqtpool;
 
-	memset(wq->queue, 0, wq->memsize);
-	pci_unmap_addr_set(wq, mapping, wq->dma_addr);
+	dma_unmap_addr_set(wq, mapping, wq->dma_addr);
 
-	wq->bar2_va = c4iw_bar2_addrs(rdev, wq->qid, T4_BAR2_QTYPE_EGRESS,
+	wq->bar2_va = c4iw_bar2_addrs(rdev, wq->qid, CXGB4_BAR2_QTYPE_EGRESS,
 				      &wq->bar2_qid,
 			user ? &wq->bar2_pa : NULL);
 
@@ -2590,7 +2575,7 @@
 	/* build fw_ri_res_wr */
 	wr_len = sizeof(*res_wr) + sizeof(*res);
 
-	skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL);
+	skb = alloc_skb(wr_len, GFP_KERNEL);
 	if (!skb)
 		goto err_free_queue;
 	set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
@@ -2649,7 +2634,7 @@
 err_free_queue:
 	dma_free_coherent(&rdev->lldi.pdev->dev,
 			  wq->memsize, wq->queue,
-			pci_unmap_addr(wq, mapping));
+			dma_unmap_addr(wq, mapping));
 err_free_rqtpool:
 	c4iw_rqtpool_free(rdev, wq->rqt_hwaddr, wq->rqt_size);
 err_free_pending_wrs:
@@ -2681,11 +2666,12 @@
 	}
 }
 
-struct ib_srq *c4iw_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *attrs,
+int c4iw_create_srq(struct ib_srq *ib_srq, struct ib_srq_init_attr *attrs,
 			       struct ib_udata *udata)
 {
+	struct ib_pd *pd = ib_srq->pd;
 	struct c4iw_dev *rhp;
-	struct c4iw_srq *srq;
+	struct c4iw_srq *srq = to_c4iw_srq(ib_srq);
 	struct c4iw_pd *php;
 	struct c4iw_create_srq_resp uresp;
 	struct c4iw_ucontext *ucontext;
@@ -2700,11 +2686,11 @@
 	rhp = php->rhp;
 
 	if (!rhp->rdev.lldi.vr->srq.size)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	if (attrs->attr.max_wr > rhp->rdev.hw_queue.t4_max_rq_size)
-		return ERR_PTR(-E2BIG);
+		return -E2BIG;
 	if (attrs->attr.max_sge > T4_MAX_RECV_SGE)
-		return ERR_PTR(-E2BIG);
+		return -E2BIG;
 
 	/*
 	 * SRQ RQT and RQ must be a power of 2 and at least 16 deep.
@@ -2712,17 +2698,12 @@
 	rqsize = attrs->attr.max_wr + 1;
 	rqsize = roundup_pow_of_two(max_t(u16, rqsize, 16));
 
-	ucontext = pd->uobject ? to_c4iw_ucontext(pd->uobject->context) : NULL;
-
-	srq = kzalloc(sizeof(*srq), GFP_KERNEL);
-	if (!srq)
-		return ERR_PTR(-ENOMEM);
+	ucontext = rdma_udata_to_drv_context(udata, struct c4iw_ucontext,
+					     ibucontext);
 
 	srq->wr_waitp = c4iw_alloc_wr_wait(GFP_KERNEL);
-	if (!srq->wr_waitp) {
-		ret = -ENOMEM;
-		goto err_free_srq;
-	}
+	if (!srq->wr_waitp)
+		return -ENOMEM;
 
 	srq->idx = c4iw_alloc_srq_idx(&rhp->rdev);
 	if (srq->idx < 0) {
@@ -2756,15 +2737,11 @@
 	if (CHELSIO_CHIP_VERSION(rhp->rdev.lldi.adapter_type) > CHELSIO_T6)
 		srq->flags = T4_SRQ_LIMIT_SUPPORT;
 
-	ret = insert_handle(rhp, &rhp->qpidr, srq, srq->wq.qid);
-	if (ret)
-		goto err_free_queue;
-
 	if (udata) {
 		srq_key_mm = kmalloc(sizeof(*srq_key_mm), GFP_KERNEL);
 		if (!srq_key_mm) {
 			ret = -ENOMEM;
-			goto err_remove_handle;
+			goto err_free_queue;
 		}
 		srq_db_key_mm = kmalloc(sizeof(*srq_db_key_mm), GFP_KERNEL);
 		if (!srq_db_key_mm) {
@@ -2802,29 +2779,25 @@
 			(unsigned long)srq->wq.memsize, attrs->attr.max_wr);
 
 	spin_lock_init(&srq->lock);
-	return &srq->ibsrq;
+	return 0;
+
 err_free_srq_db_key_mm:
 	kfree(srq_db_key_mm);
 err_free_srq_key_mm:
 	kfree(srq_key_mm);
-err_remove_handle:
-	remove_handle(rhp, &rhp->qpidr, srq->wq.qid);
 err_free_queue:
 	free_srq_queue(srq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
 		       srq->wr_waitp);
 err_free_skb:
-	if (srq->destroy_skb)
-		kfree_skb(srq->destroy_skb);
+	kfree_skb(srq->destroy_skb);
 err_free_srq_idx:
 	c4iw_free_srq_idx(&rhp->rdev, srq->idx);
 err_free_wr_wait:
 	c4iw_put_wr_wait(srq->wr_waitp);
-err_free_srq:
-	kfree(srq);
-	return ERR_PTR(ret);
+	return ret;
 }
 
-int c4iw_destroy_srq(struct ib_srq *ibsrq)
+void c4iw_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 {
 	struct c4iw_dev *rhp;
 	struct c4iw_srq *srq;
@@ -2834,14 +2807,10 @@
 	rhp = srq->rhp;
 
 	pr_debug("%s id %d\n", __func__, srq->wq.qid);
-
-	remove_handle(rhp, &rhp->qpidr, srq->wq.qid);
-	ucontext = ibsrq->uobject ?
-		to_c4iw_ucontext(ibsrq->uobject->context) : NULL;
+	ucontext = rdma_udata_to_drv_context(udata, struct c4iw_ucontext,
+					     ibucontext);
 	free_srq_queue(srq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
 		       srq->wr_waitp);
 	c4iw_free_srq_idx(&rhp->rdev, srq->idx);
 	c4iw_put_wr_wait(srq->wr_waitp);
-	kfree(srq);
-	return 0;
 }
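
Note: the qp.c hunks above drop the old kref + workqueue-deferred QP free and move to a refcount_t paired with a completion, so c4iw_destroy_qp() can tear the queue down synchronously once the last reference is gone. A minimal sketch of that pattern, with illustrative names rather than the driver's own:

#include <linux/completion.h>
#include <linux/refcount.h>
#include <linux/slab.h>

struct obj {
	refcount_t refcnt;
	struct completion rel_comp;
};

static struct obj *obj_create(void)
{
	struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

	if (!o)
		return NULL;
	init_completion(&o->rel_comp);
	refcount_set(&o->refcnt, 1);	/* creator holds the initial reference */
	return o;
}

static void obj_put(struct obj *o)
{
	if (refcount_dec_and_test(&o->refcnt))
		complete(&o->rel_comp);	/* last user wakes the destroyer */
}

static void obj_destroy(struct obj *o)
{
	obj_put(o);				/* drop the creator's reference */
	wait_for_completion(&o->rel_comp);	/* wait for every other user */
	kfree(o);				/* nothing can race with the free now */
}

In the driver, c4iw_destroy_qp() additionally erases the QP from the qps xarray before dropping its reference, so no new lookup can take a reference while the destroy path is waiting.
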
diff --git a/drivers/infiniband/hw/cxgb4/resource.c b/drivers/infiniband/hw/cxgb4/resource.c
index 57ed26b..5c95c78 100644
--- a/drivers/infiniband/hw/cxgb4/resource.c
+++ b/drivers/infiniband/hw/cxgb4/resource.c
@@ -126,7 +126,7 @@
 		rdev->stats.qid.cur += rdev->qpmask + 1;
 		mutex_unlock(&rdev->stats.lock);
 		for (i = qid+1; i & rdev->qpmask; i++) {
-			entry = kmalloc(sizeof *entry, GFP_KERNEL);
+			entry = kmalloc(sizeof(*entry), GFP_KERNEL);
 			if (!entry)
 				goto out;
 			entry->qid = i;
@@ -137,13 +137,13 @@
 		 * now put the same ids on the qp list since they all
 		 * map to the same db/gts page.
 		 */
-		entry = kmalloc(sizeof *entry, GFP_KERNEL);
+		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
 		if (!entry)
 			goto out;
 		entry->qid = qid;
 		list_add_tail(&entry->entry, &uctx->qpids);
 		for (i = qid+1; i & rdev->qpmask; i++) {
-			entry = kmalloc(sizeof *entry, GFP_KERNEL);
+			entry = kmalloc(sizeof(*entry), GFP_KERNEL);
 			if (!entry)
 				goto out;
 			entry->qid = i;
@@ -165,7 +165,7 @@
 {
 	struct c4iw_qid_list *entry;
 
-	entry = kmalloc(sizeof *entry, GFP_KERNEL);
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
 	if (!entry)
 		return;
 	pr_debug("qid 0x%x\n", qid);
@@ -200,7 +200,7 @@
 		rdev->stats.qid.cur += rdev->qpmask + 1;
 		mutex_unlock(&rdev->stats.lock);
 		for (i = qid+1; i & rdev->qpmask; i++) {
-			entry = kmalloc(sizeof *entry, GFP_KERNEL);
+			entry = kmalloc(sizeof(*entry), GFP_KERNEL);
 			if (!entry)
 				goto out;
 			entry->qid = i;
@@ -211,13 +211,13 @@
 		 * now put the same ids on the cq list since they all
 		 * map to the same db/gts page.
 		 */
-		entry = kmalloc(sizeof *entry, GFP_KERNEL);
+		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
 		if (!entry)
 			goto out;
 		entry->qid = qid;
 		list_add_tail(&entry->entry, &uctx->cqids);
 		for (i = qid; i & rdev->qpmask; i++) {
-			entry = kmalloc(sizeof *entry, GFP_KERNEL);
+			entry = kmalloc(sizeof(*entry), GFP_KERNEL);
 			if (!entry)
 				goto out;
 			entry->qid = i;
@@ -239,7 +239,7 @@
 {
 	struct c4iw_qid_list *entry;
 
-	entry = kmalloc(sizeof *entry, GFP_KERNEL);
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
 	if (!entry)
 		return;
 	pr_debug("qid 0x%x\n", qid);
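
Note: the resource.c hunks are checkpatch-style conversions from the unparenthesised "sizeof *entry" spelling to sizeof(*entry). Deriving the size from the pointer rather than naming the type is the form the file now uses consistently; a small illustration with the struct already used here:

	struct c4iw_qid_list *entry;

	/* preferred: the size follows the type of 'entry' automatically */
	entry = kmalloc(sizeof(*entry), GFP_KERNEL);

	/* discouraged: must be edited by hand if 'entry' ever changes type */
	entry = kmalloc(sizeof(struct c4iw_qid_list), GFP_KERNEL);
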
diff --git a/drivers/infiniband/hw/cxgb4/restrack.c b/drivers/infiniband/hw/cxgb4/restrack.c
index 9a7520e..f82d46e 100644
--- a/drivers/infiniband/hw/cxgb4/restrack.c
+++ b/drivers/infiniband/hw/cxgb4/restrack.c
@@ -149,7 +149,7 @@
 	if (qhp->ucontext)
 		return 0;
 
-	table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER);
+	table_attr = nla_nest_start_noflag(msg, RDMA_NLDEV_ATTR_DRIVER);
 	if (!table_attr)
 		goto err;
 
@@ -216,7 +216,7 @@
 	if (!uep)
 		return 0;
 
-	table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER);
+	table_attr = nla_nest_start_noflag(msg, RDMA_NLDEV_ATTR_DRIVER);
 	if (!table_attr)
 		goto err_free_uep;
 
@@ -387,7 +387,7 @@
 	if (ibcq->uobject)
 		return 0;
 
-	table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER);
+	table_attr = nla_nest_start_noflag(msg, RDMA_NLDEV_ATTR_DRIVER);
 	if (!table_attr)
 		goto err;
 
@@ -447,7 +447,7 @@
 	if (!stag)
 		return 0;
 
-	table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER);
+	table_attr = nla_nest_start_noflag(msg, RDMA_NLDEV_ATTR_DRIVER);
 	if (!table_attr)
 		goto err;
 
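
Note: since v5.2, nla_nest_start() sets the NLA_F_NESTED flag on the nest attribute; switching these call sites to nla_nest_start_noflag() keeps the RDMA_NLDEV_ATTR_DRIVER table on its existing, unflagged wire format. A sketch of the calling pattern, assuming the rdma_nl_put_driver_u32() helper used elsewhere in restrack code:

static int fill_driver_table(struct sk_buff *msg, u32 state)
{
	struct nlattr *table_attr;

	table_attr = nla_nest_start_noflag(msg, RDMA_NLDEV_ATTR_DRIVER);
	if (!table_attr)
		return -EMSGSIZE;

	/* hypothetical entry; the real fill functions add several attributes */
	if (rdma_nl_put_driver_u32(msg, "state", state))
		goto err_cancel;

	nla_nest_end(msg, table_attr);
	return 0;

err_cancel:
	nla_nest_cancel(msg, table_attr);
	return -EMSGSIZE;
}
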
diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h
index e42021f..b170817 100644
--- a/drivers/infiniband/hw/cxgb4/t4.h
+++ b/drivers/infiniband/hw/cxgb4/t4.h
@@ -35,6 +35,7 @@
 #include "t4_regs.h"
 #include "t4_values.h"
 #include "t4_msg.h"
+#include "t4_tcb.h"
 #include "t4fw_ri_api.h"
 
 #define T4_MAX_NUM_PD 65536
@@ -397,7 +398,7 @@
 struct t4_srq {
 	union t4_recv_wr *queue;
 	dma_addr_t dma_addr;
-	DECLARE_PCI_UNMAP_ADDR(mapping);
+	DEFINE_DMA_UNMAP_ADDR(mapping);
 	struct t4_swrqe *sw_rq;
 	void __iomem *bar2_va;
 	u64 bar2_pa;
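
Note: DECLARE_PCI_UNMAP_ADDR() and the pci_unmap_addr*() helpers were removed in favour of the generic state macros from <linux/dma-mapping.h>, which is what the t4.h and qp.c hunks switch to; dma_alloc_coherent() has also returned zeroed memory since v5.0, which is why the explicit memset of the freshly allocated queue disappears. A minimal sketch of the resulting pattern (illustrative names):

#include <linux/dma-mapping.h>

struct ring {
	void *queue;
	dma_addr_t dma_addr;
	DEFINE_DMA_UNMAP_ADDR(mapping);	/* replaces DECLARE_PCI_UNMAP_ADDR */
	size_t memsize;
};

static int ring_alloc(struct device *dev, struct ring *r)
{
	r->queue = dma_alloc_coherent(dev, r->memsize, &r->dma_addr,
				      GFP_KERNEL);	/* returned zeroed */
	if (!r->queue)
		return -ENOMEM;
	dma_unmap_addr_set(r, mapping, r->dma_addr);
	return 0;
}

static void ring_free(struct device *dev, struct ring *r)
{
	dma_free_coherent(dev, r->memsize, r->queue,
			  dma_unmap_addr(r, mapping));
}
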
diff --git a/drivers/infiniband/hw/efa/Kconfig b/drivers/infiniband/hw/efa/Kconfig
new file mode 100644
index 0000000..457e18b
--- /dev/null
+++ b/drivers/infiniband/hw/efa/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+# Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+#
+# Amazon fabric device configuration
+#
+
+config INFINIBAND_EFA
+	tristate "Amazon Elastic Fabric Adapter (EFA) support"
+	depends on PCI_MSI && 64BIT && !CPU_BIG_ENDIAN
+	depends on INFINIBAND_USER_ACCESS
+	help
+	  This driver supports Amazon Elastic Fabric Adapter (EFA).
+
+	  To compile this driver as a module, choose M here.
+	  The module will be called efa.
diff --git a/drivers/infiniband/hw/efa/Makefile b/drivers/infiniband/hw/efa/Makefile
new file mode 100644
index 0000000..6e83083
--- /dev/null
+++ b/drivers/infiniband/hw/efa/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+# Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+#
+# Makefile for Amazon Elastic Fabric Adapter (EFA) device driver.
+#
+
+obj-$(CONFIG_INFINIBAND_EFA) += efa.o
+
+efa-y := efa_com_cmd.o efa_com.o efa_main.o efa_verbs.o
diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
new file mode 100644
index 0000000..2283e43
--- /dev/null
+++ b/drivers/infiniband/hw/efa/efa.h
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef _EFA_H_
+#define _EFA_H_
+
+#include <linux/bitops.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+
+#include <rdma/efa-abi.h>
+#include <rdma/ib_verbs.h>
+
+#include "efa_com_cmd.h"
+
+#define DRV_MODULE_NAME         "efa"
+#define DEVICE_NAME             "Elastic Fabric Adapter (EFA)"
+
+#define EFA_IRQNAME_SIZE        40
+
+/* 1 for AENQ + ADMIN */
+#define EFA_NUM_MSIX_VEC                  1
+#define EFA_MGMNT_MSIX_VEC_IDX            0
+
+struct efa_irq {
+	irq_handler_t handler;
+	void *data;
+	int cpu;
+	u32 vector;
+	cpumask_t affinity_hint_mask;
+	char name[EFA_IRQNAME_SIZE];
+};
+
+struct efa_sw_stats {
+	atomic64_t alloc_pd_err;
+	atomic64_t create_qp_err;
+	atomic64_t create_cq_err;
+	atomic64_t reg_mr_err;
+	atomic64_t alloc_ucontext_err;
+	atomic64_t create_ah_err;
+};
+
+/* Don't use anything other than atomic64 */
+struct efa_stats {
+	struct efa_sw_stats sw_stats;
+	atomic64_t keep_alive_rcvd;
+};
+
+struct efa_dev {
+	struct ib_device ibdev;
+	struct efa_com_dev edev;
+	struct pci_dev *pdev;
+	struct efa_com_get_device_attr_result dev_attr;
+
+	u64 reg_bar_addr;
+	u64 reg_bar_len;
+	u64 mem_bar_addr;
+	u64 mem_bar_len;
+	u64 db_bar_addr;
+	u64 db_bar_len;
+	u8 addr[EFA_GID_SIZE];
+	u32 mtu;
+
+	int admin_msix_vector_idx;
+	struct efa_irq admin_irq;
+
+	struct efa_stats stats;
+};
+
+struct efa_ucontext {
+	struct ib_ucontext ibucontext;
+	struct xarray mmap_xa;
+	u32 mmap_xa_page;
+	u16 uarn;
+};
+
+struct efa_pd {
+	struct ib_pd ibpd;
+	u16 pdn;
+};
+
+struct efa_mr {
+	struct ib_mr ibmr;
+	struct ib_umem *umem;
+};
+
+struct efa_cq {
+	struct ib_cq ibcq;
+	struct efa_ucontext *ucontext;
+	dma_addr_t dma_addr;
+	void *cpu_addr;
+	size_t size;
+	u16 cq_idx;
+};
+
+struct efa_qp {
+	struct ib_qp ibqp;
+	dma_addr_t rq_dma_addr;
+	void *rq_cpu_addr;
+	size_t rq_size;
+	enum ib_qp_state state;
+	u32 qp_handle;
+	u32 max_send_wr;
+	u32 max_recv_wr;
+	u32 max_send_sge;
+	u32 max_recv_sge;
+	u32 max_inline_data;
+};
+
+struct efa_ah {
+	struct ib_ah ibah;
+	u16 ah;
+	/* dest_addr */
+	u8 id[EFA_GID_SIZE];
+};
+
+int efa_query_device(struct ib_device *ibdev,
+		     struct ib_device_attr *props,
+		     struct ib_udata *udata);
+int efa_query_port(struct ib_device *ibdev, u8 port,
+		   struct ib_port_attr *props);
+int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+		 int qp_attr_mask,
+		 struct ib_qp_init_attr *qp_init_attr);
+int efa_query_gid(struct ib_device *ibdev, u8 port, int index,
+		  union ib_gid *gid);
+int efa_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+		   u16 *pkey);
+int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
+void efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
+int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
+struct ib_qp *efa_create_qp(struct ib_pd *ibpd,
+			    struct ib_qp_init_attr *init_attr,
+			    struct ib_udata *udata);
+void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
+int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		  struct ib_udata *udata);
+struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
+			 u64 virt_addr, int access_flags,
+			 struct ib_udata *udata);
+int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
+int efa_get_port_immutable(struct ib_device *ibdev, u8 port_num,
+			   struct ib_port_immutable *immutable);
+int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata);
+void efa_dealloc_ucontext(struct ib_ucontext *ibucontext);
+int efa_mmap(struct ib_ucontext *ibucontext,
+	     struct vm_area_struct *vma);
+int efa_create_ah(struct ib_ah *ibah,
+		  struct rdma_ah_attr *ah_attr,
+		  u32 flags,
+		  struct ib_udata *udata);
+void efa_destroy_ah(struct ib_ah *ibah, u32 flags);
+int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+		  int qp_attr_mask, struct ib_udata *udata);
+enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev,
+					 u8 port_num);
+struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, u8 port_num);
+int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
+		     u8 port_num, int index);
+
+#endif /* _EFA_H_ */
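
Note: each efa_* structure above embeds the corresponding core ib_* object, and the prototypes at the bottom are the verbs entry points that the driver registers with the RDMA core. A sketch of the usual wiring; the accessor names and the exact subset of ops shown are illustrative, not copied from efa_main.c:

static inline struct efa_dev *to_edev(struct ib_device *ibdev)
{
	return container_of(ibdev, struct efa_dev, ibdev);
}

static inline struct efa_qp *to_eqp(struct ib_qp *ibqp)
{
	return container_of(ibqp, struct efa_qp, ibqp);
}

/* excerpt only; the real table covers every prototype declared above */
static const struct ib_device_ops efa_dev_ops = {
	.alloc_pd = efa_alloc_pd,
	.create_cq = efa_create_cq,
	.create_qp = efa_create_qp,
	.dealloc_pd = efa_dealloc_pd,
	.dereg_mr = efa_dereg_mr,
	.destroy_cq = efa_destroy_cq,
	.destroy_qp = efa_destroy_qp,
	.query_device = efa_query_device,
	.reg_user_mr = efa_reg_mr,
};

The device-init path would typically call ib_set_device_ops(&dev->ibdev, &efa_dev_ops) before registering the ib_device.
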
diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
new file mode 100644
index 0000000..2be0469
--- /dev/null
+++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
@@ -0,0 +1,794 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef _EFA_ADMIN_CMDS_H_
+#define _EFA_ADMIN_CMDS_H_
+
+#define EFA_ADMIN_API_VERSION_MAJOR          0
+#define EFA_ADMIN_API_VERSION_MINOR          1
+
+/* EFA admin queue opcodes */
+enum efa_admin_aq_opcode {
+	EFA_ADMIN_CREATE_QP                         = 1,
+	EFA_ADMIN_MODIFY_QP                         = 2,
+	EFA_ADMIN_QUERY_QP                          = 3,
+	EFA_ADMIN_DESTROY_QP                        = 4,
+	EFA_ADMIN_CREATE_AH                         = 5,
+	EFA_ADMIN_DESTROY_AH                        = 6,
+	EFA_ADMIN_REG_MR                            = 7,
+	EFA_ADMIN_DEREG_MR                          = 8,
+	EFA_ADMIN_CREATE_CQ                         = 9,
+	EFA_ADMIN_DESTROY_CQ                        = 10,
+	EFA_ADMIN_GET_FEATURE                       = 11,
+	EFA_ADMIN_SET_FEATURE                       = 12,
+	EFA_ADMIN_GET_STATS                         = 13,
+	EFA_ADMIN_ALLOC_PD                          = 14,
+	EFA_ADMIN_DEALLOC_PD                        = 15,
+	EFA_ADMIN_ALLOC_UAR                         = 16,
+	EFA_ADMIN_DEALLOC_UAR                       = 17,
+	EFA_ADMIN_MAX_OPCODE                        = 17,
+};
+
+enum efa_admin_aq_feature_id {
+	EFA_ADMIN_DEVICE_ATTR                       = 1,
+	EFA_ADMIN_AENQ_CONFIG                       = 2,
+	EFA_ADMIN_NETWORK_ATTR                      = 3,
+	EFA_ADMIN_QUEUE_ATTR                        = 4,
+	EFA_ADMIN_HW_HINTS                          = 5,
+	EFA_ADMIN_FEATURES_OPCODE_NUM               = 8,
+};
+
+/* QP transport type */
+enum efa_admin_qp_type {
+	/* Unreliable Datagram */
+	EFA_ADMIN_QP_TYPE_UD                        = 1,
+	/* Scalable Reliable Datagram */
+	EFA_ADMIN_QP_TYPE_SRD                       = 2,
+};
+
+/* QP state */
+enum efa_admin_qp_state {
+	EFA_ADMIN_QP_STATE_RESET                    = 0,
+	EFA_ADMIN_QP_STATE_INIT                     = 1,
+	EFA_ADMIN_QP_STATE_RTR                      = 2,
+	EFA_ADMIN_QP_STATE_RTS                      = 3,
+	EFA_ADMIN_QP_STATE_SQD                      = 4,
+	EFA_ADMIN_QP_STATE_SQE                      = 5,
+	EFA_ADMIN_QP_STATE_ERR                      = 6,
+};
+
+enum efa_admin_get_stats_type {
+	EFA_ADMIN_GET_STATS_TYPE_BASIC              = 0,
+};
+
+enum efa_admin_get_stats_scope {
+	EFA_ADMIN_GET_STATS_SCOPE_ALL               = 0,
+	EFA_ADMIN_GET_STATS_SCOPE_QUEUE             = 1,
+};
+
+enum efa_admin_modify_qp_mask_bits {
+	EFA_ADMIN_QP_STATE_BIT                      = 0,
+	EFA_ADMIN_CUR_QP_STATE_BIT                  = 1,
+	EFA_ADMIN_QKEY_BIT                          = 2,
+	EFA_ADMIN_SQ_PSN_BIT                        = 3,
+	EFA_ADMIN_SQ_DRAINED_ASYNC_NOTIFY_BIT       = 4,
+};
+
+/*
+ * QP allocation sizes, converted by fabric QueuePair (QP) create command
+ * from QP capabilities.
+ */
+struct efa_admin_qp_alloc_size {
+	/* Send descriptor ring size in bytes */
+	u32 send_queue_ring_size;
+
+	/* Max number of WQEs that can be outstanding on send queue. */
+	u32 send_queue_depth;
+
+	/*
+	 * Recv descriptor ring size in bytes, sufficient for user-provided
+	 * number of WQEs
+	 */
+	u32 recv_queue_ring_size;
+
+	/* Max number of WQEs that can be outstanding on recv queue */
+	u32 recv_queue_depth;
+};
+
+struct efa_admin_create_qp_cmd {
+	/* Common Admin Queue descriptor */
+	struct efa_admin_aq_common_desc aq_common_desc;
+
+	/* Protection Domain associated with this QP */
+	u16 pd;
+
+	/* QP type */
+	u8 qp_type;
+
+	/*
+	 * 0 : sq_virt - If set, SQ ring base address is
+	 *    virtual (IOVA returned by MR registration)
+	 * 1 : rq_virt - If set, RQ ring base address is
+	 *    virtual (IOVA returned by MR registration)
+	 * 7:2 : reserved - MBZ
+	 */
+	u8 flags;
+
+	/*
+	 * Send queue (SQ) ring base physical address. This field is not
+	 * used if this is a Low Latency Queue(LLQ).
+	 */
+	u64 sq_base_addr;
+
+	/* Receive queue (RQ) ring base address. */
+	u64 rq_base_addr;
+
+	/* Index of CQ to be associated with Send Queue completions */
+	u32 send_cq_idx;
+
+	/* Index of CQ to be associated with Recv Queue completions */
+	u32 recv_cq_idx;
+
+	/*
+	 * Memory registration key for the SQ ring, used only when not in
+	 * LLQ mode and base address is virtual
+	 */
+	u32 sq_l_key;
+
+	/*
+	 * Memory registration key for the RQ ring, used only when base
+	 * address is virtual
+	 */
+	u32 rq_l_key;
+
+	/* Requested QP allocation sizes */
+	struct efa_admin_qp_alloc_size qp_alloc_size;
+
+	/* UAR number */
+	u16 uar;
+
+	/* MBZ */
+	u16 reserved;
+
+	/* MBZ */
+	u32 reserved2;
+};
+
+struct efa_admin_create_qp_resp {
+	/* Common Admin Queue completion descriptor */
+	struct efa_admin_acq_common_desc acq_common_desc;
+
+	/* Opaque handle to be used for consequent operations on the QP */
+	u32 qp_handle;
+
+	/* QP number in the given EFA virtual device */
+	u16 qp_num;
+
+	/* MBZ */
+	u16 reserved;
+
+	/* Index of sub-CQ for Send Queue completions */
+	u16 send_sub_cq_idx;
+
+	/* Index of sub-CQ for Receive Queue completions */
+	u16 recv_sub_cq_idx;
+
+	/* SQ doorbell address, as offset to PCIe DB BAR */
+	u32 sq_db_offset;
+
+	/* RQ doorbell address, as offset to PCIe DB BAR */
+	u32 rq_db_offset;
+
+	/*
+	 * low latency send queue ring base address as an offset to PCIe
+	 * MMIO LLQ_MEM BAR
+	 */
+	u32 llq_descriptors_offset;
+};
+
+struct efa_admin_modify_qp_cmd {
+	/* Common Admin Queue descriptor */
+	struct efa_admin_aq_common_desc aq_common_desc;
+
+	/*
+	 * Mask indicating which fields should be updated see enum
+	 * efa_admin_modify_qp_mask_bits
+	 */
+	u32 modify_mask;
+
+	/* QP handle returned by create_qp command */
+	u32 qp_handle;
+
+	/* QP state */
+	u32 qp_state;
+
+	/* Override current QP state (before applying the transition) */
+	u32 cur_qp_state;
+
+	/* QKey */
+	u32 qkey;
+
+	/* SQ PSN */
+	u32 sq_psn;
+
+	/* Enable async notification when SQ is drained */
+	u8 sq_drained_async_notify;
+
+	/* MBZ */
+	u8 reserved1;
+
+	/* MBZ */
+	u16 reserved2;
+};
+
+struct efa_admin_modify_qp_resp {
+	/* Common Admin Queue completion descriptor */
+	struct efa_admin_acq_common_desc acq_common_desc;
+};
+
+struct efa_admin_query_qp_cmd {
+	/* Common Admin Queue descriptor */
+	struct efa_admin_aq_common_desc aq_common_desc;
+
+	/* QP handle returned by create_qp command */
+	u32 qp_handle;
+};
+
+struct efa_admin_query_qp_resp {
+	/* Common Admin Queue completion descriptor */
+	struct efa_admin_acq_common_desc acq_common_desc;
+
+	/* QP state */
+	u32 qp_state;
+
+	/* QKey */
+	u32 qkey;
+
+	/* SQ PSN */
+	u32 sq_psn;
+
+	/* Indicates that draining is in progress */
+	u8 sq_draining;
+
+	/* MBZ */
+	u8 reserved1;
+
+	/* MBZ */
+	u16 reserved2;
+};
+
+struct efa_admin_destroy_qp_cmd {
+	/* Common Admin Queue descriptor */
+	struct efa_admin_aq_common_desc aq_common_desc;
+
+	/* QP handle returned by create_qp command */
+	u32 qp_handle;
+};
+
+struct efa_admin_destroy_qp_resp {
+	/* Common Admin Queue completion descriptor */
+	struct efa_admin_acq_common_desc acq_common_desc;
+};
+
+/*
+ * Create Address Handle command parameters. Must not be called more than
+ * once for the same destination
+ */
+struct efa_admin_create_ah_cmd {
+	/* Common Admin Queue descriptor */
+	struct efa_admin_aq_common_desc aq_common_desc;
+
+	/* Destination address in network byte order */
+	u8 dest_addr[16];
+
+	/* PD number */
+	u16 pd;
+
+	u16 reserved;
+};
+
+struct efa_admin_create_ah_resp {
+	/* Common Admin Queue completion descriptor */
+	struct efa_admin_acq_common_desc acq_common_desc;
+
+	/* Target interface address handle (opaque) */
+	u16 ah;
+
+	u16 reserved;
+};
+
+struct efa_admin_destroy_ah_cmd {
+	/* Common Admin Queue descriptor */
+	struct efa_admin_aq_common_desc aq_common_desc;
+
+	/* Target interface address handle (opaque) */
+	u16 ah;
+
+	/* PD number */
+	u16 pd;
+};
+
+struct efa_admin_destroy_ah_resp {
+	/* Common Admin Queue completion descriptor */
+	struct efa_admin_acq_common_desc acq_common_desc;
+};
+
+/*
+ * Registration of MemoryRegion, required for QP working with Virtual
+ * Addresses. In standard verbs semantics, region length is limited to 2GB
+ * space, but EFA offers larger MR support for large memory space, to ease
+ * on users working with very large datasets (i.e. full GPU memory mapping).
+ */
+struct efa_admin_reg_mr_cmd {
+	/* Common Admin Queue descriptor */
+	struct efa_admin_aq_common_desc aq_common_desc;
+
+	/* Protection Domain */
+	u16 pd;
+
+	/* MBZ */
+	u16 reserved16_w1;
+
+	/* Physical Buffer List, each element is page-aligned. */
+	union {
+		/*
+		 * Inline array of guest-physical page addresses of user
+		 * memory pages (optimization for short region
+		 * registrations)
+		 */
+		u64 inline_pbl_array[4];
+
+		/* points to PBL (direct or indirect, chained if needed) */
+		struct efa_admin_ctrl_buff_info pbl;
+	} pbl;
+
+	/* Memory region length, in bytes. */
+	u64 mr_length;
+
+	/*
+	 * flags and page size
+	 * 4:0 : phys_page_size_shift - page size is (1 <<
+	 *    phys_page_size_shift). Page size is used for
+	 *    building the Virtual to Physical address mapping
+	 * 6:5 : reserved - MBZ
+	 * 7 : mem_addr_phy_mode_en - Enable bit for physical
+	 *    memory registration (no translation), can be used
+	 *    only by privileged clients. If set, PBL must
+	 *    contain a single entry.
+	 */
+	u8 flags;
+
+	/*
+	 * permissions
+	 * 0 : local_write_enable - Write permissions: value
+	 *    of 1 needed for RQ buffers and for RDMA write
+	 * 7:1 : reserved1 - remote access flags, etc
+	 */
+	u8 permissions;
+
+	u16 reserved16_w5;
+
+	/* number of pages in PBL (redundant, could be calculated) */
+	u32 page_num;
+
+	/*
+	 * IO Virtual Address associated with this MR. If
+	 * mem_addr_phy_mode_en is set, contains the physical address of
+	 * the region.
+	 */
+	u64 iova;
+};
+
+struct efa_admin_reg_mr_resp {
+	/* Common Admin Queue completion descriptor */
+	struct efa_admin_acq_common_desc acq_common_desc;
+
+	/*
+	 * L_Key, to be used in conjunction with local buffer references in
+	 * SQ and RQ WQE, or with virtual RQ/CQ rings
+	 */
+	u32 l_key;
+
+	/*
+	 * R_Key, to be used in RDMA messages to refer to remotely accessed
+	 * memory region
+	 */
+	u32 r_key;
+};
+
+struct efa_admin_dereg_mr_cmd {
+	/* Common Admin Queue descriptor */
+	struct efa_admin_aq_common_desc aq_common_desc;
+
+	/* L_Key, memory region's l_key */
+	u32 l_key;
+};
+
+struct efa_admin_dereg_mr_resp {
+	/* Common Admin Queue completion descriptor */
+	struct efa_admin_acq_common_desc acq_common_desc;
+};
+
+struct efa_admin_create_cq_cmd {
+	struct efa_admin_aq_common_desc aq_common_desc;
+
+	/*
+	 * 4:0 : reserved5
+	 * 5 : interrupt_mode_enabled - if set, cq operates
+	 *    in interrupt mode (i.e. CQ events and MSI-X are
+	 *    generated), otherwise - polling
+	 * 6 : virt - If set, ring base address is virtual
+	 *    (IOVA returned by MR registration)
+	 * 7 : reserved6
+	 */
+	u8 cq_caps_1;
+
+	/*
+	 * 4:0 : cq_entry_size_words - size of CQ entry in
+	 *    32-bit words, valid values: 4, 8.
+	 * 7:5 : reserved7
+	 */
+	u8 cq_caps_2;
+
+	/* completion queue depth in # of entries. must be power of 2 */
+	u16 cq_depth;
+
+	/* msix vector assigned to this cq */
+	u32 msix_vector_idx;
+
+	/*
+	 * CQ ring base address, virtual or physical depending on 'virt'
+	 * flag
+	 */
+	struct efa_common_mem_addr cq_ba;
+
+	/*
+	 * Memory registration key for the ring, used only when base
+	 * address is virtual
+	 */
+	u32 l_key;
+
+	/*
+	 * number of sub cqs - must be equal to sub_cqs_per_cq of queue
+	 *    attributes.
+	 */
+	u16 num_sub_cqs;
+
+	/* UAR number */
+	u16 uar;
+};
+
+struct efa_admin_create_cq_resp {
+	struct efa_admin_acq_common_desc acq_common_desc;
+
+	u16 cq_idx;
+
+	/* actual cq depth in number of entries */
+	u16 cq_actual_depth;
+};
+
+struct efa_admin_destroy_cq_cmd {
+	struct efa_admin_aq_common_desc aq_common_desc;
+
+	u16 cq_idx;
+
+	u16 reserved1;
+};
+
+struct efa_admin_destroy_cq_resp {
+	struct efa_admin_acq_common_desc acq_common_desc;
+};
+
+/*
+ * EFA AQ Get Statistics command. Extended statistics are placed in control
+ * buffer pointed by AQ entry
+ */
+struct efa_admin_aq_get_stats_cmd {
+	struct efa_admin_aq_common_desc aq_common_descriptor;
+
+	union {
+		/* command specific inline data */
+		u32 inline_data_w1[3];
+
+		struct efa_admin_ctrl_buff_info control_buffer;
+	} u;
+
+	/* stats type as defined in enum efa_admin_get_stats_type */
+	u8 type;
+
+	/* stats scope defined in enum efa_admin_get_stats_scope */
+	u8 scope;
+
+	u16 scope_modifier;
+};
+
+struct efa_admin_basic_stats {
+	u64 tx_bytes;
+
+	u64 tx_pkts;
+
+	u64 rx_bytes;
+
+	u64 rx_pkts;
+
+	u64 rx_drops;
+};
+
+struct efa_admin_acq_get_stats_resp {
+	struct efa_admin_acq_common_desc acq_common_desc;
+
+	struct efa_admin_basic_stats basic_stats;
+};
+
+struct efa_admin_get_set_feature_common_desc {
+	/*
+	 * 1:0 : select - 0x1 - current value; 0x3 - default
+	 *    value
+	 * 7:3 : reserved3
+	 */
+	u8 flags;
+
+	/* as appears in efa_admin_aq_feature_id */
+	u8 feature_id;
+
+	/* MBZ */
+	u16 reserved16;
+};
+
+struct efa_admin_feature_device_attr_desc {
+	/* Bitmap of efa_admin_aq_feature_id */
+	u64 supported_features;
+
+	/* Bitmap of supported page sizes in MR registrations */
+	u64 page_size_cap;
+
+	u32 fw_version;
+
+	u32 admin_api_version;
+
+	u32 device_version;
+
+	/* Bar used for SQ and RQ doorbells */
+	u16 db_bar;
+
+	/* Indicates how many bits are used for physical address access */
+	u8 phys_addr_width;
+
+	/* Indicates how many bits are used for virtual address access */
+	u8 virt_addr_width;
+};
+
+struct efa_admin_feature_queue_attr_desc {
+	/* The maximum number of queue pairs supported */
+	u32 max_qp;
+
+	u32 max_sq_depth;
+
+	/* max send wr used in inline-buf */
+	u32 inline_buf_size;
+
+	u32 max_rq_depth;
+
+	/* The maximum number of completion queues supported per VF */
+	u32 max_cq;
+
+	u32 max_cq_depth;
+
+	/* Number of sub-CQs to be created for each CQ */
+	u16 sub_cqs_per_cq;
+
+	u16 reserved;
+
+	/*
+	 * Maximum number of SGEs (buffs) allowed for a single send work
+	 *    queue element (WQE)
+	 */
+	u16 max_wr_send_sges;
+
+	/* Maximum number of SGEs allowed for a single recv WQE */
+	u16 max_wr_recv_sges;
+
+	/* The maximum number of memory regions supported */
+	u32 max_mr;
+
+	/* The maximum number of pages can be registered */
+	u32 max_mr_pages;
+
+	/* The maximum number of protection domains supported */
+	u32 max_pd;
+
+	/* The maximum number of address handles supported */
+	u32 max_ah;
+
+	/* The maximum size of LLQ in bytes */
+	u32 max_llq_size;
+};
+
+struct efa_admin_feature_aenq_desc {
+	/* bitmask for AENQ groups the device can report */
+	u32 supported_groups;
+
+	/* bitmask for AENQ groups to report */
+	u32 enabled_groups;
+};
+
+struct efa_admin_feature_network_attr_desc {
+	/* Raw address data in network byte order */
+	u8 addr[16];
+
+	u32 mtu;
+};
+
+/*
+ * When hint value is 0, hints capabilities are not supported or driver
+ * should use its own predefined value
+ */
+struct efa_admin_hw_hints {
+	/* value in ms */
+	u16 mmio_read_timeout;
+
+	/* value in ms */
+	u16 driver_watchdog_timeout;
+
+	/* value in ms */
+	u16 admin_completion_timeout;
+
+	/* poll interval in ms */
+	u16 poll_interval;
+};
+
+struct efa_admin_get_feature_cmd {
+	struct efa_admin_aq_common_desc aq_common_descriptor;
+
+	struct efa_admin_ctrl_buff_info control_buffer;
+
+	struct efa_admin_get_set_feature_common_desc feature_common;
+
+	u32 raw[11];
+};
+
+struct efa_admin_get_feature_resp {
+	struct efa_admin_acq_common_desc acq_common_desc;
+
+	union {
+		u32 raw[14];
+
+		struct efa_admin_feature_device_attr_desc device_attr;
+
+		struct efa_admin_feature_aenq_desc aenq;
+
+		struct efa_admin_feature_network_attr_desc network_attr;
+
+		struct efa_admin_feature_queue_attr_desc queue_attr;
+
+		struct efa_admin_hw_hints hw_hints;
+	} u;
+};
+
+struct efa_admin_set_feature_cmd {
+	struct efa_admin_aq_common_desc aq_common_descriptor;
+
+	struct efa_admin_ctrl_buff_info control_buffer;
+
+	struct efa_admin_get_set_feature_common_desc feature_common;
+
+	union {
+		u32 raw[11];
+
+		/* AENQ configuration */
+		struct efa_admin_feature_aenq_desc aenq;
+	} u;
+};
+
+struct efa_admin_set_feature_resp {
+	struct efa_admin_acq_common_desc acq_common_desc;
+
+	union {
+		u32 raw[14];
+	} u;
+};
+
+struct efa_admin_alloc_pd_cmd {
+	struct efa_admin_aq_common_desc aq_common_descriptor;
+};
+
+struct efa_admin_alloc_pd_resp {
+	struct efa_admin_acq_common_desc acq_common_desc;
+
+	/* PD number */
+	u16 pd;
+
+	/* MBZ */
+	u16 reserved;
+};
+
+struct efa_admin_dealloc_pd_cmd {
+	struct efa_admin_aq_common_desc aq_common_descriptor;
+
+	/* PD number */
+	u16 pd;
+
+	/* MBZ */
+	u16 reserved;
+};
+
+struct efa_admin_dealloc_pd_resp {
+	struct efa_admin_acq_common_desc acq_common_desc;
+};
+
+struct efa_admin_alloc_uar_cmd {
+	struct efa_admin_aq_common_desc aq_common_descriptor;
+};
+
+struct efa_admin_alloc_uar_resp {
+	struct efa_admin_acq_common_desc acq_common_desc;
+
+	/* UAR number */
+	u16 uar;
+
+	/* MBZ */
+	u16 reserved;
+};
+
+struct efa_admin_dealloc_uar_cmd {
+	struct efa_admin_aq_common_desc aq_common_descriptor;
+
+	/* UAR number */
+	u16 uar;
+
+	/* MBZ */
+	u16 reserved;
+};
+
+struct efa_admin_dealloc_uar_resp {
+	struct efa_admin_acq_common_desc acq_common_desc;
+};
+
+/* asynchronous event notification groups */
+enum efa_admin_aenq_group {
+	EFA_ADMIN_FATAL_ERROR                       = 1,
+	EFA_ADMIN_WARNING                           = 2,
+	EFA_ADMIN_NOTIFICATION                      = 3,
+	EFA_ADMIN_KEEP_ALIVE                        = 4,
+	EFA_ADMIN_AENQ_GROUPS_NUM                   = 5,
+};
+
+enum efa_admin_aenq_notification_syndrom {
+	EFA_ADMIN_SUSPEND                           = 0,
+	EFA_ADMIN_RESUME                            = 1,
+	EFA_ADMIN_UPDATE_HINTS                      = 2,
+};
+
+struct efa_admin_mmio_req_read_less_resp {
+	u16 req_id;
+
+	u16 reg_off;
+
+	/* value is valid when poll is cleared */
+	u32 reg_val;
+};
+
+/* create_qp_cmd */
+#define EFA_ADMIN_CREATE_QP_CMD_SQ_VIRT_MASK                BIT(0)
+#define EFA_ADMIN_CREATE_QP_CMD_RQ_VIRT_SHIFT               1
+#define EFA_ADMIN_CREATE_QP_CMD_RQ_VIRT_MASK                BIT(1)
+
+/* reg_mr_cmd */
+#define EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT_MASK      GENMASK(4, 0)
+#define EFA_ADMIN_REG_MR_CMD_MEM_ADDR_PHY_MODE_EN_SHIFT     7
+#define EFA_ADMIN_REG_MR_CMD_MEM_ADDR_PHY_MODE_EN_MASK      BIT(7)
+#define EFA_ADMIN_REG_MR_CMD_LOCAL_WRITE_ENABLE_MASK        BIT(0)
+
+/* create_cq_cmd */
+#define EFA_ADMIN_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_SHIFT 5
+#define EFA_ADMIN_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_MASK BIT(5)
+#define EFA_ADMIN_CREATE_CQ_CMD_VIRT_SHIFT                  6
+#define EFA_ADMIN_CREATE_CQ_CMD_VIRT_MASK                   BIT(6)
+#define EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK    GENMASK(4, 0)
+
+/* get_set_feature_common_desc */
+#define EFA_ADMIN_GET_SET_FEATURE_COMMON_DESC_SELECT_MASK   GENMASK(1, 0)
+
+#endif /* _EFA_ADMIN_CMDS_H_ */
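
Note: the SHIFT/MASK constants at the end of this header describe the sub-byte fields spelled out in the struct comments above (for example "4:0 : phys_page_size_shift" in efa_admin_reg_mr_cmd). A hypothetical helper showing how those masks are meant to be combined when filling a reg_mr command:

static void efa_fill_reg_mr_flags(struct efa_admin_reg_mr_cmd *cmd,
				  u8 page_shift, bool phys_mode,
				  bool local_write)
{
	/* bits 4:0 of 'flags' carry the physical page-size shift */
	cmd->flags |= page_shift &
		      EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT_MASK;

	/* bit 7 of 'flags' selects physical (untranslated) registration */
	if (phys_mode)
		cmd->flags |= EFA_ADMIN_REG_MR_CMD_MEM_ADDR_PHY_MODE_EN_MASK;

	/* bit 0 of 'permissions' grants local write (RQ buffers, RDMA write) */
	if (local_write)
		cmd->permissions |= EFA_ADMIN_REG_MR_CMD_LOCAL_WRITE_ENABLE_MASK;
}
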
diff --git a/drivers/infiniband/hw/efa/efa_admin_defs.h b/drivers/infiniband/hw/efa/efa_admin_defs.h
new file mode 100644
index 0000000..c8e0c8b
--- /dev/null
+++ b/drivers/infiniband/hw/efa/efa_admin_defs.h
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef _EFA_ADMIN_H_
+#define _EFA_ADMIN_H_
+
+enum efa_admin_aq_completion_status {
+	EFA_ADMIN_SUCCESS                           = 0,
+	EFA_ADMIN_RESOURCE_ALLOCATION_FAILURE       = 1,
+	EFA_ADMIN_BAD_OPCODE                        = 2,
+	EFA_ADMIN_UNSUPPORTED_OPCODE                = 3,
+	EFA_ADMIN_MALFORMED_REQUEST                 = 4,
+	/* Additional status is provided in ACQ entry extended_status */
+	EFA_ADMIN_ILLEGAL_PARAMETER                 = 5,
+	EFA_ADMIN_UNKNOWN_ERROR                     = 6,
+	EFA_ADMIN_RESOURCE_BUSY                     = 7,
+};
+
+struct efa_admin_aq_common_desc {
+	/*
+	 * 11:0 : command_id
+	 * 15:12 : reserved12
+	 */
+	u16 command_id;
+
+	/* as appears in efa_admin_aq_opcode */
+	u8 opcode;
+
+	/*
+	 * 0 : phase
+	 * 1 : ctrl_data - control buffer address valid
+	 * 2 : ctrl_data_indirect - control buffer address
+	 *    points to list of pages with addresses of control
+	 *    buffers
+	 * 7:3 : reserved3
+	 */
+	u8 flags;
+};
+
+/*
+ * used in efa_admin_aq_entry. Can point directly to control data, or to a
+ * page list chunk. Used also at the end of indirect mode page list chunks,
+ * for chaining.
+ */
+struct efa_admin_ctrl_buff_info {
+	u32 length;
+
+	struct efa_common_mem_addr address;
+};
+
+struct efa_admin_aq_entry {
+	struct efa_admin_aq_common_desc aq_common_descriptor;
+
+	union {
+		u32 inline_data_w1[3];
+
+		struct efa_admin_ctrl_buff_info control_buffer;
+	} u;
+
+	u32 inline_data_w4[12];
+};
+
+struct efa_admin_acq_common_desc {
+	/*
+	 * command identifier to associate it with the aq descriptor
+	 * 11:0 : command_id
+	 * 15:12 : reserved12
+	 */
+	u16 command;
+
+	u8 status;
+
+	/*
+	 * 0 : phase
+	 * 7:1 : reserved1
+	 */
+	u8 flags;
+
+	u16 extended_status;
+
+	/*
+	 * indicates to the driver which AQ entry has been consumed by the
+	 *    device and could be reused
+	 */
+	u16 sq_head_indx;
+};
+
+struct efa_admin_acq_entry {
+	struct efa_admin_acq_common_desc acq_common_descriptor;
+
+	u32 response_specific_data[14];
+};
+
+struct efa_admin_aenq_common_desc {
+	u16 group;
+
+	u16 syndrom;
+
+	/*
+	 * 0 : phase
+	 * 7:1 : reserved - MBZ
+	 */
+	u8 flags;
+
+	u8 reserved1[3];
+
+	u32 timestamp_low;
+
+	u32 timestamp_high;
+};
+
+struct efa_admin_aenq_entry {
+	struct efa_admin_aenq_common_desc aenq_common_desc;
+
+	/* command specific inline data */
+	u32 inline_data_w4[12];
+};
+
+/* aq_common_desc */
+#define EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK            GENMASK(11, 0)
+#define EFA_ADMIN_AQ_COMMON_DESC_PHASE_MASK                 BIT(0)
+#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_SHIFT            1
+#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK             BIT(1)
+#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_SHIFT   2
+#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK    BIT(2)
+
+/* acq_common_desc */
+#define EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK           GENMASK(11, 0)
+#define EFA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK                BIT(0)
+
+/* aenq_common_desc */
+#define EFA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK               BIT(0)
+
+#endif /* _EFA_ADMIN_H_ */
diff --git a/drivers/infiniband/hw/efa/efa_com.c b/drivers/infiniband/hw/efa/efa_com.c
new file mode 100644
index 0000000..3c412bc
--- /dev/null
+++ b/drivers/infiniband/hw/efa/efa_com.c
@@ -0,0 +1,1098 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#include "efa_com.h"
+#include "efa_regs_defs.h"
+
+#define ADMIN_CMD_TIMEOUT_US 30000000 /* usecs */
+
+#define EFA_REG_READ_TIMEOUT_US 50000 /* usecs */
+#define EFA_MMIO_READ_INVALID 0xffffffff
+
+#define EFA_POLL_INTERVAL_MS 100 /* msecs */
+
+#define EFA_ASYNC_QUEUE_DEPTH 16
+#define EFA_ADMIN_QUEUE_DEPTH 32
+
+#define MIN_EFA_VER\
+	((EFA_ADMIN_API_VERSION_MAJOR << EFA_REGS_VERSION_MAJOR_VERSION_SHIFT) | \
+	 (EFA_ADMIN_API_VERSION_MINOR & EFA_REGS_VERSION_MINOR_VERSION_MASK))
+
+#define EFA_CTRL_MAJOR          0
+#define EFA_CTRL_MINOR          0
+#define EFA_CTRL_SUB_MINOR      1
+
+#define MIN_EFA_CTRL_VER \
+	(((EFA_CTRL_MAJOR) << \
+	(EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT)) | \
+	((EFA_CTRL_MINOR) << \
+	(EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT)) | \
+	(EFA_CTRL_SUB_MINOR))
+
+#define EFA_DMA_ADDR_TO_UINT32_LOW(x)   ((u32)((u64)(x)))
+#define EFA_DMA_ADDR_TO_UINT32_HIGH(x)  ((u32)(((u64)(x)) >> 32))
+
+#define EFA_REGS_ADMIN_INTR_MASK 1
+
+enum efa_cmd_status {
+	EFA_CMD_SUBMITTED,
+	EFA_CMD_COMPLETED,
+};
+
+struct efa_comp_ctx {
+	struct completion wait_event;
+	struct efa_admin_acq_entry *user_cqe;
+	u32 comp_size;
+	enum efa_cmd_status status;
+	/* status from the device */
+	u8 comp_status;
+	u8 cmd_opcode;
+	u8 occupied;
+};
+
+static const char *efa_com_cmd_str(u8 cmd)
+{
+#define EFA_CMD_STR_CASE(_cmd) case EFA_ADMIN_##_cmd: return #_cmd
+
+	switch (cmd) {
+	EFA_CMD_STR_CASE(CREATE_QP);
+	EFA_CMD_STR_CASE(MODIFY_QP);
+	EFA_CMD_STR_CASE(QUERY_QP);
+	EFA_CMD_STR_CASE(DESTROY_QP);
+	EFA_CMD_STR_CASE(CREATE_AH);
+	EFA_CMD_STR_CASE(DESTROY_AH);
+	EFA_CMD_STR_CASE(REG_MR);
+	EFA_CMD_STR_CASE(DEREG_MR);
+	EFA_CMD_STR_CASE(CREATE_CQ);
+	EFA_CMD_STR_CASE(DESTROY_CQ);
+	EFA_CMD_STR_CASE(GET_FEATURE);
+	EFA_CMD_STR_CASE(SET_FEATURE);
+	EFA_CMD_STR_CASE(GET_STATS);
+	EFA_CMD_STR_CASE(ALLOC_PD);
+	EFA_CMD_STR_CASE(DEALLOC_PD);
+	EFA_CMD_STR_CASE(ALLOC_UAR);
+	EFA_CMD_STR_CASE(DEALLOC_UAR);
+	default: return "unknown command opcode";
+	}
+#undef EFA_CMD_STR_CASE
+}
+
+static u32 efa_com_reg_read32(struct efa_com_dev *edev, u16 offset)
+{
+	struct efa_com_mmio_read *mmio_read = &edev->mmio_read;
+	struct efa_admin_mmio_req_read_less_resp *read_resp;
+	unsigned long exp_time;
+	u32 mmio_read_reg;
+	u32 err;
+
+	read_resp = mmio_read->read_resp;
+
+	spin_lock(&mmio_read->lock);
+	mmio_read->seq_num++;
+
+	/* trash DMA req_id to identify when hardware is done */
+	read_resp->req_id = mmio_read->seq_num + 0x9aL;
+	mmio_read_reg = (offset << EFA_REGS_MMIO_REG_READ_REG_OFF_SHIFT) &
+			EFA_REGS_MMIO_REG_READ_REG_OFF_MASK;
+	mmio_read_reg |= mmio_read->seq_num &
+			 EFA_REGS_MMIO_REG_READ_REQ_ID_MASK;
+
+	writel(mmio_read_reg, edev->reg_bar + EFA_REGS_MMIO_REG_READ_OFF);
+
+	exp_time = jiffies + usecs_to_jiffies(mmio_read->mmio_read_timeout);
+	do {
+		if (READ_ONCE(read_resp->req_id) == mmio_read->seq_num)
+			break;
+		udelay(1);
+	} while (time_is_after_jiffies(exp_time));
+
+	if (read_resp->req_id != mmio_read->seq_num) {
+		ibdev_err_ratelimited(
+			edev->efa_dev,
+			"Reading register timed out. expected: req id[%u] offset[%#x] actual: req id[%u] offset[%#x]\n",
+			mmio_read->seq_num, offset, read_resp->req_id,
+			read_resp->reg_off);
+		err = EFA_MMIO_READ_INVALID;
+		goto out;
+	}
+
+	if (read_resp->reg_off != offset) {
+		ibdev_err_ratelimited(
+			edev->efa_dev,
+			"Reading register failed: wrong offset provided\n");
+		err = EFA_MMIO_READ_INVALID;
+		goto out;
+	}
+
+	err = read_resp->reg_val;
+out:
+	spin_unlock(&mmio_read->lock);
+	return err;
+}
+
+static int efa_com_admin_init_sq(struct efa_com_dev *edev)
+{
+	struct efa_com_admin_queue *aq = &edev->aq;
+	struct efa_com_admin_sq *sq = &aq->sq;
+	u16 size = aq->depth * sizeof(*sq->entries);
+	u32 addr_high;
+	u32 addr_low;
+	u32 aq_caps;
+
+	sq->entries =
+		dma_alloc_coherent(aq->dmadev, size, &sq->dma_addr, GFP_KERNEL);
+	if (!sq->entries)
+		return -ENOMEM;
+
+	spin_lock_init(&sq->lock);
+
+	sq->cc = 0;
+	sq->pc = 0;
+	sq->phase = 1;
+
+	sq->db_addr = (u32 __iomem *)(edev->reg_bar + EFA_REGS_AQ_PROD_DB_OFF);
+
+	addr_high = EFA_DMA_ADDR_TO_UINT32_HIGH(sq->dma_addr);
+	addr_low = EFA_DMA_ADDR_TO_UINT32_LOW(sq->dma_addr);
+
+	writel(addr_low, edev->reg_bar + EFA_REGS_AQ_BASE_LO_OFF);
+	writel(addr_high, edev->reg_bar + EFA_REGS_AQ_BASE_HI_OFF);
+
+	aq_caps = aq->depth & EFA_REGS_AQ_CAPS_AQ_DEPTH_MASK;
+	aq_caps |= (sizeof(struct efa_admin_aq_entry) <<
+			EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT) &
+			EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK;
+
+	writel(aq_caps, edev->reg_bar + EFA_REGS_AQ_CAPS_OFF);
+
+	return 0;
+}
+
+static int efa_com_admin_init_cq(struct efa_com_dev *edev)
+{
+	struct efa_com_admin_queue *aq = &edev->aq;
+	struct efa_com_admin_cq *cq = &aq->cq;
+	u16 size = aq->depth * sizeof(*cq->entries);
+	u32 addr_high;
+	u32 addr_low;
+	u32 acq_caps;
+
+	cq->entries =
+		dma_alloc_coherent(aq->dmadev, size, &cq->dma_addr, GFP_KERNEL);
+	if (!cq->entries)
+		return -ENOMEM;
+
+	spin_lock_init(&cq->lock);
+
+	cq->cc = 0;
+	cq->phase = 1;
+
+	addr_high = EFA_DMA_ADDR_TO_UINT32_HIGH(cq->dma_addr);
+	addr_low = EFA_DMA_ADDR_TO_UINT32_LOW(cq->dma_addr);
+
+	writel(addr_low, edev->reg_bar + EFA_REGS_ACQ_BASE_LO_OFF);
+	writel(addr_high, edev->reg_bar + EFA_REGS_ACQ_BASE_HI_OFF);
+
+	acq_caps = aq->depth & EFA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK;
+	acq_caps |= (sizeof(struct efa_admin_acq_entry) <<
+			EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT) &
+			EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK;
+	acq_caps |= (aq->msix_vector_idx <<
+			EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_SHIFT) &
+			EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_MASK;
+
+	writel(acq_caps, edev->reg_bar + EFA_REGS_ACQ_CAPS_OFF);
+
+	return 0;
+}
+
+static int efa_com_admin_init_aenq(struct efa_com_dev *edev,
+				   struct efa_aenq_handlers *aenq_handlers)
+{
+	struct efa_com_aenq *aenq = &edev->aenq;
+	u32 addr_low, addr_high, aenq_caps;
+	u16 size;
+
+	if (!aenq_handlers) {
+		ibdev_err(edev->efa_dev, "aenq handlers pointer is NULL\n");
+		return -EINVAL;
+	}
+
+	size = EFA_ASYNC_QUEUE_DEPTH * sizeof(*aenq->entries);
+	aenq->entries = dma_alloc_coherent(edev->dmadev, size, &aenq->dma_addr,
+					   GFP_KERNEL);
+	if (!aenq->entries)
+		return -ENOMEM;
+
+	aenq->aenq_handlers = aenq_handlers;
+	aenq->depth = EFA_ASYNC_QUEUE_DEPTH;
+	aenq->cc = 0;
+	aenq->phase = 1;
+
+	addr_low = EFA_DMA_ADDR_TO_UINT32_LOW(aenq->dma_addr);
+	addr_high = EFA_DMA_ADDR_TO_UINT32_HIGH(aenq->dma_addr);
+
+	writel(addr_low, edev->reg_bar + EFA_REGS_AENQ_BASE_LO_OFF);
+	writel(addr_high, edev->reg_bar + EFA_REGS_AENQ_BASE_HI_OFF);
+
+	aenq_caps = aenq->depth & EFA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK;
+	aenq_caps |= (sizeof(struct efa_admin_aenq_entry) <<
+		EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT) &
+		EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK;
+	aenq_caps |= (aenq->msix_vector_idx
+		      << EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_SHIFT) &
+		     EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_MASK;
+	writel(aenq_caps, edev->reg_bar + EFA_REGS_AENQ_CAPS_OFF);
+
+	/*
+	 * Init cons_db to mark that all entries in the queue
+	 * are initially available
+	 */
+	writel(edev->aenq.cc, edev->reg_bar + EFA_REGS_AENQ_CONS_DB_OFF);
+
+	return 0;
+}
+
+/* ID to be used with efa_com_get_comp_ctx */
+static u16 efa_com_alloc_ctx_id(struct efa_com_admin_queue *aq)
+{
+	u16 ctx_id;
+
+	spin_lock(&aq->comp_ctx_lock);
+	ctx_id = aq->comp_ctx_pool[aq->comp_ctx_pool_next];
+	aq->comp_ctx_pool_next++;
+	spin_unlock(&aq->comp_ctx_lock);
+
+	return ctx_id;
+}
+
+static void efa_com_dealloc_ctx_id(struct efa_com_admin_queue *aq,
+				   u16 ctx_id)
+{
+	spin_lock(&aq->comp_ctx_lock);
+	aq->comp_ctx_pool_next--;
+	aq->comp_ctx_pool[aq->comp_ctx_pool_next] = ctx_id;
+	spin_unlock(&aq->comp_ctx_lock);
+}
+
+static inline void efa_com_put_comp_ctx(struct efa_com_admin_queue *aq,
+					struct efa_comp_ctx *comp_ctx)
+{
+	u16 cmd_id = comp_ctx->user_cqe->acq_common_descriptor.command &
+		     EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK;
+	u16 ctx_id = cmd_id & (aq->depth - 1);
+
+	ibdev_dbg(aq->efa_dev, "Put completion command_id %#x\n", cmd_id);
+	comp_ctx->occupied = 0;
+	efa_com_dealloc_ctx_id(aq, ctx_id);
+}
+
+static struct efa_comp_ctx *efa_com_get_comp_ctx(struct efa_com_admin_queue *aq,
+						 u16 cmd_id, bool capture)
+{
+	u16 ctx_id = cmd_id & (aq->depth - 1);
+
+	if (aq->comp_ctx[ctx_id].occupied && capture) {
+		ibdev_err_ratelimited(
+			aq->efa_dev,
+			"Completion context for command_id %#x is occupied\n",
+			cmd_id);
+		return NULL;
+	}
+
+	if (capture) {
+		aq->comp_ctx[ctx_id].occupied = 1;
+		ibdev_dbg(aq->efa_dev,
+			  "Take completion ctxt for command_id %#x\n", cmd_id);
+	}
+
+	return &aq->comp_ctx[ctx_id];
+}
+
+static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq,
+						       struct efa_admin_aq_entry *cmd,
+						       size_t cmd_size_in_bytes,
+						       struct efa_admin_acq_entry *comp,
+						       size_t comp_size_in_bytes)
+{
+	struct efa_comp_ctx *comp_ctx;
+	u16 queue_size_mask;
+	u16 cmd_id;
+	u16 ctx_id;
+	u16 pi;
+
+	queue_size_mask = aq->depth - 1;
+	pi = aq->sq.pc & queue_size_mask;
+
+	ctx_id = efa_com_alloc_ctx_id(aq);
+
+	/* cmd_id LSBs are the ctx_id and MSBs are entropy bits from pc */
+	cmd_id = ctx_id & queue_size_mask;
+	cmd_id |= aq->sq.pc & ~queue_size_mask;
+	cmd_id &= EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK;
+
+	cmd->aq_common_descriptor.command_id = cmd_id;
+	cmd->aq_common_descriptor.flags |= aq->sq.phase &
+		EFA_ADMIN_AQ_COMMON_DESC_PHASE_MASK;
+
+	comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, true);
+	if (!comp_ctx) {
+		efa_com_dealloc_ctx_id(aq, ctx_id);
+		return ERR_PTR(-EINVAL);
+	}
+
+	comp_ctx->status = EFA_CMD_SUBMITTED;
+	comp_ctx->comp_size = comp_size_in_bytes;
+	comp_ctx->user_cqe = comp;
+	comp_ctx->cmd_opcode = cmd->aq_common_descriptor.opcode;
+
+	reinit_completion(&comp_ctx->wait_event);
+
+	memcpy(&aq->sq.entries[pi], cmd, cmd_size_in_bytes);
+
+	aq->sq.pc++;
+	atomic64_inc(&aq->stats.submitted_cmd);
+
+	if ((aq->sq.pc & queue_size_mask) == 0)
+		aq->sq.phase = !aq->sq.phase;
+
+	/* barrier not needed in case of writel */
+	writel(aq->sq.pc, aq->sq.db_addr);
+
+	return comp_ctx;
+}
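For illustration only (a standalone userspace sketch, not part of this patch), the cmd_id packing used in __efa_com_submit_admin_cmd() round-trips as follows, assuming a queue depth of 16 and hypothetical ctx_id/pc values:

#include <stdio.h>

int main(void)
{
	unsigned int depth = 16, mask = depth - 1;	/* queue_size_mask */
	unsigned int ctx_id = 3, pc = 37;		/* hypothetical values */

	/* LSBs carry the ctx_id, MSBs carry "entropy" bits from the pc */
	unsigned int cmd_id = (ctx_id & mask) | (pc & ~mask);	/* 3 | 32 = 35 */

	/* the completion path recovers the ctx_id with the same mask */
	printf("cmd_id=%u -> ctx_id=%u\n", cmd_id, cmd_id & mask);
	return 0;
}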
+
+static inline int efa_com_init_comp_ctxt(struct efa_com_admin_queue *aq)
+{
+	size_t pool_size = aq->depth * sizeof(*aq->comp_ctx_pool);
+	size_t size = aq->depth * sizeof(struct efa_comp_ctx);
+	struct efa_comp_ctx *comp_ctx;
+	u16 i;
+
+	aq->comp_ctx = devm_kzalloc(aq->dmadev, size, GFP_KERNEL);
+	aq->comp_ctx_pool = devm_kzalloc(aq->dmadev, pool_size, GFP_KERNEL);
+	if (!aq->comp_ctx || !aq->comp_ctx_pool) {
+		devm_kfree(aq->dmadev, aq->comp_ctx_pool);
+		devm_kfree(aq->dmadev, aq->comp_ctx);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < aq->depth; i++) {
+		comp_ctx = efa_com_get_comp_ctx(aq, i, false);
+		if (comp_ctx)
+			init_completion(&comp_ctx->wait_event);
+
+		aq->comp_ctx_pool[i] = i;
+	}
+
+	spin_lock_init(&aq->comp_ctx_lock);
+
+	aq->comp_ctx_pool_next = 0;
+
+	return 0;
+}
+
+static struct efa_comp_ctx *efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq,
+						     struct efa_admin_aq_entry *cmd,
+						     size_t cmd_size_in_bytes,
+						     struct efa_admin_acq_entry *comp,
+						     size_t comp_size_in_bytes)
+{
+	struct efa_comp_ctx *comp_ctx;
+
+	spin_lock(&aq->sq.lock);
+	if (!test_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state)) {
+		ibdev_err_ratelimited(aq->efa_dev, "Admin queue is closed\n");
+		spin_unlock(&aq->sq.lock);
+		return ERR_PTR(-ENODEV);
+	}
+
+	comp_ctx = __efa_com_submit_admin_cmd(aq, cmd, cmd_size_in_bytes, comp,
+					      comp_size_in_bytes);
+	spin_unlock(&aq->sq.lock);
+	if (IS_ERR(comp_ctx))
+		clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state);
+
+	return comp_ctx;
+}
+
+static void efa_com_handle_single_admin_completion(struct efa_com_admin_queue *aq,
+						   struct efa_admin_acq_entry *cqe)
+{
+	struct efa_comp_ctx *comp_ctx;
+	u16 cmd_id;
+
+	cmd_id = cqe->acq_common_descriptor.command &
+		 EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK;
+
+	comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, false);
+	if (!comp_ctx) {
+		ibdev_err(aq->efa_dev,
+			  "comp_ctx is NULL. Changing the admin queue running state\n");
+		clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state);
+		return;
+	}
+
+	comp_ctx->status = EFA_CMD_COMPLETED;
+	comp_ctx->comp_status = cqe->acq_common_descriptor.status;
+	if (comp_ctx->user_cqe)
+		memcpy(comp_ctx->user_cqe, cqe, comp_ctx->comp_size);
+
+	if (!test_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state))
+		complete(&comp_ctx->wait_event);
+}
+
+static void efa_com_handle_admin_completion(struct efa_com_admin_queue *aq)
+{
+	struct efa_admin_acq_entry *cqe;
+	u16 queue_size_mask;
+	u16 comp_num = 0;
+	u8 phase;
+	u16 ci;
+
+	queue_size_mask = aq->depth - 1;
+
+	ci = aq->cq.cc & queue_size_mask;
+	phase = aq->cq.phase;
+
+	cqe = &aq->cq.entries[ci];
+
+	/* Go over all the completions */
+	while ((READ_ONCE(cqe->acq_common_descriptor.flags) &
+		EFA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK) == phase) {
+		/*
+		 * Do not read the rest of the completion entry before the
+		 * phase bit was validated
+		 */
+		dma_rmb();
+		efa_com_handle_single_admin_completion(aq, cqe);
+
+		ci++;
+		comp_num++;
+		if (ci == aq->depth) {
+			ci = 0;
+			phase = !phase;
+		}
+
+		cqe = &aq->cq.entries[ci];
+	}
+
+	aq->cq.cc += comp_num;
+	aq->cq.phase = phase;
+	aq->sq.cc += comp_num;
+	atomic64_add(comp_num, &aq->stats.completed_cmd);
+}
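A minimal standalone model of the phase-bit ownership rule used by the loop above (illustrative, not driver code): an entry belongs to the consumer only while its phase bit matches the expected phase, and the expected phase flips on every wrap-around, so stale entries from the previous lap never match.

#include <stdio.h>

#define DEPTH 4

int main(void)
{
	/* phase bits as the device might have written them: 2 new entries */
	unsigned char entry_phase[DEPTH] = { 1, 1, 0, 0 };
	unsigned char phase = 1;
	unsigned int ci = 0, consumed = 0;

	while (entry_phase[ci] == phase) {
		consumed++;			/* handle the completion */
		if (++ci == DEPTH) {
			ci = 0;
			phase = !phase;		/* flip on wrap, as above */
		}
	}

	printf("consumed %u completions\n", consumed);	/* prints 2 */
	return 0;
}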
+
+static int efa_com_comp_status_to_errno(u8 comp_status)
+{
+	switch (comp_status) {
+	case EFA_ADMIN_SUCCESS:
+		return 0;
+	case EFA_ADMIN_RESOURCE_ALLOCATION_FAILURE:
+		return -ENOMEM;
+	case EFA_ADMIN_UNSUPPORTED_OPCODE:
+		return -EOPNOTSUPP;
+	case EFA_ADMIN_BAD_OPCODE:
+	case EFA_ADMIN_MALFORMED_REQUEST:
+	case EFA_ADMIN_ILLEGAL_PARAMETER:
+	case EFA_ADMIN_UNKNOWN_ERROR:
+		return -EINVAL;
+	default:
+		return -EINVAL;
+	}
+}
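For example, a device completion status of EFA_ADMIN_UNSUPPORTED_OPCODE is surfaced to callers of efa_com_cmd_exec() as -EOPNOTSUPP, EFA_ADMIN_RESOURCE_ALLOCATION_FAILURE becomes -ENOMEM, and every other error status collapses to -EINVAL.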
+
+static int efa_com_wait_and_process_admin_cq_polling(struct efa_comp_ctx *comp_ctx,
+						     struct efa_com_admin_queue *aq)
+{
+	unsigned long timeout;
+	unsigned long flags;
+	int err;
+
+	timeout = jiffies + usecs_to_jiffies(aq->completion_timeout);
+
+	while (1) {
+		spin_lock_irqsave(&aq->cq.lock, flags);
+		efa_com_handle_admin_completion(aq);
+		spin_unlock_irqrestore(&aq->cq.lock, flags);
+
+		if (comp_ctx->status != EFA_CMD_SUBMITTED)
+			break;
+
+		if (time_is_before_jiffies(timeout)) {
+			ibdev_err_ratelimited(
+				aq->efa_dev,
+				"Wait for completion (polling) timeout\n");
+			/* EFA didn't have any completion */
+			atomic64_inc(&aq->stats.no_completion);
+
+			clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state);
+			err = -ETIME;
+			goto out;
+		}
+
+		msleep(aq->poll_interval);
+	}
+
+	err = efa_com_comp_status_to_errno(comp_ctx->comp_status);
+out:
+	efa_com_put_comp_ctx(aq, comp_ctx);
+	return err;
+}
+
+static int efa_com_wait_and_process_admin_cq_interrupts(struct efa_comp_ctx *comp_ctx,
+							struct efa_com_admin_queue *aq)
+{
+	unsigned long flags;
+	int err;
+
+	wait_for_completion_timeout(&comp_ctx->wait_event,
+				    usecs_to_jiffies(aq->completion_timeout));
+
+	/*
+	 * In case the command wasn't completed, find out the root cause.
+	 * There might be 2 kinds of errors:
+	 * 1) No completion (timeout reached)
+	 * 2) There is a completion but the driver didn't receive the MSI-X
+	 *    interrupt.
+	 */
+	if (comp_ctx->status == EFA_CMD_SUBMITTED) {
+		spin_lock_irqsave(&aq->cq.lock, flags);
+		efa_com_handle_admin_completion(aq);
+		spin_unlock_irqrestore(&aq->cq.lock, flags);
+
+		atomic64_inc(&aq->stats.no_completion);
+
+		if (comp_ctx->status == EFA_CMD_COMPLETED)
+			ibdev_err_ratelimited(
+				aq->efa_dev,
+				"The device sent a completion but the driver didn't receive any MSI-X interrupt for admin cmd %s(%d) status %d (ctx: 0x%p, sq producer: %d, sq consumer: %d, cq consumer: %d)\n",
+				efa_com_cmd_str(comp_ctx->cmd_opcode),
+				comp_ctx->cmd_opcode, comp_ctx->status,
+				comp_ctx, aq->sq.pc, aq->sq.cc, aq->cq.cc);
+		else
+			ibdev_err_ratelimited(
+				aq->efa_dev,
+				"The device didn't send any completion for admin cmd %s(%d) status %d (ctx 0x%p, sq producer: %d, sq consumer: %d, cq consumer: %d)\n",
+				efa_com_cmd_str(comp_ctx->cmd_opcode),
+				comp_ctx->cmd_opcode, comp_ctx->status,
+				comp_ctx, aq->sq.pc, aq->sq.cc, aq->cq.cc);
+
+		clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state);
+		err = -ETIME;
+		goto out;
+	}
+
+	err = efa_com_comp_status_to_errno(comp_ctx->comp_status);
+out:
+	efa_com_put_comp_ctx(aq, comp_ctx);
+	return err;
+}
+
+/*
+ * There are two modes for waiting for a completion.
+ * Polling mode - poll the completion queue until the completion is available.
+ * Async mode - wait on a wait queue until the completion is ready
+ * (or the timeout expires).
+ * In async mode the IRQ handler is expected to have called
+ * efa_com_handle_admin_completion to mark the completions.
+ */
+static int efa_com_wait_and_process_admin_cq(struct efa_comp_ctx *comp_ctx,
+					     struct efa_com_admin_queue *aq)
+{
+	if (test_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state))
+		return efa_com_wait_and_process_admin_cq_polling(comp_ctx, aq);
+
+	return efa_com_wait_and_process_admin_cq_interrupts(comp_ctx, aq);
+}
+
+/**
+ * efa_com_cmd_exec - Execute admin command
+ * @aq: admin queue.
+ * @cmd: the admin command to execute.
+ * @cmd_size: the command size.
+ * @comp: command completion return entry.
+ * @comp_size: command completion size.
+ * Submit an admin command and then wait until the device returns a
+ * completion.
+ * The completion will be copied into comp.
+ *
+ * @return - 0 on success, negative value on failure.
+ */
+int efa_com_cmd_exec(struct efa_com_admin_queue *aq,
+		     struct efa_admin_aq_entry *cmd,
+		     size_t cmd_size,
+		     struct efa_admin_acq_entry *comp,
+		     size_t comp_size)
+{
+	struct efa_comp_ctx *comp_ctx;
+	int err;
+
+	might_sleep();
+
+	/* In case of queue FULL */
+	down(&aq->avail_cmds);
+
+	ibdev_dbg(aq->efa_dev, "%s (opcode %d)\n",
+		  efa_com_cmd_str(cmd->aq_common_descriptor.opcode),
+		  cmd->aq_common_descriptor.opcode);
+	comp_ctx = efa_com_submit_admin_cmd(aq, cmd, cmd_size, comp, comp_size);
+	if (IS_ERR(comp_ctx)) {
+		ibdev_err_ratelimited(
+			aq->efa_dev,
+			"Failed to submit command %s (opcode %u) err %ld\n",
+			efa_com_cmd_str(cmd->aq_common_descriptor.opcode),
+			cmd->aq_common_descriptor.opcode, PTR_ERR(comp_ctx));
+
+		up(&aq->avail_cmds);
+		return PTR_ERR(comp_ctx);
+	}
+
+	err = efa_com_wait_and_process_admin_cq(comp_ctx, aq);
+	if (err)
+		ibdev_err_ratelimited(
+			aq->efa_dev,
+			"Failed to process command %s (opcode %u) comp_status %d err %d\n",
+			efa_com_cmd_str(cmd->aq_common_descriptor.opcode),
+			cmd->aq_common_descriptor.opcode, comp_ctx->comp_status,
+			err);
+
+	up(&aq->avail_cmds);
+
+	return err;
+}
+
+/**
+ * efa_com_admin_destroy - Destroy the admin and the async events queues.
+ * @edev: EFA communication layer struct
+ */
+void efa_com_admin_destroy(struct efa_com_dev *edev)
+{
+	struct efa_com_admin_queue *aq = &edev->aq;
+	struct efa_com_aenq *aenq = &edev->aenq;
+	struct efa_com_admin_cq *cq = &aq->cq;
+	struct efa_com_admin_sq *sq = &aq->sq;
+	u16 size;
+
+	clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state);
+
+	devm_kfree(edev->dmadev, aq->comp_ctx_pool);
+	devm_kfree(edev->dmadev, aq->comp_ctx);
+
+	size = aq->depth * sizeof(*sq->entries);
+	dma_free_coherent(edev->dmadev, size, sq->entries, sq->dma_addr);
+
+	size = aq->depth * sizeof(*cq->entries);
+	dma_free_coherent(edev->dmadev, size, cq->entries, cq->dma_addr);
+
+	size = aenq->depth * sizeof(*aenq->entries);
+	dma_free_coherent(edev->dmadev, size, aenq->entries, aenq->dma_addr);
+}
+
+/**
+ * efa_com_set_admin_polling_mode - Set the admin completion queue polling mode
+ * @edev: EFA communication layer struct
+ * @polling: Enable/Disable polling mode
+ *
+ * Set the admin completion mode.
+ */
+void efa_com_set_admin_polling_mode(struct efa_com_dev *edev, bool polling)
+{
+	u32 mask_value = 0;
+
+	if (polling)
+		mask_value = EFA_REGS_ADMIN_INTR_MASK;
+
+	writel(mask_value, edev->reg_bar + EFA_REGS_INTR_MASK_OFF);
+	if (polling)
+		set_bit(EFA_AQ_STATE_POLLING_BIT, &edev->aq.state);
+	else
+		clear_bit(EFA_AQ_STATE_POLLING_BIT, &edev->aq.state);
+}
+
+static void efa_com_stats_init(struct efa_com_dev *edev)
+{
+	atomic64_t *s = (atomic64_t *)&edev->aq.stats;
+	int i;
+
+	for (i = 0; i < sizeof(edev->aq.stats) / sizeof(*s); i++, s++)
+		atomic64_set(s, 0);
+}
+
+/**
+ * efa_com_admin_init - Init the admin and the async queues
+ * @edev: EFA communication layer struct
+ * @aenq_handlers: The handlers to be called upon AENQ events.
+ *
+ * Initialize the admin submission and completion queues.
+ * Initialize the asynchronous events notification queues.
+ *
+ * @return - 0 on success, negative value on failure.
+ */
+int efa_com_admin_init(struct efa_com_dev *edev,
+		       struct efa_aenq_handlers *aenq_handlers)
+{
+	struct efa_com_admin_queue *aq = &edev->aq;
+	u32 timeout;
+	u32 dev_sts;
+	u32 cap;
+	int err;
+
+	dev_sts = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF);
+	if (!(dev_sts & EFA_REGS_DEV_STS_READY_MASK)) {
+		ibdev_err(edev->efa_dev,
+			  "Device isn't ready, abort com init %#x\n", dev_sts);
+		return -ENODEV;
+	}
+
+	aq->depth = EFA_ADMIN_QUEUE_DEPTH;
+
+	aq->dmadev = edev->dmadev;
+	aq->efa_dev = edev->efa_dev;
+	set_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state);
+
+	sema_init(&aq->avail_cmds, aq->depth);
+
+	efa_com_stats_init(edev);
+
+	err = efa_com_init_comp_ctxt(aq);
+	if (err)
+		return err;
+
+	err = efa_com_admin_init_sq(edev);
+	if (err)
+		goto err_destroy_comp_ctxt;
+
+	err = efa_com_admin_init_cq(edev);
+	if (err)
+		goto err_destroy_sq;
+
+	efa_com_set_admin_polling_mode(edev, false);
+
+	err = efa_com_admin_init_aenq(edev, aenq_handlers);
+	if (err)
+		goto err_destroy_cq;
+
+	cap = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF);
+	timeout = (cap & EFA_REGS_CAPS_ADMIN_CMD_TO_MASK) >>
+		  EFA_REGS_CAPS_ADMIN_CMD_TO_SHIFT;
+	if (timeout)
+		/* the resolution of timeout reg is 100ms */
+		aq->completion_timeout = timeout * 100000;
+	else
+		aq->completion_timeout = ADMIN_CMD_TIMEOUT_US;
+
+	aq->poll_interval = EFA_POLL_INTERVAL_MS;
+
+	set_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state);
+
+	return 0;
+
+err_destroy_cq:
+	dma_free_coherent(edev->dmadev, aq->depth * sizeof(*aq->cq.entries),
+			  aq->cq.entries, aq->cq.dma_addr);
+err_destroy_sq:
+	dma_free_coherent(edev->dmadev, aq->depth * sizeof(*aq->sq.entries),
+			  aq->sq.entries, aq->sq.dma_addr);
+err_destroy_comp_ctxt:
+	devm_kfree(edev->dmadev, aq->comp_ctx);
+
+	return err;
+}
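As a worked example of the timeout computation above: a CAPS admin-command-timeout field of 5 yields 5 * 100 ms = 500,000 us, while a zero field falls back to the ADMIN_CMD_TIMEOUT_US default.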
+
+/**
+ * efa_com_admin_q_comp_intr_handler - admin queue interrupt handler
+ * @edev: EFA communication layer struct
+ *
+ * This method goes over the admin completion queue and wakes up
+ * all the pending threads that wait on the commands wait event.
+ *
+ * @note: Should be called after MSI-X interrupt.
+ */
+void efa_com_admin_q_comp_intr_handler(struct efa_com_dev *edev)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&edev->aq.cq.lock, flags);
+	efa_com_handle_admin_completion(&edev->aq);
+	spin_unlock_irqrestore(&edev->aq.cq.lock, flags);
+}
+
+/*
+ * efa_com_get_specific_aenq_cb:
+ * Return the handler that is relevant to the specific event group.
+ */
+static efa_aenq_handler efa_com_get_specific_aenq_cb(struct efa_com_dev *edev,
+						     u16 group)
+{
+	struct efa_aenq_handlers *aenq_handlers = edev->aenq.aenq_handlers;
+
+	if (group < EFA_MAX_HANDLERS && aenq_handlers->handlers[group])
+		return aenq_handlers->handlers[group];
+
+	return aenq_handlers->unimplemented_handler;
+}
+
+/**
+ * efa_com_aenq_intr_handler - AENQ interrupt handler
+ * @edev: EFA communication layer struct
+ * @data: Data of interrupt handler.
+ *
+ * Go over the async event notification queue and call the proper aenq handler.
+ */
+void efa_com_aenq_intr_handler(struct efa_com_dev *edev, void *data)
+{
+	struct efa_admin_aenq_common_desc *aenq_common;
+	struct efa_com_aenq *aenq = &edev->aenq;
+	struct efa_admin_aenq_entry *aenq_e;
+	efa_aenq_handler handler_cb;
+	u32 processed = 0;
+	u8 phase;
+	u32 ci;
+
+	ci = aenq->cc & (aenq->depth - 1);
+	phase = aenq->phase;
+	aenq_e = &aenq->entries[ci]; /* Get first entry */
+	aenq_common = &aenq_e->aenq_common_desc;
+
+	/* Go over all the events */
+	while ((READ_ONCE(aenq_common->flags) &
+		EFA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK) == phase) {
+		/*
+		 * Do not read the rest of the completion entry before the
+		 * phase bit was validated
+		 */
+		dma_rmb();
+
+		/* Handle specific event */
+		handler_cb = efa_com_get_specific_aenq_cb(edev,
+							  aenq_common->group);
+		handler_cb(data, aenq_e); /* call the actual event handler */
+
+		/* Get next event entry */
+		ci++;
+		processed++;
+
+		if (ci == aenq->depth) {
+			ci = 0;
+			phase = !phase;
+		}
+		aenq_e = &aenq->entries[ci];
+		aenq_common = &aenq_e->aenq_common_desc;
+	}
+
+	aenq->cc += processed;
+	aenq->phase = phase;
+
+	/* Don't update aenq doorbell if there weren't any processed events */
+	if (!processed)
+		return;
+
+	/* barrier not needed in case of writel */
+	writel(aenq->cc, edev->reg_bar + EFA_REGS_AENQ_CONS_DB_OFF);
+}
+
+static void efa_com_mmio_reg_read_resp_addr_init(struct efa_com_dev *edev)
+{
+	struct efa_com_mmio_read *mmio_read = &edev->mmio_read;
+	u32 addr_high;
+	u32 addr_low;
+
+	/* dma_addr_bits is unknown at this point */
+	addr_high = (mmio_read->read_resp_dma_addr >> 32) & GENMASK(31, 0);
+	addr_low = mmio_read->read_resp_dma_addr & GENMASK(31, 0);
+
+	writel(addr_high, edev->reg_bar + EFA_REGS_MMIO_RESP_HI_OFF);
+	writel(addr_low, edev->reg_bar + EFA_REGS_MMIO_RESP_LO_OFF);
+}
+
+int efa_com_mmio_reg_read_init(struct efa_com_dev *edev)
+{
+	struct efa_com_mmio_read *mmio_read = &edev->mmio_read;
+
+	spin_lock_init(&mmio_read->lock);
+	mmio_read->read_resp =
+		dma_alloc_coherent(edev->dmadev, sizeof(*mmio_read->read_resp),
+				   &mmio_read->read_resp_dma_addr, GFP_KERNEL);
+	if (!mmio_read->read_resp)
+		return -ENOMEM;
+
+	efa_com_mmio_reg_read_resp_addr_init(edev);
+
+	mmio_read->read_resp->req_id = 0;
+	mmio_read->seq_num = 0;
+	mmio_read->mmio_read_timeout = EFA_REG_READ_TIMEOUT_US;
+
+	return 0;
+}
+
+void efa_com_mmio_reg_read_destroy(struct efa_com_dev *edev)
+{
+	struct efa_com_mmio_read *mmio_read = &edev->mmio_read;
+
+	dma_free_coherent(edev->dmadev, sizeof(*mmio_read->read_resp),
+			  mmio_read->read_resp, mmio_read->read_resp_dma_addr);
+}
+
+int efa_com_validate_version(struct efa_com_dev *edev)
+{
+	u32 ctrl_ver_masked;
+	u32 ctrl_ver;
+	u32 ver;
+
+	/*
+	 * Make sure the EFA version and the controller version are at least
+	 * the versions the driver expects
+	 */
+	ver = efa_com_reg_read32(edev, EFA_REGS_VERSION_OFF);
+	ctrl_ver = efa_com_reg_read32(edev,
+				      EFA_REGS_CONTROLLER_VERSION_OFF);
+
+	ibdev_dbg(edev->efa_dev, "efa device version: %d.%d\n",
+		  (ver & EFA_REGS_VERSION_MAJOR_VERSION_MASK) >>
+			  EFA_REGS_VERSION_MAJOR_VERSION_SHIFT,
+		  ver & EFA_REGS_VERSION_MINOR_VERSION_MASK);
+
+	if (ver < MIN_EFA_VER) {
+		ibdev_err(edev->efa_dev,
+			  "EFA version is lower than the minimal version the driver supports\n");
+		return -EOPNOTSUPP;
+	}
+
+	ibdev_dbg(edev->efa_dev,
+		  "efa controller version: %d.%d.%d implementation version %d\n",
+		  (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) >>
+			  EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT,
+		  (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) >>
+			  EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT,
+		  (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK),
+		  (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK) >>
+			  EFA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT);
+
+	ctrl_ver_masked =
+		(ctrl_ver & EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) |
+		(ctrl_ver & EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) |
+		(ctrl_ver & EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK);
+
+	/* Validate the ctrl version without the implementation ID */
+	if (ctrl_ver_masked < MIN_EFA_CTRL_VER) {
+		ibdev_err(edev->efa_dev,
+			  "EFA ctrl version is lower than the minimal ctrl version the driver supports\n");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+/**
+ * efa_com_get_dma_width - Retrieve physical dma address width the device
+ * supports.
+ * @edev: EFA communication layer struct
+ *
+ * Retrieve the maximum physical address bits the device can handle.
+ *
+ * @return: > 0 on success and negative value otherwise.
+ */
+int efa_com_get_dma_width(struct efa_com_dev *edev)
+{
+	u32 caps = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF);
+	int width;
+
+	width = (caps & EFA_REGS_CAPS_DMA_ADDR_WIDTH_MASK) >>
+		EFA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT;
+
+	ibdev_dbg(edev->efa_dev, "DMA width: %d\n", width);
+
+	if (width < 32 || width > 64) {
+		ibdev_err(edev->efa_dev, "DMA width illegal value: %d\n", width);
+		return -EINVAL;
+	}
+
+	edev->dma_addr_bits = width;
+
+	return width;
+}
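A hedged usage sketch (the caller is not part of this hunk, and the helper name below is hypothetical): the width returned above would typically be fed into the standard kernel DMA mask helpers.

#include <linux/dma-mapping.h>
#include <linux/pci.h>

static int efa_set_dma_mask_sketch(struct pci_dev *pdev, int dma_width)
{
	/* dma_set_mask_and_coherent() and DMA_BIT_MASK() are standard kernel APIs */
	return dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(dma_width));
}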
+
+static int wait_for_reset_state(struct efa_com_dev *edev, u32 timeout,
+				u16 exp_state)
+{
+	u32 val, i;
+
+	for (i = 0; i < timeout; i++) {
+		val = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF);
+
+		if ((val & EFA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK) ==
+		    exp_state)
+			return 0;
+
+		ibdev_dbg(edev->efa_dev, "Reset indication val %d\n", val);
+		msleep(EFA_POLL_INTERVAL_MS);
+	}
+
+	return -ETIME;
+}
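For example, a CAPS reset-timeout field of 10 lets wait_for_reset_state() poll the DEV_STS register up to 10 times, sleeping EFA_POLL_INTERVAL_MS between reads, before giving up with -ETIME.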
+
+/**
+ * efa_com_dev_reset - Perform an FLR (function level reset) on the device.
+ * @edev: EFA communication layer struct
+ * @reset_reason: Specifies what triggered the reset, in case of an error.
+ *
+ * @return - 0 on success, negative value on failure.
+ */
+int efa_com_dev_reset(struct efa_com_dev *edev,
+		      enum efa_regs_reset_reason_types reset_reason)
+{
+	u32 stat, timeout, cap, reset_val;
+	int err;
+
+	stat = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF);
+	cap = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF);
+
+	if (!(stat & EFA_REGS_DEV_STS_READY_MASK)) {
+		ibdev_err(edev->efa_dev,
+			  "Device isn't ready, can't reset device\n");
+		return -EINVAL;
+	}
+
+	timeout = (cap & EFA_REGS_CAPS_RESET_TIMEOUT_MASK) >>
+		  EFA_REGS_CAPS_RESET_TIMEOUT_SHIFT;
+	if (!timeout) {
+		ibdev_err(edev->efa_dev, "Invalid timeout value\n");
+		return -EINVAL;
+	}
+
+	/* start reset */
+	reset_val = EFA_REGS_DEV_CTL_DEV_RESET_MASK;
+	reset_val |= (reset_reason << EFA_REGS_DEV_CTL_RESET_REASON_SHIFT) &
+		     EFA_REGS_DEV_CTL_RESET_REASON_MASK;
+	writel(reset_val, edev->reg_bar + EFA_REGS_DEV_CTL_OFF);
+
+	/* reset clears the mmio readless address, restore it */
+	efa_com_mmio_reg_read_resp_addr_init(edev);
+
+	err = wait_for_reset_state(edev, timeout,
+				   EFA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK);
+	if (err) {
+		ibdev_err(edev->efa_dev, "Reset indication didn't turn on\n");
+		return err;
+	}
+
+	/* reset done */
+	writel(0, edev->reg_bar + EFA_REGS_DEV_CTL_OFF);
+	err = wait_for_reset_state(edev, timeout, 0);
+	if (err) {
+		ibdev_err(edev->efa_dev, "Reset indication didn't turn off\n");
+		return err;
+	}
+
+	timeout = (cap & EFA_REGS_CAPS_ADMIN_CMD_TO_MASK) >>
+		  EFA_REGS_CAPS_ADMIN_CMD_TO_SHIFT;
+	if (timeout)
+		/* the resolution of timeout reg is 100ms */
+		edev->aq.completion_timeout = timeout * 100000;
+	else
+		edev->aq.completion_timeout = ADMIN_CMD_TIMEOUT_US;
+
+	return 0;
+}
diff --git a/drivers/infiniband/hw/efa/efa_com.h b/drivers/infiniband/hw/efa/efa_com.h
new file mode 100644
index 0000000..c67dd81
--- /dev/null
+++ b/drivers/infiniband/hw/efa/efa_com.h
@@ -0,0 +1,143 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef _EFA_COM_H_
+#define _EFA_COM_H_
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/semaphore.h>
+#include <linux/sched.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "efa_common_defs.h"
+#include "efa_admin_defs.h"
+#include "efa_admin_cmds_defs.h"
+#include "efa_regs_defs.h"
+
+#define EFA_MAX_HANDLERS 256
+
+struct efa_com_admin_cq {
+	struct efa_admin_acq_entry *entries;
+	dma_addr_t dma_addr;
+	spinlock_t lock; /* Protects ACQ */
+
+	u16 cc; /* consumer counter */
+	u8 phase;
+};
+
+struct efa_com_admin_sq {
+	struct efa_admin_aq_entry *entries;
+	dma_addr_t dma_addr;
+	spinlock_t lock; /* Protects ASQ */
+
+	u32 __iomem *db_addr;
+
+	u16 cc; /* consumer counter */
+	u16 pc; /* producer counter */
+	u8 phase;
+
+};
+
+/* Don't use anything other than atomic64 */
+struct efa_com_stats_admin {
+	atomic64_t submitted_cmd;
+	atomic64_t completed_cmd;
+	atomic64_t no_completion;
+};
+
+enum {
+	EFA_AQ_STATE_RUNNING_BIT = 0,
+	EFA_AQ_STATE_POLLING_BIT = 1,
+};
+
+struct efa_com_admin_queue {
+	void *dmadev;
+	void *efa_dev;
+	struct efa_comp_ctx *comp_ctx;
+	u32 completion_timeout; /* usecs */
+	u16 poll_interval; /* msecs */
+	u16 depth;
+	struct efa_com_admin_cq cq;
+	struct efa_com_admin_sq sq;
+	u16 msix_vector_idx;
+
+	unsigned long state;
+
+	/* Count the number of available admin commands */
+	struct semaphore avail_cmds;
+
+	struct efa_com_stats_admin stats;
+
+	spinlock_t comp_ctx_lock; /* Protects completion context pool */
+	u32 *comp_ctx_pool;
+	u16 comp_ctx_pool_next;
+};
+
+struct efa_aenq_handlers;
+
+struct efa_com_aenq {
+	struct efa_admin_aenq_entry *entries;
+	struct efa_aenq_handlers *aenq_handlers;
+	dma_addr_t dma_addr;
+	u32 cc; /* consumer counter */
+	u16 msix_vector_idx;
+	u16 depth;
+	u8 phase;
+};
+
+struct efa_com_mmio_read {
+	struct efa_admin_mmio_req_read_less_resp *read_resp;
+	dma_addr_t read_resp_dma_addr;
+	u16 seq_num;
+	u16 mmio_read_timeout; /* usecs */
+	/* serializes mmio reads */
+	spinlock_t lock;
+};
+
+struct efa_com_dev {
+	struct efa_com_admin_queue aq;
+	struct efa_com_aenq aenq;
+	u8 __iomem *reg_bar;
+	void *dmadev;
+	void *efa_dev;
+	u32 supported_features;
+	u32 dma_addr_bits;
+
+	struct efa_com_mmio_read mmio_read;
+};
+
+typedef void (*efa_aenq_handler)(void *data,
+	      struct efa_admin_aenq_entry *aenq_e);
+
+/* Holds aenq handlers. Indexed by AENQ event group */
+struct efa_aenq_handlers {
+	efa_aenq_handler handlers[EFA_MAX_HANDLERS];
+	efa_aenq_handler unimplemented_handler;
+};
+
+int efa_com_admin_init(struct efa_com_dev *edev,
+		       struct efa_aenq_handlers *aenq_handlers);
+void efa_com_admin_destroy(struct efa_com_dev *edev);
+int efa_com_dev_reset(struct efa_com_dev *edev,
+		      enum efa_regs_reset_reason_types reset_reason);
+void efa_com_set_admin_polling_mode(struct efa_com_dev *edev, bool polling);
+void efa_com_admin_q_comp_intr_handler(struct efa_com_dev *edev);
+int efa_com_mmio_reg_read_init(struct efa_com_dev *edev);
+void efa_com_mmio_reg_read_destroy(struct efa_com_dev *edev);
+
+int efa_com_validate_version(struct efa_com_dev *edev);
+int efa_com_get_dma_width(struct efa_com_dev *edev);
+
+int efa_com_cmd_exec(struct efa_com_admin_queue *aq,
+		     struct efa_admin_aq_entry *cmd,
+		     size_t cmd_size,
+		     struct efa_admin_acq_entry *comp,
+		     size_t comp_size);
+void efa_com_aenq_intr_handler(struct efa_com_dev *edev, void *data);
+
+#endif /* _EFA_COM_H_ */
diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c
new file mode 100644
index 0000000..c079f13
--- /dev/null
+++ b/drivers/infiniband/hw/efa/efa_com_cmd.c
@@ -0,0 +1,765 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#include "efa_com.h"
+#include "efa_com_cmd.h"
+
+void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low)
+{
+	*addr_low = lower_32_bits(addr);
+	*addr_high = upper_32_bits(addr);
+}
+
+int efa_com_create_qp(struct efa_com_dev *edev,
+		      struct efa_com_create_qp_params *params,
+		      struct efa_com_create_qp_result *res)
+{
+	struct efa_admin_create_qp_cmd create_qp_cmd = {};
+	struct efa_admin_create_qp_resp cmd_completion;
+	struct efa_com_admin_queue *aq = &edev->aq;
+	int err;
+
+	create_qp_cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_QP;
+
+	create_qp_cmd.pd = params->pd;
+	create_qp_cmd.qp_type = params->qp_type;
+	create_qp_cmd.rq_base_addr = params->rq_base_addr;
+	create_qp_cmd.send_cq_idx = params->send_cq_idx;
+	create_qp_cmd.recv_cq_idx = params->recv_cq_idx;
+	create_qp_cmd.qp_alloc_size.send_queue_ring_size =
+		params->sq_ring_size_in_bytes;
+	create_qp_cmd.qp_alloc_size.send_queue_depth =
+			params->sq_depth;
+	create_qp_cmd.qp_alloc_size.recv_queue_ring_size =
+			params->rq_ring_size_in_bytes;
+	create_qp_cmd.qp_alloc_size.recv_queue_depth =
+			params->rq_depth;
+	create_qp_cmd.uar = params->uarn;
+
+	err = efa_com_cmd_exec(aq,
+			       (struct efa_admin_aq_entry *)&create_qp_cmd,
+			       sizeof(create_qp_cmd),
+			       (struct efa_admin_acq_entry *)&cmd_completion,
+			       sizeof(cmd_completion));
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to create qp [%d]\n", err);
+		return err;
+	}
+
+	res->qp_handle = cmd_completion.qp_handle;
+	res->qp_num = cmd_completion.qp_num;
+	res->sq_db_offset = cmd_completion.sq_db_offset;
+	res->rq_db_offset = cmd_completion.rq_db_offset;
+	res->llq_descriptors_offset = cmd_completion.llq_descriptors_offset;
+	res->send_sub_cq_idx = cmd_completion.send_sub_cq_idx;
+	res->recv_sub_cq_idx = cmd_completion.recv_sub_cq_idx;
+
+	return 0;
+}
+
+int efa_com_modify_qp(struct efa_com_dev *edev,
+		      struct efa_com_modify_qp_params *params)
+{
+	struct efa_com_admin_queue *aq = &edev->aq;
+	struct efa_admin_modify_qp_cmd cmd = {};
+	struct efa_admin_modify_qp_resp resp;
+	int err;
+
+	cmd.aq_common_desc.opcode = EFA_ADMIN_MODIFY_QP;
+	cmd.modify_mask = params->modify_mask;
+	cmd.qp_handle = params->qp_handle;
+	cmd.qp_state = params->qp_state;
+	cmd.cur_qp_state = params->cur_qp_state;
+	cmd.qkey = params->qkey;
+	cmd.sq_psn = params->sq_psn;
+	cmd.sq_drained_async_notify = params->sq_drained_async_notify;
+
+	err = efa_com_cmd_exec(aq,
+			       (struct efa_admin_aq_entry *)&cmd,
+			       sizeof(cmd),
+			       (struct efa_admin_acq_entry *)&resp,
+			       sizeof(resp));
+	if (err) {
+		ibdev_err_ratelimited(
+			edev->efa_dev,
+			"Failed to modify qp-%u modify_mask[%#x] [%d]\n",
+			cmd.qp_handle, cmd.modify_mask, err);
+		return err;
+	}
+
+	return 0;
+}
+
+int efa_com_query_qp(struct efa_com_dev *edev,
+		     struct efa_com_query_qp_params *params,
+		     struct efa_com_query_qp_result *result)
+{
+	struct efa_com_admin_queue *aq = &edev->aq;
+	struct efa_admin_query_qp_cmd cmd = {};
+	struct efa_admin_query_qp_resp resp;
+	int err;
+
+	cmd.aq_common_desc.opcode = EFA_ADMIN_QUERY_QP;
+	cmd.qp_handle = params->qp_handle;
+
+	err = efa_com_cmd_exec(aq,
+			       (struct efa_admin_aq_entry *)&cmd,
+			       sizeof(cmd),
+			       (struct efa_admin_acq_entry *)&resp,
+			       sizeof(resp));
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to query qp-%u [%d]\n",
+				      cmd.qp_handle, err);
+		return err;
+	}
+
+	result->qp_state = resp.qp_state;
+	result->qkey = resp.qkey;
+	result->sq_draining = resp.sq_draining;
+	result->sq_psn = resp.sq_psn;
+
+	return 0;
+}
+
+int efa_com_destroy_qp(struct efa_com_dev *edev,
+		       struct efa_com_destroy_qp_params *params)
+{
+	struct efa_admin_destroy_qp_resp cmd_completion;
+	struct efa_admin_destroy_qp_cmd qp_cmd = {};
+	struct efa_com_admin_queue *aq = &edev->aq;
+	int err;
+
+	qp_cmd.aq_common_desc.opcode = EFA_ADMIN_DESTROY_QP;
+	qp_cmd.qp_handle = params->qp_handle;
+
+	err = efa_com_cmd_exec(aq,
+			       (struct efa_admin_aq_entry *)&qp_cmd,
+			       sizeof(qp_cmd),
+			       (struct efa_admin_acq_entry *)&cmd_completion,
+			       sizeof(cmd_completion));
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to destroy qp-%u [%d]\n",
+				      qp_cmd.qp_handle, err);
+		return err;
+	}
+
+	return 0;
+}
+
+int efa_com_create_cq(struct efa_com_dev *edev,
+		      struct efa_com_create_cq_params *params,
+		      struct efa_com_create_cq_result *result)
+{
+	struct efa_admin_create_cq_resp cmd_completion;
+	struct efa_admin_create_cq_cmd create_cmd = {};
+	struct efa_com_admin_queue *aq = &edev->aq;
+	int err;
+
+	create_cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_CQ;
+	create_cmd.cq_caps_2 = (params->entry_size_in_bytes / 4) &
+				EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK;
+	create_cmd.cq_depth = params->cq_depth;
+	create_cmd.num_sub_cqs = params->num_sub_cqs;
+	create_cmd.uar = params->uarn;
+
+	efa_com_set_dma_addr(params->dma_addr,
+			     &create_cmd.cq_ba.mem_addr_high,
+			     &create_cmd.cq_ba.mem_addr_low);
+
+	err = efa_com_cmd_exec(aq,
+			       (struct efa_admin_aq_entry *)&create_cmd,
+			       sizeof(create_cmd),
+			       (struct efa_admin_acq_entry *)&cmd_completion,
+			       sizeof(cmd_completion));
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to create cq[%d]\n", err);
+		return err;
+	}
+
+	result->cq_idx = cmd_completion.cq_idx;
+	result->actual_depth = params->cq_depth;
+
+	return 0;
+}
+
+int efa_com_destroy_cq(struct efa_com_dev *edev,
+		       struct efa_com_destroy_cq_params *params)
+{
+	struct efa_admin_destroy_cq_cmd destroy_cmd = {};
+	struct efa_admin_destroy_cq_resp destroy_resp;
+	struct efa_com_admin_queue *aq = &edev->aq;
+	int err;
+
+	destroy_cmd.cq_idx = params->cq_idx;
+	destroy_cmd.aq_common_desc.opcode = EFA_ADMIN_DESTROY_CQ;
+
+	err = efa_com_cmd_exec(aq,
+			       (struct efa_admin_aq_entry *)&destroy_cmd,
+			       sizeof(destroy_cmd),
+			       (struct efa_admin_acq_entry *)&destroy_resp,
+			       sizeof(destroy_resp));
+
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to destroy CQ-%u [%d]\n",
+				      params->cq_idx, err);
+		return err;
+	}
+
+	return 0;
+}
+
+int efa_com_register_mr(struct efa_com_dev *edev,
+			struct efa_com_reg_mr_params *params,
+			struct efa_com_reg_mr_result *result)
+{
+	struct efa_admin_reg_mr_resp cmd_completion;
+	struct efa_com_admin_queue *aq = &edev->aq;
+	struct efa_admin_reg_mr_cmd mr_cmd = {};
+	int err;
+
+	mr_cmd.aq_common_desc.opcode = EFA_ADMIN_REG_MR;
+	mr_cmd.pd = params->pd;
+	mr_cmd.mr_length = params->mr_length_in_bytes;
+	mr_cmd.flags |= params->page_shift &
+		EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT_MASK;
+	mr_cmd.iova = params->iova;
+	mr_cmd.permissions |= params->permissions &
+			      EFA_ADMIN_REG_MR_CMD_LOCAL_WRITE_ENABLE_MASK;
+
+	if (params->inline_pbl) {
+		memcpy(mr_cmd.pbl.inline_pbl_array,
+		       params->pbl.inline_pbl_array,
+		       sizeof(mr_cmd.pbl.inline_pbl_array));
+	} else {
+		mr_cmd.pbl.pbl.length = params->pbl.pbl.length;
+		mr_cmd.pbl.pbl.address.mem_addr_low =
+			params->pbl.pbl.address.mem_addr_low;
+		mr_cmd.pbl.pbl.address.mem_addr_high =
+			params->pbl.pbl.address.mem_addr_high;
+		mr_cmd.aq_common_desc.flags |=
+			EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK;
+		if (params->indirect)
+			mr_cmd.aq_common_desc.flags |=
+				EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK;
+	}
+
+	err = efa_com_cmd_exec(aq,
+			       (struct efa_admin_aq_entry *)&mr_cmd,
+			       sizeof(mr_cmd),
+			       (struct efa_admin_acq_entry *)&cmd_completion,
+			       sizeof(cmd_completion));
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to register mr [%d]\n", err);
+		return err;
+	}
+
+	result->l_key = cmd_completion.l_key;
+	result->r_key = cmd_completion.r_key;
+
+	return 0;
+}
+
+int efa_com_dereg_mr(struct efa_com_dev *edev,
+		     struct efa_com_dereg_mr_params *params)
+{
+	struct efa_admin_dereg_mr_resp cmd_completion;
+	struct efa_com_admin_queue *aq = &edev->aq;
+	struct efa_admin_dereg_mr_cmd mr_cmd = {};
+	int err;
+
+	mr_cmd.aq_common_desc.opcode = EFA_ADMIN_DEREG_MR;
+	mr_cmd.l_key = params->l_key;
+
+	err = efa_com_cmd_exec(aq,
+			       (struct efa_admin_aq_entry *)&mr_cmd,
+			       sizeof(mr_cmd),
+			       (struct efa_admin_acq_entry *)&cmd_completion,
+			       sizeof(cmd_completion));
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to de-register mr(lkey-%u) [%d]\n",
+				      mr_cmd.l_key, err);
+		return err;
+	}
+
+	return 0;
+}
+
+int efa_com_create_ah(struct efa_com_dev *edev,
+		      struct efa_com_create_ah_params *params,
+		      struct efa_com_create_ah_result *result)
+{
+	struct efa_admin_create_ah_resp cmd_completion;
+	struct efa_com_admin_queue *aq = &edev->aq;
+	struct efa_admin_create_ah_cmd ah_cmd = {};
+	int err;
+
+	ah_cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_AH;
+
+	memcpy(ah_cmd.dest_addr, params->dest_addr, sizeof(ah_cmd.dest_addr));
+	ah_cmd.pd = params->pdn;
+
+	err = efa_com_cmd_exec(aq,
+			       (struct efa_admin_aq_entry *)&ah_cmd,
+			       sizeof(ah_cmd),
+			       (struct efa_admin_acq_entry *)&cmd_completion,
+			       sizeof(cmd_completion));
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to create ah for %pI6 [%d]\n",
+				      ah_cmd.dest_addr, err);
+		return err;
+	}
+
+	result->ah = cmd_completion.ah;
+
+	return 0;
+}
+
+int efa_com_destroy_ah(struct efa_com_dev *edev,
+		       struct efa_com_destroy_ah_params *params)
+{
+	struct efa_admin_destroy_ah_resp cmd_completion;
+	struct efa_admin_destroy_ah_cmd ah_cmd = {};
+	struct efa_com_admin_queue *aq = &edev->aq;
+	int err;
+
+	ah_cmd.aq_common_desc.opcode = EFA_ADMIN_DESTROY_AH;
+	ah_cmd.ah = params->ah;
+	ah_cmd.pd = params->pdn;
+
+	err = efa_com_cmd_exec(aq,
+			       (struct efa_admin_aq_entry *)&ah_cmd,
+			       sizeof(ah_cmd),
+			       (struct efa_admin_acq_entry *)&cmd_completion,
+			       sizeof(cmd_completion));
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to destroy ah-%d pd-%d [%d]\n",
+				      ah_cmd.ah, ah_cmd.pd, err);
+		return err;
+	}
+
+	return 0;
+}
+
+static bool
+efa_com_check_supported_feature_id(struct efa_com_dev *edev,
+				   enum efa_admin_aq_feature_id feature_id)
+{
+	u32 feature_mask = 1 << feature_id;
+
+	/* Device attributes are always supported */
+	if (feature_id != EFA_ADMIN_DEVICE_ATTR &&
+	    !(edev->supported_features & feature_mask))
+		return false;
+
+	return true;
+}
+
+static int efa_com_get_feature_ex(struct efa_com_dev *edev,
+				  struct efa_admin_get_feature_resp *get_resp,
+				  enum efa_admin_aq_feature_id feature_id,
+				  dma_addr_t control_buf_dma_addr,
+				  u32 control_buff_size)
+{
+	struct efa_admin_get_feature_cmd get_cmd = {};
+	struct efa_com_admin_queue *aq;
+	int err;
+
+	if (!efa_com_check_supported_feature_id(edev, feature_id)) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Feature %d isn't supported\n",
+				      feature_id);
+		return -EOPNOTSUPP;
+	}
+
+	aq = &edev->aq;
+
+	get_cmd.aq_common_descriptor.opcode = EFA_ADMIN_GET_FEATURE;
+
+	if (control_buff_size)
+		get_cmd.aq_common_descriptor.flags =
+			EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK;
+
+	efa_com_set_dma_addr(control_buf_dma_addr,
+			     &get_cmd.control_buffer.address.mem_addr_high,
+			     &get_cmd.control_buffer.address.mem_addr_low);
+
+	get_cmd.control_buffer.length = control_buff_size;
+	get_cmd.feature_common.feature_id = feature_id;
+	err = efa_com_cmd_exec(aq,
+			       (struct efa_admin_aq_entry *)&get_cmd,
+			       sizeof(get_cmd),
+			       (struct efa_admin_acq_entry *)get_resp,
+			       sizeof(*get_resp));
+
+	if (err) {
+		ibdev_err_ratelimited(
+			edev->efa_dev,
+			"Failed to submit get_feature command %d [%d]\n",
+			feature_id, err);
+		return err;
+	}
+
+	return 0;
+}
+
+static int efa_com_get_feature(struct efa_com_dev *edev,
+			       struct efa_admin_get_feature_resp *get_resp,
+			       enum efa_admin_aq_feature_id feature_id)
+{
+	return efa_com_get_feature_ex(edev, get_resp, feature_id, 0, 0);
+}
+
+int efa_com_get_network_attr(struct efa_com_dev *edev,
+			     struct efa_com_get_network_attr_result *result)
+{
+	struct efa_admin_get_feature_resp resp;
+	int err;
+
+	err = efa_com_get_feature(edev, &resp,
+				  EFA_ADMIN_NETWORK_ATTR);
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to get network attributes %d\n",
+				      err);
+		return err;
+	}
+
+	memcpy(result->addr, resp.u.network_attr.addr,
+	       sizeof(resp.u.network_attr.addr));
+	result->mtu = resp.u.network_attr.mtu;
+
+	return 0;
+}
+
+int efa_com_get_device_attr(struct efa_com_dev *edev,
+			    struct efa_com_get_device_attr_result *result)
+{
+	struct efa_admin_get_feature_resp resp;
+	int err;
+
+	err = efa_com_get_feature(edev, &resp, EFA_ADMIN_DEVICE_ATTR);
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to get device attributes %d\n",
+				      err);
+		return err;
+	}
+
+	result->page_size_cap = resp.u.device_attr.page_size_cap;
+	result->fw_version = resp.u.device_attr.fw_version;
+	result->admin_api_version = resp.u.device_attr.admin_api_version;
+	result->device_version = resp.u.device_attr.device_version;
+	result->supported_features = resp.u.device_attr.supported_features;
+	result->phys_addr_width = resp.u.device_attr.phys_addr_width;
+	result->virt_addr_width = resp.u.device_attr.virt_addr_width;
+	result->db_bar = resp.u.device_attr.db_bar;
+
+	if (result->admin_api_version < 1) {
+		ibdev_err_ratelimited(
+			edev->efa_dev,
+			"Failed to get device attr api version [%u < 1]\n",
+			result->admin_api_version);
+		return -EINVAL;
+	}
+
+	edev->supported_features = resp.u.device_attr.supported_features;
+	err = efa_com_get_feature(edev, &resp,
+				  EFA_ADMIN_QUEUE_ATTR);
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to get queue attributes %d\n",
+				      err);
+		return err;
+	}
+
+	result->max_qp = resp.u.queue_attr.max_qp;
+	result->max_sq_depth = resp.u.queue_attr.max_sq_depth;
+	result->max_rq_depth = resp.u.queue_attr.max_rq_depth;
+	result->max_cq = resp.u.queue_attr.max_cq;
+	result->max_cq_depth = resp.u.queue_attr.max_cq_depth;
+	result->inline_buf_size = resp.u.queue_attr.inline_buf_size;
+	result->max_sq_sge = resp.u.queue_attr.max_wr_send_sges;
+	result->max_rq_sge = resp.u.queue_attr.max_wr_recv_sges;
+	result->max_mr = resp.u.queue_attr.max_mr;
+	result->max_mr_pages = resp.u.queue_attr.max_mr_pages;
+	result->max_pd = resp.u.queue_attr.max_pd;
+	result->max_ah = resp.u.queue_attr.max_ah;
+	result->max_llq_size = resp.u.queue_attr.max_llq_size;
+	result->sub_cqs_per_cq = resp.u.queue_attr.sub_cqs_per_cq;
+
+	return 0;
+}
+
+int efa_com_get_hw_hints(struct efa_com_dev *edev,
+			 struct efa_com_get_hw_hints_result *result)
+{
+	struct efa_admin_get_feature_resp resp;
+	int err;
+
+	err = efa_com_get_feature(edev, &resp, EFA_ADMIN_HW_HINTS);
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to get hw hints %d\n", err);
+		return err;
+	}
+
+	result->admin_completion_timeout = resp.u.hw_hints.admin_completion_timeout;
+	result->driver_watchdog_timeout = resp.u.hw_hints.driver_watchdog_timeout;
+	result->mmio_read_timeout = resp.u.hw_hints.mmio_read_timeout;
+	result->poll_interval = resp.u.hw_hints.poll_interval;
+
+	return 0;
+}
+
+static int efa_com_set_feature_ex(struct efa_com_dev *edev,
+				  struct efa_admin_set_feature_resp *set_resp,
+				  struct efa_admin_set_feature_cmd *set_cmd,
+				  enum efa_admin_aq_feature_id feature_id,
+				  dma_addr_t control_buf_dma_addr,
+				  u32 control_buff_size)
+{
+	struct efa_com_admin_queue *aq;
+	int err;
+
+	if (!efa_com_check_supported_feature_id(edev, feature_id)) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Feature %d isn't supported\n",
+				      feature_id);
+		return -EOPNOTSUPP;
+	}
+
+	aq = &edev->aq;
+
+	set_cmd->aq_common_descriptor.opcode = EFA_ADMIN_SET_FEATURE;
+	if (control_buff_size) {
+		set_cmd->aq_common_descriptor.flags =
+			EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK;
+		efa_com_set_dma_addr(control_buf_dma_addr,
+				     &set_cmd->control_buffer.address.mem_addr_high,
+				     &set_cmd->control_buffer.address.mem_addr_low);
+	}
+
+	set_cmd->control_buffer.length = control_buff_size;
+	set_cmd->feature_common.feature_id = feature_id;
+	err = efa_com_cmd_exec(aq,
+			       (struct efa_admin_aq_entry *)set_cmd,
+			       sizeof(*set_cmd),
+			       (struct efa_admin_acq_entry *)set_resp,
+			       sizeof(*set_resp));
+
+	if (err) {
+		ibdev_err_ratelimited(
+			edev->efa_dev,
+			"Failed to submit set_feature command %d error: %d\n",
+			feature_id, err);
+		return err;
+	}
+
+	return 0;
+}
+
+static int efa_com_set_feature(struct efa_com_dev *edev,
+			       struct efa_admin_set_feature_resp *set_resp,
+			       struct efa_admin_set_feature_cmd *set_cmd,
+			       enum efa_admin_aq_feature_id feature_id)
+{
+	return efa_com_set_feature_ex(edev, set_resp, set_cmd, feature_id,
+				      0, 0);
+}
+
+int efa_com_set_aenq_config(struct efa_com_dev *edev, u32 groups)
+{
+	struct efa_admin_get_feature_resp get_resp;
+	struct efa_admin_set_feature_resp set_resp;
+	struct efa_admin_set_feature_cmd cmd = {};
+	int err;
+
+	ibdev_dbg(edev->efa_dev, "Configuring aenq with groups[%#x]\n", groups);
+
+	err = efa_com_get_feature(edev, &get_resp, EFA_ADMIN_AENQ_CONFIG);
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to get aenq attributes: %d\n",
+				      err);
+		return err;
+	}
+
+	ibdev_dbg(edev->efa_dev,
+		  "Get aenq groups: supported[%#x] enabled[%#x]\n",
+		  get_resp.u.aenq.supported_groups,
+		  get_resp.u.aenq.enabled_groups);
+
+	if ((get_resp.u.aenq.supported_groups & groups) != groups) {
+		ibdev_err_ratelimited(
+			edev->efa_dev,
+			"Trying to set unsupported aenq groups[%#x] supported[%#x]\n",
+			groups, get_resp.u.aenq.supported_groups);
+		return -EOPNOTSUPP;
+	}
+
+	cmd.u.aenq.enabled_groups = groups;
+	err = efa_com_set_feature(edev, &set_resp, &cmd,
+				  EFA_ADMIN_AENQ_CONFIG);
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to set aenq attributes: %d\n",
+				      err);
+		return err;
+	}
+
+	return 0;
+}
+
+int efa_com_alloc_pd(struct efa_com_dev *edev,
+		     struct efa_com_alloc_pd_result *result)
+{
+	struct efa_com_admin_queue *aq = &edev->aq;
+	struct efa_admin_alloc_pd_cmd cmd = {};
+	struct efa_admin_alloc_pd_resp resp;
+	int err;
+
+	cmd.aq_common_descriptor.opcode = EFA_ADMIN_ALLOC_PD;
+
+	err = efa_com_cmd_exec(aq,
+			       (struct efa_admin_aq_entry *)&cmd,
+			       sizeof(cmd),
+			       (struct efa_admin_acq_entry *)&resp,
+			       sizeof(resp));
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to allocate pd[%d]\n", err);
+		return err;
+	}
+
+	result->pdn = resp.pd;
+
+	return 0;
+}
+
+int efa_com_dealloc_pd(struct efa_com_dev *edev,
+		       struct efa_com_dealloc_pd_params *params)
+{
+	struct efa_com_admin_queue *aq = &edev->aq;
+	struct efa_admin_dealloc_pd_cmd cmd = {};
+	struct efa_admin_dealloc_pd_resp resp;
+	int err;
+
+	cmd.aq_common_descriptor.opcode = EFA_ADMIN_DEALLOC_PD;
+	cmd.pd = params->pdn;
+
+	err = efa_com_cmd_exec(aq,
+			       (struct efa_admin_aq_entry *)&cmd,
+			       sizeof(cmd),
+			       (struct efa_admin_acq_entry *)&resp,
+			       sizeof(resp));
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to deallocate pd-%u [%d]\n",
+				      cmd.pd, err);
+		return err;
+	}
+
+	return 0;
+}
+
+int efa_com_alloc_uar(struct efa_com_dev *edev,
+		      struct efa_com_alloc_uar_result *result)
+{
+	struct efa_com_admin_queue *aq = &edev->aq;
+	struct efa_admin_alloc_uar_cmd cmd = {};
+	struct efa_admin_alloc_uar_resp resp;
+	int err;
+
+	cmd.aq_common_descriptor.opcode = EFA_ADMIN_ALLOC_UAR;
+
+	err = efa_com_cmd_exec(aq,
+			       (struct efa_admin_aq_entry *)&cmd,
+			       sizeof(cmd),
+			       (struct efa_admin_acq_entry *)&resp,
+			       sizeof(resp));
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to allocate uar[%d]\n", err);
+		return err;
+	}
+
+	result->uarn = resp.uar;
+
+	return 0;
+}
+
+int efa_com_dealloc_uar(struct efa_com_dev *edev,
+			struct efa_com_dealloc_uar_params *params)
+{
+	struct efa_com_admin_queue *aq = &edev->aq;
+	struct efa_admin_dealloc_uar_cmd cmd = {};
+	struct efa_admin_dealloc_uar_resp resp;
+	int err;
+
+	cmd.aq_common_descriptor.opcode = EFA_ADMIN_DEALLOC_UAR;
+	cmd.uar = params->uarn;
+
+	err = efa_com_cmd_exec(aq,
+			       (struct efa_admin_aq_entry *)&cmd,
+			       sizeof(cmd),
+			       (struct efa_admin_acq_entry *)&resp,
+			       sizeof(resp));
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to deallocate uar-%u [%d]\n",
+				      cmd.uar, err);
+		return err;
+	}
+
+	return 0;
+}
+
+int efa_com_get_stats(struct efa_com_dev *edev,
+		      struct efa_com_get_stats_params *params,
+		      union efa_com_get_stats_result *result)
+{
+	struct efa_com_admin_queue *aq = &edev->aq;
+	struct efa_admin_aq_get_stats_cmd cmd = {};
+	struct efa_admin_acq_get_stats_resp resp;
+	int err;
+
+	cmd.aq_common_descriptor.opcode = EFA_ADMIN_GET_STATS;
+	cmd.type = params->type;
+	cmd.scope = params->scope;
+	cmd.scope_modifier = params->scope_modifier;
+
+	err = efa_com_cmd_exec(aq,
+			       (struct efa_admin_aq_entry *)&cmd,
+			       sizeof(cmd),
+			       (struct efa_admin_acq_entry *)&resp,
+			       sizeof(resp));
+	if (err) {
+		ibdev_err_ratelimited(
+			edev->efa_dev,
+			"Failed to get stats type-%u scope-%u.%u [%d]\n",
+			cmd.type, cmd.scope, cmd.scope_modifier, err);
+		return err;
+	}
+
+	result->basic_stats.tx_bytes = resp.basic_stats.tx_bytes;
+	result->basic_stats.tx_pkts = resp.basic_stats.tx_pkts;
+	result->basic_stats.rx_bytes = resp.basic_stats.rx_bytes;
+	result->basic_stats.rx_pkts = resp.basic_stats.rx_pkts;
+	result->basic_stats.rx_drops = resp.basic_stats.rx_drops;
+
+	return 0;
+}
diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h
new file mode 100644
index 0000000..7f6c130
--- /dev/null
+++ b/drivers/infiniband/hw/efa/efa_com_cmd.h
@@ -0,0 +1,293 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef _EFA_COM_CMD_H_
+#define _EFA_COM_CMD_H_
+
+#include "efa_com.h"
+
+#define EFA_GID_SIZE 16
+
+struct efa_com_create_qp_params {
+	u64 rq_base_addr;
+	u32 send_cq_idx;
+	u32 recv_cq_idx;
+	/*
+	 * Send descriptor ring size in bytes,
+	 * sufficient for user-provided number of WQEs and SGL size
+	 */
+	u32 sq_ring_size_in_bytes;
+	/* Max number of WQEs that will be posted on send queue */
+	u32 sq_depth;
+	/* Recv descriptor ring size in bytes */
+	u32 rq_ring_size_in_bytes;
+	u32 rq_depth;
+	u16 pd;
+	u16 uarn;
+	u8 qp_type;
+};
+
+struct efa_com_create_qp_result {
+	u32 qp_handle;
+	u32 qp_num;
+	u32 sq_db_offset;
+	u32 rq_db_offset;
+	u32 llq_descriptors_offset;
+	u16 send_sub_cq_idx;
+	u16 recv_sub_cq_idx;
+};
+
+struct efa_com_modify_qp_params {
+	u32 modify_mask;
+	u32 qp_handle;
+	u32 qp_state;
+	u32 cur_qp_state;
+	u32 qkey;
+	u32 sq_psn;
+	u8 sq_drained_async_notify;
+};
+
+struct efa_com_query_qp_params {
+	u32 qp_handle;
+};
+
+struct efa_com_query_qp_result {
+	u32 qp_state;
+	u32 qkey;
+	u32 sq_draining;
+	u32 sq_psn;
+};
+
+struct efa_com_destroy_qp_params {
+	u32 qp_handle;
+};
+
+struct efa_com_create_cq_params {
+	/* cq physical base address in OS memory */
+	dma_addr_t dma_addr;
+	/* completion queue depth in # of entries */
+	u16 cq_depth;
+	u16 num_sub_cqs;
+	u16 uarn;
+	u8 entry_size_in_bytes;
+};
+
+struct efa_com_create_cq_result {
+	/* cq identifier */
+	u16 cq_idx;
+	/* actual cq depth in # of entries */
+	u16 actual_depth;
+};
+
+struct efa_com_destroy_cq_params {
+	u16 cq_idx;
+};
+
+struct efa_com_create_ah_params {
+	u16 pdn;
+	/* Destination address in network byte order */
+	u8 dest_addr[EFA_GID_SIZE];
+};
+
+struct efa_com_create_ah_result {
+	u16 ah;
+};
+
+struct efa_com_destroy_ah_params {
+	u16 ah;
+	u16 pdn;
+};
+
+struct efa_com_get_network_attr_result {
+	u8 addr[EFA_GID_SIZE];
+	u32 mtu;
+};
+
+struct efa_com_get_device_attr_result {
+	u64 page_size_cap;
+	u64 max_mr_pages;
+	u32 fw_version;
+	u32 admin_api_version;
+	u32 device_version;
+	u32 supported_features;
+	u32 phys_addr_width;
+	u32 virt_addr_width;
+	u32 max_qp;
+	u32 max_sq_depth; /* wqes */
+	u32 max_rq_depth; /* wqes */
+	u32 max_cq;
+	u32 max_cq_depth; /* cqes */
+	u32 inline_buf_size;
+	u32 max_mr;
+	u32 max_pd;
+	u32 max_ah;
+	u32 max_llq_size;
+	u16 sub_cqs_per_cq;
+	u16 max_sq_sge;
+	u16 max_rq_sge;
+	u8 db_bar;
+};
+
+struct efa_com_get_hw_hints_result {
+	u16 mmio_read_timeout;
+	u16 driver_watchdog_timeout;
+	u16 admin_completion_timeout;
+	u16 poll_interval;
+	u32 reserved[4];
+};
+
+struct efa_com_mem_addr {
+	u32 mem_addr_low;
+	u32 mem_addr_high;
+};
+
+/* Used in indirect mode for chaining page list chunks */
+struct efa_com_ctrl_buff_info {
+	/* indicates length of the buffer pointed to by control_buffer_address. */
+	u32 length;
+	/* points to control buffer (direct or indirect) */
+	struct efa_com_mem_addr address;
+};
+
+struct efa_com_reg_mr_params {
+	/* Memory region length, in bytes. */
+	u64 mr_length_in_bytes;
+	/* IO Virtual Address associated with this MR. */
+	u64 iova;
+	/* words 8:15: Physical Buffer List, each element is page-aligned. */
+	union {
+		/*
+		 * Inline array of physical addresses of app pages
+		 * (optimization for short region reservations)
+		 */
+		u64 inline_pbl_array[4];
+		/*
+		 * Describes the next physically contiguous chunk of indirect
+		 * page list. A page list contains physical addresses of command
+		 * data pages. Data pages are 4KB; page list chunks are
+		 * variable-sized.
+		 */
+		struct efa_com_ctrl_buff_info pbl;
+	} pbl;
+	/* number of pages in PBL (redundant, could be calculated) */
+	u32 page_num;
+	/* Protection Domain */
+	u16 pd;
+	/*
+	 * phys_page_size_shift - page size is (1 << phys_page_size_shift)
+	 * Page size is used for building the Virtual to Physical
+	 * address mapping
+	 */
+	u8 page_shift;
+	/*
+	 * permissions
+	 * 0: local_write_enable - Write permissions: value of 1 needed
+	 *    for RQ buffers and for RDMA write
+	 * 7:1: reserved1 - remote access flags, etc
+	 */
+	u8 permissions;
+	u8 inline_pbl;
+	u8 indirect;
+};
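A sketch of how a caller might pick between the two PBL layouts above (assumed caller logic; the helper below is hypothetical and not part of this patch): registrations whose page count fits the 4-entry inline array can avoid the indirect chunk entirely.

#include <linux/kernel.h>	/* ARRAY_SIZE() */

static void efa_choose_pbl_mode_sketch(struct efa_com_reg_mr_params *params)
{
	params->inline_pbl =
		params->page_num <= ARRAY_SIZE(params->pbl.inline_pbl_array);
}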
+
+struct efa_com_reg_mr_result {
+	/*
+	 * To be used in conjunction with local buffer references in SQ and
+	 * RQ WQEs
+	 */
+	u32 l_key;
+	/*
+	 * To be used in incoming RDMA semantics messages to refer to remotely
+	 * accessed memory region
+	 */
+	u32 r_key;
+};
+
+struct efa_com_dereg_mr_params {
+	u32 l_key;
+};
+
+struct efa_com_alloc_pd_result {
+	u16 pdn;
+};
+
+struct efa_com_dealloc_pd_params {
+	u16 pdn;
+};
+
+struct efa_com_alloc_uar_result {
+	u16 uarn;
+};
+
+struct efa_com_dealloc_uar_params {
+	u16 uarn;
+};
+
+struct efa_com_get_stats_params {
+	/* see enum efa_admin_get_stats_type */
+	u8 type;
+	/* see enum efa_admin_get_stats_scope */
+	u8 scope;
+	u16 scope_modifier;
+};
+
+struct efa_com_basic_stats {
+	u64 tx_bytes;
+	u64 tx_pkts;
+	u64 rx_bytes;
+	u64 rx_pkts;
+	u64 rx_drops;
+};
+
+union efa_com_get_stats_result {
+	struct efa_com_basic_stats basic_stats;
+};
+
+void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low);
+int efa_com_create_qp(struct efa_com_dev *edev,
+		      struct efa_com_create_qp_params *params,
+		      struct efa_com_create_qp_result *res);
+int efa_com_modify_qp(struct efa_com_dev *edev,
+		      struct efa_com_modify_qp_params *params);
+int efa_com_query_qp(struct efa_com_dev *edev,
+		     struct efa_com_query_qp_params *params,
+		     struct efa_com_query_qp_result *result);
+int efa_com_destroy_qp(struct efa_com_dev *edev,
+		       struct efa_com_destroy_qp_params *params);
+int efa_com_create_cq(struct efa_com_dev *edev,
+		      struct efa_com_create_cq_params *params,
+		      struct efa_com_create_cq_result *result);
+int efa_com_destroy_cq(struct efa_com_dev *edev,
+		       struct efa_com_destroy_cq_params *params);
+int efa_com_register_mr(struct efa_com_dev *edev,
+			struct efa_com_reg_mr_params *params,
+			struct efa_com_reg_mr_result *result);
+int efa_com_dereg_mr(struct efa_com_dev *edev,
+		     struct efa_com_dereg_mr_params *params);
+int efa_com_create_ah(struct efa_com_dev *edev,
+		      struct efa_com_create_ah_params *params,
+		      struct efa_com_create_ah_result *result);
+int efa_com_destroy_ah(struct efa_com_dev *edev,
+		       struct efa_com_destroy_ah_params *params);
+int efa_com_get_network_attr(struct efa_com_dev *edev,
+			     struct efa_com_get_network_attr_result *result);
+int efa_com_get_device_attr(struct efa_com_dev *edev,
+			    struct efa_com_get_device_attr_result *result);
+int efa_com_get_hw_hints(struct efa_com_dev *edev,
+			 struct efa_com_get_hw_hints_result *result);
+int efa_com_set_aenq_config(struct efa_com_dev *edev, u32 groups);
+int efa_com_alloc_pd(struct efa_com_dev *edev,
+		     struct efa_com_alloc_pd_result *result);
+int efa_com_dealloc_pd(struct efa_com_dev *edev,
+		       struct efa_com_dealloc_pd_params *params);
+int efa_com_alloc_uar(struct efa_com_dev *edev,
+		      struct efa_com_alloc_uar_result *result);
+int efa_com_dealloc_uar(struct efa_com_dev *edev,
+			struct efa_com_dealloc_uar_params *params);
+int efa_com_get_stats(struct efa_com_dev *edev,
+		      struct efa_com_get_stats_params *params,
+		      union efa_com_get_stats_result *result);
+
+#endif /* _EFA_COM_CMD_H_ */
diff --git a/drivers/infiniband/hw/efa/efa_common_defs.h b/drivers/infiniband/hw/efa/efa_common_defs.h
new file mode 100644
index 0000000..c559ec0
--- /dev/null
+++ b/drivers/infiniband/hw/efa/efa_common_defs.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef _EFA_COMMON_H_
+#define _EFA_COMMON_H_
+
+#define EFA_COMMON_SPEC_VERSION_MAJOR        2
+#define EFA_COMMON_SPEC_VERSION_MINOR        0
+
+struct efa_common_mem_addr {
+	u32 mem_addr_low;
+
+	u32 mem_addr_high;
+};
+
+#endif /* _EFA_COMMON_H_ */
diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c
new file mode 100644
index 0000000..83858f7
--- /dev/null
+++ b/drivers/infiniband/hw/efa/efa_main.c
@@ -0,0 +1,537 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include <rdma/ib_user_verbs.h>
+
+#include "efa.h"
+
+#define PCI_DEV_ID_EFA_VF 0xefa0
+
+static const struct pci_device_id efa_pci_tbl[] = {
+	{ PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA_VF) },
+	{ }
+};
+
+MODULE_AUTHOR("Amazon.com, Inc. or its affiliates");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION(DEVICE_NAME);
+MODULE_DEVICE_TABLE(pci, efa_pci_tbl);
+
+#define EFA_REG_BAR 0
+#define EFA_MEM_BAR 2
+#define EFA_BASE_BAR_MASK (BIT(EFA_REG_BAR) | BIT(EFA_MEM_BAR))
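+
+/*
+ * BAR 0 (EFA_REG_BAR) exposes the device registers; BAR 2 (EFA_MEM_BAR) is
+ * the memory BAR whose pages are later mapped to userspace for LLQ
+ * descriptors.
+ */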
+
+#define EFA_AENQ_ENABLED_GROUPS \
+	(BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \
+	 BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE))
+
+static void efa_update_network_attr(struct efa_dev *dev,
+				    struct efa_com_get_network_attr_result *network_attr)
+{
+	memcpy(dev->addr, network_attr->addr, sizeof(network_attr->addr));
+	dev->mtu = network_attr->mtu;
+
+	dev_dbg(&dev->pdev->dev, "Full address %pI6\n", dev->addr);
+}
+
+/* This handler will be called for any unknown event group or any event with an unimplemented handler */
+static void unimplemented_aenq_handler(void *data,
+				       struct efa_admin_aenq_entry *aenq_e)
+{
+	struct efa_dev *dev = (struct efa_dev *)data;
+
+	ibdev_err(&dev->ibdev,
+		  "Unknown event was received or event with unimplemented handler\n");
+}
+
+static void efa_keep_alive(void *data, struct efa_admin_aenq_entry *aenq_e)
+{
+	struct efa_dev *dev = (struct efa_dev *)data;
+
+	atomic64_inc(&dev->stats.keep_alive_rcvd);
+}
+
+static struct efa_aenq_handlers aenq_handlers = {
+	.handlers = {
+		[EFA_ADMIN_KEEP_ALIVE] = efa_keep_alive,
+	},
+	.unimplemented_handler = unimplemented_aenq_handler
+};
+
+static void efa_release_bars(struct efa_dev *dev, int bars_mask)
+{
+	struct pci_dev *pdev = dev->pdev;
+	int release_bars;
+
+	release_bars = pci_select_bars(pdev, IORESOURCE_MEM) & bars_mask;
+	pci_release_selected_regions(pdev, release_bars);
+}
+
+static irqreturn_t efa_intr_msix_mgmnt(int irq, void *data)
+{
+	struct efa_dev *dev = data;
+
+	efa_com_admin_q_comp_intr_handler(&dev->edev);
+	efa_com_aenq_intr_handler(&dev->edev, data);
+
+	return IRQ_HANDLED;
+}
+
+static int efa_request_mgmnt_irq(struct efa_dev *dev)
+{
+	struct efa_irq *irq;
+	int err;
+
+	irq = &dev->admin_irq;
+	err = request_irq(irq->vector, irq->handler, 0, irq->name,
+			  irq->data);
+	if (err) {
+		dev_err(&dev->pdev->dev, "Failed to request admin irq (%d)\n",
+			err);
+		return err;
+	}
+
+	dev_dbg(&dev->pdev->dev, "Set affinity hint of mgmnt irq to %*pbl (irq vector: %d)\n",
+		nr_cpumask_bits, &irq->affinity_hint_mask, irq->vector);
+	irq_set_affinity_hint(irq->vector, &irq->affinity_hint_mask);
+
+	return 0;
+}
+
+static void efa_setup_mgmnt_irq(struct efa_dev *dev)
+{
+	u32 cpu;
+
+	snprintf(dev->admin_irq.name, EFA_IRQNAME_SIZE,
+		 "efa-mgmnt@pci:%s", pci_name(dev->pdev));
+	dev->admin_irq.handler = efa_intr_msix_mgmnt;
+	dev->admin_irq.data = dev;
+	dev->admin_irq.vector =
+		pci_irq_vector(dev->pdev, dev->admin_msix_vector_idx);
+	cpu = cpumask_first(cpu_online_mask);
+	dev->admin_irq.cpu = cpu;
+	cpumask_set_cpu(cpu,
+			&dev->admin_irq.affinity_hint_mask);
+	dev_info(&dev->pdev->dev, "Setup irq:0x%p vector:%d name:%s\n",
+		 &dev->admin_irq,
+		 dev->admin_irq.vector,
+		 dev->admin_irq.name);
+}
+
+static void efa_free_mgmnt_irq(struct efa_dev *dev)
+{
+	struct efa_irq *irq;
+
+	irq = &dev->admin_irq;
+	irq_set_affinity_hint(irq->vector, NULL);
+	free_irq(irq->vector, irq->data);
+}
+
+static int efa_set_mgmnt_irq(struct efa_dev *dev)
+{
+	efa_setup_mgmnt_irq(dev);
+
+	return efa_request_mgmnt_irq(dev);
+}
+
+static int efa_request_doorbell_bar(struct efa_dev *dev)
+{
+	u8 db_bar_idx = dev->dev_attr.db_bar;
+	struct pci_dev *pdev = dev->pdev;
+	int bars;
+	int err;
+
+	if (!(BIT(db_bar_idx) & EFA_BASE_BAR_MASK)) {
+		bars = pci_select_bars(pdev, IORESOURCE_MEM) & BIT(db_bar_idx);
+
+		err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME);
+		if (err) {
+			dev_err(&dev->pdev->dev,
+				"pci_request_selected_regions for bar %d failed %d\n",
+				db_bar_idx, err);
+			return err;
+		}
+	}
+
+	dev->db_bar_addr = pci_resource_start(dev->pdev, db_bar_idx);
+	dev->db_bar_len = pci_resource_len(dev->pdev, db_bar_idx);
+
+	return 0;
+}
+
+static void efa_release_doorbell_bar(struct efa_dev *dev)
+{
+	if (!(BIT(dev->dev_attr.db_bar) & EFA_BASE_BAR_MASK))
+		efa_release_bars(dev, BIT(dev->dev_attr.db_bar));
+}
+
+static void efa_update_hw_hints(struct efa_dev *dev,
+				struct efa_com_get_hw_hints_result *hw_hints)
+{
+	struct efa_com_dev *edev = &dev->edev;
+
+	if (hw_hints->mmio_read_timeout)
+		edev->mmio_read.mmio_read_timeout =
+			hw_hints->mmio_read_timeout * 1000;
+
+	if (hw_hints->poll_interval)
+		edev->aq.poll_interval = hw_hints->poll_interval;
+
+	if (hw_hints->admin_completion_timeout)
+		edev->aq.completion_timeout =
+			hw_hints->admin_completion_timeout;
+}
+
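+/*
+ * efa_stats_init() walks dev->stats as a flat array, which relies on the
+ * stats structure containing nothing but atomic64_t counters.
+ */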
+static void efa_stats_init(struct efa_dev *dev)
+{
+	atomic64_t *s = (atomic64_t *)&dev->stats;
+	int i;
+
+	for (i = 0; i < sizeof(dev->stats) / sizeof(*s); i++, s++)
+		atomic64_set(s, 0);
+}
+
+static const struct ib_device_ops efa_dev_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_EFA,
+	.uverbs_abi_ver = EFA_UVERBS_ABI_VERSION,
+
+	.alloc_hw_stats = efa_alloc_hw_stats,
+	.alloc_pd = efa_alloc_pd,
+	.alloc_ucontext = efa_alloc_ucontext,
+	.create_ah = efa_create_ah,
+	.create_cq = efa_create_cq,
+	.create_qp = efa_create_qp,
+	.dealloc_pd = efa_dealloc_pd,
+	.dealloc_ucontext = efa_dealloc_ucontext,
+	.dereg_mr = efa_dereg_mr,
+	.destroy_ah = efa_destroy_ah,
+	.destroy_cq = efa_destroy_cq,
+	.destroy_qp = efa_destroy_qp,
+	.get_hw_stats = efa_get_hw_stats,
+	.get_link_layer = efa_port_link_layer,
+	.get_port_immutable = efa_get_port_immutable,
+	.mmap = efa_mmap,
+	.modify_qp = efa_modify_qp,
+	.query_device = efa_query_device,
+	.query_gid = efa_query_gid,
+	.query_pkey = efa_query_pkey,
+	.query_port = efa_query_port,
+	.query_qp = efa_query_qp,
+	.reg_user_mr = efa_reg_mr,
+
+	INIT_RDMA_OBJ_SIZE(ib_ah, efa_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, efa_cq, ibcq),
+	INIT_RDMA_OBJ_SIZE(ib_pd, efa_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, efa_ucontext, ibucontext),
+};
+
+static int efa_ib_device_add(struct efa_dev *dev)
+{
+	struct efa_com_get_network_attr_result network_attr;
+	struct efa_com_get_hw_hints_result hw_hints;
+	struct pci_dev *pdev = dev->pdev;
+	int err;
+
+	efa_stats_init(dev);
+
+	err = efa_com_get_device_attr(&dev->edev, &dev->dev_attr);
+	if (err)
+		return err;
+
+	dev_dbg(&dev->pdev->dev, "Doorbells bar (%d)\n", dev->dev_attr.db_bar);
+	err = efa_request_doorbell_bar(dev);
+	if (err)
+		return err;
+
+	err = efa_com_get_network_attr(&dev->edev, &network_attr);
+	if (err)
+		goto err_release_doorbell_bar;
+
+	efa_update_network_attr(dev, &network_attr);
+
+	err = efa_com_get_hw_hints(&dev->edev, &hw_hints);
+	if (err)
+		goto err_release_doorbell_bar;
+
+	efa_update_hw_hints(dev, &hw_hints);
+
+	/* Try to enable all the available aenq groups */
+	err = efa_com_set_aenq_config(&dev->edev, EFA_AENQ_ENABLED_GROUPS);
+	if (err)
+		goto err_release_doorbell_bar;
+
+	dev->ibdev.node_type = RDMA_NODE_UNSPECIFIED;
+	dev->ibdev.phys_port_cnt = 1;
+	dev->ibdev.num_comp_vectors = 1;
+	dev->ibdev.dev.parent = &pdev->dev;
+
+	dev->ibdev.uverbs_cmd_mask =
+		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+		(1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+		(1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+		(1ull << IB_USER_VERBS_CMD_REG_MR) |
+		(1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+		(1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+		(1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_AH) |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_AH);
+
+	dev->ibdev.uverbs_ex_cmd_mask =
+		(1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE);
+
+	ib_set_device_ops(&dev->ibdev, &efa_dev_ops);
+
+	err = ib_register_device(&dev->ibdev, "efa_%d");
+	if (err)
+		goto err_release_doorbell_bar;
+
+	ibdev_info(&dev->ibdev, "IB device registered\n");
+
+	return 0;
+
+err_release_doorbell_bar:
+	efa_release_doorbell_bar(dev);
+	return err;
+}
+
+static void efa_ib_device_remove(struct efa_dev *dev)
+{
+	efa_com_dev_reset(&dev->edev, EFA_REGS_RESET_NORMAL);
+	ibdev_info(&dev->ibdev, "Unregister ib device\n");
+	ib_unregister_device(&dev->ibdev);
+	efa_release_doorbell_bar(dev);
+}
+
+static void efa_disable_msix(struct efa_dev *dev)
+{
+	pci_free_irq_vectors(dev->pdev);
+}
+
+static int efa_enable_msix(struct efa_dev *dev)
+{
+	int msix_vecs, irq_num;
+
+	/* Reserve the max msix vectors we might need */
+	msix_vecs = EFA_NUM_MSIX_VEC;
+	dev_dbg(&dev->pdev->dev, "Trying to enable MSI-X, vectors %d\n",
+		msix_vecs);
+
+	dev->admin_msix_vector_idx = EFA_MGMNT_MSIX_VEC_IDX;
+	irq_num = pci_alloc_irq_vectors(dev->pdev, msix_vecs,
+					msix_vecs, PCI_IRQ_MSIX);
+
+	if (irq_num < 0) {
+		dev_err(&dev->pdev->dev, "Failed to enable MSI-X. irq_num %d\n",
+			irq_num);
+		return -ENOSPC;
+	}
+
+	if (irq_num != msix_vecs) {
+		dev_err(&dev->pdev->dev,
+			"Allocated %d MSI-X (out of %d requested)\n",
+			irq_num, msix_vecs);
+		return -ENOSPC;
+	}
+
+	return 0;
+}
+
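+/*
+ * Bring the device to a known state: reset it, validate the version exposed
+ * by the device and set both the streaming and coherent DMA masks to the
+ * DMA address width the device reports.
+ */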
+static int efa_device_init(struct efa_com_dev *edev, struct pci_dev *pdev)
+{
+	int dma_width;
+	int err;
+
+	err = efa_com_dev_reset(edev, EFA_REGS_RESET_NORMAL);
+	if (err)
+		return err;
+
+	err = efa_com_validate_version(edev);
+	if (err)
+		return err;
+
+	dma_width = efa_com_get_dma_width(edev);
+	if (dma_width < 0) {
+		err = dma_width;
+		return err;
+	}
+
+	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(dma_width));
+	if (err) {
+		dev_err(&pdev->dev, "pci_set_dma_mask failed %d\n", err);
+		return err;
+	}
+
+	err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(dma_width));
+	if (err) {
+		dev_err(&pdev->dev,
+			"pci_set_consistent_dma_mask failed %d\n",
+			err);
+		return err;
+	}
+
+	return 0;
+}
+
+static struct efa_dev *efa_probe_device(struct pci_dev *pdev)
+{
+	struct efa_com_dev *edev;
+	struct efa_dev *dev;
+	int bars;
+	int err;
+
+	err = pci_enable_device_mem(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "pci_enable_device_mem() failed!\n");
+		return ERR_PTR(err);
+	}
+
+	pci_set_master(pdev);
+
+	dev = ib_alloc_device(efa_dev, ibdev);
+	if (!dev) {
+		dev_err(&pdev->dev, "Device alloc failed\n");
+		err = -ENOMEM;
+		goto err_disable_device;
+	}
+
+	pci_set_drvdata(pdev, dev);
+	edev = &dev->edev;
+	edev->efa_dev = dev;
+	edev->dmadev = &pdev->dev;
+	dev->pdev = pdev;
+
+	bars = pci_select_bars(pdev, IORESOURCE_MEM) & EFA_BASE_BAR_MASK;
+	err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME);
+	if (err) {
+		dev_err(&pdev->dev, "pci_request_selected_regions failed %d\n",
+			err);
+		goto err_ibdev_destroy;
+	}
+
+	dev->reg_bar_addr = pci_resource_start(pdev, EFA_REG_BAR);
+	dev->reg_bar_len = pci_resource_len(pdev, EFA_REG_BAR);
+	dev->mem_bar_addr = pci_resource_start(pdev, EFA_MEM_BAR);
+	dev->mem_bar_len = pci_resource_len(pdev, EFA_MEM_BAR);
+
+	edev->reg_bar = devm_ioremap(&pdev->dev,
+				     dev->reg_bar_addr,
+				     dev->reg_bar_len);
+	if (!edev->reg_bar) {
+		dev_err(&pdev->dev, "Failed to remap register bar\n");
+		err = -EFAULT;
+		goto err_release_bars;
+	}
+
+	err = efa_com_mmio_reg_read_init(edev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to init readless MMIO\n");
+		goto err_iounmap;
+	}
+
+	err = efa_device_init(edev, pdev);
+	if (err) {
+		dev_err(&pdev->dev, "EFA device init failed\n");
+		if (err == -ETIME)
+			err = -EPROBE_DEFER;
+		goto err_reg_read_destroy;
+	}
+
+	err = efa_enable_msix(dev);
+	if (err)
+		goto err_reg_read_destroy;
+
+	edev->aq.msix_vector_idx = dev->admin_msix_vector_idx;
+	edev->aenq.msix_vector_idx = dev->admin_msix_vector_idx;
+
+	err = efa_set_mgmnt_irq(dev);
+	if (err)
+		goto err_disable_msix;
+
+	err = efa_com_admin_init(edev, &aenq_handlers);
+	if (err)
+		goto err_free_mgmnt_irq;
+
+	return dev;
+
+err_free_mgmnt_irq:
+	efa_free_mgmnt_irq(dev);
+err_disable_msix:
+	efa_disable_msix(dev);
+err_reg_read_destroy:
+	efa_com_mmio_reg_read_destroy(edev);
+err_iounmap:
+	devm_iounmap(&pdev->dev, edev->reg_bar);
+err_release_bars:
+	efa_release_bars(dev, EFA_BASE_BAR_MASK);
+err_ibdev_destroy:
+	ib_dealloc_device(&dev->ibdev);
+err_disable_device:
+	pci_disable_device(pdev);
+	return ERR_PTR(err);
+}
+
+static void efa_remove_device(struct pci_dev *pdev)
+{
+	struct efa_dev *dev = pci_get_drvdata(pdev);
+	struct efa_com_dev *edev;
+
+	edev = &dev->edev;
+	efa_com_admin_destroy(edev);
+	efa_free_mgmnt_irq(dev);
+	efa_disable_msix(dev);
+	efa_com_mmio_reg_read_destroy(edev);
+	devm_iounmap(&pdev->dev, edev->reg_bar);
+	efa_release_bars(dev, EFA_BASE_BAR_MASK);
+	ib_dealloc_device(&dev->ibdev);
+	pci_disable_device(pdev);
+}
+
+static int efa_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	struct efa_dev *dev;
+	int err;
+
+	dev = efa_probe_device(pdev);
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	err = efa_ib_device_add(dev);
+	if (err)
+		goto err_remove_device;
+
+	return 0;
+
+err_remove_device:
+	efa_remove_device(pdev);
+	return err;
+}
+
+static void efa_remove(struct pci_dev *pdev)
+{
+	struct efa_dev *dev = pci_get_drvdata(pdev);
+
+	efa_ib_device_remove(dev);
+	efa_remove_device(pdev);
+}
+
+static struct pci_driver efa_pci_driver = {
+	.name           = DRV_MODULE_NAME,
+	.id_table       = efa_pci_tbl,
+	.probe          = efa_probe,
+	.remove         = efa_remove,
+};
+
+module_pci_driver(efa_pci_driver);
diff --git a/drivers/infiniband/hw/efa/efa_regs_defs.h b/drivers/infiniband/hw/efa/efa_regs_defs.h
new file mode 100644
index 0000000..bb9cad3
--- /dev/null
+++ b/drivers/infiniband/hw/efa/efa_regs_defs.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef _EFA_REGS_H_
+#define _EFA_REGS_H_
+
+enum efa_regs_reset_reason_types {
+	EFA_REGS_RESET_NORMAL                       = 0,
+	/* Keep alive timeout */
+	EFA_REGS_RESET_KEEP_ALIVE_TO                = 1,
+	EFA_REGS_RESET_ADMIN_TO                     = 2,
+	EFA_REGS_RESET_INIT_ERR                     = 3,
+	EFA_REGS_RESET_DRIVER_INVALID_STATE         = 4,
+	EFA_REGS_RESET_OS_TRIGGER                   = 5,
+	EFA_REGS_RESET_SHUTDOWN                     = 6,
+	EFA_REGS_RESET_USER_TRIGGER                 = 7,
+	EFA_REGS_RESET_GENERIC                      = 8,
+};
+
+/* efa_registers offsets */
+
+/* 0 base */
+#define EFA_REGS_VERSION_OFF                                0x0
+#define EFA_REGS_CONTROLLER_VERSION_OFF                     0x4
+#define EFA_REGS_CAPS_OFF                                   0x8
+#define EFA_REGS_AQ_BASE_LO_OFF                             0x10
+#define EFA_REGS_AQ_BASE_HI_OFF                             0x14
+#define EFA_REGS_AQ_CAPS_OFF                                0x18
+#define EFA_REGS_ACQ_BASE_LO_OFF                            0x20
+#define EFA_REGS_ACQ_BASE_HI_OFF                            0x24
+#define EFA_REGS_ACQ_CAPS_OFF                               0x28
+#define EFA_REGS_AQ_PROD_DB_OFF                             0x2c
+#define EFA_REGS_AENQ_CAPS_OFF                              0x34
+#define EFA_REGS_AENQ_BASE_LO_OFF                           0x38
+#define EFA_REGS_AENQ_BASE_HI_OFF                           0x3c
+#define EFA_REGS_AENQ_CONS_DB_OFF                           0x40
+#define EFA_REGS_INTR_MASK_OFF                              0x4c
+#define EFA_REGS_DEV_CTL_OFF                                0x54
+#define EFA_REGS_DEV_STS_OFF                                0x58
+#define EFA_REGS_MMIO_REG_READ_OFF                          0x5c
+#define EFA_REGS_MMIO_RESP_LO_OFF                           0x60
+#define EFA_REGS_MMIO_RESP_HI_OFF                           0x64
+
+/* version register */
+#define EFA_REGS_VERSION_MINOR_VERSION_MASK                 0xff
+#define EFA_REGS_VERSION_MAJOR_VERSION_SHIFT                8
+#define EFA_REGS_VERSION_MAJOR_VERSION_MASK                 0xff00
+
+/* controller_version register */
+#define EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK   0xff
+#define EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT     8
+#define EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK      0xff00
+#define EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT     16
+#define EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK      0xff0000
+#define EFA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT           24
+#define EFA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK            0xff000000
+
+/* caps register */
+#define EFA_REGS_CAPS_CONTIGUOUS_QUEUE_REQUIRED_MASK        0x1
+#define EFA_REGS_CAPS_RESET_TIMEOUT_SHIFT                   1
+#define EFA_REGS_CAPS_RESET_TIMEOUT_MASK                    0x3e
+#define EFA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT                  8
+#define EFA_REGS_CAPS_DMA_ADDR_WIDTH_MASK                   0xff00
+#define EFA_REGS_CAPS_ADMIN_CMD_TO_SHIFT                    16
+#define EFA_REGS_CAPS_ADMIN_CMD_TO_MASK                     0xf0000
+
+/* aq_caps register */
+#define EFA_REGS_AQ_CAPS_AQ_DEPTH_MASK                      0xffff
+#define EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT                16
+#define EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK                 0xffff0000
+
+/* acq_caps register */
+#define EFA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK                    0xffff
+#define EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT              16
+#define EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK               0xff0000
+#define EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_SHIFT             24
+#define EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_MASK              0xff000000
+
+/* aenq_caps register */
+#define EFA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK                  0xffff
+#define EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT            16
+#define EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK             0xff0000
+#define EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_SHIFT           24
+#define EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_MASK            0xff000000
+
+/* dev_ctl register */
+#define EFA_REGS_DEV_CTL_DEV_RESET_MASK                     0x1
+#define EFA_REGS_DEV_CTL_AQ_RESTART_SHIFT                   1
+#define EFA_REGS_DEV_CTL_AQ_RESTART_MASK                    0x2
+#define EFA_REGS_DEV_CTL_RESET_REASON_SHIFT                 28
+#define EFA_REGS_DEV_CTL_RESET_REASON_MASK                  0xf0000000
+
+/* dev_sts register */
+#define EFA_REGS_DEV_STS_READY_MASK                         0x1
+#define EFA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_SHIFT       1
+#define EFA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_MASK        0x2
+#define EFA_REGS_DEV_STS_AQ_RESTART_FINISHED_SHIFT          2
+#define EFA_REGS_DEV_STS_AQ_RESTART_FINISHED_MASK           0x4
+#define EFA_REGS_DEV_STS_RESET_IN_PROGRESS_SHIFT            3
+#define EFA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK             0x8
+#define EFA_REGS_DEV_STS_RESET_FINISHED_SHIFT               4
+#define EFA_REGS_DEV_STS_RESET_FINISHED_MASK                0x10
+#define EFA_REGS_DEV_STS_FATAL_ERROR_SHIFT                  5
+#define EFA_REGS_DEV_STS_FATAL_ERROR_MASK                   0x20
+
+/* mmio_reg_read register */
+#define EFA_REGS_MMIO_REG_READ_REQ_ID_MASK                  0xffff
+#define EFA_REGS_MMIO_REG_READ_REG_OFF_SHIFT                16
+#define EFA_REGS_MMIO_REG_READ_REG_OFF_MASK                 0xffff0000
+
+#endif /* _EFA_REGS_H_ */
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
new file mode 100644
index 0000000..4edae89
--- /dev/null
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -0,0 +1,1808 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#include <linux/vmalloc.h>
+
+#include <rdma/ib_addr.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/uverbs_ioctl.h>
+
+#include "efa.h"
+
+#define EFA_MMAP_FLAG_SHIFT 56
+#define EFA_MMAP_PAGE_MASK GENMASK(EFA_MMAP_FLAG_SHIFT - 1, 0)
+#define EFA_MMAP_INVALID U64_MAX
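+
+/*
+ * An mmap key encodes the entry type in its top byte and the xarray page
+ * index, shifted by PAGE_SHIFT, in the lower 56 bits.  Illustrative values
+ * only, with 4 KiB pages (PAGE_SHIFT == 12): flag EFA_MMAP_IO_NC and page
+ * index 3 give key = (2ULL << 56) | 0x3000.
+ */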
+
+enum {
+	EFA_MMAP_DMA_PAGE = 0,
+	EFA_MMAP_IO_WC,
+	EFA_MMAP_IO_NC,
+};
+
+#define EFA_AENQ_ENABLED_GROUPS \
+	(BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \
+	 BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE))
+
+struct efa_mmap_entry {
+	void  *obj;
+	u64 address;
+	u64 length;
+	u32 mmap_page;
+	u8 mmap_flag;
+};
+
+static inline u64 get_mmap_key(const struct efa_mmap_entry *efa)
+{
+	return ((u64)efa->mmap_flag << EFA_MMAP_FLAG_SHIFT) |
+	       ((u64)efa->mmap_page << PAGE_SHIFT);
+}
+
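+/*
+ * X-macro: EFA_DEFINE_STATS() expands once into the efa_hw_stats enum and
+ * once into the efa_stats_names[] table below, keeping the two in sync.
+ */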
+#define EFA_DEFINE_STATS(op) \
+	op(EFA_TX_BYTES, "tx_bytes") \
+	op(EFA_TX_PKTS, "tx_pkts") \
+	op(EFA_RX_BYTES, "rx_bytes") \
+	op(EFA_RX_PKTS, "rx_pkts") \
+	op(EFA_RX_DROPS, "rx_drops") \
+	op(EFA_SUBMITTED_CMDS, "submitted_cmds") \
+	op(EFA_COMPLETED_CMDS, "completed_cmds") \
+	op(EFA_NO_COMPLETION_CMDS, "no_completion_cmds") \
+	op(EFA_KEEP_ALIVE_RCVD, "keep_alive_rcvd") \
+	op(EFA_ALLOC_PD_ERR, "alloc_pd_err") \
+	op(EFA_CREATE_QP_ERR, "create_qp_err") \
+	op(EFA_REG_MR_ERR, "reg_mr_err") \
+	op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \
+	op(EFA_CREATE_AH_ERR, "create_ah_err")
+
+#define EFA_STATS_ENUM(ename, name) ename,
+#define EFA_STATS_STR(ename, name) [ename] = name,
+
+enum efa_hw_stats {
+	EFA_DEFINE_STATS(EFA_STATS_ENUM)
+};
+
+static const char *const efa_stats_names[] = {
+	EFA_DEFINE_STATS(EFA_STATS_STR)
+};
+
+#define EFA_CHUNK_PAYLOAD_SHIFT       12
+#define EFA_CHUNK_PAYLOAD_SIZE        BIT(EFA_CHUNK_PAYLOAD_SHIFT)
+#define EFA_CHUNK_PAYLOAD_PTR_SIZE    8
+
+#define EFA_CHUNK_SHIFT               12
+#define EFA_CHUNK_SIZE                BIT(EFA_CHUNK_SHIFT)
+#define EFA_CHUNK_PTR_SIZE            sizeof(struct efa_com_ctrl_buff_info)
+
+#define EFA_PTRS_PER_CHUNK \
+	((EFA_CHUNK_SIZE - EFA_CHUNK_PTR_SIZE) / EFA_CHUNK_PAYLOAD_PTR_SIZE)
+
+#define EFA_CHUNK_USED_SIZE \
+	((EFA_PTRS_PER_CHUNK * EFA_CHUNK_PAYLOAD_PTR_SIZE) + EFA_CHUNK_PTR_SIZE)
+
+#define EFA_SUPPORTED_ACCESS_FLAGS IB_ACCESS_LOCAL_WRITE
+
+struct pbl_chunk {
+	dma_addr_t dma_addr;
+	u64 *buf;
+	u32 length;
+};
+
+struct pbl_chunk_list {
+	struct pbl_chunk *chunks;
+	unsigned int size;
+};
+
+struct pbl_context {
+	union {
+		struct {
+			dma_addr_t dma_addr;
+		} continuous;
+		struct {
+			u32 pbl_buf_size_in_pages;
+			struct scatterlist *sgl;
+			int sg_dma_cnt;
+			struct pbl_chunk_list chunk_list;
+		} indirect;
+	} phys;
+	u64 *pbl_buf;
+	u32 pbl_buf_size_in_bytes;
+	u8 physically_continuous;
+};
+
+static inline struct efa_dev *to_edev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct efa_dev, ibdev);
+}
+
+static inline struct efa_ucontext *to_eucontext(struct ib_ucontext *ibucontext)
+{
+	return container_of(ibucontext, struct efa_ucontext, ibucontext);
+}
+
+static inline struct efa_pd *to_epd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct efa_pd, ibpd);
+}
+
+static inline struct efa_mr *to_emr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct efa_mr, ibmr);
+}
+
+static inline struct efa_qp *to_eqp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct efa_qp, ibqp);
+}
+
+static inline struct efa_cq *to_ecq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct efa_cq, ibcq);
+}
+
+static inline struct efa_ah *to_eah(struct ib_ah *ibah)
+{
+	return container_of(ibah, struct efa_ah, ibah);
+}
+
+#define field_avail(x, fld, sz) (offsetof(typeof(x), fld) + \
+				 FIELD_SIZEOF(typeof(x), fld) <= (sz))
+
+#define is_reserved_cleared(reserved) \
+	!memchr_inv(reserved, 0, sizeof(reserved))
+
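+/*
+ * Allocate page-exact, zeroed memory and map it for device DMA.  Callers undo
+ * this with dma_unmap_single() plus free_pages_exact(); for buffers exposed to
+ * userspace the free is deferred to mmap_entries_remove_free().
+ */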
+static void *efa_zalloc_mapped(struct efa_dev *dev, dma_addr_t *dma_addr,
+			       size_t size, enum dma_data_direction dir)
+{
+	void *addr;
+
+	addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
+	if (!addr)
+		return NULL;
+
+	*dma_addr = dma_map_single(&dev->pdev->dev, addr, size, dir);
+	if (dma_mapping_error(&dev->pdev->dev, *dma_addr)) {
+		ibdev_err(&dev->ibdev, "Failed to map DMA address\n");
+		free_pages_exact(addr, size);
+		return NULL;
+	}
+
+	return addr;
+}
+
+/*
+ * This is only called when the ucontext is destroyed and there can be no
+ * concurrent query or allocation on the xarray via mmap, so we can be sure no
+ * other thread is using the entry pointer. We also know that all the BAR
+ * pages have either been zapped or munmapped at this point. Normal pages are
+ * refcounted and will be freed at the proper time.
+ */
+static void mmap_entries_remove_free(struct efa_dev *dev,
+				     struct efa_ucontext *ucontext)
+{
+	struct efa_mmap_entry *entry;
+	unsigned long mmap_page;
+
+	xa_for_each(&ucontext->mmap_xa, mmap_page, entry) {
+		xa_erase(&ucontext->mmap_xa, mmap_page);
+
+		ibdev_dbg(
+			&dev->ibdev,
+			"mmap: obj[0x%p] key[%#llx] addr[%#llx] len[%#llx] removed\n",
+			entry->obj, get_mmap_key(entry), entry->address,
+			entry->length);
+		if (entry->mmap_flag == EFA_MMAP_DMA_PAGE)
+			/* DMA mapping is already gone, now free the pages */
+			free_pages_exact(phys_to_virt(entry->address),
+					 entry->length);
+		kfree(entry);
+	}
+}
+
+static struct efa_mmap_entry *mmap_entry_get(struct efa_dev *dev,
+					     struct efa_ucontext *ucontext,
+					     u64 key, u64 len)
+{
+	struct efa_mmap_entry *entry;
+	u64 mmap_page;
+
+	mmap_page = (key & EFA_MMAP_PAGE_MASK) >> PAGE_SHIFT;
+	if (mmap_page > U32_MAX)
+		return NULL;
+
+	entry = xa_load(&ucontext->mmap_xa, mmap_page);
+	if (!entry || get_mmap_key(entry) != key || entry->length != len)
+		return NULL;
+
+	ibdev_dbg(&dev->ibdev,
+		  "mmap: obj[0x%p] key[%#llx] addr[%#llx] len[%#llx] found\n",
+		  entry->obj, key, entry->address, entry->length);
+
+	return entry;
+}
+
+/*
+ * Note this locking scheme cannot support removal of entries, except during
+ * ucontext destruction when the core code guarantees no concurrency.
+ */
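+/*
+ * Only the entry's first page index is inserted into the xarray, but
+ * mmap_xa_page is advanced by length >> PAGE_SHIFT so every entry owns a
+ * non-overlapping page-index range and keys of multi-page buffers stay
+ * unique.
+ */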
+static u64 mmap_entry_insert(struct efa_dev *dev, struct efa_ucontext *ucontext,
+			     void *obj, u64 address, u64 length, u8 mmap_flag)
+{
+	struct efa_mmap_entry *entry;
+	u32 next_mmap_page;
+	int err;
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return EFA_MMAP_INVALID;
+
+	entry->obj = obj;
+	entry->address = address;
+	entry->length = length;
+	entry->mmap_flag = mmap_flag;
+
+	xa_lock(&ucontext->mmap_xa);
+	if (check_add_overflow(ucontext->mmap_xa_page,
+			       (u32)(length >> PAGE_SHIFT),
+			       &next_mmap_page))
+		goto err_unlock;
+
+	entry->mmap_page = ucontext->mmap_xa_page;
+	ucontext->mmap_xa_page = next_mmap_page;
+	err = __xa_insert(&ucontext->mmap_xa, entry->mmap_page, entry,
+			  GFP_KERNEL);
+	if (err)
+		goto err_unlock;
+
+	xa_unlock(&ucontext->mmap_xa);
+
+	ibdev_dbg(
+		&dev->ibdev,
+		"mmap: obj[0x%p] addr[%#llx], len[%#llx], key[%#llx] inserted\n",
+		entry->obj, entry->address, entry->length, get_mmap_key(entry));
+
+	return get_mmap_key(entry);
+
+err_unlock:
+	xa_unlock(&ucontext->mmap_xa);
+	kfree(entry);
+	return EFA_MMAP_INVALID;
+}
+
+int efa_query_device(struct ib_device *ibdev,
+		     struct ib_device_attr *props,
+		     struct ib_udata *udata)
+{
+	struct efa_com_get_device_attr_result *dev_attr;
+	struct efa_ibv_ex_query_device_resp resp = {};
+	struct efa_dev *dev = to_edev(ibdev);
+	int err;
+
+	if (udata && udata->inlen &&
+	    !ib_is_udata_cleared(udata, 0, udata->inlen)) {
+		ibdev_dbg(ibdev,
+			  "Incompatible ABI params, udata not cleared\n");
+		return -EINVAL;
+	}
+
+	dev_attr = &dev->dev_attr;
+
+	memset(props, 0, sizeof(*props));
+	props->max_mr_size = dev_attr->max_mr_pages * PAGE_SIZE;
+	props->page_size_cap = dev_attr->page_size_cap;
+	props->vendor_id = dev->pdev->vendor;
+	props->vendor_part_id = dev->pdev->device;
+	props->hw_ver = dev->pdev->subsystem_device;
+	props->max_qp = dev_attr->max_qp;
+	props->max_cq = dev_attr->max_cq;
+	props->max_pd = dev_attr->max_pd;
+	props->max_mr = dev_attr->max_mr;
+	props->max_ah = dev_attr->max_ah;
+	props->max_cqe = dev_attr->max_cq_depth;
+	props->max_qp_wr = min_t(u32, dev_attr->max_sq_depth,
+				 dev_attr->max_rq_depth);
+	props->max_send_sge = dev_attr->max_sq_sge;
+	props->max_recv_sge = dev_attr->max_rq_sge;
+
+	if (udata && udata->outlen) {
+		resp.max_sq_sge = dev_attr->max_sq_sge;
+		resp.max_rq_sge = dev_attr->max_rq_sge;
+		resp.max_sq_wr = dev_attr->max_sq_depth;
+		resp.max_rq_wr = dev_attr->max_rq_depth;
+
+		err = ib_copy_to_udata(udata, &resp,
+				       min(sizeof(resp), udata->outlen));
+		if (err) {
+			ibdev_dbg(ibdev,
+				  "Failed to copy udata for query_device\n");
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+int efa_query_port(struct ib_device *ibdev, u8 port,
+		   struct ib_port_attr *props)
+{
+	struct efa_dev *dev = to_edev(ibdev);
+
+	props->lmc = 1;
+
+	props->state = IB_PORT_ACTIVE;
+	props->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
+	props->gid_tbl_len = 1;
+	props->pkey_tbl_len = 1;
+	props->active_speed = IB_SPEED_EDR;
+	props->active_width = IB_WIDTH_4X;
+	props->max_mtu = ib_mtu_int_to_enum(dev->mtu);
+	props->active_mtu = ib_mtu_int_to_enum(dev->mtu);
+	props->max_msg_sz = dev->mtu;
+	props->max_vl_num = 1;
+
+	return 0;
+}
+
+int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+		 int qp_attr_mask,
+		 struct ib_qp_init_attr *qp_init_attr)
+{
+	struct efa_dev *dev = to_edev(ibqp->device);
+	struct efa_com_query_qp_params params = {};
+	struct efa_com_query_qp_result result;
+	struct efa_qp *qp = to_eqp(ibqp);
+	int err;
+
+#define EFA_QUERY_QP_SUPP_MASK \
+	(IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT | \
+	 IB_QP_QKEY | IB_QP_SQ_PSN | IB_QP_CAP)
+
+	if (qp_attr_mask & ~EFA_QUERY_QP_SUPP_MASK) {
+		ibdev_dbg(&dev->ibdev,
+			  "Unsupported qp_attr_mask[%#x] supported[%#x]\n",
+			  qp_attr_mask, EFA_QUERY_QP_SUPP_MASK);
+		return -EOPNOTSUPP;
+	}
+
+	memset(qp_attr, 0, sizeof(*qp_attr));
+	memset(qp_init_attr, 0, sizeof(*qp_init_attr));
+
+	params.qp_handle = qp->qp_handle;
+	err = efa_com_query_qp(&dev->edev, &params, &result);
+	if (err)
+		return err;
+
+	qp_attr->qp_state = result.qp_state;
+	qp_attr->qkey = result.qkey;
+	qp_attr->sq_psn = result.sq_psn;
+	qp_attr->sq_draining = result.sq_draining;
+	qp_attr->port_num = 1;
+
+	qp_attr->cap.max_send_wr = qp->max_send_wr;
+	qp_attr->cap.max_recv_wr = qp->max_recv_wr;
+	qp_attr->cap.max_send_sge = qp->max_send_sge;
+	qp_attr->cap.max_recv_sge = qp->max_recv_sge;
+	qp_attr->cap.max_inline_data = qp->max_inline_data;
+
+	qp_init_attr->qp_type = ibqp->qp_type;
+	qp_init_attr->recv_cq = ibqp->recv_cq;
+	qp_init_attr->send_cq = ibqp->send_cq;
+	qp_init_attr->qp_context = ibqp->qp_context;
+	qp_init_attr->cap = qp_attr->cap;
+
+	return 0;
+}
+
+int efa_query_gid(struct ib_device *ibdev, u8 port, int index,
+		  union ib_gid *gid)
+{
+	struct efa_dev *dev = to_edev(ibdev);
+
+	memcpy(gid->raw, dev->addr, sizeof(dev->addr));
+
+	return 0;
+}
+
+int efa_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+		   u16 *pkey)
+{
+	if (index > 0)
+		return -EINVAL;
+
+	*pkey = 0xffff;
+	return 0;
+}
+
+static int efa_pd_dealloc(struct efa_dev *dev, u16 pdn)
+{
+	struct efa_com_dealloc_pd_params params = {
+		.pdn = pdn,
+	};
+
+	return efa_com_dealloc_pd(&dev->edev, &params);
+}
+
+int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
+{
+	struct efa_dev *dev = to_edev(ibpd->device);
+	struct efa_ibv_alloc_pd_resp resp = {};
+	struct efa_com_alloc_pd_result result;
+	struct efa_pd *pd = to_epd(ibpd);
+	int err;
+
+	if (udata->inlen &&
+	    !ib_is_udata_cleared(udata, 0, udata->inlen)) {
+		ibdev_dbg(&dev->ibdev,
+			  "Incompatible ABI params, udata not cleared\n");
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	err = efa_com_alloc_pd(&dev->edev, &result);
+	if (err)
+		goto err_out;
+
+	pd->pdn = result.pdn;
+	resp.pdn = result.pdn;
+
+	if (udata->outlen) {
+		err = ib_copy_to_udata(udata, &resp,
+				       min(sizeof(resp), udata->outlen));
+		if (err) {
+			ibdev_dbg(&dev->ibdev,
+				  "Failed to copy udata for alloc_pd\n");
+			goto err_dealloc_pd;
+		}
+	}
+
+	ibdev_dbg(&dev->ibdev, "Allocated pd[%d]\n", pd->pdn);
+
+	return 0;
+
+err_dealloc_pd:
+	efa_pd_dealloc(dev, result.pdn);
+err_out:
+	atomic64_inc(&dev->stats.sw_stats.alloc_pd_err);
+	return err;
+}
+
+void efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
+{
+	struct efa_dev *dev = to_edev(ibpd->device);
+	struct efa_pd *pd = to_epd(ibpd);
+
+	ibdev_dbg(&dev->ibdev, "Dealloc pd[%d]\n", pd->pdn);
+	efa_pd_dealloc(dev, pd->pdn);
+}
+
+static int efa_destroy_qp_handle(struct efa_dev *dev, u32 qp_handle)
+{
+	struct efa_com_destroy_qp_params params = { .qp_handle = qp_handle };
+
+	return efa_com_destroy_qp(&dev->edev, &params);
+}
+
+int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
+{
+	struct efa_dev *dev = to_edev(ibqp->pd->device);
+	struct efa_qp *qp = to_eqp(ibqp);
+	int err;
+
+	ibdev_dbg(&dev->ibdev, "Destroy qp[%u]\n", ibqp->qp_num);
+	err = efa_destroy_qp_handle(dev, qp->qp_handle);
+	if (err)
+		return err;
+
+	if (qp->rq_cpu_addr) {
+		ibdev_dbg(&dev->ibdev,
+			  "qp->cpu_addr[0x%p] freed: size[%lu], dma[%pad]\n",
+			  qp->rq_cpu_addr, qp->rq_size,
+			  &qp->rq_dma_addr);
+		dma_unmap_single(&dev->pdev->dev, qp->rq_dma_addr, qp->rq_size,
+				 DMA_TO_DEVICE);
+	}
+
+	kfree(qp);
+	return 0;
+}
+
+static int qp_mmap_entries_setup(struct efa_qp *qp,
+				 struct efa_dev *dev,
+				 struct efa_ucontext *ucontext,
+				 struct efa_com_create_qp_params *params,
+				 struct efa_ibv_create_qp_resp *resp)
+{
+	/*
+	 * Once an entry is inserted it might be mmapped, hence cannot be
+	 * cleaned up until dealloc_ucontext.
+	 */
+	resp->sq_db_mmap_key =
+		mmap_entry_insert(dev, ucontext, qp,
+				  dev->db_bar_addr + resp->sq_db_offset,
+				  PAGE_SIZE, EFA_MMAP_IO_NC);
+	if (resp->sq_db_mmap_key == EFA_MMAP_INVALID)
+		return -ENOMEM;
+
+	resp->sq_db_offset &= ~PAGE_MASK;
+
+	resp->llq_desc_mmap_key =
+		mmap_entry_insert(dev, ucontext, qp,
+				  dev->mem_bar_addr + resp->llq_desc_offset,
+				  PAGE_ALIGN(params->sq_ring_size_in_bytes +
+					     (resp->llq_desc_offset & ~PAGE_MASK)),
+				  EFA_MMAP_IO_WC);
+	if (resp->llq_desc_mmap_key == EFA_MMAP_INVALID)
+		return -ENOMEM;
+
+	resp->llq_desc_offset &= ~PAGE_MASK;
+
+	if (qp->rq_size) {
+		resp->rq_db_mmap_key =
+			mmap_entry_insert(dev, ucontext, qp,
+					  dev->db_bar_addr + resp->rq_db_offset,
+					  PAGE_SIZE, EFA_MMAP_IO_NC);
+		if (resp->rq_db_mmap_key == EFA_MMAP_INVALID)
+			return -ENOMEM;
+
+		resp->rq_db_offset &= ~PAGE_MASK;
+
+		resp->rq_mmap_key =
+			mmap_entry_insert(dev, ucontext, qp,
+					  virt_to_phys(qp->rq_cpu_addr),
+					  qp->rq_size, EFA_MMAP_DMA_PAGE);
+		if (resp->rq_mmap_key == EFA_MMAP_INVALID)
+			return -ENOMEM;
+
+		resp->rq_mmap_size = qp->rq_size;
+	}
+
+	return 0;
+}
+
+static int efa_qp_validate_cap(struct efa_dev *dev,
+			       struct ib_qp_init_attr *init_attr)
+{
+	if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) {
+		ibdev_dbg(&dev->ibdev,
+			  "qp: requested send wr[%u] exceeds the max[%u]\n",
+			  init_attr->cap.max_send_wr,
+			  dev->dev_attr.max_sq_depth);
+		return -EINVAL;
+	}
+	if (init_attr->cap.max_recv_wr > dev->dev_attr.max_rq_depth) {
+		ibdev_dbg(&dev->ibdev,
+			  "qp: requested receive wr[%u] exceeds the max[%u]\n",
+			  init_attr->cap.max_recv_wr,
+			  dev->dev_attr.max_rq_depth);
+		return -EINVAL;
+	}
+	if (init_attr->cap.max_send_sge > dev->dev_attr.max_sq_sge) {
+		ibdev_dbg(&dev->ibdev,
+			  "qp: requested sge send[%u] exceeds the max[%u]\n",
+			  init_attr->cap.max_send_sge, dev->dev_attr.max_sq_sge);
+		return -EINVAL;
+	}
+	if (init_attr->cap.max_recv_sge > dev->dev_attr.max_rq_sge) {
+		ibdev_dbg(&dev->ibdev,
+			  "qp: requested sge recv[%u] exceeds the max[%u]\n",
+			  init_attr->cap.max_recv_sge, dev->dev_attr.max_rq_sge);
+		return -EINVAL;
+	}
+	if (init_attr->cap.max_inline_data > dev->dev_attr.inline_buf_size) {
+		ibdev_dbg(&dev->ibdev,
+			  "qp: requested inline data[%u] exceeds the max[%u]\n",
+			  init_attr->cap.max_inline_data,
+			  dev->dev_attr.inline_buf_size);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int efa_qp_validate_attr(struct efa_dev *dev,
+				struct ib_qp_init_attr *init_attr)
+{
+	if (init_attr->qp_type != IB_QPT_DRIVER &&
+	    init_attr->qp_type != IB_QPT_UD) {
+		ibdev_dbg(&dev->ibdev,
+			  "Unsupported qp type %d\n", init_attr->qp_type);
+		return -EOPNOTSUPP;
+	}
+
+	if (init_attr->srq) {
+		ibdev_dbg(&dev->ibdev, "SRQ is not supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (init_attr->create_flags) {
+		ibdev_dbg(&dev->ibdev, "Unsupported create flags\n");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+struct ib_qp *efa_create_qp(struct ib_pd *ibpd,
+			    struct ib_qp_init_attr *init_attr,
+			    struct ib_udata *udata)
+{
+	struct efa_com_create_qp_params create_qp_params = {};
+	struct efa_com_create_qp_result create_qp_resp;
+	struct efa_dev *dev = to_edev(ibpd->device);
+	struct efa_ibv_create_qp_resp resp = {};
+	struct efa_ibv_create_qp cmd = {};
+	bool rq_entry_inserted = false;
+	struct efa_ucontext *ucontext;
+	struct efa_qp *qp;
+	int err;
+
+	ucontext = rdma_udata_to_drv_context(udata, struct efa_ucontext,
+					     ibucontext);
+
+	err = efa_qp_validate_cap(dev, init_attr);
+	if (err)
+		goto err_out;
+
+	err = efa_qp_validate_attr(dev, init_attr);
+	if (err)
+		goto err_out;
+
+	if (!field_avail(cmd, driver_qp_type, udata->inlen)) {
+		ibdev_dbg(&dev->ibdev,
+			  "Incompatible ABI params, no input udata\n");
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	if (udata->inlen > sizeof(cmd) &&
+	    !ib_is_udata_cleared(udata, sizeof(cmd),
+				 udata->inlen - sizeof(cmd))) {
+		ibdev_dbg(&dev->ibdev,
+			  "Incompatible ABI params, unknown fields in udata\n");
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	err = ib_copy_from_udata(&cmd, udata,
+				 min(sizeof(cmd), udata->inlen));
+	if (err) {
+		ibdev_dbg(&dev->ibdev,
+			  "Cannot copy udata for create_qp\n");
+		goto err_out;
+	}
+
+	if (cmd.comp_mask) {
+		ibdev_dbg(&dev->ibdev,
+			  "Incompatible ABI params, unknown fields in udata\n");
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+	if (!qp) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	create_qp_params.uarn = ucontext->uarn;
+	create_qp_params.pd = to_epd(ibpd)->pdn;
+
+	if (init_attr->qp_type == IB_QPT_UD) {
+		create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_UD;
+	} else if (cmd.driver_qp_type == EFA_QP_DRIVER_TYPE_SRD) {
+		create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_SRD;
+	} else {
+		ibdev_dbg(&dev->ibdev,
+			  "Unsupported qp type %d driver qp type %d\n",
+			  init_attr->qp_type, cmd.driver_qp_type);
+		err = -EOPNOTSUPP;
+		goto err_free_qp;
+	}
+
+	ibdev_dbg(&dev->ibdev, "Create QP: qp type %d driver qp type %#x\n",
+		  init_attr->qp_type, cmd.driver_qp_type);
+	create_qp_params.send_cq_idx = to_ecq(init_attr->send_cq)->cq_idx;
+	create_qp_params.recv_cq_idx = to_ecq(init_attr->recv_cq)->cq_idx;
+	create_qp_params.sq_depth = init_attr->cap.max_send_wr;
+	create_qp_params.sq_ring_size_in_bytes = cmd.sq_ring_size;
+
+	create_qp_params.rq_depth = init_attr->cap.max_recv_wr;
+	create_qp_params.rq_ring_size_in_bytes = cmd.rq_ring_size;
+	qp->rq_size = PAGE_ALIGN(create_qp_params.rq_ring_size_in_bytes);
+	if (qp->rq_size) {
+		qp->rq_cpu_addr = efa_zalloc_mapped(dev, &qp->rq_dma_addr,
+						    qp->rq_size, DMA_TO_DEVICE);
+		if (!qp->rq_cpu_addr) {
+			err = -ENOMEM;
+			goto err_free_qp;
+		}
+
+		ibdev_dbg(&dev->ibdev,
+			  "qp->cpu_addr[0x%p] allocated: size[%lu], dma[%pad]\n",
+			  qp->rq_cpu_addr, qp->rq_size, &qp->rq_dma_addr);
+		create_qp_params.rq_base_addr = qp->rq_dma_addr;
+	}
+
+	err = efa_com_create_qp(&dev->edev, &create_qp_params,
+				&create_qp_resp);
+	if (err)
+		goto err_free_mapped;
+
+	resp.sq_db_offset = create_qp_resp.sq_db_offset;
+	resp.rq_db_offset = create_qp_resp.rq_db_offset;
+	resp.llq_desc_offset = create_qp_resp.llq_descriptors_offset;
+	resp.send_sub_cq_idx = create_qp_resp.send_sub_cq_idx;
+	resp.recv_sub_cq_idx = create_qp_resp.recv_sub_cq_idx;
+
+	err = qp_mmap_entries_setup(qp, dev, ucontext, &create_qp_params,
+				    &resp);
+	if (err)
+		goto err_destroy_qp;
+
+	rq_entry_inserted = true;
+	qp->qp_handle = create_qp_resp.qp_handle;
+	qp->ibqp.qp_num = create_qp_resp.qp_num;
+	qp->ibqp.qp_type = init_attr->qp_type;
+	qp->max_send_wr = init_attr->cap.max_send_wr;
+	qp->max_recv_wr = init_attr->cap.max_recv_wr;
+	qp->max_send_sge = init_attr->cap.max_send_sge;
+	qp->max_recv_sge = init_attr->cap.max_recv_sge;
+	qp->max_inline_data = init_attr->cap.max_inline_data;
+
+	if (udata->outlen) {
+		err = ib_copy_to_udata(udata, &resp,
+				       min(sizeof(resp), udata->outlen));
+		if (err) {
+			ibdev_dbg(&dev->ibdev,
+				  "Failed to copy udata for qp[%u]\n",
+				  create_qp_resp.qp_num);
+			goto err_destroy_qp;
+		}
+	}
+
+	ibdev_dbg(&dev->ibdev, "Created qp[%d]\n", qp->ibqp.qp_num);
+
+	return &qp->ibqp;
+
+err_destroy_qp:
+	efa_destroy_qp_handle(dev, create_qp_resp.qp_handle);
+err_free_mapped:
+	if (qp->rq_size) {
+		dma_unmap_single(&dev->pdev->dev, qp->rq_dma_addr, qp->rq_size,
+				 DMA_TO_DEVICE);
+		if (!rq_entry_inserted)
+			free_pages_exact(qp->rq_cpu_addr, qp->rq_size);
+	}
+err_free_qp:
+	kfree(qp);
+err_out:
+	atomic64_inc(&dev->stats.sw_stats.create_qp_err);
+	return ERR_PTR(err);
+}
+
+static int efa_modify_qp_validate(struct efa_dev *dev, struct efa_qp *qp,
+				  struct ib_qp_attr *qp_attr, int qp_attr_mask,
+				  enum ib_qp_state cur_state,
+				  enum ib_qp_state new_state)
+{
+#define EFA_MODIFY_QP_SUPP_MASK \
+	(IB_QP_STATE | IB_QP_CUR_STATE | IB_QP_EN_SQD_ASYNC_NOTIFY | \
+	 IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY | IB_QP_SQ_PSN)
+
+	if (qp_attr_mask & ~EFA_MODIFY_QP_SUPP_MASK) {
+		ibdev_dbg(&dev->ibdev,
+			  "Unsupported qp_attr_mask[%#x] supported[%#x]\n",
+			  qp_attr_mask, EFA_MODIFY_QP_SUPP_MASK);
+		return -EOPNOTSUPP;
+	}
+
+	if (!ib_modify_qp_is_ok(cur_state, new_state, IB_QPT_UD,
+				qp_attr_mask)) {
+		ibdev_dbg(&dev->ibdev, "Invalid modify QP parameters\n");
+		return -EINVAL;
+	}
+
+	if ((qp_attr_mask & IB_QP_PORT) && qp_attr->port_num != 1) {
+		ibdev_dbg(&dev->ibdev, "Can't change port num\n");
+		return -EOPNOTSUPP;
+	}
+
+	if ((qp_attr_mask & IB_QP_PKEY_INDEX) && qp_attr->pkey_index) {
+		ibdev_dbg(&dev->ibdev, "Can't change pkey index\n");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+		  int qp_attr_mask, struct ib_udata *udata)
+{
+	struct efa_dev *dev = to_edev(ibqp->device);
+	struct efa_com_modify_qp_params params = {};
+	struct efa_qp *qp = to_eqp(ibqp);
+	enum ib_qp_state cur_state;
+	enum ib_qp_state new_state;
+	int err;
+
+	if (udata->inlen &&
+	    !ib_is_udata_cleared(udata, 0, udata->inlen)) {
+		ibdev_dbg(&dev->ibdev,
+			  "Incompatible ABI params, udata not cleared\n");
+		return -EINVAL;
+	}
+
+	cur_state = qp_attr_mask & IB_QP_CUR_STATE ? qp_attr->cur_qp_state :
+						     qp->state;
+	new_state = qp_attr_mask & IB_QP_STATE ? qp_attr->qp_state : cur_state;
+
+	err = efa_modify_qp_validate(dev, qp, qp_attr, qp_attr_mask, cur_state,
+				     new_state);
+	if (err)
+		return err;
+
+	params.qp_handle = qp->qp_handle;
+
+	if (qp_attr_mask & IB_QP_STATE) {
+		params.modify_mask |= BIT(EFA_ADMIN_QP_STATE_BIT) |
+				      BIT(EFA_ADMIN_CUR_QP_STATE_BIT);
+		params.cur_qp_state = qp_attr->cur_qp_state;
+		params.qp_state = qp_attr->qp_state;
+	}
+
+	if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) {
+		params.modify_mask |=
+			BIT(EFA_ADMIN_SQ_DRAINED_ASYNC_NOTIFY_BIT);
+		params.sq_drained_async_notify = qp_attr->en_sqd_async_notify;
+	}
+
+	if (qp_attr_mask & IB_QP_QKEY) {
+		params.modify_mask |= BIT(EFA_ADMIN_QKEY_BIT);
+		params.qkey = qp_attr->qkey;
+	}
+
+	if (qp_attr_mask & IB_QP_SQ_PSN) {
+		params.modify_mask |= BIT(EFA_ADMIN_SQ_PSN_BIT);
+		params.sq_psn = qp_attr->sq_psn;
+	}
+
+	err = efa_com_modify_qp(&dev->edev, &params);
+	if (err)
+		return err;
+
+	qp->state = new_state;
+
+	return 0;
+}
+
+static int efa_destroy_cq_idx(struct efa_dev *dev, int cq_idx)
+{
+	struct efa_com_destroy_cq_params params = { .cq_idx = cq_idx };
+
+	return efa_com_destroy_cq(&dev->edev, &params);
+}
+
+void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+{
+	struct efa_dev *dev = to_edev(ibcq->device);
+	struct efa_cq *cq = to_ecq(ibcq);
+
+	ibdev_dbg(&dev->ibdev,
+		  "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n",
+		  cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr);
+
+	efa_destroy_cq_idx(dev, cq->cq_idx);
+	dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size,
+			 DMA_FROM_DEVICE);
+}
+
+static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq,
+				 struct efa_ibv_create_cq_resp *resp)
+{
+	resp->q_mmap_size = cq->size;
+	resp->q_mmap_key = mmap_entry_insert(dev, cq->ucontext, cq,
+					     virt_to_phys(cq->cpu_addr),
+					     cq->size, EFA_MMAP_DMA_PAGE);
+	if (resp->q_mmap_key == EFA_MMAP_INVALID)
+		return -ENOMEM;
+
+	return 0;
+}
+
+int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		  struct ib_udata *udata)
+{
+	struct efa_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct efa_ucontext, ibucontext);
+	struct efa_ibv_create_cq_resp resp = {};
+	struct efa_com_create_cq_params params;
+	struct efa_com_create_cq_result result;
+	struct ib_device *ibdev = ibcq->device;
+	struct efa_dev *dev = to_edev(ibdev);
+	struct efa_ibv_create_cq cmd = {};
+	struct efa_cq *cq = to_ecq(ibcq);
+	bool cq_entry_inserted = false;
+	int entries = attr->cqe;
+	int err;
+
+	ibdev_dbg(ibdev, "create_cq entries %d\n", entries);
+
+	if (entries < 1 || entries > dev->dev_attr.max_cq_depth) {
+		ibdev_dbg(ibdev,
+			  "cq: requested entries[%u] non-positive or greater than max[%u]\n",
+			  entries, dev->dev_attr.max_cq_depth);
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	if (!field_avail(cmd, num_sub_cqs, udata->inlen)) {
+		ibdev_dbg(ibdev,
+			  "Incompatible ABI params, no input udata\n");
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	if (udata->inlen > sizeof(cmd) &&
+	    !ib_is_udata_cleared(udata, sizeof(cmd),
+				 udata->inlen - sizeof(cmd))) {
+		ibdev_dbg(ibdev,
+			  "Incompatible ABI params, unknown fields in udata\n");
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	err = ib_copy_from_udata(&cmd, udata,
+				 min(sizeof(cmd), udata->inlen));
+	if (err) {
+		ibdev_dbg(ibdev, "Cannot copy udata for create_cq\n");
+		goto err_out;
+	}
+
+	if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_50)) {
+		ibdev_dbg(ibdev,
+			  "Incompatible ABI params, unknown fields in udata\n");
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	if (!cmd.cq_entry_size) {
+		ibdev_dbg(ibdev,
+			  "Invalid entry size [%u]\n", cmd.cq_entry_size);
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	if (cmd.num_sub_cqs != dev->dev_attr.sub_cqs_per_cq) {
+		ibdev_dbg(ibdev,
+			  "Invalid number of sub cqs[%u] expected[%u]\n",
+			  cmd.num_sub_cqs, dev->dev_attr.sub_cqs_per_cq);
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	cq->ucontext = ucontext;
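+	/*
+	 * CQ buffer size, page aligned.  Illustrative numbers only: a 32-byte
+	 * entry, 256 entries and 2 sub-CQs give PAGE_ALIGN(32 * 256 * 2) =
+	 * 16 KiB with 4 KiB pages.
+	 */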
+	cq->size = PAGE_ALIGN(cmd.cq_entry_size * entries * cmd.num_sub_cqs);
+	cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size,
+					 DMA_FROM_DEVICE);
+	if (!cq->cpu_addr) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	params.uarn = cq->ucontext->uarn;
+	params.cq_depth = entries;
+	params.dma_addr = cq->dma_addr;
+	params.entry_size_in_bytes = cmd.cq_entry_size;
+	params.num_sub_cqs = cmd.num_sub_cqs;
+	err = efa_com_create_cq(&dev->edev, &params, &result);
+	if (err)
+		goto err_free_mapped;
+
+	resp.cq_idx = result.cq_idx;
+	cq->cq_idx = result.cq_idx;
+	cq->ibcq.cqe = result.actual_depth;
+	WARN_ON_ONCE(entries != result.actual_depth);
+
+	err = cq_mmap_entries_setup(dev, cq, &resp);
+	if (err) {
+		ibdev_dbg(ibdev, "Could not setup cq[%u] mmap entries\n",
+			  cq->cq_idx);
+		goto err_destroy_cq;
+	}
+
+	cq_entry_inserted = true;
+
+	if (udata->outlen) {
+		err = ib_copy_to_udata(udata, &resp,
+				       min(sizeof(resp), udata->outlen));
+		if (err) {
+			ibdev_dbg(ibdev,
+				  "Failed to copy udata for create_cq\n");
+			goto err_destroy_cq;
+		}
+	}
+
+	ibdev_dbg(ibdev, "Created cq[%d], cq depth[%u]. dma[%pad] virt[0x%p]\n",
+		  cq->cq_idx, result.actual_depth, &cq->dma_addr, cq->cpu_addr);
+
+	return 0;
+
+err_destroy_cq:
+	efa_destroy_cq_idx(dev, cq->cq_idx);
+err_free_mapped:
+	dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size,
+			 DMA_FROM_DEVICE);
+	if (!cq_entry_inserted)
+		free_pages_exact(cq->cpu_addr, cq->size);
+err_out:
+	atomic64_inc(&dev->stats.sw_stats.create_cq_err);
+	return err;
+}
+
+static int umem_to_page_list(struct efa_dev *dev,
+			     struct ib_umem *umem,
+			     u64 *page_list,
+			     u32 hp_cnt,
+			     u8 hp_shift)
+{
+	u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT);
+	struct ib_block_iter biter;
+	unsigned int hp_idx = 0;
+
+	ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n",
+		  hp_cnt, pages_in_hp);
+
+	rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap,
+			    BIT(hp_shift))
+		page_list[hp_idx++] = rdma_block_iter_dma_address(&biter);
+
+	return 0;
+}
+
+static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt)
+{
+	struct scatterlist *sglist;
+	struct page *pg;
+	int i;
+
+	sglist = kcalloc(page_cnt, sizeof(*sglist), GFP_KERNEL);
+	if (!sglist)
+		return NULL;
+	sg_init_table(sglist, page_cnt);
+	for (i = 0; i < page_cnt; i++) {
+		pg = vmalloc_to_page(buf);
+		if (!pg)
+			goto err;
+		sg_set_page(&sglist[i], pg, PAGE_SIZE, 0);
+		buf += PAGE_SIZE / sizeof(*buf);
+	}
+	return sglist;
+
+err:
+	kfree(sglist);
+	return NULL;
+}
+
+/*
+ * create a chunk list of the physical pages' DMA addresses from the supplied
+ * scatter-gather list
+ */
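+/*
+ * Chunk layout: each EFA_CHUNK_SIZE (4 KiB) chunk carries EFA_PTRS_PER_CHUNK
+ * page DMA addresses followed by one struct efa_com_ctrl_buff_info holding
+ * the DMA address and length of the next chunk, forming a singly linked list
+ * of chunks.
+ */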
+static int pbl_chunk_list_create(struct efa_dev *dev, struct pbl_context *pbl)
+{
+	struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list;
+	int page_cnt = pbl->phys.indirect.pbl_buf_size_in_pages;
+	struct scatterlist *pages_sgl = pbl->phys.indirect.sgl;
+	unsigned int chunk_list_size, chunk_idx, payload_idx;
+	int sg_dma_cnt = pbl->phys.indirect.sg_dma_cnt;
+	struct efa_com_ctrl_buff_info *ctrl_buf;
+	u64 *cur_chunk_buf, *prev_chunk_buf;
+	struct ib_block_iter biter;
+	dma_addr_t dma_addr;
+	int i;
+
+	/* allocate a chunk list that consists of 4KB chunks */
+	chunk_list_size = DIV_ROUND_UP(page_cnt, EFA_PTRS_PER_CHUNK);
+
+	chunk_list->size = chunk_list_size;
+	chunk_list->chunks = kcalloc(chunk_list_size,
+				     sizeof(*chunk_list->chunks),
+				     GFP_KERNEL);
+	if (!chunk_list->chunks)
+		return -ENOMEM;
+
+	ibdev_dbg(&dev->ibdev,
+		  "chunk_list_size[%u] - pages[%u]\n", chunk_list_size,
+		  page_cnt);
+
+	/* allocate chunk buffers: */
+	for (i = 0; i < chunk_list_size; i++) {
+		chunk_list->chunks[i].buf = kzalloc(EFA_CHUNK_SIZE, GFP_KERNEL);
+		if (!chunk_list->chunks[i].buf)
+			goto chunk_list_dealloc;
+
+		chunk_list->chunks[i].length = EFA_CHUNK_USED_SIZE;
+	}
+	chunk_list->chunks[chunk_list_size - 1].length =
+		((page_cnt % EFA_PTRS_PER_CHUNK) * EFA_CHUNK_PAYLOAD_PTR_SIZE) +
+			EFA_CHUNK_PTR_SIZE;
+
+/* copy the DMA addresses of the sg-list pages into the chunks: */
+	chunk_idx = 0;
+	payload_idx = 0;
+	cur_chunk_buf = chunk_list->chunks[0].buf;
+	rdma_for_each_block(pages_sgl, &biter, sg_dma_cnt,
+			    EFA_CHUNK_PAYLOAD_SIZE) {
+		cur_chunk_buf[payload_idx++] =
+			rdma_block_iter_dma_address(&biter);
+
+		if (payload_idx == EFA_PTRS_PER_CHUNK) {
+			chunk_idx++;
+			cur_chunk_buf = chunk_list->chunks[chunk_idx].buf;
+			payload_idx = 0;
+		}
+	}
+
+/* map the chunks for DMA and fill in each chunk's next pointer */
+	for (i = chunk_list_size - 1; i >= 0; i--) {
+		dma_addr = dma_map_single(&dev->pdev->dev,
+					  chunk_list->chunks[i].buf,
+					  chunk_list->chunks[i].length,
+					  DMA_TO_DEVICE);
+		if (dma_mapping_error(&dev->pdev->dev, dma_addr)) {
+			ibdev_err(&dev->ibdev,
+				  "chunk[%u] dma_map_failed\n", i);
+			goto chunk_list_unmap;
+		}
+
+		chunk_list->chunks[i].dma_addr = dma_addr;
+		ibdev_dbg(&dev->ibdev,
+			  "chunk[%u] mapped at [%pad]\n", i, &dma_addr);
+
+		if (!i)
+			break;
+
+		prev_chunk_buf = chunk_list->chunks[i - 1].buf;
+
+		ctrl_buf = (struct efa_com_ctrl_buff_info *)
+				&prev_chunk_buf[EFA_PTRS_PER_CHUNK];
+		ctrl_buf->length = chunk_list->chunks[i].length;
+
+		efa_com_set_dma_addr(dma_addr,
+				     &ctrl_buf->address.mem_addr_high,
+				     &ctrl_buf->address.mem_addr_low);
+	}
+
+	return 0;
+
+chunk_list_unmap:
+	/* chunk i itself was never mapped, start unmapping at i + 1 */
+	for (i++; i < chunk_list_size; i++) {
+		dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr,
+				 chunk_list->chunks[i].length, DMA_TO_DEVICE);
+	}
+chunk_list_dealloc:
+	for (i = 0; i < chunk_list_size; i++)
+		kfree(chunk_list->chunks[i].buf);
+
+	kfree(chunk_list->chunks);
+	return -ENOMEM;
+}
+
+static void pbl_chunk_list_destroy(struct efa_dev *dev, struct pbl_context *pbl)
+{
+	struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list;
+	int i;
+
+	for (i = 0; i < chunk_list->size; i++) {
+		dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr,
+				 chunk_list->chunks[i].length, DMA_TO_DEVICE);
+		kfree(chunk_list->chunks[i].buf);
+	}
+
+	kfree(chunk_list->chunks);
+}
+
+/* initialize pbl continuous mode: map pbl buffer to a dma address. */
+static int pbl_continuous_initialize(struct efa_dev *dev,
+				     struct pbl_context *pbl)
+{
+	dma_addr_t dma_addr;
+
+	dma_addr = dma_map_single(&dev->pdev->dev, pbl->pbl_buf,
+				  pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE);
+	if (dma_mapping_error(&dev->pdev->dev, dma_addr)) {
+		ibdev_err(&dev->ibdev, "Unable to map pbl to DMA address\n");
+		return -ENOMEM;
+	}
+
+	pbl->phys.continuous.dma_addr = dma_addr;
+	ibdev_dbg(&dev->ibdev,
+		  "pbl continuous - dma_addr = %pad, size[%u]\n",
+		  &dma_addr, pbl->pbl_buf_size_in_bytes);
+
+	return 0;
+}
+
+/*
+ * initialize pbl indirect mode:
+ * create a chunk list out of the dma addresses of the physical pages of
+ * pbl buffer.
+ */
+static int pbl_indirect_initialize(struct efa_dev *dev, struct pbl_context *pbl)
+{
+	u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, PAGE_SIZE);
+	struct scatterlist *sgl;
+	int sg_dma_cnt, err;
+
+	BUILD_BUG_ON(EFA_CHUNK_PAYLOAD_SIZE > PAGE_SIZE);
+	sgl = efa_vmalloc_buf_to_sg(pbl->pbl_buf, size_in_pages);
+	if (!sgl)
+		return -ENOMEM;
+
+	sg_dma_cnt = dma_map_sg(&dev->pdev->dev, sgl, size_in_pages, DMA_TO_DEVICE);
+	if (!sg_dma_cnt) {
+		err = -EINVAL;
+		goto err_map;
+	}
+
+	pbl->phys.indirect.pbl_buf_size_in_pages = size_in_pages;
+	pbl->phys.indirect.sgl = sgl;
+	pbl->phys.indirect.sg_dma_cnt = sg_dma_cnt;
+	err = pbl_chunk_list_create(dev, pbl);
+	if (err) {
+		ibdev_dbg(&dev->ibdev,
+			  "chunk_list creation failed[%d]\n", err);
+		goto err_chunk;
+	}
+
+	ibdev_dbg(&dev->ibdev,
+		  "pbl indirect - size[%u], chunks[%u]\n",
+		  pbl->pbl_buf_size_in_bytes,
+		  pbl->phys.indirect.chunk_list.size);
+
+	return 0;
+
+err_chunk:
+	dma_unmap_sg(&dev->pdev->dev, sgl, size_in_pages, DMA_TO_DEVICE);
+err_map:
+	kfree(sgl);
+	return err;
+}
+
+static void pbl_indirect_terminate(struct efa_dev *dev, struct pbl_context *pbl)
+{
+	pbl_chunk_list_destroy(dev, pbl);
+	dma_unmap_sg(&dev->pdev->dev, pbl->phys.indirect.sgl,
+		     pbl->phys.indirect.pbl_buf_size_in_pages, DMA_TO_DEVICE);
+	kfree(pbl->phys.indirect.sgl);
+}
+
+/* create a page buffer list from a mapped user memory region */
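+/*
+ * The PBL is built in one of two modes: if kvzalloc() returned physically
+ * contiguous (kmalloc) memory it is handed to the device as a single DMA
+ * mapping, otherwise (vmalloc) an indirect chunk list is built from the
+ * buffer's pages.
+ */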
+static int pbl_create(struct efa_dev *dev,
+		      struct pbl_context *pbl,
+		      struct ib_umem *umem,
+		      int hp_cnt,
+		      u8 hp_shift)
+{
+	int err;
+
+	pbl->pbl_buf_size_in_bytes = hp_cnt * EFA_CHUNK_PAYLOAD_PTR_SIZE;
+	pbl->pbl_buf = kvzalloc(pbl->pbl_buf_size_in_bytes, GFP_KERNEL);
+	if (!pbl->pbl_buf)
+		return -ENOMEM;
+
+	if (is_vmalloc_addr(pbl->pbl_buf)) {
+		pbl->physically_continuous = 0;
+		err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt,
+					hp_shift);
+		if (err)
+			goto err_free;
+
+		err = pbl_indirect_initialize(dev, pbl);
+		if (err)
+			goto err_free;
+	} else {
+		pbl->physically_continuous = 1;
+		err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt,
+					hp_shift);
+		if (err)
+			goto err_free;
+
+		err = pbl_continuous_initialize(dev, pbl);
+		if (err)
+			goto err_free;
+	}
+
+	ibdev_dbg(&dev->ibdev,
+		  "user_pbl_created: user_pages[%u], continuous[%u]\n",
+		  hp_cnt, pbl->physically_continuous);
+
+	return 0;
+
+err_free:
+	kvfree(pbl->pbl_buf);
+	return err;
+}
+
+static void pbl_destroy(struct efa_dev *dev, struct pbl_context *pbl)
+{
+	if (pbl->physically_continuous)
+		dma_unmap_single(&dev->pdev->dev, pbl->phys.continuous.dma_addr,
+				 pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE);
+	else
+		pbl_indirect_terminate(dev, pbl);
+
+	kvfree(pbl->pbl_buf);
+}
+
+static int efa_create_inline_pbl(struct efa_dev *dev, struct efa_mr *mr,
+				 struct efa_com_reg_mr_params *params)
+{
+	int err;
+
+	params->inline_pbl = 1;
+	err = umem_to_page_list(dev, mr->umem, params->pbl.inline_pbl_array,
+				params->page_num, params->page_shift);
+	if (err)
+		return err;
+
+	ibdev_dbg(&dev->ibdev,
+		  "inline_pbl_array - pages[%u]\n", params->page_num);
+
+	return 0;
+}
+
+static int efa_create_pbl(struct efa_dev *dev,
+			  struct pbl_context *pbl,
+			  struct efa_mr *mr,
+			  struct efa_com_reg_mr_params *params)
+{
+	int err;
+
+	err = pbl_create(dev, pbl, mr->umem, params->page_num,
+			 params->page_shift);
+	if (err) {
+		ibdev_dbg(&dev->ibdev, "Failed to create pbl[%d]\n", err);
+		return err;
+	}
+
+	params->inline_pbl = 0;
+	params->indirect = !pbl->physically_continuous;
+	if (pbl->physically_continuous) {
+		params->pbl.pbl.length = pbl->pbl_buf_size_in_bytes;
+
+		efa_com_set_dma_addr(pbl->phys.continuous.dma_addr,
+				     &params->pbl.pbl.address.mem_addr_high,
+				     &params->pbl.pbl.address.mem_addr_low);
+	} else {
+		params->pbl.pbl.length =
+			pbl->phys.indirect.chunk_list.chunks[0].length;
+
+		efa_com_set_dma_addr(pbl->phys.indirect.chunk_list.chunks[0].dma_addr,
+				     &params->pbl.pbl.address.mem_addr_high,
+				     &params->pbl.pbl.address.mem_addr_low);
+	}
+
+	return 0;
+}
+
+struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
+			 u64 virt_addr, int access_flags,
+			 struct ib_udata *udata)
+{
+	struct efa_dev *dev = to_edev(ibpd->device);
+	struct efa_com_reg_mr_params params = {};
+	struct efa_com_reg_mr_result result = {};
+	struct pbl_context pbl;
+	unsigned int pg_sz;
+	struct efa_mr *mr;
+	int inline_size;
+	int err;
+
+	if (udata->inlen &&
+	    !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) {
+		ibdev_dbg(&dev->ibdev,
+			  "Incompatible ABI params, udata not cleared\n");
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	if (access_flags & ~EFA_SUPPORTED_ACCESS_FLAGS) {
+		ibdev_dbg(&dev->ibdev,
+			  "Unsupported access flags[%#x], supported[%#x]\n",
+			  access_flags, EFA_SUPPORTED_ACCESS_FLAGS);
+		err = -EOPNOTSUPP;
+		goto err_out;
+	}
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	mr->umem = ib_umem_get(udata, start, length, access_flags, 0);
+	if (IS_ERR(mr->umem)) {
+		err = PTR_ERR(mr->umem);
+		ibdev_dbg(&dev->ibdev,
+			  "Failed to pin and map user space memory[%d]\n", err);
+		goto err_free;
+	}
+
+	params.pd = to_epd(ibpd)->pdn;
+	params.iova = virt_addr;
+	params.mr_length_in_bytes = length;
+	params.permissions = access_flags & 0x1;
+
+	pg_sz = ib_umem_find_best_pgsz(mr->umem,
+				       dev->dev_attr.page_size_cap,
+				       virt_addr);
+	if (!pg_sz) {
+		err = -EOPNOTSUPP;
+		ibdev_dbg(&dev->ibdev, "Failed to find a suitable page size in page_size_cap %#llx\n",
+			  dev->dev_attr.page_size_cap);
+		goto err_unmap;
+	}
+
+	params.page_shift = __ffs(pg_sz);
+	params.page_num = DIV_ROUND_UP(length + (start & (pg_sz - 1)),
+				       pg_sz);
+
+	ibdev_dbg(&dev->ibdev,
+		  "start %#llx length %#llx params.page_shift %u params.page_num %u\n",
+		  start, length, params.page_shift, params.page_num);
+
+	inline_size = ARRAY_SIZE(params.pbl.inline_pbl_array);
+	if (params.page_num <= inline_size) {
+		err = efa_create_inline_pbl(dev, mr, &params);
+		if (err)
+			goto err_unmap;
+
+		err = efa_com_register_mr(&dev->edev, &params, &result);
+		if (err)
+			goto err_unmap;
+	} else {
+		err = efa_create_pbl(dev, &pbl, mr, &params);
+		if (err)
+			goto err_unmap;
+
+		err = efa_com_register_mr(&dev->edev, &params, &result);
+		pbl_destroy(dev, &pbl);
+
+		if (err)
+			goto err_unmap;
+	}
+
+	mr->ibmr.lkey = result.l_key;
+	mr->ibmr.rkey = result.r_key;
+	mr->ibmr.length = length;
+	ibdev_dbg(&dev->ibdev, "Registered mr[%d]\n", mr->ibmr.lkey);
+
+	return &mr->ibmr;
+
+err_unmap:
+	ib_umem_release(mr->umem);
+err_free:
+	kfree(mr);
+err_out:
+	atomic64_inc(&dev->stats.sw_stats.reg_mr_err);
+	return ERR_PTR(err);
+}
+
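For context, this verb is what rdma-core's ibv_reg_mr() ultimately reaches on an EFA device. A minimal userspace sketch, assuming an already-opened device and allocated PD (the helper name, buffer size and alignment are arbitrary examples):

	#include <infiniband/verbs.h>
	#include <stdlib.h>

	/* Illustrative only: register a 4 MiB buffer with the local-write
	 * access flag, which is accepted by the EFA_SUPPORTED_ACCESS_FLAGS
	 * check above. */
	static struct ibv_mr *example_reg_mr(struct ibv_pd *pd)
	{
		size_t len = 4 * 1024 * 1024;
		void *buf = aligned_alloc(4096, len);
		struct ibv_mr *mr;

		if (!buf)
			return NULL;

		mr = ibv_reg_mr(pd, buf, len, IBV_ACCESS_LOCAL_WRITE);
		if (!mr)
			free(buf);
		return mr;
	}

On success userspace gets back the lkey/rkey filled in from efa_com_register_mr() above.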
+int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+{
+	struct efa_dev *dev = to_edev(ibmr->device);
+	struct efa_com_dereg_mr_params params;
+	struct efa_mr *mr = to_emr(ibmr);
+	int err;
+
+	ibdev_dbg(&dev->ibdev, "Deregister mr[%d]\n", ibmr->lkey);
+
+	params.l_key = mr->ibmr.lkey;
+	err = efa_com_dereg_mr(&dev->edev, &params);
+	if (err)
+		return err;
+
+	ib_umem_release(mr->umem);
+	kfree(mr);
+
+	return 0;
+}
+
+int efa_get_port_immutable(struct ib_device *ibdev, u8 port_num,
+			   struct ib_port_immutable *immutable)
+{
+	struct ib_port_attr attr;
+	int err;
+
+	err = ib_query_port(ibdev, port_num, &attr);
+	if (err) {
+		ibdev_dbg(ibdev, "Couldn't query port err[%d]\n", err);
+		return err;
+	}
+
+	immutable->pkey_tbl_len = attr.pkey_tbl_len;
+	immutable->gid_tbl_len = attr.gid_tbl_len;
+
+	return 0;
+}
+
+static int efa_dealloc_uar(struct efa_dev *dev, u16 uarn)
+{
+	struct efa_com_dealloc_uar_params params = {
+		.uarn = uarn,
+	};
+
+	return efa_com_dealloc_uar(&dev->edev, &params);
+}
+
+int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata)
+{
+	struct efa_ucontext *ucontext = to_eucontext(ibucontext);
+	struct efa_dev *dev = to_edev(ibucontext->device);
+	struct efa_ibv_alloc_ucontext_resp resp = {};
+	struct efa_com_alloc_uar_result result;
+	int err;
+
+	/*
+	 * It's fine if the driver does not know all request fields;
+	 * we will ack the input fields in our response.
+	 */
+
+	err = efa_com_alloc_uar(&dev->edev, &result);
+	if (err)
+		goto err_out;
+
+	ucontext->uarn = result.uarn;
+	xa_init(&ucontext->mmap_xa);
+
+	resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE;
+	resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_CREATE_AH;
+	resp.sub_cqs_per_cq = dev->dev_attr.sub_cqs_per_cq;
+	resp.inline_buf_size = dev->dev_attr.inline_buf_size;
+	resp.max_llq_size = dev->dev_attr.max_llq_size;
+
+	if (udata && udata->outlen) {
+		err = ib_copy_to_udata(udata, &resp,
+				       min(sizeof(resp), udata->outlen));
+		if (err)
+			goto err_dealloc_uar;
+	}
+
+	return 0;
+
+err_dealloc_uar:
+	efa_dealloc_uar(dev, result.uarn);
+err_out:
+	atomic64_inc(&dev->stats.sw_stats.alloc_ucontext_err);
+	return err;
+}
+
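The response copy above uses min(sizeof(resp), udata->outlen) so that an older userspace provider with a smaller response struct still gets a successful, truncated answer. A sketch of the same pattern as a standalone helper (the helper name is hypothetical):

	/* Hypothetical helper: copy only as much of the response as userspace
	 * left room for, keeping newer kernels compatible with older ABIs. */
	static int copy_resp_compat(struct ib_udata *udata, void *resp,
				    size_t resp_len)
	{
		if (!udata || !udata->outlen)
			return 0;

		return ib_copy_to_udata(udata, resp, min(resp_len, udata->outlen));
	}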
+void efa_dealloc_ucontext(struct ib_ucontext *ibucontext)
+{
+	struct efa_ucontext *ucontext = to_eucontext(ibucontext);
+	struct efa_dev *dev = to_edev(ibucontext->device);
+
+	mmap_entries_remove_free(dev, ucontext);
+	efa_dealloc_uar(dev, ucontext->uarn);
+}
+
+static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
+		      struct vm_area_struct *vma, u64 key, u64 length)
+{
+	struct efa_mmap_entry *entry;
+	unsigned long va;
+	u64 pfn;
+	int err;
+
+	entry = mmap_entry_get(dev, ucontext, key, length);
+	if (!entry) {
+		ibdev_dbg(&dev->ibdev, "key[%#llx] does not have valid entry\n",
+			  key);
+		return -EINVAL;
+	}
+
+	ibdev_dbg(&dev->ibdev,
+		  "Mapping address[%#llx], length[%#llx], mmap_flag[%d]\n",
+		  entry->address, length, entry->mmap_flag);
+
+	pfn = entry->address >> PAGE_SHIFT;
+	switch (entry->mmap_flag) {
+	case EFA_MMAP_IO_NC:
+		err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, length,
+					pgprot_noncached(vma->vm_page_prot));
+		break;
+	case EFA_MMAP_IO_WC:
+		err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, length,
+					pgprot_writecombine(vma->vm_page_prot));
+		break;
+	case EFA_MMAP_DMA_PAGE:
+		for (va = vma->vm_start; va < vma->vm_end;
+		     va += PAGE_SIZE, pfn++) {
+			err = vm_insert_page(vma, va, pfn_to_page(pfn));
+			if (err)
+				break;
+		}
+		break;
+	default:
+		err = -EINVAL;
+	}
+
+	if (err) {
+		ibdev_dbg(
+			&dev->ibdev,
+			"Couldn't mmap address[%#llx] length[%#llx] mmap_flag[%d] err[%d]\n",
+			entry->address, length, entry->mmap_flag, err);
+		return err;
+	}
+
+	return 0;
+}
+
+int efa_mmap(struct ib_ucontext *ibucontext,
+	     struct vm_area_struct *vma)
+{
+	struct efa_ucontext *ucontext = to_eucontext(ibucontext);
+	struct efa_dev *dev = to_edev(ibucontext->device);
+	u64 length = vma->vm_end - vma->vm_start;
+	u64 key = vma->vm_pgoff << PAGE_SHIFT;
+
+	ibdev_dbg(&dev->ibdev,
+		  "start %#lx, end %#lx, length = %#llx, key = %#llx\n",
+		  vma->vm_start, vma->vm_end, length, key);
+
+	if (length % PAGE_SIZE != 0 || !(vma->vm_flags & VM_SHARED)) {
+		ibdev_dbg(&dev->ibdev,
+			  "length[%#llx] is not page size aligned[%#lx] or VM_SHARED is not set [%#lx]\n",
+			  length, PAGE_SIZE, vma->vm_flags);
+		return -EINVAL;
+	}
+
+	if (vma->vm_flags & VM_EXEC) {
+		ibdev_dbg(&dev->ibdev, "Mapping executable pages is not permitted\n");
+		return -EPERM;
+	}
+
+	return __efa_mmap(dev, ucontext, vma, key, length);
+}
+
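The key recovered from vma->vm_pgoff above is a value the driver returned in an earlier command response; userspace simply replays it as the mmap offset on the verbs command fd. Roughly (an illustrative sketch, not the provider's actual code):

	#include <stdint.h>
	#include <sys/mman.h>

	/* Illustrative only: map a region the kernel described by (key, length)
	 * in an earlier response; cmd_fd is the uverbs command file descriptor. */
	static void *example_mmap(int cmd_fd, uint64_t key, size_t length)
	{
		void *p = mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED,
			       cmd_fd, (off_t)key);

		return p == MAP_FAILED ? NULL : p;
	}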
+static int efa_ah_destroy(struct efa_dev *dev, struct efa_ah *ah)
+{
+	struct efa_com_destroy_ah_params params = {
+		.ah = ah->ah,
+		.pdn = to_epd(ah->ibah.pd)->pdn,
+	};
+
+	return efa_com_destroy_ah(&dev->edev, &params);
+}
+
+int efa_create_ah(struct ib_ah *ibah,
+		  struct rdma_ah_attr *ah_attr,
+		  u32 flags,
+		  struct ib_udata *udata)
+{
+	struct efa_dev *dev = to_edev(ibah->device);
+	struct efa_com_create_ah_params params = {};
+	struct efa_ibv_create_ah_resp resp = {};
+	struct efa_com_create_ah_result result;
+	struct efa_ah *ah = to_eah(ibah);
+	int err;
+
+	if (!(flags & RDMA_CREATE_AH_SLEEPABLE)) {
+		ibdev_dbg(&dev->ibdev,
+			  "Create address handle is not supported in atomic context\n");
+		err = -EOPNOTSUPP;
+		goto err_out;
+	}
+
+	if (udata->inlen &&
+	    !ib_is_udata_cleared(udata, 0, udata->inlen)) {
+		ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n");
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	memcpy(params.dest_addr, ah_attr->grh.dgid.raw,
+	       sizeof(params.dest_addr));
+	params.pdn = to_epd(ibah->pd)->pdn;
+	err = efa_com_create_ah(&dev->edev, &params, &result);
+	if (err)
+		goto err_out;
+
+	memcpy(ah->id, ah_attr->grh.dgid.raw, sizeof(ah->id));
+	ah->ah = result.ah;
+
+	resp.efa_address_handle = result.ah;
+
+	if (udata->outlen) {
+		err = ib_copy_to_udata(udata, &resp,
+				       min(sizeof(resp), udata->outlen));
+		if (err) {
+			ibdev_dbg(&dev->ibdev,
+				  "Failed to copy udata for create_ah response\n");
+			goto err_destroy_ah;
+		}
+	}
+	ibdev_dbg(&dev->ibdev, "Created ah[%d]\n", ah->ah);
+
+	return 0;
+
+err_destroy_ah:
+	efa_ah_destroy(dev, ah);
+err_out:
+	atomic64_inc(&dev->stats.sw_stats.create_ah_err);
+	return err;
+}
+
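The only destination input this path consumes is the GRH destination GID copied into params.dest_addr, so a userspace caller mainly needs the peer's GID. A hedged sketch using rdma-core (port number, GID source and helper name are placeholders):

	#include <infiniband/verbs.h>
	#include <string.h>

	/* Illustrative only: create an address handle toward a peer GID. */
	static struct ibv_ah *example_create_ah(struct ibv_pd *pd,
						const union ibv_gid *peer_gid)
	{
		struct ibv_ah_attr attr = {
			.is_global = 1,
			.port_num = 1,
		};

		memcpy(&attr.grh.dgid, peer_gid, sizeof(attr.grh.dgid));
		return ibv_create_ah(pd, &attr);
	}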
+void efa_destroy_ah(struct ib_ah *ibah, u32 flags)
+{
+	struct efa_dev *dev = to_edev(ibah->pd->device);
+	struct efa_ah *ah = to_eah(ibah);
+
+	ibdev_dbg(&dev->ibdev, "Destroy ah[%d]\n", ah->ah);
+
+	if (!(flags & RDMA_DESTROY_AH_SLEEPABLE)) {
+		ibdev_dbg(&dev->ibdev,
+			  "Destroy address handle is not supported in atomic context\n");
+		return;
+	}
+
+	efa_ah_destroy(dev, ah);
+}
+
+struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, u8 port_num)
+{
+	return rdma_alloc_hw_stats_struct(efa_stats_names,
+					  ARRAY_SIZE(efa_stats_names),
+					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
+		     u8 port_num, int index)
+{
+	struct efa_com_get_stats_params params = {};
+	union efa_com_get_stats_result result;
+	struct efa_dev *dev = to_edev(ibdev);
+	struct efa_com_basic_stats *bs;
+	struct efa_com_stats_admin *as;
+	struct efa_stats *s;
+	int err;
+
+	params.type = EFA_ADMIN_GET_STATS_TYPE_BASIC;
+	params.scope = EFA_ADMIN_GET_STATS_SCOPE_ALL;
+
+	err = efa_com_get_stats(&dev->edev, &params, &result);
+	if (err)
+		return err;
+
+	bs = &result.basic_stats;
+	stats->value[EFA_TX_BYTES] = bs->tx_bytes;
+	stats->value[EFA_TX_PKTS] = bs->tx_pkts;
+	stats->value[EFA_RX_BYTES] = bs->rx_bytes;
+	stats->value[EFA_RX_PKTS] = bs->rx_pkts;
+	stats->value[EFA_RX_DROPS] = bs->rx_drops;
+
+	as = &dev->edev.aq.stats;
+	stats->value[EFA_SUBMITTED_CMDS] = atomic64_read(&as->submitted_cmd);
+	stats->value[EFA_COMPLETED_CMDS] = atomic64_read(&as->completed_cmd);
+	stats->value[EFA_NO_COMPLETION_CMDS] = atomic64_read(&as->no_completion);
+
+	s = &dev->stats;
+	stats->value[EFA_KEEP_ALIVE_RCVD] = atomic64_read(&s->keep_alive_rcvd);
+	stats->value[EFA_ALLOC_PD_ERR] = atomic64_read(&s->sw_stats.alloc_pd_err);
+	stats->value[EFA_CREATE_QP_ERR] = atomic64_read(&s->sw_stats.create_qp_err);
+	stats->value[EFA_REG_MR_ERR] = atomic64_read(&s->sw_stats.reg_mr_err);
+	stats->value[EFA_ALLOC_UCONTEXT_ERR] = atomic64_read(&s->sw_stats.alloc_ucontext_err);
+	stats->value[EFA_CREATE_AH_ERR] = atomic64_read(&s->sw_stats.create_ah_err);
+
+	return ARRAY_SIZE(efa_stats_names);
+}
+
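These values surface through the standard rdma per-port hw_counters sysfs directory, so no EFA-specific tooling is needed to read them. A small reader sketch (device name, port and counter name are examples, not guaranteed spellings):

	#include <stdio.h>

	/* Illustrative only: read one counter such as "tx_bytes" from
	 * /sys/class/infiniband/<dev>/ports/<port>/hw_counters/. */
	static long long read_hw_counter(const char *dev, int port,
					 const char *name)
	{
		char path[256];
		long long val = -1;
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/class/infiniband/%s/ports/%d/hw_counters/%s",
			 dev, port, name);
		f = fopen(path, "r");
		if (!f)
			return -1;
		if (fscanf(f, "%lld", &val) != 1)
			val = -1;
		fclose(f);
		return val;
	}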
+enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev,
+					 u8 port_num)
+{
+	return IB_LINK_LAYER_UNSPECIFIED;
+}
+
diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig
index 7b146b6..0653f4f 100644
--- a/drivers/infiniband/hw/hfi1/Kconfig
+++ b/drivers/infiniband/hw/hfi1/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_HFI1
 	tristate "Intel OPA Gen1 support"
 	depends on X86_64 && INFINIBAND_RDMAVT && I2C
diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile
index f451ba9..0405d26 100644
--- a/drivers/infiniband/hw/hfi1/Makefile
+++ b/drivers/infiniband/hw/hfi1/Makefile
@@ -8,12 +8,45 @@
 #
 obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
 
-hfi1-y := affinity.o chip.o device.o driver.o efivar.o \
-	eprom.o exp_rcv.o file_ops.o firmware.o \
-	init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \
-	qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \
-	uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \
-	verbs_txreq.o vnic_main.o vnic_sdma.o
+hfi1-y := \
+	affinity.o \
+	aspm.o \
+	chip.o \
+	device.o \
+	driver.o \
+	efivar.o \
+	eprom.o \
+	exp_rcv.o \
+	file_ops.o \
+	firmware.o \
+	init.o \
+	intr.o \
+	iowait.o \
+	mad.o \
+	mmu_rb.o \
+	msix.o \
+	opfn.o \
+	pcie.o \
+	pio.o \
+	pio_copy.o \
+	platform.o \
+	qp.o \
+	qsfp.o \
+	rc.o \
+	ruc.o \
+	sdma.o \
+	sysfs.o \
+	tid_rdma.o \
+	trace.o \
+	uc.o \
+	ud.o \
+	user_exp_rcv.o \
+	user_pages.o \
+	user_sdma.o \
+	verbs.o \
+	verbs_txreq.o \
+	vnic_main.o \
+	vnic_sdma.o
 
 ifdef CONFIG_DEBUG_FS
 hfi1-y += debugfs.o
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
index bedd5fb..c142b23 100644
--- a/drivers/infiniband/hw/hfi1/affinity.c
+++ b/drivers/infiniband/hw/hfi1/affinity.c
@@ -48,6 +48,7 @@
 #include <linux/cpumask.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
+#include <linux/numa.h>
 
 #include "hfi.h"
 #include "affinity.h"
@@ -777,7 +778,7 @@
 	_dev_comp_vect_cpu_mask_clean_up(dd, entry);
 unlock:
 	mutex_unlock(&node_affinity.lock);
-	dd->node = -1;
+	dd->node = NUMA_NO_NODE;
 }
 
 /*
@@ -817,10 +818,10 @@
 	set = &entry->def_intr;
 	cpumask_set_cpu(cpu, &set->mask);
 	cpumask_set_cpu(cpu, &set->used);
-	for (i = 0; i < dd->num_msix_entries; i++) {
+	for (i = 0; i < dd->msix_info.max_requested; i++) {
 		struct hfi1_msix_entry *other_msix;
 
-		other_msix = &dd->msix_entries[i];
+		other_msix = &dd->msix_info.msix_entries[i];
 		if (other_msix->type != IRQ_SDMA || other_msix == msix)
 			continue;
 
@@ -1037,7 +1038,7 @@
 	struct hfi1_affinity_node *entry;
 	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
 	const struct cpumask *node_mask,
-		*proc_mask = &current->cpus_allowed;
+		*proc_mask = current->cpus_ptr;
 	struct hfi1_affinity_node_list *affinity = &node_affinity;
 	struct cpu_mask_set *set = &affinity->proc;
 
@@ -1045,7 +1046,7 @@
 	 * check whether process/context affinity has already
 	 * been set
 	 */
-	if (cpumask_weight(proc_mask) == 1) {
+	if (current->nr_cpus_allowed == 1) {
 		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
 			  current->pid, current->comm,
 			  cpumask_pr_args(proc_mask));
@@ -1056,7 +1057,7 @@
 		cpu = cpumask_first(proc_mask);
 		cpumask_set_cpu(cpu, &set->used);
 		goto done;
-	} else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
+	} else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
 		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
 			  current->pid, current->comm,
 			  cpumask_pr_args(proc_mask));
diff --git a/drivers/infiniband/hw/hfi1/aspm.c b/drivers/infiniband/hw/hfi1/aspm.c
new file mode 100644
index 0000000..a3c53be
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/aspm.c
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2019 Intel Corporation.
+ *
+ */
+
+#include "aspm.h"
+
+/* Time after which the timer interrupt will re-enable ASPM */
+#define ASPM_TIMER_MS 1000
+/* Time for which interrupts are ignored after a timer has been scheduled */
+#define ASPM_RESCHED_TIMER_MS (ASPM_TIMER_MS / 2)
+/* Two interrupts within this time trigger ASPM disable */
+#define ASPM_TRIGGER_MS 1
+#define ASPM_TRIGGER_NS (ASPM_TRIGGER_MS * 1000 * 1000ull)
+#define ASPM_L1_SUPPORTED(reg) \
+	((((reg) & PCI_EXP_LNKCAP_ASPMS) >> 10) & 0x2)
+
+uint aspm_mode = ASPM_MODE_DISABLED;
+module_param_named(aspm, aspm_mode, uint, 0444);
+MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic");
+
+static bool aspm_hw_l1_supported(struct hfi1_devdata *dd)
+{
+	struct pci_dev *parent = dd->pcidev->bus->self;
+	u32 up, dn;
+
+	/*
+	 * If the driver does not have access to the upstream component,
+	 * it cannot support ASPM L1 at all.
+	 */
+	if (!parent)
+		return false;
+
+	pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &dn);
+	dn = ASPM_L1_SUPPORTED(dn);
+
+	pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &up);
+	up = ASPM_L1_SUPPORTED(up);
+
+	/* ASPM works on A-step but is reported as not supported */
+	return (!!dn || is_ax(dd)) && !!up;
+}
+
+/* Set L1 entrance latency for slower entry to L1 */
+static void aspm_hw_set_l1_ent_latency(struct hfi1_devdata *dd)
+{
+	u32 l1_ent_lat = 0x4u;
+	u32 reg32;
+
+	pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, &reg32);
+	reg32 &= ~PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK;
+	reg32 |= l1_ent_lat << PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT;
+	pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, reg32);
+}
+
+static void aspm_hw_enable_l1(struct hfi1_devdata *dd)
+{
+	struct pci_dev *parent = dd->pcidev->bus->self;
+
+	/*
+	 * If the driver does not have access to the upstream component,
+	 * it cannot support ASPM L1 at all.
+	 */
+	if (!parent)
+		return;
+
+	/* Enable ASPM L1 first in upstream component and then downstream */
+	pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
+					   PCI_EXP_LNKCTL_ASPMC,
+					   PCI_EXP_LNKCTL_ASPM_L1);
+	pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
+					   PCI_EXP_LNKCTL_ASPMC,
+					   PCI_EXP_LNKCTL_ASPM_L1);
+}
+
+void aspm_hw_disable_l1(struct hfi1_devdata *dd)
+{
+	struct pci_dev *parent = dd->pcidev->bus->self;
+
+	/* Disable ASPM L1 first in downstream component and then upstream */
+	pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
+					   PCI_EXP_LNKCTL_ASPMC, 0x0);
+	if (parent)
+		pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
+						   PCI_EXP_LNKCTL_ASPMC, 0x0);
+}
+
+static void aspm_enable(struct hfi1_devdata *dd)
+{
+	if (dd->aspm_enabled || aspm_mode == ASPM_MODE_DISABLED ||
+	    !dd->aspm_supported)
+		return;
+
+	aspm_hw_enable_l1(dd);
+	dd->aspm_enabled = true;
+}
+
+static void aspm_disable(struct hfi1_devdata *dd)
+{
+	if (!dd->aspm_enabled || aspm_mode == ASPM_MODE_ENABLED)
+		return;
+
+	aspm_hw_disable_l1(dd);
+	dd->aspm_enabled = false;
+}
+
+static void aspm_disable_inc(struct hfi1_devdata *dd)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->aspm_lock, flags);
+	aspm_disable(dd);
+	atomic_inc(&dd->aspm_disabled_cnt);
+	spin_unlock_irqrestore(&dd->aspm_lock, flags);
+}
+
+static void aspm_enable_dec(struct hfi1_devdata *dd)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->aspm_lock, flags);
+	if (atomic_dec_and_test(&dd->aspm_disabled_cnt))
+		aspm_enable(dd);
+	spin_unlock_irqrestore(&dd->aspm_lock, flags);
+}
+
+/* ASPM processing for each receive context interrupt */
+void __aspm_ctx_disable(struct hfi1_ctxtdata *rcd)
+{
+	bool restart_timer;
+	bool close_interrupts;
+	unsigned long flags;
+	ktime_t now, prev;
+
+	spin_lock_irqsave(&rcd->aspm_lock, flags);
+	/* PSM contexts are open */
+	if (!rcd->aspm_intr_enable)
+		goto unlock;
+
+	prev = rcd->aspm_ts_last_intr;
+	now = ktime_get();
+	rcd->aspm_ts_last_intr = now;
+
+	/* An interrupt pair close together in time */
+	close_interrupts = ktime_to_ns(ktime_sub(now, prev)) < ASPM_TRIGGER_NS;
+
+	/* Don't push out our timer till this much time has elapsed */
+	restart_timer = ktime_to_ns(ktime_sub(now, rcd->aspm_ts_timer_sched)) >
+				    ASPM_RESCHED_TIMER_MS * NSEC_PER_MSEC;
+	restart_timer = restart_timer && close_interrupts;
+
+	/* Disable ASPM and schedule timer */
+	if (rcd->aspm_enabled && close_interrupts) {
+		aspm_disable_inc(rcd->dd);
+		rcd->aspm_enabled = false;
+		restart_timer = true;
+	}
+
+	if (restart_timer) {
+		mod_timer(&rcd->aspm_timer,
+			  jiffies + msecs_to_jiffies(ASPM_TIMER_MS));
+		rcd->aspm_ts_timer_sched = now;
+	}
+unlock:
+	spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+}
+
+/* Timer function for re-enabling ASPM in the absence of interrupt activity */
+static void aspm_ctx_timer_function(struct timer_list *t)
+{
+	struct hfi1_ctxtdata *rcd = from_timer(rcd, t, aspm_timer);
+	unsigned long flags;
+
+	spin_lock_irqsave(&rcd->aspm_lock, flags);
+	aspm_enable_dec(rcd->dd);
+	rcd->aspm_enabled = true;
+	spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+}
+
+/*
+ * Disable interrupt processing for verbs contexts when PSM or VNIC contexts
+ * are open.
+ */
+void aspm_disable_all(struct hfi1_devdata *dd)
+{
+	struct hfi1_ctxtdata *rcd;
+	unsigned long flags;
+	u16 i;
+
+	for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
+		rcd = hfi1_rcd_get_by_index(dd, i);
+		if (rcd) {
+			del_timer_sync(&rcd->aspm_timer);
+			spin_lock_irqsave(&rcd->aspm_lock, flags);
+			rcd->aspm_intr_enable = false;
+			spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+			hfi1_rcd_put(rcd);
+		}
+	}
+
+	aspm_disable(dd);
+	atomic_set(&dd->aspm_disabled_cnt, 0);
+}
+
+/* Re-enable interrupt processing for verbs contexts */
+void aspm_enable_all(struct hfi1_devdata *dd)
+{
+	struct hfi1_ctxtdata *rcd;
+	unsigned long flags;
+	u16 i;
+
+	aspm_enable(dd);
+
+	if (aspm_mode != ASPM_MODE_DYNAMIC)
+		return;
+
+	for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
+		rcd = hfi1_rcd_get_by_index(dd, i);
+		if (rcd) {
+			spin_lock_irqsave(&rcd->aspm_lock, flags);
+			rcd->aspm_intr_enable = true;
+			rcd->aspm_enabled = true;
+			spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+			hfi1_rcd_put(rcd);
+		}
+	}
+}
+
+static void aspm_ctx_init(struct hfi1_ctxtdata *rcd)
+{
+	spin_lock_init(&rcd->aspm_lock);
+	timer_setup(&rcd->aspm_timer, aspm_ctx_timer_function, 0);
+	rcd->aspm_intr_supported = rcd->dd->aspm_supported &&
+		aspm_mode == ASPM_MODE_DYNAMIC &&
+		rcd->ctxt < rcd->dd->first_dyn_alloc_ctxt;
+}
+
+void aspm_init(struct hfi1_devdata *dd)
+{
+	struct hfi1_ctxtdata *rcd;
+	u16 i;
+
+	spin_lock_init(&dd->aspm_lock);
+	dd->aspm_supported = aspm_hw_l1_supported(dd);
+
+	for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
+		rcd = hfi1_rcd_get_by_index(dd, i);
+		if (rcd)
+			aspm_ctx_init(rcd);
+		hfi1_rcd_put(rcd);
+	}
+
+	/* Start with ASPM disabled */
+	aspm_hw_set_l1_ent_latency(dd);
+	dd->aspm_enabled = false;
+	aspm_hw_disable_l1(dd);
+
+	/* Now turn on ASPM if configured */
+	aspm_enable_all(dd);
+}
+
+void aspm_exit(struct hfi1_devdata *dd)
+{
+	aspm_disable_all(dd);
+
+	/* Turn on ASPM on exit to conserve power */
+	aspm_enable(dd);
+}
+
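Taken together with the declarations now exported from aspm.h, the expected call ordering over a device's lifetime looks roughly like the sketch below (inferred from the functions above, not a literal excerpt from the driver):

	/* Inferred lifecycle, for illustration only */
	static void aspm_lifecycle_sketch(struct hfi1_devdata *dd)
	{
		aspm_init(dd);		/* probe: detect L1 support, apply aspm_mode */

		aspm_disable_all(dd);	/* PSM/VNIC contexts open: keep latency low */
		aspm_enable_all(dd);	/* contexts closed again: allow power savings */

		aspm_exit(dd);		/* remove: leave ASPM on to conserve power */
	}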
diff --git a/drivers/infiniband/hw/hfi1/aspm.h b/drivers/infiniband/hw/hfi1/aspm.h
index e813387..75d5d18 100644
--- a/drivers/infiniband/hw/hfi1/aspm.h
+++ b/drivers/infiniband/hw/hfi1/aspm.h
@@ -57,266 +57,20 @@
 	ASPM_MODE_DYNAMIC = 2,	/* ASPM enabled/disabled dynamically */
 };
 
-/* Time after which the timer interrupt will re-enable ASPM */
-#define ASPM_TIMER_MS 1000
-/* Time for which interrupts are ignored after a timer has been scheduled */
-#define ASPM_RESCHED_TIMER_MS (ASPM_TIMER_MS / 2)
-/* Two interrupts within this time trigger ASPM disable */
-#define ASPM_TRIGGER_MS 1
-#define ASPM_TRIGGER_NS (ASPM_TRIGGER_MS * 1000 * 1000ull)
-#define ASPM_L1_SUPPORTED(reg) \
-	(((reg & PCI_EXP_LNKCAP_ASPMS) >> 10) & 0x2)
+void aspm_init(struct hfi1_devdata *dd);
+void aspm_exit(struct hfi1_devdata *dd);
+void aspm_hw_disable_l1(struct hfi1_devdata *dd);
+void __aspm_ctx_disable(struct hfi1_ctxtdata *rcd);
+void aspm_disable_all(struct hfi1_devdata *dd);
+void aspm_enable_all(struct hfi1_devdata *dd);
 
-static inline bool aspm_hw_l1_supported(struct hfi1_devdata *dd)
-{
-	struct pci_dev *parent = dd->pcidev->bus->self;
-	u32 up, dn;
-
-	/*
-	 * If the driver does not have access to the upstream component,
-	 * it cannot support ASPM L1 at all.
-	 */
-	if (!parent)
-		return false;
-
-	pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &dn);
-	dn = ASPM_L1_SUPPORTED(dn);
-
-	pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &up);
-	up = ASPM_L1_SUPPORTED(up);
-
-	/* ASPM works on A-step but is reported as not supported */
-	return (!!dn || is_ax(dd)) && !!up;
-}
-
-/* Set L1 entrance latency for slower entry to L1 */
-static inline void aspm_hw_set_l1_ent_latency(struct hfi1_devdata *dd)
-{
-	u32 l1_ent_lat = 0x4u;
-	u32 reg32;
-
-	pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, &reg32);
-	reg32 &= ~PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK;
-	reg32 |= l1_ent_lat << PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT;
-	pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, reg32);
-}
-
-static inline void aspm_hw_enable_l1(struct hfi1_devdata *dd)
-{
-	struct pci_dev *parent = dd->pcidev->bus->self;
-
-	/*
-	 * If the driver does not have access to the upstream component,
-	 * it cannot support ASPM L1 at all.
-	 */
-	if (!parent)
-		return;
-
-	/* Enable ASPM L1 first in upstream component and then downstream */
-	pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
-					   PCI_EXP_LNKCTL_ASPMC,
-					   PCI_EXP_LNKCTL_ASPM_L1);
-	pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
-					   PCI_EXP_LNKCTL_ASPMC,
-					   PCI_EXP_LNKCTL_ASPM_L1);
-}
-
-static inline void aspm_hw_disable_l1(struct hfi1_devdata *dd)
-{
-	struct pci_dev *parent = dd->pcidev->bus->self;
-
-	/* Disable ASPM L1 first in downstream component and then upstream */
-	pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
-					   PCI_EXP_LNKCTL_ASPMC, 0x0);
-	if (parent)
-		pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
-						   PCI_EXP_LNKCTL_ASPMC, 0x0);
-}
-
-static inline void aspm_enable(struct hfi1_devdata *dd)
-{
-	if (dd->aspm_enabled || aspm_mode == ASPM_MODE_DISABLED ||
-	    !dd->aspm_supported)
-		return;
-
-	aspm_hw_enable_l1(dd);
-	dd->aspm_enabled = true;
-}
-
-static inline void aspm_disable(struct hfi1_devdata *dd)
-{
-	if (!dd->aspm_enabled || aspm_mode == ASPM_MODE_ENABLED)
-		return;
-
-	aspm_hw_disable_l1(dd);
-	dd->aspm_enabled = false;
-}
-
-static inline void aspm_disable_inc(struct hfi1_devdata *dd)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&dd->aspm_lock, flags);
-	aspm_disable(dd);
-	atomic_inc(&dd->aspm_disabled_cnt);
-	spin_unlock_irqrestore(&dd->aspm_lock, flags);
-}
-
-static inline void aspm_enable_dec(struct hfi1_devdata *dd)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&dd->aspm_lock, flags);
-	if (atomic_dec_and_test(&dd->aspm_disabled_cnt))
-		aspm_enable(dd);
-	spin_unlock_irqrestore(&dd->aspm_lock, flags);
-}
-
-/* ASPM processing for each receive context interrupt */
 static inline void aspm_ctx_disable(struct hfi1_ctxtdata *rcd)
 {
-	bool restart_timer;
-	bool close_interrupts;
-	unsigned long flags;
-	ktime_t now, prev;
-
 	/* Quickest exit for minimum impact */
-	if (!rcd->aspm_intr_supported)
+	if (likely(!rcd->aspm_intr_supported))
 		return;
 
-	spin_lock_irqsave(&rcd->aspm_lock, flags);
-	/* PSM contexts are open */
-	if (!rcd->aspm_intr_enable)
-		goto unlock;
-
-	prev = rcd->aspm_ts_last_intr;
-	now = ktime_get();
-	rcd->aspm_ts_last_intr = now;
-
-	/* An interrupt pair close together in time */
-	close_interrupts = ktime_to_ns(ktime_sub(now, prev)) < ASPM_TRIGGER_NS;
-
-	/* Don't push out our timer till this much time has elapsed */
-	restart_timer = ktime_to_ns(ktime_sub(now, rcd->aspm_ts_timer_sched)) >
-				    ASPM_RESCHED_TIMER_MS * NSEC_PER_MSEC;
-	restart_timer = restart_timer && close_interrupts;
-
-	/* Disable ASPM and schedule timer */
-	if (rcd->aspm_enabled && close_interrupts) {
-		aspm_disable_inc(rcd->dd);
-		rcd->aspm_enabled = false;
-		restart_timer = true;
-	}
-
-	if (restart_timer) {
-		mod_timer(&rcd->aspm_timer,
-			  jiffies + msecs_to_jiffies(ASPM_TIMER_MS));
-		rcd->aspm_ts_timer_sched = now;
-	}
-unlock:
-	spin_unlock_irqrestore(&rcd->aspm_lock, flags);
-}
-
-/* Timer function for re-enabling ASPM in the absence of interrupt activity */
-static inline void aspm_ctx_timer_function(struct timer_list *t)
-{
-	struct hfi1_ctxtdata *rcd = from_timer(rcd, t, aspm_timer);
-	unsigned long flags;
-
-	spin_lock_irqsave(&rcd->aspm_lock, flags);
-	aspm_enable_dec(rcd->dd);
-	rcd->aspm_enabled = true;
-	spin_unlock_irqrestore(&rcd->aspm_lock, flags);
-}
-
-/*
- * Disable interrupt processing for verbs contexts when PSM or VNIC contexts
- * are open.
- */
-static inline void aspm_disable_all(struct hfi1_devdata *dd)
-{
-	struct hfi1_ctxtdata *rcd;
-	unsigned long flags;
-	u16 i;
-
-	for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
-		rcd = hfi1_rcd_get_by_index(dd, i);
-		if (rcd) {
-			del_timer_sync(&rcd->aspm_timer);
-			spin_lock_irqsave(&rcd->aspm_lock, flags);
-			rcd->aspm_intr_enable = false;
-			spin_unlock_irqrestore(&rcd->aspm_lock, flags);
-			hfi1_rcd_put(rcd);
-		}
-	}
-
-	aspm_disable(dd);
-	atomic_set(&dd->aspm_disabled_cnt, 0);
-}
-
-/* Re-enable interrupt processing for verbs contexts */
-static inline void aspm_enable_all(struct hfi1_devdata *dd)
-{
-	struct hfi1_ctxtdata *rcd;
-	unsigned long flags;
-	u16 i;
-
-	aspm_enable(dd);
-
-	if (aspm_mode != ASPM_MODE_DYNAMIC)
-		return;
-
-	for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
-		rcd = hfi1_rcd_get_by_index(dd, i);
-		if (rcd) {
-			spin_lock_irqsave(&rcd->aspm_lock, flags);
-			rcd->aspm_intr_enable = true;
-			rcd->aspm_enabled = true;
-			spin_unlock_irqrestore(&rcd->aspm_lock, flags);
-			hfi1_rcd_put(rcd);
-		}
-	}
-}
-
-static inline void aspm_ctx_init(struct hfi1_ctxtdata *rcd)
-{
-	spin_lock_init(&rcd->aspm_lock);
-	timer_setup(&rcd->aspm_timer, aspm_ctx_timer_function, 0);
-	rcd->aspm_intr_supported = rcd->dd->aspm_supported &&
-		aspm_mode == ASPM_MODE_DYNAMIC &&
-		rcd->ctxt < rcd->dd->first_dyn_alloc_ctxt;
-}
-
-static inline void aspm_init(struct hfi1_devdata *dd)
-{
-	struct hfi1_ctxtdata *rcd;
-	u16 i;
-
-	spin_lock_init(&dd->aspm_lock);
-	dd->aspm_supported = aspm_hw_l1_supported(dd);
-
-	for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
-		rcd = hfi1_rcd_get_by_index(dd, i);
-		if (rcd)
-			aspm_ctx_init(rcd);
-		hfi1_rcd_put(rcd);
-	}
-
-	/* Start with ASPM disabled */
-	aspm_hw_set_l1_ent_latency(dd);
-	dd->aspm_enabled = false;
-	aspm_hw_disable_l1(dd);
-
-	/* Now turn on ASPM if configured */
-	aspm_enable_all(dd);
-}
-
-static inline void aspm_exit(struct hfi1_devdata *dd)
-{
-	aspm_disable_all(dd);
-
-	/* Turn on ASPM on exit to conserve power */
-	aspm_enable(dd);
+	__aspm_ctx_disable(rcd);
 }
 
 #endif /* _ASPM_H */
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 902d12d..9b1fb84 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -67,8 +67,6 @@
 #include "debugfs.h"
 #include "fault.h"
 
-#define NUM_IB_PORTS 1
-
 uint kdeth_qp;
 module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO);
 MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix");
@@ -1074,6 +1072,8 @@
 static void log_physical_state(struct hfi1_pportdata *ppd, u32 state);
 static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state,
 				   int msecs);
+static int wait_phys_link_out_of_offline(struct hfi1_pportdata *ppd,
+					 int msecs);
 static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
 static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
 static void handle_temp_err(struct hfi1_devdata *dd);
@@ -1100,9 +1100,9 @@
 	const char *desc;
 };
 
-#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START)
-#define NUM_DC_ERRS (IS_DC_END - IS_DC_START)
-#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START)
+#define NUM_MISC_ERRS (IS_GENERAL_ERR_END + 1 - IS_GENERAL_ERR_START)
+#define NUM_DC_ERRS (IS_DC_END + 1 - IS_DC_START)
+#define NUM_VARIOUS (IS_VARIOUS_END + 1 - IS_VARIOUS_START)
 
 /*
  * Helpers for building HFI and DC error interrupt table entries.  Different
@@ -4101,9 +4101,13 @@
 def_access_ibp_counter(rdma_seq);
 def_access_ibp_counter(unaligned);
 def_access_ibp_counter(seq_naks);
+def_access_ibp_counter(rc_crwaits);
 
 static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
 [C_RCV_OVF] = RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH),
+[C_RX_LEN_ERR] = RXE32_DEV_CNTR_ELEM(RxLenErr, RCV_LENGTH_ERR_CNT, CNTR_SYNTH),
+[C_RX_ICRC_ERR] = RXE32_DEV_CNTR_ELEM(RxICrcErr, RCV_ICRC_ERR_CNT, CNTR_SYNTH),
+[C_RX_EBP] = RXE32_DEV_CNTR_ELEM(RxEbpCnt, RCV_EBP_CNT, CNTR_SYNTH),
 [C_RX_TID_FULL] = RXE32_DEV_CNTR_ELEM(RxTIDFullEr, RCV_TID_FULL_ERR_CNT,
 			CNTR_NORMAL),
 [C_RX_TID_INVALID] = RXE32_DEV_CNTR_ELEM(RxTIDInvalid, RCV_TID_VALID_ERR_CNT,
@@ -4253,6 +4257,8 @@
 			    access_sw_pio_drain),
 [C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
 			    access_sw_kmem_wait),
+[C_SW_TID_WAIT] = CNTR_ELEM("TidWait", 0, 0, CNTR_NORMAL,
+			    hfi1_access_sw_tid_wait),
 [C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
 			    access_sw_send_schedule),
 [C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn",
@@ -5114,6 +5120,7 @@
 [C_SW_IBP_RDMA_SEQ] = SW_IBP_CNTR(RdmaSeq, rdma_seq),
 [C_SW_IBP_UNALIGNED] = SW_IBP_CNTR(Unaligned, unaligned),
 [C_SW_IBP_SEQ_NAK] = SW_IBP_CNTR(SeqNak, seq_naks),
+[C_SW_IBP_RC_CRWAITS] = SW_IBP_CNTR(RcCrWait, rc_crwaits),
 [C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL,
 			       access_sw_cpu_rc_acks),
 [C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL,
@@ -5222,6 +5229,17 @@
 	return (chip_rev_minor & 0xF0) == 0x10;
 }
 
+/* return true if the kernel urgent interrupt is disabled for rcd */
+bool is_urg_masked(struct hfi1_ctxtdata *rcd)
+{
+	u64 mask;
+	u32 is = IS_RCVURGENT_START + rcd->ctxt;
+	u8 bit = is % 64;
+
+	mask = read_csr(rcd->dd, CCE_INT_MASK + (8 * (is / 64)));
+	return !(mask & BIT_ULL(bit));
+}
+
 /*
  * Append string s to buffer buf.  Arguments curp and len are the current
  * position and remaining length, respectively.
@@ -8181,7 +8199,7 @@
 /**
  * is_rcv_urgent_int() - User receive context urgent IRQ handler
  * @dd: valid dd
- * @source: logical IRQ source (ofse from IS_RCVURGENT_START)
+ * @source: logical IRQ source (offset from IS_RCVURGENT_START)
  *
  * RX block receive urgent interrupt.  Source is < 160.
  *
@@ -8231,7 +8249,7 @@
 				is_sdma_eng_err_name,	is_sdma_eng_err_int },
 { IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END,
 				is_sendctxt_err_name,	is_sendctxt_err_int },
-{ IS_SDMA_START,	     IS_SDMA_END,
+{ IS_SDMA_START,	     IS_SDMA_IDLE_END,
 				is_sdma_eng_name,	is_sdma_eng_int },
 { IS_VARIOUS_START,	     IS_VARIOUS_END,
 				is_various_name,	is_various_int },
@@ -8257,7 +8275,7 @@
 
 	/* avoids a double compare by walking the table in-order */
 	for (entry = &is_table[0]; entry->is_name; entry++) {
-		if (source < entry->end) {
+		if (source <= entry->end) {
 			trace_hfi1_interrupt(dd, entry, source);
 			entry->is_int(dd, source - entry->start);
 			return;
@@ -8276,7 +8294,7 @@
  * context DATA IRQs are threaded and are not supported by this handler.
  *
  */
-static irqreturn_t general_interrupt(int irq, void *data)
+irqreturn_t general_interrupt(int irq, void *data)
 {
 	struct hfi1_devdata *dd = data;
 	u64 regs[CCE_NUM_INT_CSRS];
@@ -8309,7 +8327,7 @@
 	return handled;
 }
 
-static irqreturn_t sdma_interrupt(int irq, void *data)
+irqreturn_t sdma_interrupt(int irq, void *data)
 {
 	struct sdma_engine *sde = data;
 	struct hfi1_devdata *dd = sde->dd;
@@ -8352,7 +8370,6 @@
 	struct hfi1_devdata *dd = rcd->dd;
 	u32 addr = CCE_INT_CLEAR + (8 * rcd->ireg);
 
-	mmiowb();	/* make sure everything before is written */
 	write_csr(dd, addr, rcd->imask);
 	/* force the above write on the chip and get a value back */
 	(void)read_csr(dd, addr);
@@ -8401,7 +8418,7 @@
  * invoked) is finished.  The intent is to avoid extra interrupts while we
  * are processing packets anyway.
  */
-static irqreturn_t receive_context_interrupt(int irq, void *data)
+irqreturn_t receive_context_interrupt(int irq, void *data)
 {
 	struct hfi1_ctxtdata *rcd = data;
 	struct hfi1_devdata *dd = rcd->dd;
@@ -8441,7 +8458,7 @@
  * Receive packet thread handler.  This expects to be invoked with the
  * receive interrupt still blocked.
  */
-static irqreturn_t receive_context_thread(int irq, void *data)
+irqreturn_t receive_context_thread(int irq, void *data)
 {
 	struct hfi1_ctxtdata *rcd = data;
 	int present;
@@ -9651,30 +9668,10 @@
 	}
 }
 
-static void init_qsfp_int(struct hfi1_devdata *dd)
+void init_qsfp_int(struct hfi1_devdata *dd)
 {
 	struct hfi1_pportdata *ppd = dd->pport;
-	u64 qsfp_mask, cce_int_mask;
-	const int qsfp1_int_smask = QSFP1_INT % 64;
-	const int qsfp2_int_smask = QSFP2_INT % 64;
-
-	/*
-	 * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0
-	 * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR,
-	 * therefore just one of QSFP1_INT/QSFP2_INT can be used to find
-	 * the index of the appropriate CSR in the CCEIntMask CSR array
-	 */
-	cce_int_mask = read_csr(dd, CCE_INT_MASK +
-				(8 * (QSFP1_INT / 64)));
-	if (dd->hfi1_id) {
-		cce_int_mask &= ~((u64)1 << qsfp1_int_smask);
-		write_csr(dd, CCE_INT_MASK + (8 * (QSFP1_INT / 64)),
-			  cce_int_mask);
-	} else {
-		cce_int_mask &= ~((u64)1 << qsfp2_int_smask);
-		write_csr(dd, CCE_INT_MASK + (8 * (QSFP2_INT / 64)),
-			  cce_int_mask);
-	}
+	u64 qsfp_mask;
 
 	qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
 	/* Clear current status to avoid spurious interrupts */
@@ -9691,6 +9688,12 @@
 	write_csr(dd,
 		  dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT,
 		  qsfp_mask);
+
+	/* Enable the appropriate QSFP IRQ source */
+	if (!dd->hfi1_id)
+		set_intr_bits(dd, QSFP1_INT, QSFP1_INT, true);
+	else
+		set_intr_bits(dd, QSFP2_INT, QSFP2_INT, true);
 }
 
 /*
@@ -9849,6 +9852,7 @@
 
 	/* disable the port */
 	clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+	cancel_work_sync(&ppd->freeze_work);
 }
 
 static inline int init_cpu_counters(struct hfi1_devdata *dd)
@@ -10577,12 +10581,29 @@
 	}
 }
 
-/*
- * Verify if BCT for data VLs is non-zero.
+/**
+ * data_vls_operational() - Verify if data VL BCT credits and MTU
+ *			    are both set.
+ * @ppd: pointer to hfi1_pportdata structure
+ *
+ * Return: true - Ok, false - otherwise.
  */
 static inline bool data_vls_operational(struct hfi1_pportdata *ppd)
 {
-	return !!ppd->actual_vls_operational;
+	int i;
+	u64 reg;
+
+	if (!ppd->actual_vls_operational)
+		return false;
+
+	for (i = 0; i < ppd->vls_supported; i++) {
+		reg = read_csr(ppd->dd, SEND_CM_CREDIT_VL + (8 * i));
+		if ((reg && !ppd->dd->vld[i].mtu) ||
+		    (!reg && ppd->dd->vld[i].mtu))
+			return false;
+	}
+
+	return true;
 }
 
 /*
@@ -10695,7 +10716,8 @@
 
 		if (!data_vls_operational(ppd)) {
 			dd_dev_err(dd,
-				   "%s: data VLs not operational\n", __func__);
+				   "%s: Invalid data VL credits or mtu\n",
+				   __func__);
 			ret = -EINVAL;
 			break;
 		}
@@ -10768,13 +10790,15 @@
 			break;
 
 		ppd->port_error_action = 0;
-		ppd->host_link_state = HLS_DN_POLL;
 
 		if (quick_linkup) {
 			/* quick linkup does not go into polling */
 			ret = do_quick_linkup(dd);
 		} else {
 			ret1 = set_physical_link_state(dd, PLS_POLLING);
+			if (!ret1)
+				ret1 = wait_phys_link_out_of_offline(ppd,
+								     3000);
 			if (ret1 != HCMD_SUCCESS) {
 				dd_dev_err(dd,
 					   "Failed to transition to Polling link state, return 0x%x\n",
@@ -10782,6 +10806,14 @@
 				ret = -EINVAL;
 			}
 		}
+
+		/*
+		 * Change the host link state after requesting DC8051 to
+		 * change its physical state so that we can ignore any
+		 * interrupt with stale LNI(XX) error, which will not be
+		 * cleared until DC8051 transitions to Polling state.
+		 */
+		ppd->host_link_state = HLS_DN_POLL;
 		ppd->offline_disabled_reason =
 			HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
 		/*
@@ -11776,12 +11808,10 @@
 			<< RCV_EGR_INDEX_HEAD_HEAD_SHIFT;
 		write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, reg);
 	}
-	mmiowb();
 	reg = ((u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT) |
 		(((u64)hd & RCV_HDR_HEAD_HEAD_MASK)
 			<< RCV_HDR_HEAD_HEAD_SHIFT);
 	write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
-	mmiowb();
 }
 
 u32 hdrqempty(struct hfi1_ctxtdata *rcd)
@@ -11932,10 +11962,16 @@
 
 		rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK;
 	}
-	if (op & HFI1_RCVCTRL_INTRAVAIL_ENB)
+	if (op & HFI1_RCVCTRL_INTRAVAIL_ENB) {
+		set_intr_bits(dd, IS_RCVAVAIL_START + rcd->ctxt,
+			      IS_RCVAVAIL_START + rcd->ctxt, true);
 		rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
-	if (op & HFI1_RCVCTRL_INTRAVAIL_DIS)
+	}
+	if (op & HFI1_RCVCTRL_INTRAVAIL_DIS) {
+		set_intr_bits(dd, IS_RCVAVAIL_START + rcd->ctxt,
+			      IS_RCVAVAIL_START + rcd->ctxt, false);
 		rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
+	}
 	if ((op & HFI1_RCVCTRL_TAILUPD_ENB) && rcd->rcvhdrtail_kvaddr)
 		rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
 	if (op & HFI1_RCVCTRL_TAILUPD_DIS) {
@@ -11965,6 +12001,13 @@
 		rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
 	if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS)
 		rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
+	if (op & HFI1_RCVCTRL_URGENT_ENB)
+		set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt,
+			      IS_RCVURGENT_START + rcd->ctxt, true);
+	if (op & HFI1_RCVCTRL_URGENT_DIS)
+		set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt,
+			      IS_RCVURGENT_START + rcd->ctxt, false);
+
 	hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl);
 	write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcvctrl);
 
@@ -12913,6 +12956,39 @@
 	return read_state;
 }
 
+/*
+ * wait_phys_link_out_of_offline - wait for any out of offline state
+ * @ppd: port device
+ * @msecs: the number of milliseconds to wait
+ *
+ * Wait up to msecs milliseconds for any out of offline physical link
+ * state change to occur.
+ * Returns the last read physical state if the link leaves offline,
+ * otherwise -ETIMEDOUT.
+ */
+static int wait_phys_link_out_of_offline(struct hfi1_pportdata *ppd,
+					 int msecs)
+{
+	u32 read_state;
+	unsigned long timeout;
+
+	timeout = jiffies + msecs_to_jiffies(msecs);
+	while (1) {
+		read_state = read_physical_state(ppd->dd);
+		if ((read_state & 0xF0) != PLS_OFFLINE)
+			break;
+		if (time_after(jiffies, timeout)) {
+			dd_dev_err(ppd->dd,
+				   "timeout waiting for phy link out of offline. Read state 0x%x, %dms\n",
+				   read_state, msecs);
+			return -ETIMEDOUT;
+		}
+		usleep_range(1950, 2050); /* sleep 2ms-ish */
+	}
+
+	log_state_transition(ppd, read_state);
+	return read_state;
+}
+
 #define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
 (r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
 
@@ -12964,63 +13040,71 @@
 	return ret;
 }
 
-/**
- * get_int_mask - get 64 bit int mask
- * @dd - the devdata
- * @i - the csr (relative to CCE_INT_MASK)
- *
- * Returns the mask with the urgent interrupt mask
- * bit clear for kernel receive contexts.
- */
-static u64 get_int_mask(struct hfi1_devdata *dd, u32 i)
-{
-	u64 mask = U64_MAX; /* default to no change */
-
-	if (i >= (IS_RCVURGENT_START / 64) && i < (IS_RCVURGENT_END / 64)) {
-		int j = (i - (IS_RCVURGENT_START / 64)) * 64;
-		int k = !j ? IS_RCVURGENT_START % 64 : 0;
-
-		if (j)
-			j -= IS_RCVURGENT_START % 64;
-		/* j = 0..dd->first_dyn_alloc_ctxt - 1,k = 0..63 */
-		for (; j < dd->first_dyn_alloc_ctxt && k < 64; j++, k++)
-			/* convert to bit in mask and clear */
-			mask &= ~BIT_ULL(k);
-	}
-	return mask;
-}
-
 /* ========================================================================= */
 
-/*
- * Enable/disable chip from delivering interrupts.
+/**
+ * read_mod_write() - Calculate the IRQ register index and set/clear the bits
+ * @dd: valid devdata
+ * @src: IRQ source to determine register index from
+ * @bits: the bits to set or clear
+ * @set: true == set the bits, false == clear the bits
+ *
  */
-void set_intr_state(struct hfi1_devdata *dd, u32 enable)
+static void read_mod_write(struct hfi1_devdata *dd, u16 src, u64 bits,
+			   bool set)
 {
-	int i;
+	u64 reg;
+	u16 idx = src / BITS_PER_REGISTER;
 
-	/*
-	 * In HFI, the mask needs to be 1 to allow interrupts.
-	 */
-	if (enable) {
-		/* enable all interrupts but urgent on kernel contexts */
-		for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
-			u64 mask = get_int_mask(dd, i);
+	spin_lock(&dd->irq_src_lock);
+	reg = read_csr(dd, CCE_INT_MASK + (8 * idx));
+	if (set)
+		reg |= bits;
+	else
+		reg &= ~bits;
+	write_csr(dd, CCE_INT_MASK + (8 * idx), reg);
+	spin_unlock(&dd->irq_src_lock);
+}
 
-			write_csr(dd, CCE_INT_MASK + (8 * i), mask);
+/**
+ * set_intr_bits() - Enable/disable a range (one or more) IRQ sources
+ * @dd: valid devdata
+ * @first: first IRQ source to set/clear
+ * @last: last IRQ source (inclusive) to set/clear
+ * @set: true == set the bits, false == clear the bits
+ *
+ * If first == last, set the exact source.
+ */
+int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set)
+{
+	u64 bits = 0;
+	u64 bit;
+	u16 src;
+
+	if (first > NUM_INTERRUPT_SOURCES || last > NUM_INTERRUPT_SOURCES)
+		return -EINVAL;
+
+	if (last < first)
+		return -ERANGE;
+
+	for (src = first; src <= last; src++) {
+		bit = src % BITS_PER_REGISTER;
+		/* wrapped to next register? */
+		if (!bit && bits) {
+			read_mod_write(dd, src - 1, bits, set);
+			bits = 0;
 		}
-
-		init_qsfp_int(dd);
-	} else {
-		for (i = 0; i < CCE_NUM_INT_CSRS; i++)
-			write_csr(dd, CCE_INT_MASK + (8 * i), 0ull);
+		bits |= BIT_ULL(bit);
 	}
+	read_mod_write(dd, last, bits, set);
+
+	return 0;
 }
 
 /*
  * Clear all interrupt sources on the chip.
  */
-static void clear_all_interrupts(struct hfi1_devdata *dd)
+void clear_all_interrupts(struct hfi1_devdata *dd)
 {
 	int i;
 
@@ -13044,38 +13128,11 @@
 	write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0);
 }
 
-/**
- * hfi1_clean_up_interrupts() - Free all IRQ resources
- * @dd: valid device data data structure
- *
- * Free the MSIx and assoicated PCI resources, if they have been allocated.
- */
-void hfi1_clean_up_interrupts(struct hfi1_devdata *dd)
-{
-	int i;
-	struct hfi1_msix_entry *me = dd->msix_entries;
-
-	/* remove irqs - must happen before disabling/turning off */
-	for (i = 0; i < dd->num_msix_entries; i++, me++) {
-		if (!me->arg) /* => no irq, no affinity */
-			continue;
-		hfi1_put_irq_affinity(dd, me);
-		pci_free_irq(dd->pcidev, i, me->arg);
-	}
-
-	/* clean structures */
-	kfree(dd->msix_entries);
-	dd->msix_entries = NULL;
-	dd->num_msix_entries = 0;
-
-	pci_free_irq_vectors(dd->pcidev);
-}
-
 /*
  * Remap the interrupt source from the general handler to the given MSI-X
  * interrupt.
  */
-static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
+void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
 {
 	u64 reg;
 	int m, n;
@@ -13099,8 +13156,7 @@
 	write_csr(dd, CCE_INT_MAP + (8 * m), reg);
 }
 
-static void remap_sdma_interrupts(struct hfi1_devdata *dd,
-				  int engine, int msix_intr)
+void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr)
 {
 	/*
 	 * SDMA engine interrupt sources grouped by type, rather than
@@ -13109,204 +13165,16 @@
 	 *	SDMAProgress
 	 *	SDMAIdle
 	 */
-	remap_intr(dd, IS_SDMA_START + 0 * TXE_NUM_SDMA_ENGINES + engine,
-		   msix_intr);
-	remap_intr(dd, IS_SDMA_START + 1 * TXE_NUM_SDMA_ENGINES + engine,
-		   msix_intr);
-	remap_intr(dd, IS_SDMA_START + 2 * TXE_NUM_SDMA_ENGINES + engine,
-		   msix_intr);
-}
-
-static int request_msix_irqs(struct hfi1_devdata *dd)
-{
-	int first_general, last_general;
-	int first_sdma, last_sdma;
-	int first_rx, last_rx;
-	int i, ret = 0;
-
-	/* calculate the ranges we are going to use */
-	first_general = 0;
-	last_general = first_general + 1;
-	first_sdma = last_general;
-	last_sdma = first_sdma + dd->num_sdma;
-	first_rx = last_sdma;
-	last_rx = first_rx + dd->n_krcv_queues + dd->num_vnic_contexts;
-
-	/* VNIC MSIx interrupts get mapped when VNIC contexts are created */
-	dd->first_dyn_msix_idx = first_rx + dd->n_krcv_queues;
-
-	/*
-	 * Sanity check - the code expects all SDMA chip source
-	 * interrupts to be in the same CSR, starting at bit 0.  Verify
-	 * that this is true by checking the bit location of the start.
-	 */
-	BUILD_BUG_ON(IS_SDMA_START % 64);
-
-	for (i = 0; i < dd->num_msix_entries; i++) {
-		struct hfi1_msix_entry *me = &dd->msix_entries[i];
-		const char *err_info;
-		irq_handler_t handler;
-		irq_handler_t thread = NULL;
-		void *arg = NULL;
-		int idx;
-		struct hfi1_ctxtdata *rcd = NULL;
-		struct sdma_engine *sde = NULL;
-		char name[MAX_NAME_SIZE];
-
-		/* obtain the arguments to pci_request_irq */
-		if (first_general <= i && i < last_general) {
-			idx = i - first_general;
-			handler = general_interrupt;
-			arg = dd;
-			snprintf(name, sizeof(name),
-				 DRIVER_NAME "_%d", dd->unit);
-			err_info = "general";
-			me->type = IRQ_GENERAL;
-		} else if (first_sdma <= i && i < last_sdma) {
-			idx = i - first_sdma;
-			sde = &dd->per_sdma[idx];
-			handler = sdma_interrupt;
-			arg = sde;
-			snprintf(name, sizeof(name),
-				 DRIVER_NAME "_%d sdma%d", dd->unit, idx);
-			err_info = "sdma";
-			remap_sdma_interrupts(dd, idx, i);
-			me->type = IRQ_SDMA;
-		} else if (first_rx <= i && i < last_rx) {
-			idx = i - first_rx;
-			rcd = hfi1_rcd_get_by_index_safe(dd, idx);
-			if (rcd) {
-				/*
-				 * Set the interrupt register and mask for this
-				 * context's interrupt.
-				 */
-				rcd->ireg = (IS_RCVAVAIL_START + idx) / 64;
-				rcd->imask = ((u64)1) <<
-					  ((IS_RCVAVAIL_START + idx) % 64);
-				handler = receive_context_interrupt;
-				thread = receive_context_thread;
-				arg = rcd;
-				snprintf(name, sizeof(name),
-					 DRIVER_NAME "_%d kctxt%d",
-					 dd->unit, idx);
-				err_info = "receive context";
-				remap_intr(dd, IS_RCVAVAIL_START + idx, i);
-				me->type = IRQ_RCVCTXT;
-				rcd->msix_intr = i;
-				hfi1_rcd_put(rcd);
-			}
-		} else {
-			/* not in our expected range - complain, then
-			 * ignore it
-			 */
-			dd_dev_err(dd,
-				   "Unexpected extra MSI-X interrupt %d\n", i);
-			continue;
-		}
-		/* no argument, no interrupt */
-		if (!arg)
-			continue;
-		/* make sure the name is terminated */
-		name[sizeof(name) - 1] = 0;
-		me->irq = pci_irq_vector(dd->pcidev, i);
-		ret = pci_request_irq(dd->pcidev, i, handler, thread, arg,
-				      name);
-		if (ret) {
-			dd_dev_err(dd,
-				   "unable to allocate %s interrupt, irq %d, index %d, err %d\n",
-				   err_info, me->irq, idx, ret);
-			return ret;
-		}
-		/*
-		 * assign arg after pci_request_irq call, so it will be
-		 * cleaned up
-		 */
-		me->arg = arg;
-
-		ret = hfi1_get_irq_affinity(dd, me);
-		if (ret)
-			dd_dev_err(dd, "unable to pin IRQ %d\n", ret);
-	}
-
-	return ret;
-}
-
-void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd)
-{
-	int i;
-
-	for (i = 0; i < dd->vnic.num_ctxt; i++) {
-		struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i];
-		struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr];
-
-		synchronize_irq(me->irq);
-	}
-}
-
-void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd)
-{
-	struct hfi1_devdata *dd = rcd->dd;
-	struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr];
-
-	if (!me->arg) /* => no irq, no affinity */
-		return;
-
-	hfi1_put_irq_affinity(dd, me);
-	pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg);
-
-	me->arg = NULL;
-}
-
-void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd)
-{
-	struct hfi1_devdata *dd = rcd->dd;
-	struct hfi1_msix_entry *me;
-	int idx = rcd->ctxt;
-	void *arg = rcd;
-	int ret;
-
-	rcd->msix_intr = dd->vnic.msix_idx++;
-	me = &dd->msix_entries[rcd->msix_intr];
-
-	/*
-	 * Set the interrupt register and mask for this
-	 * context's interrupt.
-	 */
-	rcd->ireg = (IS_RCVAVAIL_START + idx) / 64;
-	rcd->imask = ((u64)1) <<
-		  ((IS_RCVAVAIL_START + idx) % 64);
-	me->type = IRQ_RCVCTXT;
-	me->irq = pci_irq_vector(dd->pcidev, rcd->msix_intr);
-	remap_intr(dd, IS_RCVAVAIL_START + idx, rcd->msix_intr);
-
-	ret = pci_request_irq(dd->pcidev, rcd->msix_intr,
-			      receive_context_interrupt,
-			      receive_context_thread, arg,
-			      DRIVER_NAME "_%d kctxt%d", dd->unit, idx);
-	if (ret) {
-		dd_dev_err(dd, "vnic irq request (irq %d, idx %d) fail %d\n",
-			   me->irq, idx, ret);
-		return;
-	}
-	/*
-	 * assign arg after pci_request_irq call, so it will be
-	 * cleaned up
-	 */
-	me->arg = arg;
-
-	ret = hfi1_get_irq_affinity(dd, me);
-	if (ret) {
-		dd_dev_err(dd,
-			   "unable to pin IRQ %d\n", ret);
-		pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg);
-	}
+	remap_intr(dd, IS_SDMA_START + engine, msix_intr);
+	remap_intr(dd, IS_SDMA_PROGRESS_START + engine, msix_intr);
+	remap_intr(dd, IS_SDMA_IDLE_START + engine, msix_intr);
 }
 
 /*
  * Set the general handler to accept all interrupts, remap all
  * chip interrupts back to MSI-X 0.
  */
-static void reset_interrupts(struct hfi1_devdata *dd)
+void reset_interrupts(struct hfi1_devdata *dd)
 {
 	int i;
 
@@ -13319,54 +13187,33 @@
 		write_csr(dd, CCE_INT_MAP + (8 * i), 0);
 }
 
+/**
+ * set_up_interrupts() - Initialize the IRQ resources and state
+ * @dd: valid devdata
+ *
+ */
 static int set_up_interrupts(struct hfi1_devdata *dd)
 {
-	u32 total;
-	int ret, request;
-
-	/*
-	 * Interrupt count:
-	 *	1 general, "slow path" interrupt (includes the SDMA engines
-	 *		slow source, SDMACleanupDone)
-	 *	N interrupts - one per used SDMA engine
-	 *	M interrupt - one per kernel receive context
-	 *	V interrupt - one for each VNIC context
-	 */
-	total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_vnic_contexts;
-
-	/* ask for MSI-X interrupts */
-	request = request_msix(dd, total);
-	if (request < 0) {
-		ret = request;
-		goto fail;
-	} else {
-		dd->msix_entries = kcalloc(total, sizeof(*dd->msix_entries),
-					   GFP_KERNEL);
-		if (!dd->msix_entries) {
-			ret = -ENOMEM;
-			goto fail;
-		}
-		/* using MSI-X */
-		dd->num_msix_entries = total;
-		dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
-	}
+	int ret;
 
 	/* mask all interrupts */
-	set_intr_state(dd, 0);
+	set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false);
+
 	/* clear all pending interrupts */
 	clear_all_interrupts(dd);
 
 	/* reset general handler mask, chip MSI-X mappings */
 	reset_interrupts(dd);
 
-	ret = request_msix_irqs(dd);
+	/* ask for MSI-X interrupts */
+	ret = msix_initialize(dd);
 	if (ret)
-		goto fail;
+		return ret;
 
-	return 0;
+	ret = msix_request_irqs(dd);
+	if (ret)
+		msix_clean_up_interrupts(dd);
 
-fail:
-	hfi1_clean_up_interrupts(dd);
 	return ret;
 }
 
@@ -13388,7 +13235,7 @@
 	int total_contexts;
 	int ret;
 	unsigned ngroups;
-	int qos_rmt_count;
+	int rmt_count;
 	int user_rmt_reduced;
 	u32 n_usr_ctxts;
 	u32 send_contexts = chip_send_contexts(dd);
@@ -13450,10 +13297,23 @@
 		n_usr_ctxts = rcv_contexts - total_contexts;
 	}
 
-	/* each user context requires an entry in the RMT */
-	qos_rmt_count = qos_rmt_entries(dd, NULL, NULL);
-	if (qos_rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) {
-		user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count;
+	/*
+	 * The RMT entries are currently allocated as shown below:
+	 * 1. QOS (0 to 128 entries);
+	 * 2. FECN (num_kernel_context - 1 + num_user_contexts +
+	 *    num_vnic_contexts);
+	 * 3. VNIC (num_vnic_contexts).
+	 * It should be noted that FECN oversubscribes num_vnic_contexts
+	 * entries of the RMT because both VNIC and PSM could allocate any
+	 * receive context between dd->first_dyn_alloc_ctxt and
+	 * dd->num_rcv_contexts, and PSM FECN must reserve an RMT entry for
+	 * each possible PSM receive context.
+	 */
+	rmt_count = qos_rmt_entries(dd, NULL, NULL) + (num_vnic_contexts * 2);
+	if (HFI1_CAP_IS_KSET(TID_RDMA))
+		rmt_count += num_kernel_contexts - 1;
+	if (rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) {
+		user_rmt_reduced = NUM_MAP_ENTRIES - rmt_count;
 		dd_dev_err(dd,
 			   "RMT size is reducing the number of user receive contexts from %u to %d\n",
 			   n_usr_ctxts,
@@ -14174,6 +14034,19 @@
 }
 
 /**
+ * hfi1_get_qp_map
+ * @dd: device data
+ * @idx: index to read
+ */
+u8 hfi1_get_qp_map(struct hfi1_devdata *dd, u8 idx)
+{
+	u64 reg = read_csr(dd, RCV_QP_MAP_TABLE + (idx / 8) * 8);
+
+	reg >>= (idx % 8) * 8;
+	return reg;
+}
+
+/**
  * init_qpmap_table
  * @dd - device data
  * @first_ctxt - first context
@@ -14434,35 +14307,43 @@
 	init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1);
 }
 
-static void init_user_fecn_handling(struct hfi1_devdata *dd,
-				    struct rsm_map_table *rmt)
+static void init_fecn_handling(struct hfi1_devdata *dd,
+			       struct rsm_map_table *rmt)
 {
 	struct rsm_rule_data rrd;
 	u64 reg;
-	int i, idx, regoff, regidx;
+	int i, idx, regoff, regidx, start;
 	u8 offset;
+	u32 total_cnt;
+
+	if (HFI1_CAP_IS_KSET(TID_RDMA))
+		/* Exclude context 0 */
+		start = 1;
+	else
+		start = dd->first_dyn_alloc_ctxt;
+
+	total_cnt = dd->num_rcv_contexts - start;
 
 	/* there needs to be enough room in the map table */
-	if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) {
-		dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
+	if (rmt->used + total_cnt >= NUM_MAP_ENTRIES) {
+		dd_dev_err(dd, "FECN handling disabled - too many contexts allocated\n");
 		return;
 	}
 
 	/*
 	 * RSM will extract the destination context as an index into the
 	 * map table.  The destination contexts are a sequential block
-	 * in the range first_dyn_alloc_ctxt...num_rcv_contexts-1 (inclusive).
+	 * in the range start...num_rcv_contexts-1 (inclusive).
 	 * Map entries are accessed as offset + extracted value.  Adjust
 	 * the added offset so this sequence can be placed anywhere in
 	 * the table - as long as the entries themselves do not wrap.
 	 * There are only enough bits in offset for the table size, so
 	 * start with that to allow for a "negative" offset.
 	 */
-	offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used -
-						(int)dd->first_dyn_alloc_ctxt);
+	offset = (u8)(NUM_MAP_ENTRIES + rmt->used - start);
 
-	for (i = dd->first_dyn_alloc_ctxt, idx = rmt->used;
-				i < dd->num_rcv_contexts; i++, idx++) {
+	for (i = start, idx = rmt->used; i < dd->num_rcv_contexts;
+	     i++, idx++) {
 		/* replace with identity mapping */
 		regoff = (idx % 8) * 8;
 		regidx = idx / 8;
@@ -14497,7 +14378,7 @@
 	/* add rule 1 */
 	add_rsm_rule(dd, RSM_INS_FECN, &rrd);
 
-	rmt->used += dd->num_user_contexts;
+	rmt->used += total_cnt;
 }
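
The "negative" offset arithmetic above is easier to follow with numbers (assumed, not from the patch; NUM_MAP_ENTRIES taken as 256, which the u8 offset implies):

	/*
	 * With start = 1 and rmt->used = 40:
	 *   offset = (u8)(256 + 40 - 1) = (u8)295 = 39
	 * RSM selects map entry (offset + context) modulo the table size,
	 * so context i lands on entry (39 + i) % 256 = rmt->used + (i - start)
	 * -- exactly the block the loop above populated, wherever that
	 * block happens to sit in the table.
	 */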
 
 /* Initialize RSM for VNIC */
@@ -14573,7 +14454,7 @@
 		clear_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
 }
 
-static void init_rxe(struct hfi1_devdata *dd)
+static int init_rxe(struct hfi1_devdata *dd)
 {
 	struct rsm_map_table *rmt;
 	u64 val;
@@ -14582,9 +14463,12 @@
 	write_csr(dd, RCV_ERR_MASK, ~0ull);
 
 	rmt = alloc_rsm_map_table(dd);
+	if (!rmt)
+		return -ENOMEM;
+
 	/* set up QOS, including the QPN map table */
 	init_qos(dd, rmt);
-	init_user_fecn_handling(dd, rmt);
+	init_fecn_handling(dd, rmt);
 	complete_rsm_map_table(dd, rmt);
 	/* record number of used rsm map entries for vnic */
 	dd->vnic.rmt_start = rmt->used;
@@ -14608,6 +14492,7 @@
 	val |= ((4ull & RCV_BYPASS_HDR_SIZE_MASK) <<
 		RCV_BYPASS_HDR_SIZE_SHIFT);
 	write_csr(dd, RCV_BYPASS, val);
+	return 0;
 }
 
 static void init_other(struct hfi1_devdata *dd)
@@ -14810,8 +14695,8 @@
  */
 static int init_asic_data(struct hfi1_devdata *dd)
 {
-	unsigned long flags;
-	struct hfi1_devdata *tmp, *peer = NULL;
+	unsigned long index;
+	struct hfi1_devdata *peer;
 	struct hfi1_asic_data *asic_data;
 	int ret = 0;
 
@@ -14820,14 +14705,12 @@
 	if (!asic_data)
 		return -ENOMEM;
 
-	spin_lock_irqsave(&hfi1_devs_lock, flags);
+	xa_lock_irq(&hfi1_dev_table);
 	/* Find our peer device */
-	list_for_each_entry(tmp, &hfi1_dev_list, list) {
-		if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(tmp)) &&
-		    dd->unit != tmp->unit) {
-			peer = tmp;
+	xa_for_each(&hfi1_dev_table, index, peer) {
+		if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(peer)) &&
+		    dd->unit != peer->unit)
 			break;
-		}
 	}
 
 	if (peer) {
@@ -14839,7 +14722,7 @@
 		mutex_init(&dd->asic_data->asic_resource_mutex);
 	}
 	dd->asic_data->dds[dd->hfi1_id] = dd; /* self back-pointer */
-	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+	xa_unlock_irq(&hfi1_dev_table);
 
 	/* first one through - set up i2c devices */
 	if (!peer)
@@ -14919,20 +14802,16 @@
 }
 
 /**
- * Allocate and initialize the device structure for the hfi.
+ * hfi1_init_dd() - Initialize most of the dd structure.
  * @dev: the pci_dev for hfi1_ib device
  * @ent: pci_device_id struct for this dev
  *
- * Also allocates, initializes, and returns the devdata struct for this
- * device instance
- *
  * This is global, and is called directly at init to set up the
  * chip-specific function pointers for later use.
  */
-struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
-				  const struct pci_device_id *ent)
+int hfi1_init_dd(struct hfi1_devdata *dd)
 {
-	struct hfi1_devdata *dd;
+	struct pci_dev *pdev = dd->pcidev;
 	struct hfi1_pportdata *ppd;
 	u64 reg;
 	int i, ret;
@@ -14943,13 +14822,8 @@
 		"Functional simulator"
 	};
 	struct pci_dev *parent = pdev->bus->self;
-	u32 sdma_engines;
+	u32 sdma_engines = chip_sdma_engines(dd);
 
-	dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
-				sizeof(struct hfi1_pportdata));
-	if (IS_ERR(dd))
-		goto bail;
-	sdma_engines = chip_sdma_engines(dd);
 	ppd = dd->pport;
 	for (i = 0; i < dd->num_pports; i++, ppd++) {
 		int vl;
@@ -15128,6 +15002,12 @@
 	if (ret)
 		goto bail_cleanup;
 
+	/*
+	 * This should probably occur in hfi1_pcie_init(), but historically
+	 * occurs after the do_pcie_gen3_transition() code.
+	 */
+	tune_pcie_caps(dd);
+
 	/* start setting dd values and adjusting CSRs */
 	init_early_variables(dd);
 
@@ -15150,7 +15030,10 @@
 		goto bail_cleanup;
 
 	/* set initial RXE CSRs */
-	init_rxe(dd);
+	ret = init_rxe(dd);
+	if (ret)
+		goto bail_cleanup;
+
 	/* set initial TXE CSRs */
 	init_txe(dd);
 	/* set initial non-RXE, non-TXE CSRs */
@@ -15240,14 +15123,13 @@
 	free_cntrs(dd);
 bail_clear_intr:
 	hfi1_comp_vectors_clean_up(dd);
-	hfi1_clean_up_interrupts(dd);
+	msix_clean_up_interrupts(dd);
 bail_cleanup:
 	hfi1_pcie_ddcleanup(dd);
 bail_free:
 	hfi1_free_devdata(dd);
-	dd = ERR_PTR(ret);
 bail:
-	return dd;
+	return ret;
 }
 
 static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate,
diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h
index 36b04d6..4ca5ac8 100644
--- a/drivers/infiniband/hw/hfi1/chip.h
+++ b/drivers/infiniband/hw/hfi1/chip.h
@@ -1,7 +1,7 @@
 #ifndef _CHIP_H
 #define _CHIP_H
 /*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -52,9 +52,7 @@
  */
 
 /* sizes */
-#define CCE_NUM_MSIX_VECTORS 256
-#define CCE_NUM_INT_CSRS 12
-#define CCE_NUM_INT_MAP_CSRS 96
+#define BITS_PER_REGISTER (BITS_PER_BYTE * sizeof(u64))
 #define NUM_INTERRUPT_SOURCES 768
 #define RXE_NUM_CONTEXTS 160
 #define RXE_PER_CONTEXT_SIZE 0x1000	/* 4k */
@@ -161,34 +159,49 @@
 	(CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \
 	CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT)
 
-/* interrupt source numbers */
-#define IS_GENERAL_ERR_START	  0
-#define IS_SDMAENG_ERR_START	 16
-#define IS_SENDCTXT_ERR_START	 32
-#define IS_SDMA_START		192 /* includes SDmaProgress,SDmaIdle */
+/* Specific IRQ sources */
+#define CCE_ERR_INT		  0
+#define RXE_ERR_INT		  1
+#define MISC_ERR_INT		  2
+#define PIO_ERR_INT		  4
+#define SDMA_ERR_INT		  5
+#define EGRESS_ERR_INT		  6
+#define TXE_ERR_INT		  7
+#define PBC_INT			240
+#define GPIO_ASSERT_INT		241
+#define QSFP1_INT		242
+#define QSFP2_INT		243
+#define TCRIT_INT		244
+
+/* interrupt source ranges */
+#define IS_FIRST_SOURCE		CCE_ERR_INT
+#define IS_GENERAL_ERR_START		  0
+#define IS_SDMAENG_ERR_START		 16
+#define IS_SENDCTXT_ERR_START		 32
+#define IS_SDMA_START			192
+#define IS_SDMA_PROGRESS_START		208
+#define IS_SDMA_IDLE_START		224
 #define IS_VARIOUS_START		240
 #define IS_DC_START			248
 #define IS_RCVAVAIL_START		256
 #define IS_RCVURGENT_START		416
 #define IS_SENDCREDIT_START		576
 #define IS_RESERVED_START		736
-#define IS_MAX_SOURCES		768
+#define IS_LAST_SOURCE			767
 
 /* derived interrupt source values */
-#define IS_GENERAL_ERR_END		IS_SDMAENG_ERR_START
-#define IS_SDMAENG_ERR_END		IS_SENDCTXT_ERR_START
-#define IS_SENDCTXT_ERR_END		IS_SDMA_START
-#define IS_SDMA_END			IS_VARIOUS_START
-#define IS_VARIOUS_END		IS_DC_START
-#define IS_DC_END			IS_RCVAVAIL_START
-#define IS_RCVAVAIL_END		IS_RCVURGENT_START
-#define IS_RCVURGENT_END		IS_SENDCREDIT_START
-#define IS_SENDCREDIT_END		IS_RESERVED_START
-#define IS_RESERVED_END		IS_MAX_SOURCES
-
-/* absolute interrupt numbers for QSFP1Int and QSFP2Int */
-#define QSFP1_INT		242
-#define QSFP2_INT		243
+#define IS_GENERAL_ERR_END		7
+#define IS_SDMAENG_ERR_END		31
+#define IS_SENDCTXT_ERR_END		191
+#define IS_SDMA_END			207
+#define IS_SDMA_PROGRESS_END		223
+#define IS_SDMA_IDLE_END		239
+#define IS_VARIOUS_END			244
+#define IS_DC_END			255
+#define IS_RCVAVAIL_END			415
+#define IS_RCVURGENT_END		575
+#define IS_SENDCREDIT_END		735
+#define IS_RESERVED_END			IS_LAST_SOURCE
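
The set_intr_bits() implementation lives in chip.c and is not part of this hunk; a plausible sketch of the per-source index math these ranges and the new BITS_PER_REGISTER constant support (helper name hypothetical):

	/* sketch only: unmask or mask a single interrupt source */
	static void __set_one_intr_source(struct hfi1_devdata *dd,
					  u16 src, bool set)
	{
		u32 word = src / BITS_PER_REGISTER; /* which CCE_INT_MASK CSR */
		u64 bit = 1ull << (src % BITS_PER_REGISTER);
		u64 reg = read_csr(dd, CCE_INT_MASK + (8 * word));

		write_csr(dd, CCE_INT_MASK + (8 * word),
			  set ? (reg | bit) : (reg & ~bit));
	}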
 
 /* DCC_CFG_PORT_CONFIG logical link states */
 #define LSTATE_DOWN    0x1
@@ -791,6 +804,7 @@
 u32 hdrqempty(struct hfi1_ctxtdata *rcd);
 int is_ax(struct hfi1_devdata *dd);
 int is_bx(struct hfi1_devdata *dd);
+bool is_urg_masked(struct hfi1_ctxtdata *rcd);
 u32 read_physical_state(struct hfi1_devdata *dd);
 u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
 const char *opa_lstate_name(u32 lstate);
@@ -844,6 +858,9 @@
 /* Per device counter indexes */
 enum {
 	C_RCV_OVF = 0,
+	C_RX_LEN_ERR,
+	C_RX_ICRC_ERR,
+	C_RX_EBP,
 	C_RX_TID_FULL,
 	C_RX_TID_INVALID,
 	C_RX_TID_FLGMS,
@@ -913,6 +930,7 @@
 	C_SW_PIO_WAIT,
 	C_SW_PIO_DRAIN,
 	C_SW_KMEM_WAIT,
+	C_SW_TID_WAIT,
 	C_SW_SEND_SCHED,
 	C_SDMA_DESC_FETCHED_CNT,
 	C_SDMA_INT_CNT,
@@ -1227,6 +1245,7 @@
 	C_SW_IBP_RDMA_SEQ,
 	C_SW_IBP_UNALIGNED,
 	C_SW_IBP_SEQ_NAK,
+	C_SW_IBP_RC_CRWAITS,
 	C_SW_CPU_RC_ACKS,
 	C_SW_CPU_RC_QACKS,
 	C_SW_CPU_RC_DELAYED_COMP,
@@ -1416,6 +1435,19 @@
 void hfi1_init_vnic_rsm(struct hfi1_devdata *dd);
 void hfi1_deinit_vnic_rsm(struct hfi1_devdata *dd);
 
+irqreturn_t general_interrupt(int irq, void *data);
+irqreturn_t sdma_interrupt(int irq, void *data);
+irqreturn_t receive_context_interrupt(int irq, void *data);
+irqreturn_t receive_context_thread(int irq, void *data);
+
+int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set);
+void init_qsfp_int(struct hfi1_devdata *dd);
+void clear_all_interrupts(struct hfi1_devdata *dd);
+void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr);
+void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr);
+void reset_interrupts(struct hfi1_devdata *dd);
+u8 hfi1_get_qp_map(struct hfi1_devdata *dd, u8 idx);
+
 /*
  * Interrupt source table.
  *
diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h
index ee6dca5..ab3589d 100644
--- a/drivers/infiniband/hw/hfi1/chip_registers.h
+++ b/drivers/infiniband/hw/hfi1/chip_registers.h
@@ -380,6 +380,9 @@
 #define DC_LCB_PRF_TX_FLIT_CNT (DC_LCB_CSRS + 0x000000000418)
 #define DC_LCB_STS_LINK_TRANSFER_ACTIVE (DC_LCB_CSRS + 0x000000000468)
 #define DC_LCB_STS_ROUND_TRIP_LTP_CNT (DC_LCB_CSRS + 0x0000000004B0)
+#define RCV_LENGTH_ERR_CNT 0
+#define RCV_ICRC_ERR_CNT 6
+#define RCV_EBP_CNT 9
 #define RCV_BUF_OVFL_CNT 10
 #define RCV_CONTEXT_EGR_STALL 22
 #define RCV_DATA_PKT_CNT 0
@@ -878,6 +881,10 @@
 #define SEND_CTRL (TXE + 0x000000000000)
 #define SEND_CTRL_CM_RESET_SMASK 0x4ull
 #define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull
+#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3
+#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xFFull
+#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \
+		<< SEND_CTRL_UNSUPPORTED_VL_SHIFT)
 #define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull
 #define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080)
 #define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
@@ -931,6 +938,10 @@
 #define SEND_CTXT_CREDIT_CTRL_THRESHOLD_MASK 0x7FFull
 #define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SHIFT 0
 #define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SMASK 0x7FFull
+#define SEND_CTXT_CREDIT_STATUS (TXE + 0x000000100018)
+#define SEND_CTXT_CREDIT_STATUS_CURRENT_FREE_COUNTER_MASK 0x7FFull
+#define SEND_CTXT_CREDIT_STATUS_CURRENT_FREE_COUNTER_SHIFT 32
+#define SEND_CTXT_CREDIT_STATUS_LAST_RETURNED_COUNTER_SMASK 0x7FFull
 #define SEND_CTXT_CREDIT_FORCE (TXE + 0x000000100028)
 #define SEND_CTXT_CREDIT_FORCE_FORCE_RETURN_SMASK 0x1ull
 #define SEND_CTXT_CREDIT_RETURN_ADDR (TXE + 0x000000100020)
diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h
index 7108d4d..d47da7b 100644
--- a/drivers/infiniband/hw/hfi1/common.h
+++ b/drivers/infiniband/hw/hfi1/common.h
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015, 2016 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -136,18 +136,21 @@
 				  HFI1_CAP_ALLOW_PERM_JKEY |		\
 				  HFI1_CAP_STATIC_RATE_CTRL |		\
 				  HFI1_CAP_PRINT_UNIMPL |		\
-				  HFI1_CAP_TID_UNMAP)
+				  HFI1_CAP_TID_UNMAP |			\
+				  HFI1_CAP_OPFN)
 /*
  * A set of capability bits that are "global" and are not allowed to be
  * set in the user bitmask.
  */
 #define HFI1_CAP_RESERVED_MASK   ((HFI1_CAP_SDMA |			\
-				  HFI1_CAP_USE_SDMA_HEAD |		\
-				  HFI1_CAP_EXTENDED_PSN |		\
-				  HFI1_CAP_PRINT_UNIMPL |		\
-				  HFI1_CAP_NO_INTEGRITY |		\
-				  HFI1_CAP_PKEY_CHECK) <<		\
-				 HFI1_CAP_USER_SHIFT)
+				   HFI1_CAP_USE_SDMA_HEAD |		\
+				   HFI1_CAP_EXTENDED_PSN |		\
+				   HFI1_CAP_PRINT_UNIMPL |		\
+				   HFI1_CAP_NO_INTEGRITY |		\
+				   HFI1_CAP_PKEY_CHECK |		\
+				   HFI1_CAP_TID_RDMA |			\
+				   HFI1_CAP_OPFN) <<			\
+				  HFI1_CAP_USER_SHIFT)
 /*
  * Set of capabilities that need to be enabled for kernel context in
  * order to be allowed for user contexts, as well.
@@ -283,7 +286,7 @@
 #define RHF_TID_ERR		(0x1ull << 59)
 #define RHF_LEN_ERR		(0x1ull << 60)
 #define RHF_ECC_ERR		(0x1ull << 61)
-#define RHF_VCRC_ERR		(0x1ull << 62)
+#define RHF_RESERVED		(0x1ull << 62)
 #define RHF_ICRC_ERR		(0x1ull << 63)
 
 #define RHF_ERROR_SMASK 0xffe0000000000000ull		/* bits 63:53 */
@@ -337,6 +340,10 @@
 
 #define HFI1_PSM_IOC_BASE_SEQ 0x0
 
+/* Number of BTH.PSN bits used for sequence number in expected rcvs */
+#define HFI1_KDETH_BTH_SEQ_SHIFT 11
+#define HFI1_KDETH_BTH_SEQ_MASK (BIT(HFI1_KDETH_BTH_SEQ_SHIFT) - 1)
+
 static inline __u64 rhf_to_cpu(const __le32 *rbuf)
 {
 	return __le64_to_cpu(*((__le64 *)rbuf));
diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c
index 9f992ae..d268bf9 100644
--- a/drivers/infiniband/hw/hfi1/debugfs.c
+++ b/drivers/infiniband/hw/hfi1/debugfs.c
@@ -407,6 +407,54 @@
 DEBUGFS_SEQ_FILE_OPEN(rcds)
 DEBUGFS_FILE_OPS(rcds);
 
+static void *_pios_seq_start(struct seq_file *s, loff_t *pos)
+{
+	struct hfi1_ibdev *ibd;
+	struct hfi1_devdata *dd;
+
+	ibd = (struct hfi1_ibdev *)s->private;
+	dd = dd_from_dev(ibd);
+	if (!dd->send_contexts || *pos >= dd->num_send_contexts)
+		return NULL;
+	return pos;
+}
+
+static void *_pios_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+	struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+	++*pos;
+	if (!dd->send_contexts || *pos >= dd->num_send_contexts)
+		return NULL;
+	return pos;
+}
+
+static void _pios_seq_stop(struct seq_file *s, void *v)
+{
+}
+
+static int _pios_seq_show(struct seq_file *s, void *v)
+{
+	struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+	struct hfi1_devdata *dd = dd_from_dev(ibd);
+	struct send_context_info *sci;
+	loff_t *spos = v;
+	loff_t i = *spos;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->sc_lock, flags);
+	sci = &dd->send_contexts[i];
+	if (sci && sci->type != SC_USER && sci->allocated && sci->sc)
+		seqfile_dump_sci(s, i, sci);
+	spin_unlock_irqrestore(&dd->sc_lock, flags);
+	return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(pios);
+DEBUGFS_SEQ_FILE_OPEN(pios)
+DEBUGFS_FILE_OPS(pios);
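
The three _pios_seq_* callbacks follow the standard seq_file iterator contract: start()/next() return a cursor (here the position pointer itself) or NULL to stop, and show() renders one send context under sc_lock. Per the DEBUGFS_SEQ_FILE_OPS macro in debugfs.h, the first of the three lines above expands to roughly:

	static const struct seq_operations _pios_seq_ops = {
		.start = _pios_seq_start,
		.next  = _pios_seq_next,
		.stop  = _pios_seq_stop,
		.show  = _pios_seq_show,
	};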
+
 /* read the per-device counters */
 static ssize_t dev_counters_read(struct file *file, char __user *buf,
 				 size_t count, loff_t *ppos)
@@ -939,9 +987,6 @@
 	struct hfi1_pportdata *ppd;
 	int ret;
 
-	if (!try_module_get(THIS_MODULE))
-		return -ENODEV;
-
 	ppd = private2ppd(fp);
 
 	ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0);
@@ -1032,10 +1077,82 @@
 	return __qsfp_debugfs_release(in, fp, 1);
 }
 
+#define EXPROM_WRITE_ENABLE BIT_ULL(14)
+
+static bool exprom_wp_disabled;
+
+static int exprom_wp_set(struct hfi1_devdata *dd, bool disable)
+{
+	u64 gpio_val = 0;
+
+	if (disable) {
+		gpio_val = EXPROM_WRITE_ENABLE;
+		exprom_wp_disabled = true;
+		dd_dev_info(dd, "Disable Expansion ROM Write Protection\n");
+	} else {
+		exprom_wp_disabled = false;
+		dd_dev_info(dd, "Enable Expansion ROM Write Protection\n");
+	}
+
+	write_csr(dd, ASIC_GPIO_OUT, gpio_val);
+	write_csr(dd, ASIC_GPIO_OE, gpio_val);
+
+	return 0;
+}
+
+static ssize_t exprom_wp_debugfs_read(struct file *file, char __user *buf,
+				      size_t count, loff_t *ppos)
+{
+	return 0;
+}
+
+static ssize_t exprom_wp_debugfs_write(struct file *file,
+				       const char __user *buf, size_t count,
+				       loff_t *ppos)
+{
+	struct hfi1_pportdata *ppd = private2ppd(file);
+	char cdata;
+
+	if (count != 1)
+		return -EINVAL;
+	if (get_user(cdata, buf))
+		return -EFAULT;
+	if (cdata == '0')
+		exprom_wp_set(ppd->dd, false);
+	else if (cdata == '1')
+		exprom_wp_set(ppd->dd, true);
+	else
+		return -EINVAL;
+
+	return 1;
+}
+
+static unsigned long exprom_in_use;
+
+static int exprom_wp_debugfs_open(struct inode *in, struct file *fp)
+{
+	if (test_and_set_bit(0, &exprom_in_use))
+		return -EBUSY;
+
+	return 0;
+}
+
+static int exprom_wp_debugfs_release(struct inode *in, struct file *fp)
+{
+	struct hfi1_pportdata *ppd = private2ppd(fp);
+
+	if (exprom_wp_disabled)
+		exprom_wp_set(ppd->dd, false);
+	clear_bit(0, &exprom_in_use);
+
+	return 0;
+}
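
Taken together, these handlers implement a single-opener debugfs file: writing the character '1' drops the expansion ROM write protection by driving the GPIO pin, '0' restores it, and release() re-arms protection if the last writer left it disabled. Userspace usage might look like this (debugfs path assumed):

	/*
	 *   echo 1 > /sys/kernel/debug/hfi1/<unit>/exprom_wp    # WP off
	 *   ... update the expansion ROM ...
	 *   echo 0 > /sys/kernel/debug/hfi1/<unit>/exprom_wp    # WP on
	 */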
+
 #define DEBUGFS_OPS(nm, readroutine, writeroutine)	\
 { \
 	.name = nm, \
 	.ops = { \
+		.owner = THIS_MODULE, \
 		.read = readroutine, \
 		.write = writeroutine, \
 		.llseek = generic_file_llseek, \
@@ -1046,6 +1163,7 @@
 { \
 	.name = nm, \
 	.ops = { \
+		.owner = THIS_MODULE, \
 		.read = readf, \
 		.write = writef, \
 		.llseek = generic_file_llseek, \
@@ -1071,6 +1189,9 @@
 		     qsfp1_debugfs_open, qsfp1_debugfs_release),
 	DEBUGFS_XOPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write,
 		     qsfp2_debugfs_open, qsfp2_debugfs_release),
+	DEBUGFS_XOPS("exprom_wp", exprom_wp_debugfs_read,
+		     exprom_wp_debugfs_write, exprom_wp_debugfs_open,
+		     exprom_wp_debugfs_release),
 	DEBUGFS_OPS("asic_flags", asic_flags_read, asic_flags_write),
 	DEBUGFS_OPS("dc8051_memory", dc8051_memory_read, NULL),
 	DEBUGFS_OPS("lcb", debugfs_lcb_read, debugfs_lcb_write),
@@ -1119,6 +1240,7 @@
 	char link[10];
 	struct hfi1_devdata *dd = dd_from_dev(ibd);
 	struct hfi1_pportdata *ppd;
+	struct dentry *root;
 	int unit = dd->unit;
 	int i, j;
 
@@ -1126,30 +1248,29 @@
 		return;
 	snprintf(name, sizeof(name), "%s_%d", class_name(), unit);
 	snprintf(link, sizeof(link), "%d", unit);
-	ibd->hfi1_ibdev_dbg = debugfs_create_dir(name, hfi1_dbg_root);
-	if (!ibd->hfi1_ibdev_dbg) {
-		pr_warn("create of %s failed\n", name);
-		return;
-	}
+	root = debugfs_create_dir(name, hfi1_dbg_root);
+	ibd->hfi1_ibdev_dbg = root;
+
 	ibd->hfi1_ibdev_link =
 		debugfs_create_symlink(link, hfi1_dbg_root, name);
-	if (!ibd->hfi1_ibdev_link) {
-		pr_warn("create of %s symlink failed\n", name);
-		return;
-	}
-	DEBUGFS_SEQ_FILE_CREATE(opcode_stats, ibd->hfi1_ibdev_dbg, ibd);
-	DEBUGFS_SEQ_FILE_CREATE(tx_opcode_stats, ibd->hfi1_ibdev_dbg, ibd);
-	DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd);
-	DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd);
-	DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd);
-	DEBUGFS_SEQ_FILE_CREATE(rcds, ibd->hfi1_ibdev_dbg, ibd);
-	DEBUGFS_SEQ_FILE_CREATE(sdma_cpu_list, ibd->hfi1_ibdev_dbg, ibd);
+
+	debugfs_create_file("opcode_stats", 0444, root, ibd,
+			    &_opcode_stats_file_ops);
+	debugfs_create_file("tx_opcode_stats", 0444, root, ibd,
+			    &_tx_opcode_stats_file_ops);
+	debugfs_create_file("ctx_stats", 0444, root, ibd, &_ctx_stats_file_ops);
+	debugfs_create_file("qp_stats", 0444, root, ibd, &_qp_stats_file_ops);
+	debugfs_create_file("sdes", 0444, root, ibd, &_sdes_file_ops);
+	debugfs_create_file("rcds", 0444, root, ibd, &_rcds_file_ops);
+	debugfs_create_file("pios", 0444, root, ibd, &_pios_file_ops);
+	debugfs_create_file("sdma_cpu_list", 0444, root, ibd,
+			    &_sdma_cpu_list_file_ops);
+
 	/* dev counter files */
 	for (i = 0; i < ARRAY_SIZE(cntr_ops); i++)
-		DEBUGFS_FILE_CREATE(cntr_ops[i].name,
-				    ibd->hfi1_ibdev_dbg,
-				    dd,
-				    &cntr_ops[i].ops, S_IRUGO);
+		debugfs_create_file(cntr_ops[i].name, 0444, root, dd,
+				    &cntr_ops[i].ops);
+
 	/* per port files */
 	for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++)
 		for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) {
@@ -1157,12 +1278,11 @@
 				 sizeof(name),
 				 port_cntr_ops[i].name,
 				 j + 1);
-			DEBUGFS_FILE_CREATE(name,
-					    ibd->hfi1_ibdev_dbg,
-					    ppd,
-					    &port_cntr_ops[i].ops,
+			debugfs_create_file(name,
 					    !port_cntr_ops[i].ops.write ?
-					    S_IRUGO : S_IRUGO | S_IWUSR);
+						    S_IRUGO :
+						    S_IRUGO | S_IWUSR,
+					    root, ppd, &port_cntr_ops[i].ops);
 		}
 
 	hfi1_fault_init_debugfs(ibd);
@@ -1255,15 +1375,15 @@
 
 static u64 hfi1_sps_ints(void)
 {
-	unsigned long flags;
+	unsigned long index, flags;
 	struct hfi1_devdata *dd;
 	u64 sps_ints = 0;
 
-	spin_lock_irqsave(&hfi1_devs_lock, flags);
-	list_for_each_entry(dd, &hfi1_dev_list, list) {
+	xa_lock_irqsave(&hfi1_dev_table, flags);
+	xa_for_each(&hfi1_dev_table, index, dd) {
 		sps_ints += get_all_cpu_total(dd->int_counter);
 	}
-	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+	xa_unlock_irqrestore(&hfi1_dev_table, flags);
 	return sps_ints;
 }
 
@@ -1292,10 +1412,10 @@
 void hfi1_dbg_init(void)
 {
 	hfi1_dbg_root  = debugfs_create_dir(DRIVER_NAME, NULL);
-	if (!hfi1_dbg_root)
-		pr_warn("init of debugfs failed\n");
-	DEBUGFS_SEQ_FILE_CREATE(driver_stats_names, hfi1_dbg_root, NULL);
-	DEBUGFS_SEQ_FILE_CREATE(driver_stats, hfi1_dbg_root, NULL);
+	debugfs_create_file("driver_stats_names", 0444, hfi1_dbg_root, NULL,
+			    &_driver_stats_names_file_ops);
+	debugfs_create_file("driver_stats", 0444, hfi1_dbg_root, NULL,
+			    &_driver_stats_file_ops);
 }
 
 void hfi1_dbg_exit(void)
diff --git a/drivers/infiniband/hw/hfi1/debugfs.h b/drivers/infiniband/hw/hfi1/debugfs.h
index d5d8244..57e582c 100644
--- a/drivers/infiniband/hw/hfi1/debugfs.h
+++ b/drivers/infiniband/hw/hfi1/debugfs.h
@@ -49,16 +49,6 @@
 
 struct hfi1_ibdev;
 
-#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode)	\
-do { \
-	struct dentry *ent; \
-	const char *__name = name; \
-	ent = debugfs_create_file(__name, mode, parent, \
-		data, ops); \
-	if (!ent) \
-		pr_warn("create of %s failed\n", __name); \
-} while (0)
-
 #define DEBUGFS_SEQ_FILE_OPS(name) \
 static const struct seq_operations _##name##_seq_ops = { \
 	.start = _##name##_seq_start, \
@@ -89,8 +79,6 @@
 	.release = seq_release \
 }
 
-#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \
-	DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, 0444)
 
 ssize_t hfi1_seq_read(struct file *file, char __user *buf, size_t size,
 		      loff_t *ppos);
diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c
index a41f855..01aa1f1 100644
--- a/drivers/infiniband/hw/hfi1/driver.c
+++ b/drivers/infiniband/hw/hfi1/driver.c
@@ -72,8 +72,6 @@
  */
 const char ib_hfi1_version[] = HFI1_DRIVER_VERSION "\n";
 
-DEFINE_SPINLOCK(hfi1_devs_lock);
-LIST_HEAD(hfi1_dev_list);
 DEFINE_MUTEX(hfi1_mutex);	/* general driver use */
 
 unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
@@ -175,11 +173,11 @@
 {
 	struct hfi1_devdata *dd;
 	struct hfi1_pportdata *ppd;
-	unsigned long flags;
+	unsigned long index, flags;
 	int pidx, nunits_active = 0;
 
-	spin_lock_irqsave(&hfi1_devs_lock, flags);
-	list_for_each_entry(dd, &hfi1_dev_list, list) {
+	xa_lock_irqsave(&hfi1_dev_table, flags);
+	xa_for_each(&hfi1_dev_table, index, dd) {
 		if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase1)
 			continue;
 		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
@@ -190,7 +188,7 @@
 			}
 		}
 	}
-	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+	xa_unlock_irqrestore(&hfi1_dev_table, flags);
 	return nunits_active;
 }
 
@@ -264,7 +262,7 @@
 	    hfi1_dbg_fault_suppress_err(verbs_dev))
 		return;
 
-	if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
+	if (packet->rhf & RHF_ICRC_ERR)
 		return;
 
 	if (packet->etype == RHF_RCV_TYPE_BYPASS) {
@@ -430,40 +428,60 @@
 	[HFI1_PKT_TYPE_16B] = &return_cnp_16B
 };
 
-void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
-			       bool do_cnp)
+/**
+ * hfi1_process_ecn_slowpath - Process FECN or BECN bits
+ * @qp: The packet's destination QP
+ * @pkt: The packet itself.
+ * @prescan: Is the caller the RXQ prescan
+ *
+ * Process the packet's FECN or BECN bits. By the time this is
+ * called, it has already been determined that the bits require
+ * processing.
+ * The significance of the @prescan argument is that if the caller
+ * is the RXQ prescan, a CNP will be sent out instead of waiting for
+ * the normal packet processing to send an ACK with BECN set (or a CNP).
+ */
+bool hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
+			       bool prescan)
 {
 	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
 	struct ib_other_headers *ohdr = pkt->ohdr;
 	struct ib_grh *grh = pkt->grh;
-	u32 rqpn = 0, bth1;
+	u32 rqpn = 0;
 	u16 pkey;
 	u32 rlid, slid, dlid = 0;
-	u8 hdr_type, sc, svc_type;
-	bool is_mcast = false;
+	u8 hdr_type, sc, svc_type, opcode;
+	bool is_mcast = false, ignore_fecn = false, do_cnp = false,
+		fecn, becn;
 
 	/* can be called from prescan */
 	if (pkt->etype == RHF_RCV_TYPE_BYPASS) {
-		is_mcast = hfi1_is_16B_mcast(dlid);
 		pkey = hfi1_16B_get_pkey(pkt->hdr);
 		sc = hfi1_16B_get_sc(pkt->hdr);
 		dlid = hfi1_16B_get_dlid(pkt->hdr);
 		slid = hfi1_16B_get_slid(pkt->hdr);
+		is_mcast = hfi1_is_16B_mcast(dlid);
+		opcode = ib_bth_get_opcode(ohdr);
 		hdr_type = HFI1_PKT_TYPE_16B;
+		fecn = hfi1_16B_get_fecn(pkt->hdr);
+		becn = hfi1_16B_get_becn(pkt->hdr);
 	} else {
-		is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
-			   (dlid != be16_to_cpu(IB_LID_PERMISSIVE));
 		pkey = ib_bth_get_pkey(ohdr);
 		sc = hfi1_9B_get_sc5(pkt->hdr, pkt->rhf);
-		dlid = ib_get_dlid(pkt->hdr);
+		dlid = qp->ibqp.qp_type != IB_QPT_UD ? ib_get_dlid(pkt->hdr) :
+			ppd->lid;
 		slid = ib_get_slid(pkt->hdr);
+		is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
+			   (dlid != be16_to_cpu(IB_LID_PERMISSIVE));
+		opcode = ib_bth_get_opcode(ohdr);
 		hdr_type = HFI1_PKT_TYPE_9B;
+		fecn = ib_bth_get_fecn(ohdr);
+		becn = ib_bth_get_becn(ohdr);
 	}
 
 	switch (qp->ibqp.qp_type) {
 	case IB_QPT_UD:
-		dlid = ppd->lid;
 		rlid = slid;
 		rqpn = ib_get_sqpn(pkt->ohdr);
 		svc_type = IB_CC_SVCTYPE_UD;
@@ -485,22 +503,33 @@
 		svc_type = IB_CC_SVCTYPE_RC;
 		break;
 	default:
-		return;
+		return false;
 	}
 
-	bth1 = be32_to_cpu(ohdr->bth[1]);
+	ignore_fecn = is_mcast || (opcode == IB_OPCODE_CNP) ||
+		(opcode == IB_OPCODE_RC_ACKNOWLEDGE);
+	/*
+	 * ACKNOWLEDGE packets do not get a CNP but this will be
+	 * guarded by ignore_fecn above.
+	 */
+	do_cnp = prescan ||
+		(opcode >= IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST &&
+		 opcode <= IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE) ||
+		opcode == TID_OP(READ_RESP) ||
+		opcode == TID_OP(ACK);
+
 	/* Call appropriate CNP handler */
-	if (do_cnp && (bth1 & IB_FECN_SMASK))
+	if (!ignore_fecn && do_cnp && fecn)
 		hfi1_handle_cnp_tbl[hdr_type](ibp, qp, rqpn, pkey,
 					      dlid, rlid, sc, grh);
 
-	if (!is_mcast && (bth1 & IB_BECN_SMASK)) {
-		u32 lqpn = bth1 & RVT_QPN_MASK;
+	if (becn) {
+		u32 lqpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
 		u8 sl = ibp->sc_to_sl[sc];
 
 		process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
 	}
-
+	return !ignore_fecn && fecn;
 }
 
 struct ps_mdata {
@@ -599,7 +628,6 @@
 		struct rvt_dev_info *rdi = &rcd->dd->verbs_dev.rdi;
 		u64 rhf = rhf_to_cpu(rhf_addr);
 		u32 etype = rhf_rcv_type(rhf), qpn, bth1;
-		int is_ecn = 0;
 		u8 lnh;
 
 		if (ps_done(&mdata, rhf, rcd))
@@ -625,12 +653,10 @@
 			goto next; /* just in case */
 		}
 
-		bth1 = be32_to_cpu(packet->ohdr->bth[1]);
-		is_ecn = !!(bth1 & (IB_FECN_SMASK | IB_BECN_SMASK));
-
-		if (!is_ecn)
+		if (!hfi1_may_ecn(packet))
 			goto next;
 
+		bth1 = be32_to_cpu(packet->ohdr->bth[1]);
 		qpn = bth1 & RVT_QPN_MASK;
 		rcu_read_lock();
 		qp = rvt_lookup_qpn(rdi, &ibp->rvp, qpn);
@@ -640,7 +666,7 @@
 			goto next;
 		}
 
-		process_ecn(qp, packet, true);
+		hfi1_process_ecn_slowpath(qp, packet, true);
 		rcu_read_unlock();
 
 		/* turn off BECN, FECN */
@@ -1400,7 +1426,7 @@
 	if ((!(hfi1_is_16B_mcast(packet->dlid))) &&
 	    (packet->dlid !=
 		opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B))) {
-		if (packet->dlid != ppd->lid)
+		if ((packet->dlid & ~((1 << ppd->lmc) - 1)) != ppd->lid)
 			return -EINVAL;
 	}
 
@@ -1549,25 +1575,31 @@
 	return -EINVAL;
 }
 
-void handle_eflags(struct hfi1_packet *packet)
+static void show_eflags_errs(struct hfi1_packet *packet)
 {
 	struct hfi1_ctxtdata *rcd = packet->rcd;
 	u32 rte = rhf_rcv_type_err(packet->rhf);
 
+	dd_dev_err(rcd->dd,
+		   "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s] rte 0x%x\n",
+		   rcd->ctxt, packet->rhf,
+		   packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
+		   packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
+		   packet->rhf & RHF_DC_ERR ? "dc " : "",
+		   packet->rhf & RHF_TID_ERR ? "tid " : "",
+		   packet->rhf & RHF_LEN_ERR ? "len " : "",
+		   packet->rhf & RHF_ECC_ERR ? "ecc " : "",
+		   packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
+		   rte);
+}
+
+void handle_eflags(struct hfi1_packet *packet)
+{
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+
 	rcv_hdrerr(rcd, rcd->ppd, packet);
 	if (rhf_err_flags(packet->rhf))
-		dd_dev_err(rcd->dd,
-			   "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
-			   rcd->ctxt, packet->rhf,
-			   packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
-			   packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
-			   packet->rhf & RHF_DC_ERR ? "dc " : "",
-			   packet->rhf & RHF_TID_ERR ? "tid " : "",
-			   packet->rhf & RHF_LEN_ERR ? "len " : "",
-			   packet->rhf & RHF_ECC_ERR ? "ecc " : "",
-			   packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
-			   packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
-			   rte);
+		show_eflags_errs(packet);
 }
 
 /*
@@ -1673,11 +1705,14 @@
 	if (unlikely(hfi1_dbg_should_fault_rx(packet)))
 		return RHF_RCV_CONTINUE;
 
-	if (unlikely(rhf_err_flags(packet->rhf)))
-		handle_eflags(packet);
+	if (unlikely(rhf_err_flags(packet->rhf))) {
+		struct hfi1_ctxtdata *rcd = packet->rcd;
 
-	dd_dev_err(packet->rcd->dd,
-		   "Unhandled expected packet received. Dropping.\n");
+		if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
+			return RHF_RCV_CONTINUE;
+	}
+
+	hfi1_kdeth_expected_rcv(packet);
 	return RHF_RCV_CONTINUE;
 }
 
@@ -1686,11 +1721,17 @@
 	hfi1_setup_9B_packet(packet);
 	if (unlikely(hfi1_dbg_should_fault_rx(packet)))
 		return RHF_RCV_CONTINUE;
-	if (unlikely(rhf_err_flags(packet->rhf)))
-		handle_eflags(packet);
 
-	dd_dev_err(packet->rcd->dd,
-		   "Unhandled eager packet received. Dropping.\n");
+	trace_hfi1_rcvhdr(packet);
+	if (unlikely(rhf_err_flags(packet->rhf))) {
+		struct hfi1_ctxtdata *rcd = packet->rcd;
+
+		show_eflags_errs(packet);
+		if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
+			return RHF_RCV_CONTINUE;
+	}
+
+	hfi1_kdeth_eager_rcv(packet);
 	return RHF_RCV_CONTINUE;
 }
 
diff --git a/drivers/infiniband/hw/hfi1/exp_rcv.c b/drivers/infiniband/hw/hfi1/exp_rcv.c
index 1be49a0..e9d5cc8 100644
--- a/drivers/infiniband/hw/hfi1/exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/exp_rcv.c
@@ -112,9 +112,6 @@
  */
 void hfi1_free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd)
 {
-	WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_full_list));
-	WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_used_list));
-
 	kfree(rcd->groups);
 	rcd->groups = NULL;
 	hfi1_exp_tid_group_init(rcd);
diff --git a/drivers/infiniband/hw/hfi1/fault.c b/drivers/infiniband/hw/hfi1/fault.c
index e2290f3..986c121 100644
--- a/drivers/infiniband/hw/hfi1/fault.c
+++ b/drivers/infiniband/hw/hfi1/fault.c
@@ -141,18 +141,21 @@
 	if (!data)
 		return -ENOMEM;
 	copy = min(len, datalen - 1);
-	if (copy_from_user(data, buf, copy))
-		return -EFAULT;
+	if (copy_from_user(data, buf, copy)) {
+		ret = -EFAULT;
+		goto free_data;
+	}
 
 	ret = debugfs_file_get(file->f_path.dentry);
 	if (unlikely(ret))
-		return ret;
+		goto free_data;
 	ptr = data;
 	token = ptr;
 	for (ptr = data; *ptr; ptr = end + 1, token = ptr) {
 		char *dash;
 		unsigned long range_start, range_end, i;
 		bool remove = false;
+		unsigned long bound = 1U << BITS_PER_BYTE;
 
 		end = strchr(ptr, ',');
 		if (end)
@@ -178,6 +181,10 @@
 				    BITS_PER_BYTE);
 			break;
 		}
+		/* Check the inputs */
+		if (range_start >= bound || range_end >= bound)
+			break;
+
 		for (i = range_start; i <= range_end; i++) {
 			if (remove)
 				clear_bit(i, fault->opcodes);
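
The new bound guards the set_bit()/clear_bit() calls in the loop: the opcodes bitmap holds exactly 1 << BITS_PER_BYTE = 256 bits, one per 8-bit opcode, so an out-of-range token previously indexed past the bitmap. The accepted syntax, inferred from the tokenizer above, is comma-separated values or dash ranges:

	/*
	 *   echo "3-7,64" > .../fault/opcodes    # set bits 3..7 and 64
	 * A value >= 256 now terminates parsing instead of writing
	 * beyond fault->opcodes.
	 */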
@@ -190,6 +197,7 @@
 	ret = len;
 
 	debugfs_file_put(file->f_path.dentry);
+free_data:
 	kfree(data);
 	return ret;
 }
@@ -209,7 +217,7 @@
 		return -ENOMEM;
 	ret = debugfs_file_get(file->f_path.dentry);
 	if (unlikely(ret))
-		return ret;
+		goto free_data;
 	bit = find_first_bit(fault->opcodes, bitsize);
 	while (bit < bitsize) {
 		zero = find_next_zero_bit(fault->opcodes, bitsize, bit);
@@ -227,6 +235,7 @@
 	data[size - 1] = '\n';
 	data[size] = '\0';
 	ret = simple_read_from_buffer(buf, len, pos, data, size);
+free_data:
 	kfree(data);
 	return ret;
 }
@@ -250,6 +259,7 @@
 int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd)
 {
 	struct dentry *parent = ibd->hfi1_ibdev_dbg;
+	struct dentry *fault_dir;
 
 	ibd->fault = kzalloc(sizeof(*ibd->fault), GFP_KERNEL);
 	if (!ibd->fault)
@@ -269,45 +279,31 @@
 	bitmap_zero(ibd->fault->opcodes,
 		    sizeof(ibd->fault->opcodes) * BITS_PER_BYTE);
 
-	ibd->fault->dir =
-		fault_create_debugfs_attr("fault", parent,
-					  &ibd->fault->attr);
-	if (IS_ERR(ibd->fault->dir)) {
+	fault_dir =
+		fault_create_debugfs_attr("fault", parent, &ibd->fault->attr);
+	if (IS_ERR(fault_dir)) {
 		kfree(ibd->fault);
 		ibd->fault = NULL;
 		return -ENOENT;
 	}
+	ibd->fault->dir = fault_dir;
 
-	DEBUGFS_SEQ_FILE_CREATE(fault_stats, ibd->fault->dir, ibd);
-	if (!debugfs_create_bool("enable", 0600, ibd->fault->dir,
-				 &ibd->fault->enable))
-		goto fail;
-	if (!debugfs_create_bool("suppress_err", 0600,
-				 ibd->fault->dir,
-				 &ibd->fault->suppress_err))
-		goto fail;
-	if (!debugfs_create_bool("opcode_mode", 0600, ibd->fault->dir,
-				 &ibd->fault->opcode))
-		goto fail;
-	if (!debugfs_create_file("opcodes", 0600, ibd->fault->dir,
-				 ibd->fault, &__fault_opcodes_fops))
-		goto fail;
-	if (!debugfs_create_u64("skip_pkts", 0600,
-				ibd->fault->dir,
-				&ibd->fault->fault_skip))
-		goto fail;
-	if (!debugfs_create_u64("skip_usec", 0600,
-				ibd->fault->dir,
-				&ibd->fault->fault_skip_usec))
-		goto fail;
-	if (!debugfs_create_u8("direction", 0600, ibd->fault->dir,
-			       &ibd->fault->direction))
-		goto fail;
+	debugfs_create_file("fault_stats", 0444, fault_dir, ibd,
+			    &_fault_stats_file_ops);
+	debugfs_create_bool("enable", 0600, fault_dir, &ibd->fault->enable);
+	debugfs_create_bool("suppress_err", 0600, fault_dir,
+			    &ibd->fault->suppress_err);
+	debugfs_create_bool("opcode_mode", 0600, fault_dir,
+			    &ibd->fault->opcode);
+	debugfs_create_file("opcodes", 0600, fault_dir, ibd->fault,
+			    &__fault_opcodes_fops);
+	debugfs_create_u64("skip_pkts", 0600, fault_dir,
+			   &ibd->fault->fault_skip);
+	debugfs_create_u64("skip_usec", 0600, fault_dir,
+			   &ibd->fault->fault_skip_usec);
+	debugfs_create_u8("direction", 0600, fault_dir, &ibd->fault->direction);
 
 	return 0;
-fail:
-	hfi1_fault_exit_debugfs(ibd);
-	return -ENOMEM;
 }
 
 bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd)
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index 1fc7564..f9a7e9d 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -488,7 +488,7 @@
 		vmf = 1;
 		break;
 	case STATUS:
-		if (flags & (unsigned long)(VM_WRITE | VM_EXEC)) {
+		if (flags & VM_WRITE) {
 			ret = -EPERM;
 			goto done;
 		}
@@ -681,7 +681,8 @@
 		     HFI1_RCVCTRL_TAILUPD_DIS |
 		     HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
 		     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
-		     HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt);
+		     HFI1_RCVCTRL_NO_EGR_DROP_DIS |
+		     HFI1_RCVCTRL_URGENT_DIS, uctxt);
 	/* Clear the context's J_KEY */
 	hfi1_clear_ctxt_jkey(dd, uctxt);
 	/*
@@ -1096,6 +1097,7 @@
 	hfi1_set_ctxt_jkey(uctxt->dd, uctxt, uctxt->jkey);
 
 	rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
+	rcvctrl_ops |= HFI1_RCVCTRL_URGENT_ENB;
 	if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP))
 		rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB;
 	/*
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index cfd2523..fa45350 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -54,7 +54,6 @@
 #include <linux/list.h>
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
-#include <linux/idr.h>
 #include <linux/io.h>
 #include <linux/fs.h>
 #include <linux/completion.h>
@@ -65,6 +64,7 @@
 #include <linux/kthread.h>
 #include <linux/i2c.h>
 #include <linux/i2c-algo-bit.h>
+#include <linux/xarray.h>
 #include <rdma/ib_hdrs.h>
 #include <rdma/opa_addr.h>
 #include <linux/rhashtable.h>
@@ -73,6 +73,7 @@
 
 #include "chip_registers.h"
 #include "common.h"
+#include "opfn.h"
 #include "verbs.h"
 #include "pio.h"
 #include "chip.h"
@@ -80,6 +81,7 @@
 #include "qsfp.h"
 #include "platform.h"
 #include "affinity.h"
+#include "msix.h"
 
 /* bumped 1 from s/w major version of TrueScale */
 #define HFI1_CHIP_VERS_MAJ 3U
@@ -97,6 +99,8 @@
 #define NEIGHBOR_TYPE_HFI		0
 #define NEIGHBOR_TYPE_SWITCH	1
 
+#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5
+
 extern unsigned long hfi1_cap_mask;
 #define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap)
 #define HFI1_CAP_UGET_MASK(mask, cap) \
@@ -194,6 +198,14 @@
 };
 
 typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
+
+struct tid_queue {
+	struct list_head queue_head;
+			/* queue head for QP TID resource waiters */
+	u32 enqueue;	/* count of tid enqueues */
+	u32 dequeue;	/* count of tid dequeues */
+};
+
 struct hfi1_ctxtdata {
 	/* rcvhdrq base, needs mmap before useful */
 	void *rcvhdrq;
@@ -287,6 +299,12 @@
 	/* PSM Specific fields */
 	/* lock protecting all Expected TID data */
 	struct mutex exp_mutex;
+	/* lock protecting all Expected TID data of kernel contexts */
+	spinlock_t exp_lock;
+	/* Queue for QP's waiting for HW TID flows */
+	struct tid_queue flow_queue;
+	/* Queue for QP's waiting for HW receive array entries */
+	struct tid_queue rarr_queue;
 	/* when waiting for rcv or pioavail */
 	wait_queue_head_t wait;
 	/* uuid from PSM */
@@ -319,6 +337,9 @@
 	 */
 	u8 subctxt_cnt;
 
+	/* Bit mask to track free TID RDMA HW flows */
+	unsigned long flow_mask;
+	struct tid_flow_state flows[RXE_NUM_TID_FLOWS];
 };
 
 /**
@@ -518,6 +539,37 @@
 	mgmt->src_qpn = cpu_to_be32(src_qp & OPA_16B_MGMT_QPN_MASK);
 }
 
+/**
+ * hfi1_get_rc_ohdr - get extended header
+ * @opah: the OPA header to parse
+ */
+static inline struct ib_other_headers *
+hfi1_get_rc_ohdr(struct hfi1_opa_header *opah)
+{
+	struct ib_other_headers *ohdr;
+	struct ib_header *hdr = NULL;
+	struct hfi1_16b_header *hdr_16b = NULL;
+
+	/* Find out where the BTH is */
+	if (opah->hdr_type == HFI1_PKT_TYPE_9B) {
+		hdr = &opah->ibh;
+		if (ib_get_lnh(hdr) == HFI1_LRH_BTH)
+			ohdr = &hdr->u.oth;
+		else
+			ohdr = &hdr->u.l.oth;
+	} else {
+		u8 l4;
+
+		hdr_16b = &opah->opah;
+		l4  = hfi1_16B_get_l4(hdr_16b);
+		if (l4 == OPA_16B_L4_IB_LOCAL)
+			ohdr = &hdr_16b->u.oth;
+		else
+			ohdr = &hdr_16b->u.l.oth;
+	}
+	return ohdr;
+}
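
The helper encodes the two wire formats in one place: for 9B packets the LRH's LNH field says whether a GRH sits between the LRH and the BTH, while for 16B packets the L4 field plays the same role. Compactly:

	/*
	 * 9B:  LNH == HFI1_LRH_BTH       -> &hdr->u.oth   (no GRH)
	 *      otherwise                 -> &hdr->u.l.oth (GRH present)
	 * 16B: L4 == OPA_16B_L4_IB_LOCAL -> &hdr_16b->u.oth
	 *      otherwise                 -> &hdr_16b->u.l.oth
	 */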
+
 struct rvt_sge_state;
 
 /*
@@ -622,6 +674,8 @@
 #define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000
 #define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000
 #define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000
+#define HFI1_RCVCTRL_URGENT_ENB 0x40000
+#define HFI1_RCVCTRL_URGENT_DIS 0x80000
 
 /* partition enforcement flags */
 #define HFI1_PART_ENFORCE_IN	0x1
@@ -669,6 +723,14 @@
 	struct irq_affinity_notify notify;
 };
 
+struct hfi1_msix_info {
+	/* lock to synchronize in_use_msix access */
+	spinlock_t msix_lock;
+	DECLARE_BITMAP(in_use_msix, CCE_NUM_MSIX_VECTORS);
+	struct hfi1_msix_entry *msix_entries;
+	u16 max_requested;
+};
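
The allocator that uses this structure is in the new msix.c, outside this section; a minimal sketch of how the bitmap and lock are presumably meant to be used (function name hypothetical):

	/* sketch only: claim a free MSI-X vector slot */
	static int msix_reserve_slot(struct hfi1_msix_info *msix)
	{
		int nr;

		spin_lock(&msix->msix_lock);
		nr = find_first_zero_bit(msix->in_use_msix,
					 msix->max_requested);
		if (nr < msix->max_requested)
			__set_bit(nr, msix->in_use_msix);
		else
			nr = -ENOSPC;
		spin_unlock(&msix->msix_lock);
		return nr;
	}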
+
 /* per-SL CCA information */
 struct cca_timer {
 	struct hrtimer hrtimer;
@@ -990,11 +1052,10 @@
 struct hfi1_vnic_data {
 	struct hfi1_ctxtdata *ctxt[HFI1_NUM_VNIC_CTXT];
 	struct kmem_cache *txreq_cache;
+	struct xarray vesws;
 	u8 num_vports;
-	struct idr vesw_idr;
 	u8 rmt_start;
 	u8 num_ctxt;
-	u32 msix_idx;
 };
 
 struct hfi1_vnic_vport_info;
@@ -1011,7 +1072,6 @@
 typedef int (*send_routine)(struct rvt_qp *, struct hfi1_pkt_state *, u64);
 struct hfi1_devdata {
 	struct hfi1_ibdev verbs_dev;     /* must be first */
-	struct list_head list;
 	/* pointers to related structs for this device */
 	/* pci access data structure */
 	struct pci_dev *pcidev;
@@ -1207,11 +1267,6 @@
 
 	struct diag_client *diag_client;
 
-	/* MSI-X information */
-	struct hfi1_msix_entry *msix_entries;
-	u32 num_msix_entries;
-	u32 first_dyn_msix_idx;
-
 	/* general interrupt: mask of handled interrupts */
 	u64 gi_mask[CCE_NUM_INT_CSRS];
 
@@ -1225,6 +1280,9 @@
 	 */
 	struct timer_list synth_stats_timer;
 
+	/* MSI-X information */
+	struct hfi1_msix_info msix_info;
+
 	/*
 	 * device counters
 	 */
@@ -1351,6 +1409,8 @@
 
 	/* vnic data */
 	struct hfi1_vnic_data vnic;
+	/* Lock to protect IRQ SRC register access */
+	spinlock_t irq_src_lock;
 };
 
 static inline bool hfi1_vnic_is_rsm_full(struct hfi1_devdata *dd, int spare)
@@ -1396,8 +1456,7 @@
 	struct mm_struct *mm;
 };
 
-extern struct list_head hfi1_dev_list;
-extern spinlock_t hfi1_devs_lock;
+extern struct xarray hfi1_dev_table;
 struct hfi1_devdata *hfi1_lookup(int unit);
 
 static inline unsigned long uctxt_offset(struct hfi1_ctxtdata *uctxt)
@@ -1425,7 +1484,7 @@
 			 struct hfi1_devdata *dd, u8 hw_pidx, u8 port);
 void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd);
 int hfi1_rcd_put(struct hfi1_ctxtdata *rcd);
-void hfi1_rcd_get(struct hfi1_ctxtdata *rcd);
+int hfi1_rcd_get(struct hfi1_ctxtdata *rcd);
 struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd,
 						 u16 ctxt);
 struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt);
@@ -1433,9 +1492,6 @@
 int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread);
 int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread);
 void set_all_slowpath(struct hfi1_devdata *dd);
-void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd);
-void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd);
-void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd);
 
 extern const struct pci_device_id hfi1_pci_tbl[];
 void hfi1_make_ud_req_9B(struct rvt_qp *qp,
@@ -1797,13 +1853,20 @@
 	return &rcd->ppd->ibport_data;
 }
 
-void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
-			       bool do_cnp);
-static inline bool process_ecn(struct rvt_qp *qp, struct hfi1_packet *pkt,
-			       bool do_cnp)
+/**
+ * hfi1_may_ecn - Check whether FECN or BECN processing should be done
+ * @pkt: the packet to be evaluated
+ *
+ * Check whether the FECN or BECN bits in the packet's header are
+ * set, depending on the packet type.
+ *
+ * This function only checks for FECN and BECN bits. Additional checks
+ * are done in the slowpath (hfi1_process_ecn_slowpath()) in order to
+ * ensure correct handling.
+ */
+static inline bool hfi1_may_ecn(struct hfi1_packet *pkt)
 {
-	bool becn;
-	bool fecn;
+	bool fecn, becn;
 
 	if (pkt->etype == RHF_RCV_TYPE_BYPASS) {
 		fecn = hfi1_16B_get_fecn(pkt->hdr);
@@ -1812,10 +1875,18 @@
 		fecn = ib_bth_get_fecn(pkt->ohdr);
 		becn = ib_bth_get_becn(pkt->ohdr);
 	}
-	if (unlikely(fecn || becn)) {
-		hfi1_process_ecn_slowpath(qp, pkt, do_cnp);
-		return fecn;
-	}
+	return fecn || becn;
+}
+
+bool hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
+			       bool prescan);
+static inline bool process_ecn(struct rvt_qp *qp, struct hfi1_packet *pkt)
+{
+	bool do_work;
+
+	do_work = hfi1_may_ecn(pkt);
+	if (unlikely(do_work))
+		return hfi1_process_ecn_slowpath(qp, pkt, false);
 	return false;
 }
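
Callers thus get a split API: hfi1_may_ecn() is the cheap header test, hfi1_process_ecn_slowpath() does the work, and the return value tells a receive path whether it still owes a FECN response. A hedged sketch of a call site (shape illustrative, not a quote from this patch):

	/*
	 * bool fecn = process_ecn(qp, packet);
	 * ... normal packet handling ...
	 * if (fecn)
	 *	mark the outgoing ACK with BECN;
	 */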
 
@@ -1889,10 +1960,8 @@
 #define HFI1_CTXT_WAITING_URG 4
 
 /* free up any allocated data at closes */
-struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
-				  const struct pci_device_id *ent);
+int hfi1_init_dd(struct hfi1_devdata *dd);
 void hfi1_free_devdata(struct hfi1_devdata *dd);
-struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra);
 
 /* LED beaconing functions */
 void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon,
@@ -1965,6 +2034,7 @@
  */
 
 extern const char ib_hfi1_version[];
+extern const struct attribute_group ib_hfi1_attr_group;
 
 int hfi1_device_create(struct hfi1_devdata *dd);
 void hfi1_device_remove(struct hfi1_devdata *dd);
@@ -1976,16 +2046,15 @@
 /* Hook for sysfs read of QSFP */
 int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len);
 
-int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent);
-void hfi1_clean_up_interrupts(struct hfi1_devdata *dd);
+int hfi1_pcie_init(struct hfi1_devdata *dd);
 void hfi1_pcie_cleanup(struct pci_dev *pdev);
 int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev);
 void hfi1_pcie_ddcleanup(struct hfi1_devdata *);
 int pcie_speeds(struct hfi1_devdata *dd);
-int request_msix(struct hfi1_devdata *dd, u32 msireq);
 int restore_pci_variables(struct hfi1_devdata *dd);
 int save_pci_variables(struct hfi1_devdata *dd);
 int do_pcie_gen3_transition(struct hfi1_devdata *dd);
+void tune_pcie_caps(struct hfi1_devdata *dd);
 int parse_platform_config(struct hfi1_devdata *dd);
 int get_platform_config_field(struct hfi1_devdata *dd,
 			      enum platform_config_table_type_encoding
@@ -2080,7 +2149,7 @@
 			SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK |
 #endif
 			HFI1_PKT_USER_SC_INTEGRITY;
-	else
+	else if (ctxt_type != SC_KERNEL)
 		base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY;
 
 	/* turn on send-side job key checks if !A0 */
@@ -2126,19 +2195,6 @@
 	return base_sdma_integrity;
 }
 
-/*
- * hfi1_early_err is used (only!) to print early errors before devdata is
- * allocated, or when dd->pcidev may not be valid, and at the tail end of
- * cleanup when devdata may have been freed, etc.  hfi1_dev_porterr is
- * the same as dd_dev_err, but is used when the message really needs
- * the IB port# to be definitive as to what's happening..
- */
-#define hfi1_early_err(dev, fmt, ...) \
-	dev_err(dev, fmt, ##__VA_ARGS__)
-
-#define hfi1_early_info(dev, fmt, ...) \
-	dev_info(dev, fmt, ##__VA_ARGS__)
-
 #define dd_dev_emerg(dd, fmt, ...) \
 	dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \
 		  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__)
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index 758d273..26b792b 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -49,11 +49,12 @@
 #include <linux/netdevice.h>
 #include <linux/vmalloc.h>
 #include <linux/delay.h>
-#include <linux/idr.h>
+#include <linux/xarray.h>
 #include <linux/module.h>
 #include <linux/printk.h>
 #include <linux/hrtimer.h>
 #include <linux/bitmap.h>
+#include <linux/numa.h>
 #include <rdma/rdma_vt.h>
 
 #include "hfi.h"
@@ -72,7 +73,6 @@
 #undef pr_fmt
 #define pr_fmt(fmt) DRIVER_NAME ": " fmt
 
-#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5
 /*
  * min buffers we want to have per context, after driver
  */
@@ -83,6 +83,8 @@
 #define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
 #define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */
 
+#define NUM_IB_PORTS 1
+
 /*
  * Number of user receive contexts we are configured to use (to allow for more
  * pio buffers per ctxt, etc.)  Zero means use one user context per CPU.
@@ -122,7 +124,7 @@
 
 static inline u64 encode_rcv_header_entry_size(u16 size);
 
-static struct idr hfi1_unit_table;
+DEFINE_XARRAY_FLAGS(hfi1_dev_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
 
 static int hfi1_create_kctxt(struct hfi1_devdata *dd,
 			     struct hfi1_pportdata *ppd)
@@ -213,12 +215,12 @@
 	struct hfi1_ctxtdata *rcd =
 		container_of(kref, struct hfi1_ctxtdata, kref);
 
-	hfi1_free_ctxtdata(rcd->dd, rcd);
-
 	spin_lock_irqsave(&rcd->dd->uctxt_lock, flags);
 	rcd->dd->rcd[rcd->ctxt] = NULL;
 	spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags);
 
+	hfi1_free_ctxtdata(rcd->dd, rcd);
+
 	kfree(rcd);
 }
 
@@ -241,10 +243,13 @@
  * @rcd: pointer to an initialized rcd data structure
  *
  * Use this to get a reference after the init.
+ *
+ * Return: reflects kref_get_unless_zero(), which returns non-zero on
+ * increment, otherwise 0.
  */
-void hfi1_rcd_get(struct hfi1_ctxtdata *rcd)
+int hfi1_rcd_get(struct hfi1_ctxtdata *rcd)
 {
-	kref_get(&rcd->kref);
+	return kref_get_unless_zero(&rcd->kref);
 }
 
 /**
@@ -324,7 +329,8 @@
 	spin_lock_irqsave(&dd->uctxt_lock, flags);
 	if (dd->rcd[ctxt]) {
 		rcd = dd->rcd[ctxt];
-		hfi1_rcd_get(rcd);
+		if (!hfi1_rcd_get(rcd))
+			rcd = NULL;
 	}
 	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
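
The switch to kref_get_unless_zero() is what makes this lookup safe against a concurrent final put: both this lookup and the pointer clearing in hfi1_rcd_free() run under uctxt_lock, so a context whose refcount has already hit zero either is no longer visible in dd->rcd[] or fails the get here. The invariant, compactly:

	/*
	 * lookup (uctxt_lock held):
	 *	rcd = dd->rcd[ctxt]; if (!hfi1_rcd_get(rcd)) rcd = NULL;
	 * teardown (uctxt_lock held in hfi1_rcd_free()):
	 *	dd->rcd[rcd->ctxt] = NULL;  -- before hfi1_free_ctxtdata()
	 */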
 
@@ -369,6 +375,9 @@
 		rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;
 
 		mutex_init(&rcd->exp_mutex);
+		spin_lock_init(&rcd->exp_lock);
+		INIT_LIST_HEAD(&rcd->flow_queue.queue_head);
+		INIT_LIST_HEAD(&rcd->rarr_queue.queue_head);
 
 		hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt);
 
@@ -460,7 +469,7 @@
 		if (rcd->egrbufs.size < hfi1_max_mtu) {
 			rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
 			hfi1_cdbg(PROC,
-				  "ctxt%u: eager bufs size too small. Adjusting to %zu\n",
+				  "ctxt%u: eager bufs size too small. Adjusting to %u\n",
 				    rcd->ctxt, rcd->egrbufs.size);
 		}
 		rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;
@@ -471,6 +480,9 @@
 						    GFP_KERNEL, numa);
 			if (!rcd->opstats)
 				goto bail;
+
+			/* Initialize TID flow generations for the context */
+			hfi1_kern_init_ctxt_generations(rcd);
 		}
 
 		*context = rcd;
@@ -654,9 +666,8 @@
 	ppd->part_enforce |= HFI1_PART_ENFORCE_IN;
 
 	if (loopback) {
-		hfi1_early_err(&pdev->dev,
-			       "Faking data partition 0x8001 in idx %u\n",
-			       !default_pkey_idx);
+		dd_dev_err(dd, "Faking data partition 0x8001 in idx %u\n",
+			   !default_pkey_idx);
 		ppd->pkeys[!default_pkey_idx] = 0x8001;
 	}
 
@@ -702,9 +713,7 @@
 	return;
 
 bail:
-
-	hfi1_early_err(&pdev->dev,
-		       "Congestion Control Agent disabled for port %d\n", port);
+	dd_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port);
 }
 
 /*
@@ -773,6 +782,8 @@
 			rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
 		if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL))
 			rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+		if (HFI1_CAP_IS_KSET(TID_RDMA))
+			rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB;
 		hfi1_rcvctrl(dd, rcvmask, rcd);
 		sc_enable(rcd->sc);
 		hfi1_rcd_put(rcd);
@@ -794,7 +805,8 @@
 			ppd->hfi1_wq =
 				alloc_workqueue(
 				    "hfi%d_%d",
-				    WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE,
+				    WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
+				    WQ_MEM_RECLAIM,
 				    HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES,
 				    dd->unit, pidx);
 			if (!ppd->hfi1_wq)
@@ -833,6 +845,23 @@
 }
 
 /**
+ * enable_general_intr() - Enable the IRQs that will be handled by the
+ * general interrupt handler.
+ * @dd: valid devdata
+ *
+ */
+static void enable_general_intr(struct hfi1_devdata *dd)
+{
+	set_intr_bits(dd, CCE_ERR_INT, MISC_ERR_INT, true);
+	set_intr_bits(dd, PIO_ERR_INT, TXE_ERR_INT, true);
+	set_intr_bits(dd, IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, true);
+	set_intr_bits(dd, PBC_INT, GPIO_ASSERT_INT, true);
+	set_intr_bits(dd, TCRIT_INT, TCRIT_INT, true);
+	set_intr_bits(dd, IS_DC_START, IS_DC_END, true);
+	set_intr_bits(dd, IS_SENDCREDIT_START, IS_SENDCREDIT_END, true);
+}
+
+/**
  * hfi1_init - do the actual initialization sequence on the chip
  * @dd: the hfi1_ib device
  * @reinit: re-initializing, so don't allocate new memory
@@ -883,10 +912,10 @@
 		goto done;
 
 	/* allocate dummy tail memory for all receive contexts */
-	dd->rcvhdrtail_dummy_kvaddr = dma_zalloc_coherent(
-		&dd->pcidev->dev, sizeof(u64),
-		&dd->rcvhdrtail_dummy_dma,
-		GFP_KERNEL);
+	dd->rcvhdrtail_dummy_kvaddr = dma_alloc_coherent(&dd->pcidev->dev,
+							 sizeof(u64),
+							 &dd->rcvhdrtail_dummy_dma,
+							 GFP_KERNEL);
 
 	if (!dd->rcvhdrtail_dummy_kvaddr) {
 		dd_dev_err(dd, "cannot allocate dummy tail memory\n");
@@ -911,11 +940,14 @@
 		lastfail = hfi1_create_rcvhdrq(dd, rcd);
 		if (!lastfail)
 			lastfail = hfi1_setup_eagerbufs(rcd);
+		if (!lastfail)
+			lastfail = hfi1_kern_exp_rcv_init(rcd, reinit);
 		if (lastfail) {
 			dd_dev_err(dd,
 				   "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
 			ret = lastfail;
 		}
+		/* enable IRQ */
 		hfi1_rcd_put(rcd);
 	}
 
@@ -954,7 +986,8 @@
 			HFI1_STATUS_INITTED;
 	if (!ret) {
 		/* enable all interrupts from the chip */
-		set_intr_state(dd, 1);
+		enable_general_intr(dd);
+		init_qsfp_int(dd);
 
 		/* chip is OK for user apps; mark it as initialized */
 		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
@@ -986,21 +1019,9 @@
 	return ret;
 }
 
-static inline struct hfi1_devdata *__hfi1_lookup(int unit)
-{
-	return idr_find(&hfi1_unit_table, unit);
-}
-
 struct hfi1_devdata *hfi1_lookup(int unit)
 {
-	struct hfi1_devdata *dd;
-	unsigned long flags;
-
-	spin_lock_irqsave(&hfi1_devs_lock, flags);
-	dd = __hfi1_lookup(unit);
-	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-
-	return dd;
+	return xa_load(&hfi1_dev_table, unit);
 }
 
 /*
@@ -1051,9 +1072,9 @@
 	}
 	dd->flags &= ~HFI1_INITTED;
 
-	/* mask and clean up interrupts, but not errors */
-	set_intr_state(dd, 0);
-	hfi1_clean_up_interrupts(dd);
+	/* mask and clean up interrupts */
+	set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false);
+	msix_clean_up_interrupts(dd);
 
 	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
 		ppd = dd->pport + pidx;
@@ -1168,7 +1189,7 @@
 /*
  * Release our hold on the shared asic data.  If we are the last one,
  * return the structure to be finalized outside the lock.  Must be
- * holding hfi1_devs_lock.
+ * holding hfi1_dev_table lock.
  */
 static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd)
 {
@@ -1204,13 +1225,10 @@
 	struct hfi1_asic_data *ad;
 	unsigned long flags;
 
-	spin_lock_irqsave(&hfi1_devs_lock, flags);
-	if (!list_empty(&dd->list)) {
-		idr_remove(&hfi1_unit_table, dd->unit);
-		list_del_init(&dd->list);
-	}
+	xa_lock_irqsave(&hfi1_dev_table, flags);
+	__xa_erase(&hfi1_dev_table, dd->unit);
 	ad = release_asic_data(dd);
-	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+	xa_unlock_irqrestore(&hfi1_dev_table, flags);
 
 	finalize_asic_data(dd, ad);
 	free_platform_config(dd);
@@ -1246,17 +1264,18 @@
 	kobject_put(&dd->kobj);
 }
 
-/*
- * Allocate our primary per-unit data structure.  Must be done via verbs
- * allocator, because the verbs cleanup process both does cleanup and
- * free of the data structure.
- * "extra" is for chip-specific data.
+/**
+ * hfi1_alloc_devdata - Allocate our primary per-unit data structure.
+ * @pdev: Valid PCI device
+ * @extra: How many bytes to alloc past the default
  *
- * Use the idr mechanism to get a unit number for this unit.
+ * Must be done via verbs allocator, because the verbs cleanup process
+ * both does cleanup and free of the data structure.
+ * "extra" is for chip-specific data.
  */
-struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
+static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev,
+					       size_t extra)
 {
-	unsigned long flags;
 	struct hfi1_devdata *dd;
 	int ret, nports;
 
@@ -1271,24 +1290,13 @@
 	dd->pport = (struct hfi1_pportdata *)(dd + 1);
 	dd->pcidev = pdev;
 	pci_set_drvdata(pdev, dd);
+	dd->node = NUMA_NO_NODE;
 
-	INIT_LIST_HEAD(&dd->list);
-	idr_preload(GFP_KERNEL);
-	spin_lock_irqsave(&hfi1_devs_lock, flags);
-
-	ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT);
-	if (ret >= 0) {
-		dd->unit = ret;
-		list_add(&dd->list, &hfi1_dev_list);
-	}
-	dd->node = -1;
-
-	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-	idr_preload_end();
-
+	ret = xa_alloc_irq(&hfi1_dev_table, &dd->unit, dd, xa_limit_32b,
+			GFP_KERNEL);
 	if (ret < 0) {
-		hfi1_early_err(&pdev->dev,
-			       "Could not allocate unit ID: error %d\n", -ret);
+		dev_err(&pdev->dev,
+			"Could not allocate unit ID: error %d\n", -ret);
 		goto bail;
 	}
 	rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit);
@@ -1309,6 +1317,7 @@
 	spin_lock_init(&dd->pio_map_lock);
 	mutex_init(&dd->dc8051_lock);
 	init_waitqueue_head(&dd->event_queue);
+	spin_lock_init(&dd->irq_src_lock);
 
 	dd->int_counter = alloc_percpu(u64);
 	if (!dd->int_counter) {
@@ -1474,16 +1483,17 @@
 	/* sanitize link CRC options */
 	link_crc_mask &= SUPPORTED_CRCS;
 
+	ret = opfn_init();
+	if (ret < 0) {
+		pr_err("Failed to allocate opfn_wq");
+		goto bail_dev;
+	}
+
 	/*
 	 * These must be called before the driver is registered with
 	 * the PCI subsystem.
 	 */
-	idr_init(&hfi1_unit_table);
-
 	hfi1_dbg_init();
-	ret = hfi1_wss_init();
-	if (ret < 0)
-		goto bail_wss;
 	ret = pci_register_driver(&hfi1_pci_driver);
 	if (ret < 0) {
 		pr_err("Unable to register driver: error %d\n", -ret);
@@ -1492,10 +1502,7 @@
 	goto bail; /* all OK */
 
 bail_dev:
-	hfi1_wss_exit();
-bail_wss:
 	hfi1_dbg_exit();
-	idr_destroy(&hfi1_unit_table);
 	dev_cleanup();
 bail:
 	return ret;
@@ -1509,11 +1516,11 @@
 static void __exit hfi1_mod_cleanup(void)
 {
 	pci_unregister_driver(&hfi1_pci_driver);
+	opfn_exit();
 	node_affinity_destroy_all();
-	hfi1_wss_exit();
 	hfi1_dbg_exit();
 
-	idr_destroy(&hfi1_unit_table);
+	WARN_ON(!xa_empty(&hfi1_dev_table));
 	dispose_firmware();	/* asymmetric with obtain_firmware() */
 	dev_cleanup();
 }
@@ -1564,7 +1571,7 @@
 		struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
 
 		if (rcd) {
-			hfi1_clear_tids(rcd);
+			hfi1_free_ctxt_rcv_groups(rcd);
 			hfi1_free_ctxt(rcd);
 		}
 	}
@@ -1604,23 +1611,23 @@
 	hfi1_free_devdata(dd);
 }
 
-static int init_validate_rcvhdrcnt(struct device *dev, uint thecnt)
+static int init_validate_rcvhdrcnt(struct hfi1_devdata *dd, uint thecnt)
 {
 	if (thecnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
-		hfi1_early_err(dev, "Receive header queue count too small\n");
+		dd_dev_err(dd, "Receive header queue count too small\n");
 		return -EINVAL;
 	}
 
 	if (thecnt > HFI1_MAX_HDRQ_EGRBUF_CNT) {
-		hfi1_early_err(dev,
-			       "Receive header queue count cannot be greater than %u\n",
-			       HFI1_MAX_HDRQ_EGRBUF_CNT);
+		dd_dev_err(dd,
+			   "Receive header queue count cannot be greater than %u\n",
+			   HFI1_MAX_HDRQ_EGRBUF_CNT);
 		return -EINVAL;
 	}
 
 	if (thecnt % HDRQ_INCREMENT) {
-		hfi1_early_err(dev, "Receive header queue count %d must be divisible by %lu\n",
-			       thecnt, HDRQ_INCREMENT);
+		dd_dev_err(dd, "Receive header queue count %d must be divisible by %lu\n",
+			   thecnt, HDRQ_INCREMENT);
 		return -EINVAL;
 	}
 
@@ -1639,22 +1646,29 @@
 	/* Validate dev ids */
 	if (!(ent->device == PCI_DEVICE_ID_INTEL0 ||
 	      ent->device == PCI_DEVICE_ID_INTEL1)) {
-		hfi1_early_err(&pdev->dev,
-			       "Failing on unknown Intel deviceid 0x%x\n",
-			       ent->device);
+		dev_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n",
+			ent->device);
 		ret = -ENODEV;
 		goto bail;
 	}
 
+	/* Allocate the dd so we can get to work */
+	dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
+				sizeof(struct hfi1_pportdata));
+	if (IS_ERR(dd)) {
+		ret = PTR_ERR(dd);
+		goto bail;
+	}
+
 	/* Validate some global module parameters */
-	ret = init_validate_rcvhdrcnt(&pdev->dev, rcvhdrcnt);
+	ret = init_validate_rcvhdrcnt(dd, rcvhdrcnt);
 	if (ret)
 		goto bail;
 
 	/* use the encoding function as a sanitization check */
 	if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
-		hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n",
-			       hfi1_hdrq_entsize);
+		dd_dev_err(dd, "Invalid HdrQ Entry size %u\n",
+			   hfi1_hdrq_entsize);
 		ret = -EINVAL;
 		goto bail;
 	}
@@ -1676,10 +1690,10 @@
 			clamp_val(eager_buffer_size,
 				  MIN_EAGER_BUFFER * 8,
 				  MAX_EAGER_BUFFER_TOTAL);
-		hfi1_early_info(&pdev->dev, "Eager buffer size %u\n",
-				eager_buffer_size);
+		dd_dev_info(dd, "Eager buffer size %u\n",
+			    eager_buffer_size);
 	} else {
-		hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n");
+		dd_dev_err(dd, "Invalid Eager buffer size of 0\n");
 		ret = -EINVAL;
 		goto bail;
 	}
@@ -1687,7 +1701,7 @@
 	/* restrict value of hfi1_rcvarr_split */
 	hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);
 
-	ret = hfi1_pcie_init(pdev, ent);
+	ret = hfi1_pcie_init(dd);
 	if (ret)
 		goto bail;
 
@@ -1695,12 +1709,9 @@
 	 * Do device-specific initialization, function table setup, dd
 	 * allocation, etc.
 	 */
-	dd = hfi1_init_dd(pdev, ent);
-
-	if (IS_ERR(dd)) {
-		ret = PTR_ERR(dd);
+	ret = hfi1_init_dd(dd);
+	if (ret)
 		goto clean_bail; /* error already printed */
-	}
 
 	ret = create_workqueues(dd);
 	if (ret)
@@ -1731,7 +1742,7 @@
 		dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
 
 	if (initfail || ret) {
-		hfi1_clean_up_interrupts(dd);
+		msix_clean_up_interrupts(dd);
 		stop_timers(dd);
 		flush_workqueue(ib_wq);
 		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
@@ -1842,9 +1853,9 @@
 			gfp_flags = GFP_KERNEL;
 		else
 			gfp_flags = GFP_USER;
-		rcd->rcvhdrq = dma_zalloc_coherent(
-			&dd->pcidev->dev, amt, &rcd->rcvhdrq_dma,
-			gfp_flags | __GFP_COMP);
+		rcd->rcvhdrq = dma_alloc_coherent(&dd->pcidev->dev, amt,
+						  &rcd->rcvhdrq_dma,
+						  gfp_flags | __GFP_COMP);
 
 		if (!rcd->rcvhdrq) {
 			dd_dev_err(dd,
@@ -1855,9 +1866,10 @@
 
 		if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ||
 		    HFI1_CAP_UGET_MASK(rcd->flags, DMA_RTAIL)) {
-			rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent(
-				&dd->pcidev->dev, PAGE_SIZE,
-				&rcd->rcvhdrqtailaddr_dma, gfp_flags);
+			rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(&dd->pcidev->dev,
+								    PAGE_SIZE,
+								    &rcd->rcvhdrqtailaddr_dma,
+								    gfp_flags);
 			if (!rcd->rcvhdrtail_kvaddr)
 				goto bail_free;
 		}
@@ -1953,10 +1965,10 @@
 	while (alloced_bytes < rcd->egrbufs.size &&
 	       rcd->egrbufs.alloced < rcd->egrbufs.count) {
 		rcd->egrbufs.buffers[idx].addr =
-			dma_zalloc_coherent(&dd->pcidev->dev,
-					    rcd->egrbufs.rcvtid_size,
-					    &rcd->egrbufs.buffers[idx].dma,
-					    gfp_flags);
+			dma_alloc_coherent(&dd->pcidev->dev,
+					   rcd->egrbufs.rcvtid_size,
+					   &rcd->egrbufs.buffers[idx].dma,
+					   gfp_flags);
 		if (rcd->egrbufs.buffers[idx].addr) {
 			rcd->egrbufs.buffers[idx].len =
 				rcd->egrbufs.rcvtid_size;
@@ -2027,7 +2039,7 @@
 	rcd->egrbufs.size = alloced_bytes;
 
 	hfi1_cdbg(PROC,
-		  "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n",
+		  "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %uKB\n",
 		  rcd->ctxt, rcd->egrbufs.alloced,
 		  rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024);
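A minimal sketch (not part of the patch) of the idr-to-xarray conversion pattern used in the init.c hunks above: xa_alloc_irq() hands out the unit number and stores the pointer in one step, xa_load() is a lockless lookup, and removal takes the array's own lock. The "foo" names below are illustrative only.

#include <linux/xarray.h>

struct foo_devdata {
	u32 unit;
};

static DEFINE_XARRAY_ALLOC(foo_dev_table);	/* allocating xarray */

static int foo_register(struct foo_devdata *fd)
{
	/* allocate an id and store fd under it in a single call */
	return xa_alloc_irq(&foo_dev_table, &fd->unit, fd, xa_limit_32b,
			    GFP_KERNEL);
}

static struct foo_devdata *foo_lookup(u32 unit)
{
	/* RCU-protected lookup; no external spinlock needed */
	return xa_load(&foo_dev_table, unit);
}

static void foo_unregister(struct foo_devdata *fd)
{
	unsigned long flags;

	xa_lock_irqsave(&foo_dev_table, flags);
	__xa_erase(&foo_dev_table, fd->unit);
	xa_unlock_irqrestore(&foo_dev_table, flags);
}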
 
diff --git a/drivers/infiniband/hw/hfi1/iowait.c b/drivers/infiniband/hw/hfi1/iowait.c
new file mode 100644
index 0000000..adb4a1b
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/iowait.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#include "iowait.h"
+#include "trace_iowait.h"
+
+/* 1 priority == 16 starve_cnt */
+#define IOWAIT_PRIORITY_STARVE_SHIFT 4
+
+void iowait_set_flag(struct iowait *wait, u32 flag)
+{
+	trace_hfi1_iowait_set(wait, flag);
+	set_bit(flag, &wait->flags);
+}
+
+bool iowait_flag_set(struct iowait *wait, u32 flag)
+{
+	return test_bit(flag, &wait->flags);
+}
+
+inline void iowait_clear_flag(struct iowait *wait, u32 flag)
+{
+	trace_hfi1_iowait_clear(wait, flag);
+	clear_bit(flag, &wait->flags);
+}
+
+/**
+ * iowait_init() - initialize wait structure
+ * @wait: wait struct to initialize
+ * @tx_limit: limit for overflow queuing
+ * @func: restart function for workqueue
+ * @sleep: sleep function for no space
+ * @wakeup: wakeup function for no space
+ *
+ * This function initializes the iowait
+ * structure embedded in the QP or PQ.
+ *
+ */
+void iowait_init(struct iowait *wait, u32 tx_limit,
+		 void (*func)(struct work_struct *work),
+		 void (*tidfunc)(struct work_struct *work),
+		 int (*sleep)(struct sdma_engine *sde,
+			      struct iowait_work *wait,
+			      struct sdma_txreq *tx,
+			      uint seq,
+			      bool pkts_sent),
+		 void (*wakeup)(struct iowait *wait, int reason),
+		 void (*sdma_drained)(struct iowait *wait),
+		 void (*init_priority)(struct iowait *wait))
+{
+	int i;
+
+	wait->count = 0;
+	INIT_LIST_HEAD(&wait->list);
+	init_waitqueue_head(&wait->wait_dma);
+	init_waitqueue_head(&wait->wait_pio);
+	atomic_set(&wait->sdma_busy, 0);
+	atomic_set(&wait->pio_busy, 0);
+	wait->tx_limit = tx_limit;
+	wait->sleep = sleep;
+	wait->wakeup = wakeup;
+	wait->sdma_drained = sdma_drained;
+	wait->init_priority = init_priority;
+	wait->flags = 0;
+	for (i = 0; i < IOWAIT_SES; i++) {
+		wait->wait[i].iow = wait;
+		INIT_LIST_HEAD(&wait->wait[i].tx_head);
+		if (i == IOWAIT_IB_SE)
+			INIT_WORK(&wait->wait[i].iowork, func);
+		else
+			INIT_WORK(&wait->wait[i].iowork, tidfunc);
+	}
+}
+
+/**
+ * iowait_cancel_work - cancel all work in iowait
+ * @w: the iowait struct
+ */
+void iowait_cancel_work(struct iowait *w)
+{
+	cancel_work_sync(&iowait_get_ib_work(w)->iowork);
+	cancel_work_sync(&iowait_get_tid_work(w)->iowork);
+}
+
+/**
+ * iowait_set_work_flag - set work flag based on leg
+ * @w: the iowait_work struct
+ */
+int iowait_set_work_flag(struct iowait_work *w)
+{
+	if (w == &w->iow->wait[IOWAIT_IB_SE]) {
+		iowait_set_flag(w->iow, IOWAIT_PENDING_IB);
+		return IOWAIT_IB_SE;
+	}
+	iowait_set_flag(w->iow, IOWAIT_PENDING_TID);
+	return IOWAIT_TID_SE;
+}
+
+/**
+ * iowait_priority_update_top - update the top priority entry
+ * @w: the iowait struct
+ * @top: a pointer to the top priority entry
+ * @idx: the index of the current iowait in an array
+ * @top_idx: the array index for the iowait entry that has the top priority
+ *
+ * This function is called to compare the priority of a given
+ * iowait with the given top priority entry. The top index will
+ * be returned.
+ */
+uint iowait_priority_update_top(struct iowait *w,
+				struct iowait *top,
+				uint idx, uint top_idx)
+{
+	u8 cnt, tcnt;
+
+	/* Convert priority into starve_cnt and compare the total.*/
+	cnt = (w->priority << IOWAIT_PRIORITY_STARVE_SHIFT) + w->starved_cnt;
+	tcnt = (top->priority << IOWAIT_PRIORITY_STARVE_SHIFT) +
+		top->starved_cnt;
+	if (cnt > tcnt)
+		return idx;
+	else
+		return top_idx;
+}
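iowait_priority_update_top() above weights one priority level as sixteen starve counts (IOWAIT_PRIORITY_STARVE_SHIFT) and returns the index of whichever waiter wins. A small userspace model in plain C — illustrative only, not kernel code — of folding it over an array of waiters:

#include <stdio.h>

#define IOWAIT_PRIORITY_STARVE_SHIFT 4	/* 1 priority == 16 starve_cnt */

struct waiter {
	unsigned char priority;
	unsigned char starved_cnt;
};

/* mirrors iowait_priority_update_top(): keep the larger combined weight */
static unsigned int update_top(const struct waiter *w, const struct waiter *top,
			       unsigned int idx, unsigned int top_idx)
{
	unsigned int cnt = (w->priority << IOWAIT_PRIORITY_STARVE_SHIFT) +
			   w->starved_cnt;
	unsigned int tcnt = (top->priority << IOWAIT_PRIORITY_STARVE_SHIFT) +
			    top->starved_cnt;

	return cnt > tcnt ? idx : top_idx;
}

int main(void)
{
	struct waiter waits[] = { { 0, 3 }, { 1, 0 }, { 0, 20 } };
	unsigned int i, top = 0;

	for (i = 1; i < 3; i++)
		top = update_top(&waits[i], &waits[top], i, top);

	/* waiter 2 wins: 20 starve counts beat one priority level (16) */
	printf("selected waiter %u\n", top);
	return 0;
}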
diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h
index 3d9c32c..07847cb 100644
--- a/drivers/infiniband/hw/hfi1/iowait.h
+++ b/drivers/infiniband/hw/hfi1/iowait.h
@@ -1,7 +1,7 @@
 #ifndef _HFI1_IOWAIT_H
 #define _HFI1_IOWAIT_H
 /*
- * Copyright(c) 2015, 2016 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -49,6 +49,7 @@
 
 #include <linux/list.h>
 #include <linux/workqueue.h>
+#include <linux/wait.h>
 #include <linux/sched.h>
 
 #include "sdma_txreq.h"
@@ -59,16 +60,48 @@
  */
 typedef void (*restart_t)(struct work_struct *work);
 
+#define IOWAIT_PENDING_IB  0x0
+#define IOWAIT_PENDING_TID 0x1
+
+/*
+ * A QP can have multiple Send Engines (SEs).
+ *
+ * The current use case is for supporting a TID RDMA
+ * packet build/xmit mechanism independent from verbs.
+ */
+#define IOWAIT_SES 2
+#define IOWAIT_IB_SE 0
+#define IOWAIT_TID_SE 1
+
 struct sdma_txreq;
 struct sdma_engine;
 /**
- * struct iowait - linkage for delayed progress/waiting
+ * @iowork: the work struct
+ * @tx_head: list of prebuilt packets
+ * @iow: the parent iowait structure
+ *
+ * This structure is the work item (process) specific
+ * details associated with the each of the two SEs of the
+ * QP.
+ *
+ * The workstruct and the queued TXs are unique to each
+ * SE.
+ */
+struct iowait;
+struct iowait_work {
+	struct work_struct iowork;
+	struct list_head tx_head;
+	struct iowait *iow;
+};
+
+/**
  * @list: used to add/insert into QP/PQ wait lists
- * @lock: uses to record the list head lock
  * @tx_head: overflow list of sdma_txreq's
  * @sleep: no space callback
  * @wakeup: space callback wakeup
  * @sdma_drained: sdma count drained
+ * @init_priority: callback to manipulate priority
+ * @lock: lock protecting the head of the wait queue
  * @iowork: workqueue overhead
  * @wait_dma: wait for sdma_busy == 0
  * @wait_pio: wait for pio_busy == 0
@@ -76,6 +109,8 @@
  * @count: total number of descriptors in tx_head'ed list
  * @tx_limit: limit for overflow queuing
  * @tx_count: number of tx entry's in tx_head'ed list
+ * @flags: wait flags (one per QP)
+ * @wait: SE array for multiple legs
  *
  * This is to be embedded in user's state structure
  * (QP or PQ).
@@ -86,10 +121,13 @@
  * are callbacks for the ULP to implement
  * what ever queuing/dequeuing of
  * the embedded iowait and its containing struct
- * when a resource shortage like SDMA ring space is seen.
+ * when a resource shortage like SDMA ring space
+ * or PIO credit space is seen.
  *
  * Both potentially have locks help
- * so sleeping is not allowed.
+ * so sleeping is not allowed, and submitting
+ * txreqs directly from the wakeup call is not
+ * supported because of lock conflicts.
  *
  * The wait_dma member along with the iow
  *
@@ -98,21 +136,19 @@
  * Waiters explicity know that, but the destroy
  * code that unwaits QPs does not.
  */
-
 struct iowait {
 	struct list_head list;
-	struct list_head tx_head;
 	int (*sleep)(
 		struct sdma_engine *sde,
-		struct iowait *wait,
+		struct iowait_work *wait,
 		struct sdma_txreq *tx,
 		uint seq,
 		bool pkts_sent
 		);
 	void (*wakeup)(struct iowait *wait, int reason);
 	void (*sdma_drained)(struct iowait *wait);
+	void (*init_priority)(struct iowait *wait);
 	seqlock_t *lock;
-	struct work_struct iowork;
 	wait_queue_head_t wait_dma;
 	wait_queue_head_t wait_pio;
 	atomic_t sdma_busy;
@@ -121,63 +157,51 @@
 	u32 tx_limit;
 	u32 tx_count;
 	u8 starved_cnt;
+	u8 priority;
+	unsigned long flags;
+	struct iowait_work wait[IOWAIT_SES];
 };
 
 #define SDMA_AVAIL_REASON 0
 
-/**
- * iowait_init() - initialize wait structure
- * @wait: wait struct to initialize
- * @tx_limit: limit for overflow queuing
- * @func: restart function for workqueue
- * @sleep: sleep function for no space
- * @resume: wakeup function for no space
- *
- * This function initializes the iowait
- * structure embedded in the QP or PQ.
- *
- */
+void iowait_set_flag(struct iowait *wait, u32 flag);
+bool iowait_flag_set(struct iowait *wait, u32 flag);
+void iowait_clear_flag(struct iowait *wait, u32 flag);
 
-static inline void iowait_init(
-	struct iowait *wait,
-	u32 tx_limit,
-	void (*func)(struct work_struct *work),
-	int (*sleep)(
-		struct sdma_engine *sde,
-		struct iowait *wait,
-		struct sdma_txreq *tx,
-		uint seq,
-		bool pkts_sent),
-	void (*wakeup)(struct iowait *wait, int reason),
-	void (*sdma_drained)(struct iowait *wait))
-{
-	wait->count = 0;
-	wait->lock = NULL;
-	INIT_LIST_HEAD(&wait->list);
-	INIT_LIST_HEAD(&wait->tx_head);
-	INIT_WORK(&wait->iowork, func);
-	init_waitqueue_head(&wait->wait_dma);
-	init_waitqueue_head(&wait->wait_pio);
-	atomic_set(&wait->sdma_busy, 0);
-	atomic_set(&wait->pio_busy, 0);
-	wait->tx_limit = tx_limit;
-	wait->sleep = sleep;
-	wait->wakeup = wakeup;
-	wait->sdma_drained = sdma_drained;
-}
+void iowait_init(struct iowait *wait, u32 tx_limit,
+		 void (*func)(struct work_struct *work),
+		 void (*tidfunc)(struct work_struct *work),
+		 int (*sleep)(struct sdma_engine *sde,
+			      struct iowait_work *wait,
+			      struct sdma_txreq *tx,
+			      uint seq,
+			      bool pkts_sent),
+		 void (*wakeup)(struct iowait *wait, int reason),
+		 void (*sdma_drained)(struct iowait *wait),
+		 void (*init_priority)(struct iowait *wait));
 
 /**
- * iowait_schedule() - initialize wait structure
+ * iowait_schedule() - schedule the default send engine work
  * @wait: wait struct to schedule
  * @wq: workqueue for schedule
  * @cpu: cpu
  */
-static inline void iowait_schedule(
-	struct iowait *wait,
-	struct workqueue_struct *wq,
-	int cpu)
+static inline bool iowait_schedule(struct iowait *wait,
+				   struct workqueue_struct *wq, int cpu)
 {
-	queue_work_on(cpu, wq, &wait->iowork);
+	return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_IB_SE].iowork);
+}
+
+/**
+ * iowait_tid_schedule - schedule the tid SE
+ * @wait: the iowait structure
+ * @wq: the work queue
+ * @cpu: the cpu
+ */
+static inline bool iowait_tid_schedule(struct iowait *wait,
+				       struct workqueue_struct *wq, int cpu)
+{
+	return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_TID_SE].iowork);
 }
 
 /**
@@ -228,6 +252,8 @@
  */
 static inline int iowait_sdma_dec(struct iowait *wait)
 {
+	if (!wait)
+		return 0;
 	return atomic_dec_and_test(&wait->sdma_busy);
 }
 
@@ -267,11 +293,13 @@
 }
 
 /**
- * iowait_sdma_dec - note pio complete
+ * iowait_pio_dec - note pio complete
  * @wait: iowait structure
  */
 static inline int iowait_pio_dec(struct iowait *wait)
 {
+	if (!wait)
+		return 0;
 	return atomic_dec_and_test(&wait->pio_busy);
 }
 
@@ -293,9 +321,9 @@
 /**
  * iowait_get_txhead() - get packet off of iowait list
  *
- * @wait wait struture
+ * @wait: iowait_work structure
  */
-static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait)
+static inline struct sdma_txreq *iowait_get_txhead(struct iowait_work *wait)
 {
 	struct sdma_txreq *tx = NULL;
 
@@ -309,6 +337,61 @@
 	return tx;
 }
 
+static inline u16 iowait_get_desc(struct iowait_work *w)
+{
+	u16 num_desc = 0;
+	struct sdma_txreq *tx = NULL;
+
+	if (!list_empty(&w->tx_head)) {
+		tx = list_first_entry(&w->tx_head, struct sdma_txreq,
+				      list);
+		num_desc = tx->num_desc;
+		if (tx->flags & SDMA_TXREQ_F_VIP)
+			w->iow->priority++;
+	}
+	return num_desc;
+}
+
+static inline u32 iowait_get_all_desc(struct iowait *w)
+{
+	u32 num_desc = 0;
+
+	num_desc = iowait_get_desc(&w->wait[IOWAIT_IB_SE]);
+	num_desc += iowait_get_desc(&w->wait[IOWAIT_TID_SE]);
+	return num_desc;
+}
+
+static inline void iowait_update_priority(struct iowait_work *w)
+{
+	struct sdma_txreq *tx = NULL;
+
+	if (!list_empty(&w->tx_head)) {
+		tx = list_first_entry(&w->tx_head, struct sdma_txreq,
+				      list);
+		if (tx->flags & SDMA_TXREQ_F_VIP)
+			w->iow->priority++;
+	}
+}
+
+static inline void iowait_update_all_priority(struct iowait *w)
+{
+	iowait_update_priority(&w->wait[IOWAIT_IB_SE]);
+	iowait_update_priority(&w->wait[IOWAIT_TID_SE]);
+}
+
+static inline void iowait_init_priority(struct iowait *w)
+{
+	w->priority = 0;
+	if (w->init_priority)
+		w->init_priority(w);
+}
+
+static inline void iowait_get_priority(struct iowait *w)
+{
+	iowait_init_priority(w);
+	iowait_update_all_priority(w);
+}
+
 /**
  * iowait_queue - Put the iowait on a wait queue
  * @pkts_sent: have some packets been sent before queuing?
@@ -325,14 +408,18 @@
 	/*
 	 * To play fair, insert the iowait at the tail of the wait queue if it
 	 * has already sent some packets; Otherwise, put it at the head.
+	 * However, if it has priority packets to send, also put it at the
+	 * head.
 	 */
-	if (pkts_sent) {
-		list_add_tail(&w->list, wait_head);
+	if (pkts_sent)
 		w->starved_cnt = 0;
-	} else {
-		list_add(&w->list, wait_head);
+	else
 		w->starved_cnt++;
-	}
+
+	if (w->priority > 0 || !pkts_sent)
+		list_add(&w->list, wait_head);
+	else
+		list_add_tail(&w->list, wait_head);
 }
 
 /**
@@ -349,35 +436,63 @@
 		w->starved_cnt = 0;
 }
 
-/**
- * iowait_starve_find_max - Find the maximum of the starve count
- * @w: the iowait struct
- * @max: a variable containing the max starve count
- * @idx: the index of the current iowait in an array
- * @max_idx: a variable containing the array index for the
- *         iowait entry that has the max starve count
- *
- * This function is called to compare the starve count of a
- * given iowait with the given max starve count. The max starve
- * count and the index will be updated if the iowait's start
- * count is larger.
- */
-static inline void iowait_starve_find_max(struct iowait *w, u8 *max,
-					  uint idx, uint *max_idx)
-{
-	if (w->starved_cnt > *max) {
-		*max = w->starved_cnt;
-		*max_idx = idx;
-	}
-}
+/* Update the top priority index */
+uint iowait_priority_update_top(struct iowait *w,
+				struct iowait *top,
+				uint idx, uint top_idx);
 
 /**
- * iowait_packet_queued() - determine if a packet is already built
- * @wait: the wait structure
+ * iowait_packet_queued() - determine if a packet is queued
+ * @wait: the iowait_work structure
  */
-static inline bool iowait_packet_queued(struct iowait *wait)
+static inline bool iowait_packet_queued(struct iowait_work *wait)
 {
 	return !list_empty(&wait->tx_head);
 }
 
+/**
+ * iowait_inc_wait_count - increment wait counts
+ * @w: the iowait_work struct
+ * @n: the count
+ */
+static inline void iowait_inc_wait_count(struct iowait_work *w, u16 n)
+{
+	if (!w)
+		return;
+	w->iow->tx_count++;
+	w->iow->count += n;
+}
+
+/**
+ * iowait_get_tid_work - return iowait_work for tid SE
+ * @w: the iowait struct
+ */
+static inline struct iowait_work *iowait_get_tid_work(struct iowait *w)
+{
+	return &w->wait[IOWAIT_TID_SE];
+}
+
+/**
+ * iowait_get_ib_work - return iowait_work for ib SE
+ * @w: the iowait struct
+ */
+static inline struct iowait_work *iowait_get_ib_work(struct iowait *w)
+{
+	return &w->wait[IOWAIT_IB_SE];
+}
+
+/**
+ * iowait_ioww_to_iow - return iowait given iowait_work
+ * @w: the iowait_work struct
+ */
+static inline struct iowait *iowait_ioww_to_iow(struct iowait_work *w)
+{
+	if (likely(w))
+		return w->iow;
+	return NULL;
+}
+
+void iowait_cancel_work(struct iowait *w);
+int iowait_set_work_flag(struct iowait_work *w);
+
 #endif
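A hedged usage sketch (not taken from the patch) of the reworked iowait_init() prototype above: a ULP-side private structure embeds one struct iowait, supplies one work function per send engine (verbs and TID RDMA), and may pass NULL for init_priority since iowait_init_priority() checks it before calling. All my_* names are hypothetical.

#include "iowait.h"

struct my_qp_priv {
	struct iowait s_iowait;
};

static void my_do_send(struct work_struct *work);	/* IB (verbs) leg */
static void my_do_tid_send(struct work_struct *work);	/* TID RDMA leg */
static int my_sleep(struct sdma_engine *sde, struct iowait_work *wait,
		    struct sdma_txreq *tx, uint seq, bool pkts_sent);
static void my_wakeup(struct iowait *wait, int reason);
static void my_drained(struct iowait *wait);

static void my_qp_priv_init(struct my_qp_priv *priv, u32 tx_limit)
{
	/* one work item per send engine: IOWAIT_IB_SE and IOWAIT_TID_SE */
	iowait_init(&priv->s_iowait, tx_limit, my_do_send, my_do_tid_send,
		    my_sleep, my_wakeup, my_drained, NULL);
}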
diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c
index 0307405..d8ff063 100644
--- a/drivers/infiniband/hw/hfi1/mad.c
+++ b/drivers/infiniband/hw/hfi1/mad.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015-2017 Intel Corporation.
+ * Copyright(c) 2015-2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -305,7 +305,7 @@
 	rcu_read_lock();
 	qp0 = rcu_dereference(ibp->rvp.qp[0]);
 	if (qp0)
-		ah = rdma_create_ah(qp0->ibqp.pd, &attr);
+		ah = rdma_create_ah(qp0->ibqp.pd, &attr, 0);
 	rcu_read_unlock();
 	return ah;
 }
@@ -2326,7 +2326,7 @@
 	__be32 vl_select_mask;
 };
 
-#define VL_MASK_ALL		0x000080ff
+#define VL_MASK_ALL		0x00000000000080ffUL
 
 struct opa_port_status_rsp {
 	__u8 port_num;
@@ -2625,15 +2625,14 @@
 }
 
 static void a0_portstatus(struct hfi1_pportdata *ppd,
-			  struct opa_port_status_rsp *rsp, u32 vl_select_mask)
+			  struct opa_port_status_rsp *rsp)
 {
 	if (!is_bx(ppd->dd)) {
 		unsigned long vl;
 		u64 sum_vl_xmit_wait = 0;
-		u32 vl_all_mask = VL_MASK_ALL;
+		unsigned long vl_all_mask = VL_MASK_ALL;
 
-		for_each_set_bit(vl, (unsigned long *)&(vl_all_mask),
-				 8 * sizeof(vl_all_mask)) {
+		for_each_set_bit(vl, &vl_all_mask, BITS_PER_LONG) {
 			u64 tmp = sum_vl_xmit_wait +
 				  read_port_cntr(ppd, C_TX_WAIT_VL,
 						 idx_from_vl(vl));
@@ -2730,12 +2729,12 @@
 		(struct opa_port_status_req *)pmp->data;
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	struct opa_port_status_rsp *rsp;
-	u32 vl_select_mask = be32_to_cpu(req->vl_select_mask);
+	unsigned long vl_select_mask = be32_to_cpu(req->vl_select_mask);
 	unsigned long vl;
 	size_t response_data_size;
 	u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
 	u8 port_num = req->port_num;
-	u8 num_vls = hweight32(vl_select_mask);
+	u8 num_vls = hweight64(vl_select_mask);
 	struct _vls_pctrs *vlinfo;
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
@@ -2744,8 +2743,7 @@
 	u16 link_width;
 	u16 link_speed;
 
-	response_data_size = sizeof(struct opa_port_status_rsp) +
-				num_vls * sizeof(struct _vls_pctrs);
+	response_data_size = struct_size(rsp, vls, num_vls);
 	if (response_data_size > sizeof(pmp->data)) {
 		pmp->mad_hdr.status |= OPA_PM_STATUS_REQUEST_TOO_LARGE;
 		return reply((struct ib_mad_hdr *)pmp);
@@ -2771,7 +2769,7 @@
 
 	hfi1_read_link_quality(dd, &rsp->link_quality_indicator);
 
-	rsp->vl_select_mask = cpu_to_be32(vl_select_mask);
+	rsp->vl_select_mask = cpu_to_be32((u32)vl_select_mask);
 	rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
 					  CNTR_INVALID_VL));
 	rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
@@ -2842,8 +2840,7 @@
 	 * So in the for_each_set_bit() loop below, we don't need
 	 * any additional checks for vl.
 	 */
-	for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
-			 8 * sizeof(vl_select_mask)) {
+	for_each_set_bit(vl, &vl_select_mask, BITS_PER_LONG) {
 		memset(vlinfo, 0, sizeof(*vlinfo));
 
 		tmp = read_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl));
@@ -2884,7 +2881,7 @@
 		vfi++;
 	}
 
-	a0_portstatus(ppd, rsp, vl_select_mask);
+	a0_portstatus(ppd, rsp);
 
 	if (resp_len)
 		*resp_len += response_data_size;
@@ -2931,16 +2928,14 @@
 	return error_counter_summary;
 }
 
-static void a0_datacounters(struct hfi1_pportdata *ppd, struct _port_dctrs *rsp,
-			    u32 vl_select_mask)
+static void a0_datacounters(struct hfi1_pportdata *ppd, struct _port_dctrs *rsp)
 {
 	if (!is_bx(ppd->dd)) {
 		unsigned long vl;
 		u64 sum_vl_xmit_wait = 0;
-		u32 vl_all_mask = VL_MASK_ALL;
+		unsigned long vl_all_mask = VL_MASK_ALL;
 
-		for_each_set_bit(vl, (unsigned long *)&(vl_all_mask),
-				 8 * sizeof(vl_all_mask)) {
+		for_each_set_bit(vl, &vl_all_mask, BITS_PER_LONG) {
 			u64 tmp = sum_vl_xmit_wait +
 				  read_port_cntr(ppd, C_TX_WAIT_VL,
 						 idx_from_vl(vl));
@@ -2995,7 +2990,7 @@
 	u64 port_mask;
 	u8 port_num;
 	unsigned long vl;
-	u32 vl_select_mask;
+	unsigned long vl_select_mask;
 	int vfi;
 	u16 link_width;
 	u16 link_speed;
@@ -3014,8 +3009,7 @@
 	}
 
 	/* Sanity check */
-	response_data_size = sizeof(struct opa_port_data_counters_msg) +
-				num_vls * sizeof(struct _vls_dctrs);
+	response_data_size = struct_size(req, port[0].vls, num_vls);
 
 	if (response_data_size > sizeof(pmp->data)) {
 		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
@@ -3073,8 +3067,7 @@
 	 * So in the for_each_set_bit() loop below, we don't need
 	 * any additional checks for vl.
 	 */
-	for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
-			 8 * sizeof(req->vl_select_mask)) {
+	for_each_set_bit(vl, &vl_select_mask, BITS_PER_LONG) {
 		memset(vlinfo, 0, sizeof(*vlinfo));
 
 		rsp->vls[vfi].port_vl_xmit_data =
@@ -3122,7 +3115,7 @@
 		vfi++;
 	}
 
-	a0_datacounters(ppd, rsp, vl_select_mask);
+	a0_datacounters(ppd, rsp);
 
 	if (resp_len)
 		*resp_len += response_data_size;
@@ -3217,7 +3210,7 @@
 	struct _vls_ectrs *vlinfo;
 	unsigned long vl;
 	u64 port_mask, tmp;
-	u32 vl_select_mask;
+	unsigned long vl_select_mask;
 	int vfi;
 
 	req = (struct opa_port_error_counters64_msg *)pmp->data;
@@ -3232,8 +3225,7 @@
 		return reply((struct ib_mad_hdr *)pmp);
 	}
 
-	response_data_size = sizeof(struct opa_port_error_counters64_msg) +
-				num_vls * sizeof(struct _vls_ectrs);
+	response_data_size = struct_size(req, port[0].vls, num_vls);
 
 	if (response_data_size > sizeof(pmp->data)) {
 		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
@@ -3276,8 +3268,7 @@
 	vlinfo = &rsp->vls[0];
 	vfi = 0;
 	vl_select_mask = be32_to_cpu(req->vl_select_mask);
-	for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
-			 8 * sizeof(req->vl_select_mask)) {
+	for_each_set_bit(vl, &vl_select_mask, BITS_PER_LONG) {
 		memset(vlinfo, 0, sizeof(*vlinfo));
 		rsp->vls[vfi].port_vl_xmit_discards =
 			cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD_VL,
@@ -3488,7 +3479,7 @@
 	u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
 	u64 portn = be64_to_cpu(req->port_select_mask[3]);
 	u32 counter_select = be32_to_cpu(req->counter_select_mask);
-	u32 vl_select_mask = VL_MASK_ALL; /* clear all per-vl cnts */
+	unsigned long vl_select_mask = VL_MASK_ALL; /* clear all per-vl cnts */
 	unsigned long vl;
 
 	if ((nports != 1) || (portn != 1 << port)) {
@@ -3582,8 +3573,7 @@
 	if (counter_select & CS_UNCORRECTABLE_ERRORS)
 		write_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL, 0);
 
-	for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
-			 8 * sizeof(vl_select_mask)) {
+	for_each_set_bit(vl, &vl_select_mask, BITS_PER_LONG) {
 		if (counter_select & CS_PORT_XMIT_DATA)
 			write_port_cntr(ppd, C_TX_FLIT_VL, idx_from_vl(vl), 0);
 
@@ -4836,7 +4826,7 @@
 	int ret;
 	int pkey_idx;
 	int local_mad = 0;
-	u32 resp_len = 0;
+	u32 resp_len = in_wc->byte_len - sizeof(*in_grh);
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 
 	pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
index e1c7996..14d2a90 100644
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -68,8 +68,7 @@
 static unsigned long mmu_node_start(struct mmu_rb_node *);
 static unsigned long mmu_node_last(struct mmu_rb_node *);
 static int mmu_notifier_range_start(struct mmu_notifier *,
-				     struct mm_struct *,
-				     unsigned long, unsigned long, bool);
+		const struct mmu_notifier_range *);
 static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *,
 					   unsigned long, unsigned long);
 static void do_remove(struct mmu_rb_handler *handler,
@@ -77,7 +76,6 @@
 static void handle_remove(struct work_struct *work);
 
 static const struct mmu_notifier_ops mn_opts = {
-	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
 	.invalidate_range_start = mmu_notifier_range_start,
 };
 
@@ -285,10 +283,7 @@
 }
 
 static int mmu_notifier_range_start(struct mmu_notifier *mn,
-				     struct mm_struct *mm,
-				     unsigned long start,
-				     unsigned long end,
-				     bool blockable)
+		const struct mmu_notifier_range *range)
 {
 	struct mmu_rb_handler *handler =
 		container_of(mn, struct mmu_rb_handler, mn);
@@ -298,10 +293,11 @@
 	bool added = false;
 
 	spin_lock_irqsave(&handler->lock, flags);
-	for (node = __mmu_int_rb_iter_first(root, start, end - 1);
+	for (node = __mmu_int_rb_iter_first(root, range->start, range->end-1);
 	     node; node = ptr) {
 		/* Guard against node removal. */
-		ptr = __mmu_int_rb_iter_next(node, start, end - 1);
+		ptr = __mmu_int_rb_iter_next(node, range->start,
+					     range->end - 1);
 		trace_hfi1_mmu_mem_invalidate(node->addr, node->len);
 		if (handler->ops->invalidate(handler->ops_arg, node)) {
 			__mmu_int_rb_remove(node, root);
diff --git a/drivers/infiniband/hw/hfi1/msix.c b/drivers/infiniband/hw/hfi1/msix.c
new file mode 100644
index 0000000..d920b16
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/msix.c
@@ -0,0 +1,363 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+#include "affinity.h"
+#include "sdma.h"
+
+/**
+ * msix_initialize() - Calculate, request and configure MSIx IRQs
+ * @dd: valid hfi1 devdata
+ *
+ */
+int msix_initialize(struct hfi1_devdata *dd)
+{
+	u32 total;
+	int ret;
+	struct hfi1_msix_entry *entries;
+
+	/*
+	 * MSIx interrupt count:
+	 *	one for the general, "slow path" interrupt
+	 *	one per used SDMA engine
+	 *	one per kernel receive context
+	 *	one for each VNIC context
+	 *      ...any new IRQs should be added here.
+	 */
+	total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_vnic_contexts;
+
+	if (total >= CCE_NUM_MSIX_VECTORS)
+		return -EINVAL;
+
+	ret = pci_alloc_irq_vectors(dd->pcidev, total, total, PCI_IRQ_MSIX);
+	if (ret < 0) {
+		dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", ret);
+		return ret;
+	}
+
+	entries = kcalloc(total, sizeof(*dd->msix_info.msix_entries),
+			  GFP_KERNEL);
+	if (!entries) {
+		pci_free_irq_vectors(dd->pcidev);
+		return -ENOMEM;
+	}
+
+	dd->msix_info.msix_entries = entries;
+	spin_lock_init(&dd->msix_info.msix_lock);
+	bitmap_zero(dd->msix_info.in_use_msix, total);
+	dd->msix_info.max_requested = total;
+	dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
+
+	return 0;
+}
+
+/**
+ * msix_request_irq() - Allocate a free MSIx IRQ
+ * @dd: valid devdata
+ * @arg: context information for the IRQ
+ * @handler: IRQ handler
+ * @thread: IRQ thread handler (could be NULL)
+ * @idx: zero-based index if multiple devices are needed
+ * @type: affinity IRQ type
+ *
+ * Allocate an MSIx vector if available, and then create the appropriate
+ * metadata needed to keep track of the PCI IRQ request.
+ *
+ * Return:
+ *   < 0   Error
+ *   >= 0  MSIx vector
+ *
+ */
+static int msix_request_irq(struct hfi1_devdata *dd, void *arg,
+			    irq_handler_t handler, irq_handler_t thread,
+			    u32 idx, enum irq_type type)
+{
+	unsigned long nr;
+	int irq;
+	int ret;
+	const char *err_info;
+	char name[MAX_NAME_SIZE];
+	struct hfi1_msix_entry *me;
+
+	/* Allocate an MSIx vector */
+	spin_lock(&dd->msix_info.msix_lock);
+	nr = find_first_zero_bit(dd->msix_info.in_use_msix,
+				 dd->msix_info.max_requested);
+	if (nr < dd->msix_info.max_requested)
+		__set_bit(nr, dd->msix_info.in_use_msix);
+	spin_unlock(&dd->msix_info.msix_lock);
+
+	if (nr == dd->msix_info.max_requested)
+		return -ENOSPC;
+
+	/* Specific verification and determine the name */
+	switch (type) {
+	case IRQ_GENERAL:
+		/* general interrupt must be MSIx vector 0 */
+		if (nr) {
+			spin_lock(&dd->msix_info.msix_lock);
+			__clear_bit(nr, dd->msix_info.in_use_msix);
+			spin_unlock(&dd->msix_info.msix_lock);
+			dd_dev_err(dd, "Invalid index %lu for GENERAL IRQ\n",
+				   nr);
+			return -EINVAL;
+		}
+		snprintf(name, sizeof(name), DRIVER_NAME "_%d", dd->unit);
+		err_info = "general";
+		break;
+	case IRQ_SDMA:
+		snprintf(name, sizeof(name), DRIVER_NAME "_%d sdma%d",
+			 dd->unit, idx);
+		err_info = "sdma";
+		break;
+	case IRQ_RCVCTXT:
+		snprintf(name, sizeof(name), DRIVER_NAME "_%d kctxt%d",
+			 dd->unit, idx);
+		err_info = "receive context";
+		break;
+	case IRQ_OTHER:
+	default:
+		return -EINVAL;
+	}
+	name[sizeof(name) - 1] = 0;
+
+	irq = pci_irq_vector(dd->pcidev, nr);
+	ret = pci_request_irq(dd->pcidev, nr, handler, thread, arg, name);
+	if (ret) {
+		dd_dev_err(dd,
+			   "%s: request for IRQ %d failed, MSIx %d, err %d\n",
+			   err_info, irq, idx, ret);
+		spin_lock(&dd->msix_info.msix_lock);
+		__clear_bit(nr, dd->msix_info.in_use_msix);
+		spin_unlock(&dd->msix_info.msix_lock);
+		return ret;
+	}
+
+	/*
+	 * assign arg after pci_request_irq call, so it will be
+	 * cleaned up
+	 */
+	me = &dd->msix_info.msix_entries[nr];
+	me->irq = irq;
+	me->arg = arg;
+	me->type = type;
+
+	/* This is a request, so a failure is not fatal */
+	ret = hfi1_get_irq_affinity(dd, me);
+	if (ret)
+		dd_dev_err(dd, "unable to pin IRQ %d\n", ret);
+
+	return nr;
+}
+
+/**
+ * msix_request_rcd_irq() - Helper function for RCVAVAIL IRQs
+ * @rcd: valid rcd context
+ *
+ */
+int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd)
+{
+	int nr;
+
+	nr = msix_request_irq(rcd->dd, rcd, receive_context_interrupt,
+			      receive_context_thread, rcd->ctxt, IRQ_RCVCTXT);
+	if (nr < 0)
+		return nr;
+
+	/*
+	 * Set the interrupt register and mask for this
+	 * context's interrupt.
+	 */
+	rcd->ireg = (IS_RCVAVAIL_START + rcd->ctxt) / 64;
+	rcd->imask = ((u64)1) << ((IS_RCVAVAIL_START + rcd->ctxt) % 64);
+	rcd->msix_intr = nr;
+	remap_intr(rcd->dd, IS_RCVAVAIL_START + rcd->ctxt, nr);
+
+	return 0;
+}
+
+/**
+ * msix_request_sdma_irq() - Helper for getting SDMA IRQ resources
+ * @sde: valid sdma engine
+ *
+ */
+int msix_request_sdma_irq(struct sdma_engine *sde)
+{
+	int nr;
+
+	nr = msix_request_irq(sde->dd, sde, sdma_interrupt, NULL,
+			      sde->this_idx, IRQ_SDMA);
+	if (nr < 0)
+		return nr;
+	sde->msix_intr = nr;
+	remap_sdma_interrupts(sde->dd, sde->this_idx, nr);
+
+	return 0;
+}
+
+/**
+ * enable_sdma_srcs() - Helper to enable SDMA IRQ sources
+ * @dd: valid devdata structure
+ * @i: index of SDMA engine
+ */
+static void enable_sdma_srcs(struct hfi1_devdata *dd, int i)
+{
+	set_intr_bits(dd, IS_SDMA_START + i, IS_SDMA_START + i, true);
+	set_intr_bits(dd, IS_SDMA_PROGRESS_START + i,
+		      IS_SDMA_PROGRESS_START + i, true);
+	set_intr_bits(dd, IS_SDMA_IDLE_START + i, IS_SDMA_IDLE_START + i, true);
+	set_intr_bits(dd, IS_SDMAENG_ERR_START + i, IS_SDMAENG_ERR_START + i,
+		      true);
+}
+
+/**
+ * msix_request_irqs() - Allocate all MSIx IRQs
+ * @dd: valid devdata structure
+ *
+ * Helper function to request the used MSIx IRQs.
+ *
+ */
+int msix_request_irqs(struct hfi1_devdata *dd)
+{
+	int i;
+	int ret;
+
+	ret = msix_request_irq(dd, dd, general_interrupt, NULL, 0, IRQ_GENERAL);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < dd->num_sdma; i++) {
+		struct sdma_engine *sde = &dd->per_sdma[i];
+
+		ret = msix_request_sdma_irq(sde);
+		if (ret)
+			return ret;
+		enable_sdma_srcs(sde->dd, i);
+	}
+
+	for (i = 0; i < dd->n_krcv_queues; i++) {
+		struct hfi1_ctxtdata *rcd = hfi1_rcd_get_by_index_safe(dd, i);
+
+		if (rcd)
+			ret = msix_request_rcd_irq(rcd);
+		hfi1_rcd_put(rcd);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * msix_free_irq() - Free the specified MSIx resources and IRQ
+ * @dd: valid devdata
+ * @msix_intr: MSIx vector to free.
+ *
+ */
+void msix_free_irq(struct hfi1_devdata *dd, u8 msix_intr)
+{
+	struct hfi1_msix_entry *me;
+
+	if (msix_intr >= dd->msix_info.max_requested)
+		return;
+
+	me = &dd->msix_info.msix_entries[msix_intr];
+
+	if (!me->arg) /* => no irq, no affinity */
+		return;
+
+	hfi1_put_irq_affinity(dd, me);
+	pci_free_irq(dd->pcidev, msix_intr, me->arg);
+
+	me->arg = NULL;
+
+	spin_lock(&dd->msix_info.msix_lock);
+	__clear_bit(msix_intr, dd->msix_info.in_use_msix);
+	spin_unlock(&dd->msix_info.msix_lock);
+}
+
+/**
+ * msix_clean_up_interrupts() - Free all MSIx IRQ resources
+ * @dd: valid device data data structure
+ *
+ * Free the MSIx and associated PCI resources, if they have been allocated.
+ */
+void msix_clean_up_interrupts(struct hfi1_devdata *dd)
+{
+	int i;
+	struct hfi1_msix_entry *me = dd->msix_info.msix_entries;
+
+	/* remove irqs - must happen before disabling/turning off */
+	for (i = 0; i < dd->msix_info.max_requested; i++, me++)
+		msix_free_irq(dd, i);
+
+	/* clean structures */
+	kfree(dd->msix_info.msix_entries);
+	dd->msix_info.msix_entries = NULL;
+	dd->msix_info.max_requested = 0;
+
+	pci_free_irq_vectors(dd->pcidev);
+}
+
+/**
+ * msix_vnic_synchronize_irq() - VNIC IRQ synchronize
+ * @dd: valid devdata
+ */
+void msix_vnic_synchronize_irq(struct hfi1_devdata *dd)
+{
+	int i;
+
+	for (i = 0; i < dd->vnic.num_ctxt; i++) {
+		struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i];
+		struct hfi1_msix_entry *me;
+
+		me = &dd->msix_info.msix_entries[rcd->msix_intr];
+
+		synchronize_irq(me->irq);
+	}
+}
diff --git a/drivers/infiniband/hw/hfi1/msix.h b/drivers/infiniband/hw/hfi1/msix.h
new file mode 100644
index 0000000..a514881
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/msix.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _HFI1_MSIX_H
+#define _HFI1_MSIX_H
+
+#include "hfi.h"
+
+/* MSIx interface */
+int msix_initialize(struct hfi1_devdata *dd);
+int msix_request_irqs(struct hfi1_devdata *dd);
+void msix_clean_up_interrupts(struct hfi1_devdata *dd);
+int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd);
+int msix_request_sdma_irq(struct sdma_engine *sde);
+void msix_free_irq(struct hfi1_devdata *dd, u8 msix_intr);
+
+/* VNIC interface */
+void msix_vnic_synchronize_irq(struct hfi1_devdata *dd);
+
+#endif
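A hedged sketch of how the interface above is intended to be sequenced during device probe; error handling is trimmed, the wrapper name is hypothetical, and the real call sites live elsewhere in this patch.

#include "msix.h"

static int example_setup_interrupts(struct hfi1_devdata *dd)
{
	int ret;

	/* size the vector table and allocate it from the PCI core */
	ret = msix_initialize(dd);
	if (ret)
		return ret;

	/* request the general, SDMA and receive-context IRQs */
	ret = msix_request_irqs(dd);
	if (ret)
		msix_clean_up_interrupts(dd);	/* frees vectors and entries */

	return ret;
}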
diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c
new file mode 100644
index 0000000..370a5a8
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/opfn.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#include "hfi.h"
+#include "trace.h"
+#include "qp.h"
+#include "opfn.h"
+
+#define IB_BTHE_E                 BIT(IB_BTHE_E_SHIFT)
+
+#define OPFN_CODE(code) BIT((code) - 1)
+#define OPFN_MASK(code) OPFN_CODE(STL_VERBS_EXTD_##code)
+
+struct hfi1_opfn_type {
+	bool (*request)(struct rvt_qp *qp, u64 *data);
+	bool (*response)(struct rvt_qp *qp, u64 *data);
+	bool (*reply)(struct rvt_qp *qp, u64 data);
+	void (*error)(struct rvt_qp *qp);
+};
+
+static struct hfi1_opfn_type hfi1_opfn_handlers[STL_VERBS_EXTD_MAX] = {
+	[STL_VERBS_EXTD_TID_RDMA] = {
+		.request = tid_rdma_conn_req,
+		.response = tid_rdma_conn_resp,
+		.reply = tid_rdma_conn_reply,
+		.error = tid_rdma_conn_error,
+	},
+};
+
+static struct workqueue_struct *opfn_wq;
+
+static void opfn_schedule_conn_request(struct rvt_qp *qp);
+
+static bool hfi1_opfn_extended(u32 bth1)
+{
+	return !!(bth1 & IB_BTHE_E);
+}
+
+static void opfn_conn_request(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct ib_atomic_wr wr;
+	u16 mask, capcode;
+	struct hfi1_opfn_type *extd;
+	u64 data;
+	unsigned long flags;
+	int ret = 0;
+
+	trace_hfi1_opfn_state_conn_request(qp);
+	spin_lock_irqsave(&priv->opfn.lock, flags);
+	/*
+	 * Exit if the extended bit is not set, or if nothing is requested, or
+	 * if we have completed all requests, or if a previous request is in
+	 * progress
+	 */
+	if (!priv->opfn.extended || !priv->opfn.requested ||
+	    priv->opfn.requested == priv->opfn.completed || priv->opfn.curr)
+		goto done;
+
+	mask = priv->opfn.requested & ~priv->opfn.completed;
+	capcode = ilog2(mask & ~(mask - 1)) + 1;
+	if (capcode >= STL_VERBS_EXTD_MAX) {
+		priv->opfn.completed |= OPFN_CODE(capcode);
+		goto done;
+	}
+
+	extd = &hfi1_opfn_handlers[capcode];
+	if (!extd || !extd->request || !extd->request(qp, &data)) {
+		/*
+		 * Either there is no handler for this capability or the request
+		 * packet could not be generated. Either way, mark it as done so
+		 * we don't keep attempting to complete it.
+		 */
+		priv->opfn.completed |= OPFN_CODE(capcode);
+		goto done;
+	}
+
+	trace_hfi1_opfn_data_conn_request(qp, capcode, data);
+	data = (data & ~0xf) | capcode;
+
+	memset(&wr, 0, sizeof(wr));
+	wr.wr.opcode = IB_WR_OPFN;
+	wr.remote_addr = HFI1_VERBS_E_ATOMIC_VADDR;
+	wr.compare_add = data;
+
+	priv->opfn.curr = capcode;	/* A new request is now in progress */
+	/* Drop opfn.lock before calling ib_post_send() */
+	spin_unlock_irqrestore(&priv->opfn.lock, flags);
+
+	ret = ib_post_send(&qp->ibqp, &wr.wr, NULL);
+	if (ret)
+		goto err;
+	trace_hfi1_opfn_state_conn_request(qp);
+	return;
+err:
+	trace_hfi1_msg_opfn_conn_request(qp, "ib_post_send failed: ret = ",
+					 (u64)ret);
+	spin_lock_irqsave(&priv->opfn.lock, flags);
+	/*
+	 * In case of an unexpected error return from ib_post_send
+	 * clear opfn.curr and reschedule to try again
+	 */
+	priv->opfn.curr = STL_VERBS_EXTD_NONE;
+	opfn_schedule_conn_request(qp);
+done:
+	spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_send_conn_request(struct work_struct *work)
+{
+	struct hfi1_opfn_data *od;
+	struct hfi1_qp_priv *qpriv;
+
+	od = container_of(work, struct hfi1_opfn_data, opfn_work);
+	qpriv = container_of(od, struct hfi1_qp_priv, opfn);
+
+	opfn_conn_request(qpriv->owner);
+}
+
+/*
+ * When QP s_lock is held in the caller, the OPFN request must be scheduled
+ * to a different workqueue to avoid double locking QP s_lock in call to
+ * ib_post_send in opfn_conn_request
+ */
+static void opfn_schedule_conn_request(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	trace_hfi1_opfn_state_sched_conn_request(qp);
+	queue_work(opfn_wq, &priv->opfn.opfn_work);
+}
+
+void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e,
+			struct ib_atomic_eth *ateth)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	u64 data = be64_to_cpu(ateth->compare_data);
+	struct hfi1_opfn_type *extd;
+	u8 capcode;
+	unsigned long flags;
+
+	trace_hfi1_opfn_state_conn_response(qp);
+	capcode = data & 0xf;
+	trace_hfi1_opfn_data_conn_response(qp, capcode, data);
+	if (!capcode || capcode >= STL_VERBS_EXTD_MAX)
+		return;
+
+	extd = &hfi1_opfn_handlers[capcode];
+
+	if (!extd || !extd->response) {
+		e->atomic_data = capcode;
+		return;
+	}
+
+	spin_lock_irqsave(&priv->opfn.lock, flags);
+	if (priv->opfn.completed & OPFN_CODE(capcode)) {
+		/*
+		 * We are receiving a request for a feature that has already
+		 * been negotiated. This may mean that the other side has reset
+		 */
+		priv->opfn.completed &= ~OPFN_CODE(capcode);
+		if (extd->error)
+			extd->error(qp);
+	}
+
+	if (extd->response(qp, &data))
+		priv->opfn.completed |= OPFN_CODE(capcode);
+	e->atomic_data = (data & ~0xf) | capcode;
+	trace_hfi1_opfn_state_conn_response(qp);
+	spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_conn_reply(struct rvt_qp *qp, u64 data)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct hfi1_opfn_type *extd;
+	u8 capcode;
+	unsigned long flags;
+
+	trace_hfi1_opfn_state_conn_reply(qp);
+	capcode = data & 0xf;
+	trace_hfi1_opfn_data_conn_reply(qp, capcode, data);
+	if (!capcode || capcode >= STL_VERBS_EXTD_MAX)
+		return;
+
+	spin_lock_irqsave(&priv->opfn.lock, flags);
+	/*
+	 * Either there is no previous request or the reply is not for the
+	 * current request
+	 */
+	if (!priv->opfn.curr || capcode != priv->opfn.curr)
+		goto done;
+
+	extd = &hfi1_opfn_handlers[capcode];
+
+	if (!extd || !extd->reply)
+		goto clear;
+
+	if (extd->reply(qp, data))
+		priv->opfn.completed |= OPFN_CODE(capcode);
+clear:
+	/*
+	 * Clear opfn.curr to indicate that the previous request is no longer in
+	 * progress
+	 */
+	priv->opfn.curr = STL_VERBS_EXTD_NONE;
+	trace_hfi1_opfn_state_conn_reply(qp);
+done:
+	spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_conn_error(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct hfi1_opfn_type *extd = NULL;
+	unsigned long flags;
+	u16 capcode;
+
+	trace_hfi1_opfn_state_conn_error(qp);
+	trace_hfi1_msg_opfn_conn_error(qp, "error. qp state ", (u64)qp->state);
+	/*
+	 * The QP has gone into the Error state. We have to invalidate all
+	 * negotiated features, including the one in progress (if any). The RC
+	 * QP handling will clean the WQE for the connection request.
+	 */
+	spin_lock_irqsave(&priv->opfn.lock, flags);
+	while (priv->opfn.completed) {
+		capcode = priv->opfn.completed & ~(priv->opfn.completed - 1);
+		extd = &hfi1_opfn_handlers[ilog2(capcode) + 1];
+		if (extd->error)
+			extd->error(qp);
+		priv->opfn.completed &= ~OPFN_CODE(capcode);
+	}
+	priv->opfn.extended = 0;
+	priv->opfn.requested = 0;
+	priv->opfn.curr = STL_VERBS_EXTD_NONE;
+	spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask)
+{
+	struct ib_qp *ibqp = &qp->ibqp;
+	struct hfi1_qp_priv *priv = qp->priv;
+	unsigned long flags;
+
+	if (attr_mask & IB_QP_RETRY_CNT)
+		priv->s_retry = attr->retry_cnt;
+
+	spin_lock_irqsave(&priv->opfn.lock, flags);
+	if (ibqp->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+		struct tid_rdma_params *local = &priv->tid_rdma.local;
+
+		if (attr_mask & IB_QP_TIMEOUT)
+			priv->tid_retry_timeout_jiffies = qp->timeout_jiffies;
+		if (qp->pmtu == enum_to_mtu(OPA_MTU_4096) ||
+		    qp->pmtu == enum_to_mtu(OPA_MTU_8192)) {
+			tid_rdma_opfn_init(qp, local);
+			/*
+			 * We only want to set the OPFN requested bit when the
+			 * QP transitions to RTS.
+			 */
+			if (attr_mask & IB_QP_STATE &&
+			    attr->qp_state == IB_QPS_RTS) {
+				priv->opfn.requested |= OPFN_MASK(TID_RDMA);
+				/*
+				 * If the QP is transitioning to RTS and the
+				 * opfn.completed for TID RDMA has already been
+				 * set, the QP is being moved *back* into RTS.
+				 * We can now renegotiate the TID RDMA
+				 * parameters.
+				 */
+				if (priv->opfn.completed &
+				    OPFN_MASK(TID_RDMA)) {
+					priv->opfn.completed &=
+						~OPFN_MASK(TID_RDMA);
+					/*
+					 * Since the opfn.completed bit was
+					 * already set, it is safe to assume
+					 * that the opfn.extended is also set.
+					 */
+					opfn_schedule_conn_request(qp);
+				}
+			}
+		} else {
+			memset(local, 0, sizeof(*local));
+		}
+	}
+	spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	if (!priv->opfn.extended && hfi1_opfn_extended(bth1) &&
+	    HFI1_CAP_IS_KSET(OPFN)) {
+		priv->opfn.extended = 1;
+		if (qp->state == IB_QPS_RTS)
+			opfn_conn_request(qp);
+	}
+}
+
+int opfn_init(void)
+{
+	opfn_wq = alloc_workqueue("hfi_opfn",
+				  WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
+				  WQ_MEM_RECLAIM,
+				  HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES);
+	if (!opfn_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void opfn_exit(void)
+{
+	if (opfn_wq) {
+		destroy_workqueue(opfn_wq);
+		opfn_wq = NULL;
+	}
+}
diff --git a/drivers/infiniband/hw/hfi1/opfn.h b/drivers/infiniband/hw/hfi1/opfn.h
new file mode 100644
index 0000000..62f93c1
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/opfn.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#ifndef _HFI1_OPFN_H
+#define _HFI1_OPFN_H
+
+/**
+ * DOC: Omni-Path Feature Negotiation (OPFN)
+ *
+ * OPFN is a discovery protocol for Intel Omni-Path fabric that
+ * allows two RC QPs to negotiate a common feature that both QPs
+ * can support. Currently, the only OPA feature that OPFN
+ * supports is TID RDMA.
+ *
+ * Architecture
+ *
+ * OPFN involves the communication between two QPs on the HFI
+ * level on an Omni-Path fabric, and ULPs have no knowledge of
+ * OPFN at all.
+ *
+ * Implementation
+ *
+ * OPFN extends the existing IB RC protocol with the following
+ * changes:
+ * -- Uses Bit 24 (reserved) of DWORD 1 of Base Transport
+ *    Header (BTH1) to indicate that the RC QP supports OPFN;
+ * -- Uses a combination of RC COMPARE_SWAP opcode (0x13) and
+ *    the address U64_MAX (0xFFFFFFFFFFFFFFFF) as an OPFN
+ *    request; The 64-bit data carried with the request/response
+ *    contains the parameters for negotiation and will be
+ *    defined in tid_rdma.c file;
+ * -- Defines IB_WR_RESERVED3 as IB_WR_OPFN.
+ *
+ * The OPFN communication will be triggered when an RC QP
+ * receives a request with Bit 24 of BTH1 set. The responder QP
+ * will then post an OPFN send request with its local
+ * parameters, which will be sent to the requester QP once all
+ * existing requests on the responder QP side have been sent.
+ * Once the requester QP receives the OPFN request, it will
+ * keep a copy of the responder QP's parameters, and return a
+ * response packet with its own local parameters. The responder
+ * QP receives the response packet and keeps a copy of the requester
+ * QP's parameters. After this exchange, each side has the parameters
+ * for both sides and therefore can select the right parameters
+ * for future transactions
+ */
+
+#include <linux/workqueue.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdmavt_qp.h>
+
+/* STL Verbs Extended */
+#define IB_BTHE_E_SHIFT           24
+#define HFI1_VERBS_E_ATOMIC_VADDR U64_MAX
+
+enum hfi1_opfn_codes {
+	STL_VERBS_EXTD_NONE = 0,
+	STL_VERBS_EXTD_TID_RDMA,
+	STL_VERBS_EXTD_MAX
+};
+
+struct hfi1_opfn_data {
+	u8 extended;
+	u16 requested;
+	u16 completed;
+	enum hfi1_opfn_codes curr;
+	/* serialize opfn function calls */
+	spinlock_t lock;
+	struct work_struct opfn_work;
+};
+
+/* WR opcode for OPFN */
+#define IB_WR_OPFN IB_WR_RESERVED3
+
+void opfn_send_conn_request(struct work_struct *work);
+void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e,
+			struct ib_atomic_eth *ateth);
+void opfn_conn_reply(struct rvt_qp *qp, u64 data);
+void opfn_conn_error(struct rvt_qp *qp);
+void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask);
+void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1);
+int opfn_init(void);
+void opfn_exit(void);
+
+#endif /* _HFI1_OPFN_H */
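A plain-C userspace model (illustrative only) of the capability encoding opfn.c uses: each capability owns one bit of the requested/completed masks via OPFN_CODE(), and the low nibble of the 64-bit atomic payload carries the capability code itself.

#include <stdint.h>
#include <stdio.h>

#define OPFN_CODE(code) (1u << ((code) - 1))	/* mirrors BIT((code) - 1) */

enum { EXTD_NONE = 0, EXTD_TID_RDMA = 1, EXTD_MAX };

int main(void)
{
	uint16_t requested = OPFN_CODE(EXTD_TID_RDMA);		/* bit 0 */
	uint64_t payload = (0x1234567890ULL & ~0xfULL) | EXTD_TID_RDMA;

	/* receiver side: the capcode travels in the low nibble */
	uint8_t capcode = payload & 0xf;

	printf("requested mask 0x%x, capcode %u\n", requested, capcode);
	return 0;
}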
diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c
index 6c967dd..61362bd 100644
--- a/drivers/infiniband/hw/hfi1/pcie.c
+++ b/drivers/infiniband/hw/hfi1/pcie.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2019 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -61,19 +61,12 @@
  */
 
 /*
- * Code to adjust PCIe capabilities.
- */
-static void tune_pcie_caps(struct hfi1_devdata *);
-
-/*
  * Do all the common PCIe setup and initialization.
- * devdata is not yet allocated, and is not allocated until after this
- * routine returns success.  Therefore dd_dev_err() can't be used for error
- * printing.
  */
-int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent)
+int hfi1_pcie_init(struct hfi1_devdata *dd)
 {
 	int ret;
+	struct pci_dev *pdev = dd->pcidev;
 
 	ret = pci_enable_device(pdev);
 	if (ret) {
@@ -89,15 +82,13 @@
 		 * about that, it appears.  If the original BAR was retained
 		 * in the kernel data structures, this may be OK.
 		 */
-		hfi1_early_err(&pdev->dev, "pci enable failed: error %d\n",
-			       -ret);
-		goto done;
+		dd_dev_err(dd, "pci enable failed: error %d\n", -ret);
+		return ret;
 	}
 
 	ret = pci_request_regions(pdev, DRIVER_NAME);
 	if (ret) {
-		hfi1_early_err(&pdev->dev,
-			       "pci_request_regions fails: err %d\n", -ret);
+		dd_dev_err(dd, "pci_request_regions fails: err %d\n", -ret);
 		goto bail;
 	}
 
@@ -110,8 +101,7 @@
 		 */
 		ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
 		if (ret) {
-			hfi1_early_err(&pdev->dev,
-				       "Unable to set DMA mask: %d\n", ret);
+			dd_dev_err(dd, "Unable to set DMA mask: %d\n", ret);
 			goto bail;
 		}
 		ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
@@ -119,18 +109,16 @@
 		ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
 	}
 	if (ret) {
-		hfi1_early_err(&pdev->dev,
-			       "Unable to set DMA consistent mask: %d\n", ret);
+		dd_dev_err(dd, "Unable to set DMA consistent mask: %d\n", ret);
 		goto bail;
 	}
 
 	pci_set_master(pdev);
 	(void)pci_enable_pcie_error_reporting(pdev);
-	goto done;
+	return 0;
 
 bail:
 	hfi1_pcie_cleanup(pdev);
-done:
 	return ret;
 }
 
@@ -206,7 +194,7 @@
 		dd_dev_err(dd, "WC mapping of send buffers failed\n");
 		goto nomem;
 	}
-	dd_dev_info(dd, "WC piobase: %p\n for %x", dd->piobase, TXE_PIO_SIZE);
+	dd_dev_info(dd, "WC piobase: %p for %x\n", dd->piobase, TXE_PIO_SIZE);
 
 	dd->physaddr = addr;        /* used for io_remap, etc. */
 
@@ -331,7 +319,9 @@
 	/*
 	 * bus->max_bus_speed is set from the bridge's linkcap Max Link Speed
 	 */
-	if (parent && dd->pcidev->bus->max_bus_speed != PCIE_SPEED_8_0GT) {
+	if (parent &&
+	    (dd->pcidev->bus->max_bus_speed == PCIE_SPEED_2_5GT ||
+	     dd->pcidev->bus->max_bus_speed == PCIE_SPEED_5_0GT)) {
 		dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n");
 		dd->link_gen3_capable = 0;
 	}
@@ -344,26 +334,6 @@
 	return 0;
 }
 
-/*
- * Returns:
- *	- actual number of interrupts allocated or
- *      - error
- */
-int request_msix(struct hfi1_devdata *dd, u32 msireq)
-{
-	int nvec;
-
-	nvec = pci_alloc_irq_vectors(dd->pcidev, msireq, msireq, PCI_IRQ_MSIX);
-	if (nvec < 0) {
-		dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", nvec);
-		return nvec;
-	}
-
-	tune_pcie_caps(dd);
-
-	return nvec;
-}
-
 /* restore command and BARs after a reset has wiped them out */
 int restore_pci_variables(struct hfi1_devdata *dd)
 {
@@ -479,14 +449,15 @@
  * Check and optionally adjust them to maximize our throughput.
  */
 static int hfi1_pcie_caps;
-module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO);
+module_param_named(pcie_caps, hfi1_pcie_caps, int, 0444);
 MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)");
 
-uint aspm_mode = ASPM_MODE_DISABLED;
-module_param_named(aspm, aspm_mode, uint, S_IRUGO);
-MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic");
-
-static void tune_pcie_caps(struct hfi1_devdata *dd)
+/**
+ * tune_pcie_caps() - Code to adjust PCIe capabilities.
+ * @dd: Valid device data structure
+ *
+ */
+void tune_pcie_caps(struct hfi1_devdata *dd)
 {
 	struct pci_dev *parent;
 	u16 rc_mpss, rc_mps, ep_mpss, ep_mps;
@@ -650,7 +621,6 @@
 	struct hfi1_devdata *dd = pci_get_drvdata(pdev);
 
 	dd_dev_info(dd, "HFI1 resume function called\n");
-	pci_cleanup_aer_uncorrect_error_status(pdev);
 	/*
 	 * Running jobs will fail, since it's asynchronous
 	 * unlike sysfs-requested reset.   Better than
@@ -1029,6 +999,7 @@
 	const u8 (*ctle_tunings)[4];
 	uint static_ctle_mode;
 	int return_error = 0;
+	u32 target_width;
 
 	/* PCIe Gen3 is for the ASIC only */
 	if (dd->icode != ICODE_RTL_SILICON)
@@ -1068,6 +1039,9 @@
 		return 0;
 	}
 
+	/* Previous Gen1/Gen2 bus width */
+	target_width = dd->lbus_width;
+
 	/*
 	 * Do the Gen3 transition.  Steps are those of the PCIe Gen3
 	 * recipe.
@@ -1436,11 +1410,12 @@
 	dd_dev_info(dd, "%s: new speed and width: %s\n", __func__,
 		    dd->lbus_info);
 
-	if (dd->lbus_speed != target_speed) { /* not target */
+	if (dd->lbus_speed != target_speed ||
+	    dd->lbus_width < target_width) { /* not target */
 		/* maybe retry */
 		do_retry = retry_count < pcie_retry;
-		dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n",
-			   pcie_target, do_retry ? ", retrying" : "");
+		dd_dev_err(dd, "PCIe link speed or width did not match target%s\n",
+			   do_retry ? ", retrying" : "");
 		retry_count++;
 		if (do_retry) {
 			msleep(100); /* allow time to settle */
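As a side note on the hunk above: the retry decision after a Gen3 transition now depends on both the negotiated speed and the link width saved before the transition. A simplified, hypothetical version of that check (names are illustrative, not the driver's):

#include <stdbool.h>
#include <stdint.h>

/*
 * Retry if the link came back below the requested speed or narrower than
 * the width it had before the transition, and the retry budget remains.
 */
static bool gen3_needs_retry(uint32_t cur_speed, uint32_t cur_width,
			     uint32_t target_speed, uint32_t target_width,
			     uint32_t retries_done, uint32_t retry_limit)
{
	bool off_target = (cur_speed != target_speed) ||
			  (cur_width < target_width);

	return off_target && retries_done < retry_limit;
}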
diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
index 7520576..79126b2 100644
--- a/drivers/infiniband/hw/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -71,14 +71,6 @@
 	}
 }
 
-/* defined in header release 48 and higher */
-#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT
-#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3
-#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull
-#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \
-		<< SEND_CTRL_UNSUPPORTED_VL_SHIFT)
-#endif
-
 /* global control of PIO send */
 void pio_send_control(struct hfi1_devdata *dd, int op)
 {
@@ -750,6 +742,7 @@
 	spin_lock_init(&sc->alloc_lock);
 	spin_lock_init(&sc->release_lock);
 	spin_lock_init(&sc->credit_ctrl_lock);
+	seqlock_init(&sc->waitlock);
 	INIT_LIST_HEAD(&sc->piowait);
 	INIT_WORK(&sc->halt_work, sc_halted);
 	init_waitqueue_head(&sc->halt_wait);
@@ -959,6 +952,22 @@
 		}
 	}
 	spin_unlock(&sc->release_lock);
+
+	write_seqlock(&sc->waitlock);
+	while (!list_empty(&sc->piowait)) {
+		struct iowait *wait;
+		struct rvt_qp *qp;
+		struct hfi1_qp_priv *priv;
+
+		wait = list_first_entry(&sc->piowait, struct iowait, list);
+		qp = iowait_to_qp(wait);
+		priv = qp->priv;
+		list_del_init(&priv->s_iowait.list);
+		priv->s_iowait.lock = NULL;
+		hfi1_qp_wakeup(qp, RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
+	}
+	write_sequnlock(&sc->waitlock);
+
 	spin_unlock_irq(&sc->alloc_lock);
 }
 
@@ -1434,7 +1443,8 @@
  * @cb: optional callback to call when the buffer is finished sending
  * @arg: argument for cb
  *
- * Return a pointer to a PIO buffer if successful, NULL if not enough room.
+ * Return a pointer to a PIO buffer on success, NULL if not enough room,
+ * or an ERR_PTR(-ECOMM) when the link is down.
  */
 struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
 				pio_release_cb cb, void *arg)
@@ -1450,7 +1460,7 @@
 	spin_lock_irqsave(&sc->alloc_lock, flags);
 	if (!(sc->flags & SCF_ENABLED)) {
 		spin_unlock_irqrestore(&sc->alloc_lock, flags);
-		goto done;
+		return ERR_PTR(-ECOMM);
 	}
 
 retry:
@@ -1584,10 +1594,8 @@
 	else
 		sc_del_credit_return_intr(sc);
 	trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl);
-	if (needint) {
-		mmiowb();
+	if (needint)
 		sc_return_credits(sc);
-	}
 }
 
 /**
@@ -1601,14 +1609,12 @@
 static void sc_piobufavail(struct send_context *sc)
 {
 	struct hfi1_devdata *dd = sc->dd;
-	struct hfi1_ibdev *dev = &dd->verbs_dev;
 	struct list_head *list;
 	struct rvt_qp *qps[PIO_WAIT_BATCH_SIZE];
 	struct rvt_qp *qp;
 	struct hfi1_qp_priv *priv;
 	unsigned long flags;
-	uint i, n = 0, max_idx = 0;
-	u8 max_starved_cnt = 0;
+	uint i, n = 0, top_idx = 0;
 
 	if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
 	    dd->send_contexts[sc->sw_index].type != SC_VL15)
@@ -1620,18 +1626,25 @@
 	 * could end up with QPs on the wait list with the interrupt
 	 * disabled.
 	 */
-	write_seqlock_irqsave(&dev->iowait_lock, flags);
+	write_seqlock_irqsave(&sc->waitlock, flags);
 	while (!list_empty(list)) {
 		struct iowait *wait;
 
 		if (n == ARRAY_SIZE(qps))
 			break;
 		wait = list_first_entry(list, struct iowait, list);
+		iowait_get_priority(wait);
 		qp = iowait_to_qp(wait);
 		priv = qp->priv;
 		list_del_init(&priv->s_iowait.list);
 		priv->s_iowait.lock = NULL;
-		iowait_starve_find_max(wait, &max_starved_cnt, n, &max_idx);
+		if (n) {
+			priv = qps[top_idx]->priv;
+			top_idx = iowait_priority_update_top(wait,
+							     &priv->s_iowait,
+							     n, top_idx);
+		}
+
 		/* refcount held until actual wake up */
 		qps[n++] = qp;
 	}
@@ -1644,14 +1657,14 @@
 		if (!list_empty(list))
 			hfi1_sc_wantpiobuf_intr(sc, 1);
 	}
-	write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+	write_sequnlock_irqrestore(&sc->waitlock, flags);
 
-	/* Wake up the most starved one first */
+	/* Wake up the top-priority one first */
 	if (n)
-		hfi1_qp_wakeup(qps[max_idx],
+		hfi1_qp_wakeup(qps[top_idx],
 			       RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
 	for (i = 0; i < n; i++)
-		if (i != max_idx)
+		if (i != top_idx)
 			hfi1_qp_wakeup(qps[i],
 				       RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
 }
@@ -2106,11 +2119,10 @@
 		int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return);
 
 		set_dev_node(&dd->pcidev->dev, i);
-		dd->cr_base[i].va = dma_zalloc_coherent(
-					&dd->pcidev->dev,
-					bytes,
-					&dd->cr_base[i].dma,
-					GFP_KERNEL);
+		dd->cr_base[i].va = dma_alloc_coherent(&dd->pcidev->dev,
+						       bytes,
+						       &dd->cr_base[i].dma,
+						       GFP_KERNEL);
 		if (!dd->cr_base[i].va) {
 			set_dev_node(&dd->pcidev->dev, dd->node);
 			dd_dev_err(dd,
@@ -2145,3 +2157,28 @@
 	kfree(dd->cr_base);
 	dd->cr_base = NULL;
 }
+
+void seqfile_dump_sci(struct seq_file *s, u32 i,
+		      struct send_context_info *sci)
+{
+	struct send_context *sc = sci->sc;
+	u64 reg;
+
+	seq_printf(s, "SCI %u: type %u base %u credits %u\n",
+		   i, sci->type, sci->base, sci->credits);
+	seq_printf(s, "  flags 0x%x sw_inx %u hw_ctxt %u grp %u\n",
+		   sc->flags,  sc->sw_index, sc->hw_context, sc->group);
+	seq_printf(s, "  sr_size %u credits %u sr_head %u sr_tail %u\n",
+		   sc->sr_size, sc->credits, sc->sr_head, sc->sr_tail);
+	seq_printf(s, "  fill %lu free %lu fill_wrap %u alloc_free %lu\n",
+		   sc->fill, sc->free, sc->fill_wrap, sc->alloc_free);
+	seq_printf(s, "  credit_intr_count %u credit_ctrl 0x%llx\n",
+		   sc->credit_intr_count, sc->credit_ctrl);
+	reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_STATUS));
+	seq_printf(s, "  *hw_free %llu CurrentFree %llu LastReturned %llu\n",
+		   (le64_to_cpu(*sc->hw_free) & CR_COUNTER_SMASK) >>
+		    CR_COUNTER_SHIFT,
+		   (reg >> SC(CREDIT_STATUS_CURRENT_FREE_COUNTER_SHIFT)) &
+		    SC(CREDIT_STATUS_CURRENT_FREE_COUNTER_MASK),
+		   reg & SC(CREDIT_STATUS_LAST_RETURNED_COUNTER_SMASK));
+}
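The reworked sc_piobufavail() above gathers a batch of waiters, remembers which one has the highest priority, and wakes that one before the others. A stripped-down, userspace-style sketch of the same pattern (hypothetical types, not the driver's iowait machinery):

#include <stddef.h>
#include <stdint.h>

struct waiter {
	uint8_t priority;			/* hypothetical priority field */
};

static void wake(struct waiter *w)
{
	(void)w;				/* stand-in for hfi1_qp_wakeup() */
}

static void wake_batch(struct waiter *waiters[], size_t n)
{
	size_t i, top_idx = 0;

	/* Track the highest-priority entry while draining the wait list. */
	for (i = 1; i < n; i++)
		if (waiters[i]->priority > waiters[top_idx]->priority)
			top_idx = i;

	/* Wake the top-priority waiter first, then everyone else. */
	if (n)
		wake(waiters[top_idx]);
	for (i = 0; i < n; i++)
		if (i != top_idx)
			wake(waiters[i]);
}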
diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h
index aaf372c..c9a58b6 100644
--- a/drivers/infiniband/hw/hfi1/pio.h
+++ b/drivers/infiniband/hw/hfi1/pio.h
@@ -127,6 +127,8 @@
 	volatile __le64 *hw_free;	/* HW free counter */
 	/* list for PIO waiters */
 	struct list_head piowait  ____cacheline_aligned_in_smp;
+	seqlock_t waitlock;
+
 	spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp;
 	u32 credit_intr_count;		/* count of credit intr users */
 	u64 credit_ctrl;		/* cache for credit control */
@@ -329,4 +331,7 @@
 void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes);
 void seg_pio_copy_end(struct pio_buf *pbuf);
 
+void seqfile_dump_sci(struct seq_file *s, u32 i,
+		      struct send_context_info *sci);
+
 #endif /* _PIO_H */
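The new waitlock field moves PIO waiter serialization from the device-wide iowait_lock to a per-send-context seqlock. A small kernel-style fragment showing the same enqueue pattern (illustrative only; the real code operates on struct iowait entries and the send_context above):

#include <linux/list.h>
#include <linux/seqlock.h>

struct demo_wait_ctx {
	seqlock_t waitlock;			/* serializes the wait list */
	struct list_head piowait;
};

static void demo_wait_ctx_init(struct demo_wait_ctx *ctx)
{
	seqlock_init(&ctx->waitlock);
	INIT_LIST_HEAD(&ctx->piowait);
}

static void demo_wait_enqueue(struct demo_wait_ctx *ctx,
			      struct list_head *waiter)
{
	write_seqlock(&ctx->waitlock);
	list_add_tail(waiter, &ctx->piowait);
	write_sequnlock(&ctx->waitlock);
}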
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index 9b1e84a..f8e733a 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2018 Intel Corporation.
+ * Copyright(c) 2015 - 2019 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -66,7 +66,7 @@
 static void flush_tx_list(struct rvt_qp *qp);
 static int iowait_sleep(
 	struct sdma_engine *sde,
-	struct iowait *wait,
+	struct iowait_work *wait,
 	struct sdma_txreq *stx,
 	unsigned int seq,
 	bool pkts_sent);
@@ -132,17 +132,27 @@
 	.qpt_support = BIT(IB_QPT_RC),
 },
 
+[IB_WR_OPFN] = {
+	.length = sizeof(struct ib_atomic_wr),
+	.qpt_support = BIT(IB_QPT_RC),
+	.flags = RVT_OPERATION_USE_RESERVE,
+},
+
+[IB_WR_TID_RDMA_WRITE] = {
+	.length = sizeof(struct ib_rdma_wr),
+	.qpt_support = BIT(IB_QPT_RC),
+	.flags = RVT_OPERATION_IGN_RNR_CNT,
+},
+
 };
 
-static void flush_tx_list(struct rvt_qp *qp)
+static void flush_list_head(struct list_head *l)
 {
-	struct hfi1_qp_priv *priv = qp->priv;
-
-	while (!list_empty(&priv->s_iowait.tx_head)) {
+	while (!list_empty(l)) {
 		struct sdma_txreq *tx;
 
 		tx = list_first_entry(
-			&priv->s_iowait.tx_head,
+			l,
 			struct sdma_txreq,
 			list);
 		list_del_init(&tx->list);
@@ -151,6 +161,14 @@
 	}
 }
 
+static void flush_tx_list(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	flush_list_head(&iowait_get_ib_work(&priv->s_iowait)->tx_head);
+	flush_list_head(&iowait_get_tid_work(&priv->s_iowait)->tx_head);
+}
+
 static void flush_iowait(struct rvt_qp *qp)
 {
 	struct hfi1_qp_priv *priv = qp->priv;
@@ -279,41 +297,58 @@
 		priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
 		qp_set_16b(qp);
 	}
+
+	opfn_qp_init(qp, attr, attr_mask);
 }
 
 /**
- * hfi1_check_send_wqe - validate wqe
+ * hfi1_setup_wqe - set up the wqe
  * @qp - The qp
  * @wqe - The built wqe
+ * @call_send - Determine if the send should be posted or scheduled.
  *
- * validate wqe.  This is called
- * prior to inserting the wqe into
- * the ring but after the wqe has been
- * setup.
+ * Perform setup of the wqe.  This is called
+ * prior to inserting the wqe into the ring but after
+ * the wqe has been setup by RDMAVT. This function
+ * allows the driver the opportunity to perform
+ * validation and additional setup of the wqe.
  *
  * Returns 0 on success, -EINVAL on failure
  *
  */
-int hfi1_check_send_wqe(struct rvt_qp *qp,
-			struct rvt_swqe *wqe)
+int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send)
 {
 	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
 	struct rvt_ah *ah;
+	struct hfi1_pportdata *ppd;
+	struct hfi1_devdata *dd;
 
 	switch (qp->ibqp.qp_type) {
 	case IB_QPT_RC:
+		hfi1_setup_tid_rdma_wqe(qp, wqe);
+		/* fall through */
 	case IB_QPT_UC:
 		if (wqe->length > 0x80000000U)
 			return -EINVAL;
+		if (wqe->length > qp->pmtu)
+			*call_send = false;
 		break;
 	case IB_QPT_SMI:
-		ah = ibah_to_rvtah(wqe->ud_wr.ah);
-		if (wqe->length > (1 << ah->log_pmtu))
+		/*
+		 * SM packets should exclusively use VL15 and their SL is
+		 * ignored (IBTA v1.3, Section 3.5.8.2). Therefore, when ah
+		 * is created, SL is 0 in most cases and as a result some
+		 * fields (vl and pmtu) in ah may not be set correctly,
+		 * depending on the SL2SC and SC2VL tables at the time.
+		 */
+		ppd = ppd_from_ibp(ibp);
+		dd = dd_from_ppd(ppd);
+		if (wqe->length > dd->vld[15].mtu)
 			return -EINVAL;
 		break;
 	case IB_QPT_GSI:
 	case IB_QPT_UD:
-		ah = ibah_to_rvtah(wqe->ud_wr.ah);
+		ah = rvt_get_swqe_ah(wqe);
 		if (wqe->length > (1 << ah->log_pmtu))
 			return -EINVAL;
 		if (ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)] == 0xf)
@@ -321,7 +356,14 @@
 	default:
 		break;
 	}
-	return wqe->length <= piothreshold;
+
+	/*
+	 * System latency between send and schedule is large enough that
+	 * forcing call_send to true for piothreshold packets is necessary.
+	 */
+	if (wqe->length <= piothreshold)
+		*call_send = true;
+	return 0;
 }
 
 /**
@@ -333,7 +375,7 @@
  * It is only used in the post send, which doesn't hold
  * the s_lock.
  */
-void _hfi1_schedule_send(struct rvt_qp *qp)
+bool _hfi1_schedule_send(struct rvt_qp *qp)
 {
 	struct hfi1_qp_priv *priv = qp->priv;
 	struct hfi1_ibport *ibp =
@@ -341,28 +383,26 @@
 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
 	struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
 
-	iowait_schedule(&priv->s_iowait, ppd->hfi1_wq,
-			priv->s_sde ?
-			priv->s_sde->cpu :
-			cpumask_first(cpumask_of_node(dd->node)));
+	return iowait_schedule(&priv->s_iowait, ppd->hfi1_wq,
+			       priv->s_sde ?
+			       priv->s_sde->cpu :
+			       cpumask_first(cpumask_of_node(dd->node)));
 }
 
 static void qp_pio_drain(struct rvt_qp *qp)
 {
-	struct hfi1_ibdev *dev;
 	struct hfi1_qp_priv *priv = qp->priv;
 
 	if (!priv->s_sendcontext)
 		return;
-	dev = to_idev(qp->ibqp.device);
 	while (iowait_pio_pending(&priv->s_iowait)) {
-		write_seqlock_irq(&dev->iowait_lock);
+		write_seqlock_irq(&priv->s_sendcontext->waitlock);
 		hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 1);
-		write_sequnlock_irq(&dev->iowait_lock);
+		write_sequnlock_irq(&priv->s_sendcontext->waitlock);
 		iowait_pio_drain(&priv->s_iowait);
-		write_seqlock_irq(&dev->iowait_lock);
+		write_seqlock_irq(&priv->s_sendcontext->waitlock);
 		hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 0);
-		write_sequnlock_irq(&dev->iowait_lock);
+		write_sequnlock_irq(&priv->s_sendcontext->waitlock);
 	}
 }
 
@@ -372,12 +412,37 @@
  *
  * This schedules qp progress and caller should hold
  * the s_lock.
+ * @return true if the first leg is scheduled;
+ * false if the first leg is not scheduled.
  */
-void hfi1_schedule_send(struct rvt_qp *qp)
+bool hfi1_schedule_send(struct rvt_qp *qp)
 {
 	lockdep_assert_held(&qp->s_lock);
-	if (hfi1_send_ok(qp))
+	if (hfi1_send_ok(qp)) {
 		_hfi1_schedule_send(qp);
+		return true;
+	}
+	if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+		iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait,
+				IOWAIT_PENDING_IB);
+	return false;
+}
+
+static void hfi1_qp_schedule(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	bool ret;
+
+	if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_IB)) {
+		ret = hfi1_schedule_send(qp);
+		if (ret)
+			iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
+	}
+	if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_TID)) {
+		ret = hfi1_schedule_tid_send(qp);
+		if (ret)
+			iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+	}
 }
 
 void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
@@ -388,16 +453,41 @@
 	if (qp->s_flags & flag) {
 		qp->s_flags &= ~flag;
 		trace_hfi1_qpwakeup(qp, flag);
-		hfi1_schedule_send(qp);
+		hfi1_qp_schedule(qp);
 	}
 	spin_unlock_irqrestore(&qp->s_lock, flags);
 	/* Notify hfi1_destroy_qp() if it is waiting. */
 	rvt_put_qp(qp);
 }
 
+void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) {
+		qp->s_flags &= ~RVT_S_BUSY;
+		/*
+		 * If we are sending a first-leg packet from the second leg,
+		 * we need to clear the busy flag from priv->s_flags to
+		 * avoid a race condition when the qp wakes up before
+		 * the call to hfi1_verbs_send() returns to the second
+		 * leg. In that case, the second leg will terminate without
+		 * being re-scheduled, resulting in failure to send TID RDMA
+		 * WRITE DATA and TID RDMA ACK packets.
+		 */
+		if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+			priv->s_flags &= ~(HFI1_S_TID_BUSY_SET |
+					   RVT_S_BUSY);
+			iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+		}
+	} else {
+		priv->s_flags &= ~RVT_S_BUSY;
+	}
+}
+
 static int iowait_sleep(
 	struct sdma_engine *sde,
-	struct iowait *wait,
+	struct iowait_work *wait,
 	struct sdma_txreq *stx,
 	uint seq,
 	bool pkts_sent)
@@ -407,7 +497,6 @@
 	struct hfi1_qp_priv *priv;
 	unsigned long flags;
 	int ret = 0;
-	struct hfi1_ibdev *dev;
 
 	qp = tx->qp;
 	priv = qp->priv;
@@ -420,9 +509,8 @@
 		 * buffer and undoing the side effects of the copy.
 		 */
 		/* Make a common routine? */
-		dev = &sde->dd->verbs_dev;
 		list_add_tail(&stx->list, &wait->tx_head);
-		write_seqlock(&dev->iowait_lock);
+		write_seqlock(&sde->waitlock);
 		if (sdma_progress(sde, seq, stx))
 			goto eagain;
 		if (list_empty(&priv->s_iowait.list)) {
@@ -431,14 +519,15 @@
 
 			ibp->rvp.n_dmawait++;
 			qp->s_flags |= RVT_S_WAIT_DMA_DESC;
+			iowait_get_priority(&priv->s_iowait);
 			iowait_queue(pkts_sent, &priv->s_iowait,
 				     &sde->dmawait);
-			priv->s_iowait.lock = &dev->iowait_lock;
+			priv->s_iowait.lock = &sde->waitlock;
 			trace_hfi1_qpsleep(qp, RVT_S_WAIT_DMA_DESC);
 			rvt_get_qp(qp);
 		}
-		write_sequnlock(&dev->iowait_lock);
-		qp->s_flags &= ~RVT_S_BUSY;
+		write_sequnlock(&sde->waitlock);
+		hfi1_qp_unbusy(qp, wait);
 		spin_unlock_irqrestore(&qp->s_lock, flags);
 		ret = -EBUSY;
 	} else {
@@ -447,7 +536,7 @@
 	}
 	return ret;
 eagain:
-	write_sequnlock(&dev->iowait_lock);
+	write_sequnlock(&sde->waitlock);
 	spin_unlock_irqrestore(&qp->s_lock, flags);
 	list_del_init(&stx->list);
 	return -EAGAIN;
@@ -480,6 +569,17 @@
 	spin_unlock_irqrestore(&qp->s_lock, flags);
 }
 
+static void hfi1_init_priority(struct iowait *w)
+{
+	struct rvt_qp *qp = iowait_to_qp(w);
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	if (qp->s_flags & RVT_S_ACK_PENDING)
+		w->priority++;
+	if (priv->s_flags & RVT_S_ACK_PENDING)
+		w->priority++;
+}
+
 /**
  * qp_to_sdma_engine - map a qp to a send engine
  * @qp: the QP
@@ -602,8 +702,8 @@
 		   sde ? sde->this_idx : 0,
 		   send_context,
 		   send_context ? send_context->sw_index : 0,
-		   ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->head,
-		   ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->tail,
+		   ib_cq_head(qp->ibqp.send_cq),
+		   ib_cq_tail(qp->ibqp.send_cq),
 		   qp->pid,
 		   qp->s_state,
 		   qp->s_ack_state,
@@ -637,9 +737,13 @@
 		&priv->s_iowait,
 		1,
 		_hfi1_do_send,
+		_hfi1_do_tid_send,
 		iowait_sleep,
 		iowait_wakeup,
-		iowait_sdma_drained);
+		iowait_sdma_drained,
+		hfi1_init_priority);
+	/* Init to a value to start the running average correctly */
+	priv->s_running_pkt_size = piothreshold / 2;
 	return priv;
 }
 
@@ -647,6 +751,7 @@
 {
 	struct hfi1_qp_priv *priv = qp->priv;
 
+	hfi1_qp_priv_tid_free(rdi, qp);
 	kfree(priv->s_ahg);
 	kfree(priv);
 }
@@ -680,19 +785,24 @@
 {
 	lockdep_assert_held(&qp->s_lock);
 	flush_iowait(qp);
+	hfi1_tid_rdma_flush_wait(qp);
 }
 
 void stop_send_queue(struct rvt_qp *qp)
 {
 	struct hfi1_qp_priv *priv = qp->priv;
 
-	cancel_work_sync(&priv->s_iowait.iowork);
+	iowait_cancel_work(&priv->s_iowait);
+	if (cancel_work_sync(&priv->tid_rdma.trigger_work))
+		rvt_put_qp(qp);
 }
 
 void quiesce_qp(struct rvt_qp *qp)
 {
 	struct hfi1_qp_priv *priv = qp->priv;
 
+	hfi1_del_tid_reap_timer(qp);
+	hfi1_del_tid_retry_timer(qp);
 	iowait_sdma_drain(&priv->s_iowait);
 	qp_pio_drain(qp);
 	flush_tx_list(qp);
@@ -700,8 +810,13 @@
 
 void notify_qp_reset(struct rvt_qp *qp)
 {
+	hfi1_qp_kern_exp_rcv_clear_all(qp);
 	qp->r_adefered = 0;
 	clear_ahg(qp);
+
+	/* Clear any OPFN state */
+	if (qp->ibqp.qp_type == IB_QPT_RC)
+		opfn_conn_error(qp);
 }
 
 /*
@@ -783,8 +898,11 @@
 	if (lock) {
 		write_seqlock(lock);
 		if (!list_empty(&priv->s_iowait.list) &&
-		    !(qp->s_flags & RVT_S_BUSY)) {
-			qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
+		    !(qp->s_flags & RVT_S_BUSY) &&
+		    !(priv->s_flags & RVT_S_BUSY)) {
+			qp->s_flags &= ~HFI1_S_ANY_WAIT_IO;
+			iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
+			iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
 			list_del_init(&priv->s_iowait.list);
 			priv->s_iowait.lock = NULL;
 			rvt_put_qp(qp);
@@ -792,7 +910,8 @@
 		write_sequnlock(lock);
 	}
 
-	if (!(qp->s_flags & RVT_S_BUSY)) {
+	if (!(qp->s_flags & RVT_S_BUSY) && !(priv->s_flags & RVT_S_BUSY)) {
+		qp->s_hdrwords = 0;
 		if (qp->s_rdma_mr) {
 			rvt_put_mr(qp->s_rdma_mr);
 			qp->s_rdma_mr = NULL;
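The hfi1_qp_schedule() helper added above treats the IB and TID work items as two independent legs, each guarded by a pending flag that is cleared only once its leg was actually queued. A compact, self-contained restatement of that idea (hypothetical types; the driver uses iowait flags and workqueues):

#include <stdbool.h>

enum {
	PENDING_IB  = 1 << 0,			/* first-leg (IB) work pending */
	PENDING_TID = 1 << 1,			/* second-leg (TID) work pending */
};

struct two_leg_qp {
	unsigned int pending;
	bool (*schedule_ib)(struct two_leg_qp *qp);
	bool (*schedule_tid)(struct two_leg_qp *qp);
};

static void qp_schedule(struct two_leg_qp *qp)
{
	/* Only clear a pending flag when its leg really got scheduled. */
	if ((qp->pending & PENDING_IB) && qp->schedule_ib(qp))
		qp->pending &= ~PENDING_IB;
	if ((qp->pending & PENDING_TID) && qp->schedule_tid(qp))
		qp->pending &= ~PENDING_TID;
}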
diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
index 078cff7..b670321 100644
--- a/drivers/infiniband/hw/hfi1/qp.h
+++ b/drivers/infiniband/hw/hfi1/qp.h
@@ -58,28 +58,22 @@
 extern const struct rvt_operation_params hfi1_post_parms[];
 
 /*
- * Send if not busy or waiting for I/O and either
- * a RC response is pending or we can process send work requests.
- */
-static inline int hfi1_send_ok(struct rvt_qp *qp)
-{
-	return !(qp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT_IO)) &&
-		(verbs_txreq_queued(qp) ||
-		(qp->s_flags & RVT_S_RESP_PENDING) ||
-		 !(qp->s_flags & RVT_S_ANY_WAIT_SEND));
-}
-
-/*
  * Driver specific s_flags starting at bit 31 down to HFI1_S_MIN_BIT_MASK
  *
  * HFI1_S_AHG_VALID - ahg header valid on chip
  * HFI1_S_AHG_CLEAR - have send engine clear ahg state
  * HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain
+ * HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource
+ * HFI1_S_WAIT_TID_RESP - waiting for a TID RDMA WRITE response
+ * HFI1_S_WAIT_HALT - halt the first leg send engine
  * HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1
  */
 #define HFI1_S_AHG_VALID         0x80000000
 #define HFI1_S_AHG_CLEAR         0x40000000
 #define HFI1_S_WAIT_PIO_DRAIN    0x20000000
+#define HFI1_S_WAIT_TID_SPACE    0x10000000
+#define HFI1_S_WAIT_TID_RESP     0x08000000
+#define HFI1_S_WAIT_HALT         0x04000000
 #define HFI1_S_MIN_BIT_MASK      0x01000000
 
 /*
@@ -88,6 +82,21 @@
 
 #define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN)
 #define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND)
+#define HFI1_S_ANY_TID_WAIT_SEND (RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA)
+
+/*
+ * Send if not busy or waiting for I/O and either
+ * a RC response is pending or we can process send work requests.
+ */
+static inline int hfi1_send_ok(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	return !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)) &&
+		(verbs_txreq_queued(iowait_get_ib_work(&priv->s_iowait)) ||
+		(qp->s_flags & RVT_S_RESP_PENDING) ||
+		 !(qp->s_flags & RVT_S_ANY_WAIT_SEND));
+}
 
 /*
  * free_ahg - clear ahg from QP
@@ -129,8 +138,8 @@
 
 void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter);
 
-void _hfi1_schedule_send(struct rvt_qp *qp);
-void hfi1_schedule_send(struct rvt_qp *qp);
+bool _hfi1_schedule_send(struct rvt_qp *qp);
+bool hfi1_schedule_send(struct rvt_qp *qp);
 
 void hfi1_migrate_qp(struct rvt_qp *qp);
 
@@ -150,4 +159,5 @@
 u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu);
 int mtu_to_path_mtu(u32 mtu);
 void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl);
+void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait);
 #endif /* _QP_H */
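The driver-private s_flags bits in the hunk above are all allocated at or above HFI1_S_MIN_BIT_MASK so they cannot collide with the rdmavt-owned bits below it. A compile-time guard of that invariant could look like this (illustrative; the values are copied from the header, but the assert itself is not in the tree):

#define DEMO_HFI1_S_WAIT_TID_SPACE	0x10000000
#define DEMO_HFI1_S_WAIT_TID_RESP	0x08000000
#define DEMO_HFI1_S_WAIT_HALT		0x04000000
#define DEMO_HFI1_S_MIN_BIT_MASK	0x01000000

_Static_assert(DEMO_HFI1_S_WAIT_HALT >= DEMO_HFI1_S_MIN_BIT_MASK &&
	       DEMO_HFI1_S_WAIT_TID_RESP >= DEMO_HFI1_S_MIN_BIT_MASK &&
	       DEMO_HFI1_S_WAIT_TID_SPACE >= DEMO_HFI1_S_MIN_BIT_MASK,
	       "hfi1 private s_flags must stay above the rdmavt bit range");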
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index 9bd63ab..1a3c647 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -51,24 +51,48 @@
 
 #include "hfi.h"
 #include "qp.h"
+#include "rc.h"
 #include "verbs_txreq.h"
 #include "trace.h"
 
-/* cut down ridiculously long IB macro names */
-#define OP(x) RC_OP(x)
-
-static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
-		       u32 psn, u32 pmtu)
+struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
+				      u8 *prev_ack, bool *scheduled)
+	__must_hold(&qp->s_lock)
 {
-	u32 len;
+	struct rvt_ack_entry *e = NULL;
+	u8 i, p;
+	bool s = true;
 
-	len = delta_psn(psn, wqe->psn) * pmtu;
-	ss->sge = wqe->sg_list[0];
-	ss->sg_list = wqe->sg_list + 1;
-	ss->num_sge = wqe->wr.num_sge;
-	ss->total_len = wqe->length;
-	rvt_skip_sge(ss, len, false);
-	return wqe->length - len;
+	for (i = qp->r_head_ack_queue; ; i = p) {
+		if (i == qp->s_tail_ack_queue)
+			s = false;
+		if (i)
+			p = i - 1;
+		else
+			p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
+		if (p == qp->r_head_ack_queue) {
+			e = NULL;
+			break;
+		}
+		e = &qp->s_ack_queue[p];
+		if (!e->opcode) {
+			e = NULL;
+			break;
+		}
+		if (cmp_psn(psn, e->psn) >= 0) {
+			if (p == qp->s_tail_ack_queue &&
+			    cmp_psn(psn, e->lpsn) <= 0)
+				s = false;
+			break;
+		}
+	}
+	if (prev)
+		*prev = p;
+	if (prev_ack)
+		*prev_ack = i;
+	if (scheduled)
+		*scheduled = s;
+	return e;
 }
 
 /**
@@ -87,20 +111,25 @@
 		       struct hfi1_pkt_state *ps)
 {
 	struct rvt_ack_entry *e;
-	u32 hwords;
-	u32 len;
-	u32 bth0;
-	u32 bth2;
+	u32 hwords, hdrlen;
+	u32 len = 0;
+	u32 bth0 = 0, bth2 = 0;
+	u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
 	int middle = 0;
 	u32 pmtu = qp->pmtu;
-	struct hfi1_qp_priv *priv = qp->priv;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	bool last_pkt;
+	u32 delta;
+	u8 next = qp->s_tail_ack_queue;
+	struct tid_rdma_request *req;
 
+	trace_hfi1_rsp_make_rc_ack(qp, 0);
 	lockdep_assert_held(&qp->s_lock);
 	/* Don't send an ACK if we aren't supposed to. */
 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
 		goto bail;
 
-	if (priv->hdr_type == HFI1_PKT_TYPE_9B)
+	if (qpriv->hdr_type == HFI1_PKT_TYPE_9B)
 		/* header size in 32-bit words LRH+BTH = (8+12)/4. */
 		hwords = 5;
 	else
@@ -111,10 +140,7 @@
 	case OP(RDMA_READ_RESPONSE_LAST):
 	case OP(RDMA_READ_RESPONSE_ONLY):
 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
-		if (e->rdma_sge.mr) {
-			rvt_put_mr(e->rdma_sge.mr);
-			e->rdma_sge.mr = NULL;
-		}
+		release_rdma_sge_mr(e);
 		/* FALLTHROUGH */
 	case OP(ATOMIC_ACKNOWLEDGE):
 		/*
@@ -122,8 +148,18 @@
 		 * response has been sent instead of only being
 		 * constructed.
 		 */
-		if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
-			qp->s_tail_ack_queue = 0;
+		if (++next > rvt_size_atomic(&dev->rdi))
+			next = 0;
+		/*
+		 * Only advance the s_acked_ack_queue pointer if there
+		 * have been no TID RDMA requests.
+		 */
+		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+		if (e->opcode != TID_OP(WRITE_REQ) &&
+		    qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+			qp->s_acked_ack_queue = next;
+		qp->s_tail_ack_queue = next;
+		trace_hfi1_rsp_make_rc_ack(qp, e->psn);
 		/* FALLTHROUGH */
 	case OP(SEND_ONLY):
 	case OP(ACKNOWLEDGE):
@@ -135,6 +171,12 @@
 		}
 
 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+		/* Check for tid write fence */
+		if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) ||
+		    hfi1_tid_rdma_ack_interlock(qp, e)) {
+			iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB);
+			goto bail;
+		}
 		if (e->opcode == OP(RDMA_READ_REQUEST)) {
 			/*
 			 * If a RDMA read response is being resent and
@@ -144,6 +186,10 @@
 			 */
 			len = e->rdma_sge.sge_length;
 			if (len && !e->rdma_sge.mr) {
+				if (qp->s_acked_ack_queue ==
+				    qp->s_tail_ack_queue)
+					qp->s_acked_ack_queue =
+						qp->r_head_ack_queue;
 				qp->s_tail_ack_queue = qp->r_head_ack_queue;
 				goto bail;
 			}
@@ -165,6 +211,45 @@
 			hwords++;
 			qp->s_ack_rdma_psn = e->psn;
 			bth2 = mask_psn(qp->s_ack_rdma_psn++);
+		} else if (e->opcode == TID_OP(WRITE_REQ)) {
+			/*
+			 * If a TID RDMA WRITE RESP is being resent, we have to
+			 * wait for the actual request. All requests that are to
+			 * be resent will have their state set to
+			 * TID_REQUEST_RESEND. When the new request arrives, the
+			 * state will be changed to TID_REQUEST_RESEND_ACTIVE.
+			 */
+			req = ack_to_tid_req(e);
+			if (req->state == TID_REQUEST_RESEND ||
+			    req->state == TID_REQUEST_INIT_RESEND)
+				goto bail;
+			qp->s_ack_state = TID_OP(WRITE_RESP);
+			qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg);
+			goto write_resp;
+		} else if (e->opcode == TID_OP(READ_REQ)) {
+			/*
+			 * If a TID RDMA read response is being resent and
+			 * we haven't seen the duplicate request yet,
+			 * then stop sending the remaining responses the
+			 * responder has seen until the requester re-sends it.
+			 */
+			len = e->rdma_sge.sge_length;
+			if (len && !e->rdma_sge.mr) {
+				if (qp->s_acked_ack_queue ==
+				    qp->s_tail_ack_queue)
+					qp->s_acked_ack_queue =
+						qp->r_head_ack_queue;
+				qp->s_tail_ack_queue = qp->r_head_ack_queue;
+				goto bail;
+			}
+			/* Copy SGE state in case we need to resend */
+			ps->s_txreq->mr = e->rdma_sge.mr;
+			if (ps->s_txreq->mr)
+				rvt_get_mr(ps->s_txreq->mr);
+			qp->s_ack_rdma_sge.sge = e->rdma_sge;
+			qp->s_ack_rdma_sge.num_sge = 1;
+			qp->s_ack_state = TID_OP(READ_RESP);
+			goto read_resp;
 		} else {
 			/* COMPARE_SWAP or FETCH_ADD */
 			ps->s_txreq->ss = NULL;
@@ -176,6 +261,7 @@
 			bth2 = mask_psn(e->psn);
 			e->sent = 1;
 		}
+		trace_hfi1_tid_write_rsp_make_rc_ack(qp);
 		bth0 = qp->s_ack_state << 24;
 		break;
 
@@ -202,6 +288,84 @@
 		bth2 = mask_psn(qp->s_ack_rdma_psn++);
 		break;
 
+	case TID_OP(WRITE_RESP):
+write_resp:
+		/*
+		 * 1. Check if RVT_S_ACK_PENDING is set. If yes,
+		 *    goto normal.
+		 * 2. Attempt to allocate TID resources.
+		 * 3. Remove RVT_S_RESP_PENDING flags from s_flags
+		 * 4. If resources not available:
+		 *    4.1 Set RVT_S_WAIT_TID_SPACE
+		 *    4.2 Queue QP on RCD TID queue
+		 *    4.3 Put QP on iowait list.
+		 *    4.4 Build IB RNR NAK with appropriate timeout value
+		 *    4.5 Return indication progress made.
+		 * 5. If resources are available:
+		 *    5.1 Program HW flow CSRs
+		 *    5.2 Build TID RDMA WRITE RESP packet
+		 *    5.3 If more resources needed, do 2.1 - 2.3.
+		 *    5.4 Wake up next QP on RCD TID queue.
+		 *    5.5 Return indication progress made.
+		 */
+
+		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+		req = ack_to_tid_req(e);
+
+		/*
+		 * Send scheduled RNR NAK's. RNR NAK's need to be sent at
+		 * segment boundaries, not at request boundaries. Don't change
+		 * s_ack_state because we are still in the middle of a request
+		 */
+		if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND &&
+		    qp->s_tail_ack_queue == qpriv->r_tid_alloc &&
+		    req->cur_seg == req->alloc_seg) {
+			qpriv->rnr_nak_state = TID_RNR_NAK_SENT;
+			goto normal_no_state;
+		}
+
+		bth2 = mask_psn(qp->s_ack_rdma_psn);
+		hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1,
+							bth2, &len,
+							&ps->s_txreq->ss);
+		if (!hdrlen)
+			return 0;
+
+		hwords += hdrlen;
+		bth0 = qp->s_ack_state << 24;
+		qp->s_ack_rdma_psn++;
+		trace_hfi1_tid_req_make_rc_ack_write(qp, 0, e->opcode, e->psn,
+						     e->lpsn, req);
+		if (req->cur_seg != req->total_segs)
+			break;
+
+		e->sent = 1;
+		/* Do not free e->rdma_sge until all data are received */
+		qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
+		break;
+
+	case TID_OP(READ_RESP):
+read_resp:
+		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+		ps->s_txreq->ss = &qp->s_ack_rdma_sge;
+		delta = hfi1_build_tid_rdma_read_resp(qp, e, ohdr, &bth0,
+						      &bth1, &bth2, &len,
+						      &last_pkt);
+		if (delta == 0)
+			goto error_qp;
+		hwords += delta;
+		if (last_pkt) {
+			e->sent = 1;
+			/*
+			 * Increment qp->s_tail_ack_queue through s_ack_state
+			 * transition.
+			 */
+			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+		}
+		break;
+	case TID_OP(READ_REQ):
+		goto bail;
+
 	default:
 normal:
 		/*
@@ -211,8 +375,7 @@
 		 * (see above).
 		 */
 		qp->s_ack_state = OP(SEND_ONLY);
-		qp->s_flags &= ~RVT_S_ACK_PENDING;
-		ps->s_txreq->ss = NULL;
+normal_no_state:
 		if (qp->s_nak_state)
 			ohdr->u.aeth =
 				cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
@@ -224,14 +387,24 @@
 		len = 0;
 		bth0 = OP(ACKNOWLEDGE) << 24;
 		bth2 = mask_psn(qp->s_ack_psn);
+		qp->s_flags &= ~RVT_S_ACK_PENDING;
+		ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP;
+		ps->s_txreq->ss = NULL;
 	}
 	qp->s_rdma_ack_cnt++;
-	ps->s_txreq->sde = priv->s_sde;
+	ps->s_txreq->sde = qpriv->s_sde;
 	ps->s_txreq->s_cur_size = len;
 	ps->s_txreq->hdr_dwords = hwords;
-	hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
+	hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps);
 	return 1;
-
+error_qp:
+	spin_unlock_irqrestore(&qp->s_lock, ps->flags);
+	spin_lock_irqsave(&qp->r_lock, ps->flags);
+	spin_lock(&qp->s_lock);
+	rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+	spin_unlock(&qp->s_lock);
+	spin_unlock_irqrestore(&qp->r_lock, ps->flags);
+	spin_lock_irqsave(&qp->s_lock, ps->flags);
 bail:
 	qp->s_ack_state = OP(ACKNOWLEDGE);
 	/*
@@ -258,17 +431,23 @@
 	struct hfi1_qp_priv *priv = qp->priv;
 	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
 	struct ib_other_headers *ohdr;
-	struct rvt_sge_state *ss;
+	struct rvt_sge_state *ss = NULL;
 	struct rvt_swqe *wqe;
-	u32 hwords;
-	u32 len;
-	u32 bth0 = 0;
-	u32 bth2;
+	struct hfi1_swqe_priv *wpriv;
+	struct tid_rdma_request *req = NULL;
+	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
+	u32 hwords = 5;
+	u32 len = 0;
+	u32 bth0 = 0, bth2 = 0;
+	u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
 	u32 pmtu = qp->pmtu;
 	char newreq;
 	int middle = 0;
 	int delta;
+	struct tid_rdma_flow *flow = NULL;
+	struct tid_rdma_params *remote;
 
+	trace_hfi1_sender_make_rc_req(qp);
 	lockdep_assert_held(&qp->s_lock);
 	ps->s_txreq = get_txreq(ps->dev, qp);
 	if (!ps->s_txreq)
@@ -309,13 +488,13 @@
 		}
 		clear_ahg(qp);
 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-		hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
-			IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
+		hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+					 IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
 		/* will get called again */
 		goto done_free_tx;
 	}
 
-	if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
+	if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT))
 		goto bail;
 
 	if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
@@ -329,6 +508,7 @@
 
 	/* Send a request. */
 	wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+check_s_state:
 	switch (qp->s_state) {
 	default:
 		if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
@@ -350,9 +530,13 @@
 			/*
 			 * If a fence is requested, wait for previous
 			 * RDMA read and atomic operations to finish.
+			 * However, there is no need to guard against
+			 * TID RDMA READ after TID RDMA READ.
 			 */
 			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
-			    qp->s_num_rd_atomic) {
+			    qp->s_num_rd_atomic &&
+			    (wqe->wr.opcode != IB_WR_TID_RDMA_READ ||
+			     priv->pending_tid_r_segs < qp->s_num_rd_atomic)) {
 				qp->s_flags |= RVT_S_WAIT_FENCE;
 				goto bail;
 			}
@@ -378,9 +562,9 @@
 						wqe->wr.ex.invalidate_rkey);
 					local_ops = 1;
 				}
-				hfi1_send_complete(qp, wqe,
-						   err ? IB_WC_LOC_PROT_ERR
-						       : IB_WC_SUCCESS);
+				rvt_send_complete(qp, wqe,
+						  err ? IB_WC_LOC_PROT_ERR
+						      : IB_WC_SUCCESS);
 				if (local_ops)
 					atomic_dec(&qp->local_ops_pending);
 				goto done_free_tx;
@@ -397,16 +581,22 @@
 		len = wqe->length;
 		ss = &qp->s_sge;
 		bth2 = mask_psn(qp->s_psn);
+
+		/*
+		 * Interlock between various IB requests and TID RDMA
+		 * if necessary.
+		 */
+		if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) ||
+		    hfi1_tid_rdma_wqe_interlock(qp, wqe))
+			goto bail;
+
 		switch (wqe->wr.opcode) {
 		case IB_WR_SEND:
 		case IB_WR_SEND_WITH_IMM:
 		case IB_WR_SEND_WITH_INV:
 			/* If no credit, return. */
-			if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
-			    rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
-				qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
+			if (!rvt_rc_credit_avail(qp, wqe))
 				goto bail;
-			}
 			if (len > pmtu) {
 				qp->s_state = OP(SEND_FIRST);
 				len = pmtu;
@@ -439,11 +629,8 @@
 			goto no_flow_control;
 		case IB_WR_RDMA_WRITE_WITH_IMM:
 			/* If no credit, return. */
-			if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
-			    rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
-				qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
+			if (!rvt_rc_credit_avail(qp, wqe))
 				goto bail;
-			}
 no_flow_control:
 			put_ib_reth_vaddr(
 				wqe->rdma_wr.remote_addr,
@@ -473,21 +660,126 @@
 				qp->s_cur = 0;
 			break;
 
+		case IB_WR_TID_RDMA_WRITE:
+			if (newreq) {
+				/*
+				 * Limit the number of TID RDMA WRITE requests.
+				 */
+				if (atomic_read(&priv->n_tid_requests) >=
+				    HFI1_TID_RDMA_WRITE_CNT)
+					goto bail;
+
+				if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+					qp->s_lsn++;
+			}
+
+			hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr,
+								&bth1, &bth2,
+								&len);
+			ss = NULL;
+			if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) {
+				priv->s_tid_cur = qp->s_cur;
+				if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) {
+					priv->s_tid_tail = qp->s_cur;
+					priv->s_state = TID_OP(WRITE_RESP);
+				}
+			} else if (priv->s_tid_cur == priv->s_tid_head) {
+				struct rvt_swqe *__w;
+				struct tid_rdma_request *__r;
+
+				__w = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
+				__r = wqe_to_tid_req(__w);
+
+				/*
+				 * The s_tid_cur pointer is advanced to s_cur if
+				 * any of the following conditions about the WQE
+				 * to which s_tid_cur currently points are
+				 * satisfied:
+				 *   1. The request is not a TID RDMA WRITE
+				 *      request,
+				 *   2. The request is in the INACTIVE or
+				 *      COMPLETE states (TID RDMA READ requests
+				 *      stay at INACTIVE and TID RDMA WRITE
+				 *      transition to COMPLETE when done),
+				 *   3. The request is in the ACTIVE or SYNC
+				 *      state and the number of completed
+				 *      segments is equal to the total segment
+				 *      count.
+				 *      (If ACTIVE, the request is waiting for
+				 *       ACKs. If SYNC, the request has not
+				 *       received any responses because it's
+				 *       waiting on a sync point.)
+				 */
+				if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE ||
+				    __r->state == TID_REQUEST_INACTIVE ||
+				    __r->state == TID_REQUEST_COMPLETE ||
+				    ((__r->state == TID_REQUEST_ACTIVE ||
+				      __r->state == TID_REQUEST_SYNC) &&
+				     __r->comp_seg == __r->total_segs)) {
+					if (priv->s_tid_tail ==
+					    priv->s_tid_cur &&
+					    priv->s_state ==
+					    TID_OP(WRITE_DATA_LAST)) {
+						priv->s_tid_tail = qp->s_cur;
+						priv->s_state =
+							TID_OP(WRITE_RESP);
+					}
+					priv->s_tid_cur = qp->s_cur;
+				}
+				/*
+				 * A corner case: when the last TID RDMA WRITE
+				 * request was completed, s_tid_head,
+				 * s_tid_cur, and s_tid_tail all point to the
+				 * same location. Other requests are posted and
+				 * s_cur wraps around to the same location,
+				 * where a new TID RDMA WRITE is posted. In
+				 * this case, none of the indices need to be
+				 * updated. However, the priv->s_state should.
+				 */
+				if (priv->s_tid_tail == qp->s_cur &&
+				    priv->s_state == TID_OP(WRITE_DATA_LAST))
+					priv->s_state = TID_OP(WRITE_RESP);
+			}
+			req = wqe_to_tid_req(wqe);
+			if (newreq) {
+				priv->s_tid_head = qp->s_cur;
+				priv->pending_tid_w_resp += req->total_segs;
+				atomic_inc(&priv->n_tid_requests);
+				atomic_dec(&priv->n_requests);
+			} else {
+				req->state = TID_REQUEST_RESEND;
+				req->comp_seg = delta_psn(bth2, wqe->psn);
+				/*
+				 * Pull back any segments since we are going
+				 * to re-receive them.
+				 */
+				req->setup_head = req->clear_tail;
+				priv->pending_tid_w_resp +=
+					delta_psn(wqe->lpsn, bth2) + 1;
+			}
+
+			trace_hfi1_tid_write_sender_make_req(qp, newreq);
+			trace_hfi1_tid_req_make_req_write(qp, newreq,
+							  wqe->wr.opcode,
+							  wqe->psn, wqe->lpsn,
+							  req);
+			if (++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
 		case IB_WR_RDMA_READ:
 			/*
 			 * Don't allow more operations to be started
 			 * than the QP limits allow.
 			 */
-			if (newreq) {
-				if (qp->s_num_rd_atomic >=
-				    qp->s_max_rd_atomic) {
-					qp->s_flags |= RVT_S_WAIT_RDMAR;
-					goto bail;
-				}
-				qp->s_num_rd_atomic++;
-				if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
-					qp->s_lsn++;
+			if (qp->s_num_rd_atomic >=
+			    qp->s_max_rd_atomic) {
+				qp->s_flags |= RVT_S_WAIT_RDMAR;
+				goto bail;
 			}
+			qp->s_num_rd_atomic++;
+			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+				qp->s_lsn++;
 			put_ib_reth_vaddr(
 				wqe->rdma_wr.remote_addr,
 				&ohdr->u.rc.reth);
@@ -503,23 +795,99 @@
 				qp->s_cur = 0;
 			break;
 
+		case IB_WR_TID_RDMA_READ:
+			trace_hfi1_tid_read_sender_make_req(qp, newreq);
+			wpriv = wqe->priv;
+			req = wqe_to_tid_req(wqe);
+			trace_hfi1_tid_req_make_req_read(qp, newreq,
+							 wqe->wr.opcode,
+							 wqe->psn, wqe->lpsn,
+							 req);
+			delta = cmp_psn(qp->s_psn, wqe->psn);
+
+			/*
+			 * Don't allow more operations to be started
+			 * than the QP limits allow. We could get here under
+			 * three conditions; (1) It's a new request; (2) We are
+			 * sending the second or later segment of a request,
+			 * but the qp->s_state is set to OP(RDMA_READ_REQUEST)
+			 * when the last segment of a previous request is
+			 * received just before this; (3) We are re-sending a
+			 * request.
+			 */
+			if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
+				qp->s_flags |= RVT_S_WAIT_RDMAR;
+				goto bail;
+			}
+			if (newreq) {
+				struct tid_rdma_flow *flow =
+					&req->flows[req->setup_head];
+
+				/*
+				 * Set up s_sge as it is needed for TID
+				 * allocation. However, if the pages have been
+				 * walked and mapped, skip it. An earlier try
+				 * has failed to allocate the TID entries.
+				 */
+				if (!flow->npagesets) {
+					qp->s_sge.sge = wqe->sg_list[0];
+					qp->s_sge.sg_list = wqe->sg_list + 1;
+					qp->s_sge.num_sge = wqe->wr.num_sge;
+					qp->s_sge.total_len = wqe->length;
+					qp->s_len = wqe->length;
+					req->isge = 0;
+					req->clear_tail = req->setup_head;
+					req->flow_idx = req->setup_head;
+					req->state = TID_REQUEST_ACTIVE;
+				}
+			} else if (delta == 0) {
+				/* Re-send a request */
+				req->cur_seg = 0;
+				req->comp_seg = 0;
+				req->ack_pending = 0;
+				req->flow_idx = req->clear_tail;
+				req->state = TID_REQUEST_RESEND;
+			}
+			req->s_next_psn = qp->s_psn;
+			/* Read one segment at a time */
+			len = min_t(u32, req->seg_len,
+				    wqe->length - req->seg_len * req->cur_seg);
+			delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr,
+							     &bth1, &bth2,
+							     &len);
+			if (delta <= 0) {
+				/* Wait for TID space */
+				goto bail;
+			}
+			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+				qp->s_lsn++;
+			hwords += delta;
+			ss = &wpriv->ss;
+			/* Check if this is the last segment */
+			if (req->cur_seg >= req->total_segs &&
+			    ++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
 		case IB_WR_ATOMIC_CMP_AND_SWP:
 		case IB_WR_ATOMIC_FETCH_AND_ADD:
 			/*
 			 * Don't allow more operations to be started
 			 * than the QP limits allow.
 			 */
-			if (newreq) {
-				if (qp->s_num_rd_atomic >=
-				    qp->s_max_rd_atomic) {
-					qp->s_flags |= RVT_S_WAIT_RDMAR;
-					goto bail;
-				}
-				qp->s_num_rd_atomic++;
-				if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
-					qp->s_lsn++;
+			if (qp->s_num_rd_atomic >=
+			    qp->s_max_rd_atomic) {
+				qp->s_flags |= RVT_S_WAIT_RDMAR;
+				goto bail;
 			}
-			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+			qp->s_num_rd_atomic++;
+
+			/* FALLTHROUGH */
+		case IB_WR_OPFN:
+			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+				qp->s_lsn++;
+			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+			    wqe->wr.opcode == IB_WR_OPFN) {
 				qp->s_state = OP(COMPARE_SWAP);
 				put_ib_ateth_swap(wqe->atomic_wr.swap,
 						  &ohdr->u.atomic_eth);
@@ -546,18 +914,23 @@
 		default:
 			goto bail;
 		}
-		qp->s_sge.sge = wqe->sg_list[0];
-		qp->s_sge.sg_list = wqe->sg_list + 1;
-		qp->s_sge.num_sge = wqe->wr.num_sge;
-		qp->s_sge.total_len = wqe->length;
-		qp->s_len = wqe->length;
+		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) {
+			qp->s_sge.sge = wqe->sg_list[0];
+			qp->s_sge.sg_list = wqe->sg_list + 1;
+			qp->s_sge.num_sge = wqe->wr.num_sge;
+			qp->s_sge.total_len = wqe->length;
+			qp->s_len = wqe->length;
+		}
 		if (newreq) {
 			qp->s_tail++;
 			if (qp->s_tail >= qp->s_size)
 				qp->s_tail = 0;
 		}
-		if (wqe->wr.opcode == IB_WR_RDMA_READ)
+		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+		    wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
 			qp->s_psn = wqe->lpsn + 1;
+		else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+			qp->s_psn = req->s_next_psn;
 		else
 			qp->s_psn++;
 		break;
@@ -674,10 +1047,137 @@
 		if (qp->s_cur == qp->s_size)
 			qp->s_cur = 0;
 		break;
+
+	case TID_OP(WRITE_RESP):
+		/*
+		 * This value for s_state is used for restarting a TID RDMA
+		 * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE)
+		 * for more.
+		 */
+		req = wqe_to_tid_req(wqe);
+		req->state = TID_REQUEST_RESEND;
+		rcu_read_lock();
+		remote = rcu_dereference(priv->tid_rdma.remote);
+		req->comp_seg = delta_psn(qp->s_psn, wqe->psn);
+		len = wqe->length - (req->comp_seg * remote->max_len);
+		rcu_read_unlock();
+
+		bth2 = mask_psn(qp->s_psn);
+		hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1,
+							&bth2, &len);
+		qp->s_psn = wqe->lpsn + 1;
+		ss = NULL;
+		qp->s_state = TID_OP(WRITE_REQ);
+		priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1;
+		priv->s_tid_cur = qp->s_cur;
+		if (++qp->s_cur == qp->s_size)
+			qp->s_cur = 0;
+		trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode,
+						  wqe->psn, wqe->lpsn, req);
+		break;
+
+	case TID_OP(READ_RESP):
+		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
+			goto bail;
+		/* This is used to restart a TID read request */
+		req = wqe_to_tid_req(wqe);
+		wpriv = wqe->priv;
+		/*
+		 * Back down. The field qp->s_psn has been set to the psn with
+		 * which the request should be restart. It's OK to use division
+		 * as this is on the retry path.
+		 */
+		req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps;
+
+		/*
+		 * The following function needs to be redefined to return the
+		 * status to make sure that we find the flow. At the same
+		 * time, we can use the req->state change to check if the
+		 * call succeeds or not.
+		 */
+		req->state = TID_REQUEST_RESEND;
+		hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
+		if (req->state != TID_REQUEST_ACTIVE) {
+			/*
+			 * Failed to find the flow. Release all allocated tid
+			 * resources.
+			 */
+			hfi1_kern_exp_rcv_clear_all(req);
+			hfi1_kern_clear_hw_flow(priv->rcd, qp);
+
+			hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR);
+			goto bail;
+		}
+		req->state = TID_REQUEST_RESEND;
+		len = min_t(u32, req->seg_len,
+			    wqe->length - req->seg_len * req->cur_seg);
+		flow = &req->flows[req->flow_idx];
+		len -= flow->sent;
+		req->s_next_psn = flow->flow_state.ib_lpsn + 1;
+		delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1,
+							&bth2, &len);
+		if (delta <= 0) {
+			/* Wait for TID space */
+			goto bail;
+		}
+		hwords += delta;
+		ss = &wpriv->ss;
+		/* Check if this is the last segment */
+		if (req->cur_seg >= req->total_segs &&
+		    ++qp->s_cur == qp->s_size)
+			qp->s_cur = 0;
+		qp->s_psn = req->s_next_psn;
+		trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
+						 wqe->psn, wqe->lpsn, req);
+		break;
+	case TID_OP(READ_REQ):
+		req = wqe_to_tid_req(wqe);
+		delta = cmp_psn(qp->s_psn, wqe->psn);
+		/*
+		 * If the current WR is not TID RDMA READ, or this is the start
+		 * of a new request, we need to change the qp->s_state so that
+		 * the request can be set up properly.
+		 */
+		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 ||
+		    qp->s_cur == qp->s_tail) {
+			qp->s_state = OP(RDMA_READ_REQUEST);
+			if (delta == 0 || qp->s_cur == qp->s_tail)
+				goto check_s_state;
+			else
+				goto bail;
+		}
+
+		/* Rate limiting */
+		if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
+			qp->s_flags |= RVT_S_WAIT_RDMAR;
+			goto bail;
+		}
+
+		wpriv = wqe->priv;
+		/* Read one segment at a time */
+		len = min_t(u32, req->seg_len,
+			    wqe->length - req->seg_len * req->cur_seg);
+		delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1,
+						     &bth2, &len);
+		if (delta <= 0) {
+			/* Wait for TID space */
+			goto bail;
+		}
+		hwords += delta;
+		ss = &wpriv->ss;
+		/* Check if this is the last segment */
+		if (req->cur_seg >= req->total_segs &&
+		    ++qp->s_cur == qp->s_size)
+			qp->s_cur = 0;
+		qp->s_psn = req->s_next_psn;
+		trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
+						 wqe->psn, wqe->lpsn, req);
+		break;
 	}
 	qp->s_sending_hpsn = bth2;
 	delta = delta_psn(bth2, wqe->psn);
-	if (delta && delta % HFI1_PSN_CREDIT == 0)
+	if (delta && delta % HFI1_PSN_CREDIT == 0 &&
+	    wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
 		bth2 |= IB_BTH_REQ_ACK;
 	if (qp->s_flags & RVT_S_SEND_ONE) {
 		qp->s_flags &= ~RVT_S_SEND_ONE;
@@ -693,6 +1193,7 @@
 		qp,
 		ohdr,
 		bth0 | (qp->s_state << 24),
+		bth1,
 		bth2,
 		middle,
 		ps);
@@ -709,6 +1210,12 @@
 bail_no_tx:
 	ps->s_txreq = NULL;
 	qp->s_flags &= ~RVT_S_BUSY;
+	/*
+	 * If we didn't get a txreq, the QP will be woken up later to try
+	 * again. Set the flags to indicate which work item to wake
+	 * up.
+	 */
+	iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
 	return 0;
 }
 
@@ -796,6 +1303,11 @@
 	if (qp->s_mig_state == IB_MIG_MIGRATED)
 		bth0 |= IB_BTH_MIG_REQ;
 	bth1 = (!!is_fecn) << IB_BECN_SHIFT;
+	/*
+	 * Inline ACKs go out without the use of the Verbs send engine, so
+	 * we need to set the STL Verbs Extended bit here
+	 */
+	bth1 |= HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT;
 	hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
 }
 
@@ -914,7 +1426,7 @@
 	pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps,
 			 sc_to_vlt(ppd->dd, sc5), plen);
 	pbuf = sc_buffer_alloc(rcd->sc, plen, NULL, NULL);
-	if (!pbuf) {
+	if (IS_ERR_OR_NULL(pbuf)) {
 		/*
 		 * We have no room to send at the moment.  Pass
 		 * responsibility for sending the ACK to the send engine
@@ -936,6 +1448,48 @@
 }
 
 /**
+ * update_num_rd_atomic - update the qp->s_num_rd_atomic
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ * @wqe: the wqe
+ *
+ * This is called from reset_psn() to update qp->s_num_rd_atomic
+ * for the current wqe.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void update_num_rd_atomic(struct rvt_qp *qp, u32 psn,
+				 struct rvt_swqe *wqe)
+{
+	u32 opcode = wqe->wr.opcode;
+
+	if (opcode == IB_WR_RDMA_READ ||
+	    opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+	    opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+		qp->s_num_rd_atomic++;
+	} else if (opcode == IB_WR_TID_RDMA_READ) {
+		struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+		struct hfi1_qp_priv *priv = qp->priv;
+
+		if (cmp_psn(psn, wqe->lpsn) <= 0) {
+			u32 cur_seg;
+
+			cur_seg = (psn - wqe->psn) / priv->pkts_ps;
+			req->ack_pending = cur_seg - req->comp_seg;
+			priv->pending_tid_r_segs += req->ack_pending;
+			qp->s_num_rd_atomic += req->ack_pending;
+			trace_hfi1_tid_req_update_num_rd_atomic(qp, 0,
+								wqe->wr.opcode,
+								wqe->psn,
+								wqe->lpsn,
+								req);
+		} else {
+			priv->pending_tid_r_segs += req->total_segs;
+			qp->s_num_rd_atomic += req->total_segs;
+		}
+	}
+}
+
+/**
  * reset_psn - reset the QP state to send starting from PSN
  * @qp: the QP
  * @psn: the packet sequence number to restart at
@@ -949,9 +1503,13 @@
 	u32 n = qp->s_acked;
 	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
 	u32 opcode;
+	struct hfi1_qp_priv *priv = qp->priv;
 
 	lockdep_assert_held(&qp->s_lock);
 	qp->s_cur = n;
+	priv->pending_tid_r_segs = 0;
+	priv->pending_tid_w_resp = 0;
+	qp->s_num_rd_atomic = 0;
 
 	/*
 	 * If we are starting the request from the beginning,
@@ -961,9 +1519,9 @@
 		qp->s_state = OP(SEND_LAST);
 		goto done;
 	}
+	update_num_rd_atomic(qp, psn, wqe);
 
 	/* Find the work request opcode corresponding to the given PSN. */
-	opcode = wqe->wr.opcode;
 	for (;;) {
 		int diff;
 
@@ -973,8 +1531,11 @@
 			break;
 		wqe = rvt_get_swqe_ptr(qp, n);
 		diff = cmp_psn(psn, wqe->psn);
-		if (diff < 0)
+		if (diff < 0) {
+			/* Point wqe back to the previous one */
+			wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
 			break;
+		}
 		qp->s_cur = n;
 		/*
 		 * If we are starting the request from the beginning,
@@ -984,8 +1545,10 @@
 			qp->s_state = OP(SEND_LAST);
 			goto done;
 		}
-		opcode = wqe->wr.opcode;
+
+		update_num_rd_atomic(qp, psn, wqe);
 	}
+	opcode = wqe->wr.opcode;
 
 	/*
 	 * Set the state to restart in the middle of a request.
@@ -1003,10 +1566,18 @@
 		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
 		break;
 
+	case IB_WR_TID_RDMA_WRITE:
+		qp->s_state = TID_OP(WRITE_RESP);
+		break;
+
 	case IB_WR_RDMA_READ:
 		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
 		break;
 
+	case IB_WR_TID_RDMA_READ:
+		qp->s_state = TID_OP(READ_RESP);
+		break;
+
 	default:
 		/*
 		 * This case shouldn't happen since its only
@@ -1015,6 +1586,7 @@
 		qp->s_state = OP(SEND_LAST);
 	}
 done:
+	priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
 	qp->s_psn = psn;
 	/*
 	 * Set RVT_S_WAIT_PSN as rc_complete() may start the timer
@@ -1025,6 +1597,7 @@
 	    (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
 		qp->s_flags |= RVT_S_WAIT_PSN;
 	qp->s_flags &= ~HFI1_S_AHG_VALID;
+	trace_hfi1_sender_reset_psn(qp);
 }
 
 /*
@@ -1033,18 +1606,47 @@
  */
 void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
 {
+	struct hfi1_qp_priv *priv = qp->priv;
 	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
 	struct hfi1_ibport *ibp;
 
 	lockdep_assert_held(&qp->r_lock);
 	lockdep_assert_held(&qp->s_lock);
+	trace_hfi1_sender_restart_rc(qp);
 	if (qp->s_retry == 0) {
 		if (qp->s_mig_state == IB_MIG_ARMED) {
 			hfi1_migrate_qp(qp);
 			qp->s_retry = qp->s_retry_cnt;
 		} else if (qp->s_last == qp->s_acked) {
-			hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
-			rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+			/*
+			 * We need special handling for the OPFN request WQEs as
+			 * they are not allowed to generate real user errors
+			 */
+			if (wqe->wr.opcode == IB_WR_OPFN) {
+				struct hfi1_ibport *ibp =
+					to_iport(qp->ibqp.device, qp->port_num);
+				/*
+				 * Call opfn_conn_reply() with capcode and
+				 * remaining data as 0 to close out the
+				 * current request
+				 */
+				opfn_conn_reply(qp, priv->opfn.curr);
+				wqe = do_rc_completion(qp, wqe, ibp);
+				qp->s_flags &= ~RVT_S_WAIT_ACK;
+			} else {
+				trace_hfi1_tid_write_sender_restart_rc(qp, 0);
+				if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+					struct tid_rdma_request *req;
+
+					req = wqe_to_tid_req(wqe);
+					hfi1_kern_exp_rcv_clear_all(req);
+					hfi1_kern_clear_hw_flow(priv->rcd, qp);
+				}
+
+				hfi1_trdma_send_complete(qp, wqe,
+							 IB_WC_RETRY_EXC_ERR);
+				rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+			}
 			return;
 		} else { /* need to handle delayed completion */
 			return;
@@ -1054,14 +1656,15 @@
 	}
 
 	ibp = to_iport(qp->ibqp.device, qp->port_num);
-	if (wqe->wr.opcode == IB_WR_RDMA_READ)
+	if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+	    wqe->wr.opcode == IB_WR_TID_RDMA_READ)
 		ibp->rvp.n_rc_resends++;
 	else
 		ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
 
 	qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
 			 RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
-			 RVT_S_WAIT_ACK);
+			 RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP);
 	if (wait)
 		qp->s_flags |= RVT_S_SEND_ONE;
 	reset_psn(qp, psn);
@@ -1069,7 +1672,8 @@
 
 /*
  * Set qp->s_sending_psn to the next PSN after the given one.
- * This would be psn+1 except when RDMA reads are present.
+ * This would be psn+1 except when RDMA reads or TID RDMA ops
+ * are present.
  */
 static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
 {
@@ -1081,7 +1685,9 @@
 	for (;;) {
 		wqe = rvt_get_swqe_ptr(qp, n);
 		if (cmp_psn(psn, wqe->lpsn) <= 0) {
-			if (wqe->wr.opcode == IB_WR_RDMA_READ)
+			if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+			    wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+			    wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
 				qp->s_sending_psn = wqe->lpsn + 1;
 			else
 				qp->s_sending_psn = psn + 1;
@@ -1094,6 +1700,36 @@
 	}
 }
 
+/**
+ * hfi1_rc_verbs_aborted - handle abort status
+ * @qp: the QP
+ * @opah: the opa header
+ *
+ * This code modifies both the ACK bit in BTH[2]
+ * and the s_flags so that the QP goes into send-one mode.
+ *
+ * This serves to throttle the send engine to only
+ * send a single packet in the likely case that a
+ * link has gone down.
+ */
+void hfi1_rc_verbs_aborted(struct rvt_qp *qp, struct hfi1_opa_header *opah)
+{
+	struct ib_other_headers *ohdr = hfi1_get_rc_ohdr(opah);
+	u8 opcode = ib_bth_get_opcode(ohdr);
+	u32 psn;
+
+	/* ignore responses */
+	if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+	     opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
+	    opcode == TID_OP(READ_RESP) ||
+	    opcode == TID_OP(WRITE_RESP))
+		return;
+
+	psn = ib_bth_get_psn(ohdr) | IB_BTH_REQ_ACK;
+	ohdr->bth[2] = cpu_to_be32(psn);
+	qp->s_flags |= RVT_S_SEND_ONE;
+}
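
The fixup above amounts to OR-ing the ack-request bit into the PSN field of
BTH[2] and converting to wire byte order. A standalone sketch of that bit
manipulation; placing the bit at position 31 matches how IB_BTH_REQ_ACK is
OR-ed into the PSN here, but treat the exact constant as an assumption of
the sketch.

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>  /* htonl()/ntohl() stand in for cpu_to_be32() */

    #define BTH_REQ_ACK 0x80000000u  /* assumed "A" (ack request) bit */

    int main(void)
    {
        uint32_t psn  = 0x00001234;                /* 24-bit PSN      */
        uint32_t bth2 = htonl(psn | BTH_REQ_ACK);  /* wire byte order */

        printf("bth[2] = 0x%08x\n", (unsigned)ntohl(bth2));
        return 0;
    }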
+
 /*
  * This should be called with the QP s_lock held and interrupts disabled.
  */
@@ -1102,70 +1738,104 @@
 	struct ib_other_headers *ohdr;
 	struct hfi1_qp_priv *priv = qp->priv;
 	struct rvt_swqe *wqe;
-	struct ib_header *hdr = NULL;
-	struct hfi1_16b_header *hdr_16b = NULL;
-	u32 opcode;
+	u32 opcode, head, tail;
 	u32 psn;
+	struct tid_rdma_request *req;
 
 	lockdep_assert_held(&qp->s_lock);
 	if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
 		return;
 
-	/* Find out where the BTH is */
-	if (priv->hdr_type == HFI1_PKT_TYPE_9B) {
-		hdr = &opah->ibh;
-		if (ib_get_lnh(hdr) == HFI1_LRH_BTH)
-			ohdr = &hdr->u.oth;
-		else
-			ohdr = &hdr->u.l.oth;
-	} else {
-		u8 l4;
-
-		hdr_16b = &opah->opah;
-		l4  = hfi1_16B_get_l4(hdr_16b);
-		if (l4 == OPA_16B_L4_IB_LOCAL)
-			ohdr = &hdr_16b->u.oth;
-		else
-			ohdr = &hdr_16b->u.l.oth;
-	}
-
+	ohdr = hfi1_get_rc_ohdr(opah);
 	opcode = ib_bth_get_opcode(ohdr);
-	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
-	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+	if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+	     opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
+	    opcode == TID_OP(READ_RESP) ||
+	    opcode == TID_OP(WRITE_RESP)) {
 		WARN_ON(!qp->s_rdma_ack_cnt);
 		qp->s_rdma_ack_cnt--;
 		return;
 	}
 
 	psn = ib_bth_get_psn(ohdr);
-	reset_sending_psn(qp, psn);
+	/*
+	 * Don't attempt to reset the sending PSN for packets in the
+	 * KDETH PSN space since the PSN does not match anything.
+	 */
+	if (opcode != TID_OP(WRITE_DATA) &&
+	    opcode != TID_OP(WRITE_DATA_LAST) &&
+	    opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC))
+		reset_sending_psn(qp, psn);
+
+	/* Handle TID RDMA WRITE packets differently */
+	if (opcode >= TID_OP(WRITE_REQ) &&
+	    opcode <= TID_OP(WRITE_DATA_LAST)) {
+		head = priv->s_tid_head;
+		tail = priv->s_tid_cur;
+		/*
+		 * s_tid_cur is set to s_tid_head in the case where
+		 * a new TID RDMA request is being started and all
+		 * previous ones have been completed.
+		 * Therefore, we need to do a secondary check in order
+		 * to properly determine whether we should start the
+		 * RC timer.
+		 */
+		wqe = rvt_get_swqe_ptr(qp, tail);
+		req = wqe_to_tid_req(wqe);
+		if (head == tail && req->comp_seg < req->total_segs) {
+			if (tail == 0)
+				tail = qp->s_size - 1;
+			else
+				tail -= 1;
+		}
+	} else {
+		head = qp->s_tail;
+		tail = qp->s_acked;
+	}
 
 	/*
 	 * Start timer after a packet requesting an ACK has been sent and
 	 * there are still requests that haven't been acked.
 	 */
-	if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
+	if ((psn & IB_BTH_REQ_ACK) && tail != head &&
+	    opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) &&
+	    opcode != TID_OP(RESYNC) &&
 	    !(qp->s_flags &
-		(RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
-		(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
-		rvt_add_retry_timer(qp);
+	      (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
+	    (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+		if (opcode == TID_OP(READ_REQ))
+			rvt_add_retry_timer_ext(qp, priv->timeout_shift);
+		else
+			rvt_add_retry_timer(qp);
+	}
+
+	/* Start TID RDMA ACK timer */
+	if ((opcode == TID_OP(WRITE_DATA) ||
+	     opcode == TID_OP(WRITE_DATA_LAST) ||
+	     opcode == TID_OP(RESYNC)) &&
+	    (psn & IB_BTH_REQ_ACK) &&
+	    !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) &&
+	    (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+		/*
+		 * The TID RDMA ACK packet could be received before this
+		 * function is called. Therefore, add the timer only if TID
+		 * RDMA ACK packets are actually pending.
+		 */
+		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+		req = wqe_to_tid_req(wqe);
+		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+		    req->ack_seg < req->cur_seg)
+			hfi1_add_tid_retry_timer(qp);
+	}
 
 	while (qp->s_last != qp->s_acked) {
-		u32 s_last;
-
 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
 		if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
 		    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
 			break;
-		s_last = qp->s_last;
-		trace_hfi1_qp_send_completion(qp, wqe, s_last);
-		if (++s_last >= qp->s_size)
-			s_last = 0;
-		qp->s_last = s_last;
-		/* see post_send() */
-		barrier();
-		rvt_put_swqe(wqe);
-		rvt_qp_swqe_complete(qp,
+		trdma_clean_swqe(qp, wqe);
+		trace_hfi1_qp_send_completion(qp, wqe, qp->s_last);
+		rvt_qp_complete_swqe(qp,
 				     wqe,
 				     ib_hfi1_wc_opcode[wqe->wr.opcode],
 				     IB_WC_SUCCESS);
@@ -1194,29 +1864,24 @@
  * This is similar to hfi1_send_complete but has to check to be sure
  * that the SGEs are not being referenced if the SWQE is being resent.
  */
-static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
-					 struct rvt_swqe *wqe,
-					 struct hfi1_ibport *ibp)
+struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
+				  struct rvt_swqe *wqe,
+				  struct hfi1_ibport *ibp)
 {
+	struct hfi1_qp_priv *priv = qp->priv;
+
 	lockdep_assert_held(&qp->s_lock);
 	/*
 	 * Don't decrement refcount and don't generate a
 	 * completion if the SWQE is being resent until the send
 	 * is finished.
 	 */
+	trace_hfi1_rc_completion(qp, wqe->lpsn);
 	if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
 	    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
-		u32 s_last;
-
-		rvt_put_swqe(wqe);
-		s_last = qp->s_last;
-		trace_hfi1_qp_send_completion(qp, wqe, s_last);
-		if (++s_last >= qp->s_size)
-			s_last = 0;
-		qp->s_last = s_last;
-		/* see post_send() */
-		barrier();
-		rvt_qp_swqe_complete(qp,
+		trdma_clean_swqe(qp, wqe);
+		trace_hfi1_qp_send_completion(qp, wqe, qp->s_last);
+		rvt_qp_complete_swqe(qp,
 				     wqe,
 				     ib_hfi1_wc_opcode[wqe->wr.opcode],
 				     IB_WC_SUCCESS);
@@ -1241,7 +1906,16 @@
 	}
 
 	qp->s_retry = qp->s_retry_cnt;
-	update_last_psn(qp, wqe->lpsn);
+	/*
+	 * Don't update the last PSN if the request being completed is
+	 * a TID RDMA WRITE request.
+	 * Completion of the TID RDMA WRITE requests is done by the
+	 * TID RDMA ACKs and as such could be for a request that has
+	 * already been ACKed as far as the IB state machine is
+	 * concerned.
+	 */
+	if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
+		update_last_psn(qp, wqe->lpsn);
 
 	/*
 	 * If we are completing a request which is in the process of
@@ -1264,9 +1938,61 @@
 			qp->s_draining = 0;
 		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
 	}
+	if (priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) {
+		priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
+		hfi1_schedule_send(qp);
+	}
 	return wqe;
 }
 
+static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd)
+{
+	/* Retry this request. */
+	if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
+		qp->r_flags |= RVT_R_RDMAR_SEQ;
+		hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
+		if (list_empty(&qp->rspwait)) {
+			qp->r_flags |= RVT_R_RSP_SEND;
+			rvt_get_qp(qp);
+			list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+		}
+	}
+}
+
+/**
+ * update_qp_retry_state - Update qp retry state.
+ * @qp: the QP
+ * @psn: the packet sequence number of the TID RDMA WRITE RESP.
+ * @spsn:  The start psn for the given TID RDMA WRITE swqe.
+ * @lpsn:  The last psn for the given TID RDMA WRITE swqe.
+ *
+ * This function is called to update the qp retry state upon
+ * receiving a TID WRITE RESP after the qp is scheduled to retry
+ * a request.
+ */
+static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn,
+				  u32 lpsn)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+
+	qp->s_psn = psn + 1;
+	/*
+	 * If this is the first TID RDMA WRITE RESP packet for the current
+	 * request, change the s_state so that the retry will be processed
+	 * correctly. Similarly, if this is the last TID RDMA WRITE RESP
+	 * packet, change the s_state and advance the s_cur.
+	 */
+	if (cmp_psn(psn, lpsn) >= 0) {
+		qp->s_cur = qpriv->s_tid_cur + 1;
+		if (qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		qp->s_state = TID_OP(WRITE_REQ);
+	} else  if (!cmp_psn(psn, spsn)) {
+		qp->s_cur = qpriv->s_tid_cur;
+		qp->s_state = TID_OP(WRITE_RESP);
+	}
+}
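
The cmp_psn() and delta_psn() helpers used throughout this file compare PSNs
in a wrapping sequence space. A userspace sketch of one common way to build
such a comparison, sign-extending the masked difference; the 24-bit width is
an assumption of the sketch, not necessarily the driver's exact mask.

    #include <stdio.h>
    #include <stdint.h>

    /* <0 if a is "before" b, 0 if equal, >0 if "after", modulo 2^24.
     * The shift pair sign-extends the low 24 bits of the difference. */
    static int psn_cmp(uint32_t a, uint32_t b)
    {
        return ((int32_t)((a - b) << 8)) >> 8;
    }

    int main(void)
    {
        /* 0x000001 is "after" 0xFFFFFF once the 24-bit space wraps. */
        printf("%d\n", psn_cmp(0x000001, 0xFFFFFF) > 0);  /* 1 */
        printf("%d\n", psn_cmp(5, 9) < 0);                /* 1 */
        return 0;
    }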
+
 /**
  * do_rc_ack - process an incoming RC ACK
  * @qp: the QP the ACK came in on
@@ -1278,15 +2004,17 @@
  * May be called at interrupt level, with the QP s_lock held.
  * Returns 1 if OK, 0 if current operation should be aborted (NAK).
  */
-static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
-		     u64 val, struct hfi1_ctxtdata *rcd)
+int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
+	      u64 val, struct hfi1_ctxtdata *rcd)
 {
 	struct hfi1_ibport *ibp;
 	enum ib_wc_status status;
+	struct hfi1_qp_priv *qpriv = qp->priv;
 	struct rvt_swqe *wqe;
 	int ret = 0;
 	u32 ack_psn;
 	int diff;
+	struct rvt_dev_info *rdi;
 
 	lockdep_assert_held(&qp->s_lock);
 	/*
@@ -1329,20 +2057,14 @@
 		 */
 		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
 		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+		    (wqe->wr.opcode == IB_WR_TID_RDMA_READ &&
+		     (opcode != TID_OP(READ_RESP) || diff != 0)) ||
 		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
 		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
-		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
-			/* Retry this request. */
-			if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
-				qp->r_flags |= RVT_R_RDMAR_SEQ;
-				hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
-				if (list_empty(&qp->rspwait)) {
-					qp->r_flags |= RVT_R_RSP_SEND;
-					rvt_get_qp(qp);
-					list_add_tail(&qp->rspwait,
-						      &rcd->qp_wait_list);
-				}
-			}
+		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) ||
+		    (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+		     (delta_psn(psn, qp->s_last_psn) != 1))) {
+			set_restart_qp(qp, rcd);
 			/*
 			 * No need to process the ACK/NAK since we are
 			 * restarting an earlier request.
@@ -1354,6 +2076,9 @@
 			u64 *vaddr = wqe->sg_list[0].vaddr;
 			*vaddr = val;
 		}
+		if (wqe->wr.opcode == IB_WR_OPFN)
+			opfn_conn_reply(qp, val);
+
 		if (qp->s_num_rd_atomic &&
 		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
 		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
@@ -1371,26 +2096,85 @@
 				hfi1_schedule_send(qp);
 			}
 		}
+
+		/*
+		 * TID RDMA WRITE requests will be completed by the TID RDMA
+		 * ACK packet handler (see tid_rdma.c).
+		 */
+		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+			break;
+
 		wqe = do_rc_completion(qp, wqe, ibp);
 		if (qp->s_acked == qp->s_tail)
 			break;
 	}
 
+	trace_hfi1_rc_ack_do(qp, aeth, psn, wqe);
+	trace_hfi1_sender_do_rc_ack(qp);
 	switch (aeth >> IB_AETH_NAK_SHIFT) {
 	case 0:         /* ACK */
 		this_cpu_inc(*ibp->rvp.rc_acks);
-		if (qp->s_acked != qp->s_tail) {
+		if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+			if (wqe_to_tid_req(wqe)->ack_pending)
+				rvt_mod_retry_timer_ext(qp,
+							qpriv->timeout_shift);
+			else
+				rvt_stop_rc_timers(qp);
+		} else if (qp->s_acked != qp->s_tail) {
+			struct rvt_swqe *__w = NULL;
+
+			if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID)
+				__w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
+
 			/*
-			 * We are expecting more ACKs so
-			 * mod the retry timer.
+			 * Stop timers if we've received all of the TID RDMA
+			 * WRITE responses.
 			 */
-			rvt_mod_retry_timer(qp);
-			/*
-			 * We can stop re-sending the earlier packets and
-			 * continue with the next packet the receiver wants.
-			 */
-			if (cmp_psn(qp->s_psn, psn) <= 0)
-				reset_psn(qp, psn + 1);
+			if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+			    opcode == TID_OP(WRITE_RESP)) {
+				/*
+				 * Normally, the loop above would correctly
+				 * process all WQEs from s_acked onward and
+				 * either complete them or check for correct
+				 * PSN sequencing.
+				 * However, for TID RDMA, due to pipelining,
+				 * the response may not be for the request at
+				 * s_acked so the above loop would just be
+				 * skipped. This does not allow for checking
+				 * the PSN sequencing. It has to be done
+				 * separately.
+				 */
+				if (cmp_psn(psn, qp->s_last_psn + 1)) {
+					set_restart_qp(qp, rcd);
+					goto bail_stop;
+				}
+				/*
+				 * If the psn is being resent, stop the
+				 * resending.
+				 */
+				if (qp->s_cur != qp->s_tail &&
+				    cmp_psn(qp->s_psn, psn) <= 0)
+					update_qp_retry_state(qp, psn,
+							      __w->psn,
+							      __w->lpsn);
+				else if (--qpriv->pending_tid_w_resp)
+					rvt_mod_retry_timer(qp);
+				else
+					rvt_stop_rc_timers(qp);
+			} else {
+				/*
+				 * We are expecting more ACKs so
+				 * mod the retry timer.
+				 */
+				rvt_mod_retry_timer(qp);
+				/*
+				 * We can stop re-sending the earlier packets
+				 * and continue with the next packet the
+				 * receiver wants.
+				 */
+				if (cmp_psn(qp->s_psn, psn) <= 0)
+					reset_psn(qp, psn + 1);
+			}
 		} else {
 			/* No more acks - kill all timers */
 			rvt_stop_rc_timers(qp);
@@ -1406,6 +2190,15 @@
 		rvt_get_credit(qp, aeth);
 		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
 		qp->s_retry = qp->s_retry_cnt;
+		/*
+		 * If the current request is a TID RDMA WRITE request and the
+		 * response is not a TID RDMA WRITE RESP packet, s_last_psn
+		 * can't be advanced.
+		 */
+		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+		    opcode != TID_OP(WRITE_RESP) &&
+		    cmp_psn(psn, wqe->psn) >= 0)
+			return 1;
 		update_last_psn(qp, psn);
 		return 1;
 
@@ -1415,20 +2208,31 @@
 			goto bail_stop;
 		if (qp->s_flags & RVT_S_WAIT_RNR)
 			goto bail_stop;
-		if (qp->s_rnr_retry == 0) {
-			status = IB_WC_RNR_RETRY_EXC_ERR;
-			goto class_b;
+		rdi = ib_to_rvt(qp->ibqp.device);
+		if (!(rdi->post_parms[wqe->wr.opcode].flags &
+		       RVT_OPERATION_IGN_RNR_CNT)) {
+			if (qp->s_rnr_retry == 0) {
+				status = IB_WC_RNR_RETRY_EXC_ERR;
+				goto class_b;
+			}
+			if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0)
+				qp->s_rnr_retry--;
 		}
-		if (qp->s_rnr_retry_cnt < 7)
-			qp->s_rnr_retry--;
 
-		/* The last valid PSN is the previous PSN. */
-		update_last_psn(qp, psn - 1);
+		/*
+	 * The last valid PSN is the previous PSN. For a TID RDMA WRITE
+		 * request, s_last_psn should be incremented only when a TID
+		 * RDMA WRITE RESP is received to avoid skipping lost TID RDMA
+		 * WRITE RESP packets.
+		 */
+		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+			reset_psn(qp, qp->s_last_psn + 1);
+		} else {
+			update_last_psn(qp, psn - 1);
+			reset_psn(qp, psn);
+		}
 
 		ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
-
-		reset_psn(qp, psn);
-
 		qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
 		rvt_stop_rc_timers(qp);
 		rvt_add_rnr_timer(qp, aeth);
@@ -1468,7 +2272,10 @@
 			ibp->rvp.n_other_naks++;
 class_b:
 			if (qp->s_last == qp->s_acked) {
-				hfi1_send_complete(qp, wqe, status);
+				if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+					hfi1_kern_read_tid_flow_free(qp);
+
+				hfi1_trdma_send_complete(qp, wqe, status);
 				rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
 			}
 			break;
@@ -1509,6 +2316,8 @@
 
 	while (cmp_psn(psn, wqe->lpsn) > 0) {
 		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+		    wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+		    wqe->wr.opcode == IB_WR_TID_RDMA_WRITE ||
 		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
 		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
 			break;
@@ -1644,7 +2453,8 @@
 		qp->s_rdma_read_len -= pmtu;
 		update_last_psn(qp, psn);
 		spin_unlock_irqrestore(&qp->s_lock, flags);
-		hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, false, false);
+		rvt_copy_sge(qp, &qp->s_rdma_read_sge,
+			     data, pmtu, false, false);
 		goto bail;
 
 	case OP(RDMA_READ_RESPONSE_ONLY):
@@ -1684,7 +2494,8 @@
 		if (unlikely(tlen != qp->s_rdma_read_len))
 			goto ack_len_err;
 		aeth = be32_to_cpu(ohdr->u.aeth);
-		hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, false, false);
+		rvt_copy_sge(qp, &qp->s_rdma_read_sge,
+			     data, tlen, false, false);
 		WARN_ON(qp->s_rdma_read_sge.num_sge);
 		(void)do_rc_ack(qp, aeth, psn,
 				 OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
@@ -1704,7 +2515,7 @@
 	status = IB_WC_LOC_LEN_ERR;
 ack_err:
 	if (qp->s_last == qp->s_acked) {
-		hfi1_send_complete(qp, wqe, status);
+		rvt_send_complete(qp, wqe, status);
 		rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
 	}
 ack_done:
@@ -1713,16 +2524,6 @@
 	return;
 }
 
-static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
-				  struct rvt_qp *qp)
-{
-	if (list_empty(&qp->rspwait)) {
-		qp->r_flags |= RVT_R_RSP_NAK;
-		rvt_get_qp(qp);
-		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
-	}
-}
-
 static inline void rc_cancel_ack(struct rvt_qp *qp)
 {
 	qp->r_adefered = 0;
@@ -1755,8 +2556,9 @@
 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
 	struct rvt_ack_entry *e;
 	unsigned long flags;
-	u8 i, prev;
-	int old_req;
+	u8 prev;
+	u8 mra; /* most recent ACK */
+	bool old_req;
 
 	trace_hfi1_rcv_error(qp, psn);
 	if (diff > 0) {
@@ -1802,29 +2604,8 @@
 
 	spin_lock_irqsave(&qp->s_lock, flags);
 
-	for (i = qp->r_head_ack_queue; ; i = prev) {
-		if (i == qp->s_tail_ack_queue)
-			old_req = 0;
-		if (i)
-			prev = i - 1;
-		else
-			prev = HFI1_MAX_RDMA_ATOMIC;
-		if (prev == qp->r_head_ack_queue) {
-			e = NULL;
-			break;
-		}
-		e = &qp->s_ack_queue[prev];
-		if (!e->opcode) {
-			e = NULL;
-			break;
-		}
-		if (cmp_psn(psn, e->psn) >= 0) {
-			if (prev == qp->s_tail_ack_queue &&
-			    cmp_psn(psn, e->lpsn) <= 0)
-				old_req = 0;
-			break;
-		}
-	}
+	e = find_prev_entry(qp, psn, &prev, &mra, &old_req);
+
 	switch (opcode) {
 	case OP(RDMA_READ_REQUEST): {
 		struct ib_reth *reth;
@@ -1850,10 +2631,7 @@
 		len = be32_to_cpu(reth->length);
 		if (unlikely(offset + len != e->rdma_sge.sge_length))
 			goto unlock_done;
-		if (e->rdma_sge.mr) {
-			rvt_put_mr(e->rdma_sge.mr);
-			e->rdma_sge.mr = NULL;
-		}
+		release_rdma_sge_mr(e);
 		if (len != 0) {
 			u32 rkey = be32_to_cpu(reth->rkey);
 			u64 vaddr = get_ib_reth_vaddr(reth);
@@ -1871,6 +2649,8 @@
 		e->psn = psn;
 		if (old_req)
 			goto unlock_done;
+		if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+			qp->s_acked_ack_queue = prev;
 		qp->s_tail_ack_queue = prev;
 		break;
 	}
@@ -1884,6 +2664,8 @@
 		 */
 		if (!e || e->opcode != (u8)opcode || old_req)
 			goto unlock_done;
+		if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
+			qp->s_acked_ack_queue = prev;
 		qp->s_tail_ack_queue = prev;
 		break;
 	}
@@ -1899,7 +2681,7 @@
 		 * Resend the most recent ACK if this request is
 		 * after all the previous RDMA reads and atomics.
 		 */
-		if (i == qp->r_head_ack_queue) {
+		if (mra == qp->r_head_ack_queue) {
 			spin_unlock_irqrestore(&qp->s_lock, flags);
 			qp->r_nak_state = 0;
 			qp->r_ack_psn = qp->r_psn - 1;
@@ -1910,7 +2692,9 @@
 		 * Resend the RDMA read or atomic op which
 		 * ACKs this duplicate request.
 		 */
-		qp->s_tail_ack_queue = i;
+		if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
+			qp->s_acked_ack_queue = mra;
+		qp->s_tail_ack_queue = mra;
 		break;
 	}
 	qp->s_ack_state = OP(ACKNOWLEDGE);
@@ -1927,17 +2711,6 @@
 	return 0;
 }
 
-static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
-{
-	unsigned next;
-
-	next = n + 1;
-	if (next > HFI1_MAX_RDMA_ATOMIC)
-		next = 0;
-	qp->s_tail_ack_queue = next;
-	qp->s_ack_state = OP(ACKNOWLEDGE);
-}
-
 static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
 			  u32 lqpn, u32 rqpn, u8 svc_type)
 {
@@ -2035,6 +2808,7 @@
 	void *data = packet->payload;
 	u32 tlen = packet->tlen;
 	struct rvt_qp *qp = packet->qp;
+	struct hfi1_qp_priv *qpriv = qp->priv;
 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
 	struct ib_other_headers *ohdr = packet->ohdr;
 	u32 opcode = packet->opcode;
@@ -2047,8 +2821,7 @@
 	struct ib_reth *reth;
 	unsigned long flags;
 	int ret;
-	bool is_fecn = false;
-	bool copy_last = false;
+	bool copy_last = false, fecn;
 	u32 rkey;
 	u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);
 
@@ -2057,7 +2830,8 @@
 	if (hfi1_ruc_check_hdr(ibp, packet))
 		return;
 
-	is_fecn = process_ecn(qp, packet, false);
+	fecn = process_ecn(qp, packet);
+	opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1]));
 
 	/*
 	 * Process responses (ACKs) before anything else.  Note that the
@@ -2068,8 +2842,6 @@
 	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
 	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
 		rc_rcv_resp(packet);
-		if (is_fecn)
-			goto send_ack;
 		return;
 	}
 
@@ -2144,7 +2916,7 @@
 		qp->r_rcv_len += pmtu;
 		if (unlikely(qp->r_rcv_len > qp->r_len))
 			goto nack_inv;
-		hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false);
+		rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false);
 		break;
 
 	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
@@ -2200,7 +2972,7 @@
 		wc.byte_len = tlen + qp->r_rcv_len;
 		if (unlikely(wc.byte_len > qp->r_len))
 			goto nack_inv;
-		hfi1_copy_sge(&qp->r_sge, data, tlen, true, copy_last);
+		rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, copy_last);
 		rvt_put_ss(&qp->r_sge);
 		qp->r_msn++;
 		if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
@@ -2233,8 +3005,7 @@
 		wc.dlid_path_bits = 0;
 		wc.port_num = 0;
 		/* Signal completion event if the solicited bit is set. */
-		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-			     ib_bth_is_solicited(ohdr));
+		rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
 		break;
 
 	case OP(RDMA_WRITE_ONLY):
@@ -2291,20 +3062,17 @@
 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
 			goto nack_inv;
 		next = qp->r_head_ack_queue + 1;
-		/* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
-		if (next > HFI1_MAX_RDMA_ATOMIC)
+		/* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */
+		if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
 			next = 0;
 		spin_lock_irqsave(&qp->s_lock, flags);
-		if (unlikely(next == qp->s_tail_ack_queue)) {
+		if (unlikely(next == qp->s_acked_ack_queue)) {
 			if (!qp->s_ack_queue[next].sent)
 				goto nack_inv_unlck;
 			update_ack_queue(qp, next);
 		}
 		e = &qp->s_ack_queue[qp->r_head_ack_queue];
-		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
-			rvt_put_mr(e->rdma_sge.mr);
-			e->rdma_sge.mr = NULL;
-		}
+		release_rdma_sge_mr(e);
 		reth = &ohdr->u.rc.reth;
 		len = be32_to_cpu(reth->length);
 		if (len) {
@@ -2342,45 +3110,49 @@
 		qp->r_state = opcode;
 		qp->r_nak_state = 0;
 		qp->r_head_ack_queue = next;
+		qpriv->r_tid_alloc = qp->r_head_ack_queue;
 
 		/* Schedule the send engine. */
 		qp->s_flags |= RVT_S_RESP_PENDING;
+		if (fecn)
+			qp->s_flags |= RVT_S_ECN;
 		hfi1_schedule_send(qp);
 
 		spin_unlock_irqrestore(&qp->s_lock, flags);
-		if (is_fecn)
-			goto send_ack;
 		return;
 	}
 
 	case OP(COMPARE_SWAP):
 	case OP(FETCH_ADD): {
-		struct ib_atomic_eth *ateth;
+		struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth;
+		u64 vaddr = get_ib_ateth_vaddr(ateth);
+		bool opfn = opcode == OP(COMPARE_SWAP) &&
+			vaddr == HFI1_VERBS_E_ATOMIC_VADDR;
 		struct rvt_ack_entry *e;
-		u64 vaddr;
 		atomic64_t *maddr;
 		u64 sdata;
 		u32 rkey;
 		u8 next;
 
-		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+			     !opfn))
 			goto nack_inv;
 		next = qp->r_head_ack_queue + 1;
-		if (next > HFI1_MAX_RDMA_ATOMIC)
+		if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
 			next = 0;
 		spin_lock_irqsave(&qp->s_lock, flags);
-		if (unlikely(next == qp->s_tail_ack_queue)) {
+		if (unlikely(next == qp->s_acked_ack_queue)) {
 			if (!qp->s_ack_queue[next].sent)
 				goto nack_inv_unlck;
 			update_ack_queue(qp, next);
 		}
 		e = &qp->s_ack_queue[qp->r_head_ack_queue];
-		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
-			rvt_put_mr(e->rdma_sge.mr);
-			e->rdma_sge.mr = NULL;
+		release_rdma_sge_mr(e);
+		/* Process OPFN special virtual address */
+		if (opfn) {
+			opfn_conn_response(qp, e, ateth);
+			goto ack;
 		}
-		ateth = &ohdr->u.atomic_eth;
-		vaddr = get_ib_ateth_vaddr(ateth);
 		if (unlikely(vaddr & (sizeof(u64) - 1)))
 			goto nack_inv_unlck;
 		rkey = be32_to_cpu(ateth->rkey);
@@ -2399,6 +3171,7 @@
 				      sdata);
 		rvt_put_mr(qp->r_sge.sge.mr);
 		qp->r_sge.num_sge = 0;
+ack:
 		e->opcode = opcode;
 		e->sent = 0;
 		e->psn = psn;
@@ -2408,14 +3181,15 @@
 		qp->r_state = opcode;
 		qp->r_nak_state = 0;
 		qp->r_head_ack_queue = next;
+		qpriv->r_tid_alloc = qp->r_head_ack_queue;
 
 		/* Schedule the send engine. */
 		qp->s_flags |= RVT_S_RESP_PENDING;
+		if (fecn)
+			qp->s_flags |= RVT_S_ECN;
 		hfi1_schedule_send(qp);
 
 		spin_unlock_irqrestore(&qp->s_lock, flags);
-		if (is_fecn)
-			goto send_ack;
 		return;
 	}
 
@@ -2428,16 +3202,9 @@
 	qp->r_ack_psn = psn;
 	qp->r_nak_state = 0;
 	/* Send an ACK if requested or required. */
-	if (psn & IB_BTH_REQ_ACK) {
-		if (packet->numpkt == 0) {
-			rc_cancel_ack(qp);
-			goto send_ack;
-		}
-		if (qp->r_adefered >= HFI1_PSN_CREDIT) {
-			rc_cancel_ack(qp);
-			goto send_ack;
-		}
-		if (unlikely(is_fecn)) {
+	if (psn & IB_BTH_REQ_ACK || fecn) {
+		if (packet->numpkt == 0 || fecn ||
+		    qp->r_adefered >= HFI1_PSN_CREDIT) {
 			rc_cancel_ack(qp);
 			goto send_ack;
 		}
@@ -2478,7 +3245,7 @@
 	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
 	qp->r_ack_psn = qp->r_psn;
 send_ack:
-	hfi1_send_rc_ack(packet, is_fecn);
+	hfi1_send_rc_ack(packet, fecn);
 }
 
 void hfi1_rc_hdrerr(
diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h
new file mode 100644
index 0000000..5ed5e85
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/rc.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+
+#ifndef HFI1_RC_H
+#define HFI1_RC_H
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n)
+{
+	unsigned int next;
+
+	next = n + 1;
+	if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+		next = 0;
+	qp->s_tail_ack_queue = next;
+	qp->s_acked_ack_queue = next;
+	qp->s_ack_state = OP(ACKNOWLEDGE);
+}
+
+static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
+				  struct rvt_qp *qp)
+{
+	if (list_empty(&qp->rspwait)) {
+		qp->r_flags |= RVT_R_RSP_NAK;
+		rvt_get_qp(qp);
+		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+	}
+}
+
+static inline u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
+			      u32 psn, u32 pmtu)
+{
+	u32 len;
+
+	len = delta_psn(psn, wqe->psn) * pmtu;
+	return rvt_restart_sge(ss, wqe, len);
+}
+
+static inline void release_rdma_sge_mr(struct rvt_ack_entry *e)
+{
+	if (e->rdma_sge.mr) {
+		rvt_put_mr(e->rdma_sge.mr);
+		e->rdma_sge.mr = NULL;
+	}
+}
+
+struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
+				      u8 *prev_ack, bool *scheduled);
+int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val,
+	      struct hfi1_ctxtdata *rcd);
+struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, struct rvt_swqe *wqe,
+				  struct hfi1_ibport *ibp);
+
+#endif /* HFI1_RC_H */
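
restart_sge() above turns a PSN delta into a byte offset: every packet
before the restart point carried a full path MTU of payload. A toy
computation with assumed values (wraparound ignored, which delta_psn()
would otherwise handle):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* Restarting 3 packets into a WQE with a 4096-byte path MTU
         * means skipping 3 * 4096 bytes before rewinding the SGEs. */
        uint32_t wqe_psn = 200, psn = 203, pmtu = 4096;
        uint32_t len = (psn - wqe_psn) * pmtu;

        printf("resume %u bytes into the payload\n", len);  /* 12288 */
        return 0;
    }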
diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c
index 5f56f3c..23ac605 100644
--- a/drivers/infiniband/hw/hfi1/ruc.c
+++ b/drivers/infiniband/hw/hfi1/ruc.c
@@ -156,333 +156,6 @@
 }
 
 /**
- * ruc_loopback - handle UC and RC loopback requests
- * @sqp: the sending QP
- *
- * This is called from hfi1_do_send() to
- * forward a WQE addressed to the same HFI.
- * Note that although we are single threaded due to the send engine, we still
- * have to protect against post_send().  We don't have to worry about
- * receive interrupts since this is a connected protocol and all packets
- * will pass through here.
- */
-static void ruc_loopback(struct rvt_qp *sqp)
-{
-	struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
-	struct rvt_qp *qp;
-	struct rvt_swqe *wqe;
-	struct rvt_sge *sge;
-	unsigned long flags;
-	struct ib_wc wc;
-	u64 sdata;
-	atomic64_t *maddr;
-	enum ib_wc_status send_status;
-	bool release;
-	int ret;
-	bool copy_last = false;
-	int local_ops = 0;
-
-	rcu_read_lock();
-
-	/*
-	 * Note that we check the responder QP state after
-	 * checking the requester's state.
-	 */
-	qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
-			    sqp->remote_qpn);
-
-	spin_lock_irqsave(&sqp->s_lock, flags);
-
-	/* Return if we are already busy processing a work request. */
-	if ((sqp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT)) ||
-	    !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND))
-		goto unlock;
-
-	sqp->s_flags |= RVT_S_BUSY;
-
-again:
-	if (sqp->s_last == READ_ONCE(sqp->s_head))
-		goto clr_busy;
-	wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
-
-	/* Return if it is not OK to start a new work request. */
-	if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
-		if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND))
-			goto clr_busy;
-		/* We are in the error state, flush the work request. */
-		send_status = IB_WC_WR_FLUSH_ERR;
-		goto flush_send;
-	}
-
-	/*
-	 * We can rely on the entry not changing without the s_lock
-	 * being held until we update s_last.
-	 * We increment s_cur to indicate s_last is in progress.
-	 */
-	if (sqp->s_last == sqp->s_cur) {
-		if (++sqp->s_cur >= sqp->s_size)
-			sqp->s_cur = 0;
-	}
-	spin_unlock_irqrestore(&sqp->s_lock, flags);
-
-	if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
-	    qp->ibqp.qp_type != sqp->ibqp.qp_type) {
-		ibp->rvp.n_pkt_drops++;
-		/*
-		 * For RC, the requester would timeout and retry so
-		 * shortcut the timeouts and just signal too many retries.
-		 */
-		if (sqp->ibqp.qp_type == IB_QPT_RC)
-			send_status = IB_WC_RETRY_EXC_ERR;
-		else
-			send_status = IB_WC_SUCCESS;
-		goto serr;
-	}
-
-	memset(&wc, 0, sizeof(wc));
-	send_status = IB_WC_SUCCESS;
-
-	release = true;
-	sqp->s_sge.sge = wqe->sg_list[0];
-	sqp->s_sge.sg_list = wqe->sg_list + 1;
-	sqp->s_sge.num_sge = wqe->wr.num_sge;
-	sqp->s_len = wqe->length;
-	switch (wqe->wr.opcode) {
-	case IB_WR_REG_MR:
-		goto send_comp;
-
-	case IB_WR_LOCAL_INV:
-		if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
-			if (rvt_invalidate_rkey(sqp,
-						wqe->wr.ex.invalidate_rkey))
-				send_status = IB_WC_LOC_PROT_ERR;
-			local_ops = 1;
-		}
-		goto send_comp;
-
-	case IB_WR_SEND_WITH_INV:
-		if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) {
-			wc.wc_flags = IB_WC_WITH_INVALIDATE;
-			wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey;
-		}
-		goto send;
-
-	case IB_WR_SEND_WITH_IMM:
-		wc.wc_flags = IB_WC_WITH_IMM;
-		wc.ex.imm_data = wqe->wr.ex.imm_data;
-		/* FALLTHROUGH */
-	case IB_WR_SEND:
-send:
-		ret = rvt_get_rwqe(qp, false);
-		if (ret < 0)
-			goto op_err;
-		if (!ret)
-			goto rnr_nak;
-		break;
-
-	case IB_WR_RDMA_WRITE_WITH_IMM:
-		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
-			goto inv_err;
-		wc.wc_flags = IB_WC_WITH_IMM;
-		wc.ex.imm_data = wqe->wr.ex.imm_data;
-		ret = rvt_get_rwqe(qp, true);
-		if (ret < 0)
-			goto op_err;
-		if (!ret)
-			goto rnr_nak;
-		/* skip copy_last set and qp_access_flags recheck */
-		goto do_write;
-	case IB_WR_RDMA_WRITE:
-		copy_last = rvt_is_user_qp(qp);
-		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
-			goto inv_err;
-do_write:
-		if (wqe->length == 0)
-			break;
-		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
-					  wqe->rdma_wr.remote_addr,
-					  wqe->rdma_wr.rkey,
-					  IB_ACCESS_REMOTE_WRITE)))
-			goto acc_err;
-		qp->r_sge.sg_list = NULL;
-		qp->r_sge.num_sge = 1;
-		qp->r_sge.total_len = wqe->length;
-		break;
-
-	case IB_WR_RDMA_READ:
-		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
-			goto inv_err;
-		if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
-					  wqe->rdma_wr.remote_addr,
-					  wqe->rdma_wr.rkey,
-					  IB_ACCESS_REMOTE_READ)))
-			goto acc_err;
-		release = false;
-		sqp->s_sge.sg_list = NULL;
-		sqp->s_sge.num_sge = 1;
-		qp->r_sge.sge = wqe->sg_list[0];
-		qp->r_sge.sg_list = wqe->sg_list + 1;
-		qp->r_sge.num_sge = wqe->wr.num_sge;
-		qp->r_sge.total_len = wqe->length;
-		break;
-
-	case IB_WR_ATOMIC_CMP_AND_SWP:
-	case IB_WR_ATOMIC_FETCH_AND_ADD:
-		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
-			goto inv_err;
-		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
-					  wqe->atomic_wr.remote_addr,
-					  wqe->atomic_wr.rkey,
-					  IB_ACCESS_REMOTE_ATOMIC)))
-			goto acc_err;
-		/* Perform atomic OP and save result. */
-		maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
-		sdata = wqe->atomic_wr.compare_add;
-		*(u64 *)sqp->s_sge.sge.vaddr =
-			(wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
-			(u64)atomic64_add_return(sdata, maddr) - sdata :
-			(u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
-				      sdata, wqe->atomic_wr.swap);
-		rvt_put_mr(qp->r_sge.sge.mr);
-		qp->r_sge.num_sge = 0;
-		goto send_comp;
-
-	default:
-		send_status = IB_WC_LOC_QP_OP_ERR;
-		goto serr;
-	}
-
-	sge = &sqp->s_sge.sge;
-	while (sqp->s_len) {
-		u32 len = sqp->s_len;
-
-		if (len > sge->length)
-			len = sge->length;
-		if (len > sge->sge_length)
-			len = sge->sge_length;
-		WARN_ON_ONCE(len == 0);
-		hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release, copy_last);
-		sge->vaddr += len;
-		sge->length -= len;
-		sge->sge_length -= len;
-		if (sge->sge_length == 0) {
-			if (!release)
-				rvt_put_mr(sge->mr);
-			if (--sqp->s_sge.num_sge)
-				*sge = *sqp->s_sge.sg_list++;
-		} else if (sge->length == 0 && sge->mr->lkey) {
-			if (++sge->n >= RVT_SEGSZ) {
-				if (++sge->m >= sge->mr->mapsz)
-					break;
-				sge->n = 0;
-			}
-			sge->vaddr =
-				sge->mr->map[sge->m]->segs[sge->n].vaddr;
-			sge->length =
-				sge->mr->map[sge->m]->segs[sge->n].length;
-		}
-		sqp->s_len -= len;
-	}
-	if (release)
-		rvt_put_ss(&qp->r_sge);
-
-	if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
-		goto send_comp;
-
-	if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
-		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-	else
-		wc.opcode = IB_WC_RECV;
-	wc.wr_id = qp->r_wr_id;
-	wc.status = IB_WC_SUCCESS;
-	wc.byte_len = wqe->length;
-	wc.qp = &qp->ibqp;
-	wc.src_qp = qp->remote_qpn;
-	wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX;
-	wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
-	wc.port_num = 1;
-	/* Signal completion event if the solicited bit is set. */
-	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-		     wqe->wr.send_flags & IB_SEND_SOLICITED);
-
-send_comp:
-	spin_lock_irqsave(&sqp->s_lock, flags);
-	ibp->rvp.n_loop_pkts++;
-flush_send:
-	sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
-	hfi1_send_complete(sqp, wqe, send_status);
-	if (local_ops) {
-		atomic_dec(&sqp->local_ops_pending);
-		local_ops = 0;
-	}
-	goto again;
-
-rnr_nak:
-	/* Handle RNR NAK */
-	if (qp->ibqp.qp_type == IB_QPT_UC)
-		goto send_comp;
-	ibp->rvp.n_rnr_naks++;
-	/*
-	 * Note: we don't need the s_lock held since the BUSY flag
-	 * makes this single threaded.
-	 */
-	if (sqp->s_rnr_retry == 0) {
-		send_status = IB_WC_RNR_RETRY_EXC_ERR;
-		goto serr;
-	}
-	if (sqp->s_rnr_retry_cnt < 7)
-		sqp->s_rnr_retry--;
-	spin_lock_irqsave(&sqp->s_lock, flags);
-	if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
-		goto clr_busy;
-	rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer <<
-				IB_AETH_CREDIT_SHIFT);
-	goto clr_busy;
-
-op_err:
-	send_status = IB_WC_REM_OP_ERR;
-	wc.status = IB_WC_LOC_QP_OP_ERR;
-	goto err;
-
-inv_err:
-	send_status = IB_WC_REM_INV_REQ_ERR;
-	wc.status = IB_WC_LOC_QP_OP_ERR;
-	goto err;
-
-acc_err:
-	send_status = IB_WC_REM_ACCESS_ERR;
-	wc.status = IB_WC_LOC_PROT_ERR;
-err:
-	/* responder goes to error state */
-	rvt_rc_error(qp, wc.status);
-
-serr:
-	spin_lock_irqsave(&sqp->s_lock, flags);
-	hfi1_send_complete(sqp, wqe, send_status);
-	if (sqp->ibqp.qp_type == IB_QPT_RC) {
-		int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
-
-		sqp->s_flags &= ~RVT_S_BUSY;
-		spin_unlock_irqrestore(&sqp->s_lock, flags);
-		if (lastwqe) {
-			struct ib_event ev;
-
-			ev.device = sqp->ibqp.device;
-			ev.element.qp = &sqp->ibqp;
-			ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
-			sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
-		}
-		goto done;
-	}
-clr_busy:
-	sqp->s_flags &= ~RVT_S_BUSY;
-unlock:
-	spin_unlock_irqrestore(&sqp->s_lock, flags);
-done:
-	rcu_read_unlock();
-}
-
-/**
  * hfi1_make_grh - construct a GRH header
  * @ibp: a pointer to the IB port
  * @hdr: a pointer to the GRH header being constructed
@@ -577,7 +250,6 @@
 				     struct ib_other_headers *ohdr,
 				     u32 bth0, u32 bth1, u32 bth2)
 {
-	bth1 |= qp->remote_qpn;
 	ohdr->bth[0] = cpu_to_be32(bth0);
 	ohdr->bth[1] = cpu_to_be32(bth1);
 	ohdr->bth[2] = cpu_to_be32(bth2);
@@ -599,13 +271,13 @@
  */
 static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp,
 					    struct ib_other_headers *ohdr,
-					    u32 bth0, u32 bth2, int middle,
+					    u32 bth0, u32 bth1, u32 bth2,
+					    int middle,
 					    struct hfi1_pkt_state *ps)
 {
 	struct hfi1_qp_priv *priv = qp->priv;
 	struct hfi1_ibport *ibp = ps->ibp;
 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-	u32 bth1 = 0;
 	u32 slid;
 	u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
 	u8 l4 = OPA_16B_L4_IB_LOCAL;
@@ -687,12 +359,12 @@
  */
 static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp,
 					   struct ib_other_headers *ohdr,
-					   u32 bth0, u32 bth2, int middle,
+					   u32 bth0, u32 bth1, u32 bth2,
+					   int middle,
 					   struct hfi1_pkt_state *ps)
 {
 	struct hfi1_qp_priv *priv = qp->priv;
 	struct hfi1_ibport *ibp = ps->ibp;
-	u32 bth1 = 0;
 	u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
 	u16 lrh0 = HFI1_LRH_BTH;
 	u8 extra_bytes = -ps->s_txreq->s_cur_size & 3;
@@ -742,7 +414,7 @@
 
 typedef void (*hfi1_make_ruc_hdr)(struct rvt_qp *qp,
 				  struct ib_other_headers *ohdr,
-				  u32 bth0, u32 bth2, int middle,
+				  u32 bth0, u32 bth1, u32 bth2, int middle,
 				  struct hfi1_pkt_state *ps);
 
 /* We support only two types - 9B and 16B for now */
@@ -752,7 +424,7 @@
 };
 
 void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
-			  u32 bth0, u32 bth2, int middle,
+			  u32 bth0, u32 bth1, u32 bth2, int middle,
 			  struct hfi1_pkt_state *ps)
 {
 	struct hfi1_qp_priv *priv = qp->priv;
@@ -773,18 +445,21 @@
 	priv->s_ahg->ahgidx = 0;
 
 	/* Make the appropriate header */
-	hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth2, middle, ps);
+	hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth1, bth2, middle,
+					    ps);
 }
 
 /* when sending, force a reschedule every one of these periods */
 #define SEND_RESCHED_TIMEOUT (5 * HZ)  /* 5s in jiffies */
 
 /**
- * schedule_send_yield - test for a yield required for QP send engine
+ * hfi1_schedule_send_yield - test for a yield required for QP
+ * send engine
  * @timeout: Final time for timeout slice for jiffies
  * @qp: a pointer to QP
 * @ps: a pointer to a structure with commonly looked-up values for
 *      the send engine progress
+ * @tid: true if it is the tid leg
  *
  * This routine checks if the time slice for the QP has expired
 * for RC QPs; if so, an additional work entry is queued. At this
@@ -792,8 +467,8 @@
  * returns true if a yield is required, otherwise, false
  * is returned.
  */
-static bool schedule_send_yield(struct rvt_qp *qp,
-				struct hfi1_pkt_state *ps)
+bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+			      bool tid)
 {
 	ps->pkts_sent = true;
 
@@ -801,8 +476,24 @@
 		if (!ps->in_thread ||
 		    workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) {
 			spin_lock_irqsave(&qp->s_lock, ps->flags);
-			qp->s_flags &= ~RVT_S_BUSY;
-			hfi1_schedule_send(qp);
+			if (!tid) {
+				qp->s_flags &= ~RVT_S_BUSY;
+				hfi1_schedule_send(qp);
+			} else {
+				struct hfi1_qp_priv *priv = qp->priv;
+
+				if (priv->s_flags &
+				    HFI1_S_TID_BUSY_SET) {
+					qp->s_flags &= ~RVT_S_BUSY;
+					priv->s_flags &=
+						~(HFI1_S_TID_BUSY_SET |
+						  RVT_S_BUSY);
+				} else {
+					priv->s_flags &= ~RVT_S_BUSY;
+				}
+				hfi1_schedule_tid_send(qp);
+			}
+
 			spin_unlock_irqrestore(&qp->s_lock, ps->flags);
 			this_cpu_inc(*ps->ppd->dd->send_schedule);
 			trace_hfi1_rc_expired_time_slice(qp, true);
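
The time-slice test described in the comment block above boils down to a
wraparound-safe jiffies comparison against a deadline set when the send
loop started. A hedged userspace sketch of that shape; HZ and the helper
are stand-ins, only SEND_RESCHED_TIMEOUT's definition comes from this file.

    #include <stdbool.h>
    #include <stdio.h>

    #define HZ 250                          /* assumed tick rate        */
    #define SEND_RESCHED_TIMEOUT (5 * HZ)   /* same constant as ruc.c   */

    /* Same trick as the kernel's time_after(): the signed difference
     * stays correct across counter wraparound. */
    static bool slice_expired(unsigned long now, unsigned long deadline)
    {
        return (long)(now - deadline) > 0;
    }

    int main(void)
    {
        unsigned long start = 1000;
        unsigned long deadline = start + SEND_RESCHED_TIMEOUT;

        printf("%d\n", slice_expired(start + 1, deadline));     /* 0 */
        printf("%d\n", slice_expired(deadline + 1, deadline));  /* 1 */
        return 0;
    }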
@@ -825,15 +516,15 @@
 
 void _hfi1_do_send(struct work_struct *work)
 {
-	struct iowait *wait = container_of(work, struct iowait, iowork);
-	struct rvt_qp *qp = iowait_to_qp(wait);
+	struct iowait_work *w = container_of(work, struct iowait_work, iowork);
+	struct rvt_qp *qp = iowait_to_qp(w->iow);
 
 	hfi1_do_send(qp, true);
 }
 
 /**
  * hfi1_do_send - perform a send on a QP
- * @work: contains a pointer to the QP
+ * @qp: a pointer to the QP
  * @in_thread: true if in a workqueue thread
  *
  * Process entries in the send work queue until credit or queue is
@@ -850,6 +541,7 @@
 	ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
 	ps.ppd = ppd_from_ibp(ps.ibp);
 	ps.in_thread = in_thread;
+	ps.wait = iowait_get_ib_work(&priv->s_iowait);
 
 	trace_hfi1_rc_do_send(qp, in_thread);
 
@@ -858,7 +550,7 @@
 		if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) &
 				   ~((1 << ps.ppd->lmc) - 1)) ==
 				  ps.ppd->lid)) {
-			ruc_loopback(qp);
+			rvt_ruc_loopback(qp);
 			return;
 		}
 		make_req = hfi1_make_rc_req;
@@ -868,7 +560,7 @@
 		if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) &
 				   ~((1 << ps.ppd->lmc) - 1)) ==
 				  ps.ppd->lid)) {
-			ruc_loopback(qp);
+			rvt_ruc_loopback(qp);
 			return;
 		}
 		make_req = hfi1_make_uc_req;
@@ -883,6 +575,8 @@
 
 	/* Return if we are already busy processing a work request. */
 	if (!hfi1_send_ok(qp)) {
+		if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+			iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
 		spin_unlock_irqrestore(&qp->s_lock, ps.flags);
 		return;
 	}
@@ -896,10 +590,12 @@
 	ps.pkts_sent = false;
 
	/* ensure a pre-built packet is handled */
-	ps.s_txreq = get_waiting_verbs_txreq(qp);
+	ps.s_txreq = get_waiting_verbs_txreq(ps.wait);
 	do {
 		/* Check for a constructed packet to be sent. */
 		if (ps.s_txreq) {
+			if (priv->s_flags & HFI1_S_TID_BUSY_SET)
+				qp->s_flags |= RVT_S_BUSY;
 			spin_unlock_irqrestore(&qp->s_lock, ps.flags);
 			/*
 			 * If the packet cannot be sent now, return and
@@ -907,8 +603,9 @@
 			 */
 			if (hfi1_verbs_send(qp, &ps))
 				return;
+
 			/* allow other tasks to run */
-			if (schedule_send_yield(qp, &ps))
+			if (hfi1_schedule_send_yield(qp, &ps, false))
 				return;
 
 			spin_lock_irqsave(&qp->s_lock, ps.flags);
@@ -917,44 +614,3 @@
 	iowait_starve_clear(ps.pkts_sent, &priv->s_iowait);
 	spin_unlock_irqrestore(&qp->s_lock, ps.flags);
 }
-
-/*
- * This should be called with s_lock held.
- */
-void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
-			enum ib_wc_status status)
-{
-	u32 old_last, last;
-
-	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
-		return;
-
-	last = qp->s_last;
-	old_last = last;
-	trace_hfi1_qp_send_completion(qp, wqe, last);
-	if (++last >= qp->s_size)
-		last = 0;
-	trace_hfi1_qp_send_completion(qp, wqe, last);
-	qp->s_last = last;
-	/* See post_send() */
-	barrier();
-	rvt_put_swqe(wqe);
-	if (qp->ibqp.qp_type == IB_QPT_UD ||
-	    qp->ibqp.qp_type == IB_QPT_SMI ||
-	    qp->ibqp.qp_type == IB_QPT_GSI)
-		atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount);
-
-	rvt_qp_swqe_complete(qp,
-			     wqe,
-			     ib_hfi1_wc_opcode[wqe->wr.opcode],
-			     status);
-
-	if (qp->s_acked == old_last)
-		qp->s_acked = last;
-	if (qp->s_cur == old_last)
-		qp->s_cur = last;
-	if (qp->s_tail == old_last)
-		qp->s_tail = last;
-	if (qp->state == IB_QPS_SQD && last == qp->s_cur)
-		qp->s_draining = 0;
-}
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index 88e326d..c61b602 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -65,6 +65,7 @@
 #define SDMA_DESCQ_CNT 2048
 #define SDMA_DESC_INTR 64
 #define INVALID_TAIL 0xffff
+#define SDMA_PAD max_t(size_t, MAX_16B_PADDING, sizeof(u32))
 
 static uint sdma_descq_cnt = SDMA_DESCQ_CNT;
 module_param(sdma_descq_cnt, uint, S_IRUGO);
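
SDMA_PAD sizes the shared pad buffer as the larger of the 16B padding
requirement and a u32, so one allocation serves both uses. A userspace
re-creation of the max_t() idea; the kernel macro additionally type-checks
its arguments, and 12 is only a sample value for MAX_16B_PADDING.

    #include <stdio.h>

    #define max_t(type, a, b) \
        ((type)(a) > (type)(b) ? (type)(a) : (type)(b))
    #define MAX_16B_PADDING 12  /* assumed sample value */
    #define SDMA_PAD max_t(size_t, MAX_16B_PADDING, sizeof(unsigned int))

    int main(void)
    {
        printf("SDMA_PAD = %zu\n", (size_t)SDMA_PAD);  /* 12 */
        return 0;
    }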
@@ -378,7 +379,7 @@
 	__sdma_txclean(sde->dd, tx);
 	if (complete)
 		(*complete)(tx, res);
-	if (wait && iowait_sdma_dec(wait))
+	if (iowait_sdma_dec(wait))
 		iowait_drain_wakeup(wait);
 }
 
@@ -405,19 +406,33 @@
 	struct sdma_txreq *txp, *txp_next;
 	LIST_HEAD(flushlist);
 	unsigned long flags;
+	uint seq;
 
 	/* flush from head to tail */
 	sdma_flush_descq(sde);
 	spin_lock_irqsave(&sde->flushlist_lock, flags);
 	/* copy flush list */
-	list_for_each_entry_safe(txp, txp_next, &sde->flushlist, list) {
-		list_del_init(&txp->list);
-		list_add_tail(&txp->list, &flushlist);
-	}
+	list_splice_init(&sde->flushlist, &flushlist);
 	spin_unlock_irqrestore(&sde->flushlist_lock, flags);
 	/* flush from flush list */
 	list_for_each_entry_safe(txp, txp_next, &flushlist, list)
 		complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
+	/* wakeup QPs orphaned on the dmawait list */
+	do {
+		struct iowait *w, *nw;
+
+		seq = read_seqbegin(&sde->waitlock);
+		if (!list_empty(&sde->dmawait)) {
+			write_seqlock(&sde->waitlock);
+			list_for_each_entry_safe(w, nw, &sde->dmawait, list) {
+				if (w->wakeup) {
+					w->wakeup(w, SDMA_AVAIL_REASON);
+					list_del_init(&w->list);
+				}
+			}
+			write_sequnlock(&sde->waitlock);
+		}
+	} while (read_seqretry(&sde->waitlock, seq));
 }
 
 /*
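
The flush path above samples the waitlock with read_seqbegin(), takes the
write side only when the list is non-empty, and re-checks with
read_seqretry(), so the common empty-list case costs no lock at all. A
minimal userspace analogue of that seqlock pattern, purely illustrative and
without the kernel's memory-barrier subtleties:

    #include <pthread.h>
    #include <stdatomic.h>

    struct seqlock {
        pthread_mutex_t lock;
        atomic_uint     seq;   /* odd while a write is in progress */
    };

    static unsigned read_begin(struct seqlock *s)
    {
        unsigned seq;

        while ((seq = atomic_load(&s->seq)) & 1)
            ;                  /* writer active: wait it out */
        return seq;
    }

    static int read_retry(struct seqlock *s, unsigned seq)
    {
        return atomic_load(&s->seq) != seq;  /* changed: redo the read */
    }

    static void write_lock(struct seqlock *s)
    {
        pthread_mutex_lock(&s->lock);
        atomic_fetch_add(&s->seq, 1);        /* count goes odd */
    }

    static void write_unlock(struct seqlock *s)
    {
        atomic_fetch_add(&s->seq, 1);        /* even again */
        pthread_mutex_unlock(&s->lock);
    }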
@@ -855,14 +870,13 @@
 {
 	struct sdma_rht_node *rht_node;
 	struct sdma_engine *sde = NULL;
-	const struct cpumask *current_mask = &current->cpus_allowed;
 	unsigned long cpu_id;
 
 	/*
 	 * To ensure that the same sdma engine(s) will always be
 	 * selected, make sure the process is pinned to this CPU only.
 	 */
-	if (cpumask_weight(current_mask) != 1)
+	if (current->nr_cpus_allowed != 1)
 		goto out;
 
 	cpu_id = smp_processor_id();
@@ -1283,7 +1297,7 @@
 	struct sdma_engine *sde;
 
 	if (dd->sdma_pad_dma) {
-		dma_free_coherent(&dd->pcidev->dev, 4,
+		dma_free_coherent(&dd->pcidev->dev, SDMA_PAD,
 				  (void *)dd->sdma_pad_dma,
 				  dd->sdma_pad_phys);
 		dd->sdma_pad_dma = NULL;
@@ -1424,6 +1438,7 @@
 		seqlock_init(&sde->head_lock);
 		spin_lock_init(&sde->senddmactrl_lock);
 		spin_lock_init(&sde->flushlist_lock);
+		seqlock_init(&sde->waitlock);
		/* ensure there is always a zero bit */
 		sde->ahg_bits = 0xfffffffe00000000ULL;
 
@@ -1452,12 +1467,9 @@
 		timer_setup(&sde->err_progress_check_timer,
 			    sdma_err_progress_check, 0);
 
-		sde->descq = dma_zalloc_coherent(
-			&dd->pcidev->dev,
-			descq_cnt * sizeof(u64[2]),
-			&sde->descq_phys,
-			GFP_KERNEL
-		);
+		sde->descq = dma_alloc_coherent(&dd->pcidev->dev,
+						descq_cnt * sizeof(u64[2]),
+						&sde->descq_phys, GFP_KERNEL);
 		if (!sde->descq)
 			goto bail;
 		sde->tx_ring =
@@ -1470,24 +1482,18 @@
 
 	dd->sdma_heads_size = L1_CACHE_BYTES * num_engines;
 	/* Allocate memory for DMA of head registers to memory */
-	dd->sdma_heads_dma = dma_zalloc_coherent(
-		&dd->pcidev->dev,
-		dd->sdma_heads_size,
-		&dd->sdma_heads_phys,
-		GFP_KERNEL
-	);
+	dd->sdma_heads_dma = dma_alloc_coherent(&dd->pcidev->dev,
+						dd->sdma_heads_size,
+						&dd->sdma_heads_phys,
+						GFP_KERNEL);
 	if (!dd->sdma_heads_dma) {
 		dd_dev_err(dd, "failed to allocate SendDMA head memory\n");
 		goto bail;
 	}
 
 	/* Allocate memory for pad */
-	dd->sdma_pad_dma = dma_zalloc_coherent(
-		&dd->pcidev->dev,
-		sizeof(u32),
-		&dd->sdma_pad_phys,
-		GFP_KERNEL
-	);
+	dd->sdma_pad_dma = dma_alloc_coherent(&dd->pcidev->dev, SDMA_PAD,
+					      &dd->sdma_pad_phys, GFP_KERNEL);
 	if (!dd->sdma_pad_dma) {
 		dd_dev_err(dd, "failed to allocate SendDMA pad memory\n");
 		goto bail;
@@ -1521,8 +1527,11 @@
 	}
 
 	ret = rhashtable_init(tmp_sdma_rht, &sdma_rht_params);
-	if (ret < 0)
+	if (ret < 0) {
+		kfree(tmp_sdma_rht);
 		goto bail;
+	}
+
 	dd->sdma_rht = tmp_sdma_rht;
 
 	dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma);
@@ -1755,12 +1764,9 @@
  */
 static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
 {
-	struct iowait *wait, *nw;
+	struct iowait *wait, *nw, *twait;
 	struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
-	uint i, n = 0, seq, max_idx = 0;
-	struct sdma_txreq *stx;
-	struct hfi1_ibdev *dev = &sde->dd->verbs_dev;
-	u8 max_starved_cnt = 0;
+	uint i, n = 0, seq, tidx = 0;
 
 #ifdef CONFIG_SDMA_VERBOSITY
 	dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
@@ -1769,49 +1775,50 @@
 #endif
 
 	do {
-		seq = read_seqbegin(&dev->iowait_lock);
+		seq = read_seqbegin(&sde->waitlock);
 		if (!list_empty(&sde->dmawait)) {
 			/* at least one item */
-			write_seqlock(&dev->iowait_lock);
+			write_seqlock(&sde->waitlock);
 			/* Harvest waiters wanting DMA descriptors */
 			list_for_each_entry_safe(
 					wait,
 					nw,
 					&sde->dmawait,
 					list) {
-				u16 num_desc = 0;
+				u32 num_desc;
 
 				if (!wait->wakeup)
 					continue;
 				if (n == ARRAY_SIZE(waits))
 					break;
-				if (!list_empty(&wait->tx_head)) {
-					stx = list_first_entry(
-						&wait->tx_head,
-						struct sdma_txreq,
-						list);
-					num_desc = stx->num_desc;
-				}
+				iowait_init_priority(wait);
+				num_desc = iowait_get_all_desc(wait);
 				if (num_desc > avail)
 					break;
 				avail -= num_desc;
-				/* Find the most starved wait memeber */
-				iowait_starve_find_max(wait, &max_starved_cnt,
-						       n, &max_idx);
+				/* Find the top-priority wait member */
+				if (n) {
+					twait = waits[tidx];
+					tidx =
+					    iowait_priority_update_top(wait,
+								       twait,
+								       n,
+								       tidx);
+				}
 				list_del_init(&wait->list);
 				waits[n++] = wait;
 			}
-			write_sequnlock(&dev->iowait_lock);
+			write_sequnlock(&sde->waitlock);
 			break;
 		}
-	} while (read_seqretry(&dev->iowait_lock, seq));
+	} while (read_seqretry(&sde->waitlock, seq));
 
-	/* Schedule the most starved one first */
+	/* Schedule the top-priority entry first */
 	if (n)
-		waits[max_idx]->wakeup(waits[max_idx], SDMA_AVAIL_REASON);
+		waits[tidx]->wakeup(waits[tidx], SDMA_AVAIL_REASON);
 
 	for (i = 0; i < n; i++)
-		if (i != max_idx)
+		if (i != tidx)
 			waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
 }
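
Stripped of locking, the wakeup pass above harvests a bounded batch of
waiters, remembers the index of the top-priority entry, and wakes that one
first so it gets the descriptors before the rest. A schematic sketch;
struct waiter and wake_batch() are invented names for the illustration.

    #include <stddef.h>

    struct waiter {
        int prio;
        void (*wakeup)(struct waiter *w);
    };

    /* Wake the remembered winner first, then the others in harvest
     * order, mirroring the tidx handling in sdma_desc_avail(). */
    static void wake_batch(struct waiter **waits, size_t n, size_t tidx)
    {
        size_t i;

        if (n)
            waits[tidx]->wakeup(waits[tidx]);
        for (i = 0; i < n; i++)
            if (i != tidx)
                waits[i]->wakeup(waits[i]);
    }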
 
@@ -2346,7 +2353,7 @@
  */
 static int sdma_check_progress(
 	struct sdma_engine *sde,
-	struct iowait *wait,
+	struct iowait_work *wait,
 	struct sdma_txreq *tx,
 	bool pkts_sent)
 {
@@ -2356,12 +2363,12 @@
 	if (tx->num_desc <= sde->desc_avail)
 		return -EAGAIN;
 	/* pulse the head_lock */
-	if (wait && wait->sleep) {
+	if (wait && iowait_ioww_to_iow(wait)->sleep) {
 		unsigned seq;
 
 		seq = raw_seqcount_begin(
 			(const seqcount_t *)&sde->head_lock.seqcount);
-		ret = wait->sleep(sde, wait, tx, seq, pkts_sent);
+		ret = wait->iow->sleep(sde, wait, tx, seq, pkts_sent);
 		if (ret == -EAGAIN)
 			sde->desc_avail = sdma_descq_freecnt(sde);
 	} else {
@@ -2373,7 +2380,7 @@
 /**
  * sdma_send_txreq() - submit a tx req to ring
  * @sde: sdma engine to use
- * @wait: wait structure to use when full (may be NULL)
+ * @wait: SE wait structure to use when full (may be NULL)
  * @tx: sdma_txreq to submit
  * @pkts_sent: has any packet been sent yet?
  *
@@ -2386,7 +2393,7 @@
  * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
  */
 int sdma_send_txreq(struct sdma_engine *sde,
-		    struct iowait *wait,
+		    struct iowait_work *wait,
 		    struct sdma_txreq *tx,
 		    bool pkts_sent)
 {
@@ -2397,7 +2404,7 @@
 	/* user should have supplied entire packet */
 	if (unlikely(tx->tlen))
 		return -EINVAL;
-	tx->wait = wait;
+	tx->wait = iowait_ioww_to_iow(wait);
 	spin_lock_irqsave(&sde->tail_lock, flags);
 retry:
 	if (unlikely(!__sdma_running(sde)))
@@ -2406,14 +2413,14 @@
 		goto nodesc;
 	tail = submit_tx(sde, tx);
 	if (wait)
-		iowait_sdma_inc(wait);
+		iowait_sdma_inc(iowait_ioww_to_iow(wait));
 	sdma_update_tail(sde, tail);
 unlock:
 	spin_unlock_irqrestore(&sde->tail_lock, flags);
 	return ret;
 unlock_noconn:
 	if (wait)
-		iowait_sdma_inc(wait);
+		iowait_sdma_inc(iowait_ioww_to_iow(wait));
 	tx->next_descq_idx = 0;
 #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
 	tx->sn = sde->tail_sn++;
@@ -2422,11 +2429,8 @@
 	spin_lock(&sde->flushlist_lock);
 	list_add_tail(&tx->list, &sde->flushlist);
 	spin_unlock(&sde->flushlist_lock);
-	if (wait) {
-		wait->tx_count++;
-		wait->count += tx->num_desc;
-	}
-	schedule_work(&sde->flush_worker);
+	iowait_inc_wait_count(wait, tx->num_desc);
+	queue_work_on(sde->cpu, system_highpri_wq, &sde->flush_worker);
 	ret = -ECOMM;
 	goto unlock;
 nodesc:
@@ -2442,9 +2446,9 @@
 /**
  * sdma_send_txlist() - submit a list of tx req to ring
  * @sde: sdma engine to use
- * @wait: wait structure to use when full (may be NULL)
+ * @wait: SE wait structure to use when full (may be NULL)
  * @tx_list: list of sdma_txreqs to submit
- * @count: pointer to a u32 which, after return will contain the total number of
+ * @count: pointer to a u16 which, after return, will contain the total number of
  *         sdma_txreqs removed from the tx_list. This will include sdma_txreqs
  *         whose SDMA descriptors are submitted to the ring and the sdma_txreqs
  *         which are added to SDMA engine flush list if the SDMA engine state is
@@ -2467,8 +2471,8 @@
  * -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring (wait == NULL)
  * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
  */
-int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait,
-		     struct list_head *tx_list, u32 *count_out)
+int sdma_send_txlist(struct sdma_engine *sde, struct iowait_work *wait,
+		     struct list_head *tx_list, u16 *count_out)
 {
 	struct sdma_txreq *tx, *tx_next;
 	int ret = 0;
@@ -2479,7 +2483,7 @@
 	spin_lock_irqsave(&sde->tail_lock, flags);
 retry:
 	list_for_each_entry_safe(tx, tx_next, tx_list, list) {
-		tx->wait = wait;
+		tx->wait = iowait_ioww_to_iow(wait);
 		if (unlikely(!__sdma_running(sde)))
 			goto unlock_noconn;
 		if (unlikely(tx->num_desc > sde->desc_avail))
@@ -2500,8 +2504,9 @@
 update_tail:
 	total_count = submit_count + flush_count;
 	if (wait) {
-		iowait_sdma_add(wait, total_count);
-		iowait_starve_clear(submit_count > 0, wait);
+		iowait_sdma_add(iowait_ioww_to_iow(wait), total_count);
+		iowait_starve_clear(submit_count > 0,
+				    iowait_ioww_to_iow(wait));
 	}
 	if (tail != INVALID_TAIL)
 		sdma_update_tail(sde, tail);
@@ -2511,7 +2516,7 @@
 unlock_noconn:
 	spin_lock(&sde->flushlist_lock);
 	list_for_each_entry_safe(tx, tx_next, tx_list, list) {
-		tx->wait = wait;
+		tx->wait = iowait_ioww_to_iow(wait);
 		list_del_init(&tx->list);
 		tx->next_descq_idx = 0;
 #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
@@ -2520,13 +2525,10 @@
 #endif
 		list_add_tail(&tx->list, &sde->flushlist);
 		flush_count++;
-		if (wait) {
-			wait->tx_count++;
-			wait->count += tx->num_desc;
-		}
+		iowait_inc_wait_count(wait, tx->num_desc);
 	}
 	spin_unlock(&sde->flushlist_lock);
-	schedule_work(&sde->flush_worker);
+	queue_work_on(sde->cpu, system_highpri_wq, &sde->flush_worker);
 	ret = -ECOMM;
 	goto update_tail;
 nodesc:
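
The SDMA submission paths above now take a struct iowait_work and convert back to the owning struct iowait only where the legacy fields (the sleep callback, the sdma counters, the wait counts) are needed. A minimal sketch of the conversion helper follows, under the assumption that it simply follows a back-pointer set when the work entry is initialized; the authoritative definition lives in the hfi1 iowait header, which is not part of this hunk.

	/* Assumed shape of the helper used by the hunk above (sketch only). */
	static inline struct iowait *iowait_ioww_to_iow(struct iowait_work *w)
	{
		if (w)
			return w->iow;	/* back-pointer to the owning iowait */
		return NULL;
	}

The call sites either test wait for NULL before dereferencing the result (the sleep pulse and the iowait_sdma_inc() updates) or merely store the possibly NULL pointer in tx->wait, so a NULL-tolerant helper preserves the old semantics.
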
diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h
index 46c775f..1e2e40f 100644
--- a/drivers/infiniband/hw/hfi1/sdma.h
+++ b/drivers/infiniband/hw/hfi1/sdma.h
@@ -1,7 +1,7 @@
 #ifndef _HFI1_SDMA_H
 #define _HFI1_SDMA_H
 /*
- * Copyright(c) 2015, 2016 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -62,16 +62,6 @@
 /* Hardware limit for SDMA packet size */
 #define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1)
 
-#define SDMA_TXREQ_S_OK        0
-#define SDMA_TXREQ_S_SENDERROR 1
-#define SDMA_TXREQ_S_ABORTED   2
-#define SDMA_TXREQ_S_SHUTDOWN  3
-
-/* flags bits */
-#define SDMA_TXREQ_F_URGENT       0x0001
-#define SDMA_TXREQ_F_AHG_COPY     0x0002
-#define SDMA_TXREQ_F_USE_AHG      0x0004
-
 #define SDMA_MAP_NONE          0
 #define SDMA_MAP_SINGLE        1
 #define SDMA_MAP_PAGE          2
@@ -392,6 +382,7 @@
 	u64                     progress_int_cnt;
 
 	/* private: */
+	seqlock_t            waitlock;
 	struct list_head      dmawait;
 
 	/* CONFIG SDMA for now, just blindly duplicate */
@@ -415,6 +406,7 @@
 	struct list_head flushlist;
 	struct cpumask cpu_mask;
 	struct kobject kobj;
+	u32 msix_intr;
 };
 
 int sdma_init(struct hfi1_devdata *dd, u8 port);
@@ -849,16 +841,16 @@
 			dd, SDMA_MAP_SINGLE, tx, addr, len);
 }
 
-struct iowait;
+struct iowait_work;
 
 int sdma_send_txreq(struct sdma_engine *sde,
-		    struct iowait *wait,
+		    struct iowait_work *wait,
 		    struct sdma_txreq *tx,
 		    bool pkts_sent);
 int sdma_send_txlist(struct sdma_engine *sde,
-		     struct iowait *wait,
+		     struct iowait_work *wait,
 		     struct list_head *tx_list,
-		     u32 *count);
+		     u16 *count_out);
 
 int sdma_ahg_alloc(struct sdma_engine *sde);
 void sdma_ahg_free(struct sdma_engine *sde, int ahg_index);
diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h
index bf7d777..514a478 100644
--- a/drivers/infiniband/hw/hfi1/sdma_txreq.h
+++ b/drivers/infiniband/hw/hfi1/sdma_txreq.h
@@ -91,6 +91,7 @@
 #define SDMA_TXREQ_F_URGENT       0x0001
 #define SDMA_TXREQ_F_AHG_COPY     0x0002
 #define SDMA_TXREQ_F_USE_AHG      0x0004
+#define SDMA_TXREQ_F_VIP          0x0010
 
 struct sdma_txreq;
 typedef void (*callback_t)(struct sdma_txreq *, int);
diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c
index 25e8673..90f62c4 100644
--- a/drivers/infiniband/hw/hfi1/sysfs.c
+++ b/drivers/infiniband/hw/hfi1/sysfs.c
@@ -494,20 +494,21 @@
  * Start of per-unit (or driver, in some cases, but replicated
  * per unit) functions (these get a device *)
  */
-static ssize_t show_rev(struct device *device, struct device_attribute *attr,
-			char *buf)
+static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr,
+			   char *buf)
 {
 	struct hfi1_ibdev *dev =
-		container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
 
 	return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
 }
+static DEVICE_ATTR_RO(hw_rev);
 
-static ssize_t show_hfi(struct device *device, struct device_attribute *attr,
-			char *buf)
+static ssize_t board_id_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
 {
 	struct hfi1_ibdev *dev =
-		container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
 	struct hfi1_devdata *dd = dd_from_dev(dev);
 	int ret;
 
@@ -517,23 +518,25 @@
 		ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname);
 	return ret;
 }
+static DEVICE_ATTR_RO(board_id);
 
-static ssize_t show_boardversion(struct device *device,
+static ssize_t boardversion_show(struct device *device,
 				 struct device_attribute *attr, char *buf)
 {
 	struct hfi1_ibdev *dev =
-		container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
 	struct hfi1_devdata *dd = dd_from_dev(dev);
 
 	/* The string printed here is already newline-terminated. */
 	return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion);
 }
+static DEVICE_ATTR_RO(boardversion);
 
-static ssize_t show_nctxts(struct device *device,
+static ssize_t nctxts_show(struct device *device,
 			   struct device_attribute *attr, char *buf)
 {
 	struct hfi1_ibdev *dev =
-		container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
 	struct hfi1_devdata *dd = dd_from_dev(dev);
 
 	/*
@@ -546,34 +549,37 @@
 			 min(dd->num_user_contexts,
 			     (u32)dd->sc_sizes[SC_USER].count));
 }
+static DEVICE_ATTR_RO(nctxts);
 
-static ssize_t show_nfreectxts(struct device *device,
+static ssize_t nfreectxts_show(struct device *device,
 			       struct device_attribute *attr, char *buf)
 {
 	struct hfi1_ibdev *dev =
-		container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
 	struct hfi1_devdata *dd = dd_from_dev(dev);
 
 	/* Return the number of free user ports (contexts) available. */
 	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts);
 }
+static DEVICE_ATTR_RO(nfreectxts);
 
-static ssize_t show_serial(struct device *device,
+static ssize_t serial_show(struct device *device,
 			   struct device_attribute *attr, char *buf)
 {
 	struct hfi1_ibdev *dev =
-		container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
 	struct hfi1_devdata *dd = dd_from_dev(dev);
 
 	return scnprintf(buf, PAGE_SIZE, "%s", dd->serial);
 }
+static DEVICE_ATTR_RO(serial);
 
-static ssize_t store_chip_reset(struct device *device,
+static ssize_t chip_reset_store(struct device *device,
 				struct device_attribute *attr, const char *buf,
 				size_t count)
 {
 	struct hfi1_ibdev *dev =
-		container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
 	struct hfi1_devdata *dd = dd_from_dev(dev);
 	int ret;
 
@@ -586,6 +592,7 @@
 bail:
 	return ret < 0 ? ret : count;
 }
+static DEVICE_ATTR_WO(chip_reset);
 
 /*
  * Convert the reported temperature from an integer (reported in
@@ -598,11 +605,11 @@
 /*
  * Dump tempsense values, in decimal, to ease shell-scripts.
  */
-static ssize_t show_tempsense(struct device *device,
+static ssize_t tempsense_show(struct device *device,
 			      struct device_attribute *attr, char *buf)
 {
 	struct hfi1_ibdev *dev =
-		container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
 	struct hfi1_devdata *dd = dd_from_dev(dev);
 	struct hfi1_temp temp;
 	int ret;
@@ -622,6 +629,7 @@
 	}
 	return ret;
 }
+static DEVICE_ATTR_RO(tempsense);
 
 /*
  * end of per-unit (or driver, in some cases, but replicated
@@ -629,24 +637,20 @@
  */
 
 /* start of per-unit file structures and support code */
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_hfi, NULL);
-static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL);
-static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL);
-static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
-static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
-static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL);
-static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset);
+static struct attribute *hfi1_attributes[] = {
+	&dev_attr_hw_rev.attr,
+	&dev_attr_board_id.attr,
+	&dev_attr_nctxts.attr,
+	&dev_attr_nfreectxts.attr,
+	&dev_attr_serial.attr,
+	&dev_attr_boardversion.attr,
+	&dev_attr_tempsense.attr,
+	&dev_attr_chip_reset.attr,
+	NULL,
+};
 
-static struct device_attribute *hfi1_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_board_id,
-	&dev_attr_nctxts,
-	&dev_attr_nfreectxts,
-	&dev_attr_serial,
-	&dev_attr_boardversion,
-	&dev_attr_tempsense,
-	&dev_attr_chip_reset,
+const struct attribute_group ib_hfi1_attr_group = {
+	.attrs = hfi1_attributes,
 };
 
 int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
@@ -832,12 +836,6 @@
 	struct device *class_dev = &dev->dev;
 	int i, j, ret;
 
-	for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) {
-		ret = device_create_file(&dev->dev, hfi1_attributes[i]);
-		if (ret)
-			goto bail;
-	}
-
 	for (i = 0; i < dd->num_sdma; i++) {
 		ret = kobject_init_and_add(&dd->per_sdma[i].kobj,
 					   &sde_ktype, &class_dev->kobj,
@@ -855,9 +853,6 @@
 
 	return 0;
 bail:
-	for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i)
-		device_remove_file(&dev->dev, hfi1_attributes[i]);
-
 	for (i = 0; i < dd->num_sdma; i++)
 		kobject_del(&dd->per_sdma[i].kobj);
 
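The sysfs.c conversion above follows the standard kernel pattern: each legacy show_*/store_* handler becomes a <name>_show()/<name>_store() callback declared with DEVICE_ATTR_RO()/DEVICE_ATTR_WO(), and the per-file device_create_file() loop is replaced by a single attribute_group. A generic sketch of that pattern, using hypothetical "widget" names that are not part of this driver:

	static ssize_t widget_count_show(struct device *dev,
					 struct device_attribute *attr, char *buf)
	{
		return scnprintf(buf, PAGE_SIZE, "%d\n", 42);
	}
	static DEVICE_ATTR_RO(widget_count);	/* defines dev_attr_widget_count, mode 0444 */

	static struct attribute *widget_attrs[] = {
		&dev_attr_widget_count.attr,
		NULL,
	};
	static const struct attribute_group widget_group = {
		.attrs = widget_attrs,
	};

The group itself is registered once with the device core (for RDMA devices this is typically done via rdma_set_device_sysfs_group() before ib_register_device(); the registration site for ib_hfi1_attr_group is outside this hunk), which is why the device_create_file()/device_remove_file() loops could simply be deleted from the setup and error paths above.
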
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
new file mode 100644
index 0000000..e53f542
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -0,0 +1,5507 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+
+#include "hfi.h"
+#include "qp.h"
+#include "rc.h"
+#include "verbs.h"
+#include "tid_rdma.h"
+#include "exp_rcv.h"
+#include "trace.h"
+
+/**
+ * DOC: TID RDMA READ protocol
+ *
+ * This is an end-to-end protocol at the hfi1 level between two nodes that
+ * improves performance by avoiding data copy on the requester side. It
+ * converts a qualified RDMA READ request into a TID RDMA READ request on
+ * the requester side and thereafter handles the request and response
+ * differently. To be qualified, the RDMA READ request should meet the
+ * following:
+ * -- The total data length should be greater than 256K;
+ * -- The total data length should be a multiple of 4K page size;
+ * -- Each local scatter-gather entry should be 4K page aligned;
+ * -- Each local scatter-gather entry should be a multiple of 4K page size;
+ */
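+
+/*
+ * Illustrative sketch only (assumed helper name, not defined in this
+ * file): the qualification rules above amount to a check of roughly the
+ * following form, where addr/sge_len describe one local scatter-gather
+ * entry and total_len is the total RDMA READ length:
+ *
+ *	static bool tid_rdma_read_qualifies(u64 addr, u32 sge_len, u32 total_len)
+ *	{
+ *		return total_len > SZ_256K &&
+ *		       !(total_len & (SZ_4K - 1)) &&
+ *		       !(addr & (SZ_4K - 1)) &&
+ *		       !(sge_len & (SZ_4K - 1));
+ *	}
+ *
+ * The real checks are spread across the send path; see, for example, the
+ * hfi1_check_sge_align() requirement noted at hfi1_kern_exp_rcv_setup()
+ * below.
+ */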
+
+#define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32)
+#define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33)
+#define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34)
+#define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35)
+#define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37)
+#define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38)
+
+/* Maximum number of packets within a flow generation. */
+#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT)
+
+#define GENERATION_MASK 0xFFFFF
+
+static u32 mask_generation(u32 a)
+{
+	return a & GENERATION_MASK;
+}
+
+/* Reserved generation value to set to unused flows for kernel contexts */
+#define KERN_GENERATION_RESERVED mask_generation(U32_MAX)
+
+/*
+ * J_KEY for kernel contexts when TID RDMA is used.
+ * See generate_jkey() in hfi.h for more information.
+ */
+#define TID_RDMA_JKEY                   32
+#define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE
+#define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1)
+
+/* Maximum number of segments in flight per QP request. */
+#define TID_RDMA_MAX_READ_SEGS_PER_REQ  6
+#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4
+#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \
+			TID_RDMA_MAX_WRITE_SEGS_PER_REQ)
+#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1)
+
+#define MAX_EXPECTED_PAGES     (MAX_EXPECTED_BUFFER / PAGE_SIZE)
+
+#define TID_RDMA_DESTQP_FLOW_SHIFT      11
+#define TID_RDMA_DESTQP_FLOW_MASK       0x1f
+
+#define TID_OPFN_QP_CTXT_MASK 0xff
+#define TID_OPFN_QP_CTXT_SHIFT 56
+#define TID_OPFN_QP_KDETH_MASK 0xff
+#define TID_OPFN_QP_KDETH_SHIFT 48
+#define TID_OPFN_MAX_LEN_MASK 0x7ff
+#define TID_OPFN_MAX_LEN_SHIFT 37
+#define TID_OPFN_TIMEOUT_MASK 0x1f
+#define TID_OPFN_TIMEOUT_SHIFT 32
+#define TID_OPFN_RESERVED_MASK 0x3f
+#define TID_OPFN_RESERVED_SHIFT 26
+#define TID_OPFN_URG_MASK 0x1
+#define TID_OPFN_URG_SHIFT 25
+#define TID_OPFN_VER_MASK 0x7
+#define TID_OPFN_VER_SHIFT 22
+#define TID_OPFN_JKEY_MASK 0x3f
+#define TID_OPFN_JKEY_SHIFT 16
+#define TID_OPFN_MAX_READ_MASK 0x3f
+#define TID_OPFN_MAX_READ_SHIFT 10
+#define TID_OPFN_MAX_WRITE_MASK 0x3f
+#define TID_OPFN_MAX_WRITE_SHIFT 4
+
+/*
+ * OPFN TID layout
+ *
+ * 63               47               31               15
+ * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC
+ * 3210987654321098 7654321098765432 1098765432109876 5432109876543210
+ * N - the context Number
+ * K - the Kdeth_qp
+ * M - Max_len
+ * T - Timeout
+ * D - reserveD
+ * V - version
+ * U - Urg capable
+ * J - Jkey
+ * R - max_Read
+ * W - max_Write
+ * C - Capcode
+ */
+
+static void tid_rdma_trigger_resume(struct work_struct *work);
+static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req);
+static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
+					 gfp_t gfp);
+static void hfi1_init_trdma_req(struct rvt_qp *qp,
+				struct tid_rdma_request *req);
+static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx);
+static void hfi1_tid_timeout(struct timer_list *t);
+static void hfi1_add_tid_reap_timer(struct rvt_qp *qp);
+static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp);
+static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp);
+static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp);
+static void hfi1_tid_retry_timeout(struct timer_list *t);
+static int make_tid_rdma_ack(struct rvt_qp *qp,
+			     struct ib_other_headers *ohdr,
+			     struct hfi1_pkt_state *ps);
+static void hfi1_do_tid_send(struct rvt_qp *qp);
+static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx);
+static void tid_rdma_rcv_err(struct hfi1_packet *packet,
+			     struct ib_other_headers *ohdr,
+			     struct rvt_qp *qp, u32 psn, int diff, bool fecn);
+static void update_r_next_psn_fecn(struct hfi1_packet *packet,
+				   struct hfi1_qp_priv *priv,
+				   struct hfi1_ctxtdata *rcd,
+				   struct tid_rdma_flow *flow,
+				   bool fecn);
+
+static void validate_r_tid_ack(struct hfi1_qp_priv *priv)
+{
+	if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
+		priv->r_tid_ack = priv->r_tid_tail;
+}
+
+static void tid_rdma_schedule_ack(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	priv->s_flags |= RVT_S_ACK_PENDING;
+	hfi1_schedule_tid_send(qp);
+}
+
+static void tid_rdma_trigger_ack(struct rvt_qp *qp)
+{
+	validate_r_tid_ack(qp->priv);
+	tid_rdma_schedule_ack(qp);
+}
+
+static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
+{
+	return
+		(((u64)p->qp & TID_OPFN_QP_CTXT_MASK) <<
+			TID_OPFN_QP_CTXT_SHIFT) |
+		((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) <<
+			TID_OPFN_QP_KDETH_SHIFT) |
+		(((u64)((p->max_len >> PAGE_SHIFT) - 1) &
+			TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) |
+		(((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) <<
+			TID_OPFN_TIMEOUT_SHIFT) |
+		(((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) |
+		(((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) |
+		(((u64)p->max_read & TID_OPFN_MAX_READ_MASK) <<
+			TID_OPFN_MAX_READ_SHIFT) |
+		(((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) <<
+			TID_OPFN_MAX_WRITE_SHIFT);
+}
+
+static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data)
+{
+	p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) &
+		TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT;
+	p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK;
+	p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) &
+		TID_OPFN_MAX_WRITE_MASK;
+	p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) &
+		TID_OPFN_MAX_READ_MASK;
+	p->qp =
+		((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK)
+			<< 16) |
+		((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK));
+	p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK;
+	p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK;
+}
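+
+/*
+ * Worked example (illustrative): for parameters built by
+ * tid_rdma_opfn_init() below, p->qp = (kdeth_qp << 16) | ctxt, so the
+ * encode above places the context number (low byte of p->qp) in bits
+ * 63:56 of the OPFN value and the KDETH QP byte (bits 23:16 of p->qp)
+ * in bits 55:48, matching the layout diagram. Decode reverses this, so
+ * encode/decode round-trip for such parameters provided max_len is a
+ * page multiple that fits the 11-bit MAX_LEN field.
+ */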
+
+void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	p->qp = (kdeth_qp << 16) | priv->rcd->ctxt;
+	p->max_len = TID_RDMA_MAX_SEGMENT_SIZE;
+	p->jkey = priv->rcd->jkey;
+	p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ;
+	p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ;
+	p->timeout = qp->timeout;
+	p->urg = is_urg_masked(priv->rcd);
+}
+
+bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	*data = tid_rdma_opfn_encode(&priv->tid_rdma.local);
+	return true;
+}
+
+bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct tid_rdma_params *remote, *old;
+	bool ret = true;
+
+	old = rcu_dereference_protected(priv->tid_rdma.remote,
+					lockdep_is_held(&priv->opfn.lock));
+	data &= ~0xfULL;
+	/*
+	 * If data passed in is zero, return true so as not to continue the
+	 * negotiation process
+	 */
+	if (!data || !HFI1_CAP_IS_KSET(TID_RDMA))
+		goto null;
+	/*
+	 * If kzalloc fails, return false. This will result in:
+	 * * at the requester a new OPFN request being generated to retry
+	 *   the negotiation
+	 * * at the responder, 0 being returned to the requester so as to
+	 *   disable TID RDMA at both the requester and the responder
+	 */
+	remote = kzalloc(sizeof(*remote), GFP_ATOMIC);
+	if (!remote) {
+		ret = false;
+		goto null;
+	}
+
+	tid_rdma_opfn_decode(remote, data);
+	priv->tid_timer_timeout_jiffies =
+		usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) /
+				   1000UL) << 3) * 7);
+	trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local);
+	trace_hfi1_opfn_param(qp, 1, remote);
+	rcu_assign_pointer(priv->tid_rdma.remote, remote);
+	/*
+	 * A TID RDMA READ request's segment size is not equal to
+	 * remote->max_len only when the request's data length is smaller
+	 * than remote->max_len. In that case, there will be only one segment.
+	 * Therefore, when priv->pkts_ps is used to calculate req->cur_seg
+	 * during retry, it will lead to req->cur_seg = 0, which is exactly
+	 * what is expected.
+	 */
+	priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len);
+	priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1;
+	goto free;
+null:
+	RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
+	priv->timeout_shift = 0;
+free:
+	if (old)
+		kfree_rcu(old, rcu_head);
+	return ret;
+}
+
+bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data)
+{
+	bool ret;
+
+	ret = tid_rdma_conn_reply(qp, *data);
+	*data = 0;
+	/*
+	 * If tid_rdma_conn_reply() returns an error, set *data to 0 to indicate
+	 * TID RDMA could not be enabled. This will result in TID RDMA being
+	 * disabled at the requester too.
+	 */
+	if (ret)
+		(void)tid_rdma_conn_req(qp, data);
+	return ret;
+}
+
+void tid_rdma_conn_error(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct tid_rdma_params *old;
+
+	old = rcu_dereference_protected(priv->tid_rdma.remote,
+					lockdep_is_held(&priv->opfn.lock));
+	RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
+	if (old)
+		kfree_rcu(old, rcu_head);
+}
+
+/* This is called at context initialization time */
+int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit)
+{
+	if (reinit)
+		return 0;
+
+	BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY);
+	BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY);
+	rcd->jkey = TID_RDMA_JKEY;
+	hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey);
+	return hfi1_alloc_ctxt_rcv_groups(rcd);
+}
+
+/**
+ * qp_to_rcd - determine the receive context used by a qp
+ * @qp - the qp
+ *
+ * This routine returns the receive context associated
+ * with a a qp's qpn.
+ *
+ * Returns the context.
+ */
+static struct hfi1_ctxtdata *qp_to_rcd(struct rvt_dev_info *rdi,
+				       struct rvt_qp *qp)
+{
+	struct hfi1_ibdev *verbs_dev = container_of(rdi,
+						    struct hfi1_ibdev,
+						    rdi);
+	struct hfi1_devdata *dd = container_of(verbs_dev,
+					       struct hfi1_devdata,
+					       verbs_dev);
+	unsigned int ctxt;
+
+	if (qp->ibqp.qp_num == 0)
+		ctxt = 0;
+	else
+		ctxt = hfi1_get_qp_map(dd, qp->ibqp.qp_num >> dd->qos_shift);
+	return dd->rcd[ctxt];
+}
+
+int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+		      struct ib_qp_init_attr *init_attr)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	int i, ret;
+
+	qpriv->rcd = qp_to_rcd(rdi, qp);
+
+	spin_lock_init(&qpriv->opfn.lock);
+	INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request);
+	INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume);
+	qpriv->flow_state.psn = 0;
+	qpriv->flow_state.index = RXE_NUM_TID_FLOWS;
+	qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS;
+	qpriv->flow_state.generation = KERN_GENERATION_RESERVED;
+	qpriv->s_state = TID_OP(WRITE_RESP);
+	qpriv->s_tid_cur = HFI1_QP_WQE_INVALID;
+	qpriv->s_tid_head = HFI1_QP_WQE_INVALID;
+	qpriv->s_tid_tail = HFI1_QP_WQE_INVALID;
+	qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
+	qpriv->r_tid_head = HFI1_QP_WQE_INVALID;
+	qpriv->r_tid_tail = HFI1_QP_WQE_INVALID;
+	qpriv->r_tid_ack = HFI1_QP_WQE_INVALID;
+	qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID;
+	atomic_set(&qpriv->n_requests, 0);
+	atomic_set(&qpriv->n_tid_requests, 0);
+	timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0);
+	timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0);
+	INIT_LIST_HEAD(&qpriv->tid_wait);
+
+	if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+		struct hfi1_devdata *dd = qpriv->rcd->dd;
+
+		qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES *
+						sizeof(*qpriv->pages),
+					    GFP_KERNEL, dd->node);
+		if (!qpriv->pages)
+			return -ENOMEM;
+		for (i = 0; i < qp->s_size; i++) {
+			struct hfi1_swqe_priv *priv;
+			struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
+
+			priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
+					    dd->node);
+			if (!priv)
+				return -ENOMEM;
+
+			hfi1_init_trdma_req(qp, &priv->tid_req);
+			priv->tid_req.e.swqe = wqe;
+			wqe->priv = priv;
+		}
+		for (i = 0; i < rvt_max_atomic(rdi); i++) {
+			struct hfi1_ack_priv *priv;
+
+			priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
+					    dd->node);
+			if (!priv)
+				return -ENOMEM;
+
+			hfi1_init_trdma_req(qp, &priv->tid_req);
+			priv->tid_req.e.ack = &qp->s_ack_queue[i];
+
+			ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req,
+							    GFP_KERNEL);
+			if (ret) {
+				kfree(priv);
+				return ret;
+			}
+			qp->s_ack_queue[i].priv = priv;
+		}
+	}
+
+	return 0;
+}
+
+void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct rvt_swqe *wqe;
+	u32 i;
+
+	if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+		for (i = 0; i < qp->s_size; i++) {
+			wqe = rvt_get_swqe_ptr(qp, i);
+			kfree(wqe->priv);
+			wqe->priv = NULL;
+		}
+		for (i = 0; i < rvt_max_atomic(rdi); i++) {
+			struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv;
+
+			if (priv)
+				hfi1_kern_exp_rcv_free_flows(&priv->tid_req);
+			kfree(priv);
+			qp->s_ack_queue[i].priv = NULL;
+		}
+		cancel_work_sync(&qpriv->opfn.opfn_work);
+		kfree(qpriv->pages);
+		qpriv->pages = NULL;
+	}
+}
+
+/* Flow and tid waiter functions */
+/**
+ * DOC: lock ordering
+ *
+ * There are two locks involved with the queuing
+ * routines: the qp s_lock and the exp_lock.
+ *
+ * Since the tid space allocation is called from
+ * the send engine, the qp s_lock is already held.
+ *
+ * The allocation routines will get the exp_lock.
+ *
+ * The first_qp() call is provided to allow the head of
+ * the rcd wait queue to be fetched under the exp_lock and
+ * followed by a drop of the exp_lock.
+ *
+ * Any qp in the wait list will have the qp reference count held
+ * to hold the qp in memory.
+ */
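+
+/*
+ * Illustrative sketch of the queuing pattern described above (the caller
+ * already holds qp->s_lock; "allocate resources" stands in for the real
+ * flow/TID allocation done by the routines below):
+ *
+ *	spin_lock_irqsave(&rcd->exp_lock, flags);
+ *	if (kernel_tid_waiters(rcd, queue, qp) || <allocate resources fails>) {
+ *		queue_qp_for_tid_wait(rcd, queue, qp);
+ *		spin_unlock_irqrestore(&rcd->exp_lock, flags);
+ *		return -EAGAIN;
+ *	}
+ *	dequeue_tid_waiter(rcd, queue, qp);
+ *	fqp = first_qp(rcd, queue);		(takes a qp reference)
+ *	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+ *	tid_rdma_schedule_tid_wakeup(fqp);	(reference dropped once handled)
+ */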
+
+/*
+ * return head of rcd wait list
+ *
+ * Must hold the exp_lock.
+ *
+ * Get a reference to the QP to hold the QP in memory.
+ *
+ * The caller must release the reference when the local
+ * pointer is no longer being used.
+ */
+static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd,
+			       struct tid_queue *queue)
+	__must_hold(&rcd->exp_lock)
+{
+	struct hfi1_qp_priv *priv;
+
+	lockdep_assert_held(&rcd->exp_lock);
+	priv = list_first_entry_or_null(&queue->queue_head,
+					struct hfi1_qp_priv,
+					tid_wait);
+	if (!priv)
+		return NULL;
+	rvt_get_qp(priv->owner);
+	return priv->owner;
+}
+
+/**
+ * kernel_tid_waiters - determine rcd wait
+ * @rcd: the receive context
+ * @qp: the head of the qp being processed
+ *
+ * This routine will return false IFF
+ * the list is empty or the head of the
+ * list is the indicated qp.
+ *
+ * Must hold the qp s_lock and the exp_lock.
+ *
+ * Return:
+ * false if either of the conditions below are satisfied:
+ * 1. The list is empty or
+ * 2. The indicated qp is at the head of the list and the
+ *    HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags.
+ * true is returned otherwise.
+ */
+static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd,
+			       struct tid_queue *queue, struct rvt_qp *qp)
+	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
+{
+	struct rvt_qp *fqp;
+	bool ret = true;
+
+	lockdep_assert_held(&qp->s_lock);
+	lockdep_assert_held(&rcd->exp_lock);
+	fqp = first_qp(rcd, queue);
+	if (!fqp || (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE)))
+		ret = false;
+	rvt_put_qp(fqp);
+	return ret;
+}
+
+/**
+ * dequeue_tid_waiter - dequeue the qp from the list
+ * @qp - the qp to remove from the wait list
+ *
+ * This routine removes the indicated qp from the
+ * wait list if it is there.
+ *
+ * This should be done after the hardware flow and
+ * tid array resources have been allocated.
+ *
+ * Must hold the qp s_lock and the rcd exp_lock.
+ *
+ * It assumes the s_lock to protect the s_flags
+ * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag.
+ */
+static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd,
+			       struct tid_queue *queue, struct rvt_qp *qp)
+	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	lockdep_assert_held(&qp->s_lock);
+	lockdep_assert_held(&rcd->exp_lock);
+	if (list_empty(&priv->tid_wait))
+		return;
+	list_del_init(&priv->tid_wait);
+	qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
+	queue->dequeue++;
+	rvt_put_qp(qp);
+}
+
+/**
+ * queue_qp_for_tid_wait - suspend QP on tid space
+ * @rcd: the receive context
+ * @qp: the qp
+ *
+ * The qp is inserted at the tail of the rcd
+ * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set.
+ *
+ * Must hold the qp s_lock and the exp_lock.
+ */
+static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd,
+				  struct tid_queue *queue, struct rvt_qp *qp)
+	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	lockdep_assert_held(&qp->s_lock);
+	lockdep_assert_held(&rcd->exp_lock);
+	if (list_empty(&priv->tid_wait)) {
+		qp->s_flags |= HFI1_S_WAIT_TID_SPACE;
+		list_add_tail(&priv->tid_wait, &queue->queue_head);
+		priv->tid_enqueue = ++queue->enqueue;
+		rcd->dd->verbs_dev.n_tidwait++;
+		trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE);
+		rvt_get_qp(qp);
+	}
+}
+
+/**
+ * __trigger_tid_waiter - trigger tid waiter
+ * @qp: the qp
+ *
+ * This is a private entrance to schedule the qp
+ * assuming the caller is holding the qp->s_lock.
+ */
+static void __trigger_tid_waiter(struct rvt_qp *qp)
+	__must_hold(&qp->s_lock)
+{
+	lockdep_assert_held(&qp->s_lock);
+	if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE))
+		return;
+	trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE);
+	hfi1_schedule_send(qp);
+}
+
+/**
+ * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp
+ * @qp - the qp
+ *
+ * trigger a schedule for a waiting qp in a deadlock
+ * safe manner.  The qp reference is held prior
+ * to this call via first_qp().
+ *
+ * If the qp trigger was already scheduled (!rval),
+ * the reference is dropped; otherwise the resume
+ * or the destroy cancel will dispatch the reference.
+ */
+static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv;
+	struct hfi1_ibport *ibp;
+	struct hfi1_pportdata *ppd;
+	struct hfi1_devdata *dd;
+	bool rval;
+
+	if (!qp)
+		return;
+
+	priv = qp->priv;
+	ibp = to_iport(qp->ibqp.device, qp->port_num);
+	ppd = ppd_from_ibp(ibp);
+	dd = dd_from_ibdev(qp->ibqp.device);
+
+	rval = queue_work_on(priv->s_sde ?
+			     priv->s_sde->cpu :
+			     cpumask_first(cpumask_of_node(dd->node)),
+			     ppd->hfi1_wq,
+			     &priv->tid_rdma.trigger_work);
+	if (!rval)
+		rvt_put_qp(qp);
+}
+
+/**
+ * tid_rdma_trigger_resume - field a trigger work request
+ * @work - the work item
+ *
+ * Complete the off qp trigger processing by directly
+ * calling the progress routine.
+ */
+static void tid_rdma_trigger_resume(struct work_struct *work)
+{
+	struct tid_rdma_qp_params *tr;
+	struct hfi1_qp_priv *priv;
+	struct rvt_qp *qp;
+
+	tr = container_of(work, struct tid_rdma_qp_params, trigger_work);
+	priv = container_of(tr, struct hfi1_qp_priv, tid_rdma);
+	qp = priv->owner;
+	spin_lock_irq(&qp->s_lock);
+	if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) {
+		spin_unlock_irq(&qp->s_lock);
+		hfi1_do_send(priv->owner, true);
+	} else {
+		spin_unlock_irq(&qp->s_lock);
+	}
+	rvt_put_qp(qp);
+}
+
+/**
+ * tid_rdma_flush_wait - unwind any tid space wait
+ *
+ * This is called when resetting a qp to
+ * allow a destroy or reset to get rid
+ * of any tid space linkage and reference counts.
+ */
+static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue)
+	__must_hold(&qp->s_lock)
+{
+	struct hfi1_qp_priv *priv;
+
+	if (!qp)
+		return;
+	lockdep_assert_held(&qp->s_lock);
+	priv = qp->priv;
+	qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
+	spin_lock(&priv->rcd->exp_lock);
+	if (!list_empty(&priv->tid_wait)) {
+		list_del_init(&priv->tid_wait);
+		qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
+		queue->dequeue++;
+		rvt_put_qp(qp);
+	}
+	spin_unlock(&priv->rcd->exp_lock);
+}
+
+void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp)
+	__must_hold(&qp->s_lock)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	_tid_rdma_flush_wait(qp, &priv->rcd->flow_queue);
+	_tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue);
+}
+
+/* Flow functions */
+/**
+ * kern_reserve_flow - allocate a hardware flow
+ * @rcd - the context to use for allocation
+ * @last - the index of the preferred flow. Use RXE_NUM_TID_FLOWS to
+ *         signify "don't care".
+ *
+ * Use a bit mask based allocation to reserve a hardware
+ * flow for use in receiving KDETH data packets. If a preferred flow is
+ * specified the function will attempt to reserve that flow again, if
+ * available.
+ *
+ * The exp_lock must be held.
+ *
+ * Return:
+ * On success: a positive value between 0 and RXE_NUM_TID_FLOWS - 1
+ * On failure: -EAGAIN
+ */
+static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last)
+	__must_hold(&rcd->exp_lock)
+{
+	int nr;
+
+	/* Attempt to reserve the preferred flow index */
+	if (last >= 0 && last < RXE_NUM_TID_FLOWS &&
+	    !test_and_set_bit(last, &rcd->flow_mask))
+		return last;
+
+	nr = ffz(rcd->flow_mask);
+	BUILD_BUG_ON(RXE_NUM_TID_FLOWS >=
+		     (sizeof(rcd->flow_mask) * BITS_PER_BYTE));
+	if (nr > (RXE_NUM_TID_FLOWS - 1))
+		return -EAGAIN;
+	set_bit(nr, &rcd->flow_mask);
+	return nr;
+}
+
+static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation,
+			     u32 flow_idx)
+{
+	u64 reg;
+
+	reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
+		RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK |
+		RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK |
+		RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK |
+		RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK |
+		RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK;
+
+	if (generation != KERN_GENERATION_RESERVED)
+		reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK;
+
+	write_uctxt_csr(rcd->dd, rcd->ctxt,
+			RCV_TID_FLOW_TABLE + 8 * flow_idx, reg);
+}
+
+static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
+	__must_hold(&rcd->exp_lock)
+{
+	u32 generation = rcd->flows[flow_idx].generation;
+
+	kern_set_hw_flow(rcd, generation, flow_idx);
+	return generation;
+}
+
+static u32 kern_flow_generation_next(u32 gen)
+{
+	u32 generation = mask_generation(gen + 1);
+
+	if (generation == KERN_GENERATION_RESERVED)
+		generation = mask_generation(generation + 1);
+	return generation;
+}
+
+static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
+	__must_hold(&rcd->exp_lock)
+{
+	rcd->flows[flow_idx].generation =
+		kern_flow_generation_next(rcd->flows[flow_idx].generation);
+	kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx);
+}
+
+int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
+	struct tid_flow_state *fs = &qpriv->flow_state;
+	struct rvt_qp *fqp;
+	unsigned long flags;
+	int ret = 0;
+
+	/* The QP already has an allocated flow */
+	if (fs->index != RXE_NUM_TID_FLOWS)
+		return ret;
+
+	spin_lock_irqsave(&rcd->exp_lock, flags);
+	if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp))
+		goto queue;
+
+	ret = kern_reserve_flow(rcd, fs->last_index);
+	if (ret < 0)
+		goto queue;
+	fs->index = ret;
+	fs->last_index = fs->index;
+
+	/* Generation received in a RESYNC overrides default flow generation */
+	if (fs->generation != KERN_GENERATION_RESERVED)
+		rcd->flows[fs->index].generation = fs->generation;
+	fs->generation = kern_setup_hw_flow(rcd, fs->index);
+	fs->psn = 0;
+	dequeue_tid_waiter(rcd, &rcd->flow_queue, qp);
+	/* get head before dropping lock */
+	fqp = first_qp(rcd, &rcd->flow_queue);
+	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+
+	tid_rdma_schedule_tid_wakeup(fqp);
+	return 0;
+queue:
+	queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp);
+	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+	return -EAGAIN;
+}
+
+void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
+	struct tid_flow_state *fs = &qpriv->flow_state;
+	struct rvt_qp *fqp;
+	unsigned long flags;
+
+	if (fs->index >= RXE_NUM_TID_FLOWS)
+		return;
+	spin_lock_irqsave(&rcd->exp_lock, flags);
+	kern_clear_hw_flow(rcd, fs->index);
+	clear_bit(fs->index, &rcd->flow_mask);
+	fs->index = RXE_NUM_TID_FLOWS;
+	fs->psn = 0;
+	fs->generation = KERN_GENERATION_RESERVED;
+
+	/* get head before dropping lock */
+	fqp = first_qp(rcd, &rcd->flow_queue);
+	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+
+	if (fqp == qp) {
+		__trigger_tid_waiter(fqp);
+		rvt_put_qp(fqp);
+	} else {
+		tid_rdma_schedule_tid_wakeup(fqp);
+	}
+}
+
+void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd)
+{
+	int i;
+
+	for (i = 0; i < RXE_NUM_TID_FLOWS; i++) {
+		rcd->flows[i].generation = mask_generation(prandom_u32());
+		kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i);
+	}
+}
+
+/* TID allocation functions */
+static u8 trdma_pset_order(struct tid_rdma_pageset *s)
+{
+	u8 count = s->count;
+
+	return ilog2(count) + 1;
+}
+
+/**
+ * tid_rdma_find_phys_blocks_4k - get groups based on mr info
+ * @npages - number of pages
+ * @pages - pointer to an array of page structs
+ * @list - page set array to return
+ *
+ * This routine returns the number of groups associated with
+ * the current sge information.  This implementation is based
+ * on the expected receive find_phys_blocks() adjusted to
+ * use the MR information vs. the pfn.
+ *
+ * Return:
+ * the number of RcvArray entries
+ */
+static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
+					struct page **pages,
+					u32 npages,
+					struct tid_rdma_pageset *list)
+{
+	u32 pagecount, pageidx, setcount = 0, i;
+	void *vaddr, *this_vaddr;
+
+	if (!npages)
+		return 0;
+
+	/*
+	 * Look for sets of physically contiguous pages in the user buffer.
+	 * This will allow us to optimize Expected RcvArray entry usage by
+	 * using the bigger supported sizes.
+	 */
+	vaddr = page_address(pages[0]);
+	trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr);
+	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
+		this_vaddr = i < npages ? page_address(pages[i]) : NULL;
+		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0,
+					 this_vaddr);
+		/*
+		 * If the vaddr's are not sequential, pages are not physically
+		 * contiguous.
+		 */
+		if (this_vaddr != (vaddr + PAGE_SIZE)) {
+			/*
+			 * At this point we have to loop over the set of
+			 * physically contiguous pages and break them down into
+			 * sizes supported by the HW.
+			 * There are two main constraints:
+			 *     1. The max buffer size is MAX_EXPECTED_BUFFER.
+			 *        If the total set size is bigger than that
+			 *        program only a MAX_EXPECTED_BUFFER chunk.
+			 *     2. The buffer size has to be a power of two. If
+			 *        it is not, round down to the closest power of
+			 *        2 and program that size.
+			 */
+			while (pagecount) {
+				int maxpages = pagecount;
+				u32 bufsize = pagecount * PAGE_SIZE;
+
+				if (bufsize > MAX_EXPECTED_BUFFER)
+					maxpages =
+						MAX_EXPECTED_BUFFER >>
+						PAGE_SHIFT;
+				else if (!is_power_of_2(bufsize))
+					maxpages =
+						rounddown_pow_of_two(bufsize) >>
+						PAGE_SHIFT;
+
+				list[setcount].idx = pageidx;
+				list[setcount].count = maxpages;
+				trace_hfi1_tid_pageset(flow->req->qp, setcount,
+						       list[setcount].idx,
+						       list[setcount].count);
+				pagecount -= maxpages;
+				pageidx += maxpages;
+				setcount++;
+			}
+			pageidx = i;
+			pagecount = 1;
+			vaddr = this_vaddr;
+		} else {
+			vaddr += PAGE_SIZE;
+			pagecount++;
+		}
+	}
+	/* ensure we always return an even number of sets */
+	if (setcount & 1)
+		list[setcount++].count = 0;
+	return setcount;
+}
+
+/**
+ * tid_flush_pages - dump out pages into pagesets
+ * @list - list of pagesets
+ * @idx - pointer to current page index
+ * @pages - number of pages to dump
+ * @sets - current number of pagesets
+ *
+ * This routine flushes out accumulated pages.
+ *
+ * To ensure an even number of sets the
+ * code may add a filler.
+ *
+ * This can happen when pages is not
+ * a power of 2 or pages is a power of 2
+ * less than the maximum pages.
+ *
+ * Return:
+ * The new number of sets
+ */
+
+static u32 tid_flush_pages(struct tid_rdma_pageset *list,
+			   u32 *idx, u32 pages, u32 sets)
+{
+	while (pages) {
+		u32 maxpages = pages;
+
+		if (maxpages > MAX_EXPECTED_PAGES)
+			maxpages = MAX_EXPECTED_PAGES;
+		else if (!is_power_of_2(maxpages))
+			maxpages = rounddown_pow_of_two(maxpages);
+		list[sets].idx = *idx;
+		list[sets++].count = maxpages;
+		*idx += maxpages;
+		pages -= maxpages;
+	}
+	/* might need a filler */
+	if (sets & 1)
+		list[sets++].count = 0;
+	return sets;
+}
+
+/**
+ * tid_rdma_find_phys_blocks_8k - get groups based on mr info
+ * @pages - pointer to an array of page structs
+ * @npages - number of pages
+ * @list - page set array to return
+ *
+ * This routine parses an array of pages to compute pagesets
+ * in an 8k compatible way.
+ *
+ * Pages are tested two at a time: i and i + 1 for contiguous
+ * pages, and then i - 1 and i for contiguous pages.
+ *
+ * If any condition is false, any accumulated pages are flushed and
+ * v0,v1 are emitted as separate PAGE_SIZE pagesets
+ *
+ * Otherwise, the current 8k is totaled for a future flush.
+ *
+ * Return:
+ * The number of pagesets
+ * list set with the returned number of pagesets
+ *
+ */
+static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow,
+					struct page **pages,
+					u32 npages,
+					struct tid_rdma_pageset *list)
+{
+	u32 idx, sets = 0, i;
+	u32 pagecnt = 0;
+	void *v0, *v1, *vm1;
+
+	if (!npages)
+		return 0;
+	for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) {
+		/* get a new v0 */
+		v0 = page_address(pages[i]);
+		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0);
+		v1 = i + 1 < npages ?
+				page_address(pages[i + 1]) : NULL;
+		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1);
+		/* compare i, i + 1 vaddr */
+		if (v1 != (v0 + PAGE_SIZE)) {
+			/* flush out pages */
+			sets = tid_flush_pages(list, &idx, pagecnt, sets);
+			/* output v0,v1 as two pagesets */
+			list[sets].idx = idx++;
+			list[sets++].count = 1;
+			if (v1) {
+				list[sets].count = 1;
+				list[sets++].idx = idx++;
+			} else {
+				list[sets++].count = 0;
+			}
+			vm1 = NULL;
+			pagecnt = 0;
+			continue;
+		}
+		/* i,i+1 consecutive, look at i-1,i */
+		if (vm1 && v0 != (vm1 + PAGE_SIZE)) {
+			/* flush out pages */
+			sets = tid_flush_pages(list, &idx, pagecnt, sets);
+			pagecnt = 0;
+		}
+		/* pages will always be a multiple of 8k */
+		pagecnt += 2;
+		/* save i-1 */
+		vm1 = v1;
+		/* move to next pair */
+	}
+	/* dump residual pages at end */
+	sets = tid_flush_pages(list, &idx, npages - idx, sets);
+	/* by design cannot be odd sets */
+	WARN_ON(sets & 1);
+	return sets;
+}
+
+/**
+ * Find pages for one segment of a sge array represented by @ss. The function
+ * does not check the sge; the sge must have been checked for alignment with a
+ * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of
+ * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge
+ * copy maintained in @ss->sge, the original sge is not modified.
+ *
+ * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not
+ * releasing the MR reference count at the same time. Otherwise, we'll "leak"
+ * references to the MR. This difference requires that we keep track of progress
+ * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request
+ * structure.
+ */
+static u32 kern_find_pages(struct tid_rdma_flow *flow,
+			   struct page **pages,
+			   struct rvt_sge_state *ss, bool *last)
+{
+	struct tid_rdma_request *req = flow->req;
+	struct rvt_sge *sge = &ss->sge;
+	u32 length = flow->req->seg_len;
+	u32 len = PAGE_SIZE;
+	u32 i = 0;
+
+	while (length && req->isge < ss->num_sge) {
+		pages[i++] = virt_to_page(sge->vaddr);
+
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (!sge->sge_length) {
+			if (++req->isge < ss->num_sge)
+				*sge = ss->sg_list[req->isge - 1];
+		} else if (sge->length == 0 && sge->mr->lkey) {
+			if (++sge->n >= RVT_SEGSZ) {
+				++sge->m;
+				sge->n = 0;
+			}
+			sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		length -= len;
+	}
+
+	flow->length = flow->req->seg_len - length;
+	*last = req->isge == ss->num_sge ? false : true;
+	return i;
+}
+
+static void dma_unmap_flow(struct tid_rdma_flow *flow)
+{
+	struct hfi1_devdata *dd;
+	int i;
+	struct tid_rdma_pageset *pset;
+
+	dd = flow->req->rcd->dd;
+	for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
+			i++, pset++) {
+		if (pset->count && pset->addr) {
+			dma_unmap_page(&dd->pcidev->dev,
+				       pset->addr,
+				       PAGE_SIZE * pset->count,
+				       DMA_FROM_DEVICE);
+			pset->mapped = 0;
+		}
+	}
+}
+
+static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages)
+{
+	int i;
+	struct hfi1_devdata *dd = flow->req->rcd->dd;
+	struct tid_rdma_pageset *pset;
+
+	for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
+			i++, pset++) {
+		if (pset->count) {
+			pset->addr = dma_map_page(&dd->pcidev->dev,
+						  pages[pset->idx],
+						  0,
+						  PAGE_SIZE * pset->count,
+						  DMA_FROM_DEVICE);
+
+			if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) {
+				dma_unmap_flow(flow);
+				return -ENOMEM;
+			}
+			pset->mapped = 1;
+		}
+	}
+	return 0;
+}
+
+static inline bool dma_mapped(struct tid_rdma_flow *flow)
+{
+	return !!flow->pagesets[0].mapped;
+}
+
+/*
+ * Get page pointers and identify contiguous physical memory chunks for a
+ * segment. All segments are of length flow->req->seg_len.
+ */
+static int kern_get_phys_blocks(struct tid_rdma_flow *flow,
+				struct page **pages,
+				struct rvt_sge_state *ss, bool *last)
+{
+	u8 npages;
+
+	/* Reuse previously computed pagesets, if any */
+	if (flow->npagesets) {
+		trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head,
+					  flow);
+		if (!dma_mapped(flow))
+			return dma_map_flow(flow, pages);
+		return 0;
+	}
+
+	npages = kern_find_pages(flow, pages, ss, last);
+
+	if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096))
+		flow->npagesets =
+			tid_rdma_find_phys_blocks_4k(flow, pages, npages,
+						     flow->pagesets);
+	else
+		flow->npagesets =
+			tid_rdma_find_phys_blocks_8k(flow, pages, npages,
+						     flow->pagesets);
+
+	return dma_map_flow(flow, pages);
+}
+
+static inline void kern_add_tid_node(struct tid_rdma_flow *flow,
+				     struct hfi1_ctxtdata *rcd, char *s,
+				     struct tid_group *grp, u8 cnt)
+{
+	struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++];
+
+	WARN_ON_ONCE(flow->tnode_cnt >=
+		     (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT));
+	if (WARN_ON_ONCE(cnt & 1))
+		dd_dev_err(rcd->dd,
+			   "unexpected odd allocation cnt %u map 0x%x used %u",
+			   cnt, grp->map, grp->used);
+
+	node->grp = grp;
+	node->map = grp->map;
+	node->cnt = cnt;
+	trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1,
+				grp->base, grp->map, grp->used, cnt);
+}
+
+/*
+ * Try to allocate pageset_count TID's from TID groups for a context
+ *
+ * This function allocates TID's without moving groups between lists or
+ * modifying grp->map. This is done as follows, being cognizant of the lists
+ * between which the TID groups will move:
+ * 1. First allocate complete groups of 8 TID's since this is more efficient,
+ *    these groups will move from group->full without affecting used
+ * 2. If more TID's are needed allocate from used (will move from used->full or
+ *    stay in used)
+ * 3. If we still don't have the required number of TID's go back and look again
+ *    at a complete group (will move from group->used)
+ */
+static int kern_alloc_tids(struct tid_rdma_flow *flow)
+{
+	struct hfi1_ctxtdata *rcd = flow->req->rcd;
+	struct hfi1_devdata *dd = rcd->dd;
+	u32 ngroups, pageidx = 0;
+	struct tid_group *group = NULL, *used;
+	u8 use;
+
+	flow->tnode_cnt = 0;
+	ngroups = flow->npagesets / dd->rcv_entries.group_size;
+	if (!ngroups)
+		goto used_list;
+
+	/* First look at complete groups */
+	list_for_each_entry(group,  &rcd->tid_group_list.list, list) {
+		kern_add_tid_node(flow, rcd, "complete groups", group,
+				  group->size);
+
+		pageidx += group->size;
+		if (!--ngroups)
+			break;
+	}
+
+	if (pageidx >= flow->npagesets)
+		goto ok;
+
+used_list:
+	/* Now look at partially used groups */
+	list_for_each_entry(used, &rcd->tid_used_list.list, list) {
+		use = min_t(u32, flow->npagesets - pageidx,
+			    used->size - used->used);
+		kern_add_tid_node(flow, rcd, "used groups", used, use);
+
+		pageidx += use;
+		if (pageidx >= flow->npagesets)
+			goto ok;
+	}
+
+	/*
+	 * Look again at a complete group, continuing from where we left off.
+	 * However, if we are at the head, we have reached the end of the
+	 * complete groups list from the first loop above
+	 */
+	if (group && &group->list == &rcd->tid_group_list.list)
+		goto bail_eagain;
+	group = list_prepare_entry(group, &rcd->tid_group_list.list,
+				   list);
+	if (list_is_last(&group->list, &rcd->tid_group_list.list))
+		goto bail_eagain;
+	group = list_next_entry(group, list);
+	use = min_t(u32, flow->npagesets - pageidx, group->size);
+	kern_add_tid_node(flow, rcd, "complete continue", group, use);
+	pageidx += use;
+	if (pageidx >= flow->npagesets)
+		goto ok;
+bail_eagain:
+	trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ",
+				  (u64)flow->npagesets);
+	return -EAGAIN;
+ok:
+	return 0;
+}
+
+static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num,
+				   u32 *pset_idx)
+{
+	struct hfi1_ctxtdata *rcd = flow->req->rcd;
+	struct hfi1_devdata *dd = rcd->dd;
+	struct kern_tid_node *node = &flow->tnode[grp_num];
+	struct tid_group *grp = node->grp;
+	struct tid_rdma_pageset *pset;
+	u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT;
+	u32 rcventry, npages = 0, pair = 0, tidctrl;
+	u8 i, cnt = 0;
+
+	for (i = 0; i < grp->size; i++) {
+		rcventry = grp->base + i;
+
+		if (node->map & BIT(i) || cnt >= node->cnt) {
+			rcv_array_wc_fill(dd, rcventry);
+			continue;
+		}
+		pset = &flow->pagesets[(*pset_idx)++];
+		if (pset->count) {
+			hfi1_put_tid(dd, rcventry, PT_EXPECTED,
+				     pset->addr, trdma_pset_order(pset));
+		} else {
+			hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
+		}
+		npages += pset->count;
+
+		rcventry -= rcd->expected_base;
+		tidctrl = pair ? 0x3 : rcventry & 0x1 ? 0x2 : 0x1;
+		/*
+		 * A single TID entry will be used to cover an RcvArray pair
+		 * (with tidctrl 0x3) if ALL of these are true: (a) the bit
+		 * position is even, (b) the group map shows the current and
+		 * next bits as free, indicating two consecutive RcvArray
+		 * entries are available, and (c) we actually need 2 more
+		 * entries
+		 */
+		pair = !(i & 0x1) && !((node->map >> i) & 0x3) &&
+			node->cnt >= cnt + 2;
+		if (!pair) {
+			if (!pset->count)
+				tidctrl = 0x1;
+			flow->tid_entry[flow->tidcnt++] =
+				EXP_TID_SET(IDX, rcventry >> 1) |
+				EXP_TID_SET(CTRL, tidctrl) |
+				EXP_TID_SET(LEN, npages);
+			trace_hfi1_tid_entry_alloc(/* entry */
+			   flow->req->qp, flow->tidcnt - 1,
+			   flow->tid_entry[flow->tidcnt - 1]);
+
+			/* Efficient DIV_ROUND_UP(npages, pmtu_pg) */
+			flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg);
+			npages = 0;
+		}
+
+		if (grp->used == grp->size - 1)
+			tid_group_move(grp, &rcd->tid_used_list,
+				       &rcd->tid_full_list);
+		else if (!grp->used)
+			tid_group_move(grp, &rcd->tid_group_list,
+				       &rcd->tid_used_list);
+
+		grp->used++;
+		grp->map |= BIT(i);
+		cnt++;
+	}
+}
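+
+/*
+ * Worked example for the pairing logic above (illustrative): suppose
+ * expected_base-relative entries 4 and 5 are both free and at least two
+ * more entries are still needed. The even pass (entry 4) sets pair and
+ * emits nothing; the odd pass (entry 5) then writes a single TID entry
+ * with CTRL = 0x3 and IDX = 2 (rcventry >> 1) whose LEN covers the pages
+ * programmed into both RcvArray entries. Unpaired entries are written
+ * individually with CTRL = 0x1 (even) or 0x2 (odd).
+ */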
+
+static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num)
+{
+	struct hfi1_ctxtdata *rcd = flow->req->rcd;
+	struct hfi1_devdata *dd = rcd->dd;
+	struct kern_tid_node *node = &flow->tnode[grp_num];
+	struct tid_group *grp = node->grp;
+	u32 rcventry;
+	u8 i, cnt = 0;
+
+	for (i = 0; i < grp->size; i++) {
+		rcventry = grp->base + i;
+
+		if (node->map & BIT(i) || cnt >= node->cnt) {
+			rcv_array_wc_fill(dd, rcventry);
+			continue;
+		}
+
+		hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
+
+		grp->used--;
+		grp->map &= ~BIT(i);
+		cnt++;
+
+		if (grp->used == grp->size - 1)
+			tid_group_move(grp, &rcd->tid_full_list,
+				       &rcd->tid_used_list);
+		else if (!grp->used)
+			tid_group_move(grp, &rcd->tid_used_list,
+				       &rcd->tid_group_list);
+	}
+	if (WARN_ON_ONCE(cnt & 1)) {
+		struct hfi1_ctxtdata *rcd = flow->req->rcd;
+		struct hfi1_devdata *dd = rcd->dd;
+
+		dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u",
+			   cnt, grp->map, grp->used);
+	}
+}
+
+static void kern_program_rcvarray(struct tid_rdma_flow *flow)
+{
+	u32 pset_idx = 0;
+	int i;
+
+	flow->npkts = 0;
+	flow->tidcnt = 0;
+	for (i = 0; i < flow->tnode_cnt; i++)
+		kern_program_rcv_group(flow, i, &pset_idx);
+	trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow);
+}
+
+/**
+ * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a
+ * TID RDMA request
+ *
+ * @req: TID RDMA request for which the segment/flow is being set up
+ * @ss: sge state, maintains state across successive segments of a sge
+ * @last: set to true after the last sge segment has been processed
+ *
+ * This function
+ * (1) finds a free flow entry in the flow circular buffer
+ * (2) finds pages and contiguous physical chunks constituting one segment
+ *     of an sge
+ * (3) allocates TID group entries for those chunks
+ * (4) programs rcvarray entries in the hardware corresponding to those
+ *     TID's
+ * (5) computes a tidarray with formatted TID entries which can be sent
+ *     to the sender
+ * (6) Reserves and programs HW flows.
+ * (7) It also manages queueing the QP when TID/flow resources are not
+ *     available.
+ *
+ * @req points to struct tid_rdma_request of which the segments are a part. The
+ * function uses qp, rcd and seg_len members of @req. In the absence of errors,
+ * req->flow_idx is the index of the flow which has been prepared in this
+ * invocation of function call. With flow = &req->flows[req->flow_idx],
+ * flow->tid_entry contains the TID array which the sender can use for TID RDMA
+ * sends and flow->npkts contains number of packets required to send the
+ * segment.
+ *
+ * hfi1_check_sge_align should be called prior to calling this function and if
+ * it signals error TID RDMA cannot be used for this sge and this function
+ * should not be called.
+ *
+ * For the queuing, caller must hold the flow->req->qp s_lock from the send
+ * engine and the function will procure the exp_lock.
+ *
+ * Return:
+ * The function returns -EAGAIN if sufficient number of TID/flow resources to
+ * map the segment could not be allocated. In this case the function should be
+ * called again with previous arguments to retry the TID allocation. There are
+ * no other error returns. The function returns 0 on success.
+ */
+int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
+			    struct rvt_sge_state *ss, bool *last)
+	__must_hold(&req->qp->s_lock)
+{
+	struct tid_rdma_flow *flow = &req->flows[req->setup_head];
+	struct hfi1_ctxtdata *rcd = req->rcd;
+	struct hfi1_qp_priv *qpriv = req->qp->priv;
+	unsigned long flags;
+	struct rvt_qp *fqp;
+	u16 clear_tail = req->clear_tail;
+
+	lockdep_assert_held(&req->qp->s_lock);
+	/*
+	 * We return error if either (a) we don't have space in the flow
+	 * circular buffer, or (b) we already have max entries in the buffer.
+	 * Max entries depend on the type of request we are processing and the
+	 * negotiated TID RDMA parameters.
+	 */
+	if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) ||
+	    CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >=
+	    req->n_flows)
+		return -EINVAL;
+
+	/*
+	 * Get pages and identify contiguous physical memory chunks for the
+	 * segment. If we cannot determine a DMA address mapping, we will
+	 * treat it just as if we ran out of space above.
+	 */
+	if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) {
+		hfi1_wait_kmem(flow->req->qp);
+		return -ENOMEM;
+	}
+
+	spin_lock_irqsave(&rcd->exp_lock, flags);
+	if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp))
+		goto queue;
+
+	/*
+	 * At this point we know the number of pagesets and hence the number of
+	 * TID's to map the segment. Allocate the TID's from the TID groups. If
+	 * we cannot allocate the required number we exit and try again later
+	 */
+	if (kern_alloc_tids(flow))
+		goto queue;
+	/*
+	 * Finally program the TID entries with the pagesets, compute the
+	 * tidarray and enable the HW flow
+	 */
+	kern_program_rcvarray(flow);
+
+	/*
+	 * Setup the flow state with relevant information.
+	 * This information is used for tracking the sequence of data packets
+	 * for the segment.
+	 * The flow is setup here as this is the most accurate time and place
+	 * to do so. Doing at a later time runs the risk of the flow data in
+	 * qpriv getting out of sync.
+	 */
+	memset(&flow->flow_state, 0x0, sizeof(flow->flow_state));
+	flow->idx = qpriv->flow_state.index;
+	flow->flow_state.generation = qpriv->flow_state.generation;
+	flow->flow_state.spsn = qpriv->flow_state.psn;
+	flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1;
+	flow->flow_state.r_next_psn =
+		full_flow_psn(flow, flow->flow_state.spsn);
+	qpriv->flow_state.psn += flow->npkts;
+
+	dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp);
+	/* get head before dropping lock */
+	fqp = first_qp(rcd, &rcd->rarr_queue);
+	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+	tid_rdma_schedule_tid_wakeup(fqp);
+
+	req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
+	return 0;
+queue:
+	queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp);
+	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+	return -EAGAIN;
+}
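+
+/*
+ * Illustrative caller-side usage (the real caller is the TID RDMA send
+ * path; the sequence below is a sketch, not driver code):
+ *
+ *	ret = hfi1_kern_exp_rcv_setup(req, &ss, &last);	(qp->s_lock held)
+ *	if (ret == -EAGAIN)
+ *		return;		(QP queued on TID/flow space; the wakeup
+ *				 path reschedules the send engine, which
+ *				 retries with the same arguments)
+ *	... build and send the segment using req->flows[...].tid_entry
+ *	    and flow->npkts ...
+ *	hfi1_kern_exp_rcv_clear(req);	(release resources in FIFO order
+ *					 once the segment completes)
+ */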
+
+static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow)
+{
+	flow->npagesets = 0;
+}
+
+/*
+ * This function is called after one segment has been successfully sent to
+ * release the flow and TID HW/SW resources for that segment. The segments for a
+ * TID RDMA request are set up and cleared in FIFO order, which is managed using a
+ * circular buffer.
+ */
+int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req)
+	__must_hold(&req->qp->s_lock)
+{
+	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+	struct hfi1_ctxtdata *rcd = req->rcd;
+	unsigned long flags;
+	int i;
+	struct rvt_qp *fqp;
+
+	lockdep_assert_held(&req->qp->s_lock);
+	/* Exit if we have nothing in the flow circular buffer */
+	if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS))
+		return -EINVAL;
+
+	spin_lock_irqsave(&rcd->exp_lock, flags);
+
+	for (i = 0; i < flow->tnode_cnt; i++)
+		kern_unprogram_rcv_group(flow, i);
+	/* To prevent double unprogramming */
+	flow->tnode_cnt = 0;
+	/* get head before dropping lock */
+	fqp = first_qp(rcd, &rcd->rarr_queue);
+	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+
+	dma_unmap_flow(flow);
+
+	hfi1_tid_rdma_reset_flow(flow);
+	req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1);
+
+	if (fqp == req->qp) {
+		__trigger_tid_waiter(fqp);
+		rvt_put_qp(fqp);
+	} else {
+		tid_rdma_schedule_tid_wakeup(fqp);
+	}
+
+	return 0;
+}
+
+/*
+ * This function is called to release all the tid entries for
+ * a request.
+ */
+void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req)
+	__must_hold(&req->qp->s_lock)
+{
+	/* Use memory barrier for proper ordering */
+	while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) {
+		if (hfi1_kern_exp_rcv_clear(req))
+			break;
+	}
+}
+
+/**
+ * hfi1_kern_exp_rcv_free_flows - free previously allocated flow information
+ * @req: the tid rdma request to be cleaned
+ */
+static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req)
+{
+	kfree(req->flows);
+	req->flows = NULL;
+}
+
+/**
+ * __trdma_clean_swqe - clean up for large sized QPs
+ * @qp: the queue pair
+ * @wqe: the send wqe
+ */
+void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+	struct hfi1_swqe_priv *p = wqe->priv;
+
+	hfi1_kern_exp_rcv_free_flows(&p->tid_req);
+}
+
+/*
+ * This can be called at QP create time or in the data path.
+ */
+static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
+					 gfp_t gfp)
+{
+	struct tid_rdma_flow *flows;
+	int i;
+
+	if (likely(req->flows))
+		return 0;
+	flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp,
+			     req->rcd->numa_id);
+	if (!flows)
+		return -ENOMEM;
+	/* mini init */
+	for (i = 0; i < MAX_FLOWS; i++) {
+		flows[i].req = req;
+		flows[i].npagesets = 0;
+		flows[i].pagesets[0].mapped =  0;
+		flows[i].resync_npkts = 0;
+	}
+	req->flows = flows;
+	return 0;
+}
+
+static void hfi1_init_trdma_req(struct rvt_qp *qp,
+				struct tid_rdma_request *req)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+
+	/*
+	 * Initialize various TID RDMA request variables.
+	 * These variables are "static", which is why they
+	 * can be pre-initialized here before the WRs have
+	 * even been submitted.
+	 * However, non-NULL values for these variables do not
+	 * imply that this WQE has been enabled for TID RDMA.
+	 * Drivers should check the WQE's opcode to determine
+	 * if a request is a TID RDMA one or not.
+	 */
+	req->qp = qp;
+	req->rcd = qpriv->rcd;
+}
+
+u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
+			    void *context, int vl, int mode, u64 data)
+{
+	struct hfi1_devdata *dd = context;
+
+	return dd->verbs_dev.n_tidwait;
+}
+
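+/*
+ * Walk the flows in [clear_tail, setup_head) and return the one whose IB PSN
+ * range (ib_spsn..ib_lpsn) contains @psn. If @fidx is non-NULL, also return
+ * the index of that flow. Return NULL if no flow matches.
+ */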
+static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req,
+					  u32 psn, u16 *fidx)
+{
+	u16 head, tail;
+	struct tid_rdma_flow *flow;
+
+	head = req->setup_head;
+	tail = req->clear_tail;
+	for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
+	     tail = CIRC_NEXT(tail, MAX_FLOWS)) {
+		flow = &req->flows[tail];
+		if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 &&
+		    cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) {
+			if (fidx)
+				*fidx = tail;
+			return flow;
+		}
+	}
+	return NULL;
+}
+
+/* TID RDMA READ functions */
+u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
+				    struct ib_other_headers *ohdr, u32 *bth1,
+				    u32 *bth2, u32 *len)
+{
+	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+	struct tid_rdma_flow *flow = &req->flows[req->flow_idx];
+	struct rvt_qp *qp = req->qp;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct hfi1_swqe_priv *wpriv = wqe->priv;
+	struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req;
+	struct tid_rdma_params *remote;
+	u32 req_len = 0;
+	void *req_addr = NULL;
+
+	/* This is the IB psn used to send the request */
+	*bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt);
+	trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow);
+
+	/* TID Entries for TID RDMA READ payload */
+	req_addr = &flow->tid_entry[flow->tid_idx];
+	req_len = sizeof(*flow->tid_entry) *
+			(flow->tidcnt - flow->tid_idx);
+
+	memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req));
+	wpriv->ss.sge.vaddr = req_addr;
+	wpriv->ss.sge.sge_length = req_len;
+	wpriv->ss.sge.length = wpriv->ss.sge.sge_length;
+	/*
+	 * We can safely zero these out. Since the first SGE covers the
+	 * entire packet, nothing else should even look at the MR.
+	 */
+	wpriv->ss.sge.mr = NULL;
+	wpriv->ss.sge.m = 0;
+	wpriv->ss.sge.n = 0;
+
+	wpriv->ss.sg_list = NULL;
+	wpriv->ss.total_len = wpriv->ss.sge.sge_length;
+	wpriv->ss.num_sge = 1;
+
+	/* Construct the TID RDMA READ REQ packet header */
+	rcu_read_lock();
+	remote = rcu_dereference(qpriv->tid_rdma.remote);
+
+	KDETH_RESET(rreq->kdeth0, KVER, 0x1);
+	KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey);
+	rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr +
+			   req->cur_seg * req->seg_len + flow->sent);
+	rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey);
+	rreq->reth.length = cpu_to_be32(*len);
+	rreq->tid_flow_psn =
+		cpu_to_be32((flow->flow_state.generation <<
+			     HFI1_KDETH_BTH_SEQ_SHIFT) |
+			    ((flow->flow_state.spsn + flow->pkt) &
+			     HFI1_KDETH_BTH_SEQ_MASK));
+	rreq->tid_flow_qp =
+		cpu_to_be32(qpriv->tid_rdma.local.qp |
+			    ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
+			     TID_RDMA_DESTQP_FLOW_SHIFT) |
+			    qpriv->rcd->ctxt);
+	rreq->verbs_qp = cpu_to_be32(qp->remote_qpn);
+	*bth1 &= ~RVT_QPN_MASK;
+	*bth1 |= remote->qp;
+	*bth2 |= IB_BTH_REQ_ACK;
+	rcu_read_unlock();
+
+	/* We are done with this segment */
+	flow->sent += *len;
+	req->cur_seg++;
+	qp->s_state = TID_OP(READ_REQ);
+	req->ack_pending++;
+	req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1);
+	qpriv->pending_tid_r_segs++;
+	qp->s_num_rd_atomic++;
+
+	/* Set the TID RDMA READ request payload size */
+	*len = req_len;
+
+	return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32);
+}
+
+/*
+ * @len: contains the data length to read upon entry and the read request
+ *       payload length upon exit.
+ */
+u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+				 struct ib_other_headers *ohdr, u32 *bth1,
+				 u32 *bth2, u32 *len)
+	__must_hold(&qp->s_lock)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+	struct tid_rdma_flow *flow = NULL;
+	u32 hdwords = 0;
+	bool last;
+	bool retry = true;
+	u32 npkts = rvt_div_round_up_mtu(qp, *len);
+
+	trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn,
+					  wqe->lpsn, req);
+	/*
+	 * Check sync conditions. Make sure that there are no pending
+	 * segments before freeing the flow.
+	 */
+sync_check:
+	if (req->state == TID_REQUEST_SYNC) {
+		if (qpriv->pending_tid_r_segs)
+			goto done;
+
+		hfi1_kern_clear_hw_flow(req->rcd, qp);
+		qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+		req->state = TID_REQUEST_ACTIVE;
+	}
+
+	/*
+	 * If the request for this segment is resent, the tid resources should
+	 * have been allocated before. In this case, req->flow_idx should
+	 * fall behind req->setup_head.
+	 */
+	if (req->flow_idx == req->setup_head) {
+		retry = false;
+		if (req->state == TID_REQUEST_RESEND) {
+			/*
+			 * This is the first new segment for a request whose
+			 * earlier segments have been re-sent. We need to
+			 * set up the sge pointer correctly.
+			 */
+			restart_sge(&qp->s_sge, wqe, req->s_next_psn,
+				    qp->pmtu);
+			req->isge = 0;
+			req->state = TID_REQUEST_ACTIVE;
+		}
+
+		/*
+		 * Check sync. The last PSN of each generation is reserved for
+		 * RESYNC.
+		 */
+		if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) {
+			req->state = TID_REQUEST_SYNC;
+			goto sync_check;
+		}
+
+		/* Allocate the flow if not yet */
+		if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp))
+			goto done;
+
+		/*
+		 * The following call will advance req->setup_head after
+		 * allocating the tid entries.
+		 */
+		if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) {
+			req->state = TID_REQUEST_QUEUED;
+
+			/*
+			 * We don't have resources for this segment. The QP has
+			 * already been queued.
+			 */
+			goto done;
+		}
+	}
+
+	/* req->flow_idx should only be one slot behind req->setup_head */
+	flow = &req->flows[req->flow_idx];
+	flow->pkt = 0;
+	flow->tid_idx = 0;
+	flow->sent = 0;
+	if (!retry) {
+		/* Set the first and last IB PSN for the flow in use.*/
+		flow->flow_state.ib_spsn = req->s_next_psn;
+		flow->flow_state.ib_lpsn =
+			flow->flow_state.ib_spsn + flow->npkts - 1;
+	}
+
+	/* Calculate the next segment start psn.*/
+	req->s_next_psn += flow->npkts;
+
+	/* Build the packet header */
+	hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len);
+done:
+	return hdwords;
+}
+
+/*
+ * Validate and accept the TID RDMA READ request parameters.
+ * Return 0 if the request is accepted successfully;
+ * Return 1 otherwise.
+ */
+static int tid_rdma_rcv_read_request(struct rvt_qp *qp,
+				     struct rvt_ack_entry *e,
+				     struct hfi1_packet *packet,
+				     struct ib_other_headers *ohdr,
+				     u32 bth0, u32 psn, u64 vaddr, u32 len)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct tid_rdma_request *req;
+	struct tid_rdma_flow *flow;
+	u32 flow_psn, i, tidlen = 0, pktlen, tlen;
+
+	req = ack_to_tid_req(e);
+
+	/* Validate the payload first */
+	flow = &req->flows[req->setup_head];
+
+	/* payload length = packet length - (header length + ICRC length) */
+	pktlen = packet->tlen - (packet->hlen + 4);
+	if (pktlen > sizeof(flow->tid_entry))
+		return 1;
+	memcpy(flow->tid_entry, packet->ebuf, pktlen);
+	flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
+
+	/*
+	 * Walk the TID_ENTRY list to make sure we have enough space for a
+	 * complete segment. Also calculate the number of required packets.
+	 */
+	flow->npkts = rvt_div_round_up_mtu(qp, len);
+	for (i = 0; i < flow->tidcnt; i++) {
+		trace_hfi1_tid_entry_rcv_read_req(qp, i,
+						  flow->tid_entry[i]);
+		tlen = EXP_TID_GET(flow->tid_entry[i], LEN);
+		if (!tlen)
+			return 1;
+
+		/*
+		 * For a tid pair (tidctrl == 3), the buffer size of the pair
+		 * should be the sum of the buffer size described by each
+		 * tid entry. However, only the first entry needs to be
+		 * specified in the request (see WFR HAS Section 8.5.7.1).
+		 */
+		tidlen += tlen;
+	}
+	if (tidlen * PAGE_SIZE < len)
+		return 1;
+
+	/* Empty the flow array */
+	req->clear_tail = req->setup_head;
+	flow->pkt = 0;
+	flow->tid_idx = 0;
+	flow->tid_offset = 0;
+	flow->sent = 0;
+	flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp);
+	flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
+		    TID_RDMA_DESTQP_FLOW_MASK;
+	flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn));
+	flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
+	flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
+	flow->length = len;
+
+	flow->flow_state.lpsn = flow->flow_state.spsn +
+		flow->npkts - 1;
+	flow->flow_state.ib_spsn = psn;
+	flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1;
+
+	trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow);
+	/* Set the initial flow index to the current flow. */
+	req->flow_idx = req->setup_head;
+
+	/* advance circular buffer head */
+	req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
+
+	/*
+	 * Compute last PSN for request.
+	 */
+	e->opcode = (bth0 >> 24) & 0xff;
+	e->psn = psn;
+	e->lpsn = psn + flow->npkts - 1;
+	e->sent = 0;
+
+	req->n_flows = qpriv->tid_rdma.local.max_read;
+	req->state = TID_REQUEST_ACTIVE;
+	req->cur_seg = 0;
+	req->comp_seg = 0;
+	req->ack_seg = 0;
+	req->isge = 0;
+	req->seg_len = qpriv->tid_rdma.local.max_len;
+	req->total_len = len;
+	req->total_segs = 1;
+	req->r_flow_psn = e->psn;
+
+	trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn,
+					req);
+	return 0;
+}
+
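+/*
+ * Handle a TID RDMA READ/WRITE request that arrives with an unexpected PSN.
+ * A PSN ahead of the expected one (diff > 0) is treated as a sequence error
+ * and NAK'ed. Otherwise the packet is a duplicate: the matching ack queue
+ * entry is re-validated and, unless it is already scheduled, the send side
+ * is rewound so the response can be resent.
+ */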
+static int tid_rdma_rcv_error(struct hfi1_packet *packet,
+			      struct ib_other_headers *ohdr,
+			      struct rvt_qp *qp, u32 psn, int diff)
+{
+	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd;
+	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct rvt_ack_entry *e;
+	struct tid_rdma_request *req;
+	unsigned long flags;
+	u8 prev;
+	bool old_req;
+
+	trace_hfi1_rsp_tid_rcv_error(qp, psn);
+	trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff);
+	if (diff > 0) {
+		/* sequence error */
+		if (!qp->r_nak_state) {
+			ibp->rvp.n_rc_seqnak++;
+			qp->r_nak_state = IB_NAK_PSN_ERROR;
+			qp->r_ack_psn = qp->r_psn;
+			rc_defered_ack(rcd, qp);
+		}
+		goto done;
+	}
+
+	ibp->rvp.n_rc_dupreq++;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	e = find_prev_entry(qp, psn, &prev, NULL, &old_req);
+	if (!e || (e->opcode != TID_OP(READ_REQ) &&
+		   e->opcode != TID_OP(WRITE_REQ)))
+		goto unlock;
+
+	req = ack_to_tid_req(e);
+	req->r_flow_psn = psn;
+	trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req);
+	if (e->opcode == TID_OP(READ_REQ)) {
+		struct ib_reth *reth;
+		u32 len;
+		u32 rkey;
+		u64 vaddr;
+		int ok;
+		u32 bth0;
+
+		reth = &ohdr->u.tid_rdma.r_req.reth;
+		/*
+		 * The requester always restarts from the start of the original
+		 * request.
+		 */
+		len = be32_to_cpu(reth->length);
+		if (psn != e->psn || len != req->total_len)
+			goto unlock;
+
+		release_rdma_sge_mr(e);
+
+		rkey = be32_to_cpu(reth->rkey);
+		vaddr = get_ib_reth_vaddr(reth);
+
+		qp->r_len = len;
+		ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
+				 IB_ACCESS_REMOTE_READ);
+		if (unlikely(!ok))
+			goto unlock;
+
+		/*
+		 * If all the response packets for the current request have
+		 * been sent out and this request is complete (old_req ==
+		 * false), the TID flow may be unusable (req->clear_tail has
+		 * been advanced). However, when an earlier request is
+		 * received, this request will no longer be complete
+		 * (qp->s_tail_ack_queue is moved back, see below).
+		 * Consequently, we need to update the TID flow info every
+		 * time a duplicate request is received.
+		 */
+		bth0 = be32_to_cpu(ohdr->bth[0]);
+		if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn,
+					      vaddr, len))
+			goto unlock;
+
+		/*
+		 * True if the request is already scheduled (between
+		 * qp->s_tail_ack_queue and qp->r_head_ack_queue);
+		 */
+		if (old_req)
+			goto unlock;
+	} else {
+		struct flow_state *fstate;
+		bool schedule = false;
+		u8 i;
+
+		if (req->state == TID_REQUEST_RESEND) {
+			req->state = TID_REQUEST_RESEND_ACTIVE;
+		} else if (req->state == TID_REQUEST_INIT_RESEND) {
+			req->state = TID_REQUEST_INIT;
+			schedule = true;
+		}
+
+		/*
+		 * True if the request is already scheduled (between
+		 * qp->s_tail_ack_queue and qp->r_head_ack_queue).
+		 * Also, don't change requests that are at the SYNC
+		 * point and haven't generated any responses yet.
+		 * There is nothing to retransmit for them yet.
+		 */
+		if (old_req || req->state == TID_REQUEST_INIT ||
+		    (req->state == TID_REQUEST_SYNC && !req->cur_seg)) {
+			for (i = prev + 1; ; i++) {
+				if (i > rvt_size_atomic(&dev->rdi))
+					i = 0;
+				if (i == qp->r_head_ack_queue)
+					break;
+				e = &qp->s_ack_queue[i];
+				req = ack_to_tid_req(e);
+				if (e->opcode == TID_OP(WRITE_REQ) &&
+				    req->state == TID_REQUEST_INIT)
+					req->state = TID_REQUEST_INIT_RESEND;
+			}
+			/*
+			 * If the state of the request has been changed,
+			 * the first leg needs to get scheduled in order to
+			 * pick up the change. Otherwise, normal response
+			 * processing should take care of it.
+			 */
+			if (!schedule)
+				goto unlock;
+		}
+
+		/*
+		 * If there are no more allocated segments, just schedule the qp
+		 * without changing any state.
+		 */
+		if (req->clear_tail == req->setup_head)
+			goto schedule;
+		/*
+		 * If this request has sent responses for segments, which have
+		 * not received data yet (flow_idx != clear_tail), the flow_idx
+		 * pointer needs to be adjusted so the same responses can be
+		 * re-sent.
+		 */
+		if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) {
+			fstate = &req->flows[req->clear_tail].flow_state;
+			qpriv->pending_tid_w_segs -=
+				CIRC_CNT(req->flow_idx, req->clear_tail,
+					 MAX_FLOWS);
+			req->flow_idx =
+				CIRC_ADD(req->clear_tail,
+					 delta_psn(psn, fstate->resp_ib_psn),
+					 MAX_FLOWS);
+			qpriv->pending_tid_w_segs +=
+				delta_psn(psn, fstate->resp_ib_psn);
+			/*
+			 * When flow_idx == setup_head, we've gotten a duplicate
+			 * request for a segment, which has not been allocated
+			 * yet. In that case, don't adjust this request.
+			 * However, we still want to go through the loop below
+			 * to adjust all subsequent requests.
+			 */
+			if (CIRC_CNT(req->setup_head, req->flow_idx,
+				     MAX_FLOWS)) {
+				req->cur_seg = delta_psn(psn, e->psn);
+				req->state = TID_REQUEST_RESEND_ACTIVE;
+			}
+		}
+
+		for (i = prev + 1; ; i++) {
+			/*
+			 * Look at everything up to and including
+			 * s_tail_ack_queue
+			 */
+			if (i > rvt_size_atomic(&dev->rdi))
+				i = 0;
+			if (i == qp->r_head_ack_queue)
+				break;
+			e = &qp->s_ack_queue[i];
+			req = ack_to_tid_req(e);
+			trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn,
+						   e->lpsn, req);
+			if (e->opcode != TID_OP(WRITE_REQ) ||
+			    req->cur_seg == req->comp_seg ||
+			    req->state == TID_REQUEST_INIT ||
+			    req->state == TID_REQUEST_INIT_RESEND) {
+				if (req->state == TID_REQUEST_INIT)
+					req->state = TID_REQUEST_INIT_RESEND;
+				continue;
+			}
+			qpriv->pending_tid_w_segs -=
+				CIRC_CNT(req->flow_idx,
+					 req->clear_tail,
+					 MAX_FLOWS);
+			req->flow_idx = req->clear_tail;
+			req->state = TID_REQUEST_RESEND;
+			req->cur_seg = req->comp_seg;
+		}
+		qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
+	}
+	/* Re-process old requests.*/
+	if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+		qp->s_acked_ack_queue = prev;
+	qp->s_tail_ack_queue = prev;
+	/*
+	 * Since the qp->s_tail_ack_queue is modified, the
+	 * qp->s_ack_state must be changed to re-initialize
+	 * qp->s_ack_rdma_sge; Otherwise, we will end up in
+	 * wrong memory region.
+	 */
+	qp->s_ack_state = OP(ACKNOWLEDGE);
+schedule:
+	/*
+	 * It's possible to receive a retry psn that is earlier than an RNR
+	 * NAK psn. In this case, the RNR NAK state should be cleared.
+	 */
+	if (qpriv->rnr_nak_state) {
+		qp->s_nak_state = 0;
+		qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
+		qp->r_psn = e->lpsn + 1;
+		hfi1_tid_write_alloc_resources(qp, true);
+	}
+
+	qp->r_state = e->opcode;
+	qp->r_nak_state = 0;
+	qp->s_flags |= RVT_S_RESP_PENDING;
+	hfi1_schedule_send(qp);
+unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+	return 1;
+}
+
+void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
+{
+	/* HANDLER FOR TID RDMA READ REQUEST packet (Responder side)*/
+
+	/*
+	 * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ
+	 *    (see hfi1_rc_rcv())
+	 * 2. Put TID RDMA READ REQ into the response queue (s_ack_queue)
+	 *     - Setup struct tid_rdma_req with request info
+	 *     - Initialize struct tid_rdma_flow info;
+	 *     - Copy TID entries;
+	 * 3. Set the qp->s_ack_state.
+	 * 4. Set RVT_S_RESP_PENDING in s_flags.
+	 * 5. Kick the send engine (hfi1_schedule_send())
+	 */
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+	struct rvt_qp *qp = packet->qp;
+	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	struct ib_other_headers *ohdr = packet->ohdr;
+	struct rvt_ack_entry *e;
+	unsigned long flags;
+	struct ib_reth *reth;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	u32 bth0, psn, len, rkey;
+	bool fecn;
+	u8 next;
+	u64 vaddr;
+	int diff;
+	u8 nack_state = IB_NAK_INVALID_REQUEST;
+
+	bth0 = be32_to_cpu(ohdr->bth[0]);
+	if (hfi1_ruc_check_hdr(ibp, packet))
+		return;
+
+	fecn = process_ecn(qp, packet);
+	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+	trace_hfi1_rsp_rcv_tid_read_req(qp, psn);
+
+	if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+		rvt_comm_est(qp);
+
+	if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+		goto nack_inv;
+
+	reth = &ohdr->u.tid_rdma.r_req.reth;
+	vaddr = be64_to_cpu(reth->vaddr);
+	len = be32_to_cpu(reth->length);
+	/* The length needs to be in multiples of PAGE_SIZE */
+	if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len)
+		goto nack_inv;
+
+	diff = delta_psn(psn, qp->r_psn);
+	if (unlikely(diff)) {
+		tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn);
+		return;
+	}
+
+	/* We've verified the request, insert it into the ack queue. */
+	next = qp->r_head_ack_queue + 1;
+	if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+		next = 0;
+	spin_lock_irqsave(&qp->s_lock, flags);
+	if (unlikely(next == qp->s_tail_ack_queue)) {
+		if (!qp->s_ack_queue[next].sent) {
+			nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
+			goto nack_inv_unlock;
+		}
+		update_ack_queue(qp, next);
+	}
+	e = &qp->s_ack_queue[qp->r_head_ack_queue];
+	release_rdma_sge_mr(e);
+
+	rkey = be32_to_cpu(reth->rkey);
+	qp->r_len = len;
+
+	if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
+				  rkey, IB_ACCESS_REMOTE_READ)))
+		goto nack_acc;
+
+	/* Accept the request parameters */
+	if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr,
+				      len))
+		goto nack_inv_unlock;
+
+	qp->r_state = e->opcode;
+	qp->r_nak_state = 0;
+	/*
+	 * We need to increment the MSN here instead of when we
+	 * finish sending the result since a duplicate request would
+	 * increment it more than once.
+	 */
+	qp->r_msn++;
+	qp->r_psn += e->lpsn - e->psn + 1;
+
+	qp->r_head_ack_queue = next;
+
+	/*
+	 * For all requests other than TID WRITE which are added to the ack
+	 * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to
+	 * do this because of interlocks between these and TID WRITE
+	 * requests. The same change has also been made in hfi1_rc_rcv().
+	 */
+	qpriv->r_tid_alloc = qp->r_head_ack_queue;
+
+	/* Schedule the send tasklet. */
+	qp->s_flags |= RVT_S_RESP_PENDING;
+	if (fecn)
+		qp->s_flags |= RVT_S_ECN;
+	hfi1_schedule_send(qp);
+
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return;
+
+nack_inv_unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+	rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+	qp->r_nak_state = nack_state;
+	qp->r_ack_psn = qp->r_psn;
+	/* Queue NAK for later */
+	rc_defered_ack(rcd, qp);
+	return;
+nack_acc:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
+	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+	qp->r_ack_psn = qp->r_psn;
+}
+
+u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+				  struct ib_other_headers *ohdr, u32 *bth0,
+				  u32 *bth1, u32 *bth2, u32 *len, bool *last)
+{
+	struct hfi1_ack_priv *epriv = e->priv;
+	struct tid_rdma_request *req = &epriv->tid_req;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+	u32 tidentry = flow->tid_entry[flow->tid_idx];
+	u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
+	struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp;
+	u32 next_offset, om = KDETH_OM_LARGE;
+	bool last_pkt;
+	u32 hdwords = 0;
+	struct tid_rdma_params *remote;
+
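+	/*
+	 * Send at most one pMTU of the data remaining in the current TID
+	 * entry; the segment is done once all of flow->length has been sent.
+	 */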
+	*len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
+	flow->sent += *len;
+	next_offset = flow->tid_offset + *len;
+	last_pkt = (flow->sent >= flow->length);
+
+	trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry);
+	trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow);
+
+	rcu_read_lock();
+	remote = rcu_dereference(qpriv->tid_rdma.remote);
+	if (!remote) {
+		rcu_read_unlock();
+		goto done;
+	}
+	KDETH_RESET(resp->kdeth0, KVER, 0x1);
+	KDETH_SET(resp->kdeth0, SH, !last_pkt);
+	KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg));
+	KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
+	KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
+	KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE);
+	KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om);
+	KDETH_RESET(resp->kdeth1, JKEY, remote->jkey);
+	resp->verbs_qp = cpu_to_be32(qp->remote_qpn);
+	rcu_read_unlock();
+
+	resp->aeth = rvt_compute_aeth(qp);
+	resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn +
+					       flow->pkt));
+
+	*bth0 = TID_OP(READ_RESP) << 24;
+	*bth1 = flow->tid_qpn;
+	*bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
+			  HFI1_KDETH_BTH_SEQ_MASK) |
+			 (flow->flow_state.generation <<
+			  HFI1_KDETH_BTH_SEQ_SHIFT));
+	*last = last_pkt;
+	if (last_pkt)
+		/* Advance to next flow */
+		req->clear_tail = (req->clear_tail + 1) &
+				  (MAX_FLOWS - 1);
+
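+	/* Move on to the next TID entry once the current one is exhausted */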
+	if (next_offset >= tidlen) {
+		flow->tid_offset = 0;
+		flow->tid_idx++;
+	} else {
+		flow->tid_offset = next_offset;
+	}
+
+	hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32);
+
+done:
+	return hdwords;
+}
+
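+/*
+ * Scan the send queue from s_acked up to and including s_cur for the WQE
+ * whose PSN range covers @psn; return its TID RDMA request if the WQE's
+ * opcode matches @opcode, or NULL otherwise.
+ */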
+static inline struct tid_rdma_request *
+find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode)
+	__must_hold(&qp->s_lock)
+{
+	struct rvt_swqe *wqe;
+	struct tid_rdma_request *req = NULL;
+	u32 i, end;
+
+	end = qp->s_cur + 1;
+	if (end == qp->s_size)
+		end = 0;
+	for (i = qp->s_acked; i != end;) {
+		wqe = rvt_get_swqe_ptr(qp, i);
+		if (cmp_psn(psn, wqe->psn) >= 0 &&
+		    cmp_psn(psn, wqe->lpsn) <= 0) {
+			if (wqe->wr.opcode == opcode)
+				req = wqe_to_tid_req(wqe);
+			break;
+		}
+		if (++i == qp->s_size)
+			i = 0;
+	}
+
+	return req;
+}
+
+void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
+{
+	/* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side) */
+
+	/*
+	 * 1. Find matching SWQE
+	 * 2. Check that the entire segment has been read.
+	 * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags.
+	 * 4. Free the TID flow resources.
+	 * 5. Kick the send engine (hfi1_schedule_send())
+	 */
+	struct ib_other_headers *ohdr = packet->ohdr;
+	struct rvt_qp *qp = packet->qp;
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+	struct tid_rdma_request *req;
+	struct tid_rdma_flow *flow;
+	u32 opcode, aeth;
+	bool fecn;
+	unsigned long flags;
+	u32 kpsn, ipsn;
+
+	trace_hfi1_sender_rcv_tid_read_resp(qp);
+	fecn = process_ecn(qp, packet);
+	kpsn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+	aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth);
+	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
+	req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ);
+	if (unlikely(!req))
+		goto ack_op_err;
+
+	flow = &req->flows[req->clear_tail];
+	/* When header suppression is disabled */
+	if (cmp_psn(ipsn, flow->flow_state.ib_lpsn)) {
+		update_r_next_psn_fecn(packet, priv, rcd, flow, fecn);
+
+		if (cmp_psn(kpsn, flow->flow_state.r_next_psn))
+			goto ack_done;
+		flow->flow_state.r_next_psn = mask_psn(kpsn + 1);
+		/*
+		 * Copy the payload to the destination buffer if this packet
+		 * is delivered as an eager packet due to an RSM rule and
+		 * FECN. The RSM rule selects the FECN bit in the BTH and the
+		 * SH bit in the KDETH header and therefore will not match
+		 * the last packet of each segment, which has the SH bit
+		 * cleared.
+		 */
+		if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) {
+			struct rvt_sge_state ss;
+			u32 len;
+			u32 tlen = packet->tlen;
+			u16 hdrsize = packet->hlen;
+			u8 pad = packet->pad;
+			u8 extra_bytes = pad + packet->extra_byte +
+				(SIZE_OF_CRC << 2);
+			u32 pmtu = qp->pmtu;
+
+			if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
+				goto ack_op_err;
+			len = restart_sge(&ss, req->e.swqe, ipsn, pmtu);
+			if (unlikely(len < pmtu))
+				goto ack_op_err;
+			rvt_copy_sge(qp, &ss, packet->payload, pmtu, false,
+				     false);
+			/* Raise the sw sequence check flag for next packet */
+			priv->s_flags |= HFI1_R_TID_SW_PSN;
+		}
+
+		goto ack_done;
+	}
+	flow->flow_state.r_next_psn = mask_psn(kpsn + 1);
+	req->ack_pending--;
+	priv->pending_tid_r_segs--;
+	qp->s_num_rd_atomic--;
+	if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
+	    !qp->s_num_rd_atomic) {
+		qp->s_flags &= ~(RVT_S_WAIT_FENCE |
+				 RVT_S_WAIT_ACK);
+		hfi1_schedule_send(qp);
+	}
+	if (qp->s_flags & RVT_S_WAIT_RDMAR) {
+		qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK);
+		hfi1_schedule_send(qp);
+	}
+
+	trace_hfi1_ack(qp, ipsn);
+	trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode,
+					 req->e.swqe->psn, req->e.swqe->lpsn,
+					 req);
+	trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow);
+
+	/* Release the tid resources */
+	hfi1_kern_exp_rcv_clear(req);
+
+	if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd))
+		goto ack_done;
+
+	/* If not done yet, build next read request */
+	if (++req->comp_seg >= req->total_segs) {
+		priv->tid_r_comp++;
+		req->state = TID_REQUEST_COMPLETE;
+	}
+
+	/*
+	 * Clear the hw flow under two conditions:
+	 * 1. This request is a sync point and it is complete;
+	 * 2. Current request is completed and there are no more requests.
+	 */
+	if ((req->state == TID_REQUEST_SYNC &&
+	     req->comp_seg == req->cur_seg) ||
+	    priv->tid_r_comp == priv->tid_r_reqs) {
+		hfi1_kern_clear_hw_flow(priv->rcd, qp);
+		priv->s_flags &= ~HFI1_R_TID_SW_PSN;
+		if (req->state == TID_REQUEST_SYNC)
+			req->state = TID_REQUEST_ACTIVE;
+	}
+
+	hfi1_schedule_send(qp);
+	goto ack_done;
+
+ack_op_err:
+	/*
+	 * The test indicates that the send engine has finished its cleanup
+	 * after sending the request and it's now safe to put the QP into error
+	 * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail
+	 * == qp->s_head), it would be unsafe to complete the wqe pointed by
+	 * qp->s_acked here. Putting the qp into error state will safely flush
+	 * all remaining requests.
+	 */
+	if (qp->s_last == qp->s_acked)
+		rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+
+ack_done:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp)
+	__must_hold(&qp->s_lock)
+{
+	u32 n = qp->s_acked;
+	struct rvt_swqe *wqe;
+	struct tid_rdma_request *req;
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	lockdep_assert_held(&qp->s_lock);
+	/* Free any TID entries */
+	while (n != qp->s_tail) {
+		wqe = rvt_get_swqe_ptr(qp, n);
+		if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+			req = wqe_to_tid_req(wqe);
+			hfi1_kern_exp_rcv_clear_all(req);
+		}
+
+		if (++n == qp->s_size)
+			n = 0;
+	}
+	/* Free flow */
+	hfi1_kern_clear_hw_flow(priv->rcd, qp);
+}
+
+static bool tid_rdma_tid_err(struct hfi1_packet *packet, u8 rcv_type)
+{
+	struct rvt_qp *qp = packet->qp;
+
+	if (rcv_type >= RHF_RCV_TYPE_IB)
+		goto done;
+
+	spin_lock(&qp->s_lock);
+
+	/*
+	 * We've run out of space in the eager buffer.
+	 * Eagerly received KDETH packets which require space in the
+	 * eager buffer (packets that have a payload) are TID RDMA WRITE
+	 * response packets. In this case, we have to re-transmit the
+	 * TID RDMA WRITE request.
+	 */
+	if (rcv_type == RHF_RCV_TYPE_EAGER) {
+		hfi1_restart_rc(qp, qp->s_last_psn + 1, 1);
+		hfi1_schedule_send(qp);
+	}
+
+	/* Since no payload is delivered, just drop the packet */
+	spin_unlock(&qp->s_lock);
+done:
+	return true;
+}
+
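+/*
+ * Restart a TID RDMA READ request at the segment currently being received
+ * (req->clear_tail) and queue the QP on the receive context's wait list so
+ * the request is resent.
+ */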
+static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd,
+				      struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+	struct tid_rdma_request *req;
+	struct tid_rdma_flow *flow;
+
+	/* Start from the right segment */
+	qp->r_flags |= RVT_R_RDMAR_SEQ;
+	req = wqe_to_tid_req(wqe);
+	flow = &req->flows[req->clear_tail];
+	hfi1_restart_rc(qp, flow->flow_state.ib_spsn, 0);
+	if (list_empty(&qp->rspwait)) {
+		qp->r_flags |= RVT_R_RSP_SEND;
+		rvt_get_qp(qp);
+		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+	}
+}
+
+/*
+ * Handle the KDETH eflags for TID RDMA READ response.
+ *
+ * Return false if the last packet for a segment has been received and it is
+ * time to process the response normally; otherwise, return true.
+ *
+ * The caller must hold the packet->qp->r_lock and the rcu_read_lock.
+ */
+static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
+				     struct hfi1_packet *packet, u8 rcv_type,
+				     u8 rte, u32 psn, u32 ibpsn)
+	__must_hold(&packet->qp->r_lock) __must_hold(RCU)
+{
+	struct hfi1_pportdata *ppd = rcd->ppd;
+	struct hfi1_devdata *dd = ppd->dd;
+	struct hfi1_ibport *ibp;
+	struct rvt_swqe *wqe;
+	struct tid_rdma_request *req;
+	struct tid_rdma_flow *flow;
+	u32 ack_psn;
+	struct rvt_qp *qp = packet->qp;
+	struct hfi1_qp_priv *priv = qp->priv;
+	bool ret = true;
+	int diff = 0;
+	u32 fpsn;
+
+	lockdep_assert_held(&qp->r_lock);
+	trace_hfi1_rsp_read_kdeth_eflags(qp, ibpsn);
+	trace_hfi1_sender_read_kdeth_eflags(qp);
+	trace_hfi1_tid_read_sender_kdeth_eflags(qp, 0);
+	spin_lock(&qp->s_lock);
+	/* If the psn is out of valid range, drop the packet */
+	if (cmp_psn(ibpsn, qp->s_last_psn) < 0 ||
+	    cmp_psn(ibpsn, qp->s_psn) > 0)
+		goto s_unlock;
+
+	/*
+	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+	 * requests and implicitly NAK RDMA read and atomic requests issued
+	 * before the NAK'ed request.
+	 */
+	ack_psn = ibpsn - 1;
+	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+	ibp = to_iport(qp->ibqp.device, qp->port_num);
+
+	/* Complete WQEs that the PSN finishes. */
+	while ((int)delta_psn(ack_psn, wqe->lpsn) >= 0) {
+		/*
+		 * If this request is an RDMA read or atomic, and the NACK is
+		 * for a later operation, this NACK NAKs the RDMA read or
+		 * atomic.
+		 */
+		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+		    wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+			/* Retry this request. */
+			if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
+				qp->r_flags |= RVT_R_RDMAR_SEQ;
+				if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+					restart_tid_rdma_read_req(rcd, qp,
+								  wqe);
+				} else {
+					hfi1_restart_rc(qp, qp->s_last_psn + 1,
+							0);
+					if (list_empty(&qp->rspwait)) {
+						qp->r_flags |= RVT_R_RSP_SEND;
+						rvt_get_qp(qp);
+						list_add_tail(/* wait */
+						   &qp->rspwait,
+						   &rcd->qp_wait_list);
+					}
+				}
+			}
+			/*
+			 * No need to process the NAK since we are
+			 * restarting an earlier request.
+			 */
+			break;
+		}
+
+		wqe = do_rc_completion(qp, wqe, ibp);
+		if (qp->s_acked == qp->s_tail)
+			goto s_unlock;
+	}
+
+	if (qp->s_acked == qp->s_tail)
+		goto s_unlock;
+
+	/* Handle the eflags for the request */
+	if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
+		goto s_unlock;
+
+	req = wqe_to_tid_req(wqe);
+	trace_hfi1_tid_req_read_kdeth_eflags(qp, 0, wqe->wr.opcode, wqe->psn,
+					     wqe->lpsn, req);
+	switch (rcv_type) {
+	case RHF_RCV_TYPE_EXPECTED:
+		switch (rte) {
+		case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
+			/*
+			 * On the first occurrence of a Flow Sequence error,
+			 * the flag HFI1_R_TID_SW_PSN is set.
+			 *
+			 * After that, the flow is *not* reprogrammed and the
+			 * protocol falls back to SW PSN checking. This is done
+			 * to prevent continuous Flow Sequence errors for any
+			 * packets that could be still in the fabric.
+			 */
+			flow = &req->flows[req->clear_tail];
+			trace_hfi1_tid_flow_read_kdeth_eflags(qp,
+							      req->clear_tail,
+							      flow);
+			if (priv->s_flags & HFI1_R_TID_SW_PSN) {
+				diff = cmp_psn(psn,
+					       flow->flow_state.r_next_psn);
+				if (diff > 0) {
+					/* Drop the packet.*/
+					goto s_unlock;
+				} else if (diff < 0) {
+					/*
+					 * If a response packet for a restarted
+					 * request has come back, reset the
+					 * restart flag.
+					 */
+					if (qp->r_flags & RVT_R_RDMAR_SEQ)
+						qp->r_flags &=
+							~RVT_R_RDMAR_SEQ;
+
+					/* Drop the packet.*/
+					goto s_unlock;
+				}
+
+				/*
+				 * If SW PSN verification is successful and
+				 * this is the last packet in the segment, tell
+				 * the caller to process it as a normal packet.
+				 */
+				fpsn = full_flow_psn(flow,
+						     flow->flow_state.lpsn);
+				if (cmp_psn(fpsn, psn) == 0) {
+					ret = false;
+					if (qp->r_flags & RVT_R_RDMAR_SEQ)
+						qp->r_flags &=
+							~RVT_R_RDMAR_SEQ;
+				}
+				flow->flow_state.r_next_psn =
+					mask_psn(psn + 1);
+			} else {
+				u32 last_psn;
+
+				last_psn = read_r_next_psn(dd, rcd->ctxt,
+							   flow->idx);
+				flow->flow_state.r_next_psn = last_psn;
+				priv->s_flags |= HFI1_R_TID_SW_PSN;
+				/*
+				 * If no request has been restarted yet,
+				 * restart the current one.
+				 */
+				if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
+					restart_tid_rdma_read_req(rcd, qp,
+								  wqe);
+			}
+
+			break;
+
+		case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
+			/*
+			 * Since the TID flow is able to ride through
+			 * generation mismatch, drop this stale packet.
+			 */
+			break;
+
+		default:
+			break;
+		}
+		break;
+
+	case RHF_RCV_TYPE_ERROR:
+		switch (rte) {
+		case RHF_RTE_ERROR_OP_CODE_ERR:
+		case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
+		case RHF_RTE_ERROR_KHDR_HCRC_ERR:
+		case RHF_RTE_ERROR_KHDR_KVER_ERR:
+		case RHF_RTE_ERROR_CONTEXT_ERR:
+		case RHF_RTE_ERROR_KHDR_TID_ERR:
+		default:
+			break;
+		}
+	default:
+		break;
+	}
+s_unlock:
+	spin_unlock(&qp->s_lock);
+	return ret;
+}
+
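+/*
+ * Top-level handler for KDETH packets that arrive with error flags set.
+ * Return false when the caller should still process the packet as a normal
+ * packet (for example, the last packet of a segment under SW PSN checking);
+ * return true when the packet has been fully handled or dropped here.
+ */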
+bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
+			      struct hfi1_pportdata *ppd,
+			      struct hfi1_packet *packet)
+{
+	struct hfi1_ibport *ibp = &ppd->ibport_data;
+	struct hfi1_devdata *dd = ppd->dd;
+	struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
+	u8 rcv_type = rhf_rcv_type(packet->rhf);
+	u8 rte = rhf_rcv_type_err(packet->rhf);
+	struct ib_header *hdr = packet->hdr;
+	struct ib_other_headers *ohdr = NULL;
+	int lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+	u16 lid  = be16_to_cpu(hdr->lrh[1]);
+	u8 opcode;
+	u32 qp_num, psn, ibpsn;
+	struct rvt_qp *qp;
+	struct hfi1_qp_priv *qpriv;
+	unsigned long flags;
+	bool ret = true;
+	struct rvt_ack_entry *e;
+	struct tid_rdma_request *req;
+	struct tid_rdma_flow *flow;
+	int diff = 0;
+
+	trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ",
+					   packet->rhf);
+	if (packet->rhf & RHF_ICRC_ERR)
+		return ret;
+
+	packet->ohdr = &hdr->u.oth;
+	ohdr = packet->ohdr;
+	trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+
+	/* Get the destination QP number. */
+	qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) &
+		RVT_QPN_MASK;
+	if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))
+		goto drop;
+
+	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+	rcu_read_lock();
+	qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+	if (!qp)
+		goto rcu_unlock;
+
+	packet->qp = qp;
+
+	/* Check for valid receive state. */
+	spin_lock_irqsave(&qp->r_lock, flags);
+	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+		ibp->rvp.n_pkt_drops++;
+		goto r_unlock;
+	}
+
+	if (packet->rhf & RHF_TID_ERR) {
+		/* For TIDERR and RC QPs preemptively schedule a NAK */
+		u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
+
+		/* Sanity check packet */
+		if (tlen < 24)
+			goto r_unlock;
+
+		/*
+		 * Check for GRH. We should never get packets with GRH in this
+		 * path.
+		 */
+		if (lnh == HFI1_LRH_GRH)
+			goto r_unlock;
+
+		if (tid_rdma_tid_err(packet, rcv_type))
+			goto r_unlock;
+	}
+
+	/* handle TID RDMA READ */
+	if (opcode == TID_OP(READ_RESP)) {
+		ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn);
+		ibpsn = mask_psn(ibpsn);
+		ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn,
+					       ibpsn);
+		goto r_unlock;
+	}
+
+	/*
+	 * qp->s_tail_ack_queue points to the rvt_ack_entry currently being
+	 * processed. These are completed sequentially, so we can be sure that
+	 * the pointer will not change until the entire request has completed.
+	 */
+	spin_lock(&qp->s_lock);
+	qpriv = qp->priv;
+	if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID ||
+	    qpriv->r_tid_tail == qpriv->r_tid_head)
+		goto unlock;
+	e = &qp->s_ack_queue[qpriv->r_tid_tail];
+	if (e->opcode != TID_OP(WRITE_REQ))
+		goto unlock;
+	req = ack_to_tid_req(e);
+	if (req->comp_seg == req->cur_seg)
+		goto unlock;
+	flow = &req->flows[req->clear_tail];
+	trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn);
+	trace_hfi1_rsp_handle_kdeth_eflags(qp, psn);
+	trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp);
+	trace_hfi1_tid_req_handle_kdeth_eflags(qp, 0, e->opcode, e->psn,
+					       e->lpsn, req);
+	trace_hfi1_tid_flow_handle_kdeth_eflags(qp, req->clear_tail, flow);
+
+	switch (rcv_type) {
+	case RHF_RCV_TYPE_EXPECTED:
+		switch (rte) {
+		case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
+			if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) {
+				qpriv->s_flags |= HFI1_R_TID_SW_PSN;
+				flow->flow_state.r_next_psn =
+					read_r_next_psn(dd, rcd->ctxt,
+							flow->idx);
+				qpriv->r_next_psn_kdeth =
+					flow->flow_state.r_next_psn;
+				goto nak_psn;
+			} else {
+				/*
+				 * If the received PSN does not match the next
+				 * expected PSN, NAK the packet.
+				 * However, only do that if we know that a
+				 * NAK has already been sent. Otherwise, this
+				 * mismatch could be due to packets that were
+				 * already in flight.
+				 */
+				diff = cmp_psn(psn,
+					       flow->flow_state.r_next_psn);
+				if (diff > 0)
+					goto nak_psn;
+				else if (diff < 0)
+					break;
+
+				qpriv->s_nak_state = 0;
+				/*
+				 * If SW PSN verification is successful and this
+				 * is the last packet in the segment, tell the
+				 * caller to process it as a normal packet.
+				 */
+				if (psn == full_flow_psn(flow,
+							 flow->flow_state.lpsn))
+					ret = false;
+				flow->flow_state.r_next_psn =
+					mask_psn(psn + 1);
+				qpriv->r_next_psn_kdeth =
+					flow->flow_state.r_next_psn;
+			}
+			break;
+
+		case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
+			goto nak_psn;
+
+		default:
+			break;
+		}
+		break;
+
+	case RHF_RCV_TYPE_ERROR:
+		switch (rte) {
+		case RHF_RTE_ERROR_OP_CODE_ERR:
+		case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
+		case RHF_RTE_ERROR_KHDR_HCRC_ERR:
+		case RHF_RTE_ERROR_KHDR_KVER_ERR:
+		case RHF_RTE_ERROR_CONTEXT_ERR:
+		case RHF_RTE_ERROR_KHDR_TID_ERR:
+		default:
+			break;
+		}
+	default:
+		break;
+	}
+
+unlock:
+	spin_unlock(&qp->s_lock);
+r_unlock:
+	spin_unlock_irqrestore(&qp->r_lock, flags);
+rcu_unlock:
+	rcu_read_unlock();
+drop:
+	return ret;
+nak_psn:
+	ibp->rvp.n_rc_seqnak++;
+	if (!qpriv->s_nak_state) {
+		qpriv->s_nak_state = IB_NAK_PSN_ERROR;
+		/* We are NAK'ing the next expected PSN */
+		qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn);
+		tid_rdma_trigger_ack(qp);
+	}
+	goto unlock;
+}
+
+/*
+ * "Rewind" the TID request information.
+ * This means that we reset the state back to ACTIVE,
+ * find the proper flow, set the flow index to that flow,
+ * and reset the flow information.
+ */
+void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+			       u32 *bth2)
+{
+	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+	struct tid_rdma_flow *flow;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	int diff, delta_pkts;
+	u32 tididx = 0, i;
+	u16 fidx;
+
+	if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+		*bth2 = mask_psn(qp->s_psn);
+		flow = find_flow_ib(req, *bth2, &fidx);
+		if (!flow) {
+			trace_hfi1_msg_tid_restart_req(/* msg */
+			   qp, "!!!!!! Could not find flow to restart: bth2 ",
+			   (u64)*bth2);
+			trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode,
+						       wqe->psn, wqe->lpsn,
+						       req);
+			return;
+		}
+	} else {
+		fidx = req->acked_tail;
+		flow = &req->flows[fidx];
+		*bth2 = mask_psn(req->r_ack_psn);
+	}
+
+	if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+		delta_pkts = delta_psn(*bth2, flow->flow_state.ib_spsn);
+	else
+		delta_pkts = delta_psn(*bth2,
+				       full_flow_psn(flow,
+						     flow->flow_state.spsn));
+
+	trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
+	diff = delta_pkts + flow->resync_npkts;
+
+	flow->sent = 0;
+	flow->pkt = 0;
+	flow->tid_idx = 0;
+	flow->tid_offset = 0;
+	if (diff) {
+		for (tididx = 0; tididx < flow->tidcnt; tididx++) {
+			u32 tidentry = flow->tid_entry[tididx], tidlen,
+				tidnpkts, npkts;
+
+			flow->tid_offset = 0;
+			tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE;
+			tidnpkts = rvt_div_round_up_mtu(qp, tidlen);
+			npkts = min_t(u32, diff, tidnpkts);
+			flow->pkt += npkts;
+			flow->sent += (npkts == tidnpkts ? tidlen :
+				       npkts * qp->pmtu);
+			flow->tid_offset += npkts * qp->pmtu;
+			diff -= npkts;
+			if (!diff)
+				break;
+		}
+	}
+	if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+		rvt_skip_sge(&qpriv->tid_ss, (req->cur_seg * req->seg_len) +
+			     flow->sent, 0);
+		/*
+		 * Packet PSN is based on flow_state.spsn + flow->pkt. However,
+		 * during a RESYNC, the generation is incremented and the
+		 * sequence is reset to 0. Since we've adjusted the npkts in the
+		 * flow and the SGE has been sufficiently advanced, we have to
+		 * adjust flow->pkt in order to calculate the correct PSN.
+		 */
+		flow->pkt -= flow->resync_npkts;
+	}
+
+	if (flow->tid_offset ==
+	    EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) {
+		tididx++;
+		flow->tid_offset = 0;
+	}
+	flow->tid_idx = tididx;
+	if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+		/* Move flow_idx to correct index */
+		req->flow_idx = fidx;
+	else
+		req->clear_tail = fidx;
+
+	trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
+	trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn,
+				       wqe->lpsn, req);
+	req->state = TID_REQUEST_ACTIVE;
+	if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+		/* Reset all the flows that we are going to resend */
+		fidx = CIRC_NEXT(fidx, MAX_FLOWS);
+		i = qpriv->s_tid_tail;
+		do {
+			for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS);
+			      fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
+				req->flows[fidx].sent = 0;
+				req->flows[fidx].pkt = 0;
+				req->flows[fidx].tid_idx = 0;
+				req->flows[fidx].tid_offset = 0;
+				req->flows[fidx].resync_npkts = 0;
+			}
+			if (i == qpriv->s_tid_cur)
+				break;
+			do {
+				i = (++i == qp->s_size ? 0 : i);
+				wqe = rvt_get_swqe_ptr(qp, i);
+			} while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE);
+			req = wqe_to_tid_req(wqe);
+			req->cur_seg = req->ack_seg;
+			fidx = req->acked_tail;
+			/* Pull req->clear_tail back */
+			req->clear_tail = fidx;
+		} while (1);
+	}
+}
+
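+/*
+ * Release all TID RDMA resources held by @qp: the hardware flow, the TID
+ * entries of any outstanding TID RDMA READ WQEs, and the TID entries of any
+ * TID RDMA WRITE requests still in the ack queue.
+ */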
+void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
+{
+	int i, ret;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct tid_flow_state *fs;
+
+	if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA))
+		return;
+
+	/*
+	 * First, clear the flow to help prevent any delayed packets from
+	 * being delivered.
+	 */
+	fs = &qpriv->flow_state;
+	if (fs->index != RXE_NUM_TID_FLOWS)
+		hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
+
+	for (i = qp->s_acked; i != qp->s_head;) {
+		struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
+
+		if (++i == qp->s_size)
+			i = 0;
+		/* Free only locally allocated TID entries */
+		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
+			continue;
+		do {
+			struct hfi1_swqe_priv *priv = wqe->priv;
+
+			ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
+		} while (!ret);
+	}
+	for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) {
+		struct rvt_ack_entry *e = &qp->s_ack_queue[i];
+
+		if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device)))
+			i = 0;
+		/* Free only locally allocated TID entries */
+		if (e->opcode != TID_OP(WRITE_REQ))
+			continue;
+		do {
+			struct hfi1_ack_priv *priv = e->priv;
+
+			ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
+		} while (!ret);
+	}
+}
+
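+/*
+ * Decide whether @wqe must wait for the WQE posted immediately before it to
+ * make progress (for example, a TID RDMA WRITE that is not fully acked yet).
+ * If so, set HFI1_S_TID_WAIT_INTERLCK and return true; otherwise return
+ * false.
+ */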
+bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+	struct rvt_swqe *prev;
+	struct hfi1_qp_priv *priv = qp->priv;
+	u32 s_prev;
+	struct tid_rdma_request *req;
+
+	s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1;
+	prev = rvt_get_swqe_ptr(qp, s_prev);
+
+	switch (wqe->wr.opcode) {
+	case IB_WR_SEND:
+	case IB_WR_SEND_WITH_IMM:
+	case IB_WR_SEND_WITH_INV:
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+	case IB_WR_RDMA_WRITE:
+		switch (prev->wr.opcode) {
+		case IB_WR_TID_RDMA_WRITE:
+			req = wqe_to_tid_req(prev);
+			if (req->ack_seg != req->total_segs)
+				goto interlock;
+		default:
+			break;
+		}
+		break;
+	case IB_WR_RDMA_READ:
+		if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE)
+			break;
+		/* fall through */
+	case IB_WR_TID_RDMA_READ:
+		switch (prev->wr.opcode) {
+		case IB_WR_RDMA_READ:
+			if (qp->s_acked != qp->s_cur)
+				goto interlock;
+			break;
+		case IB_WR_TID_RDMA_WRITE:
+			req = wqe_to_tid_req(prev);
+			if (req->ack_seg != req->total_segs)
+				goto interlock;
+		default:
+			break;
+		}
+	default:
+		break;
+	}
+	return false;
+
+interlock:
+	priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK;
+	return true;
+}
+
+/* Does @sge meet the alignment requirements for tid rdma? */
+static inline bool hfi1_check_sge_align(struct rvt_qp *qp,
+					struct rvt_sge *sge, int num_sge)
+{
+	int i;
+
+	for (i = 0; i < num_sge; i++, sge++) {
+		trace_hfi1_sge_check_align(qp, i, sge);
+		if ((u64)sge->vaddr & ~PAGE_MASK ||
+		    sge->sge_length & ~PAGE_MASK)
+			return false;
+	}
+	return true;
+}
+
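+/*
+ * Convert an eligible RDMA READ/WRITE WQE into its TID RDMA equivalent.
+ * The conversion is skipped for loopback destinations, non-9B packet types,
+ * peers that did not negotiate TID RDMA, and buffers that do not meet the
+ * page alignment requirements. On success the WQE opcode is rewritten and
+ * the per-WQE TID request state is initialized.
+ */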
+void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+	struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
+	struct hfi1_swqe_priv *priv = wqe->priv;
+	struct tid_rdma_params *remote;
+	enum ib_wr_opcode new_opcode;
+	bool do_tid_rdma = false;
+	struct hfi1_pportdata *ppd = qpriv->rcd->ppd;
+
+	if ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) ==
+				ppd->lid)
+		return;
+	if (qpriv->hdr_type != HFI1_PKT_TYPE_9B)
+		return;
+
+	rcu_read_lock();
+	remote = rcu_dereference(qpriv->tid_rdma.remote);
+	/*
+	 * If TID RDMA is disabled by the negotiation, don't
+	 * use it.
+	 */
+	if (!remote)
+		goto exit;
+
+	if (wqe->wr.opcode == IB_WR_RDMA_READ) {
+		if (hfi1_check_sge_align(qp, &wqe->sg_list[0],
+					 wqe->wr.num_sge)) {
+			new_opcode = IB_WR_TID_RDMA_READ;
+			do_tid_rdma = true;
+		}
+	} else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+		/*
+		 * TID RDMA is enabled for this RDMA WRITE request iff:
+		 *   1. The remote address is page-aligned,
+		 *   2. The length is larger than the minimum segment size,
+		 *   3. The length is a multiple of the page size.
+		 */
+		if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) &&
+		    !(wqe->length & ~PAGE_MASK)) {
+			new_opcode = IB_WR_TID_RDMA_WRITE;
+			do_tid_rdma = true;
+		}
+	}
+
+	if (do_tid_rdma) {
+		if (hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, GFP_ATOMIC))
+			goto exit;
+		wqe->wr.opcode = new_opcode;
+		priv->tid_req.seg_len =
+			min_t(u32, remote->max_len, wqe->length);
+		priv->tid_req.total_segs =
+			DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len);
+		/* Compute the last PSN of the request */
+		wqe->lpsn = wqe->psn;
+		if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+			priv->tid_req.n_flows = remote->max_read;
+			qpriv->tid_r_reqs++;
+			wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1;
+		} else {
+			wqe->lpsn += priv->tid_req.total_segs - 1;
+			atomic_inc(&qpriv->n_requests);
+		}
+
+		priv->tid_req.cur_seg = 0;
+		priv->tid_req.comp_seg = 0;
+		priv->tid_req.ack_seg = 0;
+		priv->tid_req.state = TID_REQUEST_INACTIVE;
+		/*
+		 * Reset acked_tail.
+		 * TID RDMA READ does not have ACKs so it does not
+		 * update the pointer. We have to reset it so TID RDMA
+		 * WRITE does not get confused.
+		 */
+		priv->tid_req.acked_tail = priv->tid_req.setup_head;
+		trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode,
+						 wqe->psn, wqe->lpsn,
+						 &priv->tid_req);
+	}
+exit:
+	rcu_read_unlock();
+}
+
+/* TID RDMA WRITE functions */
+
+u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+				  struct ib_other_headers *ohdr,
+				  u32 *bth1, u32 *bth2, u32 *len)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+	struct tid_rdma_params *remote;
+
+	rcu_read_lock();
+	remote = rcu_dereference(qpriv->tid_rdma.remote);
+	/*
+	 * Set the number of flows to be used based on the negotiated
+	 * parameters.
+	 */
+	req->n_flows = remote->max_write;
+	req->state = TID_REQUEST_ACTIVE;
+
+	KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1);
+	KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey);
+	ohdr->u.tid_rdma.w_req.reth.vaddr =
+		cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len));
+	ohdr->u.tid_rdma.w_req.reth.rkey =
+		cpu_to_be32(wqe->rdma_wr.rkey);
+	ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len);
+	ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn);
+	*bth1 &= ~RVT_QPN_MASK;
+	*bth1 |= remote->qp;
+	qp->s_state = TID_OP(WRITE_REQ);
+	qp->s_flags |= HFI1_S_WAIT_TID_RESP;
+	*bth2 |= IB_BTH_REQ_ACK;
+	*len = 0;
+
+	rcu_read_unlock();
+	return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32);
+}
+
+static u32 hfi1_compute_tid_rdma_flow_wt(struct rvt_qp *qp)
+{
+	/*
+	 * Heuristic for computing the RNR timeout when waiting on the flow
+	 * queue. Rather than a computationally expensive exact estimate of when
+	 * a flow will be available, we assume that if a QP is at position N in
+	 * the flow queue it has to wait approximately (N + 1) * (number of
+	 * segments between two sync points). The rationale for this is that
+	 * flows are released and recycled at each sync point.
+	 */
+	return (MAX_TID_FLOW_PSN * qp->pmtu) >> TID_RDMA_SEGMENT_SHIFT;
+}
+
+static u32 position_in_queue(struct hfi1_qp_priv *qpriv,
+			     struct tid_queue *queue)
+{
+	return qpriv->tid_enqueue - queue->dequeue;
+}
+
+/*
+ * @qp: points to rvt_qp context.
+ * @to_seg: desired RNR timeout in segments.
+ * Return: index of the next highest timeout in the ib_hfi1_rnr_table[]
+ */
+static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	u64 timeout;
+	u32 bytes_per_us;
+	u8 i;
+
+	bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8;
+	timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us;
+	/*
+	 * Find the next value in the RNR table that is greater than or equal
+	 * to the required timeout. This gives the responder some padding.
+	 */
+	for (i = 1; i <= IB_AETH_CREDIT_MASK; i++)
+		if (rvt_rnr_tbl_to_usec(i) >= timeout)
+			return i;
+	return 0;
+}
+
+/**
+ * Central place for resource allocation at the TID write responder. It is
+ * called from the write_req and write_data interrupt handlers as well as
+ * from the send thread when a queued QP is scheduled for resource
+ * allocation.
+ *
+ * Iterates over (a) segments of a request and then (b) queued requests
+ * themselves to allocate resources for up to local->max_write
+ * segments across multiple requests. Stop allocating when we
+ * hit a sync point; resume allocating after the data packets at the
+ * sync point have been received.
+ *
+ * Resource allocation and sending of responses is decoupled. The
+ * request/segment which are being allocated and sent are as follows.
+ * Resources are allocated for:
+ *     [request: qpriv->r_tid_alloc, segment: req->alloc_seg]
+ * The send thread sends:
+ *     [request: qp->s_tail_ack_queue, segment: req->cur_seg]
+ */
+static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
+{
+	struct tid_rdma_request *req;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct hfi1_ctxtdata *rcd = qpriv->rcd;
+	struct tid_rdma_params *local = &qpriv->tid_rdma.local;
+	struct rvt_ack_entry *e;
+	u32 npkts, to_seg;
+	bool last;
+	int ret = 0;
+
+	lockdep_assert_held(&qp->s_lock);
+
+	while (1) {
+		trace_hfi1_rsp_tid_write_alloc_res(qp, 0);
+		trace_hfi1_tid_write_rsp_alloc_res(qp);
+		/*
+		 * Don't allocate more segments if an RNR NAK has already been
+		 * scheduled to avoid messing up qp->r_psn: the RNR NAK will
+		 * be sent only when all allocated segments have been sent.
+		 * However, if more segments are allocated before that, TID RDMA
+		 * WRITE RESP packets will be sent out for these new segments
+		 * before the RNR NAK packet. When the requester receives the
+		 * RNR NAK packet, it will restart with qp->s_last_psn + 1,
+		 * which does not match qp->r_psn and will be dropped.
+		 * Consequently, the requester will exhaust its retries and
+		 * put the qp into error state.
+		 */
+		if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND)
+			break;
+
+		/* No requests left to process */
+		if (qpriv->r_tid_alloc == qpriv->r_tid_head) {
+			/* If all data has been received, clear the flow */
+			if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS &&
+			    !qpriv->alloc_w_segs) {
+				hfi1_kern_clear_hw_flow(rcd, qp);
+				qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+			}
+			break;
+		}
+
+		e = &qp->s_ack_queue[qpriv->r_tid_alloc];
+		if (e->opcode != TID_OP(WRITE_REQ))
+			goto next_req;
+		req = ack_to_tid_req(e);
+		trace_hfi1_tid_req_write_alloc_res(qp, 0, e->opcode, e->psn,
+						   e->lpsn, req);
+		/* Finished allocating for all segments of this request */
+		if (req->alloc_seg >= req->total_segs)
+			goto next_req;
+
+		/* Can allocate only a maximum of local->max_write for a QP */
+		if (qpriv->alloc_w_segs >= local->max_write)
+			break;
+
+		/* Don't allocate at a sync point with data packets pending */
+		if (qpriv->sync_pt && qpriv->alloc_w_segs)
+			break;
+
+		/* All data received at the sync point, continue */
+		if (qpriv->sync_pt && !qpriv->alloc_w_segs) {
+			hfi1_kern_clear_hw_flow(rcd, qp);
+			qpriv->sync_pt = false;
+			qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+		}
+
+		/* Allocate flow if we don't have one */
+		if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) {
+			ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp);
+			if (ret) {
+				to_seg = hfi1_compute_tid_rdma_flow_wt(qp) *
+					position_in_queue(qpriv,
+							  &rcd->flow_queue);
+				break;
+			}
+		}
+
+		npkts = rvt_div_round_up_mtu(qp, req->seg_len);
+
+		/*
+		 * We are at a sync point if we run out of KDETH PSN space.
+		 * Last PSN of every generation is reserved for RESYNC.
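+		 * For example (illustrative, assuming 2048 PSNs per
+		 * generation): a 32-packet segment starting at flow PSN 2020
+		 * would need PSNs beyond 2046, the last PSN usable for data,
+		 * so allocation stops here until the generation is advanced.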
+		 */
+		if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) {
+			qpriv->sync_pt = true;
+			break;
+		}
+
+		/*
+		 * If overtaking req->acked_tail, send an RNR NAK. Because the
+		 * QP is not queued in this case, and the issue can only be
+		 * caused by a delay in scheduling the second leg, which we
+		 * cannot estimate, we use a rather arbitrary RNR timeout of
+		 * (MAX_FLOWS / 2) segments.
+		 */
+		if (!CIRC_SPACE(req->setup_head, req->acked_tail,
+				MAX_FLOWS)) {
+			ret = -EAGAIN;
+			to_seg = MAX_FLOWS >> 1;
+			tid_rdma_trigger_ack(qp);
+			break;
+		}
+
+		/* Try to allocate rcv array / TID entries */
+		ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last);
+		if (ret == -EAGAIN)
+			to_seg = position_in_queue(qpriv, &rcd->rarr_queue);
+		if (ret)
+			break;
+
+		qpriv->alloc_w_segs++;
+		req->alloc_seg++;
+		continue;
+next_req:
+		/* Begin processing the next request */
+		if (++qpriv->r_tid_alloc >
+		    rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+			qpriv->r_tid_alloc = 0;
+	}
+
+	/*
+	 * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation
+	 * has failed, (b) we are called from the rcv handler interrupt context,
+	 * and (c) an RNR NAK has not already been scheduled.
+	 */
+	if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state)
+		goto send_rnr_nak;
+
+	return;
+
+send_rnr_nak:
+	lockdep_assert_held(&qp->r_lock);
+
+	/* Set r_nak_state to prevent unrelated events from generating NAKs */
+	qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK;
+
+	/* Pull back r_psn to the segment being RNR NAK'd */
+	qp->r_psn = e->psn + req->alloc_seg;
+	qp->r_ack_psn = qp->r_psn;
+	/*
+	 * Pull back r_head_ack_queue to the ack entry following the request
+	 * being RNR NAK'd. This allows resources to be allocated to the request
+	 * if the queued QP is scheduled.
+	 */
+	qp->r_head_ack_queue = qpriv->r_tid_alloc + 1;
+	if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+		qp->r_head_ack_queue = 0;
+	qpriv->r_tid_head = qp->r_head_ack_queue;
+	/*
+	 * These send-side fields are used in make_rc_ack(). They are set in
+	 * hfi1_send_rc_ack() but must be set here, before dropping qp->s_lock,
+	 * for consistency.
+	 */
+	qp->s_nak_state = qp->r_nak_state;
+	qp->s_ack_psn = qp->r_ack_psn;
+	/*
+	 * Clear the ACK PENDING flag to prevent unwanted ACK because we
+	 * have modified qp->s_ack_psn here.
+	 */
+	qp->s_flags &= ~(RVT_S_ACK_PENDING);
+
+	trace_hfi1_rsp_tid_write_alloc_res(qp, qp->r_psn);
+	/*
+	 * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK
+	 * has actually been sent. qp->s_flags RVT_S_ACK_PENDING bit cannot be
+	 * used for this because qp->s_lock is dropped before calling
+	 * hfi1_send_rc_ack() leading to inconsistency between the receive
+	 * interrupt handlers and the send thread in make_rc_ack()
+	 */
+	qpriv->rnr_nak_state = TID_RNR_NAK_SEND;
+
+	/*
+	 * Schedule the RNR NAK to be sent. RNR NAKs are scheduled from the
+	 * receive interrupt handlers but will be sent from the send engine
+	 * behind any previous responses that may have been scheduled.
+	 */
+	rc_defered_ack(rcd, qp);
+}
+
+void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
+{
+	/* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side) */
+
+	/*
+	 * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST
+	 *    (see hfi1_rc_rcv())
+	 *     - Don't allow 0-length requests.
+	 * 2. Put TID RDMA WRITE REQ into the response queue (s_ack_queue)
+	 *     - Set up struct tid_rdma_req with request info
+	 *     - Prepare struct tid_rdma_flow array?
+	 * 3. Set qp->s_ack_state as per the state diagram in the design doc.
+	 * 4. Set RVT_S_RESP_PENDING in s_flags.
+	 * 5. Kick the send engine (hfi1_schedule_send())
+	 */
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+	struct rvt_qp *qp = packet->qp;
+	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	struct ib_other_headers *ohdr = packet->ohdr;
+	struct rvt_ack_entry *e;
+	unsigned long flags;
+	struct ib_reth *reth;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct tid_rdma_request *req;
+	u32 bth0, psn, len, rkey, num_segs;
+	bool fecn;
+	u8 next;
+	u64 vaddr;
+	int diff;
+
+	bth0 = be32_to_cpu(ohdr->bth[0]);
+	if (hfi1_ruc_check_hdr(ibp, packet))
+		return;
+
+	fecn = process_ecn(qp, packet);
+	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+	trace_hfi1_rsp_rcv_tid_write_req(qp, psn);
+
+	if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+		rvt_comm_est(qp);
+
+	if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+		goto nack_inv;
+
+	reth = &ohdr->u.tid_rdma.w_req.reth;
+	vaddr = be64_to_cpu(reth->vaddr);
+	len = be32_to_cpu(reth->length);
+
+	num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len);
+	diff = delta_psn(psn, qp->r_psn);
+	if (unlikely(diff)) {
+		tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn);
+		return;
+	}
+
+	/*
+	 * The resent request which was previously RNR NAK'd is inserted at the
+	 * location of the original request, which is one entry behind
+	 * r_head_ack_queue
+	 */
+	if (qpriv->rnr_nak_state)
+		qp->r_head_ack_queue = qp->r_head_ack_queue ?
+			qp->r_head_ack_queue - 1 :
+			rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
+
+	/* We've verified the request, insert it into the ack queue. */
+	next = qp->r_head_ack_queue + 1;
+	if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+		next = 0;
+	spin_lock_irqsave(&qp->s_lock, flags);
+	if (unlikely(next == qp->s_acked_ack_queue)) {
+		if (!qp->s_ack_queue[next].sent)
+			goto nack_inv_unlock;
+		update_ack_queue(qp, next);
+	}
+	e = &qp->s_ack_queue[qp->r_head_ack_queue];
+	req = ack_to_tid_req(e);
+
+	/* Bring previously RNR NAK'd request back to life */
+	if (qpriv->rnr_nak_state) {
+		qp->r_nak_state = 0;
+		qp->s_nak_state = 0;
+		qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
+		qp->r_psn = e->lpsn + 1;
+		req->state = TID_REQUEST_INIT;
+		goto update_head;
+	}
+
+	release_rdma_sge_mr(e);
+
+	/* The length needs to be in multiples of PAGE_SIZE */
+	if (!len || len & ~PAGE_MASK)
+		goto nack_inv_unlock;
+
+	rkey = be32_to_cpu(reth->rkey);
+	qp->r_len = len;
+
+	if (e->opcode == TID_OP(WRITE_REQ) &&
+	    (req->setup_head != req->clear_tail ||
+	     req->clear_tail != req->acked_tail))
+		goto nack_inv_unlock;
+
+	if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
+				  rkey, IB_ACCESS_REMOTE_WRITE)))
+		goto nack_acc;
+
+	qp->r_psn += num_segs - 1;
+
+	e->opcode = (bth0 >> 24) & 0xff;
+	e->psn = psn;
+	e->lpsn = qp->r_psn;
+	e->sent = 0;
+
+	req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write);
+	req->state = TID_REQUEST_INIT;
+	req->cur_seg = 0;
+	req->comp_seg = 0;
+	req->ack_seg = 0;
+	req->alloc_seg = 0;
+	req->isge = 0;
+	req->seg_len = qpriv->tid_rdma.local.max_len;
+	req->total_len = len;
+	req->total_segs = num_segs;
+	req->r_flow_psn = e->psn;
+	req->ss.sge = e->rdma_sge;
+	req->ss.num_sge = 1;
+
+	req->flow_idx = req->setup_head;
+	req->clear_tail = req->setup_head;
+	req->acked_tail = req->setup_head;
+
+	qp->r_state = e->opcode;
+	qp->r_nak_state = 0;
+	/*
+	 * We need to increment the MSN here instead of when we
+	 * finish sending the result since a duplicate request would
+	 * increment it more than once.
+	 */
+	qp->r_msn++;
+	qp->r_psn++;
+
+	trace_hfi1_tid_req_rcv_write_req(qp, 0, e->opcode, e->psn, e->lpsn,
+					 req);
+
+	if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) {
+		qpriv->r_tid_tail = qp->r_head_ack_queue;
+	} else if (qpriv->r_tid_tail == qpriv->r_tid_head) {
+		struct tid_rdma_request *ptr;
+
+		e = &qp->s_ack_queue[qpriv->r_tid_tail];
+		ptr = ack_to_tid_req(e);
+
+		if (e->opcode != TID_OP(WRITE_REQ) ||
+		    ptr->comp_seg == ptr->total_segs) {
+			if (qpriv->r_tid_tail == qpriv->r_tid_ack)
+				qpriv->r_tid_ack = qp->r_head_ack_queue;
+			qpriv->r_tid_tail = qp->r_head_ack_queue;
+		}
+	}
+update_head:
+	qp->r_head_ack_queue = next;
+	qpriv->r_tid_head = qp->r_head_ack_queue;
+
+	hfi1_tid_write_alloc_resources(qp, true);
+	trace_hfi1_tid_write_rsp_rcv_req(qp);
+
+	/* Schedule the send tasklet. */
+	qp->s_flags |= RVT_S_RESP_PENDING;
+	if (fecn)
+		qp->s_flags |= RVT_S_ECN;
+	hfi1_schedule_send(qp);
+
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return;
+
+nack_inv_unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+	rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+	qp->r_ack_psn = qp->r_psn;
+	/* Queue NAK for later */
+	rc_defered_ack(rcd, qp);
+	return;
+nack_acc:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
+	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+	qp->r_ack_psn = qp->r_psn;
+}
+
+u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+				   struct ib_other_headers *ohdr, u32 *bth1,
+				   u32 bth2, u32 *len,
+				   struct rvt_sge_state **ss)
+{
+	struct hfi1_ack_priv *epriv = e->priv;
+	struct tid_rdma_request *req = &epriv->tid_req;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct tid_rdma_flow *flow = NULL;
+	u32 resp_len = 0, hdwords = 0;
+	void *resp_addr = NULL;
+	struct tid_rdma_params *remote;
+
+	trace_hfi1_tid_req_build_write_resp(qp, 0, e->opcode, e->psn, e->lpsn,
+					    req);
+	trace_hfi1_tid_write_rsp_build_resp(qp);
+	trace_hfi1_rsp_build_tid_write_resp(qp, bth2);
+	flow = &req->flows[req->flow_idx];
+	switch (req->state) {
+	default:
+		/*
+		 * Try to allocate resources here in case the QP was queued and
+		 * was later scheduled when resources became available.
+		 */
+		hfi1_tid_write_alloc_resources(qp, false);
+
+		/* We've already sent everything which is ready */
+		if (req->cur_seg >= req->alloc_seg)
+			goto done;
+
+		/*
+		 * Resources can be assigned but responses cannot be sent while
+		 * in the rnr_nak state, until the resent request is received.
+		 */
+		if (qpriv->rnr_nak_state == TID_RNR_NAK_SENT)
+			goto done;
+
+		req->state = TID_REQUEST_ACTIVE;
+		trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow);
+		req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
+		hfi1_add_tid_reap_timer(qp);
+		break;
+
+	case TID_REQUEST_RESEND_ACTIVE:
+	case TID_REQUEST_RESEND:
+		trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow);
+		req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
+		if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS))
+			req->state = TID_REQUEST_ACTIVE;
+
+		hfi1_mod_tid_reap_timer(qp);
+		break;
+	}
+	flow->flow_state.resp_ib_psn = bth2;
+	resp_addr = (void *)flow->tid_entry;
+	resp_len = sizeof(*flow->tid_entry) * flow->tidcnt;
+	req->cur_seg++;
+
+	memset(&ohdr->u.tid_rdma.w_rsp, 0, sizeof(ohdr->u.tid_rdma.w_rsp));
+	epriv->ss.sge.vaddr = resp_addr;
+	epriv->ss.sge.sge_length = resp_len;
+	epriv->ss.sge.length = epriv->ss.sge.sge_length;
+	/*
+	 * We can safely zero these out. Since the first SGE covers the
+	 * entire packet, nothing else should even look at the MR.
+	 */
+	epriv->ss.sge.mr = NULL;
+	epriv->ss.sge.m = 0;
+	epriv->ss.sge.n = 0;
+
+	epriv->ss.sg_list = NULL;
+	epriv->ss.total_len = epriv->ss.sge.sge_length;
+	epriv->ss.num_sge = 1;
+
+	*ss = &epriv->ss;
+	*len = epriv->ss.total_len;
+
+	/* Construct the TID RDMA WRITE RESP packet header */
+	rcu_read_lock();
+	remote = rcu_dereference(qpriv->tid_rdma.remote);
+
+	KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth0, KVER, 0x1);
+	KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth1, JKEY, remote->jkey);
+	ohdr->u.tid_rdma.w_rsp.aeth = rvt_compute_aeth(qp);
+	ohdr->u.tid_rdma.w_rsp.tid_flow_psn =
+		cpu_to_be32((flow->flow_state.generation <<
+			     HFI1_KDETH_BTH_SEQ_SHIFT) |
+			    (flow->flow_state.spsn &
+			     HFI1_KDETH_BTH_SEQ_MASK));
+	ohdr->u.tid_rdma.w_rsp.tid_flow_qp =
+		cpu_to_be32(qpriv->tid_rdma.local.qp |
+			    ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
+			     TID_RDMA_DESTQP_FLOW_SHIFT) |
+			    qpriv->rcd->ctxt);
+	ohdr->u.tid_rdma.w_rsp.verbs_qp = cpu_to_be32(qp->remote_qpn);
+	*bth1 = remote->qp;
+	rcu_read_unlock();
+	hdwords = sizeof(ohdr->u.tid_rdma.w_rsp) / sizeof(u32);
+	qpriv->pending_tid_w_segs++;
+done:
+	return hdwords;
+}
+
+static void hfi1_add_tid_reap_timer(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+
+	lockdep_assert_held(&qp->s_lock);
+	if (!(qpriv->s_flags & HFI1_R_TID_RSC_TIMER)) {
+		qpriv->s_flags |= HFI1_R_TID_RSC_TIMER;
+		qpriv->s_tid_timer.expires = jiffies +
+			qpriv->tid_timer_timeout_jiffies;
+		add_timer(&qpriv->s_tid_timer);
+	}
+}
+
+static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+
+	lockdep_assert_held(&qp->s_lock);
+	qpriv->s_flags |= HFI1_R_TID_RSC_TIMER;
+	mod_timer(&qpriv->s_tid_timer, jiffies +
+		  qpriv->tid_timer_timeout_jiffies);
+}
+
+static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	int rval = 0;
+
+	lockdep_assert_held(&qp->s_lock);
+	if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
+		rval = del_timer(&qpriv->s_tid_timer);
+		qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
+	}
+	return rval;
+}
+
+void hfi1_del_tid_reap_timer(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+
+	del_timer_sync(&qpriv->s_tid_timer);
+	qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
+}
+
+static void hfi1_tid_timeout(struct timer_list *t)
+{
+	struct hfi1_qp_priv *qpriv = from_timer(qpriv, t, s_tid_timer);
+	struct rvt_qp *qp = qpriv->owner;
+	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+	unsigned long flags;
+	u32 i;
+
+	spin_lock_irqsave(&qp->r_lock, flags);
+	spin_lock(&qp->s_lock);
+	if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
+		dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n",
+			    qp->ibqp.qp_num, __func__, __LINE__);
+		trace_hfi1_msg_tid_timeout(/* msg */
+			qp, "resource timeout = ",
+			(u64)qpriv->tid_timer_timeout_jiffies);
+		hfi1_stop_tid_reap_timer(qp);
+		/*
+		 * Go through the entire ack queue and clear any outstanding
+		 * HW flow and RcvArray resources.
+		 */
+		hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
+		for (i = 0; i < rvt_max_atomic(rdi); i++) {
+			struct tid_rdma_request *req =
+				ack_to_tid_req(&qp->s_ack_queue[i]);
+
+			hfi1_kern_exp_rcv_clear_all(req);
+		}
+		spin_unlock(&qp->s_lock);
+		if (qp->ibqp.event_handler) {
+			struct ib_event ev;
+
+			ev.device = qp->ibqp.device;
+			ev.element.qp = &qp->ibqp;
+			ev.event = IB_EVENT_QP_FATAL;
+			qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+		}
+		rvt_rc_error(qp, IB_WC_RESP_TIMEOUT_ERR);
+		goto unlock_r_lock;
+	}
+	spin_unlock(&qp->s_lock);
+unlock_r_lock:
+	spin_unlock_irqrestore(&qp->r_lock, flags);
+}
+
+void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
+{
+	/* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requestor side) */
+
+	/*
+	 * 1. Find matching SWQE
+	 * 2. Check that TIDENTRY array has enough space for a complete
+	 *    segment. If not, put QP in error state.
+	 * 3. Save response data in struct tid_rdma_req and struct tid_rdma_flow
+	 * 4. Remove HFI1_S_WAIT_TID_RESP from s_flags.
+	 * 5. Set qp->s_state
+	 * 6. Kick the send engine (hfi1_schedule_send())
+	 */
+	struct ib_other_headers *ohdr = packet->ohdr;
+	struct rvt_qp *qp = packet->qp;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+	struct rvt_swqe *wqe;
+	struct tid_rdma_request *req;
+	struct tid_rdma_flow *flow;
+	enum ib_wc_status status;
+	u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen;
+	bool fecn;
+	unsigned long flags;
+
+	fecn = process_ecn(qp, packet);
+	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+	aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth);
+	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Ignore invalid responses */
+	if (cmp_psn(psn, qp->s_next_psn) >= 0)
+		goto ack_done;
+
+	/* Ignore duplicate responses. */
+	if (unlikely(cmp_psn(psn, qp->s_last_psn) <= 0))
+		goto ack_done;
+
+	if (unlikely(qp->s_acked == qp->s_tail))
+		goto ack_done;
+
+	/*
+	 * If we are waiting for a particular packet sequence number
+	 * due to a request being resent, check for it. Otherwise,
+	 * ensure that we haven't missed anything.
+	 */
+	if (qp->r_flags & RVT_R_RDMAR_SEQ) {
+		if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
+			goto ack_done;
+		qp->r_flags &= ~RVT_R_RDMAR_SEQ;
+	}
+
+	wqe = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
+	if (unlikely(wqe->wr.opcode != IB_WR_TID_RDMA_WRITE))
+		goto ack_op_err;
+
+	req = wqe_to_tid_req(wqe);
+	/*
+	 * If we've lost ACKs and our acked_tail pointer is too far
+	 * behind, don't overwrite segments. Just drop the packet and
+	 * let the reliability protocol take care of it.
+	 */
+	if (!CIRC_SPACE(req->setup_head, req->acked_tail, MAX_FLOWS))
+		goto ack_done;
+
+	/*
+	 * The call to do_rc_ack() should be last in the chain of
+	 * packet checks because it will end up updating the QP state.
+	 * Therefore, anything that would prevent the packet from
+	 * being accepted as a successful response should be prior
+	 * to it.
+	 */
+	if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
+		goto ack_done;
+
+	trace_hfi1_ack(qp, psn);
+
+	flow = &req->flows[req->setup_head];
+	flow->pkt = 0;
+	flow->tid_idx = 0;
+	flow->tid_offset = 0;
+	flow->sent = 0;
+	flow->resync_npkts = 0;
+	flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_qp);
+	flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
+		TID_RDMA_DESTQP_FLOW_MASK;
+	flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_psn));
+	flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
+	flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
+	flow->flow_state.resp_ib_psn = psn;
+	flow->length = min_t(u32, req->seg_len,
+			     (wqe->length - (req->comp_seg * req->seg_len)));
+
+	flow->npkts = rvt_div_round_up_mtu(qp, flow->length);
+	flow->flow_state.lpsn = flow->flow_state.spsn +
+		flow->npkts - 1;
+	/* payload length = packet length - (header length + ICRC length) */
+	pktlen = packet->tlen - (packet->hlen + 4);
+	if (pktlen > sizeof(flow->tid_entry)) {
+		status = IB_WC_LOC_LEN_ERR;
+		goto ack_err;
+	}
+	memcpy(flow->tid_entry, packet->ebuf, pktlen);
+	flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
+	trace_hfi1_tid_flow_rcv_write_resp(qp, req->setup_head, flow);
+
+	req->comp_seg++;
+	trace_hfi1_tid_write_sender_rcv_resp(qp, 0);
+	/*
+	 * Walk the TID_ENTRY list to make sure we have enough space for a
+	 * complete segment.
+	 */
+	for (i = 0; i < flow->tidcnt; i++) {
+		trace_hfi1_tid_entry_rcv_write_resp(/* entry */
+			qp, i, flow->tid_entry[i]);
+		if (!EXP_TID_GET(flow->tid_entry[i], LEN)) {
+			status = IB_WC_LOC_LEN_ERR;
+			goto ack_err;
+		}
+		tidlen += EXP_TID_GET(flow->tid_entry[i], LEN);
+	}
+	if (tidlen * PAGE_SIZE < flow->length) {
+		status = IB_WC_LOC_LEN_ERR;
+		goto ack_err;
+	}
+
+	trace_hfi1_tid_req_rcv_write_resp(qp, 0, wqe->wr.opcode, wqe->psn,
+					  wqe->lpsn, req);
+	/*
+	 * If this is the first response for this request, set the initial
+	 * flow index to the current flow.
+	 */
+	if (!cmp_psn(psn, wqe->psn)) {
+		req->r_last_acked = mask_psn(wqe->psn - 1);
+		/* Set acked flow index to head index */
+		req->acked_tail = req->setup_head;
+	}
+
+	/* advance circular buffer head */
+	req->setup_head = CIRC_NEXT(req->setup_head, MAX_FLOWS);
+	req->state = TID_REQUEST_ACTIVE;
+
+	/*
+	 * If all responses for this TID RDMA WRITE request have been received,
+	 * advance the pointer to the next one.
+	 * Since TID RDMA requests could be mixed in with regular IB requests,
+	 * they might not appear sequentially in the queue. Therefore, the
+	 * next request needs to be "found".
+	 */
+	if (qpriv->s_tid_cur != qpriv->s_tid_head &&
+	    req->comp_seg == req->total_segs) {
+		for (i = qpriv->s_tid_cur + 1; ; i++) {
+			if (i == qp->s_size)
+				i = 0;
+			wqe = rvt_get_swqe_ptr(qp, i);
+			if (i == qpriv->s_tid_head)
+				break;
+			if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+				break;
+		}
+		qpriv->s_tid_cur = i;
+	}
+	qp->s_flags &= ~HFI1_S_WAIT_TID_RESP;
+	hfi1_schedule_tid_send(qp);
+	goto ack_done;
+
+ack_op_err:
+	status = IB_WC_LOC_QP_OP_ERR;
+ack_err:
+	rvt_error_qp(qp, status);
+ack_done:
+	if (fecn)
+		qp->s_flags |= RVT_S_ECN;
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe,
+				struct ib_other_headers *ohdr,
+				u32 *bth1, u32 *bth2, u32 *len)
+{
+	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+	struct tid_rdma_params *remote;
+	struct rvt_qp *qp = req->qp;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	u32 tidentry = flow->tid_entry[flow->tid_idx];
+	u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
+	struct tid_rdma_write_data *wd = &ohdr->u.tid_rdma.w_data;
+	u32 next_offset, om = KDETH_OM_LARGE;
+	bool last_pkt;
+
+	if (!tidlen) {
+		hfi1_trdma_send_complete(qp, wqe, IB_WC_REM_INV_RD_REQ_ERR);
+		rvt_error_qp(qp, IB_WC_REM_INV_RD_REQ_ERR);
+	}
+
+	*len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
+	flow->sent += *len;
+	next_offset = flow->tid_offset + *len;
+	last_pkt = (flow->tid_idx == (flow->tidcnt - 1) &&
+		    next_offset >= tidlen) || (flow->sent >= flow->length);
+	trace_hfi1_tid_entry_build_write_data(qp, flow->tid_idx, tidentry);
+	trace_hfi1_tid_flow_build_write_data(qp, req->clear_tail, flow);
+
+	rcu_read_lock();
+	remote = rcu_dereference(qpriv->tid_rdma.remote);
+	KDETH_RESET(wd->kdeth0, KVER, 0x1);
+	KDETH_SET(wd->kdeth0, SH, !last_pkt);
+	KDETH_SET(wd->kdeth0, INTR, !!(!last_pkt && remote->urg));
+	KDETH_SET(wd->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
+	KDETH_SET(wd->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
+	KDETH_SET(wd->kdeth0, OM, om == KDETH_OM_LARGE);
+	KDETH_SET(wd->kdeth0, OFFSET, flow->tid_offset / om);
+	KDETH_RESET(wd->kdeth1, JKEY, remote->jkey);
+	wd->verbs_qp = cpu_to_be32(qp->remote_qpn);
+	rcu_read_unlock();
+
+	*bth1 = flow->tid_qpn;
+	*bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
+			 HFI1_KDETH_BTH_SEQ_MASK) |
+			 (flow->flow_state.generation <<
+			  HFI1_KDETH_BTH_SEQ_SHIFT));
+	if (last_pkt) {
+		/* PSNs are zero-based, so +1 to count number of packets */
+		if (flow->flow_state.lpsn + 1 +
+		    rvt_div_round_up_mtu(qp, req->seg_len) >
+		    MAX_TID_FLOW_PSN)
+			req->state = TID_REQUEST_SYNC;
+		*bth2 |= IB_BTH_REQ_ACK;
+	}
+
+	if (next_offset >= tidlen) {
+		flow->tid_offset = 0;
+		flow->tid_idx++;
+	} else {
+		flow->tid_offset = next_offset;
+	}
+	return last_pkt;
+}
+
+void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
+{
+	struct rvt_qp *qp = packet->qp;
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct hfi1_ctxtdata *rcd = priv->rcd;
+	struct ib_other_headers *ohdr = packet->ohdr;
+	struct rvt_ack_entry *e;
+	struct tid_rdma_request *req;
+	struct tid_rdma_flow *flow;
+	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+	unsigned long flags;
+	u32 psn, next;
+	u8 opcode;
+	bool fecn;
+
+	fecn = process_ecn(qp, packet);
+	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+	/*
+	 * All error handling should be done by now. If we are here, the packet
+	 * is either good or has been accepted by the error handler.
+	 */
+	spin_lock_irqsave(&qp->s_lock, flags);
+	e = &qp->s_ack_queue[priv->r_tid_tail];
+	req = ack_to_tid_req(e);
+	flow = &req->flows[req->clear_tail];
+	if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) {
+		update_r_next_psn_fecn(packet, priv, rcd, flow, fecn);
+
+		if (cmp_psn(psn, flow->flow_state.r_next_psn))
+			goto send_nak;
+
+		flow->flow_state.r_next_psn = mask_psn(psn + 1);
+		/*
+		 * Copy the payload to the destination buffer if this packet is
+		 * delivered as an eager packet due to the RSM rule and FECN.
+		 * The RSM rule selects the FECN bit in the BTH and the SH bit
+		 * in the KDETH header and therefore will not match the last
+		 * packet of each segment, which has the SH bit cleared.
+		 */
+		if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) {
+			struct rvt_sge_state ss;
+			u32 len;
+			u32 tlen = packet->tlen;
+			u16 hdrsize = packet->hlen;
+			u8 pad = packet->pad;
+			u8 extra_bytes = pad + packet->extra_byte +
+				(SIZE_OF_CRC << 2);
+			u32 pmtu = qp->pmtu;
+
+			if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
+				goto send_nak;
+			len = req->comp_seg * req->seg_len;
+			len += delta_psn(psn,
+				full_flow_psn(flow, flow->flow_state.spsn)) *
+				pmtu;
+			if (unlikely(req->total_len - len < pmtu))
+				goto send_nak;
+
+			/*
+			 * The e->rdma_sge field is set when TID RDMA WRITE REQ
+			 * is first received and is never modified thereafter.
+			 */
+			ss.sge = e->rdma_sge;
+			ss.sg_list = NULL;
+			ss.num_sge = 1;
+			ss.total_len = req->total_len;
+			rvt_skip_sge(&ss, len, false);
+			rvt_copy_sge(qp, &ss, packet->payload, pmtu, false,
+				     false);
+			/* Raise the sw sequence check flag for next packet */
+			priv->r_next_psn_kdeth = mask_psn(psn + 1);
+			priv->s_flags |= HFI1_R_TID_SW_PSN;
+		}
+		goto exit;
+	}
+	flow->flow_state.r_next_psn = mask_psn(psn + 1);
+	hfi1_kern_exp_rcv_clear(req);
+	priv->alloc_w_segs--;
+	rcd->flows[flow->idx].psn = psn & HFI1_KDETH_BTH_SEQ_MASK;
+	req->comp_seg++;
+	priv->s_nak_state = 0;
+
+	/*
+	 * Release the flow if one of the following conditions has been met:
+	 *  - The request has reached a sync point AND all outstanding
+	 *    segments have been completed, or
+	 *  - The entire request is complete and there are no more requests
+	 *    (of any kind) in the queue.
+	 */
+	trace_hfi1_rsp_rcv_tid_write_data(qp, psn);
+	trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn,
+					  req);
+	trace_hfi1_tid_write_rsp_rcv_data(qp);
+	validate_r_tid_ack(priv);
+
+	if (opcode == TID_OP(WRITE_DATA_LAST)) {
+		release_rdma_sge_mr(e);
+		for (next = priv->r_tid_tail + 1; ; next++) {
+			if (next > rvt_size_atomic(&dev->rdi))
+				next = 0;
+			if (next == priv->r_tid_head)
+				break;
+			e = &qp->s_ack_queue[next];
+			if (e->opcode == TID_OP(WRITE_REQ))
+				break;
+		}
+		priv->r_tid_tail = next;
+		if (++qp->s_acked_ack_queue > rvt_size_atomic(&dev->rdi))
+			qp->s_acked_ack_queue = 0;
+	}
+
+	hfi1_tid_write_alloc_resources(qp, true);
+
+	/*
+	 * If we need to generate more responses, schedule the
+	 * send engine.
+	 */
+	if (req->cur_seg < req->total_segs ||
+	    qp->s_tail_ack_queue != qp->r_head_ack_queue) {
+		qp->s_flags |= RVT_S_RESP_PENDING;
+		hfi1_schedule_send(qp);
+	}
+
+	priv->pending_tid_w_segs--;
+	if (priv->s_flags & HFI1_R_TID_RSC_TIMER) {
+		if (priv->pending_tid_w_segs)
+			hfi1_mod_tid_reap_timer(req->qp);
+		else
+			hfi1_stop_tid_reap_timer(req->qp);
+	}
+
+done:
+	tid_rdma_schedule_ack(qp);
+exit:
+	priv->r_next_psn_kdeth = flow->flow_state.r_next_psn;
+	if (fecn)
+		qp->s_flags |= RVT_S_ECN;
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return;
+
+send_nak:
+	if (!priv->s_nak_state) {
+		priv->s_nak_state = IB_NAK_PSN_ERROR;
+		priv->s_nak_psn = flow->flow_state.r_next_psn;
+		tid_rdma_trigger_ack(qp);
+	}
+	goto done;
+}
+
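+/*
+ * A RESYNC PSN is the last KDETH PSN of a generation: all bits covered by
+ * HFI1_KDETH_BTH_SEQ_MASK are set. That PSN is reserved for RESYNC and is
+ * never used for data (see the sync-point handling above).
+ */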
+static bool hfi1_tid_rdma_is_resync_psn(u32 psn)
+{
+	return (bool)((psn & HFI1_KDETH_BTH_SEQ_MASK) ==
+		      HFI1_KDETH_BTH_SEQ_MASK);
+}
+
+u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e,
+				  struct ib_other_headers *ohdr, u16 iflow,
+				  u32 *bth1, u32 *bth2)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct tid_flow_state *fs = &qpriv->flow_state;
+	struct tid_rdma_request *req = ack_to_tid_req(e);
+	struct tid_rdma_flow *flow = &req->flows[iflow];
+	struct tid_rdma_params *remote;
+
+	rcu_read_lock();
+	remote = rcu_dereference(qpriv->tid_rdma.remote);
+	KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
+	ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
+	*bth1 = remote->qp;
+	rcu_read_unlock();
+
+	if (qpriv->resync) {
+		*bth2 = mask_psn((fs->generation <<
+				  HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
+		ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
+	} else if (qpriv->s_nak_state) {
+		*bth2 = mask_psn(qpriv->s_nak_psn);
+		ohdr->u.tid_rdma.ack.aeth =
+			cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
+				    (qpriv->s_nak_state <<
+				     IB_AETH_CREDIT_SHIFT));
+	} else {
+		*bth2 = full_flow_psn(flow, flow->flow_state.lpsn);
+		ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
+	}
+	KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1);
+	ohdr->u.tid_rdma.ack.tid_flow_qp =
+		cpu_to_be32(qpriv->tid_rdma.local.qp |
+			    ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
+			     TID_RDMA_DESTQP_FLOW_SHIFT) |
+			    qpriv->rcd->ctxt);
+
+	ohdr->u.tid_rdma.ack.tid_flow_psn = 0;
+	ohdr->u.tid_rdma.ack.verbs_psn =
+		cpu_to_be32(flow->flow_state.resp_ib_psn);
+
+	if (qpriv->resync) {
+		/*
+		 * If the PSN before the currently expected KDETH PSN is the
+		 * RESYNC PSN, then we never received a good TID RDMA WRITE
+		 * DATA packet after a previous RESYNC.
+		 * In this case, the next expected KDETH PSN stays the same.
+		 */
+		if (hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1)) {
+			ohdr->u.tid_rdma.ack.tid_flow_psn =
+				cpu_to_be32(qpriv->r_next_psn_kdeth_save);
+		} else {
+			/*
+			 * Because the KDETH PSNs jump during a RESYNC, it's
+			 * not possible to infer (or compute) the previous value
+			 * of r_next_psn_kdeth in the case of back-to-back
+			 * RESYNC packets. Therefore, we save it.
+			 */
+			qpriv->r_next_psn_kdeth_save =
+				qpriv->r_next_psn_kdeth - 1;
+			ohdr->u.tid_rdma.ack.tid_flow_psn =
+				cpu_to_be32(qpriv->r_next_psn_kdeth_save);
+			qpriv->r_next_psn_kdeth = mask_psn(*bth2 + 1);
+		}
+		qpriv->resync = false;
+	}
+
+	return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32);
+}
+
+void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
+{
+	struct ib_other_headers *ohdr = packet->ohdr;
+	struct rvt_qp *qp = packet->qp;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct rvt_swqe *wqe;
+	struct tid_rdma_request *req;
+	struct tid_rdma_flow *flow;
+	u32 aeth, psn, req_psn, ack_psn, flpsn, resync_psn, ack_kpsn;
+	unsigned long flags;
+	u16 fidx;
+
+	trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0);
+	process_ecn(qp, packet);
+	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+	aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth);
+	req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn));
+	resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn));
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	trace_hfi1_rcv_tid_ack(qp, aeth, psn, req_psn, resync_psn);
+
+	/* If we are waiting for an ACK to RESYNC, drop any other packets */
+	if ((qp->s_flags & HFI1_S_WAIT_HALT) &&
+	    cmp_psn(psn, qpriv->s_resync_psn))
+		goto ack_op_err;
+
+	ack_psn = req_psn;
+	if (hfi1_tid_rdma_is_resync_psn(psn))
+		ack_kpsn = resync_psn;
+	else
+		ack_kpsn = psn;
+	if (aeth >> 29) {
+		ack_psn--;
+		ack_kpsn--;
+	}
+
+	if (unlikely(qp->s_acked == qp->s_tail))
+		goto ack_op_err;
+
+	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+
+	if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
+		goto ack_op_err;
+
+	req = wqe_to_tid_req(wqe);
+	trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
+				       wqe->lpsn, req);
+	flow = &req->flows[req->acked_tail];
+	trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow);
+
+	/* Drop stale ACK/NAK */
+	if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0 ||
+	    cmp_psn(req_psn, flow->flow_state.resp_ib_psn) < 0)
+		goto ack_op_err;
+
+	while (cmp_psn(ack_kpsn,
+		       full_flow_psn(flow, flow->flow_state.lpsn)) >= 0 &&
+	       req->ack_seg < req->cur_seg) {
+		req->ack_seg++;
+		/* advance acked segment pointer */
+		req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS);
+		req->r_last_acked = flow->flow_state.resp_ib_psn;
+		trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
+					       wqe->lpsn, req);
+		if (req->ack_seg == req->total_segs) {
+			req->state = TID_REQUEST_COMPLETE;
+			wqe = do_rc_completion(qp, wqe,
+					       to_iport(qp->ibqp.device,
+							qp->port_num));
+			trace_hfi1_sender_rcv_tid_ack(qp);
+			atomic_dec(&qpriv->n_tid_requests);
+			if (qp->s_acked == qp->s_tail)
+				break;
+			if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
+				break;
+			req = wqe_to_tid_req(wqe);
+		}
+		flow = &req->flows[req->acked_tail];
+		trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow);
+	}
+
+	trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
+				       wqe->lpsn, req);
+	switch (aeth >> 29) {
+	case 0:         /* ACK */
+		if (qpriv->s_flags & RVT_S_WAIT_ACK)
+			qpriv->s_flags &= ~RVT_S_WAIT_ACK;
+		if (!hfi1_tid_rdma_is_resync_psn(psn)) {
+			/* Check if there is any pending TID ACK */
+			if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+			    req->ack_seg < req->cur_seg)
+				hfi1_mod_tid_retry_timer(qp);
+			else
+				hfi1_stop_tid_retry_timer(qp);
+			hfi1_schedule_send(qp);
+		} else {
+			u32 spsn, fpsn, last_acked, generation;
+			struct tid_rdma_request *rptr;
+
+			/* ACK(RESYNC) */
+			hfi1_stop_tid_retry_timer(qp);
+			/* Allow new requests (see hfi1_make_tid_rdma_pkt) */
+			qp->s_flags &= ~HFI1_S_WAIT_HALT;
+			/*
+			 * Clear the RVT_S_SEND_ONE flag in case the TID RDMA
+			 * ACK is received after the TID retry timer has fired
+			 * again. In this case, do not send any more TID
+			 * RESYNC request or wait for any more TID ACK packet.
+			 */
+			qpriv->s_flags &= ~RVT_S_SEND_ONE;
+			hfi1_schedule_send(qp);
+
+			if ((qp->s_acked == qpriv->s_tid_tail &&
+			     req->ack_seg == req->total_segs) ||
+			    qp->s_acked == qp->s_tail) {
+				qpriv->s_state = TID_OP(WRITE_DATA_LAST);
+				goto done;
+			}
+
+			if (req->ack_seg == req->comp_seg) {
+				qpriv->s_state = TID_OP(WRITE_DATA);
+				goto done;
+			}
+
+			/*
+			 * The PSN to start with is the next PSN after the
+			 * RESYNC PSN.
+			 */
+			psn = mask_psn(psn + 1);
+			generation = psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
+			spsn = 0;
+
+			/*
+			 * Update to the correct WQE when we get an ACK(RESYNC)
+			 * in the middle of a request.
+			 */
+			if (delta_psn(ack_psn, wqe->lpsn))
+				wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+			req = wqe_to_tid_req(wqe);
+			flow = &req->flows[req->acked_tail];
+			/*
+			 * RESYNC re-numbers the PSN ranges of all remaining
+			 * segments. Also, the PSNs start from 0 in the middle of
+			 * a segment, so the first segment's size is less than the
+			 * default number of packets. flow->resync_npkts is used
+			 * to track the number of packets from the start of the
+			 * real segment to the point of 0 PSN after the RESYNC
+			 * in order to later correctly rewind the SGE.
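+			 * For example (illustrative): if this flow's first
+			 * KDETH PSN was 10 and the RESYNC PSN is 25,
+			 * resync_npkts grows by 26 - 10 = 16 packets that must
+			 * later be skipped when rewinding the SGE.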
+			 */
+			fpsn = full_flow_psn(flow, flow->flow_state.spsn);
+			req->r_ack_psn = psn;
+			flow->resync_npkts +=
+				delta_psn(mask_psn(resync_psn + 1), fpsn);
+			/*
+			 * Renumber all packet sequence number ranges
+			 * based on the new generation.
+			 */
+			last_acked = qp->s_acked;
+			rptr = req;
+			while (1) {
+				/* start from last acked segment */
+				for (fidx = rptr->acked_tail;
+				     CIRC_CNT(rptr->setup_head, fidx,
+					      MAX_FLOWS);
+				     fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
+					u32 lpsn;
+					u32 gen;
+
+					flow = &rptr->flows[fidx];
+					gen = flow->flow_state.generation;
+					if (WARN_ON(gen == generation &&
+						    flow->flow_state.spsn !=
+						     spsn))
+						continue;
+					lpsn = flow->flow_state.lpsn;
+					lpsn = full_flow_psn(flow, lpsn);
+					flow->npkts =
+						delta_psn(lpsn,
+							  mask_psn(resync_psn)
+							  );
+					flow->flow_state.generation =
+						generation;
+					flow->flow_state.spsn = spsn;
+					flow->flow_state.lpsn =
+						flow->flow_state.spsn +
+						flow->npkts - 1;
+					flow->pkt = 0;
+					spsn += flow->npkts;
+					resync_psn += flow->npkts;
+					trace_hfi1_tid_flow_rcv_tid_ack(qp,
+									fidx,
+									flow);
+				}
+				if (++last_acked == qpriv->s_tid_cur + 1)
+					break;
+				if (last_acked == qp->s_size)
+					last_acked = 0;
+				wqe = rvt_get_swqe_ptr(qp, last_acked);
+				rptr = wqe_to_tid_req(wqe);
+			}
+			req->cur_seg = req->ack_seg;
+			qpriv->s_tid_tail = qp->s_acked;
+			qpriv->s_state = TID_OP(WRITE_REQ);
+			hfi1_schedule_tid_send(qp);
+		}
+done:
+		qpriv->s_retry = qp->s_retry_cnt;
+		break;
+
+	case 3:         /* NAK */
+		hfi1_stop_tid_retry_timer(qp);
+		switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
+			IB_AETH_CREDIT_MASK) {
+		case 0: /* PSN sequence error */
+			if (!req->flows)
+				break;
+			flow = &req->flows[req->acked_tail];
+			flpsn = full_flow_psn(flow, flow->flow_state.lpsn);
+			if (cmp_psn(psn, flpsn) > 0)
+				break;
+			trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail,
+							flow);
+			req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+			req->cur_seg = req->ack_seg;
+			qpriv->s_tid_tail = qp->s_acked;
+			qpriv->s_state = TID_OP(WRITE_REQ);
+			qpriv->s_retry = qp->s_retry_cnt;
+			hfi1_schedule_tid_send(qp);
+			break;
+
+		default:
+			break;
+		}
+		break;
+
+	default:
+		break;
+	}
+
+ack_op_err:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+void hfi1_add_tid_retry_timer(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct ib_qp *ibqp = &qp->ibqp;
+	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+
+	lockdep_assert_held(&qp->s_lock);
+	if (!(priv->s_flags & HFI1_S_TID_RETRY_TIMER)) {
+		priv->s_flags |= HFI1_S_TID_RETRY_TIMER;
+		priv->s_tid_retry_timer.expires = jiffies +
+			priv->tid_retry_timeout_jiffies + rdi->busy_jiffies;
+		add_timer(&priv->s_tid_retry_timer);
+	}
+}
+
+static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct ib_qp *ibqp = &qp->ibqp;
+	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+
+	lockdep_assert_held(&qp->s_lock);
+	priv->s_flags |= HFI1_S_TID_RETRY_TIMER;
+	mod_timer(&priv->s_tid_retry_timer, jiffies +
+		  priv->tid_retry_timeout_jiffies + rdi->busy_jiffies);
+}
+
+static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	int rval = 0;
+
+	lockdep_assert_held(&qp->s_lock);
+	if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
+		rval = del_timer(&priv->s_tid_retry_timer);
+		priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
+	}
+	return rval;
+}
+
+void hfi1_del_tid_retry_timer(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	del_timer_sync(&priv->s_tid_retry_timer);
+	priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
+}
+
+static void hfi1_tid_retry_timeout(struct timer_list *t)
+{
+	struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer);
+	struct rvt_qp *qp = priv->owner;
+	struct rvt_swqe *wqe;
+	unsigned long flags;
+	struct tid_rdma_request *req;
+
+	spin_lock_irqsave(&qp->r_lock, flags);
+	spin_lock(&qp->s_lock);
+	trace_hfi1_tid_write_sender_retry_timeout(qp, 0);
+	if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
+		hfi1_stop_tid_retry_timer(qp);
+		if (!priv->s_retry) {
+			trace_hfi1_msg_tid_retry_timeout(/* msg */
+				qp,
+				"Exhausted retries. Tid retry timeout = ",
+				(u64)priv->tid_retry_timeout_jiffies);
+
+			wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+			hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+			rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+		} else {
+			wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+			req = wqe_to_tid_req(wqe);
+			trace_hfi1_tid_req_tid_retry_timeout(/* req */
+			   qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req);
+
+			priv->s_flags &= ~RVT_S_WAIT_ACK;
+			/* Only send one packet (the RESYNC) */
+			priv->s_flags |= RVT_S_SEND_ONE;
+			/*
+			 * No additional request shall be made by this QP until
+			 * the RESYNC has completed.
+			 */
+			qp->s_flags |= HFI1_S_WAIT_HALT;
+			priv->s_state = TID_OP(RESYNC);
+			priv->s_retry--;
+			hfi1_schedule_tid_send(qp);
+		}
+	}
+	spin_unlock(&qp->s_lock);
+	spin_unlock_irqrestore(&qp->r_lock, flags);
+}
+
+u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe,
+			       struct ib_other_headers *ohdr, u32 *bth1,
+			       u32 *bth2, u16 fidx)
+{
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct tid_rdma_params *remote;
+	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+	struct tid_rdma_flow *flow = &req->flows[fidx];
+	u32 generation;
+
+	rcu_read_lock();
+	remote = rcu_dereference(qpriv->tid_rdma.remote);
+	KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
+	ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
+	*bth1 = remote->qp;
+	rcu_read_unlock();
+
+	generation = kern_flow_generation_next(flow->flow_state.generation);
+	*bth2 = mask_psn((generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
+	qpriv->s_resync_psn = *bth2;
+	*bth2 |= IB_BTH_REQ_ACK;
+	KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1);
+
+	return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32);
+}
+
+void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
+{
+	struct ib_other_headers *ohdr = packet->ohdr;
+	struct rvt_qp *qp = packet->qp;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct hfi1_ctxtdata *rcd = qpriv->rcd;
+	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+	struct rvt_ack_entry *e;
+	struct tid_rdma_request *req;
+	struct tid_rdma_flow *flow;
+	struct tid_flow_state *fs = &qpriv->flow_state;
+	u32 psn, generation, idx, gen_next;
+	bool fecn;
+	unsigned long flags;
+
+	fecn = process_ecn(qp, packet);
+	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+
+	generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT;
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	gen_next = (fs->generation == KERN_GENERATION_RESERVED) ?
+		generation : kern_flow_generation_next(fs->generation);
+	/*
+	 * RESYNC packet contains the "next" generation and can only be
+	 * from the current or previous generations
+	 */
+	if (generation != mask_generation(gen_next - 1) &&
+	    generation != gen_next)
+		goto bail;
+	/* Already processing a resync */
+	if (qpriv->resync)
+		goto bail;
+
+	spin_lock(&rcd->exp_lock);
+	if (fs->index >= RXE_NUM_TID_FLOWS) {
+		/*
+		 * If we don't have a flow, save the generation so it can be
+		 * applied when a new flow is allocated
+		 */
+		fs->generation = generation;
+	} else {
+		/* Reprogram the QP flow with new generation */
+		rcd->flows[fs->index].generation = generation;
+		fs->generation = kern_setup_hw_flow(rcd, fs->index);
+	}
+	fs->psn = 0;
+	/*
+	 * Disable SW PSN checking since a RESYNC is equivalent to a
+	 * sync point and the flow has/will be reprogrammed
+	 */
+	qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+	trace_hfi1_tid_write_rsp_rcv_resync(qp);
+
+	/*
+	 * Reset all TID flow information with the new generation.
+	 * This is done for all requests and segments after the
+	 * last received segment
+	 */
+	for (idx = qpriv->r_tid_tail; ; idx++) {
+		u16 flow_idx;
+
+		if (idx > rvt_size_atomic(&dev->rdi))
+			idx = 0;
+		e = &qp->s_ack_queue[idx];
+		if (e->opcode == TID_OP(WRITE_REQ)) {
+			req = ack_to_tid_req(e);
+			trace_hfi1_tid_req_rcv_resync(qp, 0, e->opcode, e->psn,
+						      e->lpsn, req);
+
+			/* start from last unacked segment */
+			for (flow_idx = req->clear_tail;
+			     CIRC_CNT(req->setup_head, flow_idx,
+				      MAX_FLOWS);
+			     flow_idx = CIRC_NEXT(flow_idx, MAX_FLOWS)) {
+				u32 lpsn;
+				u32 next;
+
+				flow = &req->flows[flow_idx];
+				lpsn = full_flow_psn(flow,
+						     flow->flow_state.lpsn);
+				next = flow->flow_state.r_next_psn;
+				flow->npkts = delta_psn(lpsn, next - 1);
+				flow->flow_state.generation = fs->generation;
+				flow->flow_state.spsn = fs->psn;
+				flow->flow_state.lpsn =
+					flow->flow_state.spsn + flow->npkts - 1;
+				flow->flow_state.r_next_psn =
+					full_flow_psn(flow,
+						      flow->flow_state.spsn);
+				fs->psn += flow->npkts;
+				trace_hfi1_tid_flow_rcv_resync(qp, flow_idx,
+							       flow);
+			}
+		}
+		if (idx == qp->s_tail_ack_queue)
+			break;
+	}
+
+	spin_unlock(&rcd->exp_lock);
+	qpriv->resync = true;
+	/* RESYNC request always gets a TID RDMA ACK. */
+	qpriv->s_nak_state = 0;
+	tid_rdma_trigger_ack(qp);
+bail:
+	if (fecn)
+		qp->s_flags |= RVT_S_ECN;
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/*
+ * Call this function when the last TID RDMA WRITE DATA packet for a request
+ * is built.
+ */
+static void update_tid_tail(struct rvt_qp *qp)
+	__must_hold(&qp->s_lock)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	u32 i;
+	struct rvt_swqe *wqe;
+
+	lockdep_assert_held(&qp->s_lock);
+	/* Can't move beyond s_tid_cur */
+	if (priv->s_tid_tail == priv->s_tid_cur)
+		return;
+	for (i = priv->s_tid_tail + 1; ; i++) {
+		if (i == qp->s_size)
+			i = 0;
+
+		if (i == priv->s_tid_cur)
+			break;
+		wqe = rvt_get_swqe_ptr(qp, i);
+		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+			break;
+	}
+	priv->s_tid_tail = i;
+	priv->s_state = TID_OP(WRITE_RESP);
+}
+
+int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+	__must_hold(&qp->s_lock)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct rvt_swqe *wqe;
+	u32 bth1 = 0, bth2 = 0, hwords = 5, len, middle = 0;
+	struct ib_other_headers *ohdr;
+	struct rvt_sge_state *ss = &qp->s_sge;
+	struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+	struct tid_rdma_request *req = ack_to_tid_req(e);
+	bool last = false;
+	u8 opcode = TID_OP(WRITE_DATA);
+
+	lockdep_assert_held(&qp->s_lock);
+	trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0);
+	/*
+	 * Prioritize the sending of the requests and responses over the
+	 * sending of the TID RDMA data packets.
+	 */
+	if (((atomic_read(&priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) &&
+	     atomic_read(&priv->n_requests) &&
+	     !(qp->s_flags & (RVT_S_BUSY | RVT_S_WAIT_ACK |
+			     HFI1_S_ANY_WAIT_IO))) ||
+	    (e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg &&
+	     !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)))) {
+		struct iowait_work *iowork;
+
+		iowork = iowait_get_ib_work(&priv->s_iowait);
+		ps->s_txreq = get_waiting_verbs_txreq(iowork);
+		if (ps->s_txreq || hfi1_make_rc_req(qp, ps)) {
+			priv->s_flags |= HFI1_S_TID_BUSY_SET;
+			return 1;
+		}
+	}
+
+	ps->s_txreq = get_txreq(ps->dev, qp);
+	if (!ps->s_txreq)
+		goto bail_no_tx;
+
+	ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
+
+	if ((priv->s_flags & RVT_S_ACK_PENDING) &&
+	    make_tid_rdma_ack(qp, ohdr, ps))
+		return 1;
+
+	/*
+	 * Bail out if we can't send data.
+	 * Be reminded that this check must be done after the call to
+	 * make_tid_rdma_ack() because the responding QP could be in
+	 * RTR state where it can send TID RDMA ACK, not TID RDMA WRITE DATA.
+	 */
+	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK))
+		goto bail;
+
+	if (priv->s_flags & RVT_S_WAIT_ACK)
+		goto bail;
+
+	/* Check whether there is anything to do. */
+	if (priv->s_tid_tail == HFI1_QP_WQE_INVALID)
+		goto bail;
+	wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
+	req = wqe_to_tid_req(wqe);
+	trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, wqe->psn,
+					wqe->lpsn, req);
+	switch (priv->s_state) {
+	case TID_OP(WRITE_REQ):
+	case TID_OP(WRITE_RESP):
+		priv->tid_ss.sge = wqe->sg_list[0];
+		priv->tid_ss.sg_list = wqe->sg_list + 1;
+		priv->tid_ss.num_sge = wqe->wr.num_sge;
+		priv->tid_ss.total_len = wqe->length;
+
+		if (priv->s_state == TID_OP(WRITE_REQ))
+			hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
+		priv->s_state = TID_OP(WRITE_DATA);
+		/* fall through */
+
+	case TID_OP(WRITE_DATA):
+		/*
+		 * 1. Check whether a TID RDMA WRITE RESP is available.
+		 * 2. If no:
+		 *    2.1 If have more segments and no TID RDMA WRITE RESP,
+		 *        set HFI1_S_WAIT_TID_RESP
+		 *    2.2 Return indicating no progress made.
+		 * 3. If yes:
+		 *    3.1 Build TID RDMA WRITE DATA packet.
+		 *    3.2 If last packet in segment:
+		 *        3.2.1 Change KDETH header bits
+		 *        3.2.2 Advance RESP pointers.
+		 *    3.3 Return indicating progress made.
+		 */
+		trace_hfi1_sender_make_tid_pkt(qp);
+		trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0);
+		wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
+		req = wqe_to_tid_req(wqe);
+		len = wqe->length;
+
+		if (!req->comp_seg || req->cur_seg == req->comp_seg)
+			goto bail;
+
+		trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode,
+						wqe->psn, wqe->lpsn, req);
+		last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2,
+						  &len);
+
+		if (last) {
+			/* move pointer to next flow */
+			req->clear_tail = CIRC_NEXT(req->clear_tail,
+						    MAX_FLOWS);
+			if (++req->cur_seg < req->total_segs) {
+				if (!CIRC_CNT(req->setup_head, req->clear_tail,
+					      MAX_FLOWS))
+					qp->s_flags |= HFI1_S_WAIT_TID_RESP;
+			} else {
+				priv->s_state = TID_OP(WRITE_DATA_LAST);
+				opcode = TID_OP(WRITE_DATA_LAST);
+
+				/* Advance the s_tid_tail now */
+				update_tid_tail(qp);
+			}
+		}
+		hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32);
+		ss = &priv->tid_ss;
+		break;
+
+	case TID_OP(RESYNC):
+		trace_hfi1_sender_make_tid_pkt(qp);
+		/* Use generation from the most recently received response */
+		wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
+		req = wqe_to_tid_req(wqe);
+		/* If no responses for this WQE look at the previous one */
+		if (!req->comp_seg) {
+			wqe = rvt_get_swqe_ptr(qp,
+					       (!priv->s_tid_cur ? qp->s_size :
+						priv->s_tid_cur) - 1);
+			req = wqe_to_tid_req(wqe);
+		}
+		hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, &bth1,
+						     &bth2,
+						     CIRC_PREV(req->setup_head,
+							       MAX_FLOWS));
+		ss = NULL;
+		len = 0;
+		opcode = TID_OP(RESYNC);
+		break;
+
+	default:
+		goto bail;
+	}
+	if (priv->s_flags & RVT_S_SEND_ONE) {
+		priv->s_flags &= ~RVT_S_SEND_ONE;
+		priv->s_flags |= RVT_S_WAIT_ACK;
+		bth2 |= IB_BTH_REQ_ACK;
+	}
+	qp->s_len -= len;
+	ps->s_txreq->hdr_dwords = hwords;
+	ps->s_txreq->sde = priv->s_sde;
+	ps->s_txreq->ss = ss;
+	ps->s_txreq->s_cur_size = len;
+	hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2,
+			     middle, ps);
+	return 1;
+bail:
+	hfi1_put_txreq(ps->s_txreq);
+bail_no_tx:
+	ps->s_txreq = NULL;
+	priv->s_flags &= ~RVT_S_BUSY;
+	/*
+	 * If we didn't get a txreq, the QP will be woken up later to try
+	 * again. Set the flags to indicate which work item to wake up.
+	 * (A better algorithm should be found to do this and generalize the
+	 * sleep/wakeup flags.)
+	 */
+	iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+	return 0;
+}
+
+static int make_tid_rdma_ack(struct rvt_qp *qp,
+			     struct ib_other_headers *ohdr,
+			     struct hfi1_pkt_state *ps)
+{
+	struct rvt_ack_entry *e;
+	struct hfi1_qp_priv *qpriv = qp->priv;
+	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+	u32 hwords, next;
+	u32 len = 0;
+	u32 bth1 = 0, bth2 = 0;
+	int middle = 0;
+	u16 flow;
+	struct tid_rdma_request *req, *nreq;
+
+	trace_hfi1_tid_write_rsp_make_tid_ack(qp);
+	/* Don't send an ACK if we aren't supposed to. */
+	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
+		goto bail;
+
+	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
+	hwords = 5;
+
+	e = &qp->s_ack_queue[qpriv->r_tid_ack];
+	req = ack_to_tid_req(e);
+	/*
+	 * In the RESYNC case, we are exactly one segment past the
+	 * previously sent ack or at the previously sent NAK. So to send
+	 * the resync ack, we go back one segment (which might be part of
+	 * the previous request) and let the do-while loop execute again.
+	 * The advantage of executing the do-while loop is that any data
+	 * received after the previous ack is automatically acked in the
+	 * RESYNC ack. It turns out that for the do-while loop we only need
+	 * to pull back qpriv->r_tid_ack, not the segment
+	 * indices/counters. The scheme works even if the previous request
+	 * was not a TID WRITE request.
+	 */
+	if (qpriv->resync) {
+		if (!req->ack_seg || req->ack_seg == req->total_segs)
+			qpriv->r_tid_ack = !qpriv->r_tid_ack ?
+				rvt_size_atomic(&dev->rdi) :
+				qpriv->r_tid_ack - 1;
+		e = &qp->s_ack_queue[qpriv->r_tid_ack];
+		req = ack_to_tid_req(e);
+	}
+
+	trace_hfi1_rsp_make_tid_ack(qp, e->psn);
+	trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn,
+					req);
+	/*
+	 * If we've sent all the ACKs that we can, we are done
+	 * until we get more segments...
+	 */
+	if (!qpriv->s_nak_state && !qpriv->resync &&
+	    req->ack_seg == req->comp_seg)
+		goto bail;
+
+	do {
+		/*
+		 * To deal with coalesced ACKs, the acked_tail pointer
+		 * into the flow array is used. The distance between it
+		 * and the clear_tail is the number of flows that are
+		 * being ACK'ed.
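+		 * For example (illustrative, assuming MAX_FLOWS == 8): with
+		 * clear_tail == 5 and acked_tail == 2, CIRC_CNT() yields 3,
+		 * so this single ACK advances ack_seg by three segments.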
+		 */
+		req->ack_seg +=
+			/* Get up-to-date value */
+			CIRC_CNT(req->clear_tail, req->acked_tail,
+				 MAX_FLOWS);
+		/* Advance acked index */
+		req->acked_tail = req->clear_tail;
+
+		/*
+		 * req->clear_tail points to the segment currently being
+		 * received. So, when sending an ACK, the previous
+		 * segment is being ACK'ed.
+		 */
+		flow = CIRC_PREV(req->acked_tail, MAX_FLOWS);
+		if (req->ack_seg != req->total_segs)
+			break;
+		req->state = TID_REQUEST_COMPLETE;
+
+		next = qpriv->r_tid_ack + 1;
+		if (next > rvt_size_atomic(&dev->rdi))
+			next = 0;
+		qpriv->r_tid_ack = next;
+		if (qp->s_ack_queue[next].opcode != TID_OP(WRITE_REQ))
+			break;
+		nreq = ack_to_tid_req(&qp->s_ack_queue[next]);
+		if (!nreq->comp_seg || nreq->ack_seg == nreq->comp_seg)
+			break;
+
+		/* Move to the next ack entry now */
+		e = &qp->s_ack_queue[qpriv->r_tid_ack];
+		req = ack_to_tid_req(e);
+	} while (1);
+
+	/*
+	 * At this point qpriv->r_tid_ack == qpriv->r_tid_tail but e and
+	 * req could be pointing at the previous ack queue entry
+	 */
+	if (qpriv->s_nak_state ||
+	    (qpriv->resync &&
+	     !hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1) &&
+	     (cmp_psn(qpriv->r_next_psn_kdeth - 1,
+		      full_flow_psn(&req->flows[flow],
+				    req->flows[flow].flow_state.lpsn)) > 0))) {
+		/*
+		 * A NAK will implicitly acknowledge all previous TID RDMA
+		 * requests. Therefore, we NAK with the req->acked_tail
+		 * segment for the request at qpriv->r_tid_ack (same at
+		 * this point as the req->clear_tail segment for the
+		 * qpriv->r_tid_tail request)
+		 */
+		e = &qp->s_ack_queue[qpriv->r_tid_ack];
+		req = ack_to_tid_req(e);
+		flow = req->acked_tail;
+	} else if (req->ack_seg == req->total_segs &&
+		   qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK)
+		qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
+
+	trace_hfi1_tid_write_rsp_make_tid_ack(qp);
+	trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn,
+					req);
+	hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1,
+						&bth2);
+	len = 0;
+	qpriv->s_flags &= ~RVT_S_ACK_PENDING;
+	ps->s_txreq->hdr_dwords = hwords;
+	ps->s_txreq->sde = qpriv->s_sde;
+	ps->s_txreq->s_cur_size = len;
+	ps->s_txreq->ss = NULL;
+	hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle,
+			     ps);
+	ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP;
+	return 1;
+bail:
+	/*
+	 * Ensure s_rdma_ack_cnt changes are committed prior to resetting
+	 * RVT_S_RESP_PENDING
+	 */
+	smp_wmb();
+	qpriv->s_flags &= ~RVT_S_ACK_PENDING;
+	return 0;
+}
+
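+/*
+ * TID RDMA variant of the send-OK test: progress can be made on the second
+ * leg when the QP is neither busy nor waiting for IO, and there is either a
+ * queued txreq, a pending response, or no TID send-wait flag set.
+ */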
+static int hfi1_send_tid_ok(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	return !(priv->s_flags & RVT_S_BUSY ||
+		 qp->s_flags & HFI1_S_ANY_WAIT_IO) &&
+		(verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) ||
+		 (priv->s_flags & RVT_S_RESP_PENDING) ||
+		 !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND));
+}
+
+void _hfi1_do_tid_send(struct work_struct *work)
+{
+	struct iowait_work *w = container_of(work, struct iowait_work, iowork);
+	struct rvt_qp *qp = iowait_to_qp(w->iow);
+
+	hfi1_do_tid_send(qp);
+}
+
+static void hfi1_do_tid_send(struct rvt_qp *qp)
+{
+	struct hfi1_pkt_state ps;
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	ps.dev = to_idev(qp->ibqp.device);
+	ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
+	ps.ppd = ppd_from_ibp(ps.ibp);
+	ps.wait = iowait_get_tid_work(&priv->s_iowait);
+	ps.in_thread = false;
+	ps.timeout_int = qp->timeout_jiffies / 8;
+
+	trace_hfi1_rc_do_tid_send(qp, false);
+	spin_lock_irqsave(&qp->s_lock, ps.flags);
+
+	/* Return if we are already busy processing a work request. */
+	if (!hfi1_send_tid_ok(qp)) {
+		if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+			iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+		spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+		return;
+	}
+
+	priv->s_flags |= RVT_S_BUSY;
+
+	ps.timeout = jiffies + ps.timeout_int;
+	ps.cpu = priv->s_sde ? priv->s_sde->cpu :
+		cpumask_first(cpumask_of_node(ps.ppd->dd->node));
+	ps.pkts_sent = false;
+
+	/* ensure a pre-built packet is handled */
+	ps.s_txreq = get_waiting_verbs_txreq(ps.wait);
+	do {
+		/* Check for a constructed packet to be sent. */
+		if (ps.s_txreq) {
+			if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+				qp->s_flags |= RVT_S_BUSY;
+				ps.wait = iowait_get_ib_work(&priv->s_iowait);
+			}
+			spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+
+			/*
+			 * If the packet cannot be sent now, return and
+			 * the send tasklet will be woken up later.
+			 */
+			if (hfi1_verbs_send(qp, &ps))
+				return;
+
+			/* allow other tasks to run */
+			if (hfi1_schedule_send_yield(qp, &ps, true))
+				return;
+
+			spin_lock_irqsave(&qp->s_lock, ps.flags);
+			if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+				qp->s_flags &= ~RVT_S_BUSY;
+				priv->s_flags &= ~HFI1_S_TID_BUSY_SET;
+				ps.wait = iowait_get_tid_work(&priv->s_iowait);
+				if (iowait_flag_set(&priv->s_iowait,
+						    IOWAIT_PENDING_IB))
+					hfi1_schedule_send(qp);
+			}
+		}
+	} while (hfi1_make_tid_rdma_pkt(qp, &ps));
+	iowait_starve_clear(ps.pkts_sent, &priv->s_iowait);
+	spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+}
+
+static bool _hfi1_schedule_tid_send(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct hfi1_ibport *ibp =
+		to_iport(qp->ibqp.device, qp->port_num);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+
+	return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq,
+				   priv->s_sde ?
+				   priv->s_sde->cpu :
+				   cpumask_first(cpumask_of_node(dd->node)));
+}
+
+/**
+ * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine
+ * @qp: the QP
+ *
+ * This schedules qp progress on the TID RDMA state machine. Caller
+ * should hold the s_lock.
+ * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because
+ * the two state machines can step on each other with respect to the
+ * RVT_S_BUSY flag.
+ * Therefore, a modified test is used.
+ * Return: true if the second leg is scheduled;
+ *  false if the second leg is not scheduled.
+ */
+bool hfi1_schedule_tid_send(struct rvt_qp *qp)
+{
+	lockdep_assert_held(&qp->s_lock);
+	if (hfi1_send_tid_ok(qp)) {
+		/*
+		 * The following call returns true if the qp is not on the
+		 * queue and false if the qp is already on the queue before
+		 * this call. Either way, the qp will be on the queue when the
+		 * call returns.
+		 */
+		_hfi1_schedule_tid_send(qp);
+		return true;
+	}
+	if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+		iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait,
+				IOWAIT_PENDING_TID);
+	return false;
+}
+
+bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e)
+{
+	struct rvt_ack_entry *prev;
+	struct tid_rdma_request *req;
+	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+	struct hfi1_qp_priv *priv = qp->priv;
+	u32 s_prev;
+
+	s_prev = qp->s_tail_ack_queue == 0 ? rvt_size_atomic(&dev->rdi) :
+		(qp->s_tail_ack_queue - 1);
+	prev = &qp->s_ack_queue[s_prev];
+
+	if ((e->opcode == TID_OP(READ_REQ) ||
+	     e->opcode == OP(RDMA_READ_REQUEST)) &&
+	    prev->opcode == TID_OP(WRITE_REQ)) {
+		req = ack_to_tid_req(prev);
+		if (req->ack_seg != req->total_segs) {
+			priv->s_flags |= HFI1_R_TID_WAIT_INTERLCK;
+			return true;
+		}
+	}
+	return false;
+}
+
+static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx)
+{
+	u64 reg;
+
+	/*
+	 * The only sane way to get the amount of
+	 * progress is to read the HW flow state.
+	 */
+	reg = read_uctxt_csr(dd, ctxt, RCV_TID_FLOW_TABLE + (8 * fidx));
+	return mask_psn(reg);
+}
+
+static void tid_rdma_rcv_err(struct hfi1_packet *packet,
+			     struct ib_other_headers *ohdr,
+			     struct rvt_qp *qp, u32 psn, int diff, bool fecn)
+{
+	unsigned long flags;
+
+	tid_rdma_rcv_error(packet, ohdr, qp, psn, diff);
+	if (fecn) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		qp->s_flags |= RVT_S_ECN;
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+	}
+}
+
+static void update_r_next_psn_fecn(struct hfi1_packet *packet,
+				   struct hfi1_qp_priv *priv,
+				   struct hfi1_ctxtdata *rcd,
+				   struct tid_rdma_flow *flow,
+				   bool fecn)
+{
+	/*
+	 * If a start/middle packet is delivered here due to
+	 * RSM rule and FECN, we need to update the r_next_psn.
+	 */
+	if (fecn && packet->etype == RHF_RCV_TYPE_EAGER &&
+	    !(priv->s_flags & HFI1_R_TID_SW_PSN)) {
+		struct hfi1_devdata *dd = rcd->dd;
+
+		flow->flow_state.r_next_psn =
+			read_r_next_psn(dd, rcd->ctxt, flow->idx);
+	}
+}
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h
new file mode 100644
index 0000000..6e82df2
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.h
@@ -0,0 +1,319 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#ifndef HFI1_TID_RDMA_H
+#define HFI1_TID_RDMA_H
+
+#include <linux/circ_buf.h>
+#include "common.h"
+
+/* Add a convenience helper */
+#define CIRC_ADD(val, add, size) (((val) + (add)) & ((size) - 1))
+#define CIRC_NEXT(val, size) CIRC_ADD(val, 1, size)
+#define CIRC_PREV(val, size) CIRC_ADD(val, -1, size)
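+/*
+ * Illustrative note (not part of the driver logic): because CIRC_ADD()
+ * masks with (size - 1), these helpers assume the circular buffer size
+ * is a power of two. With size 8, for example, CIRC_NEXT(7, 8) wraps
+ * to 0 and CIRC_PREV(0, 8) wraps to 7 without any conditional.
+ */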
+
+#define TID_RDMA_MIN_SEGMENT_SIZE       BIT(18)   /* 256 KiB (for now) */
+#define TID_RDMA_MAX_SEGMENT_SIZE       BIT(18)   /* 256 KiB (for now) */
+#define TID_RDMA_MAX_PAGES              (BIT(18) >> PAGE_SHIFT)
+#define TID_RDMA_SEGMENT_SHIFT		18
+
+/*
+ * Bit definitions for priv->s_flags.
+ * These bit flags overload the bit flags defined for the QP's s_flags.
+ * Because these bit fields are used only for the QP priv s_flags,
+ * there are no collisions.
+ *
+ * HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock
+ * HFI1_R_TID_WAIT_INTERLCK - QP is waiting for responder interlock
+ */
+#define HFI1_S_TID_BUSY_SET       BIT(0)
+/* BIT(1) reserved for RVT_S_BUSY. */
+#define HFI1_R_TID_RSC_TIMER      BIT(2)
+/* BIT(3) reserved for RVT_S_RESP_PENDING. */
+/* BIT(4) reserved for RVT_S_ACK_PENDING. */
+#define HFI1_S_TID_WAIT_INTERLCK  BIT(5)
+#define HFI1_R_TID_WAIT_INTERLCK  BIT(6)
+/* BIT(7) - BIT(15) reserved for RVT_S_WAIT_*. */
+/* BIT(16) reserved for RVT_S_SEND_ONE */
+#define HFI1_S_TID_RETRY_TIMER    BIT(17)
+/* BIT(18) reserved for RVT_S_ECN. */
+#define HFI1_R_TID_SW_PSN         BIT(19)
+/* BIT(26) reserved for HFI1_S_WAIT_HALT */
+/* BIT(27) reserved for HFI1_S_WAIT_TID_RESP */
+/* BIT(28) reserved for HFI1_S_WAIT_TID_SPACE */
+
+/*
+ * Unlike regular IB RDMA WRITE requests, which do not require an
+ * entry in the s_ack_queue, TID RDMA WRITE requests do because they
+ * generate responses.
+ * Therefore, the s_ack_queue needs to be extended by a certain
+ * amount. The key point is that the queue needs to be extended
+ * without letting the "user" know so the user doesn't end up
+ * using these extra entries.
+ */
+#define HFI1_TID_RDMA_WRITE_CNT 8
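+/*
+ * Sketch of the intent above (not the sizing code itself): if the user
+ * negotiates N responder resources, the ack queue would internally hold
+ * N + HFI1_TID_RDMA_WRITE_CNT entries while only N remain visible to
+ * the user.
+ */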
+
+struct tid_rdma_params {
+	struct rcu_head rcu_head;
+	u32 qp;
+	u32 max_len;
+	u16 jkey;
+	u8 max_read;
+	u8 max_write;
+	u8 timeout;
+	u8 urg;
+	u8 version;
+};
+
+struct tid_rdma_qp_params {
+	struct work_struct trigger_work;
+	struct tid_rdma_params local;
+	struct tid_rdma_params __rcu *remote;
+};
+
+/* Track state for each hardware flow */
+struct tid_flow_state {
+	u32 generation;
+	u32 psn;
+	u8 index;
+	u8 last_index;
+};
+
+enum tid_rdma_req_state {
+	TID_REQUEST_INACTIVE = 0,
+	TID_REQUEST_INIT,
+	TID_REQUEST_INIT_RESEND,
+	TID_REQUEST_ACTIVE,
+	TID_REQUEST_RESEND,
+	TID_REQUEST_RESEND_ACTIVE,
+	TID_REQUEST_QUEUED,
+	TID_REQUEST_SYNC,
+	TID_REQUEST_RNR_NAK,
+	TID_REQUEST_COMPLETE,
+};
+
+struct tid_rdma_request {
+	struct rvt_qp *qp;
+	struct hfi1_ctxtdata *rcd;
+	union {
+		struct rvt_swqe *swqe;
+		struct rvt_ack_entry *ack;
+	} e;
+
+	struct tid_rdma_flow *flows;	/* array of tid flows */
+	struct rvt_sge_state ss; /* SGE state for TID RDMA requests */
+	u16 n_flows;		/* size of the flow buffer window */
+	u16 setup_head;		/* flow index we are setting up */
+	u16 clear_tail;		/* flow index we are clearing */
+	u16 flow_idx;		/* flow index most recently set up */
+	u16 acked_tail;
+
+	u32 seg_len;
+	u32 total_len;
+	u32 r_ack_psn;          /* next expected ack PSN */
+	u32 r_flow_psn;         /* IB PSN of next segment start */
+	u32 r_last_acked;       /* IB PSN of last ACK'ed packet */
+	u32 s_next_psn;		/* IB PSN of next segment start for read */
+
+	u32 total_segs;		/* segments required to complete a request */
+	u32 cur_seg;		/* index of current segment */
+	u32 comp_seg;           /* index of last completed segment */
+	u32 ack_seg;            /* index of last ack'ed segment */
+	u32 alloc_seg;          /* index of next segment to be allocated */
+	u32 isge;		/* index of "current" sge */
+	u32 ack_pending;        /* num acks pending for this request */
+
+	enum tid_rdma_req_state state;
+};
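+/*
+ * Illustration, mirroring the usage in tid_rdma.c: the number of flows
+ * that have completed but not yet been ACK'ed can be computed as
+ * CIRC_CNT(req->clear_tail, req->acked_tail, MAX_FLOWS), and advancing
+ * acked_tail to clear_tail marks all of them as ACK'ed.
+ */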
+
+/*
+ * When header suppression is used, PSNs associated with a "flow" are
+ * relevant (and not the PSNs maintained by verbs). Track per-flow
+ * PSNs here for a TID RDMA segment.
+ *
+ */
+struct flow_state {
+	u32 flags;
+	u32 resp_ib_psn;     /* The IB PSN of the response for this flow */
+	u32 generation;      /* generation of flow */
+	u32 spsn;            /* starting PSN in TID space */
+	u32 lpsn;            /* last PSN in TID space */
+	u32 r_next_psn;      /* next PSN to be received (in TID space) */
+
+	/* For tid rdma read */
+	u32 ib_spsn;         /* starting PSN in Verbs space */
+	u32 ib_lpsn;         /* last PSN in Verbs space */
+};
+
+struct tid_rdma_pageset {
+	dma_addr_t addr : 48; /* Only needed for the first page */
+	u8 idx: 8;
+	u8 count : 7;
+	u8 mapped: 1;
+};
+
+/**
+ * kern_tid_node - used for managing TIDs in TID groups
+ *
+ * @grp: the rcd's tid_group used by this node
+ * @map: grp->map captured prior to programming this TID group in HW
+ * @cnt: Only @cnt of available group entries are actually programmed
+ */
+struct kern_tid_node {
+	struct tid_group *grp;
+	u8 map;
+	u8 cnt;
+};
+
+/* Overall info for a TID RDMA segment */
+struct tid_rdma_flow {
+	/*
+	 * While a TID RDMA segment is being transferred, it uses a QP number
+	 * from the "KDETH section of QP numbers" (which is different from the
+	 * QP number that originated the request). Bits 11-15 of these QP
+	 * numbers identify the "TID flow" for the segment.
+	 */
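+	/*
+	 * Illustration (an assumption, not a macro defined in this
+	 * header): given the layout above, the 5-bit flow index could be
+	 * recovered from such a QP number as (tid_qpn >> 11) & 0x1f.
+	 */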
+	struct flow_state flow_state;
+	struct tid_rdma_request *req;
+	u32 tid_qpn;
+	u32 tid_offset;
+	u32 length;
+	u32 sent;
+	u8 tnode_cnt;
+	u8 tidcnt;
+	u8 tid_idx;
+	u8 idx;
+	u8 npagesets;
+	u8 npkts;
+	u8 pkt;
+	u8 resync_npkts;
+	struct kern_tid_node tnode[TID_RDMA_MAX_PAGES];
+	struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES];
+	u32 tid_entry[TID_RDMA_MAX_PAGES];
+};
+
+enum tid_rnr_nak_state {
+	TID_RNR_NAK_INIT = 0,
+	TID_RNR_NAK_SEND,
+	TID_RNR_NAK_SENT,
+};
+
+bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data);
+bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data);
+bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data);
+void tid_rdma_conn_error(struct rvt_qp *qp);
+void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p);
+
+int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit);
+int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
+			    struct rvt_sge_state *ss, bool *last);
+int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req);
+void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req);
+void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+
+/**
+ * trdma_clean_swqe - clean flows for swqe if large send queue
+ * @qp: the qp
+ * @wqe: the send wqe
+ */
+static inline void trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+	if (!wqe->priv)
+		return;
+	__trdma_clean_swqe(qp, wqe);
+}
+
+void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp);
+
+int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+		      struct ib_qp_init_attr *init_attr);
+void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp);
+
+void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp);
+
+int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp);
+void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp);
+void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd);
+
+struct cntr_entry;
+u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
+			    void *context, int vl, int mode, u64 data);
+
+u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
+				    struct ib_other_headers *ohdr,
+				    u32 *bth1, u32 *bth2, u32 *len);
+u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+				 struct ib_other_headers *ohdr, u32 *bth1,
+				 u32 *bth2, u32 *len);
+void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet);
+u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+				  struct ib_other_headers *ohdr, u32 *bth0,
+				  u32 *bth1, u32 *bth2, u32 *len, bool *last);
+void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet);
+bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
+			      struct hfi1_pportdata *ppd,
+			      struct hfi1_packet *packet);
+void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+			       u32 *bth2);
+void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp);
+bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe);
+
+void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
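+/*
+ * Example (illustrative only): a 256 KiB IB_WR_RDMA_READ or
+ * IB_WR_RDMA_WRITE whose wqe->priv has been populated satisfies the
+ * TID_RDMA_MIN_SEGMENT_SIZE check below and is handed to
+ * setup_tid_rdma_wqe(); a 4 KiB request falls through and remains a
+ * regular verbs request.
+ */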
+static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp,
+					   struct rvt_swqe *wqe)
+{
+	if (wqe->priv &&
+	    (wqe->wr.opcode == IB_WR_RDMA_READ ||
+	     wqe->wr.opcode == IB_WR_RDMA_WRITE) &&
+	    wqe->length >= TID_RDMA_MIN_SEGMENT_SIZE)
+		setup_tid_rdma_wqe(qp, wqe);
+}
+
+u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+				  struct ib_other_headers *ohdr,
+				  u32 *bth1, u32 *bth2, u32 *len);
+
+void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet);
+
+u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+				   struct ib_other_headers *ohdr, u32 *bth1,
+				   u32 bth2, u32 *len,
+				   struct rvt_sge_state **ss);
+
+void hfi1_del_tid_reap_timer(struct rvt_qp *qp);
+
+void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet);
+
+bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe,
+				struct ib_other_headers *ohdr,
+				u32 *bth1, u32 *bth2, u32 *len);
+
+void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet);
+
+u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e,
+				  struct ib_other_headers *ohdr, u16 iflow,
+				  u32 *bth1, u32 *bth2);
+
+void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet);
+
+void hfi1_add_tid_retry_timer(struct rvt_qp *qp);
+void hfi1_del_tid_retry_timer(struct rvt_qp *qp);
+
+u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe,
+			       struct ib_other_headers *ohdr, u32 *bth1,
+			       u32 *bth2, u16 fidx);
+
+void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet);
+
+struct hfi1_pkt_state;
+int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+
+void _hfi1_do_tid_send(struct work_struct *work);
+
+bool hfi1_schedule_tid_send(struct rvt_qp *qp);
+
+bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e);
+
+#endif /* HFI1_TID_RDMA_H */
diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c
index 7c8aed0..9a3d236 100644
--- a/drivers/infiniband/hw/hfi1/trace.c
+++ b/drivers/infiniband/hw/hfi1/trace.c
@@ -46,6 +46,7 @@
  */
 #define CREATE_TRACE_POINTS
 #include "trace.h"
+#include "exp_rcv.h"
 
 static u8 __get_ib_hdr_len(struct ib_header *hdr)
 {
@@ -128,6 +129,15 @@
 #define IETH_PRN "ieth rkey:0x%.8x"
 #define ATOMICACKETH_PRN "origdata:%llx"
 #define ATOMICETH_PRN "vaddr:0x%llx rkey:0x%.8x sdata:%llx cdata:%llx"
+#define TID_RDMA_KDETH "kdeth0 0x%x kdeth1 0x%x"
+#define TID_RDMA_KDETH_DATA "kdeth0 0x%x: kver %u sh %u intr %u tidctrl %u tid %x offset %x kdeth1 0x%x: jkey %x"
+#define TID_READ_REQ_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
+#define TID_READ_RSP_PRN "verbs_qp 0x%x"
+#define TID_WRITE_REQ_PRN "original_qp 0x%x"
+#define TID_WRITE_RSP_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
+#define TID_WRITE_DATA_PRN "verbs_qp 0x%x"
+#define TID_ACK_PRN "tid_flow_psn 0x%x verbs_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
+#define TID_RESYNC_PRN "verbs_qp 0x%x"
 
 #define OP(transport, op) IB_OPCODE_## transport ## _ ## op
 
@@ -322,6 +332,99 @@
 				 parse_syndrome(be32_to_cpu(eh->aeth) >> 24),
 				 be32_to_cpu(eh->aeth) & IB_MSN_MASK);
 		break;
+	case OP(TID_RDMA, WRITE_REQ):
+		trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " "
+				 TID_WRITE_REQ_PRN,
+				 le32_to_cpu(eh->tid_rdma.w_req.kdeth0),
+				 le32_to_cpu(eh->tid_rdma.w_req.kdeth1),
+				 ib_u64_get(&eh->tid_rdma.w_req.reth.vaddr),
+				 be32_to_cpu(eh->tid_rdma.w_req.reth.rkey),
+				 be32_to_cpu(eh->tid_rdma.w_req.reth.length),
+				 be32_to_cpu(eh->tid_rdma.w_req.verbs_qp));
+		break;
+	case OP(TID_RDMA, WRITE_RESP):
+		trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " "
+				 TID_WRITE_RSP_PRN,
+				 le32_to_cpu(eh->tid_rdma.w_rsp.kdeth0),
+				 le32_to_cpu(eh->tid_rdma.w_rsp.kdeth1),
+				 be32_to_cpu(eh->tid_rdma.w_rsp.aeth) >> 24,
+				 parse_syndrome(/* aeth */
+					 be32_to_cpu(eh->tid_rdma.w_rsp.aeth)
+					 >> 24),
+				 (be32_to_cpu(eh->tid_rdma.w_rsp.aeth) &
+				  IB_MSN_MASK),
+				 be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_psn),
+				 be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_qp),
+				 be32_to_cpu(eh->tid_rdma.w_rsp.verbs_qp));
+		break;
+	case OP(TID_RDMA, WRITE_DATA_LAST):
+	case OP(TID_RDMA, WRITE_DATA):
+		trace_seq_printf(p, TID_RDMA_KDETH_DATA " " TID_WRITE_DATA_PRN,
+				 le32_to_cpu(eh->tid_rdma.w_data.kdeth0),
+				 KDETH_GET(eh->tid_rdma.w_data.kdeth0, KVER),
+				 KDETH_GET(eh->tid_rdma.w_data.kdeth0, SH),
+				 KDETH_GET(eh->tid_rdma.w_data.kdeth0, INTR),
+				 KDETH_GET(eh->tid_rdma.w_data.kdeth0, TIDCTRL),
+				 KDETH_GET(eh->tid_rdma.w_data.kdeth0, TID),
+				 KDETH_GET(eh->tid_rdma.w_data.kdeth0, OFFSET),
+				 le32_to_cpu(eh->tid_rdma.w_data.kdeth1),
+				 KDETH_GET(eh->tid_rdma.w_data.kdeth1, JKEY),
+				 be32_to_cpu(eh->tid_rdma.w_data.verbs_qp));
+		break;
+	case OP(TID_RDMA, READ_REQ):
+		trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " "
+				 TID_READ_REQ_PRN,
+				 le32_to_cpu(eh->tid_rdma.r_req.kdeth0),
+				 le32_to_cpu(eh->tid_rdma.r_req.kdeth1),
+				 ib_u64_get(&eh->tid_rdma.r_req.reth.vaddr),
+				 be32_to_cpu(eh->tid_rdma.r_req.reth.rkey),
+				 be32_to_cpu(eh->tid_rdma.r_req.reth.length),
+				 be32_to_cpu(eh->tid_rdma.r_req.tid_flow_psn),
+				 be32_to_cpu(eh->tid_rdma.r_req.tid_flow_qp),
+				 be32_to_cpu(eh->tid_rdma.r_req.verbs_qp));
+		break;
+	case OP(TID_RDMA, READ_RESP):
+		trace_seq_printf(p, TID_RDMA_KDETH_DATA " " AETH_PRN " "
+				 TID_READ_RSP_PRN,
+				 le32_to_cpu(eh->tid_rdma.r_rsp.kdeth0),
+				 KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, KVER),
+				 KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, SH),
+				 KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, INTR),
+				 KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, TIDCTRL),
+				 KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, TID),
+				 KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, OFFSET),
+				 le32_to_cpu(eh->tid_rdma.r_rsp.kdeth1),
+				 KDETH_GET(eh->tid_rdma.r_rsp.kdeth1, JKEY),
+				 be32_to_cpu(eh->tid_rdma.r_rsp.aeth) >> 24,
+				 parse_syndrome(/* aeth */
+					 be32_to_cpu(eh->tid_rdma.r_rsp.aeth)
+					 >> 24),
+				 (be32_to_cpu(eh->tid_rdma.r_rsp.aeth) &
+				  IB_MSN_MASK),
+				 be32_to_cpu(eh->tid_rdma.r_rsp.verbs_qp));
+		break;
+	case OP(TID_RDMA, ACK):
+		trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " "
+				 TID_ACK_PRN,
+				 le32_to_cpu(eh->tid_rdma.ack.kdeth0),
+				 le32_to_cpu(eh->tid_rdma.ack.kdeth1),
+				 be32_to_cpu(eh->tid_rdma.ack.aeth) >> 24,
+				 parse_syndrome(/* aeth */
+					 be32_to_cpu(eh->tid_rdma.ack.aeth)
+					 >> 24),
+				 (be32_to_cpu(eh->tid_rdma.ack.aeth) &
+				  IB_MSN_MASK),
+				 be32_to_cpu(eh->tid_rdma.ack.tid_flow_psn),
+				 be32_to_cpu(eh->tid_rdma.ack.verbs_psn),
+				 be32_to_cpu(eh->tid_rdma.ack.tid_flow_qp),
+				 be32_to_cpu(eh->tid_rdma.ack.verbs_qp));
+		break;
+	case OP(TID_RDMA, RESYNC):
+		trace_seq_printf(p, TID_RDMA_KDETH " " TID_RESYNC_PRN,
+				 le32_to_cpu(eh->tid_rdma.resync.kdeth0),
+				 le32_to_cpu(eh->tid_rdma.resync.kdeth1),
+				 be32_to_cpu(eh->tid_rdma.resync.verbs_qp));
+		break;
 	/* aeth + atomicacketh */
 	case OP(RC, ATOMIC_ACKNOWLEDGE):
 		trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN,
@@ -394,6 +497,21 @@
 	return ret;
 }
 
+u8 hfi1_trace_get_tid_ctrl(u32 ent)
+{
+	return EXP_TID_GET(ent, CTRL);
+}
+
+u16 hfi1_trace_get_tid_len(u32 ent)
+{
+	return EXP_TID_GET(ent, LEN);
+}
+
+u16 hfi1_trace_get_tid_idx(u32 ent)
+{
+	return EXP_TID_GET(ent, IDX);
+}
+
 __hfi1_trace_fn(AFFINITY);
 __hfi1_trace_fn(PKT);
 __hfi1_trace_fn(PROC);
diff --git a/drivers/infiniband/hw/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h
index 8540463..1ce5518 100644
--- a/drivers/infiniband/hw/hfi1/trace.h
+++ b/drivers/infiniband/hw/hfi1/trace.h
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -62,3 +62,5 @@
 #include "trace_rx.h"
 #include "trace_tx.h"
 #include "trace_mmu.h"
+#include "trace_iowait.h"
+#include "trace_tid.h"
diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h
index e62171f..de7a873 100644
--- a/drivers/infiniband/hw/hfi1/trace_dbg.h
+++ b/drivers/infiniband/hw/hfi1/trace_dbg.h
@@ -86,14 +86,14 @@
  * actual function to work and can not be in a macro.
  */
 #define __hfi1_trace_def(lvl) \
-void __hfi1_trace_##lvl(const char *funct, char *fmt, ...);		\
+void __printf(2, 3) __hfi1_trace_##lvl(const char *funct, char *fmt, ...); \
 									\
 DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl,				\
 	TP_PROTO(const char *function, struct va_format *vaf),		\
 	TP_ARGS(function, vaf))
 
 #define __hfi1_trace_fn(lvl) \
-void __hfi1_trace_##lvl(const char *func, char *fmt, ...)		\
+void __printf(2, 3) __hfi1_trace_##lvl(const char *func, char *fmt, ...)\
 {									\
 	struct va_format vaf = {					\
 		.fmt = fmt,						\
diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
index 1dc2c28..2f84290 100644
--- a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
+++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
@@ -79,6 +79,16 @@
 	ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE),             \
 	ib_opcode_name(RC_COMPARE_SWAP),                   \
 	ib_opcode_name(RC_FETCH_ADD),                      \
+	ib_opcode_name(RC_SEND_LAST_WITH_INVALIDATE),      \
+	ib_opcode_name(RC_SEND_ONLY_WITH_INVALIDATE),      \
+	ib_opcode_name(TID_RDMA_WRITE_REQ),	           \
+	ib_opcode_name(TID_RDMA_WRITE_RESP),	           \
+	ib_opcode_name(TID_RDMA_WRITE_DATA),	           \
+	ib_opcode_name(TID_RDMA_WRITE_DATA_LAST),          \
+	ib_opcode_name(TID_RDMA_READ_REQ),	           \
+	ib_opcode_name(TID_RDMA_READ_RESP),	           \
+	ib_opcode_name(TID_RDMA_RESYNC),	           \
+	ib_opcode_name(TID_RDMA_ACK),                      \
 	ib_opcode_name(UC_SEND_FIRST),                     \
 	ib_opcode_name(UC_SEND_MIDDLE),                    \
 	ib_opcode_name(UC_SEND_LAST),                      \
diff --git a/drivers/infiniband/hw/hfi1/trace_iowait.h b/drivers/infiniband/hw/hfi1/trace_iowait.h
new file mode 100644
index 0000000..27f4334
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_iowait.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#if !defined(__HFI1_TRACE_IOWAIT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_IOWAIT_H
+
+#include <linux/tracepoint.h>
+#include "iowait.h"
+#include "verbs.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_iowait
+
+DECLARE_EVENT_CLASS(hfi1_iowait_template,
+		    TP_PROTO(struct iowait *wait, u32 flag),
+		    TP_ARGS(wait, flag),
+		    TP_STRUCT__entry(/* entry */
+			    __field(unsigned long, addr)
+			    __field(unsigned long, flags)
+			    __field(u32, flag)
+			    __field(u32, qpn)
+			    ),
+		    TP_fast_assign(/* assign */
+			    __entry->addr = (unsigned long)wait;
+			    __entry->flags = wait->flags;
+			    __entry->flag = (1 << flag);
+			    __entry->qpn = iowait_to_qp(wait)->ibqp.qp_num;
+			    ),
+		    TP_printk(/* print */
+			    "iowait 0x%lx qp %u flags 0x%lx flag 0x%x",
+			    __entry->addr,
+			    __entry->qpn,
+			    __entry->flags,
+			    __entry->flag
+			    )
+	);
+
+DEFINE_EVENT(hfi1_iowait_template, hfi1_iowait_set,
+	     TP_PROTO(struct iowait *wait, u32 flag),
+	     TP_ARGS(wait, flag));
+
+DEFINE_EVENT(hfi1_iowait_template, hfi1_iowait_clear,
+	     TP_PROTO(struct iowait *wait, u32 flag),
+	     TP_ARGS(wait, flag));
+
+#endif /* __HFI1_TRACE_IOWAIT_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_iowait
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_rc.h b/drivers/infiniband/hw/hfi1/trace_rc.h
index 8ce4765..1ebca37 100644
--- a/drivers/infiniband/hw/hfi1/trace_rc.h
+++ b/drivers/infiniband/hw/hfi1/trace_rc.h
@@ -109,6 +109,54 @@
 	     TP_ARGS(qp, psn)
 );
 
+DEFINE_EVENT(/* event */
+	hfi1_rc_template, hfi1_rc_completion,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+DECLARE_EVENT_CLASS(/* rc_ack */
+	hfi1_rc_ack_template,
+	TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
+		 struct rvt_swqe *wqe),
+	TP_ARGS(qp, aeth, psn, wqe),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u32, aeth)
+		__field(u32, psn)
+		__field(u8, opcode)
+		__field(u32, spsn)
+		__field(u32, lpsn)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->aeth = aeth;
+		__entry->psn = psn;
+		__entry->opcode = wqe->wr.opcode;
+		__entry->spsn = wqe->psn;
+		__entry->lpsn = wqe->lpsn;
+	),
+	TP_printk(/* print */
+		"[%s] qpn 0x%x aeth 0x%x psn 0x%x opcode 0x%x spsn 0x%x lpsn 0x%x",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->aeth,
+		__entry->psn,
+		__entry->opcode,
+		__entry->spsn,
+		__entry->lpsn
+	)
+);
+
+DEFINE_EVENT(/* do_rc_ack */
+	hfi1_rc_ack_template, hfi1_rc_ack_do,
+	TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
+		 struct rvt_swqe *wqe),
+	TP_ARGS(qp, aeth, psn, wqe)
+);
+
 #endif /* __HFI1_TRACE_RC_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h
index 7eceb57..3cec960 100644
--- a/drivers/infiniband/hw/hfi1/trace_rx.h
+++ b/drivers/infiniband/hw/hfi1/trace_rx.h
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -128,111 +128,6 @@
 		      )
 );
 
-DECLARE_EVENT_CLASS(
-	    hfi1_exp_tid_reg_unreg,
-	    TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr,
-		     u32 npages, unsigned long va, unsigned long pa,
-		     dma_addr_t dma),
-	    TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
-	    TP_STRUCT__entry(
-			     __field(unsigned int, ctxt)
-			     __field(u16, subctxt)
-			     __field(u32, rarr)
-			     __field(u32, npages)
-			     __field(unsigned long, va)
-			     __field(unsigned long, pa)
-			     __field(dma_addr_t, dma)
-			     ),
-	    TP_fast_assign(
-			   __entry->ctxt = ctxt;
-			   __entry->subctxt = subctxt;
-			   __entry->rarr = rarr;
-			   __entry->npages = npages;
-			   __entry->va = va;
-			   __entry->pa = pa;
-			   __entry->dma = dma;
-			   ),
-	    TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
-		      __entry->ctxt,
-		      __entry->subctxt,
-		      __entry->rarr,
-		      __entry->npages,
-		      __entry->pa,
-		      __entry->va,
-		      __entry->dma
-		      )
-	);
-
-DEFINE_EVENT(
-	hfi1_exp_tid_reg_unreg, hfi1_exp_tid_unreg,
-	TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
-		 unsigned long va, unsigned long pa, dma_addr_t dma),
-	TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma));
-
-DEFINE_EVENT(
-	hfi1_exp_tid_reg_unreg, hfi1_exp_tid_reg,
-	TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
-		 unsigned long va, unsigned long pa, dma_addr_t dma),
-	TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma));
-
-TRACE_EVENT(
-	hfi1_put_tid,
-	TP_PROTO(struct hfi1_devdata *dd,
-		 u32 index, u32 type, unsigned long pa, u16 order),
-	TP_ARGS(dd, index, type, pa, order),
-	TP_STRUCT__entry(
-		DD_DEV_ENTRY(dd)
-		__field(unsigned long, pa);
-		__field(u32, index);
-		__field(u32, type);
-		__field(u16, order);
-	),
-	TP_fast_assign(
-		DD_DEV_ASSIGN(dd);
-		__entry->pa = pa;
-		__entry->index = index;
-		__entry->type = type;
-		__entry->order = order;
-	),
-	TP_printk("[%s] type %s pa %lx index %u order %u",
-		  __get_str(dev),
-		  show_tidtype(__entry->type),
-		  __entry->pa,
-		  __entry->index,
-		  __entry->order
-	)
-);
-
-TRACE_EVENT(hfi1_exp_tid_inval,
-	    TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr,
-		     u32 npages, dma_addr_t dma),
-	    TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
-	    TP_STRUCT__entry(
-			     __field(unsigned int, ctxt)
-			     __field(u16, subctxt)
-			     __field(unsigned long, va)
-			     __field(u32, rarr)
-			     __field(u32, npages)
-			     __field(dma_addr_t, dma)
-			     ),
-	    TP_fast_assign(
-			   __entry->ctxt = ctxt;
-			   __entry->subctxt = subctxt;
-			   __entry->va = va;
-			   __entry->rarr = rarr;
-			   __entry->npages = npages;
-			   __entry->dma = dma;
-			  ),
-	    TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
-		      __entry->ctxt,
-		      __entry->subctxt,
-		      __entry->rarr,
-		      __entry->npages,
-		      __entry->va,
-		      __entry->dma
-		      )
-	    );
-
 TRACE_EVENT(hfi1_mmu_invalidate,
 	    TP_PROTO(unsigned int ctxt, u16 subctxt, const char *type,
 		     unsigned long start, unsigned long end),
diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h
new file mode 100644
index 0000000..343fb98
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_tid.h
@@ -0,0 +1,1642 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#if !defined(__HFI1_TRACE_TID_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_TID_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+#define tidtype_name(type) { PT_##type, #type }
+#define show_tidtype(type)                   \
+__print_symbolic(type,                       \
+	tidtype_name(EXPECTED),              \
+	tidtype_name(EAGER),                 \
+	tidtype_name(INVALID))               \
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_tid
+
+u8 hfi1_trace_get_tid_ctrl(u32 ent);
+u16 hfi1_trace_get_tid_len(u32 ent);
+u16 hfi1_trace_get_tid_idx(u32 ent);
+
+#define OPFN_PARAM_PRN "[%s] qpn 0x%x %s OPFN: qp 0x%x, max read %u, " \
+		       "max write %u, max length %u, jkey 0x%x timeout %u " \
+		       "urg %u"
+
+#define TID_FLOW_PRN "[%s] qpn 0x%x flow %d: idx %d resp_ib_psn 0x%x " \
+		     "generation 0x%x fpsn 0x%x-%x r_next_psn 0x%x " \
+		     "ib_psn 0x%x-%x npagesets %u tnode_cnt %u " \
+		     "tidcnt %u tid_idx %u tid_offset %u length %u sent %u"
+
+#define TID_NODE_PRN "[%s] qpn 0x%x  %s idx %u grp base 0x%x map 0x%x " \
+		     "used %u cnt %u"
+
+#define RSP_INFO_PRN "[%s] qpn 0x%x state 0x%x s_state 0x%x psn 0x%x " \
+		     "r_psn 0x%x r_state 0x%x r_flags 0x%x " \
+		     "r_head_ack_queue %u s_tail_ack_queue %u " \
+		     "s_acked_ack_queue %u s_ack_state 0x%x " \
+		     "s_nak_state 0x%x s_flags 0x%x ps_flags 0x%x " \
+		     "iow_flags 0x%lx"
+
+#define SENDER_INFO_PRN "[%s] qpn 0x%x state 0x%x s_cur %u s_tail %u " \
+			"s_head %u s_acked %u s_last %u s_psn 0x%x " \
+			"s_last_psn 0x%x s_flags 0x%x ps_flags 0x%x " \
+			"iow_flags 0x%lx s_state 0x%x s_num_rd %u s_retry %u"
+
+#define TID_READ_SENDER_PRN "[%s] qpn 0x%x newreq %u tid_r_reqs %u " \
+			    "tid_r_comp %u pending_tid_r_segs %u " \
+			    "s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx " \
+			    "s_state 0x%x hw_flow_index %u generation 0x%x " \
+			    "fpsn 0x%x"
+
+#define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \
+		    "cur_seg %u comp_seg %u ack_seg %u alloc_seg %u " \
+		    "total_segs %u setup_head %u clear_tail %u flow_idx %u " \
+		    "acked_tail %u state %u r_ack_psn 0x%x r_flow_psn 0x%x " \
+		    "r_last_ackd 0x%x s_next_psn 0x%x"
+
+#define RCV_ERR_PRN "[%s] qpn 0x%x s_flags 0x%x state 0x%x " \
+		    "s_acked_ack_queue %u s_tail_ack_queue %u " \
+		    "r_head_ack_queue %u opcode 0x%x psn 0x%x r_psn 0x%x " \
+		    " diff %d"
+
+#define TID_WRITE_RSPDR_PRN "[%s] qpn 0x%x r_tid_head %u r_tid_tail %u " \
+			    "r_tid_ack %u r_tid_alloc %u alloc_w_segs %u " \
+			    "pending_tid_w_segs %u sync_pt %s " \
+			    "ps_nak_psn 0x%x ps_nak_state 0x%x " \
+			    "prnr_nak_state 0x%x hw_flow_index %u generation "\
+			    "0x%x fpsn 0x%x resync %s" \
+			    "r_next_psn_kdeth 0x%x"
+
+#define TID_WRITE_SENDER_PRN "[%s] qpn 0x%x newreq %u s_tid_cur %u " \
+			     "s_tid_tail %u s_tid_head %u " \
+			     "pending_tid_w_resp %u n_requests %u " \
+			     "n_tid_requests %u s_flags 0x%x ps_flags 0x%x "\
+			     "iow_flags 0x%lx s_state 0x%x s_retry %u"
+
+#define KDETH_EFLAGS_ERR_PRN "[%s] qpn 0x%x  TID ERR: RcvType 0x%x " \
+			     "RcvTypeError 0x%x PSN 0x%x"
+
+DECLARE_EVENT_CLASS(/* class */
+	hfi1_exp_tid_reg_unreg,
+	TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
+		 unsigned long va, unsigned long pa, dma_addr_t dma),
+	TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
+	TP_STRUCT__entry(/* entry */
+		__field(unsigned int, ctxt)
+		__field(u16, subctxt)
+		__field(u32, rarr)
+		__field(u32, npages)
+		__field(unsigned long, va)
+		__field(unsigned long, pa)
+		__field(dma_addr_t, dma)
+	),
+	TP_fast_assign(/* assign */
+		__entry->ctxt = ctxt;
+		__entry->subctxt = subctxt;
+		__entry->rarr = rarr;
+		__entry->npages = npages;
+		__entry->va = va;
+		__entry->pa = pa;
+		__entry->dma = dma;
+	),
+	TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
+		  __entry->ctxt,
+		  __entry->subctxt,
+		  __entry->rarr,
+		  __entry->npages,
+		  __entry->pa,
+		  __entry->va,
+		  __entry->dma
+	)
+);
+
+DEFINE_EVENT(/* exp_tid_unreg */
+	hfi1_exp_tid_reg_unreg, hfi1_exp_tid_unreg,
+	TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
+		 unsigned long va, unsigned long pa, dma_addr_t dma),
+	TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma)
+);
+
+DEFINE_EVENT(/* exp_tid_reg */
+	hfi1_exp_tid_reg_unreg, hfi1_exp_tid_reg,
+	TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
+		 unsigned long va, unsigned long pa, dma_addr_t dma),
+	TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma)
+);
+
+TRACE_EVENT(/* put_tid */
+	hfi1_put_tid,
+	TP_PROTO(struct hfi1_devdata *dd,
+		 u32 index, u32 type, unsigned long pa, u16 order),
+	TP_ARGS(dd, index, type, pa, order),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd)
+		__field(unsigned long, pa);
+		__field(u32, index);
+		__field(u32, type);
+		__field(u16, order);
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd);
+		__entry->pa = pa;
+		__entry->index = index;
+		__entry->type = type;
+		__entry->order = order;
+	),
+	TP_printk("[%s] type %s pa %lx index %u order %u",
+		  __get_str(dev),
+		  show_tidtype(__entry->type),
+		  __entry->pa,
+		  __entry->index,
+		  __entry->order
+	)
+);
+
+TRACE_EVENT(/* exp_tid_inval */
+	hfi1_exp_tid_inval,
+	TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr,
+		 u32 npages, dma_addr_t dma),
+	TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
+	TP_STRUCT__entry(/* entry */
+		__field(unsigned int, ctxt)
+		__field(u16, subctxt)
+		__field(unsigned long, va)
+		__field(u32, rarr)
+		__field(u32, npages)
+		__field(dma_addr_t, dma)
+	),
+	TP_fast_assign(/* assign */
+		__entry->ctxt = ctxt;
+		__entry->subctxt = subctxt;
+		__entry->va = va;
+		__entry->rarr = rarr;
+		__entry->npages = npages;
+		__entry->dma = dma;
+	),
+	TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
+		  __entry->ctxt,
+		  __entry->subctxt,
+		  __entry->rarr,
+		  __entry->npages,
+		  __entry->va,
+		  __entry->dma
+	)
+);
+
+DECLARE_EVENT_CLASS(/* opfn_state */
+	hfi1_opfn_state_template,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u16, requested)
+		__field(u16, completed)
+		__field(u8, curr)
+	),
+	TP_fast_assign(/* assign */
+		struct hfi1_qp_priv *priv = qp->priv;
+
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->requested = priv->opfn.requested;
+		__entry->completed = priv->opfn.completed;
+		__entry->curr = priv->opfn.curr;
+	),
+	TP_printk(/* print */
+		"[%s] qpn 0x%x requested 0x%x completed 0x%x curr 0x%x",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->requested,
+		__entry->completed,
+		__entry->curr
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_opfn_state_template, hfi1_opfn_state_conn_request,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_opfn_state_template, hfi1_opfn_state_sched_conn_request,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_opfn_state_template, hfi1_opfn_state_conn_response,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_opfn_state_template, hfi1_opfn_state_conn_reply,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_opfn_state_template, hfi1_opfn_state_conn_error,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DECLARE_EVENT_CLASS(/* opfn_data */
+	hfi1_opfn_data_template,
+	TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data),
+	TP_ARGS(qp, capcode, data),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u32, state)
+		__field(u8, capcode)
+		__field(u64, data)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->state = qp->state;
+		__entry->capcode = capcode;
+		__entry->data = data;
+	),
+	TP_printk(/* printk */
+		"[%s] qpn 0x%x (state 0x%x) Capcode %u data 0x%llx",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->state,
+		__entry->capcode,
+		__entry->data
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_opfn_data_template, hfi1_opfn_data_conn_request,
+	TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data),
+	TP_ARGS(qp, capcode, data)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_opfn_data_template, hfi1_opfn_data_conn_response,
+	TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data),
+	TP_ARGS(qp, capcode, data)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_opfn_data_template, hfi1_opfn_data_conn_reply,
+	TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data),
+	TP_ARGS(qp, capcode, data)
+);
+
+DECLARE_EVENT_CLASS(/* opfn_param */
+	hfi1_opfn_param_template,
+	TP_PROTO(struct rvt_qp *qp, char remote,
+		 struct tid_rdma_params *param),
+	TP_ARGS(qp, remote, param),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(char, remote)
+		__field(u32, param_qp)
+		__field(u32, max_len)
+		__field(u16, jkey)
+		__field(u8, max_read)
+		__field(u8, max_write)
+		__field(u8, timeout)
+		__field(u8, urg)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->remote = remote;
+		__entry->param_qp = param->qp;
+		__entry->max_len = param->max_len;
+		__entry->jkey = param->jkey;
+		__entry->max_read = param->max_read;
+		__entry->max_write = param->max_write;
+		__entry->timeout = param->timeout;
+		__entry->urg = param->urg;
+	),
+	TP_printk(/* print */
+		OPFN_PARAM_PRN,
+		__get_str(dev),
+		__entry->qpn,
+		__entry->remote ? "remote" : "local",
+		__entry->param_qp,
+		__entry->max_read,
+		__entry->max_write,
+		__entry->max_len,
+		__entry->jkey,
+		__entry->timeout,
+		__entry->urg
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_opfn_param_template, hfi1_opfn_param,
+	TP_PROTO(struct rvt_qp *qp, char remote,
+		 struct tid_rdma_params *param),
+	TP_ARGS(qp, remote, param)
+);
+
+DECLARE_EVENT_CLASS(/* msg */
+	hfi1_msg_template,
+	TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+	TP_ARGS(qp, msg, more),
+	TP_STRUCT__entry(/* entry */
+		__field(u32, qpn)
+		__string(msg, msg)
+		__field(u64, more)
+	),
+	TP_fast_assign(/* assign */
+		__entry->qpn = qp ? qp->ibqp.qp_num : 0;
+		__assign_str(msg, msg);
+		__entry->more = more;
+	),
+	TP_printk(/* print */
+		"qpn 0x%x %s 0x%llx",
+		__entry->qpn,
+		__get_str(msg),
+		__entry->more
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_msg_template, hfi1_msg_opfn_conn_request,
+	TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+	TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_msg_template, hfi1_msg_opfn_conn_error,
+	TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+	TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_msg_template, hfi1_msg_alloc_tids,
+	TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+	TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_msg_template, hfi1_msg_tid_restart_req,
+	TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+	TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_msg_template, hfi1_msg_handle_kdeth_eflags,
+	TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+	TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_msg_template, hfi1_msg_tid_timeout,
+	TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+	TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_msg_template, hfi1_msg_tid_retry_timeout,
+	TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+	TP_ARGS(qp, msg, more)
+);
+
+DECLARE_EVENT_CLASS(/* tid_flow_page */
+	hfi1_tid_flow_page_template,
+	TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index,
+		 char mtu8k, char v1, void *vaddr),
+	TP_ARGS(qp, flow, index, mtu8k, v1, vaddr),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(char, mtu8k)
+		__field(char, v1)
+		__field(u32, index)
+		__field(u64, page)
+		__field(u64, vaddr)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->mtu8k = mtu8k;
+		__entry->v1 = v1;
+		__entry->index = index;
+		__entry->page = vaddr ? (u64)virt_to_page(vaddr) : 0ULL;
+		__entry->vaddr = (u64)vaddr;
+	),
+	TP_printk(/* print */
+		"[%s] qpn 0x%x page[%u]: page 0x%llx %s 0x%llx",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->index,
+		__entry->page,
+		__entry->mtu8k ? (__entry->v1 ? "v1" : "v0") : "vaddr",
+		__entry->vaddr
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_page_template, hfi1_tid_flow_page,
+	TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index,
+		 char mtu8k, char v1, void *vaddr),
+	TP_ARGS(qp, flow, index, mtu8k, v1, vaddr)
+);
+
+DECLARE_EVENT_CLASS(/* tid_pageset */
+	hfi1_tid_pageset_template,
+	TP_PROTO(struct rvt_qp *qp, u32 index, u16 idx, u16 count),
+	TP_ARGS(qp, index, idx, count),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u32, index)
+		__field(u16, idx)
+		__field(u16, count)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->index = index;
+		__entry->idx = idx;
+		__entry->count = count;
+	),
+	TP_printk(/* print */
+		"[%s] qpn 0x%x list[%u]: idx %u count %u",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->index,
+		__entry->idx,
+		__entry->count
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_pageset_template, hfi1_tid_pageset,
+	TP_PROTO(struct rvt_qp *qp, u32 index, u16 idx, u16 count),
+	TP_ARGS(qp, index, idx, count)
+);
+
+DECLARE_EVENT_CLASS(/* tid_flow */
+	hfi1_tid_flow_template,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(int, index)
+		__field(int, idx)
+		__field(u32, resp_ib_psn)
+		__field(u32, generation)
+		__field(u32, fspsn)
+		__field(u32, flpsn)
+		__field(u32, r_next_psn)
+		__field(u32, ib_spsn)
+		__field(u32, ib_lpsn)
+		__field(u32, npagesets)
+		__field(u32, tnode_cnt)
+		__field(u32, tidcnt)
+		__field(u32, tid_idx)
+		__field(u32, tid_offset)
+		__field(u32, length)
+		__field(u32, sent)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->index = index;
+		__entry->idx = flow->idx;
+		__entry->resp_ib_psn = flow->flow_state.resp_ib_psn;
+		__entry->generation = flow->flow_state.generation;
+		__entry->fspsn = full_flow_psn(flow,
+					       flow->flow_state.spsn);
+		__entry->flpsn = full_flow_psn(flow,
+					       flow->flow_state.lpsn);
+		__entry->r_next_psn = flow->flow_state.r_next_psn;
+		__entry->ib_spsn = flow->flow_state.ib_spsn;
+		__entry->ib_lpsn = flow->flow_state.ib_lpsn;
+		__entry->npagesets = flow->npagesets;
+		__entry->tnode_cnt = flow->tnode_cnt;
+		__entry->tidcnt = flow->tidcnt;
+		__entry->tid_idx = flow->tid_idx;
+		__entry->tid_offset =  flow->tid_offset;
+		__entry->length = flow->length;
+		__entry->sent = flow->sent;
+	),
+	TP_printk(/* print */
+		TID_FLOW_PRN,
+		__get_str(dev),
+		__entry->qpn,
+		__entry->index,
+		__entry->idx,
+		__entry->resp_ib_psn,
+		__entry->generation,
+		__entry->fspsn,
+		__entry->flpsn,
+		__entry->r_next_psn,
+		__entry->ib_spsn,
+		__entry->ib_lpsn,
+		__entry->npagesets,
+		__entry->tnode_cnt,
+		__entry->tidcnt,
+		__entry->tid_idx,
+		__entry->tid_offset,
+		__entry->length,
+		__entry->sent
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_alloc,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_build_read_pkt,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_build_read_resp,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_rcv_read_req,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_rcv_read_resp,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_restart_req,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_build_write_resp,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_rcv_write_resp,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_build_write_data,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_rcv_tid_ack,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_rcv_resync,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_handle_kdeth_eflags,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_flow_template, hfi1_tid_flow_read_kdeth_eflags,
+	TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+	TP_ARGS(qp, index, flow)
+);
+
+DECLARE_EVENT_CLASS(/* tid_node */
+	hfi1_tid_node_template,
+	TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base,
+		 u8 map, u8 used, u8 cnt),
+	TP_ARGS(qp, msg, index, base, map, used, cnt),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__string(msg, msg)
+		__field(u32, index)
+		__field(u32, base)
+		__field(u8, map)
+		__field(u8, used)
+		__field(u8, cnt)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__assign_str(msg, msg);
+		__entry->index = index;
+		__entry->base = base;
+		__entry->map = map;
+		__entry->used = used;
+		__entry->cnt = cnt;
+	),
+	TP_printk(/* print */
+		TID_NODE_PRN,
+		__get_str(dev),
+		__entry->qpn,
+		__get_str(msg),
+		__entry->index,
+		__entry->base,
+		__entry->map,
+		__entry->used,
+		__entry->cnt
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_node_template, hfi1_tid_node_add,
+	TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base,
+		 u8 map, u8 used, u8 cnt),
+	TP_ARGS(qp, msg, index, base, map, used, cnt)
+);
+
+DECLARE_EVENT_CLASS(/* tid_entry */
+	hfi1_tid_entry_template,
+	TP_PROTO(struct rvt_qp *qp, int index, u32 ent),
+	TP_ARGS(qp, index, ent),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(int, index)
+		__field(u8, ctrl)
+		__field(u16, idx)
+		__field(u16, len)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->index = index;
+		__entry->ctrl = hfi1_trace_get_tid_ctrl(ent);
+		__entry->idx = hfi1_trace_get_tid_idx(ent);
+		__entry->len = hfi1_trace_get_tid_len(ent);
+	),
+	TP_printk(/* print */
+		"[%s] qpn 0x%x TID entry %d: idx %u len %u ctrl 0x%x",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->index,
+		__entry->idx,
+		__entry->len,
+		__entry->ctrl
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_entry_template, hfi1_tid_entry_alloc,
+	TP_PROTO(struct rvt_qp *qp, int index, u32 entry),
+	TP_ARGS(qp, index, entry)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_entry_template, hfi1_tid_entry_build_read_resp,
+	TP_PROTO(struct rvt_qp *qp, int index, u32 ent),
+	TP_ARGS(qp, index, ent)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_entry_template, hfi1_tid_entry_rcv_read_req,
+	TP_PROTO(struct rvt_qp *qp, int index, u32 ent),
+	TP_ARGS(qp, index, ent)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_entry_template, hfi1_tid_entry_rcv_write_resp,
+	TP_PROTO(struct rvt_qp *qp, int index, u32 entry),
+	TP_ARGS(qp, index, entry)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_entry_template, hfi1_tid_entry_build_write_data,
+	TP_PROTO(struct rvt_qp *qp, int index, u32 entry),
+	TP_ARGS(qp, index, entry)
+);
+
+DECLARE_EVENT_CLASS(/* rsp_info */
+	hfi1_responder_info_template,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u8, state)
+		__field(u8, s_state)
+		__field(u32, psn)
+		__field(u32, r_psn)
+		__field(u8, r_state)
+		__field(u8, r_flags)
+		__field(u8, r_head_ack_queue)
+		__field(u8, s_tail_ack_queue)
+		__field(u8, s_acked_ack_queue)
+		__field(u8, s_ack_state)
+		__field(u8, s_nak_state)
+		__field(u8, r_nak_state)
+		__field(u32, s_flags)
+		__field(u32, ps_flags)
+		__field(unsigned long, iow_flags)
+	),
+	TP_fast_assign(/* assign */
+		struct hfi1_qp_priv *priv = qp->priv;
+
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->state = qp->state;
+		__entry->s_state = qp->s_state;
+		__entry->psn = psn;
+		__entry->r_psn = qp->r_psn;
+		__entry->r_state = qp->r_state;
+		__entry->r_flags = qp->r_flags;
+		__entry->r_head_ack_queue = qp->r_head_ack_queue;
+		__entry->s_tail_ack_queue = qp->s_tail_ack_queue;
+		__entry->s_acked_ack_queue = qp->s_acked_ack_queue;
+		__entry->s_ack_state = qp->s_ack_state;
+		__entry->s_nak_state = qp->s_nak_state;
+		__entry->s_flags = qp->s_flags;
+		__entry->ps_flags = priv->s_flags;
+		__entry->iow_flags = priv->s_iowait.flags;
+	),
+	TP_printk(/* print */
+		RSP_INFO_PRN,
+		__get_str(dev),
+		__entry->qpn,
+		__entry->state,
+		__entry->s_state,
+		__entry->psn,
+		__entry->r_psn,
+		__entry->r_state,
+		__entry->r_flags,
+		__entry->r_head_ack_queue,
+		__entry->s_tail_ack_queue,
+		__entry->s_acked_ack_queue,
+		__entry->s_ack_state,
+		__entry->s_nak_state,
+		__entry->s_flags,
+		__entry->ps_flags,
+		__entry->iow_flags
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_responder_info_template, hfi1_rsp_make_rc_ack,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_responder_info_template, hfi1_rsp_rcv_tid_read_req,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_responder_info_template, hfi1_rsp_tid_rcv_error,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_responder_info_template, hfi1_rsp_tid_write_alloc_res,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_responder_info_template, hfi1_rsp_rcv_tid_write_req,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_responder_info_template, hfi1_rsp_build_tid_write_resp,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_responder_info_template, hfi1_rsp_rcv_tid_write_data,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_responder_info_template, hfi1_rsp_make_tid_ack,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_responder_info_template, hfi1_rsp_handle_kdeth_eflags,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_responder_info_template, hfi1_rsp_read_kdeth_eflags,
+	TP_PROTO(struct rvt_qp *qp, u32 psn),
+	TP_ARGS(qp, psn)
+);
+
+DECLARE_EVENT_CLASS(/* sender_info */
+	hfi1_sender_info_template,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u8, state)
+		__field(u32, s_cur)
+		__field(u32, s_tail)
+		__field(u32, s_head)
+		__field(u32, s_acked)
+		__field(u32, s_last)
+		__field(u32, s_psn)
+		__field(u32, s_last_psn)
+		__field(u32, s_flags)
+		__field(u32, ps_flags)
+		__field(unsigned long, iow_flags)
+		__field(u8, s_state)
+		__field(u8, s_num_rd)
+		__field(u8, s_retry)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->state = qp->state;
+		__entry->s_cur = qp->s_cur;
+		__entry->s_tail = qp->s_tail;
+		__entry->s_head = qp->s_head;
+		__entry->s_acked = qp->s_acked;
+		__entry->s_last = qp->s_last;
+		__entry->s_psn = qp->s_psn;
+		__entry->s_last_psn = qp->s_last_psn;
+		__entry->s_flags = qp->s_flags;
+		__entry->ps_flags = ((struct hfi1_qp_priv *)qp->priv)->s_flags;
+		__entry->iow_flags =
+			((struct hfi1_qp_priv *)qp->priv)->s_iowait.flags;
+		__entry->s_state = qp->s_state;
+		__entry->s_num_rd = qp->s_num_rd_atomic;
+		__entry->s_retry = qp->s_retry;
+	),
+	TP_printk(/* print */
+		SENDER_INFO_PRN,
+		__get_str(dev),
+		__entry->qpn,
+		__entry->state,
+		__entry->s_cur,
+		__entry->s_tail,
+		__entry->s_head,
+		__entry->s_acked,
+		__entry->s_last,
+		__entry->s_psn,
+		__entry->s_last_psn,
+		__entry->s_flags,
+		__entry->ps_flags,
+		__entry->iow_flags,
+		__entry->s_state,
+		__entry->s_num_rd,
+		__entry->s_retry
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_sender_info_template, hfi1_sender_make_rc_req,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_sender_info_template, hfi1_sender_reset_psn,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_sender_info_template, hfi1_sender_restart_rc,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_sender_info_template, hfi1_sender_do_rc_ack,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_sender_info_template, hfi1_sender_rcv_tid_read_resp,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_sender_info_template, hfi1_sender_rcv_tid_ack,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_sender_info_template, hfi1_sender_make_tid_pkt,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_sender_info_template, hfi1_sender_read_kdeth_eflags,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DECLARE_EVENT_CLASS(/* tid_read_sender */
+	hfi1_tid_read_sender_template,
+	TP_PROTO(struct rvt_qp *qp, char newreq),
+	TP_ARGS(qp, newreq),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(char, newreq)
+		__field(u32, tid_r_reqs)
+		__field(u32, tid_r_comp)
+		__field(u32, pending_tid_r_segs)
+		__field(u32, s_flags)
+		__field(u32, ps_flags)
+		__field(unsigned long, iow_flags)
+		__field(u8, s_state)
+		__field(u32, hw_flow_index)
+		__field(u32, generation)
+		__field(u32, fpsn)
+	),
+	TP_fast_assign(/* assign */
+		struct hfi1_qp_priv *priv = qp->priv;
+
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->newreq = newreq;
+		__entry->tid_r_reqs = priv->tid_r_reqs;
+		__entry->tid_r_comp = priv->tid_r_comp;
+		__entry->pending_tid_r_segs = priv->pending_tid_r_segs;
+		__entry->s_flags = qp->s_flags;
+		__entry->ps_flags = priv->s_flags;
+		__entry->iow_flags = priv->s_iowait.flags;
+		__entry->s_state = priv->s_state;
+		__entry->hw_flow_index = priv->flow_state.index;
+		__entry->generation = priv->flow_state.generation;
+		__entry->fpsn = priv->flow_state.psn;
+	),
+	TP_printk(/* print */
+		TID_READ_SENDER_PRN,
+		__get_str(dev),
+		__entry->qpn,
+		__entry->newreq,
+		__entry->tid_r_reqs,
+		__entry->tid_r_comp,
+		__entry->pending_tid_r_segs,
+		__entry->s_flags,
+		__entry->ps_flags,
+		__entry->iow_flags,
+		__entry->s_state,
+		__entry->hw_flow_index,
+		__entry->generation,
+		__entry->fpsn
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_read_sender_template, hfi1_tid_read_sender_make_req,
+	TP_PROTO(struct rvt_qp *qp, char newreq),
+	TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_read_sender_template, hfi1_tid_read_sender_kdeth_eflags,
+	TP_PROTO(struct rvt_qp *qp, char newreq),
+	TP_ARGS(qp, newreq)
+);
+
+DECLARE_EVENT_CLASS(/* tid_rdma_request */
+	hfi1_tid_rdma_request_template,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(char, newreq)
+		__field(u8, opcode)
+		__field(u32, psn)
+		__field(u32, lpsn)
+		__field(u32, cur_seg)
+		__field(u32, comp_seg)
+		__field(u32, ack_seg)
+		__field(u32, alloc_seg)
+		__field(u32, total_segs)
+		__field(u16, setup_head)
+		__field(u16, clear_tail)
+		__field(u16, flow_idx)
+		__field(u16, acked_tail)
+		__field(u32, state)
+		__field(u32, r_ack_psn)
+		__field(u32, r_flow_psn)
+		__field(u32, r_last_acked)
+		__field(u32, s_next_psn)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->newreq = newreq;
+		__entry->opcode = opcode;
+		__entry->psn = psn;
+		__entry->lpsn = lpsn;
+		__entry->cur_seg = req->cur_seg;
+		__entry->comp_seg = req->comp_seg;
+		__entry->ack_seg = req->ack_seg;
+		__entry->alloc_seg = req->alloc_seg;
+		__entry->total_segs = req->total_segs;
+		__entry->setup_head = req->setup_head;
+		__entry->clear_tail = req->clear_tail;
+		__entry->flow_idx = req->flow_idx;
+		__entry->acked_tail = req->acked_tail;
+		__entry->state = req->state;
+		__entry->r_ack_psn = req->r_ack_psn;
+		__entry->r_flow_psn = req->r_flow_psn;
+		__entry->r_last_acked = req->r_last_acked;
+		__entry->s_next_psn = req->s_next_psn;
+	),
+	TP_printk(/* print */
+		TID_REQ_PRN,
+		__get_str(dev),
+		__entry->qpn,
+		__entry->newreq,
+		__entry->opcode,
+		__entry->psn,
+		__entry->lpsn,
+		__entry->cur_seg,
+		__entry->comp_seg,
+		__entry->ack_seg,
+		__entry->alloc_seg,
+		__entry->total_segs,
+		__entry->setup_head,
+		__entry->clear_tail,
+		__entry->flow_idx,
+		__entry->acked_tail,
+		__entry->state,
+		__entry->r_ack_psn,
+		__entry->r_flow_psn,
+		__entry->r_last_acked,
+		__entry->s_next_psn
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_make_req_read,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_build_read_req,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_read_req,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_read_resp,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_err,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_restart_req,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_setup_tid_wqe,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_write_alloc_res,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_req,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_build_write_resp,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_resp,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_data,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_tid_ack,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_tid_retry_timeout,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_resync,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_make_tid_pkt,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_make_tid_ack,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_handle_kdeth_eflags,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_read_kdeth_eflags,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_make_rc_ack_write,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_make_req_write,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_rdma_request_template, hfi1_tid_req_update_num_rd_atomic,
+	TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+		 struct tid_rdma_request *req),
+	TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DECLARE_EVENT_CLASS(/* rc_rcv_err */
+	hfi1_rc_rcv_err_template,
+	TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff),
+	TP_ARGS(qp, opcode, psn, diff),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u32, s_flags)
+		__field(u8, state)
+		__field(u8, s_acked_ack_queue)
+		__field(u8, s_tail_ack_queue)
+		__field(u8, r_head_ack_queue)
+		__field(u32, opcode)
+		__field(u32, psn)
+		__field(u32, r_psn)
+		__field(int, diff)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->s_flags = qp->s_flags;
+		__entry->state = qp->state;
+		__entry->s_acked_ack_queue = qp->s_acked_ack_queue;
+		__entry->s_tail_ack_queue = qp->s_tail_ack_queue;
+		__entry->r_head_ack_queue = qp->r_head_ack_queue;
+		__entry->opcode = opcode;
+		__entry->psn = psn;
+		__entry->r_psn = qp->r_psn;
+		__entry->diff = diff;
+	),
+	TP_printk(/* print */
+		RCV_ERR_PRN,
+		__get_str(dev),
+		__entry->qpn,
+		__entry->s_flags,
+		__entry->state,
+		__entry->s_acked_ack_queue,
+		__entry->s_tail_ack_queue,
+		__entry->r_head_ack_queue,
+		__entry->opcode,
+		__entry->psn,
+		__entry->r_psn,
+		__entry->diff
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_rc_rcv_err_template, hfi1_tid_rdma_rcv_err,
+	TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff),
+	TP_ARGS(qp, opcode, psn, diff)
+);
+
+DECLARE_EVENT_CLASS(/* sge  */
+	hfi1_sge_template,
+	TP_PROTO(struct rvt_qp *qp, int index, struct rvt_sge *sge),
+	TP_ARGS(qp, index, sge),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(int, index)
+		__field(u64, vaddr)
+		__field(u32, sge_length)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->index = index;
+		__entry->vaddr = (u64)sge->vaddr;
+		__entry->sge_length = sge->sge_length;
+	),
+	TP_printk(/* print */
+		"[%s] qpn 0x%x sge %d: vaddr 0x%llx sge_length %u",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->index,
+		__entry->vaddr,
+		__entry->sge_length
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_sge_template, hfi1_sge_check_align,
+	TP_PROTO(struct rvt_qp *qp, int index, struct rvt_sge *sge),
+	TP_ARGS(qp, index, sge)
+);
+
+DECLARE_EVENT_CLASS(/* tid_write_sp */
+	hfi1_tid_write_rsp_template,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u32, r_tid_head)
+		__field(u32, r_tid_tail)
+		__field(u32, r_tid_ack)
+		__field(u32, r_tid_alloc)
+		__field(u32, alloc_w_segs)
+		__field(u32, pending_tid_w_segs)
+		__field(bool, sync_pt)
+		__field(u32, ps_nak_psn)
+		__field(u8, ps_nak_state)
+		__field(u8, prnr_nak_state)
+		__field(u32, hw_flow_index)
+		__field(u32, generation)
+		__field(u32, fpsn)
+		__field(bool, resync)
+		__field(u32, r_next_psn_kdeth)
+	),
+	TP_fast_assign(/* assign */
+		struct hfi1_qp_priv *priv = qp->priv;
+
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->r_tid_head = priv->r_tid_head;
+		__entry->r_tid_tail = priv->r_tid_tail;
+		__entry->r_tid_ack = priv->r_tid_ack;
+		__entry->r_tid_alloc = priv->r_tid_alloc;
+		__entry->alloc_w_segs = priv->alloc_w_segs;
+		__entry->pending_tid_w_segs = priv->pending_tid_w_segs;
+		__entry->sync_pt = priv->sync_pt;
+		__entry->ps_nak_psn = priv->s_nak_psn;
+		__entry->ps_nak_state = priv->s_nak_state;
+		__entry->prnr_nak_state = priv->rnr_nak_state;
+		__entry->hw_flow_index = priv->flow_state.index;
+		__entry->generation = priv->flow_state.generation;
+		__entry->fpsn = priv->flow_state.psn;
+		__entry->resync = priv->resync;
+		__entry->r_next_psn_kdeth = priv->r_next_psn_kdeth;
+	),
+	TP_printk(/* print */
+		TID_WRITE_RSPDR_PRN,
+		__get_str(dev),
+		__entry->qpn,
+		__entry->r_tid_head,
+		__entry->r_tid_tail,
+		__entry->r_tid_ack,
+		__entry->r_tid_alloc,
+		__entry->alloc_w_segs,
+		__entry->pending_tid_w_segs,
+		__entry->sync_pt ? "yes" : "no",
+		__entry->ps_nak_psn,
+		__entry->ps_nak_state,
+		__entry->prnr_nak_state,
+		__entry->hw_flow_index,
+		__entry->generation,
+		__entry->fpsn,
+		__entry->resync ? "yes" : "no",
+		__entry->r_next_psn_kdeth
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_alloc_res,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_req,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_build_resp,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_data,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_resync,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_make_tid_ack,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_handle_kdeth_eflags,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_make_rc_ack,
+	TP_PROTO(struct rvt_qp *qp),
+	TP_ARGS(qp)
+);
+
+DECLARE_EVENT_CLASS(/* tid_write_sender */
+	hfi1_tid_write_sender_template,
+	TP_PROTO(struct rvt_qp *qp, char newreq),
+	TP_ARGS(qp, newreq),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(char, newreq)
+		__field(u32, s_tid_cur)
+		__field(u32, s_tid_tail)
+		__field(u32, s_tid_head)
+		__field(u32, pending_tid_w_resp)
+		__field(u32, n_requests)
+		__field(u32, n_tid_requests)
+		__field(u32, s_flags)
+		__field(u32, ps_flags)
+		__field(unsigned long, iow_flags)
+		__field(u8, s_state)
+		__field(u8, s_retry)
+	),
+	TP_fast_assign(/* assign */
+		struct hfi1_qp_priv *priv = qp->priv;
+
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->newreq = newreq;
+		__entry->s_tid_cur = priv->s_tid_cur;
+		__entry->s_tid_tail = priv->s_tid_tail;
+		__entry->s_tid_head = priv->s_tid_head;
+		__entry->pending_tid_w_resp = priv->pending_tid_w_resp;
+		__entry->n_requests = atomic_read(&priv->n_requests);
+		__entry->n_tid_requests = atomic_read(&priv->n_tid_requests);
+		__entry->s_flags = qp->s_flags;
+		__entry->ps_flags = priv->s_flags;
+		__entry->iow_flags = priv->s_iowait.flags;
+		__entry->s_state = priv->s_state;
+		__entry->s_retry = priv->s_retry;
+	),
+	TP_printk(/* print */
+		TID_WRITE_SENDER_PRN,
+		__get_str(dev),
+		__entry->qpn,
+		__entry->newreq,
+		__entry->s_tid_cur,
+		__entry->s_tid_tail,
+		__entry->s_tid_head,
+		__entry->pending_tid_w_resp,
+		__entry->n_requests,
+		__entry->n_tid_requests,
+		__entry->s_flags,
+		__entry->ps_flags,
+		__entry->iow_flags,
+		__entry->s_state,
+		__entry->s_retry
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_write_sender_template, hfi1_tid_write_sender_rcv_resp,
+	TP_PROTO(struct rvt_qp *qp, char newreq),
+	TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_write_sender_template, hfi1_tid_write_sender_rcv_tid_ack,
+	TP_PROTO(struct rvt_qp *qp, char newreq),
+	TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_write_sender_template, hfi1_tid_write_sender_retry_timeout,
+	TP_PROTO(struct rvt_qp *qp, char newreq),
+	TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_write_sender_template, hfi1_tid_write_sender_make_tid_pkt,
+	TP_PROTO(struct rvt_qp *qp, char newreq),
+	TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_write_sender_template, hfi1_tid_write_sender_make_req,
+	TP_PROTO(struct rvt_qp *qp, char newreq),
+	TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_tid_write_sender_template, hfi1_tid_write_sender_restart_rc,
+	TP_PROTO(struct rvt_qp *qp, char newreq),
+	TP_ARGS(qp, newreq)
+);
+
+DECLARE_EVENT_CLASS(/* tid_ack */
+	hfi1_tid_ack_template,
+	TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
+		 u32 req_psn, u32 resync_psn),
+	TP_ARGS(qp, aeth, psn, req_psn, resync_psn),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u32, aeth)
+		__field(u32, psn)
+		__field(u32, req_psn)
+		__field(u32, resync_psn)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->aeth = aeth;
+		__entry->psn = psn;
+		__entry->req_psn = req_psn;
+		__entry->resync_psn = resync_psn;
+		),
+	TP_printk(/* print */
+		"[%s] qpn 0x%x aeth 0x%x psn 0x%x req_psn 0x%x resync_psn 0x%x",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->aeth,
+		__entry->psn,
+		__entry->req_psn,
+		__entry->resync_psn
+	)
+);
+
+DEFINE_EVENT(/* rcv_tid_ack */
+	hfi1_tid_ack_template, hfi1_rcv_tid_ack,
+	TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
+		 u32 req_psn, u32 resync_psn),
+	TP_ARGS(qp, aeth, psn, req_psn, resync_psn)
+);
+
+DECLARE_EVENT_CLASS(/* kdeth_eflags_error */
+	hfi1_kdeth_eflags_error_template,
+	TP_PROTO(struct rvt_qp *qp, u8 rcv_type, u8 rte, u32 psn),
+	TP_ARGS(qp, rcv_type, rte, psn),
+	TP_STRUCT__entry(/* entry */
+		DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u8, rcv_type)
+		__field(u8, rte)
+		__field(u32, psn)
+	),
+	TP_fast_assign(/* assign */
+		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->rcv_type = rcv_type;
+		__entry->rte = rte;
+		__entry->psn = psn;
+	),
+	TP_printk(/* print */
+		KDETH_EFLAGS_ERR_PRN,
+		__get_str(dev),
+		__entry->qpn,
+		__entry->rcv_type,
+		__entry->rte,
+		__entry->psn
+	)
+);
+
+DEFINE_EVENT(/* event */
+	hfi1_kdeth_eflags_error_template, hfi1_eflags_err_write,
+	TP_PROTO(struct rvt_qp *qp, u8 rcv_type, u8 rte, u32 psn),
+	TP_ARGS(qp, rcv_type, rte, psn)
+);
+
+#endif /* __HFI1_TRACE_TID_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_tid
+#include <trace/define_trace.h>
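
The trace_tid.h hunk above relies entirely on the standard Linux tracepoint pattern: DECLARE_EVENT_CLASS() describes the record layout, assignment, and format string once, and each DEFINE_EVENT() stamps out a named event that reuses that template. As a hedged illustration only, not part of the patch, a minimal header following the same pattern could look like the sketch below; all of the hfi1_demo names are invented for the example.

	/*
	 * Illustrative sketch, not part of the patch: a minimal tracepoint
	 * header using the DECLARE_EVENT_CLASS()/DEFINE_EVENT() pattern of
	 * trace_tid.h above.  Every "hfi1_demo" identifier is invented.
	 */
	#undef TRACE_SYSTEM
	#define TRACE_SYSTEM hfi1_demo

	#if !defined(__HFI1_TRACE_DEMO_H) || defined(TRACE_HEADER_MULTI_READ)
	#define __HFI1_TRACE_DEMO_H

	#include <linux/tracepoint.h>

	DECLARE_EVENT_CLASS(/* shared layout and format */
		hfi1_demo_template,
		TP_PROTO(u32 qpn, u32 psn),
		TP_ARGS(qpn, psn),
		TP_STRUCT__entry(
			__field(u32, qpn)
			__field(u32, psn)
		),
		TP_fast_assign(
			__entry->qpn = qpn;
			__entry->psn = psn;
		),
		TP_printk("qpn 0x%x psn 0x%x", __entry->qpn, __entry->psn)
	);

	DEFINE_EVENT(/* named event reusing the class */
		hfi1_demo_template, hfi1_demo_event,
		TP_PROTO(u32 qpn, u32 psn),
		TP_ARGS(qpn, psn)
	);

	#endif /* __HFI1_TRACE_DEMO_H */

	#undef TRACE_INCLUDE_PATH
	#undef TRACE_INCLUDE_FILE
	#define TRACE_INCLUDE_PATH .
	#define TRACE_INCLUDE_FILE trace_demo
	#include <trace/define_trace.h>

Exactly one compilation unit defines CREATE_TRACE_POINTS before including such a header; callers then invoke trace_hfi1_demo_event(qpn, psn), which compiles down to a static-branch no-op while the event is disabled.
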
diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h
index c57af3b..09eb0c9 100644
--- a/drivers/infiniband/hw/hfi1/trace_tx.h
+++ b/drivers/infiniband/hw/hfi1/trace_tx.h
@@ -114,19 +114,27 @@
 		    __field(u32, qpn)
 		    __field(u32, flags)
 		    __field(u32, s_flags)
+		    __field(u32, ps_flags)
+		    __field(unsigned long, iow_flags)
 		    ),
 		    TP_fast_assign(
 		    DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
 		    __entry->flags = flags;
 		    __entry->qpn = qp->ibqp.qp_num;
 		    __entry->s_flags = qp->s_flags;
+		    __entry->ps_flags =
+			((struct hfi1_qp_priv *)qp->priv)->s_flags;
+		    __entry->iow_flags =
+			((struct hfi1_qp_priv *)qp->priv)->s_iowait.flags;
 		    ),
 		    TP_printk(
-		    "[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
+		    "[%s] qpn 0x%x flags 0x%x s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx",
 		    __get_str(dev),
 		    __entry->qpn,
 		    __entry->flags,
-		    __entry->s_flags
+		    __entry->s_flags,
+		    __entry->ps_flags,
+		    __entry->iow_flags
 		    )
 );
 
@@ -838,6 +846,12 @@
 	TP_ARGS(qp, flag)
 );
 
+DEFINE_EVENT(/* event */
+	hfi1_do_send_template, hfi1_rc_do_tid_send,
+	TP_PROTO(struct rvt_qp *qp, bool flag),
+	TP_ARGS(qp, flag)
+);
+
 DEFINE_EVENT(
 	hfi1_do_send_template, hfi1_rc_expired_time_slice,
 	TP_PROTO(struct rvt_qp *qp, bool flag),
diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c
index e254dce..0c77f18 100644
--- a/drivers/infiniband/hw/hfi1/uc.c
+++ b/drivers/infiniband/hw/hfi1/uc.c
@@ -88,7 +88,7 @@
 		}
 		clear_ahg(qp);
 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-		hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+		rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
 		goto done_free_tx;
 	}
 
@@ -140,7 +140,7 @@
 					qp, wqe->wr.ex.invalidate_rkey);
 				local_ops = 1;
 			}
-			hfi1_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR
+			rvt_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR
 							: IB_WC_SUCCESS);
 			if (local_ops)
 				atomic_dec(&qp->local_ops_pending);
@@ -271,7 +271,8 @@
 	ps->s_txreq->ss = &qp->s_sge;
 	ps->s_txreq->s_cur_size = len;
 	hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
-			     mask_psn(qp->s_psn++), middle, ps);
+			     qp->remote_qpn, mask_psn(qp->s_psn++),
+			     middle, ps);
 	return 1;
 
 done_free_tx:
@@ -321,7 +322,7 @@
 	if (hfi1_ruc_check_hdr(ibp, packet))
 		return;
 
-	process_ecn(qp, packet, true);
+	process_ecn(qp, packet);
 
 	psn = ib_bth_get_psn(ohdr);
 	/* Compare the PSN verses the expected PSN. */
@@ -426,7 +427,7 @@
 		qp->r_rcv_len += pmtu;
 		if (unlikely(qp->r_rcv_len > qp->r_len))
 			goto rewind;
-		hfi1_copy_sge(&qp->r_sge, data, pmtu, false, false);
+		rvt_copy_sge(qp, &qp->r_sge, data, pmtu, false, false);
 		break;
 
 	case OP(SEND_LAST_WITH_IMMEDIATE):
@@ -449,7 +450,7 @@
 		if (unlikely(wc.byte_len > qp->r_len))
 			goto rewind;
 		wc.opcode = IB_WC_RECV;
-		hfi1_copy_sge(&qp->r_sge, data, tlen, false, false);
+		rvt_copy_sge(qp, &qp->r_sge, data, tlen, false, false);
 		rvt_put_ss(&qp->s_rdma_read_sge);
 last_imm:
 		wc.wr_id = qp->r_wr_id;
@@ -475,8 +476,7 @@
 		wc.dlid_path_bits = 0;
 		wc.port_num = 0;
 		/* Signal completion event if the solicited bit is set. */
-		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-			     ib_bth_is_solicited(ohdr));
+		rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
 		break;
 
 	case OP(RDMA_WRITE_FIRST):
@@ -523,7 +523,7 @@
 		qp->r_rcv_len += pmtu;
 		if (unlikely(qp->r_rcv_len > qp->r_len))
 			goto drop;
-		hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false);
+		rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false);
 		break;
 
 	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
@@ -550,7 +550,7 @@
 		}
 		wc.byte_len = qp->r_len;
 		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-		hfi1_copy_sge(&qp->r_sge, data, tlen, true, false);
+		rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false);
 		rvt_put_ss(&qp->r_sge);
 		goto last_imm;
 
@@ -564,7 +564,7 @@
 		tlen -= (hdrsize + extra_bytes);
 		if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
 			goto drop;
-		hfi1_copy_sge(&qp->r_sge, data, tlen, true, false);
+		rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false);
 		rvt_put_ss(&qp->r_sge);
 		break;
 
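
The uc.c hunks above swap the driver-private helpers for their rdmavt counterparts: hfi1_send_complete() becomes rvt_send_complete() and hfi1_copy_sge() becomes rvt_copy_sge(). As a hedged sketch only, flushing the oldest pending send WQE through the rdmavt API would look roughly like this; hfi1_demo_flush_last_wqe() is an invented name and the caller is assumed to hold qp->s_lock.

	/*
	 * Illustrative sketch, not part of the patch: complete the oldest
	 * outstanding send WQE with a flush status via rdmavt.  The function
	 * name is invented; the caller is assumed to hold qp->s_lock.
	 */
	#include <rdma/rdma_vt.h>

	static void hfi1_demo_flush_last_wqe(struct rvt_qp *qp)
	{
		struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_last);

		/* rdmavt generates the completion and updates the send indices. */
		rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
	}
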
diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c
index 70d39fc..e804af7 100644
--- a/drivers/infiniband/hw/hfi1/ud.c
+++ b/drivers/infiniband/hw/hfi1/ud.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2018 Intel Corporation.
+ * Copyright(c) 2015 - 2019 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -51,6 +51,7 @@
 #include "hfi.h"
 #include "mad.h"
 #include "verbs_txreq.h"
+#include "trace_ibhdrs.h"
 #include "qp.h"
 
 /* We support only two types - 9B and 16B for now */
@@ -86,7 +87,7 @@
 	rcu_read_lock();
 
 	qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
-			    swqe->ud_wr.remote_qpn);
+			    rvt_get_swqe_remote_qpn(swqe));
 	if (!qp) {
 		ibp->rvp.n_pkt_drops++;
 		rcu_read_unlock();
@@ -104,7 +105,7 @@
 		goto drop;
 	}
 
-	ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr;
+	ah_attr = rvt_get_swqe_ah_attr(swqe);
 	ppd = ppd_from_ibp(ibp);
 
 	if (qp->ibqp.qp_num > 1) {
@@ -134,8 +135,8 @@
 	if (qp->ibqp.qp_num) {
 		u32 qkey;
 
-		qkey = (int)swqe->ud_wr.remote_qkey < 0 ?
-			sqp->qkey : swqe->ud_wr.remote_qkey;
+		qkey = (int)rvt_get_swqe_remote_qkey(swqe) < 0 ?
+			sqp->qkey : rvt_get_swqe_remote_qkey(swqe);
 		if (unlikely(qkey != qp->qkey))
 			goto drop; /* silently drop per IBTA spec */
 	}
@@ -210,8 +211,8 @@
 		}
 
 		hfi1_make_grh(ibp, &grh, &grd, 0, 0);
-		hfi1_copy_sge(&qp->r_sge, &grh,
-			      sizeof(grh), true, false);
+		rvt_copy_sge(qp, &qp->r_sge, &grh,
+			     sizeof(grh), true, false);
 		wc.wc_flags |= IB_WC_GRH;
 	} else {
 		rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true);
@@ -221,31 +222,11 @@
 	ssge.num_sge = swqe->wr.num_sge;
 	sge = &ssge.sge;
 	while (length) {
-		u32 len = sge->length;
+		u32 len = rvt_get_sge_length(sge, length);
 
-		if (len > length)
-			len = length;
-		if (len > sge->sge_length)
-			len = sge->sge_length;
 		WARN_ON_ONCE(len == 0);
-		hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, true, false);
-		sge->vaddr += len;
-		sge->length -= len;
-		sge->sge_length -= len;
-		if (sge->sge_length == 0) {
-			if (--ssge.num_sge)
-				*sge = *ssge.sg_list++;
-		} else if (sge->length == 0 && sge->mr->lkey) {
-			if (++sge->n >= RVT_SEGSZ) {
-				if (++sge->m >= sge->mr->mapsz)
-					break;
-				sge->n = 0;
-			}
-			sge->vaddr =
-				sge->mr->map[sge->m]->segs[sge->n].vaddr;
-			sge->length =
-				sge->mr->map[sge->m]->segs[sge->n].length;
-		}
+		rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false);
+		rvt_update_sge(&ssge, len, false);
 		length -= len;
 	}
 	rvt_put_ss(&qp->r_sge);
@@ -259,7 +240,7 @@
 	if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) {
 		if (sqp->ibqp.qp_type == IB_QPT_GSI ||
 		    sqp->ibqp.qp_type == IB_QPT_SMI)
-			wc.pkey_index = swqe->ud_wr.pkey_index;
+			wc.pkey_index = rvt_get_swqe_pkey_index(swqe);
 		else
 			wc.pkey_index = sqp->s_pkey_index;
 	} else {
@@ -274,8 +255,7 @@
 	wc.dlid_path_bits = rdma_ah_get_dlid(ah_attr) & ((1 << ppd->lmc) - 1);
 	wc.port_num = qp->port_num;
 	/* Signal completion event if the solicited bit is set. */
-	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-		     swqe->wr.send_flags & IB_SEND_SOLICITED);
+	rvt_recv_cq(qp, &wc, swqe->wr.send_flags & IB_SEND_SOLICITED);
 	ibp->rvp.n_loop_pkts++;
 bail_unlock:
 	spin_unlock_irqrestore(&qp->r_lock, flags);
@@ -302,20 +282,21 @@
 		bth0 |= IB_BTH_SOLICITED;
 	bth0 |= extra_bytes << 20;
 	if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI)
-		*pkey = hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index);
+		*pkey = hfi1_get_pkey(ibp, rvt_get_swqe_pkey_index(wqe));
 	else
 		*pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
 	if (!bypass)
 		bth0 |= *pkey;
 	ohdr->bth[0] = cpu_to_be32(bth0);
-	ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn);
+	ohdr->bth[1] = cpu_to_be32(rvt_get_swqe_remote_qpn(wqe));
 	ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn));
 	/*
 	 * Qkeys with the high order bit set mean use the
 	 * qkey from the QP context instead of the WR (see 10.2.5).
 	 */
-	ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ?
-					 qp->qkey : wqe->ud_wr.remote_qkey);
+	ohdr->u.ud.deth[0] =
+		cpu_to_be32((int)rvt_get_swqe_remote_qkey(wqe) < 0 ? qp->qkey :
+			    rvt_get_swqe_remote_qkey(wqe));
 	ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
 }
 
@@ -335,7 +316,7 @@
 
 	ibp = to_iport(qp->ibqp.device, qp->port_num);
 	ppd = ppd_from_ibp(ibp);
-	ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
+	ah_attr = rvt_get_swqe_ah_attr(wqe);
 
 	extra_bytes = -wqe->length & 3;
 	nwords = ((wqe->length + extra_bytes) >> 2) + SIZE_OF_CRC;
@@ -399,7 +380,7 @@
 	struct hfi1_pportdata *ppd;
 	struct hfi1_ibport *ibp;
 	u32 dlid, slid, nwords, extra_bytes;
-	u32 dest_qp = wqe->ud_wr.remote_qpn;
+	u32 dest_qp = rvt_get_swqe_remote_qpn(wqe);
 	u32 src_qp = qp->ibqp.qp_num;
 	u16 len, pkey;
 	u8 l4, sc5;
@@ -407,7 +388,7 @@
 
 	ibp = to_iport(qp->ibqp.device, qp->port_num);
 	ppd = ppd_from_ibp(ibp);
-	ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
+	ah_attr = rvt_get_swqe_ah_attr(wqe);
 
 	/*
 	 * Build 16B Management Packet if either the destination
@@ -469,7 +450,7 @@
 
 	if (is_mgmt) {
 		l4 = OPA_16B_L4_FM;
-		pkey = hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index);
+		pkey = hfi1_get_pkey(ibp, rvt_get_swqe_pkey_index(wqe));
 		hfi1_16B_set_qpn(&ps->s_txreq->phdr.hdr.opah.u.mgmt,
 				 dest_qp, src_qp);
 	} else {
@@ -518,7 +499,7 @@
 			goto bail;
 		}
 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-		hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+		rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
 		goto done_free_tx;
 	}
 
@@ -534,7 +515,7 @@
 	/* Construct the header. */
 	ibp = to_iport(qp->ibqp.device, qp->port_num);
 	ppd = ppd_from_ibp(ibp);
-	ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
+	ah_attr = rvt_get_swqe_ah_attr(wqe);
 	priv->hdr_type = hfi1_get_hdr_type(ppd->lid, ah_attr);
 	if ((!hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) ||
 	    (rdma_ah_get_dlid(ah_attr) == be32_to_cpu(OPA_LID_PERMISSIVE))) {
@@ -560,7 +541,7 @@
 			ud_loopback(qp, wqe);
 			spin_lock_irqsave(&qp->s_lock, tflags);
 			ps->flags = tflags;
-			hfi1_send_complete(qp, wqe, IB_WC_SUCCESS);
+			rvt_send_complete(qp, wqe, IB_WC_SUCCESS);
 			goto done_free_tx;
 		}
 	}
@@ -656,18 +637,19 @@
 	u32 bth0, plen, vl, hwords = 7;
 	u16 len;
 	u8 l4;
-	struct hfi1_16b_header hdr;
+	struct hfi1_opa_header hdr;
 	struct ib_other_headers *ohdr;
 	struct pio_buf *pbuf;
 	struct send_context *ctxt = qp_to_send_context(qp, sc5);
 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
 	u32 nwords;
 
+	hdr.hdr_type = HFI1_PKT_TYPE_16B;
 	/* Populate length */
 	nwords = ((hfi1_get_16b_padding(hwords << 2, 0) +
 		   SIZE_OF_LT) >> 2) + SIZE_OF_CRC;
 	if (old_grh) {
-		struct ib_grh *grh = &hdr.u.l.grh;
+		struct ib_grh *grh = &hdr.opah.u.l.grh;
 
 		grh->version_tclass_flow = old_grh->version_tclass_flow;
 		grh->paylen = cpu_to_be16(
@@ -675,11 +657,11 @@
 		grh->hop_limit = 0xff;
 		grh->sgid = old_grh->dgid;
 		grh->dgid = old_grh->sgid;
-		ohdr = &hdr.u.l.oth;
+		ohdr = &hdr.opah.u.l.oth;
 		l4 = OPA_16B_L4_IB_GLOBAL;
 		hwords += sizeof(struct ib_grh) / sizeof(u32);
 	} else {
-		ohdr = &hdr.u.oth;
+		ohdr = &hdr.opah.u.oth;
 		l4 = OPA_16B_L4_IB_LOCAL;
 	}
 
@@ -693,7 +675,7 @@
 
 	/* Convert dwords to flits */
 	len = (hwords + nwords) >> 1;
-	hfi1_make_16b_hdr(&hdr, slid, dlid, len, pkey, 1, 0, l4, sc5);
+	hfi1_make_16b_hdr(&hdr.opah, slid, dlid, len, pkey, 1, 0, l4, sc5);
 
 	plen = 2 /* PBC */ + hwords + nwords;
 	pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC;
@@ -701,9 +683,11 @@
 	pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
 	if (ctxt) {
 		pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL);
-		if (pbuf)
+		if (!IS_ERR_OR_NULL(pbuf)) {
+			trace_pio_output_ibhdr(ppd->dd, &hdr, sc5);
 			ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
 						 &hdr, hwords);
+		}
 	}
 }
 
@@ -715,14 +699,15 @@
 	u32 bth0, plen, vl, hwords = 5;
 	u16 lrh0;
 	u8 sl = ibp->sc_to_sl[sc5];
-	struct ib_header hdr;
+	struct hfi1_opa_header hdr;
 	struct ib_other_headers *ohdr;
 	struct pio_buf *pbuf;
 	struct send_context *ctxt = qp_to_send_context(qp, sc5);
 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
 
+	hdr.hdr_type = HFI1_PKT_TYPE_9B;
 	if (old_grh) {
-		struct ib_grh *grh = &hdr.u.l.grh;
+		struct ib_grh *grh = &hdr.ibh.u.l.grh;
 
 		grh->version_tclass_flow = old_grh->version_tclass_flow;
 		grh->paylen = cpu_to_be16(
@@ -730,11 +715,11 @@
 		grh->hop_limit = 0xff;
 		grh->sgid = old_grh->dgid;
 		grh->dgid = old_grh->sgid;
-		ohdr = &hdr.u.l.oth;
+		ohdr = &hdr.ibh.u.l.oth;
 		lrh0 = HFI1_LRH_GRH;
 		hwords += sizeof(struct ib_grh) / sizeof(u32);
 	} else {
-		ohdr = &hdr.u.oth;
+		ohdr = &hdr.ibh.u.oth;
 		lrh0 = HFI1_LRH_BTH;
 	}
 
@@ -746,16 +731,18 @@
 	ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << IB_BECN_SHIFT));
 	ohdr->bth[2] = 0; /* PSN 0 */
 
-	hfi1_make_ib_hdr(&hdr, lrh0, hwords + SIZE_OF_CRC, dlid, slid);
+	hfi1_make_ib_hdr(&hdr.ibh, lrh0, hwords + SIZE_OF_CRC, dlid, slid);
 	plen = 2 /* PBC */ + hwords;
 	pbc_flags |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
 	vl = sc_to_vlt(ppd->dd, sc5);
 	pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
 	if (ctxt) {
 		pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL);
-		if (pbuf)
+		if (!IS_ERR_OR_NULL(pbuf)) {
+			trace_pio_output_ibhdr(ppd->dd, &hdr, sc5);
 			ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
 						 &hdr, hwords);
+		}
 	}
 }
 
@@ -912,7 +899,7 @@
 		src_qp = hfi1_16B_get_src_qpn(packet->mgmt);
 	}
 
-	process_ecn(qp, packet, (opcode != IB_OPCODE_CNP));
+	process_ecn(qp, packet);
 	/*
 	 * Get the number of bytes the message was padded by
 	 * and drop incomplete packets.
@@ -980,7 +967,6 @@
 	    opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
 		wc.ex.imm_data = packet->ohdr->u.ud.imm_data;
 		wc.wc_flags = IB_WC_WITH_IMM;
-		tlen -= sizeof(u32);
 	} else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
 		wc.ex.imm_data = 0;
 		wc.wc_flags = 0;
@@ -1019,8 +1005,8 @@
 		goto drop;
 	}
 	if (packet->grh) {
-		hfi1_copy_sge(&qp->r_sge, packet->grh,
-			      sizeof(struct ib_grh), true, false);
+		rvt_copy_sge(qp, &qp->r_sge, packet->grh,
+			     sizeof(struct ib_grh), true, false);
 		wc.wc_flags |= IB_WC_GRH;
 	} else if (packet->etype == RHF_RCV_TYPE_BYPASS) {
 		struct ib_grh grh;
@@ -1030,14 +1016,14 @@
 		 * out when creating 16B, add back the GRH here.
 		 */
 		hfi1_make_ext_grh(packet, &grh, slid, dlid);
-		hfi1_copy_sge(&qp->r_sge, &grh,
-			      sizeof(struct ib_grh), true, false);
+		rvt_copy_sge(qp, &qp->r_sge, &grh,
+			     sizeof(struct ib_grh), true, false);
 		wc.wc_flags |= IB_WC_GRH;
 	} else {
 		rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true);
 	}
-	hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh),
-		      true, false);
+	rvt_copy_sge(qp, &qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh),
+		     true, false);
 	rvt_put_ss(&qp->r_sge);
 	if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
 		return;
@@ -1075,7 +1061,7 @@
 		dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1);
 	wc.port_num = qp->port_num;
 	/* Signal completion event if the solicited bit is set. */
-	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, solicited);
+	rvt_recv_cq(qp, &wc, solicited);
 	return;
 
 drop:
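
The loopback hunk above replaces an open-coded scatter/gather advance with the rdmavt helpers rvt_get_sge_length() and rvt_update_sge(). A hedged sketch of that consumption loop, wrapped in an invented hfi1_demo_loopback_copy() helper, might look like this:

	/*
	 * Illustrative sketch, not part of the patch: copy a source SGE list
	 * into a QP's posted receive SGEs using the rdmavt helpers the hunk
	 * above switches to.  The function name is invented.
	 */
	#include <rdma/rdma_vt.h>

	static void hfi1_demo_loopback_copy(struct rvt_qp *qp,
					    struct rvt_sge_state *ssge, u32 length)
	{
		while (length) {
			/* Clamp to what the current source SGE still holds. */
			u32 len = rvt_get_sge_length(&ssge->sge, length);

			rvt_copy_sge(qp, &qp->r_sge, ssge->sge.vaddr, len,
				     true, false);
			/* Advance the source state; false keeps the MR references. */
			rvt_update_sge(ssge, len, false);
			length -= len;
		}
	}
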
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
index dbe7d14..3592a9e 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
@@ -232,7 +232,7 @@
 	}
 
 	/* Verify that access is OK for the user buffer */
-	if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
+	if (!access_ok((void __user *)vaddr,
 		       npages * PAGE_SIZE)) {
 		dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
 			   (void *)vaddr, npages);
@@ -324,6 +324,9 @@
 	u32 *tidlist = NULL;
 	struct tid_user_buf *tidbuf;
 
+	if (!PAGE_ALIGNED(tinfo->vaddr))
+		return -EINVAL;
+
 	tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
 	if (!tidbuf)
 		return -ENOMEM;
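
The user_exp_rcv.c changes adjust the user-buffer validation: access_ok() is called in its two-argument form (the VERIFY_* argument was removed in v5.0) and the virtual address must now be page aligned before any pinning is attempted. A hedged, self-contained sketch of that check order, with an invented helper name, is:

	/*
	 * Illustrative sketch, not part of the patch: validate a user buffer
	 * before pinning it.  The helper name is invented.
	 */
	#include <linux/mm.h>
	#include <linux/uaccess.h>

	static int hfi1_demo_check_user_buf(unsigned long vaddr, u32 npages)
	{
		/* Expected-receive buffers must start on a page boundary. */
		if (!PAGE_ALIGNED(vaddr))
			return -EINVAL;
		/* Two-argument access_ok(); VERIFY_WRITE went away in v5.0. */
		if (!access_ok((void __user *)vaddr, (size_t)npages * PAGE_SIZE))
			return -EFAULT;
		return 0;
	}
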
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
index e383cc0..43b105d 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
@@ -48,7 +48,6 @@
  */
 
 #include "hfi.h"
-
 #include "exp_rcv.h"
 
 struct tid_pageset {
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
index e341e6d..469acb9 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -91,9 +91,7 @@
 	/* Convert to number of pages */
 	size = DIV_ROUND_UP(size, PAGE_SIZE);
 
-	down_read(&mm->mmap_sem);
-	pinned = mm->pinned_vm;
-	up_read(&mm->mmap_sem);
+	pinned = atomic64_read(&mm->pinned_vm);
 
 	/* First, check the absolute limit against all pinned pages. */
 	if (pinned + npages >= ulimit && !can_lock)
@@ -106,14 +104,13 @@
 			    bool writable, struct page **pages)
 {
 	int ret;
+	unsigned int gup_flags = FOLL_LONGTERM | (writable ? FOLL_WRITE : 0);
 
-	ret = get_user_pages_fast(vaddr, npages, writable, pages);
+	ret = get_user_pages_fast(vaddr, npages, gup_flags, pages);
 	if (ret < 0)
 		return ret;
 
-	down_write(&mm->mmap_sem);
-	mm->pinned_vm += ret;
-	up_write(&mm->mmap_sem);
+	atomic64_add(ret, &mm->pinned_vm);
 
 	return ret;
 }
@@ -121,17 +118,9 @@
 void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
 			     size_t npages, bool dirty)
 {
-	size_t i;
-
-	for (i = 0; i < npages; i++) {
-		if (dirty)
-			set_page_dirty_lock(p[i]);
-		put_page(p[i]);
-	}
+	put_user_pages_dirty_lock(p, npages, dirty);
 
 	if (mm) { /* during close after signal, mm can be NULL */
-		down_write(&mm->mmap_sem);
-		mm->pinned_vm -= npages;
-		up_write(&mm->mmap_sem);
+		atomic64_sub(npages, &mm->pinned_vm);
 	}
 }
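
user_pages.c moves the pinned-page accounting from mmap_sem-protected arithmetic to the atomic64_t mm->pinned_vm counter, passes FOLL_LONGTERM to get_user_pages_fast(), and releases pages through put_user_pages_dirty_lock(). A hedged sketch of that pin/unpin pairing on a kernel of this vintage (these helpers have since been renamed upstream), using invented function names, is:

	/*
	 * Illustrative sketch, not part of the patch: long-term pinning with
	 * atomic pinned_vm accounting, as done on ~v5.4 kernels.  The function
	 * names are invented.
	 */
	#include <linux/mm.h>
	#include <linux/atomic.h>

	static int hfi1_demo_pin(struct mm_struct *mm, unsigned long vaddr,
				 int npages, bool writable, struct page **pages)
	{
		unsigned int gup_flags = FOLL_LONGTERM | (writable ? FOLL_WRITE : 0);
		int pinned = get_user_pages_fast(vaddr, npages, gup_flags, pages);

		if (pinned > 0)
			atomic64_add(pinned, &mm->pinned_vm);
		return pinned;
	}

	static void hfi1_demo_unpin(struct mm_struct *mm, struct page **pages,
				    size_t npages, bool dirty)
	{
		/* Marks the pages dirty (if asked) and drops the page references. */
		put_user_pages_dirty_lock(pages, npages, dirty);
		atomic64_sub(npages, &mm->pinned_vm);
	}
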
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
index 51831bf..fd754a1 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.c
+++ b/drivers/infiniband/hw/hfi1/user_sdma.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -76,8 +76,7 @@
 
 static unsigned initial_pkt_count = 8;
 
-static int user_sdma_send_pkts(struct user_sdma_request *req,
-			       unsigned maxpkts);
+static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
 static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
 static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
 static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
@@ -101,7 +100,7 @@
 
 static int defer_packet_queue(
 	struct sdma_engine *sde,
-	struct iowait *wait,
+	struct iowait_work *wait,
 	struct sdma_txreq *txreq,
 	uint seq,
 	bool pkts_sent);
@@ -124,33 +123,31 @@
 
 static int defer_packet_queue(
 	struct sdma_engine *sde,
-	struct iowait *wait,
+	struct iowait_work *wait,
 	struct sdma_txreq *txreq,
 	uint seq,
 	bool pkts_sent)
 {
 	struct hfi1_user_sdma_pkt_q *pq =
-		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
-	struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
-	struct user_sdma_txreq *tx =
-		container_of(txreq, struct user_sdma_txreq, txreq);
+		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
 
-	if (sdma_progress(sde, seq, txreq)) {
-		if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
-			goto eagain;
-	}
+	write_seqlock(&sde->waitlock);
+	if (sdma_progress(sde, seq, txreq))
+		goto eagain;
 	/*
 	 * We are assuming that if the list is enqueued somewhere, it
 	 * is to the dmawait list since that is the only place where
 	 * it is supposed to be enqueued.
 	 */
 	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
-	write_seqlock(&dev->iowait_lock);
-	if (list_empty(&pq->busy.list))
+	if (list_empty(&pq->busy.list)) {
+		iowait_get_priority(&pq->busy);
 		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
-	write_sequnlock(&dev->iowait_lock);
+	}
+	write_sequnlock(&sde->waitlock);
 	return -EBUSY;
 eagain:
+	write_sequnlock(&sde->waitlock);
 	return -EAGAIN;
 }
 
@@ -192,8 +189,8 @@
 	atomic_set(&pq->n_locked, 0);
 	pq->mm = fd->mm;
 
-	iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
-		    activate_packet_queue, NULL);
+	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
+		    activate_packet_queue, NULL, NULL);
 	pq->reqidx = 0;
 
 	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
@@ -756,9 +753,10 @@
 	return ret;
 }
 
-static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
+static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
 {
-	int ret = 0, count;
+	int ret = 0;
+	u16 count;
 	unsigned npkts = 0;
 	struct user_sdma_txreq *tx = NULL;
 	struct hfi1_user_sdma_pkt_q *pq = NULL;
@@ -803,7 +801,6 @@
 
 		tx->flags = 0;
 		tx->req = req;
-		tx->busycount = 0;
 		INIT_LIST_HEAD(&tx->list);
 
 		/*
@@ -860,8 +857,10 @@
 
 				changes = set_txreq_header_ahg(req, tx,
 							       datalen);
-				if (changes < 0)
+				if (changes < 0) {
+					ret = changes;
 					goto free_tx;
+				}
 			}
 		} else {
 			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
@@ -910,7 +909,9 @@
 		npkts++;
 	}
 dosend:
-	ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count);
+	ret = sdma_send_txlist(req->sde,
+			       iowait_get_ib_work(&pq->busy),
+			       &req->txps, &count);
 	req->seqsubmitted += count;
 	if (req->seqsubmitted == req->info.npkts) {
 		/*
@@ -1123,7 +1124,8 @@
 			0xffffffull),
 		psn = val & mask;
 	if (expct)
-		psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
+		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
+			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
 	else
 		psn = psn + frags;
 	return psn & mask;
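
In user_sdma.c the deferral path now serializes against the engine's own sde->waitlock seqlock rather than the device-wide iowait lock, and the retry counter is gone: the request either parks itself on the wait list or returns -EAGAIN when the engine has made progress. A hedged, self-contained sketch of that decision under a writer-side seqlock, with an invented demo_engine structure, is:

	/*
	 * Illustrative sketch, not part of the patch: park a waiter on a
	 * per-engine list, or ask for an immediate retry if the engine made
	 * progress.  The demo_engine type is invented; its waitlock and
	 * dmawait list are assumed to be initialized elsewhere.
	 */
	#include <linux/seqlock.h>
	#include <linux/list.h>
	#include <linux/errno.h>

	struct demo_engine {
		seqlock_t waitlock;		/* protects dmawait */
		struct list_head dmawait;	/* parked waiters */
		bool made_progress;		/* set by the completion path */
	};

	static int demo_defer(struct demo_engine *eng, struct list_head *waiter)
	{
		int ret;

		write_seqlock(&eng->waitlock);
		if (eng->made_progress) {
			ret = -EAGAIN;		/* ring drained, resubmit now */
		} else {
			if (list_empty(waiter))
				list_add_tail(waiter, &eng->dmawait);
			ret = -EBUSY;		/* sleep until the engine wakes us */
		}
		write_sequnlock(&eng->waitlock);
		return ret;
	}
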
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h
index 91c343f..9972e0e 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.h
+++ b/drivers/infiniband/hw/hfi1/user_sdma.h
@@ -110,12 +110,6 @@
 	SDMA_PKT_Q_DEFERRED,
 };
 
-/*
- * Maximum retry attempts to submit a TX request
- * before putting the process to sleep.
- */
-#define MAX_DEFER_RETRY_COUNT 1
-
 #define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */
 
 #define SDMA_DBG(req, fmt, ...)				     \
@@ -204,12 +198,12 @@
 	s8 ahg_idx;
 
 	/* Writeable fields shared with interrupt */
-	u64 seqcomp ____cacheline_aligned_in_smp;
-	u64 seqsubmitted;
+	u16 seqcomp ____cacheline_aligned_in_smp;
+	u16 seqsubmitted;
 
 	/* Send side fields */
 	struct list_head txps ____cacheline_aligned_in_smp;
-	u64 seqnum;
+	u16 seqnum;
 	/*
 	 * KDETH.OFFSET (TID) field
 	 * The offset can cover multiple packets, depending on the
@@ -245,8 +239,7 @@
 	struct list_head list;
 	struct user_sdma_request *req;
 	u16 flags;
-	unsigned int busycount;
-	u64 seqnum;
+	u16 seqnum;
 };
 
 int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
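
The user_sdma.h hunk narrows the sequence counters to u16 and keeps the fields written from interrupt context on a cacheline of their own via ____cacheline_aligned_in_smp, so completions do not bounce the line the submitting CPU is writing. A hedged sketch of that layout idiom, with an invented structure name, is:

	/*
	 * Illustrative sketch, not part of the patch: separate submit-side and
	 * completion-side fields onto distinct cachelines to avoid false
	 * sharing.  The structure name is invented.
	 */
	#include <linux/cache.h>
	#include <linux/list.h>
	#include <linux/types.h>

	struct demo_request {
		/* Written by the submitting task. */
		u16 seqnum;
		struct list_head txps;

		/* Written from the completion interrupt. */
		u16 seqcomp ____cacheline_aligned_in_smp;
		u16 seqsubmitted;
	};
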
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index 3dfb4cf..089e201 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -54,6 +54,7 @@
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <rdma/opa_addr.h>
+#include <linux/nospec.h>
 
 #include "hfi.h"
 #include "common.h"
@@ -129,8 +130,6 @@
 module_param(piothreshold, ushort, S_IRUGO);
 MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
 
-#define COPY_CACHELESS 1
-#define COPY_ADAPTIVE  2
 static unsigned int sge_copy_mode;
 module_param(sge_copy_mode, uint, S_IRUGO);
 MODULE_PARM_DESC(sge_copy_mode,
@@ -148,171 +147,24 @@
 /* Length of buffer to create verbs txreq cache name */
 #define TXREQ_NAME_LEN 24
 
-/* 16B trailing buffer */
-static const u8 trail_buf[MAX_16B_PADDING];
-
-static uint wss_threshold;
+static uint wss_threshold = 80;
 module_param(wss_threshold, uint, S_IRUGO);
 MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy");
 static uint wss_clean_period = 256;
 module_param(wss_clean_period, uint, S_IRUGO);
 MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned");
 
-/* memory working set size */
-struct hfi1_wss {
-	unsigned long *entries;
-	atomic_t total_count;
-	atomic_t clean_counter;
-	atomic_t clean_entry;
-
-	int threshold;
-	int num_entries;
-	long pages_mask;
-};
-
-static struct hfi1_wss wss;
-
-int hfi1_wss_init(void)
-{
-	long llc_size;
-	long llc_bits;
-	long table_size;
-	long table_bits;
-
-	/* check for a valid percent range - default to 80 if none or invalid */
-	if (wss_threshold < 1 || wss_threshold > 100)
-		wss_threshold = 80;
-	/* reject a wildly large period */
-	if (wss_clean_period > 1000000)
-		wss_clean_period = 256;
-	/* reject a zero period */
-	if (wss_clean_period == 0)
-		wss_clean_period = 1;
-
-	/*
-	 * Calculate the table size - the next power of 2 larger than the
-	 * LLC size.  LLC size is in KiB.
-	 */
-	llc_size = wss_llc_size() * 1024;
-	table_size = roundup_pow_of_two(llc_size);
-
-	/* one bit per page in rounded up table */
-	llc_bits = llc_size / PAGE_SIZE;
-	table_bits = table_size / PAGE_SIZE;
-	wss.pages_mask = table_bits - 1;
-	wss.num_entries = table_bits / BITS_PER_LONG;
-
-	wss.threshold = (llc_bits * wss_threshold) / 100;
-	if (wss.threshold == 0)
-		wss.threshold = 1;
-
-	atomic_set(&wss.clean_counter, wss_clean_period);
-
-	wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries),
-			      GFP_KERNEL);
-	if (!wss.entries) {
-		hfi1_wss_exit();
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-void hfi1_wss_exit(void)
-{
-	/* coded to handle partially initialized and repeat callers */
-	kfree(wss.entries);
-	wss.entries = NULL;
-}
-
-/*
- * Advance the clean counter.  When the clean period has expired,
- * clean an entry.
- *
- * This is implemented in atomics to avoid locking.  Because multiple
- * variables are involved, it can be racy which can lead to slightly
- * inaccurate information.  Since this is only a heuristic, this is
- * OK.  Any innaccuracies will clean themselves out as the counter
- * advances.  That said, it is unlikely the entry clean operation will
- * race - the next possible racer will not start until the next clean
- * period.
- *
- * The clean counter is implemented as a decrement to zero.  When zero
- * is reached an entry is cleaned.
- */
-static void wss_advance_clean_counter(void)
-{
-	int entry;
-	int weight;
-	unsigned long bits;
-
-	/* become the cleaner if we decrement the counter to zero */
-	if (atomic_dec_and_test(&wss.clean_counter)) {
-		/*
-		 * Set, not add, the clean period.  This avoids an issue
-		 * where the counter could decrement below the clean period.
-		 * Doing a set can result in lost decrements, slowing the
-		 * clean advance.  Since this a heuristic, this possible
-		 * slowdown is OK.
-		 *
-		 * An alternative is to loop, advancing the counter by a
-		 * clean period until the result is > 0. However, this could
-		 * lead to several threads keeping another in the clean loop.
-		 * This could be mitigated by limiting the number of times
-		 * we stay in the loop.
-		 */
-		atomic_set(&wss.clean_counter, wss_clean_period);
-
-		/*
-		 * Uniquely grab the entry to clean and move to next.
-		 * The current entry is always the lower bits of
-		 * wss.clean_entry.  The table size, wss.num_entries,
-		 * is always a power-of-2.
-		 */
-		entry = (atomic_inc_return(&wss.clean_entry) - 1)
-			& (wss.num_entries - 1);
-
-		/* clear the entry and count the bits */
-		bits = xchg(&wss.entries[entry], 0);
-		weight = hweight64((u64)bits);
-		/* only adjust the contended total count if needed */
-		if (weight)
-			atomic_sub(weight, &wss.total_count);
-	}
-}
-
-/*
- * Insert the given address into the working set array.
- */
-static void wss_insert(void *address)
-{
-	u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask;
-	u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
-	u32 nr = page & (BITS_PER_LONG - 1);
-
-	if (!test_and_set_bit(nr, &wss.entries[entry]))
-		atomic_inc(&wss.total_count);
-
-	wss_advance_clean_counter();
-}
-
-/*
- * Is the working set larger than the threshold?
- */
-static inline bool wss_exceeds_threshold(void)
-{
-	return atomic_read(&wss.total_count) >= wss.threshold;
-}
-
 /*
  * Translate ib_wr_opcode into ib_wc_opcode.
  */
 const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
 	[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+	[IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE,
 	[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
 	[IB_WR_SEND] = IB_WC_SEND,
 	[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
 	[IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+	[IB_WR_TID_RDMA_READ] = IB_WC_RDMA_READ,
 	[IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
 	[IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
 	[IB_WR_SEND_WITH_INV] = IB_WC_SEND,
@@ -348,6 +200,14 @@
 	[IB_OPCODE_RC_FETCH_ADD]                      = 12 + 8 + 28,
 	[IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]      = 12 + 8 + 4,
 	[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = 12 + 8 + 4,
+	[IB_OPCODE_TID_RDMA_READ_REQ]                 = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_READ_RESP]                = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_WRITE_REQ]                = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_WRITE_RESP]               = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_WRITE_DATA]               = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_WRITE_DATA_LAST]          = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_ACK]                      = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_RESYNC]                   = 12 + 8 + 36,
 	/* UC */
 	[IB_OPCODE_UC_SEND_FIRST]                     = 12 + 8,
 	[IB_OPCODE_UC_SEND_MIDDLE]                    = 12 + 8,
@@ -391,6 +251,17 @@
 	[IB_OPCODE_RC_FETCH_ADD]                      = &hfi1_rc_rcv,
 	[IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]      = &hfi1_rc_rcv,
 	[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = &hfi1_rc_rcv,
+
+	/* TID RDMA has separate handlers for different opcodes.*/
+	[IB_OPCODE_TID_RDMA_WRITE_REQ]       = &hfi1_rc_rcv_tid_rdma_write_req,
+	[IB_OPCODE_TID_RDMA_WRITE_RESP]      = &hfi1_rc_rcv_tid_rdma_write_resp,
+	[IB_OPCODE_TID_RDMA_WRITE_DATA]      = &hfi1_rc_rcv_tid_rdma_write_data,
+	[IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = &hfi1_rc_rcv_tid_rdma_write_data,
+	[IB_OPCODE_TID_RDMA_READ_REQ]        = &hfi1_rc_rcv_tid_rdma_read_req,
+	[IB_OPCODE_TID_RDMA_READ_RESP]       = &hfi1_rc_rcv_tid_rdma_read_resp,
+	[IB_OPCODE_TID_RDMA_RESYNC]          = &hfi1_rc_rcv_tid_rdma_resync,
+	[IB_OPCODE_TID_RDMA_ACK]             = &hfi1_rc_rcv_tid_rdma_ack,
+
 	/* UC */
 	[IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
 	[IB_OPCODE_UC_SEND_MIDDLE]                    = &hfi1_uc_rcv,
@@ -438,79 +309,6 @@
  */
 __be64 ib_hfi1_sys_image_guid;
 
-/**
- * hfi1_copy_sge - copy data to SGE memory
- * @ss: the SGE state
- * @data: the data to copy
- * @length: the length of the data
- * @release: boolean to release MR
- * @copy_last: do a separate copy of the last 8 bytes
- */
-void hfi1_copy_sge(
-	struct rvt_sge_state *ss,
-	void *data, u32 length,
-	bool release,
-	bool copy_last)
-{
-	struct rvt_sge *sge = &ss->sge;
-	int i;
-	bool in_last = false;
-	bool cacheless_copy = false;
-
-	if (sge_copy_mode == COPY_CACHELESS) {
-		cacheless_copy = length >= PAGE_SIZE;
-	} else if (sge_copy_mode == COPY_ADAPTIVE) {
-		if (length >= PAGE_SIZE) {
-			/*
-			 * NOTE: this *assumes*:
-			 * o The first vaddr is the dest.
-			 * o If multiple pages, then vaddr is sequential.
-			 */
-			wss_insert(sge->vaddr);
-			if (length >= (2 * PAGE_SIZE))
-				wss_insert(sge->vaddr + PAGE_SIZE);
-
-			cacheless_copy = wss_exceeds_threshold();
-		} else {
-			wss_advance_clean_counter();
-		}
-	}
-	if (copy_last) {
-		if (length > 8) {
-			length -= 8;
-		} else {
-			copy_last = false;
-			in_last = true;
-		}
-	}
-
-again:
-	while (length) {
-		u32 len = rvt_get_sge_length(sge, length);
-
-		WARN_ON_ONCE(len == 0);
-		if (unlikely(in_last)) {
-			/* enforce byte transfer ordering */
-			for (i = 0; i < len; i++)
-				((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
-		} else if (cacheless_copy) {
-			cacheless_memcpy(sge->vaddr, data, len);
-		} else {
-			memcpy(sge->vaddr, data, len);
-		}
-		rvt_update_sge(ss, len, release);
-		data += len;
-		length -= len;
-	}
-
-	if (copy_last) {
-		copy_last = false;
-		in_last = true;
-		length = 8;
-		goto again;
-	}
-}
-
 /*
  * Make sure the QP is ready and able to accept the given opcode.
  */
@@ -529,7 +327,7 @@
 static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc)
 {
 #ifdef CONFIG_FAULT_INJECTION
-	if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP)
+	if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP) {
 		/*
 		 * In order to drop non-IB traffic we
 		 * set PbcInsertHrc to NONE (0x2).
@@ -540,8 +338,9 @@
 		 * packet will not be delivered to the
 		 * correct context.
 		 */
+		pbc &= ~PBC_INSERT_HCRC_SMASK;
 		pbc |= (u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT;
-	else
+	} else {
 		/*
 		 * In order to drop regular verbs
 		 * traffic we set the PbcTestEbp
@@ -551,10 +350,129 @@
 		 * triggered and will be dropped.
 		 */
 		pbc |= PBC_TEST_EBP;
+	}
 #endif
 	return pbc;
 }
 
+static opcode_handler tid_qp_ok(int opcode, struct hfi1_packet *packet)
+{
+	if (packet->qp->ibqp.qp_type != IB_QPT_RC ||
+	    !(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
+		return NULL;
+	if ((opcode & RVT_OPCODE_QP_MASK) == IB_OPCODE_TID_RDMA)
+		return opcode_handler_tbl[opcode];
+	return NULL;
+}
+
+void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet)
+{
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+	struct ib_header *hdr = packet->hdr;
+	u32 tlen = packet->tlen;
+	struct hfi1_pportdata *ppd = rcd->ppd;
+	struct hfi1_ibport *ibp = &ppd->ibport_data;
+	struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
+	opcode_handler opcode_handler;
+	unsigned long flags;
+	u32 qp_num;
+	int lnh;
+	u8 opcode;
+
+	/* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */
+	if (unlikely(tlen < 15 * sizeof(u32)))
+		goto drop;
+
+	lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+	if (lnh != HFI1_LRH_BTH)
+		goto drop;
+
+	packet->ohdr = &hdr->u.oth;
+	trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+
+	opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
+	inc_opstats(tlen, &rcd->opstats->stats[opcode]);
+
+	/* verbs_qp can be picked up from any tid_rdma header struct */
+	qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_req.verbs_qp) &
+		RVT_QPN_MASK;
+
+	rcu_read_lock();
+	packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+	if (!packet->qp)
+		goto drop_rcu;
+	spin_lock_irqsave(&packet->qp->r_lock, flags);
+	opcode_handler = tid_qp_ok(opcode, packet);
+	if (likely(opcode_handler))
+		opcode_handler(packet);
+	else
+		goto drop_unlock;
+	spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+	rcu_read_unlock();
+
+	return;
+drop_unlock:
+	spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+drop_rcu:
+	rcu_read_unlock();
+drop:
+	ibp->rvp.n_pkt_drops++;
+}
+
+void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet)
+{
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+	struct ib_header *hdr = packet->hdr;
+	u32 tlen = packet->tlen;
+	struct hfi1_pportdata *ppd = rcd->ppd;
+	struct hfi1_ibport *ibp = &ppd->ibport_data;
+	struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
+	opcode_handler opcode_handler;
+	unsigned long flags;
+	u32 qp_num;
+	int lnh;
+	u8 opcode;
+
+	/* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */
+	if (unlikely(tlen < 15 * sizeof(u32)))
+		goto drop;
+
+	lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+	if (lnh != HFI1_LRH_BTH)
+		goto drop;
+
+	packet->ohdr = &hdr->u.oth;
+	trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+
+	opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
+	inc_opstats(tlen, &rcd->opstats->stats[opcode]);
+
+	/* verbs_qp can be picked up from any tid_rdma header struct */
+	qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_rsp.verbs_qp) &
+		RVT_QPN_MASK;
+
+	rcu_read_lock();
+	packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+	if (!packet->qp)
+		goto drop_rcu;
+	spin_lock_irqsave(&packet->qp->r_lock, flags);
+	opcode_handler = tid_qp_ok(opcode, packet);
+	if (likely(opcode_handler))
+		opcode_handler(packet);
+	else
+		goto drop_unlock;
+	spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+	rcu_read_unlock();
+
+	return;
+drop_unlock:
+	spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+drop_rcu:
+	rcu_read_unlock();
+drop:
+	ibp->rvp.n_pkt_drops++;
+}
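Both KDETH receive handlers above enforce the same minimum length: LRH (2) + BTH (3) + KDETH (9) + ICRC (1) = 15 dwords, i.e. 60 bytes. A runnable check of that arithmetic:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	unsigned int dws = 2 + 3 + 9 + 1;	/* LRH + BTH + KDETH + CRC */

	assert(dws == 15);
	assert(dws * sizeof(uint32_t) == 60);	/* the tlen floor in bytes */
	return 0;
}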
+
 static int hfi1_do_pkey_check(struct hfi1_packet *packet)
 {
 	struct hfi1_ctxtdata *rcd = packet->rcd;
@@ -713,11 +631,13 @@
 
 	spin_lock(&qp->s_lock);
 	if (tx->wqe) {
-		hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
+		rvt_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
 	} else if (qp->ibqp.qp_type == IB_QPT_RC) {
 		struct hfi1_opa_header *hdr;
 
 		hdr = &tx->phdr.hdr;
+		if (unlikely(status == SDMA_TXREQ_S_ABORTED))
+			hfi1_rc_verbs_aborted(qp, hdr);
 		hfi1_rc_send_complete(qp, hdr);
 	}
 	spin_unlock(&qp->s_lock);
@@ -725,11 +645,28 @@
 	hfi1_put_txreq(tx);
 }
 
+void hfi1_wait_kmem(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct ib_qp *ibqp = &qp->ibqp;
+	struct ib_device *ibdev = ibqp->device;
+	struct hfi1_ibdev *dev = to_idev(ibdev);
+
+	if (list_empty(&priv->s_iowait.list)) {
+		if (list_empty(&dev->memwait))
+			mod_timer(&dev->mem_timer, jiffies + 1);
+		qp->s_flags |= RVT_S_WAIT_KMEM;
+		list_add_tail(&priv->s_iowait.list, &dev->memwait);
+		priv->s_iowait.lock = &dev->iowait_lock;
+		trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
+		rvt_get_qp(qp);
+	}
+}
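hfi1_wait_kmem() only links the QP onto the device memwait list; serialization is the caller's job. The expected convention, inferred from wait_kmem() just below, is sketched here (a kernel-context fragment, not standalone code):

static void example_memwait_enqueue(struct hfi1_ibdev *dev, struct rvt_qp *qp)
{
	unsigned long flags;

	/* qp->s_lock protects qp->s_flags; the iowait_lock seqlock
	 * protects dev->memwait and the QP's s_iowait list linkage.
	 */
	spin_lock_irqsave(&qp->s_lock, flags);
	write_seqlock(&dev->iowait_lock);
	hfi1_wait_kmem(qp);
	write_sequnlock(&dev->iowait_lock);
	spin_unlock_irqrestore(&qp->s_lock, flags);
}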
+
 static int wait_kmem(struct hfi1_ibdev *dev,
 		     struct rvt_qp *qp,
 		     struct hfi1_pkt_state *ps)
 {
-	struct hfi1_qp_priv *priv = qp->priv;
 	unsigned long flags;
 	int ret = 0;
 
@@ -737,18 +674,10 @@
 	if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
 		write_seqlock(&dev->iowait_lock);
 		list_add_tail(&ps->s_txreq->txreq.list,
-			      &priv->s_iowait.tx_head);
-		if (list_empty(&priv->s_iowait.list)) {
-			if (list_empty(&dev->memwait))
-				mod_timer(&dev->mem_timer, jiffies + 1);
-			qp->s_flags |= RVT_S_WAIT_KMEM;
-			list_add_tail(&priv->s_iowait.list, &dev->memwait);
-			priv->s_iowait.lock = &dev->iowait_lock;
-			trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
-			rvt_get_qp(qp);
-		}
+			      &ps->wait->tx_head);
+		hfi1_wait_kmem(qp);
 		write_sequnlock(&dev->iowait_lock);
-		qp->s_flags &= ~RVT_S_BUSY;
+		hfi1_qp_unbusy(qp, ps->wait);
 		ret = -EBUSY;
 	}
 	spin_unlock_irqrestore(&qp->s_lock, flags);
@@ -774,11 +703,7 @@
 	int ret = 0;
 
 	while (length) {
-		len = ss->sge.length;
-		if (len > length)
-			len = length;
-		if (len > ss->sge.sge_length)
-			len = ss->sge.sge_length;
+		len = rvt_get_sge_length(&ss->sge, length);
 		WARN_ON_ONCE(len == 0);
 		ret = sdma_txadd_kvaddr(
 			sde->dd,
@@ -892,13 +817,22 @@
 
 	/* add icrc, lt byte, and padding to flit */
 	if (extra_bytes)
-		ret = sdma_txadd_kvaddr(sde->dd, &tx->txreq,
-					(void *)trail_buf, extra_bytes);
+		ret = sdma_txadd_daddr(sde->dd, &tx->txreq,
+				       sde->dd->sdma_pad_phys, extra_bytes);
 
 bail_txadd:
 	return ret;
 }
 
+static u64 update_hcrc(u8 opcode, u64 pbc)
+{
+	if ((opcode & IB_OPCODE_TID_RDMA) == IB_OPCODE_TID_RDMA) {
+		pbc &= ~PBC_INSERT_HCRC_SMASK;
+		pbc |= (u64)PBC_IHCRC_LKDETH << PBC_INSERT_HCRC_SHIFT;
+	}
+	return pbc;
+}
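update_hcrc() keys off the opcode group bits: every TID RDMA opcode shares the IB_OPCODE_TID_RDMA high bits, so one mask-and-compare covers the whole family. A runnable illustration; 0xe0 is an assumed group value here, not the driver's constant:

#include <stdint.h>
#include <stdio.h>

#define TID_RDMA_GROUP 0xe0	/* assumed group bits for illustration */

int main(void)
{
	uint8_t opcodes[] = { 0xe2, 0x0a };	/* TID-RDMA-like, RC-like */
	int i;

	for (i = 0; i < 2; i++)
		printf("opcode 0x%02x -> %s HCRC\n", opcodes[i],
		       (opcodes[i] & TID_RDMA_GROUP) == TID_RDMA_GROUP ?
		       "LKDETH" : "default");
	return 0;
}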
+
 int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 			u64 pbc)
 {
@@ -937,21 +871,24 @@
 			else
 				pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
 
-			if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode)))
-				pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
 			pbc = create_pbc(ppd,
 					 pbc,
 					 qp->srate_mbps,
 					 vl,
 					 plen);
+
+			if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode)))
+				pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
+			else
+				/* Update HCRC based on packet opcode */
+				pbc = update_hcrc(ps->opcode, pbc);
 		}
 		tx->wqe = qp->s_wqe;
 		ret = build_verbs_tx_desc(tx->sde, len, tx, ahg_info, pbc);
 		if (unlikely(ret))
 			goto bail_build;
 	}
-	ret =  sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq,
-			       ps->pkts_sent);
+	ret = sdma_send_txreq(tx->sde, ps->wait, &tx->txreq, ps->pkts_sent);
 	if (unlikely(ret < 0)) {
 		if (ret == -ECOMM)
 			goto bail_ecomm;
@@ -987,7 +924,6 @@
 {
 	struct hfi1_qp_priv *priv = qp->priv;
 	struct hfi1_devdata *dd = sc->dd;
-	struct hfi1_ibdev *dev = &dd->verbs_dev;
 	unsigned long flags;
 	int ret = 0;
 
@@ -999,9 +935,9 @@
 	 */
 	spin_lock_irqsave(&qp->s_lock, flags);
 	if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
-		write_seqlock(&dev->iowait_lock);
+		write_seqlock(&sc->waitlock);
 		list_add_tail(&ps->s_txreq->txreq.list,
-			      &priv->s_iowait.tx_head);
+			      &ps->wait->tx_head);
 		if (list_empty(&priv->s_iowait.list)) {
 			struct hfi1_ibdev *dev = &dd->verbs_dev;
 			int was_empty;
@@ -1010,17 +946,18 @@
 			dev->n_piodrain += !!(flag & HFI1_S_WAIT_PIO_DRAIN);
 			qp->s_flags |= flag;
 			was_empty = list_empty(&sc->piowait);
+			iowait_get_priority(&priv->s_iowait);
 			iowait_queue(ps->pkts_sent, &priv->s_iowait,
 				     &sc->piowait);
-			priv->s_iowait.lock = &dev->iowait_lock;
+			priv->s_iowait.lock = &sc->waitlock;
 			trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
 			rvt_get_qp(qp);
 			/* counting: only call wantpiobuf_intr if first user */
 			if (was_empty)
 				hfi1_sc_wantpiobuf_intr(sc, 1);
 		}
-		write_sequnlock(&dev->iowait_lock);
-		qp->s_flags &= ~RVT_S_BUSY;
+		write_sequnlock(&sc->waitlock);
+		hfi1_qp_unbusy(qp, ps->wait);
 		ret = -EBUSY;
 	}
 	spin_unlock_irqrestore(&qp->s_lock, flags);
@@ -1091,17 +1028,20 @@
 		else
 			pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
 
+		pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen);
 		if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode)))
 			pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
-		pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen);
+		else
+			/* Update HCRC based on packet opcode */
+			pbc = update_hcrc(ps->opcode, pbc);
 	}
 	if (cb)
 		iowait_pio_inc(&priv->s_iowait);
 	pbuf = sc_buffer_alloc(sc, plen, cb, qp);
-	if (unlikely(!pbuf)) {
+	if (IS_ERR_OR_NULL(pbuf)) {
 		if (cb)
 			verbs_pio_complete(qp, 0);
-		if (ppd->host_link_state != HLS_UP_ACTIVE) {
+		if (IS_ERR(pbuf)) {
 			/*
 			 * If we have filled the PIO buffers to capacity and are
 			 * not in an active state this request is not going to
@@ -1137,10 +1077,8 @@
 		if (ss) {
 			while (len) {
 				void *addr = ss->sge.vaddr;
-				u32 slen = ss->sge.length;
+				u32 slen = rvt_get_sge_length(&ss->sge, len);
 
-				if (slen > len)
-					slen = len;
 				rvt_update_sge(ss, slen, false);
 				seg_pio_copy_mid(pbuf, addr, slen);
 				len -= slen;
@@ -1148,7 +1086,8 @@
 		}
 		/* add icrc, lt byte, and padding to flit */
 		if (extra_bytes)
-			seg_pio_copy_mid(pbuf, trail_buf, extra_bytes);
+			seg_pio_copy_mid(pbuf, ppd->dd->sdma_pad_dma,
+					 extra_bytes);
 
 		seg_pio_copy_end(pbuf);
 	}
@@ -1158,15 +1097,15 @@
 			       &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5));
 
 pio_bail:
+	spin_lock_irqsave(&qp->s_lock, flags);
 	if (qp->s_wqe) {
-		spin_lock_irqsave(&qp->s_lock, flags);
-		hfi1_send_complete(qp, qp->s_wqe, wc_status);
-		spin_unlock_irqrestore(&qp->s_lock, flags);
+		rvt_send_complete(qp, qp->s_wqe, wc_status);
 	} else if (qp->ibqp.qp_type == IB_QPT_RC) {
-		spin_lock_irqsave(&qp->s_lock, flags);
+		if (unlikely(wc_status == IB_WC_GENERAL_ERR))
+			hfi1_rc_verbs_aborted(qp, &ps->s_txreq->phdr.hdr);
 		hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr);
-		spin_unlock_irqrestore(&qp->s_lock, flags);
 	}
+	spin_unlock_irqrestore(&qp->s_lock, flags);
 
 	ret = 0;
 
@@ -1286,15 +1225,16 @@
 	case IB_QPT_UD:
 		break;
 	case IB_QPT_UC:
-	case IB_QPT_RC: {
+	case IB_QPT_RC:
+		priv->s_running_pkt_size =
+			(tx->s_cur_size + priv->s_running_pkt_size) / 2;
 		if (piothreshold &&
-		    tx->s_cur_size <= min(piothreshold, qp->pmtu) &&
+		    priv->s_running_pkt_size <= min(piothreshold, qp->pmtu) &&
 		    (BIT(ps->opcode & OPMASK) & pio_opmask[ps->opcode >> 5]) &&
 		    iowait_sdma_pending(&priv->s_iowait) == 0 &&
 		    !sdma_txreq_built(&tx->txreq))
 			return dd->process_pio_send;
 		break;
-	}
 	default:
 		break;
 	}
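The new s_running_pkt_size is an exponentially weighted moving average with weight 1/2, so the PIO-vs-SDMA decision tracks recent traffic rather than one packet. Worked, runnable arithmetic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t running = 0;
	uint32_t sizes[] = { 4096, 4096, 64, 64 };
	int i;

	for (i = 0; i < 4; i++) {
		running = (sizes[i] + running) / 2;
		printf("pkt %u -> running %u\n", sizes[i], running);
	}
	/* Prints 2048, 3072, 1568, 816: two small packets are enough to
	 * decay the average back under min(piothreshold, pmtu) and
	 * re-enable the PIO path.
	 */
	return 0;
}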
@@ -1367,7 +1307,7 @@
 			hfi1_cdbg(PIO, "%s() Failed. Completing with err",
 				  __func__);
 			spin_lock_irqsave(&qp->s_lock, flags);
-			hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
+			rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
 			spin_unlock_irqrestore(&qp->s_lock, flags);
 		}
 		return -EINVAL;
@@ -1409,15 +1349,15 @@
 	rdi->dparms.props.max_mr_size = U64_MAX;
 	rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX;
 	rdi->dparms.props.max_qp = hfi1_max_qps;
-	rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs;
+	rdi->dparms.props.max_qp_wr =
+		(hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ?
+		 HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs);
 	rdi->dparms.props.max_send_sge = hfi1_max_sges;
 	rdi->dparms.props.max_recv_sge = hfi1_max_sges;
 	rdi->dparms.props.max_sge_rd = hfi1_max_sges;
 	rdi->dparms.props.max_cq = hfi1_max_cqs;
 	rdi->dparms.props.max_ah = hfi1_max_ahs;
 	rdi->dparms.props.max_cqe = hfi1_max_cqes;
-	rdi->dparms.props.max_mr = rdi->lkey_table.max;
-	rdi->dparms.props.max_fmr = rdi->lkey_table.max;
 	rdi->dparms.props.max_map_per_fmr = 32767;
 	rdi->dparms.props.max_pd = hfi1_max_pds;
 	rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC;
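The max_qp_wr clamp above keeps the advertised WR count strictly below HFI1_QP_WQE_INVALID ((u32)-1, defined in verbs.h later in this patch), which TID RDMA uses as a "no WQE" sentinel. A runnable check of the boundary case:

#include <assert.h>
#include <stdint.h>

#define HFI1_QP_WQE_INVALID ((uint32_t)-1)

static uint32_t clamp_qp_wr(uint32_t requested)
{
	return requested >= HFI1_QP_WQE_INVALID ?
	       HFI1_QP_WQE_INVALID - 1 : requested;
}

int main(void)
{
	assert(clamp_qp_wr(16384) == 16384);		/* typical value */
	assert(clamp_qp_wr(UINT32_MAX) == 0xfffffffe);	/* clamped */
	return 0;
}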
@@ -1596,6 +1536,7 @@
 	sl = rdma_ah_get_sl(ah_attr);
 	if (sl >= ARRAY_SIZE(ibp->sl_to_sc))
 		return -EINVAL;
+	sl = array_index_nospec(sl, ARRAY_SIZE(ibp->sl_to_sc));
 
 	sc5 = ibp->sl_to_sc[sl];
 	if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
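The added array_index_nospec() call is the standard Spectre-v1 hardening: bounds-check first, then clamp the index so a mispredicted branch cannot speculatively read past sl_to_sc. The generic pattern as a kernel-context sketch (names are illustrative):

#include <linux/nospec.h>
#include <linux/types.h>

static u8 safe_lookup(const u8 *table, size_t size, size_t idx)
{
	if (idx >= size)
		return 0;	/* reject out-of-range input outright */
	idx = array_index_nospec(idx, size);	/* clamp under speculation */
	return table[idx];
}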
@@ -1800,15 +1741,15 @@
 
 static u64 hfi1_sps_ints(void)
 {
-	unsigned long flags;
+	unsigned long index, flags;
 	struct hfi1_devdata *dd;
 	u64 sps_ints = 0;
 
-	spin_lock_irqsave(&hfi1_devs_lock, flags);
-	list_for_each_entry(dd, &hfi1_dev_list, list) {
+	xa_lock_irqsave(&hfi1_dev_table, flags);
+	xa_for_each(&hfi1_dev_table, index, dd) {
 		sps_ints += get_all_cpu_total(dd->int_counter);
 	}
-	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+	xa_unlock_irqrestore(&hfi1_dev_table, flags);
 	return sps_ints;
 }
 
@@ -1838,6 +1779,20 @@
 	return count;
 }
 
+static const struct ib_device_ops hfi1_dev_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_HFI1,
+
+	.alloc_hw_stats = alloc_hw_stats,
+	.alloc_rdma_netdev = hfi1_vnic_alloc_rn,
+	.get_dev_fw_str = hfi1_get_dev_fw_str,
+	.get_hw_stats = get_hw_stats,
+	.init_port = hfi1_create_port_files,
+	.modify_device = modify_device,
+	/* keep process mad in the driver */
+	.process_mad = hfi1_process_mad,
+};
+
 /**
  * hfi1_register_ib_device - register our device with the infiniband core
  * @dd: the device data structure
@@ -1878,17 +1833,10 @@
 	 */
 	if (!ib_hfi1_sys_image_guid)
 		ib_hfi1_sys_image_guid = ibdev->node_guid;
-	ibdev->owner = THIS_MODULE;
 	ibdev->phys_port_cnt = dd->num_pports;
 	ibdev->dev.parent = &dd->pcidev->dev;
-	ibdev->modify_device = modify_device;
-	ibdev->alloc_hw_stats = alloc_hw_stats;
-	ibdev->get_hw_stats = get_hw_stats;
-	ibdev->alloc_rdma_netdev = hfi1_vnic_alloc_rn;
 
-	/* keep process mad in the driver */
-	ibdev->process_mad = hfi1_process_mad;
-	ibdev->get_dev_fw_str = hfi1_get_dev_fw_str;
+	ib_set_device_ops(ibdev, &hfi1_dev_ops);
 
 	strlcpy(ibdev->node_desc, init_utsname()->nodename,
 		sizeof(ibdev->node_desc));
@@ -1896,7 +1844,6 @@
 	/*
 	 * Fill in rvt info object.
 	 */
-	dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files;
 	dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev;
 	dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah;
 	dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah;
@@ -1926,6 +1873,7 @@
 	dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE;
 
 	dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc;
+	dd->verbs_dev.rdi.driver_f.qp_priv_init = hfi1_qp_priv_init;
 	dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free;
 	dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps;
 	dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset;
@@ -1943,7 +1891,7 @@
 	dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp;
 	dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp;
 	dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc;
-	dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe;
+	dd->verbs_dev.rdi.driver_f.setup_wqe = hfi1_setup_wqe;
 	dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup =
 						hfi1_comp_vect_mappings_lookup;
 
@@ -1956,10 +1904,18 @@
 	dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size;
 	dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
 	dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
+	dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode;
+	dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold;
+	dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period;
+	dd->verbs_dev.rdi.dparms.reserved_operations = 1;
+	dd->verbs_dev.rdi.dparms.extra_rdma_atomic = HFI1_TID_RDMA_WRITE_CNT;
 
 	/* post send table */
 	dd->verbs_dev.rdi.post_parms = hfi1_post_parms;
 
+	/* opcode translation table */
+	dd->verbs_dev.rdi.wc_opcode = ib_hfi1_wc_opcode;
+
 	ppd = dd->pport;
 	for (i = 0; i < dd->num_pports; i++, ppd++)
 		rvt_init_port(&dd->verbs_dev.rdi,
@@ -1967,7 +1923,10 @@
 			      i,
 			      ppd->pkeys);
 
-	ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_HFI1);
+	rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev,
+				    &ib_hfi1_attr_group);
+
+	ret = rvt_register_device(&dd->verbs_dev.rdi);
 	if (ret)
 		goto err_verbs_txreq;
 
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index a4d0650..ae9582d 100644
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -71,6 +71,8 @@
 struct hfi1_packet;
 
 #include "iowait.h"
+#include "tid_rdma.h"
+#include "opfn.h"
 
 #define HFI1_MAX_RDMA_ATOMIC     16
 
@@ -156,21 +158,83 @@
 	struct hfi1_ahg_info *s_ahg;              /* ahg info for next header */
 	struct sdma_engine *s_sde;                /* current sde */
 	struct send_context *s_sendcontext;       /* current sendcontext */
+	struct hfi1_ctxtdata *rcd;                /* QP's receive context */
+	struct page **pages;                      /* for TID page scan */
+	u32 tid_enqueue;                          /* saved when tid waited */
 	u8 s_sc;		                  /* SC[0..4] for next packet */
 	struct iowait s_iowait;
+	struct timer_list s_tid_timer;            /* for timing tid wait */
+	struct timer_list s_tid_retry_timer;      /* for timing tid ack */
+	struct list_head tid_wait;                /* for queueing tid space */
+	struct hfi1_opfn_data opfn;
+	struct tid_flow_state flow_state;
+	struct tid_rdma_qp_params tid_rdma;
 	struct rvt_qp *owner;
+	u16 s_running_pkt_size;
 	u8 hdr_type; /* 9B or 16B */
+	struct rvt_sge_state tid_ss;       /* SGE state pointer for 2nd leg */
+	atomic_t n_requests;               /* # of TID RDMA requests in the */
+					   /* queue */
+	atomic_t n_tid_requests;            /* # of sent TID RDMA requests */
+	unsigned long tid_timer_timeout_jiffies;
+	unsigned long tid_retry_timeout_jiffies;
+
+	/* variables for the TID RDMA SE state machine */
+	u8 s_state;
+	u8 s_retry;
+	u8 rnr_nak_state;       /* RNR NAK state */
+	u8 s_nak_state;
+	u32 s_nak_psn;
+	u32 s_flags;
+	u32 s_tid_cur;
+	u32 s_tid_head;
+	u32 s_tid_tail;
+	u32 r_tid_head;     /* Most recently added TID RDMA request */
+	u32 r_tid_tail;     /* the last completed TID RDMA request */
+	u32 r_tid_ack;      /* the TID RDMA request to be ACK'ed */
+	u32 r_tid_alloc;    /* Request for which we are allocating resources */
+	u32 pending_tid_w_segs; /* Num of pending tid write segments */
+	u32 pending_tid_w_resp; /* Num of pending tid write responses */
+	u32 alloc_w_segs;       /* Number of segments for which write */
+			       /* resources have been allocated for this QP */
+
+	/* For TID RDMA READ */
+	u32 tid_r_reqs;         /* Num of tid reads requested */
+	u32 tid_r_comp;         /* Num of tid reads completed */
+	u32 pending_tid_r_segs; /* Num of pending tid read segments */
+	u16 pkts_ps;            /* packets per segment */
+	u8 timeout_shift;       /* account for number of packets per segment */
+
+	u32 r_next_psn_kdeth;
+	u32 r_next_psn_kdeth_save;
+	u32 s_resync_psn;
+	u8 sync_pt;           /* Set when QP reaches sync point */
+	u8 resync;
+};
+
+#define HFI1_QP_WQE_INVALID   ((u32)-1)
+
+struct hfi1_swqe_priv {
+	struct tid_rdma_request tid_req;
+	struct rvt_sge_state ss;  /* Used for TID RDMA READ Request */
+};
+
+struct hfi1_ack_priv {
+	struct rvt_sge_state ss;               /* used for TID WRITE RESP */
+	struct tid_rdma_request tid_req;
 };
 
 /*
  * This structure is used to hold commonly lookedup and computed values during
  * the send engine progress.
  */
+struct iowait_work;
 struct hfi1_pkt_state {
 	struct hfi1_ibdev *dev;
 	struct hfi1_ibport *ibp;
 	struct hfi1_pportdata *ppd;
 	struct verbs_txreq *s_txreq;
+	struct iowait_work *wait;
 	unsigned long flags;
 	unsigned long timeout;
 	unsigned long timeout_int;
@@ -221,6 +285,7 @@
 	struct kmem_cache *verbs_txreq_cache;
 	u64 n_txwait;
 	u64 n_kmem_wait;
+	u64 n_tidwait;
 
 	/* protect iowait lists */
 	seqlock_t iowait_lock ____cacheline_aligned_in_smp;
@@ -247,7 +312,7 @@
 	return container_of(rdi, struct hfi1_ibdev, rdi);
 }
 
-static inline struct rvt_qp *iowait_to_qp(struct  iowait *s_iowait)
+static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait)
 {
 	struct hfi1_qp_priv *priv;
 
@@ -308,14 +373,36 @@
 	return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT;
 }
 
+static inline struct tid_rdma_request *wqe_to_tid_req(struct rvt_swqe *wqe)
+{
+	return &((struct hfi1_swqe_priv *)wqe->priv)->tid_req;
+}
+
+static inline struct tid_rdma_request *ack_to_tid_req(struct rvt_ack_entry *e)
+{
+	return &((struct hfi1_ack_priv *)e->priv)->tid_req;
+}
+
+/*
+ * Compose the full 32-bit PSN for a flow: the flow generation supplies
+ * the upper bits and @psn supplies the KDETH sequence bits.
+ */
+static inline u32 __full_flow_psn(struct flow_state *state, u32 psn)
+{
+	return mask_psn((state->generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
+			(psn & HFI1_KDETH_BTH_SEQ_MASK));
+}
+
+static inline u32 full_flow_psn(struct tid_rdma_flow *flow, u32 psn)
+{
+	return __full_flow_psn(&flow->flow_state, psn);
+}
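With the widths assumed here from the driver (an 11-bit KDETH sequence via HFI1_KDETH_BTH_SEQ_SHIFT and a 24-bit PSN via mask_psn()), generation 3 with sequence 5 composes to (3 << 11) | 5 = 0x1805. Runnable:

#include <assert.h>
#include <stdint.h>

#define SEQ_SHIFT 11			/* assumed KDETH sequence width */
#define SEQ_MASK  ((1u << SEQ_SHIFT) - 1)
#define PSN_MASK  0xffffffu		/* 24-bit PSN space */

static uint32_t full_psn(uint32_t generation, uint32_t psn)
{
	return ((generation << SEQ_SHIFT) | (psn & SEQ_MASK)) & PSN_MASK;
}

int main(void)
{
	assert(full_psn(3, 5) == 0x1805);
	return 0;
}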
+
 struct verbs_txreq;
 void hfi1_put_txreq(struct verbs_txreq *tx);
 
 int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
 
-void hfi1_copy_sge(struct rvt_sge_state *ss, void *data, u32 length,
-		   bool release, bool copy_last);
-
 void hfi1_cnp_rcv(struct hfi1_packet *packet);
 
 void hfi1_uc_rcv(struct hfi1_packet *packet);
@@ -329,6 +416,7 @@
 
 u8 ah_to_sc(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr);
 
+void hfi1_rc_verbs_aborted(struct rvt_qp *qp, struct hfi1_opa_header *opah);
 void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah);
 
 void hfi1_ud_rcv(struct hfi1_packet *packet);
@@ -343,7 +431,8 @@
 void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
 		    int attr_mask, struct ib_udata *udata);
 void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait);
-int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe,
+		   bool *call_send);
 
 extern const u32 rc_only_opcode;
 extern const u32 uc_only_opcode;
@@ -354,18 +443,18 @@
 		  const struct ib_global_route *grh, u32 hwords, u32 nwords);
 
 void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
-			  u32 bth0, u32 bth2, int middle,
+			  u32 bth0, u32 bth1, u32 bth2, int middle,
 			  struct hfi1_pkt_state *ps);
 
+bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+			      bool tid);
+
 void _hfi1_do_send(struct work_struct *work);
 
 void hfi1_do_send_from_rvt(struct rvt_qp *qp);
 
 void hfi1_do_send(struct rvt_qp *qp, bool in_thread);
 
-void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
-			enum ib_wc_status status);
-
 void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn);
 
 int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
@@ -378,6 +467,10 @@
 
 void hfi1_unregister_ib_device(struct hfi1_devdata *);
 
+void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet);
+
+void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet);
+
 void hfi1_ib_rcv(struct hfi1_packet *packet);
 
 void hfi1_16B_rcv(struct hfi1_packet *packet);
@@ -390,33 +483,21 @@
 int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 			u64 pbc);
 
-int hfi1_wss_init(void);
-void hfi1_wss_exit(void);
-
-/* platform specific: return the lowest level cache (llc) size, in KiB */
-static inline int wss_llc_size(void)
-{
-	/* assume that the boot CPU value is universal for all CPUs */
-	return boot_cpu_data.x86_cache_size;
-}
-
-/* platform specific: cacheless copy */
-static inline void cacheless_memcpy(void *dst, void *src, size_t n)
-{
-	/*
-	 * Use the only available X64 cacheless copy.  Add a __user cast
-	 * to quiet sparse.  The src agument is already in the kernel so
-	 * there are no security issues.  The extra fault recovery machinery
-	 * is not invoked.
-	 */
-	__copy_user_nocache(dst, (void __user *)src, n, 0);
-}
-
 static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr)
 {
 	return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ);
 }
 
+void hfi1_wait_kmem(struct rvt_qp *qp);
+
+static inline void hfi1_trdma_send_complete(struct rvt_qp *qp,
+					    struct rvt_swqe *wqe,
+					    enum ib_wc_status status)
+{
+	trdma_clean_swqe(qp, wqe);
+	rvt_send_complete(qp, wqe, status);
+}
+
 extern const enum ib_wc_opcode ib_hfi1_wc_opcode[];
 
 extern const u8 hdr_len_by_opcode[];
diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.c b/drivers/infiniband/hw/hfi1/verbs_txreq.c
index c4ab2d5..8f766dd 100644
--- a/drivers/infiniband/hw/hfi1/verbs_txreq.c
+++ b/drivers/infiniband/hw/hfi1/verbs_txreq.c
@@ -100,7 +100,7 @@
 	if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
 		struct hfi1_qp_priv *priv;
 
-		tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
+		tx = kmem_cache_alloc(dev->verbs_txreq_cache, VERBS_TXREQ_GFP);
 		if (tx)
 			goto out;
 		priv = qp->priv;
diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h
index 1c19bbc..bfa6e08 100644
--- a/drivers/infiniband/hw/hfi1/verbs_txreq.h
+++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h
@@ -72,6 +72,7 @@
 struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
 				struct rvt_qp *qp);
 
+#define VERBS_TXREQ_GFP (GFP_ATOMIC | __GFP_NOWARN)
 static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
 					    struct rvt_qp *qp)
 	__must_hold(&qp->slock)
@@ -79,7 +80,7 @@
 	struct verbs_txreq *tx;
 	struct hfi1_qp_priv *priv = qp->priv;
 
-	tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
+	tx = kmem_cache_alloc(dev->verbs_txreq_cache, VERBS_TXREQ_GFP);
 	if (unlikely(!tx)) {
 		/* call slow path to get the lock */
 		tx = __get_txreq(dev, qp);
@@ -94,6 +95,7 @@
 	tx->txreq.num_desc = 0;
 	/* Set the header type */
 	tx->phdr.hdr.hdr_type = priv->hdr_type;
+	tx->txreq.flags = 0;
 	return tx;
 }
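The VERBS_TXREQ_GFP change is about log noise, not behavior: GFP_ATOMIC is still required because the send engine cannot sleep, and __GFP_NOWARN suppresses the allocation-failure splat since failure is an expected case that the __get_txreq() slow path handles. A hedged sketch of the shape (cache name hypothetical):

#include <linux/slab.h>

#define EXAMPLE_GFP (GFP_ATOMIC | __GFP_NOWARN)

static void *example_fast_alloc(struct kmem_cache *example_cache)
{
	/* May return NULL under memory pressure, silently; callers are
	 * expected to fall back to a sleeping or queueing slow path.
	 */
	return kmem_cache_alloc(example_cache, EXAMPLE_GFP);
}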
 
@@ -102,22 +104,19 @@
 	return &tx->txreq;
 }
 
-static inline struct verbs_txreq *get_waiting_verbs_txreq(struct rvt_qp *qp)
+static inline struct verbs_txreq *get_waiting_verbs_txreq(struct iowait_work *w)
 {
 	struct sdma_txreq *stx;
-	struct hfi1_qp_priv *priv = qp->priv;
 
-	stx = iowait_get_txhead(&priv->s_iowait);
+	stx = iowait_get_txhead(w);
 	if (stx)
 		return container_of(stx, struct verbs_txreq, txreq);
 	return NULL;
 }
 
-static inline bool verbs_txreq_queued(struct rvt_qp *qp)
+static inline bool verbs_txreq_queued(struct iowait_work *w)
 {
-	struct hfi1_qp_priv *priv = qp->priv;
-
-	return iowait_packet_queued(&priv->s_iowait);
+	return iowait_packet_queued(w);
 }
 
 void hfi1_put_txreq(struct verbs_txreq *tx);
diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c
index c643d80..b49e60e 100644
--- a/drivers/infiniband/hw/hfi1/vnic_main.c
+++ b/drivers/infiniband/hw/hfi1/vnic_main.c
@@ -120,7 +120,7 @@
 	uctxt->seq_cnt = 1;
 	uctxt->is_vnic = true;
 
-	hfi1_set_vnic_msix_info(uctxt);
+	msix_request_rcd_irq(uctxt);
 
 	hfi1_stats.sps_ctxts++;
 	dd_dev_dbg(dd, "created vnic context %d\n", uctxt->ctxt);
@@ -135,8 +135,6 @@
 	dd_dev_dbg(dd, "closing vnic context %d\n", uctxt->ctxt);
 	flush_wc();
 
-	hfi1_reset_vnic_msix_info(uctxt);
-
 	/*
 	 * Disable receive context and interrupt available, reset all
 	 * RcvCtxtCtrl bits to default values.
@@ -148,6 +146,10 @@
 		     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
 		     HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt);
 
+	/* msix_intr is set only when an IRQ was requested; only then free it */
+	if (uctxt->msix_intr)
+		msix_free_irq(dd, uctxt->msix_intr);
+
 	uctxt->event_flags = 0;
 
 	hfi1_clear_tids(uctxt);
@@ -160,12 +162,12 @@
 
 void hfi1_vnic_setup(struct hfi1_devdata *dd)
 {
-	idr_init(&dd->vnic.vesw_idr);
+	xa_init(&dd->vnic.vesws);
 }
 
 void hfi1_vnic_cleanup(struct hfi1_devdata *dd)
 {
-	idr_destroy(&dd->vnic.vesw_idr);
+	WARN_ON(!xa_empty(&dd->vnic.vesws));
 }
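The idr-to-xarray conversion in this file follows the usual lifecycle: xa_init() at setup, xa_insert() keyed by vesw_id in the hunks below (it fails with -EBUSY on a duplicate key), lockless xa_load() in the receive path, xa_erase() on port close, and the xa_empty() sanity check above at cleanup. A condensed kernel-style sketch of that lifecycle, with an illustrative standalone table:

#include <linux/xarray.h>

static DEFINE_XARRAY(example_vesws);

static int example_add(unsigned long vesw_id, void *vinfo)
{
	/* -EBUSY if vesw_id is already present */
	return xa_insert(&example_vesws, vesw_id, vinfo, GFP_KERNEL);
}

static void *example_lookup(unsigned long vesw_id)
{
	return xa_load(&example_vesws, vesw_id);	/* RCU-safe */
}

static void example_remove(unsigned long vesw_id)
{
	xa_erase(&example_vesws, vesw_id);
}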
 
 #define SUM_GRP_COUNTERS(stats, qstats, x_grp) do {            \
@@ -421,8 +423,7 @@
 
 static u16 hfi1_vnic_select_queue(struct net_device *netdev,
 				  struct sk_buff *skb,
-				  struct net_device *sb_dev,
-				  select_queue_fallback_t fallback)
+				  struct net_device *sb_dev)
 {
 	struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);
 	struct opa_vnic_skb_mdata *mdata;
@@ -532,7 +533,7 @@
 	l4_type = hfi1_16B_get_l4(packet->ebuf);
 	if (likely(l4_type == OPA_16B_L4_ETHR)) {
 		vesw_id = HFI1_VNIC_GET_VESWID(packet->ebuf);
-		vinfo = idr_find(&dd->vnic.vesw_idr, vesw_id);
+		vinfo = xa_load(&dd->vnic.vesws, vesw_id);
 
 		/*
 		 * In case of invalid vesw id, count the error on
@@ -540,9 +541,10 @@
 		 */
 		if (unlikely(!vinfo)) {
 			struct hfi1_vnic_vport_info *vinfo_tmp;
-			int id_tmp = 0;
+			unsigned long index = 0;
 
-			vinfo_tmp =  idr_get_next(&dd->vnic.vesw_idr, &id_tmp);
+			vinfo_tmp = xa_find(&dd->vnic.vesws, &index, ULONG_MAX,
+					XA_PRESENT);
 			if (vinfo_tmp) {
 				spin_lock(&vport_cntr_lock);
 				vinfo_tmp->stats[0].netstats.rx_nohandler++;
@@ -596,8 +598,7 @@
 	if (!vinfo->vesw_id)
 		return -EINVAL;
 
-	rc = idr_alloc(&dd->vnic.vesw_idr, vinfo, vinfo->vesw_id,
-		       vinfo->vesw_id + 1, GFP_NOWAIT);
+	rc = xa_insert(&dd->vnic.vesws, vinfo->vesw_id, vinfo, GFP_KERNEL);
 	if (rc < 0)
 		return rc;
 
@@ -623,10 +624,10 @@
 	clear_bit(HFI1_VNIC_UP, &vinfo->flags);
 	netif_carrier_off(vinfo->netdev);
 	netif_tx_disable(vinfo->netdev);
-	idr_remove(&dd->vnic.vesw_idr, vinfo->vesw_id);
+	xa_erase(&dd->vnic.vesws, vinfo->vesw_id);
 
 	/* ensure irqs see the change */
-	hfi1_vnic_synchronize_irq(dd);
+	msix_vnic_synchronize_irq(dd);
 
 	/* remove unread skbs */
 	for (i = 0; i < vinfo->num_rx_q; i++) {
@@ -690,8 +691,6 @@
 		rc = hfi1_vnic_txreq_init(dd);
 		if (rc)
 			goto txreq_fail;
-
-		dd->vnic.msix_idx = dd->first_dyn_msix_idx;
 	}
 
 	for (i = dd->vnic.num_ctxt; i < vinfo->num_rx_q; i++) {
@@ -816,14 +815,14 @@
 
 	size = sizeof(struct opa_vnic_rdma_netdev) + sizeof(*vinfo);
 	netdev = alloc_netdev_mqs(size, name, name_assign_type, setup,
-				  chip_sdma_engines(dd), dd->num_vnic_contexts);
+				  dd->num_sdma, dd->num_vnic_contexts);
 	if (!netdev)
 		return ERR_PTR(-ENOMEM);
 
 	rn = netdev_priv(netdev);
 	vinfo = opa_vnic_dev_priv(netdev);
 	vinfo->dd = dd;
-	vinfo->num_tx_q = chip_sdma_engines(dd);
+	vinfo->num_tx_q = dd->num_sdma;
 	vinfo->num_rx_q = dd->num_vnic_contexts;
 	vinfo->netdev = netdev;
 	rn->free_rdma_netdev = hfi1_vnic_free_rn;
diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c
index c3c96c5..7d90b90 100644
--- a/drivers/infiniband/hw/hfi1/vnic_sdma.c
+++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2017 Intel Corporation.
+ * Copyright(c) 2017 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -57,7 +57,6 @@
 
 #define HFI1_VNIC_TXREQ_NAME_LEN   32
 #define HFI1_VNIC_SDMA_DESC_WTRMRK 64
-#define HFI1_VNIC_SDMA_RETRY_COUNT 1
 
 /*
  * struct vnic_txreq - VNIC transmit descriptor
@@ -67,7 +66,6 @@
  * @pad: pad buffer
  * @plen: pad length
  * @pbc_val: pbc value
- * @retry_count: tx retry count
  */
 struct vnic_txreq {
 	struct sdma_txreq       txreq;
@@ -77,8 +75,6 @@
 	unsigned char           pad[HFI1_VNIC_MAX_PAD];
 	u16                     plen;
 	__le64                  pbc_val;
-
-	u32                     retry_count;
 };
 
 static void vnic_sdma_complete(struct sdma_txreq *txreq,
@@ -106,13 +102,13 @@
 		goto bail_txadd;
 
 	for (i = 0; i < skb_shinfo(tx->skb)->nr_frags; i++) {
-		struct skb_frag_struct *frag = &skb_shinfo(tx->skb)->frags[i];
+		skb_frag_t *frag = &skb_shinfo(tx->skb)->frags[i];
 
 		/* combine physically continuous fragments later? */
 		ret = sdma_txadd_page(sde->dd,
 				      &tx->txreq,
 				      skb_frag_page(frag),
-				      frag->page_offset,
+				      skb_frag_off(frag),
 				      skb_frag_size(frag));
 		if (unlikely(ret))
 			goto bail_txadd;
@@ -196,10 +192,9 @@
 	ret = build_vnic_tx_desc(sde, tx, pbc);
 	if (unlikely(ret))
 		goto free_desc;
-	tx->retry_count = 0;
 
-	ret = sdma_send_txreq(sde, &vnic_sdma->wait, &tx->txreq,
-			      vnic_sdma->pkts_sent);
+	ret = sdma_send_txreq(sde, iowait_get_ib_work(&vnic_sdma->wait),
+			      &tx->txreq, vnic_sdma->pkts_sent);
 	/* When -ECOMM, sdma callback will be called with ABORT status */
 	if (unlikely(ret && unlikely(ret != -ECOMM)))
 		goto free_desc;
@@ -230,25 +225,26 @@
  * become available.
  */
 static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde,
-				struct iowait *wait,
+				struct iowait_work *wait,
 				struct sdma_txreq *txreq,
 				uint seq,
 				bool pkts_sent)
 {
 	struct hfi1_vnic_sdma *vnic_sdma =
-		container_of(wait, struct hfi1_vnic_sdma, wait);
-	struct hfi1_ibdev *dev = &vnic_sdma->dd->verbs_dev;
-	struct vnic_txreq *tx = container_of(txreq, struct vnic_txreq, txreq);
+		container_of(wait->iow, struct hfi1_vnic_sdma, wait);
 
-	if (sdma_progress(sde, seq, txreq))
-		if (tx->retry_count++ < HFI1_VNIC_SDMA_RETRY_COUNT)
-			return -EAGAIN;
+	write_seqlock(&sde->waitlock);
+	if (sdma_progress(sde, seq, txreq)) {
+		write_sequnlock(&sde->waitlock);
+		return -EAGAIN;
+	}
 
 	vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED;
-	write_seqlock(&dev->iowait_lock);
-	if (list_empty(&vnic_sdma->wait.list))
-		iowait_queue(pkts_sent, wait, &sde->dmawait);
-	write_sequnlock(&dev->iowait_lock);
+	if (list_empty(&vnic_sdma->wait.list)) {
+		iowait_get_priority(wait->iow);
+		iowait_queue(pkts_sent, wait->iow, &sde->dmawait);
+	}
+	write_sequnlock(&sde->waitlock);
 	return -EBUSY;
 }
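The reworked sleep path closes a race by re-running sdma_progress() inside sde->waitlock, the same seqlock the wakeup side takes, so descriptors cannot be freed between the check and the enqueue. The generic shape as a kernel-context sketch; every name here is hypothetical:

#include <linux/errno.h>
#include <linux/list.h>
#include <linux/seqlock.h>
#include <linux/types.h>

struct example_engine {
	seqlock_t waitlock;
	struct list_head dmawait;
};

struct example_waiter {
	struct list_head list;
};

/* Stand-in for sdma_progress(): did the ring move past 'seq'? */
static bool example_progress(struct example_engine *engine, unsigned int seq)
{
	return false;
}

static int example_sleep(struct example_engine *engine,
			 struct example_waiter *waiter, unsigned int seq)
{
	write_seqlock(&engine->waitlock);
	if (example_progress(engine, seq)) {
		write_sequnlock(&engine->waitlock);
		return -EAGAIN;		/* progress already made; retry */
	}
	if (list_empty(&waiter->list))
		list_add_tail(&waiter->list, &engine->dmawait);
	write_sequnlock(&engine->waitlock);
	return -EBUSY;			/* queued; the wakeup resubmits */
}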
 
@@ -285,8 +281,9 @@
 	for (i = 0; i < vinfo->num_tx_q; i++) {
 		struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[i];
 
-		iowait_init(&vnic_sdma->wait, 0, NULL, hfi1_vnic_sdma_sleep,
-			    hfi1_vnic_sdma_wakeup, NULL);
+		iowait_init(&vnic_sdma->wait, 0, NULL, NULL,
+			    hfi1_vnic_sdma_sleep,
+			    hfi1_vnic_sdma_wakeup, NULL, NULL);
 		vnic_sdma->sde = &vinfo->dd->per_sdma[i];
 		vnic_sdma->dd = vinfo->dd;
 		vnic_sdma->vinfo = vinfo;
@@ -295,10 +292,12 @@
 
 		/* Add a free descriptor watermark for wakeups */
 		if (vnic_sdma->sde->descq_cnt > HFI1_VNIC_SDMA_DESC_WTRMRK) {
+			struct iowait_work *work;
+
 			INIT_LIST_HEAD(&vnic_sdma->stx.list);
 			vnic_sdma->stx.num_desc = HFI1_VNIC_SDMA_DESC_WTRMRK;
-			list_add_tail(&vnic_sdma->stx.list,
-				      &vnic_sdma->wait.tx_head);
+			work = iowait_get_ib_work(&vnic_sdma->wait);
+			list_add_tail(&vnic_sdma->stx.list, &work->tx_head);
 		}
 	}
 }
diff --git a/drivers/infiniband/hw/hns/Kconfig b/drivers/infiniband/hw/hns/Kconfig
index fddb5fd..d602b69 100644
--- a/drivers/infiniband/hw/hns/Kconfig
+++ b/drivers/infiniband/hw/hns/Kconfig
@@ -1,5 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_HNS
-	tristate "HNS RoCE Driver"
+	bool "HNS RoCE Driver"
 	depends on NET_VENDOR_HISILICON
 	depends on ARM64 || (COMPILE_TEST && 64BIT)
 	---help---
@@ -7,9 +8,6 @@
 	  is used in Hisilicon Hip06 and more further ICT SoC based on
 	  platform device.
 
-	  To compile this driver as a module, choose M here: the module
-	  will be called hns-roce.
-
 config INFINIBAND_HNS_HIP06
 	tristate "Hisilicon Hip06 Family RoCE support"
 	depends on INFINIBAND_HNS && HNS && HNS_DSAF && HNS_ENET
@@ -17,15 +15,9 @@
 	  RoCE driver support for Hisilicon RoCE engine in Hisilicon Hip06 and
 	  Hip07 SoC. These RoCE engines are platform devices.
 
-	  To compile this driver as a module, choose M here: the module
-	  will be called hns-roce-hw-v1.
-
 config INFINIBAND_HNS_HIP08
 	tristate "Hisilicon Hip08 Family RoCE support"
 	depends on INFINIBAND_HNS && PCI && HNS3
 	---help---
 	  RoCE driver support for Hisilicon RoCE engine in Hisilicon Hip08 SoC.
 	  The RoCE engine is a PCI device.
-
-	  To compile this driver as a module, choose M here: the module
-	  will be called hns-roce-hw-v2.
diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile
index cf03404..449a2d8 100644
--- a/drivers/infiniband/hw/hns/Makefile
+++ b/drivers/infiniband/hw/hns/Makefile
@@ -1,14 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Makefile for the Hisilicon RoCE drivers.
 #
 
-ccflags-y :=  -Idrivers/net/ethernet/hisilicon/hns3
+ccflags-y :=  -I $(srctree)/drivers/net/ethernet/hisilicon/hns3
 
-obj-$(CONFIG_INFINIBAND_HNS) += hns-roce.o
 hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \
 	hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \
-	hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o
+	hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o
+
+hns-roce-hw-v1-objs := hns_roce_hw_v1.o $(hns-roce-objs)
 obj-$(CONFIG_INFINIBAND_HNS_HIP06) += hns-roce-hw-v1.o
-hns-roce-hw-v1-objs := hns_roce_hw_v1.o
+
+hns-roce-hw-v2-objs := hns_roce_hw_v2.o hns_roce_hw_v2_dfx.o $(hns-roce-objs)
 obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns-roce-hw-v2.o
-hns-roce-hw-v2-objs := hns_roce_hw_v2.o
diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c
index 0d96c5b..90e08c0 100644
--- a/drivers/infiniband/hw/hns/hns_roce_ah.c
+++ b/drivers/infiniband/hw/hns/hns_roce_ah.c
@@ -39,38 +39,37 @@
 #define HNS_ROCE_VLAN_SL_BIT_MASK	7
 #define HNS_ROCE_VLAN_SL_SHIFT		13
 
-struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd,
-				 struct rdma_ah_attr *ah_attr,
-				 struct ib_udata *udata)
+int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr,
+		       u32 flags, struct ib_udata *udata)
 {
-	struct hns_roce_dev *hr_dev = to_hr_dev(ibpd->device);
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibah->device);
 	const struct ib_gid_attr *gid_attr;
 	struct device *dev = hr_dev->dev;
-	struct hns_roce_ah *ah;
+	struct hns_roce_ah *ah = to_hr_ah(ibah);
 	u16 vlan_tag = 0xffff;
 	const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr);
+	bool vlan_en = false;
+	int ret;
 
-	ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
-	if (!ah)
-		return ERR_PTR(-ENOMEM);
+	gid_attr = ah_attr->grh.sgid_attr;
+	ret = rdma_read_gid_l2_fields(gid_attr, &vlan_tag, NULL);
+	if (ret)
+		return ret;
 
 	/* Get mac address */
 	memcpy(ah->av.mac, ah_attr->roce.dmac, ETH_ALEN);
 
-	gid_attr = ah_attr->grh.sgid_attr;
-	if (is_vlan_dev(gid_attr->ndev))
-		vlan_tag = vlan_dev_vlan_id(gid_attr->ndev);
-
-	if (vlan_tag < 0x1000)
+	if (vlan_tag < VLAN_CFI_MASK) {
+		vlan_en = true;
 		vlan_tag |= (rdma_ah_get_sl(ah_attr) &
 			     HNS_ROCE_VLAN_SL_BIT_MASK) <<
 			     HNS_ROCE_VLAN_SL_SHIFT;
+	}
 
-	ah->av.port_pd = cpu_to_be32(to_hr_pd(ibpd)->pdn |
-				     (rdma_ah_get_port_num(ah_attr) <<
-				     HNS_ROCE_PORT_NUM_SHIFT));
+	ah->av.port = rdma_ah_get_port_num(ah_attr);
 	ah->av.gid_index = grh->sgid_index;
-	ah->av.vlan = cpu_to_le16(vlan_tag);
+	ah->av.vlan = vlan_tag;
+	ah->av.vlan_en = vlan_en;
 	dev_dbg(dev, "gid_index = 0x%x,vlan = 0x%x\n", ah->av.gid_index,
 		ah->av.vlan);
 
@@ -78,10 +77,9 @@
 		ah->av.stat_rate = IB_RATE_10_GBPS;
 
 	memcpy(ah->av.dgid, grh->dgid.raw, HNS_ROCE_GID_SIZE);
-	ah->av.sl_tclass_flowlabel = cpu_to_le32(rdma_ah_get_sl(ah_attr) <<
-						 HNS_ROCE_SL_SHIFT);
+	ah->av.sl = rdma_ah_get_sl(ah_attr);
 
-	return &ah->ibah;
+	return 0;
 }
 
 int hns_roce_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr)
@@ -90,25 +88,17 @@
 
 	memset(ah_attr, 0, sizeof(*ah_attr));
 
-	rdma_ah_set_sl(ah_attr, (le32_to_cpu(ah->av.sl_tclass_flowlabel) >>
-				 HNS_ROCE_SL_SHIFT));
-	rdma_ah_set_port_num(ah_attr, (le32_to_cpu(ah->av.port_pd) >>
-				       HNS_ROCE_PORT_NUM_SHIFT));
+	rdma_ah_set_sl(ah_attr, ah->av.sl);
+	rdma_ah_set_port_num(ah_attr, ah->av.port);
 	rdma_ah_set_static_rate(ah_attr, ah->av.stat_rate);
-	rdma_ah_set_grh(ah_attr, NULL,
-			(le32_to_cpu(ah->av.sl_tclass_flowlabel) &
-			 HNS_ROCE_FLOW_LABEL_MASK), ah->av.gid_index,
-			ah->av.hop_limit,
-			(le32_to_cpu(ah->av.sl_tclass_flowlabel) >>
-			 HNS_ROCE_TCLASS_SHIFT));
+	rdma_ah_set_grh(ah_attr, NULL, ah->av.flowlabel,
+			ah->av.gid_index, ah->av.hop_limit, ah->av.tclass);
 	rdma_ah_set_dgid_raw(ah_attr, ah->av.dgid);
 
 	return 0;
 }
 
-int hns_roce_destroy_ah(struct ib_ah *ah)
+void hns_roce_destroy_ah(struct ib_ah *ah, u32 flags)
 {
-	kfree(to_hr_ah(ah));
-
-	return 0;
+	return;
 }
diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c
index 46f65f9..8c063c5 100644
--- a/drivers/infiniband/hw/hns/hns_roce_alloc.c
+++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c
@@ -34,6 +34,7 @@
 #include <linux/platform_device.h>
 #include <linux/vmalloc.h>
 #include "hns_roce_device.h"
+#include <rdma/ib_umem.h>
 
 int hns_roce_bitmap_alloc(struct hns_roce_bitmap *bitmap, unsigned long *obj)
 {
@@ -67,7 +68,6 @@
 {
 	hns_roce_bitmap_free_range(bitmap, obj, 1, rr);
 }
-EXPORT_SYMBOL_GPL(hns_roce_bitmap_free);
 
 int hns_roce_bitmap_alloc_range(struct hns_roce_bitmap *bitmap, int cnt,
 				int align, unsigned long *obj)
@@ -174,7 +174,6 @@
 		kfree(buf->page_list);
 	}
 }
-EXPORT_SYMBOL_GPL(hns_roce_buf_free);
 
 int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct,
 		       struct hns_roce_buf *buf, u32 page_shift)
@@ -197,8 +196,8 @@
 		buf->npages = 1 << order;
 		buf->page_shift = page_shift;
 		/* MTT PA must be recorded in 4k alignment, t is 4k aligned */
-		buf->direct.buf = dma_zalloc_coherent(dev,
-						      size, &t, GFP_KERNEL);
+		buf->direct.buf = dma_alloc_coherent(dev, size, &t,
+						     GFP_KERNEL);
 		if (!buf->direct.buf)
 			return -ENOMEM;
 
@@ -219,9 +218,10 @@
 			return -ENOMEM;
 
 		for (i = 0; i < buf->nbufs; ++i) {
-			buf->page_list[i].buf = dma_zalloc_coherent(dev,
-								  page_size, &t,
-								  GFP_KERNEL);
+			buf->page_list[i].buf = dma_alloc_coherent(dev,
+								   page_size,
+								   &t,
+								   GFP_KERNEL);
 
 			if (!buf->page_list[i].buf)
 				goto err_free;
@@ -237,8 +237,108 @@
 	return -ENOMEM;
 }
 
+int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs,
+			   int buf_cnt, int start, struct hns_roce_buf *buf)
+{
+	int i, end;
+	int total;
+
+	end = start + buf_cnt;
+	if (end > buf->npages) {
+		dev_err(hr_dev->dev,
+			"invalid kmem region, offset %d, buf_cnt %d, total %d!\n",
+			start, buf_cnt, buf->npages);
+		return -EINVAL;
+	}
+
+	total = 0;
+	for (i = start; i < end; i++)
+		if (buf->nbufs == 1)
+			bufs[total++] = buf->direct.map +
+					((dma_addr_t)i << buf->page_shift);
+		else
+			bufs[total++] = buf->page_list[i].map;
+
+	return total;
+}
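For the direct (nbufs == 1) case above, the i-th hardware page is plain arithmetic on the DMA base: map + (i << page_shift). Runnable:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t map = 0x80000000ULL;	/* pretend DMA base address */
	unsigned int page_shift = 12;	/* 4 KiB hardware pages */
	uint64_t i;

	for (i = 0; i < 3; i++)
		printf("page %llu -> 0x%llx\n", (unsigned long long)i,
		       (unsigned long long)(map + (i << page_shift)));
	/* 0x80000000, 0x80001000, 0x80002000 */
	return 0;
}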
+
+int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs,
+			   int buf_cnt, int start, struct ib_umem *umem,
+			   int page_shift)
+{
+	struct ib_block_iter biter;
+	int total = 0;
+	int idx = 0;
+	u64 addr;
+
+	if (page_shift < PAGE_SHIFT) {
+		dev_err(hr_dev->dev, "invalid page shift %d!\n", page_shift);
+		return -EINVAL;
+	}
+
+	/* convert system page cnt to hw page cnt */
+	rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap,
+			    1 << page_shift) {
+		addr = rdma_block_iter_dma_address(&biter);
+		if (idx >= start) {
+			bufs[total++] = addr;
+			if (total >= buf_cnt)
+				goto done;
+		}
+		idx++;
+	}
+
+done:
+	return total;
+}
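hns_roce_get_umem_bufs() walks the umem in DMA blocks of 1 << page_shift bytes via rdma_for_each_block(), skipping the first 'start' blocks and stopping once buf_cnt addresses are collected. The same skip-then-collect logic over a plain array, runnable:

#include <assert.h>
#include <stdint.h>

static int collect(const uint64_t *blocks, int nblocks,
		   uint64_t *bufs, int buf_cnt, int start)
{
	int idx, total = 0;

	for (idx = 0; idx < nblocks; idx++) {
		if (idx < start)
			continue;
		bufs[total++] = blocks[idx];
		if (total >= buf_cnt)
			break;
	}
	return total;
}

int main(void)
{
	uint64_t blocks[] = { 0x1000, 0x2000, 0x3000, 0x4000 };
	uint64_t bufs[2];

	assert(collect(blocks, 4, bufs, 2, 1) == 2);
	assert(bufs[0] == 0x2000 && bufs[1] == 0x3000);
	return 0;
}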
+
+void hns_roce_init_buf_region(struct hns_roce_buf_region *region, int hopnum,
+			      int offset, int buf_cnt)
+{
+	if (hopnum == HNS_ROCE_HOP_NUM_0)
+		region->hopnum = 0;
+	else
+		region->hopnum = hopnum;
+
+	region->offset = offset;
+	region->count = buf_cnt;
+}
+
+void hns_roce_free_buf_list(dma_addr_t **bufs, int region_cnt)
+{
+	int i;
+
+	for (i = 0; i < region_cnt; i++) {
+		kfree(bufs[i]);
+		bufs[i] = NULL;
+	}
+}
+
+int hns_roce_alloc_buf_list(struct hns_roce_buf_region *regions,
+			    dma_addr_t **bufs, int region_cnt)
+{
+	struct hns_roce_buf_region *r;
+	int i;
+
+	for (i = 0; i < region_cnt; i++) {
+		r = &regions[i];
+		bufs[i] = kcalloc(r->count, sizeof(dma_addr_t), GFP_KERNEL);
+		if (!bufs[i])
+			goto err_alloc;
+	}
+
+	return 0;
+
+err_alloc:
+	hns_roce_free_buf_list(bufs, i);
+
+	return -ENOMEM;
+}
+
 void hns_roce_cleanup_bitmap(struct hns_roce_dev *hr_dev)
 {
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ)
+		hns_roce_cleanup_srq_table(hr_dev);
 	hns_roce_cleanup_qp_table(hr_dev);
 	hns_roce_cleanup_cq_table(hr_dev);
 	hns_roce_cleanup_mr_table(hr_dev);
diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.c b/drivers/infiniband/hw/hns/hns_roce_cmd.c
index a0ba19d..455d533 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cmd.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cmd.c
@@ -103,7 +103,6 @@
 	context->out_param = out_param;
 	complete(&context->done);
 }
-EXPORT_SYMBOL_GPL(hns_roce_cmd_event);
 
 /* this should be called with "use_events" */
 static int __hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param,
@@ -162,7 +161,7 @@
 				  u64 out_param, unsigned long in_modifier,
 				  u8 op_modifier, u16 op, unsigned long timeout)
 {
-	int ret = 0;
+	int ret;
 
 	down(&hr_dev->cmd.event_sem);
 	ret = __hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param,
@@ -176,19 +175,34 @@
 		      unsigned long in_modifier, u8 op_modifier, u16 op,
 		      unsigned long timeout)
 {
-	if (hr_dev->is_reset)
-		return 0;
+	int ret;
+
+	if (hr_dev->hw->rst_prc_mbox) {
+		ret = hr_dev->hw->rst_prc_mbox(hr_dev);
+		if (ret == CMD_RST_PRC_SUCCESS)
+			return 0;
+		else if (ret == CMD_RST_PRC_EBUSY)
+			return -EBUSY;
+	}
 
 	if (hr_dev->cmd.use_events)
-		return hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param,
-					      in_modifier, op_modifier, op,
-					      timeout);
+		ret = hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param,
+					     in_modifier, op_modifier, op,
+					     timeout);
 	else
-		return hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param,
-					      in_modifier, op_modifier, op,
-					      timeout);
+		ret = hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param,
+					     in_modifier, op_modifier, op,
+					     timeout);
+
+	if (ret == CMD_RST_PRC_EBUSY)
+		return -EBUSY;
+
+	if (ret && (hr_dev->hw->rst_prc_mbox &&
+		    hr_dev->hw->rst_prc_mbox(hr_dev) == CMD_RST_PRC_SUCCESS))
+		return 0;
+
+	return ret;
 }
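The reworked mailbox consults the reset state twice: up front, to short-circuit commands issued while a function-level reset is in flight, and again after a failure, to treat a reset-induced failure as success rather than a genuine error. A sketch of that decision flow; all names and enum values here are illustrative stand-ins:

#include <linux/errno.h>

enum example_rst { RST_OTHERS, RST_SUCCESS, RST_EBUSY };

struct example_dev { int resetting; };

static enum example_rst example_reset_state(struct example_dev *dev)
{
	return dev->resetting ? RST_SUCCESS : RST_OTHERS;
}

static int example_issue_mailbox(struct example_dev *dev)
{
	return 0;	/* stand-in for the poll/event mailbox paths */
}

static int example_mbox(struct example_dev *dev)
{
	enum example_rst st = example_reset_state(dev);
	int ret;

	if (st == RST_SUCCESS)
		return 0;	/* resetting: report success */
	if (st == RST_EBUSY)
		return -EBUSY;	/* resetting: ask the caller to retry */

	ret = example_issue_mailbox(dev);
	if (ret && example_reset_state(dev) == RST_SUCCESS)
		return 0;	/* the reset caused the failure */
	return ret;
}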
-EXPORT_SYMBOL_GPL(hns_roce_cmd_mbox);
 
 int hns_roce_cmd_init(struct hns_roce_dev *hr_dev)
 {
@@ -197,7 +211,6 @@
 	mutex_init(&hr_dev->cmd.hcr_mutex);
 	sema_init(&hr_dev->cmd.poll_sem, 1);
 	hr_dev->cmd.use_events = 0;
-	hr_dev->cmd.toggle = 1;
 	hr_dev->cmd.max_cmds = CMD_MAX_NUM;
 	hr_dev->cmd.pool = dma_pool_create("hns_roce_cmd", dev,
 					   HNS_ROCE_MAILBOX_SIZE,
@@ -238,23 +251,15 @@
 	hr_cmd->token_mask = CMD_TOKEN_MASK;
 	hr_cmd->use_events = 1;
 
-	down(&hr_cmd->poll_sem);
-
 	return 0;
 }
 
 void hns_roce_cmd_use_polling(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_cmdq *hr_cmd = &hr_dev->cmd;
-	int i;
-
-	hr_cmd->use_events = 0;
-
-	for (i = 0; i < hr_cmd->max_cmds; ++i)
-		down(&hr_cmd->event_sem);
 
 	kfree(hr_cmd->context);
-	up(&hr_cmd->poll_sem);
+	hr_cmd->use_events = 0;
 }
 
 struct hns_roce_cmd_mailbox
@@ -275,7 +280,6 @@
 
 	return mailbox;
 }
-EXPORT_SYMBOL_GPL(hns_roce_alloc_cmd_mailbox);
 
 void hns_roce_free_cmd_mailbox(struct hns_roce_dev *hr_dev,
 			       struct hns_roce_cmd_mailbox *mailbox)
@@ -286,4 +290,3 @@
 	dma_pool_free(hr_dev->cmd.pool, mailbox->buf, mailbox->dma);
 	kfree(mailbox);
 }
-EXPORT_SYMBOL_GPL(hns_roce_free_cmd_mailbox);
diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.h b/drivers/infiniband/hw/hns/hns_roce_cmd.h
index 9549ae5..2b6ac64 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cmd.h
+++ b/drivers/infiniband/hw/hns/hns_roce_cmd.h
@@ -53,6 +53,7 @@
 	HNS_ROCE_CMD_QUERY_QPC		= 0x42,
 
 	HNS_ROCE_CMD_MODIFY_CQC		= 0x52,
+	HNS_ROCE_CMD_QUERY_CQC		= 0x53,
 	/* CQC BT commands */
 	HNS_ROCE_CMD_WRITE_CQC_BT0	= 0x10,
 	HNS_ROCE_CMD_WRITE_CQC_BT1	= 0x11,
@@ -75,6 +76,10 @@
 	HNS_ROCE_CMD_DESTROY_MPT_BT1	= 0x29,
 	HNS_ROCE_CMD_DESTROY_MPT_BT2	= 0x2a,
 
+	/* CQC TIMER commands */
+	HNS_ROCE_CMD_WRITE_CQC_TIMER_BT0 = 0x23,
+	HNS_ROCE_CMD_READ_CQC_TIMER_BT0  = 0x27,
+
 	/* MPT commands */
 	HNS_ROCE_CMD_QUERY_MPT		= 0x62,
 
@@ -89,6 +94,10 @@
 	HNS_ROCE_CMD_DESTROY_SRQC_BT1	= 0x39,
 	HNS_ROCE_CMD_DESTROY_SRQC_BT2	= 0x3a,
 
+	/* QPC TIMER commands */
+	HNS_ROCE_CMD_WRITE_QPC_TIMER_BT0 = 0x33,
+	HNS_ROCE_CMD_READ_QPC_TIMER_BT0  = 0x37,
+
 	/* EQC commands */
 	HNS_ROCE_CMD_CREATE_AEQC	= 0x80,
 	HNS_ROCE_CMD_MODIFY_AEQC	= 0x81,
@@ -98,6 +107,10 @@
 	HNS_ROCE_CMD_MODIFY_CEQC	= 0x91,
 	HNS_ROCE_CMD_QUERY_CEQC		= 0x92,
 	HNS_ROCE_CMD_DESTROY_CEQC	= 0x93,
+
+	/* SCC CTX BT commands */
+	HNS_ROCE_CMD_READ_SCCC_BT0	= 0xa4,
+	HNS_ROCE_CMD_WRITE_SCCC_BT0	= 0xa5,
 };
 
 enum {
@@ -120,6 +133,10 @@
 	HNS_ROCE_CMD_SQD2RTS_QP		= 0x20,
 	HNS_ROCE_CMD_2RST_QP		= 0x21,
 	HNS_ROCE_CMD_QUERY_QP		= 0x22,
+	HNS_ROCE_CMD_SW2HW_SRQ		= 0x70,
+	HNS_ROCE_CMD_MODIFY_SRQC	= 0x72,
+	HNS_ROCE_CMD_QUERY_SRQC		= 0x73,
+	HNS_ROCE_CMD_HW2SW_SRQ		= 0x74,
 };
 
 int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param,
diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h
index 93d4b4e..8e95a1a 100644
--- a/drivers/infiniband/hw/hns/hns_roce_common.h
+++ b/drivers/infiniband/hw/hns/hns_roce_common.h
@@ -57,32 +57,6 @@
 #define roce_set_bit(origin, shift, val) \
 	roce_set_field((origin), (1ul << (shift)), (shift), (val))
 
-/*
- * roce_hw_index_cmp_lt - Compare two hardware index values in hisilicon
- *                        SOC, check if a is less than b.
- * @a: hardware index value
- * @b: hardware index value
- * @bits: the number of bits of a and b, range: 0~31.
- *
- * Hardware index increases continuously till max value, and then restart
- * from zero, again and again. Because the bits of reg field is often
- * limited, the reg field can only hold the low bits of the hardware index
- * in hisilicon SOC.
- * In some scenes we need to compare two values(a,b) getted from two reg
- * fields in this driver, for example:
- * If a equals 0xfffe, b equals 0x1 and bits equals 16, we think b has
- * incresed from 0xffff to 0x1 and a is less than b.
- * If a equals 0xfffe, b equals 0x0xf001 and bits equals 16, we think a
- * is bigger than b.
- *
- * Return true on a less than b, otherwise false.
- */
-#define roce_hw_index_mask(bits)	((1ul << (bits)) - 1)
-#define roce_hw_index_shift(bits)	(32 - (bits))
-#define roce_hw_index_cmp_lt(a, b, bits) \
-	((int)((((a) - (b)) & roce_hw_index_mask(bits)) << \
-		roce_hw_index_shift(bits)) < 0)
-
 #define ROCEE_GLB_CFG_ROCEE_DB_SQ_MODE_S 3
 #define ROCEE_GLB_CFG_ROCEE_DB_OTH_MODE_S 4
 
@@ -271,8 +245,6 @@
 #define ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M   \
 	(((1UL << 28) - 1) << ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S)
 
-#define ROCEE_SDB_PTR_CMP_BITS 28
-
 #define ROCEE_SDB_INV_CNT_SDB_INV_CNT_S 0
 #define ROCEE_SDB_INV_CNT_SDB_INV_CNT_M   \
 	(((1UL << 16) - 1) << ROCEE_SDB_INV_CNT_SDB_INV_CNT_S)
@@ -353,13 +325,8 @@
 #define ROCEE_CAEP_AE_MASK_REG			0x6C8
 #define ROCEE_CAEP_AE_ST_REG			0x6CC
 
-#define ROCEE_SDB_ISSUE_PTR_REG			0x758
-#define ROCEE_SDB_SEND_PTR_REG			0x75C
 #define ROCEE_CAEP_CQE_WCMD_EMPTY		0x850
 #define ROCEE_SCAEP_WR_CQE_CNT			0x8D0
-#define ROCEE_SDB_INV_CNT_REG			0x9A4
-#define ROCEE_SDB_RETRY_CNT_REG			0x9AC
-#define ROCEE_TSP_BP_ST_REG			0x9EC
 #define ROCEE_ECC_UCERR_ALM0_REG		0xB34
 #define ROCEE_ECC_CERR_ALM0_REG			0xB40
 
@@ -376,9 +343,6 @@
 #define ROCEE_RX_CMQ_TAIL_REG			0x07024
 #define ROCEE_RX_CMQ_HEAD_REG			0x07028
 
-#define ROCEE_VF_MB_CFG0_REG			0x40
-#define ROCEE_VF_MB_STATUS_REG			0x58
-
 #define ROCEE_VF_EQ_DB_CFG0_REG			0x238
 #define ROCEE_VF_EQ_DB_CFG1_REG			0x23C
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index 3a485f5..22541d1 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -32,6 +32,7 @@
 
 #include <linux/platform_device.h>
 #include <rdma/ib_umem.h>
+#include <rdma/uverbs_ioctl.h>
 #include "hns_roce_device.h"
 #include "hns_roce_cmd.h"
 #include "hns_roce_hem.h"
@@ -82,7 +83,6 @@
 
 static int hns_roce_cq_alloc(struct hns_roce_dev *hr_dev, int nent,
 			     struct hns_roce_mtt *hr_mtt,
-			     struct hns_roce_uar *hr_uar,
 			     struct hns_roce_cq *hr_cq, int vector)
 {
 	struct hns_roce_cmd_mailbox *mailbox;
@@ -127,13 +127,9 @@
 		goto err_out;
 	}
 
-	/* The cq insert radix tree */
-	spin_lock_irq(&cq_table->lock);
-	/* Radix_tree: The associated pointer and long integer key value like */
-	ret = radix_tree_insert(&cq_table->tree, hr_cq->cqn, hr_cq);
-	spin_unlock_irq(&cq_table->lock);
+	ret = xa_err(xa_store(&cq_table->array, hr_cq->cqn, hr_cq, GFP_KERNEL));
 	if (ret) {
-		dev_err(dev, "CQ alloc.Failed to radix_tree_insert.\n");
+		dev_err(dev, "CQ alloc failed in xa_store.\n");
 		goto err_put;
 	}
 
@@ -141,7 +137,7 @@
 	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
 	if (IS_ERR(mailbox)) {
 		ret = PTR_ERR(mailbox);
-		goto err_radix;
+		goto err_xa;
 	}
 
 	hr_dev->hw->write_cqc(hr_dev, hr_cq, mailbox->buf, mtts, dma_handle,
@@ -152,22 +148,19 @@
 	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
 	if (ret) {
 		dev_err(dev, "CQ alloc.Failed to cmd mailbox.\n");
-		goto err_radix;
+		goto err_xa;
 	}
 
 	hr_cq->cons_index = 0;
 	hr_cq->arm_sn = 1;
-	hr_cq->uar = hr_uar;
 
 	atomic_set(&hr_cq->refcount, 1);
 	init_completion(&hr_cq->free);
 
 	return 0;
 
-err_radix:
-	spin_lock_irq(&cq_table->lock);
-	radix_tree_delete(&cq_table->tree, hr_cq->cqn);
-	spin_unlock_irq(&cq_table->lock);
+err_xa:
+	xa_erase(&cq_table->array, hr_cq->cqn);
 
 err_put:
 	hns_roce_table_put(hr_dev, &cq_table->table, hr_cq->cqn);
@@ -197,6 +190,8 @@
 		dev_err(dev, "HW2SW_CQ failed (%d) for CQN %06lx\n", ret,
 			hr_cq->cqn);
 
+	xa_erase(&cq_table->array, hr_cq->cqn);
+
 	/* Waiting interrupt process procedure carried out */
 	synchronize_irq(hr_dev->eq_table.eq[hr_cq->vector].irq);
 
@@ -205,17 +200,12 @@
 		complete(&hr_cq->free);
 	wait_for_completion(&hr_cq->free);
 
-	spin_lock_irq(&cq_table->lock);
-	radix_tree_delete(&cq_table->tree, hr_cq->cqn);
-	spin_unlock_irq(&cq_table->lock);
-
 	hns_roce_table_put(hr_dev, &cq_table->table, hr_cq->cqn);
 	hns_roce_bitmap_free(&cq_table->bitmap, hr_cq->cqn, BITMAP_NO_RR);
 }
-EXPORT_SYMBOL_GPL(hns_roce_free_cq);
 
 static int hns_roce_ib_get_cq_umem(struct hns_roce_dev *hr_dev,
-				   struct ib_ucontext *context,
+				   struct ib_udata *udata,
 				   struct hns_roce_cq_buf *buf,
 				   struct ib_umem **umem, u64 buf_addr, int cqe)
 {
@@ -223,7 +213,7 @@
 	u32 page_shift;
 	u32 npages;
 
-	*umem = ib_umem_get(context, buf_addr, cqe * hr_dev->caps.cq_entry_sz,
+	*umem = ib_umem_get(udata, buf_addr, cqe * hr_dev->caps.cq_entry_sz,
 			    IB_ACCESS_LOCAL_WRITE, 1);
 	if (IS_ERR(*umem))
 		return PTR_ERR(*umem);
@@ -242,8 +232,7 @@
 					&buf->hr_mtt);
 	} else {
 		ret = hns_roce_mtt_init(hr_dev, ib_umem_page_count(*umem),
-				(*umem)->page_shift,
-				&buf->hr_mtt);
+					PAGE_SHIFT, &buf->hr_mtt);
 	}
 	if (ret)
 		goto err_buf;
@@ -307,17 +296,124 @@
 			  &buf->hr_buf);
 }
 
-struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
-				    const struct ib_cq_init_attr *attr,
-				    struct ib_ucontext *context,
-				    struct ib_udata *udata)
+static int create_user_cq(struct hns_roce_dev *hr_dev,
+			  struct hns_roce_cq *hr_cq,
+			  struct ib_udata *udata,
+			  struct hns_roce_ib_create_cq_resp *resp,
+			  int cq_entries)
 {
-	struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
-	struct device *dev = hr_dev->dev;
 	struct hns_roce_ib_create_cq ucmd;
+	struct device *dev = hr_dev->dev;
+	int ret;
+	struct hns_roce_ucontext *context = rdma_udata_to_drv_context(
+				   udata, struct hns_roce_ucontext, ibucontext);
+
+	if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
+		dev_err(dev, "Failed to copy_from_udata.\n");
+		return -EFAULT;
+	}
+
+	/* Get user space address, write it into mtt table */
+	ret = hns_roce_ib_get_cq_umem(hr_dev, udata, &hr_cq->hr_buf,
+				      &hr_cq->umem, ucmd.buf_addr,
+				      cq_entries);
+	if (ret) {
+		dev_err(dev, "Failed to get_cq_umem.\n");
+		return ret;
+	}
+
+	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
+	    (udata->outlen >= sizeof(*resp))) {
+		ret = hns_roce_db_map_user(context, udata, ucmd.db_addr,
+					   &hr_cq->db);
+		if (ret) {
+			dev_err(dev, "cq record doorbell map failed!\n");
+			goto err_mtt;
+		}
+		hr_cq->db_en = 1;
+		resp->cap_flags |= HNS_ROCE_SUPPORT_CQ_RECORD_DB;
+	}
+
+	return 0;
+
+err_mtt:
+	hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
+	ib_umem_release(hr_cq->umem);
+
+	return ret;
+}
+
+static int create_kernel_cq(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_cq *hr_cq, int cq_entries)
+{
+	struct device *dev = hr_dev->dev;
+	struct hns_roce_uar *uar;
+	int ret;
+
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) {
+		ret = hns_roce_alloc_db(hr_dev, &hr_cq->db, 1);
+		if (ret)
+			return ret;
+
+		hr_cq->set_ci_db = hr_cq->db.db_record;
+		*hr_cq->set_ci_db = 0;
+		hr_cq->db_en = 1;
+	}
+
+	/* Init mtt table and write buffer address to mtt table */
+	ret = hns_roce_ib_alloc_cq_buf(hr_dev, &hr_cq->hr_buf, cq_entries);
+	if (ret) {
+		dev_err(dev, "Failed to alloc_cq_buf.\n");
+		goto err_db;
+	}
+
+	uar = &hr_dev->priv_uar;
+	hr_cq->cq_db_l = hr_dev->reg_base + hr_dev->odb_offset +
+			 DB_REG_OFFSET * uar->index;
+
+	return 0;
+
+err_db:
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)
+		hns_roce_free_db(hr_dev, &hr_cq->db);
+
+	return ret;
+}
+
+static void destroy_user_cq(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_cq *hr_cq,
+			    struct ib_udata *udata,
+			    struct hns_roce_ib_create_cq_resp *resp)
+{
+	struct hns_roce_ucontext *context = rdma_udata_to_drv_context(
+				   udata, struct hns_roce_ucontext, ibucontext);
+
+	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
+	    (udata->outlen >= sizeof(*resp)))
+		hns_roce_db_unmap_user(context, &hr_cq->db);
+
+	hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
+	ib_umem_release(hr_cq->umem);
+}
+
+static void destroy_kernel_cq(struct hns_roce_dev *hr_dev,
+			      struct hns_roce_cq *hr_cq)
+{
+	hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
+	hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf, hr_cq->ib_cq.cqe);
+
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)
+		hns_roce_free_db(hr_dev, &hr_cq->db);
+}
+
+int hns_roce_ib_create_cq(struct ib_cq *ib_cq,
+			  const struct ib_cq_init_attr *attr,
+			  struct ib_udata *udata)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
+	struct device *dev = hr_dev->dev;
 	struct hns_roce_ib_create_cq_resp resp = {};
-	struct hns_roce_cq *hr_cq = NULL;
-	struct hns_roce_uar *uar = NULL;
+	struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
 	int vector = attr->comp_vector;
 	int cq_entries = attr->cqe;
 	int ret;
@@ -325,13 +421,9 @@
 	if (cq_entries < 1 || cq_entries > hr_dev->caps.max_cqes) {
 		dev_err(dev, "Creat CQ failed. entries=%d, max=%d\n",
 			cq_entries, hr_dev->caps.max_cqes);
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
-	hr_cq = kzalloc(sizeof(*hr_cq), GFP_KERNEL);
-	if (!hr_cq)
-		return ERR_PTR(-ENOMEM);
-
 	if (hr_dev->caps.min_cqes)
 		cq_entries = max(cq_entries, hr_dev->caps.min_cqes);
 
@@ -339,62 +431,22 @@
 	hr_cq->ib_cq.cqe = cq_entries - 1;
 	spin_lock_init(&hr_cq->lock);
 
-	if (context) {
-		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
-			dev_err(dev, "Failed to copy_from_udata.\n");
-			ret = -EFAULT;
-			goto err_cq;
-		}
-
-		/* Get user space address, write it into mtt table */
-		ret = hns_roce_ib_get_cq_umem(hr_dev, context, &hr_cq->hr_buf,
-					      &hr_cq->umem, ucmd.buf_addr,
-					      cq_entries);
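+	/* user and kernel CQs differ in where the CQ buffer and the
+	 * doorbell record live, so the two setup paths split here
+	 */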
+	if (udata) {
+		ret = create_user_cq(hr_dev, hr_cq, udata, &resp, cq_entries);
 		if (ret) {
-			dev_err(dev, "Failed to get_cq_umem.\n");
+			dev_err(dev, "Create cq failed in user mode!\n");
 			goto err_cq;
 		}
-
-		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
-		    (udata->outlen >= sizeof(resp))) {
-			ret = hns_roce_db_map_user(to_hr_ucontext(context),
-						   ucmd.db_addr, &hr_cq->db);
-			if (ret) {
-				dev_err(dev, "cq record doorbell map failed!\n");
-				goto err_mtt;
-			}
-			hr_cq->db_en = 1;
-			resp.cap_flags |= HNS_ROCE_SUPPORT_CQ_RECORD_DB;
-		}
-
-		/* Get user space parameters */
-		uar = &to_hr_ucontext(context)->uar;
 	} else {
-		if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) {
-			ret = hns_roce_alloc_db(hr_dev, &hr_cq->db, 1);
-			if (ret)
-				goto err_cq;
-
-			hr_cq->set_ci_db = hr_cq->db.db_record;
-			*hr_cq->set_ci_db = 0;
-			hr_cq->db_en = 1;
-		}
-
-		/* Init mmt table and write buff address to mtt table */
-		ret = hns_roce_ib_alloc_cq_buf(hr_dev, &hr_cq->hr_buf,
-					       cq_entries);
+		ret = create_kernel_cq(hr_dev, hr_cq, cq_entries);
 		if (ret) {
-			dev_err(dev, "Failed to alloc_cq_buf.\n");
-			goto err_db;
+			dev_err(dev, "Create cq failed in kernel mode!\n");
+			goto err_cq;
 		}
-
-		uar = &hr_dev->priv_uar;
-		hr_cq->cq_db_l = hr_dev->reg_base + hr_dev->odb_offset +
-				DB_REG_OFFSET * uar->index;
 	}
 
 	/* Allocate cq index, fill cq_context */
-	ret = hns_roce_cq_alloc(hr_dev, cq_entries, &hr_cq->hr_buf.hr_mtt, uar,
+	ret = hns_roce_cq_alloc(hr_dev, cq_entries, &hr_cq->hr_buf.hr_mtt,
 				hr_cq, vector);
 	if (ret) {
 		dev_err(dev, "Creat CQ .Failed to cq_alloc.\n");
@@ -407,7 +459,7 @@
 	 * problems if tptr is set to zero here, so we initialize it in user
 	 * space.
 	 */
-	if (!context && hr_cq->tptr_addr)
+	if (!udata && hr_cq->tptr_addr)
 		*hr_cq->tptr_addr = 0;
 
 	/* Get created cq handler and carry out event */
@@ -415,83 +467,63 @@
 	hr_cq->event = hns_roce_ib_cq_event;
 	hr_cq->cq_depth = cq_entries;
 
-	if (context) {
+	if (udata) {
 		resp.cqn = hr_cq->cqn;
 		ret = ib_copy_to_udata(udata, &resp, sizeof(resp));
 		if (ret)
 			goto err_cqc;
 	}
 
-	return &hr_cq->ib_cq;
+	return 0;
 
 err_cqc:
 	hns_roce_free_cq(hr_dev, hr_cq);
 
 err_dbmap:
-	if (context && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
-	    (udata->outlen >= sizeof(resp)))
-		hns_roce_db_unmap_user(to_hr_ucontext(context),
-				       &hr_cq->db);
-
-err_mtt:
-	hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
-	if (context)
-		ib_umem_release(hr_cq->umem);
+	if (udata)
+		destroy_user_cq(hr_dev, hr_cq, udata, &resp);
 	else
-		hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf,
-					hr_cq->ib_cq.cqe);
-
-err_db:
-	if (!context && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB))
-		hns_roce_free_db(hr_dev, &hr_cq->db);
+		destroy_kernel_cq(hr_dev, hr_cq);
 
 err_cq:
-	kfree(hr_cq);
-	return ERR_PTR(ret);
+	return ret;
 }
-EXPORT_SYMBOL_GPL(hns_roce_ib_create_cq);
 
-int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq)
+void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
 	struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
-	int ret = 0;
 
 	if (hr_dev->hw->destroy_cq) {
-		ret = hr_dev->hw->destroy_cq(ib_cq);
-	} else {
-		hns_roce_free_cq(hr_dev, hr_cq);
-		hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
-
-		if (ib_cq->uobject) {
-			ib_umem_release(hr_cq->umem);
-
-			if (hr_cq->db_en == 1)
-				hns_roce_db_unmap_user(
-					to_hr_ucontext(ib_cq->uobject->context),
-					&hr_cq->db);
-		} else {
-			/* Free the buff of stored cq */
-			hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf,
-						ib_cq->cqe);
-			if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)
-				hns_roce_free_db(hr_dev, &hr_cq->db);
-		}
-
-		kfree(hr_cq);
+		hr_dev->hw->destroy_cq(ib_cq, udata);
+		return;
 	}
 
-	return ret;
+	hns_roce_free_cq(hr_dev, hr_cq);
+	hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
+
+	ib_umem_release(hr_cq->umem);
+	if (udata) {
+		if (hr_cq->db_en == 1)
+			hns_roce_db_unmap_user(rdma_udata_to_drv_context(
+						       udata,
+						       struct hns_roce_ucontext,
+						       ibucontext),
+					       &hr_cq->db);
+	} else {
+		/* Free the buff of stored cq */
+		hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf, ib_cq->cqe);
+		if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)
+			hns_roce_free_db(hr_dev, &hr_cq->db);
+	}
 }
-EXPORT_SYMBOL_GPL(hns_roce_ib_destroy_cq);
 
 void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn)
 {
 	struct device *dev = hr_dev->dev;
 	struct hns_roce_cq *cq;
 
-	cq = radix_tree_lookup(&hr_dev->cq_table.tree,
-			       cqn & (hr_dev->caps.num_cqs - 1));
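+	/* the low bits of the cqn index the cq xarray; xa_load needs no
+	 * external locking
+	 */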
+	cq = xa_load(&hr_dev->cq_table.array, cqn & (hr_dev->caps.num_cqs - 1));
 	if (!cq) {
 		dev_warn(dev, "Completion event for bogus CQ 0x%08x\n", cqn);
 		return;
@@ -500,7 +532,6 @@
 	++cq->arm_sn;
 	cq->comp(cq);
 }
-EXPORT_SYMBOL_GPL(hns_roce_cq_completion);
 
 void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type)
 {
@@ -508,8 +539,7 @@
 	struct device *dev = hr_dev->dev;
 	struct hns_roce_cq *cq;
 
-	cq = radix_tree_lookup(&cq_table->tree,
-			       cqn & (hr_dev->caps.num_cqs - 1));
+	cq = xa_load(&cq_table->array, cqn & (hr_dev->caps.num_cqs - 1));
 	if (cq)
 		atomic_inc(&cq->refcount);
 
@@ -523,14 +553,12 @@
 	if (atomic_dec_and_test(&cq->refcount))
 		complete(&cq->free);
 }
-EXPORT_SYMBOL_GPL(hns_roce_cq_event);
 
 int hns_roce_init_cq_table(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_cq_table *cq_table = &hr_dev->cq_table;
 
-	spin_lock_init(&cq_table->lock);
-	INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC);
+	xa_init(&cq_table->array);
 
 	return hns_roce_bitmap_init(&cq_table->bitmap, hr_dev->caps.num_cqs,
 				    hr_dev->caps.num_cqs - 1,
diff --git a/drivers/infiniband/hw/hns/hns_roce_db.c b/drivers/infiniband/hw/hns/hns_roce_db.c
index e2f93c1..c00714c 100644
--- a/drivers/infiniband/hw/hns/hns_roce_db.c
+++ b/drivers/infiniband/hw/hns/hns_roce_db.c
@@ -8,16 +8,19 @@
 #include <rdma/ib_umem.h>
 #include "hns_roce_device.h"
 
-int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt,
+int hns_roce_db_map_user(struct hns_roce_ucontext *context,
+			 struct ib_udata *udata, unsigned long virt,
 			 struct hns_roce_db *db)
 {
+	unsigned long page_addr = virt & PAGE_MASK;
 	struct hns_roce_user_db_page *page;
+	unsigned int offset;
 	int ret = 0;
 
 	mutex_lock(&context->page_mutex);
 
 	list_for_each_entry(page, &context->page_list, list)
-		if (page->user_virt == (virt & PAGE_MASK))
+		if (page->user_virt == page_addr)
 			goto found;
 
 	page = kmalloc(sizeof(*page), GFP_KERNEL);
@@ -27,9 +30,8 @@
 	}
 
 	refcount_set(&page->refcount, 1);
-	page->user_virt = (virt & PAGE_MASK);
-	page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK,
-				 PAGE_SIZE, 0, 0);
+	page->user_virt = page_addr;
+	page->umem = ib_umem_get(udata, page_addr, PAGE_SIZE, 0, 0);
 	if (IS_ERR(page->umem)) {
 		ret = PTR_ERR(page->umem);
 		kfree(page);
@@ -39,10 +41,9 @@
 	list_add(&page->list, &context->page_list);
 
 found:
-	db->dma = sg_dma_address(page->umem->sg_head.sgl) +
-		  (virt & ~PAGE_MASK);
-	page->umem->sg_head.sgl->offset = virt & ~PAGE_MASK;
-	db->virt_addr = sg_virt(page->umem->sg_head.sgl);
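+	/* the doorbell may sit anywhere inside the pinned page, so apply
+	 * the in-page offset to both the dma and cpu addresses
+	 */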
+	offset = virt - page_addr;
+	db->dma = sg_dma_address(page->umem->sg_head.sgl) + offset;
+	db->virt_addr = sg_virt(page->umem->sg_head.sgl) + offset;
 	db->u.user_page = page;
 	refcount_inc(&page->refcount);
 
@@ -51,7 +52,6 @@
 
 	return ret;
 }
-EXPORT_SYMBOL(hns_roce_db_map_user);
 
 void hns_roce_db_unmap_user(struct hns_roce_ucontext *context,
 			    struct hns_roce_db *db)
@@ -67,7 +67,6 @@
 
 	mutex_unlock(&context->page_mutex);
 }
-EXPORT_SYMBOL(hns_roce_db_unmap_user);
 
 static struct hns_roce_db_pgdir *hns_roce_alloc_db_pgdir(
 					struct device *dma_device)
@@ -78,7 +77,8 @@
 	if (!pgdir)
 		return NULL;
 
-	bitmap_fill(pgdir->order1, HNS_ROCE_DB_PER_PAGE / 2);
+	bitmap_fill(pgdir->order1,
+		    HNS_ROCE_DB_PER_PAGE / HNS_ROCE_DB_TYPE_COUNT);
 	pgdir->bits[0] = pgdir->order0;
 	pgdir->bits[1] = pgdir->order1;
 	pgdir->page = dma_alloc_coherent(dma_device, PAGE_SIZE,
@@ -116,7 +116,7 @@
 	db->u.pgdir	= pgdir;
 	db->index	= i;
 	db->db_record	= pgdir->page + db->index;
-	db->dma		= pgdir->db_dma  + db->index * 4;
+	db->dma		= pgdir->db_dma  + db->index * HNS_ROCE_DB_UNIT_SIZE;
 	db->order	= order;
 
 	return 0;
@@ -150,7 +150,6 @@
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(hns_roce_alloc_db);
 
 void hns_roce_free_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db)
 {
@@ -170,7 +169,8 @@
 	i >>= o;
 	set_bit(i, db->u.pgdir->bits[o]);
 
-	if (bitmap_full(db->u.pgdir->order1, HNS_ROCE_DB_PER_PAGE / 2)) {
+	if (bitmap_full(db->u.pgdir->order1,
+			HNS_ROCE_DB_PER_PAGE / HNS_ROCE_DB_TYPE_COUNT)) {
 		dma_free_coherent(hr_dev->dev, PAGE_SIZE, db->u.pgdir->page,
 				  db->u.pgdir->db_dma);
 		list_del(&db->u.pgdir->list);
@@ -179,4 +179,3 @@
 
 	mutex_unlock(&hr_dev->pgdir_mutex);
 }
-EXPORT_SYMBOL_GPL(hns_roce_free_db);
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 9a24fd0..96d1302 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -37,9 +37,12 @@
 
 #define DRV_NAME "hns_roce"
 
+/* hip08 is a pci device; it has two versions, distinguished by the pci revision id */
+#define PCI_REVISION_ID_HIP08_A			0x20
+#define PCI_REVISION_ID_HIP08_B			0x21
+
 #define HNS_ROCE_HW_VER1	('h' << 24 | 'i' << 16 | '0' << 8 | '6')
 
-#define MAC_ADDR_OCTET_NUM			6
 #define HNS_ROCE_MAX_MSG_LEN			0x80000000
 
 #define HNS_ROCE_ALOGN_UP(a, b) ((((a) + (b) - 1) / (b)) * (b))
@@ -48,6 +51,10 @@
 
 #define HNS_ROCE_BA_SIZE			(32 * 4096)
 
+#define BA_BYTE_LEN				8
+
+#define BITS_PER_BYTE				8
+
 /* Hardware specification only for v1 engine */
 #define HNS_ROCE_MIN_CQE_NUM			0x40
 #define HNS_ROCE_MIN_WQE_NUM			0x20
@@ -55,6 +62,7 @@
 /* Hardware specification only for v1 engine */
 #define HNS_ROCE_MAX_INNER_MTPT_NUM		0x7
 #define HNS_ROCE_MAX_MTPT_PBL_NUM		0x100000
+#define HNS_ROCE_MAX_SGE_NUM			2
 
 #define HNS_ROCE_EACH_FREE_CQ_WAIT_MSECS	20
 #define HNS_ROCE_MAX_FREE_CQ_WAIT_CNT	\
@@ -64,6 +72,9 @@
 
 #define HNS_ROCE_MAX_IRQ_NUM			128
 
+#define HNS_ROCE_SGE_IN_WQE			2
+#define HNS_ROCE_SGE_SHIFT			4
+
 #define EQ_ENABLE				1
 #define EQ_DISABLE				0
 
@@ -73,7 +84,6 @@
 #define HNS_ROCE_CEQ_ENTRY_SIZE			0x4
 #define HNS_ROCE_AEQ_ENTRY_SIZE			0x10
 
-/* 4G/4K = 1M */
 #define HNS_ROCE_SL_SHIFT			28
 #define HNS_ROCE_TCLASS_SHIFT			20
 #define HNS_ROCE_FLOW_LABEL_MASK		0xfffff
@@ -81,6 +91,7 @@
 #define HNS_ROCE_MAX_PORTS			6
 #define HNS_ROCE_MAX_GID_NUM			16
 #define HNS_ROCE_GID_SIZE			16
+#define HNS_ROCE_SGE_SIZE			16
 
 #define HNS_ROCE_HOP_NUM_0			0xff
 
@@ -88,8 +99,11 @@
 #define BITMAP_RR				1
 
 #define MR_TYPE_MR				0x00
+#define MR_TYPE_FRMR				0x01
 #define MR_TYPE_DMA				0x03
 
+#define HNS_ROCE_FRMR_MAX_PA			512
+
 #define PKEY_ID					0xffff
 #define GUID_LEN				8
 #define NODE_DESC_SIZE				64
@@ -108,6 +122,16 @@
 #define PAGES_SHIFT_24				24
 #define PAGES_SHIFT_32				32
 
+#define HNS_ROCE_PCI_BAR_NUM			2
+
+#define HNS_ROCE_IDX_QUE_ENTRY_SZ		4
+#define SRQ_DB_REG				0x230
+
+/* The chip calculates the consumer index over a range of twice
+ * the actual EQ depth
+ */
+#define EQ_DEPTH_COEFF				2
+
 enum {
 	HNS_ROCE_SUPPORT_RQ_RECORD_DB = 1 << 0,
 	HNS_ROCE_SUPPORT_SQ_RECORD_DB = 1 << 1,
@@ -193,17 +217,53 @@
 	HNS_ROCE_CAP_FLAG_RQ_INLINE		= BIT(2),
 	HNS_ROCE_CAP_FLAG_RECORD_DB		= BIT(3),
 	HNS_ROCE_CAP_FLAG_SQ_RECORD_DB		= BIT(4),
+	HNS_ROCE_CAP_FLAG_SRQ			= BIT(5),
+	HNS_ROCE_CAP_FLAG_MW			= BIT(7),
+	HNS_ROCE_CAP_FLAG_FRMR                  = BIT(8),
+	HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL		= BIT(9),
+	HNS_ROCE_CAP_FLAG_ATOMIC		= BIT(10),
 };
 
 enum hns_roce_mtt_type {
 	MTT_TYPE_WQE,
 	MTT_TYPE_CQE,
+	MTT_TYPE_SRQWQE,
+	MTT_TYPE_IDX
 };
 
+#define HNS_ROCE_DB_TYPE_COUNT			2
+#define HNS_ROCE_DB_UNIT_SIZE			4
+
 enum {
 	HNS_ROCE_DB_PER_PAGE = PAGE_SIZE / 4
 };
 
+enum hns_roce_reset_stage {
+	HNS_ROCE_STATE_NON_RST,
+	HNS_ROCE_STATE_RST_BEF_DOWN,
+	HNS_ROCE_STATE_RST_DOWN,
+	HNS_ROCE_STATE_RST_UNINIT,
+	HNS_ROCE_STATE_RST_INIT,
+	HNS_ROCE_STATE_RST_INITED,
+};
+
+enum hns_roce_instance_state {
+	HNS_ROCE_STATE_NON_INIT,
+	HNS_ROCE_STATE_INIT,
+	HNS_ROCE_STATE_INITED,
+	HNS_ROCE_STATE_UNINIT,
+};
+
+enum {
+	HNS_ROCE_RST_DIRECT_RETURN		= 0,
+};
+
+enum {
+	CMD_RST_PRC_OTHERS,
+	CMD_RST_PRC_SUCCESS,
+	CMD_RST_PRC_EBUSY,
+};
+
 #define HNS_ROCE_CMD_SUCCESS			1
 
 #define HNS_ROCE_PORT_DOWN			0
@@ -219,19 +279,11 @@
 	unsigned long	logic_idx;
 };
 
-struct hns_roce_vma_data {
-	struct list_head list;
-	struct vm_area_struct *vma;
-	struct mutex *vma_list_mutex;
-};
-
 struct hns_roce_ucontext {
 	struct ib_ucontext	ibucontext;
 	struct hns_roce_uar	uar;
 	struct list_head	page_list;
 	struct mutex		page_mutex;
-	struct list_head	vma_list;
-	struct mutex		vma_list_mutex;
 };
 
 struct hns_roce_pd {
@@ -274,7 +326,7 @@
 	unsigned long	num_hem;
 	/* HEM entry record obj total num */
 	unsigned long	num_obj;
-	/*Single obj size */
+	/* Single obj size */
 	unsigned long	obj_size;
 	unsigned long	table_chunk_size;
 	int		lowmem;
@@ -293,6 +345,39 @@
 	enum hns_roce_mtt_type	mtt_type;
 };
 
+struct hns_roce_buf_region {
+	int offset; /* page offset */
+	u32 count; /* page count */
+	int hopnum; /* addressing hop num */
+};
+
+#define HNS_ROCE_MAX_BT_REGION	3
+#define HNS_ROCE_MAX_BT_LEVEL	3
+struct hns_roce_hem_list {
+	struct list_head root_bt;
+	/* link all bt dma mem by hop config */
+	struct list_head mid_bt[HNS_ROCE_MAX_BT_REGION][HNS_ROCE_MAX_BT_LEVEL];
+	struct list_head btm_bt; /* link all bottom bt in @mid_bt */
+	dma_addr_t root_ba; /* pointer to the root ba table */
+	int bt_pg_shift;
+};
+
+/* memory translation region */
+struct hns_roce_mtr {
+	struct hns_roce_hem_list hem_list;
+	int buf_pg_shift;
+};
+
+struct hns_roce_mw {
+	struct ib_mw		ibmw;
+	u32			pdn;
+	u32			rkey;
+	int			enabled; /* MW's active status */
+	u32			pbl_hop_num;
+	u32			pbl_ba_pg_sz;
+	u32			pbl_buf_pg_sz;
+};
+
 /* Only support 4K page size for mr register */
 #define MR_SIZE_4K 0
 
@@ -303,24 +388,25 @@
 	u64			size; /* Address range of MR */
 	u32			key; /* Key of MR */
 	u32			pd;   /* PD num of MR */
-	u32			access;/* Access permission of MR */
+	u32			access;	/* Access permission of MR */
+	u32			npages;
 	int			enabled; /* MR's active status */
 	int			type;	/* MR's register type */
-	u64			*pbl_buf;/* MR's PBL space */
+	u64			*pbl_buf;	/* MR's PBL space */
 	dma_addr_t		pbl_dma_addr;	/* MR's PBL space PA */
-	u32			pbl_size;/* PA number in the PBL */
-	u64			pbl_ba;/* page table address */
-	u32			l0_chunk_last_num;/* L0 last number */
-	u32			l1_chunk_last_num;/* L1 last number */
-	u64			**pbl_bt_l2;/* PBL BT L2 */
-	u64			**pbl_bt_l1;/* PBL BT L1 */
-	u64			*pbl_bt_l0;/* PBL BT L0 */
-	dma_addr_t		*pbl_l2_dma_addr;/* PBL BT L2 dma addr */
-	dma_addr_t		*pbl_l1_dma_addr;/* PBL BT L1 dma addr */
-	dma_addr_t		pbl_l0_dma_addr;/* PBL BT L0 dma addr */
-	u32			pbl_ba_pg_sz;/* BT chunk page size */
-	u32			pbl_buf_pg_sz;/* buf chunk page size */
-	u32			pbl_hop_num;/* multi-hop number */
+	u32			pbl_size;	/* PA number in the PBL */
+	u64			pbl_ba;		/* page table address */
+	u32			l0_chunk_last_num;	/* L0 last number */
+	u32			l1_chunk_last_num;	/* L1 last number */
+	u64			**pbl_bt_l2;	/* PBL BT L2 */
+	u64			**pbl_bt_l1;	/* PBL BT L1 */
+	u64			*pbl_bt_l0;	/* PBL BT L0 */
+	dma_addr_t		*pbl_l2_dma_addr;	/* PBL BT L2 dma addr */
+	dma_addr_t		*pbl_l1_dma_addr;	/* PBL BT L1 dma addr */
+	dma_addr_t		pbl_l0_dma_addr;	/* PBL BT L0 dma addr */
+	u32			pbl_ba_pg_sz;	/* BT chunk page size */
+	u32			pbl_buf_pg_sz;	/* buf chunk page size */
+	u32			pbl_hop_num;	/* multi-hop number */
 };
 
 struct hns_roce_mr_table {
@@ -330,6 +416,10 @@
 	struct hns_roce_hem_table	mtpt_table;
 	struct hns_roce_buddy		mtt_cqe_buddy;
 	struct hns_roce_hem_table	mtt_cqe_table;
+	struct hns_roce_buddy		mtt_srqwqe_buddy;
+	struct hns_roce_hem_table	mtt_srqwqe_table;
+	struct hns_roce_buddy		mtt_idx_buddy;
+	struct hns_roce_hem_table	mtt_idx_table;
 };
 
 struct hns_roce_wq {
@@ -339,16 +429,16 @@
 	u32		max_post;
 	int		max_gs;
 	int		offset;
-	int		wqe_shift;/* WQE size */
+	int		wqe_shift;	/* WQE size */
 	u32		head;
 	u32		tail;
 	void __iomem	*db_reg_l;
 };
 
 struct hns_roce_sge {
-	int		sge_cnt;  /* SGE num */
+	int		sge_cnt;	/* SGE num */
 	int		offset;
-	int		sge_shift;/* SGE size */
+	int		sge_shift;	/* SGE size */
 };
 
 struct hns_roce_buf_list {
@@ -367,8 +457,8 @@
 struct hns_roce_db_pgdir {
 	struct list_head	list;
 	DECLARE_BITMAP(order0, HNS_ROCE_DB_PER_PAGE);
-	DECLARE_BITMAP(order1, HNS_ROCE_DB_PER_PAGE / 2);
-	unsigned long		*bits[2];
+	DECLARE_BITMAP(order1, HNS_ROCE_DB_PER_PAGE / HNS_ROCE_DB_TYPE_COUNT);
+	unsigned long		*bits[HNS_ROCE_DB_TYPE_COUNT];
 	u32			*page;
 	dma_addr_t		db_dma;
 };
@@ -420,9 +510,37 @@
 	struct completion		free;
 };
 
+struct hns_roce_idx_que {
+	struct hns_roce_buf		idx_buf;
+	int				entry_sz;
+	u32				buf_size;
+	struct ib_umem			*umem;
+	struct hns_roce_mtt		mtt;
+	unsigned long			*bitmap;
+};
+
 struct hns_roce_srq {
 	struct ib_srq		ibsrq;
-	int			srqn;
+	void (*event)(struct hns_roce_srq *srq, enum hns_roce_event event);
+	unsigned long		srqn;
+	int			max;
+	int			max_gs;
+	int			wqe_shift;
+	void __iomem		*db_reg_l;
+
+	atomic_t		refcount;
+	struct completion	free;
+
+	struct hns_roce_buf	buf;
+	u64		       *wrid;
+	struct ib_umem	       *umem;
+	struct hns_roce_mtt	mtt;
+	struct hns_roce_idx_que idx_que;
+	spinlock_t		lock;
+	int			head;
+	int			tail;
+	u16			wqe_ctr;
+	struct mutex		mutex;
 };
 
 struct hns_roce_uar_table {
@@ -431,16 +549,22 @@
 
 struct hns_roce_qp_table {
 	struct hns_roce_bitmap		bitmap;
-	spinlock_t			lock;
 	struct hns_roce_hem_table	qp_table;
 	struct hns_roce_hem_table	irrl_table;
 	struct hns_roce_hem_table	trrl_table;
+	struct hns_roce_hem_table	sccc_table;
+	struct mutex			scc_mutex;
 };
 
 struct hns_roce_cq_table {
 	struct hns_roce_bitmap		bitmap;
-	spinlock_t			lock;
-	struct radix_tree_root		tree;
+	struct xarray			array;
+	struct hns_roce_hem_table	table;
+};
+
+struct hns_roce_srq_table {
+	struct hns_roce_bitmap		bitmap;
+	struct xarray			xa;
 	struct hns_roce_hem_table	table;
 };
 
@@ -449,14 +573,17 @@
 };
 
 struct hns_roce_av {
-	__le32      port_pd;
+	u8          port;
 	u8          gid_index;
 	u8          stat_rate;
 	u8          hop_limit;
-	__le32      sl_tclass_flowlabel;
+	u32         flowlabel;
+	u8          sl;
+	u8          tclass;
 	u8          dgid[HNS_ROCE_GID_SIZE];
-	u8          mac[6];
-	__le16      vlan;
+	u8          mac[ETH_ALEN];
+	u16         vlan;
+	bool	    vlan_en;
 };
 
 struct hns_roce_ah {
@@ -497,7 +624,6 @@
 	 * close device, switch into poll mode(non event mode)
 	 */
 	u8			use_events;
-	u8			toggle;
 };
 
 struct hns_roce_cmd_mailbox {
@@ -531,14 +657,20 @@
 	u8			rdb_en;
 	u8			sdb_en;
 	u32			doorbell_qpn;
-	__le32			sq_signal_bits;
+	u32			sq_signal_bits;
 	u32			sq_next_wqe;
-	int			sq_max_wqes_per_wr;
-	int			sq_spare_wqes;
 	struct hns_roce_wq	sq;
 
 	struct ib_umem		*umem;
 	struct hns_roce_mtt	mtt;
+	struct hns_roce_mtr	mtr;
+
+	/* this define must not exceed HNS_ROCE_MAX_BT_REGION */
+#define HNS_ROCE_WQE_REGION_MAX	 3
+	struct hns_roce_buf_region regions[HNS_ROCE_WQE_REGION_MAX];
+	int			region_cnt;
+	int                     wqe_bt_pg_shift;
+
 	u32			buff_size;
 	struct mutex		mutex;
 	u8			port;
@@ -580,7 +712,7 @@
 };
 
 struct hns_roce_ceqe {
-	u32			comp;
+	__le32			comp;
 };
 
 struct hns_roce_aeqe {
@@ -593,6 +725,12 @@
 		} qp_event;
 
 		struct {
+			__le32 srq;
+			u32 rsv0;
+			u32 rsv1;
+		} srq_event;
+
+		struct {
 			__le32 cq;
 			u32 rsv0;
 			u32 rsv1;
@@ -617,7 +755,7 @@
 	struct hns_roce_dev		*hr_dev;
 	void __iomem			*doorbell;
 
-	int				type_flag;/* Aeq:1 ceq:0 */
+	int				type_flag; /* Aeq:1 ceq:0 */
 	int				eqn;
 	u32				entries;
 	int				log_entries;
@@ -656,19 +794,29 @@
 };
 
 struct hns_roce_caps {
+	u64		fw_ver;
 	u8		num_ports;
 	int		gid_table_len[HNS_ROCE_MAX_PORTS];
 	int		pkey_table_len[HNS_ROCE_MAX_PORTS];
 	int		local_ca_ack_delay;
 	int		num_uars;
 	u32		phy_num_uars;
-	u32		max_sq_sg;	/* 2 */
-	u32		max_sq_inline;	/* 32 */
-	u32		max_rq_sg;	/* 2 */
-	int		num_qps;	/* 256k */
-	u32		max_wqes;	/* 16k */
-	u32		max_sq_desc_sz;	/* 64 */
-	u32		max_rq_desc_sz;	/* 64 */
+	u32		max_sq_sg;
+	u32		max_sq_inline;
+	u32		max_rq_sg;
+	u32		max_extend_sg;
+	int		num_qps;
+	int             reserved_qps;
+	int		num_qpc_timer;
+	int		num_cqc_timer;
+	u32		max_srq_sg;
+	int		num_srqs;
+	u32		max_wqes;
+	u32		max_srqs;
+	u32		max_srq_wrs;
+	u32		max_srq_sges;
+	u32		max_sq_desc_sz;
+	u32		max_rq_desc_sz;
 	u32		max_srq_desc_sz;
 	int		max_qp_init_rdma;
 	int		max_qp_dest_rdma;
@@ -677,12 +825,16 @@
 	int		min_cqes;
 	u32		min_wqes;
 	int		reserved_cqs;
-	int		num_aeq_vectors;	/* 1 */
+	int		reserved_srqs;
+	u32		max_srqwqes;
+	int		num_aeq_vectors;
 	int		num_comp_vectors;
 	int		num_other_vectors;
 	int		num_mtpts;
 	u32		num_mtt_segs;
 	u32		num_cqe_segs;
+	u32		num_srqwqe_segs;
+	u32		num_idx_segs;
 	int		reserved_mrws;
 	int		reserved_uars;
 	int		num_pds;
@@ -696,6 +848,11 @@
 	int		irrl_entry_sz;
 	int		trrl_entry_sz;
 	int		cqc_entry_sz;
+	int		sccc_entry_sz;
+	int		qpc_timer_entry_sz;
+	int		cqc_timer_entry_sz;
+	int		srqc_entry_sz;
+	int		idx_entry_sz;
 	u32		pbl_ba_pg_sz;
 	u32		pbl_buf_pg_sz;
 	u32		pbl_hop_num;
@@ -703,9 +860,12 @@
 	int		ceqe_depth;
 	enum ib_mtu	max_mtu;
 	u32		qpc_bt_num;
+	u32		qpc_timer_bt_num;
 	u32		srqc_bt_num;
 	u32		cqc_bt_num;
+	u32		cqc_timer_bt_num;
 	u32		mpt_bt_num;
+	u32		sccc_bt_num;
 	u32		qpc_ba_pg_sz;
 	u32		qpc_buf_pg_sz;
 	u32		qpc_hop_num;
@@ -721,16 +881,34 @@
 	u32		mtt_ba_pg_sz;
 	u32		mtt_buf_pg_sz;
 	u32		mtt_hop_num;
+	u32		wqe_sq_hop_num;
+	u32		wqe_sge_hop_num;
+	u32		wqe_rq_hop_num;
+	u32		sccc_ba_pg_sz;
+	u32		sccc_buf_pg_sz;
+	u32		sccc_hop_num;
+	u32		qpc_timer_ba_pg_sz;
+	u32		qpc_timer_buf_pg_sz;
+	u32		qpc_timer_hop_num;
+	u32		cqc_timer_ba_pg_sz;
+	u32		cqc_timer_buf_pg_sz;
+	u32		cqc_timer_hop_num;
 	u32		cqe_ba_pg_sz;
 	u32		cqe_buf_pg_sz;
 	u32		cqe_hop_num;
+	u32		srqwqe_ba_pg_sz;
+	u32		srqwqe_buf_pg_sz;
+	u32		srqwqe_hop_num;
+	u32		idx_ba_pg_sz;
+	u32		idx_buf_pg_sz;
+	u32		idx_hop_num;
 	u32		eqe_ba_pg_sz;
 	u32		eqe_buf_pg_sz;
 	u32		eqe_hop_num;
 	u32		sl_num;
 	u32		tsq_buf_pg_sz;
 	u32		tpq_buf_pg_sz;
-	u32		chunk_sz;	/* chunk size in non multihop mode*/
+	u32		chunk_sz;	/* chunk size in non multihop mode */
 	u64		flags;
 };
 
@@ -738,10 +916,16 @@
 	struct hns_roce_dev *hr_dev;
 	struct work_struct work;
 	u32 qpn;
+	u32 cqn;
 	int event_type;
 	int sub_type;
 };
 
+struct hns_roce_dfx_hw {
+	int (*query_cqc_info)(struct hns_roce_dev *hr_dev, u32 cqn,
+			      int *buffer);
+};
+
 struct hns_roce_hw {
 	int (*reset)(struct hns_roce_dev *hr_dev, bool enable);
 	int (*cmq_init)(struct hns_roce_dev *hr_dev);
@@ -753,6 +937,7 @@
 			 u64 out_param, u32 in_modifier, u8 op_modifier, u16 op,
 			 u16 token, int event);
 	int (*chk_mbox)(struct hns_roce_dev *hr_dev, unsigned long timeout);
+	int (*rst_prc_mbox)(struct hns_roce_dev *hr_dev);
 	int (*set_gid)(struct hns_roce_dev *hr_dev, u8 port, int gid_index,
 		       const union ib_gid *gid, const struct ib_gid_attr *attr);
 	int (*set_mac)(struct hns_roce_dev *hr_dev, u8 phy_port, u8 *addr);
@@ -764,6 +949,8 @@
 				struct hns_roce_mr *mr, int flags, u32 pdn,
 				int mr_access_flags, u64 iova, u64 size,
 				void *mb_buf);
+	int (*frmr_write_mtpt)(void *mb_buf, struct hns_roce_mr *mr);
+	int (*mw_write_mtpt)(void *mb_buf, struct hns_roce_mw *mw);
 	void (*write_cqc)(struct hns_roce_dev *hr_dev,
 			  struct hns_roce_cq *hr_cq, void *mb_buf, u64 *mtts,
 			  dma_addr_t dma_handle, int nent, u32 vector);
@@ -777,18 +964,34 @@
 	int (*modify_qp)(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
 			 int attr_mask, enum ib_qp_state cur_state,
 			 enum ib_qp_state new_state);
-	int (*destroy_qp)(struct ib_qp *ibqp);
+	int (*destroy_qp)(struct ib_qp *ibqp, struct ib_udata *udata);
+	int (*qp_flow_control_init)(struct hns_roce_dev *hr_dev,
+			 struct hns_roce_qp *hr_qp);
 	int (*post_send)(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 			 const struct ib_send_wr **bad_wr);
 	int (*post_recv)(struct ib_qp *qp, const struct ib_recv_wr *recv_wr,
 			 const struct ib_recv_wr **bad_recv_wr);
 	int (*req_notify_cq)(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
 	int (*poll_cq)(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
-	int (*dereg_mr)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr);
-	int (*destroy_cq)(struct ib_cq *ibcq);
+	int (*dereg_mr)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
+			struct ib_udata *udata);
+	void (*destroy_cq)(struct ib_cq *ibcq, struct ib_udata *udata);
 	int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
 	int (*init_eq)(struct hns_roce_dev *hr_dev);
 	void (*cleanup_eq)(struct hns_roce_dev *hr_dev);
+	void (*write_srqc)(struct hns_roce_dev *hr_dev,
+			   struct hns_roce_srq *srq, u32 pdn, u16 xrcd, u32 cqn,
+			   void *mb_buf, u64 *mtts_wqe, u64 *mtts_idx,
+			   dma_addr_t dma_handle_wqe,
+			   dma_addr_t dma_handle_idx);
+	int (*modify_srq)(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr,
+		       enum ib_srq_attr_mask srq_attr_mask,
+		       struct ib_udata *udata);
+	int (*query_srq)(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
+	int (*post_srq_recv)(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
+			     const struct ib_recv_wr **bad_wr);
+	const struct ib_device_ops *hns_roce_dev_ops;
+	const struct ib_device_ops *hns_roce_dev_srq_ops;
 };
 
 struct hns_roce_dev {
@@ -802,6 +1005,8 @@
 	spinlock_t		bt_cmd_lock;
 	bool			active;
 	bool			is_reset;
+	bool			dis_db;
+	unsigned long		reset_cnt;
 	struct hns_roce_ib_iboe iboe;
 
 	struct list_head        pgdir_list;
@@ -809,9 +1014,9 @@
 	int			irq[HNS_ROCE_MAX_IRQ_NUM];
 	u8 __iomem		*reg_base;
 	struct hns_roce_caps	caps;
-	struct radix_tree_root  qp_table_tree;
+	struct xarray		qp_table_xa;
 
-	unsigned char	dev_addr[HNS_ROCE_MAX_PORTS][MAC_ADDR_OCTET_NUM];
+	unsigned char	dev_addr[HNS_ROCE_MAX_PORTS][ETH_ALEN];
 	u64			sys_image_guid;
 	u32                     vendor_id;
 	u32                     vendor_part_id;
@@ -823,18 +1028,22 @@
 	struct hns_roce_uar_table uar_table;
 	struct hns_roce_mr_table  mr_table;
 	struct hns_roce_cq_table  cq_table;
+	struct hns_roce_srq_table srq_table;
 	struct hns_roce_qp_table  qp_table;
 	struct hns_roce_eq_table  eq_table;
+	struct hns_roce_hem_table  qpc_timer_table;
+	struct hns_roce_hem_table  cqc_timer_table;
 
 	int			cmd_mod;
 	int			loop_idc;
 	u32			sdb_offset;
 	u32			odb_offset;
-	dma_addr_t		tptr_dma_addr; /*only for hw v1*/
-	u32			tptr_size; /*only for hw v1*/
+	dma_addr_t		tptr_dma_addr;	/* only for hw v1 */
+	u32			tptr_size;	/* only for hw v1 */
 	const struct hns_roce_hw *hw;
 	void			*priv;
 	struct workqueue_struct *irq_workq;
+	const struct hns_roce_dfx_hw *dfx;
 };
 
 static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev)
@@ -863,6 +1072,11 @@
 	return container_of(ibmr, struct hns_roce_mr, ibmr);
 }
 
+static inline struct hns_roce_mw *to_hr_mw(struct ib_mw *ibmw)
+{
+	return container_of(ibmw, struct hns_roce_mw, ibmw);
+}
+
 static inline struct hns_roce_qp *to_hr_qp(struct ib_qp *ibqp)
 {
 	return container_of(ibqp, struct hns_roce_qp, ibqp);
@@ -891,8 +1105,7 @@
 static inline struct hns_roce_qp
 	*__hns_roce_qp_lookup(struct hns_roce_dev *hr_dev, u32 qpn)
 {
-	return radix_tree_lookup(&hr_dev->qp_table_tree,
-				 qpn & (hr_dev->caps.num_qps - 1));
+	return xa_load(&hr_dev->qp_table_xa, qpn & (hr_dev->caps.num_qps - 1));
 }
 
 static inline void *hns_roce_buf_offset(struct hns_roce_buf *buf, int offset)
@@ -925,17 +1138,32 @@
 int hns_roce_buf_write_mtt(struct hns_roce_dev *hr_dev,
 			   struct hns_roce_mtt *mtt, struct hns_roce_buf *buf);
 
+void hns_roce_mtr_init(struct hns_roce_mtr *mtr, int bt_pg_shift,
+		       int buf_pg_shift);
+int hns_roce_mtr_attach(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+			dma_addr_t **bufs, struct hns_roce_buf_region *regions,
+			int region_cnt);
+void hns_roce_mtr_cleanup(struct hns_roce_dev *hr_dev,
+			  struct hns_roce_mtr *mtr);
+
+/* hns roce hw needs the current and the next block address from the mtt */
+#define MTT_MIN_COUNT	 2
+int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+		      int offset, u64 *mtt_buf, int mtt_max, u64 *base_addr);
+
 int hns_roce_init_pd_table(struct hns_roce_dev *hr_dev);
 int hns_roce_init_mr_table(struct hns_roce_dev *hr_dev);
 int hns_roce_init_eq_table(struct hns_roce_dev *hr_dev);
 int hns_roce_init_cq_table(struct hns_roce_dev *hr_dev);
 int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev);
+int hns_roce_init_srq_table(struct hns_roce_dev *hr_dev);
 
 void hns_roce_cleanup_pd_table(struct hns_roce_dev *hr_dev);
 void hns_roce_cleanup_mr_table(struct hns_roce_dev *hr_dev);
 void hns_roce_cleanup_eq_table(struct hns_roce_dev *hr_dev);
 void hns_roce_cleanup_cq_table(struct hns_roce_dev *hr_dev);
 void hns_roce_cleanup_qp_table(struct hns_roce_dev *hr_dev);
+void hns_roce_cleanup_srq_table(struct hns_roce_dev *hr_dev);
 
 int hns_roce_bitmap_alloc(struct hns_roce_bitmap *bitmap, unsigned long *obj);
 void hns_roce_bitmap_free(struct hns_roce_bitmap *bitmap, unsigned long obj,
@@ -950,16 +1178,13 @@
 				unsigned long obj, int cnt,
 				int rr);
 
-struct ib_ah *hns_roce_create_ah(struct ib_pd *pd,
-				 struct rdma_ah_attr *ah_attr,
-				 struct ib_udata *udata);
+int hns_roce_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr,
+		       u32 flags, struct ib_udata *udata);
 int hns_roce_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
-int hns_roce_destroy_ah(struct ib_ah *ah);
+void hns_roce_destroy_ah(struct ib_ah *ah, u32 flags);
 
-struct ib_pd *hns_roce_alloc_pd(struct ib_device *ib_dev,
-				struct ib_ucontext *context,
-				struct ib_udata *udata);
-int hns_roce_dealloc_pd(struct ib_pd *pd);
+int hns_roce_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+void hns_roce_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 
 struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc);
 struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
@@ -968,12 +1193,20 @@
 int hns_roce_rereg_user_mr(struct ib_mr *mr, int flags, u64 start, u64 length,
 			   u64 virt_addr, int mr_access_flags, struct ib_pd *pd,
 			   struct ib_udata *udata);
-int hns_roce_dereg_mr(struct ib_mr *ibmr);
+struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+				u32 max_num_sg, struct ib_udata *udata);
+int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+		       unsigned int *sg_offset);
+int hns_roce_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
 int hns_roce_hw2sw_mpt(struct hns_roce_dev *hr_dev,
 		       struct hns_roce_cmd_mailbox *mailbox,
 		       unsigned long mpt_index);
 unsigned long key_to_hw_index(u32 key);
 
+struct ib_mw *hns_roce_alloc_mw(struct ib_pd *pd, enum ib_mw_type,
+				struct ib_udata *udata);
+int hns_roce_dealloc_mw(struct ib_mw *ibmw);
+
 void hns_roce_buf_free(struct hns_roce_dev *hr_dev, u32 size,
 		       struct hns_roce_buf *buf);
 int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct,
@@ -982,6 +1215,26 @@
 int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev,
 			       struct hns_roce_mtt *mtt, struct ib_umem *umem);
 
+void hns_roce_init_buf_region(struct hns_roce_buf_region *region, int hopnum,
+			      int offset, int buf_cnt);
+int hns_roce_alloc_buf_list(struct hns_roce_buf_region *regions,
+			    dma_addr_t **bufs, int count);
+void hns_roce_free_buf_list(dma_addr_t **bufs, int count);
+
+int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs,
+			   int buf_cnt, int start, struct hns_roce_buf *buf);
+int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs,
+			   int buf_cnt, int start, struct ib_umem *umem,
+			   int page_shift);
+
+int hns_roce_create_srq(struct ib_srq *srq,
+			struct ib_srq_init_attr *srq_init_attr,
+			struct ib_udata *udata);
+int hns_roce_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr,
+			enum ib_srq_attr_mask srq_attr_mask,
+			struct ib_udata *udata);
+void hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata);
+
 struct ib_qp *hns_roce_create_qp(struct ib_pd *ib_pd,
 				 struct ib_qp_init_attr *init_attr,
 				 struct ib_udata *udata);
@@ -1004,15 +1257,15 @@
 __be32 send_ieth(const struct ib_send_wr *wr);
 int to_hr_qp_type(int qp_type);
 
-struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
-				    const struct ib_cq_init_attr *attr,
-				    struct ib_ucontext *context,
-				    struct ib_udata *udata);
+int hns_roce_ib_create_cq(struct ib_cq *ib_cq,
+			  const struct ib_cq_init_attr *attr,
+			  struct ib_udata *udata);
 
-int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq);
+void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
 void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq);
 
-int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt,
+int hns_roce_db_map_user(struct hns_roce_ucontext *context,
+			 struct ib_udata *udata, unsigned long virt,
 			 struct hns_roce_db *db);
 void hns_roce_db_unmap_user(struct hns_roce_ucontext *context,
 			    struct hns_roce_db *db);
@@ -1023,8 +1276,11 @@
 void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn);
 void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type);
 void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type);
+void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type);
 int hns_get_gid_index(struct hns_roce_dev *hr_dev, u8 port, int gid_index);
 int hns_roce_init(struct hns_roce_dev *hr_dev);
 void hns_roce_exit(struct hns_roce_dev *hr_dev);
 
+int hns_roce_fill_res_entry(struct sk_buff *msg,
+			    struct rdma_restrack_entry *res);
 #endif /* _HNS_ROCE_DEVICE_H */
diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c
index f6faefe..e822157 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.c
@@ -41,25 +41,57 @@
 
 bool hns_roce_check_whether_mhop(struct hns_roce_dev *hr_dev, u32 type)
 {
-	if ((hr_dev->caps.qpc_hop_num && type == HEM_TYPE_QPC) ||
-	    (hr_dev->caps.mpt_hop_num && type == HEM_TYPE_MTPT) ||
-	    (hr_dev->caps.cqc_hop_num && type == HEM_TYPE_CQC) ||
-	    (hr_dev->caps.srqc_hop_num && type == HEM_TYPE_SRQC) ||
-	    (hr_dev->caps.cqe_hop_num && type == HEM_TYPE_CQE) ||
-	    (hr_dev->caps.mtt_hop_num && type == HEM_TYPE_MTT))
-		return true;
+	int hop_num = 0;
 
-	return false;
+	switch (type) {
+	case HEM_TYPE_QPC:
+		hop_num = hr_dev->caps.qpc_hop_num;
+		break;
+	case HEM_TYPE_MTPT:
+		hop_num = hr_dev->caps.mpt_hop_num;
+		break;
+	case HEM_TYPE_CQC:
+		hop_num = hr_dev->caps.cqc_hop_num;
+		break;
+	case HEM_TYPE_SRQC:
+		hop_num = hr_dev->caps.srqc_hop_num;
+		break;
+	case HEM_TYPE_SCCC:
+		hop_num = hr_dev->caps.sccc_hop_num;
+		break;
+	case HEM_TYPE_QPC_TIMER:
+		hop_num = hr_dev->caps.qpc_timer_hop_num;
+		break;
+	case HEM_TYPE_CQC_TIMER:
+		hop_num = hr_dev->caps.cqc_timer_hop_num;
+		break;
+	case HEM_TYPE_CQE:
+		hop_num = hr_dev->caps.cqe_hop_num;
+		break;
+	case HEM_TYPE_MTT:
+		hop_num = hr_dev->caps.mtt_hop_num;
+		break;
+	case HEM_TYPE_SRQWQE:
+		hop_num = hr_dev->caps.srqwqe_hop_num;
+		break;
+	case HEM_TYPE_IDX:
+		hop_num = hr_dev->caps.idx_hop_num;
+		break;
+	default:
+		return false;
+	}
+
+	return hop_num ? true : false;
 }
-EXPORT_SYMBOL_GPL(hns_roce_check_whether_mhop);
 
 static bool hns_roce_check_hem_null(struct hns_roce_hem **hem, u64 start_idx,
-			    u32 bt_chunk_num)
+			    u32 bt_chunk_num, u64 hem_max_num)
 {
-	int i;
+	u64 check_max_num = start_idx + bt_chunk_num;
+	u64 i;
 
-	for (i = 0; i < bt_chunk_num; i++)
-		if (hem[start_idx + i])
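+	/* also bound the scan by hem_max_num so a partial last chunk is
+	 * not read past the end of the hem array
+	 */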
+	for (i = start_idx; (i < check_max_num) && (i < hem_max_num); i++)
+		if (hem[i])
 			return false;
 
 	return true;
@@ -88,17 +120,13 @@
 		return 0;
 }
 
-int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev,
-			   struct hns_roce_hem_table *table, unsigned long *obj,
-			   struct hns_roce_hem_mhop *mhop)
+static int get_hem_table_config(struct hns_roce_dev *hr_dev,
+				struct hns_roce_hem_mhop *mhop,
+				u32 type)
 {
 	struct device *dev = hr_dev->dev;
-	u32 chunk_ba_num;
-	u32 table_idx;
-	u32 bt_num;
-	u32 chunk_size;
 
-	switch (table->type) {
+	switch (type) {
 	case HEM_TYPE_QPC:
 		mhop->buf_chunk_size = 1 << (hr_dev->caps.qpc_buf_pg_sz
 					     + PAGE_SHIFT);
@@ -123,6 +151,30 @@
 		mhop->ba_l0_num = hr_dev->caps.cqc_bt_num;
 		mhop->hop_num = hr_dev->caps.cqc_hop_num;
 		break;
+	case HEM_TYPE_SCCC:
+		mhop->buf_chunk_size = 1 << (hr_dev->caps.sccc_buf_pg_sz
+					     + PAGE_SHIFT);
+		mhop->bt_chunk_size = 1 << (hr_dev->caps.sccc_ba_pg_sz
+					    + PAGE_SHIFT);
+		mhop->ba_l0_num = hr_dev->caps.sccc_bt_num;
+		mhop->hop_num = hr_dev->caps.sccc_hop_num;
+		break;
+	case HEM_TYPE_QPC_TIMER:
+		mhop->buf_chunk_size = 1 << (hr_dev->caps.qpc_timer_buf_pg_sz
+					     + PAGE_SHIFT);
+		mhop->bt_chunk_size = 1 << (hr_dev->caps.qpc_timer_ba_pg_sz
+					    + PAGE_SHIFT);
+		mhop->ba_l0_num = hr_dev->caps.qpc_timer_bt_num;
+		mhop->hop_num = hr_dev->caps.qpc_timer_hop_num;
+		break;
+	case HEM_TYPE_CQC_TIMER:
+		mhop->buf_chunk_size = 1 << (hr_dev->caps.cqc_timer_buf_pg_sz
+					     + PAGE_SHIFT);
+		mhop->bt_chunk_size = 1 << (hr_dev->caps.cqc_timer_ba_pg_sz
+					    + PAGE_SHIFT);
+		mhop->ba_l0_num = hr_dev->caps.cqc_timer_bt_num;
+		mhop->hop_num = hr_dev->caps.cqc_timer_hop_num;
+		break;
 	case HEM_TYPE_SRQC:
 		mhop->buf_chunk_size = 1 << (hr_dev->caps.srqc_buf_pg_sz
 					     + PAGE_SHIFT);
@@ -136,7 +188,7 @@
 					     + PAGE_SHIFT);
 		mhop->bt_chunk_size = 1 << (hr_dev->caps.mtt_ba_pg_sz
 					     + PAGE_SHIFT);
-		mhop->ba_l0_num = mhop->bt_chunk_size / 8;
+		mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN;
 		mhop->hop_num = hr_dev->caps.mtt_hop_num;
 		break;
 	case HEM_TYPE_CQE:
@@ -144,24 +196,56 @@
 					     + PAGE_SHIFT);
 		mhop->bt_chunk_size = 1 << (hr_dev->caps.cqe_ba_pg_sz
 					     + PAGE_SHIFT);
-		mhop->ba_l0_num = mhop->bt_chunk_size / 8;
+		mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN;
 		mhop->hop_num = hr_dev->caps.cqe_hop_num;
 		break;
+	case HEM_TYPE_SRQWQE:
+		mhop->buf_chunk_size = 1 << (hr_dev->caps.srqwqe_buf_pg_sz
+					    + PAGE_SHIFT);
+		mhop->bt_chunk_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz
+					    + PAGE_SHIFT);
+		mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN;
+		mhop->hop_num = hr_dev->caps.srqwqe_hop_num;
+		break;
+	case HEM_TYPE_IDX:
+		mhop->buf_chunk_size = 1 << (hr_dev->caps.idx_buf_pg_sz
+				       + PAGE_SHIFT);
+		mhop->bt_chunk_size = 1 << (hr_dev->caps.idx_ba_pg_sz
+				       + PAGE_SHIFT);
+		mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN;
+		mhop->hop_num = hr_dev->caps.idx_hop_num;
+		break;
 	default:
 		dev_err(dev, "Table %d not support multi-hop addressing!\n",
-			 table->type);
+			type);
 		return -EINVAL;
 	}
 
+	return 0;
+}
+
+int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev,
+			   struct hns_roce_hem_table *table, unsigned long *obj,
+			   struct hns_roce_hem_mhop *mhop)
+{
+	struct device *dev = hr_dev->dev;
+	u32 chunk_ba_num;
+	u32 table_idx;
+	u32 bt_num;
+	u32 chunk_size;
+
+	if (get_hem_table_config(hr_dev, mhop, table->type))
+		return -EINVAL;
+
 	if (!obj)
 		return 0;
 
 	/*
-	 * QPC/MTPT/CQC/SRQC alloc hem for buffer pages.
+	 * QPC/MTPT/CQC/SRQC/SCCC alloc hem for buffer pages.
 	 * MTT/CQE alloc hem for bt pages.
 	 */
 	bt_num = hns_roce_get_bt_num(table->type, mhop->hop_num);
-	chunk_ba_num = mhop->bt_chunk_size / 8;
+	chunk_ba_num = mhop->bt_chunk_size / BA_BYTE_LEN;
 	chunk_size = table->type < HEM_TYPE_MTT ? mhop->buf_chunk_size :
 			      mhop->bt_chunk_size;
 	table_idx = (*obj & (table->num_obj - 1)) /
@@ -189,7 +273,6 @@
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(hns_roce_calc_hem_mhop);
 
 static struct hns_roce_hem *hns_roce_alloc_hem(struct hns_roce_dev *hr_dev,
 					       int npages,
@@ -281,13 +364,13 @@
 {
 	spinlock_t *lock = &hr_dev->bt_cmd_lock;
 	struct device *dev = hr_dev->dev;
-	unsigned long end = 0;
+	long end;
 	unsigned long flags;
 	struct hns_roce_hem_iter iter;
 	void __iomem *bt_cmd;
-	u32 bt_cmd_h_val = 0;
-	u32 bt_cmd_val[2];
-	u32 bt_cmd_l = 0;
+	__le32 bt_cmd_val[2];
+	__le32 bt_cmd_h = 0;
+	__le32 bt_cmd_l = 0;
 	u64 bt_ba = 0;
 	int ret = 0;
 
@@ -297,30 +380,20 @@
 
 	switch (table->type) {
 	case HEM_TYPE_QPC:
-		roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M,
-			       ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S, HEM_TYPE_QPC);
-		break;
 	case HEM_TYPE_MTPT:
-		roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M,
-			       ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S,
-			       HEM_TYPE_MTPT);
-		break;
 	case HEM_TYPE_CQC:
-		roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M,
-			       ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S, HEM_TYPE_CQC);
-		break;
 	case HEM_TYPE_SRQC:
-		roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M,
-			       ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S,
-			       HEM_TYPE_SRQC);
+		roce_set_field(bt_cmd_h, ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M,
+			ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S, table->type);
 		break;
 	default:
 		return ret;
 	}
-	roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_M,
+
+	roce_set_field(bt_cmd_h, ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_M,
 		       ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_S, obj);
-	roce_set_bit(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_S, 0);
-	roce_set_bit(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_HW_SYNS_S, 1);
+	roce_set_bit(bt_cmd_h, ROCEE_BT_CMD_H_ROCEE_BT_CMD_S, 0);
+	roce_set_bit(bt_cmd_h, ROCEE_BT_CMD_H_ROCEE_BT_CMD_HW_SYNS_S, 1);
 
 	/* Currently iterate over only one chunk */
 	for (hns_roce_hem_first(table->hem[i], &iter);
@@ -331,27 +404,28 @@
 
 		bt_cmd = hr_dev->reg_base + ROCEE_BT_CMD_H_REG;
 
-		end = msecs_to_jiffies(HW_SYNC_TIMEOUT_MSECS) + jiffies;
-		while (1) {
-			if (readl(bt_cmd) >> BT_CMD_SYNC_SHIFT) {
-				if (!(time_before(jiffies, end))) {
-					dev_err(dev, "Write bt_cmd err,hw_sync is not zero.\n");
-					spin_unlock_irqrestore(lock, flags);
-					return -EBUSY;
-				}
-			} else {
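+		/* poll the sync bit in fixed millisecond steps; end tracks
+		 * the remaining timeout budget
+		 */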
+		end = HW_SYNC_TIMEOUT_MSECS;
+		while (end > 0) {
+			if (!(readl(bt_cmd) >> BT_CMD_SYNC_SHIFT))
 				break;
-			}
+
 			mdelay(HW_SYNC_SLEEP_TIME_INTERVAL);
+			end -= HW_SYNC_SLEEP_TIME_INTERVAL;
 		}
 
-		bt_cmd_l = (u32)bt_ba;
-		roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_M,
+		if (end <= 0) {
+			dev_err(dev, "Write bt_cmd err, hw_sync is not zero.\n");
+			spin_unlock_irqrestore(lock, flags);
+			return -EBUSY;
+		}
+
+		bt_cmd_l = cpu_to_le32(bt_ba);
+		roce_set_field(bt_cmd_h, ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_M,
 			       ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_S,
 			       bt_ba >> BT_BA_SHIFT);
 
 		bt_cmd_val[0] = bt_cmd_l;
-		bt_cmd_val[1] = bt_cmd_h_val;
+		bt_cmd_val[1] = bt_cmd_h;
 		hns_roce_write64_k(bt_cmd_val,
 				   hr_dev->reg_base + ROCEE_BT_CMD_L_REG);
 		spin_unlock_irqrestore(lock, flags);
@@ -390,7 +464,7 @@
 	buf_chunk_size = mhop.buf_chunk_size;
 	bt_chunk_size = mhop.bt_chunk_size;
 	hop_num = mhop.hop_num;
-	chunk_ba_num = bt_chunk_size / 8;
+	chunk_ba_num = bt_chunk_size / BA_BYTE_LEN;
 
 	bt_num = hns_roce_get_bt_num(table->type, hop_num);
 	switch (bt_num) {
@@ -413,6 +487,12 @@
 		return -EINVAL;
 	}
 
+	if (unlikely(hem_idx >= table->num_hem)) {
+		dev_err(dev, "Table %d exceeds hem limit, idx = %llu, max = %lu!\n",
+			     table->type, hem_idx, table->num_hem);
+		return -EINVAL;
+	}
+
 	mutex_lock(&table->mutex);
 
 	if (table->hem[hem_idx]) {
@@ -468,7 +548,7 @@
 	}
 
 	/*
-	 * alloc buffer space chunk for QPC/MTPT/CQC/SRQC.
+	 * alloc buffer space chunk for QPC/MTPT/CQC/SRQC/SCCC.
 	 * alloc bt space chunk for MTT/CQE.
 	 */
 	size = table->type < HEM_TYPE_MTT ? buf_chunk_size : bt_chunk_size;
@@ -599,7 +679,7 @@
 
 	bt_chunk_size = mhop.bt_chunk_size;
 	hop_num = mhop.hop_num;
-	chunk_ba_num = bt_chunk_size / 8;
+	chunk_ba_num = bt_chunk_size / BA_BYTE_LEN;
 
 	bt_num = hns_roce_get_bt_num(table->type, hop_num);
 	switch (bt_num) {
@@ -640,7 +720,7 @@
 	}
 
 	/*
-	 * free buffer space chunk for QPC/MTPT/CQC/SRQC.
+	 * free buffer space chunk for QPC/MTPT/CQC/SRQC/SCCC.
 	 * free bt space chunk for MTT/CQE.
 	 */
 	hns_roce_free_hem(hr_dev, table->hem[hem_idx]);
@@ -649,7 +729,7 @@
 	if (check_whether_bt_num_2(table->type, hop_num)) {
 		start_idx = mhop.l0_idx * chunk_ba_num;
 		if (hns_roce_check_hem_null(table->hem, start_idx,
-					    chunk_ba_num)) {
+					    chunk_ba_num, table->num_hem)) {
 			if (table->type < HEM_TYPE_MTT &&
 			    hr_dev->hw->clear_hem(hr_dev, table, obj, 0))
 				dev_warn(dev, "Clear HEM base address failed.\n");
@@ -663,7 +743,7 @@
 		start_idx = mhop.l0_idx * chunk_ba_num * chunk_ba_num +
 			    mhop.l1_idx * chunk_ba_num;
 		if (hns_roce_check_hem_null(table->hem, start_idx,
-					    chunk_ba_num)) {
+					    chunk_ba_num, table->num_hem)) {
 			if (hr_dev->hw->clear_hem(hr_dev, table, obj, 1))
 				dev_warn(dev, "Clear HEM base address failed.\n");
 
@@ -745,19 +825,22 @@
 		idx_offset = (obj & (table->num_obj - 1)) % obj_per_chunk;
 		dma_offset = offset = idx_offset * table->obj_size;
 	} else {
-		hns_roce_calc_hem_mhop(hr_dev, table, &mhop_obj, &mhop);
+		u32 seg_size = 64; /* 8 bytes per BA and 8 BA per segment */
+
+		if (hns_roce_calc_hem_mhop(hr_dev, table, &mhop_obj, &mhop))
+			goto out;
 		/* mtt mhop */
 		i = mhop.l0_idx;
 		j = mhop.l1_idx;
 		if (mhop.hop_num == 2)
-			hem_idx = i * (mhop.bt_chunk_size / 8) + j;
+			hem_idx = i * (mhop.bt_chunk_size / BA_BYTE_LEN) + j;
 		else if (mhop.hop_num == 1 ||
 			 mhop.hop_num == HNS_ROCE_HOP_NUM_0)
 			hem_idx = i;
 
 		hem = table->hem[hem_idx];
-		dma_offset = offset = (obj & (table->num_obj - 1)) *
-				       table->obj_size % mhop.bt_chunk_size;
+		dma_offset = offset = (obj & (table->num_obj - 1)) * seg_size %
+				       mhop.bt_chunk_size;
 		if (mhop.hop_num == 2)
 			dma_offset = offset = 0;
 	}
@@ -787,7 +870,6 @@
 	mutex_unlock(&table->mutex);
 	return addr;
 }
-EXPORT_SYMBOL_GPL(hns_roce_table_find);
 
 int hns_roce_table_get_range(struct hns_roce_dev *hr_dev,
 			     struct hns_roce_hem_table *table,
@@ -795,11 +877,13 @@
 {
 	struct hns_roce_hem_mhop mhop;
 	unsigned long inc = table->table_chunk_size / table->obj_size;
-	unsigned long i;
+	unsigned long i = 0;
 	int ret;
 
 	if (hns_roce_check_whether_mhop(hr_dev, table->type)) {
-		hns_roce_calc_hem_mhop(hr_dev, table, NULL, &mhop);
+		ret = hns_roce_calc_hem_mhop(hr_dev, table, NULL, &mhop);
+		if (ret)
+			goto fail;
 		inc = mhop.bt_chunk_size / table->obj_size;
 	}
 
@@ -829,7 +913,8 @@
 	unsigned long i;
 
 	if (hns_roce_check_whether_mhop(hr_dev, table->type)) {
-		hns_roce_calc_hem_mhop(hr_dev, table, NULL, &mhop);
+		if (hns_roce_calc_hem_mhop(hr_dev, table, NULL, &mhop))
+			return;
 		inc = mhop.bt_chunk_size / table->obj_size;
 	}
 
@@ -842,7 +927,6 @@
 			    unsigned long obj_size, unsigned long nobj,
 			    int use_lowmem)
 {
-	struct device *dev = hr_dev->dev;
 	unsigned long obj_per_chunk;
 	unsigned long num_hem;
 
@@ -855,66 +939,24 @@
 		if (!table->hem)
 			return -ENOMEM;
 	} else {
+		struct hns_roce_hem_mhop mhop = {};
 		unsigned long buf_chunk_size;
 		unsigned long bt_chunk_size;
 		unsigned long bt_chunk_num;
 		unsigned long num_bt_l0 = 0;
 		u32 hop_num;
 
-		switch (type) {
-		case HEM_TYPE_QPC:
-			buf_chunk_size = 1 << (hr_dev->caps.qpc_buf_pg_sz
-					+ PAGE_SHIFT);
-			bt_chunk_size = 1 << (hr_dev->caps.qpc_ba_pg_sz
-					+ PAGE_SHIFT);
-			num_bt_l0 = hr_dev->caps.qpc_bt_num;
-			hop_num = hr_dev->caps.qpc_hop_num;
-			break;
-		case HEM_TYPE_MTPT:
-			buf_chunk_size = 1 << (hr_dev->caps.mpt_buf_pg_sz
-					+ PAGE_SHIFT);
-			bt_chunk_size = 1 << (hr_dev->caps.mpt_ba_pg_sz
-					+ PAGE_SHIFT);
-			num_bt_l0 = hr_dev->caps.mpt_bt_num;
-			hop_num = hr_dev->caps.mpt_hop_num;
-			break;
-		case HEM_TYPE_CQC:
-			buf_chunk_size = 1 << (hr_dev->caps.cqc_buf_pg_sz
-					+ PAGE_SHIFT);
-			bt_chunk_size = 1 << (hr_dev->caps.cqc_ba_pg_sz
-					+ PAGE_SHIFT);
-			num_bt_l0 = hr_dev->caps.cqc_bt_num;
-			hop_num = hr_dev->caps.cqc_hop_num;
-			break;
-		case HEM_TYPE_SRQC:
-			buf_chunk_size = 1 << (hr_dev->caps.srqc_buf_pg_sz
-					+ PAGE_SHIFT);
-			bt_chunk_size = 1 << (hr_dev->caps.srqc_ba_pg_sz
-					+ PAGE_SHIFT);
-			num_bt_l0 = hr_dev->caps.srqc_bt_num;
-			hop_num = hr_dev->caps.srqc_hop_num;
-			break;
-		case HEM_TYPE_MTT:
-			buf_chunk_size = 1 << (hr_dev->caps.mtt_ba_pg_sz
-					+ PAGE_SHIFT);
-			bt_chunk_size = buf_chunk_size;
-			hop_num = hr_dev->caps.mtt_hop_num;
-			break;
-		case HEM_TYPE_CQE:
-			buf_chunk_size = 1 << (hr_dev->caps.cqe_ba_pg_sz
-					+ PAGE_SHIFT);
-			bt_chunk_size = buf_chunk_size;
-			hop_num = hr_dev->caps.cqe_hop_num;
-			break;
-		default:
-			dev_err(dev,
-			  "Table %d not support to init hem table here!\n",
-			  type);
+		if (get_hem_table_config(hr_dev, &mhop, type))
 			return -EINVAL;
-		}
+
+		buf_chunk_size = mhop.buf_chunk_size;
+		bt_chunk_size = mhop.bt_chunk_size;
+		num_bt_l0 = mhop.ba_l0_num;
+		hop_num = mhop.hop_num;
+
 		obj_per_chunk = buf_chunk_size / obj_size;
 		num_hem = (nobj + obj_per_chunk - 1) / obj_per_chunk;
-		bt_chunk_num = bt_chunk_size / 8;
+		bt_chunk_num = bt_chunk_size / BA_BYTE_LEN;
 		if (type >= HEM_TYPE_MTT)
 			num_bt_l0 = bt_chunk_num;
 
@@ -994,7 +1036,8 @@
 	int i;
 	u64 obj;
 
-	hns_roce_calc_hem_mhop(hr_dev, table, NULL, &mhop);
+	if (hns_roce_calc_hem_mhop(hr_dev, table, NULL, &mhop))
+		return;
 	buf_chunk_size = table->type < HEM_TYPE_MTT ? mhop.buf_chunk_size :
 					mhop.bt_chunk_size;
 
@@ -1041,7 +1084,25 @@
 
 void hns_roce_cleanup_hem(struct hns_roce_dev *hr_dev)
 {
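+	/* optional tables are torn down only when the corresponding
+	 * capability was configured at init time
+	 */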
+	if (hr_dev->caps.num_idx_segs)
+		hns_roce_cleanup_hem_table(hr_dev,
+					   &hr_dev->mr_table.mtt_idx_table);
+	if (hr_dev->caps.num_srqwqe_segs)
+		hns_roce_cleanup_hem_table(hr_dev,
+					   &hr_dev->mr_table.mtt_srqwqe_table);
+	if (hr_dev->caps.srqc_entry_sz)
+		hns_roce_cleanup_hem_table(hr_dev,
+					   &hr_dev->srq_table.table);
 	hns_roce_cleanup_hem_table(hr_dev, &hr_dev->cq_table.table);
+	if (hr_dev->caps.qpc_timer_entry_sz)
+		hns_roce_cleanup_hem_table(hr_dev,
+					   &hr_dev->qpc_timer_table);
+	if (hr_dev->caps.cqc_timer_entry_sz)
+		hns_roce_cleanup_hem_table(hr_dev,
+					   &hr_dev->cqc_timer_table);
+	if (hr_dev->caps.sccc_entry_sz)
+		hns_roce_cleanup_hem_table(hr_dev,
+					   &hr_dev->qp_table.sccc_table);
 	if (hr_dev->caps.trrl_entry_sz)
 		hns_roce_cleanup_hem_table(hr_dev,
 					   &hr_dev->qp_table.trrl_table);
@@ -1053,3 +1114,463 @@
 					   &hr_dev->mr_table.mtt_cqe_table);
 	hns_roce_cleanup_hem_table(hr_dev, &hr_dev->mr_table.mtt_table);
 }
+
+struct roce_hem_item {
+	struct list_head list; /* link all hems in the same bt level */
+	struct list_head sibling; /* link all hems in last hop for mtt */
+	void *addr;
+	dma_addr_t dma_addr;
+	size_t count; /* max ba numbers */
+	int start; /* start buf offset in this hem */
+	int end; /* end buf offset in this hem */
+};
+
+static struct roce_hem_item *hem_list_alloc_item(struct hns_roce_dev *hr_dev,
+						   int start, int end,
+						   int count, bool exist_bt,
+						   int bt_level)
+{
+	struct roce_hem_item *hem;
+
+	hem = kzalloc(sizeof(*hem), GFP_KERNEL);
+	if (!hem)
+		return NULL;
+
+	if (exist_bt) {
+		hem->addr = dma_alloc_coherent(hr_dev->dev,
+						   count * BA_BYTE_LEN,
+						   &hem->dma_addr, GFP_KERNEL);
+		if (!hem->addr) {
+			kfree(hem);
+			return NULL;
+		}
+	}
+
+	hem->count = count;
+	hem->start = start;
+	hem->end = end;
+	INIT_LIST_HEAD(&hem->list);
+	INIT_LIST_HEAD(&hem->sibling);
+
+	return hem;
+}
+
+static void hem_list_free_item(struct hns_roce_dev *hr_dev,
+			       struct roce_hem_item *hem, bool exist_bt)
+{
+	if (exist_bt)
+		dma_free_coherent(hr_dev->dev, hem->count * BA_BYTE_LEN,
+				  hem->addr, hem->dma_addr);
+	kfree(hem);
+}
+
+static void hem_list_free_all(struct hns_roce_dev *hr_dev,
+			      struct list_head *head, bool exist_bt)
+{
+	struct roce_hem_item *hem, *temp_hem;
+
+	list_for_each_entry_safe(hem, temp_hem, head, list) {
+		list_del(&hem->list);
+		hem_list_free_item(hr_dev, hem, exist_bt);
+	}
+}
+
+static void hem_list_link_bt(struct hns_roce_dev *hr_dev, void *base_addr,
+			     u64 table_addr)
+{
+	*(u64 *)(base_addr) = table_addr;
+}
+
+/* assign L0 table address to hem from root bt */
+static void hem_list_assign_bt(struct hns_roce_dev *hr_dev,
+			       struct roce_hem_item *hem, void *cpu_addr,
+			       u64 phy_addr)
+{
+	hem->addr = cpu_addr;
+	hem->dma_addr = (dma_addr_t)phy_addr;
+}
+
+static inline bool hem_list_page_is_in_range(struct roce_hem_item *hem,
+					     int offset)
+{
+	return (hem->start <= offset && offset <= hem->end);
+}
+
+static struct roce_hem_item *hem_list_search_item(struct list_head *ba_list,
+						    int page_offset)
+{
+	struct roce_hem_item *hem, *temp_hem;
+	struct roce_hem_item *found = NULL;
+
+	list_for_each_entry_safe(hem, temp_hem, ba_list, list) {
+		if (hem_list_page_is_in_range(hem, page_offset)) {
+			found = hem;
+			break;
+		}
+	}
+
+	return found;
+}
+
+static bool hem_list_is_bottom_bt(int hopnum, int bt_level)
+{
+	/*
+	 * hopnum    base address table levels
+	 * 0		L0(buf)
+	 * 1		L0 -> buf
+	 * 2		L0 -> L1 -> buf
+	 * 3		L0 -> L1 -> L2 -> buf
+	 */
+	return bt_level >= (hopnum ? hopnum - 1 : hopnum);
+}
+
+/**
+ * calculate the number of base address entries
+ * @hopnum: hop num of multihop addressing
+ * @bt_level: base address table level
+ * @unit: ba entries per bt page
+ */
+static u32 hem_list_calc_ba_range(int hopnum, int bt_level, int unit)
+{
+	u32 step;
+	int max;
+	int i;
+
+	if (hopnum <= bt_level)
+		return 0;
+	/*
+	 * hopnum  bt_level   range
+	 * 1	      0       unit
+	 * ------------
+	 * 2	      0       unit * unit
+	 * 2	      1       unit
+	 * ------------
+	 * 3	      0       unit * unit * unit
+	 * 3	      1       unit * unit
+	 * 3	      2       unit
+	 */
+	step = 1;
+	max = hopnum - bt_level;
+	for (i = 0; i < max; i++)
+		step = step * unit;
+
+	return step;
+}
+
+/**
+ * calculate the number of root ba entries required to cover all regions
+ * @regions: buf region array
+ * @region_cnt: array size of @regions
+ * @unit: ba entries per bt page
+ */
+int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions,
+				   int region_cnt, int unit)
+{
+	struct hns_roce_buf_region *r;
+	int total = 0;
+	int step;
+	int i;
+
+	for (i = 0; i < region_cnt; i++) {
+		r = (struct hns_roce_buf_region *)&regions[i];
+		if (r->hopnum > 1) {
+			step = hem_list_calc_ba_range(r->hopnum, 1, unit);
+			if (step > 0)
+				total += (r->count + step - 1) / step;
+		} else {
+			total += r->count;
+		}
+	}
+
+	return total;
+}
+
+static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev,
+				 const struct hns_roce_buf_region *r, int unit,
+				 int offset, struct list_head *mid_bt,
+				 struct list_head *btm_bt)
+{
+	struct roce_hem_item *hem_ptrs[HNS_ROCE_MAX_BT_LEVEL] = { NULL };
+	struct list_head temp_list[HNS_ROCE_MAX_BT_LEVEL];
+	struct roce_hem_item *cur, *pre;
+	const int hopnum = r->hopnum;
+	int start_aligned;
+	int distance;
+	int ret = 0;
+	int max_ofs;
+	int level;
+	u32 step;
+	int end;
+
+	if (hopnum <= 1)
+		return 0;
+
+	if (hopnum > HNS_ROCE_MAX_BT_LEVEL) {
+		dev_err(hr_dev->dev, "invalid hopnum %d!\n", hopnum);
+		return -EINVAL;
+	}
+
+	if (offset < r->offset) {
+		dev_err(hr_dev->dev, "invalid offset %d, min %d!\n",
+			offset, r->offset);
+		return -EINVAL;
+	}
+
+	distance = offset - r->offset;
+	max_ofs = r->offset + r->count - 1;
+	for (level = 0; level < hopnum; level++)
+		INIT_LIST_HEAD(&temp_list[level]);
+
+	/* config L1 bt to last bt and link them to corresponding parent */
+	for (level = 1; level < hopnum; level++) {
+		cur = hem_list_search_item(&mid_bt[level], offset);
+		if (cur) {
+			hem_ptrs[level] = cur;
+			continue;
+		}
+
+		step = hem_list_calc_ba_range(hopnum, level, unit);
+		if (step < 1) {
+			ret = -EINVAL;
+			goto err_exit;
+		}
+
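+		/* align the range covered by this bt down to a multiple of
+		 * step pages within the region
+		 */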
+		start_aligned = (distance / step) * step + r->offset;
+		end = min_t(int, start_aligned + step - 1, max_ofs);
+		cur = hem_list_alloc_item(hr_dev, start_aligned, end, unit,
+					  true, level);
+		if (!cur) {
+			ret = -ENOMEM;
+			goto err_exit;
+		}
+		hem_ptrs[level] = cur;
+		list_add(&cur->list, &temp_list[level]);
+		if (hem_list_is_bottom_bt(hopnum, level))
+			list_add(&cur->sibling, &temp_list[0]);
+
+		/* link bt to parent bt */
+		if (level > 1) {
+			pre = hem_ptrs[level - 1];
+			step = (cur->start - pre->start) / step * BA_BYTE_LEN;
+			hem_list_link_bt(hr_dev, pre->addr + step,
+					 cur->dma_addr);
+		}
+	}
+
+	list_splice(&temp_list[0], btm_bt);
+	for (level = 1; level < hopnum; level++)
+		list_splice(&temp_list[level], &mid_bt[level]);
+
+	return 0;
+
+err_exit:
+	for (level = 1; level < hopnum; level++)
+		hem_list_free_all(hr_dev, &temp_list[level], true);
+
+	return ret;
+}
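/*
 * A worked example of the alignment done above (numbers are illustrative):
 * with r->offset = 100, offset = 1300 and step = 512, distance = 1200, so
 * start_aligned = (1200 / 512) * 512 + 100 = 1124 and the new table covers
 * pages [1124, min(1124 + 511, max_ofs)], which contains offset 1300.
 */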
+
+static int hem_list_alloc_root_bt(struct hns_roce_dev *hr_dev,
+				  struct hns_roce_hem_list *hem_list, int unit,
+				  const struct hns_roce_buf_region *regions,
+				  int region_cnt)
+{
+	struct roce_hem_item *hem, *temp_hem, *root_hem;
+	struct list_head temp_list[HNS_ROCE_MAX_BT_REGION];
+	const struct hns_roce_buf_region *r;
+	struct list_head temp_root;
+	struct list_head temp_btm;
+	void *cpu_base;
+	u64 phy_base;
+	int ret = 0;
+	int offset;
+	int total;
+	int step;
+	int i;
+
+	r = &regions[0];
+	root_hem = hem_list_search_item(&hem_list->root_bt, r->offset);
+	if (root_hem)
+		return 0;
+
+	INIT_LIST_HEAD(&temp_root);
+	total = r->offset;
+	/* point to the last region */
+	r = &regions[region_cnt - 1];
+	root_hem = hem_list_alloc_item(hr_dev, total, r->offset + r->count - 1,
+				       unit, true, 0);
+	if (!root_hem)
+		return -ENOMEM;
+	list_add(&root_hem->list, &temp_root);
+
+	hem_list->root_ba = root_hem->dma_addr;
+
+	INIT_LIST_HEAD(&temp_btm);
+	for (i = 0; i < region_cnt; i++)
+		INIT_LIST_HEAD(&temp_list[i]);
+
+	total = 0;
+	for (i = 0; i < region_cnt && total < unit; i++) {
+		r = &regions[i];
+		if (!r->count)
+			continue;
+
+		/* all regions' mid[x][0] tables share the root_bt's trunk */
+		cpu_base = root_hem->addr + total * BA_BYTE_LEN;
+		phy_base = root_hem->dma_addr + total * BA_BYTE_LEN;
+
+		/* if hopnum is 0 or 1, carve a fake hem out of the root bt;
+		 * the root table's memory is shared among all such regions.
+		 */
+		if (hem_list_is_bottom_bt(r->hopnum, 0)) {
+			hem = hem_list_alloc_item(hr_dev, r->offset,
+						  r->offset + r->count - 1,
+						  r->count, false, 0);
+			if (!hem) {
+				ret = -ENOMEM;
+				goto err_exit;
+			}
+			hem_list_assign_bt(hr_dev, hem, cpu_base, phy_base);
+			list_add(&hem->list, &temp_list[i]);
+			list_add(&hem->sibling, &temp_btm);
+			total += r->count;
+		} else {
+			step = hem_list_calc_ba_range(r->hopnum, 1, unit);
+			if (step < 1) {
+				ret = -EINVAL;
+				goto err_exit;
+			}
+			/* if mid-level BTs exist, link each L1 table to L0 */
+			list_for_each_entry_safe(hem, temp_hem,
+					  &hem_list->mid_bt[i][1], list) {
+				offset = hem->start / step * BA_BYTE_LEN;
+				hem_list_link_bt(hr_dev, cpu_base + offset,
+						 hem->dma_addr);
+				total++;
+			}
+		}
+	}
+
+	list_splice(&temp_btm, &hem_list->btm_bt);
+	list_splice(&temp_root, &hem_list->root_bt);
+	for (i = 0; i < region_cnt; i++)
+		list_splice(&temp_list[i], &hem_list->mid_bt[i][0]);
+
+	return 0;
+
+err_exit:
+	for (i = 0; i < region_cnt; i++)
+		hem_list_free_all(hr_dev, &temp_list[i], false);
+
+	hem_list_free_all(hr_dev, &temp_root, true);
+
+	return ret;
+}
+
+/* construct the base address tables and link them according to the hop config */
+int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev,
+			      struct hns_roce_hem_list *hem_list,
+			      const struct hns_roce_buf_region *regions,
+			      int region_cnt)
+{
+	const struct hns_roce_buf_region *r;
+	int ofs, end;
+	int ret = 0;
+	int unit;
+	int i;
+
+	if (region_cnt > HNS_ROCE_MAX_BT_REGION) {
+		dev_err(hr_dev->dev, "invalid region region_cnt %d!\n",
+			region_cnt);
+		return -EINVAL;
+	}
+
+	unit = (1 << hem_list->bt_pg_shift) / BA_BYTE_LEN;
+	for (i = 0; i < region_cnt; i++) {
+		r = &regions[i];
+		if (!r->count)
+			continue;
+
+		end = r->offset + r->count;
+		for (ofs = r->offset; ofs < end; ofs += unit) {
+			ret = hem_list_alloc_mid_bt(hr_dev, r, unit, ofs,
+						    hem_list->mid_bt[i],
+						    &hem_list->btm_bt);
+			if (ret) {
+				dev_err(hr_dev->dev,
+					"alloc hem trunk fail ret=%d!\n", ret);
+				goto err_alloc;
+			}
+		}
+	}
+
+	ret = hem_list_alloc_root_bt(hr_dev, hem_list, unit, regions,
+				     region_cnt);
+	if (ret)
+		dev_err(hr_dev->dev, "alloc hem root fail ret=%d!\n", ret);
+	else
+		return 0;
+
+err_alloc:
+	hns_roce_hem_list_release(hr_dev, hem_list);
+
+	return ret;
+}
+
+void hns_roce_hem_list_release(struct hns_roce_dev *hr_dev,
+			       struct hns_roce_hem_list *hem_list)
+{
+	int i, j;
+
+	for (i = 0; i < HNS_ROCE_MAX_BT_REGION; i++)
+		for (j = 0; j < HNS_ROCE_MAX_BT_LEVEL; j++)
+			hem_list_free_all(hr_dev, &hem_list->mid_bt[i][j],
+					  j != 0);
+
+	hem_list_free_all(hr_dev, &hem_list->root_bt, true);
+	INIT_LIST_HEAD(&hem_list->btm_bt);
+	hem_list->root_ba = 0;
+}
+
+void hns_roce_hem_list_init(struct hns_roce_hem_list *hem_list,
+			    int bt_page_order)
+{
+	int i, j;
+
+	INIT_LIST_HEAD(&hem_list->root_bt);
+	INIT_LIST_HEAD(&hem_list->btm_bt);
+	for (i = 0; i < HNS_ROCE_MAX_BT_REGION; i++)
+		for (j = 0; j < HNS_ROCE_MAX_BT_LEVEL; j++)
+			INIT_LIST_HEAD(&hem_list->mid_bt[i][j]);
+
+	hem_list->bt_pg_shift = bt_page_order;
+}
+
+void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev,
+				 struct hns_roce_hem_list *hem_list,
+				 int offset, int *mtt_cnt, u64 *phy_addr)
+{
+	struct list_head *head = &hem_list->btm_bt;
+	struct roce_hem_item *hem, *temp_hem;
+	void *cpu_base = NULL;
+	u64 phy_base = 0;
+	int nr = 0;
+
+	list_for_each_entry_safe(hem, temp_hem, head, sibling) {
+		if (hem_list_page_is_in_range(hem, offset)) {
+			nr = offset - hem->start;
+			cpu_base = hem->addr + nr * BA_BYTE_LEN;
+			phy_base = hem->dma_addr + nr * BA_BYTE_LEN;
+			nr = hem->end + 1 - offset;
+			break;
+		}
+	}
+
+	if (mtt_cnt)
+		*mtt_cnt = nr;
+
+	if (phy_addr)
+		*phy_addr = phy_base;
+
+	return cpu_base;
+}
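/*
 * A minimal usage sketch of the hem_list API added above. The
 * example_map_regions() name and its error handling are illustrative only;
 * the regions array and hr_dev are assumed to be prepared elsewhere in the
 * driver, and the called functions are the ones declared in hns_roce_hem.h.
 */
static int example_map_regions(struct hns_roce_dev *hr_dev,
			       struct hns_roce_hem_list *hem_list,
			       const struct hns_roce_buf_region *regions,
			       int region_cnt, int bt_page_order)
{
	u64 phy_addr;
	int mtt_cnt;
	int ret;

	hns_roce_hem_list_init(hem_list, bt_page_order);

	ret = hns_roce_hem_list_request(hr_dev, hem_list, regions, region_cnt);
	if (ret)
		return ret;

	/* look up the bottom-level slice that covers page offset 0 */
	if (!hns_roce_hem_list_find_mtt(hr_dev, hem_list, 0, &mtt_cnt,
					&phy_addr))
		ret = -ENOMEM;

	hns_roce_hem_list_release(hr_dev, hem_list);
	return ret;
}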
diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.h b/drivers/infiniband/hw/hns/hns_roce_hem.h
index e8850d5..3bb8f78 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.h
@@ -34,8 +34,8 @@
 #ifndef _HNS_ROCE_HEM_H
 #define _HNS_ROCE_HEM_H
 
-#define HW_SYNC_TIMEOUT_MSECS		500
 #define HW_SYNC_SLEEP_TIME_INTERVAL	20
+#define HW_SYNC_TIMEOUT_MSECS           (25 * HW_SYNC_SLEEP_TIME_INTERVAL)
 #define BT_CMD_SYNC_SHIFT		31
 
 enum {
@@ -44,17 +44,22 @@
 	HEM_TYPE_MTPT,
 	HEM_TYPE_CQC,
 	HEM_TYPE_SRQC,
+	HEM_TYPE_SCCC,
+	HEM_TYPE_QPC_TIMER,
+	HEM_TYPE_CQC_TIMER,
 
 	 /* UNMAP HEM */
 	HEM_TYPE_MTT,
 	HEM_TYPE_CQE,
+	HEM_TYPE_SRQWQE,
+	HEM_TYPE_IDX,
 	HEM_TYPE_IRRL,
 	HEM_TYPE_TRRL,
 };
 
 #define HNS_ROCE_HEM_CHUNK_LEN	\
 	 ((256 - sizeof(struct list_head) - 2 * sizeof(int)) /	 \
-	 (sizeof(struct scatterlist)))
+	 (sizeof(struct scatterlist) + sizeof(void *)))
 
 #define check_whether_bt_num_3(type, hop_num) \
 	(type < HEM_TYPE_MTT && hop_num == 2)
@@ -97,9 +102,9 @@
 	u32	buf_chunk_size;
 	u32	bt_chunk_size;
 	u32	ba_l0_num;
-	u32	l0_idx;/* level 0 base address table index */
-	u32	l1_idx;/* level 1 base address table index */
-	u32	l2_idx;/* level 2 base address table index */
+	u32	l0_idx; /* level 0 base address table index */
+	u32	l1_idx; /* level 1 base address table index */
+	u32	l2_idx; /* level 2 base address table index */
 };
 
 void hns_roce_free_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem *hem);
@@ -128,6 +133,20 @@
 			   struct hns_roce_hem_mhop *mhop);
 bool hns_roce_check_whether_mhop(struct hns_roce_dev *hr_dev, u32 type);
 
+void hns_roce_hem_list_init(struct hns_roce_hem_list *hem_list,
+			    int bt_page_order);
+int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions,
+				   int region_cnt, int unit);
+int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev,
+			      struct hns_roce_hem_list *hem_list,
+			      const struct hns_roce_buf_region *regions,
+			      int region_cnt);
+void hns_roce_hem_list_release(struct hns_roce_dev *hr_dev,
+			       struct hns_roce_hem_list *hem_list);
+void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev,
+				 struct hns_roce_hem_list *hem_list,
+				 int offset, int *mtt_cnt, u64 *phy_addr);
+
 static inline void hns_roce_hem_first(struct hns_roce_hem *hem,
 				      struct hns_roce_hem_iter *iter)
 {
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index 081aa91..5f74bf5 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -73,7 +73,7 @@
 	int ps_opcode = 0, i = 0;
 	unsigned long flags = 0;
 	void *wqe = NULL;
-	u32 doorbell[2];
+	__le32 doorbell[2];
 	int nreq = 0;
 	u32 ind = 0;
 	int ret = 0;
@@ -175,13 +175,11 @@
 			roce_set_field(ud_sq_wqe->u32_36,
 				       UD_SEND_WQE_U32_36_FLOW_LABEL_M,
 				       UD_SEND_WQE_U32_36_FLOW_LABEL_S,
-				       ah->av.sl_tclass_flowlabel &
-				       HNS_ROCE_FLOW_LABEL_MASK);
+				       ah->av.flowlabel);
 			roce_set_field(ud_sq_wqe->u32_36,
 				      UD_SEND_WQE_U32_36_PRIORITY_M,
 				      UD_SEND_WQE_U32_36_PRIORITY_S,
-				      le32_to_cpu(ah->av.sl_tclass_flowlabel) >>
-				      HNS_ROCE_SL_SHIFT);
+				      ah->av.sl);
 			roce_set_field(ud_sq_wqe->u32_36,
 				       UD_SEND_WQE_U32_36_SGID_INDEX_M,
 				       UD_SEND_WQE_U32_36_SGID_INDEX_S,
@@ -195,8 +193,7 @@
 			roce_set_field(ud_sq_wqe->u32_40,
 				       UD_SEND_WQE_U32_40_TRAFFIC_CLASS_M,
 				       UD_SEND_WQE_U32_40_TRAFFIC_CLASS_S,
-				       ah->av.sl_tclass_flowlabel >>
-				       HNS_ROCE_TCLASS_SHIFT);
+				       ah->av.tclass);
 
 			memcpy(&ud_sq_wqe->dgid[0], &ah->av.dgid[0], GID_LEN);
 
@@ -335,10 +332,10 @@
 			       SQ_DOORBELL_U32_8_QPN_S, qp->doorbell_qpn);
 		roce_set_bit(sq_db.u32_8, SQ_DOORBELL_HW_SYNC_S, 1);
 
-		doorbell[0] = le32_to_cpu(sq_db.u32_4);
-		doorbell[1] = le32_to_cpu(sq_db.u32_8);
+		doorbell[0] = sq_db.u32_4;
+		doorbell[1] = sq_db.u32_8;
 
-		hns_roce_write64_k((__le32 *)doorbell, qp->sq.db_reg_l);
+		hns_roce_write64_k(doorbell, qp->sq.db_reg_l);
 		qp->sq_next_wqe = ind;
 	}
 
@@ -363,7 +360,7 @@
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct device *dev = &hr_dev->pdev->dev;
 	struct hns_roce_rq_db rq_db;
-	uint32_t doorbell[2] = {0};
+	__le32 doorbell[2] = {0};
 
 	spin_lock_irqsave(&hr_qp->rq.lock, flags);
 	ind = hr_qp->rq.head & (hr_qp->rq.wqe_cnt - 1);
@@ -437,11 +434,10 @@
 			roce_set_bit(rq_db.u32_8, RQ_DOORBELL_U32_8_HW_SYNC_S,
 				     1);
 
-			doorbell[0] = le32_to_cpu(rq_db.u32_4);
-			doorbell[1] = le32_to_cpu(rq_db.u32_8);
+			doorbell[0] = rq_db.u32_4;
+			doorbell[1] = rq_db.u32_8;
 
-			hns_roce_write64_k((__le32 *)doorbell,
-					   hr_qp->rq.db_reg_l);
+			hns_roce_write64_k(doorbell, hr_qp->rq.db_reg_l);
 		}
 	}
 	spin_unlock_irqrestore(&hr_qp->rq.lock, flags);
@@ -711,13 +707,14 @@
 	struct ib_qp_attr attr = { 0 };
 	struct hns_roce_v1_priv *priv;
 	struct hns_roce_qp *hr_qp;
+	struct ib_device *ibdev;
 	struct ib_cq *cq;
 	struct ib_pd *pd;
 	union ib_gid dgid;
-	u64 subnet_prefix;
+	__be64 subnet_prefix;
 	int attr_mask = 0;
-	int i, j;
 	int ret;
+	int i, j;
 	u8 queue_en[HNS_ROCE_V1_RESV_QP] = { 0 };
 	u8 phy_port;
 	u8 port = 0;
@@ -729,10 +726,16 @@
 	/* Reserved cq for loop qp */
 	cq_init_attr.cqe		= HNS_ROCE_MIN_WQE_NUM * 2;
 	cq_init_attr.comp_vector	= 0;
-	cq = hns_roce_ib_create_cq(&hr_dev->ib_dev, &cq_init_attr, NULL, NULL);
-	if (IS_ERR(cq)) {
-		dev_err(dev, "Create cq for reseved loop qp failed!");
+
+	ibdev = &hr_dev->ib_dev;
+	cq = rdma_zalloc_drv_obj(ibdev, ib_cq);
+	if (!cq)
 		return -ENOMEM;
+
+	ret = hns_roce_ib_create_cq(cq, &cq_init_attr, NULL);
+	if (ret) {
+		dev_err(dev, "Create cq for reserved loop qp failed!");
+		goto alloc_cq_failed;
 	}
 	free_mr->mr_free_cq = to_hr_cq(cq);
 	free_mr->mr_free_cq->ib_cq.device		= &hr_dev->ib_dev;
@@ -742,12 +745,17 @@
 	free_mr->mr_free_cq->ib_cq.cq_context		= NULL;
 	atomic_set(&free_mr->mr_free_cq->ib_cq.usecnt, 0);
 
-	pd = hns_roce_alloc_pd(&hr_dev->ib_dev, NULL, NULL);
-	if (IS_ERR(pd)) {
-		dev_err(dev, "Create pd for reseved loop qp failed!");
+	pd = rdma_zalloc_drv_obj(ibdev, ib_pd);
+	if (!pd) {
 		ret = -ENOMEM;
-		goto alloc_pd_failed;
+		goto alloc_mem_failed;
 	}
+
+	pd->device  = ibdev;
+	ret = hns_roce_alloc_pd(pd, NULL);
+	if (ret)
+		goto alloc_pd_failed;
+
 	free_mr->mr_free_pd = to_hr_pd(pd);
 	free_mr->mr_free_pd->ibpd.device  = &hr_dev->ib_dev;
 	free_mr->mr_free_pd->ibpd.uobject = NULL;
@@ -813,7 +821,7 @@
 		attr.dest_qp_num	= hr_qp->qpn;
 		memcpy(rdma_ah_retrieve_dmac(&attr.ah_attr),
 		       hr_dev->dev_addr[port],
-		       MAC_ADDR_OCTET_NUM);
+		       ETH_ALEN);
 
 		memcpy(&dgid.raw, &subnet_prefix, sizeof(u64));
 		memcpy(&dgid.raw[8], hr_dev->dev_addr[port], 3);
@@ -850,17 +858,19 @@
 create_lp_qp_failed:
 	for (i -= 1; i >= 0; i--) {
 		hr_qp = free_mr->mr_free_qp[i];
-		if (hns_roce_v1_destroy_qp(&hr_qp->ibqp))
+		if (hns_roce_v1_destroy_qp(&hr_qp->ibqp, NULL))
 			dev_err(dev, "Destroy qp %d for mr free failed!\n", i);
 	}
 
-	if (hns_roce_dealloc_pd(pd))
-		dev_err(dev, "Destroy pd for create_lp_qp failed!\n");
+	hns_roce_dealloc_pd(pd, NULL);
 
 alloc_pd_failed:
-	if (hns_roce_ib_destroy_cq(cq))
-		dev_err(dev, "Destroy cq for create_lp_qp failed!\n");
+	kfree(pd);
 
+alloc_mem_failed:
+	hns_roce_ib_destroy_cq(cq, NULL);
+alloc_cq_failed:
+	kfree(cq);
 	return ret;
 }
 
@@ -881,19 +891,16 @@
 		if (!hr_qp)
 			continue;
 
-		ret = hns_roce_v1_destroy_qp(&hr_qp->ibqp);
+		ret = hns_roce_v1_destroy_qp(&hr_qp->ibqp, NULL);
 		if (ret)
 			dev_err(dev, "Destroy qp %d for mr free failed(%d)!\n",
 				i, ret);
 	}
 
-	ret = hns_roce_ib_destroy_cq(&free_mr->mr_free_cq->ib_cq);
-	if (ret)
-		dev_err(dev, "Destroy cq for mr_free failed(%d)!\n", ret);
-
-	ret = hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd);
-	if (ret)
-		dev_err(dev, "Destroy pd for mr_free failed(%d)!\n", ret);
+	hns_roce_ib_destroy_cq(&free_mr->mr_free_cq->ib_cq, NULL);
+	kfree(&free_mr->mr_free_cq->ib_cq);
+	hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd, NULL);
+	kfree(&free_mr->mr_free_pd->ibpd);
 }
 
 static int hns_roce_db_init(struct hns_roce_dev *hr_dev)
@@ -960,8 +967,7 @@
 	struct hns_roce_free_mr *free_mr;
 	struct hns_roce_v1_priv *priv;
 	struct completion comp;
-	unsigned long end =
-	  msecs_to_jiffies(HNS_ROCE_V1_RECREATE_LP_QP_TIMEOUT_MSECS) + jiffies;
+	long end = HNS_ROCE_V1_RECREATE_LP_QP_TIMEOUT_MSECS;
 
 	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
 	free_mr = &priv->free_mr;
@@ -981,10 +987,11 @@
 
 	queue_work(free_mr->free_mr_wq, &(lp_qp_work->work));
 
-	while (time_before_eq(jiffies, end)) {
+	while (end > 0) {
 		if (try_wait_for_completion(&comp))
 			return 0;
 		msleep(HNS_ROCE_V1_RECREATE_LP_QP_WAIT_VALUE);
+		end -= HNS_ROCE_V1_RECREATE_LP_QP_WAIT_VALUE;
 	}
 
 	lp_qp_work->comp_flag = 0;
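/*
 * The wait loop above (and the free-MR wait further below) now counts a
 * millisecond budget down instead of comparing against jiffies; a minimal
 * sketch of that pattern follows. The wait_for_comp() name and return
 * convention are illustrative, not part of the driver.
 */
static int wait_for_comp(struct completion *comp, long budget_ms, int step_ms)
{
	while (budget_ms > 0) {
		if (try_wait_for_completion(comp))
			return 0;
		msleep(step_ms);
		budget_ms -= step_ms;
	}

	return -ETIMEDOUT;
}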
@@ -1091,15 +1098,14 @@
 }
 
 static int hns_roce_v1_dereg_mr(struct hns_roce_dev *hr_dev,
-				struct hns_roce_mr *mr)
+				struct hns_roce_mr *mr, struct ib_udata *udata)
 {
 	struct device *dev = &hr_dev->pdev->dev;
 	struct hns_roce_mr_free_work *mr_work;
 	struct hns_roce_free_mr *free_mr;
 	struct hns_roce_v1_priv *priv;
 	struct completion comp;
-	unsigned long end =
-		msecs_to_jiffies(HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS) + jiffies;
+	long end = HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS;
 	unsigned long start = jiffies;
 	int npages;
 	int ret = 0;
@@ -1129,10 +1135,11 @@
 
 	queue_work(free_mr->free_mr_wq, &(mr_work->work));
 
-	while (time_before_eq(jiffies, end)) {
+	while (end > 0) {
 		if (try_wait_for_completion(&comp))
 			goto free_mr;
 		msleep(HNS_ROCE_V1_FREE_MR_WAIT_VALUE);
+		end -= HNS_ROCE_V1_FREE_MR_WAIT_VALUE;
 	}
 
 	mr_work->comp_flag = 0;
@@ -1155,8 +1162,7 @@
 	hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap,
 			     key_to_hw_index(mr->key), 0);
 
-	if (mr->umem)
-		ib_umem_release(mr->umem);
+	ib_umem_release(mr->umem);
 
 	kfree(mr);
 
@@ -1506,38 +1512,6 @@
 	return ret;
 }
 
-static int hns_roce_des_qp_init(struct hns_roce_dev *hr_dev)
-{
-	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_v1_priv *priv;
-	struct hns_roce_des_qp *des_qp;
-
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	des_qp = &priv->des_qp;
-
-	des_qp->requeue_flag = 1;
-	des_qp->qp_wq = create_singlethread_workqueue("hns_roce_destroy_qp");
-	if (!des_qp->qp_wq) {
-		dev_err(dev, "Create destroy qp workqueue failed!\n");
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-static void hns_roce_des_qp_free(struct hns_roce_dev *hr_dev)
-{
-	struct hns_roce_v1_priv *priv;
-	struct hns_roce_des_qp *des_qp;
-
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	des_qp = &priv->des_qp;
-
-	des_qp->requeue_flag = 0;
-	flush_workqueue(des_qp->qp_wq);
-	destroy_workqueue(des_qp->qp_wq);
-}
-
 static int hns_roce_v1_profile(struct hns_roce_dev *hr_dev)
 {
 	int i = 0;
@@ -1583,6 +1557,7 @@
 	caps->reserved_mrws	= 1;
 	caps->reserved_uars	= 0;
 	caps->reserved_cqs	= 0;
+	caps->reserved_qps	= 12; /* 2 SQPs per port, 6 ports, 12 in total */
 	caps->chunk_sz		= HNS_ROCE_V1_TABLE_CHUNK_SIZE;
 
 	for (i = 0; i < caps->num_ports; i++)
@@ -1656,12 +1631,6 @@
 		goto error_failed_tptr_init;
 	}
 
-	ret = hns_roce_des_qp_init(hr_dev);
-	if (ret) {
-		dev_err(dev, "des qp init failed!\n");
-		goto error_failed_des_qp_init;
-	}
-
 	ret = hns_roce_free_mr_init(hr_dev);
 	if (ret) {
 		dev_err(dev, "free mr init failed!\n");
@@ -1673,9 +1642,6 @@
 	return 0;
 
 error_failed_free_mr_init:
-	hns_roce_des_qp_free(hr_dev);
-
-error_failed_des_qp_init:
 	hns_roce_tptr_free(hr_dev);
 
 error_failed_tptr_init:
@@ -1693,7 +1659,6 @@
 {
 	hns_roce_port_enable(hr_dev, HNS_ROCE_PORT_DOWN);
 	hns_roce_free_mr_free(hr_dev);
-	hns_roce_des_qp_free(hr_dev);
 	hns_roce_tptr_free(hr_dev);
 	hns_roce_bt_free(hr_dev);
 	hns_roce_raq_free(hr_dev);
@@ -1745,8 +1710,6 @@
 
 	writel(val, hcr + 5);
 
-	mmiowb();
-
 	return 0;
 }
 
@@ -1780,11 +1743,14 @@
 			       int gid_index, const union ib_gid *gid,
 			       const struct ib_gid_attr *attr)
 {
+	unsigned long flags;
 	u32 *p = NULL;
 	u8 gid_idx = 0;
 
 	gid_idx = hns_get_gid_index(hr_dev, port, gid_index);
 
+	spin_lock_irqsave(&hr_dev->iboe.lock, flags);
+
 	p = (u32 *)&gid->raw[0];
 	roce_raw_write(*p, hr_dev->reg_base + ROCEE_PORT_GID_L_0_REG +
 		       (HNS_ROCE_V1_GID_NUM * gid_idx));
@@ -1801,6 +1767,8 @@
 	roce_raw_write(*p, hr_dev->reg_base + ROCEE_PORT_GID_H_0_REG +
 		       (HNS_ROCE_V1_GID_NUM * gid_idx));
 
+	spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
+
 	return 0;
 }
 
@@ -1866,9 +1834,8 @@
 				  unsigned long mtpt_idx)
 {
 	struct hns_roce_v1_mpt_entry *mpt_entry;
-	struct scatterlist *sg;
+	struct sg_dma_page_iter sg_iter;
 	u64 *pages;
-	int entry;
 	int i;
 
 	/* MPT filled into mailbox buf */
@@ -1923,8 +1890,8 @@
 		return -ENOMEM;
 
 	i = 0;
-	for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) {
-		pages[i] = ((u64)sg_dma_address(sg)) >> 12;
+	for_each_sg_dma_page(mr->umem->sg_head.sgl, &sg_iter, mr->umem->nmap, 0) {
+		pages[i] = ((u64)sg_page_iter_dma_address(&sg_iter)) >> 12;
 
 		/* Directly record to MTPT table firstly 7 entry */
 		if (i >= HNS_ROCE_MAX_INNER_MTPT_NUM)
@@ -2194,7 +2161,7 @@
 {
 	struct hns_roce_cq *hr_cq = to_hr_cq(ibcq);
 	u32 notification_flag;
-	__le32 doorbell[2];
+	__le32 doorbell[2] = {};
 
 	notification_flag = (flags & IB_CQ_SOLICITED_MASK) ==
 			    IB_CQ_SOLICITED ? CQ_DB_REQ_NOT : CQ_DB_REQ_NOT_SOL;
@@ -2459,7 +2426,8 @@
 {
 	struct device *dev = &hr_dev->pdev->dev;
 	struct hns_roce_v1_priv *priv;
-	unsigned long end = 0, flags = 0;
+	unsigned long flags = 0;
+	long end = HW_SYNC_TIMEOUT_MSECS;
 	__le32 bt_cmd_val[2] = {0};
 	void __iomem *bt_cmd;
 	u64 bt_ba = 0;
@@ -2468,18 +2436,12 @@
 
 	switch (table->type) {
 	case HEM_TYPE_QPC:
-		roce_set_field(bt_cmd_val[1], ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M,
-			ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S, HEM_TYPE_QPC);
 		bt_ba = priv->bt_table.qpc_buf.map >> 12;
 		break;
 	case HEM_TYPE_MTPT:
-		roce_set_field(bt_cmd_val[1], ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M,
-			ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S, HEM_TYPE_MTPT);
 		bt_ba = priv->bt_table.mtpt_buf.map >> 12;
 		break;
 	case HEM_TYPE_CQC:
-		roce_set_field(bt_cmd_val[1], ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M,
-			ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S, HEM_TYPE_CQC);
 		bt_ba = priv->bt_table.cqc_buf.map >> 12;
 		break;
 	case HEM_TYPE_SRQC:
@@ -2488,6 +2450,8 @@
 	default:
 		return 0;
 	}
+	roce_set_field(bt_cmd_val[1], ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M,
+			ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S, table->type);
 	roce_set_field(bt_cmd_val[1], ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_M,
 		ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_S, obj);
 	roce_set_bit(bt_cmd_val[1], ROCEE_BT_CMD_H_ROCEE_BT_CMD_S, 0);
@@ -2497,10 +2461,9 @@
 
 	bt_cmd = hr_dev->reg_base + ROCEE_BT_CMD_H_REG;
 
-	end = msecs_to_jiffies(HW_SYNC_TIMEOUT_MSECS) + jiffies;
 	while (1) {
 		if (readl(bt_cmd) >> BT_CMD_SYNC_SHIFT) {
-			if (!(time_before(jiffies, end))) {
+			if (!end) {
 				dev_err(dev, "Write bt_cmd err,hw_sync is not zero.\n");
 				spin_unlock_irqrestore(&hr_dev->bt_cmd_lock,
 					flags);
@@ -2509,10 +2472,11 @@
 		} else {
 			break;
 		}
-		msleep(HW_SYNC_SLEEP_TIME_INTERVAL);
+		mdelay(HW_SYNC_SLEEP_TIME_INTERVAL);
+		end -= HW_SYNC_SLEEP_TIME_INTERVAL;
 	}
 
-	bt_cmd_val[0] = (__le32)bt_ba;
+	bt_cmd_val[0] = cpu_to_le32(bt_ba);
 	roce_set_field(bt_cmd_val[1], ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_M,
 		ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_S, bt_ba >> 32);
 	hns_roce_write64_k(bt_cmd_val, hr_dev->reg_base + ROCEE_BT_CMD_L_REG);
@@ -2655,7 +2619,7 @@
 			       QP1C_BYTES_16_PORT_NUM_S, hr_qp->phy_port);
 		roce_set_bit(context->qp1c_bytes_16,
 			     QP1C_BYTES_16_SIGNALING_TYPE_S,
-			     le32_to_cpu(hr_qp->sq_signal_bits));
+			     hr_qp->sq_signal_bits);
 		roce_set_bit(context->qp1c_bytes_16, QP1C_BYTES_16_RQ_BA_FLG_S,
 			     1);
 		roce_set_bit(context->qp1c_bytes_16, QP1C_BYTES_16_SQ_BA_FLG_S,
@@ -2961,7 +2925,7 @@
 			     1);
 		roce_set_bit(context->qpc_bytes_32,
 			     QP_CONTEXT_QPC_BYTE_32_SIGNALING_TYPE_S,
-			     le32_to_cpu(hr_qp->sq_signal_bits));
+			     hr_qp->sq_signal_bits);
 
 		port = (attr_mask & IB_QP_PORT) ? (attr->port_num - 1) :
 			hr_qp->port;
@@ -3606,7 +3570,7 @@
 	qp_attr->retry_cnt = roce_get_field(context->qpc_bytes_148,
 			     QP_CONTEXT_QPC_BYTES_148_RETRY_COUNT_M,
 			     QP_CONTEXT_QPC_BYTES_148_RETRY_COUNT_S);
-	qp_attr->rnr_retry = (u8)context->rnr_retry;
+	qp_attr->rnr_retry = (u8)le32_to_cpu(context->rnr_retry);
 
 done:
 	qp_attr->cur_qp_state = qp_attr->qp_state;
@@ -3640,307 +3604,22 @@
 		hns_roce_v1_q_qp(ibqp, qp_attr, qp_attr_mask, qp_init_attr);
 }
 
-static void hns_roce_check_sdb_status(struct hns_roce_dev *hr_dev,
-				      u32 *old_send, u32 *old_retry,
-				      u32 *tsp_st, u32 *success_flags)
-{
-	__le32 *old_send_tmp, *old_retry_tmp;
-	u32 sdb_retry_cnt;
-	u32 sdb_send_ptr;
-	u32 cur_cnt, old_cnt;
-	__le32 tmp, tmp1;
-	u32 send_ptr;
-
-	sdb_send_ptr = roce_read(hr_dev, ROCEE_SDB_SEND_PTR_REG);
-	sdb_retry_cnt =	roce_read(hr_dev, ROCEE_SDB_RETRY_CNT_REG);
-	tmp = cpu_to_le32(sdb_send_ptr);
-	tmp1 = cpu_to_le32(sdb_retry_cnt);
-	cur_cnt = roce_get_field(tmp, ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
-				 ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S) +
-		  roce_get_field(tmp1, ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M,
-				 ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S);
-
-	old_send_tmp = (__le32 *)old_send;
-	old_retry_tmp = (__le32 *)old_retry;
-	if (!roce_get_bit(*tsp_st, ROCEE_CNT_CLR_CE_CNT_CLR_CE_S)) {
-		old_cnt = roce_get_field(*old_send_tmp,
-					 ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
-					 ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S) +
-			  roce_get_field(*old_retry_tmp,
-					 ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M,
-					 ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S);
-		if (cur_cnt - old_cnt > SDB_ST_CMP_VAL)
-			*success_flags = 1;
-	} else {
-		old_cnt = roce_get_field(*old_send_tmp,
-					 ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
-					 ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S);
-		if (cur_cnt - old_cnt > SDB_ST_CMP_VAL) {
-			*success_flags = 1;
-		} else {
-			send_ptr = roce_get_field(*old_send_tmp,
-					    ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
-					    ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S) +
-				   roce_get_field(tmp1,
-					    ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M,
-					    ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S);
-			roce_set_field(*old_send_tmp,
-				       ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
-				       ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S,
-				       send_ptr);
-		}
-	}
-}
-
-static int check_qp_db_process_status(struct hns_roce_dev *hr_dev,
-				      struct hns_roce_qp *hr_qp,
-				      u32 sdb_issue_ptr,
-				      u32 *sdb_inv_cnt,
-				      u32 *wait_stage)
-{
-	struct device *dev = &hr_dev->pdev->dev;
-	u32 sdb_send_ptr, old_send;
-	__le32 sdb_issue_ptr_tmp;
-	__le32 sdb_send_ptr_tmp;
-	u32 success_flags = 0;
-	unsigned long end;
-	u32 old_retry;
-	u32 inv_cnt;
-	u32 tsp_st;
-	__le32 tmp;
-
-	if (*wait_stage > HNS_ROCE_V1_DB_STAGE2 ||
-	    *wait_stage < HNS_ROCE_V1_DB_STAGE1) {
-		dev_err(dev, "QP(0x%lx) db status wait stage(%d) error!\n",
-			hr_qp->qpn, *wait_stage);
-		return -EINVAL;
-	}
-
-	/* Calculate the total timeout for the entire verification process */
-	end = msecs_to_jiffies(HNS_ROCE_V1_CHECK_DB_TIMEOUT_MSECS) + jiffies;
-
-	if (*wait_stage == HNS_ROCE_V1_DB_STAGE1) {
-		/* Query db process status, until hw process completely */
-		sdb_send_ptr = roce_read(hr_dev, ROCEE_SDB_SEND_PTR_REG);
-		while (roce_hw_index_cmp_lt(sdb_send_ptr, sdb_issue_ptr,
-					    ROCEE_SDB_PTR_CMP_BITS)) {
-			if (!time_before(jiffies, end)) {
-				dev_dbg(dev, "QP(0x%lx) db process stage1 timeout. issue 0x%x send 0x%x.\n",
-					hr_qp->qpn, sdb_issue_ptr,
-					sdb_send_ptr);
-				return 0;
-			}
-
-			msleep(HNS_ROCE_V1_CHECK_DB_SLEEP_MSECS);
-			sdb_send_ptr = roce_read(hr_dev,
-						 ROCEE_SDB_SEND_PTR_REG);
-		}
-
-		sdb_send_ptr_tmp = cpu_to_le32(sdb_send_ptr);
-		sdb_issue_ptr_tmp = cpu_to_le32(sdb_issue_ptr);
-		if (roce_get_field(sdb_issue_ptr_tmp,
-				   ROCEE_SDB_ISSUE_PTR_SDB_ISSUE_PTR_M,
-				   ROCEE_SDB_ISSUE_PTR_SDB_ISSUE_PTR_S) ==
-		    roce_get_field(sdb_send_ptr_tmp,
-				   ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
-				   ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S)) {
-			old_send = roce_read(hr_dev, ROCEE_SDB_SEND_PTR_REG);
-			old_retry = roce_read(hr_dev, ROCEE_SDB_RETRY_CNT_REG);
-
-			do {
-				tsp_st = roce_read(hr_dev, ROCEE_TSP_BP_ST_REG);
-				tmp = cpu_to_le32(tsp_st);
-				if (roce_get_bit(tmp,
-					ROCEE_TSP_BP_ST_QH_FIFO_ENTRY_S) == 1) {
-					*wait_stage = HNS_ROCE_V1_DB_WAIT_OK;
-					return 0;
-				}
-
-				if (!time_before(jiffies, end)) {
-					dev_dbg(dev, "QP(0x%lx) db process stage1 timeout when send ptr equals issue ptr.\n"
-						     "issue 0x%x send 0x%x.\n",
-						hr_qp->qpn,
-						le32_to_cpu(sdb_issue_ptr_tmp),
-						le32_to_cpu(sdb_send_ptr_tmp));
-					return 0;
-				}
-
-				msleep(HNS_ROCE_V1_CHECK_DB_SLEEP_MSECS);
-
-				hns_roce_check_sdb_status(hr_dev, &old_send,
-							  &old_retry, &tsp_st,
-							  &success_flags);
-			} while (!success_flags);
-		}
-
-		*wait_stage = HNS_ROCE_V1_DB_STAGE2;
-
-		/* Get list pointer */
-		*sdb_inv_cnt = roce_read(hr_dev, ROCEE_SDB_INV_CNT_REG);
-		dev_dbg(dev, "QP(0x%lx) db process stage2. inv cnt = 0x%x.\n",
-			hr_qp->qpn, *sdb_inv_cnt);
-	}
-
-	if (*wait_stage == HNS_ROCE_V1_DB_STAGE2) {
-		/* Query db's list status, until hw reversal */
-		inv_cnt = roce_read(hr_dev, ROCEE_SDB_INV_CNT_REG);
-		while (roce_hw_index_cmp_lt(inv_cnt,
-					    *sdb_inv_cnt + SDB_INV_CNT_OFFSET,
-					    ROCEE_SDB_CNT_CMP_BITS)) {
-			if (!time_before(jiffies, end)) {
-				dev_dbg(dev, "QP(0x%lx) db process stage2 timeout. inv cnt 0x%x.\n",
-					hr_qp->qpn, inv_cnt);
-				return 0;
-			}
-
-			msleep(HNS_ROCE_V1_CHECK_DB_SLEEP_MSECS);
-			inv_cnt = roce_read(hr_dev, ROCEE_SDB_INV_CNT_REG);
-		}
-
-		*wait_stage = HNS_ROCE_V1_DB_WAIT_OK;
-	}
-
-	return 0;
-}
-
-static int check_qp_reset_state(struct hns_roce_dev *hr_dev,
-				struct hns_roce_qp *hr_qp,
-				struct hns_roce_qp_work *qp_work_entry,
-				int *is_timeout)
-{
-	struct device *dev = &hr_dev->pdev->dev;
-	u32 sdb_issue_ptr;
-	int ret;
-
-	if (hr_qp->state != IB_QPS_RESET) {
-		/* Set qp to ERR, waiting for hw complete processing all dbs */
-		ret = hns_roce_v1_modify_qp(&hr_qp->ibqp, NULL, 0, hr_qp->state,
-					    IB_QPS_ERR);
-		if (ret) {
-			dev_err(dev, "Modify QP(0x%lx) to ERR failed!\n",
-				hr_qp->qpn);
-			return ret;
-		}
-
-		/* Record issued doorbell */
-		sdb_issue_ptr = roce_read(hr_dev, ROCEE_SDB_ISSUE_PTR_REG);
-		qp_work_entry->sdb_issue_ptr = sdb_issue_ptr;
-		qp_work_entry->db_wait_stage = HNS_ROCE_V1_DB_STAGE1;
-
-		/* Query db process status, until hw process completely */
-		ret = check_qp_db_process_status(hr_dev, hr_qp, sdb_issue_ptr,
-						 &qp_work_entry->sdb_inv_cnt,
-						 &qp_work_entry->db_wait_stage);
-		if (ret) {
-			dev_err(dev, "Check QP(0x%lx) db process status failed!\n",
-				hr_qp->qpn);
-			return ret;
-		}
-
-		if (qp_work_entry->db_wait_stage != HNS_ROCE_V1_DB_WAIT_OK) {
-			qp_work_entry->sche_cnt = 0;
-			*is_timeout = 1;
-			return 0;
-		}
-
-		/* Modify qp to reset before destroying qp */
-		ret = hns_roce_v1_modify_qp(&hr_qp->ibqp, NULL, 0, hr_qp->state,
-					    IB_QPS_RESET);
-		if (ret) {
-			dev_err(dev, "Modify QP(0x%lx) to RST failed!\n",
-				hr_qp->qpn);
-			return ret;
-		}
-	}
-
-	return 0;
-}
-
-static void hns_roce_v1_destroy_qp_work_fn(struct work_struct *work)
-{
-	struct hns_roce_qp_work *qp_work_entry;
-	struct hns_roce_v1_priv *priv;
-	struct hns_roce_dev *hr_dev;
-	struct hns_roce_qp *hr_qp;
-	struct device *dev;
-	unsigned long qpn;
-	int ret;
-
-	qp_work_entry = container_of(work, struct hns_roce_qp_work, work);
-	hr_dev = to_hr_dev(qp_work_entry->ib_dev);
-	dev = &hr_dev->pdev->dev;
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	hr_qp = qp_work_entry->qp;
-	qpn = hr_qp->qpn;
-
-	dev_dbg(dev, "Schedule destroy QP(0x%lx) work.\n", qpn);
-
-	qp_work_entry->sche_cnt++;
-
-	/* Query db process status, until hw process completely */
-	ret = check_qp_db_process_status(hr_dev, hr_qp,
-					 qp_work_entry->sdb_issue_ptr,
-					 &qp_work_entry->sdb_inv_cnt,
-					 &qp_work_entry->db_wait_stage);
-	if (ret) {
-		dev_err(dev, "Check QP(0x%lx) db process status failed!\n",
-			qpn);
-		return;
-	}
-
-	if (qp_work_entry->db_wait_stage != HNS_ROCE_V1_DB_WAIT_OK &&
-	    priv->des_qp.requeue_flag) {
-		queue_work(priv->des_qp.qp_wq, work);
-		return;
-	}
-
-	/* Modify qp to reset before destroying qp */
-	ret = hns_roce_v1_modify_qp(&hr_qp->ibqp, NULL, 0, hr_qp->state,
-				    IB_QPS_RESET);
-	if (ret) {
-		dev_err(dev, "Modify QP(0x%lx) to RST failed!\n", qpn);
-		return;
-	}
-
-	hns_roce_qp_remove(hr_dev, hr_qp);
-	hns_roce_qp_free(hr_dev, hr_qp);
-
-	if (hr_qp->ibqp.qp_type == IB_QPT_RC) {
-		/* RC QP, release QPN */
-		hns_roce_release_range_qp(hr_dev, qpn, 1);
-		kfree(hr_qp);
-	} else
-		kfree(hr_to_hr_sqp(hr_qp));
-
-	kfree(qp_work_entry);
-
-	dev_dbg(dev, "Accomplished destroy QP(0x%lx) work.\n", qpn);
-}
-
-int hns_roce_v1_destroy_qp(struct ib_qp *ibqp)
+int hns_roce_v1_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
-	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_qp_work qp_work_entry;
-	struct hns_roce_qp_work *qp_work;
-	struct hns_roce_v1_priv *priv;
 	struct hns_roce_cq *send_cq, *recv_cq;
-	int is_user = !!ibqp->pd->uobject;
-	int is_timeout = 0;
 	int ret;
 
-	ret = check_qp_reset_state(hr_dev, hr_qp, &qp_work_entry, &is_timeout);
-	if (ret) {
-		dev_err(dev, "QP reset state check failed(%d)!\n", ret);
+	ret = hns_roce_v1_modify_qp(ibqp, NULL, 0, hr_qp->state, IB_QPS_RESET);
+	if (ret)
 		return ret;
-	}
 
 	send_cq = to_hr_cq(hr_qp->ibqp.send_cq);
 	recv_cq = to_hr_cq(hr_qp->ibqp.recv_cq);
 
 	hns_roce_lock_cqs(send_cq, recv_cq);
-	if (!is_user) {
+	if (!udata) {
 		__hns_roce_v1_cq_clean(recv_cq, hr_qp->qpn, hr_qp->ibqp.srq ?
 				       to_hr_srq(hr_qp->ibqp.srq) : NULL);
 		if (send_cq != recv_cq)
@@ -3948,53 +3627,31 @@
 	}
 	hns_roce_unlock_cqs(send_cq, recv_cq);
 
-	if (!is_timeout) {
-		hns_roce_qp_remove(hr_dev, hr_qp);
-		hns_roce_qp_free(hr_dev, hr_qp);
+	hns_roce_qp_remove(hr_dev, hr_qp);
+	hns_roce_qp_free(hr_dev, hr_qp);
 
-		/* RC QP, release QPN */
-		if (hr_qp->ibqp.qp_type == IB_QPT_RC)
-			hns_roce_release_range_qp(hr_dev, hr_qp->qpn, 1);
-	}
+	/* RC QP, release QPN */
+	if (hr_qp->ibqp.qp_type == IB_QPT_RC)
+		hns_roce_release_range_qp(hr_dev, hr_qp->qpn, 1);
 
 	hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt);
 
-	if (is_user)
-		ib_umem_release(hr_qp->umem);
-	else {
+	ib_umem_release(hr_qp->umem);
+	if (!udata) {
 		kfree(hr_qp->sq.wrid);
 		kfree(hr_qp->rq.wrid);
 
 		hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf);
 	}
 
-	if (!is_timeout) {
-		if (hr_qp->ibqp.qp_type == IB_QPT_RC)
-			kfree(hr_qp);
-		else
-			kfree(hr_to_hr_sqp(hr_qp));
-	} else {
-		qp_work = kzalloc(sizeof(*qp_work), GFP_KERNEL);
-		if (!qp_work)
-			return -ENOMEM;
-
-		INIT_WORK(&qp_work->work, hns_roce_v1_destroy_qp_work_fn);
-		qp_work->ib_dev	= &hr_dev->ib_dev;
-		qp_work->qp		= hr_qp;
-		qp_work->db_wait_stage	= qp_work_entry.db_wait_stage;
-		qp_work->sdb_issue_ptr	= qp_work_entry.sdb_issue_ptr;
-		qp_work->sdb_inv_cnt	= qp_work_entry.sdb_inv_cnt;
-		qp_work->sche_cnt	= qp_work_entry.sche_cnt;
-
-		priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-		queue_work(priv->des_qp.qp_wq, &qp_work->work);
-		dev_dbg(dev, "Begin destroy QP(0x%lx) work.\n", hr_qp->qpn);
-	}
-
+	if (hr_qp->ibqp.qp_type == IB_QPT_RC)
+		kfree(hr_qp);
+	else
+		kfree(hr_to_hr_sqp(hr_qp));
 	return 0;
 }
 
-static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq)
+static void hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibcq->device);
 	struct hns_roce_cq *hr_cq = to_hr_cq(ibcq);
@@ -4003,7 +3660,6 @@
 	u32 cqe_cnt_cur;
 	u32 cq_buf_size;
 	int wait_time = 0;
-	int ret = 0;
 
 	hns_roce_free_cq(hr_dev, hr_cq);
 
@@ -4025,7 +3681,6 @@
 		if (wait_time > HNS_ROCE_MAX_FREE_CQ_WAIT_CNT) {
 			dev_warn(dev, "Destroy cq 0x%lx timeout!\n",
 				hr_cq->cqn);
-			ret = -ETIMEDOUT;
 			break;
 		}
 		wait_time++;
@@ -4033,17 +3688,12 @@
 
 	hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
 
-	if (ibcq->uobject)
-		ib_umem_release(hr_cq->umem);
-	else {
+	ib_umem_release(hr_cq->umem);
+	if (!udata) {
 		/* Free the buff of stored cq */
 		cq_buf_size = (ibcq->cqe + 1) * hr_dev->caps.cq_entry_sz;
 		hns_roce_buf_free(hr_dev, cq_buf_size, &hr_cq->hr_buf.hr_buf);
 	}
-
-	kfree(hr_cq);
-
-	return ret;
 }
 
 static void set_eq_cons_index_v1(struct hns_roce_eq *eq, int req_not)
@@ -4247,7 +3897,8 @@
 		 */
 		dma_rmb();
 
-		dev_dbg(dev, "aeqe = %p, aeqe->asyn.event_type = 0x%lx\n", aeqe,
+		dev_dbg(dev, "aeqe = %pK, aeqe->asyn.event_type = 0x%lx\n",
+			aeqe,
 			roce_get_field(aeqe->asyn,
 				       HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M,
 				       HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S));
@@ -4362,7 +4013,8 @@
 		++eq->cons_index;
 		ceqes_found = 1;
 
-		if (eq->cons_index > 2 * hr_dev->caps.ceqe_depth - 1) {
+		if (eq->cons_index >
+		    EQ_DEPTH_COEFF * hr_dev->caps.ceqe_depth - 1) {
 			dev_warn(&eq->hr_dev->pdev->dev,
 				"cons_index overflow, set back to 0.\n");
 			eq->cons_index = 0;
@@ -4610,7 +4262,6 @@
 		}
 
 		eq->buf_list[i].map = tmp_dma_addr;
-		memset(eq->buf_list[i].buf, 0, HNS_ROCE_BA_SIZE);
 	}
 	eq->cons_index = 0;
 	roce_set_field(tmp, ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M,
@@ -4793,6 +4444,16 @@
 	kfree(eq_table->eq);
 }
 
+static const struct ib_device_ops hns_roce_v1_dev_ops = {
+	.destroy_qp = hns_roce_v1_destroy_qp,
+	.modify_cq = hns_roce_v1_modify_cq,
+	.poll_cq = hns_roce_v1_poll_cq,
+	.post_recv = hns_roce_v1_post_recv,
+	.post_send = hns_roce_v1_post_send,
+	.query_qp = hns_roce_v1_query_qp,
+	.req_notify_cq = hns_roce_v1_req_notify_cq,
+};
+
 static const struct hns_roce_hw hns_roce_hw_v1 = {
 	.reset = hns_roce_v1_reset,
 	.hw_profile = hns_roce_v1_profile,
@@ -4818,6 +4479,7 @@
 	.destroy_cq = hns_roce_v1_destroy_cq,
 	.init_eq = hns_roce_v1_init_eq_table,
 	.cleanup_eq = hns_roce_v1_cleanup_eq_table,
+	.hns_roce_dev_ops = &hns_roce_v1_dev_ops,
 };
 
 static const struct of_device_id hns_roce_of_match[] = {
@@ -4832,19 +4494,13 @@
 };
 MODULE_DEVICE_TABLE(acpi, hns_roce_acpi_match);
 
-static int hns_roce_node_match(struct device *dev, void *fwnode)
-{
-	return dev->fwnode == fwnode;
-}
-
 static struct
 platform_device *hns_roce_find_pdev(struct fwnode_handle *fwnode)
 {
 	struct device *dev;
 
 	/* get the 'device' corresponding to the matching 'fwnode' */
-	dev = bus_find_device(&platform_bus_type, NULL,
-			      fwnode, hns_roce_node_match);
+	dev = bus_find_device_by_fwnode(&platform_bus_type, fwnode);
 	/* get the platform device */
 	return dev ? to_platform_device(dev) : NULL;
 }
@@ -4855,7 +4511,6 @@
 	struct platform_device *pdev = NULL;
 	struct net_device *netdev = NULL;
 	struct device_node *net_node;
-	struct resource *res;
 	int port_cnt = 0;
 	u8 phy_port;
 	int ret;
@@ -4894,8 +4549,7 @@
 	}
 
 	/* get the mapped register base address */
-	res = platform_get_resource(hr_dev->pdev, IORESOURCE_MEM, 0);
-	hr_dev->reg_base = devm_ioremap_resource(dev, res);
+	hr_dev->reg_base = devm_platform_ioremap_resource(hr_dev->pdev, 0);
 	if (IS_ERR(hr_dev->reg_base))
 		return PTR_ERR(hr_dev->reg_base);
 
@@ -4970,10 +4624,8 @@
 	/* fetch the interrupt numbers */
 	for (i = 0; i < HNS_ROCE_V1_MAX_IRQ_NUM; i++) {
 		hr_dev->irq[i] = platform_get_irq(hr_dev->pdev, i);
-		if (hr_dev->irq[i] <= 0) {
-			dev_err(dev, "platform get of irq[=%d] failed!\n", i);
+		if (hr_dev->irq[i] <= 0)
 			return -EINVAL;
-		}
 	}
 
 	return 0;
@@ -4991,7 +4643,7 @@
 	struct hns_roce_dev *hr_dev;
 	struct device *dev = &pdev->dev;
 
-	hr_dev = (struct hns_roce_dev *)ib_alloc_device(sizeof(*hr_dev));
+	hr_dev = ib_alloc_device(hns_roce_dev, ib_dev);
 	if (!hr_dev)
 		return -ENOMEM;
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
index 6644014..52307b2 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
@@ -110,11 +110,6 @@
 #define HNS_ROCE_V1_EXT_ODB_ALFUL	\
 	(HNS_ROCE_V1_EXT_ODB_DEPTH - HNS_ROCE_V1_DB_RSVD)
 
-#define HNS_ROCE_V1_DB_WAIT_OK				0
-#define HNS_ROCE_V1_DB_STAGE1				1
-#define HNS_ROCE_V1_DB_STAGE2				2
-#define HNS_ROCE_V1_CHECK_DB_TIMEOUT_MSECS		10000
-#define HNS_ROCE_V1_CHECK_DB_SLEEP_MSECS		20
 #define HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS		50000
 #define HNS_ROCE_V1_RECREATE_LP_QP_TIMEOUT_MSECS	10000
 #define HNS_ROCE_V1_FREE_MR_WAIT_VALUE			5
@@ -162,7 +157,6 @@
 #define SQ_PSN_SHIFT					8
 #define QKEY_VAL					0x80010000
 #define SDB_INV_CNT_OFFSET				8
-#define SDB_ST_CMP_VAL					8
 
 #define HNS_ROCE_CEQ_DEFAULT_INTERVAL			0x10
 #define HNS_ROCE_CEQ_DEFAULT_BURST_NUM			0x10
@@ -1068,11 +1062,6 @@
 	u32	sche_cnt;
 };
 
-struct hns_roce_des_qp {
-	struct workqueue_struct	*qp_wq;
-	int	requeue_flag;
-};
-
 struct hns_roce_mr_free_work {
 	struct	work_struct work;
 	struct	ib_device *ib_dev;
@@ -1100,12 +1089,11 @@
 	struct hns_roce_raq_table raq_table;
 	struct hns_roce_bt_table  bt_table;
 	struct hns_roce_tptr_table tptr_table;
-	struct hns_roce_des_qp des_qp;
 	struct hns_roce_free_mr free_mr;
 };
 
 int hns_dsaf_roce_reset(struct fwnode_handle *dsaf_fwnode, bool dereset);
 int hns_roce_v1_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
-int hns_roce_v1_destroy_qp(struct ib_qp *ibqp);
+int hns_roce_v1_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
 
 #endif
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index a442b29..e82567f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -37,7 +37,9 @@
 #include <linux/types.h>
 #include <net/addrconf.h>
 #include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
 #include <rdma/ib_umem.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "hnae3.h"
 #include "hns_roce_common.h"
@@ -54,6 +56,59 @@
 	dseg->len  = cpu_to_le32(sg->length);
 }
 
+static void set_frmr_seg(struct hns_roce_v2_rc_send_wqe *rc_sq_wqe,
+			 struct hns_roce_wqe_frmr_seg *fseg,
+			 const struct ib_reg_wr *wr)
+{
+	struct hns_roce_mr *mr = to_hr_mr(wr->mr);
+
+	/* use ib_access_flags */
+	roce_set_bit(rc_sq_wqe->byte_4,
+		     V2_RC_FRMR_WQE_BYTE_4_BIND_EN_S,
+		     wr->access & IB_ACCESS_MW_BIND ? 1 : 0);
+	roce_set_bit(rc_sq_wqe->byte_4,
+		     V2_RC_FRMR_WQE_BYTE_4_ATOMIC_S,
+		     wr->access & IB_ACCESS_REMOTE_ATOMIC ? 1 : 0);
+	roce_set_bit(rc_sq_wqe->byte_4,
+		     V2_RC_FRMR_WQE_BYTE_4_RR_S,
+		     wr->access & IB_ACCESS_REMOTE_READ ? 1 : 0);
+	roce_set_bit(rc_sq_wqe->byte_4,
+		     V2_RC_FRMR_WQE_BYTE_4_RW_S,
+		     wr->access & IB_ACCESS_REMOTE_WRITE ? 1 : 0);
+	roce_set_bit(rc_sq_wqe->byte_4,
+		     V2_RC_FRMR_WQE_BYTE_4_LW_S,
+		     wr->access & IB_ACCESS_LOCAL_WRITE ? 1 : 0);
+
+	/* msg_len and inv_key are reused to carry the PBL base address */
+	rc_sq_wqe->msg_len = cpu_to_le32(mr->pbl_ba & 0xffffffff);
+	rc_sq_wqe->inv_key = cpu_to_le32(mr->pbl_ba >> 32);
+
+	rc_sq_wqe->byte_16 = cpu_to_le32(wr->mr->length & 0xffffffff);
+	rc_sq_wqe->byte_20 = cpu_to_le32(wr->mr->length >> 32);
+	rc_sq_wqe->rkey = cpu_to_le32(wr->key);
+	rc_sq_wqe->va = cpu_to_le64(wr->mr->iova);
+
+	fseg->pbl_size = cpu_to_le32(mr->pbl_size);
+	roce_set_field(fseg->mode_buf_pg_sz,
+		       V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_M,
+		       V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_S,
+		       mr->pbl_buf_pg_sz + PG_SHIFT_OFFSET);
+	roce_set_bit(fseg->mode_buf_pg_sz,
+		     V2_RC_FRMR_WQE_BYTE_40_BLK_MODE_S, 0);
+}
+
+static void set_atomic_seg(struct hns_roce_wqe_atomic_seg *aseg,
+			   const struct ib_atomic_wr *wr)
+{
+	if (wr->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+		aseg->fetchadd_swap_data = cpu_to_le64(wr->swap);
+		aseg->cmp_data  = cpu_to_le64(wr->compare_add);
+	} else {
+		aseg->fetchadd_swap_data = cpu_to_le64(wr->compare_add);
+		aseg->cmp_data  = 0;
+	}
+}
+
 static void set_extend_sge(struct hns_roce_qp *qp, const struct ib_send_wr *wr,
 			   unsigned int *sge_ind)
 {
@@ -121,6 +176,7 @@
 		}
 
 		if (wr->opcode == IB_WR_RDMA_READ) {
+			*bad_wr =  wr;
 			dev_err(hr_dev->dev, "Not support inline data!\n");
 			return -EINVAL;
 		}
@@ -179,10 +235,11 @@
 	struct hns_roce_v2_ud_send_wqe *ud_sq_wqe;
 	struct hns_roce_v2_rc_send_wqe *rc_sq_wqe;
 	struct hns_roce_qp *qp = to_hr_qp(ibqp);
+	struct hns_roce_wqe_frmr_seg *fseg;
 	struct device *dev = hr_dev->dev;
 	struct hns_roce_v2_db sq_db;
 	struct ib_qp_attr attr;
-	unsigned int sge_ind = 0;
+	unsigned int sge_ind;
 	unsigned int owner_bit;
 	unsigned long flags;
 	unsigned int ind;
@@ -191,6 +248,7 @@
 	int attr_mask;
 	u32 tmp_len;
 	int ret = 0;
+	u32 hr_op;
 	u8 *smac;
 	int nreq;
 	int i;
@@ -339,23 +397,23 @@
 			roce_set_field(ud_sq_wqe->byte_36,
 				       V2_UD_SEND_WQE_BYTE_36_TCLASS_M,
 				       V2_UD_SEND_WQE_BYTE_36_TCLASS_S,
-				       ah->av.sl_tclass_flowlabel >>
-				       HNS_ROCE_TCLASS_SHIFT);
+				       ah->av.tclass);
 			roce_set_field(ud_sq_wqe->byte_40,
 				       V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_M,
 				       V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_S,
-				       ah->av.sl_tclass_flowlabel &
-				       HNS_ROCE_FLOW_LABEL_MASK);
+				       ah->av.flowlabel);
 			roce_set_field(ud_sq_wqe->byte_40,
 				       V2_UD_SEND_WQE_BYTE_40_SL_M,
 				       V2_UD_SEND_WQE_BYTE_40_SL_S,
-				      le32_to_cpu(ah->av.sl_tclass_flowlabel) >>
-				      HNS_ROCE_SL_SHIFT);
+				       ah->av.sl);
 			roce_set_field(ud_sq_wqe->byte_40,
 				       V2_UD_SEND_WQE_BYTE_40_PORTN_M,
 				       V2_UD_SEND_WQE_BYTE_40_PORTN_S,
 				       qp->port);
 
+			roce_set_bit(ud_sq_wqe->byte_40,
+				     V2_UD_SEND_WQE_BYTE_40_UD_VLAN_EN_S,
+				     ah->av.vlan_en ? 1 : 0);
 			roce_set_field(ud_sq_wqe->byte_48,
 				       V2_UD_SEND_WQE_BYTE_48_SGID_INDX_M,
 				       V2_UD_SEND_WQE_BYTE_48_SGID_INDX_S,
@@ -406,99 +464,100 @@
 			roce_set_bit(rc_sq_wqe->byte_4,
 				     V2_RC_SEND_WQE_BYTE_4_OWNER_S, owner_bit);
 
+			wqe += sizeof(struct hns_roce_v2_rc_send_wqe);
 			switch (wr->opcode) {
 			case IB_WR_RDMA_READ:
-				roce_set_field(rc_sq_wqe->byte_4,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-					       HNS_ROCE_V2_WQE_OP_RDMA_READ);
+				hr_op = HNS_ROCE_V2_WQE_OP_RDMA_READ;
 				rc_sq_wqe->rkey =
 					cpu_to_le32(rdma_wr(wr)->rkey);
 				rc_sq_wqe->va =
 					cpu_to_le64(rdma_wr(wr)->remote_addr);
 				break;
 			case IB_WR_RDMA_WRITE:
-				roce_set_field(rc_sq_wqe->byte_4,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-					       HNS_ROCE_V2_WQE_OP_RDMA_WRITE);
+				hr_op = HNS_ROCE_V2_WQE_OP_RDMA_WRITE;
 				rc_sq_wqe->rkey =
 					cpu_to_le32(rdma_wr(wr)->rkey);
 				rc_sq_wqe->va =
 					cpu_to_le64(rdma_wr(wr)->remote_addr);
 				break;
 			case IB_WR_RDMA_WRITE_WITH_IMM:
-				roce_set_field(rc_sq_wqe->byte_4,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-				       HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM);
+				hr_op = HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM;
 				rc_sq_wqe->rkey =
 					cpu_to_le32(rdma_wr(wr)->rkey);
 				rc_sq_wqe->va =
 					cpu_to_le64(rdma_wr(wr)->remote_addr);
 				break;
 			case IB_WR_SEND:
-				roce_set_field(rc_sq_wqe->byte_4,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-					       HNS_ROCE_V2_WQE_OP_SEND);
+				hr_op = HNS_ROCE_V2_WQE_OP_SEND;
 				break;
 			case IB_WR_SEND_WITH_INV:
-				roce_set_field(rc_sq_wqe->byte_4,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-				       HNS_ROCE_V2_WQE_OP_SEND_WITH_INV);
+				hr_op = HNS_ROCE_V2_WQE_OP_SEND_WITH_INV;
 				break;
 			case IB_WR_SEND_WITH_IMM:
-				roce_set_field(rc_sq_wqe->byte_4,
-					      V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-					      V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-					      HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM);
+				hr_op = HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM;
 				break;
 			case IB_WR_LOCAL_INV:
-				roce_set_field(rc_sq_wqe->byte_4,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-					       HNS_ROCE_V2_WQE_OP_LOCAL_INV);
+				hr_op = HNS_ROCE_V2_WQE_OP_LOCAL_INV;
+				roce_set_bit(rc_sq_wqe->byte_4,
+					       V2_RC_SEND_WQE_BYTE_4_SO_S, 1);
+				rc_sq_wqe->inv_key =
+					    cpu_to_le32(wr->ex.invalidate_rkey);
+				break;
+			case IB_WR_REG_MR:
+				hr_op = HNS_ROCE_V2_WQE_OP_FAST_REG_PMR;
+				fseg = wqe;
+				set_frmr_seg(rc_sq_wqe, fseg, reg_wr(wr));
 				break;
 			case IB_WR_ATOMIC_CMP_AND_SWP:
-				roce_set_field(rc_sq_wqe->byte_4,
-					  V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-					  V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-					  HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP);
+				hr_op = HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP;
+				rc_sq_wqe->rkey =
+					cpu_to_le32(atomic_wr(wr)->rkey);
+				rc_sq_wqe->va =
+					cpu_to_le64(atomic_wr(wr)->remote_addr);
 				break;
 			case IB_WR_ATOMIC_FETCH_AND_ADD:
-				roce_set_field(rc_sq_wqe->byte_4,
-					 V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-					 V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-					 HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD);
+				hr_op = HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD;
+				rc_sq_wqe->rkey =
+					cpu_to_le32(atomic_wr(wr)->rkey);
+				rc_sq_wqe->va =
+					cpu_to_le64(atomic_wr(wr)->remote_addr);
 				break;
 			case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
-				roce_set_field(rc_sq_wqe->byte_4,
-				      V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				      V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-				      HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP);
+				hr_op =
+				       HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP;
 				break;
 			case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
-				roce_set_field(rc_sq_wqe->byte_4,
-				     V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				     V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-				     HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD);
+				hr_op =
+				      HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD;
 				break;
 			default:
-				roce_set_field(rc_sq_wqe->byte_4,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-					       HNS_ROCE_V2_WQE_OP_MASK);
+				hr_op = HNS_ROCE_V2_WQE_OP_MASK;
 				break;
 			}
 
-			wqe += sizeof(struct hns_roce_v2_rc_send_wqe);
+			roce_set_field(rc_sq_wqe->byte_4,
+				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
+				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S, hr_op);
 
-			ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe, wqe,
-						&sge_ind, bad_wr);
-			if (ret)
-				goto out;
+			if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+			    wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+				struct hns_roce_v2_wqe_data_seg *dseg;
+
+				dseg = wqe;
+				set_data_seg_v2(dseg, wr->sg_list);
+				wqe += sizeof(struct hns_roce_v2_wqe_data_seg);
+				set_atomic_seg(wqe, atomic_wr(wr));
+				roce_set_field(rc_sq_wqe->byte_16,
+					       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M,
+					       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S,
+					       wr->num_sge);
+			} else if (wr->opcode != IB_WR_REG_MR) {
+				ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe,
+							wqe, &sge_ind, bad_wr);
+				if (ret)
+					goto out;
+			}
+
 			ind++;
 		} else {
 			dev_err(dev, "Illegal qp_type(0x%x)\n", ibqp->qp_type);
@@ -527,7 +586,7 @@
 		roce_set_field(sq_db.parameter, V2_DB_PARAMETER_SL_M,
 			       V2_DB_PARAMETER_SL_S, qp->sl);
 
-		hns_roce_write64_k((__le32 *)&sq_db, qp->sq.db_reg_l);
+		hns_roce_write64(hr_dev, (__le32 *)&sq_db, qp->sq.db_reg_l);
 
 		qp->sq_next_wqe = ind;
 		qp->next_sge = sge_ind;
@@ -652,6 +711,113 @@
 	return ret;
 }
 
+static int hns_roce_v2_cmd_hw_reseted(struct hns_roce_dev *hr_dev,
+				      unsigned long instance_stage,
+				      unsigned long reset_stage)
+{
+	/* Once a hardware reset has completed, stop sending mailbox, cmq and
+	 * doorbell operations to the hardware. If we are currently in
+	 * .init_instance(), or at the HNAE3_INIT_CLIENT stage of a soft
+	 * reset, exit with an error so that the HNAE3_INIT_CLIENT path can
+	 * roll back its work (e.g. notifying the hardware to free resources)
+	 * and the NIC driver can reschedule the soft reset.
+	 */
+	hr_dev->is_reset = true;
+	hr_dev->dis_db = true;
+
+	if (reset_stage == HNS_ROCE_STATE_RST_INIT ||
+	    instance_stage == HNS_ROCE_STATE_INIT)
+		return CMD_RST_PRC_EBUSY;
+
+	return CMD_RST_PRC_SUCCESS;
+}
+
+static int hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev,
+					unsigned long instance_stage,
+					unsigned long reset_stage)
+{
+	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hnae3_handle *handle = priv->handle;
+	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+
+	/* When a hardware reset is detected, stop sending mailbox, cmq and
+	 * doorbell operations to the hardware. If we are currently in
+	 * .init_instance(), or at the HNAE3_INIT_CLIENT stage of a soft
+	 * reset, exit with an error so that the HNAE3_INIT_CLIENT path can
+	 * roll back its work (e.g. notifying the hardware to free resources)
+	 * and the NIC driver can reschedule the soft reset.
+	 */
+	hr_dev->dis_db = true;
+	if (!ops->get_hw_reset_stat(handle))
+		hr_dev->is_reset = true;
+
+	if (!hr_dev->is_reset || reset_stage == HNS_ROCE_STATE_RST_INIT ||
+	    instance_stage == HNS_ROCE_STATE_INIT)
+		return CMD_RST_PRC_EBUSY;
+
+	return CMD_RST_PRC_SUCCESS;
+}
+
+static int hns_roce_v2_cmd_sw_resetting(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hnae3_handle *handle = priv->handle;
+	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+
+	/* When a software reset is detected in .init_instance(), stop sending
+	 * mailbox, cmq and doorbell operations to the hardware and exit with
+	 * an error.
+	 */
+	hr_dev->dis_db = true;
+	if (ops->ae_dev_reset_cnt(handle) != hr_dev->reset_cnt)
+		hr_dev->is_reset = true;
+
+	return CMD_RST_PRC_EBUSY;
+}
+
+static int hns_roce_v2_rst_process_cmd(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hnae3_handle *handle = priv->handle;
+	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+	unsigned long instance_stage;	/* the current instance stage */
+	unsigned long reset_stage;	/* the current reset stage */
+	unsigned long reset_cnt;
+	bool sw_resetting;
+	bool hw_resetting;
+
+	if (hr_dev->is_reset)
+		return CMD_RST_PRC_SUCCESS;
+
+	/* Get reset information from the NIC driver or from the RoCE driver
+	 * itself. The variables obtained from the NIC driver mean:
+	 * reset_cnt -- the number of hardware resets completed so far.
+	 * hw_resetting -- whether the hardware is resetting right now.
+	 * sw_resetting -- whether the NIC's software reset is running now.
+	 */
+	instance_stage = handle->rinfo.instance_state;
+	reset_stage = handle->rinfo.reset_state;
+	reset_cnt = ops->ae_dev_reset_cnt(handle);
+	hw_resetting = ops->get_hw_reset_stat(handle);
+	sw_resetting = ops->ae_dev_resetting(handle);
+
+	if (reset_cnt != hr_dev->reset_cnt)
+		return hns_roce_v2_cmd_hw_reseted(hr_dev, instance_stage,
+						  reset_stage);
+	else if (hw_resetting)
+		return hns_roce_v2_cmd_hw_resetting(hr_dev, instance_stage,
+						    reset_stage);
+	else if (sw_resetting && instance_stage == HNS_ROCE_STATE_INIT)
+		return hns_roce_v2_cmd_sw_resetting(hr_dev);
+
+	return 0;
+}
+
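/*
 * Informational summary of the decision made by hns_roce_v2_rst_process_cmd()
 * above (derived from the code, no new behaviour implied; SUCCESS/EBUSY
 * abbreviate CMD_RST_PRC_SUCCESS/CMD_RST_PRC_EBUSY):
 *
 *   reset_cnt changed since last recorded  -> hns_roce_v2_cmd_hw_reseted    -> SUCCESS or EBUSY
 *   hardware reports a reset in progress   -> hns_roce_v2_cmd_hw_resetting  -> SUCCESS or EBUSY
 *   NIC software reset while in INIT stage -> hns_roce_v2_cmd_sw_resetting  -> EBUSY
 *   none of the above                      -> 0, the command may be sent
 */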
 static int hns_roce_cmq_space(struct hns_roce_v2_cmq_ring *ring)
 {
 	int ntu = ring->next_to_use;
@@ -718,8 +884,7 @@
 		roce_write(hr_dev, ROCEE_TX_CMQ_BASEADDR_H_REG,
 			   upper_32_bits(dma));
 		roce_write(hr_dev, ROCEE_TX_CMQ_DEPTH_REG,
-			  (ring->desc_num >> HNS_ROCE_CMQ_DESC_NUM_S) |
-			   HNS_ROCE_CMQ_ENABLE);
+			   ring->desc_num >> HNS_ROCE_CMQ_DESC_NUM_S);
 		roce_write(hr_dev, ROCEE_TX_CMQ_HEAD_REG, 0);
 		roce_write(hr_dev, ROCEE_TX_CMQ_TAIL_REG, 0);
 	} else {
@@ -727,8 +892,7 @@
 		roce_write(hr_dev, ROCEE_RX_CMQ_BASEADDR_H_REG,
 			   upper_32_bits(dma));
 		roce_write(hr_dev, ROCEE_RX_CMQ_DEPTH_REG,
-			  (ring->desc_num >> HNS_ROCE_CMQ_DESC_NUM_S) |
-			   HNS_ROCE_CMQ_ENABLE);
+			   ring->desc_num >> HNS_ROCE_CMQ_DESC_NUM_S);
 		roce_write(hr_dev, ROCEE_RX_CMQ_HEAD_REG, 0);
 		roce_write(hr_dev, ROCEE_RX_CMQ_TAIL_REG, 0);
 	}
@@ -832,8 +996,8 @@
 	return clean;
 }
 
-static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
-			     struct hns_roce_cmq_desc *desc, int num)
+static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
+			       struct hns_roce_cmq_desc *desc, int num)
 {
 	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
 	struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq;
@@ -845,9 +1009,6 @@
 	int ret = 0;
 	int ntc;
 
-	if (hr_dev->is_reset)
-		return 0;
-
 	spin_lock_bh(&csq->lock);
 
 	if (num > hns_roce_cmq_space(csq)) {
@@ -878,7 +1039,7 @@
 	 * If the command is sync, wait for the firmware to write back,
 	 * if multi descriptors to be sent, use the first one to check
 	 */
-	if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) {
+	if (le16_to_cpu(desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) {
 		do {
 			if (hns_roce_cmq_csq_done(hr_dev))
 				break;
@@ -895,7 +1056,7 @@
 			desc_to_use = &csq->desc[ntc];
 			desc[handle] = *desc_to_use;
 			dev_dbg(hr_dev->dev, "Get cmq desc:\n");
-			desc_ret = desc[handle].retval;
+			desc_ret = le16_to_cpu(desc[handle].retval);
 			if (desc_ret == CMD_EXEC_SUCCESS)
 				ret = 0;
 			else
@@ -922,6 +1083,30 @@
 	return ret;
 }
 
+static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
+			     struct hns_roce_cmq_desc *desc, int num)
+{
+	int retval;
+	int ret;
+
+	ret = hns_roce_v2_rst_process_cmd(hr_dev);
+	if (ret == CMD_RST_PRC_SUCCESS)
+		return 0;
+	if (ret == CMD_RST_PRC_EBUSY)
+		return -EBUSY;
+
+	ret = __hns_roce_cmq_send(hr_dev, desc, num);
+	if (ret) {
+		retval = hns_roce_v2_rst_process_cmd(hr_dev);
+		if (retval == CMD_RST_PRC_SUCCESS)
+			return 0;
+		else if (retval == CMD_RST_PRC_EBUSY)
+			return -EBUSY;
+	}
+
+	return ret;
+}
+
 static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_query_version *resp;
@@ -934,8 +1119,158 @@
 		return ret;
 
 	resp = (struct hns_roce_query_version *)desc.data;
-	hr_dev->hw_rev = le32_to_cpu(resp->rocee_hw_version);
-	hr_dev->vendor_id = le32_to_cpu(resp->rocee_vendor_id);
+	hr_dev->hw_rev = le16_to_cpu(resp->rocee_hw_version);
+	hr_dev->vendor_id = hr_dev->pci_dev->vendor;
+
+	return 0;
+}
+
+static bool hns_roce_func_clr_chk_rst(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hnae3_handle *handle = priv->handle;
+	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+	unsigned long reset_cnt;
+	bool sw_resetting;
+	bool hw_resetting;
+
+	reset_cnt = ops->ae_dev_reset_cnt(handle);
+	hw_resetting = ops->get_hw_reset_stat(handle);
+	sw_resetting = ops->ae_dev_resetting(handle);
+
+	if (reset_cnt != hr_dev->reset_cnt || hw_resetting || sw_resetting)
+		return true;
+
+	return false;
+}
+
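+/*
+ * Handle a function clear that failed or raced with a reset: wait for any
+ * pending hardware or software reset to finish and mark the device state.
+ */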
+static void hns_roce_func_clr_rst_prc(struct hns_roce_dev *hr_dev, int retval,
+				      int flag)
+{
+	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hnae3_handle *handle = priv->handle;
+	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+	unsigned long instance_stage;
+	unsigned long reset_cnt;
+	unsigned long end;
+	bool sw_resetting;
+	bool hw_resetting;
+
+	instance_stage = handle->rinfo.instance_state;
+	reset_cnt = ops->ae_dev_reset_cnt(handle);
+	hw_resetting = ops->get_hw_reset_stat(handle);
+	sw_resetting = ops->ae_dev_resetting(handle);
+
+	if (reset_cnt != hr_dev->reset_cnt) {
+		hr_dev->dis_db = true;
+		hr_dev->is_reset = true;
+		dev_info(hr_dev->dev, "Func clear success after reset.\n");
+	} else if (hw_resetting) {
+		hr_dev->dis_db = true;
+
+		dev_warn(hr_dev->dev,
+			 "Func clear is pending, device in resetting state.\n");
+		end = HNS_ROCE_V2_HW_RST_TIMEOUT;
+		while (end) {
+			if (!ops->get_hw_reset_stat(handle)) {
+				hr_dev->is_reset = true;
+				dev_info(hr_dev->dev,
+					 "Func clear success after reset.\n");
+				return;
+			}
+			msleep(HNS_ROCE_V2_HW_RST_COMPLETION_WAIT);
+			end -= HNS_ROCE_V2_HW_RST_COMPLETION_WAIT;
+		}
+
+		dev_warn(hr_dev->dev, "Func clear failed.\n");
+	} else if (sw_resetting && instance_stage == HNS_ROCE_STATE_INIT) {
+		hr_dev->dis_db = true;
+
+		dev_warn(hr_dev->dev,
+			 "Func clear is pending, device in resetting state.\n");
+		end = HNS_ROCE_V2_HW_RST_TIMEOUT;
+		while (end) {
+			if (ops->ae_dev_reset_cnt(handle) !=
+			    hr_dev->reset_cnt) {
+				hr_dev->is_reset = true;
+				dev_info(hr_dev->dev,
+					 "Func clear success after sw reset\n");
+				return;
+			}
+			msleep(HNS_ROCE_V2_HW_RST_COMPLETION_WAIT);
+			end -= HNS_ROCE_V2_HW_RST_COMPLETION_WAIT;
+		}
+
+		dev_warn(hr_dev->dev, "Func clear failed because of unfinished sw reset\n");
+	} else {
+		if (retval && !flag)
+			dev_warn(hr_dev->dev,
+				 "Func clear read failed, ret = %d.\n", retval);
+
+		dev_warn(hr_dev->dev, "Func clear failed.\n");
+	}
+}
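+
+/*
+ * Ask the firmware to clear this function's state and poll until the
+ * FUNC_CLEAR done flag is reported or the timeout expires.
+ */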
+static void hns_roce_function_clear(struct hns_roce_dev *hr_dev)
+{
+	bool fclr_write_fail_flag = false;
+	struct hns_roce_func_clear *resp;
+	struct hns_roce_cmq_desc desc;
+	unsigned long end;
+	int ret = 0;
+
+	if (hns_roce_func_clr_chk_rst(hr_dev))
+		goto out;
+
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_FUNC_CLEAR, false);
+	resp = (struct hns_roce_func_clear *)desc.data;
+
+	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (ret) {
+		fclr_write_fail_flag = true;
+		dev_err(hr_dev->dev, "Func clear write failed, ret = %d.\n",
+			 ret);
+		goto out;
+	}
+
+	msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL);
+	end = HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS;
+	while (end) {
+		if (hns_roce_func_clr_chk_rst(hr_dev))
+			goto out;
+		msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT);
+		end -= HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT;
+
+		hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_FUNC_CLEAR,
+					      true);
+
+		ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+		if (ret)
+			continue;
+
+		if (roce_get_bit(resp->func_done, FUNC_CLEAR_RST_FUN_DONE_S)) {
+			hr_dev->is_reset = true;
+			return;
+		}
+	}
+
+out:
+	dev_err(hr_dev->dev, "Func clear failed.\n");
+	hns_roce_func_clr_rst_prc(hr_dev, ret, fclr_write_fail_flag);
+}
+
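+/* Read the firmware version over the command queue into caps->fw_ver. */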
+static int hns_roce_query_fw_ver(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_query_fw_info *resp;
+	struct hns_roce_cmq_desc desc;
+	int ret;
+
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_QUERY_FW_VER, true);
+	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (ret)
+		return ret;
+
+	resp = (struct hns_roce_query_fw_info *)desc.data;
+	hr_dev->caps.fw_ver = (u64)(le32_to_cpu(resp->fw_ver));
 
 	return 0;
 }
@@ -1001,10 +1336,75 @@
 	hr_dev->caps.sl_num = roce_get_field(req_b->qid_idx_sl_num,
 					     PF_RES_DATA_3_PF_SL_NUM_M,
 					     PF_RES_DATA_3_PF_SL_NUM_S);
+	hr_dev->caps.sccc_bt_num = roce_get_field(req_b->sccc_bt_idx_num,
+					     PF_RES_DATA_4_PF_SCCC_BT_NUM_M,
+					     PF_RES_DATA_4_PF_SCCC_BT_NUM_S);
 
 	return 0;
 }
 
+static int hns_roce_query_pf_timer_resource(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_pf_timer_res_a *req_a;
+	struct hns_roce_cmq_desc desc[2];
+	int ret, i;
+
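+	/*
+	 * Two chained descriptors are issued; the QPC/CQC timer BT numbers
+	 * are read back from the first one.
+	 */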
+	for (i = 0; i < 2; i++) {
+		hns_roce_cmq_setup_basic_desc(&desc[i],
+					      HNS_ROCE_OPC_QUERY_PF_TIMER_RES,
+					      true);
+
+		if (i == 0)
+			desc[i].flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT);
+		else
+			desc[i].flag &= ~cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT);
+	}
+
+	ret = hns_roce_cmq_send(hr_dev, desc, 2);
+	if (ret)
+		return ret;
+
+	req_a = (struct hns_roce_pf_timer_res_a *)desc[0].data;
+
+	hr_dev->caps.qpc_timer_bt_num =
+				roce_get_field(req_a->qpc_timer_bt_idx_num,
+					PF_RES_DATA_1_PF_QPC_TIMER_BT_NUM_M,
+					PF_RES_DATA_1_PF_QPC_TIMER_BT_NUM_S);
+	hr_dev->caps.cqc_timer_bt_num =
+				roce_get_field(req_a->cqc_timer_bt_idx_num,
+					PF_RES_DATA_2_PF_CQC_TIMER_BT_NUM_M,
+					PF_RES_DATA_2_PF_CQC_TIMER_BT_NUM_S);
+
+	return 0;
+}
+
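+/*
+ * Read-modify-write the per-VF switch attributes: allow loopback and
+ * destination override while keeping local loopback disabled.
+ */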
+static int hns_roce_set_vf_switch_param(struct hns_roce_dev *hr_dev,
+						  int vf_id)
+{
+	struct hns_roce_cmq_desc desc;
+	struct hns_roce_vf_switch *swt;
+	int ret;
+
+	swt = (struct hns_roce_vf_switch *)desc.data;
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_SWITCH_PARAMETER_CFG, true);
+	swt->rocee_sel |= cpu_to_le32(HNS_ICL_SWITCH_CMD_ROCEE_SEL);
+	roce_set_field(swt->fun_id,
+			VF_SWITCH_DATA_FUN_ID_VF_ID_M,
+			VF_SWITCH_DATA_FUN_ID_VF_ID_S,
+			vf_id);
+	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (ret)
+		return ret;
+	desc.flag =
+		cpu_to_le16(HNS_ROCE_CMD_FLAG_NO_INTR | HNS_ROCE_CMD_FLAG_IN);
+	desc.flag &= cpu_to_le16(~HNS_ROCE_CMD_FLAG_WR);
+	roce_set_bit(swt->cfg, VF_SWITCH_DATA_CFG_ALW_LPBK_S, 1);
+	roce_set_bit(swt->cfg, VF_SWITCH_DATA_CFG_ALW_LCL_LPBK_S, 0);
+	roce_set_bit(swt->cfg, VF_SWITCH_DATA_CFG_ALW_DST_OVRD_S, 1);
+
+	return hns_roce_cmq_send(hr_dev, &desc, 1);
+}
+
 static int hns_roce_alloc_vf_resource(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_cmq_desc desc[2];
@@ -1089,6 +1489,14 @@
 				       VF_RES_B_DATA_3_VF_SL_NUM_M,
 				       VF_RES_B_DATA_3_VF_SL_NUM_S,
 				       HNS_ROCE_VF_SL_NUM);
+
+			roce_set_field(req_b->vf_sccc_idx_num,
+				       VF_RES_B_DATA_4_VF_SCCC_BT_IDX_M,
+				       VF_RES_B_DATA_4_VF_SCCC_BT_IDX_S, 0);
+			roce_set_field(req_b->vf_sccc_idx_num,
+				       VF_RES_B_DATA_4_VF_SCCC_BT_NUM_M,
+				       VF_RES_B_DATA_4_VF_SCCC_BT_NUM_S,
+				       HNS_ROCE_VF_SCCC_BT_NUM);
 		}
 	}
 
@@ -1101,6 +1509,7 @@
 	u8 qpc_hop_num = hr_dev->caps.qpc_hop_num;
 	u8 cqc_hop_num = hr_dev->caps.cqc_hop_num;
 	u8 mpt_hop_num = hr_dev->caps.mpt_hop_num;
+	u8 sccc_hop_num = hr_dev->caps.sccc_hop_num;
 	struct hns_roce_cfg_bt_attr *req;
 	struct hns_roce_cmq_desc desc;
 
@@ -1148,6 +1557,20 @@
 		       CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_S,
 		       mpt_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : mpt_hop_num);
 
+	roce_set_field(req->vf_sccc_cfg,
+		       CFG_BT_ATTR_DATA_4_VF_SCCC_BA_PGSZ_M,
+		       CFG_BT_ATTR_DATA_4_VF_SCCC_BA_PGSZ_S,
+		       hr_dev->caps.sccc_ba_pg_sz + PG_SHIFT_OFFSET);
+	roce_set_field(req->vf_sccc_cfg,
+		       CFG_BT_ATTR_DATA_4_VF_SCCC_BUF_PGSZ_M,
+		       CFG_BT_ATTR_DATA_4_VF_SCCC_BUF_PGSZ_S,
+		       hr_dev->caps.sccc_buf_pg_sz + PG_SHIFT_OFFSET);
+	roce_set_field(req->vf_sccc_cfg,
+		       CFG_BT_ATTR_DATA_4_VF_SCCC_HOPNUM_M,
+		       CFG_BT_ATTR_DATA_4_VF_SCCC_HOPNUM_S,
+		       sccc_hop_num ==
+			      HNS_ROCE_HOP_NUM_0 ? 0 : sccc_hop_num);
+
 	return hns_roce_cmq_send(hr_dev, &desc, 1);
 }
 
@@ -1158,6 +1581,13 @@
 
 	ret = hns_roce_cmq_query_hw_info(hr_dev);
 	if (ret) {
+		dev_err(hr_dev->dev, "Query hardware version fail, ret = %d.\n",
+			ret);
+		return ret;
+	}
+
+	ret = hns_roce_query_fw_ver(hr_dev);
+	if (ret) {
 		dev_err(hr_dev->dev, "Query firmware version fail, ret = %d.\n",
 			ret);
 		return ret;
@@ -1178,6 +1608,16 @@
 		return ret;
 	}
 
+	if (hr_dev->pci_dev->revision == 0x21) {
+		ret = hns_roce_query_pf_timer_resource(hr_dev);
+		if (ret) {
+			dev_err(hr_dev->dev,
+				"Query pf timer resource fail, ret = %d.\n",
+				ret);
+			return ret;
+		}
+	}
+
 	ret = hns_roce_alloc_vf_resource(hr_dev);
 	if (ret) {
 		dev_err(hr_dev->dev, "Allocate vf resource fail, ret = %d.\n",
@@ -1185,16 +1625,31 @@
 		return ret;
 	}
 
-	hr_dev->vendor_part_id = 0;
-	hr_dev->sys_image_guid = 0;
+	if (hr_dev->pci_dev->revision == 0x21) {
+		ret = hns_roce_set_vf_switch_param(hr_dev, 0);
+		if (ret) {
+			dev_err(hr_dev->dev,
+				"Set function switch param fail, ret = %d.\n",
+				ret);
+			return ret;
+		}
+	}
+
+	hr_dev->vendor_part_id = hr_dev->pci_dev->device;
+	hr_dev->sys_image_guid = be64_to_cpu(hr_dev->ib_dev.node_guid);
 
 	caps->num_qps		= HNS_ROCE_V2_MAX_QP_NUM;
 	caps->max_wqes		= HNS_ROCE_V2_MAX_WQE_NUM;
 	caps->num_cqs		= HNS_ROCE_V2_MAX_CQ_NUM;
+	caps->num_srqs		= HNS_ROCE_V2_MAX_SRQ_NUM;
+	caps->min_cqes		= HNS_ROCE_MIN_CQE_NUM;
 	caps->max_cqes		= HNS_ROCE_V2_MAX_CQE_NUM;
+	caps->max_srqwqes	= HNS_ROCE_V2_MAX_SRQWQE_NUM;
 	caps->max_sq_sg		= HNS_ROCE_V2_MAX_SQ_SGE_NUM;
+	caps->max_extend_sg	= HNS_ROCE_V2_MAX_EXTEND_SGE_NUM;
 	caps->max_rq_sg		= HNS_ROCE_V2_MAX_RQ_SGE_NUM;
 	caps->max_sq_inline	= HNS_ROCE_V2_MAX_SQ_INLINE;
+	caps->max_srq_sg	= HNS_ROCE_V2_MAX_SRQ_SGE_NUM;
 	caps->num_uars		= HNS_ROCE_V2_UAR_NUM;
 	caps->phy_num_uars	= HNS_ROCE_V2_PHY_UAR_NUM;
 	caps->num_aeq_vectors	= HNS_ROCE_V2_AEQE_VEC_NUM;
@@ -1203,6 +1658,8 @@
 	caps->num_mtpts		= HNS_ROCE_V2_MAX_MTPT_NUM;
 	caps->num_mtt_segs	= HNS_ROCE_V2_MAX_MTT_SEGS;
 	caps->num_cqe_segs	= HNS_ROCE_V2_MAX_CQE_SEGS;
+	caps->num_srqwqe_segs	= HNS_ROCE_V2_MAX_SRQWQE_SEGS;
+	caps->num_idx_segs	= HNS_ROCE_V2_MAX_IDX_SEGS;
 	caps->num_pds		= HNS_ROCE_V2_MAX_PD_NUM;
 	caps->max_qp_init_rdma	= HNS_ROCE_V2_MAX_QP_INIT_RDMA;
 	caps->max_qp_dest_rdma	= HNS_ROCE_V2_MAX_QP_DEST_RDMA;
@@ -1213,8 +1670,10 @@
 	caps->irrl_entry_sz	= HNS_ROCE_V2_IRRL_ENTRY_SZ;
 	caps->trrl_entry_sz	= HNS_ROCE_V2_TRRL_ENTRY_SZ;
 	caps->cqc_entry_sz	= HNS_ROCE_V2_CQC_ENTRY_SZ;
+	caps->srqc_entry_sz	= HNS_ROCE_V2_SRQC_ENTRY_SZ;
 	caps->mtpt_entry_sz	= HNS_ROCE_V2_MTPT_ENTRY_SZ;
 	caps->mtt_entry_sz	= HNS_ROCE_V2_MTT_ENTRY_SZ;
+	caps->idx_entry_sz	= 4;
 	caps->cq_entry_sz	= HNS_ROCE_V2_CQE_ENTRY_SIZE;
 	caps->page_size_cap	= HNS_ROCE_V2_PAGE_SIZE_SUPPORTED;
 	caps->reserved_lkey	= 0;
@@ -1222,28 +1681,39 @@
 	caps->reserved_mrws	= 1;
 	caps->reserved_uars	= 0;
 	caps->reserved_cqs	= 0;
+	caps->reserved_srqs	= 0;
+	caps->reserved_qps	= HNS_ROCE_V2_RSV_QPS;
 
 	caps->qpc_ba_pg_sz	= 0;
 	caps->qpc_buf_pg_sz	= 0;
 	caps->qpc_hop_num	= HNS_ROCE_CONTEXT_HOP_NUM;
 	caps->srqc_ba_pg_sz	= 0;
 	caps->srqc_buf_pg_sz	= 0;
-	caps->srqc_hop_num	= HNS_ROCE_HOP_NUM_0;
+	caps->srqc_hop_num	= HNS_ROCE_CONTEXT_HOP_NUM;
 	caps->cqc_ba_pg_sz	= 0;
 	caps->cqc_buf_pg_sz	= 0;
 	caps->cqc_hop_num	= HNS_ROCE_CONTEXT_HOP_NUM;
 	caps->mpt_ba_pg_sz	= 0;
 	caps->mpt_buf_pg_sz	= 0;
 	caps->mpt_hop_num	= HNS_ROCE_CONTEXT_HOP_NUM;
-	caps->pbl_ba_pg_sz	= 0;
+	caps->pbl_ba_pg_sz	= 2;
 	caps->pbl_buf_pg_sz	= 0;
 	caps->pbl_hop_num	= HNS_ROCE_PBL_HOP_NUM;
 	caps->mtt_ba_pg_sz	= 0;
 	caps->mtt_buf_pg_sz	= 0;
 	caps->mtt_hop_num	= HNS_ROCE_MTT_HOP_NUM;
-	caps->cqe_ba_pg_sz	= 0;
+	caps->wqe_sq_hop_num	= 2;
+	caps->wqe_sge_hop_num	= 1;
+	caps->wqe_rq_hop_num	= 2;
+	caps->cqe_ba_pg_sz	= 6;
 	caps->cqe_buf_pg_sz	= 0;
 	caps->cqe_hop_num	= HNS_ROCE_CQE_HOP_NUM;
+	caps->srqwqe_ba_pg_sz	= 0;
+	caps->srqwqe_buf_pg_sz	= 0;
+	caps->srqwqe_hop_num	= HNS_ROCE_SRQWQE_HOP_NUM;
+	caps->idx_ba_pg_sz	= 0;
+	caps->idx_buf_pg_sz	= 0;
+	caps->idx_hop_num	= HNS_ROCE_IDX_HOP_NUM;
 	caps->eqe_ba_pg_sz	= 0;
 	caps->eqe_buf_pg_sz	= 0;
 	caps->eqe_hop_num	= HNS_ROCE_EQE_HOP_NUM;
@@ -1255,6 +1725,11 @@
 				  HNS_ROCE_CAP_FLAG_RQ_INLINE |
 				  HNS_ROCE_CAP_FLAG_RECORD_DB |
 				  HNS_ROCE_CAP_FLAG_SQ_RECORD_DB;
+
+	if (hr_dev->pci_dev->revision == 0x21)
+		caps->flags |= HNS_ROCE_CAP_FLAG_MW |
+			       HNS_ROCE_CAP_FLAG_FRMR;
+
 	caps->pkey_table_len[0] = 1;
 	caps->gid_table_len[0] = HNS_ROCE_V2_GID_INDEX_NUM;
 	caps->ceqe_depth	= HNS_ROCE_V2_COMP_EQE_NUM;
@@ -1262,6 +1737,32 @@
 	caps->local_ca_ack_delay = 0;
 	caps->max_mtu = IB_MTU_4096;
 
+	caps->max_srqs		= HNS_ROCE_V2_MAX_SRQ;
+	caps->max_srq_wrs	= HNS_ROCE_V2_MAX_SRQ_WR;
+	caps->max_srq_sges	= HNS_ROCE_V2_MAX_SRQ_SGE;
+
+	if (hr_dev->pci_dev->revision == 0x21) {
+		caps->flags |= HNS_ROCE_CAP_FLAG_ATOMIC |
+			       HNS_ROCE_CAP_FLAG_SRQ |
+			       HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL;
+
+		caps->num_qpc_timer	  = HNS_ROCE_V2_MAX_QPC_TIMER_NUM;
+		caps->qpc_timer_entry_sz  = HNS_ROCE_V2_QPC_TIMER_ENTRY_SZ;
+		caps->qpc_timer_ba_pg_sz  = 0;
+		caps->qpc_timer_buf_pg_sz = 0;
+		caps->qpc_timer_hop_num   = HNS_ROCE_HOP_NUM_0;
+		caps->num_cqc_timer	  = HNS_ROCE_V2_MAX_CQC_TIMER_NUM;
+		caps->cqc_timer_entry_sz  = HNS_ROCE_V2_CQC_TIMER_ENTRY_SZ;
+		caps->cqc_timer_ba_pg_sz  = 0;
+		caps->cqc_timer_buf_pg_sz = 0;
+		caps->cqc_timer_hop_num   = HNS_ROCE_HOP_NUM_0;
+
+		caps->sccc_entry_sz	= HNS_ROCE_V2_SCCC_ENTRY_SZ;
+		caps->sccc_ba_pg_sz	= 0;
+		caps->sccc_buf_pg_sz    = 0;
+		caps->sccc_hop_num	= HNS_ROCE_SCCC_HOP_NUM;
+	}
+
 	ret = hns_roce_v2_set_bt(hr_dev);
 	if (ret)
 		dev_err(hr_dev->dev, "Configure bt attribute fail, ret = %d.\n",
@@ -1312,9 +1813,10 @@
 			desc[i].flag &= ~cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT);
 
 		if (i == 0) {
-			req_a->base_addr_l = link_tbl->table.map & 0xffffffff;
-			req_a->base_addr_h = (link_tbl->table.map >> 32) &
-					     0xffffffff;
+			req_a->base_addr_l =
+				cpu_to_le32(link_tbl->table.map & 0xffffffff);
+			req_a->base_addr_h =
+				cpu_to_le32(link_tbl->table.map >> 32);
 			roce_set_field(req_a->depth_pgsz_init_en,
 				       CFG_LLM_QUE_DEPTH_M,
 				       CFG_LLM_QUE_DEPTH_S,
@@ -1323,13 +1825,15 @@
 				       CFG_LLM_QUE_PGSZ_M,
 				       CFG_LLM_QUE_PGSZ_S,
 				       link_tbl->pg_sz);
-			req_a->head_ba_l = entry[0].blk_ba0;
-			req_a->head_ba_h_nxtptr = entry[0].blk_ba1_nxt_ptr;
+			req_a->head_ba_l = cpu_to_le32(entry[0].blk_ba0);
+			req_a->head_ba_h_nxtptr =
+				cpu_to_le32(entry[0].blk_ba1_nxt_ptr);
 			roce_set_field(req_a->head_ptr,
 				       CFG_LLM_HEAD_PTR_M,
 				       CFG_LLM_HEAD_PTR_S, 0);
 		} else {
-			req_b->tail_ba_l = entry[page_num - 1].blk_ba0;
+			req_b->tail_ba_l =
+				cpu_to_le32(entry[page_num - 1].blk_ba0);
 			roce_set_field(req_b->tail_ba_h,
 				       CFG_LLM_TAIL_BA_H_M,
 				       CFG_LLM_TAIL_BA_H_S,
@@ -1404,19 +1908,14 @@
 			goto err_alloc_buf_failed;
 
 		link_tbl->pg_list[i].map = t;
-		memset(link_tbl->pg_list[i].buf, 0, buf_chk_sz);
 
-		entry[i].blk_ba0 = (t >> 12) & 0xffffffff;
-		roce_set_field(entry[i].blk_ba1_nxt_ptr,
-			       HNS_ROCE_LINK_TABLE_BA1_M,
-			       HNS_ROCE_LINK_TABLE_BA1_S,
-			       t >> 44);
+		entry[i].blk_ba0 = (u32)(t >> 12);
+		entry[i].blk_ba1_nxt_ptr = (u32)(t >> 44);
 
 		if (i < (pg_num - 1))
-			roce_set_field(entry[i].blk_ba1_nxt_ptr,
-				       HNS_ROCE_LINK_TABLE_NXT_PTR_M,
-				       HNS_ROCE_LINK_TABLE_NXT_PTR_S,
-				       i + 1);
+			entry[i].blk_ba1_nxt_ptr |=
+				(i + 1) << HNS_ROCE_LINK_TABLE_NXT_PTR_S;
+
 	}
 	link_tbl->npages = pg_num;
 	link_tbl->pg_sz = buf_chk_sz;
@@ -1461,7 +1960,8 @@
 static int hns_roce_v2_init(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_v2_priv *priv = hr_dev->priv;
-	int ret;
+	int qpc_count, cqc_count;
+	int ret, i;
 
 	/* TSQ includes SQ doorbell and ack doorbell */
 	ret = hns_roce_init_link_table(hr_dev, TSQ_LINK_TABLE);
@@ -1476,8 +1976,40 @@
 		goto err_tpq_init_failed;
 	}
 
+	/* Alloc memory for QPC Timer buffer space chunk */
+	for (qpc_count = 0; qpc_count < hr_dev->caps.qpc_timer_bt_num;
+	     qpc_count++) {
+		ret = hns_roce_table_get(hr_dev, &hr_dev->qpc_timer_table,
+					 qpc_count);
+		if (ret) {
+			dev_err(hr_dev->dev, "QPC Timer get failed\n");
+			goto err_qpc_timer_failed;
+		}
+	}
+
+	/* Alloc memory for CQC Timer buffer space chunk */
+	for (cqc_count = 0; cqc_count < hr_dev->caps.cqc_timer_bt_num;
+	     cqc_count++) {
+		ret = hns_roce_table_get(hr_dev, &hr_dev->cqc_timer_table,
+					 cqc_count);
+		if (ret) {
+			dev_err(hr_dev->dev, "CQC Timer get failed\n");
+			goto err_cqc_timer_failed;
+		}
+	}
+
 	return 0;
 
+err_cqc_timer_failed:
+	for (i = 0; i < cqc_count; i++)
+		hns_roce_table_put(hr_dev, &hr_dev->cqc_timer_table, i);
+
+err_qpc_timer_failed:
+	for (i = 0; i < qpc_count; i++)
+		hns_roce_table_put(hr_dev, &hr_dev->qpc_timer_table, i);
+
+	hns_roce_free_link_table(hr_dev, &priv->tpq);
+
 err_tpq_init_failed:
 	hns_roce_free_link_table(hr_dev, &priv->tsq);
 
@@ -1488,34 +2020,69 @@
 {
 	struct hns_roce_v2_priv *priv = hr_dev->priv;
 
+	if (hr_dev->pci_dev->revision == 0x21)
+		hns_roce_function_clear(hr_dev);
+
 	hns_roce_free_link_table(hr_dev, &priv->tpq);
 	hns_roce_free_link_table(hr_dev, &priv->tsq);
 }
 
+static int hns_roce_query_mbox_status(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_cmq_desc desc;
+	struct hns_roce_mbox_status *mb_st =
+				       (struct hns_roce_mbox_status *)desc.data;
+	enum hns_roce_cmd_return_status status;
+
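+	/*
+	 * Mailbox status is read through the command queue rather than from
+	 * the ROCEE_VF_MB_STATUS_REG MMIO register.
+	 */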
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_QUERY_MB_ST, true);
+
+	status = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (status)
+		return status;
+
+	return le32_to_cpu(mb_st->mb_status_hw_run);
+}
+
 static int hns_roce_v2_cmd_pending(struct hns_roce_dev *hr_dev)
 {
-	u32 status = readl(hr_dev->reg_base + ROCEE_VF_MB_STATUS_REG);
+	u32 status = hns_roce_query_mbox_status(hr_dev);
 
 	return status >> HNS_ROCE_HW_RUN_BIT_SHIFT;
 }
 
 static int hns_roce_v2_cmd_complete(struct hns_roce_dev *hr_dev)
 {
-	u32 status = readl(hr_dev->reg_base + ROCEE_VF_MB_STATUS_REG);
+	u32 status = hns_roce_query_mbox_status(hr_dev);
 
 	return status & HNS_ROCE_HW_MB_STATUS_MASK;
 }
 
+static int hns_roce_mbox_post(struct hns_roce_dev *hr_dev, u64 in_param,
+			      u64 out_param, u32 in_modifier, u8 op_modifier,
+			      u16 op, u16 token, int event)
+{
+	struct hns_roce_cmq_desc desc;
+	struct hns_roce_post_mbox *mb = (struct hns_roce_post_mbox *)desc.data;
+
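+	/*
+	 * Pack the mailbox parameters into a POST_MB descriptor and issue it
+	 * over the command queue instead of the MMIO mailbox registers.
+	 */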
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_POST_MB, false);
+
+	mb->in_param_l = cpu_to_le32(in_param);
+	mb->in_param_h = cpu_to_le32(in_param >> 32);
+	mb->out_param_l = cpu_to_le32(out_param);
+	mb->out_param_h = cpu_to_le32(out_param >> 32);
+	mb->cmd_tag = cpu_to_le32(in_modifier << 8 | op);
+	mb->token_event_en = cpu_to_le32(event << 16 | token);
+
+	return hns_roce_cmq_send(hr_dev, &desc, 1);
+}
+
 static int hns_roce_v2_post_mbox(struct hns_roce_dev *hr_dev, u64 in_param,
 				 u64 out_param, u32 in_modifier, u8 op_modifier,
 				 u16 op, u16 token, int event)
 {
 	struct device *dev = hr_dev->dev;
-	u32 __iomem *hcr = (u32 __iomem *)(hr_dev->reg_base +
-					   ROCEE_VF_MB_CFG0_REG);
 	unsigned long end;
-	u32 val0 = 0;
-	u32 val1 = 0;
+	int ret;
 
 	end = msecs_to_jiffies(HNS_ROCE_V2_GO_BIT_TIMEOUT_MSECS) + jiffies;
 	while (hns_roce_v2_cmd_pending(hr_dev)) {
@@ -1527,34 +2094,19 @@
 		cond_resched();
 	}
 
-	roce_set_field(val0, HNS_ROCE_VF_MB4_TAG_MASK,
-		       HNS_ROCE_VF_MB4_TAG_SHIFT, in_modifier);
-	roce_set_field(val0, HNS_ROCE_VF_MB4_CMD_MASK,
-		       HNS_ROCE_VF_MB4_CMD_SHIFT, op);
-	roce_set_field(val1, HNS_ROCE_VF_MB5_EVENT_MASK,
-		       HNS_ROCE_VF_MB5_EVENT_SHIFT, event);
-	roce_set_field(val1, HNS_ROCE_VF_MB5_TOKEN_MASK,
-		       HNS_ROCE_VF_MB5_TOKEN_SHIFT, token);
+	ret = hns_roce_mbox_post(hr_dev, in_param, out_param, in_modifier,
+				 op_modifier, op, token, event);
+	if (ret)
+		dev_err(dev, "Post mailbox failed(%d)\n", ret);
 
-	writeq(in_param, hcr + 0);
-	writeq(out_param, hcr + 2);
-
-	/* Memory barrier */
-	wmb();
-
-	writel(val0, hcr + 4);
-	writel(val1, hcr + 5);
-
-	mmiowb();
-
-	return 0;
+	return ret;
 }
 
 static int hns_roce_v2_chk_mbox(struct hns_roce_dev *hr_dev,
 				unsigned long timeout)
 {
 	struct device *dev = hr_dev->dev;
-	unsigned long end = 0;
+	unsigned long end;
 	u32 status;
 
 	end = msecs_to_jiffies(timeout) + jiffies;
@@ -1568,6 +2120,9 @@
 
 	status = hns_roce_v2_cmd_complete(hr_dev);
 	if (status != 0x1) {
+		if (status == CMD_RST_PRC_EBUSY)
+			return status;
+
 		dev_err(dev, "mailbox status 0x%x!\n", status);
 		return -EBUSY;
 	}
@@ -1656,7 +2211,7 @@
 	roce_set_field(smac_tb->vf_smac_h_rsv,
 		       CFG_SMAC_TB_VF_SMAC_H_M,
 		       CFG_SMAC_TB_VF_SMAC_H_S, reg_smac_h);
-	smac_tb->vf_smac_l = reg_smac_l;
+	smac_tb->vf_smac_l = cpu_to_le32(reg_smac_l);
 
 	return hns_roce_cmq_send(hr_dev, &desc, 1);
 }
@@ -1664,12 +2219,10 @@
 static int set_mtpt_pbl(struct hns_roce_v2_mpt_entry *mpt_entry,
 			struct hns_roce_mr *mr)
 {
-	struct scatterlist *sg;
+	struct sg_dma_page_iter sg_iter;
 	u64 page_addr;
 	u64 *pages;
-	int i, j;
-	int len;
-	int entry;
+	int i;
 
 	mpt_entry->pbl_size = cpu_to_le32(mr->pbl_size);
 	mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(mr->pbl_ba >> 3));
@@ -1682,17 +2235,14 @@
 		return -ENOMEM;
 
 	i = 0;
-	for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) {
-		len = sg_dma_len(sg) >> PAGE_SHIFT;
-		for (j = 0; j < len; ++j) {
-			page_addr = sg_dma_address(sg) +
-				(j << mr->umem->page_shift);
-			pages[i] = page_addr >> 6;
-			/* Record the first 2 entry directly to MTPT table */
-			if (i >= HNS_ROCE_V2_MAX_INNER_MTPT_NUM - 1)
-				goto found;
-			i++;
-		}
+	for_each_sg_dma_page(mr->umem->sg_head.sgl, &sg_iter, mr->umem->nmap, 0) {
+		page_addr = sg_page_iter_dma_address(&sg_iter);
+		pages[i] = page_addr >> 6;
+
+		/* Record the first 2 entries directly in the MTPT table */
+		if (i >= HNS_ROCE_V2_MAX_INNER_MTPT_NUM - 1)
+			goto found;
+		i++;
 	}
 found:
 	mpt_entry->pa0_l = cpu_to_le32(lower_32_bits(pages[0]));
@@ -1734,11 +2284,12 @@
 		       V2_MPT_BYTE_4_PD_S, mr->pd);
 
 	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RA_EN_S, 0);
-	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_R_INV_EN_S, 1);
-	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 0);
+	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_R_INV_EN_S, 0);
+	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 1);
 	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_BIND_EN_S,
 		     (mr->access & IB_ACCESS_MW_BIND ? 1 : 0));
-	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_ATOMIC_EN_S, 0);
+	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_ATOMIC_EN_S,
+		     mr->access & IB_ACCESS_REMOTE_ATOMIC ? 1 : 0);
 	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RR_EN_S,
 		     (mr->access & IB_ACCESS_REMOTE_READ ? 1 : 0));
 	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RW_EN_S,
@@ -1773,6 +2324,9 @@
 	struct hns_roce_v2_mpt_entry *mpt_entry = mb_buf;
 	int ret = 0;
 
+	roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_MPT_ST_M,
+		       V2_MPT_BYTE_4_MPT_ST_S, V2_MPT_ST_VALID);
+
 	if (flags & IB_MR_REREG_PD) {
 		roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M,
 			       V2_MPT_BYTE_4_PD_S, pdn);
@@ -1809,6 +2363,88 @@
 	return ret;
 }
 
+static int hns_roce_v2_frmr_write_mtpt(void *mb_buf, struct hns_roce_mr *mr)
+{
+	struct hns_roce_v2_mpt_entry *mpt_entry;
+
+	mpt_entry = mb_buf;
+	memset(mpt_entry, 0, sizeof(*mpt_entry));
+
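+	/*
+	 * A fast-register MR starts out in the FREE state with a one-level
+	 * PBL; fast-register, local and remote invalidation are all enabled.
+	 */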
+	roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_MPT_ST_M,
+		       V2_MPT_BYTE_4_MPT_ST_S, V2_MPT_ST_FREE);
+	roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PBL_HOP_NUM_M,
+		       V2_MPT_BYTE_4_PBL_HOP_NUM_S, 1);
+	roce_set_field(mpt_entry->byte_4_pd_hop_st,
+		       V2_MPT_BYTE_4_PBL_BA_PG_SZ_M,
+		       V2_MPT_BYTE_4_PBL_BA_PG_SZ_S,
+		       mr->pbl_ba_pg_sz + PG_SHIFT_OFFSET);
+	roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M,
+		       V2_MPT_BYTE_4_PD_S, mr->pd);
+
+	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RA_EN_S, 1);
+	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_R_INV_EN_S, 1);
+	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 1);
+
+	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_FRE_S, 1);
+	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_PA_S, 0);
+	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_MR_MW_S, 0);
+	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_BPD_S, 1);
+
+	mpt_entry->pbl_size = cpu_to_le32(mr->pbl_size);
+
+	mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(mr->pbl_ba >> 3));
+	roce_set_field(mpt_entry->byte_48_mode_ba, V2_MPT_BYTE_48_PBL_BA_H_M,
+		       V2_MPT_BYTE_48_PBL_BA_H_S,
+		       upper_32_bits(mr->pbl_ba >> 3));
+
+	roce_set_field(mpt_entry->byte_64_buf_pa1,
+		       V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M,
+		       V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S,
+		       mr->pbl_buf_pg_sz + PG_SHIFT_OFFSET);
+
+	return 0;
+}
+
+static int hns_roce_v2_mw_write_mtpt(void *mb_buf, struct hns_roce_mw *mw)
+{
+	struct hns_roce_v2_mpt_entry *mpt_entry;
+
+	mpt_entry = mb_buf;
+	memset(mpt_entry, 0, sizeof(*mpt_entry));
+
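+	/*
+	 * Memory windows start in the FREE state and are bound to a PD; the
+	 * BQP bit marks type-2 windows, which are additionally bound to a QP.
+	 */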
+	roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_MPT_ST_M,
+		       V2_MPT_BYTE_4_MPT_ST_S, V2_MPT_ST_FREE);
+	roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M,
+		       V2_MPT_BYTE_4_PD_S, mw->pdn);
+	roce_set_field(mpt_entry->byte_4_pd_hop_st,
+		       V2_MPT_BYTE_4_PBL_HOP_NUM_M,
+		       V2_MPT_BYTE_4_PBL_HOP_NUM_S,
+		       mw->pbl_hop_num == HNS_ROCE_HOP_NUM_0 ?
+		       0 : mw->pbl_hop_num);
+	roce_set_field(mpt_entry->byte_4_pd_hop_st,
+		       V2_MPT_BYTE_4_PBL_BA_PG_SZ_M,
+		       V2_MPT_BYTE_4_PBL_BA_PG_SZ_S,
+		       mw->pbl_ba_pg_sz + PG_SHIFT_OFFSET);
+
+	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_R_INV_EN_S, 1);
+	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 1);
+
+	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_PA_S, 0);
+	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_MR_MW_S, 1);
+	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_BPD_S, 1);
+	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_BQP_S,
+		     mw->ibmw.type == IB_MW_TYPE_1 ? 0 : 1);
+
+	roce_set_field(mpt_entry->byte_64_buf_pa1,
+		       V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M,
+		       V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S,
+		       mw->pbl_buf_pg_sz + PG_SHIFT_OFFSET);
+
+	mpt_entry->lkey = cpu_to_le32(mw->rkey);
+
+	return 0;
+}
+
 static void *get_cqe_v2(struct hns_roce_cq *hr_cq, int n)
 {
 	return hns_roce_buf_offset(&hr_cq->hr_buf.hr_buf,
@@ -1829,6 +2465,22 @@
 	return get_sw_cqe_v2(hr_cq, hr_cq->cons_index);
 }
 
+static void *get_srq_wqe(struct hns_roce_srq *srq, int n)
+{
+	return hns_roce_buf_offset(&srq->buf, n << srq->wqe_shift);
+}
+
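+/* Return an SRQ WQE index to the bitmap and advance the SRQ tail. */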
+static void hns_roce_free_srq_wqe(struct hns_roce_srq *srq, int wqe_index)
+{
+	/* always called with interrupts disabled. */
+	spin_lock(&srq->lock);
+
+	bitmap_clear(srq->idx_que.bitmap, wqe_index, 1);
+	srq->tail++;
+
+	spin_unlock(&srq->lock);
+}
+
 static void hns_roce_v2_cq_set_ci(struct hns_roce_cq *hr_cq, u32 cons_index)
 {
 	*hr_cq->set_ci_db = cons_index & 0xffffff;
@@ -1840,11 +2492,12 @@
 	struct hns_roce_v2_cqe *cqe, *dest;
 	u32 prod_index;
 	int nfreed = 0;
+	int wqe_index;
 	u8 owner_bit;
 
 	for (prod_index = hr_cq->cons_index; get_sw_cqe_v2(hr_cq, prod_index);
 	     ++prod_index) {
-		if (prod_index == hr_cq->cons_index + hr_cq->ib_cq.cqe)
+		if (prod_index > hr_cq->cons_index + hr_cq->ib_cq.cqe)
 			break;
 	}
 
@@ -1857,7 +2510,13 @@
 		if ((roce_get_field(cqe->byte_16, V2_CQE_BYTE_16_LCL_QPN_M,
 				    V2_CQE_BYTE_16_LCL_QPN_S) &
 				    HNS_ROCE_V2_CQE_QPN_MASK) == qpn) {
-			/* In v1 engine, not support SRQ */
+			if (srq &&
+			    roce_get_bit(cqe->byte_4, V2_CQE_BYTE_4_S_R_S)) {
+				wqe_index = roce_get_field(cqe->byte_4,
+						     V2_CQE_BYTE_4_WQE_INDX_M,
+						     V2_CQE_BYTE_4_WQE_INDX_S);
+				hns_roce_free_srq_wqe(srq, wqe_index);
+			}
 			++nfreed;
 		} else if (nfreed) {
 			dest = get_cqe_v2(hr_cq, (prod_index + nfreed) &
@@ -1907,29 +2566,26 @@
 		       V2_CQC_BYTE_4_SHIFT_S, ilog2((unsigned int)nent));
 	roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_CEQN_M,
 		       V2_CQC_BYTE_4_CEQN_S, vector);
-	cq_context->byte_4_pg_ceqn = cpu_to_le32(cq_context->byte_4_pg_ceqn);
 
 	roce_set_field(cq_context->byte_8_cqn, V2_CQC_BYTE_8_CQN_M,
 		       V2_CQC_BYTE_8_CQN_S, hr_cq->cqn);
 
-	cq_context->cqe_cur_blk_addr = (u32)(mtts[0] >> PAGE_ADDR_SHIFT);
-	cq_context->cqe_cur_blk_addr =
-				cpu_to_le32(cq_context->cqe_cur_blk_addr);
+	cq_context->cqe_cur_blk_addr = cpu_to_le32(mtts[0] >> PAGE_ADDR_SHIFT);
 
 	roce_set_field(cq_context->byte_16_hop_addr,
 		       V2_CQC_BYTE_16_CQE_CUR_BLK_ADDR_M,
 		       V2_CQC_BYTE_16_CQE_CUR_BLK_ADDR_S,
-		       cpu_to_le32((mtts[0]) >> (32 + PAGE_ADDR_SHIFT)));
+		       mtts[0] >> (32 + PAGE_ADDR_SHIFT));
 	roce_set_field(cq_context->byte_16_hop_addr,
 		       V2_CQC_BYTE_16_CQE_HOP_NUM_M,
 		       V2_CQC_BYTE_16_CQE_HOP_NUM_S, hr_dev->caps.cqe_hop_num ==
 		       HNS_ROCE_HOP_NUM_0 ? 0 : hr_dev->caps.cqe_hop_num);
 
-	cq_context->cqe_nxt_blk_addr = (u32)(mtts[1] >> PAGE_ADDR_SHIFT);
+	cq_context->cqe_nxt_blk_addr = cpu_to_le32(mtts[1] >> PAGE_ADDR_SHIFT);
 	roce_set_field(cq_context->byte_24_pgsz_addr,
 		       V2_CQC_BYTE_24_CQE_NXT_BLK_ADDR_M,
 		       V2_CQC_BYTE_24_CQE_NXT_BLK_ADDR_S,
-		       cpu_to_le32((mtts[1]) >> (32 + PAGE_ADDR_SHIFT)));
+		       mtts[1] >> (32 + PAGE_ADDR_SHIFT));
 	roce_set_field(cq_context->byte_24_pgsz_addr,
 		       V2_CQC_BYTE_24_CQE_BA_PG_SZ_M,
 		       V2_CQC_BYTE_24_CQE_BA_PG_SZ_S,
@@ -1939,7 +2595,7 @@
 		       V2_CQC_BYTE_24_CQE_BUF_PG_SZ_S,
 		       hr_dev->caps.cqe_buf_pg_sz + PG_SHIFT_OFFSET);
 
-	cq_context->cqe_ba = (u32)(dma_handle >> 3);
+	cq_context->cqe_ba = cpu_to_le32(dma_handle >> 3);
 
 	roce_set_field(cq_context->byte_40_cqe_ba, V2_CQC_BYTE_40_CQE_BA_M,
 		       V2_CQC_BYTE_40_CQE_BA_S, (dma_handle >> (32 + 3)));
@@ -1952,7 +2608,7 @@
 		       V2_CQC_BYTE_44_DB_RECORD_ADDR_M,
 		       V2_CQC_BYTE_44_DB_RECORD_ADDR_S,
 		       ((u32)hr_cq->db.dma) >> 1);
-	cq_context->db_record_addr = hr_cq->db.dma >> 32;
+	cq_context->db_record_addr = cpu_to_le32(hr_cq->db.dma >> 32);
 
 	roce_set_field(cq_context->byte_56_cqe_period_maxcnt,
 		       V2_CQC_BYTE_56_CQ_MAX_CNT_M,
@@ -1967,9 +2623,10 @@
 static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq,
 				     enum ib_cq_notify_flags flags)
 {
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibcq->device);
 	struct hns_roce_cq *hr_cq = to_hr_cq(ibcq);
 	u32 notification_flag;
-	u32 doorbell[2];
+	__le32 doorbell[2];
 
 	doorbell[0] = 0;
 	doorbell[1] = 0;
@@ -1992,7 +2649,7 @@
 	roce_set_bit(doorbell[1], V2_CQ_DB_PARAMETER_NOTIFY_S,
 		     notification_flag);
 
-	hns_roce_write64_k(doorbell, hr_cq->cq_db_l);
+	hns_roce_write64(hr_dev, doorbell, hr_cq->cq_db_l);
 
 	return 0;
 }
@@ -2034,6 +2691,7 @@
 static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 				struct hns_roce_qp **cur_qp, struct ib_wc *wc)
 {
+	struct hns_roce_srq *srq = NULL;
 	struct hns_roce_dev *hr_dev;
 	struct hns_roce_v2_cqe *cqe;
 	struct hns_roce_qp *hr_qp;
@@ -2076,6 +2734,37 @@
 	wc->qp = &(*cur_qp)->ibqp;
 	wc->vendor_err = 0;
 
+	if (is_send) {
+		wq = &(*cur_qp)->sq;
+		if ((*cur_qp)->sq_signal_bits) {
+			/*
+			 * If sq_signal_bits is set, the tail pointer is
+			 * first advanced to the WQE that the current CQE
+			 * corresponds to.
+			 */
+			wqe_ctr = (u16)roce_get_field(cqe->byte_4,
+						      V2_CQE_BYTE_4_WQE_INDX_M,
+						      V2_CQE_BYTE_4_WQE_INDX_S);
+			wq->tail += (wqe_ctr - (u16)wq->tail) &
+				    (wq->wqe_cnt - 1);
+		}
+
+		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		++wq->tail;
+	} else if ((*cur_qp)->ibqp.srq) {
+		srq = to_hr_srq((*cur_qp)->ibqp.srq);
+		wqe_ctr = (u16)roce_get_field(cqe->byte_4,
+					      V2_CQE_BYTE_4_WQE_INDX_M,
+					      V2_CQE_BYTE_4_WQE_INDX_S);
+		wc->wr_id = srq->wrid[wqe_ctr];
+		hns_roce_free_srq_wqe(srq, wqe_ctr);
+	} else {
+		/* Update tail pointer, record wr_id */
+		wq = &(*cur_qp)->rq;
+		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		++wq->tail;
+	}
+
 	status = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_STATUS_M,
 				V2_CQE_BYTE_4_STATUS_S);
 	switch (status & HNS_ROCE_V2_CQE_STATUS_MASK) {
@@ -2195,23 +2884,6 @@
 			wc->status = IB_WC_GENERAL_ERR;
 			break;
 		}
-
-		wq = &(*cur_qp)->sq;
-		if ((*cur_qp)->sq_signal_bits) {
-			/*
-			 * If sg_signal_bit is 1,
-			 * firstly tail pointer updated to wqe
-			 * which current cqe correspond to
-			 */
-			wqe_ctr = (u16)roce_get_field(cqe->byte_4,
-						      V2_CQE_BYTE_4_WQE_INDX_M,
-						      V2_CQE_BYTE_4_WQE_INDX_S);
-			wq->tail += (wqe_ctr - (u16)wq->tail) &
-				    (wq->wqe_cnt - 1);
-		}
-
-		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
-		++wq->tail;
 	} else {
 		/* RQ correspond to CQE */
 		wc->byte_len = le32_to_cpu(cqe->byte_cnt);
@@ -2256,16 +2928,12 @@
 				return -EAGAIN;
 		}
 
-		/* Update tail pointer, record wr_id */
-		wq = &(*cur_qp)->rq;
-		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
-		++wq->tail;
-
 		wc->sl = (u8)roce_get_field(cqe->byte_32, V2_CQE_BYTE_32_SL_M,
 					    V2_CQE_BYTE_32_SL_S);
 		wc->src_qp = (u8)roce_get_field(cqe->byte_32,
 						V2_CQE_BYTE_32_RMT_QPN_M,
 						V2_CQE_BYTE_32_RMT_QPN_S);
+		wc->slid = 0;
 		wc->wc_flags |= (roce_get_bit(cqe->byte_32,
 					      V2_CQE_BYTE_32_GRH_S) ?
 					      IB_WC_GRH : 0);
@@ -2279,8 +2947,16 @@
 		wc->smac[5] = roce_get_field(cqe->byte_28,
 					     V2_CQE_BYTE_28_SMAC_5_M,
 					     V2_CQE_BYTE_28_SMAC_5_S);
-		wc->vlan_id = 0xffff;
-		wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC);
+		wc->wc_flags |= IB_WC_WITH_SMAC;
+		if (roce_get_bit(cqe->byte_28, V2_CQE_BYTE_28_VID_VLD_S)) {
+			wc->vlan_id = (u16)roce_get_field(cqe->byte_28,
+							  V2_CQE_BYTE_28_VID_M,
+							  V2_CQE_BYTE_28_VID_S);
+			wc->wc_flags |= IB_WC_WITH_VLAN;
+		} else {
+			wc->vlan_id = 0xffff;
+		}
+
 		wc->network_hdr_type = roce_get_field(cqe->byte_28,
 						    V2_CQE_BYTE_28_PORT_TYPE_M,
 						    V2_CQE_BYTE_28_PORT_TYPE_S);
@@ -2315,11 +2991,49 @@
 	return npolled;
 }
 
+static int get_op_for_set_hem(struct hns_roce_dev *hr_dev, u32 type,
+			      int step_idx)
+{
+	int op;
+
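+	/*
+	 * SCCC has only a base-level table, so any nonzero step is invalid;
+	 * supported types map to their WRITE_*_BT0 command plus step_idx.
+	 */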
+	if (type == HEM_TYPE_SCCC && step_idx)
+		return -EINVAL;
+
+	switch (type) {
+	case HEM_TYPE_QPC:
+		op = HNS_ROCE_CMD_WRITE_QPC_BT0;
+		break;
+	case HEM_TYPE_MTPT:
+		op = HNS_ROCE_CMD_WRITE_MPT_BT0;
+		break;
+	case HEM_TYPE_CQC:
+		op = HNS_ROCE_CMD_WRITE_CQC_BT0;
+		break;
+	case HEM_TYPE_SRQC:
+		op = HNS_ROCE_CMD_WRITE_SRQC_BT0;
+		break;
+	case HEM_TYPE_SCCC:
+		op = HNS_ROCE_CMD_WRITE_SCCC_BT0;
+		break;
+	case HEM_TYPE_QPC_TIMER:
+		op = HNS_ROCE_CMD_WRITE_QPC_TIMER_BT0;
+		break;
+	case HEM_TYPE_CQC_TIMER:
+		op = HNS_ROCE_CMD_WRITE_CQC_TIMER_BT0;
+		break;
+	default:
+		dev_warn(hr_dev->dev,
+			 "Table %d not to be written by mailbox!\n", type);
+		return -EINVAL;
+	}
+
+	return op + step_idx;
+}
+
 static int hns_roce_v2_set_hem(struct hns_roce_dev *hr_dev,
 			       struct hns_roce_hem_table *table, int obj,
 			       int step_idx)
 {
-	struct device *dev = hr_dev->dev;
 	struct hns_roce_cmd_mailbox *mailbox;
 	struct hns_roce_hem_iter iter;
 	struct hns_roce_hem_mhop mhop;
@@ -2332,7 +3046,7 @@
 	u64 bt_ba = 0;
 	u32 chunk_ba_num;
 	u32 hop_num;
-	u16 op = 0xff;
+	int op;
 
 	if (!hns_roce_check_whether_mhop(hr_dev, table->type))
 		return 0;
@@ -2354,30 +3068,17 @@
 		hem_idx = i;
 	}
 
-	switch (table->type) {
-	case HEM_TYPE_QPC:
-		op = HNS_ROCE_CMD_WRITE_QPC_BT0;
-		break;
-	case HEM_TYPE_MTPT:
-		op = HNS_ROCE_CMD_WRITE_MPT_BT0;
-		break;
-	case HEM_TYPE_CQC:
-		op = HNS_ROCE_CMD_WRITE_CQC_BT0;
-		break;
-	case HEM_TYPE_SRQC:
-		op = HNS_ROCE_CMD_WRITE_SRQC_BT0;
-		break;
-	default:
-		dev_warn(dev, "Table %d not to be written by mailbox!\n",
-			 table->type);
+	op = get_op_for_set_hem(hr_dev, table->type, step_idx);
+	if (op == -EINVAL)
 		return 0;
-	}
-	op += step_idx;
 
 	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
 	if (IS_ERR(mailbox))
 		return PTR_ERR(mailbox);
 
+	if (table->type == HEM_TYPE_SCCC)
+		obj = mhop.l0_idx;
+
 	if (check_whether_last_step(hop_num, step_idx)) {
 		hem = table->hem[hem_idx];
 		for (hns_roce_hem_first(hem, &iter);
@@ -2410,7 +3111,7 @@
 {
 	struct device *dev = hr_dev->dev;
 	struct hns_roce_cmd_mailbox *mailbox;
-	int ret = 0;
+	int ret;
 	u16 op = 0xff;
 
 	if (!hns_roce_check_whether_mhop(hr_dev, table->type))
@@ -2426,6 +3127,10 @@
 	case HEM_TYPE_CQC:
 		op = HNS_ROCE_CMD_DESTROY_CQC_BT0;
 		break;
+	case HEM_TYPE_SCCC:
+	case HEM_TYPE_QPC_TIMER:
+	case HEM_TYPE_CQC_TIMER:
+		break;
 	case HEM_TYPE_SRQC:
 		op = HNS_ROCE_CMD_DESTROY_SRQC_BT0;
 		break;
@@ -2434,6 +3139,12 @@
 			 table->type);
 		return 0;
 	}
+
+	if (table->type == HEM_TYPE_SCCC ||
+	    table->type == HEM_TYPE_QPC_TIMER ||
+	    table->type == HEM_TYPE_CQC_TIMER)
+		return 0;
+
 	op += step_idx;
 
 	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
@@ -2449,7 +3160,6 @@
 }
 
 static int hns_roce_v2_qp_modify(struct hns_roce_dev *hr_dev,
-				 struct hns_roce_mtt *mtt,
 				 enum ib_qp_state cur_state,
 				 enum ib_qp_state new_state,
 				 struct hns_roce_v2_qp_context *context,
@@ -2503,6 +3213,43 @@
 	roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, 0);
 }
 
+static void set_qpc_wqe_cnt(struct hns_roce_qp *hr_qp,
+			    struct hns_roce_v2_qp_context *context,
+			    struct hns_roce_v2_qp_context *qpc_mask)
+{
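+	/*
+	 * Program the SGE, SQ and RQ shifts (log2 of the entry counts) into
+	 * the QP context; XRC and SRQ-attached QPs have no RQ of their own.
+	 */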
+	if (hr_qp->ibqp.qp_type == IB_QPT_GSI)
+		roce_set_field(context->byte_4_sqpn_tst,
+			       V2_QPC_BYTE_4_SGE_SHIFT_M,
+			       V2_QPC_BYTE_4_SGE_SHIFT_S,
+			       ilog2((unsigned int)hr_qp->sge.sge_cnt));
+	else
+		roce_set_field(context->byte_4_sqpn_tst,
+			       V2_QPC_BYTE_4_SGE_SHIFT_M,
+			       V2_QPC_BYTE_4_SGE_SHIFT_S,
+			       hr_qp->sq.max_gs >
+			       HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE ?
+			       ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0);
+
+	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M,
+		       V2_QPC_BYTE_4_SGE_SHIFT_S, 0);
+
+	roce_set_field(context->byte_20_smac_sgid_idx,
+		       V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S,
+		       ilog2((unsigned int)hr_qp->sq.wqe_cnt));
+	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
+		       V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S, 0);
+
+	roce_set_field(context->byte_20_smac_sgid_idx,
+		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S,
+		       (hr_qp->ibqp.qp_type == IB_QPT_XRC_INI ||
+		       hr_qp->ibqp.qp_type == IB_QPT_XRC_TGT ||
+		       hr_qp->ibqp.srq) ? 0 :
+		       ilog2((unsigned int)hr_qp->rq.wqe_cnt));
+
+	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
+		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S, 0);
+}
+
 static void modify_qp_reset_to_init(struct ib_qp *ibqp,
 				    const struct ib_qp_attr *attr,
 				    int attr_mask,
@@ -2523,21 +3270,6 @@
 	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M,
 		       V2_QPC_BYTE_4_TST_S, 0);
 
-	if (ibqp->qp_type == IB_QPT_GSI)
-		roce_set_field(context->byte_4_sqpn_tst,
-			       V2_QPC_BYTE_4_SGE_SHIFT_M,
-			       V2_QPC_BYTE_4_SGE_SHIFT_S,
-			       ilog2((unsigned int)hr_qp->sge.sge_cnt));
-	else
-		roce_set_field(context->byte_4_sqpn_tst,
-			       V2_QPC_BYTE_4_SGE_SHIFT_M,
-			       V2_QPC_BYTE_4_SGE_SHIFT_S,
-			       hr_qp->sq.max_gs > 2 ?
-			       ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0);
-
-	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M,
-		       V2_QPC_BYTE_4_SGE_SHIFT_S, 0);
-
 	roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M,
 		       V2_QPC_BYTE_4_SQPN_S, hr_qp->qpn);
 	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M,
@@ -2553,17 +3285,7 @@
 	roce_set_field(qpc_mask->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_RQWS_M,
 		       V2_QPC_BYTE_20_RQWS_S, 0);
 
-	roce_set_field(context->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S,
-		       ilog2((unsigned int)hr_qp->sq.wqe_cnt));
-	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S, 0);
-
-	roce_set_field(context->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S,
-		       ilog2((unsigned int)hr_qp->rq.wqe_cnt));
-	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S, 0);
+	set_qpc_wqe_cnt(hr_qp, context, qpc_mask);
 
 	/* No VLAN need to set 0xFFF */
 	roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_ID_M,
@@ -2581,30 +3303,19 @@
 	roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_RQ_TX_ERR_S, 0);
 	roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_RQ_RX_ERR_S, 0);
 
-	roce_set_field(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_MAPID_M,
-		       V2_QPC_BYTE_60_MAPID_S, 0);
+	roce_set_field(qpc_mask->byte_60_qpst_tempid, V2_QPC_BYTE_60_TEMPID_M,
+		       V2_QPC_BYTE_60_TEMPID_S, 0);
 
-	roce_set_bit(qpc_mask->byte_60_qpst_mapid,
-		     V2_QPC_BYTE_60_INNER_MAP_IND_S, 0);
-	roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_SQ_MAP_IND_S,
-		     0);
-	roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_RQ_MAP_IND_S,
-		     0);
-	roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_EXT_MAP_IND_S,
-		     0);
-	roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_SQ_RLS_IND_S,
-		     0);
-	roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_SQ_EXT_IND_S,
-		     0);
+	roce_set_field(qpc_mask->byte_60_qpst_tempid,
+		       V2_QPC_BYTE_60_SCC_TOKEN_M, V2_QPC_BYTE_60_SCC_TOKEN_S,
+		       0);
+	roce_set_bit(qpc_mask->byte_60_qpst_tempid,
+		     V2_QPC_BYTE_60_SQ_DB_DOING_S, 0);
+	roce_set_bit(qpc_mask->byte_60_qpst_tempid,
+		     V2_QPC_BYTE_60_RQ_DB_DOING_S, 0);
 	roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CNP_TX_FLAG_S, 0);
 	roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CE_FLAG_S, 0);
 
-	if (attr_mask & IB_QP_QKEY) {
-		context->qkey_xrcd = attr->qkey;
-		qpc_mask->qkey_xrcd = 0;
-		hr_qp->qkey = attr->qkey;
-	}
-
 	if (hr_qp->rdb_en) {
 		roce_set_bit(context->byte_68_rq_db,
 			     V2_QPC_BYTE_68_RQ_RECORD_EN_S, 1);
@@ -2619,7 +3330,7 @@
 	roce_set_field(qpc_mask->byte_68_rq_db,
 		       V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_M,
 		       V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_S, 0);
-	context->rq_db_record_addr = hr_qp->rdb.dma >> 32;
+	context->rq_db_record_addr = cpu_to_le32(hr_qp->rdb.dma >> 32);
 	qpc_mask->rq_db_record_addr = 0;
 
 	roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S,
@@ -2677,7 +3388,8 @@
 	roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_TAIL_MAX_M,
 		       V2_QPC_BYTE_132_TRRL_TAIL_MAX_S, 0);
 
-	roce_set_bit(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RSVD_RAQ_MAP_S, 0);
+	roce_set_bit(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RQ_RTY_WAIT_DO_S,
+		     0);
 	roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RAQ_TRRL_HEAD_M,
 		       V2_QPC_BYTE_140_RAQ_TRRL_HEAD_S, 0);
 	roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RAQ_TRRL_TAIL_M,
@@ -2686,8 +3398,6 @@
 	roce_set_field(qpc_mask->byte_144_raq,
 		       V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_M,
 		       V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_S, 0);
-	roce_set_bit(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RAQ_RTY_INI_IND_S,
-		     0);
 	roce_set_field(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RAQ_CREDIT_M,
 		       V2_QPC_BYTE_144_RAQ_CREDIT_S, 0);
 	roce_set_bit(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RESP_RTY_FLG_S, 0);
@@ -2713,14 +3423,12 @@
 		       V2_QPC_BYTE_160_SQ_CONSUMER_IDX_M,
 		       V2_QPC_BYTE_160_SQ_CONSUMER_IDX_S, 0);
 
-	roce_set_field(context->byte_168_irrl_idx,
-		       V2_QPC_BYTE_168_SQ_SHIFT_BAK_M,
-		       V2_QPC_BYTE_168_SQ_SHIFT_BAK_S,
-		       ilog2((unsigned int)hr_qp->sq.wqe_cnt));
-	roce_set_field(qpc_mask->byte_168_irrl_idx,
-		       V2_QPC_BYTE_168_SQ_SHIFT_BAK_M,
-		       V2_QPC_BYTE_168_SQ_SHIFT_BAK_S, 0);
-
+	roce_set_bit(qpc_mask->byte_168_irrl_idx,
+		     V2_QPC_BYTE_168_POLL_DB_WAIT_DO_S, 0);
+	roce_set_bit(qpc_mask->byte_168_irrl_idx,
+		     V2_QPC_BYTE_168_SCC_TOKEN_FORBID_SQ_DEQ_S, 0);
+	roce_set_bit(qpc_mask->byte_168_irrl_idx,
+		     V2_QPC_BYTE_168_WAIT_ACK_TIMEOUT_S, 0);
 	roce_set_bit(qpc_mask->byte_168_irrl_idx,
 		     V2_QPC_BYTE_168_MSG_RTY_LP_FLG_S, 0);
 	roce_set_bit(qpc_mask->byte_168_irrl_idx,
@@ -2738,6 +3446,9 @@
 	roce_set_bit(qpc_mask->byte_172_sq_psn, V2_QPC_BYTE_172_MSG_RNR_FLG_S,
 		     0);
 
+	roce_set_bit(context->byte_172_sq_psn, V2_QPC_BYTE_172_FRE_S, 1);
+	roce_set_bit(qpc_mask->byte_172_sq_psn, V2_QPC_BYTE_172_FRE_S, 0);
+
 	roce_set_field(qpc_mask->byte_176_msg_pktn,
 		       V2_QPC_BYTE_176_MSG_USE_PKTN_M,
 		       V2_QPC_BYTE_176_MSG_USE_PKTN_S, 0);
@@ -2782,6 +3493,13 @@
 		       V2_QPC_BYTE_232_IRRL_SGE_IDX_M,
 		       V2_QPC_BYTE_232_IRRL_SGE_IDX_S, 0);
 
+	roce_set_bit(qpc_mask->byte_232_irrl_sge, V2_QPC_BYTE_232_SO_LP_VLD_S,
+		     0);
+	roce_set_bit(qpc_mask->byte_232_irrl_sge,
+		     V2_QPC_BYTE_232_FENCE_LP_VLD_S, 0);
+	roce_set_bit(qpc_mask->byte_232_irrl_sge, V2_QPC_BYTE_232_IRRL_LP_VLD_S,
+		     0);
+
 	qpc_mask->irrl_cur_sge_offset = 0;
 
 	roce_set_field(qpc_mask->byte_240_irrl_tail,
@@ -2809,7 +3527,6 @@
 		     0);
 
 	hr_qp->access_flags = attr->qp_access_flags;
-	hr_qp->pkey_index = attr->pkey_index;
 	roce_set_field(context->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M,
 		       V2_QPC_BYTE_252_TX_CQN_S, to_hr_cq(ibqp->send_cq)->cqn);
 	roce_set_field(qpc_mask->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M,
@@ -2844,20 +3561,6 @@
 	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M,
 		       V2_QPC_BYTE_4_TST_S, 0);
 
-	if (ibqp->qp_type == IB_QPT_GSI)
-		roce_set_field(context->byte_4_sqpn_tst,
-			       V2_QPC_BYTE_4_SGE_SHIFT_M,
-			       V2_QPC_BYTE_4_SGE_SHIFT_S,
-			       ilog2((unsigned int)hr_qp->sge.sge_cnt));
-	else
-		roce_set_field(context->byte_4_sqpn_tst,
-			       V2_QPC_BYTE_4_SGE_SHIFT_M,
-			       V2_QPC_BYTE_4_SGE_SHIFT_S, hr_qp->sq.max_gs > 2 ?
-			       ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0);
-
-	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M,
-		       V2_QPC_BYTE_4_SGE_SHIFT_S, 0);
-
 	if (attr_mask & IB_QP_ACCESS_FLAGS) {
 		roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S,
 			     !!(attr->qp_access_flags & IB_ACCESS_REMOTE_READ));
@@ -2892,18 +3595,6 @@
 			     0);
 	}
 
-	roce_set_field(context->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S,
-		       ilog2((unsigned int)hr_qp->sq.wqe_cnt));
-	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S, 0);
-
-	roce_set_field(context->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S,
-		       ilog2((unsigned int)hr_qp->rq.wqe_cnt));
-	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S, 0);
-
 	roce_set_field(context->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_PD_M,
 		       V2_QPC_BYTE_16_PD_S, to_hr_pd(ibqp->pd)->pdn);
 	roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_PD_M,
@@ -2931,11 +3622,6 @@
 			       V2_QPC_BYTE_76_SRQN_M, V2_QPC_BYTE_76_SRQN_S, 0);
 	}
 
-	if (attr_mask & IB_QP_QKEY) {
-		context->qkey_xrcd = attr->qkey;
-		qpc_mask->qkey_xrcd = 0;
-	}
-
 	roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M,
 		       V2_QPC_BYTE_4_SQPN_S, hr_qp->qpn);
 	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M,
@@ -2947,13 +3633,31 @@
 		roce_set_field(qpc_mask->byte_56_dqpn_err,
 			       V2_QPC_BYTE_56_DQPN_M, V2_QPC_BYTE_56_DQPN_S, 0);
 	}
-	roce_set_field(context->byte_168_irrl_idx,
-		       V2_QPC_BYTE_168_SQ_SHIFT_BAK_M,
-		       V2_QPC_BYTE_168_SQ_SHIFT_BAK_S,
-		       ilog2((unsigned int)hr_qp->sq.wqe_cnt));
-	roce_set_field(qpc_mask->byte_168_irrl_idx,
-		       V2_QPC_BYTE_168_SQ_SHIFT_BAK_M,
-		       V2_QPC_BYTE_168_SQ_SHIFT_BAK_S, 0);
+}
+
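+/*
+ * Check that enough RQ buffer addresses were found in the MTR to program the
+ * QP context; QPs without an RQ trivially pass.
+ */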
+static bool check_wqe_rq_mtt_count(struct hns_roce_dev *hr_dev,
+				   struct hns_roce_qp *hr_qp, int mtt_cnt,
+				   u32 page_size)
+{
+	struct device *dev = hr_dev->dev;
+
+	if (hr_qp->rq.wqe_cnt < 1)
+		return true;
+
+	if (mtt_cnt < 1) {
+		dev_err(dev, "qp(0x%lx) rqwqe buf ba find failed\n",
+			hr_qp->qpn);
+		return false;
+	}
+
+	if (mtt_cnt < MTT_MIN_COUNT &&
+		(hr_qp->rq.offset + page_size) < hr_qp->buff_size) {
+		dev_err(dev, "qp(0x%lx) next rqwqe buf ba find failed\n",
+			hr_qp->qpn);
+		return false;
+	}
+
+	return true;
 }
 
 static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
@@ -2965,25 +3669,27 @@
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
 	struct device *dev = hr_dev->dev;
+	u64 mtts[MTT_MIN_COUNT] = { 0 };
 	dma_addr_t dma_handle_3;
 	dma_addr_t dma_handle_2;
-	dma_addr_t dma_handle;
+	u64 wqe_sge_ba;
 	u32 page_size;
 	u8 port_num;
 	u64 *mtts_3;
 	u64 *mtts_2;
-	u64 *mtts;
+	int count;
 	u8 *dmac;
 	u8 *smac;
 	int port;
 
 	/* Search qp buf's mtts */
-	mtts = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_table,
-				   hr_qp->mtt.first_seg, &dma_handle);
-	if (!mtts) {
-		dev_err(dev, "qp buf pa find failed\n");
-		return -EINVAL;
-	}
+	page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
+	count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr,
+				  hr_qp->rq.offset / page_size, mtts,
+				  MTT_MIN_COUNT, &wqe_sge_ba);
+	if (!ibqp->srq)
+		if (!check_wqe_rq_mtt_count(hr_dev, hr_qp, count, page_size))
+			return -EINVAL;
 
 	/* Search IRRL's mtts */
 	mtts_2 = hns_roce_table_find(hr_dev, &hr_dev->qp_table.irrl_table,
@@ -3007,7 +3713,7 @@
 	}
 
 	dmac = (u8 *)attr->ah_attr.roce.dmac;
-	context->wqe_sge_ba = (u32)(dma_handle >> 3);
+	context->wqe_sge_ba = cpu_to_le32(wqe_sge_ba >> 3);
 	qpc_mask->wqe_sge_ba = 0;
 
 	/*
@@ -3017,22 +3723,23 @@
 	 * 0 at the same time, else set them to 0x1.
 	 */
 	roce_set_field(context->byte_12_sq_hop, V2_QPC_BYTE_12_WQE_SGE_BA_M,
-		       V2_QPC_BYTE_12_WQE_SGE_BA_S, dma_handle >> (32 + 3));
+		       V2_QPC_BYTE_12_WQE_SGE_BA_S, wqe_sge_ba >> (32 + 3));
 	roce_set_field(qpc_mask->byte_12_sq_hop, V2_QPC_BYTE_12_WQE_SGE_BA_M,
 		       V2_QPC_BYTE_12_WQE_SGE_BA_S, 0);
 
 	roce_set_field(context->byte_12_sq_hop, V2_QPC_BYTE_12_SQ_HOP_NUM_M,
 		       V2_QPC_BYTE_12_SQ_HOP_NUM_S,
-		       hr_dev->caps.mtt_hop_num == HNS_ROCE_HOP_NUM_0 ?
-		       0 : hr_dev->caps.mtt_hop_num);
+		       hr_dev->caps.wqe_sq_hop_num == HNS_ROCE_HOP_NUM_0 ?
+		       0 : hr_dev->caps.wqe_sq_hop_num);
 	roce_set_field(qpc_mask->byte_12_sq_hop, V2_QPC_BYTE_12_SQ_HOP_NUM_M,
 		       V2_QPC_BYTE_12_SQ_HOP_NUM_S, 0);
 
 	roce_set_field(context->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_SGE_HOP_NUM_M,
 		       V2_QPC_BYTE_20_SGE_HOP_NUM_S,
-		       ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ?
-		       hr_dev->caps.mtt_hop_num : 0);
+		       ((ibqp->qp_type == IB_QPT_GSI) ||
+		       hr_qp->sq.max_gs > HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ?
+		       hr_dev->caps.wqe_sge_hop_num : 0);
 	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_SGE_HOP_NUM_M,
 		       V2_QPC_BYTE_20_SGE_HOP_NUM_S, 0);
@@ -3040,8 +3747,8 @@
 	roce_set_field(context->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_RQ_HOP_NUM_M,
 		       V2_QPC_BYTE_20_RQ_HOP_NUM_S,
-		       hr_dev->caps.mtt_hop_num == HNS_ROCE_HOP_NUM_0 ?
-		       0 : hr_dev->caps.mtt_hop_num);
+		       hr_dev->caps.wqe_rq_hop_num == HNS_ROCE_HOP_NUM_0 ?
+		       0 : hr_dev->caps.wqe_rq_hop_num);
 	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_RQ_HOP_NUM_M,
 		       V2_QPC_BYTE_20_RQ_HOP_NUM_S, 0);
@@ -3049,7 +3756,7 @@
 	roce_set_field(context->byte_16_buf_ba_pg_sz,
 		       V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_M,
 		       V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_S,
-		       hr_dev->caps.mtt_ba_pg_sz + PG_SHIFT_OFFSET);
+		       hr_qp->wqe_bt_pg_shift + PG_SHIFT_OFFSET);
 	roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz,
 		       V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_M,
 		       V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_S, 0);
@@ -3062,52 +3769,33 @@
 		       V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_M,
 		       V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_S, 0);
 
-	roce_set_field(context->byte_80_rnr_rx_cqn,
-		       V2_QPC_BYTE_80_MIN_RNR_TIME_M,
-		       V2_QPC_BYTE_80_MIN_RNR_TIME_S, attr->min_rnr_timer);
-	roce_set_field(qpc_mask->byte_80_rnr_rx_cqn,
-		       V2_QPC_BYTE_80_MIN_RNR_TIME_M,
-		       V2_QPC_BYTE_80_MIN_RNR_TIME_S, 0);
-
-	page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
-	context->rq_cur_blk_addr = (u32)(mtts[hr_qp->rq.offset / page_size]
-				    >> PAGE_ADDR_SHIFT);
+	context->rq_cur_blk_addr = cpu_to_le32(mtts[0] >> PAGE_ADDR_SHIFT);
 	qpc_mask->rq_cur_blk_addr = 0;
 
 	roce_set_field(context->byte_92_srq_info,
 		       V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_M,
 		       V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_S,
-		       mtts[hr_qp->rq.offset / page_size]
-		       >> (32 + PAGE_ADDR_SHIFT));
+		       mtts[0] >> (32 + PAGE_ADDR_SHIFT));
 	roce_set_field(qpc_mask->byte_92_srq_info,
 		       V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_M,
 		       V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_S, 0);
 
-	context->rq_nxt_blk_addr = (u32)(mtts[hr_qp->rq.offset / page_size + 1]
-				    >> PAGE_ADDR_SHIFT);
+	context->rq_nxt_blk_addr = cpu_to_le32(mtts[1] >> PAGE_ADDR_SHIFT);
 	qpc_mask->rq_nxt_blk_addr = 0;
 
 	roce_set_field(context->byte_104_rq_sge,
 		       V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_M,
 		       V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_S,
-		       mtts[hr_qp->rq.offset / page_size + 1]
-		       >> (32 + PAGE_ADDR_SHIFT));
+		       mtts[1] >> (32 + PAGE_ADDR_SHIFT));
 	roce_set_field(qpc_mask->byte_104_rq_sge,
 		       V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_M,
 		       V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_S, 0);
 
-	roce_set_field(context->byte_108_rx_reqepsn,
-		       V2_QPC_BYTE_108_RX_REQ_EPSN_M,
-		       V2_QPC_BYTE_108_RX_REQ_EPSN_S, attr->rq_psn);
-	roce_set_field(qpc_mask->byte_108_rx_reqepsn,
-		       V2_QPC_BYTE_108_RX_REQ_EPSN_M,
-		       V2_QPC_BYTE_108_RX_REQ_EPSN_S, 0);
-
 	roce_set_field(context->byte_132_trrl, V2_QPC_BYTE_132_TRRL_BA_M,
 		       V2_QPC_BYTE_132_TRRL_BA_S, dma_handle_3 >> 4);
 	roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_BA_M,
 		       V2_QPC_BYTE_132_TRRL_BA_S, 0);
-	context->trrl_ba = (u32)(dma_handle_3 >> (16 + 4));
+	context->trrl_ba = cpu_to_le32(dma_handle_3 >> (16 + 4));
 	qpc_mask->trrl_ba = 0;
 	roce_set_field(context->byte_140_raq, V2_QPC_BYTE_140_TRRL_BA_M,
 		       V2_QPC_BYTE_140_TRRL_BA_S,
@@ -3115,7 +3803,7 @@
 	roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_TRRL_BA_M,
 		       V2_QPC_BYTE_140_TRRL_BA_S, 0);
 
-	context->irrl_ba = (u32)(dma_handle_2 >> 6);
+	context->irrl_ba = cpu_to_le32(dma_handle_2 >> 6);
 	qpc_mask->irrl_ba = 0;
 	roce_set_field(context->byte_208_irrl, V2_QPC_BYTE_208_IRRL_BA_M,
 		       V2_QPC_BYTE_208_IRRL_BA_S,
@@ -3141,15 +3829,6 @@
 		roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_LBI_S, 0);
 	}
 
-	if ((attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) &&
-	     attr->max_dest_rd_atomic) {
-		roce_set_field(context->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M,
-			       V2_QPC_BYTE_140_RR_MAX_S,
-			       fls(attr->max_dest_rd_atomic - 1));
-		roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M,
-			       V2_QPC_BYTE_140_RR_MAX_S, 0);
-	}
-
 	if (attr_mask & IB_QP_DEST_QPN) {
 		roce_set_field(context->byte_56_dqpn_err, V2_QPC_BYTE_56_DQPN_M,
 			       V2_QPC_BYTE_56_DQPN_S, attr->dest_qp_num);
@@ -3167,13 +3846,14 @@
 	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_SGID_IDX_M,
 		       V2_QPC_BYTE_20_SGID_IDX_S, 0);
-	memcpy(&(context->dmac), dmac, 4);
+	memcpy(&(context->dmac), dmac, sizeof(u32));
 	roce_set_field(context->byte_52_udpspn_dmac, V2_QPC_BYTE_52_DMAC_M,
 		       V2_QPC_BYTE_52_DMAC_S, *((u16 *)(&dmac[4])));
 	qpc_mask->dmac = 0;
 	roce_set_field(qpc_mask->byte_52_udpspn_dmac, V2_QPC_BYTE_52_DMAC_M,
 		       V2_QPC_BYTE_52_DMAC_S, 0);
 
+	/* mtu * (2^LP_PKTN_INI) must not exceed the 64KB max message size */
 	roce_set_field(context->byte_56_dqpn_err, V2_QPC_BYTE_56_LP_PKTN_INI_M,
 		       V2_QPC_BYTE_56_LP_PKTN_INI_S, 4);
 	roce_set_field(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_LP_PKTN_INI_M,
@@ -3210,16 +3890,12 @@
 	context->rq_rnr_timer = 0;
 	qpc_mask->rq_rnr_timer = 0;
 
-	roce_set_field(context->byte_152_raq, V2_QPC_BYTE_152_RAQ_PSN_M,
-		       V2_QPC_BYTE_152_RAQ_PSN_S, attr->rq_psn - 1);
-	roce_set_field(qpc_mask->byte_152_raq, V2_QPC_BYTE_152_RAQ_PSN_M,
-		       V2_QPC_BYTE_152_RAQ_PSN_S, 0);
-
 	roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_HEAD_MAX_M,
 		       V2_QPC_BYTE_132_TRRL_HEAD_MAX_S, 0);
 	roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_TAIL_MAX_M,
 		       V2_QPC_BYTE_132_TRRL_TAIL_MAX_S, 0);
 
+	/* the ROCEE engine sends 2^lp_sgen_ini segments at a time */
 	roce_set_field(context->byte_168_irrl_idx,
 		       V2_QPC_BYTE_168_LP_SGEN_INI_M,
 		       V2_QPC_BYTE_168_LP_SGEN_INI_S, 3);
@@ -3238,18 +3914,30 @@
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
 	struct device *dev = hr_dev->dev;
-	dma_addr_t dma_handle;
+	u64 sge_cur_blk = 0;
+	u64 sq_cur_blk = 0;
 	u32 page_size;
-	u64 *mtts;
+	int count;
 
 	/* Search qp buf's mtts */
-	mtts = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_table,
-				   hr_qp->mtt.first_seg, &dma_handle);
-	if (!mtts) {
-		dev_err(dev, "qp buf pa find failed\n");
+	count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, 0, &sq_cur_blk, 1, NULL);
+	if (count < 1) {
+		dev_err(dev, "qp(0x%lx) buf pa find failed\n", hr_qp->qpn);
 		return -EINVAL;
 	}
 
+	if (hr_qp->sge.offset) {
+		page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
+		count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr,
+					  hr_qp->sge.offset / page_size,
+					  &sge_cur_blk, 1, NULL);
+		if (count < 1) {
+			dev_err(dev, "qp(0x%lx) sge pa find failed\n",
+				hr_qp->qpn);
+			return -EINVAL;
+		}
+	}
+
 	/* Not support alternate path and path migration */
 	if ((attr_mask & IB_QP_ALT_PATH) ||
 	    (attr_mask & IB_QP_PATH_MIG_STATE)) {
@@ -3263,44 +3951,38 @@
 	 * we should set all bits of the relevant fields in context mask to
 	 * 0 at the same time, else set them to 0x1.
 	 */
-	roce_set_field(context->byte_60_qpst_mapid,
-		       V2_QPC_BYTE_60_RTY_NUM_INI_BAK_M,
-		       V2_QPC_BYTE_60_RTY_NUM_INI_BAK_S, attr->retry_cnt);
-	roce_set_field(qpc_mask->byte_60_qpst_mapid,
-		       V2_QPC_BYTE_60_RTY_NUM_INI_BAK_M,
-		       V2_QPC_BYTE_60_RTY_NUM_INI_BAK_S, 0);
-
-	context->sq_cur_blk_addr = (u32)(mtts[0] >> PAGE_ADDR_SHIFT);
+	context->sq_cur_blk_addr = cpu_to_le32(sq_cur_blk >> PAGE_ADDR_SHIFT);
 	roce_set_field(context->byte_168_irrl_idx,
 		       V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M,
 		       V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S,
-		       mtts[0] >> (32 + PAGE_ADDR_SHIFT));
+		       sq_cur_blk >> (32 + PAGE_ADDR_SHIFT));
 	qpc_mask->sq_cur_blk_addr = 0;
 	roce_set_field(qpc_mask->byte_168_irrl_idx,
 		       V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M,
 		       V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, 0);
 
-	page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
-	context->sq_cur_sge_blk_addr =
-		       ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ?
-				      ((u32)(mtts[hr_qp->sge.offset / page_size]
-				      >> PAGE_ADDR_SHIFT)) : 0;
+	context->sq_cur_sge_blk_addr =
+		((ibqp->qp_type == IB_QPT_GSI) ||
+		 hr_qp->sq.max_gs > HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ?
+		cpu_to_le32(sge_cur_blk >> PAGE_ADDR_SHIFT) : 0;
 	roce_set_field(context->byte_184_irrl_idx,
 		       V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M,
 		       V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S,
-		       ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ?
-		       (mtts[hr_qp->sge.offset / page_size] >>
+		       ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs >
+			HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ?
+		       (sge_cur_blk >> (32 + PAGE_ADDR_SHIFT)) : 0);
 	qpc_mask->sq_cur_sge_blk_addr = 0;
 	roce_set_field(qpc_mask->byte_184_irrl_idx,
 		       V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M,
 		       V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, 0);
 
-	context->rx_sq_cur_blk_addr = (u32)(mtts[0] >> PAGE_ADDR_SHIFT);
+	context->rx_sq_cur_blk_addr =
+		cpu_to_le32(sq_cur_blk >> PAGE_ADDR_SHIFT);
 	roce_set_field(context->byte_232_irrl_sge,
 		       V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M,
 		       V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_S,
-		       mtts[0] >> (32 + PAGE_ADDR_SHIFT));
+		       sq_cur_blk >> (32 + PAGE_ADDR_SHIFT));
 	qpc_mask->rx_sq_cur_blk_addr = 0;
 	roce_set_field(qpc_mask->byte_232_irrl_sge,
 		       V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M,
@@ -3319,13 +4001,6 @@
 		       V2_QPC_BYTE_240_RX_ACK_MSN_M,
 		       V2_QPC_BYTE_240_RX_ACK_MSN_S, 0);
 
-	roce_set_field(context->byte_244_rnr_rxack,
-		       V2_QPC_BYTE_244_RX_ACK_EPSN_M,
-		       V2_QPC_BYTE_244_RX_ACK_EPSN_S, attr->sq_psn);
-	roce_set_field(qpc_mask->byte_244_rnr_rxack,
-		       V2_QPC_BYTE_244_RX_ACK_EPSN_M,
-		       V2_QPC_BYTE_244_RX_ACK_EPSN_S, 0);
-
 	roce_set_field(qpc_mask->byte_248_ack_psn,
 		       V2_QPC_BYTE_248_ACK_LAST_OPTYPE_M,
 		       V2_QPC_BYTE_248_ACK_LAST_OPTYPE_S, 0);
@@ -3339,27 +4014,6 @@
 		       V2_QPC_BYTE_240_IRRL_TAIL_REAL_M,
 		       V2_QPC_BYTE_240_IRRL_TAIL_REAL_S, 0);
 
-	roce_set_field(context->byte_220_retry_psn_msn,
-		       V2_QPC_BYTE_220_RETRY_MSG_PSN_M,
-		       V2_QPC_BYTE_220_RETRY_MSG_PSN_S, attr->sq_psn);
-	roce_set_field(qpc_mask->byte_220_retry_psn_msn,
-		       V2_QPC_BYTE_220_RETRY_MSG_PSN_M,
-		       V2_QPC_BYTE_220_RETRY_MSG_PSN_S, 0);
-
-	roce_set_field(context->byte_224_retry_msg,
-		       V2_QPC_BYTE_224_RETRY_MSG_PSN_M,
-		       V2_QPC_BYTE_224_RETRY_MSG_PSN_S, attr->sq_psn >> 16);
-	roce_set_field(qpc_mask->byte_224_retry_msg,
-		       V2_QPC_BYTE_224_RETRY_MSG_PSN_M,
-		       V2_QPC_BYTE_224_RETRY_MSG_PSN_S, 0);
-
-	roce_set_field(context->byte_224_retry_msg,
-		       V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_M,
-		       V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_S, attr->sq_psn);
-	roce_set_field(qpc_mask->byte_224_retry_msg,
-		       V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_M,
-		       V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_S, 0);
-
 	roce_set_field(qpc_mask->byte_220_retry_psn_msn,
 		       V2_QPC_BYTE_220_RETRY_MSG_MSN_M,
 		       V2_QPC_BYTE_220_RETRY_MSG_MSN_S, 0);
@@ -3370,88 +4024,147 @@
 	roce_set_field(qpc_mask->byte_212_lsn, V2_QPC_BYTE_212_CHECK_FLG_M,
 		       V2_QPC_BYTE_212_CHECK_FLG_S, 0);
 
-	roce_set_field(context->byte_212_lsn, V2_QPC_BYTE_212_RETRY_CNT_M,
-		       V2_QPC_BYTE_212_RETRY_CNT_S, attr->retry_cnt);
-	roce_set_field(qpc_mask->byte_212_lsn, V2_QPC_BYTE_212_RETRY_CNT_M,
-		       V2_QPC_BYTE_212_RETRY_CNT_S, 0);
-
-	roce_set_field(context->byte_212_lsn, V2_QPC_BYTE_212_RETRY_NUM_INIT_M,
-		       V2_QPC_BYTE_212_RETRY_NUM_INIT_S, attr->retry_cnt);
-	roce_set_field(qpc_mask->byte_212_lsn, V2_QPC_BYTE_212_RETRY_NUM_INIT_M,
-		       V2_QPC_BYTE_212_RETRY_NUM_INIT_S, 0);
-
-	roce_set_field(context->byte_244_rnr_rxack,
-		       V2_QPC_BYTE_244_RNR_NUM_INIT_M,
-		       V2_QPC_BYTE_244_RNR_NUM_INIT_S, attr->rnr_retry);
-	roce_set_field(qpc_mask->byte_244_rnr_rxack,
-		       V2_QPC_BYTE_244_RNR_NUM_INIT_M,
-		       V2_QPC_BYTE_244_RNR_NUM_INIT_S, 0);
-
-	roce_set_field(context->byte_244_rnr_rxack, V2_QPC_BYTE_244_RNR_CNT_M,
-		       V2_QPC_BYTE_244_RNR_CNT_S, attr->rnr_retry);
-	roce_set_field(qpc_mask->byte_244_rnr_rxack, V2_QPC_BYTE_244_RNR_CNT_M,
-		       V2_QPC_BYTE_244_RNR_CNT_S, 0);
-
 	roce_set_field(context->byte_212_lsn, V2_QPC_BYTE_212_LSN_M,
 		       V2_QPC_BYTE_212_LSN_S, 0x100);
 	roce_set_field(qpc_mask->byte_212_lsn, V2_QPC_BYTE_212_LSN_M,
 		       V2_QPC_BYTE_212_LSN_S, 0);
 
-	if (attr_mask & IB_QP_TIMEOUT) {
-		roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_AT_M,
-			       V2_QPC_BYTE_28_AT_S, attr->timeout);
-		roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_AT_M,
-			      V2_QPC_BYTE_28_AT_S, 0);
-	}
-
-	roce_set_field(context->byte_172_sq_psn, V2_QPC_BYTE_172_SQ_CUR_PSN_M,
-		       V2_QPC_BYTE_172_SQ_CUR_PSN_S, attr->sq_psn);
-	roce_set_field(qpc_mask->byte_172_sq_psn, V2_QPC_BYTE_172_SQ_CUR_PSN_M,
-		       V2_QPC_BYTE_172_SQ_CUR_PSN_S, 0);
-
 	roce_set_field(qpc_mask->byte_196_sq_psn, V2_QPC_BYTE_196_IRRL_HEAD_M,
 		       V2_QPC_BYTE_196_IRRL_HEAD_S, 0);
-	roce_set_field(context->byte_196_sq_psn, V2_QPC_BYTE_196_SQ_MAX_PSN_M,
-		       V2_QPC_BYTE_196_SQ_MAX_PSN_S, attr->sq_psn);
-	roce_set_field(qpc_mask->byte_196_sq_psn, V2_QPC_BYTE_196_SQ_MAX_PSN_M,
-		       V2_QPC_BYTE_196_SQ_MAX_PSN_S, 0);
 
-	if ((attr_mask & IB_QP_MAX_QP_RD_ATOMIC) && attr->max_rd_atomic) {
-		roce_set_field(context->byte_208_irrl, V2_QPC_BYTE_208_SR_MAX_M,
-			       V2_QPC_BYTE_208_SR_MAX_S,
-			       fls(attr->max_rd_atomic - 1));
-		roce_set_field(qpc_mask->byte_208_irrl,
-			       V2_QPC_BYTE_208_SR_MAX_M,
-			       V2_QPC_BYTE_208_SR_MAX_S, 0);
-	}
 	return 0;
 }
 
-static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
-				 const struct ib_qp_attr *attr,
-				 int attr_mask, enum ib_qp_state cur_state,
-				 enum ib_qp_state new_state)
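+/*
+ * Transitions that need no extra context programming: leaving any state
+ * other than RESET for ERR or RESET, moving between RTS and SQD, and
+ * recovering from SQE back to RTS.
+ */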
+static inline bool hns_roce_v2_check_qp_stat(enum ib_qp_state cur_state,
+					     enum ib_qp_state new_state)
 {
+	if ((cur_state != IB_QPS_RESET &&
+	    (new_state == IB_QPS_ERR || new_state == IB_QPS_RESET)) ||
+	    ((cur_state == IB_QPS_RTS || cur_state == IB_QPS_SQD) &&
+	    (new_state == IB_QPS_RTS || new_state == IB_QPS_SQD)) ||
+	    (cur_state == IB_QPS_SQE && new_state == IB_QPS_RTS))
+		return true;
+
+	return false;
+}
+
+static int hns_roce_v2_set_path(struct ib_qp *ibqp,
+				const struct ib_qp_attr *attr,
+				int attr_mask,
+				struct hns_roce_v2_qp_context *context,
+				struct hns_roce_v2_qp_context *qpc_mask)
+{
+	const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr);
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
-	struct hns_roce_v2_qp_context *context;
-	struct hns_roce_v2_qp_context *qpc_mask;
-	struct device *dev = hr_dev->dev;
-	int ret = -EINVAL;
+	const struct ib_gid_attr *gid_attr = NULL;
+	int is_roce_protocol;
+	bool is_udp = false;
+	u16 vlan = 0xffff;
+	u8 ib_port;
+	u8 hr_port;
+	int ret;
 
-	context = kcalloc(2, sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return -ENOMEM;
+	ib_port = (attr_mask & IB_QP_PORT) ? attr->port_num : hr_qp->port + 1;
+	hr_port = ib_port - 1;
+	is_roce_protocol = rdma_cap_eth_ah(&hr_dev->ib_dev, ib_port) &&
+			   rdma_ah_get_ah_flags(&attr->ah_attr) & IB_AH_GRH;
 
-	qpc_mask = context + 1;
-	/*
-	 * In v2 engine, software pass context and context mask to hardware
-	 * when modifying qp. If software need modify some fields in context,
-	 * we should set all bits of the relevant fields in context mask to
-	 * 0 at the same time, else set them to 0x1.
-	 */
-	memset(qpc_mask, 0xff, sizeof(*qpc_mask));
+	if (is_roce_protocol) {
+		gid_attr = attr->ah_attr.grh.sgid_attr;
+		ret = rdma_read_gid_l2_fields(gid_attr, &vlan, NULL);
+		if (ret)
+			return ret;
+
+		if (gid_attr)
+			is_udp = (gid_attr->gid_type ==
+				 IB_GID_TYPE_ROCE_UDP_ENCAP);
+	}
+
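+	/* vlan stays 0xffff when the GID has no VLAN device behind it; a
+	 * value below VLAN_CFI_MASK is a real VLAN ID, so enable VLAN
+	 * handling on both the SQ and the RQ.
+	 */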
+	if (vlan < VLAN_CFI_MASK) {
+		roce_set_bit(context->byte_76_srqn_op_en,
+			     V2_QPC_BYTE_76_RQ_VLAN_EN_S, 1);
+		roce_set_bit(qpc_mask->byte_76_srqn_op_en,
+			     V2_QPC_BYTE_76_RQ_VLAN_EN_S, 0);
+		roce_set_bit(context->byte_168_irrl_idx,
+			     V2_QPC_BYTE_168_SQ_VLAN_EN_S, 1);
+		roce_set_bit(qpc_mask->byte_168_irrl_idx,
+			     V2_QPC_BYTE_168_SQ_VLAN_EN_S, 0);
+	}
+
+	roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_ID_M,
+		       V2_QPC_BYTE_24_VLAN_ID_S, vlan);
+	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_ID_M,
+		       V2_QPC_BYTE_24_VLAN_ID_S, 0);
+
+	if (grh->sgid_index >= hr_dev->caps.gid_table_len[hr_port]) {
+		dev_err(hr_dev->dev, "sgid_index(%u) too large. max is %d\n",
+			grh->sgid_index, hr_dev->caps.gid_table_len[hr_port]);
+		return -EINVAL;
+	}
+
+	if (attr->ah_attr.type != RDMA_AH_ATTR_TYPE_ROCE) {
+		dev_err(hr_dev->dev, "ah attr is not RDMA roce type\n");
+		return -EINVAL;
+	}
+
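+	/* 0x12b7 (4791) is the IANA-assigned RoCEv2 UDP port */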
+	roce_set_field(context->byte_52_udpspn_dmac, V2_QPC_BYTE_52_UDPSPN_M,
+		       V2_QPC_BYTE_52_UDPSPN_S,
+		       is_udp ? 0x12b7 : 0);
+
+	roce_set_field(qpc_mask->byte_52_udpspn_dmac, V2_QPC_BYTE_52_UDPSPN_M,
+		       V2_QPC_BYTE_52_UDPSPN_S, 0);
+
+	roce_set_field(context->byte_20_smac_sgid_idx,
+		       V2_QPC_BYTE_20_SGID_IDX_M, V2_QPC_BYTE_20_SGID_IDX_S,
+		       grh->sgid_index);
+
+	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
+		       V2_QPC_BYTE_20_SGID_IDX_M, V2_QPC_BYTE_20_SGID_IDX_S, 0);
+
+	roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_HOP_LIMIT_M,
+		       V2_QPC_BYTE_24_HOP_LIMIT_S, grh->hop_limit);
+	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_HOP_LIMIT_M,
+		       V2_QPC_BYTE_24_HOP_LIMIT_S, 0);
+
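+	/* On the 0x21 revision, RoCEv2 packets carry the DSCP value, i.e.
+	 * the upper six bits of the traffic class, hence the shift by 2.
+	 */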
+	if (hr_dev->pci_dev->revision == 0x21 && is_udp)
+		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
+			       V2_QPC_BYTE_24_TC_S, grh->traffic_class >> 2);
+	else
+		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
+			       V2_QPC_BYTE_24_TC_S, grh->traffic_class);
+	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
+		       V2_QPC_BYTE_24_TC_S, 0);
+	roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_FL_M,
+		       V2_QPC_BYTE_28_FL_S, grh->flow_label);
+	roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_FL_M,
+		       V2_QPC_BYTE_28_FL_S, 0);
+	memcpy(context->dgid, grh->dgid.raw, sizeof(grh->dgid.raw));
+	memset(qpc_mask->dgid, 0, sizeof(grh->dgid.raw));
+	roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_SL_M,
+		       V2_QPC_BYTE_28_SL_S, rdma_ah_get_sl(&attr->ah_attr));
+	roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_SL_M,
+		       V2_QPC_BYTE_28_SL_S, 0);
+	hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr);
+
+	return 0;
+}
+
+static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp,
+				      const struct ib_qp_attr *attr,
+				      int attr_mask,
+				      enum ib_qp_state cur_state,
+				      enum ib_qp_state new_state,
+				      struct hns_roce_v2_qp_context *context,
+				      struct hns_roce_v2_qp_context *qpc_mask)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+	int ret = 0;
+
 	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+		memset(qpc_mask, 0, sizeof(*qpc_mask));
 		modify_qp_reset_to_init(ibqp, attr, attr_mask, context,
 					qpc_mask);
 	} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) {
@@ -3467,151 +4180,193 @@
 					   qpc_mask);
 		if (ret)
 			goto out;
-	} else if ((cur_state == IB_QPS_RTS && new_state == IB_QPS_RTS) ||
-		   (cur_state == IB_QPS_SQE && new_state == IB_QPS_RTS) ||
-		   (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD) ||
-		   (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD) ||
-		   (cur_state == IB_QPS_SQD && new_state == IB_QPS_RTS) ||
-		   (cur_state == IB_QPS_INIT && new_state == IB_QPS_RESET) ||
-		   (cur_state == IB_QPS_RTR && new_state == IB_QPS_RESET) ||
-		   (cur_state == IB_QPS_RTS && new_state == IB_QPS_RESET) ||
-		   (cur_state == IB_QPS_ERR && new_state == IB_QPS_RESET) ||
-		   (cur_state == IB_QPS_INIT && new_state == IB_QPS_ERR) ||
-		   (cur_state == IB_QPS_RTR && new_state == IB_QPS_ERR) ||
-		   (cur_state == IB_QPS_RTS && new_state == IB_QPS_ERR) ||
-		   (cur_state == IB_QPS_SQD && new_state == IB_QPS_ERR) ||
-		   (cur_state == IB_QPS_SQE && new_state == IB_QPS_ERR) ||
-		   (cur_state == IB_QPS_ERR && new_state == IB_QPS_ERR)) {
+	} else if (hns_roce_v2_check_qp_stat(cur_state, new_state)) {
 		/* Nothing */
 		;
 	} else {
-		dev_err(dev, "Illegal state for QP!\n");
+		dev_err(hr_dev->dev, "Illegal state for QP!\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
-	/* When QP state is err, SQ and RQ WQE should be flushed */
-	if (new_state == IB_QPS_ERR) {
-		roce_set_field(context->byte_160_sq_ci_pi,
-			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
-			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S,
-			       hr_qp->sq.head);
-		roce_set_field(qpc_mask->byte_160_sq_ci_pi,
-			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
-			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S, 0);
-		roce_set_field(context->byte_84_rq_ci_pi,
-			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
-			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S,
-			       hr_qp->rq.head);
-		roce_set_field(qpc_mask->byte_84_rq_ci_pi,
-			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
-			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0);
-	}
+out:
+	return ret;
+}
+
+static int hns_roce_v2_set_opt_fields(struct ib_qp *ibqp,
+				      const struct ib_qp_attr *attr,
+				      int attr_mask,
+				      struct hns_roce_v2_qp_context *context,
+				      struct hns_roce_v2_qp_context *qpc_mask)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
+	int ret = 0;
 
 	if (attr_mask & IB_QP_AV) {
-		const struct ib_global_route *grh =
-					    rdma_ah_read_grh(&attr->ah_attr);
-		const struct ib_gid_attr *gid_attr = NULL;
-		u8 src_mac[ETH_ALEN];
-		int is_roce_protocol;
-		u16 vlan = 0xffff;
-		u8 ib_port;
-		u8 hr_port;
+		ret = hns_roce_v2_set_path(ibqp, attr, attr_mask, context,
+					   qpc_mask);
+		if (ret)
+			return ret;
+	}
 
-		ib_port = (attr_mask & IB_QP_PORT) ? attr->port_num :
-			   hr_qp->port + 1;
-		hr_port = ib_port - 1;
-		is_roce_protocol = rdma_cap_eth_ah(&hr_dev->ib_dev, ib_port) &&
-			       rdma_ah_get_ah_flags(&attr->ah_attr) & IB_AH_GRH;
-
-		if (is_roce_protocol) {
-			gid_attr = attr->ah_attr.grh.sgid_attr;
-			vlan = rdma_vlan_dev_vlan_id(gid_attr->ndev);
-			memcpy(src_mac, gid_attr->ndev->dev_addr, ETH_ALEN);
+	if (attr_mask & IB_QP_TIMEOUT) {
+		if (attr->timeout < 31) {
+			roce_set_field(context->byte_28_at_fl,
+				       V2_QPC_BYTE_28_AT_M, V2_QPC_BYTE_28_AT_S,
+				       attr->timeout);
+			roce_set_field(qpc_mask->byte_28_at_fl,
+				       V2_QPC_BYTE_28_AT_M, V2_QPC_BYTE_28_AT_S,
+				       0);
+		} else {
+			dev_warn(hr_dev->dev,
+				 "Local ACK timeout shall be 0 to 30.\n");
 		}
+	}
 
-		roce_set_field(context->byte_24_mtu_tc,
-			       V2_QPC_BYTE_24_VLAN_ID_M,
-			       V2_QPC_BYTE_24_VLAN_ID_S, vlan);
-		roce_set_field(qpc_mask->byte_24_mtu_tc,
-			       V2_QPC_BYTE_24_VLAN_ID_M,
-			       V2_QPC_BYTE_24_VLAN_ID_S, 0);
+	if (attr_mask & IB_QP_RETRY_CNT) {
+		roce_set_field(context->byte_212_lsn,
+			       V2_QPC_BYTE_212_RETRY_NUM_INIT_M,
+			       V2_QPC_BYTE_212_RETRY_NUM_INIT_S,
+			       attr->retry_cnt);
+		roce_set_field(qpc_mask->byte_212_lsn,
+			       V2_QPC_BYTE_212_RETRY_NUM_INIT_M,
+			       V2_QPC_BYTE_212_RETRY_NUM_INIT_S, 0);
 
-		if (grh->sgid_index >= hr_dev->caps.gid_table_len[hr_port]) {
-			dev_err(hr_dev->dev,
-				"sgid_index(%u) too large. max is %d\n",
-				grh->sgid_index,
-				hr_dev->caps.gid_table_len[hr_port]);
-			ret = -EINVAL;
-			goto out;
-		}
+		roce_set_field(context->byte_212_lsn,
+			       V2_QPC_BYTE_212_RETRY_CNT_M,
+			       V2_QPC_BYTE_212_RETRY_CNT_S,
+			       attr->retry_cnt);
+		roce_set_field(qpc_mask->byte_212_lsn,
+			       V2_QPC_BYTE_212_RETRY_CNT_M,
+			       V2_QPC_BYTE_212_RETRY_CNT_S, 0);
+	}
 
-		if (attr->ah_attr.type != RDMA_AH_ATTR_TYPE_ROCE) {
-			dev_err(hr_dev->dev, "ah attr is not RDMA roce type\n");
-			ret = -EINVAL;
-			goto out;
-		}
+	if (attr_mask & IB_QP_RNR_RETRY) {
+		roce_set_field(context->byte_244_rnr_rxack,
+			       V2_QPC_BYTE_244_RNR_NUM_INIT_M,
+			       V2_QPC_BYTE_244_RNR_NUM_INIT_S, attr->rnr_retry);
+		roce_set_field(qpc_mask->byte_244_rnr_rxack,
+			       V2_QPC_BYTE_244_RNR_NUM_INIT_M,
+			       V2_QPC_BYTE_244_RNR_NUM_INIT_S, 0);
 
-		roce_set_field(context->byte_52_udpspn_dmac,
-			   V2_QPC_BYTE_52_UDPSPN_M, V2_QPC_BYTE_52_UDPSPN_S,
-			   (gid_attr->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) ?
-			   0 : 0x12b7);
+		roce_set_field(context->byte_244_rnr_rxack,
+			       V2_QPC_BYTE_244_RNR_CNT_M,
+			       V2_QPC_BYTE_244_RNR_CNT_S, attr->rnr_retry);
+		roce_set_field(qpc_mask->byte_244_rnr_rxack,
+			       V2_QPC_BYTE_244_RNR_CNT_M,
+			       V2_QPC_BYTE_244_RNR_CNT_S, 0);
+	}
 
-		roce_set_field(qpc_mask->byte_52_udpspn_dmac,
-			       V2_QPC_BYTE_52_UDPSPN_M,
-			       V2_QPC_BYTE_52_UDPSPN_S, 0);
+	/* RC&UC&UD required attr */
+	if (attr_mask & IB_QP_SQ_PSN) {
+		roce_set_field(context->byte_172_sq_psn,
+			       V2_QPC_BYTE_172_SQ_CUR_PSN_M,
+			       V2_QPC_BYTE_172_SQ_CUR_PSN_S, attr->sq_psn);
+		roce_set_field(qpc_mask->byte_172_sq_psn,
+			       V2_QPC_BYTE_172_SQ_CUR_PSN_M,
+			       V2_QPC_BYTE_172_SQ_CUR_PSN_S, 0);
 
-		roce_set_field(context->byte_20_smac_sgid_idx,
-			       V2_QPC_BYTE_20_SGID_IDX_M,
-			       V2_QPC_BYTE_20_SGID_IDX_S, grh->sgid_index);
+		roce_set_field(context->byte_196_sq_psn,
+			       V2_QPC_BYTE_196_SQ_MAX_PSN_M,
+			       V2_QPC_BYTE_196_SQ_MAX_PSN_S, attr->sq_psn);
+		roce_set_field(qpc_mask->byte_196_sq_psn,
+			       V2_QPC_BYTE_196_SQ_MAX_PSN_M,
+			       V2_QPC_BYTE_196_SQ_MAX_PSN_S, 0);
 
-		roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
-			       V2_QPC_BYTE_20_SGID_IDX_M,
-			       V2_QPC_BYTE_20_SGID_IDX_S, 0);
+		roce_set_field(context->byte_220_retry_psn_msn,
+			       V2_QPC_BYTE_220_RETRY_MSG_PSN_M,
+			       V2_QPC_BYTE_220_RETRY_MSG_PSN_S, attr->sq_psn);
+		roce_set_field(qpc_mask->byte_220_retry_psn_msn,
+			       V2_QPC_BYTE_220_RETRY_MSG_PSN_M,
+			       V2_QPC_BYTE_220_RETRY_MSG_PSN_S, 0);
 
-		roce_set_field(context->byte_24_mtu_tc,
-			       V2_QPC_BYTE_24_HOP_LIMIT_M,
-			       V2_QPC_BYTE_24_HOP_LIMIT_S, grh->hop_limit);
-		roce_set_field(qpc_mask->byte_24_mtu_tc,
-			       V2_QPC_BYTE_24_HOP_LIMIT_M,
-			       V2_QPC_BYTE_24_HOP_LIMIT_S, 0);
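+		/* byte_224 takes the upper bits of the PSN; the shift by
+		 * V2_QPC_BYTE_220_RETRY_MSG_PSN_S matches the literal 16
+		 * used by the old code.
+		 */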
+		roce_set_field(context->byte_224_retry_msg,
+			       V2_QPC_BYTE_224_RETRY_MSG_PSN_M,
+			       V2_QPC_BYTE_224_RETRY_MSG_PSN_S,
+			       attr->sq_psn >> V2_QPC_BYTE_220_RETRY_MSG_PSN_S);
+		roce_set_field(qpc_mask->byte_224_retry_msg,
+			       V2_QPC_BYTE_224_RETRY_MSG_PSN_M,
+			       V2_QPC_BYTE_224_RETRY_MSG_PSN_S, 0);
 
-		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
-			       V2_QPC_BYTE_24_TC_S, grh->traffic_class);
-		roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
-			       V2_QPC_BYTE_24_TC_S, 0);
-		roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_FL_M,
-			       V2_QPC_BYTE_28_FL_S, grh->flow_label);
-		roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_FL_M,
-			       V2_QPC_BYTE_28_FL_S, 0);
-		memcpy(context->dgid, grh->dgid.raw, sizeof(grh->dgid.raw));
-		memset(qpc_mask->dgid, 0, sizeof(grh->dgid.raw));
-		roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_SL_M,
-			       V2_QPC_BYTE_28_SL_S,
-			       rdma_ah_get_sl(&attr->ah_attr));
-		roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_SL_M,
-			       V2_QPC_BYTE_28_SL_S, 0);
-		hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr);
+		roce_set_field(context->byte_224_retry_msg,
+			       V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_M,
+			       V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_S,
+			       attr->sq_psn);
+		roce_set_field(qpc_mask->byte_224_retry_msg,
+			       V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_M,
+			       V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_S, 0);
+
+		roce_set_field(context->byte_244_rnr_rxack,
+			       V2_QPC_BYTE_244_RX_ACK_EPSN_M,
+			       V2_QPC_BYTE_244_RX_ACK_EPSN_S, attr->sq_psn);
+		roce_set_field(qpc_mask->byte_244_rnr_rxack,
+			       V2_QPC_BYTE_244_RX_ACK_EPSN_M,
+			       V2_QPC_BYTE_244_RX_ACK_EPSN_S, 0);
+	}
+
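+	/* The rd_atomic depths are stored as power-of-two exponents, so
+	 * fls(n - 1) rounds n up to the exponent of the next power of two.
+	 */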
+	if ((attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) &&
+	     attr->max_dest_rd_atomic) {
+		roce_set_field(context->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M,
+			       V2_QPC_BYTE_140_RR_MAX_S,
+			       fls(attr->max_dest_rd_atomic - 1));
+		roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M,
+			       V2_QPC_BYTE_140_RR_MAX_S, 0);
+	}
+
+	if ((attr_mask & IB_QP_MAX_QP_RD_ATOMIC) && attr->max_rd_atomic) {
+		roce_set_field(context->byte_208_irrl, V2_QPC_BYTE_208_SR_MAX_M,
+			       V2_QPC_BYTE_208_SR_MAX_S,
+			       fls(attr->max_rd_atomic - 1));
+		roce_set_field(qpc_mask->byte_208_irrl,
+			       V2_QPC_BYTE_208_SR_MAX_M,
+			       V2_QPC_BYTE_208_SR_MAX_S, 0);
 	}
 
 	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC))
 		set_access_flags(hr_qp, context, qpc_mask, attr, attr_mask);
 
-	/* Every status migrate must change state */
-	roce_set_field(context->byte_60_qpst_mapid, V2_QPC_BYTE_60_QP_ST_M,
-		       V2_QPC_BYTE_60_QP_ST_S, new_state);
-	roce_set_field(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_QP_ST_M,
-		       V2_QPC_BYTE_60_QP_ST_S, 0);
-
-	/* SW pass context to HW */
-	ret = hns_roce_v2_qp_modify(hr_dev, &hr_qp->mtt, cur_state, new_state,
-				    context, hr_qp);
-	if (ret) {
-		dev_err(dev, "hns_roce_qp_modify failed(%d)\n", ret);
-		goto out;
+	if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+		roce_set_field(context->byte_80_rnr_rx_cqn,
+			       V2_QPC_BYTE_80_MIN_RNR_TIME_M,
+			       V2_QPC_BYTE_80_MIN_RNR_TIME_S,
+			       attr->min_rnr_timer);
+		roce_set_field(qpc_mask->byte_80_rnr_rx_cqn,
+			       V2_QPC_BYTE_80_MIN_RNR_TIME_M,
+			       V2_QPC_BYTE_80_MIN_RNR_TIME_S, 0);
 	}
 
-	hr_qp->state = new_state;
+	/* RC&UC required attr */
+	if (attr_mask & IB_QP_RQ_PSN) {
+		roce_set_field(context->byte_108_rx_reqepsn,
+			       V2_QPC_BYTE_108_RX_REQ_EPSN_M,
+			       V2_QPC_BYTE_108_RX_REQ_EPSN_S, attr->rq_psn);
+		roce_set_field(qpc_mask->byte_108_rx_reqepsn,
+			       V2_QPC_BYTE_108_RX_REQ_EPSN_M,
+			       V2_QPC_BYTE_108_RX_REQ_EPSN_S, 0);
+
+		roce_set_field(context->byte_152_raq, V2_QPC_BYTE_152_RAQ_PSN_M,
+			       V2_QPC_BYTE_152_RAQ_PSN_S, attr->rq_psn - 1);
+		roce_set_field(qpc_mask->byte_152_raq,
+			       V2_QPC_BYTE_152_RAQ_PSN_M,
+			       V2_QPC_BYTE_152_RAQ_PSN_S, 0);
+	}
+
+	if (attr_mask & IB_QP_QKEY) {
+		context->qkey_xrcd = cpu_to_le32(attr->qkey);
+		qpc_mask->qkey_xrcd = 0;
+		hr_qp->qkey = attr->qkey;
+	}
+
+	return ret;
+}
+
+static void hns_roce_v2_record_opt_fields(struct ib_qp *ibqp,
+					  const struct ib_qp_attr *attr,
+					  int attr_mask)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
 
 	if (attr_mask & IB_QP_ACCESS_FLAGS)
 		hr_qp->atomic_rd_en = attr->qp_access_flags;
@@ -3622,6 +4377,82 @@
 		hr_qp->port = attr->port_num - 1;
 		hr_qp->phy_port = hr_dev->iboe.phy_port[hr_qp->port];
 	}
+}
+
+static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
+				 const struct ib_qp_attr *attr,
+				 int attr_mask, enum ib_qp_state cur_state,
+				 enum ib_qp_state new_state)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
+	struct hns_roce_v2_qp_context ctx[2];
+	struct hns_roce_v2_qp_context *context = ctx;
+	struct hns_roce_v2_qp_context *qpc_mask = ctx + 1;
+	struct device *dev = hr_dev->dev;
+	int ret;
+
+	/*
+	 * In the v2 engine, software passes the context and context mask to
+	 * hardware when modifying a QP. If software needs to modify some
+	 * fields of the context, all bits of those fields in the context
+	 * mask should be cleared to 0 at the same time; otherwise they are
+	 * set to 0x1.
+	 */
+	memset(context, 0, sizeof(*context));
+	memset(qpc_mask, 0xff, sizeof(*qpc_mask));
+	ret = hns_roce_v2_set_abs_fields(ibqp, attr, attr_mask, cur_state,
+					 new_state, context, qpc_mask);
+	if (ret)
+		goto out;
+
+	/* When QP state is err, SQ and RQ WQE should be flushed */
+	if (new_state == IB_QPS_ERR) {
+		roce_set_field(context->byte_160_sq_ci_pi,
+			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
+			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S,
+			       hr_qp->sq.head);
+		roce_set_field(qpc_mask->byte_160_sq_ci_pi,
+			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
+			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S, 0);
+
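+		/* a QP attached to an SRQ has no RQ of its own, so the RQ
+		 * producer index is only flushed for non-SRQ QPs */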
+		if (!ibqp->srq) {
+			roce_set_field(context->byte_84_rq_ci_pi,
+			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
+			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S,
+			       hr_qp->rq.head);
+			roce_set_field(qpc_mask->byte_84_rq_ci_pi,
+			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
+			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0);
+		}
+	}
+
+	/* Configure the optional fields */
+	ret = hns_roce_v2_set_opt_fields(ibqp, attr, attr_mask, context,
+					 qpc_mask);
+	if (ret)
+		goto out;
+
+	roce_set_bit(context->byte_108_rx_reqepsn, V2_QPC_BYTE_108_INV_CREDIT_S,
+		     ibqp->srq ? 1 : 0);
+	roce_set_bit(qpc_mask->byte_108_rx_reqepsn,
+		     V2_QPC_BYTE_108_INV_CREDIT_S, 0);
+
+	/* Every status migrate must change state */
+	roce_set_field(context->byte_60_qpst_tempid, V2_QPC_BYTE_60_QP_ST_M,
+		       V2_QPC_BYTE_60_QP_ST_S, new_state);
+	roce_set_field(qpc_mask->byte_60_qpst_tempid, V2_QPC_BYTE_60_QP_ST_M,
+		       V2_QPC_BYTE_60_QP_ST_S, 0);
+
+	/* SW pass context to HW */
+	ret = hns_roce_v2_qp_modify(hr_dev, cur_state, new_state, ctx, hr_qp);
+	if (ret) {
+		dev_err(dev, "hns_roce_qp_modify failed(%d)\n", ret);
+		goto out;
+	}
+
+	hr_qp->state = new_state;
+
+	hns_roce_v2_record_opt_fields(ibqp, attr, attr_mask);
 
 	if (new_state == IB_QPS_RESET && !ibqp->uobject) {
 		hns_roce_v2_cq_clean(to_hr_cq(ibqp->recv_cq), hr_qp->qpn,
@@ -3641,7 +4472,6 @@
 	}
 
 out:
-	kfree(context);
 	return ret;
 }
 
@@ -3692,16 +4522,12 @@
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
-	struct hns_roce_v2_qp_context *context;
+	struct hns_roce_v2_qp_context context = {};
 	struct device *dev = hr_dev->dev;
 	int tmp_qp_state;
 	int state;
 	int ret;
 
-	context = kzalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return -ENOMEM;
-
 	memset(qp_attr, 0, sizeof(*qp_attr));
 	memset(qp_init_attr, 0, sizeof(*qp_init_attr));
 
@@ -3713,14 +4539,14 @@
 		goto done;
 	}
 
-	ret = hns_roce_v2_query_qpc(hr_dev, hr_qp, context);
+	ret = hns_roce_v2_query_qpc(hr_dev, hr_qp, &context);
 	if (ret) {
 		dev_err(dev, "query qpc error\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
-	state = roce_get_field(context->byte_60_qpst_mapid,
+	state = roce_get_field(context.byte_60_qpst_tempid,
 			       V2_QPC_BYTE_60_QP_ST_M, V2_QPC_BYTE_60_QP_ST_S);
 	tmp_qp_state = to_ib_qp_st((enum hns_roce_v2_qp_state)state);
 	if (tmp_qp_state == -1) {
@@ -3730,7 +4556,7 @@
 	}
 	hr_qp->state = (u8)tmp_qp_state;
 	qp_attr->qp_state = (enum ib_qp_state)hr_qp->state;
-	qp_attr->path_mtu = (enum ib_mtu)roce_get_field(context->byte_24_mtu_tc,
+	qp_attr->path_mtu = (enum ib_mtu)roce_get_field(context.byte_24_mtu_tc,
 							V2_QPC_BYTE_24_MTU_M,
 							V2_QPC_BYTE_24_MTU_S);
 	qp_attr->path_mig_state = IB_MIG_ARMED;
@@ -3738,64 +4564,65 @@
 	if (hr_qp->ibqp.qp_type == IB_QPT_UD)
 		qp_attr->qkey = V2_QKEY_VAL;
 
-	qp_attr->rq_psn = roce_get_field(context->byte_108_rx_reqepsn,
+	qp_attr->rq_psn = roce_get_field(context.byte_108_rx_reqepsn,
 					 V2_QPC_BYTE_108_RX_REQ_EPSN_M,
 					 V2_QPC_BYTE_108_RX_REQ_EPSN_S);
-	qp_attr->sq_psn = (u32)roce_get_field(context->byte_172_sq_psn,
+	qp_attr->sq_psn = (u32)roce_get_field(context.byte_172_sq_psn,
 					      V2_QPC_BYTE_172_SQ_CUR_PSN_M,
 					      V2_QPC_BYTE_172_SQ_CUR_PSN_S);
-	qp_attr->dest_qp_num = (u8)roce_get_field(context->byte_56_dqpn_err,
+	qp_attr->dest_qp_num = (u8)roce_get_field(context.byte_56_dqpn_err,
 						  V2_QPC_BYTE_56_DQPN_M,
 						  V2_QPC_BYTE_56_DQPN_S);
-	qp_attr->qp_access_flags = ((roce_get_bit(context->byte_76_srqn_op_en,
-						  V2_QPC_BYTE_76_RRE_S)) << 2) |
-				   ((roce_get_bit(context->byte_76_srqn_op_en,
-						  V2_QPC_BYTE_76_RWE_S)) << 1) |
-				   ((roce_get_bit(context->byte_76_srqn_op_en,
-						  V2_QPC_BYTE_76_ATE_S)) << 3);
+	qp_attr->qp_access_flags = ((roce_get_bit(context.byte_76_srqn_op_en,
+				    V2_QPC_BYTE_76_RRE_S)) << V2_QP_RRE_S) |
+				    ((roce_get_bit(context.byte_76_srqn_op_en,
+				    V2_QPC_BYTE_76_RWE_S)) << V2_QP_RWE_S) |
+				    ((roce_get_bit(context.byte_76_srqn_op_en,
+				    V2_QPC_BYTE_76_ATE_S)) << V2_QP_ATE_S);
+
 	if (hr_qp->ibqp.qp_type == IB_QPT_RC ||
 	    hr_qp->ibqp.qp_type == IB_QPT_UC) {
 		struct ib_global_route *grh =
 				rdma_ah_retrieve_grh(&qp_attr->ah_attr);
 
 		rdma_ah_set_sl(&qp_attr->ah_attr,
-			       roce_get_field(context->byte_28_at_fl,
+			       roce_get_field(context.byte_28_at_fl,
 					      V2_QPC_BYTE_28_SL_M,
 					      V2_QPC_BYTE_28_SL_S));
-		grh->flow_label = roce_get_field(context->byte_28_at_fl,
+		grh->flow_label = roce_get_field(context.byte_28_at_fl,
 						 V2_QPC_BYTE_28_FL_M,
 						 V2_QPC_BYTE_28_FL_S);
-		grh->sgid_index = roce_get_field(context->byte_20_smac_sgid_idx,
+		grh->sgid_index = roce_get_field(context.byte_20_smac_sgid_idx,
 						 V2_QPC_BYTE_20_SGID_IDX_M,
 						 V2_QPC_BYTE_20_SGID_IDX_S);
-		grh->hop_limit = roce_get_field(context->byte_24_mtu_tc,
+		grh->hop_limit = roce_get_field(context.byte_24_mtu_tc,
 						V2_QPC_BYTE_24_HOP_LIMIT_M,
 						V2_QPC_BYTE_24_HOP_LIMIT_S);
-		grh->traffic_class = roce_get_field(context->byte_24_mtu_tc,
+		grh->traffic_class = roce_get_field(context.byte_24_mtu_tc,
 						    V2_QPC_BYTE_24_TC_M,
 						    V2_QPC_BYTE_24_TC_S);
 
-		memcpy(grh->dgid.raw, context->dgid, sizeof(grh->dgid.raw));
+		memcpy(grh->dgid.raw, context.dgid, sizeof(grh->dgid.raw));
 	}
 
 	qp_attr->port_num = hr_qp->port + 1;
 	qp_attr->sq_draining = 0;
-	qp_attr->max_rd_atomic = 1 << roce_get_field(context->byte_208_irrl,
+	qp_attr->max_rd_atomic = 1 << roce_get_field(context.byte_208_irrl,
 						     V2_QPC_BYTE_208_SR_MAX_M,
 						     V2_QPC_BYTE_208_SR_MAX_S);
-	qp_attr->max_dest_rd_atomic = 1 << roce_get_field(context->byte_140_raq,
+	qp_attr->max_dest_rd_atomic = 1 << roce_get_field(context.byte_140_raq,
 						     V2_QPC_BYTE_140_RR_MAX_M,
 						     V2_QPC_BYTE_140_RR_MAX_S);
-	qp_attr->min_rnr_timer = (u8)roce_get_field(context->byte_80_rnr_rx_cqn,
+	qp_attr->min_rnr_timer = (u8)roce_get_field(context.byte_80_rnr_rx_cqn,
 						 V2_QPC_BYTE_80_MIN_RNR_TIME_M,
 						 V2_QPC_BYTE_80_MIN_RNR_TIME_S);
-	qp_attr->timeout = (u8)roce_get_field(context->byte_28_at_fl,
+	qp_attr->timeout = (u8)roce_get_field(context.byte_28_at_fl,
 					      V2_QPC_BYTE_28_AT_M,
 					      V2_QPC_BYTE_28_AT_S);
-	qp_attr->retry_cnt = roce_get_field(context->byte_212_lsn,
+	qp_attr->retry_cnt = roce_get_field(context.byte_212_lsn,
 					    V2_QPC_BYTE_212_RETRY_CNT_M,
 					    V2_QPC_BYTE_212_RETRY_CNT_S);
-	qp_attr->rnr_retry = context->rq_rnr_timer;
+	qp_attr->rnr_retry = le32_to_cpu(context.rq_rnr_timer);
 
 done:
 	qp_attr->cur_qp_state = qp_attr->qp_state;
@@ -3814,16 +4641,15 @@
 
 out:
 	mutex_unlock(&hr_qp->mutex);
-	kfree(context);
 	return ret;
 }
 
 static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
 					 struct hns_roce_qp *hr_qp,
-					 int is_user)
+					 struct ib_udata *udata)
 {
 	struct hns_roce_cq *send_cq, *recv_cq;
-	struct device *dev = hr_dev->dev;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	int ret;
 
 	if (hr_qp->ibqp.qp_type == IB_QPT_RC && hr_qp->state != IB_QPS_RESET) {
@@ -3831,8 +4657,7 @@
 		ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0,
 					    hr_qp->state, IB_QPS_RESET);
 		if (ret) {
-			dev_err(dev, "modify QP %06lx to ERR failed.\n",
-				hr_qp->qpn);
+			ibdev_err(ibdev, "modify QP to Reset failed.\n");
 			return ret;
 		}
 	}
@@ -3842,7 +4667,7 @@
 
 	hns_roce_lock_cqs(send_cq, recv_cq);
 
-	if (!is_user) {
+	if (!udata) {
 		__hns_roce_v2_cq_clean(recv_cq, hr_qp->qpn, hr_qp->ibqp.srq ?
 				       to_hr_srq(hr_qp->ibqp.srq) : NULL);
 		if (send_cq != recv_cq)
@@ -3861,19 +4686,20 @@
 	    (hr_qp->ibqp.qp_type == IB_QPT_UD))
 		hns_roce_release_range_qp(hr_dev, hr_qp->qpn, 1);
 
-	hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt);
+	hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr);
 
-	if (is_user) {
+	if (udata) {
+		struct hns_roce_ucontext *context =
+			rdma_udata_to_drv_context(
+				udata,
+				struct hns_roce_ucontext,
+				ibucontext);
+
 		if (hr_qp->sq.wqe_cnt && (hr_qp->sdb_en == 1))
-			hns_roce_db_unmap_user(
-				to_hr_ucontext(hr_qp->ibqp.uobject->context),
-				&hr_qp->sdb);
+			hns_roce_db_unmap_user(context, &hr_qp->sdb);
 
 		if (hr_qp->rq.wqe_cnt && (hr_qp->rdb_en == 1))
-			hns_roce_db_unmap_user(
-				to_hr_ucontext(hr_qp->ibqp.uobject->context),
-				&hr_qp->rdb);
-		ib_umem_release(hr_qp->umem);
+			hns_roce_db_unmap_user(context, &hr_qp->rdb);
 	} else {
 		kfree(hr_qp->sq.wrid);
 		kfree(hr_qp->rq.wrid);
@@ -3881,8 +4707,10 @@
 		if (hr_qp->rq.wqe_cnt)
 			hns_roce_free_db(hr_dev, &hr_qp->rdb);
 	}
+	ib_umem_release(hr_qp->umem);
 
-	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) {
+	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
+	     hr_qp->rq.wqe_cnt) {
 		kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list);
 		kfree(hr_qp->rq_inl_buf.wqe_list);
 	}
@@ -3890,15 +4718,16 @@
 	return 0;
 }
 
-static int hns_roce_v2_destroy_qp(struct ib_qp *ibqp)
+static int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
 	int ret;
 
-	ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, !!ibqp->pd->uobject);
+	ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata);
 	if (ret) {
-		dev_err(hr_dev->dev, "Destroy qp failed(%d)\n", ret);
+		ibdev_err(&hr_dev->ib_dev, "Destroy qp 0x%06lx failed(%d)\n",
+			  hr_qp->qpn, ret);
 		return ret;
 	}
 
@@ -3910,6 +4739,59 @@
 	return 0;
 }
 
+static int hns_roce_v2_qp_flow_control_init(struct hns_roce_dev *hr_dev,
+						struct hns_roce_qp *hr_qp)
+{
+	struct hns_roce_sccc_clr_done *resp;
+	struct hns_roce_sccc_clr *clr;
+	struct hns_roce_cmq_desc desc;
+	int ret, i;
+
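+	/* Clearing the SCC context is a three-step handshake: reset the
+	 * clear-done flag, issue the clear for this QPN, then poll the
+	 * done flag until the firmware finishes or the retry budget runs
+	 * out.
+	 */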
+	mutex_lock(&hr_dev->qp_table.scc_mutex);
+
+	/* set scc ctx clear done flag */
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_RESET_SCCC, false);
+	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (ret) {
+		dev_err(hr_dev->dev, "Reset SCC ctx failed(%d)\n", ret);
+		goto out;
+	}
+
+	/* clear scc context */
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CLR_SCCC, false);
+	clr = (struct hns_roce_sccc_clr *)desc.data;
+	clr->qpn = cpu_to_le32(hr_qp->qpn);
+	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (ret) {
+		dev_err(hr_dev->dev, "Clear SCC ctx failed(%d)\n", ret);
+		goto out;
+	}
+
+	/* query scc context clear is done or not */
+	resp = (struct hns_roce_sccc_clr_done *)desc.data;
+	for (i = 0; i <= HNS_ROCE_CMQ_SCC_CLR_DONE_CNT; i++) {
+		hns_roce_cmq_setup_basic_desc(&desc,
+					      HNS_ROCE_OPC_QUERY_SCCC, true);
+		ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+		if (ret) {
+			dev_err(hr_dev->dev, "Query clr cmq failed(%d)\n", ret);
+			goto out;
+		}
+
+		if (resp->clr_done)
+			goto out;
+
+		msleep(20);
+	}
+
+	dev_err(hr_dev->dev, "Query SCC clr done flag timed out.\n");
+	ret = -ETIMEDOUT;
+
+out:
+	mutex_unlock(&hr_dev->qp_table.scc_mutex);
+	return ret;
+}
+
 static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(cq->device);
@@ -3967,7 +4849,8 @@
 	if (hr_qp->ibqp.uobject) {
 		if (hr_qp->sdb_en == 1) {
 			hr_qp->sq.head = *(int *)(hr_qp->sdb.virt_addr);
-			hr_qp->rq.head = *(int *)(hr_qp->rdb.virt_addr);
+			if (hr_qp->rdb_en == 1)
+				hr_qp->rq.head = *(int *)(hr_qp->rdb.virt_addr);
 		} else {
 			dev_warn(hr_dev->dev, "flush cqe is unsupported in userspace!\n");
 			return;
@@ -3987,14 +4870,58 @@
 {
 	struct hns_roce_work *irq_work =
 				container_of(work, struct hns_roce_work, work);
+	struct device *dev = irq_work->hr_dev->dev;
 	u32 qpn = irq_work->qpn;
+	u32 cqn = irq_work->cqn;
 
 	switch (irq_work->event_type) {
+	case HNS_ROCE_EVENT_TYPE_PATH_MIG:
+		dev_info(dev, "Path migration succeeded.\n");
+		break;
+	case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED:
+		dev_warn(dev, "Path migration failed.\n");
+		break;
+	case HNS_ROCE_EVENT_TYPE_COMM_EST:
+		break;
+	case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
+		dev_warn(dev, "Send queue drained.\n");
+		break;
 	case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
-	case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
-	case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
+		dev_err(dev, "Local work queue 0x%x catas error, sub_type:%d\n",
+			qpn, irq_work->sub_type);
 		hns_roce_set_qps_to_err(irq_work->hr_dev, qpn);
 		break;
+	case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
+		dev_err(dev, "Invalid request local work queue 0x%x error.\n",
+			qpn);
+		hns_roce_set_qps_to_err(irq_work->hr_dev, qpn);
+		break;
+	case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
+		dev_err(dev, "Local access violation work queue 0x%x error, sub_type:%d\n",
+			qpn, irq_work->sub_type);
+		hns_roce_set_qps_to_err(irq_work->hr_dev, qpn);
+		break;
+	case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
+		dev_warn(dev, "SRQ limit reached.\n");
+		break;
+	case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
+		dev_warn(dev, "SRQ last wqe reached.\n");
+		break;
+	case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR:
+		dev_err(dev, "SRQ catas error.\n");
+		break;
+	case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
+		dev_err(dev, "CQ 0x%x access err.\n", cqn);
+		break;
+	case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
+		dev_warn(dev, "CQ 0x%x overflow\n", cqn);
+		break;
+	case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW:
+		dev_warn(dev, "DB overflow.\n");
+		break;
+	case HNS_ROCE_EVENT_TYPE_FLR:
+		dev_warn(dev, "Function level reset.\n");
+		break;
 	default:
 		break;
 	}
@@ -4003,7 +4930,8 @@
 }
 
 static void hns_roce_v2_init_irq_work(struct hns_roce_dev *hr_dev,
-				      struct hns_roce_eq *eq, u32 qpn)
+				      struct hns_roce_eq *eq,
+				      u32 qpn, u32 cqn)
 {
 	struct hns_roce_work *irq_work;
 
@@ -4014,6 +4942,7 @@
 	INIT_WORK(&(irq_work->work), hns_roce_irq_work_handle);
 	irq_work->hr_dev = hr_dev;
 	irq_work->qpn = qpn;
+	irq_work->cqn = cqn;
 	irq_work->event_type = eq->event_type;
 	irq_work->sub_type = eq->sub_type;
 	queue_work(hr_dev->irq_workq, &(irq_work->work));
@@ -4021,7 +4950,8 @@
 
 static void set_eq_cons_index_v2(struct hns_roce_eq *eq)
 {
-	u32 doorbell[2];
+	struct hns_roce_dev *hr_dev = eq->hr_dev;
+	__le32 doorbell[2];
 
 	doorbell[0] = 0;
 	doorbell[1] = 0;
@@ -4047,125 +4977,7 @@
 		       HNS_ROCE_V2_EQ_DB_PARA_S,
 		       (eq->cons_index & HNS_ROCE_V2_CONS_IDX_M));
 
-	hns_roce_write64_k(doorbell, eq->doorbell);
-}
-
-static void hns_roce_v2_wq_catas_err_handle(struct hns_roce_dev *hr_dev,
-						  struct hns_roce_aeqe *aeqe,
-						  u32 qpn)
-{
-	struct device *dev = hr_dev->dev;
-	int sub_type;
-
-	dev_warn(dev, "Local work queue catastrophic error.\n");
-	sub_type = roce_get_field(aeqe->asyn, HNS_ROCE_V2_AEQE_SUB_TYPE_M,
-				  HNS_ROCE_V2_AEQE_SUB_TYPE_S);
-	switch (sub_type) {
-	case HNS_ROCE_LWQCE_QPC_ERROR:
-		dev_warn(dev, "QP %d, QPC error.\n", qpn);
-		break;
-	case HNS_ROCE_LWQCE_MTU_ERROR:
-		dev_warn(dev, "QP %d, MTU error.\n", qpn);
-		break;
-	case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR:
-		dev_warn(dev, "QP %d, WQE BA addr error.\n", qpn);
-		break;
-	case HNS_ROCE_LWQCE_WQE_ADDR_ERROR:
-		dev_warn(dev, "QP %d, WQE addr error.\n", qpn);
-		break;
-	case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR:
-		dev_warn(dev, "QP %d, WQE shift error.\n", qpn);
-		break;
-	default:
-		dev_err(dev, "Unhandled sub_event type %d.\n", sub_type);
-		break;
-	}
-}
-
-static void hns_roce_v2_local_wq_access_err_handle(struct hns_roce_dev *hr_dev,
-					    struct hns_roce_aeqe *aeqe, u32 qpn)
-{
-	struct device *dev = hr_dev->dev;
-	int sub_type;
-
-	dev_warn(dev, "Local access violation work queue error.\n");
-	sub_type = roce_get_field(aeqe->asyn, HNS_ROCE_V2_AEQE_SUB_TYPE_M,
-				  HNS_ROCE_V2_AEQE_SUB_TYPE_S);
-	switch (sub_type) {
-	case HNS_ROCE_LAVWQE_R_KEY_VIOLATION:
-		dev_warn(dev, "QP %d, R_key violation.\n", qpn);
-		break;
-	case HNS_ROCE_LAVWQE_LENGTH_ERROR:
-		dev_warn(dev, "QP %d, length error.\n", qpn);
-		break;
-	case HNS_ROCE_LAVWQE_VA_ERROR:
-		dev_warn(dev, "QP %d, VA error.\n", qpn);
-		break;
-	case HNS_ROCE_LAVWQE_PD_ERROR:
-		dev_err(dev, "QP %d, PD error.\n", qpn);
-		break;
-	case HNS_ROCE_LAVWQE_RW_ACC_ERROR:
-		dev_warn(dev, "QP %d, rw acc error.\n", qpn);
-		break;
-	case HNS_ROCE_LAVWQE_KEY_STATE_ERROR:
-		dev_warn(dev, "QP %d, key state error.\n", qpn);
-		break;
-	case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR:
-		dev_warn(dev, "QP %d, MR operation error.\n", qpn);
-		break;
-	default:
-		dev_err(dev, "Unhandled sub_event type %d.\n", sub_type);
-		break;
-	}
-}
-
-static void hns_roce_v2_qp_err_handle(struct hns_roce_dev *hr_dev,
-				      struct hns_roce_aeqe *aeqe,
-				      int event_type, u32 qpn)
-{
-	struct device *dev = hr_dev->dev;
-
-	switch (event_type) {
-	case HNS_ROCE_EVENT_TYPE_COMM_EST:
-		dev_warn(dev, "Communication established.\n");
-		break;
-	case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
-		dev_warn(dev, "Send queue drained.\n");
-		break;
-	case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
-		hns_roce_v2_wq_catas_err_handle(hr_dev, aeqe, qpn);
-		break;
-	case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
-		dev_warn(dev, "Invalid request local work queue error.\n");
-		break;
-	case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
-		hns_roce_v2_local_wq_access_err_handle(hr_dev, aeqe, qpn);
-		break;
-	default:
-		break;
-	}
-
-	hns_roce_qp_event(hr_dev, qpn, event_type);
-}
-
-static void hns_roce_v2_cq_err_handle(struct hns_roce_dev *hr_dev,
-				      struct hns_roce_aeqe *aeqe,
-				      int event_type, u32 cqn)
-{
-	struct device *dev = hr_dev->dev;
-
-	switch (event_type) {
-	case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
-		dev_warn(dev, "CQ 0x%x access err.\n", cqn);
-		break;
-	case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
-		dev_warn(dev, "CQ 0x%x overflow\n", cqn);
-		break;
-	default:
-		break;
-	}
-
-	hns_roce_cq_event(hr_dev, cqn, event_type);
+	hns_roce_write64(hr_dev, doorbell, eq->doorbell);
 }
 
 static struct hns_roce_aeqe *get_aeqe_v2(struct hns_roce_eq *eq, u32 entry)
@@ -4214,15 +5026,15 @@
 			       struct hns_roce_eq *eq)
 {
 	struct device *dev = hr_dev->dev;
-	struct hns_roce_aeqe *aeqe;
+	struct hns_roce_aeqe *aeqe = next_aeqe_sw_v2(eq);
 	int aeqe_found = 0;
 	int event_type;
 	int sub_type;
+	u32 srqn;
 	u32 qpn;
 	u32 cqn;
 
-	while ((aeqe = next_aeqe_sw_v2(eq))) {
-
+	while (aeqe) {
 		/* Make sure we read AEQ entry after we have checked the
 		 * ownership bit
 		 */
@@ -4240,34 +5052,30 @@
 		cqn = roce_get_field(aeqe->event.cq_event.cq,
 				     HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M,
 				     HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S);
+		srqn = roce_get_field(aeqe->event.srq_event.srq,
+				     HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M,
+				     HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S);
 
 		switch (event_type) {
 		case HNS_ROCE_EVENT_TYPE_PATH_MIG:
-			dev_warn(dev, "Path migrated succeeded.\n");
-			break;
 		case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED:
-			dev_warn(dev, "Path migration failed.\n");
-			break;
 		case HNS_ROCE_EVENT_TYPE_COMM_EST:
 		case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
 		case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
+		case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
 		case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
 		case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
-			hns_roce_v2_qp_err_handle(hr_dev, aeqe, event_type,
-						  qpn);
+			hns_roce_qp_event(hr_dev, qpn, event_type);
 			break;
 		case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
-		case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
 		case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR:
-			dev_warn(dev, "SRQ not support.\n");
+			hns_roce_srq_event(hr_dev, srqn, event_type);
 			break;
 		case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
 		case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
-			hns_roce_v2_cq_err_handle(hr_dev, aeqe, event_type,
-						  cqn);
+			hns_roce_cq_event(hr_dev, cqn, event_type);
 			break;
 		case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW:
-			dev_warn(dev, "DB overflow.\n");
 			break;
 		case HNS_ROCE_EVENT_TYPE_MB:
 			hns_roce_cmd_event(hr_dev,
@@ -4276,27 +5084,26 @@
 					le64_to_cpu(aeqe->event.cmd.out_param));
 			break;
 		case HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW:
-			dev_warn(dev, "CEQ overflow.\n");
 			break;
 		case HNS_ROCE_EVENT_TYPE_FLR:
-			dev_warn(dev, "Function level reset.\n");
 			break;
 		default:
 			dev_err(dev, "Unhandled event %d on EQ %d at idx %u.\n",
 				event_type, eq->eqn, eq->cons_index);
 			break;
-		};
+		}
 
 		eq->event_type = event_type;
 		eq->sub_type = sub_type;
 		++eq->cons_index;
 		aeqe_found = 1;
 
-		if (eq->cons_index > (2 * eq->entries - 1)) {
-			dev_warn(dev, "cons_index overflow, set back to 0.\n");
+		if (eq->cons_index > (2 * eq->entries - 1))
 			eq->cons_index = 0;
-		}
-		hns_roce_v2_init_irq_work(hr_dev, eq, qpn);
+
+		hns_roce_v2_init_irq_work(hr_dev, eq, qpn, cqn);
+
+		aeqe = next_aeqe_sw_v2(eq);
 	}
 
 	set_eq_cons_index_v2(eq);
@@ -4349,12 +5156,11 @@
 			       struct hns_roce_eq *eq)
 {
 	struct device *dev = hr_dev->dev;
-	struct hns_roce_ceqe *ceqe;
+	struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq);
 	int ceqe_found = 0;
 	u32 cqn;
 
-	while ((ceqe = next_ceqe_sw_v2(eq))) {
-
+	while (ceqe) {
 		/* Make sure we read CEQ entry after we have checked the
 		 * ownership bit
 		 */
@@ -4369,10 +5175,12 @@
 		++eq->cons_index;
 		ceqe_found = 1;
 
-		if (eq->cons_index > (2 * eq->entries - 1)) {
+		if (eq->cons_index > (EQ_DEPTH_COEFF * eq->entries - 1)) {
 			dev_warn(dev, "cons_index overflow, set back to 0.\n");
 			eq->cons_index = 0;
 		}
+
+		ceqe = next_ceqe_sw_v2(eq);
 	}
 
 	set_eq_cons_index_v2(eq);
@@ -4408,33 +5216,44 @@
 	int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG);
 	int_en = roce_read(hr_dev, ROCEE_VF_ABN_INT_EN_REG);
 
-	if (roce_get_bit(int_st, HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S)) {
+	if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S)) {
+		struct pci_dev *pdev = hr_dev->pci_dev;
+		struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev);
+		const struct hnae3_ae_ops *ops = ae_dev->ops;
+
 		dev_err(dev, "AEQ overflow!\n");
 
-		roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S, 1);
+		int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S;
 		roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st);
 
-		roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1);
+		/* Set reset level for reset_event() */
+		if (ops->set_default_reset_request)
+			ops->set_default_reset_request(ae_dev,
+						       HNAE3_FUNC_RESET);
+		if (ops->reset_event)
+			ops->reset_event(pdev, NULL);
+
+		int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S;
 		roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en);
 
 		int_work = 1;
-	} else if (roce_get_bit(int_st,	HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S)) {
+	} else if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S)) {
 		dev_err(dev, "BUS ERR!\n");
 
-		roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S, 1);
+		int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S;
 		roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st);
 
-		roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1);
+		int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S;
 		roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en);
 
 		int_work = 1;
-	} else if (roce_get_bit(int_st,	HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S)) {
+	} else if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S)) {
 		dev_err(dev, "OTHER ERR!\n");
 
-		roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S, 1);
+		int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S;
 		roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st);
 
-		roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1);
+		int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S;
 		roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en);
 
 		int_work = 1;
@@ -4506,14 +5325,12 @@
 	buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT);
 	bt_chk_sz = 1 << (hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT);
 
-	/* hop_num = 0 */
 	if (mhop_num == HNS_ROCE_HOP_NUM_0) {
 		dma_free_coherent(dev, (unsigned int)(eq->entries *
 				  eq->eqe_size), eq->bt_l0, eq->l0_dma);
 		return;
 	}
 
-	/* hop_num = 1 or hop = 2 */
 	dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma);
 	if (mhop_num == 1) {
 		for (i = 0; i < eq->l0_last_num; i++) {
@@ -4532,8 +5349,8 @@
 			dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i],
 					  eq->l1_dma[i]);
 
-			for (j = 0; j < bt_chk_sz / 8; j++) {
-				idx = i * (bt_chk_sz / 8) + j;
+			for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) {
+				idx = i * (bt_chk_sz / BA_BYTE_LEN) + j;
 				if ((i == eq->l0_last_num - 1)
 				     && j == eq->l1_last_num - 1) {
 					eqe_alloc = (buf_chk_sz / eq->eqe_size)
@@ -4572,9 +5389,9 @@
 		return;
 	}
 
-	if (eq->buf_list)
-		dma_free_coherent(hr_dev->dev, buf_chk_sz,
-				  eq->buf_list->buf, eq->buf_list->map);
+	dma_free_coherent(hr_dev->dev, buf_chk_sz, eq->buf_list->buf,
+			  eq->buf_list->map);
+	kfree(eq->buf_list);
 }
 
 static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev,
@@ -4749,11 +5566,10 @@
 	buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT);
 	bt_chk_sz = 1 << (hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT);
 
-	ba_num = (PAGE_ALIGN(eq->entries * eq->eqe_size) + buf_chk_sz - 1)
-		  / buf_chk_sz;
-	bt_num = (ba_num + bt_chk_sz / 8 - 1) / (bt_chk_sz / 8);
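+	/* BA_BYTE_LEN is the 8-byte size of one base address, so each BT
+	 * page holds bt_chk_sz / BA_BYTE_LEN entries.
+	 */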
+	ba_num = DIV_ROUND_UP(PAGE_ALIGN(eq->entries * eq->eqe_size),
+			      buf_chk_sz);
+	bt_num = DIV_ROUND_UP(ba_num, bt_chk_sz / BA_BYTE_LEN);
 
-	/* hop_num = 0 */
 	if (mhop_num == HNS_ROCE_HOP_NUM_0) {
 		if (eq->entries > buf_chk_sz / eq->eqe_size) {
 			dev_err(dev, "eq entries %d is larger than buf_pg_sz!",
@@ -4768,8 +5584,6 @@
 		eq->cur_eqe_ba = eq->l0_dma;
 		eq->nxt_eqe_ba = 0;
 
-		memset(eq->bt_l0, 0, eq->entries * eq->eqe_size);
-
 		return 0;
 	}
 
@@ -4796,12 +5610,12 @@
 		goto err_dma_alloc_l0;
 
 	if (mhop_num == 1) {
-		if (ba_num > (bt_chk_sz / 8))
+		if (ba_num > (bt_chk_sz / BA_BYTE_LEN))
 			dev_err(dev, "ba_num %d is too large for 1 hop\n",
 				ba_num);
 
 		/* alloc buf */
-		for (i = 0; i < bt_chk_sz / 8; i++) {
+		for (i = 0; i < bt_chk_sz / BA_BYTE_LEN; i++) {
 			if (eq_buf_cnt + 1 < ba_num) {
 				size = buf_chk_sz;
 			} else {
@@ -4814,7 +5628,6 @@
 			if (!eq->buf[i])
 				goto err_dma_alloc_buf;
 
-			memset(eq->buf[i], 0, size);
 			*(eq->bt_l0 + i) = eq->buf_dma[i];
 
 			eq_buf_cnt++;
@@ -4822,11 +5635,12 @@
 				break;
 		}
 		eq->cur_eqe_ba = eq->buf_dma[0];
-		eq->nxt_eqe_ba = eq->buf_dma[1];
+		if (ba_num > 1)
+			eq->nxt_eqe_ba = eq->buf_dma[1];
 
 	} else if (mhop_num == 2) {
 		/* alloc L1 BT and buf */
-		for (i = 0; i < bt_chk_sz / 8; i++) {
+		for (i = 0; i < bt_chk_sz / BA_BYTE_LEN; i++) {
 			eq->bt_l1[i] = dma_alloc_coherent(dev, bt_chk_sz,
 							  &(eq->l1_dma[i]),
 							  GFP_KERNEL);
@@ -4834,8 +5648,8 @@
 				goto err_dma_alloc_l1;
 			*(eq->bt_l0 + i) = eq->l1_dma[i];
 
-			for (j = 0; j < bt_chk_sz / 8; j++) {
-				idx = i * bt_chk_sz / 8 + j;
+			for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) {
+				idx = i * bt_chk_sz / BA_BYTE_LEN + j;
 				if (eq_buf_cnt + 1 < ba_num) {
 					size = buf_chk_sz;
 				} else {
@@ -4845,12 +5659,11 @@
 						* eq->eqe_size;
 				}
 				eq->buf[idx] = dma_alloc_coherent(dev, size,
-							    &(eq->buf_dma[idx]),
-							    GFP_KERNEL);
+								  &(eq->buf_dma[idx]),
+								  GFP_KERNEL);
 				if (!eq->buf[idx])
 					goto err_dma_alloc_buf;
 
-				memset(eq->buf[idx], 0, size);
 				*(eq->bt_l1[i] + j) = eq->buf_dma[idx];
 
 				eq_buf_cnt++;
@@ -4864,7 +5677,8 @@
 				break;
 		}
 		eq->cur_eqe_ba = eq->buf_dma[0];
-		eq->nxt_eqe_ba = eq->buf_dma[1];
+		if (ba_num > 1)
+			eq->nxt_eqe_ba = eq->buf_dma[1];
 	}
 
 	eq->l0_last_num = i + 1;
@@ -4881,8 +5695,8 @@
 		dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i],
 				  eq->l1_dma[i]);
 
-		for (j = 0; j < bt_chk_sz / 8; j++) {
-			idx = i * bt_chk_sz / 8 + j;
+		for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) {
+			idx = i * bt_chk_sz / BA_BYTE_LEN + j;
 			dma_free_coherent(dev, buf_chk_sz, eq->buf[idx],
 					  eq->buf_dma[idx]);
 		}
@@ -4905,11 +5719,11 @@
 			dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i],
 					  eq->l1_dma[i]);
 
-			for (j = 0; j < bt_chk_sz / 8; j++) {
+			for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) {
 				if (i == record_i && j >= record_j)
 					break;
 
-				idx = i * bt_chk_sz / 8 + j;
+				idx = i * bt_chk_sz / BA_BYTE_LEN + j;
 				dma_free_coherent(dev, buf_chk_sz,
 						  eq->buf[idx],
 						  eq->buf_dma[idx]);
@@ -4968,7 +5782,6 @@
 			goto err_alloc_buf;
 		}
 
-		memset(eq->buf_list->buf, 0, buf_chk_sz);
 	} else {
 		ret = hns_roce_mhop_alloc_eq(hr_dev, eq);
 		if (ret) {
@@ -5008,6 +5821,95 @@
 	return ret;
 }
 
+static int __hns_roce_request_irq(struct hns_roce_dev *hr_dev, int irq_num,
+				  int comp_num, int aeq_num, int other_num)
+{
+	struct hns_roce_eq_table *eq_table = &hr_dev->eq_table;
+	int i, j;
+	int ret;
+
+	for (i = 0; i < irq_num; i++) {
+		hr_dev->irq_names[i] = kzalloc(HNS_ROCE_INT_NAME_LEN,
+					       GFP_KERNEL);
+		if (!hr_dev->irq_names[i]) {
+			ret = -ENOMEM;
+			goto err_kzalloc_failed;
+		}
+	}
+
+	/* irq contains: abnormal + AEQ + CEQ */
+	for (j = 0; j < other_num; j++)
+		snprintf((char *)hr_dev->irq_names[j],
+			 HNS_ROCE_INT_NAME_LEN, "hns-abn-%d", j);
+
+	for (j = other_num; j < (other_num + aeq_num); j++)
+		snprintf((char *)hr_dev->irq_names[j],
+			 HNS_ROCE_INT_NAME_LEN, "hns-aeq-%d",
+			 j - other_num);
+
+	for (j = (other_num + aeq_num); j < irq_num; j++)
+		snprintf((char *)hr_dev->irq_names[j],
+			 HNS_ROCE_INT_NAME_LEN, "hns-ceq-%d",
+			 j - other_num - aeq_num);
+
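+	/* eq_table->eq[] stores CEQs first and AEQs after them, while
+	 * irq_names[] is ordered abnormal, AEQ, CEQ; the index arithmetic
+	 * below maps between the two layouts.
+	 */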
+	for (j = 0; j < irq_num; j++) {
+		if (j < other_num)
+			ret = request_irq(hr_dev->irq[j],
+					  hns_roce_v2_msix_interrupt_abn,
+					  0, hr_dev->irq_names[j], hr_dev);
+
+		else if (j < (other_num + comp_num))
+			ret = request_irq(eq_table->eq[j - other_num].irq,
+					  hns_roce_v2_msix_interrupt_eq,
+					  0, hr_dev->irq_names[j + aeq_num],
+					  &eq_table->eq[j - other_num]);
+		else
+			ret = request_irq(eq_table->eq[j - other_num].irq,
+					  hns_roce_v2_msix_interrupt_eq,
+					  0, hr_dev->irq_names[j - comp_num],
+					  &eq_table->eq[j - other_num]);
+		if (ret) {
+			dev_err(hr_dev->dev, "Request irq error!\n");
+			goto err_request_failed;
+		}
+	}
+
+	return 0;
+
+err_request_failed:
+	for (j -= 1; j >= 0; j--)
+		if (j < other_num)
+			free_irq(hr_dev->irq[j], hr_dev);
+		else
+			free_irq(eq_table->eq[j - other_num].irq,
+				 &eq_table->eq[j - other_num]);
+
+err_kzalloc_failed:
+	for (i -= 1; i >= 0; i--)
+		kfree(hr_dev->irq_names[i]);
+
+	return ret;
+}
+
+static void __hns_roce_free_irq(struct hns_roce_dev *hr_dev)
+{
+	int irq_num;
+	int eq_num;
+	int i;
+
+	eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors;
+	irq_num = eq_num + hr_dev->caps.num_other_vectors;
+
+	for (i = 0; i < hr_dev->caps.num_other_vectors; i++)
+		free_irq(hr_dev->irq[i], hr_dev);
+
+	for (i = 0; i < eq_num; i++)
+		free_irq(hr_dev->eq_table.eq[i].irq, &hr_dev->eq_table.eq[i]);
+
+	for (i = 0; i < irq_num; i++)
+		kfree(hr_dev->irq_names[i]);
+}
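Editor's note (illustrative sketch, not from the patch): irq_names[] is laid out as abnormal, then AEQ, then CEQ entries, while eq_table->eq[] stores completion EQs before async EQs; that mismatch is why the request loop in __hns_roce_request_irq() offsets the name index by +aeq_num for completion vectors and by -comp_num for async vectors. A minimal userspace model of that index mapping, with made-up vector counts:

#include <stdio.h>

/* Hypothetical vector counts, standing in for other_num/aeq_num/comp_num. */
#define OTHER_NUM 1	/* abnormal vectors               */
#define AEQ_NUM   1	/* async event queue vectors      */
#define COMP_NUM  4	/* completion event queue vectors */

/* Map an irq slot j (requested in the order abnormal, CEQs, AEQs) to the
 * matching index in a name table laid out as abn, aeq, ceq.
 */
static int name_index(int j)
{
	if (j < OTHER_NUM)
		return j;			/* "hns-abn-%d" entries      */
	else if (j < OTHER_NUM + COMP_NUM)
		return j + AEQ_NUM;		/* CEQ names come last       */
	else
		return j - COMP_NUM;		/* AEQ names sit in the middle */
}

int main(void)
{
	for (int j = 0; j < OTHER_NUM + AEQ_NUM + COMP_NUM; j++)
		printf("irq slot %d -> name slot %d\n", j, name_index(j));
	return 0;
}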
+
 static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_eq_table *eq_table = &hr_dev->eq_table;
@@ -5019,7 +5921,7 @@
 	int other_num;
 	int comp_num;
 	int aeq_num;
-	int i, j, k;
+	int i;
 	int ret;
 
 	other_num = hr_dev->caps.num_other_vectors;
@@ -5033,27 +5935,18 @@
 	if (!eq_table->eq)
 		return -ENOMEM;
 
-	for (i = 0; i < irq_num; i++) {
-		hr_dev->irq_names[i] = kzalloc(HNS_ROCE_INT_NAME_LEN,
-					       GFP_KERNEL);
-		if (!hr_dev->irq_names[i]) {
-			ret = -ENOMEM;
-			goto err_failed_kzalloc;
-		}
-	}
-
 	/* create eq */
-	for (j = 0; j < eq_num; j++) {
-		eq = &eq_table->eq[j];
+	for (i = 0; i < eq_num; i++) {
+		eq = &eq_table->eq[i];
 		eq->hr_dev = hr_dev;
-		eq->eqn = j;
-		if (j < comp_num) {
+		eq->eqn = i;
+		if (i < comp_num) {
 			/* CEQ */
 			eq_cmd = HNS_ROCE_CMD_CREATE_CEQC;
 			eq->type_flag = HNS_ROCE_CEQ;
 			eq->entries = hr_dev->caps.ceqe_depth;
 			eq->eqe_size = HNS_ROCE_CEQ_ENTRY_SIZE;
-			eq->irq = hr_dev->irq[j + other_num + aeq_num];
+			eq->irq = hr_dev->irq[i + other_num + aeq_num];
 			eq->eq_max_cnt = HNS_ROCE_CEQ_DEFAULT_BURST_NUM;
 			eq->eq_period = HNS_ROCE_CEQ_DEFAULT_INTERVAL;
 		} else {
@@ -5062,7 +5955,7 @@
 			eq->type_flag = HNS_ROCE_AEQ;
 			eq->entries = hr_dev->caps.aeqe_depth;
 			eq->eqe_size = HNS_ROCE_AEQ_ENTRY_SIZE;
-			eq->irq = hr_dev->irq[j - comp_num + other_num];
+			eq->irq = hr_dev->irq[i - comp_num + other_num];
 			eq->eq_max_cnt = HNS_ROCE_AEQ_DEFAULT_BURST_NUM;
 			eq->eq_period = HNS_ROCE_AEQ_DEFAULT_INTERVAL;
 		}
@@ -5077,66 +5970,32 @@
 	/* enable irq */
 	hns_roce_v2_int_mask_enable(hr_dev, eq_num, EQ_ENABLE);
 
-	/* irq contains: abnormal + AEQ + CEQ*/
-	for (k = 0; k < irq_num; k++)
-		if (k < other_num)
-			snprintf((char *)hr_dev->irq_names[k],
-				 HNS_ROCE_INT_NAME_LEN, "hns-abn-%d", k);
-		else if (k < (other_num + aeq_num))
-			snprintf((char *)hr_dev->irq_names[k],
-				 HNS_ROCE_INT_NAME_LEN, "hns-aeq-%d",
-				 k - other_num);
-		else
-			snprintf((char *)hr_dev->irq_names[k],
-				 HNS_ROCE_INT_NAME_LEN, "hns-ceq-%d",
-				 k - other_num - aeq_num);
-
-	for (k = 0; k < irq_num; k++) {
-		if (k < other_num)
-			ret = request_irq(hr_dev->irq[k],
-					  hns_roce_v2_msix_interrupt_abn,
-					  0, hr_dev->irq_names[k], hr_dev);
-
-		else if (k < (other_num + comp_num))
-			ret = request_irq(eq_table->eq[k - other_num].irq,
-					  hns_roce_v2_msix_interrupt_eq,
-					  0, hr_dev->irq_names[k + aeq_num],
-					  &eq_table->eq[k - other_num]);
-		else
-			ret = request_irq(eq_table->eq[k - other_num].irq,
-					  hns_roce_v2_msix_interrupt_eq,
-					  0, hr_dev->irq_names[k - comp_num],
-					  &eq_table->eq[k - other_num]);
-		if (ret) {
-			dev_err(dev, "Request irq error!\n");
-			goto err_request_irq_fail;
-		}
+	ret = __hns_roce_request_irq(hr_dev, irq_num, comp_num,
+				     aeq_num, other_num);
+	if (ret) {
+		dev_err(dev, "Request irq failed.\n");
+		goto err_request_irq_fail;
 	}
 
 	hr_dev->irq_workq =
 		create_singlethread_workqueue("hns_roce_irq_workqueue");
 	if (!hr_dev->irq_workq) {
 		dev_err(dev, "Create irq workqueue failed!\n");
-		goto err_request_irq_fail;
+		ret = -ENOMEM;
+		goto err_create_wq_fail;
 	}
 
 	return 0;
 
+err_create_wq_fail:
+	__hns_roce_free_irq(hr_dev);
+
 err_request_irq_fail:
-	for (k -= 1; k >= 0; k--)
-		if (k < other_num)
-			free_irq(hr_dev->irq[k], hr_dev);
-		else
-			free_irq(eq_table->eq[k - other_num].irq,
-				 &eq_table->eq[k - other_num]);
+	hns_roce_v2_int_mask_enable(hr_dev, eq_num, EQ_DISABLE);
 
 err_create_eq_fail:
-	for (j -= 1; j >= 0; j--)
-		hns_roce_v2_free_eq(hr_dev, &eq_table->eq[j]);
-
-err_failed_kzalloc:
 	for (i -= 1; i >= 0; i--)
-		kfree(hr_dev->irq_names[i]);
+		hns_roce_v2_free_eq(hr_dev, &eq_table->eq[i]);
 	kfree(eq_table->eq);
 
 	return ret;
@@ -5145,36 +6004,333 @@
 static void hns_roce_v2_cleanup_eq_table(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_eq_table *eq_table = &hr_dev->eq_table;
-	int irq_num;
 	int eq_num;
 	int i;
 
 	eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors;
-	irq_num = eq_num + hr_dev->caps.num_other_vectors;
 
 	/* Disable irq */
 	hns_roce_v2_int_mask_enable(hr_dev, eq_num, EQ_DISABLE);
 
-	for (i = 0; i < hr_dev->caps.num_other_vectors; i++)
-		free_irq(hr_dev->irq[i], hr_dev);
+	__hns_roce_free_irq(hr_dev);
 
 	for (i = 0; i < eq_num; i++) {
 		hns_roce_v2_destroy_eqc(hr_dev, i);
 
-		free_irq(eq_table->eq[i].irq, &eq_table->eq[i]);
-
 		hns_roce_v2_free_eq(hr_dev, &eq_table->eq[i]);
 	}
 
-	for (i = 0; i < irq_num; i++)
-		kfree(hr_dev->irq_names[i]);
-
 	kfree(eq_table->eq);
 
 	flush_workqueue(hr_dev->irq_workq);
 	destroy_workqueue(hr_dev->irq_workq);
 }
 
+static void hns_roce_v2_write_srqc(struct hns_roce_dev *hr_dev,
+				   struct hns_roce_srq *srq, u32 pdn, u16 xrcd,
+				   u32 cqn, void *mb_buf, u64 *mtts_wqe,
+				   u64 *mtts_idx, dma_addr_t dma_handle_wqe,
+				   dma_addr_t dma_handle_idx)
+{
+	struct hns_roce_srq_context *srq_context;
+
+	srq_context = mb_buf;
+	memset(srq_context, 0, sizeof(*srq_context));
+
+	roce_set_field(srq_context->byte_4_srqn_srqst, SRQC_BYTE_4_SRQ_ST_M,
+		       SRQC_BYTE_4_SRQ_ST_S, 1);
+
+	roce_set_field(srq_context->byte_4_srqn_srqst,
+		       SRQC_BYTE_4_SRQ_WQE_HOP_NUM_M,
+		       SRQC_BYTE_4_SRQ_WQE_HOP_NUM_S,
+		       (hr_dev->caps.srqwqe_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 :
+		       hr_dev->caps.srqwqe_hop_num));
+	roce_set_field(srq_context->byte_4_srqn_srqst,
+		       SRQC_BYTE_4_SRQ_SHIFT_M, SRQC_BYTE_4_SRQ_SHIFT_S,
+		       ilog2(srq->max));
+
+	roce_set_field(srq_context->byte_4_srqn_srqst, SRQC_BYTE_4_SRQN_M,
+		       SRQC_BYTE_4_SRQN_S, srq->srqn);
+
+	roce_set_field(srq_context->byte_8_limit_wl, SRQC_BYTE_8_SRQ_LIMIT_WL_M,
+		       SRQC_BYTE_8_SRQ_LIMIT_WL_S, 0);
+
+	roce_set_field(srq_context->byte_12_xrcd, SRQC_BYTE_12_SRQ_XRCD_M,
+		       SRQC_BYTE_12_SRQ_XRCD_S, xrcd);
+
+	srq_context->wqe_bt_ba = cpu_to_le32((u32)(dma_handle_wqe >> 3));
+
+	roce_set_field(srq_context->byte_24_wqe_bt_ba,
+		       SRQC_BYTE_24_SRQ_WQE_BT_BA_M,
+		       SRQC_BYTE_24_SRQ_WQE_BT_BA_S,
+		       dma_handle_wqe >> 35);
+
+	roce_set_field(srq_context->byte_28_rqws_pd, SRQC_BYTE_28_PD_M,
+		       SRQC_BYTE_28_PD_S, pdn);
+	roce_set_field(srq_context->byte_28_rqws_pd, SRQC_BYTE_28_RQWS_M,
+		       SRQC_BYTE_28_RQWS_S, srq->max_gs <= 0 ? 0 :
+		       fls(srq->max_gs - 1));
+
+	srq_context->idx_bt_ba = cpu_to_le32(dma_handle_idx >> 3);
+	roce_set_field(srq_context->rsv_idx_bt_ba,
+		       SRQC_BYTE_36_SRQ_IDX_BT_BA_M,
+		       SRQC_BYTE_36_SRQ_IDX_BT_BA_S,
+		       dma_handle_idx >> 35);
+
+	srq_context->idx_cur_blk_addr =
+		cpu_to_le32(mtts_idx[0] >> PAGE_ADDR_SHIFT);
+	roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
+		       SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_M,
+		       SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_S,
+		       mtts_idx[0] >> (32 + PAGE_ADDR_SHIFT));
+	roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
+		       SRQC_BYTE_44_SRQ_IDX_HOP_NUM_M,
+		       SRQC_BYTE_44_SRQ_IDX_HOP_NUM_S,
+		       hr_dev->caps.idx_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 :
+		       hr_dev->caps.idx_hop_num);
+
+	roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
+		       SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_M,
+		       SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_S,
+		       hr_dev->caps.idx_ba_pg_sz);
+	roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
+		       SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_M,
+		       SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_S,
+		       hr_dev->caps.idx_buf_pg_sz);
+
+	srq_context->idx_nxt_blk_addr =
+		cpu_to_le32(mtts_idx[1] >> PAGE_ADDR_SHIFT);
+	roce_set_field(srq_context->rsv_idxnxtblkaddr,
+		       SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_M,
+		       SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_S,
+		       mtts_idx[1] >> (32 + PAGE_ADDR_SHIFT));
+	roce_set_field(srq_context->byte_56_xrc_cqn,
+		       SRQC_BYTE_56_SRQ_XRC_CQN_M, SRQC_BYTE_56_SRQ_XRC_CQN_S,
+		       cqn);
+	roce_set_field(srq_context->byte_56_xrc_cqn,
+		       SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_M,
+		       SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_S,
+		       hr_dev->caps.srqwqe_ba_pg_sz + PG_SHIFT_OFFSET);
+	roce_set_field(srq_context->byte_56_xrc_cqn,
+		       SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_M,
+		       SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_S,
+		       hr_dev->caps.srqwqe_buf_pg_sz + PG_SHIFT_OFFSET);
+
+	roce_set_bit(srq_context->db_record_addr_record_en,
+		     SRQC_BYTE_60_SRQ_RECORD_EN_S, 0);
+}
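Editor's note (illustrative sketch, not from the patch): hns_roce_v2_write_srqc() stores base-table addresses in 8-byte units, so a 64-bit DMA address is first shifted right by 3 and then split into a 32-bit low word (wqe_bt_ba / idx_bt_ba) plus a narrower upper field written with the address shifted right by 35. A small userspace model of that split, assuming an 8-byte-aligned address:

#include <stdint.h>
#include <stdio.h>

/* Split an 8-byte-aligned 64-bit DMA address the way the SRQ context does:
 * express it in 8-byte units (>> 3), put the low 32 bits of that value in
 * one register word and the remaining bits (addr >> 35) in a second field.
 */
static void split_ba(uint64_t dma_addr, uint32_t *lo, uint32_t *hi)
{
	*lo = (uint32_t)(dma_addr >> 3);	/* low 32 bits, 8-byte units */
	*hi = (uint32_t)(dma_addr >> 35);	/* bits above the low word   */
}

int main(void)
{
	uint32_t lo, hi;

	split_ba(0x123456789A8ULL, &lo, &hi);
	printf("lo=0x%x hi=0x%x\n", lo, hi);

	/* Reassembly as a sanity check: ((hi << 32) | lo) << 3. */
	printf("rebuilt=0x%llx\n",
	       (unsigned long long)((((uint64_t)hi << 32) | lo) << 3));
	return 0;
}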
+
+static int hns_roce_v2_modify_srq(struct ib_srq *ibsrq,
+				  struct ib_srq_attr *srq_attr,
+				  enum ib_srq_attr_mask srq_attr_mask,
+				  struct ib_udata *udata)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
+	struct hns_roce_srq *srq = to_hr_srq(ibsrq);
+	struct hns_roce_srq_context *srq_context;
+	struct hns_roce_srq_context *srqc_mask;
+	struct hns_roce_cmd_mailbox *mailbox;
+	int ret;
+
+	if (srq_attr_mask & IB_SRQ_LIMIT) {
+		if (srq_attr->srq_limit >= srq->max)
+			return -EINVAL;
+
+		mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+		if (IS_ERR(mailbox))
+			return PTR_ERR(mailbox);
+
+		srq_context = mailbox->buf;
+		srqc_mask = (struct hns_roce_srq_context *)mailbox->buf + 1;
+
+		memset(srqc_mask, 0xff, sizeof(*srqc_mask));
+
+		roce_set_field(srq_context->byte_8_limit_wl,
+			       SRQC_BYTE_8_SRQ_LIMIT_WL_M,
+			       SRQC_BYTE_8_SRQ_LIMIT_WL_S, srq_attr->srq_limit);
+		roce_set_field(srqc_mask->byte_8_limit_wl,
+			       SRQC_BYTE_8_SRQ_LIMIT_WL_M,
+			       SRQC_BYTE_8_SRQ_LIMIT_WL_S, 0);
+
+		ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, srq->srqn, 0,
+					HNS_ROCE_CMD_MODIFY_SRQC,
+					HNS_ROCE_CMD_TIMEOUT_MSECS);
+		hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+		if (ret) {
+			dev_err(hr_dev->dev,
+				"MODIFY SRQ Failed to cmd mailbox.\n");
+			return ret;
+		}
+	}
+
+	return 0;
+}
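Editor's note (illustrative, hypothetical field layout): the modify-SRQ mailbox carries two context images back to back, the new values at mailbox->buf and a mask at mailbox->buf + 1. The mask is preset to all ones and cleared only for the field being changed (SRQ_LIMIT_WL above), so, assuming cleared mask bits mean "take this bit from the new image", the firmware-side update rule can be modelled as:

#include <stdint.h>
#include <stdio.h>

/* Model of the "context + mask" mailbox update: mask bits left at 1 keep the
 * old value, mask bits cleared to 0 are replaced from the new context image.
 * (Hypothetical helper; the word layout here is invented for illustration.)
 */
static uint32_t apply_masked_update(uint32_t old, uint32_t ctx, uint32_t mask)
{
	return (old & mask) | (ctx & ~mask);
}

int main(void)
{
	uint32_t old  = 0xAABB0042;	/* current context word           */
	uint32_t ctx  = 0x00000077;	/* new srq_limit in the low bits  */
	uint32_t mask = 0xFFFF0000;	/* low 16 bits marked for update  */

	printf("updated = 0x%08x\n", apply_masked_update(old, ctx, mask));
	return 0;
}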
+
+static int hns_roce_v2_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
+	struct hns_roce_srq *srq = to_hr_srq(ibsrq);
+	struct hns_roce_srq_context *srq_context;
+	struct hns_roce_cmd_mailbox *mailbox;
+	int limit_wl;
+	int ret;
+
+	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	srq_context = mailbox->buf;
+	ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, srq->srqn, 0,
+				HNS_ROCE_CMD_QUERY_SRQC,
+				HNS_ROCE_CMD_TIMEOUT_MSECS);
+	if (ret) {
+		dev_err(hr_dev->dev, "QUERY SRQ cmd process error\n");
+		goto out;
+	}
+
+	limit_wl = roce_get_field(srq_context->byte_8_limit_wl,
+				  SRQC_BYTE_8_SRQ_LIMIT_WL_M,
+				  SRQC_BYTE_8_SRQ_LIMIT_WL_S);
+
+	attr->srq_limit = limit_wl;
+	attr->max_wr    = srq->max - 1;
+	attr->max_sge   = srq->max_gs;
+
+	memcpy(srq_context, mailbox->buf, sizeof(*srq_context));
+
+out:
+	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+	return ret;
+}
+
+static int find_empty_entry(struct hns_roce_idx_que *idx_que,
+			    unsigned long size)
+{
+	int wqe_idx;
+
+	if (unlikely(bitmap_full(idx_que->bitmap, size)))
+		return -ENOSPC;
+
+	wqe_idx = find_first_zero_bit(idx_que->bitmap, size);
+
+	bitmap_set(idx_que->bitmap, wqe_idx, 1);
+
+	return wqe_idx;
+}
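Editor's note (sketch, not from the patch): find_empty_entry() treats the SRQ index queue as a plain bitmap allocator — fail fast when the map is full, otherwise hand out the first clear bit and mark it used. The same idea in standalone userspace C, with a toy fixed-size map:

#include <stdint.h>
#include <stdio.h>

#define MAP_SIZE 16			/* toy SRQ depth */

static uint32_t wqe_map;		/* bit i set => index i in use */

/* Return the first free index and mark it busy, or -1 when exhausted
 * (the kernel code returns -ENOSPC in that case).
 */
static int alloc_wqe_idx(void)
{
	for (int i = 0; i < MAP_SIZE; i++) {
		if (!(wqe_map & (1u << i))) {
			wqe_map |= 1u << i;
			return i;
		}
	}
	return -1;
}

static void free_wqe_idx(int i)
{
	wqe_map &= ~(1u << i);
}

int main(void)
{
	int a = alloc_wqe_idx();
	int b = alloc_wqe_idx();

	printf("a=%d b=%d\n", a, b);		/* 0 and 1 */
	free_wqe_idx(a);
	printf("reused=%d\n", alloc_wqe_idx());	/* 0 again */
	return 0;
}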
+
+static void fill_idx_queue(struct hns_roce_idx_que *idx_que,
+			   int cur_idx, int wqe_idx)
+{
+	unsigned int *addr;
+
+	addr = (unsigned int *)hns_roce_buf_offset(&idx_que->idx_buf,
+						   cur_idx * idx_que->entry_sz);
+	*addr = wqe_idx;
+}
+
+static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
+				     const struct ib_recv_wr *wr,
+				     const struct ib_recv_wr **bad_wr)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
+	struct hns_roce_srq *srq = to_hr_srq(ibsrq);
+	struct hns_roce_v2_wqe_data_seg *dseg;
+	struct hns_roce_v2_db srq_db;
+	unsigned long flags;
+	int ret = 0;
+	int wqe_idx;
+	void *wqe;
+	int nreq;
+	int ind;
+	int i;
+
+	spin_lock_irqsave(&srq->lock, flags);
+
+	ind = srq->head & (srq->max - 1);
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (unlikely(wr->num_sge > srq->max_gs)) {
+			ret = -EINVAL;
+			*bad_wr = wr;
+			break;
+		}
+
+		if (unlikely(srq->head == srq->tail)) {
+			ret = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+
+		wqe_idx = find_empty_entry(&srq->idx_que, srq->max);
+		if (wqe_idx < 0) {
+			ret = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+
+		fill_idx_queue(&srq->idx_que, ind, wqe_idx);
+		wqe = get_srq_wqe(srq, wqe_idx);
+		dseg = (struct hns_roce_v2_wqe_data_seg *)wqe;
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			dseg[i].len = cpu_to_le32(wr->sg_list[i].length);
+			dseg[i].lkey = cpu_to_le32(wr->sg_list[i].lkey);
+			dseg[i].addr = cpu_to_le64(wr->sg_list[i].addr);
+		}
+
+		if (i < srq->max_gs) {
+			dseg[i].len = 0;
+			dseg[i].lkey = cpu_to_le32(0x100);
+			dseg[i].addr = 0;
+		}
+
+		srq->wrid[wqe_idx] = wr->wr_id;
+		ind = (ind + 1) & (srq->max - 1);
+	}
+
+	if (likely(nreq)) {
+		srq->head += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		srq_db.byte_4 =
+			cpu_to_le32(HNS_ROCE_V2_SRQ_DB << V2_DB_BYTE_4_CMD_S |
+				    (srq->srqn & V2_DB_BYTE_4_TAG_M));
+		srq_db.parameter = cpu_to_le32(srq->head);
+
+		hns_roce_write64(hr_dev, (__le32 *)&srq_db, srq->db_reg_l);
+
+	}
+
+	spin_unlock_irqrestore(&srq->lock, flags);
+
+	return ret;
+}
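Editor's note (illustrative sketch): the post loop above fills the receive WQEs in host memory first, issues wmb() so those stores are visible before the doorbell, and only then writes the new head to the doorbell register, ensuring the device never sees a head that points at a descriptor whose contents have not landed yet. Modelled in userspace C with a release fence standing in for wmb():

#include <stdint.h>
#include <stddef.h>

struct toy_wqe { uint64_t addr; uint32_t len; uint32_t lkey; };

static struct toy_wqe wqe_ring[8];
static volatile uint32_t doorbell;	/* stands in for the MMIO register */

/* Post one receive: write the descriptor, fence, then ring the doorbell
 * with the new head index.
 */
static void post_one(uint32_t head, uint64_t addr, uint32_t len, uint32_t lkey)
{
	struct toy_wqe *wqe = &wqe_ring[head % 8];

	wqe->addr = addr;
	wqe->len  = len;
	wqe->lkey = lkey;

	__atomic_thread_fence(__ATOMIC_RELEASE);	/* kernel: wmb() */

	doorbell = head + 1;
}

int main(void)
{
	post_one(0, 0x1000, 256, 0x42);
	return 0;
}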
+
+static const struct hns_roce_dfx_hw hns_roce_dfx_hw_v2 = {
+	.query_cqc_info = hns_roce_v2_query_cqc_info,
+};
+
+static const struct ib_device_ops hns_roce_v2_dev_ops = {
+	.destroy_qp = hns_roce_v2_destroy_qp,
+	.modify_cq = hns_roce_v2_modify_cq,
+	.poll_cq = hns_roce_v2_poll_cq,
+	.post_recv = hns_roce_v2_post_recv,
+	.post_send = hns_roce_v2_post_send,
+	.query_qp = hns_roce_v2_query_qp,
+	.req_notify_cq = hns_roce_v2_req_notify_cq,
+};
+
+static const struct ib_device_ops hns_roce_v2_dev_srq_ops = {
+	.modify_srq = hns_roce_v2_modify_srq,
+	.post_srq_recv = hns_roce_v2_post_srq_recv,
+	.query_srq = hns_roce_v2_query_srq,
+};
+
 static const struct hns_roce_hw hns_roce_hw_v2 = {
 	.cmq_init = hns_roce_v2_cmq_init,
 	.cmq_exit = hns_roce_v2_cmq_exit,
@@ -5183,16 +6339,20 @@
 	.hw_exit = hns_roce_v2_exit,
 	.post_mbox = hns_roce_v2_post_mbox,
 	.chk_mbox = hns_roce_v2_chk_mbox,
+	.rst_prc_mbox = hns_roce_v2_rst_process_cmd,
 	.set_gid = hns_roce_v2_set_gid,
 	.set_mac = hns_roce_v2_set_mac,
 	.write_mtpt = hns_roce_v2_write_mtpt,
 	.rereg_write_mtpt = hns_roce_v2_rereg_write_mtpt,
+	.frmr_write_mtpt = hns_roce_v2_frmr_write_mtpt,
+	.mw_write_mtpt = hns_roce_v2_mw_write_mtpt,
 	.write_cqc = hns_roce_v2_write_cqc,
 	.set_hem = hns_roce_v2_set_hem,
 	.clear_hem = hns_roce_v2_clear_hem,
 	.modify_qp = hns_roce_v2_modify_qp,
 	.query_qp = hns_roce_v2_query_qp,
 	.destroy_qp = hns_roce_v2_destroy_qp,
+	.qp_flow_control_init = hns_roce_v2_qp_flow_control_init,
 	.modify_cq = hns_roce_v2_modify_cq,
 	.post_send = hns_roce_v2_post_send,
 	.post_recv = hns_roce_v2_post_recv,
@@ -5200,6 +6360,12 @@
 	.poll_cq = hns_roce_v2_poll_cq,
 	.init_eq = hns_roce_v2_init_eq_table,
 	.cleanup_eq = hns_roce_v2_cleanup_eq_table,
+	.write_srqc = hns_roce_v2_write_srqc,
+	.modify_srq = hns_roce_v2_modify_srq,
+	.query_srq = hns_roce_v2_query_srq,
+	.post_srq_recv = hns_roce_v2_post_srq_recv,
+	.hns_roce_dev_ops = &hns_roce_v2_dev_ops,
+	.hns_roce_dev_srq_ops = &hns_roce_v2_dev_srq_ops,
 };
 
 static const struct pci_device_id hns_roce_hw_v2_pci_tbl[] = {
@@ -5217,16 +6383,11 @@
 static int hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev,
 				  struct hnae3_handle *handle)
 {
-	const struct pci_device_id *id;
+	struct hns_roce_v2_priv *priv = hr_dev->priv;
 	int i;
 
-	id = pci_match_id(hns_roce_hw_v2_pci_tbl, hr_dev->pci_dev);
-	if (!id) {
-		dev_err(hr_dev->dev, "device is not compatible!\n");
-		return -ENXIO;
-	}
-
 	hr_dev->hw = &hns_roce_hw_v2;
+	hr_dev->dfx = &hns_roce_dfx_hw_v2;
 	hr_dev->sdb_offset = ROCEE_DB_SQ_L_0_REG;
 	hr_dev->odb_offset = hr_dev->sdb_offset;
 
@@ -5247,15 +6408,18 @@
 	hr_dev->cmd_mod = 1;
 	hr_dev->loop_idc = 0;
 
+	hr_dev->reset_cnt = handle->ae_algo->ops->ae_dev_reset_cnt(handle);
+	priv->handle = handle;
+
 	return 0;
 }
 
-static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
+static int __hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
 {
 	struct hns_roce_dev *hr_dev;
 	int ret;
 
-	hr_dev = (struct hns_roce_dev *)ib_alloc_device(sizeof(*hr_dev));
+	hr_dev = ib_alloc_device(hns_roce_dev, ib_dev);
 	if (!hr_dev)
 		return -ENOMEM;
 
@@ -5267,7 +6431,6 @@
 
 	hr_dev->pci_dev = handle->pdev;
 	hr_dev->dev = &handle->pdev->dev;
-	handle->priv = hr_dev;
 
 	ret = hns_roce_hw_v2_get_cfg(hr_dev, handle);
 	if (ret) {
@@ -5281,6 +6444,8 @@
 		goto error_failed_get_cfg;
 	}
 
+	handle->priv = hr_dev;
+
 	return 0;
 
 error_failed_get_cfg:
@@ -5292,7 +6457,7 @@
 	return ret;
 }
 
-static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
+static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
 					   bool reset)
 {
 	struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv;
@@ -5300,24 +6465,85 @@
 	if (!hr_dev)
 		return;
 
+	handle->priv = NULL;
 	hns_roce_exit(hr_dev);
 	kfree(hr_dev->priv);
 	ib_dealloc_device(&hr_dev->ib_dev);
 }
 
-static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
+static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
 {
-	struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv;
-	struct ib_event event;
+	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+	const struct pci_device_id *id;
+	struct device *dev = &handle->pdev->dev;
+	int ret;
 
-	if (!hr_dev) {
-		dev_err(&handle->pdev->dev,
-			"Input parameter handle->priv is NULL!\n");
-		return -EINVAL;
+	handle->rinfo.instance_state = HNS_ROCE_STATE_INIT;
+
+	if (ops->ae_dev_resetting(handle) || ops->get_hw_reset_stat(handle)) {
+		handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
+		goto reset_chk_err;
 	}
 
-	hr_dev->active = false;
+	id = pci_match_id(hns_roce_hw_v2_pci_tbl, handle->pdev);
+	if (!id)
+		return 0;
+
+	ret = __hns_roce_hw_v2_init_instance(handle);
+	if (ret) {
+		handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
+		dev_err(dev, "RoCE instance init failed! ret = %d\n", ret);
+		if (ops->ae_dev_resetting(handle) ||
+		    ops->get_hw_reset_stat(handle))
+			goto reset_chk_err;
+		else
+			return ret;
+	}
+
+	handle->rinfo.instance_state = HNS_ROCE_STATE_INITED;
+
+
+	return 0;
+
+reset_chk_err:
+	dev_err(dev, "Device is busy in resetting state.\n"
+		     "please retry later.\n");
+
+	return -EBUSY;
+}
+
+static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
+					   bool reset)
+{
+	if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED)
+		return;
+
+	handle->rinfo.instance_state = HNS_ROCE_STATE_UNINIT;
+
+	__hns_roce_hw_v2_uninit_instance(handle, reset);
+
+	handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
+}
+static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
+{
+	struct hns_roce_dev *hr_dev;
+	struct ib_event event;
+
+	if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) {
+		set_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state);
+		return 0;
+	}
+
+	handle->rinfo.reset_state = HNS_ROCE_STATE_RST_DOWN;
+	clear_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state);
+
+	hr_dev = (struct hns_roce_dev *)handle->priv;
+	if (!hr_dev)
+		return 0;
+
 	hr_dev->is_reset = true;
+	hr_dev->active = false;
+	hr_dev->dis_db = true;
 
 	event.event = IB_EVENT_DEVICE_FATAL;
 	event.device = &hr_dev->ib_dev;
@@ -5329,17 +6555,29 @@
 
 static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle)
 {
+	struct device *dev = &handle->pdev->dev;
 	int ret;
 
-	ret = hns_roce_hw_v2_init_instance(handle);
+	if (test_and_clear_bit(HNS_ROCE_RST_DIRECT_RETURN,
+			       &handle->rinfo.state)) {
+		handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INITED;
+		return 0;
+	}
+
+	handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INIT;
+
+	dev_info(&handle->pdev->dev, "In reset process RoCE client reinit.\n");
+	ret = __hns_roce_hw_v2_init_instance(handle);
 	if (ret) {
 		/* when reset notify type is HNAE3_INIT_CLIENT In reset notify
 		 * callback function, RoCE Engine reinitialize. If RoCE reinit
 		 * failed, we should inform NIC driver.
 		 */
 		handle->priv = NULL;
-		dev_err(&handle->pdev->dev,
-			"In reset process RoCE reinit failed %d.\n", ret);
+		dev_err(dev, "In reset process RoCE reinit failed %d.\n", ret);
+	} else {
+		handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INITED;
+		dev_info(dev, "Reset done, RoCE client reinit finished.\n");
 	}
 
 	return ret;
@@ -5347,8 +6585,14 @@
 
 static int hns_roce_hw_v2_reset_notify_uninit(struct hnae3_handle *handle)
 {
-	msleep(100);
-	hns_roce_hw_v2_uninit_instance(handle, false);
+	if (test_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state))
+		return 0;
+
+	handle->rinfo.reset_state = HNS_ROCE_STATE_RST_UNINIT;
+	dev_info(&handle->pdev->dev, "In reset process RoCE client uninit.\n");
+	msleep(HNS_ROCE_V2_HW_RST_UNINT_DELAY);
+	__hns_roce_hw_v2_uninit_instance(handle, false);
+
 	return 0;
 }
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index 14aa308..43219d2 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -36,6 +36,7 @@
 #include <linux/bitops.h>
 
 #define HNS_ROCE_VF_QPC_BT_NUM			256
+#define HNS_ROCE_VF_SCCC_BT_NUM			64
 #define HNS_ROCE_VF_SRQC_BT_NUM			64
 #define HNS_ROCE_VF_CQC_BT_NUM			64
 #define HNS_ROCE_VF_MPT_BT_NUM			64
@@ -44,12 +45,21 @@
 #define HNS_ROCE_VF_SGID_NUM			32
 #define HNS_ROCE_VF_SL_NUM			8
 
-#define HNS_ROCE_V2_MAX_QP_NUM			0x2000
+#define HNS_ROCE_V2_MAX_QP_NUM			0x100000
+#define HNS_ROCE_V2_MAX_QPC_TIMER_NUM		0x200
 #define HNS_ROCE_V2_MAX_WQE_NUM			0x8000
-#define HNS_ROCE_V2_MAX_CQ_NUM			0x8000
-#define HNS_ROCE_V2_MAX_CQE_NUM			0x10000
+#define	HNS_ROCE_V2_MAX_SRQ			0x100000
+#define HNS_ROCE_V2_MAX_SRQ_WR			0x8000
+#define HNS_ROCE_V2_MAX_SRQ_SGE			0x100
+#define HNS_ROCE_V2_MAX_CQ_NUM			0x100000
+#define HNS_ROCE_V2_MAX_CQC_TIMER_NUM		0x100
+#define HNS_ROCE_V2_MAX_SRQ_NUM			0x100000
+#define HNS_ROCE_V2_MAX_CQE_NUM			0x400000
+#define HNS_ROCE_V2_MAX_SRQWQE_NUM		0x8000
 #define HNS_ROCE_V2_MAX_RQ_SGE_NUM		0x100
 #define HNS_ROCE_V2_MAX_SQ_SGE_NUM		0xff
+#define HNS_ROCE_V2_MAX_SRQ_SGE_NUM		0x100
+#define HNS_ROCE_V2_MAX_EXTEND_SGE_NUM		0x200000
 #define HNS_ROCE_V2_MAX_SQ_INLINE		0x20
 #define HNS_ROCE_V2_UAR_NUM			256
 #define HNS_ROCE_V2_PHY_UAR_NUM			1
@@ -57,9 +67,11 @@
 #define HNS_ROCE_V2_COMP_VEC_NUM		63
 #define HNS_ROCE_V2_AEQE_VEC_NUM		1
 #define HNS_ROCE_V2_ABNORMAL_VEC_NUM		1
-#define HNS_ROCE_V2_MAX_MTPT_NUM		0x8000
+#define HNS_ROCE_V2_MAX_MTPT_NUM		0x100000
 #define HNS_ROCE_V2_MAX_MTT_SEGS		0x1000000
 #define HNS_ROCE_V2_MAX_CQE_SEGS		0x1000000
+#define HNS_ROCE_V2_MAX_SRQWQE_SEGS		0x1000000
+#define HNS_ROCE_V2_MAX_IDX_SEGS		0x1000000
 #define HNS_ROCE_V2_MAX_PD_NUM			0x1000000
 #define HNS_ROCE_V2_MAX_QP_INIT_RDMA		128
 #define HNS_ROCE_V2_MAX_QP_DEST_RDMA		128
@@ -70,20 +82,33 @@
 #define HNS_ROCE_V2_IRRL_ENTRY_SZ		64
 #define HNS_ROCE_V2_TRRL_ENTRY_SZ		48
 #define HNS_ROCE_V2_CQC_ENTRY_SZ		64
+#define HNS_ROCE_V2_SRQC_ENTRY_SZ		64
 #define HNS_ROCE_V2_MTPT_ENTRY_SZ		64
 #define HNS_ROCE_V2_MTT_ENTRY_SZ		64
 #define HNS_ROCE_V2_CQE_ENTRY_SIZE		32
+#define HNS_ROCE_V2_SCCC_ENTRY_SZ		32
+#define HNS_ROCE_V2_QPC_TIMER_ENTRY_SZ		4096
+#define HNS_ROCE_V2_CQC_TIMER_ENTRY_SZ		4096
 #define HNS_ROCE_V2_PAGE_SIZE_SUPPORTED		0xFFFFF000
 #define HNS_ROCE_V2_MAX_INNER_MTPT_NUM		2
 #define HNS_ROCE_INVALID_LKEY			0x100
 #define HNS_ROCE_CMQ_TX_TIMEOUT			30000
 #define HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE	2
+#define HNS_ROCE_V2_RSV_QPS			8
+
+#define HNS_ROCE_V2_HW_RST_TIMEOUT		1000
+#define HNS_ROCE_V2_HW_RST_UNINT_DELAY		100
+
+#define HNS_ROCE_V2_HW_RST_COMPLETION_WAIT	20
 
 #define HNS_ROCE_CONTEXT_HOP_NUM		1
+#define HNS_ROCE_SCCC_HOP_NUM			1
 #define HNS_ROCE_MTT_HOP_NUM			1
 #define HNS_ROCE_CQE_HOP_NUM			1
+#define HNS_ROCE_SRQWQE_HOP_NUM			1
 #define HNS_ROCE_PBL_HOP_NUM			2
 #define HNS_ROCE_EQE_HOP_NUM			2
+#define HNS_ROCE_IDX_HOP_NUM			1
 
 #define HNS_ROCE_V2_GID_INDEX_NUM		256
 
@@ -104,13 +129,15 @@
 #define HNS_ROCE_CMD_FLAG_ERR_INTR	BIT(HNS_ROCE_CMD_FLAG_ERR_INTR_SHIFT)
 
 #define HNS_ROCE_CMQ_DESC_NUM_S		3
-#define HNS_ROCE_CMQ_EN_B		16
-#define HNS_ROCE_CMQ_ENABLE		BIT(HNS_ROCE_CMQ_EN_B)
+
+#define HNS_ROCE_CMQ_SCC_CLR_DONE_CNT		5
 
 #define check_whether_last_step(hop_num, step_idx) \
 	((step_idx == 0 && hop_num == HNS_ROCE_HOP_NUM_0) || \
 	(step_idx == 1 && hop_num == 1) || \
 	(step_idx == 2 && hop_num == 2))
+#define HNS_ICL_SWITCH_CMD_ROCEE_SEL_SHIFT	0
+#define HNS_ICL_SWITCH_CMD_ROCEE_SEL	BIT(HNS_ICL_SWITCH_CMD_ROCEE_SEL_SHIFT)
 
 #define CMD_CSQ_DESC_NUM		1024
 #define CMD_CRQ_DESC_NUM		1024
@@ -201,6 +228,7 @@
 
 /* CMQ command */
 enum hns_roce_opcode_type {
+	HNS_QUERY_FW_VER				= 0x0001,
 	HNS_ROCE_OPC_QUERY_HW_VER			= 0x8000,
 	HNS_ROCE_OPC_CFG_GLOBAL_PARAM			= 0x8001,
 	HNS_ROCE_OPC_ALLOC_PF_RES			= 0x8004,
@@ -208,9 +236,17 @@
 	HNS_ROCE_OPC_ALLOC_VF_RES			= 0x8401,
 	HNS_ROCE_OPC_CFG_EXT_LLM			= 0x8403,
 	HNS_ROCE_OPC_CFG_TMOUT_LLM			= 0x8404,
+	HNS_ROCE_OPC_QUERY_PF_TIMER_RES			= 0x8406,
 	HNS_ROCE_OPC_CFG_SGID_TB			= 0x8500,
 	HNS_ROCE_OPC_CFG_SMAC_TB			= 0x8501,
+	HNS_ROCE_OPC_POST_MB				= 0x8504,
+	HNS_ROCE_OPC_QUERY_MB_ST			= 0x8505,
 	HNS_ROCE_OPC_CFG_BT_ATTR			= 0x8506,
+	HNS_ROCE_OPC_FUNC_CLEAR				= 0x8508,
+	HNS_ROCE_OPC_CLR_SCCC				= 0x8509,
+	HNS_ROCE_OPC_QUERY_SCCC				= 0x850a,
+	HNS_ROCE_OPC_RESET_SCCC				= 0x850b,
+	HNS_SWITCH_PARAMETER_CFG			= 0x1033,
 };
 
 enum {
@@ -322,8 +358,93 @@
 #define	V2_CQC_BYTE_64_SE_CQE_IDX_S 0
 #define	V2_CQC_BYTE_64_SE_CQE_IDX_M GENMASK(23, 0)
 
+struct hns_roce_srq_context {
+	__le32	byte_4_srqn_srqst;
+	__le32	byte_8_limit_wl;
+	__le32	byte_12_xrcd;
+	__le32	byte_16_pi_ci;
+	__le32	wqe_bt_ba;
+	__le32	byte_24_wqe_bt_ba;
+	__le32	byte_28_rqws_pd;
+	__le32	idx_bt_ba;
+	__le32	rsv_idx_bt_ba;
+	__le32	idx_cur_blk_addr;
+	__le32	byte_44_idxbufpgsz_addr;
+	__le32	idx_nxt_blk_addr;
+	__le32	rsv_idxnxtblkaddr;
+	__le32	byte_56_xrc_cqn;
+	__le32	db_record_addr_record_en;
+	__le32	db_record_addr;
+};
+
+#define SRQC_BYTE_4_SRQ_ST_S 0
+#define SRQC_BYTE_4_SRQ_ST_M GENMASK(1, 0)
+
+#define SRQC_BYTE_4_SRQ_WQE_HOP_NUM_S 2
+#define SRQC_BYTE_4_SRQ_WQE_HOP_NUM_M GENMASK(3, 2)
+
+#define SRQC_BYTE_4_SRQ_SHIFT_S 4
+#define SRQC_BYTE_4_SRQ_SHIFT_M GENMASK(7, 4)
+
+#define SRQC_BYTE_4_SRQN_S 8
+#define SRQC_BYTE_4_SRQN_M GENMASK(31, 8)
+
+#define SRQC_BYTE_8_SRQ_LIMIT_WL_S 0
+#define SRQC_BYTE_8_SRQ_LIMIT_WL_M GENMASK(15, 0)
+
+#define SRQC_BYTE_12_SRQ_XRCD_S 0
+#define SRQC_BYTE_12_SRQ_XRCD_M GENMASK(23, 0)
+
+#define SRQC_BYTE_16_SRQ_PRODUCER_IDX_S 0
+#define SRQC_BYTE_16_SRQ_PRODUCER_IDX_M GENMASK(15, 0)
+
+#define SRQC_BYTE_16_SRQ_CONSUMER_IDX_S 0
+#define SRQC_BYTE_16_SRQ_CONSUMER_IDX_M GENMASK(31, 16)
+
+#define SRQC_BYTE_24_SRQ_WQE_BT_BA_S 0
+#define SRQC_BYTE_24_SRQ_WQE_BT_BA_M GENMASK(28, 0)
+
+#define SRQC_BYTE_28_PD_S 0
+#define SRQC_BYTE_28_PD_M GENMASK(23, 0)
+
+#define SRQC_BYTE_28_RQWS_S 24
+#define SRQC_BYTE_28_RQWS_M GENMASK(27, 24)
+
+#define SRQC_BYTE_36_SRQ_IDX_BT_BA_S 0
+#define SRQC_BYTE_36_SRQ_IDX_BT_BA_M GENMASK(28, 0)
+
+#define SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_S 0
+#define SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_M GENMASK(19, 0)
+
+#define SRQC_BYTE_44_SRQ_IDX_HOP_NUM_S 22
+#define SRQC_BYTE_44_SRQ_IDX_HOP_NUM_M GENMASK(23, 22)
+
+#define SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_S 24
+#define SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_M GENMASK(27, 24)
+
+#define SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_S 28
+#define SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_M GENMASK(31, 28)
+
+#define SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_S 0
+#define SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_M GENMASK(19, 0)
+
+#define SRQC_BYTE_56_SRQ_XRC_CQN_S 0
+#define SRQC_BYTE_56_SRQ_XRC_CQN_M GENMASK(23, 0)
+
+#define SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_S 24
+#define SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_M GENMASK(27, 24)
+
+#define SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_S 28
+#define SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_M GENMASK(31, 28)
+
+#define SRQC_BYTE_60_SRQ_RECORD_EN_S 0
+
+#define SRQC_BYTE_60_SRQ_DB_RECORD_ADDR_S 1
+#define SRQC_BYTE_60_SRQ_DB_RECORD_ADDR_M GENMASK(31, 1)
+
 enum{
 	V2_MPT_ST_VALID = 0x1,
+	V2_MPT_ST_FREE	= 0x2,
 };
 
 enum hns_roce_v2_qp_state {
@@ -350,7 +471,7 @@
 	__le32	dmac;
 	__le32	byte_52_udpspn_dmac;
 	__le32	byte_56_dqpn_err;
-	__le32	byte_60_qpst_mapid;
+	__le32	byte_60_qpst_tempid;
 	__le32	qkey_xrcd;
 	__le32	byte_68_rq_db;
 	__le32	rq_db_record_addr;
@@ -492,26 +613,15 @@
 #define	V2_QPC_BYTE_56_LP_PKTN_INI_S 28
 #define V2_QPC_BYTE_56_LP_PKTN_INI_M GENMASK(31, 28)
 
-#define	V2_QPC_BYTE_60_MAPID_S 0
-#define V2_QPC_BYTE_60_MAPID_M GENMASK(12, 0)
+#define	V2_QPC_BYTE_60_TEMPID_S 0
+#define V2_QPC_BYTE_60_TEMPID_M GENMASK(7, 0)
 
-#define	V2_QPC_BYTE_60_INNER_MAP_IND_S 13
+#define V2_QPC_BYTE_60_SCC_TOKEN_S 8
+#define V2_QPC_BYTE_60_SCC_TOKEN_M GENMASK(26, 8)
 
-#define	V2_QPC_BYTE_60_SQ_MAP_IND_S 14
+#define	V2_QPC_BYTE_60_SQ_DB_DOING_S 27
 
-#define	V2_QPC_BYTE_60_RQ_MAP_IND_S 15
-
-#define	V2_QPC_BYTE_60_TEMPID_S 16
-#define V2_QPC_BYTE_60_TEMPID_M  GENMASK(22, 16)
-
-#define	V2_QPC_BYTE_60_EXT_MAP_IND_S 23
-
-#define	V2_QPC_BYTE_60_RTY_NUM_INI_BAK_S 24
-#define V2_QPC_BYTE_60_RTY_NUM_INI_BAK_M GENMASK(26, 24)
-
-#define V2_QPC_BYTE_60_SQ_RLS_IND_S 27
-
-#define	V2_QPC_BYTE_60_SQ_EXT_IND_S 28
+#define	V2_QPC_BYTE_60_RQ_DB_DOING_S 28
 
 #define	V2_QPC_BYTE_60_QP_ST_S 29
 #define V2_QPC_BYTE_60_QP_ST_M GENMASK(31, 29)
@@ -534,6 +644,7 @@
 
 #define	V2_QPC_BYTE_76_RQIE_S 28
 
+#define	V2_QPC_BYTE_76_RQ_VLAN_EN_S 30
 #define	V2_QPC_BYTE_80_RX_CQN_S 0
 #define V2_QPC_BYTE_80_RX_CQN_M GENMASK(23, 0)
 
@@ -588,7 +699,7 @@
 #define	V2_QPC_BYTE_140_RR_MAX_S 12
 #define V2_QPC_BYTE_140_RR_MAX_M GENMASK(14, 12)
 
-#define	V2_QPC_BYTE_140_RSVD_RAQ_MAP_S 15
+#define	V2_QPC_BYTE_140_RQ_RTY_WAIT_DO_S 15
 
 #define	V2_QPC_BYTE_140_RAQ_TRRL_HEAD_S 16
 #define V2_QPC_BYTE_140_RAQ_TRRL_HEAD_M GENMASK(23, 16)
@@ -599,8 +710,6 @@
 #define	V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_S 0
 #define V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_M GENMASK(23, 0)
 
-#define V2_QPC_BYTE_144_RAQ_RTY_INI_IND_S 24
-
 #define V2_QPC_BYTE_144_RAQ_CREDIT_S 25
 #define V2_QPC_BYTE_144_RAQ_CREDIT_M GENMASK(29, 25)
 
@@ -612,8 +721,8 @@
 #define	V2_QPC_BYTE_148_RAQ_SYNDROME_S 24
 #define V2_QPC_BYTE_148_RAQ_SYNDROME_M GENMASK(31, 24)
 
-#define	V2_QPC_BYTE_152_RAQ_PSN_S 8
-#define V2_QPC_BYTE_152_RAQ_PSN_M GENMASK(31, 8)
+#define	V2_QPC_BYTE_152_RAQ_PSN_S 0
+#define V2_QPC_BYTE_152_RAQ_PSN_M GENMASK(23, 0)
 
 #define	V2_QPC_BYTE_152_RAQ_TRRL_RTY_HEAD_S 24
 #define V2_QPC_BYTE_152_RAQ_TRRL_RTY_HEAD_M GENMASK(31, 24)
@@ -637,9 +746,10 @@
 #define	V2_QPC_BYTE_168_LP_SGEN_INI_S 22
 #define V2_QPC_BYTE_168_LP_SGEN_INI_M GENMASK(23, 22)
 
-#define	V2_QPC_BYTE_168_SQ_SHIFT_BAK_S 24
-#define V2_QPC_BYTE_168_SQ_SHIFT_BAK_M GENMASK(27, 24)
-
+#define V2_QPC_BYTE_168_SQ_VLAN_EN_S 24
+#define V2_QPC_BYTE_168_POLL_DB_WAIT_DO_S 25
+#define V2_QPC_BYTE_168_SCC_TOKEN_FORBID_SQ_DEQ_S 26
+#define V2_QPC_BYTE_168_WAIT_ACK_TIMEOUT_S 27
 #define	V2_QPC_BYTE_168_IRRL_IDX_LSB_S 28
 #define V2_QPC_BYTE_168_IRRL_IDX_LSB_M GENMASK(31, 28)
 
@@ -725,6 +835,10 @@
 #define	V2_QPC_BYTE_232_IRRL_SGE_IDX_S 20
 #define V2_QPC_BYTE_232_IRRL_SGE_IDX_M GENMASK(28, 20)
 
+#define V2_QPC_BYTE_232_SO_LP_VLD_S 29
+#define V2_QPC_BYTE_232_FENCE_LP_VLD_S 30
+#define V2_QPC_BYTE_232_IRRL_LP_VLD_S 31
+
 #define	V2_QPC_BYTE_240_IRRL_TAIL_REAL_S 0
 #define V2_QPC_BYTE_240_IRRL_TAIL_REAL_M GENMASK(7, 0)
 
@@ -743,6 +857,9 @@
 #define	V2_QPC_BYTE_244_RNR_CNT_S 27
 #define V2_QPC_BYTE_244_RNR_CNT_M GENMASK(29, 27)
 
+#define V2_QPC_BYTE_244_LCL_OP_FLG_S 30
+#define V2_QPC_BYTE_244_IRRL_RD_FLG_S 31
+
 #define	V2_QPC_BYTE_248_IRRL_PSN_S 0
 #define V2_QPC_BYTE_248_IRRL_PSN_M GENMASK(23, 0)
 
@@ -771,6 +888,10 @@
 #define	V2_QPC_BYTE_256_SQ_FLUSH_IDX_S 16
 #define V2_QPC_BYTE_256_SQ_FLUSH_IDX_M GENMASK(31, 16)
 
+#define	V2_QP_RWE_S 1 /* rdma write enable */
+#define	V2_QP_RRE_S 2 /* rdma read enable */
+#define	V2_QP_ATE_S 3 /* rdma atomic enable */
+
 struct hns_roce_v2_cqe {
 	__le32	byte_4;
 	union {
@@ -818,6 +939,11 @@
 #define	V2_CQE_BYTE_28_PORT_TYPE_S 16
 #define V2_CQE_BYTE_28_PORT_TYPE_M GENMASK(17, 16)
 
+#define V2_CQE_BYTE_28_VID_S 18
+#define V2_CQE_BYTE_28_VID_M GENMASK(29, 18)
+
+#define V2_CQE_BYTE_28_VID_VLD_S 30
+
 #define	V2_CQE_BYTE_32_RMT_QPN_S 0
 #define V2_CQE_BYTE_32_RMT_QPN_M GENMASK(23, 0)
 
@@ -878,8 +1004,19 @@
 
 #define V2_MPT_BYTE_8_LW_EN_S 7
 
+#define V2_MPT_BYTE_8_MW_CNT_S 8
+#define V2_MPT_BYTE_8_MW_CNT_M GENMASK(31, 8)
+
+#define V2_MPT_BYTE_12_FRE_S 0
+
 #define V2_MPT_BYTE_12_PA_S 1
 
+#define V2_MPT_BYTE_12_MR_MW_S 4
+
+#define V2_MPT_BYTE_12_BPD_S 5
+
+#define V2_MPT_BYTE_12_BQP_S 6
+
 #define V2_MPT_BYTE_12_INNER_PA_VLD_S 7
 
 #define V2_MPT_BYTE_12_MW_BIND_QPN_S 8
@@ -988,6 +1125,8 @@
 #define	V2_UD_SEND_WQE_BYTE_40_PORTN_S 24
 #define V2_UD_SEND_WQE_BYTE_40_PORTN_M GENMASK(26, 24)
 
+#define V2_UD_SEND_WQE_BYTE_40_UD_VLAN_EN_S 30
+
 #define	V2_UD_SEND_WQE_BYTE_40_LBI_S 31
 
 #define	V2_UD_SEND_WQE_DMAC_0_S 0
@@ -1042,6 +1181,16 @@
 
 #define V2_RC_SEND_WQE_BYTE_4_INLINE_S 12
 
+#define V2_RC_FRMR_WQE_BYTE_4_BIND_EN_S 19
+
+#define V2_RC_FRMR_WQE_BYTE_4_ATOMIC_S 20
+
+#define V2_RC_FRMR_WQE_BYTE_4_RR_S 21
+
+#define V2_RC_FRMR_WQE_BYTE_4_RW_S 22
+
+#define V2_RC_FRMR_WQE_BYTE_4_LW_S 23
+
 #define	V2_RC_SEND_WQE_BYTE_16_XRC_SRQN_S 0
 #define V2_RC_SEND_WQE_BYTE_16_XRC_SRQN_M GENMASK(23, 0)
 
@@ -1051,6 +1200,16 @@
 #define V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S 0
 #define V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M GENMASK(23, 0)
 
+struct hns_roce_wqe_frmr_seg {
+	__le32	pbl_size;
+	__le32	mode_buf_pg_sz;
+};
+
+#define V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_S	4
+#define V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_M	GENMASK(7, 4)
+
+#define V2_RC_FRMR_WQE_BYTE_40_BLK_MODE_S 8
+
 struct hns_roce_v2_wqe_data_seg {
 	__le32    len;
 	__le32    lkey;
@@ -1068,6 +1227,27 @@
 	__le32 rsv[5];
 };
 
+struct hns_roce_query_fw_info {
+	__le32 fw_ver;
+	__le32 rsv[5];
+};
+
+struct hns_roce_func_clear {
+	__le32 rst_funcid_en;
+	__le32 func_done;
+	__le32 rsv[4];
+};
+
+#define FUNC_CLEAR_RST_FUN_DONE_S 0
+/* Each physical function manages up to 248 virtual functions;
+ * it takes up to 100ms for each function to execute clear;
+ * if an abnormal reset occurs, it is executed twice at most;
+ * so it takes up to 249 * 2 * 100ms.
+ */
+#define HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS	(249 * 2 * 100)
+#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL	40
+#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT	20
+
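Editor's note: working the comment's arithmetic through, the worst case is (248 VFs + 1 PF) * 2 passes * 100 ms = 49800 ms, i.e. just under 50 seconds, which is what HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS evaluates to. A trivial compile-time check of that bound (standalone C11, define mirrored locally for the sketch):

#include <assert.h>

#define HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS	(249 * 2 * 100)

static_assert(HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS == 49800,
	      "func-clear timeout is just under 50 seconds");

int main(void) { return 0; }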
 struct hns_roce_cfg_llm_a {
 	__le32 base_addr_l;
 	__le32 base_addr_h;
@@ -1157,7 +1337,8 @@
 	__le32	smac_idx_num;
 	__le32	sgid_idx_num;
 	__le32	qid_idx_sl_num;
-	__le32	rsv[2];
+	__le32	sccc_bt_idx_num;
+	__le32	rsv;
 };
 
 #define PF_RES_DATA_1_PF_SMAC_IDX_S 0
@@ -1178,6 +1359,31 @@
 #define PF_RES_DATA_3_PF_SL_NUM_S 16
 #define PF_RES_DATA_3_PF_SL_NUM_M GENMASK(26, 16)
 
+#define PF_RES_DATA_4_PF_SCCC_BT_IDX_S 0
+#define PF_RES_DATA_4_PF_SCCC_BT_IDX_M GENMASK(8, 0)
+
+#define PF_RES_DATA_4_PF_SCCC_BT_NUM_S 9
+#define PF_RES_DATA_4_PF_SCCC_BT_NUM_M GENMASK(17, 9)
+
+struct hns_roce_pf_timer_res_a {
+	__le32	rsv0;
+	__le32	qpc_timer_bt_idx_num;
+	__le32	cqc_timer_bt_idx_num;
+	__le32	rsv[3];
+};
+
+#define PF_RES_DATA_1_PF_QPC_TIMER_BT_IDX_S 0
+#define PF_RES_DATA_1_PF_QPC_TIMER_BT_IDX_M GENMASK(11, 0)
+
+#define PF_RES_DATA_1_PF_QPC_TIMER_BT_NUM_S 16
+#define PF_RES_DATA_1_PF_QPC_TIMER_BT_NUM_M GENMASK(28, 16)
+
+#define PF_RES_DATA_2_PF_CQC_TIMER_BT_IDX_S 0
+#define PF_RES_DATA_2_PF_CQC_TIMER_BT_IDX_M GENMASK(10, 0)
+
+#define PF_RES_DATA_2_PF_CQC_TIMER_BT_NUM_S 16
+#define PF_RES_DATA_2_PF_CQC_TIMER_BT_NUM_M GENMASK(27, 16)
+
 struct hns_roce_vf_res_a {
 	__le32 vf_id;
 	__le32 vf_qpc_bt_idx_num;
@@ -1222,7 +1428,8 @@
 	__le32 vf_smac_idx_num;
 	__le32 vf_sgid_idx_num;
 	__le32 vf_qid_idx_sl_num;
-	__le32 rsv[2];
+	__le32 vf_sccc_idx_num;
+	__le32 rsv1;
 };
 
 #define VF_RES_B_DATA_0_VF_ID_S 0
@@ -1246,12 +1453,49 @@
 #define VF_RES_B_DATA_3_VF_SL_NUM_S 16
 #define VF_RES_B_DATA_3_VF_SL_NUM_M GENMASK(19, 16)
 
+#define VF_RES_B_DATA_4_VF_SCCC_BT_IDX_S 0
+#define VF_RES_B_DATA_4_VF_SCCC_BT_IDX_M GENMASK(8, 0)
+
+#define VF_RES_B_DATA_4_VF_SCCC_BT_NUM_S 9
+#define VF_RES_B_DATA_4_VF_SCCC_BT_NUM_M GENMASK(17, 9)
+
+struct hns_roce_vf_switch {
+	__le32 rocee_sel;
+	__le32 fun_id;
+	__le32 cfg;
+	__le32 resv1;
+	__le32 resv2;
+	__le32 resv3;
+};
+
+#define VF_SWITCH_DATA_FUN_ID_VF_ID_S 3
+#define VF_SWITCH_DATA_FUN_ID_VF_ID_M GENMASK(10, 3)
+
+#define VF_SWITCH_DATA_CFG_ALW_LPBK_S 1
+#define VF_SWITCH_DATA_CFG_ALW_LCL_LPBK_S 2
+#define VF_SWITCH_DATA_CFG_ALW_DST_OVRD_S 3
+
+struct hns_roce_post_mbox {
+	__le32	in_param_l;
+	__le32	in_param_h;
+	__le32	out_param_l;
+	__le32	out_param_h;
+	__le32	cmd_tag;
+	__le32	token_event_en;
+};
+
+struct hns_roce_mbox_status {
+	__le32	mb_status_hw_run;
+	__le32	rsv[5];
+};
+
 struct hns_roce_cfg_bt_attr {
 	__le32 vf_qpc_cfg;
 	__le32 vf_srqc_cfg;
 	__le32 vf_cqc_cfg;
 	__le32 vf_mpt_cfg;
-	__le32 rsv[2];
+	__le32 vf_sccc_cfg;
+	__le32 rsv;
 };
 
 #define CFG_BT_ATTR_DATA_0_VF_QPC_BA_PGSZ_S 0
@@ -1290,6 +1534,15 @@
 #define CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_S 8
 #define CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_M GENMASK(9, 8)
 
+#define CFG_BT_ATTR_DATA_4_VF_SCCC_BA_PGSZ_S 0
+#define CFG_BT_ATTR_DATA_4_VF_SCCC_BA_PGSZ_M GENMASK(3, 0)
+
+#define CFG_BT_ATTR_DATA_4_VF_SCCC_BUF_PGSZ_S 4
+#define CFG_BT_ATTR_DATA_4_VF_SCCC_BUF_PGSZ_M GENMASK(7, 4)
+
+#define CFG_BT_ATTR_DATA_4_VF_SCCC_HOPNUM_S 8
+#define CFG_BT_ATTR_DATA_4_VF_SCCC_HOPNUM_M GENMASK(9, 8)
+
 struct hns_roce_cfg_sgid_tb {
 	__le32	table_idx_rsv;
 	__le32	vf_sgid_l;
@@ -1329,18 +1582,6 @@
 #define HNS_ROCE_HW_RUN_BIT_SHIFT	31
 #define HNS_ROCE_HW_MB_STATUS_MASK	0xFF
 
-#define HNS_ROCE_VF_MB4_TAG_MASK	0xFFFFFF00
-#define HNS_ROCE_VF_MB4_TAG_SHIFT	8
-
-#define HNS_ROCE_VF_MB4_CMD_MASK	0xFF
-#define HNS_ROCE_VF_MB4_CMD_SHIFT	0
-
-#define HNS_ROCE_VF_MB5_EVENT_MASK	0x10000
-#define HNS_ROCE_VF_MB5_EVENT_SHIFT	16
-
-#define HNS_ROCE_VF_MB5_TOKEN_MASK	0xFFFF
-#define HNS_ROCE_VF_MB5_TOKEN_SHIFT	0
-
 struct hns_roce_v2_cmq_ring {
 	dma_addr_t desc_dma_addr;
 	struct hns_roce_cmq_desc *desc;
@@ -1385,6 +1626,7 @@
 #define HNS_ROCE_LINK_TABLE_NXT_PTR_M GENMASK(31, 20)
 
 struct hns_roce_v2_priv {
+	struct hnae3_handle *handle;
 	struct hns_roce_v2_cmq cmq;
 	struct hns_roce_link_table tsq;
 	struct hns_roce_link_table tpq;
@@ -1564,4 +1806,33 @@
 #define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S 0
 #define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M GENMASK(23, 0)
 
+struct hns_roce_wqe_atomic_seg {
+	__le64          fetchadd_swap_data;
+	__le64          cmp_data;
+};
+
+struct hns_roce_sccc_clr {
+	__le32 qpn;
+	__le32 rsv[5];
+};
+
+struct hns_roce_sccc_clr_done {
+	__le32 clr_done;
+	__le32 rsv[5];
+};
+
+int hns_roce_v2_query_cqc_info(struct hns_roce_dev *hr_dev, u32 cqn,
+			       int *buffer);
+
+static inline void hns_roce_write64(struct hns_roce_dev *hr_dev, __le32 val[2],
+				    void __iomem *dest)
+{
+	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hnae3_handle *handle = priv->handle;
+	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+
+	if (!hr_dev->dis_db && !ops->get_hw_reset_stat(handle))
+		hns_roce_write64_k(val, dest);
+}
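Editor's note (illustrative sketch): hns_roce_write64() above only forwards the 64-bit doorbell when doorbells are enabled (dis_db clear) and the associated AE handle does not report a hardware reset, silently dropping the write otherwise; callers such as hns_roce_v2_post_srq_recv() rely on this so a post issued during reset cannot touch MMIO space that may be going away. A userspace model of that guard:

#include <stdint.h>
#include <stdio.h>

struct toy_dev {
	int dis_db;			/* doorbells disabled by the driver  */
	int hw_resetting;		/* hardware reset reported by the AE */
	volatile uint64_t db_reg;	/* stands in for the MMIO doorbell   */
};

/* Drop the doorbell write whenever the device cannot accept it. */
static void toy_write64(struct toy_dev *dev, uint32_t lo, uint32_t hi)
{
	if (dev->dis_db || dev->hw_resetting)
		return;
	dev->db_reg = ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	struct toy_dev dev = { .dis_db = 0, .hw_resetting = 0 };

	toy_write64(&dev, 0x1234, 0x1);
	printf("db=0x%llx\n", (unsigned long long)dev.db_reg);

	dev.dis_db = 1;
	toy_write64(&dev, 0xdead, 0x2);		/* ignored during reset */
	printf("db=0x%llx\n", (unsigned long long)dev.db_reg);
	return 0;
}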
+
 #endif
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2_dfx.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2_dfx.c
new file mode 100644
index 0000000..5a97b5a
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2_dfx.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+// Copyright (c) 2019 Hisilicon Limited.
+
+#include "hnae3.h"
+#include "hns_roce_device.h"
+#include "hns_roce_cmd.h"
+#include "hns_roce_hw_v2.h"
+
+int hns_roce_v2_query_cqc_info(struct hns_roce_dev *hr_dev, u32 cqn,
+			       int *buffer)
+{
+	struct hns_roce_v2_cq_context *cq_context;
+	struct hns_roce_cmd_mailbox *mailbox;
+	int ret;
+
+	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	cq_context = mailbox->buf;
+	ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, cqn, 0,
+				HNS_ROCE_CMD_QUERY_CQC,
+				HNS_ROCE_CMD_TIMEOUT_MSECS);
+	if (ret) {
+		dev_err(hr_dev->dev, "QUERY cqc cmd process error\n");
+		goto err_mailbox;
+	}
+
+	memcpy(buffer, cq_context, sizeof(*cq_context));
+
+err_mailbox:
+	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+
+	return ret;
+}
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index c5cae9a..b5d196c 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -57,17 +57,16 @@
 {
 	return gid_index * hr_dev->caps.num_ports + port;
 }
-EXPORT_SYMBOL_GPL(hns_get_gid_index);
 
 static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u8 port, u8 *addr)
 {
 	u8 phy_port;
 	u32 i = 0;
 
-	if (!memcmp(hr_dev->dev_addr[port], addr, MAC_ADDR_OCTET_NUM))
+	if (!memcmp(hr_dev->dev_addr[port], addr, ETH_ALEN))
 		return 0;
 
-	for (i = 0; i < MAC_ADDR_OCTET_NUM; i++)
+	for (i = 0; i < ETH_ALEN; i++)
 		hr_dev->dev_addr[port][i] = addr[i];
 
 	phy_port = hr_dev->iboe.phy_port[port];
@@ -78,18 +77,13 @@
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(attr->device);
 	u8 port = attr->port_num - 1;
-	unsigned long flags;
 	int ret;
 
 	if (port >= hr_dev->caps.num_ports)
 		return -EINVAL;
 
-	spin_lock_irqsave(&hr_dev->iboe.lock, flags);
-
 	ret = hr_dev->hw->set_gid(hr_dev, port, attr->index, &attr->gid, attr);
 
-	spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
-
 	return ret;
 }
 
@@ -98,18 +92,13 @@
 	struct hns_roce_dev *hr_dev = to_hr_dev(attr->device);
 	struct ib_gid_attr zattr = { };
 	u8 port = attr->port_num - 1;
-	unsigned long flags;
 	int ret;
 
 	if (port >= hr_dev->caps.num_ports)
 		return -EINVAL;
 
-	spin_lock_irqsave(&hr_dev->iboe.lock, flags);
-
 	ret = hr_dev->hw->set_gid(hr_dev, port, attr->index, &zgid, &zattr);
 
-	spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
-
 	return ret;
 }
 
@@ -196,6 +185,7 @@
 
 	memset(props, 0, sizeof(*props));
 
+	props->fw_ver = hr_dev->caps.fw_ver;
 	props->sys_image_guid = cpu_to_be64(hr_dev->sys_image_guid);
 	props->max_mr_size = (u64)(~(0ULL));
 	props->page_size_cap = hr_dev->caps.page_size_cap;
@@ -215,32 +205,24 @@
 	props->max_pd = hr_dev->caps.num_pds;
 	props->max_qp_rd_atom = hr_dev->caps.max_qp_dest_rdma;
 	props->max_qp_init_rd_atom = hr_dev->caps.max_qp_init_rdma;
-	props->atomic_cap = IB_ATOMIC_NONE;
+	props->atomic_cap = hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_ATOMIC ?
+			    IB_ATOMIC_HCA : IB_ATOMIC_NONE;
 	props->max_pkeys = 1;
 	props->local_ca_ack_delay = hr_dev->caps.local_ca_ack_delay;
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) {
+		props->max_srq = hr_dev->caps.max_srqs;
+		props->max_srq_wr = hr_dev->caps.max_srq_wrs;
+		props->max_srq_sge = hr_dev->caps.max_srq_sges;
+	}
+
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_FRMR) {
+		props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
+		props->max_fast_reg_page_list_len = HNS_ROCE_FRMR_MAX_PA;
+	}
 
 	return 0;
 }
 
-static struct net_device *hns_roce_get_netdev(struct ib_device *ib_dev,
-					      u8 port_num)
-{
-	struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
-	struct net_device *ndev;
-
-	if (port_num < 1 || port_num > hr_dev->caps.num_ports)
-		return NULL;
-
-	rcu_read_lock();
-
-	ndev = hr_dev->iboe.netdevs[port_num - 1];
-	if (ndev)
-		dev_hold(ndev);
-
-	rcu_read_unlock();
-	return ndev;
-}
-
 static int hns_roce_query_port(struct ib_device *ib_dev, u8 port_num,
 			       struct ib_port_attr *props)
 {
@@ -279,7 +261,9 @@
 	props->active_mtu = mtu ? min(props->max_mtu, mtu) : IB_MTU_256;
 	props->state = (netif_running(net_dev) && netif_carrier_ok(net_dev)) ?
 			IB_PORT_ACTIVE : IB_PORT_DOWN;
-	props->phys_state = (props->state == IB_PORT_ACTIVE) ? 5 : 3;
+	props->phys_state = (props->state == IB_PORT_ACTIVE) ?
+			     IB_PORT_PHYS_STATE_LINK_UP :
+			     IB_PORT_PHYS_STATE_DISABLED;
 
 	spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
 
@@ -323,29 +307,23 @@
 	return 0;
 }
 
-static struct ib_ucontext *hns_roce_alloc_ucontext(struct ib_device *ib_dev,
-						   struct ib_udata *udata)
+static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
+				   struct ib_udata *udata)
 {
-	int ret = 0;
-	struct hns_roce_ucontext *context;
+	int ret;
+	struct hns_roce_ucontext *context = to_hr_ucontext(uctx);
 	struct hns_roce_ib_alloc_ucontext_resp resp = {};
-	struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
+	struct hns_roce_dev *hr_dev = to_hr_dev(uctx->device);
 
 	if (!hr_dev->active)
-		return ERR_PTR(-EAGAIN);
+		return -EAGAIN;
 
 	resp.qp_tab_size = hr_dev->caps.num_qps;
 
-	context = kmalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return ERR_PTR(-ENOMEM);
-
 	ret = hns_roce_uar_alloc(hr_dev, &context->uar);
 	if (ret)
 		goto error_fail_uar_alloc;
 
-	INIT_LIST_HEAD(&context->vma_list);
-	mutex_init(&context->vma_list_mutex);
 	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) {
 		INIT_LIST_HEAD(&context->page_list);
 		mutex_init(&context->page_mutex);
@@ -355,69 +333,20 @@
 	if (ret)
 		goto error_fail_copy_to_udata;
 
-	return &context->ibucontext;
+	return 0;
 
 error_fail_copy_to_udata:
 	hns_roce_uar_free(hr_dev, &context->uar);
 
 error_fail_uar_alloc:
-	kfree(context);
-
-	return ERR_PTR(ret);
+	return ret;
 }
 
-static int hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext)
+static void hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext)
 {
 	struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext);
 
 	hns_roce_uar_free(to_hr_dev(ibcontext->device), &context->uar);
-	kfree(context);
-
-	return 0;
-}
-
-static void hns_roce_vma_open(struct vm_area_struct *vma)
-{
-	vma->vm_ops = NULL;
-}
-
-static void hns_roce_vma_close(struct vm_area_struct *vma)
-{
-	struct hns_roce_vma_data *vma_data;
-
-	vma_data = (struct hns_roce_vma_data *)vma->vm_private_data;
-	vma_data->vma = NULL;
-	mutex_lock(vma_data->vma_list_mutex);
-	list_del(&vma_data->list);
-	mutex_unlock(vma_data->vma_list_mutex);
-	kfree(vma_data);
-}
-
-static const struct vm_operations_struct hns_roce_vm_ops = {
-	.open = hns_roce_vma_open,
-	.close = hns_roce_vma_close,
-};
-
-static int hns_roce_set_vma_data(struct vm_area_struct *vma,
-				 struct hns_roce_ucontext *context)
-{
-	struct list_head *vma_head = &context->vma_list;
-	struct hns_roce_vma_data *vma_data;
-
-	vma_data = kzalloc(sizeof(*vma_data), GFP_KERNEL);
-	if (!vma_data)
-		return -ENOMEM;
-
-	vma_data->vma = vma;
-	vma_data->vma_list_mutex = &context->vma_list_mutex;
-	vma->vm_private_data = vma_data;
-	vma->vm_ops = &hns_roce_vm_ops;
-
-	mutex_lock(&context->vma_list_mutex);
-	list_add(&vma_data->list, vma_head);
-	mutex_unlock(&context->vma_list_mutex);
-
-	return 0;
 }
 
 static int hns_roce_mmap(struct ib_ucontext *context,
@@ -425,27 +354,29 @@
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(context->device);
 
-	if (((vma->vm_end - vma->vm_start) % PAGE_SIZE) != 0)
-		return -EINVAL;
+	switch (vma->vm_pgoff) {
+	case 0:
+		return rdma_user_mmap_io(context, vma,
+					 to_hr_ucontext(context)->uar.pfn,
+					 PAGE_SIZE,
+					 pgprot_noncached(vma->vm_page_prot));
 
-	if (vma->vm_pgoff == 0) {
-		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-		if (io_remap_pfn_range(vma, vma->vm_start,
-				       to_hr_ucontext(context)->uar.pfn,
-				       PAGE_SIZE, vma->vm_page_prot))
-			return -EAGAIN;
-	} else if (vma->vm_pgoff == 1 && hr_dev->tptr_dma_addr &&
-		   hr_dev->tptr_size) {
-		/* vm_pgoff: 1 -- TPTR */
-		if (io_remap_pfn_range(vma, vma->vm_start,
-				       hr_dev->tptr_dma_addr >> PAGE_SHIFT,
-				       hr_dev->tptr_size,
-				       vma->vm_page_prot))
-			return -EAGAIN;
-	} else
-		return -EINVAL;
+	/* vm_pgoff: 1 -- TPTR */
+	case 1:
+		if (!hr_dev->tptr_dma_addr || !hr_dev->tptr_size)
+			return -EINVAL;
+		/*
+		 * FIXME: using io_remap_pfn_range on the dma address returned
+		 * by dma_alloc_coherent is totally wrong.
+		 */
+		return rdma_user_mmap_io(context, vma,
+					 hr_dev->tptr_dma_addr >> PAGE_SHIFT,
+					 hr_dev->tptr_size,
+					 vma->vm_page_prot);
 
-	return hns_roce_set_vma_data(vma, to_hr_ucontext(context));
+	default:
+		return -EINVAL;
+	}
 }
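Editor's note (sketch with hypothetical names): switching to rdma_user_mmap_io lets the RDMA core remember each mapping and zap it on disassociate, which is presumably why the hand-rolled vma_list bookkeeping (hns_roce_set_vma_data, hns_roce_vm_ops, and the zap loop in disassociate_ucontext) could be deleted below. The dispatch itself is just "vm_pgoff selects which device page the process gets"; a toy model of that selection:

#include <stdio.h>

/* Toy model of the mmap offset dispatch: page offset 0 maps the per-context
 * UAR doorbell page, offset 1 the TPTR page when the device exposes one.
 * Names and values are illustrative only.
 */
enum toy_mmap_target { MAP_UAR, MAP_TPTR, MAP_INVALID };

static enum toy_mmap_target classify(unsigned long pgoff, int has_tptr)
{
	switch (pgoff) {
	case 0:
		return MAP_UAR;
	case 1:
		return has_tptr ? MAP_TPTR : MAP_INVALID;
	default:
		return MAP_INVALID;
	}
}

int main(void)
{
	printf("%d %d %d\n", classify(0, 1), classify(1, 0), classify(7, 1));
	return 0;
}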
 
 static int hns_roce_port_immutable(struct ib_device *ib_dev, u8 port_num,
@@ -471,21 +402,6 @@
 
 static void hns_roce_disassociate_ucontext(struct ib_ucontext *ibcontext)
 {
-	struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext);
-	struct hns_roce_vma_data *vma_data, *n;
-	struct vm_area_struct *vma;
-
-	mutex_lock(&context->vma_list_mutex);
-	list_for_each_entry_safe(vma_data, n, &context->vma_list, list) {
-		vma = vma_data->vma;
-		zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE);
-
-		vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
-		vma->vm_ops = NULL;
-		list_del(&vma_data->list);
-		kfree(vma_data);
-	}
-	mutex_unlock(&context->vma_list_mutex);
 }
 
 static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev)
@@ -497,27 +413,85 @@
 	ib_unregister_device(&hr_dev->ib_dev);
 }
 
+static const struct ib_device_ops hns_roce_dev_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_HNS,
+	.uverbs_abi_ver = 1,
+	.uverbs_no_driver_id_binding = 1,
+
+	.add_gid = hns_roce_add_gid,
+	.alloc_pd = hns_roce_alloc_pd,
+	.alloc_ucontext = hns_roce_alloc_ucontext,
+	.create_ah = hns_roce_create_ah,
+	.create_cq = hns_roce_ib_create_cq,
+	.create_qp = hns_roce_create_qp,
+	.dealloc_pd = hns_roce_dealloc_pd,
+	.dealloc_ucontext = hns_roce_dealloc_ucontext,
+	.del_gid = hns_roce_del_gid,
+	.dereg_mr = hns_roce_dereg_mr,
+	.destroy_ah = hns_roce_destroy_ah,
+	.destroy_cq = hns_roce_ib_destroy_cq,
+	.disassociate_ucontext = hns_roce_disassociate_ucontext,
+	.fill_res_entry = hns_roce_fill_res_entry,
+	.get_dma_mr = hns_roce_get_dma_mr,
+	.get_link_layer = hns_roce_get_link_layer,
+	.get_port_immutable = hns_roce_port_immutable,
+	.mmap = hns_roce_mmap,
+	.modify_device = hns_roce_modify_device,
+	.modify_port = hns_roce_modify_port,
+	.modify_qp = hns_roce_modify_qp,
+	.query_ah = hns_roce_query_ah,
+	.query_device = hns_roce_query_device,
+	.query_pkey = hns_roce_query_pkey,
+	.query_port = hns_roce_query_port,
+	.reg_user_mr = hns_roce_reg_user_mr,
+
+	INIT_RDMA_OBJ_SIZE(ib_ah, hns_roce_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, hns_roce_cq, ib_cq),
+	INIT_RDMA_OBJ_SIZE(ib_pd, hns_roce_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, hns_roce_ucontext, ibucontext),
+};
+
+static const struct ib_device_ops hns_roce_dev_mr_ops = {
+	.rereg_user_mr = hns_roce_rereg_user_mr,
+};
+
+static const struct ib_device_ops hns_roce_dev_mw_ops = {
+	.alloc_mw = hns_roce_alloc_mw,
+	.dealloc_mw = hns_roce_dealloc_mw,
+};
+
+static const struct ib_device_ops hns_roce_dev_frmr_ops = {
+	.alloc_mr = hns_roce_alloc_mr,
+	.map_mr_sg = hns_roce_map_mr_sg,
+};
+
+static const struct ib_device_ops hns_roce_dev_srq_ops = {
+	.create_srq = hns_roce_create_srq,
+	.destroy_srq = hns_roce_destroy_srq,
+
+	INIT_RDMA_OBJ_SIZE(ib_srq, hns_roce_srq, ibsrq),
+};
+
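Editor's note (sketch, assuming ib_set_device_ops fills only callbacks that are still unset, as the core's SET_DEVICE_OP macro reads): the refactor below replaces per-field assignments on ib_dev with const ib_device_ops tables that are merged one after another, so optional features (MR rereg, MW, FRMR, SRQ) each contribute a small table and the hw-specific tables applied first would keep precedence. A stripped-down userspace model of that merge rule, with invented callback names:

#include <stdio.h>
#include <stddef.h>

/* Toy ops table: two optional callbacks. */
struct toy_ops {
	void (*post_send)(void);
	void (*create_srq)(void);
};

static void v2_post_send(void)   { puts("hw v2 post_send"); }
static void gen_post_send(void)  { puts("generic post_send"); }
static void gen_create_srq(void) { puts("generic create_srq"); }

/* Merge rule modelled on ib_set_device_ops(): copy a callback only when the
 * destination slot is still empty, so tables applied earlier win.
 */
static void set_ops(struct toy_ops *dst, const struct toy_ops *src)
{
	if (src->post_send && !dst->post_send)
		dst->post_send = src->post_send;
	if (src->create_srq && !dst->create_srq)
		dst->create_srq = src->create_srq;
}

int main(void)
{
	struct toy_ops dev = { 0 };
	const struct toy_ops hw_ops  = { .post_send = v2_post_send };
	const struct toy_ops gen_ops = { .post_send = gen_post_send,
					 .create_srq = gen_create_srq };

	set_ops(&dev, &hw_ops);		/* hw-specific table first       */
	set_ops(&dev, &gen_ops);	/* generic table fills the gaps  */

	dev.post_send();		/* hw v2 post_send               */
	dev.create_srq();		/* generic create_srq            */
	return 0;
}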
 static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
 {
 	int ret;
 	struct hns_roce_ib_iboe *iboe = NULL;
 	struct ib_device *ib_dev = NULL;
 	struct device *dev = hr_dev->dev;
+	unsigned int i;
 
 	iboe = &hr_dev->iboe;
 	spin_lock_init(&iboe->lock);
 
 	ib_dev = &hr_dev->ib_dev;
-	strlcpy(ib_dev->name, "hns_%d", IB_DEVICE_NAME_MAX);
 
-	ib_dev->owner			= THIS_MODULE;
 	ib_dev->node_type		= RDMA_NODE_IB_CA;
 	ib_dev->dev.parent		= dev;
 
 	ib_dev->phys_port_cnt		= hr_dev->caps.num_ports;
 	ib_dev->local_dma_lkey		= hr_dev->caps.reserved_lkey;
 	ib_dev->num_comp_vectors	= hr_dev->caps.num_comp_vectors;
-	ib_dev->uverbs_abi_ver		= 1;
 	ib_dev->uverbs_cmd_mask		=
 		(1ULL << IB_USER_VERBS_CMD_GET_CONTEXT) |
 		(1ULL << IB_USER_VERBS_CMD_QUERY_DEVICE) |
@@ -537,59 +511,47 @@
 	ib_dev->uverbs_ex_cmd_mask |=
 		(1ULL << IB_USER_VERBS_EX_CMD_MODIFY_CQ);
 
-	/* HCA||device||port */
-	ib_dev->modify_device		= hns_roce_modify_device;
-	ib_dev->query_device		= hns_roce_query_device;
-	ib_dev->query_port		= hns_roce_query_port;
-	ib_dev->modify_port		= hns_roce_modify_port;
-	ib_dev->get_link_layer		= hns_roce_get_link_layer;
-	ib_dev->get_netdev		= hns_roce_get_netdev;
-	ib_dev->add_gid			= hns_roce_add_gid;
-	ib_dev->del_gid			= hns_roce_del_gid;
-	ib_dev->query_pkey		= hns_roce_query_pkey;
-	ib_dev->alloc_ucontext		= hns_roce_alloc_ucontext;
-	ib_dev->dealloc_ucontext	= hns_roce_dealloc_ucontext;
-	ib_dev->mmap			= hns_roce_mmap;
-
-	/* PD */
-	ib_dev->alloc_pd		= hns_roce_alloc_pd;
-	ib_dev->dealloc_pd		= hns_roce_dealloc_pd;
-
-	/* AH */
-	ib_dev->create_ah		= hns_roce_create_ah;
-	ib_dev->query_ah		= hns_roce_query_ah;
-	ib_dev->destroy_ah		= hns_roce_destroy_ah;
-
-	/* QP */
-	ib_dev->create_qp		= hns_roce_create_qp;
-	ib_dev->modify_qp		= hns_roce_modify_qp;
-	ib_dev->query_qp		= hr_dev->hw->query_qp;
-	ib_dev->destroy_qp		= hr_dev->hw->destroy_qp;
-	ib_dev->post_send		= hr_dev->hw->post_send;
-	ib_dev->post_recv		= hr_dev->hw->post_recv;
-
-	/* CQ */
-	ib_dev->create_cq		= hns_roce_ib_create_cq;
-	ib_dev->modify_cq		= hr_dev->hw->modify_cq;
-	ib_dev->destroy_cq		= hns_roce_ib_destroy_cq;
-	ib_dev->req_notify_cq		= hr_dev->hw->req_notify_cq;
-	ib_dev->poll_cq			= hr_dev->hw->poll_cq;
-
-	/* MR */
-	ib_dev->get_dma_mr		= hns_roce_get_dma_mr;
-	ib_dev->reg_user_mr		= hns_roce_reg_user_mr;
-	ib_dev->dereg_mr		= hns_roce_dereg_mr;
 	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_REREG_MR) {
-		ib_dev->rereg_user_mr	= hns_roce_rereg_user_mr;
 		ib_dev->uverbs_cmd_mask |= (1ULL << IB_USER_VERBS_CMD_REREG_MR);
+		ib_set_device_ops(ib_dev, &hns_roce_dev_mr_ops);
 	}
 
-	/* OTHERS */
-	ib_dev->get_port_immutable	= hns_roce_port_immutable;
-	ib_dev->disassociate_ucontext	= hns_roce_disassociate_ucontext;
+	/* MW */
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_MW) {
+		ib_dev->uverbs_cmd_mask |=
+					(1ULL << IB_USER_VERBS_CMD_ALLOC_MW) |
+					(1ULL << IB_USER_VERBS_CMD_DEALLOC_MW);
+		ib_set_device_ops(ib_dev, &hns_roce_dev_mw_ops);
+	}
 
-	ib_dev->driver_id = RDMA_DRIVER_HNS;
-	ret = ib_register_device(ib_dev, NULL);
+	/* FRMR */
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_FRMR)
+		ib_set_device_ops(ib_dev, &hns_roce_dev_frmr_ops);
+
+	/* SRQ */
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) {
+		ib_dev->uverbs_cmd_mask |=
+				(1ULL << IB_USER_VERBS_CMD_CREATE_SRQ) |
+				(1ULL << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+				(1ULL << IB_USER_VERBS_CMD_QUERY_SRQ) |
+				(1ULL << IB_USER_VERBS_CMD_DESTROY_SRQ) |
+				(1ULL << IB_USER_VERBS_CMD_POST_SRQ_RECV);
+		ib_set_device_ops(ib_dev, &hns_roce_dev_srq_ops);
+		ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_srq_ops);
+	}
+
+	ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops);
+	ib_set_device_ops(ib_dev, &hns_roce_dev_ops);
+	for (i = 0; i < hr_dev->caps.num_ports; i++) {
+		if (!hr_dev->iboe.netdevs[i])
+			continue;
+
+		ret = ib_device_set_netdev(ib_dev, hr_dev->iboe.netdevs[i],
+					   i + 1);
+		if (ret)
+			return ret;
+	}
+	ret = ib_register_device(ib_dev, "hns_%d");
 	if (ret) {
 		dev_err(dev, "ib_register_device failed!\n");
 		return ret;
@@ -689,8 +651,112 @@
 		goto err_unmap_trrl;
 	}
 
+	if (hr_dev->caps.srqc_entry_sz) {
+		ret = hns_roce_init_hem_table(hr_dev, &hr_dev->srq_table.table,
+					      HEM_TYPE_SRQC,
+					      hr_dev->caps.srqc_entry_sz,
+					      hr_dev->caps.num_srqs, 1);
+		if (ret) {
+			dev_err(dev,
+			      "Failed to init SRQ context memory, aborting.\n");
+			goto err_unmap_cq;
+		}
+	}
+
+	if (hr_dev->caps.num_srqwqe_segs) {
+		ret = hns_roce_init_hem_table(hr_dev,
+					     &hr_dev->mr_table.mtt_srqwqe_table,
+					     HEM_TYPE_SRQWQE,
+					     hr_dev->caps.mtt_entry_sz,
+					     hr_dev->caps.num_srqwqe_segs, 1);
+		if (ret) {
+			dev_err(dev,
+				"Failed to init MTT srqwqe memory, aborting.\n");
+			goto err_unmap_srq;
+		}
+	}
+
+	if (hr_dev->caps.num_idx_segs) {
+		ret = hns_roce_init_hem_table(hr_dev,
+					      &hr_dev->mr_table.mtt_idx_table,
+					      HEM_TYPE_IDX,
+					      hr_dev->caps.idx_entry_sz,
+					      hr_dev->caps.num_idx_segs, 1);
+		if (ret) {
+			dev_err(dev,
+				"Failed to init MTT idx memory, aborting.\n");
+			goto err_unmap_srqwqe;
+		}
+	}
+
+	if (hr_dev->caps.sccc_entry_sz) {
+		ret = hns_roce_init_hem_table(hr_dev,
+					      &hr_dev->qp_table.sccc_table,
+					      HEM_TYPE_SCCC,
+					      hr_dev->caps.sccc_entry_sz,
+					      hr_dev->caps.num_qps, 1);
+		if (ret) {
+			dev_err(dev,
+			      "Failed to init SCC context memory, aborting.\n");
+			goto err_unmap_idx;
+		}
+	}
+
+	if (hr_dev->caps.qpc_timer_entry_sz) {
+		ret = hns_roce_init_hem_table(hr_dev,
+					      &hr_dev->qpc_timer_table,
+					      HEM_TYPE_QPC_TIMER,
+					      hr_dev->caps.qpc_timer_entry_sz,
+					      hr_dev->caps.num_qpc_timer, 1);
+		if (ret) {
+			dev_err(dev,
+			      "Failed to init QPC timer memory, aborting.\n");
+			goto err_unmap_ctx;
+		}
+	}
+
+	if (hr_dev->caps.cqc_timer_entry_sz) {
+		ret = hns_roce_init_hem_table(hr_dev,
+					      &hr_dev->cqc_timer_table,
+					      HEM_TYPE_CQC_TIMER,
+					      hr_dev->caps.cqc_timer_entry_sz,
+					      hr_dev->caps.num_cqc_timer, 1);
+		if (ret) {
+			dev_err(dev,
+			      "Failed to init CQC timer memory, aborting.\n");
+			goto err_unmap_qpc_timer;
+		}
+	}
+
 	return 0;
 
+err_unmap_qpc_timer:
+	if (hr_dev->caps.qpc_timer_entry_sz)
+		hns_roce_cleanup_hem_table(hr_dev,
+					   &hr_dev->qpc_timer_table);
+
+err_unmap_ctx:
+	if (hr_dev->caps.sccc_entry_sz)
+		hns_roce_cleanup_hem_table(hr_dev,
+					   &hr_dev->qp_table.sccc_table);
+
+err_unmap_idx:
+	if (hr_dev->caps.num_idx_segs)
+		hns_roce_cleanup_hem_table(hr_dev,
+					   &hr_dev->mr_table.mtt_idx_table);
+
+err_unmap_srqwqe:
+	if (hr_dev->caps.num_srqwqe_segs)
+		hns_roce_cleanup_hem_table(hr_dev,
+					   &hr_dev->mr_table.mtt_srqwqe_table);
+
+err_unmap_srq:
+	if (hr_dev->caps.srqc_entry_sz)
+		hns_roce_cleanup_hem_table(hr_dev, &hr_dev->srq_table.table);
+
+err_unmap_cq:
+	hns_roce_cleanup_hem_table(hr_dev, &hr_dev->cq_table.table);
+
 err_unmap_trrl:
 	if (hr_dev->caps.trrl_entry_sz)
 		hns_roce_cleanup_hem_table(hr_dev,
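
Each optional HEM table above is guarded by its capability size, and the error labels unwind strictly in reverse order of initialization, re-testing the same guard before undoing a step. A compact, runnable sketch of the pattern with stand-in init/cleanup functions:

#include <stdio.h>

static int init_a(void) { return 0; }	/* stand-ins for the  */
static int init_b(void) { return -1; }	/* HEM table setups   */
static void cleanup_a(void) { puts("cleanup a"); }

/* Initialize in order; on failure fall through the labels in
 * reverse order so only the steps that succeeded are undone. */
static int init_all(int have_b)
{
	int ret;

	ret = init_a();
	if (ret)
		return ret;

	if (have_b) {			/* optional, like srqc_entry_sz */
		ret = init_b();
		if (ret)
			goto err_a;
	}

	return 0;

err_a:
	cleanup_a();
	return ret;
}

int main(void)
{
	printf("ret=%d\n", init_all(1));	/* prints "cleanup a", ret=-1 */
	return 0;
}
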
@@ -770,8 +836,21 @@
 		goto err_cq_table_free;
 	}
 
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) {
+		ret = hns_roce_init_srq_table(hr_dev);
+		if (ret) {
+			dev_err(dev,
+				"Failed to init shared receive queue table.\n");
+			goto err_qp_table_free;
+		}
+	}
+
 	return 0;
 
+err_qp_table_free:
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ)
+		hns_roce_cleanup_qp_table(hr_dev);
+
 err_cq_table_free:
 	hns_roce_cleanup_cq_table(hr_dev);
 
@@ -823,6 +902,7 @@
 		goto error_failed_cmd_init;
 	}
 
+	/* EQ depends on poll mode, event mode depends on EQ */
 	ret = hr_dev->hw->init_eq(hr_dev);
 	if (ret) {
 		dev_err(dev, "eq init failed!\n");
@@ -832,8 +912,9 @@
 	if (hr_dev->cmd_mod) {
 		ret = hns_roce_cmd_use_events(hr_dev);
 		if (ret) {
-			dev_err(dev, "Switch to event-driven cmd failed!\n");
-			goto error_failed_use_event;
+			dev_warn(dev,
+				 "Cmd event mode failed, set back to poll!\n");
+			hns_roce_cmd_use_polling(hr_dev);
 		}
 	}
 
@@ -876,8 +957,6 @@
 error_failed_init_hem:
 	if (hr_dev->cmd_mod)
 		hns_roce_cmd_use_polling(hr_dev);
-
-error_failed_use_event:
 	hr_dev->hw->cleanup_eq(hr_dev);
 
 error_failed_eq_table:
@@ -895,7 +974,6 @@
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(hns_roce_init);
 
 void hns_roce_exit(struct hns_roce_dev *hr_dev)
 {
@@ -916,7 +994,6 @@
 	if (hr_dev->hw->reset)
 		hr_dev->hw->reset(hr_dev, false);
 }
-EXPORT_SYMBOL_GPL(hns_roce_exit);
 
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_AUTHOR("Wei Hu <xavier.huwei@huawei.com>");
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index eb26a5f..5f8416b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -47,7 +47,6 @@
 {
 	return (key << 24) | (key >> 8);
 }
-EXPORT_SYMBOL_GPL(key_to_hw_index);
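
key_to_hw_index() above is an 8-bit right-rotation of the 32-bit key, and its counterpart hw_index_to_key() is the matching left-rotation, so the two round-trip exactly. A quick self-contained check (the helpers are copied in spirit; the example value is arbitrary):

#include <stdio.h>
#include <stdint.h>

/* Same 8-bit rotations as the driver's key<->index helpers. */
static uint32_t key_to_hw_index(uint32_t key)
{
	return (key << 24) | (key >> 8);	/* rotate right by 8 */
}

static uint32_t hw_index_to_key(uint32_t idx)
{
	return (idx << 8) | (idx >> 24);	/* rotate left by 8 */
}

int main(void)
{
	uint32_t key = 0x00000102;
	uint32_t idx = key_to_hw_index(key);

	/* prints idx=0x02000001 back=0x00000102 */
	printf("idx=0x%08x back=0x%08x\n", idx, hw_index_to_key(idx));
	return 0;
}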
 
 static int hns_roce_sw2hw_mpt(struct hns_roce_dev *hr_dev,
 			      struct hns_roce_cmd_mailbox *mailbox,
@@ -66,7 +65,6 @@
 				 mpt_index, !mailbox, HNS_ROCE_CMD_HW2SW_MPT,
 				 HNS_ROCE_CMD_TIMEOUT_MSECS);
 }
-EXPORT_SYMBOL_GPL(hns_roce_hw2sw_mpt);
 
 static int hns_roce_buddy_alloc(struct hns_roce_buddy *buddy, int order,
 				unsigned long *seg)
@@ -184,12 +182,27 @@
 	struct hns_roce_buddy *buddy;
 	int ret;
 
-	if (mtt_type == MTT_TYPE_WQE) {
+	switch (mtt_type) {
+	case MTT_TYPE_WQE:
 		buddy = &mr_table->mtt_buddy;
 		table = &mr_table->mtt_table;
-	} else {
+		break;
+	case MTT_TYPE_CQE:
 		buddy = &mr_table->mtt_cqe_buddy;
 		table = &mr_table->mtt_cqe_table;
+		break;
+	case MTT_TYPE_SRQWQE:
+		buddy = &mr_table->mtt_srqwqe_buddy;
+		table = &mr_table->mtt_srqwqe_table;
+		break;
+	case MTT_TYPE_IDX:
+		buddy = &mr_table->mtt_idx_buddy;
+		table = &mr_table->mtt_idx_table;
+		break;
+	default:
+		dev_err(hr_dev->dev, "Unsupported MTT table type: %d\n",
+			mtt_type);
+		return -EINVAL;
 	}
 
 	ret = hns_roce_buddy_alloc(buddy, order, seg);
@@ -242,21 +255,42 @@
 	if (mtt->order < 0)
 		return;
 
-	if (mtt->mtt_type == MTT_TYPE_WQE) {
+	switch (mtt->mtt_type) {
+	case MTT_TYPE_WQE:
 		hns_roce_buddy_free(&mr_table->mtt_buddy, mtt->first_seg,
 				    mtt->order);
 		hns_roce_table_put_range(hr_dev, &mr_table->mtt_table,
 					mtt->first_seg,
 					mtt->first_seg + (1 << mtt->order) - 1);
-	} else {
+		break;
+	case MTT_TYPE_CQE:
 		hns_roce_buddy_free(&mr_table->mtt_cqe_buddy, mtt->first_seg,
 				    mtt->order);
 		hns_roce_table_put_range(hr_dev, &mr_table->mtt_cqe_table,
 					mtt->first_seg,
 					mtt->first_seg + (1 << mtt->order) - 1);
+		break;
+	case MTT_TYPE_SRQWQE:
+		hns_roce_buddy_free(&mr_table->mtt_srqwqe_buddy, mtt->first_seg,
+				    mtt->order);
+		hns_roce_table_put_range(hr_dev, &mr_table->mtt_srqwqe_table,
+					mtt->first_seg,
+					mtt->first_seg + (1 << mtt->order) - 1);
+		break;
+	case MTT_TYPE_IDX:
+		hns_roce_buddy_free(&mr_table->mtt_idx_buddy, mtt->first_seg,
+				    mtt->order);
+		hns_roce_table_put_range(hr_dev, &mr_table->mtt_idx_table,
+					mtt->first_seg,
+					mtt->first_seg + (1 << mtt->order) - 1);
+		break;
+	default:
+		dev_err(hr_dev->dev,
+			"Unsupported mtt type %d, mtt cleanup failed\n",
+			mtt->mtt_type);
+		break;
 	}
 }
-EXPORT_SYMBOL_GPL(hns_roce_mtt_cleanup);
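
The buddy allocator behind these paths hands out power-of-two runs of MTT segments, which is why the cleanup above frees the range first_seg..first_seg + (1 << order) - 1. A sketch of the order rounding only (not the driver's allocator), assuming n is the requested segment count:

#include <stdio.h>

/* n segments round up to the next power of two; the order is its
 * log2, matching the freed range in hns_roce_mtt_cleanup(). */
static int order_for(unsigned int n)
{
	int order = 0;

	while ((1u << order) < n)
		order++;
	return order;
}

int main(void)
{
	/* prints 3: 5 segments round up to 8 = 2^3 */
	printf("order=%d\n", order_for(5));
	return 0;
}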
 
 static void hns_roce_loop_free(struct hns_roce_dev *hr_dev,
 			       struct hns_roce_mr *mr, int err_loop_index,
@@ -277,11 +311,11 @@
 			dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i],
 					  mr->pbl_l1_dma_addr[i]);
 
-			for (j = 0; j < pbl_bt_sz / 8; j++) {
+			for (j = 0; j < pbl_bt_sz / BA_BYTE_LEN; j++) {
 				if (i == loop_i && j >= loop_j)
 					break;
 
-				bt_idx = i * pbl_bt_sz / 8 + j;
+				bt_idx = i * pbl_bt_sz / BA_BYTE_LEN + j;
 				dma_free_coherent(dev, pbl_bt_sz,
 						  mr->pbl_bt_l2[bt_idx],
 						  mr->pbl_l2_dma_addr[bt_idx]);
@@ -292,8 +326,8 @@
 			dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i],
 					  mr->pbl_l1_dma_addr[i]);
 
-			for (j = 0; j < pbl_bt_sz / 8; j++) {
-				bt_idx = i * pbl_bt_sz / 8 + j;
+			for (j = 0; j < pbl_bt_sz / BA_BYTE_LEN; j++) {
+				bt_idx = i * pbl_bt_sz / BA_BYTE_LEN + j;
 				dma_free_coherent(dev, pbl_bt_sz,
 						  mr->pbl_bt_l2[bt_idx],
 						  mr->pbl_l2_dma_addr[bt_idx]);
@@ -313,49 +347,178 @@
 	mr->pbl_bt_l0 = NULL;
 	mr->pbl_l0_dma_addr = 0;
 }
+
+static int pbl_1hop_alloc(struct hns_roce_dev *hr_dev, int npages,
+			       struct hns_roce_mr *mr, u32 pbl_bt_sz)
+{
+	struct device *dev = hr_dev->dev;
+
+	if (npages > pbl_bt_sz / 8) {
+		dev_err(dev, "npages %d is larger than buf_pg_sz!\n",
+			npages);
+		return -EINVAL;
+	}
+	mr->pbl_buf = dma_alloc_coherent(dev, npages * 8,
+					 &(mr->pbl_dma_addr),
+					 GFP_KERNEL);
+	if (!mr->pbl_buf)
+		return -ENOMEM;
+
+	mr->pbl_size = npages;
+	mr->pbl_ba = mr->pbl_dma_addr;
+	mr->pbl_hop_num = 1;
+	mr->pbl_ba_pg_sz = hr_dev->caps.pbl_ba_pg_sz;
+	mr->pbl_buf_pg_sz = hr_dev->caps.pbl_buf_pg_sz;
+	return 0;
+}
+
+static int pbl_2hop_alloc(struct hns_roce_dev *hr_dev, int npages,
+			       struct hns_roce_mr *mr, u32 pbl_bt_sz)
+{
+	struct device *dev = hr_dev->dev;
+	int npages_allocated;
+	u64 pbl_last_bt_num;
+	u64 pbl_bt_cnt = 0;
+	u64 size;
+	int i;
+
+	pbl_last_bt_num = (npages + pbl_bt_sz / 8 - 1) / (pbl_bt_sz / 8);
+
+	/* alloc L1 BT */
+	for (i = 0; i < pbl_bt_sz / 8; i++) {
+		if (pbl_bt_cnt + 1 < pbl_last_bt_num) {
+			size = pbl_bt_sz;
+		} else {
+			npages_allocated = i * (pbl_bt_sz / 8);
+			size = (npages - npages_allocated) * 8;
+		}
+		mr->pbl_bt_l1[i] = dma_alloc_coherent(dev, size,
+					    &(mr->pbl_l1_dma_addr[i]),
+					    GFP_KERNEL);
+		if (!mr->pbl_bt_l1[i]) {
+			hns_roce_loop_free(hr_dev, mr, 1, i, 0);
+			return -ENOMEM;
+		}
+
+		*(mr->pbl_bt_l0 + i) = mr->pbl_l1_dma_addr[i];
+
+		pbl_bt_cnt++;
+		if (pbl_bt_cnt >= pbl_last_bt_num)
+			break;
+	}
+
+	mr->l0_chunk_last_num = i + 1;
+
+	return 0;
+}
+
+static int pbl_3hop_alloc(struct hns_roce_dev *hr_dev, int npages,
+			       struct hns_roce_mr *mr, u32 pbl_bt_sz)
+{
+	struct device *dev = hr_dev->dev;
+	int mr_alloc_done = 0;
+	int npages_allocated;
+	u64 pbl_last_bt_num;
+	u64 pbl_bt_cnt = 0;
+	u64 bt_idx;
+	u64 size;
+	int i;
+	int j = 0;
+
+	pbl_last_bt_num = (npages + pbl_bt_sz / 8 - 1) / (pbl_bt_sz / 8);
+
+	mr->pbl_l2_dma_addr = kcalloc(pbl_last_bt_num,
+				      sizeof(*mr->pbl_l2_dma_addr),
+				      GFP_KERNEL);
+	if (!mr->pbl_l2_dma_addr)
+		return -ENOMEM;
+
+	mr->pbl_bt_l2 = kcalloc(pbl_last_bt_num,
+				sizeof(*mr->pbl_bt_l2),
+				GFP_KERNEL);
+	if (!mr->pbl_bt_l2)
+		goto err_kcalloc_bt_l2;
+
+	/* alloc L1, L2 BT */
+	for (i = 0; i < pbl_bt_sz / 8; i++) {
+		mr->pbl_bt_l1[i] = dma_alloc_coherent(dev, pbl_bt_sz,
+					    &(mr->pbl_l1_dma_addr[i]),
+					    GFP_KERNEL);
+		if (!mr->pbl_bt_l1[i]) {
+			hns_roce_loop_free(hr_dev, mr, 1, i, 0);
+			goto err_dma_alloc_l0;
+		}
+
+		*(mr->pbl_bt_l0 + i) = mr->pbl_l1_dma_addr[i];
+
+		for (j = 0; j < pbl_bt_sz / 8; j++) {
+			bt_idx = i * pbl_bt_sz / 8 + j;
+
+			if (pbl_bt_cnt + 1 < pbl_last_bt_num) {
+				size = pbl_bt_sz;
+			} else {
+				npages_allocated = bt_idx *
+						   (pbl_bt_sz / 8);
+				size = (npages - npages_allocated) * 8;
+			}
+			mr->pbl_bt_l2[bt_idx] = dma_alloc_coherent(
+				      dev, size,
+				      &(mr->pbl_l2_dma_addr[bt_idx]),
+				      GFP_KERNEL);
+			if (!mr->pbl_bt_l2[bt_idx]) {
+				hns_roce_loop_free(hr_dev, mr, 2, i, j);
+				goto err_dma_alloc_l0;
+			}
+
+			*(mr->pbl_bt_l1[i] + j) =
+					mr->pbl_l2_dma_addr[bt_idx];
+
+			pbl_bt_cnt++;
+			if (pbl_bt_cnt >= pbl_last_bt_num) {
+				mr_alloc_done = 1;
+				break;
+			}
+		}
+
+		if (mr_alloc_done)
+			break;
+	}
+
+	mr->l0_chunk_last_num = i + 1;
+	mr->l1_chunk_last_num = j + 1;
+
+	return 0;
+
+err_dma_alloc_l0:
+	kfree(mr->pbl_bt_l2);
+	mr->pbl_bt_l2 = NULL;
+
+err_kcalloc_bt_l2:
+	kfree(mr->pbl_l2_dma_addr);
+	mr->pbl_l2_dma_addr = NULL;
+
+	return -ENOMEM;
+}
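
With 8-byte base addresses, one BT page of pbl_bt_sz bytes holds pbl_bt_sz/8 entries, so each extra hop in the split-out 1/2/3-hop helpers above multiplies the reachable page count by that factor. A sketch of the coverage arithmetic, assuming 4 KiB BT pages and 4 KiB data pages:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t pbl_bt_sz = 4096;		/* one BT page (assumption) */
	uint64_t per_bt = pbl_bt_sz / 8;	/* 8-byte BAs per page: 512 */
	uint64_t pages = 1;
	int hop;

	/* prints 512, 262144, 134217728 pages reachable:
	 * 2 MiB, 1 GiB, 512 GiB of 4 KiB data pages */
	for (hop = 1; hop <= 3; hop++) {
		pages *= per_bt;
		printf("hops=%d max pages=%llu\n",
		       hop, (unsigned long long)pages);
	}
	return 0;
}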
 
 /* PBL multi hop addressing */
 static int hns_roce_mhop_alloc(struct hns_roce_dev *hr_dev, int npages,
 			       struct hns_roce_mr *mr)
 {
 	struct device *dev = hr_dev->dev;
-	int mr_alloc_done = 0;
-	int npages_allocated;
-	int i = 0, j = 0;
 	u32 pbl_bt_sz;
 	u32 mhop_num;
-	u64 pbl_last_bt_num;
-	u64 pbl_bt_cnt = 0;
-	u64 bt_idx;
-	u64 size;
 
-	mhop_num = hr_dev->caps.pbl_hop_num;
+	mhop_num = (mr->type == MR_TYPE_FRMR ? 1 : hr_dev->caps.pbl_hop_num);
 	pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT);
-	pbl_last_bt_num = (npages + pbl_bt_sz / 8 - 1) / (pbl_bt_sz / 8);
 
 	if (mhop_num == HNS_ROCE_HOP_NUM_0)
 		return 0;
 
-	/* hop_num = 1 */
-	if (mhop_num == 1) {
-		if (npages > pbl_bt_sz / 8) {
-			dev_err(dev, "npages %d is larger than buf_pg_sz!",
-				npages);
-			return -EINVAL;
-		}
-		mr->pbl_buf = dma_alloc_coherent(dev, npages * 8,
-						 &(mr->pbl_dma_addr),
-						 GFP_KERNEL);
-		if (!mr->pbl_buf)
-			return -ENOMEM;
-
-		mr->pbl_size = npages;
-		mr->pbl_ba = mr->pbl_dma_addr;
-		mr->pbl_hop_num = hr_dev->caps.pbl_hop_num;
-		mr->pbl_ba_pg_sz = hr_dev->caps.pbl_ba_pg_sz;
-		mr->pbl_buf_pg_sz = hr_dev->caps.pbl_buf_pg_sz;
-		return 0;
-	}
+	if (mhop_num == 1)
+		return pbl_1hop_alloc(hr_dev, npages, mr, pbl_bt_sz);
 
 	mr->pbl_l1_dma_addr = kcalloc(pbl_bt_sz / 8,
 				      sizeof(*mr->pbl_l1_dma_addr),
@@ -368,100 +531,23 @@
 	if (!mr->pbl_bt_l1)
 		goto err_kcalloc_bt_l1;
 
-	if (mhop_num == 3) {
-		mr->pbl_l2_dma_addr = kcalloc(pbl_last_bt_num,
-					      sizeof(*mr->pbl_l2_dma_addr),
-					      GFP_KERNEL);
-		if (!mr->pbl_l2_dma_addr)
-			goto err_kcalloc_l2_dma;
-
-		mr->pbl_bt_l2 = kcalloc(pbl_last_bt_num,
-					sizeof(*mr->pbl_bt_l2),
-					GFP_KERNEL);
-		if (!mr->pbl_bt_l2)
-			goto err_kcalloc_bt_l2;
-	}
-
 	/* alloc L0 BT */
 	mr->pbl_bt_l0 = dma_alloc_coherent(dev, pbl_bt_sz,
 					   &(mr->pbl_l0_dma_addr),
 					   GFP_KERNEL);
 	if (!mr->pbl_bt_l0)
-		goto err_dma_alloc_l0;
+		goto err_kcalloc_l2_dma;
 
 	if (mhop_num == 2) {
-		/* alloc L1 BT */
-		for (i = 0; i < pbl_bt_sz / 8; i++) {
-			if (pbl_bt_cnt + 1 < pbl_last_bt_num) {
-				size = pbl_bt_sz;
-			} else {
-				npages_allocated = i * (pbl_bt_sz / 8);
-				size = (npages - npages_allocated) * 8;
-			}
-			mr->pbl_bt_l1[i] = dma_alloc_coherent(dev, size,
-						    &(mr->pbl_l1_dma_addr[i]),
-						    GFP_KERNEL);
-			if (!mr->pbl_bt_l1[i]) {
-				hns_roce_loop_free(hr_dev, mr, 1, i, 0);
-				goto err_dma_alloc_l0;
-			}
-
-			*(mr->pbl_bt_l0 + i) = mr->pbl_l1_dma_addr[i];
-
-			pbl_bt_cnt++;
-			if (pbl_bt_cnt >= pbl_last_bt_num)
-				break;
-		}
-	} else if (mhop_num == 3) {
-		/* alloc L1, L2 BT */
-		for (i = 0; i < pbl_bt_sz / 8; i++) {
-			mr->pbl_bt_l1[i] = dma_alloc_coherent(dev, pbl_bt_sz,
-						    &(mr->pbl_l1_dma_addr[i]),
-						    GFP_KERNEL);
-			if (!mr->pbl_bt_l1[i]) {
-				hns_roce_loop_free(hr_dev, mr, 1, i, 0);
-				goto err_dma_alloc_l0;
-			}
-
-			*(mr->pbl_bt_l0 + i) = mr->pbl_l1_dma_addr[i];
-
-			for (j = 0; j < pbl_bt_sz / 8; j++) {
-				bt_idx = i * pbl_bt_sz / 8 + j;
-
-				if (pbl_bt_cnt + 1 < pbl_last_bt_num) {
-					size = pbl_bt_sz;
-				} else {
-					npages_allocated = bt_idx *
-							   (pbl_bt_sz / 8);
-					size = (npages - npages_allocated) * 8;
-				}
-				mr->pbl_bt_l2[bt_idx] = dma_alloc_coherent(
-					      dev, size,
-					      &(mr->pbl_l2_dma_addr[bt_idx]),
-					      GFP_KERNEL);
-				if (!mr->pbl_bt_l2[bt_idx]) {
-					hns_roce_loop_free(hr_dev, mr, 2, i, j);
-					goto err_dma_alloc_l0;
-				}
-
-				*(mr->pbl_bt_l1[i] + j) =
-						mr->pbl_l2_dma_addr[bt_idx];
-
-				pbl_bt_cnt++;
-				if (pbl_bt_cnt >= pbl_last_bt_num) {
-					mr_alloc_done = 1;
-					break;
-				}
-			}
-
-			if (mr_alloc_done)
-				break;
-		}
+		if (pbl_2hop_alloc(hr_dev, npages, mr, pbl_bt_sz))
+			goto err_kcalloc_l2_dma;
 	}
 
-	mr->l0_chunk_last_num = i + 1;
-	if (mhop_num == 3)
-		mr->l1_chunk_last_num = j + 1;
+	if (mhop_num == 3) {
+		if (pbl_3hop_alloc(hr_dev, npages, mr, pbl_bt_sz))
+			goto err_kcalloc_l2_dma;
+	}
 
 	mr->pbl_size = npages;
 	mr->pbl_ba = mr->pbl_l0_dma_addr;
@@ -471,14 +557,6 @@
 
 	return 0;
 
-err_dma_alloc_l0:
-	kfree(mr->pbl_bt_l2);
-	mr->pbl_bt_l2 = NULL;
-
-err_kcalloc_bt_l2:
-	kfree(mr->pbl_l2_dma_addr);
-	mr->pbl_l2_dma_addr = NULL;
-
 err_kcalloc_l2_dma:
 	kfree(mr->pbl_bt_l1);
 	mr->pbl_bt_l1 = NULL;
@@ -496,7 +574,7 @@
 {
 	struct device *dev = hr_dev->dev;
 	unsigned long index = 0;
-	int ret = 0;
+	int ret;
 
 	/* Allocate a key for mr from mr_table */
 	ret = hns_roce_bitmap_alloc(&hr_dev->mr_table.mtpt_bitmap, &index);
@@ -511,7 +589,6 @@
 	mr->key = hw_index_to_key(index);	/* MR key */
 
 	if (size == ~0ull) {
-		mr->type = MR_TYPE_DMA;
 		mr->pbl_buf = NULL;
 		mr->pbl_dma_addr = 0;
 		/* PBL multi-hop addressing parameters */
@@ -522,9 +599,9 @@
 		mr->pbl_l1_dma_addr = NULL;
 		mr->pbl_l0_dma_addr = 0;
 	} else {
-		mr->type = MR_TYPE_MR;
 		if (!hr_dev->caps.pbl_hop_num) {
-			mr->pbl_buf = dma_alloc_coherent(dev, npages * 8,
+			mr->pbl_buf = dma_alloc_coherent(dev,
+							 npages * BA_BYTE_LEN,
 							 &(mr->pbl_dma_addr),
 							 GFP_KERNEL);
 			if (!mr->pbl_buf)
@@ -548,16 +625,15 @@
 	u32 mhop_num;
 	u64 bt_idx;
 
-	npages = ib_umem_page_count(mr->umem);
+	npages = mr->pbl_size;
 	pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT);
-	mhop_num = hr_dev->caps.pbl_hop_num;
+	mhop_num = (mr->type == MR_TYPE_FRMR) ? 1 : hr_dev->caps.pbl_hop_num;
 
 	if (mhop_num == HNS_ROCE_HOP_NUM_0)
 		return;
 
-	/* hop_num = 1 */
 	if (mhop_num == 1) {
-		dma_free_coherent(dev, (unsigned int)(npages * 8),
+		dma_free_coherent(dev, (unsigned int)(npages * BA_BYTE_LEN),
 				  mr->pbl_buf, mr->pbl_dma_addr);
 		return;
 	}
@@ -568,12 +644,13 @@
 	if (mhop_num == 2) {
 		for (i = 0; i < mr->l0_chunk_last_num; i++) {
 			if (i == mr->l0_chunk_last_num - 1) {
-				npages_allocated = i * (pbl_bt_sz / 8);
+				npages_allocated =
+						i * (pbl_bt_sz / BA_BYTE_LEN);
 
 				dma_free_coherent(dev,
-					      (npages - npages_allocated) * 8,
-					      mr->pbl_bt_l1[i],
-					      mr->pbl_l1_dma_addr[i]);
+				      (npages - npages_allocated) * BA_BYTE_LEN,
+				       mr->pbl_bt_l1[i],
+				       mr->pbl_l1_dma_addr[i]);
 
 				break;
 			}
@@ -586,16 +663,17 @@
 			dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i],
 					  mr->pbl_l1_dma_addr[i]);
 
-			for (j = 0; j < pbl_bt_sz / 8; j++) {
-				bt_idx = i * (pbl_bt_sz / 8) + j;
+			for (j = 0; j < pbl_bt_sz / BA_BYTE_LEN; j++) {
+				bt_idx = i * (pbl_bt_sz / BA_BYTE_LEN) + j;
 
 				if ((i == mr->l0_chunk_last_num - 1)
 				    && j == mr->l1_chunk_last_num - 1) {
 					npages_allocated = bt_idx *
-							   (pbl_bt_sz / 8);
+						      (pbl_bt_sz / BA_BYTE_LEN);
 
 					dma_free_coherent(dev,
-					      (npages - npages_allocated) * 8,
+					      (npages - npages_allocated) *
+					      BA_BYTE_LEN,
 					      mr->pbl_bt_l2[bt_idx],
 					      mr->pbl_l2_dma_addr[bt_idx]);
 
@@ -636,10 +714,12 @@
 	}
 
 	if (mr->size != ~0ULL) {
-		npages = ib_umem_page_count(mr->umem);
+		if (mr->type == MR_TYPE_MR)
+			npages = ib_umem_page_count(mr->umem);
 
 		if (!hr_dev->caps.pbl_hop_num)
-			dma_free_coherent(dev, (unsigned int)(npages * 8),
+			dma_free_coherent(dev,
+					  (unsigned int)(npages * BA_BYTE_LEN),
 					  mr->pbl_buf, mr->pbl_dma_addr);
 		else
 			hns_roce_mhop_free(hr_dev, mr);
@@ -674,7 +754,10 @@
 		goto err_table;
 	}
 
-	ret = hr_dev->hw->write_mtpt(mailbox->buf, mr, mtpt_idx);
+	if (mr->type != MR_TYPE_FRMR)
+		ret = hr_dev->hw->write_mtpt(mailbox->buf, mr, mtpt_idx);
+	else
+		ret = hr_dev->hw->frmr_write_mtpt(mailbox->buf, mr);
 	if (ret) {
 		dev_err(dev, "Write mtpt fail!\n");
 		goto err_page;
@@ -707,14 +790,29 @@
 	struct hns_roce_hem_table *table;
 	dma_addr_t dma_handle;
 	__le64 *mtts;
-	u32 s = start_index * sizeof(u64);
 	u32 bt_page_size;
 	u32 i;
 
-	if (mtt->mtt_type == MTT_TYPE_WQE)
+	switch (mtt->mtt_type) {
+	case MTT_TYPE_WQE:
+		table = &hr_dev->mr_table.mtt_table;
 		bt_page_size = 1 << (hr_dev->caps.mtt_ba_pg_sz + PAGE_SHIFT);
-	else
+		break;
+	case MTT_TYPE_CQE:
+		table = &hr_dev->mr_table.mtt_cqe_table;
 		bt_page_size = 1 << (hr_dev->caps.cqe_ba_pg_sz + PAGE_SHIFT);
+		break;
+	case MTT_TYPE_SRQWQE:
+		table = &hr_dev->mr_table.mtt_srqwqe_table;
+		bt_page_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz + PAGE_SHIFT);
+		break;
+	case MTT_TYPE_IDX:
+		table = &hr_dev->mr_table.mtt_idx_table;
+		bt_page_size = 1 << (hr_dev->caps.idx_ba_pg_sz + PAGE_SHIFT);
+		break;
+	default:
+		return -EINVAL;
+	}
 
 	/* All MTTs must fit in the same page */
 	if (start_index / (bt_page_size / sizeof(u64)) !=
@@ -724,13 +822,9 @@
 	if (start_index & (HNS_ROCE_MTT_ENTRY_PER_SEG - 1))
 		return -EINVAL;
 
-	if (mtt->mtt_type == MTT_TYPE_WQE)
-		table = &hr_dev->mr_table.mtt_table;
-	else
-		table = &hr_dev->mr_table.mtt_cqe_table;
-
 	mtts = hns_roce_table_find(hr_dev, table,
-				mtt->first_seg + s / hr_dev->caps.mtt_entry_sz,
+				mtt->first_seg +
+				start_index / HNS_ROCE_MTT_ENTRY_PER_SEG,
 				&dma_handle);
 	if (!mtts)
 		return -ENOMEM;
@@ -757,10 +851,25 @@
 	if (mtt->order < 0)
 		return -EINVAL;
 
-	if (mtt->mtt_type == MTT_TYPE_WQE)
+	switch (mtt->mtt_type) {
+	case MTT_TYPE_WQE:
 		bt_page_size = 1 << (hr_dev->caps.mtt_ba_pg_sz + PAGE_SHIFT);
-	else
+		break;
+	case MTT_TYPE_CQE:
 		bt_page_size = 1 << (hr_dev->caps.cqe_ba_pg_sz + PAGE_SHIFT);
+		break;
+	case MTT_TYPE_SRQWQE:
+		bt_page_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz + PAGE_SHIFT);
+		break;
+	case MTT_TYPE_IDX:
+		bt_page_size = 1 << (hr_dev->caps.idx_ba_pg_sz + PAGE_SHIFT);
+		break;
+	default:
+		dev_err(hr_dev->dev,
+			"Unsupported mtt type %d, write mtt failed\n",
+			mtt->mtt_type);
+		return -EINVAL;
+	}
 
 	while (npages > 0) {
 		chunk = min_t(int, bt_page_size / sizeof(u64), npages);
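
The while (npages > 0) loop above caps each write at one BT page's worth of entries, because the helper it calls requires all MTTs of a single call to sit in the same page (see the "All MTTs must fit in the same page" check earlier). A runnable sketch of the chunking, with illustrative names:

#include <stdio.h>

/* Write npages entries in chunks capped at one BT page. */
static void write_chunks(int npages, int per_bt_page)
{
	int start = 0;

	while (npages > 0) {
		int chunk = npages < per_bt_page ? npages : per_bt_page;

		printf("write [%d, %d)\n", start, start + chunk);
		npages -= chunk;
		start += chunk;
	}
}

int main(void)
{
	write_chunks(1200, 512);	/* writes 512, 512, 176 */
	return 0;
}
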
@@ -826,8 +935,31 @@
 		if (ret)
 			goto err_buddy_cqe;
 	}
+
+	if (hr_dev->caps.num_srqwqe_segs) {
+		ret = hns_roce_buddy_init(&mr_table->mtt_srqwqe_buddy,
+					  ilog2(hr_dev->caps.num_srqwqe_segs));
+		if (ret)
+			goto err_buddy_srqwqe;
+	}
+
+	if (hr_dev->caps.num_idx_segs) {
+		ret = hns_roce_buddy_init(&mr_table->mtt_idx_buddy,
+					  ilog2(hr_dev->caps.num_idx_segs));
+		if (ret)
+			goto err_buddy_idx;
+	}
+
 	return 0;
 
+err_buddy_idx:
+	if (hr_dev->caps.num_srqwqe_segs)
+		hns_roce_buddy_cleanup(&mr_table->mtt_srqwqe_buddy);
+
+err_buddy_srqwqe:
+	if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE))
+		hns_roce_buddy_cleanup(&mr_table->mtt_cqe_buddy);
+
 err_buddy_cqe:
 	hns_roce_buddy_cleanup(&mr_table->mtt_buddy);
 
@@ -840,6 +972,10 @@
 {
 	struct hns_roce_mr_table *mr_table = &hr_dev->mr_table;
 
+	if (hr_dev->caps.num_idx_segs)
+		hns_roce_buddy_cleanup(&mr_table->mtt_idx_buddy);
+	if (hr_dev->caps.num_srqwqe_segs)
+		hns_roce_buddy_cleanup(&mr_table->mtt_srqwqe_buddy);
 	hns_roce_buddy_cleanup(&mr_table->mtt_buddy);
 	if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE))
 		hns_roce_buddy_cleanup(&mr_table->mtt_cqe_buddy);
@@ -855,6 +991,8 @@
 	if (mr == NULL)
 		return  ERR_PTR(-ENOMEM);
 
+	mr->type = MR_TYPE_DMA;
+
 	/* Allocate memory region key */
 	ret = hns_roce_mr_alloc(to_hr_dev(pd->device), to_hr_pd(pd)->pdn, 0,
 				~0ULL, acc, 0, mr);
@@ -882,19 +1020,35 @@
 			       struct hns_roce_mtt *mtt, struct ib_umem *umem)
 {
 	struct device *dev = hr_dev->dev;
-	struct scatterlist *sg;
+	struct sg_dma_page_iter sg_iter;
 	unsigned int order;
-	int i, k, entry;
 	int npage = 0;
 	int ret = 0;
-	int len;
+	int i;
 	u64 page_addr;
 	u64 *pages;
 	u32 bt_page_size;
 	u32 n;
 
-	order = mtt->mtt_type == MTT_TYPE_WQE ? hr_dev->caps.mtt_ba_pg_sz :
-		hr_dev->caps.cqe_ba_pg_sz;
+	switch (mtt->mtt_type) {
+	case MTT_TYPE_WQE:
+		order = hr_dev->caps.mtt_ba_pg_sz;
+		break;
+	case MTT_TYPE_CQE:
+		order = hr_dev->caps.cqe_ba_pg_sz;
+		break;
+	case MTT_TYPE_SRQWQE:
+		order = hr_dev->caps.srqwqe_ba_pg_sz;
+		break;
+	case MTT_TYPE_IDX:
+		order = hr_dev->caps.idx_ba_pg_sz;
+		break;
+	default:
+		dev_err(dev, "Unsupported mtt type %d, write mtt failed\n",
+			mtt->mtt_type);
+		return -EINVAL;
+	}
+
 	bt_page_size = 1 << (order + PAGE_SHIFT);
 
 	pages = (u64 *) __get_free_pages(GFP_KERNEL, order);
@@ -903,29 +1057,25 @@
 
 	i = n = 0;
 
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		len = sg_dma_len(sg) >> PAGE_SHIFT;
-		for (k = 0; k < len; ++k) {
-			page_addr =
-				sg_dma_address(sg) + (k << umem->page_shift);
-			if (!(npage % (1 << (mtt->page_shift - PAGE_SHIFT)))) {
-				if (page_addr & ((1 << mtt->page_shift) - 1)) {
-					dev_err(dev, "page_addr 0x%llx is not page_shift %d alignment!\n",
-						page_addr, mtt->page_shift);
-					ret = -EINVAL;
-					goto out;
-				}
-				pages[i++] = page_addr;
+	for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+		page_addr = sg_page_iter_dma_address(&sg_iter);
+		if (!(npage % (1 << (mtt->page_shift - PAGE_SHIFT)))) {
+			if (page_addr & ((1 << mtt->page_shift) - 1)) {
+				dev_err(dev,
+					"page_addr 0x%llx is not aligned to page_shift %d!\n",
+					page_addr, mtt->page_shift);
+				ret = -EINVAL;
+				goto out;
 			}
-			npage++;
-			if (i == bt_page_size / sizeof(u64)) {
-				ret = hns_roce_write_mtt(hr_dev, mtt, n, i,
-							 pages);
-				if (ret)
-					goto out;
-				n += i;
-				i = 0;
-			}
+			pages[i++] = page_addr;
+		}
+		npage++;
+		if (i == bt_page_size / sizeof(u64)) {
+			ret = hns_roce_write_mtt(hr_dev, mtt, n, i, pages);
+			if (ret)
+				goto out;
+			n += i;
+			i = 0;
 		}
 	}
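
The conversion above replaces the nested for_each_sg() plus per-page inner loop with for_each_sg_dma_page(), which presents the scatterlist as one flat stream of DMA pages regardless of segment lengths. A userspace model of that flattening; seg and page_iter are invented types standing in for the scatterlist machinery:

#include <stdio.h>

struct seg { unsigned long long dma; int pages; };

struct page_iter { const struct seg *s; int nseg, i, k; };

/* Yield the next DMA page across segment boundaries -- the service
 * for_each_sg_dma_page() provides so the driver loop stays flat. */
static int next_page(struct page_iter *it, unsigned long long *addr)
{
	while (it->i < it->nseg) {
		if (it->k < it->s[it->i].pages) {
			*addr = it->s[it->i].dma + it->k++ * 4096ULL;
			return 1;
		}
		it->i++;
		it->k = 0;
	}
	return 0;
}

int main(void)
{
	/* Two DMA segments of different lengths, like a scatterlist. */
	struct seg segs[] = { { 0x100000, 3 }, { 0x200000, 2 } };
	struct page_iter it = { segs, 2, 0, 0 };
	unsigned long long addr;

	while (next_page(&it, &addr))
		printf("page 0x%llx\n", addr);
	return 0;
}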
 
@@ -941,10 +1091,8 @@
 				     struct hns_roce_mr *mr,
 				     struct ib_umem *umem)
 {
-	struct scatterlist *sg;
-	int i = 0, j = 0, k;
-	int entry;
-	int len;
+	struct sg_dma_page_iter sg_iter;
+	int i = 0, j = 0;
 	u64 page_addr;
 	u32 pbl_bt_sz;
 
@@ -952,27 +1100,23 @@
 		return 0;
 
 	pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT);
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		len = sg_dma_len(sg) >> PAGE_SHIFT;
-		for (k = 0; k < len; ++k) {
-			page_addr = sg_dma_address(sg) +
-				    (k << umem->page_shift);
+	for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+		page_addr = sg_page_iter_dma_address(&sg_iter);
+		if (!hr_dev->caps.pbl_hop_num) {
+			/* for hip06, page addr is aligned to 4K */
+			mr->pbl_buf[i++] = page_addr >> 12;
+		} else if (hr_dev->caps.pbl_hop_num == 1) {
+			mr->pbl_buf[i++] = page_addr;
+		} else {
+			if (hr_dev->caps.pbl_hop_num == 2)
+				mr->pbl_bt_l1[i][j] = page_addr;
+			else if (hr_dev->caps.pbl_hop_num == 3)
+				mr->pbl_bt_l2[i][j] = page_addr;
 
-			if (!hr_dev->caps.pbl_hop_num) {
-				mr->pbl_buf[i++] = page_addr >> 12;
-			} else if (hr_dev->caps.pbl_hop_num == 1) {
-				mr->pbl_buf[i++] = page_addr;
-			} else {
-				if (hr_dev->caps.pbl_hop_num == 2)
-					mr->pbl_bt_l1[i][j] = page_addr;
-				else if (hr_dev->caps.pbl_hop_num == 3)
-					mr->pbl_bt_l2[i][j] = page_addr;
-
-				j++;
-				if (j >= (pbl_bt_sz / 8)) {
-					i++;
-					j = 0;
-				}
+			j++;
+			if (j >= (pbl_bt_sz / BA_BYTE_LEN)) {
+				i++;
+				j = 0;
 			}
 		}
 	}
@@ -999,8 +1143,7 @@
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
-	mr->umem = ib_umem_get(pd->uobject->context, start, length,
-			       access_flags, 0);
+	mr->umem = ib_umem_get(udata, start, length, access_flags, 0);
 	if (IS_ERR(mr->umem)) {
 		ret = PTR_ERR(mr->umem);
 		goto err_free;
@@ -1017,20 +1160,23 @@
 			goto err_umem;
 		}
 	} else {
-		int pbl_size = 1;
+		u64 pbl_size = 1;
 
-		bt_size = (1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT)) / 8;
+		bt_size = (1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT)) /
+			  BA_BYTE_LEN;
 		for (i = 0; i < hr_dev->caps.pbl_hop_num; i++)
 			pbl_size *= bt_size;
 		if (n > pbl_size) {
 			dev_err(dev,
-			    " MR len %lld err. MR page num is limited to %d!\n",
+			    "MR len %lld err. MR page num is limited to %lld!\n",
 			    length, pbl_size);
 			ret = -EINVAL;
 			goto err_umem;
 		}
 	}
 
+	mr->type = MR_TYPE_MR;
+
 	ret = hns_roce_mr_alloc(hr_dev, to_hr_pd(pd)->pdn, virt_addr, length,
 				access_flags, n, mr);
 	if (ret)
@@ -1059,6 +1205,83 @@
 	return ERR_PTR(ret);
 }
 
+static int rereg_mr_trans(struct ib_mr *ibmr, int flags,
+			  u64 start, u64 length,
+			  u64 virt_addr, int mr_access_flags,
+			  struct hns_roce_cmd_mailbox *mailbox,
+			  u32 pdn, struct ib_udata *udata)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device);
+	struct hns_roce_mr *mr = to_hr_mr(ibmr);
+	struct device *dev = hr_dev->dev;
+	int npages;
+	int ret;
+
+	if (mr->size != ~0ULL) {
+		npages = ib_umem_page_count(mr->umem);
+
+		if (hr_dev->caps.pbl_hop_num)
+			hns_roce_mhop_free(hr_dev, mr);
+		else
+			dma_free_coherent(dev, npages * 8,
+					  mr->pbl_buf, mr->pbl_dma_addr);
+	}
+	ib_umem_release(mr->umem);
+
+	mr->umem = ib_umem_get(udata, start, length, mr_access_flags, 0);
+	if (IS_ERR(mr->umem)) {
+		ret = PTR_ERR(mr->umem);
+		mr->umem = NULL;
+		return -ENOMEM;
+	}
+	npages = ib_umem_page_count(mr->umem);
+
+	if (hr_dev->caps.pbl_hop_num) {
+		ret = hns_roce_mhop_alloc(hr_dev, npages, mr);
+		if (ret)
+			goto release_umem;
+	} else {
+		mr->pbl_buf = dma_alloc_coherent(dev, npages * 8,
+						 &(mr->pbl_dma_addr),
+						 GFP_KERNEL);
+		if (!mr->pbl_buf) {
+			ret = -ENOMEM;
+			goto release_umem;
+		}
+	}
+
+	ret = hr_dev->hw->rereg_write_mtpt(hr_dev, mr, flags, pdn,
+					   mr_access_flags, virt_addr,
+					   length, mailbox->buf);
+	if (ret)
+		goto release_umem;
+
+	ret = hns_roce_ib_umem_write_mr(hr_dev, mr, mr->umem);
+	if (ret) {
+		if (mr->size != ~0ULL) {
+			npages = ib_umem_page_count(mr->umem);
+
+			if (hr_dev->caps.pbl_hop_num)
+				hns_roce_mhop_free(hr_dev, mr);
+			else
+				dma_free_coherent(dev, npages * 8,
+						  mr->pbl_buf,
+						  mr->pbl_dma_addr);
+		}
+
+		goto release_umem;
+	}
+
+	return 0;
+
+release_umem:
+	ib_umem_release(mr->umem);
+	return ret;
+}
 int hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, u64 length,
 			   u64 virt_addr, int mr_access_flags, struct ib_pd *pd,
 			   struct ib_udata *udata)
@@ -1069,7 +1292,6 @@
 	struct device *dev = hr_dev->dev;
 	unsigned long mtpt_idx;
 	u32 pdn = 0;
-	int npages;
 	int ret;
 
 	if (!mr->enabled)
@@ -1096,73 +1318,25 @@
 		pdn = to_hr_pd(pd)->pdn;
 
 	if (flags & IB_MR_REREG_TRANS) {
-		if (mr->size != ~0ULL) {
-			npages = ib_umem_page_count(mr->umem);
-
-			if (hr_dev->caps.pbl_hop_num)
-				hns_roce_mhop_free(hr_dev, mr);
-			else
-				dma_free_coherent(dev, npages * 8, mr->pbl_buf,
-						  mr->pbl_dma_addr);
-		}
-		ib_umem_release(mr->umem);
-
-		mr->umem = ib_umem_get(ibmr->uobject->context, start, length,
-				       mr_access_flags, 0);
-		if (IS_ERR(mr->umem)) {
-			ret = PTR_ERR(mr->umem);
-			mr->umem = NULL;
+		ret = rereg_mr_trans(ibmr, flags,
+				     start, length,
+				     virt_addr, mr_access_flags,
+				     mailbox, pdn, udata);
+		if (ret)
 			goto free_cmd_mbox;
-		}
-		npages = ib_umem_page_count(mr->umem);
-
-		if (hr_dev->caps.pbl_hop_num) {
-			ret = hns_roce_mhop_alloc(hr_dev, npages, mr);
-			if (ret)
-				goto release_umem;
-		} else {
-			mr->pbl_buf = dma_alloc_coherent(dev, npages * 8,
-							 &(mr->pbl_dma_addr),
-							 GFP_KERNEL);
-			if (!mr->pbl_buf) {
-				ret = -ENOMEM;
-				goto release_umem;
-			}
-		}
-	}
-
-	ret = hr_dev->hw->rereg_write_mtpt(hr_dev, mr, flags, pdn,
-					   mr_access_flags, virt_addr,
-					   length, mailbox->buf);
-	if (ret) {
-		if (flags & IB_MR_REREG_TRANS)
-			goto release_umem;
-		else
+	} else {
+		ret = hr_dev->hw->rereg_write_mtpt(hr_dev, mr, flags, pdn,
+						   mr_access_flags, virt_addr,
+						   length, mailbox->buf);
+		if (ret)
 			goto free_cmd_mbox;
 	}
 
-	if (flags & IB_MR_REREG_TRANS) {
-		ret = hns_roce_ib_umem_write_mr(hr_dev, mr, mr->umem);
-		if (ret) {
-			if (mr->size != ~0ULL) {
-				npages = ib_umem_page_count(mr->umem);
-
-				if (hr_dev->caps.pbl_hop_num)
-					hns_roce_mhop_free(hr_dev, mr);
-				else
-					dma_free_coherent(dev, npages * 8,
-							  mr->pbl_buf,
-							  mr->pbl_dma_addr);
-			}
-
-			goto release_umem;
-		}
-	}
-
 	ret = hns_roce_sw2hw_mpt(hr_dev, mailbox, mtpt_idx);
 	if (ret) {
 		dev_err(dev, "SW2HW_MPT failed (%d)\n", ret);
-		goto release_umem;
+		ib_umem_release(mr->umem);
+		goto free_cmd_mbox;
 	}
 
 	mr->enabled = 1;
@@ -1173,31 +1347,331 @@
 
 	return 0;
 
-release_umem:
-	ib_umem_release(mr->umem);
-
 free_cmd_mbox:
 	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
 
 	return ret;
 }
 
-int hns_roce_dereg_mr(struct ib_mr *ibmr)
+int hns_roce_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device);
 	struct hns_roce_mr *mr = to_hr_mr(ibmr);
 	int ret = 0;
 
 	if (hr_dev->hw->dereg_mr) {
-		ret = hr_dev->hw->dereg_mr(hr_dev, mr);
+		ret = hr_dev->hw->dereg_mr(hr_dev, mr, udata);
 	} else {
 		hns_roce_mr_free(hr_dev, mr);
 
-		if (mr->umem)
-			ib_umem_release(mr->umem);
-
+		ib_umem_release(mr->umem);
 		kfree(mr);
 	}
 
 	return ret;
 }
+
+struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+				u32 max_num_sg, struct ib_udata *udata)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
+	struct device *dev = hr_dev->dev;
+	struct hns_roce_mr *mr;
+	u64 length;
+	u32 page_size;
+	int ret;
+
+	page_size = 1 << (hr_dev->caps.pbl_buf_pg_sz + PAGE_SHIFT);
+	length = max_num_sg * page_size;
+
+	if (mr_type != IB_MR_TYPE_MEM_REG)
+		return ERR_PTR(-EINVAL);
+
+	if (max_num_sg > HNS_ROCE_FRMR_MAX_PA) {
+		dev_err(dev, "max_num_sg is larger than %d\n",
+			HNS_ROCE_FRMR_MAX_PA);
+		return ERR_PTR(-EINVAL);
+	}
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	mr->type = MR_TYPE_FRMR;
+
+	/* Allocate memory region key */
+	ret = hns_roce_mr_alloc(hr_dev, to_hr_pd(pd)->pdn, 0, length,
+				0, max_num_sg, mr);
+	if (ret)
+		goto err_free;
+
+	ret = hns_roce_mr_enable(hr_dev, mr);
+	if (ret)
+		goto err_mr;
+
+	mr->ibmr.rkey = mr->ibmr.lkey = mr->key;
+	mr->umem = NULL;
+
+	return &mr->ibmr;
+
+err_mr:
+	hns_roce_mr_free(to_hr_dev(pd->device), mr);
+
+err_free:
+	kfree(mr);
+	return ERR_PTR(ret);
+}
+
+static int hns_roce_set_page(struct ib_mr *ibmr, u64 addr)
+{
+	struct hns_roce_mr *mr = to_hr_mr(ibmr);
+
+	mr->pbl_buf[mr->npages++] = addr;
+
+	return 0;
+}
+
+int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+		       unsigned int *sg_offset)
+{
+	struct hns_roce_mr *mr = to_hr_mr(ibmr);
+
+	mr->npages = 0;
+
+	return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, hns_roce_set_page);
+}
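
hns_roce_map_mr_sg() above delegates the scatterlist walk to ib_sg_to_pages(), which invokes the driver's per-page callback; the callback merely appends to the PBL and reports when it is full. A self-contained sketch of the callback-driven fill; MAX_PA, fill() and the types here are illustrative:

#include <stdio.h>

#define MAX_PA 8

struct mr { unsigned long long pbl[MAX_PA]; int npages; };

/* Per-page callback, in the spirit of hns_roce_set_page() above. */
static int set_page(struct mr *mr, unsigned long long addr)
{
	if (mr->npages >= MAX_PA)
		return -1;
	mr->pbl[mr->npages++] = addr;
	return 0;
}

/* Stand-in for ib_sg_to_pages(): feed every page to the callback
 * and return how many pages were actually mapped. */
static int fill(struct mr *mr, const unsigned long long *pages, int n)
{
	int i;

	mr->npages = 0;
	for (i = 0; i < n; i++)
		if (set_page(mr, pages[i]))
			return i;
	return n;
}

int main(void)
{
	unsigned long long pages[] = { 0x1000, 0x2000, 0x3000 };
	struct mr mr;

	printf("mapped %d\n", fill(&mr, pages, 3));
	return 0;
}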
+
+static void hns_roce_mw_free(struct hns_roce_dev *hr_dev,
+			     struct hns_roce_mw *mw)
+{
+	struct device *dev = hr_dev->dev;
+	int ret;
+
+	if (mw->enabled) {
+		ret = hns_roce_hw2sw_mpt(hr_dev, NULL, key_to_hw_index(mw->rkey)
+					 & (hr_dev->caps.num_mtpts - 1));
+		if (ret)
+			dev_warn(dev, "MW HW2SW_MPT failed (%d)\n", ret);
+
+		hns_roce_table_put(hr_dev, &hr_dev->mr_table.mtpt_table,
+				   key_to_hw_index(mw->rkey));
+	}
+
+	hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap,
+			     key_to_hw_index(mw->rkey), BITMAP_NO_RR);
+}
+
+static int hns_roce_mw_enable(struct hns_roce_dev *hr_dev,
+			      struct hns_roce_mw *mw)
+{
+	struct hns_roce_mr_table *mr_table = &hr_dev->mr_table;
+	struct hns_roce_cmd_mailbox *mailbox;
+	struct device *dev = hr_dev->dev;
+	unsigned long mtpt_idx = key_to_hw_index(mw->rkey);
+	int ret;
+
+	/* prepare HEM entry memory */
+	ret = hns_roce_table_get(hr_dev, &mr_table->mtpt_table, mtpt_idx);
+	if (ret)
+		return ret;
+
+	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+	if (IS_ERR(mailbox)) {
+		ret = PTR_ERR(mailbox);
+		goto err_table;
+	}
+
+	ret = hr_dev->hw->mw_write_mtpt(mailbox->buf, mw);
+	if (ret) {
+		dev_err(dev, "MW write mtpt failed!\n");
+		goto err_page;
+	}
+
+	ret = hns_roce_sw2hw_mpt(hr_dev, mailbox,
+				 mtpt_idx & (hr_dev->caps.num_mtpts - 1));
+	if (ret) {
+		dev_err(dev, "MW sw2hw_mpt failed (%d)\n", ret);
+		goto err_page;
+	}
+
+	mw->enabled = 1;
+
+	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+
+	return 0;
+
+err_page:
+	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+
+err_table:
+	hns_roce_table_put(hr_dev, &mr_table->mtpt_table, mtpt_idx);
+
+	return ret;
+}
+
+struct ib_mw *hns_roce_alloc_mw(struct ib_pd *ib_pd, enum ib_mw_type type,
+				struct ib_udata *udata)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ib_pd->device);
+	struct hns_roce_mw *mw;
+	unsigned long index = 0;
+	int ret;
+
+	mw = kmalloc(sizeof(*mw), GFP_KERNEL);
+	if (!mw)
+		return ERR_PTR(-ENOMEM);
+
+	/* Allocate a key for mw from bitmap */
+	ret = hns_roce_bitmap_alloc(&hr_dev->mr_table.mtpt_bitmap, &index);
+	if (ret)
+		goto err_bitmap;
+
+	mw->rkey = hw_index_to_key(index);
+
+	mw->ibmw.rkey = mw->rkey;
+	mw->ibmw.type = type;
+	mw->pdn = to_hr_pd(ib_pd)->pdn;
+	mw->pbl_hop_num = hr_dev->caps.pbl_hop_num;
+	mw->pbl_ba_pg_sz = hr_dev->caps.pbl_ba_pg_sz;
+	mw->pbl_buf_pg_sz = hr_dev->caps.pbl_buf_pg_sz;
+
+	ret = hns_roce_mw_enable(hr_dev, mw);
+	if (ret)
+		goto err_mw;
+
+	return &mw->ibmw;
+
+err_mw:
+	hns_roce_mw_free(hr_dev, mw);
+
+err_bitmap:
+	kfree(mw);
+
+	return ERR_PTR(ret);
+}
+
+int hns_roce_dealloc_mw(struct ib_mw *ibmw)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibmw->device);
+	struct hns_roce_mw *mw = to_hr_mw(ibmw);
+
+	hns_roce_mw_free(hr_dev, mw);
+	kfree(mw);
+
+	return 0;
+}
+
+void hns_roce_mtr_init(struct hns_roce_mtr *mtr, int bt_pg_shift,
+		       int buf_pg_shift)
+{
+	hns_roce_hem_list_init(&mtr->hem_list, bt_pg_shift);
+	mtr->buf_pg_shift = buf_pg_shift;
+}
+
+void hns_roce_mtr_cleanup(struct hns_roce_dev *hr_dev,
+			  struct hns_roce_mtr *mtr)
+{
+	hns_roce_hem_list_release(hr_dev, &mtr->hem_list);
+}
+
+static int hns_roce_write_mtr(struct hns_roce_dev *hr_dev,
+			      struct hns_roce_mtr *mtr, dma_addr_t *bufs,
+			      struct hns_roce_buf_region *r)
+{
+	int offset;
+	int count;
+	int npage;
+	u64 *mtts;
+	int end;
+	int i;
+
+	offset = r->offset;
+	end = offset + r->count;
+	npage = 0;
+	while (offset < end) {
+		mtts = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list,
+						  offset, &count, NULL);
+		if (!mtts)
+			return -ENOBUFS;
+
+		/* Save page addr, low 12 bits : 0 */
+		for (i = 0; i < count; i++) {
+			if (hr_dev->hw_rev == HNS_ROCE_HW_VER1)
+				mtts[i] = bufs[npage] >> PAGE_ADDR_SHIFT;
+			else
+				mtts[i] = bufs[npage];
+
+			npage++;
+		}
+		offset += count;
+	}
+
+	return 0;
+}
+
+int hns_roce_mtr_attach(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+			dma_addr_t **bufs, struct hns_roce_buf_region *regions,
+			int region_cnt)
+{
+	struct hns_roce_buf_region *r;
+	int ret;
+	int i;
+
+	ret = hns_roce_hem_list_request(hr_dev, &mtr->hem_list, regions,
+					region_cnt);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < region_cnt; i++) {
+		r = &regions[i];
+		ret = hns_roce_write_mtr(hr_dev, mtr, bufs[i], r);
+		if (ret) {
+			dev_err(hr_dev->dev,
+				"write mtr[%d/%d] err %d, offset=%d.\n",
+				i, region_cnt, ret, r->offset);
+			goto err_write;
+		}
+	}
+
+	return 0;
+
+err_write:
+	hns_roce_hem_list_release(hr_dev, &mtr->hem_list);
+
+	return ret;
+}
+
+int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+		      int offset, u64 *mtt_buf, int mtt_max, u64 *base_addr)
+{
+	u64 *mtts = mtt_buf;
+	int mtt_count;
+	int total = 0;
+	u64 *addr;
+	int npage;
+	int left;
+
+	if (mtts == NULL || mtt_max < 1)
+		goto done;
+
+	left = mtt_max;
+	while (left > 0) {
+		mtt_count = 0;
+		addr = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list,
+						  offset + total,
+						  &mtt_count, NULL);
+		if (!addr || !mtt_count)
+			goto done;
+
+		npage = min(mtt_count, left);
+		memcpy(&mtts[total], addr, BA_BYTE_LEN * npage);
+		left -= npage;
+		total += npage;
+	}
+
+done:
+	if (base_addr)
+		*base_addr = mtr->hem_list.root_ba;
+
+	return total;
+}
diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c
index e11c149..912b89b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_pd.c
+++ b/drivers/infiniband/hw/hns/hns_roce_pd.c
@@ -57,54 +57,42 @@
 	hns_roce_bitmap_cleanup(&hr_dev->pd_bitmap);
 }
 
-struct ib_pd *hns_roce_alloc_pd(struct ib_device *ib_dev,
-				struct ib_ucontext *context,
-				struct ib_udata *udata)
+int hns_roce_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
+	struct ib_device *ib_dev = ibpd->device;
 	struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
 	struct device *dev = hr_dev->dev;
-	struct hns_roce_pd *pd;
+	struct hns_roce_pd *pd = to_hr_pd(ibpd);
 	int ret;
 
-	pd = kmalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd)
-		return ERR_PTR(-ENOMEM);
-
 	ret = hns_roce_pd_alloc(to_hr_dev(ib_dev), &pd->pdn);
 	if (ret) {
-		kfree(pd);
 		dev_err(dev, "[alloc_pd]hns_roce_pd_alloc failed!\n");
-		return ERR_PTR(ret);
+		return ret;
 	}
 
-	if (context) {
+	if (udata) {
 		struct hns_roce_ib_alloc_pd_resp uresp = {.pdn = pd->pdn};
 
 		if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
 			hns_roce_pd_free(to_hr_dev(ib_dev), pd->pdn);
 			dev_err(dev, "[alloc_pd]ib_copy_to_udata failed!\n");
-			kfree(pd);
-			return ERR_PTR(-EFAULT);
+			return -EFAULT;
 		}
 	}
 
-	return &pd->ibpd;
-}
-EXPORT_SYMBOL_GPL(hns_roce_alloc_pd);
-
-int hns_roce_dealloc_pd(struct ib_pd *pd)
-{
-	hns_roce_pd_free(to_hr_dev(pd->device), to_hr_pd(pd)->pdn);
-	kfree(to_hr_pd(pd));
-
 	return 0;
 }
-EXPORT_SYMBOL_GPL(hns_roce_dealloc_pd);
+
+void hns_roce_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
+{
+	hns_roce_pd_free(to_hr_dev(pd->device), to_hr_pd(pd)->pdn);
+}
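
The PD conversion above follows the rdma core's move to core-allocated objects: the core allocates struct hns_roce_pd and hands the driver the embedded ib_pd, so the driver's kmalloc/kfree and ERR_PTR plumbing disappears and to_hr_pd() recovers the outer struct. A userspace sketch of the embed-and-container_of pattern, with simplified types:

#include <stdio.h>
#include <stddef.h>

struct ib_pd { int dummy; };

/* Driver PD embeds the core object; the core allocates the outer
 * struct, the driver only fills its private fields. */
struct hr_pd { unsigned long pdn; struct ib_pd ibpd; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct hr_pd *to_hr_pd(struct ib_pd *ibpd)
{
	return container_of(ibpd, struct hr_pd, ibpd);
}

int main(void)
{
	struct hr_pd pd = { .pdn = 42 };

	/* Recover the driver object from the core pointer. */
	printf("pdn=%lu\n", to_hr_pd(&pd.ibpd)->pdn);
	return 0;
}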
 
 int hns_roce_uar_alloc(struct hns_roce_dev *hr_dev, struct hns_roce_uar *uar)
 {
 	struct resource *res;
-	int ret = 0;
+	int ret;
 
 	/* Using bitmap to manager UAR index */
 	ret = hns_roce_bitmap_alloc(&hr_dev->uar_table.bitmap, &uar->logic_idx);
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index efb7e96..bd78ff9 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -31,9 +31,11 @@
  * SOFTWARE.
  */
 
+#include <linux/pci.h>
 #include <linux/platform_device.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_umem.h>
+#include <rdma/uverbs_ioctl.h>
 #include "hns_roce_common.h"
 #include "hns_roce_device.h"
 #include "hns_roce_hem.h"
@@ -43,17 +45,14 @@
 
 void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type)
 {
-	struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
 	struct device *dev = hr_dev->dev;
 	struct hns_roce_qp *qp;
 
-	spin_lock(&qp_table->lock);
-
+	xa_lock(&hr_dev->qp_table_xa);
 	qp = __hns_roce_qp_lookup(hr_dev, qpn);
 	if (qp)
 		atomic_inc(&qp->refcount);
-
-	spin_unlock(&qp_table->lock);
+	xa_unlock(&hr_dev->qp_table_xa);
 
 	if (!qp) {
 		dev_warn(dev, "Async event for bogus QP %08x\n", qpn);
@@ -65,7 +64,6 @@
 	if (atomic_dec_and_test(&qp->refcount))
 		complete(&qp->free);
 }
-EXPORT_SYMBOL_GPL(hns_roce_qp_event);
 
 static void hns_roce_ib_qp_event(struct hns_roce_qp *hr_qp,
 				 enum hns_roce_event type)
@@ -140,34 +138,24 @@
 		return HNS_ROCE_QP_NUM_STATE;
 	}
 }
-EXPORT_SYMBOL_GPL(to_hns_roce_state);
 
 static int hns_roce_gsi_qp_alloc(struct hns_roce_dev *hr_dev, unsigned long qpn,
 				 struct hns_roce_qp *hr_qp)
 {
-	struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
+	struct xarray *xa = &hr_dev->qp_table_xa;
 	int ret;
 
 	if (!qpn)
 		return -EINVAL;
 
 	hr_qp->qpn = qpn;
-
-	spin_lock_irq(&qp_table->lock);
-	ret = radix_tree_insert(&hr_dev->qp_table_tree,
-				hr_qp->qpn & (hr_dev->caps.num_qps - 1), hr_qp);
-	spin_unlock_irq(&qp_table->lock);
-	if (ret) {
-		dev_err(hr_dev->dev, "QPC radix_tree_insert failed\n");
-		goto err_put_irrl;
-	}
-
 	atomic_set(&hr_qp->refcount, 1);
 	init_completion(&hr_qp->free);
 
-	return 0;
-
-err_put_irrl:
+	ret = xa_err(xa_store_irq(xa, hr_qp->qpn & (hr_dev->caps.num_qps - 1),
+				hr_qp, GFP_KERNEL));
+	if (ret)
+		dev_err(hr_dev->dev, "QPC xa_store failed\n");
 
 	return ret;
 }
@@ -208,20 +196,27 @@
 		}
 	}
 
-	spin_lock_irq(&qp_table->lock);
-	ret = radix_tree_insert(&hr_dev->qp_table_tree,
-				hr_qp->qpn & (hr_dev->caps.num_qps - 1), hr_qp);
-	spin_unlock_irq(&qp_table->lock);
-	if (ret) {
-		dev_err(dev, "QPC radix_tree_insert failed\n");
-		goto err_put_trrl;
+	if (hr_dev->caps.sccc_entry_sz) {
+		/* Alloc memory for SCC CTX */
+		ret = hns_roce_table_get(hr_dev, &qp_table->sccc_table,
+					 hr_qp->qpn);
+		if (ret) {
+			dev_err(dev, "SCC CTX table get failed\n");
+			goto err_put_trrl;
+		}
 	}
 
-	atomic_set(&hr_qp->refcount, 1);
-	init_completion(&hr_qp->free);
+	ret = hns_roce_gsi_qp_alloc(hr_dev, qpn, hr_qp);
+	if (ret)
+		goto err_put_sccc;
 
 	return 0;
 
+err_put_sccc:
+	if (hr_dev->caps.sccc_entry_sz)
+		hns_roce_table_put(hr_dev, &qp_table->sccc_table,
+				   hr_qp->qpn);
+
 err_put_trrl:
 	if (hr_dev->caps.trrl_entry_sz)
 		hns_roce_table_put(hr_dev, &qp_table->trrl_table, hr_qp->qpn);
@@ -238,15 +233,13 @@
 
 void hns_roce_qp_remove(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 {
-	struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
+	struct xarray *xa = &hr_dev->qp_table_xa;
 	unsigned long flags;
 
-	spin_lock_irqsave(&qp_table->lock, flags);
-	radix_tree_delete(&hr_dev->qp_table_tree,
-			  hr_qp->qpn & (hr_dev->caps.num_qps - 1));
-	spin_unlock_irqrestore(&qp_table->lock, flags);
+	xa_lock_irqsave(xa, flags);
+	__xa_erase(xa, hr_qp->qpn & (hr_dev->caps.num_qps - 1));
+	xa_unlock_irqrestore(xa, flags);
 }
-EXPORT_SYMBOL_GPL(hns_roce_qp_remove);
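
The QP table now lives in an xarray, whose internal lock replaces the driver's qp_table->lock, and xa_store_irq()/__xa_erase() replace radix_tree_insert()/radix_tree_delete(). A minimal out-of-tree module sketch of the same calls; qp_xa and demo_val are invented for the demo:

#include <linux/module.h>
#include <linux/xarray.h>

static DEFINE_XARRAY(qp_xa);
static int demo_val = 42;

static int __init xa_demo_init(void)
{
	int *p;
	int ret;

	/* xa_store_irq() takes the xarray's own lock with IRQs off,
	 * mirroring the hns_roce_gsi_qp_alloc() change above. */
	ret = xa_err(xa_store_irq(&qp_xa, 7, &demo_val, GFP_KERNEL));
	if (ret)
		return ret;

	p = xa_load(&qp_xa, 7);	/* lockless RCU lookup */
	if (p)
		pr_info("xa lookup: %d\n", *p);

	xa_erase_irq(&qp_xa, 7);
	return 0;
}

static void __exit xa_demo_exit(void)
{
	xa_destroy(&qp_xa);
}

module_init(xa_demo_init);
module_exit(xa_demo_exit);
MODULE_LICENSE("GPL");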
 
 void hns_roce_qp_free(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 {
@@ -261,25 +254,22 @@
 			hns_roce_table_put(hr_dev, &qp_table->trrl_table,
 					   hr_qp->qpn);
 		hns_roce_table_put(hr_dev, &qp_table->irrl_table, hr_qp->qpn);
-		hns_roce_table_put(hr_dev, &qp_table->qp_table, hr_qp->qpn);
 	}
 }
-EXPORT_SYMBOL_GPL(hns_roce_qp_free);
 
 void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn,
 			       int cnt)
 {
 	struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
 
-	if (base_qpn < SQP_NUM)
+	if (base_qpn < hr_dev->caps.reserved_qps)
 		return;
 
 	hns_roce_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt, BITMAP_RR);
 }
-EXPORT_SYMBOL_GPL(hns_roce_release_range_qp);
 
 static int hns_roce_set_rq_size(struct hns_roce_dev *hr_dev,
-				struct ib_qp_cap *cap, int is_user, int has_srq,
+				struct ib_qp_cap *cap, bool is_user, int has_rq,
 				struct hns_roce_qp *hr_qp)
 {
 	struct device *dev = hr_dev->dev;
@@ -293,14 +283,12 @@
 		return -EINVAL;
 	}
 
-	/* If srq exit, set zero for relative number of rq */
-	if (has_srq) {
-		if (cap->max_recv_wr) {
-			dev_dbg(dev, "srq no need config max_recv_wr\n");
-			return -EINVAL;
-		}
-
-		hr_qp->rq.wqe_cnt = hr_qp->rq.max_gs = 0;
+	/* If the QP has no RQ (e.g. an SRQ is attached), zero the RQ parameters */
+	if (!has_rq) {
+		hr_qp->rq.wqe_cnt = 0;
+		hr_qp->rq.max_gs = 0;
+		cap->max_recv_wr = 0;
+		cap->max_recv_sge = 0;
 	} else {
 		if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) {
 			dev_err(dev, "user space no need config max_recv_wr max_recv_sge\n");
@@ -336,30 +324,46 @@
 	return 0;
 }
 
-static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev,
-				     struct ib_qp_cap *cap,
-				     struct hns_roce_qp *hr_qp,
-				     struct hns_roce_ib_create_qp *ucmd)
+static int check_sq_size_with_integrity(struct hns_roce_dev *hr_dev,
+					struct ib_qp_cap *cap,
+					struct hns_roce_ib_create_qp *ucmd)
 {
 	u32 roundup_sq_stride = roundup_pow_of_two(hr_dev->caps.max_sq_desc_sz);
 	u8 max_sq_stride = ilog2(roundup_sq_stride);
-	u32 page_size;
-	u32 max_cnt;
 
 	/* Sanity check SQ size before proceeding */
 	if ((u32)(1 << ucmd->log_sq_bb_count) > hr_dev->caps.max_wqes ||
 	     ucmd->log_sq_stride > max_sq_stride ||
 	     ucmd->log_sq_stride < HNS_ROCE_IB_MIN_SQ_STRIDE) {
-		dev_err(hr_dev->dev, "check SQ size error!\n");
+		ibdev_err(&hr_dev->ib_dev, "check SQ size error!\n");
 		return -EINVAL;
 	}
 
 	if (cap->max_send_sge > hr_dev->caps.max_sq_sg) {
-		dev_err(hr_dev->dev, "SQ sge error! max_send_sge=%d\n",
-			cap->max_send_sge);
+		ibdev_err(&hr_dev->ib_dev, "SQ sge error! max_send_sge=%d\n",
+			  cap->max_send_sge);
 		return -EINVAL;
 	}
 
+	return 0;
+}
+
+static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev,
+				     struct ib_qp_cap *cap,
+				     struct hns_roce_qp *hr_qp,
+				     struct hns_roce_ib_create_qp *ucmd)
+{
+	u32 ex_sge_num;
+	u32 page_size;
+	u32 max_cnt;
+	int ret;
+
+	ret = check_sq_size_with_integrity(hr_dev, cap, ucmd);
+	if (ret) {
+		ibdev_err(&hr_dev->ib_dev, "Sanity check sq size failed\n");
+		return ret;
+	}
+
 	hr_qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count;
 	hr_qp->sq.wqe_shift = ucmd->log_sq_stride;
 
@@ -372,7 +376,18 @@
 	if (hr_qp->sq.max_gs > 2)
 		hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt *
 							(hr_qp->sq.max_gs - 2));
+
+	if ((hr_qp->sq.max_gs > 2) && (hr_dev->pci_dev->revision == 0x20)) {
+		if (hr_qp->sge.sge_cnt > hr_dev->caps.max_extend_sg) {
+			dev_err(hr_dev->dev,
+				"Invalid extended sge cnt! sge_cnt=%d\n",
+				hr_qp->sge.sge_cnt);
+			return -EINVAL;
+		}
+	}
+
 	hr_qp->sge.sge_shift = 4;
+	ex_sge_num = hr_qp->sge.sge_cnt;
 
 	/* Get buf size, SQ and RQ are aligned to page_size */
 	if (hr_dev->caps.max_sq_sg <= 2) {
@@ -386,6 +401,8 @@
 					     hr_qp->sq.wqe_shift), PAGE_SIZE);
 	} else {
 		page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
+		hr_qp->sge.sge_cnt = ex_sge_num ?
+		   max(page_size / (1 << hr_qp->sge.sge_shift), ex_sge_num) : 0;
 		hr_qp->buff_size = HNS_ROCE_ALOGN_UP((hr_qp->rq.wqe_cnt <<
 					     hr_qp->rq.wqe_shift), page_size) +
 				   HNS_ROCE_ALOGN_UP((hr_qp->sge.sge_cnt <<
@@ -394,7 +411,7 @@
 					     hr_qp->sq.wqe_shift), page_size);
 
 		hr_qp->sq.offset = 0;
-		if (hr_qp->sge.sge_cnt) {
+		if (ex_sge_num) {
 			hr_qp->sge.offset = HNS_ROCE_ALOGN_UP(
 							(hr_qp->sq.wqe_cnt <<
 							hr_qp->sq.wqe_shift),
@@ -414,6 +431,120 @@
 	return 0;
 }
 
+static int split_wqe_buf_region(struct hns_roce_dev *hr_dev,
+				struct hns_roce_qp *hr_qp,
+				struct hns_roce_buf_region *regions,
+				int region_max, int page_shift)
+{
+	int page_size = 1 << page_shift;
+	bool is_extend_sge;
+	int region_cnt = 0;
+	int buf_size;
+	int buf_cnt;
+
+	if (hr_qp->buff_size < 1 || region_max < 1)
+		return region_cnt;
+
+	if (hr_qp->sge.sge_cnt > 0)
+		is_extend_sge = true;
+	else
+		is_extend_sge = false;
+
+	/* sq region */
+	if (is_extend_sge)
+		buf_size = hr_qp->sge.offset - hr_qp->sq.offset;
+	else
+		buf_size = hr_qp->rq.offset - hr_qp->sq.offset;
+
+	if (buf_size > 0 && region_cnt < region_max) {
+		buf_cnt = DIV_ROUND_UP(buf_size, page_size);
+		hns_roce_init_buf_region(&regions[region_cnt],
+					 hr_dev->caps.wqe_sq_hop_num,
+					 hr_qp->sq.offset / page_size,
+					 buf_cnt);
+		region_cnt++;
+	}
+
+	/* sge region */
+	if (is_extend_sge) {
+		buf_size = hr_qp->rq.offset - hr_qp->sge.offset;
+		if (buf_size > 0 && region_cnt < region_max) {
+			buf_cnt = DIV_ROUND_UP(buf_size, page_size);
+			hns_roce_init_buf_region(&regions[region_cnt],
+						 hr_dev->caps.wqe_sge_hop_num,
+						 hr_qp->sge.offset / page_size,
+						 buf_cnt);
+			region_cnt++;
+		}
+	}
+
+	/* rq region */
+	buf_size = hr_qp->buff_size - hr_qp->rq.offset;
+	if (buf_size > 0) {
+		buf_cnt = DIV_ROUND_UP(buf_size, page_size);
+		hns_roce_init_buf_region(&regions[region_cnt],
+					 hr_dev->caps.wqe_rq_hop_num,
+					 hr_qp->rq.offset / page_size,
+					 buf_cnt);
+		region_cnt++;
+	}
+
+	return region_cnt;
+}
+
+static int calc_wqe_bt_page_shift(struct hns_roce_dev *hr_dev,
+				  struct hns_roce_buf_region *regions,
+				  int region_cnt)
+{
+	int bt_pg_shift;
+	int ba_num;
+	int ret;
+
+	bt_pg_shift = PAGE_SHIFT + hr_dev->caps.mtt_ba_pg_sz;
+
+	/* all root ba entries must in one bt page */
+	/* all root ba entries must fit in one bt page */
+		ba_num = (1 << bt_pg_shift) / BA_BYTE_LEN;
+		ret = hns_roce_hem_list_calc_root_ba(regions, region_cnt,
+						     ba_num);
+		if (ret <= ba_num)
+			break;
+
+		bt_pg_shift++;
+	} while (ret > ba_num);
+
+	return bt_pg_shift - PAGE_SHIFT;
+}
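
calc_wqe_bt_page_shift() above grows the BT page until every region's root base address fits in a single page. A runnable sketch of that search, assuming a 4 KiB starting page and 8-byte BA entries (root_ba_cnt is an assumed input):

#include <stdio.h>

/* Grow the page shift until all root BAs fit in one BT page. */
static int calc_shift(int root_ba_cnt)
{
	int shift = 12;			/* start at 4 KiB (assumption) */

	while ((1 << shift) / 8 < root_ba_cnt)
		shift++;
	return shift;
}

int main(void)
{
	/* 600 roots need 4800 bytes of 8-byte slots -> 8 KiB page */
	printf("shift=%d\n", calc_shift(600));	/* prints 13 */
	return 0;
}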
+
+static int set_extend_sge_param(struct hns_roce_dev *hr_dev,
+				struct hns_roce_qp *hr_qp)
+{
+	struct device *dev = hr_dev->dev;
+
+	if (hr_qp->sq.max_gs > 2) {
+		hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt *
+				     (hr_qp->sq.max_gs - 2));
+		hr_qp->sge.sge_shift = 4;
+	}
+
+	/* ud sqwqe's sge uses the extended sge */
+	if (hr_dev->caps.max_sq_sg > 2 && hr_qp->ibqp.qp_type == IB_QPT_GSI) {
+		hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt *
+				     hr_qp->sq.max_gs);
+		hr_qp->sge.sge_shift = 4;
+	}
+
+	if ((hr_qp->sq.max_gs > 2) && hr_dev->pci_dev->revision == 0x20) {
+		if (hr_qp->sge.sge_cnt > hr_dev->caps.max_extend_sg) {
+		dev_err(dev, "Invalid extended sge cnt! sge_cnt=%d\n",
+				hr_qp->sge.sge_cnt);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev,
 				       struct ib_qp_cap *cap,
 				       struct hns_roce_qp *hr_qp)
@@ -422,6 +553,7 @@
 	u32 page_size;
 	u32 max_cnt;
 	int size;
+	int ret;
 
 	if (cap->max_send_wr  > hr_dev->caps.max_wqes  ||
 	    cap->max_send_sge > hr_dev->caps.max_sq_sg ||
@@ -431,8 +563,6 @@
 	}
 
 	hr_qp->sq.wqe_shift = ilog2(hr_dev->caps.max_sq_desc_sz);
-	hr_qp->sq_max_wqes_per_wr = 1;
-	hr_qp->sq_spare_wqes = 0;
 
 	if (hr_dev->caps.min_wqes)
 		max_cnt = max(cap->max_send_wr, hr_dev->caps.min_wqes);
@@ -452,17 +582,10 @@
 	else
 		hr_qp->sq.max_gs = max_cnt;
 
-	if (hr_qp->sq.max_gs > 2) {
-		hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt *
-				     (hr_qp->sq.max_gs - 2));
-		hr_qp->sge.sge_shift = 4;
-	}
-
-	/* ud sqwqe's sge use extend sge */
-	if (hr_dev->caps.max_sq_sg > 2 && hr_qp->ibqp.qp_type == IB_QPT_GSI) {
-		hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt *
-				     hr_qp->sq.max_gs);
-		hr_qp->sge.sge_shift = 4;
+	ret = set_extend_sge_param(hr_dev, hr_qp);
+	if (ret) {
+		dev_err(dev, "set extend sge parameters failed\n");
+		return ret;
 	}
 
 	/* Get buf size, SQ and RQ are aligned to PAGE_SIZE */
@@ -472,6 +595,8 @@
 				 page_size);
 
 	if (hr_dev->caps.max_sq_sg > 2 && hr_qp->sge.sge_cnt) {
+		hr_qp->sge.sge_cnt = max(page_size / (1 << hr_qp->sge.sge_shift),
+					(u32)hr_qp->sge.sge_cnt);
 		hr_qp->sge.offset = size;
 		size += HNS_ROCE_ALOGN_UP(hr_qp->sge.sge_cnt <<
 					  hr_qp->sge.sge_shift, page_size);
@@ -494,7 +619,7 @@
 
 static int hns_roce_qp_has_sq(struct ib_qp_init_attr *attr)
 {
-	if (attr->qp_type == IB_QPT_XRC_TGT)
+	if (attr->qp_type == IB_QPT_XRC_TGT || !attr->cap.max_send_wr)
 		return 0;
 
 	return 1;
@@ -503,25 +628,74 @@
 static int hns_roce_qp_has_rq(struct ib_qp_init_attr *attr)
 {
 	if (attr->qp_type == IB_QPT_XRC_INI ||
-	    attr->qp_type == IB_QPT_XRC_TGT || attr->srq)
+	    attr->qp_type == IB_QPT_XRC_TGT || attr->srq ||
+	    !attr->cap.max_recv_wr)
 		return 0;
 
 	return 1;
 }
 
+static int alloc_rq_inline_buf(struct hns_roce_qp *hr_qp,
+			       struct ib_qp_init_attr *init_attr)
+{
+	u32 max_recv_sge = init_attr->cap.max_recv_sge;
+	struct hns_roce_rinl_wqe *wqe_list;
+	u32 wqe_cnt = hr_qp->rq.wqe_cnt;
+	int i;
+
+	/* allocate recv inline buf */
+	wqe_list = kcalloc(wqe_cnt, sizeof(struct hns_roce_rinl_wqe),
+			   GFP_KERNEL);
+
+	if (!wqe_list)
+		goto err;
+
+	/* Allocate a contiguous buffer for all inline sge we need */
+	wqe_list[0].sg_list = kcalloc(wqe_cnt, (max_recv_sge *
+				      sizeof(struct hns_roce_rinl_sge)),
+				      GFP_KERNEL);
+	if (!wqe_list[0].sg_list)
+		goto err_wqe_list;
+
+	/* Assign buffers of sg_list to each inline wqe */
+	for (i = 1; i < wqe_cnt; i++)
+		wqe_list[i].sg_list = &wqe_list[0].sg_list[i * max_recv_sge];
+
+	hr_qp->rq_inl_buf.wqe_list = wqe_list;
+	hr_qp->rq_inl_buf.wqe_cnt = wqe_cnt;
+
+	return 0;
+
+err_wqe_list:
+	kfree(wqe_list);
+
+err:
+	return -ENOMEM;
+}
+
+static void free_rq_inline_buf(struct hns_roce_qp *hr_qp)
+{
+	kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list);
+	kfree(hr_qp->rq_inl_buf.wqe_list);
+}
+
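
alloc_rq_inline_buf() above makes a single contiguous allocation for every inline SGE of every WQE and points each WQE at its slice, so free_rq_inline_buf() only has to free wqe_list[0].sg_list plus the list itself. A userspace sketch of the slice-out pattern, with simplified types:

#include <stdio.h>
#include <stdlib.h>

struct sge { unsigned long addr; };
struct wqe { struct sge *sg_list; };

int main(void)
{
	int wqe_cnt = 4, max_sge = 2, i;
	struct wqe *wqes = calloc(wqe_cnt, sizeof(*wqes));
	/* One contiguous block for every SGE of every WQE... */
	struct sge *all = calloc((size_t)wqe_cnt * max_sge, sizeof(*all));

	if (!wqes || !all)
		return 1;

	/* ...then hand each WQE its slice, as the driver does. */
	for (i = 0; i < wqe_cnt; i++)
		wqes[i].sg_list = &all[i * max_sge];

	printf("wqe[2] slice at offset %ld\n",
	       (long)(wqes[2].sg_list - all));	/* prints 4 */

	free(wqes[0].sg_list);	/* frees the whole block */
	free(wqes);
	return 0;
}
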
 static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 				     struct ib_pd *ib_pd,
 				     struct ib_qp_init_attr *init_attr,
 				     struct ib_udata *udata, unsigned long sqpn,
 				     struct hns_roce_qp *hr_qp)
 {
+	dma_addr_t *buf_list[ARRAY_SIZE(hr_qp->regions)] = { NULL };
 	struct device *dev = hr_dev->dev;
 	struct hns_roce_ib_create_qp ucmd;
 	struct hns_roce_ib_create_qp_resp resp = {};
+	struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(
+		udata, struct hns_roce_ucontext, ibucontext);
+	struct hns_roce_buf_region *r;
 	unsigned long qpn = 0;
-	int ret = 0;
 	u32 page_shift;
-	u32 npages;
+	int buf_count;
+	int ret;
 	int i;
 
 	mutex_init(&hr_qp->mutex);
@@ -533,107 +707,81 @@
 	hr_qp->ibqp.qp_type = init_attr->qp_type;
 
 	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
-		hr_qp->sq_signal_bits = cpu_to_le32(IB_SIGNAL_ALL_WR);
+		hr_qp->sq_signal_bits = IB_SIGNAL_ALL_WR;
 	else
-		hr_qp->sq_signal_bits = cpu_to_le32(IB_SIGNAL_REQ_WR);
+		hr_qp->sq_signal_bits = IB_SIGNAL_REQ_WR;
 
-	ret = hns_roce_set_rq_size(hr_dev, &init_attr->cap, !!ib_pd->uobject,
-				   !!init_attr->srq, hr_qp);
+	ret = hns_roce_set_rq_size(hr_dev, &init_attr->cap, udata,
+				   hns_roce_qp_has_rq(init_attr), hr_qp);
 	if (ret) {
 		dev_err(dev, "hns_roce_set_rq_size failed\n");
 		goto err_out;
 	}
 
-	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) {
-		/* allocate recv inline buf */
-		hr_qp->rq_inl_buf.wqe_list = kcalloc(hr_qp->rq.wqe_cnt,
-					       sizeof(struct hns_roce_rinl_wqe),
-					       GFP_KERNEL);
-		if (!hr_qp->rq_inl_buf.wqe_list) {
-			ret = -ENOMEM;
+	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
+	    hns_roce_qp_has_rq(init_attr)) {
+		ret = alloc_rq_inline_buf(hr_qp, init_attr);
+		if (ret) {
+			dev_err(dev, "failed to allocate receive inline buffer\n");
 			goto err_out;
 		}
-
-		hr_qp->rq_inl_buf.wqe_cnt = hr_qp->rq.wqe_cnt;
-
-		/* Firstly, allocate a list of sge space buffer */
-		hr_qp->rq_inl_buf.wqe_list[0].sg_list =
-					kcalloc(hr_qp->rq_inl_buf.wqe_cnt,
-					       init_attr->cap.max_recv_sge *
-					       sizeof(struct hns_roce_rinl_sge),
-					       GFP_KERNEL);
-		if (!hr_qp->rq_inl_buf.wqe_list[0].sg_list) {
-			ret = -ENOMEM;
-			goto err_wqe_list;
-		}
-
-		for (i = 1; i < hr_qp->rq_inl_buf.wqe_cnt; i++)
-			/* Secondly, reallocate the buffer */
-			hr_qp->rq_inl_buf.wqe_list[i].sg_list =
-				&hr_qp->rq_inl_buf.wqe_list[0].sg_list[i *
-				init_attr->cap.max_recv_sge];
 	}
 
-	if (ib_pd->uobject) {
+	page_shift = PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz;
+	if (udata) {
 		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
 			dev_err(dev, "ib_copy_from_udata error for create qp\n");
 			ret = -EFAULT;
-			goto err_rq_sge_list;
+			goto err_alloc_rq_inline_buf;
 		}
 
 		ret = hns_roce_set_user_sq_size(hr_dev, &init_attr->cap, hr_qp,
 						&ucmd);
 		if (ret) {
 			dev_err(dev, "hns_roce_set_user_sq_size error for create qp\n");
-			goto err_rq_sge_list;
+			goto err_alloc_rq_inline_buf;
 		}
 
-		hr_qp->umem = ib_umem_get(ib_pd->uobject->context,
-					  ucmd.buf_addr, hr_qp->buff_size, 0,
-					  0);
+		hr_qp->umem = ib_umem_get(udata, ucmd.buf_addr,
+					  hr_qp->buff_size, 0, 0);
 		if (IS_ERR(hr_qp->umem)) {
 			dev_err(dev, "ib_umem_get error for create qp\n");
 			ret = PTR_ERR(hr_qp->umem);
-			goto err_rq_sge_list;
+			goto err_alloc_rq_inline_buf;
+		}
+		hr_qp->region_cnt = split_wqe_buf_region(hr_dev, hr_qp,
+				hr_qp->regions, ARRAY_SIZE(hr_qp->regions),
+				page_shift);
+		ret = hns_roce_alloc_buf_list(hr_qp->regions, buf_list,
+					      hr_qp->region_cnt);
+		if (ret) {
+			dev_err(dev, "alloc buf_list error for create qp\n");
+			goto err_alloc_list;
 		}
 
-		hr_qp->mtt.mtt_type = MTT_TYPE_WQE;
-		if (hr_dev->caps.mtt_buf_pg_sz) {
-			npages = (ib_umem_page_count(hr_qp->umem) +
-				  (1 << hr_dev->caps.mtt_buf_pg_sz) - 1) /
-				  (1 << hr_dev->caps.mtt_buf_pg_sz);
-			page_shift = PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz;
-			ret = hns_roce_mtt_init(hr_dev, npages,
-				    page_shift,
-				    &hr_qp->mtt);
-		} else {
-			ret = hns_roce_mtt_init(hr_dev,
-				    ib_umem_page_count(hr_qp->umem),
-				    hr_qp->umem->page_shift,
-				    &hr_qp->mtt);
-		}
-		if (ret) {
-			dev_err(dev, "hns_roce_mtt_init error for create qp\n");
-			goto err_buf;
-		}
-
-		ret = hns_roce_ib_umem_write_mtt(hr_dev, &hr_qp->mtt,
-						 hr_qp->umem);
-		if (ret) {
-			dev_err(dev, "hns_roce_ib_umem_write_mtt error for create qp\n");
-			goto err_mtt;
+		for (i = 0; i < hr_qp->region_cnt; i++) {
+			r = &hr_qp->regions[i];
+			buf_count = hns_roce_get_umem_bufs(hr_dev,
+					buf_list[i], r->count, r->offset,
+					hr_qp->umem, page_shift);
+			if (buf_count != r->count) {
+				dev_err(dev,
+					"get umem buf err, expect %d,ret %d.\n",
+					r->count, buf_count);
+				ret = -ENOBUFS;
+				goto err_get_bufs;
+			}
 		}
 
 		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SQ_RECORD_DB) &&
 		    (udata->inlen >= sizeof(ucmd)) &&
 		    (udata->outlen >= sizeof(resp)) &&
 		    hns_roce_qp_has_sq(init_attr)) {
-			ret = hns_roce_db_map_user(
-					to_hr_ucontext(ib_pd->uobject->context),
-					ucmd.sdb_addr, &hr_qp->sdb);
+			ret = hns_roce_db_map_user(uctx, udata, ucmd.sdb_addr,
+						   &hr_qp->sdb);
 			if (ret) {
 				dev_err(dev, "sq record doorbell map failed!\n");
-				goto err_mtt;
+				goto err_get_bufs;
 			}
 
 			/* indicate kernel supports sq record db */
@@ -644,26 +792,29 @@
 		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
 		    (udata->outlen >= sizeof(resp)) &&
 		    hns_roce_qp_has_rq(init_attr)) {
-			ret = hns_roce_db_map_user(
-					to_hr_ucontext(ib_pd->uobject->context),
-					ucmd.db_addr, &hr_qp->rdb);
+			ret = hns_roce_db_map_user(uctx, udata, ucmd.db_addr,
+						   &hr_qp->rdb);
 			if (ret) {
 				dev_err(dev, "rq record doorbell map failed!\n");
 				goto err_sq_dbmap;
 			}
+
+			/* indicate kernel supports rq record db */
+			resp.cap_flags |= HNS_ROCE_SUPPORT_RQ_RECORD_DB;
+			hr_qp->rdb_en = 1;
 		}
 	} else {
 		if (init_attr->create_flags &
 		    IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
 			dev_err(dev, "init_attr->create_flags error!\n");
 			ret = -EINVAL;
-			goto err_rq_sge_list;
+			goto err_alloc_rq_inline_buf;
 		}
 
 		if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) {
 			dev_err(dev, "init_attr->create_flags error!\n");
 			ret = -EINVAL;
-			goto err_rq_sge_list;
+			goto err_alloc_rq_inline_buf;
 		}
 
 		/* Set SQ size */
@@ -671,7 +822,7 @@
 						  hr_qp);
 		if (ret) {
 			dev_err(dev, "hns_roce_set_kernel_sq_size error!\n");
-			goto err_rq_sge_list;
+			goto err_alloc_rq_inline_buf;
 		}
 
 		/* QP doorbell register address */
@@ -685,14 +836,13 @@
 			ret = hns_roce_alloc_db(hr_dev, &hr_qp->rdb, 0);
 			if (ret) {
 				dev_err(dev, "rq record doorbell alloc failed!\n");
-				goto err_rq_sge_list;
+				goto err_alloc_rq_inline_buf;
 			}
 			*hr_qp->rdb.db_record = 0;
 			hr_qp->rdb_en = 1;
 		}
 
 		/* Allocate QP buf */
-		page_shift = PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz;
 		if (hns_roce_buf_alloc(hr_dev, hr_qp->buff_size,
 				       (1 << page_shift) * 2,
 				       &hr_qp->hr_buf, page_shift)) {
@@ -700,30 +850,44 @@
 			ret = -ENOMEM;
 			goto err_db;
 		}
-
-		hr_qp->mtt.mtt_type = MTT_TYPE_WQE;
-		/* Write MTT */
-		ret = hns_roce_mtt_init(hr_dev, hr_qp->hr_buf.npages,
-					hr_qp->hr_buf.page_shift, &hr_qp->mtt);
+		hr_qp->region_cnt = split_wqe_buf_region(hr_dev, hr_qp,
+				hr_qp->regions, ARRAY_SIZE(hr_qp->regions),
+				page_shift);
+		ret = hns_roce_alloc_buf_list(hr_qp->regions, buf_list,
+					      hr_qp->region_cnt);
 		if (ret) {
-			dev_err(dev, "hns_roce_mtt_init error for kernel create qp\n");
-			goto err_buf;
+			dev_err(dev, "alloc buf_list error for create qp!\n");
+			goto err_alloc_list;
 		}
 
-		ret = hns_roce_buf_write_mtt(hr_dev, &hr_qp->mtt,
-					     &hr_qp->hr_buf);
-		if (ret) {
-			dev_err(dev, "hns_roce_buf_write_mtt error for kernel create qp\n");
-			goto err_mtt;
+		for (i = 0; i < hr_qp->region_cnt; i++) {
+			r = &hr_qp->regions[i];
+			buf_count = hns_roce_get_kmem_bufs(hr_dev,
+					buf_list[i], r->count, r->offset,
+					&hr_qp->hr_buf);
+			if (buf_count != r->count) {
+				dev_err(dev,
+					"get kmem buf err, expect %d,ret %d.\n",
+					r->count, buf_count);
+				ret = -ENOBUFS;
+				goto err_get_bufs;
+			}
 		}
 
-		hr_qp->sq.wrid = kmalloc_array(hr_qp->sq.wqe_cnt, sizeof(u64),
-					       GFP_KERNEL);
-		hr_qp->rq.wrid = kmalloc_array(hr_qp->rq.wqe_cnt, sizeof(u64),
-					       GFP_KERNEL);
-		if (!hr_qp->sq.wrid || !hr_qp->rq.wrid) {
+		hr_qp->sq.wrid = kcalloc(hr_qp->sq.wqe_cnt, sizeof(u64),
+					 GFP_KERNEL);
+		if (ZERO_OR_NULL_PTR(hr_qp->sq.wrid)) {
 			ret = -ENOMEM;
-			goto err_wrid;
+			goto err_get_bufs;
+		}
+
+		if (hr_qp->rq.wqe_cnt) {
+			hr_qp->rq.wrid = kcalloc(hr_qp->rq.wqe_cnt, sizeof(u64),
+						 GFP_KERNEL);
+			if (ZERO_OR_NULL_PTR(hr_qp->rq.wrid)) {
+				ret = -ENOMEM;
+				goto err_sq_wrid;
+			}
 		}
 	}
 
@@ -738,6 +902,17 @@
 		}
 	}
 
+	hr_qp->wqe_bt_pg_shift = calc_wqe_bt_page_shift(hr_dev, hr_qp->regions,
+							hr_qp->region_cnt);
+	hns_roce_mtr_init(&hr_qp->mtr, PAGE_SHIFT + hr_qp->wqe_bt_pg_shift,
+			  page_shift);
+	ret = hns_roce_mtr_attach(hr_dev, &hr_qp->mtr, buf_list,
+				  hr_qp->regions, hr_qp->region_cnt);
+	if (ret) {
+		dev_err(dev, "mtr attach error for create qp\n");
+		goto err_mtr;
+	}
+
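
Both the umem and the kernel-buffer paths above fill one dma_addr_t list per hns_roce_buf_region: split_wqe_buf_region() hands out (offset, count) pairs, the get_*_bufs() helpers copy that many page addresses, and a short copy is turned into -ENOBUFS before hns_roce_mtr_attach() consumes the lists. A stand-alone sketch of that bookkeeping (all names and sizes hypothetical):

#include <stdio.h>

struct demo_region { int offset; int count; };

/* stand-in for hns_roce_get_umem_bufs()/hns_roce_get_kmem_bufs() */
static int demo_get_bufs(unsigned long *bufs, int count, int offset,
			 const unsigned long *pages, int npages)
{
	int i, total = 0;

	for (i = 0; i < count && offset + i < npages; i++, total++)
		bufs[i] = pages[offset + i];
	return total;
}

int main(void)
{
	unsigned long pages[6] = { 0x10000, 0x11000, 0x12000,
				   0x13000, 0x14000, 0x15000 };
	struct demo_region regions[2] = { { 0, 4 }, { 4, 2 } };
	unsigned long bufs[2][4];
	int i, got;

	for (i = 0; i < 2; i++) {
		got = demo_get_bufs(bufs[i], regions[i].count,
				    regions[i].offset, pages, 6);
		if (got != regions[i].count)
			return 1;		/* -ENOBUFS in the driver */
	}
	printf("region 1 first page: 0x%lx\n", bufs[1][0]);
	return 0;
}
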
 	if (init_attr->qp_type == IB_QPT_GSI &&
 	    hr_dev->hw_rev == HNS_ROCE_HW_VER1) {
 		/* In v1 engine, GSI QP context in RoCE engine's register */
@@ -757,20 +932,23 @@
 	if (sqpn)
 		hr_qp->doorbell_qpn = 1;
 	else
-		hr_qp->doorbell_qpn = cpu_to_le64(hr_qp->qpn);
+		hr_qp->doorbell_qpn = (u32)hr_qp->qpn;
 
-	if (ib_pd->uobject && (udata->outlen >= sizeof(resp)) &&
-		(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)) {
-
-		/* indicate kernel supports rq record db */
-		resp.cap_flags |= HNS_ROCE_SUPPORT_RQ_RECORD_DB;
-		ret = ib_copy_to_udata(udata, &resp, sizeof(resp));
+	if (udata) {
+		ret = ib_copy_to_udata(udata, &resp,
+				       min(udata->outlen, sizeof(resp)));
 		if (ret)
 			goto err_qp;
-
-		hr_qp->rdb_en = 1;
 	}
+
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL) {
+		ret = hr_dev->hw->qp_flow_control_init(hr_dev, hr_qp);
+		if (ret)
+			goto err_qp;
+	}
+
 	hr_qp->event = hns_roce_ib_qp_event;
+	hns_roce_free_buf_list(buf_list, hr_qp->region_cnt);
 
 	return 0;
 
@@ -785,50 +963,49 @@
 	if (!sqpn)
 		hns_roce_release_range_qp(hr_dev, qpn, 1);
 
+err_mtr:
+	hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr);
+
 err_wrid:
-	if (ib_pd->uobject) {
+	if (udata) {
 		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
 		    (udata->outlen >= sizeof(resp)) &&
 		    hns_roce_qp_has_rq(init_attr))
-			hns_roce_db_unmap_user(
-					to_hr_ucontext(ib_pd->uobject->context),
-					&hr_qp->rdb);
+			hns_roce_db_unmap_user(uctx, &hr_qp->rdb);
 	} else {
-		kfree(hr_qp->sq.wrid);
-		kfree(hr_qp->rq.wrid);
+		if (hr_qp->rq.wqe_cnt)
+			kfree(hr_qp->rq.wrid);
 	}
 
 err_sq_dbmap:
-	if (ib_pd->uobject)
+	if (udata)
 		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SQ_RECORD_DB) &&
 		    (udata->inlen >= sizeof(ucmd)) &&
 		    (udata->outlen >= sizeof(resp)) &&
 		    hns_roce_qp_has_sq(init_attr))
-			hns_roce_db_unmap_user(
-					to_hr_ucontext(ib_pd->uobject->context),
-					&hr_qp->sdb);
+			hns_roce_db_unmap_user(uctx, &hr_qp->sdb);
 
-err_mtt:
-	hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt);
+err_sq_wrid:
+	if (!udata)
+		kfree(hr_qp->sq.wrid);
 
-err_buf:
-	if (ib_pd->uobject)
-		ib_umem_release(hr_qp->umem);
-	else
+err_get_bufs:
+	hns_roce_free_buf_list(buf_list, hr_qp->region_cnt);
+
+err_alloc_list:
+	if (!hr_qp->umem)
 		hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf);
+	ib_umem_release(hr_qp->umem);
 
 err_db:
-	if (!ib_pd->uobject && hns_roce_qp_has_rq(init_attr) &&
+	if (!udata && hns_roce_qp_has_rq(init_attr) &&
 	    (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB))
 		hns_roce_free_db(hr_dev, &hr_qp->rdb);
 
-err_rq_sge_list:
-	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE)
-		kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list);
-
-err_wqe_list:
-	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE)
-		kfree(hr_qp->rq_inl_buf.wqe_list);
+err_alloc_rq_inline_buf:
+	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
+	     hns_roce_qp_has_rq(init_attr))
+		free_rq_inline_buf(hr_qp);
 
 err_out:
 	return ret;
@@ -839,7 +1016,7 @@
 				 struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
-	struct device *dev = hr_dev->dev;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct hns_roce_sqp *hr_sqp;
 	struct hns_roce_qp *hr_qp;
 	int ret;
@@ -853,7 +1030,8 @@
 		ret = hns_roce_create_qp_common(hr_dev, pd, init_attr, udata, 0,
 						hr_qp);
 		if (ret) {
-			dev_err(dev, "Create RC QP failed\n");
+			ibdev_err(ibdev, "Create RC QP 0x%06lx failed(%d)\n",
+				  hr_qp->qpn, ret);
 			kfree(hr_qp);
 			return ERR_PTR(ret);
 		}
@@ -864,8 +1042,8 @@
 	}
 	case IB_QPT_GSI: {
 		/* Userspace is not allowed to create special QPs: */
-		if (pd->uobject) {
-			dev_err(dev, "not support usr space GSI\n");
+		if (udata) {
+			ibdev_err(ibdev, "not support usr space GSI\n");
 			return ERR_PTR(-EINVAL);
 		}
 
@@ -887,7 +1065,7 @@
 		ret = hns_roce_create_qp_common(hr_dev, pd, init_attr, udata,
 						hr_qp->ibqp.qp_num, hr_qp);
 		if (ret) {
-			dev_err(dev, "Create GSI QP failed!\n");
+			ibdev_err(ibdev, "Create GSI QP failed!\n");
 			kfree(hr_sqp);
 			return ERR_PTR(ret);
 		}
@@ -895,14 +1073,14 @@
 		break;
 	}
 	default:{
-		dev_err(dev, "not support QP type %d\n", init_attr->qp_type);
+		ibdev_err(ibdev, "not support QP type %d\n",
+			  init_attr->qp_type);
 		return ERR_PTR(-EINVAL);
 	}
 	}
 
 	return &hr_qp->ibqp;
 }
-EXPORT_SYMBOL_GPL(hns_roce_create_qp);
 
 int to_hr_qp_type(int qp_type)
 {
@@ -921,7 +1099,75 @@
 
 	return transport_type;
 }
-EXPORT_SYMBOL_GPL(to_hr_qp_type);
+
+static int check_mtu_validate(struct hns_roce_dev *hr_dev,
+			      struct hns_roce_qp *hr_qp,
+			      struct ib_qp_attr *attr, int attr_mask)
+{
+	enum ib_mtu active_mtu;
+	int p;
+
+	p = attr_mask & IB_QP_PORT ? (attr->port_num - 1) : hr_qp->port;
+	active_mtu = iboe_get_mtu(hr_dev->iboe.netdevs[p]->mtu);
+
+	if ((hr_dev->caps.max_mtu >= IB_MTU_2048 &&
+	    attr->path_mtu > hr_dev->caps.max_mtu) ||
+	    attr->path_mtu < IB_MTU_256 || attr->path_mtu > active_mtu) {
+		ibdev_err(&hr_dev->ib_dev,
+			"attr path_mtu(%d)invalid while modify qp",
+			attr->path_mtu);
+		return -EINVAL;
+	}
+
+	return 0;
+}
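
check_mtu_validate() rejects a path_mtu below IB_MTU_256, above the device cap (when the cap is at least IB_MTU_2048), or above the MTU derived from the bound netdev. A rough user-space rendering of that check; the IB MTU enum encodes 256 << (mtu - 1) bytes, and the real iboe_get_mtu() first subtracts the RoCE header overhead, which is omitted here:

#include <stdio.h>

enum demo_mtu { DEMO_MTU_256 = 1, DEMO_MTU_512, DEMO_MTU_1024,
		DEMO_MTU_2048, DEMO_MTU_4096 };

static enum demo_mtu demo_int_to_mtu(int bytes)
{
	if (bytes >= 4096) return DEMO_MTU_4096;
	if (bytes >= 2048) return DEMO_MTU_2048;
	if (bytes >= 1024) return DEMO_MTU_1024;
	if (bytes >= 512)  return DEMO_MTU_512;
	return DEMO_MTU_256;
}

static int demo_check_path_mtu(enum demo_mtu path_mtu, enum demo_mtu max_mtu,
			       int netdev_mtu)
{
	enum demo_mtu active = demo_int_to_mtu(netdev_mtu);

	if ((max_mtu >= DEMO_MTU_2048 && path_mtu > max_mtu) ||
	    path_mtu < DEMO_MTU_256 || path_mtu > active)
		return -1;	/* -EINVAL */
	return 0;
}

int main(void)
{
	printf("%d\n", demo_check_path_mtu(DEMO_MTU_1024, DEMO_MTU_4096, 1500)); /* 0 */
	printf("%d\n", demo_check_path_mtu(DEMO_MTU_4096, DEMO_MTU_4096, 1500)); /* -1 */
	return 0;
}
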
+
+static int hns_roce_check_qp_attr(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+				  int attr_mask)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
+	int p;
+
+	if ((attr_mask & IB_QP_PORT) &&
+	    (attr->port_num == 0 || attr->port_num > hr_dev->caps.num_ports)) {
+		ibdev_err(&hr_dev->ib_dev,
+			"attr port_num invalid.attr->port_num=%d\n",
+			attr->port_num);
+		return -EINVAL;
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX) {
+		p = attr_mask & IB_QP_PORT ? (attr->port_num - 1) : hr_qp->port;
+		if (attr->pkey_index >= hr_dev->caps.pkey_table_len[p]) {
+			ibdev_err(&hr_dev->ib_dev,
+				"attr pkey_index invalid.attr->pkey_index=%d\n",
+				attr->pkey_index);
+			return -EINVAL;
+		}
+	}
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+	    attr->max_rd_atomic > hr_dev->caps.max_qp_init_rdma) {
+		ibdev_err(&hr_dev->ib_dev,
+			"attr max_rd_atomic invalid.attr->max_rd_atomic=%d\n",
+			attr->max_rd_atomic);
+		return -EINVAL;
+	}
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+	    attr->max_dest_rd_atomic > hr_dev->caps.max_qp_dest_rdma) {
+		ibdev_err(&hr_dev->ib_dev,
+			"attr max_dest_rd_atomic invalid.attr->max_dest_rd_atomic=%d\n",
+			attr->max_dest_rd_atomic);
+		return -EINVAL;
+	}
+
+	if (attr_mask & IB_QP_PATH_MTU)
+		return check_mtu_validate(hr_dev, hr_qp, attr, attr_mask);
+
+	return 0;
+}
 
 int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		       int attr_mask, struct ib_udata *udata)
@@ -929,85 +1175,43 @@
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
 	enum ib_qp_state cur_state, new_state;
-	struct device *dev = hr_dev->dev;
 	int ret = -EINVAL;
-	int p;
-	enum ib_mtu active_mtu;
 
 	mutex_lock(&hr_qp->mutex);
 
 	cur_state = attr_mask & IB_QP_CUR_STATE ?
 		    attr->cur_qp_state : (enum ib_qp_state)hr_qp->state;
-	new_state = attr_mask & IB_QP_STATE ?
-		    attr->qp_state : cur_state;
+	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
 
 	if (ibqp->uobject &&
 	    (attr_mask & IB_QP_STATE) && new_state == IB_QPS_ERR) {
 		if (hr_qp->sdb_en == 1) {
 			hr_qp->sq.head = *(int *)(hr_qp->sdb.virt_addr);
-			hr_qp->rq.head = *(int *)(hr_qp->rdb.virt_addr);
+
+			if (hr_qp->rdb_en == 1)
+				hr_qp->rq.head = *(int *)(hr_qp->rdb.virt_addr);
 		} else {
-			dev_warn(dev, "flush cqe is not supported in userspace!\n");
+			ibdev_warn(&hr_dev->ib_dev,
+				  "flush cqe is not supported in userspace!\n");
 			goto out;
 		}
 	}
 
-	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask,
-				IB_LINK_LAYER_ETHERNET)) {
-		dev_err(dev, "ib_modify_qp_is_ok failed\n");
+	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+				attr_mask)) {
+		ibdev_err(&hr_dev->ib_dev, "ib_modify_qp_is_ok failed\n");
 		goto out;
 	}
 
-	if ((attr_mask & IB_QP_PORT) &&
-	    (attr->port_num == 0 || attr->port_num > hr_dev->caps.num_ports)) {
-		dev_err(dev, "attr port_num invalid.attr->port_num=%d\n",
-			attr->port_num);
+	ret = hns_roce_check_qp_attr(ibqp, attr, attr_mask);
+	if (ret)
 		goto out;
-	}
-
-	if (attr_mask & IB_QP_PKEY_INDEX) {
-		p = attr_mask & IB_QP_PORT ? (attr->port_num - 1) : hr_qp->port;
-		if (attr->pkey_index >= hr_dev->caps.pkey_table_len[p]) {
-			dev_err(dev, "attr pkey_index invalid.attr->pkey_index=%d\n",
-				attr->pkey_index);
-			goto out;
-		}
-	}
-
-	if (attr_mask & IB_QP_PATH_MTU) {
-		p = attr_mask & IB_QP_PORT ? (attr->port_num - 1) : hr_qp->port;
-		active_mtu = iboe_get_mtu(hr_dev->iboe.netdevs[p]->mtu);
-
-		if ((hr_dev->caps.max_mtu == IB_MTU_4096 &&
-		    attr->path_mtu > IB_MTU_4096) ||
-		    (hr_dev->caps.max_mtu == IB_MTU_2048 &&
-		    attr->path_mtu > IB_MTU_2048) ||
-		    attr->path_mtu < IB_MTU_256 ||
-		    attr->path_mtu > active_mtu) {
-			dev_err(dev, "attr path_mtu(%d)invalid while modify qp",
-				attr->path_mtu);
-			goto out;
-		}
-	}
-
-	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
-	    attr->max_rd_atomic > hr_dev->caps.max_qp_init_rdma) {
-		dev_err(dev, "attr max_rd_atomic invalid.attr->max_rd_atomic=%d\n",
-			attr->max_rd_atomic);
-		goto out;
-	}
-
-	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
-	    attr->max_dest_rd_atomic > hr_dev->caps.max_qp_dest_rdma) {
-		dev_err(dev, "attr max_dest_rd_atomic invalid.attr->max_dest_rd_atomic=%d\n",
-			attr->max_dest_rd_atomic);
-		goto out;
-	}
 
 	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
 		if (hr_dev->caps.min_wqes) {
 			ret = -EPERM;
-			dev_err(dev, "cur_state=%d new_state=%d\n", cur_state,
+			ibdev_err(&hr_dev->ib_dev,
+				"cur_state=%d new_state=%d\n", cur_state,
 				new_state);
 		} else {
 			ret = 0;
@@ -1039,7 +1243,6 @@
 		spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
 	}
 }
-EXPORT_SYMBOL_GPL(hns_roce_lock_cqs);
 
 void hns_roce_unlock_cqs(struct hns_roce_cq *send_cq,
 			 struct hns_roce_cq *recv_cq) __releases(&send_cq->lock)
@@ -1056,7 +1259,6 @@
 		spin_unlock_irq(&recv_cq->lock);
 	}
 }
-EXPORT_SYMBOL_GPL(hns_roce_unlock_cqs);
 
 static void *get_wqe(struct hns_roce_qp *hr_qp, int offset)
 {
@@ -1068,20 +1270,17 @@
 {
 	return get_wqe(hr_qp, hr_qp->rq.offset + (n << hr_qp->rq.wqe_shift));
 }
-EXPORT_SYMBOL_GPL(get_recv_wqe);
 
 void *get_send_wqe(struct hns_roce_qp *hr_qp, int n)
 {
 	return get_wqe(hr_qp, hr_qp->sq.offset + (n << hr_qp->sq.wqe_shift));
 }
-EXPORT_SYMBOL_GPL(get_send_wqe);
 
 void *get_send_extend_sge(struct hns_roce_qp *hr_qp, int n)
 {
 	return hns_roce_buf_offset(&hr_qp->hr_buf, hr_qp->sge.offset +
 					(n << hr_qp->sge.sge_shift));
 }
-EXPORT_SYMBOL_GPL(get_send_extend_sge);
 
 bool hns_roce_wq_overflow(struct hns_roce_wq *hr_wq, int nreq,
 			  struct ib_cq *ib_cq)
@@ -1100,20 +1299,21 @@
 
 	return cur + nreq >= hr_wq->max_post;
 }
-EXPORT_SYMBOL_GPL(hns_roce_wq_overflow);
 
 int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
 	int reserved_from_top = 0;
+	int reserved_from_bot;
 	int ret;
 
-	spin_lock_init(&qp_table->lock);
-	INIT_RADIX_TREE(&hr_dev->qp_table_tree, GFP_ATOMIC);
+	mutex_init(&qp_table->scc_mutex);
+	xa_init(&hr_dev->qp_table_xa);
 
-	/* A port include two SQP, six port total 12 */
+	reserved_from_bot = hr_dev->caps.reserved_qps;
+
 	ret = hns_roce_bitmap_init(&qp_table->bitmap, hr_dev->caps.num_qps,
-				   hr_dev->caps.num_qps - 1, SQP_NUM,
+				   hr_dev->caps.num_qps - 1, reserved_from_bot,
 				   reserved_from_top);
 	if (ret) {
 		dev_err(hr_dev->dev, "qp bitmap init failed!error=%d\n",
diff --git a/drivers/infiniband/hw/hns/hns_roce_restrack.c b/drivers/infiniband/hw/hns/hns_roce_restrack.c
new file mode 100644
index 0000000..0a31d0a
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_restrack.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+// Copyright (c) 2019 Hisilicon Limited.
+
+#include <rdma/rdma_cm.h>
+#include <rdma/restrack.h>
+#include <uapi/rdma/rdma_netlink.h>
+#include "hnae3.h"
+#include "hns_roce_common.h"
+#include "hns_roce_device.h"
+#include "hns_roce_hw_v2.h"
+
+static int hns_roce_fill_cq(struct sk_buff *msg,
+			    struct hns_roce_v2_cq_context *context)
+{
+	if (rdma_nl_put_driver_u32(msg, "state",
+				   roce_get_field(context->byte_4_pg_ceqn,
+						  V2_CQC_BYTE_4_ARM_ST_M,
+						  V2_CQC_BYTE_4_ARM_ST_S)))
+		goto err;
+
+	if (rdma_nl_put_driver_u32(msg, "ceqn",
+				   roce_get_field(context->byte_4_pg_ceqn,
+						  V2_CQC_BYTE_4_CEQN_M,
+						  V2_CQC_BYTE_4_CEQN_S)))
+		goto err;
+
+	if (rdma_nl_put_driver_u32(msg, "cqn",
+				   roce_get_field(context->byte_8_cqn,
+						  V2_CQC_BYTE_8_CQN_M,
+						  V2_CQC_BYTE_8_CQN_S)))
+		goto err;
+
+	if (rdma_nl_put_driver_u32(msg, "hopnum",
+				   roce_get_field(context->byte_16_hop_addr,
+						  V2_CQC_BYTE_16_CQE_HOP_NUM_M,
+						  V2_CQC_BYTE_16_CQE_HOP_NUM_S)))
+		goto err;
+
+	if (rdma_nl_put_driver_u32(
+		    msg, "pi",
+		    roce_get_field(context->byte_28_cq_pi,
+				   V2_CQC_BYTE_28_CQ_PRODUCER_IDX_M,
+				   V2_CQC_BYTE_28_CQ_PRODUCER_IDX_S)))
+		goto err;
+
+	if (rdma_nl_put_driver_u32(
+		    msg, "ci",
+		    roce_get_field(context->byte_32_cq_ci,
+				   V2_CQC_BYTE_32_CQ_CONSUMER_IDX_M,
+				   V2_CQC_BYTE_32_CQ_CONSUMER_IDX_S)))
+		goto err;
+
+	if (rdma_nl_put_driver_u32(
+		    msg, "coalesce",
+		    roce_get_field(context->byte_56_cqe_period_maxcnt,
+				   V2_CQC_BYTE_56_CQ_MAX_CNT_M,
+				   V2_CQC_BYTE_56_CQ_MAX_CNT_S)))
+		goto err;
+
+	if (rdma_nl_put_driver_u32(
+		    msg, "period",
+		    roce_get_field(context->byte_56_cqe_period_maxcnt,
+				   V2_CQC_BYTE_56_CQ_PERIOD_M,
+				   V2_CQC_BYTE_56_CQ_PERIOD_S)))
+		goto err;
+
+	if (rdma_nl_put_driver_u32(msg, "cnt",
+				   roce_get_field(context->byte_52_cqe_cnt,
+						  V2_CQC_BYTE_52_CQE_CNT_M,
+						  V2_CQC_BYTE_52_CQE_CNT_S)))
+		goto err;
+
+	return 0;
+
+err:
+	return -EMSGSIZE;
+}
+
+static int hns_roce_fill_res_cq_entry(struct sk_buff *msg,
+				      struct rdma_restrack_entry *res)
+{
+	struct ib_cq *ib_cq = container_of(res, struct ib_cq, res);
+	struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
+	struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
+	struct hns_roce_v2_cq_context *context;
+	struct nlattr *table_attr;
+	int ret;
+
+	if (!hr_dev->dfx->query_cqc_info)
+		return -EINVAL;
+
+	context = kzalloc(sizeof(struct hns_roce_v2_cq_context), GFP_KERNEL);
+	if (!context)
+		return -ENOMEM;
+
+	ret = hr_dev->dfx->query_cqc_info(hr_dev, hr_cq->cqn, (int *)context);
+	if (ret)
+		goto err;
+
+	table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER);
+	if (!table_attr)
+		goto err;
+
+	if (hns_roce_fill_cq(msg, context))
+		goto err_cancel_table;
+
+	nla_nest_end(msg, table_attr);
+	kfree(context);
+
+	return 0;
+
+err_cancel_table:
+	nla_nest_cancel(msg, table_attr);
+err:
+	kfree(context);
+	return -EMSGSIZE;
+}
+
+int hns_roce_fill_res_entry(struct sk_buff *msg,
+			    struct rdma_restrack_entry *res)
+{
+	if (res->type == RDMA_RESTRACK_CQ)
+		return hns_roce_fill_res_cq_entry(msg, res);
+
+	return 0;
+}
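
The restrack fill helpers bail out with -EMSGSIZE as soon as any rdma_nl_put_driver_u32() fails, and the caller cancels the half-built nest with nla_nest_cancel(). A toy, non-netlink analogue of that build-or-roll-back pattern (names and sizes invented for the sketch):

#include <stdio.h>
#include <string.h>

struct demo_msg { char buf[64]; size_t len; };

static int demo_put(struct demo_msg *m, const char *kv)
{
	size_t n = strlen(kv);

	if (m->len + n + 1 > sizeof(m->buf))
		return -1;			/* -EMSGSIZE */
	memcpy(m->buf + m->len, kv, n);
	m->buf[m->len + n] = ' ';
	m->len += n + 1;
	return 0;
}

int main(void)
{
	struct demo_msg msg = { .len = 0 };
	size_t nest_start = msg.len;		/* nla_nest_start() */

	if (demo_put(&msg, "state=2") || demo_put(&msg, "cqn=7") ||
	    demo_put(&msg, "pi=15") || demo_put(&msg, "ci=12")) {
		msg.len = nest_start;		/* nla_nest_cancel() */
		return 1;
	}
	printf("%.*s\n", (int)msg.len, msg.buf);	/* nla_nest_end() */
	return 0;
}
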
diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
new file mode 100644
index 0000000..43ea2c1
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -0,0 +1,473 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2018 Hisilicon Limited.
+ */
+
+#include <rdma/ib_umem.h>
+#include <rdma/hns-abi.h>
+#include "hns_roce_device.h"
+#include "hns_roce_cmd.h"
+#include "hns_roce_hem.h"
+
+void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type)
+{
+	struct hns_roce_srq_table *srq_table = &hr_dev->srq_table;
+	struct hns_roce_srq *srq;
+
+	xa_lock(&srq_table->xa);
+	srq = xa_load(&srq_table->xa, srqn & (hr_dev->caps.num_srqs - 1));
+	if (srq)
+		atomic_inc(&srq->refcount);
+	xa_unlock(&srq_table->xa);
+
+	if (!srq) {
+		dev_warn(hr_dev->dev, "Async event for bogus SRQ %08x\n", srqn);
+		return;
+	}
+
+	srq->event(srq, event_type);
+
+	if (atomic_dec_and_test(&srq->refcount))
+		complete(&srq->free);
+}
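
hns_roce_srq_event() takes a reference around the handler call, and the matching hns_roce_srq_free() below drops the initial reference and waits until every in-flight event has dropped its own before tearing the SRQ down. The driver uses atomic_t plus a completion; this stand-alone sketch only mimics the protocol with C11 atomics and a polled flag:

#include <stdatomic.h>
#include <stdio.h>

struct demo_srq {
	atomic_int refcount;
	int freed;
};

static void demo_event(struct demo_srq *srq)
{
	atomic_fetch_add(&srq->refcount, 1);
	printf("handle async event\n");
	if (atomic_fetch_sub(&srq->refcount, 1) == 1)
		srq->freed = 1;			/* complete(&srq->free) */
}

static void demo_free(struct demo_srq *srq)
{
	if (atomic_fetch_sub(&srq->refcount, 1) == 1)
		srq->freed = 1;
	while (!srq->freed)			/* wait_for_completion() */
		;
	printf("srq torn down\n");
}

int main(void)
{
	struct demo_srq srq = { .refcount = 1, .freed = 0 };

	demo_event(&srq);
	demo_free(&srq);
	return 0;
}
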
+
+static void hns_roce_ib_srq_event(struct hns_roce_srq *srq,
+				  enum hns_roce_event event_type)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(srq->ibsrq.device);
+	struct ib_srq *ibsrq = &srq->ibsrq;
+	struct ib_event event;
+
+	if (ibsrq->event_handler) {
+		event.device      = ibsrq->device;
+		event.element.srq = ibsrq;
+		switch (event_type) {
+		case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
+			event.event = IB_EVENT_SRQ_LIMIT_REACHED;
+			break;
+		case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR:
+			event.event = IB_EVENT_SRQ_ERR;
+			break;
+		default:
+			dev_err(hr_dev->dev,
+			   "hns_roce:Unexpected event type 0x%x on SRQ %06lx\n",
+			   event_type, srq->srqn);
+			return;
+		}
+
+		ibsrq->event_handler(&event, ibsrq->srq_context);
+	}
+}
+
+static int hns_roce_sw2hw_srq(struct hns_roce_dev *dev,
+			      struct hns_roce_cmd_mailbox *mailbox,
+			      unsigned long srq_num)
+{
+	return hns_roce_cmd_mbox(dev, mailbox->dma, 0, srq_num, 0,
+				 HNS_ROCE_CMD_SW2HW_SRQ,
+				 HNS_ROCE_CMD_TIMEOUT_MSECS);
+}
+
+static int hns_roce_hw2sw_srq(struct hns_roce_dev *dev,
+			     struct hns_roce_cmd_mailbox *mailbox,
+			     unsigned long srq_num)
+{
+	return hns_roce_cmd_mbox(dev, 0, mailbox ? mailbox->dma : 0, srq_num,
+				 mailbox ? 0 : 1, HNS_ROCE_CMD_HW2SW_SRQ,
+				 HNS_ROCE_CMD_TIMEOUT_MSECS);
+}
+
+static int hns_roce_srq_alloc(struct hns_roce_dev *hr_dev, u32 pdn, u32 cqn,
+			      u16 xrcd, struct hns_roce_mtt *hr_mtt,
+			      u64 db_rec_addr, struct hns_roce_srq *srq)
+{
+	struct hns_roce_srq_table *srq_table = &hr_dev->srq_table;
+	struct hns_roce_cmd_mailbox *mailbox;
+	dma_addr_t dma_handle_wqe;
+	dma_addr_t dma_handle_idx;
+	u64 *mtts_wqe;
+	u64 *mtts_idx;
+	int ret;
+
+	/* Get the physical address of srq buf */
+	mtts_wqe = hns_roce_table_find(hr_dev,
+				       &hr_dev->mr_table.mtt_srqwqe_table,
+				       srq->mtt.first_seg,
+				       &dma_handle_wqe);
+	if (!mtts_wqe) {
+		dev_err(hr_dev->dev,
+			"SRQ alloc.Failed to find srq buf addr.\n");
+		return -EINVAL;
+	}
+
+	/* Get physical address of idx que buf */
+	mtts_idx = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_idx_table,
+				       srq->idx_que.mtt.first_seg,
+				       &dma_handle_idx);
+	if (!mtts_idx) {
+		dev_err(hr_dev->dev,
+			"SRQ alloc.Failed to find idx que buf addr.\n");
+		return -EINVAL;
+	}
+
+	ret = hns_roce_bitmap_alloc(&srq_table->bitmap, &srq->srqn);
+	if (ret == -1) {
+		dev_err(hr_dev->dev, "SRQ alloc.Failed to alloc index.\n");
+		return -ENOMEM;
+	}
+
+	ret = hns_roce_table_get(hr_dev, &srq_table->table, srq->srqn);
+	if (ret)
+		goto err_out;
+
+	ret = xa_err(xa_store(&srq_table->xa, srq->srqn, srq, GFP_KERNEL));
+	if (ret)
+		goto err_put;
+
+	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+	if (IS_ERR(mailbox)) {
+		ret = PTR_ERR(mailbox);
+		goto err_xa;
+	}
+
+	hr_dev->hw->write_srqc(hr_dev, srq, pdn, xrcd, cqn, mailbox->buf,
+			       mtts_wqe, mtts_idx, dma_handle_wqe,
+			       dma_handle_idx);
+
+	ret = hns_roce_sw2hw_srq(hr_dev, mailbox, srq->srqn);
+	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+	if (ret)
+		goto err_xa;
+
+	atomic_set(&srq->refcount, 1);
+	init_completion(&srq->free);
+	return ret;
+
+err_xa:
+	xa_erase(&srq_table->xa, srq->srqn);
+
+err_put:
+	hns_roce_table_put(hr_dev, &srq_table->table, srq->srqn);
+
+err_out:
+	hns_roce_bitmap_free(&srq_table->bitmap, srq->srqn, BITMAP_NO_RR);
+	return ret;
+}
+
+static void hns_roce_srq_free(struct hns_roce_dev *hr_dev,
+			      struct hns_roce_srq *srq)
+{
+	struct hns_roce_srq_table *srq_table = &hr_dev->srq_table;
+	int ret;
+
+	ret = hns_roce_hw2sw_srq(hr_dev, NULL, srq->srqn);
+	if (ret)
+		dev_err(hr_dev->dev, "HW2SW_SRQ failed (%d) for CQN %06lx\n",
+			ret, srq->srqn);
+
+	xa_erase(&srq_table->xa, srq->srqn);
+
+	if (atomic_dec_and_test(&srq->refcount))
+		complete(&srq->free);
+	wait_for_completion(&srq->free);
+
+	hns_roce_table_put(hr_dev, &srq_table->table, srq->srqn);
+	hns_roce_bitmap_free(&srq_table->bitmap, srq->srqn, BITMAP_NO_RR);
+}
+
+static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata,
+			   int srq_buf_size)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(srq->ibsrq.device);
+	struct hns_roce_ib_create_srq  ucmd;
+	u32 page_shift;
+	u32 npages;
+	int ret;
+
+	if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
+		return -EFAULT;
+
+	srq->umem = ib_umem_get(udata, ucmd.buf_addr, srq_buf_size, 0, 0);
+	if (IS_ERR(srq->umem))
+		return PTR_ERR(srq->umem);
+
+	npages = (ib_umem_page_count(srq->umem) +
+		(1 << hr_dev->caps.srqwqe_buf_pg_sz) - 1) /
+		(1 << hr_dev->caps.srqwqe_buf_pg_sz);
+	page_shift = PAGE_SHIFT + hr_dev->caps.srqwqe_buf_pg_sz;
+	ret = hns_roce_mtt_init(hr_dev, npages, page_shift, &srq->mtt);
+	if (ret)
+		goto err_user_buf;
+
+	ret = hns_roce_ib_umem_write_mtt(hr_dev, &srq->mtt, srq->umem);
+	if (ret)
+		goto err_user_srq_mtt;
+
+	/* config index queue BA */
+	srq->idx_que.umem = ib_umem_get(udata, ucmd.que_addr,
+					srq->idx_que.buf_size, 0, 0);
+	if (IS_ERR(srq->idx_que.umem)) {
+		dev_err(hr_dev->dev, "ib_umem_get error for index queue\n");
+		ret = PTR_ERR(srq->idx_que.umem);
+		goto err_user_srq_mtt;
+	}
+
+	ret = hns_roce_mtt_init(hr_dev, ib_umem_page_count(srq->idx_que.umem),
+				PAGE_SHIFT, &srq->idx_que.mtt);
+
+	if (ret) {
+		dev_err(hr_dev->dev, "hns_roce_mtt_init error for idx que\n");
+		goto err_user_idx_mtt;
+	}
+
+	ret = hns_roce_ib_umem_write_mtt(hr_dev, &srq->idx_que.mtt,
+					 srq->idx_que.umem);
+	if (ret) {
+		dev_err(hr_dev->dev,
+			"hns_roce_ib_umem_write_mtt error for idx que\n");
+		goto err_user_idx_buf;
+	}
+
+	return 0;
+
+err_user_idx_buf:
+	hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
+
+err_user_idx_mtt:
+	ib_umem_release(srq->idx_que.umem);
+
+err_user_srq_mtt:
+	hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
+
+err_user_buf:
+	ib_umem_release(srq->umem);
+
+	return ret;
+}
+
+static int hns_roce_create_idx_que(struct ib_pd *pd, struct hns_roce_srq *srq,
+				   u32 page_shift)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
+	struct hns_roce_idx_que *idx_que = &srq->idx_que;
+
+	idx_que->bitmap = bitmap_zalloc(srq->max, GFP_KERNEL);
+	if (!idx_que->bitmap)
+		return -ENOMEM;
+
+	idx_que->buf_size = srq->idx_que.buf_size;
+
+	if (hns_roce_buf_alloc(hr_dev, idx_que->buf_size, (1 << page_shift) * 2,
+			       &idx_que->idx_buf, page_shift)) {
+		bitmap_free(idx_que->bitmap);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int create_kernel_srq(struct hns_roce_srq *srq, int srq_buf_size)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(srq->ibsrq.device);
+	u32 page_shift = PAGE_SHIFT + hr_dev->caps.srqwqe_buf_pg_sz;
+	int ret;
+
+	if (hns_roce_buf_alloc(hr_dev, srq_buf_size, (1 << page_shift) * 2,
+			       &srq->buf, page_shift))
+		return -ENOMEM;
+
+	srq->head = 0;
+	srq->tail = srq->max - 1;
+
+	ret = hns_roce_mtt_init(hr_dev, srq->buf.npages, srq->buf.page_shift,
+				&srq->mtt);
+	if (ret)
+		goto err_kernel_buf;
+
+	ret = hns_roce_buf_write_mtt(hr_dev, &srq->mtt, &srq->buf);
+	if (ret)
+		goto err_kernel_srq_mtt;
+
+	page_shift = PAGE_SHIFT + hr_dev->caps.idx_buf_pg_sz;
+	ret = hns_roce_create_idx_que(srq->ibsrq.pd, srq, page_shift);
+	if (ret) {
+		dev_err(hr_dev->dev, "Create idx queue fail(%d)!\n", ret);
+		goto err_kernel_srq_mtt;
+	}
+
+	/* Init mtt table for idx_que */
+	ret = hns_roce_mtt_init(hr_dev, srq->idx_que.idx_buf.npages,
+				srq->idx_que.idx_buf.page_shift,
+				&srq->idx_que.mtt);
+	if (ret)
+		goto err_kernel_create_idx;
+
+	/* Write buffer address into the mtt table */
+	ret = hns_roce_buf_write_mtt(hr_dev, &srq->idx_que.mtt,
+				     &srq->idx_que.idx_buf);
+	if (ret)
+		goto err_kernel_idx_buf;
+
+	srq->wrid = kvmalloc_array(srq->max, sizeof(u64), GFP_KERNEL);
+	if (!srq->wrid) {
+		ret = -ENOMEM;
+		goto err_kernel_idx_buf;
+	}
+
+	return 0;
+
+err_kernel_idx_buf:
+	hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
+
+err_kernel_create_idx:
+	hns_roce_buf_free(hr_dev, srq->idx_que.buf_size,
+			  &srq->idx_que.idx_buf);
+	kfree(srq->idx_que.bitmap);
+
+err_kernel_srq_mtt:
+	hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
+
+err_kernel_buf:
+	hns_roce_buf_free(hr_dev, srq_buf_size, &srq->buf);
+
+	return ret;
+}
+
+static void destroy_user_srq(struct hns_roce_dev *hr_dev,
+			     struct hns_roce_srq *srq)
+{
+	hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
+	ib_umem_release(srq->idx_que.umem);
+	hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
+	ib_umem_release(srq->umem);
+}
+
+static void destroy_kernel_srq(struct hns_roce_dev *hr_dev,
+			       struct hns_roce_srq *srq, int srq_buf_size)
+{
+	kvfree(srq->wrid);
+	hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
+	hns_roce_buf_free(hr_dev, srq->idx_que.buf_size, &srq->idx_que.idx_buf);
+	kfree(srq->idx_que.bitmap);
+	hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
+	hns_roce_buf_free(hr_dev, srq_buf_size, &srq->buf);
+}
+
+int hns_roce_create_srq(struct ib_srq *ib_srq,
+			struct ib_srq_init_attr *srq_init_attr,
+			struct ib_udata *udata)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ib_srq->device);
+	struct hns_roce_ib_create_srq_resp resp = {};
+	struct hns_roce_srq *srq = to_hr_srq(ib_srq);
+	int srq_desc_size;
+	int srq_buf_size;
+	int ret = 0;
+	u32 cqn;
+
+	/* Check the actual SRQ wqe and SRQ sge num */
+	if (srq_init_attr->attr.max_wr >= hr_dev->caps.max_srq_wrs ||
+	    srq_init_attr->attr.max_sge > hr_dev->caps.max_srq_sges)
+		return -EINVAL;
+
+	mutex_init(&srq->mutex);
+	spin_lock_init(&srq->lock);
+
+	srq->max = roundup_pow_of_two(srq_init_attr->attr.max_wr + 1);
+	srq->max_gs = srq_init_attr->attr.max_sge;
+
+	srq_desc_size = roundup_pow_of_two(max(16, 16 * srq->max_gs));
+
+	srq->wqe_shift = ilog2(srq_desc_size);
+
+	srq_buf_size = srq->max * srq_desc_size;
+
+	srq->idx_que.entry_sz = HNS_ROCE_IDX_QUE_ENTRY_SZ;
+	srq->idx_que.buf_size = srq->max * srq->idx_que.entry_sz;
+	srq->mtt.mtt_type = MTT_TYPE_SRQWQE;
+	srq->idx_que.mtt.mtt_type = MTT_TYPE_IDX;
+
+	if (udata) {
+		ret = create_user_srq(srq, udata, srq_buf_size);
+		if (ret) {
+			dev_err(hr_dev->dev, "Create user srq failed\n");
+			goto err_srq;
+		}
+	} else {
+		ret = create_kernel_srq(srq, srq_buf_size);
+		if (ret) {
+			dev_err(hr_dev->dev, "Create kernel srq failed\n");
+			goto err_srq;
+		}
+	}
+
+	cqn = ib_srq_has_cq(srq_init_attr->srq_type) ?
+	      to_hr_cq(srq_init_attr->ext.cq)->cqn : 0;
+
+	srq->db_reg_l = hr_dev->reg_base + SRQ_DB_REG;
+
+	ret = hns_roce_srq_alloc(hr_dev, to_hr_pd(ib_srq->pd)->pdn, cqn, 0,
+				 &srq->mtt, 0, srq);
+	if (ret)
+		goto err_wrid;
+
+	srq->event = hns_roce_ib_srq_event;
+	resp.srqn = srq->srqn;
+
+	if (udata) {
+		if (ib_copy_to_udata(udata, &resp,
+				     min(udata->outlen, sizeof(resp)))) {
+			ret = -EFAULT;
+			goto err_srqc_alloc;
+		}
+	}
+
+	return 0;
+
+err_srqc_alloc:
+	hns_roce_srq_free(hr_dev, srq);
+
+err_wrid:
+	if (udata)
+		destroy_user_srq(hr_dev, srq);
+	else
+		destroy_kernel_srq(hr_dev, srq, srq_buf_size);
+
+err_srq:
+	return ret;
+}
+
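
The SRQ sizing in hns_roce_create_srq() rounds the WR count and the per-WQE descriptor size up to powers of two and derives wqe_shift from the latter. Reproduced stand-alone with sample numbers:

#include <stdio.h>

static unsigned int roundup_pow_of_two_u(unsigned int n)
{
	unsigned int r = 1;

	while (r < n)
		r <<= 1;
	return r;
}

static unsigned int ilog2_u(unsigned int n)
{
	unsigned int s = 0;

	while (n > 1) {
		n >>= 1;
		s++;
	}
	return s;
}

int main(void)
{
	unsigned int max_wr = 100, max_sge = 3;
	unsigned int srq_max = roundup_pow_of_two_u(max_wr + 1);	/* 128 */
	unsigned int desc_size =
		roundup_pow_of_two_u(16 * max_sge > 16 ? 16 * max_sge : 16); /* 64 */
	unsigned int wqe_shift = ilog2_u(desc_size);			/* 6 */
	unsigned int buf_size = srq_max * desc_size;			/* 8192 */

	printf("max=%u desc=%u shift=%u buf=%u\n",
	       srq_max, desc_size, wqe_shift, buf_size);
	return 0;
}
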
+void hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
+	struct hns_roce_srq *srq = to_hr_srq(ibsrq);
+
+	hns_roce_srq_free(hr_dev, srq);
+	hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
+
+	if (udata) {
+		hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
+	} else {
+		kvfree(srq->wrid);
+		hns_roce_buf_free(hr_dev, srq->max << srq->wqe_shift,
+				  &srq->buf);
+	}
+	ib_umem_release(srq->idx_que.umem);
+	ib_umem_release(srq->umem);
+}
+
+int hns_roce_init_srq_table(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_srq_table *srq_table = &hr_dev->srq_table;
+
+	xa_init(&srq_table->xa);
+
+	return hns_roce_bitmap_init(&srq_table->bitmap, hr_dev->caps.num_srqs,
+				    hr_dev->caps.num_srqs - 1,
+				    hr_dev->caps.reserved_srqs, 0);
+}
+
+void hns_roce_cleanup_srq_table(struct hns_roce_dev *hr_dev)
+{
+	hns_roce_bitmap_cleanup(&hr_dev->srq_table.bitmap);
+}
diff --git a/drivers/infiniband/hw/i40iw/Kconfig b/drivers/infiniband/hw/i40iw/Kconfig
index d867ef1..e4b45f4 100644
--- a/drivers/infiniband/hw/i40iw/Kconfig
+++ b/drivers/infiniband/hw/i40iw/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_I40IW
 	tristate "Intel(R) Ethernet X722 iWARP Driver"
 	depends on INET && I40E
diff --git a/drivers/infiniband/hw/i40iw/Makefile b/drivers/infiniband/hw/i40iw/Makefile
index 5a8a7a3..8942f82 100644
--- a/drivers/infiniband/hw/i40iw/Makefile
+++ b/drivers/infiniband/hw/i40iw/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-ccflags-y :=  -Idrivers/net/ethernet/intel/i40e
+ccflags-y :=  -I $(srctree)/drivers/net/ethernet/intel/i40e
 
 obj-$(CONFIG_INFINIBAND_I40IW) += i40iw.o
 
diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h
index 2f2b442..8feec35 100644
--- a/drivers/infiniband/hw/i40iw/i40iw.h
+++ b/drivers/infiniband/hw/i40iw/i40iw.h
@@ -552,7 +552,7 @@
 
 void i40iw_request_reset(struct i40iw_device *iwdev);
 void i40iw_destroy_rdma_device(struct i40iw_ib_device *iwibdev);
-void i40iw_setup_cm_core(struct i40iw_device *iwdev);
+int i40iw_setup_cm_core(struct i40iw_device *iwdev);
 void i40iw_cleanup_cm_core(struct i40iw_cm_core *cm_core);
 void i40iw_process_ceq(struct i40iw_device *, struct i40iw_ceq *iwceq);
 void i40iw_process_aeq(struct i40iw_device *);
diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c
index 423818a..2d6a378 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
@@ -404,7 +404,7 @@
 	if (pdata)
 		pd_len = pdata->size;
 
-	if (cm_node->vlan_id < VLAN_TAG_PRESENT)
+	if (cm_node->vlan_id <= VLAN_VID_MASK)
 		eth_hlen += 4;
 
 	if (cm_node->ipv4)
@@ -433,7 +433,7 @@
 
 		ether_addr_copy(ethh->h_dest, cm_node->rem_mac);
 		ether_addr_copy(ethh->h_source, cm_node->loc_mac);
-		if (cm_node->vlan_id < VLAN_TAG_PRESENT) {
+		if (cm_node->vlan_id <= VLAN_VID_MASK) {
 			((struct vlan_ethhdr *)ethh)->h_vlan_proto = htons(ETH_P_8021Q);
 			vtag = (cm_node->user_pri << VLAN_PRIO_SHIFT) | cm_node->vlan_id;
 			((struct vlan_ethhdr *)ethh)->h_vlan_TCI = htons(vtag);
@@ -463,7 +463,7 @@
 
 		ether_addr_copy(ethh->h_dest, cm_node->rem_mac);
 		ether_addr_copy(ethh->h_source, cm_node->loc_mac);
-		if (cm_node->vlan_id < VLAN_TAG_PRESENT) {
+		if (cm_node->vlan_id <= VLAN_VID_MASK) {
 			((struct vlan_ethhdr *)ethh)->h_vlan_proto = htons(ETH_P_8021Q);
 			vtag = (cm_node->user_pri << VLAN_PRIO_SHIFT) | cm_node->vlan_id;
 			((struct vlan_ethhdr *)ethh)->h_vlan_TCI = htons(vtag);
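
VLAN_TAG_PRESENT was dropped upstream, so "does this node carry a VLAN header" is now expressed directly as "is vlan_id a valid 12-bit VID". Illustration with stand-in constants; VLAN_VID_MASK is 0x0fff in <linux/if_vlan.h>, and the driver marks "no VLAN" with a sentinel above the VID range:

#include <stdio.h>

#define DEMO_VLAN_VID_MASK	0x0fff
#define DEMO_NO_VLAN		0xffff	/* illustrative sentinel */

static int demo_needs_vlan_header(unsigned short vlan_id)
{
	return vlan_id <= DEMO_VLAN_VID_MASK;
}

int main(void)
{
	printf("vid 5   -> %d\n", demo_needs_vlan_header(5));		 /* 1 */
	printf("no vlan -> %d\n", demo_needs_vlan_header(DEMO_NO_VLAN)); /* 0 */
	return 0;
}
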
@@ -1689,7 +1689,7 @@
 	unsigned long flags;
 
 	rtnl_lock();
-	for_each_netdev_rcu(&init_net, ip_dev) {
+	for_each_netdev(&init_net, ip_dev) {
 		if ((((rdma_vlan_dev_vlan_id(ip_dev) < I40IW_NO_VLAN) &&
 		      (rdma_vlan_dev_real_dev(ip_dev) == iwdev->netdev)) ||
 		     (ip_dev == iwdev->netdev)) && (ip_dev->flags & IFF_UP)) {
@@ -1773,8 +1773,11 @@
 		if ((((rdma_vlan_dev_vlan_id(dev) < I40IW_NO_VLAN) &&
 		      (rdma_vlan_dev_real_dev(dev) == iwdev->netdev)) ||
 		    (dev == iwdev->netdev)) && (dev->flags & IFF_UP)) {
+			const struct in_ifaddr *ifa;
+
 			idev = in_dev_get(dev);
-			for_ifa(idev) {
+
+			in_dev_for_each_ifa_rtnl(ifa, idev) {
 				i40iw_debug(&iwdev->sc_dev,
 					    I40IW_DEBUG_CM,
 					    "Allocating child CM Listener forIP=%pI4, vlan_id=%d, MAC=%pM\n",
@@ -1819,7 +1822,7 @@
 					cm_parent_listen_node->cm_core->stats_listen_nodes_created--;
 				}
 			}
-			endfor_ifa(idev);
+
 			in_dev_put(idev);
 		}
 	}
@@ -3237,7 +3240,7 @@
  * core
  * @iwdev: iwarp device structure
  */
-void i40iw_setup_cm_core(struct i40iw_device *iwdev)
+int i40iw_setup_cm_core(struct i40iw_device *iwdev)
 {
 	struct i40iw_cm_core *cm_core = &iwdev->cm_core;
 
@@ -3256,9 +3259,19 @@
 
 	cm_core->event_wq = alloc_ordered_workqueue("iwewq",
 						    WQ_MEM_RECLAIM);
+	if (!cm_core->event_wq)
+		goto error;
 
 	cm_core->disconn_wq = alloc_ordered_workqueue("iwdwq",
 						      WQ_MEM_RECLAIM);
+	if (!cm_core->disconn_wq)
+		goto error;
+
+	return 0;
+error:
+	i40iw_cleanup_cm_core(&iwdev->cm_core);
+
+	return -ENOMEM;
 }
 
 /**
@@ -3278,8 +3291,10 @@
 		del_timer_sync(&cm_core->tcp_timer);
 	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
 
-	destroy_workqueue(cm_core->event_wq);
-	destroy_workqueue(cm_core->disconn_wq);
+	if (cm_core->event_wq)
+		destroy_workqueue(cm_core->event_wq);
+	if (cm_core->disconn_wq)
+		destroy_workqueue(cm_core->disconn_wq);
 }
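
alloc_ordered_workqueue() can fail, so i40iw_setup_cm_core() now unwinds through i40iw_cleanup_cm_core(), which in turn must tolerate half-initialized state (hence the NULL checks before destroy_workqueue()). The same shape in stand-alone C, using free() as the NULL-tolerant destructor:

#include <stdlib.h>

struct demo_core { void *event_wq; void *disconn_wq; };

static void demo_cleanup(struct demo_core *c)
{
	free(c->event_wq);	/* free(NULL) is a no-op, like the guarded */
	free(c->disconn_wq);	/* destroy_workqueue() calls above */
}

static int demo_setup(struct demo_core *c)
{
	c->event_wq = malloc(64);
	if (!c->event_wq)
		goto error;
	c->disconn_wq = malloc(64);
	if (!c->disconn_wq)
		goto error;
	return 0;
error:
	demo_cleanup(c);
	return -1;
}

int main(void)
{
	struct demo_core core = { 0 };

	if (demo_setup(&core))
		return 1;
	demo_cleanup(&core);
	return 0;
}
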
 
 /**
@@ -3323,7 +3338,7 @@
 
 	tcp_info->flow_label = 0;
 	tcp_info->snd_mss = cpu_to_le32(((u32)cm_node->tcp_cntxt.mss));
-	if (cm_node->vlan_id < VLAN_TAG_PRESENT) {
+	if (cm_node->vlan_id <= VLAN_VID_MASK) {
 		tcp_info->insert_vlan_tag = true;
 		tcp_info->vlan_tag = cpu_to_le16(((u16)cm_node->user_pri << I40IW_VLAN_PRIO_SHIFT) |
 						  cm_node->vlan_id);
@@ -3478,7 +3493,8 @@
 		/* Need to free the Last Streaming Mode Message */
 		if (iwqp->ietf_mem.va) {
 			if (iwqp->lsmm_mr)
-				iwibdev->ibdev.dereg_mr(iwqp->lsmm_mr);
+				iwibdev->ibdev.ops.dereg_mr(iwqp->lsmm_mr,
+							    NULL);
 			i40iw_free_dma_mem(iwdev->sc_dev.hw, &iwqp->ietf_mem);
 		}
 	}
@@ -4263,11 +4279,11 @@
 	/* if not found then add a child listener if interface is going up */
 	if (!ifup)
 		return;
-	child_listen_node = kzalloc(sizeof(*child_listen_node), GFP_ATOMIC);
+	child_listen_node = kmemdup(parent_listen_node,
+			sizeof(*child_listen_node), GFP_ATOMIC);
 	if (!child_listen_node)
 		return;
 	node_allocated = true;
-	memcpy(child_listen_node, parent_listen_node, sizeof(*child_listen_node));
 
 	memcpy(child_listen_node->loc_addr, ipaddr,  ipv4 ? 4 : 16);
 
diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c
index 68095f0..d44cf33 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_main.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_main.c
@@ -1222,8 +1222,10 @@
 		if ((((rdma_vlan_dev_vlan_id(dev) < 0xFFFF) &&
 		      (rdma_vlan_dev_real_dev(dev) == iwdev->netdev)) ||
 		    (dev == iwdev->netdev)) && (dev->flags & IFF_UP)) {
+			const struct in_ifaddr *ifa;
+
 			idev = in_dev_get(dev);
-			for_ifa(idev) {
+			in_dev_for_each_ifa_rtnl(ifa, idev) {
 				i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_CM,
 					    "IP=%pI4, vlan_id=%d, MAC=%pM\n", &ifa->ifa_address,
 					     rdma_vlan_dev_vlan_id(dev), dev->dev_addr);
@@ -1235,7 +1237,7 @@
 						       true,
 						       I40IW_ARP_ADD);
 			}
-			endfor_ifa(idev);
+
 			in_dev_put(idev);
 		}
 	}
@@ -1641,7 +1643,10 @@
 	iwdev = &hdl->device;
 	iwdev->hdl = hdl;
 	dev = &iwdev->sc_dev;
-	i40iw_setup_cm_core(iwdev);
+	if (i40iw_setup_cm_core(iwdev)) {
+		kfree(iwdev->hdl);
+		return -ENOMEM;
+	}
 
 	dev->back_dev = (void *)iwdev;
 	iwdev->ldev = &hdl->ldev;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_osdep.h b/drivers/infiniband/hw/i40iw/i40iw_osdep.h
index f27be3e..d474aad 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_osdep.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_osdep.h
@@ -211,7 +211,7 @@
 struct i40iw_sc_vsi;
 void i40iw_hw_stats_start_timer(struct i40iw_sc_vsi *vsi);
 void i40iw_hw_stats_stop_timer(struct i40iw_sc_vsi *vsi);
-#define i40iw_mmiowb() mmiowb()
+#define i40iw_mmiowb() do { } while (0)
 void i40iw_wr32(struct i40iw_hw *hw, u32 reg, u32 value);
 u32  i40iw_rd32(struct i40iw_hw *hw, u32 reg);
 #endif				/* _I40IW_OSDEP_H_ */
diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c
index a9ea966..0165246 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_utils.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c
@@ -173,7 +173,16 @@
 
 		rcu_read_lock();
 		in = __in_dev_get_rcu(upper_dev);
-		local_ipaddr = ntohl(in->ifa_list->ifa_address);
+
+		local_ipaddr = 0;
+		if (in) {
+			struct in_ifaddr *ifa;
+
+			ifa = rcu_dereference(in->ifa_list);
+			if (ifa)
+				local_ipaddr = ntohl(ifa->ifa_address);
+		}
+
 		rcu_read_unlock();
 	} else {
 		local_ipaddr = ntohl(ifa->ifa_address);
@@ -185,6 +194,11 @@
 	case NETDEV_UP:
 		/* Fall through */
 	case NETDEV_CHANGEADDR:
+
+		/* Just skip if no need to handle ARP cache */
+		if (!local_ipaddr)
+			break;
+
 		i40iw_manage_arp_cache(iwdev,
 				       netdev->dev_addr,
 				       &local_ipaddr,
@@ -601,7 +615,6 @@
 	if (!atomic_dec_and_test(&iwpd->usecount))
 		return;
 	i40iw_free_resource(iwdev, iwdev->allocated_pds, iwpd->sc_pd.pd_id);
-	kfree(iwpd);
 }
 
 /**
@@ -745,8 +758,8 @@
 	if (!mem)
 		return I40IW_ERR_PARAM;
 	mem->size = ALIGN(size, alignment);
-	mem->va = dma_zalloc_coherent(&pcidev->dev, mem->size,
-				      (dma_addr_t *)&mem->pa, GFP_KERNEL);
+	mem->va = dma_alloc_coherent(&pcidev->dev, mem->size,
+				     (dma_addr_t *)&mem->pa, GFP_KERNEL);
 	if (!mem->va)
 		return I40IW_ERR_NO_MEMORY;
 	return 0;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index e2e6c74..cd9ee16 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -45,6 +45,7 @@
 #include <rdma/iw_cm.h>
 #include <rdma/ib_user_verbs.h>
 #include <rdma/ib_umem.h>
+#include <rdma/uverbs_ioctl.h>
 #include "i40iw.h"
 
 /**
@@ -96,18 +97,7 @@
 			    u8 port,
 			    struct ib_port_attr *props)
 {
-	struct i40iw_device *iwdev = to_iwdev(ibdev);
-	struct net_device *netdev = iwdev->netdev;
-
-	/* props being zeroed by the caller, avoid zeroing it here */
-	props->max_mtu = IB_MTU_4096;
-	props->active_mtu = ib_mtu_int_to_enum(netdev->mtu);
-
 	props->lid = 1;
-	if (netif_carrier_ok(iwdev->netdev))
-		props->state = IB_PORT_ACTIVE;
-	else
-		props->state = IB_PORT_DOWN;
 	props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP |
 		IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
 	props->gid_tbl_len = 1;
@@ -120,78 +110,55 @@
 
 /**
  * i40iw_alloc_ucontext - Allocate the user context data structure
- * @ibdev: device pointer from stack
+ * @uctx: Uverbs context pointer from stack
  * @udata: user data
  *
  * This keeps track of all objects associated with a particular
  * user-mode client.
  */
-static struct ib_ucontext *i40iw_alloc_ucontext(struct ib_device *ibdev,
-						struct ib_udata *udata)
+static int i40iw_alloc_ucontext(struct ib_ucontext *uctx,
+				struct ib_udata *udata)
 {
+	struct ib_device *ibdev = uctx->device;
 	struct i40iw_device *iwdev = to_iwdev(ibdev);
 	struct i40iw_alloc_ucontext_req req;
-	struct i40iw_alloc_ucontext_resp uresp;
-	struct i40iw_ucontext *ucontext;
+	struct i40iw_alloc_ucontext_resp uresp = {};
+	struct i40iw_ucontext *ucontext = to_ucontext(uctx);
 
 	if (ib_copy_from_udata(&req, udata, sizeof(req)))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (req.userspace_ver < 4 || req.userspace_ver > I40IW_ABI_VER) {
 		i40iw_pr_err("Unsupported provider library version %u.\n", req.userspace_ver);
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
-	memset(&uresp, 0, sizeof(uresp));
 	uresp.max_qps = iwdev->max_qp;
 	uresp.max_pds = iwdev->max_pd;
 	uresp.wq_size = iwdev->max_qp_wr * 2;
 	uresp.kernel_ver = req.userspace_ver;
 
-	ucontext = kzalloc(sizeof(*ucontext), GFP_KERNEL);
-	if (!ucontext)
-		return ERR_PTR(-ENOMEM);
-
 	ucontext->iwdev = iwdev;
 	ucontext->abi_ver = req.userspace_ver;
 
-	if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
-		kfree(ucontext);
-		return ERR_PTR(-EFAULT);
-	}
+	if (ib_copy_to_udata(udata, &uresp, sizeof(uresp)))
+		return -EFAULT;
 
 	INIT_LIST_HEAD(&ucontext->cq_reg_mem_list);
 	spin_lock_init(&ucontext->cq_reg_mem_list_lock);
 	INIT_LIST_HEAD(&ucontext->qp_reg_mem_list);
 	spin_lock_init(&ucontext->qp_reg_mem_list_lock);
 
-	return &ucontext->ibucontext;
+	return 0;
 }
 
 /**
  * i40iw_dealloc_ucontext - deallocate the user context data structure
  * @context: user context created during alloc
  */
-static int i40iw_dealloc_ucontext(struct ib_ucontext *context)
+static void i40iw_dealloc_ucontext(struct ib_ucontext *context)
 {
-	struct i40iw_ucontext *ucontext = to_ucontext(context);
-	unsigned long flags;
-
-	spin_lock_irqsave(&ucontext->cq_reg_mem_list_lock, flags);
-	if (!list_empty(&ucontext->cq_reg_mem_list)) {
-		spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags);
-		return -EBUSY;
-	}
-	spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags);
-	spin_lock_irqsave(&ucontext->qp_reg_mem_list_lock, flags);
-	if (!list_empty(&ucontext->qp_reg_mem_list)) {
-		spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags);
-		return -EBUSY;
-	}
-	spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags);
-
-	kfree(ucontext);
-	return 0;
+	return;
 }
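
i40iw_dealloc_ucontext() can become empty because the ib_ucontext is now embedded in the driver structure and both are allocated and freed by the RDMA core; the driver only converts between the two views, which is also what rdma_udata_to_drv_context() does under the hood. A tiny stand-alone illustration of that container_of() pattern (types invented for the sketch):

#include <stddef.h>
#include <stdio.h>

struct demo_ib_ucontext { int dummy; };

struct demo_i40iw_ucontext {
	struct demo_ib_ucontext ibucontext;
	int abi_ver;
};

#define demo_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	struct demo_i40iw_ucontext uctx = { .abi_ver = 5 };
	struct demo_ib_ucontext *ib = &uctx.ibucontext;
	struct demo_i40iw_ucontext *back =
		demo_container_of(ib, struct demo_i40iw_ucontext, ibucontext);

	printf("abi_ver via container_of: %d\n", back->abi_ver);
	return 0;
}
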
 
 /**
@@ -312,43 +279,34 @@
 
 /**
  * i40iw_alloc_pd - allocate protection domain
- * @ibdev: device pointer from stack
- * @context: user context created during alloc
+ * @pd: PD pointer
  * @udata: user data
  */
-static struct ib_pd *i40iw_alloc_pd(struct ib_device *ibdev,
-				    struct ib_ucontext *context,
-				    struct ib_udata *udata)
+static int i40iw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
-	struct i40iw_pd *iwpd;
-	struct i40iw_device *iwdev = to_iwdev(ibdev);
+	struct i40iw_pd *iwpd = to_iwpd(pd);
+	struct i40iw_device *iwdev = to_iwdev(pd->device);
 	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
 	struct i40iw_alloc_pd_resp uresp;
 	struct i40iw_sc_pd *sc_pd;
-	struct i40iw_ucontext *ucontext;
 	u32 pd_id = 0;
 	int err;
 
 	if (iwdev->closing)
-		return ERR_PTR(-ENODEV);
+		return -ENODEV;
 
 	err = i40iw_alloc_resource(iwdev, iwdev->allocated_pds,
 				   iwdev->max_pd, &pd_id, &iwdev->next_pd);
 	if (err) {
 		i40iw_pr_err("alloc resource failed\n");
-		return ERR_PTR(err);
-	}
-
-	iwpd = kzalloc(sizeof(*iwpd), GFP_KERNEL);
-	if (!iwpd) {
-		err = -ENOMEM;
-		goto free_res;
+		return err;
 	}
 
 	sc_pd = &iwpd->sc_pd;
 
-	if (context) {
-		ucontext = to_ucontext(context);
+	if (udata) {
+		struct i40iw_ucontext *ucontext = rdma_udata_to_drv_context(
+			udata, struct i40iw_ucontext, ibucontext);
 		dev->iw_pd_ops->pd_init(dev, sc_pd, pd_id, ucontext->abi_ver);
 		memset(&uresp, 0, sizeof(uresp));
 		uresp.pd_id = pd_id;
@@ -361,25 +319,24 @@
 	}
 
 	i40iw_add_pdusecount(iwpd);
-	return &iwpd->ibpd;
+	return 0;
+
 error:
-	kfree(iwpd);
-free_res:
 	i40iw_free_resource(iwdev, iwdev->allocated_pds, pd_id);
-	return ERR_PTR(err);
+	return err;
 }
 
 /**
  * i40iw_dealloc_pd - deallocate pd
  * @ibpd: ptr of pd to be deallocated
+ * @udata: user data or null for kernel object
  */
-static int i40iw_dealloc_pd(struct ib_pd *ibpd)
+static void i40iw_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
 	struct i40iw_pd *iwpd = to_iwpd(ibpd);
 	struct i40iw_device *iwdev = to_iwdev(ibpd->device);
 
 	i40iw_rem_pdusecount(iwpd, iwdev);
-	return 0;
 }
 
 /**
@@ -444,7 +401,7 @@
  * i40iw_destroy_qp - destroy qp
  * @ibqp: qp's ib pointer also to get to device's qp address
  */
-static int i40iw_destroy_qp(struct ib_qp *ibqp)
+static int i40iw_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 {
 	struct i40iw_qp *iwqp = to_iwqp(ibqp);
 
@@ -565,7 +522,8 @@
 	struct i40iw_device *iwdev = to_iwdev(ibpd->device);
 	struct i40iw_cqp *iwcqp = &iwdev->cqp;
 	struct i40iw_qp *iwqp;
-	struct i40iw_ucontext *ucontext;
+	struct i40iw_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct i40iw_ucontext, ibucontext);
 	struct i40iw_create_qp_req req;
 	struct i40iw_create_qp_resp uresp;
 	u32 qp_num = 0;
@@ -673,28 +631,25 @@
 			goto error;
 		}
 		iwqp->ctx_info.qp_compl_ctx = req.user_compl_ctx;
-		if (ibpd->uobject && ibpd->uobject->context) {
-			iwqp->user_mode = 1;
-			ucontext = to_ucontext(ibpd->uobject->context);
+		iwqp->user_mode = 1;
 
-			if (req.user_wqe_buffers) {
-				struct i40iw_pbl *iwpbl;
+		if (req.user_wqe_buffers) {
+			struct i40iw_pbl *iwpbl;
 
-				spin_lock_irqsave(
-				    &ucontext->qp_reg_mem_list_lock, flags);
-				iwpbl = i40iw_get_pbl(
-				    (unsigned long)req.user_wqe_buffers,
-				    &ucontext->qp_reg_mem_list);
-				spin_unlock_irqrestore(
-				    &ucontext->qp_reg_mem_list_lock, flags);
+			spin_lock_irqsave(
+			    &ucontext->qp_reg_mem_list_lock, flags);
+			iwpbl = i40iw_get_pbl(
+			    (unsigned long)req.user_wqe_buffers,
+			    &ucontext->qp_reg_mem_list);
+			spin_unlock_irqrestore(
+			    &ucontext->qp_reg_mem_list_lock, flags);
 
-				if (!iwpbl) {
-					err_code = -ENODATA;
-					i40iw_pr_err("no pbl info\n");
-					goto error;
-				}
-				memcpy(&iwqp->iwpbl, iwpbl, sizeof(iwqp->iwpbl));
+			if (!iwpbl) {
+				err_code = -ENODATA;
+				i40iw_pr_err("no pbl info\n");
+				goto error;
 			}
+			memcpy(&iwqp->iwpbl, iwpbl, sizeof(iwqp->iwpbl));
 		}
 		err_code = i40iw_setup_virt_qp(iwdev, iwqp, &init_info);
 	} else {
@@ -768,7 +723,7 @@
 	iwdev->qp_table[qp_num] = iwqp;
 	i40iw_add_pdusecount(iwqp->iwpd);
 	i40iw_add_devusecount(iwdev);
-	if (ibpd->uobject && udata) {
+	if (udata) {
 		memset(&uresp, 0, sizeof(uresp));
 		uresp.actual_sq_size = sq_size;
 		uresp.actual_rq_size = rq_size;
@@ -777,8 +732,8 @@
 		err_code = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
 		if (err_code) {
 			i40iw_pr_err("copy_to_udata failed\n");
-			i40iw_destroy_qp(&iwqp->ibqp);
-			   /* let the completion of the qp destroy free the qp */
+			i40iw_destroy_qp(&iwqp->ibqp, udata);
+			/* let the completion of the qp destroy free the qp */
 			return ERR_PTR(err_code);
 		}
 	}
@@ -806,6 +761,8 @@
 	struct i40iw_qp *iwqp = to_iwqp(ibqp);
 	struct i40iw_sc_qp *qp = &iwqp->sc_qp;
 
+	attr->qp_state = iwqp->ibqp_state;
+	attr->cur_qp_state = attr->qp_state;
 	attr->qp_access_flags = 0;
 	attr->cap.max_send_wr = qp->qp_uk.sq_size;
 	attr->cap.max_recv_wr = qp->qp_uk.rq_size;
@@ -1096,47 +1053,40 @@
 /**
  * i40iw_destroy_cq - destroy cq
  * @ib_cq: cq pointer
+ * @udata: user data or NULL for kernel object
  */
-static int i40iw_destroy_cq(struct ib_cq *ib_cq)
+static void i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
 	struct i40iw_cq *iwcq;
 	struct i40iw_device *iwdev;
 	struct i40iw_sc_cq *cq;
 
-	if (!ib_cq) {
-		i40iw_pr_err("ib_cq == NULL\n");
-		return 0;
-	}
-
 	iwcq = to_iwcq(ib_cq);
 	iwdev = to_iwdev(ib_cq->device);
 	cq = &iwcq->sc_cq;
 	i40iw_cq_wq_destroy(iwdev, cq);
 	cq_free_resources(iwdev, iwcq);
-	kfree(iwcq);
 	i40iw_rem_devusecount(iwdev);
-	return 0;
 }
 
 /**
  * i40iw_create_cq - create cq
- * @ibdev: device pointer from stack
+ * @ibcq: CQ allocated
  * @attr: attributes for cq
- * @context: user context created during alloc
  * @udata: user data
  */
-static struct ib_cq *i40iw_create_cq(struct ib_device *ibdev,
-				     const struct ib_cq_init_attr *attr,
-				     struct ib_ucontext *context,
-				     struct ib_udata *udata)
+static int i40iw_create_cq(struct ib_cq *ibcq,
+			   const struct ib_cq_init_attr *attr,
+			   struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	struct i40iw_device *iwdev = to_iwdev(ibdev);
-	struct i40iw_cq *iwcq;
+	struct i40iw_cq *iwcq = to_iwcq(ibcq);
 	struct i40iw_pbl *iwpbl;
 	u32 cq_num = 0;
 	struct i40iw_sc_cq *cq;
 	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_cq_init_info info;
+	struct i40iw_cq_init_info info = {};
 	enum i40iw_status_code status;
 	struct i40iw_cqp_request *cqp_request;
 	struct cqp_commands_info *cqp_info;
@@ -1146,22 +1096,16 @@
 	int entries = attr->cqe;
 
 	if (iwdev->closing)
-		return ERR_PTR(-ENODEV);
+		return -ENODEV;
 
 	if (entries > iwdev->max_cqe)
-		return ERR_PTR(-EINVAL);
-
-	iwcq = kzalloc(sizeof(*iwcq), GFP_KERNEL);
-	if (!iwcq)
-		return ERR_PTR(-ENOMEM);
-
-	memset(&info, 0, sizeof(info));
+		return -EINVAL;
 
 	err_code = i40iw_alloc_resource(iwdev, iwdev->allocated_cqs,
 					iwdev->max_cq, &cq_num,
 					&iwdev->next_cq);
 	if (err_code)
-		goto error;
+		return err_code;
 
 	cq = &iwcq->sc_cq;
 	cq->back_cq = (void *)iwcq;
@@ -1177,14 +1121,14 @@
 	info.ceq_id_valid = true;
 	info.ceqe_mask = 1;
 	info.type = I40IW_CQ_TYPE_IWARP;
-	if (context) {
-		struct i40iw_ucontext *ucontext;
+	if (udata) {
+		struct i40iw_ucontext *ucontext = rdma_udata_to_drv_context(
+			udata, struct i40iw_ucontext, ibucontext);
 		struct i40iw_create_cq_req req;
 		struct i40iw_cq_mr *cqmr;
 
 		memset(&req, 0, sizeof(req));
 		iwcq->user_mode = true;
-		ucontext = to_ucontext(context);
 		if (ib_copy_from_udata(&req, udata, sizeof(struct i40iw_create_cq_req))) {
 			err_code = -EFAULT;
 			goto cq_free_resources;
@@ -1254,7 +1198,7 @@
 		goto cq_free_resources;
 	}
 
-	if (context) {
+	if (udata) {
 		struct i40iw_create_cq_resp resp;
 
 		memset(&resp, 0, sizeof(resp));
@@ -1268,15 +1212,13 @@
 	}
 
 	i40iw_add_devusecount(iwdev);
-	return (struct ib_cq *)iwcq;
+	return 0;
 
 cq_destroy:
 	i40iw_cq_wq_destroy(iwdev, cq);
 cq_free_resources:
 	cq_free_resources(iwdev, iwcq);
-error:
-	kfree(iwcq);
-	return ERR_PTR(err_code);
+	return err_code;
 }
 
 /**
@@ -1371,58 +1313,24 @@
 {
 	struct ib_umem *region = iwmr->region;
 	struct i40iw_pbl *iwpbl = &iwmr->iwpbl;
-	int chunk_pages, entry, i;
 	struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc;
 	struct i40iw_pble_info *pinfo;
-	struct scatterlist *sg;
-	u64 pg_addr = 0;
+	struct ib_block_iter biter;
 	u32 idx = 0;
 
 	pinfo = (level == I40IW_LEVEL_1) ? NULL : palloc->level2.leaf;
 
-	for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) {
-		chunk_pages = sg_dma_len(sg) >> region->page_shift;
-		if ((iwmr->type == IW_MEMREG_TYPE_QP) &&
-		    !iwpbl->qp_mr.sq_page)
-			iwpbl->qp_mr.sq_page = sg_page(sg);
-		for (i = 0; i < chunk_pages; i++) {
-			pg_addr = sg_dma_address(sg) +
-				(i << region->page_shift);
+	if (iwmr->type == IW_MEMREG_TYPE_QP)
+		iwpbl->qp_mr.sq_page = sg_page(region->sg_head.sgl);
 
-			if ((entry + i) == 0)
-				*pbl = cpu_to_le64(pg_addr & iwmr->page_msk);
-			else if (!(pg_addr & ~iwmr->page_msk))
-				*pbl = cpu_to_le64(pg_addr);
-			else
-				continue;
-			pbl = i40iw_next_pbl_addr(pbl, &pinfo, &idx);
-		}
+	rdma_for_each_block(region->sg_head.sgl, &biter, region->nmap,
+			    iwmr->page_size) {
+		*pbl = rdma_block_iter_dma_address(&biter);
+		pbl = i40iw_next_pbl_addr(pbl, &pinfo, &idx);
 	}
 }
 
 /**
- * i40iw_set_hugetlb_params - set MR pg size and mask to huge pg values.
- * @addr: virtual address
- * @iwmr: mr pointer for this memory registration
- */
-static void i40iw_set_hugetlb_values(u64 addr, struct i40iw_mr *iwmr)
-{
-	struct vm_area_struct *vma;
-	struct hstate *h;
-
-	down_read(&current->mm->mmap_sem);
-	vma = find_vma(current->mm, addr);
-	if (vma && is_vm_hugetlb_page(vma)) {
-		h = hstate_vma(vma);
-		if (huge_page_size(h) == 0x200000) {
-			iwmr->page_size = huge_page_size(h);
-			iwmr->page_msk = huge_page_mask(h);
-		}
-	}
-	up_read(&current->mm->mmap_sem);
-}
-
-/**
  * i40iw_check_mem_contiguous - check if pbls stored in arr are contiguous
  * @arr: lvl1 pbl array
  * @npages: page count
@@ -1637,10 +1545,10 @@
  * @pd: ibpd pointer
  * @mr_type: memory for stag registration
  * @max_num_sg: max number of pages
+ * @udata: user data or NULL for kernel objects
  */
-static struct ib_mr *i40iw_alloc_mr(struct ib_pd *pd,
-				    enum ib_mr_type mr_type,
-				    u32 max_num_sg)
+static struct ib_mr *i40iw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+				    u32 max_num_sg, struct ib_udata *udata)
 {
 	struct i40iw_pd *iwpd = to_iwpd(pd);
 	struct i40iw_device *iwdev = to_iwdev(pd->device);
@@ -1833,7 +1741,8 @@
 {
 	struct i40iw_pd *iwpd = to_iwpd(pd);
 	struct i40iw_device *iwdev = to_iwdev(pd->device);
-	struct i40iw_ucontext *ucontext;
+	struct i40iw_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct i40iw_ucontext, ibucontext);
 	struct i40iw_pble_alloc *palloc;
 	struct i40iw_pbl *iwpbl;
 	struct i40iw_mr *iwmr;
@@ -1854,7 +1763,7 @@
 
 	if (length > I40IW_MAX_MR_SIZE)
 		return ERR_PTR(-EINVAL);
-	region = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+	region = ib_umem_get(udata, start, length, acc, 0);
 	if (IS_ERR(region))
 		return (struct ib_mr *)region;
 
@@ -1874,13 +1783,11 @@
 	iwmr->region = region;
 	iwmr->ibmr.pd = pd;
 	iwmr->ibmr.device = pd->device;
-	ucontext = to_ucontext(pd->uobject->context);
 
 	iwmr->page_size = PAGE_SIZE;
-	iwmr->page_msk = PAGE_MASK;
-
-	if (region->hugetlb && (req.reg_type == IW_MEMREG_TYPE_MEM))
-		i40iw_set_hugetlb_values(start, iwmr);
+	if (req.reg_type == IW_MEMREG_TYPE_MEM)
+		iwmr->page_size = ib_umem_find_best_pgsz(region, SZ_4K | SZ_2M,
+							 virt);
 
 	region_length = region->length + (start & (iwmr->page_size - 1));
 	pg_shift = ffs(iwmr->page_size) - 1;
@@ -2074,7 +1981,7 @@
  * i40iw_dereg_mr - deregister mr
  * @ib_mr: mr ptr for dereg
  */
-static int i40iw_dereg_mr(struct ib_mr *ib_mr)
+static int i40iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
 {
 	struct ib_pd *ibpd = ib_mr->pd;
 	struct i40iw_pd *iwpd = to_iwpd(ibpd);
@@ -2088,14 +1995,17 @@
 	struct cqp_commands_info *cqp_info;
 	u32 stag_idx;
 
-	if (iwmr->region)
-		ib_umem_release(iwmr->region);
+	ib_umem_release(iwmr->region);
 
 	if (iwmr->type != IW_MEMREG_TYPE_MEM) {
-		if (ibpd->uobject) {
-			struct i40iw_ucontext *ucontext;
+	/* region was freed above; a non-NULL pointer only marks a user MR */
+		if (iwmr->region) {
+			struct i40iw_ucontext *ucontext =
+				rdma_udata_to_drv_context(
+					udata,
+					struct i40iw_ucontext,
+					ibucontext);
 
-			ucontext = to_ucontext(ibpd->uobject->context);
 			i40iw_del_memlist(iwmr, ucontext);
 		}
 		if (iwpbl->pbl_allocated && iwmr->type != IW_MEMREG_TYPE_QP)
@@ -2135,46 +2045,48 @@
 }
 
 /**
- * i40iw_show_rev
+ * hw_rev_show
  */
-static ssize_t i40iw_show_rev(struct device *dev,
-			      struct device_attribute *attr, char *buf)
+static ssize_t hw_rev_show(struct device *dev,
+			   struct device_attribute *attr, char *buf)
 {
-	struct i40iw_ib_device *iwibdev = container_of(dev,
-						       struct i40iw_ib_device,
-						       ibdev.dev);
+	struct i40iw_ib_device *iwibdev =
+		rdma_device_to_drv_device(dev, struct i40iw_ib_device, ibdev);
 	u32 hw_rev = iwibdev->iwdev->sc_dev.hw_rev;
 
 	return sprintf(buf, "%x\n", hw_rev);
 }
+static DEVICE_ATTR_RO(hw_rev);
 
 /**
- * i40iw_show_hca
+ * hca_type_show
  */
-static ssize_t i40iw_show_hca(struct device *dev,
-			      struct device_attribute *attr, char *buf)
+static ssize_t hca_type_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
 {
 	return sprintf(buf, "I40IW\n");
 }
+static DEVICE_ATTR_RO(hca_type);
 
 /**
- * i40iw_show_board
+ * board_id_show
  */
-static ssize_t i40iw_show_board(struct device *dev,
-				struct device_attribute *attr,
-				char *buf)
+static ssize_t board_id_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%.*s\n", 32, "I40IW Board ID");
 }
+static DEVICE_ATTR_RO(board_id);
 
-static DEVICE_ATTR(hw_rev, S_IRUGO, i40iw_show_rev, NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, i40iw_show_hca, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, i40iw_show_board, NULL);
+static struct attribute *i40iw_dev_attributes[] = {
+	&dev_attr_hw_rev.attr,
+	&dev_attr_hca_type.attr,
+	&dev_attr_board_id.attr,
+	NULL
+};
 
-static struct device_attribute *i40iw_dev_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_hca_type,
-	&dev_attr_board_id
+static const struct attribute_group i40iw_attr_group = {
+	.attrs = i40iw_dev_attributes,
 };
 
 /**
@@ -2718,24 +2630,54 @@
 	return 0;
 }
 
-/**
- * i40iw_get_vector_affinity - report IRQ affinity mask
- * @ibdev: IB device
- * @comp_vector: completion vector index
- */
-static const struct cpumask *i40iw_get_vector_affinity(struct ib_device *ibdev,
-						       int comp_vector)
-{
-	struct i40iw_device *iwdev = to_iwdev(ibdev);
-	struct i40iw_msix_vector *msix_vec;
+static const struct ib_device_ops i40iw_dev_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_I40IW,
+	/* NOTE: Older kernels wrongly use 0 for the uverbs_abi_ver */
+	.uverbs_abi_ver = I40IW_ABI_VER,
 
-	if (iwdev->msix_shared)
-		msix_vec = &iwdev->iw_msixtbl[comp_vector];
-	else
-		msix_vec = &iwdev->iw_msixtbl[comp_vector + 1];
-
-	return irq_get_affinity_mask(msix_vec->irq);
-}
+	.alloc_hw_stats = i40iw_alloc_hw_stats,
+	.alloc_mr = i40iw_alloc_mr,
+	.alloc_pd = i40iw_alloc_pd,
+	.alloc_ucontext = i40iw_alloc_ucontext,
+	.create_cq = i40iw_create_cq,
+	.create_qp = i40iw_create_qp,
+	.dealloc_pd = i40iw_dealloc_pd,
+	.dealloc_ucontext = i40iw_dealloc_ucontext,
+	.dereg_mr = i40iw_dereg_mr,
+	.destroy_cq = i40iw_destroy_cq,
+	.destroy_qp = i40iw_destroy_qp,
+	.drain_rq = i40iw_drain_rq,
+	.drain_sq = i40iw_drain_sq,
+	.get_dev_fw_str = i40iw_get_dev_fw_str,
+	.get_dma_mr = i40iw_get_dma_mr,
+	.get_hw_stats = i40iw_get_hw_stats,
+	.get_port_immutable = i40iw_port_immutable,
+	.iw_accept = i40iw_accept,
+	.iw_add_ref = i40iw_add_ref,
+	.iw_connect = i40iw_connect,
+	.iw_create_listen = i40iw_create_listen,
+	.iw_destroy_listen = i40iw_destroy_listen,
+	.iw_get_qp = i40iw_get_qp,
+	.iw_reject = i40iw_reject,
+	.iw_rem_ref = i40iw_rem_ref,
+	.map_mr_sg = i40iw_map_mr_sg,
+	.mmap = i40iw_mmap,
+	.modify_qp = i40iw_modify_qp,
+	.poll_cq = i40iw_poll_cq,
+	.post_recv = i40iw_post_recv,
+	.post_send = i40iw_post_send,
+	.query_device = i40iw_query_device,
+	.query_gid = i40iw_query_gid,
+	.query_pkey = i40iw_query_pkey,
+	.query_port = i40iw_query_port,
+	.query_qp = i40iw_query_qp,
+	.reg_user_mr = i40iw_reg_user_mr,
+	.req_notify_cq = i40iw_req_notify_cq,
+	INIT_RDMA_OBJ_SIZE(ib_pd, i40iw_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_cq, i40iw_cq, ibcq),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, i40iw_ucontext, ibucontext),
+};
 
 /**
  * i40iw_init_rdma_device - initialization of iwarp device
@@ -2747,13 +2689,11 @@
 	struct net_device *netdev = iwdev->netdev;
 	struct pci_dev *pcidev = (struct pci_dev *)iwdev->hw.dev_context;
 
-	iwibdev = (struct i40iw_ib_device *)ib_alloc_device(sizeof(*iwibdev));
+	iwibdev = ib_alloc_device(i40iw_ib_device, ibdev);
 	if (!iwibdev) {
 		i40iw_pr_err("iwdev == NULL\n");
 		return NULL;
 	}
-	strlcpy(iwibdev->ibdev.name, "i40iw%d", IB_DEVICE_NAME_MAX);
-	iwibdev->ibdev.owner = THIS_MODULE;
 	iwdev->iwibdev = iwibdev;
 	iwibdev->iwdev = iwdev;
 
@@ -2784,53 +2724,9 @@
 	iwibdev->ibdev.phys_port_cnt = 1;
 	iwibdev->ibdev.num_comp_vectors = iwdev->ceqs_count;
 	iwibdev->ibdev.dev.parent = &pcidev->dev;
-	iwibdev->ibdev.query_port = i40iw_query_port;
-	iwibdev->ibdev.query_pkey = i40iw_query_pkey;
-	iwibdev->ibdev.query_gid = i40iw_query_gid;
-	iwibdev->ibdev.alloc_ucontext = i40iw_alloc_ucontext;
-	iwibdev->ibdev.dealloc_ucontext = i40iw_dealloc_ucontext;
-	iwibdev->ibdev.mmap = i40iw_mmap;
-	iwibdev->ibdev.alloc_pd = i40iw_alloc_pd;
-	iwibdev->ibdev.dealloc_pd = i40iw_dealloc_pd;
-	iwibdev->ibdev.create_qp = i40iw_create_qp;
-	iwibdev->ibdev.modify_qp = i40iw_modify_qp;
-	iwibdev->ibdev.query_qp = i40iw_query_qp;
-	iwibdev->ibdev.destroy_qp = i40iw_destroy_qp;
-	iwibdev->ibdev.create_cq = i40iw_create_cq;
-	iwibdev->ibdev.destroy_cq = i40iw_destroy_cq;
-	iwibdev->ibdev.get_dma_mr = i40iw_get_dma_mr;
-	iwibdev->ibdev.reg_user_mr = i40iw_reg_user_mr;
-	iwibdev->ibdev.dereg_mr = i40iw_dereg_mr;
-	iwibdev->ibdev.alloc_hw_stats = i40iw_alloc_hw_stats;
-	iwibdev->ibdev.get_hw_stats = i40iw_get_hw_stats;
-	iwibdev->ibdev.query_device = i40iw_query_device;
-	iwibdev->ibdev.drain_sq = i40iw_drain_sq;
-	iwibdev->ibdev.drain_rq = i40iw_drain_rq;
-	iwibdev->ibdev.alloc_mr = i40iw_alloc_mr;
-	iwibdev->ibdev.map_mr_sg = i40iw_map_mr_sg;
-	iwibdev->ibdev.iwcm = kzalloc(sizeof(*iwibdev->ibdev.iwcm), GFP_KERNEL);
-	if (!iwibdev->ibdev.iwcm) {
-		ib_dealloc_device(&iwibdev->ibdev);
-		return NULL;
-	}
-
-	iwibdev->ibdev.iwcm->add_ref = i40iw_add_ref;
-	iwibdev->ibdev.iwcm->rem_ref = i40iw_rem_ref;
-	iwibdev->ibdev.iwcm->get_qp = i40iw_get_qp;
-	iwibdev->ibdev.iwcm->connect = i40iw_connect;
-	iwibdev->ibdev.iwcm->accept = i40iw_accept;
-	iwibdev->ibdev.iwcm->reject = i40iw_reject;
-	iwibdev->ibdev.iwcm->create_listen = i40iw_create_listen;
-	iwibdev->ibdev.iwcm->destroy_listen = i40iw_destroy_listen;
-	memcpy(iwibdev->ibdev.iwcm->ifname, netdev->name,
-	       sizeof(iwibdev->ibdev.iwcm->ifname));
-	iwibdev->ibdev.get_port_immutable   = i40iw_port_immutable;
-	iwibdev->ibdev.get_dev_fw_str       = i40iw_get_dev_fw_str;
-	iwibdev->ibdev.poll_cq = i40iw_poll_cq;
-	iwibdev->ibdev.req_notify_cq = i40iw_req_notify_cq;
-	iwibdev->ibdev.post_send = i40iw_post_send;
-	iwibdev->ibdev.post_recv = i40iw_post_recv;
-	iwibdev->ibdev.get_vector_affinity = i40iw_get_vector_affinity;
+	memcpy(iwibdev->ibdev.iw_ifname, netdev->name,
+	       sizeof(iwibdev->ibdev.iw_ifname));
+	ib_set_device_ops(&iwibdev->ibdev, &i40iw_dev_ops);
 
 	return iwibdev;
 }
@@ -2851,31 +2747,12 @@
 }
 
 /**
- * i40iw_unregister_rdma_device - unregister of iwarp from IB
- * @iwibdev: rdma device ptr
- */
-static void i40iw_unregister_rdma_device(struct i40iw_ib_device *iwibdev)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(i40iw_dev_attributes); ++i)
-		device_remove_file(&iwibdev->ibdev.dev,
-				   i40iw_dev_attributes[i]);
-	ib_unregister_device(&iwibdev->ibdev);
-}
-
-/**
  * i40iw_destroy_rdma_device - destroy rdma device and free resources
  * @iwibdev: IB device ptr
  */
 void i40iw_destroy_rdma_device(struct i40iw_ib_device *iwibdev)
 {
-	if (!iwibdev)
-		return;
-
-	i40iw_unregister_rdma_device(iwibdev);
-	kfree(iwibdev->ibdev.iwcm);
-	iwibdev->ibdev.iwcm = NULL;
+	ib_unregister_device(&iwibdev->ibdev);
 	wait_event_timeout(iwibdev->iwdev->close_wq,
 			   !atomic64_read(&iwibdev->iwdev->use_count),
 			   I40IW_EVENT_TIMEOUT);
@@ -2888,36 +2765,24 @@
  */
 int i40iw_register_rdma_device(struct i40iw_device *iwdev)
 {
-	int i, ret;
+	int ret;
 	struct i40iw_ib_device *iwibdev;
 
 	iwdev->iwibdev = i40iw_init_rdma_device(iwdev);
 	if (!iwdev->iwibdev)
 		return -ENOMEM;
 	iwibdev = iwdev->iwibdev;
-
-	iwibdev->ibdev.driver_id = RDMA_DRIVER_I40IW;
-	ret = ib_register_device(&iwibdev->ibdev, NULL);
+	rdma_set_device_sysfs_group(&iwibdev->ibdev, &i40iw_attr_group);
+	ret = ib_device_set_netdev(&iwibdev->ibdev, iwdev->netdev, 1);
 	if (ret)
 		goto error;
 
-	for (i = 0; i < ARRAY_SIZE(i40iw_dev_attributes); ++i) {
-		ret =
-		    device_create_file(&iwibdev->ibdev.dev,
-				       i40iw_dev_attributes[i]);
-		if (ret) {
-			while (i > 0) {
-				i--;
-				device_remove_file(&iwibdev->ibdev.dev, i40iw_dev_attributes[i]);
-			}
-			ib_unregister_device(&iwibdev->ibdev);
-			goto error;
-		}
-	}
+	ret = ib_register_device(&iwibdev->ibdev, "i40iw%d");
+	if (ret)
+		goto error;
+
 	return 0;
 error:
-	kfree(iwdev->iwibdev->ibdev.iwcm);
-	iwdev->iwibdev->ibdev.iwcm = NULL;
 	ib_dealloc_device(&iwdev->iwibdev->ibdev);
 	return ret;
 }
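
Most of the i40iw churn in this file comes from the ib_device_ops conversion: the verbs are listed once in a const ops table, ib_set_device_ops() copies them into the ib_device, and INIT_RDMA_OBJ_SIZE() tells the core how large the driver's pd/cq/ucontext wrappers are so the core can allocate and free them itself. A minimal sketch of that pattern, assuming an invented demo_pd object and demo_* callbacks:

#include <linux/module.h>
#include <rdma/ib_verbs.h>

/* Sketch: a driver PD wrapping the core object; the core allocates it. */
struct demo_pd {
	struct ib_pd ibpd;
	u32 pdn;		/* driver-private state */
};

static int demo_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
{
	/* 'struct demo_pd' was already allocated by the core; just set it up. */
	struct demo_pd *pd = container_of(ibpd, struct demo_pd, ibpd);

	pd->pdn = 0;
	return 0;
}

static void demo_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
{
	/* Nothing to free: the core releases the embedded object afterwards. */
}

static const struct ib_device_ops demo_dev_ops = {
	.owner = THIS_MODULE,
	.alloc_pd = demo_alloc_pd,
	.dealloc_pd = demo_dealloc_pd,
	/* tells the core sizeof(struct demo_pd) and the offset of ibpd */
	INIT_RDMA_OBJ_SIZE(ib_pd, demo_pd, ibpd),
};

/* At probe time: ib_set_device_ops(&ibdev, &demo_dev_ops); */
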
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.h b/drivers/infiniband/hw/i40iw/i40iw_verbs.h
index 76cf173..3a41375 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.h
@@ -94,8 +94,7 @@
 	struct ib_umem *region;
 	u16 type;
 	u32 page_cnt;
-	u32 page_size;
-	u64 page_msk;
+	u64 page_size;
 	u32 npages;
 	u32 stag;
 	u64 length;
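
page_size becomes a u64 now that reg_user_mr asks ib_umem_find_best_pgsz() to pick a 4K or 2M page size, and i40iw_copy_user_pgaddrs() walks the umem at that granularity with the core block iterator. A hedged sketch of that iterator, with demo_fill_pbl() and its arguments invented for illustration:

#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>

/*
 * Sketch only: walk a registered umem in fixed-size blocks and record each
 * block's DMA address, as i40iw_copy_user_pgaddrs() now does.
 */
static void demo_fill_pbl(struct ib_umem *umem, u64 *pbl, unsigned long pg_sz)
{
	struct ib_block_iter biter;

	rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap, pg_sz)
		*pbl++ = rdma_block_iter_dma_address(&biter);
}
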
diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig
index db4aa13..cc7c42f 100644
--- a/drivers/infiniband/hw/mlx4/Kconfig
+++ b/drivers/infiniband/hw/mlx4/Kconfig
@@ -1,7 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config MLX4_INFINIBAND
 	tristate "Mellanox ConnectX HCA support"
 	depends on NETDEVICES && ETHERNET && PCI && INET
-	depends on MAY_USE_DEVLINK
 	select NET_VENDOR_MELLANOX
 	select MLX4_CORE
 	---help---
diff --git a/drivers/infiniband/hw/mlx4/Makefile b/drivers/infiniband/hw/mlx4/Makefile
index f4213b3..7b6757b 100644
--- a/drivers/infiniband/hw/mlx4/Makefile
+++ b/drivers/infiniband/hw/mlx4/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_MLX4_INFINIBAND)	+= mlx4_ib.o
 
 mlx4_ib-y :=	ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o alias_GUID.o sysfs.o
diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
index e9e3a6f..02a169f 100644
--- a/drivers/infiniband/hw/mlx4/ah.c
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -40,13 +40,12 @@
 
 #include "mlx4_ib.h"
 
-static struct ib_ah *create_ib_ah(struct ib_pd *pd,
-				  struct rdma_ah_attr *ah_attr,
-				  struct mlx4_ib_ah *ah)
+static void create_ib_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr)
 {
-	struct mlx4_dev *dev = to_mdev(pd->device)->dev;
+	struct mlx4_ib_ah *ah = to_mah(ib_ah);
+	struct mlx4_dev *dev = to_mdev(ib_ah->device)->dev;
 
-	ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn |
+	ah->av.ib.port_pd = cpu_to_be32(to_mpd(ib_ah->pd)->pdn |
 			    (rdma_ah_get_port_num(ah_attr) << 24));
 	ah->av.ib.g_slid  = rdma_ah_get_path_bits(ah_attr);
 	ah->av.ib.sl_tclass_flowlabel =
@@ -73,15 +72,12 @@
 			--static_rate;
 		ah->av.ib.stat_rate = static_rate;
 	}
-
-	return &ah->ibah;
 }
 
-static struct ib_ah *create_iboe_ah(struct ib_pd *pd,
-				    struct rdma_ah_attr *ah_attr,
-				    struct mlx4_ib_ah *ah)
+static int create_iboe_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr)
 {
-	struct mlx4_ib_dev *ibdev = to_mdev(pd->device);
+	struct mlx4_ib_dev *ibdev = to_mdev(ib_ah->device);
+	struct mlx4_ib_ah *ah = to_mah(ib_ah);
 	const struct ib_gid_attr *gid_attr;
 	struct mlx4_dev *dev = ibdev->dev;
 	int is_mcast = 0;
@@ -103,12 +99,14 @@
 	 */
 	gid_attr = ah_attr->grh.sgid_attr;
 	if (gid_attr) {
-		if (is_vlan_dev(gid_attr->ndev))
-			vlan_tag = vlan_dev_vlan_id(gid_attr->ndev);
-		memcpy(ah->av.eth.s_mac, gid_attr->ndev->dev_addr, ETH_ALEN);
+		ret = rdma_read_gid_l2_fields(gid_attr, &vlan_tag,
+					      &ah->av.eth.s_mac[0]);
+		if (ret)
+			return ret;
+
 		ret = mlx4_ib_gid_index_to_real_index(ibdev, gid_attr);
 		if (ret < 0)
-			return ERR_PTR(ret);
+			return ret;
 		ah->av.eth.gid_index = ret;
 	} else {
 		/* mlx4_ib_create_ah_slave fills in the s_mac and the vlan */
@@ -117,7 +115,7 @@
 
 	if (vlan_tag < 0x1000)
 		vlan_tag |= (rdma_ah_get_sl(ah_attr) & 7) << 13;
-	ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn |
+	ah->av.eth.port_pd = cpu_to_be32(to_mpd(ib_ah->pd)->pdn |
 					 (rdma_ah_get_port_num(ah_attr) << 24));
 	ah->av.eth.vlan = cpu_to_be16(vlan_tag);
 	ah->av.eth.hop_limit = grh->hop_limit;
@@ -140,63 +138,45 @@
 	memcpy(ah->av.eth.dgid, grh->dgid.raw, 16);
 	ah->av.eth.sl_tclass_flowlabel |= cpu_to_be32(rdma_ah_get_sl(ah_attr)
 						      << 29);
-	return &ah->ibah;
+	return 0;
 }
 
-struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
-				struct ib_udata *udata)
+int mlx4_ib_create_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr,
+		      u32 flags, struct ib_udata *udata)
 
 {
-	struct mlx4_ib_ah *ah;
-	struct ib_ah *ret;
-
-	ah = kzalloc(sizeof *ah, GFP_ATOMIC);
-	if (!ah)
-		return ERR_PTR(-ENOMEM);
-
 	if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) {
-		if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) {
-			ret = ERR_PTR(-EINVAL);
-		} else {
-			/*
-			 * TBD: need to handle the case when we get
-			 * called in an atomic context and there we
-			 * might sleep.  We don't expect this
-			 * currently since we're working with link
-			 * local addresses which we can translate
-			 * without going to sleep.
-			 */
-			ret = create_iboe_ah(pd, ah_attr, ah);
-		}
+		if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH))
+			return -EINVAL;
+		/*
+		 * TBD: need to handle the case when we get
+		 * called in an atomic context and there we
+		 * might sleep.  We don't expect this
+		 * currently since we're working with link
+		 * local addresses which we can translate
+		 * without going to sleep.
+		 */
+		return create_iboe_ah(ib_ah, ah_attr);
+	}
 
-		if (IS_ERR(ret))
-			kfree(ah);
-
-		return ret;
-	} else
-		return create_ib_ah(pd, ah_attr, ah); /* never fails */
+	create_ib_ah(ib_ah, ah_attr);
+	return 0;
 }
 
-/* AH's created via this call must be free'd by mlx4_ib_destroy_ah. */
-struct ib_ah *mlx4_ib_create_ah_slave(struct ib_pd *pd,
-				      struct rdma_ah_attr *ah_attr,
-				      int slave_sgid_index, u8 *s_mac,
-				      u16 vlan_tag)
+int mlx4_ib_create_ah_slave(struct ib_ah *ah, struct rdma_ah_attr *ah_attr,
+			    int slave_sgid_index, u8 *s_mac, u16 vlan_tag)
 {
 	struct rdma_ah_attr slave_attr = *ah_attr;
-	struct mlx4_ib_ah *mah;
-	struct ib_ah *ah;
+	struct mlx4_ib_ah *mah = to_mah(ah);
+	int ret;
 
 	slave_attr.grh.sgid_attr = NULL;
 	slave_attr.grh.sgid_index = slave_sgid_index;
-	ah = mlx4_ib_create_ah(pd, &slave_attr, NULL);
-	if (IS_ERR(ah))
-		return ah;
+	ret = mlx4_ib_create_ah(ah, &slave_attr, 0, NULL);
+	if (ret)
+		return ret;
 
-	ah->device = pd->device;
-	ah->pd = pd;
 	ah->type = ah_attr->type;
-	mah = to_mah(ah);
 
 	/* get rid of force-loopback bit */
 	mah->av.ib.port_pd &= cpu_to_be32(0x7FFFFFFF);
@@ -208,7 +188,7 @@
 		vlan_tag |= (rdma_ah_get_sl(ah_attr) & 7) << 13;
 	mah->av.eth.vlan = cpu_to_be16(vlan_tag);
 
-	return ah;
+	return 0;
 }
 
 int mlx4_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr)
@@ -250,8 +230,7 @@
 	return 0;
 }
 
-int mlx4_ib_destroy_ah(struct ib_ah *ah)
+void mlx4_ib_destroy_ah(struct ib_ah *ah, u32 flags)
 {
-	kfree(to_mah(ah));
-	return 0;
+	return;
 }
diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c
index 155b4df..cca414e 100644
--- a/drivers/infiniband/hw/mlx4/alias_GUID.c
+++ b/drivers/infiniband/hw/mlx4/alias_GUID.c
@@ -310,7 +310,7 @@
 	if (status) {
 		pr_debug("(port: %d) failed: status = %d\n",
 			 cb_ctx->port, status);
-		rec->time_to_run = ktime_get_boot_ns() + 1 * NSEC_PER_SEC;
+		rec->time_to_run = ktime_get_boottime_ns() + 1 * NSEC_PER_SEC;
 		goto out;
 	}
 
@@ -416,7 +416,7 @@
 			 be64_to_cpu((__force __be64)rec->guid_indexes),
 			 be64_to_cpu((__force __be64)applied_guid_indexes),
 			 be64_to_cpu((__force __be64)declined_guid_indexes));
-		rec->time_to_run = ktime_get_boot_ns() +
+		rec->time_to_run = ktime_get_boottime_ns() +
 			resched_delay_sec * NSEC_PER_SEC;
 	} else {
 		rec->status = MLX4_GUID_INFO_STATUS_SET;
@@ -709,7 +709,7 @@
 		}
 	}
 	if (resched_delay_sec) {
-		u64 curr_time = ktime_get_boot_ns();
+		u64 curr_time = ktime_get_boottime_ns();
 
 		*resched_delay_sec = (low_record_time < curr_time) ? 0 :
 			div_u64((low_record_time - curr_time), NSEC_PER_SEC);
@@ -804,8 +804,8 @@
 	unsigned long flags;
 
 	for (i = 0 ; i < dev->num_ports; i++) {
-		cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work);
 		det = &sriov->alias_guid.ports_guid[i];
+		cancel_delayed_work_sync(&det->alias_guid_work);
 		spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags);
 		while (!list_empty(&det->cb_list)) {
 			cb_ctx = list_entry(det->cb_list.next,
@@ -849,7 +849,7 @@
 	spin_lock_init(&dev->sriov.alias_guid.ag_work_lock);
 
 	for (i = 1; i <= dev->num_ports; ++i) {
-		if (dev->ib_dev.query_gid(&dev->ib_dev , i, 0, &gid)) {
+		if (dev->ib_dev.ops.query_gid(&dev->ib_dev, i, 0, &gid)) {
 			ret = -EFAULT;
 			goto err_unregister;
 		}
diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c
index fedaf82..ecd6cad 100644
--- a/drivers/infiniband/hw/mlx4/cm.c
+++ b/drivers/infiniband/hw/mlx4/cm.c
@@ -39,7 +39,7 @@
 
 #include "mlx4_ib.h"
 
-#define CM_CLEANUP_CACHE_TIMEOUT  (5 * HZ)
+#define CM_CLEANUP_CACHE_TIMEOUT  (30 * HZ)
 
 struct id_map_entry {
 	struct rb_node node;
@@ -168,20 +168,17 @@
 {
 	struct delayed_work *delay = to_delayed_work(work);
 	struct id_map_entry *ent = container_of(delay, struct id_map_entry, timeout);
-	struct id_map_entry *db_ent, *found_ent;
+	struct id_map_entry *found_ent;
 	struct mlx4_ib_dev *dev = ent->dev;
 	struct mlx4_ib_sriov *sriov = &dev->sriov;
 	struct rb_root *sl_id_map = &sriov->sl_id_map;
-	int pv_id = (int) ent->pv_cm_id;
 
 	spin_lock(&sriov->id_map_lock);
-	db_ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_id);
-	if (!db_ent)
+	if (!xa_erase(&sriov->pv_id_table, ent->pv_cm_id))
 		goto out;
 	found_ent = id_map_find_by_sl_id(&dev->ib_dev, ent->slave_id, ent->sl_cm_id);
 	if (found_ent && found_ent == ent)
 		rb_erase(&found_ent->node, sl_id_map);
-	idr_remove(&sriov->pv_id_table, pv_id);
 
 out:
 	list_del(&ent->list);
@@ -196,13 +193,12 @@
 	struct id_map_entry *ent, *found_ent;
 
 	spin_lock(&sriov->id_map_lock);
-	ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_cm_id);
+	ent = xa_erase(&sriov->pv_id_table, pv_cm_id);
 	if (!ent)
 		goto out;
 	found_ent = id_map_find_by_sl_id(ibdev, ent->slave_id, ent->sl_cm_id);
 	if (found_ent && found_ent == ent)
 		rb_erase(&found_ent->node, sl_id_map);
-	idr_remove(&sriov->pv_id_table, pv_cm_id);
 out:
 	spin_unlock(&sriov->id_map_lock);
 }
@@ -256,25 +252,19 @@
 	ent->dev = to_mdev(ibdev);
 	INIT_DELAYED_WORK(&ent->timeout, id_map_ent_timeout);
 
-	idr_preload(GFP_KERNEL);
-	spin_lock(&to_mdev(ibdev)->sriov.id_map_lock);
-
-	ret = idr_alloc_cyclic(&sriov->pv_id_table, ent, 0, 0, GFP_NOWAIT);
+	ret = xa_alloc_cyclic(&sriov->pv_id_table, &ent->pv_cm_id, ent,
+			xa_limit_32b, &sriov->pv_id_next, GFP_KERNEL);
 	if (ret >= 0) {
-		ent->pv_cm_id = (u32)ret;
+		spin_lock(&sriov->id_map_lock);
 		sl_id_map_add(ibdev, ent);
 		list_add_tail(&ent->list, &sriov->cm_list);
-	}
-
-	spin_unlock(&sriov->id_map_lock);
-	idr_preload_end();
-
-	if (ret >= 0)
+		spin_unlock(&sriov->id_map_lock);
 		return ent;
+	}
 
 	/*error flow*/
 	kfree(ent);
-	mlx4_ib_warn(ibdev, "No more space in the idr (err:0x%x)\n", ret);
+	mlx4_ib_warn(ibdev, "Allocation failed (err:0x%x)\n", ret);
 	return ERR_PTR(-ENOMEM);
 }
 
@@ -290,7 +280,7 @@
 		if (ent)
 			*pv_cm_id = (int) ent->pv_cm_id;
 	} else
-		ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, *pv_cm_id);
+		ent = xa_load(&sriov->pv_id_table, *pv_cm_id);
 	spin_unlock(&sriov->id_map_lock);
 
 	return ent;
@@ -407,7 +397,7 @@
 	spin_lock_init(&dev->sriov.id_map_lock);
 	INIT_LIST_HEAD(&dev->sriov.cm_list);
 	dev->sriov.sl_id_map = RB_ROOT;
-	idr_init(&dev->sriov.pv_id_table);
+	xa_init_flags(&dev->sriov.pv_id_table, XA_FLAGS_ALLOC);
 }
 
 /* slave = -1 ==> all slaves */
@@ -444,7 +434,7 @@
 					 struct id_map_entry, node);
 
 			rb_erase(&ent->node, sl_id_map);
-			idr_remove(&sriov->pv_id_table, (int) ent->pv_cm_id);
+			xa_erase(&sriov->pv_id_table, ent->pv_cm_id);
 		}
 		list_splice_init(&dev->sriov.cm_list, &lh);
 	} else {
@@ -460,7 +450,7 @@
 		/* remove those nodes from databases */
 		list_for_each_entry_safe(map, tmp_map, &lh, list) {
 			rb_erase(&map->node, sl_id_map);
-			idr_remove(&sriov->pv_id_table, (int) map->pv_cm_id);
+			xa_erase(&sriov->pv_id_table, map->pv_cm_id);
 		}
 
 		/* add remaining nodes from cm_list */
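
The pv_cm_id table switches from an IDR guarded by id_map_lock to an XArray: xa_alloc_cyclic() picks the next free 32-bit id under the XArray's internal lock, and xa_load()/xa_erase() replace idr_find()/idr_remove(). A self-contained sketch of the allocation pattern, with struct demo_entry, demo_table and demo_next all invented:

#include <linux/slab.h>
#include <linux/xarray.h>

struct demo_entry {
	u32 id;
	/* ... payload ... */
};

static DEFINE_XARRAY_ALLOC(demo_table);	/* XA_FLAGS_ALLOC is set for us */
static u32 demo_next;

/* Insert an entry, letting the XArray pick the next free 32-bit id. */
static struct demo_entry *demo_add(gfp_t gfp)
{
	struct demo_entry *ent = kzalloc(sizeof(*ent), gfp);
	int ret;

	if (!ent)
		return NULL;

	ret = xa_alloc_cyclic(&demo_table, &ent->id, ent, xa_limit_32b,
			      &demo_next, gfp);
	if (ret < 0) {
		kfree(ent);
		return NULL;
	}
	return ent;
}

/* Lookup and removal need no external spinlock for the XArray itself. */
static struct demo_entry *demo_find(u32 id)
{
	return xa_load(&demo_table, id);
}

static void demo_del(u32 id)
{
	kfree(xa_erase(&demo_table, id));
}
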
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 82adc0d..a7d238d 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -38,6 +38,7 @@
 
 #include "mlx4_ib.h"
 #include <rdma/mlx4-abi.h>
+#include <rdma/uverbs_ioctl.h>
 
 static void mlx4_ib_cq_comp(struct mlx4_cq *cq)
 {
@@ -134,16 +135,16 @@
 	mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf);
 }
 
-static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context,
-			       struct mlx4_ib_cq_buf *buf, struct ib_umem **umem,
-			       u64 buf_addr, int cqe)
+static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_udata *udata,
+			       struct mlx4_ib_cq_buf *buf,
+			       struct ib_umem **umem, u64 buf_addr, int cqe)
 {
 	int err;
 	int cqe_size = dev->dev->caps.cqe_size;
 	int shift;
 	int n;
 
-	*umem = ib_umem_get(context, buf_addr, cqe * cqe_size,
+	*umem = ib_umem_get(udata, buf_addr, cqe * cqe_size,
 			    IB_ACCESS_LOCAL_WRITE, 1);
 	if (IS_ERR(*umem))
 		return PTR_ERR(*umem);
@@ -171,27 +172,25 @@
 }
 
 #define CQ_CREATE_FLAGS_SUPPORTED IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION
-struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
-				const struct ib_cq_init_attr *attr,
-				struct ib_ucontext *context,
-				struct ib_udata *udata)
+int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		      struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	int entries = attr->cqe;
 	int vector = attr->comp_vector;
 	struct mlx4_ib_dev *dev = to_mdev(ibdev);
-	struct mlx4_ib_cq *cq;
+	struct mlx4_ib_cq *cq = to_mcq(ibcq);
 	struct mlx4_uar *uar;
+	void *buf_addr;
 	int err;
+	struct mlx4_ib_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mlx4_ib_ucontext, ibucontext);
 
 	if (entries < 1 || entries > dev->dev->caps.max_cqes)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (attr->flags & ~CQ_CREATE_FLAGS_SUPPORTED)
-		return ERR_PTR(-EINVAL);
-
-	cq = kmalloc(sizeof *cq, GFP_KERNEL);
-	if (!cq)
-		return ERR_PTR(-ENOMEM);
+		return -EINVAL;
 
 	entries      = roundup_pow_of_two(entries + 1);
 	cq->ibcq.cqe = entries - 1;
@@ -203,7 +202,7 @@
 	INIT_LIST_HEAD(&cq->send_qp_list);
 	INIT_LIST_HEAD(&cq->recv_qp_list);
 
-	if (context) {
+	if (udata) {
 		struct mlx4_ib_create_cq ucmd;
 
 		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
@@ -211,17 +210,17 @@
 			goto err_cq;
 		}
 
-		err = mlx4_ib_get_cq_umem(dev, context, &cq->buf, &cq->umem,
+		buf_addr = (void *)(unsigned long)ucmd.buf_addr;
+		err = mlx4_ib_get_cq_umem(dev, udata, &cq->buf, &cq->umem,
 					  ucmd.buf_addr, entries);
 		if (err)
 			goto err_cq;
 
-		err = mlx4_ib_db_map_user(to_mucontext(context), ucmd.db_addr,
-					  &cq->db);
+		err = mlx4_ib_db_map_user(udata, ucmd.db_addr, &cq->db);
 		if (err)
 			goto err_mtt;
 
-		uar = &to_mucontext(context)->uar;
+		uar = &context->uar;
 		cq->mcq.usage = MLX4_RES_USAGE_USER_VERBS;
 	} else {
 		err = mlx4_db_alloc(dev->dev, &cq->db, 1);
@@ -237,6 +236,8 @@
 		if (err)
 			goto err_db;
 
+		buf_addr = &cq->buf.buf;
+
 		uar = &dev->priv_uar;
 		cq->mcq.usage = MLX4_RES_USAGE_DRIVER;
 	}
@@ -244,49 +245,47 @@
 	if (dev->eq_table)
 		vector = dev->eq_table[vector % ibdev->num_comp_vectors];
 
-	err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar,
-			    cq->db.dma, &cq->mcq, vector, 0,
-			    !!(cq->create_flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION));
+	err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, cq->db.dma,
+			    &cq->mcq, vector, 0,
+			    !!(cq->create_flags &
+			       IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION),
+			    buf_addr, !!udata);
 	if (err)
 		goto err_dbmap;
 
-	if (context)
+	if (udata)
 		cq->mcq.tasklet_ctx.comp = mlx4_ib_cq_comp;
 	else
 		cq->mcq.comp = mlx4_ib_cq_comp;
 	cq->mcq.event = mlx4_ib_cq_event;
 
-	if (context)
+	if (udata)
 		if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) {
 			err = -EFAULT;
 			goto err_cq_free;
 		}
 
-	return &cq->ibcq;
+	return 0;
 
 err_cq_free:
 	mlx4_cq_free(dev->dev, &cq->mcq);
 
 err_dbmap:
-	if (context)
-		mlx4_ib_db_unmap_user(to_mucontext(context), &cq->db);
+	if (udata)
+		mlx4_ib_db_unmap_user(context, &cq->db);
 
 err_mtt:
 	mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt);
 
-	if (context)
-		ib_umem_release(cq->umem);
-	else
+	ib_umem_release(cq->umem);
+	if (!udata)
 		mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
 
 err_db:
-	if (!context)
+	if (!udata)
 		mlx4_db_free(dev->dev, &cq->db);
-
 err_cq:
-	kfree(cq);
-
-	return ERR_PTR(err);
+	return err;
 }
 
 static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq,
@@ -329,7 +328,7 @@
 	if (!cq->resize_buf)
 		return -ENOMEM;
 
-	err = mlx4_ib_get_cq_umem(dev, cq->umem->context, &cq->resize_buf->buf,
+	err = mlx4_ib_get_cq_umem(dev, udata, &cq->resize_buf->buf,
 				  &cq->resize_umem, ucmd.buf_addr, entries);
 	if (err) {
 		kfree(cq->resize_buf);
@@ -468,18 +467,15 @@
 	kfree(cq->resize_buf);
 	cq->resize_buf = NULL;
 
-	if (cq->resize_umem) {
-		ib_umem_release(cq->resize_umem);
-		cq->resize_umem = NULL;
-	}
-
+	ib_umem_release(cq->resize_umem);
+	cq->resize_umem = NULL;
 out:
 	mutex_unlock(&cq->resize_mutex);
 
 	return err;
 }
 
-int mlx4_ib_destroy_cq(struct ib_cq *cq)
+void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
 	struct mlx4_ib_dev *dev = to_mdev(cq->device);
 	struct mlx4_ib_cq *mcq = to_mcq(cq);
@@ -487,17 +483,18 @@
 	mlx4_cq_free(dev->dev, &mcq->mcq);
 	mlx4_mtt_cleanup(dev->dev, &mcq->buf.mtt);
 
-	if (cq->uobject) {
-		mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db);
-		ib_umem_release(mcq->umem);
+	if (udata) {
+		mlx4_ib_db_unmap_user(
+			rdma_udata_to_drv_context(
+				udata,
+				struct mlx4_ib_ucontext,
+				ibucontext),
+			&mcq->db);
 	} else {
 		mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe);
 		mlx4_db_free(dev->dev, &mcq->db);
 	}
-
-	kfree(mcq);
-
-	return 0;
+	ib_umem_release(mcq->umem);
 }
 
 static void dump_cqe(void *cqe)
diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c
index c517409..0f39035 100644
--- a/drivers/infiniband/hw/mlx4/doorbell.c
+++ b/drivers/infiniband/hw/mlx4/doorbell.c
@@ -31,6 +31,7 @@
  */
 
 #include <linux/slab.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "mlx4_ib.h"
 
@@ -41,11 +42,13 @@
 	int			refcnt;
 };
 
-int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+int mlx4_ib_db_map_user(struct ib_udata *udata, unsigned long virt,
 			struct mlx4_db *db)
 {
 	struct mlx4_ib_user_db_page *page;
 	int err = 0;
+	struct mlx4_ib_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mlx4_ib_ucontext, ibucontext);
 
 	mutex_lock(&context->db_page_mutex);
 
@@ -61,8 +64,7 @@
 
 	page->user_virt = (virt & PAGE_MASK);
 	page->refcnt    = 0;
-	page->umem      = ib_umem_get(&context->ibucontext, virt & PAGE_MASK,
-				      PAGE_SIZE, 0, 0);
+	page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0, 0);
 	if (IS_ERR(page->umem)) {
 		err = PTR_ERR(page->umem);
 		kfree(page);
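
With the ib_ucontext argument removed from the verb signatures, a driver recovers its private context from the udata the core passes down; rdma_udata_to_drv_context() returns NULL for kernel callers, and ib_umem_get() now takes the udata directly instead of an ib_ucontext. A hedged sketch, with demo_ucontext and demo_pin() invented:

#include <linux/err.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>
#include <rdma/uverbs_ioctl.h>

/* Sketch of a driver-private ucontext wrapping the core object. */
struct demo_ucontext {
	struct ib_ucontext ibucontext;
	/* ... driver state ... */
};

static struct ib_umem *demo_pin(struct ib_udata *udata, u64 addr, size_t len)
{
	/* NULL udata means a kernel caller; only dereference it for user verbs. */
	struct demo_ucontext *uctx = rdma_udata_to_drv_context(
		udata, struct demo_ucontext, ibucontext);

	if (!uctx)
		return ERR_PTR(-EINVAL);

	/* In v5.4 ib_umem_get() derives the owning context from udata itself. */
	return ib_umem_get(udata, addr, len, IB_ACCESS_LOCAL_WRITE, 0);
}
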
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index e5466d7..5707911 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -202,13 +202,13 @@
 	rdma_ah_set_port_num(&ah_attr, port_num);
 
 	new_ah = rdma_create_ah(dev->send_agent[port_num - 1][0]->qp->pd,
-				&ah_attr);
+				&ah_attr, 0);
 	if (IS_ERR(new_ah))
 		return;
 
 	spin_lock_irqsave(&dev->sm_lock, flags);
 	if (dev->sm_ah[port_num - 1])
-		rdma_destroy_ah(dev->sm_ah[port_num - 1]);
+		rdma_destroy_ah(dev->sm_ah[port_num - 1], 0);
 	dev->sm_ah[port_num - 1] = new_ah;
 	spin_unlock_irqrestore(&dev->sm_lock, flags);
 }
@@ -567,7 +567,7 @@
 			return -EINVAL;
 		rdma_ah_set_grh(&attr, &dgid, 0, 0, 0, 0);
 	}
-	ah = rdma_create_ah(tun_ctx->pd, &attr);
+	ah = rdma_create_ah(tun_ctx->pd, &attr, 0);
 	if (IS_ERR(ah))
 		return -ENOMEM;
 
@@ -584,7 +584,7 @@
 
 	tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr);
 	if (tun_qp->tx_ring[tun_tx_ix].ah)
-		rdma_destroy_ah(tun_qp->tx_ring[tun_tx_ix].ah);
+		rdma_destroy_ah(tun_qp->tx_ring[tun_tx_ix].ah, 0);
 	tun_qp->tx_ring[tun_tx_ix].ah = ah;
 	ib_dma_sync_single_for_cpu(&dev->ib_dev,
 				   tun_qp->tx_ring[tun_tx_ix].buf.map,
@@ -657,7 +657,7 @@
 	spin_unlock(&tun_qp->tx_lock);
 	tun_qp->tx_ring[tun_tx_ix].ah = NULL;
 end:
-	rdma_destroy_ah(ah);
+	rdma_destroy_ah(ah, 0);
 	return ret;
 }
 
@@ -807,15 +807,17 @@
 	int err;
 	struct ib_port_attr pattr;
 
-	if (in_wc && in_wc->qp->qp_num) {
-		pr_debug("received MAD: slid:%d sqpn:%d "
-			"dlid_bits:%d dqpn:%d wc_flags:0x%x, cls %x, mtd %x, atr %x\n",
-			in_wc->slid, in_wc->src_qp,
-			in_wc->dlid_path_bits,
-			in_wc->qp->qp_num,
-			in_wc->wc_flags,
-			in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method,
-			be16_to_cpu(in_mad->mad_hdr.attr_id));
+	if (in_wc && in_wc->qp) {
+		pr_debug("received MAD: port:%d slid:%d sqpn:%d "
+			 "dlid_bits:%d dqpn:%d wc_flags:0x%x tid:%016llx cls:%x mtd:%x atr:%x\n",
+			 port_num,
+			 in_wc->slid, in_wc->src_qp,
+			 in_wc->dlid_path_bits,
+			 in_wc->qp->qp_num,
+			 in_wc->wc_flags,
+			 be64_to_cpu(in_mad->mad_hdr.tid),
+			 in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method,
+			 be16_to_cpu(in_mad->mad_hdr.attr_id));
 		if (in_wc->wc_flags & IB_WC_GRH) {
 			pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n",
 				 be64_to_cpu(in_grh->sgid.global.subnet_prefix),
@@ -1022,7 +1024,7 @@
 			 struct ib_mad_send_wc *mad_send_wc)
 {
 	if (mad_send_wc->send_buf->context[0])
-		rdma_destroy_ah(mad_send_wc->send_buf->context[0]);
+		rdma_destroy_ah(mad_send_wc->send_buf->context[0], 0);
 	ib_free_send_mad(mad_send_wc->send_buf);
 }
 
@@ -1077,7 +1079,7 @@
 		}
 
 		if (dev->sm_ah[p])
-			rdma_destroy_ah(dev->sm_ah[p]);
+			rdma_destroy_ah(dev->sm_ah[p], 0);
 	}
 }
 
@@ -1369,9 +1371,9 @@
 	struct ib_ah *ah;
 	struct ib_qp *send_qp = NULL;
 	unsigned wire_tx_ix = 0;
-	int ret = 0;
 	u16 wire_pkey_ix;
 	int src_qpnum;
+	int ret;
 
 	sqp_ctx = dev->sriov.sqps[port-1];
 
@@ -1391,12 +1393,20 @@
 
 	send_qp = sqp->qp;
 
-	/* create ah */
-	ah = mlx4_ib_create_ah_slave(sqp_ctx->pd, attr,
-				     rdma_ah_retrieve_grh(attr)->sgid_index,
-				     s_mac, vlan_id);
-	if (IS_ERR(ah))
+	ah = rdma_zalloc_drv_obj(sqp_ctx->pd->device, ib_ah);
+	if (!ah)
 		return -ENOMEM;
+
+	ah->device = sqp_ctx->pd->device;
+	ah->pd = sqp_ctx->pd;
+
+	/* create ah */
+	ret = mlx4_ib_create_ah_slave(ah, attr,
+				      rdma_ah_retrieve_grh(attr)->sgid_index,
+				      s_mac, vlan_id);
+	if (ret)
+		goto out;
+
 	spin_lock(&sqp->tx_lock);
 	if (sqp->tx_ix_head - sqp->tx_ix_tail >=
 	    (MLX4_NUM_TUNNEL_BUFS - 1))
@@ -1408,8 +1418,7 @@
 		goto out;
 
 	sqp_mad = (struct mlx4_mad_snd_buf *) (sqp->tx_ring[wire_tx_ix].buf.addr);
-	if (sqp->tx_ring[wire_tx_ix].ah)
-		rdma_destroy_ah(sqp->tx_ring[wire_tx_ix].ah);
+	kfree(sqp->tx_ring[wire_tx_ix].ah);
 	sqp->tx_ring[wire_tx_ix].ah = ah;
 	ib_dma_sync_single_for_cpu(&dev->ib_dev,
 				   sqp->tx_ring[wire_tx_ix].buf.map,
@@ -1448,7 +1457,7 @@
 	spin_unlock(&sqp->tx_lock);
 	sqp->tx_ring[wire_tx_ix].ah = NULL;
 out:
-	mlx4_ib_destroy_ah(ah);
+	kfree(ah);
 	return ret;
 }
 
@@ -1668,8 +1677,6 @@
 				    tx_buf_size, DMA_TO_DEVICE);
 		kfree(tun_qp->tx_ring[i].buf.addr);
 	}
-	kfree(tun_qp->tx_ring);
-	tun_qp->tx_ring = NULL;
 	i = MLX4_NUM_TUNNEL_BUFS;
 err:
 	while (i > 0) {
@@ -1678,6 +1685,8 @@
 				    rx_buf_size, DMA_FROM_DEVICE);
 		kfree(tun_qp->ring[i].addr);
 	}
+	kfree(tun_qp->tx_ring);
+	tun_qp->tx_ring = NULL;
 	kfree(tun_qp->ring);
 	tun_qp->ring = NULL;
 	return -ENOMEM;
@@ -1714,7 +1723,7 @@
 				    tx_buf_size, DMA_TO_DEVICE);
 		kfree(tun_qp->tx_ring[i].buf.addr);
 		if (tun_qp->tx_ring[i].ah)
-			rdma_destroy_ah(tun_qp->tx_ring[i].ah);
+			rdma_destroy_ah(tun_qp->tx_ring[i].ah, 0);
 	}
 	kfree(tun_qp->tx_ring);
 	kfree(tun_qp->ring);
@@ -1747,7 +1756,7 @@
 					 "wrid=0x%llx, status=0x%x\n",
 					 wc.wr_id, wc.status);
 				rdma_destroy_ah(tun_qp->tx_ring[wc.wr_id &
-					      (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+					      (MLX4_NUM_TUNNEL_BUFS - 1)].ah, 0);
 				tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
 					= NULL;
 				spin_lock(&tun_qp->tx_lock);
@@ -1764,7 +1773,7 @@
 				 ctx->slave, wc.status, wc.wr_id);
 			if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
 				rdma_destroy_ah(tun_qp->tx_ring[wc.wr_id &
-					      (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+					      (MLX4_NUM_TUNNEL_BUFS - 1)].ah, 0);
 				tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
 					= NULL;
 				spin_lock(&tun_qp->tx_lock);
@@ -1900,8 +1909,8 @@
 		if (wc.status == IB_WC_SUCCESS) {
 			switch (wc.opcode) {
 			case IB_WC_SEND:
-				rdma_destroy_ah(sqp->tx_ring[wc.wr_id &
-					      (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+				kfree(sqp->tx_ring[wc.wr_id &
+				      (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
 				sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
 					= NULL;
 				spin_lock(&sqp->tx_lock);
@@ -1929,8 +1938,8 @@
 				 " status = %d, wrid = 0x%llx\n",
 				 ctx->slave, wc.status, wc.wr_id);
 			if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
-				rdma_destroy_ah(sqp->tx_ring[wc.wr_id &
-					      (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+				kfree(sqp->tx_ring[wc.wr_id &
+				      (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
 				sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
 					= NULL;
 				spin_lock(&sqp->tx_lock);
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 0bbeaaa..8d2f1e3 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -734,7 +734,8 @@
 
 static u8 state_to_phys_state(enum ib_port_state state)
 {
-	return state == IB_PORT_ACTIVE ? 5 : 3;
+	return state == IB_PORT_ACTIVE ?
+		IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
 }
 
 static int eth_link_query_port(struct ib_device *ibdev, u8 port,
@@ -1076,19 +1077,21 @@
 	return err;
 }
 
-static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
-						  struct ib_udata *udata)
+static int mlx4_ib_alloc_ucontext(struct ib_ucontext *uctx,
+				  struct ib_udata *udata)
 {
+	struct ib_device *ibdev = uctx->device;
 	struct mlx4_ib_dev *dev = to_mdev(ibdev);
-	struct mlx4_ib_ucontext *context;
+	struct mlx4_ib_ucontext *context = to_mucontext(uctx);
 	struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3;
 	struct mlx4_ib_alloc_ucontext_resp resp;
 	int err;
 
 	if (!dev->ib_active)
-		return ERR_PTR(-EAGAIN);
+		return -EAGAIN;
 
-	if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) {
+	if (ibdev->ops.uverbs_abi_ver ==
+	    MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) {
 		resp_v3.qp_tab_size      = dev->dev->caps.num_qps;
 		resp_v3.bf_reg_size      = dev->dev->caps.bf_reg_size;
 		resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
@@ -1100,15 +1103,9 @@
 		resp.cqe_size	      = dev->dev->caps.cqe_size;
 	}
 
-	context = kzalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return ERR_PTR(-ENOMEM);
-
 	err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar);
-	if (err) {
-		kfree(context);
-		return ERR_PTR(err);
-	}
+	if (err)
+		return err;
 
 	INIT_LIST_HEAD(&context->db_page_list);
 	mutex_init(&context->db_page_mutex);
@@ -1116,206 +1113,95 @@
 	INIT_LIST_HEAD(&context->wqn_ranges_list);
 	mutex_init(&context->wqn_ranges_mutex);
 
-	if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION)
+	if (ibdev->ops.uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION)
 		err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3));
 	else
 		err = ib_copy_to_udata(udata, &resp, sizeof(resp));
 
 	if (err) {
 		mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar);
-		kfree(context);
-		return ERR_PTR(-EFAULT);
+		return -EFAULT;
 	}
 
-	return &context->ibucontext;
+	return err;
 }
 
-static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+static void mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 {
 	struct mlx4_ib_ucontext *context = to_mucontext(ibcontext);
 
 	mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar);
-	kfree(context);
-
-	return 0;
 }
 
-static void  mlx4_ib_vma_open(struct vm_area_struct *area)
-{
-	/* vma_open is called when a new VMA is created on top of our VMA.
-	 * This is done through either mremap flow or split_vma (usually due
-	 * to mlock, madvise, munmap, etc.). We do not support a clone of the
-	 * vma, as this VMA is strongly hardware related. Therefore we set the
-	 * vm_ops of the newly created/cloned VMA to NULL, to prevent it from
-	 * calling us again and trying to do incorrect actions. We assume that
-	 * the original vma size is exactly a single page that there will be no
-	 * "splitting" operations on.
-	 */
-	area->vm_ops = NULL;
-}
-
-static void  mlx4_ib_vma_close(struct vm_area_struct *area)
-{
-	struct mlx4_ib_vma_private_data *mlx4_ib_vma_priv_data;
-
-	/* It's guaranteed that all VMAs opened on a FD are closed before the
-	 * file itself is closed, therefore no sync is needed with the regular
-	 * closing flow. (e.g. mlx4_ib_dealloc_ucontext) However need a sync
-	 * with accessing the vma as part of mlx4_ib_disassociate_ucontext.
-	 * The close operation is usually called under mm->mmap_sem except when
-	 * process is exiting.  The exiting case is handled explicitly as part
-	 * of mlx4_ib_disassociate_ucontext.
-	 */
-	mlx4_ib_vma_priv_data = (struct mlx4_ib_vma_private_data *)
-				area->vm_private_data;
-
-	/* set the vma context pointer to null in the mlx4_ib driver's private
-	 * data to protect against a race condition in mlx4_ib_dissassociate_ucontext().
-	 */
-	mlx4_ib_vma_priv_data->vma = NULL;
-}
-
-static const struct vm_operations_struct mlx4_ib_vm_ops = {
-	.open = mlx4_ib_vma_open,
-	.close = mlx4_ib_vma_close
-};
-
 static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
 {
-	int i;
-	struct vm_area_struct *vma;
-	struct mlx4_ib_ucontext *context = to_mucontext(ibcontext);
-
-	/* need to protect from a race on closing the vma as part of
-	 * mlx4_ib_vma_close().
-	 */
-	for (i = 0; i < HW_BAR_COUNT; i++) {
-		vma = context->hw_bar_info[i].vma;
-		if (!vma)
-			continue;
-
-		zap_vma_ptes(context->hw_bar_info[i].vma,
-			     context->hw_bar_info[i].vma->vm_start, PAGE_SIZE);
-
-		context->hw_bar_info[i].vma->vm_flags &=
-			~(VM_SHARED | VM_MAYSHARE);
-		/* context going to be destroyed, should not access ops any more */
-		context->hw_bar_info[i].vma->vm_ops = NULL;
-	}
-}
-
-static void mlx4_ib_set_vma_data(struct vm_area_struct *vma,
-				 struct mlx4_ib_vma_private_data *vma_private_data)
-{
-	vma_private_data->vma = vma;
-	vma->vm_private_data = vma_private_data;
-	vma->vm_ops =  &mlx4_ib_vm_ops;
 }
 
 static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 {
 	struct mlx4_ib_dev *dev = to_mdev(context->device);
-	struct mlx4_ib_ucontext *mucontext = to_mucontext(context);
 
-	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
-		return -EINVAL;
+	switch (vma->vm_pgoff) {
+	case 0:
+		return rdma_user_mmap_io(context, vma,
+					 to_mucontext(context)->uar.pfn,
+					 PAGE_SIZE,
+					 pgprot_noncached(vma->vm_page_prot));
 
-	if (vma->vm_pgoff == 0) {
-		/* We prevent double mmaping on same context */
-		if (mucontext->hw_bar_info[HW_BAR_DB].vma)
+	case 1:
+		if (dev->dev->caps.bf_reg_size == 0)
 			return -EINVAL;
+		return rdma_user_mmap_io(
+			context, vma,
+			to_mucontext(context)->uar.pfn +
+				dev->dev->caps.num_uars,
+			PAGE_SIZE, pgprot_writecombine(vma->vm_page_prot));
 
-		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
-		if (io_remap_pfn_range(vma, vma->vm_start,
-				       to_mucontext(context)->uar.pfn,
-				       PAGE_SIZE, vma->vm_page_prot))
-			return -EAGAIN;
-
-		mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_DB]);
-
-	} else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) {
-		/* We prevent double mmaping on same context */
-		if (mucontext->hw_bar_info[HW_BAR_BF].vma)
-			return -EINVAL;
-
-		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
-
-		if (io_remap_pfn_range(vma, vma->vm_start,
-				       to_mucontext(context)->uar.pfn +
-				       dev->dev->caps.num_uars,
-				       PAGE_SIZE, vma->vm_page_prot))
-			return -EAGAIN;
-
-		mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_BF]);
-
-	} else if (vma->vm_pgoff == 3) {
+	case 3: {
 		struct mlx4_clock_params params;
 		int ret;
 
-		/* We prevent double mmaping on same context */
-		if (mucontext->hw_bar_info[HW_BAR_CLOCK].vma)
-			return -EINVAL;
-
 		ret = mlx4_get_internal_clock_params(dev->dev, &params);
-
 		if (ret)
 			return ret;
 
-		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-		if (io_remap_pfn_range(vma, vma->vm_start,
-				       (pci_resource_start(dev->dev->persist->pdev,
-							   params.bar) +
-					params.offset)
-				       >> PAGE_SHIFT,
-				       PAGE_SIZE, vma->vm_page_prot))
-			return -EAGAIN;
+		return rdma_user_mmap_io(
+			context, vma,
+			(pci_resource_start(dev->dev->persist->pdev,
+					    params.bar) +
+			 params.offset) >>
+				PAGE_SHIFT,
+			PAGE_SIZE, pgprot_noncached(vma->vm_page_prot));
+	}
 
-		mlx4_ib_set_vma_data(vma,
-				     &mucontext->hw_bar_info[HW_BAR_CLOCK]);
-	} else {
+	default:
 		return -EINVAL;
 	}
-
-	return 0;
 }
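
The mmap handler above drops the per-context VMA bookkeeping and vm_ops: rdma_user_mmap_io() performs the io_remap_pfn_range() and records the mapping so the core can zap it when the device is disassociated, which is why mlx4_ib_disassociate_ucontext() is now empty. A minimal sketch of the helper, with the pfn value a placeholder:

#include <linux/mm.h>
#include <rdma/ib_verbs.h>

/* Sketch: map one page of device registers through the RDMA core helper. */
static int demo_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
{
	unsigned long pfn = 0x12345;	/* placeholder BAR pfn, for illustration */

	if (vma->vm_pgoff != 0)
		return -EINVAL;

	/* The core tracks this mapping and zaps it on device disassociation. */
	return rdma_user_mmap_io(context, vma, pfn, PAGE_SIZE,
				 pgprot_noncached(vma->vm_page_prot));
}
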
 
-static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev,
-				      struct ib_ucontext *context,
-				      struct ib_udata *udata)
+static int mlx4_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
-	struct mlx4_ib_pd *pd;
+	struct mlx4_ib_pd *pd = to_mpd(ibpd);
+	struct ib_device *ibdev = ibpd->device;
 	int err;
 
-	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd)
-		return ERR_PTR(-ENOMEM);
-
 	err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn);
-	if (err) {
-		kfree(pd);
-		return ERR_PTR(err);
-	}
+	if (err)
+		return err;
 
-	if (context)
-		if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) {
-			mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn);
-			kfree(pd);
-			return ERR_PTR(-EFAULT);
-		}
-	return &pd->ibpd;
+	if (udata && ib_copy_to_udata(udata, &pd->pdn, sizeof(__u32))) {
+		mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn);
+		return -EFAULT;
+	}
+	return 0;
 }
 
-static int mlx4_ib_dealloc_pd(struct ib_pd *pd)
+static void mlx4_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
 	mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn);
-	kfree(pd);
-
-	return 0;
 }
 
 static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev,
-					  struct ib_ucontext *context,
 					  struct ib_udata *udata)
 {
 	struct mlx4_ib_xrcd *xrcd;
@@ -1357,7 +1243,7 @@
 	return ERR_PTR(err);
 }
 
-static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata)
 {
 	ib_destroy_cq(to_mxrcd(xrcd)->cq);
 	ib_dealloc_pd(to_mxrcd(xrcd)->pd);
@@ -2133,39 +2019,44 @@
 	return err;
 }
 
-static ssize_t show_hca(struct device *device, struct device_attribute *attr,
-			char *buf)
+static ssize_t hca_type_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
 {
 	struct mlx4_ib_dev *dev =
-		container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mlx4_ib_dev, ib_dev);
 	return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device);
 }
+static DEVICE_ATTR_RO(hca_type);
 
-static ssize_t show_rev(struct device *device, struct device_attribute *attr,
-			char *buf)
+static ssize_t hw_rev_show(struct device *device,
+			   struct device_attribute *attr, char *buf)
 {
 	struct mlx4_ib_dev *dev =
-		container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mlx4_ib_dev, ib_dev);
 	return sprintf(buf, "%x\n", dev->dev->rev_id);
 }
+static DEVICE_ATTR_RO(hw_rev);
 
-static ssize_t show_board(struct device *device, struct device_attribute *attr,
-			  char *buf)
+static ssize_t board_id_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
 {
 	struct mlx4_ib_dev *dev =
-		container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mlx4_ib_dev, ib_dev);
+
 	return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN,
 		       dev->dev->board_id);
 }
+static DEVICE_ATTR_RO(board_id);
 
-static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
+static struct attribute *mlx4_class_attributes[] = {
+	&dev_attr_hw_rev.attr,
+	&dev_attr_hca_type.attr,
+	&dev_attr_board_id.attr,
+	NULL
+};
 
-static struct device_attribute *mlx4_class_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_hca_type,
-	&dev_attr_board_id
+static const struct attribute_group mlx4_attr_group = {
+	.attrs = mlx4_class_attributes,
 };
 
 struct diag_counter {
@@ -2310,6 +2201,11 @@
 	}
 }
 
+static const struct ib_device_ops mlx4_ib_hw_stats_ops = {
+	.alloc_hw_stats = mlx4_ib_alloc_hw_stats,
+	.get_hw_stats = mlx4_ib_get_hw_stats,
+};
+
 static int mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev)
 {
 	struct mlx4_ib_diag_counters *diag = ibdev->diag_counters;
@@ -2336,8 +2232,7 @@
 					   diag[i].offset, i);
 	}
 
-	ibdev->ib_dev.get_hw_stats	= mlx4_ib_get_hw_stats;
-	ibdev->ib_dev.alloc_hw_stats	= mlx4_ib_alloc_hw_stats;
+	ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_hw_stats_ops);
 
 	return 0;
 
@@ -2442,6 +2337,32 @@
 		     event == NETDEV_UP || event == NETDEV_CHANGE))
 			update_qps_port = port;
 
+		if (dev == iboe->netdevs[port - 1] &&
+		    (event == NETDEV_UP || event == NETDEV_DOWN)) {
+			enum ib_port_state port_state;
+			struct ib_event ibev = { };
+
+			if (ib_get_cached_port_state(&ibdev->ib_dev, port,
+						     &port_state))
+				continue;
+
+			if (event == NETDEV_UP &&
+			    (port_state != IB_PORT_ACTIVE ||
+			     iboe->last_port_state[port - 1] != IB_PORT_DOWN))
+				continue;
+			if (event == NETDEV_DOWN &&
+			    (port_state != IB_PORT_DOWN ||
+			     iboe->last_port_state[port - 1] != IB_PORT_ACTIVE))
+				continue;
+			iboe->last_port_state[port - 1] = port_state;
+
+			ibev.device = &ibdev->ib_dev;
+			ibev.element.port_num = port;
+			ibev.event = event == NETDEV_UP ? IB_EVENT_PORT_ACTIVE :
+							  IB_EVENT_PORT_ERR;
+			ib_dispatch_event(&ibev);
+		}
+
 	}
 	spin_unlock_bh(&iboe->lock);
 
@@ -2589,6 +2510,98 @@
 		 (int) dev->dev->caps.fw_ver & 0xffff);
 }
 
+static const struct ib_device_ops mlx4_ib_dev_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_MLX4,
+	.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION,
+
+	.add_gid = mlx4_ib_add_gid,
+	.alloc_mr = mlx4_ib_alloc_mr,
+	.alloc_pd = mlx4_ib_alloc_pd,
+	.alloc_ucontext = mlx4_ib_alloc_ucontext,
+	.attach_mcast = mlx4_ib_mcg_attach,
+	.create_ah = mlx4_ib_create_ah,
+	.create_cq = mlx4_ib_create_cq,
+	.create_qp = mlx4_ib_create_qp,
+	.create_srq = mlx4_ib_create_srq,
+	.dealloc_pd = mlx4_ib_dealloc_pd,
+	.dealloc_ucontext = mlx4_ib_dealloc_ucontext,
+	.del_gid = mlx4_ib_del_gid,
+	.dereg_mr = mlx4_ib_dereg_mr,
+	.destroy_ah = mlx4_ib_destroy_ah,
+	.destroy_cq = mlx4_ib_destroy_cq,
+	.destroy_qp = mlx4_ib_destroy_qp,
+	.destroy_srq = mlx4_ib_destroy_srq,
+	.detach_mcast = mlx4_ib_mcg_detach,
+	.disassociate_ucontext = mlx4_ib_disassociate_ucontext,
+	.drain_rq = mlx4_ib_drain_rq,
+	.drain_sq = mlx4_ib_drain_sq,
+	.get_dev_fw_str = get_fw_ver_str,
+	.get_dma_mr = mlx4_ib_get_dma_mr,
+	.get_link_layer = mlx4_ib_port_link_layer,
+	.get_netdev = mlx4_ib_get_netdev,
+	.get_port_immutable = mlx4_port_immutable,
+	.map_mr_sg = mlx4_ib_map_mr_sg,
+	.mmap = mlx4_ib_mmap,
+	.modify_cq = mlx4_ib_modify_cq,
+	.modify_device = mlx4_ib_modify_device,
+	.modify_port = mlx4_ib_modify_port,
+	.modify_qp = mlx4_ib_modify_qp,
+	.modify_srq = mlx4_ib_modify_srq,
+	.poll_cq = mlx4_ib_poll_cq,
+	.post_recv = mlx4_ib_post_recv,
+	.post_send = mlx4_ib_post_send,
+	.post_srq_recv = mlx4_ib_post_srq_recv,
+	.process_mad = mlx4_ib_process_mad,
+	.query_ah = mlx4_ib_query_ah,
+	.query_device = mlx4_ib_query_device,
+	.query_gid = mlx4_ib_query_gid,
+	.query_pkey = mlx4_ib_query_pkey,
+	.query_port = mlx4_ib_query_port,
+	.query_qp = mlx4_ib_query_qp,
+	.query_srq = mlx4_ib_query_srq,
+	.reg_user_mr = mlx4_ib_reg_user_mr,
+	.req_notify_cq = mlx4_ib_arm_cq,
+	.rereg_user_mr = mlx4_ib_rereg_user_mr,
+	.resize_cq = mlx4_ib_resize_cq,
+
+	INIT_RDMA_OBJ_SIZE(ib_ah, mlx4_ib_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, mlx4_ib_cq, ibcq),
+	INIT_RDMA_OBJ_SIZE(ib_pd, mlx4_ib_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_srq, mlx4_ib_srq, ibsrq),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx4_ib_ucontext, ibucontext),
+};
+
+static const struct ib_device_ops mlx4_ib_dev_wq_ops = {
+	.create_rwq_ind_table = mlx4_ib_create_rwq_ind_table,
+	.create_wq = mlx4_ib_create_wq,
+	.destroy_rwq_ind_table = mlx4_ib_destroy_rwq_ind_table,
+	.destroy_wq = mlx4_ib_destroy_wq,
+	.modify_wq = mlx4_ib_modify_wq,
+};
+
+static const struct ib_device_ops mlx4_ib_dev_fmr_ops = {
+	.alloc_fmr = mlx4_ib_fmr_alloc,
+	.dealloc_fmr = mlx4_ib_fmr_dealloc,
+	.map_phys_fmr = mlx4_ib_map_phys_fmr,
+	.unmap_fmr = mlx4_ib_unmap_fmr,
+};
+
+static const struct ib_device_ops mlx4_ib_dev_mw_ops = {
+	.alloc_mw = mlx4_ib_alloc_mw,
+	.dealloc_mw = mlx4_ib_dealloc_mw,
+};
+
+static const struct ib_device_ops mlx4_ib_dev_xrc_ops = {
+	.alloc_xrcd = mlx4_ib_alloc_xrcd,
+	.dealloc_xrcd = mlx4_ib_dealloc_xrcd,
+};
+
+static const struct ib_device_ops mlx4_ib_dev_fs_ops = {
+	.create_flow = mlx4_ib_create_flow,
+	.destroy_flow = mlx4_ib_destroy_flow,
+};
+
 static void *mlx4_ib_add(struct mlx4_dev *dev)
 {
 	struct mlx4_ib_dev *ibdev;
@@ -2612,7 +2625,7 @@
 	if (num_ports == 0)
 		return NULL;
 
-	ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev);
+	ibdev = ib_alloc_device(mlx4_ib_dev, ib_dev);
 	if (!ibdev) {
 		dev_err(&dev->persist->pdev->dev,
 			"Device struct alloc failed\n");
@@ -2636,8 +2649,6 @@
 	ibdev->dev = dev;
 	ibdev->bond_next_port	= 0;
 
-	strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX);
-	ibdev->ib_dev.owner		= THIS_MODULE;
 	ibdev->ib_dev.node_type		= RDMA_NODE_IB_CA;
 	ibdev->ib_dev.local_dma_lkey	= dev->caps.reserved_lkey;
 	ibdev->num_ports		= num_ports;
@@ -2645,14 +2656,6 @@
 						1 : ibdev->num_ports;
 	ibdev->ib_dev.num_comp_vectors	= dev->caps.num_comp_vectors;
 	ibdev->ib_dev.dev.parent	= &dev->persist->pdev->dev;
-	ibdev->ib_dev.get_netdev	= mlx4_ib_get_netdev;
-	ibdev->ib_dev.add_gid		= mlx4_ib_add_gid;
-	ibdev->ib_dev.del_gid		= mlx4_ib_del_gid;
-
-	if (dev->caps.userspace_caps)
-		ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION;
-	else
-		ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION;
 
 	ibdev->ib_dev.uverbs_cmd_mask	=
 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
@@ -2680,115 +2683,56 @@
 		(1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)		|
 		(1ull << IB_USER_VERBS_CMD_OPEN_QP);
 
-	ibdev->ib_dev.query_device	= mlx4_ib_query_device;
-	ibdev->ib_dev.query_port	= mlx4_ib_query_port;
-	ibdev->ib_dev.get_link_layer	= mlx4_ib_port_link_layer;
-	ibdev->ib_dev.query_gid		= mlx4_ib_query_gid;
-	ibdev->ib_dev.query_pkey	= mlx4_ib_query_pkey;
-	ibdev->ib_dev.modify_device	= mlx4_ib_modify_device;
-	ibdev->ib_dev.modify_port	= mlx4_ib_modify_port;
-	ibdev->ib_dev.alloc_ucontext	= mlx4_ib_alloc_ucontext;
-	ibdev->ib_dev.dealloc_ucontext	= mlx4_ib_dealloc_ucontext;
-	ibdev->ib_dev.mmap		= mlx4_ib_mmap;
-	ibdev->ib_dev.alloc_pd		= mlx4_ib_alloc_pd;
-	ibdev->ib_dev.dealloc_pd	= mlx4_ib_dealloc_pd;
-	ibdev->ib_dev.create_ah		= mlx4_ib_create_ah;
-	ibdev->ib_dev.query_ah		= mlx4_ib_query_ah;
-	ibdev->ib_dev.destroy_ah	= mlx4_ib_destroy_ah;
-	ibdev->ib_dev.create_srq	= mlx4_ib_create_srq;
-	ibdev->ib_dev.modify_srq	= mlx4_ib_modify_srq;
-	ibdev->ib_dev.query_srq		= mlx4_ib_query_srq;
-	ibdev->ib_dev.destroy_srq	= mlx4_ib_destroy_srq;
-	ibdev->ib_dev.post_srq_recv	= mlx4_ib_post_srq_recv;
-	ibdev->ib_dev.create_qp		= mlx4_ib_create_qp;
-	ibdev->ib_dev.modify_qp		= mlx4_ib_modify_qp;
-	ibdev->ib_dev.query_qp		= mlx4_ib_query_qp;
-	ibdev->ib_dev.destroy_qp	= mlx4_ib_destroy_qp;
-	ibdev->ib_dev.drain_sq		= mlx4_ib_drain_sq;
-	ibdev->ib_dev.drain_rq		= mlx4_ib_drain_rq;
-	ibdev->ib_dev.post_send		= mlx4_ib_post_send;
-	ibdev->ib_dev.post_recv		= mlx4_ib_post_recv;
-	ibdev->ib_dev.create_cq		= mlx4_ib_create_cq;
-	ibdev->ib_dev.modify_cq		= mlx4_ib_modify_cq;
-	ibdev->ib_dev.resize_cq		= mlx4_ib_resize_cq;
-	ibdev->ib_dev.destroy_cq	= mlx4_ib_destroy_cq;
-	ibdev->ib_dev.poll_cq		= mlx4_ib_poll_cq;
-	ibdev->ib_dev.req_notify_cq	= mlx4_ib_arm_cq;
-	ibdev->ib_dev.get_dma_mr	= mlx4_ib_get_dma_mr;
-	ibdev->ib_dev.reg_user_mr	= mlx4_ib_reg_user_mr;
-	ibdev->ib_dev.rereg_user_mr	= mlx4_ib_rereg_user_mr;
-	ibdev->ib_dev.dereg_mr		= mlx4_ib_dereg_mr;
-	ibdev->ib_dev.alloc_mr		= mlx4_ib_alloc_mr;
-	ibdev->ib_dev.map_mr_sg		= mlx4_ib_map_mr_sg;
-	ibdev->ib_dev.attach_mcast	= mlx4_ib_mcg_attach;
-	ibdev->ib_dev.detach_mcast	= mlx4_ib_mcg_detach;
-	ibdev->ib_dev.process_mad	= mlx4_ib_process_mad;
-	ibdev->ib_dev.get_port_immutable = mlx4_port_immutable;
-	ibdev->ib_dev.get_dev_fw_str    = get_fw_ver_str;
-	ibdev->ib_dev.disassociate_ucontext = mlx4_ib_disassociate_ucontext;
-
+	ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_ops);
 	ibdev->ib_dev.uverbs_ex_cmd_mask |=
-		(1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ);
+		(1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ) |
+		(1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) |
+		(1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ) |
+		(1ull << IB_USER_VERBS_EX_CMD_CREATE_QP);
 
 	if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) &&
 	    ((mlx4_ib_port_link_layer(&ibdev->ib_dev, 1) ==
 	    IB_LINK_LAYER_ETHERNET) ||
 	    (mlx4_ib_port_link_layer(&ibdev->ib_dev, 2) ==
 	    IB_LINK_LAYER_ETHERNET))) {
-		ibdev->ib_dev.create_wq		= mlx4_ib_create_wq;
-		ibdev->ib_dev.modify_wq		= mlx4_ib_modify_wq;
-		ibdev->ib_dev.destroy_wq	= mlx4_ib_destroy_wq;
-		ibdev->ib_dev.create_rwq_ind_table  =
-			mlx4_ib_create_rwq_ind_table;
-		ibdev->ib_dev.destroy_rwq_ind_table =
-			mlx4_ib_destroy_rwq_ind_table;
 		ibdev->ib_dev.uverbs_ex_cmd_mask |=
 			(1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ)	  |
 			(1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ)	  |
 			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ)	  |
 			(1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
 			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
+		ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_wq_ops);
 	}
 
-	if (!mlx4_is_slave(ibdev->dev)) {
-		ibdev->ib_dev.alloc_fmr		= mlx4_ib_fmr_alloc;
-		ibdev->ib_dev.map_phys_fmr	= mlx4_ib_map_phys_fmr;
-		ibdev->ib_dev.unmap_fmr		= mlx4_ib_unmap_fmr;
-		ibdev->ib_dev.dealloc_fmr	= mlx4_ib_fmr_dealloc;
-	}
+	if (!mlx4_is_slave(ibdev->dev))
+		ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_fmr_ops);
 
 	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW ||
 	    dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) {
-		ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw;
-		ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw;
-
 		ibdev->ib_dev.uverbs_cmd_mask |=
 			(1ull << IB_USER_VERBS_CMD_ALLOC_MW) |
 			(1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
+		ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_mw_ops);
 	}
 
 	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) {
-		ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd;
-		ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd;
 		ibdev->ib_dev.uverbs_cmd_mask |=
 			(1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
 			(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
+		ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_xrc_ops);
 	}
 
 	if (check_flow_steering_support(dev)) {
 		ibdev->steering_support = MLX4_STEERING_MODE_DEVICE_MANAGED;
-		ibdev->ib_dev.create_flow	= mlx4_ib_create_flow;
-		ibdev->ib_dev.destroy_flow	= mlx4_ib_destroy_flow;
-
 		ibdev->ib_dev.uverbs_ex_cmd_mask	|=
 			(1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
 			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
+		ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_fs_ops);
 	}
 
-	ibdev->ib_dev.uverbs_ex_cmd_mask |=
-		(1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) |
-		(1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ) |
-		(1ull << IB_USER_VERBS_EX_CMD_CREATE_QP);
+	if (!dev->caps.userspace_caps)
+		ibdev->ib_dev.ops.uverbs_abi_ver =
+			MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION;
 
 	mlx4_ib_alloc_eqs(dev, ibdev);
 
@@ -2801,6 +2745,7 @@
 	for (i = 0; i < ibdev->num_ports; ++i) {
 		mutex_init(&ibdev->counters_table[i].mutex);
 		INIT_LIST_HEAD(&ibdev->counters_table[i].counters_list);
+		iboe->last_port_state[i] = IB_PORT_DOWN;
 	}
 
 	num_req_counters = mlx4_is_bonded(dev) ? 1 : ibdev->num_ports;
@@ -2898,8 +2843,8 @@
 	if (mlx4_ib_alloc_diag_counters(ibdev))
 		goto err_steer_free_bitmap;
 
-	ibdev->ib_dev.driver_id = RDMA_DRIVER_MLX4;
-	if (ib_register_device(&ibdev->ib_dev, NULL))
+	rdma_set_device_sysfs_group(&ibdev->ib_dev, &mlx4_attr_group);
+	if (ib_register_device(&ibdev->ib_dev, "mlx4_%d"))
 		goto err_diag_counters;
 
 	if (mlx4_ib_mad_init(ibdev))
@@ -2922,12 +2867,6 @@
 			goto err_notif;
 	}
 
-	for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
-		if (device_create_file(&ibdev->ib_dev.dev,
-				       mlx4_class_attributes[j]))
-			goto err_notif;
-	}
-
 	ibdev->ib_active = true;
 	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
 		devlink_port_type_ib_set(mlx4_get_devlink_port(dev, i),
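
The main.c wiring above uses the ib_device_ops table pattern: a const struct of
callbacks registered in one call via ib_set_device_ops() instead of per-field
assignments on the ib_device. A minimal sketch of that pattern, with
hypothetical "foo" driver names (not the mlx4 code itself):

	/* sketch only: assumes FOO_UVERBS_ABI_VERSION and the foo_* callbacks exist */
	static const struct ib_device_ops foo_dev_ops = {
		.owner = THIS_MODULE,
		.uverbs_abi_ver = FOO_UVERBS_ABI_VERSION,
		.query_device = foo_query_device,
		.create_cq = foo_create_cq,
		.destroy_cq = foo_destroy_cq,
		/* core-allocated objects declare their driver-private sizes here */
		INIT_RDMA_OBJ_SIZE(ib_pd, foo_pd, ibpd),
	};

	static int foo_register(struct foo_dev *fdev)
	{
		ib_set_device_ops(&fdev->ib_dev, &foo_dev_ops);
		/* optional features layer extra tables on top, as above */
		if (fdev->caps.has_xrc)
			ib_set_device_ops(&fdev->ib_dev, &foo_dev_xrc_ops);
		return ib_register_device(&fdev->ib_dev, "foo_%d");
	}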
diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c
index 81ffc00..d844831 100644
--- a/drivers/infiniband/hw/mlx4/mcg.c
+++ b/drivers/infiniband/hw/mlx4/mcg.c
@@ -673,7 +673,7 @@
 			if (!list_empty(&group->pending_list))
 				req = list_first_entry(&group->pending_list,
 						struct mcast_req, group_list);
-			if ((method == IB_MGMT_METHOD_GET_RESP)) {
+			if (method == IB_MGMT_METHOD_GET_RESP) {
 					if (req) {
 						send_reply_to_slave(req->func, group, &req->sa_mad, status);
 						--group->func[req->func].num_pend_reqs;
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index e10dccc..eb53bb4 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -80,16 +80,11 @@
 	HW_BAR_COUNT
 };
 
-struct mlx4_ib_vma_private_data {
-	struct vm_area_struct *vma;
-};
-
 struct mlx4_ib_ucontext {
 	struct ib_ucontext	ibucontext;
 	struct mlx4_uar		uar;
 	struct list_head	db_page_list;
 	struct mutex		db_page_mutex;
-	struct mlx4_ib_vma_private_data hw_bar_info[HW_BAR_COUNT];
 	struct list_head	wqn_ranges_list;
 	struct mutex		wqn_ranges_mutex; /* protect wqn_ranges_list */
 };
@@ -497,10 +492,11 @@
 	struct mlx4_sriov_alias_guid alias_guid;
 
 	/* CM paravirtualization fields */
-	struct list_head cm_list;
+	struct xarray pv_id_table;
+	u32 pv_id_next;
 	spinlock_t id_map_lock;
 	struct rb_root sl_id_map;
-	struct idr pv_id_table;
+	struct list_head cm_list;
 };
 
 struct gid_cache_context {
@@ -524,6 +520,7 @@
 	atomic64_t		mac[MLX4_MAX_PORTS];
 	struct notifier_block 	nb;
 	struct mlx4_port_gid_table gids[MLX4_MAX_PORTS];
+	enum ib_port_state	last_port_state[MLX4_MAX_PORTS];
 };
 
 struct pkey_mgt {
@@ -726,7 +723,7 @@
 int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev);
 void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev);
 
-int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+int mlx4_ib_db_map_user(struct ib_udata *udata, unsigned long virt,
 			struct mlx4_db *db);
 void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db);
 
@@ -736,43 +733,37 @@
 struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 virt_addr, int access_flags,
 				  struct ib_udata *udata);
-int mlx4_ib_dereg_mr(struct ib_mr *mr);
+int mlx4_ib_dereg_mr(struct ib_mr *mr, struct ib_udata *udata);
 struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
 			       struct ib_udata *udata);
 int mlx4_ib_dealloc_mw(struct ib_mw *mw);
-struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
-			       enum ib_mr_type mr_type,
-			       u32 max_num_sg);
+struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+			       u32 max_num_sg, struct ib_udata *udata);
 int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 		      unsigned int *sg_offset);
 int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
 int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
-struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
-				const struct ib_cq_init_attr *attr,
-				struct ib_ucontext *context,
-				struct ib_udata *udata);
-int mlx4_ib_destroy_cq(struct ib_cq *cq);
+int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		      struct ib_udata *udata);
+void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
 void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
 void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
 
-struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
-				struct ib_udata *udata);
-struct ib_ah *mlx4_ib_create_ah_slave(struct ib_pd *pd,
-				      struct rdma_ah_attr *ah_attr,
-				      int slave_sgid_index, u8 *s_mac,
-				      u16 vlan_tag);
+int mlx4_ib_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags,
+		      struct ib_udata *udata);
+int mlx4_ib_create_ah_slave(struct ib_ah *ah, struct rdma_ah_attr *ah_attr,
+			    int slave_sgid_index, u8 *s_mac, u16 vlan_tag);
 int mlx4_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
-int mlx4_ib_destroy_ah(struct ib_ah *ah);
+void mlx4_ib_destroy_ah(struct ib_ah *ah, u32 flags);
 
-struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
-				  struct ib_srq_init_attr *init_attr,
-				  struct ib_udata *udata);
+int mlx4_ib_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *init_attr,
+		       struct ib_udata *udata);
 int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 		       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
 int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
-int mlx4_ib_destroy_srq(struct ib_srq *srq);
+void mlx4_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata);
 void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index);
 int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
 			  const struct ib_recv_wr **bad_wr);
@@ -780,7 +771,7 @@
 struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
 				struct ib_qp_init_attr *init_attr,
 				struct ib_udata *udata);
-int mlx4_ib_destroy_qp(struct ib_qp *qp);
+int mlx4_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata);
 void mlx4_ib_drain_sq(struct ib_qp *qp);
 void mlx4_ib_drain_rq(struct ib_qp *qp);
 int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
@@ -915,7 +906,7 @@
 struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd,
 				struct ib_wq_init_attr *init_attr,
 				struct ib_udata *udata);
-int mlx4_ib_destroy_wq(struct ib_wq *wq);
+void mlx4_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
 int mlx4_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
 		      u32 wq_attr_mask, struct ib_udata *udata);
 
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index c7c85c2..6ae503c 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -258,7 +258,7 @@
 				       int *num_of_mtts)
 {
 	u64 block_shift = MLX4_MAX_MTT_SHIFT;
-	u64 min_shift = umem->page_shift;
+	u64 min_shift = PAGE_SHIFT;
 	u64 last_block_aligned_end = 0;
 	u64 current_block_start = 0;
 	u64 first_block_start = 0;
@@ -295,8 +295,8 @@
 			 * in access to the wrong data.
 			 */
 			misalignment_bits =
-			(start_va & (~(((u64)(BIT(umem->page_shift))) - 1ULL)))
-			^ current_block_start;
+				(start_va & (~(((u64)(PAGE_SIZE)) - 1ULL))) ^
+				current_block_start;
 			block_shift = min(alignment_of(misalignment_bits),
 					  block_shift);
 		}
@@ -367,9 +367,8 @@
 	return block_shift;
 }
 
-static struct ib_umem *mlx4_get_umem_mr(struct ib_ucontext *context, u64 start,
-					u64 length, u64 virt_addr,
-					int access_flags)
+static struct ib_umem *mlx4_get_umem_mr(struct ib_udata *udata, u64 start,
+					u64 length, int access_flags)
 {
 	/*
 	 * Force registering the memory as writable if the underlying pages
@@ -378,6 +377,7 @@
 	 * again
 	 */
 	if (!ib_access_writable(access_flags)) {
+		unsigned long untagged_start = untagged_addr(start);
 		struct vm_area_struct *vma;
 
 		down_read(&current->mm->mmap_sem);
@@ -386,9 +386,9 @@
 		 * cover the memory, but for now it requires a single vma to
 		 * entirely cover the MR to support RO mappings.
 		 */
-		vma = find_vma(current->mm, start);
-		if (vma && vma->vm_end >= start + length &&
-		    vma->vm_start <= start) {
+		vma = find_vma(current->mm, untagged_start);
+		if (vma && vma->vm_end >= untagged_start + length &&
+		    vma->vm_start <= untagged_start) {
 			if (vma->vm_flags & VM_WRITE)
 				access_flags |= IB_ACCESS_LOCAL_WRITE;
 		} else {
@@ -398,7 +398,7 @@
 		up_read(&current->mm->mmap_sem);
 	}
 
-	return ib_umem_get(context, start, length, access_flags, 0);
+	return ib_umem_get(udata, start, length, access_flags, 0);
 }
 
 struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
@@ -415,8 +415,7 @@
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
-	mr->umem = mlx4_get_umem_mr(pd->uobject->context, start, length,
-				    virt_addr, access_flags);
+	mr->umem = mlx4_get_umem_mr(udata, start, length, access_flags);
 	if (IS_ERR(mr->umem)) {
 		err = PTR_ERR(mr->umem);
 		goto err_free;
@@ -505,9 +504,8 @@
 
 		mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
 		ib_umem_release(mmr->umem);
-		mmr->umem =
-			mlx4_get_umem_mr(mr->uobject->context, start, length,
-					 virt_addr, mr_access_flags);
+		mmr->umem = mlx4_get_umem_mr(udata, start, length,
+					     mr_access_flags);
 		if (IS_ERR(mmr->umem)) {
 			err = PTR_ERR(mmr->umem);
 			/* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */
@@ -515,7 +513,7 @@
 			goto release_mpt_entry;
 		}
 		n = ib_umem_page_count(mmr->umem);
-		shift = mmr->umem->page_shift;
+		shift = PAGE_SHIFT;
 
 		err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr,
 					      virt_addr, length, n, shift,
@@ -596,7 +594,7 @@
 	}
 }
 
-int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
+int mlx4_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
 	struct mlx4_ib_mr *mr = to_mmr(ibmr);
 	int ret;
@@ -656,9 +654,8 @@
 	return 0;
 }
 
-struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
-			       enum ib_mr_type mr_type,
-			       u32 max_num_sg)
+struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+			       u32 max_num_sg, struct ib_udata *udata)
 {
 	struct mlx4_ib_dev *dev = to_mdev(pd->device);
 	struct mlx4_ib_mr *mr;
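
The mr.c change above strips pointer tag bits before the find_vma() lookup:
with ARM64 top-byte-ignore, the address userspace passes in may carry tag bits
that do not exist in the VMA tree, so range checks must run on the untagged
value. A generic illustration of the idea (assumed helper, not the driver code):

	#include <linux/mm.h>
	#include <linux/sched.h>

	/* Returns true if the user range [uaddr, uaddr + len) is covered by one
	 * writable VMA; uaddr may be a tagged pointer.
	 */
	static bool foo_range_is_writable(unsigned long uaddr, size_t len)
	{
		unsigned long start = untagged_addr(uaddr);	/* drop tag bits */
		struct vm_area_struct *vma;
		bool writable = false;

		down_read(&current->mm->mmap_sem);		/* mmap_sem in v5.4 */
		vma = find_vma(current->mm, start);
		if (vma && vma->vm_start <= start && vma->vm_end >= start + len)
			writable = !!(vma->vm_flags & VM_WRITE);
		up_read(&current->mm->mmap_sem);

		return writable;
	}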
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 6dd3cd2..bd4aa04 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -41,6 +41,7 @@
 #include <rdma/ib_pack.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_mad.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include <linux/mlx4/driver.h>
 #include <linux/mlx4/qp.h>
@@ -52,7 +53,8 @@
 			     struct mlx4_ib_cq *recv_cq);
 static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq,
 			       struct mlx4_ib_cq *recv_cq);
-static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state);
+static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state,
+			      struct ib_udata *udata);
 
 enum {
 	MLX4_IB_ACK_REQ_FREQ	= 8,
@@ -323,7 +325,7 @@
 }
 
 static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
-		       int is_user, int has_rq, struct mlx4_ib_qp *qp,
+		       bool is_user, bool has_rq, struct mlx4_ib_qp *qp,
 		       u32 inl_recv_sz)
 {
 	/* Sanity check RQ size before proceeding */
@@ -401,7 +403,7 @@
 	 * We need to leave 2 KB + 1 WR of headroom in the SQ to
 	 * allow HW to prefetch.
 	 */
-	qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
+	qp->sq_spare_wqes = MLX4_IB_SQ_HEADROOM(qp->sq.wqe_shift);
 	qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr +
 					    qp->sq_spare_wqes);
 
@@ -504,10 +506,10 @@
 	kfree(qp->sqp_proxy_rcv);
 }
 
-static int qp_has_rq(struct ib_qp_init_attr *attr)
+static bool qp_has_rq(struct ib_qp_init_attr *attr)
 {
 	if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT)
-		return 0;
+		return false;
 
 	return !attr->srq;
 }
@@ -853,20 +855,152 @@
 	mutex_unlock(&context->wqn_ranges_mutex);
 }
 
-static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
-			    enum mlx4_ib_source_type src,
-			    struct ib_qp_init_attr *init_attr,
+static int create_rq(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
+		     struct ib_udata *udata, struct mlx4_ib_qp *qp)
+{
+	struct mlx4_ib_dev *dev = to_mdev(pd->device);
+	int qpn;
+	int err;
+	struct mlx4_ib_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mlx4_ib_ucontext, ibucontext);
+	struct mlx4_ib_cq *mcq;
+	unsigned long flags;
+	int range_size;
+	struct mlx4_ib_create_wq wq;
+	size_t copy_len;
+	int shift;
+	int n;
+
+	qp->mlx4_ib_qp_type = MLX4_IB_QPT_RAW_PACKET;
+
+	mutex_init(&qp->mutex);
+	spin_lock_init(&qp->sq.lock);
+	spin_lock_init(&qp->rq.lock);
+	INIT_LIST_HEAD(&qp->gid_list);
+	INIT_LIST_HEAD(&qp->steering_rules);
+
+	qp->state = IB_QPS_RESET;
+
+	copy_len = min(sizeof(struct mlx4_ib_create_wq), udata->inlen);
+
+	if (ib_copy_from_udata(&wq, udata, copy_len)) {
+		err = -EFAULT;
+		goto err;
+	}
+
+	if (wq.comp_mask || wq.reserved[0] || wq.reserved[1] ||
+	    wq.reserved[2]) {
+		pr_debug("user command isn't supported\n");
+		err = -EOPNOTSUPP;
+		goto err;
+	}
+
+	if (wq.log_range_size > ilog2(dev->dev->caps.max_rss_tbl_sz)) {
+		pr_debug("WQN range size must be equal or smaller than %d\n",
+			 dev->dev->caps.max_rss_tbl_sz);
+		err = -EOPNOTSUPP;
+		goto err;
+	}
+	range_size = 1 << wq.log_range_size;
+
+	if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS)
+		qp->flags |= MLX4_IB_QP_SCATTER_FCS;
+
+	err = set_rq_size(dev, &init_attr->cap, true, true, qp, qp->inl_recv_sz);
+	if (err)
+		goto err;
+
+	qp->sq_no_prefetch = 1;
+	qp->sq.wqe_cnt = 1;
+	qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE;
+	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+		       (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+
+	qp->umem = ib_umem_get(udata, wq.buf_addr, qp->buf_size, 0, 0);
+	if (IS_ERR(qp->umem)) {
+		err = PTR_ERR(qp->umem);
+		goto err;
+	}
+
+	n = ib_umem_page_count(qp->umem);
+	shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n);
+	err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt);
+
+	if (err)
+		goto err_buf;
+
+	err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem);
+	if (err)
+		goto err_mtt;
+
+	err = mlx4_ib_db_map_user(udata, wq.db_addr, &qp->db);
+	if (err)
+		goto err_mtt;
+	qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS;
+
+	err = mlx4_ib_alloc_wqn(context, qp, range_size, &qpn);
+	if (err)
+		goto err_wrid;
+
+	err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
+	if (err)
+		goto err_qpn;
+
+	/*
+	 * Hardware wants QPN written in big-endian order (after
+	 * shifting) for send doorbell.  Precompute this value to save
+	 * a little bit when posting sends.
+	 */
+	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
+
+	qp->mqp.event = mlx4_ib_wq_event;
+
+	spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
+	mlx4_ib_lock_cqs(to_mcq(init_attr->send_cq),
+			 to_mcq(init_attr->recv_cq));
+	/* Maintain device to QPs access, needed for further handling
+	 * via reset flow
+	 */
+	list_add_tail(&qp->qps_list, &dev->qp_list);
+	/* Maintain CQ to QPs access, needed for further handling
+	 * via reset flow
+	 */
+	mcq = to_mcq(init_attr->send_cq);
+	list_add_tail(&qp->cq_send_list, &mcq->send_qp_list);
+	mcq = to_mcq(init_attr->recv_cq);
+	list_add_tail(&qp->cq_recv_list, &mcq->recv_qp_list);
+	mlx4_ib_unlock_cqs(to_mcq(init_attr->send_cq),
+			   to_mcq(init_attr->recv_cq));
+	spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
+	return 0;
+
+err_qpn:
+	mlx4_ib_release_wqn(context, qp, 0);
+err_wrid:
+	mlx4_ib_db_unmap_user(context, &qp->db);
+
+err_mtt:
+	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+err_buf:
+	ib_umem_release(qp->umem);
+err:
+	return err;
+}
+
+static int create_qp_common(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
 			    struct ib_udata *udata, int sqpn,
 			    struct mlx4_ib_qp **caller_qp)
 {
+	struct mlx4_ib_dev *dev = to_mdev(pd->device);
 	int qpn;
 	int err;
 	struct mlx4_ib_sqp *sqp = NULL;
 	struct mlx4_ib_qp *qp;
+	struct mlx4_ib_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mlx4_ib_ucontext, ibucontext);
 	enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
 	struct mlx4_ib_cq *mcq;
 	unsigned long flags;
-	int range_size = 0;
 
 	/* When tunneling special qps, we use a plain UD qp */
 	if (sqpn) {
@@ -917,15 +1051,13 @@
 			if (!sqp)
 				return -ENOMEM;
 			qp = &sqp->qp;
-			qp->pri.vid = 0xFFFF;
-			qp->alt.vid = 0xFFFF;
 		} else {
 			qp = kzalloc(sizeof(struct mlx4_ib_qp), GFP_KERNEL);
 			if (!qp)
 				return -ENOMEM;
-			qp->pri.vid = 0xFFFF;
-			qp->alt.vid = 0xFFFF;
 		}
+		qp->pri.vid = 0xFFFF;
+		qp->alt.vid = 0xFFFF;
 	} else
 		qp = *caller_qp;
 
@@ -937,48 +1069,24 @@
 	INIT_LIST_HEAD(&qp->gid_list);
 	INIT_LIST_HEAD(&qp->steering_rules);
 
-	qp->state	 = IB_QPS_RESET;
+	qp->state = IB_QPS_RESET;
 	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
 		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
 
-
-	if (pd->uobject) {
-		union {
-			struct mlx4_ib_create_qp qp;
-			struct mlx4_ib_create_wq wq;
-		} ucmd;
+	if (udata) {
+		struct mlx4_ib_create_qp ucmd;
 		size_t copy_len;
 		int shift;
 		int n;
 
-		copy_len = (src == MLX4_IB_QP_SRC) ?
-			   sizeof(struct mlx4_ib_create_qp) :
-			   min(sizeof(struct mlx4_ib_create_wq), udata->inlen);
+		copy_len = sizeof(struct mlx4_ib_create_qp);
 
 		if (ib_copy_from_udata(&ucmd, udata, copy_len)) {
 			err = -EFAULT;
 			goto err;
 		}
 
-		if (src == MLX4_IB_RWQ_SRC) {
-			if (ucmd.wq.comp_mask || ucmd.wq.reserved[0] ||
-			    ucmd.wq.reserved[1] || ucmd.wq.reserved[2]) {
-				pr_debug("user command isn't supported\n");
-				err = -EOPNOTSUPP;
-				goto err;
-			}
-
-			if (ucmd.wq.log_range_size >
-			    ilog2(dev->dev->caps.max_rss_tbl_sz)) {
-				pr_debug("WQN range size must be equal or smaller than %d\n",
-					 dev->dev->caps.max_rss_tbl_sz);
-				err = -EOPNOTSUPP;
-				goto err;
-			}
-			range_size = 1 << ucmd.wq.log_range_size;
-		} else {
-			qp->inl_recv_sz = ucmd.qp.inl_recv_sz;
-		}
+		qp->inl_recv_sz = ucmd.inl_recv_sz;
 
 		if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) {
 			if (!(dev->dev->caps.flags &
@@ -991,33 +1099,19 @@
 			qp->flags |= MLX4_IB_QP_SCATTER_FCS;
 		}
 
-		err = set_rq_size(dev, &init_attr->cap, !!pd->uobject,
+		err = set_rq_size(dev, &init_attr->cap, udata,
 				  qp_has_rq(init_attr), qp, qp->inl_recv_sz);
 		if (err)
 			goto err;
 
-		if (src == MLX4_IB_QP_SRC) {
-			qp->sq_no_prefetch = ucmd.qp.sq_no_prefetch;
+		qp->sq_no_prefetch = ucmd.sq_no_prefetch;
 
-			err = set_user_sq_size(dev, qp,
-					       (struct mlx4_ib_create_qp *)
-					       &ucmd);
-			if (err)
-				goto err;
-		} else {
-			qp->sq_no_prefetch = 1;
-			qp->sq.wqe_cnt = 1;
-			qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE;
-			/* Allocated buffer expects to have at least that SQ
-			 * size.
-			 */
-			qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
-				(qp->sq.wqe_cnt << qp->sq.wqe_shift);
-		}
+		err = set_user_sq_size(dev, qp, &ucmd);
+		if (err)
+			goto err;
 
-		qp->umem = ib_umem_get(pd->uobject->context,
-				(src == MLX4_IB_QP_SRC) ? ucmd.qp.buf_addr :
-				ucmd.wq.buf_addr, qp->buf_size, 0, 0);
+		qp->umem =
+			ib_umem_get(udata, ucmd.buf_addr, qp->buf_size, 0, 0);
 		if (IS_ERR(qp->umem)) {
 			err = PTR_ERR(qp->umem);
 			goto err;
@@ -1035,15 +1129,13 @@
 			goto err_mtt;
 
 		if (qp_has_rq(init_attr)) {
-			err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
-				(src == MLX4_IB_QP_SRC) ? ucmd.qp.db_addr :
-				ucmd.wq.db_addr, &qp->db);
+			err = mlx4_ib_db_map_user(udata, ucmd.db_addr, &qp->db);
 			if (err)
 				goto err_mtt;
 		}
 		qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS;
 	} else {
-		err = set_rq_size(dev, &init_attr->cap, !!pd->uobject,
+		err = set_rq_size(dev, &init_attr->cap, udata,
 				  qp_has_rq(init_attr), qp, 0);
 		if (err)
 			goto err;
@@ -1107,11 +1199,6 @@
 				goto err_wrid;
 			}
 		}
-	} else if (src == MLX4_IB_RWQ_SRC) {
-		err = mlx4_ib_alloc_wqn(to_mucontext(pd->uobject->context), qp,
-					range_size, &qpn);
-		if (err)
-			goto err_wrid;
 	} else {
 		/* Raw packet QPNs may not have bits 6,7 set in their qp_num;
 		 * otherwise, the WQE BlueFlame setup flow wrongly causes
@@ -1150,8 +1237,7 @@
 	 */
 	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
 
-	qp->mqp.event = (src == MLX4_IB_QP_SRC) ? mlx4_ib_qp_event :
-						  mlx4_ib_wq_event;
+	qp->mqp.event = mlx4_ib_qp_event;
 
 	if (!*caller_qp)
 		*caller_qp = qp;
@@ -1179,9 +1265,6 @@
 	if (!sqpn) {
 		if (qp->flags & MLX4_IB_QP_NETIF)
 			mlx4_ib_steer_qp_free(dev, qpn, 1);
-		else if (src == MLX4_IB_RWQ_SRC)
-			mlx4_ib_release_wqn(to_mucontext(pd->uobject->context),
-					    qp, 0);
 		else
 			mlx4_qp_release_range(dev->dev, qpn, 1);
 	}
@@ -1189,9 +1272,9 @@
 	if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI)
 		free_proxy_bufs(pd->device, qp);
 err_wrid:
-	if (pd->uobject) {
+	if (udata) {
 		if (qp_has_rq(init_attr))
-			mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db);
+			mlx4_ib_db_unmap_user(context, &qp->db);
 	} else {
 		kvfree(qp->sq.wrid);
 		kvfree(qp->rq.wrid);
@@ -1201,20 +1284,19 @@
 	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
 
 err_buf:
-	if (pd->uobject)
-		ib_umem_release(qp->umem);
-	else
+	if (!qp->umem)
 		mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+	ib_umem_release(qp->umem);
 
 err_db:
-	if (!pd->uobject && qp_has_rq(init_attr))
+	if (!udata && qp_has_rq(init_attr))
 		mlx4_db_free(dev->dev, &qp->db);
 
 err:
-	if (sqp)
-		kfree(sqp);
-	else if (!*caller_qp)
+	if (!sqp && !*caller_qp)
 		kfree(qp);
+	kfree(sqp);
+
 	return err;
 }
 
@@ -1332,7 +1414,8 @@
 }
 
 static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
-			      enum mlx4_ib_source_type src, int is_user)
+			      enum mlx4_ib_source_type src,
+			      struct ib_udata *udata)
 {
 	struct mlx4_ib_cq *send_cq, *recv_cq;
 	unsigned long flags;
@@ -1374,7 +1457,7 @@
 	list_del(&qp->qps_list);
 	list_del(&qp->cq_send_list);
 	list_del(&qp->cq_recv_list);
-	if (!is_user) {
+	if (!udata) {
 		__mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
 				 qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL);
 		if (send_cq != recv_cq)
@@ -1392,22 +1475,28 @@
 		if (qp->flags & MLX4_IB_QP_NETIF)
 			mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1);
 		else if (src == MLX4_IB_RWQ_SRC)
-			mlx4_ib_release_wqn(to_mucontext(
-					    qp->ibwq.uobject->context), qp, 1);
+			mlx4_ib_release_wqn(
+				rdma_udata_to_drv_context(
+					udata,
+					struct mlx4_ib_ucontext,
+					ibucontext),
+				qp, 1);
 		else
 			mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
 	}
 
 	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
 
-	if (is_user) {
+	if (udata) {
 		if (qp->rq.wqe_cnt) {
-			struct mlx4_ib_ucontext *mcontext = !src ?
-				to_mucontext(qp->ibqp.uobject->context) :
-				to_mucontext(qp->ibwq.uobject->context);
+			struct mlx4_ib_ucontext *mcontext =
+				rdma_udata_to_drv_context(
+					udata,
+					struct mlx4_ib_ucontext,
+					ibucontext);
+
 			mlx4_ib_db_unmap_user(mcontext, &qp->db);
 		}
-		ib_umem_release(qp->umem);
 	} else {
 		kvfree(qp->sq.wrid);
 		kvfree(qp->rq.wrid);
@@ -1418,6 +1507,7 @@
 		if (qp->rq.wqe_cnt)
 			mlx4_db_free(dev->dev, &qp->db);
 	}
+	ib_umem_release(qp->umem);
 
 	del_gid_entries(qp);
 }
@@ -1505,8 +1595,7 @@
 		/* fall through */
 	case IB_QPT_UD:
 	{
-		err = create_qp_common(to_mdev(pd->device), pd,	MLX4_IB_QP_SRC,
-				       init_attr, udata, 0, &qp);
+		err = create_qp_common(pd, init_attr, udata, 0, &qp);
 		if (err) {
 			kfree(qp);
 			return ERR_PTR(err);
@@ -1536,8 +1625,7 @@
 			sqpn = get_sqp_num(to_mdev(pd->device), init_attr);
 		}
 
-		err = create_qp_common(to_mdev(pd->device), pd, MLX4_IB_QP_SRC,
-				       init_attr, udata, sqpn, &qp);
+		err = create_qp_common(pd, init_attr, udata, sqpn, &qp);
 		if (err)
 			return ERR_PTR(err);
 
@@ -1588,7 +1676,7 @@
 	return ibqp;
 }
 
-static int _mlx4_ib_destroy_qp(struct ib_qp *qp)
+static int _mlx4_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
 {
 	struct mlx4_ib_dev *dev = to_mdev(qp->device);
 	struct mlx4_ib_qp *mqp = to_mqp(qp);
@@ -1609,10 +1697,7 @@
 	if (qp->rwq_ind_tbl) {
 		destroy_qp_rss(dev, mqp);
 	} else {
-		struct mlx4_ib_pd *pd;
-
-		pd = get_pd(mqp);
-		destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, !!pd->ibpd.uobject);
+		destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, udata);
 	}
 
 	if (is_sqp(dev, mqp))
@@ -1623,7 +1708,7 @@
 	return 0;
 }
 
-int mlx4_ib_destroy_qp(struct ib_qp *qp)
+int mlx4_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
 {
 	struct mlx4_ib_qp *mqp = to_mqp(qp);
 
@@ -1634,7 +1719,7 @@
 			ib_destroy_qp(sqp->roce_v2_gsi);
 	}
 
-	return _mlx4_ib_destroy_qp(qp);
+	return _mlx4_ib_destroy_qp(qp, udata);
 }
 
 static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type)
@@ -1941,7 +2026,8 @@
  * Go over all RSS QP's childes (WQs) and apply their HW state according to
  * their logic state if the RSS QP is the first RSS QP associated for the WQ.
  */
-static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num)
+static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num,
+			    struct ib_udata *udata)
 {
 	int err = 0;
 	int i;
@@ -1965,7 +2051,7 @@
 		}
 		wq->port = port_num;
 		if ((wq->rss_usecnt == 0) && (ibwq->state == IB_WQS_RDY)) {
-			err = _mlx4_ib_modify_wq(ibwq, IB_WQS_RDY);
+			err = _mlx4_ib_modify_wq(ibwq, IB_WQS_RDY, udata);
 			if (err) {
 				mutex_unlock(&wq->mutex);
 				break;
@@ -1987,7 +2073,8 @@
 
 			if ((wq->rss_usecnt == 1) &&
 			    (ibwq->state == IB_WQS_RDY))
-				if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET))
+				if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET,
+						       udata))
 					pr_warn("failed to reverse WQN=0x%06x\n",
 						ibwq->wq_num);
 			wq->rss_usecnt--;
@@ -1999,7 +2086,8 @@
 	return err;
 }
 
-static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl)
+static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl,
+				struct ib_udata *udata)
 {
 	int i;
 
@@ -2010,7 +2098,7 @@
 		mutex_lock(&wq->mutex);
 
 		if ((wq->rss_usecnt == 1) && (ibwq->state == IB_WQS_RDY))
-			if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET))
+			if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET, udata))
 				pr_warn("failed to reverse WQN=%x\n",
 					ibwq->wq_num);
 		wq->rss_usecnt--;
@@ -2042,9 +2130,10 @@
 
 static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
 			       const struct ib_qp_attr *attr, int attr_mask,
-			       enum ib_qp_state cur_state, enum ib_qp_state new_state)
+			       enum ib_qp_state cur_state,
+			       enum ib_qp_state new_state,
+			       struct ib_udata *udata)
 {
-	struct ib_uobject *ibuobject;
 	struct ib_srq  *ibsrq;
 	const struct ib_gid_attr *gid_attr = NULL;
 	struct ib_rwq_ind_table *rwq_ind_tbl;
@@ -2053,6 +2142,8 @@
 	struct mlx4_ib_qp *qp;
 	struct mlx4_ib_pd *pd;
 	struct mlx4_ib_cq *send_cq, *recv_cq;
+	struct mlx4_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mlx4_ib_ucontext, ibucontext);
 	struct mlx4_qp_context *context;
 	enum mlx4_qp_optpar optpar = 0;
 	int sqd_event;
@@ -2064,7 +2155,6 @@
 		struct ib_wq *ibwq;
 
 		ibwq	    = (struct ib_wq *)src;
-		ibuobject   = ibwq->uobject;
 		ibsrq	    = NULL;
 		rwq_ind_tbl = NULL;
 		qp_type     = IB_QPT_RAW_PACKET;
@@ -2075,7 +2165,6 @@
 		struct ib_qp *ibqp;
 
 		ibqp	    = (struct ib_qp *)src;
-		ibuobject   = ibqp->uobject;
 		ibsrq	    = ibqp->srq;
 		rwq_ind_tbl = ibqp->rwq_ind_tbl;
 		qp_type     = ibqp->qp_type;
@@ -2160,11 +2249,9 @@
 			context->param3 |= cpu_to_be32(1 << 30);
 	}
 
-	if (ibuobject)
+	if (ucontext)
 		context->usr_page = cpu_to_be32(
-			mlx4_to_hw_uar_index(dev->dev,
-					     to_mucontext(ibuobject->context)
-					     ->uar.index));
+			mlx4_to_hw_uar_index(dev->dev, ucontext->uar.index));
 	else
 		context->usr_page = cpu_to_be32(
 			mlx4_to_hw_uar_index(dev->dev, dev->priv_uar.index));
@@ -2235,8 +2322,10 @@
 
 		if (is_eth) {
 			gid_attr = attr->ah_attr.grh.sgid_attr;
-			vlan = rdma_vlan_dev_vlan_id(gid_attr->ndev);
-			memcpy(smac, gid_attr->ndev->dev_addr, ETH_ALEN);
+			err = rdma_read_gid_l2_fields(gid_attr, &vlan,
+						      &smac[0]);
+			if (err)
+				goto out;
 		}
 
 		if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path,
@@ -2296,7 +2385,7 @@
 	context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn);
 
 	/* Set "fast registration enabled" for all kernel QPs */
-	if (!ibuobject)
+	if (!ucontext)
 		context->params1 |= cpu_to_be32(1 << 11);
 
 	if (attr_mask & IB_QP_RNR_RETRY) {
@@ -2433,7 +2522,7 @@
 	else
 		sqd_event = 0;
 
-	if (!ibuobject &&
+	if (!ucontext &&
 	    cur_state == IB_QPS_RESET &&
 	    new_state == IB_QPS_INIT)
 		context->rlkey_roce_mode |= (1 << 4);
@@ -2444,7 +2533,7 @@
 	 * headroom is stamped so that the hardware doesn't start
 	 * processing stale work requests.
 	 */
-	if (!ibuobject &&
+	if (!ucontext &&
 	    cur_state == IB_QPS_RESET &&
 	    new_state == IB_QPS_INIT) {
 		struct mlx4_wqe_ctrl_seg *ctrl;
@@ -2508,7 +2597,7 @@
 	 * entries and reinitialize the QP.
 	 */
 	if (new_state == IB_QPS_RESET) {
-		if (!ibuobject) {
+		if (!ucontext) {
 			mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
 					 ibsrq ? to_msrq(ibsrq) : NULL);
 			if (send_cq != recv_cq)
@@ -2629,7 +2718,6 @@
 static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 			      int attr_mask, struct ib_udata *udata)
 {
-	enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED;
 	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
 	struct mlx4_ib_qp *qp = to_mqp(ibqp);
 	enum ib_qp_state cur_state, new_state;
@@ -2639,13 +2727,8 @@
 	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
 	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
 
-	if (cur_state != new_state || cur_state != IB_QPS_RESET) {
-		int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
-		ll = rdma_port_get_link_layer(&dev->ib_dev, port);
-	}
-
 	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
-				attr_mask, ll)) {
+				attr_mask)) {
 		pr_debug("qpn 0x%x: invalid attribute mask specified "
 			 "for transition %d to %d. qp_type %d,"
 			 " attr_mask 0x%x\n",
@@ -2740,16 +2823,17 @@
 	}
 
 	if (ibqp->rwq_ind_tbl && (new_state == IB_QPS_INIT)) {
-		err = bringup_rss_rwqs(ibqp->rwq_ind_tbl, attr->port_num);
+		err = bringup_rss_rwqs(ibqp->rwq_ind_tbl, attr->port_num,
+				       udata);
 		if (err)
 			goto out;
 	}
 
 	err = __mlx4_ib_modify_qp(ibqp, MLX4_IB_QP_SRC, attr, attr_mask,
-				  cur_state, new_state);
+				  cur_state, new_state, udata);
 
 	if (ibqp->rwq_ind_tbl && err)
-		bring_down_rss_rwqs(ibqp->rwq_ind_tbl);
+		bring_down_rss_rwqs(ibqp->rwq_ind_tbl, udata);
 
 	if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT))
 		attr->port_num = 1;
@@ -3744,12 +3828,6 @@
 		writel_relaxed(qp->doorbell_qpn,
 			to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL);
 
-		/*
-		 * Make sure doorbells don't leak out of SQ spinlock
-		 * and reach the HCA out of order.
-		 */
-		mmiowb();
-
 		stamp_send_wqe(qp, ind + qp->sq_spare_wqes - 1);
 
 		qp->sq_next_wqe = ind;
@@ -4044,13 +4122,13 @@
 				struct ib_wq_init_attr *init_attr,
 				struct ib_udata *udata)
 {
-	struct mlx4_ib_dev *dev;
-	struct ib_qp_init_attr ib_qp_init_attr;
+	struct mlx4_dev *dev = to_mdev(pd->device)->dev;
+	struct ib_qp_init_attr ib_qp_init_attr = {};
 	struct mlx4_ib_qp *qp;
 	struct mlx4_ib_create_wq ucmd;
 	int err, required_cmd_sz;
 
-	if (!(udata && pd->uobject))
+	if (!udata)
 		return ERR_PTR(-EINVAL);
 
 	required_cmd_sz = offsetof(typeof(ucmd), comp_mask) +
@@ -4070,14 +4148,13 @@
 	if (udata->outlen)
 		return ERR_PTR(-EOPNOTSUPP);
 
-	dev = to_mdev(pd->device);
-
 	if (init_attr->wq_type != IB_WQT_RQ) {
 		pr_debug("unsupported wq type %d\n", init_attr->wq_type);
 		return ERR_PTR(-EOPNOTSUPP);
 	}
 
-	if (init_attr->create_flags & ~IB_WQ_FLAGS_SCATTER_FCS) {
+	if (init_attr->create_flags & ~IB_WQ_FLAGS_SCATTER_FCS ||
+	    !(dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP)) {
 		pr_debug("unsupported create_flags %u\n",
 			 init_attr->create_flags);
 		return ERR_PTR(-EOPNOTSUPP);
@@ -4090,7 +4167,6 @@
 	qp->pri.vid = 0xFFFF;
 	qp->alt.vid = 0xFFFF;
 
-	memset(&ib_qp_init_attr, 0, sizeof(ib_qp_init_attr));
 	ib_qp_init_attr.qp_context = init_attr->wq_context;
 	ib_qp_init_attr.qp_type = IB_QPT_RAW_PACKET;
 	ib_qp_init_attr.cap.max_recv_wr = init_attr->max_wr;
@@ -4101,8 +4177,7 @@
 	if (init_attr->create_flags & IB_WQ_FLAGS_SCATTER_FCS)
 		ib_qp_init_attr.create_flags |= IB_QP_CREATE_SCATTER_FCS;
 
-	err = create_qp_common(dev, pd, MLX4_IB_RWQ_SRC, &ib_qp_init_attr,
-			       udata, 0, &qp);
+	err = create_rq(pd, &ib_qp_init_attr, udata, qp);
 	if (err) {
 		kfree(qp);
 		return ERR_PTR(err);
@@ -4127,7 +4202,8 @@
 	}
 }
 
-static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state)
+static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state,
+			      struct ib_udata *udata)
 {
 	struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq);
 	enum ib_qp_state qp_cur_state;
@@ -4151,7 +4227,8 @@
 		attr_mask = IB_QP_PORT;
 
 		err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, &attr,
-					  attr_mask, IB_QPS_RESET, IB_QPS_INIT);
+					  attr_mask, IB_QPS_RESET, IB_QPS_INIT,
+					  udata);
 		if (err) {
 			pr_debug("WQN=0x%06x failed to apply RST->INIT on the HW QP\n",
 				 ibwq->wq_num);
@@ -4163,12 +4240,13 @@
 
 	attr_mask = 0;
 	err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, attr_mask,
-				  qp_cur_state,  qp_new_state);
+				  qp_cur_state,  qp_new_state, udata);
 
 	if (err && (qp_cur_state == IB_QPS_INIT)) {
 		qp_new_state = IB_QPS_RESET;
 		if (__mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL,
-					attr_mask, IB_QPS_INIT, IB_QPS_RESET)) {
+					attr_mask, IB_QPS_INIT, IB_QPS_RESET,
+					udata)) {
 			pr_warn("WQN=0x%06x failed with reverting HW's resources failure\n",
 				ibwq->wq_num);
 			qp_new_state = IB_QPS_INIT;
@@ -4231,7 +4309,7 @@
 	 * WQ, so we can apply its port on the WQ.
 	 */
 	if (qp->rss_usecnt)
-		err = _mlx4_ib_modify_wq(ibwq, new_state);
+		err = _mlx4_ib_modify_wq(ibwq, new_state, udata);
 
 	if (!err)
 		ibwq->state = new_state;
@@ -4241,7 +4319,7 @@
 	return err;
 }
 
-int mlx4_ib_destroy_wq(struct ib_wq *ibwq)
+void mlx4_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata)
 {
 	struct mlx4_ib_dev *dev = to_mdev(ibwq->device);
 	struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq);
@@ -4249,11 +4327,9 @@
 	if (qp->counter_index)
 		mlx4_ib_free_qp_counter(dev, qp);
 
-	destroy_qp_common(dev, qp, MLX4_IB_RWQ_SRC, 1);
+	destroy_qp_common(dev, qp, MLX4_IB_RWQ_SRC, udata);
 
 	kfree(qp);
-
-	return 0;
 }
 
 struct ib_rwq_ind_table
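
Throughout the qp.c changes above, the driver ucontext is recovered from the
ib_udata via rdma_udata_to_drv_context() instead of dereferencing
pd->uobject->context. A short sketch of how a verbs handler typically uses it
(hypothetical foo_* structures and helpers):

	#include <rdma/uverbs_ioctl.h>

	struct foo_ucontext {
		struct ib_ucontext ibucontext;
		u32 uar_index;
	};

	static int foo_create_thing(struct ib_pd *pd, struct ib_udata *udata)
	{
		/* NULL for kernel callers (udata == NULL), the driver's
		 * ucontext for userspace callers
		 */
		struct foo_ucontext *uctx = rdma_udata_to_drv_context(
			udata, struct foo_ucontext, ibucontext);

		if (uctx)
			return foo_create_thing_user(pd, uctx, udata);
		return foo_create_thing_kernel(pd);
	}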
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index 3731b31..848db72 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -37,6 +37,7 @@
 
 #include "mlx4_ib.h"
 #include <rdma/mlx4-abi.h>
+#include <rdma/uverbs_ioctl.h>
 
 static void *get_wqe(struct mlx4_ib_srq *srq, int n)
 {
@@ -68,12 +69,14 @@
 	}
 }
 
-struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
-				  struct ib_srq_init_attr *init_attr,
-				  struct ib_udata *udata)
+int mlx4_ib_create_srq(struct ib_srq *ib_srq,
+		       struct ib_srq_init_attr *init_attr,
+		       struct ib_udata *udata)
 {
-	struct mlx4_ib_dev *dev = to_mdev(pd->device);
-	struct mlx4_ib_srq *srq;
+	struct mlx4_ib_dev *dev = to_mdev(ib_srq->device);
+	struct mlx4_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mlx4_ib_ucontext, ibucontext);
+	struct mlx4_ib_srq *srq = to_msrq(ib_srq);
 	struct mlx4_wqe_srq_next_seg *next;
 	struct mlx4_wqe_data_seg *scatter;
 	u32 cqn;
@@ -86,11 +89,7 @@
 	/* Sanity check SRQ size before proceeding */
 	if (init_attr->attr.max_wr  >= dev->dev->caps.max_srq_wqes ||
 	    init_attr->attr.max_sge >  dev->dev->caps.max_srq_sge)
-		return ERR_PTR(-EINVAL);
-
-	srq = kmalloc(sizeof *srq, GFP_KERNEL);
-	if (!srq)
-		return ERR_PTR(-ENOMEM);
+		return -EINVAL;
 
 	mutex_init(&srq->mutex);
 	spin_lock_init(&srq->lock);
@@ -105,23 +104,18 @@
 
 	buf_size = srq->msrq.max * desc_size;
 
-	if (pd->uobject) {
+	if (udata) {
 		struct mlx4_ib_create_srq ucmd;
 
-		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
-			err = -EFAULT;
-			goto err_srq;
-		}
+		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
+			return -EFAULT;
 
-		srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
-					buf_size, 0, 0);
-		if (IS_ERR(srq->umem)) {
-			err = PTR_ERR(srq->umem);
-			goto err_srq;
-		}
+		srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0, 0);
+		if (IS_ERR(srq->umem))
+			return PTR_ERR(srq->umem);
 
 		err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem),
-				    srq->umem->page_shift, &srq->mtt);
+				    PAGE_SHIFT, &srq->mtt);
 		if (err)
 			goto err_buf;
 
@@ -129,14 +123,13 @@
 		if (err)
 			goto err_mtt;
 
-		err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
-					  ucmd.db_addr, &srq->db);
+		err = mlx4_ib_db_map_user(udata, ucmd.db_addr, &srq->db);
 		if (err)
 			goto err_mtt;
 	} else {
 		err = mlx4_db_alloc(dev->dev, &srq->db, 0);
 		if (err)
-			goto err_srq;
+			return err;
 
 		*srq->db.db = 0;
 
@@ -183,15 +176,15 @@
 	xrcdn = (init_attr->srq_type == IB_SRQT_XRC) ?
 		to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn :
 		(u16) dev->dev->caps.reserved_xrcds;
-	err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, cqn, xrcdn, &srq->mtt,
-			     srq->db.dma, &srq->msrq);
+	err = mlx4_srq_alloc(dev->dev, to_mpd(ib_srq->pd)->pdn, cqn, xrcdn,
+			     &srq->mtt, srq->db.dma, &srq->msrq);
 	if (err)
 		goto err_wrid;
 
 	srq->msrq.event = mlx4_ib_srq_event;
 	srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn;
 
-	if (pd->uobject)
+	if (udata)
 		if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) {
 			err = -EFAULT;
 			goto err_wrid;
@@ -199,11 +192,11 @@
 
 	init_attr->attr.max_wr = srq->msrq.max - 1;
 
-	return &srq->ibsrq;
+	return 0;
 
 err_wrid:
-	if (pd->uobject)
-		mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db);
+	if (udata)
+		mlx4_ib_db_unmap_user(ucontext, &srq->db);
 	else
 		kvfree(srq->wrid);
 
@@ -211,19 +204,15 @@
 	mlx4_mtt_cleanup(dev->dev, &srq->mtt);
 
 err_buf:
-	if (pd->uobject)
-		ib_umem_release(srq->umem);
-	else
+	if (!srq->umem)
 		mlx4_buf_free(dev->dev, buf_size, &srq->buf);
+	ib_umem_release(srq->umem);
 
 err_db:
-	if (!pd->uobject)
+	if (!udata)
 		mlx4_db_free(dev->dev, &srq->db);
 
-err_srq:
-	kfree(srq);
-
-	return ERR_PTR(err);
+	return err;
 }
 
 int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
@@ -270,7 +259,7 @@
 	return 0;
 }
 
-int mlx4_ib_destroy_srq(struct ib_srq *srq)
+void mlx4_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata)
 {
 	struct mlx4_ib_dev *dev = to_mdev(srq->device);
 	struct mlx4_ib_srq *msrq = to_msrq(srq);
@@ -278,19 +267,20 @@
 	mlx4_srq_free(dev->dev, &msrq->msrq);
 	mlx4_mtt_cleanup(dev->dev, &msrq->mtt);
 
-	if (srq->uobject) {
-		mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
-		ib_umem_release(msrq->umem);
+	if (udata) {
+		mlx4_ib_db_unmap_user(
+			rdma_udata_to_drv_context(
+				udata,
+				struct mlx4_ib_ucontext,
+				ibucontext),
+			&msrq->db);
 	} else {
 		kvfree(msrq->wrid);
 		mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift,
 			      &msrq->buf);
 		mlx4_db_free(dev->dev, &msrq->db);
 	}
-
-	kfree(msrq);
-
-	return 0;
+	ib_umem_release(msrq->umem);
 }
 
 void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index)
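
The srq.c conversion above follows the core-allocated object model: the core
sizes and allocates struct mlx4_ib_srq itself (via the
INIT_RDMA_OBJ_SIZE(ib_srq, mlx4_ib_srq, ibsrq) entry in the ops table), so
create_srq receives an embedded ib_srq and returns int, while destroy_srq
returns void and never frees the structure. A hedged sketch of the callback
shape under this model (hypothetical foo_* names):

	struct foo_srq {
		struct ib_srq ibsrq;	/* must match the INIT_RDMA_OBJ_SIZE member */
		u32 srqn;
	};

	static int foo_create_srq(struct ib_srq *ibsrq,
				  struct ib_srq_init_attr *init_attr,
				  struct ib_udata *udata)
	{
		struct foo_srq *srq = container_of(ibsrq, struct foo_srq, ibsrq);

		/* no kzalloc()/kfree() here - the core owns the memory */
		return foo_hw_srq_alloc(ibsrq->device, srq, init_attr, udata);
	}

	static void foo_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
	{
		struct foo_srq *srq = container_of(ibsrq, struct foo_srq, ibsrq);

		foo_hw_srq_free(ibsrq->device, srq);
		/* the core frees the containing foo_srq after this returns */
	}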
diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c
index e219093..ea1f3a0 100644
--- a/drivers/infiniband/hw/mlx4/sysfs.c
+++ b/drivers/infiniband/hw/mlx4/sysfs.c
@@ -353,16 +353,12 @@
 
 static void get_name(struct mlx4_ib_dev *dev, char *name, int i, int max)
 {
-	char base_name[9];
-
-	/* pci_name format is: bus:dev:func -> xxxx:yy:zz.n */
-	strlcpy(name, pci_name(dev->dev->persist->pdev), max);
-	strncpy(base_name, name, 8); /*till xxxx:yy:*/
-	base_name[8] = '\0';
-	/* with no ARI only 3 last bits are used so when the fn is higher than 8
+	/* pci_name format is: bus:dev:func -> xxxx:yy:zz.n
+	 * with no ARI only 3 last bits are used so when the fn is higher than 8
 	 * need to add it to the dev num, so count in the last number will be
 	 * modulo 8 */
-	sprintf(name, "%s%.2d.%d", base_name, (i/8), (i%8));
+	snprintf(name, max, "%.8s%.2d.%d", pci_name(dev->dev->persist->pdev),
+		 i / 8, i % 8);
 }
 
 struct mlx4_port {
@@ -818,9 +814,7 @@
 	if (!mlx4_is_master(dev->dev))
 		return 0;
 
-	dev->iov_parent =
-		kobject_create_and_add("iov",
-				       kobject_get(dev->ib_dev.ports_parent->parent));
+	dev->iov_parent = kobject_create_and_add("iov", &dev->ib_dev.dev.kobj);
 	if (!dev->iov_parent) {
 		ret = -ENOMEM;
 		goto err;
@@ -850,7 +844,6 @@
 err_ports:
 	kobject_put(dev->iov_parent);
 err:
-	kobject_put(dev->ib_dev.ports_parent->parent);
 	pr_err("mlx4_ib_device_register_sysfs error (%d)\n", ret);
 	return ret;
 }
@@ -886,5 +879,4 @@
 	kobject_put(device->ports_parent);
 	kobject_put(device->iov_parent);
 	kobject_put(device->iov_parent);
-	kobject_put(device->ib_dev.ports_parent->parent);
 }
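
The get_name() rewrite above leans on snprintf() field precision to bound the
copy from pci_name(), replacing the fixed base_name buffer and manual NUL
termination. For reference, "%.Ns" reads at most N characters of the source
string (example values only):

	#include <stdio.h>

	int main(void)
	{
		char name[32];
		const char *pci = "0000:3b:00.0";	/* pci_name()-style string */
		int i = 1;

		/* "%.8s" copies at most 8 chars of pci ("0000:3b:") */
		snprintf(name, sizeof(name), "%.8s%.2d.%d", pci, i / 8, i % 8);
		printf("%s\n", name);			/* prints 0000:3b:00.1 */
		return 0;
	}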
diff --git a/drivers/infiniband/hw/mlx5/Kconfig b/drivers/infiniband/hw/mlx5/Kconfig
index 0440966..ea248de 100644
--- a/drivers/infiniband/hw/mlx5/Kconfig
+++ b/drivers/infiniband/hw/mlx5/Kconfig
@@ -1,7 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config MLX5_INFINIBAND
 	tristate "Mellanox 5th generation network adapters (ConnectX series) support"
 	depends on NETDEVICES && ETHERNET && PCI && MLX5_CORE
-	depends on INFINIBAND_USER_ACCESS || INFINIBAND_USER_ACCESS=n
 	---help---
 	  This driver provides low-level InfiniBand support for
 	  Mellanox Connect-IB PCI Express host channel adapters (HCAs).
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile
index b8e4b15..9924be8 100644
--- a/drivers/infiniband/hw/mlx5/Makefile
+++ b/drivers/infiniband/hw/mlx5/Makefile
@@ -1,6 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_MLX5_INFINIBAND)	+= mlx5_ib.o
 
-mlx5_ib-y :=	main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o cong.o
+mlx5_ib-y :=	main.o cq.o doorbell.o qp.o mem.o srq_cmd.o \
+		srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o \
+		cong.o
 mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o
 mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o
 mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o
diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c
index ffd03bf..80642dd 100644
--- a/drivers/infiniband/hw/mlx5/ah.c
+++ b/drivers/infiniband/hw/mlx5/ah.c
@@ -32,9 +32,8 @@
 
 #include "mlx5_ib.h"
 
-static struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev,
-				  struct mlx5_ib_ah *ah,
-				  struct rdma_ah_attr *ah_attr)
+static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah,
+			 struct rdma_ah_attr *ah_attr)
 {
 	enum ib_gid_type gid_type;
 
@@ -67,21 +66,19 @@
 		ah->av.fl_mlid = rdma_ah_get_path_bits(ah_attr) & 0x7f;
 		ah->av.stat_rate_sl |= (rdma_ah_get_sl(ah_attr) & 0xf);
 	}
-
-	return &ah->ibah;
 }
 
-struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
-				struct ib_udata *udata)
+int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr,
+		      u32 flags, struct ib_udata *udata)
 
 {
-	struct mlx5_ib_ah *ah;
-	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_ib_ah *ah = to_mah(ibah);
+	struct mlx5_ib_dev *dev = to_mdev(ibah->device);
 	enum rdma_ah_attr_type ah_type = ah_attr->type;
 
 	if ((ah_type == RDMA_AH_ATTR_TYPE_ROCE) &&
 	    !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (ah_type == RDMA_AH_ATTR_TYPE_ROCE && udata) {
 		int err;
@@ -90,21 +87,18 @@
 				   sizeof(resp.dmac);
 
 		if (udata->outlen < min_resp_len)
-			return ERR_PTR(-EINVAL);
+			return -EINVAL;
 
 		resp.response_length = min_resp_len;
 
 		memcpy(resp.dmac, ah_attr->roce.dmac, ETH_ALEN);
 		err = ib_copy_to_udata(udata, &resp, resp.response_length);
 		if (err)
-			return ERR_PTR(err);
+			return err;
 	}
 
-	ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
-	if (!ah)
-		return ERR_PTR(-ENOMEM);
-
-	return create_ib_ah(dev, ah, ah_attr); /* never fails */
+	create_ib_ah(dev, ah, ah_attr);
+	return 0;
 }
 
 int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr)
@@ -131,8 +125,7 @@
 	return 0;
 }
 
-int mlx5_ib_destroy_ah(struct ib_ah *ah)
+void mlx5_ib_destroy_ah(struct ib_ah *ah, u32 flags)
 {
-	kfree(to_mah(ah));
-	return 0;
+	return;
 }
diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c
index c84fef9..4937947 100644
--- a/drivers/infiniband/hw/mlx5/cmd.c
+++ b/drivers/infiniband/hw/mlx5/cmd.c
@@ -82,10 +82,10 @@
 	return mlx5_cmd_exec(dev, in, in_size, out, sizeof(out));
 }
 
-int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr,
-			  u64 length, u32 alignment)
+int mlx5_cmd_alloc_memic(struct mlx5_dm *dm, phys_addr_t *addr,
+			 u64 length, u32 alignment)
 {
-	struct mlx5_core_dev *dev = memic->dev;
+	struct mlx5_core_dev *dev = dm->dev;
 	u64 num_memic_hw_pages = MLX5_CAP_DEV_MEM(dev, memic_bar_size)
 					>> PAGE_SHIFT;
 	u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr);
@@ -115,17 +115,17 @@
 		 mlx5_alignment);
 
 	while (page_idx < num_memic_hw_pages) {
-		spin_lock(&memic->memic_lock);
-		page_idx = bitmap_find_next_zero_area(memic->memic_alloc_pages,
+		spin_lock(&dm->lock);
+		page_idx = bitmap_find_next_zero_area(dm->memic_alloc_pages,
 						      num_memic_hw_pages,
 						      page_idx,
 						      num_pages, 0);
 
 		if (page_idx < num_memic_hw_pages)
-			bitmap_set(memic->memic_alloc_pages,
+			bitmap_set(dm->memic_alloc_pages,
 				   page_idx, num_pages);
 
-		spin_unlock(&memic->memic_lock);
+		spin_unlock(&dm->lock);
 
 		if (page_idx >= num_memic_hw_pages)
 			break;
@@ -135,10 +135,10 @@
 
 		ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 		if (ret) {
-			spin_lock(&memic->memic_lock);
-			bitmap_clear(memic->memic_alloc_pages,
+			spin_lock(&dm->lock);
+			bitmap_clear(dm->memic_alloc_pages,
 				     page_idx, num_pages);
-			spin_unlock(&memic->memic_lock);
+			spin_unlock(&dm->lock);
 
 			if (ret == -EAGAIN) {
 				page_idx++;
@@ -148,7 +148,7 @@
 			return ret;
 		}
 
-		*addr = pci_resource_start(dev->pdev, 0) +
+		*addr = dev->bar_addr +
 			MLX5_GET64(alloc_memic_out, out, memic_start_addr);
 
 		return 0;
@@ -157,9 +157,9 @@
 	return -ENOMEM;
 }
 
-int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length)
+int mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length)
 {
-	struct mlx5_core_dev *dev = memic->dev;
+	struct mlx5_core_dev *dev = dm->dev;
 	u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr);
 	u32 num_pages = DIV_ROUND_UP(length, PAGE_SIZE);
 	u32 out[MLX5_ST_SZ_DW(dealloc_memic_out)] = {0};
@@ -167,7 +167,7 @@
 	u64 start_page_idx;
 	int err;
 
-	addr -= pci_resource_start(dev->pdev, 0);
+	addr -= dev->bar_addr;
 	start_page_idx = (addr - hw_start_addr) >> PAGE_SHIFT;
 
 	MLX5_SET(dealloc_memic_in, in, opcode, MLX5_CMD_OP_DEALLOC_MEMIC);
@@ -177,10 +177,10 @@
 	err =  mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 
 	if (!err) {
-		spin_lock(&memic->memic_lock);
-		bitmap_clear(memic->memic_alloc_pages,
+		spin_lock(&dm->lock);
+		bitmap_clear(dm->memic_alloc_pages,
 			     start_page_idx, num_pages);
-		spin_unlock(&memic->memic_lock);
+		spin_unlock(&dm->lock);
 	}
 
 	return err;
@@ -197,3 +197,188 @@
 	return  mlx5_core_access_reg(dev, in, sz, out, sz, MLX5_REG_PPCNT,
 				     0, 0);
 }
+
+void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_tir_in)]   = {};
+	u32 out[MLX5_ST_SZ_DW(destroy_tir_out)] = {};
+
+	MLX5_SET(destroy_tir_in, in, opcode, MLX5_CMD_OP_DESTROY_TIR);
+	MLX5_SET(destroy_tir_in, in, tirn, tirn);
+	MLX5_SET(destroy_tir_in, in, uid, uid);
+	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+void mlx5_cmd_destroy_tis(struct mlx5_core_dev *dev, u32 tisn, u16 uid)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_tis_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(destroy_tis_out)] = {0};
+
+	MLX5_SET(destroy_tis_in, in, opcode, MLX5_CMD_OP_DESTROY_TIS);
+	MLX5_SET(destroy_tis_in, in, tisn, tisn);
+	MLX5_SET(destroy_tis_in, in, uid, uid);
+	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+void mlx5_cmd_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn, u16 uid)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)]   = {};
+	u32 out[MLX5_ST_SZ_DW(destroy_rqt_out)] = {};
+
+	MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT);
+	MLX5_SET(destroy_rqt_in, in, rqtn, rqtn);
+	MLX5_SET(destroy_rqt_in, in, uid, uid);
+	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_cmd_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn,
+				    u16 uid)
+{
+	u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {0};
+	int err;
+
+	MLX5_SET(alloc_transport_domain_in, in, opcode,
+		 MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN);
+	MLX5_SET(alloc_transport_domain_in, in, uid, uid);
+
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (!err)
+		*tdn = MLX5_GET(alloc_transport_domain_out, out,
+				transport_domain);
+
+	return err;
+}
+
+void mlx5_cmd_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn,
+				       u16 uid)
+{
+	u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(dealloc_transport_domain_out)] = {0};
+
+	MLX5_SET(dealloc_transport_domain_in, in, opcode,
+		 MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN);
+	MLX5_SET(dealloc_transport_domain_in, in, uid, uid);
+	MLX5_SET(dealloc_transport_domain_in, in, transport_domain, tdn);
+	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid)
+{
+	u32 out[MLX5_ST_SZ_DW(dealloc_pd_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(dealloc_pd_in)]   = {};
+
+	MLX5_SET(dealloc_pd_in, in, opcode, MLX5_CMD_OP_DEALLOC_PD);
+	MLX5_SET(dealloc_pd_in, in, pd, pdn);
+	MLX5_SET(dealloc_pd_in, in, uid, uid);
+	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_cmd_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid,
+			u32 qpn, u16 uid)
+{
+	u32 out[MLX5_ST_SZ_DW(attach_to_mcg_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(attach_to_mcg_in)]   = {};
+	void *gid;
+
+	MLX5_SET(attach_to_mcg_in, in, opcode, MLX5_CMD_OP_ATTACH_TO_MCG);
+	MLX5_SET(attach_to_mcg_in, in, qpn, qpn);
+	MLX5_SET(attach_to_mcg_in, in, uid, uid);
+	gid = MLX5_ADDR_OF(attach_to_mcg_in, in, multicast_gid);
+	memcpy(gid, mgid, sizeof(*mgid));
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid,
+			u32 qpn, u16 uid)
+{
+	u32 out[MLX5_ST_SZ_DW(detach_from_mcg_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(detach_from_mcg_in)]   = {};
+	void *gid;
+
+	MLX5_SET(detach_from_mcg_in, in, opcode, MLX5_CMD_OP_DETACH_FROM_MCG);
+	MLX5_SET(detach_from_mcg_in, in, qpn, qpn);
+	MLX5_SET(detach_from_mcg_in, in, uid, uid);
+	gid = MLX5_ADDR_OF(detach_from_mcg_in, in, multicast_gid);
+	memcpy(gid, mgid, sizeof(*mgid));
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid)
+{
+	u32 out[MLX5_ST_SZ_DW(alloc_xrcd_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(alloc_xrcd_in)]   = {};
+	int err;
+
+	MLX5_SET(alloc_xrcd_in, in, opcode, MLX5_CMD_OP_ALLOC_XRCD);
+	MLX5_SET(alloc_xrcd_in, in, uid, uid);
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (!err)
+		*xrcdn = MLX5_GET(alloc_xrcd_out, out, xrcd);
+	return err;
+}
+
+int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid)
+{
+	u32 out[MLX5_ST_SZ_DW(dealloc_xrcd_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(dealloc_xrcd_in)]   = {};
+
+	MLX5_SET(dealloc_xrcd_in, in, opcode, MLX5_CMD_OP_DEALLOC_XRCD);
+	MLX5_SET(dealloc_xrcd_in, in, xrcd, xrcdn);
+	MLX5_SET(dealloc_xrcd_in, in, uid, uid);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_cmd_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id,
+			     u16 uid)
+{
+	u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {0};
+	int err;
+
+	MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER);
+	MLX5_SET(alloc_q_counter_in, in, uid, uid);
+
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (!err)
+		*counter_id = MLX5_GET(alloc_q_counter_out, out,
+				       counter_set_id);
+	return err;
+}
+
+int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb,
+		     u16 opmod, u8 port)
+{
+	int outlen = MLX5_ST_SZ_BYTES(mad_ifc_out);
+	int inlen = MLX5_ST_SZ_BYTES(mad_ifc_in);
+	int err = -ENOMEM;
+	void *data;
+	void *resp;
+	u32 *out;
+	u32 *in;
+
+	in = kzalloc(inlen, GFP_KERNEL);
+	out = kzalloc(outlen, GFP_KERNEL);
+	if (!in || !out)
+		goto out;
+
+	MLX5_SET(mad_ifc_in, in, opcode, MLX5_CMD_OP_MAD_IFC);
+	MLX5_SET(mad_ifc_in, in, op_mod, opmod);
+	MLX5_SET(mad_ifc_in, in, port, port);
+
+	data = MLX5_ADDR_OF(mad_ifc_in, in, mad);
+	memcpy(data, inb, MLX5_FLD_SZ_BYTES(mad_ifc_in, mad));
+
+	err = mlx5_cmd_exec(dev, in, inlen, out, outlen);
+	if (err)
+		goto out;
+
+	resp = MLX5_ADDR_OF(mad_ifc_out, out, response_mad_packet);
+	memcpy(outb, resp,
+	       MLX5_FLD_SZ_BYTES(mad_ifc_out, response_mad_packet));
+
+out:
+	kfree(out);
+	kfree(in);
+	return err;
+}
diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h
index 88cbb1c..169cab4 100644
--- a/drivers/infiniband/hw/mlx5/cmd.h
+++ b/drivers/infiniband/hw/mlx5/cmd.h
@@ -44,7 +44,25 @@
 int mlx5_cmd_query_ext_ppcnt_counters(struct mlx5_core_dev *dev, void *out);
 int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *mdev,
 				void *in, int in_size);
-int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr,
+int mlx5_cmd_alloc_memic(struct mlx5_dm *dm, phys_addr_t *addr,
 			 u64 length, u32 alignment);
-int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length);
+int mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length);
+void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid);
+void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid);
+void mlx5_cmd_destroy_tis(struct mlx5_core_dev *dev, u32 tisn, u16 uid);
+void mlx5_cmd_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn, u16 uid);
+int mlx5_cmd_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn,
+				    u16 uid);
+void mlx5_cmd_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn,
+				       u16 uid);
+int mlx5_cmd_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid,
+			u32 qpn, u16 uid);
+int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid,
+			u32 qpn, u16 uid);
+int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid);
+int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid);
+int mlx5_cmd_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id,
+			     u16 uid);
+int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb,
+		     u16 opmod, u8 port);
 #endif /* MLX5_IB_CMD_H */
diff --git a/drivers/infiniband/hw/mlx5/cong.c b/drivers/infiniband/hw/mlx5/cong.c
index 7e4e358..8ba439f 100644
--- a/drivers/infiniband/hw/mlx5/cong.c
+++ b/drivers/infiniband/hw/mlx5/cong.c
@@ -389,19 +389,19 @@
 	dev->port[port_num].dbg_cc_params = NULL;
 }
 
-int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num)
+void mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num)
 {
 	struct mlx5_ib_dbg_cc_params *dbg_cc_params;
 	struct mlx5_core_dev *mdev;
 	int i;
 
 	if (!mlx5_debugfs_root)
-		goto out;
+		return;
 
 	/* Takes a 1-based port number */
 	mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL);
 	if (!mdev)
-		goto out;
+		return;
 
 	if (!MLX5_CAP_GEN(mdev, cc_query_allowed) ||
 	    !MLX5_CAP_GEN(mdev, cc_modify_allowed))
@@ -415,8 +415,6 @@
 
 	dbg_cc_params->root = debugfs_create_dir("cc_params",
 						 mdev->priv.dbg_root);
-	if (!dbg_cc_params->root)
-		goto err;
 
 	for (i = 0; i < MLX5_IB_DBG_CC_MAX; i++) {
 		dbg_cc_params->params[i].offset = i;
@@ -427,14 +425,11 @@
 					    0600, dbg_cc_params->root,
 					    &dbg_cc_params->params[i],
 					    &dbg_cc_fops);
-		if (!dbg_cc_params->params[i].dentry)
-			goto err;
 	}
 
 put_mdev:
 	mlx5_ib_put_native_port_mdev(dev, port_num + 1);
-out:
-	return 0;
+	return;
 
 err:
 	mlx5_ib_warn(dev, "cong debugfs failure\n");
@@ -445,5 +440,5 @@
 	 * We don't want to fail the driver if debugfs failed to initialize,
 	 * so we do not forward the error to the user.
 	 */
-	return 0;
+	return;
 }
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 088205d..45f48cd 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -35,8 +35,9 @@
 #include <rdma/ib_user_verbs.h>
 #include <rdma/ib_cache.h>
 #include "mlx5_ib.h"
+#include "srq.h"
 
-static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq)
+static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe)
 {
 	struct ib_cq *ibcq = &to_mibcq(cq)->ibcq;
 
@@ -81,7 +82,7 @@
 
 	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
 
-	if (likely((cqe64->op_own) >> 4 != MLX5_CQE_INVALID) &&
+	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1)))) {
 		return cqe;
 	} else {
@@ -177,8 +178,7 @@
 		struct mlx5_core_srq *msrq = NULL;
 
 		if (qp->ibqp.xrcd) {
-			msrq = mlx5_core_get_srq(dev->mdev,
-						 be32_to_cpu(cqe->srqn));
+			msrq = mlx5_cmd_get_srq(dev, be32_to_cpu(cqe->srqn));
 			srq = to_mibsrq(msrq);
 		} else {
 			srq = to_msrq(qp->ibqp.srq);
@@ -187,8 +187,8 @@
 			wqe_ctr = be16_to_cpu(cqe->wqe_counter);
 			wc->wr_id = srq->wrid[wqe_ctr];
 			mlx5_ib_free_srq_wqe(srq, wqe_ctr);
-			if (msrq && atomic_dec_and_test(&msrq->refcount))
-				complete(&msrq->free);
+			if (msrq)
+				mlx5_core_res_put(&msrq->common);
 		}
 	} else {
 		wq	  = &qp->rq;
@@ -197,7 +197,7 @@
 	}
 	wc->byte_len = be32_to_cpu(cqe->byte_cnt);
 
-	switch (cqe->op_own >> 4) {
+	switch (get_cqe_opcode(cqe)) {
 	case MLX5_CQE_RESP_WR_IMM:
 		wc->opcode	= IB_WC_RECV_RDMA_WITH_IMM;
 		wc->wc_flags	= IB_WC_WITH_IMM;
@@ -330,70 +330,9 @@
 		dump_cqe(dev, cqe);
 }
 
-static int is_atomic_response(struct mlx5_ib_qp *qp, uint16_t idx)
-{
-	/* TBD: waiting decision
-	*/
-	return 0;
-}
-
-static void *mlx5_get_atomic_laddr(struct mlx5_ib_qp *qp, uint16_t idx)
-{
-	struct mlx5_wqe_data_seg *dpseg;
-	void *addr;
-
-	dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) +
-		sizeof(struct mlx5_wqe_raddr_seg) +
-		sizeof(struct mlx5_wqe_atomic_seg);
-	addr = (void *)(unsigned long)be64_to_cpu(dpseg->addr);
-	return addr;
-}
-
-static void handle_atomic(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,
-			  uint16_t idx)
-{
-	void *addr;
-	int byte_count;
-	int i;
-
-	if (!is_atomic_response(qp, idx))
-		return;
-
-	byte_count = be32_to_cpu(cqe64->byte_cnt);
-	addr = mlx5_get_atomic_laddr(qp, idx);
-
-	if (byte_count == 4) {
-		*(uint32_t *)addr = be32_to_cpu(*((__be32 *)addr));
-	} else {
-		for (i = 0; i < byte_count; i += 8) {
-			*(uint64_t *)addr = be64_to_cpu(*((__be64 *)addr));
-			addr += 8;
-		}
-	}
-
-	return;
-}
-
-static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,
-			   u16 tail, u16 head)
-{
-	u16 idx;
-
-	do {
-		idx = tail & (qp->sq.wqe_cnt - 1);
-		handle_atomic(qp, cqe64, idx);
-		if (idx == head)
-			break;
-
-		tail = qp->sq.w_list[idx].next;
-	} while (1);
-	tail = qp->sq.w_list[idx].next;
-	qp->sq.last_poll = tail;
-}
-
 static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf)
 {
-	mlx5_frag_buf_free(dev->mdev, &buf->fbc.frag_buf);
+	mlx5_frag_buf_free(dev->mdev, &buf->frag_buf);
 }
 
 static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe,
@@ -428,45 +367,15 @@
 	item->key = be32_to_cpu(cqe->mkey);
 }
 
-static void sw_send_comp(struct mlx5_ib_qp *qp, int num_entries,
-			 struct ib_wc *wc, int *npolled)
-{
-	struct mlx5_ib_wq *wq;
-	unsigned int cur;
-	unsigned int idx;
-	int np;
-	int i;
-
-	wq = &qp->sq;
-	cur = wq->head - wq->tail;
-	np = *npolled;
-
-	if (cur == 0)
-		return;
-
-	for (i = 0;  i < cur && np < num_entries; i++) {
-		idx = wq->last_poll & (wq->wqe_cnt - 1);
-		wc->wr_id = wq->wrid[idx];
-		wc->status = IB_WC_WR_FLUSH_ERR;
-		wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
-		wq->tail++;
-		np++;
-		wc->qp = &qp->ibqp;
-		wc++;
-		wq->last_poll = wq->w_list[idx].next;
-	}
-	*npolled = np;
-}
-
-static void sw_recv_comp(struct mlx5_ib_qp *qp, int num_entries,
-			 struct ib_wc *wc, int *npolled)
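+/*
+ * Emit software completions with IB_WC_WR_FLUSH_ERR for uncompleted WQEs on
+ * the QP's send or receive queue.
+ */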
+static void sw_comp(struct mlx5_ib_qp *qp, int num_entries, struct ib_wc *wc,
+		    int *npolled, int is_send)
 {
 	struct mlx5_ib_wq *wq;
 	unsigned int cur;
 	int np;
 	int i;
 
-	wq = &qp->rq;
+	wq = (is_send) ? &qp->sq : &qp->rq;
 	cur = wq->head - wq->tail;
 	np = *npolled;
 
@@ -493,13 +402,13 @@
 	*npolled = 0;
 	/* Find uncompleted WQEs belonging to that cq and return mimicked completions for them */
 	list_for_each_entry(qp, &cq->list_send_qp, cq_send_list) {
-		sw_send_comp(qp, num_entries, wc + *npolled, npolled);
+		sw_comp(qp, num_entries, wc + *npolled, npolled, true);
 		if (*npolled >= num_entries)
 			return;
 	}
 
 	list_for_each_entry(qp, &cq->list_recv_qp, cq_recv_list) {
-		sw_recv_comp(qp, num_entries, wc + *npolled, npolled);
+		sw_comp(qp, num_entries, wc + *npolled, npolled, false);
 		if (*npolled >= num_entries)
 			return;
 	}
@@ -537,7 +446,7 @@
 	 */
 	rmb();
 
-	opcode = cqe64->op_own >> 4;
+	opcode = get_cqe_opcode(cqe64);
 	if (unlikely(opcode == MLX5_CQE_RESIZE_CQ)) {
 		if (likely(cq->resize_buf)) {
 			free_cq_buf(dev, &cq->buf);
@@ -567,7 +476,6 @@
 		wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
 		idx = wqe_ctr & (wq->wqe_cnt - 1);
 		handle_good_req(wc, cqe64, wq, idx);
-		handle_atomics(*cur_qp, cqe64, wq->last_poll, idx);
 		wc->wr_id = wq->wrid[idx];
 		wq->tail = wq->wqe_head[idx] + 1;
 		wc->status = IB_WC_SUCCESS;
@@ -614,9 +522,9 @@
 	case MLX5_CQE_SIG_ERR:
 		sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64;
 
-		read_lock(&dev->mdev->priv.mkey_table.lock);
-		mmkey = __mlx5_mr_lookup(dev->mdev,
-					 mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey)));
+		xa_lock(&dev->mdev->priv.mkey_table);
+		mmkey = xa_load(&dev->mdev->priv.mkey_table,
+				mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey)));
 		mr = to_mibmr(mmkey);
 		get_sig_err_item(sig_err_cqe, &mr->sig->err_item);
 		mr->sig->sig_err_exists = true;
@@ -629,7 +537,7 @@
 			     mr->sig->err_item.expected,
 			     mr->sig->err_item.actual);
 
-		read_unlock(&dev->mdev->priv.mkey_table.lock);
+		xa_unlock(&dev->mdev->priv.mkey_table);
 		goto repoll;
 	}
 
@@ -728,16 +636,11 @@
 			     int nent,
 			     int cqe_size)
 {
-	struct mlx5_frag_buf_ctrl *c = &buf->fbc;
-	struct mlx5_frag_buf *frag_buf = &c->frag_buf;
-	u32 cqc_buff[MLX5_ST_SZ_DW(cqc)] = {0};
+	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
+	u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
+	u8 log_wq_sz     = ilog2(cqe_size);
 	int err;
 
-	MLX5_SET(cqc, cqc_buff, log_cq_size, ilog2(cqe_size));
-	MLX5_SET(cqc, cqc_buff, cqe_sz, (cqe_size == 128) ? 1 : 0);
-
-	mlx5_core_init_cq_frag_buf(&buf->fbc, cqc_buff);
-
 	err = mlx5_frag_buf_alloc_node(dev->mdev,
 				       nent * cqe_size,
 				       frag_buf,
@@ -745,6 +648,8 @@
 	if (err)
 		return err;
 
+	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
+
 	buf->cqe_size = cqe_size;
 	buf->nent = nent;
 
@@ -774,8 +679,7 @@
 }
 
 static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
-			  struct ib_ucontext *context, struct mlx5_ib_cq *cq,
-			  int entries, u32 **cqb,
+			  struct mlx5_ib_cq *cq, int entries, u32 **cqb,
 			  int *cqe_size, int *index, int *inlen)
 {
 	struct mlx5_ib_create_cq ucmd = {};
@@ -786,6 +690,8 @@
 	int ncont;
 	void *cqc;
 	int err;
+	struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
 
 	ucmdlen = udata->inlen < sizeof(ucmd) ?
 		  (sizeof(ucmd) - sizeof(ucmd.flags)) : sizeof(ucmd);
@@ -802,16 +708,15 @@
 
 	*cqe_size = ucmd.cqe_size;
 
-	cq->buf.umem = ib_umem_get(context, ucmd.buf_addr,
-				   entries * ucmd.cqe_size,
-				   IB_ACCESS_LOCAL_WRITE, 1);
+	cq->buf.umem =
+		ib_umem_get(udata, ucmd.buf_addr, entries * ucmd.cqe_size,
+			    IB_ACCESS_LOCAL_WRITE, 1);
 	if (IS_ERR(cq->buf.umem)) {
 		err = PTR_ERR(cq->buf.umem);
 		return err;
 	}
 
-	err = mlx5_ib_db_map_user(to_mucontext(context), ucmd.db_addr,
-				  &cq->db);
+	err = mlx5_ib_db_map_user(context, udata, ucmd.db_addr, &cq->db);
 	if (err)
 		goto err_umem;
 
@@ -835,7 +740,7 @@
 	MLX5_SET(cqc, cqc, log_page_size,
 		 page_shift - MLX5_ADAPTER_PAGE_SHIFT);
 
-	*index = to_mucontext(context)->bfregi.sys_pages[0];
+	*index = context->bfregi.sys_pages[0];
 
 	if (ucmd.cqe_comp_en == 1) {
 		int mini_cqe_format;
@@ -877,22 +782,26 @@
 		cq->private_flags |= MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD;
 	}
 
+	MLX5_SET(create_cq_in, *cqb, uid, context->devx_uid);
 	return 0;
 
 err_cqb:
 	kvfree(*cqb);
 
 err_db:
-	mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db);
+	mlx5_ib_db_unmap_user(context, &cq->db);
 
 err_umem:
 	ib_umem_release(cq->buf.umem);
 	return err;
 }
 
-static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_ucontext *context)
+static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_udata *udata)
 {
-	mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db);
+	struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
+
+	mlx5_ib_db_unmap_user(context, &cq->db);
 	ib_umem_release(cq->buf.umem);
 }
 
@@ -934,7 +843,7 @@
 
 	*inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
 		 MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
-		 cq->buf.fbc.frag_buf.npages;
+		 cq->buf.frag_buf.npages;
 	*cqb = kvzalloc(*inlen, GFP_KERNEL);
 	if (!*cqb) {
 		err = -ENOMEM;
@@ -942,11 +851,11 @@
 	}
 
 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, *cqb, pas);
-	mlx5_fill_page_frag_array(&cq->buf.fbc.frag_buf, pas);
+	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
 
 	cqc = MLX5_ADDR_OF(create_cq_in, *cqb, cq_context);
 	MLX5_SET(cqc, cqc, log_page_size,
-		 cq->buf.fbc.frag_buf.page_shift -
+		 cq->buf.frag_buf.page_shift -
 		 MLX5_ADAPTER_PAGE_SHIFT);
 
 	*index = dev->mdev->priv.uar->index;
@@ -975,15 +884,15 @@
 	cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
 }
 
-struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
-				const struct ib_cq_init_attr *attr,
-				struct ib_ucontext *context,
-				struct ib_udata *udata)
+int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		      struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	int entries = attr->cqe;
 	int vector = attr->comp_vector;
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
-	struct mlx5_ib_cq *cq;
+	struct mlx5_ib_cq *cq = to_mcq(ibcq);
+	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
 	int uninitialized_var(index);
 	int uninitialized_var(inlen);
 	u32 *cqb = NULL;
@@ -995,18 +904,14 @@
 
 	if (entries < 0 ||
 	    (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (check_cq_create_flags(attr->flags))
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 
 	entries = roundup_pow_of_two(entries + 1);
 	if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))
-		return ERR_PTR(-EINVAL);
-
-	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-	if (!cq)
-		return ERR_PTR(-ENOMEM);
+		return -EINVAL;
 
 	cq->ibcq.cqe = entries - 1;
 	mutex_init(&cq->resize_mutex);
@@ -1017,17 +922,17 @@
 	INIT_LIST_HEAD(&cq->list_send_qp);
 	INIT_LIST_HEAD(&cq->list_recv_qp);
 
-	if (context) {
-		err = create_cq_user(dev, udata, context, cq, entries,
-				     &cqb, &cqe_size, &index, &inlen);
+	if (udata) {
+		err = create_cq_user(dev, udata, cq, entries, &cqb, &cqe_size,
+				     &index, &inlen);
 		if (err)
-			goto err_create;
+			return err;
 	} else {
 		cqe_size = cache_line_size() == 128 ? 128 : 64;
 		err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb,
 				       &index, &inlen);
 		if (err)
-			goto err_create;
+			return err;
 
 		INIT_WORK(&cq->notify_work, notify_soft_wc_handler);
 	}
@@ -1050,13 +955,13 @@
 	if (cq->create_flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN)
 		MLX5_SET(cqc, cqc, oi, 1);
 
-	err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen);
+	err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen, out, sizeof(out));
 	if (err)
 		goto err_cqb;
 
 	mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn);
 	cq->mcq.irqn = irqn;
-	if (context)
+	if (udata)
 		cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp;
 	else
 		cq->mcq.comp  = mlx5_ib_cq_comp;
@@ -1064,7 +969,7 @@
 
 	INIT_LIST_HEAD(&cq->wc_list);
 
-	if (context)
+	if (udata)
 		if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) {
 			err = -EFAULT;
 			goto err_cmd;
@@ -1072,43 +977,30 @@
 
 
 	kvfree(cqb);
-	return &cq->ibcq;
+	return 0;
 
 err_cmd:
 	mlx5_core_destroy_cq(dev->mdev, &cq->mcq);
 
 err_cqb:
 	kvfree(cqb);
-	if (context)
-		destroy_cq_user(cq, context);
+	if (udata)
+		destroy_cq_user(cq, udata);
 	else
 		destroy_cq_kernel(dev, cq);
-
-err_create:
-	kfree(cq);
-
-	return ERR_PTR(err);
+	return err;
 }
 
-
-int mlx5_ib_destroy_cq(struct ib_cq *cq)
+void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(cq->device);
 	struct mlx5_ib_cq *mcq = to_mcq(cq);
-	struct ib_ucontext *context = NULL;
-
-	if (cq->uobject)
-		context = cq->uobject->context;
 
 	mlx5_core_destroy_cq(dev->mdev, &mcq->mcq);
-	if (context)
-		destroy_cq_user(mcq, context);
+	if (udata)
+		destroy_cq_user(mcq, udata);
 	else
 		destroy_cq_kernel(dev, mcq);
-
-	kfree(mcq);
-
-	return 0;
 }
 
 static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn)
@@ -1205,7 +1097,6 @@
 	struct ib_umem *umem;
 	int err;
 	int npages;
-	struct ib_ucontext *context = cq->buf.umem->context;
 
 	err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd));
 	if (err)
@@ -1218,7 +1109,7 @@
 	if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1)
 		return -EINVAL;
 
-	umem = ib_umem_get(context, ucmd.buf_addr,
+	umem = ib_umem_get(udata, ucmd.buf_addr,
 			   (size_t)ucmd.cqe_size * entries,
 			   IB_ACCESS_LOCAL_WRITE, 1);
 	if (IS_ERR(umem)) {
@@ -1235,11 +1126,6 @@
 	return 0;
 }
 
-static void un_resize_user(struct mlx5_ib_cq *cq)
-{
-	ib_umem_release(cq->resize_umem);
-}
-
 static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
 			 int entries, int cqe_size)
 {
@@ -1262,12 +1148,6 @@
 	return err;
 }
 
-static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq)
-{
-	free_cq_buf(dev, cq->resize_buf);
-	cq->resize_buf = NULL;
-}
-
 static int copy_resize_cqes(struct mlx5_ib_cq *cq)
 {
 	struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
@@ -1297,7 +1177,7 @@
 		return -EINVAL;
 	}
 
-	while ((scqe64->op_own >> 4) != MLX5_CQE_RESIZE_CQ) {
+	while (get_cqe_opcode(scqe64) != MLX5_CQE_RESIZE_CQ) {
 		dcqe = mlx5_frag_buf_get_wqe(&cq->resize_buf->fbc,
 					     (i + 1) & cq->resize_buf->nent);
 		dcqe64 = dsize == 64 ? dcqe : dcqe + 64;
@@ -1365,11 +1245,10 @@
 		cqe_size = 64;
 		err = resize_kernel(dev, cq, entries, cqe_size);
 		if (!err) {
-			struct mlx5_frag_buf_ctrl *c;
+			struct mlx5_frag_buf *frag_buf = &cq->resize_buf->frag_buf;
 
-			c = &cq->resize_buf->fbc;
-			npas = c->frag_buf.npages;
-			page_shift = c->frag_buf.page_shift;
+			npas = frag_buf->npages;
+			page_shift = frag_buf->page_shift;
 		}
 	}
 
@@ -1390,8 +1269,7 @@
 		mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift,
 				     pas, 0);
 	else
-		mlx5_fill_page_frag_array(&cq->resize_buf->fbc.frag_buf,
-					  pas);
+		mlx5_fill_page_frag_array(&cq->resize_buf->frag_buf, pas);
 
 	MLX5_SET(modify_cq_in, in,
 		 modify_field_select_resize_field_select.resize_field_select.resize_field_select,
@@ -1450,16 +1328,17 @@
 	kvfree(in);
 
 ex_resize:
-	if (udata)
-		un_resize_user(cq);
-	else
-		un_resize_kernel(dev, cq);
+	ib_umem_release(cq->resize_umem);
+	if (!udata) {
+		free_cq_buf(dev, cq->resize_buf);
+		cq->resize_buf = NULL;
+	}
 ex:
 	mutex_unlock(&cq->resize_mutex);
 	return err;
 }
 
-int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq)
+int mlx5_ib_get_cqe_size(struct ib_cq *ibcq)
 {
 	struct mlx5_ib_cq *cq;
 
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index f2f11e6..d609f46 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -8,20 +8,100 @@
 #include <rdma/uverbs_types.h>
 #include <rdma/uverbs_ioctl.h>
 #include <rdma/mlx5_user_ioctl_cmds.h>
+#include <rdma/mlx5_user_ioctl_verbs.h>
 #include <rdma/ib_umem.h>
+#include <rdma/uverbs_std_types.h>
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/fs.h>
 #include "mlx5_ib.h"
+#include <linux/xarray.h>
 
 #define UVERBS_MODULE_NAME mlx5_ib
 #include <rdma/uverbs_named_ioctl.h>
 
+static void dispatch_event_fd(struct list_head *fd_list, const void *data);
+
+enum devx_obj_flags {
+	DEVX_OBJ_FLAGS_INDIRECT_MKEY = 1 << 0,
+	DEVX_OBJ_FLAGS_DCT = 1 << 1,
+	DEVX_OBJ_FLAGS_CQ = 1 << 2,
+};
+
+struct devx_async_data {
+	struct mlx5_ib_dev *mdev;
+	struct list_head list;
+	struct ib_uobject *fd_uobj;
+	struct mlx5_async_work cb_work;
+	u16 cmd_out_len;
+	/* must be last field in this structure */
+	struct mlx5_ib_uapi_devx_async_cmd_hdr hdr;
+};
+
+struct devx_async_event_data {
+	struct list_head list; /* headed in ev_file->event_list */
+	struct mlx5_ib_uapi_devx_async_event_hdr hdr;
+};
+
+/* first level XA value data structure */
+struct devx_event {
+	struct xarray object_ids; /* second XA level, Key = object id */
+	struct list_head unaffiliated_list;
+};
+
+/* second level XA value data structure */
+struct devx_obj_event {
+	struct rcu_head rcu;
+	struct list_head obj_sub_list;
+};
+
+struct devx_event_subscription {
+	struct list_head file_list; /* headed in ev_file->
+				     * subscribed_events_list
+				     */
+	struct list_head xa_list; /* headed in devx_event->unaffiliated_list or
+				   * devx_obj_event->obj_sub_list
+				   */
+	struct list_head obj_list; /* headed in devx_object */
+	struct list_head event_list; /* headed in ev_file->event_list or in
+				      * temp list via subscription
+				      */
+
+	u8 is_cleaned:1;
+	u32 xa_key_level1;
+	u32 xa_key_level2;
+	struct rcu_head	rcu;
+	u64 cookie;
+	struct devx_async_event_file *ev_file;
+	struct file *filp; /* Upon hot unplug we need direct access to the file */
+	struct eventfd_ctx *eventfd;
+};
+
+struct devx_async_event_file {
+	struct ib_uobject uobj;
+	/* Head of events that are subscribed to this FD */
+	struct list_head subscribed_events_list;
+	spinlock_t lock;
+	wait_queue_head_t poll_wait;
+	struct list_head event_list;
+	struct mlx5_ib_dev *dev;
+	u8 omit_data:1;
+	u8 is_overflow_err:1;
+	u8 is_destroyed:1;
+};
+
 #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in)
 struct devx_obj {
-	struct mlx5_core_dev	*mdev;
-	u32			obj_id;
+	struct mlx5_ib_dev	*ib_dev;
+	u64			obj_id;
 	u32			dinlen; /* destroy inbox length */
 	u32			dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW];
+	u32			flags;
+	union {
+		struct mlx5_ib_devx_mr	devx_mr;
+		struct mlx5_core_dct	core_dct;
+		struct mlx5_core_cq	core_cq;
+	};
+	struct list_head event_sub; /* holds devx_event_subscription entries */
 };
 
 struct devx_umem {
@@ -40,49 +120,52 @@
 	u32				out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
 };
 
-static struct mlx5_ib_ucontext *devx_ufile2uctx(struct ib_uverbs_file *file)
+static struct mlx5_ib_ucontext *
+devx_ufile2uctx(const struct uverbs_attr_bundle *attrs)
 {
-	return to_mucontext(ib_uverbs_get_ucontext(file));
+	return to_mucontext(ib_uverbs_get_ucontext(attrs));
 }
 
-int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context)
+int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user)
 {
 	u32 in[MLX5_ST_SZ_DW(create_uctx_in)] = {0};
 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0};
-	u64 general_obj_types;
-	void *hdr;
+	void *uctx;
 	int err;
+	u16 uid;
+	u32 cap = 0;
 
-	hdr = MLX5_ADDR_OF(create_uctx_in, in, hdr);
-
-	general_obj_types = MLX5_CAP_GEN_64(dev->mdev, general_obj_types);
-	if (!(general_obj_types & MLX5_GENERAL_OBJ_TYPES_CAP_UCTX) ||
-	    !(general_obj_types & MLX5_GENERAL_OBJ_TYPES_CAP_UMEM))
+	/* 0 means not supported */
+	if (!MLX5_CAP_GEN(dev->mdev, log_max_uctx))
 		return -EINVAL;
 
-	if (!capable(CAP_NET_RAW))
-		return -EPERM;
+	uctx = MLX5_ADDR_OF(create_uctx_in, in, uctx);
+	if (is_user && capable(CAP_NET_RAW) &&
+	    (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RAW_TX))
+		cap |= MLX5_UCTX_CAP_RAW_TX;
+	if (is_user && capable(CAP_SYS_RAWIO) &&
+	    (MLX5_CAP_GEN(dev->mdev, uctx_cap) &
+	     MLX5_UCTX_CAP_INTERNAL_DEV_RES))
+		cap |= MLX5_UCTX_CAP_INTERNAL_DEV_RES;
 
-	MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
-	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, MLX5_OBJ_TYPE_UCTX);
+	MLX5_SET(create_uctx_in, in, opcode, MLX5_CMD_OP_CREATE_UCTX);
+	MLX5_SET(uctx, uctx, cap, cap);
 
 	err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
 	if (err)
 		return err;
 
-	context->devx_uid = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
-	return 0;
+	uid = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
+	return uid;
 }
 
-void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev,
-			  struct mlx5_ib_ucontext *context)
+void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid)
 {
-	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {0};
+	u32 in[MLX5_ST_SZ_DW(destroy_uctx_in)] = {0};
 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0};
 
-	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
-	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_UCTX);
-	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, context->devx_uid);
+	MLX5_SET(destroy_uctx_in, in, opcode, MLX5_CMD_OP_DESTROY_UCTX);
+	MLX5_SET(destroy_uctx_in, in, uid, uid);
 
 	mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
 }
@@ -109,166 +192,561 @@
 	}
 }
 
-static int devx_is_valid_obj_id(struct devx_obj *obj, const void *in)
+bool mlx5_ib_devx_is_flow_counter(void *obj, u32 *counter_id)
 {
-	u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode);
-	u32 obj_id;
+	struct devx_obj *devx_obj = obj;
+	u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode);
 
-	switch (opcode) {
-	case MLX5_CMD_OP_MODIFY_GENERAL_OBJECT:
-	case MLX5_CMD_OP_QUERY_GENERAL_OBJECT:
-		obj_id = MLX5_GET(general_obj_in_cmd_hdr, in, obj_id);
-		break;
-	case MLX5_CMD_OP_QUERY_MKEY:
-		obj_id = MLX5_GET(query_mkey_in, in, mkey_index);
-		break;
-	case MLX5_CMD_OP_QUERY_CQ:
-		obj_id = MLX5_GET(query_cq_in, in, cqn);
-		break;
-	case MLX5_CMD_OP_MODIFY_CQ:
-		obj_id = MLX5_GET(modify_cq_in, in, cqn);
-		break;
-	case MLX5_CMD_OP_QUERY_SQ:
-		obj_id = MLX5_GET(query_sq_in, in, sqn);
-		break;
-	case MLX5_CMD_OP_MODIFY_SQ:
-		obj_id = MLX5_GET(modify_sq_in, in, sqn);
-		break;
-	case MLX5_CMD_OP_QUERY_RQ:
-		obj_id = MLX5_GET(query_rq_in, in, rqn);
-		break;
-	case MLX5_CMD_OP_MODIFY_RQ:
-		obj_id = MLX5_GET(modify_rq_in, in, rqn);
-		break;
-	case MLX5_CMD_OP_QUERY_RMP:
-		obj_id = MLX5_GET(query_rmp_in, in, rmpn);
-		break;
-	case MLX5_CMD_OP_MODIFY_RMP:
-		obj_id = MLX5_GET(modify_rmp_in, in, rmpn);
-		break;
-	case MLX5_CMD_OP_QUERY_RQT:
-		obj_id = MLX5_GET(query_rqt_in, in, rqtn);
-		break;
-	case MLX5_CMD_OP_MODIFY_RQT:
-		obj_id = MLX5_GET(modify_rqt_in, in, rqtn);
-		break;
-	case MLX5_CMD_OP_QUERY_TIR:
-		obj_id = MLX5_GET(query_tir_in, in, tirn);
-		break;
-	case MLX5_CMD_OP_MODIFY_TIR:
-		obj_id = MLX5_GET(modify_tir_in, in, tirn);
-		break;
-	case MLX5_CMD_OP_QUERY_TIS:
-		obj_id = MLX5_GET(query_tis_in, in, tisn);
-		break;
-	case MLX5_CMD_OP_MODIFY_TIS:
-		obj_id = MLX5_GET(modify_tis_in, in, tisn);
-		break;
-	case MLX5_CMD_OP_QUERY_FLOW_TABLE:
-		obj_id = MLX5_GET(query_flow_table_in, in, table_id);
-		break;
-	case MLX5_CMD_OP_MODIFY_FLOW_TABLE:
-		obj_id = MLX5_GET(modify_flow_table_in, in, table_id);
-		break;
-	case MLX5_CMD_OP_QUERY_FLOW_GROUP:
-		obj_id = MLX5_GET(query_flow_group_in, in, group_id);
-		break;
-	case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY:
-		obj_id = MLX5_GET(query_fte_in, in, flow_index);
-		break;
-	case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY:
-		obj_id = MLX5_GET(set_fte_in, in, flow_index);
-		break;
-	case MLX5_CMD_OP_QUERY_Q_COUNTER:
-		obj_id = MLX5_GET(query_q_counter_in, in, counter_set_id);
-		break;
-	case MLX5_CMD_OP_QUERY_FLOW_COUNTER:
-		obj_id = MLX5_GET(query_flow_counter_in, in, flow_counter_id);
-		break;
-	case MLX5_CMD_OP_QUERY_MODIFY_HEADER_CONTEXT:
-		obj_id = MLX5_GET(general_obj_in_cmd_hdr, in, obj_id);
-		break;
-	case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT:
-		obj_id = MLX5_GET(query_scheduling_element_in, in,
-				  scheduling_element_id);
-		break;
-	case MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT:
-		obj_id = MLX5_GET(modify_scheduling_element_in, in,
-				  scheduling_element_id);
-		break;
-	case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT:
-		obj_id = MLX5_GET(add_vxlan_udp_dport_in, in, vxlan_udp_port);
-		break;
-	case MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY:
-		obj_id = MLX5_GET(query_l2_table_entry_in, in, table_index);
-		break;
-	case MLX5_CMD_OP_SET_L2_TABLE_ENTRY:
-		obj_id = MLX5_GET(set_l2_table_entry_in, in, table_index);
-		break;
-	case MLX5_CMD_OP_QUERY_QP:
-		obj_id = MLX5_GET(query_qp_in, in, qpn);
-		break;
-	case MLX5_CMD_OP_RST2INIT_QP:
-		obj_id = MLX5_GET(rst2init_qp_in, in, qpn);
-		break;
-	case MLX5_CMD_OP_INIT2RTR_QP:
-		obj_id = MLX5_GET(init2rtr_qp_in, in, qpn);
-		break;
-	case MLX5_CMD_OP_RTR2RTS_QP:
-		obj_id = MLX5_GET(rtr2rts_qp_in, in, qpn);
-		break;
-	case MLX5_CMD_OP_RTS2RTS_QP:
-		obj_id = MLX5_GET(rts2rts_qp_in, in, qpn);
-		break;
-	case MLX5_CMD_OP_SQERR2RTS_QP:
-		obj_id = MLX5_GET(sqerr2rts_qp_in, in, qpn);
-		break;
-	case MLX5_CMD_OP_2ERR_QP:
-		obj_id = MLX5_GET(qp_2err_in, in, qpn);
-		break;
-	case MLX5_CMD_OP_2RST_QP:
-		obj_id = MLX5_GET(qp_2rst_in, in, qpn);
-		break;
-	case MLX5_CMD_OP_QUERY_DCT:
-		obj_id = MLX5_GET(query_dct_in, in, dctn);
-		break;
-	case MLX5_CMD_OP_QUERY_XRQ:
-		obj_id = MLX5_GET(query_xrq_in, in, xrqn);
-		break;
-	case MLX5_CMD_OP_QUERY_XRC_SRQ:
-		obj_id = MLX5_GET(query_xrc_srq_in, in, xrc_srqn);
-		break;
-	case MLX5_CMD_OP_ARM_XRC_SRQ:
-		obj_id = MLX5_GET(arm_xrc_srq_in, in, xrc_srqn);
-		break;
-	case MLX5_CMD_OP_QUERY_SRQ:
-		obj_id = MLX5_GET(query_srq_in, in, srqn);
-		break;
-	case MLX5_CMD_OP_ARM_RQ:
-		obj_id = MLX5_GET(arm_rq_in, in, srq_number);
-		break;
-	case MLX5_CMD_OP_DRAIN_DCT:
-	case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION:
-		obj_id = MLX5_GET(drain_dct_in, in, dctn);
-		break;
-	case MLX5_CMD_OP_ARM_XRQ:
-		obj_id = MLX5_GET(arm_xrq_in, in, xrqn);
-		break;
-	default:
-		return false;
-	}
-
-	if (obj_id == obj->obj_id)
+	if (opcode == MLX5_CMD_OP_DEALLOC_FLOW_COUNTER) {
+		*counter_id = MLX5_GET(dealloc_flow_counter_in,
+				       devx_obj->dinbox,
+				       flow_counter_id);
 		return true;
+	}
 
 	return false;
 }
 
-static bool devx_is_obj_create_cmd(const void *in)
+static bool is_legacy_unaffiliated_event_num(u16 event_num)
+{
+	switch (event_num) {
+	case MLX5_EVENT_TYPE_PORT_CHANGE:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool is_legacy_obj_event_num(u16 event_num)
+{
+	switch (event_num) {
+	case MLX5_EVENT_TYPE_PATH_MIG:
+	case MLX5_EVENT_TYPE_COMM_EST:
+	case MLX5_EVENT_TYPE_SQ_DRAINED:
+	case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+	case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+	case MLX5_EVENT_TYPE_CQ_ERROR:
+	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+	case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+	case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+	case MLX5_EVENT_TYPE_DCT_DRAINED:
+	case MLX5_EVENT_TYPE_COMP:
+	case MLX5_EVENT_TYPE_DCT_KEY_VIOLATION:
+	case MLX5_EVENT_TYPE_XRQ_ERROR:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static u16 get_legacy_obj_type(u16 opcode)
+{
+	switch (opcode) {
+	case MLX5_CMD_OP_CREATE_RQ:
+		return MLX5_EVENT_QUEUE_TYPE_RQ;
+	case MLX5_CMD_OP_CREATE_QP:
+		return MLX5_EVENT_QUEUE_TYPE_QP;
+	case MLX5_CMD_OP_CREATE_SQ:
+		return MLX5_EVENT_QUEUE_TYPE_SQ;
+	case MLX5_CMD_OP_CREATE_DCT:
+		return MLX5_EVENT_QUEUE_TYPE_DCT;
+	default:
+		return 0;
+	}
+}
+
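+/*
+ * Map the creator opcode encoded in obj->obj_id to the object type that is
+ * reported for an event.
+ */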
+static u16 get_dec_obj_type(struct devx_obj *obj, u16 event_num)
+{
+	u16 opcode;
+
+	opcode = (obj->obj_id >> 32) & 0xffff;
+
+	if (is_legacy_obj_event_num(event_num))
+		return get_legacy_obj_type(opcode);
+
+	switch (opcode) {
+	case MLX5_CMD_OP_CREATE_GENERAL_OBJECT:
+		return (obj->obj_id >> 48);
+	case MLX5_CMD_OP_CREATE_RQ:
+		return MLX5_OBJ_TYPE_RQ;
+	case MLX5_CMD_OP_CREATE_QP:
+		return MLX5_OBJ_TYPE_QP;
+	case MLX5_CMD_OP_CREATE_SQ:
+		return MLX5_OBJ_TYPE_SQ;
+	case MLX5_CMD_OP_CREATE_DCT:
+		return MLX5_OBJ_TYPE_DCT;
+	case MLX5_CMD_OP_CREATE_TIR:
+		return MLX5_OBJ_TYPE_TIR;
+	case MLX5_CMD_OP_CREATE_TIS:
+		return MLX5_OBJ_TYPE_TIS;
+	case MLX5_CMD_OP_CREATE_PSV:
+		return MLX5_OBJ_TYPE_PSV;
+	case MLX5_OBJ_TYPE_MKEY:
+		return MLX5_OBJ_TYPE_MKEY;
+	case MLX5_CMD_OP_CREATE_RMP:
+		return MLX5_OBJ_TYPE_RMP;
+	case MLX5_CMD_OP_CREATE_XRC_SRQ:
+		return MLX5_OBJ_TYPE_XRC_SRQ;
+	case MLX5_CMD_OP_CREATE_XRQ:
+		return MLX5_OBJ_TYPE_XRQ;
+	case MLX5_CMD_OP_CREATE_RQT:
+		return MLX5_OBJ_TYPE_RQT;
+	case MLX5_CMD_OP_ALLOC_FLOW_COUNTER:
+		return MLX5_OBJ_TYPE_FLOW_COUNTER;
+	case MLX5_CMD_OP_CREATE_CQ:
+		return MLX5_OBJ_TYPE_CQ;
+	default:
+		return 0;
+	}
+}
+
+static u16 get_event_obj_type(unsigned long event_type, struct mlx5_eqe *eqe)
+{
+	switch (event_type) {
+	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+	case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+	case MLX5_EVENT_TYPE_PATH_MIG:
+	case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+	case MLX5_EVENT_TYPE_COMM_EST:
+	case MLX5_EVENT_TYPE_SQ_DRAINED:
+	case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+	case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+		return eqe->data.qp_srq.type;
+	case MLX5_EVENT_TYPE_CQ_ERROR:
+	case MLX5_EVENT_TYPE_XRQ_ERROR:
+		return 0;
+	case MLX5_EVENT_TYPE_DCT_DRAINED:
+	case MLX5_EVENT_TYPE_DCT_KEY_VIOLATION:
+		return MLX5_EVENT_QUEUE_TYPE_DCT;
+	default:
+		return MLX5_GET(affiliated_event_header, &eqe->data, obj_type);
+	}
+}
+
+static u32 get_dec_obj_id(u64 obj_id)
+{
+	return (obj_id & 0xffffffff);
+}
+
+/*
+ * As the obj_id in the firmware is not globally unique, the object type
+ * must be considered when checking for a valid object id.
+ * For that, the opcode of the creator command is encoded in the upper
+ * bits of the obj_id.
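+ * For example, a CQ matches when
+ * obj_id == ((u64)MLX5_CMD_OP_CREATE_CQ << 32) | cqn.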
+ */
+static u64 get_enc_obj_id(u32 opcode, u32 obj_id)
+{
+	return ((u64)opcode << 32) | obj_id;
+}
+
+static u64 devx_get_obj_id(const void *in)
+{
+	u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode);
+	u64 obj_id;
+
+	switch (opcode) {
+	case MLX5_CMD_OP_MODIFY_GENERAL_OBJECT:
+	case MLX5_CMD_OP_QUERY_GENERAL_OBJECT:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_GENERAL_OBJECT |
+					MLX5_GET(general_obj_in_cmd_hdr, in,
+						 obj_type) << 16,
+					MLX5_GET(general_obj_in_cmd_hdr, in,
+						 obj_id));
+		break;
+	case MLX5_CMD_OP_QUERY_MKEY:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_MKEY,
+					MLX5_GET(query_mkey_in, in,
+						 mkey_index));
+		break;
+	case MLX5_CMD_OP_QUERY_CQ:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_CQ,
+					MLX5_GET(query_cq_in, in, cqn));
+		break;
+	case MLX5_CMD_OP_MODIFY_CQ:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_CQ,
+					MLX5_GET(modify_cq_in, in, cqn));
+		break;
+	case MLX5_CMD_OP_QUERY_SQ:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SQ,
+					MLX5_GET(query_sq_in, in, sqn));
+		break;
+	case MLX5_CMD_OP_MODIFY_SQ:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SQ,
+					MLX5_GET(modify_sq_in, in, sqn));
+		break;
+	case MLX5_CMD_OP_QUERY_RQ:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ,
+					MLX5_GET(query_rq_in, in, rqn));
+		break;
+	case MLX5_CMD_OP_MODIFY_RQ:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ,
+					MLX5_GET(modify_rq_in, in, rqn));
+		break;
+	case MLX5_CMD_OP_QUERY_RMP:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RMP,
+					MLX5_GET(query_rmp_in, in, rmpn));
+		break;
+	case MLX5_CMD_OP_MODIFY_RMP:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RMP,
+					MLX5_GET(modify_rmp_in, in, rmpn));
+		break;
+	case MLX5_CMD_OP_QUERY_RQT:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQT,
+					MLX5_GET(query_rqt_in, in, rqtn));
+		break;
+	case MLX5_CMD_OP_MODIFY_RQT:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQT,
+					MLX5_GET(modify_rqt_in, in, rqtn));
+		break;
+	case MLX5_CMD_OP_QUERY_TIR:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_TIR,
+					MLX5_GET(query_tir_in, in, tirn));
+		break;
+	case MLX5_CMD_OP_MODIFY_TIR:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_TIR,
+					MLX5_GET(modify_tir_in, in, tirn));
+		break;
+	case MLX5_CMD_OP_QUERY_TIS:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_TIS,
+					MLX5_GET(query_tis_in, in, tisn));
+		break;
+	case MLX5_CMD_OP_MODIFY_TIS:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_TIS,
+					MLX5_GET(modify_tis_in, in, tisn));
+		break;
+	case MLX5_CMD_OP_QUERY_FLOW_TABLE:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_FLOW_TABLE,
+					MLX5_GET(query_flow_table_in, in,
+						 table_id));
+		break;
+	case MLX5_CMD_OP_MODIFY_FLOW_TABLE:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_FLOW_TABLE,
+					MLX5_GET(modify_flow_table_in, in,
+						 table_id));
+		break;
+	case MLX5_CMD_OP_QUERY_FLOW_GROUP:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_FLOW_GROUP,
+					MLX5_GET(query_flow_group_in, in,
+						 group_id));
+		break;
+	case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY,
+					MLX5_GET(query_fte_in, in,
+						 flow_index));
+		break;
+	case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY,
+					MLX5_GET(set_fte_in, in, flow_index));
+		break;
+	case MLX5_CMD_OP_QUERY_Q_COUNTER:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_ALLOC_Q_COUNTER,
+					MLX5_GET(query_q_counter_in, in,
+						 counter_set_id));
+		break;
+	case MLX5_CMD_OP_QUERY_FLOW_COUNTER:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_ALLOC_FLOW_COUNTER,
+					MLX5_GET(query_flow_counter_in, in,
+						 flow_counter_id));
+		break;
+	case MLX5_CMD_OP_QUERY_MODIFY_HEADER_CONTEXT:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT,
+					MLX5_GET(general_obj_in_cmd_hdr, in,
+						 obj_id));
+		break;
+	case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT,
+					MLX5_GET(query_scheduling_element_in,
+						 in, scheduling_element_id));
+		break;
+	case MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT,
+					MLX5_GET(modify_scheduling_element_in,
+						 in, scheduling_element_id));
+		break;
+	case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT,
+					MLX5_GET(add_vxlan_udp_dport_in, in,
+						 vxlan_udp_port));
+		break;
+	case MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_SET_L2_TABLE_ENTRY,
+					MLX5_GET(query_l2_table_entry_in, in,
+						 table_index));
+		break;
+	case MLX5_CMD_OP_SET_L2_TABLE_ENTRY:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_SET_L2_TABLE_ENTRY,
+					MLX5_GET(set_l2_table_entry_in, in,
+						 table_index));
+		break;
+	case MLX5_CMD_OP_QUERY_QP:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP,
+					MLX5_GET(query_qp_in, in, qpn));
+		break;
+	case MLX5_CMD_OP_RST2INIT_QP:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP,
+					MLX5_GET(rst2init_qp_in, in, qpn));
+		break;
+	case MLX5_CMD_OP_INIT2RTR_QP:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP,
+					MLX5_GET(init2rtr_qp_in, in, qpn));
+		break;
+	case MLX5_CMD_OP_RTR2RTS_QP:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP,
+					MLX5_GET(rtr2rts_qp_in, in, qpn));
+		break;
+	case MLX5_CMD_OP_RTS2RTS_QP:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP,
+					MLX5_GET(rts2rts_qp_in, in, qpn));
+		break;
+	case MLX5_CMD_OP_SQERR2RTS_QP:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP,
+					MLX5_GET(sqerr2rts_qp_in, in, qpn));
+		break;
+	case MLX5_CMD_OP_2ERR_QP:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP,
+					MLX5_GET(qp_2err_in, in, qpn));
+		break;
+	case MLX5_CMD_OP_2RST_QP:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP,
+					MLX5_GET(qp_2rst_in, in, qpn));
+		break;
+	case MLX5_CMD_OP_QUERY_DCT:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_DCT,
+					MLX5_GET(query_dct_in, in, dctn));
+		break;
+	case MLX5_CMD_OP_QUERY_XRQ:
+	case MLX5_CMD_OP_QUERY_XRQ_DC_PARAMS_ENTRY:
+	case MLX5_CMD_OP_QUERY_XRQ_ERROR_PARAMS:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_XRQ,
+					MLX5_GET(query_xrq_in, in, xrqn));
+		break;
+	case MLX5_CMD_OP_QUERY_XRC_SRQ:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_XRC_SRQ,
+					MLX5_GET(query_xrc_srq_in, in,
+						 xrc_srqn));
+		break;
+	case MLX5_CMD_OP_ARM_XRC_SRQ:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_XRC_SRQ,
+					MLX5_GET(arm_xrc_srq_in, in, xrc_srqn));
+		break;
+	case MLX5_CMD_OP_QUERY_SRQ:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SRQ,
+					MLX5_GET(query_srq_in, in, srqn));
+		break;
+	case MLX5_CMD_OP_ARM_RQ:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ,
+					MLX5_GET(arm_rq_in, in, srq_number));
+		break;
+	case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_DCT,
+					MLX5_GET(drain_dct_in, in, dctn));
+		break;
+	case MLX5_CMD_OP_ARM_XRQ:
+	case MLX5_CMD_OP_SET_XRQ_DC_PARAMS_ENTRY:
+	case MLX5_CMD_OP_RELEASE_XRQ_ERROR:
+	case MLX5_CMD_OP_MODIFY_XRQ:
+		obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_XRQ,
+					MLX5_GET(arm_xrq_in, in, xrqn));
+		break;
+	case MLX5_CMD_OP_QUERY_PACKET_REFORMAT_CONTEXT:
+		obj_id = get_enc_obj_id
+				(MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT,
+				 MLX5_GET(query_packet_reformat_context_in,
+					  in, packet_reformat_id));
+		break;
+	default:
+		obj_id = 0;
+	}
+
+	return obj_id;
+}
+
+static bool devx_is_valid_obj_id(struct uverbs_attr_bundle *attrs,
+				 struct ib_uobject *uobj, const void *in)
+{
+	struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata);
+	u64 obj_id = devx_get_obj_id(in);
+
+	if (!obj_id)
+		return false;
+
+	switch (uobj_get_object_id(uobj)) {
+	case UVERBS_OBJECT_CQ:
+		return get_enc_obj_id(MLX5_CMD_OP_CREATE_CQ,
+				      to_mcq(uobj->object)->mcq.cqn) ==
+				      obj_id;
+
+	case UVERBS_OBJECT_SRQ:
+	{
+		struct mlx5_core_srq *srq = &(to_msrq(uobj->object)->msrq);
+		u16 opcode;
+
+		switch (srq->common.res) {
+		case MLX5_RES_XSRQ:
+			opcode = MLX5_CMD_OP_CREATE_XRC_SRQ;
+			break;
+		case MLX5_RES_XRQ:
+			opcode = MLX5_CMD_OP_CREATE_XRQ;
+			break;
+		default:
+			if (!dev->mdev->issi)
+				opcode = MLX5_CMD_OP_CREATE_SRQ;
+			else
+				opcode = MLX5_CMD_OP_CREATE_RMP;
+		}
+
+		return get_enc_obj_id(opcode,
+				      to_msrq(uobj->object)->msrq.srqn) ==
+				      obj_id;
+	}
+
+	case UVERBS_OBJECT_QP:
+	{
+		struct mlx5_ib_qp *qp = to_mqp(uobj->object);
+		enum ib_qp_type	qp_type = qp->ibqp.qp_type;
+
+		if (qp_type == IB_QPT_RAW_PACKET ||
+		    (qp->flags & MLX5_IB_QP_UNDERLAY)) {
+			struct mlx5_ib_raw_packet_qp *raw_packet_qp =
+							 &qp->raw_packet_qp;
+			struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
+			struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
+
+			return (get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ,
+					       rq->base.mqp.qpn) == obj_id ||
+				get_enc_obj_id(MLX5_CMD_OP_CREATE_SQ,
+					       sq->base.mqp.qpn) == obj_id ||
+				get_enc_obj_id(MLX5_CMD_OP_CREATE_TIR,
+					       rq->tirn) == obj_id ||
+				get_enc_obj_id(MLX5_CMD_OP_CREATE_TIS,
+					       sq->tisn) == obj_id);
+		}
+
+		if (qp_type == MLX5_IB_QPT_DCT)
+			return get_enc_obj_id(MLX5_CMD_OP_CREATE_DCT,
+					      qp->dct.mdct.mqp.qpn) == obj_id;
+
+		return get_enc_obj_id(MLX5_CMD_OP_CREATE_QP,
+				      qp->ibqp.qp_num) == obj_id;
+	}
+
+	case UVERBS_OBJECT_WQ:
+		return get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ,
+				      to_mrwq(uobj->object)->core_qp.qpn) ==
+				      obj_id;
+
+	case UVERBS_OBJECT_RWQ_IND_TBL:
+		return get_enc_obj_id(MLX5_CMD_OP_CREATE_RQT,
+				      to_mrwq_ind_table(uobj->object)->rqtn) ==
+				      obj_id;
+
+	case MLX5_IB_OBJECT_DEVX_OBJ:
+		return ((struct devx_obj *)uobj->object)->obj_id == obj_id;
+
+	default:
+		return false;
+	}
+}
+
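+/*
+ * For create/modify commands that can reference a DEVX umem, mark the umem
+ * and doorbell record fields as umem-valid.
+ */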
+static void devx_set_umem_valid(const void *in)
 {
 	u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode);
 
 	switch (opcode) {
+	case MLX5_CMD_OP_CREATE_MKEY:
+		MLX5_SET(create_mkey_in, in, mkey_umem_valid, 1);
+		break;
+	case MLX5_CMD_OP_CREATE_CQ:
+	{
+		void *cqc;
+
+		MLX5_SET(create_cq_in, in, cq_umem_valid, 1);
+		cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
+		MLX5_SET(cqc, cqc, dbr_umem_valid, 1);
+		break;
+	}
+	case MLX5_CMD_OP_CREATE_QP:
+	{
+		void *qpc;
+
+		qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+		MLX5_SET(qpc, qpc, dbr_umem_valid, 1);
+		MLX5_SET(create_qp_in, in, wq_umem_valid, 1);
+		break;
+	}
+
+	case MLX5_CMD_OP_CREATE_RQ:
+	{
+		void *rqc, *wq;
+
+		rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
+		wq  = MLX5_ADDR_OF(rqc, rqc, wq);
+		MLX5_SET(wq, wq, dbr_umem_valid, 1);
+		MLX5_SET(wq, wq, wq_umem_valid, 1);
+		break;
+	}
+
+	case MLX5_CMD_OP_CREATE_SQ:
+	{
+		void *sqc, *wq;
+
+		sqc = MLX5_ADDR_OF(create_sq_in, in, ctx);
+		wq = MLX5_ADDR_OF(sqc, sqc, wq);
+		MLX5_SET(wq, wq, dbr_umem_valid, 1);
+		MLX5_SET(wq, wq, wq_umem_valid, 1);
+		break;
+	}
+
+	case MLX5_CMD_OP_MODIFY_CQ:
+		MLX5_SET(modify_cq_in, in, cq_umem_valid, 1);
+		break;
+
+	case MLX5_CMD_OP_CREATE_RMP:
+	{
+		void *rmpc, *wq;
+
+		rmpc = MLX5_ADDR_OF(create_rmp_in, in, ctx);
+		wq = MLX5_ADDR_OF(rmpc, rmpc, wq);
+		MLX5_SET(wq, wq, dbr_umem_valid, 1);
+		MLX5_SET(wq, wq, wq_umem_valid, 1);
+		break;
+	}
+
+	case MLX5_CMD_OP_CREATE_XRQ:
+	{
+		void *xrqc, *wq;
+
+		xrqc = MLX5_ADDR_OF(create_xrq_in, in, xrq_context);
+		wq = MLX5_ADDR_OF(xrqc, xrqc, wq);
+		MLX5_SET(wq, wq, dbr_umem_valid, 1);
+		MLX5_SET(wq, wq, wq_umem_valid, 1);
+		break;
+	}
+
+	case MLX5_CMD_OP_CREATE_XRC_SRQ:
+	{
+		void *xrc_srqc;
+
+		MLX5_SET(create_xrc_srq_in, in, xrc_srq_umem_valid, 1);
+		xrc_srqc = MLX5_ADDR_OF(create_xrc_srq_in, in,
+					xrc_srq_context_entry);
+		MLX5_SET(xrc_srqc, xrc_srqc, dbr_umem_valid, 1);
+		break;
+	}
+
+	default:
+		return;
+	}
+}
+
+static bool devx_is_obj_create_cmd(const void *in, u16 *opcode)
+{
+	*opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode);
+
+	switch (*opcode) {
 	case MLX5_CMD_OP_CREATE_GENERAL_OBJECT:
 	case MLX5_CMD_OP_CREATE_MKEY:
 	case MLX5_CMD_OP_CREATE_CQ:
@@ -284,7 +762,7 @@
 	case MLX5_CMD_OP_CREATE_FLOW_TABLE:
 	case MLX5_CMD_OP_CREATE_FLOW_GROUP:
 	case MLX5_CMD_OP_ALLOC_FLOW_COUNTER:
-	case MLX5_CMD_OP_ALLOC_ENCAP_HEADER:
+	case MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT:
 	case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT:
 	case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT:
 	case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT:
@@ -304,6 +782,14 @@
 			return true;
 		return false;
 	}
+	case MLX5_CMD_OP_CREATE_PSV:
+	{
+		u8 num_psv = MLX5_GET(create_psv_in, in, num_psv);
+
+		if (num_psv == 1)
+			return true;
+		return false;
+	}
 	default:
 		return false;
 	}
@@ -335,9 +821,11 @@
 	case MLX5_CMD_OP_2RST_QP:
 	case MLX5_CMD_OP_ARM_XRC_SRQ:
 	case MLX5_CMD_OP_ARM_RQ:
-	case MLX5_CMD_OP_DRAIN_DCT:
 	case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION:
 	case MLX5_CMD_OP_ARM_XRQ:
+	case MLX5_CMD_OP_SET_XRQ_DC_PARAMS_ENTRY:
+	case MLX5_CMD_OP_RELEASE_XRQ_ERROR:
+	case MLX5_CMD_OP_MODIFY_XRQ:
 		return true;
 	case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY:
 	{
@@ -379,18 +867,65 @@
 	case MLX5_CMD_OP_QUERY_XRC_SRQ:
 	case MLX5_CMD_OP_QUERY_DCT:
 	case MLX5_CMD_OP_QUERY_XRQ:
+	case MLX5_CMD_OP_QUERY_XRQ_DC_PARAMS_ENTRY:
+	case MLX5_CMD_OP_QUERY_XRQ_ERROR_PARAMS:
+	case MLX5_CMD_OP_QUERY_PACKET_REFORMAT_CONTEXT:
 		return true;
 	default:
 		return false;
 	}
 }
 
-static bool devx_is_general_cmd(void *in)
+static bool devx_is_whitelist_cmd(void *in)
 {
 	u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode);
 
 	switch (opcode) {
 	case MLX5_CMD_OP_QUERY_HCA_CAP:
+	case MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT:
+	case MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT:
+		return true;
+	default:
+		return false;
+	}
+}
+
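+/*
+ * Whitelisted commands may fall back to the device-wide whitelist UID when
+ * the caller's context has no DEVX UID of its own.
+ */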
+static int devx_get_uid(struct mlx5_ib_ucontext *c, void *cmd_in)
+{
+	if (devx_is_whitelist_cmd(cmd_in)) {
+		struct mlx5_ib_dev *dev;
+
+		if (c->devx_uid)
+			return c->devx_uid;
+
+		dev = to_mdev(c->ibucontext.device);
+		if (dev->devx_whitelist_uid)
+			return dev->devx_whitelist_uid;
+
+		return -EOPNOTSUPP;
+	}
+
+	if (!c->devx_uid)
+		return -EINVAL;
+
+	return c->devx_uid;
+}
+
+static bool devx_is_general_cmd(void *in, struct mlx5_ib_dev *dev)
+{
+	u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode);
+
+	/* Pass all cmds for vhca_tunnel as general, tracking is done in FW */
+	if ((MLX5_CAP_GEN_64(dev->mdev, vhca_tunnel_commands) &&
+	     MLX5_GET(general_obj_in_cmd_hdr, in, vhca_tunnel_id)) ||
+	    (opcode >= MLX5_CMD_OP_GENERAL_START &&
+	     opcode < MLX5_CMD_OP_GENERAL_END))
+		return true;
+
+	switch (opcode) {
+	case MLX5_CMD_OP_QUERY_HCA_CAP:
+	case MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT:
+	case MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT:
 	case MLX5_CMD_OP_QUERY_VPORT_STATE:
 	case MLX5_CMD_OP_QUERY_ADAPTER:
 	case MLX5_CMD_OP_QUERY_ISSI:
@@ -403,6 +938,7 @@
 	case MLX5_CMD_OP_QUERY_CONG_STATUS:
 	case MLX5_CMD_OP_QUERY_CONG_PARAMS:
 	case MLX5_CMD_OP_QUERY_CONG_STATISTICS:
+	case MLX5_CMD_OP_QUERY_LAG:
 		return true;
 	default:
 		return false;
@@ -410,7 +946,7 @@
 }
 
 static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_EQN)(
-	struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+	struct uverbs_attr_bundle *attrs)
 {
 	struct mlx5_ib_ucontext *c;
 	struct mlx5_ib_dev *dev;
@@ -423,7 +959,7 @@
 			     MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC))
 		return -EFAULT;
 
-	c = devx_ufile2uctx(file);
+	c = devx_ufile2uctx(attrs);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
 	dev = to_mdev(c->ibucontext.device);
@@ -460,14 +996,14 @@
  * queue or arm its CQ for event generation), no further harm is expected.
  */
 static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_UAR)(
-	struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+	struct uverbs_attr_bundle *attrs)
 {
 	struct mlx5_ib_ucontext *c;
 	struct mlx5_ib_dev *dev;
 	u32 user_idx;
 	s32 dev_idx;
 
-	c = devx_ufile2uctx(file);
+	c = devx_ufile2uctx(attrs);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
 	dev = to_mdev(c->ibucontext.device);
@@ -488,7 +1024,7 @@
 }
 
 static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OTHER)(
-	struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+	struct uverbs_attr_bundle *attrs)
 {
 	struct mlx5_ib_ucontext *c;
 	struct mlx5_ib_dev *dev;
@@ -498,24 +1034,26 @@
 					MLX5_IB_ATTR_DEVX_OTHER_CMD_OUT);
 	void *cmd_out;
 	int err;
+	int uid;
 
-	c = devx_ufile2uctx(file);
+	c = devx_ufile2uctx(attrs);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
 	dev = to_mdev(c->ibucontext.device);
 
-	if (!c->devx_uid)
-		return -EPERM;
+	uid = devx_get_uid(c, cmd_in);
+	if (uid < 0)
+		return uid;
 
 	/* Only a whitelist of general HCA commands is allowed for this method. */
-	if (!devx_is_general_cmd(cmd_in))
+	if (!devx_is_general_cmd(cmd_in, dev))
 		return -EINVAL;
 
 	cmd_out = uverbs_zalloc(attrs, cmd_out_len);
 	if (IS_ERR(cmd_out))
 		return PTR_ERR(cmd_out);
 
-	MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid);
 	err = mlx5_cmd_exec(dev->mdev, cmd_in,
 			    uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OTHER_CMD_IN),
 			    cmd_out, cmd_out_len);
@@ -545,6 +1083,10 @@
 		MLX5_SET(general_obj_in_cmd_hdr, din, obj_type, obj_type);
 		break;
 
+	case MLX5_CMD_OP_CREATE_UMEM:
+		MLX5_SET(general_obj_in_cmd_hdr, din, opcode,
+			 MLX5_CMD_OP_DESTROY_UMEM);
+		break;
 	case MLX5_CMD_OP_CREATE_MKEY:
 		MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_MKEY);
 		break;
@@ -627,9 +1169,9 @@
 		MLX5_SET(general_obj_in_cmd_hdr, din, opcode,
 			 MLX5_CMD_OP_DEALLOC_FLOW_COUNTER);
 		break;
-	case MLX5_CMD_OP_ALLOC_ENCAP_HEADER:
+	case MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT:
 		MLX5_SET(general_obj_in_cmd_hdr, din, opcode,
-			 MLX5_CMD_OP_DEALLOC_ENCAP_HEADER);
+			 MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT);
 		break;
 	case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT:
 		MLX5_SET(general_obj_in_cmd_hdr, din, opcode,
@@ -690,6 +1232,12 @@
 	case MLX5_CMD_OP_ALLOC_XRCD:
 		MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DEALLOC_XRCD);
 		break;
+	case MLX5_CMD_OP_CREATE_PSV:
+		MLX5_SET(general_obj_in_cmd_hdr, din, opcode,
+			 MLX5_CMD_OP_DESTROY_PSV);
+		MLX5_SET(destroy_psv_in, din, psvn,
+			 MLX5_GET(create_psv_out, out, psv0_index));
+		break;
 	default:
 		/* The entry must match one of the devx_is_obj_create_cmd opcodes */
 		WARN_ON(true);
@@ -697,40 +1245,185 @@
 	}
 }
 
+static int devx_handle_mkey_indirect(struct devx_obj *obj,
+				     struct mlx5_ib_dev *dev,
+				     void *in, void *out)
+{
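+	/*
+	 * Fill the mkey from the mkc and store it in the mkey table so it can
+	 * be looked up later (e.g. by ODP page-fault handling).
+	 */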
+	struct mlx5_ib_devx_mr *devx_mr = &obj->devx_mr;
+	struct mlx5_core_mkey *mkey;
+	void *mkc;
+	u8 key;
+
+	mkey = &devx_mr->mmkey;
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	key = MLX5_GET(mkc, mkc, mkey_7_0);
+	mkey->key = mlx5_idx_to_mkey(
+			MLX5_GET(create_mkey_out, out, mkey_index)) | key;
+	mkey->type = MLX5_MKEY_INDIRECT_DEVX;
+	mkey->iova = MLX5_GET64(mkc, mkc, start_addr);
+	mkey->size = MLX5_GET64(mkc, mkc, len);
+	mkey->pd = MLX5_GET(mkc, mkc, pd);
+	devx_mr->ndescs = MLX5_GET(mkc, mkc, translations_octword_size);
+
+	return xa_err(xa_store(&dev->mdev->priv.mkey_table,
+			       mlx5_base_mkey(mkey->key), mkey, GFP_KERNEL));
+}
+
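+/*
+ * Indirect (KLM/KSM) mkeys are only flagged here and handled after the
+ * object is created; other mkeys just get mkey_umem_valid set.
+ */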
+static int devx_handle_mkey_create(struct mlx5_ib_dev *dev,
+				   struct devx_obj *obj,
+				   void *in, int in_len)
+{
+	int min_len = MLX5_BYTE_OFF(create_mkey_in, memory_key_mkey_entry) +
+			MLX5_FLD_SZ_BYTES(create_mkey_in,
+			memory_key_mkey_entry);
+	void *mkc;
+	u8 access_mode;
+
+	if (in_len < min_len)
+		return -EINVAL;
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+
+	access_mode = MLX5_GET(mkc, mkc, access_mode_1_0);
+	access_mode |= MLX5_GET(mkc, mkc, access_mode_4_2) << 2;
+
+	if (access_mode == MLX5_MKC_ACCESS_MODE_KLMS ||
+		access_mode == MLX5_MKC_ACCESS_MODE_KSM) {
+		if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+			obj->flags |= DEVX_OBJ_FLAGS_INDIRECT_MKEY;
+		return 0;
+	}
+
+	MLX5_SET(create_mkey_in, in, mkey_umem_valid, 1);
+	return 0;
+}
+
+static void devx_cleanup_subscription(struct mlx5_ib_dev *dev,
+				      struct devx_event_subscription *sub)
+{
+	struct devx_event *event;
+	struct devx_obj_event *xa_val_level2;
+
+	if (sub->is_cleaned)
+		return;
+
+	sub->is_cleaned = 1;
+	list_del_rcu(&sub->xa_list);
+
+	if (list_empty(&sub->obj_list))
+		return;
+
+	list_del_rcu(&sub->obj_list);
+	/*
+	 * If the level-2 entry for this subscription now has an empty
+	 * obj_sub_list, remove it from the level-1 event.
+	 */
+	event = xa_load(&dev->devx_event_table.event_xa,
+			sub->xa_key_level1);
+	WARN_ON(!event);
+
+	xa_val_level2 = xa_load(&event->object_ids, sub->xa_key_level2);
+	if (list_empty(&xa_val_level2->obj_sub_list)) {
+		xa_erase(&event->object_ids,
+			 sub->xa_key_level2);
+		kfree_rcu(xa_val_level2, rcu);
+	}
+}
+
 static int devx_obj_cleanup(struct ib_uobject *uobject,
-			    enum rdma_remove_reason why)
+			    enum rdma_remove_reason why,
+			    struct uverbs_attr_bundle *attrs)
 {
 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
+	struct mlx5_devx_event_table *devx_event_table;
 	struct devx_obj *obj = uobject->object;
+	struct devx_event_subscription *sub_entry, *tmp;
+	struct mlx5_ib_dev *dev;
 	int ret;
 
-	ret = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out));
+	dev = mlx5_udata_to_mdev(&attrs->driver_udata);
+	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
+		/*
+		 * The pagefault_single_data_segment() does commands against
+		 * the mmkey, we must wait for that to stop before freeing the
+		 * mkey, as another allocation could get the same mkey #.
+		 */
+		xa_erase(&obj->ib_dev->mdev->priv.mkey_table,
+			 mlx5_base_mkey(obj->devx_mr.mmkey.key));
+		synchronize_srcu(&dev->mr_srcu);
+	}
+
+	if (obj->flags & DEVX_OBJ_FLAGS_DCT)
+		ret = mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct);
+	else if (obj->flags & DEVX_OBJ_FLAGS_CQ)
+		ret = mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq);
+	else
+		ret = mlx5_cmd_exec(obj->ib_dev->mdev, obj->dinbox,
+				    obj->dinlen, out, sizeof(out));
 	if (ib_is_destroy_retryable(ret, why, uobject))
 		return ret;
 
+	devx_event_table = &dev->devx_event_table;
+
+	mutex_lock(&devx_event_table->event_xa_lock);
+	list_for_each_entry_safe(sub_entry, tmp, &obj->event_sub, obj_list)
+		devx_cleanup_subscription(dev, sub_entry);
+	mutex_unlock(&devx_event_table->event_xa_lock);
+
 	kfree(obj);
 	return ret;
 }
 
+static void devx_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
+{
+	struct devx_obj *obj = container_of(mcq, struct devx_obj, core_cq);
+	struct mlx5_devx_event_table *table;
+	struct devx_event *event;
+	struct devx_obj_event *obj_event;
+	u32 obj_id = mcq->cqn;
+
+	table = &obj->ib_dev->devx_event_table;
+	rcu_read_lock();
+	event = xa_load(&table->event_xa, MLX5_EVENT_TYPE_COMP);
+	if (!event)
+		goto out;
+
+	obj_event = xa_load(&event->object_ids, obj_id);
+	if (!obj_event)
+		goto out;
+
+	dispatch_event_fd(&obj_event->obj_sub_list, eqe);
+out:
+	rcu_read_unlock();
+}
+
 static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
-	struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+	struct uverbs_attr_bundle *attrs)
 {
 	void *cmd_in = uverbs_attr_get_alloced_ptr(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN);
 	int cmd_out_len =  uverbs_attr_get_len(attrs,
 					MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT);
+	int cmd_in_len = uverbs_attr_get_len(attrs,
+					MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN);
 	void *cmd_out;
 	struct ib_uobject *uobj = uverbs_attr_get_uobject(
 		attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_HANDLE);
-	struct mlx5_ib_ucontext *c = to_mucontext(uobj->context);
+	struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+		&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
 	struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device);
 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
 	struct devx_obj *obj;
+	u16 obj_type = 0;
 	int err;
+	int uid;
+	u32 obj_id;
+	u16 opcode;
 
-	if (!c->devx_uid)
-		return -EPERM;
+	if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id))
+		return -EINVAL;
 
-	if (!devx_is_obj_create_cmd(cmd_in))
+	uid = devx_get_uid(c, cmd_in);
+	if (uid < 0)
+		return uid;
+
+	if (!devx_is_obj_create_cmd(cmd_in, &opcode))
 		return -EINVAL;
 
 	cmd_out = uverbs_zalloc(attrs, cmd_out_len);
@@ -741,59 +1434,106 @@
 	if (!obj)
 		return -ENOMEM;
 
-	MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid);
-	err = mlx5_cmd_exec(dev->mdev, cmd_in,
-			    uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN),
-			    cmd_out, cmd_out_len);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid);
+	if (opcode == MLX5_CMD_OP_CREATE_MKEY) {
+		err = devx_handle_mkey_create(dev, obj, cmd_in, cmd_in_len);
+		if (err)
+			goto obj_free;
+	} else {
+		devx_set_umem_valid(cmd_in);
+	}
+
+	if (opcode == MLX5_CMD_OP_CREATE_DCT) {
+		obj->flags |= DEVX_OBJ_FLAGS_DCT;
+		err = mlx5_core_create_dct(dev->mdev, &obj->core_dct,
+					   cmd_in, cmd_in_len,
+					   cmd_out, cmd_out_len);
+	} else if (opcode == MLX5_CMD_OP_CREATE_CQ) {
+		obj->flags |= DEVX_OBJ_FLAGS_CQ;
+		obj->core_cq.comp = devx_cq_comp;
+		err = mlx5_core_create_cq(dev->mdev, &obj->core_cq,
+					  cmd_in, cmd_in_len, cmd_out,
+					  cmd_out_len);
+	} else {
+		err = mlx5_cmd_exec(dev->mdev, cmd_in,
+				    cmd_in_len,
+				    cmd_out, cmd_out_len);
+	}
+
 	if (err)
 		goto obj_free;
 
 	uobj->object = obj;
-	obj->mdev = dev->mdev;
-	devx_obj_build_destroy_cmd(cmd_in, cmd_out, obj->dinbox, &obj->dinlen, &obj->obj_id);
+	INIT_LIST_HEAD(&obj->event_sub);
+	obj->ib_dev = dev;
+	devx_obj_build_destroy_cmd(cmd_in, cmd_out, obj->dinbox, &obj->dinlen,
+				   &obj_id);
 	WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32));
 
 	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, cmd_out, cmd_out_len);
 	if (err)
 		goto obj_destroy;
 
+	if (opcode == MLX5_CMD_OP_CREATE_GENERAL_OBJECT)
+		obj_type = MLX5_GET(general_obj_in_cmd_hdr, cmd_in, obj_type);
+	obj->obj_id = get_enc_obj_id(opcode | obj_type << 16, obj_id);
+
+	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
+		err = devx_handle_mkey_indirect(obj, dev, cmd_in, cmd_out);
+		if (err)
+			goto obj_destroy;
+	}
 	return 0;
 
 obj_destroy:
-	mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out));
+	if (obj->flags & DEVX_OBJ_FLAGS_DCT)
+		mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct);
+	else if (obj->flags & DEVX_OBJ_FLAGS_CQ)
+		mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq);
+	else
+		mlx5_cmd_exec(obj->ib_dev->mdev, obj->dinbox, obj->dinlen, out,
+			      sizeof(out));
 obj_free:
 	kfree(obj);
 	return err;
 }
 
 static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)(
-	struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+	struct uverbs_attr_bundle *attrs)
 {
 	void *cmd_in = uverbs_attr_get_alloced_ptr(attrs, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN);
 	int cmd_out_len = uverbs_attr_get_len(attrs,
 					MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT);
 	struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs,
 							  MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE);
-	struct mlx5_ib_ucontext *c = to_mucontext(uobj->context);
-	struct devx_obj *obj = uobj->object;
+	struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+		&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
+	struct mlx5_ib_dev *mdev = to_mdev(c->ibucontext.device);
 	void *cmd_out;
 	int err;
+	int uid;
 
-	if (!c->devx_uid)
-		return -EPERM;
+	if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id))
+		return -EINVAL;
+
+	uid = devx_get_uid(c, cmd_in);
+	if (uid < 0)
+		return uid;
 
 	if (!devx_is_obj_modify_cmd(cmd_in))
 		return -EINVAL;
 
-	if (!devx_is_valid_obj_id(obj, cmd_in))
+	if (!devx_is_valid_obj_id(attrs, uobj, cmd_in))
 		return -EINVAL;
 
 	cmd_out = uverbs_zalloc(attrs, cmd_out_len);
 	if (IS_ERR(cmd_out))
 		return PTR_ERR(cmd_out);
 
-	MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid);
-	err = mlx5_cmd_exec(obj->mdev, cmd_in,
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid);
+	devx_set_umem_valid(cmd_in);
+
+	err = mlx5_cmd_exec(mdev->mdev, cmd_in,
 			    uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN),
 			    cmd_out, cmd_out_len);
 	if (err)
@@ -804,33 +1544,39 @@
 }
 
 static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)(
-	struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+	struct uverbs_attr_bundle *attrs)
 {
 	void *cmd_in = uverbs_attr_get_alloced_ptr(attrs, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN);
 	int cmd_out_len = uverbs_attr_get_len(attrs,
 					      MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT);
 	struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs,
 							  MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE);
-	struct mlx5_ib_ucontext *c = to_mucontext(uobj->context);
-	struct devx_obj *obj = uobj->object;
+	struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+		&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
 	void *cmd_out;
 	int err;
+	int uid;
+	struct mlx5_ib_dev *mdev = to_mdev(c->ibucontext.device);
 
-	if (!c->devx_uid)
-		return -EPERM;
+	if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id))
+		return -EINVAL;
+
+	uid = devx_get_uid(c, cmd_in);
+	if (uid < 0)
+		return uid;
 
 	if (!devx_is_obj_query_cmd(cmd_in))
 		return -EINVAL;
 
-	if (!devx_is_valid_obj_id(obj, cmd_in))
+	if (!devx_is_valid_obj_id(attrs, uobj, cmd_in))
 		return -EINVAL;
 
 	cmd_out = uverbs_zalloc(attrs, cmd_out_len);
 	if (IS_ERR(cmd_out))
 		return PTR_ERR(cmd_out);
 
-	MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid);
-	err = mlx5_cmd_exec(obj->mdev, cmd_in,
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid);
+	err = mlx5_cmd_exec(mdev->mdev, cmd_in,
 			    uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN),
 			    cmd_out, cmd_out_len);
 	if (err)
@@ -840,6 +1586,514 @@
 			      cmd_out, cmd_out_len);
 }
 
+struct devx_async_event_queue {
+	spinlock_t		lock;
+	wait_queue_head_t	poll_wait;
+	struct list_head	event_list;
+	atomic_t		bytes_in_use;
+	u8			is_destroyed:1;
+};
+
+struct devx_async_cmd_event_file {
+	struct ib_uobject		uobj;
+	struct devx_async_event_queue	ev_queue;
+	struct mlx5_async_ctx		async_ctx;
+};
+
+static void devx_init_event_queue(struct devx_async_event_queue *ev_queue)
+{
+	spin_lock_init(&ev_queue->lock);
+	INIT_LIST_HEAD(&ev_queue->event_list);
+	init_waitqueue_head(&ev_queue->poll_wait);
+	atomic_set(&ev_queue->bytes_in_use, 0);
+	ev_queue->is_destroyed = 0;
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct devx_async_cmd_event_file *ev_file;
+
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(
+		attrs, MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE);
+	struct mlx5_ib_dev *mdev = mlx5_udata_to_mdev(&attrs->driver_udata);
+
+	ev_file = container_of(uobj, struct devx_async_cmd_event_file,
+			       uobj);
+	devx_init_event_queue(&ev_file->ev_queue);
+	mlx5_cmd_init_async_ctx(mdev->mdev, &ev_file->async_ctx);
+	return 0;
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(
+		attrs, MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE);
+	struct devx_async_event_file *ev_file;
+	struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+		&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
+	struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device);
+	u32 flags;
+	int err;
+
+	err = uverbs_get_flags32(&flags, attrs,
+		MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS,
+		MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA);
+
+	if (err)
+		return err;
+
+	ev_file = container_of(uobj, struct devx_async_event_file,
+			       uobj);
+	spin_lock_init(&ev_file->lock);
+	INIT_LIST_HEAD(&ev_file->event_list);
+	init_waitqueue_head(&ev_file->poll_wait);
+	if (flags & MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA)
+		ev_file->omit_data = 1;
+	INIT_LIST_HEAD(&ev_file->subscribed_events_list);
+	ev_file->dev = dev;
+	get_device(&dev->ib_dev.dev);
+	return 0;
+}
+
+static void devx_query_callback(int status, struct mlx5_async_work *context)
+{
+	struct devx_async_data *async_data =
+		container_of(context, struct devx_async_data, cb_work);
+	struct ib_uobject *fd_uobj = async_data->fd_uobj;
+	struct devx_async_cmd_event_file *ev_file;
+	struct devx_async_event_queue *ev_queue;
+	unsigned long flags;
+
+	ev_file = container_of(fd_uobj, struct devx_async_cmd_event_file,
+			       uobj);
+	ev_queue = &ev_file->ev_queue;
+
+	spin_lock_irqsave(&ev_queue->lock, flags);
+	list_add_tail(&async_data->list, &ev_queue->event_list);
+	spin_unlock_irqrestore(&ev_queue->lock, flags);
+
+	wake_up_interruptible(&ev_queue->poll_wait);
+	fput(fd_uobj->object);
+}
+
+#define MAX_ASYNC_BYTES_IN_USE (1024 * 1024) /* 1MB */
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)(
+	struct uverbs_attr_bundle *attrs)
+{
+	void *cmd_in = uverbs_attr_get_alloced_ptr(attrs,
+				MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN);
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(
+				attrs,
+				MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_HANDLE);
+	u16 cmd_out_len;
+	struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+		&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
+	struct ib_uobject *fd_uobj;
+	int err;
+	int uid;
+	struct mlx5_ib_dev *mdev = to_mdev(c->ibucontext.device);
+	struct devx_async_cmd_event_file *ev_file;
+	struct devx_async_data *async_data;
+
+	if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id))
+		return -EINVAL;
+
+	uid = devx_get_uid(c, cmd_in);
+	if (uid < 0)
+		return uid;
+
+	if (!devx_is_obj_query_cmd(cmd_in))
+		return -EINVAL;
+
+	err = uverbs_get_const(&cmd_out_len, attrs,
+			       MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN);
+	if (err)
+		return err;
+
+	if (!devx_is_valid_obj_id(attrs, uobj, cmd_in))
+		return -EINVAL;
+
+	fd_uobj = uverbs_attr_get_uobject(attrs,
+				MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD);
+	if (IS_ERR(fd_uobj))
+		return PTR_ERR(fd_uobj);
+
+	ev_file = container_of(fd_uobj, struct devx_async_cmd_event_file,
+			       uobj);
+
+	if (atomic_add_return(cmd_out_len, &ev_file->ev_queue.bytes_in_use) >
+			MAX_ASYNC_BYTES_IN_USE) {
+		atomic_sub(cmd_out_len, &ev_file->ev_queue.bytes_in_use);
+		return -EAGAIN;
+	}
+
+	async_data = kvzalloc(struct_size(async_data, hdr.out_data,
+					  cmd_out_len), GFP_KERNEL);
+	if (!async_data) {
+		err = -ENOMEM;
+		goto sub_bytes;
+	}
+
+	err = uverbs_copy_from(&async_data->hdr.wr_id, attrs,
+			       MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID);
+	if (err)
+		goto free_async;
+
+	async_data->cmd_out_len = cmd_out_len;
+	async_data->mdev = mdev;
+	async_data->fd_uobj = fd_uobj;
+
+	get_file(fd_uobj->object);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid);
+	err = mlx5_cmd_exec_cb(&ev_file->async_ctx, cmd_in,
+		    uverbs_attr_get_len(attrs,
+				MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN),
+		    async_data->hdr.out_data,
+		    async_data->cmd_out_len,
+		    devx_query_callback, &async_data->cb_work);
+
+	if (err)
+		goto cb_err;
+
+	return 0;
+
+cb_err:
+	fput(fd_uobj->object);
+free_async:
+	kvfree(async_data);
+sub_bytes:
+	atomic_sub(cmd_out_len, &ev_file->ev_queue.bytes_in_use);
+	return err;
+}
+
+static void
+subscribe_event_xa_dealloc(struct mlx5_devx_event_table *devx_event_table,
+			   u32 key_level1,
+			   bool is_level2,
+			   u32 key_level2)
+{
+	struct devx_event *event;
+	struct devx_obj_event *xa_val_level2;
+
+	/* Level 1 is valid for future use, no need to free */
+	if (!is_level2)
+		return;
+
+	event = xa_load(&devx_event_table->event_xa, key_level1);
+	WARN_ON(!event);
+
+	xa_val_level2 = xa_load(&event->object_ids,
+				key_level2);
+	if (list_empty(&xa_val_level2->obj_sub_list)) {
+		xa_erase(&event->object_ids,
+			 key_level2);
+		kfree_rcu(xa_val_level2, rcu);
+	}
+}
+
+static int
+subscribe_event_xa_alloc(struct mlx5_devx_event_table *devx_event_table,
+			 u32 key_level1,
+			 bool is_level2,
+			 u32 key_level2)
+{
+	struct devx_obj_event *obj_event;
+	struct devx_event *event;
+	int err;
+
+	event = xa_load(&devx_event_table->event_xa, key_level1);
+	if (!event) {
+		event = kzalloc(sizeof(*event), GFP_KERNEL);
+		if (!event)
+			return -ENOMEM;
+
+		INIT_LIST_HEAD(&event->unaffiliated_list);
+		xa_init(&event->object_ids);
+
+		err = xa_insert(&devx_event_table->event_xa,
+				key_level1,
+				event,
+				GFP_KERNEL);
+		if (err) {
+			kfree(event);
+			return err;
+		}
+	}
+
+	if (!is_level2)
+		return 0;
+
+	obj_event = xa_load(&event->object_ids, key_level2);
+	if (!obj_event) {
+		obj_event = kzalloc(sizeof(*obj_event), GFP_KERNEL);
+		if (!obj_event)
+			/* Level 1 is valid for future use, no need to free */
+			return -ENOMEM;
+
+		err = xa_insert(&event->object_ids,
+				key_level2,
+				obj_event,
+				GFP_KERNEL);
+		if (err)
+			return err;
+		INIT_LIST_HEAD(&obj_event->obj_sub_list);
+	}
+
+	return 0;
+}
+
+static bool is_valid_events_legacy(int num_events, u16 *event_type_num_list,
+				   struct devx_obj *obj)
+{
+	int i;
+
+	for (i = 0; i < num_events; i++) {
+		if (obj) {
+			if (!is_legacy_obj_event_num(event_type_num_list[i]))
+				return false;
+		} else if (!is_legacy_unaffiliated_event_num(
+				event_type_num_list[i])) {
+			return false;
+		}
+	}
+
+	return true;
+}
+
+#define MAX_SUPP_EVENT_NUM 255
+static bool is_valid_events(struct mlx5_core_dev *dev,
+			    int num_events, u16 *event_type_num_list,
+			    struct devx_obj *obj)
+{
+	__be64 *aff_events;
+	__be64 *unaff_events;
+	int mask_entry;
+	int mask_bit;
+	int i;
+
+	if (MLX5_CAP_GEN(dev, event_cap)) {
+		aff_events = MLX5_CAP_DEV_EVENT(dev,
+						user_affiliated_events);
+		unaff_events = MLX5_CAP_DEV_EVENT(dev,
+						  user_unaffiliated_events);
+	} else {
+		return is_valid_events_legacy(num_events, event_type_num_list,
+					      obj);
+	}
+
+	for (i = 0; i < num_events; i++) {
+		if (event_type_num_list[i] > MAX_SUPP_EVENT_NUM)
+			return false;
+
+		mask_entry = event_type_num_list[i] / 64;
+		mask_bit = event_type_num_list[i] % 64;
+
+		if (obj) {
+			/* CQ completion */
+			if (event_type_num_list[i] == 0)
+				continue;
+
+			if (!(be64_to_cpu(aff_events[mask_entry]) &
+					(1ull << mask_bit)))
+				return false;
+
+			continue;
+		}
+
+		if (!(be64_to_cpu(unaff_events[mask_entry]) &
+				(1ull << mask_bit)))
+			return false;
+	}
+
+	return true;
+}
+
+#define MAX_NUM_EVENTS 16
+static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *devx_uobj = uverbs_attr_get_uobject(
+				attrs,
+				MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE);
+	struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+		&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
+	struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device);
+	struct ib_uobject *fd_uobj;
+	struct devx_obj *obj = NULL;
+	struct devx_async_event_file *ev_file;
+	struct mlx5_devx_event_table *devx_event_table = &dev->devx_event_table;
+	u16 *event_type_num_list;
+	struct devx_event_subscription *event_sub, *tmp_sub;
+	struct list_head sub_list;
+	int redirect_fd;
+	bool use_eventfd = false;
+	int num_events;
+	int num_alloc_xa_entries = 0;
+	u16 obj_type = 0;
+	u64 cookie = 0;
+	u32 obj_id = 0;
+	int err;
+	int i;
+
+	if (!c->devx_uid)
+		return -EINVAL;
+
+	if (!IS_ERR(devx_uobj)) {
+		obj = (struct devx_obj *)devx_uobj->object;
+		if (obj)
+			obj_id = get_dec_obj_id(obj->obj_id);
+	}
+
+	fd_uobj = uverbs_attr_get_uobject(attrs,
+				MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE);
+	if (IS_ERR(fd_uobj))
+		return PTR_ERR(fd_uobj);
+
+	ev_file = container_of(fd_uobj, struct devx_async_event_file,
+			       uobj);
+
+	if (uverbs_attr_is_valid(attrs,
+				 MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM)) {
+		err = uverbs_copy_from(&redirect_fd, attrs,
+			       MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM);
+		if (err)
+			return err;
+
+		use_eventfd = true;
+	}
+
+	if (uverbs_attr_is_valid(attrs,
+				 MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE)) {
+		if (use_eventfd)
+			return -EINVAL;
+
+		err = uverbs_copy_from(&cookie, attrs,
+				MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE);
+		if (err)
+			return err;
+	}
+
+	num_events = uverbs_attr_ptr_get_array_size(
+		attrs, MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST,
+		sizeof(u16));
+
+	if (num_events < 0)
+		return num_events;
+
+	if (num_events > MAX_NUM_EVENTS)
+		return -EINVAL;
+
+	event_type_num_list = uverbs_attr_get_alloced_ptr(attrs,
+			MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST);
+
+	if (!is_valid_events(dev->mdev, num_events, event_type_num_list, obj))
+		return -EINVAL;
+
+	INIT_LIST_HEAD(&sub_list);
+
+	/* Protect against concurrent subscriptions to the same XA entries so
+	 * that all of them can succeed
+	 */
+	mutex_lock(&devx_event_table->event_xa_lock);
+	for (i = 0; i < num_events; i++) {
+		u32 key_level1;
+
+		if (obj)
+			obj_type = get_dec_obj_type(obj,
+						    event_type_num_list[i]);
+		key_level1 = event_type_num_list[i] | obj_type << 16;
+
+		err = subscribe_event_xa_alloc(devx_event_table,
+					       key_level1,
+					       obj,
+					       obj_id);
+		if (err)
+			goto err;
+
+		num_alloc_xa_entries++;
+		event_sub = kzalloc(sizeof(*event_sub), GFP_KERNEL);
+		if (!event_sub)
+			goto err;
+
+		list_add_tail(&event_sub->event_list, &sub_list);
+		if (use_eventfd) {
+			event_sub->eventfd =
+				eventfd_ctx_fdget(redirect_fd);
+
+			if (IS_ERR(event_sub->eventfd)) {
+				err = PTR_ERR(event_sub->eventfd);
+				event_sub->eventfd = NULL;
+				goto err;
+			}
+		}
+
+		event_sub->cookie = cookie;
+		event_sub->ev_file = ev_file;
+		event_sub->filp = fd_uobj->object;
+		/* May be needed upon cleanup of the devx object/subscription */
+		event_sub->xa_key_level1 = key_level1;
+		event_sub->xa_key_level2 = obj_id;
+		INIT_LIST_HEAD(&event_sub->obj_list);
+	}
+
+	/* Once all the allocations and the XA data insertions are done, we
+	 * can go ahead and add all the subscriptions to the relevant lists
+	 * without risk of failure.
+	 */
+	list_for_each_entry_safe(event_sub, tmp_sub, &sub_list, event_list) {
+		struct devx_event *event;
+		struct devx_obj_event *obj_event;
+
+		list_del_init(&event_sub->event_list);
+
+		spin_lock_irq(&ev_file->lock);
+		list_add_tail_rcu(&event_sub->file_list,
+				  &ev_file->subscribed_events_list);
+		spin_unlock_irq(&ev_file->lock);
+
+		event = xa_load(&devx_event_table->event_xa,
+				event_sub->xa_key_level1);
+		WARN_ON(!event);
+
+		if (!obj) {
+			list_add_tail_rcu(&event_sub->xa_list,
+					  &event->unaffiliated_list);
+			continue;
+		}
+
+		obj_event = xa_load(&event->object_ids, obj_id);
+		WARN_ON(!obj_event);
+		list_add_tail_rcu(&event_sub->xa_list,
+				  &obj_event->obj_sub_list);
+		list_add_tail_rcu(&event_sub->obj_list,
+				  &obj->event_sub);
+	}
+
+	mutex_unlock(&devx_event_table->event_xa_lock);
+	return 0;
+
+err:
+	list_for_each_entry_safe(event_sub, tmp_sub, &sub_list, event_list) {
+		list_del(&event_sub->event_list);
+
+		subscribe_event_xa_dealloc(devx_event_table,
+					   event_sub->xa_key_level1,
+					   obj,
+					   obj_id);
+
+		if (event_sub->eventfd)
+			eventfd_ctx_put(event_sub->eventfd);
+
+		kfree(event_sub);
+	}
+
+	mutex_unlock(&devx_event_table->event_xa_lock);
+	return err;
+}
+
 static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
 			 struct uverbs_attr_bundle *attrs,
 			 struct devx_umem *obj)
@@ -857,7 +2111,9 @@
 
 	err = uverbs_get_flags32(&access, attrs,
 				 MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS,
-				 IB_ACCESS_SUPPORTED);
+				 IB_ACCESS_LOCAL_WRITE |
+				 IB_ACCESS_REMOTE_WRITE |
+				 IB_ACCESS_REMOTE_READ);
 	if (err)
 		return err;
 
@@ -865,7 +2121,7 @@
 	if (err)
 		return err;
 
-	obj->umem = ib_umem_get(ucontext, addr, size, access, 0);
+	obj->umem = ib_umem_get(&attrs->driver_udata, addr, size, access, 0);
 	if (IS_ERR(obj->umem))
 		return PTR_ERR(obj->umem);
 
@@ -904,8 +2160,7 @@
 	umem = MLX5_ADDR_OF(create_umem_in, cmd->in, umem);
 	mtt = (__be64 *)MLX5_ADDR_OF(umem, umem, mtt);
 
-	MLX5_SET(general_obj_in_cmd_hdr, cmd->in, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
-	MLX5_SET(general_obj_in_cmd_hdr, cmd->in, obj_type, MLX5_OBJ_TYPE_UMEM);
+	MLX5_SET(create_umem_in, cmd->in, opcode, MLX5_CMD_OP_CREATE_UMEM);
 	MLX5_SET64(umem, umem, num_of_mtt, obj->ncont);
 	MLX5_SET(umem, umem, log_page_size, obj->page_shift -
 					    MLX5_ADAPTER_PAGE_SHIFT);
@@ -916,19 +2171,20 @@
 }
 
 static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)(
-	struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+	struct uverbs_attr_bundle *attrs)
 {
 	struct devx_umem_reg_cmd cmd;
 	struct devx_umem *obj;
 	struct ib_uobject *uobj = uverbs_attr_get_uobject(
 		attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE);
 	u32 obj_id;
-	struct mlx5_ib_ucontext *c = to_mucontext(uobj->context);
+	struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+		&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
 	struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device);
 	int err;
 
 	if (!c->devx_uid)
-		return -EPERM;
+		return -EINVAL;
 
 	obj = kzalloc(sizeof(struct devx_umem), GFP_KERNEL);
 	if (!obj)
@@ -944,7 +2200,7 @@
 
 	devx_umem_reg_cmd_build(dev, obj, &cmd);
 
-	MLX5_SET(general_obj_in_cmd_hdr, cmd.in, uid, c->devx_uid);
+	MLX5_SET(create_umem_in, cmd.in, uid, c->devx_uid);
 	err = mlx5_cmd_exec(dev->mdev, cmd.in, cmd.inlen, cmd.out,
 			    sizeof(cmd.out));
 	if (err)
@@ -969,7 +2225,8 @@
 }
 
 static int devx_umem_cleanup(struct ib_uobject *uobject,
-			     enum rdma_remove_reason why)
+			     enum rdma_remove_reason why,
+			     struct uverbs_attr_bundle *attrs)
 {
 	struct devx_umem *obj = uobject->object;
 	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
@@ -984,6 +2241,483 @@
 	return 0;
 }
 
+static bool is_unaffiliated_event(struct mlx5_core_dev *dev,
+				  unsigned long event_type)
+{
+	__be64 *unaff_events;
+	int mask_entry;
+	int mask_bit;
+
+	if (!MLX5_CAP_GEN(dev, event_cap))
+		return is_legacy_unaffiliated_event_num(event_type);
+
+	unaff_events = MLX5_CAP_DEV_EVENT(dev,
+					  user_unaffiliated_events);
+	WARN_ON(event_type > MAX_SUPP_EVENT_NUM);
+
+	mask_entry = event_type / 64;
+	mask_bit = event_type % 64;
+
+	if (!(be64_to_cpu(unaff_events[mask_entry]) & (1ull << mask_bit)))
+		return false;
+
+	return true;
+}
+
+static u32 devx_get_obj_id_from_event(unsigned long event_type, void *data)
+{
+	struct mlx5_eqe *eqe = data;
+	u32 obj_id = 0;
+
+	switch (event_type) {
+	case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+	case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+	case MLX5_EVENT_TYPE_PATH_MIG:
+	case MLX5_EVENT_TYPE_COMM_EST:
+	case MLX5_EVENT_TYPE_SQ_DRAINED:
+	case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+	case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+		obj_id = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
+		break;
+	case MLX5_EVENT_TYPE_XRQ_ERROR:
+		obj_id = be32_to_cpu(eqe->data.xrq_err.type_xrqn) & 0xffffff;
+		break;
+	case MLX5_EVENT_TYPE_DCT_DRAINED:
+	case MLX5_EVENT_TYPE_DCT_KEY_VIOLATION:
+		obj_id = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff;
+		break;
+	case MLX5_EVENT_TYPE_CQ_ERROR:
+		obj_id = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff;
+		break;
+	default:
+		obj_id = MLX5_GET(affiliated_event_header, &eqe->data, obj_id);
+		break;
+	}
+
+	return obj_id;
+}
+
+static int deliver_event(struct devx_event_subscription *event_sub,
+			 const void *data)
+{
+	struct devx_async_event_file *ev_file;
+	struct devx_async_event_data *event_data;
+	unsigned long flags;
+
+	ev_file = event_sub->ev_file;
+
+	if (ev_file->omit_data) {
+		spin_lock_irqsave(&ev_file->lock, flags);
+		if (!list_empty(&event_sub->event_list)) {
+			spin_unlock_irqrestore(&ev_file->lock, flags);
+			return 0;
+		}
+
+		list_add_tail(&event_sub->event_list, &ev_file->event_list);
+		spin_unlock_irqrestore(&ev_file->lock, flags);
+		wake_up_interruptible(&ev_file->poll_wait);
+		return 0;
+	}
+
+	event_data = kzalloc(sizeof(*event_data) + sizeof(struct mlx5_eqe),
+			     GFP_ATOMIC);
+	if (!event_data) {
+		spin_lock_irqsave(&ev_file->lock, flags);
+		ev_file->is_overflow_err = 1;
+		spin_unlock_irqrestore(&ev_file->lock, flags);
+		return -ENOMEM;
+	}
+
+	event_data->hdr.cookie = event_sub->cookie;
+	memcpy(event_data->hdr.out_data, data, sizeof(struct mlx5_eqe));
+
+	spin_lock_irqsave(&ev_file->lock, flags);
+	list_add_tail(&event_data->list, &ev_file->event_list);
+	spin_unlock_irqrestore(&ev_file->lock, flags);
+	wake_up_interruptible(&ev_file->poll_wait);
+
+	return 0;
+}
+
+static void dispatch_event_fd(struct list_head *fd_list,
+			      const void *data)
+{
+	struct devx_event_subscription *item;
+
+	list_for_each_entry_rcu(item, fd_list, xa_list) {
+		if (!get_file_rcu(item->filp))
+			continue;
+
+		if (item->eventfd) {
+			eventfd_signal(item->eventfd, 1);
+			fput(item->filp);
+			continue;
+		}
+
+		deliver_event(item, data);
+		fput(item->filp);
+	}
+}
+
+static int devx_event_notifier(struct notifier_block *nb,
+			       unsigned long event_type, void *data)
+{
+	struct mlx5_devx_event_table *table;
+	struct mlx5_ib_dev *dev;
+	struct devx_event *event;
+	struct devx_obj_event *obj_event;
+	u16 obj_type = 0;
+	bool is_unaffiliated;
+	u32 obj_id;
+
+	/* Explicit filtering to kernel events which may occur frequently */
+	if (event_type == MLX5_EVENT_TYPE_CMD ||
+	    event_type == MLX5_EVENT_TYPE_PAGE_REQUEST)
+		return NOTIFY_OK;
+
+	table = container_of(nb, struct mlx5_devx_event_table, devx_nb.nb);
+	dev = container_of(table, struct mlx5_ib_dev, devx_event_table);
+	is_unaffiliated = is_unaffiliated_event(dev->mdev, event_type);
+
+	if (!is_unaffiliated)
+		obj_type = get_event_obj_type(event_type, data);
+
+	rcu_read_lock();
+	event = xa_load(&table->event_xa, event_type | (obj_type << 16));
+	if (!event) {
+		rcu_read_unlock();
+		return NOTIFY_DONE;
+	}
+
+	if (is_unaffiliated) {
+		dispatch_event_fd(&event->unaffiliated_list, data);
+		rcu_read_unlock();
+		return NOTIFY_OK;
+	}
+
+	obj_id = devx_get_obj_id_from_event(event_type, data);
+	obj_event = xa_load(&event->object_ids, obj_id);
+	if (!obj_event) {
+		rcu_read_unlock();
+		return NOTIFY_DONE;
+	}
+
+	dispatch_event_fd(&obj_event->obj_sub_list, data);
+
+	rcu_read_unlock();
+	return NOTIFY_OK;
+}
+
+void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_devx_event_table *table = &dev->devx_event_table;
+
+	xa_init(&table->event_xa);
+	mutex_init(&table->event_xa_lock);
+	MLX5_NB_INIT(&table->devx_nb, devx_event_notifier, NOTIFY_ANY);
+	mlx5_eq_notifier_register(dev->mdev, &table->devx_nb);
+}
+
+void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_devx_event_table *table = &dev->devx_event_table;
+	struct devx_event_subscription *sub, *tmp;
+	struct devx_event *event;
+	void *entry;
+	unsigned long id;
+
+	mlx5_eq_notifier_unregister(dev->mdev, &table->devx_nb);
+	mutex_lock(&dev->devx_event_table.event_xa_lock);
+	xa_for_each(&table->event_xa, id, entry) {
+		event = entry;
+		list_for_each_entry_safe(sub, tmp, &event->unaffiliated_list,
+					 xa_list)
+			devx_cleanup_subscription(dev, sub);
+		kfree(entry);
+	}
+	mutex_unlock(&dev->devx_event_table.event_xa_lock);
+	xa_destroy(&table->event_xa);
+}
+
+static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf,
+					 size_t count, loff_t *pos)
+{
+	struct devx_async_cmd_event_file *comp_ev_file = filp->private_data;
+	struct devx_async_event_queue *ev_queue = &comp_ev_file->ev_queue;
+	struct devx_async_data *event;
+	int ret = 0;
+	size_t eventsz;
+
+	spin_lock_irq(&ev_queue->lock);
+
+	while (list_empty(&ev_queue->event_list)) {
+		spin_unlock_irq(&ev_queue->lock);
+
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(
+			    ev_queue->poll_wait,
+			    (!list_empty(&ev_queue->event_list) ||
+			     ev_queue->is_destroyed))) {
+			return -ERESTARTSYS;
+		}
+
+		if (list_empty(&ev_queue->event_list) &&
+		    ev_queue->is_destroyed)
+			return -EIO;
+
+		spin_lock_irq(&ev_queue->lock);
+	}
+
+	event = list_entry(ev_queue->event_list.next,
+			   struct devx_async_data, list);
+	eventsz = event->cmd_out_len +
+			sizeof(struct mlx5_ib_uapi_devx_async_cmd_hdr);
+
+	if (eventsz > count) {
+		spin_unlock_irq(&ev_queue->lock);
+		return -ENOSPC;
+	}
+
+	list_del(ev_queue->event_list.next);
+	spin_unlock_irq(&ev_queue->lock);
+
+	if (copy_to_user(buf, &event->hdr, eventsz))
+		ret = -EFAULT;
+	else
+		ret = eventsz;
+
+	atomic_sub(event->cmd_out_len, &ev_queue->bytes_in_use);
+	kvfree(event);
+	return ret;
+}
+
+static int devx_async_cmd_event_close(struct inode *inode, struct file *filp)
+{
+	struct ib_uobject *uobj = filp->private_data;
+	struct devx_async_cmd_event_file *comp_ev_file = container_of(
+		uobj, struct devx_async_cmd_event_file, uobj);
+	struct devx_async_data *entry, *tmp;
+
+	spin_lock_irq(&comp_ev_file->ev_queue.lock);
+	list_for_each_entry_safe(entry, tmp,
+				 &comp_ev_file->ev_queue.event_list, list)
+		kvfree(entry);
+	spin_unlock_irq(&comp_ev_file->ev_queue.lock);
+
+	uverbs_close_fd(filp);
+	return 0;
+}
+
+static __poll_t devx_async_cmd_event_poll(struct file *filp,
+					      struct poll_table_struct *wait)
+{
+	struct devx_async_cmd_event_file *comp_ev_file = filp->private_data;
+	struct devx_async_event_queue *ev_queue = &comp_ev_file->ev_queue;
+	__poll_t pollflags = 0;
+
+	poll_wait(filp, &ev_queue->poll_wait, wait);
+
+	spin_lock_irq(&ev_queue->lock);
+	if (ev_queue->is_destroyed)
+		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+	else if (!list_empty(&ev_queue->event_list))
+		pollflags = EPOLLIN | EPOLLRDNORM;
+	spin_unlock_irq(&ev_queue->lock);
+
+	return pollflags;
+}
+
+static const struct file_operations devx_async_cmd_event_fops = {
+	.owner	 = THIS_MODULE,
+	.read	 = devx_async_cmd_event_read,
+	.poll    = devx_async_cmd_event_poll,
+	.release = devx_async_cmd_event_close,
+	.llseek	 = no_llseek,
+};
+
+static ssize_t devx_async_event_read(struct file *filp, char __user *buf,
+				     size_t count, loff_t *pos)
+{
+	struct devx_async_event_file *ev_file = filp->private_data;
+	struct devx_event_subscription *event_sub;
+	struct devx_async_event_data *uninitialized_var(event);
+	int ret = 0;
+	size_t eventsz;
+	bool omit_data;
+	void *event_data;
+
+	omit_data = ev_file->omit_data;
+
+	spin_lock_irq(&ev_file->lock);
+
+	if (ev_file->is_overflow_err) {
+		ev_file->is_overflow_err = 0;
+		spin_unlock_irq(&ev_file->lock);
+		return -EOVERFLOW;
+	}
+
+	if (ev_file->is_destroyed) {
+		spin_unlock_irq(&ev_file->lock);
+		return -EIO;
+	}
+
+	while (list_empty(&ev_file->event_list)) {
+		spin_unlock_irq(&ev_file->lock);
+
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(ev_file->poll_wait,
+			    (!list_empty(&ev_file->event_list) ||
+			     ev_file->is_destroyed))) {
+			return -ERESTARTSYS;
+		}
+
+		spin_lock_irq(&ev_file->lock);
+		if (ev_file->is_destroyed) {
+			spin_unlock_irq(&ev_file->lock);
+			return -EIO;
+		}
+	}
+
+	if (omit_data) {
+		event_sub = list_first_entry(&ev_file->event_list,
+					struct devx_event_subscription,
+					event_list);
+		eventsz = sizeof(event_sub->cookie);
+		event_data = &event_sub->cookie;
+	} else {
+		event = list_first_entry(&ev_file->event_list,
+				      struct devx_async_event_data, list);
+		eventsz = sizeof(struct mlx5_eqe) +
+			sizeof(struct mlx5_ib_uapi_devx_async_event_hdr);
+		event_data = &event->hdr;
+	}
+
+	if (eventsz > count) {
+		spin_unlock_irq(&ev_file->lock);
+		return -EINVAL;
+	}
+
+	if (omit_data)
+		list_del_init(&event_sub->event_list);
+	else
+		list_del(&event->list);
+
+	spin_unlock_irq(&ev_file->lock);
+
+	if (copy_to_user(buf, event_data, eventsz))
+		/* This points to an application issue, not a kernel concern */
+		ret = -EFAULT;
+	else
+		ret = eventsz;
+
+	if (!omit_data)
+		kfree(event);
+	return ret;
+}
+
+static __poll_t devx_async_event_poll(struct file *filp,
+				      struct poll_table_struct *wait)
+{
+	struct devx_async_event_file *ev_file = filp->private_data;
+	__poll_t pollflags = 0;
+
+	poll_wait(filp, &ev_file->poll_wait, wait);
+
+	spin_lock_irq(&ev_file->lock);
+	if (ev_file->is_destroyed)
+		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+	else if (!list_empty(&ev_file->event_list))
+		pollflags = EPOLLIN | EPOLLRDNORM;
+	spin_unlock_irq(&ev_file->lock);
+
+	return pollflags;
+}
+
+static int devx_async_event_close(struct inode *inode, struct file *filp)
+{
+	struct devx_async_event_file *ev_file = filp->private_data;
+	struct devx_event_subscription *event_sub, *event_sub_tmp;
+	struct devx_async_event_data *entry, *tmp;
+	struct mlx5_ib_dev *dev = ev_file->dev;
+
+	mutex_lock(&dev->devx_event_table.event_xa_lock);
+	/* delete the subscriptions which are related to this FD */
+	list_for_each_entry_safe(event_sub, event_sub_tmp,
+				 &ev_file->subscribed_events_list, file_list) {
+		devx_cleanup_subscription(dev, event_sub);
+		if (event_sub->eventfd)
+			eventfd_ctx_put(event_sub->eventfd);
+
+		list_del_rcu(&event_sub->file_list);
+		/* subscription may not be used by the read API any more */
+		kfree_rcu(event_sub, rcu);
+	}
+
+	mutex_unlock(&dev->devx_event_table.event_xa_lock);
+
+	/* free the pending events allocation */
+	if (!ev_file->omit_data) {
+		spin_lock_irq(&ev_file->lock);
+		list_for_each_entry_safe(entry, tmp,
+					 &ev_file->event_list, list)
+			kfree(entry); /* read can't come any more */
+		spin_unlock_irq(&ev_file->lock);
+	}
+
+	uverbs_close_fd(filp);
+	put_device(&dev->ib_dev.dev);
+	return 0;
+}
+
+static const struct file_operations devx_async_event_fops = {
+	.owner	 = THIS_MODULE,
+	.read	 = devx_async_event_read,
+	.poll    = devx_async_event_poll,
+	.release = devx_async_event_close,
+	.llseek	 = no_llseek,
+};
+
+static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
+						   enum rdma_remove_reason why)
+{
+	struct devx_async_cmd_event_file *comp_ev_file =
+		container_of(uobj, struct devx_async_cmd_event_file,
+			     uobj);
+	struct devx_async_event_queue *ev_queue = &comp_ev_file->ev_queue;
+
+	spin_lock_irq(&ev_queue->lock);
+	ev_queue->is_destroyed = 1;
+	spin_unlock_irq(&ev_queue->lock);
+
+	if (why == RDMA_REMOVE_DRIVER_REMOVE)
+		wake_up_interruptible(&ev_queue->poll_wait);
+
+	mlx5_cmd_cleanup_async_ctx(&comp_ev_file->async_ctx);
+	return 0;
+};
+
+static int devx_hot_unplug_async_event_file(struct ib_uobject *uobj,
+					    enum rdma_remove_reason why)
+{
+	struct devx_async_event_file *ev_file =
+		container_of(uobj, struct devx_async_event_file,
+			     uobj);
+
+	spin_lock_irq(&ev_file->lock);
+	ev_file->is_destroyed = 1;
+	spin_unlock_irq(&ev_file->lock);
+
+	wake_up_interruptible(&ev_file->poll_wait);
+	return 0;
+};
+
 DECLARE_UVERBS_NAMED_METHOD(
 	MLX5_IB_METHOD_DEVX_UMEM_REG,
 	UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE,
@@ -1065,7 +2799,7 @@
 DECLARE_UVERBS_NAMED_METHOD(
 	MLX5_IB_METHOD_DEVX_OBJ_MODIFY,
 	UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE,
-			MLX5_IB_OBJECT_DEVX_OBJ,
+			UVERBS_IDR_ANY_OBJECT,
 			UVERBS_ACCESS_WRITE,
 			UA_MANDATORY),
 	UVERBS_ATTR_PTR_IN(
@@ -1081,7 +2815,7 @@
 DECLARE_UVERBS_NAMED_METHOD(
 	MLX5_IB_METHOD_DEVX_OBJ_QUERY,
 	UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE,
-			MLX5_IB_OBJECT_DEVX_OBJ,
+			UVERBS_IDR_ANY_OBJECT,
 			UVERBS_ACCESS_READ,
 			UA_MANDATORY),
 	UVERBS_ATTR_PTR_IN(
@@ -1094,29 +2828,123 @@
 		UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_out_cmd_hdr)),
 		UA_MANDATORY));
 
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE,
+			UVERBS_IDR_ANY_OBJECT,
+			UVERBS_ACCESS_READ,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(
+		MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN,
+		UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)),
+		UA_MANDATORY,
+		UA_ALLOC_AND_COPY),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN,
+		u16, UA_MANDATORY),
+	UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD,
+		MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+		UVERBS_ACCESS_READ,
+		UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID,
+		UVERBS_ATTR_TYPE(u64),
+		UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT,
+	UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE,
+		MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
+		UVERBS_ACCESS_READ,
+		UA_MANDATORY),
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE,
+		MLX5_IB_OBJECT_DEVX_OBJ,
+		UVERBS_ACCESS_READ,
+		UA_OPTIONAL),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST,
+		UVERBS_ATTR_MIN_SIZE(sizeof(u16)),
+		UA_MANDATORY,
+		UA_ALLOC_AND_COPY),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE,
+		UVERBS_ATTR_TYPE(u64),
+		UA_OPTIONAL),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM,
+		UVERBS_ATTR_TYPE(u32),
+		UA_OPTIONAL));
+
 DECLARE_UVERBS_GLOBAL_METHODS(MLX5_IB_OBJECT_DEVX,
 			      &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OTHER),
 			      &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_UAR),
-			      &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_EQN));
+			      &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_EQN),
+			      &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT));
 
 DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_OBJ,
 			    UVERBS_TYPE_ALLOC_IDR(devx_obj_cleanup),
 			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_CREATE),
 			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_DESTROY),
 			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_MODIFY),
-			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_QUERY));
+			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_QUERY),
+			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY));
 
 DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_UMEM,
 			    UVERBS_TYPE_ALLOC_IDR(devx_umem_cleanup),
 			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_UMEM_REG),
 			    &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_UMEM_DEREG));
 
-DECLARE_UVERBS_OBJECT_TREE(devx_objects,
-			   &UVERBS_OBJECT(MLX5_IB_OBJECT_DEVX),
-			   &UVERBS_OBJECT(MLX5_IB_OBJECT_DEVX_OBJ),
-			   &UVERBS_OBJECT(MLX5_IB_OBJECT_DEVX_UMEM));
 
-const struct uverbs_object_tree_def *mlx5_ib_get_devx_tree(void)
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC,
+	UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE,
+			MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+	MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+	UVERBS_TYPE_ALLOC_FD(sizeof(struct devx_async_cmd_event_file),
+			     devx_hot_unplug_async_cmd_event_file,
+			     &devx_async_cmd_event_fops, "[devx_async_cmd]",
+			     O_RDONLY),
+	&UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC));
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC,
+	UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE,
+			MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS,
+			enum mlx5_ib_uapi_devx_create_event_channel_flags,
+			UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+	MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
+	UVERBS_TYPE_ALLOC_FD(sizeof(struct devx_async_event_file),
+			     devx_hot_unplug_async_event_file,
+			     &devx_async_event_fops, "[devx_async_event]",
+			     O_RDONLY),
+	&UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC));
+
+static bool devx_is_supported(struct ib_device *device)
 {
-	return &devx_objects;
+	struct mlx5_ib_dev *dev = to_mdev(device);
+
+	return MLX5_CAP_GEN(dev->mdev, log_max_uctx);
 }
+
+const struct uapi_definition mlx5_ib_devx_defs[] = {
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+		MLX5_IB_OBJECT_DEVX,
+		UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+		MLX5_IB_OBJECT_DEVX_OBJ,
+		UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+		MLX5_IB_OBJECT_DEVX_UMEM,
+		UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+		MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+		UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+		MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
+		UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
+	{},
+};
diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c
index a0e4e6d..8f4e5f2 100644
--- a/drivers/infiniband/hw/mlx5/doorbell.c
+++ b/drivers/infiniband/hw/mlx5/doorbell.c
@@ -43,7 +43,8 @@
 	int			refcnt;
 };
 
-int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,
+int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context,
+			struct ib_udata *udata, unsigned long virt,
 			struct mlx5_db *db)
 {
 	struct mlx5_ib_user_db_page *page;
@@ -63,8 +64,7 @@
 
 	page->user_virt = (virt & PAGE_MASK);
 	page->refcnt    = 0;
-	page->umem      = ib_umem_get(&context->ibucontext, virt & PAGE_MASK,
-				      PAGE_SIZE, 0, 0);
+	page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0, 0);
 	if (IS_ERR(page->umem)) {
 		err = PTR_ERR(page->umem);
 		kfree(page);
diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c
index 1a29f47..b198ff1 100644
--- a/drivers/infiniband/hw/mlx5/flow.c
+++ b/drivers/infiniband/hw/mlx5/flow.c
@@ -7,7 +7,9 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/uverbs_types.h>
 #include <rdma/uverbs_ioctl.h>
+#include <rdma/uverbs_std_types.h>
 #include <rdma/mlx5_user_ioctl_cmds.h>
+#include <rdma/mlx5_user_ioctl_verbs.h>
 #include <rdma/ib_umem.h>
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/fs.h>
@@ -16,6 +18,30 @@
 #define UVERBS_MODULE_NAME mlx5_ib
 #include <rdma/uverbs_named_ioctl.h>
 
+static int
+mlx5_ib_ft_type_to_namespace(enum mlx5_ib_uapi_flow_table_type table_type,
+			     enum mlx5_flow_namespace_type *namespace)
+{
+	switch (table_type) {
+	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX:
+		*namespace = MLX5_FLOW_NAMESPACE_BYPASS;
+		break;
+	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX:
+		*namespace = MLX5_FLOW_NAMESPACE_EGRESS;
+		break;
+	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_FDB:
+		*namespace = MLX5_FLOW_NAMESPACE_FDB;
+		break;
+	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_RX:
+		*namespace = MLX5_FLOW_NAMESPACE_RDMA_RX;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static const struct uverbs_attr_spec mlx5_ib_flow_type[] = {
 	[MLX5_IB_FLOW_TYPE_NORMAL] = {
 		.type = UVERBS_ATTR_TYPE_PTR_IN,
@@ -38,11 +64,16 @@
 	},
 };
 
+#define MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS 2
 static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
-	struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+	struct uverbs_attr_bundle *attrs)
 {
+	struct mlx5_flow_context flow_context = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG};
 	struct mlx5_ib_flow_handler *flow_handler;
 	struct mlx5_ib_flow_matcher *fs_matcher;
+	struct ib_uobject **arr_flow_actions;
+	struct ib_uflow_resources *uflow_res;
+	struct mlx5_flow_act flow_act = {};
 	void *devx_obj;
 	int dest_id, dest_type;
 	void *cmd_in;
@@ -51,7 +82,9 @@
 	struct ib_qp *qp = NULL;
 	struct ib_uobject *uobj =
 		uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE);
-	struct mlx5_ib_dev *dev = to_mdev(uobj->context->device);
+	struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata);
+	int len, ret, i;
+	u32 counter_id = 0;
 
 	if (!capable(CAP_NET_RAW))
 		return -EPERM;
@@ -61,7 +94,19 @@
 	dest_qp = uverbs_attr_is_valid(attrs,
 				       MLX5_IB_ATTR_CREATE_FLOW_DEST_QP);
 
-	if ((dest_devx && dest_qp) || (!dest_devx && !dest_qp))
+	fs_matcher = uverbs_attr_get_obj(attrs,
+					 MLX5_IB_ATTR_CREATE_FLOW_MATCHER);
+	if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS &&
+	    ((dest_devx && dest_qp) || (!dest_devx && !dest_qp)))
+		return -EINVAL;
+
+	/* Allow only DEVX object as dest when inserting to FDB */
+	if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB && !dest_devx)
+		return -EINVAL;
+
+	/* Allow only DEVX object or QP as dest when inserting to RDMA_RX */
+	if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) &&
+	    ((!dest_devx && !dest_qp) || (dest_devx && dest_qp)))
 		return -EINVAL;
 
 	if (dest_devx) {
@@ -75,7 +120,12 @@
 		 */
 		if (!mlx5_ib_devx_is_flow_dest(devx_obj, &dest_id, &dest_type))
 			return -EINVAL;
-	} else {
+		/* Allow only flow table as dest when inserting to FDB or RDMA_RX */
+		if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB ||
+		     fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) &&
+		    dest_type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
+			return -EINVAL;
+	} else if (dest_qp) {
 		struct mlx5_ib_qp *mqp;
 
 		qp = uverbs_attr_get_obj(attrs,
@@ -92,29 +142,78 @@
 		else
 			dest_id = mqp->raw_packet_qp.rq.tirn;
 		dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+	} else {
+		dest_type = MLX5_FLOW_DESTINATION_TYPE_PORT;
 	}
 
-	if (dev->rep)
-		return -ENOTSUPP;
+	len = uverbs_attr_get_uobjs_arr(attrs,
+		MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions);
+	if (len) {
+		devx_obj = arr_flow_actions[0]->object;
+
+		if (!mlx5_ib_devx_is_flow_counter(devx_obj, &counter_id))
+			return -EINVAL;
+		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
+	}
+
+	if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR &&
+	    fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS)
+		return -EINVAL;
 
 	cmd_in = uverbs_attr_get_alloced_ptr(
 		attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE);
 	inlen = uverbs_attr_get_len(attrs,
 				    MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE);
-	fs_matcher = uverbs_attr_get_obj(attrs,
-					 MLX5_IB_ATTR_CREATE_FLOW_MATCHER);
-	flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, cmd_in, inlen,
-					       dest_id, dest_type);
-	if (IS_ERR(flow_handler))
-		return PTR_ERR(flow_handler);
 
-	ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev);
+	uflow_res = flow_resources_alloc(MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS);
+	if (!uflow_res)
+		return -ENOMEM;
+
+	len = uverbs_attr_get_uobjs_arr(attrs,
+		MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, &arr_flow_actions);
+	for (i = 0; i < len; i++) {
+		struct mlx5_ib_flow_action *maction =
+			to_mflow_act(arr_flow_actions[i]->object);
+
+		ret = parse_flow_flow_action(maction, false, &flow_act);
+		if (ret)
+			goto err_out;
+		flow_resources_add(uflow_res, IB_FLOW_SPEC_ACTION_HANDLE,
+				   arr_flow_actions[i]->object);
+	}
+
+	ret = uverbs_copy_from(&flow_context.flow_tag, attrs,
+			       MLX5_IB_ATTR_CREATE_FLOW_TAG);
+	if (!ret) {
+		if (flow_context.flow_tag >= BIT(24)) {
+			ret = -EINVAL;
+			goto err_out;
+		}
+		flow_context.flags |= FLOW_CONTEXT_HAS_TAG;
+	}
+
+	flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher,
+					       &flow_context,
+					       &flow_act,
+					       counter_id,
+					       cmd_in, inlen,
+					       dest_id, dest_type);
+	if (IS_ERR(flow_handler)) {
+		ret = PTR_ERR(flow_handler);
+		goto err_out;
+	}
+
+	ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev, uflow_res);
 
 	return 0;
+err_out:
+	ib_uverbs_flow_resources_free(uflow_res);
+	return ret;
 }
 
 static int flow_matcher_cleanup(struct ib_uobject *uobject,
-				enum rdma_remove_reason why)
+				enum rdma_remove_reason why,
+				struct uverbs_attr_bundle *attrs)
 {
 	struct mlx5_ib_flow_matcher *obj = uobject->object;
 	int ret;
@@ -127,12 +226,60 @@
 	return 0;
 }
 
+static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs,
+			      struct mlx5_ib_flow_matcher *obj)
+{
+	enum mlx5_ib_uapi_flow_table_type ft_type =
+		MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX;
+	u32 flags;
+	int err;
+
+	/* New users should use MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE and older
+	 * users should switch to it. We keep this so as not to break userspace.
+	 */
+	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE) &&
+	    uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS))
+		return -EINVAL;
+
+	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE)) {
+		err = uverbs_get_const(&ft_type, attrs,
+				       MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE);
+		if (err)
+			return err;
+
+		err = mlx5_ib_ft_type_to_namespace(ft_type, &obj->ns_type);
+		if (err)
+			return err;
+
+		return 0;
+	}
+
+	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS)) {
+		err = uverbs_get_flags32(&flags, attrs,
+					 MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS,
+					 IB_FLOW_ATTR_FLAGS_EGRESS);
+		if (err)
+			return err;
+
+		if (flags) {
+			mlx5_ib_ft_type_to_namespace(
+				MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX,
+				&obj->ns_type);
+			return 0;
+		}
+	}
+
+	obj->ns_type = MLX5_FLOW_NAMESPACE_BYPASS;
+
+	return 0;
+}
+
 static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)(
-	struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
+	struct uverbs_attr_bundle *attrs)
 {
 	struct ib_uobject *uobj = uverbs_attr_get_uobject(
 		attrs, MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE);
-	struct mlx5_ib_dev *dev = to_mdev(uobj->context->device);
+	struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata);
 	struct mlx5_ib_flow_matcher *obj;
 	int err;
 
@@ -165,6 +312,10 @@
 	if (err)
 		goto end;
 
+	err = mlx5_ib_matcher_ns(attrs, obj);
+	if (err)
+		goto end;
+
 	uobj->object = obj;
 	obj->mdev = dev->mdev;
 	atomic_set(&obj->usecnt, 0);
@@ -175,6 +326,246 @@
 	return err;
 }
 
+void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction)
+{
+	switch (maction->flow_action_raw.sub_type) {
+	case MLX5_IB_FLOW_ACTION_MODIFY_HEADER:
+		mlx5_modify_header_dealloc(maction->flow_action_raw.dev->mdev,
+					   maction->flow_action_raw.modify_hdr);
+		break;
+	case MLX5_IB_FLOW_ACTION_PACKET_REFORMAT:
+		mlx5_packet_reformat_dealloc(maction->flow_action_raw.dev->mdev,
+					     maction->flow_action_raw.pkt_reformat);
+		break;
+	case MLX5_IB_FLOW_ACTION_DECAP:
+		break;
+	default:
+		break;
+	}
+}
+
+static struct ib_flow_action *
+mlx5_ib_create_modify_header(struct mlx5_ib_dev *dev,
+			     enum mlx5_ib_uapi_flow_table_type ft_type,
+			     u8 num_actions, void *in)
+{
+	enum mlx5_flow_namespace_type namespace;
+	struct mlx5_ib_flow_action *maction;
+	int ret;
+
+	ret = mlx5_ib_ft_type_to_namespace(ft_type, &namespace);
+	if (ret)
+		return ERR_PTR(-EINVAL);
+
+	maction = kzalloc(sizeof(*maction), GFP_KERNEL);
+	if (!maction)
+		return ERR_PTR(-ENOMEM);
+
+	maction->flow_action_raw.modify_hdr =
+		mlx5_modify_header_alloc(dev->mdev, namespace, num_actions, in);
+
+	if (IS_ERR(maction->flow_action_raw.modify_hdr)) {
+		ret = PTR_ERR(maction->flow_action_raw.modify_hdr);
+		kfree(maction);
+		return ERR_PTR(ret);
+	}
+	maction->flow_action_raw.sub_type =
+		MLX5_IB_FLOW_ACTION_MODIFY_HEADER;
+	maction->flow_action_raw.dev = dev;
+
+	return &maction->ib_action;
+}
+
+static bool mlx5_ib_modify_header_supported(struct mlx5_ib_dev *dev)
+{
+	return MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
+					 max_modify_header_actions) ||
+	       MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, max_modify_header_actions);
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(
+		attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE);
+	struct mlx5_ib_dev *mdev = mlx5_udata_to_mdev(&attrs->driver_udata);
+	enum mlx5_ib_uapi_flow_table_type ft_type;
+	struct ib_flow_action *action;
+	int num_actions;
+	void *in;
+	int ret;
+
+	if (!mlx5_ib_modify_header_supported(mdev))
+		return -EOPNOTSUPP;
+
+	in = uverbs_attr_get_alloced_ptr(attrs,
+		MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM);
+
+	num_actions = uverbs_attr_ptr_get_array_size(
+		attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM,
+		MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto));
+	if (num_actions < 0)
+		return num_actions;
+
+	ret = uverbs_get_const(&ft_type, attrs,
+			       MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE);
+	if (ret)
+		return ret;
+	action = mlx5_ib_create_modify_header(mdev, ft_type, num_actions, in);
+	if (IS_ERR(action))
+		return PTR_ERR(action);
+
+	uverbs_flow_action_fill_action(action, uobj, &mdev->ib_dev,
+				       IB_FLOW_ACTION_UNSPECIFIED);
+
+	return 0;
+}
+
+static bool mlx5_ib_flow_action_packet_reformat_valid(struct mlx5_ib_dev *ibdev,
+						      u8 packet_reformat_type,
+						      u8 ft_type)
+{
+	switch (packet_reformat_type) {
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL:
+		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX)
+			return MLX5_CAP_FLOWTABLE(ibdev->mdev,
+						  encap_general_header);
+		break;
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL:
+		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX)
+			return MLX5_CAP_FLOWTABLE_NIC_TX(ibdev->mdev,
+				reformat_l2_to_l3_tunnel);
+		break;
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2:
+		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX)
+			return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev,
+				reformat_l3_tunnel_to_l2);
+		break;
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2:
+		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX)
+			return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev, decap);
+		break;
+	default:
+		break;
+	}
+
+	return false;
+}
+
+static int mlx5_ib_dv_to_prm_packet_reforamt_type(u8 dv_prt, u8 *prm_prt)
+{
+	switch (dv_prt) {
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL:
+		*prm_prt = MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL;
+		break;
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2:
+		*prm_prt = MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2;
+		break;
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL:
+		*prm_prt = MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int mlx5_ib_flow_action_create_packet_reformat_ctx(
+	struct mlx5_ib_dev *dev,
+	struct mlx5_ib_flow_action *maction,
+	u8 ft_type, u8 dv_prt,
+	void *in, size_t len)
+{
+	enum mlx5_flow_namespace_type namespace;
+	u8 prm_prt;
+	int ret;
+
+	ret = mlx5_ib_ft_type_to_namespace(ft_type, &namespace);
+	if (ret)
+		return ret;
+
+	ret = mlx5_ib_dv_to_prm_packet_reforamt_type(dv_prt, &prm_prt);
+	if (ret)
+		return ret;
+
+	maction->flow_action_raw.pkt_reformat =
+		mlx5_packet_reformat_alloc(dev->mdev, prm_prt, len,
+					   in, namespace);
+	if (IS_ERR(maction->flow_action_raw.pkt_reformat)) {
+		ret = PTR_ERR(maction->flow_action_raw.pkt_reformat);
+		return ret;
+	}
+
+	maction->flow_action_raw.sub_type =
+		MLX5_IB_FLOW_ACTION_PACKET_REFORMAT;
+	maction->flow_action_raw.dev = dev;
+
+	return 0;
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs,
+		MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE);
+	struct mlx5_ib_dev *mdev = mlx5_udata_to_mdev(&attrs->driver_udata);
+	enum mlx5_ib_uapi_flow_action_packet_reformat_type dv_prt;
+	enum mlx5_ib_uapi_flow_table_type ft_type;
+	struct mlx5_ib_flow_action *maction;
+	int ret;
+
+	ret = uverbs_get_const(&ft_type, attrs,
+			       MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE);
+	if (ret)
+		return ret;
+
+	ret = uverbs_get_const(&dv_prt, attrs,
+			       MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE);
+	if (ret)
+		return ret;
+
+	if (!mlx5_ib_flow_action_packet_reformat_valid(mdev, dv_prt, ft_type))
+		return -EOPNOTSUPP;
+
+	maction = kzalloc(sizeof(*maction), GFP_KERNEL);
+	if (!maction)
+		return -ENOMEM;
+
+	if (dv_prt ==
+	    MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2) {
+		maction->flow_action_raw.sub_type =
+			MLX5_IB_FLOW_ACTION_DECAP;
+		maction->flow_action_raw.dev = mdev;
+	} else {
+		void *in;
+		int len;
+
+		in = uverbs_attr_get_alloced_ptr(attrs,
+			MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF);
+		if (IS_ERR(in)) {
+			ret = PTR_ERR(in);
+			goto free_maction;
+		}
+
+		len = uverbs_attr_get_len(attrs,
+			MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF);
+
+		ret = mlx5_ib_flow_action_create_packet_reformat_ctx(mdev,
+			maction, ft_type, dv_prt, in, len);
+		if (ret)
+			goto free_maction;
+	}
+
+	uverbs_flow_action_fill_action(&maction->ib_action, uobj, &mdev->ib_dev,
+				       IB_FLOW_ACTION_UNSPECIFIED);
+	return 0;
+
+free_maction:
+	kfree(maction);
+	return ret;
+}
+
 DECLARE_UVERBS_NAMED_METHOD(
 	MLX5_IB_METHOD_CREATE_FLOW,
 	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE,
@@ -195,7 +586,19 @@
 			UVERBS_ACCESS_READ),
 	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX,
 			MLX5_IB_OBJECT_DEVX_OBJ,
-			UVERBS_ACCESS_READ));
+			UVERBS_ACCESS_READ),
+	UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS,
+			     UVERBS_OBJECT_FLOW_ACTION,
+			     UVERBS_ACCESS_READ, 1,
+			     MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS,
+			     UA_OPTIONAL),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_TAG,
+			   UVERBS_ATTR_TYPE(u32),
+			   UA_OPTIONAL),
+	UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX,
+			     MLX5_IB_OBJECT_DEVX_OBJ,
+			     UVERBS_ACCESS_READ, 1, 1,
+			     UA_OPTIONAL));
 
 DECLARE_UVERBS_NAMED_METHOD_DESTROY(
 	MLX5_IB_METHOD_DESTROY_FLOW,
@@ -210,6 +613,44 @@
 		   &UVERBS_METHOD(MLX5_IB_METHOD_DESTROY_FLOW));
 
 DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE,
+			UVERBS_OBJECT_FLOW_ACTION,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM,
+			   UVERBS_ATTR_MIN_SIZE(MLX5_UN_SZ_BYTES(
+				   set_action_in_add_action_in_auto)),
+			   UA_MANDATORY,
+			   UA_ALLOC_AND_COPY),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE,
+			     enum mlx5_ib_uapi_flow_table_type,
+			     UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE,
+			UVERBS_OBJECT_FLOW_ACTION,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF,
+			   UVERBS_ATTR_MIN_SIZE(1),
+			   UA_ALLOC_AND_COPY,
+			   UA_OPTIONAL),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE,
+			     enum mlx5_ib_uapi_flow_action_packet_reformat_type,
+			     UA_MANDATORY),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE,
+			     enum mlx5_ib_uapi_flow_table_type,
+			     UA_MANDATORY));
+
+ADD_UVERBS_METHODS(
+	mlx5_ib_flow_actions,
+	UVERBS_OBJECT_FLOW_ACTION,
+	&UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER),
+	&UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT));
+
+DECLARE_UVERBS_NAMED_METHOD(
 	MLX5_IB_METHOD_FLOW_MATCHER_CREATE,
 	UVERBS_ATTR_IDR(MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE,
 			MLX5_IB_OBJECT_FLOW_MATCHER,
@@ -224,7 +665,13 @@
 			    UA_MANDATORY),
 	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA,
 			   UVERBS_ATTR_TYPE(u8),
-			   UA_MANDATORY));
+			   UA_MANDATORY),
+	UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS,
+			     enum ib_flow_flags,
+			     UA_OPTIONAL),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE,
+			     enum mlx5_ib_uapi_flow_table_type,
+			     UA_OPTIONAL));
 
 DECLARE_UVERBS_NAMED_METHOD_DESTROY(
 	MLX5_IB_METHOD_FLOW_MATCHER_DESTROY,
@@ -238,15 +685,13 @@
 			    &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_CREATE),
 			    &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_DESTROY));
 
-DECLARE_UVERBS_OBJECT_TREE(flow_objects,
-			   &UVERBS_OBJECT(MLX5_IB_OBJECT_FLOW_MATCHER));
-
-int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root)
-{
-	int i = 0;
-
-	root[i++] = &flow_objects;
-	root[i++] = &mlx5_ib_fs;
-
-	return i;
-}
+const struct uapi_definition mlx5_ib_flow_defs[] = {
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+		MLX5_IB_OBJECT_FLOW_MATCHER),
+	UAPI_DEF_CHAIN_OBJ_TREE(
+		UVERBS_OBJECT_FLOW,
+		&mlx5_ib_fs),
+	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
+				&mlx5_ib_flow_actions),
+	{},
+};
diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index 35a0e04..74ce924 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -3,76 +3,65 @@
  * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
  */
 
+#include <linux/mlx5/vport.h>
 #include "ib_rep.h"
-
-static const struct mlx5_ib_profile rep_profile = {
-	STAGE_CREATE(MLX5_IB_STAGE_INIT,
-		     mlx5_ib_stage_init_init,
-		     mlx5_ib_stage_init_cleanup),
-	STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB,
-		     mlx5_ib_stage_rep_flow_db_init,
-		     NULL),
-	STAGE_CREATE(MLX5_IB_STAGE_CAPS,
-		     mlx5_ib_stage_caps_init,
-		     NULL),
-	STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
-		     mlx5_ib_stage_rep_non_default_cb,
-		     NULL),
-	STAGE_CREATE(MLX5_IB_STAGE_ROCE,
-		     mlx5_ib_stage_rep_roce_init,
-		     mlx5_ib_stage_rep_roce_cleanup),
-	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
-		     mlx5_ib_stage_dev_res_init,
-		     mlx5_ib_stage_dev_res_cleanup),
-	STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
-		     mlx5_ib_stage_counters_init,
-		     mlx5_ib_stage_counters_cleanup),
-	STAGE_CREATE(MLX5_IB_STAGE_BFREG,
-		     mlx5_ib_stage_bfrag_init,
-		     mlx5_ib_stage_bfrag_cleanup),
-	STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
-		     NULL,
-		     mlx5_ib_stage_pre_ib_reg_umr_cleanup),
-	STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
-		     mlx5_ib_stage_ib_reg_init,
-		     mlx5_ib_stage_ib_reg_cleanup),
-	STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
-		     mlx5_ib_stage_post_ib_reg_umr_init,
-		     NULL),
-	STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR,
-		     mlx5_ib_stage_class_attr_init,
-		     NULL),
-};
+#include "srq.h"
 
 static int
-mlx5_ib_nic_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
+mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 {
-	return 0;
-}
+	struct mlx5_ib_dev *ibdev;
+	int vport_index;
 
-static void
-mlx5_ib_nic_rep_unload(struct mlx5_eswitch_rep *rep)
-{
-	rep->rep_if[REP_IB].priv = NULL;
+	ibdev = mlx5_ib_get_uplink_ibdev(dev->priv.eswitch);
+	vport_index = rep->vport_index;
+
+	ibdev->port[vport_index].rep = rep;
+	rep->rep_data[REP_IB].priv = ibdev;
+	write_lock(&ibdev->port[vport_index].roce.netdev_lock);
+	ibdev->port[vport_index].roce.netdev =
+		mlx5_ib_get_rep_netdev(dev->priv.eswitch, rep->vport);
+	write_unlock(&ibdev->port[vport_index].roce.netdev_lock);
+
+	return 0;
 }
 
 static int
 mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 {
+	int num_ports = mlx5_eswitch_get_total_vports(dev);
+	const struct mlx5_ib_profile *profile;
 	struct mlx5_ib_dev *ibdev;
+	int vport_index;
 
-	ibdev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*ibdev));
+	if (rep->vport == MLX5_VPORT_UPLINK)
+		profile = &uplink_rep_profile;
+	else
+		return mlx5_ib_set_vport_rep(dev, rep);
+
+	ibdev = ib_alloc_device(mlx5_ib_dev, ib_dev);
 	if (!ibdev)
 		return -ENOMEM;
 
-	ibdev->rep = rep;
+	ibdev->port = kcalloc(num_ports, sizeof(*ibdev->port),
+			      GFP_KERNEL);
+	if (!ibdev->port) {
+		ib_dealloc_device(&ibdev->ib_dev);
+		return -ENOMEM;
+	}
+
+	ibdev->is_rep = true;
+	vport_index = rep->vport_index;
+	ibdev->port[vport_index].rep = rep;
+	ibdev->port[vport_index].roce.netdev =
+		mlx5_ib_get_rep_netdev(dev->priv.eswitch, rep->vport);
 	ibdev->mdev = dev;
-	ibdev->num_ports = max(MLX5_CAP_GEN(dev, num_ports),
-			       MLX5_CAP_GEN(dev, num_vhca_ports));
-	if (!__mlx5_ib_add(ibdev, &rep_profile))
+	ibdev->num_ports = num_ports;
+
+	if (!__mlx5_ib_add(ibdev, profile))
 		return -EINVAL;
 
-	rep->rep_if[REP_IB].priv = ibdev;
+	rep->rep_data[REP_IB].priv = ibdev;
 
 	return 0;
 }
@@ -80,14 +69,18 @@
 static void
 mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep)
 {
-	struct mlx5_ib_dev *dev;
+	struct mlx5_ib_dev *dev = mlx5_ib_rep_to_dev(rep);
+	struct mlx5_ib_port *port;
 
-	if (!rep->rep_if[REP_IB].priv)
-		return;
+	port = &dev->port[rep->vport_index];
+	write_lock(&port->roce.netdev_lock);
+	port->roce.netdev = NULL;
+	write_unlock(&port->roce.netdev_lock);
+	rep->rep_data[REP_IB].priv = NULL;
+	port->rep = NULL;
 
-	dev = mlx5_ib_rep_to_dev(rep);
-	__mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
-	rep->rep_if[REP_IB].priv = NULL;
+	if (rep->vport == MLX5_VPORT_UPLINK)
+		__mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
 }
 
 static void *mlx5_ib_vport_get_proto_dev(struct mlx5_eswitch_rep *rep)
@@ -95,53 +88,24 @@
 	return mlx5_ib_rep_to_dev(rep);
 }
 
-static void mlx5_ib_rep_register_vf_vports(struct mlx5_ib_dev *dev)
+static const struct mlx5_eswitch_rep_ops rep_ops = {
+	.load = mlx5_ib_vport_rep_load,
+	.unload = mlx5_ib_vport_rep_unload,
+	.get_proto_dev = mlx5_ib_vport_get_proto_dev,
+};
+
+void mlx5_ib_register_vport_reps(struct mlx5_core_dev *mdev)
 {
-	struct mlx5_eswitch *esw   = dev->mdev->priv.eswitch;
-	int total_vfs = MLX5_TOTAL_VPORTS(dev->mdev);
-	int vport;
+	struct mlx5_eswitch *esw = mdev->priv.eswitch;
 
-	for (vport = 1; vport < total_vfs; vport++) {
-		struct mlx5_eswitch_rep_if rep_if = {};
-
-		rep_if.load = mlx5_ib_vport_rep_load;
-		rep_if.unload = mlx5_ib_vport_rep_unload;
-		rep_if.get_proto_dev = mlx5_ib_vport_get_proto_dev;
-		mlx5_eswitch_register_vport_rep(esw, vport, &rep_if, REP_IB);
-	}
+	mlx5_eswitch_register_vport_reps(esw, &rep_ops, REP_IB);
 }
 
-static void mlx5_ib_rep_unregister_vf_vports(struct mlx5_ib_dev *dev)
+void mlx5_ib_unregister_vport_reps(struct mlx5_core_dev *mdev)
 {
-	struct mlx5_eswitch *esw   = dev->mdev->priv.eswitch;
-	int total_vfs = MLX5_TOTAL_VPORTS(dev->mdev);
-	int vport;
+	struct mlx5_eswitch *esw = mdev->priv.eswitch;
 
-	for (vport = 1; vport < total_vfs; vport++)
-		mlx5_eswitch_unregister_vport_rep(esw, vport, REP_IB);
-}
-
-void mlx5_ib_register_vport_reps(struct mlx5_ib_dev *dev)
-{
-	struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
-	struct mlx5_eswitch_rep_if rep_if = {};
-
-	rep_if.load = mlx5_ib_nic_rep_load;
-	rep_if.unload = mlx5_ib_nic_rep_unload;
-	rep_if.get_proto_dev = mlx5_ib_vport_get_proto_dev;
-	rep_if.priv = dev;
-
-	mlx5_eswitch_register_vport_rep(esw, 0, &rep_if, REP_IB);
-
-	mlx5_ib_rep_register_vf_vports(dev);
-}
-
-void mlx5_ib_unregister_vport_reps(struct mlx5_ib_dev *dev)
-{
-	struct mlx5_eswitch *esw   = dev->mdev->priv.eswitch;
-
-	mlx5_ib_rep_unregister_vf_vports(dev); /* VFs vports */
-	mlx5_eswitch_unregister_vport_rep(esw, 0, REP_IB); /* UPLINK PF*/
+	mlx5_eswitch_unregister_vport_reps(esw, REP_IB);
 }
 
 u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw)
@@ -150,15 +114,15 @@
 }
 
 struct mlx5_ib_dev *mlx5_ib_get_rep_ibdev(struct mlx5_eswitch *esw,
-					  int vport_index)
+					  u16 vport_num)
 {
-	return mlx5_eswitch_get_proto_dev(esw, vport_index, REP_IB);
+	return mlx5_eswitch_get_proto_dev(esw, vport_num, REP_IB);
 }
 
 struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw,
-					  int vport_index)
+					  u16 vport_num)
 {
-	return mlx5_eswitch_get_proto_dev(esw, vport_index, REP_ETH);
+	return mlx5_eswitch_get_proto_dev(esw, vport_num, REP_ETH);
 }
 
 struct mlx5_ib_dev *mlx5_ib_get_uplink_ibdev(struct mlx5_eswitch *esw)
@@ -166,27 +130,27 @@
 	return mlx5_eswitch_uplink_get_proto_dev(esw, REP_IB);
 }
 
-struct mlx5_eswitch_rep *mlx5_ib_vport_rep(struct mlx5_eswitch *esw, int vport)
+struct mlx5_eswitch_rep *mlx5_ib_vport_rep(struct mlx5_eswitch *esw,
+					   u16 vport_num)
 {
-	return mlx5_eswitch_vport_rep(esw, vport);
+	return mlx5_eswitch_vport_rep(esw, vport_num);
 }
 
-int create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
-			      struct mlx5_ib_sq *sq)
+struct mlx5_flow_handle *create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
+						   struct mlx5_ib_sq *sq,
+						   u16 port)
 {
-	struct mlx5_flow_handle *flow_rule;
 	struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
+	struct mlx5_eswitch_rep *rep;
 
-	if (!dev->rep)
-		return 0;
+	if (!dev->is_rep || !port)
+		return NULL;
 
-	flow_rule =
-		mlx5_eswitch_add_send_to_vport_rule(esw,
-						    dev->rep->vport,
-						    sq->base.mqp.qpn);
-	if (IS_ERR(flow_rule))
-		return PTR_ERR(flow_rule);
-	sq->flow_rule = flow_rule;
+	if (!dev->port[port - 1].rep)
+		return ERR_PTR(-EINVAL);
 
-	return 0;
+	rep = dev->port[port - 1].rep;
+
+	return mlx5_eswitch_add_send_to_vport_rule(esw, rep->vport,
+						   sq->base.mqp.qpn);
 }
diff --git a/drivers/infiniband/hw/mlx5/ib_rep.h b/drivers/infiniband/hw/mlx5/ib_rep.h
index 2ba7363..de43b42 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.h
+++ b/drivers/infiniband/hw/mlx5/ib_rep.h
@@ -10,27 +10,30 @@
 #include "mlx5_ib.h"
 
 #ifdef CONFIG_MLX5_ESWITCH
+extern const struct mlx5_ib_profile uplink_rep_profile;
+
 u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw);
 struct mlx5_ib_dev *mlx5_ib_get_rep_ibdev(struct mlx5_eswitch *esw,
-					  int vport_index);
+					  u16 vport_num);
 struct mlx5_ib_dev *mlx5_ib_get_uplink_ibdev(struct mlx5_eswitch *esw);
 struct mlx5_eswitch_rep *mlx5_ib_vport_rep(struct mlx5_eswitch *esw,
-					   int vport_index);
-void mlx5_ib_register_vport_reps(struct mlx5_ib_dev *dev);
-void mlx5_ib_unregister_vport_reps(struct mlx5_ib_dev *dev);
-int create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
-			      struct mlx5_ib_sq *sq);
+					   u16 vport_num);
+void mlx5_ib_register_vport_reps(struct mlx5_core_dev *mdev);
+void mlx5_ib_unregister_vport_reps(struct mlx5_core_dev *mdev);
+struct mlx5_flow_handle *create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
+						   struct mlx5_ib_sq *sq,
+						   u16 port);
 struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw,
-					  int vport_index);
+					  u16 vport_num);
 #else /* CONFIG_MLX5_ESWITCH */
 static inline u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw)
 {
-	return SRIOV_NONE;
+	return MLX5_ESWITCH_NONE;
 }
 
 static inline
 struct mlx5_ib_dev *mlx5_ib_get_rep_ibdev(struct mlx5_eswitch *esw,
-					  int vport_index)
+					  u16 vport_num)
 {
 	return NULL;
 }
@@ -43,22 +46,24 @@
 
 static inline
 struct mlx5_eswitch_rep *mlx5_ib_vport_rep(struct mlx5_eswitch *esw,
-					   int vport_index)
+					   u16 vport_num)
 {
 	return NULL;
 }
 
-static inline void mlx5_ib_register_vport_reps(struct mlx5_ib_dev *dev) {}
-static inline void mlx5_ib_unregister_vport_reps(struct mlx5_ib_dev *dev) {}
-static inline int create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
-					    struct mlx5_ib_sq *sq)
+static inline void mlx5_ib_register_vport_reps(struct mlx5_core_dev *mdev) {}
+static inline void mlx5_ib_unregister_vport_reps(struct mlx5_core_dev *mdev) {}
+static inline
+struct mlx5_flow_handle *create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
+						   struct mlx5_ib_sq *sq,
+						   u16 port)
 {
-	return 0;
+	return NULL;
 }
 
 static inline
 struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw,
-					  int vport_index)
+					  u16 vport_num)
 {
 	return NULL;
 }
@@ -67,6 +72,6 @@
 static inline
 struct mlx5_ib_dev *mlx5_ib_rep_to_dev(struct mlx5_eswitch_rep *rep)
 {
-	return (struct mlx5_ib_dev *)rep->rep_if[REP_IB].priv;
+	return rep->rep_data[REP_IB].priv;
 }
 #endif /* __MLX5_IB_REP_H__ */
diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c
index 32a9e92..348c1df 100644
--- a/drivers/infiniband/hw/mlx5/mad.c
+++ b/drivers/infiniband/hw/mlx5/mad.c
@@ -36,6 +36,7 @@
 #include <rdma/ib_smi.h>
 #include <rdma/ib_pma.h>
 #include "mlx5_ib.h"
+#include "cmd.h"
 
 enum {
 	MLX5_IB_VENDOR_CLASS1 = 0x9,
@@ -51,9 +52,10 @@
 	return dev->mdev->port_caps[port_num - 1].has_smi;
 }
 
-int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey,
-		 u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-		 const void *in_mad, void *response_mad)
+static int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey,
+			int ignore_bkey, u8 port, const struct ib_wc *in_wc,
+			const struct ib_grh *in_grh, const void *in_mad,
+			void *response_mad)
 {
 	u8 op_modifier = 0;
 
@@ -68,7 +70,8 @@
 	if (ignore_bkey || !in_wc)
 		op_modifier |= 0x2;
 
-	return mlx5_core_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier, port);
+	return mlx5_cmd_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier,
+				port);
 }
 
 static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
@@ -197,19 +200,33 @@
 			     vl_15_dropped);
 }
 
-static int process_pma_cmd(struct mlx5_core_dev *mdev, u8 port_num,
+static int process_pma_cmd(struct mlx5_ib_dev *dev, u8 port_num,
 			   const struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
-	int err;
+	struct mlx5_core_dev *mdev;
+	bool native_port = true;
+	u8 mdev_port_num;
 	void *out_cnt;
+	int err;
 
+	mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
+	if (!mdev) {
+		/* Failed to get the native port, likely because the 2nd port is
+		 * still unaffiliated. In that case, default to the 1st port and
+		 * the attached PF device.
+		 */
+		native_port = false;
+		mdev = dev->mdev;
+		mdev_port_num = 1;
+	}
 	/* Declaring support of extended counters */
 	if (in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO) {
 		struct ib_class_port_info cpi = {};
 
 		cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH;
 		memcpy((out_mad->data + 40), &cpi, sizeof(cpi));
-		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+		err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+		goto done;
 	}
 
 	if (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT) {
@@ -218,11 +235,13 @@
 		int sz = MLX5_ST_SZ_BYTES(query_vport_counter_out);
 
 		out_cnt = kvzalloc(sz, GFP_KERNEL);
-		if (!out_cnt)
-			return IB_MAD_RESULT_FAILURE;
+		if (!out_cnt) {
+			err = IB_MAD_RESULT_FAILURE;
+			goto done;
+		}
 
 		err = mlx5_core_query_vport_counter(mdev, 0, 0,
-						    port_num, out_cnt, sz);
+						    mdev_port_num, out_cnt, sz);
 		if (!err)
 			pma_cnt_ext_assign(pma_cnt_ext, out_cnt);
 	} else {
@@ -231,20 +250,23 @@
 		int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
 
 		out_cnt = kvzalloc(sz, GFP_KERNEL);
-		if (!out_cnt)
-			return IB_MAD_RESULT_FAILURE;
+		if (!out_cnt) {
+			err = IB_MAD_RESULT_FAILURE;
+			goto done;
+		}
 
-		err = mlx5_core_query_ib_ppcnt(mdev, port_num,
+		err = mlx5_core_query_ib_ppcnt(mdev, mdev_port_num,
 					       out_cnt, sz);
 		if (!err)
 			pma_cnt_assign(pma_cnt, out_cnt);
-		}
-
+	}
 	kvfree(out_cnt);
-	if (err)
-		return IB_MAD_RESULT_FAILURE;
-
-	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+	err = err ? IB_MAD_RESULT_FAILURE :
+		    IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+done:
+	if (native_port)
+		mlx5_ib_put_native_port_mdev(dev, port_num);
+	return err;
 }
 
 int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
@@ -256,8 +278,6 @@
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
 	const struct ib_mad *in_mad = (const struct ib_mad *)in;
 	struct ib_mad *out_mad = (struct ib_mad *)out;
-	struct mlx5_core_dev *mdev;
-	u8 mdev_port_num;
 	int ret;
 
 	if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
@@ -266,19 +286,14 @@
 
 	memset(out_mad->data, 0, sizeof(out_mad->data));
 
-	mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
-	if (!mdev)
-		return IB_MAD_RESULT_FAILURE;
-
-	if (MLX5_CAP_GEN(mdev, vport_counters) &&
+	if (MLX5_CAP_GEN(dev->mdev, vport_counters) &&
 	    in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT &&
 	    in_mad->mad_hdr.method == IB_MGMT_METHOD_GET) {
-		ret = process_pma_cmd(mdev, mdev_port_num, in_mad, out_mad);
+		ret = process_pma_cmd(dev, port_num, in_mad, out_mad);
 	} else {
 		ret =  process_mad(ibdev, mad_flags, port_num, in_wc, in_grh,
 				   in_mad, out_mad);
 	}
-	mlx5_ib_put_native_port_mdev(dev, port_num);
 	return ret;
 }
 
@@ -526,11 +541,6 @@
 	int ext_active_speed;
 	int err = -ENOMEM;
 
-	if (port < 1 || port > dev->num_ports) {
-		mlx5_ib_warn(dev, "invalid port number %d\n", port);
-		return -EINVAL;
-	}
-
 	in_mad  = kzalloc(sizeof(*in_mad), GFP_KERNEL);
 	out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
 	if (!in_mad || !out_mad)
@@ -568,6 +578,14 @@
 	props->max_vl_num	= out_mad->data[37] >> 4;
 	props->init_type_reply	= out_mad->data[41] >> 4;
 
+	if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP) {
+		props->port_cap_flags2 =
+			be16_to_cpup((__be16 *)(out_mad->data + 60));
+
+		if (props->port_cap_flags2 & IB_PORT_LINK_WIDTH_2X_SUP)
+			props->active_width = out_mad->data[31] & 0x1f;
+	}
+
 	/* Check if extended speeds (EDR/FDR/...) are supported */
 	if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) {
 		ext_active_speed = out_mad->data[62] >> 4;
@@ -579,6 +597,11 @@
 		case 2:
 			props->active_speed = 32; /* EDR */
 			break;
+		case 4:
+			if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP &&
+			    props->port_cap_flags2 & IB_PORT_LINK_SPEED_HDR_SUP)
+				props->active_speed = IB_SPEED_HDR;
+			break;
 		}
 	}
 
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 50be240..8315394 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -52,6 +52,7 @@
 #include <linux/mlx5/port.h>
 #include <linux/mlx5/vport.h>
 #include <linux/mlx5/fs.h>
+#include <linux/mlx5/eswitch.h>
 #include <linux/list.h>
 #include <rdma/ib_smi.h>
 #include <rdma/ib_umem.h>
@@ -60,6 +61,7 @@
 #include "mlx5_ib.h"
 #include "ib_rep.h"
 #include "cmd.h"
+#include "srq.h"
 #include <linux/mlx5/fs_helpers.h>
 #include <linux/mlx5/accel.h>
 #include <rdma/uverbs_std_types.h>
@@ -82,10 +84,13 @@
 
 struct mlx5_ib_event_work {
 	struct work_struct	work;
-	struct mlx5_core_dev	*dev;
-	void			*context;
-	enum mlx5_dev_event	event;
-	unsigned long		param;
+	union {
+		struct mlx5_ib_dev	      *dev;
+		struct mlx5_ib_multiport_info *mpi;
+	};
+	bool			is_slave;
+	unsigned int		event;
+	void			*param;
 };
 
 enum {
@@ -146,12 +151,40 @@
 	int ret;
 
 	memset(&attr, 0, sizeof(attr));
-	ret = ibdev->query_port(ibdev, port_num, &attr);
+	ret = ibdev->ops.query_port(ibdev, port_num, &attr);
 	if (!ret)
 		*state = attr.state;
 	return ret;
 }
 
+static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev,
+					   struct net_device *ndev,
+					   u8 *port_num)
+{
+	struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
+	struct net_device *rep_ndev;
+	struct mlx5_ib_port *port;
+	int i;
+
+	for (i = 0; i < dev->num_ports; i++) {
+		port  = &dev->port[i];
+		if (!port->rep)
+			continue;
+
+		read_lock(&port->roce.netdev_lock);
+		rep_ndev = mlx5_ib_get_rep_netdev(esw,
+						  port->rep->vport);
+		if (rep_ndev == ndev) {
+			read_unlock(&port->roce.netdev_lock);
+			*port_num = i + 1;
+			return &port->roce;
+		}
+		read_unlock(&port->roce.netdev_lock);
+	}
+
+	return NULL;
+}
+
 static int mlx5_netdev_event(struct notifier_block *this,
 			     unsigned long event, void *ptr)
 {
@@ -168,21 +201,20 @@
 
 	switch (event) {
 	case NETDEV_REGISTER:
-	case NETDEV_UNREGISTER:
+		/* Should already be registered during the load */
+		if (ibdev->is_rep)
+			break;
 		write_lock(&roce->netdev_lock);
-		if (ibdev->rep) {
-			struct mlx5_eswitch *esw = ibdev->mdev->priv.eswitch;
-			struct net_device *rep_ndev;
+		if (ndev->dev.parent == mdev->device)
+			roce->netdev = ndev;
+		write_unlock(&roce->netdev_lock);
+		break;
 
-			rep_ndev = mlx5_ib_get_rep_netdev(esw,
-							  ibdev->rep->vport);
-			if (rep_ndev == ndev)
-				roce->netdev = (event == NETDEV_UNREGISTER) ?
-					NULL : ndev;
-		} else if (ndev->dev.parent == &mdev->pdev->dev) {
-			roce->netdev = (event == NETDEV_UNREGISTER) ?
-				NULL : ndev;
-		}
+	case NETDEV_UNREGISTER:
+		/* In the case of reps, the ib device goes away before the netdevs */
+		write_lock(&roce->netdev_lock);
+		if (roce->netdev == ndev)
+			roce->netdev = NULL;
 		write_unlock(&roce->netdev_lock);
 		break;
 
@@ -197,6 +229,10 @@
 			dev_put(lag_ndev);
 		}
 
+		if (ibdev->is_rep)
+			roce = mlx5_get_rep_roce(ibdev, ndev, &port_num);
+		if (!roce)
+			return NOTIFY_DONE;
 		if ((upper == ndev || (!upper && ndev == roce->netdev))
 		    && ibdev->ib_active) {
 			struct ib_event ibev = { };
@@ -249,11 +285,11 @@
 
 	/* Ensure ndev does not disappear before we invoke dev_hold()
 	 */
-	read_lock(&ibdev->roce[port_num - 1].netdev_lock);
-	ndev = ibdev->roce[port_num - 1].netdev;
+	read_lock(&ibdev->port[port_num - 1].roce.netdev_lock);
+	ndev = ibdev->port[port_num - 1].roce.netdev;
 	if (ndev)
 		dev_hold(ndev);
-	read_unlock(&ibdev->roce[port_num - 1].netdev_lock);
+	read_unlock(&ibdev->port[port_num - 1].roce.netdev_lock);
 
 out:
 	mlx5_ib_put_native_port_mdev(ibdev, port_num);
@@ -323,8 +359,8 @@
 	spin_unlock(&port->mp.mpi_lock);
 }
 
-static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed,
-				    u8 *active_width)
+static int translate_eth_legacy_proto_oper(u32 eth_proto_oper, u8 *active_speed,
+					   u8 *active_width)
 {
 	switch (eth_proto_oper) {
 	case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII):
@@ -381,10 +417,73 @@
 	return 0;
 }
 
+static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u8 *active_speed,
+					u8 *active_width)
+{
+	switch (eth_proto_oper) {
+	case MLX5E_PROT_MASK(MLX5E_SGMII_100M):
+	case MLX5E_PROT_MASK(MLX5E_1000BASE_X_SGMII):
+		*active_width = IB_WIDTH_1X;
+		*active_speed = IB_SPEED_SDR;
+		break;
+	case MLX5E_PROT_MASK(MLX5E_5GBASE_R):
+		*active_width = IB_WIDTH_1X;
+		*active_speed = IB_SPEED_DDR;
+		break;
+	case MLX5E_PROT_MASK(MLX5E_10GBASE_XFI_XAUI_1):
+		*active_width = IB_WIDTH_1X;
+		*active_speed = IB_SPEED_QDR;
+		break;
+	case MLX5E_PROT_MASK(MLX5E_40GBASE_XLAUI_4_XLPPI_4):
+		*active_width = IB_WIDTH_4X;
+		*active_speed = IB_SPEED_QDR;
+		break;
+	case MLX5E_PROT_MASK(MLX5E_25GAUI_1_25GBASE_CR_KR):
+		*active_width = IB_WIDTH_1X;
+		*active_speed = IB_SPEED_EDR;
+		break;
+	case MLX5E_PROT_MASK(MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2):
+		*active_width = IB_WIDTH_2X;
+		*active_speed = IB_SPEED_EDR;
+		break;
+	case MLX5E_PROT_MASK(MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR):
+		*active_width = IB_WIDTH_1X;
+		*active_speed = IB_SPEED_HDR;
+		break;
+	case MLX5E_PROT_MASK(MLX5E_CAUI_4_100GBASE_CR4_KR4):
+		*active_width = IB_WIDTH_4X;
+		*active_speed = IB_SPEED_EDR;
+		break;
+	case MLX5E_PROT_MASK(MLX5E_100GAUI_2_100GBASE_CR2_KR2):
+		*active_width = IB_WIDTH_2X;
+		*active_speed = IB_SPEED_HDR;
+		break;
+	case MLX5E_PROT_MASK(MLX5E_200GAUI_4_200GBASE_CR4_KR4):
+		*active_width = IB_WIDTH_4X;
+		*active_speed = IB_SPEED_HDR;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed,
+				    u8 *active_width, bool ext)
+{
+	return ext ?
+		translate_eth_ext_proto_oper(eth_proto_oper, active_speed,
+					     active_width) :
+		translate_eth_legacy_proto_oper(eth_proto_oper, active_speed,
+						active_width);
+}
+
 static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
 				struct ib_port_attr *props)
 {
 	struct mlx5_ib_dev *dev = to_mdev(device);
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {0};
 	struct mlx5_core_dev *mdev;
 	struct net_device *ndev, *upper;
 	enum ib_mtu ndev_ib_mtu;
@@ -392,6 +491,7 @@
 	u16 qkey_viol_cntr;
 	u32 eth_prot_oper;
 	u8 mdev_port_num;
+	bool ext;
 	int err;
 
 	mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
@@ -407,17 +507,24 @@
 
 	/* Possible bad flows are checked before filling out props so in case
 	 * of an error it will still be zeroed out.
+	 * Use native port in case of reps
 	 */
-	err = mlx5_query_port_eth_proto_oper(mdev, &eth_prot_oper,
-					     mdev_port_num);
+	if (dev->is_rep)
+		err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
+					   1);
+	else
+		err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
+					   mdev_port_num);
 	if (err)
 		goto out;
+	ext = MLX5_CAP_PCAM_FEATURE(dev->mdev, ptys_extended_ethernet);
+	eth_prot_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper);
 
 	props->active_width     = IB_WIDTH_4X;
 	props->active_speed     = IB_SPEED_QDR;
 
 	translate_eth_proto_oper(eth_prot_oper, &props->active_speed,
-				 &props->active_width);
+				 &props->active_width, ext);
 
 	props->port_cap_flags |= IB_PORT_CM_SUP;
 	props->ip_gids = true;
@@ -428,7 +535,7 @@
 	props->max_msg_sz       = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
 	props->pkey_tbl_len     = 1;
 	props->state            = IB_PORT_DOWN;
-	props->phys_state       = 3;
+	props->phys_state       = IB_PORT_PHYS_STATE_DISABLED;
 
 	mlx5_query_nic_vport_qkey_viol_cntr(mdev, &qkey_viol_cntr);
 	props->qkey_viol_cntr = qkey_viol_cntr;
@@ -441,7 +548,7 @@
 	if (!ndev)
 		goto out;
 
-	if (mlx5_lag_is_active(dev->mdev)) {
+	if (dev->lag_active) {
 		rcu_read_lock();
 		upper = netdev_master_upper_dev_get_rcu(ndev);
 		if (upper) {
@@ -454,7 +561,7 @@
 
 	if (netif_running(ndev) && netif_carrier_ok(ndev)) {
 		props->state      = IB_PORT_ACTIVE;
-		props->phys_state = 5;
+		props->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
 	}
 
 	ndev_ib_mtu = iboe_get_mtu(ndev->mtu);
@@ -473,20 +580,17 @@
 			 const struct ib_gid_attr *attr)
 {
 	enum ib_gid_type gid_type = IB_GID_TYPE_IB;
+	u16 vlan_id = 0xffff;
 	u8 roce_version = 0;
 	u8 roce_l3_type = 0;
-	bool vlan = false;
 	u8 mac[ETH_ALEN];
-	u16 vlan_id = 0;
+	int ret;
 
 	if (gid) {
 		gid_type = attr->gid_type;
-		ether_addr_copy(mac, attr->ndev->dev_addr);
-
-		if (is_vlan_dev(attr->ndev)) {
-			vlan = true;
-			vlan_id = vlan_dev_vlan_id(attr->ndev);
-		}
+		ret = rdma_read_gid_l2_fields(attr, &vlan_id, &mac[0]);
+		if (ret)
+			return ret;
 	}
 
 	switch (gid_type) {
@@ -506,8 +610,9 @@
 	}
 
 	return mlx5_core_roce_gid_set(dev->mdev, index, roce_version,
-				      roce_l3_type, gid->raw, mac, vlan,
-				      vlan_id, port_num);
+				      roce_l3_type, gid->raw, mac,
+				      vlan_id < VLAN_CFI_MASK, vlan_id,
+				      port_num);
 }
 
 static int mlx5_ib_add_gid(const struct ib_gid_attr *attr,
@@ -784,7 +889,7 @@
 	}
 	props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
 	if (MLX5_CAP_GEN(mdev, sho)) {
-		props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER;
+		props->device_cap_flags |= IB_DEVICE_INTEGRITY_HANDOVER;
 		/* At this stage no support for signature handover */
 		props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
 				      IB_PROT_T10DIF_TYPE_2 |
@@ -904,6 +1009,8 @@
 	props->max_srq_sge	   = max_rq_sg - 1;
 	props->max_fast_reg_page_list_len =
 		1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
+	props->max_pi_fast_reg_page_list_len =
+		props->max_fast_reg_page_list_len / 2;
 	get_atomic_caps_qp(dev, props);
 	props->masked_atomic_cap   = IB_ATOMIC_NONE;
 	props->max_mcast_grp	   = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
@@ -915,11 +1022,11 @@
 	props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
 	props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	if (MLX5_CAP_GEN(mdev, pg))
-		props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
-	props->odp_caps = dev->odp_caps;
-#endif
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+		if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
+			props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
+		props->odp_caps = dev->odp_caps;
+	}
 
 	if (MLX5_CAP_GEN(mdev, cd))
 		props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
@@ -939,15 +1046,19 @@
 	}
 
 	if (MLX5_CAP_GEN(mdev, tag_matching)) {
-		props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE;
 		props->tm_caps.max_num_tags =
 			(1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1;
-		props->tm_caps.flags = IB_TM_CAP_RC;
 		props->tm_caps.max_ops =
 			1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
 		props->tm_caps.max_sge = MLX5_TM_MAX_SGE;
 	}
 
+	if (MLX5_CAP_GEN(mdev, tag_matching) &&
+	    MLX5_CAP_GEN(mdev, rndv_offload_rc)) {
+		props->tm_caps.flags = IB_TM_CAP_RNDV_RC;
+		props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE;
+	}
+
 	if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) {
 		props->cq_caps.max_cq_moderation_count =
 						MLX5_MAX_CQ_COUNT;
@@ -1014,6 +1125,11 @@
 
 		if (MLX5_CAP_GEN(mdev, cqe_128_always))
 			resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD;
+		if (MLX5_CAP_GEN(mdev, qp_packet_based))
+			resp.flags |=
+				MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE;
+
+		resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT;
 	}
 
 	if (field_avail(typeof(resp), sw_parsing_caps,
@@ -1101,6 +1217,8 @@
 
 	if (active_width & MLX5_IB_WIDTH_1X)
 		*ib_width = IB_WIDTH_1X;
+	else if (active_width & MLX5_IB_WIDTH_2X)
+		*ib_width = IB_WIDTH_2X;
 	else if (active_width & MLX5_IB_WIDTH_4X)
 		*ib_width = IB_WIDTH_4X;
 	else if (active_width & MLX5_IB_WIDTH_8X)
@@ -1216,6 +1334,9 @@
 	props->subnet_timeout	= rep->subnet_timeout;
 	props->init_type_reply	= rep->init_type_reply;
 
+	if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP)
+		props->port_cap_flags2 = rep->cap_mask2;
+
 	err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port);
 	if (err)
 		goto out;
@@ -1295,7 +1416,9 @@
 {
 	int ret;
 
-	/* Only link layer == ethernet is valid for representors */
+	/* Only link layer == ethernet is valid for representors
+	 * and we always use port 1
+	 */
 	ret = mlx5_query_port_roce(ibdev, port, props);
 	if (ret || !props)
 		return ret;
@@ -1564,14 +1687,57 @@
 			mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]);
 }
 
-static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn)
+int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
+{
+	int err = 0;
+
+	mutex_lock(&dev->lb.mutex);
+	if (td)
+		dev->lb.user_td++;
+	if (qp)
+		dev->lb.qps++;
+
+	if (dev->lb.user_td == 2 ||
+	    dev->lb.qps == 1) {
+		if (!dev->lb.enabled) {
+			err = mlx5_nic_vport_update_local_lb(dev->mdev, true);
+			dev->lb.enabled = true;
+		}
+	}
+
+	mutex_unlock(&dev->lb.mutex);
+
+	return err;
+}
+
+void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
+{
+	mutex_lock(&dev->lb.mutex);
+	if (td)
+		dev->lb.user_td--;
+	if (qp)
+		dev->lb.qps--;
+
+	if (dev->lb.user_td == 1 &&
+	    dev->lb.qps == 0) {
+		if (dev->lb.enabled) {
+			mlx5_nic_vport_update_local_lb(dev->mdev, false);
+			dev->lb.enabled = false;
+		}
+	}
+
+	mutex_unlock(&dev->lb.mutex);
+}
+
+static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn,
+					  u16 uid)
 {
 	int err;
 
 	if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
 		return 0;
 
-	err = mlx5_core_alloc_transport_domain(dev->mdev, tdn);
+	err = mlx5_cmd_alloc_transport_domain(dev->mdev, tdn, uid);
 	if (err)
 		return err;
 
@@ -1580,45 +1746,34 @@
 	     !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
 		return err;
 
-	mutex_lock(&dev->lb_mutex);
-	dev->user_td++;
-
-	if (dev->user_td == 2)
-		err = mlx5_nic_vport_update_local_lb(dev->mdev, true);
-
-	mutex_unlock(&dev->lb_mutex);
-	return err;
+	return mlx5_ib_enable_lb(dev, true, false);
 }
 
-static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn)
+static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn,
+					     u16 uid)
 {
 	if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
 		return;
 
-	mlx5_core_dealloc_transport_domain(dev->mdev, tdn);
+	mlx5_cmd_dealloc_transport_domain(dev->mdev, tdn, uid);
 
 	if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
 	    (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) &&
 	     !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
 		return;
 
-	mutex_lock(&dev->lb_mutex);
-	dev->user_td--;
-
-	if (dev->user_td < 2)
-		mlx5_nic_vport_update_local_lb(dev->mdev, false);
-
-	mutex_unlock(&dev->lb_mutex);
+	mlx5_ib_disable_lb(dev, true, false);
 }
 
-static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
-						  struct ib_udata *udata)
+static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
+				  struct ib_udata *udata)
 {
+	struct ib_device *ibdev = uctx->device;
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
 	struct mlx5_ib_alloc_ucontext_req_v2 req = {};
 	struct mlx5_ib_alloc_ucontext_resp resp = {};
 	struct mlx5_core_dev *mdev = dev->mdev;
-	struct mlx5_ib_ucontext *context;
+	struct mlx5_ib_ucontext *context = to_mucontext(uctx);
 	struct mlx5_bfreg_info *bfregi;
 	int ver;
 	int err;
@@ -1628,29 +1783,29 @@
 	bool lib_uar_4k;
 
 	if (!dev->ib_active)
-		return ERR_PTR(-EAGAIN);
+		return -EAGAIN;
 
 	if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
 		ver = 0;
 	else if (udata->inlen >= min_req_v2)
 		ver = 2;
 	else
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
 	if (err)
-		return ERR_PTR(err);
+		return err;
 
 	if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX)
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 
 	if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 
 	req.total_num_bfregs = ALIGN(req.total_num_bfregs,
 				    MLX5_NON_FP_BFREGS_PER_UAR);
 	if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
 	if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
@@ -1683,10 +1838,6 @@
 		/* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */
 	}
 
-	context = kzalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return ERR_PTR(-ENOMEM);
-
 	lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
 	bfregi = &context->bfregi;
 
@@ -1716,34 +1867,24 @@
 	if (err)
 		goto out_sys_pages;
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
-#endif
-
-	err = mlx5_ib_alloc_transport_domain(dev, &context->tdn);
-	if (err)
-		goto out_uars;
-
 	if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
-		/* Block DEVX on Infiniband as of SELinux */
-		if (mlx5_ib_port_link_layer(ibdev, 1) != IB_LINK_LAYER_ETHERNET) {
-			err = -EPERM;
-			goto out_td;
-		}
-
-		err = mlx5_ib_devx_create(dev, context);
-		if (err)
-			goto out_td;
+		err = mlx5_ib_devx_create(dev, true);
+		if (err < 0)
+			goto out_uars;
+		context->devx_uid = err;
 	}
 
+	err = mlx5_ib_alloc_transport_domain(dev, &context->tdn,
+					     context->devx_uid);
+	if (err)
+		goto out_devx;
+
 	if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
 		err = mlx5_cmd_dump_fill_mkey(dev->mdev, &dump_fill_mkey);
 		if (err)
 			goto out_mdev;
 	}
 
-	INIT_LIST_HEAD(&context->vma_private_list);
-	mutex_init(&context->vma_private_list_mutex);
 	INIT_LIST_HEAD(&context->db_page_list);
 	mutex_init(&context->db_page_mutex);
 
@@ -1819,13 +1960,21 @@
 	context->lib_caps = req.lib_caps;
 	print_lib_caps(dev, context->lib_caps);
 
-	return &context->ibucontext;
+	if (dev->lag_active) {
+		u8 port = mlx5_core_native_port_num(dev->mdev) - 1;
+
+		atomic_set(&context->tx_port_affinity,
+			   atomic_add_return(
+				   1, &dev->port[port].roce.tx_port_affinity));
+	}
+
+	return 0;
 
 out_mdev:
+	mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
+out_devx:
 	if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX)
-		mlx5_ib_devx_destroy(dev, context);
-out_td:
-	mlx5_ib_dealloc_transport_domain(dev, context->tdn);
+		mlx5_ib_devx_destroy(dev, context->devx_uid);
 
 out_uars:
 	deallocate_uars(dev, context);
@@ -1837,29 +1986,24 @@
 	kfree(bfregi->count);
 
 out_ctx:
-	kfree(context);
-
-	return ERR_PTR(err);
+	return err;
 }
 
-static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 {
 	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
 	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
 	struct mlx5_bfreg_info *bfregi;
 
-	if (context->devx_uid)
-		mlx5_ib_devx_destroy(dev, context);
-
 	bfregi = &context->bfregi;
-	mlx5_ib_dealloc_transport_domain(dev, context->tdn);
+	mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
+
+	if (context->devx_uid)
+		mlx5_ib_devx_destroy(dev, context->devx_uid);
 
 	deallocate_uars(dev, context);
 	kfree(bfregi->sys_pages);
 	kfree(bfregi->count);
-	kfree(context);
-
-	return 0;
 }
 
 static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
@@ -1869,7 +2013,7 @@
 
 	fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1;
 
-	return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + uar_idx / fw_uars_per_page;
+	return (dev->mdev->bar_addr >> PAGE_SHIFT) + uar_idx / fw_uars_per_page;
 }
 
 static int get_command(unsigned long offset)
@@ -1893,94 +2037,9 @@
 	return get_arg(offset) | ((offset >> 16) & 0xff) << 8;
 }
 
-static void  mlx5_ib_vma_open(struct vm_area_struct *area)
-{
-	/* vma_open is called when a new VMA is created on top of our VMA.  This
-	 * is done through either mremap flow or split_vma (usually due to
-	 * mlock, madvise, munmap, etc.) We do not support a clone of the VMA,
-	 * as this VMA is strongly hardware related.  Therefore we set the
-	 * vm_ops of the newly created/cloned VMA to NULL, to prevent it from
-	 * calling us again and trying to do incorrect actions.  We assume that
-	 * the original VMA size is exactly a single page, and therefore all
-	 * "splitting" operation will not happen to it.
-	 */
-	area->vm_ops = NULL;
-}
-
-static void  mlx5_ib_vma_close(struct vm_area_struct *area)
-{
-	struct mlx5_ib_vma_private_data *mlx5_ib_vma_priv_data;
-
-	/* It's guaranteed that all VMAs opened on a FD are closed before the
-	 * file itself is closed, therefore no sync is needed with the regular
-	 * closing flow. (e.g. mlx5 ib_dealloc_ucontext)
-	 * However need a sync with accessing the vma as part of
-	 * mlx5_ib_disassociate_ucontext.
-	 * The close operation is usually called under mm->mmap_sem except when
-	 * process is exiting.
-	 * The exiting case is handled explicitly as part of
-	 * mlx5_ib_disassociate_ucontext.
-	 */
-	mlx5_ib_vma_priv_data = (struct mlx5_ib_vma_private_data *)area->vm_private_data;
-
-	/* setting the vma context pointer to null in the mlx5_ib driver's
-	 * private data, to protect a race condition in
-	 * mlx5_ib_disassociate_ucontext().
-	 */
-	mlx5_ib_vma_priv_data->vma = NULL;
-	mutex_lock(mlx5_ib_vma_priv_data->vma_private_list_mutex);
-	list_del(&mlx5_ib_vma_priv_data->list);
-	mutex_unlock(mlx5_ib_vma_priv_data->vma_private_list_mutex);
-	kfree(mlx5_ib_vma_priv_data);
-}
-
-static const struct vm_operations_struct mlx5_ib_vm_ops = {
-	.open = mlx5_ib_vma_open,
-	.close = mlx5_ib_vma_close
-};
-
-static int mlx5_ib_set_vma_data(struct vm_area_struct *vma,
-				struct mlx5_ib_ucontext *ctx)
-{
-	struct mlx5_ib_vma_private_data *vma_prv;
-	struct list_head *vma_head = &ctx->vma_private_list;
-
-	vma_prv = kzalloc(sizeof(*vma_prv), GFP_KERNEL);
-	if (!vma_prv)
-		return -ENOMEM;
-
-	vma_prv->vma = vma;
-	vma_prv->vma_private_list_mutex = &ctx->vma_private_list_mutex;
-	vma->vm_private_data = vma_prv;
-	vma->vm_ops =  &mlx5_ib_vm_ops;
-
-	mutex_lock(&ctx->vma_private_list_mutex);
-	list_add(&vma_prv->list, vma_head);
-	mutex_unlock(&ctx->vma_private_list_mutex);
-
-	return 0;
-}
 
 static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
 {
-	struct vm_area_struct *vma;
-	struct mlx5_ib_vma_private_data *vma_private, *n;
-	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
-
-	mutex_lock(&context->vma_private_list_mutex);
-	list_for_each_entry_safe(vma_private, n, &context->vma_private_list,
-				 list) {
-		vma = vma_private->vma;
-		zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE);
-		/* context going to be destroyed, should
-		 * not access ops any more.
-		 */
-		vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
-		vma->vm_ops = NULL;
-		list_del(&vma_private->list);
-		kfree(vma_private);
-	}
-	mutex_unlock(&context->vma_private_list_mutex);
 }
 
 static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
@@ -2003,28 +2062,22 @@
 					struct vm_area_struct *vma,
 					struct mlx5_ib_ucontext *context)
 {
-	phys_addr_t pfn;
-	int err;
-
-	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+	if ((vma->vm_end - vma->vm_start != PAGE_SIZE) ||
+	    !(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
 	if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1)
 		return -EOPNOTSUPP;
 
-	if (vma->vm_flags & VM_WRITE)
+	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
 		return -EPERM;
+	vma->vm_flags &= ~VM_MAYWRITE;
 
-	if (!dev->mdev->clock_info_page)
+	if (!dev->mdev->clock_info)
 		return -EOPNOTSUPP;
 
-	pfn = page_to_pfn(dev->mdev->clock_info_page);
-	err = remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE,
-			      vma->vm_page_prot);
-	if (err)
-		return err;
-
-	return mlx5_ib_set_vma_data(vma, context);
+	return vm_insert_page(vma, vma->vm_start,
+			      virt_to_page(dev->mdev->clock_info));
 }
 
 static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
@@ -2114,21 +2167,15 @@
 	pfn = uar_index2pfn(dev, uar_index);
 	mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
 
-	vma->vm_page_prot = prot;
-	err = io_remap_pfn_range(vma, vma->vm_start, pfn,
-				 PAGE_SIZE, vma->vm_page_prot);
+	err = rdma_user_mmap_io(&context->ibucontext, vma, pfn, PAGE_SIZE,
+				prot);
 	if (err) {
 		mlx5_ib_err(dev,
-			    "io_remap_pfn_range failed with error=%d, mmap_cmd=%s\n",
+			    "rdma_user_mmap_io failed with error=%d, mmap_cmd=%s\n",
 			    err, mmap_cmd2str(cmd));
-		err = -EAGAIN;
 		goto err;
 	}
 
-	err = mlx5_ib_set_vma_data(vma, context);
-	if (err)
-		goto err;
-
 	if (dyn_uar)
 		bfregi->sys_pages[idx] = uar_index;
 	return 0;
@@ -2153,24 +2200,17 @@
 	size_t map_size = vma->vm_end - vma->vm_start;
 	u32 npages = map_size >> PAGE_SHIFT;
 	phys_addr_t pfn;
-	pgprot_t prot;
 
 	if (find_next_zero_bit(mctx->dm_pages, page_idx + npages, page_idx) !=
 	    page_idx + npages)
 		return -EINVAL;
 
-	pfn = ((pci_resource_start(dev->mdev->pdev, 0) +
+	pfn = ((dev->mdev->bar_addr +
 	      MLX5_CAP64_DEV_MEM(dev->mdev, memic_bar_start_addr)) >>
 	      PAGE_SHIFT) +
 	      page_idx;
-	prot = pgprot_writecombine(vma->vm_page_prot);
-	vma->vm_page_prot = prot;
-
-	if (io_remap_pfn_range(vma, vma->vm_start, pfn, map_size,
-			       vma->vm_page_prot))
-		return -EAGAIN;
-
-	return mlx5_ib_set_vma_data(vma, mctx);
+	return rdma_user_mmap_io(context, vma, pfn, map_size,
+				 pgprot_writecombine(vma->vm_page_prot));
 }
 
 static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
@@ -2197,19 +2237,18 @@
 
 		if (vma->vm_flags & VM_WRITE)
 			return -EPERM;
+		vma->vm_flags &= ~VM_MAYWRITE;
 
 		/* Don't expose to user-space information it shouldn't have */
 		if (PAGE_SIZE > 4096)
 			return -EOPNOTSUPP;
 
-		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 		pfn = (dev->mdev->iseg_base +
 		       offsetof(struct mlx5_init_seg, internal_timer_h)) >>
 			PAGE_SHIFT;
-		if (io_remap_pfn_range(vma, vma->vm_start, pfn,
-				       PAGE_SIZE, vma->vm_page_prot))
-			return -EAGAIN;
-		break;
+		return rdma_user_mmap_io(&context->ibucontext, vma, pfn,
+					 PAGE_SIZE,
+					 pgprot_noncached(vma->vm_page_prot));
 	case MLX5_IB_MMAP_CLOCK_INFO:
 		return mlx5_ib_mmap_clock_info_page(dev, vma, context);
 
@@ -2223,126 +2262,250 @@
 	return 0;
 }
 
-struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
-			       struct ib_ucontext *context,
-			       struct ib_dm_alloc_attr *attr,
-			       struct uverbs_attr_bundle *attrs)
+static inline int check_dm_type_support(struct mlx5_ib_dev *dev,
+					u32 type)
 {
-	u64 act_size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE);
-	struct mlx5_memic *memic = &to_mdev(ibdev)->memic;
-	phys_addr_t memic_addr;
-	struct mlx5_ib_dm *dm;
+	switch (type) {
+	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
+		if (!MLX5_CAP_DEV_MEM(dev->mdev, memic))
+			return -EOPNOTSUPP;
+		break;
+	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
+		if (!capable(CAP_SYS_RAWIO) ||
+		    !capable(CAP_NET_RAW))
+			return -EPERM;
+
+		if (!(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner) ||
+		      MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, sw_owner)))
+			return -EOPNOTSUPP;
+		break;
+	}
+
+	return 0;
+}
+
+static int handle_alloc_dm_memic(struct ib_ucontext *ctx,
+				 struct mlx5_ib_dm *dm,
+				 struct ib_dm_alloc_attr *attr,
+				 struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_dm *dm_db = &to_mdev(ctx->device)->dm;
 	u64 start_offset;
 	u32 page_idx;
 	int err;
 
-	dm = kzalloc(sizeof(*dm), GFP_KERNEL);
-	if (!dm)
-		return ERR_PTR(-ENOMEM);
+	dm->size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE);
 
-	mlx5_ib_dbg(to_mdev(ibdev), "alloc_memic req: user_length=0x%llx act_length=0x%llx log_alignment=%d\n",
-		    attr->length, act_size, attr->alignment);
-
-	err = mlx5_cmd_alloc_memic(memic, &memic_addr,
-				   act_size, attr->alignment);
+	err = mlx5_cmd_alloc_memic(dm_db, &dm->dev_addr,
+				   dm->size, attr->alignment);
 	if (err)
-		goto err_free;
+		return err;
 
-	start_offset = memic_addr & ~PAGE_MASK;
-	page_idx = (memic_addr - pci_resource_start(memic->dev->pdev, 0) -
-		    MLX5_CAP64_DEV_MEM(memic->dev, memic_bar_start_addr)) >>
+	page_idx = (dm->dev_addr - pci_resource_start(dm_db->dev->pdev, 0) -
+		    MLX5_CAP64_DEV_MEM(dm_db->dev, memic_bar_start_addr)) >>
 		    PAGE_SHIFT;
 
 	err = uverbs_copy_to(attrs,
-			     MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
-			     &start_offset, sizeof(start_offset));
-	if (err)
-		goto err_dealloc;
-
-	err = uverbs_copy_to(attrs,
 			     MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
 			     &page_idx, sizeof(page_idx));
 	if (err)
 		goto err_dealloc;
 
-	bitmap_set(to_mucontext(context)->dm_pages, page_idx,
-		   DIV_ROUND_UP(act_size, PAGE_SIZE));
+	start_offset = dm->dev_addr & ~PAGE_MASK;
+	err = uverbs_copy_to(attrs,
+			     MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
+			     &start_offset, sizeof(start_offset));
+	if (err)
+		goto err_dealloc;
 
-	dm->dev_addr = memic_addr;
+	bitmap_set(to_mucontext(ctx)->dm_pages, page_idx,
+		   DIV_ROUND_UP(dm->size, PAGE_SIZE));
+
+	return 0;
+
+err_dealloc:
+	mlx5_cmd_dealloc_memic(dm_db, dm->dev_addr, dm->size);
+
+	return err;
+}
+
+static int handle_alloc_dm_sw_icm(struct ib_ucontext *ctx,
+				  struct mlx5_ib_dm *dm,
+				  struct ib_dm_alloc_attr *attr,
+				  struct uverbs_attr_bundle *attrs,
+				  int type)
+{
+	struct mlx5_core_dev *dev = to_mdev(ctx->device)->mdev;
+	u64 act_size;
+	int err;
+
+	/* Allocation size must be a multiple of the basic block size
+	 * and a power of 2.
+	 */
+	act_size = round_up(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dev));
+	act_size = roundup_pow_of_two(act_size);
+
+	dm->size = act_size;
+	err = mlx5_dm_sw_icm_alloc(dev, type, act_size,
+				   to_mucontext(ctx)->devx_uid, &dm->dev_addr,
+				   &dm->icm_dm.obj_id);
+	if (err)
+		return err;
+
+	err = uverbs_copy_to(attrs,
+			     MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
+			     &dm->dev_addr, sizeof(dm->dev_addr));
+	if (err)
+		mlx5_dm_sw_icm_dealloc(dev, type, dm->size,
+				       to_mucontext(ctx)->devx_uid, dm->dev_addr,
+				       dm->icm_dm.obj_id);
+
+	return err;
+}
+
+struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
+			       struct ib_ucontext *context,
+			       struct ib_dm_alloc_attr *attr,
+			       struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_ib_dm *dm;
+	enum mlx5_ib_uapi_dm_type type;
+	int err;
+
+	err = uverbs_get_const_default(&type, attrs,
+				       MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE,
+				       MLX5_IB_UAPI_DM_TYPE_MEMIC);
+	if (err)
+		return ERR_PTR(err);
+
+	mlx5_ib_dbg(to_mdev(ibdev), "alloc_dm req: dm_type=%d user_length=0x%llx log_alignment=%d\n",
+		    type, attr->length, attr->alignment);
+
+	err = check_dm_type_support(to_mdev(ibdev), type);
+	if (err)
+		return ERR_PTR(err);
+
+	dm = kzalloc(sizeof(*dm), GFP_KERNEL);
+	if (!dm)
+		return ERR_PTR(-ENOMEM);
+
+	dm->type = type;
+
+	switch (type) {
+	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
+		err = handle_alloc_dm_memic(context, dm,
+					    attr,
+					    attrs);
+		break;
+	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+		err = handle_alloc_dm_sw_icm(context, dm,
+					     attr, attrs,
+					     MLX5_SW_ICM_TYPE_STEERING);
+		break;
+	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
+		err = handle_alloc_dm_sw_icm(context, dm,
+					     attr, attrs,
+					     MLX5_SW_ICM_TYPE_HEADER_MODIFY);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+	}
+
+	if (err)
+		goto err_free;
 
 	return &dm->ibdm;
 
-err_dealloc:
-	mlx5_cmd_dealloc_memic(memic, memic_addr,
-			       act_size);
 err_free:
 	kfree(dm);
 	return ERR_PTR(err);
 }
 
-int mlx5_ib_dealloc_dm(struct ib_dm *ibdm)
+int mlx5_ib_dealloc_dm(struct ib_dm *ibdm, struct uverbs_attr_bundle *attrs)
 {
-	struct mlx5_memic *memic = &to_mdev(ibdm->device)->memic;
+	struct mlx5_ib_ucontext *ctx = rdma_udata_to_drv_context(
+		&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
+	struct mlx5_core_dev *dev = to_mdev(ibdm->device)->mdev;
+	struct mlx5_dm *dm_db = &to_mdev(ibdm->device)->dm;
 	struct mlx5_ib_dm *dm = to_mdm(ibdm);
-	u64 act_size = roundup(dm->ibdm.length, MLX5_MEMIC_BASE_SIZE);
 	u32 page_idx;
 	int ret;
 
-	ret = mlx5_cmd_dealloc_memic(memic, dm->dev_addr, act_size);
-	if (ret)
-		return ret;
+	switch (dm->type) {
+	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
+		ret = mlx5_cmd_dealloc_memic(dm_db, dm->dev_addr, dm->size);
+		if (ret)
+			return ret;
 
-	page_idx = (dm->dev_addr - pci_resource_start(memic->dev->pdev, 0) -
-		    MLX5_CAP64_DEV_MEM(memic->dev, memic_bar_start_addr)) >>
-		    PAGE_SHIFT;
-	bitmap_clear(to_mucontext(ibdm->uobject->context)->dm_pages,
-		     page_idx,
-		     DIV_ROUND_UP(act_size, PAGE_SIZE));
+		page_idx = (dm->dev_addr - pci_resource_start(dev->pdev, 0) -
+			    MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr)) >>
+			    PAGE_SHIFT;
+		bitmap_clear(ctx->dm_pages, page_idx,
+			     DIV_ROUND_UP(dm->size, PAGE_SIZE));
+		break;
+	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+		ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_STEERING,
+					     dm->size, ctx->devx_uid, dm->dev_addr,
+					     dm->icm_dm.obj_id);
+		if (ret)
+			return ret;
+		break;
+	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
+		ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_HEADER_MODIFY,
+					     dm->size, ctx->devx_uid, dm->dev_addr,
+					     dm->icm_dm.obj_id);
+		if (ret)
+			return ret;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
 
 	kfree(dm);
 
 	return 0;
 }
 
-static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
-				      struct ib_ucontext *context,
-				      struct ib_udata *udata)
+static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
+	struct mlx5_ib_pd *pd = to_mpd(ibpd);
+	struct ib_device *ibdev = ibpd->device;
 	struct mlx5_ib_alloc_pd_resp resp;
-	struct mlx5_ib_pd *pd;
 	int err;
+	u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(alloc_pd_in)]   = {};
+	u16 uid = 0;
+	struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
 
-	pd = kmalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd)
-		return ERR_PTR(-ENOMEM);
+	uid = context ? context->devx_uid : 0;
+	MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
+	MLX5_SET(alloc_pd_in, in, uid, uid);
+	err = mlx5_cmd_exec(to_mdev(ibdev)->mdev, in, sizeof(in),
+			    out, sizeof(out));
+	if (err)
+		return err;
 
-	err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn);
-	if (err) {
-		kfree(pd);
-		return ERR_PTR(err);
-	}
-
-	if (context) {
+	pd->pdn = MLX5_GET(alloc_pd_out, out, pd);
+	pd->uid = uid;
+	if (udata) {
 		resp.pdn = pd->pdn;
 		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
-			mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn);
-			kfree(pd);
-			return ERR_PTR(-EFAULT);
+			mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid);
+			return -EFAULT;
 		}
 	}
 
-	return &pd->ibpd;
+	return 0;
 }
 
-static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
+static void mlx5_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *mdev = to_mdev(pd->device);
 	struct mlx5_ib_pd *mpd = to_mpd(pd);
 
-	mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn);
-	kfree(mpd);
-
-	return 0;
+	mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid);
 }
 
 enum {
@@ -2376,10 +2539,29 @@
 	return match_criteria_enable;
 }
 
-static void set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
+static int set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
 {
-	MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
-	MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
+	u8 entry_mask;
+	u8 entry_val;
+	int err = 0;
+
+	if (!mask)
+		goto out;
+
+	entry_mask = MLX5_GET(fte_match_set_lyr_2_4, outer_c,
+			      ip_protocol);
+	entry_val = MLX5_GET(fte_match_set_lyr_2_4, outer_v,
+			     ip_protocol);
+	if (!entry_mask) {
+		MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
+		MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
+		goto out;
+	}
+	/* Don't override existing ip protocol */
+	if (mask != entry_mask || val != entry_val)
+		err = -EINVAL;
+out:
+	return err;
 }
 
 static void set_flow_label(void *misc_c, void *misc_v, u32 mask, u32 val,
@@ -2445,30 +2627,65 @@
 		   offsetof(typeof(filter), field) -\
 		   sizeof(filter.field))
 
-static int parse_flow_flow_action(const union ib_flow_spec *ib_spec,
-				  const struct ib_flow_attr *flow_attr,
-				  struct mlx5_flow_act *action)
+int parse_flow_flow_action(struct mlx5_ib_flow_action *maction,
+			   bool is_egress,
+			   struct mlx5_flow_act *action)
 {
-	struct mlx5_ib_flow_action *maction = to_mflow_act(ib_spec->action.act);
 
 	switch (maction->ib_action.type) {
 	case IB_FLOW_ACTION_ESP:
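+		/* A rule may carry at most one ESP crypto action. */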
+		if (action->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
+				      MLX5_FLOW_CONTEXT_ACTION_DECRYPT))
+			return -EINVAL;
 		/* Currently only AES_GCM keymat is supported by the driver */
 		action->esp_id = (uintptr_t)maction->esp_aes_gcm.ctx;
-		action->action |= flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS ?
+		action->action |= is_egress ?
 			MLX5_FLOW_CONTEXT_ACTION_ENCRYPT :
 			MLX5_FLOW_CONTEXT_ACTION_DECRYPT;
 		return 0;
+	case IB_FLOW_ACTION_UNSPECIFIED:
+		if (maction->flow_action_raw.sub_type ==
+		    MLX5_IB_FLOW_ACTION_MODIFY_HEADER) {
+			if (action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
+				return -EINVAL;
+			action->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
+			action->modify_hdr =
+				maction->flow_action_raw.modify_hdr;
+			return 0;
+		}
+		if (maction->flow_action_raw.sub_type ==
+		    MLX5_IB_FLOW_ACTION_DECAP) {
+			if (action->action & MLX5_FLOW_CONTEXT_ACTION_DECAP)
+				return -EINVAL;
+			action->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP;
+			return 0;
+		}
+		if (maction->flow_action_raw.sub_type ==
+		    MLX5_IB_FLOW_ACTION_PACKET_REFORMAT) {
+			if (action->action &
+			    MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
+				return -EINVAL;
+			action->action |=
+				MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
+			action->pkt_reformat =
+				maction->flow_action_raw.pkt_reformat;
+			return 0;
+		}
+		/* fall through */
 	default:
 		return -EOPNOTSUPP;
 	}
 }
 
-static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c,
-			   u32 *match_v, const union ib_flow_spec *ib_spec,
+static int parse_flow_attr(struct mlx5_core_dev *mdev,
+			   struct mlx5_flow_spec *spec,
+			   const union ib_flow_spec *ib_spec,
 			   const struct ib_flow_attr *flow_attr,
 			   struct mlx5_flow_act *action, u32 prev_type)
 {
+	struct mlx5_flow_context *flow_context = &spec->flow_context;
+	u32 *match_c = spec->match_criteria;
+	u32 *match_v = spec->match_value;
 	void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
 					   misc_parameters);
 	void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v,
@@ -2583,8 +2800,10 @@
 		set_tos(headers_c, headers_v,
 			ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos);
 
-		set_proto(headers_c, headers_v,
-			  ib_spec->ipv4.mask.proto, ib_spec->ipv4.val.proto);
+		if (set_proto(headers_c, headers_v,
+			      ib_spec->ipv4.mask.proto,
+			      ib_spec->ipv4.val.proto))
+			return -EINVAL;
 		break;
 	case IB_FLOW_SPEC_IPV6:
 		if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
@@ -2623,9 +2842,10 @@
 			ib_spec->ipv6.mask.traffic_class,
 			ib_spec->ipv6.val.traffic_class);
 
-		set_proto(headers_c, headers_v,
-			  ib_spec->ipv6.mask.next_hdr,
-			  ib_spec->ipv6.val.next_hdr);
+		if (set_proto(headers_c, headers_v,
+			      ib_spec->ipv6.mask.next_hdr,
+			      ib_spec->ipv6.val.next_hdr))
+			return -EINVAL;
 
 		set_flow_label(misc_params_c, misc_params_v,
 			       ntohl(ib_spec->ipv6.mask.flow_label),
@@ -2646,10 +2866,8 @@
 					 LAST_TCP_UDP_FIELD))
 			return -EOPNOTSUPP;
 
-		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
-			 0xff);
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
-			 IPPROTO_TCP);
+		if (set_proto(headers_c, headers_v, 0xff, IPPROTO_TCP))
+			return -EINVAL;
 
 		MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport,
 			 ntohs(ib_spec->tcp_udp.mask.src_port));
@@ -2666,10 +2884,8 @@
 					 LAST_TCP_UDP_FIELD))
 			return -EOPNOTSUPP;
 
-		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
-			 0xff);
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
-			 IPPROTO_UDP);
+		if (set_proto(headers_c, headers_v, 0xff, IPPROTO_UDP))
+			return -EINVAL;
 
 		MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport,
 			 ntohs(ib_spec->tcp_udp.mask.src_port));
@@ -2685,6 +2901,9 @@
 		if (ib_spec->gre.mask.c_ks_res0_ver)
 			return -EOPNOTSUPP;
 
+		if (set_proto(headers_c, headers_v, 0xff, IPPROTO_GRE))
+			return -EINVAL;
+
 		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
 			 0xff);
 		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
@@ -2696,11 +2915,11 @@
 			 ntohs(ib_spec->gre.val.protocol));
 
 		memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_c,
-				    gre_key_h),
+				    gre_key.nvgre.hi),
 		       &ib_spec->gre.mask.key,
 		       sizeof(ib_spec->gre.mask.key));
 		memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_v,
-				    gre_key_h),
+				    gre_key.nvgre.hi),
 		       &ib_spec->gre.val.key,
 		       sizeof(ib_spec->gre.val.key));
 		break;
@@ -2785,8 +3004,8 @@
 		if (ib_spec->flow_tag.tag_id >= BIT(24))
 			return -EINVAL;
 
-		action->flow_tag = ib_spec->flow_tag.tag_id;
-		action->has_flow_tag = true;
+		flow_context->flow_tag = ib_spec->flow_tag.tag_id;
+		flow_context->flags |= FLOW_CONTEXT_HAS_TAG;
 		break;
 	case IB_FLOW_SPEC_ACTION_DROP:
 		if (FIELDS_NOT_SUPPORTED(ib_spec->drop,
@@ -2795,7 +3014,8 @@
 		action->action |= MLX5_FLOW_CONTEXT_ACTION_DROP;
 		break;
 	case IB_FLOW_SPEC_ACTION_HANDLE:
-		ret = parse_flow_flow_action(ib_spec, flow_attr, action);
+		ret = parse_flow_flow_action(to_mflow_act(ib_spec->action.act),
+			flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS, action);
 		if (ret)
 			return ret;
 		break;
@@ -2876,10 +3096,11 @@
 	 * rules would be supported, always return VALID_SPEC_NA.
 	 */
 	if (!is_crypto)
-		return egress ? VALID_SPEC_INVALID : VALID_SPEC_NA;
+		return VALID_SPEC_NA;
 
 	return is_crypto && is_ipsec &&
-		(!egress || (!is_drop && !flow_act->has_flow_tag)) ?
+		(!egress || (!is_drop &&
+			     !(spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG))) ?
 		VALID_SPEC_VALID : VALID_SPEC_INVALID;
 }
 
@@ -3019,14 +3240,15 @@
 static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
 					   struct mlx5_ib_flow_prio *prio,
 					   int priority,
-					   int num_entries, int num_groups)
+					   int num_entries, int num_groups,
+					   u32 flags)
 {
 	struct mlx5_flow_table *ft;
 
 	ft = mlx5_create_auto_grouped_flow_table(ns, priority,
 						 num_entries,
 						 num_groups,
-						 0, 0);
+						 0, flags);
 	if (IS_ERR(ft))
 		return ERR_CAST(ft);
 
@@ -3046,26 +3268,46 @@
 	int max_table_size;
 	int num_entries;
 	int num_groups;
+	bool esw_encap;
+	u32 flags = 0;
 	int priority;
 
 	max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
 						       log_max_ft_size));
+	esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
+		DEVLINK_ESWITCH_ENCAP_MODE_NONE;
 	if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
-		if (ft_type == MLX5_IB_FT_TX)
-			priority = 0;
-		else if (flow_is_multicast_only(flow_attr) &&
-			 !dont_trap)
+		enum mlx5_flow_namespace_type fn_type;
+
+		if (flow_is_multicast_only(flow_attr) &&
+		    !dont_trap)
 			priority = MLX5_IB_FLOW_MCAST_PRIO;
 		else
 			priority = ib_prio_to_core_prio(flow_attr->priority,
 							dont_trap);
-		ns = mlx5_get_flow_namespace(dev->mdev,
-					     ft_type == MLX5_IB_FT_TX ?
-					     MLX5_FLOW_NAMESPACE_EGRESS :
-					     MLX5_FLOW_NAMESPACE_BYPASS);
+		if (ft_type == MLX5_IB_FT_RX) {
+			fn_type = MLX5_FLOW_NAMESPACE_BYPASS;
+			prio = &dev->flow_db->prios[priority];
+			if (!dev->is_rep && !esw_encap &&
+			    MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap))
+				flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
+			if (!dev->is_rep && !esw_encap &&
+			    MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
+					reformat_l3_tunnel_to_l2))
+				flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
+		} else {
+			max_table_size =
+				BIT(MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev,
+							      log_max_ft_size));
+			fn_type = MLX5_FLOW_NAMESPACE_EGRESS;
+			prio = &dev->flow_db->egress_prios[priority];
+			if (!dev->is_rep && !esw_encap &&
+			    MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat))
+				flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
+		}
+		ns = mlx5_get_flow_namespace(dev->mdev, fn_type);
 		num_entries = MLX5_FS_MAX_ENTRIES;
 		num_groups = MLX5_FS_MAX_TYPES;
-		prio = &dev->flow_db->prios[priority];
 	} else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
 		   flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
 		ns = mlx5_get_flow_namespace(dev->mdev,
@@ -3092,12 +3334,12 @@
 	if (!ns)
 		return ERR_PTR(-ENOTSUPP);
 
-	if (num_entries > max_table_size)
-		return ERR_PTR(-ENOMEM);
+	max_table_size = min_t(int, num_entries, max_table_size);
 
 	ft = prio->flow_table;
 	if (!ft)
-		return _get_prio(ns, prio, priority, num_entries, num_groups);
+		return _get_prio(ns, prio, priority, max_table_size, num_groups,
+				 flags);
 
 	return prio;
 }
@@ -3241,6 +3483,37 @@
 	return ret;
 }
 
+static void mlx5_ib_set_rule_source_port(struct mlx5_ib_dev *dev,
+					 struct mlx5_flow_spec *spec,
+					 struct mlx5_eswitch_rep *rep)
+{
+	struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
+	void *misc;
+
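+	/* Restrict the rule to traffic coming from the representor's vport:
+	 * match on the eswitch metadata register when metadata matching is
+	 * enabled, otherwise match on the explicit source_port field.
+	 */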
+	if (mlx5_eswitch_vport_match_metadata_enabled(esw)) {
+		misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+				    misc_parameters_2);
+
+		MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0,
+			 mlx5_eswitch_get_vport_metadata_for_match(esw,
+								   rep->vport));
+		misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+				    misc_parameters_2);
+
+		MLX5_SET_TO_ONES(fte_match_set_misc2, misc, metadata_reg_c_0);
+	} else {
+		misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+				    misc_parameters);
+
+		MLX5_SET(fte_match_set_misc, misc, source_port, rep->vport);
+
+		misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+				    misc_parameters);
+
+		MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
+	}
+}
+
 static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
 						      struct mlx5_ib_flow_prio *ft_prio,
 						      const struct ib_flow_attr *flow_attr,
@@ -3250,7 +3523,7 @@
 {
 	struct mlx5_flow_table	*ft = ft_prio->flow_table;
 	struct mlx5_ib_flow_handler *handler;
-	struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG};
+	struct mlx5_flow_act flow_act = {};
 	struct mlx5_flow_spec *spec;
 	struct mlx5_flow_destination dest_arr[2] = {};
 	struct mlx5_flow_destination *rule_dst = dest_arr;
@@ -3264,6 +3537,9 @@
 	if (!is_valid_attr(dev->mdev, flow_attr))
 		return ERR_PTR(-EINVAL);
 
+	if (dev->is_rep && is_egress)
+		return ERR_PTR(-EINVAL);
+
 	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
 	handler = kzalloc(sizeof(*handler), GFP_KERNEL);
 	if (!handler || !spec) {
@@ -3278,8 +3554,7 @@
 	}
 
 	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
-		err = parse_flow_attr(dev->mdev, spec->match_criteria,
-				      spec->match_value,
+		err = parse_flow_attr(dev->mdev, spec,
 				      ib_flow, flow_attr, &flow_act,
 				      prev_type);
 		if (err < 0)
@@ -3292,16 +3567,16 @@
 	if (!flow_is_multicast_only(flow_attr))
 		set_underlay_qp(dev, spec, underlay_qpn);
 
-	if (dev->rep) {
-		void *misc;
+	if (dev->is_rep) {
+		struct mlx5_eswitch_rep *rep;
 
-		misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
-				    misc_parameters);
-		MLX5_SET(fte_match_set_misc, misc, source_port,
-			 dev->rep->vport);
-		misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
-				    misc_parameters);
-		MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
+		rep = dev->port[flow_attr->port - 1].rep;
+		if (!rep) {
+			err = -EINVAL;
+			goto free;
+		}
+
+		mlx5_ib_set_rule_source_port(dev, spec, rep);
 	}
 
 	spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
@@ -3313,15 +3588,18 @@
 	}
 
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+		struct mlx5_ib_mcounters *mcounters;
+
 		err = flow_counters_set_data(flow_act.counters, ucmd);
 		if (err)
 			goto free;
 
+		mcounters = to_mcounters(flow_act.counters);
 		handler->ibcounters = flow_act.counters;
 		dest_arr[dest_num].type =
 			MLX5_FLOW_DESTINATION_TYPE_COUNTER;
-		dest_arr[dest_num].counter =
-			to_mcounters(flow_act.counters)->hw_cntrs_hndl;
+		dest_arr[dest_num].counter_id =
+			mlx5_fc_id(mcounters->hw_cntrs_hndl);
 		dest_num++;
 	}
 
@@ -3339,11 +3617,11 @@
 					MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
 	}
 
-	if (flow_act.has_flow_tag &&
+	if ((spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG)  &&
 	    (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
 	     flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
 		mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n",
-			     flow_act.flow_tag, flow_attr->type);
+			     spec->flow_context.flow_tag, flow_attr->type);
 		err = -EINVAL;
 		goto free;
 	}
@@ -3651,34 +3929,78 @@
 	return ERR_PTR(err);
 }
 
-static struct mlx5_ib_flow_prio *_get_flow_table(struct mlx5_ib_dev *dev,
-						 int priority, bool mcast)
+static struct mlx5_ib_flow_prio *
+_get_flow_table(struct mlx5_ib_dev *dev,
+		struct mlx5_ib_flow_matcher *fs_matcher,
+		bool mcast)
 {
-	int max_table_size;
 	struct mlx5_flow_namespace *ns = NULL;
-	struct mlx5_ib_flow_prio *prio;
-
-	max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
-			     log_max_ft_size));
-	if (max_table_size < MLX5_FS_MAX_ENTRIES)
-		return ERR_PTR(-ENOMEM);
+	struct mlx5_ib_flow_prio *prio = NULL;
+	int max_table_size = 0;
+	bool esw_encap;
+	u32 flags = 0;
+	int priority;
 
 	if (mcast)
 		priority = MLX5_IB_FLOW_MCAST_PRIO;
 	else
-		priority = ib_prio_to_core_prio(priority, false);
+		priority = ib_prio_to_core_prio(fs_matcher->priority, false);
 
-	ns = mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS);
+	esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
+		DEVLINK_ESWITCH_ENCAP_MODE_NONE;
+	if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) {
+		max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
+					log_max_ft_size));
+		if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap) && !esw_encap)
+			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
+		if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
+					      reformat_l3_tunnel_to_l2) &&
+		    !esw_encap)
+			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
+	} else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS) {
+		max_table_size = BIT(
+			MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, log_max_ft_size));
+		if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat) && !esw_encap)
+			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
+	} else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB) {
+		max_table_size = BIT(
+			MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, log_max_ft_size));
+		if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, decap) && esw_encap)
+			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
+		if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, reformat_l3_tunnel_to_l2) &&
+		    esw_encap)
+			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
+		priority = FDB_BYPASS_PATH;
+	} else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) {
+		max_table_size =
+			BIT(MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev,
+						       log_max_ft_size));
+		priority = fs_matcher->priority;
+	}
+
+	max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES);
+
+	ns = mlx5_get_flow_namespace(dev->mdev, fs_matcher->ns_type);
 	if (!ns)
 		return ERR_PTR(-ENOTSUPP);
 
-	prio = &dev->flow_db->prios[priority];
+	if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS)
+		prio = &dev->flow_db->prios[priority];
+	else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS)
+		prio = &dev->flow_db->egress_prios[priority];
+	else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB)
+		prio = &dev->flow_db->fdb;
+	else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX)
+		prio = &dev->flow_db->rdma_rx[priority];
+
+	if (!prio)
+		return ERR_PTR(-EINVAL);
 
 	if (prio->flow_table)
 		return prio;
 
-	return _get_prio(ns, prio, priority, MLX5_FS_MAX_ENTRIES,
-			 MLX5_FS_MAX_TYPES);
+	return _get_prio(ns, prio, priority, max_table_size,
+			 MLX5_FS_MAX_TYPES, flags);
 }
 
 static struct mlx5_ib_flow_handler *
@@ -3686,10 +4008,12 @@
 		      struct mlx5_ib_flow_prio *ft_prio,
 		      struct mlx5_flow_destination *dst,
 		      struct mlx5_ib_flow_matcher  *fs_matcher,
-		      void *cmd_in, int inlen)
+		      struct mlx5_flow_context *flow_context,
+		      struct mlx5_flow_act *flow_act,
+		      void *cmd_in, int inlen,
+		      int dst_num)
 {
 	struct mlx5_ib_flow_handler *handler;
-	struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG};
 	struct mlx5_flow_spec *spec;
 	struct mlx5_flow_table *ft = ft_prio->flow_table;
 	int err = 0;
@@ -3707,10 +4031,10 @@
 	memcpy(spec->match_criteria, fs_matcher->matcher_mask.match_params,
 	       fs_matcher->mask_len);
 	spec->match_criteria_enable = fs_matcher->match_criteria_enable;
+	spec->flow_context = *flow_context;
 
-	flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 	handler->rule = mlx5_add_flow_rules(ft, spec,
-					    &flow_act, dst, 1);
+					    flow_act, dst, dst_num);
 
 	if (IS_ERR(handler->rule)) {
 		err = PTR_ERR(handler->rule);
@@ -3772,13 +4096,16 @@
 struct mlx5_ib_flow_handler *
 mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev,
 			struct mlx5_ib_flow_matcher *fs_matcher,
+			struct mlx5_flow_context *flow_context,
+			struct mlx5_flow_act *flow_act,
+			u32 counter_id,
 			void *cmd_in, int inlen, int dest_id,
 			int dest_type)
 {
 	struct mlx5_flow_destination *dst;
 	struct mlx5_ib_flow_prio *ft_prio;
-	int priority = fs_matcher->priority;
 	struct mlx5_ib_flow_handler *handler;
+	int dst_num = 0;
 	bool mcast;
 	int err;
 
@@ -3788,29 +4115,43 @@
 	if (fs_matcher->priority > MLX5_IB_FLOW_LAST_PRIO)
 		return ERR_PTR(-ENOMEM);
 
-	dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+	dst = kcalloc(2, sizeof(*dst), GFP_KERNEL);
 	if (!dst)
 		return ERR_PTR(-ENOMEM);
 
 	mcast = raw_fs_is_multicast(fs_matcher, cmd_in);
 	mutex_lock(&dev->flow_db->lock);
 
-	ft_prio = _get_flow_table(dev, priority, mcast);
+	ft_prio = _get_flow_table(dev, fs_matcher, mcast);
 	if (IS_ERR(ft_prio)) {
 		err = PTR_ERR(ft_prio);
 		goto unlock;
 	}
 
 	if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR) {
-		dst->type = dest_type;
-		dst->tir_num = dest_id;
+		dst[dst_num].type = dest_type;
+		dst[dst_num].tir_num = dest_id;
+		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+	} else if (dest_type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) {
+		dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM;
+		dst[dst_num].ft_num = dest_id;
+		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 	} else {
-		dst->type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM;
-		dst->ft_num = dest_id;
+		dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_PORT;
+		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
 	}
 
-	handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher, cmd_in,
-					inlen);
+	dst_num++;
+
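+	/* When the rule also counts packets, attach the flow counter as an
+	 * additional destination.
+	 */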
+	if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+		dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+		dst[dst_num].counter_id = counter_id;
+		dst_num++;
+	}
+
+	handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher,
+					flow_context, flow_act,
+					cmd_in, inlen, dst_num);
 
 	if (IS_ERR(handler)) {
 		err = PTR_ERR(handler);
@@ -3988,6 +4329,9 @@
 		 */
 		mlx5_accel_esp_destroy_xfrm(maction->esp_aes_gcm.ctx);
 		break;
+	case IB_FLOW_ACTION_UNSPECIFIED:
+		mlx5_ib_destroy_flow_action_raw(maction);
+		break;
 	default:
 		WARN_ON(true);
 		break;
@@ -4002,13 +4346,17 @@
 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
 	struct mlx5_ib_qp *mqp = to_mqp(ibqp);
 	int err;
+	u16 uid;
+
+	uid = ibqp->pd ?
+		to_mpd(ibqp->pd)->uid : 0;
 
 	if (mqp->flags & MLX5_IB_QP_UNDERLAY) {
 		mlx5_ib_dbg(dev, "Attaching a multi cast group to underlay QP is not supported\n");
 		return -EOPNOTSUPP;
 	}
 
-	err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num);
+	err = mlx5_cmd_attach_mcg(dev->mdev, gid, ibqp->qp_num, uid);
 	if (err)
 		mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
 			     ibqp->qp_num, gid->raw);
@@ -4020,8 +4368,11 @@
 {
 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
 	int err;
+	u16 uid;
 
-	err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num);
+	uid = ibqp->pd ?
+		to_mpd(ibqp->pd)->uid : 0;
+	err = mlx5_cmd_detach_mcg(dev->mdev, gid, ibqp->qp_num, uid);
 	if (err)
 		mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
 			     ibqp->qp_num, gid->raw);
@@ -4042,61 +4393,68 @@
 	return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
 }
 
-static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr,
-			     char *buf)
+static ssize_t fw_pages_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
 {
 	struct mlx5_ib_dev *dev =
-		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
 
 	return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages);
 }
+static DEVICE_ATTR_RO(fw_pages);
 
-static ssize_t show_reg_pages(struct device *device,
+static ssize_t reg_pages_show(struct device *device,
 			      struct device_attribute *attr, char *buf)
 {
 	struct mlx5_ib_dev *dev =
-		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
 
 	return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
 }
+static DEVICE_ATTR_RO(reg_pages);
 
-static ssize_t show_hca(struct device *device, struct device_attribute *attr,
-			char *buf)
+static ssize_t hca_type_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
 {
 	struct mlx5_ib_dev *dev =
-		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
+
 	return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
 }
+static DEVICE_ATTR_RO(hca_type);
 
-static ssize_t show_rev(struct device *device, struct device_attribute *attr,
-			char *buf)
+static ssize_t hw_rev_show(struct device *device,
+			   struct device_attribute *attr, char *buf)
 {
 	struct mlx5_ib_dev *dev =
-		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
+
 	return sprintf(buf, "%x\n", dev->mdev->rev_id);
 }
+static DEVICE_ATTR_RO(hw_rev);
 
-static ssize_t show_board(struct device *device, struct device_attribute *attr,
-			  char *buf)
+static ssize_t board_id_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
 {
 	struct mlx5_ib_dev *dev =
-		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
+
 	return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
 		       dev->mdev->board_id);
 }
+static DEVICE_ATTR_RO(board_id);
 
-static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
-static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL);
-static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL);
+static struct attribute *mlx5_class_attributes[] = {
+	&dev_attr_hw_rev.attr,
+	&dev_attr_hca_type.attr,
+	&dev_attr_board_id.attr,
+	&dev_attr_fw_pages.attr,
+	&dev_attr_reg_pages.attr,
+	NULL,
+};
 
-static struct device_attribute *mlx5_class_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_hca_type,
-	&dev_attr_board_id,
-	&dev_attr_fw_pages,
-	&dev_attr_reg_pages,
+static const struct attribute_group mlx5_attr_group = {
+	.attrs = mlx5_class_attributes,
 };
 
 static void pkey_change_handler(struct work_struct *work)
@@ -4164,7 +4522,7 @@
 	 * lock/unlock above locks Now need to arm all involved CQs.
 	 */
 	list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
-		mcq->comp(mcq);
+		mcq->comp(mcq, NULL);
 	}
 	spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
 }
@@ -4189,6 +4547,67 @@
 	mutex_unlock(&delay_drop->lock);
 }
 
+static void handle_general_event(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
+				 struct ib_event *ibev)
+{
+	u8 port = (eqe->data.port.port >> 4) & 0xf;
+
+	switch (eqe->sub_type) {
+	case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT:
+		if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
+					    IB_LINK_LAYER_ETHERNET)
+			schedule_work(&ibdev->delay_drop.delay_drop_work);
+		break;
+	default: /* do nothing */
+		return;
+	}
+}
+
+static int handle_port_change(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
+			      struct ib_event *ibev)
+{
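+	/* The physical port number is carried in the upper nibble of the
+	 * EQE port field.
+	 */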
+	u8 port = (eqe->data.port.port >> 4) & 0xf;
+
+	ibev->element.port_num = port;
+
+	switch (eqe->sub_type) {
+	case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
+	case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
+	case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
+		/* In RoCE, port up/down events are handled in
+		 * mlx5_netdev_event().
+		 */
+		if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
+					    IB_LINK_LAYER_ETHERNET)
+			return -EINVAL;
+
+		ibev->event = (eqe->sub_type == MLX5_PORT_CHANGE_SUBTYPE_ACTIVE) ?
+				IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+		break;
+
+	case MLX5_PORT_CHANGE_SUBTYPE_LID:
+		ibev->event = IB_EVENT_LID_CHANGE;
+		break;
+
+	case MLX5_PORT_CHANGE_SUBTYPE_PKEY:
+		ibev->event = IB_EVENT_PKEY_CHANGE;
+		schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
+		break;
+
+	case MLX5_PORT_CHANGE_SUBTYPE_GUID:
+		ibev->event = IB_EVENT_GID_CHANGE;
+		break;
+
+	case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG:
+		ibev->event = IB_EVENT_CLIENT_REREGISTER;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static void mlx5_ib_handle_event(struct work_struct *_work)
 {
 	struct mlx5_ib_event_work *work =
@@ -4196,65 +4615,37 @@
 	struct mlx5_ib_dev *ibdev;
 	struct ib_event ibev;
 	bool fatal = false;
-	u8 port = (u8)work->param;
 
-	if (mlx5_core_is_mp_slave(work->dev)) {
-		ibdev = mlx5_ib_get_ibdev_from_mpi(work->context);
+	if (work->is_slave) {
+		ibdev = mlx5_ib_get_ibdev_from_mpi(work->mpi);
 		if (!ibdev)
 			goto out;
 	} else {
-		ibdev = work->context;
+		ibdev = work->dev;
 	}
 
 	switch (work->event) {
 	case MLX5_DEV_EVENT_SYS_ERROR:
 		ibev.event = IB_EVENT_DEVICE_FATAL;
 		mlx5_ib_handle_internal_error(ibdev);
+		ibev.element.port_num  = (u8)(unsigned long)work->param;
 		fatal = true;
 		break;
-
-	case MLX5_DEV_EVENT_PORT_UP:
-	case MLX5_DEV_EVENT_PORT_DOWN:
-	case MLX5_DEV_EVENT_PORT_INITIALIZED:
-		/* In RoCE, port up/down events are handled in
-		 * mlx5_netdev_event().
-		 */
-		if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
-			IB_LINK_LAYER_ETHERNET)
+	case MLX5_EVENT_TYPE_PORT_CHANGE:
+		if (handle_port_change(ibdev, work->param, &ibev))
 			goto out;
-
-		ibev.event = (work->event == MLX5_DEV_EVENT_PORT_UP) ?
-			     IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
 		break;
-
-	case MLX5_DEV_EVENT_LID_CHANGE:
-		ibev.event = IB_EVENT_LID_CHANGE;
-		break;
-
-	case MLX5_DEV_EVENT_PKEY_CHANGE:
-		ibev.event = IB_EVENT_PKEY_CHANGE;
-		schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
-		break;
-
-	case MLX5_DEV_EVENT_GUID_CHANGE:
-		ibev.event = IB_EVENT_GID_CHANGE;
-		break;
-
-	case MLX5_DEV_EVENT_CLIENT_REREG:
-		ibev.event = IB_EVENT_CLIENT_REREGISTER;
-		break;
-	case MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT:
-		schedule_work(&ibdev->delay_drop.delay_drop_work);
-		goto out;
+	case MLX5_EVENT_TYPE_GENERAL_EVENT:
+		handle_general_event(ibdev, work->param, &ibev);
+		/* fall through */
 	default:
 		goto out;
 	}
 
-	ibev.device	      = &ibdev->ib_dev;
-	ibev.element.port_num = port;
+	ibev.device = &ibdev->ib_dev;
 
-	if (!rdma_is_port_valid(&ibdev->ib_dev, port)) {
-		mlx5_ib_warn(ibdev, "warning: event on port %d\n", port);
+	if (!rdma_is_port_valid(&ibdev->ib_dev, ibev.element.port_num)) {
+		mlx5_ib_warn(ibdev, "warning: event on port %d\n",  ibev.element.port_num);
 		goto out;
 	}
 
@@ -4267,22 +4658,43 @@
 	kfree(work);
 }
 
-static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
-			  enum mlx5_dev_event event, unsigned long param)
+static int mlx5_ib_event(struct notifier_block *nb,
+			 unsigned long event, void *param)
 {
 	struct mlx5_ib_event_work *work;
 
 	work = kmalloc(sizeof(*work), GFP_ATOMIC);
 	if (!work)
-		return;
+		return NOTIFY_DONE;
 
 	INIT_WORK(&work->work, mlx5_ib_handle_event);
-	work->dev = dev;
+	work->dev = container_of(nb, struct mlx5_ib_dev, mdev_events);
+	work->is_slave = false;
 	work->param = param;
-	work->context = context;
 	work->event = event;
 
 	queue_work(mlx5_ib_event_wq, &work->work);
+
+	return NOTIFY_OK;
+}
+
+static int mlx5_ib_event_slave_port(struct notifier_block *nb,
+				    unsigned long event, void *param)
+{
+	struct mlx5_ib_event_work *work;
+
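+	/* The notifier may be called in atomic context, so allocate with
+	 * GFP_ATOMIC and defer the actual handling to the workqueue.
+	 */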
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (!work)
+		return NOTIFY_DONE;
+
+	INIT_WORK(&work->work, mlx5_ib_handle_event);
+	work->mpi = container_of(nb, struct mlx5_ib_multiport_info, mdev_events);
+	work->is_slave = true;
+	work->param = param;
+	work->event = event;
+	queue_work(mlx5_ib_event_wq, &work->work);
+
+	return NOTIFY_OK;
 }
 
 static int set_has_smi_cap(struct mlx5_ib_dev *dev)
@@ -4291,7 +4703,7 @@
 	int err;
 	int port;
 
-	for (port = 1; port <= dev->num_ports; port++) {
+	for (port = 1; port <= ARRAY_SIZE(dev->mdev->port_caps); port++) {
 		dev->mdev->port_caps[port - 1].has_smi = false;
 		if (MLX5_CAP_GEN(dev->mdev, port_type) ==
 		    MLX5_CAP_PORT_TYPE_IB) {
@@ -4322,14 +4734,14 @@
 		mlx5_query_ext_port_caps(dev, port);
 }
 
-static int get_port_caps(struct mlx5_ib_dev *dev, u8 port)
+static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port)
 {
 	struct ib_device_attr *dprops = NULL;
 	struct ib_port_attr *pprops = NULL;
 	int err = -ENOMEM;
 	struct ib_udata uhw = {.inlen = 0, .outlen = 0};
 
-	pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
+	pprops = kzalloc(sizeof(*pprops), GFP_KERNEL);
 	if (!pprops)
 		goto out;
 
@@ -4337,17 +4749,12 @@
 	if (!dprops)
 		goto out;
 
-	err = set_has_smi_cap(dev);
-	if (err)
-		goto out;
-
 	err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw);
 	if (err) {
 		mlx5_ib_warn(dev, "query_device failed %d\n", err);
 		goto out;
 	}
 
-	memset(pprops, 0, sizeof(*pprops));
 	err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
 	if (err) {
 		mlx5_ib_warn(dev, "query_port %d failed %d\n",
@@ -4369,6 +4776,16 @@
 	return err;
 }
 
+static int get_port_caps(struct mlx5_ib_dev *dev, u8 port)
+{
+	/* For representors use port 1, as this is the only native
+	 * port
+	 */
+	if (dev->is_rep)
+		return __get_port_caps(dev, 1);
+	return __get_port_caps(dev, port);
+}
+
 static void destroy_umrc_res(struct mlx5_ib_dev *dev)
 {
 	int err;
@@ -4378,7 +4795,7 @@
 		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
 
 	if (dev->umrc.qp)
-		mlx5_ib_destroy_qp(dev->umrc.qp);
+		mlx5_ib_destroy_qp(dev->umrc.qp, NULL);
 	if (dev->umrc.cq)
 		ib_free_cq(dev->umrc.cq);
 	if (dev->umrc.pd)
@@ -4483,7 +4900,7 @@
 	return 0;
 
 error_4:
-	mlx5_ib_destroy_qp(qp);
+	mlx5_ib_destroy_qp(qp, NULL);
 	dev->umrc.qp = NULL;
 
 error_3:
@@ -4516,36 +4933,42 @@
 {
 	struct ib_srq_init_attr attr;
 	struct mlx5_ib_dev *dev;
+	struct ib_device *ibdev;
 	struct ib_cq_init_attr cq_attr = {.cqe = 1};
 	int port;
 	int ret = 0;
 
 	dev = container_of(devr, struct mlx5_ib_dev, devr);
+	ibdev = &dev->ib_dev;
 
 	mutex_init(&devr->mutex);
 
-	devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
-	if (IS_ERR(devr->p0)) {
-		ret = PTR_ERR(devr->p0);
-		goto error0;
-	}
-	devr->p0->device  = &dev->ib_dev;
+	devr->p0 = rdma_zalloc_drv_obj(ibdev, ib_pd);
+	if (!devr->p0)
+		return -ENOMEM;
+
+	devr->p0->device  = ibdev;
 	devr->p0->uobject = NULL;
 	atomic_set(&devr->p0->usecnt, 0);
 
-	devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL);
-	if (IS_ERR(devr->c0)) {
-		ret = PTR_ERR(devr->c0);
+	ret = mlx5_ib_alloc_pd(devr->p0, NULL);
+	if (ret)
+		goto error0;
+
+	devr->c0 = rdma_zalloc_drv_obj(ibdev, ib_cq);
+	if (!devr->c0) {
+		ret = -ENOMEM;
 		goto error1;
 	}
-	devr->c0->device        = &dev->ib_dev;
-	devr->c0->uobject       = NULL;
-	devr->c0->comp_handler  = NULL;
-	devr->c0->event_handler = NULL;
-	devr->c0->cq_context    = NULL;
+
+	devr->c0->device = &dev->ib_dev;
 	atomic_set(&devr->c0->usecnt, 0);
 
-	devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
+	ret = mlx5_ib_create_cq(devr->c0, &cq_attr, NULL);
+	if (ret)
+		goto err_create_cq;
+
+	devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL);
 	if (IS_ERR(devr->x0)) {
 		ret = PTR_ERR(devr->x0);
 		goto error2;
@@ -4556,7 +4979,7 @@
 	mutex_init(&devr->x0->tgt_qp_mutex);
 	INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
 
-	devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
+	devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL);
 	if (IS_ERR(devr->x1)) {
 		ret = PTR_ERR(devr->x1);
 		goto error3;
@@ -4574,19 +4997,21 @@
 	attr.ext.cq = devr->c0;
 	attr.ext.xrc.xrcd = devr->x0;
 
-	devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
-	if (IS_ERR(devr->s0)) {
-		ret = PTR_ERR(devr->s0);
+	devr->s0 = rdma_zalloc_drv_obj(ibdev, ib_srq);
+	if (!devr->s0) {
+		ret = -ENOMEM;
 		goto error4;
 	}
+
 	devr->s0->device	= &dev->ib_dev;
 	devr->s0->pd		= devr->p0;
-	devr->s0->uobject       = NULL;
-	devr->s0->event_handler = NULL;
-	devr->s0->srq_context   = NULL;
 	devr->s0->srq_type      = IB_SRQT_XRC;
 	devr->s0->ext.xrc.xrcd	= devr->x0;
 	devr->s0->ext.cq	= devr->c0;
+	ret = mlx5_ib_create_srq(devr->s0, &attr, NULL);
+	if (ret)
+		goto err_create;
+
 	atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
 	atomic_inc(&devr->s0->ext.cq->usecnt);
 	atomic_inc(&devr->p0->usecnt);
@@ -4596,18 +5021,21 @@
 	attr.attr.max_sge = 1;
 	attr.attr.max_wr = 1;
 	attr.srq_type = IB_SRQT_BASIC;
-	devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
-	if (IS_ERR(devr->s1)) {
-		ret = PTR_ERR(devr->s1);
+	devr->s1 = rdma_zalloc_drv_obj(ibdev, ib_srq);
+	if (!devr->s1) {
+		ret = -ENOMEM;
 		goto error5;
 	}
+
 	devr->s1->device	= &dev->ib_dev;
 	devr->s1->pd		= devr->p0;
-	devr->s1->uobject       = NULL;
-	devr->s1->event_handler = NULL;
-	devr->s1->srq_context   = NULL;
 	devr->s1->srq_type      = IB_SRQT_BASIC;
 	devr->s1->ext.cq	= devr->c0;
+
+	ret = mlx5_ib_create_srq(devr->s1, &attr, NULL);
+	if (ret)
+		goto error6;
+
 	atomic_inc(&devr->p0->usecnt);
 	atomic_set(&devr->s1->usecnt, 0);
 
@@ -4619,35 +5047,44 @@
 
 	return 0;
 
+error6:
+	kfree(devr->s1);
 error5:
-	mlx5_ib_destroy_srq(devr->s0);
+	mlx5_ib_destroy_srq(devr->s0, NULL);
+err_create:
+	kfree(devr->s0);
 error4:
-	mlx5_ib_dealloc_xrcd(devr->x1);
+	mlx5_ib_dealloc_xrcd(devr->x1, NULL);
 error3:
-	mlx5_ib_dealloc_xrcd(devr->x0);
+	mlx5_ib_dealloc_xrcd(devr->x0, NULL);
 error2:
-	mlx5_ib_destroy_cq(devr->c0);
+	mlx5_ib_destroy_cq(devr->c0, NULL);
+err_create_cq:
+	kfree(devr->c0);
 error1:
-	mlx5_ib_dealloc_pd(devr->p0);
+	mlx5_ib_dealloc_pd(devr->p0, NULL);
 error0:
+	kfree(devr->p0);
 	return ret;
 }
 
 static void destroy_dev_resources(struct mlx5_ib_resources *devr)
 {
-	struct mlx5_ib_dev *dev =
-		container_of(devr, struct mlx5_ib_dev, devr);
 	int port;
 
-	mlx5_ib_destroy_srq(devr->s1);
-	mlx5_ib_destroy_srq(devr->s0);
-	mlx5_ib_dealloc_xrcd(devr->x0);
-	mlx5_ib_dealloc_xrcd(devr->x1);
-	mlx5_ib_destroy_cq(devr->c0);
-	mlx5_ib_dealloc_pd(devr->p0);
+	mlx5_ib_destroy_srq(devr->s1, NULL);
+	kfree(devr->s1);
+	mlx5_ib_destroy_srq(devr->s0, NULL);
+	kfree(devr->s0);
+	mlx5_ib_dealloc_xrcd(devr->x0, NULL);
+	mlx5_ib_dealloc_xrcd(devr->x1, NULL);
+	mlx5_ib_destroy_cq(devr->c0, NULL);
+	kfree(devr->c0);
+	mlx5_ib_dealloc_pd(devr->p0, NULL);
+	kfree(devr->p0);
 
 	/* Make sure no change P_Key work items are still executing */
-	for (port = 0; port < dev->num_ports; ++port)
+	for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
 		cancel_work_sync(&devr->ports[port].pkey_change_work);
 }
 
@@ -4750,7 +5187,7 @@
 	struct mlx5_flow_table *ft;
 	int err;
 
-	if (!ns || !mlx5_lag_is_active(mdev))
+	if (!ns || !mlx5_lag_is_roce(mdev))
 		return 0;
 
 	err = mlx5_cmd_create_vport_lag(mdev);
@@ -4764,6 +5201,7 @@
 	}
 
 	dev->flow_db->lag_demux_ft = ft;
+	dev->lag_active = true;
 	return 0;
 
 err_destroy_vport_lag:
@@ -4775,7 +5213,9 @@
 {
 	struct mlx5_core_dev *mdev = dev->mdev;
 
-	if (dev->flow_db->lag_demux_ft) {
+	if (dev->lag_active) {
+		dev->lag_active = false;
+
 		mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft);
 		dev->flow_db->lag_demux_ft = NULL;
 
@@ -4787,10 +5227,10 @@
 {
 	int err;
 
-	dev->roce[port_num].nb.notifier_call = mlx5_netdev_event;
-	err = register_netdevice_notifier(&dev->roce[port_num].nb);
+	dev->port[port_num].roce.nb.notifier_call = mlx5_netdev_event;
+	err = register_netdevice_notifier(&dev->port[port_num].roce.nb);
 	if (err) {
-		dev->roce[port_num].nb.notifier_call = NULL;
+		dev->port[port_num].roce.nb.notifier_call = NULL;
 		return err;
 	}
 
@@ -4799,9 +5239,9 @@
 
 static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
 {
-	if (dev->roce[port_num].nb.notifier_call) {
-		unregister_netdevice_notifier(&dev->roce[port_num].nb);
-		dev->roce[port_num].nb.notifier_call = NULL;
+	if (dev->port[port_num].roce.nb.notifier_call) {
+		unregister_netdevice_notifier(&dev->port[port_num].roce.nb);
+		dev->port[port_num].roce.nb.notifier_call = NULL;
 	}
 }
 
@@ -4893,11 +5333,21 @@
 	INIT_EXT_PPCNT_COUNTER(rx_icrc_encapsulated),
 };
 
+static bool is_mdev_switchdev_mode(const struct mlx5_core_dev *mdev)
+{
+	return MLX5_ESWITCH_MANAGER(mdev) &&
+	       mlx5_ib_eswitch_mode(mdev->priv.eswitch) ==
+		       MLX5_ESWITCH_OFFLOADS;
+}
+
 static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev)
 {
+	int num_cnt_ports;
 	int i;
 
-	for (i = 0; i < dev->num_ports; i++) {
+	num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports;
+
+	for (i = 0; i < num_cnt_ports; i++) {
 		if (dev->port[i].cnts.set_id_valid)
 			mlx5_core_dealloc_q_counter(dev->mdev,
 						    dev->port[i].cnts.set_id);
@@ -4999,10 +5449,15 @@
 
 static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev)
 {
+	int num_cnt_ports;
 	int err = 0;
 	int i;
+	bool is_shared;
 
-	for (i = 0; i < dev->num_ports; i++) {
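+	/* Devices that expose user contexts get their queue counters
+	 * allocated under the shared resource UID instead of UID 0.
+	 */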
+	is_shared = MLX5_CAP_GEN(dev->mdev, log_max_uctx) != 0;
+	num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports;
+
+	for (i = 0; i < num_cnt_ports; i++) {
 		err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts);
 		if (err)
 			goto err_alloc;
@@ -5010,8 +5465,10 @@
 		mlx5_ib_fill_counters(dev, dev->port[i].cnts.names,
 				      dev->port[i].cnts.offsets);
 
-		err = mlx5_core_alloc_q_counter(dev->mdev,
-						&dev->port[i].cnts.set_id);
+		err = mlx5_cmd_alloc_q_counter(dev->mdev,
+					       &dev->port[i].cnts.set_id,
+					       is_shared ?
+					       MLX5_SHARED_RESOURCE_UID : 0);
 		if (err) {
 			mlx5_ib_warn(dev,
 				     "couldn't allocate queue counter for port %d, err %d\n",
@@ -5020,7 +5477,6 @@
 		}
 		dev->port[i].cnts.set_id_valid = true;
 	}
-
 	return 0;
 
 err_alloc:
@@ -5028,26 +5484,52 @@
 	return err;
 }
 
+static const struct mlx5_ib_counters *get_counters(struct mlx5_ib_dev *dev,
+						   u8 port_num)
+{
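+	/* In switchdev mode all ports share the counter set of port 0. */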
+	return is_mdev_switchdev_mode(dev->mdev) ? &dev->port[0].cnts :
+						   &dev->port[port_num].cnts;
+}
+
+/**
+ * mlx5_ib_get_counters_id - Returns counters id to use for device+port
+ * @dev:	Pointer to mlx5 IB device
+ * @port_num:	Zero based port number
+ *
+ * mlx5_ib_get_counters_id() returns the counters set id to use for the
+ * given device and port combination, in both switchdev and non-switchdev
+ * modes of the parent device.
+ */
+u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u8 port_num)
+{
+	const struct mlx5_ib_counters *cnts = get_counters(dev, port_num);
+
+	return cnts->set_id;
+}
+
 static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
 						    u8 port_num)
 {
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
-	struct mlx5_ib_port *port = &dev->port[port_num - 1];
+	const struct mlx5_ib_counters *cnts;
+	bool is_switchdev = is_mdev_switchdev_mode(dev->mdev);
 
-	/* We support only per port stats */
-	if (port_num == 0)
+	if ((is_switchdev && port_num) || (!is_switchdev && !port_num))
 		return NULL;
 
-	return rdma_alloc_hw_stats_struct(port->cnts.names,
-					  port->cnts.num_q_counters +
-					  port->cnts.num_cong_counters +
-					  port->cnts.num_ext_ppcnt_counters,
+	cnts = get_counters(dev, port_num - 1);
+
+	return rdma_alloc_hw_stats_struct(cnts->names,
+					  cnts->num_q_counters +
+					  cnts->num_cong_counters +
+					  cnts->num_ext_ppcnt_counters,
 					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
 }
 
 static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev,
-				    struct mlx5_ib_port *port,
-				    struct rdma_hw_stats *stats)
+				    const struct mlx5_ib_counters *cnts,
+				    struct rdma_hw_stats *stats,
+				    u16 set_id)
 {
 	int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
 	void *out;
@@ -5058,14 +5540,12 @@
 	if (!out)
 		return -ENOMEM;
 
-	ret = mlx5_core_query_q_counter(mdev,
-					port->cnts.set_id, 0,
-					out, outlen);
+	ret = mlx5_core_query_q_counter(mdev, set_id, 0, out, outlen);
 	if (ret)
 		goto free;
 
-	for (i = 0; i < port->cnts.num_q_counters; i++) {
-		val = *(__be32 *)(out + port->cnts.offsets[i]);
+	for (i = 0; i < cnts->num_q_counters; i++) {
+		val = *(__be32 *)(out + cnts->offsets[i]);
 		stats->value[i] = (u64)be32_to_cpu(val);
 	}
 
@@ -5075,10 +5555,10 @@
 }
 
 static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev,
-					  struct mlx5_ib_port *port,
-					  struct rdma_hw_stats *stats)
+					    const struct mlx5_ib_counters *cnts,
+					    struct rdma_hw_stats *stats)
 {
-	int offset = port->cnts.num_q_counters + port->cnts.num_cong_counters;
+	int offset = cnts->num_q_counters + cnts->num_cong_counters;
 	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
 	int ret, i;
 	void *out;
@@ -5091,12 +5571,10 @@
 	if (ret)
 		goto free;
 
-	for (i = 0; i < port->cnts.num_ext_ppcnt_counters; i++) {
+	for (i = 0; i < cnts->num_ext_ppcnt_counters; i++)
 		stats->value[i + offset] =
 			be64_to_cpup((__be64 *)(out +
-				    port->cnts.offsets[i + offset]));
-	}
-
+				    cnts->offsets[i + offset]));
 free:
 	kvfree(out);
 	return ret;
@@ -5107,7 +5585,7 @@
 				u8 port_num, int index)
 {
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
-	struct mlx5_ib_port *port = &dev->port[port_num - 1];
+	const struct mlx5_ib_counters *cnts = get_counters(dev, port_num - 1);
 	struct mlx5_core_dev *mdev;
 	int ret, num_counters;
 	u8 mdev_port_num;
@@ -5115,17 +5593,17 @@
 	if (!stats)
 		return -EINVAL;
 
-	num_counters = port->cnts.num_q_counters +
-		       port->cnts.num_cong_counters +
-		       port->cnts.num_ext_ppcnt_counters;
+	num_counters = cnts->num_q_counters +
+		       cnts->num_cong_counters +
+		       cnts->num_ext_ppcnt_counters;
 
 	/* q_counters are per IB device, query the master mdev */
-	ret = mlx5_ib_query_q_counters(dev->mdev, port, stats);
+	ret = mlx5_ib_query_q_counters(dev->mdev, cnts, stats, cnts->set_id);
 	if (ret)
 		return ret;
 
 	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
-		ret =  mlx5_ib_query_ext_ppcnt_counters(dev, port, stats);
+		ret =  mlx5_ib_query_ext_ppcnt_counters(dev, cnts, stats);
 		if (ret)
 			return ret;
 	}
@@ -5142,10 +5620,10 @@
 		}
 		ret = mlx5_lag_query_cong_counters(dev->mdev,
 						   stats->value +
-						   port->cnts.num_q_counters,
-						   port->cnts.num_cong_counters,
-						   port->cnts.offsets +
-						   port->cnts.num_q_counters);
+						   cnts->num_q_counters,
+						   cnts->num_cong_counters,
+						   cnts->offsets +
+						   cnts->num_q_counters);
 
 		mlx5_ib_put_native_port_mdev(dev, port_num);
 		if (ret)
@@ -5156,22 +5634,78 @@
 	return num_counters;
 }
 
-static struct net_device*
-mlx5_ib_alloc_rdma_netdev(struct ib_device *hca,
-			  u8 port_num,
-			  enum rdma_netdev_t type,
-			  const char *name,
-			  unsigned char name_assign_type,
-			  void (*setup)(struct net_device *))
+static struct rdma_hw_stats *
+mlx5_ib_counter_alloc_stats(struct rdma_counter *counter)
 {
-	struct net_device *netdev;
+	struct mlx5_ib_dev *dev = to_mdev(counter->device);
+	const struct mlx5_ib_counters *cnts =
+		get_counters(dev, counter->port - 1);
 
+	/* Q counters come first in the counters layout */
+	return rdma_alloc_hw_stats_struct(cnts->names,
+					  cnts->num_q_counters,
+					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+static int mlx5_ib_counter_update_stats(struct rdma_counter *counter)
+{
+	struct mlx5_ib_dev *dev = to_mdev(counter->device);
+	const struct mlx5_ib_counters *cnts =
+		get_counters(dev, counter->port - 1);
+
+	return mlx5_ib_query_q_counters(dev->mdev, cnts,
+					counter->stats, counter->id);
+}
+
+static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
+				   struct ib_qp *qp)
+{
+	struct mlx5_ib_dev *dev = to_mdev(qp->device);
+	u16 cnt_set_id = 0;
+	int err;
+
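+	/* First QP bound to this rdma counter: back it with a newly
+	 * allocated queue counter set.
+	 */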
+	if (!counter->id) {
+		err = mlx5_cmd_alloc_q_counter(dev->mdev,
+					       &cnt_set_id,
+					       MLX5_SHARED_RESOURCE_UID);
+		if (err)
+			return err;
+		counter->id = cnt_set_id;
+	}
+
+	err = mlx5_ib_qp_set_counter(qp, counter);
+	if (err)
+		goto fail_set_counter;
+
+	return 0;
+
+fail_set_counter:
+	mlx5_core_dealloc_q_counter(dev->mdev, cnt_set_id);
+	counter->id = 0;
+
+	return err;
+}
+
+static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp)
+{
+	return mlx5_ib_qp_set_counter(qp, NULL);
+}
+
+static int mlx5_ib_counter_dealloc(struct rdma_counter *counter)
+{
+	struct mlx5_ib_dev *dev = to_mdev(counter->device);
+
+	return mlx5_core_dealloc_q_counter(dev->mdev, counter->id);
+}
+
+static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num,
+				 enum rdma_netdev_t type,
+				 struct rdma_netdev_alloc_params *params)
+{
 	if (type != RDMA_NETDEV_IPOIB)
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 
-	netdev = mlx5_rdma_netdev_alloc(to_mdev(hca)->mdev, hca,
-					name, setup);
-	return netdev;
+	return mlx5_rdma_rn_get_params(to_mdev(device)->mdev, device, params);
 }
 
 static void delay_drop_debugfs_cleanup(struct mlx5_ib_dev *dev)
@@ -5296,15 +5830,6 @@
 		mlx5_ib_warn(dev, "Failed to init delay drop debugfs\n");
 }
 
-static const struct cpumask *
-mlx5_ib_get_vector_affinity(struct ib_device *ibdev, int comp_vector)
-{
-	struct mlx5_ib_dev *dev = to_mdev(ibdev);
-
-	return mlx5_get_vector_affinity_hint(dev->mdev, comp_vector);
-}
-
-/* The mlx5_ib_multiport_mutex should be held when calling this function */
 static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
 				      struct mlx5_ib_multiport_info *mpi)
 {
@@ -5314,6 +5839,8 @@
 	int err;
 	int i;
 
+	lockdep_assert_held(&mlx5_ib_multiport_mutex);
+
 	mlx5_ib_cleanup_cong_debugfs(ibdev, port_num);
 
 	spin_lock(&port->mp.mpi_lock);
@@ -5321,9 +5848,13 @@
 		spin_unlock(&port->mp.mpi_lock);
 		return;
 	}
+
 	mpi->ibdev = NULL;
 
 	spin_unlock(&port->mp.mpi_lock);
+	if (mpi->mdev_events.notifier_call)
+		mlx5_notifier_unregister(mpi->mdev, &mpi->mdev_events);
+	mpi->mdev_events.notifier_call = NULL;
 	mlx5_remove_netdev_notifier(ibdev, port_num);
 	spin_lock(&port->mp.mpi_lock);
 
@@ -5356,16 +5887,17 @@
 		mlx5_ib_err(ibdev, "Failed to unaffiliate port %u\n",
 			    port_num + 1);
 
-	ibdev->roce[port_num].last_port_state = IB_PORT_DOWN;
+	ibdev->port[port_num].roce.last_port_state = IB_PORT_DOWN;
 }
 
-/* The mlx5_ib_multiport_mutex should be held when calling this function */
 static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
 				    struct mlx5_ib_multiport_info *mpi)
 {
 	u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
 	int err;
 
+	lockdep_assert_held(&mlx5_ib_multiport_mutex);
+
 	spin_lock(&ibdev->port[port_num].mp.mpi_lock);
 	if (ibdev->port[port_num].mp.mpi) {
 		mlx5_ib_dbg(ibdev, "port %d already affiliated.\n",
@@ -5376,6 +5908,7 @@
 
 	ibdev->port[port_num].mp.mpi = mpi;
 	mpi->ibdev = ibdev;
+	mpi->mdev_events.notifier_call = NULL;
 	spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
 
 	err = mlx5_nic_vport_affiliate_multiport(ibdev->mdev, mpi->mdev);
@@ -5393,9 +5926,10 @@
 		goto unbind;
 	}
 
-	err = mlx5_ib_init_cong_debugfs(ibdev, port_num);
-	if (err)
-		goto unbind;
+	mpi->mdev_events.notifier_call = mlx5_ib_event_slave_port;
+	mlx5_notifier_register(mpi->mdev, &mpi->mdev_events);
+
+	mlx5_ib_init_cong_debugfs(ibdev, port_num);
 
 	return true;
 
@@ -5455,7 +5989,8 @@
 			}
 
 			if (bound) {
-				dev_dbg(&mpi->mdev->pdev->dev, "removing port from unaffiliated list.\n");
+				dev_dbg(mpi->mdev->device,
+					"removing port from unaffiliated list.\n");
 				mlx5_ib_dbg(dev, "port %d bound\n", i + 1);
 				list_del(&mpi->list);
 				break;
@@ -5513,7 +6048,10 @@
 			    UA_MANDATORY),
 	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
 			    UVERBS_ATTR_TYPE(u16),
-			    UA_MANDATORY));
+			    UA_OPTIONAL),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE,
+			     enum mlx5_ib_uapi_dm_type,
+			     UA_OPTIONAL));
 
 ADD_UVERBS_ATTRIBUTES_SIMPLE(
 	mlx5_ib_flow_action,
@@ -5522,30 +6060,17 @@
 	UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS,
 			     enum mlx5_ib_uapi_flow_action_flags));
 
-static int populate_specs_root(struct mlx5_ib_dev *dev)
-{
-	const struct uverbs_object_tree_def **trees = dev->driver_trees;
-	size_t num_trees = 0;
+static const struct uapi_definition mlx5_ib_defs[] = {
+#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
+	UAPI_DEF_CHAIN(mlx5_ib_devx_defs),
+	UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
+#endif
 
-	if (mlx5_accel_ipsec_device_caps(dev->mdev) &
-	    MLX5_ACCEL_IPSEC_CAP_DEVICE)
-		trees[num_trees++] = &mlx5_ib_flow_action;
-
-	if (MLX5_CAP_DEV_MEM(dev->mdev, memic))
-		trees[num_trees++] = &mlx5_ib_dm;
-
-	if (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
-	    MLX5_GENERAL_OBJ_TYPES_CAP_UCTX)
-		trees[num_trees++] = mlx5_ib_get_devx_tree();
-
-	num_trees += mlx5_ib_get_flow_trees(trees + num_trees);
-
-	WARN_ON(num_trees >= ARRAY_SIZE(dev->driver_trees));
-	trees[num_trees] = NULL;
-	dev->ib_dev.driver_specs = trees;
-
-	return 0;
-}
+	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
+				&mlx5_ib_flow_action),
+	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm),
+	{}
+};
 
 static int mlx5_ib_read_counters(struct ib_counters *counters,
 				 struct ib_counters_read_attr *read_attr,
@@ -5617,35 +6142,40 @@
 	return &mcounters->ibcntrs;
 }
 
-void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
 {
 	mlx5_ib_cleanup_multiport_master(dev);
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	cleanup_srcu_struct(&dev->mr_srcu);
-#endif
-	kfree(dev->port);
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+		srcu_barrier(&dev->mr_srcu);
+		cleanup_srcu_struct(&dev->mr_srcu);
+	}
+
+	WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES));
 }
 
-int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_core_dev *mdev = dev->mdev;
-	const char *name;
 	int err;
 	int i;
 
-	dev->port = kcalloc(dev->num_ports, sizeof(*dev->port),
-			    GFP_KERNEL);
-	if (!dev->port)
-		return -ENOMEM;
-
 	for (i = 0; i < dev->num_ports; i++) {
 		spin_lock_init(&dev->port[i].mp.mpi_lock);
-		rwlock_init(&dev->roce[i].netdev_lock);
+		rwlock_init(&dev->port[i].roce.netdev_lock);
+		dev->port[i].roce.dev = dev;
+		dev->port[i].roce.native_port_num = i + 1;
+		dev->port[i].roce.last_port_state = IB_PORT_DOWN;
 	}
 
+	mlx5_ib_internal_fill_odp_caps(dev);
+
 	err = mlx5_ib_init_multiport_master(dev);
 	if (err)
-		goto err_free_port;
+		return err;
+
+	err = set_has_smi_cap(dev);
+	if (err)
+		return err;
 
 	if (!mlx5_core_mp_enabled(mdev)) {
 		for (i = 1; i <= dev->num_ports; i++) {
@@ -5662,40 +6192,30 @@
 	if (mlx5_use_mad_ifc(dev))
 		get_ext_port_caps(dev);
 
-	if (!mlx5_lag_is_active(mdev))
-		name = "mlx5_%d";
-	else
-		name = "mlx5_bond_%d";
-
-	strlcpy(dev->ib_dev.name, name, IB_DEVICE_NAME_MAX);
-	dev->ib_dev.owner		= THIS_MODULE;
 	dev->ib_dev.node_type		= RDMA_NODE_IB_CA;
 	dev->ib_dev.local_dma_lkey	= 0 /* not supported for now */;
 	dev->ib_dev.phys_port_cnt	= dev->num_ports;
-	dev->ib_dev.num_comp_vectors    =
-		dev->mdev->priv.eq_table.num_comp_vectors;
-	dev->ib_dev.dev.parent		= &mdev->pdev->dev;
+	dev->ib_dev.num_comp_vectors    = mlx5_comp_vectors_count(mdev);
+	dev->ib_dev.dev.parent		= mdev->device;
 
 	mutex_init(&dev->cap_mask_mutex);
 	INIT_LIST_HEAD(&dev->qp_list);
 	spin_lock_init(&dev->reset_flow_resource_lock);
 
-	spin_lock_init(&dev->memic.memic_lock);
-	dev->memic.dev = mdev;
+	spin_lock_init(&dev->dm.lock);
+	dev->dm.dev = mdev;
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	err = init_srcu_struct(&dev->mr_srcu);
-	if (err)
-		goto err_free_port;
-#endif
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+		err = init_srcu_struct(&dev->mr_srcu);
+		if (err)
+			goto err_mp;
+	}
 
 	return 0;
+
 err_mp:
 	mlx5_ib_cleanup_multiport_master(dev);
 
-err_free_port:
-	kfree(dev->port);
-
 	return -ENOMEM;
 }
 
@@ -5711,31 +6231,116 @@
 	return 0;
 }
 
-int mlx5_ib_stage_rep_flow_db_init(struct mlx5_ib_dev *dev)
-{
-	struct mlx5_ib_dev *nic_dev;
-
-	nic_dev = mlx5_ib_get_uplink_ibdev(dev->mdev->priv.eswitch);
-
-	if (!nic_dev)
-		return -EINVAL;
-
-	dev->flow_db = nic_dev->flow_db;
-
-	return 0;
-}
-
 static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev)
 {
 	kfree(dev->flow_db);
 }
 
-int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
+static const struct ib_device_ops mlx5_ib_dev_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_MLX5,
+	.uverbs_abi_ver	= MLX5_IB_UVERBS_ABI_VERSION,
+
+	.add_gid = mlx5_ib_add_gid,
+	.alloc_mr = mlx5_ib_alloc_mr,
+	.alloc_mr_integrity = mlx5_ib_alloc_mr_integrity,
+	.alloc_pd = mlx5_ib_alloc_pd,
+	.alloc_ucontext = mlx5_ib_alloc_ucontext,
+	.attach_mcast = mlx5_ib_mcg_attach,
+	.check_mr_status = mlx5_ib_check_mr_status,
+	.create_ah = mlx5_ib_create_ah,
+	.create_counters = mlx5_ib_create_counters,
+	.create_cq = mlx5_ib_create_cq,
+	.create_flow = mlx5_ib_create_flow,
+	.create_qp = mlx5_ib_create_qp,
+	.create_srq = mlx5_ib_create_srq,
+	.dealloc_pd = mlx5_ib_dealloc_pd,
+	.dealloc_ucontext = mlx5_ib_dealloc_ucontext,
+	.del_gid = mlx5_ib_del_gid,
+	.dereg_mr = mlx5_ib_dereg_mr,
+	.destroy_ah = mlx5_ib_destroy_ah,
+	.destroy_counters = mlx5_ib_destroy_counters,
+	.destroy_cq = mlx5_ib_destroy_cq,
+	.destroy_flow = mlx5_ib_destroy_flow,
+	.destroy_flow_action = mlx5_ib_destroy_flow_action,
+	.destroy_qp = mlx5_ib_destroy_qp,
+	.destroy_srq = mlx5_ib_destroy_srq,
+	.detach_mcast = mlx5_ib_mcg_detach,
+	.disassociate_ucontext = mlx5_ib_disassociate_ucontext,
+	.drain_rq = mlx5_ib_drain_rq,
+	.drain_sq = mlx5_ib_drain_sq,
+	.get_dev_fw_str = get_dev_fw_str,
+	.get_dma_mr = mlx5_ib_get_dma_mr,
+	.get_link_layer = mlx5_ib_port_link_layer,
+	.map_mr_sg = mlx5_ib_map_mr_sg,
+	.map_mr_sg_pi = mlx5_ib_map_mr_sg_pi,
+	.mmap = mlx5_ib_mmap,
+	.modify_cq = mlx5_ib_modify_cq,
+	.modify_device = mlx5_ib_modify_device,
+	.modify_port = mlx5_ib_modify_port,
+	.modify_qp = mlx5_ib_modify_qp,
+	.modify_srq = mlx5_ib_modify_srq,
+	.poll_cq = mlx5_ib_poll_cq,
+	.post_recv = mlx5_ib_post_recv,
+	.post_send = mlx5_ib_post_send,
+	.post_srq_recv = mlx5_ib_post_srq_recv,
+	.process_mad = mlx5_ib_process_mad,
+	.query_ah = mlx5_ib_query_ah,
+	.query_device = mlx5_ib_query_device,
+	.query_gid = mlx5_ib_query_gid,
+	.query_pkey = mlx5_ib_query_pkey,
+	.query_qp = mlx5_ib_query_qp,
+	.query_srq = mlx5_ib_query_srq,
+	.read_counters = mlx5_ib_read_counters,
+	.reg_user_mr = mlx5_ib_reg_user_mr,
+	.req_notify_cq = mlx5_ib_arm_cq,
+	.rereg_user_mr = mlx5_ib_rereg_user_mr,
+	.resize_cq = mlx5_ib_resize_cq,
+
+	INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq),
+	INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext),
+};
+
+static const struct ib_device_ops mlx5_ib_dev_flow_ipsec_ops = {
+	.create_flow_action_esp = mlx5_ib_create_flow_action_esp,
+	.modify_flow_action_esp = mlx5_ib_modify_flow_action_esp,
+};
+
+static const struct ib_device_ops mlx5_ib_dev_ipoib_enhanced_ops = {
+	.rdma_netdev_get_params = mlx5_ib_rn_get_params,
+};
+
+static const struct ib_device_ops mlx5_ib_dev_sriov_ops = {
+	.get_vf_config = mlx5_ib_get_vf_config,
+	.get_vf_stats = mlx5_ib_get_vf_stats,
+	.set_vf_guid = mlx5_ib_set_vf_guid,
+	.set_vf_link_state = mlx5_ib_set_vf_link_state,
+};
+
+static const struct ib_device_ops mlx5_ib_dev_mw_ops = {
+	.alloc_mw = mlx5_ib_alloc_mw,
+	.dealloc_mw = mlx5_ib_dealloc_mw,
+};
+
+static const struct ib_device_ops mlx5_ib_dev_xrc_ops = {
+	.alloc_xrcd = mlx5_ib_alloc_xrcd,
+	.dealloc_xrcd = mlx5_ib_dealloc_xrcd,
+};
+
+static const struct ib_device_ops mlx5_ib_dev_dm_ops = {
+	.alloc_dm = mlx5_ib_alloc_dm,
+	.dealloc_dm = mlx5_ib_dealloc_dm,
+	.reg_dm_mr = mlx5_ib_reg_dm_mr,
+};
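The long list of per-pointer assignments on struct ib_device is replaced by const struct ib_device_ops tables merged with ib_set_device_ops(), which copies every non-NULL callback into the device; module owner, driver id and uverbs ABI version now live in the base table as well. A minimal sketch of the pattern for a hypothetical "foo" driver (all foo_* names are placeholders, not part of this patch):

/*
 * Sketch only: a hypothetical driver grouping its verbs into a const
 * ib_device_ops table and merging it with ib_set_device_ops().
 */
#include <linux/module.h>
#include <rdma/ib_verbs.h>

static int foo_query_device(struct ib_device *ibdev,
			    struct ib_device_attr *attr,
			    struct ib_udata *udata)
{
	memset(attr, 0, sizeof(*attr));
	return 0;
}

static const struct ib_device_ops foo_dev_ops = {
	.owner		= THIS_MODULE,
	.driver_id	= RDMA_DRIVER_UNKNOWN,	/* a real driver uses its own id */
	.uverbs_abi_ver	= 1,
	.query_device	= foo_query_device,
};

static void foo_set_ops(struct ib_device *ibdev)
{
	/* Copies every non-NULL callback from the table into the device;
	 * optional tables (SR-IOV, XRC, MW, ...) can be merged the same
	 * way, gated on capabilities as in mlx5_ib_stage_caps_init().
	 */
	ib_set_device_ops(ibdev, &foo_dev_ops);
}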
+
+static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_core_dev *mdev = dev->mdev;
 	int err;
 
-	dev->ib_dev.uverbs_abi_ver	= MLX5_IB_UVERBS_ABI_VERSION;
 	dev->ib_dev.uverbs_cmd_mask	=
 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
 		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
@@ -5768,103 +6373,46 @@
 		(1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ)	|
 		(1ull << IB_USER_VERBS_EX_CMD_CREATE_QP)	|
 		(1ull << IB_USER_VERBS_EX_CMD_MODIFY_QP)	|
-		(1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ);
+		(1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ)	|
+		(1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW)	|
+		(1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
 
-	dev->ib_dev.query_device	= mlx5_ib_query_device;
-	dev->ib_dev.get_link_layer	= mlx5_ib_port_link_layer;
-	dev->ib_dev.query_gid		= mlx5_ib_query_gid;
-	dev->ib_dev.add_gid		= mlx5_ib_add_gid;
-	dev->ib_dev.del_gid		= mlx5_ib_del_gid;
-	dev->ib_dev.query_pkey		= mlx5_ib_query_pkey;
-	dev->ib_dev.modify_device	= mlx5_ib_modify_device;
-	dev->ib_dev.modify_port		= mlx5_ib_modify_port;
-	dev->ib_dev.alloc_ucontext	= mlx5_ib_alloc_ucontext;
-	dev->ib_dev.dealloc_ucontext	= mlx5_ib_dealloc_ucontext;
-	dev->ib_dev.mmap		= mlx5_ib_mmap;
-	dev->ib_dev.alloc_pd		= mlx5_ib_alloc_pd;
-	dev->ib_dev.dealloc_pd		= mlx5_ib_dealloc_pd;
-	dev->ib_dev.create_ah		= mlx5_ib_create_ah;
-	dev->ib_dev.query_ah		= mlx5_ib_query_ah;
-	dev->ib_dev.destroy_ah		= mlx5_ib_destroy_ah;
-	dev->ib_dev.create_srq		= mlx5_ib_create_srq;
-	dev->ib_dev.modify_srq		= mlx5_ib_modify_srq;
-	dev->ib_dev.query_srq		= mlx5_ib_query_srq;
-	dev->ib_dev.destroy_srq		= mlx5_ib_destroy_srq;
-	dev->ib_dev.post_srq_recv	= mlx5_ib_post_srq_recv;
-	dev->ib_dev.create_qp		= mlx5_ib_create_qp;
-	dev->ib_dev.modify_qp		= mlx5_ib_modify_qp;
-	dev->ib_dev.query_qp		= mlx5_ib_query_qp;
-	dev->ib_dev.destroy_qp		= mlx5_ib_destroy_qp;
-	dev->ib_dev.drain_sq		= mlx5_ib_drain_sq;
-	dev->ib_dev.drain_rq		= mlx5_ib_drain_rq;
-	dev->ib_dev.post_send		= mlx5_ib_post_send;
-	dev->ib_dev.post_recv		= mlx5_ib_post_recv;
-	dev->ib_dev.create_cq		= mlx5_ib_create_cq;
-	dev->ib_dev.modify_cq		= mlx5_ib_modify_cq;
-	dev->ib_dev.resize_cq		= mlx5_ib_resize_cq;
-	dev->ib_dev.destroy_cq		= mlx5_ib_destroy_cq;
-	dev->ib_dev.poll_cq		= mlx5_ib_poll_cq;
-	dev->ib_dev.req_notify_cq	= mlx5_ib_arm_cq;
-	dev->ib_dev.get_dma_mr		= mlx5_ib_get_dma_mr;
-	dev->ib_dev.reg_user_mr		= mlx5_ib_reg_user_mr;
-	dev->ib_dev.rereg_user_mr	= mlx5_ib_rereg_user_mr;
-	dev->ib_dev.dereg_mr		= mlx5_ib_dereg_mr;
-	dev->ib_dev.attach_mcast	= mlx5_ib_mcg_attach;
-	dev->ib_dev.detach_mcast	= mlx5_ib_mcg_detach;
-	dev->ib_dev.process_mad		= mlx5_ib_process_mad;
-	dev->ib_dev.alloc_mr		= mlx5_ib_alloc_mr;
-	dev->ib_dev.map_mr_sg		= mlx5_ib_map_mr_sg;
-	dev->ib_dev.check_mr_status	= mlx5_ib_check_mr_status;
-	dev->ib_dev.get_dev_fw_str      = get_dev_fw_str;
-	dev->ib_dev.get_vector_affinity	= mlx5_ib_get_vector_affinity;
-	if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads))
-		dev->ib_dev.alloc_rdma_netdev	= mlx5_ib_alloc_rdma_netdev;
+	if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
+	    IS_ENABLED(CONFIG_MLX5_CORE_IPOIB))
+		ib_set_device_ops(&dev->ib_dev,
+				  &mlx5_ib_dev_ipoib_enhanced_ops);
 
-	if (mlx5_core_is_pf(mdev)) {
-		dev->ib_dev.get_vf_config	= mlx5_ib_get_vf_config;
-		dev->ib_dev.set_vf_link_state	= mlx5_ib_set_vf_link_state;
-		dev->ib_dev.get_vf_stats	= mlx5_ib_get_vf_stats;
-		dev->ib_dev.set_vf_guid		= mlx5_ib_set_vf_guid;
-	}
-
-	dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext;
+	if (mlx5_core_is_pf(mdev))
+		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_sriov_ops);
 
 	dev->umr_fence = mlx5_get_umr_fence(MLX5_CAP_GEN(mdev, umr_fence));
 
 	if (MLX5_CAP_GEN(mdev, imaicl)) {
-		dev->ib_dev.alloc_mw		= mlx5_ib_alloc_mw;
-		dev->ib_dev.dealloc_mw		= mlx5_ib_dealloc_mw;
 		dev->ib_dev.uverbs_cmd_mask |=
 			(1ull << IB_USER_VERBS_CMD_ALLOC_MW)	|
 			(1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
+		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_mw_ops);
 	}
 
 	if (MLX5_CAP_GEN(mdev, xrc)) {
-		dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
-		dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
 		dev->ib_dev.uverbs_cmd_mask |=
 			(1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
 			(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
+		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_xrc_ops);
 	}
 
-	if (MLX5_CAP_DEV_MEM(mdev, memic)) {
-		dev->ib_dev.alloc_dm = mlx5_ib_alloc_dm;
-		dev->ib_dev.dealloc_dm = mlx5_ib_dealloc_dm;
-		dev->ib_dev.reg_dm_mr = mlx5_ib_reg_dm_mr;
-	}
+	if (MLX5_CAP_DEV_MEM(mdev, memic) ||
+	    MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
+	    MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM)
+		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dm_ops);
 
-	dev->ib_dev.create_flow	= mlx5_ib_create_flow;
-	dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow;
-	dev->ib_dev.uverbs_ex_cmd_mask |=
-			(1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
-			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
-	dev->ib_dev.create_flow_action_esp = mlx5_ib_create_flow_action_esp;
-	dev->ib_dev.destroy_flow_action = mlx5_ib_destroy_flow_action;
-	dev->ib_dev.modify_flow_action_esp = mlx5_ib_modify_flow_action_esp;
-	dev->ib_dev.driver_id = RDMA_DRIVER_MLX5;
-	dev->ib_dev.create_counters = mlx5_ib_create_counters;
-	dev->ib_dev.destroy_counters = mlx5_ib_destroy_counters;
-	dev->ib_dev.read_counters = mlx5_ib_read_counters;
+	if (mlx5_accel_ipsec_device_caps(dev->mdev) &
+	    MLX5_ACCEL_IPSEC_CAP_DEVICE)
+		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_flow_ipsec_ops);
+	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops);
+
+	if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS))
+		dev->ib_dev.driver_def = mlx5_ib_defs;
 
 	err = init_node_data(dev);
 	if (err)
@@ -5873,44 +6421,47 @@
 	if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&
 	    (MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) ||
 	     MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
-		mutex_init(&dev->lb_mutex);
+		mutex_init(&dev->lb.mutex);
+
+	dev->ib_dev.use_cq_dim = true;
 
 	return 0;
 }
 
+static const struct ib_device_ops mlx5_ib_dev_port_ops = {
+	.get_port_immutable = mlx5_port_immutable,
+	.query_port = mlx5_ib_query_port,
+};
+
 static int mlx5_ib_stage_non_default_cb(struct mlx5_ib_dev *dev)
 {
-	dev->ib_dev.get_port_immutable  = mlx5_port_immutable;
-	dev->ib_dev.query_port		= mlx5_ib_query_port;
-
+	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_ops);
 	return 0;
 }
 
-int mlx5_ib_stage_rep_non_default_cb(struct mlx5_ib_dev *dev)
+static const struct ib_device_ops mlx5_ib_dev_port_rep_ops = {
+	.get_port_immutable = mlx5_port_rep_immutable,
+	.query_port = mlx5_ib_rep_query_port,
+};
+
+static int mlx5_ib_stage_rep_non_default_cb(struct mlx5_ib_dev *dev)
 {
-	dev->ib_dev.get_port_immutable  = mlx5_port_rep_immutable;
-	dev->ib_dev.query_port		= mlx5_ib_rep_query_port;
-
+	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_rep_ops);
 	return 0;
 }
 
+static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = {
+	.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table,
+	.create_wq = mlx5_ib_create_wq,
+	.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table,
+	.destroy_wq = mlx5_ib_destroy_wq,
+	.get_netdev = mlx5_ib_get_netdev,
+	.modify_wq = mlx5_ib_modify_wq,
+};
+
 static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev)
 {
 	u8 port_num;
-	int i;
-
-	for (i = 0; i < dev->num_ports; i++) {
-		dev->roce[i].dev = dev;
-		dev->roce[i].native_port_num = i + 1;
-		dev->roce[i].last_port_state = IB_PORT_DOWN;
-	}
-
-	dev->ib_dev.get_netdev	= mlx5_ib_get_netdev;
-	dev->ib_dev.create_wq	 = mlx5_ib_create_wq;
-	dev->ib_dev.modify_wq	 = mlx5_ib_modify_wq;
-	dev->ib_dev.destroy_wq	 = mlx5_ib_destroy_wq;
-	dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table;
-	dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table;
 
 	dev->ib_dev.uverbs_ex_cmd_mask |=
 			(1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
@@ -5918,9 +6469,11 @@
 			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
 			(1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
 			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
+	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_common_roce_ops);
 
 	port_num = mlx5_core_native_port_num(dev->mdev) - 1;
 
+	/* Register only for native ports */
 	return mlx5_add_netdev_notifier(dev, port_num);
 }
 
@@ -5931,7 +6484,7 @@
 	mlx5_remove_netdev_notifier(dev, port_num);
 }
 
-int mlx5_ib_stage_rep_roce_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_rep_roce_init(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_core_dev *mdev = dev->mdev;
 	enum rdma_link_layer ll;
@@ -5947,7 +6500,7 @@
 	return err;
 }
 
-void mlx5_ib_stage_rep_roce_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_ib_stage_rep_roce_cleanup(struct mlx5_ib_dev *dev)
 {
 	mlx5_ib_stage_common_roce_cleanup(dev);
 }
@@ -5994,28 +6547,40 @@
 	}
 }
 
-int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev)
 {
 	return create_dev_resources(&dev->devr);
 }
 
-void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev)
 {
 	destroy_dev_resources(&dev->devr);
 }
 
 static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev)
 {
-	mlx5_ib_internal_fill_odp_caps(dev);
-
 	return mlx5_ib_odp_init_one(dev);
 }
 
-int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
+static void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev)
+{
+	mlx5_ib_odp_cleanup_one(dev);
+}
+
+static const struct ib_device_ops mlx5_ib_dev_hw_stats_ops = {
+	.alloc_hw_stats = mlx5_ib_alloc_hw_stats,
+	.get_hw_stats = mlx5_ib_get_hw_stats,
+	.counter_bind_qp = mlx5_ib_counter_bind_qp,
+	.counter_unbind_qp = mlx5_ib_counter_unbind_qp,
+	.counter_dealloc = mlx5_ib_counter_dealloc,
+	.counter_alloc_stats = mlx5_ib_counter_alloc_stats,
+	.counter_update_stats = mlx5_ib_counter_update_stats,
+};
+
+static int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
 {
 	if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
-		dev->ib_dev.get_hw_stats	= mlx5_ib_get_hw_stats;
-		dev->ib_dev.alloc_hw_stats	= mlx5_ib_alloc_hw_stats;
+		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_hw_stats_ops);
 
 		return mlx5_ib_alloc_counters(dev);
 	}
@@ -6023,7 +6588,7 @@
 	return 0;
 }
 
-void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev)
 {
 	if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
 		mlx5_ib_dealloc_counters(dev);
@@ -6031,8 +6596,9 @@
 
 static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev)
 {
-	return mlx5_ib_init_cong_debugfs(dev,
-					 mlx5_core_native_port_num(dev->mdev) - 1);
+	mlx5_ib_init_cong_debugfs(dev,
+				  mlx5_core_native_port_num(dev->mdev) - 1);
+	return 0;
 }
 
 static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev)
@@ -6052,7 +6618,7 @@
 	mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
 }
 
-int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev)
 {
 	int err;
 
@@ -6067,33 +6633,35 @@
 	return err;
 }
 
-void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev)
 {
 	mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
 	mlx5_free_bfreg(dev->mdev, &dev->bfreg);
 }
 
-static int mlx5_ib_stage_populate_specs(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
 {
-	return populate_specs_root(dev);
+	const char *name;
+
+	rdma_set_device_sysfs_group(&dev->ib_dev, &mlx5_attr_group);
+	if (!mlx5_lag_is_roce(dev->mdev))
+		name = "mlx5_%d";
+	else
+		name = "mlx5_bond_%d";
+	return ib_register_device(&dev->ib_dev, name);
 }
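Device naming now happens inside ib_register_device(), which takes a printf-style template ("mlx5_%d" or "mlx5_bond_%d" above) instead of a NULL name, with driver sysfs attributes attached beforehand via rdma_set_device_sysfs_group(). A hedged sketch of the call, with hypothetical names:

/* Sketch: 5.4-era registration; the core fills in the "%d" instance number. */
static int foo_register(struct ib_device *ibdev, bool bonded)
{
	return ib_register_device(ibdev, bonded ? "foo_bond_%d" : "foo_%d");
}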
 
-int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
-{
-	return ib_register_device(&dev->ib_dev, NULL);
-}
-
-void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
 {
 	destroy_umrc_res(dev);
 }
 
-void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev)
 {
 	ib_unregister_device(&dev->ib_dev);
 }
 
-int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
 {
 	return create_umr_res(dev);
 }
@@ -6110,31 +6678,36 @@
 	cancel_delay_drop(dev);
 }
 
-int mlx5_ib_stage_class_attr_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev)
 {
-	int err;
-	int i;
+	dev->mdev_events.notifier_call = mlx5_ib_event;
+	mlx5_notifier_register(dev->mdev, &dev->mdev_events);
+	return 0;
+}
 
-	for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
-		err = device_create_file(&dev->ib_dev.dev,
-					 mlx5_class_attributes[i]);
-		if (err)
-			return err;
+static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev)
+{
+	mlx5_notifier_unregister(dev->mdev, &dev->mdev_events);
+}
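Core device events are now consumed through a notifier chain rather than the removed .event callback of struct mlx5_interface: the driver embeds a struct notifier_block, registers it with mlx5_notifier_register() and unregisters it on cleanup. A small sketch of the handler side, with a hypothetical foo_mdev_event() standing in for mlx5_ib_event():

#include <linux/notifier.h>
#include <linux/mlx5/driver.h>

/* Hypothetical handler; the real callback wired up above is mlx5_ib_event(). */
static int foo_mdev_event(struct notifier_block *nb, unsigned long event,
			  void *data)
{
	struct mlx5_ib_dev *dev =
		container_of(nb, struct mlx5_ib_dev, mdev_events);

	mlx5_ib_dbg(dev, "mdev event 0x%lx\n", event);
	return NOTIFY_OK;
}

static void foo_register_events(struct mlx5_ib_dev *dev)
{
	dev->mdev_events.notifier_call = foo_mdev_event;
	mlx5_notifier_register(dev->mdev, &dev->mdev_events);
}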
+
+static int mlx5_ib_stage_devx_init(struct mlx5_ib_dev *dev)
+{
+	int uid;
+
+	uid = mlx5_ib_devx_create(dev, false);
+	if (uid > 0) {
+		dev->devx_whitelist_uid = uid;
+		mlx5_ib_devx_init_event_table(dev);
 	}
 
 	return 0;
 }
-
-static int mlx5_ib_stage_rep_reg_init(struct mlx5_ib_dev *dev)
+static void mlx5_ib_stage_devx_cleanup(struct mlx5_ib_dev *dev)
 {
-	mlx5_ib_register_vport_reps(dev);
-
-	return 0;
-}
-
-static void mlx5_ib_stage_rep_reg_cleanup(struct mlx5_ib_dev *dev)
-{
-	mlx5_ib_unregister_vport_reps(dev);
+	if (dev->devx_whitelist_uid) {
+		mlx5_ib_devx_cleanup_event_table(dev);
+		mlx5_ib_devx_destroy(dev, dev->devx_whitelist_uid);
+	}
 }
 
 void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
@@ -6148,7 +6721,8 @@
 			profile->stage[stage].cleanup(dev);
 	}
 
-	ib_dealloc_device((struct ib_device *)dev);
+	kfree(dev->port);
+	ib_dealloc_device(&dev->ib_dev);
 }
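The profile tables that follow drive a staged bring-up: __mlx5_ib_add() below runs each stage's .init in order (unwinding the stages already brought up if one fails), and __mlx5_ib_remove() above walks the cleanups in reverse. A reduced sketch of the same pattern with a hypothetical two-entry stage table:

/* Sketch of the staged init/cleanup pattern used by the mlx5_ib profiles. */
struct foo_dev;

struct foo_profile_stage {
	int  (*init)(struct foo_dev *dev);
	void (*cleanup)(struct foo_dev *dev);
};

#define FOO_NUM_STAGES 2

static void foo_remove(struct foo_dev *dev,
		       const struct foo_profile_stage *stages, int stage)
{
	/* Unwind in reverse order, starting below the failed/last stage. */
	while (stage-- > 0)
		if (stages[stage].cleanup)
			stages[stage].cleanup(dev);
}

static int foo_add(struct foo_dev *dev,
		   const struct foo_profile_stage *stages)
{
	int err, i;

	for (i = 0; i < FOO_NUM_STAGES; i++) {
		if (!stages[i].init)
			continue;
		err = stages[i].init(dev);
		if (err) {
			foo_remove(dev, stages, i);
			return err;
		}
	}
	return 0;
}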
 
 void *__mlx5_ib_add(struct mlx5_ib_dev *dev,
@@ -6157,8 +6731,6 @@
 	int err;
 	int i;
 
-	printk_once(KERN_INFO "%s", mlx5_version);
-
 	for (i = 0; i < MLX5_IB_STAGE_MAX; i++) {
 		if (profile->stage[i].init) {
 			err = profile->stage[i].init(dev);
@@ -6194,12 +6766,18 @@
 	STAGE_CREATE(MLX5_IB_STAGE_ROCE,
 		     mlx5_ib_stage_roce_init,
 		     mlx5_ib_stage_roce_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_SRQ,
+		     mlx5_init_srq_table,
+		     mlx5_cleanup_srq_table),
 	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
 		     mlx5_ib_stage_dev_res_init,
 		     mlx5_ib_stage_dev_res_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
+		     mlx5_ib_stage_dev_notifier_init,
+		     mlx5_ib_stage_dev_notifier_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_ODP,
 		     mlx5_ib_stage_odp_init,
-		     NULL),
+		     mlx5_ib_stage_odp_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
 		     mlx5_ib_stage_counters_init,
 		     mlx5_ib_stage_counters_cleanup),
@@ -6215,9 +6793,9 @@
 	STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
 		     NULL,
 		     mlx5_ib_stage_pre_ib_reg_umr_cleanup),
-	STAGE_CREATE(MLX5_IB_STAGE_SPECS,
-		     mlx5_ib_stage_populate_specs,
-		     NULL),
+	STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
+		     mlx5_ib_stage_devx_init,
+		     mlx5_ib_stage_devx_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
 		     mlx5_ib_stage_ib_reg_init,
 		     mlx5_ib_stage_ib_reg_cleanup),
@@ -6227,12 +6805,9 @@
 	STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP,
 		     mlx5_ib_stage_delay_drop_init,
 		     mlx5_ib_stage_delay_drop_cleanup),
-	STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR,
-		     mlx5_ib_stage_class_attr_init,
-		     NULL),
 };
 
-static const struct mlx5_ib_profile nic_rep_profile = {
+const struct mlx5_ib_profile uplink_rep_profile = {
 	STAGE_CREATE(MLX5_IB_STAGE_INIT,
 		     mlx5_ib_stage_init_init,
 		     mlx5_ib_stage_init_cleanup),
@@ -6248,9 +6823,15 @@
 	STAGE_CREATE(MLX5_IB_STAGE_ROCE,
 		     mlx5_ib_stage_rep_roce_init,
 		     mlx5_ib_stage_rep_roce_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_SRQ,
+		     mlx5_init_srq_table,
+		     mlx5_cleanup_srq_table),
 	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
 		     mlx5_ib_stage_dev_res_init,
 		     mlx5_ib_stage_dev_res_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
+		     mlx5_ib_stage_dev_notifier_init,
+		     mlx5_ib_stage_dev_notifier_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
 		     mlx5_ib_stage_counters_init,
 		     mlx5_ib_stage_counters_cleanup),
@@ -6263,21 +6844,15 @@
 	STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
 		     NULL,
 		     mlx5_ib_stage_pre_ib_reg_umr_cleanup),
-	STAGE_CREATE(MLX5_IB_STAGE_SPECS,
-		     mlx5_ib_stage_populate_specs,
-		     NULL),
+	STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
+		     mlx5_ib_stage_devx_init,
+		     mlx5_ib_stage_devx_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
 		     mlx5_ib_stage_ib_reg_init,
 		     mlx5_ib_stage_ib_reg_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
 		     mlx5_ib_stage_post_ib_reg_umr_init,
 		     NULL),
-	STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR,
-		     mlx5_ib_stage_class_attr_init,
-		     NULL),
-	STAGE_CREATE(MLX5_IB_STAGE_REP_REG,
-		     mlx5_ib_stage_rep_reg_init,
-		     mlx5_ib_stage_rep_reg_cleanup),
 };
 
 static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev)
@@ -6313,7 +6888,8 @@
 
 	if (!bound) {
 		list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list);
-		dev_dbg(&mdev->pdev->dev, "no suitable IB device found to bind to, added to unaffiliated list.\n");
+		dev_dbg(mdev->device,
+			"no suitable IB device found to bind to, added to unaffiliated list.\n");
 	}
 	mutex_unlock(&mlx5_ib_multiport_mutex);
 
@@ -6325,29 +6901,37 @@
 	enum rdma_link_layer ll;
 	struct mlx5_ib_dev *dev;
 	int port_type_cap;
+	int num_ports;
 
 	printk_once(KERN_INFO "%s", mlx5_version);
 
+	if (MLX5_ESWITCH_MANAGER(mdev) &&
+	    mlx5_ib_eswitch_mode(mdev->priv.eswitch) == MLX5_ESWITCH_OFFLOADS) {
+		if (!mlx5_core_mp_enabled(mdev))
+			mlx5_ib_register_vport_reps(mdev);
+		return mdev;
+	}
+
 	port_type_cap = MLX5_CAP_GEN(mdev, port_type);
 	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
 
 	if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET)
 		return mlx5_ib_add_slave_port(mdev);
 
-	dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
+	num_ports = max(MLX5_CAP_GEN(mdev, num_ports),
+			MLX5_CAP_GEN(mdev, num_vhca_ports));
+	dev = ib_alloc_device(mlx5_ib_dev, ib_dev);
 	if (!dev)
 		return NULL;
+	dev->port = kcalloc(num_ports, sizeof(*dev->port),
+			     GFP_KERNEL);
+	if (!dev->port) {
+		ib_dealloc_device(&dev->ib_dev);
+		return NULL;
+	}
 
 	dev->mdev = mdev;
-	dev->num_ports = max(MLX5_CAP_GEN(mdev, num_ports),
-			     MLX5_CAP_GEN(mdev, num_vhca_ports));
-
-	if (MLX5_ESWITCH_MANAGER(mdev) &&
-	    mlx5_ib_eswitch_mode(mdev->priv.eswitch) == SRIOV_OFFLOADS) {
-		dev->rep = mlx5_ib_vport_rep(mdev->priv.eswitch, 0);
-
-		return __mlx5_ib_add(dev, &nic_rep_profile);
-	}
+	dev->num_ports = num_ports;
 
 	return __mlx5_ib_add(dev, &pf_profile);
 }
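Allocation switched to the type-safe ib_alloc_device(drv_struct, member) macro, which sizes the allocation from the driver structure embedding struct ib_device and pairs with ib_dealloc_device() on the embedded member; per-port state is now a separately kcalloc'ed dev->port array freed in __mlx5_ib_remove(). A hedged sketch with a hypothetical driver structure:

#include <rdma/ib_verbs.h>

/* Hypothetical driver device that embeds struct ib_device. */
struct foo_ib_dev {
	struct ib_device ib_dev;
	int		 num_ports;
};

static struct foo_ib_dev *foo_alloc(int num_ports)
{
	struct foo_ib_dev *dev;

	/* Allocates sizeof(struct foo_ib_dev) and initializes the core part. */
	dev = ib_alloc_device(foo_ib_dev, ib_dev);
	if (!dev)
		return NULL;

	dev->num_ports = num_ports;
	return dev;
}

static void foo_free(struct foo_ib_dev *dev)
{
	ib_dealloc_device(&dev->ib_dev);
}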
@@ -6357,6 +6941,11 @@
 	struct mlx5_ib_multiport_info *mpi;
 	struct mlx5_ib_dev *dev;
 
+	if (MLX5_ESWITCH_MANAGER(mdev) && context == mdev) {
+		mlx5_ib_unregister_vport_reps(mdev);
+		return;
+	}
+
 	if (mlx5_core_is_mp_slave(mdev)) {
 		mpi = context;
 		mutex_lock(&mlx5_ib_multiport_mutex);
@@ -6364,6 +6953,7 @@
 			mlx5_ib_unbind_slave_port(mpi->ibdev, mpi);
 		list_del(&mpi->list);
 		mutex_unlock(&mlx5_ib_multiport_mutex);
+		kfree(mpi);
 		return;
 	}
 
@@ -6374,10 +6964,6 @@
 static struct mlx5_interface mlx5_ib_interface = {
 	.add            = mlx5_ib_add,
 	.remove         = mlx5_ib_remove,
-	.event          = mlx5_ib_event,
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	.pfault		= mlx5_ib_pfault,
-#endif
 	.protocol	= MLX5_INTERFACE_PROTOCOL_IB,
 };
 
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index f3dbd75..b5aece7 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -55,27 +55,16 @@
 	int i = 0;
 	struct scatterlist *sg;
 	int entry;
-	unsigned long page_shift = umem->page_shift;
 
-	if (umem->odp_data) {
-		*ncont = ib_umem_page_count(umem);
-		*count = *ncont << (page_shift - PAGE_SHIFT);
-		*shift = page_shift;
-		if (order)
-			*order = ilog2(roundup_pow_of_two(*ncont));
-
-		return;
-	}
-
-	addr = addr >> page_shift;
+	addr = addr >> PAGE_SHIFT;
 	tmp = (unsigned long)addr;
 	m = find_first_bit(&tmp, BITS_PER_LONG);
 	if (max_page_shift)
-		m = min_t(unsigned long, max_page_shift - page_shift, m);
+		m = min_t(unsigned long, max_page_shift - PAGE_SHIFT, m);
 
 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		len = sg_dma_len(sg) >> page_shift;
-		pfn = sg_dma_address(sg) >> page_shift;
+		len = sg_dma_len(sg) >> PAGE_SHIFT;
+		pfn = sg_dma_address(sg) >> PAGE_SHIFT;
 		if (base + p != pfn) {
 			/* If either the offset or the new
 			 * base are unaligned update m
@@ -107,11 +96,10 @@
 
 		*ncont = 0;
 	}
-	*shift = page_shift + m;
+	*shift = PAGE_SHIFT + m;
 	*count = i;
 }
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
 {
 	u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
@@ -123,7 +111,6 @@
 
 	return mtt_entry;
 }
-#endif
 
 /*
  * Populate the given array with bus addresses from the umem.
@@ -142,8 +129,7 @@
 			    int page_shift, size_t offset, size_t num_pages,
 			    __be64 *pas, int access_flags)
 {
-	unsigned long umem_page_shift = umem->page_shift;
-	int shift = page_shift - umem_page_shift;
+	int shift = page_shift - PAGE_SHIFT;
 	int mask = (1 << shift) - 1;
 	int i, k, idx;
 	u64 cur = 0;
@@ -151,25 +137,23 @@
 	int len;
 	struct scatterlist *sg;
 	int entry;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	const bool odp = umem->odp_data != NULL;
 
-	if (odp) {
+	if (umem->is_odp) {
 		WARN_ON(shift != 0);
 		WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE));
 
 		for (i = 0; i < num_pages; ++i) {
-			dma_addr_t pa = umem->odp_data->dma_list[offset + i];
+			dma_addr_t pa =
+				to_ib_umem_odp(umem)->dma_list[offset + i];
 
 			pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
 		}
 		return;
 	}
-#endif
 
 	i = 0;
 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		len = sg_dma_len(sg) >> umem_page_shift;
+		len = sg_dma_len(sg) >> PAGE_SHIFT;
 		base = sg_dma_address(sg);
 
 		/* Skip elements below offset */
@@ -188,7 +172,7 @@
 
 		for (; k < len; k++) {
 			if (!(i & mask)) {
-				cur = base + (k << umem_page_shift);
+				cur = base + (k << PAGE_SHIFT);
 				cur |= access_flags;
 				idx = (i >> shift) - offset;
 
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 320d4df..1a98ee2 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -36,29 +36,33 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
 #include <rdma/ib_smi.h>
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/cq.h>
+#include <linux/mlx5/fs.h>
 #include <linux/mlx5/qp.h>
-#include <linux/mlx5/srq.h>
 #include <linux/types.h>
 #include <linux/mlx5/transobj.h>
 #include <rdma/ib_user_verbs.h>
 #include <rdma/mlx5-abi.h>
 #include <rdma/uverbs_ioctl.h>
 #include <rdma/mlx5_user_ioctl_cmds.h>
+#include <rdma/mlx5_user_ioctl_verbs.h>
 
-#define mlx5_ib_dbg(dev, format, arg...)				\
-pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__,	\
-	 __LINE__, current->pid, ##arg)
+#include "srq.h"
 
-#define mlx5_ib_err(dev, format, arg...)				\
-pr_err("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__,	\
-	__LINE__, current->pid, ##arg)
+#define mlx5_ib_dbg(_dev, format, arg...)                                      \
+	dev_dbg(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__,      \
+		__LINE__, current->pid, ##arg)
 
-#define mlx5_ib_warn(dev, format, arg...)				\
-pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__,	\
-	__LINE__, current->pid, ##arg)
+#define mlx5_ib_err(_dev, format, arg...)                                      \
+	dev_err(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__,      \
+		__LINE__, current->pid, ##arg)
+
+#define mlx5_ib_warn(_dev, format, arg...)                                     \
+	dev_warn(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__,     \
+		 __LINE__, current->pid, ##arg)
 
 #define field_avail(type, fld, sz) (offsetof(type, fld) +		\
 				    sizeof(((type *)0)->fld) <= (sz))
@@ -114,12 +118,9 @@
 	MLX5_MEMIC_BASE_SIZE	= 1 << MLX5_MEMIC_BASE_ALIGN,
 };
 
-struct mlx5_ib_vma_private_data {
-	struct list_head list;
-	struct vm_area_struct *vma;
-	/* protect vma_private_list add/del */
-	struct mutex *vma_private_list_mutex;
-};
+#define MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)                                        \
+	(MLX5_CAP_DEV_MEM(dev, log_sw_icm_alloc_granularity))
+#define MLX5_SW_ICM_BLOCK_SIZE(dev) (1 << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev))
 
 struct mlx5_ib_ucontext {
 	struct ib_ucontext	ibucontext;
@@ -132,13 +133,12 @@
 	u8			cqe_version;
 	/* Transport Domain number */
 	u32			tdn;
-	struct list_head	vma_private_list;
-	/* protect vma_private_list add/del */
-	struct mutex		vma_private_list_mutex;
 
 	u64			lib_caps;
 	DECLARE_BITMAP(dm_pages, MLX5_MAX_MEMIC_PAGES);
 	u16			devx_uid;
+	/* For RoCE LAG TX affinity */
+	atomic_t		tx_port_affinity;
 };
 
 static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
@@ -149,6 +149,13 @@
 struct mlx5_ib_pd {
 	struct ib_pd		ibpd;
 	u32			pdn;
+	u16			uid;
+};
+
+enum {
+	MLX5_IB_FLOW_ACTION_MODIFY_HEADER,
+	MLX5_IB_FLOW_ACTION_PACKET_REFORMAT,
+	MLX5_IB_FLOW_ACTION_DECAP,
 };
 
 #define MLX5_IB_FLOW_MCAST_PRIO		(MLX5_BY_PASS_NUM_PRIOS - 1)
@@ -180,6 +187,7 @@
 	struct mlx5_ib_match_params matcher_mask;
 	int			mask_len;
 	enum mlx5_ib_flow_type	flow_type;
+	enum mlx5_flow_namespace_type ns_type;
 	u16			priority;
 	struct mlx5_core_dev	*mdev;
 	atomic_t		usecnt;
@@ -188,8 +196,11 @@
 
 struct mlx5_ib_flow_db {
 	struct mlx5_ib_flow_prio	prios[MLX5_IB_NUM_FLOW_FT];
+	struct mlx5_ib_flow_prio	egress_prios[MLX5_IB_NUM_FLOW_FT];
 	struct mlx5_ib_flow_prio	sniffer[MLX5_IB_NUM_SNIFFER_FTS];
 	struct mlx5_ib_flow_prio	egress[MLX5_IB_NUM_EGRESS_FTS];
+	struct mlx5_ib_flow_prio	fdb;
+	struct mlx5_ib_flow_prio	rdma_rx[MLX5_IB_NUM_FLOW_FT];
 	struct mlx5_flow_table		*lag_demux_ft;
 	/* Protect flow steering bypass flow tables
 	 * when add/del flow rules.
@@ -254,6 +265,7 @@
 };
 
 struct mlx5_ib_wq {
+	struct mlx5_frag_buf_ctrl fbc;
 	u64		       *wrid;
 	u32		       *wr_data;
 	struct wr_list	       *w_list;
@@ -271,8 +283,7 @@
 	unsigned		head;
 	unsigned		tail;
 	u16			cur_post;
-	u16			last_poll;
-	void		       *qend;
+	void			*cur_edge;
 };
 
 enum mlx5_ib_wq_flags {
@@ -322,6 +333,7 @@
 struct mlx5_ib_rwq_ind_table {
 	struct ib_rwq_ind_table ib_rwq_ind_tbl;
 	u32			rqtn;
+	u16			uid;
 };
 
 struct mlx5_ib_ubuffer {
@@ -420,21 +432,23 @@
 
 	int			create_type;
 
-	/* Store signature errors */
-	bool			signature_en;
-
 	struct list_head	qps_list;
 	struct list_head	cq_recv_list;
 	struct list_head	cq_send_list;
 	struct mlx5_rate_limit	rl;
 	u32                     underlay_qpn;
-	bool			tunnel_offload_en;
+	u32			flags_en;
 	/* storage for qp sub type when core qp type is IB_QPT_DRIVER */
 	enum ib_qp_type		qp_sub_type;
+	/* A flag to indicate that a new counter has been configured
+	 * but has not yet taken effect
+	 */
+	u32                     counter_pending;
 };
 
 struct mlx5_ib_cq_buf {
 	struct mlx5_frag_buf_ctrl fbc;
+	struct mlx5_frag_buf    frag_buf;
 	struct ib_umem		*umem;
 	int			cqe_size;
 	int			nent;
@@ -455,6 +469,7 @@
 	MLX5_IB_QP_UNDERLAY			= 1 << 10,
 	MLX5_IB_QP_PCI_WRITE_END_PADDING	= 1 << 11,
 	MLX5_IB_QP_TUNNEL_OFFLOAD		= 1 << 12,
+	MLX5_IB_QP_PACKET_BASED_CREDIT		= 1 << 13,
 };
 
 struct mlx5_umr_wr {
@@ -467,6 +482,7 @@
 	u64				length;
 	int				access_flags;
 	u32				mkey;
+	u8				ignore_free_state:1;
 };
 
 static inline const struct mlx5_umr_wr *umr_wr(const struct ib_send_wr *wr)
@@ -518,6 +534,7 @@
 	struct mlx5_core_srq	msrq;
 	struct mlx5_frag_buf	buf;
 	struct mlx5_db		db;
+	struct mlx5_frag_buf_ctrl fbc;
 	u64		       *wrid;
 	/* protect SRQ handling
 	 */
@@ -545,21 +562,37 @@
 struct mlx5_ib_dm {
 	struct ib_dm		ibdm;
 	phys_addr_t		dev_addr;
+	u32			type;
+	size_t			size;
+	union {
+		struct {
+			u32	obj_id;
+		} icm_dm;
+		/* other dm types specific params should be added here */
+	};
 };
 
 #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)
 
-#define MLX5_IB_DM_ALLOWED_ACCESS (IB_ACCESS_LOCAL_WRITE   |\
-				   IB_ACCESS_REMOTE_WRITE  |\
-				   IB_ACCESS_REMOTE_READ   |\
-				   IB_ACCESS_REMOTE_ATOMIC |\
-				   IB_ZERO_BASED)
+#define MLX5_IB_DM_MEMIC_ALLOWED_ACCESS (IB_ACCESS_LOCAL_WRITE   |\
+					 IB_ACCESS_REMOTE_WRITE  |\
+					 IB_ACCESS_REMOTE_READ   |\
+					 IB_ACCESS_REMOTE_ATOMIC |\
+					 IB_ZERO_BASED)
+
+#define MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS (IB_ACCESS_LOCAL_WRITE   |\
+					  IB_ACCESS_REMOTE_WRITE  |\
+					  IB_ACCESS_REMOTE_READ   |\
+					  IB_ZERO_BASED)
 
 struct mlx5_ib_mr {
 	struct ib_mr		ibmr;
 	void			*descs;
 	dma_addr_t		desc_map;
 	int			ndescs;
+	int			data_length;
+	int			meta_ndescs;
+	int			meta_length;
 	int			max_descs;
 	int			desc_size;
 	int			access_mode;
@@ -573,21 +606,41 @@
 	struct mlx5_ib_dev     *dev;
 	u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
 	struct mlx5_core_sig_ctx    *sig;
-	int			live;
+	unsigned int		live;
 	void			*descs_alloc;
 	int			access_flags; /* Needed for rereg MR */
 
 	struct mlx5_ib_mr      *parent;
+	/* Needed for IB_MR_TYPE_INTEGRITY */
+	struct mlx5_ib_mr      *pi_mr;
+	struct mlx5_ib_mr      *klm_mr;
+	struct mlx5_ib_mr      *mtt_mr;
+	u64			data_iova;
+	u64			pi_iova;
+
 	atomic_t		num_leaf_free;
 	wait_queue_head_t       q_leaf_free;
+	struct mlx5_async_work  cb_work;
+	atomic_t		num_pending_prefetch;
 };
 
+static inline bool is_odp_mr(struct mlx5_ib_mr *mr)
+{
+	return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem &&
+	       mr->umem->is_odp;
+}
+
 struct mlx5_ib_mw {
 	struct ib_mw		ibmw;
 	struct mlx5_core_mkey	mmkey;
 	int			ndescs;
 };
 
+struct mlx5_ib_devx_mr {
+	struct mlx5_core_mkey	mmkey;
+	int			ndescs;
+};
+
 struct mlx5_ib_umr_context {
 	struct ib_cqe		cqe;
 	enum ib_wc_status	status;
@@ -616,7 +669,6 @@
 	spinlock_t		lock;
 
 
-	struct dentry	       *dir;
 	char                    name[4];
 	u32                     order;
 	u32			xlt;
@@ -628,11 +680,6 @@
 	u32                     miss;
 	u32			limit;
 
-	struct dentry          *fsize;
-	struct dentry          *fcur;
-	struct dentry          *fmiss;
-	struct dentry          *flimit;
-
 	struct mlx5_ib_dev     *dev;
 	struct work_struct	work;
 	struct delayed_work	dwork;
@@ -686,12 +733,6 @@
 	spinlock_t mpi_lock;
 };
 
-struct mlx5_ib_port {
-	struct mlx5_ib_counters cnts;
-	struct mlx5_ib_multiport mp;
-	struct mlx5_ib_dbg_cc_params	*dbg_cc_params;
-};
-
 struct mlx5_roce {
 	/* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL
 	 * netdev pointer
@@ -699,12 +740,20 @@
 	rwlock_t		netdev_lock;
 	struct net_device	*netdev;
 	struct notifier_block	nb;
-	atomic_t		next_port;
+	atomic_t		tx_port_affinity;
 	enum ib_port_state last_port_state;
 	struct mlx5_ib_dev	*dev;
 	u8			native_port_num;
 };
 
+struct mlx5_ib_port {
+	struct mlx5_ib_counters cnts;
+	struct mlx5_ib_multiport mp;
+	struct mlx5_ib_dbg_cc_params *dbg_cc_params;
+	struct mlx5_roce roce;
+	struct mlx5_eswitch_rep		*rep;
+};
+
 struct mlx5_ib_dbg_param {
 	int			offset;
 	struct mlx5_ib_dev	*dev;
@@ -768,19 +817,20 @@
 	MLX5_IB_STAGE_CAPS,
 	MLX5_IB_STAGE_NON_DEFAULT_CB,
 	MLX5_IB_STAGE_ROCE,
+	MLX5_IB_STAGE_SRQ,
 	MLX5_IB_STAGE_DEVICE_RESOURCES,
+	MLX5_IB_STAGE_DEVICE_NOTIFIER,
 	MLX5_IB_STAGE_ODP,
 	MLX5_IB_STAGE_COUNTERS,
 	MLX5_IB_STAGE_CONG_DEBUGFS,
 	MLX5_IB_STAGE_UAR,
 	MLX5_IB_STAGE_BFREG,
 	MLX5_IB_STAGE_PRE_IB_REG_UMR,
-	MLX5_IB_STAGE_SPECS,
+	MLX5_IB_STAGE_WHITELIST_UID,
 	MLX5_IB_STAGE_IB_REG,
 	MLX5_IB_STAGE_POST_IB_REG_UMR,
 	MLX5_IB_STAGE_DELAY_DROP,
 	MLX5_IB_STAGE_CLASS_ATTR,
-	MLX5_IB_STAGE_REP_REG,
 	MLX5_IB_STAGE_MAX,
 };
 
@@ -800,6 +850,7 @@
 	struct list_head list;
 	struct mlx5_ib_dev *ibdev;
 	struct mlx5_core_dev *mdev;
+	struct notifier_block mdev_events;
 	struct completion unref_comp;
 	u64 sys_image_guid;
 	u32 mdev_refcnt;
@@ -814,12 +865,24 @@
 			u64			    ib_flags;
 			struct mlx5_accel_esp_xfrm *ctx;
 		} esp_aes_gcm;
+		struct {
+			struct mlx5_ib_dev *dev;
+			u32 sub_type;
+			union {
+				struct mlx5_modify_hdr *modify_hdr;
+				struct mlx5_pkt_reformat *pkt_reformat;
+			};
+		} flow_action_raw;
 	};
 };
 
-struct mlx5_memic {
+struct mlx5_dm {
 	struct mlx5_core_dev *dev;
-	spinlock_t		memic_lock;
+	/* This lock is used to protect the access to the shared
+	 * allocation map when concurrent requests by different
+	 * processes are handled.
+	 */
+	spinlock_t lock;
 	DECLARE_BITMAP(memic_alloc_pages, MLX5_MAX_MEMIC_PAGES);
 };
 
@@ -858,11 +921,38 @@
 	return container_of(ibcntrs, struct mlx5_ib_mcounters, ibcntrs);
 }
 
+int parse_flow_flow_action(struct mlx5_ib_flow_action *maction,
+			   bool is_egress,
+			   struct mlx5_flow_act *action);
+struct mlx5_ib_lb_state {
+	/* protect the user_td */
+	struct mutex		mutex;
+	u32			user_td;
+	int			qps;
+	bool			enabled;
+};
+
+struct mlx5_ib_pf_eq {
+	struct notifier_block irq_nb;
+	struct mlx5_ib_dev *dev;
+	struct mlx5_eq *core;
+	struct work_struct work;
+	spinlock_t lock; /* Pagefaults spinlock */
+	struct workqueue_struct *wq;
+	mempool_t *pool;
+};
+
+struct mlx5_devx_event_table {
+	struct mlx5_nb devx_nb;
+	/* serialize updating the event_xa */
+	struct mutex event_xa_lock;
+	struct xarray event_xa;
+};
+
 struct mlx5_ib_dev {
 	struct ib_device		ib_dev;
-	const struct uverbs_object_tree_def *driver_trees[6];
 	struct mlx5_core_dev		*mdev;
-	struct mlx5_roce		roce[MLX5_MAX_PORTS];
+	struct notifier_block		mdev_events;
 	int				num_ports;
 	/* serialize update of capability mask
 	 */
@@ -877,16 +967,16 @@
 	/* Prevents soft lock on massive reg MRs */
 	struct mutex			slow_path_mutex;
 	int				fill_delay;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 	struct ib_odp_caps	odp_caps;
 	u64			odp_max_size;
+	struct mlx5_ib_pf_eq	odp_pf_eq;
+
 	/*
 	 * Sleepable RCU that prevents destruction of MRs while they are still
 	 * being used by a page fault handler.
 	 */
 	struct srcu_struct      mr_srcu;
 	u32			null_mkey;
-#endif
 	struct mlx5_ib_flow_db	*flow_db;
 	/* protect resources needed as part of reset flow */
 	spinlock_t		reset_flow_resource_lock;
@@ -897,15 +987,18 @@
 	struct mlx5_sq_bfreg	fp_bfreg;
 	struct mlx5_ib_delay_drop	delay_drop;
 	const struct mlx5_ib_profile	*profile;
-	struct mlx5_eswitch_rep		*rep;
+	bool			is_rep;
+	int				lag_active;
 
-	/* protect the user_td */
-	struct mutex		lb_mutex;
-	u32			user_td;
+	struct mlx5_ib_lb_state		lb;
 	u8			umr_fence;
 	struct list_head	ib_dev_list;
 	u64			sys_image_guid;
-	struct mlx5_memic	memic;
+	struct mlx5_dm		dm;
+	u16			devx_whitelist_uid;
+	struct mlx5_srq_table   srq_table;
+	struct mlx5_async_ctx   async_ctx;
+	struct mlx5_devx_event_table devx_event_table;
 };
 
 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
@@ -923,6 +1016,14 @@
 	return container_of(ibdev, struct mlx5_ib_dev, ib_dev);
 }
 
+static inline struct mlx5_ib_dev *mlx5_udata_to_mdev(struct ib_udata *udata)
+{
+	struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
+
+	return to_mdev(context->ibucontext.device);
+}
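mlx5_udata_to_mdev() builds on rdma_udata_to_drv_context(), which recovers the driver ucontext from the ib_udata handed to a verb, or NULL when no udata was supplied (in-kernel callers). A usage sketch inside a hypothetical verb handler:

/* Sketch: resolving the caller's ucontext inside a hypothetical verb. */
static int foo_verb(struct ib_pd *pd, struct ib_udata *udata)
{
	struct mlx5_ib_ucontext *ctx = rdma_udata_to_drv_context(
		udata, struct mlx5_ib_ucontext, ibucontext);
	struct mlx5_ib_dev *dev = to_mdev(pd->device);

	if (!ctx)		/* NULL for in-kernel callers without udata */
		return -EINVAL;

	mlx5_ib_dbg(dev, "user call, devx uid %d\n", ctx->devx_uid);
	return 0;
}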
+
 static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq)
 {
 	return container_of(ibcq, struct mlx5_ib_cq, ibcq);
@@ -994,28 +1095,27 @@
 	return container_of(ibact, struct mlx5_ib_flow_action, ib_action);
 }
 
-int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,
+int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context,
+			struct ib_udata *udata, unsigned long virt,
 			struct mlx5_db *db);
 void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db);
 void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq);
 void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq);
 void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index);
-int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey,
-		 u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-		 const void *in_mad, void *response_mad);
-struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
-				struct ib_udata *udata);
+int mlx5_ib_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags,
+		      struct ib_udata *udata);
 int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
-int mlx5_ib_destroy_ah(struct ib_ah *ah);
-struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
-				  struct ib_srq_init_attr *init_attr,
-				  struct ib_udata *udata);
+void mlx5_ib_destroy_ah(struct ib_ah *ah, u32 flags);
+int mlx5_ib_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *init_attr,
+		       struct ib_udata *udata);
 int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 		       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
 int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr);
-int mlx5_ib_destroy_srq(struct ib_srq *srq);
+void mlx5_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata);
 int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
 			  const struct ib_recv_wr **bad_wr);
+int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp);
+void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp);
 struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
 				struct ib_qp_init_attr *init_attr,
 				struct ib_udata *udata);
@@ -1023,22 +1123,22 @@
 		      int attr_mask, struct ib_udata *udata);
 int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
 		     struct ib_qp_init_attr *qp_init_attr);
-int mlx5_ib_destroy_qp(struct ib_qp *qp);
+int mlx5_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata);
 void mlx5_ib_drain_sq(struct ib_qp *qp);
 void mlx5_ib_drain_rq(struct ib_qp *qp);
 int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 		      const struct ib_send_wr **bad_wr);
 int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
 		      const struct ib_recv_wr **bad_wr);
-void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n);
-int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
-			  void *buffer, u32 length,
-			  struct mlx5_ib_qp_base *base);
-struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
-				const struct ib_cq_init_attr *attr,
-				struct ib_ucontext *context,
-				struct ib_udata *udata);
-int mlx5_ib_destroy_cq(struct ib_cq *cq);
+int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
+			     int buflen, size_t *bc);
+int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
+			     int buflen, size_t *bc);
+int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index,
+			      void *buffer, int buflen, size_t *bc);
+int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		      struct ib_udata *udata);
+void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
 int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
@@ -1047,32 +1147,44 @@
 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 virt_addr, int access_flags,
 				  struct ib_udata *udata);
+int mlx5_ib_advise_mr(struct ib_pd *pd,
+		      enum ib_uverbs_advise_mr_advice advice,
+		      u32 flags,
+		      struct ib_sge *sg_list,
+		      u32 num_sge,
+		      struct uverbs_attr_bundle *attrs);
 struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
 			       struct ib_udata *udata);
 int mlx5_ib_dealloc_mw(struct ib_mw *mw);
 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 		       int page_shift, int flags);
 struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
+					     struct ib_udata *udata,
 					     int access_flags);
 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
 int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 			  u64 length, u64 virt_addr, int access_flags,
 			  struct ib_pd *pd, struct ib_udata *udata);
-int mlx5_ib_dereg_mr(struct ib_mr *ibmr);
-struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
-			       enum ib_mr_type mr_type,
-			       u32 max_num_sg);
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
+struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+			       u32 max_num_sg, struct ib_udata *udata);
+struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
+					 u32 max_num_sg,
+					 u32 max_num_meta_sg);
 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 		      unsigned int *sg_offset);
+int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
+			 int data_sg_nents, unsigned int *data_sg_offset,
+			 struct scatterlist *meta_sg, int meta_sg_nents,
+			 unsigned int *meta_sg_offset);
 int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 			const struct ib_wc *in_wc, const struct ib_grh *in_grh,
 			const struct ib_mad_hdr *in, size_t in_mad_size,
 			struct ib_mad_hdr *out, size_t *out_mad_size,
 			u16 *out_mad_pkey_index);
 struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
-					  struct ib_ucontext *context,
-					  struct ib_udata *udata);
-int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd);
+				   struct ib_udata *udata);
+int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata);
 int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset);
 int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port);
 int mlx5_query_mad_ifc_smp_attr_node_info(struct ib_device *ibdev,
@@ -1105,7 +1217,7 @@
 void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 			  int page_shift, __be64 *pas, int access_flags);
 void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
-int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq);
+int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
 
@@ -1116,7 +1228,7 @@
 struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
 				struct ib_wq_init_attr *init_attr,
 				struct ib_udata *udata);
-int mlx5_ib_destroy_wq(struct ib_wq *wq);
+void mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
 int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
 		      u32 wq_attr_mask, struct ib_udata *udata);
 struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
@@ -1128,23 +1240,26 @@
 			       struct ib_ucontext *context,
 			       struct ib_dm_alloc_attr *attr,
 			       struct uverbs_attr_bundle *attrs);
-int mlx5_ib_dealloc_dm(struct ib_dm *ibdm);
+int mlx5_ib_dealloc_dm(struct ib_dm *ibdm, struct uverbs_attr_bundle *attrs);
 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
 				struct ib_dm_mr_attr *attr,
 				struct uverbs_attr_bundle *attrs);
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev);
-void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
-		    struct mlx5_pagefault *pfault);
 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
+void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
-void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
+void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 			      unsigned long end);
 void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
 void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
 			   size_t nentries, struct mlx5_ib_mr *mr, int flags);
+
+int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
+			       enum ib_uverbs_advise_mr_advice advice,
+			       u32 flags, struct ib_sge *sg_list, u32 num_sge);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 {
@@ -1152,6 +1267,7 @@
 }
 
 static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
+static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {}
 static inline int mlx5_ib_odp_init(void) { return 0; }
 static inline void mlx5_ib_odp_cleanup(void)				    {}
 static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {}
@@ -1159,27 +1275,19 @@
 					 size_t nentries, struct mlx5_ib_mr *mr,
 					 int flags) {}
 
+static inline int
+mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
+			   enum ib_uverbs_advise_mr_advice advice, u32 flags,
+			   struct ib_sge *sg_list, u32 num_sge)
+{
+	return -EOPNOTSUPP;
+}
+static inline void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp,
+					    unsigned long start,
+					    unsigned long end){};
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
 /* Needed for rep profile */
-int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev);
-void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_rep_flow_db_init(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_rep_non_default_cb(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_rep_roce_init(struct mlx5_ib_dev *dev);
-void mlx5_ib_stage_rep_roce_cleanup(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev);
-void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev);
-void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev);
-void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev);
-void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev);
-void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_class_attr_init(struct mlx5_ib_dev *dev);
 void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
 		      const struct mlx5_ib_profile *profile,
 		      int stage);
@@ -1199,7 +1307,7 @@
 			       const struct ib_gid_attr *attr);
 
 void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num);
-int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num);
+void mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num);
 
 /* GSI QP helper functions */
 struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd,
@@ -1228,34 +1336,39 @@
 				  u8 port_num);
 
 #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
-int mlx5_ib_devx_create(struct mlx5_ib_dev *dev,
-			struct mlx5_ib_ucontext *context);
-void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev,
-			  struct mlx5_ib_ucontext *context);
+int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user);
+void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid);
+void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev);
+void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev);
 const struct uverbs_object_tree_def *mlx5_ib_get_devx_tree(void);
+extern const struct uapi_definition mlx5_ib_devx_defs[];
+extern const struct uapi_definition mlx5_ib_flow_defs[];
 struct mlx5_ib_flow_handler *mlx5_ib_raw_fs_rule_add(
 	struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher,
+	struct mlx5_flow_context *flow_context,
+	struct mlx5_flow_act *flow_act, u32 counter_id,
 	void *cmd_in, int inlen, int dest_id, int dest_type);
 bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type);
+bool mlx5_ib_devx_is_flow_counter(void *obj, u32 *counter_id);
 int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root);
+void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction);
 #else
 static inline int
 mlx5_ib_devx_create(struct mlx5_ib_dev *dev,
-		    struct mlx5_ib_ucontext *context) { return -EOPNOTSUPP; };
-static inline void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev,
-					struct mlx5_ib_ucontext *context) {}
-static inline const struct uverbs_object_tree_def *
-mlx5_ib_get_devx_tree(void) { return NULL; }
+			   bool is_user) { return -EOPNOTSUPP; }
+static inline void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid) {}
+static inline void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev) {}
+static inline void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev) {}
 static inline bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id,
 					     int *dest_type)
 {
 	return false;
 }
-static inline int
-mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root)
+static inline void
+mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction)
 {
-	return 0;
-}
+	return;
+};
 #endif
 static inline void init_query_mad(struct ib_smp *mad)
 {
@@ -1361,4 +1474,21 @@
 int bfregn_to_uar_index(struct mlx5_ib_dev *dev,
 			struct mlx5_bfreg_info *bfregi, u32 bfregn,
 			bool dyn_bfreg);
+
+int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter);
+u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u8 port_num);
+
+static inline bool mlx5_ib_can_use_umr(struct mlx5_ib_dev *dev,
+				       bool do_modify_atomic)
+{
+	if (MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled))
+		return false;
+
+	if (do_modify_atomic &&
+	    MLX5_CAP_GEN(dev->mdev, atomic) &&
+	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
+		return false;
+
+	return true;
+}
 #endif /* MLX5_IB_H */
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 7df4a4f..7019c12 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -51,30 +51,19 @@
 static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
 static int mr_cache_max_order(struct mlx5_ib_dev *dev);
 static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
-static bool umr_can_modify_entity_size(struct mlx5_ib_dev *dev)
-{
-	return !MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled);
-}
 
 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
 {
 	return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
 }
 
-static bool use_umr(struct mlx5_ib_dev *dev, int order)
-{
-	return order <= mr_cache_max_order(dev) &&
-		umr_can_modify_entity_size(dev);
-}
-
 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
 	int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	/* Wait until all page fault handlers using the mr complete. */
-	synchronize_srcu(&dev->mr_srcu);
-#endif
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+		/* Wait until all page fault handlers using the mr complete. */
+		synchronize_srcu(&dev->mr_srcu);
 
 	return err;
 }
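The CONFIG_INFINIBAND_ON_DEMAND_PAGING #ifdef blocks become IS_ENABLED() checks, so the ODP paths are compiled in every configuration and the dead branch is optimized away when the option is off; this relies on fields such as mr_srcu now being declared unconditionally in struct mlx5_ib_dev. The idiom in isolation, as a sketch:

#include <linux/srcu.h>

static void foo_wait_for_pagefault_handlers(struct mlx5_ib_dev *dev)
{
	/* Always compiled; the branch is constant-folded away when
	 * CONFIG_INFINIBAND_ON_DEMAND_PAGING=n.  This only works because
	 * mr_srcu is declared unconditionally in struct mlx5_ib_dev.
	 */
	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
		synchronize_srcu(&dev->mr_srcu);
}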
@@ -95,44 +84,17 @@
 		length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1));
 }
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-static void update_odp_mr(struct mlx5_ib_mr *mr)
+static void reg_mr_callback(int status, struct mlx5_async_work *context)
 {
-	if (mr->umem->odp_data) {
-		/*
-		 * This barrier prevents the compiler from moving the
-		 * setting of umem->odp_data->private to point to our
-		 * MR, before reg_umr finished, to ensure that the MR
-		 * initialization have finished before starting to
-		 * handle invalidations.
-		 */
-		smp_wmb();
-		mr->umem->odp_data->private = mr;
-		/*
-		 * Make sure we will see the new
-		 * umem->odp_data->private value in the invalidation
-		 * routines, before we can get page faults on the
-		 * MR. Page faults can happen once we put the MR in
-		 * the tree, below this line. Without the barrier,
-		 * there can be a fault handling and an invalidation
-		 * before umem->odp_data->private == mr is visible to
-		 * the invalidation handler.
-		 */
-		smp_wmb();
-	}
-}
-#endif
-
-static void reg_mr_callback(int status, void *context)
-{
-	struct mlx5_ib_mr *mr = context;
+	struct mlx5_ib_mr *mr =
+		container_of(context, struct mlx5_ib_mr, cb_work);
 	struct mlx5_ib_dev *dev = mr->dev;
 	struct mlx5_mr_cache *cache = &dev->cache;
 	int c = order2idx(dev, mr->order);
 	struct mlx5_cache_ent *ent = &cache->ent[c];
 	u8 key;
 	unsigned long flags;
-	struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table;
+	struct xarray *mkeys = &dev->mdev->priv.mkey_table;
 	int err;
 
 	spin_lock_irqsave(&ent->lock, flags);
@@ -160,12 +122,12 @@
 	ent->size++;
 	spin_unlock_irqrestore(&ent->lock, flags);
 
-	write_lock_irqsave(&table->lock, flags);
-	err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmkey.key),
-				&mr->mmkey);
+	xa_lock_irqsave(mkeys, flags);
+	err = xa_err(__xa_store(mkeys, mlx5_base_mkey(mr->mmkey.key),
+				&mr->mmkey, GFP_ATOMIC));
+	xa_unlock_irqrestore(mkeys, flags);
 	if (err)
 		pr_err("Error inserting to mkey tree. 0x%x\n", -err);
-	write_unlock_irqrestore(&table->lock, flags);
 
 	if (!completion_done(&ent->compl))
 		complete(&ent->compl);
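The mkey table moves from a rwlock-protected radix tree to an XArray: insertion takes the array's internal lock with xa_lock_irqsave(), stores with __xa_store(..., GFP_ATOMIC) and extracts any error with xa_err(), while lookups need no caller locking. A self-contained sketch of the same calls on a hypothetical table:

#include <linux/xarray.h>

static DEFINE_XARRAY(foo_mkeys);	/* index: mkey base, entry: driver object */

static int foo_insert_mkey(u32 base_key, void *obj)
{
	unsigned long flags;
	int err;

	xa_lock_irqsave(&foo_mkeys, flags);
	err = xa_err(__xa_store(&foo_mkeys, base_key, obj, GFP_ATOMIC));
	xa_unlock_irqrestore(&foo_mkeys, flags);

	return err;
}

static void *foo_find_mkey(u32 base_key)
{
	/* Lookups are RCU-safe and need no caller locking. */
	return xa_load(&foo_mkeys, base_key);
}

static void foo_remove_mkey(u32 base_key)
{
	xa_erase(&foo_mkeys, base_key);
}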
@@ -216,9 +178,9 @@
 		ent->pending++;
 		spin_unlock_irq(&ent->lock);
 		err = mlx5_core_create_mkey_cb(dev->mdev, &mr->mmkey,
-					       in, inlen,
+					       &dev->async_ctx, in, inlen,
 					       mr->out, sizeof(mr->out),
-					       reg_mr_callback, mr);
+					       reg_mr_callback, &mr->cb_work);
 		if (err) {
 			spin_lock_irq(&ent->lock);
 			ent->pending--;
@@ -256,9 +218,8 @@
 		mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 	}
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	synchronize_srcu(&dev->mr_srcu);
-#endif
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+		synchronize_srcu(&dev->mr_srcu);
 
 	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
 		list_del(&mr->list);
@@ -548,14 +509,17 @@
 		return;
 
 	c = order2idx(dev, mr->order);
-	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
-		mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c);
+	WARN_ON(c < 0 || c >= MAX_MR_CACHE_ENTRIES);
+
+	if (unreg_umr(dev, mr)) {
+		mr->allocated_from_cache = false;
+		destroy_mkey(dev, mr);
+		ent = &cache->ent[c];
+		if (ent->cur < ent->limit)
+			queue_work(cache->wq, &ent->work);
 		return;
 	}
 
-	if (unreg_umr(dev, mr))
-		return;
-
 	ent = &cache->ent[c];
 	spin_lock_irq(&ent->lock);
 	list_add_tail(&mr->list, &ent->head);
@@ -603,59 +567,34 @@
 
 static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 {
-	if (!mlx5_debugfs_root || dev->rep)
+	if (!mlx5_debugfs_root || dev->is_rep)
 		return;
 
 	debugfs_remove_recursive(dev->cache.root);
 	dev->cache.root = NULL;
 }
 
-static int mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
+static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_mr_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
+	struct dentry *dir;
 	int i;
 
-	if (!mlx5_debugfs_root || dev->rep)
-		return 0;
+	if (!mlx5_debugfs_root || dev->is_rep)
+		return;
 
 	cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
-	if (!cache->root)
-		return -ENOMEM;
 
 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 		ent = &cache->ent[i];
 		sprintf(ent->name, "%d", ent->order);
-		ent->dir = debugfs_create_dir(ent->name,  cache->root);
-		if (!ent->dir)
-			goto err;
-
-		ent->fsize = debugfs_create_file("size", 0600, ent->dir, ent,
-						 &size_fops);
-		if (!ent->fsize)
-			goto err;
-
-		ent->flimit = debugfs_create_file("limit", 0600, ent->dir, ent,
-						  &limit_fops);
-		if (!ent->flimit)
-			goto err;
-
-		ent->fcur = debugfs_create_u32("cur", 0400, ent->dir,
-					       &ent->cur);
-		if (!ent->fcur)
-			goto err;
-
-		ent->fmiss = debugfs_create_u32("miss", 0600, ent->dir,
-						&ent->miss);
-		if (!ent->fmiss)
-			goto err;
+		dir = debugfs_create_dir(ent->name, cache->root);
+		debugfs_create_file("size", 0600, dir, ent, &size_fops);
+		debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
+		debugfs_create_u32("cur", 0400, dir, &ent->cur);
+		debugfs_create_u32("miss", 0600, dir, &ent->miss);
 	}
-
-	return 0;
-err:
-	mlx5_mr_cache_debugfs_cleanup(dev);
-
-	return -ENOMEM;
 }
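mlx5_mr_cache_debugfs_init() now returns void and ignores the debugfs_create_*() results: debugfs failures are not supposed to be propagated, and debugfs_remove_recursive() on the cache root tears the whole tree down, which is also why the per-entry dentry bookkeeping in mlx5_cache_ent could be dropped. A compact sketch of the same approach for a hypothetical stats directory:

#include <linux/debugfs.h>

static struct dentry *foo_dbg_root;
static u32 foo_hits;

static void foo_debugfs_init(void)
{
	struct dentry *dir;

	/* No error checks: debugfs failures must not fail the driver. */
	foo_dbg_root = debugfs_create_dir("foo_cache", NULL);
	dir = debugfs_create_dir("stats", foo_dbg_root);
	debugfs_create_u32("hits", 0400, dir, &foo_hits);
}

static void foo_debugfs_cleanup(void)
{
	debugfs_remove_recursive(foo_dbg_root);
	foo_dbg_root = NULL;
}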
 
 static void delay_time_func(struct timer_list *t)
@@ -669,7 +608,6 @@
 {
 	struct mlx5_mr_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
-	int err;
 	int i;
 
 	mutex_init(&dev->slow_path_mutex);
@@ -679,6 +617,7 @@
 		return -ENOMEM;
 	}
 
+	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
 	timer_setup(&dev->delay_timer, delay_time_func, 0);
 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 		ent = &cache->ent[i];
@@ -705,7 +644,7 @@
 			   MLX5_IB_UMR_OCTOWORD;
 		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
 		if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) &&
-		    !dev->rep &&
+		    !dev->is_rep &&
 		    mlx5_core_is_pf(dev->mdev))
 			ent->limit = dev->mdev->profile->mr_cache[i].limit;
 		else
@@ -713,45 +652,11 @@
 		queue_work(cache->wq, &ent->work);
 	}
 
-	err = mlx5_mr_cache_debugfs_init(dev);
-	if (err)
-		mlx5_ib_warn(dev, "cache debugfs failure\n");
-
-	/*
-	 * We don't want to fail driver if debugfs failed to initialize,
-	 * so we are not forwarding error to the user.
-	 */
+	mlx5_mr_cache_debugfs_init(dev);
 
 	return 0;
 }
 
-static void wait_for_async_commands(struct mlx5_ib_dev *dev)
-{
-	struct mlx5_mr_cache *cache = &dev->cache;
-	struct mlx5_cache_ent *ent;
-	int total = 0;
-	int i;
-	int j;
-
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
-		ent = &cache->ent[i];
-		for (j = 0 ; j < 1000; j++) {
-			if (!ent->pending)
-				break;
-			msleep(50);
-		}
-	}
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
-		ent = &cache->ent[i];
-		total += ent->pending;
-	}
-
-	if (total)
-		mlx5_ib_warn(dev, "aborted while there are %d pending mr requests\n", total);
-	else
-		mlx5_ib_warn(dev, "done with all pending requests\n");
-}
-
 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
 {
 	int i;
@@ -763,12 +668,12 @@
 	flush_workqueue(dev->cache.wq);
 
 	mlx5_mr_cache_debugfs_cleanup(dev);
+	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
 
 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
 		clean_keys(dev, i);
 
 	destroy_workqueue(dev->cache.wq);
-	wait_for_async_commands(dev);
 	del_timer_sync(&dev->delay_timer);
 
 	return 0;
@@ -847,26 +752,43 @@
 	return MLX5_MAX_UMR_SHIFT;
 }
 
-static int mr_umem_get(struct ib_pd *pd, u64 start, u64 length,
-		       int access_flags, struct ib_umem **umem,
-		       int *npages, int *page_shift, int *ncont,
-		       int *order)
+static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
+		       u64 start, u64 length, int access_flags,
+		       struct ib_umem **umem, int *npages, int *page_shift,
+		       int *ncont, int *order)
 {
-	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct ib_umem *u;
-	int err;
 
 	*umem = NULL;
 
-	u = ib_umem_get(pd->uobject->context, start, length, access_flags, 0);
-	err = PTR_ERR_OR_ZERO(u);
-	if (err) {
-		mlx5_ib_dbg(dev, "umem get failed (%d)\n", err);
-		return err;
+	if (access_flags & IB_ACCESS_ON_DEMAND) {
+		struct ib_umem_odp *odp;
+
+		odp = ib_umem_odp_get(udata, start, length, access_flags);
+		if (IS_ERR(odp)) {
+			mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
+				    PTR_ERR(odp));
+			return PTR_ERR(odp);
+		}
+
+		u = &odp->umem;
+
+		*page_shift = odp->page_shift;
+		*ncont = ib_umem_odp_num_pages(odp);
+		*npages = *ncont << (*page_shift - PAGE_SHIFT);
+		if (order)
+			*order = ilog2(roundup_pow_of_two(*ncont));
+	} else {
+		u = ib_umem_get(udata, start, length, access_flags, 0);
+		if (IS_ERR(u)) {
+			mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u));
+			return PTR_ERR(u);
+		}
+
+		mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages,
+				   page_shift, ncont, order);
 	}
 
-	mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages,
-			   page_shift, ncont, order);
 	if (!*npages) {
 		mlx5_ib_warn(dev, "avoid zero region\n");
 		ib_umem_release(u);
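In the ODP branch of mr_umem_get() above, ncont counts device-visible pages of size 1 << page_shift, npages converts that to system pages, and order selects the MR cache bucket. A standalone sketch with illustrative numbers (the 4 KiB PAGE_SHIFT, the 64 KiB ODP page size and ncont are assumptions, not values read from the driver):

#include <stdio.h>

#define PAGE_SHIFT 12			/* assume 4 KiB system pages */

static unsigned long roundup_pow_of_two(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

static int ilog2(unsigned long n)
{
	int log = -1;

	while (n) {
		n >>= 1;
		log++;
	}
	return log;
}

int main(void)
{
	int page_shift = 16;		/* e.g. the ODP umem uses 64 KiB pages */
	int ncont = 10;			/* device-visible pages in the region  */

	/* npages counts the 4 KiB system pages backing those device pages */
	int npages = ncont << (page_shift - PAGE_SHIFT);
	/* order is the cache bucket: next power of two of ncont */
	int order = ilog2(roundup_pow_of_two(ncont));

	printf("ncont=%d -> npages=%d, order=%d\n", ncont, npages, order);
	return 0;	/* prints: ncont=10 -> npages=160, order=4 */
}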
@@ -1211,7 +1133,7 @@
 	return ERR_PTR(err);
 }
 
-static void set_mr_fileds(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
+static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
 			  int npages, u64 length, int access_flags)
 {
 	mr->npages = npages;
@@ -1222,8 +1144,8 @@
 	mr->access_flags = access_flags;
 }
 
-static struct ib_mr *mlx5_ib_get_memic_mr(struct ib_pd *pd, u64 memic_addr,
-					  u64 length, int acc)
+static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
+				       u64 length, int acc, int mode)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
@@ -1245,9 +1167,8 @@
 
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 
-	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MEMIC & 0x3);
-	MLX5_SET(mkc, mkc, access_mode_4_2,
-		 (MLX5_MKC_ACCESS_MODE_MEMIC >> 2) & 0x7);
+	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
+	MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
 	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
 	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
 	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
@@ -1257,8 +1178,7 @@
 	MLX5_SET64(mkc, mkc, len, length);
 	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
-	MLX5_SET64(mkc, mkc, start_addr,
-		   memic_addr - pci_resource_start(dev->mdev->pdev, 0));
+	MLX5_SET64(mkc, mkc, start_addr, start_addr);
 
 	err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen);
 	if (err)
@@ -1267,7 +1187,7 @@
 	kfree(in);
 
 	mr->umem = NULL;
-	set_mr_fileds(dev, mr, 0, length, acc);
+	set_mr_fields(dev, mr, 0, length, acc);
 
 	return &mr->ibmr;
 
@@ -1280,20 +1200,51 @@
 	return ERR_PTR(err);
 }
 
+int mlx5_ib_advise_mr(struct ib_pd *pd,
+		      enum ib_uverbs_advise_mr_advice advice,
+		      u32 flags,
+		      struct ib_sge *sg_list,
+		      u32 num_sge,
+		      struct uverbs_attr_bundle *attrs)
+{
+	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
+	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE)
+		return -EOPNOTSUPP;
+
+	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
+					 sg_list, num_sge);
+}
+
 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
 				struct ib_dm_mr_attr *attr,
 				struct uverbs_attr_bundle *attrs)
 {
 	struct mlx5_ib_dm *mdm = to_mdm(dm);
-	u64 memic_addr;
+	struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
+	u64 start_addr = mdm->dev_addr + attr->offset;
+	int mode;
 
-	if (attr->access_flags & ~MLX5_IB_DM_ALLOWED_ACCESS)
+	switch (mdm->type) {
+	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
+		if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
+			return ERR_PTR(-EINVAL);
+
+		mode = MLX5_MKC_ACCESS_MODE_MEMIC;
+		start_addr -= pci_resource_start(dev->pdev, 0);
+		break;
+	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
+		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
+			return ERR_PTR(-EINVAL);
+
+		mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
+		break;
+	default:
 		return ERR_PTR(-EINVAL);
+	}
 
-	memic_addr = mdm->dev_addr + attr->offset;
-
-	return mlx5_ib_get_memic_mr(pd, memic_addr, attr->length,
-				    attr->access_flags);
+	return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
+				 attr->access_flags, mode);
 }
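mlx5_ib_get_dm_mr() above (and the UMR mkey helpers later in this file) split the selected access mode across the access_mode_1_0 and access_mode_4_2 fields of the mkey context. A tiny standalone sketch of that bit packing; the mode value is an arbitrary example, not one of the MLX5_MKC_ACCESS_MODE_* constants:

#include <stdio.h>

int main(void)
{
	unsigned int mode = 0x5;	/* illustrative 5-bit access mode */

	unsigned int access_mode_1_0 = mode & 0x3;		/* low 2 bits  */
	unsigned int access_mode_4_2 = (mode >> 2) & 0x7;	/* high 3 bits */

	/* Recombining the two fields must give back the original mode. */
	unsigned int recombined = (access_mode_4_2 << 2) | access_mode_1_0;

	printf("mode=%#x -> 1_0=%#x, 4_2=%#x, recombined=%#x\n",
	       mode, access_mode_1_0, access_mode_4_2, recombined);
	return 0;
}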
 
 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
@@ -1302,7 +1253,7 @@
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct mlx5_ib_mr *mr = NULL;
-	bool populate_mtts = false;
+	bool use_umr;
 	struct ib_umem *umem;
 	int page_shift;
 	int npages;
@@ -1316,48 +1267,46 @@
 	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
 		    start, virt_addr, length, access_flags);
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	if (!start && length == U64_MAX) {
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && !start &&
+	    length == U64_MAX) {
 		if (!(access_flags & IB_ACCESS_ON_DEMAND) ||
 		    !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
 			return ERR_PTR(-EINVAL);
 
-		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
+		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), udata, access_flags);
 		if (IS_ERR(mr))
 			return ERR_CAST(mr);
 		return &mr->ibmr;
 	}
-#endif
 
-	err = mr_umem_get(pd, start, length, access_flags, &umem, &npages,
-			   &page_shift, &ncont, &order);
+	err = mr_umem_get(dev, udata, start, length, access_flags, &umem,
+			  &npages, &page_shift, &ncont, &order);
 
 	if (err < 0)
 		return ERR_PTR(err);
 
-	if (use_umr(dev, order)) {
+	use_umr = mlx5_ib_can_use_umr(dev, true);
+
+	if (order <= mr_cache_max_order(dev) && use_umr) {
 		mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont,
 					 page_shift, order, access_flags);
 		if (PTR_ERR(mr) == -EAGAIN) {
 			mlx5_ib_dbg(dev, "cache empty for order %d\n", order);
 			mr = NULL;
 		}
-		populate_mtts = false;
 	} else if (!MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) {
 		if (access_flags & IB_ACCESS_ON_DEMAND) {
 			err = -EINVAL;
 			pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB\n");
 			goto error;
 		}
-		populate_mtts = true;
+		use_umr = false;
 	}
 
 	if (!mr) {
-		if (!umr_can_modify_entity_size(dev))
-			populate_mtts = true;
 		mutex_lock(&dev->slow_path_mutex);
 		mr = reg_create(NULL, pd, virt_addr, length, umem, ncont,
-				page_shift, access_flags, populate_mtts);
+				page_shift, access_flags, !use_umr);
 		mutex_unlock(&dev->slow_path_mutex);
 	}
 
@@ -1369,13 +1318,9 @@
 	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
 
 	mr->umem = umem;
-	set_mr_fileds(dev, mr, npages, length, access_flags);
+	set_mr_fields(dev, mr, npages, length, access_flags);
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	update_odp_mr(mr);
-#endif
-
-	if (!populate_mtts) {
+	if (use_umr) {
 		int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE;
 
 		if (access_flags & IB_ACCESS_ON_DEMAND)
@@ -1390,9 +1335,13 @@
 		}
 	}
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	mr->live = 1;
-#endif
+	if (is_odp_mr(mr)) {
+		to_ib_umem_odp(mr->umem)->private = mr;
+		atomic_set(&mr->num_pending_prefetch, 0);
+	}
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+		smp_store_release(&mr->live, 1);
+
 	return &mr->ibmr;
 error:
 	ib_umem_release(umem);
@@ -1408,9 +1357,11 @@
 		return 0;
 
 	umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
-			      MLX5_IB_SEND_UMR_FAIL_IF_FREE;
+			      MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
 	umrwr.wr.opcode = MLX5_IB_WR_UMR;
+	umrwr.pd = dev->umrc.pd;
 	umrwr.mkey = mr->mmkey.key;
+	umrwr.ignore_free_state = 1;
 
 	return mlx5_ib_post_send_wait(dev, &umrwr);
 }
@@ -1464,6 +1415,9 @@
 	if (!mr->umem)
 		return -EINVAL;
 
+	if (is_odp_mr(mr))
+		return -EOPNOTSUPP;
+
 	if (flags & IB_MR_REREG_TRANS) {
 		addr = virt_addr;
 		len = length;
@@ -1480,13 +1434,15 @@
 		flags |= IB_MR_REREG_TRANS;
 		ib_umem_release(mr->umem);
 		mr->umem = NULL;
-		err = mr_umem_get(pd, addr, len, access_flags, &mr->umem,
-				  &npages, &page_shift, &ncont, &order);
+		err = mr_umem_get(dev, udata, addr, len, access_flags,
+				  &mr->umem, &npages, &page_shift, &ncont,
+				  &order);
 		if (err)
 			goto err;
 	}
 
-	if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) {
+	if (!mlx5_ib_can_use_umr(dev, true) ||
+	    (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len))) {
 		/*
 		 * UMR can't be used - MKey needs to be replaced.
 		 */
@@ -1507,9 +1463,6 @@
 		}
 
 		mr->allocated_from_cache = 0;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-		mr->live = 1;
-#endif
 	} else {
 		/*
 		 * Send a UMR WQE
@@ -1536,18 +1489,14 @@
 			goto err;
 	}
 
-	set_mr_fileds(dev, mr, npages, len, access_flags);
+	set_mr_fields(dev, mr, npages, len, access_flags);
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	update_odp_mr(mr);
-#endif
 	return 0;
 
 err:
-	if (mr->umem) {
-		ib_umem_release(mr->umem);
-		mr->umem = NULL;
-	}
+	ib_umem_release(mr->umem);
+	mr->umem = NULL;
+
 	clean_mr(dev, mr);
 	return err;
 }
@@ -1615,10 +1564,10 @@
 		mr->sig = NULL;
 	}
 
-	mlx5_free_priv_descs(mr);
-
-	if (!allocated_from_cache)
+	if (!allocated_from_cache) {
 		destroy_mkey(dev, mr);
+		mlx5_free_priv_descs(mr);
+	}
 }
 
 static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
@@ -1626,16 +1575,27 @@
 	int npages = mr->npages;
 	struct ib_umem *umem = mr->umem;
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	if (umem && umem->odp_data) {
-		/* Prevent new page faults from succeeding */
-		mr->live = 0;
+	if (is_odp_mr(mr)) {
+		struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem);
+
+		/* Prevent new page faults and
+		 * prefetch requests from succeeding
+		 */
+		WRITE_ONCE(mr->live, 0);
+
 		/* Wait for all running page-fault handlers to finish. */
 		synchronize_srcu(&dev->mr_srcu);
+
+		/* dequeue pending prefetch requests for the mr */
+		if (atomic_read(&mr->num_pending_prefetch))
+			flush_workqueue(system_unbound_wq);
+		WARN_ON(atomic_read(&mr->num_pending_prefetch));
+
 		/* Destroy all page mappings */
-		if (umem->odp_data->page_list)
-			mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
-						 ib_umem_end(umem));
+		if (!umem_odp->is_implicit_odp)
+			mlx5_ib_invalidate_range(umem_odp,
+						 ib_umem_start(umem_odp),
+						 ib_umem_end(umem_odp));
 		else
 			mlx5_ib_free_implicit_mr(mr);
 		/*
@@ -1643,13 +1603,13 @@
 		 * so that there will not be any invalidations in
 		 * flight, looking at the *mr struct.
 		 */
-		ib_umem_release(umem);
+		ib_umem_odp_release(umem_odp);
 		atomic_sub(npages, &dev->mdev->priv.reg_pages);
 
 		/* Avoid double-freeing the umem. */
 		umem = NULL;
 	}
-#endif
+
 	clean_mr(dev, mr);
 
 	/*
@@ -1657,29 +1617,215 @@
 	 * remove the DMA mapping.
 	 */
 	mlx5_mr_cache_free(dev, mr);
-	if (umem) {
-		ib_umem_release(umem);
+	ib_umem_release(umem);
+	if (umem)
 		atomic_sub(npages, &dev->mdev->priv.reg_pages);
-	}
+
 	if (!mr->allocated_from_cache)
 		kfree(mr);
 }
 
-int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
-	dereg_mr(to_mdev(ibmr->device), to_mmr(ibmr));
+	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
+
+	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
+		dereg_mr(to_mdev(mmr->mtt_mr->ibmr.device), mmr->mtt_mr);
+		dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr);
+	}
+
+	dereg_mr(to_mdev(ibmr->device), mmr);
+
 	return 0;
 }
 
-struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
-			       enum ib_mr_type mr_type,
-			       u32 max_num_sg)
+static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
+				   int access_mode, int page_shift)
+{
+	void *mkc;
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+
+	MLX5_SET(mkc, mkc, free, 1);
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
+	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
+	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
+	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
+	MLX5_SET(mkc, mkc, umr_en, 1);
+	MLX5_SET(mkc, mkc, log_page_size, page_shift);
+}
+
+static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
+				  int ndescs, int desc_size, int page_shift,
+				  int access_mode, u32 *in, int inlen)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	int err;
+
+	mr->access_mode = access_mode;
+	mr->desc_size = desc_size;
+	mr->max_descs = ndescs;
+
+	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
+	if (err)
+		return err;
+
+	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
+
+	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
+	if (err)
+		goto err_free_descs;
+
+	mr->mmkey.type = MLX5_MKEY_MR;
+	mr->ibmr.lkey = mr->mmkey.key;
+	mr->ibmr.rkey = mr->mmkey.key;
+
+	return 0;
+
+err_free_descs:
+	mlx5_free_priv_descs(mr);
+	return err;
+}
+
+static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
+				u32 max_num_sg, u32 max_num_meta_sg,
+				int desc_size, int access_mode)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
+	int page_shift = 0;
+	struct mlx5_ib_mr *mr;
+	u32 *in;
+	int err;
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	mr->ibmr.pd = pd;
+	mr->ibmr.device = pd->device;
+
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
+		page_shift = PAGE_SHIFT;
+
+	err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
+				     access_mode, in, inlen);
+	if (err)
+		goto err_free_in;
+
+	mr->umem = NULL;
+	kfree(in);
+
+	return mr;
+
+err_free_in:
+	kfree(in);
+err_free:
+	kfree(mr);
+	return ERR_PTR(err);
+}
+
+static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
+				    int ndescs, u32 *in, int inlen)
+{
+	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
+				      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
+				      inlen);
+}
+
+static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
+				    int ndescs, u32 *in, int inlen)
+{
+	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
+				      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
+}
+
+static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
+				      int max_num_sg, int max_num_meta_sg,
+				      u32 *in, int inlen)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	u32 psv_index[2];
+	void *mkc;
+	int err;
+
+	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
+	if (!mr->sig)
+		return -ENOMEM;
+
+	/* create mem & wire PSVs */
+	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
+	if (err)
+		goto err_free_sig;
+
+	mr->sig->psv_memory.psv_idx = psv_index[0];
+	mr->sig->psv_wire.psv_idx = psv_index[1];
+
+	mr->sig->sig_status_checked = true;
+	mr->sig->sig_err_exists = false;
+	/* Next UMR, Arm SIGERR */
+	++mr->sig->sigerr_count;
+	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
+					 sizeof(struct mlx5_klm),
+					 MLX5_MKC_ACCESS_MODE_KLMS);
+	if (IS_ERR(mr->klm_mr)) {
+		err = PTR_ERR(mr->klm_mr);
+		goto err_destroy_psv;
+	}
+	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
+					 sizeof(struct mlx5_mtt),
+					 MLX5_MKC_ACCESS_MODE_MTT);
+	if (IS_ERR(mr->mtt_mr)) {
+		err = PTR_ERR(mr->mtt_mr);
+		goto err_free_klm_mr;
+	}
+
+	/* Set bsf descriptors for mkey */
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	MLX5_SET(mkc, mkc, bsf_en, 1);
+	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
+
+	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
+				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
+	if (err)
+		goto err_free_mtt_mr;
+
+	return 0;
+
+err_free_mtt_mr:
+	dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr);
+	mr->mtt_mr = NULL;
+err_free_klm_mr:
+	dereg_mr(to_mdev(mr->klm_mr->ibmr.device), mr->klm_mr);
+	mr->klm_mr = NULL;
+err_destroy_psv:
+	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
+		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
+			     mr->sig->psv_memory.psv_idx);
+	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
+		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
+			     mr->sig->psv_wire.psv_idx);
+err_free_sig:
+	kfree(mr->sig);
+
+	return err;
+}
+
+static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
+					enum ib_mr_type mr_type, u32 max_num_sg,
+					u32 max_num_meta_sg)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 	int ndescs = ALIGN(max_num_sg, 4);
 	struct mlx5_ib_mr *mr;
-	void *mkc;
 	u32 *in;
 	int err;
 
@@ -1693,93 +1839,32 @@
 		goto err_free;
 	}
 
-	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
-	MLX5_SET(mkc, mkc, free, 1);
-	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
-	MLX5_SET(mkc, mkc, qpn, 0xffffff);
-	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
+	mr->ibmr.device = pd->device;
+	mr->umem = NULL;
 
-	if (mr_type == IB_MR_TYPE_MEM_REG) {
-		mr->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
-		MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
-		err = mlx5_alloc_priv_descs(pd->device, mr,
-					    ndescs, sizeof(struct mlx5_mtt));
-		if (err)
-			goto err_free_in;
-
-		mr->desc_size = sizeof(struct mlx5_mtt);
-		mr->max_descs = ndescs;
-	} else if (mr_type == IB_MR_TYPE_SG_GAPS) {
-		mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS;
-
-		err = mlx5_alloc_priv_descs(pd->device, mr,
-					    ndescs, sizeof(struct mlx5_klm));
-		if (err)
-			goto err_free_in;
-		mr->desc_size = sizeof(struct mlx5_klm);
-		mr->max_descs = ndescs;
-	} else if (mr_type == IB_MR_TYPE_SIGNATURE) {
-		u32 psv_index[2];
-
-		MLX5_SET(mkc, mkc, bsf_en, 1);
-		MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
-		mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
-		if (!mr->sig) {
-			err = -ENOMEM;
-			goto err_free_in;
-		}
-
-		/* create mem & wire PSVs */
-		err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn,
-					   2, psv_index);
-		if (err)
-			goto err_free_sig;
-
-		mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS;
-		mr->sig->psv_memory.psv_idx = psv_index[0];
-		mr->sig->psv_wire.psv_idx = psv_index[1];
-
-		mr->sig->sig_status_checked = true;
-		mr->sig->sig_err_exists = false;
-		/* Next UMR, Arm SIGERR */
-		++mr->sig->sigerr_count;
-	} else {
+	switch (mr_type) {
+	case IB_MR_TYPE_MEM_REG:
+		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
+		break;
+	case IB_MR_TYPE_SG_GAPS:
+		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
+		break;
+	case IB_MR_TYPE_INTEGRITY:
+		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
+						 max_num_meta_sg, in, inlen);
+		break;
+	default:
 		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
 		err = -EINVAL;
-		goto err_free_in;
 	}
 
-	MLX5_SET(mkc, mkc, access_mode_1_0, mr->access_mode & 0x3);
-	MLX5_SET(mkc, mkc, access_mode_4_2, (mr->access_mode >> 2) & 0x7);
-	MLX5_SET(mkc, mkc, umr_en, 1);
-
-	mr->ibmr.device = pd->device;
-	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
 	if (err)
-		goto err_destroy_psv;
+		goto err_free_in;
 
-	mr->mmkey.type = MLX5_MKEY_MR;
-	mr->ibmr.lkey = mr->mmkey.key;
-	mr->ibmr.rkey = mr->mmkey.key;
-	mr->umem = NULL;
 	kfree(in);
 
 	return &mr->ibmr;
 
-err_destroy_psv:
-	if (mr->sig) {
-		if (mlx5_core_destroy_psv(dev->mdev,
-					  mr->sig->psv_memory.psv_idx))
-			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
-				     mr->sig->psv_memory.psv_idx);
-		if (mlx5_core_destroy_psv(dev->mdev,
-					  mr->sig->psv_wire.psv_idx))
-			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
-				     mr->sig->psv_wire.psv_idx);
-	}
-	mlx5_free_priv_descs(mr);
-err_free_sig:
-	kfree(mr->sig);
 err_free_in:
 	kfree(in);
 err_free:
@@ -1787,6 +1872,19 @@
 	return ERR_PTR(err);
 }
 
+struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+			       u32 max_num_sg, struct ib_udata *udata)
+{
+	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
+}
+
+struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
+					 u32 max_num_sg, u32 max_num_meta_sg)
+{
+	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
+				  max_num_meta_sg);
+}
+
 struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
 			       struct ib_udata *udata)
 {
@@ -1864,14 +1962,25 @@
 
 int mlx5_ib_dealloc_mw(struct ib_mw *mw)
 {
+	struct mlx5_ib_dev *dev = to_mdev(mw->device);
 	struct mlx5_ib_mw *mmw = to_mmw(mw);
 	int err;
 
-	err =  mlx5_core_destroy_mkey((to_mdev(mw->device))->mdev,
-				      &mmw->mmkey);
-	if (!err)
-		kfree(mmw);
-	return err;
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+		xa_erase_irq(&dev->mdev->priv.mkey_table,
+			     mlx5_base_mkey(mmw->mmkey.key));
+		/*
+		 * pagefault_single_data_segment() may be accessing mmw under
+		 * SRCU if the user bound an ODP MR to this MW.
+		 */
+		synchronize_srcu(&dev->mr_srcu);
+	}
+
+	err = mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
+	if (err)
+		return err;
+	kfree(mmw);
+	return 0;
 }
 
 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
@@ -1916,16 +2025,53 @@
 }
 
 static int
+mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
+			int data_sg_nents, unsigned int *data_sg_offset,
+			struct scatterlist *meta_sg, int meta_sg_nents,
+			unsigned int *meta_sg_offset)
+{
+	struct mlx5_ib_mr *mr = to_mmr(ibmr);
+	unsigned int sg_offset = 0;
+	int n = 0;
+
+	mr->meta_length = 0;
+	if (data_sg_nents == 1) {
+		n++;
+		mr->ndescs = 1;
+		if (data_sg_offset)
+			sg_offset = *data_sg_offset;
+		mr->data_length = sg_dma_len(data_sg) - sg_offset;
+		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
+		if (meta_sg_nents == 1) {
+			n++;
+			mr->meta_ndescs = 1;
+			if (meta_sg_offset)
+				sg_offset = *meta_sg_offset;
+			else
+				sg_offset = 0;
+			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
+			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
+		}
+		ibmr->length = mr->data_length + mr->meta_length;
+	}
+
+	return n;
+}
+
+static int
 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
 		   struct scatterlist *sgl,
 		   unsigned short sg_nents,
-		   unsigned int *sg_offset_p)
+		   unsigned int *sg_offset_p,
+		   struct scatterlist *meta_sgl,
+		   unsigned short meta_sg_nents,
+		   unsigned int *meta_sg_offset_p)
 {
 	struct scatterlist *sg = sgl;
 	struct mlx5_klm *klms = mr->descs;
 	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
 	u32 lkey = mr->ibmr.pd->local_dma_lkey;
-	int i;
+	int i, j = 0;
 
 	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
 	mr->ibmr.length = 0;
@@ -1940,12 +2086,36 @@
 
 		sg_offset = 0;
 	}
-	mr->ndescs = i;
 
 	if (sg_offset_p)
 		*sg_offset_p = sg_offset;
 
-	return i;
+	mr->ndescs = i;
+	mr->data_length = mr->ibmr.length;
+
+	if (meta_sg_nents) {
+		sg = meta_sgl;
+		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
+		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
+			if (unlikely(i + j >= mr->max_descs))
+				break;
+			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
+						     sg_offset);
+			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
+							 sg_offset);
+			klms[i + j].key = cpu_to_be32(lkey);
+			mr->ibmr.length += sg_dma_len(sg) - sg_offset;
+
+			sg_offset = 0;
+		}
+		if (meta_sg_offset_p)
+			*meta_sg_offset_p = sg_offset;
+
+		mr->meta_ndescs = j;
+		mr->meta_length = mr->ibmr.length - mr->data_length;
+	}
+
+	return i + j;
 }
 
 static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
@@ -1962,6 +2132,181 @@
 	return 0;
 }
 
+static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
+{
+	struct mlx5_ib_mr *mr = to_mmr(ibmr);
+	__be64 *descs;
+
+	if (unlikely(mr->ndescs + mr->meta_ndescs == mr->max_descs))
+		return -ENOMEM;
+
+	descs = mr->descs;
+	descs[mr->ndescs + mr->meta_ndescs++] =
+		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
+
+	return 0;
+}
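mlx5_set_page_pi() above appends metadata page addresses after the data pages in the same descriptor array, indexed by ndescs + meta_ndescs. A small standalone sketch of that index layout (the array size and addresses are made up):

#include <stdio.h>

#define MAX_DESCS 8

int main(void)
{
	unsigned long long descs[MAX_DESCS] = { 0 };
	int ndescs = 0, meta_ndescs = 0;

	/* data pages land at indices [0, ndescs) */
	descs[ndescs++] = 0x1000;
	descs[ndescs++] = 0x2000;

	/* metadata pages are appended at [ndescs, ndescs + meta_ndescs) */
	descs[ndescs + meta_ndescs++] = 0x9000;
	descs[ndescs + meta_ndescs++] = 0xa000;

	for (int i = 0; i < ndescs + meta_ndescs; i++)
		printf("descs[%d] = %#llx\n", i, descs[i]);
	return 0;
}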
+
+static int
+mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
+			 int data_sg_nents, unsigned int *data_sg_offset,
+			 struct scatterlist *meta_sg, int meta_sg_nents,
+			 unsigned int *meta_sg_offset)
+{
+	struct mlx5_ib_mr *mr = to_mmr(ibmr);
+	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
+	int n;
+
+	pi_mr->ndescs = 0;
+	pi_mr->meta_ndescs = 0;
+	pi_mr->meta_length = 0;
+
+	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
+				   pi_mr->desc_size * pi_mr->max_descs,
+				   DMA_TO_DEVICE);
+
+	pi_mr->ibmr.page_size = ibmr->page_size;
+	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
+			   mlx5_set_page);
+	if (n != data_sg_nents)
+		return n;
+
+	pi_mr->data_iova = pi_mr->ibmr.iova;
+	pi_mr->data_length = pi_mr->ibmr.length;
+	pi_mr->ibmr.length = pi_mr->data_length;
+	ibmr->length = pi_mr->data_length;
+
+	if (meta_sg_nents) {
+		u64 page_mask = ~((u64)ibmr->page_size - 1);
+		u64 iova = pi_mr->data_iova;
+
+		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
+				    meta_sg_offset, mlx5_set_page_pi);
+
+		pi_mr->meta_length = pi_mr->ibmr.length;
+		/*
+		 * PI address for the HW is the offset of the metadata address
+		 * relative to the first data page address.
+		 * It equals to first data page address + size of data pages +
+		 * metadata offset at the first metadata page
+		 */
+		pi_mr->pi_iova = (iova & page_mask) +
+				 pi_mr->ndescs * ibmr->page_size +
+				 (pi_mr->ibmr.iova & ~page_mask);
+		/*
+		 * In order to use one MTT MR for both data and metadata, we
+		 * also register the gaps between the end of the data and the
+		 * start of the metadata (the sig MR will verify that the HW
+		 * accesses the right addresses). This mapping is safe because
+		 * an internal mkey is used for the registration.
+		 */
+		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
+		pi_mr->ibmr.iova = iova;
+		ibmr->length += pi_mr->meta_length;
+	}
+
+	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
+				      pi_mr->desc_size * pi_mr->max_descs,
+				      DMA_TO_DEVICE);
+
+	return n;
+}
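The pi_iova computation in mlx5_ib_map_mtt_mr_sg_pi() above can be checked with made-up numbers. A standalone sketch (the page size, addresses and descriptor count are illustrative assumptions):

#include <stdio.h>

int main(void)
{
	unsigned long long page_size = 4096;
	unsigned long long page_mask = ~(page_size - 1);

	unsigned long long data_iova = 0x10001200ULL;	/* data buffer start */
	unsigned long long ndescs = 3;			/* mapped data pages  */
	unsigned long long meta_iova = 0x20000340ULL;	/* metadata buffer    */

	/* first data page address + size of the data pages
	 * + metadata offset within its first page */
	unsigned long long pi_iova = (data_iova & page_mask) +
				     ndescs * page_size +
				     (meta_iova & ~page_mask);

	printf("pi_iova = %#llx\n", pi_iova);	/* prints 0x10004340 */
	return 0;
}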
+
+static int
+mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
+			 int data_sg_nents, unsigned int *data_sg_offset,
+			 struct scatterlist *meta_sg, int meta_sg_nents,
+			 unsigned int *meta_sg_offset)
+{
+	struct mlx5_ib_mr *mr = to_mmr(ibmr);
+	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
+	int n;
+
+	pi_mr->ndescs = 0;
+	pi_mr->meta_ndescs = 0;
+	pi_mr->meta_length = 0;
+
+	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
+				   pi_mr->desc_size * pi_mr->max_descs,
+				   DMA_TO_DEVICE);
+
+	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
+			       meta_sg, meta_sg_nents, meta_sg_offset);
+
+	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
+				      pi_mr->desc_size * pi_mr->max_descs,
+				      DMA_TO_DEVICE);
+
+	/* This is zero-based memory region */
+	pi_mr->data_iova = 0;
+	pi_mr->ibmr.iova = 0;
+	pi_mr->pi_iova = pi_mr->data_length;
+	ibmr->length = pi_mr->ibmr.length;
+
+	return n;
+}
+
+int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
+			 int data_sg_nents, unsigned int *data_sg_offset,
+			 struct scatterlist *meta_sg, int meta_sg_nents,
+			 unsigned int *meta_sg_offset)
+{
+	struct mlx5_ib_mr *mr = to_mmr(ibmr);
+	struct mlx5_ib_mr *pi_mr = NULL;
+	int n;
+
+	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
+
+	mr->ndescs = 0;
+	mr->data_length = 0;
+	mr->data_iova = 0;
+	mr->meta_ndescs = 0;
+	mr->pi_iova = 0;
+	/*
+	 * As a performance optimization, when possible, avoid performing a
+	 * UMR operation to register the data/metadata buffers.
+	 * First try to map the sg lists to PA descriptors with local_dma_lkey.
+	 * Fall back to UMR only in case of a failure.
+	 */
+	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
+				    data_sg_offset, meta_sg, meta_sg_nents,
+				    meta_sg_offset);
+	if (n == data_sg_nents + meta_sg_nents)
+		goto out;
+	/*
+	 * As a performance optimization, when possible, avoid mapping the sg
+	 * lists to KLM descriptors. First try to map the sg lists to MTT
+	 * descriptors and fall back to KLM only in case of a failure.
+	 * The HW works more efficiently with MTT descriptors
+	 * (especially under high load).
+	 * Use KLM (indirect access) only when it is mandatory.
+	 */
+	pi_mr = mr->mtt_mr;
+	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
+				     data_sg_offset, meta_sg, meta_sg_nents,
+				     meta_sg_offset);
+	if (n == data_sg_nents + meta_sg_nents)
+		goto out;
+
+	pi_mr = mr->klm_mr;
+	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
+				     data_sg_offset, meta_sg, meta_sg_nents,
+				     meta_sg_offset);
+	if (unlikely(n != data_sg_nents + meta_sg_nents))
+		return -ENOMEM;
+
+out:
+	/* This is zero-based memory region */
+	ibmr->iova = 0;
+	mr->pi_mr = pi_mr;
+	if (pi_mr)
+		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
+	else
+		ibmr->sig_attrs->meta_length = mr->meta_length;
+
+	return 0;
+}
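mlx5_ib_map_mr_sg_pi() above tries three mappings in order of increasing cost: PA (no UMR), then MTT, then KLM, moving on only when a stage cannot cover every sg entry. A hedged userspace sketch of that decision flow; the stub mappers are hypothetical and only mimic the "entries mapped" return convention:

#include <stdio.h>

/* Stub mappers: each returns how many sg entries it managed to map. */
static int map_pa(int data, int meta)  { return data == 1 ? 1 + (meta ? 1 : 0) : 0; }
static int map_mtt(int data, int meta) { return data + meta - 1; /* pretend one entry is unmappable */ }
static int map_klm(int data, int meta) { return data + meta;     /* indirect access always fits */ }

int main(void)
{
	int data_sg_nents = 4, meta_sg_nents = 1;
	int want = data_sg_nents + meta_sg_nents;
	int n;

	n = map_pa(data_sg_nents, meta_sg_nents);		/* cheapest: no UMR at all */
	if (n != want) {
		n = map_mtt(data_sg_nents, meta_sg_nents);	/* HW-friendly direct descriptors */
		if (n != want)
			n = map_klm(data_sg_nents, meta_sg_nents); /* last resort */
	}

	printf("mapped %d of %d entries\n", n, want);
	return n == want ? 0 : 1;
}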
+
 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 		      unsigned int *sg_offset)
 {
@@ -1975,7 +2320,8 @@
 				   DMA_TO_DEVICE);
 
 	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
-		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset);
+		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
+				       NULL);
 	else
 		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
 				mlx5_set_page);
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 9e1cac8..3f9478d 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -37,6 +37,46 @@
 #include "mlx5_ib.h"
 #include "cmd.h"
 
+#include <linux/mlx5/eq.h>
+
+/* Contains the details of a pagefault. */
+struct mlx5_pagefault {
+	u32			bytes_committed;
+	u32			token;
+	u8			event_subtype;
+	u8			type;
+	union {
+		/* Initiator or send message responder pagefault details. */
+		struct {
+			/* Received packet size, only valid for responders. */
+			u32	packet_size;
+			/*
+			 * Number of the resource holding the WQE; depends on type.
+			 */
+			u32	wq_num;
+			/*
+			 * WQE index. Refers to either the send queue or
+			 * receive queue, according to event_subtype.
+			 */
+			u16	wqe_index;
+		} wqe;
+		/* RDMA responder pagefault details */
+		struct {
+			u32	r_key;
+			/*
+			 * Received packet size; the minimal size whose page
+			 * fault resolution is required for forward progress.
+			 */
+			u32	packet_size;
+			u32	rdma_op_len;
+			u64	rdma_va;
+		} rdma;
+	};
+
+	struct mlx5_ib_pf_eq	*eq;
+	struct work_struct	work;
+};
+
 #define MAX_PREFETCH_LEN (4*1024*1024U)
 
 /* Timeout in ms to wait for an active mmu notifier to complete when handling
@@ -61,13 +101,21 @@
 	return mr && mr->parent == parent && !odp->dying;
 }
 
+static struct ib_ucontext_per_mm *mr_to_per_mm(struct mlx5_ib_mr *mr)
+{
+	if (WARN_ON(!mr || !is_odp_mr(mr)))
+		return NULL;
+
+	return to_ib_umem_odp(mr->umem)->per_mm;
+}
+
 static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp)
 {
 	struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent;
-	struct ib_ucontext *ctx = odp->umem->context;
+	struct ib_ucontext_per_mm *per_mm = odp->per_mm;
 	struct rb_node *rb;
 
-	down_read(&ctx->umem_rwsem);
+	down_read(&per_mm->umem_rwsem);
 	while (1) {
 		rb = rb_next(&odp->interval_tree.rb);
 		if (!rb)
@@ -79,19 +127,19 @@
 not_found:
 	odp = NULL;
 end:
-	up_read(&ctx->umem_rwsem);
+	up_read(&per_mm->umem_rwsem);
 	return odp;
 }
 
-static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx,
-				      u64 start, u64 length,
+static struct ib_umem_odp *odp_lookup(u64 start, u64 length,
 				      struct mlx5_ib_mr *parent)
 {
+	struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(parent);
 	struct ib_umem_odp *odp;
 	struct rb_node *rb;
 
-	down_read(&ctx->umem_rwsem);
-	odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length);
+	down_read(&per_mm->umem_rwsem);
+	odp = rbt_ib_umem_lookup(&per_mm->umem_tree, start, length);
 	if (!odp)
 		goto end;
 
@@ -102,13 +150,13 @@
 		if (!rb)
 			goto not_found;
 		odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
-		if (ib_umem_start(odp->umem) > start + length)
+		if (ib_umem_start(odp) > start + length)
 			goto not_found;
 	}
 not_found:
 	odp = NULL;
 end:
-	up_read(&ctx->umem_rwsem);
+	up_read(&per_mm->umem_rwsem);
 	return odp;
 }
 
@@ -116,7 +164,6 @@
 			   size_t nentries, struct mlx5_ib_mr *mr, int flags)
 {
 	struct ib_pd *pd = mr->ibmr.pd;
-	struct ib_ucontext *ctx = pd->uobject->context;
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct ib_umem_odp *odp;
 	unsigned long va;
@@ -131,13 +178,36 @@
 		return;
 	}
 
-	odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE,
-			     nentries * MLX5_IMR_MTT_SIZE, mr);
+	/*
+	 * The locking here is pretty subtle. Ideally the implicit children
+	 * list would be protected by the umem_mutex; however, that is not
+	 * possible. Instead, this uses a weaker update-then-lock pattern:
+	 *
+	 *  srcu_read_lock()
+	 *    <change children list>
+	 *    mutex_lock(umem_mutex)
+	 *     mlx5_ib_update_xlt()
+	 *    mutex_unlock(umem_mutex)
+	 *    destroy lkey
+	 *
+	 * i.e. any change to the children list must be followed by the locked
+	 * update_xlt before destroying.
+	 *
+	 * The umem_mutex provides the acquire/release semantic needed to make
+	 * the children list visible to a racing thread. While SRCU is not
+	 * technically required, using it gives consistent use of the SRCU
+	 * locking around the children list.
+	 */
+	lockdep_assert_held(&to_ib_umem_odp(mr->umem)->umem_mutex);
+	lockdep_assert_held(&mr->dev->mr_srcu);
+
+	odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE,
+			 nentries * MLX5_IMR_MTT_SIZE, mr);
 
 	for (i = 0; i < nentries; i++, pklm++) {
 		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
 		va = (offset + i) * MLX5_IMR_MTT_SIZE;
-		if (odp && odp->umem->address == va) {
+		if (odp && ib_umem_start(odp) == va) {
 			struct mlx5_ib_mr *mtt = odp->private;
 
 			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
@@ -153,24 +223,31 @@
 static void mr_leaf_free_action(struct work_struct *work)
 {
 	struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
-	int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT;
+	int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
 	struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
+	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+	int srcu_key;
 
 	mr->parent = NULL;
 	synchronize_srcu(&mr->dev->mr_srcu);
 
-	ib_umem_release(odp->umem);
-	if (imr->live)
+	if (smp_load_acquire(&imr->live)) {
+		srcu_key = srcu_read_lock(&mr->dev->mr_srcu);
+		mutex_lock(&odp_imr->umem_mutex);
 		mlx5_ib_update_xlt(imr, idx, 1, 0,
 				   MLX5_IB_UPD_XLT_INDIRECT |
 				   MLX5_IB_UPD_XLT_ATOMIC);
+		mutex_unlock(&odp_imr->umem_mutex);
+		srcu_read_unlock(&mr->dev->mr_srcu, srcu_key);
+	}
+	ib_umem_odp_release(odp);
 	mlx5_mr_cache_free(mr->dev, mr);
 
 	if (atomic_dec_and_test(&imr->num_leaf_free))
 		wake_up(&imr->q_leaf_free);
 }
 
-void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
+void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 			      unsigned long end)
 {
 	struct mlx5_ib_mr *mr;
@@ -180,18 +257,18 @@
 	int in_block = 0;
 	u64 addr;
 
-	if (!umem || !umem->odp_data) {
+	if (!umem_odp) {
 		pr_err("invalidation called on NULL umem or non-ODP umem\n");
 		return;
 	}
 
-	mr = umem->odp_data->private;
+	mr = umem_odp->private;
 
 	if (!mr || !mr->ibmr.pd)
 		return;
 
-	start = max_t(u64, ib_umem_start(umem), start);
-	end = min_t(u64, ib_umem_end(umem), end);
+	start = max_t(u64, ib_umem_start(umem_odp), start);
+	end = min_t(u64, ib_umem_end(umem_odp), end);
 
 	/*
 	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
@@ -199,16 +276,16 @@
 	 * overwrite the same MTTs.  Concurrent invalidations might race us,
 	 * but they will write 0s as well, so no difference in the end result.
 	 */
-
-	for (addr = start; addr < end; addr += BIT(umem->page_shift)) {
-		idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
+	mutex_lock(&umem_odp->umem_mutex);
+	for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
+		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
 		/*
 		 * Strive to write the MTTs in chunks, but avoid overwriting
 		 * non-existing MTTs. The heuristic here can be improved to
 		 * estimate the cost of another UMR vs. the cost of bigger
 		 * UMR.
 		 */
-		if (umem->odp_data->dma_list[idx] &
+		if (umem_odp->dma_list[idx] &
 		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
 			if (!in_block) {
 				blk_start_idx = idx;
@@ -237,14 +314,16 @@
 	 * needed.
 	 */
 
-	ib_umem_odp_unmap_dma_pages(umem, start, end);
+	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
 
-	if (unlikely(!umem->npages && mr->parent &&
-		     !umem->odp_data->dying)) {
-		WRITE_ONCE(umem->odp_data->dying, 1);
+	if (unlikely(!umem_odp->npages && mr->parent &&
+		     !umem_odp->dying)) {
+		WRITE_ONCE(mr->live, 0);
+		umem_odp->dying = 1;
 		atomic_inc(&mr->parent->num_leaf_free);
-		schedule_work(&umem->odp_data->work);
+		schedule_work(&umem_odp->work);
 	}
+	mutex_unlock(&umem_odp->umem_mutex);
 }
 
 void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
@@ -253,7 +332,8 @@
 
 	memset(caps, 0, sizeof(*caps));
 
-	if (!MLX5_CAP_GEN(dev->mdev, pg))
+	if (!MLX5_CAP_GEN(dev->mdev, pg) ||
+	    !mlx5_ib_can_use_umr(dev, true))
 		return;
 
 	caps->general_caps = IB_ODP_SUPPORT;
@@ -266,6 +346,9 @@
 	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
 		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
 
+	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive))
+		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
+
 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
 
@@ -281,9 +364,31 @@
 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
 
+	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive))
+		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
+
+	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send))
+		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND;
+
+	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive))
+		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV;
+
+	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write))
+		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE;
+
+	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read))
+		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ;
+
+	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic))
+		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
+
+	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive))
+		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
+
 	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
 	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
-	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
+	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
+	    !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
 		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
 
 	return;
@@ -295,18 +400,24 @@
 {
 	int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
 		     pfault->wqe.wq_num : pfault->token;
-	int ret = mlx5_core_page_fault_resume(dev->mdev,
-					      pfault->token,
-					      wq_num,
-					      pfault->type,
-					      error);
-	if (ret)
-		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n",
-			    wq_num);
+	u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = { };
+	u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)]   = { };
+	int err;
+
+	MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
+	MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type);
+	MLX5_SET(page_fault_resume_in, in, token, pfault->token);
+	MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
+	MLX5_SET(page_fault_resume_in, in, error, !!error);
+
+	err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
+			    wq_num, err);
 }
 
 static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
-					    struct ib_umem *umem,
+					    struct ib_umem_odp *umem_odp,
 					    bool ksm, int access_flags)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
@@ -324,7 +435,7 @@
 	mr->dev = dev;
 	mr->access_flags = access_flags;
 	mr->mmkey.iova = 0;
-	mr->umem = umem;
+	mr->umem = &umem_odp->umem;
 
 	if (ksm) {
 		err = mlx5_ib_update_xlt(mr, 0,
@@ -349,8 +460,6 @@
 	mr->ibmr.lkey = mr->mmkey.key;
 	mr->ibmr.rkey = mr->mmkey.key;
 
-	mr->live = 1;
-
 	mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
 		    mr->mmkey.key, dev->mdev, mr);
 
@@ -366,16 +475,15 @@
 static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr,
 						u64 io_virt, size_t bcnt)
 {
-	struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context;
 	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device);
 	struct ib_umem_odp *odp, *result = NULL;
+	struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
 	u64 addr = io_virt & MLX5_IMR_MTT_MASK;
 	int nentries = 0, start_idx = 0, ret;
 	struct mlx5_ib_mr *mtt;
-	struct ib_umem *umem;
 
-	mutex_lock(&mr->umem->odp_data->umem_mutex);
-	odp = odp_lookup(ctx, addr, 1, mr);
+	mutex_lock(&odp_mr->umem_mutex);
+	odp = odp_lookup(addr, 1, mr);
 
 	mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n",
 		    io_virt, bcnt, addr, odp);
@@ -385,26 +493,28 @@
 		if (nentries)
 			nentries++;
 	} else {
-		umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE);
-		if (IS_ERR(umem)) {
-			mutex_unlock(&mr->umem->odp_data->umem_mutex);
-			return ERR_CAST(umem);
+		odp = ib_umem_odp_alloc_child(odp_mr, addr, MLX5_IMR_MTT_SIZE);
+		if (IS_ERR(odp)) {
+			mutex_unlock(&odp_mr->umem_mutex);
+			return ERR_CAST(odp);
 		}
 
-		mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags);
+		mtt = implicit_mr_alloc(mr->ibmr.pd, odp, 0,
+					mr->access_flags);
 		if (IS_ERR(mtt)) {
-			mutex_unlock(&mr->umem->odp_data->umem_mutex);
-			ib_umem_release(umem);
+			mutex_unlock(&odp_mr->umem_mutex);
+			ib_umem_odp_release(odp);
 			return ERR_CAST(mtt);
 		}
 
-		odp = umem->odp_data;
 		odp->private = mtt;
-		mtt->umem = umem;
+		mtt->umem = &odp->umem;
 		mtt->mmkey.iova = addr;
 		mtt->parent = mr;
 		INIT_WORK(&odp->work, mr_leaf_free_action);
 
+		smp_store_release(&mtt->live, 1);
+
 		if (!nentries)
 			start_idx = addr >> MLX5_IMR_MTT_SHIFT;
 		nentries++;
@@ -417,7 +527,7 @@
 	addr += MLX5_IMR_MTT_SIZE;
 	if (unlikely(addr < io_virt + bcnt)) {
 		odp = odp_next(odp);
-		if (odp && odp->umem->address != addr)
+		if (odp && ib_umem_start(odp) != addr)
 			odp = NULL;
 		goto next_mr;
 	}
@@ -432,98 +542,112 @@
 		}
 	}
 
-	mutex_unlock(&mr->umem->odp_data->umem_mutex);
+	mutex_unlock(&odp_mr->umem_mutex);
 	return result;
 }
 
 struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
+					     struct ib_udata *udata,
 					     int access_flags)
 {
-	struct ib_ucontext *ctx = pd->ibpd.uobject->context;
 	struct mlx5_ib_mr *imr;
-	struct ib_umem *umem;
+	struct ib_umem_odp *umem_odp;
 
-	umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0);
-	if (IS_ERR(umem))
-		return ERR_CAST(umem);
+	umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags);
+	if (IS_ERR(umem_odp))
+		return ERR_CAST(umem_odp);
 
-	imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
+	imr = implicit_mr_alloc(&pd->ibpd, umem_odp, 1, access_flags);
 	if (IS_ERR(imr)) {
-		ib_umem_release(umem);
+		ib_umem_odp_release(umem_odp);
 		return ERR_CAST(imr);
 	}
 
-	imr->umem = umem;
+	imr->umem = &umem_odp->umem;
 	init_waitqueue_head(&imr->q_leaf_free);
 	atomic_set(&imr->num_leaf_free, 0);
+	atomic_set(&imr->num_pending_prefetch, 0);
+	smp_store_release(&imr->live, 1);
 
 	return imr;
 }
 
-static int mr_leaf_free(struct ib_umem *umem, u64 start,
-			u64 end, void *cookie)
-{
-	struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie;
-
-	if (mr->parent != imr)
-		return 0;
-
-	ib_umem_odp_unmap_dma_pages(umem,
-				    ib_umem_start(umem),
-				    ib_umem_end(umem));
-
-	if (umem->odp_data->dying)
-		return 0;
-
-	WRITE_ONCE(umem->odp_data->dying, 1);
-	atomic_inc(&imr->num_leaf_free);
-	schedule_work(&umem->odp_data->work);
-
-	return 0;
-}
-
 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 {
-	struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context;
+	struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr);
+	struct rb_node *node;
 
-	down_read(&ctx->umem_rwsem);
-	rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX,
-				      mr_leaf_free, true, imr);
-	up_read(&ctx->umem_rwsem);
+	down_read(&per_mm->umem_rwsem);
+	for (node = rb_first_cached(&per_mm->umem_tree); node;
+	     node = rb_next(node)) {
+		struct ib_umem_odp *umem_odp =
+			rb_entry(node, struct ib_umem_odp, interval_tree.rb);
+		struct mlx5_ib_mr *mr = umem_odp->private;
+
+		if (mr->parent != imr)
+			continue;
+
+		mutex_lock(&umem_odp->umem_mutex);
+		ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
+					    ib_umem_end(umem_odp));
+
+		if (umem_odp->dying) {
+			mutex_unlock(&umem_odp->umem_mutex);
+			continue;
+		}
+
+		umem_odp->dying = 1;
+		atomic_inc(&imr->num_leaf_free);
+		schedule_work(&umem_odp->work);
+		mutex_unlock(&umem_odp->umem_mutex);
+	}
+	up_read(&per_mm->umem_rwsem);
 
 	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
 }
 
+#define MLX5_PF_FLAGS_PREFETCH  BIT(0)
+#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
 static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
-			u64 io_virt, size_t bcnt, u32 *bytes_mapped)
+			u64 io_virt, size_t bcnt, u32 *bytes_mapped,
+			u32 flags)
 {
-	u64 access_mask = ODP_READ_ALLOWED_BIT;
-	int npages = 0, page_shift, np;
+	int npages = 0, current_seq, page_shift, ret, np;
+	struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
+	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
+	bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
+	u64 access_mask;
 	u64 start_idx, page_mask;
 	struct ib_umem_odp *odp;
-	int current_seq;
 	size_t size;
-	int ret;
 
-	if (!mr->umem->odp_data->page_list) {
+	if (odp_mr->is_implicit_odp) {
 		odp = implicit_mr_get_data(mr, io_virt, bcnt);
 
 		if (IS_ERR(odp))
 			return PTR_ERR(odp);
 		mr = odp->private;
-
 	} else {
-		odp = mr->umem->odp_data;
+		odp = odp_mr;
 	}
 
 next_mr:
-	size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt);
+	size = min_t(size_t, bcnt, ib_umem_end(odp) - io_virt);
 
-	page_shift = mr->umem->page_shift;
+	page_shift = odp->page_shift;
 	page_mask = ~(BIT(page_shift) - 1);
 	start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
+	access_mask = ODP_READ_ALLOWED_BIT;
 
-	if (mr->umem->writable)
+	if (prefetch && !downgrade && !odp->umem.writable) {
+		/* prefetch with write-access must
+		 * be supported by the MR
+		 */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (odp->umem.writable && !downgrade)
 		access_mask |= ODP_WRITE_ALLOWED_BIT;
 
 	current_seq = READ_ONCE(odp->notifiers_seq);
@@ -533,8 +657,8 @@
 	 */
 	smp_rmb();
 
-	ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size,
-					access_mask, current_seq);
+	ret = ib_umem_odp_map_dma_pages(odp, io_virt, size, access_mask,
+					current_seq);
 
 	if (ret < 0)
 		goto out;
@@ -542,7 +666,7 @@
 	np = ret;
 
 	mutex_lock(&odp->umem_mutex);
-	if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
+	if (!ib_umem_mmu_notifier_retry(odp, current_seq)) {
 		/*
 		 * No need to check whether the MTTs really belong to
 		 * this MR, since ib_umem_odp_map_dma_pages already
@@ -575,7 +699,7 @@
 
 		io_virt += size;
 		next = odp_next(odp);
-		if (unlikely(!next || next->umem->address != io_virt)) {
+		if (unlikely(!next || ib_umem_start(next) != io_virt)) {
 			mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
 				    io_virt, next);
 			return -EAGAIN;
@@ -589,19 +713,15 @@
 
 out:
 	if (ret == -EAGAIN) {
-		if (mr->parent || !odp->dying) {
-			unsigned long timeout =
-				msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
+		unsigned long timeout = msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
 
-			if (!wait_for_completion_timeout(
-					&odp->notifier_completion,
-					timeout)) {
-				mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n",
-					     current_seq, odp->notifiers_seq);
-			}
-		} else {
-			/* The MR is being killed, kill the QP as well. */
-			ret = -EFAULT;
+		if (!wait_for_completion_timeout(&odp->notifier_completion,
+						 timeout)) {
+			mlx5_ib_warn(
+				dev,
+				"timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
+				current_seq, odp->notifiers_seq,
+				odp->notifiers_count);
 		}
 	}
 
@@ -616,6 +736,30 @@
 	int depth;
 };
 
+static bool mkey_is_eq(struct mlx5_core_mkey *mmkey, u32 key)
+{
+	if (!mmkey)
+		return false;
+	if (mmkey->type == MLX5_MKEY_MW)
+		return mlx5_base_mkey(mmkey->key) == mlx5_base_mkey(key);
+	return mmkey->key == key;
+}
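mkey_is_eq() above matches memory windows on the base mkey only. A userspace sketch of the comparison; the assumption that the low byte of an mkey is a per-allocation tag while the upper bits form the lookup index is illustrative, mirroring mlx5_base_mkey() rather than quoting it:

#include <stdio.h>
#include <stdbool.h>

static unsigned int base_mkey(unsigned int key)
{
	return key & 0xffffff00u;	/* drop the low-byte tag */
}

int main(void)
{
	unsigned int stored = 0x00012300u;	/* key kept in the table  */
	unsigned int faulted = 0x00012342u;	/* key reported by the HW */

	bool mw_match = base_mkey(stored) == base_mkey(faulted);	/* true  */
	bool mr_match = stored == faulted;				/* false */

	printf("MW match: %d, MR exact match: %d\n", mw_match, mr_match);
	return 0;
}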
+
+static int get_indirect_num_descs(struct mlx5_core_mkey *mmkey)
+{
+	struct mlx5_ib_mw *mw;
+	struct mlx5_ib_devx_mr *devx_mr;
+
+	if (mmkey->type == MLX5_MKEY_MW) {
+		mw = container_of(mmkey, struct mlx5_ib_mw, mmkey);
+		return mw->ndescs;
+	}
+
+	devx_mr = container_of(mmkey, struct mlx5_ib_devx_mr,
+			       mmkey);
+	return devx_mr->ndescs;
+}
+
 /*
  * Handle a single data segment in a page-fault WQE or RDMA region.
  *
@@ -628,18 +772,20 @@
  *  abort the page fault handling.
  */
 static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
-					 u32 key, u64 io_virt, size_t bcnt,
+					 struct ib_pd *pd, u32 key,
+					 u64 io_virt, size_t bcnt,
 					 u32 *bytes_committed,
-					 u32 *bytes_mapped)
+					 u32 *bytes_mapped, u32 flags)
 {
 	int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
+	bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
 	struct pf_frame *head = NULL, *frame;
 	struct mlx5_core_mkey *mmkey;
-	struct mlx5_ib_mw *mw;
 	struct mlx5_ib_mr *mr;
 	struct mlx5_klm *pklm;
 	u32 *out = NULL;
 	size_t offset;
+	int ndescs;
 
 	srcu_key = srcu_read_lock(&dev->mr_srcu);
 
@@ -647,23 +793,49 @@
 	bcnt -= *bytes_committed;
 
 next_mr:
-	mmkey = __mlx5_mr_lookup(dev->mdev, mlx5_base_mkey(key));
-	if (!mmkey || mmkey->key != key) {
+	mmkey = xa_load(&dev->mdev->priv.mkey_table, mlx5_base_mkey(key));
+	if (!mkey_is_eq(mmkey, key)) {
 		mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
 		ret = -EFAULT;
 		goto srcu_unlock;
 	}
 
+	if (prefetch && mmkey->type != MLX5_MKEY_MR) {
+		mlx5_ib_dbg(dev, "prefetch is allowed only for MR\n");
+		ret = -EINVAL;
+		goto srcu_unlock;
+	}
+
 	switch (mmkey->type) {
 	case MLX5_MKEY_MR:
 		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
-		if (!mr->live || !mr->ibmr.pd) {
+		if (!smp_load_acquire(&mr->live) || !mr->ibmr.pd) {
 			mlx5_ib_dbg(dev, "got dead MR\n");
 			ret = -EFAULT;
 			goto srcu_unlock;
 		}
 
-		ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped);
+		if (prefetch) {
+			if (!is_odp_mr(mr) ||
+			    mr->ibmr.pd != pd) {
+				mlx5_ib_dbg(dev, "Invalid prefetch request: %s\n",
+					    is_odp_mr(mr) ? "PD is not of the MR" :
+					    "MR is not ODP");
+				ret = -EINVAL;
+				goto srcu_unlock;
+			}
+		}
+
+		if (!is_odp_mr(mr)) {
+			mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
+				    key);
+			if (bytes_mapped)
+				*bytes_mapped += bcnt;
+			ret = 0;
+			goto srcu_unlock;
+		}
+
+		ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped, flags);
 		if (ret < 0)
 			goto srcu_unlock;
 
@@ -672,7 +844,8 @@
 		break;
 
 	case MLX5_MKEY_MW:
-		mw = container_of(mmkey, struct mlx5_ib_mw, mmkey);
+	case MLX5_MKEY_INDIRECT_DEVX:
+		ndescs = get_indirect_num_descs(mmkey);
 
 		if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
 			mlx5_ib_dbg(dev, "indirection level exceeded\n");
@@ -681,7 +854,7 @@
 		}
 
 		outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
-			sizeof(*pklm) * (mw->ndescs - 2);
+			sizeof(*pklm) * (ndescs - 2);
 
 		if (outlen > cur_outlen) {
 			kfree(out);
@@ -696,14 +869,14 @@
 		pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
 						       bsf0_klm0_pas_mtt0_1);
 
-		ret = mlx5_core_query_mkey(dev->mdev, &mw->mmkey, out, outlen);
+		ret = mlx5_core_query_mkey(dev->mdev, mmkey, out, outlen);
 		if (ret)
 			goto srcu_unlock;
 
 		offset = io_virt - MLX5_GET64(query_mkey_out, out,
 					      memory_key_mkey_entry.start_addr);
 
-		for (i = 0; bcnt && i < mw->ndescs; i++, pklm++) {
+		for (i = 0; bcnt && i < ndescs; i++, pklm++) {
 			if (offset >= be32_to_cpu(pklm->bcount)) {
 				offset -= be32_to_cpu(pklm->bcount);
 				continue;
@@ -763,7 +936,6 @@
 /**
  * Parse a series of data segments for page fault handling.
  *
- * @qp the QP on which the fault occurred.
  * @pfault contains page fault information.
  * @wqe points at the first data segment in the WQE.
  * @wqe_end points after the end of the WQE.
@@ -780,9 +952,9 @@
  */
 static int pagefault_data_segments(struct mlx5_ib_dev *dev,
 				   struct mlx5_pagefault *pfault,
-				   struct mlx5_ib_qp *qp, void *wqe,
+				   void *wqe,
 				   void *wqe_end, u32 *bytes_mapped,
-				   u32 *total_wqe_bytes, int receive_queue)
+				   u32 *total_wqe_bytes, bool receive_queue)
 {
 	int ret = 0, npages = 0;
 	u64 io_virt;
@@ -791,10 +963,6 @@
 	size_t bcnt;
 	int inline_segment;
 
-	/* Skip SRQ next-WQE segment. */
-	if (receive_queue && qp->ibqp.srq)
-		wqe += sizeof(struct mlx5_wqe_srq_next_seg);
-
 	if (bytes_mapped)
 		*bytes_mapped = 0;
 	if (total_wqe_bytes)
@@ -838,9 +1006,10 @@
 			continue;
 		}
 
-		ret = pagefault_single_data_segment(dev, key, io_virt, bcnt,
+		ret = pagefault_single_data_segment(dev, NULL, key,
+						    io_virt, bcnt,
 						    &pfault->bytes_committed,
-						    bytes_mapped);
+						    bytes_mapped, 0);
 		if (ret < 0)
 			break;
 		npages += ret;
@@ -849,17 +1018,6 @@
 	return ret < 0 ? ret : npages;
 }
 
-static const u32 mlx5_ib_odp_opcode_cap[] = {
-	[MLX5_OPCODE_SEND]	       = IB_ODP_SUPPORT_SEND,
-	[MLX5_OPCODE_SEND_IMM]	       = IB_ODP_SUPPORT_SEND,
-	[MLX5_OPCODE_SEND_INVAL]       = IB_ODP_SUPPORT_SEND,
-	[MLX5_OPCODE_RDMA_WRITE]       = IB_ODP_SUPPORT_WRITE,
-	[MLX5_OPCODE_RDMA_WRITE_IMM]   = IB_ODP_SUPPORT_WRITE,
-	[MLX5_OPCODE_RDMA_READ]	       = IB_ODP_SUPPORT_READ,
-	[MLX5_OPCODE_ATOMIC_CS]	       = IB_ODP_SUPPORT_ATOMIC,
-	[MLX5_OPCODE_ATOMIC_FA]	       = IB_ODP_SUPPORT_ATOMIC,
-};
-
 /*
  * Parse initiator WQE. Advances the wqe pointer to point at the
  * scatter-gather list, and set wqe_end to the end of the WQE.
@@ -870,12 +1028,8 @@
 {
 	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
 	u16 wqe_index = pfault->wqe.wqe_index;
-	u32 transport_caps;
 	struct mlx5_base_av *av;
 	unsigned ds, opcode;
-#if defined(DEBUG)
-	u32 ctrl_wqe_index, ctrl_qpn;
-#endif
 	u32 qpn = qp->trans_qp.base.mqp.qpn;
 
 	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
@@ -891,54 +1045,17 @@
 		return -EFAULT;
 	}
 
-#if defined(DEBUG)
-	ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
-			MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
-			MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
-	if (wqe_index != ctrl_wqe_index) {
-		mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
-			    wqe_index, qpn,
-			    ctrl_wqe_index);
-		return -EFAULT;
-	}
-
-	ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
-		MLX5_WQE_CTRL_QPN_SHIFT;
-	if (qpn != ctrl_qpn) {
-		mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
-			    wqe_index, qpn,
-			    ctrl_qpn);
-		return -EFAULT;
-	}
-#endif /* DEBUG */
-
 	*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
 	*wqe += sizeof(*ctrl);
 
 	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
 		 MLX5_WQE_CTRL_OPCODE_MASK;
 
-	switch (qp->ibqp.qp_type) {
-	case IB_QPT_RC:
-		transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps;
-		break;
-	case IB_QPT_UD:
-		transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps;
-		break;
-	default:
-		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n",
-			    qp->ibqp.qp_type);
-		return -EFAULT;
-	}
+	if (qp->ibqp.qp_type == IB_QPT_XRC_INI)
+		*wqe += sizeof(struct mlx5_wqe_xrc_seg);
 
-	if (unlikely(opcode >= ARRAY_SIZE(mlx5_ib_odp_opcode_cap) ||
-		     !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) {
-		mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n",
-			    opcode);
-		return -EFAULT;
-	}
-
-	if (qp->ibqp.qp_type != IB_QPT_RC) {
+	if (qp->ibqp.qp_type == IB_QPT_UD ||
+	    qp->qp_sub_type == MLX5_IB_QPT_DCI) {
 		av = *wqe;
 		if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
 			*wqe += sizeof(struct mlx5_av);
@@ -963,21 +1080,34 @@
 }
 
 /*
- * Parse responder WQE. Advances the wqe pointer to point at the
- * scatter-gather list, and set wqe_end to the end of the WQE.
+ * Parse responder WQE and set wqe_end to the end of the WQE.
  */
-static int mlx5_ib_mr_responder_pfault_handler(
-	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
-	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
+static int mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev *dev,
+						   struct mlx5_ib_srq *srq,
+						   void **wqe, void **wqe_end,
+						   int wqe_length)
+{
+	int wqe_size = 1 << srq->msrq.wqe_shift;
+
+	if (wqe_size > wqe_length) {
+		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
+		return -EFAULT;
+	}
+
+	*wqe_end = *wqe + wqe_size;
+	*wqe += sizeof(struct mlx5_wqe_srq_next_seg);
+
+	return 0;
+}
+
+static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev,
+						  struct mlx5_ib_qp *qp,
+						  void *wqe, void **wqe_end,
+						  int wqe_length)
 {
 	struct mlx5_ib_wq *wq = &qp->rq;
 	int wqe_size = 1 << wq->wqe_shift;
 
-	if (qp->ibqp.srq) {
-		mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
-		return -EFAULT;
-	}
-
 	if (qp->wq_sig) {
 		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
 		return -EFAULT;
@@ -988,99 +1118,138 @@
 		return -EFAULT;
 	}
 
-	switch (qp->ibqp.qp_type) {
-	case IB_QPT_RC:
-		if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
-		      IB_ODP_SUPPORT_RECV))
-			goto invalid_transport_or_opcode;
-		break;
-	default:
-invalid_transport_or_opcode:
-		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
-			    qp->ibqp.qp_type);
-		return -EFAULT;
-	}
-
-	*wqe_end = *wqe + wqe_size;
+	*wqe_end = wqe + wqe_size;
 
 	return 0;
 }
 
-static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev,
-					      u32 wq_num)
+static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev,
+						       u32 wq_num, int pf_type)
 {
-	struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num);
+	struct mlx5_core_rsc_common *common = NULL;
+	struct mlx5_core_srq *srq;
 
-	if (!mqp) {
-		mlx5_ib_err(dev, "QPN 0x%6x not found\n", wq_num);
-		return NULL;
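+	/* The returned resource is reference counted; the WQE page fault
+	 * handler releases it with mlx5_core_res_put() once the fault has
+	 * been handled.
+	 */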
+	switch (pf_type) {
+	case MLX5_WQE_PF_TYPE_RMP:
+		srq = mlx5_cmd_get_srq(dev, wq_num);
+		if (srq)
+			common = &srq->common;
+		break;
+	case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE:
+	case MLX5_WQE_PF_TYPE_RESP:
+	case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC:
+		common = mlx5_core_res_hold(dev->mdev, wq_num, MLX5_RES_QP);
+		break;
+	default:
+		break;
 	}
 
+	return common;
+}
+
+static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res)
+{
+	struct mlx5_core_qp *mqp = (struct mlx5_core_qp *)res;
+
 	return to_mibqp(mqp);
 }
 
+static inline struct mlx5_ib_srq *res_to_srq(struct mlx5_core_rsc_common *res)
+{
+	struct mlx5_core_srq *msrq =
+		container_of(res, struct mlx5_core_srq, common);
+
+	return to_mibsrq(msrq);
+}
+
 static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
 					  struct mlx5_pagefault *pfault)
 {
-	int ret;
-	void *wqe, *wqe_end;
-	u32 bytes_mapped, total_wqe_bytes;
-	char *buffer = NULL;
-	int resume_with_error = 1;
+	bool sq = pfault->type & MLX5_PFAULT_REQUESTOR;
 	u16 wqe_index = pfault->wqe.wqe_index;
-	int requestor = pfault->type & MLX5_PFAULT_REQUESTOR;
+	void *wqe, *wqe_start = NULL, *wqe_end = NULL;
+	u32 bytes_mapped, total_wqe_bytes;
+	struct mlx5_core_rsc_common *res;
+	int resume_with_error = 1;
 	struct mlx5_ib_qp *qp;
+	size_t bytes_copied;
+	int ret = 0;
 
-	buffer = (char *)__get_free_page(GFP_KERNEL);
-	if (!buffer) {
+	res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type);
+	if (!res) {
+		mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num);
+		return;
+	}
+
+	if (res->res != MLX5_RES_QP && res->res != MLX5_RES_SRQ &&
+	    res->res != MLX5_RES_XSRQ) {
+		mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n",
+			    pfault->type);
+		goto resolve_page_fault;
+	}
+
+	wqe_start = (void *)__get_free_page(GFP_KERNEL);
+	if (!wqe_start) {
 		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
 		goto resolve_page_fault;
 	}
 
-	qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num);
-	if (!qp)
-		goto resolve_page_fault;
+	wqe = wqe_start;
+	qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
+	if (qp && sq) {
+		ret = mlx5_ib_read_user_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
+					       &bytes_copied);
+		if (ret)
+			goto read_user;
+		ret = mlx5_ib_mr_initiator_pfault_handler(
+			dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
+	} else if (qp && !sq) {
+		ret = mlx5_ib_read_user_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
+					       &bytes_copied);
+		if (ret)
+			goto read_user;
+		ret = mlx5_ib_mr_responder_pfault_handler_rq(
+			dev, qp, wqe, &wqe_end, bytes_copied);
+	} else if (!qp) {
+		struct mlx5_ib_srq *srq = res_to_srq(res);
 
-	ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
-				    PAGE_SIZE, &qp->trans_qp.base);
-	if (ret < 0) {
-		mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n",
-			    ret, wqe_index, pfault->token);
-		goto resolve_page_fault;
+		ret = mlx5_ib_read_user_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
+						&bytes_copied);
+		if (ret)
+			goto read_user;
+		ret = mlx5_ib_mr_responder_pfault_handler_srq(
+			dev, srq, &wqe, &wqe_end, bytes_copied);
 	}
 
-	wqe = buffer;
-	if (requestor)
-		ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe,
-							  &wqe_end, ret);
-	else
-		ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe,
-							  &wqe_end, ret);
-	if (ret < 0)
+	if (ret < 0 || wqe >= wqe_end)
 		goto resolve_page_fault;
 
-	if (wqe >= wqe_end) {
-		mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
-		goto resolve_page_fault;
-	}
+	ret = pagefault_data_segments(dev, pfault, wqe, wqe_end, &bytes_mapped,
+				      &total_wqe_bytes, !sq);
+	if (ret == -EAGAIN)
+		goto out;
 
-	ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end,
-				      &bytes_mapped, &total_wqe_bytes,
-				      !requestor);
-	if (ret == -EAGAIN) {
-		resume_with_error = 0;
+	if (ret < 0 || total_wqe_bytes > bytes_mapped)
 		goto resolve_page_fault;
-	} else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
-		goto resolve_page_fault;
-	}
 
+out:
+	ret = 0;
 	resume_with_error = 0;
+
+read_user:
+	if (ret)
+		mlx5_ib_err(
+			dev,
+			"Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n",
+			ret, wqe_index, pfault->token);
+
 resolve_page_fault:
 	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
 	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
 		    pfault->wqe.wq_num, resume_with_error,
 		    pfault->type);
-	free_page((unsigned long)buffer);
+	mlx5_core_res_put(res);
+	free_page((unsigned long)wqe_start);
 }
 
 static int pages_in_range(u64 address, u32 length)
@@ -1122,8 +1291,9 @@
 		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
 	}
 
-	ret = pagefault_single_data_segment(dev, rkey, address, length,
-					    &pfault->bytes_committed, NULL);
+	ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
+					    &pfault->bytes_committed, NULL,
+					    0);
 	if (ret == -EAGAIN) {
 		/* We're racing with an invalidation, don't prefetch */
 		prefetch_activated = 0;
@@ -1148,9 +1318,10 @@
 	if (prefetch_activated) {
 		u32 bytes_committed = 0;
 
-		ret = pagefault_single_data_segment(dev, rkey, address,
+		ret = pagefault_single_data_segment(dev, NULL, rkey, address,
 						    prefetch_len,
-						    &bytes_committed, NULL);
+						    &bytes_committed, NULL,
+						    0);
 		if (ret < 0 && ret != -EAGAIN) {
 			mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
 				    ret, pfault->token, address, prefetch_len);
@@ -1158,10 +1329,8 @@
 	}
 }
 
-void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
-		    struct mlx5_pagefault *pfault)
+static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
 {
-	struct mlx5_ib_dev *dev = context;
 	u8 event_subtype = pfault->event_subtype;
 
 	switch (event_subtype) {
@@ -1178,6 +1347,212 @@
 	}
 }
 
+static void mlx5_ib_eqe_pf_action(struct work_struct *work)
+{
+	struct mlx5_pagefault *pfault = container_of(work,
+						     struct mlx5_pagefault,
+						     work);
+	struct mlx5_ib_pf_eq *eq = pfault->eq;
+
+	mlx5_ib_pfault(eq->dev, pfault);
+	mempool_free(pfault, eq->pool);
+}
+
+static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
+{
+	struct mlx5_eqe_page_fault *pf_eqe;
+	struct mlx5_pagefault *pfault;
+	struct mlx5_eqe *eqe;
+	int cc = 0;
+
+	while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) {
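+		/* Allocation can fail under memory pressure; leave this EQE
+		 * unconsumed and let the deferred work refill the pool and
+		 * re-run EQ processing.
+		 */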
+		pfault = mempool_alloc(eq->pool, GFP_ATOMIC);
+		if (!pfault) {
+			schedule_work(&eq->work);
+			break;
+		}
+
+		pf_eqe = &eqe->data.page_fault;
+		pfault->event_subtype = eqe->sub_type;
+		pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);
+
+		mlx5_ib_dbg(eq->dev,
+			    "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
+			    eqe->sub_type, pfault->bytes_committed);
+
+		switch (eqe->sub_type) {
+		case MLX5_PFAULT_SUBTYPE_RDMA:
+			/* RDMA based event */
+			pfault->type =
+				be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
+			pfault->token =
+				be32_to_cpu(pf_eqe->rdma.pftype_token) &
+				MLX5_24BIT_MASK;
+			pfault->rdma.r_key =
+				be32_to_cpu(pf_eqe->rdma.r_key);
+			pfault->rdma.packet_size =
+				be16_to_cpu(pf_eqe->rdma.packet_length);
+			pfault->rdma.rdma_op_len =
+				be32_to_cpu(pf_eqe->rdma.rdma_op_len);
+			pfault->rdma.rdma_va =
+				be64_to_cpu(pf_eqe->rdma.rdma_va);
+			mlx5_ib_dbg(eq->dev,
+				    "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
+				    pfault->type, pfault->token,
+				    pfault->rdma.r_key);
+			mlx5_ib_dbg(eq->dev,
+				    "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
+				    pfault->rdma.rdma_op_len,
+				    pfault->rdma.rdma_va);
+			break;
+
+		case MLX5_PFAULT_SUBTYPE_WQE:
+			/* WQE based event */
+			pfault->type =
+				(be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
+			pfault->token =
+				be32_to_cpu(pf_eqe->wqe.token);
+			pfault->wqe.wq_num =
+				be32_to_cpu(pf_eqe->wqe.pftype_wq) &
+				MLX5_24BIT_MASK;
+			pfault->wqe.wqe_index =
+				be16_to_cpu(pf_eqe->wqe.wqe_index);
+			pfault->wqe.packet_size =
+				be16_to_cpu(pf_eqe->wqe.packet_length);
+			mlx5_ib_dbg(eq->dev,
+				    "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
+				    pfault->type, pfault->token,
+				    pfault->wqe.wq_num,
+				    pfault->wqe.wqe_index);
+			break;
+
+		default:
+			mlx5_ib_warn(eq->dev,
+				     "Unsupported page fault event sub-type: 0x%02hhx\n",
+				     eqe->sub_type);
+			/* Unsupported page faults should still be
+			 * resolved by the page fault handler
+			 */
+		}
+
+		pfault->eq = eq;
+		INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action);
+		queue_work(eq->wq, &pfault->work);
+
+		cc = mlx5_eq_update_cc(eq->core, ++cc);
+	}
+
+	mlx5_eq_update_ci(eq->core, cc, 1);
+}
+
+static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type,
+			     void *data)
+{
+	struct mlx5_ib_pf_eq *eq =
+		container_of(nb, struct mlx5_ib_pf_eq, irq_nb);
+	unsigned long flags;
+
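+	/* Process the EQ directly from the notifier if the lock is free;
+	 * otherwise defer to the work item rather than spinning here.
+	 */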
+	if (spin_trylock_irqsave(&eq->lock, flags)) {
+		mlx5_ib_eq_pf_process(eq);
+		spin_unlock_irqrestore(&eq->lock, flags);
+	} else {
+		schedule_work(&eq->work);
+	}
+
+	return IRQ_HANDLED;
+}
+
+/* mempool_refill() was proposed but unfortunately wasn't accepted
+ * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
+ * Cheap workaround.
+ */
+static void mempool_refill(mempool_t *pool)
+{
+	while (pool->curr_nr < pool->min_nr)
+		mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
+}
+
+static void mlx5_ib_eq_pf_action(struct work_struct *work)
+{
+	struct mlx5_ib_pf_eq *eq =
+		container_of(work, struct mlx5_ib_pf_eq, work);
+
+	mempool_refill(eq->pool);
+
+	spin_lock_irq(&eq->lock);
+	mlx5_ib_eq_pf_process(eq);
+	spin_unlock_irq(&eq->lock);
+}
+
+enum {
+	MLX5_IB_NUM_PF_EQE	= 0x1000,
+	MLX5_IB_NUM_PF_DRAIN	= 64,
+};
+
+static int
+mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
+{
+	struct mlx5_eq_param param = {};
+	int err;
+
+	INIT_WORK(&eq->work, mlx5_ib_eq_pf_action);
+	spin_lock_init(&eq->lock);
+	eq->dev = dev;
+
+	eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN,
+					       sizeof(struct mlx5_pagefault));
+	if (!eq->pool)
+		return -ENOMEM;
+
+	eq->wq = alloc_workqueue("mlx5_ib_page_fault",
+				 WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
+				 MLX5_NUM_CMD_EQE);
+	if (!eq->wq) {
+		err = -ENOMEM;
+		goto err_mempool;
+	}
+
+	eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
+	param = (struct mlx5_eq_param) {
+		.irq_index = 0,
+		.nent = MLX5_IB_NUM_PF_EQE,
+	};
+	param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
+	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
+	if (IS_ERR(eq->core)) {
+		err = PTR_ERR(eq->core);
+		goto err_wq;
+	}
+	err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb);
+	if (err) {
+		mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err);
+		goto err_eq;
+	}
+
+	return 0;
+err_eq:
+	mlx5_eq_destroy_generic(dev->mdev, eq->core);
+err_wq:
+	destroy_workqueue(eq->wq);
+err_mempool:
+	mempool_destroy(eq->pool);
+	return err;
+}
+
+static int
+mlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
+{
+	int err;
+
+	mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb);
+	err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
+	cancel_work_sync(&eq->work);
+	destroy_workqueue(eq->wq);
+	mempool_destroy(eq->pool);
+
+	return err;
+}
+
 void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
 {
 	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
@@ -1204,9 +1579,19 @@
 	}
 }
 
+static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
+	.advise_mr = mlx5_ib_advise_mr,
+	.invalidate_range = mlx5_ib_invalidate_range,
+};
+
 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
 {
-	int ret;
+	int ret = 0;
+
+	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
+		return ret;
+
+	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
 
 	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
 		ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
@@ -1216,7 +1601,17 @@
 		}
 	}
 
-	return 0;
+	ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq);
+
+	return ret;
+}
+
+void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
+{
+	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
+		return;
+
+	mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq);
 }
 
 int mlx5_ib_odp_init(void)
@@ -1227,3 +1622,160 @@
 	return 0;
 }
 
+struct prefetch_mr_work {
+	struct work_struct work;
+	struct ib_pd *pd;
+	u32 pf_flags;
+	u32 num_sge;
+	struct ib_sge sg_list[0];
+};
+
+static void num_pending_prefetch_dec(struct mlx5_ib_dev *dev,
+				     struct ib_sge *sg_list, u32 num_sge,
+				     u32 from)
+{
+	u32 i;
+	int srcu_key;
+
+	srcu_key = srcu_read_lock(&dev->mr_srcu);
+
+	for (i = from; i < num_sge; ++i) {
+		struct mlx5_core_mkey *mmkey;
+		struct mlx5_ib_mr *mr;
+
+		mmkey = xa_load(&dev->mdev->priv.mkey_table,
+				mlx5_base_mkey(sg_list[i].lkey));
+		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+		atomic_dec(&mr->num_pending_prefetch);
+	}
+
+	srcu_read_unlock(&dev->mr_srcu, srcu_key);
+}
+
+static bool num_pending_prefetch_inc(struct ib_pd *pd,
+				     struct ib_sge *sg_list, u32 num_sge)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	bool ret = true;
+	u32 i;
+
+	for (i = 0; i < num_sge; ++i) {
+		struct mlx5_core_mkey *mmkey;
+		struct mlx5_ib_mr *mr;
+
+		mmkey = xa_load(&dev->mdev->priv.mkey_table,
+				mlx5_base_mkey(sg_list[i].lkey));
+		if (!mmkey || mmkey->key != sg_list[i].lkey) {
+			ret = false;
+			break;
+		}
+
+		if (mmkey->type != MLX5_MKEY_MR) {
+			ret = false;
+			break;
+		}
+
+		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+
+		if (!smp_load_acquire(&mr->live)) {
+			ret = false;
+			break;
+		}
+
+		if (mr->ibmr.pd != pd) {
+			ret = false;
+			break;
+		}
+
+		atomic_inc(&mr->num_pending_prefetch);
+	}
+
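+	/* On failure, drop the references already taken on the first i
+	 * entries before returning.
+	 */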
+	if (!ret)
+		num_pending_prefetch_dec(dev, sg_list, i, 0);
+
+	return ret;
+}
+
+static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, u32 pf_flags,
+				    struct ib_sge *sg_list, u32 num_sge)
+{
+	u32 i;
+	int ret = 0;
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+
+	for (i = 0; i < num_sge; ++i) {
+		struct ib_sge *sg = &sg_list[i];
+		int bytes_committed = 0;
+
+		ret = pagefault_single_data_segment(dev, pd, sg->lkey, sg->addr,
+						    sg->length,
+						    &bytes_committed, NULL,
+						    pf_flags);
+		if (ret < 0)
+			break;
+	}
+
+	return ret < 0 ? ret : 0;
+}
+
+static void mlx5_ib_prefetch_mr_work(struct work_struct *work)
+{
+	struct prefetch_mr_work *w =
+		container_of(work, struct prefetch_mr_work, work);
+
+	if (ib_device_try_get(w->pd->device)) {
+		mlx5_ib_prefetch_sg_list(w->pd, w->pf_flags, w->sg_list,
+					 w->num_sge);
+		ib_device_put(w->pd->device);
+	}
+
+	num_pending_prefetch_dec(to_mdev(w->pd->device), w->sg_list,
+				 w->num_sge, 0);
+	kvfree(w);
+}
+
+int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
+			       enum ib_uverbs_advise_mr_advice advice,
+			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	u32 pf_flags = MLX5_PF_FLAGS_PREFETCH;
+	struct prefetch_mr_work *work;
+	bool valid_req;
+	int srcu_key;
+
+	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
+		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
+
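+	/* A FLUSH request is served synchronously in the caller's context;
+	 * any other request is validated and queued to a work item.
+	 */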
+	if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
+		return mlx5_ib_prefetch_sg_list(pd, pf_flags, sg_list,
+						num_sge);
+
+	work = kvzalloc(struct_size(work, sg_list, num_sge), GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge));
+
+	/* It is guaranteed that the pd when the work is executed is the pd
+	 * when the work was queued, since a pd can't be destroyed while it
+	 * holds MRs and destroying an MR flushes the workqueue.
+	 */
+	work->pd = pd;
+	work->pf_flags = pf_flags;
+	work->num_sge = num_sge;
+
+	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
+
+	srcu_key = srcu_read_lock(&dev->mr_srcu);
+
+	valid_req = num_pending_prefetch_inc(pd, sg_list, num_sge);
+	if (valid_req)
+		queue_work(system_unbound_wq, &work->work);
+	else
+		kvfree(work);
+
+	srcu_read_unlock(&dev->mr_srcu, srcu_key);
+
+	return valid_req ? 0 : -EINVAL;
+}
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 183fe5c..5fd071c 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -34,9 +34,11 @@
 #include <rdma/ib_umem.h>
 #include <rdma/ib_cache.h>
 #include <rdma/ib_user_verbs.h>
+#include <rdma/rdma_counter.h>
 #include <linux/mlx5/fs.h>
 #include "mlx5_ib.h"
 #include "ib_rep.h"
+#include "cmd.h"
 
 /* not supported currently */
 static int wq_signature;
@@ -91,6 +93,7 @@
 	struct mlx5_rate_limit rl;
 
 	u8 rq_q_ctr_id;
+	u16 port;
 };
 
 static void get_cqs(enum ib_qp_type qp_type,
@@ -107,91 +110,174 @@
 	return is_qp0(qp_type) || is_qp1(qp_type);
 }
 
-static void *get_wqe(struct mlx5_ib_qp *qp, int offset)
-{
-	return mlx5_buf_offset(&qp->buf, offset);
-}
-
-static void *get_recv_wqe(struct mlx5_ib_qp *qp, int n)
-{
-	return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
-}
-
-void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n)
-{
-	return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE));
-}
-
 /**
- * mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space.
+ * mlx5_ib_read_user_wqe_common() - Copy a WQE (or part of) from user WQ
+ * to kernel buffer
  *
- * @qp: QP to copy from.
- * @send: copy from the send queue when non-zero, use the receive queue
- *	  otherwise.
- * @wqe_index:  index to start copying from. For send work queues, the
- *		wqe_index is in units of MLX5_SEND_WQE_BB.
- *		For receive work queue, it is the number of work queue
- *		element in the queue.
- * @buffer: destination buffer.
- * @length: maximum number of bytes to copy.
+ * @umem: User space memory where the WQ is
+ * @buffer: buffer to copy to
+ * @buflen: buffer length
+ * @wqe_index: index of WQE to copy from
+ * @wq_offset: offset to start of WQ
+ * @wq_wqe_cnt: number of WQEs in WQ
+ * @wq_wqe_shift: log2 of WQE size
+ * @bcnt: number of bytes to copy
+ * @bytes_copied: number of bytes actually copied (return value)
  *
- * Copies at least a single WQE, but may copy more data.
+ * Copies at most bcnt bytes starting from the beginning of the WQE.
+ * Does not guarantee that the entire WQE is copied.
  *
- * Return: the number of bytes copied, or an error code.
+ * Return: zero on success, or an error code.
  */
-int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
-			  void *buffer, u32 length,
-			  struct mlx5_ib_qp_base *base)
+static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem,
+					void *buffer,
+					u32 buflen,
+					int wqe_index,
+					int wq_offset,
+					int wq_wqe_cnt,
+					int wq_wqe_shift,
+					int bcnt,
+					size_t *bytes_copied)
 {
-	struct ib_device *ibdev = qp->ibqp.device;
-	struct mlx5_ib_dev *dev = to_mdev(ibdev);
-	struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq;
-	size_t offset;
-	size_t wq_end;
-	struct ib_umem *umem = base->ubuffer.umem;
-	u32 first_copy_length;
-	int wqe_length;
+	size_t offset = wq_offset + ((wqe_index % wq_wqe_cnt) << wq_wqe_shift);
+	size_t wq_end = wq_offset + (wq_wqe_cnt << wq_wqe_shift);
+	size_t copy_length;
 	int ret;
 
-	if (wq->wqe_cnt == 0) {
-		mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. qp_type: 0x%x\n",
-			    qp->ibqp.qp_type);
-		return -EINVAL;
-	}
+	/* don't copy more than requested, more than the buffer length or
+	 * beyond the end of the WQ
+	 */
+	copy_length = min_t(u32, buflen, wq_end - offset);
+	copy_length = min_t(u32, copy_length, bcnt);
 
-	offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift);
-	wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift);
-
-	if (send && length < sizeof(struct mlx5_wqe_ctrl_seg))
-		return -EINVAL;
-
-	if (offset > umem->length ||
-	    (send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length))
-		return -EINVAL;
-
-	first_copy_length = min_t(u32, offset + length, wq_end) - offset;
-	ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length);
+	ret = ib_umem_copy_from(buffer, umem, offset, copy_length);
 	if (ret)
 		return ret;
 
-	if (send) {
-		struct mlx5_wqe_ctrl_seg *ctrl = buffer;
-		int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
+	if (bytes_copied)
+		*bytes_copied = copy_length;
 
-		wqe_length = ds * MLX5_WQE_DS_UNITS;
-	} else {
-		wqe_length = 1 << wq->wqe_shift;
-	}
+	return 0;
+}
 
-	if (wqe_length <= first_copy_length)
-		return first_copy_length;
+int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
+			     int wqe_index,
+			     void *buffer,
+			     int buflen,
+			     size_t *bc)
+{
+	struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
+	struct ib_umem *umem = base->ubuffer.umem;
+	struct mlx5_ib_wq *wq = &qp->sq;
+	struct mlx5_wqe_ctrl_seg *ctrl;
+	size_t bytes_copied;
+	size_t bytes_copied2;
+	size_t wqe_length;
+	int ret;
+	int ds;
 
-	ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset,
-				wqe_length - first_copy_length);
+	if (buflen < sizeof(*ctrl))
+		return -EINVAL;
+
+	/* at first read as much as possible */
+	ret = mlx5_ib_read_user_wqe_common(umem,
+					   buffer,
+					   buflen,
+					   wqe_index,
+					   wq->offset,
+					   wq->wqe_cnt,
+					   wq->wqe_shift,
+					   buflen,
+					   &bytes_copied);
 	if (ret)
 		return ret;
 
-	return wqe_length;
+	/* we need at least control segment size to proceed */
+	if (bytes_copied < sizeof(*ctrl))
+		return -EINVAL;
+
+	ctrl = buffer;
+	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
+	wqe_length = ds * MLX5_WQE_DS_UNITS;
+
+	/* if we copied enough then we are done */
+	if (bytes_copied >= wqe_length) {
+		*bc = bytes_copied;
+		return 0;
+	}
+
+	/* otherwise this is a wrapped-around WQE,
+	 * so read the remaining bytes starting
+	 * from wqe_index 0
+	 */
+	ret = mlx5_ib_read_user_wqe_common(umem,
+					   buffer + bytes_copied,
+					   buflen - bytes_copied,
+					   0,
+					   wq->offset,
+					   wq->wqe_cnt,
+					   wq->wqe_shift,
+					   wqe_length - bytes_copied,
+					   &bytes_copied2);
+
+	if (ret)
+		return ret;
+	*bc = bytes_copied + bytes_copied2;
+	return 0;
+}
+
+int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp,
+			     int wqe_index,
+			     void *buffer,
+			     int buflen,
+			     size_t *bc)
+{
+	struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
+	struct ib_umem *umem = base->ubuffer.umem;
+	struct mlx5_ib_wq *wq = &qp->rq;
+	size_t bytes_copied;
+	int ret;
+
+	ret = mlx5_ib_read_user_wqe_common(umem,
+					   buffer,
+					   buflen,
+					   wqe_index,
+					   wq->offset,
+					   wq->wqe_cnt,
+					   wq->wqe_shift,
+					   buflen,
+					   &bytes_copied);
+
+	if (ret)
+		return ret;
+	*bc = bytes_copied;
+	return 0;
+}
+
+int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq,
+			      int wqe_index,
+			      void *buffer,
+			      int buflen,
+			      size_t *bc)
+{
+	struct ib_umem *umem = srq->umem;
+	size_t bytes_copied;
+	int ret;
+
+	ret = mlx5_ib_read_user_wqe_common(umem,
+					   buffer,
+					   buflen,
+					   wqe_index,
+					   0,
+					   srq->msrq.max,
+					   srq->msrq.wqe_shift,
+					   buflen,
+					   &bytes_copied);
+
+	if (ret)
+		return ret;
+	*bc = bytes_copied;
+	return 0;
 }
 
 static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
@@ -357,9 +443,9 @@
 	}
 
 	size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg);
-	if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN &&
+	if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN &&
 	    ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE)
-			return MLX5_SIG_WQE_SIZE;
+		return MLX5_SIG_WQE_SIZE;
 	else
 		return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB);
 }
@@ -411,9 +497,6 @@
 			      sizeof(struct mlx5_wqe_inline_seg);
 	attr->cap.max_inline_data = qp->max_inline_data;
 
-	if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN)
-		qp->signature_en = true;
-
 	wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size);
 	qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB;
 	if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) {
@@ -449,9 +532,9 @@
 		return -EINVAL;
 	}
 
-	if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) {
-		mlx5_ib_warn(dev, "sq_wqe_count %d, sq_wqe_count %d\n",
-			     ucmd->sq_wqe_count, ucmd->sq_wqe_count);
+	if (ucmd->sq_wqe_count && !is_power_of_2(ucmd->sq_wqe_count)) {
+		mlx5_ib_warn(dev, "sq_wqe_count %d is not a power of two\n",
+			     ucmd->sq_wqe_count);
 		return -EINVAL;
 	}
 
@@ -659,16 +742,14 @@
 	return bfregi->sys_pages[index_of_sys_page] + offset;
 }
 
-static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev,
-			    struct ib_pd *pd,
+static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 			    unsigned long addr, size_t size,
-			    struct ib_umem **umem,
-			    int *npages, int *page_shift, int *ncont,
-			    u32 *offset)
+			    struct ib_umem **umem, int *npages, int *page_shift,
+			    int *ncont, u32 *offset)
 {
 	int err;
 
-	*umem = ib_umem_get(pd->uobject->context, addr, size, 0, 0);
+	*umem = ib_umem_get(udata, addr, size, 0, 0);
 	if (IS_ERR(*umem)) {
 		mlx5_ib_dbg(dev, "umem_get failed\n");
 		return PTR_ERR(*umem);
@@ -695,24 +776,27 @@
 }
 
 static void destroy_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
-			    struct mlx5_ib_rwq *rwq)
+			    struct mlx5_ib_rwq *rwq, struct ib_udata *udata)
 {
-	struct mlx5_ib_ucontext *context;
+	struct mlx5_ib_ucontext *context =
+		rdma_udata_to_drv_context(
+			udata,
+			struct mlx5_ib_ucontext,
+			ibucontext);
 
 	if (rwq->create_flags & MLX5_IB_WQ_FLAGS_DELAY_DROP)
 		atomic_dec(&dev->delay_drop.rqs_cnt);
 
-	context = to_mucontext(pd->uobject->context);
 	mlx5_ib_db_unmap_user(context, &rwq->db);
-	if (rwq->umem)
-		ib_umem_release(rwq->umem);
+	ib_umem_release(rwq->umem);
 }
 
 static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
-			  struct mlx5_ib_rwq *rwq,
+			  struct ib_udata *udata, struct mlx5_ib_rwq *rwq,
 			  struct mlx5_ib_create_wq *ucmd)
 {
-	struct mlx5_ib_ucontext *context;
+	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
 	int page_shift = 0;
 	int npages;
 	u32 offset = 0;
@@ -722,9 +806,7 @@
 	if (!ucmd->buf_addr)
 		return -EINVAL;
 
-	context = to_mucontext(pd->uobject->context);
-	rwq->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr,
-			       rwq->buf_size, 0, 0);
+	rwq->umem = ib_umem_get(udata, ucmd->buf_addr, rwq->buf_size, 0, 0);
 	if (IS_ERR(rwq->umem)) {
 		mlx5_ib_dbg(dev, "umem_get failed\n");
 		err = PTR_ERR(rwq->umem);
@@ -749,7 +831,7 @@
 		    (unsigned long long)ucmd->buf_addr, rwq->buf_size,
 		    npages, page_shift, ncont, offset);
 
-	err = mlx5_ib_db_map_user(context, ucmd->db_addr, &rwq->db);
+	err = mlx5_ib_db_map_user(ucontext, udata, ucmd->db_addr, &rwq->db);
 	if (err) {
 		mlx5_ib_dbg(dev, "map failed\n");
 		goto err_umem;
@@ -789,6 +871,7 @@
 	__be64 *pas;
 	void *qpc;
 	int err;
+	u16 uid;
 
 	err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd));
 	if (err) {
@@ -796,7 +879,8 @@
 		return err;
 	}
 
-	context = to_mucontext(pd->uobject->context);
+	context = rdma_udata_to_drv_context(udata, struct mlx5_ib_ucontext,
+					    ibucontext);
 	if (ucmd.flags & MLX5_QP_FLAG_BFREG_INDEX) {
 		uar_index = bfregn_to_uar_index(dev, &context->bfregi,
 						ucmd.bfreg_index, true);
@@ -832,10 +916,9 @@
 
 	if (ucmd.buf_addr && ubuffer->buf_size) {
 		ubuffer->buf_addr = ucmd.buf_addr;
-		err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr,
-				       ubuffer->buf_size,
-				       &ubuffer->umem, &npages, &page_shift,
-				       &ncont, &offset);
+		err = mlx5_ib_umem_get(dev, udata, ubuffer->buf_addr,
+				       ubuffer->buf_size, &ubuffer->umem,
+				       &npages, &page_shift, &ncont, &offset);
 		if (err)
 			goto err_bfreg;
 	} else {
@@ -850,6 +933,9 @@
 		goto err_umem;
 	}
 
+	uid = (attr->qp_type != IB_QPT_XRC_TGT &&
+	       attr->qp_type != IB_QPT_XRC_INI) ? to_mpd(pd)->uid : 0;
+	MLX5_SET(create_qp_in, *in, uid, uid);
 	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas);
 	if (ubuffer->umem)
 		mlx5_ib_populate_pas(dev, ubuffer->umem, page_shift, pas, 0);
@@ -866,7 +952,7 @@
 		resp->bfreg_index = MLX5_IB_INVALID_BFREG;
 	qp->bfregn = bfregn;
 
-	err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db);
+	err = mlx5_ib_db_map_user(context, udata, ucmd.db_addr, &qp->db);
 	if (err) {
 		mlx5_ib_dbg(dev, "map failed\n");
 		goto err_free;
@@ -888,8 +974,7 @@
 	kvfree(*in);
 
 err_umem:
-	if (ubuffer->umem)
-		ib_umem_release(ubuffer->umem);
+	ib_umem_release(ubuffer->umem);
 
 err_bfreg:
 	if (bfregn != MLX5_IB_INVALID_BFREG)
@@ -898,14 +983,17 @@
 }
 
 static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd,
-			    struct mlx5_ib_qp *qp, struct mlx5_ib_qp_base *base)
+			    struct mlx5_ib_qp *qp, struct mlx5_ib_qp_base *base,
+			    struct ib_udata *udata)
 {
-	struct mlx5_ib_ucontext *context;
+	struct mlx5_ib_ucontext *context =
+		rdma_udata_to_drv_context(
+			udata,
+			struct mlx5_ib_ucontext,
+			ibucontext);
 
-	context = to_mucontext(pd->uobject->context);
 	mlx5_ib_db_unmap_user(context, &qp->db);
-	if (base->ubuffer.umem)
-		ib_umem_release(base->ubuffer.umem);
+	ib_umem_release(base->ubuffer.umem);
 
 	/*
 	 * Free only the BFREGs which are handled by the kernel.
@@ -915,6 +1003,30 @@
 		mlx5_ib_free_bfreg(dev, &context->bfregi, qp->bfregn);
 }
 
+/* get_sq_edge - Get the next nearby edge.
+ *
+ * An 'edge' is defined as the first following address after the end
+ * of the fragment or the SQ. Accordingly, WQE construction, which
+ * repeatedly advances the pointer while writing the next data, only
+ * needs to check whether it has reached an edge.
+ *
+ * @sq - SQ buffer.
+ * @idx - Stride index in the SQ buffer.
+ *
+ * Return:
+ *	The new edge.
+ */
+static void *get_sq_edge(struct mlx5_ib_wq *sq, u32 idx)
+{
+	void *fragment_end;
+
+	fragment_end = mlx5_frag_buf_get_wqe
+		(&sq->fbc,
+		 mlx5_frag_buf_get_idx_last_contig_stride(&sq->fbc, idx));
+
+	return fragment_end + MLX5_SEND_WQE_BB;
+}
+
 static int create_kernel_qp(struct mlx5_ib_dev *dev,
 			    struct ib_qp_init_attr *init_attr,
 			    struct mlx5_ib_qp *qp,
@@ -925,7 +1037,7 @@
 	void *qpc;
 	int err;
 
-	if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN |
+	if (init_attr->create_flags & ~(IB_QP_CREATE_INTEGRITY_EN |
 					IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
 					IB_QP_CREATE_IPOIB_UD_LSO |
 					IB_QP_CREATE_NETIF_QP |
@@ -953,13 +1065,29 @@
 	qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
 	base->ubuffer.buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift);
 
-	err = mlx5_buf_alloc(dev->mdev, base->ubuffer.buf_size, &qp->buf);
+	err = mlx5_frag_buf_alloc_node(dev->mdev, base->ubuffer.buf_size,
+				       &qp->buf, dev->mdev->priv.numa_node);
 	if (err) {
 		mlx5_ib_dbg(dev, "err %d\n", err);
 		return err;
 	}
 
-	qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt);
+	if (qp->rq.wqe_cnt)
+		mlx5_init_fbc(qp->buf.frags, qp->rq.wqe_shift,
+			      ilog2(qp->rq.wqe_cnt), &qp->rq.fbc);
+
+	if (qp->sq.wqe_cnt) {
+		int sq_strides_offset = (qp->sq.offset  & (PAGE_SIZE - 1)) /
+					MLX5_SEND_WQE_BB;
+		mlx5_init_fbc_offset(qp->buf.frags +
+				     (qp->sq.offset / PAGE_SIZE),
+				     ilog2(MLX5_SEND_WQE_BB),
+				     ilog2(qp->sq.wqe_cnt),
+				     sq_strides_offset, &qp->sq.fbc);
+
+		qp->sq.cur_edge = get_sq_edge(&qp->sq, 0);
+	}
+
 	*inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
 		 MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * qp->buf.npages;
 	*in = kvzalloc(*inlen, GFP_KERNEL);
@@ -981,8 +1109,9 @@
 		qp->flags |= MLX5_IB_QP_SQPN_QP1;
 	}
 
-	mlx5_fill_page_array(&qp->buf,
-			     (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas));
+	mlx5_fill_page_frag_array(&qp->buf,
+				  (__be64 *)MLX5_ADDR_OF(create_qp_in,
+							 *in, pas));
 
 	err = mlx5_db_alloc(dev->mdev, &qp->db);
 	if (err) {
@@ -1022,7 +1151,7 @@
 	kvfree(*in);
 
 err_buf:
-	mlx5_buf_free(dev->mdev, &qp->buf);
+	mlx5_frag_buf_free(dev->mdev, &qp->buf);
 	return err;
 }
 
@@ -1034,7 +1163,7 @@
 	kvfree(qp->sq.wr_data);
 	kvfree(qp->rq.wrid);
 	mlx5_db_free(dev->mdev, &qp->db);
-	mlx5_buf_free(dev->mdev, &qp->buf);
+	mlx5_frag_buf_free(dev->mdev, &qp->buf);
 }
 
 static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr)
@@ -1051,7 +1180,8 @@
 
 static int is_connected(enum ib_qp_type qp_type)
 {
-	if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC)
+	if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC ||
+	    qp_type == MLX5_IB_QPT_DCI)
 		return 1;
 
 	return 0;
@@ -1059,11 +1189,13 @@
 
 static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev,
 				    struct mlx5_ib_qp *qp,
-				    struct mlx5_ib_sq *sq, u32 tdn)
+				    struct mlx5_ib_sq *sq, u32 tdn,
+				    struct ib_pd *pd)
 {
 	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0};
 	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
 
+	MLX5_SET(create_tis_in, in, uid, to_mpd(pd)->uid);
 	MLX5_SET(tisc, tisc, transport_domain, tdn);
 	if (qp->flags & MLX5_IB_QP_UNDERLAY)
 		MLX5_SET(tisc, tisc, underlay_qpn, qp->underlay_qpn);
@@ -1072,19 +1204,20 @@
 }
 
 static void destroy_raw_packet_qp_tis(struct mlx5_ib_dev *dev,
-				      struct mlx5_ib_sq *sq)
+				      struct mlx5_ib_sq *sq, struct ib_pd *pd)
 {
-	mlx5_core_destroy_tis(dev->mdev, sq->tisn);
+	mlx5_cmd_destroy_tis(dev->mdev, sq->tisn, to_mpd(pd)->uid);
 }
 
-static void destroy_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
-				       struct mlx5_ib_sq *sq)
+static void destroy_flow_rule_vport_sq(struct mlx5_ib_sq *sq)
 {
 	if (sq->flow_rule)
 		mlx5_del_flow_rules(sq->flow_rule);
+	sq->flow_rule = NULL;
 }
 
 static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
+				   struct ib_udata *udata,
 				   struct mlx5_ib_sq *sq, void *qpin,
 				   struct ib_pd *pd)
 {
@@ -1101,9 +1234,9 @@
 	int ncont = 0;
 	u32 offset = 0;
 
-	err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, ubuffer->buf_size,
-			       &sq->ubuffer.umem, &npages, &page_shift,
-			       &ncont, &offset);
+	err = mlx5_ib_umem_get(dev, udata, ubuffer->buf_addr, ubuffer->buf_size,
+			       &sq->ubuffer.umem, &npages, &page_shift, &ncont,
+			       &offset);
 	if (err)
 		return err;
 
@@ -1114,6 +1247,7 @@
 		goto err_umem;
 	}
 
+	MLX5_SET(create_sq_in, in, uid, to_mpd(pd)->uid);
 	sqc = MLX5_ADDR_OF(create_sq_in, in, ctx);
 	MLX5_SET(sqc, sqc, flush_in_error_en, 1);
 	if (MLX5_CAP_ETH(dev->mdev, multi_pkt_send_wqe))
@@ -1147,15 +1281,8 @@
 	if (err)
 		goto err_umem;
 
-	err = create_flow_rule_vport_sq(dev, sq);
-	if (err)
-		goto err_flow;
-
 	return 0;
 
-err_flow:
-	mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp);
-
 err_umem:
 	ib_umem_release(sq->ubuffer.umem);
 	sq->ubuffer.umem = NULL;
@@ -1166,7 +1293,7 @@
 static void destroy_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
 				     struct mlx5_ib_sq *sq)
 {
-	destroy_flow_rule_vport_sq(dev, sq);
+	destroy_flow_rule_vport_sq(sq);
 	mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp);
 	ib_umem_release(sq->ubuffer.umem);
 }
@@ -1188,7 +1315,7 @@
 
 static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev,
 				   struct mlx5_ib_rq *rq, void *qpin,
-				   size_t qpinlen)
+				   size_t qpinlen, struct ib_pd *pd)
 {
 	struct mlx5_ib_qp *mqp = rq->base.container_mibqp;
 	__be64 *pas;
@@ -1209,6 +1336,7 @@
 	if (!in)
 		return -ENOMEM;
 
+	MLX5_SET(create_rq_in, in, uid, to_mpd(pd)->uid);
 	rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
 	if (!(rq->flags & MLX5_IB_RQ_CVLAN_STRIPPING))
 		MLX5_SET(rqc, rqc, vsd, 1);
@@ -1256,10 +1384,24 @@
 		 MLX5_CAP_ETH(dev, tunnel_stateless_geneve_rx));
 }
 
+static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
+				      struct mlx5_ib_rq *rq,
+				      u32 qp_flags_en,
+				      struct ib_pd *pd)
+{
+	if (qp_flags_en & (MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC |
+			   MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC))
+		mlx5_ib_disable_lb(dev, false, true);
+	mlx5_cmd_destroy_tir(dev->mdev, rq->tirn, to_mpd(pd)->uid);
+}
+
 static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
 				    struct mlx5_ib_rq *rq, u32 tdn,
-				    bool tunnel_offload_en)
+				    u32 *qp_flags_en,
+				    struct ib_pd *pd,
+				    u32 *out, int outlen)
 {
+	u8 lb_flag = 0;
 	u32 *in;
 	void *tirc;
 	int inlen;
@@ -1270,52 +1412,73 @@
 	if (!in)
 		return -ENOMEM;
 
+	MLX5_SET(create_tir_in, in, uid, to_mpd(pd)->uid);
 	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT);
 	MLX5_SET(tirc, tirc, inline_rqn, rq->base.mqp.qpn);
 	MLX5_SET(tirc, tirc, transport_domain, tdn);
-	if (tunnel_offload_en)
+	if (*qp_flags_en & MLX5_QP_FLAG_TUNNEL_OFFLOADS)
 		MLX5_SET(tirc, tirc, tunneled_offload_en, 1);
 
-	if (dev->rep)
-		MLX5_SET(tirc, tirc, self_lb_block,
-			 MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST_);
+	if (*qp_flags_en & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC)
+		lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST;
 
-	err = mlx5_core_create_tir(dev->mdev, in, inlen, &rq->tirn);
+	if (*qp_flags_en & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC)
+		lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST;
 
+	if (dev->is_rep) {
+		lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST;
+		*qp_flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC;
+	}
+
+	MLX5_SET(tirc, tirc, self_lb_block, lb_flag);
+
+	err = mlx5_core_create_tir_out(dev->mdev, in, inlen, out, outlen);
+
+	rq->tirn = MLX5_GET(create_tir_out, out, tirn);
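+	/* Blocking self loopback also requires updating the device
+	 * loopback state via mlx5_ib_enable_lb(); destroy the TIR if
+	 * that fails.
+	 */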
+	if (!err && MLX5_GET(tirc, tirc, self_lb_block)) {
+		err = mlx5_ib_enable_lb(dev, false, true);
+
+		if (err)
+			destroy_raw_packet_qp_tir(dev, rq, 0, pd);
+	}
 	kvfree(in);
 
 	return err;
 }
 
-static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
-				      struct mlx5_ib_rq *rq)
-{
-	mlx5_core_destroy_tir(dev->mdev, rq->tirn);
-}
-
 static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 				u32 *in, size_t inlen,
-				struct ib_pd *pd)
+				struct ib_pd *pd,
+				struct ib_udata *udata,
+				struct mlx5_ib_create_qp_resp *resp)
 {
 	struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp;
 	struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
 	struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
-	struct ib_uobject *uobj = pd->uobject;
-	struct ib_ucontext *ucontext = uobj->context;
-	struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext);
+	struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
 	int err;
 	u32 tdn = mucontext->tdn;
+	u16 uid = to_mpd(pd)->uid;
+	u32 out[MLX5_ST_SZ_DW(create_tir_out)] = {};
 
 	if (qp->sq.wqe_cnt) {
-		err = create_raw_packet_qp_tis(dev, qp, sq, tdn);
+		err = create_raw_packet_qp_tis(dev, qp, sq, tdn, pd);
 		if (err)
 			return err;
 
-		err = create_raw_packet_qp_sq(dev, sq, in, pd);
+		err = create_raw_packet_qp_sq(dev, udata, sq, in, pd);
 		if (err)
 			goto err_destroy_tis;
 
+		if (uid) {
+			resp->tisn = sq->tisn;
+			resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TISN;
+			resp->sqn = sq->base.mqp.qpn;
+			resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_SQN;
+		}
+
 		sq->base.container_mibqp = qp;
 		sq->base.mqp.event = mlx5_ib_qp_event;
 	}
@@ -1327,22 +1490,48 @@
 			rq->flags |= MLX5_IB_RQ_CVLAN_STRIPPING;
 		if (qp->flags & MLX5_IB_QP_PCI_WRITE_END_PADDING)
 			rq->flags |= MLX5_IB_RQ_PCI_WRITE_END_PADDING;
-		err = create_raw_packet_qp_rq(dev, rq, in, inlen);
+		err = create_raw_packet_qp_rq(dev, rq, in, inlen, pd);
 		if (err)
 			goto err_destroy_sq;
 
-
-		err = create_raw_packet_qp_tir(dev, rq, tdn,
-					       qp->tunnel_offload_en);
+		err = create_raw_packet_qp_tir(
+			dev, rq, tdn, &qp->flags_en, pd, out,
+			MLX5_ST_SZ_BYTES(create_tir_out));
 		if (err)
 			goto err_destroy_rq;
+
+		if (uid) {
+			resp->rqn = rq->base.mqp.qpn;
+			resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_RQN;
+			resp->tirn = rq->tirn;
+			resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TIRN;
+			if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner)) {
+				resp->tir_icm_addr = MLX5_GET(
+					create_tir_out, out, icm_address_31_0);
+				resp->tir_icm_addr |=
+					(u64)MLX5_GET(create_tir_out, out,
+						      icm_address_39_32)
+					<< 32;
+				resp->tir_icm_addr |=
+					(u64)MLX5_GET(create_tir_out, out,
+						      icm_address_63_40)
+					<< 40;
+				resp->comp_mask |=
+					MLX5_IB_CREATE_QP_RESP_MASK_TIR_ICM_ADDR;
+			}
+		}
 	}
 
 	qp->trans_qp.base.mqp.qpn = qp->sq.wqe_cnt ? sq->base.mqp.qpn :
 						     rq->base.mqp.qpn;
+	err = ib_copy_to_udata(udata, resp, min(udata->outlen, sizeof(*resp)));
+	if (err)
+		goto err_destroy_tir;
 
 	return 0;
 
+err_destroy_tir:
+	destroy_raw_packet_qp_tir(dev, rq, qp->flags_en, pd);
 err_destroy_rq:
 	destroy_raw_packet_qp_rq(dev, rq);
 err_destroy_sq:
@@ -1350,7 +1539,7 @@
 		return err;
 	destroy_raw_packet_qp_sq(dev, sq);
 err_destroy_tis:
-	destroy_raw_packet_qp_tis(dev, sq);
+	destroy_raw_packet_qp_tis(dev, sq, pd);
 
 	return err;
 }
@@ -1363,13 +1552,13 @@
 	struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
 
 	if (qp->rq.wqe_cnt) {
-		destroy_raw_packet_qp_tir(dev, rq);
+		destroy_raw_packet_qp_tir(dev, rq, qp->flags_en, qp->ibqp.pd);
 		destroy_raw_packet_qp_rq(dev, rq);
 	}
 
 	if (qp->sq.wqe_cnt) {
 		destroy_raw_packet_qp_sq(dev, sq);
-		destroy_raw_packet_qp_tis(dev, sq);
+		destroy_raw_packet_qp_tis(dev, sq, qp->ibqp.pd);
 	}
 }
 
@@ -1387,7 +1576,11 @@
 
 static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
 {
-	mlx5_core_destroy_tir(dev->mdev, qp->rss_qp.tirn);
+	if (qp->flags_en & (MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC |
+			    MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC))
+		mlx5_ib_disable_lb(dev, false, true);
+	mlx5_cmd_destroy_tir(dev->mdev, qp->rss_qp.tirn,
+			     to_mpd(qp->ibqp.pd)->uid);
 }
 
 static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
@@ -1395,13 +1588,14 @@
 				 struct ib_qp_init_attr *init_attr,
 				 struct ib_udata *udata)
 {
-	struct ib_uobject *uobj = pd->uobject;
-	struct ib_ucontext *ucontext = uobj->context;
-	struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext);
+	struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
 	struct mlx5_ib_create_qp_resp resp = {};
 	int inlen;
+	int outlen;
 	int err;
 	u32 *in;
+	u32 *out;
 	void *tirc;
 	void *hfso;
 	u32 selected_fields = 0;
@@ -1410,6 +1604,7 @@
 	u32 tdn = mucontext->tdn;
 	struct mlx5_ib_create_qp_rss ucmd = {};
 	size_t required_cmd_sz;
+	u8 lb_flag = 0;
 
 	if (init_attr->qp_type != IB_QPT_RAW_PACKET)
 		return -EOPNOTSUPP;
@@ -1444,7 +1639,9 @@
 		return -EOPNOTSUPP;
 	}
 
-	if (ucmd.flags & ~MLX5_QP_FLAG_TUNNEL_OFFLOADS) {
+	if (ucmd.flags & ~(MLX5_QP_FLAG_TUNNEL_OFFLOADS |
+			   MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC |
+			   MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC)) {
 		mlx5_ib_dbg(dev, "invalid flags\n");
 		return -EOPNOTSUPP;
 	}
@@ -1461,6 +1658,16 @@
 		return -EOPNOTSUPP;
 	}
 
+	if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC || dev->is_rep) {
+		lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST;
+		qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC;
+	}
+
+	if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) {
+		lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST;
+		qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC;
+	}
+
 	err = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp)));
 	if (err) {
 		mlx5_ib_dbg(dev, "copy failed\n");
@@ -1468,10 +1675,13 @@
 	}
 
 	inlen = MLX5_ST_SZ_BYTES(create_tir_in);
-	in = kvzalloc(inlen, GFP_KERNEL);
+	outlen = MLX5_ST_SZ_BYTES(create_tir_out);
+	in = kvzalloc(inlen + outlen, GFP_KERNEL);
 	if (!in)
 		return -ENOMEM;
 
+	out = in + MLX5_ST_SZ_DW(create_tir_in);
+	MLX5_SET(create_tir_in, in, uid, to_mpd(pd)->uid);
 	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
 	MLX5_SET(tirc, tirc, disp_type,
 		 MLX5_TIRC_DISP_TYPE_INDIRECT);
@@ -1484,6 +1694,8 @@
 	if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS)
 		MLX5_SET(tirc, tirc, tunneled_offload_en, 1);
 
+	MLX5_SET(tirc, tirc, self_lb_block, lb_flag);
+
 	if (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_INNER)
 		hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_inner);
 	else
@@ -1501,7 +1713,6 @@
 		}
 
 		MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
-		MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
 		memcpy(rss_key, ucmd.rx_hash_key, len);
 		break;
 	}
@@ -1580,26 +1791,157 @@
 	MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields);
 
 create_tir:
-	if (dev->rep)
-		MLX5_SET(tirc, tirc, self_lb_block,
-			 MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST_);
+	err = mlx5_core_create_tir_out(dev->mdev, in, inlen, out, outlen);
 
-	err = mlx5_core_create_tir(dev->mdev, in, inlen, &qp->rss_qp.tirn);
+	qp->rss_qp.tirn = MLX5_GET(create_tir_out, out, tirn);
+	if (!err && MLX5_GET(tirc, tirc, self_lb_block)) {
+		err = mlx5_ib_enable_lb(dev, false, true);
+
+		if (err)
+			mlx5_cmd_destroy_tir(dev->mdev, qp->rss_qp.tirn,
+					     to_mpd(pd)->uid);
+	}
 
 	if (err)
 		goto err;
 
+	if (mucontext->devx_uid) {
+		resp.comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TIRN;
+		resp.tirn = qp->rss_qp.tirn;
+		if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner)) {
+			resp.tir_icm_addr =
+				MLX5_GET(create_tir_out, out, icm_address_31_0);
+			resp.tir_icm_addr |= (u64)MLX5_GET(create_tir_out, out,
+							   icm_address_39_32)
+					     << 32;
+			resp.tir_icm_addr |= (u64)MLX5_GET(create_tir_out, out,
+							   icm_address_63_40)
+					     << 40;
+			resp.comp_mask |=
+				MLX5_IB_CREATE_QP_RESP_MASK_TIR_ICM_ADDR;
+		}
+	}
+
+	err = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp)));
+	if (err)
+		goto err_copy;
+
 	kvfree(in);
 	/* qpn is reserved for that QP */
 	qp->trans_qp.base.mqp.qpn = 0;
 	qp->flags |= MLX5_IB_QP_RSS;
 	return 0;
 
+err_copy:
+	mlx5_cmd_destroy_tir(dev->mdev, qp->rss_qp.tirn, mucontext->devx_uid);
 err:
 	kvfree(in);
 	return err;
 }
 
+static void configure_responder_scat_cqe(struct ib_qp_init_attr *init_attr,
+					 void *qpc)
+{
+	int rcqe_sz;
+
+	if (init_attr->qp_type == MLX5_IB_QPT_DCI)
+		return;
+
+	rcqe_sz = mlx5_ib_get_cqe_size(init_attr->recv_cq);
+
+	if (init_attr->qp_type == MLX5_IB_QPT_DCT) {
+		if (rcqe_sz == 128)
+			MLX5_SET(dctc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE);
+
+		return;
+	}
+
+	MLX5_SET(qpc, qpc, cs_res,
+		 rcqe_sz == 128 ? MLX5_RES_SCAT_DATA64_CQE :
+				  MLX5_RES_SCAT_DATA32_CQE);
+}
+
+static void configure_requester_scat_cqe(struct mlx5_ib_dev *dev,
+					 struct ib_qp_init_attr *init_attr,
+					 struct mlx5_ib_create_qp *ucmd,
+					 void *qpc)
+{
+	enum ib_qp_type qpt = init_attr->qp_type;
+	int scqe_sz;
+	bool allow_scat_cqe = false;
+
+	if (qpt == IB_QPT_UC || qpt == IB_QPT_UD)
+		return;
+
+	if (ucmd)
+		allow_scat_cqe = ucmd->flags & MLX5_QP_FLAG_ALLOW_SCATTER_CQE;
+
+	if (!allow_scat_cqe && init_attr->sq_sig_type != IB_SIGNAL_ALL_WR)
+		return;
+
+	scqe_sz = mlx5_ib_get_cqe_size(init_attr->send_cq);
+	if (scqe_sz == 128) {
+		MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA64_CQE);
+		return;
+	}
+
+	if (init_attr->qp_type != MLX5_IB_QPT_DCI ||
+	    MLX5_CAP_GEN(dev->mdev, dc_req_scat_data_cqe))
+		MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA32_CQE);
+}
+
+static int atomic_size_to_mode(int size_mask)
+{
+	/* driver does not support atomic_size > 256B
+	 * and does not know how to translate bigger sizes
+	 */
+	int supported_size_mask = size_mask & 0x1ff;
+	int log_max_size;
+
+	if (!supported_size_mask)
+		return -EOPNOTSUPP;
+
+	log_max_size = __fls(supported_size_mask);
+
+	if (log_max_size > 3)
+		return log_max_size;
+
+	return MLX5_ATOMIC_MODE_8B;
+}
+
+static int get_atomic_mode(struct mlx5_ib_dev *dev,
+			   enum ib_qp_type qp_type)
+{
+	u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
+	u8 atomic = MLX5_CAP_GEN(dev->mdev, atomic);
+	int atomic_mode = -EOPNOTSUPP;
+	int atomic_size_mask;
+
+	if (!atomic)
+		return -EOPNOTSUPP;
+
+	if (qp_type == MLX5_IB_QPT_DCT)
+		atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc);
+	else
+		atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
+
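+	/* Prefer an extended atomic mode sized from the capability mask;
+	 * fall back to the IB-spec 8-byte compatibility mode when only the
+	 * basic compare-swap and fetch-add operations are supported.
+	 */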
+	if ((atomic_operations & MLX5_ATOMIC_OPS_EXTENDED_CMP_SWAP) ||
+	    (atomic_operations & MLX5_ATOMIC_OPS_EXTENDED_FETCH_ADD))
+		atomic_mode = atomic_size_to_mode(atomic_size_mask);
+
+	if (atomic_mode <= 0 &&
+	    (atomic_operations & MLX5_ATOMIC_OPS_CMP_SWAP &&
+	     atomic_operations & MLX5_ATOMIC_OPS_FETCH_ADD))
+		atomic_mode = MLX5_ATOMIC_MODE_IB_COMP;
+
+	return atomic_mode;
+}
+
+static inline bool check_flags_mask(uint64_t input, uint64_t supported)
+{
+	return (input & ~supported) == 0;
+}
+
 static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 			    struct ib_qp_init_attr *init_attr,
 			    struct ib_udata *udata, struct mlx5_ib_qp *qp)
@@ -1608,6 +1950,8 @@
 	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
 	struct mlx5_core_dev *mdev = dev->mdev;
 	struct mlx5_ib_create_qp_resp resp = {};
+	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
 	struct mlx5_ib_cq *send_cq;
 	struct mlx5_ib_cq *recv_cq;
 	unsigned long flags;
@@ -1691,26 +2035,64 @@
 		qp->flags |= MLX5_IB_QP_CVLAN_STRIPPING;
 	}
 
-	if (pd && pd->uobject) {
+	if (udata) {
 		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
 			mlx5_ib_dbg(dev, "copy failed\n");
 			return -EFAULT;
 		}
 
-		err = get_qp_user_index(to_mucontext(pd->uobject->context),
-					&ucmd, udata->inlen, &uidx);
+		if (!check_flags_mask(ucmd.flags,
+				      MLX5_QP_FLAG_ALLOW_SCATTER_CQE |
+				      MLX5_QP_FLAG_BFREG_INDEX |
+				      MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE |
+				      MLX5_QP_FLAG_SCATTER_CQE |
+				      MLX5_QP_FLAG_SIGNATURE |
+				      MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC |
+				      MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC |
+				      MLX5_QP_FLAG_TUNNEL_OFFLOADS |
+				      MLX5_QP_FLAG_TYPE_DCI |
+				      MLX5_QP_FLAG_TYPE_DCT))
+			return -EINVAL;
+
+		err = get_qp_user_index(ucontext, &ucmd, udata->inlen, &uidx);
 		if (err)
 			return err;
 
 		qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE);
-		qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE);
+		if (MLX5_CAP_GEN(dev->mdev, sctr_data_cqe))
+			qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE);
 		if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS) {
 			if (init_attr->qp_type != IB_QPT_RAW_PACKET ||
 			    !tunnel_offload_supported(mdev)) {
 				mlx5_ib_dbg(dev, "Tunnel offload isn't supported\n");
 				return -EOPNOTSUPP;
 			}
-			qp->tunnel_offload_en = true;
+			qp->flags_en |= MLX5_QP_FLAG_TUNNEL_OFFLOADS;
+		}
+
+		if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC) {
+			if (init_attr->qp_type != IB_QPT_RAW_PACKET) {
+				mlx5_ib_dbg(dev, "Self-LB UC isn't supported\n");
+				return -EOPNOTSUPP;
+			}
+			qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC;
+		}
+
+		if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) {
+			if (init_attr->qp_type != IB_QPT_RAW_PACKET) {
+				mlx5_ib_dbg(dev, "Self-LB UM isn't supported\n");
+				return -EOPNOTSUPP;
+			}
+			qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC;
+		}
+
+		if (ucmd.flags & MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE) {
+			if (init_attr->qp_type != IB_QPT_RC ||
+				!MLX5_CAP_GEN(dev->mdev, qp_packet_based)) {
+				mlx5_ib_dbg(dev, "packet based credit mode isn't supported\n");
+				return -EOPNOTSUPP;
+			}
+			qp->flags |= MLX5_IB_QP_PACKET_BASED_CREDIT;
 		}
 
 		if (init_attr->create_flags & IB_QP_CREATE_SOURCE_QPN) {
@@ -1736,14 +2118,14 @@
 
 	qp->has_rq = qp_has_rq(init_attr);
 	err = set_rq_size(dev, &init_attr->cap, qp->has_rq,
-			  qp, (pd && pd->uobject) ? &ucmd : NULL);
+			  qp, udata ? &ucmd : NULL);
 	if (err) {
 		mlx5_ib_dbg(dev, "err %d\n", err);
 		return err;
 	}
 
 	if (pd) {
-		if (pd->uobject) {
+		if (udata) {
 			__u32 max_wqes =
 				1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
 			mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count);
@@ -1809,25 +2191,13 @@
 		MLX5_SET(qpc, qpc, cd_slave_send, 1);
 	if (qp->flags & MLX5_IB_QP_MANAGED_RECV)
 		MLX5_SET(qpc, qpc, cd_slave_receive, 1);
-
+	if (qp->flags & MLX5_IB_QP_PACKET_BASED_CREDIT)
+		MLX5_SET(qpc, qpc, req_e2e_credit_mode, 1);
 	if (qp->scat_cqe && is_connected(init_attr->qp_type)) {
-		int rcqe_sz;
-		int scqe_sz;
-
-		rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq);
-		scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq);
-
-		if (rcqe_sz == 128)
-			MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE);
-		else
-			MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA32_CQE);
-
-		if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) {
-			if (scqe_sz == 128)
-				MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA64_CQE);
-			else
-				MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA32_CQE);
-		}
+		configure_responder_scat_cqe(init_attr, qpc);
+		configure_requester_scat_cqe(dev, init_attr,
+					     udata ? &ucmd : NULL,
+					     qpc);
 	}
 
 	if (qp->rq.wqe_cnt) {
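The configure_responder_scat_cqe()/configure_requester_scat_cqe() helpers called above replace the inline CQE-scatter selection removed in this hunk. Their bodies are outside this hunk; judging from the removed code, the responder side boils down to the sketch below (helper name and exact shape are illustrative), while the requester side additionally depends on sq_sig_type and, with this patch, on the MLX5_QP_FLAG_*SCATTER_CQE bits in the user command:

static void set_resp_scatter_cqe(struct mlx5_ib_dev *dev,
				 struct ib_qp_init_attr *init_attr, void *qpc)
{
	int rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq);

	/* 128-byte CQEs can scatter 64 bytes of received data; smaller
	 * CQEs are limited to 32 bytes.
	 */
	MLX5_SET(qpc, qpc, cs_res, rcqe_sz == 128 ?
		 MLX5_RES_SCAT_DATA64_CQE : MLX5_RES_SCAT_DATA32_CQE);
}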
@@ -1911,7 +2281,8 @@
 	    qp->flags & MLX5_IB_QP_UNDERLAY) {
 		qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr;
 		raw_packet_qp_copy_info(qp, &qp->raw_packet_qp);
-		err = create_raw_packet_qp(dev, qp, in, inlen, pd);
+		err = create_raw_packet_qp(dev, qp, in, inlen, pd, udata,
+					   &resp);
 	} else {
 		err = mlx5_core_create_qp(dev->mdev, &base->mqp, in, inlen);
 	}
@@ -1947,7 +2318,7 @@
 
 err_create:
 	if (qp->create_type == MLX5_QP_USER)
-		destroy_qp_user(dev, pd, qp, base);
+		destroy_qp_user(dev, pd, qp, base, udata);
 	else if (qp->create_type == MLX5_QP_KERNEL)
 		destroy_qp_kernel(dev, qp);
 
@@ -2058,7 +2429,8 @@
 				const struct mlx5_modify_raw_qp_param *raw_qp_param,
 				u8 lag_tx_affinity);
 
-static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
+static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+			      struct ib_udata *udata)
 {
 	struct mlx5_ib_cq *send_cq, *recv_cq;
 	struct mlx5_ib_qp_base *base;
@@ -2129,7 +2501,7 @@
 	if (qp->create_type == MLX5_QP_KERNEL)
 		destroy_qp_kernel(dev, qp);
 	else if (qp->create_type == MLX5_QP_USER)
-		destroy_qp_user(dev, &get_pd(qp)->ibpd, qp, base);
+		destroy_qp_user(dev, &get_pd(qp)->ibpd, qp, base, udata);
 }
 
 static const char *ib_qp_type_str(enum ib_qp_type type)
@@ -2167,8 +2539,11 @@
 
 static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd,
 					struct ib_qp_init_attr *attr,
-					struct mlx5_ib_create_qp *ucmd)
+					struct mlx5_ib_create_qp *ucmd,
+					struct ib_udata *udata)
 {
+	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
 	struct mlx5_ib_qp *qp;
 	int err = 0;
 	u32 uidx = MLX5_IB_DEFAULT_UIDX;
@@ -2177,8 +2552,7 @@
 	if (!attr->srq || !attr->recv_cq)
 		return ERR_PTR(-EINVAL);
 
-	err = get_qp_user_index(to_mucontext(pd->uobject->context),
-				ucmd, sizeof(*ucmd), &uidx);
+	err = get_qp_user_index(ucontext, ucmd, sizeof(*ucmd), &uidx);
 	if (err)
 		return ERR_PTR(err);
 
@@ -2192,6 +2566,7 @@
 		goto err_free;
 	}
 
+	MLX5_SET(create_dct_in, qp->dct.in, uid, to_mpd(pd)->uid);
 	dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry);
 	qp->qp_sub_type = MLX5_IB_QPT_DCT;
 	MLX5_SET(dctc, dctc, pd, to_mpd(pd)->pdn);
@@ -2200,6 +2575,9 @@
 	MLX5_SET64(dctc, dctc, dc_access_key, ucmd->access_key);
 	MLX5_SET(dctc, dctc, user_index, uidx);
 
+	if (ucmd->flags & MLX5_QP_FLAG_SCATTER_CQE)
+		configure_responder_scat_cqe(attr, dctc);
+
 	qp->state = IB_QPS_RESET;
 
 	return &qp->ibqp;
@@ -2256,15 +2634,17 @@
 	int err;
 	struct ib_qp_init_attr mlx_init_attr;
 	struct ib_qp_init_attr *init_attr = verbs_init_attr;
+	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
 
 	if (pd) {
 		dev = to_mdev(pd->device);
 
 		if (init_attr->qp_type == IB_QPT_RAW_PACKET) {
-			if (!pd->uobject) {
+			if (!ucontext) {
 				mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n");
 				return ERR_PTR(-EINVAL);
-			} else if (!to_mucontext(pd->uobject->context)->cqe_version) {
+			} else if (!ucontext->cqe_version) {
 				mlx5_ib_dbg(dev, "Raw Packet QP is only supported for CQE version > 0\n");
 				return ERR_PTR(-EINVAL);
 			}
@@ -2296,7 +2676,7 @@
 				return ERR_PTR(-EINVAL);
 			}
 		} else {
-			return mlx5_ib_create_dct(pd, init_attr, &ucmd);
+			return mlx5_ib_create_dct(pd, init_attr, &ucmd, udata);
 		}
 	}
 
@@ -2387,7 +2767,7 @@
 	return 0;
 }
 
-int mlx5_ib_destroy_qp(struct ib_qp *qp)
+int mlx5_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(qp->device);
 	struct mlx5_ib_qp *mqp = to_mqp(qp);
@@ -2398,19 +2778,21 @@
 	if (mqp->qp_sub_type == MLX5_IB_QPT_DCT)
 		return mlx5_ib_destroy_dct(mqp);
 
-	destroy_qp_common(dev, mqp);
+	destroy_qp_common(dev, mqp, udata);
 
 	kfree(mqp);
 
 	return 0;
 }
 
-static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr,
-				   int attr_mask)
+static int to_mlx5_access_flags(struct mlx5_ib_qp *qp,
+				const struct ib_qp_attr *attr,
+				int attr_mask, __be32 *hw_access_flags_be)
 {
-	u32 hw_access_flags = 0;
 	u8 dest_rd_atomic;
-	u32 access_flags;
+	u32 access_flags, hw_access_flags = 0;
+
+	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device);
 
 	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
 		dest_rd_atomic = attr->max_dest_rd_atomic;
@@ -2427,12 +2809,23 @@
 
 	if (access_flags & IB_ACCESS_REMOTE_READ)
 		hw_access_flags |= MLX5_QP_BIT_RRE;
-	if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
-		hw_access_flags |= (MLX5_QP_BIT_RAE | MLX5_ATOMIC_MODE_CX);
+	if (access_flags & IB_ACCESS_REMOTE_ATOMIC) {
+		int atomic_mode;
+
+		atomic_mode = get_atomic_mode(dev, qp->ibqp.qp_type);
+		if (atomic_mode < 0)
+			return -EOPNOTSUPP;
+
+		hw_access_flags |= MLX5_QP_BIT_RAE;
+		hw_access_flags |= atomic_mode << MLX5_ATOMIC_MODE_OFFSET;
+	}
+
 	if (access_flags & IB_ACCESS_REMOTE_WRITE)
 		hw_access_flags |= MLX5_QP_BIT_RWE;
 
-	return cpu_to_be32(hw_access_flags);
+	*hw_access_flags_be = cpu_to_be32(hw_access_flags);
+
+	return 0;
 }
 
 enum {
@@ -2446,7 +2839,7 @@
 	if (rate == IB_RATE_PORT_CURRENT)
 		return 0;
 
-	if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS)
+	if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_600_GBPS)
 		return -EINVAL;
 
 	while (rate != IB_RATE_PORT_CURRENT &&
@@ -2458,7 +2851,8 @@
 }
 
 static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev,
-				      struct mlx5_ib_sq *sq, u8 sl)
+				      struct mlx5_ib_sq *sq, u8 sl,
+				      struct ib_pd *pd)
 {
 	void *in;
 	void *tisc;
@@ -2471,6 +2865,7 @@
 		return -ENOMEM;
 
 	MLX5_SET(modify_tis_in, in, bitmask.prio, 1);
+	MLX5_SET(modify_tis_in, in, uid, to_mpd(pd)->uid);
 
 	tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx);
 	MLX5_SET(tisc, tisc, prio, ((sl & 0x7) << 1));
@@ -2483,7 +2878,8 @@
 }
 
 static int modify_raw_packet_tx_affinity(struct mlx5_core_dev *dev,
-					 struct mlx5_ib_sq *sq, u8 tx_affinity)
+					 struct mlx5_ib_sq *sq, u8 tx_affinity,
+					 struct ib_pd *pd)
 {
 	void *in;
 	void *tisc;
@@ -2496,6 +2892,7 @@
 		return -ENOMEM;
 
 	MLX5_SET(modify_tis_in, in, bitmask.lag_tx_port_affinity, 1);
+	MLX5_SET(modify_tis_in, in, uid, to_mpd(pd)->uid);
 
 	tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx);
 	MLX5_SET(tisc, tisc, lag_tx_port_affinity, tx_affinity);
@@ -2580,7 +2977,7 @@
 	if ((qp->ibqp.qp_type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt)
 		return modify_raw_packet_eth_prio(dev->mdev,
 						  &qp->raw_packet_qp.sq,
-						  sl & 0xf);
+						  sl & 0xf, qp->ibqp.pd);
 
 	return 0;
 }
@@ -2599,6 +2996,11 @@
 			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX	|
 					  MLX5_QP_OPTPAR_Q_KEY		|
 					  MLX5_QP_OPTPAR_PRI_PORT,
+			[MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_RRE		|
+					  MLX5_QP_OPTPAR_RAE		|
+					  MLX5_QP_OPTPAR_RWE		|
+					  MLX5_QP_OPTPAR_PKEY_INDEX	|
+					  MLX5_QP_OPTPAR_PRI_PORT,
 		},
 		[MLX5_QP_STATE_RTR] = {
 			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH  |
@@ -2632,6 +3034,12 @@
 					  MLX5_QP_OPTPAR_RWE		|
 					  MLX5_QP_OPTPAR_PM_STATE,
 			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY,
+			[MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH	|
+					  MLX5_QP_OPTPAR_RRE		|
+					  MLX5_QP_OPTPAR_RAE		|
+					  MLX5_QP_OPTPAR_RWE		|
+					  MLX5_QP_OPTPAR_PM_STATE	|
+					  MLX5_QP_OPTPAR_RNR_TIMEOUT,
 		},
 	},
 	[MLX5_QP_STATE_RTS] = {
@@ -2648,6 +3056,12 @@
 			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY		|
 					  MLX5_QP_OPTPAR_SRQN		|
 					  MLX5_QP_OPTPAR_CQN_RCV,
+			[MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_RRE		|
+					  MLX5_QP_OPTPAR_RAE		|
+					  MLX5_QP_OPTPAR_RWE		|
+					  MLX5_QP_OPTPAR_RNR_TIMEOUT	|
+					  MLX5_QP_OPTPAR_PM_STATE	|
+					  MLX5_QP_OPTPAR_ALT_ADDR_PATH,
 		},
 	},
 	[MLX5_QP_STATE_SQER] = {
@@ -2659,6 +3073,10 @@
 					   MLX5_QP_OPTPAR_RWE		|
 					   MLX5_QP_OPTPAR_RAE		|
 					   MLX5_QP_OPTPAR_RRE,
+			[MLX5_QP_ST_XRC]  = MLX5_QP_OPTPAR_RNR_TIMEOUT	|
+					   MLX5_QP_OPTPAR_RWE		|
+					   MLX5_QP_OPTPAR_RAE		|
+					   MLX5_QP_OPTPAR_RRE,
 		},
 	},
 };
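The new [MLX5_QP_ST_XRC] rows above give XRC QPs their own optional-parameter masks. For context, these tables are consulted later in __mlx5_ib_modify_qp() roughly as follows (the lookup itself is outside this hunk):

	/* index by current state, new state and service type to build the
	 * optional-parameter mask for the MODIFY_QP command
	 */
	optpar = opt_mask[mlx5_cur][mlx5_new][mlx5_st];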
@@ -2728,9 +3146,9 @@
 	return result;
 }
 
-static int modify_raw_packet_qp_rq(struct mlx5_ib_dev *dev,
-				   struct mlx5_ib_rq *rq, int new_state,
-				   const struct mlx5_modify_raw_qp_param *raw_qp_param)
+static int modify_raw_packet_qp_rq(
+	struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, int new_state,
+	const struct mlx5_modify_raw_qp_param *raw_qp_param, struct ib_pd *pd)
 {
 	void *in;
 	void *rqc;
@@ -2743,6 +3161,7 @@
 		return -ENOMEM;
 
 	MLX5_SET(modify_rq_in, in, rq_state, rq->state);
+	MLX5_SET(modify_rq_in, in, uid, to_mpd(pd)->uid);
 
 	rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx);
 	MLX5_SET(rqc, rqc, state, new_state);
@@ -2753,8 +3172,9 @@
 				   MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID);
 			MLX5_SET(rqc, rqc, counter_set_id, raw_qp_param->rq_q_ctr_id);
 		} else
-			pr_info_once("%s: RAW PACKET QP counters are not supported on current FW\n",
-				     dev->ib_dev.name);
+			dev_info_once(
+				&dev->ib_dev.dev,
+				"RAW PACKET QP counters are not supported on current FW\n");
 	}
 
 	err = mlx5_core_modify_rq(dev->mdev, rq->base.mqp.qpn, in, inlen);
@@ -2768,10 +3188,9 @@
 	return err;
 }
 
-static int modify_raw_packet_qp_sq(struct mlx5_core_dev *dev,
-				   struct mlx5_ib_sq *sq,
-				   int new_state,
-				   const struct mlx5_modify_raw_qp_param *raw_qp_param)
+static int modify_raw_packet_qp_sq(
+	struct mlx5_core_dev *dev, struct mlx5_ib_sq *sq, int new_state,
+	const struct mlx5_modify_raw_qp_param *raw_qp_param, struct ib_pd *pd)
 {
 	struct mlx5_ib_qp *ibqp = sq->base.container_mibqp;
 	struct mlx5_rate_limit old_rl = ibqp->rl;
@@ -2788,6 +3207,7 @@
 	if (!in)
 		return -ENOMEM;
 
+	MLX5_SET(modify_sq_in, in, uid, to_mpd(pd)->uid);
 	MLX5_SET(modify_sq_in, in, sq_state, sq->state);
 
 	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
@@ -2829,10 +3249,12 @@
 	}
 
 	/* Only remove the old rate after new rate was set */
-	if ((old_rl.rate &&
-	     !mlx5_rl_are_equal(&old_rl, &new_rl)) ||
-	    (new_state != MLX5_SQC_STATE_RDY))
+	if ((old_rl.rate && !mlx5_rl_are_equal(&old_rl, &new_rl)) ||
+	    (new_state != MLX5_SQC_STATE_RDY)) {
 		mlx5_rl_remove_rate(dev, &old_rl);
+		if (new_state != MLX5_SQC_STATE_RDY)
+			memset(&new_rl, 0, sizeof(new_rl));
+	}
 
 	ibqp->rl = new_rl;
 	sq->state = new_state;
@@ -2890,29 +3312,108 @@
 	}
 
 	if (modify_rq) {
-		err =  modify_raw_packet_qp_rq(dev, rq, rq_state, raw_qp_param);
+		err =  modify_raw_packet_qp_rq(dev, rq, rq_state, raw_qp_param,
+					       qp->ibqp.pd);
 		if (err)
 			return err;
 	}
 
 	if (modify_sq) {
+		struct mlx5_flow_handle *flow_rule;
+
 		if (tx_affinity) {
 			err = modify_raw_packet_tx_affinity(dev->mdev, sq,
-							    tx_affinity);
+							    tx_affinity,
+							    qp->ibqp.pd);
 			if (err)
 				return err;
 		}
 
-		return modify_raw_packet_qp_sq(dev->mdev, sq, sq_state, raw_qp_param);
+		flow_rule = create_flow_rule_vport_sq(dev, sq,
+						      raw_qp_param->port);
+		if (IS_ERR(flow_rule))
+			return PTR_ERR(flow_rule);
+
+		err = modify_raw_packet_qp_sq(dev->mdev, sq, sq_state,
+					      raw_qp_param, qp->ibqp.pd);
+		if (err) {
+			if (flow_rule)
+				mlx5_del_flow_rules(flow_rule);
+			return err;
+		}
+
+		if (flow_rule) {
+			destroy_flow_rule_vport_sq(sq);
+			sq->flow_rule = flow_rule;
+		}
+
+		return err;
 	}
 
 	return 0;
 }
 
+static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev,
+				    struct mlx5_ib_pd *pd,
+				    struct mlx5_ib_qp_base *qp_base,
+				    u8 port_num, struct ib_udata *udata)
+{
+	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
+	unsigned int tx_port_affinity;
+
+	if (ucontext) {
+		tx_port_affinity = (unsigned int)atomic_add_return(
+					   1, &ucontext->tx_port_affinity) %
+					   MLX5_MAX_PORTS +
+				   1;
+		mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x ucontext %p\n",
+				tx_port_affinity, qp_base->mqp.qpn, ucontext);
+	} else {
+		tx_port_affinity =
+			(unsigned int)atomic_add_return(
+				1, &dev->port[port_num].roce.tx_port_affinity) %
+				MLX5_MAX_PORTS +
+			1;
+		mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x\n",
+				tx_port_affinity, qp_base->mqp.qpn);
+	}
+
+	return tx_port_affinity;
+}
+
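get_tx_affinity() spreads QPs across the LAG ports with a per-ucontext counter for user QPs and a per-port counter for kernel QPs. With MLX5_MAX_PORTS (2 on these devices) the expression maps successive counter values 1, 2, 3, ... to ports 2, 1, 2, ...; a standalone illustration of the arithmetic (not driver code):

#include <stdatomic.h>

#define NUM_LAG_PORTS 2			/* MLX5_MAX_PORTS in the driver */

static unsigned int next_tx_affinity(atomic_uint *counter)
{
	/* atomic_fetch_add() returns the old value, so add 1 to mirror the
	 * kernel's atomic_add_return(): 1 -> 2, 2 -> 1, 3 -> 2, ...
	 */
	unsigned int v = atomic_fetch_add(counter, 1) + 1;

	return v % NUM_LAG_PORTS + 1;
}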
+static int __mlx5_ib_qp_set_counter(struct ib_qp *qp,
+				    struct rdma_counter *counter)
+{
+	struct mlx5_ib_dev *dev = to_mdev(qp->device);
+	struct mlx5_ib_qp *mqp = to_mqp(qp);
+	struct mlx5_qp_context context = {};
+	struct mlx5_ib_qp_base *base;
+	u32 set_id;
+
+	if (!MLX5_CAP_GEN(dev->mdev, rts2rts_qp_counters_set_id))
+		return 0;
+
+	if (counter)
+		set_id = counter->id;
+	else
+		set_id = mlx5_ib_get_counters_id(dev, mqp->port - 1);
+
+	base = &mqp->trans_qp.base;
+	context.qp_counter_set_usr_page &= cpu_to_be32(0xffffff);
+	context.qp_counter_set_usr_page |= cpu_to_be32(set_id << 24);
+	return mlx5_core_qp_modify(dev->mdev,
+				   MLX5_CMD_OP_RTS2RTS_QP,
+				   MLX5_QP_OPTPAR_COUNTER_SET_ID,
+				   &context, &base->mqp);
+}
+
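__mlx5_ib_qp_set_counter() reuses the RTS2RTS transition to rebind a running QP to a statistics counter: the counter set id lives in the most-significant byte of qp_counter_set_usr_page, so the helper masks that byte out and ORs the new id in. A small worked example of the packing (values invented):

	__be32 field = cpu_to_be32(0x00123456);	/* existing usr_page bits   */
	u32 set_id = 0x2a;			/* hypothetical counter id  */

	field &= cpu_to_be32(0xffffff);		/* keep the low 24 bits     */
	field |= cpu_to_be32(set_id << 24);	/* field is now 0x2a123456  */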
 static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 			       const struct ib_qp_attr *attr, int attr_mask,
-			       enum ib_qp_state cur_state, enum ib_qp_state new_state,
-			       const struct mlx5_ib_modify_qp *ucmd)
+			       enum ib_qp_state cur_state,
+			       enum ib_qp_state new_state,
+			       const struct mlx5_ib_modify_qp *ucmd,
+			       struct ib_udata *udata)
 {
 	static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = {
 		[MLX5_QP_STATE_RST] = {
@@ -2957,9 +3458,9 @@
 	struct mlx5_ib_cq *send_cq, *recv_cq;
 	struct mlx5_qp_context *context;
 	struct mlx5_ib_pd *pd;
-	struct mlx5_ib_port *mibport = NULL;
 	enum mlx5_qp_state mlx5_cur, mlx5_new;
 	enum mlx5_qp_optpar optpar;
+	u32 set_id = 0;
 	int mlx5_st;
 	int err;
 	u16 op;
@@ -2974,6 +3475,7 @@
 	if (!context)
 		return -ENOMEM;
 
+	pd = get_pd(qp);
 	context->flags = cpu_to_be32(mlx5_st << 16);
 
 	if (!(attr_mask & IB_QP_PATH_MIG_STATE)) {
@@ -3000,11 +3502,10 @@
 		    (ibqp->qp_type == IB_QPT_RAW_PACKET) ||
 		    (ibqp->qp_type == IB_QPT_XRC_INI) ||
 		    (ibqp->qp_type == IB_QPT_XRC_TGT)) {
-			if (mlx5_lag_is_active(dev->mdev)) {
-				u8 p = mlx5_core_native_port_num(dev->mdev);
-				tx_affinity = (unsigned int)atomic_add_return(1,
-						&dev->roce[p].next_port) %
-						MLX5_MAX_PORTS + 1;
+			if (dev->lag_active) {
+				u8 p = mlx5_core_native_port_num(dev->mdev) - 1;
+				tx_affinity = get_tx_affinity(dev, pd, base, p,
+							      udata);
 				context->flags |= cpu_to_be32(tx_affinity << 24);
 			}
 		}
@@ -3062,7 +3563,6 @@
 			goto out;
 	}
 
-	pd = get_pd(qp);
 	get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq,
 		&send_cq, &recv_cq);
 
@@ -3092,8 +3592,15 @@
 				cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
 	}
 
-	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC))
-		context->params2 |= to_mlx5_access_flags(qp, attr, attr_mask);
+	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
+		__be32 access_flags;
+
+		err = to_mlx5_access_flags(qp, attr, attr_mask, &access_flags);
+		if (err)
+			goto out;
+
+		context->params2 |= access_flags;
+	}
 
 	if (attr_mask & IB_QP_MIN_RNR_TIMER)
 		context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
@@ -3115,9 +3622,12 @@
 		if (qp->flags & MLX5_IB_QP_UNDERLAY)
 			port_num = 0;
 
-		mibport = &dev->port[port_num];
+		if (ibqp->counter)
+			set_id = ibqp->counter->id;
+		else
+			set_id = mlx5_ib_get_counters_id(dev, port_num);
 		context->qp_counter_set_usr_page |=
-			cpu_to_be32((u32)(mibport->cnts.set_id) << 24);
+			cpu_to_be32(set_id << 24);
 	}
 
 	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
@@ -3145,10 +3655,13 @@
 
 		raw_qp_param.operation = op;
 		if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
-			raw_qp_param.rq_q_ctr_id = mibport->cnts.set_id;
+			raw_qp_param.rq_q_ctr_id = set_id;
 			raw_qp_param.set_mask |= MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID;
 		}
 
+		if (attr_mask & IB_QP_PORT)
+			raw_qp_param.port = attr->port_num;
+
 		if (attr_mask & IB_QP_RATE_LIMIT) {
 			raw_qp_param.rl.rate = attr->rate_limit;
 
@@ -3213,11 +3726,18 @@
 		qp->sq.head = 0;
 		qp->sq.tail = 0;
 		qp->sq.cur_post = 0;
-		qp->sq.last_poll = 0;
+		if (qp->sq.wqe_cnt)
+			qp->sq.cur_edge = get_sq_edge(&qp->sq, 0);
 		qp->db.db[MLX5_RCV_DBR] = 0;
 		qp->db.db[MLX5_SND_DBR] = 0;
 	}
 
+	if ((new_state == IB_QPS_RTS) && qp->counter_pending) {
+		err = __mlx5_ib_qp_set_counter(ibqp, ibqp->counter);
+		if (!err)
+			qp->counter_pending = 0;
+	}
+
 out:
 	kfree(context);
 	return err;
@@ -3253,7 +3773,7 @@
 		return is_valid_mask(attr_mask, req, opt);
 	} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
 		req |= IB_QP_PATH_MTU;
-		opt = IB_QP_PKEY_INDEX;
+		opt = IB_QP_PKEY_INDEX | IB_QP_AV;
 		return is_valid_mask(attr_mask, req, opt);
 	} else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) {
 		req |= IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY |
@@ -3294,6 +3814,8 @@
 
 	dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry);
 	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+		u16 set_id;
+
 		required |= IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;
 		if (!is_valid_mask(attr_mask, required, 0))
 			return -EINVAL;
@@ -3309,17 +3831,24 @@
 		if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
 			MLX5_SET(dctc, dctc, rwe, 1);
 		if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) {
-			if (!mlx5_ib_dc_atomic_is_supported(dev))
+			int atomic_mode;
+
+			atomic_mode = get_atomic_mode(dev, MLX5_IB_QPT_DCT);
+			if (atomic_mode < 0)
 				return -EOPNOTSUPP;
+
+			MLX5_SET(dctc, dctc, atomic_mode, atomic_mode);
 			MLX5_SET(dctc, dctc, rae, 1);
-			MLX5_SET(dctc, dctc, atomic_mode, MLX5_ATOMIC_MODE_DCT_CX);
 		}
 		MLX5_SET(dctc, dctc, pkey_index, attr->pkey_index);
 		MLX5_SET(dctc, dctc, port, attr->port_num);
-		MLX5_SET(dctc, dctc, counter_set_id, dev->port[attr->port_num - 1].cnts.set_id);
+
+		set_id = mlx5_ib_get_counters_id(dev, attr->port_num - 1);
+		MLX5_SET(dctc, dctc, counter_set_id, set_id);
 
 	} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
 		struct mlx5_ib_modify_qp_resp resp = {};
+		u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {0};
 		u32 min_resp_len = offsetof(typeof(resp), dctn) +
 				   sizeof(resp.dctn);
 
@@ -3338,7 +3867,8 @@
 		MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit);
 
 		err = mlx5_core_create_dct(dev->mdev, &qp->dct.mdct, qp->dct.in,
-					   MLX5_ST_SZ_BYTES(create_dct_in));
+					   MLX5_ST_SZ_BYTES(create_dct_in), out,
+					   sizeof(out));
 		if (err)
 			return err;
 		resp.dctn = qp->dct.mdct.mqp.qpn;
@@ -3369,7 +3899,6 @@
 	size_t required_cmd_sz;
 	int err = -EINVAL;
 	int port;
-	enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED;
 
 	if (ibqp->rwq_ind_tbl)
 		return -ENOSYS;
@@ -3415,7 +3944,6 @@
 
 	if (!(cur_state == new_state && cur_state == IB_QPS_RESET)) {
 		port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
-		ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port);
 	}
 
 	if (qp->flags & MLX5_IB_QP_UNDERLAY) {
@@ -3426,7 +3954,8 @@
 		}
 	} else if (qp_type != MLX5_IB_QPT_REG_UMR &&
 		   qp_type != MLX5_IB_QPT_DCI &&
-		   !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) {
+		   !ib_modify_qp_is_ok(cur_state, new_state, qp_type,
+				       attr_mask)) {
 		mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n",
 			    cur_state, new_state, ibqp->qp_type, attr_mask);
 		goto out;
@@ -3477,13 +4006,69 @@
 	}
 
 	err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state,
-				  new_state, &ucmd);
+				  new_state, &ucmd, udata);
 
 out:
 	mutex_unlock(&qp->mutex);
 	return err;
 }
 
+static void _handle_post_send_edge(struct mlx5_ib_wq *sq, void **seg,
+				   u32 wqe_sz, void **cur_edge)
+{
+	u32 idx;
+
+	idx = (sq->cur_post + (wqe_sz >> 2)) & (sq->wqe_cnt - 1);
+	*cur_edge = get_sq_edge(sq, idx);
+
+	*seg = mlx5_frag_buf_get_wqe(&sq->fbc, idx);
+}
+
+/* handle_post_send_edge - Check if we got to the SQ edge. If so, update to
+ * the next nearby edge and get a new address translation for the current
+ * WQE position.
+ * @sq: SQ buffer.
+ * @seg: Current WQE position (16B aligned).
+ * @wqe_sz: Total current WQE size [16B].
+ * @cur_edge: Updated current edge.
+ */
+static inline void handle_post_send_edge(struct mlx5_ib_wq *sq, void **seg,
+					 u32 wqe_sz, void **cur_edge)
+{
+	if (likely(*seg != *cur_edge))
+		return;
+
+	_handle_post_send_edge(sq, seg, wqe_sz, cur_edge);
+}
+
+/* memcpy_send_wqe - copy data from src to the WQE and update the relevant
+ * WQ's pointers. At the end @seg is aligned to 16B regardless of the copied size.
+ * @sq: SQ buffer.
+ * @cur_edge: Updated current edge.
+ * @seg: Current WQE position (16B aligned).
+ * @wqe_sz: Total current WQE size [16B].
+ * @src: Pointer to copy from.
+ * @n: Number of bytes to copy.
+ */
+static inline void memcpy_send_wqe(struct mlx5_ib_wq *sq, void **cur_edge,
+				   void **seg, u32 *wqe_sz, const void *src,
+				   size_t n)
+{
+	while (likely(n)) {
+		size_t leftlen = *cur_edge - *seg;
+		size_t copysz = min_t(size_t, leftlen, n);
+		size_t stride;
+
+		memcpy(*seg, src, copysz);
+
+		n -= copysz;
+		src += copysz;
+		stride = !n ? ALIGN(copysz, 16) : copysz;
+		*seg += stride;
+		*wqe_sz += stride >> 4;
+		handle_post_send_edge(sq, seg, *wqe_sz, cur_edge);
+	}
+}
+
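A worked pass through memcpy_send_wqe() may make the stride and alignment handling clearer (the numbers are hypothetical):

/*
 * n = 40 bytes to copy, 32 bytes left before *cur_edge:
 *
 *   iteration 1: copysz = 32, n stays non-zero so stride = 32
 *                -> *wqe_sz += 2, *seg hits the edge and
 *                   handle_post_send_edge() remaps it to the next fragment
 *   iteration 2: copysz = 8, n reaches 0 so stride = ALIGN(8, 16) = 16
 *                -> *wqe_sz += 1, *seg ends up 16B-aligned as documented
 */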
 static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
 {
 	struct mlx5_ib_cq *cq;
@@ -3509,11 +4094,10 @@
 	rseg->reserved = 0;
 }
 
-static void *set_eth_seg(struct mlx5_wqe_eth_seg *eseg,
-			 const struct ib_send_wr *wr, void *qend,
-			 struct mlx5_ib_qp *qp, int *size)
+static void set_eth_seg(const struct ib_send_wr *wr, struct mlx5_ib_qp *qp,
+			void **seg, int *size, void **cur_edge)
 {
-	void *seg = eseg;
+	struct mlx5_wqe_eth_seg *eseg = *seg;
 
 	memset(eseg, 0, sizeof(struct mlx5_wqe_eth_seg));
 
@@ -3521,45 +4105,41 @@
 		eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM |
 				 MLX5_ETH_WQE_L4_CSUM;
 
-	seg += sizeof(struct mlx5_wqe_eth_seg);
-	*size += sizeof(struct mlx5_wqe_eth_seg) / 16;
-
 	if (wr->opcode == IB_WR_LSO) {
 		struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr);
-		int size_of_inl_hdr_start = sizeof(eseg->inline_hdr.start);
-		u64 left, leftlen, copysz;
+		size_t left, copysz;
 		void *pdata = ud_wr->header;
+		size_t stride;
 
 		left = ud_wr->hlen;
 		eseg->mss = cpu_to_be16(ud_wr->mss);
 		eseg->inline_hdr.sz = cpu_to_be16(left);
 
-		/*
-		 * check if there is space till the end of queue, if yes,
-		 * copy all in one shot, otherwise copy till the end of queue,
-		 * rollback and than the copy the left
+		/* memcpy_send_wqe needs a 16B-aligned address. Hence, we
+		 * first copy up to the current edge and then, if needed,
+		 * fall through to memcpy_send_wqe.
 		 */
-		leftlen = qend - (void *)eseg->inline_hdr.start;
-		copysz = min_t(u64, leftlen, left);
+		copysz = min_t(u64, *cur_edge - (void *)eseg->inline_hdr.start,
+			       left);
+		memcpy(eseg->inline_hdr.start, pdata, copysz);
+		stride = ALIGN(sizeof(struct mlx5_wqe_eth_seg) -
+			       sizeof(eseg->inline_hdr.start) + copysz, 16);
+		*size += stride / 16;
+		*seg += stride;
 
-		memcpy(seg - size_of_inl_hdr_start, pdata, copysz);
-
-		if (likely(copysz > size_of_inl_hdr_start)) {
-			seg += ALIGN(copysz - size_of_inl_hdr_start, 16);
-			*size += ALIGN(copysz - size_of_inl_hdr_start, 16) / 16;
-		}
-
-		if (unlikely(copysz < left)) { /* the last wqe in the queue */
-			seg = mlx5_get_send_wqe(qp, 0);
+		if (copysz < left) {
+			handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
 			left -= copysz;
 			pdata += copysz;
-			memcpy(seg, pdata, left);
-			seg += ALIGN(left, 16);
-			*size += ALIGN(left, 16) / 16;
+			memcpy_send_wqe(&qp->sq, cur_edge, seg, size, pdata,
+					left);
 		}
+
+		return;
 	}
 
-	return seg;
+	*seg += sizeof(struct mlx5_wqe_eth_seg);
+	*size += sizeof(struct mlx5_wqe_eth_seg) / 16;
 }
 
 static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg,
@@ -3583,7 +4163,7 @@
 	       MLX5_IB_UMR_OCTOWORD;
 }
 
-static __be64 frwr_mkey_mask(void)
+static __be64 frwr_mkey_mask(bool atomic)
 {
 	u64 result;
 
@@ -3596,10 +4176,12 @@
 		MLX5_MKEY_MASK_LW		|
 		MLX5_MKEY_MASK_RR		|
 		MLX5_MKEY_MASK_RW		|
-		MLX5_MKEY_MASK_A		|
 		MLX5_MKEY_MASK_SMALL_FENCE	|
 		MLX5_MKEY_MASK_FREE;
 
+	if (atomic)
+		result |= MLX5_MKEY_MASK_A;
+
 	return cpu_to_be64(result);
 }
 
@@ -3625,17 +4207,15 @@
 }
 
 static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr,
-			    struct mlx5_ib_mr *mr, bool umr_inline)
+			    struct mlx5_ib_mr *mr, u8 flags, bool atomic)
 {
-	int size = mr->ndescs * mr->desc_size;
+	int size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size;
 
 	memset(umr, 0, sizeof(*umr));
 
-	umr->flags = MLX5_UMR_CHECK_NOT_FREE;
-	if (umr_inline)
-		umr->flags |= MLX5_UMR_INLINE;
+	umr->flags = flags;
 	umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size));
-	umr->mkey_mask = frwr_mkey_mask();
+	umr->mkey_mask = frwr_mkey_mask(atomic);
 }
 
 static void set_linv_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr)
@@ -3717,10 +4297,14 @@
 
 	memset(umr, 0, sizeof(*umr));
 
-	if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE)
-		umr->flags = MLX5_UMR_CHECK_FREE; /* fail if free */
-	else
-		umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */
+	if (!umrwr->ignore_free_state) {
+		if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE)
+			/* fail if free */
+			umr->flags = MLX5_UMR_CHECK_FREE;
+		else
+			/* fail if not free */
+			umr->flags = MLX5_UMR_CHECK_NOT_FREE;
+	}
 
 	umr->xlt_octowords = cpu_to_be16(get_xlt_octo(umrwr->xlt_size));
 	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_XLT) {
@@ -3760,7 +4344,7 @@
 			     struct mlx5_ib_mr *mr,
 			     u32 key, int access)
 {
-	int ndescs = ALIGN(mr->ndescs, 8) >> 1;
+	int ndescs = ALIGN(mr->ndescs + mr->meta_ndescs, 8) >> 1;
 
 	memset(seg, 0, sizeof(*seg));
 
@@ -3811,31 +4395,13 @@
 			     struct mlx5_ib_mr *mr,
 			     struct mlx5_ib_pd *pd)
 {
-	int bcount = mr->desc_size * mr->ndescs;
+	int bcount = mr->desc_size * (mr->ndescs + mr->meta_ndescs);
 
 	dseg->addr = cpu_to_be64(mr->desc_map);
 	dseg->byte_count = cpu_to_be32(ALIGN(bcount, 64));
 	dseg->lkey = cpu_to_be32(pd->ibpd.local_dma_lkey);
 }
 
-static void set_reg_umr_inline_seg(void *seg, struct mlx5_ib_qp *qp,
-				   struct mlx5_ib_mr *mr, int mr_list_size)
-{
-	void *qend = qp->sq.qend;
-	void *addr = mr->descs;
-	int copy;
-
-	if (unlikely(seg + mr_list_size > qend)) {
-		copy = qend - seg;
-		memcpy(seg, addr, copy);
-		addr += copy;
-		mr_list_size -= copy;
-		seg = mlx5_get_send_wqe(qp, 0);
-	}
-	memcpy(seg, addr, mr_list_size);
-	seg += mr_list_size;
-}
-
 static __be32 send_ieth(const struct ib_send_wr *wr)
 {
 	switch (wr->opcode) {
@@ -3869,40 +4435,48 @@
 }
 
 static int set_data_inl_seg(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr,
-			    void *wqe, int *sz)
+			    void **wqe, int *wqe_sz, void **cur_edge)
 {
 	struct mlx5_wqe_inline_seg *seg;
-	void *qend = qp->sq.qend;
-	void *addr;
+	size_t offset;
 	int inl = 0;
-	int copy;
-	int len;
 	int i;
 
-	seg = wqe;
-	wqe += sizeof(*seg);
+	seg = *wqe;
+	*wqe += sizeof(*seg);
+	offset = sizeof(*seg);
+
 	for (i = 0; i < wr->num_sge; i++) {
-		addr = (void *)(unsigned long)(wr->sg_list[i].addr);
-		len  = wr->sg_list[i].length;
+		size_t len  = wr->sg_list[i].length;
+		void *addr = (void *)(unsigned long)(wr->sg_list[i].addr);
+
 		inl += len;
 
 		if (unlikely(inl > qp->max_inline_data))
 			return -ENOMEM;
 
-		if (unlikely(wqe + len > qend)) {
-			copy = qend - wqe;
-			memcpy(wqe, addr, copy);
-			addr += copy;
-			len -= copy;
-			wqe = mlx5_get_send_wqe(qp, 0);
+		while (likely(len)) {
+			size_t leftlen;
+			size_t copysz;
+
+			handle_post_send_edge(&qp->sq, wqe,
+					      *wqe_sz + (offset >> 4),
+					      cur_edge);
+
+			leftlen = *cur_edge - *wqe;
+			copysz = min_t(size_t, leftlen, len);
+
+			memcpy(*wqe, addr, copysz);
+			len -= copysz;
+			addr += copysz;
+			*wqe += copysz;
+			offset += copysz;
 		}
-		memcpy(wqe, addr, len);
-		wqe += len;
 	}
 
 	seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG);
 
-	*sz = ALIGN(inl + sizeof(seg->byte_count), 16) / 16;
+	*wqe_sz += ALIGN(inl + sizeof(seg->byte_count), 16) / 16;
 
 	return 0;
 }
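A quick example of the inline-segment accounting in set_data_inl_seg(): a single 20-byte inline SGE yields the following (hypothetical values):

	/* one SGE of 20 bytes */
	seg->byte_count = cpu_to_be32(20 | MLX5_INLINE_SEG);
	*wqe_sz += ALIGN(20 + sizeof(seg->byte_count), 16) / 16;	/* += 2 */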
@@ -4014,22 +4588,37 @@
 	return 0;
 }
 
-static int set_sig_data_segment(const struct ib_sig_handover_wr *wr,
-				struct mlx5_ib_qp *qp, void **seg, int *size)
+static int set_sig_data_segment(const struct ib_send_wr *send_wr,
+				struct ib_mr *sig_mr,
+				struct ib_sig_attrs *sig_attrs,
+				struct mlx5_ib_qp *qp, void **seg, int *size,
+				void **cur_edge)
 {
-	struct ib_sig_attrs *sig_attrs = wr->sig_attrs;
-	struct ib_mr *sig_mr = wr->sig_mr;
 	struct mlx5_bsf *bsf;
-	u32 data_len = wr->wr.sg_list->length;
-	u32 data_key = wr->wr.sg_list->lkey;
-	u64 data_va = wr->wr.sg_list->addr;
+	u32 data_len;
+	u32 data_key;
+	u64 data_va;
+	u32 prot_len = 0;
+	u32 prot_key = 0;
+	u64 prot_va = 0;
+	bool prot = false;
 	int ret;
 	int wqe_size;
+	struct mlx5_ib_mr *mr = to_mmr(sig_mr);
+	struct mlx5_ib_mr *pi_mr = mr->pi_mr;
 
-	if (!wr->prot ||
-	    (data_key == wr->prot->lkey &&
-	     data_va == wr->prot->addr &&
-	     data_len == wr->prot->length)) {
+	data_len = pi_mr->data_length;
+	data_key = pi_mr->ibmr.lkey;
+	data_va = pi_mr->data_iova;
+	if (pi_mr->meta_ndescs) {
+		prot_len = pi_mr->meta_length;
+		prot_key = pi_mr->ibmr.lkey;
+		prot_va = pi_mr->pi_iova;
+		prot = true;
+	}
+
+	if (!prot || (data_key == prot_key && data_va == prot_va &&
+		      data_len == prot_len)) {
 		/**
 		 * Source domain doesn't contain signature information
 		 * or data and protection are interleaved in memory.
@@ -4063,8 +4652,6 @@
 		struct mlx5_stride_block_ctrl_seg *sblock_ctrl;
 		struct mlx5_stride_block_entry *data_sentry;
 		struct mlx5_stride_block_entry *prot_sentry;
-		u32 prot_key = wr->prot->lkey;
-		u64 prot_va = wr->prot->addr;
 		u16 block_size = sig_attrs->mem.sig.dif.pi_interval;
 		int prot_size;
 
@@ -4099,8 +4686,7 @@
 
 	*seg += wqe_size;
 	*size += wqe_size / 16;
-	if (unlikely((*seg == qp->sq.qend)))
-		*seg = mlx5_get_send_wqe(qp, 0);
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
 
 	bsf = *seg;
 	ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len);
@@ -4109,24 +4695,21 @@
 
 	*seg += sizeof(*bsf);
 	*size += sizeof(*bsf) / 16;
-	if (unlikely((*seg == qp->sq.qend)))
-		*seg = mlx5_get_send_wqe(qp, 0);
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
 
 	return 0;
 }
 
 static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg,
-				 const struct ib_sig_handover_wr *wr, u32 size,
-				 u32 length, u32 pdn)
+				 struct ib_mr *sig_mr, int access_flags,
+				 u32 size, u32 length, u32 pdn)
 {
-	struct ib_mr *sig_mr = wr->sig_mr;
 	u32 sig_key = sig_mr->rkey;
 	u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1;
 
 	memset(seg, 0, sizeof(*seg));
 
-	seg->flags = get_umr_flags(wr->access_flags) |
-				   MLX5_MKC_ACCESS_MODE_KLMS;
+	seg->flags = get_umr_flags(access_flags) | MLX5_MKC_ACCESS_MODE_KLMS;
 	seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00);
 	seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 |
 				    MLX5_MKEY_BSF_EN | pdn);
@@ -4146,50 +4729,50 @@
 	umr->mkey_mask = sig_mkey_mask();
 }
 
-
-static int set_sig_umr_wr(const struct ib_send_wr *send_wr,
-			  struct mlx5_ib_qp *qp, void **seg, int *size)
+static int set_pi_umr_wr(const struct ib_send_wr *send_wr,
+			 struct mlx5_ib_qp *qp, void **seg, int *size,
+			 void **cur_edge)
 {
-	const struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr);
-	struct mlx5_ib_mr *sig_mr = to_mmr(wr->sig_mr);
+	const struct ib_reg_wr *wr = reg_wr(send_wr);
+	struct mlx5_ib_mr *sig_mr = to_mmr(wr->mr);
+	struct mlx5_ib_mr *pi_mr = sig_mr->pi_mr;
+	struct ib_sig_attrs *sig_attrs = sig_mr->ibmr.sig_attrs;
 	u32 pdn = get_pd(qp)->pdn;
 	u32 xlt_size;
 	int region_len, ret;
 
-	if (unlikely(wr->wr.num_sge != 1) ||
-	    unlikely(wr->access_flags & IB_ACCESS_REMOTE_ATOMIC) ||
-	    unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) ||
+	if (unlikely(send_wr->num_sge != 0) ||
+	    unlikely(wr->access & IB_ACCESS_REMOTE_ATOMIC) ||
+	    unlikely(!sig_mr->sig) || unlikely(!qp->ibqp.integrity_en) ||
 	    unlikely(!sig_mr->sig->sig_status_checked))
 		return -EINVAL;
 
 	/* length of the protected region, data + protection */
-	region_len = wr->wr.sg_list->length;
-	if (wr->prot &&
-	    (wr->prot->lkey != wr->wr.sg_list->lkey  ||
-	     wr->prot->addr != wr->wr.sg_list->addr  ||
-	     wr->prot->length != wr->wr.sg_list->length))
-		region_len += wr->prot->length;
+	region_len = pi_mr->ibmr.length;
 
 	/**
 	 * KLM octoword size - if protection was provided
 	 * then we use strided block format (3 octowords),
 	 * else we use single KLM (1 octoword)
 	 **/
-	xlt_size = wr->prot ? 0x30 : sizeof(struct mlx5_klm);
+	if (sig_attrs->mem.sig_type != IB_SIG_TYPE_NONE)
+		xlt_size = 0x30;
+	else
+		xlt_size = sizeof(struct mlx5_klm);
 
 	set_sig_umr_segment(*seg, xlt_size);
 	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
 	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
-	if (unlikely((*seg == qp->sq.qend)))
-		*seg = mlx5_get_send_wqe(qp, 0);
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
 
-	set_sig_mkey_segment(*seg, wr, xlt_size, region_len, pdn);
+	set_sig_mkey_segment(*seg, wr->mr, wr->access, xlt_size, region_len,
+			     pdn);
 	*seg += sizeof(struct mlx5_mkey_seg);
 	*size += sizeof(struct mlx5_mkey_seg) / 16;
-	if (unlikely((*seg == qp->sq.qend)))
-		*seg = mlx5_get_send_wqe(qp, 0);
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
 
-	ret = set_sig_data_segment(wr, qp, seg, size);
+	ret = set_sig_data_segment(send_wr, wr->mr, sig_attrs, qp, seg, size,
+				   cur_edge);
 	if (ret)
 		return ret;
 
@@ -4226,12 +4809,26 @@
 
 static int set_reg_wr(struct mlx5_ib_qp *qp,
 		      const struct ib_reg_wr *wr,
-		      void **seg, int *size)
+		      void **seg, int *size, void **cur_edge,
+		      bool check_not_free)
 {
 	struct mlx5_ib_mr *mr = to_mmr(wr->mr);
 	struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd);
-	int mr_list_size = mr->ndescs * mr->desc_size;
+	struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
+	int mr_list_size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size;
 	bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD;
+	bool atomic = wr->access & IB_ACCESS_REMOTE_ATOMIC;
+	u8 flags = 0;
+
+	if (!mlx5_ib_can_use_umr(dev, atomic)) {
+		mlx5_ib_warn(to_mdev(qp->ibqp.device),
+			     "Fast update of %s for MR is disabled\n",
+			     (MLX5_CAP_GEN(dev->mdev,
+					   umr_modify_entity_size_disabled)) ?
+				     "entity size" :
+				     "atomic access");
+		return -EINVAL;
+	}
 
 	if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) {
 		mlx5_ib_warn(to_mdev(qp->ibqp.device),
@@ -4239,21 +4836,25 @@
 		return -EINVAL;
 	}
 
-	set_reg_umr_seg(*seg, mr, umr_inline);
+	if (check_not_free)
+		flags |= MLX5_UMR_CHECK_NOT_FREE;
+	if (umr_inline)
+		flags |= MLX5_UMR_INLINE;
+
+	set_reg_umr_seg(*seg, mr, flags, atomic);
 	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
 	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
-	if (unlikely((*seg == qp->sq.qend)))
-		*seg = mlx5_get_send_wqe(qp, 0);
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
 
 	set_reg_mkey_seg(*seg, mr, wr->key, wr->access);
 	*seg += sizeof(struct mlx5_mkey_seg);
 	*size += sizeof(struct mlx5_mkey_seg) / 16;
-	if (unlikely((*seg == qp->sq.qend)))
-		*seg = mlx5_get_send_wqe(qp, 0);
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
 
 	if (umr_inline) {
-		set_reg_umr_inline_seg(*seg, qp, mr, mr_list_size);
-		*size += get_xlt_octo(mr_list_size);
+		memcpy_send_wqe(&qp->sq, cur_edge, seg, size, mr->descs,
+				mr_list_size);
+		*size = ALIGN(*size, MLX5_SEND_WQE_BB >> 4);
 	} else {
 		set_reg_data_seg(*seg, mr, pd);
 		*seg += sizeof(struct mlx5_wqe_data_seg);
@@ -4262,33 +4863,31 @@
 	return 0;
 }
 
-static void set_linv_wr(struct mlx5_ib_qp *qp, void **seg, int *size)
+static void set_linv_wr(struct mlx5_ib_qp *qp, void **seg, int *size,
+			void **cur_edge)
 {
 	set_linv_umr_seg(*seg);
 	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
 	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
-	if (unlikely((*seg == qp->sq.qend)))
-		*seg = mlx5_get_send_wqe(qp, 0);
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
 	set_linv_mkey_seg(*seg);
 	*seg += sizeof(struct mlx5_mkey_seg);
 	*size += sizeof(struct mlx5_mkey_seg) / 16;
-	if (unlikely((*seg == qp->sq.qend)))
-		*seg = mlx5_get_send_wqe(qp, 0);
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
 }
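set_linv_wr() shows the pattern this patch applies throughout the post-send path: after appending a segment, advance the pointer, account for it in 16-byte units, and let handle_post_send_edge() re-map the pointer if it just reached the end of a buffer fragment. Written as a generic helper (illustrative only, not part of the patch):

static void append_seg(struct mlx5_ib_qp *qp, void **seg, int *size,
		       void **cur_edge, size_t seg_sz)
{
	*seg += seg_sz;			/* step over the segment        */
	*size += seg_sz / 16;		/* WQE size is counted in 16B   */
	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
}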
 
-static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16)
+static void dump_wqe(struct mlx5_ib_qp *qp, u32 idx, int size_16)
 {
 	__be32 *p = NULL;
-	int tidx = idx;
 	int i, j;
 
-	pr_debug("dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx));
+	pr_debug("dump WQE index %u:\n", idx);
 	for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) {
 		if ((i & 0xf) == 0) {
-			void *buf = mlx5_get_send_wqe(qp, tidx);
-			tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1);
-			p = buf;
+			p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, idx);
+			pr_debug("WQBB at %p:\n", (void *)p);
 			j = 0;
+			idx = (idx + 1) & (qp->sq.wqe_cnt - 1);
 		}
 		pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]),
 			 be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]),
@@ -4297,15 +4896,16 @@
 }
 
 static int __begin_wqe(struct mlx5_ib_qp *qp, void **seg,
-		     struct mlx5_wqe_ctrl_seg **ctrl,
-		     const struct ib_send_wr *wr, unsigned *idx,
-		     int *size, int nreq, bool send_signaled, bool solicited)
+		       struct mlx5_wqe_ctrl_seg **ctrl,
+		       const struct ib_send_wr *wr, unsigned int *idx,
+		       int *size, void **cur_edge, int nreq,
+		       bool send_signaled, bool solicited)
 {
 	if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)))
 		return -ENOMEM;
 
 	*idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
-	*seg = mlx5_get_send_wqe(qp, *idx);
+	*seg = mlx5_frag_buf_get_wqe(&qp->sq.fbc, *idx);
 	*ctrl = *seg;
 	*(uint32_t *)(*seg + 8) = 0;
 	(*ctrl)->imm = send_ieth(wr);
@@ -4315,6 +4915,7 @@
 
 	*seg += sizeof(**ctrl);
 	*size = sizeof(**ctrl) / 16;
+	*cur_edge = qp->sq.cur_edge;
 
 	return 0;
 }
@@ -4322,17 +4923,18 @@
 static int begin_wqe(struct mlx5_ib_qp *qp, void **seg,
 		     struct mlx5_wqe_ctrl_seg **ctrl,
 		     const struct ib_send_wr *wr, unsigned *idx,
-		     int *size, int nreq)
+		     int *size, void **cur_edge, int nreq)
 {
-	return __begin_wqe(qp, seg, ctrl, wr, idx, size, nreq,
+	return __begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq,
 			   wr->send_flags & IB_SEND_SIGNALED,
 			   wr->send_flags & IB_SEND_SOLICITED);
 }
 
 static void finish_wqe(struct mlx5_ib_qp *qp,
 		       struct mlx5_wqe_ctrl_seg *ctrl,
-		       u8 size, unsigned idx, u64 wr_id,
-		       int nreq, u8 fence, u32 mlx5_opcode)
+		       void *seg, u8 size, void *cur_edge,
+		       unsigned int idx, u64 wr_id, int nreq, u8 fence,
+		       u32 mlx5_opcode)
 {
 	u8 opmod = 0;
 
@@ -4348,6 +4950,15 @@
 	qp->sq.wqe_head[idx] = qp->sq.head + nreq;
 	qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB);
 	qp->sq.w_list[idx].next = qp->sq.cur_post;
+
+	/* Save the edge, which may have been updated during WQE
+	 * construction, into the SQ's cache.
+	 */
+	seg = PTR_ALIGN(seg, MLX5_SEND_WQE_BB);
+	qp->sq.cur_edge = (unlikely(seg == cur_edge)) ?
+			  get_sq_edge(&qp->sq, qp->sq.cur_post &
+				      (qp->sq.wqe_cnt - 1)) :
+			  cur_edge;
 }
 
 static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
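finish_wqe() now also refreshes the cached SQ edge: cur_post advances by whole 64-byte basic blocks (WQEBBs), and if the aligned end of the just-built WQE landed exactly on the current edge, a fresh edge is looked up for the new cur_post. The size-to-WQEBB conversion, for example:

	/* a WQE built as size = 7 sixteen-byte quanta (112 bytes): */
	qp->sq.cur_post += DIV_ROUND_UP(7 * 16, MLX5_SEND_WQE_BB);	/* += 2 */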
@@ -4356,13 +4967,16 @@
 	struct mlx5_wqe_ctrl_seg *ctrl = NULL;  /* compiler warning */
 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
 	struct mlx5_core_dev *mdev = dev->mdev;
+	struct ib_reg_wr reg_pi_wr;
 	struct mlx5_ib_qp *qp;
 	struct mlx5_ib_mr *mr;
-	struct mlx5_wqe_data_seg *dpseg;
+	struct mlx5_ib_mr *pi_mr;
+	struct mlx5_ib_mr pa_pi_mr;
+	struct ib_sig_attrs *sig_attrs;
 	struct mlx5_wqe_xrc_seg *xrc;
 	struct mlx5_bf *bf;
+	void *cur_edge;
 	int uninitialized_var(size);
-	void *qend;
 	unsigned long flags;
 	unsigned idx;
 	int err = 0;
@@ -4373,22 +4987,20 @@
 	u8 next_fence = 0;
 	u8 fence;
 
+	if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR &&
+		     !drain)) {
+		*bad_wr = wr;
+		return -EIO;
+	}
+
 	if (unlikely(ibqp->qp_type == IB_QPT_GSI))
 		return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr);
 
 	qp = to_mqp(ibqp);
 	bf = &qp->bf;
-	qend = qp->sq.qend;
 
 	spin_lock_irqsave(&qp->sq.lock, flags);
 
-	if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && !drain) {
-		err = -EIO;
-		*bad_wr = wr;
-		nreq = 0;
-		goto out;
-	}
-
 	for (nreq = 0; wr; nreq++, wr = wr->next) {
 		if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) {
 			mlx5_ib_warn(dev, "\n");
@@ -4405,7 +5017,8 @@
 			goto out;
 		}
 
-		err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq);
+		err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, &cur_edge,
+				nreq);
 		if (err) {
 			mlx5_ib_warn(dev, "\n");
 			err = -ENOMEM;
@@ -4413,7 +5026,8 @@
 			goto out;
 		}
 
-		if (wr->opcode == IB_WR_REG_MR) {
+		if (wr->opcode == IB_WR_REG_MR ||
+		    wr->opcode == IB_WR_REG_MR_INTEGRITY) {
 			fence = dev->umr_fence;
 			next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
 		} else  {
@@ -4455,14 +5069,15 @@
 			case IB_WR_LOCAL_INV:
 				qp->sq.wr_data[idx] = IB_WR_LOCAL_INV;
 				ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey);
-				set_linv_wr(qp, &seg, &size);
+				set_linv_wr(qp, &seg, &size, &cur_edge);
 				num_sge = 0;
 				break;
 
 			case IB_WR_REG_MR:
 				qp->sq.wr_data[idx] = IB_WR_REG_MR;
 				ctrl->imm = cpu_to_be32(reg_wr(wr)->key);
-				err = set_reg_wr(qp, reg_wr(wr), &seg, &size);
+				err = set_reg_wr(qp, reg_wr(wr), &seg, &size,
+						 &cur_edge, true);
 				if (err) {
 					*bad_wr = wr;
 					goto out;
@@ -4470,65 +5085,126 @@
 				num_sge = 0;
 				break;
 
-			case IB_WR_REG_SIG_MR:
-				qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR;
-				mr = to_mmr(sig_handover_wr(wr)->sig_mr);
+			case IB_WR_REG_MR_INTEGRITY:
+				qp->sq.wr_data[idx] = IB_WR_REG_MR_INTEGRITY;
 
+				mr = to_mmr(reg_wr(wr)->mr);
+				pi_mr = mr->pi_mr;
+
+				if (pi_mr) {
+					memset(&reg_pi_wr, 0,
+					       sizeof(struct ib_reg_wr));
+
+					reg_pi_wr.mr = &pi_mr->ibmr;
+					reg_pi_wr.access = reg_wr(wr)->access;
+					reg_pi_wr.key = pi_mr->ibmr.rkey;
+
+					ctrl->imm = cpu_to_be32(reg_pi_wr.key);
+					/* UMR for data + prot registration */
+					err = set_reg_wr(qp, &reg_pi_wr, &seg,
+							 &size, &cur_edge,
+							 false);
+					if (err) {
+						*bad_wr = wr;
+						goto out;
+					}
+					finish_wqe(qp, ctrl, seg, size,
+						   cur_edge, idx, wr->wr_id,
+						   nreq, fence,
+						   MLX5_OPCODE_UMR);
+
+					err = begin_wqe(qp, &seg, &ctrl, wr,
+							&idx, &size, &cur_edge,
+							nreq);
+					if (err) {
+						mlx5_ib_warn(dev, "\n");
+						err = -ENOMEM;
+						*bad_wr = wr;
+						goto out;
+					}
+				} else {
+					memset(&pa_pi_mr, 0,
+					       sizeof(struct mlx5_ib_mr));
+					/* No UMR, use local_dma_lkey */
+					pa_pi_mr.ibmr.lkey =
+						mr->ibmr.pd->local_dma_lkey;
+
+					pa_pi_mr.ndescs = mr->ndescs;
+					pa_pi_mr.data_length = mr->data_length;
+					pa_pi_mr.data_iova = mr->data_iova;
+					if (mr->meta_ndescs) {
+						pa_pi_mr.meta_ndescs =
+							mr->meta_ndescs;
+						pa_pi_mr.meta_length =
+							mr->meta_length;
+						pa_pi_mr.pi_iova = mr->pi_iova;
+					}
+
+					pa_pi_mr.ibmr.length = mr->ibmr.length;
+					mr->pi_mr = &pa_pi_mr;
+				}
 				ctrl->imm = cpu_to_be32(mr->ibmr.rkey);
-				err = set_sig_umr_wr(wr, qp, &seg, &size);
+				/* UMR for sig MR */
+				err = set_pi_umr_wr(wr, qp, &seg, &size,
+						    &cur_edge);
 				if (err) {
 					mlx5_ib_warn(dev, "\n");
 					*bad_wr = wr;
 					goto out;
 				}
+				finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
+					   wr->wr_id, nreq, fence,
+					   MLX5_OPCODE_UMR);
 
-				finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq,
-					   fence, MLX5_OPCODE_UMR);
 				/*
 				 * SET_PSV WQEs are not signaled and solicited
 				 * on error
 				 */
+				sig_attrs = mr->ibmr.sig_attrs;
 				err = __begin_wqe(qp, &seg, &ctrl, wr, &idx,
-						  &size, nreq, false, true);
+						  &size, &cur_edge, nreq, false,
+						  true);
 				if (err) {
 					mlx5_ib_warn(dev, "\n");
 					err = -ENOMEM;
 					*bad_wr = wr;
 					goto out;
 				}
-
-				err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->mem,
-						 mr->sig->psv_memory.psv_idx, &seg,
-						 &size);
+				err = set_psv_wr(&sig_attrs->mem,
+						 mr->sig->psv_memory.psv_idx,
+						 &seg, &size);
 				if (err) {
 					mlx5_ib_warn(dev, "\n");
 					*bad_wr = wr;
 					goto out;
 				}
+				finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
+					   wr->wr_id, nreq, next_fence,
+					   MLX5_OPCODE_SET_PSV);
 
-				finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq,
-					   fence, MLX5_OPCODE_SET_PSV);
 				err = __begin_wqe(qp, &seg, &ctrl, wr, &idx,
-						  &size, nreq, false, true);
+						  &size, &cur_edge, nreq, false,
+						  true);
 				if (err) {
 					mlx5_ib_warn(dev, "\n");
 					err = -ENOMEM;
 					*bad_wr = wr;
 					goto out;
 				}
-
-				err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->wire,
-						 mr->sig->psv_wire.psv_idx, &seg,
-						 &size);
+				err = set_psv_wr(&sig_attrs->wire,
+						 mr->sig->psv_wire.psv_idx,
+						 &seg, &size);
 				if (err) {
 					mlx5_ib_warn(dev, "\n");
 					*bad_wr = wr;
 					goto out;
 				}
+				finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
+					   wr->wr_id, nreq, next_fence,
+					   MLX5_OPCODE_SET_PSV);
 
-				finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq,
-					   fence, MLX5_OPCODE_SET_PSV);
-				qp->next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
+				qp->next_fence =
+					MLX5_FENCE_MODE_INITIATOR_SMALL;
 				num_sge = 0;
 				goto skip_psv;
 
@@ -4564,16 +5240,14 @@
 			set_datagram_seg(seg, wr);
 			seg += sizeof(struct mlx5_wqe_datagram_seg);
 			size += sizeof(struct mlx5_wqe_datagram_seg) / 16;
-			if (unlikely((seg == qend)))
-				seg = mlx5_get_send_wqe(qp, 0);
+			handle_post_send_edge(&qp->sq, &seg, size, &cur_edge);
+
 			break;
 		case IB_QPT_UD:
 			set_datagram_seg(seg, wr);
 			seg += sizeof(struct mlx5_wqe_datagram_seg);
 			size += sizeof(struct mlx5_wqe_datagram_seg) / 16;
-
-			if (unlikely((seg == qend)))
-				seg = mlx5_get_send_wqe(qp, 0);
+			handle_post_send_edge(&qp->sq, &seg, size, &cur_edge);
 
 			/* handle qp that supports ud offload */
 			if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) {
@@ -4583,11 +5257,9 @@
 				memset(pad, 0, sizeof(struct mlx5_wqe_eth_pad));
 				seg += sizeof(struct mlx5_wqe_eth_pad);
 				size += sizeof(struct mlx5_wqe_eth_pad) / 16;
-
-				seg = set_eth_seg(seg, wr, qend, qp, &size);
-
-				if (unlikely((seg == qend)))
-					seg = mlx5_get_send_wqe(qp, 0);
+				set_eth_seg(wr, qp, &seg, &size, &cur_edge);
+				handle_post_send_edge(&qp->sq, &seg, size,
+						      &cur_edge);
 			}
 			break;
 		case MLX5_IB_QPT_REG_UMR:
@@ -4603,13 +5275,11 @@
 				goto out;
 			seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
 			size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
-			if (unlikely((seg == qend)))
-				seg = mlx5_get_send_wqe(qp, 0);
+			handle_post_send_edge(&qp->sq, &seg, size, &cur_edge);
 			set_reg_mkey_segment(seg, wr);
 			seg += sizeof(struct mlx5_mkey_seg);
 			size += sizeof(struct mlx5_mkey_seg) / 16;
-			if (unlikely((seg == qend)))
-				seg = mlx5_get_send_wqe(qp, 0);
+			handle_post_send_edge(&qp->sq, &seg, size, &cur_edge);
 			break;
 
 		default:
@@ -4617,33 +5287,29 @@
 		}
 
 		if (wr->send_flags & IB_SEND_INLINE && num_sge) {
-			int uninitialized_var(sz);
-
-			err = set_data_inl_seg(qp, wr, seg, &sz);
+			err = set_data_inl_seg(qp, wr, &seg, &size, &cur_edge);
 			if (unlikely(err)) {
 				mlx5_ib_warn(dev, "\n");
 				*bad_wr = wr;
 				goto out;
 			}
-			size += sz;
 		} else {
-			dpseg = seg;
 			for (i = 0; i < num_sge; i++) {
-				if (unlikely(dpseg == qend)) {
-					seg = mlx5_get_send_wqe(qp, 0);
-					dpseg = seg;
-				}
+				handle_post_send_edge(&qp->sq, &seg, size,
+						      &cur_edge);
 				if (likely(wr->sg_list[i].length)) {
-					set_data_ptr_seg(dpseg, wr->sg_list + i);
+					set_data_ptr_seg(
+						(struct mlx5_wqe_data_seg *)seg,
+						wr->sg_list + i);
 					size += sizeof(struct mlx5_wqe_data_seg) / 16;
-					dpseg++;
+					seg += sizeof(struct mlx5_wqe_data_seg);
 				}
 			}
 		}
 
 		qp->next_fence = next_fence;
-		finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, fence,
-			   mlx5_ib_opcode[wr->opcode]);
+		finish_wqe(qp, ctrl, seg, size, cur_edge, idx, wr->wr_id, nreq,
+			   fence, mlx5_ib_opcode[wr->opcode]);
 skip_psv:
 		if (0)
 			dump_wqe(qp, idx, size);
@@ -4665,11 +5331,10 @@
 		wmb();
 
 		/* currently we support only regular doorbells */
-		mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset, NULL);
+		mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset);
 		/* Make sure doorbells don't leak out of SQ spinlock
 		 * and reach the HCA out of order.
 		 */
-		mmiowb();
 		bf->offset ^= bf->buf_size;
 	}
 
@@ -4703,18 +5368,17 @@
 	int ind;
 	int i;
 
+	if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR &&
+		     !drain)) {
+		*bad_wr = wr;
+		return -EIO;
+	}
+
 	if (unlikely(ibqp->qp_type == IB_QPT_GSI))
 		return mlx5_ib_gsi_post_recv(ibqp, wr, bad_wr);
 
 	spin_lock_irqsave(&qp->rq.lock, flags);
 
-	if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && !drain) {
-		err = -EIO;
-		*bad_wr = wr;
-		nreq = 0;
-		goto out;
-	}
-
 	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
 
 	for (nreq = 0; wr; nreq++, wr = wr->next) {
@@ -4730,7 +5394,7 @@
 			goto out;
 		}
 
-		scat = get_recv_wqe(qp, ind);
+		scat = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ind);
 		if (qp->wq_sig)
 			scat++;
 
@@ -5172,8 +5836,7 @@
 }
 
 struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
-					  struct ib_ucontext *context,
-					  struct ib_udata *udata)
+				   struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
 	struct mlx5_ib_xrcd *xrcd;
@@ -5186,7 +5849,7 @@
 	if (!xrcd)
 		return ERR_PTR(-ENOMEM);
 
-	err = mlx5_core_xrcd_alloc(dev->mdev, &xrcd->xrcdn);
+	err = mlx5_cmd_xrcd_alloc(dev->mdev, &xrcd->xrcdn, 0);
 	if (err) {
 		kfree(xrcd);
 		return ERR_PTR(-ENOMEM);
@@ -5195,13 +5858,13 @@
 	return &xrcd->ibxrcd;
 }
 
-int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(xrcd->device);
 	u32 xrcdn = to_mxrcd(xrcd)->xrcdn;
 	int err;
 
-	err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn);
+	err = mlx5_cmd_xrcd_dealloc(dev->mdev, xrcdn, 0);
 	if (err)
 		mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn);
 
@@ -5271,6 +5934,7 @@
 	if (!in)
 		return -ENOMEM;
 
+	MLX5_SET(create_rq_in, in, uid, to_mpd(pd)->uid);
 	rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
 	MLX5_SET(rqc,  rqc, mem_rq_type,
 		 MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE);
@@ -5443,11 +6107,10 @@
 		return err;
 	}
 
-	err = create_user_rq(dev, pd, rwq, &ucmd);
+	err = create_user_rq(dev, pd, udata, rwq, &ucmd);
 	if (err) {
 		mlx5_ib_dbg(dev, "err %d\n", err);
-		if (err)
-			return err;
+		return err;
 	}
 
 	rwq->user_index = ucmd.user_index;
@@ -5507,22 +6170,20 @@
 err_copy:
 	mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp);
 err_user_rq:
-	destroy_user_rq(dev, pd, rwq);
+	destroy_user_rq(dev, pd, rwq, udata);
 err:
 	kfree(rwq);
 	return ERR_PTR(err);
 }
 
-int mlx5_ib_destroy_wq(struct ib_wq *wq)
+void mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(wq->device);
 	struct mlx5_ib_rwq *rwq = to_mrwq(wq);
 
 	mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp);
-	destroy_user_rq(dev, wq->pd, rwq);
+	destroy_user_rq(dev, wq->pd, rwq, udata);
 	kfree(rwq);
-
-	return 0;
 }
 
 struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
@@ -5576,6 +6237,9 @@
 	for (i = 0; i < sz; i++)
 		MLX5_SET(rqtc, rqtc, rq_num[i], init_attr->ind_tbl[i]->wq_num);
 
+	rwq_ind_tbl->uid = to_mpd(init_attr->ind_tbl[0]->pd)->uid;
+	MLX5_SET(create_rqt_in, in, uid, rwq_ind_tbl->uid);
+
 	err = mlx5_core_create_rqt(dev->mdev, in, inlen, &rwq_ind_tbl->rqtn);
 	kvfree(in);
 
@@ -5594,7 +6258,7 @@
 	return &rwq_ind_tbl->ib_rwq_ind_tbl;
 
 err_copy:
-	mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn);
+	mlx5_cmd_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn, rwq_ind_tbl->uid);
 err:
 	kfree(rwq_ind_tbl);
 	return ERR_PTR(err);
@@ -5605,7 +6269,7 @@
 	struct mlx5_ib_rwq_ind_table *rwq_ind_tbl = to_mrwq_ind_table(ib_rwq_ind_tbl);
 	struct mlx5_ib_dev *dev = to_mdev(ib_rwq_ind_tbl->device);
 
-	mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn);
+	mlx5_cmd_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn, rwq_ind_tbl->uid);
 
 	kfree(rwq_ind_tbl);
 	return 0;
@@ -5656,6 +6320,7 @@
 	if (wq_state == IB_WQS_ERR)
 		wq_state = MLX5_RQC_STATE_ERR;
 	MLX5_SET(modify_rq_in, in, rq_state, curr_wq_state);
+	MLX5_SET(modify_rq_in, in, uid, to_mpd(wq->pd)->uid);
 	MLX5_SET(rqc, rqc, state, wq_state);
 
 	if (wq_attr_mask & IB_WQ_FLAGS) {
@@ -5681,14 +6346,17 @@
 	}
 
 	if (curr_wq_state == IB_WQS_RESET && wq_state == IB_WQS_RDY) {
+		u16 set_id;
+
+		set_id = mlx5_ib_get_counters_id(dev, 0);
 		if (MLX5_CAP_GEN(dev->mdev, modify_rq_counter_set_id)) {
 			MLX5_SET64(modify_rq_in, in, modify_bitmask,
 				   MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID);
-			MLX5_SET(rqc, rqc, counter_set_id,
-				 dev->port->cnts.set_id);
+			MLX5_SET(rqc, rqc, counter_set_id, set_id);
 		} else
-			pr_info_once("%s: Receive WQ counters are not supported on current FW\n",
-				     dev->ib_dev.name);
+			dev_info_once(
+				&dev->ib_dev.dev,
+				"Receive WQ counters are not supported on current FW\n");
 	}
 
 	err = mlx5_core_modify_rq(dev->mdev, rwq->core_qp.qpn, in, inlen);
@@ -5758,7 +6426,7 @@
 		/* Run the CQ handler - this makes sure that the drain WR will
 		 * be processed if wasn't processed yet.
 		 */
-		mcq->mcq.comp(&mcq->mcq);
+		mcq->mcq.comp(&mcq->mcq, NULL);
 	}
 
 	wait_for_completion(&sdrain->done);
@@ -5828,3 +6496,34 @@
 
 	handle_drain_completion(cq, &rdrain, dev);
 }
+
+/*
+ * Bind a QP to a counter. If @counter is NULL, bind the QP to
+ * the default counter.
+ */
+int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter)
+{
+	struct mlx5_ib_qp *mqp = to_mqp(qp);
+	int err = 0;
+
+	mutex_lock(&mqp->mutex);
+	if (mqp->state == IB_QPS_RESET) {
+		qp->counter = counter;
+		goto out;
+	}
+
+	if (mqp->state == IB_QPS_RTS) {
+		err = __mlx5_ib_qp_set_counter(qp, counter);
+		if (!err)
+			qp->counter = counter;
+
+		goto out;
+	}
+
+	mqp->counter_pending = 1;
+	qp->counter = counter;
+
+out:
+	mutex_unlock(&mqp->mutex);
+	return err;
+}
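mlx5_ib_qp_set_counter() is the driver hook behind per-QP counter binding; its behaviour depends on the QP state, with the deferred case picked up by __mlx5_ib_modify_qp() once the QP reaches RTS:

/*
 * QP state          action taken
 * ----------------  ----------------------------------------------------
 * IB_QPS_RESET      record qp->counter only; it is applied when the QP
 *                   is later moved out of reset
 * IB_QPS_RTS        issue the RTS2RTS modify with the new counter set id
 * any other state   set counter_pending; the binding is applied on the
 *                   next transition to RTS
 */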
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index d359fec..4e7fde8 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -1,50 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /*
- * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Copyright (c) 2013-2018, Mellanox Technologies inc.  All rights reserved.
  */
 
 #include <linux/module.h>
 #include <linux/mlx5/qp.h>
-#include <linux/mlx5/srq.h>
 #include <linux/slab.h>
 #include <rdma/ib_umem.h>
 #include <rdma/ib_user_verbs.h>
-
 #include "mlx5_ib.h"
-
-/* not supported currently */
-static int srq_signature;
+#include "srq.h"
 
 static void *get_wqe(struct mlx5_ib_srq *srq, int n)
 {
-	return mlx5_buf_offset(&srq->buf, n << srq->msrq.wqe_shift);
+	return mlx5_frag_buf_get_wqe(&srq->fbc, n);
 }
 
 static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type)
@@ -78,6 +47,8 @@
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct mlx5_ib_create_srq ucmd = {};
+	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
 	size_t ucmdlen;
 	int err;
 	int npages;
@@ -102,16 +73,14 @@
 		return -EINVAL;
 
 	if (in->type != IB_SRQT_BASIC) {
-		err = get_srq_user_index(to_mucontext(pd->uobject->context),
-					 &ucmd, udata->inlen, &uidx);
+		err = get_srq_user_index(ucontext, &ucmd, udata->inlen, &uidx);
 		if (err)
 			return err;
 	}
 
 	srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE);
 
-	srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size,
-				0, 0);
+	srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0, 0);
 	if (IS_ERR(srq->umem)) {
 		mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size);
 		err = PTR_ERR(srq->umem);
@@ -135,8 +104,7 @@
 
 	mlx5_ib_populate_pas(dev, srq->umem, page_shift, in->pas, 0);
 
-	err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context),
-				  ucmd.db_addr, &srq->db);
+	err = mlx5_ib_db_map_user(ucontext, udata, ucmd.db_addr, &srq->db);
 	if (err) {
 		mlx5_ib_dbg(dev, "map doorbell failed\n");
 		goto err_in;
@@ -144,6 +112,7 @@
 
 	in->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
 	in->page_offset = offset;
+	in->uid = (in->type != IB_SRQT_XRC) ?  to_mpd(pd)->uid : 0;
 	if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 &&
 	    in->type != IB_SRQT_BASIC)
 		in->user_index = uidx;
@@ -172,12 +141,16 @@
 		return err;
 	}
 
-	if (mlx5_buf_alloc(dev->mdev, buf_size, &srq->buf)) {
+	if (mlx5_frag_buf_alloc_node(dev->mdev, buf_size, &srq->buf,
+				     dev->mdev->priv.numa_node)) {
 		mlx5_ib_dbg(dev, "buf alloc failed\n");
 		err = -ENOMEM;
 		goto err_db;
 	}
 
+	mlx5_init_fbc(srq->buf.frags, srq->msrq.wqe_shift, ilog2(srq->msrq.max),
+		      &srq->fbc);
+
 	srq->head    = 0;
 	srq->tail    = srq->msrq.max - 1;
 	srq->wqe_ctr = 0;
@@ -194,14 +167,14 @@
 		err = -ENOMEM;
 		goto err_buf;
 	}
-	mlx5_fill_page_array(&srq->buf, in->pas);
+	mlx5_fill_page_frag_array(&srq->buf, in->pas);
 
 	srq->wrid = kvmalloc_array(srq->msrq.max, sizeof(u64), GFP_KERNEL);
 	if (!srq->wrid) {
 		err = -ENOMEM;
 		goto err_in;
 	}
-	srq->wq_sig = !!srq_signature;
+	srq->wq_sig = 0;
 
 	in->log_page_size = srq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT;
 	if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 &&
@@ -214,16 +187,22 @@
 	kvfree(in->pas);
 
 err_buf:
-	mlx5_buf_free(dev->mdev, &srq->buf);
+	mlx5_frag_buf_free(dev->mdev, &srq->buf);
 
 err_db:
 	mlx5_db_free(dev->mdev, &srq->db);
 	return err;
 }
 
-static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq)
+static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
+			     struct ib_udata *udata)
 {
-	mlx5_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db);
+	mlx5_ib_db_unmap_user(
+		rdma_udata_to_drv_context(
+			udata,
+			struct mlx5_ib_ucontext,
+			ibucontext),
+		&srq->db);
 	ib_umem_release(srq->umem);
 }
 
@@ -231,20 +210,20 @@
 static void destroy_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq)
 {
 	kvfree(srq->wrid);
-	mlx5_buf_free(dev->mdev, &srq->buf);
+	mlx5_frag_buf_free(dev->mdev, &srq->buf);
 	mlx5_db_free(dev->mdev, &srq->db);
 }
 
-struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
-				  struct ib_srq_init_attr *init_attr,
-				  struct ib_udata *udata)
+int mlx5_ib_create_srq(struct ib_srq *ib_srq,
+		       struct ib_srq_init_attr *init_attr,
+		       struct ib_udata *udata)
 {
-	struct mlx5_ib_dev *dev = to_mdev(pd->device);
-	struct mlx5_ib_srq *srq;
+	struct mlx5_ib_dev *dev = to_mdev(ib_srq->device);
+	struct mlx5_ib_srq *srq = to_msrq(ib_srq);
 	size_t desc_size;
 	size_t buf_size;
 	int err;
-	struct mlx5_srq_attr in = {0};
+	struct mlx5_srq_attr in = {};
 	__u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
 
 	/* Sanity check SRQ size before proceeding */
@@ -252,13 +231,9 @@
 		mlx5_ib_dbg(dev, "max_wr %d, cap %d\n",
 			    init_attr->attr.max_wr,
 			    max_srq_wqes);
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
-	srq = kmalloc(sizeof(*srq), GFP_KERNEL);
-	if (!srq)
-		return ERR_PTR(-ENOMEM);
-
 	mutex_init(&srq->mutex);
 	spin_lock_init(&srq->lock);
 	srq->msrq.max    = roundup_pow_of_two(init_attr->attr.max_wr + 1);
@@ -266,35 +241,32 @@
 
 	desc_size = sizeof(struct mlx5_wqe_srq_next_seg) +
 		    srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg);
-	if (desc_size == 0 || srq->msrq.max_gs > desc_size) {
-		err = -EINVAL;
-		goto err_srq;
-	}
+	if (desc_size == 0 || srq->msrq.max_gs > desc_size)
+		return -EINVAL;
+
 	desc_size = roundup_pow_of_two(desc_size);
 	desc_size = max_t(size_t, 32, desc_size);
-	if (desc_size < sizeof(struct mlx5_wqe_srq_next_seg)) {
-		err = -EINVAL;
-		goto err_srq;
-	}
+	if (desc_size < sizeof(struct mlx5_wqe_srq_next_seg))
+		return -EINVAL;
+
 	srq->msrq.max_avail_gather = (desc_size - sizeof(struct mlx5_wqe_srq_next_seg)) /
 		sizeof(struct mlx5_wqe_data_seg);
 	srq->msrq.wqe_shift = ilog2(desc_size);
 	buf_size = srq->msrq.max * desc_size;
-	if (buf_size < desc_size) {
-		err = -EINVAL;
-		goto err_srq;
-	}
+	if (buf_size < desc_size)
+		return -EINVAL;
+
 	in.type = init_attr->srq_type;
 
-	if (pd->uobject)
-		err = create_srq_user(pd, srq, &in, udata, buf_size);
+	if (udata)
+		err = create_srq_user(ib_srq->pd, srq, &in, udata, buf_size);
 	else
 		err = create_srq_kernel(dev, srq, &in, buf_size);
 
 	if (err) {
 		mlx5_ib_warn(dev, "create srq %s failed, err %d\n",
-			     pd->uobject ? "user" : "kernel", err);
-		goto err_srq;
+			     udata ? "user" : "kernel", err);
+		return err;
 	}
 
 	in.log_size = ilog2(srq->msrq.max);
@@ -324,9 +296,9 @@
 	else
 		in.cqn = to_mcq(dev->devr.c0)->mcq.cqn;
 
-	in.pd = to_mpd(pd)->pdn;
+	in.pd = to_mpd(ib_srq->pd)->pdn;
 	in.db_record = srq->db.dma;
-	err = mlx5_core_create_srq(dev->mdev, &srq->msrq, &in);
+	err = mlx5_cmd_create_srq(dev, &srq->msrq, &in);
 	kvfree(in.pas);
 	if (err) {
 		mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err);
@@ -338,7 +310,7 @@
 	srq->msrq.event = mlx5_ib_srq_event;
 	srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn;
 
-	if (pd->uobject)
+	if (udata)
 		if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof(__u32))) {
 			mlx5_ib_dbg(dev, "copy to user failed\n");
 			err = -EFAULT;
@@ -347,21 +319,18 @@
 
 	init_attr->attr.max_wr = srq->msrq.max - 1;
 
-	return &srq->ibsrq;
+	return 0;
 
 err_core:
-	mlx5_core_destroy_srq(dev->mdev, &srq->msrq);
+	mlx5_cmd_destroy_srq(dev, &srq->msrq);
 
 err_usr_kern_srq:
-	if (pd->uobject)
-		destroy_srq_user(pd, srq);
+	if (udata)
+		destroy_srq_user(ib_srq->pd, srq, udata);
 	else
 		destroy_srq_kernel(dev, srq);
 
-err_srq:
-	kfree(srq);
-
-	return ERR_PTR(err);
+	return err;
 }
 
 int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
@@ -380,7 +349,7 @@
 			return -EINVAL;
 
 		mutex_lock(&srq->mutex);
-		ret = mlx5_core_arm_srq(dev->mdev, &srq->msrq, attr->srq_limit, 1);
+		ret = mlx5_cmd_arm_srq(dev, &srq->msrq, attr->srq_limit, 1);
 		mutex_unlock(&srq->mutex);
 
 		if (ret)
@@ -401,7 +370,7 @@
 	if (!out)
 		return -ENOMEM;
 
-	ret = mlx5_core_query_srq(dev->mdev, &srq->msrq, out);
+	ret = mlx5_cmd_query_srq(dev, &srq->msrq, out);
 	if (ret)
 		goto out_box;
 
@@ -414,22 +383,24 @@
 	return ret;
 }
 
-int mlx5_ib_destroy_srq(struct ib_srq *srq)
+void mlx5_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(srq->device);
 	struct mlx5_ib_srq *msrq = to_msrq(srq);
 
-	mlx5_core_destroy_srq(dev->mdev, &msrq->msrq);
+	mlx5_cmd_destroy_srq(dev, &msrq->msrq);
 
 	if (srq->uobject) {
-		mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
+		mlx5_ib_db_unmap_user(
+			rdma_udata_to_drv_context(
+				udata,
+				struct mlx5_ib_ucontext,
+				ibucontext),
+			&msrq->db);
 		ib_umem_release(msrq->umem);
 	} else {
 		destroy_srq_kernel(dev, msrq);
 	}
-
-	kfree(srq);
-	return 0;
 }
 
 void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index)
diff --git a/drivers/infiniband/hw/mlx5/srq.h b/drivers/infiniband/hw/mlx5/srq.h
new file mode 100644
index 0000000..af197c3
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/srq.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2013-2018, Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef MLX5_IB_SRQ_H
+#define MLX5_IB_SRQ_H
+
+enum {
+	MLX5_SRQ_FLAG_ERR    = (1 << 0),
+	MLX5_SRQ_FLAG_WQ_SIG = (1 << 1),
+	MLX5_SRQ_FLAG_RNDV   = (1 << 2),
+};
+
+struct mlx5_srq_attr {
+	u32 type;
+	u32 flags;
+	u32 log_size;
+	u32 wqe_shift;
+	u32 log_page_size;
+	u32 wqe_cnt;
+	u32 srqn;
+	u32 xrcd;
+	u32 page_offset;
+	u32 cqn;
+	u32 pd;
+	u32 lwm;
+	u32 user_index;
+	u64 db_record;
+	__be64 *pas;
+	u32 tm_log_list_size;
+	u32 tm_next_tag;
+	u32 tm_hw_phase_cnt;
+	u32 tm_sw_phase_cnt;
+	u16 uid;
+};
+
+struct mlx5_ib_dev;
+
+struct mlx5_core_srq {
+	struct mlx5_core_rsc_common common; /* must be first */
+	u32 srqn;
+	int max;
+	size_t max_gs;
+	size_t max_avail_gather;
+	int wqe_shift;
+	void (*event)(struct mlx5_core_srq *srq, enum mlx5_event e);
+
+	u16 uid;
+};
+
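+/* Per-device SRQ table: maps SRQ numbers to their mlx5_core_srq objects and
+ * carries the notifier block used to receive SRQ async events.
+ */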
+struct mlx5_srq_table {
+	struct notifier_block nb;
+	struct xarray array;
+};
+
+int mlx5_cmd_create_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+			struct mlx5_srq_attr *in);
+void mlx5_cmd_destroy_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq);
+int mlx5_cmd_query_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+		       struct mlx5_srq_attr *out);
+int mlx5_cmd_arm_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+		     u16 lwm, int is_srq);
+struct mlx5_core_srq *mlx5_cmd_get_srq(struct mlx5_ib_dev *dev, u32 srqn);
+
+int mlx5_init_srq_table(struct mlx5_ib_dev *dev);
+void mlx5_cleanup_srq_table(struct mlx5_ib_dev *dev);
+#endif /* MLX5_IB_SRQ_H */
diff --git a/drivers/infiniband/hw/mlx5/srq_cmd.c b/drivers/infiniband/hw/mlx5/srq_cmd.c
new file mode 100644
index 0000000..8fc3630
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/srq_cmd.c
@@ -0,0 +1,709 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2013-2018, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_ib.h"
+#include "srq.h"
+
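+/* Return the size in bytes of the PAS (physical address) array needed to
+ * cover the receive queue, including the extra area implied by page_offset.
+ */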
+static int get_pas_size(struct mlx5_srq_attr *in)
+{
+	u32 log_page_size = in->log_page_size + 12;
+	u32 log_srq_size  = in->log_size;
+	u32 log_rq_stride = in->wqe_shift;
+	u32 page_offset   = in->page_offset;
+	u32 po_quanta	  = 1 << (log_page_size - 6);
+	u32 rq_sz	  = 1 << (log_srq_size + 4 + log_rq_stride);
+	u32 page_size	  = 1 << log_page_size;
+	u32 rq_sz_po      = rq_sz + (page_offset * po_quanta);
+	u32 rq_num_pas    = DIV_ROUND_UP(rq_sz_po, page_size);
+
+	return rq_num_pas * sizeof(u64);
+}
+
+static void set_wq(void *wq, struct mlx5_srq_attr *in)
+{
+	MLX5_SET(wq,   wq, wq_signature,  !!(in->flags
+		 & MLX5_SRQ_FLAG_WQ_SIG));
+	MLX5_SET(wq,   wq, log_wq_pg_sz,  in->log_page_size);
+	MLX5_SET(wq,   wq, log_wq_stride, in->wqe_shift + 4);
+	MLX5_SET(wq,   wq, log_wq_sz,     in->log_size);
+	MLX5_SET(wq,   wq, page_offset,   in->page_offset);
+	MLX5_SET(wq,   wq, lwm,		  in->lwm);
+	MLX5_SET(wq,   wq, pd,		  in->pd);
+	MLX5_SET64(wq, wq, dbr_addr,	  in->db_record);
+}
+
+static void set_srqc(void *srqc, struct mlx5_srq_attr *in)
+{
+	MLX5_SET(srqc,   srqc, wq_signature,  !!(in->flags
+		 & MLX5_SRQ_FLAG_WQ_SIG));
+	MLX5_SET(srqc,   srqc, log_page_size, in->log_page_size);
+	MLX5_SET(srqc,   srqc, log_rq_stride, in->wqe_shift);
+	MLX5_SET(srqc,   srqc, log_srq_size,  in->log_size);
+	MLX5_SET(srqc,   srqc, page_offset,   in->page_offset);
+	MLX5_SET(srqc,	 srqc, lwm,	      in->lwm);
+	MLX5_SET(srqc,	 srqc, pd,	      in->pd);
+	MLX5_SET64(srqc, srqc, dbr_addr,      in->db_record);
+	MLX5_SET(srqc,	 srqc, xrcd,	      in->xrcd);
+	MLX5_SET(srqc,	 srqc, cqn,	      in->cqn);
+}
+
+static void get_wq(void *wq, struct mlx5_srq_attr *in)
+{
+	if (MLX5_GET(wq, wq, wq_signature))
+		in->flags &= MLX5_SRQ_FLAG_WQ_SIG;
+	in->log_page_size = MLX5_GET(wq,   wq, log_wq_pg_sz);
+	in->wqe_shift	  = MLX5_GET(wq,   wq, log_wq_stride) - 4;
+	in->log_size	  = MLX5_GET(wq,   wq, log_wq_sz);
+	in->page_offset   = MLX5_GET(wq,   wq, page_offset);
+	in->lwm		  = MLX5_GET(wq,   wq, lwm);
+	in->pd		  = MLX5_GET(wq,   wq, pd);
+	in->db_record	  = MLX5_GET64(wq, wq, dbr_addr);
+}
+
+static void get_srqc(void *srqc, struct mlx5_srq_attr *in)
+{
+	if (MLX5_GET(srqc, srqc, wq_signature))
+		in->flags &= MLX5_SRQ_FLAG_WQ_SIG;
+	in->log_page_size = MLX5_GET(srqc,   srqc, log_page_size);
+	in->wqe_shift	  = MLX5_GET(srqc,   srqc, log_rq_stride);
+	in->log_size	  = MLX5_GET(srqc,   srqc, log_srq_size);
+	in->page_offset   = MLX5_GET(srqc,   srqc, page_offset);
+	in->lwm		  = MLX5_GET(srqc,   srqc, lwm);
+	in->pd		  = MLX5_GET(srqc,   srqc, pd);
+	in->db_record	  = MLX5_GET64(srqc, srqc, dbr_addr);
+}
+
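+/* Look up an SRQ by number under the table lock and, if found, take a
+ * reference on it before returning.
+ */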
+struct mlx5_core_srq *mlx5_cmd_get_srq(struct mlx5_ib_dev *dev, u32 srqn)
+{
+	struct mlx5_srq_table *table = &dev->srq_table;
+	struct mlx5_core_srq *srq;
+
+	xa_lock(&table->array);
+	srq = xa_load(&table->array, srqn);
+	if (srq)
+		refcount_inc(&srq->common.refcount);
+	xa_unlock(&table->array);
+
+	return srq;
+}
+
+static int create_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+			  struct mlx5_srq_attr *in)
+{
+	u32 create_out[MLX5_ST_SZ_DW(create_srq_out)] = {0};
+	void *create_in;
+	void *srqc;
+	void *pas;
+	int pas_size;
+	int inlen;
+	int err;
+
+	pas_size  = get_pas_size(in);
+	inlen	  = MLX5_ST_SZ_BYTES(create_srq_in) + pas_size;
+	create_in = kvzalloc(inlen, GFP_KERNEL);
+	if (!create_in)
+		return -ENOMEM;
+
+	MLX5_SET(create_srq_in, create_in, uid, in->uid);
+	srqc = MLX5_ADDR_OF(create_srq_in, create_in, srq_context_entry);
+	pas = MLX5_ADDR_OF(create_srq_in, create_in, pas);
+
+	set_srqc(srqc, in);
+	memcpy(pas, in->pas, pas_size);
+
+	MLX5_SET(create_srq_in, create_in, opcode,
+		 MLX5_CMD_OP_CREATE_SRQ);
+
+	err = mlx5_cmd_exec(dev->mdev, create_in, inlen, create_out,
+			    sizeof(create_out));
+	kvfree(create_in);
+	if (!err) {
+		srq->srqn = MLX5_GET(create_srq_out, create_out, srqn);
+		srq->uid = in->uid;
+	}
+
+	return err;
+}
+
+static int destroy_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq)
+{
+	u32 srq_in[MLX5_ST_SZ_DW(destroy_srq_in)] = {0};
+	u32 srq_out[MLX5_ST_SZ_DW(destroy_srq_out)] = {0};
+
+	MLX5_SET(destroy_srq_in, srq_in, opcode,
+		 MLX5_CMD_OP_DESTROY_SRQ);
+	MLX5_SET(destroy_srq_in, srq_in, srqn, srq->srqn);
+	MLX5_SET(destroy_srq_in, srq_in, uid, srq->uid);
+
+	return mlx5_cmd_exec(dev->mdev, srq_in, sizeof(srq_in), srq_out,
+			     sizeof(srq_out));
+}
+
+static int arm_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+		       u16 lwm, int is_srq)
+{
+	u32 srq_in[MLX5_ST_SZ_DW(arm_rq_in)] = {0};
+	u32 srq_out[MLX5_ST_SZ_DW(arm_rq_out)] = {0};
+
+	MLX5_SET(arm_rq_in, srq_in, opcode, MLX5_CMD_OP_ARM_RQ);
+	MLX5_SET(arm_rq_in, srq_in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_SRQ);
+	MLX5_SET(arm_rq_in, srq_in, srq_number, srq->srqn);
+	MLX5_SET(arm_rq_in, srq_in, lwm,      lwm);
+	MLX5_SET(arm_rq_in, srq_in, uid, srq->uid);
+
+	return mlx5_cmd_exec(dev->mdev, srq_in, sizeof(srq_in), srq_out,
+			     sizeof(srq_out));
+}
+
+static int query_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+			 struct mlx5_srq_attr *out)
+{
+	u32 srq_in[MLX5_ST_SZ_DW(query_srq_in)] = {0};
+	u32 *srq_out;
+	void *srqc;
+	int err;
+
+	srq_out = kvzalloc(MLX5_ST_SZ_BYTES(query_srq_out), GFP_KERNEL);
+	if (!srq_out)
+		return -ENOMEM;
+
+	MLX5_SET(query_srq_in, srq_in, opcode,
+		 MLX5_CMD_OP_QUERY_SRQ);
+	MLX5_SET(query_srq_in, srq_in, srqn, srq->srqn);
+	err = mlx5_cmd_exec(dev->mdev, srq_in, sizeof(srq_in), srq_out,
+			    MLX5_ST_SZ_BYTES(query_srq_out));
+	if (err)
+		goto out;
+
+	srqc = MLX5_ADDR_OF(query_srq_out, srq_out, srq_context_entry);
+	get_srqc(srqc, out);
+	if (MLX5_GET(srqc, srqc, state) != MLX5_SRQC_STATE_GOOD)
+		out->flags |= MLX5_SRQ_FLAG_ERR;
+out:
+	kvfree(srq_out);
+	return err;
+}
+
+static int create_xrc_srq_cmd(struct mlx5_ib_dev *dev,
+			      struct mlx5_core_srq *srq,
+			      struct mlx5_srq_attr *in)
+{
+	u32 create_out[MLX5_ST_SZ_DW(create_xrc_srq_out)];
+	void *create_in;
+	void *xrc_srqc;
+	void *pas;
+	int pas_size;
+	int inlen;
+	int err;
+
+	pas_size  = get_pas_size(in);
+	inlen	  = MLX5_ST_SZ_BYTES(create_xrc_srq_in) + pas_size;
+	create_in = kvzalloc(inlen, GFP_KERNEL);
+	if (!create_in)
+		return -ENOMEM;
+
+	MLX5_SET(create_xrc_srq_in, create_in, uid, in->uid);
+	xrc_srqc = MLX5_ADDR_OF(create_xrc_srq_in, create_in,
+				xrc_srq_context_entry);
+	pas	 = MLX5_ADDR_OF(create_xrc_srq_in, create_in, pas);
+
+	set_srqc(xrc_srqc, in);
+	MLX5_SET(xrc_srqc, xrc_srqc, user_index, in->user_index);
+	memcpy(pas, in->pas, pas_size);
+	MLX5_SET(create_xrc_srq_in, create_in, opcode,
+		 MLX5_CMD_OP_CREATE_XRC_SRQ);
+
+	memset(create_out, 0, sizeof(create_out));
+	err = mlx5_cmd_exec(dev->mdev, create_in, inlen, create_out,
+			    sizeof(create_out));
+	if (err)
+		goto out;
+
+	srq->srqn = MLX5_GET(create_xrc_srq_out, create_out, xrc_srqn);
+	srq->uid = in->uid;
+out:
+	kvfree(create_in);
+	return err;
+}
+
+static int destroy_xrc_srq_cmd(struct mlx5_ib_dev *dev,
+			       struct mlx5_core_srq *srq)
+{
+	u32 xrcsrq_in[MLX5_ST_SZ_DW(destroy_xrc_srq_in)]   = {0};
+	u32 xrcsrq_out[MLX5_ST_SZ_DW(destroy_xrc_srq_out)] = {0};
+
+	MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, opcode,
+		 MLX5_CMD_OP_DESTROY_XRC_SRQ);
+	MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn);
+	MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, uid, srq->uid);
+
+	return mlx5_cmd_exec(dev->mdev, xrcsrq_in, sizeof(xrcsrq_in),
+			     xrcsrq_out, sizeof(xrcsrq_out));
+}
+
+static int arm_xrc_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+			   u16 lwm)
+{
+	u32 xrcsrq_in[MLX5_ST_SZ_DW(arm_xrc_srq_in)]   = {0};
+	u32 xrcsrq_out[MLX5_ST_SZ_DW(arm_xrc_srq_out)] = {0};
+
+	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, opcode,   MLX5_CMD_OP_ARM_XRC_SRQ);
+	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, op_mod,   MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ);
+	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn);
+	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, lwm,      lwm);
+	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, uid, srq->uid);
+
+	return  mlx5_cmd_exec(dev->mdev, xrcsrq_in, sizeof(xrcsrq_in),
+			      xrcsrq_out, sizeof(xrcsrq_out));
+}
+
+static int query_xrc_srq_cmd(struct mlx5_ib_dev *dev,
+			     struct mlx5_core_srq *srq,
+			     struct mlx5_srq_attr *out)
+{
+	u32 xrcsrq_in[MLX5_ST_SZ_DW(query_xrc_srq_in)];
+	u32 *xrcsrq_out;
+	void *xrc_srqc;
+	int err;
+
+	xrcsrq_out = kvzalloc(MLX5_ST_SZ_BYTES(query_xrc_srq_out), GFP_KERNEL);
+	if (!xrcsrq_out)
+		return -ENOMEM;
+	memset(xrcsrq_in, 0, sizeof(xrcsrq_in));
+
+	MLX5_SET(query_xrc_srq_in, xrcsrq_in, opcode,
+		 MLX5_CMD_OP_QUERY_XRC_SRQ);
+	MLX5_SET(query_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn);
+
+	err = mlx5_cmd_exec(dev->mdev, xrcsrq_in, sizeof(xrcsrq_in),
+			    xrcsrq_out, MLX5_ST_SZ_BYTES(query_xrc_srq_out));
+	if (err)
+		goto out;
+
+	xrc_srqc = MLX5_ADDR_OF(query_xrc_srq_out, xrcsrq_out,
+				xrc_srq_context_entry);
+	get_srqc(xrc_srqc, out);
+	if (MLX5_GET(xrc_srqc, xrc_srqc, state) != MLX5_XRC_SRQC_STATE_GOOD)
+		out->flags |= MLX5_SRQ_FLAG_ERR;
+
+out:
+	kvfree(xrcsrq_out);
+	return err;
+}
+
+static int create_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+			  struct mlx5_srq_attr *in)
+{
+	void *create_out = NULL;
+	void *create_in = NULL;
+	void *rmpc;
+	void *wq;
+	int pas_size;
+	int outlen;
+	int inlen;
+	int err;
+
+	pas_size = get_pas_size(in);
+	inlen = MLX5_ST_SZ_BYTES(create_rmp_in) + pas_size;
+	outlen = MLX5_ST_SZ_BYTES(create_rmp_out);
+	create_in = kvzalloc(inlen, GFP_KERNEL);
+	create_out = kvzalloc(outlen, GFP_KERNEL);
+	if (!create_in || !create_out) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	rmpc = MLX5_ADDR_OF(create_rmp_in, create_in, ctx);
+	wq = MLX5_ADDR_OF(rmpc, rmpc, wq);
+
+	MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY);
+	MLX5_SET(create_rmp_in, create_in, uid, in->uid);
+	set_wq(wq, in);
+	memcpy(MLX5_ADDR_OF(rmpc, rmpc, wq.pas), in->pas, pas_size);
+
+	MLX5_SET(create_rmp_in, create_in, opcode, MLX5_CMD_OP_CREATE_RMP);
+	err = mlx5_cmd_exec(dev->mdev, create_in, inlen, create_out, outlen);
+	if (!err) {
+		srq->srqn = MLX5_GET(create_rmp_out, create_out, rmpn);
+		srq->uid = in->uid;
+	}
+
+out:
+	kvfree(create_in);
+	kvfree(create_out);
+	return err;
+}
+
+static int destroy_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_rmp_in)]   = {};
+	u32 out[MLX5_ST_SZ_DW(destroy_rmp_out)] = {};
+
+	MLX5_SET(destroy_rmp_in, in, opcode, MLX5_CMD_OP_DESTROY_RMP);
+	MLX5_SET(destroy_rmp_in, in, rmpn, srq->srqn);
+	MLX5_SET(destroy_rmp_in, in, uid, srq->uid);
+	return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
+}
+
+static int arm_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+		       u16 lwm)
+{
+	void *out = NULL;
+	void *in = NULL;
+	void *rmpc;
+	void *wq;
+	void *bitmask;
+	int outlen;
+	int inlen;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(modify_rmp_in);
+	outlen = MLX5_ST_SZ_BYTES(modify_rmp_out);
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	out = kvzalloc(outlen, GFP_KERNEL);
+	if (!in || !out) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	rmpc =	  MLX5_ADDR_OF(modify_rmp_in,   in,   ctx);
+	bitmask = MLX5_ADDR_OF(modify_rmp_in,   in,   bitmask);
+	wq   =	  MLX5_ADDR_OF(rmpc,	        rmpc, wq);
+
+	MLX5_SET(modify_rmp_in, in,	 rmp_state, MLX5_RMPC_STATE_RDY);
+	MLX5_SET(modify_rmp_in, in,	 rmpn,      srq->srqn);
+	MLX5_SET(modify_rmp_in, in, uid, srq->uid);
+	MLX5_SET(wq,		wq,	 lwm,	    lwm);
+	MLX5_SET(rmp_bitmask,	bitmask, lwm,	    1);
+	MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY);
+	MLX5_SET(modify_rmp_in, in, opcode, MLX5_CMD_OP_MODIFY_RMP);
+
+	err = mlx5_cmd_exec(dev->mdev, in, inlen, out, outlen);
+
+out:
+	kvfree(in);
+	kvfree(out);
+	return err;
+}
+
+static int query_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+			 struct mlx5_srq_attr *out)
+{
+	u32 *rmp_out = NULL;
+	u32 *rmp_in = NULL;
+	void *rmpc;
+	int outlen;
+	int inlen;
+	int err;
+
+	outlen = MLX5_ST_SZ_BYTES(query_rmp_out);
+	inlen = MLX5_ST_SZ_BYTES(query_rmp_in);
+
+	rmp_out = kvzalloc(outlen, GFP_KERNEL);
+	rmp_in = kvzalloc(inlen, GFP_KERNEL);
+	if (!rmp_out || !rmp_in) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	MLX5_SET(query_rmp_in, rmp_in, opcode, MLX5_CMD_OP_QUERY_RMP);
+	MLX5_SET(query_rmp_in, rmp_in, rmpn,   srq->srqn);
+	err = mlx5_cmd_exec(dev->mdev, rmp_in, inlen, rmp_out, outlen);
+	if (err)
+		goto out;
+
+	rmpc = MLX5_ADDR_OF(query_rmp_out, rmp_out, rmp_context);
+	get_wq(MLX5_ADDR_OF(rmpc, rmpc, wq), out);
+	if (MLX5_GET(rmpc, rmpc, state) != MLX5_RMPC_STATE_RDY)
+		out->flags |= MLX5_SRQ_FLAG_ERR;
+
+out:
+	kvfree(rmp_out);
+	kvfree(rmp_in);
+	return err;
+}
+
+static int create_xrq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+			  struct mlx5_srq_attr *in)
+{
+	u32 create_out[MLX5_ST_SZ_DW(create_xrq_out)] = {0};
+	void *create_in;
+	void *xrqc;
+	void *wq;
+	int pas_size;
+	int inlen;
+	int err;
+
+	pas_size = get_pas_size(in);
+	inlen = MLX5_ST_SZ_BYTES(create_xrq_in) + pas_size;
+	create_in = kvzalloc(inlen, GFP_KERNEL);
+	if (!create_in)
+		return -ENOMEM;
+
+	xrqc = MLX5_ADDR_OF(create_xrq_in, create_in, xrq_context);
+	wq = MLX5_ADDR_OF(xrqc, xrqc, wq);
+
+	set_wq(wq, in);
+	memcpy(MLX5_ADDR_OF(xrqc, xrqc, wq.pas), in->pas, pas_size);
+
+	if (in->type == IB_SRQT_TM) {
+		MLX5_SET(xrqc, xrqc, topology, MLX5_XRQC_TOPOLOGY_TAG_MATCHING);
+		if (in->flags & MLX5_SRQ_FLAG_RNDV)
+			MLX5_SET(xrqc, xrqc, offload, MLX5_XRQC_OFFLOAD_RNDV);
+		MLX5_SET(xrqc, xrqc,
+			 tag_matching_topology_context.log_matching_list_sz,
+			 in->tm_log_list_size);
+	}
+	MLX5_SET(xrqc, xrqc, user_index, in->user_index);
+	MLX5_SET(xrqc, xrqc, cqn, in->cqn);
+	MLX5_SET(create_xrq_in, create_in, opcode, MLX5_CMD_OP_CREATE_XRQ);
+	MLX5_SET(create_xrq_in, create_in, uid, in->uid);
+	err = mlx5_cmd_exec(dev->mdev, create_in, inlen, create_out,
+			    sizeof(create_out));
+	kvfree(create_in);
+	if (!err) {
+		srq->srqn = MLX5_GET(create_xrq_out, create_out, xrqn);
+		srq->uid = in->uid;
+	}
+
+	return err;
+}
+
+static int destroy_xrq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_xrq_in)] = {0};
+	u32 out[MLX5_ST_SZ_DW(destroy_xrq_out)] = {0};
+
+	MLX5_SET(destroy_xrq_in, in, opcode, MLX5_CMD_OP_DESTROY_XRQ);
+	MLX5_SET(destroy_xrq_in, in, xrqn,   srq->srqn);
+	MLX5_SET(destroy_xrq_in, in, uid, srq->uid);
+
+	return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
+}
+
+static int arm_xrq_cmd(struct mlx5_ib_dev *dev,
+		       struct mlx5_core_srq *srq,
+		       u16 lwm)
+{
+	u32 out[MLX5_ST_SZ_DW(arm_rq_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(arm_rq_in)] = {0};
+
+	MLX5_SET(arm_rq_in, in, opcode,     MLX5_CMD_OP_ARM_RQ);
+	MLX5_SET(arm_rq_in, in, op_mod,     MLX5_ARM_RQ_IN_OP_MOD_XRQ);
+	MLX5_SET(arm_rq_in, in, srq_number, srq->srqn);
+	MLX5_SET(arm_rq_in, in, lwm,	    lwm);
+	MLX5_SET(arm_rq_in, in, uid, srq->uid);
+
+	return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
+}
+
+static int query_xrq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+			 struct mlx5_srq_attr *out)
+{
+	u32 in[MLX5_ST_SZ_DW(query_xrq_in)] = {0};
+	u32 *xrq_out;
+	int outlen = MLX5_ST_SZ_BYTES(query_xrq_out);
+	void *xrqc;
+	int err;
+
+	xrq_out = kvzalloc(outlen, GFP_KERNEL);
+	if (!xrq_out)
+		return -ENOMEM;
+
+	MLX5_SET(query_xrq_in, in, opcode, MLX5_CMD_OP_QUERY_XRQ);
+	MLX5_SET(query_xrq_in, in, xrqn, srq->srqn);
+
+	err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), xrq_out, outlen);
+	if (err)
+		goto out;
+
+	xrqc = MLX5_ADDR_OF(query_xrq_out, xrq_out, xrq_context);
+	get_wq(MLX5_ADDR_OF(xrqc, xrqc, wq), out);
+	if (MLX5_GET(xrqc, xrqc, state) != MLX5_XRQC_STATE_GOOD)
+		out->flags |= MLX5_SRQ_FLAG_ERR;
+	out->tm_next_tag =
+		MLX5_GET(xrqc, xrqc,
+			 tag_matching_topology_context.append_next_index);
+	out->tm_hw_phase_cnt =
+		MLX5_GET(xrqc, xrqc,
+			 tag_matching_topology_context.hw_phase_cnt);
+	out->tm_sw_phase_cnt =
+		MLX5_GET(xrqc, xrqc,
+			 tag_matching_topology_context.sw_phase_cnt);
+
+out:
+	kvfree(xrq_out);
+	return err;
+}
+
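+/* ISSI 0 firmware only understands the legacy SRQ commands; newer interfaces
+ * pick the XRC SRQ, XRQ or RMP command set based on the resource type.
+ */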
+static int create_srq_split(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+			    struct mlx5_srq_attr *in)
+{
+	if (!dev->mdev->issi)
+		return create_srq_cmd(dev, srq, in);
+	switch (srq->common.res) {
+	case MLX5_RES_XSRQ:
+		return create_xrc_srq_cmd(dev, srq, in);
+	case MLX5_RES_XRQ:
+		return create_xrq_cmd(dev, srq, in);
+	default:
+		return create_rmp_cmd(dev, srq, in);
+	}
+}
+
+static int destroy_srq_split(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq)
+{
+	if (!dev->mdev->issi)
+		return destroy_srq_cmd(dev, srq);
+	switch (srq->common.res) {
+	case MLX5_RES_XSRQ:
+		return destroy_xrc_srq_cmd(dev, srq);
+	case MLX5_RES_XRQ:
+		return destroy_xrq_cmd(dev, srq);
+	default:
+		return destroy_rmp_cmd(dev, srq);
+	}
+}
+
+int mlx5_cmd_create_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+			struct mlx5_srq_attr *in)
+{
+	struct mlx5_srq_table *table = &dev->srq_table;
+	int err;
+
+	switch (in->type) {
+	case IB_SRQT_XRC:
+		srq->common.res = MLX5_RES_XSRQ;
+		break;
+	case IB_SRQT_TM:
+		srq->common.res = MLX5_RES_XRQ;
+		break;
+	default:
+		srq->common.res = MLX5_RES_SRQ;
+	}
+
+	err = create_srq_split(dev, srq, in);
+	if (err)
+		return err;
+
+	refcount_set(&srq->common.refcount, 1);
+	init_completion(&srq->common.free);
+
+	err = xa_err(xa_store_irq(&table->array, srq->srqn, srq, GFP_KERNEL));
+	if (err)
+		goto err_destroy_srq_split;
+
+	return 0;
+
+err_destroy_srq_split:
+	destroy_srq_split(dev, srq);
+
+	return err;
+}
+
+void mlx5_cmd_destroy_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq)
+{
+	struct mlx5_srq_table *table = &dev->srq_table;
+	struct mlx5_core_srq *tmp;
+	int err;
+
+	tmp = xa_erase_irq(&table->array, srq->srqn);
+	if (!tmp || tmp != srq)
+		return;
+
+	err = destroy_srq_split(dev, srq);
+	if (err)
+		return;
+
+	mlx5_core_res_put(&srq->common);
+	wait_for_completion(&srq->common.free);
+}
+
+int mlx5_cmd_query_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+		       struct mlx5_srq_attr *out)
+{
+	if (!dev->mdev->issi)
+		return query_srq_cmd(dev, srq, out);
+	switch (srq->common.res) {
+	case MLX5_RES_XSRQ:
+		return query_xrc_srq_cmd(dev, srq, out);
+	case MLX5_RES_XRQ:
+		return query_xrq_cmd(dev, srq, out);
+	default:
+		return query_rmp_cmd(dev, srq, out);
+	}
+}
+
+int mlx5_cmd_arm_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
+		     u16 lwm, int is_srq)
+{
+	if (!dev->mdev->issi)
+		return arm_srq_cmd(dev, srq, lwm, is_srq);
+	switch (srq->common.res) {
+	case MLX5_RES_XSRQ:
+		return arm_xrc_srq_cmd(dev, srq, lwm);
+	case MLX5_RES_XRQ:
+		return arm_xrq_cmd(dev, srq, lwm);
+	default:
+		return arm_rmp_cmd(dev, srq, lwm);
+	}
+}
+
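+/* Forward SRQ catastrophic-error and limit events from the device notifier
+ * chain to the affected SRQ, holding a temporary reference while doing so.
+ */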
+static int srq_event_notifier(struct notifier_block *nb,
+			      unsigned long type, void *data)
+{
+	struct mlx5_srq_table *table;
+	struct mlx5_core_srq *srq;
+	struct mlx5_eqe *eqe;
+	u32 srqn;
+
+	if (type != MLX5_EVENT_TYPE_SRQ_CATAS_ERROR &&
+	    type != MLX5_EVENT_TYPE_SRQ_RQ_LIMIT)
+		return NOTIFY_DONE;
+
+	table = container_of(nb, struct mlx5_srq_table, nb);
+
+	eqe = data;
+	srqn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
+
+	xa_lock(&table->array);
+	srq = xa_load(&table->array, srqn);
+	if (srq)
+		refcount_inc(&srq->common.refcount);
+	xa_unlock(&table->array);
+
+	if (!srq)
+		return NOTIFY_OK;
+
+	srq->event(srq, eqe->type);
+
+	mlx5_core_res_put(&srq->common);
+
+	return NOTIFY_OK;
+}
+
+int mlx5_init_srq_table(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_srq_table *table = &dev->srq_table;
+
+	memset(table, 0, sizeof(*table));
+	xa_init_flags(&table->array, XA_FLAGS_LOCK_IRQ);
+
+	table->nb.notifier_call = srq_event_notifier;
+	mlx5_notifier_register(dev->mdev, &table->nb);
+
+	return 0;
+}
+
+void mlx5_cleanup_srq_table(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_srq_table *table = &dev->srq_table;
+
+	mlx5_notifier_unregister(dev->mdev, &table->nb);
+}
diff --git a/drivers/infiniband/hw/mthca/Kconfig b/drivers/infiniband/hw/mthca/Kconfig
index da314c3..66ff527 100644
--- a/drivers/infiniband/hw/mthca/Kconfig
+++ b/drivers/infiniband/hw/mthca/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_MTHCA
 	tristate "Mellanox HCA support"
 	depends on PCI
diff --git a/drivers/infiniband/hw/mthca/mthca_allocator.c b/drivers/infiniband/hw/mthca/mthca_allocator.c
index aaf10dd..aef1d27 100644
--- a/drivers/infiniband/hw/mthca/mthca_allocator.c
+++ b/drivers/infiniband/hw/mthca/mthca_allocator.c
@@ -214,8 +214,6 @@
 
 		dma_unmap_addr_set(&buf->direct, mapping, t);
 
-		memset(buf->direct.buf, 0, size);
-
 		while (t & ((1 << shift) - 1)) {
 			--shift;
 			npages *= 2;
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c
index 83aa47e..bdf5ed3 100644
--- a/drivers/infiniband/hw/mthca/mthca_cmd.c
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.c
@@ -292,12 +292,6 @@
 		err = mthca_cmd_post_hcr(dev, in_param, out_param, in_modifier,
 					 op_modifier, op, token, event);
 
-	/*
-	 * Make sure that our HCR writes don't get mixed in with
-	 * writes from another CPU starting a FW command.
-	 */
-	mmiowb();
-
 	mutex_unlock(&dev->cmd.hcr_mutex);
 	return err;
 }
diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c
index a6531ff..c3cfea2 100644
--- a/drivers/infiniband/hw/mthca/mthca_cq.c
+++ b/drivers/infiniband/hw/mthca/mthca_cq.c
@@ -77,7 +77,7 @@
 	__be32 ci_db;		/* Arbel only */
 	__be32 state_db;	/* Arbel only */
 	u32    reserved;
-} __attribute__((packed));
+} __packed;
 
 #define MTHCA_CQ_STATUS_OK          ( 0 << 28)
 #define MTHCA_CQ_STATUS_OVERFLOW    ( 9 << 28)
@@ -211,11 +211,6 @@
 		mthca_write64(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn, incr - 1,
 			      dev->kar + MTHCA_CQ_DOORBELL,
 			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
-		/*
-		 * Make sure doorbells don't leak out of CQ spinlock
-		 * and reach the HCA out of order:
-		 */
-		mmiowb();
 	}
 }
 
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
index 220a3e4..bfd4eeb 100644
--- a/drivers/infiniband/hw/mthca/mthca_dev.h
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -510,7 +510,8 @@
 void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe);
 
 int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd,
-		    struct ib_srq_attr *attr, struct mthca_srq *srq);
+		    struct ib_srq_attr *attr, struct mthca_srq *srq,
+		    struct ib_udata *udata);
 void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq);
 int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 		     enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
@@ -547,7 +548,8 @@
 		   enum ib_qp_type type,
 		   enum ib_sig_type send_policy,
 		   struct ib_qp_cap *cap,
-		   struct mthca_qp *qp);
+		   struct mthca_qp *qp,
+		   struct ib_udata *udata);
 int mthca_alloc_sqp(struct mthca_dev *dev,
 		    struct mthca_pd *pd,
 		    struct mthca_cq *send_cq,
@@ -556,7 +558,8 @@
 		    struct ib_qp_cap *cap,
 		    int qpn,
 		    int port,
-		    struct mthca_sqp *sqp);
+		    struct mthca_sqp *sqp,
+		    struct ib_udata *udata);
 void mthca_free_qp(struct mthca_dev *dev, struct mthca_qp *qp);
 int mthca_create_ah(struct mthca_dev *dev,
 		    struct mthca_pd *pd,
diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c
index 30400ea..2cdf686 100644
--- a/drivers/infiniband/hw/mthca/mthca_eq.c
+++ b/drivers/infiniband/hw/mthca/mthca_eq.c
@@ -63,7 +63,7 @@
 	__be32 consumer_index;
 	__be32 producer_index;
 	u32    reserved3[4];
-} __attribute__((packed));
+} __packed;
 
 #define MTHCA_EQ_STATUS_OK          ( 0 << 28)
 #define MTHCA_EQ_STATUS_OVERFLOW    ( 9 << 28)
@@ -130,7 +130,7 @@
 		u32 raw[6];
 		struct {
 			__be32 cqn;
-		} __attribute__((packed)) comp;
+		} __packed comp;
 		struct {
 			u16    reserved1;
 			__be16 token;
@@ -138,27 +138,27 @@
 			u8     reserved3[3];
 			u8     status;
 			__be64 out_param;
-		} __attribute__((packed)) cmd;
+		} __packed cmd;
 		struct {
 			__be32 qpn;
-		} __attribute__((packed)) qp;
+		} __packed qp;
 		struct {
 			__be32 srqn;
-		} __attribute__((packed)) srq;
+		} __packed srq;
 		struct {
 			__be32 cqn;
 			u32    reserved1;
 			u8     reserved2[3];
 			u8     syndrome;
-		} __attribute__((packed)) cq_err;
+		} __packed cq_err;
 		struct {
 			u32    reserved1[2];
 			__be32 port;
-		} __attribute__((packed)) port_change;
+		} __packed port_change;
 	} event;
 	u8 reserved3[3];
 	u8 owner;
-} __attribute__((packed));
+} __packed;
 
 #define  MTHCA_EQ_ENTRY_OWNER_SW      (0 << 7)
 #define  MTHCA_EQ_ENTRY_OWNER_HW      (1 << 7)
diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c
index 093f775..7ad517d 100644
--- a/drivers/infiniband/hw/mthca/mthca_mad.c
+++ b/drivers/infiniband/hw/mthca/mthca_mad.c
@@ -58,8 +58,9 @@
 
 	ret = ib_query_port(&dev->ib_dev, port_num, tprops);
 	if (ret) {
-		printk(KERN_WARNING "ib_query_port failed (%d) for %s port %d\n",
-		       ret, dev->ib_dev.name, port_num);
+		dev_warn(&dev->ib_dev.dev,
+			 "ib_query_port failed (%d) for port %d\n", ret,
+			 port_num);
 		goto out;
 	}
 
@@ -88,13 +89,13 @@
 	rdma_ah_set_port_num(&ah_attr, port_num);
 
 	new_ah = rdma_create_ah(dev->send_agent[port_num - 1][0]->qp->pd,
-				&ah_attr);
+				&ah_attr, 0);
 	if (IS_ERR(new_ah))
 		return;
 
 	spin_lock_irqsave(&dev->sm_lock, flags);
 	if (dev->sm_ah[port_num - 1])
-		rdma_destroy_ah(dev->sm_ah[port_num - 1]);
+		rdma_destroy_ah(dev->sm_ah[port_num - 1], 0);
 	dev->sm_ah[port_num - 1] = new_ah;
 	spin_unlock_irqrestore(&dev->sm_lock, flags);
 }
@@ -346,6 +347,7 @@
 		}
 
 		if (dev->sm_ah[p])
-			rdma_destroy_ah(dev->sm_ah[p]);
+			rdma_destroy_ah(dev->sm_ah[p],
+					RDMA_DESTROY_AH_SLEEPABLE);
 	}
 }
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
index f3e80de..fe9654a 100644
--- a/drivers/infiniband/hw/mthca/mthca_main.c
+++ b/drivers/infiniband/hw/mthca/mthca_main.c
@@ -961,7 +961,7 @@
 	/* We can handle large RDMA requests, so allow larger segments. */
 	dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024);
 
-	mdev = (struct mthca_dev *) ib_alloc_device(sizeof *mdev);
+	mdev = ib_alloc_device(mthca_dev, ib_dev);
 	if (!mdev) {
 		dev_err(&pdev->dev, "Device struct alloc failed, "
 			"aborting.\n");
@@ -986,7 +986,8 @@
 		goto err_free_dev;
 	}
 
-	if (mthca_cmd_init(mdev)) {
+	err = mthca_cmd_init(mdev);
+	if (err) {
 		mthca_err(mdev, "Failed to init command interface, aborting.\n");
 		goto err_free_dev;
 	}
@@ -1014,8 +1015,7 @@
 
 	err = mthca_setup_hca(mdev);
 	if (err == -EBUSY && (mdev->mthca_flags & MTHCA_FLAG_MSI_X)) {
-		if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
-			pci_free_irq_vectors(pdev);
+		pci_free_irq_vectors(pdev);
 		mdev->mthca_flags &= ~MTHCA_FLAG_MSI_X;
 
 		err = mthca_setup_hca(mdev);
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
index cc9c0c8..edccfd6 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -472,7 +472,8 @@
 		goto out;
 	}
 
-	ret = get_user_pages_fast(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages);
+	ret = get_user_pages_fast(uaddr & PAGE_MASK, 1,
+				  FOLL_WRITE | FOLL_LONGTERM, pages);
 	if (ret < 0)
 		goto out;
 
@@ -481,7 +482,7 @@
 
 	ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
 	if (ret < 0) {
-		put_page(pages[0]);
+		put_user_page(pages[0]);
 		goto out;
 	}
 
@@ -489,7 +490,7 @@
 				 mthca_uarc_virt(dev, uar, i));
 	if (ret) {
 		pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
-		put_page(sg_page(&db_tab->page[i].mem));
+		put_user_page(sg_page(&db_tab->page[i].mem));
 		goto out;
 	}
 
@@ -555,7 +556,7 @@
 		if (db_tab->page[i].uvirt) {
 			mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, i), 1);
 			pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
-			put_page(sg_page(&db_tab->page[i].mem));
+			put_user_page(sg_page(&db_tab->page[i].mem));
 		}
 	}
 
@@ -623,8 +624,9 @@
 	page = dev->db_tab->page + end;
 
 alloc:
-	page->db_rec = dma_zalloc_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
-					   &page->mapping, GFP_KERNEL);
+	page->db_rec = dma_alloc_coherent(&dev->pdev->dev,
+					  MTHCA_ICM_PAGE_SIZE, &page->mapping,
+					  GFP_KERNEL);
 	if (!page->db_rec) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c
index 6686042..4250b2c 100644
--- a/drivers/infiniband/hw/mthca/mthca_mr.c
+++ b/drivers/infiniband/hw/mthca/mthca_mr.c
@@ -60,7 +60,7 @@
 	__be64 mtt_seg;
 	__be32 mtt_sz;		/* Arbel only */
 	u32    reserved[2];
-} __attribute__((packed));
+} __packed;
 
 #define MTHCA_MPT_FLAG_SW_OWNS       (0xfUL << 28)
 #define MTHCA_MPT_FLAG_MIO           (1 << 17)
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 0d3473b..23554d8 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -37,6 +37,7 @@
 #include <rdma/ib_smi.h>
 #include <rdma/ib_umem.h>
 #include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include <linux/sched.h>
 #include <linux/slab.h>
@@ -300,17 +301,16 @@
 	return err;
 }
 
-static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev,
-						struct ib_udata *udata)
+static int mthca_alloc_ucontext(struct ib_ucontext *uctx,
+				struct ib_udata *udata)
 {
-	struct mthca_alloc_ucontext_resp uresp;
-	struct mthca_ucontext           *context;
+	struct ib_device *ibdev = uctx->device;
+	struct mthca_alloc_ucontext_resp uresp = {};
+	struct mthca_ucontext *context = to_mucontext(uctx);
 	int                              err;
 
 	if (!(to_mdev(ibdev)->active))
-		return ERR_PTR(-EAGAIN);
-
-	memset(&uresp, 0, sizeof uresp);
+		return -EAGAIN;
 
 	uresp.qp_tab_size = to_mdev(ibdev)->limits.num_qps;
 	if (mthca_is_memfree(to_mdev(ibdev)))
@@ -318,44 +318,33 @@
 	else
 		uresp.uarc_size = 0;
 
-	context = kmalloc(sizeof *context, GFP_KERNEL);
-	if (!context)
-		return ERR_PTR(-ENOMEM);
-
 	err = mthca_uar_alloc(to_mdev(ibdev), &context->uar);
-	if (err) {
-		kfree(context);
-		return ERR_PTR(err);
-	}
+	if (err)
+		return err;
 
 	context->db_tab = mthca_init_user_db_tab(to_mdev(ibdev));
 	if (IS_ERR(context->db_tab)) {
 		err = PTR_ERR(context->db_tab);
 		mthca_uar_free(to_mdev(ibdev), &context->uar);
-		kfree(context);
-		return ERR_PTR(err);
+		return err;
 	}
 
-	if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) {
+	if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
 		mthca_cleanup_user_db_tab(to_mdev(ibdev), &context->uar, context->db_tab);
 		mthca_uar_free(to_mdev(ibdev), &context->uar);
-		kfree(context);
-		return ERR_PTR(-EFAULT);
+		return -EFAULT;
 	}
 
 	context->reg_mr_warned = 0;
 
-	return &context->ibucontext;
+	return 0;
 }
 
-static int mthca_dealloc_ucontext(struct ib_ucontext *context)
+static void mthca_dealloc_ucontext(struct ib_ucontext *context)
 {
 	mthca_cleanup_user_db_tab(to_mdev(context->device), &to_mucontext(context)->uar,
 				  to_mucontext(context)->db_tab);
 	mthca_uar_free(to_mdev(context->device), &to_mucontext(context)->uar);
-	kfree(to_mucontext(context));
-
-	return 0;
 }
 
 static int mthca_mmap_uar(struct ib_ucontext *context,
@@ -374,151 +363,114 @@
 	return 0;
 }
 
-static struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev,
-				    struct ib_ucontext *context,
-				    struct ib_udata *udata)
+static int mthca_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
-	struct mthca_pd *pd;
+	struct ib_device *ibdev = ibpd->device;
+	struct mthca_pd *pd = to_mpd(ibpd);
 	int err;
 
-	pd = kmalloc(sizeof *pd, GFP_KERNEL);
-	if (!pd)
-		return ERR_PTR(-ENOMEM);
+	err = mthca_pd_alloc(to_mdev(ibdev), !udata, pd);
+	if (err)
+		return err;
 
-	err = mthca_pd_alloc(to_mdev(ibdev), !context, pd);
-	if (err) {
-		kfree(pd);
-		return ERR_PTR(err);
-	}
-
-	if (context) {
+	if (udata) {
 		if (ib_copy_to_udata(udata, &pd->pd_num, sizeof (__u32))) {
 			mthca_pd_free(to_mdev(ibdev), pd);
-			kfree(pd);
-			return ERR_PTR(-EFAULT);
+			return -EFAULT;
 		}
 	}
 
-	return &pd->ibpd;
+	return 0;
 }
 
-static int mthca_dealloc_pd(struct ib_pd *pd)
+static void mthca_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
 	mthca_pd_free(to_mdev(pd->device), to_mpd(pd));
-	kfree(pd);
-
-	return 0;
 }
 
-static struct ib_ah *mthca_ah_create(struct ib_pd *pd,
-				     struct rdma_ah_attr *ah_attr,
-				     struct ib_udata *udata)
+static int mthca_ah_create(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr,
+			   u32 flags, struct ib_udata *udata)
 
 {
-	int err;
-	struct mthca_ah *ah;
+	struct mthca_ah *ah = to_mah(ibah);
 
-	ah = kmalloc(sizeof *ah, GFP_ATOMIC);
-	if (!ah)
-		return ERR_PTR(-ENOMEM);
-
-	err = mthca_create_ah(to_mdev(pd->device), to_mpd(pd), ah_attr, ah);
-	if (err) {
-		kfree(ah);
-		return ERR_PTR(err);
-	}
-
-	return &ah->ibah;
+	return mthca_create_ah(to_mdev(ibah->device), to_mpd(ibah->pd), ah_attr,
+			       ah);
 }
 
-static int mthca_ah_destroy(struct ib_ah *ah)
+static void mthca_ah_destroy(struct ib_ah *ah, u32 flags)
 {
 	mthca_destroy_ah(to_mdev(ah->device), to_mah(ah));
-	kfree(ah);
-
-	return 0;
 }
 
-static struct ib_srq *mthca_create_srq(struct ib_pd *pd,
-				       struct ib_srq_init_attr *init_attr,
-				       struct ib_udata *udata)
+static int mthca_create_srq(struct ib_srq *ibsrq,
+			    struct ib_srq_init_attr *init_attr,
+			    struct ib_udata *udata)
 {
 	struct mthca_create_srq ucmd;
-	struct mthca_ucontext *context = NULL;
-	struct mthca_srq *srq;
+	struct mthca_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mthca_ucontext, ibucontext);
+	struct mthca_srq *srq = to_msrq(ibsrq);
 	int err;
 
 	if (init_attr->srq_type != IB_SRQT_BASIC)
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 
-	srq = kmalloc(sizeof *srq, GFP_KERNEL);
-	if (!srq)
-		return ERR_PTR(-ENOMEM);
+	if (udata) {
+		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
+			return -EFAULT;
 
-	if (pd->uobject) {
-		context = to_mucontext(pd->uobject->context);
-
-		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
-			err = -EFAULT;
-			goto err_free;
-		}
-
-		err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
+		err = mthca_map_user_db(to_mdev(ibsrq->device), &context->uar,
 					context->db_tab, ucmd.db_index,
 					ucmd.db_page);
 
 		if (err)
-			goto err_free;
+			return err;
 
 		srq->mr.ibmr.lkey = ucmd.lkey;
 		srq->db_index     = ucmd.db_index;
 	}
 
-	err = mthca_alloc_srq(to_mdev(pd->device), to_mpd(pd),
-			      &init_attr->attr, srq);
+	err = mthca_alloc_srq(to_mdev(ibsrq->device), to_mpd(ibsrq->pd),
+			      &init_attr->attr, srq, udata);
 
-	if (err && pd->uobject)
-		mthca_unmap_user_db(to_mdev(pd->device), &context->uar,
+	if (err && udata)
+		mthca_unmap_user_db(to_mdev(ibsrq->device), &context->uar,
 				    context->db_tab, ucmd.db_index);
 
 	if (err)
-		goto err_free;
+		return err;
 
-	if (context && ib_copy_to_udata(udata, &srq->srqn, sizeof (__u32))) {
-		mthca_free_srq(to_mdev(pd->device), srq);
-		err = -EFAULT;
-		goto err_free;
+	if (context && ib_copy_to_udata(udata, &srq->srqn, sizeof(__u32))) {
+		mthca_free_srq(to_mdev(ibsrq->device), srq);
+		return -EFAULT;
 	}
 
-	return &srq->ibsrq;
-
-err_free:
-	kfree(srq);
-
-	return ERR_PTR(err);
+	return 0;
 }
 
-static int mthca_destroy_srq(struct ib_srq *srq)
+static void mthca_destroy_srq(struct ib_srq *srq, struct ib_udata *udata)
 {
-	struct mthca_ucontext *context;
-
-	if (srq->uobject) {
-		context = to_mucontext(srq->uobject->context);
+	if (udata) {
+		struct mthca_ucontext *context =
+			rdma_udata_to_drv_context(
+				udata,
+				struct mthca_ucontext,
+				ibucontext);
 
 		mthca_unmap_user_db(to_mdev(srq->device), &context->uar,
 				    context->db_tab, to_msrq(srq)->db_index);
 	}
 
 	mthca_free_srq(to_mdev(srq->device), to_msrq(srq));
-	kfree(srq);
-
-	return 0;
 }
 
 static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
 				     struct ib_qp_init_attr *init_attr,
 				     struct ib_udata *udata)
 {
+	struct mthca_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mthca_ucontext, ibucontext);
 	struct mthca_create_qp ucmd;
 	struct mthca_qp *qp;
 	int err;
@@ -531,15 +483,11 @@
 	case IB_QPT_UC:
 	case IB_QPT_UD:
 	{
-		struct mthca_ucontext *context;
-
-		qp = kmalloc(sizeof *qp, GFP_KERNEL);
+		qp = kzalloc(sizeof(*qp), GFP_KERNEL);
 		if (!qp)
 			return ERR_PTR(-ENOMEM);
 
-		if (pd->uobject) {
-			context = to_mucontext(pd->uobject->context);
-
+		if (udata) {
 			if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
 				kfree(qp);
 				return ERR_PTR(-EFAULT);
@@ -574,11 +522,9 @@
 				     to_mcq(init_attr->send_cq),
 				     to_mcq(init_attr->recv_cq),
 				     init_attr->qp_type, init_attr->sq_sig_type,
-				     &init_attr->cap, qp);
+				     &init_attr->cap, qp, udata);
 
-		if (err && pd->uobject) {
-			context = to_mucontext(pd->uobject->context);
-
+		if (err && udata) {
 			mthca_unmap_user_db(to_mdev(pd->device),
 					    &context->uar,
 					    context->db_tab,
@@ -596,10 +542,10 @@
 	case IB_QPT_GSI:
 	{
 		/* Don't allow userspace to create special QPs */
-		if (pd->uobject)
+		if (udata)
 			return ERR_PTR(-EINVAL);
 
-		qp = kmalloc(sizeof (struct mthca_sqp), GFP_KERNEL);
+		qp = kzalloc(sizeof(struct mthca_sqp), GFP_KERNEL);
 		if (!qp)
 			return ERR_PTR(-ENOMEM);
 
@@ -610,7 +556,7 @@
 				      to_mcq(init_attr->recv_cq),
 				      init_attr->sq_sig_type, &init_attr->cap,
 				      qp->ibqp.qp_num, init_attr->port_num,
-				      to_msqp(qp));
+				      to_msqp(qp), udata);
 		break;
 	}
 	default:
@@ -632,16 +578,22 @@
 	return &qp->ibqp;
 }
 
-static int mthca_destroy_qp(struct ib_qp *qp)
+static int mthca_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
 {
-	if (qp->uobject) {
+	if (udata) {
+		struct mthca_ucontext *context =
+			rdma_udata_to_drv_context(
+				udata,
+				struct mthca_ucontext,
+				ibucontext);
+
 		mthca_unmap_user_db(to_mdev(qp->device),
-				    &to_mucontext(qp->uobject->context)->uar,
-				    to_mucontext(qp->uobject->context)->db_tab,
+				    &context->uar,
+				    context->db_tab,
 				    to_mqp(qp)->sq.db_index);
 		mthca_unmap_user_db(to_mdev(qp->device),
-				    &to_mucontext(qp->uobject->context)->uar,
-				    to_mucontext(qp->uobject->context)->db_tab,
+				    &context->uar,
+				    context->db_tab,
 				    to_mqp(qp)->rq.db_index);
 	}
 	mthca_free_qp(to_mdev(qp->device), to_mqp(qp));
@@ -649,47 +601,45 @@
 	return 0;
 }
 
-static struct ib_cq *mthca_create_cq(struct ib_device *ibdev,
-				     const struct ib_cq_init_attr *attr,
-				     struct ib_ucontext *context,
-				     struct ib_udata *udata)
+static int mthca_create_cq(struct ib_cq *ibcq,
+			   const struct ib_cq_init_attr *attr,
+			   struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	int entries = attr->cqe;
 	struct mthca_create_cq ucmd;
 	struct mthca_cq *cq;
 	int nent;
 	int err;
+	struct mthca_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mthca_ucontext, ibucontext);
 
 	if (attr->flags)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (entries < 1 || entries > to_mdev(ibdev)->limits.max_cqes)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
-	if (context) {
-		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
-			return ERR_PTR(-EFAULT);
+	if (udata) {
+		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
+			return -EFAULT;
 
-		err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
-					to_mucontext(context)->db_tab,
-					ucmd.set_db_index, ucmd.set_db_page);
+		err = mthca_map_user_db(to_mdev(ibdev), &context->uar,
+					context->db_tab, ucmd.set_db_index,
+					ucmd.set_db_page);
 		if (err)
-			return ERR_PTR(err);
+			return err;
 
-		err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
-					to_mucontext(context)->db_tab,
-					ucmd.arm_db_index, ucmd.arm_db_page);
+		err = mthca_map_user_db(to_mdev(ibdev), &context->uar,
+					context->db_tab, ucmd.arm_db_index,
+					ucmd.arm_db_page);
 		if (err)
 			goto err_unmap_set;
 	}
 
-	cq = kmalloc(sizeof *cq, GFP_KERNEL);
-	if (!cq) {
-		err = -ENOMEM;
-		goto err_unmap_arm;
-	}
+	cq = to_mcq(ibcq);
 
-	if (context) {
+	if (udata) {
 		cq->buf.mr.ibmr.lkey = ucmd.lkey;
 		cq->set_ci_db_index  = ucmd.set_db_index;
 		cq->arm_db_index     = ucmd.arm_db_index;
@@ -698,37 +648,33 @@
 	for (nent = 1; nent <= entries; nent <<= 1)
 		; /* nothing */
 
-	err = mthca_init_cq(to_mdev(ibdev), nent,
-			    context ? to_mucontext(context) : NULL,
-			    context ? ucmd.pdn : to_mdev(ibdev)->driver_pd.pd_num,
+	err = mthca_init_cq(to_mdev(ibdev), nent, context,
+			    udata ? ucmd.pdn : to_mdev(ibdev)->driver_pd.pd_num,
 			    cq);
 	if (err)
-		goto err_free;
+		goto err_unmap_arm;
 
-	if (context && ib_copy_to_udata(udata, &cq->cqn, sizeof (__u32))) {
+	if (udata && ib_copy_to_udata(udata, &cq->cqn, sizeof(__u32))) {
 		mthca_free_cq(to_mdev(ibdev), cq);
 		err = -EFAULT;
-		goto err_free;
+		goto err_unmap_arm;
 	}
 
 	cq->resize_buf = NULL;
 
-	return &cq->ibcq;
-
-err_free:
-	kfree(cq);
+	return 0;
 
 err_unmap_arm:
-	if (context)
-		mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
-				    to_mucontext(context)->db_tab, ucmd.arm_db_index);
+	if (udata)
+		mthca_unmap_user_db(to_mdev(ibdev), &context->uar,
+				    context->db_tab, ucmd.arm_db_index);
 
 err_unmap_set:
-	if (context)
-		mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
-				    to_mucontext(context)->db_tab, ucmd.set_db_index);
+	if (udata)
+		mthca_unmap_user_db(to_mdev(ibdev), &context->uar,
+				    context->db_tab, ucmd.set_db_index);
 
-	return ERR_PTR(err);
+	return err;
 }
 
 static int mthca_alloc_resize_buf(struct mthca_dev *dev, struct mthca_cq *cq,
@@ -852,22 +798,25 @@
 	return ret;
 }
 
-static int mthca_destroy_cq(struct ib_cq *cq)
+static void mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
-	if (cq->uobject) {
+	if (udata) {
+		struct mthca_ucontext *context =
+			rdma_udata_to_drv_context(
+				udata,
+				struct mthca_ucontext,
+				ibucontext);
+
 		mthca_unmap_user_db(to_mdev(cq->device),
-				    &to_mucontext(cq->uobject->context)->uar,
-				    to_mucontext(cq->uobject->context)->db_tab,
+				    &context->uar,
+				    context->db_tab,
 				    to_mcq(cq)->arm_db_index);
 		mthca_unmap_user_db(to_mdev(cq->device),
-				    &to_mucontext(cq->uobject->context)->uar,
-				    to_mucontext(cq->uobject->context)->db_tab,
+				    &context->uar,
+				    context->db_tab,
 				    to_mcq(cq)->set_ci_db_index);
 	}
 	mthca_free_cq(to_mdev(cq->device), to_mcq(cq));
-	kfree(cq);
-
-	return 0;
 }
 
 static inline u32 convert_access(int acc)
@@ -906,22 +855,23 @@
 				       u64 virt, int acc, struct ib_udata *udata)
 {
 	struct mthca_dev *dev = to_mdev(pd->device);
-	struct scatterlist *sg;
+	struct sg_dma_page_iter sg_iter;
+	struct mthca_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mthca_ucontext, ibucontext);
 	struct mthca_mr *mr;
 	struct mthca_reg_mr ucmd;
 	u64 *pages;
-	int shift, n, len;
-	int i, k, entry;
+	int n, i;
 	int err = 0;
 	int write_mtt_size;
 
 	if (udata->inlen < sizeof ucmd) {
-		if (!to_mucontext(pd->uobject->context)->reg_mr_warned) {
+		if (!context->reg_mr_warned) {
 			mthca_warn(dev, "Process '%s' did not pass in MR attrs.\n",
 				   current->comm);
 			mthca_warn(dev, "  Update libmthca to fix this.\n");
 		}
-		++to_mucontext(pd->uobject->context)->reg_mr_warned;
+		++context->reg_mr_warned;
 		ucmd.mr_attrs = 0;
 	} else if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
 		return ERR_PTR(-EFAULT);
@@ -930,7 +880,7 @@
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
-	mr->umem = ib_umem_get(pd->uobject->context, start, length, acc,
+	mr->umem = ib_umem_get(udata, start, length, acc,
 			       ucmd.mr_attrs & MTHCA_MR_DMASYNC);
 
 	if (IS_ERR(mr->umem)) {
@@ -938,8 +888,7 @@
 		goto err;
 	}
 
-	shift = mr->umem->page_shift;
-	n = mr->umem->nmap;
+	n = ib_umem_num_pages(mr->umem);
 
 	mr->mtt = mthca_alloc_mtt(dev, n);
 	if (IS_ERR(mr->mtt)) {
@@ -957,21 +906,19 @@
 
 	write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages));
 
-	for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) {
-		len = sg_dma_len(sg) >> shift;
-		for (k = 0; k < len; ++k) {
-			pages[i++] = sg_dma_address(sg) + (k << shift);
-			/*
-			 * Be friendly to write_mtt and pass it chunks
-			 * of appropriate size.
-			 */
-			if (i == write_mtt_size) {
-				err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
-				if (err)
-					goto mtt_done;
-				n += i;
-				i = 0;
-			}
+	for_each_sg_dma_page(mr->umem->sg_head.sgl, &sg_iter, mr->umem->nmap, 0) {
+		pages[i++] = sg_page_iter_dma_address(&sg_iter);
+
+		/*
+		 * Be friendly to write_mtt and pass it chunks
+		 * of appropriate size.
+		 */
+		if (i == write_mtt_size) {
+			err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
+			if (err)
+				goto mtt_done;
+			n += i;
+			i = 0;
 		}
 	}
 
@@ -982,7 +929,7 @@
 	if (err)
 		goto err_mtt;
 
-	err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, shift, virt, length,
+	err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, PAGE_SHIFT, virt, length,
 			     convert_access(acc), mr);
 
 	if (err)
@@ -1001,13 +948,12 @@
 	return ERR_PTR(err);
 }
 
-static int mthca_dereg_mr(struct ib_mr *mr)
+static int mthca_dereg_mr(struct ib_mr *mr, struct ib_udata *udata)
 {
 	struct mthca_mr *mmr = to_mmr(mr);
 
 	mthca_free_mr(to_mdev(mr->device), mmr);
-	if (mmr->umem)
-		ib_umem_release(mmr->umem);
+	ib_umem_release(mmr->umem);
 	kfree(mmr);
 
 	return 0;
@@ -1076,19 +1022,22 @@
 	return err;
 }
 
-static ssize_t show_rev(struct device *device, struct device_attribute *attr,
-			char *buf)
+static ssize_t hw_rev_show(struct device *device,
+			   struct device_attribute *attr, char *buf)
 {
 	struct mthca_dev *dev =
-		container_of(device, struct mthca_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mthca_dev, ib_dev);
+
 	return sprintf(buf, "%x\n", dev->rev_id);
 }
+static DEVICE_ATTR_RO(hw_rev);
 
-static ssize_t show_hca(struct device *device, struct device_attribute *attr,
-			char *buf)
+static ssize_t hca_type_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
 {
 	struct mthca_dev *dev =
-		container_of(device, struct mthca_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mthca_dev, ib_dev);
+
 	switch (dev->pdev->device) {
 	case PCI_DEVICE_ID_MELLANOX_TAVOR:
 		return sprintf(buf, "MT23108\n");
@@ -1103,23 +1052,27 @@
 		return sprintf(buf, "unknown\n");
 	}
 }
+static DEVICE_ATTR_RO(hca_type);
 
-static ssize_t show_board(struct device *device, struct device_attribute *attr,
-			  char *buf)
+static ssize_t board_id_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
 {
 	struct mthca_dev *dev =
-		container_of(device, struct mthca_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct mthca_dev, ib_dev);
+
 	return sprintf(buf, "%.*s\n", MTHCA_BOARD_ID_LEN, dev->board_id);
 }
+static DEVICE_ATTR_RO(board_id);
 
-static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
+static struct attribute *mthca_dev_attributes[] = {
+	&dev_attr_hw_rev.attr,
+	&dev_attr_hca_type.attr,
+	&dev_attr_board_id.attr,
+	NULL
+};
 
-static struct device_attribute *mthca_dev_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_hca_type,
-	&dev_attr_board_id
+static const struct attribute_group mthca_attr_group = {
+	.attrs = mthca_dev_attributes,
 };
 
 static int mthca_init_node_data(struct mthca_dev *dev)
@@ -1189,19 +1142,103 @@
 		 (int) dev->fw_ver & 0xffff);
 }
 
+static const struct ib_device_ops mthca_dev_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_MTHCA,
+	.uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION,
+	.uverbs_no_driver_id_binding = 1,
+
+	.alloc_pd = mthca_alloc_pd,
+	.alloc_ucontext = mthca_alloc_ucontext,
+	.attach_mcast = mthca_multicast_attach,
+	.create_ah = mthca_ah_create,
+	.create_cq = mthca_create_cq,
+	.create_qp = mthca_create_qp,
+	.dealloc_pd = mthca_dealloc_pd,
+	.dealloc_ucontext = mthca_dealloc_ucontext,
+	.dereg_mr = mthca_dereg_mr,
+	.destroy_ah = mthca_ah_destroy,
+	.destroy_cq = mthca_destroy_cq,
+	.destroy_qp = mthca_destroy_qp,
+	.detach_mcast = mthca_multicast_detach,
+	.get_dev_fw_str = get_dev_fw_str,
+	.get_dma_mr = mthca_get_dma_mr,
+	.get_port_immutable = mthca_port_immutable,
+	.mmap = mthca_mmap_uar,
+	.modify_device = mthca_modify_device,
+	.modify_port = mthca_modify_port,
+	.modify_qp = mthca_modify_qp,
+	.poll_cq = mthca_poll_cq,
+	.process_mad = mthca_process_mad,
+	.query_ah = mthca_ah_query,
+	.query_device = mthca_query_device,
+	.query_gid = mthca_query_gid,
+	.query_pkey = mthca_query_pkey,
+	.query_port = mthca_query_port,
+	.query_qp = mthca_query_qp,
+	.reg_user_mr = mthca_reg_user_mr,
+	.resize_cq = mthca_resize_cq,
+
+	INIT_RDMA_OBJ_SIZE(ib_ah, mthca_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, mthca_cq, ibcq),
+	INIT_RDMA_OBJ_SIZE(ib_pd, mthca_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, mthca_ucontext, ibucontext),
+};
+
+static const struct ib_device_ops mthca_dev_arbel_srq_ops = {
+	.create_srq = mthca_create_srq,
+	.destroy_srq = mthca_destroy_srq,
+	.modify_srq = mthca_modify_srq,
+	.post_srq_recv = mthca_arbel_post_srq_recv,
+	.query_srq = mthca_query_srq,
+
+	INIT_RDMA_OBJ_SIZE(ib_srq, mthca_srq, ibsrq),
+};
+
+static const struct ib_device_ops mthca_dev_tavor_srq_ops = {
+	.create_srq = mthca_create_srq,
+	.destroy_srq = mthca_destroy_srq,
+	.modify_srq = mthca_modify_srq,
+	.post_srq_recv = mthca_tavor_post_srq_recv,
+	.query_srq = mthca_query_srq,
+
+	INIT_RDMA_OBJ_SIZE(ib_srq, mthca_srq, ibsrq),
+};
+
+static const struct ib_device_ops mthca_dev_arbel_fmr_ops = {
+	.alloc_fmr = mthca_alloc_fmr,
+	.dealloc_fmr = mthca_dealloc_fmr,
+	.map_phys_fmr = mthca_arbel_map_phys_fmr,
+	.unmap_fmr = mthca_unmap_fmr,
+};
+
+static const struct ib_device_ops mthca_dev_tavor_fmr_ops = {
+	.alloc_fmr = mthca_alloc_fmr,
+	.dealloc_fmr = mthca_dealloc_fmr,
+	.map_phys_fmr = mthca_tavor_map_phys_fmr,
+	.unmap_fmr = mthca_unmap_fmr,
+};
+
+static const struct ib_device_ops mthca_dev_arbel_ops = {
+	.post_recv = mthca_arbel_post_receive,
+	.post_send = mthca_arbel_post_send,
+	.req_notify_cq = mthca_arbel_arm_cq,
+};
+
+static const struct ib_device_ops mthca_dev_tavor_ops = {
+	.post_recv = mthca_tavor_post_receive,
+	.post_send = mthca_tavor_post_send,
+	.req_notify_cq = mthca_tavor_arm_cq,
+};
+
 int mthca_register_device(struct mthca_dev *dev)
 {
 	int ret;
-	int i;
 
 	ret = mthca_init_node_data(dev);
 	if (ret)
 		return ret;
 
-	strlcpy(dev->ib_dev.name, "mthca%d", IB_DEVICE_NAME_MAX);
-	dev->ib_dev.owner                = THIS_MODULE;
-
-	dev->ib_dev.uverbs_abi_ver	 = MTHCA_UVERBS_ABI_VERSION;
 	dev->ib_dev.uverbs_cmd_mask	 =
 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
 		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
@@ -1224,26 +1261,8 @@
 	dev->ib_dev.phys_port_cnt        = dev->limits.num_ports;
 	dev->ib_dev.num_comp_vectors     = 1;
 	dev->ib_dev.dev.parent           = &dev->pdev->dev;
-	dev->ib_dev.query_device         = mthca_query_device;
-	dev->ib_dev.query_port           = mthca_query_port;
-	dev->ib_dev.modify_device        = mthca_modify_device;
-	dev->ib_dev.modify_port          = mthca_modify_port;
-	dev->ib_dev.query_pkey           = mthca_query_pkey;
-	dev->ib_dev.query_gid            = mthca_query_gid;
-	dev->ib_dev.alloc_ucontext       = mthca_alloc_ucontext;
-	dev->ib_dev.dealloc_ucontext     = mthca_dealloc_ucontext;
-	dev->ib_dev.mmap                 = mthca_mmap_uar;
-	dev->ib_dev.alloc_pd             = mthca_alloc_pd;
-	dev->ib_dev.dealloc_pd           = mthca_dealloc_pd;
-	dev->ib_dev.create_ah            = mthca_ah_create;
-	dev->ib_dev.query_ah             = mthca_ah_query;
-	dev->ib_dev.destroy_ah           = mthca_ah_destroy;
 
 	if (dev->mthca_flags & MTHCA_FLAG_SRQ) {
-		dev->ib_dev.create_srq           = mthca_create_srq;
-		dev->ib_dev.modify_srq           = mthca_modify_srq;
-		dev->ib_dev.query_srq            = mthca_query_srq;
-		dev->ib_dev.destroy_srq          = mthca_destroy_srq;
 		dev->ib_dev.uverbs_cmd_mask	|=
 			(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
 			(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
@@ -1251,65 +1270,36 @@
 			(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
 
 		if (mthca_is_memfree(dev))
-			dev->ib_dev.post_srq_recv = mthca_arbel_post_srq_recv;
+			ib_set_device_ops(&dev->ib_dev,
+					  &mthca_dev_arbel_srq_ops);
 		else
-			dev->ib_dev.post_srq_recv = mthca_tavor_post_srq_recv;
+			ib_set_device_ops(&dev->ib_dev,
+					  &mthca_dev_tavor_srq_ops);
 	}
 
-	dev->ib_dev.create_qp            = mthca_create_qp;
-	dev->ib_dev.modify_qp            = mthca_modify_qp;
-	dev->ib_dev.query_qp             = mthca_query_qp;
-	dev->ib_dev.destroy_qp           = mthca_destroy_qp;
-	dev->ib_dev.create_cq            = mthca_create_cq;
-	dev->ib_dev.resize_cq            = mthca_resize_cq;
-	dev->ib_dev.destroy_cq           = mthca_destroy_cq;
-	dev->ib_dev.poll_cq              = mthca_poll_cq;
-	dev->ib_dev.get_dma_mr           = mthca_get_dma_mr;
-	dev->ib_dev.reg_user_mr          = mthca_reg_user_mr;
-	dev->ib_dev.dereg_mr             = mthca_dereg_mr;
-	dev->ib_dev.get_port_immutable   = mthca_port_immutable;
-	dev->ib_dev.get_dev_fw_str       = get_dev_fw_str;
-
 	if (dev->mthca_flags & MTHCA_FLAG_FMR) {
-		dev->ib_dev.alloc_fmr            = mthca_alloc_fmr;
-		dev->ib_dev.unmap_fmr            = mthca_unmap_fmr;
-		dev->ib_dev.dealloc_fmr          = mthca_dealloc_fmr;
 		if (mthca_is_memfree(dev))
-			dev->ib_dev.map_phys_fmr = mthca_arbel_map_phys_fmr;
+			ib_set_device_ops(&dev->ib_dev,
+					  &mthca_dev_arbel_fmr_ops);
 		else
-			dev->ib_dev.map_phys_fmr = mthca_tavor_map_phys_fmr;
+			ib_set_device_ops(&dev->ib_dev,
+					  &mthca_dev_tavor_fmr_ops);
 	}
 
-	dev->ib_dev.attach_mcast         = mthca_multicast_attach;
-	dev->ib_dev.detach_mcast         = mthca_multicast_detach;
-	dev->ib_dev.process_mad          = mthca_process_mad;
+	ib_set_device_ops(&dev->ib_dev, &mthca_dev_ops);
 
-	if (mthca_is_memfree(dev)) {
-		dev->ib_dev.req_notify_cq = mthca_arbel_arm_cq;
-		dev->ib_dev.post_send     = mthca_arbel_post_send;
-		dev->ib_dev.post_recv     = mthca_arbel_post_receive;
-	} else {
-		dev->ib_dev.req_notify_cq = mthca_tavor_arm_cq;
-		dev->ib_dev.post_send     = mthca_tavor_post_send;
-		dev->ib_dev.post_recv     = mthca_tavor_post_receive;
-	}
+	if (mthca_is_memfree(dev))
+		ib_set_device_ops(&dev->ib_dev, &mthca_dev_arbel_ops);
+	else
+		ib_set_device_ops(&dev->ib_dev, &mthca_dev_tavor_ops);
 
 	mutex_init(&dev->cap_mask_mutex);
 
-	dev->ib_dev.driver_id = RDMA_DRIVER_MTHCA;
-	ret = ib_register_device(&dev->ib_dev, NULL);
+	rdma_set_device_sysfs_group(&dev->ib_dev, &mthca_attr_group);
+	ret = ib_register_device(&dev->ib_dev, "mthca%d");
 	if (ret)
 		return ret;
 
-	for (i = 0; i < ARRAY_SIZE(mthca_dev_attributes); ++i) {
-		ret = device_create_file(&dev->ib_dev.dev,
-					 mthca_dev_attributes[i]);
-		if (ret) {
-			ib_unregister_device(&dev->ib_dev);
-			return ret;
-		}
-	}
-
 	mthca_start_catas_poll(dev);
 
 	return 0;
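
Note on the mthca_provider.c hunks above: they follow the current (v5.x) RDMA registration pattern, where per-driver callbacks live in a const struct ib_device_ops attached with ib_set_device_ops(), sysfs attributes move into an attribute_group registered via rdma_set_device_sysfs_group(), and the device name format string is passed to ib_register_device(). A minimal sketch of that pattern (illustrative only, not part of this diff; all foo_* names are hypothetical) might look like:

	/* Sketch only; foo_ucontext/foo_dev/foo_attr_group are hypothetical. */
	struct foo_ucontext {
		struct ib_ucontext ibucontext;
	};

	static const struct ib_device_ops foo_dev_ops = {
		.owner = THIS_MODULE,
		.driver_id = RDMA_DRIVER_UNKNOWN, /* a real driver uses its own rdma_driver_id */
		.uverbs_abi_ver = 1,

		/* lets the core allocate the ucontext and embed the ib_ucontext member */
		INIT_RDMA_OBJ_SIZE(ib_ucontext, foo_ucontext, ibucontext),
	};

	static int foo_register(struct foo_dev *dev)
	{
		ib_set_device_ops(&dev->ib_dev, &foo_dev_ops);
		rdma_set_device_sysfs_group(&dev->ib_dev, &foo_attr_group);
		return ib_register_device(&dev->ib_dev, "foo%d");
	}
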
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index 3d37f23..d04c245 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -42,6 +42,7 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_cache.h>
 #include <rdma/ib_pack.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "mthca_dev.h"
 #include "mthca_cmd.h"
@@ -114,7 +115,7 @@
 	u8     hop_limit;
 	__be32 sl_tclass_flowlabel;
 	u8     rgid[16];
-} __attribute__((packed));
+} __packed;
 
 struct mthca_qp_context {
 	__be32 flags;
@@ -153,14 +154,14 @@
 	__be16 rq_wqe_counter;	/* reserved on Tavor */
 	__be16 sq_wqe_counter;	/* reserved on Tavor */
 	u32    reserved3[18];
-} __attribute__((packed));
+} __packed;
 
 struct mthca_qp_param {
 	__be32 opt_param_mask;
 	u32    reserved1;
 	struct mthca_qp_context context;
 	u32    reserved2[62];
-} __attribute__((packed));
+} __packed;
 
 enum {
 	MTHCA_QP_OPTPAR_ALT_ADDR_PATH     = 1 << 0,
@@ -554,10 +555,14 @@
 
 static int __mthca_modify_qp(struct ib_qp *ibqp,
 			     const struct ib_qp_attr *attr, int attr_mask,
-			     enum ib_qp_state cur_state, enum ib_qp_state new_state)
+			     enum ib_qp_state cur_state,
+			     enum ib_qp_state new_state,
+			     struct ib_udata *udata)
 {
 	struct mthca_dev *dev = to_mdev(ibqp->device);
 	struct mthca_qp *qp = to_mqp(ibqp);
+	struct mthca_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mthca_ucontext, ibucontext);
 	struct mthca_mailbox *mailbox;
 	struct mthca_qp_param *qp_param;
 	struct mthca_qp_context *qp_context;
@@ -619,8 +624,7 @@
 	/* leave arbel_sched_queue as 0 */
 
 	if (qp->ibqp.uobject)
-		qp_context->usr_page =
-			cpu_to_be32(to_mucontext(qp->ibqp.uobject->context)->uar.index);
+		qp_context->usr_page = cpu_to_be32(context->uar.index);
 	else
 		qp_context->usr_page = cpu_to_be32(dev->driver_uar.index);
 	qp_context->local_qpn  = cpu_to_be32(qp->qpn);
@@ -872,8 +876,8 @@
 
 	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
 
-	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask,
-				IB_LINK_LAYER_UNSPECIFIED)) {
+	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+				attr_mask)) {
 		mthca_dbg(dev, "Bad QP transition (transport %d) "
 			  "%d->%d with attr 0x%08x\n",
 			  qp->transport, cur_state, new_state,
@@ -913,7 +917,8 @@
 		goto out;
 	}
 
-	err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+	err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state,
+				udata);
 
 out:
 	mutex_unlock(&qp->mutex);
@@ -981,7 +986,8 @@
  */
 static int mthca_alloc_wqe_buf(struct mthca_dev *dev,
 			       struct mthca_pd *pd,
-			       struct mthca_qp *qp)
+			       struct mthca_qp *qp,
+			       struct ib_udata *udata)
 {
 	int size;
 	int err = -ENOMEM;
@@ -1048,7 +1054,7 @@
 	 * allocate anything.  All we need is to calculate the WQE
 	 * sizes and the send_wqe_offset, so we're done now.
 	 */
-	if (pd->ibpd.uobject)
+	if (udata)
 		return 0;
 
 	size = PAGE_ALIGN(qp->send_wqe_offset +
@@ -1155,7 +1161,8 @@
 				 struct mthca_cq *send_cq,
 				 struct mthca_cq *recv_cq,
 				 enum ib_sig_type send_policy,
-				 struct mthca_qp *qp)
+				 struct mthca_qp *qp,
+				 struct ib_udata *udata)
 {
 	int ret;
 	int i;
@@ -1178,7 +1185,7 @@
 	if (ret)
 		return ret;
 
-	ret = mthca_alloc_wqe_buf(dev, pd, qp);
+	ret = mthca_alloc_wqe_buf(dev, pd, qp, udata);
 	if (ret) {
 		mthca_unmap_memfree(dev, qp);
 		return ret;
@@ -1191,7 +1198,7 @@
 	 * will be allocated and buffers will be initialized in
 	 * userspace.
 	 */
-	if (pd->ibpd.uobject)
+	if (udata)
 		return 0;
 
 	ret = mthca_alloc_memfree(dev, qp);
@@ -1285,7 +1292,8 @@
 		   enum ib_qp_type type,
 		   enum ib_sig_type send_policy,
 		   struct ib_qp_cap *cap,
-		   struct mthca_qp *qp)
+		   struct mthca_qp *qp,
+		   struct ib_udata *udata)
 {
 	int err;
 
@@ -1308,7 +1316,7 @@
 	qp->port = 0;
 
 	err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
-				    send_policy, qp);
+				    send_policy, qp, udata);
 	if (err) {
 		mthca_free(&dev->qp_table.alloc, qp->qpn);
 		return err;
@@ -1360,7 +1368,8 @@
 		    struct ib_qp_cap *cap,
 		    int qpn,
 		    int port,
-		    struct mthca_sqp *sqp)
+		    struct mthca_sqp *sqp,
+		    struct ib_udata *udata)
 {
 	u32 mqpn = qpn * 2 + dev->qp_table.sqp_start + port - 1;
 	int err;
@@ -1391,7 +1400,7 @@
 	sqp->qp.transport = MLX;
 
 	err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
-				    send_policy, &sqp->qp);
+				    send_policy, &sqp->qp, udata);
 	if (err)
 		goto err_out_free;
 
@@ -1800,11 +1809,6 @@
 			      (qp->qpn << 8) | size0,
 			      dev->kar + MTHCA_SEND_DOORBELL,
 			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
-		/*
-		 * Make sure doorbells don't leak out of SQ spinlock
-		 * and reach the HCA out of order:
-		 */
-		mmiowb();
 	}
 
 	qp->sq.next_ind = ind;
@@ -1915,12 +1919,6 @@
 	qp->rq.next_ind = ind;
 	qp->rq.head    += nreq;
 
-	/*
-	 * Make sure doorbells don't leak out of RQ spinlock and reach
-	 * the HCA out of order:
-	 */
-	mmiowb();
-
 	spin_unlock_irqrestore(&qp->rq.lock, flags);
 	return err;
 }
@@ -2155,12 +2153,6 @@
 			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
 	}
 
-	/*
-	 * Make sure doorbells don't leak out of SQ spinlock and reach
-	 * the HCA out of order:
-	 */
-	mmiowb();
-
 	spin_unlock_irqrestore(&qp->sq.lock, flags);
 	return err;
 }
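
Note on the mthca_qp.c hunks above: the tests on pd->ibpd.uobject / qp->ibqp.uobject are replaced by checks on the struct ib_udata pointer that is now threaded through the internal helpers, and the driver ucontext is recovered with rdma_udata_to_drv_context(), which evaluates to NULL for in-kernel callers. A minimal sketch of that user/kernel split (not part of this diff; foo_* names are hypothetical) might be:

	/* Sketch only; foo_ucontext and foo_dev are hypothetical. */
	struct foo_ucontext {
		struct ib_ucontext ibucontext;
		u32 uar_index;
	};

	static u32 foo_uar_index(struct foo_dev *dev, struct ib_udata *udata)
	{
		struct foo_ucontext *uctx = rdma_udata_to_drv_context(
			udata, struct foo_ucontext, ibucontext);

		/* udata -- and therefore uctx -- is NULL for in-kernel consumers */
		return udata ? uctx->uar_index : dev->driver_uar_index;
	}
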
diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c
index 9a3fc6f..a85935c 100644
--- a/drivers/infiniband/hw/mthca/mthca_srq.c
+++ b/drivers/infiniband/hw/mthca/mthca_srq.c
@@ -36,6 +36,8 @@
 
 #include <asm/io.h>
 
+#include <rdma/uverbs_ioctl.h>
+
 #include "mthca_dev.h"
 #include "mthca_cmd.h"
 #include "mthca_memfree.h"
@@ -95,17 +97,20 @@
 static void mthca_tavor_init_srq_context(struct mthca_dev *dev,
 					 struct mthca_pd *pd,
 					 struct mthca_srq *srq,
-					 struct mthca_tavor_srq_context *context)
+					 struct mthca_tavor_srq_context *context,
+					 struct ib_udata *udata)
 {
+	struct mthca_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mthca_ucontext, ibucontext);
+
 	memset(context, 0, sizeof *context);
 
 	context->wqe_base_ds = cpu_to_be64(1 << (srq->wqe_shift - 4));
 	context->state_pd    = cpu_to_be32(pd->pd_num);
 	context->lkey        = cpu_to_be32(srq->mr.ibmr.lkey);
 
-	if (pd->ibpd.uobject)
-		context->uar =
-			cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index);
+	if (udata)
+		context->uar = cpu_to_be32(ucontext->uar.index);
 	else
 		context->uar = cpu_to_be32(dev->driver_uar.index);
 }
@@ -113,8 +118,11 @@
 static void mthca_arbel_init_srq_context(struct mthca_dev *dev,
 					 struct mthca_pd *pd,
 					 struct mthca_srq *srq,
-					 struct mthca_arbel_srq_context *context)
+					 struct mthca_arbel_srq_context *context,
+					 struct ib_udata *udata)
 {
+	struct mthca_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mthca_ucontext, ibucontext);
 	int logsize, max;
 
 	memset(context, 0, sizeof *context);
@@ -129,9 +137,8 @@
 	context->lkey = cpu_to_be32(srq->mr.ibmr.lkey);
 	context->db_index = cpu_to_be32(srq->db_index);
 	context->logstride_usrpage = cpu_to_be32((srq->wqe_shift - 4) << 29);
-	if (pd->ibpd.uobject)
-		context->logstride_usrpage |=
-			cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index);
+	if (udata)
+		context->logstride_usrpage |= cpu_to_be32(ucontext->uar.index);
 	else
 		context->logstride_usrpage |= cpu_to_be32(dev->driver_uar.index);
 	context->eq_pd = cpu_to_be32(MTHCA_EQ_ASYNC << 24 | pd->pd_num);
@@ -145,14 +152,14 @@
 }
 
 static int mthca_alloc_srq_buf(struct mthca_dev *dev, struct mthca_pd *pd,
-			       struct mthca_srq *srq)
+			       struct mthca_srq *srq, struct ib_udata *udata)
 {
 	struct mthca_data_seg *scatter;
 	void *wqe;
 	int err;
 	int i;
 
-	if (pd->ibpd.uobject)
+	if (udata)
 		return 0;
 
 	srq->wrid = kmalloc_array(srq->max, sizeof(u64), GFP_KERNEL);
@@ -197,7 +204,8 @@
 }
 
 int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd,
-		    struct ib_srq_attr *attr, struct mthca_srq *srq)
+		    struct ib_srq_attr *attr, struct mthca_srq *srq,
+		    struct ib_udata *udata)
 {
 	struct mthca_mailbox *mailbox;
 	int ds;
@@ -235,7 +243,7 @@
 		if (err)
 			goto err_out;
 
-		if (!pd->ibpd.uobject) {
+		if (!udata) {
 			srq->db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SRQ,
 						       srq->srqn, &srq->db);
 			if (srq->db_index < 0) {
@@ -251,7 +259,7 @@
 		goto err_out_db;
 	}
 
-	err = mthca_alloc_srq_buf(dev, pd, srq);
+	err = mthca_alloc_srq_buf(dev, pd, srq, udata);
 	if (err)
 		goto err_out_mailbox;
 
@@ -261,9 +269,9 @@
 	mutex_init(&srq->mutex);
 
 	if (mthca_is_memfree(dev))
-		mthca_arbel_init_srq_context(dev, pd, srq, mailbox->buf);
+		mthca_arbel_init_srq_context(dev, pd, srq, mailbox->buf, udata);
 	else
-		mthca_tavor_init_srq_context(dev, pd, srq, mailbox->buf);
+		mthca_tavor_init_srq_context(dev, pd, srq, mailbox->buf, udata);
 
 	err = mthca_SW2HW_SRQ(dev, mailbox, srq->srqn);
 
@@ -297,14 +305,14 @@
 		mthca_warn(dev, "HW2SW_SRQ failed (%d)\n", err);
 
 err_out_free_buf:
-	if (!pd->ibpd.uobject)
+	if (!udata)
 		mthca_free_srq_buf(dev, srq);
 
 err_out_mailbox:
 	mthca_free_mailbox(dev, mailbox);
 
 err_out_db:
-	if (!pd->ibpd.uobject && mthca_is_memfree(dev))
+	if (!udata && mthca_is_memfree(dev))
 		mthca_free_db(dev, MTHCA_DB_TYPE_SRQ, srq->db_index);
 
 err_out_icm:
@@ -562,12 +570,6 @@
 			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
 	}
 
-	/*
-	 * Make sure doorbells don't leak out of SRQ spinlock and
-	 * reach the HCA out of order:
-	 */
-	mmiowb();
-
 	spin_unlock_irqrestore(&srq->lock, flags);
 	return err;
 }
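
Note on the dropped mmiowb() calls in the QP and SRQ doorbell paths: they follow the v5.2 mmiowb() rework, after which spin_unlock() provides the required MMIO write ordering on the architectures that need it, so drivers no longer issue the barrier explicitly. A hedged sketch of the resulting doorbell pattern (not part of this diff; foo_* names and FOO_DOORBELL are hypothetical):

	static void foo_ring_doorbell(struct foo_dev *dev, u32 val)
	{
		unsigned long flags;

		spin_lock_irqsave(&dev->doorbell_lock, flags);
		writel(val, dev->kar + FOO_DOORBELL);
		/*
		 * No mmiowb() here: spin_unlock() now orders the MMIO write
		 * on architectures that require it.
		 */
		spin_unlock_irqrestore(&dev->doorbell_lock, flags);
	}
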
diff --git a/drivers/infiniband/hw/nes/Kconfig b/drivers/infiniband/hw/nes/Kconfig
deleted file mode 100644
index 7964eba..0000000
--- a/drivers/infiniband/hw/nes/Kconfig
+++ /dev/null
@@ -1,15 +0,0 @@
-config INFINIBAND_NES
-	tristate "NetEffect RNIC Driver"
-	depends on PCI && INET && INFINIBAND
-	select LIBCRC32C
-	---help---
-	  This is the RDMA Network Interface Card (RNIC) driver for
-	  NetEffect Ethernet Cluster Server Adapters.
-
-config INFINIBAND_NES_DEBUG
-	bool "Verbose debugging output"
-	depends on INFINIBAND_NES
-	default n
-	---help---
-	  This option enables debug messages from the NetEffect RNIC
-	  driver.  Select this if you are diagnosing a problem.
diff --git a/drivers/infiniband/hw/nes/Makefile b/drivers/infiniband/hw/nes/Makefile
deleted file mode 100644
index 97820c2..0000000
--- a/drivers/infiniband/hw/nes/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-obj-$(CONFIG_INFINIBAND_NES) += iw_nes.o
-
-iw_nes-objs := nes.o nes_hw.o nes_nic.o nes_utils.o nes_verbs.o nes_cm.o nes_mgt.o
diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c
deleted file mode 100644
index 42b68aa..0000000
--- a/drivers/infiniband/hw/nes/nes.c
+++ /dev/null
@@ -1,1208 +0,0 @@
-/*
- * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/ethtool.h>
-#include <linux/mii.h>
-#include <linux/if_vlan.h>
-#include <linux/crc32.h>
-#include <linux/in.h>
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/if_arp.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/byteorder.h>
-#include <rdma/ib_smi.h>
-#include <rdma/ib_verbs.h>
-#include <rdma/ib_pack.h>
-#include <rdma/iw_cm.h>
-
-#include "nes.h"
-
-#include <net/netevent.h>
-#include <net/neighbour.h>
-#include <linux/route.h>
-#include <net/ip_fib.h>
-
-MODULE_AUTHOR("NetEffect");
-MODULE_DESCRIPTION("NetEffect RNIC Low-level iWARP Driver");
-MODULE_LICENSE("Dual BSD/GPL");
-
-int interrupt_mod_interval = 0;
-
-/* Interoperability */
-int mpa_version = 1;
-module_param(mpa_version, int, 0644);
-MODULE_PARM_DESC(mpa_version, "MPA version to be used int MPA Req/Resp (0 or 1)");
-
-/* Interoperability */
-int disable_mpa_crc = 0;
-module_param(disable_mpa_crc, int, 0644);
-MODULE_PARM_DESC(disable_mpa_crc, "Disable checking of MPA CRC");
-
-unsigned int nes_drv_opt = NES_DRV_OPT_DISABLE_INT_MOD | NES_DRV_OPT_ENABLE_PAU;
-module_param(nes_drv_opt, int, 0644);
-MODULE_PARM_DESC(nes_drv_opt, "Driver option parameters");
-
-unsigned int nes_debug_level = 0;
-module_param_named(debug_level, nes_debug_level, uint, 0644);
-MODULE_PARM_DESC(debug_level, "Enable debug output level");
-
-unsigned int wqm_quanta = 0x10000;
-module_param(wqm_quanta, int, 0644);
-MODULE_PARM_DESC(wqm_quanta, "WQM quanta");
-
-static bool limit_maxrdreqsz;
-module_param(limit_maxrdreqsz, bool, 0644);
-MODULE_PARM_DESC(limit_maxrdreqsz, "Limit max read request size to 256 Bytes");
-
-LIST_HEAD(nes_adapter_list);
-static LIST_HEAD(nes_dev_list);
-
-atomic_t qps_destroyed;
-
-static unsigned int ee_flsh_adapter;
-static unsigned int sysfs_nonidx_addr;
-static unsigned int sysfs_idx_addr;
-
-static const struct pci_device_id nes_pci_table[] = {
-	{ PCI_VDEVICE(NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020), },
-	{ PCI_VDEVICE(NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020_KR), },
-	{0}
-};
-
-MODULE_DEVICE_TABLE(pci, nes_pci_table);
-
-static int nes_inetaddr_event(struct notifier_block *, unsigned long, void *);
-static int nes_net_event(struct notifier_block *, unsigned long, void *);
-static int nes_notifiers_registered;
-
-
-static struct notifier_block nes_inetaddr_notifier = {
-	.notifier_call = nes_inetaddr_event
-};
-
-static struct notifier_block nes_net_notifier = {
-	.notifier_call = nes_net_event
-};
-
-/**
- * nes_inetaddr_event
- */
-static int nes_inetaddr_event(struct notifier_block *notifier,
-		unsigned long event, void *ptr)
-{
-	struct in_ifaddr *ifa = ptr;
-	struct net_device *event_netdev = ifa->ifa_dev->dev;
-	struct nes_device *nesdev;
-	struct net_device *netdev;
-	struct net_device *upper_dev;
-	struct nes_vnic *nesvnic;
-	unsigned int is_bonded;
-
-	nes_debug(NES_DBG_NETDEV, "nes_inetaddr_event: ip address %pI4, netmask %pI4.\n",
-		  &ifa->ifa_address, &ifa->ifa_mask);
-	list_for_each_entry(nesdev, &nes_dev_list, list) {
-		nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p. (%s)\n",
-				nesdev, nesdev->netdev[0]->name);
-		netdev = nesdev->netdev[0];
-		nesvnic = netdev_priv(netdev);
-		upper_dev = netdev_master_upper_dev_get(netdev);
-		is_bonded = netif_is_bond_slave(netdev) &&
-			    (upper_dev == event_netdev);
-		if ((netdev == event_netdev) || is_bonded) {
-			if (nesvnic->rdma_enabled == 0) {
-				nes_debug(NES_DBG_NETDEV, "Returning without processing event for %s since"
-						" RDMA is not enabled.\n",
-						netdev->name);
-				return NOTIFY_OK;
-			}
-			/* we have ifa->ifa_address/mask here if we need it */
-			switch (event) {
-				case NETDEV_DOWN:
-					nes_debug(NES_DBG_NETDEV, "event:DOWN\n");
-					nes_write_indexed(nesdev,
-							NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)), 0);
-
-					nes_manage_arp_cache(netdev, netdev->dev_addr,
-							ntohl(nesvnic->local_ipaddr), NES_ARP_DELETE);
-					nesvnic->local_ipaddr = 0;
-					if (is_bonded)
-						continue;
-					else
-						return NOTIFY_OK;
-					break;
-				case NETDEV_UP:
-					nes_debug(NES_DBG_NETDEV, "event:UP\n");
-
-					if (nesvnic->local_ipaddr != 0) {
-						nes_debug(NES_DBG_NETDEV, "Interface already has local_ipaddr\n");
-						return NOTIFY_OK;
-					}
-					/* fall through */
-				case NETDEV_CHANGEADDR:
-					/* Add the address to the IP table */
-					if (upper_dev) {
-						struct in_device *in;
-
-						rcu_read_lock();
-						in = __in_dev_get_rcu(upper_dev);
-						nesvnic->local_ipaddr = in->ifa_list->ifa_address;
-						rcu_read_unlock();
-					} else {
-						nesvnic->local_ipaddr = ifa->ifa_address;
-					}
-
-					nes_write_indexed(nesdev,
-							NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)),
-							ntohl(nesvnic->local_ipaddr));
-					nes_manage_arp_cache(netdev, netdev->dev_addr,
-							ntohl(nesvnic->local_ipaddr), NES_ARP_ADD);
-					if (is_bonded)
-						continue;
-					else
-						return NOTIFY_OK;
-					break;
-				default:
-					break;
-			}
-		}
-	}
-
-	return NOTIFY_DONE;
-}
-
-
-/**
- * nes_net_event
- */
-static int nes_net_event(struct notifier_block *notifier,
-		unsigned long event, void *ptr)
-{
-	struct neighbour *neigh = ptr;
-	struct nes_device *nesdev;
-	struct net_device *netdev;
-	struct nes_vnic *nesvnic;
-
-	switch (event) {
-		case NETEVENT_NEIGH_UPDATE:
-			list_for_each_entry(nesdev, &nes_dev_list, list) {
-				/* nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p.\n", nesdev); */
-				netdev = nesdev->netdev[0];
-				nesvnic = netdev_priv(netdev);
-				if (netdev == neigh->dev) {
-					if (nesvnic->rdma_enabled == 0) {
-						nes_debug(NES_DBG_NETDEV, "Skipping device %s since no RDMA\n",
-								netdev->name);
-					} else {
-						if (neigh->nud_state & NUD_VALID) {
-							nes_manage_arp_cache(neigh->dev, neigh->ha,
-									ntohl(*(__be32 *)neigh->primary_key), NES_ARP_ADD);
-						} else {
-							nes_manage_arp_cache(neigh->dev, neigh->ha,
-									ntohl(*(__be32 *)neigh->primary_key), NES_ARP_DELETE);
-						}
-					}
-					return NOTIFY_OK;
-				}
-			}
-			break;
-		default:
-			nes_debug(NES_DBG_NETDEV, "NETEVENT_ %lu undefined\n", event);
-			break;
-	}
-
-	return NOTIFY_DONE;
-}
-
-
-/**
- * nes_add_ref
- */
-void nes_add_ref(struct ib_qp *ibqp)
-{
-	struct nes_qp *nesqp;
-
-	nesqp = to_nesqp(ibqp);
-	nes_debug(NES_DBG_QP, "Bumping refcount for QP%u.  Pre-inc value = %u\n",
-			ibqp->qp_num, atomic_read(&nesqp->refcount));
-	atomic_inc(&nesqp->refcount);
-}
-
-static void nes_cqp_rem_ref_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request)
-{
-	unsigned long flags;
-	struct nes_qp *nesqp = cqp_request->cqp_callback_pointer;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-
-	atomic_inc(&qps_destroyed);
-
-	/* Free the control structures */
-
-	if (nesqp->pbl_vbase) {
-		pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size,
-				nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase);
-		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-		nesadapter->free_256pbl++;
-		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-		pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase);
-		nesqp->pbl_vbase = NULL;
-
-	} else {
-		pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size,
-				nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase);
-	}
-	nes_free_resource(nesadapter, nesadapter->allocated_qps, nesqp->hwqp.qp_id);
-
-	nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = NULL;
-	kfree(nesqp->allocated_buffer);
-
-}
-
-/**
- * nes_rem_ref
- */
-void nes_rem_ref(struct ib_qp *ibqp)
-{
-	u64 u64temp;
-	struct nes_qp *nesqp;
-	struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	struct nes_cqp_request *cqp_request;
-	u32 opcode;
-
-	nesqp = to_nesqp(ibqp);
-
-	if (atomic_read(&nesqp->refcount) == 0) {
-		printk(KERN_INFO PFX "%s: Reference count already 0 for QP%d, last aeq = 0x%04X.\n",
-				__func__, ibqp->qp_num, nesqp->last_aeq);
-		BUG();
-	}
-
-	if (atomic_dec_and_test(&nesqp->refcount)) {
-		if (nesqp->pau_mode)
-			nes_destroy_pau_qp(nesdev, nesqp);
-
-		/* Destroy the QP */
-		cqp_request = nes_get_cqp_request(nesdev);
-		if (cqp_request == NULL) {
-			nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n");
-			return;
-		}
-		cqp_request->waiting = 0;
-		cqp_request->callback = 1;
-		cqp_request->cqp_callback = nes_cqp_rem_ref_callback;
-		cqp_request->cqp_callback_pointer = nesqp;
-		cqp_wqe = &cqp_request->cqp_wqe;
-
-		nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-		opcode = NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_IWARP;
-
-		if (nesqp->hte_added) {
-			opcode  |= NES_CQP_QP_DEL_HTE;
-			nesqp->hte_added = 0;
-		}
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
-		u64temp = (u64)nesqp->nesqp_context_pbase;
-		set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
-		nes_post_cqp_request(nesdev, cqp_request);
-	}
-}
-
-
-/**
- * nes_get_qp
- */
-struct ib_qp *nes_get_qp(struct ib_device *device, int qpn)
-{
-	struct nes_vnic *nesvnic = to_nesvnic(device);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-
-	if ((qpn < NES_FIRST_QPN) || (qpn >= (NES_FIRST_QPN + nesadapter->max_qp)))
-		return NULL;
-
-	return &nesadapter->qp_table[qpn - NES_FIRST_QPN]->ibqp;
-}
-
-
-/**
- * nes_print_macaddr
- */
-static void nes_print_macaddr(struct net_device *netdev)
-{
-	nes_debug(NES_DBG_INIT, "%s: %pM, IRQ %u\n",
-		  netdev->name, netdev->dev_addr, netdev->irq);
-}
-
-/**
- * nes_interrupt - handle interrupts
- */
-static irqreturn_t nes_interrupt(int irq, void *dev_id)
-{
-	struct nes_device *nesdev = (struct nes_device *)dev_id;
-	int handled = 0;
-	u32 int_mask;
-	u32 int_req;
-	u32 int_stat;
-	u32 intf_int_stat;
-	u32 timer_stat;
-
-	if (nesdev->msi_enabled) {
-		/* No need to read the interrupt pending register if msi is enabled */
-		handled = 1;
-	} else {
-		if (unlikely(nesdev->nesadapter->hw_rev == NE020_REV)) {
-			/* Master interrupt enable provides synchronization for kicking off bottom half
-			  when interrupt sharing is going on */
-			int_mask = nes_read32(nesdev->regs + NES_INT_MASK);
-			if (int_mask & 0x80000000) {
-				/* Check interrupt status to see if this might be ours */
-				int_stat = nes_read32(nesdev->regs + NES_INT_STAT);
-				int_req = nesdev->int_req;
-				if (int_stat&int_req) {
-					/* if interesting CEQ or AEQ is pending, claim the interrupt */
-					if ((int_stat&int_req) & (~(NES_INT_TIMER|NES_INT_INTF))) {
-						handled = 1;
-					} else {
-						if (((int_stat & int_req) & NES_INT_TIMER) == NES_INT_TIMER) {
-							/* Timer might be running but might be for another function */
-							timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT);
-							if ((timer_stat & nesdev->timer_int_req) != 0) {
-								handled = 1;
-							}
-						}
-						if ((((int_stat & int_req) & NES_INT_INTF) == NES_INT_INTF) &&
-								(handled == 0)) {
-							intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT);
-							if ((intf_int_stat & nesdev->intf_int_req) != 0) {
-								handled = 1;
-							}
-						}
-					}
-					if (handled) {
-						nes_write32(nesdev->regs+NES_INT_MASK, int_mask & (~0x80000000));
-						int_mask = nes_read32(nesdev->regs+NES_INT_MASK);
-						/* Save off the status to save an additional read */
-						nesdev->int_stat = int_stat;
-						nesdev->napi_isr_ran = 1;
-					}
-				}
-			}
-		} else {
-			handled = nes_read32(nesdev->regs+NES_INT_PENDING);
-		}
-	}
-
-	if (handled) {
-
-		if (nes_napi_isr(nesdev) == 0) {
-			tasklet_schedule(&nesdev->dpc_tasklet);
-
-		}
-		return IRQ_HANDLED;
-	} else {
-		return IRQ_NONE;
-	}
-}
-
-
-/**
- * nes_probe - Device initialization
- */
-static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent)
-{
-	struct net_device *netdev = NULL;
-	struct nes_device *nesdev = NULL;
-	int ret = 0;
-	void __iomem *mmio_regs = NULL;
-	u8 hw_rev;
-
-	assert(pcidev != NULL);
-	assert(ent != NULL);
-
-	printk(KERN_INFO PFX "NetEffect RNIC driver v%s loading. (%s)\n",
-			DRV_VERSION, pci_name(pcidev));
-
-	ret = pci_enable_device(pcidev);
-	if (ret) {
-		printk(KERN_ERR PFX "Unable to enable PCI device. (%s)\n", pci_name(pcidev));
-		goto bail0;
-	}
-
-	nes_debug(NES_DBG_INIT, "BAR0 (@0x%08lX) size = 0x%lX bytes\n",
-			(long unsigned int)pci_resource_start(pcidev, BAR_0),
-			(long unsigned int)pci_resource_len(pcidev, BAR_0));
-	nes_debug(NES_DBG_INIT, "BAR1 (@0x%08lX) size = 0x%lX bytes\n",
-			(long unsigned int)pci_resource_start(pcidev, BAR_1),
-			(long unsigned int)pci_resource_len(pcidev, BAR_1));
-
-	/* Make sure PCI base addr are MMIO */
-	if (!(pci_resource_flags(pcidev, BAR_0) & IORESOURCE_MEM) ||
-			!(pci_resource_flags(pcidev, BAR_1) & IORESOURCE_MEM)) {
-		printk(KERN_ERR PFX "PCI regions not an MMIO resource\n");
-		ret = -ENODEV;
-		goto bail1;
-	}
-
-	/* Reserve PCI I/O and memory resources */
-	ret = pci_request_regions(pcidev, DRV_NAME);
-	if (ret) {
-		printk(KERN_ERR PFX "Unable to request regions. (%s)\n", pci_name(pcidev));
-		goto bail1;
-	}
-
-	if ((sizeof(dma_addr_t) > 4)) {
-		ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64));
-		if (ret < 0) {
-			printk(KERN_ERR PFX "64b DMA mask configuration failed\n");
-			goto bail2;
-		}
-		ret = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(64));
-		if (ret) {
-			printk(KERN_ERR PFX "64b DMA consistent mask configuration failed\n");
-			goto bail2;
-		}
-	} else {
-		ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32));
-		if (ret < 0) {
-			printk(KERN_ERR PFX "32b DMA mask configuration failed\n");
-			goto bail2;
-		}
-		ret = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(32));
-		if (ret) {
-			printk(KERN_ERR PFX "32b DMA consistent mask configuration failed\n");
-			goto bail2;
-		}
-	}
-
-	pci_set_master(pcidev);
-
-	/* Allocate hardware structure */
-	nesdev = kzalloc(sizeof(struct nes_device), GFP_KERNEL);
-	if (!nesdev) {
-		ret = -ENOMEM;
-		goto bail2;
-	}
-
-	nes_debug(NES_DBG_INIT, "Allocated nes device at %p\n", nesdev);
-	nesdev->pcidev = pcidev;
-	pci_set_drvdata(pcidev, nesdev);
-
-	pci_read_config_byte(pcidev, 0x0008, &hw_rev);
-	nes_debug(NES_DBG_INIT, "hw_rev=%u\n", hw_rev);
-
-	spin_lock_init(&nesdev->indexed_regs_lock);
-
-	/* Remap the PCI registers in adapter BAR0 to kernel VA space */
-	mmio_regs = ioremap_nocache(pci_resource_start(pcidev, BAR_0),
-				    pci_resource_len(pcidev, BAR_0));
-	if (mmio_regs == NULL) {
-		printk(KERN_ERR PFX "Unable to remap BAR0\n");
-		ret = -EIO;
-		goto bail3;
-	}
-	nesdev->regs = mmio_regs;
-	nesdev->index_reg = 0x50 + (PCI_FUNC(pcidev->devfn)*8) + mmio_regs;
-
-	/* Ensure interrupts are disabled */
-	nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff);
-
-	if (nes_drv_opt & NES_DRV_OPT_ENABLE_MSI) {
-		if (!pci_enable_msi(nesdev->pcidev)) {
-			nesdev->msi_enabled = 1;
-			nes_debug(NES_DBG_INIT, "MSI is enabled for device %s\n",
-					pci_name(pcidev));
-		} else {
-			nes_debug(NES_DBG_INIT, "MSI is disabled by linux for device %s\n",
-					pci_name(pcidev));
-		}
-	} else {
-		nes_debug(NES_DBG_INIT, "MSI not requested due to driver options for device %s\n",
-				pci_name(pcidev));
-	}
-
-	nesdev->csr_start = pci_resource_start(nesdev->pcidev, BAR_0);
-	nesdev->doorbell_region = pci_resource_start(nesdev->pcidev, BAR_1);
-
-	/* Init the adapter */
-	nesdev->nesadapter = nes_init_adapter(nesdev, hw_rev);
-	if (!nesdev->nesadapter) {
-		printk(KERN_ERR PFX "Unable to initialize adapter.\n");
-		ret = -ENOMEM;
-		goto bail5;
-	}
-	nesdev->nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval;
-	nesdev->nesadapter->wqm_quanta = wqm_quanta;
-
-	/* nesdev->base_doorbell_index =
-			nesdev->nesadapter->pd_config_base[PCI_FUNC(nesdev->pcidev->devfn)]; */
-	nesdev->base_doorbell_index = 1;
-	nesdev->doorbell_start = nesdev->nesadapter->doorbell_start;
-	if (nesdev->nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) {
-		switch (PCI_FUNC(nesdev->pcidev->devfn) %
-			nesdev->nesadapter->port_count) {
-		case 1:
-			nesdev->mac_index = 2;
-			break;
-		case 2:
-			nesdev->mac_index = 1;
-			break;
-		case 3:
-			nesdev->mac_index = 3;
-			break;
-		case 0:
-		default:
-			nesdev->mac_index = 0;
-		}
-	} else {
-		nesdev->mac_index = PCI_FUNC(nesdev->pcidev->devfn) %
-						nesdev->nesadapter->port_count;
-	}
-
-	if ((limit_maxrdreqsz ||
-	     ((nesdev->nesadapter->phy_type[0] == NES_PHY_TYPE_GLADIUS) &&
-	      (hw_rev == NE020_REV1))) &&
-	    (pcie_get_readrq(pcidev) > 256)) {
-		if (pcie_set_readrq(pcidev, 256))
-			printk(KERN_ERR PFX "Unable to set max read request"
-				" to 256 bytes\n");
-		else
-			nes_debug(NES_DBG_INIT, "Max read request size set"
-				" to 256 bytes\n");
-	}
-
-	tasklet_init(&nesdev->dpc_tasklet, nes_dpc, (unsigned long)nesdev);
-
-	/* bring up the Control QP */
-	if (nes_init_cqp(nesdev)) {
-		ret = -ENODEV;
-		goto bail6;
-	}
-
-	/* Arm the CCQ */
-	nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
-			PCI_FUNC(nesdev->pcidev->devfn));
-	nes_read32(nesdev->regs+NES_CQE_ALLOC);
-
-	/* Enable the interrupts */
-	nesdev->int_req = (0x101 << PCI_FUNC(nesdev->pcidev->devfn)) |
-			(1 << (PCI_FUNC(nesdev->pcidev->devfn)+16));
-	if (PCI_FUNC(nesdev->pcidev->devfn) < 4) {
-		nesdev->int_req |= (1 << (PCI_FUNC(nesdev->mac_index)+24));
-	}
-
-	/* TODO: This really should be the first driver to load, not function 0 */
-	if (PCI_FUNC(nesdev->pcidev->devfn) == 0) {
-		/* pick up PCI and critical errors if the first driver to load */
-		nesdev->intf_int_req = NES_INTF_INT_PCIERR | NES_INTF_INT_CRITERR;
-		nesdev->int_req |= NES_INT_INTF;
-	} else {
-		nesdev->intf_int_req = 0;
-	}
-	nesdev->intf_int_req |= (1 << (PCI_FUNC(nesdev->pcidev->devfn)+16));
-	nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS0, 0);
-	nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 0);
-	nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS2, 0x00001265);
-	nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS4, 0x18021804);
-
-	nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS3, 0x17801790);
-
-	/* deal with both periodic and one_shot */
-	nesdev->timer_int_req = 0x101 << PCI_FUNC(nesdev->pcidev->devfn);
-	nesdev->nesadapter->timer_int_req |= nesdev->timer_int_req;
-	nes_debug(NES_DBG_INIT, "setting int_req for function %u, nesdev = 0x%04X, adapter = 0x%04X\n",
-			PCI_FUNC(nesdev->pcidev->devfn),
-			nesdev->timer_int_req, nesdev->nesadapter->timer_int_req);
-
-	nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
-
-	list_add_tail(&nesdev->list, &nes_dev_list);
-
-	/* Request an interrupt line for the driver */
-	ret = request_irq(pcidev->irq, nes_interrupt, IRQF_SHARED, DRV_NAME, nesdev);
-	if (ret) {
-		printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n",
-				pci_name(pcidev), pcidev->irq);
-		goto bail65;
-	}
-
-	nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
-
-	if (nes_notifiers_registered == 0) {
-		register_inetaddr_notifier(&nes_inetaddr_notifier);
-		register_netevent_notifier(&nes_net_notifier);
-	}
-	nes_notifiers_registered++;
-
-	INIT_DELAYED_WORK(&nesdev->work, nes_recheck_link_status);
-
-	/* Initialize network devices */
-	netdev = nes_netdev_init(nesdev, mmio_regs);
-	if (netdev == NULL) {
-		ret = -ENOMEM;
-		goto bail7;
-	}
-
-	/* Register network device */
-	ret = register_netdev(netdev);
-	if (ret) {
-		printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n", ret);
-		nes_netdev_destroy(netdev);
-		goto bail7;
-	}
-
-	nes_print_macaddr(netdev);
-
-	nesdev->netdev_count++;
-	nesdev->nesadapter->netdev_count++;
-
-	printk(KERN_INFO PFX "%s: NetEffect RNIC driver successfully loaded.\n",
-			pci_name(pcidev));
-	return 0;
-
-	bail7:
-	printk(KERN_ERR PFX "bail7\n");
-	while (nesdev->netdev_count > 0) {
-		nesdev->netdev_count--;
-		nesdev->nesadapter->netdev_count--;
-
-		unregister_netdev(nesdev->netdev[nesdev->netdev_count]);
-		nes_netdev_destroy(nesdev->netdev[nesdev->netdev_count]);
-	}
-
-	nes_debug(NES_DBG_INIT, "netdev_count=%d, nesadapter->netdev_count=%d\n",
-			nesdev->netdev_count, nesdev->nesadapter->netdev_count);
-
-	nes_notifiers_registered--;
-	if (nes_notifiers_registered == 0) {
-		unregister_netevent_notifier(&nes_net_notifier);
-		unregister_inetaddr_notifier(&nes_inetaddr_notifier);
-	}
-
-	list_del(&nesdev->list);
-	nes_destroy_cqp(nesdev);
-
-	bail65:
-	printk(KERN_ERR PFX "bail65\n");
-	free_irq(pcidev->irq, nesdev);
-	if (nesdev->msi_enabled) {
-		pci_disable_msi(pcidev);
-	}
-	bail6:
-	printk(KERN_ERR PFX "bail6\n");
-	tasklet_kill(&nesdev->dpc_tasklet);
-	/* Deallocate the Adapter Structure */
-	nes_destroy_adapter(nesdev->nesadapter);
-
-	bail5:
-	printk(KERN_ERR PFX "bail5\n");
-	iounmap(nesdev->regs);
-
-	bail3:
-	printk(KERN_ERR PFX "bail3\n");
-	kfree(nesdev);
-
-	bail2:
-	pci_release_regions(pcidev);
-
-	bail1:
-	pci_disable_device(pcidev);
-
-	bail0:
-	return ret;
-}
-
-
-/**
- * nes_remove - unload from kernel
- */
-static void nes_remove(struct pci_dev *pcidev)
-{
-	struct nes_device *nesdev = pci_get_drvdata(pcidev);
-	struct net_device *netdev;
-	int netdev_index = 0;
-	unsigned long flags;
-
-	if (nesdev->netdev_count) {
-		netdev = nesdev->netdev[netdev_index];
-		if (netdev) {
-			netif_stop_queue(netdev);
-			unregister_netdev(netdev);
-			nes_netdev_destroy(netdev);
-
-			nesdev->netdev[netdev_index] = NULL;
-			nesdev->netdev_count--;
-			nesdev->nesadapter->netdev_count--;
-		}
-	}
-
-	nes_notifiers_registered--;
-	if (nes_notifiers_registered == 0) {
-		unregister_netevent_notifier(&nes_net_notifier);
-		unregister_inetaddr_notifier(&nes_inetaddr_notifier);
-	}
-
-	list_del(&nesdev->list);
-	nes_destroy_cqp(nesdev);
-
-	free_irq(pcidev->irq, nesdev);
-	tasklet_kill(&nesdev->dpc_tasklet);
-
-	spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags);
-	if (nesdev->link_recheck) {
-		spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
-		cancel_delayed_work_sync(&nesdev->work);
-	} else {
-		spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
-	}
-
-	/* Deallocate the Adapter Structure */
-	nes_destroy_adapter(nesdev->nesadapter);
-
-	if (nesdev->msi_enabled) {
-		pci_disable_msi(pcidev);
-	}
-
-	iounmap(nesdev->regs);
-	kfree(nesdev);
-
-	/* nes_debug(NES_DBG_SHUTDOWN, "calling pci_release_regions.\n"); */
-	pci_release_regions(pcidev);
-	pci_disable_device(pcidev);
-	pci_set_drvdata(pcidev, NULL);
-}
-
-
-static ssize_t adapter_show(struct device_driver *ddp, char *buf)
-{
-	unsigned int  devfn = 0xffffffff;
-	unsigned char bus_number = 0xff;
-	unsigned int  i = 0;
-	struct nes_device *nesdev;
-
-	list_for_each_entry(nesdev, &nes_dev_list, list) {
-		if (i == ee_flsh_adapter) {
-			devfn = nesdev->pcidev->devfn;
-			bus_number = nesdev->pcidev->bus->number;
-			break;
-		}
-		i++;
-	}
-
-	return snprintf(buf, PAGE_SIZE, "%x:%x\n", bus_number, devfn);
-}
-
-static ssize_t adapter_store(struct device_driver *ddp,
-	const char *buf, size_t count)
-{
-	char *p = (char *)buf;
-
-	ee_flsh_adapter = simple_strtoul(p, &p, 10);
-	return strnlen(buf, count);
-}
-
-static ssize_t eeprom_cmd_show(struct device_driver *ddp, char *buf)
-{
-	u32 eeprom_cmd = 0xdead;
-	u32 i = 0;
-	struct nes_device *nesdev;
-
-	list_for_each_entry(nesdev, &nes_dev_list, list) {
-		if (i == ee_flsh_adapter) {
-			eeprom_cmd = nes_read32(nesdev->regs + NES_EEPROM_COMMAND);
-			break;
-		}
-		i++;
-	}
-	return snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_cmd);
-}
-
-static ssize_t eeprom_cmd_store(struct device_driver *ddp,
-	const char *buf, size_t count)
-{
-	char *p = (char *)buf;
-	u32 val;
-	u32 i = 0;
-	struct nes_device *nesdev;
-
-	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
-		val = simple_strtoul(p, &p, 16);
-		list_for_each_entry(nesdev, &nes_dev_list, list) {
-			if (i == ee_flsh_adapter) {
-				nes_write32(nesdev->regs + NES_EEPROM_COMMAND, val);
-				break;
-			}
-			i++;
-		}
-	}
-	return strnlen(buf, count);
-}
-
-static ssize_t eeprom_data_show(struct device_driver *ddp, char *buf)
-{
-	u32 eeprom_data = 0xdead;
-	u32 i = 0;
-	struct nes_device *nesdev;
-
-	list_for_each_entry(nesdev, &nes_dev_list, list) {
-		if (i == ee_flsh_adapter) {
-			eeprom_data = nes_read32(nesdev->regs + NES_EEPROM_DATA);
-			break;
-		}
-		i++;
-	}
-
-	return  snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_data);
-}
-
-static ssize_t eeprom_data_store(struct device_driver *ddp,
-	const char *buf, size_t count)
-{
-	char *p = (char *)buf;
-	u32 val;
-	u32 i = 0;
-	struct nes_device *nesdev;
-
-	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
-		val = simple_strtoul(p, &p, 16);
-		list_for_each_entry(nesdev, &nes_dev_list, list) {
-			if (i == ee_flsh_adapter) {
-				nes_write32(nesdev->regs + NES_EEPROM_DATA, val);
-				break;
-			}
-			i++;
-		}
-	}
-	return strnlen(buf, count);
-}
-
-static ssize_t flash_cmd_show(struct device_driver *ddp, char *buf)
-{
-	u32 flash_cmd = 0xdead;
-	u32 i = 0;
-	struct nes_device *nesdev;
-
-	list_for_each_entry(nesdev, &nes_dev_list, list) {
-		if (i == ee_flsh_adapter) {
-			flash_cmd = nes_read32(nesdev->regs + NES_FLASH_COMMAND);
-			break;
-		}
-		i++;
-	}
-
-	return  snprintf(buf, PAGE_SIZE, "0x%x\n", flash_cmd);
-}
-
-static ssize_t flash_cmd_store(struct device_driver *ddp,
-	const char *buf, size_t count)
-{
-	char *p = (char *)buf;
-	u32 val;
-	u32 i = 0;
-	struct nes_device *nesdev;
-
-	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
-		val = simple_strtoul(p, &p, 16);
-		list_for_each_entry(nesdev, &nes_dev_list, list) {
-			if (i == ee_flsh_adapter) {
-				nes_write32(nesdev->regs + NES_FLASH_COMMAND, val);
-				break;
-			}
-			i++;
-		}
-	}
-	return strnlen(buf, count);
-}
-
-static ssize_t flash_data_show(struct device_driver *ddp, char *buf)
-{
-	u32 flash_data = 0xdead;
-	u32 i = 0;
-	struct nes_device *nesdev;
-
-	list_for_each_entry(nesdev, &nes_dev_list, list) {
-		if (i == ee_flsh_adapter) {
-			flash_data = nes_read32(nesdev->regs + NES_FLASH_DATA);
-			break;
-		}
-		i++;
-	}
-
-	return  snprintf(buf, PAGE_SIZE, "0x%x\n", flash_data);
-}
-
-static ssize_t flash_data_store(struct device_driver *ddp,
-	const char *buf, size_t count)
-{
-	char *p = (char *)buf;
-	u32 val;
-	u32 i = 0;
-	struct nes_device *nesdev;
-
-	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
-		val = simple_strtoul(p, &p, 16);
-		list_for_each_entry(nesdev, &nes_dev_list, list) {
-			if (i == ee_flsh_adapter) {
-				nes_write32(nesdev->regs + NES_FLASH_DATA, val);
-				break;
-			}
-			i++;
-		}
-	}
-	return strnlen(buf, count);
-}
-
-static ssize_t nonidx_addr_show(struct device_driver *ddp, char *buf)
-{
-	return  snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_nonidx_addr);
-}
-
-static ssize_t nonidx_addr_store(struct device_driver *ddp,
-	const char *buf, size_t count)
-{
-	char *p = (char *)buf;
-
-	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X')
-		sysfs_nonidx_addr = simple_strtoul(p, &p, 16);
-
-	return strnlen(buf, count);
-}
-
-static ssize_t nonidx_data_show(struct device_driver *ddp, char *buf)
-{
-	u32 nonidx_data = 0xdead;
-	u32 i = 0;
-	struct nes_device *nesdev;
-
-	list_for_each_entry(nesdev, &nes_dev_list, list) {
-		if (i == ee_flsh_adapter) {
-			nonidx_data = nes_read32(nesdev->regs + sysfs_nonidx_addr);
-			break;
-		}
-		i++;
-	}
-
-	return  snprintf(buf, PAGE_SIZE, "0x%x\n", nonidx_data);
-}
-
-static ssize_t nonidx_data_store(struct device_driver *ddp,
-	const char *buf, size_t count)
-{
-	char *p = (char *)buf;
-	u32 val;
-	u32 i = 0;
-	struct nes_device *nesdev;
-
-	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
-		val = simple_strtoul(p, &p, 16);
-		list_for_each_entry(nesdev, &nes_dev_list, list) {
-			if (i == ee_flsh_adapter) {
-				nes_write32(nesdev->regs + sysfs_nonidx_addr, val);
-				break;
-			}
-			i++;
-		}
-	}
-	return strnlen(buf, count);
-}
-
-static ssize_t idx_addr_show(struct device_driver *ddp, char *buf)
-{
-	return  snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_idx_addr);
-}
-
-static ssize_t idx_addr_store(struct device_driver *ddp,
-	const char *buf, size_t count)
-{
-	char *p = (char *)buf;
-
-	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X')
-		sysfs_idx_addr = simple_strtoul(p, &p, 16);
-
-	return strnlen(buf, count);
-}
-
-static ssize_t idx_data_show(struct device_driver *ddp, char *buf)
-{
-	u32 idx_data = 0xdead;
-	u32 i = 0;
-	struct nes_device *nesdev;
-
-	list_for_each_entry(nesdev, &nes_dev_list, list) {
-		if (i == ee_flsh_adapter) {
-			idx_data = nes_read_indexed(nesdev, sysfs_idx_addr);
-			break;
-		}
-		i++;
-	}
-
-	return  snprintf(buf, PAGE_SIZE, "0x%x\n", idx_data);
-}
-
-static ssize_t idx_data_store(struct device_driver *ddp,
-	const char *buf, size_t count)
-{
-	char *p = (char *)buf;
-	u32 val;
-	u32 i = 0;
-	struct nes_device *nesdev;
-
-	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
-		val = simple_strtoul(p, &p, 16);
-		list_for_each_entry(nesdev, &nes_dev_list, list) {
-			if (i == ee_flsh_adapter) {
-				nes_write_indexed(nesdev, sysfs_idx_addr, val);
-				break;
-			}
-			i++;
-		}
-	}
-	return strnlen(buf, count);
-}
-
-static ssize_t wqm_quanta_show(struct device_driver *ddp, char *buf)
-{
-	u32 wqm_quanta_value = 0xdead;
-	u32 i = 0;
-	struct nes_device *nesdev;
-
-	list_for_each_entry(nesdev, &nes_dev_list, list) {
-		if (i == ee_flsh_adapter) {
-			wqm_quanta_value = nesdev->nesadapter->wqm_quanta;
-			break;
-		}
-		i++;
-	}
-
-	return  snprintf(buf, PAGE_SIZE, "0x%X\n", wqm_quanta_value);
-}
-
-static ssize_t wqm_quanta_store(struct device_driver *ddp, const char *buf,
-				size_t count)
-{
-	unsigned long wqm_quanta_value;
-	u32 wqm_config1;
-	u32 i = 0;
-	struct nes_device *nesdev;
-
-	if (kstrtoul(buf, 0, &wqm_quanta_value) < 0)
-		return -EINVAL;
-
-	list_for_each_entry(nesdev, &nes_dev_list, list) {
-		if (i == ee_flsh_adapter) {
-			nesdev->nesadapter->wqm_quanta = wqm_quanta_value;
-			wqm_config1 = nes_read_indexed(nesdev,
-						NES_IDX_WQM_CONFIG1);
-			nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG1,
-					((wqm_quanta_value << 1) |
-					(wqm_config1 & 0x00000001)));
-			break;
-		}
-		i++;
-	}
-	return strnlen(buf, count);
-}
-
-static DRIVER_ATTR_RW(adapter);
-static DRIVER_ATTR_RW(eeprom_cmd);
-static DRIVER_ATTR_RW(eeprom_data);
-static DRIVER_ATTR_RW(flash_cmd);
-static DRIVER_ATTR_RW(flash_data);
-static DRIVER_ATTR_RW(nonidx_addr);
-static DRIVER_ATTR_RW(nonidx_data);
-static DRIVER_ATTR_RW(idx_addr);
-static DRIVER_ATTR_RW(idx_data);
-static DRIVER_ATTR_RW(wqm_quanta);
-
-static struct attribute *nes_attrs[] = {
-	&driver_attr_adapter.attr,
-	&driver_attr_eeprom_cmd.attr,
-	&driver_attr_eeprom_data.attr,
-	&driver_attr_flash_cmd.attr,
-	&driver_attr_flash_data.attr,
-	&driver_attr_nonidx_addr.attr,
-	&driver_attr_nonidx_data.attr,
-	&driver_attr_idx_addr.attr,
-	&driver_attr_idx_data.attr,
-	&driver_attr_wqm_quanta.attr,
-	NULL,
-};
-ATTRIBUTE_GROUPS(nes);
-
-static struct pci_driver nes_pci_driver = {
-	.name = DRV_NAME,
-	.id_table = nes_pci_table,
-	.probe = nes_probe,
-	.remove = nes_remove,
-	.groups = nes_groups,
-};
-
-
-/**
- * nes_init_module - module initialization entry point
- */
-static int __init nes_init_module(void)
-{
-	int retval;
-
-	retval = nes_cm_start();
-	if (retval) {
-		printk(KERN_ERR PFX "Unable to start NetEffect iWARP CM.\n");
-		return retval;
-	}
-	return pci_register_driver(&nes_pci_driver);
-}
-
-
-/**
- * nes_exit_module - module unload entry point
- */
-static void __exit nes_exit_module(void)
-{
-	nes_cm_stop();
-
-	pci_unregister_driver(&nes_pci_driver);
-}
-
-
-module_init(nes_init_module);
-module_exit(nes_exit_module);
diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h
deleted file mode 100644
index bedaa02..0000000
--- a/drivers/infiniband/hw/nes/nes.h
+++ /dev/null
@@ -1,583 +0,0 @@
-/*
- * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __NES_H
-#define __NES_H
-
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/interrupt.h>
-#include <linux/spinlock.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <linux/workqueue.h>
-#include <linux/slab.h>
-#include <asm/io.h>
-#include <linux/crc32c.h>
-
-#include <rdma/ib_smi.h>
-#include <rdma/ib_verbs.h>
-#include <rdma/ib_pack.h>
-#include <rdma/rdma_cm.h>
-#include <rdma/iw_cm.h>
-#include <rdma/rdma_netlink.h>
-#include <rdma/iw_portmap.h>
-
-#define NES_SEND_FIRST_WRITE
-
-#define QUEUE_DISCONNECTS
-
-#define DRV_NAME    "iw_nes"
-#define DRV_VERSION "1.5.0.1"
-#define PFX         DRV_NAME ": "
-
-/*
- * NetEffect PCI vendor id and NE010 PCI device id.
- */
-#ifndef PCI_VENDOR_ID_NETEFFECT	/* not in pci.ids yet */
-#define PCI_VENDOR_ID_NETEFFECT          0x1678
-#define PCI_DEVICE_ID_NETEFFECT_NE020    0x0100
-#define PCI_DEVICE_ID_NETEFFECT_NE020_KR 0x0110
-#endif
-
-#define NE020_REV   4
-#define NE020_REV1  5
-
-#define BAR_0       0
-#define BAR_1       2
-
-#define RX_BUF_SIZE             (1536 + 8)
-#define NES_REG0_SIZE           (4 * 1024)
-#define NES_TX_TIMEOUT          (6*HZ)
-#define NES_FIRST_QPN           64
-#define NES_SW_CONTEXT_ALIGN    1024
-
-#define NES_MAX_MTU		9000
-
-#define NES_NIC_MAX_NICS        16
-#define NES_MAX_ARP_TABLE_SIZE  4096
-
-#define NES_NIC_CEQ_SIZE        8
-/* NICs will be on a separate CQ */
-#define NES_CCEQ_SIZE ((nesadapter->max_cq / nesadapter->port_count) - 32)
-
-#define NES_MAX_PORT_COUNT 4
-
-#define MAX_DPC_ITERATIONS               128
-
-#define NES_DRV_OPT_ENABLE_MPA_VER_0     0x00000001
-#define NES_DRV_OPT_DISABLE_MPA_CRC      0x00000002
-#define NES_DRV_OPT_DISABLE_FIRST_WRITE  0x00000004
-#define NES_DRV_OPT_DISABLE_INTF         0x00000008
-#define NES_DRV_OPT_ENABLE_MSI           0x00000010
-#define NES_DRV_OPT_DUAL_LOGICAL_PORT    0x00000020
-#define NES_DRV_OPT_SUPRESS_OPTION_BC    0x00000040
-#define NES_DRV_OPT_NO_INLINE_DATA       0x00000080
-#define NES_DRV_OPT_DISABLE_INT_MOD      0x00000100
-#define NES_DRV_OPT_DISABLE_VIRT_WQ      0x00000200
-#define NES_DRV_OPT_ENABLE_PAU           0x00000400
-
-#define NES_AEQ_EVENT_TIMEOUT         2500
-#define NES_DISCONNECT_EVENT_TIMEOUT  2000
-
-/* debug levels */
-/* must match userspace */
-#define NES_DBG_HW          0x00000001
-#define NES_DBG_INIT        0x00000002
-#define NES_DBG_ISR         0x00000004
-#define NES_DBG_PHY         0x00000008
-#define NES_DBG_NETDEV      0x00000010
-#define NES_DBG_CM          0x00000020
-#define NES_DBG_CM1         0x00000040
-#define NES_DBG_NIC_RX      0x00000080
-#define NES_DBG_NIC_TX      0x00000100
-#define NES_DBG_CQP         0x00000200
-#define NES_DBG_MMAP        0x00000400
-#define NES_DBG_MR          0x00000800
-#define NES_DBG_PD          0x00001000
-#define NES_DBG_CQ          0x00002000
-#define NES_DBG_QP          0x00004000
-#define NES_DBG_MOD_QP      0x00008000
-#define NES_DBG_AEQ         0x00010000
-#define NES_DBG_IW_RX       0x00020000
-#define NES_DBG_IW_TX       0x00040000
-#define NES_DBG_SHUTDOWN    0x00080000
-#define NES_DBG_PAU         0x00100000
-#define NES_DBG_NLMSG       0x00200000
-#define NES_DBG_RSVD1       0x10000000
-#define NES_DBG_RSVD2       0x20000000
-#define NES_DBG_RSVD3       0x40000000
-#define NES_DBG_RSVD4       0x80000000
-#define NES_DBG_ALL         0xffffffff
-
-#ifdef CONFIG_INFINIBAND_NES_DEBUG
-#define nes_debug(level, fmt, args...) \
-do { \
-	if (level & nes_debug_level) \
-		printk(KERN_ERR PFX "%s[%u]: " fmt, __func__, __LINE__, ##args); \
-} while (0)
-
-#define assert(expr) \
-do { \
-	if (!(expr)) { \
-		printk(KERN_ERR PFX "Assertion failed! %s, %s, %s, line %d\n", \
-			   #expr, __FILE__, __func__, __LINE__); \
-	} \
-} while (0)
-
-#define NES_EVENT_TIMEOUT   1200000
-#else
-#define nes_debug(level, fmt, args...) no_printk(fmt, ##args)
-#define assert(expr)          do {} while (0)
-
-#define NES_EVENT_TIMEOUT   100000
-#endif
-
-#include "nes_hw.h"
-#include "nes_verbs.h"
-#include "nes_context.h"
-#include <rdma/nes-abi.h>
-#include "nes_cm.h"
-#include "nes_mgt.h"
-
-extern int interrupt_mod_interval;
-extern int nes_if_count;
-extern int mpa_version;
-extern int disable_mpa_crc;
-extern unsigned int nes_drv_opt;
-extern unsigned int nes_debug_level;
-extern unsigned int wqm_quanta;
-extern struct list_head nes_adapter_list;
-
-extern atomic_t cm_connects;
-extern atomic_t cm_accepts;
-extern atomic_t cm_disconnects;
-extern atomic_t cm_closes;
-extern atomic_t cm_connecteds;
-extern atomic_t cm_connect_reqs;
-extern atomic_t cm_rejects;
-extern atomic_t mod_qp_timouts;
-extern atomic_t qps_created;
-extern atomic_t qps_destroyed;
-extern atomic_t sw_qps_destroyed;
-extern u32 mh_detected;
-extern u32 mh_pauses_sent;
-extern u32 cm_packets_sent;
-extern u32 cm_packets_bounced;
-extern u32 cm_packets_created;
-extern u32 cm_packets_received;
-extern u32 cm_packets_dropped;
-extern u32 cm_packets_retrans;
-extern atomic_t cm_listens_created;
-extern atomic_t cm_listens_destroyed;
-extern u32 cm_backlog_drops;
-extern atomic_t cm_loopbacks;
-extern atomic_t cm_nodes_created;
-extern atomic_t cm_nodes_destroyed;
-extern atomic_t cm_accel_dropped_pkts;
-extern atomic_t cm_resets_recvd;
-extern atomic_t pau_qps_created;
-extern atomic_t pau_qps_destroyed;
-
-extern u32 int_mod_timer_init;
-extern u32 int_mod_cq_depth_256;
-extern u32 int_mod_cq_depth_128;
-extern u32 int_mod_cq_depth_32;
-extern u32 int_mod_cq_depth_24;
-extern u32 int_mod_cq_depth_16;
-extern u32 int_mod_cq_depth_4;
-extern u32 int_mod_cq_depth_1;
-
-struct nes_device {
-	struct nes_adapter	   *nesadapter;
-	void __iomem           *regs;
-	void __iomem           *index_reg;
-	struct pci_dev         *pcidev;
-	struct net_device      *netdev[NES_NIC_MAX_NICS];
-	u64                    link_status_interrupts;
-	struct tasklet_struct  dpc_tasklet;
-	spinlock_t             indexed_regs_lock;
-	unsigned long          csr_start;
-	unsigned long          doorbell_region;
-	unsigned long          doorbell_start;
-	unsigned long          mac_tx_errors;
-	unsigned long          mac_pause_frames_sent;
-	unsigned long          mac_pause_frames_received;
-	unsigned long          mac_rx_errors;
-	unsigned long          mac_rx_crc_errors;
-	unsigned long          mac_rx_symbol_err_frames;
-	unsigned long          mac_rx_jabber_frames;
-	unsigned long          mac_rx_oversized_frames;
-	unsigned long          mac_rx_short_frames;
-	unsigned long          port_rx_discards;
-	unsigned long          port_tx_discards;
-	unsigned int           mac_index;
-	unsigned int           nes_stack_start;
-
-	/* Control Structures */
-	void                   *cqp_vbase;
-	dma_addr_t             cqp_pbase;
-	u32                    cqp_mem_size;
-	u8                     ceq_index;
-	u8                     nic_ceq_index;
-	struct nes_hw_cqp      cqp;
-	struct nes_hw_cq       ccq;
-	struct list_head       cqp_avail_reqs;
-	struct list_head       cqp_pending_reqs;
-	struct nes_cqp_request *nes_cqp_requests;
-
-	u32                    int_req;
-	u32                    int_stat;
-	u32                    timer_int_req;
-	u32                    timer_only_int_count;
-	u32                    intf_int_req;
-	u32                    last_mac_tx_pauses;
-	u32                    last_used_chunks_tx;
-	struct list_head       list;
-
-	u16                    base_doorbell_index;
-	u16                    currcq_count;
-	u16                    deepcq_count;
-	u8                     iw_status;
-	u8                     msi_enabled;
-	u8                     netdev_count;
-	u8                     napi_isr_ran;
-	u8                     disable_rx_flow_control;
-	u8                     disable_tx_flow_control;
-
-	struct delayed_work    work;
-	u8                     link_recheck;
-};
-
-/* Receive skb private area - must fit in skb->cb area */
-struct nes_rskb_cb {
-	u64                    busaddr;
-	u32                    maplen;
-	u32                    seqnum;
-	u8                     *data_start;
-	struct nes_qp          *nesqp;
-};
-
-static inline __le32 get_crc_value(struct nes_v4_quad *nes_quad)
-{
-	u32 crc_value;
-	crc_value = crc32c(~0, (void *)nes_quad, sizeof (struct nes_v4_quad));
-
-	/*
-	 * With commit ef19454b ("[LIB] crc32c: Keep intermediate crc
-	 * state in cpu order"), behavior of crc32c changes on
-	 * big-endian platforms.  Our algorithm expects the previous
-	 * behavior; otherwise we have RDMA connection establishment
-	 * issue on big-endian.
-	 */
-	return cpu_to_le32(crc_value);
-}
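
The comment in get_crc_value() above is the key point: the driver deliberately keeps the intermediate (non-inverted) CRC32C state and only normalizes the byte order at the end. A minimal bitwise sketch of the same CRC-32C (reflected polynomial 0x82F63B78), assuming no kernel helpers are available — an illustration, not the kernel's crc32c():

#include <stdint.h>
#include <stddef.h>

/* Bitwise CRC-32C (Castagnoli), reflected polynomial 0x82F63B78.
 * Called as crc32c_sw(~0u, buf, len) it is meant to mirror the intermediate
 * state crc32c(~0, buf, len) returns in the kernel (no final inversion). */
static uint32_t crc32c_sw(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
	}
	return crc;
}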
-
-static inline void
-set_wqe_64bit_value(__le32 *wqe_words, u32 index, u64 value)
-{
-	wqe_words[index]     = cpu_to_le32((u32) value);
-	wqe_words[index + 1] = cpu_to_le32(upper_32_bits(value));
-}
-
-static inline void
-set_wqe_32bit_value(__le32 *wqe_words, u32 index, u32 value)
-{
-	wqe_words[index] = cpu_to_le32(value);
-}
-
-static inline void
-nes_fill_init_cqp_wqe(struct nes_hw_cqp_wqe *cqp_wqe, struct nes_device *nesdev)
-{
-	cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_LOW_IDX]       = 0;
-	cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX]      = 0;
-	cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_LOW_IDX]   = 0;
-	cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX]  = 0;
-	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX] = 0;
-	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_LEN_IDX]       = 0;
-	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_LOW_IDX]       = 0;
-	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_LOW_IDX]        = 0;
-	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_HIGH_IDX]       = 0;
-}
-
-static inline void
-nes_fill_init_qp_wqe(struct nes_hw_qp_wqe *wqe, struct nes_qp *nesqp, u32 head)
-{
-	u32 value;
-	value = ((u32)((unsigned long) nesqp)) | head;
-	set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_HIGH_IDX,
-			(u32)(upper_32_bits((unsigned long)(nesqp))));
-	set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, value);
-}
-
-/* Read from memory-mapped device */
-static inline u32 nes_read_indexed(struct nes_device *nesdev, u32 reg_index)
-{
-	unsigned long flags;
-	void __iomem *addr = nesdev->index_reg;
-	u32 value;
-
-	spin_lock_irqsave(&nesdev->indexed_regs_lock, flags);
-
-	writel(reg_index, addr);
-	value = readl((void __iomem *)addr + 4);
-
-	spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags);
-	return value;
-}
-
-static inline u32 nes_read32(const void __iomem *addr)
-{
-	return readl(addr);
-}
-
-static inline u16 nes_read16(const void __iomem *addr)
-{
-	return readw(addr);
-}
-
-static inline u8 nes_read8(const void __iomem *addr)
-{
-	return readb(addr);
-}
-
-/* Write to memory-mapped device */
-static inline void nes_write_indexed(struct nes_device *nesdev, u32 reg_index, u32 val)
-{
-	unsigned long flags;
-	void __iomem *addr = nesdev->index_reg;
-
-	spin_lock_irqsave(&nesdev->indexed_regs_lock, flags);
-
-	writel(reg_index, addr);
-	writel(val, (void __iomem *)addr + 4);
-
-	spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags);
-}
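
nes_read_indexed()/nes_write_indexed() above use the classic index/data register-window pattern: write the target register's index into one MMIO word, then access its value through the adjacent word, with a spinlock keeping the two steps paired. A minimal sketch of the same pattern against a hypothetical two-word window (plain pointers stand in for the driver's writel()/readl() and locking):

#include <stdint.h>

/* Hypothetical index/data window: win[0] selects a register, win[1] accesses it. */
static uint32_t window_read(volatile uint32_t *win, uint32_t reg_index)
{
	win[0] = reg_index;	/* select the register */
	return win[1];		/* read its value */
}

static void window_write(volatile uint32_t *win, uint32_t reg_index, uint32_t val)
{
	win[0] = reg_index;	/* select the register */
	win[1] = val;		/* write its value */
}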
-
-static inline void nes_write32(void __iomem *addr, u32 val)
-{
-	writel(val, addr);
-}
-
-static inline void nes_write16(void __iomem *addr, u16 val)
-{
-	writew(val, addr);
-}
-
-static inline void nes_write8(void __iomem *addr, u8 val)
-{
-	writeb(val, addr);
-}
-
-enum nes_resource {
-	NES_RESOURCE_MW = 1,
-	NES_RESOURCE_FAST_MR,
-	NES_RESOURCE_PHYS_MR,
-	NES_RESOURCE_USER_MR,
-	NES_RESOURCE_PD,
-	NES_RESOURCE_QP,
-	NES_RESOURCE_CQ,
-	NES_RESOURCE_ARP
-};
-
-static inline int nes_alloc_resource(struct nes_adapter *nesadapter,
-		unsigned long *resource_array, u32 max_resources,
-		u32 *req_resource_num, u32 *next, enum nes_resource resource_type)
-{
-	unsigned long flags;
-	u32 resource_num;
-
-	spin_lock_irqsave(&nesadapter->resource_lock, flags);
-
-	resource_num = find_next_zero_bit(resource_array, max_resources, *next);
-	if (resource_num >= max_resources) {
-		resource_num = find_first_zero_bit(resource_array, max_resources);
-		if (resource_num >= max_resources) {
-			printk(KERN_ERR PFX "%s: No available resources [type=%u].\n", __func__, resource_type);
-			spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
-			return -EMFILE;
-		}
-	}
-	set_bit(resource_num, resource_array);
-	*next = resource_num+1;
-	if (*next == max_resources) {
-		*next = 0;
-	}
-	spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
-	*req_resource_num = resource_num;
-
-	return 0;
-}
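
nes_alloc_resource() above is a round-robin bitmap allocator: search from the last hint to the end, wrap around to the start, claim the first free bit, and advance the hint past it. A small standalone model of that wrap-around search, assuming a byte-per-slot array instead of the kernel bitmap API:

/* Hypothetical model of the search in nes_alloc_resource(): returns 0 and the
 * claimed slot in *out, or -1 when every slot is already in use. */
static int alloc_slot(unsigned char *used, unsigned int max,
		      unsigned int *next_hint, unsigned int *out)
{
	unsigned int i, start = *next_hint;

	for (i = 0; i < max; i++) {
		unsigned int slot = (start + i) % max;

		if (!used[slot]) {
			used[slot] = 1;
			*next_hint = (slot + 1) % max;
			*out = slot;
			return 0;
		}
	}
	return -1;
}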
-
-static inline int nes_is_resource_allocated(struct nes_adapter *nesadapter,
-		unsigned long *resource_array, u32 resource_num)
-{
-	unsigned long flags;
-	int bit_is_set;
-
-	spin_lock_irqsave(&nesadapter->resource_lock, flags);
-
-	bit_is_set = test_bit(resource_num, resource_array);
-	nes_debug(NES_DBG_HW, "resource_num %u is%s allocated.\n",
-			resource_num, (bit_is_set ? "": " not"));
-	spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
-
-	return bit_is_set;
-}
-
-static inline void nes_free_resource(struct nes_adapter *nesadapter,
-		unsigned long *resource_array, u32 resource_num)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&nesadapter->resource_lock, flags);
-	clear_bit(resource_num, resource_array);
-	spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
-}
-
-static inline struct nes_vnic *to_nesvnic(struct ib_device *ibdev)
-{
-	return container_of(ibdev, struct nes_ib_device, ibdev)->nesvnic;
-}
-
-static inline struct nes_pd *to_nespd(struct ib_pd *ibpd)
-{
-	return container_of(ibpd, struct nes_pd, ibpd);
-}
-
-static inline struct nes_ucontext *to_nesucontext(struct ib_ucontext *ibucontext)
-{
-	return container_of(ibucontext, struct nes_ucontext, ibucontext);
-}
-
-static inline struct nes_mr *to_nesmr(struct ib_mr *ibmr)
-{
-	return container_of(ibmr, struct nes_mr, ibmr);
-}
-
-static inline struct nes_mr *to_nesmr_from_ibfmr(struct ib_fmr *ibfmr)
-{
-	return container_of(ibfmr, struct nes_mr, ibfmr);
-}
-
-static inline struct nes_mr *to_nesmw(struct ib_mw *ibmw)
-{
-	return container_of(ibmw, struct nes_mr, ibmw);
-}
-
-static inline struct nes_fmr *to_nesfmr(struct nes_mr *nesmr)
-{
-	return container_of(nesmr, struct nes_fmr, nesmr);
-}
-
-static inline struct nes_cq *to_nescq(struct ib_cq *ibcq)
-{
-	return container_of(ibcq, struct nes_cq, ibcq);
-}
-
-static inline struct nes_qp *to_nesqp(struct ib_qp *ibqp)
-{
-	return container_of(ibqp, struct nes_qp, ibqp);
-}
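
The to_nesvnic()/to_nespd()/.../to_nesqp() helpers above all share one pattern: recover the driver's wrapping structure from a pointer to the embedded ib_* member. A generic sketch of that pattern — essentially container_of() without the kernel's type checking:

#include <stddef.h>

/* Recover the enclosing structure from a pointer to one of its members,
 * e.g. struct nes_pd *pd = container_of_sketch(ibpd, struct nes_pd, ibpd); */
#define container_of_sketch(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))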
-
-
-
-/* nes.c */
-void nes_add_ref(struct ib_qp *);
-void nes_rem_ref(struct ib_qp *);
-struct ib_qp *nes_get_qp(struct ib_device *, int);
-
-
-/* nes_hw.c */
-struct nes_adapter *nes_init_adapter(struct nes_device *, u8);
-void  nes_nic_init_timer_defaults(struct nes_device *, u8);
-void nes_destroy_adapter(struct nes_adapter *);
-int nes_init_cqp(struct nes_device *);
-int nes_init_phy(struct nes_device *);
-int nes_init_nic_qp(struct nes_device *, struct net_device *);
-void nes_destroy_nic_qp(struct nes_vnic *);
-int nes_napi_isr(struct nes_device *);
-void nes_dpc(unsigned long);
-void nes_nic_ce_handler(struct nes_device *, struct nes_hw_nic_cq *);
-void nes_iwarp_ce_handler(struct nes_device *, struct nes_hw_cq *);
-int nes_destroy_cqp(struct nes_device *);
-int nes_nic_cm_xmit(struct sk_buff *, struct net_device *);
-void nes_recheck_link_status(struct work_struct *work);
-void nes_terminate_timeout(struct timer_list *t);
-
-/* nes_nic.c */
-struct net_device *nes_netdev_init(struct nes_device *, void __iomem *);
-void nes_netdev_destroy(struct net_device *);
-int nes_nic_cm_xmit(struct sk_buff *, struct net_device *);
-
-/* nes_cm.c */
-void *nes_cm_create(struct net_device *);
-int nes_cm_recv(struct sk_buff *, struct net_device *);
-void nes_update_arp(unsigned char *, u32, u32, u16, u16);
-void nes_manage_arp_cache(struct net_device *, unsigned char *, u32, u32);
-void nes_sock_release(struct nes_qp *, unsigned long *);
-void flush_wqes(struct nes_device *nesdev, struct nes_qp *, u32, u32);
-int nes_manage_apbvt(struct nes_vnic *, u32, u32, u32);
-int nes_cm_disconn(struct nes_qp *);
-void nes_cm_disconn_worker(void *);
-
-/* nes_verbs.c */
-int nes_hw_modify_qp(struct nes_device *, struct nes_qp *, u32, u32, u32);
-int nes_modify_qp(struct ib_qp *, struct ib_qp_attr *, int, struct ib_udata *);
-struct nes_ib_device *nes_init_ofa_device(struct net_device *);
-void  nes_port_ibevent(struct nes_vnic *nesvnic);
-void nes_destroy_ofa_device(struct nes_ib_device *);
-int nes_register_ofa_device(struct nes_ib_device *);
-
-/* nes_util.c */
-int nes_read_eeprom_values(struct nes_device *, struct nes_adapter *);
-void nes_write_1G_phy_reg(struct nes_device *, u8, u8, u16);
-void nes_read_1G_phy_reg(struct nes_device *, u8, u8, u16 *);
-void nes_write_10G_phy_reg(struct nes_device *, u16, u8, u16, u16);
-void nes_read_10G_phy_reg(struct nes_device *, u8, u8, u16);
-struct nes_cqp_request *nes_get_cqp_request(struct nes_device *);
-void nes_free_cqp_request(struct nes_device *nesdev,
-			  struct nes_cqp_request *cqp_request);
-void nes_put_cqp_request(struct nes_device *nesdev,
-			 struct nes_cqp_request *cqp_request);
-void nes_post_cqp_request(struct nes_device *, struct nes_cqp_request *);
-int nes_arp_table(struct nes_device *, u32, u8 *, u32);
-void nes_mh_fix(struct timer_list *t);
-void nes_clc(struct timer_list *t);
-void nes_dump_mem(unsigned int, void *, int);
-u32 nes_crc32(u32, u32, u32, u32, u8 *, u32, u32, u32);
-
-#endif	/* __NES_H */
diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
deleted file mode 100644
index 2b67ace..0000000
--- a/drivers/infiniband/hw/nes/nes_cm.c
+++ /dev/null
@@ -1,3991 +0,0 @@
-/*
- * Copyright (c) 2006 - 2014 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-
-#define TCPOPT_TIMESTAMP 8
-
-#include <linux/atomic.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/init.h>
-#include <linux/if_arp.h>
-#include <linux/if_vlan.h>
-#include <linux/notifier.h>
-#include <linux/net.h>
-#include <linux/types.h>
-#include <linux/timer.h>
-#include <linux/time.h>
-#include <linux/delay.h>
-#include <linux/etherdevice.h>
-#include <linux/netdevice.h>
-#include <linux/random.h>
-#include <linux/list.h>
-#include <linux/threads.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <net/arp.h>
-#include <net/neighbour.h>
-#include <net/route.h>
-#include <net/ip_fib.h>
-#include <net/secure_seq.h>
-#include <net/tcp.h>
-#include <linux/fcntl.h>
-
-#include "nes.h"
-
-u32 cm_packets_sent;
-u32 cm_packets_bounced;
-u32 cm_packets_dropped;
-u32 cm_packets_retrans;
-u32 cm_packets_created;
-u32 cm_packets_received;
-atomic_t cm_listens_created;
-atomic_t cm_listens_destroyed;
-u32 cm_backlog_drops;
-atomic_t cm_loopbacks;
-atomic_t cm_nodes_created;
-atomic_t cm_nodes_destroyed;
-atomic_t cm_accel_dropped_pkts;
-atomic_t cm_resets_recvd;
-
-static inline int mini_cm_accelerated(struct nes_cm_core *, struct nes_cm_node *);
-static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *, struct nes_vnic *, struct nes_cm_info *);
-static int mini_cm_del_listen(struct nes_cm_core *, struct nes_cm_listener *);
-static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *, struct nes_vnic *, u16, void *, struct nes_cm_info *);
-static int mini_cm_close(struct nes_cm_core *, struct nes_cm_node *);
-static int mini_cm_accept(struct nes_cm_core *, struct nes_cm_node *);
-static int mini_cm_reject(struct nes_cm_core *, struct nes_cm_node *);
-static int mini_cm_recv_pkt(struct nes_cm_core *, struct nes_vnic *, struct sk_buff *);
-static int mini_cm_dealloc_core(struct nes_cm_core *);
-static int mini_cm_get(struct nes_cm_core *);
-static int mini_cm_set(struct nes_cm_core *, u32, u32);
-
-static void form_cm_frame(struct sk_buff *, struct nes_cm_node *, void *, u32, void *, u32, u8);
-static int add_ref_cm_node(struct nes_cm_node *);
-static int rem_ref_cm_node(struct nes_cm_core *, struct nes_cm_node *);
-
-static int nes_cm_disconn_true(struct nes_qp *);
-static int nes_cm_post_event(struct nes_cm_event *event);
-static int nes_disconnect(struct nes_qp *nesqp, int abrupt);
-static void nes_disconnect_worker(struct work_struct *work);
-
-static int send_mpa_request(struct nes_cm_node *, struct sk_buff *);
-static int send_mpa_reject(struct nes_cm_node *);
-static int send_syn(struct nes_cm_node *, u32, struct sk_buff *);
-static int send_reset(struct nes_cm_node *, struct sk_buff *);
-static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb);
-static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb);
-static void process_packet(struct nes_cm_node *, struct sk_buff *, struct nes_cm_core *);
-
-static void active_open_err(struct nes_cm_node *, struct sk_buff *, int);
-static void passive_open_err(struct nes_cm_node *, struct sk_buff *, int);
-static void cleanup_retrans_entry(struct nes_cm_node *);
-static void handle_rcv_mpa(struct nes_cm_node *, struct sk_buff *);
-static void free_retrans_entry(struct nes_cm_node *cm_node);
-static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph, struct sk_buff *skb, int optionsize, int passive);
-
-/* CM event handler functions */
-static void cm_event_connected(struct nes_cm_event *);
-static void cm_event_connect_error(struct nes_cm_event *);
-static void cm_event_reset(struct nes_cm_event *);
-static void cm_event_mpa_req(struct nes_cm_event *);
-static void cm_event_mpa_reject(struct nes_cm_event *);
-static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node);
-
-/* MPA build functions */
-static int cm_build_mpa_frame(struct nes_cm_node *, u8 **, u16 *, u8 *, u8);
-static void build_mpa_v2(struct nes_cm_node *, void *, u8);
-static void build_mpa_v1(struct nes_cm_node *, void *, u8);
-static void build_rdma0_msg(struct nes_cm_node *, struct nes_qp **);
-
-static void print_core(struct nes_cm_core *core);
-static void record_ird_ord(struct nes_cm_node *, u16, u16);
-
-/* External CM API Interface */
-/* instance of function pointers for client API */
-/* set address of this instance to cm_core->cm_ops at cm_core alloc */
-static const struct nes_cm_ops nes_cm_api = {
-	.accelerated = mini_cm_accelerated,
-	.listen = mini_cm_listen,
-	.stop_listener = mini_cm_del_listen,
-	.connect = mini_cm_connect,
-	.close = mini_cm_close,
-	.accept = mini_cm_accept,
-	.reject = mini_cm_reject,
-	.recv_pkt = mini_cm_recv_pkt,
-	.destroy_cm_core = mini_cm_dealloc_core,
-	.get = mini_cm_get,
-	.set = mini_cm_set
-};
-
-static struct nes_cm_core *g_cm_core;
-
-atomic_t cm_connects;
-atomic_t cm_accepts;
-atomic_t cm_disconnects;
-atomic_t cm_closes;
-atomic_t cm_connecteds;
-atomic_t cm_connect_reqs;
-atomic_t cm_rejects;
-
-int nes_add_ref_cm_node(struct nes_cm_node *cm_node)
-{
-	return add_ref_cm_node(cm_node);
-}
-
-int nes_rem_ref_cm_node(struct nes_cm_node *cm_node)
-{
-	return rem_ref_cm_node(cm_node->cm_core, cm_node);
-}
-/**
- * create_event
- */
-static struct nes_cm_event *create_event(struct nes_cm_node *	cm_node,
-					 enum nes_cm_event_type type)
-{
-	struct nes_cm_event *event;
-
-	if (!cm_node->cm_id)
-		return NULL;
-
-	/* allocate an empty event */
-	event = kzalloc(sizeof(*event), GFP_ATOMIC);
-
-	if (!event)
-		return NULL;
-
-	event->type = type;
-	event->cm_node = cm_node;
-	event->cm_info.rem_addr = cm_node->rem_addr;
-	event->cm_info.loc_addr = cm_node->loc_addr;
-	event->cm_info.rem_port = cm_node->rem_port;
-	event->cm_info.loc_port = cm_node->loc_port;
-	event->cm_info.cm_id = cm_node->cm_id;
-
-	nes_debug(NES_DBG_CM, "cm_node=%p Created event=%p, type=%u, "
-		  "dst_addr=%08x[%x], src_addr=%08x[%x]\n",
-		  cm_node, event, type, event->cm_info.loc_addr,
-		  event->cm_info.loc_port, event->cm_info.rem_addr,
-		  event->cm_info.rem_port);
-
-	nes_cm_post_event(event);
-	return event;
-}
-
-
-/**
- * send_mpa_request
- */
-static int send_mpa_request(struct nes_cm_node *cm_node, struct sk_buff *skb)
-{
-	u8 start_addr = 0;
-	u8 *start_ptr = &start_addr;
-	u8 **start_buff = &start_ptr;
-	u16 buff_len = 0;
-
-	if (!skb) {
-		nes_debug(NES_DBG_CM, "skb set to NULL\n");
-		return -1;
-	}
-
-	/* send an MPA Request frame */
-	cm_build_mpa_frame(cm_node, start_buff, &buff_len, NULL, MPA_KEY_REQUEST);
-	form_cm_frame(skb, cm_node, NULL, 0, *start_buff, buff_len, SET_ACK);
-
-	return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
-}
-
-
-
-static int send_mpa_reject(struct nes_cm_node *cm_node)
-{
-	struct sk_buff *skb = NULL;
-	u8 start_addr = 0;
-	u8 *start_ptr = &start_addr;
-	u8 **start_buff = &start_ptr;
-	u16 buff_len = 0;
-	struct ietf_mpa_v1 *mpa_frame;
-
-	skb = dev_alloc_skb(MAX_CM_BUFFER);
-	if (!skb) {
-		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
-		return -ENOMEM;
-	}
-
-	/* send an MPA reject frame */
-	cm_build_mpa_frame(cm_node, start_buff, &buff_len, NULL, MPA_KEY_REPLY);
-	mpa_frame = (struct ietf_mpa_v1 *)*start_buff;
-	mpa_frame->flags |= IETF_MPA_FLAGS_REJECT;
-	form_cm_frame(skb, cm_node, NULL, 0, *start_buff, buff_len, SET_ACK | SET_FIN);
-
-	cm_node->state = NES_CM_STATE_FIN_WAIT1;
-	return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
-}
-
-
-/**
- * recv_mpa - process a received TCP pkt, we are expecting an
- * IETF MPA frame
- */
-static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type,
-		     u32 len)
-{
-	struct ietf_mpa_v1 *mpa_frame;
-	struct ietf_mpa_v2 *mpa_v2_frame;
-	struct ietf_rtr_msg *rtr_msg;
-	int mpa_hdr_len;
-	int priv_data_len;
-
-	*type = NES_MPA_REQUEST_ACCEPT;
-
-	/* assume req frame is in tcp data payload */
-	if (len < sizeof(struct ietf_mpa_v1)) {
-		nes_debug(NES_DBG_CM, "The received ietf buffer was too small (%x)\n", len);
-		return -EINVAL;
-	}
-
-	/* points to the beginning of the frame, which could be MPA V1 or V2 */
-	mpa_frame = (struct ietf_mpa_v1 *)buffer;
-	mpa_hdr_len = sizeof(struct ietf_mpa_v1);
-	priv_data_len = ntohs(mpa_frame->priv_data_len);
-
-	/* make sure mpa private data len is less than 512 bytes */
-	if (priv_data_len > IETF_MAX_PRIV_DATA_LEN) {
-		nes_debug(NES_DBG_CM, "The received Length of Private"
-			  " Data field exceeds 512 octets\n");
-		return -EINVAL;
-	}
-	/*
-	 * make sure the MPA receiver interoperates with the
-	 * received MPA version and MPA key information
-	 */
-	if (mpa_frame->rev != IETF_MPA_V1 && mpa_frame->rev != IETF_MPA_V2) {
-		nes_debug(NES_DBG_CM, "The received mpa version"
-			  " is not supported\n");
-		return -EINVAL;
-	}
-	/*
-	 * backwards compatibility only
-	 */
-	if (mpa_frame->rev > cm_node->mpa_frame_rev) {
-		nes_debug(NES_DBG_CM, "The received mpa version"
-			" is newer than the locally supported one\n");
-		return -EINVAL;
-	} else {
-		cm_node->mpa_frame_rev = mpa_frame->rev;
-	}
-
-	if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) {
-		if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE)) {
-			nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n");
-			return -EINVAL;
-		}
-	} else {
-		if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE)) {
-			nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n");
-			return -EINVAL;
-		}
-	}
-
-	if (priv_data_len + mpa_hdr_len != len) {
-		nes_debug(NES_DBG_CM, "The received ietf buffer length was"
-			" not consistent (%x + %x != %x)\n",
-			priv_data_len, mpa_hdr_len, len);
-		return -EINVAL;
-	}
-	/* make sure it does not exceed the max size */
-	if (len > MAX_CM_BUFFER) {
-		nes_debug(NES_DBG_CM, "The received ietf buffer was too large"
-			" (%x + %x = %x, over the max)\n",
-			priv_data_len, mpa_hdr_len, len);
-		return -EINVAL;
-	}
-
-	cm_node->mpa_frame_size = priv_data_len;
-
-	switch (mpa_frame->rev) {
-	case IETF_MPA_V2: {
-		u16 ird_size;
-		u16 ord_size;
-		u16 rtr_ctrl_ird;
-		u16 rtr_ctrl_ord;
-
-		mpa_v2_frame = (struct ietf_mpa_v2 *)buffer;
-		mpa_hdr_len += IETF_RTR_MSG_SIZE;
-		cm_node->mpa_frame_size -= IETF_RTR_MSG_SIZE;
-		rtr_msg = &mpa_v2_frame->rtr_msg;
-
-		/* parse rtr message */
-		rtr_ctrl_ird = ntohs(rtr_msg->ctrl_ird);
-		rtr_ctrl_ord = ntohs(rtr_msg->ctrl_ord);
-		ird_size = rtr_ctrl_ird & IETF_NO_IRD_ORD;
-		ord_size = rtr_ctrl_ord & IETF_NO_IRD_ORD;
-
-		if (!(rtr_ctrl_ird & IETF_PEER_TO_PEER)) {
-			/* send reset */
-			return -EINVAL;
-		}
-		if (ird_size == IETF_NO_IRD_ORD || ord_size == IETF_NO_IRD_ORD)
-			cm_node->mpav2_ird_ord = IETF_NO_IRD_ORD;
-
-		if (cm_node->mpav2_ird_ord != IETF_NO_IRD_ORD) {
-			/* responder */
-			if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) {
-				/* we are still negotiating */
-				if (ord_size > NES_MAX_IRD) {
-					cm_node->ird_size = NES_MAX_IRD;
-				} else {
-					cm_node->ird_size = ord_size;
-					if (ord_size == 0 &&
-					(rtr_ctrl_ord & IETF_RDMA0_READ)) {
-						cm_node->ird_size = 1;
-						nes_debug(NES_DBG_CM,
-						"%s: Remote peer doesn't support RDMA0_READ (ord=%u)\n",
-							__func__, ord_size);
-					}
-				}
-				if (ird_size > NES_MAX_ORD)
-					cm_node->ord_size = NES_MAX_ORD;
-				else
-					cm_node->ord_size = ird_size;
-			} else { /* initiator */
-				if (ord_size > NES_MAX_IRD) {
-					nes_debug(NES_DBG_CM,
-					"%s: Unable to support the requested ord=%u\n",
-							__func__, ord_size);
-					return -EINVAL;
-				}
-				cm_node->ird_size = ord_size;
-
-				if (ird_size > NES_MAX_ORD) {
-					cm_node->ord_size = NES_MAX_ORD;
-				} else {
-					if (ird_size == 0 &&
-					(rtr_ctrl_ord & IETF_RDMA0_READ)) {
-						nes_debug(NES_DBG_CM,
-						"%s: Remote peer doesn't support RDMA0_READ (ird=%u)\n",
-							__func__, ird_size);
-						return -EINVAL;
-					} else {
-						cm_node->ord_size = ird_size;
-					}
-				}
-			}
-		}
-
-		if (rtr_ctrl_ord & IETF_RDMA0_READ) {
-			cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO;
-
-		} else if (rtr_ctrl_ord & IETF_RDMA0_WRITE) {
-			cm_node->send_rdma0_op = SEND_RDMA_WRITE_ZERO;
-	} else {        /* Unsupported RDMA0 operation */
-			return -EINVAL;
-		}
-		break;
-	}
-	case IETF_MPA_V1:
-	default:
-		break;
-	}
-
-	/* copy entire MPA frame to our cm_node's frame */
-	memcpy(cm_node->mpa_frame_buf, buffer + mpa_hdr_len, cm_node->mpa_frame_size);
-
-	if (mpa_frame->flags & IETF_MPA_FLAGS_REJECT)
-		*type = NES_MPA_REQUEST_REJECT;
-	return 0;
-}
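
Most of parse_mpa() above is length sanity checking: the buffer must hold at least a v1 header, the advertised private-data length may not exceed 512 octets, and header plus private data must account for exactly the bytes received. A condensed sketch of just those checks, with hypothetical constants standing in for the driver's:

#include <stdint.h>
#include <stddef.h>

#define MPA_V1_HDR_LEN		20	/* assumed sizeof(struct ietf_mpa_v1) */
#define MPA_MAX_PRIV_DATA	512	/* IETF_MAX_PRIV_DATA_LEN */
#define MPA_MAX_FRAME		1024	/* stand-in for MAX_CM_BUFFER */

/* Returns 0 when the length fields are self-consistent, -1 otherwise. */
static int mpa_len_ok(uint16_t priv_data_len, size_t len)
{
	if (len < MPA_V1_HDR_LEN)
		return -1;	/* too small to hold an MPA v1 header */
	if (priv_data_len > MPA_MAX_PRIV_DATA)
		return -1;	/* private data exceeds 512 octets */
	if ((size_t)priv_data_len + MPA_V1_HDR_LEN != len)
		return -1;	/* header + private data must equal len */
	if (len > MPA_MAX_FRAME)
		return -1;	/* larger than the receive buffer */
	return 0;
}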
-
-
-/**
- * form_cm_frame - build an empty frame in the given packet, using
- * the cm node's info to fill it in.
- */
-static void form_cm_frame(struct sk_buff *skb,
-			  struct nes_cm_node *cm_node, void *options, u32 optionsize,
-			  void *data, u32 datasize, u8 flags)
-{
-	struct tcphdr *tcph;
-	struct iphdr *iph;
-	struct ethhdr *ethh;
-	u8 *buf;
-	u16 packetsize = sizeof(*iph);
-
-	packetsize += sizeof(*tcph);
-	packetsize += optionsize + datasize;
-
-	skb_trim(skb, 0);
-	memset(skb->data, 0x00, ETH_HLEN + sizeof(*iph) + sizeof(*tcph));
-
-	buf = skb_put(skb, packetsize + ETH_HLEN);
-
-	ethh = (struct ethhdr *)buf;
-	buf += ETH_HLEN;
-
-	iph = (struct iphdr *)buf;
-	buf += sizeof(*iph);
-	tcph = (struct tcphdr *)buf;
-	skb_reset_mac_header(skb);
-	skb_set_network_header(skb, ETH_HLEN);
-	skb_set_transport_header(skb, ETH_HLEN + sizeof(*iph));
-	buf += sizeof(*tcph);
-
-	skb->ip_summed = CHECKSUM_PARTIAL;
-	if (!(cm_node->netdev->features & NETIF_F_IP_CSUM))
-		skb->ip_summed = CHECKSUM_NONE;
-	skb->protocol = htons(0x800);
-	skb->data_len = 0;
-	skb->mac_len = ETH_HLEN;
-
-	memcpy(ethh->h_dest, cm_node->rem_mac, ETH_ALEN);
-	memcpy(ethh->h_source, cm_node->loc_mac, ETH_ALEN);
-	ethh->h_proto = htons(0x0800);
-
-	iph->version = IPVERSION;
-	iph->ihl = 5;           /* 5 * 4-byte words, IP header len */
-	iph->tos = 0;
-	iph->tot_len = htons(packetsize);
-	iph->id = htons(++cm_node->tcp_cntxt.loc_id);
-
-	iph->frag_off = htons(0x4000);
-	iph->ttl = 0x40;
-	iph->protocol = 0x06;   /* IPPROTO_TCP */
-
-	iph->saddr = htonl(cm_node->loc_addr);
-	iph->daddr = htonl(cm_node->rem_addr);
-
-	tcph->source = htons(cm_node->loc_port);
-	tcph->dest = htons(cm_node->rem_port);
-	tcph->seq = htonl(cm_node->tcp_cntxt.loc_seq_num);
-
-	if (flags & SET_ACK) {
-		cm_node->tcp_cntxt.loc_ack_num = cm_node->tcp_cntxt.rcv_nxt;
-		tcph->ack_seq = htonl(cm_node->tcp_cntxt.loc_ack_num);
-		tcph->ack = 1;
-	} else {
-		tcph->ack_seq = 0;
-	}
-
-	if (flags & SET_SYN) {
-		cm_node->tcp_cntxt.loc_seq_num++;
-		tcph->syn = 1;
-	} else {
-		cm_node->tcp_cntxt.loc_seq_num += datasize;
-	}
-
-	if (flags & SET_FIN) {
-		cm_node->tcp_cntxt.loc_seq_num++;
-		tcph->fin = 1;
-	}
-
-	if (flags & SET_RST)
-		tcph->rst = 1;
-
-	tcph->doff = (u16)((sizeof(*tcph) + optionsize + 3) >> 2);
-	tcph->window = htons(cm_node->tcp_cntxt.rcv_wnd);
-	tcph->urg_ptr = 0;
-	if (optionsize)
-		memcpy(buf, options, optionsize);
-	buf += optionsize;
-	if (datasize)
-		memcpy(buf, data, datasize);
-
-	skb_shinfo(skb)->nr_frags = 0;
-	cm_packets_created++;
-}
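
One detail in form_cm_frame() worth noting is the doff computation: TCP expresses header length in 32-bit words, so the option bytes are rounded up to the next multiple of four. A one-function sketch of that arithmetic:

#include <stdint.h>

/* TCP data offset in 32-bit words for a 20-byte base header plus options,
 * rounded up as in form_cm_frame(); e.g. 10 option bytes -> (20+10+3)>>2 = 8. */
static uint8_t tcp_doff_words(uint32_t optionsize)
{
	return (uint8_t)((20u + optionsize + 3u) >> 2);
}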
-
-/**
- * print_core - dump a cm core
- */
-static void print_core(struct nes_cm_core *core)
-{
-	nes_debug(NES_DBG_CM, "---------------------------------------------\n");
-	nes_debug(NES_DBG_CM, "CM Core  -- (core = %p )\n", core);
-	if (!core)
-		return;
-	nes_debug(NES_DBG_CM, "---------------------------------------------\n");
-
-	nes_debug(NES_DBG_CM, "State         : %u \n", core->state);
-
-	nes_debug(NES_DBG_CM, "Listen Nodes  : %u \n", atomic_read(&core->listen_node_cnt));
-	nes_debug(NES_DBG_CM, "Active Nodes  : %u \n", atomic_read(&core->node_cnt));
-
-	nes_debug(NES_DBG_CM, "core          : %p \n", core);
-
-	nes_debug(NES_DBG_CM, "-------------- end core ---------------\n");
-}
-
-static void record_ird_ord(struct nes_cm_node *cm_node,
-					u16 conn_ird, u16 conn_ord)
-{
-	if (conn_ird > NES_MAX_IRD)
-		conn_ird = NES_MAX_IRD;
-
-	if (conn_ord > NES_MAX_ORD)
-		conn_ord = NES_MAX_ORD;
-
-	cm_node->ird_size = conn_ird;
-	cm_node->ord_size = conn_ord;
-}
-
-/**
- * cm_build_mpa_frame - build a MPA V1 frame or MPA V2 frame
- */
-static int cm_build_mpa_frame(struct nes_cm_node *cm_node, u8 **start_buff,
-			      u16 *buff_len, u8 *pci_mem, u8 mpa_key)
-{
-	int ret = 0;
-
-	*start_buff = (pci_mem) ? pci_mem : &cm_node->mpa_frame_buf[0];
-
-	switch (cm_node->mpa_frame_rev) {
-	case IETF_MPA_V1:
-		*start_buff = (u8 *)*start_buff + sizeof(struct ietf_rtr_msg);
-		*buff_len = sizeof(struct ietf_mpa_v1) + cm_node->mpa_frame_size;
-		build_mpa_v1(cm_node, *start_buff, mpa_key);
-		break;
-	case IETF_MPA_V2:
-		*buff_len = sizeof(struct ietf_mpa_v2) + cm_node->mpa_frame_size;
-		build_mpa_v2(cm_node, *start_buff, mpa_key);
-		break;
-	default:
-		ret = -EINVAL;
-	}
-	return ret;
-}
-
-/**
- * build_mpa_v2 - build a MPA V2 frame
- */
-static void build_mpa_v2(struct nes_cm_node *cm_node,
-			 void *start_addr, u8 mpa_key)
-{
-	struct ietf_mpa_v2 *mpa_frame = (struct ietf_mpa_v2 *)start_addr;
-	struct ietf_rtr_msg *rtr_msg = &mpa_frame->rtr_msg;
-	u16 ctrl_ird;
-	u16 ctrl_ord;
-
-	/* initialize the upper 5 bytes of the frame */
-	build_mpa_v1(cm_node, start_addr, mpa_key);
-	mpa_frame->flags |= IETF_MPA_V2_FLAG; /* set a bit to indicate MPA V2 */
-	mpa_frame->priv_data_len += htons(IETF_RTR_MSG_SIZE);
-
-	/* initialize RTR msg */
-	if (cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) {
-		ctrl_ird = IETF_NO_IRD_ORD;
-		ctrl_ord = IETF_NO_IRD_ORD;
-	} else {
-		ctrl_ird = cm_node->ird_size & IETF_NO_IRD_ORD;
-		ctrl_ord = cm_node->ord_size & IETF_NO_IRD_ORD;
-	}
-	ctrl_ird |= IETF_PEER_TO_PEER;
-
-	switch (mpa_key) {
-	case MPA_KEY_REQUEST:
-		ctrl_ord |= IETF_RDMA0_WRITE;
-		ctrl_ord |= IETF_RDMA0_READ;
-		break;
-	case MPA_KEY_REPLY:
-		switch (cm_node->send_rdma0_op) {
-		case SEND_RDMA_WRITE_ZERO:
-			ctrl_ord |= IETF_RDMA0_WRITE;
-			break;
-		case SEND_RDMA_READ_ZERO:
-			ctrl_ord |= IETF_RDMA0_READ;
-			break;
-		}
-	}
-	rtr_msg->ctrl_ird = htons(ctrl_ird);
-	rtr_msg->ctrl_ord = htons(ctrl_ord);
-}
-
-/**
- * build_mpa_v1 - build a MPA V1 frame
- */
-static void build_mpa_v1(struct nes_cm_node *cm_node, void *start_addr, u8 mpa_key)
-{
-	struct ietf_mpa_v1 *mpa_frame = (struct ietf_mpa_v1 *)start_addr;
-
-	switch (mpa_key) {
-	case MPA_KEY_REQUEST:
-		memcpy(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE);
-		break;
-	case MPA_KEY_REPLY:
-		memcpy(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE);
-		break;
-	}
-	mpa_frame->flags = IETF_MPA_FLAGS_CRC;
-	mpa_frame->rev = cm_node->mpa_frame_rev;
-	mpa_frame->priv_data_len = htons(cm_node->mpa_frame_size);
-}
-
-static void build_rdma0_msg(struct nes_cm_node *cm_node, struct nes_qp **nesqp_addr)
-{
-	u64 u64temp;
-	struct nes_qp *nesqp = *nesqp_addr;
-	struct nes_hw_qp_wqe *wqe = &nesqp->hwqp.sq_vbase[0];
-
-	u64temp = (unsigned long)nesqp->nesuqp_addr;
-	u64temp |= NES_SW_CONTEXT_ALIGN >> 1;
-	set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, u64temp);
-
-	wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_LOW_IDX] = 0;
-	wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX] = 0;
-
-	switch (cm_node->send_rdma0_op) {
-	case SEND_RDMA_WRITE_ZERO:
-		nes_debug(NES_DBG_CM, "Sending first write.\n");
-		wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] =
-			cpu_to_le32(NES_IWARP_SQ_OP_RDMAW);
-		wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] = 0;
-		wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = 0;
-		wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 0;
-		break;
-
-	case SEND_RDMA_READ_ZERO:
-	default:
-		if (cm_node->send_rdma0_op != SEND_RDMA_READ_ZERO)
-			WARN(1, "Unsupported RDMA0 len operation=%u\n",
-			     cm_node->send_rdma0_op);
-		nes_debug(NES_DBG_CM, "Sending first rdma operation.\n");
-		wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] =
-			cpu_to_le32(NES_IWARP_SQ_OP_RDMAR);
-		wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX] = 1;
-		wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_TO_HIGH_IDX] = 0;
-		wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] = 0;
-		wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_STAG_IDX] = 1;
-		wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 1;
-		break;
-	}
-
-	if (nesqp->sq_kmapped) {
-		nesqp->sq_kmapped = 0;
-		kunmap(nesqp->page);
-	}
-
-	/* use the reserved spot on the WQ for the extra first WQE */
-	nesqp->nesqp_context->ird_ord_sizes &= cpu_to_le32(~(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT |
-							     NES_QPCONTEXT_ORDIRD_WRPDU |
-							     NES_QPCONTEXT_ORDIRD_ALSMM));
-	nesqp->skip_lsmm = 1;
-	nesqp->hwqp.sq_tail = 0;
-}
-
-/**
- * schedule_nes_timer
- * note - cm_node needs to be protected before calling this; hold a reference
- *			across the call: add_ref_cm_node(cm_node); ... rem_ref_cm_node(cm_core, cm_node);
- */
-int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb,
-		       enum nes_timer_type type, int send_retrans,
-		       int close_when_complete)
-{
-	unsigned long flags;
-	struct nes_cm_core *cm_core = cm_node->cm_core;
-	struct nes_timer_entry *new_send;
-	int ret = 0;
-
-	new_send = kzalloc(sizeof(*new_send), GFP_ATOMIC);
-	if (!new_send)
-		return -ENOMEM;
-
-	/* new_send->timetosend = currenttime */
-	new_send->retrycount = NES_DEFAULT_RETRYS;
-	new_send->retranscount = NES_DEFAULT_RETRANS;
-	new_send->skb = skb;
-	new_send->timetosend = jiffies;
-	new_send->type = type;
-	new_send->netdev = cm_node->netdev;
-	new_send->send_retrans = send_retrans;
-	new_send->close_when_complete = close_when_complete;
-
-	if (type == NES_TIMER_TYPE_CLOSE) {
-		new_send->timetosend += (HZ / 10);
-		if (cm_node->recv_entry) {
-			kfree(new_send);
-			WARN_ON(1);
-			return -EINVAL;
-		}
-		cm_node->recv_entry = new_send;
-	}
-
-	if (type == NES_TIMER_TYPE_SEND) {
-		new_send->seq_num = ntohl(tcp_hdr(skb)->seq);
-		refcount_inc(&new_send->skb->users);
-		spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
-		cm_node->send_entry = new_send;
-		add_ref_cm_node(cm_node);
-		spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
-		new_send->timetosend = jiffies + NES_RETRY_TIMEOUT;
-
-		ret = nes_nic_cm_xmit(new_send->skb, cm_node->netdev);
-		if (ret != NETDEV_TX_OK) {
-			nes_debug(NES_DBG_CM, "Error sending packet %p "
-				  "(jiffies = %lu)\n", new_send, jiffies);
-			new_send->timetosend = jiffies;
-			ret = NETDEV_TX_OK;
-		} else {
-			cm_packets_sent++;
-			if (!send_retrans) {
-				cleanup_retrans_entry(cm_node);
-				if (close_when_complete)
-					rem_ref_cm_node(cm_core, cm_node);
-				return ret;
-			}
-		}
-	}
-
-	if (!timer_pending(&cm_core->tcp_timer))
-		mod_timer(&cm_core->tcp_timer, new_send->timetosend);
-
-	return ret;
-}
-
-static void nes_retrans_expired(struct nes_cm_node *cm_node)
-{
-	struct iw_cm_id *cm_id = cm_node->cm_id;
-	enum nes_cm_node_state state = cm_node->state;
-	cm_node->state = NES_CM_STATE_CLOSED;
-
-	switch (state) {
-	case NES_CM_STATE_SYN_RCVD:
-	case NES_CM_STATE_CLOSING:
-		rem_ref_cm_node(cm_node->cm_core, cm_node);
-		break;
-	case NES_CM_STATE_LAST_ACK:
-	case NES_CM_STATE_FIN_WAIT1:
-		if (cm_node->cm_id)
-			cm_id->rem_ref(cm_id);
-		send_reset(cm_node, NULL);
-		break;
-	default:
-		add_ref_cm_node(cm_node);
-		send_reset(cm_node, NULL);
-		create_event(cm_node, NES_CM_EVENT_ABORTED);
-	}
-}
-
-static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node)
-{
-	struct nes_timer_entry *recv_entry = cm_node->recv_entry;
-	struct iw_cm_id *cm_id = cm_node->cm_id;
-	struct nes_qp *nesqp;
-	unsigned long qplockflags;
-
-	if (!recv_entry)
-		return;
-	nesqp = (struct nes_qp *)recv_entry->skb;
-	if (nesqp) {
-		spin_lock_irqsave(&nesqp->lock, qplockflags);
-		if (nesqp->cm_id) {
-			nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, "
-				  "refcount = %d: HIT A "
-				  "NES_TIMER_TYPE_CLOSE with something "
-				  "to do!!!\n", nesqp->hwqp.qp_id, cm_id,
-				  atomic_read(&nesqp->refcount));
-			nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED;
-			nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT;
-			nesqp->ibqp_state = IB_QPS_ERR;
-			spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-			nes_cm_disconn(nesqp);
-		} else {
-			spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-			nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, "
-				  "refcount = %d: HIT A "
-				  "NES_TIMER_TYPE_CLOSE with nothing "
-				  "to do!!!\n", nesqp->hwqp.qp_id, cm_id,
-				  atomic_read(&nesqp->refcount));
-		}
-	} else if (rem_node) {
-		/* TIME_WAIT state */
-		rem_ref_cm_node(cm_node->cm_core, cm_node);
-	}
-	if (cm_node->cm_id)
-		cm_id->rem_ref(cm_id);
-	kfree(recv_entry);
-	cm_node->recv_entry = NULL;
-}
-
-/**
- * nes_cm_timer_tick
- */
-static void nes_cm_timer_tick(struct timer_list *unused)
-{
-	unsigned long flags;
-	unsigned long nexttimeout = jiffies + NES_LONG_TIME;
-	struct nes_cm_node *cm_node;
-	struct nes_timer_entry *send_entry, *recv_entry;
-	struct list_head *list_core_temp;
-	struct list_head *list_node;
-	struct nes_cm_core *cm_core = g_cm_core;
-	u32 settimer = 0;
-	unsigned long timetosend;
-	int ret = NETDEV_TX_OK;
-
-	struct list_head timer_list;
-
-	INIT_LIST_HEAD(&timer_list);
-	spin_lock_irqsave(&cm_core->ht_lock, flags);
-
-	list_for_each_safe(list_node, list_core_temp,
-			   &cm_core->connected_nodes) {
-		cm_node = container_of(list_node, struct nes_cm_node, list);
-		if ((cm_node->recv_entry) || (cm_node->send_entry)) {
-			add_ref_cm_node(cm_node);
-			list_add(&cm_node->timer_entry, &timer_list);
-		}
-	}
-	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-
-	list_for_each_safe(list_node, list_core_temp, &timer_list) {
-		cm_node = container_of(list_node, struct nes_cm_node,
-				       timer_entry);
-		recv_entry = cm_node->recv_entry;
-
-		if (recv_entry) {
-			if (time_after(recv_entry->timetosend, jiffies)) {
-				if (nexttimeout > recv_entry->timetosend ||
-				    !settimer) {
-					nexttimeout = recv_entry->timetosend;
-					settimer = 1;
-				}
-			} else {
-				handle_recv_entry(cm_node, 1);
-			}
-		}
-
-		spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
-		do {
-			send_entry = cm_node->send_entry;
-			if (!send_entry)
-				break;
-			if (time_after(send_entry->timetosend, jiffies)) {
-				if (cm_node->state != NES_CM_STATE_TSA) {
-					if ((nexttimeout >
-					     send_entry->timetosend) ||
-					    !settimer) {
-						nexttimeout =
-							send_entry->timetosend;
-						settimer = 1;
-					}
-				} else {
-					free_retrans_entry(cm_node);
-				}
-				break;
-			}
-
-			if ((cm_node->state == NES_CM_STATE_TSA) ||
-			    (cm_node->state == NES_CM_STATE_CLOSED)) {
-				free_retrans_entry(cm_node);
-				break;
-			}
-
-			if (!send_entry->retranscount ||
-			    !send_entry->retrycount) {
-				cm_packets_dropped++;
-				free_retrans_entry(cm_node);
-
-				spin_unlock_irqrestore(
-					&cm_node->retrans_list_lock, flags);
-				nes_retrans_expired(cm_node);
-				cm_node->state = NES_CM_STATE_CLOSED;
-				spin_lock_irqsave(&cm_node->retrans_list_lock,
-						  flags);
-				break;
-			}
-			refcount_inc(&send_entry->skb->users);
-			cm_packets_retrans++;
-			nes_debug(NES_DBG_CM, "Retransmitting send_entry %p "
-				  "for node %p, jiffies = %lu, time to send = "
-				  "%lu, retranscount = %u, send_entry->seq_num = "
-				  "0x%08X, cm_node->tcp_cntxt.rem_ack_num = "
-				  "0x%08X\n", send_entry, cm_node, jiffies,
-				  send_entry->timetosend,
-				  send_entry->retranscount,
-				  send_entry->seq_num,
-				  cm_node->tcp_cntxt.rem_ack_num);
-
-			spin_unlock_irqrestore(&cm_node->retrans_list_lock,
-					       flags);
-			ret = nes_nic_cm_xmit(send_entry->skb, cm_node->netdev);
-			spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
-			if (ret != NETDEV_TX_OK) {
-				nes_debug(NES_DBG_CM, "rexmit failed for "
-					  "node=%p\n", cm_node);
-				cm_packets_bounced++;
-				send_entry->retrycount--;
-				nexttimeout = jiffies + NES_SHORT_TIME;
-				settimer = 1;
-				break;
-			} else {
-				cm_packets_sent++;
-			}
-			nes_debug(NES_DBG_CM, "Packet Sent: retrans count = "
-				  "%u, retry count = %u.\n",
-				  send_entry->retranscount,
-				  send_entry->retrycount);
-			if (send_entry->send_retrans) {
-				send_entry->retranscount--;
-				timetosend = (NES_RETRY_TIMEOUT <<
-					      (NES_DEFAULT_RETRANS - send_entry->retranscount));
-
-				send_entry->timetosend = jiffies +
-							 min(timetosend, NES_MAX_TIMEOUT);
-				if (nexttimeout > send_entry->timetosend ||
-				    !settimer) {
-					nexttimeout = send_entry->timetosend;
-					settimer = 1;
-				}
-			} else {
-				int close_when_complete;
-				close_when_complete =
-					send_entry->close_when_complete;
-				nes_debug(NES_DBG_CM, "cm_node=%p state=%d\n",
-					  cm_node, cm_node->state);
-				free_retrans_entry(cm_node);
-				if (close_when_complete)
-					rem_ref_cm_node(cm_node->cm_core,
-							cm_node);
-			}
-		} while (0);
-
-		spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
-		rem_ref_cm_node(cm_node->cm_core, cm_node);
-	}
-
-	if (settimer) {
-		if (!timer_pending(&cm_core->tcp_timer))
-			mod_timer(&cm_core->tcp_timer, nexttimeout);
-	}
-}
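
The retransmit path in nes_cm_timer_tick() doubles the delay for every retransmission already sent (NES_RETRY_TIMEOUT shifted left by NES_DEFAULT_RETRANS minus the remaining count) and clamps the result at NES_MAX_TIMEOUT. A compact sketch of that backoff with the driver's constants treated as parameters:

/* Exponential backoff as used above: base << (attempts already made), capped
 * at max.  'total' is the initial retransmit budget, 'remaining' what is left. */
static unsigned long retrans_backoff(unsigned long base, unsigned long max,
				     unsigned int total, unsigned int remaining)
{
	unsigned long delay = base << (total - remaining);

	return delay < max ? delay : max;
}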
-
-
-/**
- * send_syn
- */
-static int send_syn(struct nes_cm_node *cm_node, u32 sendack,
-		    struct sk_buff *skb)
-{
-	int ret;
-	int flags = SET_SYN;
-	char optionsbuffer[sizeof(struct option_mss) +
-			   sizeof(struct option_windowscale) + sizeof(struct option_base) +
-			   TCP_OPTIONS_PADDING];
-
-	int optionssize = 0;
-	/* Sending MSS option */
-	union all_known_options *options;
-
-	if (!cm_node)
-		return -EINVAL;
-
-	options = (union all_known_options *)&optionsbuffer[optionssize];
-	options->as_mss.optionnum = OPTION_NUMBER_MSS;
-	options->as_mss.length = sizeof(struct option_mss);
-	options->as_mss.mss = htons(cm_node->tcp_cntxt.mss);
-	optionssize += sizeof(struct option_mss);
-
-	options = (union all_known_options *)&optionsbuffer[optionssize];
-	options->as_windowscale.optionnum = OPTION_NUMBER_WINDOW_SCALE;
-	options->as_windowscale.length = sizeof(struct option_windowscale);
-	options->as_windowscale.shiftcount = cm_node->tcp_cntxt.rcv_wscale;
-	optionssize += sizeof(struct option_windowscale);
-
-	if (sendack && !(NES_DRV_OPT_SUPRESS_OPTION_BC & nes_drv_opt)) {
-		options = (union all_known_options *)&optionsbuffer[optionssize];
-		options->as_base.optionnum = OPTION_NUMBER_WRITE0;
-		options->as_base.length = sizeof(struct option_base);
-		optionssize += sizeof(struct option_base);
-		/* we need the size to be a multiple of 4 */
-		options = (union all_known_options *)&optionsbuffer[optionssize];
-		options->as_end = 1;
-		optionssize += 1;
-		options = (union all_known_options *)&optionsbuffer[optionssize];
-		options->as_end = 1;
-		optionssize += 1;
-	}
-
-	options = (union all_known_options *)&optionsbuffer[optionssize];
-	options->as_end = OPTION_NUMBER_END;
-	optionssize += 1;
-
-	if (!skb)
-		skb = dev_alloc_skb(MAX_CM_BUFFER);
-	if (!skb) {
-		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
-		return -1;
-	}
-
-	if (sendack)
-		flags |= SET_ACK;
-
-	form_cm_frame(skb, cm_node, optionsbuffer, optionssize, NULL, 0, flags);
-	ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
-
-	return ret;
-}
-
-
-/**
- * send_reset
- */
-static int send_reset(struct nes_cm_node *cm_node, struct sk_buff *skb)
-{
-	int ret;
-	int flags = SET_RST | SET_ACK;
-
-	if (!skb)
-		skb = dev_alloc_skb(MAX_CM_BUFFER);
-	if (!skb) {
-		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
-		return -ENOMEM;
-	}
-
-	form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, flags);
-	ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 0, 1);
-
-	return ret;
-}
-
-
-/**
- * send_ack
- */
-static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb)
-{
-	int ret;
-
-	if (!skb)
-		skb = dev_alloc_skb(MAX_CM_BUFFER);
-
-	if (!skb) {
-		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
-		return -1;
-	}
-
-	form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK);
-	ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 0, 0);
-
-	return ret;
-}
-
-
-/**
- * send_fin
- */
-static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb)
-{
-	int ret;
-
-	/* if we didn't get a frame get one */
-	if (!skb)
-		skb = dev_alloc_skb(MAX_CM_BUFFER);
-
-	if (!skb) {
-		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
-		return -1;
-	}
-
-	form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK | SET_FIN);
-	ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
-
-	return ret;
-}
-
-
-/**
- * find_node - find a cm node that matches the reference cm node
- */
-static struct nes_cm_node *find_node(struct nes_cm_core *cm_core,
-				     u16 rem_port, nes_addr_t rem_addr, u16 loc_port, nes_addr_t loc_addr)
-{
-	unsigned long flags;
-	struct list_head *hte;
-	struct nes_cm_node *cm_node;
-
-	/* get a handle on the hte */
-	hte = &cm_core->connected_nodes;
-
-	/* walk list and find cm_node associated with this session ID */
-	spin_lock_irqsave(&cm_core->ht_lock, flags);
-	list_for_each_entry(cm_node, hte, list) {
-		/* compare quad, return node handle if a match */
-		nes_debug(NES_DBG_CM, "finding node %x:%x =? %x:%x ^ %x:%x =? %x:%x\n",
-			  cm_node->loc_addr, cm_node->loc_port,
-			  loc_addr, loc_port,
-			  cm_node->rem_addr, cm_node->rem_port,
-			  rem_addr, rem_port);
-		if ((cm_node->loc_addr == loc_addr) &&
-		    (cm_node->loc_port == loc_port) &&
-		    (cm_node->rem_addr == rem_addr) &&
-		    (cm_node->rem_port == rem_port)) {
-			add_ref_cm_node(cm_node);
-			spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-			return cm_node;
-		}
-	}
-	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-
-	/* no owner node */
-	return NULL;
-}
-
-
-/**
- * find_listener - find a cm node listening on this addr-port pair
- */
-static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core,
-					     nes_addr_t dst_addr, u16 dst_port,
-					     enum nes_cm_listener_state listener_state)
-{
-	unsigned long flags;
-	struct nes_cm_listener *listen_node;
-	nes_addr_t listen_addr;
-	u16 listen_port;
-
-	/* walk list and find cm_node associated with this session ID */
-	spin_lock_irqsave(&cm_core->listen_list_lock, flags);
-	list_for_each_entry(listen_node, &cm_core->listen_list.list, list) {
-		listen_addr = listen_node->loc_addr;
-		listen_port = listen_node->loc_port;
-
-		/* compare node pair, return node handle if a match */
-		if (((listen_addr == dst_addr) ||
-		     listen_addr == 0x00000000) &&
-		    (listen_port == dst_port) &&
-		    (listener_state & listen_node->listener_state)) {
-			atomic_inc(&listen_node->ref_count);
-			spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
-			return listen_node;
-		}
-	}
-	spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
-
-	/* no listener */
-	return NULL;
-}
-
-/**
- * add_hte_node - add a cm node to the hash table
- */
-static int add_hte_node(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node)
-{
-	unsigned long flags;
-	struct list_head *hte;
-
-	if (!cm_node || !cm_core)
-		return -EINVAL;
-
-	nes_debug(NES_DBG_CM, "Adding Node %p to Active Connection HT\n",
-		  cm_node);
-
-	spin_lock_irqsave(&cm_core->ht_lock, flags);
-
-	/* get a handle on the hash table element (list head for this slot) */
-	hte = &cm_core->connected_nodes;
-	list_add_tail(&cm_node->list, hte);
-	atomic_inc(&cm_core->ht_node_cnt);
-
-	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-
-	return 0;
-}
-
-
-/**
- * mini_cm_dec_refcnt_listen
- */
-static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core,
-				     struct nes_cm_listener *listener, int free_hanging_nodes)
-{
-	int ret = -EINVAL;
-	int err = 0;
-	unsigned long flags;
-	struct list_head *list_pos = NULL;
-	struct list_head *list_temp = NULL;
-	struct nes_cm_node *cm_node = NULL;
-	struct list_head reset_list;
-
-	nes_debug(NES_DBG_CM, "attempting listener= %p free_nodes= %d, "
-		  "refcnt=%d\n", listener, free_hanging_nodes,
-		  atomic_read(&listener->ref_count));
-	/* free non-accelerated child nodes for this listener */
-	INIT_LIST_HEAD(&reset_list);
-	if (free_hanging_nodes) {
-		spin_lock_irqsave(&cm_core->ht_lock, flags);
-		list_for_each_safe(list_pos, list_temp,
-				   &g_cm_core->connected_nodes) {
-			cm_node = container_of(list_pos, struct nes_cm_node,
-					       list);
-			if ((cm_node->listener == listener) &&
-			    (!cm_node->accelerated)) {
-				add_ref_cm_node(cm_node);
-				list_add(&cm_node->reset_entry, &reset_list);
-			}
-		}
-		spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-	}
-
-	list_for_each_safe(list_pos, list_temp, &reset_list) {
-		cm_node = container_of(list_pos, struct nes_cm_node,
-				       reset_entry);
-		{
-			struct nes_cm_node *loopback = cm_node->loopbackpartner;
-			enum nes_cm_node_state old_state;
-			if (NES_CM_STATE_FIN_WAIT1 <= cm_node->state) {
-				rem_ref_cm_node(cm_node->cm_core, cm_node);
-			} else {
-				if (!loopback) {
-					cleanup_retrans_entry(cm_node);
-					err = send_reset(cm_node, NULL);
-					if (err) {
-						cm_node->state =
-							NES_CM_STATE_CLOSED;
-						WARN_ON(1);
-					} else {
-						old_state = cm_node->state;
-						cm_node->state = NES_CM_STATE_LISTENER_DESTROYED;
-						if (old_state != NES_CM_STATE_MPAREQ_RCVD)
-							rem_ref_cm_node(
-								cm_node->cm_core,
-								cm_node);
-					}
-				} else {
-					struct nes_cm_event event;
-
-					event.cm_node = loopback;
-					event.cm_info.rem_addr =
-							loopback->rem_addr;
-					event.cm_info.loc_addr =
-							loopback->loc_addr;
-					event.cm_info.rem_port =
-							loopback->rem_port;
-					event.cm_info.loc_port =
-							 loopback->loc_port;
-					event.cm_info.cm_id = loopback->cm_id;
-					add_ref_cm_node(loopback);
-					loopback->state = NES_CM_STATE_CLOSED;
-					cm_event_connect_error(&event);
-					cm_node->state = NES_CM_STATE_LISTENER_DESTROYED;
-
-					rem_ref_cm_node(cm_node->cm_core,
-							 cm_node);
-
-				}
-			}
-		}
-	}
-
-	spin_lock_irqsave(&cm_core->listen_list_lock, flags);
-	if (!atomic_dec_return(&listener->ref_count)) {
-		list_del(&listener->list);
-
-		/* decrement our listen node count */
-		atomic_dec(&cm_core->listen_node_cnt);
-
-		spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
-
-		if (listener->nesvnic) {
-			nes_manage_apbvt(listener->nesvnic,
-				listener->loc_port,
-				PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn),
-				NES_MANAGE_APBVT_DEL);
-
-			nes_debug(NES_DBG_NLMSG,
-					"Delete APBVT loc_port = %04X\n",
-					listener->loc_port);
-		}
-
-		nes_debug(NES_DBG_CM, "destroying listener (%p)\n", listener);
-
-		kfree(listener);
-		listener = NULL;
-		ret = 0;
-		atomic_inc(&cm_listens_destroyed);
-	} else {
-		spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
-	}
-	if (listener) {
-		if (atomic_read(&listener->pend_accepts_cnt) > 0)
-			nes_debug(NES_DBG_CM, "destroying listener (%p)"
-				  " with non-zero pending accepts=%u\n",
-				  listener, atomic_read(&listener->pend_accepts_cnt));
-	}
-
-	return ret;
-}
-
-
-/**
- * mini_cm_del_listen
- */
-static int mini_cm_del_listen(struct nes_cm_core *cm_core,
-			      struct nes_cm_listener *listener)
-{
-	listener->listener_state = NES_CM_LISTENER_PASSIVE_STATE;
-	listener->cm_id = NULL; /* going to be destroyed pretty soon */
-	return mini_cm_dec_refcnt_listen(cm_core, listener, 1);
-}
-
-
-/**
- * mini_cm_accelerated
- */
-static inline int mini_cm_accelerated(struct nes_cm_core *cm_core,
-				      struct nes_cm_node *cm_node)
-{
-	cm_node->accelerated = true;
-
-	if (cm_node->accept_pend) {
-		BUG_ON(!cm_node->listener);
-		atomic_dec(&cm_node->listener->pend_accepts_cnt);
-		cm_node->accept_pend = 0;
-		BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0);
-	}
-
-	if (!timer_pending(&cm_core->tcp_timer))
-		mod_timer(&cm_core->tcp_timer, (jiffies + NES_SHORT_TIME));
-
-	return 0;
-}
-
-
-/**
- * nes_addr_resolve_neigh
- */
-static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpindex)
-{
-	struct rtable *rt;
-	struct neighbour *neigh;
-	int rc = arpindex;
-	struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter;
-	__be32 dst_ipaddr = htonl(dst_ip);
-
-	rt = ip_route_output(&init_net, dst_ipaddr, nesvnic->local_ipaddr, 0, 0);
-	if (IS_ERR(rt)) {
-		printk(KERN_ERR "%s: ip_route_output failed for 0x%08X\n",
-		       __func__, dst_ip);
-		return rc;
-	}
-
-	neigh = dst_neigh_lookup(&rt->dst, &dst_ipaddr);
-
-	rcu_read_lock();
-	if (neigh) {
-		if (neigh->nud_state & NUD_VALID) {
-			nes_debug(NES_DBG_CM, "Neighbor MAC address for 0x%08X"
-				  " is %pM, Gateway is 0x%08X \n", dst_ip,
-				  neigh->ha, ntohl(rt->rt_gateway));
-
-			if (arpindex >= 0) {
-				if (ether_addr_equal(nesadapter->arp_table[arpindex].mac_addr, neigh->ha)) {
-					/* Mac address same as in nes_arp_table */
-					goto out;
-				}
-
-				nes_manage_arp_cache(nesvnic->netdev,
-						     nesadapter->arp_table[arpindex].mac_addr,
-						     dst_ip, NES_ARP_DELETE);
-			}
-
-			nes_manage_arp_cache(nesvnic->netdev, neigh->ha,
-					     dst_ip, NES_ARP_ADD);
-			rc = nes_arp_table(nesvnic->nesdev, dst_ip, NULL,
-					   NES_ARP_RESOLVE);
-		} else {
-			neigh_event_send(neigh, NULL);
-		}
-	}
-out:
-	rcu_read_unlock();
-
-	if (neigh)
-		neigh_release(neigh);
-
-	ip_rt_put(rt);
-	return rc;
-}
-
-/**
- * make_cm_node - create a new instance of a cm node
- */
-static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core,
-					struct nes_vnic *nesvnic, struct nes_cm_info *cm_info,
-					struct nes_cm_listener *listener)
-{
-	struct nes_cm_node *cm_node;
-	int oldarpindex = 0;
-	int arpindex = 0;
-	struct nes_device *nesdev;
-	struct nes_adapter *nesadapter;
-
-	/* create an hte and cm_node for this instance */
-	cm_node = kzalloc(sizeof(*cm_node), GFP_ATOMIC);
-	if (!cm_node)
-		return NULL;
-
-	/* set our node specific transport info */
-	if (listener) {
-		cm_node->loc_addr = listener->loc_addr;
-		cm_node->loc_port = listener->loc_port;
-	} else {
-		cm_node->loc_addr = cm_info->loc_addr;
-		cm_node->loc_port = cm_info->loc_port;
-	}
-	cm_node->rem_addr = cm_info->rem_addr;
-	cm_node->rem_port = cm_info->rem_port;
-
-	cm_node->mpa_frame_rev = mpa_version;
-	cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO;
-	cm_node->mpav2_ird_ord = 0;
-	cm_node->ird_size = 0;
-	cm_node->ord_size = 0;
-
-	nes_debug(NES_DBG_CM, "Make node addresses : loc = %pI4:%x, rem = %pI4:%x\n",
-		  &cm_node->loc_addr, cm_node->loc_port,
-		  &cm_node->rem_addr, cm_node->rem_port);
-	cm_node->listener = listener;
-	if (listener)
-		cm_node->tos = listener->tos;
-	cm_node->netdev = nesvnic->netdev;
-	cm_node->cm_id = cm_info->cm_id;
-	memcpy(cm_node->loc_mac, nesvnic->netdev->dev_addr, ETH_ALEN);
-
-	nes_debug(NES_DBG_CM, "listener=%p, cm_id=%p\n", cm_node->listener,
-		  cm_node->cm_id);
-
-	spin_lock_init(&cm_node->retrans_list_lock);
-
-	cm_node->loopbackpartner = NULL;
-	atomic_set(&cm_node->ref_count, 1);
-	/* associate our parent CM core */
-	cm_node->cm_core = cm_core;
-	cm_node->tcp_cntxt.loc_id = NES_CM_DEF_LOCAL_ID;
-	cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE;
-	cm_node->tcp_cntxt.rcv_wnd = NES_CM_DEFAULT_RCV_WND_SCALED >>
-				     NES_CM_DEFAULT_RCV_WND_SCALE;
-	cm_node->tcp_cntxt.loc_seq_num = secure_tcp_seq(htonl(cm_node->loc_addr),
-							htonl(cm_node->rem_addr),
-							htons(cm_node->loc_port),
-							htons(cm_node->rem_port));
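-	/* MSS for this connection: the vnic's max frame size less the
-	 * IP, TCP, Ethernet and VLAN header overhead */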
-	cm_node->tcp_cntxt.mss = nesvnic->max_frame_size - sizeof(struct iphdr) -
-				 sizeof(struct tcphdr) - ETH_HLEN - VLAN_HLEN;
-	cm_node->tcp_cntxt.rcv_nxt = 0;
-	/* get a unique session ID, add thread_id to an upcounter to handle race */
-	atomic_inc(&cm_core->node_cnt);
-	cm_node->conn_type = cm_info->conn_type;
-	cm_node->apbvt_set = 0;
-	cm_node->accept_pend = 0;
-
-	cm_node->nesvnic = nesvnic;
-	/* get some device handles, for arp lookup */
-	nesdev = nesvnic->nesdev;
-	nesadapter = nesdev->nesadapter;
-
-	cm_node->loopbackpartner = NULL;
-
-	/* get the mac addr for the remote node */
-	oldarpindex = nes_arp_table(nesdev, cm_node->rem_addr,
-				    NULL, NES_ARP_RESOLVE);
-	arpindex = nes_addr_resolve_neigh(nesvnic, cm_node->rem_addr,
-					  oldarpindex);
-	if (arpindex < 0) {
-		kfree(cm_node);
-		return NULL;
-	}
-
-	/* copy the mac addr to node context */
-	memcpy(cm_node->rem_mac, nesadapter->arp_table[arpindex].mac_addr, ETH_ALEN);
-	nes_debug(NES_DBG_CM, "Remote mac addr from arp table: %pM\n",
-		  cm_node->rem_mac);
-
-	add_hte_node(cm_core, cm_node);
-	atomic_inc(&cm_nodes_created);
-
-	return cm_node;
-}
-
-
-/**
- * add_ref_cm_node - add a reference to a cm node
- */
-static int add_ref_cm_node(struct nes_cm_node *cm_node)
-{
-	atomic_inc(&cm_node->ref_count);
-	return 0;
-}
-
-
-/**
- * rem_ref_cm_node - drop a reference to a cm node, destroying it at zero
- */
-static int rem_ref_cm_node(struct nes_cm_core *cm_core,
-			   struct nes_cm_node *cm_node)
-{
-	unsigned long flags;
-	struct nes_qp *nesqp;
-
-	if (!cm_node)
-		return -EINVAL;
-
-	spin_lock_irqsave(&cm_node->cm_core->ht_lock, flags);
-	if (atomic_dec_return(&cm_node->ref_count)) {
-		spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags);
-		return 0;
-	}
-	list_del(&cm_node->list);
-	atomic_dec(&cm_core->ht_node_cnt);
-	spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags);
-
-	/* if the node is destroyed before connection was accelerated */
-	if (!cm_node->accelerated && cm_node->accept_pend) {
-		BUG_ON(!cm_node->listener);
-		atomic_dec(&cm_node->listener->pend_accepts_cnt);
-		BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0);
-	}
-	WARN_ON(cm_node->send_entry);
-	if (cm_node->recv_entry)
-		handle_recv_entry(cm_node, 0);
-	if (cm_node->listener) {
-		mini_cm_dec_refcnt_listen(cm_core, cm_node->listener, 0);
-	} else {
-		if (cm_node->apbvt_set && cm_node->nesvnic) {
-			nes_manage_apbvt(cm_node->nesvnic, cm_node->loc_port,
-					 PCI_FUNC(cm_node->nesvnic->nesdev->pcidev->devfn),
-					 NES_MANAGE_APBVT_DEL);
-		}
-		nes_debug(NES_DBG_NLMSG, "Delete APBVT loc_port = %04X\n",
-			  cm_node->loc_port);
-	}
-
-	atomic_dec(&cm_core->node_cnt);
-	atomic_inc(&cm_nodes_destroyed);
-	nesqp = cm_node->nesqp;
-	if (nesqp) {
-		nesqp->cm_node = NULL;
-		nes_rem_ref(&nesqp->ibqp);
-		cm_node->nesqp = NULL;
-	}
-
-	kfree(cm_node);
-	return 0;
-}
-
-/**
- * process_options
- */
-static int process_options(struct nes_cm_node *cm_node, u8 *optionsloc,
-			   u32 optionsize, u32 syn_packet)
-{
-	u32 tmp;
-	u32 offset = 0;
-	union all_known_options *all_options;
-	char got_mss_option = 0;
-
-	while (offset < optionsize) {
-		all_options = (union all_known_options *)(optionsloc + offset);
-		switch (all_options->as_base.optionnum) {
-		case OPTION_NUMBER_END:
-			offset = optionsize;
-			break;
-		case OPTION_NUMBER_NONE:
-			offset += 1;
-			continue;
-		case OPTION_NUMBER_MSS:
-			nes_debug(NES_DBG_CM, "%s: MSS Length: %d Offset: %d "
-				  "Size: %d\n", __func__,
-				  all_options->as_mss.length, offset, optionsize);
-			got_mss_option = 1;
-			if (all_options->as_mss.length != 4) {
-				return 1;
-			} else {
-				tmp = ntohs(all_options->as_mss.mss);
-				if (tmp > 0 && tmp <
-				    cm_node->tcp_cntxt.mss)
-					cm_node->tcp_cntxt.mss = tmp;
-			}
-			break;
-		case OPTION_NUMBER_WINDOW_SCALE:
-			cm_node->tcp_cntxt.snd_wscale =
-				all_options->as_windowscale.shiftcount;
-			break;
-		default:
-			nes_debug(NES_DBG_CM, "TCP Option not understood: %x\n",
-				  all_options->as_base.optionnum);
-			break;
-		}
-		offset += all_options->as_base.length;
-	}
-	if ((!got_mss_option) && (syn_packet))
-		cm_node->tcp_cntxt.mss = NES_CM_DEFAULT_MSS;
-	return 0;
-}
-
-static void drop_packet(struct sk_buff *skb)
-{
-	atomic_inc(&cm_accel_dropped_pkts);
-	dev_kfree_skb_any(skb);
-}
-
-static void handle_fin_pkt(struct nes_cm_node *cm_node)
-{
-	nes_debug(NES_DBG_CM, "Received FIN, cm_node = %p, state = %u. "
-		  "refcnt=%d\n", cm_node, cm_node->state,
-		  atomic_read(&cm_node->ref_count));
-	switch (cm_node->state) {
-	case NES_CM_STATE_SYN_RCVD:
-	case NES_CM_STATE_SYN_SENT:
-	case NES_CM_STATE_ESTABLISHED:
-	case NES_CM_STATE_MPAREJ_RCVD:
-		cm_node->tcp_cntxt.rcv_nxt++;
-		cleanup_retrans_entry(cm_node);
-		cm_node->state = NES_CM_STATE_LAST_ACK;
-		send_fin(cm_node, NULL);
-		break;
-	case NES_CM_STATE_MPAREQ_SENT:
-		create_event(cm_node, NES_CM_EVENT_ABORTED);
-		cm_node->tcp_cntxt.rcv_nxt++;
-		cleanup_retrans_entry(cm_node);
-		cm_node->state = NES_CM_STATE_CLOSED;
-		add_ref_cm_node(cm_node);
-		send_reset(cm_node, NULL);
-		break;
-	case NES_CM_STATE_FIN_WAIT1:
-		cm_node->tcp_cntxt.rcv_nxt++;
-		cleanup_retrans_entry(cm_node);
-		cm_node->state = NES_CM_STATE_CLOSING;
-		send_ack(cm_node, NULL);
-		/* Wait for the ACK, as this is a simultaneous close.
-		 * After the ACK is received, do not send anything else;
-		 * just remove the node. */
-		break;
-	case NES_CM_STATE_FIN_WAIT2:
-		cm_node->tcp_cntxt.rcv_nxt++;
-		cleanup_retrans_entry(cm_node);
-		cm_node->state = NES_CM_STATE_TIME_WAIT;
-		send_ack(cm_node, NULL);
-		schedule_nes_timer(cm_node, NULL,  NES_TIMER_TYPE_CLOSE, 1, 0);
-		break;
-	case NES_CM_STATE_TIME_WAIT:
-		cm_node->tcp_cntxt.rcv_nxt++;
-		cleanup_retrans_entry(cm_node);
-		cm_node->state = NES_CM_STATE_CLOSED;
-		rem_ref_cm_node(cm_node->cm_core, cm_node);
-		break;
-	case NES_CM_STATE_TSA:
-	default:
-		nes_debug(NES_DBG_CM, "Error Rcvd FIN for node-%p state = %d\n",
-			cm_node, cm_node->state);
-		break;
-	}
-}
-
-
-static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
-	struct tcphdr *tcph)
-{
-	int reset = 0;	/* whether to send a reset on error */
-
-	atomic_inc(&cm_resets_recvd);
-	nes_debug(NES_DBG_CM, "Received Reset, cm_node = %p, state = %u."
-			" refcnt=%d\n", cm_node, cm_node->state,
-			atomic_read(&cm_node->ref_count));
-	cleanup_retrans_entry(cm_node);
-	switch (cm_node->state) {
-	case NES_CM_STATE_SYN_SENT:
-	case NES_CM_STATE_MPAREQ_SENT:
-		nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p "
-			"listener=%p state=%d\n", __func__, __LINE__, cm_node,
-			cm_node->listener, cm_node->state);
-		switch (cm_node->mpa_frame_rev) {
-		case IETF_MPA_V2:
-			cm_node->mpa_frame_rev = IETF_MPA_V1;
-			/* send a syn and goto syn sent state */
-			cm_node->state = NES_CM_STATE_SYN_SENT;
-			if (send_syn(cm_node, 0, NULL)) {
-				active_open_err(cm_node, skb, reset);
-			}
-			break;
-		case IETF_MPA_V1:
-		default:
-			active_open_err(cm_node, skb, reset);
-			break;
-		}
-		break;
-	case NES_CM_STATE_MPAREQ_RCVD:
-		atomic_inc(&cm_node->passive_state);
-		dev_kfree_skb_any(skb);
-		break;
-	case NES_CM_STATE_ESTABLISHED:
-	case NES_CM_STATE_SYN_RCVD:
-	case NES_CM_STATE_LISTENING:
-		nes_debug(NES_DBG_CM, "Bad state %s[%u]\n", __func__, __LINE__);
-		passive_open_err(cm_node, skb, reset);
-		break;
-	case NES_CM_STATE_TSA:
-		active_open_err(cm_node, skb, reset);
-		break;
-	case NES_CM_STATE_CLOSED:
-		drop_packet(skb);
-		break;
-	case NES_CM_STATE_FIN_WAIT2:
-	case NES_CM_STATE_FIN_WAIT1:
-	case NES_CM_STATE_LAST_ACK:
-		cm_node->cm_id->rem_ref(cm_node->cm_id);
-		/* fall through */
-	case NES_CM_STATE_TIME_WAIT:
-		cm_node->state = NES_CM_STATE_CLOSED;
-		rem_ref_cm_node(cm_node->cm_core, cm_node);
-		drop_packet(skb);
-		break;
-	default:
-		drop_packet(skb);
-		break;
-	}
-}
-
-
-static void handle_rcv_mpa(struct nes_cm_node *cm_node, struct sk_buff *skb)
-{
-	int ret = 0;
-	int datasize = skb->len;
-	u8 *dataloc = skb->data;
-
-	enum nes_cm_event_type type = NES_CM_EVENT_UNKNOWN;
-	u32 res_type;
-
-	ret = parse_mpa(cm_node, dataloc, &res_type, datasize);
-	if (ret) {
-		nes_debug(NES_DBG_CM, "didn't like MPA Request\n");
-		if (cm_node->state == NES_CM_STATE_MPAREQ_SENT) {
-			nes_debug(NES_DBG_CM, "%s[%u] create abort for "
-				  "cm_node=%p listener=%p state=%d\n", __func__,
-				  __LINE__, cm_node, cm_node->listener,
-				  cm_node->state);
-			active_open_err(cm_node, skb, 1);
-		} else {
-			passive_open_err(cm_node, skb, 1);
-		}
-		return;
-	}
-
-	switch (cm_node->state) {
-	case NES_CM_STATE_ESTABLISHED:
-		if (res_type == NES_MPA_REQUEST_REJECT)
-			/* Problem: we are the passive side receiving the MPA
-			 * frame, so it should not be a reject; an MPA reject
-			 * can only be received on an active open. */
-			WARN_ON(1);
-		cm_node->state = NES_CM_STATE_MPAREQ_RCVD;
-		type = NES_CM_EVENT_MPA_REQ;
-		atomic_set(&cm_node->passive_state,
-			   NES_PASSIVE_STATE_INDICATED);
-		break;
-	case NES_CM_STATE_MPAREQ_SENT:
-		cleanup_retrans_entry(cm_node);
-		if (res_type == NES_MPA_REQUEST_REJECT) {
-			type = NES_CM_EVENT_MPA_REJECT;
-			cm_node->state = NES_CM_STATE_MPAREJ_RCVD;
-		} else {
-			type = NES_CM_EVENT_CONNECTED;
-			cm_node->state = NES_CM_STATE_TSA;
-		}
-		send_ack(cm_node, NULL);
-		break;
-	default:
-		WARN_ON(1);
-		break;
-	}
-	dev_kfree_skb_any(skb);
-	create_event(cm_node, type);
-}
-
-static void indicate_pkt_err(struct nes_cm_node *cm_node, struct sk_buff *skb)
-{
-	switch (cm_node->state) {
-	case NES_CM_STATE_SYN_SENT:
-	case NES_CM_STATE_MPAREQ_SENT:
-		nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p "
-			  "listener=%p state=%d\n", __func__, __LINE__, cm_node,
-			  cm_node->listener, cm_node->state);
-		active_open_err(cm_node, skb, 1);
-		break;
-	case NES_CM_STATE_ESTABLISHED:
-	case NES_CM_STATE_SYN_RCVD:
-		passive_open_err(cm_node, skb, 1);
-		break;
-	case NES_CM_STATE_TSA:
-	default:
-		drop_packet(skb);
-	}
-}
-
-static int check_syn(struct nes_cm_node *cm_node, struct tcphdr *tcph,
-		     struct sk_buff *skb)
-{
-	int err;
-
-	err = ((ntohl(tcph->ack_seq) == cm_node->tcp_cntxt.loc_seq_num)) ? 0 : 1;
-	if (err)
-		active_open_err(cm_node, skb, 1);
-
-	return err;
-}
-
-static int check_seq(struct nes_cm_node *cm_node, struct tcphdr *tcph,
-		     struct sk_buff *skb)
-{
-	int err = 0;
-	u32 seq;
-	u32 ack_seq;
-	u32 loc_seq_num = cm_node->tcp_cntxt.loc_seq_num;
-	u32 rcv_nxt = cm_node->tcp_cntxt.rcv_nxt;
-	u32 rcv_wnd;
-
-	seq = ntohl(tcph->seq);
-	ack_seq = ntohl(tcph->ack_seq);
-	rcv_wnd = cm_node->tcp_cntxt.rcv_wnd;
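-	/* the peer must ACK exactly our last sent sequence number and the
-	 * segment sequence must fall inside the receive window */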
-	if (ack_seq != loc_seq_num)
-		err = 1;
-	else if (!between(seq, rcv_nxt, (rcv_nxt + rcv_wnd)))
-		err = 1;
-	if (err) {
-		nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p "
-			  "listener=%p state=%d\n", __func__, __LINE__, cm_node,
-			  cm_node->listener, cm_node->state);
-		indicate_pkt_err(cm_node, skb);
-		nes_debug(NES_DBG_CM, "seq ERROR cm_node =%p seq=0x%08X "
-			  "rcv_nxt=0x%08X rcv_wnd=0x%x\n", cm_node, seq, rcv_nxt,
-			  rcv_wnd);
-	}
-	return err;
-}
-
-/*
- * handle_syn_pkt() is for the passive node. The SYN packet is received when a
- * node is created with a listener, or it may come in as a retransmitted
- * packet, in which case it is simply dropped.
- */
-static void handle_syn_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
-			   struct tcphdr *tcph)
-{
-	int ret;
-	u32 inc_sequence;
-	int optionsize;
-
-	optionsize = (tcph->doff << 2) - sizeof(struct tcphdr);
-	skb_trim(skb, 0);
-	inc_sequence = ntohl(tcph->seq);
-
-	switch (cm_node->state) {
-	case NES_CM_STATE_SYN_SENT:
-	case NES_CM_STATE_MPAREQ_SENT:
-		/* Rcvd syn on active open connection*/
-		active_open_err(cm_node, skb, 1);
-		break;
-	case NES_CM_STATE_LISTENING:
-		/* Passive OPEN */
-		if (atomic_read(&cm_node->listener->pend_accepts_cnt) >
-		    cm_node->listener->backlog) {
-			nes_debug(NES_DBG_CM, "drop syn due to backlog "
-				  "pressure\n");
-			cm_backlog_drops++;
-			passive_open_err(cm_node, skb, 0);
-			break;
-		}
-		ret = handle_tcp_options(cm_node, tcph, skb, optionsize,
-					 1);
-		if (ret) {
-			passive_open_err(cm_node, skb, 0);
-			/* drop pkt */
-			break;
-		}
-		cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1;
-		BUG_ON(cm_node->send_entry);
-		cm_node->accept_pend = 1;
-		atomic_inc(&cm_node->listener->pend_accepts_cnt);
-
-		cm_node->state = NES_CM_STATE_SYN_RCVD;
-		send_syn(cm_node, 1, skb);
-		break;
-	case NES_CM_STATE_CLOSED:
-		cleanup_retrans_entry(cm_node);
-		add_ref_cm_node(cm_node);
-		send_reset(cm_node, skb);
-		break;
-	case NES_CM_STATE_TSA:
-	case NES_CM_STATE_ESTABLISHED:
-	case NES_CM_STATE_FIN_WAIT1:
-	case NES_CM_STATE_FIN_WAIT2:
-	case NES_CM_STATE_MPAREQ_RCVD:
-	case NES_CM_STATE_LAST_ACK:
-	case NES_CM_STATE_CLOSING:
-	case NES_CM_STATE_UNKNOWN:
-	default:
-		drop_packet(skb);
-		break;
-	}
-}
-
-static void handle_synack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
-			      struct tcphdr *tcph)
-{
-	int ret;
-	u32 inc_sequence;
-	int optionsize;
-
-	optionsize = (tcph->doff << 2) - sizeof(struct tcphdr);
-	skb_trim(skb, 0);
-	inc_sequence = ntohl(tcph->seq);
-	switch (cm_node->state) {
-	case NES_CM_STATE_SYN_SENT:
-		cleanup_retrans_entry(cm_node);
-		/* active open */
-		if (check_syn(cm_node, tcph, skb))
-			return;
-		cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
-		/* setup options */
-		ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 0);
-		if (ret) {
-			nes_debug(NES_DBG_CM, "cm_node=%p tcp_options failed\n",
-				  cm_node);
-			break;
-		}
-		cleanup_retrans_entry(cm_node);
-		cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1;
-		send_mpa_request(cm_node, skb);
-		cm_node->state = NES_CM_STATE_MPAREQ_SENT;
-		break;
-	case NES_CM_STATE_MPAREQ_RCVD:
-		/* passive open, so should not be here */
-		passive_open_err(cm_node, skb, 1);
-		break;
-	case NES_CM_STATE_LISTENING:
-		cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq);
-		cleanup_retrans_entry(cm_node);
-		cm_node->state = NES_CM_STATE_CLOSED;
-		send_reset(cm_node, skb);
-		break;
-	case NES_CM_STATE_CLOSED:
-		cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq);
-		cleanup_retrans_entry(cm_node);
-		add_ref_cm_node(cm_node);
-		send_reset(cm_node, skb);
-		break;
-	case NES_CM_STATE_ESTABLISHED:
-	case NES_CM_STATE_FIN_WAIT1:
-	case NES_CM_STATE_FIN_WAIT2:
-	case NES_CM_STATE_LAST_ACK:
-	case NES_CM_STATE_TSA:
-	case NES_CM_STATE_CLOSING:
-	case NES_CM_STATE_UNKNOWN:
-	case NES_CM_STATE_MPAREQ_SENT:
-	default:
-		drop_packet(skb);
-		break;
-	}
-}
-
-static int handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
-			  struct tcphdr *tcph)
-{
-	int datasize = 0;
-	u32 inc_sequence;
-	int ret = 0;
-	int optionsize;
-
-	optionsize = (tcph->doff << 2) - sizeof(struct tcphdr);
-
-	if (check_seq(cm_node, tcph, skb))
-		return -EINVAL;
-
-	skb_pull(skb, tcph->doff << 2);
-	inc_sequence = ntohl(tcph->seq);
-	datasize = skb->len;
-	switch (cm_node->state) {
-	case NES_CM_STATE_SYN_RCVD:
-		/* Passive OPEN */
-		cleanup_retrans_entry(cm_node);
-		ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 1);
-		if (ret)
-			break;
-		cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
-		cm_node->state = NES_CM_STATE_ESTABLISHED;
-		if (datasize) {
-			cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
-			handle_rcv_mpa(cm_node, skb);
-		} else { /* rcvd ACK only */
-			dev_kfree_skb_any(skb);
-		}
-		break;
-	case NES_CM_STATE_ESTABLISHED:
-		/* Passive OPEN */
-		cleanup_retrans_entry(cm_node);
-		if (datasize) {
-			cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
-			handle_rcv_mpa(cm_node, skb);
-		} else {
-			drop_packet(skb);
-		}
-		break;
-	case NES_CM_STATE_MPAREQ_SENT:
-		cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
-		if (datasize) {
-			cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
-			handle_rcv_mpa(cm_node, skb);
-		} else { /* Could be just an ack pkt.. */
-			dev_kfree_skb_any(skb);
-		}
-		break;
-	case NES_CM_STATE_LISTENING:
-		cleanup_retrans_entry(cm_node);
-		cm_node->state = NES_CM_STATE_CLOSED;
-		send_reset(cm_node, skb);
-		break;
-	case NES_CM_STATE_CLOSED:
-		cleanup_retrans_entry(cm_node);
-		add_ref_cm_node(cm_node);
-		send_reset(cm_node, skb);
-		break;
-	case NES_CM_STATE_LAST_ACK:
-	case NES_CM_STATE_CLOSING:
-		cleanup_retrans_entry(cm_node);
-		cm_node->state = NES_CM_STATE_CLOSED;
-		cm_node->cm_id->rem_ref(cm_node->cm_id);
-		rem_ref_cm_node(cm_node->cm_core, cm_node);
-		drop_packet(skb);
-		break;
-	case NES_CM_STATE_FIN_WAIT1:
-		cleanup_retrans_entry(cm_node);
-		drop_packet(skb);
-		cm_node->state = NES_CM_STATE_FIN_WAIT2;
-		break;
-	case NES_CM_STATE_SYN_SENT:
-	case NES_CM_STATE_FIN_WAIT2:
-	case NES_CM_STATE_TSA:
-	case NES_CM_STATE_MPAREQ_RCVD:
-	case NES_CM_STATE_UNKNOWN:
-	default:
-		cleanup_retrans_entry(cm_node);
-		drop_packet(skb);
-		break;
-	}
-	return ret;
-}
-
-static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph,
-			      struct sk_buff *skb, int optionsize, int passive)
-{
-	u8 *optionsloc = (u8 *)&tcph[1];
-
-	if (optionsize) {
-		if (process_options(cm_node, optionsloc, optionsize,
-				    (u32)tcph->syn)) {
-			nes_debug(NES_DBG_CM, "%s: Node %p, Sending RESET\n",
-				  __func__, cm_node);
-			if (passive)
-				passive_open_err(cm_node, skb, 1);
-			else
-				active_open_err(cm_node, skb, 1);
-			return 1;
-		}
-	}
-
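-	/* scale the peer's advertised window by its window-scale shift count */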
-	cm_node->tcp_cntxt.snd_wnd = ntohs(tcph->window) <<
-				     cm_node->tcp_cntxt.snd_wscale;
-
-	if (cm_node->tcp_cntxt.snd_wnd > cm_node->tcp_cntxt.max_snd_wnd)
-		cm_node->tcp_cntxt.max_snd_wnd = cm_node->tcp_cntxt.snd_wnd;
-	return 0;
-}
-
-/*
- * active_open_err() sends a reset if the reset flag is set and also
- * generates an ABORTED event.
- */
-static void active_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb,
-			    int reset)
-{
-	cleanup_retrans_entry(cm_node);
-	if (reset) {
-		nes_debug(NES_DBG_CM, "ERROR active err called for cm_node=%p, "
-			  "state=%d\n", cm_node, cm_node->state);
-		add_ref_cm_node(cm_node);
-		send_reset(cm_node, skb);
-	} else {
-		dev_kfree_skb_any(skb);
-	}
-
-	cm_node->state = NES_CM_STATE_CLOSED;
-	create_event(cm_node, NES_CM_EVENT_ABORTED);
-}
-
-/*
- * passive_open_err() will either do a reset() or will free up the skb and
- * remove the cm_node.
- */
-static void passive_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb,
-			     int reset)
-{
-	cleanup_retrans_entry(cm_node);
-	cm_node->state = NES_CM_STATE_CLOSED;
-	if (reset) {
-		nes_debug(NES_DBG_CM, "passive_open_err sending RST for "
-			  "cm_node=%p state =%d\n", cm_node, cm_node->state);
-		send_reset(cm_node, skb);
-	} else {
-		dev_kfree_skb_any(skb);
-		rem_ref_cm_node(cm_node->cm_core, cm_node);
-	}
-}
-
-/*
- * free_retrans_entry() assumes that the retrans_list_lock has
- * been acquired before calling.
- */
-static void free_retrans_entry(struct nes_cm_node *cm_node)
-{
-	struct nes_timer_entry *send_entry;
-
-	send_entry = cm_node->send_entry;
-	if (send_entry) {
-		cm_node->send_entry = NULL;
-		dev_kfree_skb_any(send_entry->skb);
-		kfree(send_entry);
-		rem_ref_cm_node(cm_node->cm_core, cm_node);
-	}
-}
-
-static void cleanup_retrans_entry(struct nes_cm_node *cm_node)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
-	free_retrans_entry(cm_node);
-	spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
-}
-
-/**
- * process_packet
- * Dispatch an incoming TCP segment to the handler for the node's current
- * state; the handlers consume (free or reuse) the skb.
- */
-static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb,
-			   struct nes_cm_core *cm_core)
-{
-	enum nes_tcpip_pkt_type pkt_type = NES_PKT_TYPE_UNKNOWN;
-	struct tcphdr *tcph = tcp_hdr(skb);
-	u32 fin_set = 0;
-	int ret = 0;
-
-	skb_pull(skb, ip_hdr(skb)->ihl << 2);
-
-	nes_debug(NES_DBG_CM, "process_packet: cm_node=%p state =%d syn=%d "
-		  "ack=%d rst=%d fin=%d\n", cm_node, cm_node->state, tcph->syn,
-		  tcph->ack, tcph->rst, tcph->fin);
-
-	if (tcph->rst) {
-		pkt_type = NES_PKT_TYPE_RST;
-	} else if (tcph->syn) {
-		pkt_type = NES_PKT_TYPE_SYN;
-		if (tcph->ack)
-			pkt_type = NES_PKT_TYPE_SYNACK;
-	} else if (tcph->ack) {
-		pkt_type = NES_PKT_TYPE_ACK;
-	}
-	if (tcph->fin)
-		fin_set = 1;
-
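-	/* hand the classified segment to the handler for this node's
-	 * current state */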
-	switch (pkt_type) {
-	case NES_PKT_TYPE_SYN:
-		handle_syn_pkt(cm_node, skb, tcph);
-		break;
-	case NES_PKT_TYPE_SYNACK:
-		handle_synack_pkt(cm_node, skb, tcph);
-		break;
-	case NES_PKT_TYPE_ACK:
-		ret = handle_ack_pkt(cm_node, skb, tcph);
-		if (fin_set && !ret)
-			handle_fin_pkt(cm_node);
-		break;
-	case NES_PKT_TYPE_RST:
-		handle_rst_pkt(cm_node, skb, tcph);
-		break;
-	default:
-		if ((fin_set) && (!check_seq(cm_node, tcph, skb)))
-			handle_fin_pkt(cm_node);
-		drop_packet(skb);
-		break;
-	}
-}
-
-/**
- * mini_cm_listen - create a listen node with params
- */
-static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core,
-			struct nes_vnic *nesvnic, struct nes_cm_info *cm_info)
-{
-	struct nes_cm_listener *listener;
-	unsigned long flags;
-
-	nes_debug(NES_DBG_CM, "Search for 0x%08x : 0x%04x\n",
-		  cm_info->loc_addr, cm_info->loc_port);
-
-	/* cannot have multiple matching listeners */
-	listener = find_listener(cm_core, cm_info->loc_addr, cm_info->loc_port,
-				NES_CM_LISTENER_EITHER_STATE);
-
-	if (listener && listener->listener_state == NES_CM_LISTENER_ACTIVE_STATE) {
-		/* find_listener already incremented the ref count */
-		atomic_dec(&listener->ref_count);
-		nes_debug(NES_DBG_CM, "Not creating listener since it already exists\n");
-		return NULL;
-	}
-
-	if (!listener) {
-		/* create a CM listen node (1/2 node to compare incoming traffic to) */
-		listener = kzalloc(sizeof(*listener), GFP_ATOMIC);
-		if (!listener)
-			return NULL;
-
-		listener->loc_addr = cm_info->loc_addr;
-		listener->loc_port = cm_info->loc_port;
-		listener->reused_node = 0;
-
-		atomic_set(&listener->ref_count, 1);
-	}
-	/* passive case: find_listener already incremented the ref count */
-	else {
-		listener->reused_node = 1;
-	}
-
-	listener->cm_id = cm_info->cm_id;
-	atomic_set(&listener->pend_accepts_cnt, 0);
-	listener->cm_core = cm_core;
-	listener->nesvnic = nesvnic;
-	atomic_inc(&cm_core->node_cnt);
-
-	listener->conn_type = cm_info->conn_type;
-	listener->backlog = cm_info->backlog;
-	listener->listener_state = NES_CM_LISTENER_ACTIVE_STATE;
-
-	if (!listener->reused_node) {
-		spin_lock_irqsave(&cm_core->listen_list_lock, flags);
-		list_add(&listener->list, &cm_core->listen_list.list);
-		spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
-		atomic_inc(&cm_core->listen_node_cnt);
-	}
-
-	nes_debug(NES_DBG_CM, "Api - listen(): addr=0x%08X, port=0x%04x,"
-		  " listener = %p, backlog = %d, cm_id = %p.\n",
-		  cm_info->loc_addr, cm_info->loc_port,
-		  listener, listener->backlog, listener->cm_id);
-
-	return listener;
-}
-
-
-/**
- * mini_cm_connect - make a connection node with params
- */
-static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core,
-					   struct nes_vnic *nesvnic, u16 private_data_len,
-					   void *private_data, struct nes_cm_info *cm_info)
-{
-	int ret = 0;
-	struct nes_cm_node *cm_node;
-	struct nes_cm_listener *loopbackremotelistener;
-	struct nes_cm_node *loopbackremotenode;
-	struct nes_cm_info loopback_cm_info;
-	u8 *start_buff;
-
-	/* create a CM connection node */
-	cm_node = make_cm_node(cm_core, nesvnic, cm_info, NULL);
-	if (!cm_node)
-		return NULL;
-
-	/* set our node side to client (active) side */
-	cm_node->tcp_cntxt.client = 1;
-	cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE;
-
-	if (cm_info->loc_addr == cm_info->rem_addr) {
-		loopbackremotelistener = find_listener(cm_core,
-			cm_node->loc_addr, cm_node->rem_port,
-			NES_CM_LISTENER_ACTIVE_STATE);
-		if (loopbackremotelistener == NULL) {
-			create_event(cm_node, NES_CM_EVENT_ABORTED);
-		} else {
-			loopback_cm_info = *cm_info;
-			loopback_cm_info.loc_port = cm_info->rem_port;
-			loopback_cm_info.rem_port = cm_info->loc_port;
-			loopback_cm_info.cm_id = loopbackremotelistener->cm_id;
-			loopbackremotenode = make_cm_node(cm_core, nesvnic,
-							  &loopback_cm_info, loopbackremotelistener);
-			if (!loopbackremotenode) {
-				rem_ref_cm_node(cm_node->cm_core, cm_node);
-				return NULL;
-			}
-			atomic_inc(&cm_loopbacks);
-			loopbackremotenode->loopbackpartner = cm_node;
-			loopbackremotenode->tcp_cntxt.rcv_wscale =
-				NES_CM_DEFAULT_RCV_WND_SCALE;
-			cm_node->loopbackpartner = loopbackremotenode;
-			memcpy(loopbackremotenode->mpa_frame_buf, private_data,
-			       private_data_len);
-			loopbackremotenode->mpa_frame_size = private_data_len;
-
-			/* we are done handling this state. */
-			/* set node to a TSA state */
-			cm_node->state = NES_CM_STATE_TSA;
-			cm_node->tcp_cntxt.rcv_nxt =
-				loopbackremotenode->tcp_cntxt.loc_seq_num;
-			loopbackremotenode->tcp_cntxt.rcv_nxt =
-				cm_node->tcp_cntxt.loc_seq_num;
-			cm_node->tcp_cntxt.max_snd_wnd =
-				loopbackremotenode->tcp_cntxt.rcv_wnd;
-			loopbackremotenode->tcp_cntxt.max_snd_wnd =
-				cm_node->tcp_cntxt.rcv_wnd;
-			cm_node->tcp_cntxt.snd_wnd =
-				loopbackremotenode->tcp_cntxt.rcv_wnd;
-			loopbackremotenode->tcp_cntxt.snd_wnd =
-				cm_node->tcp_cntxt.rcv_wnd;
-			cm_node->tcp_cntxt.snd_wscale =
-				loopbackremotenode->tcp_cntxt.rcv_wscale;
-			loopbackremotenode->tcp_cntxt.snd_wscale =
-				cm_node->tcp_cntxt.rcv_wscale;
-			loopbackremotenode->state = NES_CM_STATE_MPAREQ_RCVD;
-			create_event(loopbackremotenode, NES_CM_EVENT_MPA_REQ);
-		}
-		return cm_node;
-	}
-
-	start_buff = &cm_node->mpa_frame_buf[0] + sizeof(struct ietf_mpa_v2);
-	cm_node->mpa_frame_size = private_data_len;
-
-	memcpy(start_buff, private_data, private_data_len);
-
-	/* send a syn and goto syn sent state */
-	cm_node->state = NES_CM_STATE_SYN_SENT;
-	ret = send_syn(cm_node, 0, NULL);
-
-	if (ret) {
-		/* error sending the SYN; free up the cm_node struct */
-		nes_debug(NES_DBG_CM, "Api - connect() FAILED: dest "
-			  "addr=0x%08X, port=0x%04x, cm_node=%p, cm_id = %p.\n",
-			  cm_node->rem_addr, cm_node->rem_port, cm_node,
-			  cm_node->cm_id);
-		rem_ref_cm_node(cm_node->cm_core, cm_node);
-		cm_node = NULL;
-	}
-
-	if (cm_node) {
-		nes_debug(NES_DBG_CM, "Api - connect(): dest addr=0x%08X,"
-			  "port=0x%04x, cm_node=%p, cm_id = %p.\n",
-			  cm_node->rem_addr, cm_node->rem_port, cm_node,
-			  cm_node->cm_id);
-	}
-
-	return cm_node;
-}
-
-
-/**
- * mini_cm_accept - accept a connection
- * This function is never called
- */
-static int mini_cm_accept(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node)
-{
-	return 0;
-}
-
-
-/**
- * mini_cm_reject - reject and teardown a connection
- */
-static int mini_cm_reject(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node)
-{
-	int ret = 0;
-	int err = 0;
-	int passive_state;
-	struct nes_cm_event event;
-	struct iw_cm_id *cm_id = cm_node->cm_id;
-	struct nes_cm_node *loopback = cm_node->loopbackpartner;
-
-	nes_debug(NES_DBG_CM, "%s cm_node=%p type=%d state=%d\n",
-		  __func__, cm_node, cm_node->tcp_cntxt.client, cm_node->state);
-
-	if (cm_node->tcp_cntxt.client)
-		return ret;
-	cleanup_retrans_entry(cm_node);
-
-	if (!loopback) {
-		passive_state = atomic_add_return(1, &cm_node->passive_state);
-		if (passive_state == NES_SEND_RESET_EVENT) {
-			cm_node->state = NES_CM_STATE_CLOSED;
-			rem_ref_cm_node(cm_core, cm_node);
-		} else {
-			if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) {
-				rem_ref_cm_node(cm_core, cm_node);
-			} else {
-				ret = send_mpa_reject(cm_node);
-				if (ret) {
-					cm_node->state = NES_CM_STATE_CLOSED;
-					err = send_reset(cm_node, NULL);
-					if (err)
-						WARN_ON(1);
-				} else {
-					cm_id->add_ref(cm_id);
-				}
-			}
-		}
-	} else {
-		cm_node->cm_id = NULL;
-		if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) {
-			rem_ref_cm_node(cm_core, cm_node);
-			rem_ref_cm_node(cm_core, loopback);
-		} else {
-			event.cm_node = loopback;
-			event.cm_info.rem_addr = loopback->rem_addr;
-			event.cm_info.loc_addr = loopback->loc_addr;
-			event.cm_info.rem_port = loopback->rem_port;
-			event.cm_info.loc_port = loopback->loc_port;
-			event.cm_info.cm_id = loopback->cm_id;
-			cm_event_mpa_reject(&event);
-			rem_ref_cm_node(cm_core, cm_node);
-			loopback->state = NES_CM_STATE_CLOSING;
-
-			cm_id = loopback->cm_id;
-			rem_ref_cm_node(cm_core, loopback);
-			cm_id->rem_ref(cm_id);
-		}
-	}
-
-	return ret;
-}
-
-
-/**
- * mini_cm_close
- */
-static int mini_cm_close(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node)
-{
-	int ret = 0;
-
-	if (!cm_core || !cm_node)
-		return -EINVAL;
-
-	switch (cm_node->state) {
-	case NES_CM_STATE_SYN_RCVD:
-	case NES_CM_STATE_SYN_SENT:
-	case NES_CM_STATE_ONE_SIDE_ESTABLISHED:
-	case NES_CM_STATE_ESTABLISHED:
-	case NES_CM_STATE_ACCEPTING:
-	case NES_CM_STATE_MPAREQ_SENT:
-	case NES_CM_STATE_MPAREQ_RCVD:
-		cleanup_retrans_entry(cm_node);
-		send_reset(cm_node, NULL);
-		break;
-	case NES_CM_STATE_CLOSE_WAIT:
-		cm_node->state = NES_CM_STATE_LAST_ACK;
-		send_fin(cm_node, NULL);
-		break;
-	case NES_CM_STATE_FIN_WAIT1:
-	case NES_CM_STATE_FIN_WAIT2:
-	case NES_CM_STATE_LAST_ACK:
-	case NES_CM_STATE_TIME_WAIT:
-	case NES_CM_STATE_CLOSING:
-		ret = -1;
-		break;
-	case NES_CM_STATE_LISTENING:
-		cleanup_retrans_entry(cm_node);
-		send_reset(cm_node, NULL);
-		break;
-	case NES_CM_STATE_MPAREJ_RCVD:
-	case NES_CM_STATE_UNKNOWN:
-	case NES_CM_STATE_INITED:
-	case NES_CM_STATE_CLOSED:
-	case NES_CM_STATE_LISTENER_DESTROYED:
-		ret = rem_ref_cm_node(cm_core, cm_node);
-		break;
-	case NES_CM_STATE_TSA:
-		if (cm_node->send_entry)
-			printk(KERN_ERR "ERROR Close got called from STATE_TSA "
-			       "send_entry=%p\n", cm_node->send_entry);
-		ret = rem_ref_cm_node(cm_core, cm_node);
-		break;
-	}
-	return ret;
-}
-
-
-/**
- * mini_cm_recv_pkt - receive an ETHERNET packet, and process it through the
- * CM node state machine
- */
-static int mini_cm_recv_pkt(struct nes_cm_core *cm_core,
-			    struct nes_vnic *nesvnic, struct sk_buff *skb)
-{
-	struct nes_cm_node *cm_node = NULL;
-	struct nes_cm_listener *listener = NULL;
-	struct iphdr *iph;
-	struct tcphdr *tcph;
-	struct nes_cm_info nfo;
-	int skb_handled = 1;
-	__be32 tmp_daddr, tmp_saddr;
-
-	if (!skb)
-		return 0;
-	if (skb->len < sizeof(struct iphdr) + sizeof(struct tcphdr))
-		return 0;
-
-	iph = (struct iphdr *)skb->data;
-	tcph = (struct tcphdr *)(skb->data + sizeof(struct iphdr));
-
-	nfo.loc_addr = ntohl(iph->daddr);
-	nfo.loc_port = ntohs(tcph->dest);
-	nfo.rem_addr = ntohl(iph->saddr);
-	nfo.rem_port = ntohs(tcph->source);
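-	/* the CM keeps addresses and ports in host byte order */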
-
-	tmp_daddr = cpu_to_be32(iph->daddr);
-	tmp_saddr = cpu_to_be32(iph->saddr);
-
-	nes_debug(NES_DBG_CM, "Received packet: dest=%pI4:0x%04X src=%pI4:0x%04X\n",
-		  &tmp_daddr, tcph->dest, &tmp_saddr, tcph->source);
-
-	do {
-		cm_node = find_node(cm_core,
-				    nfo.rem_port, nfo.rem_addr,
-				    nfo.loc_port, nfo.loc_addr);
-
-		if (!cm_node) {
-			/* only a SYN for a passive open is accepted here */
-			if ((!tcph->syn) || (tcph->ack)) {
-				skb_handled = 0;
-				break;
-			}
-			listener = find_listener(cm_core, nfo.loc_addr,
-						 nfo.loc_port,
-						 NES_CM_LISTENER_ACTIVE_STATE);
-			if (!listener) {
-				nfo.cm_id = NULL;
-				nfo.conn_type = 0;
-				nes_debug(NES_DBG_CM, "Unable to find listener for the pkt\n");
-				skb_handled = 0;
-				break;
-			}
-			nfo.cm_id = listener->cm_id;
-			nfo.conn_type = listener->conn_type;
-			cm_node = make_cm_node(cm_core, nesvnic, &nfo,
-					       listener);
-			if (!cm_node) {
-				nes_debug(NES_DBG_CM, "Unable to allocate "
-					  "node\n");
-				cm_packets_dropped++;
-				atomic_dec(&listener->ref_count);
-				dev_kfree_skb_any(skb);
-				break;
-			}
-			if (!tcph->rst && !tcph->fin) {
-				cm_node->state = NES_CM_STATE_LISTENING;
-			} else {
-				cm_packets_dropped++;
-				rem_ref_cm_node(cm_core, cm_node);
-				dev_kfree_skb_any(skb);
-				break;
-			}
-			add_ref_cm_node(cm_node);
-		} else if (cm_node->state == NES_CM_STATE_TSA) {
-			if (cm_node->nesqp->pau_mode)
-				nes_queue_mgt_skbs(skb, nesvnic, cm_node->nesqp);
-			else {
-				rem_ref_cm_node(cm_core, cm_node);
-				atomic_inc(&cm_accel_dropped_pkts);
-				dev_kfree_skb_any(skb);
-			}
-			break;
-		}
-		skb_reset_network_header(skb);
-		skb_set_transport_header(skb, sizeof(*tcph));
-		skb->len = ntohs(iph->tot_len);
-		process_packet(cm_node, skb, cm_core);
-		rem_ref_cm_node(cm_core, cm_node);
-	} while (0);
-	return skb_handled;
-}
-
-
-/**
- * nes_cm_alloc_core - allocate a top level instance of a cm core
- */
-static struct nes_cm_core *nes_cm_alloc_core(void)
-{
-	struct nes_cm_core *cm_core;
-
-	/* setup the CM core */
-	/* alloc top level core control structure */
-	cm_core = kzalloc(sizeof(*cm_core), GFP_KERNEL);
-	if (!cm_core)
-		return NULL;
-
-	INIT_LIST_HEAD(&cm_core->connected_nodes);
-	timer_setup(&cm_core->tcp_timer, nes_cm_timer_tick, 0);
-
-	cm_core->mtu = NES_CM_DEFAULT_MTU;
-	cm_core->state = NES_CM_STATE_INITED;
-	cm_core->free_tx_pkt_max = NES_CM_DEFAULT_FREE_PKTS;
-
-	atomic_set(&cm_core->events_posted, 0);
-
-	cm_core->api = &nes_cm_api;
-
-	spin_lock_init(&cm_core->ht_lock);
-	spin_lock_init(&cm_core->listen_list_lock);
-
-	INIT_LIST_HEAD(&cm_core->listen_list.list);
-
-	nes_debug(NES_DBG_CM, "Init CM Core completed -- cm_core=%p\n", cm_core);
-
-	nes_debug(NES_DBG_CM, "Enable QUEUE EVENTS\n");
-	cm_core->event_wq = alloc_ordered_workqueue("nesewq", 0);
-	if (!cm_core->event_wq)
-		goto out_free_cmcore;
-	cm_core->post_event = nes_cm_post_event;
-	nes_debug(NES_DBG_CM, "Enable QUEUE DISCONNECTS\n");
-	cm_core->disconn_wq = alloc_ordered_workqueue("nesdwq", 0);
-	if (!cm_core->disconn_wq)
-		goto out_free_wq;
-
-	print_core(cm_core);
-	return cm_core;
-
-out_free_wq:
-	destroy_workqueue(cm_core->event_wq);
-out_free_cmcore:
-	kfree(cm_core);
-	return NULL;
-}
-
-
-/**
- * mini_cm_dealloc_core - deallocate a top level instance of a cm core
- */
-static int mini_cm_dealloc_core(struct nes_cm_core *cm_core)
-{
-	nes_debug(NES_DBG_CM, "De-Alloc CM Core (%p)\n", cm_core);
-
-	if (!cm_core)
-		return -EINVAL;
-
-	barrier();
-
-	if (timer_pending(&cm_core->tcp_timer))
-		del_timer(&cm_core->tcp_timer);
-
-	destroy_workqueue(cm_core->event_wq);
-	destroy_workqueue(cm_core->disconn_wq);
-	nes_debug(NES_DBG_CM, "\n");
-	kfree(cm_core);
-
-	return 0;
-}
-
-
-/**
- * mini_cm_get
- */
-static int mini_cm_get(struct nes_cm_core *cm_core)
-{
-	return cm_core->state;
-}
-
-
-/**
- * mini_cm_set
- */
-static int mini_cm_set(struct nes_cm_core *cm_core, u32 type, u32 value)
-{
-	int ret = 0;
-
-	switch (type) {
-	case NES_CM_SET_PKT_SIZE:
-		cm_core->mtu = value;
-		break;
-	case NES_CM_SET_FREE_PKT_Q_SIZE:
-		cm_core->free_tx_pkt_max = value;
-		break;
-	default:
-		/* unknown set option */
-		ret = -EINVAL;
-	}
-
-	return ret;
-}
-
-
-/**
- * nes_cm_init_tsa_conn - set up the HW QP context; MPA frames must have
- * been successfully exchanged before this is called
- */
-static int nes_cm_init_tsa_conn(struct nes_qp *nesqp, struct nes_cm_node *cm_node)
-{
-	int ret = 0;
-
-	if (!nesqp)
-		return -EINVAL;
-
-	nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_IPV4 |
-						  NES_QPCONTEXT_MISC_NO_NAGLE | NES_QPCONTEXT_MISC_DO_NOT_FRAG |
-						  NES_QPCONTEXT_MISC_DROS);
-
-	if (cm_node->tcp_cntxt.snd_wscale || cm_node->tcp_cntxt.rcv_wscale)
-		nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WSCALE);
-
-	nesqp->nesqp_context->misc2 |= cpu_to_le32(64 << NES_QPCONTEXT_MISC2_TTL_SHIFT);
-
-	nesqp->nesqp_context->misc2 |= cpu_to_le32(
-		cm_node->tos << NES_QPCONTEXT_MISC2_TOS_SHIFT);
-
-	nesqp->nesqp_context->mss |= cpu_to_le32(((u32)cm_node->tcp_cntxt.mss) << 16);
-
-	nesqp->nesqp_context->tcp_state_flow_label |= cpu_to_le32(
-		(u32)NES_QPCONTEXT_TCPSTATE_EST << NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT);
-
-	nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32(
-		(cm_node->tcp_cntxt.snd_wscale << NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT) &
-		NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK);
-
-	nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32(
-		(cm_node->tcp_cntxt.rcv_wscale << NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT) &
-		NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK);
-
-	nesqp->nesqp_context->keepalive = cpu_to_le32(0x80);
-	nesqp->nesqp_context->ts_recent = 0;
-	nesqp->nesqp_context->ts_age = 0;
-	nesqp->nesqp_context->snd_nxt = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
-	nesqp->nesqp_context->snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.snd_wnd);
-	nesqp->nesqp_context->rcv_nxt = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt);
-	nesqp->nesqp_context->rcv_wnd = cpu_to_le32(cm_node->tcp_cntxt.rcv_wnd <<
-						    cm_node->tcp_cntxt.rcv_wscale);
-	nesqp->nesqp_context->snd_max = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
-	nesqp->nesqp_context->snd_una = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
-	nesqp->nesqp_context->srtt = 0;
-	nesqp->nesqp_context->rttvar = cpu_to_le32(0x6);
-	nesqp->nesqp_context->ssthresh = cpu_to_le32(0x3FFFC000);
-	nesqp->nesqp_context->cwnd = cpu_to_le32(2 * cm_node->tcp_cntxt.mss);
-	nesqp->nesqp_context->snd_wl1 = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt);
-	nesqp->nesqp_context->snd_wl2 = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
-	nesqp->nesqp_context->max_snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.max_snd_wnd);
-
-	nes_debug(NES_DBG_CM, "QP%u: rcv_nxt = 0x%08X, snd_nxt = 0x%08X,"
-		  " Setting MSS to %u, PDWscale = 0x%08X, rcv_wnd = %u, context misc = 0x%08X.\n",
-		  nesqp->hwqp.qp_id, le32_to_cpu(nesqp->nesqp_context->rcv_nxt),
-		  le32_to_cpu(nesqp->nesqp_context->snd_nxt),
-		  cm_node->tcp_cntxt.mss, le32_to_cpu(nesqp->nesqp_context->pd_index_wscale),
-		  le32_to_cpu(nesqp->nesqp_context->rcv_wnd),
-		  le32_to_cpu(nesqp->nesqp_context->misc));
-	nes_debug(NES_DBG_CM, "  snd_wnd  = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->snd_wnd));
-	nes_debug(NES_DBG_CM, "  snd_cwnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->cwnd));
-	nes_debug(NES_DBG_CM, "  max_swnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->max_snd_wnd));
-
-	nes_debug(NES_DBG_CM, "Change cm_node state to TSA\n");
-	cm_node->state = NES_CM_STATE_TSA;
-
-	return ret;
-}
-
-
-/**
- * nes_cm_disconn
- */
-int nes_cm_disconn(struct nes_qp *nesqp)
-{
-	struct disconn_work *work;
-
-	work = kzalloc(sizeof(*work), GFP_ATOMIC);
-	if (!work)
-		return -ENOMEM;  /* Timer will clean up */
-
-	nes_add_ref(&nesqp->ibqp);
-	work->nesqp = nesqp;
-	INIT_WORK(&work->work, nes_disconnect_worker);
-	queue_work(g_cm_core->disconn_wq, &work->work);
-	return 0;
-}
-
-
-/**
- * nes_disconnect_worker
- */
-static void nes_disconnect_worker(struct work_struct *work)
-{
-	struct disconn_work *dwork = container_of(work, struct disconn_work, work);
-	struct nes_qp *nesqp = dwork->nesqp;
-
-	kfree(dwork);
-	nes_debug(NES_DBG_CM, "processing AEQE id 0x%04X for QP%u.\n",
-		  nesqp->last_aeq, nesqp->hwqp.qp_id);
-	nes_cm_disconn_true(nesqp);
-	nes_rem_ref(&nesqp->ibqp);
-}
-
-
-/**
- * nes_cm_disconn_true
- */
-static int nes_cm_disconn_true(struct nes_qp *nesqp)
-{
-	unsigned long flags;
-	int ret = 0;
-	struct iw_cm_id *cm_id;
-	struct iw_cm_event cm_event;
-	struct nes_vnic *nesvnic;
-	u16 last_ae;
-	u8 original_hw_tcp_state;
-	u8 original_ibqp_state;
-	int disconn_status = 0;
-	int issue_disconn = 0;
-	int issue_close = 0;
-	int issue_flush = 0;
-	u32 flush_q = NES_CQP_FLUSH_RQ;
-	struct ib_event ibevent;
-
-	if (!nesqp) {
-		nes_debug(NES_DBG_CM, "disconnect_worker nesqp is NULL\n");
-		return -1;
-	}
-
-	spin_lock_irqsave(&nesqp->lock, flags);
-	cm_id = nesqp->cm_id;
-	/* make sure we haven't already closed this connection */
-	if (!cm_id) {
-		nes_debug(NES_DBG_CM, "QP%u disconnect_worker cmid is NULL\n",
-			  nesqp->hwqp.qp_id);
-		spin_unlock_irqrestore(&nesqp->lock, flags);
-		return -1;
-	}
-
-	nesvnic = to_nesvnic(nesqp->ibqp.device);
-	nes_debug(NES_DBG_CM, "Disconnecting QP%u\n", nesqp->hwqp.qp_id);
-
-	original_hw_tcp_state = nesqp->hw_tcp_state;
-	original_ibqp_state = nesqp->ibqp_state;
-	last_ae = nesqp->last_aeq;
-
-	if (nesqp->term_flags) {
-		issue_disconn = 1;
-		issue_close = 1;
-		nesqp->cm_id = NULL;
-		del_timer(&nesqp->terminate_timer);
-		if (nesqp->flush_issued == 0) {
-			nesqp->flush_issued = 1;
-			issue_flush = 1;
-		}
-	} else if ((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) ||
-			((original_ibqp_state == IB_QPS_RTS) &&
-			(last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) {
-		issue_disconn = 1;
-		if (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET)
-			disconn_status = -ECONNRESET;
-	}
-
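-	/* issue a close once the HW TCP state shows the connection is gone
-	 * or the last AE indicates an LLP close/reset */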
-	if (((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSED) ||
-		 (original_hw_tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT) ||
-		 (last_ae == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) ||
-		 (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) {
-		issue_close = 1;
-		nesqp->cm_id = NULL;
-		if (nesqp->flush_issued == 0) {
-			nesqp->flush_issued = 1;
-			issue_flush = 1;
-		}
-	}
-
-	spin_unlock_irqrestore(&nesqp->lock, flags);
-
-	if ((issue_flush) && (nesqp->destroyed == 0)) {
-		/* Flush the queue(s) */
-		if (nesqp->hw_iwarp_state >= NES_AEQE_IWARP_STATE_TERMINATE)
-			flush_q |= NES_CQP_FLUSH_SQ;
-		flush_wqes(nesvnic->nesdev, nesqp, flush_q, 1);
-
-		if (nesqp->term_flags) {
-			ibevent.device = nesqp->ibqp.device;
-			ibevent.event = nesqp->terminate_eventtype;
-			ibevent.element.qp = &nesqp->ibqp;
-			if (nesqp->ibqp.event_handler)
-				nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context);
-		}
-	}
-
-	if ((cm_id) && (cm_id->event_handler)) {
-		if (issue_disconn) {
-			atomic_inc(&cm_disconnects);
-			cm_event.event = IW_CM_EVENT_DISCONNECT;
-			cm_event.status = disconn_status;
-			cm_event.local_addr = cm_id->m_local_addr;
-			cm_event.remote_addr = cm_id->m_remote_addr;
-			cm_event.private_data = NULL;
-			cm_event.private_data_len = 0;
-
-			nes_debug(NES_DBG_CM, "Generating a CM Disconnect Event"
-				  " for  QP%u, SQ Head = %u, SQ Tail = %u. "
-				  "cm_id = %p, refcount = %u.\n",
-				  nesqp->hwqp.qp_id, nesqp->hwqp.sq_head,
-				  nesqp->hwqp.sq_tail, cm_id,
-				  atomic_read(&nesqp->refcount));
-
-			ret = cm_id->event_handler(cm_id, &cm_event);
-			if (ret)
-				nes_debug(NES_DBG_CM, "OFA CM event_handler "
-					  "returned, ret=%d\n", ret);
-		}
-
-		if (issue_close) {
-			atomic_inc(&cm_closes);
-			nes_disconnect(nesqp, 1);
-
-			cm_id->provider_data = nesqp;
-			/* Send up the close complete event */
-			cm_event.event = IW_CM_EVENT_CLOSE;
-			cm_event.status = 0;
-			cm_event.provider_data = cm_id->provider_data;
-			cm_event.local_addr = cm_id->m_local_addr;
-			cm_event.remote_addr = cm_id->m_remote_addr;
-			cm_event.private_data = NULL;
-			cm_event.private_data_len = 0;
-
-			ret = cm_id->event_handler(cm_id, &cm_event);
-			if (ret)
-				nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
-
-			cm_id->rem_ref(cm_id);
-		}
-	}
-
-	return 0;
-}
-
-
-/**
- * nes_disconnect
- */
-static int nes_disconnect(struct nes_qp *nesqp, int abrupt)
-{
-	int ret = 0;
-	struct nes_vnic *nesvnic;
-	struct nes_device *nesdev;
-	struct nes_ib_device *nesibdev;
-
-	nesvnic = to_nesvnic(nesqp->ibqp.device);
-	if (!nesvnic)
-		return -EINVAL;
-
-	nesdev = nesvnic->nesdev;
-	nesibdev = nesvnic->nesibdev;
-
-	nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
-			netdev_refcnt_read(nesvnic->netdev));
-
-	if (nesqp->active_conn) {
-
-		/* indicate this connection is NOT active */
-		nesqp->active_conn = 0;
-	} else {
-		/* Need to free the Last Streaming Mode Message */
-		if (nesqp->ietf_frame) {
-			if (nesqp->lsmm_mr)
-				nesibdev->ibdev.dereg_mr(nesqp->lsmm_mr);
-			pci_free_consistent(nesdev->pcidev,
-					    nesqp->private_data_len + nesqp->ietf_frame_size,
-					    nesqp->ietf_frame, nesqp->ietf_frame_pbase);
-		}
-	}
-
-	/* close the CM node down if it is still active */
-	if (nesqp->cm_node) {
-		nes_debug(NES_DBG_CM, "Call close API\n");
-
-		g_cm_core->api->close(g_cm_core, nesqp->cm_node);
-	}
-
-	return ret;
-}
-
-
-/**
- * nes_accept
- */
-int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
-{
-	u64 u64temp;
-	struct ib_qp *ibqp;
-	struct nes_qp *nesqp;
-	struct nes_vnic *nesvnic;
-	struct nes_device *nesdev;
-	struct nes_cm_node *cm_node;
-	struct nes_adapter *adapter;
-	struct ib_qp_attr attr;
-	struct iw_cm_event cm_event;
-	struct nes_hw_qp_wqe *wqe;
-	struct nes_v4_quad nes_quad;
-	u32 crc_value;
-	int ret;
-	int passive_state;
-	struct ib_mr *ibmr = NULL;
-	struct nes_pd *nespd;
-	u64 tagged_offset;
-	u8 mpa_frame_offset = 0;
-	struct ietf_mpa_v2 *mpa_v2_frame;
-	u8 start_addr = 0;
-	u8 *start_ptr = &start_addr;
-	u8 **start_buff = &start_ptr;
-	u16 buff_len = 0;
-	struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr;
-	struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr;
-
-	ibqp = nes_get_qp(cm_id->device, conn_param->qpn);
-	if (!ibqp)
-		return -EINVAL;
-
-	/* get all our handles */
-	nesqp = to_nesqp(ibqp);
-	nesvnic = to_nesvnic(nesqp->ibqp.device);
-	nesdev = nesvnic->nesdev;
-	adapter = nesdev->nesadapter;
-
-	cm_node = (struct nes_cm_node *)cm_id->provider_data;
-	nes_debug(NES_DBG_CM, "nes_accept: cm_node= %p nesvnic=%p, netdev=%p,"
-		"%s\n", cm_node, nesvnic, nesvnic->netdev,
-		nesvnic->netdev->name);
-
-	if (NES_CM_STATE_LISTENER_DESTROYED == cm_node->state) {
-		if (cm_node->loopbackpartner)
-			rem_ref_cm_node(cm_node->cm_core, cm_node->loopbackpartner);
-		rem_ref_cm_node(cm_node->cm_core, cm_node);
-		return -EINVAL;
-	}
-
-	passive_state = atomic_add_return(1, &cm_node->passive_state);
-	if (passive_state == NES_SEND_RESET_EVENT) {
-		rem_ref_cm_node(cm_node->cm_core, cm_node);
-		return -ECONNRESET;
-	}
-	/* associate the node with the QP */
-	nesqp->cm_node = (void *)cm_node;
-	cm_node->nesqp = nesqp;
-
-
-	nes_debug(NES_DBG_CM, "QP%u, cm_node=%p, jiffies = %lu listener = %p\n",
-		nesqp->hwqp.qp_id, cm_node, jiffies, cm_node->listener);
-	atomic_inc(&cm_accepts);
-
-	nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
-			netdev_refcnt_read(nesvnic->netdev));
-
-	nesqp->ietf_frame_size = sizeof(struct ietf_mpa_v2);
-	/* allocate the ietf frame and space for private data */
-	nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev,
-						 nesqp->ietf_frame_size + conn_param->private_data_len,
-						 &nesqp->ietf_frame_pbase);
-
-	if (!nesqp->ietf_frame) {
-		nes_debug(NES_DBG_CM, "Unable to allocate memory for private data\n");
-		return -ENOMEM;
-	}
-	mpa_v2_frame = (struct ietf_mpa_v2 *)nesqp->ietf_frame;
-
-	if (cm_node->mpa_frame_rev == IETF_MPA_V1)
-		mpa_frame_offset = 4;
-
-	if (cm_node->mpa_frame_rev == IETF_MPA_V1 ||
-			cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) {
-		record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord);
-	}
-
-	memcpy(mpa_v2_frame->priv_data, conn_param->private_data,
-	       conn_param->private_data_len);
-
-	cm_build_mpa_frame(cm_node, start_buff, &buff_len, nesqp->ietf_frame, MPA_KEY_REPLY);
-	nesqp->private_data_len = conn_param->private_data_len;
-
-	/* setup our first outgoing iWarp send WQE (the IETF frame response) */
-	wqe = &nesqp->hwqp.sq_vbase[0];
-
-	if (raddr->sin_addr.s_addr != laddr->sin_addr.s_addr) {
-		u64temp = (unsigned long)nesqp;
-		nespd = nesqp->nespd;
-		tagged_offset = (u64)(unsigned long)*start_buff;
-		ibmr = nes_reg_phys_mr(&nespd->ibpd,
-				nesqp->ietf_frame_pbase + mpa_frame_offset,
-				buff_len, IB_ACCESS_LOCAL_WRITE,
-				&tagged_offset);
-		if (IS_ERR(ibmr)) {
-			nes_debug(NES_DBG_CM, "Unable to register memory region "
-				  "for LSMM for cm_node = %p\n",
-				  cm_node);
-			pci_free_consistent(nesdev->pcidev,
-					    nesqp->private_data_len + nesqp->ietf_frame_size,
-					    nesqp->ietf_frame, nesqp->ietf_frame_pbase);
-			return PTR_ERR(ibmr);
-		}
-
-		ibmr->pd = &nespd->ibpd;
-		ibmr->device = nespd->ibpd.device;
-		nesqp->lsmm_mr = ibmr;
-
-		u64temp |= NES_SW_CONTEXT_ALIGN >> 1;
-		set_wqe_64bit_value(wqe->wqe_words,
-				    NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX,
-				    u64temp);
-		wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] =
-			cpu_to_le32(NES_IWARP_SQ_WQE_STREAMING |
-				    NES_IWARP_SQ_WQE_WRPDU);
-		wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] =
-			cpu_to_le32(buff_len);
-		set_wqe_64bit_value(wqe->wqe_words,
-				    NES_IWARP_SQ_WQE_FRAG0_LOW_IDX,
-				    (u64)(unsigned long)(*start_buff));
-		wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] =
-			cpu_to_le32(buff_len);
-		wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = ibmr->lkey;
-		if (nesqp->sq_kmapped) {
-			nesqp->sq_kmapped = 0;
-			kunmap(nesqp->page);
-		}
-
-		nesqp->nesqp_context->ird_ord_sizes |=
-			cpu_to_le32(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT |
-				    NES_QPCONTEXT_ORDIRD_WRPDU);
-	} else {
-		nesqp->nesqp_context->ird_ord_sizes |=
-			cpu_to_le32(NES_QPCONTEXT_ORDIRD_WRPDU);
-	}
-	nesqp->skip_lsmm = 1;
-
-	/* Cache the cm_id in the qp */
-	nesqp->cm_id = cm_id;
-	cm_node->cm_id = cm_id;
-
-	/*  nesqp->cm_node = (void *)cm_id->provider_data; */
-	cm_id->provider_data = nesqp;
-	nesqp->active_conn = 0;
-
-	if (cm_node->state == NES_CM_STATE_TSA)
-		nes_debug(NES_DBG_CM, "Already state = TSA for cm_node=%p\n",
-			  cm_node);
-
-	nes_cm_init_tsa_conn(nesqp, cm_node);
-
-	nesqp->nesqp_context->tcpPorts[0] =
-				cpu_to_le16(cm_node->loc_port);
-	nesqp->nesqp_context->tcpPorts[1] =
-				cpu_to_le16(cm_node->rem_port);
-
-	nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->rem_addr);
-
-	nesqp->nesqp_context->misc2 |= cpu_to_le32(
-		(u32)PCI_FUNC(nesdev->pcidev->devfn) <<
-		NES_QPCONTEXT_MISC2_SRC_IP_SHIFT);
-
-	nesqp->nesqp_context->arp_index_vlan |=
-		cpu_to_le32(nes_arp_table(nesdev,
-					  le32_to_cpu(nesqp->nesqp_context->ip0), NULL,
-					  NES_ARP_RESOLVE) << 16);
-
-	nesqp->nesqp_context->ts_val_delta = cpu_to_le32(
-		jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW));
-
-	nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id);
-
-	nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(
-		((u32)1 << NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT));
-	nesqp->nesqp_context->ird_ord_sizes |=
-		cpu_to_le32((u32)cm_node->ord_size);
-
-	memset(&nes_quad, 0, sizeof(nes_quad));
-	nes_quad.DstIpAdrIndex =
-		cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24);
-	nes_quad.SrcIpadr = htonl(cm_node->rem_addr);
-	nes_quad.TcpPorts[0] = htons(cm_node->rem_port);
-	nes_quad.TcpPorts[1] = htons(cm_node->loc_port);
-
-	/* Produce hash key */
-	crc_value = get_crc_value(&nes_quad);
-	nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff);
-	nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, CRC = 0x%08X\n",
-		  nesqp->hte_index, nesqp->hte_index & adapter->hte_index_mask);
-
-	nesqp->hte_index &= adapter->hte_index_mask;
-	nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index);
-
-	cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node);
-
-	nes_debug(NES_DBG_CM, "QP%u, Destination IP = 0x%08X:0x%04X, local = "
-		  "0x%08X:0x%04X, rcv_nxt=0x%08X, snd_nxt=0x%08X, mpa + "
-		  "private data length=%u.\n", nesqp->hwqp.qp_id,
-		  ntohl(raddr->sin_addr.s_addr), ntohs(raddr->sin_port),
-		  ntohl(laddr->sin_addr.s_addr), ntohs(laddr->sin_port),
-		  le32_to_cpu(nesqp->nesqp_context->rcv_nxt),
-		  le32_to_cpu(nesqp->nesqp_context->snd_nxt),
-		  buff_len);
-
-	/* notify OF layer that accept event was successful */
-	cm_id->add_ref(cm_id);
-	nes_add_ref(&nesqp->ibqp);
-
-	cm_event.event = IW_CM_EVENT_ESTABLISHED;
-	cm_event.status = 0;
-	cm_event.provider_data = (void *)nesqp;
-	cm_event.local_addr = cm_id->m_local_addr;
-	cm_event.remote_addr = cm_id->m_remote_addr;
-	cm_event.private_data = NULL;
-	cm_event.private_data_len = 0;
-	cm_event.ird = cm_node->ird_size;
-	cm_event.ord = cm_node->ord_size;
-
-	ret = cm_id->event_handler(cm_id, &cm_event);
-	attr.qp_state = IB_QPS_RTS;
-	nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL);
-	if (cm_node->loopbackpartner) {
-		cm_node->loopbackpartner->mpa_frame_size =
-			nesqp->private_data_len;
-		/* copy entire MPA frame to our cm_node's frame */
-		memcpy(cm_node->loopbackpartner->mpa_frame_buf,
-		       conn_param->private_data, conn_param->private_data_len);
-		create_event(cm_node->loopbackpartner, NES_CM_EVENT_CONNECTED);
-	}
-	if (ret)
-		printk(KERN_ERR "%s[%u] OFA CM event_handler returned, "
-		       "ret=%d\n", __func__, __LINE__, ret);
-
-	return 0;
-}
-
-
-/**
- * nes_reject
- */
-int nes_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
-{
-	struct nes_cm_node *cm_node;
-	struct nes_cm_node *loopback;
-	struct nes_cm_core *cm_core;
-	u8 *start_buff;
-
-	atomic_inc(&cm_rejects);
-	cm_node = (struct nes_cm_node *)cm_id->provider_data;
-	loopback = cm_node->loopbackpartner;
-	cm_core = cm_node->cm_core;
-	cm_node->cm_id = cm_id;
-
-	if (pdata_len + sizeof(struct ietf_mpa_v2) > MAX_CM_BUFFER)
-		return -EINVAL;
-
-	if (loopback) {
-		memcpy(&loopback->mpa_frame.priv_data, pdata, pdata_len);
-		loopback->mpa_frame.priv_data_len = pdata_len;
-		loopback->mpa_frame_size = pdata_len;
-	} else {
-		start_buff = &cm_node->mpa_frame_buf[0] + sizeof(struct ietf_mpa_v2);
-		cm_node->mpa_frame_size = pdata_len;
-		memcpy(start_buff, pdata, pdata_len);
-	}
-	return cm_core->api->reject(cm_core, cm_node);
-}
-
-
-/**
- * nes_connect
- * setup and launch cm connect node
- */
-int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
-{
-	struct ib_qp *ibqp;
-	struct nes_qp *nesqp;
-	struct nes_vnic *nesvnic;
-	struct nes_device *nesdev;
-	struct nes_cm_node *cm_node;
-	struct nes_cm_info cm_info;
-	int apbvt_set = 0;
-	struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr;
-	struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr;
-
-	if (cm_id->remote_addr.ss_family != AF_INET)
-		return -ENOSYS;
-	ibqp = nes_get_qp(cm_id->device, conn_param->qpn);
-	if (!ibqp)
-		return -EINVAL;
-	nesqp = to_nesqp(ibqp);
-	if (!nesqp)
-		return -EINVAL;
-	nesvnic = to_nesvnic(nesqp->ibqp.device);
-	if (!nesvnic)
-		return -EINVAL;
-	nesdev = nesvnic->nesdev;
-	if (!nesdev)
-		return -EINVAL;
-
-	if (!laddr->sin_port || !raddr->sin_port)
-		return -EINVAL;
-
-	nes_debug(NES_DBG_CM, "QP%u, current IP = 0x%08X, Destination IP = "
-		  "0x%08X:0x%04X, local = 0x%08X:0x%04X.\n", nesqp->hwqp.qp_id,
-		  ntohl(nesvnic->local_ipaddr), ntohl(raddr->sin_addr.s_addr),
-		  ntohs(raddr->sin_port), ntohl(laddr->sin_addr.s_addr),
-		  ntohs(laddr->sin_port));
-
-	atomic_inc(&cm_connects);
-	nesqp->active_conn = 1;
-
-	/* cache the cm_id in the qp */
-	nesqp->cm_id = cm_id;
-	cm_id->provider_data = nesqp;
-	nesqp->private_data_len = conn_param->private_data_len;
-
-	nes_debug(NES_DBG_CM, "requested ord = 0x%08X.\n", (u32)conn_param->ord);
-	nes_debug(NES_DBG_CM, "mpa private data len =%u\n",
-		  conn_param->private_data_len);
-
-	/* set up the connection params for the node */
-	cm_info.loc_addr = ntohl(laddr->sin_addr.s_addr);
-	cm_info.loc_port = ntohs(laddr->sin_port);
-	cm_info.rem_addr = ntohl(raddr->sin_addr.s_addr);
-	cm_info.rem_port = ntohs(raddr->sin_port);
-	cm_info.cm_id = cm_id;
-	cm_info.conn_type = NES_CM_IWARP_CONN_TYPE;
-
-	if (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr) {
-		nes_manage_apbvt(nesvnic, cm_info.loc_port,
-				 PCI_FUNC(nesdev->pcidev->devfn),
-				 NES_MANAGE_APBVT_ADD);
-		apbvt_set = 1;
-	}
-
-	cm_id->add_ref(cm_id);
-
-	/* create a connect CM node connection */
-	cm_node = g_cm_core->api->connect(g_cm_core, nesvnic,
-					  conn_param->private_data_len, (void *)conn_param->private_data,
-					  &cm_info);
-	if (!cm_node) {
-		if (apbvt_set)
-			nes_manage_apbvt(nesvnic, cm_info.loc_port,
-					 PCI_FUNC(nesdev->pcidev->devfn),
-					 NES_MANAGE_APBVT_DEL);
-
-		nes_debug(NES_DBG_NLMSG, "Delete loc_port = %04X\n",
-			  cm_info.loc_port);
-		cm_id->rem_ref(cm_id);
-		return -ENOMEM;
-	}
-
-	record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord);
-	if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO &&
-				cm_node->ord_size == 0)
-		cm_node->ord_size = 1;
-
-	cm_node->apbvt_set = apbvt_set;
-	cm_node->tos = cm_id->tos;
-	nesqp->cm_node = cm_node;
-	cm_node->nesqp = nesqp;
-	nes_add_ref(&nesqp->ibqp);
-
-	return 0;
-}
-
-
-/**
- * nes_create_listen
- */
-int nes_create_listen(struct iw_cm_id *cm_id, int backlog)
-{
-	struct nes_vnic *nesvnic;
-	struct nes_cm_listener *cm_node;
-	struct nes_cm_info cm_info;
-	int err;
-	struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr;
-
-	nes_debug(NES_DBG_CM, "cm_id = %p, local port = 0x%04X.\n",
-		  cm_id, ntohs(laddr->sin_port));
-
-	if (cm_id->m_local_addr.ss_family != AF_INET)
-		return -ENOSYS;
-	nesvnic = to_nesvnic(cm_id->device);
-	if (!nesvnic)
-		return -EINVAL;
-
-	nes_debug(NES_DBG_CM, "nesvnic=%p, netdev=%p, %s\n",
-			nesvnic, nesvnic->netdev, nesvnic->netdev->name);
-
-	nes_debug(NES_DBG_CM, "nesvnic->local_ipaddr=0x%08x, sin_addr.s_addr=0x%08x\n",
-			nesvnic->local_ipaddr, laddr->sin_addr.s_addr);
-
-	/* setup listen params in our api call struct */
-	cm_info.loc_addr = ntohl(nesvnic->local_ipaddr);
-	cm_info.loc_port = ntohs(laddr->sin_port);
-	cm_info.backlog = backlog;
-	cm_info.cm_id = cm_id;
-
-	cm_info.conn_type = NES_CM_IWARP_CONN_TYPE;
-
-	cm_node = g_cm_core->api->listen(g_cm_core, nesvnic, &cm_info);
-	if (!cm_node) {
-		printk(KERN_ERR "%s[%u] Error returned from listen API call\n",
-		       __func__, __LINE__);
-		return -ENOMEM;
-	}
-
-	cm_id->provider_data = cm_node;
-	cm_node->tos = cm_id->tos;
-
-	if (!cm_node->reused_node) {
-		err = nes_manage_apbvt(nesvnic, cm_node->loc_port,
-				       PCI_FUNC(nesvnic->nesdev->pcidev->devfn),
-				       NES_MANAGE_APBVT_ADD);
-		if (err) {
-			printk(KERN_ERR "nes_manage_apbvt call returned %d.\n",
-			       err);
-			g_cm_core->api->stop_listener(g_cm_core, (void *)cm_node);
-			return err;
-		}
-		atomic_inc(&cm_listens_created);
-	}
-
-	cm_id->add_ref(cm_id);
-	cm_id->provider_data = (void *)cm_node;
-
-
-	return 0;
-}
-
-
-/**
- * nes_destroy_listen
- */
-int nes_destroy_listen(struct iw_cm_id *cm_id)
-{
-	if (cm_id->provider_data)
-		g_cm_core->api->stop_listener(g_cm_core, cm_id->provider_data);
-	else
-		nes_debug(NES_DBG_CM, "cm_id->provider_data was NULL\n");
-
-	cm_id->rem_ref(cm_id);
-
-	return 0;
-}
-
-
-/**
- * nes_cm_recv
- */
-int nes_cm_recv(struct sk_buff *skb, struct net_device *netdevice)
-{
-	int rc = 0;
-
-	cm_packets_received++;
-	if ((g_cm_core) && (g_cm_core->api))
-		rc = g_cm_core->api->recv_pkt(g_cm_core, netdev_priv(netdevice), skb);
-	else
-		nes_debug(NES_DBG_CM, "Unable to process packet for CM,"
-			  " cm is not setup properly.\n");
-
-	return rc;
-}
-
-
-/**
- * nes_cm_start
- * Start and init a cm core module
- */
-int nes_cm_start(void)
-{
-	nes_debug(NES_DBG_CM, "\n");
-	/* create the primary CM core, pass this handle to subsequent core inits */
-	g_cm_core = nes_cm_alloc_core();
-	if (g_cm_core)
-		return 0;
-	else
-		return -ENOMEM;
-}
-
-
-/**
- * nes_cm_stop
- * stop and dealloc all cm core instances
- */
-int nes_cm_stop(void)
-{
-	g_cm_core->api->destroy_cm_core(g_cm_core);
-	return 0;
-}
-
-
-/**
- * cm_event_connected
- * handle a connected event, setup QPs and HW
- */
-static void cm_event_connected(struct nes_cm_event *event)
-{
-	struct nes_qp *nesqp;
-	struct nes_vnic *nesvnic;
-	struct nes_device *nesdev;
-	struct nes_cm_node *cm_node;
-	struct nes_adapter *nesadapter;
-	struct ib_qp_attr attr;
-	struct iw_cm_id *cm_id;
-	struct iw_cm_event cm_event;
-	struct nes_v4_quad nes_quad;
-	u32 crc_value;
-	int ret;
-	struct sockaddr_in *laddr;
-	struct sockaddr_in *raddr;
-	struct sockaddr_in *cm_event_laddr;
-
-	/* get all our handles */
-	cm_node = event->cm_node;
-	cm_id = cm_node->cm_id;
-	nes_debug(NES_DBG_CM, "cm_event_connected - %p - cm_id = %p\n", cm_node, cm_id);
-	nesqp = (struct nes_qp *)cm_id->provider_data;
-	nesvnic = to_nesvnic(nesqp->ibqp.device);
-	nesdev = nesvnic->nesdev;
-	nesadapter = nesdev->nesadapter;
-	laddr = (struct sockaddr_in *)&cm_id->m_local_addr;
-	raddr = (struct sockaddr_in *)&cm_id->m_remote_addr;
-	cm_event_laddr = (struct sockaddr_in *)&cm_event.local_addr;
-
-	if (nesqp->destroyed)
-		return;
-	atomic_inc(&cm_connecteds);
-	nes_debug(NES_DBG_CM, "QP%u attempting to connect to  0x%08X:0x%04X on"
-		  " local port 0x%04X. jiffies = %lu.\n",
-		  nesqp->hwqp.qp_id, ntohl(raddr->sin_addr.s_addr),
-		  ntohs(raddr->sin_port), ntohs(laddr->sin_port), jiffies);
-
-	nes_cm_init_tsa_conn(nesqp, cm_node);
-
-	/* set the QP tsa context */
-	nesqp->nesqp_context->tcpPorts[0] =
-			cpu_to_le16(cm_node->loc_port);
-	nesqp->nesqp_context->tcpPorts[1] =
-			cpu_to_le16(cm_node->rem_port);
-	nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->rem_addr);
-
-	nesqp->nesqp_context->misc2 |= cpu_to_le32(
-			(u32)PCI_FUNC(nesdev->pcidev->devfn) <<
-			NES_QPCONTEXT_MISC2_SRC_IP_SHIFT);
-	nesqp->nesqp_context->arp_index_vlan |= cpu_to_le32(
-			nes_arp_table(nesdev,
-			le32_to_cpu(nesqp->nesqp_context->ip0),
-			NULL, NES_ARP_RESOLVE) << 16);
-	nesqp->nesqp_context->ts_val_delta = cpu_to_le32(
-			jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW));
-	nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id);
-	nesqp->nesqp_context->ird_ord_sizes |=
-			cpu_to_le32((u32)1 <<
-			NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT);
-	nesqp->nesqp_context->ird_ord_sizes |=
-			cpu_to_le32((u32)cm_node->ord_size);
-
-	/* Adjust tail for not having a LSMM */
-	/*nesqp->hwqp.sq_tail = 1;*/
-
-	build_rdma0_msg(cm_node, &nesqp);
-
-	nes_write32(nesdev->regs + NES_WQE_ALLOC,
-		    (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id);
-
-	memset(&nes_quad, 0, sizeof(nes_quad));
-
-	nes_quad.DstIpAdrIndex =
-		cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24);
-	nes_quad.SrcIpadr = htonl(cm_node->rem_addr);
-	nes_quad.TcpPorts[0] = htons(cm_node->rem_port);
-	nes_quad.TcpPorts[1] = htons(cm_node->loc_port);
-
-	/* Produce hash key */
-	crc_value = get_crc_value(&nes_quad);
-	nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff);
-	nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, After CRC = 0x%08X\n",
-		  nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask);
-
-	nesqp->hte_index &= nesadapter->hte_index_mask;
-	nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index);
-
-	nesqp->ietf_frame = &cm_node->mpa_frame;
-	nesqp->private_data_len = (u8)cm_node->mpa_frame_size;
-	cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node);
-
-	/* notify OF layer we successfully created the requested connection */
-	cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
-	cm_event.status = 0;
-	cm_event.provider_data = cm_id->provider_data;
-	cm_event_laddr->sin_family = AF_INET;
-	cm_event_laddr->sin_port = laddr->sin_port;
-	cm_event.remote_addr = cm_id->m_remote_addr;
-
-	cm_event.private_data = (void *)event->cm_node->mpa_frame_buf;
-	cm_event.private_data_len = (u8)event->cm_node->mpa_frame_size;
-	cm_event.ird = cm_node->ird_size;
-	cm_event.ord = cm_node->ord_size;
-
-	cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr);
-	ret = cm_id->event_handler(cm_id, &cm_event);
-	nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
-
-	if (ret)
-		printk(KERN_ERR "%s[%u] OFA CM event_handler returned, "
-		       "ret=%d\n", __func__, __LINE__, ret);
-	attr.qp_state = IB_QPS_RTS;
-	nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL);
-
-	nes_debug(NES_DBG_CM, "Exiting connect thread for QP%u. jiffies = "
-		  "%lu\n", nesqp->hwqp.qp_id, jiffies);
-
-	return;
-}
-
-
-/**
- * cm_event_connect_error
- */
-static void cm_event_connect_error(struct nes_cm_event *event)
-{
-	struct nes_qp *nesqp;
-	struct iw_cm_id *cm_id;
-	struct iw_cm_event cm_event;
-	/* struct nes_cm_info cm_info; */
-	int ret;
-
-	if (!event->cm_node)
-		return;
-
-	cm_id = event->cm_node->cm_id;
-	if (!cm_id)
-		return;
-
-	nes_debug(NES_DBG_CM, "cm_node=%p, cm_id=%p\n", event->cm_node, cm_id);
-	nesqp = cm_id->provider_data;
-
-	if (!nesqp)
-		return;
-
-	/* notify OF layer about this connection error event */
-	/* cm_id->rem_ref(cm_id); */
-	nesqp->cm_id = NULL;
-	cm_id->provider_data = NULL;
-	cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
-	cm_event.status = -ECONNRESET;
-	cm_event.provider_data = cm_id->provider_data;
-	cm_event.local_addr = cm_id->m_local_addr;
-	cm_event.remote_addr = cm_id->m_remote_addr;
-	cm_event.private_data = NULL;
-	cm_event.private_data_len = 0;
-
-#ifdef CONFIG_INFINIBAND_NES_DEBUG
-	{
-		struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *)
-						     &cm_event.local_addr;
-		struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *)
-						     &cm_event.remote_addr;
-		nes_debug(NES_DBG_CM, "call CM_EVENT REJECTED, local_addr=%08x, remote_addr=%08x\n",
-			  cm_event_laddr->sin_addr.s_addr, cm_event_raddr->sin_addr.s_addr);
-	}
-#endif
-
-	ret = cm_id->event_handler(cm_id, &cm_event);
-	nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
-	if (ret)
-		printk(KERN_ERR "%s[%u] OFA CM event_handler returned, "
-		       "ret=%d\n", __func__, __LINE__, ret);
-	cm_id->rem_ref(cm_id);
-
-	rem_ref_cm_node(event->cm_node->cm_core, event->cm_node);
-	return;
-}
-
-
-/**
- * cm_event_reset
- */
-static void cm_event_reset(struct nes_cm_event *event)
-{
-	struct nes_qp *nesqp;
-	struct iw_cm_id *cm_id;
-	struct iw_cm_event cm_event;
-	/* struct nes_cm_info cm_info; */
-	int ret;
-
-	if (!event->cm_node)
-		return;
-
-	if (!event->cm_node->cm_id)
-		return;
-
-	cm_id = event->cm_node->cm_id;
-
-	nes_debug(NES_DBG_CM, "%p - cm_id = %p\n", event->cm_node, cm_id);
-	nesqp = cm_id->provider_data;
-	if (!nesqp)
-		return;
-
-	nesqp->cm_id = NULL;
-	/* cm_id->provider_data = NULL; */
-	cm_event.event = IW_CM_EVENT_DISCONNECT;
-	cm_event.status = -ECONNRESET;
-	cm_event.provider_data = cm_id->provider_data;
-	cm_event.local_addr = cm_id->m_local_addr;
-	cm_event.remote_addr = cm_id->m_remote_addr;
-	cm_event.private_data = NULL;
-	cm_event.private_data_len = 0;
-
-	cm_id->add_ref(cm_id);
-	ret = cm_id->event_handler(cm_id, &cm_event);
-	atomic_inc(&cm_closes);
-	cm_event.event = IW_CM_EVENT_CLOSE;
-	cm_event.status = 0;
-	cm_event.provider_data = cm_id->provider_data;
-	cm_event.local_addr = cm_id->m_local_addr;
-	cm_event.remote_addr = cm_id->m_remote_addr;
-	cm_event.private_data = NULL;
-	cm_event.private_data_len = 0;
-	nes_debug(NES_DBG_CM, "NODE %p Generating CLOSE\n", event->cm_node);
-	ret = cm_id->event_handler(cm_id, &cm_event);
-
-	nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
-
-
-	/* notify OF layer about this connection error event */
-	cm_id->rem_ref(cm_id);
-
-	return;
-}
-
-
-/**
- * cm_event_mpa_req
- */
-static void cm_event_mpa_req(struct nes_cm_event *event)
-{
-	struct iw_cm_id *cm_id;
-	struct iw_cm_event cm_event;
-	int ret;
-	struct nes_cm_node *cm_node;
-	struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *)
-					     &cm_event.local_addr;
-	struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *)
-					     &cm_event.remote_addr;
-
-	cm_node = event->cm_node;
-	if (!cm_node)
-		return;
-	cm_id = cm_node->cm_id;
-
-	atomic_inc(&cm_connect_reqs);
-	nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n",
-		  cm_node, cm_id, jiffies);
-
-	cm_event.event = IW_CM_EVENT_CONNECT_REQUEST;
-	cm_event.status = 0;
-	cm_event.provider_data = (void *)cm_node;
-
-	cm_event_laddr->sin_family = AF_INET;
-	cm_event_laddr->sin_port = htons(event->cm_info.loc_port);
-	cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr);
-
-	cm_event_raddr->sin_family = AF_INET;
-	cm_event_raddr->sin_port = htons(event->cm_info.rem_port);
-	cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr);
-	cm_event.private_data = cm_node->mpa_frame_buf;
-	cm_event.private_data_len = (u8)cm_node->mpa_frame_size;
-	if (cm_node->mpa_frame_rev == IETF_MPA_V1) {
-		cm_event.ird = NES_MAX_IRD;
-		cm_event.ord = NES_MAX_ORD;
-	} else {
-	cm_event.ird = cm_node->ird_size;
-	cm_event.ord = cm_node->ord_size;
-	}
-
-	ret = cm_id->event_handler(cm_id, &cm_event);
-	if (ret)
-		printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n",
-		       __func__, __LINE__, ret);
-	return;
-}
-
-
-static void cm_event_mpa_reject(struct nes_cm_event *event)
-{
-	struct iw_cm_id *cm_id;
-	struct iw_cm_event cm_event;
-	struct nes_cm_node *cm_node;
-	int ret;
-	struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *)
-					     &cm_event.local_addr;
-	struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *)
-					     &cm_event.remote_addr;
-
-	cm_node = event->cm_node;
-	if (!cm_node)
-		return;
-	cm_id = cm_node->cm_id;
-
-	atomic_inc(&cm_connect_reqs);
-	nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n",
-		  cm_node, cm_id, jiffies);
-
-	cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
-	cm_event.status = -ECONNREFUSED;
-	cm_event.provider_data = cm_id->provider_data;
-
-	cm_event_laddr->sin_family = AF_INET;
-	cm_event_laddr->sin_port = htons(event->cm_info.loc_port);
-	cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr);
-
-	cm_event_raddr->sin_family = AF_INET;
-	cm_event_raddr->sin_port = htons(event->cm_info.rem_port);
-	cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr);
-
-	cm_event.private_data = cm_node->mpa_frame_buf;
-	cm_event.private_data_len = (u8)cm_node->mpa_frame_size;
-
-	nes_debug(NES_DBG_CM, "call CM_EVENT_MPA_REJECTED, local_addr=%08x, "
-		  "remove_addr=%08x\n",
-		  cm_event_laddr->sin_addr.s_addr,
-		  cm_event_raddr->sin_addr.s_addr);
-
-	ret = cm_id->event_handler(cm_id, &cm_event);
-	if (ret)
-		printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n",
-		       __func__, __LINE__, ret);
-
-	return;
-}
-
-
-static void nes_cm_event_handler(struct work_struct *);
-
-/**
- * nes_cm_post_event
- * post an event to the cm event handler
- */
-static int nes_cm_post_event(struct nes_cm_event *event)
-{
-	atomic_inc(&event->cm_node->cm_core->events_posted);
-	add_ref_cm_node(event->cm_node);
-	event->cm_info.cm_id->add_ref(event->cm_info.cm_id);
-	INIT_WORK(&event->event_work, nes_cm_event_handler);
-	nes_debug(NES_DBG_CM, "cm_node=%p queue_work, event=%p\n",
-		  event->cm_node, event);
-
-	queue_work(event->cm_node->cm_core->event_wq, &event->event_work);
-
-	nes_debug(NES_DBG_CM, "Exit\n");
-	return 0;
-}
-
-
-/**
- * nes_cm_event_handler
- * worker function to handle cm events
- * will free instance of nes_cm_event
- */
-static void nes_cm_event_handler(struct work_struct *work)
-{
-	struct nes_cm_event *event = container_of(work, struct nes_cm_event,
-						  event_work);
-	struct nes_cm_core *cm_core;
-
-	if ((!event) || (!event->cm_node) || (!event->cm_node->cm_core))
-		return;
-
-	cm_core = event->cm_node->cm_core;
-	nes_debug(NES_DBG_CM, "event=%p, event->type=%u, events posted=%u\n",
-		  event, event->type, atomic_read(&cm_core->events_posted));
-
-	switch (event->type) {
-	case NES_CM_EVENT_MPA_REQ:
-		cm_event_mpa_req(event);
-		nes_debug(NES_DBG_CM, "cm_node=%p CM Event: MPA REQUEST\n",
-			  event->cm_node);
-		break;
-	case NES_CM_EVENT_RESET:
-		nes_debug(NES_DBG_CM, "cm_node = %p CM Event: RESET\n",
-			  event->cm_node);
-		cm_event_reset(event);
-		break;
-	case NES_CM_EVENT_CONNECTED:
-		if ((!event->cm_node->cm_id) ||
-		    (event->cm_node->state != NES_CM_STATE_TSA))
-			break;
-		cm_event_connected(event);
-		nes_debug(NES_DBG_CM, "CM Event: CONNECTED\n");
-		break;
-	case NES_CM_EVENT_MPA_REJECT:
-		if ((!event->cm_node->cm_id) ||
-		    (event->cm_node->state == NES_CM_STATE_TSA))
-			break;
-		cm_event_mpa_reject(event);
-		nes_debug(NES_DBG_CM, "CM Event: REJECT\n");
-		break;
-
-	case NES_CM_EVENT_ABORTED:
-		if ((!event->cm_node->cm_id) ||
-		    (event->cm_node->state == NES_CM_STATE_TSA))
-			break;
-		cm_event_connect_error(event);
-		nes_debug(NES_DBG_CM, "CM Event: ABORTED\n");
-		break;
-	case NES_CM_EVENT_DROPPED_PKT:
-		nes_debug(NES_DBG_CM, "CM Event: DROPPED PKT\n");
-		break;
-	default:
-		nes_debug(NES_DBG_CM, "CM Event: UNKNOWN EVENT TYPE\n");
-		break;
-	}
-
-	atomic_dec(&cm_core->events_posted);
-	event->cm_info.cm_id->rem_ref(event->cm_info.cm_id);
-	rem_ref_cm_node(cm_core, event->cm_node);
-	kfree(event);
-
-	return;
-}
diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h
deleted file mode 100644
index b9cc02b..0000000
--- a/drivers/infiniband/hw/nes/nes_cm.h
+++ /dev/null
@@ -1,470 +0,0 @@
-/*
- * Copyright (c) 2006 - 2014 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#ifndef NES_CM_H
-#define NES_CM_H
-
-#define QUEUE_EVENTS
-
-#define NES_MANAGE_APBVT_DEL 0
-#define NES_MANAGE_APBVT_ADD 1
-
-#define NES_MPA_REQUEST_ACCEPT  1
-#define NES_MPA_REQUEST_REJECT  2
-
-/* IETF MPA -- defines, enums, structs */
-#define IEFT_MPA_KEY_REQ  "MPA ID Req Frame"
-#define IEFT_MPA_KEY_REP  "MPA ID Rep Frame"
-#define IETF_MPA_KEY_SIZE 16
-#define IETF_MPA_VERSION  1
-#define IETF_MAX_PRIV_DATA_LEN 512
-#define IETF_MPA_FRAME_SIZE    20
-#define IETF_RTR_MSG_SIZE      4
-#define IETF_MPA_V2_FLAG       0x10
-
-/* IETF RTR MSG Fields               */
-#define IETF_PEER_TO_PEER       0x8000
-#define IETF_FLPDU_ZERO_LEN     0x4000
-#define IETF_RDMA0_WRITE        0x8000
-#define IETF_RDMA0_READ         0x4000
-#define IETF_NO_IRD_ORD         0x3FFF
-#define NES_MAX_IRD		 0x40
-#define NES_MAX_ORD		 0x7F
-
-enum ietf_mpa_flags {
-	IETF_MPA_FLAGS_MARKERS = 0x80,	/* receive Markers */
-	IETF_MPA_FLAGS_CRC     = 0x40,	/* receive Markers */
-	IETF_MPA_FLAGS_REJECT  = 0x20,	/* Reject */
-};
-
-struct ietf_mpa_v1 {
-	u8 key[IETF_MPA_KEY_SIZE];
-	u8 flags;
-	u8 rev;
-	__be16 priv_data_len;
-	u8 priv_data[0];
-};
-
-#define ietf_mpa_req_resp_frame ietf_mpa_frame
-
-struct ietf_rtr_msg {
-	__be16 ctrl_ird;
-	__be16 ctrl_ord;
-};
-
-struct ietf_mpa_v2 {
-	u8 key[IETF_MPA_KEY_SIZE];
-	u8 flags;
-	u8 rev;
-	 __be16 priv_data_len;
-	struct ietf_rtr_msg rtr_msg;
-	u8 priv_data[0];
-};
-
-struct nes_v4_quad {
-	u32 rsvd0;
-	__le32 DstIpAdrIndex;	/* Only most significant 5 bits are valid */
-	__be32 SrcIpadr;
-	__be16 TcpPorts[2];		/* src is low, dest is high */
-};
-
-struct nes_cm_node;
-enum nes_timer_type {
-	NES_TIMER_TYPE_SEND,
-	NES_TIMER_TYPE_RECV,
-	NES_TIMER_NODE_CLEANUP,
-	NES_TIMER_TYPE_CLOSE,
-};
-
-#define NES_PASSIVE_STATE_INDICATED	0
-#define NES_DO_NOT_SEND_RESET_EVENT	1
-#define NES_SEND_RESET_EVENT		2
-
-#define MAX_NES_IFS 4
-
-#define SET_ACK 1
-#define SET_SYN 2
-#define SET_FIN 4
-#define SET_RST 8
-
-#define TCP_OPTIONS_PADDING	3
-
-struct option_base {
-	u8 optionnum;
-	u8 length;
-};
-
-enum option_numbers {
-	OPTION_NUMBER_END,
-	OPTION_NUMBER_NONE,
-	OPTION_NUMBER_MSS,
-	OPTION_NUMBER_WINDOW_SCALE,
-	OPTION_NUMBER_SACK_PERM,
-	OPTION_NUMBER_SACK,
-	OPTION_NUMBER_WRITE0 = 0xbc
-};
-
-struct option_mss {
-	u8 optionnum;
-	u8 length;
-	__be16 mss;
-};
-
-struct option_windowscale {
-	u8 optionnum;
-	u8 length;
-	u8 shiftcount;
-};
-
-union all_known_options {
-	char as_end;
-	struct option_base as_base;
-	struct option_mss as_mss;
-	struct option_windowscale as_windowscale;
-};
-
-struct nes_timer_entry {
-	struct list_head list;
-	unsigned long timetosend;	/* jiffies */
-	struct sk_buff *skb;
-	u32 type;
-	u32 retrycount;
-	u32 retranscount;
-	u32 context;
-	u32 seq_num;
-	u32 send_retrans;
-	int close_when_complete;
-	struct net_device *netdev;
-};
-
-#define NES_DEFAULT_RETRYS  64
-#define NES_DEFAULT_RETRANS 8
-#ifdef CONFIG_INFINIBAND_NES_DEBUG
-#define NES_RETRY_TIMEOUT   (1000*HZ/1000)
-#else
-#define NES_RETRY_TIMEOUT   (3000*HZ/1000)
-#endif
-#define NES_SHORT_TIME      (10)
-#define NES_LONG_TIME       (2000*HZ/1000)
-#define NES_MAX_TIMEOUT     ((unsigned long) (12*HZ))
-
-#define NES_CM_HASHTABLE_SIZE         1024
-#define NES_CM_TCP_TIMER_INTERVAL     3000
-#define NES_CM_DEFAULT_MTU            1540
-#define NES_CM_DEFAULT_FRAME_CNT      10
-#define NES_CM_THREAD_STACK_SIZE      256
-#define NES_CM_DEFAULT_RCV_WND        64240	// before we know that window scaling is allowed
-#define NES_CM_DEFAULT_RCV_WND_SCALED 256960  // after we know that window scaling is allowed
-#define NES_CM_DEFAULT_RCV_WND_SCALE  2
-#define NES_CM_DEFAULT_FREE_PKTS      0x000A
-#define NES_CM_FREE_PKT_LO_WATERMARK  2
-
-#define NES_CM_DEFAULT_MSS   536
-
-#define NES_CM_DEF_SEQ       0x159bf75f
-#define NES_CM_DEF_LOCAL_ID  0x3b47
-
-#define NES_CM_DEF_SEQ2      0x18ed5740
-#define NES_CM_DEF_LOCAL_ID2 0xb807
-#define	MAX_CM_BUFFER	(IETF_MPA_FRAME_SIZE + IETF_RTR_MSG_SIZE + IETF_MAX_PRIV_DATA_LEN)
-
-typedef u32 nes_addr_t;
-
-#define nes_cm_tsa_context nes_qp_context
-
-struct nes_qp;
-
-/* cm node transition states */
-enum nes_cm_node_state {
-	NES_CM_STATE_UNKNOWN,
-	NES_CM_STATE_INITED,
-	NES_CM_STATE_LISTENING,
-	NES_CM_STATE_SYN_RCVD,
-	NES_CM_STATE_SYN_SENT,
-	NES_CM_STATE_ONE_SIDE_ESTABLISHED,
-	NES_CM_STATE_ESTABLISHED,
-	NES_CM_STATE_ACCEPTING,
-	NES_CM_STATE_MPAREQ_SENT,
-	NES_CM_STATE_MPAREQ_RCVD,
-	NES_CM_STATE_MPAREJ_RCVD,
-	NES_CM_STATE_TSA,
-	NES_CM_STATE_FIN_WAIT1,
-	NES_CM_STATE_FIN_WAIT2,
-	NES_CM_STATE_CLOSE_WAIT,
-	NES_CM_STATE_TIME_WAIT,
-	NES_CM_STATE_LAST_ACK,
-	NES_CM_STATE_CLOSING,
-	NES_CM_STATE_LISTENER_DESTROYED,
-	NES_CM_STATE_CLOSED
-};
-
-enum mpa_frame_version {
-	IETF_MPA_V1 = 1,
-	IETF_MPA_V2 = 2
-};
-
-enum mpa_frame_key {
-	MPA_KEY_REQUEST,
-	MPA_KEY_REPLY
-};
-
-enum send_rdma0 {
-	SEND_RDMA_READ_ZERO = 1,
-	SEND_RDMA_WRITE_ZERO = 2
-};
-
-enum nes_tcpip_pkt_type {
-	NES_PKT_TYPE_UNKNOWN,
-	NES_PKT_TYPE_SYN,
-	NES_PKT_TYPE_SYNACK,
-	NES_PKT_TYPE_ACK,
-	NES_PKT_TYPE_FIN,
-	NES_PKT_TYPE_RST
-};
-
-
-/* type of nes connection */
-enum nes_cm_conn_type {
-	NES_CM_IWARP_CONN_TYPE,
-};
-
-/* CM context params */
-struct nes_cm_tcp_context {
-	u8  client;
-
-	u32 loc_seq_num;
-	u32 loc_ack_num;
-	u32 rem_ack_num;
-	u32 rcv_nxt;
-
-	u32 loc_id;
-	u32 rem_id;
-
-	u32 snd_wnd;
-	u32 max_snd_wnd;
-
-	u32 rcv_wnd;
-	u32 mss;
-	u8  snd_wscale;
-	u8  rcv_wscale;
-
-	struct nes_cm_tsa_context tsa_cntxt;
-};
-
-
-enum nes_cm_listener_state {
-	NES_CM_LISTENER_PASSIVE_STATE = 1,
-	NES_CM_LISTENER_ACTIVE_STATE = 2,
-	NES_CM_LISTENER_EITHER_STATE = 3
-};
-
-struct nes_cm_listener {
-	struct list_head           list;
-	struct nes_cm_core         *cm_core;
-	u8                         loc_mac[ETH_ALEN];
-	nes_addr_t                 loc_addr;
-	u16                        loc_port;
-	struct iw_cm_id            *cm_id;
-	enum nes_cm_conn_type      conn_type;
-	atomic_t                   ref_count;
-	struct nes_vnic            *nesvnic;
-	atomic_t                   pend_accepts_cnt;
-	int                        backlog;
-	enum nes_cm_listener_state listener_state;
-	u32                        reused_node;
-	u8			   tos;
-};
-
-/* per connection node and node state information */
-struct nes_cm_node {
-	nes_addr_t                loc_addr, rem_addr;
-	u16                       loc_port, rem_port;
-
-	u8                        loc_mac[ETH_ALEN];
-	u8                        rem_mac[ETH_ALEN];
-
-	enum nes_cm_node_state    state;
-	struct nes_cm_tcp_context tcp_cntxt;
-	struct nes_cm_core        *cm_core;
-	struct sk_buff_head       resend_list;
-	atomic_t                  ref_count;
-	struct net_device         *netdev;
-
-	struct nes_cm_node        *loopbackpartner;
-
-	struct nes_timer_entry	  *send_entry;
-	struct nes_timer_entry    *recv_entry;
-	spinlock_t                retrans_list_lock;
-	enum send_rdma0           send_rdma0_op;
-
-	union {
-		struct ietf_mpa_v1 mpa_frame;
-		struct ietf_mpa_v2 mpa_v2_frame;
-		u8                 mpa_frame_buf[MAX_CM_BUFFER];
-	};
-	enum mpa_frame_version    mpa_frame_rev;
-	u16			  ird_size;
-	u16                       ord_size;
-	u16			  mpav2_ird_ord;
-
-	u16                       mpa_frame_size;
-	struct iw_cm_id           *cm_id;
-	struct list_head          list;
-	bool                      accelerated;
-	struct nes_cm_listener    *listener;
-	enum nes_cm_conn_type     conn_type;
-	struct nes_vnic           *nesvnic;
-	int                       apbvt_set;
-	int                       accept_pend;
-	struct list_head	timer_entry;
-	struct list_head	reset_entry;
-	struct nes_qp		*nesqp;
-	atomic_t 		passive_state;
-	u8			tos;
-};
-
-/* structure for client or CM to fill when making CM api calls. */
-/*	- only need to set relevant data, based on op. */
-struct nes_cm_info {
-	union {
-		struct iw_cm_id   *cm_id;
-		struct net_device *netdev;
-	};
-
-	u16 loc_port;
-	u16 rem_port;
-	nes_addr_t loc_addr;
-	nes_addr_t rem_addr;
-	enum nes_cm_conn_type  conn_type;
-	int backlog;
-};
-
-/* CM event codes */
-enum  nes_cm_event_type {
-	NES_CM_EVENT_UNKNOWN,
-	NES_CM_EVENT_ESTABLISHED,
-	NES_CM_EVENT_MPA_REQ,
-	NES_CM_EVENT_MPA_CONNECT,
-	NES_CM_EVENT_MPA_ACCEPT,
-	NES_CM_EVENT_MPA_REJECT,
-	NES_CM_EVENT_MPA_ESTABLISHED,
-	NES_CM_EVENT_CONNECTED,
-	NES_CM_EVENT_CLOSED,
-	NES_CM_EVENT_RESET,
-	NES_CM_EVENT_DROPPED_PKT,
-	NES_CM_EVENT_CLOSE_IMMED,
-	NES_CM_EVENT_CLOSE_HARD,
-	NES_CM_EVENT_CLOSE_CLEAN,
-	NES_CM_EVENT_ABORTED,
-	NES_CM_EVENT_SEND_FIRST
-};
-
-/* event to post to CM event handler */
-struct nes_cm_event {
-	enum nes_cm_event_type type;
-
-	struct nes_cm_info cm_info;
-	struct work_struct event_work;
-	struct nes_cm_node *cm_node;
-};
-
-struct nes_cm_core {
-	enum nes_cm_node_state  state;
-
-	atomic_t                listen_node_cnt;
-	struct nes_cm_node      listen_list;
-	spinlock_t              listen_list_lock;
-
-	u32                     mtu;
-	u32                     free_tx_pkt_max;
-	u32                     rx_pkt_posted;
-	atomic_t                ht_node_cnt;
-	struct list_head        connected_nodes;
-	/* struct list_head hashtable[NES_CM_HASHTABLE_SIZE]; */
-	spinlock_t              ht_lock;
-
-	struct timer_list       tcp_timer;
-
-	const struct nes_cm_ops *api;
-
-	int (*post_event)(struct nes_cm_event *event);
-	atomic_t                events_posted;
-	struct workqueue_struct *event_wq;
-	struct workqueue_struct *disconn_wq;
-
-	atomic_t                node_cnt;
-	u64                     aborted_connects;
-	u32                     options;
-
-	struct nes_cm_node      *current_listen_node;
-};
-
-
-#define NES_CM_SET_PKT_SIZE        (1 << 1)
-#define NES_CM_SET_FREE_PKT_Q_SIZE (1 << 2)
-
-/* CM ops/API for client interface */
-struct nes_cm_ops {
-	int (*accelerated)(struct nes_cm_core *, struct nes_cm_node *);
-	struct nes_cm_listener * (*listen)(struct nes_cm_core *, struct nes_vnic *,
-			struct nes_cm_info *);
-	int (*stop_listener)(struct nes_cm_core *, struct nes_cm_listener *);
-	struct nes_cm_node * (*connect)(struct nes_cm_core *,
-			struct nes_vnic *, u16, void *,
-			struct nes_cm_info *);
-	int (*close)(struct nes_cm_core *, struct nes_cm_node *);
-	int (*accept)(struct nes_cm_core *, struct nes_cm_node *);
-	int (*reject)(struct nes_cm_core *, struct nes_cm_node *);
-	int (*recv_pkt)(struct nes_cm_core *, struct nes_vnic *,
-			struct sk_buff *);
-	int (*destroy_cm_core)(struct nes_cm_core *);
-	int (*get)(struct nes_cm_core *);
-	int (*set)(struct nes_cm_core *, u32, u32);
-};
-
-int schedule_nes_timer(struct nes_cm_node *, struct sk_buff *,
-		enum nes_timer_type, int, int);
-
-int nes_accept(struct iw_cm_id *, struct iw_cm_conn_param *);
-int nes_reject(struct iw_cm_id *, const void *, u8);
-int nes_connect(struct iw_cm_id *, struct iw_cm_conn_param *);
-int nes_create_listen(struct iw_cm_id *, int);
-int nes_destroy_listen(struct iw_cm_id *);
-
-int nes_cm_recv(struct sk_buff *, struct net_device *);
-int nes_cm_start(void);
-int nes_cm_stop(void);
-int nes_add_ref_cm_node(struct nes_cm_node *cm_node);
-int nes_rem_ref_cm_node(struct nes_cm_node *cm_node);
-
-#endif			/* NES_CM_H */
diff --git a/drivers/infiniband/hw/nes/nes_context.h b/drivers/infiniband/hw/nes/nes_context.h
deleted file mode 100644
index a69eef1..0000000
--- a/drivers/infiniband/hw/nes/nes_context.h
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef NES_CONTEXT_H
-#define NES_CONTEXT_H
-
-struct nes_qp_context {
-	__le32   misc;
-	__le32   cqs;
-	__le32   sq_addr_low;
-	__le32   sq_addr_high;
-	__le32   rq_addr_low;
-	__le32   rq_addr_high;
-	__le32   misc2;
-	__le16   tcpPorts[2];
-	__le32   ip0;
-	__le32   ip1;
-	__le32   ip2;
-	__le32   ip3;
-	__le32   mss;
-	__le32   arp_index_vlan;
-	__le32   tcp_state_flow_label;
-	__le32   pd_index_wscale;
-	__le32   keepalive;
-	u32   ts_recent;
-	u32   ts_age;
-	__le32   snd_nxt;
-	__le32   snd_wnd;
-	__le32   rcv_nxt;
-	__le32   rcv_wnd;
-	__le32   snd_max;
-	__le32   snd_una;
-	u32   srtt;
-	__le32   rttvar;
-	__le32   ssthresh;
-	__le32   cwnd;
-	__le32   snd_wl1;
-	__le32   snd_wl2;
-	__le32   max_snd_wnd;
-	__le32   ts_val_delta;
-	u32   retransmit;
-	u32   probe_cnt;
-	u32   hte_index;
-	__le32   q2_addr_low;
-	__le32   q2_addr_high;
-	__le32   ird_index;
-	u32   Rsvd3;
-	__le32   ird_ord_sizes;
-	u32   mrkr_offset;
-	__le32   aeq_token_low;
-	__le32   aeq_token_high;
-};
-
-/* QP Context Misc Field */
-
-#define NES_QPCONTEXT_MISC_IWARP_VER_MASK    0x00000003
-#define NES_QPCONTEXT_MISC_IWARP_VER_SHIFT   0
-#define NES_QPCONTEXT_MISC_EFB_SIZE_MASK     0x000000C0
-#define NES_QPCONTEXT_MISC_EFB_SIZE_SHIFT    6
-#define NES_QPCONTEXT_MISC_RQ_SIZE_MASK      0x00000300
-#define NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT     8
-#define NES_QPCONTEXT_MISC_SQ_SIZE_MASK      0x00000c00
-#define NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT     10
-#define NES_QPCONTEXT_MISC_PCI_FCN_MASK      0x00007000
-#define NES_QPCONTEXT_MISC_PCI_FCN_SHIFT     12
-#define NES_QPCONTEXT_MISC_DUP_ACKS_MASK     0x00070000
-#define NES_QPCONTEXT_MISC_DUP_ACKS_SHIFT    16
-
-enum nes_qp_context_misc_bits {
-	NES_QPCONTEXT_MISC_RX_WQE_SIZE         = 0x00000004,
-	NES_QPCONTEXT_MISC_IPV4                = 0x00000008,
-	NES_QPCONTEXT_MISC_DO_NOT_FRAG         = 0x00000010,
-	NES_QPCONTEXT_MISC_INSERT_VLAN         = 0x00000020,
-	NES_QPCONTEXT_MISC_DROS                = 0x00008000,
-	NES_QPCONTEXT_MISC_WSCALE              = 0x00080000,
-	NES_QPCONTEXT_MISC_KEEPALIVE           = 0x00100000,
-	NES_QPCONTEXT_MISC_TIMESTAMP           = 0x00200000,
-	NES_QPCONTEXT_MISC_SACK                = 0x00400000,
-	NES_QPCONTEXT_MISC_RDMA_WRITE_EN       = 0x00800000,
-	NES_QPCONTEXT_MISC_RDMA_READ_EN        = 0x01000000,
-	NES_QPCONTEXT_MISC_WBIND_EN            = 0x10000000,
-	NES_QPCONTEXT_MISC_FAST_REGISTER_EN    = 0x20000000,
-	NES_QPCONTEXT_MISC_PRIV_EN             = 0x40000000,
-	NES_QPCONTEXT_MISC_NO_NAGLE            = 0x80000000
-};
-
-enum nes_qp_acc_wq_sizes {
-	HCONTEXT_TSA_WQ_SIZE_4 = 0,
-	HCONTEXT_TSA_WQ_SIZE_32 = 1,
-	HCONTEXT_TSA_WQ_SIZE_128 = 2,
-	HCONTEXT_TSA_WQ_SIZE_512 = 3
-};
-
-/* QP Context Misc2 Fields */
-#define NES_QPCONTEXT_MISC2_TTL_MASK            0x000000ff
-#define NES_QPCONTEXT_MISC2_TTL_SHIFT           0
-#define NES_QPCONTEXT_MISC2_HOP_LIMIT_MASK      0x000000ff
-#define NES_QPCONTEXT_MISC2_HOP_LIMIT_SHIFT     0
-#define NES_QPCONTEXT_MISC2_LIMIT_MASK          0x00000300
-#define NES_QPCONTEXT_MISC2_LIMIT_SHIFT         8
-#define NES_QPCONTEXT_MISC2_NIC_INDEX_MASK      0x0000fc00
-#define NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT     10
-#define NES_QPCONTEXT_MISC2_SRC_IP_MASK         0x001f0000
-#define NES_QPCONTEXT_MISC2_SRC_IP_SHIFT        16
-#define NES_QPCONTEXT_MISC2_TOS_MASK            0xff000000
-#define NES_QPCONTEXT_MISC2_TOS_SHIFT           24
-#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_MASK  0xff000000
-#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_SHIFT 24
-
-/* QP Context Tcp State/Flow Label Fields */
-#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_MASK   0x000fffff
-#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_SHIFT  0
-#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_MASK    0xf0000000
-#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT   28
-
-enum nes_qp_tcp_state {
-	NES_QPCONTEXT_TCPSTATE_CLOSED = 1,
-	NES_QPCONTEXT_TCPSTATE_EST = 5,
-	NES_QPCONTEXT_TCPSTATE_TIME_WAIT = 11,
-};
-
-/* QP Context PD Index/wscale Fields */
-#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK  0x0000000f
-#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT 0
-#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK  0x00000f00
-#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT 8
-#define NES_QPCONTEXT_PDWSCALE_PDINDEX_MASK     0xffff0000
-#define NES_QPCONTEXT_PDWSCALE_PDINDEX_SHIFT    16
-
-/* QP Context Keepalive Fields */
-#define NES_QPCONTEXT_KEEPALIVE_DELTA_MASK      0x0000ffff
-#define NES_QPCONTEXT_KEEPALIVE_DELTA_SHIFT     0
-#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_MASK  0x00ff0000
-#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_SHIFT 16
-#define NES_QPCONTEXT_KEEPALIVE_INTV_MASK       0xff000000
-#define NES_QPCONTEXT_KEEPALIVE_INTV_SHIFT      24
-
-/* QP Context ORD/IRD Fields */
-#define NES_QPCONTEXT_ORDIRD_ORDSIZE_MASK       0x0000007f
-#define NES_QPCONTEXT_ORDIRD_ORDSIZE_SHIFT      0
-#define NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK       0x00030000
-#define NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT      16
-#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_MASK    0x30000000
-#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT   28
-
-enum nes_ord_ird_bits {
-	NES_QPCONTEXT_ORDIRD_WRPDU                   = 0x02000000,
-	NES_QPCONTEXT_ORDIRD_LSMM_PRESENT            = 0x04000000,
-	NES_QPCONTEXT_ORDIRD_ALSMM                   = 0x08000000,
-	NES_QPCONTEXT_ORDIRD_AAH                     = 0x40000000,
-	NES_QPCONTEXT_ORDIRD_RNMC                    = 0x80000000
-};
-
-enum nes_iwarp_qp_state {
-	NES_QPCONTEXT_IWARP_STATE_NONEXIST  = 0,
-	NES_QPCONTEXT_IWARP_STATE_IDLE      = 1,
-	NES_QPCONTEXT_IWARP_STATE_RTS       = 2,
-	NES_QPCONTEXT_IWARP_STATE_CLOSING   = 3,
-	NES_QPCONTEXT_IWARP_STATE_TERMINATE = 5,
-	NES_QPCONTEXT_IWARP_STATE_ERROR     = 6
-};
-
-
-#endif		/* NES_CONTEXT_H */
diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c
deleted file mode 100644
index bd0675d..0000000
--- a/drivers/infiniband/hw/nes/nes_hw.c
+++ /dev/null
@@ -1,3887 +0,0 @@
-/*
- * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/if_vlan.h>
-#include <linux/slab.h>
-
-#include "nes.h"
-
-static int wide_ppm_offset;
-module_param(wide_ppm_offset, int, 0644);
-MODULE_PARM_DESC(wide_ppm_offset, "Increase CX4 interface clock ppm offset, 0=100ppm (default), 1=300ppm");
-
-static u32 crit_err_count;
-u32 int_mod_timer_init;
-u32 int_mod_cq_depth_256;
-u32 int_mod_cq_depth_128;
-u32 int_mod_cq_depth_32;
-u32 int_mod_cq_depth_24;
-u32 int_mod_cq_depth_16;
-u32 int_mod_cq_depth_4;
-u32 int_mod_cq_depth_1;
-static const u8 nes_max_critical_error_count = 100;
-#include "nes_cm.h"
-
-static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq);
-static void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_count);
-static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count,
-				struct nes_adapter *nesadapter, u8  OneG_Mode);
-static void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq);
-static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq);
-static void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq);
-static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
-				   struct nes_hw_aeqe *aeqe);
-static void process_critical_error(struct nes_device *nesdev);
-static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number);
-static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode);
-static void nes_terminate_start_timer(struct nes_qp *nesqp);
-
-static const char *const nes_iwarp_state_str[] = {
-	"Non-Existent",
-	"Idle",
-	"RTS",
-	"Closing",
-	"RSVD1",
-	"Terminate",
-	"Error",
-	"RSVD2",
-};
-
-static const char *const nes_tcp_state_str[] = {
-	"Non-Existent",
-	"Closed",
-	"Listen",
-	"SYN Sent",
-	"SYN Rcvd",
-	"Established",
-	"Close Wait",
-	"FIN Wait 1",
-	"Closing",
-	"Last Ack",
-	"FIN Wait 2",
-	"Time Wait",
-	"RSVD1",
-	"RSVD2",
-	"RSVD3",
-	"RSVD4",
-};
-
-static inline void print_ip(struct nes_cm_node *cm_node)
-{
-	unsigned char *rem_addr;
-	if (cm_node) {
-		rem_addr = (unsigned char *)&cm_node->rem_addr;
-		printk(KERN_ERR PFX "Remote IP addr: %pI4\n", rem_addr);
-	}
-}
-
-/**
- * nes_nic_init_timer_defaults
- */
-void  nes_nic_init_timer_defaults(struct nes_device *nesdev, u8 jumbomode)
-{
-	unsigned long flags;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
-
-	spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
-
-	shared_timer->timer_in_use_min = NES_NIC_FAST_TIMER_LOW;
-	shared_timer->timer_in_use_max = NES_NIC_FAST_TIMER_HIGH;
-	if (jumbomode) {
-		shared_timer->threshold_low    = DEFAULT_JUMBO_NES_QL_LOW;
-		shared_timer->threshold_target = DEFAULT_JUMBO_NES_QL_TARGET;
-		shared_timer->threshold_high   = DEFAULT_JUMBO_NES_QL_HIGH;
-	} else {
-		shared_timer->threshold_low    = DEFAULT_NES_QL_LOW;
-		shared_timer->threshold_target = DEFAULT_NES_QL_TARGET;
-		shared_timer->threshold_high   = DEFAULT_NES_QL_HIGH;
-	}
-
-	/* todo use netdev->mtu to set thresholds */
-	spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
-}
-
-
-/**
- * nes_nic_init_timer
- */
-static void  nes_nic_init_timer(struct nes_device *nesdev)
-{
-	unsigned long flags;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
-
-	spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
-
-	if (shared_timer->timer_in_use_old == 0) {
-		nesdev->deepcq_count = 0;
-		shared_timer->timer_direction_upward = 0;
-		shared_timer->timer_direction_downward = 0;
-		shared_timer->timer_in_use = NES_NIC_FAST_TIMER;
-		shared_timer->timer_in_use_old = 0;
-
-	}
-	if (shared_timer->timer_in_use != shared_timer->timer_in_use_old) {
-		shared_timer->timer_in_use_old = shared_timer->timer_in_use;
-		nes_write32(nesdev->regs+NES_PERIODIC_CONTROL,
-			0x80000000 | ((u32)(shared_timer->timer_in_use*8)));
-	}
-	/* todo use netdev->mtu to set thresholds */
-	spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
-}
-
-
-/**
- * nes_nic_tune_timer
- */
-static void nes_nic_tune_timer(struct nes_device *nesdev)
-{
-	unsigned long flags;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
-	u16 cq_count = nesdev->currcq_count;
-
-	spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
-
-	if (shared_timer->cq_count_old <= cq_count)
-		shared_timer->cq_direction_downward = 0;
-	else
-		shared_timer->cq_direction_downward++;
-	shared_timer->cq_count_old = cq_count;
-	if (shared_timer->cq_direction_downward > NES_NIC_CQ_DOWNWARD_TREND) {
-		if (cq_count <= shared_timer->threshold_low &&
-		    shared_timer->threshold_low > 4) {
-			shared_timer->threshold_low = shared_timer->threshold_low/2;
-			shared_timer->cq_direction_downward=0;
-			nesdev->currcq_count = 0;
-			spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
-			return;
-		}
-	}
-
-	if (cq_count > 1) {
-		nesdev->deepcq_count += cq_count;
-		if (cq_count <= shared_timer->threshold_low) {       /* increase timer gently */
-			shared_timer->timer_direction_upward++;
-			shared_timer->timer_direction_downward = 0;
-		} else if (cq_count <= shared_timer->threshold_target) { /* balanced */
-			shared_timer->timer_direction_upward = 0;
-			shared_timer->timer_direction_downward = 0;
-		} else if (cq_count <= shared_timer->threshold_high) {  /* decrease timer gently */
-			shared_timer->timer_direction_downward++;
-			shared_timer->timer_direction_upward = 0;
-		} else if (cq_count <= (shared_timer->threshold_high) * 2) {
-			shared_timer->timer_in_use -= 2;
-			shared_timer->timer_direction_upward = 0;
-			shared_timer->timer_direction_downward++;
-		} else {
-			shared_timer->timer_in_use -= 4;
-			shared_timer->timer_direction_upward = 0;
-			shared_timer->timer_direction_downward++;
-		}
-
-		if (shared_timer->timer_direction_upward > 3 ) {  /* using history */
-			shared_timer->timer_in_use += 3;
-			shared_timer->timer_direction_upward = 0;
-			shared_timer->timer_direction_downward = 0;
-		}
-		if (shared_timer->timer_direction_downward > 5) { /* using history */
-			shared_timer->timer_in_use -= 4 ;
-			shared_timer->timer_direction_downward = 0;
-			shared_timer->timer_direction_upward = 0;
-		}
-	}
-
-	/* boundary checking */
-	if (shared_timer->timer_in_use > shared_timer->threshold_high)
-		shared_timer->timer_in_use = shared_timer->threshold_high;
-	else if (shared_timer->timer_in_use < shared_timer->threshold_low)
-		shared_timer->timer_in_use = shared_timer->threshold_low;
-
-	nesdev->currcq_count = 0;
-
-	spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
-}
-
-
-/**
- * nes_init_adapter - initialize adapter
- */
-struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) {
-	struct nes_adapter *nesadapter = NULL;
-	unsigned long num_pds;
-	u32 u32temp;
-	u32 port_count;
-	u16 max_rq_wrs;
-	u16 max_sq_wrs;
-	u32 max_mr;
-	u32 max_256pbl;
-	u32 max_4kpbl;
-	u32 max_qp;
-	u32 max_irrq;
-	u32 max_cq;
-	u32 hte_index_mask;
-	u32 adapter_size;
-	u32 arp_table_size;
-	u16 vendor_id;
-	u16 device_id;
-	u8  OneG_Mode;
-	u8  func_index;
-
-	/* search the list of existing adapters */
-	list_for_each_entry(nesadapter, &nes_adapter_list, list) {
-		nes_debug(NES_DBG_INIT, "Searching Adapter list for PCI devfn = 0x%X,"
-				" adapter PCI slot/bus = %u/%u, pci devices PCI slot/bus = %u/%u, .\n",
-				nesdev->pcidev->devfn,
-				PCI_SLOT(nesadapter->devfn),
-				nesadapter->bus_number,
-				PCI_SLOT(nesdev->pcidev->devfn),
-				nesdev->pcidev->bus->number );
-		if ((PCI_SLOT(nesadapter->devfn) == PCI_SLOT(nesdev->pcidev->devfn)) &&
-				(nesadapter->bus_number == nesdev->pcidev->bus->number)) {
-			nesadapter->ref_count++;
-			return nesadapter;
-		}
-	}
-
-	/* no adapter found */
-	num_pds = pci_resource_len(nesdev->pcidev, BAR_1) >> PAGE_SHIFT;
-	if ((hw_rev != NE020_REV) && (hw_rev != NE020_REV1)) {
-		nes_debug(NES_DBG_INIT, "NE020 driver detected unknown hardware revision 0x%x\n",
-				hw_rev);
-		return NULL;
-	}
-
-	nes_debug(NES_DBG_INIT, "Determine Soft Reset, QP_control=0x%x, CPU0=0x%x, CPU1=0x%x, CPU2=0x%x\n",
-			nes_read_indexed(nesdev, NES_IDX_QP_CONTROL + PCI_FUNC(nesdev->pcidev->devfn) * 8),
-			nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS),
-			nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 4),
-			nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 8));
-
-	nes_debug(NES_DBG_INIT, "Reset and init NE020\n");
-
-
-	if ((port_count = nes_reset_adapter_ne020(nesdev, &OneG_Mode)) == 0)
-		return NULL;
-
-	max_qp = nes_read_indexed(nesdev, NES_IDX_QP_CTX_SIZE);
-	nes_debug(NES_DBG_INIT, "QP_CTX_SIZE=%u\n", max_qp);
-
-	u32temp = nes_read_indexed(nesdev, NES_IDX_QUAD_HASH_TABLE_SIZE);
-	if (max_qp > ((u32)1 << (u32temp & 0x001f))) {
-		nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to hash table size = 0x%08X\n",
-				max_qp, u32temp);
-		max_qp = (u32)1 << (u32temp & 0x001f);
-	}
-
-	hte_index_mask = ((u32)1 << ((u32temp & 0x001f)+1))-1;
-	nes_debug(NES_DBG_INIT, "Max QP = %u, hte_index_mask = 0x%08X.\n",
-			max_qp, hte_index_mask);
-
-	u32temp = nes_read_indexed(nesdev, NES_IDX_IRRQ_COUNT);
-
-	max_irrq = 1 << (u32temp & 0x001f);
-
-	if (max_qp > max_irrq) {
-		max_qp = max_irrq;
-		nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to Available Q1s.\n",
-				max_qp);
-	}
-
-	/* there should be no reason to allocate more pds than qps */
-	if (num_pds > max_qp)
-		num_pds = max_qp;
-
-	u32temp = nes_read_indexed(nesdev, NES_IDX_MRT_SIZE);
-	max_mr = (u32)8192 << (u32temp & 0x7);
-
-	u32temp = nes_read_indexed(nesdev, NES_IDX_PBL_REGION_SIZE);
-	max_256pbl = (u32)1 << (u32temp & 0x0000001f);
-	max_4kpbl = (u32)1 << ((u32temp >> 16) & 0x0000001f);
-	max_cq = nes_read_indexed(nesdev, NES_IDX_CQ_CTX_SIZE);
-
-	u32temp = nes_read_indexed(nesdev, NES_IDX_ARP_CACHE_SIZE);
-	arp_table_size = 1 << u32temp;
-
-	adapter_size = (sizeof(struct nes_adapter) +
-			(sizeof(unsigned long)-1)) & (~(sizeof(unsigned long)-1));
-	adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_qp);
-	adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_mr);
-	adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_cq);
-	adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(num_pds);
-	adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(arp_table_size);
-	adapter_size += sizeof(struct nes_qp **) * max_qp;
-
-	/* allocate a new adapter struct */
-	nesadapter = kzalloc(adapter_size, GFP_KERNEL);
-	if (!nesadapter)
-		return NULL;
-
-	nes_debug(NES_DBG_INIT, "Allocating new nesadapter @ %p, size = %u (actual size = %u).\n",
-			nesadapter, (u32)sizeof(struct nes_adapter), adapter_size);
-
-	if (nes_read_eeprom_values(nesdev, nesadapter)) {
-		printk(KERN_ERR PFX "Unable to read EEPROM data.\n");
-		kfree(nesadapter);
-		return NULL;
-	}
-
-	nesadapter->vendor_id = (((u32) nesadapter->mac_addr_high) << 8) |
-				(nesadapter->mac_addr_low >> 24);
-
-	pci_bus_read_config_word(nesdev->pcidev->bus, nesdev->pcidev->devfn,
-				 PCI_DEVICE_ID, &device_id);
-	nesadapter->vendor_part_id = device_id;
-
-	if (nes_init_serdes(nesdev, hw_rev, port_count, nesadapter,
-							OneG_Mode)) {
-		kfree(nesadapter);
-		return NULL;
-	}
-	nes_init_csr_ne020(nesdev, hw_rev, port_count);
-
-	memset(nesadapter->pft_mcast_map, 255,
-	       sizeof nesadapter->pft_mcast_map);
-
-	/* populate the new nesadapter */
-	nesadapter->nesdev = nesdev;
-	nesadapter->devfn = nesdev->pcidev->devfn;
-	nesadapter->bus_number = nesdev->pcidev->bus->number;
-	nesadapter->ref_count = 1;
-	nesadapter->timer_int_req = 0xffff0000;
-	nesadapter->OneG_Mode = OneG_Mode;
-	nesadapter->doorbell_start = nesdev->doorbell_region;
-
-	/* nesadapter->tick_delta = clk_divisor; */
-	nesadapter->hw_rev = hw_rev;
-	nesadapter->port_count = port_count;
-
-	nesadapter->max_qp = max_qp;
-	nesadapter->hte_index_mask = hte_index_mask;
-	nesadapter->max_irrq = max_irrq;
-	nesadapter->max_mr = max_mr;
-	nesadapter->max_256pbl = max_256pbl - 1;
-	nesadapter->max_4kpbl = max_4kpbl - 1;
-	nesadapter->max_cq = max_cq;
-	nesadapter->free_256pbl = max_256pbl - 1;
-	nesadapter->free_4kpbl = max_4kpbl - 1;
-	nesadapter->max_pd = num_pds;
-	nesadapter->arp_table_size = arp_table_size;
-
-	nesadapter->et_pkt_rate_low = NES_TIMER_ENABLE_LIMIT;
-	if (nes_drv_opt & NES_DRV_OPT_DISABLE_INT_MOD) {
-		nesadapter->et_use_adaptive_rx_coalesce = 0;
-		nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT;
-		nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval;
-	} else {
-		nesadapter->et_use_adaptive_rx_coalesce = 1;
-		nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC;
-		nesadapter->et_rx_coalesce_usecs_irq = 0;
-		printk(PFX "%s: Using Adaptive Interrupt Moderation\n", __func__);
-	}
-	/* Setup and enable the periodic timer */
-	if (nesadapter->et_rx_coalesce_usecs_irq)
-		nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x80000000 |
-				((u32)(nesadapter->et_rx_coalesce_usecs_irq * 8)));
-	else
-		nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x00000000);
-
-	nesadapter->base_pd = 1;
-
-	nesadapter->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY |
-				       IB_DEVICE_MEM_WINDOW |
-				       IB_DEVICE_MEM_MGT_EXTENSIONS;
-
-	nesadapter->allocated_qps = (unsigned long *)&(((unsigned char *)nesadapter)
-			[(sizeof(struct nes_adapter)+(sizeof(unsigned long)-1))&(~(sizeof(unsigned long)-1))]);
-	nesadapter->allocated_cqs = &nesadapter->allocated_qps[BITS_TO_LONGS(max_qp)];
-	nesadapter->allocated_mrs = &nesadapter->allocated_cqs[BITS_TO_LONGS(max_cq)];
-	nesadapter->allocated_pds = &nesadapter->allocated_mrs[BITS_TO_LONGS(max_mr)];
-	nesadapter->allocated_arps = &nesadapter->allocated_pds[BITS_TO_LONGS(num_pds)];
-	nesadapter->qp_table = (struct nes_qp **)(&nesadapter->allocated_arps[BITS_TO_LONGS(arp_table_size)]);
-
-
-	/* mark the usual suspect QPs, MR and CQs as in use */
-	for (u32temp = 0; u32temp < NES_FIRST_QPN; u32temp++) {
-		set_bit(u32temp, nesadapter->allocated_qps);
-		set_bit(u32temp, nesadapter->allocated_cqs);
-	}
-	set_bit(0, nesadapter->allocated_mrs);
-
-	for (u32temp = 0; u32temp < 20; u32temp++)
-		set_bit(u32temp, nesadapter->allocated_pds);
-	u32temp = nes_read_indexed(nesdev, NES_IDX_QP_MAX_CFG_SIZES);
-
-	max_rq_wrs = ((u32temp >> 8) & 3);
-	switch (max_rq_wrs) {
-		case 0:
-			max_rq_wrs = 4;
-			break;
-		case 1:
-			max_rq_wrs = 16;
-			break;
-		case 2:
-			max_rq_wrs = 32;
-			break;
-		case 3:
-			max_rq_wrs = 512;
-			break;
-	}
-
-	max_sq_wrs = (u32temp & 3);
-	switch (max_sq_wrs) {
-		case 0:
-			max_sq_wrs = 4;
-			break;
-		case 1:
-			max_sq_wrs = 16;
-			break;
-		case 2:
-			max_sq_wrs = 32;
-			break;
-		case 3:
-			max_sq_wrs = 512;
-			break;
-	}
-	nesadapter->max_qp_wr = min(max_rq_wrs, max_sq_wrs);
-	nesadapter->max_irrq_wr = (u32temp >> 16) & 3;
-
-	nesadapter->max_sge = 4;
-	nesadapter->max_cqe = 32766;
-
-	if (nes_read_eeprom_values(nesdev, nesadapter)) {
-		printk(KERN_ERR PFX "Unable to read EEPROM data.\n");
-		kfree(nesadapter);
-		return NULL;
-	}
-
-	u32temp = nes_read_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG);
-	nes_write_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG,
-			(u32temp & 0xff000000) | (nesadapter->tcp_timer_core_clk_divisor & 0x00ffffff));
-
-	/* setup port configuration */
-	if (nesadapter->port_count == 1) {
-		nesadapter->log_port = 0x00000000;
-		if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT)
-			nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000002);
-		else
-			nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003);
-	} else {
-		if (nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) {
-			nesadapter->log_port = 0x000000D8;
-		} else {
-			if (nesadapter->port_count == 2)
-				nesadapter->log_port = 0x00000044;
-			else
-				nesadapter->log_port = 0x000000e4;
-		}
-		nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003);
-	}
-
-	nes_write_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT,
-						nesadapter->log_port);
-	nes_debug(NES_DBG_INIT, "Probe time, LOG2PHY=%u\n",
-			nes_read_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT));
-
-	spin_lock_init(&nesadapter->resource_lock);
-	spin_lock_init(&nesadapter->phy_lock);
-	spin_lock_init(&nesadapter->pbl_lock);
-	spin_lock_init(&nesadapter->periodic_timer_lock);
-
-	INIT_LIST_HEAD(&nesadapter->nesvnic_list[0]);
-	INIT_LIST_HEAD(&nesadapter->nesvnic_list[1]);
-	INIT_LIST_HEAD(&nesadapter->nesvnic_list[2]);
-	INIT_LIST_HEAD(&nesadapter->nesvnic_list[3]);
-
-	if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) {
-		u32 pcs_control_status0, pcs_control_status1;
-		u32 reset_value;
-		u32 i = 0;
-		u32 int_cnt = 0;
-		u32 ext_cnt = 0;
-		unsigned long flags;
-		u32 j = 0;
-
-		pcs_control_status0 = nes_read_indexed(nesdev,
-			NES_IDX_PHY_PCS_CONTROL_STATUS0);
-		pcs_control_status1 = nes_read_indexed(nesdev,
-			NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
-
-		for (i = 0; i < NES_MAX_LINK_CHECK; i++) {
-			pcs_control_status0 = nes_read_indexed(nesdev,
-					NES_IDX_PHY_PCS_CONTROL_STATUS0);
-			pcs_control_status1 = nes_read_indexed(nesdev,
-					NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
-			if ((0x0F000100 == (pcs_control_status0 & 0x0F000100))
-			    || (0x0F000100 == (pcs_control_status1 & 0x0F000100)))
-				int_cnt++;
-			usleep_range(1000, 2000);
-		}
-		if (int_cnt > 1) {
-			spin_lock_irqsave(&nesadapter->phy_lock, flags);
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8);
-			mh_detected++;
-			reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
-			reset_value |= 0x0000003d;
-			nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
-
-			while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
-				& 0x00000040) != 0x00000040) && (j++ < 5000));
-			spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-
-			pcs_control_status0 = nes_read_indexed(nesdev,
-					NES_IDX_PHY_PCS_CONTROL_STATUS0);
-			pcs_control_status1 = nes_read_indexed(nesdev,
-					NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
-
-			for (i = 0; i < NES_MAX_LINK_CHECK; i++) {
-				pcs_control_status0 = nes_read_indexed(nesdev,
-					NES_IDX_PHY_PCS_CONTROL_STATUS0);
-				pcs_control_status1 = nes_read_indexed(nesdev,
-					NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
-				if ((0x0F000100 == (pcs_control_status0 & 0x0F000100))
-					|| (0x0F000100 == (pcs_control_status1 & 0x0F000100))) {
-					if (++ext_cnt > int_cnt) {
-						spin_lock_irqsave(&nesadapter->phy_lock, flags);
-						nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1,
-								0x0000F088);
-						mh_detected++;
-						reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
-						reset_value |= 0x0000003d;
-						nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
-
-						while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
-							& 0x00000040) != 0x00000040) && (j++ < 5000));
-						spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-						break;
-					}
-				}
-				usleep_range(1000, 2000);
-			}
-		}
-	}
-
-	if (nesadapter->hw_rev == NE020_REV) {
-		timer_setup(&nesadapter->mh_timer, nes_mh_fix, 0);
-		nesadapter->mh_timer.expires = jiffies + (HZ/5);  /* 1 second */
-		add_timer(&nesadapter->mh_timer);
-	} else {
-		nes_write32(nesdev->regs+NES_INTF_INT_STAT, 0x0f000000);
-	}
-
-	timer_setup(&nesadapter->lc_timer, nes_clc, 0);
-	nesadapter->lc_timer.expires = jiffies + 3600 * HZ;  /* 1 hour */
-	add_timer(&nesadapter->lc_timer);
-
-	list_add_tail(&nesadapter->list, &nes_adapter_list);
-
-	for (func_index = 0; func_index < 8; func_index++) {
-		pci_bus_read_config_word(nesdev->pcidev->bus,
-					PCI_DEVFN(PCI_SLOT(nesdev->pcidev->devfn),
-					func_index), 0, &vendor_id);
-		if (vendor_id == 0xffff)
-			break;
-	}
-	nes_debug(NES_DBG_INIT, "%s %d functions found for %s.\n", __func__,
-		func_index, pci_name(nesdev->pcidev));
-	nesadapter->adapter_fcn_count = func_index;
-
-	return nesadapter;
-}
-
-
-/**
- * nes_reset_adapter_ne020
- */
-static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode)
-{
-	u32 port_count;
-	u32 u32temp;
-	u32 i;
-
-	u32temp = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
-	port_count = ((u32temp & 0x00000300) >> 8) + 1;
-	/* TODO: assuming that both SERDES are set the same for now */
-	*OneG_Mode = (u32temp & 0x00003c00) ? 0 : 1;
-	nes_debug(NES_DBG_INIT, "Initial Software Reset = 0x%08X, port_count=%u\n",
-			u32temp, port_count);
-	if (*OneG_Mode)
-		nes_debug(NES_DBG_INIT, "Running in 1G mode.\n");
-	u32temp &= 0xff00ffc0;
-	switch (port_count) {
-		case 1:
-			u32temp |= 0x00ee0000;
-			break;
-		case 2:
-			u32temp |= 0x00cc0000;
-			break;
-		case 4:
-			u32temp |= 0x00000000;
-			break;
-		default:
-			return 0;
-			break;
-	}
-
-	/* check and do full reset if needed */
-	if (nes_read_indexed(nesdev, NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))) {
-		nes_debug(NES_DBG_INIT, "Issuing Full Soft reset = 0x%08X\n", u32temp | 0xd);
-		nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd);
-
-		i = 0;
-		while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000)
-			mdelay(1);
-		if (i > 10000) {
-			nes_debug(NES_DBG_INIT, "Did not see full soft reset done.\n");
-			return 0;
-		}
-
-		i = 0;
-		while ((nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS) != 0x80) && i++ < 10000)
-			mdelay(1);
-		if (i > 10000) {
-			printk(KERN_ERR PFX "Internal CPU not ready, status = %02X\n",
-			       nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS));
-			return 0;
-		}
-	}
-
-	/* port reset */
-	switch (port_count) {
-		case 1:
-			u32temp |= 0x00ee0010;
-			break;
-		case 2:
-			u32temp |= 0x00cc0030;
-			break;
-		case 4:
-			u32temp |= 0x00000030;
-			break;
-	}
-
-	nes_debug(NES_DBG_INIT, "Issuing Port Soft reset = 0x%08X\n", u32temp | 0xd);
-	nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd);
-
-	i = 0;
-	while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000)
-		mdelay(1);
-	if (i > 10000) {
-		nes_debug(NES_DBG_INIT, "Did not see port soft reset done.\n");
-		return 0;
-	}
-
-	/* serdes 0 */
-	i = 0;
-	while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0)
-			& 0x0000000f)) != 0x0000000f) && i++ < 5000)
-		mdelay(1);
-	if (i > 5000) {
-		nes_debug(NES_DBG_INIT, "Serdes 0 not ready, status=%x\n", u32temp);
-		return 0;
-	}
-
-	/* serdes 1 */
-	if (port_count > 1) {
-		i = 0;
-		while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1)
-				& 0x0000000f)) != 0x0000000f) && i++ < 5000)
-			mdelay(1);
-		if (i > 5000) {
-			nes_debug(NES_DBG_INIT, "Serdes 1 not ready, status=%x\n", u32temp);
-			return 0;
-		}
-	}
-
-	return port_count;
-}
-
-
-/**
- * nes_init_serdes
- */
-static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count,
-				struct nes_adapter *nesadapter, u8  OneG_Mode)
-{
-	int i;
-	u32 u32temp;
-	u32 sds;
-
-	if (hw_rev != NE020_REV) {
-		/* init serdes 0 */
-		switch (nesadapter->phy_type[0]) {
-		case NES_PHY_TYPE_CX4:
-			if (wide_ppm_offset)
-				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000FFFAA);
-			else
-				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
-			break;
-		case NES_PHY_TYPE_KR:
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000);
-			break;
-		case NES_PHY_TYPE_PUMA_1G:
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
-			sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0);
-			sds |= 0x00000100;
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, sds);
-			break;
-		default:
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
-			break;
-		}
-
-		if (!OneG_Mode)
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0, 0x11110000);
-
-		if (port_count < 2)
-			return 0;
-
-		/* init serdes 1 */
-		if (!(OneG_Mode && (nesadapter->phy_type[1] != NES_PHY_TYPE_PUMA_1G)))
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000FF);
-
-		switch (nesadapter->phy_type[1]) {
-		case NES_PHY_TYPE_ARGUS:
-		case NES_PHY_TYPE_SFP_D:
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000);
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000);
-			break;
-		case NES_PHY_TYPE_CX4:
-			if (wide_ppm_offset)
-				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000FFFAA);
-			break;
-		case NES_PHY_TYPE_KR:
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000);
-			break;
-		case NES_PHY_TYPE_PUMA_1G:
-			sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
-			sds |= 0x000000100;
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds);
-		}
-		if (!OneG_Mode) {
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1, 0x11110000);
-			sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
-			sds &= 0xFFFFFFBF;
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds);
-		}
-	} else {
-		/* init serdes 0 */
-		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008);
-		i = 0;
-		while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0)
-				& 0x0000000f)) != 0x0000000f) && i++ < 5000)
-			mdelay(1);
-		if (i > 5000) {
-			nes_debug(NES_DBG_PHY, "Init: serdes 0 not ready, status=%x\n", u32temp);
-			return 1;
-		}
-		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7);
-		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000);
-		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000);
-		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000);
-		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000);
-		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000);
-		if (OneG_Mode)
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222);
-		else
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222);
-
-		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff);
-		if (port_count > 1) {
-			/* init serdes 1 */
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x00000048);
-			i = 0;
-			while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1)
-				& 0x0000000f)) != 0x0000000f) && (i++ < 5000))
-				mdelay(1);
-			if (i > 5000) {
-				printk("%s: Init: serdes 1 not ready, status=%x\n", __func__, u32temp);
-				/* return 1; */
-			}
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x000bdef7);
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE1, 0x9ce73000);
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE1, 0x0ff00000);
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET1, 0x00000000);
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS1, 0x00000000);
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1, 0x00000000);
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL1, 0xf0002222);
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000ff);
-		}
-	}
-	return 0;
-}
-
-
-/**
- * nes_init_csr_ne020
- * Initialize registers for ne020 hardware
- */
-static void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_count)
-{
-	u32 u32temp;
-
-	nes_debug(NES_DBG_INIT, "port_count=%d\n", port_count);
-
-	nes_write_indexed(nesdev, 0x000001E4, 0x00000007);
-	/* nes_write_indexed(nesdev, 0x000001E8, 0x000208C4); */
-	nes_write_indexed(nesdev, 0x000001E8, 0x00020874);
-	nes_write_indexed(nesdev, 0x000001D8, 0x00048002);
-	/* nes_write_indexed(nesdev, 0x000001D8, 0x0004B002); */
-	nes_write_indexed(nesdev, 0x000001FC, 0x00050005);
-	nes_write_indexed(nesdev, 0x00000600, 0x55555555);
-	nes_write_indexed(nesdev, 0x00000604, 0x55555555);
-
-	/* TODO: move these MAC register settings to NIC bringup */
-	nes_write_indexed(nesdev, 0x00002000, 0x00000001);
-	nes_write_indexed(nesdev, 0x00002004, 0x00000001);
-	nes_write_indexed(nesdev, 0x00002008, 0x0000FFFF);
-	nes_write_indexed(nesdev, 0x0000200C, 0x00000001);
-	nes_write_indexed(nesdev, 0x00002010, 0x000003c1);
-	nes_write_indexed(nesdev, 0x0000201C, 0x75345678);
-	if (port_count > 1) {
-		nes_write_indexed(nesdev, 0x00002200, 0x00000001);
-		nes_write_indexed(nesdev, 0x00002204, 0x00000001);
-		nes_write_indexed(nesdev, 0x00002208, 0x0000FFFF);
-		nes_write_indexed(nesdev, 0x0000220C, 0x00000001);
-		nes_write_indexed(nesdev, 0x00002210, 0x000003c1);
-		nes_write_indexed(nesdev, 0x0000221C, 0x75345678);
-		nes_write_indexed(nesdev, 0x00000908, 0x20000001);
-	}
-	if (port_count > 2) {
-		nes_write_indexed(nesdev, 0x00002400, 0x00000001);
-		nes_write_indexed(nesdev, 0x00002404, 0x00000001);
-		nes_write_indexed(nesdev, 0x00002408, 0x0000FFFF);
-		nes_write_indexed(nesdev, 0x0000240C, 0x00000001);
-		nes_write_indexed(nesdev, 0x00002410, 0x000003c1);
-		nes_write_indexed(nesdev, 0x0000241C, 0x75345678);
-		nes_write_indexed(nesdev, 0x00000910, 0x20000001);
-
-		nes_write_indexed(nesdev, 0x00002600, 0x00000001);
-		nes_write_indexed(nesdev, 0x00002604, 0x00000001);
-		nes_write_indexed(nesdev, 0x00002608, 0x0000FFFF);
-		nes_write_indexed(nesdev, 0x0000260C, 0x00000001);
-		nes_write_indexed(nesdev, 0x00002610, 0x000003c1);
-		nes_write_indexed(nesdev, 0x0000261C, 0x75345678);
-		nes_write_indexed(nesdev, 0x00000918, 0x20000001);
-	}
-
-	nes_write_indexed(nesdev, 0x00005000, 0x00018000);
-	/* nes_write_indexed(nesdev, 0x00005000, 0x00010000); */
-	nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG1, (wqm_quanta << 1) |
-							 0x00000001);
-	nes_write_indexed(nesdev, 0x00005008, 0x1F1F1F1F);
-	nes_write_indexed(nesdev, 0x00005010, 0x1F1F1F1F);
-	nes_write_indexed(nesdev, 0x00005018, 0x1F1F1F1F);
-	nes_write_indexed(nesdev, 0x00005020, 0x1F1F1F1F);
-	nes_write_indexed(nesdev, 0x00006090, 0xFFFFFFFF);
-
-	/* TODO: move this to code, get from EEPROM */
-	nes_write_indexed(nesdev, 0x00000900, 0x20000001);
-	nes_write_indexed(nesdev, 0x000060C0, 0x0000028e);
-	nes_write_indexed(nesdev, 0x000060C8, 0x00000020);
-
-	nes_write_indexed(nesdev, 0x000001EC, 0x7b2625a0);
-	/* nes_write_indexed(nesdev, 0x000001EC, 0x5f2625a0); */
-
-	if (hw_rev != NE020_REV) {
-		u32temp = nes_read_indexed(nesdev, 0x000008e8);
-		u32temp |= 0x80000000;
-		nes_write_indexed(nesdev, 0x000008e8, u32temp);
-		u32temp = nes_read_indexed(nesdev, 0x000021f8);
-		u32temp &= 0x7fffffff;
-		u32temp |= 0x7fff0010;
-		nes_write_indexed(nesdev, 0x000021f8, u32temp);
-		if (port_count > 1) {
-			u32temp = nes_read_indexed(nesdev, 0x000023f8);
-			u32temp &= 0x7fffffff;
-			u32temp |= 0x7fff0010;
-			nes_write_indexed(nesdev, 0x000023f8, u32temp);
-		}
-	}
-}
-
-
-/**
- * nes_destroy_adapter - destroy the adapter structure
- */
-void nes_destroy_adapter(struct nes_adapter *nesadapter)
-{
-	struct nes_adapter *tmp_adapter;
-
-	list_for_each_entry(tmp_adapter, &nes_adapter_list, list) {
-		nes_debug(NES_DBG_SHUTDOWN, "Nes Adapter list entry = 0x%p.\n",
-				tmp_adapter);
-	}
-
-	nesadapter->ref_count--;
-	if (!nesadapter->ref_count) {
-		if (nesadapter->hw_rev == NE020_REV) {
-			del_timer(&nesadapter->mh_timer);
-		}
-		del_timer(&nesadapter->lc_timer);
-
-		list_del(&nesadapter->list);
-		kfree(nesadapter);
-	}
-}
-
-
-/**
- * nes_init_cqp
- */
-int nes_init_cqp(struct nes_device *nesdev)
-{
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct nes_hw_cqp_qp_context *cqp_qp_context;
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	struct nes_hw_ceq *ceq;
-	struct nes_hw_ceq *nic_ceq;
-	struct nes_hw_aeq *aeq;
-	void *vmem;
-	dma_addr_t pmem;
-	u32 count=0;
-	u32 cqp_head;
-	u64 u64temp;
-	u32 u32temp;
-
-	/* allocate CQP memory */
-	/* Need to add max_cq to the aeq size once cq overflow checking is added back */
-	/* SQ is 512 byte aligned, others are 256 byte aligned */
-	nesdev->cqp_mem_size = 512 +
-			(sizeof(struct nes_hw_cqp_wqe) * NES_CQP_SQ_SIZE) +
-			(sizeof(struct nes_hw_cqe) * NES_CCQ_SIZE) +
-			max(((u32)sizeof(struct nes_hw_ceqe) * NES_CCEQ_SIZE), (u32)256) +
-			max(((u32)sizeof(struct nes_hw_ceqe) * NES_NIC_CEQ_SIZE), (u32)256) +
-			(sizeof(struct nes_hw_aeqe) * nesadapter->max_qp) +
-			sizeof(struct nes_hw_cqp_qp_context);
-
-	nesdev->cqp_vbase = pci_zalloc_consistent(nesdev->pcidev,
-						  nesdev->cqp_mem_size,
-						  &nesdev->cqp_pbase);
-	if (!nesdev->cqp_vbase) {
-		nes_debug(NES_DBG_INIT, "Unable to allocate memory for host descriptor rings\n");
-		return -ENOMEM;
-	}
-
-	/* Allocate a twice the number of CQP requests as the SQ size */
-	nesdev->nes_cqp_requests = kzalloc(sizeof(struct nes_cqp_request) *
-			2 * NES_CQP_SQ_SIZE, GFP_KERNEL);
-	if (!nesdev->nes_cqp_requests) {
-		pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase,
-				nesdev->cqp.sq_pbase);
-		return -ENOMEM;
-	}
-
-	nes_debug(NES_DBG_INIT, "Allocated CQP structures at %p (phys = %016lX), size = %u.\n",
-			nesdev->cqp_vbase, (unsigned long)nesdev->cqp_pbase, nesdev->cqp_mem_size);
-
-	spin_lock_init(&nesdev->cqp.lock);
-	init_waitqueue_head(&nesdev->cqp.waitq);
-
-	/* Setup Various Structures */
-	vmem = (void *)(((unsigned long)nesdev->cqp_vbase + (512 - 1)) &
-			~(unsigned long)(512 - 1));
-	pmem = (dma_addr_t)(((unsigned long long)nesdev->cqp_pbase + (512 - 1)) &
-			~(unsigned long long)(512 - 1));
-
-	nesdev->cqp.sq_vbase = vmem;
-	nesdev->cqp.sq_pbase = pmem;
-	nesdev->cqp.sq_size = NES_CQP_SQ_SIZE;
-	nesdev->cqp.sq_head = 0;
-	nesdev->cqp.sq_tail = 0;
-	nesdev->cqp.qp_id = PCI_FUNC(nesdev->pcidev->devfn);
-
-	vmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size);
-	pmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size);
-
-	nesdev->ccq.cq_vbase = vmem;
-	nesdev->ccq.cq_pbase = pmem;
-	nesdev->ccq.cq_size = NES_CCQ_SIZE;
-	nesdev->ccq.cq_head = 0;
-	nesdev->ccq.ce_handler = nes_cqp_ce_handler;
-	nesdev->ccq.cq_number = PCI_FUNC(nesdev->pcidev->devfn);
-
-	vmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size);
-	pmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size);
-
-	nesdev->ceq_index = PCI_FUNC(nesdev->pcidev->devfn);
-	ceq = &nesadapter->ceq[nesdev->ceq_index];
-	ceq->ceq_vbase = vmem;
-	ceq->ceq_pbase = pmem;
-	ceq->ceq_size = NES_CCEQ_SIZE;
-	ceq->ceq_head = 0;
-
-	vmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256);
-	pmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256);
-
-	nesdev->nic_ceq_index = PCI_FUNC(nesdev->pcidev->devfn) + 8;
-	nic_ceq = &nesadapter->ceq[nesdev->nic_ceq_index];
-	nic_ceq->ceq_vbase = vmem;
-	nic_ceq->ceq_pbase = pmem;
-	nic_ceq->ceq_size = NES_NIC_CEQ_SIZE;
-	nic_ceq->ceq_head = 0;
-
-	vmem += max(((u32)sizeof(struct nes_hw_ceqe) * nic_ceq->ceq_size), (u32)256);
-	pmem += max(((u32)sizeof(struct nes_hw_ceqe) * nic_ceq->ceq_size), (u32)256);
-
-	aeq = &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)];
-	aeq->aeq_vbase = vmem;
-	aeq->aeq_pbase = pmem;
-	aeq->aeq_size = nesadapter->max_qp;
-	aeq->aeq_head = 0;
-
-	/* Setup QP Context */
-	vmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size);
-	pmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size);
-
-	cqp_qp_context = vmem;
-	cqp_qp_context->context_words[0] =
-			cpu_to_le32((PCI_FUNC(nesdev->pcidev->devfn) << 12) + (2 << 10));
-	cqp_qp_context->context_words[1] = 0;
-	cqp_qp_context->context_words[2] = cpu_to_le32((u32)nesdev->cqp.sq_pbase);
-	cqp_qp_context->context_words[3] = cpu_to_le32(((u64)nesdev->cqp.sq_pbase) >> 32);
-
-
-	/* Write the address to Create CQP */
-	if ((sizeof(dma_addr_t) > 4)) {
-		nes_write_indexed(nesdev,
-				NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8),
-				((u64)pmem) >> 32);
-	} else {
-		nes_write_indexed(nesdev,
-				NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8), 0);
-	}
-	nes_write_indexed(nesdev,
-			NES_IDX_CREATE_CQP_LOW + (PCI_FUNC(nesdev->pcidev->devfn) * 8),
-			(u32)pmem);
-
-	INIT_LIST_HEAD(&nesdev->cqp_avail_reqs);
-	INIT_LIST_HEAD(&nesdev->cqp_pending_reqs);
-
-	for (count = 0; count < 2*NES_CQP_SQ_SIZE; count++) {
-		init_waitqueue_head(&nesdev->nes_cqp_requests[count].waitq);
-		list_add_tail(&nesdev->nes_cqp_requests[count].list, &nesdev->cqp_avail_reqs);
-	}
-
-	/* Write Create CCQ WQE */
-	cqp_head = nesdev->cqp.sq_head++;
-	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-			(NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
-			NES_CQP_CQ_CHK_OVERFLOW | ((u32)nesdev->ccq.cq_size << 16)));
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
-			    (nesdev->ccq.cq_number |
-			     ((u32)nesdev->ceq_index << 16)));
-	u64temp = (u64)nesdev->ccq.cq_pbase;
-	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
-	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0;
-	u64temp = (unsigned long)&nesdev->ccq;
-	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] =
-			cpu_to_le32((u32)(u64temp >> 1));
-	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
-			cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
-	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
-
-	/* Write Create CEQ WQE */
-	cqp_head = nesdev->cqp.sq_head++;
-	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-			    (NES_CQP_CREATE_CEQ + ((u32)nesdev->ceq_index << 8)));
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, ceq->ceq_size);
-	u64temp = (u64)ceq->ceq_pbase;
-	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
-
-	/* Write Create AEQ WQE */
-	cqp_head = nesdev->cqp.sq_head++;
-	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-			(NES_CQP_CREATE_AEQ + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8)));
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_AEQ_WQE_ELEMENT_COUNT_IDX, aeq->aeq_size);
-	u64temp = (u64)aeq->aeq_pbase;
-	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
-
-	/* Write Create NIC CEQ WQE */
-	cqp_head = nesdev->cqp.sq_head++;
-	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-			(NES_CQP_CREATE_CEQ + ((u32)nesdev->nic_ceq_index << 8)));
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, nic_ceq->ceq_size);
-	u64temp = (u64)nic_ceq->ceq_pbase;
-	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
-
-	/* Poll until CCQP done */
-	count = 0;
-	do {
-		if (count++ > 1000) {
-			printk(KERN_ERR PFX "Error creating CQP\n");
-			pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size,
-					nesdev->cqp_vbase, nesdev->cqp_pbase);
-			return -1;
-		}
-		udelay(10);
-	} while (!(nes_read_indexed(nesdev,
-			NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn) * 8)) & (1 << 8)));
-
-	nes_debug(NES_DBG_INIT, "CQP Status = 0x%08X\n", nes_read_indexed(nesdev,
-			NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)));
-
-	u32temp = 0x04800000;
-	nes_write32(nesdev->regs+NES_WQE_ALLOC, u32temp | nesdev->cqp.qp_id);
-
-	/* wait for the CCQ, CEQ, and AEQ to get created */
-	count = 0;
-	do {
-		if (count++ > 1000) {
-			printk(KERN_ERR PFX "Error creating CCQ, CEQ, and AEQ\n");
-			pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size,
-					nesdev->cqp_vbase, nesdev->cqp_pbase);
-			return -1;
-		}
-		udelay(10);
-	} while (((nes_read_indexed(nesdev,
-			NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15<<8)) != (15<<8)));
-
-	/* dump the QP status value */
-	nes_debug(NES_DBG_INIT, "QP Status = 0x%08X\n", nes_read_indexed(nesdev,
-			NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)));
-
-	nesdev->cqp.sq_tail++;
-
-	return 0;
-}
-
-
-/**
- * nes_destroy_cqp
- */
-int nes_destroy_cqp(struct nes_device *nesdev)
-{
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	u32 count = 0;
-	u32 cqp_head;
-	unsigned long flags;
-
-	do {
-		if (count++ > 1000)
-			break;
-		udelay(10);
-	} while (!(nesdev->cqp.sq_head == nesdev->cqp.sq_tail));
-
-	/* Reset CCQ */
-	nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_RESET |
-			nesdev->ccq.cq_number);
-
-	/* Disable device interrupts */
-	nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff);
-
-	spin_lock_irqsave(&nesdev->cqp.lock, flags);
-
-	/* Destroy the AEQ */
-	cqp_head = nesdev->cqp.sq_head++;
-	nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
-	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_AEQ |
-			((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8));
-	cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX] = 0;
-
-	/* Destroy the NIC CEQ */
-	cqp_head = nesdev->cqp.sq_head++;
-	nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
-	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ |
-			((u32)nesdev->nic_ceq_index << 8));
-
-	/* Destroy the CEQ */
-	cqp_head = nesdev->cqp.sq_head++;
-	nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
-	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ |
-			(nesdev->ceq_index << 8));
-
-	/* Destroy the CCQ */
-	cqp_head = nesdev->cqp.sq_head++;
-	nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
-	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CQ);
-	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->ccq.cq_number |
-			((u32)nesdev->ceq_index << 16));
-
-	/* Destroy CQP */
-	cqp_head = nesdev->cqp.sq_head++;
-	nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
-	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_QP |
-			NES_CQP_QP_TYPE_CQP);
-	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->cqp.qp_id);
-
-	barrier();
-	/* Ring doorbell (5 WQEs) */
-	nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x05800000 | nesdev->cqp.qp_id);
-
-	spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-
-	/* wait for the CCQ, CEQ, and AEQ to get destroyed */
-	count = 0;
-	do {
-		if (count++ > 1000) {
-			printk(KERN_ERR PFX "Function%d: Error destroying CCQ, CEQ, and AEQ\n",
-					PCI_FUNC(nesdev->pcidev->devfn));
-			break;
-		}
-		udelay(10);
-	} while (((nes_read_indexed(nesdev,
-			NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15 << 8)) != 0));
-
-	/* dump the QP status value */
-	nes_debug(NES_DBG_SHUTDOWN, "Function%d: QP Status = 0x%08X\n",
-			PCI_FUNC(nesdev->pcidev->devfn),
-			nes_read_indexed(nesdev,
-			NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)));
-
-	kfree(nesdev->nes_cqp_requests);
-
-	/* Free the control structures */
-	pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase,
-			nesdev->cqp.sq_pbase);
-
-	return 0;
-}
-
-
-/**
- * nes_init_1g_phy
- */
-static int nes_init_1g_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index)
-{
-	u32 counter = 0;
-	u16 phy_data;
-	int ret = 0;
-
-	nes_read_1G_phy_reg(nesdev, 1, phy_index, &phy_data);
-	nes_write_1G_phy_reg(nesdev, 23, phy_index, 0xb000);
-
-	/* Reset the PHY */
-	nes_write_1G_phy_reg(nesdev, 0, phy_index, 0x8000);
-	udelay(100);
-	counter = 0;
-	do {
-		nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
-		if (counter++ > 100) {
-			ret = -1;
-			break;
-		}
-	} while (phy_data & 0x8000);
-
-	/* Setting no phy loopback */
-	phy_data &= 0xbfff;
-	phy_data |= 0x1140;
-	nes_write_1G_phy_reg(nesdev, 0, phy_index,  phy_data);
-	nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
-	nes_read_1G_phy_reg(nesdev, 0x17, phy_index, &phy_data);
-	nes_read_1G_phy_reg(nesdev, 0x1e, phy_index, &phy_data);
-
-	/* Setting the interrupt mask */
-	nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data);
-	nes_write_1G_phy_reg(nesdev, 0x19, phy_index, 0xffee);
-	nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data);
-
-	/* turning on flow control */
-	nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data);
-	nes_write_1G_phy_reg(nesdev, 4, phy_index, (phy_data & ~(0x03E0)) | 0xc00);
-	nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data);
-
-	/* Clear Half duplex */
-	nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data);
-	nes_write_1G_phy_reg(nesdev, 9, phy_index, phy_data & ~(0x0100));
-	nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data);
-
-	nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
-	nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data | 0x0300);
-
-	return ret;
-}
-
-
-/**
- * nes_init_2025_phy
- */
-static int nes_init_2025_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index)
-{
-	u32 temp_phy_data = 0;
-	u32 temp_phy_data2 = 0;
-	u32 counter = 0;
-	u32 sds;
-	u32 mac_index = nesdev->mac_index;
-	int ret = 0;
-	unsigned int first_attempt = 1;
-
-	/* Check firmware heartbeat */
-	nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
-	temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-	udelay(1500);
-	nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
-	temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-
-	if (temp_phy_data != temp_phy_data2) {
-		nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd);
-		temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-		if ((temp_phy_data & 0xff) > 0x20)
-			return 0;
-		printk(PFX "Reinitialize external PHY\n");
-	}
-
-	/* no heartbeat, configure the PHY */
-	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0x0000, 0x8000);
-	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0000);
-	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
-	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
-
-	switch (phy_type) {
-	case NES_PHY_TYPE_ARGUS:
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0008);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0001);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00);
-
-		/* setup LEDs */
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009);
-		break;
-
-	case NES_PHY_TYPE_SFP_D:
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x0004);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0038);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00);
-
-		/* setup LEDs */
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009);
-		break;
-
-	case NES_PHY_TYPE_KR:
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0010);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0080);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00);
-
-		/* setup LEDs */
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x000B);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x0003);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0004);
-
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0022, 0x406D);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0023, 0x0020);
-		break;
-	}
-
-	nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0028, 0xA528);
-
-	/* Bring PHY out of reset */
-	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0002);
-
-	/* Check for heartbeat */
-	counter = 0;
-	mdelay(690);
-	nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
-	temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-	do {
-		if (counter++ > 150) {
-			printk(PFX "No PHY heartbeat\n");
-			break;
-		}
-		mdelay(1);
-		nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
-		temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-	} while ((temp_phy_data2 == temp_phy_data));
-
-	/* wait for tracking */
-	counter = 0;
-	do {
-		nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd);
-		temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-		if (counter++ > 300) {
-			if (((temp_phy_data & 0xff) == 0x0) && first_attempt) {
-				first_attempt = 0;
-				counter = 0;
-				/* reset AMCC PHY and try again */
-				nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x00c0);
-				nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x0040);
-				continue;
-			} else {
-				ret = 1;
-				break;
-			}
-		}
-		mdelay(10);
-	} while ((temp_phy_data & 0xff) < 0x30);
-
-	/* setup signal integrity */
-	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd003, 0x0000);
-	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00D, 0x00FE);
-	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00E, 0x0032);
-	if (phy_type == NES_PHY_TYPE_KR) {
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x000C);
-	} else {
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x0002);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc314, 0x0063);
-	}
-
-	/* reset serdes */
-	sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200);
-	sds |= 0x1;
-	nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds);
-	sds &= 0xfffffffe;
-	nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds);
-
-	counter = 0;
-	while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) & 0x00000040) != 0x00000040)
-			&& (counter++ < 5000))
-		;
-
-	return ret;
-}
-
-
-/**
- * nes_init_phy
- */
-int nes_init_phy(struct nes_device *nesdev)
-{
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	u32 mac_index = nesdev->mac_index;
-	u32 tx_config = 0;
-	unsigned long flags;
-	u8  phy_type = nesadapter->phy_type[mac_index];
-	u8  phy_index = nesadapter->phy_index[mac_index];
-	int ret = 0;
-
-	tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG);
-	if (phy_type == NES_PHY_TYPE_1G) {
-		/* setup 1G MDIO operation */
-		tx_config &= 0xFFFFFFE3;
-		tx_config |= 0x04;
-	} else {
-		/* setup 10G MDIO operation */
-		tx_config &= 0xFFFFFFE3;
-		tx_config |= 0x1D;
-	}
-	nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config);
-
-	spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags);
-
-	switch (phy_type) {
-	case NES_PHY_TYPE_1G:
-		ret = nes_init_1g_phy(nesdev, phy_type, phy_index);
-		break;
-	case NES_PHY_TYPE_ARGUS:
-	case NES_PHY_TYPE_SFP_D:
-	case NES_PHY_TYPE_KR:
-		ret = nes_init_2025_phy(nesdev, phy_type, phy_index);
-		break;
-	}
-
-	spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
-
-	return ret;
-}
-
-
-/**
- * nes_replenish_nic_rq
- */
-static void nes_replenish_nic_rq(struct nes_vnic *nesvnic)
-{
-	unsigned long flags;
-	dma_addr_t bus_address;
-	struct sk_buff *skb;
-	struct nes_hw_nic_rq_wqe *nic_rqe;
-	struct nes_hw_nic *nesnic;
-	struct nes_device *nesdev;
-	struct nes_rskb_cb *cb;
-	u32 rx_wqes_posted = 0;
-
-	nesnic = &nesvnic->nic;
-	nesdev = nesvnic->nesdev;
-	spin_lock_irqsave(&nesnic->rq_lock, flags);
-	if (nesnic->replenishing_rq !=0) {
-		if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) &&
-				(atomic_read(&nesvnic->rx_skb_timer_running) == 0)) {
-			atomic_set(&nesvnic->rx_skb_timer_running, 1);
-			spin_unlock_irqrestore(&nesnic->rq_lock, flags);
-			nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2);	/* 1/2 second */
-			add_timer(&nesvnic->rq_wqes_timer);
-		} else
-		spin_unlock_irqrestore(&nesnic->rq_lock, flags);
-		return;
-	}
-	nesnic->replenishing_rq = 1;
-	spin_unlock_irqrestore(&nesnic->rq_lock, flags);
-	do {
-		skb = dev_alloc_skb(nesvnic->max_frame_size);
-		if (skb) {
-			skb->dev = nesvnic->netdev;
-
-			bus_address = pci_map_single(nesdev->pcidev,
-					skb->data, nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
-			cb = (struct nes_rskb_cb *)&skb->cb[0];
-			cb->busaddr = bus_address;
-			cb->maplen = nesvnic->max_frame_size;
-
-			nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_head];
-			nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] =
-					cpu_to_le32(nesvnic->max_frame_size);
-			nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0;
-			nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] =
-					cpu_to_le32((u32)bus_address);
-			nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] =
-					cpu_to_le32((u32)((u64)bus_address >> 32));
-			nesnic->rx_skb[nesnic->rq_head] = skb;
-			nesnic->rq_head++;
-			nesnic->rq_head &= nesnic->rq_size - 1;
-			atomic_dec(&nesvnic->rx_skbs_needed);
-			barrier();
-			if (++rx_wqes_posted == 255) {
-				nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id);
-				rx_wqes_posted = 0;
-			}
-		} else {
-			spin_lock_irqsave(&nesnic->rq_lock, flags);
-			if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) &&
-					(atomic_read(&nesvnic->rx_skb_timer_running) == 0)) {
-				atomic_set(&nesvnic->rx_skb_timer_running, 1);
-				spin_unlock_irqrestore(&nesnic->rq_lock, flags);
-				nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2);	/* 1/2 second */
-				add_timer(&nesvnic->rq_wqes_timer);
-			} else
-				spin_unlock_irqrestore(&nesnic->rq_lock, flags);
-			break;
-		}
-	} while (atomic_read(&nesvnic->rx_skbs_needed));
-	barrier();
-	if (rx_wqes_posted)
-		nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id);
-	nesnic->replenishing_rq = 0;
-}
-
-
-/**
- * nes_rq_wqes_timeout
- */
-static void nes_rq_wqes_timeout(struct timer_list *t)
-{
-	struct nes_vnic *nesvnic = from_timer(nesvnic, t, rq_wqes_timer);
-	printk("%s: Timer fired.\n", __func__);
-	atomic_set(&nesvnic->rx_skb_timer_running, 0);
-	if (atomic_read(&nesvnic->rx_skbs_needed))
-		nes_replenish_nic_rq(nesvnic);
-}
-
-
-/**
- * nes_init_nic_qp
- */
-int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev)
-{
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	struct nes_hw_nic_sq_wqe *nic_sqe;
-	struct nes_hw_nic_qp_context *nic_context;
-	struct sk_buff *skb;
-	struct nes_hw_nic_rq_wqe *nic_rqe;
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-	unsigned long flags;
-	void *vmem;
-	dma_addr_t pmem;
-	u64 u64temp;
-	int ret;
-	u32 cqp_head;
-	u32 counter;
-	u32 wqe_count;
-	struct nes_rskb_cb *cb;
-	u8 jumbomode=0;
-
-	/* Allocate fragment, SQ, RQ, and CQ; Reuse CEQ based on the PCI function */
-	nesvnic->nic_mem_size = 256 +
-			(NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag)) +
-			(NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)) +
-			(NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)) +
-			(NES_NIC_WQ_SIZE * 2 * sizeof(struct nes_hw_nic_cqe)) +
-			sizeof(struct nes_hw_nic_qp_context);
-
-	nesvnic->nic_vbase = pci_zalloc_consistent(nesdev->pcidev,
-						   nesvnic->nic_mem_size,
-						   &nesvnic->nic_pbase);
-	if (!nesvnic->nic_vbase) {
-		nes_debug(NES_DBG_INIT, "Unable to allocate memory for NIC host descriptor rings\n");
-		return -ENOMEM;
-	}
-	nes_debug(NES_DBG_INIT, "Allocated NIC QP structures at %p (phys = %016lX), size = %u.\n",
-			nesvnic->nic_vbase, (unsigned long)nesvnic->nic_pbase, nesvnic->nic_mem_size);
-
-	vmem = (void *)(((unsigned long)nesvnic->nic_vbase + (256 - 1)) &
-			~(unsigned long)(256 - 1));
-	pmem = (dma_addr_t)(((unsigned long long)nesvnic->nic_pbase + (256 - 1)) &
-			~(unsigned long long)(256 - 1));
-
-	/* Setup the first Fragment buffers */
-	nesvnic->nic.first_frag_vbase = vmem;
-
-	for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) {
-		nesvnic->nic.frag_paddr[counter] = pmem;
-		pmem += sizeof(struct nes_first_frag);
-	}
-
-	/* setup the SQ */
-	vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag));
-
-	nesvnic->nic.sq_vbase = (void *)vmem;
-	nesvnic->nic.sq_pbase = pmem;
-	nesvnic->nic.sq_head = 0;
-	nesvnic->nic.sq_tail = 0;
-	nesvnic->nic.sq_size = NES_NIC_WQ_SIZE;
-	for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) {
-		nic_sqe = &nesvnic->nic.sq_vbase[counter];
-		nic_sqe->wqe_words[NES_NIC_SQ_WQE_MISC_IDX] =
-				cpu_to_le32(NES_NIC_SQ_WQE_DISABLE_CHKSUM |
-				NES_NIC_SQ_WQE_COMPLETION);
-		nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX] =
-				cpu_to_le32((u32)NES_FIRST_FRAG_SIZE << 16);
-		nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX] =
-				cpu_to_le32((u32)nesvnic->nic.frag_paddr[counter]);
-		nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX] =
-				cpu_to_le32((u32)((u64)nesvnic->nic.frag_paddr[counter] >> 32));
-	}
-
-	nesvnic->get_cqp_request = nes_get_cqp_request;
-	nesvnic->post_cqp_request = nes_post_cqp_request;
-	nesvnic->mcrq_mcast_filter = NULL;
-
-	spin_lock_init(&nesvnic->nic.rq_lock);
-
-	/* setup the RQ */
-	vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe));
-	pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe));
-
-
-	nesvnic->nic.rq_vbase = vmem;
-	nesvnic->nic.rq_pbase = pmem;
-	nesvnic->nic.rq_head = 0;
-	nesvnic->nic.rq_tail = 0;
-	nesvnic->nic.rq_size = NES_NIC_WQ_SIZE;
-
-	/* setup the CQ */
-	vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe));
-	pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe));
-
-	if (nesdev->nesadapter->netdev_count > 2)
-		nesvnic->mcrq_qp_id = nesvnic->nic_index + 32;
-	else
-		nesvnic->mcrq_qp_id = nesvnic->nic.qp_id + 4;
-
-	nesvnic->nic_cq.cq_vbase = vmem;
-	nesvnic->nic_cq.cq_pbase = pmem;
-	nesvnic->nic_cq.cq_head = 0;
-	nesvnic->nic_cq.cq_size = NES_NIC_WQ_SIZE * 2;
-
-	nesvnic->nic_cq.ce_handler = nes_nic_napi_ce_handler;
-
-	/* Send CreateCQ request to CQP */
-	spin_lock_irqsave(&nesdev->cqp.lock, flags);
-	cqp_head = nesdev->cqp.sq_head;
-
-	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-
-	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(
-			NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
-			((u32)nesvnic->nic_cq.cq_size << 16));
-	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(
-			nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16));
-	u64temp = (u64)nesvnic->nic_cq.cq_pbase;
-	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
-	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =  0;
-	u64temp = (unsigned long)&nesvnic->nic_cq;
-	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] =  cpu_to_le32((u32)(u64temp >> 1));
-	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
-			cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
-	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
-	if (++cqp_head >= nesdev->cqp.sq_size)
-		cqp_head = 0;
-	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-
-	/* Send CreateQP request to CQP */
-	nic_context = (void *)(&nesvnic->nic_cq.cq_vbase[nesvnic->nic_cq.cq_size]);
-	nic_context->context_words[NES_NIC_CTX_MISC_IDX] =
-			cpu_to_le32((u32)NES_NIC_CTX_SIZE |
-			((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12));
-	nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n",
-			nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE),
-			nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE));
-	if (nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0) {
-		nic_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE);
-	}
-
-	u64temp = (u64)nesvnic->nic.sq_pbase;
-	nic_context->context_words[NES_NIC_CTX_SQ_LOW_IDX]  = cpu_to_le32((u32)u64temp);
-	nic_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32));
-	u64temp = (u64)nesvnic->nic.rq_pbase;
-	nic_context->context_words[NES_NIC_CTX_RQ_LOW_IDX]  = cpu_to_le32((u32)u64temp);
-	nic_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32));
-
-	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP |
-			NES_CQP_QP_TYPE_NIC);
-	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesvnic->nic.qp_id);
-	u64temp = (u64)nesvnic->nic_cq.cq_pbase +
-			(nesvnic->nic_cq.cq_size * sizeof(struct nes_hw_nic_cqe));
-	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
-
-	if (++cqp_head >= nesdev->cqp.sq_size)
-		cqp_head = 0;
-	nesdev->cqp.sq_head = cqp_head;
-
-	barrier();
-
-	/* Ring doorbell (2 WQEs) */
-	nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id);
-
-	spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-	nes_debug(NES_DBG_INIT, "Waiting for create NIC QP%u to complete.\n",
-			nesvnic->nic.qp_id);
-
-	ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head),
-			NES_EVENT_TIMEOUT);
-	nes_debug(NES_DBG_INIT, "Create NIC QP%u completed, wait_event_timeout ret = %u.\n",
-			nesvnic->nic.qp_id, ret);
-	if (!ret) {
-		nes_debug(NES_DBG_INIT, "NIC QP%u create timeout expired\n", nesvnic->nic.qp_id);
-		pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase,
-				nesvnic->nic_pbase);
-		return -EIO;
-	}
-
-	/* Populate the RQ */
-	for (counter = 0; counter < (NES_NIC_WQ_SIZE - 1); counter++) {
-		skb = dev_alloc_skb(nesvnic->max_frame_size);
-		if (!skb) {
-			nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name);
-
-			nes_destroy_nic_qp(nesvnic);
-			return -ENOMEM;
-		}
-
-		skb->dev = netdev;
-
-		pmem = pci_map_single(nesdev->pcidev, skb->data,
-				nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
-		cb = (struct nes_rskb_cb *)&skb->cb[0];
-		cb->busaddr = pmem;
-		cb->maplen = nesvnic->max_frame_size;
-
-		nic_rqe = &nesvnic->nic.rq_vbase[counter];
-		nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32(nesvnic->max_frame_size);
-		nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0;
-		nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX]  = cpu_to_le32((u32)pmem);
-		nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32));
-		nesvnic->nic.rx_skb[counter] = skb;
-	}
-
-	wqe_count = NES_NIC_WQ_SIZE - 1;
-	nesvnic->nic.rq_head = wqe_count;
-	barrier();
-	do {
-		counter = min(wqe_count, ((u32)255));
-		wqe_count -= counter;
-		nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter << 24) | nesvnic->nic.qp_id);
-	} while (wqe_count);
-	timer_setup(&nesvnic->rq_wqes_timer, nes_rq_wqes_timeout, 0);
-	nes_debug(NES_DBG_INIT, "NAPI support Enabled\n");
-	if (nesdev->nesadapter->et_use_adaptive_rx_coalesce)
-	{
-		nes_nic_init_timer(nesdev);
-		if (netdev->mtu > 1500)
-			jumbomode = 1;
-		nes_nic_init_timer_defaults(nesdev, jumbomode);
-	}
-	if ((nesdev->nesadapter->allow_unaligned_fpdus) &&
-		(nes_init_mgt_qp(nesdev, netdev, nesvnic))) {
-		nes_debug(NES_DBG_INIT, "%s: Out of memory for pau nic\n",
-			  netdev->name);
-		nes_destroy_nic_qp(nesvnic);
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-
-/**
- * nes_destroy_nic_qp
- */
-void nes_destroy_nic_qp(struct nes_vnic *nesvnic)
-{
-	u64 u64temp;
-	dma_addr_t bus_address;
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	struct nes_hw_nic_sq_wqe *nic_sqe;
-	__le16 *wqe_fragment_length;
-	u16  wqe_fragment_index;
-	u32 cqp_head;
-	u32 wqm_cfg0;
-	unsigned long flags;
-	struct sk_buff *rx_skb;
-	struct nes_rskb_cb *cb;
-	int ret;
-
-	if (nesdev->nesadapter->allow_unaligned_fpdus)
-		nes_destroy_mgt(nesvnic);
-
-	/* clear wqe stall before destroying NIC QP */
-	wqm_cfg0 = nes_read_indexed(nesdev, NES_IDX_WQM_CONFIG0);
-	nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG0, wqm_cfg0 & 0xFFFF7FFF);
-
-	/* Free remaining NIC receive buffers */
-	while (nesvnic->nic.rq_head != nesvnic->nic.rq_tail) {
-		rx_skb = nesvnic->nic.rx_skb[nesvnic->nic.rq_tail];
-		cb = (struct nes_rskb_cb *)&rx_skb->cb[0];
-		pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen,
-			PCI_DMA_FROMDEVICE);
-
-		dev_kfree_skb(nesvnic->nic.rx_skb[nesvnic->nic.rq_tail++]);
-		nesvnic->nic.rq_tail &= (nesvnic->nic.rq_size - 1);
-	}
-
-	/* Free remaining NIC transmit buffers */
-	while (nesvnic->nic.sq_head != nesvnic->nic.sq_tail) {
-		nic_sqe = &nesvnic->nic.sq_vbase[nesvnic->nic.sq_tail];
-		wqe_fragment_index = 1;
-		wqe_fragment_length = (__le16 *)
-			&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
-		/* bump past the vlan tag */
-		wqe_fragment_length++;
-		if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) {
-			u64temp = (u64)le32_to_cpu(
-				nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX+
-				wqe_fragment_index*2]);
-			u64temp += ((u64)le32_to_cpu(
-				nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX
-				+ wqe_fragment_index*2]))<<32;
-			bus_address = (dma_addr_t)u64temp;
-			if (test_and_clear_bit(nesvnic->nic.sq_tail,
-					nesvnic->nic.first_frag_overflow)) {
-				pci_unmap_single(nesdev->pcidev,
-						bus_address,
-						le16_to_cpu(wqe_fragment_length[
-							wqe_fragment_index++]),
-						PCI_DMA_TODEVICE);
-			}
-			for (; wqe_fragment_index < 5; wqe_fragment_index++) {
-				if (wqe_fragment_length[wqe_fragment_index]) {
-					u64temp = le32_to_cpu(
-						nic_sqe->wqe_words[
-						NES_NIC_SQ_WQE_FRAG0_LOW_IDX+
-						wqe_fragment_index*2]);
-					u64temp += ((u64)le32_to_cpu(
-						nic_sqe->wqe_words[
-						NES_NIC_SQ_WQE_FRAG0_HIGH_IDX+
-						wqe_fragment_index*2]))<<32;
-					bus_address = (dma_addr_t)u64temp;
-					pci_unmap_page(nesdev->pcidev,
-							bus_address,
-							le16_to_cpu(
-							wqe_fragment_length[
-							wqe_fragment_index]),
-							PCI_DMA_TODEVICE);
-				} else
-					break;
-			}
-		}
-		if (nesvnic->nic.tx_skb[nesvnic->nic.sq_tail])
-			dev_kfree_skb(
-				nesvnic->nic.tx_skb[nesvnic->nic.sq_tail]);
-
-		nesvnic->nic.sq_tail = (nesvnic->nic.sq_tail + 1)
-					& (nesvnic->nic.sq_size - 1);
-	}
-
-	spin_lock_irqsave(&nesdev->cqp.lock, flags);
-
-	/* Destroy NIC QP */
-	cqp_head = nesdev->cqp.sq_head;
-	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-		(NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC));
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
-		nesvnic->nic.qp_id);
-
-	if (++cqp_head >= nesdev->cqp.sq_size)
-		cqp_head = 0;
-
-	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-
-	/* Destroy NIC CQ */
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-		(NES_CQP_DESTROY_CQ | ((u32)nesvnic->nic_cq.cq_size << 16)));
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
-		(nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16)));
-
-	if (++cqp_head >= nesdev->cqp.sq_size)
-		cqp_head = 0;
-
-	nesdev->cqp.sq_head = cqp_head;
-	barrier();
-
-	/* Ring doorbell (2 WQEs) */
-	nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id);
-
-	spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-	nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u,"
-			" cqp.sq_tail=%u, cqp.sq_size=%u\n",
-			cqp_head, nesdev->cqp.sq_head,
-			nesdev->cqp.sq_tail, nesdev->cqp.sq_size);
-
-	ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head),
-			NES_EVENT_TIMEOUT);
-
-	nes_debug(NES_DBG_SHUTDOWN, "Destroy NIC QP returned, wait_event_timeout ret = %u, cqp_head=%u,"
-			" cqp.sq_head=%u, cqp.sq_tail=%u\n",
-			ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail);
-	if (!ret) {
-		nes_debug(NES_DBG_SHUTDOWN, "NIC QP%u destroy timeout expired\n",
-				nesvnic->nic.qp_id);
-	}
-
-	pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase,
-			nesvnic->nic_pbase);
-
-	/* restore old wqm_cfg0 value */
-	nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG0, wqm_cfg0);
-}
-
-/**
- * nes_napi_isr
- */
-int nes_napi_isr(struct nes_device *nesdev)
-{
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	u32 int_stat;
-
-	if (nesdev->napi_isr_ran) {
-		/* interrupt status has already been read in ISR */
-		int_stat = nesdev->int_stat;
-	} else {
-		int_stat = nes_read32(nesdev->regs + NES_INT_STAT);
-		nesdev->int_stat = int_stat;
-		nesdev->napi_isr_ran = 1;
-	}
-
-	int_stat &= nesdev->int_req;
-	/* iff NIC, process here, else wait for DPC */
-	if ((int_stat) && ((int_stat & 0x0000ff00) == int_stat)) {
-		nesdev->napi_isr_ran = 0;
-		nes_write32(nesdev->regs + NES_INT_STAT,
-			(int_stat &
-			~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0 | NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3)));
-
-		/* Process the CEQs */
-		nes_process_ceq(nesdev, &nesdev->nesadapter->ceq[nesdev->nic_ceq_index]);
-
-		if (unlikely((((nesadapter->et_rx_coalesce_usecs_irq) &&
-					(!nesadapter->et_use_adaptive_rx_coalesce)) ||
-					((nesadapter->et_use_adaptive_rx_coalesce) &&
-					 (nesdev->deepcq_count > nesadapter->et_pkt_rate_low))))) {
-			if ((nesdev->int_req & NES_INT_TIMER) == 0) {
-				/* Enable Periodic timer interrupts */
-				nesdev->int_req |= NES_INT_TIMER;
-				/* ack any pending periodic timer interrupts so we don't get an immediate interrupt */
-				/* TODO: need to also ack other unused periodic timer values, get from nesadapter */
-				nes_write32(nesdev->regs+NES_TIMER_STAT,
-						nesdev->timer_int_req  | ~(nesdev->nesadapter->timer_int_req));
-				nes_write32(nesdev->regs+NES_INTF_INT_MASK,
-						~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER));
-			}
-
-			if (unlikely(nesadapter->et_use_adaptive_rx_coalesce))
-			{
-				nes_nic_init_timer(nesdev);
-			}
-			/* Enable interrupts, except CEQs */
-			nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req));
-		} else {
-			/* Enable interrupts, make sure timer is off */
-			nesdev->int_req &= ~NES_INT_TIMER;
-			nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
-			nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
-		}
-		nesdev->deepcq_count = 0;
-		return 1;
-	} else {
-		return 0;
-	}
-}
-
-static void process_critical_error(struct nes_device *nesdev)
-{
-	u32 debug_error;
-	u32 nes_idx_debug_error_masks0 = 0;
-	u16 error_module = 0;
-
-	debug_error = nes_read_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS);
-	printk(KERN_ERR PFX "Critical Error reported by device!!! 0x%02X\n",
-			(u16)debug_error);
-	nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS,
-			0x01010000 | (debug_error & 0x0000ffff));
-	if (crit_err_count++ > 10)
-		nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 1 << 0x17);
-	error_module = (u16) (debug_error & 0x1F00) >> 8;
-	if (++nesdev->nesadapter->crit_error_count[error_module-1] >=
-			nes_max_critical_error_count) {
-		printk(KERN_ERR PFX "Masking off critical error for module "
-			"0x%02X\n", (u16)error_module);
-		nes_idx_debug_error_masks0 = nes_read_indexed(nesdev,
-			NES_IDX_DEBUG_ERROR_MASKS0);
-		nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS0,
-			nes_idx_debug_error_masks0 | (1 << error_module));
-	}
-}
-/**
- * nes_dpc
- */
-void nes_dpc(unsigned long param)
-{
-	struct nes_device *nesdev = (struct nes_device *)param;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	u32 counter;
-	u32 loop_counter = 0;
-	u32 int_status_bit;
-	u32 int_stat;
-	u32 timer_stat;
-	u32 temp_int_stat;
-	u32 intf_int_stat;
-	u32 processed_intf_int = 0;
-	u16 processed_timer_int = 0;
-	u16 completion_ints = 0;
-	u16 timer_ints = 0;
-
-	/* nes_debug(NES_DBG_ISR, "\n"); */
-
-	do {
-		timer_stat = 0;
-		if (nesdev->napi_isr_ran) {
-			nesdev->napi_isr_ran = 0;
-			int_stat = nesdev->int_stat;
-		} else
-			int_stat = nes_read32(nesdev->regs+NES_INT_STAT);
-		if (processed_intf_int != 0)
-			int_stat &= nesdev->int_req & ~NES_INT_INTF;
-		else
-			int_stat &= nesdev->int_req;
-		if (processed_timer_int == 0) {
-			processed_timer_int = 1;
-			if (int_stat & NES_INT_TIMER) {
-				timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT);
-				if ((timer_stat & nesdev->timer_int_req) == 0) {
-					int_stat &= ~NES_INT_TIMER;
-				}
-			}
-		} else {
-			int_stat &= ~NES_INT_TIMER;
-		}
-
-		if (int_stat) {
-			if (int_stat & ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0|
-					NES_INT_MAC1|NES_INT_MAC2 | NES_INT_MAC3)) {
-				/* Ack the interrupts */
-				nes_write32(nesdev->regs+NES_INT_STAT,
-					(int_stat & ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0|
-					NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3)));
-			}
-
-			temp_int_stat = int_stat;
-			for (counter = 0, int_status_bit = 1; counter < 16; counter++) {
-				if (int_stat & int_status_bit) {
-					nes_process_ceq(nesdev, &nesadapter->ceq[counter]);
-					temp_int_stat &= ~int_status_bit;
-					completion_ints = 1;
-				}
-				if (!(temp_int_stat & 0x0000ffff))
-					break;
-				int_status_bit <<= 1;
-			}
-
-			/* Process the AEQ for this pci function */
-			int_status_bit = 1 << (16 + PCI_FUNC(nesdev->pcidev->devfn));
-			if (int_stat & int_status_bit) {
-				nes_process_aeq(nesdev, &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)]);
-			}
-
-			/* Process the MAC interrupt for this pci function */
-			int_status_bit = 1 << (24 + nesdev->mac_index);
-			if (int_stat & int_status_bit) {
-				nes_process_mac_intr(nesdev, nesdev->mac_index);
-			}
-
-			if (int_stat & NES_INT_TIMER) {
-				if (timer_stat & nesdev->timer_int_req) {
-					nes_write32(nesdev->regs + NES_TIMER_STAT,
-							(timer_stat & nesdev->timer_int_req) |
-							~(nesdev->nesadapter->timer_int_req));
-					timer_ints = 1;
-				}
-			}
-
-			if (int_stat & NES_INT_INTF) {
-				processed_intf_int = 1;
-				intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT);
-				intf_int_stat &= nesdev->intf_int_req;
-				if (NES_INTF_INT_CRITERR & intf_int_stat) {
-					process_critical_error(nesdev);
-				}
-				if (NES_INTF_INT_PCIERR & intf_int_stat) {
-					printk(KERN_ERR PFX "PCI Error reported by device!!!\n");
-					BUG();
-				}
-				if (NES_INTF_INT_AEQ_OFLOW & intf_int_stat) {
-					printk(KERN_ERR PFX "AEQ Overflow reported by device!!!\n");
-					BUG();
-				}
-				nes_write32(nesdev->regs+NES_INTF_INT_STAT, intf_int_stat);
-			}
-
-			if (int_stat & NES_INT_TSW) {
-			}
-		}
-		/* Don't use the interface interrupt bit stay in loop */
-		int_stat &= ~NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0 |
-				NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3;
-	} while ((int_stat != 0) && (loop_counter++ < MAX_DPC_ITERATIONS));
-
-	if (timer_ints == 1) {
-		if ((nesadapter->et_rx_coalesce_usecs_irq) || (nesadapter->et_use_adaptive_rx_coalesce)) {
-			if (completion_ints == 0) {
-				nesdev->timer_only_int_count++;
-				if (nesdev->timer_only_int_count>=nesadapter->timer_int_limit) {
-					nesdev->timer_only_int_count = 0;
-					nesdev->int_req &= ~NES_INT_TIMER;
-					nes_write32(nesdev->regs + NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
-					nes_write32(nesdev->regs + NES_INT_MASK, ~nesdev->int_req);
-				} else {
-					nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req));
-				}
-			} else {
-				if (unlikely(nesadapter->et_use_adaptive_rx_coalesce))
-				{
-					nes_nic_init_timer(nesdev);
-				}
-				nesdev->timer_only_int_count = 0;
-				nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req));
-			}
-		} else {
-			nesdev->timer_only_int_count = 0;
-			nesdev->int_req &= ~NES_INT_TIMER;
-			nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
-			nes_write32(nesdev->regs+NES_TIMER_STAT,
-					nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req));
-			nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
-		}
-	} else {
-		if ( (completion_ints == 1) &&
-			 (((nesadapter->et_rx_coalesce_usecs_irq) &&
-			   (!nesadapter->et_use_adaptive_rx_coalesce)) ||
-			  ((nesdev->deepcq_count > nesadapter->et_pkt_rate_low) &&
-			   (nesadapter->et_use_adaptive_rx_coalesce) )) ) {
-			/* nes_debug(NES_DBG_ISR, "Enabling periodic timer interrupt.\n" ); */
-			nesdev->timer_only_int_count = 0;
-			nesdev->int_req |= NES_INT_TIMER;
-			nes_write32(nesdev->regs+NES_TIMER_STAT,
-					nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req));
-			nes_write32(nesdev->regs+NES_INTF_INT_MASK,
-					~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER));
-			nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req));
-		} else {
-			nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
-		}
-	}
-	nesdev->deepcq_count = 0;
-}
-
-
-/**
- * nes_process_ceq
- */
-static void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq)
-{
-	u64 u64temp;
-	struct nes_hw_cq *cq;
-	u32 head;
-	u32 ceq_size;
-
-	/* nes_debug(NES_DBG_CQ, "\n"); */
-	head = ceq->ceq_head;
-	ceq_size = ceq->ceq_size;
-
-	do {
-		if (le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]) &
-				NES_CEQE_VALID) {
-			u64temp = (((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]))) << 32) |
-						((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_LOW_IDX])));
-			u64temp <<= 1;
-			cq = *((struct nes_hw_cq **)&u64temp);
-			/* nes_debug(NES_DBG_CQ, "pCQ = %p\n", cq); */
-			barrier();
-			ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX] = 0;
-
-			/* call the event handler */
-			cq->ce_handler(nesdev, cq);
-
-			if (++head >= ceq_size)
-				head = 0;
-		} else {
-			break;
-		}
-
-	} while (1);
-
-	ceq->ceq_head = head;
-}
-
-
-/**
- * nes_process_aeq
- */
-static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq)
-{
-	/* u64 u64temp; */
-	u32 head;
-	u32 aeq_size;
-	u32 aeqe_misc;
-	u32 aeqe_cq_id;
-	struct nes_hw_aeqe volatile *aeqe;
-
-	head = aeq->aeq_head;
-	aeq_size = aeq->aeq_size;
-
-	do {
-		aeqe = &aeq->aeq_vbase[head];
-		if ((le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]) & NES_AEQE_VALID) == 0)
-			break;
-		aeqe_misc  = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
-		aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]);
-		if (aeqe_misc & (NES_AEQE_QP|NES_AEQE_CQ)) {
-			if (aeqe_cq_id >= NES_FIRST_QPN) {
-				/* dealing with an accelerated QP related AE */
-				/*
-				 * u64temp = (((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX]))) << 32) |
-				 *	     ((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX])));
-				 */
-				nes_process_iwarp_aeqe(nesdev, (struct nes_hw_aeqe *)aeqe);
-			} else {
-				/* TODO: dealing with a CQP related AE */
-				nes_debug(NES_DBG_AEQ, "Processing CQP related AE, misc = 0x%04X\n",
-						(u16)(aeqe_misc >> 16));
-			}
-		}
-
-		aeqe->aeqe_words[NES_AEQE_MISC_IDX] = 0;
-
-		if (++head >= aeq_size)
-			head = 0;
-
-		nes_write32(nesdev->regs + NES_AEQ_ALLOC, 1 << 16);
-	}
-	while (1);
-	aeq->aeq_head = head;
-}
-
-static void nes_reset_link(struct nes_device *nesdev, u32 mac_index)
-{
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	u32 reset_value;
-	u32 i=0;
-	u32 u32temp;
-
-	if (nesadapter->hw_rev == NE020_REV) {
-		return;
-	}
-	mh_detected++;
-
-	reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
-
-	if ((mac_index == 0) || ((mac_index == 1) && (nesadapter->OneG_Mode)))
-		reset_value |= 0x0000001d;
-	else
-		reset_value |= 0x0000002d;
-
-	if (4 <= (nesadapter->link_interrupt_count[mac_index] / ((u16)NES_MAX_LINK_INTERRUPTS))) {
-		if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) {
-			nesadapter->link_interrupt_count[0] = 0;
-			nesadapter->link_interrupt_count[1] = 0;
-			u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
-			if (0x00000040 & u32temp)
-				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088);
-			else
-				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8);
-
-			reset_value |= 0x0000003d;
-		}
-		nesadapter->link_interrupt_count[mac_index] = 0;
-	}
-
-	nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
-
-	while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
-			& 0x00000040) != 0x00000040) && (i++ < 5000));
-
-	if (0x0000003d == (reset_value & 0x0000003d)) {
-		u32 pcs_control_status0, pcs_control_status1;
-
-		for (i = 0; i < 10; i++) {
-			pcs_control_status0 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0);
-			pcs_control_status1 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
-			if (((0x0F000000 == (pcs_control_status0 & 0x0F000000))
-			     && (pcs_control_status0 & 0x00100000))
-			    || ((0x0F000000 == (pcs_control_status1 & 0x0F000000))
-				&& (pcs_control_status1 & 0x00100000)))
-				continue;
-			else
-				break;
-		}
-		if (10 == i) {
-			u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
-			if (0x00000040 & u32temp)
-				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088);
-			else
-				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8);
-
-			nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
-
-			while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET)
-				 & 0x00000040) != 0x00000040) && (i++ < 5000));
-		}
-	}
-}
-
-/**
- * nes_process_mac_intr
- */
-static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number)
-{
-	unsigned long flags;
-	u32 pcs_control_status;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct nes_vnic *nesvnic;
-	u32 mac_status;
-	u32 mac_index = nesdev->mac_index;
-	u32 u32temp;
-	u16 phy_data;
-	u16 temp_phy_data;
-	u32 pcs_val  = 0x0f0f0000;
-	u32 pcs_mask = 0x0f1f0000;
-	u32 cdr_ctrl;
-
-	spin_lock_irqsave(&nesadapter->phy_lock, flags);
-	if (nesadapter->mac_sw_state[mac_number] != NES_MAC_SW_IDLE) {
-		spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-		return;
-	}
-	nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_INTERRUPT;
-
-	/* ack the MAC interrupt */
-	mac_status = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200));
-	/* Clear the interrupt */
-	nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200), mac_status);
-
-	nes_debug(NES_DBG_PHY, "MAC%u interrupt status = 0x%X.\n", mac_number, mac_status);
-
-	if (mac_status & (NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT)) {
-		nesdev->link_status_interrupts++;
-		if (0 == (++nesadapter->link_interrupt_count[mac_index] % ((u16)NES_MAX_LINK_INTERRUPTS)))
-			nes_reset_link(nesdev, mac_index);
-
-		/* read the PHY interrupt status register */
-		if ((nesadapter->OneG_Mode) &&
-		(nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) {
-			do {
-				nes_read_1G_phy_reg(nesdev, 0x1a,
-						nesadapter->phy_index[mac_index], &phy_data);
-				nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1a = 0x%X.\n",
-						nesadapter->phy_index[mac_index], phy_data);
-			} while (phy_data&0x8000);
-
-			temp_phy_data = 0;
-			do {
-				nes_read_1G_phy_reg(nesdev, 0x11,
-						nesadapter->phy_index[mac_index], &phy_data);
-				nes_debug(NES_DBG_PHY, "Phy%d data from register 0x11 = 0x%X.\n",
-						nesadapter->phy_index[mac_index], phy_data);
-				if (temp_phy_data == phy_data)
-					break;
-				temp_phy_data = phy_data;
-			} while (1);
-
-			nes_read_1G_phy_reg(nesdev, 0x1e,
-					nesadapter->phy_index[mac_index], &phy_data);
-			nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1e = 0x%X.\n",
-					nesadapter->phy_index[mac_index], phy_data);
-
-			nes_read_1G_phy_reg(nesdev, 1,
-					nesadapter->phy_index[mac_index], &phy_data);
-			nes_debug(NES_DBG_PHY, "1G phy%u data from register 1 = 0x%X\n",
-					nesadapter->phy_index[mac_index], phy_data);
-
-			if (temp_phy_data & 0x1000) {
-				nes_debug(NES_DBG_PHY, "The Link is up according to the PHY\n");
-				phy_data = 4;
-			} else {
-				nes_debug(NES_DBG_PHY, "The Link is down according to the PHY\n");
-			}
-		}
-		nes_debug(NES_DBG_PHY, "Eth SERDES Common Status: 0=0x%08X, 1=0x%08X\n",
-				nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0),
-				nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0+0x200));
-
-		if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_PUMA_1G) {
-			switch (mac_index) {
-			case 1:
-			case 3:
-				pcs_control_status = nes_read_indexed(nesdev,
-						NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
-				break;
-			default:
-				pcs_control_status = nes_read_indexed(nesdev,
-						NES_IDX_PHY_PCS_CONTROL_STATUS0);
-				break;
-			}
-		} else {
-			pcs_control_status = nes_read_indexed(nesdev,
-					NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index & 1) * 0x200));
-			pcs_control_status = nes_read_indexed(nesdev,
-					NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index & 1) * 0x200));
-		}
-
-		nes_debug(NES_DBG_PHY, "PCS PHY Control/Status%u: 0x%08X\n",
-				mac_index, pcs_control_status);
-		if ((nesadapter->OneG_Mode) &&
-				(nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) {
-			u32temp = 0x01010000;
-			if (nesadapter->port_count > 2) {
-				u32temp |= 0x02020000;
-			}
-			if ((pcs_control_status & u32temp)!= u32temp) {
-				phy_data = 0;
-				nes_debug(NES_DBG_PHY, "PCS says the link is down\n");
-			}
-		} else {
-			switch (nesadapter->phy_type[mac_index]) {
-			case NES_PHY_TYPE_ARGUS:
-			case NES_PHY_TYPE_SFP_D:
-			case NES_PHY_TYPE_KR:
-				/* clear the alarms */
-				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0x0008);
-				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc001);
-				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc002);
-				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc005);
-				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc006);
-				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003);
-				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9004);
-				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9005);
-				/* check link status */
-				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003);
-				temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-
-				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021);
-				nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021);
-				phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-
-				phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0;
-
-				nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n",
-					__func__, phy_data, nesadapter->mac_link_down[mac_index] ? "DOWN" : "UP");
-				break;
-
-			case NES_PHY_TYPE_PUMA_1G:
-				if (mac_index < 2)
-					pcs_val = pcs_mask = 0x01010000;
-				else
-					pcs_val = pcs_mask = 0x02020000;
-				/* fall through */
-			default:
-				phy_data = (pcs_val == (pcs_control_status & pcs_mask)) ? 0x4 : 0x0;
-				break;
-			}
-		}
-
-		if (phy_data & 0x0004) {
-			if (wide_ppm_offset &&
-			    (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_CX4) &&
-			    (nesadapter->hw_rev != NE020_REV)) {
-				cdr_ctrl = nes_read_indexed(nesdev,
-							    NES_IDX_ETH_SERDES_CDR_CONTROL0 +
-							    mac_index * 0x200);
-				nes_write_indexed(nesdev,
-						  NES_IDX_ETH_SERDES_CDR_CONTROL0 +
-						  mac_index * 0x200,
-						  cdr_ctrl | 0x000F0000);
-			}
-			nesadapter->mac_link_down[mac_index] = 0;
-			list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) {
-				nes_debug(NES_DBG_PHY, "The Link is UP!!.  linkup was %d\n",
-						nesvnic->linkup);
-				if (nesvnic->linkup == 0) {
-					printk(PFX "The Link is now up for port %s, netdev %p.\n",
-							nesvnic->netdev->name, nesvnic->netdev);
-					if (netif_queue_stopped(nesvnic->netdev))
-						netif_start_queue(nesvnic->netdev);
-					nesvnic->linkup = 1;
-					netif_carrier_on(nesvnic->netdev);
-
-					spin_lock(&nesvnic->port_ibevent_lock);
-					if (nesvnic->of_device_registered) {
-						if (nesdev->iw_status == 0) {
-							nesdev->iw_status = 1;
-							nes_port_ibevent(nesvnic);
-						}
-					}
-					spin_unlock(&nesvnic->port_ibevent_lock);
-				}
-			}
-		} else {
-			if (wide_ppm_offset &&
-			    (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_CX4) &&
-			    (nesadapter->hw_rev != NE020_REV)) {
-				cdr_ctrl = nes_read_indexed(nesdev,
-							    NES_IDX_ETH_SERDES_CDR_CONTROL0 +
-							    mac_index * 0x200);
-				nes_write_indexed(nesdev,
-						  NES_IDX_ETH_SERDES_CDR_CONTROL0 +
-						  mac_index * 0x200,
-						  cdr_ctrl & 0xFFF0FFFF);
-			}
-			nesadapter->mac_link_down[mac_index] = 1;
-			list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) {
-				nes_debug(NES_DBG_PHY, "The Link is Down!!. linkup was %d\n",
-						nesvnic->linkup);
-				if (nesvnic->linkup == 1) {
-					printk(PFX "The Link is now down for port %s, netdev %p.\n",
-							nesvnic->netdev->name, nesvnic->netdev);
-					if (!(netif_queue_stopped(nesvnic->netdev)))
-						netif_stop_queue(nesvnic->netdev);
-					nesvnic->linkup = 0;
-					netif_carrier_off(nesvnic->netdev);
-
-					spin_lock(&nesvnic->port_ibevent_lock);
-					if (nesvnic->of_device_registered) {
-						if (nesdev->iw_status == 1) {
-							nesdev->iw_status = 0;
-							nes_port_ibevent(nesvnic);
-						}
-					}
-					spin_unlock(&nesvnic->port_ibevent_lock);
-				}
-			}
-		}
-		if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_SFP_D) {
-			nesdev->link_recheck = 1;
-			mod_delayed_work(system_wq, &nesdev->work,
-					 NES_LINK_RECHECK_DELAY);
-		}
-	}
-
-	spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-
-	nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_IDLE;
-}
-
-void nes_recheck_link_status(struct work_struct *work)
-{
-	unsigned long flags;
-	struct nes_device *nesdev = container_of(work, struct nes_device, work.work);
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct nes_vnic *nesvnic;
-	u32 mac_index = nesdev->mac_index;
-	u16 phy_data;
-	u16 temp_phy_data;
-
-	spin_lock_irqsave(&nesadapter->phy_lock, flags);
-
-	/* check link status */
-	nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003);
-	temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-
-	nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021);
-	nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-	nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021);
-	phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-
-	phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0;
-
-	nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n",
-		__func__, phy_data,
-		nesadapter->mac_link_down[mac_index] ? "DOWN" : "UP");
-
-	if (phy_data & 0x0004) {
-		nesadapter->mac_link_down[mac_index] = 0;
-		list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) {
-			if (nesvnic->linkup == 0) {
-				printk(PFX "The Link is now up for port %s, netdev %p.\n",
-						nesvnic->netdev->name, nesvnic->netdev);
-				if (netif_queue_stopped(nesvnic->netdev))
-					netif_start_queue(nesvnic->netdev);
-				nesvnic->linkup = 1;
-				netif_carrier_on(nesvnic->netdev);
-
-				spin_lock(&nesvnic->port_ibevent_lock);
-				if (nesvnic->of_device_registered) {
-					if (nesdev->iw_status == 0) {
-						nesdev->iw_status = 1;
-						nes_port_ibevent(nesvnic);
-					}
-				}
-				spin_unlock(&nesvnic->port_ibevent_lock);
-			}
-		}
-
-	} else {
-		nesadapter->mac_link_down[mac_index] = 1;
-		list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) {
-			if (nesvnic->linkup == 1) {
-				printk(PFX "The Link is now down for port %s, netdev %p.\n",
-						nesvnic->netdev->name, nesvnic->netdev);
-				if (!(netif_queue_stopped(nesvnic->netdev)))
-					netif_stop_queue(nesvnic->netdev);
-				nesvnic->linkup = 0;
-				netif_carrier_off(nesvnic->netdev);
-
-				spin_lock(&nesvnic->port_ibevent_lock);
-				if (nesvnic->of_device_registered) {
-					if (nesdev->iw_status == 1) {
-						nesdev->iw_status = 0;
-						nes_port_ibevent(nesvnic);
-					}
-				}
-				spin_unlock(&nesvnic->port_ibevent_lock);
-			}
-		}
-	}
-	if (nesdev->link_recheck++ < NES_LINK_RECHECK_MAX)
-		schedule_delayed_work(&nesdev->work, NES_LINK_RECHECK_DELAY);
-	else
-		nesdev->link_recheck = 0;
-
-	spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-}
-
-
-static void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq)
-{
-	struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq);
-
-	napi_schedule(&nesvnic->napi);
-}
-
-
-/* The MAX_RQES_TO_PROCESS defines how many max read requests to complete before
-* getting out of nic_ce_handler
-*/
-#define	MAX_RQES_TO_PROCESS	384
-
-/**
- * nes_nic_ce_handler
- */
-void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq)
-{
-	u64 u64temp;
-	dma_addr_t bus_address;
-	struct nes_hw_nic *nesnic;
-	struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq);
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct nes_hw_nic_rq_wqe *nic_rqe;
-	struct nes_hw_nic_sq_wqe *nic_sqe;
-	struct sk_buff *skb;
-	struct sk_buff *rx_skb;
-	struct nes_rskb_cb *cb;
-	__le16 *wqe_fragment_length;
-	u32 head;
-	u32 cq_size;
-	u32 rx_pkt_size;
-	u32 cqe_count=0;
-	u32 cqe_errv;
-	u32 cqe_misc;
-	u16 wqe_fragment_index = 1;	/* first fragment (0) is used by copy buffer */
-	u16 vlan_tag;
-	u16 pkt_type;
-	u16 rqes_processed = 0;
-	u8 sq_cqes = 0;
-
-	head = cq->cq_head;
-	cq_size = cq->cq_size;
-	cq->cqes_pending = 1;
-	do {
-		if (le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]) &
-				NES_NIC_CQE_VALID) {
-			nesnic = &nesvnic->nic;
-			cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]);
-			if (cqe_misc & NES_NIC_CQE_SQ) {
-				sq_cqes++;
-				wqe_fragment_index = 1;
-				nic_sqe = &nesnic->sq_vbase[nesnic->sq_tail];
-				skb = nesnic->tx_skb[nesnic->sq_tail];
-				wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
-				/* bump past the vlan tag */
-				wqe_fragment_length++;
-				if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) {
-					u64temp = (u64) le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX +
-							wqe_fragment_index * 2]);
-					u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX +
-							wqe_fragment_index * 2])) << 32;
-					bus_address = (dma_addr_t)u64temp;
-					if (test_and_clear_bit(nesnic->sq_tail, nesnic->first_frag_overflow)) {
-						pci_unmap_single(nesdev->pcidev,
-								bus_address,
-								le16_to_cpu(wqe_fragment_length[wqe_fragment_index++]),
-								PCI_DMA_TODEVICE);
-					}
-					for (; wqe_fragment_index < 5; wqe_fragment_index++) {
-						if (wqe_fragment_length[wqe_fragment_index]) {
-							u64temp = le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX +
-										wqe_fragment_index * 2]);
-							u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX
-										+ wqe_fragment_index * 2])) <<32;
-							bus_address = (dma_addr_t)u64temp;
-							pci_unmap_page(nesdev->pcidev,
-									bus_address,
-									le16_to_cpu(wqe_fragment_length[wqe_fragment_index]),
-									PCI_DMA_TODEVICE);
-						} else
-							break;
-					}
-				}
-				if (skb)
-					dev_kfree_skb_any(skb);
-				nesnic->sq_tail++;
-				nesnic->sq_tail &= nesnic->sq_size-1;
-				if (sq_cqes > 128) {
-					barrier();
-					/* restart the queue if it had been stopped */
-					if (netif_queue_stopped(nesvnic->netdev))
-						netif_wake_queue(nesvnic->netdev);
-					sq_cqes = 0;
-				}
-			} else {
-				rqes_processed ++;
-
-				cq->rx_cqes_completed++;
-				cq->rx_pkts_indicated++;
-				rx_pkt_size = cqe_misc & 0x0000ffff;
-				nic_rqe = &nesnic->rq_vbase[nesnic->rq_tail];
-				/* Get the skb */
-				rx_skb = nesnic->rx_skb[nesnic->rq_tail];
-				nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_tail];
-				bus_address = (dma_addr_t)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX]);
-				bus_address += ((u64)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX])) << 32;
-				pci_unmap_single(nesdev->pcidev, bus_address,
-						nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
-				cb = (struct nes_rskb_cb *)&rx_skb->cb[0];
-				cb->busaddr = 0;
-				/* rx_skb->tail = rx_skb->data + rx_pkt_size; */
-				/* rx_skb->len = rx_pkt_size; */
-				rx_skb->len = 0;  /* TODO: see if this is necessary */
-				skb_put(rx_skb, rx_pkt_size);
-				rx_skb->protocol = eth_type_trans(rx_skb, nesvnic->netdev);
-				nesnic->rq_tail++;
-				nesnic->rq_tail &= nesnic->rq_size - 1;
-
-				atomic_inc(&nesvnic->rx_skbs_needed);
-				if (atomic_read(&nesvnic->rx_skbs_needed) > (nesvnic->nic.rq_size>>1)) {
-					nes_write32(nesdev->regs+NES_CQE_ALLOC,
-							cq->cq_number | (cqe_count << 16));
-					/* nesadapter->tune_timer.cq_count += cqe_count; */
-					nesdev->currcq_count += cqe_count;
-					cqe_count = 0;
-					nes_replenish_nic_rq(nesvnic);
-				}
-				pkt_type = (u16)(le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX]));
-				cqe_errv = (cqe_misc & NES_NIC_CQE_ERRV_MASK) >> NES_NIC_CQE_ERRV_SHIFT;
-				rx_skb->ip_summed = CHECKSUM_NONE;
-
-				if ((NES_PKT_TYPE_TCPV4_BITS == (pkt_type & NES_PKT_TYPE_TCPV4_MASK)) ||
-						(NES_PKT_TYPE_UDPV4_BITS == (pkt_type & NES_PKT_TYPE_UDPV4_MASK))) {
-					if ((cqe_errv &
-							(NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_TCPUDP_CSUM_ERR |
-							NES_NIC_ERRV_BITS_IPH_ERR | NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) {
-						if (nesvnic->netdev->features & NETIF_F_RXCSUM)
-							rx_skb->ip_summed = CHECKSUM_UNNECESSARY;
-					} else
-						nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet."
-								" errv = 0x%X, pkt_type = 0x%X.\n",
-								nesvnic->netdev->name, cqe_errv, pkt_type);
-
-				} else if ((pkt_type & NES_PKT_TYPE_IPV4_MASK) == NES_PKT_TYPE_IPV4_BITS) {
-					if ((cqe_errv &
-							(NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_IPH_ERR |
-							NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) {
-						if (nesvnic->netdev->features & NETIF_F_RXCSUM) {
-							rx_skb->ip_summed = CHECKSUM_UNNECESSARY;
-							/* nes_debug(NES_DBG_CQ, "%s: Reporting successfully checksummed IPv4 packet.\n",
-								  nesvnic->netdev->name); */
-						}
-					} else
-						nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet."
-								" errv = 0x%X, pkt_type = 0x%X.\n",
-								nesvnic->netdev->name, cqe_errv, pkt_type);
-					}
-				/* nes_debug(NES_DBG_CQ, "pkt_type=%x, APBVT_MASK=%x\n",
-							pkt_type, (pkt_type & NES_PKT_TYPE_APBVT_MASK)); */
-
-				if ((pkt_type & NES_PKT_TYPE_APBVT_MASK) == NES_PKT_TYPE_APBVT_BITS) {
-					if (nes_cm_recv(rx_skb, nesvnic->netdev))
-						rx_skb = NULL;
-				}
-				if (rx_skb == NULL)
-					goto skip_rx_indicate0;
-
-
-				if (cqe_misc & NES_NIC_CQE_TAG_VALID) {
-					vlan_tag = (u16)(le32_to_cpu(
-							cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX])
-							>> 16);
-					nes_debug(NES_DBG_CQ, "%s: Reporting stripped VLAN packet. Tag = 0x%04X\n",
-							nesvnic->netdev->name, vlan_tag);
-
-					__vlan_hwaccel_put_tag(rx_skb, htons(ETH_P_8021Q), vlan_tag);
-				}
-				napi_gro_receive(&nesvnic->napi, rx_skb);
-
-skip_rx_indicate0:
-				;
-				/* nesvnic->netstats.rx_packets++; */
-				/* nesvnic->netstats.rx_bytes += rx_pkt_size; */
-			}
-
-			cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0;
-			/* Accounting... */
-			cqe_count++;
-			if (++head >= cq_size)
-				head = 0;
-			if (cqe_count == 255) {
-				/* Replenish Nic CQ */
-				nes_write32(nesdev->regs+NES_CQE_ALLOC,
-						cq->cq_number | (cqe_count << 16));
-				/* nesdev->nesadapter->tune_timer.cq_count += cqe_count; */
-				nesdev->currcq_count += cqe_count;
-				cqe_count = 0;
-			}
-
-			if (cq->rx_cqes_completed >= nesvnic->budget)
-				break;
-		} else {
-			cq->cqes_pending = 0;
-			break;
-		}
-
-	} while (1);
-
-	if (sq_cqes) {
-		barrier();
-		/* restart the queue if it had been stopped */
-		if (netif_queue_stopped(nesvnic->netdev))
-			netif_wake_queue(nesvnic->netdev);
-	}
-	cq->cq_head = head;
-	/* nes_debug(NES_DBG_CQ, "CQ%u Processed = %u cqes, new head = %u.\n",
-			cq->cq_number, cqe_count, cq->cq_head); */
-	cq->cqe_allocs_pending = cqe_count;
-	if (unlikely(nesadapter->et_use_adaptive_rx_coalesce))
-	{
-		/* nesdev->nesadapter->tune_timer.cq_count += cqe_count; */
-		nesdev->currcq_count += cqe_count;
-		nes_nic_tune_timer(nesdev);
-	}
-	if (atomic_read(&nesvnic->rx_skbs_needed))
-		nes_replenish_nic_rq(nesvnic);
-}
-
-
-
-/**
- * nes_cqp_ce_handler
- */
-static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq)
-{
-	u64 u64temp;
-	unsigned long flags;
-	struct nes_hw_cqp *cqp = NULL;
-	struct nes_cqp_request *cqp_request;
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	u32 head;
-	u32 cq_size;
-	u32 cqe_count=0;
-	u32 error_code;
-	u32 opcode;
-	u32 ctx_index;
-	/* u32 counter; */
-
-	head = cq->cq_head;
-	cq_size = cq->cq_size;
-
-	do {
-		/* process the CQE */
-		/* nes_debug(NES_DBG_CQP, "head=%u cqe_words=%08X\n", head,
-			  le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])); */
-
-		opcode = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]);
-		if (opcode & NES_CQE_VALID) {
-			cqp = &nesdev->cqp;
-
-			error_code = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX]);
-			if (error_code) {
-				nes_debug(NES_DBG_CQP, "Bad Completion code for opcode 0x%02X from CQP,"
-						" Major/Minor codes = 0x%04X:%04X.\n",
-						le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])&0x3f,
-						(u16)(error_code >> 16),
-						(u16)error_code);
-			}
-
-			u64temp = (((u64)(le32_to_cpu(cq->cq_vbase[head].
-					cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]))) << 32) |
-					((u64)(le32_to_cpu(cq->cq_vbase[head].
-					cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX])));
-
-			cqp_request = (struct nes_cqp_request *)(unsigned long)u64temp;
-			if (cqp_request) {
-				if (cqp_request->waiting) {
-					/* nes_debug(NES_DBG_CQP, "%s: Waking up requestor\n"); */
-					cqp_request->major_code = (u16)(error_code >> 16);
-					cqp_request->minor_code = (u16)error_code;
-					barrier();
-					cqp_request->request_done = 1;
-					wake_up(&cqp_request->waitq);
-					nes_put_cqp_request(nesdev, cqp_request);
-				} else {
-					if (cqp_request->callback)
-						cqp_request->cqp_callback(nesdev, cqp_request);
-					nes_free_cqp_request(nesdev, cqp_request);
-				}
-			} else {
-				wake_up(&nesdev->cqp.waitq);
-			}
-
-			cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0;
-			nes_write32(nesdev->regs + NES_CQE_ALLOC, cq->cq_number | (1 << 16));
-			if (++cqp->sq_tail >= cqp->sq_size)
-				cqp->sq_tail = 0;
-
-			/* Accounting... */
-			cqe_count++;
-			if (++head >= cq_size)
-				head = 0;
-		} else {
-			break;
-		}
-	} while (1);
-	cq->cq_head = head;
-
-	spin_lock_irqsave(&nesdev->cqp.lock, flags);
-	while ((!list_empty(&nesdev->cqp_pending_reqs)) &&
-			((((nesdev->cqp.sq_tail+nesdev->cqp.sq_size)-nesdev->cqp.sq_head) &
-			(nesdev->cqp.sq_size - 1)) != 1)) {
-		cqp_request = list_entry(nesdev->cqp_pending_reqs.next,
-				struct nes_cqp_request, list);
-		list_del_init(&cqp_request->list);
-		head = nesdev->cqp.sq_head++;
-		nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
-		cqp_wqe = &nesdev->cqp.sq_vbase[head];
-		memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe));
-		barrier();
-
-		opcode = le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]);
-		if ((opcode & NES_CQP_OPCODE_MASK) == NES_CQP_DOWNLOAD_SEGMENT)
-			ctx_index = NES_CQP_WQE_DL_COMP_CTX_LOW_IDX;
-		else
-			ctx_index = NES_CQP_WQE_COMP_CTX_LOW_IDX;
-		cqp_wqe->wqe_words[ctx_index] =
-			cpu_to_le32((u32)((unsigned long)cqp_request));
-		cqp_wqe->wqe_words[ctx_index + 1] =
-			cpu_to_le32((u32)(upper_32_bits((unsigned long)cqp_request)));
-		nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) put on CQPs SQ wqe%u.\n",
-				cqp_request, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f, head);
-		/* Ring doorbell (1 WQEs) */
-		barrier();
-		nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id);
-	}
-	spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-
-	/* Arm the CCQ */
-	nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
-			cq->cq_number);
-	nes_read32(nesdev->regs+NES_CQE_ALLOC);
-}
-
-static u8 *locate_mpa(u8 *pkt, u32 aeq_info)
-{
-	if (aeq_info & NES_AEQE_Q2_DATA_ETHERNET) {
-		/* skip over ethernet header */
-		pkt += ETH_HLEN;
-
-		/* Skip over IP and TCP headers */
-		pkt += 4 * (pkt[0] & 0x0f);
-		pkt += 4 * ((pkt[12] >> 4) & 0x0f);
-	}
-	return pkt;
-}
-
-/* Determine if incoming error pkt is rdma layer */
-static u32 iwarp_opcode(struct nes_qp *nesqp, u32 aeq_info)
-{
-	u8 *pkt;
-	u16 *mpa;
-	u32 opcode = 0xffffffff;
-
-	if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) {
-		pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET;
-		mpa = (u16 *)locate_mpa(pkt, aeq_info);
-		opcode = be16_to_cpu(mpa[1]) & 0xf;
-	}
-
-	return opcode;
-}
-
-/* Build iWARP terminate header */
-static int nes_bld_terminate_hdr(struct nes_qp *nesqp, u16 async_event_id, u32 aeq_info)
-{
-	u8 *pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET;
-	u16 ddp_seg_len;
-	int copy_len = 0;
-	u8 is_tagged = 0;
-	u8 flush_code = 0;
-	struct nes_terminate_hdr *termhdr;
-
-	termhdr = (struct nes_terminate_hdr *)nesqp->hwqp.q2_vbase;
-	memset(termhdr, 0, 64);
-
-	if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) {
-
-		/* Use data from offending packet to fill in ddp & rdma hdrs */
-		pkt = locate_mpa(pkt, aeq_info);
-		ddp_seg_len = be16_to_cpu(*(u16 *)pkt);
-		if (ddp_seg_len) {
-			copy_len = 2;
-			termhdr->hdrct = DDP_LEN_FLAG;
-			if (pkt[2] & 0x80) {
-				is_tagged = 1;
-				if (ddp_seg_len >= TERM_DDP_LEN_TAGGED) {
-					copy_len += TERM_DDP_LEN_TAGGED;
-					termhdr->hdrct |= DDP_HDR_FLAG;
-				}
-			} else {
-				if (ddp_seg_len >= TERM_DDP_LEN_UNTAGGED) {
-					copy_len += TERM_DDP_LEN_UNTAGGED;
-					termhdr->hdrct |= DDP_HDR_FLAG;
-				}
-
-				if (ddp_seg_len >= (TERM_DDP_LEN_UNTAGGED + TERM_RDMA_LEN)) {
-					if ((pkt[3] & RDMA_OPCODE_MASK) == RDMA_READ_REQ_OPCODE) {
-						copy_len += TERM_RDMA_LEN;
-						termhdr->hdrct |= RDMA_HDR_FLAG;
-					}
-				}
-			}
-		}
-	}
-
-	switch (async_event_id) {
-	case NES_AEQE_AEID_AMP_UNALLOCATED_STAG:
-		switch (iwarp_opcode(nesqp, aeq_info)) {
-		case IWARP_OPCODE_WRITE:
-			flush_code = IB_WC_LOC_PROT_ERR;
-			termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER;
-			termhdr->error_code = DDP_TAGGED_INV_STAG;
-			break;
-		default:
-			flush_code = IB_WC_REM_ACCESS_ERR;
-			termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
-			termhdr->error_code = RDMAP_INV_STAG;
-		}
-		break;
-	case NES_AEQE_AEID_AMP_INVALID_STAG:
-		flush_code = IB_WC_REM_ACCESS_ERR;
-		termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
-		termhdr->error_code = RDMAP_INV_STAG;
-		break;
-	case NES_AEQE_AEID_AMP_BAD_QP:
-		flush_code = IB_WC_LOC_QP_OP_ERR;
-		termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
-		termhdr->error_code = DDP_UNTAGGED_INV_QN;
-		break;
-	case NES_AEQE_AEID_AMP_BAD_STAG_KEY:
-	case NES_AEQE_AEID_AMP_BAD_STAG_INDEX:
-		switch (iwarp_opcode(nesqp, aeq_info)) {
-		case IWARP_OPCODE_SEND_INV:
-		case IWARP_OPCODE_SEND_SE_INV:
-			flush_code = IB_WC_REM_OP_ERR;
-			termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP;
-			termhdr->error_code = RDMAP_CANT_INV_STAG;
-			break;
-		default:
-			flush_code = IB_WC_REM_ACCESS_ERR;
-			termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
-			termhdr->error_code = RDMAP_INV_STAG;
-		}
-		break;
-	case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION:
-		if (aeq_info & (NES_AEQE_Q2_DATA_ETHERNET | NES_AEQE_Q2_DATA_MPA)) {
-			flush_code = IB_WC_LOC_PROT_ERR;
-			termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER;
-			termhdr->error_code = DDP_TAGGED_BOUNDS;
-		} else {
-			flush_code = IB_WC_REM_ACCESS_ERR;
-			termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
-			termhdr->error_code = RDMAP_INV_BOUNDS;
-		}
-		break;
-	case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION:
-	case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS:
-	case NES_AEQE_AEID_PRIV_OPERATION_DENIED:
-		flush_code = IB_WC_REM_ACCESS_ERR;
-		termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
-		termhdr->error_code = RDMAP_ACCESS;
-		break;
-	case NES_AEQE_AEID_AMP_TO_WRAP:
-		flush_code = IB_WC_REM_ACCESS_ERR;
-		termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
-		termhdr->error_code = RDMAP_TO_WRAP;
-		break;
-	case NES_AEQE_AEID_AMP_BAD_PD:
-		switch (iwarp_opcode(nesqp, aeq_info)) {
-		case IWARP_OPCODE_WRITE:
-			flush_code = IB_WC_LOC_PROT_ERR;
-			termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER;
-			termhdr->error_code = DDP_TAGGED_UNASSOC_STAG;
-			break;
-		case IWARP_OPCODE_SEND_INV:
-		case IWARP_OPCODE_SEND_SE_INV:
-			flush_code = IB_WC_REM_ACCESS_ERR;
-			termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
-			termhdr->error_code = RDMAP_CANT_INV_STAG;
-			break;
-		default:
-			flush_code = IB_WC_REM_ACCESS_ERR;
-			termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
-			termhdr->error_code = RDMAP_UNASSOC_STAG;
-		}
-		break;
-	case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH:
-		flush_code = IB_WC_LOC_LEN_ERR;
-		termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP;
-		termhdr->error_code = MPA_MARKER;
-		break;
-	case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR:
-		flush_code = IB_WC_GENERAL_ERR;
-		termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP;
-		termhdr->error_code = MPA_CRC;
-		break;
-	case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE:
-	case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL:
-		flush_code = IB_WC_LOC_LEN_ERR;
-		termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC;
-		termhdr->error_code = DDP_CATASTROPHIC_LOCAL;
-		break;
-	case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC:
-	case NES_AEQE_AEID_DDP_NO_L_BIT:
-		flush_code = IB_WC_FATAL_ERR;
-		termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC;
-		termhdr->error_code = DDP_CATASTROPHIC_LOCAL;
-		break;
-	case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN:
-	case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID:
-		flush_code = IB_WC_GENERAL_ERR;
-		termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
-		termhdr->error_code = DDP_UNTAGGED_INV_MSN_RANGE;
-		break;
-	case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER:
-		flush_code = IB_WC_LOC_LEN_ERR;
-		termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
-		termhdr->error_code = DDP_UNTAGGED_INV_TOO_LONG;
-		break;
-	case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION:
-		flush_code = IB_WC_GENERAL_ERR;
-		if (is_tagged) {
-			termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER;
-			termhdr->error_code = DDP_TAGGED_INV_DDP_VER;
-		} else {
-			termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
-			termhdr->error_code = DDP_UNTAGGED_INV_DDP_VER;
-		}
-		break;
-	case NES_AEQE_AEID_DDP_UBE_INVALID_MO:
-		flush_code = IB_WC_GENERAL_ERR;
-		termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
-		termhdr->error_code = DDP_UNTAGGED_INV_MO;
-		break;
-	case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE:
-		flush_code = IB_WC_REM_OP_ERR;
-		termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
-		termhdr->error_code = DDP_UNTAGGED_INV_MSN_NO_BUF;
-		break;
-	case NES_AEQE_AEID_DDP_UBE_INVALID_QN:
-		flush_code = IB_WC_GENERAL_ERR;
-		termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
-		termhdr->error_code = DDP_UNTAGGED_INV_QN;
-		break;
-	case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION:
-		flush_code = IB_WC_GENERAL_ERR;
-		termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP;
-		termhdr->error_code = RDMAP_INV_RDMAP_VER;
-		break;
-	case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE:
-		flush_code = IB_WC_LOC_QP_OP_ERR;
-		termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP;
-		termhdr->error_code = RDMAP_UNEXPECTED_OP;
-		break;
-	default:
-		flush_code = IB_WC_FATAL_ERR;
-		termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP;
-		termhdr->error_code = RDMAP_UNSPECIFIED;
-		break;
-	}
-
-	if (copy_len)
-		memcpy(termhdr + 1, pkt, copy_len);
-
-	if ((flush_code) && ((NES_AEQE_INBOUND_RDMA & aeq_info) == 0)) {
-		if (aeq_info & NES_AEQE_SQ)
-			nesqp->term_sq_flush_code = flush_code;
-		else
-			nesqp->term_rq_flush_code = flush_code;
-	}
-
-	return sizeof(struct nes_terminate_hdr) + copy_len;
-}
-
-static void nes_terminate_connection(struct nes_device *nesdev, struct nes_qp *nesqp,
-		 struct nes_hw_aeqe *aeqe, enum ib_event_type eventtype)
-{
-	u64 context;
-	unsigned long flags;
-	u32 aeq_info;
-	u16 async_event_id;
-	u8 tcp_state;
-	u8 iwarp_state;
-	u32 termlen = 0;
-	u32 mod_qp_flags = NES_CQP_QP_IWARP_STATE_TERMINATE |
-			   NES_CQP_QP_TERM_DONT_SEND_FIN;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-
-	if (nesqp->term_flags & NES_TERM_SENT)
-		return; /* Sanity check */
-
-	aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
-	tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT;
-	iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT;
-	async_event_id = (u16)aeq_info;
-
-	context = (unsigned long)nesadapter->qp_table[le32_to_cpu(
-		aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN];
-	if (!context) {
-		WARN_ON(!context);
-		return;
-	}
-
-	nesqp = (struct nes_qp *)(unsigned long)context;
-	spin_lock_irqsave(&nesqp->lock, flags);
-	nesqp->hw_iwarp_state = iwarp_state;
-	nesqp->hw_tcp_state = tcp_state;
-	nesqp->last_aeq = async_event_id;
-	nesqp->terminate_eventtype = eventtype;
-	spin_unlock_irqrestore(&nesqp->lock, flags);
-
-	if (nesadapter->send_term_ok)
-		termlen = nes_bld_terminate_hdr(nesqp, async_event_id, aeq_info);
-	else
-		mod_qp_flags |= NES_CQP_QP_TERM_DONT_SEND_TERM_MSG;
-
-	if (!nesdev->iw_status)  {
-		nesqp->term_flags = NES_TERM_DONE;
-		nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_ERROR, 0, 0);
-		nes_cm_disconn(nesqp);
-	} else {
-		nes_terminate_start_timer(nesqp);
-		nesqp->term_flags |= NES_TERM_SENT;
-		nes_hw_modify_qp(nesdev, nesqp, mod_qp_flags, termlen, 0);
-	}
-}
-
-static void nes_terminate_send_fin(struct nes_device *nesdev,
-			  struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe)
-{
-	u32 aeq_info;
-	u16 async_event_id;
-	u8 tcp_state;
-	u8 iwarp_state;
-	unsigned long flags;
-
-	aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
-	tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT;
-	iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT;
-	async_event_id = (u16)aeq_info;
-
-	spin_lock_irqsave(&nesqp->lock, flags);
-	nesqp->hw_iwarp_state = iwarp_state;
-	nesqp->hw_tcp_state = tcp_state;
-	nesqp->last_aeq = async_event_id;
-	spin_unlock_irqrestore(&nesqp->lock, flags);
-
-	/* Send the fin only */
-	nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_TERMINATE |
-		NES_CQP_QP_TERM_DONT_SEND_TERM_MSG, 0, 0);
-}
-
-/* Cleanup after a terminate sent or received */
-static void nes_terminate_done(struct nes_qp *nesqp, int timeout_occurred)
-{
-	u32 next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR;
-	unsigned long flags;
-	struct nes_vnic *nesvnic = to_nesvnic(nesqp->ibqp.device);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	u8 first_time = 0;
-
-	spin_lock_irqsave(&nesqp->lock, flags);
-	if (nesqp->hte_added) {
-		nesqp->hte_added = 0;
-		next_iwarp_state |= NES_CQP_QP_DEL_HTE;
-	}
-
-	first_time = (nesqp->term_flags & NES_TERM_DONE) == 0;
-	nesqp->term_flags |= NES_TERM_DONE;
-	spin_unlock_irqrestore(&nesqp->lock, flags);
-
-	/* Make sure we go through this only once */
-	if (first_time) {
-		if (timeout_occurred == 0)
-			del_timer(&nesqp->terminate_timer);
-		else
-			next_iwarp_state |= NES_CQP_QP_RESET;
-
-		nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0);
-		nes_cm_disconn(nesqp);
-	}
-}
-
-static void nes_terminate_received(struct nes_device *nesdev,
-				struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe)
-{
-	u32 aeq_info;
-	u8 *pkt;
-	u32 *mpa;
-	u8 ddp_ctl;
-	u8 rdma_ctl;
-	u16 aeq_id = 0;
-
-	aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
-	if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) {
-		/* Terminate is not a performance path so the silicon */
-		/* did not validate the frame - do it now */
-		pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET;
-		mpa = (u32 *)locate_mpa(pkt, aeq_info);
-		ddp_ctl = (be32_to_cpu(mpa[0]) >> 8) & 0xff;
-		rdma_ctl = be32_to_cpu(mpa[0]) & 0xff;
-		if ((ddp_ctl & 0xc0) != 0x40)
-			aeq_id = NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC;
-		else if ((ddp_ctl & 0x03) != 1)
-			aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION;
-		else if (be32_to_cpu(mpa[2]) != 2)
-			aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_QN;
-		else if (be32_to_cpu(mpa[3]) != 1)
-			aeq_id = NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN;
-		else if (be32_to_cpu(mpa[4]) != 0)
-			aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_MO;
-		else if ((rdma_ctl & 0xc0) != 0x40)
-			aeq_id = NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION;
-
-		if (aeq_id) {
-			/* Bad terminate recvd - send back a terminate */
-			aeq_info = (aeq_info & 0xffff0000) | aeq_id;
-			aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info);
-			nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL);
-			return;
-		}
-	}
-
-	nesqp->term_flags |= NES_TERM_RCVD;
-	nesqp->terminate_eventtype = IB_EVENT_QP_FATAL;
-	nes_terminate_start_timer(nesqp);
-	nes_terminate_send_fin(nesdev, nesqp, aeqe);
-}
-
-/* Timeout routine in case terminate fails to complete */
-void nes_terminate_timeout(struct timer_list *t)
-{
-	struct nes_qp *nesqp = from_timer(nesqp, t, terminate_timer);
-
-	nes_terminate_done(nesqp, 1);
-}
-
-/* Set a timer in case hw cannot complete the terminate sequence */
-static void nes_terminate_start_timer(struct nes_qp *nesqp)
-{
-	mod_timer(&nesqp->terminate_timer, (jiffies + HZ));
-}
-
-/**
- * nes_process_iwarp_aeqe
- */
-static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
-				   struct nes_hw_aeqe *aeqe)
-{
-	u64 context;
-	unsigned long flags;
-	struct nes_qp *nesqp;
-	struct nes_hw_cq *hw_cq;
-	struct nes_cq *nescq;
-	int resource_allocated;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	u32 aeq_info;
-	u32 next_iwarp_state = 0;
-	u32 aeqe_cq_id;
-	u16 async_event_id;
-	u8 tcp_state;
-	u8 iwarp_state;
-	struct ib_event ibevent;
-
-	nes_debug(NES_DBG_AEQ, "\n");
-	aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
-	if ((NES_AEQE_INBOUND_RDMA & aeq_info) || (!(NES_AEQE_QP & aeq_info))) {
-		context  = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]);
-		context += ((u64)le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])) << 32;
-	} else {
-		context = (unsigned long)nesadapter->qp_table[le32_to_cpu(
-						aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN];
-		BUG_ON(!context);
-	}
-
-	/* context is nesqp unless async_event_id == CQ ERROR */
-	nesqp = (struct nes_qp *)(unsigned long)context;
-	async_event_id = (u16)aeq_info;
-	tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT;
-	iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT;
-	nes_debug(NES_DBG_AEQ, "aeid = 0x%04X, qp-cq id = %d, aeqe = %p,"
-			" Tcp state = %s, iWARP state = %s\n",
-			async_event_id,
-			le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), aeqe,
-			nes_tcp_state_str[tcp_state], nes_iwarp_state_str[iwarp_state]);
-
-	aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]);
-	if (aeq_info & NES_AEQE_QP) {
-		if (!nes_is_resource_allocated(nesadapter,
-				nesadapter->allocated_qps,
-				aeqe_cq_id))
-			return;
-	}
-
-	switch (async_event_id) {
-		case NES_AEQE_AEID_LLP_FIN_RECEIVED:
-			if (nesqp->term_flags)
-				return; /* Ignore it, wait for close complete */
-
-			if (atomic_inc_return(&nesqp->close_timer_started) == 1) {
-				if ((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) &&
-					(nesqp->ibqp_state == IB_QPS_RTS)) {
-					spin_lock_irqsave(&nesqp->lock, flags);
-					nesqp->hw_iwarp_state = iwarp_state;
-					nesqp->hw_tcp_state = tcp_state;
-					nesqp->last_aeq = async_event_id;
-					next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING;
-					nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING;
-					spin_unlock_irqrestore(&nesqp->lock, flags);
-					nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0);
-					nes_cm_disconn(nesqp);
-				}
-				nesqp->cm_id->add_ref(nesqp->cm_id);
-				schedule_nes_timer(nesqp->cm_node, (struct sk_buff *)nesqp,
-						NES_TIMER_TYPE_CLOSE, 1, 0);
-				nes_debug(NES_DBG_AEQ, "QP%u Not decrementing QP refcount (%d),"
-						" need ae to finish up, original_last_aeq = 0x%04X."
-						" last_aeq = 0x%04X, scheduling timer. TCP state = %d\n",
-						nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
-						async_event_id, nesqp->last_aeq, tcp_state);
-			}
-			break;
-		case NES_AEQE_AEID_LLP_CLOSE_COMPLETE:
-			spin_lock_irqsave(&nesqp->lock, flags);
-			nesqp->hw_iwarp_state = iwarp_state;
-			nesqp->hw_tcp_state = tcp_state;
-			nesqp->last_aeq = async_event_id;
-			spin_unlock_irqrestore(&nesqp->lock, flags);
-			nes_cm_disconn(nesqp);
-			break;
-
-		case NES_AEQE_AEID_RESET_SENT:
-			tcp_state = NES_AEQE_TCP_STATE_CLOSED;
-			spin_lock_irqsave(&nesqp->lock, flags);
-			nesqp->hw_iwarp_state = iwarp_state;
-			nesqp->hw_tcp_state = tcp_state;
-			nesqp->last_aeq = async_event_id;
-			nesqp->hte_added = 0;
-			spin_unlock_irqrestore(&nesqp->lock, flags);
-			next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE;
-			nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0);
-			nes_cm_disconn(nesqp);
-			break;
-
-		case NES_AEQE_AEID_LLP_CONNECTION_RESET:
-			if (atomic_read(&nesqp->close_timer_started))
-				return;
-			spin_lock_irqsave(&nesqp->lock, flags);
-			nesqp->hw_iwarp_state = iwarp_state;
-			nesqp->hw_tcp_state = tcp_state;
-			nesqp->last_aeq = async_event_id;
-			spin_unlock_irqrestore(&nesqp->lock, flags);
-			nes_cm_disconn(nesqp);
-			break;
-
-		case NES_AEQE_AEID_TERMINATE_SENT:
-			nes_terminate_send_fin(nesdev, nesqp, aeqe);
-			break;
-
-		case NES_AEQE_AEID_LLP_TERMINATE_RECEIVED:
-			nes_terminate_received(nesdev, nesqp, aeqe);
-			break;
-
-		case NES_AEQE_AEID_AMP_BAD_STAG_KEY:
-		case NES_AEQE_AEID_AMP_BAD_STAG_INDEX:
-		case NES_AEQE_AEID_AMP_UNALLOCATED_STAG:
-		case NES_AEQE_AEID_AMP_INVALID_STAG:
-		case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION:
-		case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS:
-		case NES_AEQE_AEID_PRIV_OPERATION_DENIED:
-		case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER:
-		case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION:
-		case NES_AEQE_AEID_AMP_TO_WRAP:
-			printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_ACCESS_ERR\n",
-					nesqp->hwqp.qp_id, async_event_id);
-			nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_ACCESS_ERR);
-			break;
-
-		case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE:
-		case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL:
-		case NES_AEQE_AEID_DDP_UBE_INVALID_MO:
-		case NES_AEQE_AEID_DDP_UBE_INVALID_QN:
-			if (iwarp_opcode(nesqp, aeq_info) > IWARP_OPCODE_TERM) {
-				aeq_info &= 0xffff0000;
-				aeq_info |= NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE;
-				aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info);
-			}
-			/* fall through */
-		case NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE:
-		case NES_AEQE_AEID_LLP_TOO_MANY_RETRIES:
-		case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE:
-		case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR:
-		case NES_AEQE_AEID_AMP_BAD_QP:
-		case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH:
-		case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC:
-		case NES_AEQE_AEID_DDP_NO_L_BIT:
-		case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN:
-		case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID:
-		case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION:
-		case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION:
-		case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE:
-		case NES_AEQE_AEID_AMP_BAD_PD:
-		case NES_AEQE_AEID_AMP_FASTREG_SHARED:
-		case NES_AEQE_AEID_AMP_FASTREG_VALID_STAG:
-		case NES_AEQE_AEID_AMP_FASTREG_MW_STAG:
-		case NES_AEQE_AEID_AMP_FASTREG_INVALID_RIGHTS:
-		case NES_AEQE_AEID_AMP_FASTREG_PBL_TABLE_OVERFLOW:
-		case NES_AEQE_AEID_AMP_FASTREG_INVALID_LENGTH:
-		case NES_AEQE_AEID_AMP_INVALIDATE_SHARED:
-		case NES_AEQE_AEID_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS:
-		case NES_AEQE_AEID_AMP_MWBIND_VALID_STAG:
-		case NES_AEQE_AEID_AMP_MWBIND_OF_MR_STAG:
-		case NES_AEQE_AEID_AMP_MWBIND_TO_ZERO_BASED_STAG:
-		case NES_AEQE_AEID_AMP_MWBIND_TO_MW_STAG:
-		case NES_AEQE_AEID_AMP_MWBIND_INVALID_RIGHTS:
-		case NES_AEQE_AEID_AMP_MWBIND_INVALID_BOUNDS:
-		case NES_AEQE_AEID_AMP_MWBIND_TO_INVALID_PARENT:
-		case NES_AEQE_AEID_AMP_MWBIND_BIND_DISABLED:
-		case NES_AEQE_AEID_BAD_CLOSE:
-		case NES_AEQE_AEID_RDMA_READ_WHILE_ORD_ZERO:
-		case NES_AEQE_AEID_STAG_ZERO_INVALID:
-		case NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST:
-		case NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP:
-			printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_FATAL\n",
-					nesqp->hwqp.qp_id, async_event_id);
-			print_ip(nesqp->cm_node);
-			if (!atomic_read(&nesqp->close_timer_started))
-				nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL);
-			break;
-
-		case NES_AEQE_AEID_CQ_OPERATION_ERROR:
-			context <<= 1;
-			nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u, %p\n",
-					le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), (void *)(unsigned long)context);
-			resource_allocated = nes_is_resource_allocated(nesadapter, nesadapter->allocated_cqs,
-					le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]));
-			if (resource_allocated) {
-				printk(KERN_ERR PFX "%s: Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u\n",
-						__func__, le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]));
-				hw_cq = (struct nes_hw_cq *)(unsigned long)context;
-				if (hw_cq) {
-					nescq = container_of(hw_cq, struct nes_cq, hw_cq);
-					if (nescq->ibcq.event_handler) {
-						ibevent.device = nescq->ibcq.device;
-						ibevent.event = IB_EVENT_CQ_ERR;
-						ibevent.element.cq = &nescq->ibcq;
-						nescq->ibcq.event_handler(&ibevent, nescq->ibcq.cq_context);
-					}
-				}
-			}
-			break;
-
-		default:
-			nes_debug(NES_DBG_AEQ, "Processing an iWARP related AE for QP, misc = 0x%04X\n",
-					async_event_id);
-			break;
-	}
-
-}
-
-/**
- * nes_iwarp_ce_handler
- */
-void nes_iwarp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *hw_cq)
-{
-	struct nes_cq *nescq = container_of(hw_cq, struct nes_cq, hw_cq);
-
-	/* nes_debug(NES_DBG_CQ, "Processing completion event for iWARP CQ%u.\n",
-			nescq->hw_cq.cq_number); */
-	nes_write32(nesdev->regs+NES_CQ_ACK, nescq->hw_cq.cq_number);
-
-	if (nescq->ibcq.comp_handler)
-		nescq->ibcq.comp_handler(&nescq->ibcq, nescq->ibcq.cq_context);
-
-	return;
-}
-
-
-/**
- * nes_manage_apbvt()
- */
-int nes_manage_apbvt(struct nes_vnic *nesvnic, u32 accel_local_port,
-		u32 nic_index, u32 add_port)
-{
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	struct nes_cqp_request *cqp_request;
-	int ret = 0;
-	u16 major_code;
-
-	/* Send manage APBVT request to CQP */
-	cqp_request = nes_get_cqp_request(nesdev);
-	if (cqp_request == NULL) {
-		nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n");
-		return -ENOMEM;
-	}
-	cqp_request->waiting = 1;
-	cqp_wqe = &cqp_request->cqp_wqe;
-
-	nes_debug(NES_DBG_QP, "%s APBV for local port=%u(0x%04x), nic_index=%u\n",
-			(add_port == NES_MANAGE_APBVT_ADD) ? "ADD" : "DEL",
-			accel_local_port, accel_local_port, nic_index);
-
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, (NES_CQP_MANAGE_APBVT |
-			((add_port == NES_MANAGE_APBVT_ADD) ? NES_CQP_APBVT_ADD : 0)));
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
-			((nic_index << NES_CQP_APBVT_NIC_SHIFT) | accel_local_port));
-
-	nes_debug(NES_DBG_QP, "Waiting for CQP completion for APBVT.\n");
-
-	atomic_set(&cqp_request->refcount, 2);
-	nes_post_cqp_request(nesdev, cqp_request);
-
-	if (add_port == NES_MANAGE_APBVT_ADD)
-		ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
-				NES_EVENT_TIMEOUT);
-	nes_debug(NES_DBG_QP, "Completed, ret=%u,  CQP Major:Minor codes = 0x%04X:0x%04X\n",
-			ret, cqp_request->major_code, cqp_request->minor_code);
-	major_code = cqp_request->major_code;
-
-	nes_put_cqp_request(nesdev, cqp_request);
-
-	if (!ret)
-		return -ETIME;
-	else if (major_code)
-		return -EIO;
-	else
-		return 0;
-}
-
-
-/**
- * nes_manage_arp_cache
- */
-void nes_manage_arp_cache(struct net_device *netdev, unsigned char *mac_addr,
-		u32 ip_addr, u32 action)
-{
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-	struct nes_device *nesdev;
-	struct nes_cqp_request *cqp_request;
-	int arp_index;
-
-	nesdev = nesvnic->nesdev;
-	arp_index = nes_arp_table(nesdev, ip_addr, mac_addr, action);
-	if (arp_index == -1) {
-		return;
-	}
-
-	/* update the ARP entry */
-	cqp_request = nes_get_cqp_request(nesdev);
-	if (cqp_request == NULL) {
-		nes_debug(NES_DBG_NETDEV, "Failed to get a cqp_request.\n");
-		return;
-	}
-	cqp_request->waiting = 0;
-	cqp_wqe = &cqp_request->cqp_wqe;
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-
-	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(
-			NES_CQP_MANAGE_ARP_CACHE | NES_CQP_ARP_PERM);
-	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(
-			(u32)PCI_FUNC(nesdev->pcidev->devfn) << NES_CQP_ARP_AEQ_INDEX_SHIFT);
-	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(arp_index);
-
-	if (action == NES_ARP_ADD) {
-		cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_ARP_VALID);
-		cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = cpu_to_le32(
-				(((u32)mac_addr[2]) << 24) | (((u32)mac_addr[3]) << 16) |
-				(((u32)mac_addr[4]) << 8)  | (u32)mac_addr[5]);
-		cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = cpu_to_le32(
-				(((u32)mac_addr[0]) << 8) | (u32)mac_addr[1]);
-	} else {
-		cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = 0;
-		cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = 0;
-	}
-
-	nes_debug(NES_DBG_NETDEV, "Not waiting for CQP, cqp.sq_head=%u, cqp.sq_tail=%u\n",
-			nesdev->cqp.sq_head, nesdev->cqp.sq_tail);
-
-	atomic_set(&cqp_request->refcount, 1);
-	nes_post_cqp_request(nesdev, cqp_request);
-}
-
-
-/**
- * flush_wqes
- */
-void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp,
-		u32 which_wq, u32 wait_completion)
-{
-	struct nes_cqp_request *cqp_request;
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	u32 sq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH;
-	u32 rq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH;
-	int ret;
-
-	cqp_request = nes_get_cqp_request(nesdev);
-	if (cqp_request == NULL) {
-		nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n");
-		return;
-	}
-	if (wait_completion) {
-		cqp_request->waiting = 1;
-		atomic_set(&cqp_request->refcount, 2);
-	} else {
-		cqp_request->waiting = 0;
-	}
-	cqp_wqe = &cqp_request->cqp_wqe;
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-
-	/* If wqe in error was identified, set code to be put into cqe */
-	if ((nesqp->term_sq_flush_code) && (which_wq & NES_CQP_FLUSH_SQ)) {
-		which_wq |= NES_CQP_FLUSH_MAJ_MIN;
-		sq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_sq_flush_code;
-		nesqp->term_sq_flush_code = 0;
-	}
-
-	if ((nesqp->term_rq_flush_code) && (which_wq & NES_CQP_FLUSH_RQ)) {
-		which_wq |= NES_CQP_FLUSH_MAJ_MIN;
-		rq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_rq_flush_code;
-		nesqp->term_rq_flush_code = 0;
-	}
-
-	if (which_wq & NES_CQP_FLUSH_MAJ_MIN) {
-		cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_SQ_CODE] = cpu_to_le32(sq_code);
-		cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_RQ_CODE] = cpu_to_le32(rq_code);
-	}
-
-	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] =
-			cpu_to_le32(NES_CQP_FLUSH_WQES | which_wq);
-	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesqp->hwqp.qp_id);
-
-	nes_post_cqp_request(nesdev, cqp_request);
-
-	if (wait_completion) {
-		/* Wait for CQP */
-		ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
-				NES_EVENT_TIMEOUT);
-		nes_debug(NES_DBG_QP, "Flush SQ QP WQEs completed, ret=%u,"
-				" CQP Major:Minor codes = 0x%04X:0x%04X\n",
-				ret, cqp_request->major_code, cqp_request->minor_code);
-		nes_put_cqp_request(nesdev, cqp_request);
-	}
-}
diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h
deleted file mode 100644
index 3c56470..0000000
--- a/drivers/infiniband/hw/nes/nes_hw.h
+++ /dev/null
@@ -1,1380 +0,0 @@
-/*
-* Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenIB.org BSD license below:
-*
-*     Redistribution and use in source and binary forms, with or
-*     without modification, are permitted provided that the following
-*     conditions are met:
-*
-*      - Redistributions of source code must retain the above
-*        copyright notice, this list of conditions and the following
-*        disclaimer.
-*
-*      - Redistributions in binary form must reproduce the above
-*        copyright notice, this list of conditions and the following
-*        disclaimer in the documentation and/or other materials
-*        provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-
-#ifndef __NES_HW_H
-#define __NES_HW_H
-
-#define NES_PHY_TYPE_CX4       1
-#define NES_PHY_TYPE_1G        2
-#define NES_PHY_TYPE_ARGUS     4
-#define NES_PHY_TYPE_PUMA_1G   5
-#define NES_PHY_TYPE_PUMA_10G  6
-#define NES_PHY_TYPE_GLADIUS   7
-#define NES_PHY_TYPE_SFP_D     8
-#define NES_PHY_TYPE_KR	       9
-
-#define NES_MULTICAST_PF_MAX 8
-#define NES_A0 3
-
-#define NES_ENABLE_PAU 0x07000001
-#define NES_DISABLE_PAU 0x07000000
-#define NES_PAU_COUNTER 10
-#define NES_CQP_OPCODE_MASK 0x3f
-
-enum pci_regs {
-	NES_INT_STAT = 0x0000,
-	NES_INT_MASK = 0x0004,
-	NES_INT_PENDING = 0x0008,
-	NES_INTF_INT_STAT = 0x000C,
-	NES_INTF_INT_MASK = 0x0010,
-	NES_TIMER_STAT = 0x0014,
-	NES_PERIODIC_CONTROL = 0x0018,
-	NES_ONE_SHOT_CONTROL = 0x001C,
-	NES_EEPROM_COMMAND = 0x0020,
-	NES_EEPROM_DATA = 0x0024,
-	NES_FLASH_COMMAND = 0x0028,
-	NES_FLASH_DATA  = 0x002C,
-	NES_SOFTWARE_RESET = 0x0030,
-	NES_CQ_ACK = 0x0034,
-	NES_WQE_ALLOC = 0x0040,
-	NES_CQE_ALLOC = 0x0044,
-	NES_AEQ_ALLOC = 0x0048
-};
-
-enum indexed_regs {
-	NES_IDX_CREATE_CQP_LOW = 0x0000,
-	NES_IDX_CREATE_CQP_HIGH = 0x0004,
-	NES_IDX_QP_CONTROL = 0x0040,
-	NES_IDX_FLM_CONTROL = 0x0080,
-	NES_IDX_INT_CPU_STATUS = 0x00a0,
-	NES_IDX_GPR_TRIGGER = 0x00bc,
-	NES_IDX_GPIO_CONTROL = 0x00f0,
-	NES_IDX_GPIO_DATA = 0x00f4,
-	NES_IDX_GPR2 = 0x010c,
-	NES_IDX_TCP_CONFIG0 = 0x01e4,
-	NES_IDX_TCP_TIMER_CONFIG = 0x01ec,
-	NES_IDX_TCP_NOW = 0x01f0,
-	NES_IDX_QP_MAX_CFG_SIZES = 0x0200,
-	NES_IDX_QP_CTX_SIZE = 0x0218,
-	NES_IDX_TCP_TIMER_SIZE0 = 0x0238,
-	NES_IDX_TCP_TIMER_SIZE1 = 0x0240,
-	NES_IDX_ARP_CACHE_SIZE = 0x0258,
-	NES_IDX_CQ_CTX_SIZE = 0x0260,
-	NES_IDX_MRT_SIZE = 0x0278,
-	NES_IDX_PBL_REGION_SIZE = 0x0280,
-	NES_IDX_IRRQ_COUNT = 0x02b0,
-	NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x02f0,
-	NES_IDX_RX_WINDOW_BUFFER_SIZE = 0x0300,
-	NES_IDX_DST_IP_ADDR = 0x0400,
-	NES_IDX_PCIX_DIAG = 0x08e8,
-	NES_IDX_MPP_DEBUG = 0x0a00,
-	NES_IDX_PORT_RX_DISCARDS = 0x0a30,
-	NES_IDX_PORT_TX_DISCARDS = 0x0a34,
-	NES_IDX_MPP_LB_DEBUG = 0x0b00,
-	NES_IDX_DENALI_CTL_22 = 0x1058,
-	NES_IDX_MAC_TX_CONTROL = 0x2000,
-	NES_IDX_MAC_TX_CONFIG = 0x2004,
-	NES_IDX_MAC_TX_PAUSE_QUANTA = 0x2008,
-	NES_IDX_MAC_RX_CONTROL = 0x200c,
-	NES_IDX_MAC_RX_CONFIG = 0x2010,
-	NES_IDX_MAC_EXACT_MATCH_BOTTOM = 0x201c,
-	NES_IDX_MAC_MDIO_CONTROL = 0x2084,
-	NES_IDX_MAC_TX_OCTETS_LOW = 0x2100,
-	NES_IDX_MAC_TX_OCTETS_HIGH = 0x2104,
-	NES_IDX_MAC_TX_FRAMES_LOW = 0x2108,
-	NES_IDX_MAC_TX_FRAMES_HIGH = 0x210c,
-	NES_IDX_MAC_TX_PAUSE_FRAMES = 0x2118,
-	NES_IDX_MAC_TX_ERRORS = 0x2138,
-	NES_IDX_MAC_RX_OCTETS_LOW = 0x213c,
-	NES_IDX_MAC_RX_OCTETS_HIGH = 0x2140,
-	NES_IDX_MAC_RX_FRAMES_LOW = 0x2144,
-	NES_IDX_MAC_RX_FRAMES_HIGH = 0x2148,
-	NES_IDX_MAC_RX_BC_FRAMES_LOW = 0x214c,
-	NES_IDX_MAC_RX_MC_FRAMES_HIGH = 0x2150,
-	NES_IDX_MAC_RX_PAUSE_FRAMES = 0x2154,
-	NES_IDX_MAC_RX_SHORT_FRAMES = 0x2174,
-	NES_IDX_MAC_RX_OVERSIZED_FRAMES = 0x2178,
-	NES_IDX_MAC_RX_JABBER_FRAMES = 0x217c,
-	NES_IDX_MAC_RX_CRC_ERR_FRAMES = 0x2180,
-	NES_IDX_MAC_RX_LENGTH_ERR_FRAMES = 0x2184,
-	NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES = 0x2188,
-	NES_IDX_MAC_INT_STATUS = 0x21f0,
-	NES_IDX_MAC_INT_MASK = 0x21f4,
-	NES_IDX_PHY_PCS_CONTROL_STATUS0 = 0x2800,
-	NES_IDX_PHY_PCS_CONTROL_STATUS1 = 0x2a00,
-	NES_IDX_ETH_SERDES_COMMON_CONTROL0 = 0x2808,
-	NES_IDX_ETH_SERDES_COMMON_CONTROL1 = 0x2a08,
-	NES_IDX_ETH_SERDES_COMMON_STATUS0 = 0x280c,
-	NES_IDX_ETH_SERDES_COMMON_STATUS1 = 0x2a0c,
-	NES_IDX_ETH_SERDES_TX_EMP0 = 0x2810,
-	NES_IDX_ETH_SERDES_TX_EMP1 = 0x2a10,
-	NES_IDX_ETH_SERDES_TX_DRIVE0 = 0x2814,
-	NES_IDX_ETH_SERDES_TX_DRIVE1 = 0x2a14,
-	NES_IDX_ETH_SERDES_RX_MODE0 = 0x2818,
-	NES_IDX_ETH_SERDES_RX_MODE1 = 0x2a18,
-	NES_IDX_ETH_SERDES_RX_SIGDET0 = 0x281c,
-	NES_IDX_ETH_SERDES_RX_SIGDET1 = 0x2a1c,
-	NES_IDX_ETH_SERDES_BYPASS0 = 0x2820,
-	NES_IDX_ETH_SERDES_BYPASS1 = 0x2a20,
-	NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0 = 0x2824,
-	NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1 = 0x2a24,
-	NES_IDX_ETH_SERDES_RX_EQ_CONTROL0 = 0x2828,
-	NES_IDX_ETH_SERDES_RX_EQ_CONTROL1 = 0x2a28,
-	NES_IDX_ETH_SERDES_RX_EQ_STATUS0 = 0x282c,
-	NES_IDX_ETH_SERDES_RX_EQ_STATUS1 = 0x2a2c,
-	NES_IDX_ETH_SERDES_CDR_RESET0 = 0x2830,
-	NES_IDX_ETH_SERDES_CDR_RESET1 = 0x2a30,
-	NES_IDX_ETH_SERDES_CDR_CONTROL0 = 0x2834,
-	NES_IDX_ETH_SERDES_CDR_CONTROL1 = 0x2a34,
-	NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0 = 0x2838,
-	NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1 = 0x2a38,
-	NES_IDX_ENDNODE0_NSTAT_RX_DISCARD = 0x3080,
-	NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO = 0x3000,
-	NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI = 0x3004,
-	NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO = 0x3008,
-	NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI = 0x300c,
-	NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO = 0x7000,
-	NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI = 0x7004,
-	NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO = 0x7008,
-	NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI = 0x700c,
-	NES_IDX_WQM_CONFIG0 = 0x5000,
-	NES_IDX_WQM_CONFIG1 = 0x5004,
-	NES_IDX_CM_CONFIG = 0x5100,
-	NES_IDX_NIC_LOGPORT_TO_PHYPORT = 0x6000,
-	NES_IDX_NIC_PHYPORT_TO_USW = 0x6008,
-	NES_IDX_NIC_ACTIVE = 0x6010,
-	NES_IDX_NIC_UNICAST_ALL = 0x6018,
-	NES_IDX_NIC_MULTICAST_ALL = 0x6020,
-	NES_IDX_NIC_MULTICAST_ENABLE = 0x6028,
-	NES_IDX_NIC_BROADCAST_ON = 0x6030,
-	NES_IDX_USED_CHUNKS_TX = 0x60b0,
-	NES_IDX_TX_POOL_SIZE = 0x60b8,
-	NES_IDX_QUAD_HASH_TABLE_SIZE = 0x6148,
-	NES_IDX_PERFECT_FILTER_LOW = 0x6200,
-	NES_IDX_PERFECT_FILTER_HIGH = 0x6204,
-	NES_IDX_IPV4_TCP_REXMITS = 0x7080,
-	NES_IDX_DEBUG_ERROR_CONTROL_STATUS = 0x913c,
-	NES_IDX_DEBUG_ERROR_MASKS0 = 0x9140,
-	NES_IDX_DEBUG_ERROR_MASKS1 = 0x9144,
-	NES_IDX_DEBUG_ERROR_MASKS2 = 0x9148,
-	NES_IDX_DEBUG_ERROR_MASKS3 = 0x914c,
-	NES_IDX_DEBUG_ERROR_MASKS4 = 0x9150,
-	NES_IDX_DEBUG_ERROR_MASKS5 = 0x9154,
-};
-
-#define NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE   1
-#define NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE (1 << 17)
-
-enum nes_cqp_opcodes {
-	NES_CQP_CREATE_QP = 0x00,
-	NES_CQP_MODIFY_QP = 0x01,
-	NES_CQP_DESTROY_QP = 0x02,
-	NES_CQP_CREATE_CQ = 0x03,
-	NES_CQP_MODIFY_CQ = 0x04,
-	NES_CQP_DESTROY_CQ = 0x05,
-	NES_CQP_ALLOCATE_STAG = 0x09,
-	NES_CQP_REGISTER_STAG = 0x0a,
-	NES_CQP_QUERY_STAG = 0x0b,
-	NES_CQP_REGISTER_SHARED_STAG = 0x0c,
-	NES_CQP_DEALLOCATE_STAG = 0x0d,
-	NES_CQP_MANAGE_ARP_CACHE = 0x0f,
-	NES_CQP_DOWNLOAD_SEGMENT = 0x10,
-	NES_CQP_SUSPEND_QPS = 0x11,
-	NES_CQP_UPLOAD_CONTEXT = 0x13,
-	NES_CQP_CREATE_CEQ = 0x16,
-	NES_CQP_DESTROY_CEQ = 0x18,
-	NES_CQP_CREATE_AEQ = 0x19,
-	NES_CQP_DESTROY_AEQ = 0x1b,
-	NES_CQP_LMI_ACCESS = 0x20,
-	NES_CQP_FLUSH_WQES = 0x22,
-	NES_CQP_MANAGE_APBVT = 0x23,
-	NES_CQP_MANAGE_QUAD_HASH = 0x25
-};
-
-enum nes_cqp_wqe_word_idx {
-	NES_CQP_WQE_OPCODE_IDX = 0,
-	NES_CQP_WQE_ID_IDX = 1,
-	NES_CQP_WQE_COMP_CTX_LOW_IDX = 2,
-	NES_CQP_WQE_COMP_CTX_HIGH_IDX = 3,
-	NES_CQP_WQE_COMP_SCRATCH_LOW_IDX = 4,
-	NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX = 5,
-};
-
-enum nes_cqp_wqe_word_download_idx { /* format differs from other cqp ops */
-	NES_CQP_WQE_DL_OPCODE_IDX = 0,
-	NES_CQP_WQE_DL_COMP_CTX_LOW_IDX = 1,
-	NES_CQP_WQE_DL_COMP_CTX_HIGH_IDX = 2,
-	NES_CQP_WQE_DL_LENGTH_0_TOTAL_IDX = 3
-	/* For index values 4-15 use NES_NIC_SQ_WQE_ values */
-};
-
-enum nes_cqp_cq_wqeword_idx {
-	NES_CQP_CQ_WQE_PBL_LOW_IDX = 6,
-	NES_CQP_CQ_WQE_PBL_HIGH_IDX = 7,
-	NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX = 8,
-	NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX = 9,
-	NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX = 10,
-};
-
-enum nes_cqp_stag_wqeword_idx {
-	NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX = 1,
-	NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX = 6,
-	NES_CQP_STAG_WQE_LEN_LOW_IDX = 7,
-	NES_CQP_STAG_WQE_STAG_IDX = 8,
-	NES_CQP_STAG_WQE_VA_LOW_IDX = 10,
-	NES_CQP_STAG_WQE_VA_HIGH_IDX = 11,
-	NES_CQP_STAG_WQE_PA_LOW_IDX = 12,
-	NES_CQP_STAG_WQE_PA_HIGH_IDX = 13,
-	NES_CQP_STAG_WQE_PBL_LEN_IDX = 14
-};
-
-#define NES_CQP_OP_LOGICAL_PORT_SHIFT 26
-#define NES_CQP_OP_IWARP_STATE_SHIFT 28
-#define NES_CQP_OP_TERMLEN_SHIFT     28
-
-enum nes_cqp_qp_bits {
-	NES_CQP_QP_ARP_VALID = (1<<8),
-	NES_CQP_QP_WINBUF_VALID = (1<<9),
-	NES_CQP_QP_CONTEXT_VALID = (1<<10),
-	NES_CQP_QP_ORD_VALID = (1<<11),
-	NES_CQP_QP_WINBUF_DATAIND_EN = (1<<12),
-	NES_CQP_QP_VIRT_WQS = (1<<13),
-	NES_CQP_QP_DEL_HTE = (1<<14),
-	NES_CQP_QP_CQS_VALID = (1<<15),
-	NES_CQP_QP_TYPE_TSA = 0,
-	NES_CQP_QP_TYPE_IWARP = (1<<16),
-	NES_CQP_QP_TYPE_CQP = (4<<16),
-	NES_CQP_QP_TYPE_NIC = (5<<16),
-	NES_CQP_QP_MSS_CHG = (1<<20),
-	NES_CQP_QP_STATIC_RESOURCES = (1<<21),
-	NES_CQP_QP_IGNORE_MW_BOUND = (1<<22),
-	NES_CQP_QP_VWQ_USE_LMI = (1<<23),
-	NES_CQP_QP_IWARP_STATE_IDLE = (1<<NES_CQP_OP_IWARP_STATE_SHIFT),
-	NES_CQP_QP_IWARP_STATE_RTS = (2<<NES_CQP_OP_IWARP_STATE_SHIFT),
-	NES_CQP_QP_IWARP_STATE_CLOSING = (3<<NES_CQP_OP_IWARP_STATE_SHIFT),
-	NES_CQP_QP_IWARP_STATE_TERMINATE = (5<<NES_CQP_OP_IWARP_STATE_SHIFT),
-	NES_CQP_QP_IWARP_STATE_ERROR = (6<<NES_CQP_OP_IWARP_STATE_SHIFT),
-	NES_CQP_QP_IWARP_STATE_MASK = (7<<NES_CQP_OP_IWARP_STATE_SHIFT),
-	NES_CQP_QP_TERM_DONT_SEND_FIN = (1<<24),
-	NES_CQP_QP_TERM_DONT_SEND_TERM_MSG = (1<<25),
-	NES_CQP_QP_RESET = (1<<31),
-};
-
-enum nes_cqp_qp_wqe_word_idx {
-	NES_CQP_QP_WQE_CONTEXT_LOW_IDX = 6,
-	NES_CQP_QP_WQE_CONTEXT_HIGH_IDX = 7,
-	NES_CQP_QP_WQE_FLUSH_SQ_CODE = 8,
-	NES_CQP_QP_WQE_FLUSH_RQ_CODE = 9,
-	NES_CQP_QP_WQE_NEW_MSS_IDX = 15,
-};
-
-enum nes_nic_ctx_bits {
-	NES_NIC_CTX_RQ_SIZE_32 = (3<<8),
-	NES_NIC_CTX_RQ_SIZE_512 = (3<<8),
-	NES_NIC_CTX_SQ_SIZE_32 = (1<<10),
-	NES_NIC_CTX_SQ_SIZE_512 = (3<<10),
-};
-
-enum nes_nic_qp_ctx_word_idx {
-	NES_NIC_CTX_MISC_IDX = 0,
-	NES_NIC_CTX_SQ_LOW_IDX = 2,
-	NES_NIC_CTX_SQ_HIGH_IDX = 3,
-	NES_NIC_CTX_RQ_LOW_IDX = 4,
-	NES_NIC_CTX_RQ_HIGH_IDX = 5,
-};
-
-enum nes_cqp_cq_bits {
-	NES_CQP_CQ_CEQE_MASK = (1<<9),
-	NES_CQP_CQ_CEQ_VALID = (1<<10),
-	NES_CQP_CQ_RESIZE = (1<<11),
-	NES_CQP_CQ_CHK_OVERFLOW = (1<<12),
-	NES_CQP_CQ_4KB_CHUNK = (1<<14),
-	NES_CQP_CQ_VIRT = (1<<15),
-};
-
-enum nes_cqp_stag_bits {
-	NES_CQP_STAG_VA_TO = (1<<9),
-	NES_CQP_STAG_DEALLOC_PBLS = (1<<10),
-	NES_CQP_STAG_PBL_BLK_SIZE = (1<<11),
-	NES_CQP_STAG_MR = (1<<13),
-	NES_CQP_STAG_RIGHTS_LOCAL_READ = (1<<16),
-	NES_CQP_STAG_RIGHTS_LOCAL_WRITE = (1<<17),
-	NES_CQP_STAG_RIGHTS_REMOTE_READ = (1<<18),
-	NES_CQP_STAG_RIGHTS_REMOTE_WRITE = (1<<19),
-	NES_CQP_STAG_RIGHTS_WINDOW_BIND = (1<<20),
-	NES_CQP_STAG_REM_ACC_EN = (1<<21),
-	NES_CQP_STAG_LEAVE_PENDING = (1<<31),
-};
-
-enum nes_cqp_ceq_wqeword_idx {
-	NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX = 1,
-	NES_CQP_CEQ_WQE_PBL_LOW_IDX = 6,
-	NES_CQP_CEQ_WQE_PBL_HIGH_IDX = 7,
-};
-
-enum nes_cqp_ceq_bits {
-	NES_CQP_CEQ_4KB_CHUNK = (1<<14),
-	NES_CQP_CEQ_VIRT = (1<<15),
-};
-
-enum nes_cqp_aeq_wqeword_idx {
-	NES_CQP_AEQ_WQE_ELEMENT_COUNT_IDX = 1,
-	NES_CQP_AEQ_WQE_PBL_LOW_IDX = 6,
-	NES_CQP_AEQ_WQE_PBL_HIGH_IDX = 7,
-};
-
-enum nes_cqp_aeq_bits {
-	NES_CQP_AEQ_4KB_CHUNK = (1<<14),
-	NES_CQP_AEQ_VIRT = (1<<15),
-};
-
-enum nes_cqp_lmi_wqeword_idx {
-	NES_CQP_LMI_WQE_LMI_OFFSET_IDX = 1,
-	NES_CQP_LMI_WQE_FRAG_LOW_IDX = 8,
-	NES_CQP_LMI_WQE_FRAG_HIGH_IDX = 9,
-	NES_CQP_LMI_WQE_FRAG_LEN_IDX = 10,
-};
-
-enum nes_cqp_arp_wqeword_idx {
-	NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX = 6,
-	NES_CQP_ARP_WQE_MAC_HIGH_IDX = 7,
-	NES_CQP_ARP_WQE_REACHABILITY_MAX_IDX = 1,
-};
-
-enum nes_cqp_upload_wqeword_idx {
-	NES_CQP_UPLOAD_WQE_CTXT_LOW_IDX = 6,
-	NES_CQP_UPLOAD_WQE_CTXT_HIGH_IDX = 7,
-	NES_CQP_UPLOAD_WQE_HTE_IDX = 8,
-};
-
-enum nes_cqp_arp_bits {
-	NES_CQP_ARP_VALID = (1<<8),
-	NES_CQP_ARP_PERM = (1<<9),
-};
-
-enum nes_cqp_flush_bits {
-	NES_CQP_FLUSH_SQ = (1<<30),
-	NES_CQP_FLUSH_RQ = (1<<31),
-	NES_CQP_FLUSH_MAJ_MIN = (1<<28),
-};
-
-enum nes_cqe_opcode_bits {
-	NES_CQE_STAG_VALID = (1<<6),
-	NES_CQE_ERROR = (1<<7),
-	NES_CQE_SQ = (1<<8),
-	NES_CQE_SE = (1<<9),
-	NES_CQE_PSH = (1<<29),
-	NES_CQE_FIN = (1<<30),
-	NES_CQE_VALID = (1<<31),
-};
-
-
-enum nes_cqe_word_idx {
-	NES_CQE_PAYLOAD_LENGTH_IDX = 0,
-	NES_CQE_COMP_COMP_CTX_LOW_IDX = 2,
-	NES_CQE_COMP_COMP_CTX_HIGH_IDX = 3,
-	NES_CQE_INV_STAG_IDX = 4,
-	NES_CQE_QP_ID_IDX = 5,
-	NES_CQE_ERROR_CODE_IDX = 6,
-	NES_CQE_OPCODE_IDX = 7,
-};
-
-enum nes_ceqe_word_idx {
-	NES_CEQE_CQ_CTX_LOW_IDX = 0,
-	NES_CEQE_CQ_CTX_HIGH_IDX = 1,
-};
-
-enum nes_ceqe_status_bit {
-	NES_CEQE_VALID = (1<<31),
-};
-
-enum nes_int_bits {
-	NES_INT_CEQ0 = (1<<0),
-	NES_INT_CEQ1 = (1<<1),
-	NES_INT_CEQ2 = (1<<2),
-	NES_INT_CEQ3 = (1<<3),
-	NES_INT_CEQ4 = (1<<4),
-	NES_INT_CEQ5 = (1<<5),
-	NES_INT_CEQ6 = (1<<6),
-	NES_INT_CEQ7 = (1<<7),
-	NES_INT_CEQ8 = (1<<8),
-	NES_INT_CEQ9 = (1<<9),
-	NES_INT_CEQ10 = (1<<10),
-	NES_INT_CEQ11 = (1<<11),
-	NES_INT_CEQ12 = (1<<12),
-	NES_INT_CEQ13 = (1<<13),
-	NES_INT_CEQ14 = (1<<14),
-	NES_INT_CEQ15 = (1<<15),
-	NES_INT_AEQ0 = (1<<16),
-	NES_INT_AEQ1 = (1<<17),
-	NES_INT_AEQ2 = (1<<18),
-	NES_INT_AEQ3 = (1<<19),
-	NES_INT_AEQ4 = (1<<20),
-	NES_INT_AEQ5 = (1<<21),
-	NES_INT_AEQ6 = (1<<22),
-	NES_INT_AEQ7 = (1<<23),
-	NES_INT_MAC0 = (1<<24),
-	NES_INT_MAC1 = (1<<25),
-	NES_INT_MAC2 = (1<<26),
-	NES_INT_MAC3 = (1<<27),
-	NES_INT_TSW = (1<<28),
-	NES_INT_TIMER = (1<<29),
-	NES_INT_INTF = (1<<30),
-};
-
-enum nes_intf_int_bits {
-	NES_INTF_INT_PCIERR = (1<<0),
-	NES_INTF_PERIODIC_TIMER = (1<<2),
-	NES_INTF_ONE_SHOT_TIMER = (1<<3),
-	NES_INTF_INT_CRITERR = (1<<14),
-	NES_INTF_INT_AEQ0_OFLOW = (1<<16),
-	NES_INTF_INT_AEQ1_OFLOW = (1<<17),
-	NES_INTF_INT_AEQ2_OFLOW = (1<<18),
-	NES_INTF_INT_AEQ3_OFLOW = (1<<19),
-	NES_INTF_INT_AEQ4_OFLOW = (1<<20),
-	NES_INTF_INT_AEQ5_OFLOW = (1<<21),
-	NES_INTF_INT_AEQ6_OFLOW = (1<<22),
-	NES_INTF_INT_AEQ7_OFLOW = (1<<23),
-	NES_INTF_INT_AEQ_OFLOW = (0xff<<16),
-};
-
-enum nes_mac_int_bits {
-	NES_MAC_INT_LINK_STAT_CHG = (1<<1),
-	NES_MAC_INT_XGMII_EXT = (1<<2),
-	NES_MAC_INT_TX_UNDERFLOW = (1<<6),
-	NES_MAC_INT_TX_ERROR = (1<<7),
-};
-
-enum nes_cqe_allocate_bits {
-	NES_CQE_ALLOC_INC_SELECT = (1<<28),
-	NES_CQE_ALLOC_NOTIFY_NEXT = (1<<29),
-	NES_CQE_ALLOC_NOTIFY_SE = (1<<30),
-	NES_CQE_ALLOC_RESET = (1<<31),
-};
-
-enum nes_nic_rq_wqe_word_idx {
-	NES_NIC_RQ_WQE_LENGTH_1_0_IDX = 0,
-	NES_NIC_RQ_WQE_LENGTH_3_2_IDX = 1,
-	NES_NIC_RQ_WQE_FRAG0_LOW_IDX = 2,
-	NES_NIC_RQ_WQE_FRAG0_HIGH_IDX = 3,
-	NES_NIC_RQ_WQE_FRAG1_LOW_IDX = 4,
-	NES_NIC_RQ_WQE_FRAG1_HIGH_IDX = 5,
-	NES_NIC_RQ_WQE_FRAG2_LOW_IDX = 6,
-	NES_NIC_RQ_WQE_FRAG2_HIGH_IDX = 7,
-	NES_NIC_RQ_WQE_FRAG3_LOW_IDX = 8,
-	NES_NIC_RQ_WQE_FRAG3_HIGH_IDX = 9,
-};
-
-enum nes_nic_sq_wqe_word_idx {
-	NES_NIC_SQ_WQE_MISC_IDX = 0,
-	NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX = 1,
-	NES_NIC_SQ_WQE_LSO_INFO_IDX = 2,
-	NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX = 3,
-	NES_NIC_SQ_WQE_LENGTH_2_1_IDX = 4,
-	NES_NIC_SQ_WQE_LENGTH_4_3_IDX = 5,
-	NES_NIC_SQ_WQE_FRAG0_LOW_IDX = 6,
-	NES_NIC_SQ_WQE_FRAG0_HIGH_IDX = 7,
-	NES_NIC_SQ_WQE_FRAG1_LOW_IDX = 8,
-	NES_NIC_SQ_WQE_FRAG1_HIGH_IDX = 9,
-	NES_NIC_SQ_WQE_FRAG2_LOW_IDX = 10,
-	NES_NIC_SQ_WQE_FRAG2_HIGH_IDX = 11,
-	NES_NIC_SQ_WQE_FRAG3_LOW_IDX = 12,
-	NES_NIC_SQ_WQE_FRAG3_HIGH_IDX = 13,
-	NES_NIC_SQ_WQE_FRAG4_LOW_IDX = 14,
-	NES_NIC_SQ_WQE_FRAG4_HIGH_IDX = 15,
-};
-
-enum nes_iwarp_sq_wqe_word_idx {
-	NES_IWARP_SQ_WQE_MISC_IDX = 0,
-	NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX = 1,
-	NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX = 2,
-	NES_IWARP_SQ_WQE_COMP_CTX_HIGH_IDX = 3,
-	NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX = 4,
-	NES_IWARP_SQ_WQE_COMP_SCRATCH_HIGH_IDX = 5,
-	NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX = 7,
-	NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX = 8,
-	NES_IWARP_SQ_WQE_RDMA_TO_HIGH_IDX = 9,
-	NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX = 10,
-	NES_IWARP_SQ_WQE_RDMA_STAG_IDX = 11,
-	NES_IWARP_SQ_WQE_IMM_DATA_START_IDX = 12,
-	NES_IWARP_SQ_WQE_FRAG0_LOW_IDX = 16,
-	NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX = 17,
-	NES_IWARP_SQ_WQE_LENGTH0_IDX = 18,
-	NES_IWARP_SQ_WQE_STAG0_IDX = 19,
-	NES_IWARP_SQ_WQE_FRAG1_LOW_IDX = 20,
-	NES_IWARP_SQ_WQE_FRAG1_HIGH_IDX = 21,
-	NES_IWARP_SQ_WQE_LENGTH1_IDX = 22,
-	NES_IWARP_SQ_WQE_STAG1_IDX = 23,
-	NES_IWARP_SQ_WQE_FRAG2_LOW_IDX = 24,
-	NES_IWARP_SQ_WQE_FRAG2_HIGH_IDX = 25,
-	NES_IWARP_SQ_WQE_LENGTH2_IDX = 26,
-	NES_IWARP_SQ_WQE_STAG2_IDX = 27,
-	NES_IWARP_SQ_WQE_FRAG3_LOW_IDX = 28,
-	NES_IWARP_SQ_WQE_FRAG3_HIGH_IDX = 29,
-	NES_IWARP_SQ_WQE_LENGTH3_IDX = 30,
-	NES_IWARP_SQ_WQE_STAG3_IDX = 31,
-};
-
-enum nes_iwarp_sq_bind_wqe_word_idx {
-	NES_IWARP_SQ_BIND_WQE_MR_IDX = 6,
-	NES_IWARP_SQ_BIND_WQE_MW_IDX = 7,
-	NES_IWARP_SQ_BIND_WQE_LENGTH_LOW_IDX = 8,
-	NES_IWARP_SQ_BIND_WQE_LENGTH_HIGH_IDX = 9,
-	NES_IWARP_SQ_BIND_WQE_VA_FBO_LOW_IDX = 10,
-	NES_IWARP_SQ_BIND_WQE_VA_FBO_HIGH_IDX = 11,
-};
-
-enum nes_iwarp_sq_fmr_wqe_word_idx {
-	NES_IWARP_SQ_FMR_WQE_MR_STAG_IDX = 7,
-	NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX = 8,
-	NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX = 9,
-	NES_IWARP_SQ_FMR_WQE_VA_FBO_LOW_IDX = 10,
-	NES_IWARP_SQ_FMR_WQE_VA_FBO_HIGH_IDX = 11,
-	NES_IWARP_SQ_FMR_WQE_PBL_ADDR_LOW_IDX = 12,
-	NES_IWARP_SQ_FMR_WQE_PBL_ADDR_HIGH_IDX = 13,
-	NES_IWARP_SQ_FMR_WQE_PBL_LENGTH_IDX = 14,
-};
-
-enum nes_iwarp_sq_fmr_opcodes {
-	NES_IWARP_SQ_FMR_WQE_ZERO_BASED			= (1<<6),
-	NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_4K		= (0<<7),
-	NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_2M		= (1<<7),
-	NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_READ	= (1<<16),
-	NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_WRITE 	= (1<<17),
-	NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_READ 	= (1<<18),
-	NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_WRITE = (1<<19),
-	NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_WINDOW_BIND 	= (1<<20),
-};
-
-#define NES_IWARP_SQ_FMR_WQE_MR_LENGTH_HIGH_MASK	0xFF;
-
-enum nes_iwarp_sq_locinv_wqe_word_idx {
-	NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX = 6,
-};
-
-enum nes_iwarp_rq_wqe_word_idx {
-	NES_IWARP_RQ_WQE_TOTAL_PAYLOAD_IDX = 1,
-	NES_IWARP_RQ_WQE_COMP_CTX_LOW_IDX = 2,
-	NES_IWARP_RQ_WQE_COMP_CTX_HIGH_IDX = 3,
-	NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX = 4,
-	NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX = 5,
-	NES_IWARP_RQ_WQE_FRAG0_LOW_IDX = 8,
-	NES_IWARP_RQ_WQE_FRAG0_HIGH_IDX = 9,
-	NES_IWARP_RQ_WQE_LENGTH0_IDX = 10,
-	NES_IWARP_RQ_WQE_STAG0_IDX = 11,
-	NES_IWARP_RQ_WQE_FRAG1_LOW_IDX = 12,
-	NES_IWARP_RQ_WQE_FRAG1_HIGH_IDX = 13,
-	NES_IWARP_RQ_WQE_LENGTH1_IDX = 14,
-	NES_IWARP_RQ_WQE_STAG1_IDX = 15,
-	NES_IWARP_RQ_WQE_FRAG2_LOW_IDX = 16,
-	NES_IWARP_RQ_WQE_FRAG2_HIGH_IDX = 17,
-	NES_IWARP_RQ_WQE_LENGTH2_IDX = 18,
-	NES_IWARP_RQ_WQE_STAG2_IDX = 19,
-	NES_IWARP_RQ_WQE_FRAG3_LOW_IDX = 20,
-	NES_IWARP_RQ_WQE_FRAG3_HIGH_IDX = 21,
-	NES_IWARP_RQ_WQE_LENGTH3_IDX = 22,
-	NES_IWARP_RQ_WQE_STAG3_IDX = 23,
-};
-
-enum nes_nic_sq_wqe_bits {
-	NES_NIC_SQ_WQE_PHDR_CS_READY =  (1<<21),
-	NES_NIC_SQ_WQE_LSO_ENABLE = (1<<22),
-	NES_NIC_SQ_WQE_TAGVALUE_ENABLE = (1<<23),
-	NES_NIC_SQ_WQE_DISABLE_CHKSUM = (1<<30),
-	NES_NIC_SQ_WQE_COMPLETION = (1<<31),
-};
-
-enum nes_nic_cqe_word_idx {
-	NES_NIC_CQE_ACCQP_ID_IDX = 0,
-	NES_NIC_CQE_HASH_RCVNXT = 1,
-	NES_NIC_CQE_TAG_PKT_TYPE_IDX = 2,
-	NES_NIC_CQE_MISC_IDX = 3,
-};
-
-#define NES_PKT_TYPE_APBVT_BITS 0xC112
-#define NES_PKT_TYPE_APBVT_MASK 0xff3e
-
-#define NES_PKT_TYPE_PVALID_BITS 0x10000000
-#define NES_PKT_TYPE_PVALID_MASK 0x30000000
-
-#define NES_PKT_TYPE_TCPV4_BITS 0x0110
-#define NES_PKT_TYPE_TCPV4_MASK 0x3f30
-
-#define NES_PKT_TYPE_UDPV4_BITS 0x0210
-#define NES_PKT_TYPE_UDPV4_MASK 0x3f30
-
-#define NES_PKT_TYPE_IPV4_BITS  0x0010
-#define NES_PKT_TYPE_IPV4_MASK  0x3f30
-
-#define NES_PKT_TYPE_OTHER_BITS 0x0000
-#define NES_PKT_TYPE_OTHER_MASK 0x0030
-
-#define NES_NIC_CQE_ERRV_SHIFT 16
-enum nes_nic_ev_bits {
-	NES_NIC_ERRV_BITS_MODE = (1<<0),
-	NES_NIC_ERRV_BITS_IPV4_CSUM_ERR = (1<<1),
-	NES_NIC_ERRV_BITS_TCPUDP_CSUM_ERR = (1<<2),
-	NES_NIC_ERRV_BITS_WQE_OVERRUN = (1<<3),
-	NES_NIC_ERRV_BITS_IPH_ERR = (1<<4),
-};
-
-enum nes_nic_cqe_bits {
-	NES_NIC_CQE_ERRV_MASK = (0xff<<NES_NIC_CQE_ERRV_SHIFT),
-	NES_NIC_CQE_SQ = (1<<24),
-	NES_NIC_CQE_ACCQP_PORT = (1<<28),
-	NES_NIC_CQE_ACCQP_VALID = (1<<29),
-	NES_NIC_CQE_TAG_VALID = (1<<30),
-	NES_NIC_CQE_VALID = (1<<31),
-};
-
-enum nes_aeqe_word_idx {
-	NES_AEQE_COMP_CTXT_LOW_IDX = 0,
-	NES_AEQE_COMP_CTXT_HIGH_IDX = 1,
-	NES_AEQE_COMP_QP_CQ_ID_IDX = 2,
-	NES_AEQE_MISC_IDX = 3,
-};
-
-enum nes_aeqe_bits {
-	NES_AEQE_QP = (1<<16),
-	NES_AEQE_CQ = (1<<17),
-	NES_AEQE_SQ = (1<<18),
-	NES_AEQE_INBOUND_RDMA = (1<<19),
-	NES_AEQE_IWARP_STATE_MASK = (7<<20),
-	NES_AEQE_TCP_STATE_MASK = (0xf<<24),
-	NES_AEQE_Q2_DATA_WRITTEN = (0x3<<28),
-	NES_AEQE_VALID = (1<<31),
-};
-
-#define NES_AEQE_IWARP_STATE_SHIFT	20
-#define NES_AEQE_TCP_STATE_SHIFT	24
-#define NES_AEQE_Q2_DATA_ETHERNET       (1<<28)
-#define NES_AEQE_Q2_DATA_MPA            (1<<29)
-
-enum nes_aeqe_iwarp_state {
-	NES_AEQE_IWARP_STATE_NON_EXISTANT = 0,
-	NES_AEQE_IWARP_STATE_IDLE = 1,
-	NES_AEQE_IWARP_STATE_RTS = 2,
-	NES_AEQE_IWARP_STATE_CLOSING = 3,
-	NES_AEQE_IWARP_STATE_TERMINATE = 5,
-	NES_AEQE_IWARP_STATE_ERROR = 6
-};
-
-enum nes_aeqe_tcp_state {
-	NES_AEQE_TCP_STATE_NON_EXISTANT = 0,
-	NES_AEQE_TCP_STATE_CLOSED = 1,
-	NES_AEQE_TCP_STATE_LISTEN = 2,
-	NES_AEQE_TCP_STATE_SYN_SENT = 3,
-	NES_AEQE_TCP_STATE_SYN_RCVD = 4,
-	NES_AEQE_TCP_STATE_ESTABLISHED = 5,
-	NES_AEQE_TCP_STATE_CLOSE_WAIT = 6,
-	NES_AEQE_TCP_STATE_FIN_WAIT_1 = 7,
-	NES_AEQE_TCP_STATE_CLOSING = 8,
-	NES_AEQE_TCP_STATE_LAST_ACK = 9,
-	NES_AEQE_TCP_STATE_FIN_WAIT_2 = 10,
-	NES_AEQE_TCP_STATE_TIME_WAIT = 11
-};
-
-enum nes_aeqe_aeid {
-	NES_AEQE_AEID_AMP_UNALLOCATED_STAG                            = 0x0102,
-	NES_AEQE_AEID_AMP_INVALID_STAG                                = 0x0103,
-	NES_AEQE_AEID_AMP_BAD_QP                                      = 0x0104,
-	NES_AEQE_AEID_AMP_BAD_PD                                      = 0x0105,
-	NES_AEQE_AEID_AMP_BAD_STAG_KEY                                = 0x0106,
-	NES_AEQE_AEID_AMP_BAD_STAG_INDEX                              = 0x0107,
-	NES_AEQE_AEID_AMP_BOUNDS_VIOLATION                            = 0x0108,
-	NES_AEQE_AEID_AMP_RIGHTS_VIOLATION                            = 0x0109,
-	NES_AEQE_AEID_AMP_TO_WRAP                                     = 0x010a,
-	NES_AEQE_AEID_AMP_FASTREG_SHARED                              = 0x010b,
-	NES_AEQE_AEID_AMP_FASTREG_VALID_STAG                          = 0x010c,
-	NES_AEQE_AEID_AMP_FASTREG_MW_STAG                             = 0x010d,
-	NES_AEQE_AEID_AMP_FASTREG_INVALID_RIGHTS                      = 0x010e,
-	NES_AEQE_AEID_AMP_FASTREG_PBL_TABLE_OVERFLOW                  = 0x010f,
-	NES_AEQE_AEID_AMP_FASTREG_INVALID_LENGTH                      = 0x0110,
-	NES_AEQE_AEID_AMP_INVALIDATE_SHARED                           = 0x0111,
-	NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS          = 0x0112,
-	NES_AEQE_AEID_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS            = 0x0113,
-	NES_AEQE_AEID_AMP_MWBIND_VALID_STAG                           = 0x0114,
-	NES_AEQE_AEID_AMP_MWBIND_OF_MR_STAG                           = 0x0115,
-	NES_AEQE_AEID_AMP_MWBIND_TO_ZERO_BASED_STAG                   = 0x0116,
-	NES_AEQE_AEID_AMP_MWBIND_TO_MW_STAG                           = 0x0117,
-	NES_AEQE_AEID_AMP_MWBIND_INVALID_RIGHTS                       = 0x0118,
-	NES_AEQE_AEID_AMP_MWBIND_INVALID_BOUNDS                       = 0x0119,
-	NES_AEQE_AEID_AMP_MWBIND_TO_INVALID_PARENT                    = 0x011a,
-	NES_AEQE_AEID_AMP_MWBIND_BIND_DISABLED                        = 0x011b,
-	NES_AEQE_AEID_BAD_CLOSE                                       = 0x0201,
-	NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE                         = 0x0202,
-	NES_AEQE_AEID_CQ_OPERATION_ERROR                              = 0x0203,
-	NES_AEQE_AEID_PRIV_OPERATION_DENIED                           = 0x0204,
-	NES_AEQE_AEID_RDMA_READ_WHILE_ORD_ZERO                        = 0x0205,
-	NES_AEQE_AEID_STAG_ZERO_INVALID                               = 0x0206,
-	NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN                      = 0x0301,
-	NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID              = 0x0302,
-	NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER = 0x0303,
-	NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION                     = 0x0304,
-	NES_AEQE_AEID_DDP_UBE_INVALID_MO                              = 0x0305,
-	NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE         = 0x0306,
-	NES_AEQE_AEID_DDP_UBE_INVALID_QN                              = 0x0307,
-	NES_AEQE_AEID_DDP_NO_L_BIT                                    = 0x0308,
-	NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION                 = 0x0311,
-	NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE                     = 0x0312,
-	NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST                   = 0x0313,
-	NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP             = 0x0314,
-	NES_AEQE_AEID_INVALID_ARP_ENTRY                               = 0x0401,
-	NES_AEQE_AEID_INVALID_TCP_OPTION_RCVD                         = 0x0402,
-	NES_AEQE_AEID_STALE_ARP_ENTRY                                 = 0x0403,
-	NES_AEQE_AEID_LLP_CLOSE_COMPLETE                              = 0x0501,
-	NES_AEQE_AEID_LLP_CONNECTION_RESET                            = 0x0502,
-	NES_AEQE_AEID_LLP_FIN_RECEIVED                                = 0x0503,
-	NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH =  0x0504,
-	NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR                      = 0x0505,
-	NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE                           = 0x0506,
-	NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL                           = 0x0507,
-	NES_AEQE_AEID_LLP_SYN_RECEIVED                                = 0x0508,
-	NES_AEQE_AEID_LLP_TERMINATE_RECEIVED                          = 0x0509,
-	NES_AEQE_AEID_LLP_TOO_MANY_RETRIES                            = 0x050a,
-	NES_AEQE_AEID_LLP_TOO_MANY_KEEPALIVE_RETRIES                  = 0x050b,
-	NES_AEQE_AEID_RESET_SENT                                      = 0x0601,
-	NES_AEQE_AEID_TERMINATE_SENT                                  = 0x0602,
-	NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC                      = 0x0700
-};
-
-enum nes_iwarp_sq_opcodes {
-	NES_IWARP_SQ_WQE_WRPDU = (1<<15),
-	NES_IWARP_SQ_WQE_PSH = (1<<21),
-	NES_IWARP_SQ_WQE_STREAMING = (1<<23),
-	NES_IWARP_SQ_WQE_IMM_DATA = (1<<28),
-	NES_IWARP_SQ_WQE_READ_FENCE = (1<<29),
-	NES_IWARP_SQ_WQE_LOCAL_FENCE = (1<<30),
-	NES_IWARP_SQ_WQE_SIGNALED_COMPL = (1<<31),
-};
-
-enum nes_iwarp_sq_wqe_bits {
-	NES_IWARP_SQ_OP_RDMAW = 0,
-	NES_IWARP_SQ_OP_RDMAR = 1,
-	NES_IWARP_SQ_OP_SEND = 3,
-	NES_IWARP_SQ_OP_SENDINV = 4,
-	NES_IWARP_SQ_OP_SENDSE = 5,
-	NES_IWARP_SQ_OP_SENDSEINV = 6,
-	NES_IWARP_SQ_OP_BIND = 8,
-	NES_IWARP_SQ_OP_FAST_REG = 9,
-	NES_IWARP_SQ_OP_LOCINV = 10,
-	NES_IWARP_SQ_OP_RDMAR_LOCINV = 11,
-	NES_IWARP_SQ_OP_NOP = 12,
-};
-
-enum nes_iwarp_cqe_major_code {
-	NES_IWARP_CQE_MAJOR_FLUSH = 1,
-	NES_IWARP_CQE_MAJOR_DRV = 0x8000
-};
-
-enum nes_iwarp_cqe_minor_code {
-	NES_IWARP_CQE_MINOR_FLUSH = 1
-};
-
-#define NES_EEPROM_READ_REQUEST (1<<16)
-#define NES_MAC_ADDR_VALID      (1<<20)
-
-/*
- * NES index registers init values.
- */
-struct nes_init_values {
-	u32 index;
-	u32 data;
-	u8  wrt;
-};
-
-/*
- * NES registers in BAR0.
- */
-struct nes_pci_regs {
-	u32 int_status;
-	u32 int_mask;
-	u32 int_pending;
-	u32 intf_int_status;
-	u32 intf_int_mask;
-	u32 other_regs[59];	 /* pad out to 256 bytes for now */
-};
-
-#define NES_CQP_SQ_SIZE    128
-#define NES_CCQ_SIZE       128
-#define NES_NIC_WQ_SIZE    512
-#define NES_NIC_CTX_SIZE   ((NES_NIC_CTX_RQ_SIZE_512) | (NES_NIC_CTX_SQ_SIZE_512))
-#define NES_NIC_BACK_STORE 0x00038000
-
-struct nes_device;
-
-struct nes_hw_nic_qp_context {
-	__le32 context_words[6];
-};
-
-struct nes_hw_nic_sq_wqe {
-	__le32 wqe_words[16];
-};
-
-struct nes_hw_nic_rq_wqe {
-	__le32 wqe_words[16];
-};
-
-struct nes_hw_nic_cqe {
-	__le32 cqe_words[4];
-};
-
-struct nes_hw_cqp_qp_context {
-	__le32 context_words[4];
-};
-
-struct nes_hw_cqp_wqe {
-	__le32 wqe_words[16];
-};
-
-struct nes_hw_qp_wqe {
-	__le32 wqe_words[32];
-};
-
-struct nes_hw_cqe {
-	__le32 cqe_words[8];
-};
-
-struct nes_hw_ceqe {
-	__le32 ceqe_words[2];
-};
-
-struct nes_hw_aeqe {
-	__le32 aeqe_words[4];
-};
-
-struct nes_cqp_request {
-	union {
-		u64 cqp_callback_context;
-		void *cqp_callback_pointer;
-	};
-	wait_queue_head_t     waitq;
-	struct nes_hw_cqp_wqe cqp_wqe;
-	struct list_head      list;
-	atomic_t              refcount;
-	void (*cqp_callback)(struct nes_device *nesdev, struct nes_cqp_request *cqp_request);
-	u16                   major_code;
-	u16                   minor_code;
-	u8                    waiting;
-	u8                    request_done;
-	u8                    dynamic;
-	u8                    callback;
-};
-
-struct nes_hw_cqp {
-	struct nes_hw_cqp_wqe *sq_vbase;
-	dma_addr_t            sq_pbase;
-	spinlock_t            lock;
-	wait_queue_head_t     waitq;
-	u16                   qp_id;
-	u16                   sq_head;
-	u16                   sq_tail;
-	u16                   sq_size;
-};
-
-#define NES_FIRST_FRAG_SIZE 128
-struct nes_first_frag {
-	u8 buffer[NES_FIRST_FRAG_SIZE];
-};
-
-struct nes_hw_nic {
-	struct nes_first_frag    *first_frag_vbase;	/* virtual address of first frags */
-	struct nes_hw_nic_sq_wqe *sq_vbase;			/* virtual address of sq */
-	struct nes_hw_nic_rq_wqe *rq_vbase;			/* virtual address of rq */
-	struct sk_buff           *tx_skb[NES_NIC_WQ_SIZE];
-	struct sk_buff           *rx_skb[NES_NIC_WQ_SIZE];
-	dma_addr_t frag_paddr[NES_NIC_WQ_SIZE];
-	unsigned long first_frag_overflow[BITS_TO_LONGS(NES_NIC_WQ_SIZE)];
-	dma_addr_t sq_pbase;			/* PCI memory for host rings */
-	dma_addr_t rq_pbase;			/* PCI memory for host rings */
-
-	u16 qp_id;
-	u16 sq_head;
-	u16 sq_tail;
-	u16 sq_size;
-	u16 rq_head;
-	u16 rq_tail;
-	u16 rq_size;
-	u8 replenishing_rq;
-	u8 reserved;
-
-	spinlock_t rq_lock;
-};
-
-struct nes_hw_nic_cq {
-	struct nes_hw_nic_cqe volatile *cq_vbase;	/* PCI memory for host rings */
-	void (*ce_handler)(struct nes_device *nesdev, struct nes_hw_nic_cq *cq);
-	dma_addr_t cq_pbase;	/* PCI memory for host rings */
-	int rx_cqes_completed;
-	int cqe_allocs_pending;
-	int rx_pkts_indicated;
-	u16 cq_head;
-	u16 cq_size;
-	u16 cq_number;
-	u8  cqes_pending;
-};
-
-struct nes_hw_qp {
-	struct nes_hw_qp_wqe *sq_vbase;		/* PCI memory for host rings */
-	struct nes_hw_qp_wqe *rq_vbase;		/* PCI memory for host rings */
-	void                 *q2_vbase;			/* PCI memory for host rings */
-	dma_addr_t sq_pbase;	/* PCI memory for host rings */
-	dma_addr_t rq_pbase;	/* PCI memory for host rings */
-	dma_addr_t q2_pbase;	/* PCI memory for host rings */
-	u32 qp_id;
-	u16 sq_head;
-	u16 sq_tail;
-	u16 sq_size;
-	u16 rq_head;
-	u16 rq_tail;
-	u16 rq_size;
-	u8  rq_encoded_size;
-	u8  sq_encoded_size;
-};
-
-struct nes_hw_cq {
-	struct nes_hw_cqe *cq_vbase;	/* PCI memory for host rings */
-	void (*ce_handler)(struct nes_device *nesdev, struct nes_hw_cq *cq);
-	dma_addr_t cq_pbase;	/* PCI memory for host rings */
-	u16 cq_head;
-	u16 cq_size;
-	u16 cq_number;
-};
-
-struct nes_hw_ceq {
-	struct nes_hw_ceqe volatile *ceq_vbase;	/* PCI memory for host rings */
-	dma_addr_t ceq_pbase;	/* PCI memory for host rings */
-	u16 ceq_head;
-	u16 ceq_size;
-};
-
-struct nes_hw_aeq {
-	struct nes_hw_aeqe volatile *aeq_vbase;	/* PCI memory for host rings */
-	dma_addr_t aeq_pbase;	/* PCI memory for host rings */
-	u16 aeq_head;
-	u16 aeq_size;
-};
-
-struct nic_qp_map {
-	u8 qpid;
-	u8 nic_index;
-	u8 logical_port;
-	u8 is_hnic;
-};
-
-#define	NES_CQP_ARP_AEQ_INDEX_MASK  0x000f0000
-#define	NES_CQP_ARP_AEQ_INDEX_SHIFT 16
-
-#define NES_CQP_APBVT_ADD			0x00008000
-#define NES_CQP_APBVT_NIC_SHIFT		16
-
-#define NES_ARP_ADD     1
-#define NES_ARP_DELETE  2
-#define NES_ARP_RESOLVE 3
-
-#define NES_MAC_SW_IDLE      0
-#define NES_MAC_SW_INTERRUPT 1
-#define NES_MAC_SW_MH        2
-
-struct nes_arp_entry {
-	u32 ip_addr;
-	u8  mac_addr[ETH_ALEN];
-};
-
-#define NES_NIC_FAST_TIMER          96
-#define NES_NIC_FAST_TIMER_LOW      40
-#define NES_NIC_FAST_TIMER_HIGH     1000
-#define DEFAULT_NES_QL_HIGH         256
-#define DEFAULT_NES_QL_LOW          16
-#define DEFAULT_NES_QL_TARGET       64
-#define DEFAULT_JUMBO_NES_QL_LOW    12
-#define DEFAULT_JUMBO_NES_QL_TARGET 40
-#define DEFAULT_JUMBO_NES_QL_HIGH   128
-#define NES_NIC_CQ_DOWNWARD_TREND   16
-#define NES_PFT_SIZE		    48
-
-#define NES_MGT_WQ_COUNT 32
-#define NES_MGT_CTX_SIZE ((NES_NIC_CTX_RQ_SIZE_32) | (NES_NIC_CTX_SQ_SIZE_32))
-#define NES_MGT_QP_OFFSET 36
-#define NES_MGT_QP_COUNT 4
-
-struct nes_hw_tune_timer {
-    /* u16 cq_count; */
-    u16 threshold_low;
-    u16 threshold_target;
-    u16 threshold_high;
-    u16 timer_in_use;
-    u16 timer_in_use_old;
-    u16 timer_in_use_min;
-    u16 timer_in_use_max;
-    u8  timer_direction_upward;
-    u8  timer_direction_downward;
-    u16 cq_count_old;
-    u8  cq_direction_downward;
-};
-
-#define NES_TIMER_INT_LIMIT         2
-#define NES_TIMER_INT_LIMIT_DYNAMIC 10
-#define NES_TIMER_ENABLE_LIMIT      4
-#define NES_MAX_LINK_INTERRUPTS     128
-#define NES_MAX_LINK_CHECK          200
-
-struct nes_adapter {
-	u64              fw_ver;
-	unsigned long    *allocated_qps;
-	unsigned long    *allocated_cqs;
-	unsigned long    *allocated_mrs;
-	unsigned long    *allocated_pds;
-	unsigned long    *allocated_arps;
-	struct nes_qp    **qp_table;
-	struct workqueue_struct *work_q;
-
-	struct list_head list;
-	struct list_head active_listeners;
-	/* list of the netdev's associated with each logical port */
-	struct list_head nesvnic_list[4];
-
-	struct timer_list  mh_timer;
-	struct timer_list  lc_timer;
-	struct work_struct work;
-	spinlock_t         resource_lock;
-	spinlock_t         phy_lock;
-	spinlock_t         pbl_lock;
-	spinlock_t         periodic_timer_lock;
-
-	struct nes_arp_entry arp_table[NES_MAX_ARP_TABLE_SIZE];
-
-	/* Adapter CEQ and AEQs */
-	struct nes_hw_ceq ceq[16];
-	struct nes_hw_aeq aeq[8];
-
-	struct nes_hw_tune_timer tune_timer;
-
-	unsigned long doorbell_start;
-
-	u32 hw_rev;
-	u32 vendor_id;
-	u32 vendor_part_id;
-	u32 device_cap_flags;
-	u32 tick_delta;
-	u32 timer_int_req;
-	u32 arp_table_size;
-	u32 next_arp_index;
-
-	u32 max_mr;
-	u32 max_256pbl;
-	u32 max_4kpbl;
-	u32 free_256pbl;
-	u32 free_4kpbl;
-	u32 max_mr_size;
-	u32 max_qp;
-	u32 next_qp;
-	u32 max_irrq;
-	u32 max_qp_wr;
-	u32 max_sge;
-	u32 max_cq;
-	u32 next_cq;
-	u32 max_cqe;
-	u32 max_pd;
-	u32 base_pd;
-	u32 next_pd;
-	u32 hte_index_mask;
-
-	/* EEPROM information */
-	u32 rx_pool_size;
-	u32 tx_pool_size;
-	u32 rx_threshold;
-	u32 tcp_timer_core_clk_divisor;
-	u32 iwarp_config;
-	u32 cm_config;
-	u32 sws_timer_config;
-	u32 tcp_config1;
-	u32 wqm_wat;
-	u32 core_clock;
-	u32 firmware_version;
-	u32 eeprom_version;
-
-	u32 nic_rx_eth_route_err;
-
-	u32 et_rx_coalesce_usecs;
-	u32 et_rx_max_coalesced_frames;
-	u32 et_rx_coalesce_usecs_irq;
-	u32 et_rx_max_coalesced_frames_irq;
-	u32 et_pkt_rate_low;
-	u32 et_rx_coalesce_usecs_low;
-	u32 et_rx_max_coalesced_frames_low;
-	u32 et_pkt_rate_high;
-	u32 et_rx_coalesce_usecs_high;
-	u32 et_rx_max_coalesced_frames_high;
-	u32 et_rate_sample_interval;
-	u32 timer_int_limit;
-	u32 wqm_quanta;
-	u8 allow_unaligned_fpdus;
-
-	/* Adapter base MAC address */
-	u32 mac_addr_low;
-	u16 mac_addr_high;
-
-	u16 firmware_eeprom_offset;
-	u16 software_eeprom_offset;
-
-	u16 max_irrq_wr;
-
-	/* pd config for each port */
-	u16 pd_config_size[4];
-	u16 pd_config_base[4];
-
-	u16 link_interrupt_count[4];
-	u8 crit_error_count[32];
-
-	/* the phy index for each port */
-	u8  phy_index[4];
-	u8  mac_sw_state[4];
-	u8  mac_link_down[4];
-	u8  phy_type[4];
-	u8  log_port;
-
-	/* PCI information */
-	struct nes_device *nesdev;
-	unsigned int  devfn;
-	unsigned char bus_number;
-	unsigned char OneG_Mode;
-
-	unsigned char ref_count;
-	u8            netdev_count;
-	u8            netdev_max;	/* from host nic address count in EEPROM */
-	u8            port_count;
-	u8            virtwq;
-	u8            send_term_ok;
-	u8            et_use_adaptive_rx_coalesce;
-	u8            adapter_fcn_count;
-	u8 pft_mcast_map[NES_PFT_SIZE];
-};
-
-struct nes_pbl {
-	u64              *pbl_vbase;
-	dma_addr_t       pbl_pbase;
-	struct page      *page;
-	unsigned long    user_base;
-	u32              pbl_size;
-	struct list_head list;
-	/* TODO: need to add list for two level tables */
-};
-
-#define NES_4K_PBL_CHUNK_SIZE	4096
-
-struct nes_fast_mr_wqe_pbl {
-	u64		*kva;
-	dma_addr_t	paddr;
-};
-
-struct nes_listener {
-	struct work_struct      work;
-	struct workqueue_struct *wq;
-	struct nes_vnic         *nesvnic;
-	struct iw_cm_id         *cm_id;
-	struct list_head        list;
-	unsigned long           socket;
-	u8                      accept_failed;
-};
-
-struct nes_ib_device;
-
-#define NES_EVENT_DELAY msecs_to_jiffies(100)
-
-struct nes_vnic {
-	struct nes_ib_device *nesibdev;
-	u64 sq_full;
-	u64 tso_requests;
-	u64 segmented_tso_requests;
-	u64 linearized_skbs;
-	u64 tx_sw_dropped;
-	u64 endnode_nstat_rx_discard;
-	u64 endnode_nstat_rx_octets;
-	u64 endnode_nstat_rx_frames;
-	u64 endnode_nstat_tx_octets;
-	u64 endnode_nstat_tx_frames;
-	u64 endnode_ipv4_tcp_retransmits;
-	/* void *mem; */
-	struct nes_device *nesdev;
-	struct net_device *netdev;
-	atomic_t          rx_skbs_needed;
-	atomic_t          rx_skb_timer_running;
-	int               budget;
-	u32               msg_enable;
-	/* u32 tx_avail; */
-	__be32            local_ipaddr;
-	struct napi_struct   napi;
-	spinlock_t           tx_lock;	/* could use netdev tx lock? */
-	struct timer_list    rq_wqes_timer;
-	u32                  nic_mem_size;
-	void                 *nic_vbase;
-	dma_addr_t           nic_pbase;
-	struct nes_hw_nic    nic;
-	struct nes_hw_nic_cq nic_cq;
-	u32    mcrq_qp_id;
-	struct nes_ucontext *mcrq_ucontext;
-	struct nes_cqp_request* (*get_cqp_request)(struct nes_device *nesdev);
-	void (*post_cqp_request)(struct nes_device*, struct nes_cqp_request *);
-	int (*mcrq_mcast_filter)( struct nes_vnic* nesvnic, __u8* dmi_addr );
-	struct net_device_stats netstats;
-	/* used to put the netdev on the adapters logical port list */
-	struct list_head list;
-	u16 max_frame_size;
-	u8  netdev_open;
-	u8  linkup;
-	u8  logical_port;
-	u8  netdev_index;  /* might not be needed, indexes nesdev->netdev */
-	u8  perfect_filter_index;
-	u8  nic_index;
-	u8  qp_nic_index[4];
-	u8  next_qp_nic_index;
-	u8  of_device_registered;
-	u8  rdma_enabled;
-	struct timer_list event_timer;
-	enum ib_event_type delayed_event;
-	enum ib_event_type last_dispatched_event;
-	spinlock_t port_ibevent_lock;
-	u32 mgt_mem_size;
-	void *mgt_vbase;
-	dma_addr_t mgt_pbase;
-	struct nes_vnic_mgt *mgtvnic[NES_MGT_QP_COUNT];
-	struct task_struct *mgt_thread;
-	wait_queue_head_t mgt_wait_queue;
-	struct sk_buff_head mgt_skb_list;
-
-};
-
-struct nes_ib_device {
-	struct ib_device ibdev;
-	struct nes_vnic *nesvnic;
-
-	/* Virtual RNIC Limits */
-	u32 max_mr;
-	u32 max_qp;
-	u32 max_cq;
-	u32 max_pd;
-	u32 num_mr;
-	u32 num_qp;
-	u32 num_cq;
-	u32 num_pd;
-};
-
-enum nes_hdrct_flags {
-	DDP_LEN_FLAG                    = 0x80,
-	DDP_HDR_FLAG                    = 0x40,
-	RDMA_HDR_FLAG                   = 0x20
-};
-
-enum nes_term_layers {
-	LAYER_RDMA			= 0,
-	LAYER_DDP			= 1,
-	LAYER_MPA			= 2
-};
-
-enum nes_term_error_types {
-	RDMAP_CATASTROPHIC		= 0,
-	RDMAP_REMOTE_PROT		= 1,
-	RDMAP_REMOTE_OP			= 2,
-	DDP_CATASTROPHIC		= 0,
-	DDP_TAGGED_BUFFER		= 1,
-	DDP_UNTAGGED_BUFFER		= 2,
-	DDP_LLP				= 3
-};
-
-enum nes_term_rdma_errors {
-	RDMAP_INV_STAG			= 0x00,
-	RDMAP_INV_BOUNDS		= 0x01,
-	RDMAP_ACCESS			= 0x02,
-	RDMAP_UNASSOC_STAG		= 0x03,
-	RDMAP_TO_WRAP			= 0x04,
-	RDMAP_INV_RDMAP_VER		= 0x05,
-	RDMAP_UNEXPECTED_OP		= 0x06,
-	RDMAP_CATASTROPHIC_LOCAL	= 0x07,
-	RDMAP_CATASTROPHIC_GLOBAL	= 0x08,
-	RDMAP_CANT_INV_STAG		= 0x09,
-	RDMAP_UNSPECIFIED		= 0xff
-};
-
-enum nes_term_ddp_errors {
-	DDP_CATASTROPHIC_LOCAL		= 0x00,
-	DDP_TAGGED_INV_STAG		= 0x00,
-	DDP_TAGGED_BOUNDS		= 0x01,
-	DDP_TAGGED_UNASSOC_STAG		= 0x02,
-	DDP_TAGGED_TO_WRAP		= 0x03,
-	DDP_TAGGED_INV_DDP_VER		= 0x04,
-	DDP_UNTAGGED_INV_QN		= 0x01,
-	DDP_UNTAGGED_INV_MSN_NO_BUF	= 0x02,
-	DDP_UNTAGGED_INV_MSN_RANGE	= 0x03,
-	DDP_UNTAGGED_INV_MO		= 0x04,
-	DDP_UNTAGGED_INV_TOO_LONG	= 0x05,
-	DDP_UNTAGGED_INV_DDP_VER	= 0x06
-};
-
-enum nes_term_mpa_errors {
-	MPA_CLOSED			= 0x01,
-	MPA_CRC				= 0x02,
-	MPA_MARKER			= 0x03,
-	MPA_REQ_RSP			= 0x04,
-};
-
-struct nes_terminate_hdr {
-	u8 layer_etype;
-	u8 error_code;
-	u8 hdrct;
-	u8 rsvd;
-};
-
-/* Used to determine how to fill in terminate error codes */
-#define IWARP_OPCODE_WRITE		0
-#define IWARP_OPCODE_READREQ		1
-#define IWARP_OPCODE_READRSP		2
-#define IWARP_OPCODE_SEND		3
-#define IWARP_OPCODE_SEND_INV		4
-#define IWARP_OPCODE_SEND_SE		5
-#define IWARP_OPCODE_SEND_SE_INV	6
-#define IWARP_OPCODE_TERM		7
-
-/* These values are used only during terminate processing */
-#define TERM_DDP_LEN_TAGGED	14
-#define TERM_DDP_LEN_UNTAGGED	18
-#define TERM_RDMA_LEN		28
-#define RDMA_OPCODE_MASK	0x0f
-#define RDMA_READ_REQ_OPCODE	1
-#define BAD_FRAME_OFFSET	64
-#define CQE_MAJOR_DRV		0x8000
-
-/* Used for link status recheck after interrupt processing */
-#define NES_LINK_RECHECK_DELAY	msecs_to_jiffies(50)
-#define NES_LINK_RECHECK_MAX	60
-
-#endif		/* __NES_HW_H */
diff --git a/drivers/infiniband/hw/nes/nes_mgt.c b/drivers/infiniband/hw/nes/nes_mgt.c
deleted file mode 100644
index 9bdb84d..0000000
--- a/drivers/infiniband/hw/nes/nes_mgt.c
+++ /dev/null
@@ -1,1157 +0,0 @@
-/*
- * Copyright (c) 2006 - 2011 Intel-NE, Inc.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include <linux/skbuff.h>
-#include <linux/etherdevice.h>
-#include <linux/kthread.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <net/tcp.h>
-#include "nes.h"
-#include "nes_mgt.h"
-
-atomic_t pau_qps_created;
-atomic_t pau_qps_destroyed;
-
-static void nes_replenish_mgt_rq(struct nes_vnic_mgt *mgtvnic)
-{
-	unsigned long flags;
-	dma_addr_t bus_address;
-	struct sk_buff *skb;
-	struct nes_hw_nic_rq_wqe *nic_rqe;
-	struct nes_hw_mgt *nesmgt;
-	struct nes_device *nesdev;
-	struct nes_rskb_cb *cb;
-	u32 rx_wqes_posted = 0;
-
-	nesmgt = &mgtvnic->mgt;
-	nesdev = mgtvnic->nesvnic->nesdev;
-	spin_lock_irqsave(&nesmgt->rq_lock, flags);
-	if (nesmgt->replenishing_rq != 0) {
-		if (((nesmgt->rq_size - 1) == atomic_read(&mgtvnic->rx_skbs_needed)) &&
-		    (atomic_read(&mgtvnic->rx_skb_timer_running) == 0)) {
-			atomic_set(&mgtvnic->rx_skb_timer_running, 1);
-			spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
-			mgtvnic->rq_wqes_timer.expires = jiffies + (HZ / 2);      /* 1/2 second */
-			add_timer(&mgtvnic->rq_wqes_timer);
-		} else {
-			spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
-		}
-		return;
-	}
-	nesmgt->replenishing_rq = 1;
-	spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
-	do {
-		skb = dev_alloc_skb(mgtvnic->nesvnic->max_frame_size);
-		if (skb) {
-			skb->dev = mgtvnic->nesvnic->netdev;
-
-			bus_address = pci_map_single(nesdev->pcidev,
-						     skb->data, mgtvnic->nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
-			cb = (struct nes_rskb_cb *)&skb->cb[0];
-			cb->busaddr = bus_address;
-			cb->maplen = mgtvnic->nesvnic->max_frame_size;
-
-			nic_rqe = &nesmgt->rq_vbase[mgtvnic->mgt.rq_head];
-			nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] =
-				cpu_to_le32(mgtvnic->nesvnic->max_frame_size);
-			nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0;
-			nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] =
-				cpu_to_le32((u32)bus_address);
-			nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] =
-				cpu_to_le32((u32)((u64)bus_address >> 32));
-			nesmgt->rx_skb[nesmgt->rq_head] = skb;
-			nesmgt->rq_head++;
-			nesmgt->rq_head &= nesmgt->rq_size - 1;
-			atomic_dec(&mgtvnic->rx_skbs_needed);
-			barrier();
-			if (++rx_wqes_posted == 255) {
-				nes_write32(nesdev->regs + NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesmgt->qp_id);
-				rx_wqes_posted = 0;
-			}
-		} else {
-			spin_lock_irqsave(&nesmgt->rq_lock, flags);
-			if (((nesmgt->rq_size - 1) == atomic_read(&mgtvnic->rx_skbs_needed)) &&
-			    (atomic_read(&mgtvnic->rx_skb_timer_running) == 0)) {
-				atomic_set(&mgtvnic->rx_skb_timer_running, 1);
-				spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
-				mgtvnic->rq_wqes_timer.expires = jiffies + (HZ / 2);      /* 1/2 second */
-				add_timer(&mgtvnic->rq_wqes_timer);
-			} else {
-				spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
-			}
-			break;
-		}
-	} while (atomic_read(&mgtvnic->rx_skbs_needed));
-	barrier();
-	if (rx_wqes_posted)
-		nes_write32(nesdev->regs + NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesmgt->qp_id);
-	nesmgt->replenishing_rq = 0;
-}
-
-/**
- * nes_mgt_rq_wqes_timeout
- */
-static void nes_mgt_rq_wqes_timeout(struct timer_list *t)
-{
-	struct nes_vnic_mgt *mgtvnic = from_timer(mgtvnic, t,
-						       rq_wqes_timer);
-
-	atomic_set(&mgtvnic->rx_skb_timer_running, 0);
-	if (atomic_read(&mgtvnic->rx_skbs_needed))
-		nes_replenish_mgt_rq(mgtvnic);
-}
-
-/**
- * nes_mgt_free_skb - unmap and free skb
- */
-static void nes_mgt_free_skb(struct nes_device *nesdev, struct sk_buff *skb, u32 dir)
-{
-	struct nes_rskb_cb *cb;
-
-	cb = (struct nes_rskb_cb *)&skb->cb[0];
-	pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, dir);
-	cb->busaddr = 0;
-	dev_kfree_skb_any(skb);
-}
-
-/**
- * nes_download_callback - handle download completions
- */
-static void nes_download_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request)
-{
-	struct pau_fpdu_info *fpdu_info = cqp_request->cqp_callback_pointer;
-	struct nes_qp *nesqp = fpdu_info->nesqp;
-	struct sk_buff *skb;
-	int i;
-
-	for (i = 0; i < fpdu_info->frag_cnt; i++) {
-		skb = fpdu_info->frags[i].skb;
-		if (fpdu_info->frags[i].cmplt) {
-			nes_mgt_free_skb(nesdev, skb, PCI_DMA_TODEVICE);
-			nes_rem_ref_cm_node(nesqp->cm_node);
-		}
-	}
-
-	if (fpdu_info->hdr_vbase)
-		pci_free_consistent(nesdev->pcidev, fpdu_info->hdr_len,
-				    fpdu_info->hdr_vbase, fpdu_info->hdr_pbase);
-	kfree(fpdu_info);
-}
-
-/**
- * nes_get_seq - Get the seq, ack_seq and window from the packet
- */
-static u32 nes_get_seq(struct sk_buff *skb, u32 *ack, u16 *wnd, u32 *fin_rcvd, u32 *rst_rcvd)
-{
-	struct nes_rskb_cb *cb = (struct nes_rskb_cb *)&skb->cb[0];
-	struct iphdr *iph = (struct iphdr *)(cb->data_start + ETH_HLEN);
-	struct tcphdr *tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
-
-	*ack = be32_to_cpu(tcph->ack_seq);
-	*wnd = be16_to_cpu(tcph->window);
-	*fin_rcvd = tcph->fin;
-	*rst_rcvd = tcph->rst;
-	return be32_to_cpu(tcph->seq);
-}
-
-/**
- * nes_get_next_skb - Get the next skb based on where current skb is in the queue
- */
-static struct sk_buff *nes_get_next_skb(struct nes_device *nesdev, struct nes_qp *nesqp,
-					struct sk_buff *skb, u32 nextseq, u32 *ack,
-					u16 *wnd, u32 *fin_rcvd, u32 *rst_rcvd)
-{
-	u32 seq;
-	bool processacks;
-	struct sk_buff *old_skb;
-
-	if (skb) {
-		/* Continue processing fpdu */
-		if (skb->next == (struct sk_buff *)&nesqp->pau_list)
-			goto out;
-		skb = skb->next;
-		processacks = false;
-	} else {
-		/* Starting a new one */
-		if (skb_queue_empty(&nesqp->pau_list))
-			goto out;
-		skb = skb_peek(&nesqp->pau_list);
-		processacks = true;
-	}
-
-	while (1) {
-		if (skb_queue_empty(&nesqp->pau_list))
-			goto out;
-
-		seq = nes_get_seq(skb, ack, wnd, fin_rcvd, rst_rcvd);
-		if (seq == nextseq) {
-			if (skb->len || processacks)
-				break;
-		} else if (after(seq, nextseq)) {
-			goto out;
-		}
-
-		old_skb = skb;
-		skb = skb->next;
-		skb_unlink(old_skb, &nesqp->pau_list);
-		nes_mgt_free_skb(nesdev, old_skb, PCI_DMA_TODEVICE);
-		nes_rem_ref_cm_node(nesqp->cm_node);
-		if (skb == (struct sk_buff *)&nesqp->pau_list)
-			goto out;
-	}
-	return skb;
-
-out:
-	return NULL;
-}
-
-/**
- * get_fpdu_info - Find the next complete fpdu and return its fragments.
- */
-static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp,
-			 struct pau_fpdu_info **pau_fpdu_info)
-{
-	struct sk_buff *skb;
-	struct iphdr *iph;
-	struct tcphdr *tcph;
-	struct nes_rskb_cb *cb;
-	struct pau_fpdu_info *fpdu_info = NULL;
-	struct pau_fpdu_frag frags[MAX_FPDU_FRAGS];
-	u32 fpdu_len = 0;
-	u32 tmp_len;
-	int frag_cnt = 0;
-	u32 tot_len;
-	u32 frag_tot;
-	u32 ack;
-	u32 fin_rcvd;
-	u32 rst_rcvd;
-	u16 wnd;
-	int i;
-	int rc = 0;
-
-	*pau_fpdu_info = NULL;
-
-	skb = nes_get_next_skb(nesdev, nesqp, NULL, nesqp->pau_rcv_nxt, &ack, &wnd, &fin_rcvd, &rst_rcvd);
-	if (!skb)
-		goto out;
-
-	cb = (struct nes_rskb_cb *)&skb->cb[0];
-	if (skb->len) {
-		fpdu_len = be16_to_cpu(*(__be16 *) skb->data) + MPA_FRAMING;
-		fpdu_len = (fpdu_len + 3) & 0xfffffffc;
-		tmp_len = fpdu_len;
-
-		/* See if we have all of the fpdu */
-		frag_tot = 0;
-		memset(&frags, 0, sizeof frags);
-		for (i = 0; i < MAX_FPDU_FRAGS; i++) {
-			frags[i].physaddr = cb->busaddr;
-			frags[i].physaddr += skb->data - cb->data_start;
-			frags[i].frag_len = min(tmp_len, skb->len);
-			frags[i].skb = skb;
-			frags[i].cmplt = (skb->len == frags[i].frag_len);
-			frag_tot += frags[i].frag_len;
-			frag_cnt++;
-
-			tmp_len -= frags[i].frag_len;
-			if (tmp_len == 0)
-				break;
-
-			skb = nes_get_next_skb(nesdev, nesqp, skb,
-					       nesqp->pau_rcv_nxt + frag_tot, &ack, &wnd, &fin_rcvd, &rst_rcvd);
-			if (!skb)
-				goto out;
-			if (rst_rcvd) {
-				/* rst received in the middle of fpdu */
-				for (; i >= 0; i--) {
-					skb_unlink(frags[i].skb, &nesqp->pau_list);
-					nes_mgt_free_skb(nesdev, frags[i].skb, PCI_DMA_TODEVICE);
-				}
-				cb = (struct nes_rskb_cb *)&skb->cb[0];
-				frags[0].physaddr = cb->busaddr;
-				frags[0].physaddr += skb->data - cb->data_start;
-				frags[0].frag_len = skb->len;
-				frags[0].skb = skb;
-				frags[0].cmplt = true;
-				frag_cnt = 1;
-				break;
-			}
-
-			cb = (struct nes_rskb_cb *)&skb->cb[0];
-		}
-	} else {
-		/* no data */
-		frags[0].physaddr = cb->busaddr;
-		frags[0].frag_len = 0;
-		frags[0].skb = skb;
-		frags[0].cmplt = true;
-		frag_cnt = 1;
-	}
-
-	/* Found one */
-	fpdu_info = kzalloc(sizeof(*fpdu_info), GFP_ATOMIC);
-	if (!fpdu_info) {
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	fpdu_info->cqp_request = nes_get_cqp_request(nesdev);
-	if (fpdu_info->cqp_request == NULL) {
-		nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n");
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	cb = (struct nes_rskb_cb *)&frags[0].skb->cb[0];
-	iph = (struct iphdr *)(cb->data_start + ETH_HLEN);
-	tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
-	fpdu_info->hdr_len = (((unsigned char *)tcph) + 4 * (tcph->doff)) - cb->data_start;
-	fpdu_info->data_len = fpdu_len;
-	tot_len = fpdu_info->hdr_len + fpdu_len - ETH_HLEN;
-
-	if (frags[0].cmplt) {
-		fpdu_info->hdr_pbase = cb->busaddr;
-		fpdu_info->hdr_vbase = NULL;
-	} else {
-		fpdu_info->hdr_vbase = pci_alloc_consistent(nesdev->pcidev,
-							    fpdu_info->hdr_len, &fpdu_info->hdr_pbase);
-		if (!fpdu_info->hdr_vbase) {
-			nes_debug(NES_DBG_PAU, "Unable to allocate memory for pau first frag\n");
-			rc = -ENOMEM;
-			goto out;
-		}
-
-		/* Copy hdrs, adjusting len and seqnum */
-		memcpy(fpdu_info->hdr_vbase, cb->data_start, fpdu_info->hdr_len);
-		iph = (struct iphdr *)(fpdu_info->hdr_vbase + ETH_HLEN);
-		tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
-	}
-
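-	/* Rewrite the header: new total length, loopback source address
-	 * (127.0.0.1) and the current seq/ack/window values */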
-	iph->tot_len = cpu_to_be16(tot_len);
-	iph->saddr = cpu_to_be32(0x7f000001);
-
-	tcph->seq = cpu_to_be32(nesqp->pau_rcv_nxt);
-	tcph->ack_seq = cpu_to_be32(ack);
-	tcph->window = cpu_to_be16(wnd);
-
-	nesqp->pau_rcv_nxt += fpdu_len + fin_rcvd;
-
-	memcpy(fpdu_info->frags, frags, sizeof(fpdu_info->frags));
-	fpdu_info->frag_cnt = frag_cnt;
-	fpdu_info->nesqp = nesqp;
-	*pau_fpdu_info = fpdu_info;
-
-	/* Update skb's for next pass */
-	for (i = 0; i < frag_cnt; i++) {
-		cb = (struct nes_rskb_cb *)&frags[i].skb->cb[0];
-		skb_pull(frags[i].skb, frags[i].frag_len);
-
-		if (frags[i].skb->len == 0) {
-			/* Pull skb off the list - it will be freed in the callback */
-			if (!skb_queue_empty(&nesqp->pau_list))
-				skb_unlink(frags[i].skb, &nesqp->pau_list);
-		} else {
-			/* Last skb still has data so update the seq */
-			iph = (struct iphdr *)(cb->data_start + ETH_HLEN);
-			tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
-			tcph->seq = cpu_to_be32(nesqp->pau_rcv_nxt);
-		}
-	}
-
-out:
-	if (rc) {
-		if (fpdu_info) {
-			if (fpdu_info->cqp_request)
-				nes_put_cqp_request(nesdev, fpdu_info->cqp_request);
-			kfree(fpdu_info);
-		}
-	}
-	return rc;
-}
-
-/**
- * forward_fpdus - send complete fpdus, one at a time
- */
-static int forward_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp)
-{
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct pau_fpdu_info *fpdu_info;
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	struct nes_cqp_request *cqp_request;
-	unsigned long flags;
-	u64 u64tmp;
-	u32 u32tmp;
-	int rc;
-
-	while (1) {
-		spin_lock_irqsave(&nesqp->pau_lock, flags);
-		rc = get_fpdu_info(nesdev, nesqp, &fpdu_info);
-		if (rc || (fpdu_info == NULL)) {
-			spin_unlock_irqrestore(&nesqp->pau_lock, flags);
-			return rc;
-		}
-
-		cqp_request = fpdu_info->cqp_request;
-		cqp_wqe = &cqp_request->cqp_wqe;
-		nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_DL_OPCODE_IDX,
-				    NES_CQP_DOWNLOAD_SEGMENT |
-				    (((u32)nesvnic->logical_port) << NES_CQP_OP_LOGICAL_PORT_SHIFT));
-
-		u32tmp = fpdu_info->hdr_len << 16;
-		u32tmp |= fpdu_info->hdr_len + (u32)fpdu_info->data_len;
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_DL_LENGTH_0_TOTAL_IDX,
-				    u32tmp);
-
-		u32tmp = (fpdu_info->frags[1].frag_len << 16) | fpdu_info->frags[0].frag_len;
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_LENGTH_2_1_IDX,
-				    u32tmp);
-
-		u32tmp = (fpdu_info->frags[3].frag_len << 16) | fpdu_info->frags[2].frag_len;
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_LENGTH_4_3_IDX,
-				    u32tmp);
-
-		u64tmp = (u64)fpdu_info->hdr_pbase;
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX,
-				    lower_32_bits(u64tmp));
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_HIGH_IDX,
-				    upper_32_bits(u64tmp));
-
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX,
-				    lower_32_bits(fpdu_info->frags[0].physaddr));
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_HIGH_IDX,
-				    upper_32_bits(fpdu_info->frags[0].physaddr));
-
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG2_LOW_IDX,
-				    lower_32_bits(fpdu_info->frags[1].physaddr));
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG2_HIGH_IDX,
-				    upper_32_bits(fpdu_info->frags[1].physaddr));
-
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG3_LOW_IDX,
-				    lower_32_bits(fpdu_info->frags[2].physaddr));
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG3_HIGH_IDX,
-				    upper_32_bits(fpdu_info->frags[2].physaddr));
-
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG4_LOW_IDX,
-				    lower_32_bits(fpdu_info->frags[3].physaddr));
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG4_HIGH_IDX,
-				    upper_32_bits(fpdu_info->frags[3].physaddr));
-
-		cqp_request->cqp_callback_pointer = fpdu_info;
-		cqp_request->callback = 1;
-		cqp_request->cqp_callback = nes_download_callback;
-
-		atomic_set(&cqp_request->refcount, 1);
-		nes_post_cqp_request(nesdev, cqp_request);
-		spin_unlock_irqrestore(&nesqp->pau_lock, flags);
-	}
-
-	return 0;
-}
-
-static void process_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp)
-{
-	int again = 1;
-	unsigned long flags;
-
-	do {
-		/* Ignore rc - if it failed, tcp retries will cause it to try again */
-		forward_fpdus(nesvnic, nesqp);
-
-		spin_lock_irqsave(&nesqp->pau_lock, flags);
-		if (nesqp->pau_pending) {
-			nesqp->pau_pending = 0;
-		} else {
-			nesqp->pau_busy = 0;
-			again = 0;
-		}
-
-		spin_unlock_irqrestore(&nesqp->pau_lock, flags);
-	} while (again);
-}
-
-/**
- * queue_fpdus - Handle fpdu's that hw passed up to sw
- */
-static void queue_fpdus(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp)
-{
-	struct sk_buff *tmpskb;
-	struct nes_rskb_cb *cb;
-	struct iphdr *iph;
-	struct tcphdr *tcph;
-	unsigned char *tcph_end;
-	u32 rcv_nxt;
-	u32 rcv_wnd;
-	u32 seqnum;
-	u32 len;
-	bool process_it = false;
-	unsigned long flags;
-
-	/* Move data ptr to after tcp header */
-	iph = (struct iphdr *)skb->data;
-	tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
-	seqnum = be32_to_cpu(tcph->seq);
-	tcph_end = (((char *)tcph) + (4 * tcph->doff));
-
-	len = be16_to_cpu(iph->tot_len);
-	if (skb->len > len)
-		skb_trim(skb, len);
-	skb_pull(skb, tcph_end - skb->data);
-
-	/* Initialize tracking values */
-	cb = (struct nes_rskb_cb *)&skb->cb[0];
-	cb->seqnum = seqnum;
-
-	/* Make sure data is in the receive window */
-	rcv_nxt = nesqp->pau_rcv_nxt;
-	rcv_wnd = le32_to_cpu(nesqp->nesqp_context->rcv_wnd);
-	if (!between(seqnum, rcv_nxt, (rcv_nxt + rcv_wnd))) {
-		nes_mgt_free_skb(nesvnic->nesdev, skb, PCI_DMA_TODEVICE);
-		nes_rem_ref_cm_node(nesqp->cm_node);
-		return;
-	}
-
-	spin_lock_irqsave(&nesqp->pau_lock, flags);
-
-	if (nesqp->pau_busy)
-		nesqp->pau_pending = 1;
-	else
-		nesqp->pau_busy = 1;
-
-	/* Queue skb by sequence number */
-	if (skb_queue_len(&nesqp->pau_list) == 0) {
-		skb_queue_head(&nesqp->pau_list, skb);
-	} else {
-		tmpskb = nesqp->pau_list.next;
-		while (tmpskb != (struct sk_buff *)&nesqp->pau_list) {
-			cb = (struct nes_rskb_cb *)&tmpskb->cb[0];
-			if (before(seqnum, cb->seqnum))
-				break;
-			tmpskb = tmpskb->next;
-		}
-		skb_insert(tmpskb, skb, &nesqp->pau_list);
-	}
-	if (nesqp->pau_state == PAU_READY)
-		process_it = true;
-	spin_unlock_irqrestore(&nesqp->pau_lock, flags);
-
-	if (process_it)
-		process_fpdus(nesvnic, nesqp);
-
-	return;
-}
-
-/**
- * mgt_thread - Handle mgt skbs in a safe context
- */
-static int mgt_thread(void *context)
-{
-	struct nes_vnic *nesvnic = context;
-	struct sk_buff *skb;
-	struct nes_rskb_cb *cb;
-
-	while (!kthread_should_stop()) {
-		wait_event_interruptible(nesvnic->mgt_wait_queue,
-					 skb_queue_len(&nesvnic->mgt_skb_list) || kthread_should_stop());
-		while ((skb_queue_len(&nesvnic->mgt_skb_list)) && !kthread_should_stop()) {
-			skb = skb_dequeue(&nesvnic->mgt_skb_list);
-			cb = (struct nes_rskb_cb *)&skb->cb[0];
-			cb->data_start = skb->data - ETH_HLEN;
-			cb->busaddr = pci_map_single(nesvnic->nesdev->pcidev, cb->data_start,
-						     nesvnic->max_frame_size, PCI_DMA_TODEVICE);
-			queue_fpdus(skb, nesvnic, cb->nesqp);
-		}
-	}
-
-	/* Closing down so delete any entries on the queue */
-	while (skb_queue_len(&nesvnic->mgt_skb_list)) {
-		skb = skb_dequeue(&nesvnic->mgt_skb_list);
-		cb = (struct nes_rskb_cb *)&skb->cb[0];
-		nes_rem_ref_cm_node(cb->nesqp->cm_node);
-		dev_kfree_skb_any(skb);
-	}
-	return 0;
-}
-
-/**
- * nes_queue_mgt_skbs - Queue skb so it can be handled in a thread context
- */
-void nes_queue_mgt_skbs(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp)
-{
-	struct nes_rskb_cb *cb;
-
-	cb = (struct nes_rskb_cb *)&skb->cb[0];
-	cb->nesqp = nesqp;
-	skb_queue_tail(&nesvnic->mgt_skb_list, skb);
-	wake_up_interruptible(&nesvnic->mgt_wait_queue);
-}
-
-void nes_destroy_pau_qp(struct nes_device *nesdev, struct nes_qp *nesqp)
-{
-	struct sk_buff *skb;
-	unsigned long flags;
-	atomic_inc(&pau_qps_destroyed);
-
-	/* Free packets that have not yet been forwarded */
-	/* Lock is acquired by skb_dequeue when removing the skb */
-	spin_lock_irqsave(&nesqp->pau_lock, flags);
-	while (skb_queue_len(&nesqp->pau_list)) {
-		skb = skb_dequeue(&nesqp->pau_list);
-		nes_mgt_free_skb(nesdev, skb, PCI_DMA_TODEVICE);
-		nes_rem_ref_cm_node(nesqp->cm_node);
-	}
-	spin_unlock_irqrestore(&nesqp->pau_lock, flags);
-}
-
-static void nes_chg_qh_handler(struct nes_device *nesdev, struct nes_cqp_request *cqp_request)
-{
-	struct pau_qh_chg *qh_chg = cqp_request->cqp_callback_pointer;
-	struct nes_cqp_request *new_request;
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	struct nes_adapter *nesadapter;
-	struct nes_qp *nesqp;
-	struct nes_v4_quad nes_quad;
-	u32 crc_value;
-	u64 u64temp;
-
-	nesadapter = nesdev->nesadapter;
-	nesqp = qh_chg->nesqp;
-
-	/* Should we handle the bad completion */
-	if (cqp_request->major_code)
-		WARN(1, PFX "Invalid cqp_request major_code=0x%x\n",
-		       cqp_request->major_code);
-
-	switch (nesqp->pau_state) {
-	case PAU_DEL_QH:
-		/* Old hash code deleted, now set the new one */
-		nesqp->pau_state = PAU_ADD_LB_QH;
-		new_request = nes_get_cqp_request(nesdev);
-		if (new_request == NULL) {
-			nes_debug(NES_DBG_PAU, "Failed to get a new_request.\n");
-			WARN_ON(1);
-			return;
-		}
-
-		memset(&nes_quad, 0, sizeof(nes_quad));
-		nes_quad.DstIpAdrIndex =
-			cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24);
-		nes_quad.SrcIpadr = cpu_to_be32(0x7f000001);
-		nes_quad.TcpPorts[0] = swab16(nesqp->nesqp_context->tcpPorts[1]);
-		nes_quad.TcpPorts[1] = swab16(nesqp->nesqp_context->tcpPorts[0]);
-
-		/* Produce hash key */
-		crc_value = get_crc_value(&nes_quad);
-		nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff);
-		nes_debug(NES_DBG_PAU, "new HTE Index = 0x%08X, CRC = 0x%08X\n",
-			  nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask);
-
-		nesqp->hte_index &= nesadapter->hte_index_mask;
-		nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index);
-		nesqp->nesqp_context->ip0 = cpu_to_le32(0x7f000001);
-		nesqp->nesqp_context->rcv_nxt = cpu_to_le32(nesqp->pau_rcv_nxt);
-
-		cqp_wqe = &new_request->cqp_wqe;
-		nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-		set_wqe_32bit_value(cqp_wqe->wqe_words,
-				    NES_CQP_WQE_OPCODE_IDX, NES_CQP_MANAGE_QUAD_HASH |
-				    NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_CONTEXT_VALID | NES_CQP_QP_IWARP_STATE_RTS);
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
-		u64temp = (u64)nesqp->nesqp_context_pbase;
-		set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
-
-		nes_debug(NES_DBG_PAU, "Waiting for CQP completion for adding the quad hash.\n");
-
-		new_request->cqp_callback_pointer = qh_chg;
-		new_request->callback = 1;
-		new_request->cqp_callback = nes_chg_qh_handler;
-		atomic_set(&new_request->refcount, 1);
-		nes_post_cqp_request(nesdev, new_request);
-		break;
-
-	case PAU_ADD_LB_QH:
-		/* Start processing the queued fpdu's */
-		nesqp->pau_state = PAU_READY;
-		process_fpdus(qh_chg->nesvnic, qh_chg->nesqp);
-		kfree(qh_chg);
-		break;
-	}
-}
-
-/**
- * nes_change_quad_hash
- */
-static int nes_change_quad_hash(struct nes_device *nesdev,
-				struct nes_vnic *nesvnic, struct nes_qp *nesqp)
-{
-	struct nes_cqp_request *cqp_request = NULL;
-	struct pau_qh_chg *qh_chg = NULL;
-	u64 u64temp;
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	int ret = 0;
-
-	cqp_request = nes_get_cqp_request(nesdev);
-	if (cqp_request == NULL) {
-		nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n");
-		ret = -ENOMEM;
-		goto chg_qh_err;
-	}
-
-	qh_chg = kmalloc(sizeof *qh_chg, GFP_ATOMIC);
-	if (!qh_chg) {
-		ret = -ENOMEM;
-		goto chg_qh_err;
-	}
-	qh_chg->nesdev = nesdev;
-	qh_chg->nesvnic = nesvnic;
-	qh_chg->nesqp = nesqp;
-	nesqp->pau_state = PAU_DEL_QH;
-
-	cqp_wqe = &cqp_request->cqp_wqe;
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-	set_wqe_32bit_value(cqp_wqe->wqe_words,
-			    NES_CQP_WQE_OPCODE_IDX, NES_CQP_MANAGE_QUAD_HASH | NES_CQP_QP_DEL_HTE |
-			    NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_CONTEXT_VALID | NES_CQP_QP_IWARP_STATE_RTS);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
-	u64temp = (u64)nesqp->nesqp_context_pbase;
-	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
-
-	nes_debug(NES_DBG_PAU, "Waiting for CQP completion for deleting the quad hash.\n");
-
-	cqp_request->cqp_callback_pointer = qh_chg;
-	cqp_request->callback = 1;
-	cqp_request->cqp_callback = nes_chg_qh_handler;
-	atomic_set(&cqp_request->refcount, 1);
-	nes_post_cqp_request(nesdev, cqp_request);
-
-	return ret;
-
-chg_qh_err:
-	kfree(qh_chg);
-	if (cqp_request)
-		nes_put_cqp_request(nesdev, cqp_request);
-	return ret;
-}
-
-/**
- * nes_mgt_ce_handler
- * This management code deals with any packed and unaligned (PAU) FPDUs
- * that the hardware cannot handle.
- */
-static void nes_mgt_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq)
-{
-	struct nes_vnic_mgt *mgtvnic = container_of(cq, struct nes_vnic_mgt, mgt_cq);
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	u32 head;
-	u32 cq_size;
-	u32 cqe_count = 0;
-	u32 cqe_misc;
-	u32 qp_id = 0;
-	u32 skbs_needed;
-	unsigned long context;
-	struct nes_qp *nesqp;
-	struct sk_buff *rx_skb;
-	struct nes_rskb_cb *cb;
-
-	head = cq->cq_head;
-	cq_size = cq->cq_size;
-
-	while (1) {
-		cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]);
-		if (!(cqe_misc & NES_NIC_CQE_VALID))
-			break;
-
-		nesqp = NULL;
-		if (cqe_misc & NES_NIC_CQE_ACCQP_VALID) {
-			qp_id = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_ACCQP_ID_IDX]);
-			qp_id &= 0x001fffff;
-			if (qp_id < nesadapter->max_qp) {
-				context = (unsigned long)nesadapter->qp_table[qp_id - NES_FIRST_QPN];
-				nesqp = (struct nes_qp *)context;
-			}
-		}
-
-		if (nesqp) {
-			if (nesqp->pau_mode == false) {
-				nesqp->pau_mode = true; /* First time for this qp */
-				nesqp->pau_rcv_nxt = le32_to_cpu(
-					cq->cq_vbase[head].cqe_words[NES_NIC_CQE_HASH_RCVNXT]);
-				skb_queue_head_init(&nesqp->pau_list);
-				spin_lock_init(&nesqp->pau_lock);
-				atomic_inc(&pau_qps_created);
-				nes_change_quad_hash(nesdev, mgtvnic->nesvnic, nesqp);
-			}
-
-			rx_skb = mgtvnic->mgt.rx_skb[mgtvnic->mgt.rq_tail];
-			rx_skb->len = 0;
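-			/* Low 16 bits of the CQE misc word hold the received frame length */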
-			skb_put(rx_skb, cqe_misc & 0x0000ffff);
-			rx_skb->protocol = eth_type_trans(rx_skb, mgtvnic->nesvnic->netdev);
-			cb = (struct nes_rskb_cb *)&rx_skb->cb[0];
-			pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, PCI_DMA_FROMDEVICE);
-			cb->busaddr = 0;
-			mgtvnic->mgt.rq_tail++;
-			mgtvnic->mgt.rq_tail &= mgtvnic->mgt.rq_size - 1;
-
-			nes_add_ref_cm_node(nesqp->cm_node);
-			nes_queue_mgt_skbs(rx_skb, mgtvnic->nesvnic, nesqp);
-		} else {
-			printk(KERN_ERR PFX "Invalid QP %d for packed/unaligned handling\n", qp_id);
-		}
-
-		cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0;
-		cqe_count++;
-		if (++head >= cq_size)
-			head = 0;
-
-		if (cqe_count == 255) {
-			/* Replenish mgt CQ */
-			nes_write32(nesdev->regs + NES_CQE_ALLOC, cq->cq_number | (cqe_count << 16));
-			nesdev->currcq_count += cqe_count;
-			cqe_count = 0;
-		}
-
-		skbs_needed = atomic_inc_return(&mgtvnic->rx_skbs_needed);
-		if (skbs_needed > (mgtvnic->mgt.rq_size >> 1))
-			nes_replenish_mgt_rq(mgtvnic);
-	}
-
-	cq->cq_head = head;
-	nes_write32(nesdev->regs + NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
-		    cq->cq_number | (cqe_count << 16));
-	nes_read32(nesdev->regs + NES_CQE_ALLOC);
-	nesdev->currcq_count += cqe_count;
-}
-
-/**
- * nes_init_mgt_qp
- */
-int nes_init_mgt_qp(struct nes_device *nesdev, struct net_device *netdev, struct nes_vnic *nesvnic)
-{
-	struct nes_vnic_mgt *mgtvnic;
-	u32 counter;
-	void *vmem;
-	dma_addr_t pmem;
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	u32 cqp_head;
-	unsigned long flags;
-	struct nes_hw_nic_qp_context *mgt_context;
-	u64 u64temp;
-	struct nes_hw_nic_rq_wqe *mgt_rqe;
-	struct sk_buff *skb;
-	u32 wqe_count;
-	struct nes_rskb_cb *cb;
-	u32 mgt_mem_size;
-	void *mgt_vbase;
-	dma_addr_t mgt_pbase;
-	int i;
-	int ret;
-
-	/* Allocate space for all mgt QPs at once */
-	mgtvnic = kcalloc(NES_MGT_QP_COUNT, sizeof(struct nes_vnic_mgt),
-			  GFP_KERNEL);
-	if (!mgtvnic)
-		return -ENOMEM;
-
-	/* Allocate fragment, RQ, and CQ; Reuse CEQ based on the PCI function */
-	/* We are not sending from this NIC so sq is not allocated */
-	mgt_mem_size = 256 +
-		       (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe)) +
-		       (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_cqe)) +
-		       sizeof(struct nes_hw_nic_qp_context);
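-	/* Round the per-QP descriptor ring allocation up to a whole page */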
-	mgt_mem_size = (mgt_mem_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
-	mgt_vbase = pci_alloc_consistent(nesdev->pcidev, NES_MGT_QP_COUNT * mgt_mem_size, &mgt_pbase);
-	if (!mgt_vbase) {
-		kfree(mgtvnic);
-		nes_debug(NES_DBG_INIT, "Unable to allocate memory for mgt host descriptor rings\n");
-		return -ENOMEM;
-	}
-
-	nesvnic->mgt_mem_size = NES_MGT_QP_COUNT * mgt_mem_size;
-	nesvnic->mgt_vbase = mgt_vbase;
-	nesvnic->mgt_pbase = mgt_pbase;
-
-	skb_queue_head_init(&nesvnic->mgt_skb_list);
-	init_waitqueue_head(&nesvnic->mgt_wait_queue);
-	nesvnic->mgt_thread = kthread_run(mgt_thread, nesvnic, "nes_mgt_thread");
-
-	for (i = 0; i < NES_MGT_QP_COUNT; i++) {
-		mgtvnic->nesvnic = nesvnic;
-		mgtvnic->mgt.qp_id = nesdev->mac_index + NES_MGT_QP_OFFSET + i;
-		memset(mgt_vbase, 0, mgt_mem_size);
-		nes_debug(NES_DBG_INIT, "Allocated mgt QP structures at %p (phys = %016lX), size = %u.\n",
-			  mgt_vbase, (unsigned long)mgt_pbase, mgt_mem_size);
-
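-		/* Align the RQ base (virtual and physical) to a 256-byte boundary */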
-		vmem = (void *)(((unsigned long)mgt_vbase + (256 - 1)) &
-				~(unsigned long)(256 - 1));
-		pmem = (dma_addr_t)(((unsigned long long)mgt_pbase + (256 - 1)) &
-				    ~(unsigned long long)(256 - 1));
-
-		spin_lock_init(&mgtvnic->mgt.rq_lock);
-
-		/* setup the RQ */
-		mgtvnic->mgt.rq_vbase = vmem;
-		mgtvnic->mgt.rq_pbase = pmem;
-		mgtvnic->mgt.rq_head = 0;
-		mgtvnic->mgt.rq_tail = 0;
-		mgtvnic->mgt.rq_size = NES_MGT_WQ_COUNT;
-
-		/* setup the CQ */
-		vmem += (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe));
-		pmem += (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe));
-
-		mgtvnic->mgt_cq.cq_number = mgtvnic->mgt.qp_id;
-		mgtvnic->mgt_cq.cq_vbase = vmem;
-		mgtvnic->mgt_cq.cq_pbase = pmem;
-		mgtvnic->mgt_cq.cq_head = 0;
-		mgtvnic->mgt_cq.cq_size = NES_MGT_WQ_COUNT;
-
-		mgtvnic->mgt_cq.ce_handler = nes_mgt_ce_handler;
-
-		/* Send CreateCQ request to CQP */
-		spin_lock_irqsave(&nesdev->cqp.lock, flags);
-		cqp_head = nesdev->cqp.sq_head;
-
-		cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-		nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-
-		cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(
-			NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
-			((u32)mgtvnic->mgt_cq.cq_size << 16));
-		cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(
-			mgtvnic->mgt_cq.cq_number | ((u32)nesdev->ceq_index << 16));
-		u64temp = (u64)mgtvnic->mgt_cq.cq_pbase;
-		set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
-		cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0;
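-		/* The CQ context pointer is stored shifted right one bit,
-		 * split across the low and high context words */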
-		u64temp = (unsigned long)&mgtvnic->mgt_cq;
-		cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = cpu_to_le32((u32)(u64temp >> 1));
-		cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
-			cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
-		cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
-
-		if (++cqp_head >= nesdev->cqp.sq_size)
-			cqp_head = 0;
-		cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-		nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-
-		/* Send CreateQP request to CQP */
-		mgt_context = (void *)(&mgtvnic->mgt_cq.cq_vbase[mgtvnic->mgt_cq.cq_size]);
-		mgt_context->context_words[NES_NIC_CTX_MISC_IDX] =
-			cpu_to_le32((u32)NES_MGT_CTX_SIZE |
-				    ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12));
-		nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n",
-			  nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE),
-			  nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE));
-		if (nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0)
-			mgt_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE);
-
-		u64temp = (u64)mgtvnic->mgt.rq_pbase;
-		mgt_context->context_words[NES_NIC_CTX_SQ_LOW_IDX] = cpu_to_le32((u32)u64temp);
-		mgt_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32));
-		u64temp = (u64)mgtvnic->mgt.rq_pbase;
-		mgt_context->context_words[NES_NIC_CTX_RQ_LOW_IDX] = cpu_to_le32((u32)u64temp);
-		mgt_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32));
-
-		cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP |
-									 NES_CQP_QP_TYPE_NIC);
-		cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(mgtvnic->mgt.qp_id);
-		u64temp = (u64)mgtvnic->mgt_cq.cq_pbase +
-			  (mgtvnic->mgt_cq.cq_size * sizeof(struct nes_hw_nic_cqe));
-		set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
-
-		if (++cqp_head >= nesdev->cqp.sq_size)
-			cqp_head = 0;
-		nesdev->cqp.sq_head = cqp_head;
-
-		barrier();
-
-		/* Ring doorbell (2 WQEs) */
-		nes_write32(nesdev->regs + NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id);
-
-		spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-		nes_debug(NES_DBG_INIT, "Waiting for create MGT QP%u to complete.\n",
-			  mgtvnic->mgt.qp_id);
-
-		ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head),
-					 NES_EVENT_TIMEOUT);
-		nes_debug(NES_DBG_INIT, "Create MGT QP%u completed, wait_event_timeout ret = %u.\n",
-			  mgtvnic->mgt.qp_id, ret);
-		if (!ret) {
-			nes_debug(NES_DBG_INIT, "MGT QP%u create timeout expired\n", mgtvnic->mgt.qp_id);
-			if (i == 0) {
-				pci_free_consistent(nesdev->pcidev, nesvnic->mgt_mem_size, nesvnic->mgt_vbase,
-						    nesvnic->mgt_pbase);
-				kfree(mgtvnic);
-			} else {
-				nes_destroy_mgt(nesvnic);
-			}
-			return -EIO;
-		}
-
-		/* Populate the RQ */
-		for (counter = 0; counter < (NES_MGT_WQ_COUNT - 1); counter++) {
-			skb = dev_alloc_skb(nesvnic->max_frame_size);
-			if (!skb) {
-				nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name);
-				return -ENOMEM;
-			}
-
-			skb->dev = netdev;
-
-			pmem = pci_map_single(nesdev->pcidev, skb->data,
-					      nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
-			cb = (struct nes_rskb_cb *)&skb->cb[0];
-			cb->busaddr = pmem;
-			cb->maplen = nesvnic->max_frame_size;
-
-			mgt_rqe = &mgtvnic->mgt.rq_vbase[counter];
-			mgt_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32((u32)nesvnic->max_frame_size);
-			mgt_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0;
-			mgt_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = cpu_to_le32((u32)pmem);
-			mgt_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32));
-			mgtvnic->mgt.rx_skb[counter] = skb;
-		}
-
-		timer_setup(&mgtvnic->rq_wqes_timer, nes_mgt_rq_wqes_timeout,
-			    0);
-
-		wqe_count = NES_MGT_WQ_COUNT - 1;
-		mgtvnic->mgt.rq_head = wqe_count;
-		barrier();
-		do {
-			counter = min(wqe_count, ((u32)255));
-			wqe_count -= counter;
-			nes_write32(nesdev->regs + NES_WQE_ALLOC, (counter << 24) | mgtvnic->mgt.qp_id);
-		} while (wqe_count);
-
-		nes_write32(nesdev->regs + NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
-			    mgtvnic->mgt_cq.cq_number);
-		nes_read32(nesdev->regs + NES_CQE_ALLOC);
-
-		mgt_vbase += mgt_mem_size;
-		mgt_pbase += mgt_mem_size;
-		nesvnic->mgtvnic[i] = mgtvnic++;
-	}
-	return 0;
-}
-
-
-void nes_destroy_mgt(struct nes_vnic *nesvnic)
-{
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_vnic_mgt *mgtvnic;
-	struct nes_vnic_mgt *first_mgtvnic;
-	unsigned long flags;
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	u32 cqp_head;
-	struct sk_buff *rx_skb;
-	int i;
-	int ret;
-
-	kthread_stop(nesvnic->mgt_thread);
-
-	/* Free remaining NIC receive buffers */
-	first_mgtvnic = nesvnic->mgtvnic[0];
-	for (i = 0; i < NES_MGT_QP_COUNT; i++) {
-		mgtvnic = nesvnic->mgtvnic[i];
-		if (mgtvnic == NULL)
-			continue;
-
-		while (mgtvnic->mgt.rq_head != mgtvnic->mgt.rq_tail) {
-			rx_skb = mgtvnic->mgt.rx_skb[mgtvnic->mgt.rq_tail];
-			nes_mgt_free_skb(nesdev, rx_skb, PCI_DMA_FROMDEVICE);
-			mgtvnic->mgt.rq_tail++;
-			mgtvnic->mgt.rq_tail &= (mgtvnic->mgt.rq_size - 1);
-		}
-
-		spin_lock_irqsave(&nesdev->cqp.lock, flags);
-
-		/* Destroy NIC QP */
-		cqp_head = nesdev->cqp.sq_head;
-		cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-		nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-				    (NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC));
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
-				    mgtvnic->mgt.qp_id);
-
-		if (++cqp_head >= nesdev->cqp.sq_size)
-			cqp_head = 0;
-
-		cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-
-		/* Destroy NIC CQ */
-		nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-				    (NES_CQP_DESTROY_CQ | ((u32)mgtvnic->mgt_cq.cq_size << 16)));
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
-				    (mgtvnic->mgt_cq.cq_number | ((u32)nesdev->ceq_index << 16)));
-
-		if (++cqp_head >= nesdev->cqp.sq_size)
-			cqp_head = 0;
-
-		nesdev->cqp.sq_head = cqp_head;
-		barrier();
-
-		/* Ring doorbell (2 WQEs) */
-		nes_write32(nesdev->regs + NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id);
-
-		spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-		nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u,"
-			  " cqp.sq_tail=%u, cqp.sq_size=%u\n",
-			  cqp_head, nesdev->cqp.sq_head,
-			  nesdev->cqp.sq_tail, nesdev->cqp.sq_size);
-
-		ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head),
-					 NES_EVENT_TIMEOUT);
-
-		nes_debug(NES_DBG_SHUTDOWN, "Destroy MGT QP returned, wait_event_timeout ret = %u, cqp_head=%u,"
-			  " cqp.sq_head=%u, cqp.sq_tail=%u\n",
-			  ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail);
-		if (!ret)
-			nes_debug(NES_DBG_SHUTDOWN, "MGT QP%u destroy timeout expired\n",
-				  mgtvnic->mgt.qp_id);
-
-		nesvnic->mgtvnic[i] = NULL;
-	}
-
-	if (nesvnic->mgt_vbase) {
-		pci_free_consistent(nesdev->pcidev, nesvnic->mgt_mem_size, nesvnic->mgt_vbase,
-				    nesvnic->mgt_pbase);
-		nesvnic->mgt_vbase = NULL;
-		nesvnic->mgt_pbase = 0;
-	}
-
-	kfree(first_mgtvnic);
-}
diff --git a/drivers/infiniband/hw/nes/nes_mgt.h b/drivers/infiniband/hw/nes/nes_mgt.h
deleted file mode 100644
index 4f7f701..0000000
--- a/drivers/infiniband/hw/nes/nes_mgt.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
-* Copyright (c) 2006 - 2011 Intel-NE, Inc.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenIB.org BSD license below:
-*
-*     Redistribution and use in source and binary forms, with or
-*     without modification, are permitted provided that the following
-*     conditions are met:
-*
-*      - Redistributions of source code must retain the above
-*        copyright notice, this list of conditions and the following
-*        disclaimer.
-*
-*      - Redistributions in binary form must reproduce the above
-*        copyright notice, this list of conditions and the following
-*        disclaimer in the documentation and/or other materials
-*        provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-
-#ifndef __NES_MGT_H
-#define __NES_MGT_H
-
-#define MPA_FRAMING 6	/* length is 2 bytes, crc is 4 bytes */
-
-int nes_init_mgt_qp(struct nes_device *nesdev, struct net_device *netdev, struct nes_vnic *nesvnic);
-void nes_queue_mgt_skbs(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp);
-void nes_destroy_mgt(struct nes_vnic *nesvnic);
-void nes_destroy_pau_qp(struct nes_device *nesdev, struct nes_qp *nesqp);
-
-struct nes_hw_mgt {
-	struct nes_hw_nic_rq_wqe *rq_vbase;	/* virtual address of rq */
-	dma_addr_t rq_pbase;			/* PCI memory for host rings */
-	struct sk_buff *rx_skb[NES_NIC_WQ_SIZE];
-	u16 qp_id;
-	u16 sq_head;
-	u16 rq_head;
-	u16 rq_tail;
-	u16 rq_size;
-	u8 replenishing_rq;
-	u8 reserved;
-	spinlock_t rq_lock;
-};
-
-struct nes_vnic_mgt {
-	struct nes_vnic        *nesvnic;
-	struct nes_hw_mgt      mgt;
-	struct nes_hw_nic_cq   mgt_cq;
-	atomic_t               rx_skbs_needed;
-	struct timer_list      rq_wqes_timer;
-	atomic_t               rx_skb_timer_running;
-};
-
-#define MAX_FPDU_FRAGS 4
-struct pau_fpdu_frag {
-	struct sk_buff         *skb;
-	u64                    physaddr;
-	u32                    frag_len;
-	bool                   cmplt;
-};
-
-struct pau_fpdu_info {
-	struct nes_qp          *nesqp;
-	struct nes_cqp_request *cqp_request;
-	void                   *hdr_vbase;
-	dma_addr_t             hdr_pbase;
-	int                    hdr_len;
-	u16                    data_len;
-	u16                    frag_cnt;
-	struct pau_fpdu_frag   frags[MAX_FPDU_FRAGS];
-};
-
-enum pau_qh_state {
-	PAU_DEL_QH,
-	PAU_ADD_LB_QH,
-	PAU_READY
-};
-
-struct pau_qh_chg {
-	struct nes_device *nesdev;
-	struct nes_vnic *nesvnic;
-	struct nes_qp *nesqp;
-};
-
-#endif          /* __NES_MGT_H */
diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c
deleted file mode 100644
index 61014e2..0000000
--- a/drivers/infiniband/hw/nes/nes_nic.c
+++ /dev/null
@@ -1,1872 +0,0 @@
-/*
- * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/if_arp.h>
-#include <linux/if_vlan.h>
-#include <linux/ethtool.h>
-#include <linux/slab.h>
-#include <net/tcp.h>
-
-#include <net/inet_common.h>
-#include <linux/inet.h>
-
-#include "nes.h"
-
-static struct nic_qp_map nic_qp_mapping_0[] = {
-	{16,0,0,1},{24,4,0,0},{28,8,0,0},{32,12,0,0},
-	{20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0},
-	{18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0},
-	{22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0}
-};
-
-static struct nic_qp_map nic_qp_mapping_1[] = {
-	{18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0},
-	{22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0}
-};
-
-static struct nic_qp_map nic_qp_mapping_2[] = {
-	{20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0}
-};
-
-static struct nic_qp_map nic_qp_mapping_3[] = {
-	{22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0}
-};
-
-static struct nic_qp_map nic_qp_mapping_4[] = {
-	{28,8,0,0},{32,12,0,0}
-};
-
-static struct nic_qp_map nic_qp_mapping_5[] = {
-	{29,9,1,0},{33,13,1,0}
-};
-
-static struct nic_qp_map nic_qp_mapping_6[] = {
-	{30,10,2,0},{34,14,2,0}
-};
-
-static struct nic_qp_map nic_qp_mapping_7[] = {
-	{31,11,3,0},{35,15,3,0}
-};
-
-static struct nic_qp_map *nic_qp_mapping_per_function[] = {
-	nic_qp_mapping_0, nic_qp_mapping_1, nic_qp_mapping_2, nic_qp_mapping_3,
-	nic_qp_mapping_4, nic_qp_mapping_5, nic_qp_mapping_6, nic_qp_mapping_7
-};
-
-static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK
-		| NETIF_MSG_IFUP | NETIF_MSG_IFDOWN;
-static int debug = -1;
-static int nics_per_function = 1;
-
-/**
- * nes_netdev_poll
- */
-static int nes_netdev_poll(struct napi_struct *napi, int budget)
-{
-	struct nes_vnic *nesvnic = container_of(napi, struct nes_vnic, napi);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_hw_nic_cq *nescq = &nesvnic->nic_cq;
-
-	nesvnic->budget = budget;
-	nescq->cqes_pending = 0;
-	nescq->rx_cqes_completed = 0;
-	nescq->cqe_allocs_pending = 0;
-	nescq->rx_pkts_indicated = 0;
-
-	nes_nic_ce_handler(nesdev, nescq);
-
-	if (nescq->cqes_pending == 0) {
-		napi_complete(napi);
-		/* clear out completed cqes and arm */
-		nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
-				nescq->cq_number | (nescq->cqe_allocs_pending << 16));
-		nes_read32(nesdev->regs+NES_CQE_ALLOC);
-	} else {
-		/* clear out completed cqes but don't arm */
-		nes_write32(nesdev->regs+NES_CQE_ALLOC,
-				nescq->cq_number | (nescq->cqe_allocs_pending << 16));
-		nes_debug(NES_DBG_NETDEV, "%s: exiting with work pending\n",
-				nesvnic->netdev->name);
-	}
-	return nescq->rx_pkts_indicated;
-}
-
-
-/**
- * nes_netdev_open - Activate the network interface; ifconfig
- * ethx up.
- */
-static int nes_netdev_open(struct net_device *netdev)
-{
-	u32 macaddr_low;
-	u16 macaddr_high;
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	int ret;
-	int i;
-	struct nes_vnic *first_nesvnic = NULL;
-	u32 nic_active_bit;
-	u32 nic_active;
-	struct list_head *list_pos, *list_temp;
-	unsigned long flags;
-
-	assert(nesdev != NULL);
-
-	if (nesvnic->netdev_open == 1)
-		return 0;
-
-	if (netif_msg_ifup(nesvnic))
-		printk(KERN_INFO PFX "%s: enabling interface\n", netdev->name);
-
-	ret = nes_init_nic_qp(nesdev, netdev);
-	if (ret) {
-		return ret;
-	}
-
-	netif_carrier_off(netdev);
-	netif_stop_queue(netdev);
-
-	if ((!nesvnic->of_device_registered) && (nesvnic->rdma_enabled)) {
-		nesvnic->nesibdev = nes_init_ofa_device(netdev);
-		if (nesvnic->nesibdev == NULL) {
-			printk(KERN_ERR PFX "%s: nesvnic->nesibdev alloc failed", netdev->name);
-		} else {
-			nesvnic->nesibdev->nesvnic = nesvnic;
-			ret = nes_register_ofa_device(nesvnic->nesibdev);
-			if (ret) {
-				printk(KERN_ERR PFX "%s: Unable to register RDMA device, ret = %d\n",
-						netdev->name, ret);
-			}
-		}
-	}
-	/* Set packet filters */
-	nic_active_bit = 1 << nesvnic->nic_index;
-	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE);
-	nic_active |= nic_active_bit;
-	nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active);
-	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE);
-	nic_active |= nic_active_bit;
-	nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active);
-	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON);
-	nic_active |= nic_active_bit;
-	nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active);
-
-	macaddr_high  = ((u16)netdev->dev_addr[0]) << 8;
-	macaddr_high += (u16)netdev->dev_addr[1];
-
-	macaddr_low   = ((u32)netdev->dev_addr[2]) << 24;
-	macaddr_low  += ((u32)netdev->dev_addr[3]) << 16;
-	macaddr_low  += ((u32)netdev->dev_addr[4]) << 8;
-	macaddr_low  += (u32)netdev->dev_addr[5];
-
-	/* Program the various MAC regs */
-	for (i = 0; i < NES_MAX_PORT_COUNT; i++) {
-		if (nesvnic->qp_nic_index[i] == 0xf) {
-			break;
-		}
-		nes_debug(NES_DBG_NETDEV, "i=%d, perfect filter table index= %d, PERF FILTER LOW"
-				" (Addr:%08X) = %08X, HIGH = %08X.\n",
-				i, nesvnic->qp_nic_index[i],
-				NES_IDX_PERFECT_FILTER_LOW+
-					(nesvnic->qp_nic_index[i] * 8),
-				macaddr_low,
-				(u32)macaddr_high | NES_MAC_ADDR_VALID |
-				((((u32)nesvnic->nic_index) << 16)));
-		nes_write_indexed(nesdev,
-				NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8),
-				macaddr_low);
-		nes_write_indexed(nesdev,
-				NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8),
-				(u32)macaddr_high | NES_MAC_ADDR_VALID |
-				((((u32)nesvnic->nic_index) << 16)));
-	}
-
-
-	nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
-			nesvnic->nic_cq.cq_number);
-	nes_read32(nesdev->regs+NES_CQE_ALLOC);
-	list_for_each_safe(list_pos, list_temp, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]) {
-		first_nesvnic = container_of(list_pos, struct nes_vnic, list);
-		if (first_nesvnic->netdev_open == 1)
-			break;
-	}
-	if (first_nesvnic->netdev_open == 0) {
-		nes_debug(NES_DBG_INIT, "Setting up MAC interrupt mask.\n");
-		nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK + (0x200 * nesdev->mac_index),
-				~(NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT |
-				NES_MAC_INT_TX_UNDERFLOW | NES_MAC_INT_TX_ERROR));
-		first_nesvnic = nesvnic;
-	}
-
-	if (first_nesvnic->linkup) {
-		/* Enable network packets */
-		nesvnic->linkup = 1;
-		netif_start_queue(netdev);
-		netif_carrier_on(netdev);
-	}
-
-	spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags);
-	if (nesdev->nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_SFP_D) {
-		nesdev->link_recheck = 1;
-		mod_delayed_work(system_wq, &nesdev->work,
-				 NES_LINK_RECHECK_DELAY);
-	}
-	spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
-
-	spin_lock_irqsave(&nesvnic->port_ibevent_lock, flags);
-	if (nesvnic->of_device_registered) {
-		nesdev->nesadapter->send_term_ok = 1;
-		if (nesvnic->linkup == 1) {
-			if (nesdev->iw_status == 0) {
-				nesdev->iw_status = 1;
-				nes_port_ibevent(nesvnic);
-			}
-		} else {
-			nesdev->iw_status = 0;
-		}
-	}
-	spin_unlock_irqrestore(&nesvnic->port_ibevent_lock, flags);
-
-	napi_enable(&nesvnic->napi);
-	nesvnic->netdev_open = 1;
-
-	return 0;
-}
-
-
-/**
- * nes_netdev_stop
- */
-static int nes_netdev_stop(struct net_device *netdev)
-{
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	u32 nic_active_mask;
-	u32 nic_active;
-	struct nes_vnic *first_nesvnic = NULL;
-	struct list_head *list_pos, *list_temp;
-	unsigned long flags;
-
-	nes_debug(NES_DBG_SHUTDOWN, "nesvnic=%p, nesdev=%p, netdev=%p %s\n",
-			nesvnic, nesdev, netdev, netdev->name);
-	if (nesvnic->netdev_open == 0)
-		return 0;
-
-	if (netif_msg_ifdown(nesvnic))
-		printk(KERN_INFO PFX "%s: disabling interface\n", netdev->name);
-	netif_carrier_off(netdev);
-
-	/* Disable network packets */
-	napi_disable(&nesvnic->napi);
-	netif_stop_queue(netdev);
-	list_for_each_safe(list_pos, list_temp, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]) {
-		first_nesvnic = container_of(list_pos, struct nes_vnic, list);
-		if ((first_nesvnic->netdev_open == 1) && (first_nesvnic != nesvnic))
-			break;
-	}
-
-	if ((first_nesvnic->netdev_open == 1) && (first_nesvnic != nesvnic)  &&
-		(PCI_FUNC(first_nesvnic->nesdev->pcidev->devfn) !=
-		PCI_FUNC(nesvnic->nesdev->pcidev->devfn))) {
-			nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK+
-				(0x200*nesdev->mac_index), 0xffffffff);
-			nes_write_indexed(first_nesvnic->nesdev,
-				NES_IDX_MAC_INT_MASK+
-				(0x200*first_nesvnic->nesdev->mac_index),
-			~(NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT |
-			NES_MAC_INT_TX_UNDERFLOW | NES_MAC_INT_TX_ERROR));
-	} else {
-		nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK+(0x200*nesdev->mac_index), 0xffffffff);
-	}
-
-	nic_active_mask = ~((u32)(1 << nesvnic->nic_index));
-	nes_write_indexed(nesdev, NES_IDX_PERFECT_FILTER_HIGH+
-			(nesvnic->perfect_filter_index*8), 0);
-	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE);
-	nic_active &= nic_active_mask;
-	nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active);
-	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
-	nic_active &= nic_active_mask;
-	nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
-	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE);
-	nic_active &= nic_active_mask;
-	nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active);
-	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
-	nic_active &= nic_active_mask;
-	nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
-	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON);
-	nic_active &= nic_active_mask;
-	nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active);
-
-	spin_lock_irqsave(&nesvnic->port_ibevent_lock, flags);
-	if (nesvnic->of_device_registered) {
-		nesdev->nesadapter->send_term_ok = 0;
-		nesdev->iw_status = 0;
-		if (nesvnic->linkup == 1)
-			nes_port_ibevent(nesvnic);
-	}
-	del_timer_sync(&nesvnic->event_timer);
-	nesvnic->event_timer.function = NULL;
-	spin_unlock_irqrestore(&nesvnic->port_ibevent_lock, flags);
-
-	nes_destroy_nic_qp(nesvnic);
-
-	nesvnic->netdev_open = 0;
-
-	return 0;
-}
-
-
-/**
- * nes_nic_send
- */
-static bool nes_nic_send(struct sk_buff *skb, struct net_device *netdev)
-{
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_hw_nic *nesnic = &nesvnic->nic;
-	struct nes_hw_nic_sq_wqe *nic_sqe;
-	struct tcphdr *tcph;
-	__le16 *wqe_fragment_length;
-	u32 wqe_misc;
-	u16 wqe_fragment_index = 1;	/* first fragment (0) is used by copy buffer */
-	u16 skb_fragment_index;
-	dma_addr_t bus_address;
-
-	nic_sqe = &nesnic->sq_vbase[nesnic->sq_head];
-	wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
-
-	/* setup the VLAN tag if present */
-	if (skb_vlan_tag_present(skb)) {
-		nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n",
-				netdev->name, skb_vlan_tag_get(skb));
-		wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE;
-		wqe_fragment_length[0] = (__force __le16) skb_vlan_tag_get(skb);
-	} else
-		wqe_misc = 0;
-
-	/* bump past the vlan tag */
-	wqe_fragment_length++;
-	/*	wqe_fragment_address = (u64 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX]; */
-	wqe_misc |= NES_NIC_SQ_WQE_COMPLETION;
-
-	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		if (skb_is_gso(skb)) {
-			tcph = tcp_hdr(skb);
-			/* nes_debug(NES_DBG_NIC_TX, "%s: TSO request... is_gso = %u seg size = %u\n",
-					netdev->name, skb_is_gso(skb), skb_shinfo(skb)->gso_size); */
-			wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE | (u16)skb_shinfo(skb)->gso_size;
-			set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX,
-					((u32)tcph->doff) |
-					(((u32)(((unsigned char *)tcph) - skb->data)) << 4));
-		}
-	} else {	/* CHECKSUM_HW */
-		wqe_misc |= NES_NIC_SQ_WQE_DISABLE_CHKSUM;
-	}
-
-	set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX,
-				skb->len);
-	memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer,
-			skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE), skb_headlen(skb)));
-	wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE),
-			skb_headlen(skb)));
-	wqe_fragment_length[1] = 0;
-	if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) {
-		if ((skb_shinfo(skb)->nr_frags + 1) > 4) {
-			nes_debug(NES_DBG_NIC_TX, "%s: Packet with %u fragments not sent, skb_headlen=%u\n",
-					netdev->name, skb_shinfo(skb)->nr_frags + 2, skb_headlen(skb));
-			kfree_skb(skb);
-			nesvnic->tx_sw_dropped++;
-			return false;
-		}
-		set_bit(nesnic->sq_head, nesnic->first_frag_overflow);
-		bus_address = pci_map_single(nesdev->pcidev, skb->data + NES_FIRST_FRAG_SIZE,
-				skb_headlen(skb) - NES_FIRST_FRAG_SIZE, PCI_DMA_TODEVICE);
-		wqe_fragment_length[wqe_fragment_index++] =
-				cpu_to_le16(skb_headlen(skb) - NES_FIRST_FRAG_SIZE);
-		wqe_fragment_length[wqe_fragment_index] = 0;
-		set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX,
-				((u64)(bus_address)));
-		nesnic->tx_skb[nesnic->sq_head] = skb;
-	}
-
-	if (skb_headlen(skb) == skb->len) {
-		if (skb_headlen(skb) <= NES_FIRST_FRAG_SIZE) {
-			nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_2_1_IDX] = 0;
-			nesnic->tx_skb[nesnic->sq_head] = skb;
-		}
-	} else {
-		/* Deal with Fragments */
-		nesnic->tx_skb[nesnic->sq_head] = skb;
-		for (skb_fragment_index = 0; skb_fragment_index < skb_shinfo(skb)->nr_frags;
-				skb_fragment_index++) {
-			skb_frag_t *frag =
-				&skb_shinfo(skb)->frags[skb_fragment_index];
-			bus_address = skb_frag_dma_map(&nesdev->pcidev->dev,
-						       frag, 0, skb_frag_size(frag),
-						       DMA_TO_DEVICE);
-			wqe_fragment_length[wqe_fragment_index] =
-					cpu_to_le16(skb_frag_size(&skb_shinfo(skb)->frags[skb_fragment_index]));
-			set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index),
-				bus_address);
-			wqe_fragment_index++;
-			if (wqe_fragment_index < 5)
-				wqe_fragment_length[wqe_fragment_index] = 0;
-		}
-	}
-
-	set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX, wqe_misc);
-	nesnic->sq_head++;
-	nesnic->sq_head &= nesnic->sq_size - 1;
-	return true;
-}
-
-
-/**
- * nes_netdev_start_xmit
- */
-static netdev_tx_t nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev)
-{
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_hw_nic *nesnic = &nesvnic->nic;
-	struct nes_hw_nic_sq_wqe *nic_sqe;
-	struct tcphdr *tcph;
-	/* struct udphdr *udph; */
-#define NES_MAX_TSO_FRAGS MAX_SKB_FRAGS
-	/* 64K segment plus overflow on each side */
-	dma_addr_t tso_bus_address[NES_MAX_TSO_FRAGS];
-	dma_addr_t bus_address;
-	u32 tso_frag_index;
-	u32 tso_frag_count;
-	u32 tso_wqe_length;
-	u32 curr_tcp_seq;
-	u32 wqe_count=1;
-	struct iphdr *iph;
-	__le16 *wqe_fragment_length;
-	u32 nr_frags;
-	u32 original_first_length;
-	/* u64 *wqe_fragment_address; */
-	/* first fragment (0) is used by copy buffer */
-	u16 wqe_fragment_index=1;
-	u16 hoffset;
-	u16 nhoffset;
-	u16 wqes_needed;
-	u16 wqes_available;
-	u32 wqe_misc;
-
-	/*
-	 * nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u,"
-	 *		" (%u frags), tso_size=%u\n",
-	 *		netdev->name, skb->len, skb_headlen(skb),
-	 *		skb_shinfo(skb)->nr_frags, skb_is_gso(skb));
-	 */
-
-	if (netif_queue_stopped(netdev))
-		return NETDEV_TX_BUSY;
-
-	/* Check if SQ is full */
-	if ((((nesnic->sq_tail+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) == 1) {
-		if (!netif_queue_stopped(netdev)) {
-			netif_stop_queue(netdev);
-			barrier();
-			if ((((((volatile u16)nesnic->sq_tail)+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) != 1) {
-				netif_start_queue(netdev);
-				goto sq_no_longer_full;
-			}
-		}
-		nesvnic->sq_full++;
-		return NETDEV_TX_BUSY;
-	}
-
-sq_no_longer_full:
-	nr_frags = skb_shinfo(skb)->nr_frags;
-	if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) {
-		nr_frags++;
-	}
-	/* Check if too many fragments */
-	if (unlikely((nr_frags > 4))) {
-		if (skb_is_gso(skb)) {
-			nesvnic->segmented_tso_requests++;
-			nesvnic->tso_requests++;
-			/* Basically 4 fragments available per WQE with extended fragments */
-			wqes_needed = nr_frags >> 2;
-			wqes_needed += (nr_frags&3)?1:0;
-			wqes_available = (((nesnic->sq_tail+nesnic->sq_size)-nesnic->sq_head) - 1) &
-					(nesnic->sq_size - 1);
-
-			if (unlikely(wqes_needed > wqes_available)) {
-				if (!netif_queue_stopped(netdev)) {
-					netif_stop_queue(netdev);
-					barrier();
-					wqes_available = (((((volatile u16)nesnic->sq_tail)+nesnic->sq_size)-nesnic->sq_head) - 1) &
-						(nesnic->sq_size - 1);
-					if (wqes_needed <= wqes_available) {
-						netif_start_queue(netdev);
-						goto tso_sq_no_longer_full;
-					}
-				}
-				nesvnic->sq_full++;
-				nes_debug(NES_DBG_NIC_TX, "%s: HNIC SQ full- TSO request has too many frags!\n",
-						netdev->name);
-				return NETDEV_TX_BUSY;
-			}
-tso_sq_no_longer_full:
-			/* Map all the buffers */
-			for (tso_frag_count=0; tso_frag_count < skb_shinfo(skb)->nr_frags;
-					tso_frag_count++) {
-				skb_frag_t *frag =
-					&skb_shinfo(skb)->frags[tso_frag_count];
-				tso_bus_address[tso_frag_count] =
-					skb_frag_dma_map(&nesdev->pcidev->dev,
-							 frag, 0, skb_frag_size(frag),
-							 DMA_TO_DEVICE);
-			}
-
-			tso_frag_index = 0;
-			curr_tcp_seq = ntohl(tcp_hdr(skb)->seq);
-			hoffset = skb_transport_header(skb) - skb->data;
-			nhoffset = skb_network_header(skb) - skb->data;
-			original_first_length = hoffset + ((((struct tcphdr *)skb_transport_header(skb))->doff)<<2);
-
-			for (wqe_count=0; wqe_count<((u32)wqes_needed); wqe_count++) {
-				tso_wqe_length = 0;
-				nic_sqe = &nesnic->sq_vbase[nesnic->sq_head];
-				wqe_fragment_length =
-						(__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
-				/* setup the VLAN tag if present */
-				if (skb_vlan_tag_present(skb)) {
-					nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n",
-							netdev->name,
-						  skb_vlan_tag_get(skb));
-					wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE;
-					wqe_fragment_length[0] = (__force __le16) skb_vlan_tag_get(skb);
-				} else
-					wqe_misc = 0;
-
-				/* bump past the vlan tag */
-				wqe_fragment_length++;
-
-				/* Assumes header totally fits in allocated buffer and is in first fragment */
-				if (original_first_length > NES_FIRST_FRAG_SIZE) {
-					nes_debug(NES_DBG_NIC_TX, "ERROR: SKB header too big, headlen=%u, FIRST_FRAG_SIZE=%u\n",
-							original_first_length, NES_FIRST_FRAG_SIZE);
-					nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u,"
-							" (%u frags), is_gso = %u tso_size=%u\n",
-							netdev->name,
-							skb->len, skb_headlen(skb),
-							skb_shinfo(skb)->nr_frags, skb_is_gso(skb), skb_shinfo(skb)->gso_size);
-				}
-				memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer,
-						skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE),
-						original_first_length));
-				iph = (struct iphdr *)
-				(&nesnic->first_frag_vbase[nesnic->sq_head].buffer[nhoffset]);
-				tcph = (struct tcphdr *)
-				(&nesnic->first_frag_vbase[nesnic->sq_head].buffer[hoffset]);
-				if ((wqe_count+1)!=(u32)wqes_needed) {
-					tcph->fin = 0;
-					tcph->psh = 0;
-					tcph->rst = 0;
-					tcph->urg = 0;
-				}
-				if (wqe_count) {
-					tcph->syn = 0;
-				}
-				tcph->seq = htonl(curr_tcp_seq);
-				wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE),
-						original_first_length));
-
-				wqe_fragment_index = 1;
-				if ((wqe_count==0) && (skb_headlen(skb) > original_first_length)) {
-					set_bit(nesnic->sq_head, nesnic->first_frag_overflow);
-					bus_address = pci_map_single(nesdev->pcidev, skb->data + original_first_length,
-							skb_headlen(skb) - original_first_length, PCI_DMA_TODEVICE);
-					wqe_fragment_length[wqe_fragment_index++] =
-						cpu_to_le16(skb_headlen(skb) - original_first_length);
-					wqe_fragment_length[wqe_fragment_index] = 0;
-					set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX,
-									bus_address);
-					tso_wqe_length += skb_headlen(skb) -
-							original_first_length;
-				}
-				while (wqe_fragment_index < 5) {
-					wqe_fragment_length[wqe_fragment_index] =
-							cpu_to_le16(skb_frag_size(&skb_shinfo(skb)->frags[tso_frag_index]));
-					set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index),
-						(u64)tso_bus_address[tso_frag_index]);
-					wqe_fragment_index++;
-					tso_wqe_length += skb_frag_size(&skb_shinfo(skb)->frags[tso_frag_index++]);
-					if (wqe_fragment_index < 5)
-						wqe_fragment_length[wqe_fragment_index] = 0;
-					if (tso_frag_index == tso_frag_count)
-						break;
-				}
-				if ((wqe_count+1) == (u32)wqes_needed) {
-					nesnic->tx_skb[nesnic->sq_head] = skb;
-				} else {
-					nesnic->tx_skb[nesnic->sq_head] = NULL;
-				}
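-				/* If this WQE's total length exceeds the MSS, let the hardware
-				 * segment it (LSO); otherwise set the IP total length directly */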
-				wqe_misc |= NES_NIC_SQ_WQE_COMPLETION | (u16)skb_shinfo(skb)->gso_size;
-				if ((tso_wqe_length + original_first_length) > skb_shinfo(skb)->gso_size) {
-					wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE;
-				} else {
-					iph->tot_len = htons(tso_wqe_length + original_first_length - nhoffset);
-				}
-
-				set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX,
-						 wqe_misc);
-				set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX,
-						((u32)tcph->doff) | (((u32)hoffset) << 4));
-
-				set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX,
-						tso_wqe_length + original_first_length);
-				curr_tcp_seq += tso_wqe_length;
-				nesnic->sq_head++;
-				nesnic->sq_head &= nesnic->sq_size-1;
-			}
-		} else {
-			hoffset = skb_transport_header(skb) - skb->data;
-			nhoffset = skb_network_header(skb) - skb->data;
-			if (skb_linearize(skb)) {
-				nesvnic->tx_sw_dropped++;
-				kfree_skb(skb);
-				return NETDEV_TX_OK;
-			}
-			nesvnic->linearized_skbs++;
-			skb_set_transport_header(skb, hoffset);
-			skb_set_network_header(skb, nhoffset);
-			if (!nes_nic_send(skb, netdev))
-				return NETDEV_TX_OK;
-		}
-	} else {
-		if (!nes_nic_send(skb, netdev))
-			return NETDEV_TX_OK;
-	}
-
-	barrier();
-
-	if (wqe_count)
-		nes_write32(nesdev->regs+NES_WQE_ALLOC,
-				(wqe_count << 24) | (1 << 23) | nesvnic->nic.qp_id);
-
-	netif_trans_update(netdev);
-
-	return NETDEV_TX_OK;
-}
-
-
-/**
- * nes_netdev_get_stats
- */
-static struct net_device_stats *nes_netdev_get_stats(struct net_device *netdev)
-{
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	u64 u64temp;
-	u32 u32temp;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_ENDNODE0_NSTAT_RX_DISCARD + (nesvnic->nic_index*0x200));
-	nesvnic->netstats.rx_dropped += u32temp;
-	nesvnic->endnode_nstat_rx_discard += u32temp;
-
-	u64temp = (u64)nes_read_indexed(nesdev,
-			NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO + (nesvnic->nic_index*0x200));
-	u64temp += ((u64)nes_read_indexed(nesdev,
-			NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32;
-
-	nesvnic->endnode_nstat_rx_octets += u64temp;
-	nesvnic->netstats.rx_bytes += u64temp;
-
-	u64temp = (u64)nes_read_indexed(nesdev,
-			NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO + (nesvnic->nic_index*0x200));
-	u64temp += ((u64)nes_read_indexed(nesdev,
-			NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32;
-
-	nesvnic->endnode_nstat_rx_frames += u64temp;
-	nesvnic->netstats.rx_packets += u64temp;
-
-	u64temp = (u64)nes_read_indexed(nesdev,
-			NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO + (nesvnic->nic_index*0x200));
-	u64temp += ((u64)nes_read_indexed(nesdev,
-			NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32;
-
-	nesvnic->endnode_nstat_tx_octets += u64temp;
-	nesvnic->netstats.tx_bytes += u64temp;
-
-	u64temp = (u64)nes_read_indexed(nesdev,
-			NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO + (nesvnic->nic_index*0x200));
-	u64temp += ((u64)nes_read_indexed(nesdev,
-			NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32;
-
-	nesvnic->endnode_nstat_tx_frames += u64temp;
-	nesvnic->netstats.tx_packets += u64temp;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_MAC_RX_SHORT_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-	nesvnic->netstats.rx_dropped += u32temp;
-	nesvnic->nesdev->mac_rx_errors += u32temp;
-	nesvnic->nesdev->mac_rx_short_frames += u32temp;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_MAC_RX_OVERSIZED_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-	nesvnic->netstats.rx_dropped += u32temp;
-	nesvnic->nesdev->mac_rx_errors += u32temp;
-	nesvnic->nesdev->mac_rx_oversized_frames += u32temp;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_MAC_RX_JABBER_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-	nesvnic->netstats.rx_dropped += u32temp;
-	nesvnic->nesdev->mac_rx_errors += u32temp;
-	nesvnic->nesdev->mac_rx_jabber_frames += u32temp;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-	nesvnic->netstats.rx_dropped += u32temp;
-	nesvnic->nesdev->mac_rx_errors += u32temp;
-	nesvnic->nesdev->mac_rx_symbol_err_frames += u32temp;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_MAC_RX_LENGTH_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-	nesvnic->netstats.rx_length_errors += u32temp;
-	nesvnic->nesdev->mac_rx_errors += u32temp;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_MAC_RX_CRC_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-	nesvnic->nesdev->mac_rx_errors += u32temp;
-	nesvnic->nesdev->mac_rx_crc_errors += u32temp;
-	nesvnic->netstats.rx_crc_errors += u32temp;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_MAC_TX_ERRORS + (nesvnic->nesdev->mac_index*0x200));
-	nesvnic->nesdev->mac_tx_errors += u32temp;
-	nesvnic->netstats.tx_errors += u32temp;
-
-	return &nesvnic->netstats;
-}
-
-
-/**
- * nes_netdev_tx_timeout
- */
-static void nes_netdev_tx_timeout(struct net_device *netdev)
-{
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-
-	if (netif_msg_timer(nesvnic))
-		nes_debug(NES_DBG_NIC_TX, "%s: tx timeout\n", netdev->name);
-}
-
-
-/**
- * nes_netdev_set_mac_address
- */
-static int nes_netdev_set_mac_address(struct net_device *netdev, void *p)
-{
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct sockaddr *mac_addr = p;
-	int i;
-	u32 macaddr_low;
-	u16 macaddr_high;
-
-	if (!is_valid_ether_addr(mac_addr->sa_data))
-		return -EADDRNOTAVAIL;
-
-	memcpy(netdev->dev_addr, mac_addr->sa_data, netdev->addr_len);
-	printk(PFX "%s: Address length = %d, Address = %pM\n",
-	       __func__, netdev->addr_len, mac_addr->sa_data);
-	macaddr_high  = ((u16)netdev->dev_addr[0]) << 8;
-	macaddr_high += (u16)netdev->dev_addr[1];
-	macaddr_low   = ((u32)netdev->dev_addr[2]) << 24;
-	macaddr_low  += ((u32)netdev->dev_addr[3]) << 16;
-	macaddr_low  += ((u32)netdev->dev_addr[4]) << 8;
-	macaddr_low  += (u32)netdev->dev_addr[5];
-
-	for (i = 0; i < NES_MAX_PORT_COUNT; i++) {
-		if (nesvnic->qp_nic_index[i] == 0xf) {
-			break;
-		}
-		nes_write_indexed(nesdev,
-				NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8),
-				macaddr_low);
-		nes_write_indexed(nesdev,
-				NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8),
-				(u32)macaddr_high | NES_MAC_ADDR_VALID |
-				((((u32)nesvnic->nic_index) << 16)));
-	}
-	return 0;
-}
-
-
-static void set_allmulti(struct nes_device *nesdev, u32 nic_active_bit)
-{
-	u32 nic_active;
-
-	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
-	nic_active |= nic_active_bit;
-	nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
-	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
-	nic_active &= ~nic_active_bit;
-	nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
-}
-
-#define get_addr(addrs, index) ((addrs) + (index) * ETH_ALEN)
-
-/**
- * nes_netdev_set_multicast_list
- */
-static void nes_netdev_set_multicast_list(struct net_device *netdev)
-{
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter;
-	u32 nic_active_bit;
-	u32 nic_active;
-	u32 perfect_filter_register_address;
-	u32 macaddr_low;
-	u16 macaddr_high;
-	u8 mc_all_on = 0;
-	u8 mc_index;
-	int mc_nic_index = -1;
-	u8 pft_entries_preallocated = max(nesadapter->adapter_fcn_count *
-					nics_per_function, 4);
-	u8 max_pft_entries_avaiable = NES_PFT_SIZE - pft_entries_preallocated;
-	unsigned long flags;
-	int mc_count = netdev_mc_count(netdev);
-
-	spin_lock_irqsave(&nesadapter->resource_lock, flags);
-	nic_active_bit = 1 << nesvnic->nic_index;
-
-	if (netdev->flags & IFF_PROMISC) {
-		nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
-		nic_active |= nic_active_bit;
-		nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
-		nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
-		nic_active |= nic_active_bit;
-		nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
-		mc_all_on = 1;
-	} else if ((netdev->flags & IFF_ALLMULTI) ||
-			   (nesvnic->nic_index > 3)) {
-		set_allmulti(nesdev, nic_active_bit);
-		mc_all_on = 1;
-	} else {
-		nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
-		nic_active &= ~nic_active_bit;
-		nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
-		nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
-		nic_active &= ~nic_active_bit;
-		nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
-	}
-
-	nes_debug(NES_DBG_NIC_RX, "Number of MC entries = %d, Promiscuous = %d, All Multicast = %d.\n",
-		  mc_count, !!(netdev->flags & IFF_PROMISC),
-		  !!(netdev->flags & IFF_ALLMULTI));
-	if (!mc_all_on) {
-		char *addrs;
-		int i;
-		struct netdev_hw_addr *ha;
-
-		addrs = kmalloc_array(mc_count, ETH_ALEN, GFP_ATOMIC);
-		if (!addrs) {
-			set_allmulti(nesdev, nic_active_bit);
-			goto unlock;
-		}
-		i = 0;
-		netdev_for_each_mc_addr(ha, netdev)
-			memcpy(get_addr(addrs, i++), ha->addr, ETH_ALEN);
-
-		perfect_filter_register_address = NES_IDX_PERFECT_FILTER_LOW +
-						pft_entries_preallocated * 0x8;
-		for (i = 0, mc_index = 0; mc_index < max_pft_entries_avaiable;
-		     mc_index++) {
-			while (i < mc_count && nesvnic->mcrq_mcast_filter &&
-			((mc_nic_index = nesvnic->mcrq_mcast_filter(nesvnic,
-					get_addr(addrs, i++))) == 0));
-			if (mc_nic_index < 0)
-				mc_nic_index = nesvnic->nic_index;
-			while (nesadapter->pft_mcast_map[mc_index] < 16 &&
-				nesadapter->pft_mcast_map[mc_index] !=
-					nesvnic->nic_index &&
-					mc_index < max_pft_entries_avaiable) {
-				nes_debug(NES_DBG_NIC_RX,
-					  "mc_index=%d skipping nic_index=%d, used for=%d\n",
-					  mc_index, nesvnic->nic_index,
-					  nesadapter->pft_mcast_map[mc_index]);
-				mc_index++;
-			}
-			if (mc_index >= max_pft_entries_avaiable)
-				break;
-			if (i < mc_count) {
-				char *addr = get_addr(addrs, i++);
-
-				nes_debug(NES_DBG_NIC_RX, "Assigning MC Address %pM to register 0x%04X nic_idx=%d\n",
-					  addr,
-					  perfect_filter_register_address+(mc_index * 8),
-					  mc_nic_index);
-				macaddr_high  = ((u8) addr[0]) << 8;
-				macaddr_high += (u8) addr[1];
-				macaddr_low   = ((u8) addr[2]) << 24;
-				macaddr_low  += ((u8) addr[3]) << 16;
-				macaddr_low  += ((u8) addr[4]) << 8;
-				macaddr_low  += (u8) addr[5];
-
-				nes_write_indexed(nesdev,
-						perfect_filter_register_address+(mc_index * 8),
-						macaddr_low);
-				nes_write_indexed(nesdev,
-						perfect_filter_register_address+4+(mc_index * 8),
-						(u32)macaddr_high | NES_MAC_ADDR_VALID |
-						((((u32)(1<<mc_nic_index)) << 16)));
-				nesadapter->pft_mcast_map[mc_index] =
-							nesvnic->nic_index;
-			} else {
-				nes_debug(NES_DBG_NIC_RX, "Clearing MC Address at register 0x%04X\n",
-						  perfect_filter_register_address+(mc_index * 8));
-				nes_write_indexed(nesdev,
-						perfect_filter_register_address+4+(mc_index * 8),
-						0);
-				nesadapter->pft_mcast_map[mc_index] = 255;
-			}
-		}
-		kfree(addrs);
-		/* PFT is not large enough */
-		if (i < mc_count)
-			set_allmulti(nesdev, nic_active_bit);
-	}
-
-unlock:
-	spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
-}
-
-
-/**
- * nes_netdev_change_mtu
- */
-static int nes_netdev_change_mtu(struct net_device *netdev, int new_mtu)
-{
-	struct nes_vnic	*nesvnic = netdev_priv(netdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	u8 jumbomode = 0;
-	u32 nic_active;
-	u32 nic_active_bit;
-	u32 uc_all_active;
-	u32 mc_all_active;
-
-	netdev->mtu = new_mtu;
-	nesvnic->max_frame_size	= new_mtu + VLAN_ETH_HLEN;
-
-	if (netdev->mtu	> ETH_DATA_LEN)	{
-		jumbomode=1;
-	}
-	nes_nic_init_timer_defaults(nesdev, jumbomode);
-
-	if (netif_running(netdev)) {
-		nic_active_bit = 1 << nesvnic->nic_index;
-		mc_all_active = nes_read_indexed(nesdev,
-				NES_IDX_NIC_MULTICAST_ALL) & nic_active_bit;
-		uc_all_active = nes_read_indexed(nesdev,
-				NES_IDX_NIC_UNICAST_ALL)  & nic_active_bit;
-
-		nes_netdev_stop(netdev);
-		nes_netdev_open(netdev);
-
-		nic_active = nes_read_indexed(nesdev,
-					NES_IDX_NIC_MULTICAST_ALL);
-		nic_active |= mc_all_active;
-		nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL,
-							nic_active);
-
-		nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
-		nic_active |= uc_all_active;
-		nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
-	}
-
-	return 0;
-}
-
-
-static const char nes_ethtool_stringset[][ETH_GSTRING_LEN] = {
-	"Link Change Interrupts",
-	"Linearized SKBs",
-	"T/GSO Requests",
-	"Pause Frames Sent",
-	"Pause Frames Received",
-	"Internal Routing Errors",
-	"SQ SW Dropped SKBs",
-	"SQ Full",
-	"Segmented TSO Requests",
-	"Rx Symbol Errors",
-	"Rx Jabber Errors",
-	"Rx Oversized Frames",
-	"Rx Short Frames",
-	"Rx Length Errors",
-	"Rx CRC Errors",
-	"Rx Port Discard",
-	"Endnode Rx Discards",
-	"Endnode Rx Octets",
-	"Endnode Rx Frames",
-	"Endnode Tx Octets",
-	"Endnode Tx Frames",
-	"Tx Errors",
-	"mh detected",
-	"mh pauses",
-	"Retransmission Count",
-	"CM Connects",
-	"CM Accepts",
-	"Disconnects",
-	"Connected Events",
-	"Connect Requests",
-	"CM Rejects",
-	"ModifyQP Timeouts",
-	"CreateQPs",
-	"SW DestroyQPs",
-	"DestroyQPs",
-	"CM Closes",
-	"CM Packets Sent",
-	"CM Packets Bounced",
-	"CM Packets Created",
-	"CM Packets Rcvd",
-	"CM Packets Dropped",
-	"CM Packets Retrans",
-	"CM Listens Created",
-	"CM Listens Destroyed",
-	"CM Backlog Drops",
-	"CM Loopbacks",
-	"CM Nodes Created",
-	"CM Nodes Destroyed",
-	"CM Accel Drops",
-	"CM Resets Received",
-	"Free 4Kpbls",
-	"Free 256pbls",
-	"Timer Inits",
-	"PAU CreateQPs",
-	"PAU DestroyQPs",
-};
-#define NES_ETHTOOL_STAT_COUNT  ARRAY_SIZE(nes_ethtool_stringset)
-
-
-/**
- * nes_netdev_get_sset_count
- */
-static int nes_netdev_get_sset_count(struct net_device *netdev, int stringset)
-{
-	if (stringset == ETH_SS_STATS)
-		return NES_ETHTOOL_STAT_COUNT;
-	else
-		return -EINVAL;
-}
-
-
-/**
- * nes_netdev_get_strings
- */
-static void nes_netdev_get_strings(struct net_device *netdev, u32 stringset,
-		u8 *ethtool_strings)
-{
-	if (stringset == ETH_SS_STATS)
-		memcpy(ethtool_strings,
-				&nes_ethtool_stringset,
-				sizeof(nes_ethtool_stringset));
-}
-
-
-/**
- * nes_netdev_get_ethtool_stats
- */
-
-static void nes_netdev_get_ethtool_stats(struct net_device *netdev,
-		struct ethtool_stats *target_ethtool_stats, u64 *target_stat_values)
-{
-	u64 u64temp;
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	u32 nic_count;
-	u32 u32temp;
-	u32 index = 0;
-
-	target_ethtool_stats->n_stats = NES_ETHTOOL_STAT_COUNT;
-	target_stat_values[index] = nesvnic->nesdev->link_status_interrupts;
-	target_stat_values[++index] = nesvnic->linearized_skbs;
-	target_stat_values[++index] = nesvnic->tso_requests;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_MAC_TX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-	nesvnic->nesdev->mac_pause_frames_sent += u32temp;
-	target_stat_values[++index] = nesvnic->nesdev->mac_pause_frames_sent;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_MAC_RX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-	nesvnic->nesdev->mac_pause_frames_received += u32temp;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_PORT_RX_DISCARDS + (nesvnic->nesdev->mac_index*0x40));
-	nesvnic->nesdev->port_rx_discards += u32temp;
-	nesvnic->netstats.rx_dropped += u32temp;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_PORT_TX_DISCARDS + (nesvnic->nesdev->mac_index*0x40));
-	nesvnic->nesdev->port_tx_discards += u32temp;
-	nesvnic->netstats.tx_dropped += u32temp;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_MAC_RX_SHORT_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-	nesvnic->netstats.rx_dropped += u32temp;
-	nesvnic->nesdev->mac_rx_errors += u32temp;
-	nesvnic->nesdev->mac_rx_short_frames += u32temp;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_MAC_RX_OVERSIZED_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-	nesvnic->netstats.rx_dropped += u32temp;
-	nesvnic->nesdev->mac_rx_errors += u32temp;
-	nesvnic->nesdev->mac_rx_oversized_frames += u32temp;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_MAC_RX_JABBER_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-	nesvnic->netstats.rx_dropped += u32temp;
-	nesvnic->nesdev->mac_rx_errors += u32temp;
-	nesvnic->nesdev->mac_rx_jabber_frames += u32temp;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-	nesvnic->netstats.rx_dropped += u32temp;
-	nesvnic->nesdev->mac_rx_errors += u32temp;
-	nesvnic->nesdev->mac_rx_symbol_err_frames += u32temp;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_MAC_RX_LENGTH_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-	nesvnic->netstats.rx_length_errors += u32temp;
-	nesvnic->nesdev->mac_rx_errors += u32temp;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_MAC_RX_CRC_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-	nesvnic->nesdev->mac_rx_errors += u32temp;
-	nesvnic->nesdev->mac_rx_crc_errors += u32temp;
-	nesvnic->netstats.rx_crc_errors += u32temp;
-
-	u32temp = nes_read_indexed(nesdev,
-			NES_IDX_MAC_TX_ERRORS + (nesvnic->nesdev->mac_index*0x200));
-	nesvnic->nesdev->mac_tx_errors += u32temp;
-	nesvnic->netstats.tx_errors += u32temp;
-
-	for (nic_count = 0; nic_count < NES_MAX_PORT_COUNT; nic_count++) {
-		if (nesvnic->qp_nic_index[nic_count] == 0xf)
-			break;
-
-		u32temp = nes_read_indexed(nesdev,
-				NES_IDX_ENDNODE0_NSTAT_RX_DISCARD +
-				(nesvnic->qp_nic_index[nic_count]*0x200));
-		nesvnic->netstats.rx_dropped += u32temp;
-		nesvnic->endnode_nstat_rx_discard += u32temp;
-
-		u64temp = (u64)nes_read_indexed(nesdev,
-				NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO +
-				(nesvnic->qp_nic_index[nic_count]*0x200));
-		u64temp += ((u64)nes_read_indexed(nesdev,
-				NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI +
-				(nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
-
-		nesvnic->endnode_nstat_rx_octets += u64temp;
-		nesvnic->netstats.rx_bytes += u64temp;
-
-		u64temp = (u64)nes_read_indexed(nesdev,
-				NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO +
-				(nesvnic->qp_nic_index[nic_count]*0x200));
-		u64temp += ((u64)nes_read_indexed(nesdev,
-				NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI +
-				(nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
-
-		nesvnic->endnode_nstat_rx_frames += u64temp;
-		nesvnic->netstats.rx_packets += u64temp;
-
-		u64temp = (u64)nes_read_indexed(nesdev,
-				NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO +
-				(nesvnic->qp_nic_index[nic_count]*0x200));
-		u64temp += ((u64)nes_read_indexed(nesdev,
-				NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI +
-				(nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
-
-		nesvnic->endnode_nstat_tx_octets += u64temp;
-		nesvnic->netstats.tx_bytes += u64temp;
-
-		u64temp = (u64)nes_read_indexed(nesdev,
-				NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO +
-				(nesvnic->qp_nic_index[nic_count]*0x200));
-		u64temp += ((u64)nes_read_indexed(nesdev,
-				NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI +
-				(nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
-
-		nesvnic->endnode_nstat_tx_frames += u64temp;
-		nesvnic->netstats.tx_packets += u64temp;
-
-		u32temp = nes_read_indexed(nesdev,
-				NES_IDX_IPV4_TCP_REXMITS + (nesvnic->qp_nic_index[nic_count]*0x200));
-		nesvnic->endnode_ipv4_tcp_retransmits += u32temp;
-	}
-
-	target_stat_values[++index] = nesvnic->nesdev->mac_pause_frames_received;
-	target_stat_values[++index] = nesdev->nesadapter->nic_rx_eth_route_err;
-	target_stat_values[++index] = nesvnic->tx_sw_dropped;
-	target_stat_values[++index] = nesvnic->sq_full;
-	target_stat_values[++index] = nesvnic->segmented_tso_requests;
-	target_stat_values[++index] = nesvnic->nesdev->mac_rx_symbol_err_frames;
-	target_stat_values[++index] = nesvnic->nesdev->mac_rx_jabber_frames;
-	target_stat_values[++index] = nesvnic->nesdev->mac_rx_oversized_frames;
-	target_stat_values[++index] = nesvnic->nesdev->mac_rx_short_frames;
-	target_stat_values[++index] = nesvnic->netstats.rx_length_errors;
-	target_stat_values[++index] = nesvnic->nesdev->mac_rx_crc_errors;
-	target_stat_values[++index] = nesvnic->nesdev->port_rx_discards;
-	target_stat_values[++index] = nesvnic->endnode_nstat_rx_discard;
-	target_stat_values[++index] = nesvnic->endnode_nstat_rx_octets;
-	target_stat_values[++index] = nesvnic->endnode_nstat_rx_frames;
-	target_stat_values[++index] = nesvnic->endnode_nstat_tx_octets;
-	target_stat_values[++index] = nesvnic->endnode_nstat_tx_frames;
-	target_stat_values[++index] = nesvnic->nesdev->mac_tx_errors;
-	target_stat_values[++index] = mh_detected;
-	target_stat_values[++index] = mh_pauses_sent;
-	target_stat_values[++index] = nesvnic->endnode_ipv4_tcp_retransmits;
-	target_stat_values[++index] = atomic_read(&cm_connects);
-	target_stat_values[++index] = atomic_read(&cm_accepts);
-	target_stat_values[++index] = atomic_read(&cm_disconnects);
-	target_stat_values[++index] = atomic_read(&cm_connecteds);
-	target_stat_values[++index] = atomic_read(&cm_connect_reqs);
-	target_stat_values[++index] = atomic_read(&cm_rejects);
-	target_stat_values[++index] = atomic_read(&mod_qp_timouts);
-	target_stat_values[++index] = atomic_read(&qps_created);
-	target_stat_values[++index] = atomic_read(&sw_qps_destroyed);
-	target_stat_values[++index] = atomic_read(&qps_destroyed);
-	target_stat_values[++index] = atomic_read(&cm_closes);
-	target_stat_values[++index] = cm_packets_sent;
-	target_stat_values[++index] = cm_packets_bounced;
-	target_stat_values[++index] = cm_packets_created;
-	target_stat_values[++index] = cm_packets_received;
-	target_stat_values[++index] = cm_packets_dropped;
-	target_stat_values[++index] = cm_packets_retrans;
-	target_stat_values[++index] = atomic_read(&cm_listens_created);
-	target_stat_values[++index] = atomic_read(&cm_listens_destroyed);
-	target_stat_values[++index] = cm_backlog_drops;
-	target_stat_values[++index] = atomic_read(&cm_loopbacks);
-	target_stat_values[++index] = atomic_read(&cm_nodes_created);
-	target_stat_values[++index] = atomic_read(&cm_nodes_destroyed);
-	target_stat_values[++index] = atomic_read(&cm_accel_dropped_pkts);
-	target_stat_values[++index] = atomic_read(&cm_resets_recvd);
-	target_stat_values[++index] = nesadapter->free_4kpbl;
-	target_stat_values[++index] = nesadapter->free_256pbl;
-	target_stat_values[++index] = int_mod_timer_init;
-	target_stat_values[++index] = atomic_read(&pau_qps_created);
-	target_stat_values[++index] = atomic_read(&pau_qps_destroyed);
-}
-
-/**
- * nes_netdev_get_drvinfo
- */
-static void nes_netdev_get_drvinfo(struct net_device *netdev,
-		struct ethtool_drvinfo *drvinfo)
-{
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-	struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter;
-
-	strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver));
-	strlcpy(drvinfo->bus_info, pci_name(nesvnic->nesdev->pcidev),
-		sizeof(drvinfo->bus_info));
-	snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
-		 "%u.%u", nesadapter->firmware_version >> 16,
-		 nesadapter->firmware_version & 0x000000ff);
-	strlcpy(drvinfo->version, DRV_VERSION, sizeof(drvinfo->version));
-}
-
-
-/**
- * nes_netdev_set_coalesce
- */
-static int nes_netdev_set_coalesce(struct net_device *netdev,
-		struct ethtool_coalesce	*et_coalesce)
-{
-	struct nes_vnic	*nesvnic = netdev_priv(netdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
-	unsigned long flags;
-
-	spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
-	if (et_coalesce->rx_max_coalesced_frames_low) {
-		shared_timer->threshold_low = et_coalesce->rx_max_coalesced_frames_low;
-	}
-	if (et_coalesce->rx_max_coalesced_frames_irq) {
-		shared_timer->threshold_target = et_coalesce->rx_max_coalesced_frames_irq;
-	}
-	if (et_coalesce->rx_max_coalesced_frames_high) {
-		shared_timer->threshold_high = et_coalesce->rx_max_coalesced_frames_high;
-	}
-	if (et_coalesce->rx_coalesce_usecs_low) {
-		shared_timer->timer_in_use_min = et_coalesce->rx_coalesce_usecs_low;
-	}
-	if (et_coalesce->rx_coalesce_usecs_high) {
-		shared_timer->timer_in_use_max = et_coalesce->rx_coalesce_usecs_high;
-	}
-	spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
-
-	/* using this to drive total interrupt moderation */
-	nesadapter->et_rx_coalesce_usecs_irq = et_coalesce->rx_coalesce_usecs_irq;
-	if (et_coalesce->use_adaptive_rx_coalesce) {
-		nesadapter->et_use_adaptive_rx_coalesce	= 1;
-		nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC;
-		nesadapter->et_rx_coalesce_usecs_irq = 0;
-		if (et_coalesce->pkt_rate_low) {
-			nesadapter->et_pkt_rate_low = et_coalesce->pkt_rate_low;
-		}
-	} else {
-		nesadapter->et_use_adaptive_rx_coalesce	= 0;
-		nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT;
-		if (nesadapter->et_rx_coalesce_usecs_irq) {
-			nes_write32(nesdev->regs+NES_PERIODIC_CONTROL,
-					0x80000000 | ((u32)(nesadapter->et_rx_coalesce_usecs_irq*8)));
-		}
-	}
-	return 0;
-}
-
-
-/**
- * nes_netdev_get_coalesce
- */
-static int nes_netdev_get_coalesce(struct net_device *netdev,
-		struct ethtool_coalesce	*et_coalesce)
-{
-	struct nes_vnic	*nesvnic = netdev_priv(netdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct ethtool_coalesce	temp_et_coalesce;
-	struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
-	unsigned long flags;
-
-	memset(&temp_et_coalesce, 0, sizeof(temp_et_coalesce));
-	temp_et_coalesce.rx_coalesce_usecs_irq    = nesadapter->et_rx_coalesce_usecs_irq;
-	temp_et_coalesce.use_adaptive_rx_coalesce = nesadapter->et_use_adaptive_rx_coalesce;
-	temp_et_coalesce.rate_sample_interval     = nesadapter->et_rate_sample_interval;
-	temp_et_coalesce.pkt_rate_low =	nesadapter->et_pkt_rate_low;
-	spin_lock_irqsave(&nesadapter->periodic_timer_lock,	flags);
-	temp_et_coalesce.rx_max_coalesced_frames_low  = shared_timer->threshold_low;
-	temp_et_coalesce.rx_max_coalesced_frames_irq  = shared_timer->threshold_target;
-	temp_et_coalesce.rx_max_coalesced_frames_high = shared_timer->threshold_high;
-	temp_et_coalesce.rx_coalesce_usecs_low  = shared_timer->timer_in_use_min;
-	temp_et_coalesce.rx_coalesce_usecs_high = shared_timer->timer_in_use_max;
-	if (nesadapter->et_use_adaptive_rx_coalesce) {
-		temp_et_coalesce.rx_coalesce_usecs_irq = shared_timer->timer_in_use;
-	}
-	spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
-	memcpy(et_coalesce, &temp_et_coalesce, sizeof(*et_coalesce));
-	return 0;
-}
-
-
-/**
- * nes_netdev_get_pauseparam
- */
-static void nes_netdev_get_pauseparam(struct net_device *netdev,
-		struct ethtool_pauseparam *et_pauseparam)
-{
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-
-	et_pauseparam->autoneg = 0;
-	et_pauseparam->rx_pause = (nesvnic->nesdev->disable_rx_flow_control == 0) ? 1:0;
-	et_pauseparam->tx_pause = (nesvnic->nesdev->disable_tx_flow_control == 0) ? 1:0;
-}
-
-
-/**
- * nes_netdev_set_pauseparam
- */
-static int nes_netdev_set_pauseparam(struct net_device *netdev,
-		struct ethtool_pauseparam *et_pauseparam)
-{
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	u32 u32temp;
-
-	if (et_pauseparam->autoneg) {
-		/* TODO: should return unsupported */
-		return 0;
-	}
-	if ((et_pauseparam->tx_pause == 1) && (nesdev->disable_tx_flow_control == 1)) {
-		u32temp = nes_read_indexed(nesdev,
-				NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200));
-		u32temp |= NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE;
-		nes_write_indexed(nesdev,
-				NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp);
-		nesdev->disable_tx_flow_control = 0;
-	} else if ((et_pauseparam->tx_pause == 0) && (nesdev->disable_tx_flow_control == 0)) {
-		u32temp = nes_read_indexed(nesdev,
-				NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200));
-		u32temp &= ~NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE;
-		nes_write_indexed(nesdev,
-				NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp);
-		nesdev->disable_tx_flow_control = 1;
-	}
-	if ((et_pauseparam->rx_pause == 1) && (nesdev->disable_rx_flow_control == 1)) {
-		u32temp = nes_read_indexed(nesdev,
-				NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40));
-		u32temp &= ~NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE;
-		nes_write_indexed(nesdev,
-				NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp);
-		nesdev->disable_rx_flow_control = 0;
-	} else if ((et_pauseparam->rx_pause == 0) && (nesdev->disable_rx_flow_control == 0)) {
-		u32temp = nes_read_indexed(nesdev,
-				NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40));
-		u32temp |= NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE;
-		nes_write_indexed(nesdev,
-				NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp);
-		nesdev->disable_rx_flow_control = 1;
-	}
-
-	return 0;
-}
-
-
-/**
- * nes_netdev_get_link_ksettings
- */
-static int nes_netdev_get_link_ksettings(struct net_device *netdev,
-					 struct ethtool_link_ksettings *cmd)
-{
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	u32 mac_index = nesdev->mac_index;
-	u8 phy_type = nesadapter->phy_type[mac_index];
-	u8 phy_index = nesadapter->phy_index[mac_index];
-	u16 phy_data;
-	u32 supported, advertising;
-
-	cmd->base.duplex = DUPLEX_FULL;
-	cmd->base.port   = PORT_MII;
-
-	if (nesadapter->OneG_Mode) {
-		cmd->base.speed = SPEED_1000;
-		if (phy_type == NES_PHY_TYPE_PUMA_1G) {
-			supported   = SUPPORTED_1000baseT_Full;
-			advertising = ADVERTISED_1000baseT_Full;
-			cmd->base.autoneg     = AUTONEG_DISABLE;
-			cmd->base.phy_address = mac_index;
-		} else {
-			unsigned long flags;
-
-			supported = SUPPORTED_1000baseT_Full
-				| SUPPORTED_Autoneg;
-			advertising = ADVERTISED_1000baseT_Full
-				| ADVERTISED_Autoneg;
-			spin_lock_irqsave(&nesadapter->phy_lock, flags);
-			nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
-			spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-			if (phy_data & 0x1000)
-				cmd->base.autoneg = AUTONEG_ENABLE;
-			else
-				cmd->base.autoneg = AUTONEG_DISABLE;
-			cmd->base.phy_address = phy_index;
-		}
-		ethtool_convert_legacy_u32_to_link_mode(
-			cmd->link_modes.supported, supported);
-		ethtool_convert_legacy_u32_to_link_mode(
-			cmd->link_modes.advertising, advertising);
-		return 0;
-	}
-	if ((phy_type == NES_PHY_TYPE_ARGUS) ||
-	    (phy_type == NES_PHY_TYPE_SFP_D) ||
-	    (phy_type == NES_PHY_TYPE_KR)) {
-		cmd->base.port        = PORT_FIBRE;
-		supported   = SUPPORTED_FIBRE;
-		advertising = ADVERTISED_FIBRE;
-		cmd->base.phy_address = phy_index;
-	} else {
-		supported   = SUPPORTED_10000baseT_Full;
-		advertising = ADVERTISED_10000baseT_Full;
-		cmd->base.phy_address = mac_index;
-	}
-	cmd->base.speed = SPEED_10000;
-	cmd->base.autoneg = AUTONEG_DISABLE;
-	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported,
-						supported);
-	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising,
-						advertising);
-
-	return 0;
-}
-
-
-/**
- * nes_netdev_set_link_ksettings
- */
-static int
-nes_netdev_set_link_ksettings(struct net_device *netdev,
-			      const struct ethtool_link_ksettings *cmd)
-{
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-
-	if ((nesadapter->OneG_Mode) &&
-	    (nesadapter->phy_type[nesdev->mac_index] != NES_PHY_TYPE_PUMA_1G)) {
-		unsigned long flags;
-		u16 phy_data;
-		u8 phy_index = nesadapter->phy_index[nesdev->mac_index];
-
-		spin_lock_irqsave(&nesadapter->phy_lock, flags);
-		nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
-		if (cmd->base.autoneg) {
-			/* Turn on Full duplex, Autoneg, and restart autonegotiation */
-			phy_data |= 0x1300;
-		} else {
-			/* Turn off autoneg */
-			phy_data &= ~0x1000;
-		}
-		nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data);
-		spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-	}
-
-	return 0;
-}
-
-
-static const struct ethtool_ops nes_ethtool_ops = {
-	.get_link = ethtool_op_get_link,
-	.get_strings = nes_netdev_get_strings,
-	.get_sset_count = nes_netdev_get_sset_count,
-	.get_ethtool_stats = nes_netdev_get_ethtool_stats,
-	.get_drvinfo = nes_netdev_get_drvinfo,
-	.get_coalesce = nes_netdev_get_coalesce,
-	.set_coalesce = nes_netdev_set_coalesce,
-	.get_pauseparam = nes_netdev_get_pauseparam,
-	.set_pauseparam = nes_netdev_set_pauseparam,
-	.get_link_ksettings = nes_netdev_get_link_ksettings,
-	.set_link_ksettings = nes_netdev_set_link_ksettings,
-};
-
-static void nes_vlan_mode(struct net_device *netdev, struct nes_device *nesdev, netdev_features_t features)
-{
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	u32 u32temp;
-	unsigned long flags;
-
-	spin_lock_irqsave(&nesadapter->phy_lock, flags);
-
-	nes_debug(NES_DBG_NETDEV, "%s: %s\n", __func__, netdev->name);
-
-	/* Enable/Disable VLAN Stripping */
-	u32temp = nes_read_indexed(nesdev, NES_IDX_PCIX_DIAG);
-	if (features & NETIF_F_HW_VLAN_CTAG_RX)
-		u32temp &= 0xfdffffff;
-	else
-		u32temp	|= 0x02000000;
-
-	nes_write_indexed(nesdev, NES_IDX_PCIX_DIAG, u32temp);
-	spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-}
-
-static netdev_features_t nes_fix_features(struct net_device *netdev, netdev_features_t features)
-{
-	/*
-	 * Since there is no support for separate rx/tx vlan accel
-	 * enable/disable make sure tx flag is always in same state as rx.
-	 */
-	if (features & NETIF_F_HW_VLAN_CTAG_RX)
-		features |= NETIF_F_HW_VLAN_CTAG_TX;
-	else
-		features &= ~NETIF_F_HW_VLAN_CTAG_TX;
-
-	return features;
-}
-
-static int nes_set_features(struct net_device *netdev, netdev_features_t features)
-{
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	u32 changed = netdev->features ^ features;
-
-	if (changed & NETIF_F_HW_VLAN_CTAG_RX)
-		nes_vlan_mode(netdev, nesdev, features);
-
-	return 0;
-}
-
-static const struct net_device_ops nes_netdev_ops = {
-	.ndo_open		= nes_netdev_open,
-	.ndo_stop		= nes_netdev_stop,
-	.ndo_start_xmit		= nes_netdev_start_xmit,
-	.ndo_get_stats		= nes_netdev_get_stats,
-	.ndo_tx_timeout		= nes_netdev_tx_timeout,
-	.ndo_set_mac_address	= nes_netdev_set_mac_address,
-	.ndo_set_rx_mode	= nes_netdev_set_multicast_list,
-	.ndo_change_mtu		= nes_netdev_change_mtu,
-	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_fix_features	= nes_fix_features,
-	.ndo_set_features	= nes_set_features,
-};
-
-/**
- * nes_netdev_init - initialize network device
- */
-struct net_device *nes_netdev_init(struct nes_device *nesdev,
-		void __iomem *mmio_addr)
-{
-	u64 u64temp;
-	struct nes_vnic *nesvnic;
-	struct net_device *netdev;
-	struct nic_qp_map *curr_qp_map;
-	u8 phy_type = nesdev->nesadapter->phy_type[nesdev->mac_index];
-
-	netdev = alloc_etherdev(sizeof(struct nes_vnic));
-	if (!netdev) {
-		printk(KERN_ERR PFX "nesvnic etherdev alloc failed");
-		return NULL;
-	}
-	nesvnic = netdev_priv(netdev);
-
-	nes_debug(NES_DBG_INIT, "netdev = %p, %s\n", netdev, netdev->name);
-
-	SET_NETDEV_DEV(netdev, &nesdev->pcidev->dev);
-
-	netdev->watchdog_timeo = NES_TX_TIMEOUT;
-	netdev->irq = nesdev->pcidev->irq;
-	netdev->max_mtu = NES_MAX_MTU;
-	netdev->hard_header_len = ETH_HLEN;
-	netdev->addr_len = ETH_ALEN;
-	netdev->type = ARPHRD_ETHER;
-	netdev->netdev_ops = &nes_netdev_ops;
-	netdev->ethtool_ops = &nes_ethtool_ops;
-	netif_napi_add(netdev, &nesvnic->napi, nes_netdev_poll, 128);
-	nes_debug(NES_DBG_INIT, "Enabling VLAN Insert/Delete.\n");
-
-	/* Fill in the port structure */
-	nesvnic->netdev = netdev;
-	nesvnic->nesdev = nesdev;
-	nesvnic->msg_enable = netif_msg_init(debug, default_msg);
-	nesvnic->netdev_index = nesdev->netdev_count;
-	nesvnic->perfect_filter_index = nesdev->nesadapter->netdev_count;
-	nesvnic->max_frame_size = netdev->mtu + netdev->hard_header_len + VLAN_HLEN;
-
-	curr_qp_map = nic_qp_mapping_per_function[PCI_FUNC(nesdev->pcidev->devfn)];
-	nesvnic->nic.qp_id = curr_qp_map[nesdev->netdev_count].qpid;
-	nesvnic->nic_index = curr_qp_map[nesdev->netdev_count].nic_index;
-	nesvnic->logical_port = curr_qp_map[nesdev->netdev_count].logical_port;
-
-	/* Setup the burned in MAC address */
-	u64temp = (u64)nesdev->nesadapter->mac_addr_low;
-	u64temp += ((u64)nesdev->nesadapter->mac_addr_high) << 32;
-	u64temp += nesvnic->nic_index;
-	netdev->dev_addr[0] = (u8)(u64temp>>40);
-	netdev->dev_addr[1] = (u8)(u64temp>>32);
-	netdev->dev_addr[2] = (u8)(u64temp>>24);
-	netdev->dev_addr[3] = (u8)(u64temp>>16);
-	netdev->dev_addr[4] = (u8)(u64temp>>8);
-	netdev->dev_addr[5] = (u8)u64temp;
-
-	netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_RX;
-	if ((nesvnic->logical_port < 2) || (nesdev->nesadapter->hw_rev != NE020_REV))
-		netdev->hw_features |= NETIF_F_TSO;
-
-	netdev->features = netdev->hw_features | NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_CTAG_TX;
-
-	nes_debug(NES_DBG_INIT, "nesvnic = %p, reported features = 0x%lX, QPid = %d,"
-			" nic_index = %d, logical_port = %d, mac_index = %d.\n",
-			nesvnic, (unsigned long)netdev->features, nesvnic->nic.qp_id,
-			nesvnic->nic_index, nesvnic->logical_port,  nesdev->mac_index);
-
-	if (nesvnic->nesdev->nesadapter->port_count == 1 &&
-		nesvnic->nesdev->nesadapter->adapter_fcn_count == 1) {
-
-		nesvnic->qp_nic_index[0] = nesvnic->nic_index;
-		nesvnic->qp_nic_index[1] = nesvnic->nic_index + 1;
-		if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT) {
-			nesvnic->qp_nic_index[2] = 0xf;
-			nesvnic->qp_nic_index[3] = 0xf;
-		} else {
-			nesvnic->qp_nic_index[2] = nesvnic->nic_index + 2;
-			nesvnic->qp_nic_index[3] = nesvnic->nic_index + 3;
-		}
-	} else {
-		if (nesvnic->nesdev->nesadapter->port_count == 2 ||
-			(nesvnic->nesdev->nesadapter->port_count == 1 &&
-			nesvnic->nesdev->nesadapter->adapter_fcn_count == 2)) {
-				nesvnic->qp_nic_index[0] = nesvnic->nic_index;
-				nesvnic->qp_nic_index[1] = nesvnic->nic_index
-									+ 2;
-				nesvnic->qp_nic_index[2] = 0xf;
-				nesvnic->qp_nic_index[3] = 0xf;
-		} else {
-			nesvnic->qp_nic_index[0] = nesvnic->nic_index;
-			nesvnic->qp_nic_index[1] = 0xf;
-			nesvnic->qp_nic_index[2] = 0xf;
-			nesvnic->qp_nic_index[3] = 0xf;
-		}
-	}
-	nesvnic->next_qp_nic_index = 0;
-
-	if (nesdev->netdev_count == 0) {
-		nesvnic->rdma_enabled = 1;
-	} else {
-		nesvnic->rdma_enabled = 0;
-	}
-	nesvnic->nic_cq.cq_number = nesvnic->nic.qp_id;
-	timer_setup(&nesvnic->event_timer, NULL, 0);
-	spin_lock_init(&nesvnic->tx_lock);
-	spin_lock_init(&nesvnic->port_ibevent_lock);
-	nesdev->netdev[nesdev->netdev_count] = netdev;
-
-	nes_debug(NES_DBG_INIT, "Adding nesvnic (%p) to the adapters nesvnic_list for MAC%d.\n",
-			nesvnic, nesdev->mac_index);
-	list_add_tail(&nesvnic->list, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]);
-
-	if ((nesdev->netdev_count == 0) &&
-	    ((PCI_FUNC(nesdev->pcidev->devfn) == nesdev->mac_index) ||
-	     ((phy_type == NES_PHY_TYPE_PUMA_1G) &&
-	      (((PCI_FUNC(nesdev->pcidev->devfn) == 1) && (nesdev->mac_index == 2)) ||
-	       ((PCI_FUNC(nesdev->pcidev->devfn) == 2) && (nesdev->mac_index == 1)))))) {
-		u32 u32temp;
-		u32 link_mask = 0;
-		u32 link_val = 0;
-		u16 temp_phy_data;
-		u16 phy_data = 0;
-		unsigned long flags;
-
-		u32temp = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 +
-				(0x200 * (nesdev->mac_index & 1)));
-		if (phy_type != NES_PHY_TYPE_PUMA_1G) {
-			u32temp |= 0x00200000;
-			nes_write_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 +
-				(0x200 * (nesdev->mac_index & 1)), u32temp);
-		}
-
-		/* Check and set linkup here.  This is for back to back */
-		/* configuration where second port won't get link interrupt */
-		switch (phy_type) {
-		case NES_PHY_TYPE_PUMA_1G:
-			if (nesdev->mac_index < 2) {
-				link_mask = 0x01010000;
-				link_val = 0x01010000;
-			} else {
-				link_mask = 0x02020000;
-				link_val = 0x02020000;
-			}
-			break;
-		case NES_PHY_TYPE_SFP_D:
-			spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags);
-			nes_read_10G_phy_reg(nesdev,
-					     nesdev->nesadapter->phy_index[nesdev->mac_index],
-					     1, 0x9003);
-			temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-			nes_read_10G_phy_reg(nesdev,
-					     nesdev->nesadapter->phy_index[nesdev->mac_index],
-					     3, 0x0021);
-			nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-			nes_read_10G_phy_reg(nesdev,
-					     nesdev->nesadapter->phy_index[nesdev->mac_index],
-					     3, 0x0021);
-			phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-			spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
-			phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0;
-			break;
-		default:
-			link_mask = 0x0f1f0000;
-			link_val = 0x0f0f0000;
-			break;
-		}
-
-		u32temp = nes_read_indexed(nesdev,
-					   NES_IDX_PHY_PCS_CONTROL_STATUS0 +
-					   (0x200 * (nesdev->mac_index & 1)));
-
-		if (phy_type == NES_PHY_TYPE_SFP_D) {
-			if (phy_data & 0x0004)
-				nesvnic->linkup = 1;
-		} else {
-			if ((u32temp & link_mask) == link_val)
-				nesvnic->linkup = 1;
-		}
-
-		/* clear the MAC interrupt status, assumes direct logical to physical mapping */
-		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index));
-		nes_debug(NES_DBG_INIT, "Phy interrupt status = 0x%X.\n", u32temp);
-		nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index), u32temp);
-
-		nes_init_phy(nesdev);
-	}
-
-	nes_vlan_mode(netdev, nesdev, netdev->features);
-
-	return netdev;
-}
-
-
-/**
- * nes_netdev_destroy - destroy network device structure
- */
-void nes_netdev_destroy(struct net_device *netdev)
-{
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-
-	/* make sure 'stop' method is called by Linux stack */
-	/* nes_netdev_stop(netdev); */
-
-	list_del(&nesvnic->list);
-
-	if (nesvnic->of_device_registered) {
-		nes_destroy_ofa_device(nesvnic->nesibdev);
-	}
-
-	free_netdev(netdev);
-}
-
-
-/**
- * nes_nic_cm_xmit -- CM calls this to send out pkts
- */
-int nes_nic_cm_xmit(struct sk_buff *skb, struct net_device *netdev)
-{
-	int ret;
-
-	skb->dev = netdev;
-	ret = dev_queue_xmit(skb);
-	if (ret) {
-		nes_debug(NES_DBG_CM, "Bad return code from dev_queue_xmit %d\n", ret);
-	}
-
-	return ret;
-}
diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c
deleted file mode 100644
index 21b4a83..0000000
--- a/drivers/infiniband/hw/nes/nes_utils.c
+++ /dev/null
@@ -1,916 +0,0 @@
-/*
- * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/ethtool.h>
-#include <linux/mii.h>
-#include <linux/if_vlan.h>
-#include <linux/slab.h>
-#include <linux/crc32.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/byteorder.h>
-
-#include "nes.h"
-
-static u16 nes_read16_eeprom(void __iomem *addr, u16 offset);
-
-u32 mh_detected;
-u32 mh_pauses_sent;
-
-static u32 nes_set_pau(struct nes_device *nesdev)
-{
-	u32 ret = 0;
-	u32 counter;
-
-	nes_write_indexed(nesdev, NES_IDX_GPR2, NES_ENABLE_PAU);
-	nes_write_indexed(nesdev, NES_IDX_GPR_TRIGGER, 1);
-
-	for (counter = 0; counter < NES_PAU_COUNTER; counter++) {
-		udelay(30);
-		if (!nes_read_indexed(nesdev, NES_IDX_GPR2)) {
-			printk(KERN_INFO PFX "PAU is supported.\n");
-			break;
-		}
-		nes_write_indexed(nesdev, NES_IDX_GPR_TRIGGER, 1);
-	}
-	if (counter == NES_PAU_COUNTER) {
-		printk(KERN_INFO PFX "PAU is not supported.\n");
-		return -EPERM;
-	}
-	return ret;
-}
-
-/**
- * nes_read_eeprom_values -
- */
-int nes_read_eeprom_values(struct nes_device *nesdev, struct nes_adapter *nesadapter)
-{
-	u32 mac_addr_low;
-	u16 mac_addr_high;
-	u16 eeprom_data;
-	u16 eeprom_offset;
-	u16 next_section_address;
-	u16 sw_section_ver;
-	u8  major_ver = 0;
-	u8  minor_ver = 0;
-
-	/* TODO: deal with EEPROM endian issues */
-	if (nesadapter->firmware_eeprom_offset == 0) {
-		/* Read the EEPROM Parameters */
-		eeprom_data = nes_read16_eeprom(nesdev->regs, 0);
-		nes_debug(NES_DBG_HW, "EEPROM Offset 0  = 0x%04X\n", eeprom_data);
-		eeprom_offset = 2 + (((eeprom_data & 0x007f) << 3) <<
-				((eeprom_data & 0x0080) >> 7));
-		nes_debug(NES_DBG_HW, "Firmware Offset = 0x%04X\n", eeprom_offset);
-		nesadapter->firmware_eeprom_offset = eeprom_offset;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4);
-		if (eeprom_data != 0x5746) {
-			nes_debug(NES_DBG_HW, "Not a valid Firmware Image = 0x%04X\n", eeprom_data);
-			return -1;
-		}
-
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
-		nes_debug(NES_DBG_HW, "EEPROM Offset %u  = 0x%04X\n",
-				eeprom_offset + 2, eeprom_data);
-		eeprom_offset += ((eeprom_data & 0x00ff) << 3) << ((eeprom_data & 0x0100) >> 8);
-		nes_debug(NES_DBG_HW, "Software Offset = 0x%04X\n", eeprom_offset);
-		nesadapter->software_eeprom_offset = eeprom_offset;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4);
-		if (eeprom_data != 0x5753) {
-			printk("Not a valid Software Image = 0x%04X\n", eeprom_data);
-			return -1;
-		}
-		sw_section_ver = nes_read16_eeprom(nesdev->regs, nesadapter->software_eeprom_offset  + 6);
-		nes_debug(NES_DBG_HW, "Software section version number = 0x%04X\n",
-				sw_section_ver);
-
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
-		nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
-				eeprom_offset + 2, eeprom_data);
-		next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) <<
-				((eeprom_data & 0x0100) >> 8));
-		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
-		if (eeprom_data != 0x414d) {
-			nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n",
-					eeprom_data);
-			goto no_fw_rev;
-		}
-		eeprom_offset = next_section_address;
-
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
-		nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
-				eeprom_offset + 2, eeprom_data);
-		next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) <<
-				((eeprom_data & 0x0100) >> 8));
-		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
-		if (eeprom_data != 0x4f52) {
-			nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x4f52 but was 0x%04X\n",
-					eeprom_data);
-			goto no_fw_rev;
-		}
-		eeprom_offset = next_section_address;
-
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
-		nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
-				eeprom_offset + 2, eeprom_data);
-		next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
-		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
-		if (eeprom_data != 0x5746) {
-			nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5746 but was 0x%04X\n",
-					eeprom_data);
-			goto no_fw_rev;
-		}
-		eeprom_offset = next_section_address;
-
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
-		nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
-				eeprom_offset + 2, eeprom_data);
-		next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
-		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
-		if (eeprom_data != 0x5753) {
-			nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5753 but was 0x%04X\n",
-					eeprom_data);
-			goto no_fw_rev;
-		}
-		eeprom_offset = next_section_address;
-
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
-		nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
-				eeprom_offset + 2, eeprom_data);
-		next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
-		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
-		if (eeprom_data != 0x414d) {
-			nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n",
-					eeprom_data);
-			goto no_fw_rev;
-		}
-		eeprom_offset = next_section_address;
-
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
-		nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
-				eeprom_offset + 2, eeprom_data);
-		next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
-		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
-		if (eeprom_data != 0x464e) {
-			nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x464e but was 0x%04X\n",
-					eeprom_data);
-			goto no_fw_rev;
-		}
-		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 8);
-		printk(PFX "Firmware version %u.%u\n", (u8)(eeprom_data>>8), (u8)eeprom_data);
-		major_ver = (u8)(eeprom_data >> 8);
-		minor_ver = (u8)(eeprom_data);
-
-		if (nes_drv_opt & NES_DRV_OPT_DISABLE_VIRT_WQ) {
-			nes_debug(NES_DBG_HW, "Virtual WQs have been disabled\n");
-		} else if (((major_ver == 2) && (minor_ver > 21)) || ((major_ver > 2) && (major_ver != 255))) {
-			nesadapter->virtwq = 1;
-		}
-		if (((major_ver == 3) && (minor_ver >= 16)) || (major_ver > 3))
-			nesadapter->send_term_ok = 1;
-
-		if (nes_drv_opt & NES_DRV_OPT_ENABLE_PAU) {
-			if (!nes_set_pau(nesdev))
-				nesadapter->allow_unaligned_fpdus = 1;
-		}
-
-		nesadapter->firmware_version = (((u32)(u8)(eeprom_data>>8))  <<  16) +
-				(u32)((u8)eeprom_data);
-
-		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 10);
-		printk(PFX "EEPROM version %u.%u\n", (u8)(eeprom_data>>8), (u8)eeprom_data);
-		nesadapter->eeprom_version = (((u32)(u8)(eeprom_data>>8)) << 16) +
-				(u32)((u8)eeprom_data);
-
-no_fw_rev:
-		/* eeprom is valid */
-		eeprom_offset = nesadapter->software_eeprom_offset;
-		eeprom_offset += 8;
-		nesadapter->netdev_max = (u8)nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		eeprom_offset += 2;
-		mac_addr_high = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		eeprom_offset += 2;
-		mac_addr_low = (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		eeprom_offset += 2;
-		mac_addr_low <<= 16;
-		mac_addr_low += (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nes_debug(NES_DBG_HW, "Base MAC Address = 0x%04X%08X\n",
-				mac_addr_high, mac_addr_low);
-		nes_debug(NES_DBG_HW, "MAC Address count = %u\n", nesadapter->netdev_max);
-
-		nesadapter->mac_addr_low = mac_addr_low;
-		nesadapter->mac_addr_high = mac_addr_high;
-
-		/* Read the Phy Type array */
-		eeprom_offset += 10;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nesadapter->phy_type[0] = (u8)(eeprom_data >> 8);
-		nesadapter->phy_type[1] = (u8)eeprom_data;
-
-		/* Read the port array */
-		eeprom_offset += 2;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nesadapter->phy_type[2] = (u8)(eeprom_data >> 8);
-		nesadapter->phy_type[3] = (u8)eeprom_data;
-		/* port_count is set by soft reset reg */
-		nes_debug(NES_DBG_HW, "port_count = %u, port 0 -> %u, port 1 -> %u,"
-				" port 2 -> %u, port 3 -> %u\n",
-				nesadapter->port_count,
-				nesadapter->phy_type[0], nesadapter->phy_type[1],
-				nesadapter->phy_type[2], nesadapter->phy_type[3]);
-
-		/* Read PD config array */
-		eeprom_offset += 10;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nesadapter->pd_config_size[0] = eeprom_data;
-		eeprom_offset += 2;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nesadapter->pd_config_base[0] = eeprom_data;
-		nes_debug(NES_DBG_HW, "PD0 config, size=0x%04x, base=0x%04x\n",
-				nesadapter->pd_config_size[0], nesadapter->pd_config_base[0]);
-
-		eeprom_offset += 2;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nesadapter->pd_config_size[1] = eeprom_data;
-		eeprom_offset += 2;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nesadapter->pd_config_base[1] = eeprom_data;
-		nes_debug(NES_DBG_HW, "PD1 config, size=0x%04x, base=0x%04x\n",
-				nesadapter->pd_config_size[1], nesadapter->pd_config_base[1]);
-
-		eeprom_offset += 2;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nesadapter->pd_config_size[2] = eeprom_data;
-		eeprom_offset += 2;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nesadapter->pd_config_base[2] = eeprom_data;
-		nes_debug(NES_DBG_HW, "PD2 config, size=0x%04x, base=0x%04x\n",
-				nesadapter->pd_config_size[2], nesadapter->pd_config_base[2]);
-
-		eeprom_offset += 2;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nesadapter->pd_config_size[3] = eeprom_data;
-		eeprom_offset += 2;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nesadapter->pd_config_base[3] = eeprom_data;
-		nes_debug(NES_DBG_HW, "PD3 config, size=0x%04x, base=0x%04x\n",
-				nesadapter->pd_config_size[3], nesadapter->pd_config_base[3]);
-
-		/* Read Rx Pool Size */
-		eeprom_offset += 22;   /* 46 */
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		eeprom_offset += 2;
-		nesadapter->rx_pool_size = (((u32)eeprom_data) << 16) +
-				nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nes_debug(NES_DBG_HW, "rx_pool_size = 0x%08X\n", nesadapter->rx_pool_size);
-
-		eeprom_offset += 2;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		eeprom_offset += 2;
-		nesadapter->tx_pool_size = (((u32)eeprom_data) << 16) +
-				nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nes_debug(NES_DBG_HW, "tx_pool_size = 0x%08X\n", nesadapter->tx_pool_size);
-
-		eeprom_offset += 2;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		eeprom_offset += 2;
-		nesadapter->rx_threshold = (((u32)eeprom_data) << 16) +
-				nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nes_debug(NES_DBG_HW, "rx_threshold = 0x%08X\n", nesadapter->rx_threshold);
-
-		eeprom_offset += 2;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		eeprom_offset += 2;
-		nesadapter->tcp_timer_core_clk_divisor = (((u32)eeprom_data) << 16) +
-				nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nes_debug(NES_DBG_HW, "tcp_timer_core_clk_divisor = 0x%08X\n",
-				nesadapter->tcp_timer_core_clk_divisor);
-
-		eeprom_offset += 2;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		eeprom_offset += 2;
-		nesadapter->iwarp_config = (((u32)eeprom_data) << 16) +
-				nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nes_debug(NES_DBG_HW, "iwarp_config = 0x%08X\n", nesadapter->iwarp_config);
-
-		eeprom_offset += 2;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		eeprom_offset += 2;
-		nesadapter->cm_config = (((u32)eeprom_data) << 16) +
-				nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nes_debug(NES_DBG_HW, "cm_config = 0x%08X\n", nesadapter->cm_config);
-
-		eeprom_offset += 2;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		eeprom_offset += 2;
-		nesadapter->sws_timer_config = (((u32)eeprom_data) << 16) +
-				nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nes_debug(NES_DBG_HW, "sws_timer_config = 0x%08X\n", nesadapter->sws_timer_config);
-
-		eeprom_offset += 2;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		eeprom_offset += 2;
-		nesadapter->tcp_config1 = (((u32)eeprom_data) << 16) +
-				nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nes_debug(NES_DBG_HW, "tcp_config1 = 0x%08X\n", nesadapter->tcp_config1);
-
-		eeprom_offset += 2;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		eeprom_offset += 2;
-		nesadapter->wqm_wat = (((u32)eeprom_data) << 16) +
-				nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nes_debug(NES_DBG_HW, "wqm_wat = 0x%08X\n", nesadapter->wqm_wat);
-
-		eeprom_offset += 2;
-		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		eeprom_offset += 2;
-		nesadapter->core_clock = (((u32)eeprom_data) << 16) +
-				nes_read16_eeprom(nesdev->regs, eeprom_offset);
-		nes_debug(NES_DBG_HW, "core_clock = 0x%08X\n", nesadapter->core_clock);
-
-		if ((sw_section_ver) && (nesadapter->hw_rev != NE020_REV)) {
-			eeprom_offset += 2;
-			eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-			nesadapter->phy_index[0] = (eeprom_data & 0xff00)>>8;
-			nesadapter->phy_index[1] = eeprom_data & 0x00ff;
-			eeprom_offset += 2;
-			eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-			nesadapter->phy_index[2] = (eeprom_data & 0xff00)>>8;
-			nesadapter->phy_index[3] = eeprom_data & 0x00ff;
-		} else {
-			nesadapter->phy_index[0] = 4;
-			nesadapter->phy_index[1] = 5;
-			nesadapter->phy_index[2] = 6;
-			nesadapter->phy_index[3] = 7;
-		}
-		nes_debug(NES_DBG_HW, "Phy address map = 0 > %u,  1 > %u, 2 > %u, 3 > %u\n",
-			   nesadapter->phy_index[0],nesadapter->phy_index[1],
-			   nesadapter->phy_index[2],nesadapter->phy_index[3]);
-	}
-
-	return 0;
-}
-
-
-/**
- * nes_read16_eeprom
- */
-static u16 nes_read16_eeprom(void __iomem *addr, u16 offset)
-{
-	writel(NES_EEPROM_READ_REQUEST + (offset >> 1),
-			(void __iomem *)addr + NES_EEPROM_COMMAND);
-
-	do {
-	} while (readl((void __iomem *)addr + NES_EEPROM_COMMAND) &
-			NES_EEPROM_READ_REQUEST);
-
-	return readw((void __iomem *)addr + NES_EEPROM_DATA);
-}
-
-
-/**
- * nes_write_1G_phy_reg
- */
-void nes_write_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 data)
-{
-	u32 u32temp;
-	u32 counter;
-
-	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
-			0x50020000 | data | ((u32)phy_reg << 18) | ((u32)phy_addr << 23));
-	for (counter = 0; counter < 100 ; counter++) {
-		udelay(30);
-		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
-		if (u32temp & 1) {
-			/* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */
-			nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
-			break;
-		}
-	}
-	if (!(u32temp & 1))
-		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
-				u32temp);
-}
-
-
-/**
- * nes_read_1G_phy_reg
- * Issues the MDIO read, waits for the PHY to respond, and returns
- * the register contents through the data pointer (0xffff on timeout).
- */
-void nes_read_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 *data)
-{
-	u32 u32temp;
-	u32 counter;
-
-	/* nes_debug(NES_DBG_PHY, "phy addr = %d, mac_index = %d\n",
-			phy_addr, nesdev->mac_index); */
-
-	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
-			0x60020000 | ((u32)phy_reg << 18) | ((u32)phy_addr << 23));
-	for (counter = 0; counter < 100 ; counter++) {
-		udelay(30);
-		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
-		if (u32temp & 1) {
-			/* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */
-			nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
-			break;
-		}
-	}
-	if (!(u32temp & 1)) {
-		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
-				u32temp);
-		*data = 0xffff;
-	} else {
-		*data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-	}
-}
-
-
-/**
- * nes_write_10G_phy_reg
- */
-void nes_write_10G_phy_reg(struct nes_device *nesdev, u16 phy_addr, u8 dev_addr, u16 phy_reg,
-		u16 data)
-{
-	u32 port_addr;
-	u32 u32temp;
-	u32 counter;
-
-	port_addr = phy_addr;
-
-	/* set address */
-	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
-			0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
-	for (counter = 0; counter < 100 ; counter++) {
-		udelay(30);
-		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
-		if (u32temp & 1) {
-			nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
-			break;
-		}
-	}
-	if (!(u32temp & 1))
-		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
-				u32temp);
-
-	/* set data */
-	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
-			0x10020000 | (u32)data | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
-	for (counter = 0; counter < 100 ; counter++) {
-		udelay(30);
-		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
-		if (u32temp & 1) {
-			nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
-			break;
-		}
-	}
-	if (!(u32temp & 1))
-		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
-				u32temp);
-}
-
-
-/**
- * nes_read_10G_phy_reg
- * This routine only issues the read, the data must be read
- * separately.
- */
-void nes_read_10G_phy_reg(struct nes_device *nesdev, u8 phy_addr, u8 dev_addr, u16 phy_reg)
-{
-	u32 port_addr;
-	u32 u32temp;
-	u32 counter;
-
-	port_addr = phy_addr;
-
-	/* set address */
-	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
-			0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
-	for (counter = 0; counter < 100 ; counter++) {
-		udelay(30);
-		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
-		if (u32temp & 1) {
-			nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
-			break;
-		}
-	}
-	if (!(u32temp & 1))
-		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
-				u32temp);
-
-	/* issue read */
-	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
-			0x30020000 | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
-	for (counter = 0; counter < 100 ; counter++) {
-		udelay(30);
-		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
-		if (u32temp & 1) {
-			nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
-			break;
-		}
-	}
-	if (!(u32temp & 1))
-		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
-				u32temp);
-}
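
The two 10G helpers above follow the Clause 45 style two-step MDIO sequence (an address cycle, then a separate data cycle), and the read variant deliberately stops after starting the read. A minimal caller sketch, assuming the result is then pulled out of NES_IDX_MAC_MDIO_CONTROL the same way the 1G helper above does; example_read_10G_phy is illustrative and was never part of this file:

	static u16 example_read_10G_phy(struct nes_device *nesdev, u8 phy_addr,
					u8 dev_addr, u16 phy_reg)
	{
		/* start the address and read cycles; the helper polls for completion */
		nes_read_10G_phy_reg(nesdev, phy_addr, dev_addr, phy_reg);

		/* the 16-bit result is left in the low half of the MDIO control register */
		return (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
	}
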
-
-
-/**
- * nes_get_cqp_request
- */
-struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev)
-{
-	unsigned long flags;
-	struct nes_cqp_request *cqp_request = NULL;
-
-	if (!list_empty(&nesdev->cqp_avail_reqs)) {
-		spin_lock_irqsave(&nesdev->cqp.lock, flags);
-		if (!list_empty(&nesdev->cqp_avail_reqs)) {
-			cqp_request = list_entry(nesdev->cqp_avail_reqs.next,
-				struct nes_cqp_request, list);
-			list_del_init(&cqp_request->list);
-		}
-		spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-	}
-	if (cqp_request == NULL) {
-		cqp_request = kzalloc(sizeof(struct nes_cqp_request), GFP_ATOMIC);
-		if (cqp_request) {
-			cqp_request->dynamic = 1;
-			INIT_LIST_HEAD(&cqp_request->list);
-		}
-	}
-
-	if (cqp_request) {
-		init_waitqueue_head(&cqp_request->waitq);
-		cqp_request->waiting = 0;
-		cqp_request->request_done = 0;
-		cqp_request->callback = 0;
-		init_waitqueue_head(&cqp_request->waitq);
-		nes_debug(NES_DBG_CQP, "Got cqp request %p from the available list \n",
-				cqp_request);
-	} else
-		printk(KERN_ERR PFX "%s: Could not allocate a CQP request.\n",
-			   __func__);
-
-	return cqp_request;
-}
-
-void nes_free_cqp_request(struct nes_device *nesdev,
-			  struct nes_cqp_request *cqp_request)
-{
-	unsigned long flags;
-
-	nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n",
-		  cqp_request,
-		  le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX]) & 0x3f);
-
-	if (cqp_request->dynamic) {
-		kfree(cqp_request);
-	} else {
-		spin_lock_irqsave(&nesdev->cqp.lock, flags);
-		list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-		spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-	}
-}
-
-void nes_put_cqp_request(struct nes_device *nesdev,
-			 struct nes_cqp_request *cqp_request)
-{
-	if (atomic_dec_and_test(&cqp_request->refcount))
-		nes_free_cqp_request(nesdev, cqp_request);
-}
-
-
-/**
- * nes_post_cqp_request
- */
-void nes_post_cqp_request(struct nes_device *nesdev,
-			  struct nes_cqp_request *cqp_request)
-{
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	unsigned long flags;
-	u32 cqp_head;
-	u64 u64temp;
-	u32 opcode;
-	int ctx_index = NES_CQP_WQE_COMP_CTX_LOW_IDX;
-
-	spin_lock_irqsave(&nesdev->cqp.lock, flags);
-
-	if (((((nesdev->cqp.sq_tail+(nesdev->cqp.sq_size*2))-nesdev->cqp.sq_head) &
-			(nesdev->cqp.sq_size - 1)) != 1)
-			&& (list_empty(&nesdev->cqp_pending_reqs))) {
-		cqp_head = nesdev->cqp.sq_head++;
-		nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
-		cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-		memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe));
-		opcode = le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]);
-		if ((opcode & NES_CQP_OPCODE_MASK) == NES_CQP_DOWNLOAD_SEGMENT)
-			ctx_index = NES_CQP_WQE_DL_COMP_CTX_LOW_IDX;
-		barrier();
-		u64temp = (unsigned long)cqp_request;
-		set_wqe_64bit_value(cqp_wqe->wqe_words, ctx_index, u64temp);
-		nes_debug(NES_DBG_CQP, "CQP request (opcode 0x%02X), line 1 = 0x%08X put on CQPs SQ,"
-			" request = %p, cqp_head = %u, cqp_tail = %u, cqp_size = %u,"
-			" waiting = %d, refcount = %d.\n",
-			opcode & NES_CQP_OPCODE_MASK,
-			le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX]), cqp_request,
-			nesdev->cqp.sq_head, nesdev->cqp.sq_tail, nesdev->cqp.sq_size,
-			cqp_request->waiting, atomic_read(&cqp_request->refcount));
-
-		barrier();
-
-		/* Ring doorbell (1 WQEs) */
-		nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id);
-
-		barrier();
-	} else {
-		nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X), line 1 = 0x%08X"
-				" put on the pending queue.\n",
-				cqp_request,
-				le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f,
-				le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_ID_IDX]));
-		list_add_tail(&cqp_request->list, &nesdev->cqp_pending_reqs);
-	}
-
-	spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-
-	return;
-}
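
Taken together, nes_get_cqp_request(), nes_post_cqp_request() and nes_put_cqp_request() give the verbs code a simple synchronous pattern: take a pooled (or, under pressure, dynamically allocated) request, take two references so that neither the waiter nor the completion handler frees it early, post it, sleep on the waitqueue, then drop the waiter's reference. A condensed sketch of that pattern, distilled from the callers in nes_verbs.c below (example_sync_cqp_op itself is illustrative):

	static int example_sync_cqp_op(struct nes_device *nesdev, u32 opcode)
	{
		struct nes_cqp_request *cqp_request;
		struct nes_hw_cqp_wqe *cqp_wqe;
		int ret;

		cqp_request = nes_get_cqp_request(nesdev);
		if (!cqp_request)
			return -ENOMEM;
		cqp_request->waiting = 1;

		cqp_wqe = &cqp_request->cqp_wqe;
		nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);

		/* one reference for this waiter, one for the CQP completion handler */
		atomic_set(&cqp_request->refcount, 2);
		nes_post_cqp_request(nesdev, cqp_request);

		ret = wait_event_timeout(cqp_request->waitq,
					 cqp_request->request_done != 0,
					 NES_EVENT_TIMEOUT);
		if (!ret || cqp_request->major_code) {
			nes_put_cqp_request(nesdev, cqp_request);
			return !ret ? -ETIME : -EIO;
		}

		nes_put_cqp_request(nesdev, cqp_request);
		return 0;
	}
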
-
-/**
- * nes_arp_table
- */
-int nes_arp_table(struct nes_device *nesdev, u32 ip_addr, u8 *mac_addr, u32 action)
-{
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	int arp_index;
-	int err = 0;
-	__be32 tmp_addr;
-
-	for (arp_index = 0; (u32) arp_index < nesadapter->arp_table_size; arp_index++) {
-		if (nesadapter->arp_table[arp_index].ip_addr == ip_addr)
-			break;
-	}
-
-	if (action == NES_ARP_ADD) {
-		if (arp_index != nesadapter->arp_table_size) {
-			return -1;
-		}
-
-		arp_index = 0;
-		err = nes_alloc_resource(nesadapter, nesadapter->allocated_arps,
-				nesadapter->arp_table_size, (u32 *)&arp_index, &nesadapter->next_arp_index, NES_RESOURCE_ARP);
-		if (err) {
-			nes_debug(NES_DBG_NETDEV, "nes_alloc_resource returned error = %u\n", err);
-			return err;
-		}
-		nes_debug(NES_DBG_NETDEV, "ADD, arp_index=%d\n", arp_index);
-
-		nesadapter->arp_table[arp_index].ip_addr = ip_addr;
-		memcpy(nesadapter->arp_table[arp_index].mac_addr, mac_addr, ETH_ALEN);
-		return arp_index;
-	}
-
-	/* DELETE or RESOLVE */
-	if (arp_index == nesadapter->arp_table_size) {
-		tmp_addr = cpu_to_be32(ip_addr);
-		nes_debug(NES_DBG_NETDEV, "MAC for %pI4 not in ARP table - cannot %s\n",
-			  &tmp_addr, action == NES_ARP_RESOLVE ? "resolve" : "delete");
-		return -1;
-	}
-
-	if (action == NES_ARP_RESOLVE) {
-		nes_debug(NES_DBG_NETDEV, "RESOLVE, arp_index=%d\n", arp_index);
-		return arp_index;
-	}
-
-	if (action == NES_ARP_DELETE) {
-		nes_debug(NES_DBG_NETDEV, "DELETE, arp_index=%d\n", arp_index);
-		nesadapter->arp_table[arp_index].ip_addr = 0;
-		eth_zero_addr(nesadapter->arp_table[arp_index].mac_addr);
-		nes_free_resource(nesadapter, nesadapter->allocated_arps, arp_index);
-		return arp_index;
-	}
-
-	return -1;
-}
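
nes_arp_table() multiplexes three operations on its action argument: NES_ARP_ADD claims a slot and returns its index, NES_ARP_RESOLVE returns the index of an existing entry without touching it, and NES_ARP_DELETE clears the entry; a negative return signals failure in every case. A hedged usage sketch (the helper and its debug message are illustrative, not quoted from the connection manager):

	static int example_track_peer(struct nes_device *nesdev, u32 ip_addr, u8 *mac_addr)
	{
		int arp_index;

		/* cache the peer's MAC; ip_addr is in host byte order here */
		if (nes_arp_table(nesdev, ip_addr, mac_addr, NES_ARP_ADD) < 0)
			nes_debug(NES_DBG_CM, "ARP entry already present or table full\n");

		/* look the entry up again without modifying it */
		arp_index = nes_arp_table(nesdev, ip_addr, NULL, NES_ARP_RESOLVE);
		if (arp_index < 0)
			return -EINVAL;

		/* and drop it once the connection is torn down */
		nes_arp_table(nesdev, ip_addr, NULL, NES_ARP_DELETE);
		return 0;
	}
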
-
-
-/**
- * nes_mh_fix
- */
-void nes_mh_fix(struct timer_list *t)
-{
-	struct nes_adapter *nesadapter = from_timer(nesadapter, t, mh_timer);
-	struct nes_device *nesdev = nesadapter->nesdev;
-	unsigned long flags;
-	struct nes_vnic *nesvnic;
-	u32 used_chunks_tx;
-	u32 temp_used_chunks_tx;
-	u32 temp_last_used_chunks_tx;
-	u32 used_chunks_mask;
-	u32 mac_tx_frames_low;
-	u32 mac_tx_frames_high;
-	u32 mac_tx_pauses;
-	u32 reset_value;
-	u32 tx_control;
-	u32 tx_config;
-	u32 tx_pause_quanta;
-	u32 rx_control;
-	u32 rx_config;
-	u32 mac_exact_match;
-	u32 mpp_debug;
-	u32 i=0;
-	u32 chunks_tx_progress = 0;
-
-	spin_lock_irqsave(&nesadapter->phy_lock, flags);
-	if ((nesadapter->mac_sw_state[0] != NES_MAC_SW_IDLE) || (nesadapter->mac_link_down[0])) {
-		spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-		goto no_mh_work;
-	}
-	nesadapter->mac_sw_state[0] = NES_MAC_SW_MH;
-	spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-	do {
-		mac_tx_frames_low = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_LOW);
-		mac_tx_frames_high = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_HIGH);
-		mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES);
-		used_chunks_tx = nes_read_indexed(nesdev, NES_IDX_USED_CHUNKS_TX);
-		nesdev->mac_pause_frames_sent += mac_tx_pauses;
-		used_chunks_mask = 0;
-		temp_used_chunks_tx = used_chunks_tx;
-		temp_last_used_chunks_tx = nesdev->last_used_chunks_tx;
-
-		if (nesdev->netdev[0]) {
-			nesvnic = netdev_priv(nesdev->netdev[0]);
-		} else {
-			break;
-		}
-
-		for (i=0; i<4; i++) {
-			used_chunks_mask <<= 8;
-			if (nesvnic->qp_nic_index[i] != 0xff) {
-				used_chunks_mask |= 0xff;
-				if ((temp_used_chunks_tx&0xff)<(temp_last_used_chunks_tx&0xff)) {
-					chunks_tx_progress = 1;
-				}
-			}
-			temp_used_chunks_tx >>= 8;
-			temp_last_used_chunks_tx >>= 8;
-		}
-		if ((mac_tx_frames_low) || (mac_tx_frames_high) ||
-			(!(used_chunks_tx&used_chunks_mask)) ||
-			(!(nesdev->last_used_chunks_tx&used_chunks_mask)) ||
-			(chunks_tx_progress) ) {
-			nesdev->last_used_chunks_tx = used_chunks_tx;
-			break;
-		}
-		nesdev->last_used_chunks_tx = used_chunks_tx;
-		barrier();
-
-		nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000005);
-		mh_pauses_sent++;
-		mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES);
-		if (mac_tx_pauses) {
-			nesdev->mac_pause_frames_sent += mac_tx_pauses;
-			break;
-		}
-
-		tx_control = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONTROL);
-		tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG);
-		tx_pause_quanta = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA);
-		rx_control = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONTROL);
-		rx_config = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONFIG);
-		mac_exact_match = nes_read_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM);
-		mpp_debug = nes_read_indexed(nesdev, NES_IDX_MPP_DEBUG);
-
-		/* one last ditch effort to avoid a false positive */
-		mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES);
-		if (mac_tx_pauses) {
-			nesdev->last_mac_tx_pauses = nesdev->mac_pause_frames_sent;
-			nes_debug(NES_DBG_HW, "failsafe caught slow outbound pause\n");
-			break;
-		}
-		mh_detected++;
-
-		nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000000);
-		nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, 0x00000000);
-		reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
-
-		nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value | 0x0000001d);
-
-		while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
-				& 0x00000040) != 0x00000040) && (i++ < 5000)) {
-			/* mdelay(1); */
-		}
-
-		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008);
-		nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0);
-
-		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7);
-		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000);
-		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000);
-		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000);
-		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000);
-		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000);
-		if (nesadapter->OneG_Mode) {
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222);
-		} else {
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222);
-		}
-		nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_STATUS0);
-		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff);
-
-		nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, tx_control);
-		nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config);
-		nes_write_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA, tx_pause_quanta);
-		nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONTROL, rx_control);
-		nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONFIG, rx_config);
-		nes_write_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM, mac_exact_match);
-		nes_write_indexed(nesdev, NES_IDX_MPP_DEBUG, mpp_debug);
-
-	} while (0);
-
-	nesadapter->mac_sw_state[0] = NES_MAC_SW_IDLE;
-no_mh_work:
-	nesdev->nesadapter->mh_timer.expires = jiffies + (HZ/5);
-	add_timer(&nesdev->nesadapter->mh_timer);
-}
-
-/**
- * nes_clc
- */
-void nes_clc(struct timer_list *t)
-{
-	struct nes_adapter *nesadapter = from_timer(nesadapter, t, lc_timer);
-	unsigned long flags;
-
-	spin_lock_irqsave(&nesadapter->phy_lock, flags);
-	nesadapter->link_interrupt_count[0] = 0;
-	nesadapter->link_interrupt_count[1] = 0;
-	nesadapter->link_interrupt_count[2] = 0;
-	nesadapter->link_interrupt_count[3] = 0;
-	spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-
-	nesadapter->lc_timer.expires = jiffies + 3600 * HZ;  /* 1 hour */
-	add_timer(&nesadapter->lc_timer);
-}
-
-
-/**
- * nes_dump_mem
- */
-void nes_dump_mem(unsigned int dump_debug_level, void *addr, int length)
-{
-	if (!(nes_debug_level & dump_debug_level)) {
-		return;
-	}
-
-	if (length > 0x100) {
-		nes_debug(dump_debug_level, "Length truncated from %x to %x\n", length, 0x100);
-		length = 0x100;
-	}
-	nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", addr, length, length);
-
-	print_hex_dump(KERN_ERR, PFX, DUMP_PREFIX_NONE, 16, 1, addr, length, true);
-}
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
deleted file mode 100644
index 6940c72..0000000
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ /dev/null
@@ -1,3848 +0,0 @@
-/*
- * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/random.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <asm/byteorder.h>
-
-#include <rdma/ib_verbs.h>
-#include <rdma/iw_cm.h>
-#include <rdma/ib_user_verbs.h>
-
-#include "nes.h"
-
-#include <rdma/ib_umem.h>
-
-atomic_t mod_qp_timouts;
-atomic_t qps_created;
-atomic_t sw_qps_destroyed;
-
-static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev);
-static int nes_dereg_mr(struct ib_mr *ib_mr);
-
-/**
- * nes_alloc_mw
- */
-static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd, enum ib_mw_type type,
-				  struct ib_udata *udata)
-{
-	struct nes_pd *nespd = to_nespd(ibpd);
-	struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct nes_cqp_request *cqp_request;
-	struct nes_mr *nesmr;
-	struct ib_mw *ibmw;
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	int ret;
-	u32 stag;
-	u32 stag_index = 0;
-	u32 next_stag_index = 0;
-	u32 driver_key = 0;
-	u8 stag_key = 0;
-
-	if (type != IB_MW_TYPE_1)
-		return ERR_PTR(-EINVAL);
-
-	get_random_bytes(&next_stag_index, sizeof(next_stag_index));
-	stag_key = (u8)next_stag_index;
-
-	driver_key = 0;
-
-	next_stag_index >>= 8;
-	next_stag_index %= nesadapter->max_mr;
-
-	ret = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs,
-			nesadapter->max_mr, &stag_index, &next_stag_index, NES_RESOURCE_MW);
-	if (ret) {
-		return ERR_PTR(ret);
-	}
-
-	nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
-	if (!nesmr) {
-		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	stag = stag_index << 8;
-	stag |= driver_key;
-	stag += (u32)stag_key;
-
-	nes_debug(NES_DBG_MR, "Registering STag 0x%08X, index = 0x%08X\n",
-			stag, stag_index);
-
-	/* Register the region with the adapter */
-	cqp_request = nes_get_cqp_request(nesdev);
-	if (cqp_request == NULL) {
-		kfree(nesmr);
-		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	cqp_request->waiting = 1;
-	cqp_wqe = &cqp_request->cqp_wqe;
-
-	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] =
-			cpu_to_le32( NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_RIGHTS_REMOTE_READ |
-			NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_VA_TO |
-			NES_CQP_STAG_REM_ACC_EN);
-
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX, (nespd->pd_id & 0x00007fff));
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag);
-
-	atomic_set(&cqp_request->refcount, 2);
-	nes_post_cqp_request(nesdev, cqp_request);
-
-	/* Wait for CQP */
-	ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
-			NES_EVENT_TIMEOUT);
-	nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u,"
-			" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
-			stag, ret, cqp_request->major_code, cqp_request->minor_code);
-	if ((!ret) || (cqp_request->major_code)) {
-		nes_put_cqp_request(nesdev, cqp_request);
-		kfree(nesmr);
-		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-		if (!ret) {
-			return ERR_PTR(-ETIME);
-		} else {
-			return ERR_PTR(-ENOMEM);
-		}
-	}
-	nes_put_cqp_request(nesdev, cqp_request);
-
-	nesmr->ibmw.rkey = stag;
-	nesmr->mode = IWNES_MEMREG_TYPE_MW;
-	ibmw = &nesmr->ibmw;
-	nesmr->pbl_4k = 0;
-	nesmr->pbls_used = 0;
-
-	return ibmw;
-}
-
-
-/**
- * nes_dealloc_mw
- */
-static int nes_dealloc_mw(struct ib_mw *ibmw)
-{
-	struct nes_mr *nesmr = to_nesmw(ibmw);
-	struct nes_vnic *nesvnic = to_nesvnic(ibmw->device);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	struct nes_cqp_request *cqp_request;
-	int err = 0;
-	int ret;
-
-	/* Deallocate the window with the adapter */
-	cqp_request = nes_get_cqp_request(nesdev);
-	if (cqp_request == NULL) {
-		nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
-		return -ENOMEM;
-	}
-	cqp_request->waiting = 1;
-	cqp_wqe = &cqp_request->cqp_wqe;
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, NES_CQP_DEALLOCATE_STAG);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ibmw->rkey);
-
-	atomic_set(&cqp_request->refcount, 2);
-	nes_post_cqp_request(nesdev, cqp_request);
-
-	/* Wait for CQP */
-	nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X to complete.\n",
-			ibmw->rkey);
-	ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
-			NES_EVENT_TIMEOUT);
-	nes_debug(NES_DBG_MR, "Deallocate STag completed, wait_event_timeout ret = %u,"
-			" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
-			ret, cqp_request->major_code, cqp_request->minor_code);
-	if (!ret)
-		err = -ETIME;
-	else if (cqp_request->major_code)
-		err = -EIO;
-
-	nes_put_cqp_request(nesdev, cqp_request);
-
-	nes_free_resource(nesadapter, nesadapter->allocated_mrs,
-			(ibmw->rkey & 0x0fffff00) >> 8);
-	kfree(nesmr);
-
-	return err;
-}
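
The allocation and teardown above also show how an nes STag is composed: the low byte is a random per-allocation key, and bits 8-27 carry the index handed out by nes_alloc_resource(), which is exactly what nes_free_resource() needs back. A two-line illustrative decomposition (variable names are descriptive only):

	u32 stag_index = (stag & 0x0fffff00) >> 8;	/* index from nes_alloc_resource() */
	u8  stag_key   = stag & 0xff;			/* random key byte mixed in at allocation */
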
-
-
-/*
- * nes_alloc_fast_mr
- */
-static int alloc_fast_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
-			     u32 stag, u32 page_count)
-{
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	struct nes_cqp_request *cqp_request;
-	unsigned long flags;
-	int ret;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	u32 opcode = 0;
-	u16 major_code;
-	u64 region_length = page_count * PAGE_SIZE;
-
-
-	cqp_request = nes_get_cqp_request(nesdev);
-	if (cqp_request == NULL) {
-		nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
-		return -ENOMEM;
-	}
-	nes_debug(NES_DBG_MR, "alloc_fast_reg_mr: page_count = %d, "
-			      "region_length = %llu\n",
-			      page_count, region_length);
-	cqp_request->waiting = 1;
-	cqp_wqe = &cqp_request->cqp_wqe;
-
-	spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-	if (nesadapter->free_4kpbl > 0) {
-		nesadapter->free_4kpbl--;
-		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-	} else {
-		/* No 4kpbl's available: */
-		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-		nes_debug(NES_DBG_MR, "Out of Pbls\n");
-		nes_free_cqp_request(nesdev, cqp_request);
-		return -ENOMEM;
-	}
-
-	opcode = NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_MR |
-		 NES_CQP_STAG_PBL_BLK_SIZE | NES_CQP_STAG_VA_TO |
-		 NES_CQP_STAG_REM_ACC_EN;
-	/*
-	 * The current OFED API does not support the zero based TO option.
-	 * If added then need to changed the NES_CQP_STAG_VA* option.  Also,
-	 * the API does not support that ability to have the MR set for local
-	 * access only when created and not allow the SQ op to override. Given
-	 * this the remote enable must be set here.
-	 */
-
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, 1);
-
-	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] =
-			cpu_to_le32((u32)(region_length >> 8) & 0xff000000);
-	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |=
-			cpu_to_le32(nespd->pd_id & 0x00007fff);
-
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag);
-	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, 0);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, 0);
-	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, 0);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (page_count * 8));
-	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE);
-	barrier();
-
-	atomic_set(&cqp_request->refcount, 2);
-	nes_post_cqp_request(nesdev, cqp_request);
-
-	/* Wait for CQP */
-	ret = wait_event_timeout(cqp_request->waitq,
-				 (0 != cqp_request->request_done),
-				 NES_EVENT_TIMEOUT);
-
-	nes_debug(NES_DBG_MR, "Allocate STag 0x%08X completed, "
-		  "wait_event_timeout ret = %u, CQP Major:Minor codes = "
-		  "0x%04X:0x%04X.\n", stag, ret, cqp_request->major_code,
-		  cqp_request->minor_code);
-	major_code = cqp_request->major_code;
-	nes_put_cqp_request(nesdev, cqp_request);
-
-	if (!ret || major_code) {
-		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-		nesadapter->free_4kpbl++;
-		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-	}
-
-	if (!ret)
-		return -ETIME;
-	else if (major_code)
-		return -EIO;
-	return 0;
-}
-
-/*
- * nes_alloc_mr
- */
-static struct ib_mr *nes_alloc_mr(struct ib_pd *ibpd,
-				  enum ib_mr_type mr_type,
-				  u32 max_num_sg)
-{
-	struct nes_pd *nespd = to_nespd(ibpd);
-	struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-
-	u32 next_stag_index;
-	u8 stag_key = 0;
-	u32 driver_key = 0;
-	int err = 0;
-	u32 stag_index = 0;
-	struct nes_mr *nesmr;
-	u32 stag;
-	int ret;
-	struct ib_mr *ibmr;
-
-	if (mr_type != IB_MR_TYPE_MEM_REG)
-		return ERR_PTR(-EINVAL);
-
-	if (max_num_sg > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64)))
-		return ERR_PTR(-E2BIG);
-
-/*
- * Note:  Set to always use a fixed length single page entry PBL.  This is to allow
- *	 for the fast_reg_mr operation to always know the size of the PBL.
- */
-	if (max_num_sg > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64)))
-		return ERR_PTR(-E2BIG);
-
-	get_random_bytes(&next_stag_index, sizeof(next_stag_index));
-	stag_key = (u8)next_stag_index;
-	next_stag_index >>= 8;
-	next_stag_index %= nesadapter->max_mr;
-
-	err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs,
-				 nesadapter->max_mr, &stag_index,
-				 &next_stag_index, NES_RESOURCE_FAST_MR);
-	if (err)
-		return ERR_PTR(err);
-
-	nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
-	if (!nesmr) {
-		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	stag = stag_index << 8;
-	stag |= driver_key;
-	stag += (u32)stag_key;
-
-	nes_debug(NES_DBG_MR, "Allocating STag 0x%08X index = 0x%08X\n",
-		  stag, stag_index);
-
-	ret = alloc_fast_reg_mr(nesdev, nespd, stag, max_num_sg);
-
-	if (ret == 0) {
-		nesmr->ibmr.rkey = stag;
-		nesmr->ibmr.lkey = stag;
-		nesmr->mode = IWNES_MEMREG_TYPE_FMEM;
-		ibmr = &nesmr->ibmr;
-	} else {
-		kfree(nesmr);
-		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	nesmr->pages = pci_alloc_consistent(nesdev->pcidev,
-					    max_num_sg * sizeof(u64),
-					    &nesmr->paddr);
-	if (!nesmr->paddr)
-		goto err;
-
-	nesmr->max_pages = max_num_sg;
-
-	return ibmr;
-
-err:
-	nes_dereg_mr(ibmr);
-
-	return ERR_PTR(-ENOMEM);
-}
-
-static int nes_set_page(struct ib_mr *ibmr, u64 addr)
-{
-	struct nes_mr *nesmr = to_nesmr(ibmr);
-
-	if (unlikely(nesmr->npages == nesmr->max_pages))
-		return -ENOMEM;
-
-	nesmr->pages[nesmr->npages++] = cpu_to_le64(addr);
-
-	return 0;
-}
-
-static int nes_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
-			 int sg_nents, unsigned int *sg_offset)
-{
-	struct nes_mr *nesmr = to_nesmr(ibmr);
-
-	nesmr->npages = 0;
-
-	return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, nes_set_page);
-}
-
-/**
- * nes_query_device
- */
-static int nes_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
-			    struct ib_udata *uhw)
-{
-	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_ib_device *nesibdev = nesvnic->nesibdev;
-
-	if (uhw->inlen || uhw->outlen)
-		return -EINVAL;
-
-	memset(props, 0, sizeof(*props));
-	memcpy(&props->sys_image_guid, nesvnic->netdev->dev_addr, 6);
-
-	props->fw_ver = nesdev->nesadapter->firmware_version;
-	props->device_cap_flags = nesdev->nesadapter->device_cap_flags;
-	props->vendor_id = nesdev->nesadapter->vendor_id;
-	props->vendor_part_id = nesdev->nesadapter->vendor_part_id;
-	props->hw_ver = nesdev->nesadapter->hw_rev;
-	props->max_mr_size = 0x80000000;
-	props->max_qp = nesibdev->max_qp;
-	props->max_qp_wr = nesdev->nesadapter->max_qp_wr - 2;
-	props->max_send_sge = nesdev->nesadapter->max_sge;
-	props->max_recv_sge = nesdev->nesadapter->max_sge;
-	props->max_cq = nesibdev->max_cq;
-	props->max_cqe = nesdev->nesadapter->max_cqe;
-	props->max_mr = nesibdev->max_mr;
-	props->max_mw = nesibdev->max_mr;
-	props->max_pd = nesibdev->max_pd;
-	props->max_sge_rd = 1;
-	switch (nesdev->nesadapter->max_irrq_wr) {
-		case 0:
-			props->max_qp_rd_atom = 2;
-			break;
-		case 1:
-			props->max_qp_rd_atom = 8;
-			break;
-		case 2:
-			props->max_qp_rd_atom = 32;
-			break;
-		case 3:
-			props->max_qp_rd_atom = 64;
-			break;
-		default:
-			props->max_qp_rd_atom = 0;
-	}
-	props->max_qp_init_rd_atom = props->max_qp_rd_atom;
-	props->atomic_cap = IB_ATOMIC_NONE;
-	props->max_map_per_fmr = 1;
-
-	return 0;
-}
-
-
-/**
- * nes_query_port
- */
-static int nes_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props)
-{
-	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
-	struct net_device *netdev = nesvnic->netdev;
-
-	/* props being zeroed by the caller, avoid zeroing it here */
-
-	props->max_mtu = IB_MTU_4096;
-	props->active_mtu = ib_mtu_int_to_enum(netdev->mtu);
-
-	props->lid = 1;
-	if (netif_queue_stopped(netdev))
-		props->state = IB_PORT_DOWN;
-	else if (nesvnic->linkup)
-		props->state = IB_PORT_ACTIVE;
-	else
-		props->state = IB_PORT_DOWN;
-	props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP |
-			IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
-	props->gid_tbl_len = 1;
-	props->pkey_tbl_len = 1;
-	props->active_width = IB_WIDTH_4X;
-	props->active_speed = IB_SPEED_SDR;
-	props->max_msg_sz = 0x80000000;
-
-	return 0;
-}
-
-/**
- * nes_query_pkey
- */
-static int nes_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
-{
-	*pkey = 0;
-	return 0;
-}
-
-
-/**
- * nes_query_gid
- */
-static int nes_query_gid(struct ib_device *ibdev, u8 port,
-		int index, union ib_gid *gid)
-{
-	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
-
-	memset(&(gid->raw[0]), 0, sizeof(gid->raw));
-	memcpy(&(gid->raw[0]), nesvnic->netdev->dev_addr, 6);
-
-	return 0;
-}
-
-
-/**
- * nes_alloc_ucontext - Allocate the user context data structure. This keeps track
- * of all objects associated with a particular user-mode client.
- */
-static struct ib_ucontext *nes_alloc_ucontext(struct ib_device *ibdev,
-		struct ib_udata *udata)
-{
-	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct nes_alloc_ucontext_req req;
-	struct nes_alloc_ucontext_resp uresp;
-	struct nes_ucontext *nes_ucontext;
-	struct nes_ib_device *nesibdev = nesvnic->nesibdev;
-
-
-	if (ib_copy_from_udata(&req, udata, sizeof(struct nes_alloc_ucontext_req))) {
-		printk(KERN_ERR PFX "Invalid structure size on allocate user context.\n");
-		return ERR_PTR(-EINVAL);
-	}
-
-	if (req.userspace_ver != NES_ABI_USERSPACE_VER) {
-		printk(KERN_ERR PFX "Invalid userspace driver version detected. Detected version %d, should be %d\n",
-			req.userspace_ver, NES_ABI_USERSPACE_VER);
-		return ERR_PTR(-EINVAL);
-	}
-
-
-	memset(&uresp, 0, sizeof uresp);
-
-	uresp.max_qps = nesibdev->max_qp;
-	uresp.max_pds = nesibdev->max_pd;
-	uresp.wq_size = nesdev->nesadapter->max_qp_wr * 2;
-	uresp.virtwq = nesadapter->virtwq;
-	uresp.kernel_ver = NES_ABI_KERNEL_VER;
-
-	nes_ucontext = kzalloc(sizeof *nes_ucontext, GFP_KERNEL);
-	if (!nes_ucontext)
-		return ERR_PTR(-ENOMEM);
-
-	nes_ucontext->nesdev = nesdev;
-	nes_ucontext->mmap_wq_offset = uresp.max_pds;
-	nes_ucontext->mmap_cq_offset = nes_ucontext->mmap_wq_offset +
-			((sizeof(struct nes_hw_qp_wqe) * uresp.max_qps * 2) + PAGE_SIZE-1) /
-			PAGE_SIZE;
-
-
-	if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) {
-		kfree(nes_ucontext);
-		return ERR_PTR(-EFAULT);
-	}
-
-	INIT_LIST_HEAD(&nes_ucontext->cq_reg_mem_list);
-	INIT_LIST_HEAD(&nes_ucontext->qp_reg_mem_list);
-	atomic_set(&nes_ucontext->usecnt, 1);
-	return &nes_ucontext->ibucontext;
-}
-
-
-/**
- * nes_dealloc_ucontext
- */
-static int nes_dealloc_ucontext(struct ib_ucontext *context)
-{
-	/* struct nes_vnic *nesvnic = to_nesvnic(context->device); */
-	/* struct nes_device *nesdev = nesvnic->nesdev; */
-	struct nes_ucontext *nes_ucontext = to_nesucontext(context);
-
-	if (!atomic_dec_and_test(&nes_ucontext->usecnt))
-	  return 0;
-	kfree(nes_ucontext);
-	return 0;
-}
-
-
-/**
- * nes_mmap
- */
-static int nes_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
-{
-	unsigned long index;
-	struct nes_vnic *nesvnic = to_nesvnic(context->device);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	/* struct nes_adapter *nesadapter = nesdev->nesadapter; */
-	struct nes_ucontext *nes_ucontext;
-	struct nes_qp *nesqp;
-
-	nes_ucontext = to_nesucontext(context);
-
-
-	if (vma->vm_pgoff >= nes_ucontext->mmap_wq_offset) {
-		index = (vma->vm_pgoff - nes_ucontext->mmap_wq_offset) * PAGE_SIZE;
-		index /= ((sizeof(struct nes_hw_qp_wqe) * nesdev->nesadapter->max_qp_wr * 2) +
-				PAGE_SIZE-1) & (~(PAGE_SIZE-1));
-		if (!test_bit(index, nes_ucontext->allocated_wqs)) {
-			nes_debug(NES_DBG_MMAP, "wq %lu not allocated\n", index);
-			return -EFAULT;
-		}
-		nesqp = nes_ucontext->mmap_nesqp[index];
-		if (nesqp == NULL) {
-			nes_debug(NES_DBG_MMAP, "wq %lu has a NULL QP base.\n", index);
-			return -EFAULT;
-		}
-		if (remap_pfn_range(vma, vma->vm_start,
-				virt_to_phys(nesqp->hwqp.sq_vbase) >> PAGE_SHIFT,
-				vma->vm_end - vma->vm_start,
-				vma->vm_page_prot)) {
-			nes_debug(NES_DBG_MMAP, "remap_pfn_range failed.\n");
-			return -EAGAIN;
-		}
-		vma->vm_private_data = nesqp;
-		return 0;
-	} else {
-		index = vma->vm_pgoff;
-		if (!test_bit(index, nes_ucontext->allocated_doorbells))
-			return -EFAULT;
-
-		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-		if (io_remap_pfn_range(vma, vma->vm_start,
-				(nesdev->doorbell_start +
-				((nes_ucontext->mmap_db_index[index] - nesdev->base_doorbell_index) * 4096))
-				>> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot))
-			return -EAGAIN;
-		vma->vm_private_data = nes_ucontext;
-		return 0;
-	}
-
-	return -ENOSYS;
-}
-
-
-/**
- * nes_alloc_pd
- */
-static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev,
-		struct ib_ucontext *context, struct ib_udata *udata)
-{
-	struct nes_pd *nespd;
-	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct nes_ucontext *nesucontext;
-	struct nes_alloc_pd_resp uresp;
-	u32 pd_num = 0;
-	int err;
-
-	nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n",
-			nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context,
-			netdev_refcnt_read(nesvnic->netdev));
-
-	err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds,
-			nesadapter->max_pd, &pd_num, &nesadapter->next_pd, NES_RESOURCE_PD);
-	if (err) {
-		return ERR_PTR(err);
-	}
-
-	nespd = kzalloc(sizeof (struct nes_pd), GFP_KERNEL);
-	if (!nespd) {
-		nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	nes_debug(NES_DBG_PD, "Allocating PD (%p) for ib device %s\n",
-			nespd, nesvnic->nesibdev->ibdev.name);
-
-	nespd->pd_id = (pd_num << (PAGE_SHIFT-12)) + nesadapter->base_pd;
-
-	if (context) {
-		nesucontext = to_nesucontext(context);
-		nespd->mmap_db_index = find_next_zero_bit(nesucontext->allocated_doorbells,
-				NES_MAX_USER_DB_REGIONS, nesucontext->first_free_db);
-		nes_debug(NES_DBG_PD, "find_next_zero_bit on doorbells returned %u, mapping pd_id %u.\n",
-				nespd->mmap_db_index, nespd->pd_id);
-		if (nespd->mmap_db_index >= NES_MAX_USER_DB_REGIONS) {
-			nes_debug(NES_DBG_PD, "mmap_db_index > MAX\n");
-			nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
-			kfree(nespd);
-			return ERR_PTR(-ENOMEM);
-		}
-
-		uresp.pd_id = nespd->pd_id;
-		uresp.mmap_db_index = nespd->mmap_db_index;
-		if (ib_copy_to_udata(udata, &uresp, sizeof (struct nes_alloc_pd_resp))) {
-			nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
-			kfree(nespd);
-			return ERR_PTR(-EFAULT);
-		}
-
-		set_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells);
-		nesucontext->mmap_db_index[nespd->mmap_db_index] = nespd->pd_id;
-		nesucontext->first_free_db = nespd->mmap_db_index + 1;
-	}
-
-	nes_debug(NES_DBG_PD, "PD%u structure located @%p.\n", nespd->pd_id, nespd);
-	return &nespd->ibpd;
-}
-
-
-/**
- * nes_dealloc_pd
- */
-static int nes_dealloc_pd(struct ib_pd *ibpd)
-{
-	struct nes_ucontext *nesucontext;
-	struct nes_pd *nespd = to_nespd(ibpd);
-	struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-
-	if ((ibpd->uobject) && (ibpd->uobject->context)) {
-		nesucontext = to_nesucontext(ibpd->uobject->context);
-		nes_debug(NES_DBG_PD, "Clearing bit %u from allocated doorbells\n",
-				nespd->mmap_db_index);
-		clear_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells);
-		nesucontext->mmap_db_index[nespd->mmap_db_index] = 0;
-		if (nesucontext->first_free_db > nespd->mmap_db_index) {
-			nesucontext->first_free_db = nespd->mmap_db_index;
-		}
-	}
-
-	nes_debug(NES_DBG_PD, "Deallocating PD%u structure located @%p.\n",
-			nespd->pd_id, nespd);
-	nes_free_resource(nesadapter, nesadapter->allocated_pds,
-			(nespd->pd_id-nesadapter->base_pd)>>(PAGE_SHIFT-12));
-	kfree(nespd);
-
-	return 0;
-}
-
-
-/**
- * nes_get_encoded_size
- */
-static inline u8 nes_get_encoded_size(int *size)
-{
-	u8 encoded_size = 0;
-	if (*size <= 32) {
-		*size = 32;
-		encoded_size = 1;
-	} else if (*size <= 128) {
-		*size = 128;
-		encoded_size = 2;
-	} else if (*size <= 512) {
-		*size = 512;
-		encoded_size = 3;
-	}
-	return (encoded_size);
-}
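
nes_get_encoded_size() rounds a requested work-queue depth up to the nearest hardware-supported size (32, 128 or 512 WQEs) and returns the 2-bit encoding the QP context expects, with 0 meaning the depth is unsupported. A quick illustrative use; the values follow directly from the function above:

	int sq_size = 100;
	int too_deep = 1000;
	u8 enc;

	enc = nes_get_encoded_size(&sq_size);	/* sq_size is rounded up to 128, enc == 2 */
	enc = nes_get_encoded_size(&too_deep);	/* too_deep is left alone, enc == 0: reject */
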
-
-
-
-/**
- * nes_setup_virt_qp
- */
-static int nes_setup_virt_qp(struct nes_qp *nesqp, struct nes_pbl *nespbl,
-		struct nes_vnic *nesvnic, int sq_size, int rq_size)
-{
-	unsigned long flags;
-	void *mem;
-	__le64 *pbl = NULL;
-	__le64 *tpbl;
-	__le64 *pblbuffer;
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	u32 pbl_entries;
-	u8 rq_pbl_entries;
-	u8 sq_pbl_entries;
-
-	pbl_entries = nespbl->pbl_size >> 3;
-	nes_debug(NES_DBG_QP, "Userspace PBL, pbl_size=%u, pbl_entries = %d pbl_vbase=%p, pbl_pbase=%lx\n",
-			nespbl->pbl_size, pbl_entries,
-			(void *)nespbl->pbl_vbase,
-			(unsigned long) nespbl->pbl_pbase);
-	pbl = (__le64 *) nespbl->pbl_vbase; /* points to first pbl entry */
-	/* now lets set the sq_vbase as well as rq_vbase addrs we will assign */
-	/* the first pbl to be for the rq_vbase... */
-	rq_pbl_entries = (rq_size * sizeof(struct nes_hw_qp_wqe)) >> 12;
-	sq_pbl_entries = (sq_size * sizeof(struct nes_hw_qp_wqe)) >> 12;
-	nesqp->hwqp.sq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32);
-	if (!nespbl->page) {
-		nes_debug(NES_DBG_QP, "QP nespbl->page is NULL \n");
-		kfree(nespbl);
-		return -ENOMEM;
-	}
-
-	nesqp->hwqp.sq_vbase = kmap(nespbl->page);
-	nesqp->page = nespbl->page;
-	if (!nesqp->hwqp.sq_vbase) {
-		nes_debug(NES_DBG_QP, "QP sq_vbase kmap failed\n");
-		kfree(nespbl);
-		return -ENOMEM;
-	}
-
-	/* Now to get to sq.. we need to calculate how many */
-	/* PBL entries were used by the rq.. */
-	pbl += sq_pbl_entries;
-	nesqp->hwqp.rq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32);
-	/* nesqp->hwqp.rq_vbase = bus_to_virt(*pbl); */
-	/*nesqp->hwqp.rq_vbase = phys_to_virt(*pbl); */
-
-	nes_debug(NES_DBG_QP, "QP sq_vbase= %p sq_pbase=%lx rq_vbase=%p rq_pbase=%lx\n",
-		  nesqp->hwqp.sq_vbase, (unsigned long) nesqp->hwqp.sq_pbase,
-		  nesqp->hwqp.rq_vbase, (unsigned long) nesqp->hwqp.rq_pbase);
-	spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-	if (!nesadapter->free_256pbl) {
-		pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
-				nespbl->pbl_pbase);
-		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-		kunmap(nesqp->page);
-		kfree(nespbl);
-		return -ENOMEM;
-	}
-	nesadapter->free_256pbl--;
-	spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-
-	nesqp->pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 256, &nesqp->pbl_pbase);
-	pblbuffer = nesqp->pbl_vbase;
-	if (!nesqp->pbl_vbase) {
-		/* memory allocated during nes_reg_user_mr() */
-		pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
-				    nespbl->pbl_pbase);
-		kfree(nespbl);
-		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-		nesadapter->free_256pbl++;
-		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-		kunmap(nesqp->page);
-		return -ENOMEM;
-	}
-	memset(nesqp->pbl_vbase, 0, 256);
-	/* fill in the page address in the pbl buffer.. */
-	tpbl = pblbuffer + 16;
-	pbl = (__le64 *)nespbl->pbl_vbase;
-	while (sq_pbl_entries--)
-		*tpbl++ = *pbl++;
-	tpbl = pblbuffer;
-	while (rq_pbl_entries--)
-		*tpbl++ = *pbl++;
-
-	/* done with memory allocated during nes_reg_user_mr() */
-	pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
-			    nespbl->pbl_pbase);
-	kfree(nespbl);
-
-	nesqp->qp_mem_size =
-			max((u32)sizeof(struct nes_qp_context), ((u32)256)) + 256;     /* this is Q2 */
-	/* Round up to a multiple of a page */
-	nesqp->qp_mem_size += PAGE_SIZE - 1;
-	nesqp->qp_mem_size &= ~(PAGE_SIZE - 1);
-
-	mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size,
-			&nesqp->hwqp.q2_pbase);
-
-	if (!mem) {
-		pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase);
-		nesqp->pbl_vbase = NULL;
-		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-		nesadapter->free_256pbl++;
-		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-		kunmap(nesqp->page);
-		return -ENOMEM;
-	}
-	nesqp->sq_kmapped = 1;
-	nesqp->hwqp.q2_vbase = mem;
-	mem += 256;
-	memset(nesqp->hwqp.q2_vbase, 0, 256);
-	nesqp->nesqp_context = mem;
-	memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context));
-	nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256;
-
-	return 0;
-}
-
-
-/**
- * nes_setup_mmap_qp
- */
-static int nes_setup_mmap_qp(struct nes_qp *nesqp, struct nes_vnic *nesvnic,
-		int sq_size, int rq_size)
-{
-	void *mem;
-	struct nes_device *nesdev = nesvnic->nesdev;
-
-	nesqp->qp_mem_size = (sizeof(struct nes_hw_qp_wqe) * sq_size) +
-			(sizeof(struct nes_hw_qp_wqe) * rq_size) +
-			max((u32)sizeof(struct nes_qp_context), ((u32)256)) +
-			256; /* this is Q2 */
-	/* Round up to a multiple of a page */
-	nesqp->qp_mem_size += PAGE_SIZE - 1;
-	nesqp->qp_mem_size &= ~(PAGE_SIZE - 1);
-
-	mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size,
-			&nesqp->hwqp.sq_pbase);
-	if (!mem)
-		return -ENOMEM;
-	nes_debug(NES_DBG_QP, "PCI consistent memory for "
-			"host descriptor rings located @ %p (pa = 0x%08lX.) size = %u.\n",
-			mem, (unsigned long)nesqp->hwqp.sq_pbase, nesqp->qp_mem_size);
-
-	memset(mem, 0, nesqp->qp_mem_size);
-
-	nesqp->hwqp.sq_vbase = mem;
-	mem += sizeof(struct nes_hw_qp_wqe) * sq_size;
-
-	nesqp->hwqp.rq_vbase = mem;
-	nesqp->hwqp.rq_pbase = nesqp->hwqp.sq_pbase +
-			sizeof(struct nes_hw_qp_wqe) * sq_size;
-	mem += sizeof(struct nes_hw_qp_wqe) * rq_size;
-
-	nesqp->hwqp.q2_vbase = mem;
-	nesqp->hwqp.q2_pbase = nesqp->hwqp.rq_pbase +
-			sizeof(struct nes_hw_qp_wqe) * rq_size;
-	mem += 256;
-	memset(nesqp->hwqp.q2_vbase, 0, 256);
-
-	nesqp->nesqp_context = mem;
-	nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256;
-	memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context));
-	return 0;
-}
-
-
-/**
- * nes_free_qp_mem() is to free up the qp's pci_alloc_consistent() memory.
- */
-static void nes_free_qp_mem(struct nes_device *nesdev,
-		struct nes_qp *nesqp, int virt_wqs)
-{
-	unsigned long flags;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	if (!virt_wqs) {
-		pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size,
-				nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase);
-	} else {
-		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-		nesadapter->free_256pbl++;
-		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-		pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase);
-		pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase );
-		nesqp->pbl_vbase = NULL;
-		if (nesqp->sq_kmapped) {
-			nesqp->sq_kmapped = 0;
-			kunmap(nesqp->page);
-		}
-	}
-}
-
-
-/**
- * nes_create_qp
- */
-static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
-		struct ib_qp_init_attr *init_attr, struct ib_udata *udata)
-{
-	u64 u64temp= 0;
-	u64 u64nesqp = 0;
-	struct nes_pd *nespd = to_nespd(ibpd);
-	struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct nes_qp *nesqp;
-	struct nes_cq *nescq;
-	struct nes_ucontext *nes_ucontext;
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	struct nes_cqp_request *cqp_request;
-	struct nes_create_qp_req req;
-	struct nes_create_qp_resp uresp;
-	struct nes_pbl  *nespbl = NULL;
-	u32 qp_num = 0;
-	u32 opcode = 0;
-	/* u32 counter = 0; */
-	void *mem;
-	unsigned long flags;
-	int ret;
-	int err;
-	int virt_wqs = 0;
-	int sq_size;
-	int rq_size;
-	u8 sq_encoded_size;
-	u8 rq_encoded_size;
-	/* int counter; */
-
-	if (init_attr->create_flags)
-		return ERR_PTR(-EINVAL);
-
-	atomic_inc(&qps_created);
-	switch (init_attr->qp_type) {
-		case IB_QPT_RC:
-			if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) {
-				init_attr->cap.max_inline_data = 0;
-			} else {
-				init_attr->cap.max_inline_data = 64;
-			}
-			sq_size = init_attr->cap.max_send_wr;
-			rq_size = init_attr->cap.max_recv_wr;
-
-			/* check if the encoded sizes are OK or not... */
-			sq_encoded_size = nes_get_encoded_size(&sq_size);
-			rq_encoded_size = nes_get_encoded_size(&rq_size);
-
-			if ((!sq_encoded_size) || (!rq_encoded_size)) {
-				nes_debug(NES_DBG_QP, "ERROR bad rq (%u) or sq (%u) size\n",
-						rq_size, sq_size);
-				return ERR_PTR(-EINVAL);
-			}
-
-			init_attr->cap.max_send_wr = sq_size -2;
-			init_attr->cap.max_recv_wr = rq_size -1;
-			nes_debug(NES_DBG_QP, "RQ size=%u, SQ Size=%u\n", rq_size, sq_size);
-
-			ret = nes_alloc_resource(nesadapter, nesadapter->allocated_qps,
-					nesadapter->max_qp, &qp_num, &nesadapter->next_qp, NES_RESOURCE_QP);
-			if (ret) {
-				return ERR_PTR(ret);
-			}
-
-			/* Need 512 (actually now 1024) byte alignment on this structure */
-			mem = kzalloc(sizeof(*nesqp)+NES_SW_CONTEXT_ALIGN-1, GFP_KERNEL);
-			if (!mem) {
-				nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-				return ERR_PTR(-ENOMEM);
-			}
-			u64nesqp = (unsigned long)mem;
-			u64nesqp += ((u64)NES_SW_CONTEXT_ALIGN) - 1;
-			u64temp = ((u64)NES_SW_CONTEXT_ALIGN) - 1;
-			u64nesqp &= ~u64temp;
-			nesqp = (struct nes_qp *)(unsigned long)u64nesqp;
-			/* nes_debug(NES_DBG_QP, "nesqp=%p, allocated buffer=%p.  Rounded to closest %u\n",
-					nesqp, mem, NES_SW_CONTEXT_ALIGN); */
-			nesqp->allocated_buffer = mem;
-
-			if (udata) {
-				if (ib_copy_from_udata(&req, udata, sizeof(struct nes_create_qp_req))) {
-					nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-					kfree(nesqp->allocated_buffer);
-					nes_debug(NES_DBG_QP, "ib_copy_from_udata() Failed \n");
-					return ERR_PTR(-EFAULT);
-				}
-				if (req.user_wqe_buffers) {
-					virt_wqs = 1;
-				}
-				if (req.user_qp_buffer)
-					nesqp->nesuqp_addr = req.user_qp_buffer;
-				if ((ibpd->uobject) && (ibpd->uobject->context)) {
-					nesqp->user_mode = 1;
-					nes_ucontext = to_nesucontext(ibpd->uobject->context);
-					if (virt_wqs) {
-						err = 1;
-						list_for_each_entry(nespbl, &nes_ucontext->qp_reg_mem_list, list) {
-							if (nespbl->user_base == (unsigned long )req.user_wqe_buffers) {
-								list_del(&nespbl->list);
-								err = 0;
-								nes_debug(NES_DBG_QP, "Found PBL for virtual QP. nespbl=%p. user_base=0x%lx\n",
-									  nespbl, nespbl->user_base);
-								break;
-							}
-						}
-						if (err) {
-							nes_debug(NES_DBG_QP, "Didn't Find PBL for virtual QP. address = %llx.\n",
-								  (long long unsigned int)req.user_wqe_buffers);
-							nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-							kfree(nesqp->allocated_buffer);
-							return ERR_PTR(-EFAULT);
-						}
-					}
-
-					nes_ucontext = to_nesucontext(ibpd->uobject->context);
-					nesqp->mmap_sq_db_index =
-						find_next_zero_bit(nes_ucontext->allocated_wqs,
-								   NES_MAX_USER_WQ_REGIONS, nes_ucontext->first_free_wq);
-					/* nes_debug(NES_DBG_QP, "find_first_zero_biton wqs returned %u\n",
-							nespd->mmap_db_index); */
-					if (nesqp->mmap_sq_db_index >= NES_MAX_USER_WQ_REGIONS) {
-						nes_debug(NES_DBG_QP,
-							  "db index > max user regions, failing create QP\n");
-						nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-						if (virt_wqs) {
-							pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
-									    nespbl->pbl_pbase);
-							kfree(nespbl);
-						}
-						kfree(nesqp->allocated_buffer);
-						return ERR_PTR(-ENOMEM);
-					}
-					set_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs);
-					nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = nesqp;
-					nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index + 1;
-				} else {
-					nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-					kfree(nesqp->allocated_buffer);
-					return ERR_PTR(-EFAULT);
-				}
-			}
-			err = (!virt_wqs) ? nes_setup_mmap_qp(nesqp, nesvnic, sq_size, rq_size) :
-					nes_setup_virt_qp(nesqp, nespbl, nesvnic, sq_size, rq_size);
-			if (err) {
-				nes_debug(NES_DBG_QP,
-					  "error getting qp mem code = %d\n", err);
-				nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-				kfree(nesqp->allocated_buffer);
-				return ERR_PTR(-ENOMEM);
-			}
-
-			nesqp->hwqp.sq_size = sq_size;
-			nesqp->hwqp.sq_encoded_size = sq_encoded_size;
-			nesqp->hwqp.sq_head = 1;
-			nesqp->hwqp.rq_size = rq_size;
-			nesqp->hwqp.rq_encoded_size = rq_encoded_size;
-			/* nes_debug(NES_DBG_QP, "nesqp->nesqp_context_pbase = %p\n",
-					(void *)nesqp->nesqp_context_pbase);
-			*/
-			nesqp->hwqp.qp_id = qp_num;
-			nesqp->ibqp.qp_num = nesqp->hwqp.qp_id;
-			nesqp->nespd = nespd;
-
-			nescq = to_nescq(init_attr->send_cq);
-			nesqp->nesscq = nescq;
-			nescq = to_nescq(init_attr->recv_cq);
-			nesqp->nesrcq = nescq;
-
-			nesqp->nesqp_context->misc |= cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) <<
-					NES_QPCONTEXT_MISC_PCI_FCN_SHIFT);
-			nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.rq_encoded_size <<
-					NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT);
-			nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.sq_encoded_size <<
-					NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT);
-			if (!udata) {
-				nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_PRIV_EN);
-				nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_FAST_REGISTER_EN);
-			}
-			nesqp->nesqp_context->cqs = cpu_to_le32(nesqp->nesscq->hw_cq.cq_number +
-					((u32)nesqp->nesrcq->hw_cq.cq_number << 16));
-			u64temp = (u64)nesqp->hwqp.sq_pbase;
-			nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp);
-			nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
-
-
-			if (!virt_wqs) {
-				u64temp = (u64)nesqp->hwqp.sq_pbase;
-				nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp);
-				nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
-				u64temp = (u64)nesqp->hwqp.rq_pbase;
-				nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp);
-				nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
-			} else {
-				u64temp = (u64)nesqp->pbl_pbase;
-				nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp);
-				nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
-			}
-
-			/* nes_debug(NES_DBG_QP, "next_qp_nic_index=%u, using nic_index=%d\n",
-					nesvnic->next_qp_nic_index,
-					nesvnic->qp_nic_index[nesvnic->next_qp_nic_index]); */
-			spin_lock_irqsave(&nesdev->cqp.lock, flags);
-			nesqp->nesqp_context->misc2 |= cpu_to_le32(
-					(u32)nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] <<
-					NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT);
-			nesvnic->next_qp_nic_index++;
-			if ((nesvnic->next_qp_nic_index > 3) ||
-					(nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] == 0xf)) {
-				nesvnic->next_qp_nic_index = 0;
-			}
-			spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-
-			nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32((u32)nesqp->nespd->pd_id << 16);
-			u64temp = (u64)nesqp->hwqp.q2_pbase;
-			nesqp->nesqp_context->q2_addr_low = cpu_to_le32((u32)u64temp);
-			nesqp->nesqp_context->q2_addr_high = cpu_to_le32((u32)(u64temp >> 32));
-			nesqp->nesqp_context->aeq_token_low =  cpu_to_le32((u32)((unsigned long)(nesqp)));
-			nesqp->nesqp_context->aeq_token_high =  cpu_to_le32((u32)(upper_32_bits((unsigned long)(nesqp))));
-			nesqp->nesqp_context->ird_ord_sizes = cpu_to_le32(NES_QPCONTEXT_ORDIRD_ALSMM |
-					NES_QPCONTEXT_ORDIRD_AAH |
-					((((u32)nesadapter->max_irrq_wr) <<
-					NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT) & NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK));
-			if (disable_mpa_crc) {
-				nes_debug(NES_DBG_QP, "Disabling MPA crc checking due to module option.\n");
-				nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(NES_QPCONTEXT_ORDIRD_RNMC);
-			}
-
-
-			/* Create the QP */
-			cqp_request = nes_get_cqp_request(nesdev);
-			if (cqp_request == NULL) {
-				nes_debug(NES_DBG_QP, "Failed to get a cqp_request\n");
-				nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-				nes_free_qp_mem(nesdev, nesqp,virt_wqs);
-				kfree(nesqp->allocated_buffer);
-				return ERR_PTR(-ENOMEM);
-			}
-			cqp_request->waiting = 1;
-			cqp_wqe = &cqp_request->cqp_wqe;
-
-			if (!virt_wqs) {
-				opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP |
-					NES_CQP_QP_IWARP_STATE_IDLE;
-			} else {
-				opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_VIRT_WQS |
-					NES_CQP_QP_IWARP_STATE_IDLE;
-			}
-			opcode |= NES_CQP_QP_CQS_VALID;
-			nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-			set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
-			set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
-
-			u64temp = (u64)nesqp->nesqp_context_pbase;
-			set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
-
-			atomic_set(&cqp_request->refcount, 2);
-			nes_post_cqp_request(nesdev, cqp_request);
-
-			/* Wait for CQP */
-			nes_debug(NES_DBG_QP, "Waiting for create iWARP QP%u to complete.\n",
-					nesqp->hwqp.qp_id);
-			ret = wait_event_timeout(cqp_request->waitq,
-					(cqp_request->request_done != 0), NES_EVENT_TIMEOUT);
-			nes_debug(NES_DBG_QP, "Create iwarp QP%u completed, wait_event_timeout ret=%u,"
-					" nesdev->cqp_head = %u, nesdev->cqp.sq_tail = %u,"
-					" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
-					nesqp->hwqp.qp_id, ret, nesdev->cqp.sq_head, nesdev->cqp.sq_tail,
-					cqp_request->major_code, cqp_request->minor_code);
-			if ((!ret) || (cqp_request->major_code)) {
-				nes_put_cqp_request(nesdev, cqp_request);
-				nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-				nes_free_qp_mem(nesdev, nesqp,virt_wqs);
-				kfree(nesqp->allocated_buffer);
-				if (!ret) {
-					return ERR_PTR(-ETIME);
-				} else {
-					return ERR_PTR(-EIO);
-				}
-			}
-
-			nes_put_cqp_request(nesdev, cqp_request);
-
-			if (ibpd->uobject) {
-				uresp.mmap_sq_db_index = nesqp->mmap_sq_db_index;
-				uresp.mmap_rq_db_index = 0;
-				uresp.actual_sq_size = sq_size;
-				uresp.actual_rq_size = rq_size;
-				uresp.qp_id = nesqp->hwqp.qp_id;
-				uresp.nes_drv_opt = nes_drv_opt;
-				if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) {
-					nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-					nes_free_qp_mem(nesdev, nesqp,virt_wqs);
-					kfree(nesqp->allocated_buffer);
-					return ERR_PTR(-EFAULT);
-				}
-			}
-
-			nes_debug(NES_DBG_QP, "QP%u structure located @%p.Size = %u.\n",
-					nesqp->hwqp.qp_id, nesqp, (u32)sizeof(*nesqp));
-			spin_lock_init(&nesqp->lock);
-			nes_add_ref(&nesqp->ibqp);
-			break;
-		default:
-			nes_debug(NES_DBG_QP, "Invalid QP type: %d\n", init_attr->qp_type);
-			return ERR_PTR(-EINVAL);
-	}
-	init_completion(&nesqp->sq_drained);
-	init_completion(&nesqp->rq_drained);
-
-	nesqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR);
-	timer_setup(&nesqp->terminate_timer, nes_terminate_timeout, 0);
-
-	/* update the QP table */
-	nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp;
-	nes_debug(NES_DBG_QP, "netdev refcnt=%u\n",
-			netdev_refcnt_read(nesvnic->netdev));
-
-	return &nesqp->ibqp;
-}
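
One detail of the create path above is easy to miss: struct nes_qp must land on a NES_SW_CONTEXT_ALIGN boundary because the CQ cleanup path (nes_clean_cq() below) masks those low address bits out of the completion context before comparing, so the driver over-allocates with kzalloc() and rounds the pointer up by hand, keeping the raw pointer in allocated_buffer for the eventual kfree(). A stripped-down sketch of the same idiom, assuming nothing beyond what the function above already uses:

	void *raw = kzalloc(sizeof(struct nes_qp) + NES_SW_CONTEXT_ALIGN - 1, GFP_KERNEL);
	unsigned long aligned;
	struct nes_qp *qp;

	if (!raw)
		return ERR_PTR(-ENOMEM);

	/* round up to the next NES_SW_CONTEXT_ALIGN boundary */
	aligned = ((unsigned long)raw + NES_SW_CONTEXT_ALIGN - 1) &
		  ~((unsigned long)(NES_SW_CONTEXT_ALIGN - 1));
	qp = (struct nes_qp *)aligned;
	qp->allocated_buffer = raw;	/* what kfree() is eventually called on */
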
-
-/**
- * nes_clean_cq
- */
-static void nes_clean_cq(struct nes_qp *nesqp, struct nes_cq *nescq)
-{
-	u32 cq_head;
-	u32 lo;
-	u32 hi;
-	u64 u64temp;
-	unsigned long flags = 0;
-
-	spin_lock_irqsave(&nescq->lock, flags);
-
-	cq_head = nescq->hw_cq.cq_head;
-	while (le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_VALID) {
-		rmb();
-		lo = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]);
-		hi = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]);
-		u64temp = (((u64)hi) << 32) | ((u64)lo);
-		u64temp &= ~(NES_SW_CONTEXT_ALIGN-1);
-		if (u64temp == (u64)(unsigned long)nesqp) {
-			/* Zero the context value so cqe will be ignored */
-			nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] = 0;
-			nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX] = 0;
-		}
-
-		if (++cq_head >= nescq->hw_cq.cq_size)
-			cq_head = 0;
-	}
-
-	spin_unlock_irqrestore(&nescq->lock, flags);
-}
-
-
-/**
- * nes_destroy_qp
- */
-static int nes_destroy_qp(struct ib_qp *ibqp)
-{
-	struct nes_qp *nesqp = to_nesqp(ibqp);
-	struct nes_ucontext *nes_ucontext;
-	struct ib_qp_attr attr;
-	struct iw_cm_id *cm_id;
-	struct iw_cm_event cm_event;
-	int ret = 0;
-
-	atomic_inc(&sw_qps_destroyed);
-	nesqp->destroyed = 1;
-
-	/* Blow away the connection if it exists. */
-	if (nesqp->ibqp_state >= IB_QPS_INIT && nesqp->ibqp_state <= IB_QPS_RTS) {
-		/* if (nesqp->ibqp_state == IB_QPS_RTS) { */
-		attr.qp_state = IB_QPS_ERR;
-		nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL);
-	}
-
-	if (((nesqp->ibqp_state == IB_QPS_INIT) ||
-			(nesqp->ibqp_state == IB_QPS_RTR)) && (nesqp->cm_id)) {
-		cm_id = nesqp->cm_id;
-		cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
-		cm_event.status = -ETIMEDOUT;
-		cm_event.local_addr = cm_id->local_addr;
-		cm_event.remote_addr = cm_id->remote_addr;
-		cm_event.private_data = NULL;
-		cm_event.private_data_len = 0;
-
-		nes_debug(NES_DBG_QP, "Generating a CM Timeout Event for "
-				"QP%u. cm_id = %p, refcount = %u. \n",
-				nesqp->hwqp.qp_id, cm_id, atomic_read(&nesqp->refcount));
-
-		cm_id->rem_ref(cm_id);
-		ret = cm_id->event_handler(cm_id, &cm_event);
-		if (ret)
-			nes_debug(NES_DBG_QP, "OFA CM event_handler returned, ret=%d\n", ret);
-	}
-
-	if (nesqp->user_mode) {
-		if ((ibqp->uobject)&&(ibqp->uobject->context)) {
-			nes_ucontext = to_nesucontext(ibqp->uobject->context);
-			clear_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs);
-			nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = NULL;
-			if (nes_ucontext->first_free_wq > nesqp->mmap_sq_db_index) {
-				nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index;
-			}
-		}
-		if (nesqp->pbl_pbase && nesqp->sq_kmapped) {
-			nesqp->sq_kmapped = 0;
-			kunmap(nesqp->page);
-		}
-	} else {
-		/* Clean any pending completions from the cq(s) */
-		if (nesqp->nesscq)
-			nes_clean_cq(nesqp, nesqp->nesscq);
-
-		if ((nesqp->nesrcq) && (nesqp->nesrcq != nesqp->nesscq))
-			nes_clean_cq(nesqp, nesqp->nesrcq);
-	}
-	nes_rem_ref(&nesqp->ibqp);
-	return 0;
-}
-
-
-/**
- * nes_create_cq
- */
-static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
-				   const struct ib_cq_init_attr *attr,
-				   struct ib_ucontext *context,
-				   struct ib_udata *udata)
-{
-	int entries = attr->cqe;
-	u64 u64temp;
-	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct nes_cq *nescq;
-	struct nes_ucontext *nes_ucontext = NULL;
-	struct nes_cqp_request *cqp_request;
-	void *mem = NULL;
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	struct nes_pbl *nespbl = NULL;
-	struct nes_create_cq_req req;
-	struct nes_create_cq_resp resp;
-	u32 cq_num = 0;
-	u32 opcode = 0;
-	u32 pbl_entries = 1;
-	int err;
-	unsigned long flags;
-	int ret;
-
-	if (attr->flags)
-		return ERR_PTR(-EINVAL);
-
-	if (entries > nesadapter->max_cqe)
-		return ERR_PTR(-EINVAL);
-
-	err = nes_alloc_resource(nesadapter, nesadapter->allocated_cqs,
-			nesadapter->max_cq, &cq_num, &nesadapter->next_cq, NES_RESOURCE_CQ);
-	if (err) {
-		return ERR_PTR(err);
-	}
-
-	nescq = kzalloc(sizeof(struct nes_cq), GFP_KERNEL);
-	if (!nescq) {
-		nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	nescq->hw_cq.cq_size = max(entries + 1, 5);
-	nescq->hw_cq.cq_number = cq_num;
-	nescq->ibcq.cqe = nescq->hw_cq.cq_size - 1;
-
-
-	if (context) {
-		nes_ucontext = to_nesucontext(context);
-		if (ib_copy_from_udata(&req, udata, sizeof (struct nes_create_cq_req))) {
-			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-			kfree(nescq);
-			return ERR_PTR(-EFAULT);
-		}
-		nesvnic->mcrq_ucontext = nes_ucontext;
-		nes_ucontext->mcrqf = req.mcrqf;
-		if (nes_ucontext->mcrqf) {
-			if (nes_ucontext->mcrqf & 0x80000000)
-				nescq->hw_cq.cq_number = nesvnic->nic.qp_id + 28 + 2 * ((nes_ucontext->mcrqf & 0xf) - 1);
-			else if (nes_ucontext->mcrqf & 0x40000000)
-				nescq->hw_cq.cq_number = nes_ucontext->mcrqf & 0xffff;
-			else
-				nescq->hw_cq.cq_number = nesvnic->mcrq_qp_id + nes_ucontext->mcrqf-1;
-			nescq->mcrqf = nes_ucontext->mcrqf;
-			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-		}
-		nes_debug(NES_DBG_CQ, "CQ Virtual Address = %08lX, size = %u.\n",
-				(unsigned long)req.user_cq_buffer, entries);
-		err = 1;
-		list_for_each_entry(nespbl, &nes_ucontext->cq_reg_mem_list, list) {
-			if (nespbl->user_base == (unsigned long )req.user_cq_buffer) {
-				list_del(&nespbl->list);
-				err = 0;
-				nes_debug(NES_DBG_CQ, "Found PBL for virtual CQ. nespbl=%p.\n",
-						nespbl);
-				break;
-			}
-		}
-		if (err) {
-			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-			kfree(nescq);
-			return ERR_PTR(-EFAULT);
-		}
-
-		pbl_entries = nespbl->pbl_size >> 3;
-		nescq->cq_mem_size = 0;
-	} else {
-		nescq->cq_mem_size = nescq->hw_cq.cq_size * sizeof(struct nes_hw_cqe);
-		nes_debug(NES_DBG_CQ, "Attempting to allocate pci memory (%u entries, %u bytes) for CQ%u.\n",
-				entries, nescq->cq_mem_size, nescq->hw_cq.cq_number);
-
-		/* allocate the physical buffer space */
-		mem = pci_zalloc_consistent(nesdev->pcidev, nescq->cq_mem_size,
-					    &nescq->hw_cq.cq_pbase);
-		if (!mem) {
-			printk(KERN_ERR PFX "Unable to allocate pci memory for cq\n");
-			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-			kfree(nescq);
-			return ERR_PTR(-ENOMEM);
-		}
-
-		nescq->hw_cq.cq_vbase = mem;
-		nescq->hw_cq.cq_head = 0;
-		nes_debug(NES_DBG_CQ, "CQ%u virtual address @ %p, phys = 0x%08X\n",
-				nescq->hw_cq.cq_number, nescq->hw_cq.cq_vbase,
-				(u32)nescq->hw_cq.cq_pbase);
-	}
-
-	nescq->hw_cq.ce_handler = nes_iwarp_ce_handler;
-	spin_lock_init(&nescq->lock);
-
-	/* send CreateCQ request to CQP */
-	cqp_request = nes_get_cqp_request(nesdev);
-	if (cqp_request == NULL) {
-		nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n");
-		if (!context)
-			pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
-					nescq->hw_cq.cq_pbase);
-		else {
-			pci_free_consistent(nesdev->pcidev, nespbl->pbl_size,
-					    nespbl->pbl_vbase, nespbl->pbl_pbase);
-			kfree(nespbl);
-		}
-
-		nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-		kfree(nescq);
-		return ERR_PTR(-ENOMEM);
-	}
-	cqp_request->waiting = 1;
-	cqp_wqe = &cqp_request->cqp_wqe;
-
-	opcode = NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
-			NES_CQP_CQ_CHK_OVERFLOW |
-			NES_CQP_CQ_CEQE_MASK | ((u32)nescq->hw_cq.cq_size << 16);
-
-	spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-
-	if (pbl_entries != 1) {
-		if (pbl_entries > 32) {
-			/* use 4k pbl */
-			nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 4k PBL\n", pbl_entries);
-			if (nesadapter->free_4kpbl == 0) {
-				spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-				nes_free_cqp_request(nesdev, cqp_request);
-				if (!context)
-					pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
-							nescq->hw_cq.cq_pbase);
-				else {
-					pci_free_consistent(nesdev->pcidev, nespbl->pbl_size,
-							    nespbl->pbl_vbase, nespbl->pbl_pbase);
-					kfree(nespbl);
-				}
-				nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-				kfree(nescq);
-				return ERR_PTR(-ENOMEM);
-			} else {
-				opcode |= (NES_CQP_CQ_VIRT | NES_CQP_CQ_4KB_CHUNK);
-				nescq->virtual_cq = 2;
-				nesadapter->free_4kpbl--;
-			}
-		} else {
-			/* use 256 byte pbl */
-			nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 256 byte PBL\n", pbl_entries);
-			if (nesadapter->free_256pbl == 0) {
-				spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-				nes_free_cqp_request(nesdev, cqp_request);
-				if (!context)
-					pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
-							nescq->hw_cq.cq_pbase);
-				else {
-					pci_free_consistent(nesdev->pcidev, nespbl->pbl_size,
-							    nespbl->pbl_vbase, nespbl->pbl_pbase);
-					kfree(nespbl);
-				}
-				nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-				kfree(nescq);
-				return ERR_PTR(-ENOMEM);
-			} else {
-				opcode |= NES_CQP_CQ_VIRT;
-				nescq->virtual_cq = 1;
-				nesadapter->free_256pbl--;
-			}
-		}
-	}
-
-	spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
-			(nescq->hw_cq.cq_number | ((u32)nesdev->ceq_index << 16)));
-
-	if (context) {
-		if (pbl_entries != 1)
-			u64temp = (u64)nespbl->pbl_pbase;
-		else
-			u64temp	= le64_to_cpu(nespbl->pbl_vbase[0]);
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX,
-				nes_ucontext->mmap_db_index[0]);
-	} else {
-		u64temp = (u64)nescq->hw_cq.cq_pbase;
-		cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
-	}
-	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
-	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0;
-	u64temp = (u64)(unsigned long)&nescq->hw_cq;
-	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] =
-			cpu_to_le32((u32)(u64temp >> 1));
-	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
-			cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
-
-	atomic_set(&cqp_request->refcount, 2);
-	nes_post_cqp_request(nesdev, cqp_request);
-
-	/* Wait for CQP */
-	nes_debug(NES_DBG_CQ, "Waiting for create iWARP CQ%u to complete.\n",
-			nescq->hw_cq.cq_number);
-	ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
-			NES_EVENT_TIMEOUT * 2);
-	nes_debug(NES_DBG_CQ, "Create iWARP CQ%u completed, wait_event_timeout ret = %d.\n",
-			nescq->hw_cq.cq_number, ret);
-	if ((!ret) || (cqp_request->major_code)) {
-		nes_put_cqp_request(nesdev, cqp_request);
-		if (!context)
-			pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
-					nescq->hw_cq.cq_pbase);
-		else {
-			pci_free_consistent(nesdev->pcidev, nespbl->pbl_size,
-					    nespbl->pbl_vbase, nespbl->pbl_pbase);
-			kfree(nespbl);
-		}
-		nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-		kfree(nescq);
-		return ERR_PTR(-EIO);
-	}
-	nes_put_cqp_request(nesdev, cqp_request);
-
-	if (context) {
-		/* free the nespbl */
-		pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
-				nespbl->pbl_pbase);
-		kfree(nespbl);
-		resp.cq_id = nescq->hw_cq.cq_number;
-		resp.cq_size = nescq->hw_cq.cq_size;
-		resp.mmap_db_index = 0;
-		if (ib_copy_to_udata(udata, &resp, sizeof resp - sizeof resp.reserved)) {
-			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-			kfree(nescq);
-			return ERR_PTR(-EFAULT);
-		}
-	}
-
-	return &nescq->ibcq;
-}
-
-
-/**
- * nes_destroy_cq
- */
-static int nes_destroy_cq(struct ib_cq *ib_cq)
-{
-	struct nes_cq *nescq;
-	struct nes_device *nesdev;
-	struct nes_vnic *nesvnic;
-	struct nes_adapter *nesadapter;
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	struct nes_cqp_request *cqp_request;
-	unsigned long flags;
-	u32 opcode = 0;
-	int ret;
-
-	if (ib_cq == NULL)
-		return 0;
-
-	nescq = to_nescq(ib_cq);
-	nesvnic = to_nesvnic(ib_cq->device);
-	nesdev = nesvnic->nesdev;
-	nesadapter = nesdev->nesadapter;
-
-	nes_debug(NES_DBG_CQ, "Destroy CQ%u\n", nescq->hw_cq.cq_number);
-
-	/* Send DestroyCQ request to CQP */
-	cqp_request = nes_get_cqp_request(nesdev);
-	if (cqp_request == NULL) {
-		nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n");
-		return -ENOMEM;
-	}
-	cqp_request->waiting = 1;
-	cqp_wqe = &cqp_request->cqp_wqe;
-	opcode = NES_CQP_DESTROY_CQ | (nescq->hw_cq.cq_size << 16);
-	spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-	if (nescq->virtual_cq == 1) {
-		nesadapter->free_256pbl++;
-		if (nesadapter->free_256pbl > nesadapter->max_256pbl) {
-			printk(KERN_ERR PFX "%s: free 256B PBLs(%u) has exceeded the max(%u)\n",
-					__func__, nesadapter->free_256pbl, nesadapter->max_256pbl);
-		}
-	} else if (nescq->virtual_cq == 2) {
-		nesadapter->free_4kpbl++;
-		if (nesadapter->free_4kpbl > nesadapter->max_4kpbl) {
-			printk(KERN_ERR PFX "%s: free 4K PBLs(%u) has exceeded the max(%u)\n",
-					__func__, nesadapter->free_4kpbl, nesadapter->max_4kpbl);
-		}
-		opcode |= NES_CQP_CQ_4KB_CHUNK;
-	}
-
-	spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
-		(nescq->hw_cq.cq_number | ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 16)));
-	if (!nescq->mcrqf)
-		nes_free_resource(nesadapter, nesadapter->allocated_cqs, nescq->hw_cq.cq_number);
-
-	atomic_set(&cqp_request->refcount, 2);
-	nes_post_cqp_request(nesdev, cqp_request);
-
-	/* Wait for CQP */
-	nes_debug(NES_DBG_CQ, "Waiting for destroy iWARP CQ%u to complete.\n",
-			nescq->hw_cq.cq_number);
-	ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
-			NES_EVENT_TIMEOUT);
-	nes_debug(NES_DBG_CQ, "Destroy iWARP CQ%u completed, wait_event_timeout ret = %u,"
-			" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
-			nescq->hw_cq.cq_number, ret, cqp_request->major_code,
-			cqp_request->minor_code);
-	if (!ret) {
-		nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy timeout expired\n",
-					nescq->hw_cq.cq_number);
-		ret = -ETIME;
-	} else if (cqp_request->major_code) {
-		nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy failed\n",
-					nescq->hw_cq.cq_number);
-		ret = -EIO;
-	} else {
-		ret = 0;
-	}
-	nes_put_cqp_request(nesdev, cqp_request);
-
-	if (nescq->cq_mem_size)
-		pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size,
-				    nescq->hw_cq.cq_vbase, nescq->hw_cq.cq_pbase);
-	kfree(nescq);
-
-	return ret;
-}
-
-/**
- * root_256
- */
-static u32 root_256(struct nes_device *nesdev,
-		    struct nes_root_vpbl *root_vpbl,
-		    struct nes_root_vpbl *new_root,
-		    u16 pbl_count_4k)
-{
-	u64 leaf_pbl;
-	int i, j, k;
-
-	if (pbl_count_4k == 1) {
-		new_root->pbl_vbase = pci_alloc_consistent(nesdev->pcidev,
-						512, &new_root->pbl_pbase);
-
-		if (new_root->pbl_vbase == NULL)
-			return 0;
-
-		leaf_pbl = (u64)root_vpbl->pbl_pbase;
-		for (i = 0; i < 16; i++) {
-			new_root->pbl_vbase[i].pa_low =
-				cpu_to_le32((u32)leaf_pbl);
-			new_root->pbl_vbase[i].pa_high =
-				cpu_to_le32((u32)((((u64)leaf_pbl) >> 32)));
-			leaf_pbl += 256;
-		}
-	} else {
-		for (i = 3; i >= 0; i--) {
-			j = i * 16;
-			root_vpbl->pbl_vbase[j] = root_vpbl->pbl_vbase[i];
-			leaf_pbl = le32_to_cpu(root_vpbl->pbl_vbase[j].pa_low) +
-			    (((u64)le32_to_cpu(root_vpbl->pbl_vbase[j].pa_high))
-				<< 32);
-			for (k = 1; k < 16; k++) {
-				leaf_pbl += 256;
-				root_vpbl->pbl_vbase[j + k].pa_low =
-						cpu_to_le32((u32)leaf_pbl);
-				root_vpbl->pbl_vbase[j + k].pa_high =
-				    cpu_to_le32((u32)((((u64)leaf_pbl) >> 32)));
-			}
-		}
-	}
-
-	return 1;
-}
-
-
-/**
- * nes_reg_mr
- */
-static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
-		u32 stag, u64 region_length, struct nes_root_vpbl *root_vpbl,
-		dma_addr_t single_buffer, u16 pbl_count_4k,
-		u16 residual_page_count_4k, int acc, u64 *iova_start,
-		u16 *actual_pbl_cnt, u8 *used_4k_pbls)
-{
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	struct nes_cqp_request *cqp_request;
-	unsigned long flags;
-	int ret;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	uint pg_cnt = 0;
-	u16 pbl_count_256 = 0;
-	u16 pbl_count = 0;
-	u8  use_256_pbls = 0;
-	u8  use_4k_pbls = 0;
-	u16 use_two_level = (pbl_count_4k > 1) ? 1 : 0;
-	struct nes_root_vpbl new_root = { 0, NULL, NULL };
-	u32 opcode = 0;
-	u16 major_code;
-
-	/* Register the region with the adapter */
-	cqp_request = nes_get_cqp_request(nesdev);
-	if (cqp_request == NULL) {
-		nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
-		return -ENOMEM;
-	}
-	cqp_request->waiting = 1;
-	cqp_wqe = &cqp_request->cqp_wqe;
-
-	if (pbl_count_4k) {
-		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-
-		pg_cnt = ((pbl_count_4k - 1) * 512) + residual_page_count_4k;
-		pbl_count_256 = (pg_cnt + 31) / 32;
-		if (pg_cnt <= 32) {
-			if (pbl_count_256 <= nesadapter->free_256pbl)
-				use_256_pbls = 1;
-			else if (pbl_count_4k <= nesadapter->free_4kpbl)
-				use_4k_pbls = 1;
-		} else if (pg_cnt <= 2048) {
-			if (((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) &&
-			    (nesadapter->free_4kpbl > (nesadapter->max_4kpbl >> 1))) {
-				use_4k_pbls = 1;
-			} else if ((pbl_count_256 + 1) <= nesadapter->free_256pbl) {
-				use_256_pbls = 1;
-				use_two_level = 1;
-			} else if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) {
-				use_4k_pbls = 1;
-			}
-		} else {
-			if ((pbl_count_4k + 1) <= nesadapter->free_4kpbl)
-				use_4k_pbls = 1;
-		}
-
-		if (use_256_pbls) {
-			pbl_count = pbl_count_256;
-			nesadapter->free_256pbl -= pbl_count + use_two_level;
-		} else if (use_4k_pbls) {
-			pbl_count =  pbl_count_4k;
-			nesadapter->free_4kpbl -= pbl_count + use_two_level;
-		} else {
-			spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-			nes_debug(NES_DBG_MR, "Out of Pbls\n");
-			nes_free_cqp_request(nesdev, cqp_request);
-			return -ENOMEM;
-		}
-
-		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-	}
-
-	if (use_256_pbls && use_two_level) {
-		if (root_256(nesdev, root_vpbl, &new_root, pbl_count_4k) == 1) {
-			if (new_root.pbl_pbase != 0)
-				root_vpbl = &new_root;
-		} else {
-			spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-			nesadapter->free_256pbl += pbl_count_256 + use_two_level;
-			use_256_pbls = 0;
-
-			if (pbl_count_4k == 1)
-				use_two_level = 0;
-			pbl_count = pbl_count_4k;
-
-			if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) {
-				nesadapter->free_4kpbl -= pbl_count + use_two_level;
-				use_4k_pbls = 1;
-			}
-			spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-
-			if (use_4k_pbls == 0)
-				return -ENOMEM;
-		}
-	}
-
-	opcode = NES_CQP_REGISTER_STAG | NES_CQP_STAG_RIGHTS_LOCAL_READ |
-					NES_CQP_STAG_VA_TO | NES_CQP_STAG_MR;
-	if (acc & IB_ACCESS_LOCAL_WRITE)
-		opcode |= NES_CQP_STAG_RIGHTS_LOCAL_WRITE;
-	if (acc & IB_ACCESS_REMOTE_WRITE)
-		opcode |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_REM_ACC_EN;
-	if (acc & IB_ACCESS_REMOTE_READ)
-		opcode |= NES_CQP_STAG_RIGHTS_REMOTE_READ | NES_CQP_STAG_REM_ACC_EN;
-	if (acc & IB_ACCESS_MW_BIND)
-		opcode |= NES_CQP_STAG_RIGHTS_WINDOW_BIND | NES_CQP_STAG_REM_ACC_EN;
-
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
-	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, *iova_start);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, region_length);
-
-	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] =
-			cpu_to_le32((u32)(region_length >> 8) & 0xff000000);
-	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |=
-			cpu_to_le32(nespd->pd_id & 0x00007fff);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag);
-
-	if (pbl_count == 0) {
-		set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, single_buffer);
-	} else {
-		set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, root_vpbl->pbl_pbase);
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, pbl_count);
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (pg_cnt * 8));
-
-		if (use_4k_pbls)
-			cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE);
-	}
-	barrier();
-
-	atomic_set(&cqp_request->refcount, 2);
-	nes_post_cqp_request(nesdev, cqp_request);
-
-	/* Wait for CQP */
-	ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
-			NES_EVENT_TIMEOUT);
-	nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u,"
-			" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
-			stag, ret, cqp_request->major_code, cqp_request->minor_code);
-	major_code = cqp_request->major_code;
-	nes_put_cqp_request(nesdev, cqp_request);
-
-	if ((!ret || major_code) && pbl_count != 0) {
-		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-		if (use_256_pbls)
-			nesadapter->free_256pbl += pbl_count + use_two_level;
-		else if (use_4k_pbls)
-			nesadapter->free_4kpbl += pbl_count + use_two_level;
-		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-	}
-	if (new_root.pbl_pbase)
-		pci_free_consistent(nesdev->pcidev, 512, new_root.pbl_vbase,
-				    new_root.pbl_pbase);
-
-	if (!ret)
-		return -ETIME;
-	else if (major_code)
-		return -EIO;
-
-	*actual_pbl_cnt = pbl_count + use_two_level;
-	*used_4k_pbls = use_4k_pbls;
-	return 0;
-}
-
-
-/**
- * nes_reg_phys_mr
- */
-struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, u64 addr, u64 size,
-		int acc, u64 *iova_start)
-{
-	u64 region_length;
-	struct nes_pd *nespd = to_nespd(ib_pd);
-	struct nes_vnic *nesvnic = to_nesvnic(ib_pd->device);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct nes_mr *nesmr;
-	struct ib_mr *ibmr;
-	struct nes_vpbl vpbl;
-	struct nes_root_vpbl root_vpbl;
-	u32 stag;
-	unsigned long mask;
-	u32 stag_index = 0;
-	u32 next_stag_index = 0;
-	u32 driver_key = 0;
-	int err = 0;
-	int ret = 0;
-	u16 pbl_count = 0;
-	u8 single_page = 1;
-	u8 stag_key = 0;
-
-	region_length = 0;
-	vpbl.pbl_vbase = NULL;
-	root_vpbl.pbl_vbase = NULL;
-	root_vpbl.pbl_pbase = 0;
-
-	get_random_bytes(&next_stag_index, sizeof(next_stag_index));
-	stag_key = (u8)next_stag_index;
-
-	driver_key = 0;
-
-	next_stag_index >>= 8;
-	next_stag_index %= nesadapter->max_mr;
-
-	if ((addr ^ *iova_start) & ~PAGE_MASK)
-		return ERR_PTR(-EINVAL);
-
-	err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, nesadapter->max_mr,
-			&stag_index, &next_stag_index, NES_RESOURCE_PHYS_MR);
-	if (err) {
-		return ERR_PTR(err);
-	}
-
-	nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
-	if (!nesmr) {
-		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	/* Allocate a 4K buffer for the PBL */
-	vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
-			&vpbl.pbl_pbase);
-	nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%016lX\n",
-			vpbl.pbl_vbase, (unsigned long)vpbl.pbl_pbase);
-	if (!vpbl.pbl_vbase) {
-		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-		ibmr = ERR_PTR(-ENOMEM);
-		kfree(nesmr);
-		goto reg_phys_err;
-	}
-
-
-	mask = !size;
-
-	if (mask & ~PAGE_MASK) {
-		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-		nes_debug(NES_DBG_MR, "Invalid buffer addr or size\n");
-		ibmr = ERR_PTR(-EINVAL);
-		kfree(nesmr);
-		goto reg_phys_err;
-	}
-
-	region_length += size;
-	vpbl.pbl_vbase[0].pa_low = cpu_to_le32((u32)addr & PAGE_MASK);
-	vpbl.pbl_vbase[0].pa_high = cpu_to_le32((u32)((((u64)addr) >> 32)));
-
-	stag = stag_index << 8;
-	stag |= driver_key;
-	stag += (u32)stag_key;
-
-	nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%016lX,"
-			" length = 0x%016lX, index = 0x%08X\n",
-			stag, (unsigned long)*iova_start, (unsigned long)region_length, stag_index);
-
-	/* Make the leaf PBL the root if only one PBL */
-	root_vpbl.pbl_pbase = vpbl.pbl_pbase;
-
-	if (single_page) {
-		pbl_count = 0;
-	} else {
-		pbl_count = 1;
-	}
-	ret = nes_reg_mr(nesdev, nespd, stag, region_length, &root_vpbl,
-			addr, pbl_count, 1, acc, iova_start,
-			&nesmr->pbls_used, &nesmr->pbl_4k);
-
-	if (ret == 0) {
-		nesmr->ibmr.rkey = stag;
-		nesmr->ibmr.lkey = stag;
-		nesmr->mode = IWNES_MEMREG_TYPE_MEM;
-		ibmr = &nesmr->ibmr;
-	} else {
-		kfree(nesmr);
-		ibmr = ERR_PTR(-ENOMEM);
-	}
-
-reg_phys_err:
-	/* single PBL case */
-	pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, vpbl.pbl_pbase);
-	return ibmr;
-}
-
-
-/**
- * nes_get_dma_mr
- */
-static struct ib_mr *nes_get_dma_mr(struct ib_pd *pd, int acc)
-{
-	u64 kva = 0;
-
-	nes_debug(NES_DBG_MR, "\n");
-
-	return nes_reg_phys_mr(pd, 0, 0xffffffffffULL, acc, &kva);
-}
-
-/**
- * nes_reg_user_mr
- */
-static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-		u64 virt, int acc, struct ib_udata *udata)
-{
-	u64 iova_start;
-	__le64 *pbl;
-	u64 region_length;
-	dma_addr_t last_dma_addr = 0;
-	dma_addr_t first_dma_addr = 0;
-	struct nes_pd *nespd = to_nespd(pd);
-	struct nes_vnic *nesvnic = to_nesvnic(pd->device);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct ib_mr *ibmr = ERR_PTR(-EINVAL);
-	struct scatterlist *sg;
-	struct nes_ucontext *nes_ucontext;
-	struct nes_pbl *nespbl;
-	struct nes_mr *nesmr;
-	struct ib_umem *region;
-	struct nes_mem_reg_req req;
-	struct nes_vpbl vpbl;
-	struct nes_root_vpbl root_vpbl;
-	int entry, page_index;
-	int page_count = 0;
-	int err, pbl_depth = 0;
-	int chunk_pages;
-	int ret;
-	u32 stag;
-	u32 stag_index = 0;
-	u32 next_stag_index;
-	u32 driver_key;
-	u32 root_pbl_index = 0;
-	u32 cur_pbl_index = 0;
-	u32 skip_pages;
-	u16 pbl_count;
-	u8 single_page = 1;
-	u8 stag_key;
-	int first_page = 1;
-
-	region = ib_umem_get(pd->uobject->context, start, length, acc, 0);
-	if (IS_ERR(region)) {
-		return (struct ib_mr *)region;
-	}
-
-	nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u,"
-			" offset = %u, page size = %lu.\n",
-			(unsigned long int)start, (unsigned long int)virt, (u32)length,
-			ib_umem_offset(region), BIT(region->page_shift));
-
-	skip_pages = ((u32)ib_umem_offset(region)) >> 12;
-
-	if (ib_copy_from_udata(&req, udata, sizeof(req))) {
-		ib_umem_release(region);
-		return ERR_PTR(-EFAULT);
-	}
-	nes_debug(NES_DBG_MR, "Memory Registration type = %08X.\n", req.reg_type);
-
-	switch (req.reg_type) {
-		case IWNES_MEMREG_TYPE_MEM:
-			pbl_depth = 0;
-			region_length = 0;
-			vpbl.pbl_vbase = NULL;
-			root_vpbl.pbl_vbase = NULL;
-			root_vpbl.pbl_pbase = 0;
-
-			get_random_bytes(&next_stag_index, sizeof(next_stag_index));
-			stag_key = (u8)next_stag_index;
-
-			driver_key = next_stag_index & 0x70000000;
-
-			next_stag_index >>= 8;
-			next_stag_index %= nesadapter->max_mr;
-
-			err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs,
-					nesadapter->max_mr, &stag_index, &next_stag_index, NES_RESOURCE_USER_MR);
-			if (err) {
-				ib_umem_release(region);
-				return ERR_PTR(err);
-			}
-
-			nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
-			if (!nesmr) {
-				ib_umem_release(region);
-				nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-				return ERR_PTR(-ENOMEM);
-			}
-			nesmr->region = region;
-
-			for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) {
-				if (sg_dma_address(sg) & ~PAGE_MASK) {
-					ib_umem_release(region);
-					nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-					nes_debug(NES_DBG_MR, "Unaligned Memory Buffer: 0x%x\n",
-						  (unsigned int) sg_dma_address(sg));
-					ibmr = ERR_PTR(-EINVAL);
-					kfree(nesmr);
-					goto reg_user_mr_err;
-				}
-
-				if (!sg_dma_len(sg)) {
-					ib_umem_release(region);
-					nes_free_resource(nesadapter, nesadapter->allocated_mrs,
-							  stag_index);
-					nes_debug(NES_DBG_MR, "Invalid Buffer Size\n");
-					ibmr = ERR_PTR(-EINVAL);
-					kfree(nesmr);
-					goto reg_user_mr_err;
-				}
-
-				region_length += sg_dma_len(sg);
-				chunk_pages = sg_dma_len(sg) >> 12;
-				region_length -= skip_pages << 12;
-				for (page_index = skip_pages; page_index < chunk_pages; page_index++) {
-					skip_pages = 0;
-					if ((page_count != 0) && (page_count << 12) - (ib_umem_offset(region) & (4096 - 1)) >= region->length)
-						goto enough_pages;
-					if ((page_count&0x01FF) == 0) {
-						if (page_count >= 1024 * 512) {
-							ib_umem_release(region);
-							nes_free_resource(nesadapter,
-									  nesadapter->allocated_mrs, stag_index);
-							kfree(nesmr);
-							ibmr = ERR_PTR(-E2BIG);
-							goto reg_user_mr_err;
-						}
-						if (root_pbl_index == 1) {
-							root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev,
-									8192, &root_vpbl.pbl_pbase);
-							nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n",
-								  root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase);
-							if (!root_vpbl.pbl_vbase) {
-								ib_umem_release(region);
-								pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
-										    vpbl.pbl_pbase);
-								nes_free_resource(nesadapter, nesadapter->allocated_mrs,
-										  stag_index);
-								kfree(nesmr);
-								ibmr = ERR_PTR(-ENOMEM);
-								goto reg_user_mr_err;
-							}
-							root_vpbl.leaf_vpbl = kcalloc(1024,
-										      sizeof(*root_vpbl.leaf_vpbl),
-										      GFP_KERNEL);
-							if (!root_vpbl.leaf_vpbl) {
-								ib_umem_release(region);
-								pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
-										    root_vpbl.pbl_pbase);
-								pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
-										    vpbl.pbl_pbase);
-								nes_free_resource(nesadapter, nesadapter->allocated_mrs,
-										  stag_index);
-								kfree(nesmr);
-								ibmr = ERR_PTR(-ENOMEM);
-								goto reg_user_mr_err;
-							}
-							root_vpbl.pbl_vbase[0].pa_low =
-									cpu_to_le32((u32)vpbl.pbl_pbase);
-							root_vpbl.pbl_vbase[0].pa_high =
-									cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
-							root_vpbl.leaf_vpbl[0] = vpbl;
-						}
-						vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
-								&vpbl.pbl_pbase);
-						nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n",
-							  vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase);
-						if (!vpbl.pbl_vbase) {
-							ib_umem_release(region);
-							nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-							ibmr = ERR_PTR(-ENOMEM);
-							kfree(nesmr);
-							goto reg_user_mr_err;
-						}
-						if (1 <= root_pbl_index) {
-							root_vpbl.pbl_vbase[root_pbl_index].pa_low =
-									cpu_to_le32((u32)vpbl.pbl_pbase);
-							root_vpbl.pbl_vbase[root_pbl_index].pa_high =
-									cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32)));
-							root_vpbl.leaf_vpbl[root_pbl_index] = vpbl;
-						}
-						root_pbl_index++;
-						cur_pbl_index = 0;
-					}
-					if (single_page) {
-						if (page_count != 0) {
-							if ((last_dma_addr+4096) !=
-									(sg_dma_address(sg)+
-									(page_index*4096)))
-								single_page = 0;
-							last_dma_addr = sg_dma_address(sg)+
-									(page_index*4096);
-						} else {
-							first_dma_addr = sg_dma_address(sg)+
-									(page_index*4096);
-							last_dma_addr = first_dma_addr;
-						}
-					}
-
-					vpbl.pbl_vbase[cur_pbl_index].pa_low =
-							cpu_to_le32((u32)(sg_dma_address(sg)+
-							(page_index*4096)));
-					vpbl.pbl_vbase[cur_pbl_index].pa_high =
-							cpu_to_le32((u32)((((u64)(sg_dma_address(sg)+
-							(page_index*4096))) >> 32)));
-					cur_pbl_index++;
-					page_count++;
-				}
-			}
-
-			enough_pages:
-			nes_debug(NES_DBG_MR, "calculating stag, stag_index=0x%08x, driver_key=0x%08x,"
-					" stag_key=0x%08x\n",
-					stag_index, driver_key, stag_key);
-			stag = stag_index << 8;
-			stag |= driver_key;
-			stag += (u32)stag_key;
-
-			iova_start = virt;
-			/* Make the leaf PBL the root if only one PBL */
-			if (root_pbl_index == 1) {
-				root_vpbl.pbl_pbase = vpbl.pbl_pbase;
-			}
-
-			if (single_page) {
-				pbl_count = 0;
-			} else {
-				pbl_count = root_pbl_index;
-				first_dma_addr = 0;
-			}
-			nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%08X, length = 0x%08X,"
-					" index = 0x%08X, region->length=0x%08llx, pbl_count = %u\n",
-					stag, (unsigned int)iova_start,
-					(unsigned int)region_length, stag_index,
-					(unsigned long long)region->length, pbl_count);
-			ret = nes_reg_mr(nesdev, nespd, stag, region->length, &root_vpbl,
-					 first_dma_addr, pbl_count, (u16)cur_pbl_index, acc,
-					 &iova_start, &nesmr->pbls_used, &nesmr->pbl_4k);
-
-			nes_debug(NES_DBG_MR, "ret=%d\n", ret);
-
-			if (ret == 0) {
-				nesmr->ibmr.rkey = stag;
-				nesmr->ibmr.lkey = stag;
-				nesmr->mode = IWNES_MEMREG_TYPE_MEM;
-				ibmr = &nesmr->ibmr;
-			} else {
-				ib_umem_release(region);
-				kfree(nesmr);
-				ibmr = ERR_PTR(-ENOMEM);
-			}
-
-			reg_user_mr_err:
-			/* free the resources */
-			if (root_pbl_index == 1) {
-				pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
-						vpbl.pbl_pbase);
-			} else {
-				for (page_index=0; page_index<root_pbl_index; page_index++) {
-					pci_free_consistent(nesdev->pcidev, 4096,
-							root_vpbl.leaf_vpbl[page_index].pbl_vbase,
-							root_vpbl.leaf_vpbl[page_index].pbl_pbase);
-				}
-				kfree(root_vpbl.leaf_vpbl);
-				pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
-						root_vpbl.pbl_pbase);
-			}
-
-			nes_debug(NES_DBG_MR, "Leaving, ibmr=%p", ibmr);
-
-			return ibmr;
-		case IWNES_MEMREG_TYPE_QP:
-		case IWNES_MEMREG_TYPE_CQ:
-			if (!region->length) {
-				nes_debug(NES_DBG_MR, "Unable to register zero length region for CQ\n");
-				ib_umem_release(region);
-				return ERR_PTR(-EINVAL);
-			}
-			nespbl = kzalloc(sizeof(*nespbl), GFP_KERNEL);
-			if (!nespbl) {
-				ib_umem_release(region);
-				return ERR_PTR(-ENOMEM);
-			}
-			nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
-			if (!nesmr) {
-				ib_umem_release(region);
-				kfree(nespbl);
-				return ERR_PTR(-ENOMEM);
-			}
-			nesmr->region = region;
-			nes_ucontext = to_nesucontext(pd->uobject->context);
-			pbl_depth = region->length >> 12;
-			pbl_depth += (region->length & (4096-1)) ? 1 : 0;
-			nespbl->pbl_size = pbl_depth*sizeof(u64);
-			if (req.reg_type == IWNES_MEMREG_TYPE_QP) {
-				nes_debug(NES_DBG_MR, "Attempting to allocate QP PBL memory");
-			} else {
-				nes_debug(NES_DBG_MR, "Attempting to allocate CP PBL memory");
-			}
-
-			nes_debug(NES_DBG_MR, " %u bytes, %u entries.\n",
-					nespbl->pbl_size, pbl_depth);
-			pbl = pci_alloc_consistent(nesdev->pcidev, nespbl->pbl_size,
-					&nespbl->pbl_pbase);
-			if (!pbl) {
-				ib_umem_release(region);
-				kfree(nesmr);
-				kfree(nespbl);
-				nes_debug(NES_DBG_MR, "Unable to allocate PBL memory\n");
-				return ERR_PTR(-ENOMEM);
-			}
-
-			nespbl->pbl_vbase = (u64 *)pbl;
-			nespbl->user_base = start;
-			nes_debug(NES_DBG_MR, "Allocated PBL memory, %u bytes, pbl_pbase=%lx,"
-					" pbl_vbase=%p user_base=0x%lx\n",
-				  nespbl->pbl_size, (unsigned long) nespbl->pbl_pbase,
-				  (void *) nespbl->pbl_vbase, nespbl->user_base);
-
-			for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) {
-				chunk_pages = sg_dma_len(sg) >> 12;
-				chunk_pages += (sg_dma_len(sg) & (4096-1)) ? 1 : 0;
-				if (first_page) {
-					nespbl->page = sg_page(sg);
-					first_page = 0;
-				}
-
-				for (page_index = 0; page_index < chunk_pages; page_index++) {
-					((__le32 *)pbl)[0] = cpu_to_le32((u32)
-							(sg_dma_address(sg)+
-							(page_index*4096)));
-					((__le32 *)pbl)[1] = cpu_to_le32(((u64)
-							(sg_dma_address(sg)+
-							(page_index*4096)))>>32);
-					nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl,
-						  (unsigned long long)*pbl,
-						  le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0]));
-					pbl++;
-				}
-			}
-
-			if (req.reg_type == IWNES_MEMREG_TYPE_QP) {
-				list_add_tail(&nespbl->list, &nes_ucontext->qp_reg_mem_list);
-			} else {
-				list_add_tail(&nespbl->list, &nes_ucontext->cq_reg_mem_list);
-			}
-			nesmr->ibmr.rkey = -1;
-			nesmr->ibmr.lkey = -1;
-			nesmr->mode = req.reg_type;
-			return &nesmr->ibmr;
-	}
-
-	ib_umem_release(region);
-	return ERR_PTR(-ENOSYS);
-}
-
-
-/**
- * nes_dereg_mr
- */
-static int nes_dereg_mr(struct ib_mr *ib_mr)
-{
-	struct nes_mr *nesmr = to_nesmr(ib_mr);
-	struct nes_vnic *nesvnic = to_nesvnic(ib_mr->device);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	struct nes_cqp_request *cqp_request;
-	unsigned long flags;
-	int ret;
-	u16 major_code;
-	u16 minor_code;
-
-
-	if (nesmr->pages)
-		pci_free_consistent(nesdev->pcidev,
-				    nesmr->max_pages * sizeof(u64),
-				    nesmr->pages,
-				    nesmr->paddr);
-
-	if (nesmr->region) {
-		ib_umem_release(nesmr->region);
-	}
-	if (nesmr->mode != IWNES_MEMREG_TYPE_MEM) {
-		kfree(nesmr);
-		return 0;
-	}
-
-	/* Deallocate the region with the adapter */
-
-	cqp_request = nes_get_cqp_request(nesdev);
-	if (cqp_request == NULL) {
-		nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
-		return -ENOMEM;
-	}
-	cqp_request->waiting = 1;
-	cqp_wqe = &cqp_request->cqp_wqe;
-
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-			NES_CQP_DEALLOCATE_STAG | NES_CQP_STAG_VA_TO |
-			NES_CQP_STAG_DEALLOC_PBLS | NES_CQP_STAG_MR);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ib_mr->rkey);
-
-	atomic_set(&cqp_request->refcount, 2);
-	nes_post_cqp_request(nesdev, cqp_request);
-
-	/* Wait for CQP */
-	nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X completed\n", ib_mr->rkey);
-	ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
-			NES_EVENT_TIMEOUT);
-	nes_debug(NES_DBG_MR, "Deallocate STag 0x%08X completed, wait_event_timeout ret = %u,"
-			" CQP Major:Minor codes = 0x%04X:0x%04X\n",
-			ib_mr->rkey, ret, cqp_request->major_code, cqp_request->minor_code);
-
-	major_code = cqp_request->major_code;
-	minor_code = cqp_request->minor_code;
-
-	nes_put_cqp_request(nesdev, cqp_request);
-
-	if (!ret) {
-		nes_debug(NES_DBG_MR, "Timeout waiting to destroy STag,"
-				" ib_mr=%p, rkey = 0x%08X\n",
-				ib_mr, ib_mr->rkey);
-		return -ETIME;
-	} else if (major_code) {
-		nes_debug(NES_DBG_MR, "Error (0x%04X:0x%04X) while attempting"
-				" to destroy STag, ib_mr=%p, rkey = 0x%08X\n",
-				major_code, minor_code, ib_mr, ib_mr->rkey);
-		return -EIO;
-	}
-
-	if (nesmr->pbls_used != 0) {
-		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-		if (nesmr->pbl_4k) {
-			nesadapter->free_4kpbl += nesmr->pbls_used;
-			if (nesadapter->free_4kpbl > nesadapter->max_4kpbl)
-				printk(KERN_ERR PFX "free 4KB PBLs(%u) has "
-					"exceeded the max(%u)\n",
-					nesadapter->free_4kpbl,
-					nesadapter->max_4kpbl);
-		} else {
-			nesadapter->free_256pbl += nesmr->pbls_used;
-			if (nesadapter->free_256pbl > nesadapter->max_256pbl)
-				printk(KERN_ERR PFX "free 256B PBLs(%u) has "
-					"exceeded the max(%u)\n",
-					nesadapter->free_256pbl,
-					nesadapter->max_256pbl);
-		}
-		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-	}
-	nes_free_resource(nesadapter, nesadapter->allocated_mrs,
-			(ib_mr->rkey & 0x0fffff00) >> 8);
-
-	kfree(nesmr);
-
-	return 0;
-}
-
-
-/**
- * show_rev
- */
-static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
-			char *buf)
-{
-	struct nes_ib_device *nesibdev =
-			container_of(dev, struct nes_ib_device, ibdev.dev);
-	struct nes_vnic *nesvnic = nesibdev->nesvnic;
-
-	nes_debug(NES_DBG_INIT, "\n");
-	return sprintf(buf, "%x\n", nesvnic->nesdev->nesadapter->hw_rev);
-}
-
-
-/**
- * show_hca
- */
-static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
-		        char *buf)
-{
-	nes_debug(NES_DBG_INIT, "\n");
-	return sprintf(buf, "NES020\n");
-}
-
-
-/**
- * show_board
- */
-static ssize_t show_board(struct device *dev, struct device_attribute *attr,
-			  char *buf)
-{
-	nes_debug(NES_DBG_INIT, "\n");
-	return sprintf(buf, "%.*s\n", 32, "NES020 Board ID");
-}
-
-
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
-
-static struct device_attribute *nes_dev_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_hca_type,
-	&dev_attr_board_id
-};
-
-
-/**
- * nes_query_qp
- */
-static int nes_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-		int attr_mask, struct ib_qp_init_attr *init_attr)
-{
-	struct nes_qp *nesqp = to_nesqp(ibqp);
-
-	nes_debug(NES_DBG_QP, "\n");
-
-	attr->qp_access_flags = 0;
-	attr->cap.max_send_wr = nesqp->hwqp.sq_size;
-	attr->cap.max_recv_wr = nesqp->hwqp.rq_size;
-	attr->cap.max_recv_sge = 1;
-	if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA)
-		attr->cap.max_inline_data = 0;
-	else
-		attr->cap.max_inline_data = 64;
-
-	init_attr->event_handler = nesqp->ibqp.event_handler;
-	init_attr->qp_context = nesqp->ibqp.qp_context;
-	init_attr->send_cq = nesqp->ibqp.send_cq;
-	init_attr->recv_cq = nesqp->ibqp.recv_cq;
-	init_attr->srq = nesqp->ibqp.srq;
-	init_attr->cap = attr->cap;
-
-	return 0;
-}
-
-
-/**
- * nes_hw_modify_qp
- */
-int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp,
-		u32 next_iwarp_state, u32 termlen, u32 wait_completion)
-{
-	struct nes_hw_cqp_wqe *cqp_wqe;
-	/* struct iw_cm_id *cm_id = nesqp->cm_id; */
-	/* struct iw_cm_event cm_event; */
-	struct nes_cqp_request *cqp_request;
-	int ret;
-	u16 major_code;
-
-	nes_debug(NES_DBG_MOD_QP, "QP%u, refcount=%d\n",
-			nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount));
-
-	cqp_request = nes_get_cqp_request(nesdev);
-	if (cqp_request == NULL) {
-		nes_debug(NES_DBG_MOD_QP, "Failed to get a cqp_request.\n");
-		return -ENOMEM;
-	}
-	if (wait_completion) {
-		cqp_request->waiting = 1;
-	} else {
-		cqp_request->waiting = 0;
-	}
-	cqp_wqe = &cqp_request->cqp_wqe;
-
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-			NES_CQP_MODIFY_QP | NES_CQP_QP_TYPE_IWARP | next_iwarp_state);
-	nes_debug(NES_DBG_MOD_QP, "using next_iwarp_state=%08x, wqe_words=%08x\n",
-			next_iwarp_state, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]));
-	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
-	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, (u64)nesqp->nesqp_context_pbase);
-
-	/* If sending a terminate message, fill in the length (in words) */
-	if (((next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK) == NES_CQP_QP_IWARP_STATE_TERMINATE) &&
-	    !(next_iwarp_state & NES_CQP_QP_TERM_DONT_SEND_TERM_MSG)) {
-		termlen = ((termlen + 3) >> 2) << NES_CQP_OP_TERMLEN_SHIFT;
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_NEW_MSS_IDX, termlen);
-	}
-
-	atomic_set(&cqp_request->refcount, 2);
-	nes_post_cqp_request(nesdev, cqp_request);
-
-	/* Wait for CQP */
-	if (wait_completion) {
-		/* nes_debug(NES_DBG_MOD_QP, "Waiting for modify iWARP QP%u to complete.\n",
-				nesqp->hwqp.qp_id); */
-		ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
-				NES_EVENT_TIMEOUT);
-		nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u completed, wait_event_timeout ret=%u, "
-				"CQP Major:Minor codes = 0x%04X:0x%04X.\n",
-				nesqp->hwqp.qp_id, ret, cqp_request->major_code, cqp_request->minor_code);
-		major_code = cqp_request->major_code;
-		if (major_code) {
-			nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u failed"
-					"CQP Major:Minor codes = 0x%04X:0x%04X, intended next state = 0x%08X.\n",
-					nesqp->hwqp.qp_id, cqp_request->major_code,
-					cqp_request->minor_code, next_iwarp_state);
-		}
-
-		nes_put_cqp_request(nesdev, cqp_request);
-
-		if (!ret)
-			return -ETIME;
-		else if (major_code)
-			return -EIO;
-		else
-			return 0;
-	} else {
-		return 0;
-	}
-}
-
-
-/**
- * nes_modify_qp
- */
-int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-		int attr_mask, struct ib_udata *udata)
-{
-	struct nes_qp *nesqp = to_nesqp(ibqp);
-	struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	/* u32 cqp_head; */
-	/* u32 counter; */
-	u32 next_iwarp_state = 0;
-	int err;
-	unsigned long qplockflags;
-	int ret;
-	u16 original_last_aeq;
-	u8 issue_modify_qp = 0;
-	u8 dont_wait = 0;
-
-	nes_debug(NES_DBG_MOD_QP, "QP%u: QP State=%u, cur QP State=%u,"
-			" iwarp_state=0x%X, refcount=%d\n",
-			nesqp->hwqp.qp_id, attr->qp_state, nesqp->ibqp_state,
-			nesqp->iwarp_state, atomic_read(&nesqp->refcount));
-
-	spin_lock_irqsave(&nesqp->lock, qplockflags);
-
-	nes_debug(NES_DBG_MOD_QP, "QP%u: hw_iwarp_state=0x%X, hw_tcp_state=0x%X,"
-			" QP Access Flags=0x%X, attr_mask = 0x%0x\n",
-			nesqp->hwqp.qp_id, nesqp->hw_iwarp_state,
-			nesqp->hw_tcp_state, attr->qp_access_flags, attr_mask);
-
-	if (attr_mask & IB_QP_STATE) {
-		switch (attr->qp_state) {
-			case IB_QPS_INIT:
-				nes_debug(NES_DBG_MOD_QP, "QP%u: new state = init\n",
-						nesqp->hwqp.qp_id);
-				if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_IDLE) {
-					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-					return -EINVAL;
-				}
-				next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE;
-				issue_modify_qp = 1;
-				break;
-			case IB_QPS_RTR:
-				nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rtr\n",
-						nesqp->hwqp.qp_id);
-				if (nesqp->iwarp_state>(u32)NES_CQP_QP_IWARP_STATE_IDLE) {
-					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-					return -EINVAL;
-				}
-				next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE;
-				issue_modify_qp = 1;
-				break;
-			case IB_QPS_RTS:
-				nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rts\n",
-						nesqp->hwqp.qp_id);
-				if (nesqp->iwarp_state>(u32)NES_CQP_QP_IWARP_STATE_RTS) {
-					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-					return -EINVAL;
-				}
-				if (nesqp->cm_id == NULL) {
-					nes_debug(NES_DBG_MOD_QP, "QP%u: Failing attempt to move QP to RTS without a CM_ID. \n",
-							nesqp->hwqp.qp_id );
-					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-					return -EINVAL;
-				}
-				next_iwarp_state = NES_CQP_QP_IWARP_STATE_RTS;
-				if (nesqp->iwarp_state != NES_CQP_QP_IWARP_STATE_RTS)
-					next_iwarp_state |= NES_CQP_QP_CONTEXT_VALID |
-							NES_CQP_QP_ARP_VALID | NES_CQP_QP_ORD_VALID;
-				issue_modify_qp = 1;
-				nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_ESTABLISHED;
-				nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_RTS;
-				nesqp->hte_added = 1;
-				break;
-			case IB_QPS_SQD:
-				issue_modify_qp = 1;
-				nes_debug(NES_DBG_MOD_QP, "QP%u: new state=closing. SQ head=%u, SQ tail=%u\n",
-						nesqp->hwqp.qp_id, nesqp->hwqp.sq_head, nesqp->hwqp.sq_tail);
-				if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_CLOSING) {
-					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-					return 0;
-				} else {
-					if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_CLOSING) {
-						nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing"
-								" ignored due to current iWARP state\n",
-								nesqp->hwqp.qp_id);
-						spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-						return -EINVAL;
-					}
-					if (nesqp->hw_iwarp_state != NES_AEQE_IWARP_STATE_RTS) {
-						nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing"
-								" already done based on hw state.\n",
-								nesqp->hwqp.qp_id);
-						issue_modify_qp = 0;
-					}
-					switch (nesqp->hw_iwarp_state) {
-						case NES_AEQE_IWARP_STATE_CLOSING:
-							next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING;
-							break;
-						case NES_AEQE_IWARP_STATE_TERMINATE:
-							next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE;
-							break;
-						case NES_AEQE_IWARP_STATE_ERROR:
-							next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR;
-							break;
-						default:
-							next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING;
-							nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING;
-							break;
-					}
-				}
-				break;
-			case IB_QPS_SQE:
-				nes_debug(NES_DBG_MOD_QP, "QP%u: new state = terminate\n",
-						nesqp->hwqp.qp_id);
-				if (nesqp->iwarp_state>=(u32)NES_CQP_QP_IWARP_STATE_TERMINATE) {
-					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-					return -EINVAL;
-				}
-				/* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */
-				next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE;
-				nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_TERMINATE;
-				issue_modify_qp = 1;
-				break;
-			case IB_QPS_ERR:
-			case IB_QPS_RESET:
-				if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_ERROR) {
-					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-					return -EINVAL;
-				}
-				nes_debug(NES_DBG_MOD_QP, "QP%u: new state = error\n",
-						nesqp->hwqp.qp_id);
-				if (nesqp->term_flags)
-					del_timer(&nesqp->terminate_timer);
-
-				next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR;
-				/* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */
-				if (nesqp->hte_added) {
-					nes_debug(NES_DBG_MOD_QP, "set CQP_QP_DEL_HTE\n");
-					next_iwarp_state |= NES_CQP_QP_DEL_HTE;
-					nesqp->hte_added = 0;
-				}
-				if ((nesqp->hw_tcp_state > NES_AEQE_TCP_STATE_CLOSED) &&
-						(nesdev->iw_status) &&
-						(nesqp->hw_tcp_state != NES_AEQE_TCP_STATE_TIME_WAIT)) {
-					next_iwarp_state |= NES_CQP_QP_RESET;
-				} else {
-					nes_debug(NES_DBG_MOD_QP, "QP%u NOT setting NES_CQP_QP_RESET since TCP state = %u\n",
-							nesqp->hwqp.qp_id, nesqp->hw_tcp_state);
-					dont_wait = 1;
-				}
-				issue_modify_qp = 1;
-				nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_ERROR;
-				break;
-			default:
-				spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-				return -EINVAL;
-				break;
-		}
-
-		nesqp->ibqp_state = attr->qp_state;
-		nesqp->iwarp_state = next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK;
-		nes_debug(NES_DBG_MOD_QP, "Change nesqp->iwarp_state=%08x\n",
-				nesqp->iwarp_state);
-	}
-
-	if (attr_mask & IB_QP_ACCESS_FLAGS) {
-		if (attr->qp_access_flags & IB_ACCESS_LOCAL_WRITE) {
-			nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN |
-					NES_QPCONTEXT_MISC_RDMA_READ_EN);
-			issue_modify_qp = 1;
-		}
-		if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) {
-			nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN);
-			issue_modify_qp = 1;
-		}
-		if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) {
-			nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_READ_EN);
-			issue_modify_qp = 1;
-		}
-		if (attr->qp_access_flags & IB_ACCESS_MW_BIND) {
-			nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WBIND_EN);
-			issue_modify_qp = 1;
-		}
-
-		if (nesqp->user_mode) {
-			nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN |
-					NES_QPCONTEXT_MISC_RDMA_READ_EN);
-			issue_modify_qp = 1;
-		}
-	}
-
-	original_last_aeq = nesqp->last_aeq;
-	spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-
-	nes_debug(NES_DBG_MOD_QP, "issue_modify_qp=%u\n", issue_modify_qp);
-
-	ret = 0;
-
-
-	if (issue_modify_qp) {
-		nes_debug(NES_DBG_MOD_QP, "call nes_hw_modify_qp\n");
-		ret = nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 1);
-		if (ret)
-			nes_debug(NES_DBG_MOD_QP, "nes_hw_modify_qp (next_iwarp_state = 0x%08X)"
-					" failed for QP%u.\n",
-					next_iwarp_state, nesqp->hwqp.qp_id);
-
-	}
-
-	if ((issue_modify_qp) && (nesqp->ibqp_state > IB_QPS_RTS)) {
-		nes_debug(NES_DBG_MOD_QP, "QP%u Issued ModifyQP refcount (%d),"
-				" original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
-				nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
-				original_last_aeq, nesqp->last_aeq);
-		if (!ret || original_last_aeq != NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) {
-			if (dont_wait) {
-				if (nesqp->cm_id && nesqp->hw_tcp_state != 0) {
-					nes_debug(NES_DBG_MOD_QP, "QP%u Queuing fake disconnect for QP refcount (%d),"
-							" original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
-							nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
-							original_last_aeq, nesqp->last_aeq);
-					/* this one is for the cm_disconnect thread */
-					spin_lock_irqsave(&nesqp->lock, qplockflags);
-					nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED;
-					nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT;
-					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-					nes_cm_disconn(nesqp);
-				} else {
-					nes_debug(NES_DBG_MOD_QP, "QP%u No fake disconnect, QP refcount=%d\n",
-							nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount));
-				}
-			} else {
-				spin_lock_irqsave(&nesqp->lock, qplockflags);
-				if (nesqp->cm_id) {
-					/* These two are for the timer thread */
-					if (atomic_inc_return(&nesqp->close_timer_started) == 1) {
-						nesqp->cm_id->add_ref(nesqp->cm_id);
-						nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d),"
-								" need ae to finish up, original_last_aeq = 0x%04X."
-								" last_aeq = 0x%04X, scheduling timer.\n",
-								nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
-								original_last_aeq, nesqp->last_aeq);
-						schedule_nes_timer(nesqp->cm_node, (struct sk_buff *) nesqp, NES_TIMER_TYPE_CLOSE, 1, 0);
-					}
-					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-				} else {
-					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-					nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d),"
-							" need ae to finish up, original_last_aeq = 0x%04X."
-							" last_aeq = 0x%04X.\n",
-							nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
-							original_last_aeq, nesqp->last_aeq);
-				}
-			}
-		} else {
-			nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up,"
-					" original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
-					nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
-					original_last_aeq, nesqp->last_aeq);
-		}
-	} else {
-		nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up,"
-				" original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
-				nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
-				original_last_aeq, nesqp->last_aeq);
-	}
-
-	err = 0;
-
-	nes_debug(NES_DBG_MOD_QP, "QP%u Leaving, refcount=%d\n",
-			nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount));
-
-	return err;
-}
-
-static inline void
-fill_wqe_sg_send(struct nes_hw_qp_wqe *wqe, const struct ib_send_wr *ib_wr,
-		 u32 uselkey)
-{
-	int sge_index;
-	int total_payload_length = 0;
-	for (sge_index = 0; sge_index < ib_wr->num_sge; sge_index++) {
-		set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX+(sge_index*4),
-			ib_wr->sg_list[sge_index].addr);
-		set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_LENGTH0_IDX + (sge_index*4),
-			ib_wr->sg_list[sge_index].length);
-		if (uselkey)
-			set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4),
-						(ib_wr->sg_list[sge_index].lkey));
-		else
-			set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4), 0);
-
-		total_payload_length += ib_wr->sg_list[sge_index].length;
-	}
-	nes_debug(NES_DBG_IW_TX, "UC UC UC, sending total_payload_length=%u \n",
-			total_payload_length);
-	set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX,
-				total_payload_length);
-}
-
-/**
- * nes_post_send
- */
-static int nes_post_send(struct ib_qp *ibqp, const struct ib_send_wr *ib_wr,
-			 const struct ib_send_wr **bad_wr)
-{
-	u64 u64temp;
-	unsigned long flags = 0;
-	struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_qp *nesqp = to_nesqp(ibqp);
-	struct nes_hw_qp_wqe *wqe;
-	int err = 0;
-	u32 qsize = nesqp->hwqp.sq_size;
-	u32 head;
-	u32 wqe_misc = 0;
-	u32 wqe_count = 0;
-	u32 counter;
-
-	if (nesqp->ibqp_state > IB_QPS_RTS) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	spin_lock_irqsave(&nesqp->lock, flags);
-
-	head = nesqp->hwqp.sq_head;
-
-	while (ib_wr) {
-		/* Check for QP error */
-		if (nesqp->term_flags) {
-			err = -EINVAL;
-			break;
-		}
-
-		/* Check for SQ overflow */
-		if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) {
-			err = -ENOMEM;
-			break;
-		}
-
-		wqe = &nesqp->hwqp.sq_vbase[head];
-		/* nes_debug(NES_DBG_IW_TX, "processing sq wqe for QP%u at %p, head = %u.\n",
-				nesqp->hwqp.qp_id, wqe, head); */
-		nes_fill_init_qp_wqe(wqe, nesqp, head);
-		u64temp = (u64)(ib_wr->wr_id);
-		set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX,
-					u64temp);
-		switch (ib_wr->opcode) {
-		case IB_WR_SEND:
-		case IB_WR_SEND_WITH_INV:
-			if (IB_WR_SEND == ib_wr->opcode) {
-				if (ib_wr->send_flags & IB_SEND_SOLICITED)
-					wqe_misc = NES_IWARP_SQ_OP_SENDSE;
-				else
-					wqe_misc = NES_IWARP_SQ_OP_SEND;
-			} else {
-				if (ib_wr->send_flags & IB_SEND_SOLICITED)
-					wqe_misc = NES_IWARP_SQ_OP_SENDSEINV;
-				else
-					wqe_misc = NES_IWARP_SQ_OP_SENDINV;
-
-				set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX,
-						    ib_wr->ex.invalidate_rkey);
-			}
-
-			if (ib_wr->num_sge > nesdev->nesadapter->max_sge) {
-				err = -EINVAL;
-				break;
-			}
-
-			if (ib_wr->send_flags & IB_SEND_FENCE)
-				wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE;
-
-			if ((ib_wr->send_flags & IB_SEND_INLINE) &&
-			    ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) &&
-			     (ib_wr->sg_list[0].length <= 64)) {
-				memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX],
-				       (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length);
-				set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX,
-						    ib_wr->sg_list[0].length);
-				wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA;
-			} else {
-				fill_wqe_sg_send(wqe, ib_wr, 1);
-			}
-
-			break;
-		case IB_WR_RDMA_WRITE:
-			wqe_misc = NES_IWARP_SQ_OP_RDMAW;
-			if (ib_wr->num_sge > nesdev->nesadapter->max_sge) {
-				nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=%u\n",
-					  ib_wr->num_sge, nesdev->nesadapter->max_sge);
-				err = -EINVAL;
-				break;
-			}
-
-			if (ib_wr->send_flags & IB_SEND_FENCE)
-				wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE;
-
-			set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX,
-					    rdma_wr(ib_wr)->rkey);
-			set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX,
-					    rdma_wr(ib_wr)->remote_addr);
-
-			if ((ib_wr->send_flags & IB_SEND_INLINE) &&
-			    ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) &&
-			     (ib_wr->sg_list[0].length <= 64)) {
-				memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX],
-				       (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length);
-				set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX,
-						    ib_wr->sg_list[0].length);
-				wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA;
-			} else {
-				fill_wqe_sg_send(wqe, ib_wr, 1);
-			}
-
-			wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] =
-				wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX];
-			break;
-		case IB_WR_RDMA_READ:
-		case IB_WR_RDMA_READ_WITH_INV:
-			/* iWARP only supports 1 sge for RDMA reads */
-			if (ib_wr->num_sge > 1) {
-				nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=1\n",
-					  ib_wr->num_sge);
-				err = -EINVAL;
-				break;
-			}
-			if (ib_wr->opcode == IB_WR_RDMA_READ) {
-				wqe_misc = NES_IWARP_SQ_OP_RDMAR;
-			} else {
-				wqe_misc = NES_IWARP_SQ_OP_RDMAR_LOCINV;
-				set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX,
-						    ib_wr->ex.invalidate_rkey);
-			}
-
-			set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX,
-					    rdma_wr(ib_wr)->remote_addr);
-			set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX,
-					    rdma_wr(ib_wr)->rkey);
-			set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX,
-					    ib_wr->sg_list->length);
-			set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX,
-					    ib_wr->sg_list->addr);
-			set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX,
-					    ib_wr->sg_list->lkey);
-			break;
-		case IB_WR_LOCAL_INV:
-			wqe_misc = NES_IWARP_SQ_OP_LOCINV;
-			set_wqe_32bit_value(wqe->wqe_words,
-					    NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX,
-					    ib_wr->ex.invalidate_rkey);
-			break;
-		case IB_WR_REG_MR:
-		{
-			struct nes_mr *mr = to_nesmr(reg_wr(ib_wr)->mr);
-			int page_shift = ilog2(reg_wr(ib_wr)->mr->page_size);
-			int flags = reg_wr(ib_wr)->access;
-
-			if (mr->npages > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) {
-				nes_debug(NES_DBG_IW_TX, "SQ_FMR: bad page_list_len\n");
-				err = -EINVAL;
-				break;
-			}
-			wqe_misc = NES_IWARP_SQ_OP_FAST_REG;
-			set_wqe_64bit_value(wqe->wqe_words,
-					    NES_IWARP_SQ_FMR_WQE_VA_FBO_LOW_IDX,
-					    mr->ibmr.iova);
-			set_wqe_32bit_value(wqe->wqe_words,
-					    NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX,
-					    lower_32_bits(mr->ibmr.length));
-			set_wqe_32bit_value(wqe->wqe_words,
-					    NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX, 0);
-			set_wqe_32bit_value(wqe->wqe_words,
-					    NES_IWARP_SQ_FMR_WQE_MR_STAG_IDX,
-					    reg_wr(ib_wr)->key);
-
-			if (page_shift == 12) {
-				wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_4K;
-			} else if (page_shift == 21) {
-				wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_2M;
-			} else {
-				nes_debug(NES_DBG_IW_TX, "Invalid page shift,"
-					  " ib_wr=%u, max=1\n", ib_wr->num_sge);
-				err = -EINVAL;
-				break;
-			}
-
-			/* Set access_flags */
-			wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_READ;
-			if (flags & IB_ACCESS_LOCAL_WRITE)
-				wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_WRITE;
-
-			if (flags & IB_ACCESS_REMOTE_WRITE)
-				wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_WRITE;
-
-			if (flags & IB_ACCESS_REMOTE_READ)
-				wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_READ;
-
-			if (flags & IB_ACCESS_MW_BIND)
-				wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_WINDOW_BIND;
-
-			/* Fill in PBL info: */
-			set_wqe_64bit_value(wqe->wqe_words,
-					    NES_IWARP_SQ_FMR_WQE_PBL_ADDR_LOW_IDX,
-					    mr->paddr);
-
-			set_wqe_32bit_value(wqe->wqe_words,
-					    NES_IWARP_SQ_FMR_WQE_PBL_LENGTH_IDX,
-					    mr->npages * 8);
-
-			nes_debug(NES_DBG_IW_TX, "SQ_REG_MR: iova_start: %llx, "
-				  "length: %lld, rkey: %0x, pgl_paddr: %llx, "
-				  "page_list_len: %u, wqe_misc: %x\n",
-				  (unsigned long long) mr->ibmr.iova,
-				  mr->ibmr.length,
-				  reg_wr(ib_wr)->key,
-				  (unsigned long long) mr->paddr,
-				  mr->npages,
-				  wqe_misc);
-			break;
-		}
-		default:
-			/* error */
-			err = -EINVAL;
-			break;
-		}
-
-		if (err)
-			break;
-
-		if ((ib_wr->send_flags & IB_SEND_SIGNALED) || nesqp->sig_all)
-			wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL;
-
-		wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = cpu_to_le32(wqe_misc);
-
-		ib_wr = ib_wr->next;
-		head++;
-		wqe_count++;
-		if (head >= qsize)
-			head = 0;
-
-	}
-
-	nesqp->hwqp.sq_head = head;
-	barrier();
-	while (wqe_count) {
-		counter = min(wqe_count, ((u32)255));
-		wqe_count -= counter;
-		nes_write32(nesdev->regs + NES_WQE_ALLOC,
-				(counter << 24) | 0x00800000 | nesqp->hwqp.qp_id);
-	}
-
-	spin_unlock_irqrestore(&nesqp->lock, flags);
-
-out:
-	if (err)
-		*bad_wr = ib_wr;
-	return err;
-}
-
-
-/**
- * nes_post_recv
- */
-static int nes_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *ib_wr,
-			 const struct ib_recv_wr **bad_wr)
-{
-	u64 u64temp;
-	unsigned long flags = 0;
-	struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_qp *nesqp = to_nesqp(ibqp);
-	struct nes_hw_qp_wqe *wqe;
-	int err = 0;
-	int sge_index;
-	u32 qsize = nesqp->hwqp.rq_size;
-	u32 head;
-	u32 wqe_count = 0;
-	u32 counter;
-	u32 total_payload_length;
-
-	if (nesqp->ibqp_state > IB_QPS_RTS) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	spin_lock_irqsave(&nesqp->lock, flags);
-
-	head = nesqp->hwqp.rq_head;
-
-	while (ib_wr) {
-		/* Check for QP error */
-		if (nesqp->term_flags) {
-			err = -EINVAL;
-			break;
-		}
-
-		if (ib_wr->num_sge > nesdev->nesadapter->max_sge) {
-			err = -EINVAL;
-			break;
-		}
-		/* Check for RQ overflow */
-		if (((head + (2 * qsize) - nesqp->hwqp.rq_tail) % qsize) == (qsize - 1)) {
-			err = -ENOMEM;
-			break;
-		}
-
-		nes_debug(NES_DBG_IW_RX, "ibwr sge count = %u.\n", ib_wr->num_sge);
-		wqe = &nesqp->hwqp.rq_vbase[head];
-
-		/* nes_debug(NES_DBG_IW_RX, "QP%u:processing rq wqe at %p, head = %u.\n",
-				nesqp->hwqp.qp_id, wqe, head); */
-		nes_fill_init_qp_wqe(wqe, nesqp, head);
-		u64temp = (u64)(ib_wr->wr_id);
-		set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX,
-					u64temp);
-		total_payload_length = 0;
-		for (sge_index=0; sge_index < ib_wr->num_sge; sge_index++) {
-			set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_FRAG0_LOW_IDX+(sge_index*4),
-					ib_wr->sg_list[sge_index].addr);
-			set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_LENGTH0_IDX+(sge_index*4),
-					ib_wr->sg_list[sge_index].length);
-			set_wqe_32bit_value(wqe->wqe_words,NES_IWARP_RQ_WQE_STAG0_IDX+(sge_index*4),
-					ib_wr->sg_list[sge_index].lkey);
-
-			total_payload_length += ib_wr->sg_list[sge_index].length;
-		}
-		set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_TOTAL_PAYLOAD_IDX,
-					total_payload_length);
-
-		ib_wr = ib_wr->next;
-		head++;
-		wqe_count++;
-		if (head >= qsize)
-			head = 0;
-	}
-
-	nesqp->hwqp.rq_head = head;
-	barrier();
-	while (wqe_count) {
-		counter = min(wqe_count, ((u32)255));
-		wqe_count -= counter;
-		nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter<<24) | nesqp->hwqp.qp_id);
-	}
-
-	spin_unlock_irqrestore(&nesqp->lock, flags);
-
-out:
-	if (err)
-		*bad_wr = ib_wr;
-	return err;
-}
-
-/**
- * nes_drain_sq - drain sq
- * @ibqp: pointer to ibqp
- */
-static void nes_drain_sq(struct ib_qp *ibqp)
-{
-	struct nes_qp *nesqp = to_nesqp(ibqp);
-
-	if (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head)
-		wait_for_completion(&nesqp->sq_drained);
-}
-
-/**
- * nes_drain_rq - drain rq
- * @ibqp: pointer to ibqp
- */
-static void nes_drain_rq(struct ib_qp *ibqp)
-{
-	struct nes_qp *nesqp = to_nesqp(ibqp);
-
-	if (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head)
-		wait_for_completion(&nesqp->rq_drained);
-}
-
-/**
- * nes_poll_cq
- */
-static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
-{
-	u64 u64temp;
-	u64 wrid;
-	unsigned long flags = 0;
-	struct nes_vnic *nesvnic = to_nesvnic(ibcq->device);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_cq *nescq = to_nescq(ibcq);
-	struct nes_qp *nesqp;
-	struct nes_hw_cqe cqe;
-	u32 head;
-	u32 wq_tail = 0;
-	u32 cq_size;
-	u32 cqe_count = 0;
-	u32 wqe_index;
-	u32 u32temp;
-	u32 move_cq_head = 1;
-	u32 err_code;
-
-	nes_debug(NES_DBG_CQ, "\n");
-
-	spin_lock_irqsave(&nescq->lock, flags);
-
-	head = nescq->hw_cq.cq_head;
-	cq_size = nescq->hw_cq.cq_size;
-
-	while (cqe_count < num_entries) {
-		if ((le32_to_cpu(nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]) &
-				NES_CQE_VALID) == 0)
-			break;
-
-		/*
-		 * Make sure we read CQ entry contents *after*
-		 * we've checked the valid bit.
-		 */
-		rmb();
-
-		cqe = nescq->hw_cq.cq_vbase[head];
-		u32temp = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]);
-		wqe_index = u32temp & (nesdev->nesadapter->max_qp_wr - 1);
-		u32temp &= ~(NES_SW_CONTEXT_ALIGN-1);
-		/* parse CQE, get completion context from WQE (either rq or sq) */
-		u64temp = (((u64)(le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX])))<<32) |
-				((u64)u32temp);
-
-		if (u64temp) {
-			nesqp = (struct nes_qp *)(unsigned long)u64temp;
-			memset(entry, 0, sizeof *entry);
-			if (cqe.cqe_words[NES_CQE_ERROR_CODE_IDX] == 0) {
-				entry->status = IB_WC_SUCCESS;
-			} else {
-				err_code = le32_to_cpu(cqe.cqe_words[NES_CQE_ERROR_CODE_IDX]);
-				if (NES_IWARP_CQE_MAJOR_DRV == (err_code >> 16)) {
-					entry->status = err_code & 0x0000ffff;
-
-					/* The rest of the cqe's will be marked as flushed */
-					nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX] =
-						cpu_to_le32((NES_IWARP_CQE_MAJOR_FLUSH << 16) |
-							    NES_IWARP_CQE_MINOR_FLUSH);
-				} else
-					entry->status = IB_WC_WR_FLUSH_ERR;
-			}
-
-			entry->qp = &nesqp->ibqp;
-			entry->src_qp = nesqp->hwqp.qp_id;
-
-			if (le32_to_cpu(cqe.cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_SQ) {
-				if (nesqp->skip_lsmm) {
-					nesqp->skip_lsmm = 0;
-					nesqp->hwqp.sq_tail++;
-				}
-
-				/* Working on a SQ Completion*/
-				wrid = (((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index].
-						wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_HIGH_IDX]))) << 32) |
-						((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index].
-						wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX])));
-				entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index].
-						wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]);
-
-				switch (le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index].
-						wqe_words[NES_IWARP_SQ_WQE_MISC_IDX]) & 0x3f) {
-					case NES_IWARP_SQ_OP_RDMAW:
-						nes_debug(NES_DBG_CQ, "Operation = RDMA WRITE.\n");
-						entry->opcode = IB_WC_RDMA_WRITE;
-						break;
-					case NES_IWARP_SQ_OP_RDMAR:
-						nes_debug(NES_DBG_CQ, "Operation = RDMA READ.\n");
-						entry->opcode = IB_WC_RDMA_READ;
-						entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index].
-								wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX]);
-						break;
-					case NES_IWARP_SQ_OP_SENDINV:
-					case NES_IWARP_SQ_OP_SENDSEINV:
-					case NES_IWARP_SQ_OP_SEND:
-					case NES_IWARP_SQ_OP_SENDSE:
-						nes_debug(NES_DBG_CQ, "Operation = Send.\n");
-						entry->opcode = IB_WC_SEND;
-						break;
-					case NES_IWARP_SQ_OP_LOCINV:
-						entry->opcode = IB_WC_LOCAL_INV;
-						break;
-					case NES_IWARP_SQ_OP_FAST_REG:
-						entry->opcode = IB_WC_REG_MR;
-						break;
-				}
-
-				nesqp->hwqp.sq_tail = (wqe_index+1)&(nesqp->hwqp.sq_size - 1);
-				if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head)) {
-					move_cq_head = 0;
-					wq_tail = nesqp->hwqp.sq_tail;
-				}
-			} else {
-				/* Working on a RQ Completion*/
-				entry->byte_len = le32_to_cpu(cqe.cqe_words[NES_CQE_PAYLOAD_LENGTH_IDX]);
-				wrid = ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX]))) |
-					((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX]))<<32);
-				entry->opcode = IB_WC_RECV;
-
-				nesqp->hwqp.rq_tail = (wqe_index+1)&(nesqp->hwqp.rq_size - 1);
-				if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head)) {
-					move_cq_head = 0;
-					wq_tail = nesqp->hwqp.rq_tail;
-				}
-			}
-
-			if (nesqp->iwarp_state > NES_CQP_QP_IWARP_STATE_RTS) {
-				if (nesqp->hwqp.sq_tail == nesqp->hwqp.sq_head)
-					complete(&nesqp->sq_drained);
-				if (nesqp->hwqp.rq_tail == nesqp->hwqp.rq_head)
-					complete(&nesqp->rq_drained);
-			}
-
-			entry->wr_id = wrid;
-			entry++;
-			cqe_count++;
-		}
-
-		if (move_cq_head) {
-			nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0;
-			if (++head >= cq_size)
-				head = 0;
-			nescq->polled_completions++;
-
-			if ((nescq->polled_completions > (cq_size / 2)) ||
-					(nescq->polled_completions == 255)) {
-				nes_debug(NES_DBG_CQ, "CQ%u Issuing CQE Allocate since more than half of cqes"
-					" are pending %u of %u.\n",
-					nescq->hw_cq.cq_number, nescq->polled_completions, cq_size);
-				nes_write32(nesdev->regs+NES_CQE_ALLOC,
-					nescq->hw_cq.cq_number | (nescq->polled_completions << 16));
-				nescq->polled_completions = 0;
-			}
-		} else {
-			/* Update the wqe index and set status to flush */
-			wqe_index = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]);
-			wqe_index = (wqe_index & (~(nesdev->nesadapter->max_qp_wr - 1))) | wq_tail;
-			nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] =
-				cpu_to_le32(wqe_index);
-			move_cq_head = 1; /* ready for next pass */
-		}
-	}
-
-	if (nescq->polled_completions) {
-		nes_write32(nesdev->regs+NES_CQE_ALLOC,
-				nescq->hw_cq.cq_number | (nescq->polled_completions << 16));
-		nescq->polled_completions = 0;
-	}
-
-	nescq->hw_cq.cq_head = head;
-	nes_debug(NES_DBG_CQ, "Reporting %u completions for CQ%u.\n",
-			cqe_count, nescq->hw_cq.cq_number);
-
-	spin_unlock_irqrestore(&nescq->lock, flags);
-
-	return cqe_count;
-}
-
-
-/**
- * nes_req_notify_cq
- */
-static int nes_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
-		{
-	struct nes_vnic *nesvnic = to_nesvnic(ibcq->device);
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_cq *nescq = to_nescq(ibcq);
-	u32 cq_arm;
-
-	nes_debug(NES_DBG_CQ, "Requesting notification for CQ%u.\n",
-			nescq->hw_cq.cq_number);
-
-	cq_arm = nescq->hw_cq.cq_number;
-	if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP)
-		cq_arm |= NES_CQE_ALLOC_NOTIFY_NEXT;
-	else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
-		cq_arm |= NES_CQE_ALLOC_NOTIFY_SE;
-	else
-		return -EINVAL;
-
-	nes_write32(nesdev->regs+NES_CQE_ALLOC, cq_arm);
-	nes_read32(nesdev->regs+NES_CQE_ALLOC);
-
-	return 0;
-}
-
-static int nes_port_immutable(struct ib_device *ibdev, u8 port_num,
-			      struct ib_port_immutable *immutable)
-{
-	struct ib_port_attr attr;
-	int err;
-
-	immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
-
-	err = nes_query_port(ibdev, port_num, &attr);
-	if (err)
-		return err;
-
-	immutable->pkey_tbl_len = attr.pkey_tbl_len;
-	immutable->gid_tbl_len = attr.gid_tbl_len;
-
-	return 0;
-}
-
-static void get_dev_fw_str(struct ib_device *dev, char *str)
-{
-	struct nes_ib_device *nesibdev =
-			container_of(dev, struct nes_ib_device, ibdev);
-	struct nes_vnic *nesvnic = nesibdev->nesvnic;
-
-	nes_debug(NES_DBG_INIT, "\n");
-	snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u",
-		 (nesvnic->nesdev->nesadapter->firmware_version >> 16),
-		 (nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff));
-}
-
-/**
- * nes_init_ofa_device
- */
-struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
-{
-	struct nes_ib_device *nesibdev;
-	struct nes_vnic *nesvnic = netdev_priv(netdev);
-	struct nes_device *nesdev = nesvnic->nesdev;
-
-	nesibdev = (struct nes_ib_device *)ib_alloc_device(sizeof(struct nes_ib_device));
-	if (nesibdev == NULL) {
-		return NULL;
-	}
-	strlcpy(nesibdev->ibdev.name, "nes%d", IB_DEVICE_NAME_MAX);
-	nesibdev->ibdev.owner = THIS_MODULE;
-
-	nesibdev->ibdev.node_type = RDMA_NODE_RNIC;
-	memset(&nesibdev->ibdev.node_guid, 0, sizeof(nesibdev->ibdev.node_guid));
-	memcpy(&nesibdev->ibdev.node_guid, netdev->dev_addr, 6);
-
-	nesibdev->ibdev.uverbs_cmd_mask =
-			(1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
-			(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
-			(1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
-			(1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
-			(1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
-			(1ull << IB_USER_VERBS_CMD_REG_MR) |
-			(1ull << IB_USER_VERBS_CMD_DEREG_MR) |
-			(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
-			(1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
-			(1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
-			(1ull << IB_USER_VERBS_CMD_CREATE_AH) |
-			(1ull << IB_USER_VERBS_CMD_DESTROY_AH) |
-			(1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
-			(1ull << IB_USER_VERBS_CMD_CREATE_QP) |
-			(1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
-			(1ull << IB_USER_VERBS_CMD_POLL_CQ) |
-			(1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
-			(1ull << IB_USER_VERBS_CMD_ALLOC_MW) |
-			(1ull << IB_USER_VERBS_CMD_BIND_MW) |
-			(1ull << IB_USER_VERBS_CMD_DEALLOC_MW) |
-			(1ull << IB_USER_VERBS_CMD_POST_RECV) |
-			(1ull << IB_USER_VERBS_CMD_POST_SEND);
-
-	nesibdev->ibdev.phys_port_cnt = 1;
-	nesibdev->ibdev.num_comp_vectors = 1;
-	nesibdev->ibdev.dev.parent = &nesdev->pcidev->dev;
-	nesibdev->ibdev.query_device = nes_query_device;
-	nesibdev->ibdev.query_port = nes_query_port;
-	nesibdev->ibdev.query_pkey = nes_query_pkey;
-	nesibdev->ibdev.query_gid = nes_query_gid;
-	nesibdev->ibdev.alloc_ucontext = nes_alloc_ucontext;
-	nesibdev->ibdev.dealloc_ucontext = nes_dealloc_ucontext;
-	nesibdev->ibdev.mmap = nes_mmap;
-	nesibdev->ibdev.alloc_pd = nes_alloc_pd;
-	nesibdev->ibdev.dealloc_pd = nes_dealloc_pd;
-	nesibdev->ibdev.create_qp = nes_create_qp;
-	nesibdev->ibdev.modify_qp = nes_modify_qp;
-	nesibdev->ibdev.query_qp = nes_query_qp;
-	nesibdev->ibdev.destroy_qp = nes_destroy_qp;
-	nesibdev->ibdev.create_cq = nes_create_cq;
-	nesibdev->ibdev.destroy_cq = nes_destroy_cq;
-	nesibdev->ibdev.poll_cq = nes_poll_cq;
-	nesibdev->ibdev.get_dma_mr = nes_get_dma_mr;
-	nesibdev->ibdev.reg_user_mr = nes_reg_user_mr;
-	nesibdev->ibdev.dereg_mr = nes_dereg_mr;
-	nesibdev->ibdev.alloc_mw = nes_alloc_mw;
-	nesibdev->ibdev.dealloc_mw = nes_dealloc_mw;
-
-	nesibdev->ibdev.alloc_mr = nes_alloc_mr;
-	nesibdev->ibdev.map_mr_sg = nes_map_mr_sg;
-
-	nesibdev->ibdev.req_notify_cq = nes_req_notify_cq;
-	nesibdev->ibdev.post_send = nes_post_send;
-	nesibdev->ibdev.post_recv = nes_post_recv;
-	nesibdev->ibdev.drain_sq = nes_drain_sq;
-	nesibdev->ibdev.drain_rq = nes_drain_rq;
-
-	nesibdev->ibdev.iwcm = kzalloc(sizeof(*nesibdev->ibdev.iwcm), GFP_KERNEL);
-	if (nesibdev->ibdev.iwcm == NULL) {
-		ib_dealloc_device(&nesibdev->ibdev);
-		return NULL;
-	}
-	nesibdev->ibdev.iwcm->add_ref = nes_add_ref;
-	nesibdev->ibdev.iwcm->rem_ref = nes_rem_ref;
-	nesibdev->ibdev.iwcm->get_qp = nes_get_qp;
-	nesibdev->ibdev.iwcm->connect = nes_connect;
-	nesibdev->ibdev.iwcm->accept = nes_accept;
-	nesibdev->ibdev.iwcm->reject = nes_reject;
-	nesibdev->ibdev.iwcm->create_listen = nes_create_listen;
-	nesibdev->ibdev.iwcm->destroy_listen = nes_destroy_listen;
-	nesibdev->ibdev.get_port_immutable   = nes_port_immutable;
-	nesibdev->ibdev.get_dev_fw_str   = get_dev_fw_str;
-	memcpy(nesibdev->ibdev.iwcm->ifname, netdev->name,
-	       sizeof(nesibdev->ibdev.iwcm->ifname));
-
-	return nesibdev;
-}
-
-
-/**
- * nes_handle_delayed_event
- */
-static void nes_handle_delayed_event(struct timer_list *t)
-{
-	struct nes_vnic *nesvnic = from_timer(nesvnic, t, event_timer);
-
-	if (nesvnic->delayed_event != nesvnic->last_dispatched_event) {
-		struct ib_event event;
-
-		event.device = &nesvnic->nesibdev->ibdev;
-		if (!event.device)
-			goto stop_timer;
-		event.event = nesvnic->delayed_event;
-		event.element.port_num = nesvnic->logical_port + 1;
-		ib_dispatch_event(&event);
-	}
-
-stop_timer:
-	nesvnic->event_timer.function = NULL;
-}
-
-
-void  nes_port_ibevent(struct nes_vnic *nesvnic)
-{
-	struct nes_ib_device *nesibdev = nesvnic->nesibdev;
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct ib_event event;
-	event.device = &nesibdev->ibdev;
-	event.element.port_num = nesvnic->logical_port + 1;
-	event.event = nesdev->iw_status ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
-
-	if (!nesvnic->event_timer.function) {
-		ib_dispatch_event(&event);
-		nesvnic->last_dispatched_event = event.event;
-		nesvnic->event_timer.function = nes_handle_delayed_event;
-		nesvnic->event_timer.expires = jiffies + NES_EVENT_DELAY;
-		add_timer(&nesvnic->event_timer);
-	} else {
-		mod_timer(&nesvnic->event_timer, jiffies + NES_EVENT_DELAY);
-	}
-	nesvnic->delayed_event = event.event;
-}
-
-
-/**
- * nes_destroy_ofa_device
- */
-void nes_destroy_ofa_device(struct nes_ib_device *nesibdev)
-{
-	if (nesibdev == NULL)
-		return;
-
-	nes_unregister_ofa_device(nesibdev);
-
-	kfree(nesibdev->ibdev.iwcm);
-	ib_dealloc_device(&nesibdev->ibdev);
-}
-
-
-/**
- * nes_register_ofa_device
- */
-int nes_register_ofa_device(struct nes_ib_device *nesibdev)
-{
-	struct nes_vnic *nesvnic = nesibdev->nesvnic;
-	struct nes_device *nesdev = nesvnic->nesdev;
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	int i, ret;
-
-	nesvnic->nesibdev->ibdev.driver_id = RDMA_DRIVER_NES;
-	ret = ib_register_device(&nesvnic->nesibdev->ibdev, NULL);
-	if (ret) {
-		return ret;
-	}
-
-	/* Get the resources allocated to this device */
-	nesibdev->max_cq = (nesadapter->max_cq-NES_FIRST_QPN) / nesadapter->port_count;
-	nesibdev->max_mr = nesadapter->max_mr / nesadapter->port_count;
-	nesibdev->max_qp = (nesadapter->max_qp-NES_FIRST_QPN) / nesadapter->port_count;
-	nesibdev->max_pd = nesadapter->max_pd / nesadapter->port_count;
-
-	for (i = 0; i < ARRAY_SIZE(nes_dev_attributes); ++i) {
-		ret = device_create_file(&nesibdev->ibdev.dev, nes_dev_attributes[i]);
-		if (ret) {
-			while (i > 0) {
-				i--;
-				device_remove_file(&nesibdev->ibdev.dev,
-						   nes_dev_attributes[i]);
-			}
-			ib_unregister_device(&nesibdev->ibdev);
-			return ret;
-		}
-	}
-
-	nesvnic->of_device_registered = 1;
-
-	return 0;
-}
-
-
-/**
- * nes_unregister_ofa_device
- */
-static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev)
-{
-	struct nes_vnic *nesvnic = nesibdev->nesvnic;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(nes_dev_attributes); ++i) {
-		device_remove_file(&nesibdev->ibdev.dev, nes_dev_attributes[i]);
-	}
-
-	if (nesvnic->of_device_registered) {
-		ib_unregister_device(&nesibdev->ibdev);
-	}
-
-	nesvnic->of_device_registered = 0;
-}
diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h
deleted file mode 100644
index e02a566..0000000
--- a/drivers/infiniband/hw/nes/nes_verbs.h
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#ifndef NES_VERBS_H
-#define NES_VERBS_H
-
-struct nes_device;
-
-#define NES_MAX_USER_DB_REGIONS  4096
-#define NES_MAX_USER_WQ_REGIONS  4096
-
-#define NES_TERM_SENT            0x01
-#define NES_TERM_RCVD            0x02
-#define NES_TERM_DONE            0x04
-
-struct nes_ucontext {
-	struct ib_ucontext ibucontext;
-	struct nes_device  *nesdev;
-	unsigned long      mmap_wq_offset;
-	unsigned long      mmap_cq_offset; /* to be removed */
-	int                index;		/* rnic index (minor) */
-	unsigned long      allocated_doorbells[BITS_TO_LONGS(NES_MAX_USER_DB_REGIONS)];
-	u16                mmap_db_index[NES_MAX_USER_DB_REGIONS];
-	u16                first_free_db;
-	unsigned long      allocated_wqs[BITS_TO_LONGS(NES_MAX_USER_WQ_REGIONS)];
-	struct nes_qp      *mmap_nesqp[NES_MAX_USER_WQ_REGIONS];
-	u16                first_free_wq;
-	struct list_head   cq_reg_mem_list;
-	struct list_head   qp_reg_mem_list;
-	u32                mcrqf;
-	atomic_t	   usecnt;
-};
-
-struct nes_pd {
-	struct ib_pd ibpd;
-	u16          pd_id;
-	atomic_t     sqp_count;
-	u16          mmap_db_index;
-};
-
-struct nes_mr {
-	union {
-		struct ib_mr  ibmr;
-		struct ib_mw  ibmw;
-		struct ib_fmr ibfmr;
-	};
-	struct ib_umem    *region;
-	u16               pbls_used;
-	u8                mode;
-	u8                pbl_4k;
-	__le64            *pages;
-	dma_addr_t        paddr;
-	u32               max_pages;
-	u32		  npages;
-};
-
-struct nes_hw_pb {
-	__le32 pa_low;
-	__le32 pa_high;
-};
-
-struct nes_vpbl {
-	dma_addr_t       pbl_pbase;
-	struct nes_hw_pb *pbl_vbase;
-};
-
-struct nes_root_vpbl {
-	dma_addr_t       pbl_pbase;
-	struct nes_hw_pb *pbl_vbase;
-	struct nes_vpbl  *leaf_vpbl;
-};
-
-struct nes_fmr {
-	struct nes_mr        nesmr;
-	u32                  leaf_pbl_cnt;
-	struct nes_root_vpbl root_vpbl;
-	struct ib_qp         *ib_qp;
-	int                  access_rights;
-	struct ib_fmr_attr   attr;
-};
-
-struct nes_av;
-
-struct nes_cq {
-	struct ib_cq     ibcq;
-	struct nes_hw_cq hw_cq;
-	u32              polled_completions;
-	u32              cq_mem_size;
-	spinlock_t       lock;
-	u8               virtual_cq;
-	u8               pad[3];
-	u32		 mcrqf;
-};
-
-struct nes_wq {
-	spinlock_t lock;
-};
-
-struct disconn_work {
-	struct work_struct    work;
-	struct nes_qp         *nesqp;
-};
-
-struct iw_cm_id;
-struct ietf_mpa_frame;
-
-struct nes_qp {
-	struct ib_qp          ibqp;
-	void                  *allocated_buffer;
-	struct iw_cm_id       *cm_id;
-	struct nes_cq         *nesscq;
-	struct nes_cq         *nesrcq;
-	struct nes_pd         *nespd;
-	void *cm_node; /* handle of the node this QP is associated with */
-	void                  *ietf_frame;
-	u8                    ietf_frame_size;
-	dma_addr_t            ietf_frame_pbase;
-	struct ib_mr          *lsmm_mr;
-	struct nes_hw_qp      hwqp;
-	struct work_struct    work;
-	enum ib_qp_state      ibqp_state;
-	u32                   iwarp_state;
-	u32                   hte_index;
-	u32                   last_aeq;
-	u32                   qp_mem_size;
-	atomic_t              refcount;
-	atomic_t              close_timer_started;
-	u32                   mmap_sq_db_index;
-	u32                   mmap_rq_db_index;
-	spinlock_t            lock;
-	spinlock_t            pau_lock;
-	struct nes_qp_context *nesqp_context;
-	dma_addr_t            nesqp_context_pbase;
-	void	              *pbl_vbase;
-	dma_addr_t            pbl_pbase;
-	struct page           *page;
-	struct timer_list     terminate_timer;
-	enum ib_event_type    terminate_eventtype;
-	struct sk_buff_head   pau_list;
-	u32                   pau_rcv_nxt;
-	u16                   active_conn:1;
-	u16                   skip_lsmm:1;
-	u16                   user_mode:1;
-	u16                   hte_added:1;
-	u16                   flush_issued:1;
-	u16                   destroyed:1;
-	u16                   sig_all:1;
-	u16                   pau_mode:1;
-	u16                   rsvd:8;
-	u16                   private_data_len;
-	u16                   term_sq_flush_code;
-	u16                   term_rq_flush_code;
-	u8                    hw_iwarp_state;
-	u8                    hw_tcp_state;
-	u8                    term_flags;
-	u8                    sq_kmapped;
-	u8                    pau_busy;
-	u8                    pau_pending;
-	u8                    pau_state;
-	__u64                 nesuqp_addr;
-	struct completion     sq_drained;
-	struct completion     rq_drained;
-};
-
-struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
-		u64 addr, u64 size, int acc, u64 *iova_start);
-
-#endif			/* NES_VERBS_H */
diff --git a/drivers/infiniband/hw/ocrdma/Kconfig b/drivers/infiniband/hw/ocrdma/Kconfig
index c0cddc0..dd4ec38 100644
--- a/drivers/infiniband/hw/ocrdma/Kconfig
+++ b/drivers/infiniband/hw/ocrdma/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_OCRDMA
 	tristate "Emulex One Connect HCA support"
 	depends on ETHERNET && NETDEVICES && PCI && INET && (IPV6 || IPV6=n)
diff --git a/drivers/infiniband/hw/ocrdma/Makefile b/drivers/infiniband/hw/ocrdma/Makefile
index d1bfd4f..14fba95 100644
--- a/drivers/infiniband/hw/ocrdma/Makefile
+++ b/drivers/infiniband/hw/ocrdma/Makefile
@@ -1,4 +1,5 @@
-ccflags-y := -Idrivers/net/ethernet/emulex/benet
+# SPDX-License-Identifier: GPL-2.0-only
+ccflags-y := -I $(srctree)/drivers/net/ethernet/emulex/benet
 
 obj-$(CONFIG_INFINIBAND_OCRDMA)	+= ocrdma.o
 
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
index 58188fe..8d3e36d 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
@@ -83,7 +83,6 @@
 	struct iphdr ipv4;
 	const struct ib_global_route *ib_grh;
 	union {
-		struct sockaddr     _sockaddr;
 		struct sockaddr_in  _sockaddr_in;
 		struct sockaddr_in6 _sockaddr_in6;
 	} sgid_addr, dgid_addr;
@@ -133,9 +132,9 @@
 		ipv4.tot_len = htons(0);
 		ipv4.ttl = ib_grh->hop_limit;
 		ipv4.protocol = nxthdr;
-		rdma_gid2ip(&sgid_addr._sockaddr, sgid);
+		rdma_gid2ip((struct sockaddr *)&sgid_addr, sgid);
 		ipv4.saddr = sgid_addr._sockaddr_in.sin_addr.s_addr;
-		rdma_gid2ip(&dgid_addr._sockaddr, &ib_grh->dgid);
+		rdma_gid2ip((struct sockaddr*)&dgid_addr, &ib_grh->dgid);
 		ipv4.daddr = dgid_addr._sockaddr_in.sin_addr.s_addr;
 		memcpy((u8 *)ah->av + eth_sz, &ipv4, sizeof(struct iphdr));
 	} else {
@@ -156,37 +155,34 @@
 	return status;
 }
 
-struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr,
-			       struct ib_udata *udata)
+int ocrdma_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr, u32 flags,
+		     struct ib_udata *udata)
 {
 	u32 *ahid_addr;
 	int status;
-	struct ocrdma_ah *ah;
+	struct ocrdma_ah *ah = get_ocrdma_ah(ibah);
 	bool isvlan = false;
 	u16 vlan_tag = 0xffff;
 	const struct ib_gid_attr *sgid_attr;
-	struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
-	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+	struct ocrdma_pd *pd = get_ocrdma_pd(ibah->pd);
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibah->device);
 
 	if ((attr->type != RDMA_AH_ATTR_TYPE_ROCE) ||
 	    !(rdma_ah_get_ah_flags(attr) & IB_AH_GRH))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (atomic_cmpxchg(&dev->update_sl, 1, 0))
 		ocrdma_init_service_level(dev);
 
-	ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
-	if (!ah)
-		return ERR_PTR(-ENOMEM);
+	sgid_attr = attr->grh.sgid_attr;
+	status = rdma_read_gid_l2_fields(sgid_attr, &vlan_tag, NULL);
+	if (status)
+		return status;
 
 	status = ocrdma_alloc_av(dev, ah);
 	if (status)
 		goto av_err;
 
-	sgid_attr = attr->grh.sgid_attr;
-	if (is_vlan_dev(sgid_attr->ndev))
-		vlan_tag = vlan_dev_vlan_id(sgid_attr->ndev);
-
 	/* Get network header type for this GID */
 	ah->hdr_type = rdma_gid_attr_network_type(sgid_attr);
 
@@ -210,23 +206,20 @@
 				       OCRDMA_AH_VLAN_VALID_SHIFT);
 	}
 
-	return &ah->ibah;
+	return 0;
 
 av_conf_err:
 	ocrdma_free_av(dev, ah);
 av_err:
-	kfree(ah);
-	return ERR_PTR(status);
+	return status;
 }
 
-int ocrdma_destroy_ah(struct ib_ah *ibah)
+void ocrdma_destroy_ah(struct ib_ah *ibah, u32 flags)
 {
 	struct ocrdma_ah *ah = get_ocrdma_ah(ibah);
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibah->device);
 
 	ocrdma_free_av(dev, ah);
-	kfree(ah);
-	return 0;
 }
 
 int ocrdma_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr)
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
index c0c32c9..64cb82c 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
@@ -51,9 +51,9 @@
 	OCRDMA_AH_L3_TYPE_SHIFT		= 0x1D /* 29 bits */
 };
 
-struct ib_ah *ocrdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
-			       struct ib_udata *udata);
-int ocrdma_destroy_ah(struct ib_ah *ah);
+int ocrdma_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags,
+		     struct ib_udata *udata);
+void ocrdma_destroy_ah(struct ib_ah *ah, u32 flags);
 int ocrdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
 
 int ocrdma_process_mad(struct ib_device *,
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
index e578281..d82d3ec 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
@@ -380,8 +380,8 @@
 	q->len = len;
 	q->entry_size = entry_size;
 	q->size = len * entry_size;
-	q->va = dma_zalloc_coherent(&dev->nic_info.pdev->dev, q->size,
-				    &q->dma, GFP_KERNEL);
+	q->va = dma_alloc_coherent(&dev->nic_info.pdev->dev, q->size, &q->dma,
+				   GFP_KERNEL);
 	if (!q->va)
 		return -ENOMEM;
 	return 0;
@@ -792,7 +792,7 @@
 						     qp->srq->ibsrq.
 						     srq_context);
 	} else if (dev_event) {
-		pr_err("%s: Fatal event received\n", dev->ibdev.name);
+		dev_err(&dev->ibdev.dev, "Fatal event received\n");
 		ib_dispatch_event(&ib_evt);
 	}
 
@@ -1351,7 +1351,6 @@
 	mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dma.pa);
 	mqe->u.nonemb_req.sge[0].len = dma.size;
 
-	memset(dma.va, 0, dma.size);
 	ocrdma_init_mch((struct ocrdma_mbx_hdr *)dma.va,
 			OCRDMA_CMD_GET_CTRL_ATTRIBUTES,
 			OCRDMA_SUBSYS_COMMON,
@@ -1690,7 +1689,6 @@
 		goto mem_err_ah;
 	dev->av_tbl.pa = pa;
 	dev->av_tbl.num_ah = max_ah;
-	memset(dev->av_tbl.va, 0, dev->av_tbl.size);
 
 	pbes = (struct ocrdma_pbe *)dev->av_tbl.pbl.va;
 	for (i = 0; i < dev->av_tbl.size / OCRDMA_MIN_Q_PAGE_SIZE; i++) {
@@ -1819,7 +1817,7 @@
 		return -ENOMEM;
 	ocrdma_init_mch(&cmd->cmd.req, OCRDMA_CMD_CREATE_CQ,
 			OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
-	cq->va = dma_zalloc_coherent(&pdev->dev, cq->len, &cq->pa, GFP_KERNEL);
+	cq->va = dma_alloc_coherent(&pdev->dev, cq->len, &cq->pa, GFP_KERNEL);
 	if (!cq->va) {
 		status = -ENOMEM;
 		goto mem_err;
@@ -1888,14 +1886,13 @@
 	return status;
 }
 
-int ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq)
+void ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq)
 {
-	int status = -ENOMEM;
 	struct ocrdma_destroy_cq *cmd;
 
 	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_CQ, sizeof(*cmd));
 	if (!cmd)
-		return status;
+		return;
 	ocrdma_init_mch(&cmd->req, OCRDMA_CMD_DELETE_CQ,
 			OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
 
@@ -1903,11 +1900,10 @@
 	    (cq->id << OCRDMA_DESTROY_CQ_QID_SHIFT) &
 	    OCRDMA_DESTROY_CQ_QID_MASK;
 
-	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
 	ocrdma_unbind_eq(dev, cq->eqn);
 	dma_free_coherent(&dev->nic_info.pdev->dev, cq->len, cq->va, cq->pa);
 	kfree(cmd);
-	return status;
 }
 
 int ocrdma_mbx_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_hw_mr *hwmr,
@@ -2209,7 +2205,7 @@
 	qp->sq.max_cnt = max_wqe_allocated;
 	len = (hw_pages * hw_page_size);
 
-	qp->sq.va = dma_zalloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL);
+	qp->sq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL);
 	if (!qp->sq.va)
 		return -EINVAL;
 	qp->sq.len = len;
@@ -2259,7 +2255,7 @@
 	qp->rq.max_cnt = max_rqe_allocated;
 	len = (hw_pages * hw_page_size);
 
-	qp->rq.va = dma_zalloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL);
+	qp->rq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL);
 	if (!qp->rq.va)
 		return -ENOMEM;
 	qp->rq.pa = pa;
@@ -2315,8 +2311,8 @@
 	if (dev->attr.ird == 0)
 		return 0;
 
-	qp->ird_q_va = dma_zalloc_coherent(&pdev->dev, ird_q_len, &pa,
-					   GFP_KERNEL);
+	qp->ird_q_va = dma_alloc_coherent(&pdev->dev, ird_q_len, &pa,
+					  GFP_KERNEL);
 	if (!qp->ird_q_va)
 		return -ENOMEM;
 	ocrdma_build_q_pages(&cmd->ird_addr[0], dev->attr.num_ird_pages,
@@ -2496,10 +2492,9 @@
 	int status;
 	struct rdma_ah_attr *ah_attr = &attrs->ah_attr;
 	const struct ib_gid_attr *sgid_attr;
-	u32 vlan_id = 0xFFFF;
+	u16 vlan_id = 0xFFFF;
 	u8 mac_addr[6], hdr_type;
 	union {
-		struct sockaddr     _sockaddr;
 		struct sockaddr_in  _sockaddr_in;
 		struct sockaddr_in6 _sockaddr_in6;
 	} sgid_addr, dgid_addr;
@@ -2526,8 +2521,9 @@
 	       sizeof(cmd->params.dgid));
 
 	sgid_attr = ah_attr->grh.sgid_attr;
-	vlan_id = rdma_vlan_dev_vlan_id(sgid_attr->ndev);
-	memcpy(mac_addr, sgid_attr->ndev->dev_addr, ETH_ALEN);
+	status = rdma_read_gid_l2_fields(sgid_attr, &vlan_id, &mac_addr[0]);
+	if (status)
+		return status;
 
 	qp->sgid_idx = grh->sgid_index;
 	memcpy(&cmd->params.sgid[0], &sgid_attr->gid.raw[0],
@@ -2541,8 +2537,8 @@
 
 	hdr_type = rdma_gid_attr_network_type(sgid_attr);
 	if (hdr_type == RDMA_NETWORK_IPV4) {
-		rdma_gid2ip(&sgid_addr._sockaddr, &sgid_attr->gid);
-		rdma_gid2ip(&dgid_addr._sockaddr, &grh->dgid);
+		rdma_gid2ip((struct sockaddr *)&sgid_addr, &sgid_attr->gid);
+		rdma_gid2ip((struct sockaddr *)&dgid_addr, &grh->dgid);
 		memcpy(&cmd->params.dgid[0],
 		       &dgid_addr._sockaddr_in.sin_addr.s_addr, 4);
 		memcpy(&cmd->params.sgid[0],
@@ -2863,21 +2859,19 @@
 	return status;
 }
 
-int ocrdma_mbx_destroy_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq)
+void ocrdma_mbx_destroy_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq)
 {
-	int status = -ENOMEM;
 	struct ocrdma_destroy_srq *cmd;
 	struct pci_dev *pdev = dev->nic_info.pdev;
 	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_SRQ, sizeof(*cmd));
 	if (!cmd)
-		return status;
+		return;
 	cmd->id = srq->id;
-	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
 	if (srq->rq.va)
 		dma_free_coherent(&pdev->dev, srq->rq.len,
 				  srq->rq.va, srq->rq.pa);
 	kfree(cmd);
-	return status;
 }
 
 static int ocrdma_mbx_get_dcbx_config(struct ocrdma_dev *dev, u32 ptype,
@@ -2907,7 +2901,6 @@
 	mqe_sge->pa_hi = (u32) upper_32_bits(pa);
 	mqe_sge->len = cmd.hdr.pyld_len;
 
-	memset(req, 0, sizeof(struct ocrdma_get_dcbx_cfg_req));
 	ocrdma_init_mch(&req->hdr, OCRDMA_CMD_GET_DCBX_CONFIG,
 			OCRDMA_SUBSYS_DCBX, cmd.hdr.pyld_len);
 	req->param_type = ptype;
@@ -3067,13 +3060,12 @@
 	return status;
 }
 
-int ocrdma_free_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah)
+void ocrdma_free_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&dev->av_tbl.lock, flags);
 	ah->av->valid = 0;
 	spin_unlock_irqrestore(&dev->av_tbl.lock, flags);
-	return 0;
 }
 
 static int ocrdma_create_eqs(struct ocrdma_dev *dev)
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h
index ebc1f44..12c23a7 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h
@@ -122,7 +122,7 @@
 			u32 pd_id, int acc);
 int ocrdma_mbx_create_cq(struct ocrdma_dev *, struct ocrdma_cq *,
 				int entries, int dpp_cq, u16 pd_id);
-int ocrdma_mbx_destroy_cq(struct ocrdma_dev *, struct ocrdma_cq *);
+void ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq);
 
 int ocrdma_mbx_create_qp(struct ocrdma_qp *, struct ib_qp_init_attr *attrs,
 			 u8 enable_dpp_cq, u16 dpp_cq_id, u16 *dpp_offset,
@@ -137,10 +137,10 @@
 			  struct ocrdma_pd *);
 int ocrdma_mbx_modify_srq(struct ocrdma_srq *, struct ib_srq_attr *);
 int ocrdma_mbx_query_srq(struct ocrdma_srq *, struct ib_srq_attr *);
-int ocrdma_mbx_destroy_srq(struct ocrdma_dev *, struct ocrdma_srq *);
+void ocrdma_mbx_destroy_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq);
 
-int ocrdma_alloc_av(struct ocrdma_dev *, struct ocrdma_ah *);
-int ocrdma_free_av(struct ocrdma_dev *, struct ocrdma_ah *);
+int ocrdma_alloc_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah);
+void ocrdma_free_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah);
 
 int ocrdma_qp_state_change(struct ocrdma_qp *, enum ib_qp_state new_state,
 			    enum ib_qp_state *old_ib_state);
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index 7832ee3..c15cfc6 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -62,8 +62,6 @@
 MODULE_AUTHOR("Emulex Corporation");
 MODULE_LICENSE("Dual BSD/GPL");
 
-static DEFINE_IDR(ocrdma_dev_id);
-
 void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid)
 {
 	u8 mac_addr[6];
@@ -114,15 +112,99 @@
 	snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", &dev->attr.fw_ver[0]);
 }
 
+/* OCRDMA sysfs interface */
+static ssize_t hw_rev_show(struct device *device,
+			   struct device_attribute *attr, char *buf)
+{
+	struct ocrdma_dev *dev =
+		rdma_device_to_drv_device(device, struct ocrdma_dev, ibdev);
+
+	return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor);
+}
+static DEVICE_ATTR_RO(hw_rev);
+
+static ssize_t hca_type_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
+{
+	struct ocrdma_dev *dev =
+		rdma_device_to_drv_device(device, struct ocrdma_dev, ibdev);
+
+	return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->model_number[0]);
+}
+static DEVICE_ATTR_RO(hca_type);
+
+static struct attribute *ocrdma_attributes[] = {
+	&dev_attr_hw_rev.attr,
+	&dev_attr_hca_type.attr,
+	NULL
+};
+
+static const struct attribute_group ocrdma_attr_group = {
+	.attrs = ocrdma_attributes,
+};
+
+static const struct ib_device_ops ocrdma_dev_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_OCRDMA,
+	.uverbs_abi_ver = OCRDMA_ABI_VERSION,
+
+	.alloc_mr = ocrdma_alloc_mr,
+	.alloc_pd = ocrdma_alloc_pd,
+	.alloc_ucontext = ocrdma_alloc_ucontext,
+	.create_ah = ocrdma_create_ah,
+	.create_cq = ocrdma_create_cq,
+	.create_qp = ocrdma_create_qp,
+	.dealloc_pd = ocrdma_dealloc_pd,
+	.dealloc_ucontext = ocrdma_dealloc_ucontext,
+	.dereg_mr = ocrdma_dereg_mr,
+	.destroy_ah = ocrdma_destroy_ah,
+	.destroy_cq = ocrdma_destroy_cq,
+	.destroy_qp = ocrdma_destroy_qp,
+	.get_dev_fw_str = get_dev_fw_str,
+	.get_dma_mr = ocrdma_get_dma_mr,
+	.get_link_layer = ocrdma_link_layer,
+	.get_port_immutable = ocrdma_port_immutable,
+	.map_mr_sg = ocrdma_map_mr_sg,
+	.mmap = ocrdma_mmap,
+	.modify_port = ocrdma_modify_port,
+	.modify_qp = ocrdma_modify_qp,
+	.poll_cq = ocrdma_poll_cq,
+	.post_recv = ocrdma_post_recv,
+	.post_send = ocrdma_post_send,
+	.process_mad = ocrdma_process_mad,
+	.query_ah = ocrdma_query_ah,
+	.query_device = ocrdma_query_device,
+	.query_pkey = ocrdma_query_pkey,
+	.query_port = ocrdma_query_port,
+	.query_qp = ocrdma_query_qp,
+	.reg_user_mr = ocrdma_reg_user_mr,
+	.req_notify_cq = ocrdma_arm_cq,
+	.resize_cq = ocrdma_resize_cq,
+
+	INIT_RDMA_OBJ_SIZE(ib_ah, ocrdma_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, ocrdma_cq, ibcq),
+	INIT_RDMA_OBJ_SIZE(ib_pd, ocrdma_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, ocrdma_ucontext, ibucontext),
+};
+
+static const struct ib_device_ops ocrdma_dev_srq_ops = {
+	.create_srq = ocrdma_create_srq,
+	.destroy_srq = ocrdma_destroy_srq,
+	.modify_srq = ocrdma_modify_srq,
+	.post_srq_recv = ocrdma_post_srq_recv,
+	.query_srq = ocrdma_query_srq,
+
+	INIT_RDMA_OBJ_SIZE(ib_srq, ocrdma_srq, ibsrq),
+};
+
 static int ocrdma_register_device(struct ocrdma_dev *dev)
 {
-	strlcpy(dev->ibdev.name, "ocrdma%d", IB_DEVICE_NAME_MAX);
+	int ret;
+
 	ocrdma_get_guid(dev, (u8 *)&dev->ibdev.node_guid);
 	BUILD_BUG_ON(sizeof(OCRDMA_NODE_DESC) > IB_DEVICE_NODE_DESC_MAX);
 	memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC,
 	       sizeof(OCRDMA_NODE_DESC));
-	dev->ibdev.owner = THIS_MODULE;
-	dev->ibdev.uverbs_abi_ver = OCRDMA_ABI_VERSION;
 	dev->ibdev.uverbs_cmd_mask =
 	    OCRDMA_UVERBS(GET_CONTEXT) |
 	    OCRDMA_UVERBS(QUERY_DEVICE) |
@@ -154,50 +236,10 @@
 	dev->ibdev.phys_port_cnt = 1;
 	dev->ibdev.num_comp_vectors = dev->eq_cnt;
 
-	/* mandatory verbs. */
-	dev->ibdev.query_device = ocrdma_query_device;
-	dev->ibdev.query_port = ocrdma_query_port;
-	dev->ibdev.modify_port = ocrdma_modify_port;
-	dev->ibdev.get_netdev = ocrdma_get_netdev;
-	dev->ibdev.get_link_layer = ocrdma_link_layer;
-	dev->ibdev.alloc_pd = ocrdma_alloc_pd;
-	dev->ibdev.dealloc_pd = ocrdma_dealloc_pd;
-
-	dev->ibdev.create_cq = ocrdma_create_cq;
-	dev->ibdev.destroy_cq = ocrdma_destroy_cq;
-	dev->ibdev.resize_cq = ocrdma_resize_cq;
-
-	dev->ibdev.create_qp = ocrdma_create_qp;
-	dev->ibdev.modify_qp = ocrdma_modify_qp;
-	dev->ibdev.query_qp = ocrdma_query_qp;
-	dev->ibdev.destroy_qp = ocrdma_destroy_qp;
-
-	dev->ibdev.query_pkey = ocrdma_query_pkey;
-	dev->ibdev.create_ah = ocrdma_create_ah;
-	dev->ibdev.destroy_ah = ocrdma_destroy_ah;
-	dev->ibdev.query_ah = ocrdma_query_ah;
-
-	dev->ibdev.poll_cq = ocrdma_poll_cq;
-	dev->ibdev.post_send = ocrdma_post_send;
-	dev->ibdev.post_recv = ocrdma_post_recv;
-	dev->ibdev.req_notify_cq = ocrdma_arm_cq;
-
-	dev->ibdev.get_dma_mr = ocrdma_get_dma_mr;
-	dev->ibdev.dereg_mr = ocrdma_dereg_mr;
-	dev->ibdev.reg_user_mr = ocrdma_reg_user_mr;
-
-	dev->ibdev.alloc_mr = ocrdma_alloc_mr;
-	dev->ibdev.map_mr_sg = ocrdma_map_mr_sg;
-
 	/* mandatory to support user space verbs consumer. */
-	dev->ibdev.alloc_ucontext = ocrdma_alloc_ucontext;
-	dev->ibdev.dealloc_ucontext = ocrdma_dealloc_ucontext;
-	dev->ibdev.mmap = ocrdma_mmap;
 	dev->ibdev.dev.parent = &dev->nic_info.pdev->dev;
 
-	dev->ibdev.process_mad = ocrdma_process_mad;
-	dev->ibdev.get_port_immutable = ocrdma_port_immutable;
-	dev->ibdev.get_dev_fw_str     = get_dev_fw_str;
+	ib_set_device_ops(&dev->ibdev, &ocrdma_dev_ops);
 
 	if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {
 		dev->ibdev.uverbs_cmd_mask |=
@@ -207,14 +249,14 @@
 		     OCRDMA_UVERBS(DESTROY_SRQ) |
 		     OCRDMA_UVERBS(POST_SRQ_RECV);
 
-		dev->ibdev.create_srq = ocrdma_create_srq;
-		dev->ibdev.modify_srq = ocrdma_modify_srq;
-		dev->ibdev.query_srq = ocrdma_query_srq;
-		dev->ibdev.destroy_srq = ocrdma_destroy_srq;
-		dev->ibdev.post_srq_recv = ocrdma_post_srq_recv;
+		ib_set_device_ops(&dev->ibdev, &ocrdma_dev_srq_ops);
 	}
-	dev->ibdev.driver_id = RDMA_DRIVER_OCRDMA;
-	return ib_register_device(&dev->ibdev, NULL);
+	rdma_set_device_sysfs_group(&dev->ibdev, &ocrdma_attr_group);
+	ret = ib_device_set_netdev(&dev->ibdev, dev->nic_info.netdev, 1);
+	if (ret)
+		return ret;
+
+	return ib_register_device(&dev->ibdev, "ocrdma%d");
 }
 
 static int ocrdma_alloc_resources(struct ocrdma_dev *dev)
@@ -260,59 +302,24 @@
 	kfree(dev->cq_tbl);
 }
 
-/* OCRDMA sysfs interface */
-static ssize_t show_rev(struct device *device, struct device_attribute *attr,
-			char *buf)
-{
-	struct ocrdma_dev *dev = dev_get_drvdata(device);
-
-	return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor);
-}
-
-static ssize_t show_hca_type(struct device *device,
-			     struct device_attribute *attr, char *buf)
-{
-	struct ocrdma_dev *dev = dev_get_drvdata(device);
-
-	return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->model_number[0]);
-}
-
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL);
-
-static struct device_attribute *ocrdma_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_hca_type
-};
-
-static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++)
-		device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]);
-}
-
 static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)
 {
-	int status = 0, i;
+	int status = 0;
 	u8 lstate = 0;
 	struct ocrdma_dev *dev;
 
-	dev = (struct ocrdma_dev *)ib_alloc_device(sizeof(struct ocrdma_dev));
+	dev = ib_alloc_device(ocrdma_dev, ibdev);
 	if (!dev) {
 		pr_err("Unable to allocate ib device\n");
 		return NULL;
 	}
+
 	dev->mbx_cmd = kzalloc(sizeof(struct ocrdma_mqe_emb_cmd), GFP_KERNEL);
 	if (!dev->mbx_cmd)
-		goto idr_err;
+		goto init_err;
 
 	memcpy(&dev->nic_info, dev_info, sizeof(*dev_info));
-	dev->id = idr_alloc(&ocrdma_dev_id, NULL, 0, 0, GFP_KERNEL);
-	if (dev->id < 0)
-		goto idr_err;
-
+	dev->id = PCI_FUNC(dev->nic_info.pdev->devfn);
 	status = ocrdma_init_hw(dev);
 	if (status)
 		goto init_err;
@@ -331,9 +338,6 @@
 	if (!status)
 		ocrdma_update_link_state(dev, lstate);
 
-	for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++)
-		if (device_create_file(&dev->ibdev.dev, ocrdma_attributes[i]))
-			goto sysfs_err;
 	/* Init stats */
 	ocrdma_add_port_stats(dev);
 	/* Interrupt Moderation */
@@ -348,14 +352,10 @@
 		dev_name(&dev->nic_info.pdev->dev), dev->id);
 	return dev;
 
-sysfs_err:
-	ocrdma_remove_sysfiles(dev);
 alloc_err:
 	ocrdma_free_resources(dev);
 	ocrdma_cleanup_hw(dev);
 init_err:
-	idr_remove(&ocrdma_dev_id, dev->id);
-idr_err:
 	kfree(dev->mbx_cmd);
 	ib_dealloc_device(&dev->ibdev);
 	pr_err("%s() leaving. ret=%d\n", __func__, status);
@@ -365,7 +365,6 @@
 static void ocrdma_remove_free(struct ocrdma_dev *dev)
 {
 
-	idr_remove(&ocrdma_dev_id, dev->id);
 	kfree(dev->mbx_cmd);
 	ib_dealloc_device(&dev->ibdev);
 }
@@ -376,7 +375,6 @@
 	 * of the registered clients.
 	 */
 	cancel_delayed_work_sync(&dev->eqd_work);
-	ocrdma_remove_sysfiles(dev);
 	ib_unregister_device(&dev->ibdev);
 
 	ocrdma_rem_port_stats(dev);
@@ -471,7 +469,6 @@
 {
 	be_roce_unregister_driver(&ocrdma_drv);
 	ocrdma_rem_debugfs();
-	idr_destroy(&ocrdma_dev_id);
 }
 
 module_init(ocrdma_init_module);
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c
index 24d20a4..a902942 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c
@@ -73,8 +73,8 @@
 	mem->size = max_t(u32, sizeof(struct ocrdma_rdma_stats_req),
 			sizeof(struct ocrdma_rdma_stats_resp));
 
-	mem->va = dma_zalloc_coherent(&dev->nic_info.pdev->dev, mem->size,
-				      &mem->pa, GFP_KERNEL);
+	mem->va = dma_alloc_coherent(&dev->nic_info.pdev->dev, mem->size,
+				     &mem->pa, GFP_KERNEL);
 	if (!mem->va) {
 		pr_err("%s: stats mbox allocation failed\n", __func__);
 		return false;
@@ -760,93 +760,72 @@
 
 void ocrdma_add_port_stats(struct ocrdma_dev *dev)
 {
+	const struct pci_dev *pdev = dev->nic_info.pdev;
+
 	if (!ocrdma_dbgfs_dir)
 		return;
 
 	/* Create post stats base dir */
-	dev->dir = debugfs_create_dir(dev->ibdev.name, ocrdma_dbgfs_dir);
-	if (!dev->dir)
-		goto err;
+	dev->dir = debugfs_create_dir(pci_name(pdev), ocrdma_dbgfs_dir);
 
 	dev->rsrc_stats.type = OCRDMA_RSRC_STATS;
 	dev->rsrc_stats.dev = dev;
-	if (!debugfs_create_file("resource_stats", S_IRUSR, dev->dir,
-				 &dev->rsrc_stats, &ocrdma_dbg_ops))
-		goto err;
+	debugfs_create_file("resource_stats", S_IRUSR, dev->dir,
+			    &dev->rsrc_stats, &ocrdma_dbg_ops);
 
 	dev->rx_stats.type = OCRDMA_RXSTATS;
 	dev->rx_stats.dev = dev;
-	if (!debugfs_create_file("rx_stats", S_IRUSR, dev->dir,
-				 &dev->rx_stats, &ocrdma_dbg_ops))
-		goto err;
+	debugfs_create_file("rx_stats", S_IRUSR, dev->dir, &dev->rx_stats,
+			    &ocrdma_dbg_ops);
 
 	dev->wqe_stats.type = OCRDMA_WQESTATS;
 	dev->wqe_stats.dev = dev;
-	if (!debugfs_create_file("wqe_stats", S_IRUSR, dev->dir,
-				 &dev->wqe_stats, &ocrdma_dbg_ops))
-		goto err;
+	debugfs_create_file("wqe_stats", S_IRUSR, dev->dir, &dev->wqe_stats,
+			    &ocrdma_dbg_ops);
 
 	dev->tx_stats.type = OCRDMA_TXSTATS;
 	dev->tx_stats.dev = dev;
-	if (!debugfs_create_file("tx_stats", S_IRUSR, dev->dir,
-				 &dev->tx_stats, &ocrdma_dbg_ops))
-		goto err;
+	debugfs_create_file("tx_stats", S_IRUSR, dev->dir, &dev->tx_stats,
+			    &ocrdma_dbg_ops);
 
 	dev->db_err_stats.type = OCRDMA_DB_ERRSTATS;
 	dev->db_err_stats.dev = dev;
-	if (!debugfs_create_file("db_err_stats", S_IRUSR, dev->dir,
-				 &dev->db_err_stats, &ocrdma_dbg_ops))
-		goto err;
-
+	debugfs_create_file("db_err_stats", S_IRUSR, dev->dir,
+			    &dev->db_err_stats, &ocrdma_dbg_ops);
 
 	dev->tx_qp_err_stats.type = OCRDMA_TXQP_ERRSTATS;
 	dev->tx_qp_err_stats.dev = dev;
-	if (!debugfs_create_file("tx_qp_err_stats", S_IRUSR, dev->dir,
-				 &dev->tx_qp_err_stats, &ocrdma_dbg_ops))
-		goto err;
+	debugfs_create_file("tx_qp_err_stats", S_IRUSR, dev->dir,
+			    &dev->tx_qp_err_stats, &ocrdma_dbg_ops);
 
 	dev->rx_qp_err_stats.type = OCRDMA_RXQP_ERRSTATS;
 	dev->rx_qp_err_stats.dev = dev;
-	if (!debugfs_create_file("rx_qp_err_stats", S_IRUSR, dev->dir,
-				 &dev->rx_qp_err_stats, &ocrdma_dbg_ops))
-		goto err;
-
+	debugfs_create_file("rx_qp_err_stats", S_IRUSR, dev->dir,
+			    &dev->rx_qp_err_stats, &ocrdma_dbg_ops);
 
 	dev->tx_dbg_stats.type = OCRDMA_TX_DBG_STATS;
 	dev->tx_dbg_stats.dev = dev;
-	if (!debugfs_create_file("tx_dbg_stats", S_IRUSR, dev->dir,
-				 &dev->tx_dbg_stats, &ocrdma_dbg_ops))
-		goto err;
+	debugfs_create_file("tx_dbg_stats", S_IRUSR, dev->dir,
+			    &dev->tx_dbg_stats, &ocrdma_dbg_ops);
 
 	dev->rx_dbg_stats.type = OCRDMA_RX_DBG_STATS;
 	dev->rx_dbg_stats.dev = dev;
-	if (!debugfs_create_file("rx_dbg_stats", S_IRUSR, dev->dir,
-				 &dev->rx_dbg_stats, &ocrdma_dbg_ops))
-		goto err;
+	debugfs_create_file("rx_dbg_stats", S_IRUSR, dev->dir,
+			    &dev->rx_dbg_stats, &ocrdma_dbg_ops);
 
 	dev->driver_stats.type = OCRDMA_DRV_STATS;
 	dev->driver_stats.dev = dev;
-	if (!debugfs_create_file("driver_dbg_stats", S_IRUSR, dev->dir,
-					&dev->driver_stats, &ocrdma_dbg_ops))
-		goto err;
+	debugfs_create_file("driver_dbg_stats", S_IRUSR, dev->dir,
+			    &dev->driver_stats, &ocrdma_dbg_ops);
 
 	dev->reset_stats.type = OCRDMA_RESET_STATS;
 	dev->reset_stats.dev = dev;
-	if (!debugfs_create_file("reset_stats", 0200, dev->dir,
-				&dev->reset_stats, &ocrdma_dbg_ops))
-		goto err;
-
-
-	return;
-err:
-	debugfs_remove_recursive(dev->dir);
-	dev->dir = NULL;
+	debugfs_create_file("reset_stats", 0200, dev->dir, &dev->reset_stats,
+			    &ocrdma_dbg_ops);
 }
 
 void ocrdma_rem_port_stats(struct ocrdma_dev *dev)
 {
-	if (!dev->dir)
-		return;
 	debugfs_remove_recursive(dev->dir);
 }
 
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index c158ca9..e8267e5 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -47,6 +47,7 @@
 #include <rdma/ib_umem.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_cache.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "ocrdma.h"
 #include "ocrdma_hw.h"
@@ -55,7 +56,7 @@
 
 int ocrdma_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
 {
-	if (index > 1)
+	if (index > 0)
 		return -EINVAL;
 
 	*pkey = 0xffff;
@@ -112,24 +113,6 @@
 	return 0;
 }
 
-struct net_device *ocrdma_get_netdev(struct ib_device *ibdev, u8 port_num)
-{
-	struct ocrdma_dev *dev;
-	struct net_device *ndev = NULL;
-
-	rcu_read_lock();
-
-	dev = get_ocrdma_dev(ibdev);
-	if (dev)
-		ndev = dev->nic_info.netdev;
-	if (ndev)
-		dev_hold(ndev);
-
-	rcu_read_unlock();
-
-	return ndev;
-}
-
 static inline void get_link_speed_and_width(struct ocrdma_dev *dev,
 					    u8 *ib_speed, u8 *ib_width)
 {
@@ -177,18 +160,13 @@
 
 	/* props being zeroed by the caller, avoid zeroing it here */
 	dev = get_ocrdma_dev(ibdev);
-	if (port > 1) {
-		pr_err("%s(%d) invalid_port=0x%x\n", __func__,
-		       dev->id, port);
-		return -EINVAL;
-	}
 	netdev = dev->nic_info.netdev;
 	if (netif_running(netdev) && netif_oper_up(netdev)) {
 		port_state = IB_PORT_ACTIVE;
-		props->phys_state = 5;
+		props->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
 	} else {
 		port_state = IB_PORT_DOWN;
-		props->phys_state = 3;
+		props->phys_state = IB_PORT_PHYS_STATE_DISABLED;
 	}
 	props->max_mtu = IB_MTU_4096;
 	props->active_mtu = iboe_get_mtu(netdev->mtu);
@@ -215,13 +193,6 @@
 int ocrdma_modify_port(struct ib_device *ibdev, u8 port, int mask,
 		       struct ib_port_modify *props)
 {
-	struct ocrdma_dev *dev;
-
-	dev = get_ocrdma_dev(ibdev);
-	if (port > 1) {
-		pr_err("%s(%d) invalid_port=0x%x\n", __func__, dev->id, port);
-		return -EINVAL;
-	}
 	return 0;
 }
 
@@ -379,17 +350,22 @@
 	return status;
 }
 
-static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev,
-					  struct ocrdma_ucontext *uctx,
-					  struct ib_udata *udata)
+/*
+ * NOTE:
+ *
+ * ocrdma_ucontext must be used here because this function is also
+ * called from ocrdma_alloc_ucontext, where ib_udata does not carry a
+ * valid ib_ucontext pointer. ib_uverbs_get_context does not call the
+ * uobj_{alloc|get_xxx} helpers that store the ib_ucontext in the
+ * uverbs_attr_bundle wrapping the ib_udata, so ib_udata does NOT
+ * imply a valid ib_ucontext here!
+ */
+static int _ocrdma_alloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd,
+			    struct ocrdma_ucontext *uctx,
+			    struct ib_udata *udata)
 {
-	struct ocrdma_pd *pd = NULL;
 	int status;
 
-	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd)
-		return ERR_PTR(-ENOMEM);
-
 	if (udata && uctx && dev->attr.max_dpp_pds) {
 		pd->dpp_enabled =
 			ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R;
@@ -398,15 +374,8 @@
 					   dev->attr.wqe_size) : 0;
 	}
 
-	if (dev->pd_mgr->pd_prealloc_valid) {
-		status = ocrdma_get_pd_num(dev, pd);
-		if (status == 0) {
-			return pd;
-		} else {
-			kfree(pd);
-			return ERR_PTR(status);
-		}
-	}
+	if (dev->pd_mgr->pd_prealloc_valid)
+		return ocrdma_get_pd_num(dev, pd);
 
 retry:
 	status = ocrdma_mbx_alloc_pd(dev, pd);
@@ -415,13 +384,11 @@
 			pd->dpp_enabled = false;
 			pd->num_dpp_qp = 0;
 			goto retry;
-		} else {
-			kfree(pd);
-			return ERR_PTR(status);
 		}
+		return status;
 	}
 
-	return pd;
+	return 0;
 }
 
 static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx,
@@ -430,30 +397,33 @@
 	return (uctx->cntxt_pd == pd);
 }
 
-static int _ocrdma_dealloc_pd(struct ocrdma_dev *dev,
+static void _ocrdma_dealloc_pd(struct ocrdma_dev *dev,
 			      struct ocrdma_pd *pd)
 {
-	int status;
-
 	if (dev->pd_mgr->pd_prealloc_valid)
-		status = ocrdma_put_pd_num(dev, pd->id, pd->dpp_enabled);
+		ocrdma_put_pd_num(dev, pd->id, pd->dpp_enabled);
 	else
-		status = ocrdma_mbx_dealloc_pd(dev, pd);
-
-	kfree(pd);
-	return status;
+		ocrdma_mbx_dealloc_pd(dev, pd);
 }
 
 static int ocrdma_alloc_ucontext_pd(struct ocrdma_dev *dev,
 				    struct ocrdma_ucontext *uctx,
 				    struct ib_udata *udata)
 {
-	int status = 0;
+	struct ib_device *ibdev = &dev->ibdev;
+	struct ib_pd *pd;
+	int status;
 
-	uctx->cntxt_pd = _ocrdma_alloc_pd(dev, uctx, udata);
-	if (IS_ERR(uctx->cntxt_pd)) {
-		status = PTR_ERR(uctx->cntxt_pd);
-		uctx->cntxt_pd = NULL;
+	pd = rdma_zalloc_drv_obj(ibdev, ib_pd);
+	if (!pd)
+		return -ENOMEM;
+
+	pd->device  = ibdev;
+	uctx->cntxt_pd = get_ocrdma_pd(pd);
+
+	status = _ocrdma_alloc_pd(dev, uctx->cntxt_pd, uctx, udata);
+	if (status) {
+		kfree(uctx->cntxt_pd);
 		goto err;
 	}
 
@@ -463,7 +433,7 @@
 	return status;
 }
 
-static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx)
+static void ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx)
 {
 	struct ocrdma_pd *pd = uctx->cntxt_pd;
 	struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device);
@@ -472,9 +442,9 @@
 		pr_err("%s(%d) Freeing in use pdid=0x%x.\n",
 		       __func__, dev->id, pd->id);
 	}
+	kfree(uctx->cntxt_pd);
 	uctx->cntxt_pd = NULL;
-	(void)_ocrdma_dealloc_pd(dev, pd);
-	return 0;
+	_ocrdma_dealloc_pd(dev, pd);
 }
 
 static struct ocrdma_pd *ocrdma_get_ucontext_pd(struct ocrdma_ucontext *uctx)
@@ -498,33 +468,28 @@
 	mutex_unlock(&uctx->mm_list_lock);
 }
 
-struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev,
-					  struct ib_udata *udata)
+int ocrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 {
+	struct ib_device *ibdev = uctx->device;
 	int status;
-	struct ocrdma_ucontext *ctx;
-	struct ocrdma_alloc_ucontext_resp resp;
+	struct ocrdma_ucontext *ctx = get_ocrdma_ucontext(uctx);
+	struct ocrdma_alloc_ucontext_resp resp = {};
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
 	struct pci_dev *pdev = dev->nic_info.pdev;
 	u32 map_len = roundup(sizeof(u32) * 2048, PAGE_SIZE);
 
 	if (!udata)
-		return ERR_PTR(-EFAULT);
-	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
-	if (!ctx)
-		return ERR_PTR(-ENOMEM);
+		return -EFAULT;
 	INIT_LIST_HEAD(&ctx->mm_head);
 	mutex_init(&ctx->mm_list_lock);
 
-	ctx->ah_tbl.va = dma_zalloc_coherent(&pdev->dev, map_len,
-					     &ctx->ah_tbl.pa, GFP_KERNEL);
-	if (!ctx->ah_tbl.va) {
-		kfree(ctx);
-		return ERR_PTR(-ENOMEM);
-	}
+	ctx->ah_tbl.va = dma_alloc_coherent(&pdev->dev, map_len,
+					    &ctx->ah_tbl.pa, GFP_KERNEL);
+	if (!ctx->ah_tbl.va)
+		return -ENOMEM;
+
 	ctx->ah_tbl.len = map_len;
 
-	memset(&resp, 0, sizeof(resp));
 	resp.ah_tbl_len = ctx->ah_tbl.len;
 	resp.ah_tbl_page = virt_to_phys(ctx->ah_tbl.va);
 
@@ -546,27 +511,26 @@
 	status = ib_copy_to_udata(udata, &resp, sizeof(resp));
 	if (status)
 		goto cpy_err;
-	return &ctx->ibucontext;
+	return 0;
 
 cpy_err:
+	ocrdma_dealloc_ucontext_pd(ctx);
 pd_err:
 	ocrdma_del_mmap(ctx, ctx->ah_tbl.pa, ctx->ah_tbl.len);
 map_err:
 	dma_free_coherent(&pdev->dev, ctx->ah_tbl.len, ctx->ah_tbl.va,
 			  ctx->ah_tbl.pa);
-	kfree(ctx);
-	return ERR_PTR(status);
+	return status;
 }
 
-int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx)
+void ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx)
 {
-	int status;
 	struct ocrdma_mm *mm, *tmp;
 	struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ibctx);
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibctx->device);
 	struct pci_dev *pdev = dev->nic_info.pdev;
 
-	status = ocrdma_dealloc_ucontext_pd(uctx);
+	ocrdma_dealloc_ucontext_pd(uctx);
 
 	ocrdma_del_mmap(uctx, uctx->ah_tbl.pa, uctx->ah_tbl.len);
 	dma_free_coherent(&pdev->dev, uctx->ah_tbl.len, uctx->ah_tbl.va,
@@ -576,8 +540,6 @@
 		list_del(&mm->entry);
 		kfree(mm);
 	}
-	kfree(uctx);
-	return status;
 }
 
 int ocrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
@@ -624,7 +586,6 @@
 }
 
 static int ocrdma_copy_pd_uresp(struct ocrdma_dev *dev, struct ocrdma_pd *pd,
-				struct ib_ucontext *ib_ctx,
 				struct ib_udata *udata)
 {
 	int status;
@@ -632,7 +593,8 @@
 	u64 dpp_page_addr = 0;
 	u32 db_page_size;
 	struct ocrdma_alloc_pd_uresp rsp;
-	struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ib_ctx);
+	struct ocrdma_ucontext *uctx = rdma_udata_to_drv_context(
+		udata, struct ocrdma_ucontext, ibucontext);
 
 	memset(&rsp, 0, sizeof(rsp));
 	rsp.id = pd->id;
@@ -670,18 +632,17 @@
 	return status;
 }
 
-struct ib_pd *ocrdma_alloc_pd(struct ib_device *ibdev,
-			      struct ib_ucontext *context,
-			      struct ib_udata *udata)
+int ocrdma_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibpd->device;
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
 	struct ocrdma_pd *pd;
-	struct ocrdma_ucontext *uctx = NULL;
 	int status;
 	u8 is_uctx_pd = false;
+	struct ocrdma_ucontext *uctx = rdma_udata_to_drv_context(
+		udata, struct ocrdma_ucontext, ibucontext);
 
-	if (udata && context) {
-		uctx = get_ocrdma_ucontext(context);
+	if (udata) {
 		pd = ocrdma_get_ucontext_pd(uctx);
 		if (pd) {
 			is_uctx_pd = true;
@@ -689,37 +650,33 @@
 		}
 	}
 
-	pd = _ocrdma_alloc_pd(dev, uctx, udata);
-	if (IS_ERR(pd)) {
-		status = PTR_ERR(pd);
+	pd = get_ocrdma_pd(ibpd);
+	status = _ocrdma_alloc_pd(dev, pd, uctx, udata);
+	if (status)
 		goto exit;
-	}
 
 pd_mapping:
-	if (udata && context) {
-		status = ocrdma_copy_pd_uresp(dev, pd, context, udata);
+	if (udata) {
+		status = ocrdma_copy_pd_uresp(dev, pd, udata);
 		if (status)
 			goto err;
 	}
-	return &pd->ibpd;
+	return 0;
 
 err:
-	if (is_uctx_pd) {
+	if (is_uctx_pd)
 		ocrdma_release_ucontext_pd(uctx);
-	} else {
-		if (_ocrdma_dealloc_pd(dev, pd))
-			pr_err("%s: _ocrdma_dealloc_pd() failed\n", __func__);
-	}
+	else
+		_ocrdma_dealloc_pd(dev, pd);
 exit:
-	return ERR_PTR(status);
+	return status;
 }
 
-int ocrdma_dealloc_pd(struct ib_pd *ibpd)
+void ocrdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
 	struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
 	struct ocrdma_ucontext *uctx = NULL;
-	int status = 0;
 	u64 usr_db;
 
 	uctx = pd->uctx;
@@ -733,11 +690,10 @@
 
 		if (is_ucontext_pd(uctx, pd)) {
 			ocrdma_release_ucontext_pd(uctx);
-			return status;
+			return;
 		}
 	}
-	status = _ocrdma_dealloc_pd(dev, pd);
-	return status;
+	_ocrdma_dealloc_pd(dev, pd);
 }
 
 static int ocrdma_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_mr *mr,
@@ -850,7 +806,7 @@
 		return -ENOMEM;
 
 	for (i = 0; i < mr->num_pbls; i++) {
-		va = dma_zalloc_coherent(&pdev->dev, dma_len, &pa, GFP_KERNEL);
+		va = dma_alloc_coherent(&pdev->dev, dma_len, &pa, GFP_KERNEL);
 		if (!va) {
 			ocrdma_free_mr_pbl_tbl(dev, mr);
 			status = -ENOMEM;
@@ -866,10 +822,11 @@
 			    u32 num_pbes)
 {
 	struct ocrdma_pbe *pbe;
-	struct scatterlist *sg;
+	struct sg_dma_page_iter sg_iter;
 	struct ocrdma_pbl *pbl_tbl = mr->hwmr.pbl_table;
 	struct ib_umem *umem = mr->umem;
-	int shift, pg_cnt, pages, pbe_cnt, entry, total_num_pbes = 0;
+	int pbe_cnt, total_num_pbes = 0;
+	u64 pg_addr;
 
 	if (!mr->hwmr.num_pbes)
 		return;
@@ -877,36 +834,26 @@
 	pbe = (struct ocrdma_pbe *)pbl_tbl->va;
 	pbe_cnt = 0;
 
-	shift = umem->page_shift;
+	for_each_sg_dma_page (umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+		/* store the page address in pbe */
+		pg_addr = sg_page_iter_dma_address(&sg_iter);
+		pbe->pa_lo = cpu_to_le32(pg_addr);
+		pbe->pa_hi = cpu_to_le32(upper_32_bits(pg_addr));
+		pbe_cnt += 1;
+		total_num_pbes += 1;
+		pbe++;
 
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		pages = sg_dma_len(sg) >> shift;
-		for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) {
-			/* store the page address in pbe */
-			pbe->pa_lo =
-			    cpu_to_le32(sg_dma_address(sg) +
-					(pg_cnt << shift));
-			pbe->pa_hi =
-			    cpu_to_le32(upper_32_bits(sg_dma_address(sg) +
-					 (pg_cnt << shift)));
-			pbe_cnt += 1;
-			total_num_pbes += 1;
-			pbe++;
+		/* if done building pbes, issue the mbx cmd. */
+		if (total_num_pbes == num_pbes)
+			return;
 
-			/* if done building pbes, issue the mbx cmd. */
-			if (total_num_pbes == num_pbes)
-				return;
-
-			/* if the given pbl is full storing the pbes,
-			 * move to next pbl.
-			 */
-			if (pbe_cnt ==
-				(mr->hwmr.pbl_size / sizeof(u64))) {
-				pbl_tbl++;
-				pbe = (struct ocrdma_pbe *)pbl_tbl->va;
-				pbe_cnt = 0;
-			}
-
+		/* if the given pbl is full storing the pbes,
+		 * move to next pbl.
+		 */
+		if (pbe_cnt == (mr->hwmr.pbl_size / sizeof(u64))) {
+			pbl_tbl++;
+			pbe = (struct ocrdma_pbe *)pbl_tbl->va;
+			pbe_cnt = 0;
 		}
 	}
 }
@@ -928,7 +875,7 @@
 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 	if (!mr)
 		return ERR_PTR(status);
-	mr->umem = ib_umem_get(ibpd->uobject->context, start, len, acc, 0);
+	mr->umem = ib_umem_get(udata, start, len, acc, 0);
 	if (IS_ERR(mr->umem)) {
 		status = -EFAULT;
 		goto umem_err;
@@ -938,7 +885,7 @@
 	if (status)
 		goto umem_err;
 
-	mr->hwmr.pbe_size = BIT(mr->umem->page_shift);
+	mr->hwmr.pbe_size = PAGE_SIZE;
 	mr->hwmr.fbo = ib_umem_offset(mr->umem);
 	mr->hwmr.va = usr_addr;
 	mr->hwmr.len = len;
@@ -967,7 +914,7 @@
 	return ERR_PTR(status);
 }
 
-int ocrdma_dereg_mr(struct ib_mr *ib_mr)
+int ocrdma_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
 {
 	struct ocrdma_mr *mr = get_ocrdma_mr(ib_mr);
 	struct ocrdma_dev *dev = get_ocrdma_dev(ib_mr->device);
@@ -978,8 +925,7 @@
 	ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr);
 
 	/* it could be user registered memory. */
-	if (mr->umem)
-		ib_umem_release(mr->umem);
+	ib_umem_release(mr->umem);
 	kfree(mr);
 
 	/* Don't stop cleanup, in case FW is unresponsive */
@@ -991,13 +937,17 @@
 }
 
 static int ocrdma_copy_cq_uresp(struct ocrdma_dev *dev, struct ocrdma_cq *cq,
-				struct ib_udata *udata,
-				struct ib_ucontext *ib_ctx)
+				struct ib_udata *udata)
 {
 	int status;
-	struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ib_ctx);
+	struct ocrdma_ucontext *uctx = rdma_udata_to_drv_context(
+		udata, struct ocrdma_ucontext, ibucontext);
 	struct ocrdma_create_cq_uresp uresp;
 
+	/* this must be a user-space flow! */
+	if (!udata)
+		return -EINVAL;
+
 	memset(&uresp, 0, sizeof(uresp));
 	uresp.cq_id = cq->id;
 	uresp.page_size = PAGE_ALIGN(cq->len);
@@ -1026,59 +976,52 @@
 	return status;
 }
 
-struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
-			       const struct ib_cq_init_attr *attr,
-			       struct ib_ucontext *ib_ctx,
-			       struct ib_udata *udata)
+int ocrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		     struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	int entries = attr->cqe;
-	struct ocrdma_cq *cq;
+	struct ocrdma_cq *cq = get_ocrdma_cq(ibcq);
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
-	struct ocrdma_ucontext *uctx = NULL;
+	struct ocrdma_ucontext *uctx = rdma_udata_to_drv_context(
+		udata, struct ocrdma_ucontext, ibucontext);
 	u16 pd_id = 0;
 	int status;
 	struct ocrdma_create_cq_ureq ureq;
 
 	if (attr->flags)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (udata) {
 		if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
-			return ERR_PTR(-EFAULT);
+			return -EFAULT;
 	} else
 		ureq.dpp_cq = 0;
-	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-	if (!cq)
-		return ERR_PTR(-ENOMEM);
 
 	spin_lock_init(&cq->cq_lock);
 	spin_lock_init(&cq->comp_handler_lock);
 	INIT_LIST_HEAD(&cq->sq_head);
 	INIT_LIST_HEAD(&cq->rq_head);
 
-	if (ib_ctx) {
-		uctx = get_ocrdma_ucontext(ib_ctx);
+	if (udata)
 		pd_id = uctx->cntxt_pd->id;
-	}
 
 	status = ocrdma_mbx_create_cq(dev, cq, entries, ureq.dpp_cq, pd_id);
-	if (status) {
-		kfree(cq);
-		return ERR_PTR(status);
-	}
-	if (ib_ctx) {
-		status = ocrdma_copy_cq_uresp(dev, cq, udata, ib_ctx);
+	if (status)
+		return status;
+
+	if (udata) {
+		status = ocrdma_copy_cq_uresp(dev, cq, udata);
 		if (status)
 			goto ctx_err;
 	}
 	cq->phase = OCRDMA_CQE_VALID;
 	dev->cq_tbl[cq->id] = cq;
-	return &cq->ibcq;
+	return 0;
 
 ctx_err:
 	ocrdma_mbx_destroy_cq(dev, cq);
-	kfree(cq);
-	return ERR_PTR(status);
+	return status;
 }
 
 int ocrdma_resize_cq(struct ib_cq *ibcq, int new_cnt,
@@ -1121,7 +1064,7 @@
 	spin_unlock_irqrestore(&cq->cq_lock, flags);
 }
 
-int ocrdma_destroy_cq(struct ib_cq *ibcq)
+void ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
 	struct ocrdma_cq *cq = get_ocrdma_cq(ibcq);
 	struct ocrdma_eq *eq = NULL;
@@ -1131,14 +1074,13 @@
 
 	dev->cq_tbl[cq->id] = NULL;
 	indx = ocrdma_get_eq_table_index(dev, cq->eqn);
-	BUG_ON(indx == -EINVAL);
 
 	eq = &dev->eq_tbl[indx];
 	irq = ocrdma_get_irq(dev, eq);
 	synchronize_irq(irq);
 	ocrdma_flush_cq(cq);
 
-	(void)ocrdma_mbx_destroy_cq(dev, cq);
+	ocrdma_mbx_destroy_cq(dev, cq);
 	if (cq->ucontext) {
 		pdid = cq->ucontext->cntxt_pd->id;
 		ocrdma_del_mmap(cq->ucontext, (u64) cq->pa,
@@ -1147,9 +1089,6 @@
 				ocrdma_get_db_addr(dev, pdid),
 				dev->nic_info.db_page_size);
 	}
-
-	kfree(cq);
-	return 0;
 }
 
 static int ocrdma_add_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp)
@@ -1169,7 +1108,8 @@
 }
 
 static int ocrdma_check_qp_params(struct ib_pd *ibpd, struct ocrdma_dev *dev,
-				  struct ib_qp_init_attr *attrs)
+				  struct ib_qp_init_attr *attrs,
+				  struct ib_udata *udata)
 {
 	if ((attrs->qp_type != IB_QPT_GSI) &&
 	    (attrs->qp_type != IB_QPT_RC) &&
@@ -1217,7 +1157,7 @@
 		return -EINVAL;
 	}
 	/* unprivileged user space cannot create special QP */
-	if (ibpd->uobject && attrs->qp_type == IB_QPT_GSI) {
+	if (udata && attrs->qp_type == IB_QPT_GSI) {
 		pr_err
 		    ("%s(%d) Userspace can't create special QPs of type=0x%x\n",
 		     __func__, dev->id, attrs->qp_type);
@@ -1374,7 +1314,7 @@
 	struct ocrdma_create_qp_ureq ureq;
 	u16 dpp_credit_lmt, dpp_offset;
 
-	status = ocrdma_check_qp_params(ibpd, dev, attrs);
+	status = ocrdma_check_qp_params(ibpd, dev, attrs, udata);
 	if (status)
 		goto gen_err;
 
@@ -1480,8 +1420,7 @@
 		new_qps = old_qps;
 	spin_unlock_irqrestore(&qp->q_lock, flags);
 
-	if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask,
-				IB_LINK_LAYER_ETHERNET)) {
+	if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask)) {
 		pr_err("%s(%d) invalid attribute mask=0x%x specified for\n"
 		       "qpn=0x%x of type=0x%x old_qps=0x%x, new_qps=0x%x\n",
 		       __func__, dev->id, attr_mask, qp->id, ibqp->qp_type,
@@ -1742,7 +1681,7 @@
 	spin_unlock_irqrestore(&dev->flush_q_lock, flags);
 }
 
-int ocrdma_destroy_qp(struct ib_qp *ibqp)
+int ocrdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 {
 	struct ocrdma_pd *pd;
 	struct ocrdma_qp *qp;
@@ -1838,45 +1777,43 @@
 	return status;
 }
 
-struct ib_srq *ocrdma_create_srq(struct ib_pd *ibpd,
-				 struct ib_srq_init_attr *init_attr,
-				 struct ib_udata *udata)
+int ocrdma_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init_attr,
+		      struct ib_udata *udata)
 {
-	int status = -ENOMEM;
-	struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
-	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
-	struct ocrdma_srq *srq;
+	int status;
+	struct ocrdma_pd *pd = get_ocrdma_pd(ibsrq->pd);
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibsrq->device);
+	struct ocrdma_srq *srq = get_ocrdma_srq(ibsrq);
 
 	if (init_attr->attr.max_sge > dev->attr.max_recv_sge)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	if (init_attr->attr.max_wr > dev->attr.max_rqe)
-		return ERR_PTR(-EINVAL);
-
-	srq = kzalloc(sizeof(*srq), GFP_KERNEL);
-	if (!srq)
-		return ERR_PTR(status);
+		return -EINVAL;
 
 	spin_lock_init(&srq->q_lock);
 	srq->pd = pd;
 	srq->db = dev->nic_info.db + (pd->id * dev->nic_info.db_page_size);
 	status = ocrdma_mbx_create_srq(dev, srq, init_attr, pd);
 	if (status)
-		goto err;
+		return status;
 
-	if (udata == NULL) {
-		status = -ENOMEM;
+	if (!udata) {
 		srq->rqe_wr_id_tbl = kcalloc(srq->rq.max_cnt, sizeof(u64),
 					     GFP_KERNEL);
-		if (srq->rqe_wr_id_tbl == NULL)
+		if (!srq->rqe_wr_id_tbl) {
+			status = -ENOMEM;
 			goto arm_err;
+		}
 
 		srq->bit_fields_len = (srq->rq.max_cnt / 32) +
 		    (srq->rq.max_cnt % 32 ? 1 : 0);
 		srq->idx_bit_fields =
 		    kmalloc_array(srq->bit_fields_len, sizeof(u32),
 				  GFP_KERNEL);
-		if (srq->idx_bit_fields == NULL)
+		if (!srq->idx_bit_fields) {
+			status = -ENOMEM;
 			goto arm_err;
+		}
 		memset(srq->idx_bit_fields, 0xff,
 		       srq->bit_fields_len * sizeof(u32));
 	}
@@ -1893,15 +1830,13 @@
 			goto arm_err;
 	}
 
-	return &srq->ibsrq;
+	return 0;
 
 arm_err:
 	ocrdma_mbx_destroy_srq(dev, srq);
-err:
 	kfree(srq->rqe_wr_id_tbl);
 	kfree(srq->idx_bit_fields);
-	kfree(srq);
-	return ERR_PTR(status);
+	return status;
 }
 
 int ocrdma_modify_srq(struct ib_srq *ibsrq,
@@ -1930,15 +1865,14 @@
 	return status;
 }
 
-int ocrdma_destroy_srq(struct ib_srq *ibsrq)
+void ocrdma_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 {
-	int status;
 	struct ocrdma_srq *srq;
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibsrq->device);
 
 	srq = get_ocrdma_srq(ibsrq);
 
-	status = ocrdma_mbx_destroy_srq(dev, srq);
+	ocrdma_mbx_destroy_srq(dev, srq);
 
 	if (srq->pd->uctx)
 		ocrdma_del_mmap(srq->pd->uctx, (u64) srq->rq.pa,
@@ -1946,8 +1880,6 @@
 
 	kfree(srq->idx_bit_fields);
 	kfree(srq->rqe_wr_id_tbl);
-	kfree(srq);
-	return status;
 }
 
 /* unprivileged verbs and their support functions. */
@@ -2976,9 +2908,8 @@
 	return 0;
 }
 
-struct ib_mr *ocrdma_alloc_mr(struct ib_pd *ibpd,
-			      enum ib_mr_type mr_type,
-			      u32 max_num_sg)
+struct ib_mr *ocrdma_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
+			      u32 max_num_sg, struct ib_udata *udata)
 {
 	int status;
 	struct ocrdma_mr *mr;
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
index b69cfdc..32488da 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -61,25 +61,20 @@
 ocrdma_query_protocol(struct ib_device *device, u8 port_num);
 
 void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid);
-struct net_device *ocrdma_get_netdev(struct ib_device *device, u8 port_num);
 int ocrdma_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey);
 
-struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *,
-					  struct ib_udata *);
-int ocrdma_dealloc_ucontext(struct ib_ucontext *);
+int ocrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
+void ocrdma_dealloc_ucontext(struct ib_ucontext *uctx);
 
 int ocrdma_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
 
-struct ib_pd *ocrdma_alloc_pd(struct ib_device *,
-			      struct ib_ucontext *, struct ib_udata *);
-int ocrdma_dealloc_pd(struct ib_pd *pd);
+int ocrdma_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+void ocrdma_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 
-struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
-			       const struct ib_cq_init_attr *attr,
-			       struct ib_ucontext *ib_ctx,
-			       struct ib_udata *udata);
+int ocrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		     struct ib_udata *udata);
 int ocrdma_resize_cq(struct ib_cq *, int cqe, struct ib_udata *);
-int ocrdma_destroy_cq(struct ib_cq *);
+void ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 
 struct ib_qp *ocrdma_create_qp(struct ib_pd *,
 			       struct ib_qp_init_attr *attrs,
@@ -91,25 +86,24 @@
 int ocrdma_query_qp(struct ib_qp *,
 		    struct ib_qp_attr *qp_attr,
 		    int qp_attr_mask, struct ib_qp_init_attr *);
-int ocrdma_destroy_qp(struct ib_qp *);
+int ocrdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
 void ocrdma_del_flush_qp(struct ocrdma_qp *qp);
 
-struct ib_srq *ocrdma_create_srq(struct ib_pd *, struct ib_srq_init_attr *,
-				 struct ib_udata *);
+int ocrdma_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *attr,
+		      struct ib_udata *udata);
 int ocrdma_modify_srq(struct ib_srq *, struct ib_srq_attr *,
 		      enum ib_srq_attr_mask, struct ib_udata *);
 int ocrdma_query_srq(struct ib_srq *, struct ib_srq_attr *);
-int ocrdma_destroy_srq(struct ib_srq *);
+void ocrdma_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata);
 int ocrdma_post_srq_recv(struct ib_srq *, const struct ib_recv_wr *,
 			 const struct ib_recv_wr **bad_recv_wr);
 
-int ocrdma_dereg_mr(struct ib_mr *);
+int ocrdma_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata);
 struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *, int acc);
 struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length,
 				 u64 virt, int acc, struct ib_udata *);
-struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd,
-			      enum ib_mr_type mr_type,
-			      u32 max_num_sg);
+struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+			      u32 max_num_sg, struct ib_udata *udata);
 int ocrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 		     unsigned int *sg_offset);
 
diff --git a/drivers/infiniband/hw/qedr/Kconfig b/drivers/infiniband/hw/qedr/Kconfig
index 9b9e3b1..9c30325 100644
--- a/drivers/infiniband/hw/qedr/Kconfig
+++ b/drivers/infiniband/hw/qedr/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_QEDR
 	tristate "QLogic RoCE driver"
 	depends on 64BIT && QEDE
diff --git a/drivers/infiniband/hw/qedr/Makefile b/drivers/infiniband/hw/qedr/Makefile
index 1c0bc4f..c756798 100644
--- a/drivers/infiniband/hw/qedr/Makefile
+++ b/drivers/infiniband/hw/qedr/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_INFINIBAND_QEDR) := qedr.o
 
 qedr-y := main.o verbs.o qedr_roce_cm.o qedr_iw_cm.o
diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index a0af6d4..dc71b6e 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -39,7 +39,6 @@
 #include <linux/iommu.h>
 #include <linux/pci.h>
 #include <net/addrconf.h>
-#include <linux/idr.h>
 
 #include <linux/qed/qed_chain.h>
 #include <linux/qed/qed_if.h>
@@ -77,25 +76,11 @@
 	struct qedr_dev *qedr = get_qedr_dev(ibdev);
 	u32 fw_ver = (u32)qedr->attr.fw_ver;
 
-	snprintf(str, IB_FW_VERSION_NAME_MAX, "%d. %d. %d. %d",
+	snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d.%d",
 		 (fw_ver >> 24) & 0xFF, (fw_ver >> 16) & 0xFF,
 		 (fw_ver >> 8) & 0xFF, fw_ver & 0xFF);
 }
 
-static struct net_device *qedr_get_netdev(struct ib_device *dev, u8 port_num)
-{
-	struct qedr_dev *qdev;
-
-	qdev = get_qedr_dev(dev);
-	dev_hold(qdev->ndev);
-
-	/* The HW vendor's device driver must guarantee
-	 * that this function returns NULL before the net device has finished
-	 * NETDEV_UNREGISTER state.
-	 */
-	return qdev->ndev;
-}
-
 static int qedr_roce_port_immutable(struct ib_device *ibdev, u8 port_num,
 				    struct ib_port_immutable *immutable)
 {
@@ -133,49 +118,130 @@
 	return 0;
 }
 
+/* QEDR sysfs interface */
+static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr,
+			   char *buf)
+{
+	struct qedr_dev *dev =
+		rdma_device_to_drv_device(device, struct qedr_dev, ibdev);
+
+	return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->attr.hw_ver);
+}
+static DEVICE_ATTR_RO(hw_rev);
+
+static ssize_t hca_type_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
+{
+	struct qedr_dev *dev =
+		rdma_device_to_drv_device(device, struct qedr_dev, ibdev);
+
+	return scnprintf(buf, PAGE_SIZE, "FastLinQ QL%x %s\n",
+			 dev->pdev->device,
+			 rdma_protocol_iwarp(&dev->ibdev, 1) ?
+			 "iWARP" : "RoCE");
+}
+static DEVICE_ATTR_RO(hca_type);
+
+static struct attribute *qedr_attributes[] = {
+	&dev_attr_hw_rev.attr,
+	&dev_attr_hca_type.attr,
+	NULL
+};
+
+static const struct attribute_group qedr_attr_group = {
+	.attrs = qedr_attributes,
+};
+
+static const struct ib_device_ops qedr_iw_dev_ops = {
+	.get_port_immutable = qedr_iw_port_immutable,
+	.iw_accept = qedr_iw_accept,
+	.iw_add_ref = qedr_iw_qp_add_ref,
+	.iw_connect = qedr_iw_connect,
+	.iw_create_listen = qedr_iw_create_listen,
+	.iw_destroy_listen = qedr_iw_destroy_listen,
+	.iw_get_qp = qedr_iw_get_qp,
+	.iw_reject = qedr_iw_reject,
+	.iw_rem_ref = qedr_iw_qp_rem_ref,
+	.query_gid = qedr_iw_query_gid,
+};
+
 static int qedr_iw_register_device(struct qedr_dev *dev)
 {
 	dev->ibdev.node_type = RDMA_NODE_RNIC;
-	dev->ibdev.query_gid = qedr_iw_query_gid;
 
-	dev->ibdev.get_port_immutable = qedr_iw_port_immutable;
+	ib_set_device_ops(&dev->ibdev, &qedr_iw_dev_ops);
 
-	dev->ibdev.iwcm = kzalloc(sizeof(*dev->ibdev.iwcm), GFP_KERNEL);
-	if (!dev->ibdev.iwcm)
-		return -ENOMEM;
-
-	dev->ibdev.iwcm->connect = qedr_iw_connect;
-	dev->ibdev.iwcm->accept = qedr_iw_accept;
-	dev->ibdev.iwcm->reject = qedr_iw_reject;
-	dev->ibdev.iwcm->create_listen = qedr_iw_create_listen;
-	dev->ibdev.iwcm->destroy_listen = qedr_iw_destroy_listen;
-	dev->ibdev.iwcm->add_ref = qedr_iw_qp_add_ref;
-	dev->ibdev.iwcm->rem_ref = qedr_iw_qp_rem_ref;
-	dev->ibdev.iwcm->get_qp = qedr_iw_get_qp;
-
-	memcpy(dev->ibdev.iwcm->ifname,
-	       dev->ndev->name, sizeof(dev->ibdev.iwcm->ifname));
+	memcpy(dev->ibdev.iw_ifname,
+	       dev->ndev->name, sizeof(dev->ibdev.iw_ifname));
 
 	return 0;
 }
 
+static const struct ib_device_ops qedr_roce_dev_ops = {
+	.get_port_immutable = qedr_roce_port_immutable,
+};
+
 static void qedr_roce_register_device(struct qedr_dev *dev)
 {
 	dev->ibdev.node_type = RDMA_NODE_IB_CA;
 
-	dev->ibdev.get_port_immutable = qedr_roce_port_immutable;
+	ib_set_device_ops(&dev->ibdev, &qedr_roce_dev_ops);
 }
 
+static const struct ib_device_ops qedr_dev_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_QEDR,
+	.uverbs_abi_ver = QEDR_ABI_VERSION,
+
+	.alloc_mr = qedr_alloc_mr,
+	.alloc_pd = qedr_alloc_pd,
+	.alloc_ucontext = qedr_alloc_ucontext,
+	.create_ah = qedr_create_ah,
+	.create_cq = qedr_create_cq,
+	.create_qp = qedr_create_qp,
+	.create_srq = qedr_create_srq,
+	.dealloc_pd = qedr_dealloc_pd,
+	.dealloc_ucontext = qedr_dealloc_ucontext,
+	.dereg_mr = qedr_dereg_mr,
+	.destroy_ah = qedr_destroy_ah,
+	.destroy_cq = qedr_destroy_cq,
+	.destroy_qp = qedr_destroy_qp,
+	.destroy_srq = qedr_destroy_srq,
+	.get_dev_fw_str = qedr_get_dev_fw_str,
+	.get_dma_mr = qedr_get_dma_mr,
+	.get_link_layer = qedr_link_layer,
+	.map_mr_sg = qedr_map_mr_sg,
+	.mmap = qedr_mmap,
+	.modify_port = qedr_modify_port,
+	.modify_qp = qedr_modify_qp,
+	.modify_srq = qedr_modify_srq,
+	.poll_cq = qedr_poll_cq,
+	.post_recv = qedr_post_recv,
+	.post_send = qedr_post_send,
+	.post_srq_recv = qedr_post_srq_recv,
+	.process_mad = qedr_process_mad,
+	.query_device = qedr_query_device,
+	.query_pkey = qedr_query_pkey,
+	.query_port = qedr_query_port,
+	.query_qp = qedr_query_qp,
+	.query_srq = qedr_query_srq,
+	.reg_user_mr = qedr_reg_user_mr,
+	.req_notify_cq = qedr_arm_cq,
+	.resize_cq = qedr_resize_cq,
+
+	INIT_RDMA_OBJ_SIZE(ib_ah, qedr_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, qedr_cq, ibcq),
+	INIT_RDMA_OBJ_SIZE(ib_pd, qedr_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_srq, qedr_srq, ibsrq),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, qedr_ucontext, ibucontext),
+};
+
 static int qedr_register_device(struct qedr_dev *dev)
 {
 	int rc;
 
-	strlcpy(dev->ibdev.name, "qedr%d", IB_DEVICE_NAME_MAX);
-
 	dev->ibdev.node_guid = dev->attr.node_guid;
 	memcpy(dev->ibdev.node_desc, QEDR_NODE_DESC, sizeof(QEDR_NODE_DESC));
-	dev->ibdev.owner = THIS_MODULE;
-	dev->ibdev.uverbs_abi_ver = QEDR_ABI_VERSION;
 
 	dev->ibdev.uverbs_cmd_mask = QEDR_UVERBS(GET_CONTEXT) |
 				     QEDR_UVERBS(QUERY_DEVICE) |
@@ -212,59 +278,16 @@
 
 	dev->ibdev.phys_port_cnt = 1;
 	dev->ibdev.num_comp_vectors = dev->num_cnq;
-
-	dev->ibdev.query_device = qedr_query_device;
-	dev->ibdev.query_port = qedr_query_port;
-	dev->ibdev.modify_port = qedr_modify_port;
-
-	dev->ibdev.alloc_ucontext = qedr_alloc_ucontext;
-	dev->ibdev.dealloc_ucontext = qedr_dealloc_ucontext;
-	dev->ibdev.mmap = qedr_mmap;
-
-	dev->ibdev.alloc_pd = qedr_alloc_pd;
-	dev->ibdev.dealloc_pd = qedr_dealloc_pd;
-
-	dev->ibdev.create_cq = qedr_create_cq;
-	dev->ibdev.destroy_cq = qedr_destroy_cq;
-	dev->ibdev.resize_cq = qedr_resize_cq;
-	dev->ibdev.req_notify_cq = qedr_arm_cq;
-
-	dev->ibdev.create_qp = qedr_create_qp;
-	dev->ibdev.modify_qp = qedr_modify_qp;
-	dev->ibdev.query_qp = qedr_query_qp;
-	dev->ibdev.destroy_qp = qedr_destroy_qp;
-
-	dev->ibdev.create_srq = qedr_create_srq;
-	dev->ibdev.destroy_srq = qedr_destroy_srq;
-	dev->ibdev.modify_srq = qedr_modify_srq;
-	dev->ibdev.query_srq = qedr_query_srq;
-	dev->ibdev.post_srq_recv = qedr_post_srq_recv;
-	dev->ibdev.query_pkey = qedr_query_pkey;
-
-	dev->ibdev.create_ah = qedr_create_ah;
-	dev->ibdev.destroy_ah = qedr_destroy_ah;
-
-	dev->ibdev.get_dma_mr = qedr_get_dma_mr;
-	dev->ibdev.dereg_mr = qedr_dereg_mr;
-	dev->ibdev.reg_user_mr = qedr_reg_user_mr;
-	dev->ibdev.alloc_mr = qedr_alloc_mr;
-	dev->ibdev.map_mr_sg = qedr_map_mr_sg;
-
-	dev->ibdev.poll_cq = qedr_poll_cq;
-	dev->ibdev.post_send = qedr_post_send;
-	dev->ibdev.post_recv = qedr_post_recv;
-
-	dev->ibdev.process_mad = qedr_process_mad;
-
-	dev->ibdev.get_netdev = qedr_get_netdev;
-
 	dev->ibdev.dev.parent = &dev->pdev->dev;
 
-	dev->ibdev.get_link_layer = qedr_link_layer;
-	dev->ibdev.get_dev_fw_str = qedr_get_dev_fw_str;
+	rdma_set_device_sysfs_group(&dev->ibdev, &qedr_attr_group);
+	ib_set_device_ops(&dev->ibdev, &qedr_dev_ops);
 
-	dev->ibdev.driver_id = RDMA_DRIVER_QEDR;
-	return ib_register_device(&dev->ibdev, NULL);
+	rc = ib_device_set_netdev(&dev->ibdev, dev->ndev, 1);
+	if (rc)
+		return rc;
+
+	return ib_register_device(&dev->ibdev, "qedr%d");
 }
 
 /* This function allocates fast-path status block memory */
@@ -297,7 +320,8 @@
 			     struct qed_sb_info *sb_info, int sb_id)
 {
 	if (sb_info->sb_virt) {
-		dev->ops->common->sb_release(dev->cdev, sb_info, sb_id);
+		dev->ops->common->sb_release(dev->cdev, sb_info, sb_id,
+					     QED_SB_TYPE_CNQ);
 		dma_free_coherent(&dev->pdev->dev, sizeof(*sb_info->sb_virt),
 				  (void *)sb_info->sb_virt, sb_info->sb_phys);
 	}
@@ -335,8 +359,7 @@
 	spin_lock_init(&dev->sgid_lock);
 
 	if (IS_IWARP(dev)) {
-		spin_lock_init(&dev->qpidr.idr_lock);
-		idr_init(&dev->qpidr.idr);
+		xa_init_flags(&dev->qps, XA_FLAGS_LOCK_IRQ);
 		dev->iwarp_wq = create_singlethread_workqueue("qedr_iwarpq");
 	}
 
@@ -404,37 +427,6 @@
 	return rc;
 }
 
-/* QEDR sysfs interface */
-static ssize_t show_rev(struct device *device, struct device_attribute *attr,
-			char *buf)
-{
-	struct qedr_dev *dev = dev_get_drvdata(device);
-
-	return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->pdev->vendor);
-}
-
-static ssize_t show_hca_type(struct device *device,
-			     struct device_attribute *attr, char *buf)
-{
-	return scnprintf(buf, PAGE_SIZE, "%s\n", "HCA_TYPE_TO_SET");
-}
-
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL);
-
-static struct device_attribute *qedr_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_hca_type
-};
-
-static void qedr_remove_sysfiles(struct qedr_dev *dev)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(qedr_attributes); i++)
-		device_remove_file(&dev->ibdev.dev, qedr_attributes[i]);
-}
-
 static void qedr_pci_set_atomic(struct qedr_dev *dev, struct pci_dev *pdev)
 {
 	int rc = pci_enable_atomic_ops_to_root(pdev,
@@ -521,11 +513,13 @@
 static void qedr_sync_free_irqs(struct qedr_dev *dev)
 {
 	u32 vector;
+	u16 idx;
 	int i;
 
 	for (i = 0; i < dev->int_info.used_cnt; i++) {
 		if (dev->int_info.msix_cnt) {
-			vector = dev->int_info.msix[i * dev->num_hwfns].vector;
+			idx = i * dev->num_hwfns + dev->affin_hwfn_idx;
+			vector = dev->int_info.msix[idx].vector;
 			synchronize_irq(vector);
 			free_irq(vector, &dev->cnq_array[i]);
 		}
@@ -537,6 +531,7 @@
 static int qedr_req_msix_irqs(struct qedr_dev *dev)
 {
 	int i, rc = 0;
+	u16 idx;
 
 	if (dev->num_cnq > dev->int_info.msix_cnt) {
 		DP_ERR(dev,
@@ -546,7 +541,8 @@
 	}
 
 	for (i = 0; i < dev->num_cnq; i++) {
-		rc = request_irq(dev->int_info.msix[i * dev->num_hwfns].vector,
+		idx = i * dev->num_hwfns + dev->affin_hwfn_idx;
+		rc = request_irq(dev->int_info.msix[idx].vector,
 				 qedr_irq_handler, 0, dev->cnq_array[i].name,
 				 &dev->cnq_array[i]);
 		if (rc) {
@@ -762,8 +758,8 @@
 		break;
 	case EVENT_TYPE_SRQ:
 		srq_id = (u16)roce_handle64;
-		spin_lock_irqsave(&dev->srqidr.idr_lock, flags);
-		srq = idr_find(&dev->srqidr.idr, srq_id);
+		xa_lock_irqsave(&dev->srqs, flags);
+		srq = xa_load(&dev->srqs, srq_id);
 		if (srq) {
 			ibsrq = &srq->ibsrq;
 			if (ibsrq->event_handler) {
@@ -777,7 +773,7 @@
 				  "SRQ event with NULL pointer ibsrq. Handle=%llx\n",
 				  roce_handle64);
 		}
-		spin_unlock_irqrestore(&dev->srqidr.idr_lock, flags);
+		xa_unlock_irqrestore(&dev->srqs, flags);
 		DP_NOTICE(dev, "SRQ event %d on handle %p\n", e_code, srq);
 	default:
 		break;
@@ -830,7 +826,7 @@
 	if (rc)
 		goto out;
 
-	dev->db_addr = (void __iomem *)(uintptr_t)out_params.dpi_addr;
+	dev->db_addr = out_params.dpi_addr;
 	dev->db_phys_addr = out_params.dpi_phys_addr;
 	dev->db_size = out_params.dpi_size;
 	dev->dpi = out_params.dpi;
@@ -855,9 +851,9 @@
 {
 	struct qed_dev_rdma_info dev_info;
 	struct qedr_dev *dev;
-	int rc = 0, i;
+	int rc = 0;
 
-	dev = (struct qedr_dev *)ib_alloc_device(sizeof(*dev));
+	dev = ib_alloc_device(qedr_dev, ibdev);
 	if (!dev) {
 		pr_err("Unable to allocate ib device\n");
 		return NULL;
@@ -883,6 +879,16 @@
 	dev->user_dpm_enabled = dev_info.user_dpm_enabled;
 	dev->rdma_type = dev_info.rdma_type;
 	dev->num_hwfns = dev_info.common.num_hwfns;
+
+	if (IS_IWARP(dev) && QEDR_IS_CMT(dev)) {
+		rc = dev->ops->iwarp_set_engine_affin(cdev, false);
+		if (rc) {
+			DP_ERR(dev, "iWARP is disabled over a 100g device. Enabling it may impact L2 performance. To enable it run devlink dev param set <dev> name iwarp_cmt value true cmode runtime\n");
+			goto init_err;
+		}
+	}
+	dev->affin_hwfn_idx = dev->ops->common->get_affin_hwfn_idx(cdev);
+
 	dev->rdma_ctx = dev->ops->rdma_get_rdma_ctx(cdev);
 
 	dev->num_cnq = dev->ops->rdma_get_min_cnq_msix(cdev);
@@ -914,18 +920,12 @@
 		goto reg_err;
 	}
 
-	for (i = 0; i < ARRAY_SIZE(qedr_attributes); i++)
-		if (device_create_file(&dev->ibdev.dev, qedr_attributes[i]))
-			goto sysfs_err;
-
 	if (!test_and_set_bit(QEDR_ENET_STATE_BIT, &dev->enet_state))
 		qedr_ib_dispatch_event(dev, QEDR_PORT, IB_EVENT_PORT_ACTIVE);
 
 	DP_DEBUG(dev, QEDR_MSG_INIT, "qedr driver loaded successfully\n");
 	return dev;
 
-sysfs_err:
-	ib_unregister_device(&dev->ibdev);
 reg_err:
 	qedr_sync_free_irqs(dev);
 irq_err:
@@ -944,12 +944,15 @@
 	/* First unregister with stack to stop all the active traffic
 	 * of the registered clients.
 	 */
-	qedr_remove_sysfiles(dev);
 	ib_unregister_device(&dev->ibdev);
 
 	qedr_stop_hw(dev);
 	qedr_sync_free_irqs(dev);
 	qedr_free_resources(dev);
+
+	if (IS_IWARP(dev) && QEDR_IS_CMT(dev))
+		dev->ops->iwarp_set_engine_affin(dev->cdev, true);
+
 	ib_dealloc_device(&dev->ibdev);
 }
 
diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h
index a2d708d..0cfd849 100644
--- a/drivers/infiniband/hw/qedr/qedr.h
+++ b/drivers/infiniband/hw/qedr/qedr.h
@@ -33,7 +33,7 @@
 #define __QEDR_H__
 
 #include <linux/pci.h>
-#include <linux/idr.h>
+#include <linux/xarray.h>
 #include <rdma/ib_addr.h>
 #include <linux/qed/qed_if.h>
 #include <linux/qed/qed_chain.h>
@@ -43,7 +43,7 @@
 #include "qedr_hsi_rdma.h"
 
 #define QEDR_NODE_DESC "QLogic 579xx RoCE HCA"
-#define DP_NAME(dev) ((dev)->ibdev.name)
+#define DP_NAME(_dev) dev_name(&(_dev)->ibdev.dev)
 #define IS_IWARP(_dev) ((_dev)->rdma_type == QED_RDMA_TYPE_IWARP)
 #define IS_ROCE(_dev) ((_dev)->rdma_type == QED_RDMA_TYPE_ROCE)
 
@@ -123,11 +123,6 @@
 
 #define QEDR_ENET_STATE_BIT	(0)
 
-struct qedr_idr {
-	spinlock_t idr_lock; /* Protect idr data-structure */
-	struct idr idr;
-};
-
 struct qedr_dev {
 	struct ib_device	ibdev;
 	struct qed_dev		*cdev;
@@ -162,6 +157,8 @@
 	u32			dp_module;
 	u8			dp_level;
 	u8			num_hwfns;
+#define QEDR_IS_CMT(dev)        ((dev)->num_hwfns > 1)
+	u8			affin_hwfn_idx;
 	u8			gsi_ll2_handle;
 
 	uint			wq_multiplier;
@@ -171,8 +168,8 @@
 	struct qedr_cq		*gsi_rqcq;
 	struct qedr_qp		*gsi_qp;
 	enum qed_rdma_type	rdma_type;
-	struct qedr_idr		qpidr;
-	struct qedr_idr		srqidr;
+	struct xarray		qps;
+	struct xarray		srqs;
 	struct workqueue_struct *iwarp_wq;
 	u16			iwarp_max_mtu;
 
@@ -232,7 +229,7 @@
 	struct ib_ucontext ibucontext;
 	struct qedr_dev *dev;
 	struct qedr_pd *pd;
-	u64 dpi_addr;
+	void __iomem *dpi_addr;
 	u64 dpi_phys_addr;
 	u32 dpi_size;
 	u16 dpi;
diff --git a/drivers/infiniband/hw/qedr/qedr_iw_cm.c b/drivers/infiniband/hw/qedr/qedr_iw_cm.c
index 505fa36..22881d4 100644
--- a/drivers/infiniband/hw/qedr/qedr_iw_cm.c
+++ b/drivers/infiniband/hw/qedr/qedr_iw_cm.c
@@ -349,7 +349,7 @@
 	default:
 		DP_NOTICE(dev, "Unknown event received %d\n", params->event);
 		break;
-	};
+	}
 	return 0;
 }
 
@@ -491,7 +491,9 @@
 	int rc = 0;
 	int i;
 
-	qp = idr_find(&dev->qpidr.idr, conn_param->qpn);
+	qp = xa_load(&dev->qps, conn_param->qpn);
+	if (unlikely(!qp))
+		return -EINVAL;
 
 	laddr = (struct sockaddr_in *)&cm_id->m_local_addr;
 	raddr = (struct sockaddr_in *)&cm_id->m_remote_addr;
@@ -679,7 +681,7 @@
 
 	DP_DEBUG(dev, QEDR_MSG_IWARP, "Accept on qpid=%d\n", conn_param->qpn);
 
-	qp = idr_find(&dev->qpidr.idr, conn_param->qpn);
+	qp = xa_load(&dev->qps, conn_param->qpn);
 	if (!qp) {
 		DP_ERR(dev, "Invalid QP number %d\n", conn_param->qpn);
 		return -EINVAL;
@@ -737,9 +739,7 @@
 	struct qedr_qp *qp = get_qedr_qp(ibqp);
 
 	if (atomic_dec_and_test(&qp->refcnt)) {
-		spin_lock_irq(&qp->dev->qpidr.idr_lock);
-		idr_remove(&qp->dev->qpidr.idr, qp->qp_id);
-		spin_unlock_irq(&qp->dev->qpidr.idr_lock);
+		xa_erase_irq(&qp->dev->qps, qp->qp_id);
 		kfree(qp);
 	}
 }
@@ -748,5 +748,5 @@
 {
 	struct qedr_dev *dev = get_qedr_dev(ibdev);
 
-	return idr_find(&dev->qpidr.idr, qpn);
+	return xa_load(&dev->qps, qpn);
 }
diff --git a/drivers/infiniband/hw/qedr/qedr_roce_cm.c b/drivers/infiniband/hw/qedr/qedr_roce_cm.c
index 8557888..f5542d7 100644
--- a/drivers/infiniband/hw/qedr/qedr_roce_cm.c
+++ b/drivers/infiniband/hw/qedr/qedr_roce_cm.c
@@ -397,14 +397,17 @@
 	bool has_udp = false;
 	int i;
 
+	rc = rdma_read_gid_l2_fields(sgid_attr, &vlan_id, NULL);
+	if (rc)
+		return rc;
+
+	if (vlan_id < VLAN_CFI_MASK)
+		has_vlan = true;
+
 	send_size = 0;
 	for (i = 0; i < swr->num_sge; ++i)
 		send_size += swr->sg_list[i].length;
 
-	vlan_id = rdma_vlan_dev_vlan_id(sgid_attr->ndev);
-	if (vlan_id < VLAN_CFI_MASK)
-		has_vlan = true;
-
 	has_udp = (sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP);
 	if (!has_udp) {
 		/* RoCE v1 */
@@ -519,9 +522,9 @@
 	}
 
 	if (ether_addr_equal(udh.eth.smac_h, udh.eth.dmac_h))
-		packet->tx_dest = QED_ROCE_LL2_TX_DEST_LB;
+		packet->tx_dest = QED_LL2_TX_DEST_LB;
 	else
-		packet->tx_dest = QED_ROCE_LL2_TX_DEST_NW;
+		packet->tx_dest = QED_LL2_TX_DEST_NW;
 
 	packet->roce_mode = roce_mode;
 	memcpy(packet->header.vaddr, ud_header_buffer, header_size);
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index 8cc3df2..6f3ce86 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -42,6 +42,7 @@
 #include <rdma/ib_umem.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_cache.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include <linux/qed/common_hsi.h>
 #include "qedr_hsi_rdma.h"
@@ -67,7 +68,7 @@
 
 int qedr_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
 {
-	if (index > QEDR_ROCE_PKEY_TABLE_LEN)
+	if (index >= QEDR_ROCE_PKEY_TABLE_LEN)
 		return -EINVAL;
 
 	*pkey = QEDR_ROCE_PKEY_DEFAULT;
@@ -158,54 +159,47 @@
 	return 0;
 }
 
-#define QEDR_SPEED_SDR		(1)
-#define QEDR_SPEED_DDR		(2)
-#define QEDR_SPEED_QDR		(4)
-#define QEDR_SPEED_FDR10	(8)
-#define QEDR_SPEED_FDR		(16)
-#define QEDR_SPEED_EDR		(32)
-
 static inline void get_link_speed_and_width(int speed, u8 *ib_speed,
 					    u8 *ib_width)
 {
 	switch (speed) {
 	case 1000:
-		*ib_speed = QEDR_SPEED_SDR;
+		*ib_speed = IB_SPEED_SDR;
 		*ib_width = IB_WIDTH_1X;
 		break;
 	case 10000:
-		*ib_speed = QEDR_SPEED_QDR;
+		*ib_speed = IB_SPEED_QDR;
 		*ib_width = IB_WIDTH_1X;
 		break;
 
 	case 20000:
-		*ib_speed = QEDR_SPEED_DDR;
+		*ib_speed = IB_SPEED_DDR;
 		*ib_width = IB_WIDTH_4X;
 		break;
 
 	case 25000:
-		*ib_speed = QEDR_SPEED_EDR;
+		*ib_speed = IB_SPEED_EDR;
 		*ib_width = IB_WIDTH_1X;
 		break;
 
 	case 40000:
-		*ib_speed = QEDR_SPEED_QDR;
+		*ib_speed = IB_SPEED_QDR;
 		*ib_width = IB_WIDTH_4X;
 		break;
 
 	case 50000:
-		*ib_speed = QEDR_SPEED_QDR;
-		*ib_width = IB_WIDTH_4X;
+		*ib_speed = IB_SPEED_HDR;
+		*ib_width = IB_WIDTH_1X;
 		break;
 
 	case 100000:
-		*ib_speed = QEDR_SPEED_EDR;
+		*ib_speed = IB_SPEED_EDR;
 		*ib_width = IB_WIDTH_4X;
 		break;
 
 	default:
 		/* Unsupported */
-		*ib_speed = QEDR_SPEED_SDR;
+		*ib_speed = IB_SPEED_SDR;
 		*ib_width = IB_WIDTH_1X;
 	}
 }
@@ -216,10 +210,6 @@
 	struct qed_rdma_port *rdma_port;
 
 	dev = get_qedr_dev(ibdev);
-	if (port > 1) {
-		DP_ERR(dev, "invalid_port=0x%x\n", port);
-		return -EINVAL;
-	}
 
 	if (!dev->rdma_ctx) {
 		DP_ERR(dev, "rdma_ctx is NULL\n");
@@ -231,10 +221,10 @@
 	/* *attr being zeroed by the caller, avoid zeroing it here */
 	if (rdma_port->port_state == QED_RDMA_PORT_UP) {
 		attr->state = IB_PORT_ACTIVE;
-		attr->phys_state = 5;
+		attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
 	} else {
 		attr->state = IB_PORT_DOWN;
-		attr->phys_state = 3;
+		attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
 	}
 	attr->max_mtu = IB_MTU_4096;
 	attr->active_mtu = iboe_get_mtu(dev->ndev->mtu);
@@ -263,14 +253,6 @@
 int qedr_modify_port(struct ib_device *ibdev, u8 port, int mask,
 		     struct ib_port_modify *props)
 {
-	struct qedr_dev *dev;
-
-	dev = get_qedr_dev(ibdev);
-	if (port > 1) {
-		DP_ERR(dev, "invalid_port=0x%x\n", port);
-		return -EINVAL;
-	}
-
 	return 0;
 }
 
@@ -328,28 +310,24 @@
 	return found;
 }
 
-struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *ibdev,
-					struct ib_udata *udata)
+int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 {
+	struct ib_device *ibdev = uctx->device;
 	int rc;
-	struct qedr_ucontext *ctx;
-	struct qedr_alloc_ucontext_resp uresp;
+	struct qedr_ucontext *ctx = get_qedr_ucontext(uctx);
+	struct qedr_alloc_ucontext_resp uresp = {};
 	struct qedr_dev *dev = get_qedr_dev(ibdev);
 	struct qed_rdma_add_user_out_params oparams;
 
 	if (!udata)
-		return ERR_PTR(-EFAULT);
-
-	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
-	if (!ctx)
-		return ERR_PTR(-ENOMEM);
+		return -EFAULT;
 
 	rc = dev->ops->rdma_add_user(dev->rdma_ctx, &oparams);
 	if (rc) {
 		DP_ERR(dev,
 		       "failed to allocate a DPI for a new RoCE application, rc=%d. To overcome this consider to increase the number of DPIs, increase the doorbell BAR size or just close unnecessary RoCE applications. In order to increase the number of DPIs consult the qedr readme\n",
 		       rc);
-		goto err;
+		return rc;
 	}
 
 	ctx->dpi = oparams.dpi;
@@ -359,8 +337,6 @@
 	INIT_LIST_HEAD(&ctx->mm_head);
 	mutex_init(&ctx->mm_list_lock);
 
-	memset(&uresp, 0, sizeof(uresp));
-
 	uresp.dpm_enabled = dev->user_dpm_enabled;
 	uresp.wids_enabled = 1;
 	uresp.wid_count = oparams.wid_count;
@@ -376,28 +352,23 @@
 
 	rc = qedr_ib_copy_to_udata(udata, &uresp, sizeof(uresp));
 	if (rc)
-		goto err;
+		return rc;
 
 	ctx->dev = dev;
 
 	rc = qedr_add_mmap(ctx, ctx->dpi_phys_addr, ctx->dpi_size);
 	if (rc)
-		goto err;
+		return rc;
 
 	DP_DEBUG(dev, QEDR_MSG_INIT, "Allocating user context %p\n",
 		 &ctx->ibucontext);
-	return &ctx->ibucontext;
-
-err:
-	kfree(ctx);
-	return ERR_PTR(rc);
+	return 0;
 }
 
-int qedr_dealloc_ucontext(struct ib_ucontext *ibctx)
+void qedr_dealloc_ucontext(struct ib_ucontext *ibctx)
 {
 	struct qedr_ucontext *uctx = get_qedr_ucontext(ibctx);
 	struct qedr_mm *mm, *tmp;
-	int status = 0;
 
 	DP_DEBUG(uctx->dev, QEDR_MSG_INIT, "Deallocating user context %p\n",
 		 uctx);
@@ -410,9 +381,6 @@
 		list_del(&mm->entry);
 		kfree(mm);
 	}
-
-	kfree(uctx);
-	return status;
 }
 
 int qedr_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
@@ -462,71 +430,56 @@
 				  vma->vm_page_prot);
 }
 
-struct ib_pd *qedr_alloc_pd(struct ib_device *ibdev,
-			    struct ib_ucontext *context, struct ib_udata *udata)
+int qedr_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibpd->device;
 	struct qedr_dev *dev = get_qedr_dev(ibdev);
-	struct qedr_pd *pd;
+	struct qedr_pd *pd = get_qedr_pd(ibpd);
 	u16 pd_id;
 	int rc;
 
 	DP_DEBUG(dev, QEDR_MSG_INIT, "Function called from: %s\n",
-		 (udata && context) ? "User Lib" : "Kernel");
+		 udata ? "User Lib" : "Kernel");
 
 	if (!dev->rdma_ctx) {
 		DP_ERR(dev, "invalid RDMA context\n");
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
-	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd)
-		return ERR_PTR(-ENOMEM);
-
 	rc = dev->ops->rdma_alloc_pd(dev->rdma_ctx, &pd_id);
 	if (rc)
-		goto err;
+		return rc;
 
 	pd->pd_id = pd_id;
 
-	if (udata && context) {
+	if (udata) {
 		struct qedr_alloc_pd_uresp uresp = {
 			.pd_id = pd_id,
 		};
+		struct qedr_ucontext *context = rdma_udata_to_drv_context(
+			udata, struct qedr_ucontext, ibucontext);
 
 		rc = qedr_ib_copy_to_udata(udata, &uresp, sizeof(uresp));
 		if (rc) {
 			DP_ERR(dev, "copy error pd_id=0x%x.\n", pd_id);
 			dev->ops->rdma_dealloc_pd(dev->rdma_ctx, pd_id);
-			goto err;
+			return rc;
 		}
 
-		pd->uctx = get_qedr_ucontext(context);
+		pd->uctx = context;
 		pd->uctx->pd = pd;
 	}
 
-	return &pd->ibpd;
-
-err:
-	kfree(pd);
-	return ERR_PTR(rc);
+	return 0;
 }
 
-int qedr_dealloc_pd(struct ib_pd *ibpd)
+void qedr_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
 	struct qedr_dev *dev = get_qedr_dev(ibpd->device);
 	struct qedr_pd *pd = get_qedr_pd(ibpd);
 
-	if (!pd) {
-		pr_err("Invalid PD received in dealloc_pd\n");
-		return -EINVAL;
-	}
-
 	DP_DEBUG(dev, QEDR_MSG_INIT, "Deallocating PD %d\n", pd->pd_id);
 	dev->ops->rdma_dealloc_pd(dev->rdma_ctx, pd->pd_id);
-
-	kfree(pd);
-
-	return 0;
 }
 
 static void qedr_free_pbl(struct qedr_dev *dev,
@@ -568,8 +521,8 @@
 		return ERR_PTR(-ENOMEM);
 
 	for (i = 0; i < pbl_info->num_pbls; i++) {
-		va = dma_zalloc_coherent(&pdev->dev, pbl_info->pbl_size,
-					 &pa, flags);
+		va = dma_alloc_coherent(&pdev->dev, pbl_info->pbl_size, &pa,
+					flags);
 		if (!va)
 			goto err;
 
@@ -648,13 +601,12 @@
 			       struct qedr_pbl *pbl,
 			       struct qedr_pbl_info *pbl_info, u32 pg_shift)
 {
-	int shift, pg_cnt, pages, pbe_cnt, total_num_pbes = 0;
+	int pbe_cnt, total_num_pbes = 0;
 	u32 fw_pg_cnt, fw_pg_per_umem_pg;
 	struct qedr_pbl *pbl_tbl;
-	struct scatterlist *sg;
+	struct sg_dma_page_iter sg_iter;
 	struct regpair *pbe;
 	u64 pg_addr;
-	int entry;
 
 	if (!pbl_info->num_pbes)
 		return;
@@ -675,38 +627,32 @@
 
 	pbe_cnt = 0;
 
-	shift = umem->page_shift;
+	fw_pg_per_umem_pg = BIT(PAGE_SHIFT - pg_shift);
 
-	fw_pg_per_umem_pg = BIT(umem->page_shift - pg_shift);
+	for_each_sg_dma_page (umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+		pg_addr = sg_page_iter_dma_address(&sg_iter);
+		for (fw_pg_cnt = 0; fw_pg_cnt < fw_pg_per_umem_pg;) {
+			pbe->lo = cpu_to_le32(pg_addr);
+			pbe->hi = cpu_to_le32(upper_32_bits(pg_addr));
 
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		pages = sg_dma_len(sg) >> shift;
-		pg_addr = sg_dma_address(sg);
-		for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) {
-			for (fw_pg_cnt = 0; fw_pg_cnt < fw_pg_per_umem_pg;) {
-				pbe->lo = cpu_to_le32(pg_addr);
-				pbe->hi = cpu_to_le32(upper_32_bits(pg_addr));
+			pg_addr += BIT(pg_shift);
+			pbe_cnt++;
+			total_num_pbes++;
+			pbe++;
 
-				pg_addr += BIT(pg_shift);
-				pbe_cnt++;
-				total_num_pbes++;
-				pbe++;
+			if (total_num_pbes == pbl_info->num_pbes)
+				return;
 
-				if (total_num_pbes == pbl_info->num_pbes)
-					return;
-
-				/* If the given pbl is full storing the pbes,
-				 * move to next pbl.
-				 */
-				if (pbe_cnt ==
-				    (pbl_info->pbl_size / sizeof(u64))) {
-					pbl_tbl++;
-					pbe = (struct regpair *)pbl_tbl->va;
-					pbe_cnt = 0;
-				}
-
-				fw_pg_cnt++;
+			/* If the given pbl is full storing the pbes,
+			 * move to next pbl.
+			 */
+			if (pbe_cnt == (pbl_info->pbl_size / sizeof(u64))) {
+				pbl_tbl++;
+				pbe = (struct regpair *)pbl_tbl->va;
+				pbe_cnt = 0;
 			}
+
+			fw_pg_cnt++;
 		}
 	}
 }
@@ -748,11 +694,10 @@
 	return aligned_size / QEDR_CQE_SIZE;
 }
 
-static inline int qedr_init_user_queue(struct ib_ucontext *ib_ctx,
+static inline int qedr_init_user_queue(struct ib_udata *udata,
 				       struct qedr_dev *dev,
-				       struct qedr_userq *q,
-				       u64 buf_addr, size_t buf_len,
-				       int access, int dmasync,
+				       struct qedr_userq *q, u64 buf_addr,
+				       size_t buf_len, int access, int dmasync,
 				       int alloc_and_init)
 {
 	u32 fw_pages;
@@ -760,7 +705,7 @@
 
 	q->buf_addr = buf_addr;
 	q->buf_len = buf_len;
-	q->umem = ib_umem_get(ib_ctx, q->buf_addr, q->buf_len, access, dmasync);
+	q->umem = ib_umem_get(udata, q->buf_addr, q->buf_len, access, dmasync);
 	if (IS_ERR(q->umem)) {
 		DP_ERR(dev, "create user queue: failed ib_umem_get, got %ld\n",
 		       PTR_ERR(q->umem));
@@ -768,7 +713,7 @@
 	}
 
 	fw_pages = ib_umem_page_count(q->umem) <<
-	    (q->umem->page_shift - FW_PAGE_SHIFT);
+	    (PAGE_SHIFT - FW_PAGE_SHIFT);
 
 	rc = qedr_prepare_pbl_tbl(dev, &q->pbl_info, fw_pages, 0);
 	if (rc)
@@ -823,9 +768,6 @@
 	cq->db.data.agg_flags = flags;
 	cq->db.data.value = cpu_to_le32(cons);
 	writeq(cq->db.raw, cq->db_addr);
-
-	/* Make sure write would stick */
-	mmiowb();
 }
 
 int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
@@ -864,19 +806,20 @@
 	return 0;
 }
 
-struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
-			     const struct ib_cq_init_attr *attr,
-			     struct ib_ucontext *ib_ctx, struct ib_udata *udata)
+int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		   struct ib_udata *udata)
 {
-	struct qedr_ucontext *ctx = get_qedr_ucontext(ib_ctx);
+	struct ib_device *ibdev = ibcq->device;
+	struct qedr_ucontext *ctx = rdma_udata_to_drv_context(
+		udata, struct qedr_ucontext, ibucontext);
 	struct qed_rdma_destroy_cq_out_params destroy_oparams;
 	struct qed_rdma_destroy_cq_in_params destroy_iparams;
 	struct qedr_dev *dev = get_qedr_dev(ibdev);
 	struct qed_rdma_create_cq_in_params params;
-	struct qedr_create_cq_ureq ureq;
+	struct qedr_create_cq_ureq ureq = {};
 	int vector = attr->comp_vector;
 	int entries = attr->cqe;
-	struct qedr_cq *cq;
+	struct qedr_cq *cq = get_qedr_cq(ibcq);
 	int chain_entries;
 	int page_cnt;
 	u64 pbl_ptr;
@@ -891,18 +834,13 @@
 		DP_ERR(dev,
 		       "create cq: the number of entries %d is too high. Must be equal or below %d.\n",
 		       entries, QEDR_MAX_CQES);
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
 	chain_entries = qedr_align_cq_entries(entries);
 	chain_entries = min_t(int, chain_entries, QEDR_MAX_CQES);
 
-	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-	if (!cq)
-		return ERR_PTR(-ENOMEM);
-
 	if (udata) {
-		memset(&ureq, 0, sizeof(ureq));
 		if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) {
 			DP_ERR(dev,
 			       "create cq: problem copying data from user space\n");
@@ -917,9 +855,9 @@
 
 		cq->cq_type = QEDR_CQ_TYPE_USER;
 
-		rc = qedr_init_user_queue(ib_ctx, dev, &cq->q, ureq.addr,
-					  ureq.len, IB_ACCESS_LOCAL_WRITE,
-					  1, 1);
+		rc = qedr_init_user_queue(udata, dev, &cq->q, ureq.addr,
+					  ureq.len, IB_ACCESS_LOCAL_WRITE, 1,
+					  1);
 		if (rc)
 			goto err0;
 
@@ -956,7 +894,7 @@
 	cq->sig = QEDR_CQ_MAGIC_NUMBER;
 	spin_lock_init(&cq->cq_lock);
 
-	if (ib_ctx) {
+	if (udata) {
 		rc = qedr_copy_cq_uresp(dev, cq, udata);
 		if (rc)
 			goto err3;
@@ -980,7 +918,7 @@
 		 "create cq: icid=0x%0x, addr=%p, size(entries)=0x%0x\n",
 		 cq->icid, cq, params.cq_size);
 
-	return &cq->ibcq;
+	return 0;
 
 err3:
 	destroy_iparams.icid = cq->icid;
@@ -995,8 +933,7 @@
 	if (udata)
 		ib_umem_release(cq->q.umem);
 err0:
-	kfree(cq);
-	return ERR_PTR(-EINVAL);
+	return -EINVAL;
 }
 
 int qedr_resize_cq(struct ib_cq *ibcq, int new_cnt, struct ib_udata *udata)
@@ -1012,14 +949,13 @@
 #define QEDR_DESTROY_CQ_MAX_ITERATIONS		(10)
 #define QEDR_DESTROY_CQ_ITER_DURATION		(10)
 
-int qedr_destroy_cq(struct ib_cq *ibcq)
+void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
 	struct qedr_dev *dev = get_qedr_dev(ibcq->device);
 	struct qed_rdma_destroy_cq_out_params oparams;
 	struct qed_rdma_destroy_cq_in_params iparams;
 	struct qedr_cq *cq = get_qedr_cq(ibcq);
 	int iter;
-	int rc;
 
 	DP_DEBUG(dev, QEDR_MSG_CQ, "destroy cq %p (icid=%d)\n", cq, cq->icid);
 
@@ -1027,16 +963,13 @@
 
 	/* GSIs CQs are handled by driver, so they don't exist in the FW */
 	if (cq->cq_type == QEDR_CQ_TYPE_GSI)
-		goto done;
+		return;
 
 	iparams.icid = cq->icid;
-	rc = dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams);
-	if (rc)
-		return rc;
-
+	dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams);
 	dev->ops->common->chain_free(dev->cdev, &cq->pbl);
 
-	if (ibcq->uobject && ibcq->uobject->context) {
+	if (udata) {
 		qedr_free_pbl(dev, &cq->q.pbl_info, cq->q.pbl_tbl);
 		ib_umem_release(cq->q.umem);
 	}
@@ -1064,27 +997,11 @@
 		iter--;
 	}
 
-	if (oparams.num_cq_notif != cq->cnq_notif)
-		goto err;
-
 	/* Note that we don't need to have explicit code to wait for the
 	 * completion of the event handler because it is invoked from the EQ.
 	 * Since the destroy CQ ramrod has also been received on the EQ we can
 	 * be certain that there's no event handler in process.
 	 */
-done:
-	cq->sig = ~cq->sig;
-
-	kfree(cq);
-
-	return 0;
-
-err:
-	DP_ERR(dev,
-	       "CQ %p (icid=%d) not freed, expecting %d ints but got %d ints\n",
-	       cq, cq->icid, oparams.num_cq_notif, cq->cnq_notif);
-
-	return -EINVAL;
 }
 
 static inline int get_gid_info_from_table(struct ib_qp *ibqp,
@@ -1097,10 +1014,13 @@
 	enum rdma_network_type nw_type;
 	const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr);
 	u32 ipv4_addr;
+	int ret;
 	int i;
 
 	gid_attr = grh->sgid_attr;
-	qp_params->vlan_id = rdma_vlan_dev_vlan_id(gid_attr->ndev);
+	ret = rdma_read_gid_l2_fields(gid_attr, &qp_params->vlan_id, NULL);
+	if (ret)
+		return ret;
 
 	nw_type = rdma_gid_attr_network_type(gid_attr);
 	switch (nw_type) {
@@ -1148,7 +1068,8 @@
 }
 
 static int qedr_check_qp_attrs(struct ib_pd *ibpd, struct qedr_dev *dev,
-			       struct ib_qp_init_attr *attrs)
+			       struct ib_qp_init_attr *attrs,
+			       struct ib_udata *udata)
 {
 	struct qedr_device_attr *qattr = &dev->attr;
 
@@ -1189,7 +1110,7 @@
 	}
 
 	/* Unprivileged user space cannot create special QP */
-	if (ibpd->uobject && attrs->qp_type == IB_QPT_GSI) {
+	if (udata && attrs->qp_type == IB_QPT_GSI) {
 		DP_ERR(dev,
 		       "create qp: userspace can't create special QPs of type=0x%x\n",
 		       attrs->qp_type);
@@ -1313,7 +1234,7 @@
 	}
 }
 
-static int qedr_check_srq_params(struct ib_pd *ibpd, struct qedr_dev *dev,
+static int qedr_check_srq_params(struct qedr_dev *dev,
 				 struct ib_srq_init_attr *attrs,
 				 struct ib_udata *udata)
 {
@@ -1355,7 +1276,7 @@
 			  hw_srq->phy_prod_pair_addr);
 }
 
-static int qedr_init_srq_user_params(struct ib_ucontext *ib_ctx,
+static int qedr_init_srq_user_params(struct ib_udata *udata,
 				     struct qedr_srq *srq,
 				     struct qedr_create_srq_ureq *ureq,
 				     int access, int dmasync)
@@ -1363,14 +1284,14 @@
 	struct scatterlist *sg;
 	int rc;
 
-	rc = qedr_init_user_queue(ib_ctx, srq->dev, &srq->usrq, ureq->srq_addr,
+	rc = qedr_init_user_queue(udata, srq->dev, &srq->usrq, ureq->srq_addr,
 				  ureq->srq_len, access, dmasync, 1);
 	if (rc)
 		return rc;
 
-	srq->prod_umem = ib_umem_get(ib_ctx, ureq->prod_pair_addr,
-				     sizeof(struct rdma_srq_producers),
-				     access, dmasync);
+	srq->prod_umem =
+		ib_umem_get(udata, ureq->prod_pair_addr,
+			    sizeof(struct rdma_srq_producers), access, dmasync);
 	if (IS_ERR(srq->prod_umem)) {
 		qedr_free_pbl(srq->dev, &srq->usrq.pbl_info, srq->usrq.pbl_tbl);
 		ib_umem_release(srq->usrq.umem);
@@ -1429,40 +1350,28 @@
 	return rc;
 }
 
-static int qedr_idr_add(struct qedr_dev *dev, struct qedr_idr *qidr,
-			void *ptr, u32 id);
-static void qedr_idr_remove(struct qedr_dev *dev,
-			    struct qedr_idr *qidr, u32 id);
-
-struct ib_srq *qedr_create_srq(struct ib_pd *ibpd,
-			       struct ib_srq_init_attr *init_attr,
-			       struct ib_udata *udata)
+int qedr_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init_attr,
+		    struct ib_udata *udata)
 {
 	struct qed_rdma_destroy_srq_in_params destroy_in_params;
 	struct qed_rdma_create_srq_in_params in_params = {};
-	struct qedr_dev *dev = get_qedr_dev(ibpd->device);
+	struct qedr_dev *dev = get_qedr_dev(ibsrq->device);
 	struct qed_rdma_create_srq_out_params out_params;
-	struct qedr_pd *pd = get_qedr_pd(ibpd);
+	struct qedr_pd *pd = get_qedr_pd(ibsrq->pd);
 	struct qedr_create_srq_ureq ureq = {};
 	u64 pbl_base_addr, phy_prod_pair_addr;
-	struct ib_ucontext *ib_ctx = NULL;
 	struct qedr_srq_hwq_info *hw_srq;
-	struct qedr_ucontext *ctx = NULL;
 	u32 page_cnt, page_size;
-	struct qedr_srq *srq;
+	struct qedr_srq *srq = get_qedr_srq(ibsrq);
 	int rc = 0;
 
 	DP_DEBUG(dev, QEDR_MSG_QP,
 		 "create SRQ called from %s (pd %p)\n",
 		 (udata) ? "User lib" : "kernel", pd);
 
-	rc = qedr_check_srq_params(ibpd, dev, init_attr, udata);
+	rc = qedr_check_srq_params(dev, init_attr, udata);
 	if (rc)
-		return ERR_PTR(-EINVAL);
-
-	srq = kzalloc(sizeof(*srq), GFP_KERNEL);
-	if (!srq)
-		return ERR_PTR(-ENOMEM);
+		return -EINVAL;
 
 	srq->dev = dev;
 	hw_srq = &srq->hw_srq;
@@ -1471,24 +1380,21 @@
 	hw_srq->max_wr = init_attr->attr.max_wr;
 	hw_srq->max_sges = init_attr->attr.max_sge;
 
-	if (udata && ibpd->uobject && ibpd->uobject->context) {
-		ib_ctx = ibpd->uobject->context;
-		ctx = get_qedr_ucontext(ib_ctx);
-
+	if (udata) {
 		if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) {
 			DP_ERR(dev,
 			       "create srq: problem copying data from user space\n");
 			goto err0;
 		}
 
-		rc = qedr_init_srq_user_params(ib_ctx, srq, &ureq, 0, 0);
+		rc = qedr_init_srq_user_params(udata, srq, &ureq, 0, 0);
 		if (rc)
 			goto err0;
 
 		page_cnt = srq->usrq.pbl_info.num_pbes;
 		pbl_base_addr = srq->usrq.pbl_tbl->pa;
 		phy_prod_pair_addr = hw_srq->phy_prod_pair_addr;
-		page_size = BIT(srq->usrq.umem->page_shift);
+		page_size = PAGE_SIZE;
 	} else {
 		struct qed_chain *pbl;
 
@@ -1521,13 +1427,13 @@
 			goto err2;
 	}
 
-	rc = qedr_idr_add(dev, &dev->srqidr, srq, srq->srq_id);
+	rc = xa_insert_irq(&dev->srqs, srq->srq_id, srq, GFP_KERNEL);
 	if (rc)
 		goto err2;
 
 	DP_DEBUG(dev, QEDR_MSG_SRQ,
 		 "create srq: created srq with srq_id=0x%0x\n", srq->srq_id);
-	return &srq->ibsrq;
+	return 0;
 
 err2:
 	destroy_in_params.srq_id = srq->srq_id;
@@ -1539,22 +1445,20 @@
 	else
 		qedr_free_srq_kernel_params(srq);
 err0:
-	kfree(srq);
-
-	return ERR_PTR(-EFAULT);
+	return -EFAULT;
 }
 
-int qedr_destroy_srq(struct ib_srq *ibsrq)
+void qedr_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 {
 	struct qed_rdma_destroy_srq_in_params in_params = {};
 	struct qedr_dev *dev = get_qedr_dev(ibsrq->device);
 	struct qedr_srq *srq = get_qedr_srq(ibsrq);
 
-	qedr_idr_remove(dev, &dev->srqidr, srq->srq_id);
+	xa_erase_irq(&dev->srqs, srq->srq_id);
 	in_params.srq_id = srq->srq_id;
 	dev->ops->rdma_destroy_srq(dev->rdma_ctx, &in_params);
 
-	if (ibsrq->pd->uobject)
+	if (ibsrq->uobject)
 		qedr_free_srq_user_params(srq);
 	else
 		qedr_free_srq_kernel_params(srq);
@@ -1562,9 +1466,6 @@
 	DP_DEBUG(dev, QEDR_MSG_SRQ,
 		 "destroy srq: destroyed srq with srq_id=0x%0x\n",
 		 srq->srq_id);
-	kfree(srq);
-
-	return 0;
 }
 
 int qedr_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
@@ -1650,29 +1551,6 @@
 		 qp->usq.buf_len, qp->urq.buf_addr, qp->urq.buf_len);
 }
 
-static int qedr_idr_add(struct qedr_dev *dev, struct qedr_idr *qidr,
-			void *ptr, u32 id)
-{
-	int rc;
-
-	idr_preload(GFP_KERNEL);
-	spin_lock_irq(&qidr->idr_lock);
-
-	rc = idr_alloc(&qidr->idr, ptr, id, id + 1, GFP_ATOMIC);
-
-	spin_unlock_irq(&qidr->idr_lock);
-	idr_preload_end();
-
-	return rc < 0 ? rc : 0;
-}
-
-static void qedr_idr_remove(struct qedr_dev *dev, struct qedr_idr *qidr, u32 id)
-{
-	spin_lock_irq(&qidr->idr_lock);
-	idr_remove(&qidr->idr, id);
-	spin_unlock_irq(&qidr->idr_lock);
-}
-
 static inline void
 qedr_iwarp_populate_user_qp(struct qedr_dev *dev,
 			    struct qedr_qp *qp,
@@ -1694,12 +1572,10 @@
 
 static void qedr_cleanup_user(struct qedr_dev *dev, struct qedr_qp *qp)
 {
-	if (qp->usq.umem)
-		ib_umem_release(qp->usq.umem);
+	ib_umem_release(qp->usq.umem);
 	qp->usq.umem = NULL;
 
-	if (qp->urq.umem)
-		ib_umem_release(qp->urq.umem);
+	ib_umem_release(qp->urq.umem);
 	qp->urq.umem = NULL;
 }
 
@@ -1712,13 +1588,10 @@
 	struct qed_rdma_create_qp_in_params in_params;
 	struct qed_rdma_create_qp_out_params out_params;
 	struct qedr_pd *pd = get_qedr_pd(ibpd);
-	struct ib_ucontext *ib_ctx = NULL;
 	struct qedr_create_qp_ureq ureq;
 	int alloc_and_init = rdma_protocol_roce(&dev->ibdev, 1);
 	int rc = -EINVAL;
 
-	ib_ctx = ibpd->uobject->context;
-
 	memset(&ureq, 0, sizeof(ureq));
 	rc = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
 	if (rc) {
@@ -1727,14 +1600,14 @@
 	}
 
 	/* SQ - read access only (0), dma sync not required (0) */
-	rc = qedr_init_user_queue(ib_ctx, dev, &qp->usq, ureq.sq_addr,
+	rc = qedr_init_user_queue(udata, dev, &qp->usq, ureq.sq_addr,
 				  ureq.sq_len, 0, 0, alloc_and_init);
 	if (rc)
 		return rc;
 
 	if (!qp->srq) {
 		/* RQ - read access only (0), dma sync not required (0) */
-		rc = qedr_init_user_queue(ib_ctx, dev, &qp->urq, ureq.rq_addr,
+		rc = qedr_init_user_queue(udata, dev, &qp->urq, ureq.rq_addr,
 					  ureq.rq_len, 0, 0, alloc_and_init);
 		if (rc)
 			return rc;
@@ -2007,7 +1880,7 @@
 	DP_DEBUG(dev, QEDR_MSG_QP, "create qp: called from %s, pd=%p\n",
 		 udata ? "user library" : "kernel", pd);
 
-	rc = qedr_check_qp_attrs(ibpd, dev, attrs);
+	rc = qedr_check_qp_attrs(ibpd, dev, attrs, udata);
 	if (rc)
 		return ERR_PTR(rc);
 
@@ -2045,7 +1918,7 @@
 	qp->ibqp.qp_num = qp->qp_id;
 
 	if (rdma_protocol_iwarp(&dev->ibdev, 1)) {
-		rc = qedr_idr_add(dev, &dev->qpidr, qp, qp->qp_id);
+		rc = xa_insert_irq(&dev->qps, qp->qp_id, qp, GFP_KERNEL);
 		if (rc)
 			goto err;
 	}
@@ -2130,7 +2003,7 @@
 		default:
 			status = -EINVAL;
 			break;
-		};
+		}
 		break;
 	case QED_ROCE_QP_STATE_INIT:
 		switch (new_state) {
@@ -2141,8 +2014,6 @@
 
 			if (rdma_protocol_roce(&dev->ibdev, 1)) {
 				writel(qp->rq.db_data.raw, qp->rq.db);
-				/* Make sure write takes effect */
-				mmiowb();
 			}
 			break;
 		case QED_ROCE_QP_STATE_ERR:
@@ -2151,7 +2022,7 @@
 			/* Invalid state change. */
 			status = -EINVAL;
 			break;
-		};
+		}
 		break;
 	case QED_ROCE_QP_STATE_RTR:
 		/* RTR->XXX */
@@ -2164,7 +2035,7 @@
 			/* Invalid state change. */
 			status = -EINVAL;
 			break;
-		};
+		}
 		break;
 	case QED_ROCE_QP_STATE_RTS:
 		/* RTS->XXX */
@@ -2177,7 +2048,7 @@
 			/* Invalid state change. */
 			status = -EINVAL;
 			break;
-		};
+		}
 		break;
 	case QED_ROCE_QP_STATE_SQD:
 		/* SQD->XXX */
@@ -2189,7 +2060,7 @@
 			/* Invalid state change. */
 			status = -EINVAL;
 			break;
-		};
+		}
 		break;
 	case QED_ROCE_QP_STATE_ERR:
 		/* ERR->XXX */
@@ -2207,12 +2078,12 @@
 		default:
 			status = -EINVAL;
 			break;
-		};
+		}
 		break;
 	default:
 		status = -EINVAL;
 		break;
-	};
+	}
 
 	return status;
 }
@@ -2240,8 +2111,7 @@
 
 	if (rdma_protocol_roce(&dev->ibdev, 1)) {
 		if (!ib_modify_qp_is_ok(old_qp_state, new_qp_state,
-					ibqp->qp_type, attr_mask,
-					IB_LINK_LAYER_ETHERNET)) {
+					ibqp->qp_type, attr_mask)) {
 			DP_ERR(dev,
 			       "modify qp: invalid attribute mask=0x%x specified for\n"
 			       "qpn=0x%x of type=0x%x old_qp_state=0x%x, new_qp_state=0x%x\n",
@@ -2556,7 +2426,8 @@
 	return rc;
 }
 
-static int qedr_free_qp_resources(struct qedr_dev *dev, struct qedr_qp *qp)
+static int qedr_free_qp_resources(struct qedr_dev *dev, struct qedr_qp *qp,
+				  struct ib_udata *udata)
 {
 	int rc = 0;
 
@@ -2566,7 +2437,7 @@
 			return rc;
 	}
 
-	if (qp->ibqp.uobject && qp->ibqp.uobject->context)
+	if (udata)
 		qedr_cleanup_user(dev, qp);
 	else
 		qedr_cleanup_kernel(dev, qp);
@@ -2574,13 +2445,12 @@
 	return 0;
 }
 
-int qedr_destroy_qp(struct ib_qp *ibqp)
+int qedr_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 {
 	struct qedr_qp *qp = get_qedr_qp(ibqp);
 	struct qedr_dev *dev = qp->dev;
 	struct ib_qp_attr attr;
 	int attr_mask = 0;
-	int rc = 0;
 
 	DP_DEBUG(dev, QEDR_MSG_QP, "destroy qp: destroying %p, qp type=%d\n",
 		 qp, qp->qp_type);
@@ -2618,37 +2488,31 @@
 	if (qp->qp_type == IB_QPT_GSI)
 		qedr_destroy_gsi_qp(dev);
 
-	qedr_free_qp_resources(dev, qp);
+	qedr_free_qp_resources(dev, qp, udata);
 
 	if (atomic_dec_and_test(&qp->refcnt) &&
 	    rdma_protocol_iwarp(&dev->ibdev, 1)) {
-		qedr_idr_remove(dev, &dev->qpidr, qp->qp_id);
+		xa_erase_irq(&dev->qps, qp->qp_id);
 		kfree(qp);
 	}
-	return rc;
+	return 0;
 }
 
-struct ib_ah *qedr_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr,
-			     struct ib_udata *udata)
+int qedr_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr, u32 flags,
+		   struct ib_udata *udata)
 {
-	struct qedr_ah *ah;
-
-	ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
-	if (!ah)
-		return ERR_PTR(-ENOMEM);
+	struct qedr_ah *ah = get_qedr_ah(ibah);
 
 	rdma_copy_ah_attr(&ah->attr, attr);
 
-	return &ah->ibah;
+	return 0;
 }
 
-int qedr_destroy_ah(struct ib_ah *ibah)
+void qedr_destroy_ah(struct ib_ah *ibah, u32 flags)
 {
 	struct qedr_ah *ah = get_qedr_ah(ibah);
 
 	rdma_destroy_ah_attr(&ah->attr);
-	kfree(ah);
-	return 0;
 }
 
 static void free_mr_info(struct qedr_dev *dev, struct mr_info *info)
@@ -2733,7 +2597,7 @@
 
 	mr->type = QEDR_MR_USER;
 
-	mr->umem = ib_umem_get(ibpd->uobject->context, start, len, acc, 0);
+	mr->umem = ib_umem_get(udata, start, len, acc, 0);
 	if (IS_ERR(mr->umem)) {
 		rc = -EFAULT;
 		goto err0;
@@ -2744,7 +2608,7 @@
 		goto err1;
 
 	qedr_populate_pbls(dev, mr->umem, mr->info.pbl_table,
-			   &mr->info.pbl_info, mr->umem->page_shift);
+			   &mr->info.pbl_info, PAGE_SHIFT);
 
 	rc = dev->ops->rdma_alloc_tid(dev->rdma_ctx, &mr->hw_mr.itid);
 	if (rc) {
@@ -2765,7 +2629,7 @@
 	mr->hw_mr.pbl_ptr = mr->info.pbl_table[0].pa;
 	mr->hw_mr.pbl_two_level = mr->info.pbl_info.two_layered;
 	mr->hw_mr.pbl_page_size_log = ilog2(mr->info.pbl_info.pbl_size);
-	mr->hw_mr.page_size_log = mr->umem->page_shift;
+	mr->hw_mr.page_size_log = PAGE_SHIFT;
 	mr->hw_mr.fbo = ib_umem_offset(mr->umem);
 	mr->hw_mr.length = len;
 	mr->hw_mr.vaddr = usr_addr;
@@ -2797,7 +2661,7 @@
 	return ERR_PTR(rc);
 }
 
-int qedr_dereg_mr(struct ib_mr *ib_mr)
+int qedr_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
 {
 	struct qedr_mr *mr = get_qedr_mr(ib_mr);
 	struct qedr_dev *dev = get_qedr_dev(ib_mr->device);
@@ -2813,8 +2677,7 @@
 		qedr_free_pbl(dev, &mr->info.pbl_info, mr->info.pbl_table);
 
 	/* it could be user registered memory. */
-	if (mr->umem)
-		ib_umem_release(mr->umem);
+	ib_umem_release(mr->umem);
 
 	kfree(mr);
 
@@ -2889,8 +2752,8 @@
 	return ERR_PTR(rc);
 }
 
-struct ib_mr *qedr_alloc_mr(struct ib_pd *ibpd,
-			    enum ib_mr_type mr_type, u32 max_num_sg)
+struct ib_mr *qedr_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
+			    u32 max_num_sg, struct ib_udata *udata)
 {
 	struct qedr_mr *mr;
 
@@ -3560,9 +3423,6 @@
 	smp_wmb();
 	writel(qp->sq.db_data.raw, qp->sq.db);
 
-	/* Make sure write sticks */
-	mmiowb();
-
 	spin_unlock_irqrestore(&qp->q_lock, flags);
 
 	return rc;
@@ -3753,12 +3613,8 @@
 
 		writel(qp->rq.db_data.raw, qp->rq.db);
 
-		/* Make sure write sticks */
-		mmiowb();
-
 		if (rdma_protocol_iwarp(&dev->ibdev, 1)) {
 			writel(qp->rq.iwarp_db2_data.raw, qp->rq.iwarp_db2);
-			mmiowb();	/* for second doorbell */
 		}
 
 		wr = wr->next;
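
Note on the qedr conversion above: verbs containers (CQ, SRQ, AH, PD) are now allocated by ib_core rather than kzalloc()/kfree() in the driver, which is why the create callbacks take a pre-zeroed object and return an errno while the destroy callbacks return void. A minimal sketch of how a driver advertises its container sizes (assuming the qedr-private types from qedr.h; the real ops table lives in main.c, outside this hunk):

	#include <rdma/ib_verbs.h>
	#include "qedr.h"	/* struct qedr_srq / qedr_ah, driver-private */

	static const struct ib_device_ops example_ops = {
		.create_srq  = qedr_create_srq,  /* int  (*)(struct ib_srq *, ...) */
		.destroy_srq = qedr_destroy_srq, /* void (*)(struct ib_srq *, ...) */
		/* ib_core allocates container_of()-compatible objects: */
		INIT_RDMA_OBJ_SIZE(ib_srq, qedr_srq, ibsrq),
		INIT_RDMA_OBJ_SIZE(ib_ah, qedr_ah, ibah),
	};
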
diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h
index 0b7d012..9aaa902 100644
--- a/drivers/infiniband/hw/qedr/verbs.h
+++ b/drivers/infiniband/hw/qedr/verbs.h
@@ -43,20 +43,17 @@
 
 int qedr_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey);
 
-struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *, struct ib_udata *);
-int qedr_dealloc_ucontext(struct ib_ucontext *);
+int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
+void qedr_dealloc_ucontext(struct ib_ucontext *uctx);
 
 int qedr_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
-struct ib_pd *qedr_alloc_pd(struct ib_device *,
-			    struct ib_ucontext *, struct ib_udata *);
-int qedr_dealloc_pd(struct ib_pd *pd);
+int qedr_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+void qedr_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 
-struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
-			     const struct ib_cq_init_attr *attr,
-			     struct ib_ucontext *ib_ctx,
-			     struct ib_udata *udata);
+int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		   struct ib_udata *udata);
 int qedr_resize_cq(struct ib_cq *, int cqe, struct ib_udata *);
-int qedr_destroy_cq(struct ib_cq *);
+void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
 struct ib_qp *qedr_create_qp(struct ib_pd *, struct ib_qp_init_attr *attrs,
 			     struct ib_udata *);
@@ -64,22 +61,21 @@
 		   int attr_mask, struct ib_udata *udata);
 int qedr_query_qp(struct ib_qp *, struct ib_qp_attr *qp_attr,
 		  int qp_attr_mask, struct ib_qp_init_attr *);
-int qedr_destroy_qp(struct ib_qp *ibqp);
+int qedr_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
 
-struct ib_srq *qedr_create_srq(struct ib_pd *ibpd,
-			       struct ib_srq_init_attr *attr,
-			       struct ib_udata *udata);
+int qedr_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *attr,
+		    struct ib_udata *udata);
 int qedr_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 		    enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
 int qedr_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
-int qedr_destroy_srq(struct ib_srq *ibsrq);
+void qedr_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata);
 int qedr_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
 		       const struct ib_recv_wr **bad_recv_wr);
-struct ib_ah *qedr_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr,
-			     struct ib_udata *udata);
-int qedr_destroy_ah(struct ib_ah *ibah);
+int qedr_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr, u32 flags,
+		   struct ib_udata *udata);
+void qedr_destroy_ah(struct ib_ah *ibah, u32 flags);
 
-int qedr_dereg_mr(struct ib_mr *);
+int qedr_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata);
 struct ib_mr *qedr_get_dma_mr(struct ib_pd *, int acc);
 
 struct ib_mr *qedr_reg_user_mr(struct ib_pd *, u64 start, u64 length,
@@ -89,7 +85,7 @@
 		   int sg_nents, unsigned int *sg_offset);
 
 struct ib_mr *qedr_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			    u32 max_num_sg);
+			    u32 max_num_sg, struct ib_udata *udata);
 int qedr_poll_cq(struct ib_cq *, int num_entries, struct ib_wc *wc);
 int qedr_post_send(struct ib_qp *, const struct ib_send_wr *,
 		   const struct ib_send_wr **bad_wr);
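
Two core conventions drive most of the prototype churn above: a non-NULL udata identifies a user-space caller (replacing checks on ibpd->uobject and friends), and ib_umem_get() now takes that udata instead of an explicit ib_ucontext. A hypothetical helper combining both (a sketch, not qedr code):

	#include <rdma/ib_umem.h>

	static struct ib_umem *example_pin_queue(struct ib_udata *udata,
						 u64 addr, size_t len)
	{
		if (!udata)	/* kernel consumer: nothing to pin */
			return NULL;
		/* access and dmasync flags as in the qedr hunks above */
		return ib_umem_get(udata, addr, len, IB_ACCESS_LOCAL_WRITE, 0);
	}
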
diff --git a/drivers/infiniband/hw/qib/Kconfig b/drivers/infiniband/hw/qib/Kconfig
index cb06314..376d19f 100644
--- a/drivers/infiniband/hw/qib/Kconfig
+++ b/drivers/infiniband/hw/qib/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_QIB
 	tristate "Intel PCIe HCA support"
 	depends on 64BIT && INFINIBAND_RDMAVT
diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h
index 3461df0..432d6d0 100644
--- a/drivers/infiniband/hw/qib/qib.h
+++ b/drivers/infiniband/hw/qib/qib.h
@@ -52,6 +52,7 @@
 #include <linux/kref.h>
 #include <linux/sched.h>
 #include <linux/kthread.h>
+#include <linux/xarray.h>
 #include <rdma/ib_hdrs.h>
 #include <rdma/rdma_vt.h>
 
@@ -1105,8 +1106,7 @@
 	int rec_cpu_num; /* for cpu affinity; -1 if none */
 };
 
-extern struct list_head qib_dev_list;
-extern spinlock_t qib_devs_lock;
+extern struct xarray qib_dev_table;
 extern struct qib_devdata *qib_lookup(int unit);
 extern u32 qib_cpulist_count;
 extern unsigned long *qib_cpulist;
@@ -1390,13 +1390,13 @@
  */
 
 extern const char ib_qib_version[];
+extern const struct attribute_group qib_attr_group;
 
 int qib_device_create(struct qib_devdata *);
 void qib_device_remove(struct qib_devdata *);
 
 int qib_create_port_files(struct ib_device *ibdev, u8 port_num,
 			  struct kobject *kobj);
-int qib_verbs_register_sysfs(struct qib_devdata *);
 void qib_verbs_unregister_sysfs(struct qib_devdata *);
 /* Hook for sysfs read of QSFP */
 extern int qib_qsfp_dump(struct qib_pportdata *ppd, char *buf, int len);
diff --git a/drivers/infiniband/hw/qib/qib_common.h b/drivers/infiniband/hw/qib/qib_common.h
index a4a1f56..f91f23e 100644
--- a/drivers/infiniband/hw/qib/qib_common.h
+++ b/drivers/infiniband/hw/qib/qib_common.h
@@ -57,7 +57,7 @@
  * QIB_VERBOSE_TRACING define as 1 if you want additional tracing in
  * fastpath code
  * QIB_TRACE_REGWRITES define as 1 if you want register writes to be
- * traced in faspath code
+ * traced in fastpath code
  * _QIB_TRACING define as 0 if you want to remove all tracing in a
  * compilation unit
  */
diff --git a/drivers/infiniband/hw/qib/qib_debugfs.c b/drivers/infiniband/hw/qib/qib_debugfs.c
index 5ed1ed9..caeb77d 100644
--- a/drivers/infiniband/hw/qib/qib_debugfs.c
+++ b/drivers/infiniband/hw/qib/qib_debugfs.c
@@ -66,15 +66,6 @@
 	.release = seq_release \
 };
 
-#define DEBUGFS_FILE_CREATE(name) \
-do { \
-	struct dentry *ent; \
-	ent = debugfs_create_file(#name , 0400, ibd->qib_ibdev_dbg, \
-		ibd, &_##name##_file_ops); \
-	if (!ent) \
-		pr_warn("create of " #name " failed\n"); \
-} while (0)
-
 static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos)
 {
 	struct qib_opcode_stats_perctx *opstats;
@@ -249,17 +240,17 @@
 
 void qib_dbg_ibdev_init(struct qib_ibdev *ibd)
 {
+	struct dentry *root;
 	char name[10];
 
 	snprintf(name, sizeof(name), "qib%d", dd_from_dev(ibd)->unit);
-	ibd->qib_ibdev_dbg = debugfs_create_dir(name, qib_dbg_root);
-	if (!ibd->qib_ibdev_dbg) {
-		pr_warn("create of %s failed\n", name);
-		return;
-	}
-	DEBUGFS_FILE_CREATE(opcode_stats);
-	DEBUGFS_FILE_CREATE(ctx_stats);
-	DEBUGFS_FILE_CREATE(qp_stats);
+	root = debugfs_create_dir(name, qib_dbg_root);
+	ibd->qib_ibdev_dbg = root;
+
+	debugfs_create_file("opcode_stats", 0400, root, ibd,
+			    &_opcode_stats_file_ops);
+	debugfs_create_file("ctx_stats", 0400, root, ibd, &_ctx_stats_file_ops);
+	debugfs_create_file("qp_stats", 0400, root, ibd, &_qp_stats_file_ops);
 }
 
 void qib_dbg_ibdev_exit(struct qib_ibdev *ibd)
@@ -274,8 +265,6 @@
 void qib_dbg_init(void)
 {
 	qib_dbg_root = debugfs_create_dir(QIB_DRV_NAME, NULL);
-	if (!qib_dbg_root)
-		pr_warn("init of debugfs failed\n");
 }
 
 void qib_dbg_exit(void)
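
The debugfs hunks follow the current convention that creation failures are not checked: debugfs_create_dir() may return an ERR_PTR, and debugfs_create_file() quietly no-ops when handed one, so the old warn-and-bail wrappers can go. A minimal sketch (hypothetical names):

	#include <linux/debugfs.h>

	static void example_debugfs_init(void *priv,
					 const struct file_operations *fops)
	{
		struct dentry *root = debugfs_create_dir("example", NULL);

		/* no error check: an ERR_PTR root is tolerated */
		debugfs_create_file("stats", 0400, root, priv, fops);
	}
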
diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c
index 3117cc5..92eeea5 100644
--- a/drivers/infiniband/hw/qib/qib_driver.c
+++ b/drivers/infiniband/hw/qib/qib_driver.c
@@ -49,8 +49,6 @@
  */
 const char ib_qib_version[] = QIB_DRIVER_VERSION "\n";
 
-DEFINE_SPINLOCK(qib_devs_lock);
-LIST_HEAD(qib_dev_list);
 DEFINE_MUTEX(qib_mutex);	/* general driver use */
 
 unsigned qib_ibmtu;
@@ -96,11 +94,11 @@
 {
 	struct qib_devdata *dd;
 	struct qib_pportdata *ppd;
-	unsigned long flags;
+	unsigned long index, flags;
 	int pidx, nunits_active = 0;
 
-	spin_lock_irqsave(&qib_devs_lock, flags);
-	list_for_each_entry(dd, &qib_dev_list, list) {
+	xa_lock_irqsave(&qib_dev_table, flags);
+	xa_for_each(&qib_dev_table, index, dd) {
 		if (!(dd->flags & QIB_PRESENT) || !dd->kregbase)
 			continue;
 		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
@@ -112,7 +110,7 @@
 			}
 		}
 	}
-	spin_unlock_irqrestore(&qib_devs_lock, flags);
+	xa_unlock_irqrestore(&qib_dev_table, flags);
 	return nunits_active;
 }
 
@@ -125,13 +123,12 @@
 {
 	int nunits = 0, npresent = 0, nup = 0;
 	struct qib_devdata *dd;
-	unsigned long flags;
+	unsigned long index, flags;
 	int pidx;
 	struct qib_pportdata *ppd;
 
-	spin_lock_irqsave(&qib_devs_lock, flags);
-
-	list_for_each_entry(dd, &qib_dev_list, list) {
+	xa_lock_irqsave(&qib_dev_table, flags);
+	xa_for_each(&qib_dev_table, index, dd) {
 		nunits++;
 		if ((dd->flags & QIB_PRESENT) && dd->kregbase)
 			npresent++;
@@ -142,8 +139,7 @@
 				nup++;
 		}
 	}
-
-	spin_unlock_irqrestore(&qib_devs_lock, flags);
+	xa_unlock_irqrestore(&qib_dev_table, flags);
 
 	if (npresentp)
 		*npresentp = npresent;
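
The traversals above swap the driver's own list-plus-spinlock for the xarray's built-in lock. A sketch of the pattern (assuming <linux/xarray.h>):

	static int example_count_units(struct xarray *devs)
	{
		struct qib_devdata *dd;
		unsigned long index, flags;
		int n = 0;

		xa_lock_irqsave(devs, flags);
		xa_for_each(devs, index, dd)	/* safe under xa_lock */
			n++;
		xa_unlock_irqrestore(devs, flags);
		return n;
	}
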
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 98e1ce1..b014422 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -343,7 +343,7 @@
 
 	/* virtual address of first page in transfer */
 	vaddr = ti->tidvaddr;
-	if (!access_ok(VERIFY_WRITE, (void __user *) vaddr,
+	if (!access_ok((void __user *) vaddr,
 		       cnt * PAGE_SIZE)) {
 		ret = -EFAULT;
 		goto done;
@@ -1142,7 +1142,7 @@
 static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd)
 {
 	struct qib_filedata *fd = fp->private_data;
-	const unsigned int weight = cpumask_weight(&current->cpus_allowed);
+	const unsigned int weight = current->nr_cpus_allowed;
 	const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus);
 	int local_cpu;
 
@@ -1623,9 +1623,8 @@
 		ret = find_free_ctxt(i_minor - 1, fp, uinfo);
 	else {
 		int unit;
-		const unsigned int cpu = cpumask_first(&current->cpus_allowed);
-		const unsigned int weight =
-			cpumask_weight(&current->cpus_allowed);
+		const unsigned int cpu = cpumask_first(current->cpus_ptr);
+		const unsigned int weight = current->nr_cpus_allowed;
 
 		if (weight == 1 && !test_bit(cpu, qib_cpulist))
 			if (!find_hca(cpu, &unit) && unit >= 0)
@@ -1790,7 +1789,6 @@
 
 static int qib_close(struct inode *in, struct file *fp)
 {
-	int ret = 0;
 	struct qib_filedata *fd;
 	struct qib_ctxtdata *rcd;
 	struct qib_devdata *dd;
@@ -1874,7 +1872,7 @@
 
 bail:
 	kfree(fd);
-	return ret;
+	return 0;
 }
 
 static int qib_ctxt_info(struct file *fp, struct qib_ctxt_info __user *uinfo)
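
Two independent API updates land in this file: access_ok() lost its VERIFY_READ/VERIFY_WRITE argument in v5.0, and the task's CPU mask is now reached through current->cpus_ptr, with its weight cached in current->nr_cpus_allowed (v5.3). A combined sketch (hypothetical function):

	#include <linux/printk.h>
	#include <linux/sched.h>
	#include <linux/uaccess.h>

	static int example_validate(unsigned long vaddr, size_t len)
	{
		if (!access_ok((void __user *)vaddr, len))
			return -EFAULT;
		if (current->nr_cpus_allowed == 1)
			pr_debug("pinned to CPU %u\n",
				 cpumask_first(current->cpus_ptr));
		return 0;
	}
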
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index 1d940a2..e336d77 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -34,6 +34,7 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/fs_context.h>
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
@@ -492,7 +493,7 @@
 	remove_file(dir, "flash");
 	inode_unlock(d_inode(dir));
 	ret = simple_rmdir(d_inode(root), dir);
-	d_delete(dir);
+	d_drop(dir);
 	dput(dir);
 
 bail:
@@ -506,10 +507,10 @@
  * after device init.  The direct add_cntr_files() call handles adding
  * them from the init code, when the fs is already mounted.
  */
-static int qibfs_fill_super(struct super_block *sb, void *data, int silent)
+static int qibfs_fill_super(struct super_block *sb, struct fs_context *fc)
 {
-	struct qib_devdata *dd, *tmp;
-	unsigned long flags;
+	struct qib_devdata *dd;
+	unsigned long index;
 	int ret;
 
 	static const struct tree_descr files[] = {
@@ -524,33 +525,34 @@
 		goto bail;
 	}
 
-	spin_lock_irqsave(&qib_devs_lock, flags);
-
-	list_for_each_entry_safe(dd, tmp, &qib_dev_list, list) {
-		spin_unlock_irqrestore(&qib_devs_lock, flags);
+	xa_for_each(&qib_dev_table, index, dd) {
 		ret = add_cntr_files(sb, dd);
 		if (ret)
 			goto bail;
-		spin_lock_irqsave(&qib_devs_lock, flags);
 	}
 
-	spin_unlock_irqrestore(&qib_devs_lock, flags);
-
 bail:
 	return ret;
 }
 
-static struct dentry *qibfs_mount(struct file_system_type *fs_type, int flags,
-			const char *dev_name, void *data)
+static int qibfs_get_tree(struct fs_context *fc)
 {
-	struct dentry *ret;
-
-	ret = mount_single(fs_type, flags, data, qibfs_fill_super);
-	if (!IS_ERR(ret))
-		qib_super = ret->d_sb;
+	int ret = get_tree_single(fc, qibfs_fill_super);
+	if (ret == 0)
+		qib_super = fc->root->d_sb;
 	return ret;
 }
 
+static const struct fs_context_operations qibfs_context_ops = {
+	.get_tree	= qibfs_get_tree,
+};
+
+static int qibfs_init_fs_context(struct fs_context *fc)
+{
+	fc->ops = &qibfs_context_ops;
+	return 0;
+}
+
 static void qibfs_kill_super(struct super_block *s)
 {
 	kill_litter_super(s);
@@ -589,7 +591,7 @@
 static struct file_system_type qibfs_fs_type = {
 	.owner =        THIS_MODULE,
 	.name =         "ipathfs",
-	.mount =        qibfs_mount,
+	.init_fs_context = qibfs_init_fs_context,
 	.kill_sb =      qibfs_kill_super,
 };
 MODULE_ALIAS_FS("ipathfs");
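
For orientation, the mount-API conversion above changes the call flow rather than the behaviour; roughly (a sketch of the VFS path, from memory rather than this patch):

	/*
	 * mount -t ipathfs ...
	 *   do_new_mount()
	 *     -> fs_context_for_mount(&qibfs_fs_type, ...)
	 *          -> qibfs_init_fs_context(fc)  installs qibfs_context_ops
	 *     -> vfs_get_tree(fc)
	 *          -> fc->ops->get_tree(fc)      == qibfs_get_tree()
	 *               -> get_tree_single(fc, qibfs_fill_super)
	 *                    reuses or builds the single superblock, after
	 *                    which qib_super = fc->root->d_sb
	 */
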
diff --git a/drivers/infiniband/hw/qib/qib_iba6120.c b/drivers/infiniband/hw/qib/qib_iba6120.c
index fb1ff59..531d8a1 100644
--- a/drivers/infiniband/hw/qib/qib_iba6120.c
+++ b/drivers/infiniband/hw/qib/qib_iba6120.c
@@ -1884,7 +1884,6 @@
 	qib_write_kreg(dd, kr_scratch, 0xfeeddeaf);
 	writel(pa, tidp32);
 	qib_write_kreg(dd, kr_scratch, 0xdeadbeef);
-	mmiowb();
 	spin_unlock_irqrestore(tidlockp, flags);
 }
 
@@ -1928,7 +1927,6 @@
 			pa |= 2 << 29;
 	}
 	writel(pa, tidp32);
-	mmiowb();
 }
 
 
@@ -2053,9 +2051,7 @@
 {
 	if (updegr)
 		qib_write_ureg(rcd->dd, ur_rcvegrindexhead, egrhd, rcd->ctxt);
-	mmiowb();
 	qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt);
-	mmiowb();
 }
 
 static u32 qib_6120_hdrqempty(struct qib_ctxtdata *rcd)
@@ -3237,7 +3233,6 @@
 	/* we always allocate at least 2048 bytes for eager buffers */
 	ret = ib_mtu_enum_to_int(qib_ibmtu);
 	dd->rcvegrbufsize = ret != -1 ? max(ret, 2048) : QIB_DEFAULT_MTU;
-	BUG_ON(!is_power_of_2(dd->rcvegrbufsize));
 	dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize);
 
 	qib_6120_tidtemplate(dd);
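
All the mmiowb() deletions in these chip files rely on the same v5.2 core change: spin_unlock() now provides the MMIO write barrier on architectures that need one, so ordered register writes inside a lock no longer need the explicit call. Sketch (hypothetical lock and registers):

	#include <linux/io.h>
	#include <linux/spinlock.h>

	static void example_write_tid(spinlock_t *tidlock, u32 pa,
				      u32 __iomem *tidp32)
	{
		unsigned long flags;

		spin_lock_irqsave(tidlock, flags);
		writel(pa, tidp32);
		spin_unlock_irqrestore(tidlock, flags); /* implies mmiowb() */
	}
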
diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c
index 163a57a..ea3ddb0 100644
--- a/drivers/infiniband/hw/qib/qib_iba7220.c
+++ b/drivers/infiniband/hw/qib/qib_iba7220.c
@@ -2175,7 +2175,6 @@
 		pa = chippa;
 	}
 	writeq(pa, tidptr);
-	mmiowb();
 }
 
 /**
@@ -2704,9 +2703,7 @@
 {
 	if (updegr)
 		qib_write_ureg(rcd->dd, ur_rcvegrindexhead, egrhd, rcd->ctxt);
-	mmiowb();
 	qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt);
-	mmiowb();
 }
 
 static u32 qib_7220_hdrqempty(struct qib_ctxtdata *rcd)
@@ -4043,7 +4040,6 @@
 	/* we always allocate at least 2048 bytes for eager buffers */
 	ret = ib_mtu_enum_to_int(qib_ibmtu);
 	dd->rcvegrbufsize = ret != -1 ? max(ret, 2048) : QIB_DEFAULT_MTU;
-	BUG_ON(!is_power_of_2(dd->rcvegrbufsize));
 	dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize);
 
 	qib_7220_tidtemplate(dd);
@@ -4252,7 +4248,6 @@
 		unsigned word = i / 64;
 		unsigned bit = i & 63;
 
-		BUG_ON(word >= 3);
 		senddmabufmask[word] |= 1ULL << bit;
 	}
 	qib_write_kreg(dd, kr_senddmabufmask0, senddmabufmask[0]);
diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c
index bf5e222..dd48433 100644
--- a/drivers/infiniband/hw/qib/qib_iba7322.c
+++ b/drivers/infiniband/hw/qib/qib_iba7322.c
@@ -1382,7 +1382,6 @@
 					*msg++ = ',';
 					len--;
 				}
-				BUG_ON(!msp->sz);
 				/* msp->sz counts the nul */
 				took = min_t(size_t, msp->sz - (size_t)1, len);
 				memcpy(msg,  msp->msg, took);
@@ -3794,7 +3793,6 @@
 		pa = chippa;
 	}
 	writeq(pa, tidptr);
-	mmiowb();
 }
 
 /**
@@ -4441,10 +4439,8 @@
 		adjust_rcv_timeout(rcd, npkts);
 	if (updegr)
 		qib_write_ureg(rcd->dd, ur_rcvegrindexhead, egrhd, rcd->ctxt);
-	mmiowb();
 	qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt);
 	qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt);
-	mmiowb();
 }
 
 static u32 qib_7322_hdrqempty(struct qib_ctxtdata *rcd)
@@ -6141,7 +6137,7 @@
 static int setup_txselect(const char *str, const struct kernel_param *kp)
 {
 	struct qib_devdata *dd;
-	unsigned long val;
+	unsigned long index, val;
 	char *n;
 
 	if (strlen(str) >= ARRAY_SIZE(txselect_list)) {
@@ -6157,7 +6153,7 @@
 	}
 	strncpy(txselect_list, str, ARRAY_SIZE(txselect_list) - 1);
 
-	list_for_each_entry(dd, &qib_dev_list, list)
+	xa_for_each(&qib_dev_table, index, dd)
 		if (dd->deviceid == PCI_DEVICE_ID_QLOGIC_IB_7322)
 			set_no_qsfp_atten(dd, 1);
 	return 0;
@@ -6599,7 +6595,6 @@
 
 	/* we always allocate at least 2048 bytes for eager buffers */
 	dd->rcvegrbufsize = max(mtu, 2048);
-	BUG_ON(!is_power_of_2(dd->rcvegrbufsize));
 	dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize);
 
 	qib_7322_tidtemplate(dd);
@@ -6904,7 +6899,6 @@
 		unsigned word = erstbuf / BITS_PER_LONG;
 		unsigned bit = erstbuf & (BITS_PER_LONG - 1);
 
-		BUG_ON(word >= 3);
 		senddmabufmask[word] |= 1ULL << bit;
 	}
 	qib_write_kreg_port(ppd, krp_senddmabufmask0, senddmabufmask[0]);
diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c
index d7cdc77..d4fd8a6 100644
--- a/drivers/infiniband/hw/qib/qib_init.c
+++ b/drivers/infiniband/hw/qib/qib_init.c
@@ -36,7 +36,6 @@
 #include <linux/netdevice.h>
 #include <linux/vmalloc.h>
 #include <linux/delay.h>
-#include <linux/idr.h>
 #include <linux/module.h>
 #include <linux/printk.h>
 #ifdef CONFIG_INFINIBAND_QIB_DCA
@@ -95,7 +94,7 @@
 
 static void verify_interrupt(struct timer_list *);
 
-static struct idr qib_unit_table;
+DEFINE_XARRAY_FLAGS(qib_dev_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
 u32 qib_cpulist_count;
 unsigned long *qib_cpulist;
 
@@ -209,7 +208,6 @@
 		rcd->rcvegrbuf_chunks = (rcd->rcvegrcnt +
 			rcd->rcvegrbufs_perchunk - 1) /
 			rcd->rcvegrbufs_perchunk;
-		BUG_ON(!is_power_of_2(rcd->rcvegrbufs_perchunk));
 		rcd->rcvegrbufs_perchunk_shift =
 			ilog2(rcd->rcvegrbufs_perchunk);
 	}
@@ -786,21 +784,9 @@
 {
 }
 
-static inline struct qib_devdata *__qib_lookup(int unit)
-{
-	return idr_find(&qib_unit_table, unit);
-}
-
 struct qib_devdata *qib_lookup(int unit)
 {
-	struct qib_devdata *dd;
-	unsigned long flags;
-
-	spin_lock_irqsave(&qib_devs_lock, flags);
-	dd = __qib_lookup(unit);
-	spin_unlock_irqrestore(&qib_devs_lock, flags);
-
-	return dd;
+	return xa_load(&qib_dev_table, unit);
 }
 
 /*
@@ -1047,10 +1033,9 @@
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&qib_devs_lock, flags);
-	idr_remove(&qib_unit_table, dd->unit);
-	list_del(&dd->list);
-	spin_unlock_irqrestore(&qib_devs_lock, flags);
+	xa_lock_irqsave(&qib_dev_table, flags);
+	__xa_erase(&qib_dev_table, dd->unit);
+	xa_unlock_irqrestore(&qib_dev_table, flags);
 
 #ifdef CONFIG_DEBUG_FS
 	qib_dbg_ibdev_exit(&dd->verbs_dev);
@@ -1071,15 +1056,15 @@
 
 u64 qib_sps_ints(void)
 {
-	unsigned long flags;
+	unsigned long index, flags;
 	struct qib_devdata *dd;
 	u64 sps_ints = 0;
 
-	spin_lock_irqsave(&qib_devs_lock, flags);
-	list_for_each_entry(dd, &qib_dev_list, list) {
+	xa_lock_irqsave(&qib_dev_table, flags);
+	xa_for_each(&qib_dev_table, index, dd) {
 		sps_ints += qib_int_counter(dd);
 	}
-	spin_unlock_irqrestore(&qib_devs_lock, flags);
+	xa_unlock_irqrestore(&qib_dev_table, flags);
 	return sps_ints;
 }
 
@@ -1088,12 +1073,9 @@
  * allocator, because the verbs cleanup process both does cleanup and
  * free of the data structure.
  * "extra" is for chip-specific data.
- *
- * Use the idr mechanism to get a unit number for this unit.
  */
 struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra)
 {
-	unsigned long flags;
 	struct qib_devdata *dd;
 	int ret, nports;
 
@@ -1104,20 +1086,8 @@
 	if (!dd)
 		return ERR_PTR(-ENOMEM);
 
-	INIT_LIST_HEAD(&dd->list);
-
-	idr_preload(GFP_KERNEL);
-	spin_lock_irqsave(&qib_devs_lock, flags);
-
-	ret = idr_alloc(&qib_unit_table, dd, 0, 0, GFP_NOWAIT);
-	if (ret >= 0) {
-		dd->unit = ret;
-		list_add(&dd->list, &qib_dev_list);
-	}
-
-	spin_unlock_irqrestore(&qib_devs_lock, flags);
-	idr_preload_end();
-
+	ret = xa_alloc_irq(&qib_dev_table, &dd->unit, dd, xa_limit_32b,
+			GFP_KERNEL);
 	if (ret < 0) {
 		qib_early_err(&pdev->dev,
 			      "Could not allocate unit ID: error %d\n", -ret);
@@ -1256,8 +1226,6 @@
 	 * These must be called before the driver is registered with
 	 * the PCI subsystem.
 	 */
-	idr_init(&qib_unit_table);
-
 #ifdef CONFIG_INFINIBAND_QIB_DCA
 	dca_register_notify(&dca_notifier);
 #endif
@@ -1282,7 +1250,6 @@
 #ifdef CONFIG_DEBUG_FS
 	qib_dbg_exit();
 #endif
-	idr_destroy(&qib_unit_table);
 	qib_dev_cleanup();
 bail:
 	return ret;
@@ -1314,7 +1281,7 @@
 	qib_cpulist_count = 0;
 	kfree(qib_cpulist);
 
-	idr_destroy(&qib_unit_table);
+	WARN_ON(!xa_empty(&qib_dev_table));
 	qib_dev_cleanup();
 }
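
The idr-to-xarray move above also simplifies allocation and lookup; a condensed sketch of the idiom (hypothetical table name, same flags as qib_dev_table):

	#include <linux/xarray.h>

	DEFINE_XARRAY_FLAGS(example_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);

	static int example_assign_unit(struct qib_devdata *dd)
	{
		/* store dd at the lowest free index, written to dd->unit */
		return xa_alloc_irq(&example_table, &dd->unit, dd,
				    xa_limit_32b, GFP_KERNEL);
	}

	static struct qib_devdata *example_lookup(int unit)
	{
		return xa_load(&example_table, unit); /* RCU-safe, lockless */
	}
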
 
diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c
index 4845d00..f92faf5 100644
--- a/drivers/infiniband/hw/qib/qib_mad.c
+++ b/drivers/infiniband/hw/qib/qib_mad.c
@@ -2494,5 +2494,6 @@
 		del_timer_sync(&dd->pport[port_idx].cong_stats.timer);
 
 	if (dd->pport[port_idx].ibport_data.smi_ah)
-		rdma_destroy_ah(&dd->pport[port_idx].ibport_data.smi_ah->ibah);
+		rdma_destroy_ah(&dd->pport[port_idx].ibport_data.smi_ah->ibah,
+				RDMA_DESTROY_AH_SLEEPABLE);
 }
diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c
index 5ac7b31..864f2af 100644
--- a/drivers/infiniband/hw/qib/qib_pcie.c
+++ b/drivers/infiniband/hw/qib/qib_pcie.c
@@ -387,7 +387,7 @@
 
 static int qib_pcie_coalesce;
 module_param_named(pcie_coalesce, qib_pcie_coalesce, int, S_IRUGO);
-MODULE_PARM_DESC(pcie_coalesce, "tune PCIe colescing on some Intel chipsets");
+MODULE_PARM_DESC(pcie_coalesce, "tune PCIe coalescing on some Intel chipsets");
 
 /*
  * Enable PCIe completion and data coalescing, on Intel 5x00 and 7300
@@ -597,7 +597,6 @@
 	struct qib_devdata *dd = pci_get_drvdata(pdev);
 
 	qib_devinfo(pdev, "QIB resume function called\n");
-	pci_cleanup_aer_uncorrect_error_status(pdev);
 	/*
 	 * Running jobs will fail, since it's asynchronous
 	 * unlike sysfs-requested reset.   Better than
diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c
index 344e401..8d0563e 100644
--- a/drivers/infiniband/hw/qib/qib_qp.c
+++ b/drivers/infiniband/hw/qib/qib_qp.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012 - 2017 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2012 - 2019 Intel Corporation.  All rights reserved.
  * Copyright (c) 2006 - 2012 QLogic Corporation.  * All rights reserved.
  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
  *
@@ -378,39 +378,36 @@
  * qib_check_send_wqe - validate wr/wqe
  * @qp - The qp
  * @wqe - The built wqe
+ * @call_send - Determine if the send should be posted or scheduled
  *
- * validate wr/wqe.  This is called
- * prior to inserting the wqe into
- * the ring but after the wqe has been
- * setup.
- *
- * Returns 1 to force direct progress, 0 otherwise, -EINVAL on failure
+ * Returns 0 on success, -EINVAL on failure
  */
 int qib_check_send_wqe(struct rvt_qp *qp,
-		       struct rvt_swqe *wqe)
+		       struct rvt_swqe *wqe, bool *call_send)
 {
 	struct rvt_ah *ah;
-	int ret = 0;
 
 	switch (qp->ibqp.qp_type) {
 	case IB_QPT_RC:
 	case IB_QPT_UC:
 		if (wqe->length > 0x80000000U)
 			return -EINVAL;
+		if (wqe->length > qp->pmtu)
+			*call_send = false;
 		break;
 	case IB_QPT_SMI:
 	case IB_QPT_GSI:
 	case IB_QPT_UD:
-		ah = ibah_to_rvtah(wqe->ud_wr.ah);
+		ah = rvt_get_swqe_ah(wqe);
 		if (wqe->length > (1 << ah->log_pmtu))
 			return -EINVAL;
 		/* progress hint */
-		ret = 1;
+		*call_send = true;
 		break;
 	default:
 		break;
 	}
-	return ret;
+	return 0;
 }
 
 #ifdef CONFIG_DEBUG_FS
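
qib_check_send_wqe() no longer overloads its return value: validity comes back as 0/-EINVAL and the make-progress-now hint goes out through *call_send. A sketch of the consumer side (hypothetical helper names; the real caller is rvt_post_send() in rdmavt):

	static int example_post_one(struct rvt_qp *qp, struct rvt_swqe *wqe)
	{
		bool call_send = false;
		int ret = qib_check_send_wqe(qp, wqe, &call_send);

		if (ret)
			return ret;		/* invalid WQE */
		if (call_send)
			send_inline(qp);	/* hypothetical: progress now */
		else
			send_schedule(qp);	/* hypothetical: defer */
		return 0;
	}
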
diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c
index f35fdeb..aaf7438 100644
--- a/drivers/infiniband/hw/qib/qib_rc.c
+++ b/drivers/infiniband/hw/qib/qib_rc.c
@@ -45,12 +45,7 @@
 	u32 len;
 
 	len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
-	ss->sge = wqe->sg_list[0];
-	ss->sg_list = wqe->sg_list + 1;
-	ss->num_sge = wqe->wr.num_sge;
-	ss->total_len = wqe->length;
-	rvt_skip_sge(ss, len, false);
-	return wqe->length - len;
+	return rvt_restart_sge(ss, wqe, len);
 }
 
 /**
@@ -254,7 +249,7 @@
 			goto bail;
 		}
 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-		qib_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+		rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
 			IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
 		/* will get called again */
 		goto done;
@@ -318,11 +313,8 @@
 		case IB_WR_SEND:
 		case IB_WR_SEND_WITH_IMM:
 			/* If no credit, return. */
-			if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
-			    rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
-				qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
+			if (!rvt_rc_credit_avail(qp, wqe))
 				goto bail;
-			}
 			if (len > pmtu) {
 				qp->s_state = OP(SEND_FIRST);
 				len = pmtu;
@@ -349,11 +341,8 @@
 			goto no_flow_control;
 		case IB_WR_RDMA_WRITE_WITH_IMM:
 			/* If no credit, return. */
-			if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
-			    rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
-				qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
+			if (!rvt_rc_credit_avail(qp, wqe))
 				goto bail;
-			}
 no_flow_control:
 			ohdr->u.rc.reth.vaddr =
 				cpu_to_be64(wqe->rdma_wr.remote_addr);
@@ -838,7 +827,7 @@
 			qib_migrate_qp(qp);
 			qp->s_retry = qp->s_retry_cnt;
 		} else if (qp->s_last == qp->s_acked) {
-			qib_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+			rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
 			rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
 			return;
 		} else /* XXX need to handle delayed completion */
@@ -926,20 +915,11 @@
 		rvt_add_retry_timer(qp);
 
 	while (qp->s_last != qp->s_acked) {
-		u32 s_last;
-
 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
 		if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 &&
 		    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
 			break;
-		s_last = qp->s_last;
-		if (++s_last >= qp->s_size)
-			s_last = 0;
-		qp->s_last = s_last;
-		/* see post_send() */
-		barrier();
-		rvt_put_swqe(wqe);
-		rvt_qp_swqe_complete(qp,
+		rvt_qp_complete_swqe(qp,
 				     wqe,
 				     ib_qib_wc_opcode[wqe->wr.opcode],
 				     IB_WC_SUCCESS);
@@ -977,21 +957,12 @@
 	 * is finished.
 	 */
 	if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 ||
-	    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
-		u32 s_last;
-
-		rvt_put_swqe(wqe);
-		s_last = qp->s_last;
-		if (++s_last >= qp->s_size)
-			s_last = 0;
-		qp->s_last = s_last;
-		/* see post_send() */
-		barrier();
-		rvt_qp_swqe_complete(qp,
+	    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0)
+		rvt_qp_complete_swqe(qp,
 				     wqe,
 				     ib_qib_wc_opcode[wqe->wr.opcode],
 				     IB_WC_SUCCESS);
-	} else
+	else
 		this_cpu_inc(*ibp->rvp.rc_delayed_comp);
 
 	qp->s_retry = qp->s_retry_cnt;
@@ -1221,7 +1192,7 @@
 			ibp->rvp.n_other_naks++;
 class_b:
 			if (qp->s_last == qp->s_acked) {
-				qib_send_complete(qp, wqe, status);
+				rvt_send_complete(qp, wqe, status);
 				rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
 			}
 			break;
@@ -1425,7 +1396,8 @@
 		qp->s_rdma_read_len -= pmtu;
 		update_last_psn(qp, psn);
 		spin_unlock_irqrestore(&qp->s_lock, flags);
-		qib_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0);
+		rvt_copy_sge(qp, &qp->s_rdma_read_sge,
+			     data, pmtu, false, false);
 		goto bail;
 
 	case OP(RDMA_READ_RESPONSE_ONLY):
@@ -1471,7 +1443,8 @@
 		if (unlikely(tlen != qp->s_rdma_read_len))
 			goto ack_len_err;
 		aeth = be32_to_cpu(ohdr->u.aeth);
-		qib_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0);
+		rvt_copy_sge(qp, &qp->s_rdma_read_sge,
+			     data, tlen, false, false);
 		WARN_ON(qp->s_rdma_read_sge.num_sge);
 		(void) do_rc_ack(qp, aeth, psn,
 				 OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
@@ -1490,7 +1463,7 @@
 	status = IB_WC_LOC_LEN_ERR;
 ack_err:
 	if (qp->s_last == qp->s_acked) {
-		qib_send_complete(qp, wqe, status);
+		rvt_send_complete(qp, wqe, status);
 		rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
 	}
 ack_done:
@@ -1844,7 +1817,7 @@
 		qp->r_rcv_len += pmtu;
 		if (unlikely(qp->r_rcv_len > qp->r_len))
 			goto nack_inv;
-		qib_copy_sge(&qp->r_sge, data, pmtu, 1);
+		rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false);
 		break;
 
 	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
@@ -1890,7 +1863,7 @@
 		wc.byte_len = tlen + qp->r_rcv_len;
 		if (unlikely(wc.byte_len > qp->r_len))
 			goto nack_inv;
-		qib_copy_sge(&qp->r_sge, data, tlen, 1);
+		rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false);
 		rvt_put_ss(&qp->r_sge);
 		qp->r_msn++;
 		if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
@@ -1912,8 +1885,7 @@
 		wc.dlid_path_bits = 0;
 		wc.port_num = 0;
 		/* Signal completion event if the solicited bit is set. */
-		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-			     ib_bth_is_solicited(ohdr));
+		rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
 		break;
 
 	case OP(RDMA_WRITE_FIRST):
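
The two completion hunks above delete the same open-coded sequence that rvt_qp_complete_swqe() now centralizes; in rough outline (a sketch of the idea, not the rdmavt source, with a release store where the old code used a bare barrier()):

	u32 last = qp->s_last + 1;

	if (last >= qp->s_size)
		last = 0;
	/* free the ring slot; pairs with the avail check in post_send() */
	smp_store_release(&qp->s_last, last);
	rvt_put_swqe(wqe);
	rvt_qp_swqe_complete(qp, wqe, ib_qib_wc_opcode[wqe->wr.opcode],
			     IB_WC_SUCCESS);
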
diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c
index f8a7de7..1fa2193 100644
--- a/drivers/infiniband/hw/qib/qib_ruc.c
+++ b/drivers/infiniband/hw/qib/qib_ruc.c
@@ -171,307 +171,6 @@
 }
 
 /**
- * qib_ruc_loopback - handle UC and RC lookback requests
- * @sqp: the sending QP
- *
- * This is called from qib_do_send() to
- * forward a WQE addressed to the same HCA.
- * Note that although we are single threaded due to the tasklet, we still
- * have to protect against post_send().  We don't have to worry about
- * receive interrupts since this is a connected protocol and all packets
- * will pass through here.
- */
-static void qib_ruc_loopback(struct rvt_qp *sqp)
-{
-	struct qib_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
-	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
-	struct qib_devdata *dd = ppd->dd;
-	struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
-	struct rvt_qp *qp;
-	struct rvt_swqe *wqe;
-	struct rvt_sge *sge;
-	unsigned long flags;
-	struct ib_wc wc;
-	u64 sdata;
-	atomic64_t *maddr;
-	enum ib_wc_status send_status;
-	int release;
-	int ret;
-
-	rcu_read_lock();
-	/*
-	 * Note that we check the responder QP state after
-	 * checking the requester's state.
-	 */
-	qp = rvt_lookup_qpn(rdi, &ibp->rvp, sqp->remote_qpn);
-	if (!qp)
-		goto done;
-
-	spin_lock_irqsave(&sqp->s_lock, flags);
-
-	/* Return if we are already busy processing a work request. */
-	if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) ||
-	    !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND))
-		goto unlock;
-
-	sqp->s_flags |= RVT_S_BUSY;
-
-again:
-	if (sqp->s_last == READ_ONCE(sqp->s_head))
-		goto clr_busy;
-	wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
-
-	/* Return if it is not OK to start a new work reqeust. */
-	if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
-		if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND))
-			goto clr_busy;
-		/* We are in the error state, flush the work request. */
-		send_status = IB_WC_WR_FLUSH_ERR;
-		goto flush_send;
-	}
-
-	/*
-	 * We can rely on the entry not changing without the s_lock
-	 * being held until we update s_last.
-	 * We increment s_cur to indicate s_last is in progress.
-	 */
-	if (sqp->s_last == sqp->s_cur) {
-		if (++sqp->s_cur >= sqp->s_size)
-			sqp->s_cur = 0;
-	}
-	spin_unlock_irqrestore(&sqp->s_lock, flags);
-
-	if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
-	    qp->ibqp.qp_type != sqp->ibqp.qp_type) {
-		ibp->rvp.n_pkt_drops++;
-		/*
-		 * For RC, the requester would timeout and retry so
-		 * shortcut the timeouts and just signal too many retries.
-		 */
-		if (sqp->ibqp.qp_type == IB_QPT_RC)
-			send_status = IB_WC_RETRY_EXC_ERR;
-		else
-			send_status = IB_WC_SUCCESS;
-		goto serr;
-	}
-
-	memset(&wc, 0, sizeof(wc));
-	send_status = IB_WC_SUCCESS;
-
-	release = 1;
-	sqp->s_sge.sge = wqe->sg_list[0];
-	sqp->s_sge.sg_list = wqe->sg_list + 1;
-	sqp->s_sge.num_sge = wqe->wr.num_sge;
-	sqp->s_len = wqe->length;
-	switch (wqe->wr.opcode) {
-	case IB_WR_SEND_WITH_IMM:
-		wc.wc_flags = IB_WC_WITH_IMM;
-		wc.ex.imm_data = wqe->wr.ex.imm_data;
-		/* FALLTHROUGH */
-	case IB_WR_SEND:
-		ret = rvt_get_rwqe(qp, false);
-		if (ret < 0)
-			goto op_err;
-		if (!ret)
-			goto rnr_nak;
-		break;
-
-	case IB_WR_RDMA_WRITE_WITH_IMM:
-		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
-			goto inv_err;
-		wc.wc_flags = IB_WC_WITH_IMM;
-		wc.ex.imm_data = wqe->wr.ex.imm_data;
-		ret = rvt_get_rwqe(qp, true);
-		if (ret < 0)
-			goto op_err;
-		if (!ret)
-			goto rnr_nak;
-		/* FALLTHROUGH */
-	case IB_WR_RDMA_WRITE:
-		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
-			goto inv_err;
-		if (wqe->length == 0)
-			break;
-		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
-					  wqe->rdma_wr.remote_addr,
-					  wqe->rdma_wr.rkey,
-					  IB_ACCESS_REMOTE_WRITE)))
-			goto acc_err;
-		qp->r_sge.sg_list = NULL;
-		qp->r_sge.num_sge = 1;
-		qp->r_sge.total_len = wqe->length;
-		break;
-
-	case IB_WR_RDMA_READ:
-		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
-			goto inv_err;
-		if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
-					  wqe->rdma_wr.remote_addr,
-					  wqe->rdma_wr.rkey,
-					  IB_ACCESS_REMOTE_READ)))
-			goto acc_err;
-		release = 0;
-		sqp->s_sge.sg_list = NULL;
-		sqp->s_sge.num_sge = 1;
-		qp->r_sge.sge = wqe->sg_list[0];
-		qp->r_sge.sg_list = wqe->sg_list + 1;
-		qp->r_sge.num_sge = wqe->wr.num_sge;
-		qp->r_sge.total_len = wqe->length;
-		break;
-
-	case IB_WR_ATOMIC_CMP_AND_SWP:
-	case IB_WR_ATOMIC_FETCH_AND_ADD:
-		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
-			goto inv_err;
-		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
-					  wqe->atomic_wr.remote_addr,
-					  wqe->atomic_wr.rkey,
-					  IB_ACCESS_REMOTE_ATOMIC)))
-			goto acc_err;
-		/* Perform atomic OP and save result. */
-		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
-		sdata = wqe->atomic_wr.compare_add;
-		*(u64 *) sqp->s_sge.sge.vaddr =
-			(wqe->atomic_wr.wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
-			(u64) atomic64_add_return(sdata, maddr) - sdata :
-			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
-				      sdata, wqe->atomic_wr.swap);
-		rvt_put_mr(qp->r_sge.sge.mr);
-		qp->r_sge.num_sge = 0;
-		goto send_comp;
-
-	default:
-		send_status = IB_WC_LOC_QP_OP_ERR;
-		goto serr;
-	}
-
-	sge = &sqp->s_sge.sge;
-	while (sqp->s_len) {
-		u32 len = sqp->s_len;
-
-		if (len > sge->length)
-			len = sge->length;
-		if (len > sge->sge_length)
-			len = sge->sge_length;
-		BUG_ON(len == 0);
-		qib_copy_sge(&qp->r_sge, sge->vaddr, len, release);
-		sge->vaddr += len;
-		sge->length -= len;
-		sge->sge_length -= len;
-		if (sge->sge_length == 0) {
-			if (!release)
-				rvt_put_mr(sge->mr);
-			if (--sqp->s_sge.num_sge)
-				*sge = *sqp->s_sge.sg_list++;
-		} else if (sge->length == 0 && sge->mr->lkey) {
-			if (++sge->n >= RVT_SEGSZ) {
-				if (++sge->m >= sge->mr->mapsz)
-					break;
-				sge->n = 0;
-			}
-			sge->vaddr =
-				sge->mr->map[sge->m]->segs[sge->n].vaddr;
-			sge->length =
-				sge->mr->map[sge->m]->segs[sge->n].length;
-		}
-		sqp->s_len -= len;
-	}
-	if (release)
-		rvt_put_ss(&qp->r_sge);
-
-	if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
-		goto send_comp;
-
-	if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
-		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-	else
-		wc.opcode = IB_WC_RECV;
-	wc.wr_id = qp->r_wr_id;
-	wc.status = IB_WC_SUCCESS;
-	wc.byte_len = wqe->length;
-	wc.qp = &qp->ibqp;
-	wc.src_qp = qp->remote_qpn;
-	wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr);
-	wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
-	wc.port_num = 1;
-	/* Signal completion event if the solicited bit is set. */
-	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-		     wqe->wr.send_flags & IB_SEND_SOLICITED);
-
-send_comp:
-	spin_lock_irqsave(&sqp->s_lock, flags);
-	ibp->rvp.n_loop_pkts++;
-flush_send:
-	sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
-	qib_send_complete(sqp, wqe, send_status);
-	goto again;
-
-rnr_nak:
-	/* Handle RNR NAK */
-	if (qp->ibqp.qp_type == IB_QPT_UC)
-		goto send_comp;
-	ibp->rvp.n_rnr_naks++;
-	/*
-	 * Note: we don't need the s_lock held since the BUSY flag
-	 * makes this single threaded.
-	 */
-	if (sqp->s_rnr_retry == 0) {
-		send_status = IB_WC_RNR_RETRY_EXC_ERR;
-		goto serr;
-	}
-	if (sqp->s_rnr_retry_cnt < 7)
-		sqp->s_rnr_retry--;
-	spin_lock_irqsave(&sqp->s_lock, flags);
-	if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
-		goto clr_busy;
-	rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer <<
-				IB_AETH_CREDIT_SHIFT);
-	goto clr_busy;
-
-op_err:
-	send_status = IB_WC_REM_OP_ERR;
-	wc.status = IB_WC_LOC_QP_OP_ERR;
-	goto err;
-
-inv_err:
-	send_status = IB_WC_REM_INV_REQ_ERR;
-	wc.status = IB_WC_LOC_QP_OP_ERR;
-	goto err;
-
-acc_err:
-	send_status = IB_WC_REM_ACCESS_ERR;
-	wc.status = IB_WC_LOC_PROT_ERR;
-err:
-	/* responder goes to error state */
-	rvt_rc_error(qp, wc.status);
-
-serr:
-	spin_lock_irqsave(&sqp->s_lock, flags);
-	qib_send_complete(sqp, wqe, send_status);
-	if (sqp->ibqp.qp_type == IB_QPT_RC) {
-		int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
-
-		sqp->s_flags &= ~RVT_S_BUSY;
-		spin_unlock_irqrestore(&sqp->s_lock, flags);
-		if (lastwqe) {
-			struct ib_event ev;
-
-			ev.device = sqp->ibqp.device;
-			ev.element.qp = &sqp->ibqp;
-			ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
-			sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
-		}
-		goto done;
-	}
-clr_busy:
-	sqp->s_flags &= ~RVT_S_BUSY;
-unlock:
-	spin_unlock_irqrestore(&sqp->s_lock, flags);
-done:
-	rcu_read_unlock();
-}
-
-/**
  * qib_make_grh - construct a GRH header
  * @ibp: a pointer to the IB port
  * @hdr: a pointer to the GRH header being constructed
@@ -573,7 +272,7 @@
 	     qp->ibqp.qp_type == IB_QPT_UC) &&
 	    (rdma_ah_get_dlid(&qp->remote_ah_attr) &
 	     ~((1 << ppd->lmc) - 1)) == ppd->lid) {
-		qib_ruc_loopback(qp);
+		rvt_ruc_loopback(qp);
 		return;
 	}
 
@@ -613,42 +312,3 @@
 
 	spin_unlock_irqrestore(&qp->s_lock, flags);
 }
-
-/*
- * This should be called with s_lock held.
- */
-void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
-		       enum ib_wc_status status)
-{
-	u32 old_last, last;
-
-	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
-		return;
-
-	last = qp->s_last;
-	old_last = last;
-	if (++last >= qp->s_size)
-		last = 0;
-	qp->s_last = last;
-	/* See post_send() */
-	barrier();
-	rvt_put_swqe(wqe);
-	if (qp->ibqp.qp_type == IB_QPT_UD ||
-	    qp->ibqp.qp_type == IB_QPT_SMI ||
-	    qp->ibqp.qp_type == IB_QPT_GSI)
-		atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount);
-
-	rvt_qp_swqe_complete(qp,
-			     wqe,
-			     ib_qib_wc_opcode[wqe->wr.opcode],
-			     status);
-
-	if (qp->s_acked == old_last)
-		qp->s_acked = last;
-	if (qp->s_cur == old_last)
-		qp->s_cur = last;
-	if (qp->s_tail == old_last)
-		qp->s_tail = last;
-	if (qp->state == IB_QPS_SQD && last == qp->s_cur)
-		qp->s_draining = 0;
-}
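
With the loopback and send-completion helpers gone, qib calls the shared rdmavt versions and the locking contract is unchanged: rvt_send_complete() still expects s_lock held, as the deleted comment said. Sketch of a flush-path caller:

	spin_lock_irqsave(&qp->s_lock, flags);
	rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
	spin_unlock_irqrestore(&qp->s_lock, flags);
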
diff --git a/drivers/infiniband/hw/qib/qib_sd7220.c b/drivers/infiniband/hw/qib/qib_sd7220.c
index 12caf3d..4f4a09c 100644
--- a/drivers/infiniband/hw/qib/qib_sd7220.c
+++ b/drivers/infiniband/hw/qib/qib_sd7220.c
@@ -1068,7 +1068,6 @@
 	for (idx = 0; idx < NUM_DDS_REGS; ++idx) {
 		data = ((dds_reg_map & 0xF) << 4) | TX_FAST_ELT;
 		writeq(data, iaddr + idx);
-		mmiowb();
 		qib_read_kreg32(dd, kr_scratch);
 		dds_reg_map >>= 4;
 		for (midx = 0; midx < DDS_ROWS; ++midx) {
@@ -1076,7 +1075,6 @@
 
 			data = dds_init_vals[midx].reg_vals[idx];
 			writeq(data, daddr);
-			mmiowb();
 			qib_read_kreg32(dd, kr_scratch);
 		} /* End inner for (vals for this reg, each row) */
 	} /* end outer for (regs to be stored) */
@@ -1098,13 +1096,11 @@
 		didx = idx + min_idx;
 		/* Store the next RXEQ register address */
 		writeq(rxeq_init_vals[idx].rdesc, iaddr + didx);
-		mmiowb();
 		qib_read_kreg32(dd, kr_scratch);
 		/* Iterate through RXEQ values */
 		for (vidx = 0; vidx < 4; vidx++) {
 			data = rxeq_init_vals[idx].rdata[vidx];
 			writeq(data, taddr + (vidx << 6) + idx);
-			mmiowb();
 			qib_read_kreg32(dd, kr_scratch);
 		}
 	} /* end outer for (Reg-writes for RXEQ) */
diff --git a/drivers/infiniband/hw/qib/qib_sdma.c b/drivers/infiniband/hw/qib/qib_sdma.c
index d0723d4..99e11c3 100644
--- a/drivers/infiniband/hw/qib/qib_sdma.c
+++ b/drivers/infiniband/hw/qib/qib_sdma.c
@@ -565,19 +565,15 @@
 	sge = &ss->sge;
 	while (dwords) {
 		u32 dw;
-		u32 len;
+		u32 len = rvt_get_sge_length(sge, dwords << 2);
 
-		len = dwords << 2;
-		if (len > sge->length)
-			len = sge->length;
-		if (len > sge->sge_length)
-			len = sge->sge_length;
-		BUG_ON(len == 0);
 		dw = (len + 3) >> 2;
 		addr = dma_map_single(&ppd->dd->pcidev->dev, sge->vaddr,
 				      dw << 2, DMA_TO_DEVICE);
-		if (dma_mapping_error(&ppd->dd->pcidev->dev, addr))
+		if (dma_mapping_error(&ppd->dd->pcidev->dev, addr)) {
+			ret = -ENOMEM;
 			goto unmap;
+		}
 		sdmadesc[0] = 0;
 		make_sdma_desc(ppd, sdmadesc, (u64) addr, dw, dwoffset);
 		/* SDmaUseLargeBuf has to be set in every descriptor */
@@ -593,24 +589,7 @@
 			descqp = &ppd->sdma_descq[0].qw[0];
 			++ppd->sdma_generation;
 		}
-		sge->vaddr += len;
-		sge->length -= len;
-		sge->sge_length -= len;
-		if (sge->sge_length == 0) {
-			if (--ss->num_sge)
-				*sge = *ss->sg_list++;
-		} else if (sge->length == 0 && sge->mr->lkey) {
-			if (++sge->n >= RVT_SEGSZ) {
-				if (++sge->m >= sge->mr->mapsz)
-					break;
-				sge->n = 0;
-			}
-			sge->vaddr =
-				sge->mr->map[sge->m]->segs[sge->n].vaddr;
-			sge->length =
-				sge->mr->map[sge->m]->segs[sge->n].length;
-		}
-
+		rvt_update_sge(ss, len, false);
 		dwoffset += dw;
 		dwords -= dw;
 	}
@@ -651,7 +630,7 @@
 		if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)
 			rvt_error_qp(qp, IB_WC_GENERAL_ERR);
 	} else if (qp->s_wqe)
-		qib_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
+		rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
 	spin_unlock(&qp->s_lock);
 	spin_unlock(&qp->r_lock);
 	/* return zero to process the next send work request */
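
The SDMA hunk above is the same SGE-walk consolidation: rvt_get_sge_length() clamps a request to the current SGE and rvt_update_sge() advances it, stepping to the next SGE or MR segment when one is exhausted. Generic sketch (hypothetical emit callback, assuming <rdma/rdmavt_qp.h>):

	static void example_walk(struct rvt_sge_state *ss, u32 dwords,
				 void (*emit)(void *vaddr, u32 bytes))
	{
		while (dwords) {
			u32 len = rvt_get_sge_length(&ss->sge, dwords << 2);
			u32 dw = (len + 3) >> 2;

			emit(ss->sge.vaddr, dw << 2);
			rvt_update_sge(ss, len, false);
			dwords -= dw;
		}
	}
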
diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c
index ca2638d..3926be7 100644
--- a/drivers/infiniband/hw/qib/qib_sysfs.c
+++ b/drivers/infiniband/hw/qib/qib_sysfs.c
@@ -436,6 +436,7 @@
 QIB_DIAGC_ATTR(unaligned);
 QIB_DIAGC_ATTR(rc_dupreq);
 QIB_DIAGC_ATTR(rc_seqnak);
+QIB_DIAGC_ATTR(rc_crwaits);
 
 static struct attribute *diagc_default_attributes[] = {
 	&qib_diagc_attr_rc_resends.attr,
@@ -453,6 +454,7 @@
 	&qib_diagc_attr_unaligned.attr,
 	&qib_diagc_attr_rc_dupreq.attr,
 	&qib_diagc_attr_rc_seqnak.attr,
+	&qib_diagc_attr_rc_crwaits.attr,
 	NULL
 };
 
@@ -551,20 +553,21 @@
  * Start of per-unit (or driver, in some cases, but replicated
  * per unit) functions (these get a device *)
  */
-static ssize_t show_rev(struct device *device, struct device_attribute *attr,
-			char *buf)
+static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr,
+			   char *buf)
 {
 	struct qib_ibdev *dev =
-		container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
 
 	return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
 }
+static DEVICE_ATTR_RO(hw_rev);
 
-static ssize_t show_hca(struct device *device, struct device_attribute *attr,
-			char *buf)
+static ssize_t hca_type_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
 {
 	struct qib_ibdev *dev =
-		container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
 	struct qib_devdata *dd = dd_from_dev(dev);
 	int ret;
 
@@ -574,43 +577,46 @@
 		ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname);
 	return ret;
 }
+static DEVICE_ATTR_RO(hca_type);
+static DEVICE_ATTR(board_id, 0444, hca_type_show, NULL);
 
-static ssize_t show_version(struct device *device,
+static ssize_t version_show(struct device *device,
 			    struct device_attribute *attr, char *buf)
 {
 	/* The string printed here is already newline-terminated. */
 	return scnprintf(buf, PAGE_SIZE, "%s", (char *)ib_qib_version);
 }
+static DEVICE_ATTR_RO(version);
 
-static ssize_t show_boardversion(struct device *device,
+static ssize_t boardversion_show(struct device *device,
 				 struct device_attribute *attr, char *buf)
 {
 	struct qib_ibdev *dev =
-		container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
 	struct qib_devdata *dd = dd_from_dev(dev);
 
 	/* The string printed here is already newline-terminated. */
 	return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion);
 }
+static DEVICE_ATTR_RO(boardversion);
 
-
-static ssize_t show_localbus_info(struct device *device,
+static ssize_t localbus_info_show(struct device *device,
 				  struct device_attribute *attr, char *buf)
 {
 	struct qib_ibdev *dev =
-		container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
 	struct qib_devdata *dd = dd_from_dev(dev);
 
 	/* The string printed here is already newline-terminated. */
 	return scnprintf(buf, PAGE_SIZE, "%s", dd->lbus_info);
 }
+static DEVICE_ATTR_RO(localbus_info);
 
-
-static ssize_t show_nctxts(struct device *device,
+static ssize_t nctxts_show(struct device *device,
 			   struct device_attribute *attr, char *buf)
 {
 	struct qib_ibdev *dev =
-		container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
 	struct qib_devdata *dd = dd_from_dev(dev);
 
 	/* Return the number of user ports (contexts) available. */
@@ -620,23 +626,25 @@
 			(dd->first_user_ctxt > dd->cfgctxts) ? 0 :
 			(dd->cfgctxts - dd->first_user_ctxt));
 }
+static DEVICE_ATTR_RO(nctxts);
 
-static ssize_t show_nfreectxts(struct device *device,
-			   struct device_attribute *attr, char *buf)
+static ssize_t nfreectxts_show(struct device *device,
+			       struct device_attribute *attr, char *buf)
 {
 	struct qib_ibdev *dev =
-		container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
 	struct qib_devdata *dd = dd_from_dev(dev);
 
 	/* Return the number of free user ports (contexts) available. */
 	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts);
 }
+static DEVICE_ATTR_RO(nfreectxts);
 
-static ssize_t show_serial(struct device *device,
+static ssize_t serial_show(struct device *device,
 			   struct device_attribute *attr, char *buf)
 {
 	struct qib_ibdev *dev =
-		container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
 	struct qib_devdata *dd = dd_from_dev(dev);
 
 	buf[sizeof(dd->serial)] = '\0';
@@ -644,13 +652,14 @@
 	strcat(buf, "\n");
 	return strlen(buf);
 }
+static DEVICE_ATTR_RO(serial);
 
-static ssize_t store_chip_reset(struct device *device,
+static ssize_t chip_reset_store(struct device *device,
 				struct device_attribute *attr, const char *buf,
 				size_t count)
 {
 	struct qib_ibdev *dev =
-		container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
 	struct qib_devdata *dd = dd_from_dev(dev);
 	int ret;
 
@@ -663,15 +672,16 @@
 bail:
 	return ret < 0 ? ret : count;
 }
+static DEVICE_ATTR_WO(chip_reset);
 
 /*
  * Dump tempsense regs. in decimal, to ease shell-scripts.
  */
-static ssize_t show_tempsense(struct device *device,
+static ssize_t tempsense_show(struct device *device,
 			      struct device_attribute *attr, char *buf)
 {
 	struct qib_ibdev *dev =
-		container_of(device, struct qib_ibdev, rdi.ibdev.dev);
+		rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev);
 	struct qib_devdata *dd = dd_from_dev(dev);
 	int ret;
 	int idx;
@@ -695,6 +705,7 @@
 				*(signed char *)(regvals + 7));
 	return ret;
 }
+static DEVICE_ATTR_RO(tempsense);
 
 /*
  * end of per-unit (or driver, in some cases, but replicated
@@ -702,30 +713,23 @@
  */
 
 /* start of per-unit file structures and support code */
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL);
-static DEVICE_ATTR(version, S_IRUGO, show_version, NULL);
-static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL);
-static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL);
-static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
-static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
-static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL);
-static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL);
-static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset);
+static struct attribute *qib_attributes[] = {
+	&dev_attr_hw_rev.attr,
+	&dev_attr_hca_type.attr,
+	&dev_attr_board_id.attr,
+	&dev_attr_version.attr,
+	&dev_attr_nctxts.attr,
+	&dev_attr_nfreectxts.attr,
+	&dev_attr_serial.attr,
+	&dev_attr_boardversion.attr,
+	&dev_attr_tempsense.attr,
+	&dev_attr_localbus_info.attr,
+	&dev_attr_chip_reset.attr,
+	NULL,
+};
 
-static struct device_attribute *qib_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_hca_type,
-	&dev_attr_board_id,
-	&dev_attr_version,
-	&dev_attr_nctxts,
-	&dev_attr_nfreectxts,
-	&dev_attr_serial,
-	&dev_attr_boardversion,
-	&dev_attr_tempsense,
-	&dev_attr_localbus_info,
-	&dev_attr_chip_reset,
+const struct attribute_group qib_attr_group = {
+	.attrs = qib_attributes,
 };
 
 int qib_create_port_files(struct ib_device *ibdev, u8 port_num,
@@ -827,27 +831,6 @@
 }
 
 /*
- * Register and create our files in /sys/class/infiniband.
- */
-int qib_verbs_register_sysfs(struct qib_devdata *dd)
-{
-	struct ib_device *dev = &dd->verbs_dev.rdi.ibdev;
-	int i, ret;
-
-	for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i) {
-		ret = device_create_file(&dev->dev, qib_attributes[i]);
-		if (ret)
-			goto bail;
-	}
-
-	return 0;
-bail:
-	for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i)
-		device_remove_file(&dev->dev, qib_attributes[i]);
-	return ret;
-}
-
-/*
  * Unregister and remove our files in /sys/class/infiniband.
  */
 void qib_verbs_unregister_sysfs(struct qib_devdata *dd)
diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c
index 3e54bc1..e17b91e 100644
--- a/drivers/infiniband/hw/qib/qib_uc.c
+++ b/drivers/infiniband/hw/qib/qib_uc.c
@@ -68,7 +68,7 @@
 			goto bail;
 		}
 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-		qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+		rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
 		goto done;
 	}
 
@@ -359,7 +359,7 @@
 		qp->r_rcv_len += pmtu;
 		if (unlikely(qp->r_rcv_len > qp->r_len))
 			goto rewind;
-		qib_copy_sge(&qp->r_sge, data, pmtu, 0);
+		rvt_copy_sge(qp, &qp->r_sge, data, pmtu, false, false);
 		break;
 
 	case OP(SEND_LAST_WITH_IMMEDIATE):
@@ -385,7 +385,7 @@
 		if (unlikely(wc.byte_len > qp->r_len))
 			goto rewind;
 		wc.opcode = IB_WC_RECV;
-		qib_copy_sge(&qp->r_sge, data, tlen, 0);
+		rvt_copy_sge(qp, &qp->r_sge, data, tlen, false, false);
 		rvt_put_ss(&qp->s_rdma_read_sge);
 last_imm:
 		wc.wr_id = qp->r_wr_id;
@@ -400,8 +400,7 @@
 		wc.dlid_path_bits = 0;
 		wc.port_num = 0;
 		/* Signal completion event if the solicited bit is set. */
-		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-			     ib_bth_is_solicited(ohdr));
+		rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
 		break;
 
 	case OP(RDMA_WRITE_FIRST):
@@ -449,7 +448,7 @@
 		qp->r_rcv_len += pmtu;
 		if (unlikely(qp->r_rcv_len > qp->r_len))
 			goto drop;
-		qib_copy_sge(&qp->r_sge, data, pmtu, 1);
+		rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false);
 		break;
 
 	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
@@ -479,7 +478,7 @@
 		}
 		wc.byte_len = qp->r_len;
 		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-		qib_copy_sge(&qp->r_sge, data, tlen, 1);
+		rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false);
 		rvt_put_ss(&qp->r_sge);
 		goto last_imm;
 
@@ -495,7 +494,7 @@
 		tlen -= (hdrsize + pad + 4);
 		if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
 			goto drop;
-		qib_copy_sge(&qp->r_sge, data, tlen, 1);
+		rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false);
 		rvt_put_ss(&qp->r_sge);
 		break;
 
diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c
index f8d029a..93ca213 100644
--- a/drivers/infiniband/hw/qib/qib_ud.c
+++ b/drivers/infiniband/hw/qib/qib_ud.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2012 - 2019 Intel Corporation.  All rights reserved.
  * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
  *
@@ -63,7 +64,7 @@
 	enum ib_qp_type sqptype, dqptype;
 
 	rcu_read_lock();
-	qp = rvt_lookup_qpn(rdi, &ibp->rvp, swqe->ud_wr.remote_qpn);
+	qp = rvt_lookup_qpn(rdi, &ibp->rvp, rvt_get_swqe_remote_qpn(swqe));
 	if (!qp) {
 		ibp->rvp.n_pkt_drops++;
 		goto drop;
@@ -80,7 +81,7 @@
 		goto drop;
 	}
 
-	ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr;
+	ah_attr = rvt_get_swqe_ah_attr(swqe);
 	ppd = ppd_from_ibp(ibp);
 
 	if (qp->ibqp.qp_num > 1) {
@@ -110,8 +111,8 @@
 	if (qp->ibqp.qp_num) {
 		u32 qkey;
 
-		qkey = (int)swqe->ud_wr.remote_qkey < 0 ?
-			sqp->qkey : swqe->ud_wr.remote_qkey;
+		qkey = (int)rvt_get_swqe_remote_qkey(swqe) < 0 ?
+			sqp->qkey : rvt_get_swqe_remote_qkey(swqe);
 		if (unlikely(qkey != qp->qkey))
 			goto drop;
 	}
@@ -162,8 +163,8 @@
 		const struct ib_global_route *grd = rdma_ah_read_grh(ah_attr);
 
 		qib_make_grh(ibp, &grh, grd, 0, 0);
-		qib_copy_sge(&qp->r_sge, &grh,
-			     sizeof(grh), 1);
+		rvt_copy_sge(qp, &qp->r_sge, &grh,
+			     sizeof(grh), true, false);
 		wc.wc_flags |= IB_WC_GRH;
 	} else
 		rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true);
@@ -172,14 +173,9 @@
 	ssge.num_sge = swqe->wr.num_sge;
 	sge = &ssge.sge;
 	while (length) {
-		u32 len = sge->length;
+		u32 len = rvt_get_sge_length(sge, length);
 
-		if (len > length)
-			len = length;
-		if (len > sge->sge_length)
-			len = sge->sge_length;
-		BUG_ON(len == 0);
-		qib_copy_sge(&qp->r_sge, sge->vaddr, len, 1);
+		rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false);
 		sge->vaddr += len;
 		sge->length -= len;
 		sge->sge_length -= len;
@@ -208,15 +204,14 @@
 	wc.qp = &qp->ibqp;
 	wc.src_qp = sqp->ibqp.qp_num;
 	wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ?
-		swqe->ud_wr.pkey_index : 0;
+		rvt_get_swqe_pkey_index(swqe) : 0;
 	wc.slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) &
 				((1 << ppd->lmc) - 1));
 	wc.sl = rdma_ah_get_sl(ah_attr);
 	wc.dlid_path_bits = rdma_ah_get_dlid(ah_attr) & ((1 << ppd->lmc) - 1);
 	wc.port_num = qp->port_num;
 	/* Signal completion event if the solicited bit is set. */
-	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-		     swqe->wr.send_flags & IB_SEND_SOLICITED);
+	rvt_recv_cq(qp, &wc, swqe->wr.send_flags & IB_SEND_SOLICITED);
 	ibp->rvp.n_loop_pkts++;
 bail_unlock:
 	spin_unlock_irqrestore(&qp->r_lock, flags);
@@ -260,7 +255,7 @@
 			goto bail;
 		}
 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-		qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+		rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
 		goto done;
 	}
 
@@ -276,7 +271,7 @@
 	/* Construct the header. */
 	ibp = to_iport(qp->ibqp.device, qp->port_num);
 	ppd = ppd_from_ibp(ibp);
-	ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
+	ah_attr = rvt_get_swqe_ah_attr(wqe);
 	if (rdma_ah_get_dlid(ah_attr) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) {
 		if (rdma_ah_get_dlid(ah_attr) !=
 				be16_to_cpu(IB_LID_PERMISSIVE))
@@ -304,7 +299,7 @@
 			qib_ud_loopback(qp, wqe);
 			spin_lock_irqsave(&qp->s_lock, tflags);
 			*flags = tflags;
-			qib_send_complete(qp, wqe, IB_WC_SUCCESS);
+			rvt_send_complete(qp, wqe, IB_WC_SUCCESS);
 			goto done;
 		}
 	}
@@ -368,7 +363,7 @@
 	bth0 |= extra_bytes << 20;
 	bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? QIB_DEFAULT_P_KEY :
 		qib_get_pkey(ibp, qp->ibqp.qp_type == IB_QPT_GSI ?
-			     wqe->ud_wr.pkey_index : qp->s_pkey_index);
+			     rvt_get_swqe_pkey_index(wqe) : qp->s_pkey_index);
 	ohdr->bth[0] = cpu_to_be32(bth0);
 	/*
 	 * Use the multicast QP if the destination LID is a multicast LID.
@@ -377,14 +372,15 @@
 			be16_to_cpu(IB_MULTICAST_LID_BASE) &&
 		rdma_ah_get_dlid(ah_attr) != be16_to_cpu(IB_LID_PERMISSIVE) ?
 		cpu_to_be32(QIB_MULTICAST_QPN) :
-		cpu_to_be32(wqe->ud_wr.remote_qpn);
+		cpu_to_be32(rvt_get_swqe_remote_qpn(wqe));
 	ohdr->bth[2] = cpu_to_be32(wqe->psn & QIB_PSN_MASK);
 	/*
 	 * Qkeys with the high order bit set mean use the
 	 * qkey from the QP context instead of the WR (see 10.2.5).
 	 */
-	ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ?
-					 qp->qkey : wqe->ud_wr.remote_qkey);
+	ohdr->u.ud.deth[0] =
+		cpu_to_be32((int)rvt_get_swqe_remote_qkey(wqe) < 0 ? qp->qkey :
+			    rvt_get_swqe_remote_qkey(wqe));
 	ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
 
 done:
@@ -513,7 +509,6 @@
 	    opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
 		wc.ex.imm_data = ohdr->u.ud.imm_data;
 		wc.wc_flags = IB_WC_WITH_IMM;
-		tlen -= sizeof(u32);
 	} else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
 		wc.ex.imm_data = 0;
 		wc.wc_flags = 0;
@@ -551,12 +546,13 @@
 		goto drop;
 	}
 	if (has_grh) {
-		qib_copy_sge(&qp->r_sge, &hdr->u.l.grh,
-			     sizeof(struct ib_grh), 1);
+		rvt_copy_sge(qp, &qp->r_sge, &hdr->u.l.grh,
+			     sizeof(struct ib_grh), true, false);
 		wc.wc_flags |= IB_WC_GRH;
 	} else
 		rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true);
-	qib_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1);
+	rvt_copy_sge(qp, &qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh),
+		     true, false);
 	rvt_put_ss(&qp->r_sge);
 	if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
 		return;
@@ -578,8 +574,7 @@
 		dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1);
 	wc.port_num = qp->port_num;
 	/* Signal completion event if the solicited bit is set. */
-	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-		     ib_bth_is_solicited(ohdr));
+	rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
 	return;
 
 drop:
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
index 16543d5..6bf764e 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -40,50 +40,7 @@
 static void __qib_release_user_pages(struct page **p, size_t num_pages,
 				     int dirty)
 {
-	size_t i;
-
-	for (i = 0; i < num_pages; i++) {
-		if (dirty)
-			set_page_dirty_lock(p[i]);
-		put_page(p[i]);
-	}
-}
-
-/*
- * Call with current->mm->mmap_sem held.
- */
-static int __qib_get_user_pages(unsigned long start_page, size_t num_pages,
-				struct page **p)
-{
-	unsigned long lock_limit;
-	size_t got;
-	int ret;
-
-	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-	if (num_pages > lock_limit && !capable(CAP_IPC_LOCK)) {
-		ret = -ENOMEM;
-		goto bail;
-	}
-
-	for (got = 0; got < num_pages; got += ret) {
-		ret = get_user_pages(start_page + got * PAGE_SIZE,
-				     num_pages - got,
-				     FOLL_WRITE | FOLL_FORCE,
-				     p + got, NULL);
-		if (ret < 0)
-			goto bail_release;
-	}
-
-	current->mm->pinned_vm += num_pages;
-
-	ret = 0;
-	goto bail;
-
-bail_release:
-	__qib_release_user_pages(p, got, 0);
-bail:
-	return ret;
+	put_user_pages_dirty_lock(p, num_pages, dirty);
 }
 
 /**
@@ -137,26 +94,44 @@
 int qib_get_user_pages(unsigned long start_page, size_t num_pages,
 		       struct page **p)
 {
+	unsigned long locked, lock_limit;
+	size_t got;
 	int ret;
 
-	down_write(&current->mm->mmap_sem);
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	locked = atomic64_add_return(num_pages, &current->mm->pinned_vm);
 
-	ret = __qib_get_user_pages(start_page, num_pages, p);
+	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+		ret = -ENOMEM;
+		goto bail;
+	}
 
-	up_write(&current->mm->mmap_sem);
+	down_read(&current->mm->mmap_sem);
+	for (got = 0; got < num_pages; got += ret) {
+		ret = get_user_pages(start_page + got * PAGE_SIZE,
+				     num_pages - got,
+				     FOLL_LONGTERM | FOLL_WRITE | FOLL_FORCE,
+				     p + got, NULL);
+		if (ret < 0) {
+			up_read(&current->mm->mmap_sem);
+			goto bail_release;
+		}
+	}
+	up_read(&current->mm->mmap_sem);
 
+	return 0;
+bail_release:
+	__qib_release_user_pages(p, got, 0);
+bail:
+	atomic64_sub(num_pages, &current->mm->pinned_vm);
 	return ret;
 }
 
 void qib_release_user_pages(struct page **p, size_t num_pages)
 {
-	if (current->mm) /* during close after signal, mm can be NULL */
-		down_write(&current->mm->mmap_sem);
-
 	__qib_release_user_pages(p, num_pages, 1);
 
-	if (current->mm) {
-		current->mm->pinned_vm -= num_pages;
-		up_write(&current->mm->mmap_sem);
-	}
+	/* during close after signal, mm can be NULL */
+	if (current->mm)
+		atomic64_sub(num_pages, &current->mm->pinned_vm);
 }
diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c
index 926f3c8..05190ed 100644
--- a/drivers/infiniband/hw/qib/qib_user_sdma.c
+++ b/drivers/infiniband/hw/qib/qib_user_sdma.c
@@ -225,8 +225,6 @@
 	if (sdma_rb_node) {
 		sdma_rb_node->refcount++;
 	} else {
-		int ret;
-
 		sdma_rb_node = kmalloc(sizeof(
 			struct qib_user_sdma_rb_node), GFP_KERNEL);
 		if (!sdma_rb_node)
@@ -235,9 +233,7 @@
 		sdma_rb_node->refcount = 1;
 		sdma_rb_node->pid = current->pid;
 
-		ret = qib_user_sdma_rb_insert(&qib_user_sdma_rb_root,
-					sdma_rb_node);
-		BUG_ON(ret == 0);
+		qib_user_sdma_rb_insert(&qib_user_sdma_rb_root, sdma_rb_node);
 	}
 	pq->sdma_rb_node = sdma_rb_node;
 
@@ -321,7 +317,7 @@
 		 * the caller can ignore this page.
 		 */
 		if (put) {
-			put_page(page);
+			put_user_page(page);
 		} else {
 			/* coalesce case */
 			kunmap(page);
@@ -635,7 +631,7 @@
 			kunmap(pkt->addr[i].page);
 
 		if (pkt->addr[i].put_page)
-			put_page(pkt->addr[i].page);
+			put_user_page(pkt->addr[i].page);
 		else
 			__free_page(pkt->addr[i].page);
 	} else if (pkt->addr[i].kvaddr) {
@@ -674,7 +670,7 @@
 		else
 			j = npages;
 
-		ret = get_user_pages_fast(addr, j, 0, pages);
+		ret = get_user_pages_fast(addr, j, FOLL_LONGTERM, pages);
 		if (ret != j) {
 			i = 0;
 			j = ret;
@@ -710,7 +706,7 @@
 	/* if error, return all pages not managed by pkt */
 free_pages:
 	while (i < j)
-		put_page(pages[i++]);
+		put_user_page(pages[i++]);
 
 done:
 	return ret;
@@ -908,10 +904,11 @@
 		}
 
 		if (frag_size) {
-			int pktsize, tidsmsize, n;
+			int tidsmsize, n;
+			size_t pktsize;
 
 			n = npages*((2*PAGE_SIZE/frag_size)+1);
-			pktsize = sizeof(*pkt) + sizeof(pkt->addr[0])*n;
+			pktsize = struct_size(pkt, addr, n);
 
 			/*
 			 * Determine if this is tid-sdma or just sdma.
diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c
index 41babbc..33778d4 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.c
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -131,27 +131,6 @@
  */
 __be64 ib_qib_sys_image_guid;
 
-/**
- * qib_copy_sge - copy data to SGE memory
- * @ss: the SGE state
- * @data: the data to copy
- * @length: the length of the data
- */
-void qib_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, int release)
-{
-	struct rvt_sge *sge = &ss->sge;
-
-	while (length) {
-		u32 len = rvt_get_sge_length(sge, length);
-
-		WARN_ON_ONCE(len == 0);
-		memcpy(sge->vaddr, data, len);
-		rvt_update_sge(ss, len, release);
-		data += len;
-		length -= len;
-	}
-}
-
 /*
  * Count the number of DMA descriptors needed to send length bytes of data.
  * Don't modify the qib_sge_state to get the count.
@@ -165,13 +144,8 @@
 	u32 ndesc = 1;  /* count the header */
 
 	while (length) {
-		u32 len = sge.length;
+		u32 len = rvt_get_sge_length(&sge, length);
 
-		if (len > length)
-			len = length;
-		if (len > sge.sge_length)
-			len = sge.sge_length;
-		BUG_ON(len == 0);
 		if (((long) sge.vaddr & (sizeof(u32) - 1)) ||
 		    (len != length && (len & (sizeof(u32) - 1)))) {
 			ndesc = 0;
@@ -208,13 +182,8 @@
 	struct rvt_sge *sge = &ss->sge;
 
 	while (length) {
-		u32 len = sge->length;
+		u32 len = rvt_get_sge_length(sge, length);
 
-		if (len > length)
-			len = length;
-		if (len > sge->sge_length)
-			len = sge->sge_length;
-		BUG_ON(len == 0);
 		memcpy(data, sge->vaddr, len);
 		sge->vaddr += len;
 		sge->length -= len;
@@ -463,14 +432,9 @@
 	u32 last;
 
 	while (1) {
-		u32 len = ss->sge.length;
+		u32 len = rvt_get_sge_length(&ss->sge, length);
 		u32 off;
 
-		if (len > length)
-			len = length;
-		if (len > ss->sge.sge_length)
-			len = ss->sge.sge_length;
-		BUG_ON(len == 0);
 		/* If the source address is not aligned, try to align it. */
 		off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1);
 		if (off) {
@@ -752,7 +716,7 @@
 
 	spin_lock(&qp->s_lock);
 	if (tx->wqe)
-		qib_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
+		rvt_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
 	else if (qp->ibqp.qp_type == IB_QPT_RC) {
 		struct ib_header *hdr;
 
@@ -1025,7 +989,7 @@
 	}
 	if (qp->s_wqe) {
 		spin_lock_irqsave(&qp->s_lock, flags);
-		qib_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
+		rvt_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
 		spin_unlock_irqrestore(&qp->s_lock, flags);
 	} else if (qp->ibqp.qp_type == IB_QPT_RC) {
 		spin_lock_irqsave(&qp->s_lock, flags);
@@ -1386,7 +1350,7 @@
 	rcu_read_lock();
 	qp0 = rcu_dereference(ibp->rvp.qp[0]);
 	if (qp0)
-		ah = rdma_create_ah(qp0->ibqp.pd, &attr);
+		ah = rdma_create_ah(qp0->ibqp.pd, &attr, 0);
 	rcu_read_unlock();
 	return ah;
 }
@@ -1495,8 +1459,6 @@
 	rdi->dparms.props.max_cq = ib_qib_max_cqs;
 	rdi->dparms.props.max_cqe = ib_qib_max_cqes;
 	rdi->dparms.props.max_ah = ib_qib_max_ahs;
-	rdi->dparms.props.max_mr = rdi->lkey_table.max;
-	rdi->dparms.props.max_fmr = rdi->lkey_table.max;
 	rdi->dparms.props.max_map_per_fmr = 32767;
 	rdi->dparms.props.max_qp_rd_atom = QIB_MAX_RDMA_ATOMIC;
 	rdi->dparms.props.max_qp_init_rd_atom = 255;
@@ -1512,8 +1474,20 @@
 					rdi->dparms.props.max_mcast_grp;
 	/* post send table */
 	dd->verbs_dev.rdi.post_parms = qib_post_parms;
+
+	/* opcode translation table */
+	dd->verbs_dev.rdi.wc_opcode = ib_qib_wc_opcode;
 }
 
+static const struct ib_device_ops qib_dev_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_QIB,
+
+	.init_port = qib_create_port_files,
+	.modify_device = qib_modify_device,
+	.process_mad = qib_process_mad,
+};
+
 /**
  * qib_register_ib_device - register our device with the infiniband core
  * @dd: the device data structure
@@ -1572,12 +1546,9 @@
 	if (!ib_qib_sys_image_guid)
 		ib_qib_sys_image_guid = ppd->guid;
 
-	ibdev->owner = THIS_MODULE;
 	ibdev->node_guid = ppd->guid;
 	ibdev->phys_port_cnt = dd->num_pports;
 	ibdev->dev.parent = &dd->pcidev->dev;
-	ibdev->modify_device = qib_modify_device;
-	ibdev->process_mad = qib_process_mad;
 
 	snprintf(ibdev->node_desc, sizeof(ibdev->node_desc),
 		 "Intel Infiniband HCA %s", init_utsname()->nodename);
@@ -1585,10 +1556,9 @@
 	/*
 	 * Fill in rvt info object.
 	 */
-	dd->verbs_dev.rdi.driver_f.port_callback = qib_create_port_files;
 	dd->verbs_dev.rdi.driver_f.get_pci_dev = qib_get_pci_dev;
 	dd->verbs_dev.rdi.driver_f.check_ah = qib_check_ah;
-	dd->verbs_dev.rdi.driver_f.check_send_wqe = qib_check_send_wqe;
+	dd->verbs_dev.rdi.driver_f.setup_wqe = qib_check_send_wqe;
 	dd->verbs_dev.rdi.driver_f.notify_new_ah = qib_notify_new_ah;
 	dd->verbs_dev.rdi.driver_f.alloc_qpn = qib_alloc_qpn;
 	dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qib_qp_priv_alloc;
@@ -1631,6 +1601,7 @@
 	dd->verbs_dev.rdi.dparms.node = dd->assigned_node_id;
 	dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_IBA_IB;
 	dd->verbs_dev.rdi.dparms.max_mad_size = IB_MGMT_MAD_SIZE;
+	dd->verbs_dev.rdi.dparms.sge_copy_mode = RVT_SGE_COPY_MEMCPY;
 
 	qib_fill_device_attr(dd);
 
@@ -1642,19 +1613,15 @@
 			      i,
 			      dd->rcd[ctxt]->pkeys);
 	}
+	rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev, &qib_attr_group);
 
-	ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_QIB);
+	ib_set_device_ops(ibdev, &qib_dev_ops);
+	ret = rvt_register_device(&dd->verbs_dev.rdi);
 	if (ret)
 		goto err_tx;
 
-	ret = qib_verbs_register_sysfs(dd);
-	if (ret)
-		goto err_class;
-
 	return ret;
 
-err_class:
-	rvt_unregister_device(&dd->verbs_dev.rdi);
 err_tx:
 	while (!list_empty(&dev->txreq_free)) {
 		struct list_head *l = dev->txreq_free.next;
@@ -1716,14 +1683,14 @@
  * It is only used in post send, which doesn't hold
  * the s_lock.
  */
-void _qib_schedule_send(struct rvt_qp *qp)
+bool _qib_schedule_send(struct rvt_qp *qp)
 {
 	struct qib_ibport *ibp =
 		to_iport(qp->ibqp.device, qp->port_num);
 	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
 	struct qib_qp_priv *priv = qp->priv;
 
-	queue_work(ppd->qib_wq, &priv->s_work);
+	return queue_work(ppd->qib_wq, &priv->s_work);
 }
 
 /**
@@ -1733,8 +1700,9 @@
  * This schedules qp progress.  The s_lock
  * should be held.
  */
-void qib_schedule_send(struct rvt_qp *qp)
+bool qib_schedule_send(struct rvt_qp *qp)
 {
 	if (qib_send_ok(qp))
-		_qib_schedule_send(qp);
+		return _qib_schedule_send(qp);
+	return false;
 }
diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h
index 666613e..17bdf8a 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.h
+++ b/drivers/infiniband/hw/qib/qib_verbs.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012 - 2017 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2012 - 2018 Intel Corporation.  All rights reserved.
  * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
  *
@@ -46,7 +46,7 @@
 #include <rdma/ib_pack.h>
 #include <rdma/ib_user_verbs.h>
 #include <rdma/ib_hdrs.h>
-#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
 #include <rdma/rdmavt_cq.h>
 
 struct qib_ctxtdata;
@@ -223,8 +223,8 @@
 		 !(qp->s_flags & RVT_S_ANY_WAIT_SEND));
 }
 
-void _qib_schedule_send(struct rvt_qp *qp);
-void qib_schedule_send(struct rvt_qp *qp);
+bool _qib_schedule_send(struct rvt_qp *qp);
+bool qib_schedule_send(struct rvt_qp *qp);
 
 static inline int qib_pkey_ok(u16 pkey1, u16 pkey2)
 {
@@ -292,9 +292,6 @@
 int qib_verbs_send(struct rvt_qp *qp, struct ib_header *hdr,
 		   u32 hdrwords, struct rvt_sge_state *ss, u32 len);
 
-void qib_copy_sge(struct rvt_sge_state *ss, void *data, u32 length,
-		  int release);
-
 void qib_uc_rcv(struct qib_ibport *ibp, struct ib_header *hdr,
 		int has_grh, void *data, u32 tlen, struct rvt_qp *qp);
 
@@ -303,7 +300,8 @@
 
 int qib_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr);
 
-int qib_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+int qib_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe,
+		       bool *call_send);
 
 struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid);
 
@@ -333,9 +331,6 @@
 
 void qib_do_send(struct rvt_qp *qp);
 
-void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
-		       enum ib_wc_status status);
-
 void qib_send_rc_ack(struct rvt_qp *qp);
 
 int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags);
diff --git a/drivers/infiniband/hw/usnic/Kconfig b/drivers/infiniband/hw/usnic/Kconfig
index d1dae2a..c0847d9 100644
--- a/drivers/infiniband/hw/usnic/Kconfig
+++ b/drivers/infiniband/hw/usnic/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_USNIC
 	tristate "Verbs support for Cisco VIC"
 	depends on NETDEVICES && ETHERNET && INET && PCI && INTEL_IOMMU
diff --git a/drivers/infiniband/hw/usnic/Makefile b/drivers/infiniband/hw/usnic/Makefile
index 94ae7a1..f12a493 100644
--- a/drivers/infiniband/hw/usnic/Makefile
+++ b/drivers/infiniband/hw/usnic/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-ccflags-y := -Idrivers/net/ethernet/cisco/enic
+ccflags-y := -I $(srctree)/drivers/net/ethernet/cisco/enic
 
 obj-$(CONFIG_INFINIBAND_USNIC)+= usnic_verbs.o
 
diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.c b/drivers/infiniband/hw/usnic/usnic_debugfs.c
index 92dc66c..e5a3f02 100644
--- a/drivers/infiniband/hw/usnic/usnic_debugfs.c
+++ b/drivers/infiniband/hw/usnic/usnic_debugfs.c
@@ -113,42 +113,21 @@
 void usnic_debugfs_init(void)
 {
 	debugfs_root = debugfs_create_dir(DRV_NAME, NULL);
-	if (IS_ERR(debugfs_root)) {
-		usnic_err("Failed to create debugfs root dir, check if debugfs is enabled in kernel configuration\n");
-		goto out_clear_root;
-	}
 
 	flows_dentry = debugfs_create_dir("flows", debugfs_root);
-	if (IS_ERR_OR_NULL(flows_dentry)) {
-		usnic_err("Failed to create debugfs flow dir with err %ld\n",
-				PTR_ERR(flows_dentry));
-		goto out_free_root;
-	}
 
 	debugfs_create_file("build-info", S_IRUGO, debugfs_root,
 				NULL, &usnic_debugfs_buildinfo_ops);
-	return;
-
-out_free_root:
-	debugfs_remove_recursive(debugfs_root);
-out_clear_root:
-	debugfs_root = NULL;
 }
 
 void usnic_debugfs_exit(void)
 {
-	if (!debugfs_root)
-		return;
-
 	debugfs_remove_recursive(debugfs_root);
 	debugfs_root = NULL;
 }
 
 void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow)
 {
-	if (IS_ERR_OR_NULL(flows_dentry))
-		return;
-
 	scnprintf(qp_flow->dentry_name, sizeof(qp_flow->dentry_name),
 			"%u", qp_flow->flow->flow_id);
 	qp_flow->dbgfs_dentry = debugfs_create_file(qp_flow->dentry_name,
@@ -156,15 +135,9 @@
 							flows_dentry,
 							qp_flow,
 							&flowinfo_ops);
-	if (IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) {
-		usnic_err("Failed to create dbg fs entry for flow %u with error %ld\n",
-				qp_flow->flow->flow_id,
-				PTR_ERR(qp_flow->dbgfs_dentry));
-	}
 }
 
 void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow)
 {
-	if (!IS_ERR_OR_NULL(qp_flow->dbgfs_dentry))
-		debugfs_remove(qp_flow->dbgfs_dentry);
+	debugfs_remove(qp_flow->dbgfs_dentry);
 }
diff --git a/drivers/infiniband/hw/usnic/usnic_ib.h b/drivers/infiniband/hw/usnic/usnic_ib.h
index 525bf27..84dd682 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib.h
+++ b/drivers/infiniband/hw/usnic/usnic_ib.h
@@ -61,6 +61,10 @@
 	struct usnic_uiom_pd		*umem_pd;
 };
 
+struct usnic_ib_cq {
+	struct ib_cq			ibcq;
+};
+
 struct usnic_ib_mr {
 	struct ib_mr			ibmr;
 	struct usnic_uiom_reg		*umem;
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index f0538a4..c9abe1c 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -76,7 +76,7 @@
 static int usnic_ib_dump_vf_hdr(void *obj, char *buf, int buf_sz)
 {
 	struct usnic_ib_vf *vf = obj;
-	return scnprintf(buf, buf_sz, "PF: %s ", vf->pf->ib_dev.name);
+	return scnprintf(buf, buf_sz, "PF: %s ", dev_name(&vf->pf->ib_dev.dev));
 }
 /* End callback dump funcs */
 
@@ -89,9 +89,15 @@
 
 void usnic_ib_log_vf(struct usnic_ib_vf *vf)
 {
-	char buf[1000];
-	usnic_ib_dump_vf(vf, buf, sizeof(buf));
+	char *buf = kzalloc(1000, GFP_KERNEL);
+
+	if (!buf)
+		return;
+
+	usnic_ib_dump_vf(vf, buf, 1000);
 	usnic_dbg("%s\n", buf);
+
+	kfree(buf);
 }
 
 /* Start of netdev section */
@@ -138,7 +144,7 @@
 	netdev = us_ibdev->netdev;
 	switch (event) {
 	case NETDEV_REBOOT:
-		usnic_info("PF Reset on %s\n", us_ibdev->ib_dev.name);
+		usnic_info("PF Reset on %s\n", dev_name(&us_ibdev->ib_dev.dev));
 		usnic_ib_qp_grp_modify_active_to_err(us_ibdev);
 		ib_event.event = IB_EVENT_PORT_ERR;
 		ib_event.device = &us_ibdev->ib_dev;
@@ -151,7 +157,8 @@
 		if (!us_ibdev->ufdev->link_up &&
 				netif_carrier_ok(netdev)) {
 			usnic_fwd_carrier_up(us_ibdev->ufdev);
-			usnic_info("Link UP on %s\n", us_ibdev->ib_dev.name);
+			usnic_info("Link UP on %s\n",
+				   dev_name(&us_ibdev->ib_dev.dev));
 			ib_event.event = IB_EVENT_PORT_ACTIVE;
 			ib_event.device = &us_ibdev->ib_dev;
 			ib_event.element.port_num = 1;
@@ -159,7 +166,8 @@
 		} else if (us_ibdev->ufdev->link_up &&
 				!netif_carrier_ok(netdev)) {
 			usnic_fwd_carrier_down(us_ibdev->ufdev);
-			usnic_info("Link DOWN on %s\n", us_ibdev->ib_dev.name);
+			usnic_info("Link DOWN on %s\n",
+				   dev_name(&us_ibdev->ib_dev.dev));
 			usnic_ib_qp_grp_modify_active_to_err(us_ibdev);
 			ib_event.event = IB_EVENT_PORT_ERR;
 			ib_event.device = &us_ibdev->ib_dev;
@@ -168,17 +176,17 @@
 		} else {
 			usnic_dbg("Ignoring %s on %s\n",
 					netdev_cmd_to_name(event),
-					us_ibdev->ib_dev.name);
+					dev_name(&us_ibdev->ib_dev.dev));
 		}
 		break;
 	case NETDEV_CHANGEADDR:
 		if (!memcmp(us_ibdev->ufdev->mac, netdev->dev_addr,
 				sizeof(us_ibdev->ufdev->mac))) {
 			usnic_dbg("Ignoring addr change on %s\n",
-					us_ibdev->ib_dev.name);
+				  dev_name(&us_ibdev->ib_dev.dev));
 		} else {
 			usnic_info(" %s old mac: %pM new mac: %pM\n",
-					us_ibdev->ib_dev.name,
+					dev_name(&us_ibdev->ib_dev.dev),
 					us_ibdev->ufdev->mac,
 					netdev->dev_addr);
 			usnic_fwd_set_mac(us_ibdev->ufdev, netdev->dev_addr);
@@ -193,19 +201,19 @@
 	case NETDEV_CHANGEMTU:
 		if (us_ibdev->ufdev->mtu != netdev->mtu) {
 			usnic_info("MTU Change on %s old: %u new: %u\n",
-					us_ibdev->ib_dev.name,
+					dev_name(&us_ibdev->ib_dev.dev),
 					us_ibdev->ufdev->mtu, netdev->mtu);
 			usnic_fwd_set_mtu(us_ibdev->ufdev, netdev->mtu);
 			usnic_ib_qp_grp_modify_active_to_err(us_ibdev);
 		} else {
 			usnic_dbg("Ignoring MTU change on %s\n",
-					us_ibdev->ib_dev.name);
+				  dev_name(&us_ibdev->ib_dev.dev));
 		}
 		break;
 	default:
 		usnic_dbg("Ignoring event %s on %s",
 				netdev_cmd_to_name(event),
-				us_ibdev->ib_dev.name);
+				dev_name(&us_ibdev->ib_dev.dev));
 	}
 	mutex_unlock(&us_ibdev->usdev_lock);
 }
@@ -214,18 +222,17 @@
 					unsigned long event, void *ptr)
 {
 	struct usnic_ib_dev *us_ibdev;
+	struct ib_device *ibdev;
 
 	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
 
-	mutex_lock(&usnic_ib_ibdev_list_lock);
-	list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) {
-		if (us_ibdev->netdev == netdev) {
-			usnic_ib_handle_usdev_event(us_ibdev, event);
-			break;
-		}
-	}
-	mutex_unlock(&usnic_ib_ibdev_list_lock);
+	ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_USNIC);
+	if (!ibdev)
+		return NOTIFY_DONE;
 
+	us_ibdev = container_of(ibdev, struct usnic_ib_dev, ib_dev);
+	usnic_ib_handle_usdev_event(us_ibdev, event);
+	ib_device_put(ibdev);
 	return NOTIFY_DONE;
 }
 
@@ -267,7 +274,7 @@
 	default:
 		usnic_info("Ignoring event %s on %s",
 				netdev_cmd_to_name(event),
-				us_ibdev->ib_dev.name);
+				dev_name(&us_ibdev->ib_dev.dev));
 	}
 	mutex_unlock(&us_ibdev->usdev_lock);
 
@@ -280,16 +287,15 @@
 	struct usnic_ib_dev *us_ibdev;
 	struct in_ifaddr *ifa = ptr;
 	struct net_device *netdev = ifa->ifa_dev->dev;
+	struct ib_device *ibdev;
 
-	mutex_lock(&usnic_ib_ibdev_list_lock);
-	list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) {
-		if (us_ibdev->netdev == netdev) {
-			usnic_ib_handle_inet_event(us_ibdev, event, ptr);
-			break;
-		}
-	}
-	mutex_unlock(&usnic_ib_ibdev_list_lock);
+	ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_USNIC);
+	if (!ibdev)
+		return NOTIFY_DONE;
 
+	us_ibdev = container_of(ibdev, struct usnic_ib_dev, ib_dev);
+	usnic_ib_handle_inet_event(us_ibdev, event, ptr);
+	ib_device_put(ibdev);
 	return NOTIFY_DONE;
 }
 static struct notifier_block usnic_ib_inetaddr_notifier = {
@@ -328,6 +334,36 @@
 	snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", info.fw_version);
 }
 
+static const struct ib_device_ops usnic_dev_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_USNIC,
+	.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION,
+
+	.alloc_pd = usnic_ib_alloc_pd,
+	.alloc_ucontext = usnic_ib_alloc_ucontext,
+	.create_cq = usnic_ib_create_cq,
+	.create_qp = usnic_ib_create_qp,
+	.dealloc_pd = usnic_ib_dealloc_pd,
+	.dealloc_ucontext = usnic_ib_dealloc_ucontext,
+	.dereg_mr = usnic_ib_dereg_mr,
+	.destroy_cq = usnic_ib_destroy_cq,
+	.destroy_qp = usnic_ib_destroy_qp,
+	.get_dev_fw_str = usnic_get_dev_fw_str,
+	.get_link_layer = usnic_ib_port_link_layer,
+	.get_port_immutable = usnic_port_immutable,
+	.mmap = usnic_ib_mmap,
+	.modify_qp = usnic_ib_modify_qp,
+	.query_device = usnic_ib_query_device,
+	.query_gid = usnic_ib_query_gid,
+	.query_pkey = usnic_ib_query_pkey,
+	.query_port = usnic_ib_query_port,
+	.query_qp = usnic_ib_query_qp,
+	.reg_user_mr = usnic_ib_reg_mr,
+	INIT_RDMA_OBJ_SIZE(ib_pd, usnic_ib_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_cq, usnic_ib_cq, ibcq),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, usnic_ib_ucontext, ibucontext),
+};
+
 /* Start of PF discovery section */
 static void *usnic_ib_device_add(struct pci_dev *dev)
 {
@@ -335,11 +371,12 @@
 	union ib_gid gid;
 	struct in_device *ind;
 	struct net_device *netdev;
+	int ret;
 
 	usnic_dbg("\n");
 	netdev = pci_get_drvdata(dev);
 
-	us_ibdev = (struct usnic_ib_dev *)ib_alloc_device(sizeof(*us_ibdev));
+	us_ibdev = ib_alloc_device(usnic_ib_dev, ib_dev);
 	if (!us_ibdev) {
 		usnic_err("Device %s context alloc failed\n",
 				netdev_name(pci_get_drvdata(dev)));
@@ -358,13 +395,10 @@
 
 	us_ibdev->pdev = dev;
 	us_ibdev->netdev = pci_get_drvdata(dev);
-	us_ibdev->ib_dev.owner = THIS_MODULE;
 	us_ibdev->ib_dev.node_type = RDMA_NODE_USNIC_UDP;
 	us_ibdev->ib_dev.phys_port_cnt = USNIC_IB_PORT_CNT;
 	us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS;
 	us_ibdev->ib_dev.dev.parent = &dev->dev;
-	us_ibdev->ib_dev.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION;
-	strlcpy(us_ibdev->ib_dev.name, "usnic_%d", IB_DEVICE_NAME_MAX);
 
 	us_ibdev->ib_dev.uverbs_cmd_mask =
 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
@@ -385,38 +419,15 @@
 		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST) |
 		(1ull << IB_USER_VERBS_CMD_OPEN_QP);
 
-	us_ibdev->ib_dev.query_device = usnic_ib_query_device;
-	us_ibdev->ib_dev.query_port = usnic_ib_query_port;
-	us_ibdev->ib_dev.query_pkey = usnic_ib_query_pkey;
-	us_ibdev->ib_dev.query_gid = usnic_ib_query_gid;
-	us_ibdev->ib_dev.get_netdev = usnic_get_netdev;
-	us_ibdev->ib_dev.get_link_layer = usnic_ib_port_link_layer;
-	us_ibdev->ib_dev.alloc_pd = usnic_ib_alloc_pd;
-	us_ibdev->ib_dev.dealloc_pd = usnic_ib_dealloc_pd;
-	us_ibdev->ib_dev.create_qp = usnic_ib_create_qp;
-	us_ibdev->ib_dev.modify_qp = usnic_ib_modify_qp;
-	us_ibdev->ib_dev.query_qp = usnic_ib_query_qp;
-	us_ibdev->ib_dev.destroy_qp = usnic_ib_destroy_qp;
-	us_ibdev->ib_dev.create_cq = usnic_ib_create_cq;
-	us_ibdev->ib_dev.destroy_cq = usnic_ib_destroy_cq;
-	us_ibdev->ib_dev.reg_user_mr = usnic_ib_reg_mr;
-	us_ibdev->ib_dev.dereg_mr = usnic_ib_dereg_mr;
-	us_ibdev->ib_dev.alloc_ucontext = usnic_ib_alloc_ucontext;
-	us_ibdev->ib_dev.dealloc_ucontext = usnic_ib_dealloc_ucontext;
-	us_ibdev->ib_dev.mmap = usnic_ib_mmap;
-	us_ibdev->ib_dev.create_ah = usnic_ib_create_ah;
-	us_ibdev->ib_dev.destroy_ah = usnic_ib_destroy_ah;
-	us_ibdev->ib_dev.post_send = usnic_ib_post_send;
-	us_ibdev->ib_dev.post_recv = usnic_ib_post_recv;
-	us_ibdev->ib_dev.poll_cq = usnic_ib_poll_cq;
-	us_ibdev->ib_dev.req_notify_cq = usnic_ib_req_notify_cq;
-	us_ibdev->ib_dev.get_dma_mr = usnic_ib_get_dma_mr;
-	us_ibdev->ib_dev.get_port_immutable = usnic_port_immutable;
-	us_ibdev->ib_dev.get_dev_fw_str     = usnic_get_dev_fw_str;
+	ib_set_device_ops(&us_ibdev->ib_dev, &usnic_dev_ops);
 
+	rdma_set_device_sysfs_group(&us_ibdev->ib_dev, &usnic_attr_group);
 
-	us_ibdev->ib_dev.driver_id = RDMA_DRIVER_USNIC;
-	if (ib_register_device(&us_ibdev->ib_dev, NULL))
+	ret = ib_device_set_netdev(&us_ibdev->ib_dev, us_ibdev->netdev, 1);
+	if (ret)
+		goto err_fwd_dealloc;
+
+	if (ib_register_device(&us_ibdev->ib_dev, "usnic_%d"))
 		goto err_fwd_dealloc;
 
 	usnic_fwd_set_mtu(us_ibdev->ufdev, us_ibdev->netdev->mtu);
@@ -424,11 +435,16 @@
 	if (netif_carrier_ok(us_ibdev->netdev))
 		usnic_fwd_carrier_up(us_ibdev->ufdev);
 
-	ind = in_dev_get(netdev);
-	if (ind->ifa_list)
-		usnic_fwd_add_ipaddr(us_ibdev->ufdev,
-				     ind->ifa_list->ifa_address);
-	in_dev_put(ind);
+	rcu_read_lock();
+	ind = __in_dev_get_rcu(netdev);
+	if (ind) {
+		const struct in_ifaddr *ifa;
+
+		ifa = rcu_dereference(ind->ifa_list);
+		if (ifa)
+			usnic_fwd_add_ipaddr(us_ibdev->ufdev, ifa->ifa_address);
+	}
+	rcu_read_unlock();
 
 	usnic_mac_ip_to_gid(us_ibdev->netdev->perm_addr,
 				us_ibdev->ufdev->inaddr, &gid.raw[0]);
@@ -437,9 +453,9 @@
 	kref_init(&us_ibdev->vf_cnt);
 
 	usnic_info("Added ibdev: %s netdev: %s with mac %pM Link: %u MTU: %u\n",
-			us_ibdev->ib_dev.name, netdev_name(us_ibdev->netdev),
-			us_ibdev->ufdev->mac, us_ibdev->ufdev->link_up,
-			us_ibdev->ufdev->mtu);
+		   dev_name(&us_ibdev->ib_dev.dev),
+		   netdev_name(us_ibdev->netdev), us_ibdev->ufdev->mac,
+		   us_ibdev->ufdev->link_up, us_ibdev->ufdev->mtu);
 	return us_ibdev;
 
 err_fwd_dealloc:
@@ -452,7 +468,7 @@
 
 static void usnic_ib_device_remove(struct usnic_ib_dev *us_ibdev)
 {
-	usnic_info("Unregistering %s\n", us_ibdev->ib_dev.name);
+	usnic_info("Unregistering %s\n", dev_name(&us_ibdev->ib_dev.dev));
 	usnic_ib_sysfs_unregister_usdev(us_ibdev);
 	usnic_fwd_dev_free(us_ibdev->ufdev);
 	ib_unregister_device(&us_ibdev->ib_dev);
@@ -471,15 +487,17 @@
 				&usnic_ib_ibdev_list, ib_dev_link) {
 		if (us_ibdev->pdev == dev) {
 			list_del(&us_ibdev->ib_dev_link);
-			usnic_ib_device_remove(us_ibdev);
 			found = true;
 			break;
 		}
 	}
 
-	WARN(!found, "Failed to remove PF %s\n", pci_name(dev));
 
 	mutex_unlock(&usnic_ib_ibdev_list_lock);
+	if (found)
+		usnic_ib_device_remove(us_ibdev);
+	else
+		WARN(1, "Failed to remove PF %s\n", pci_name(dev));
 }
 
 static struct usnic_ib_dev *usnic_ib_discover_pf(struct usnic_vnic *vnic)
@@ -591,7 +609,7 @@
 	mutex_unlock(&pf->usdev_lock);
 
 	usnic_info("Registering usnic VF %s into PF %s\n", pci_name(pdev),
-			pf->ib_dev.name);
+		   dev_name(&pf->ib_dev.dev));
 	usnic_ib_log_vf(vf);
 	return 0;
 
@@ -646,7 +664,7 @@
 
 	err = usnic_uiom_init(DRV_NAME);
 	if (err) {
-		usnic_err("Unable to initalize umem with err %d\n", err);
+		usnic_err("Unable to initialize umem with err %d\n", err);
 		return err;
 	}
 
@@ -685,7 +703,6 @@
 out_pci_unreg:
 	pci_unregister_driver(&usnic_ib_pci_driver);
 out_umem_fini:
-	usnic_uiom_fini();
 
 	return err;
 }
@@ -698,7 +715,6 @@
 	unregister_inetaddr_notifier(&usnic_ib_inetaddr_notifier);
 	unregister_netdevice_notifier(&usnic_ib_netdevice_notifier);
 	pci_unregister_driver(&usnic_ib_pci_driver);
-	usnic_uiom_fini();
 }
 
 MODULE_DESCRIPTION("Cisco VIC (usNIC) Verbs Driver");
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c
index bf51365..0cdb156 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c
@@ -681,7 +681,7 @@
 	err = usnic_vnic_res_spec_satisfied(&min_transport_spec[transport],
 						res_spec);
 	if (err) {
-		usnic_err("Spec does not meet miniumum req for transport %d\n",
+		usnic_err("Spec does not meet minimum req for transport %d\n",
 				transport);
 		log_spec(res_spec);
 		return ERR_PTR(err);
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
index 4210ca1..c85d48a 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
@@ -46,12 +46,11 @@
 #include "usnic_ib_sysfs.h"
 #include "usnic_log.h"
 
-static ssize_t usnic_ib_show_board(struct device *device,
-					struct device_attribute *attr,
-					char *buf)
+static ssize_t board_id_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
 {
 	struct usnic_ib_dev *us_ibdev =
-		container_of(device, struct usnic_ib_dev, ib_dev.dev);
+		rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev);
 	unsigned short subsystem_device_id;
 
 	mutex_lock(&us_ibdev->usdev_lock);
@@ -60,22 +59,21 @@
 
 	return scnprintf(buf, PAGE_SIZE, "%hu\n", subsystem_device_id);
 }
+static DEVICE_ATTR_RO(board_id);
 
 /*
  * Report the configuration for this PF
  */
 static ssize_t
-usnic_ib_show_config(struct device *device, struct device_attribute *attr,
-			char *buf)
+config_show(struct device *device, struct device_attribute *attr, char *buf)
 {
-	struct usnic_ib_dev *us_ibdev;
+	struct usnic_ib_dev *us_ibdev =
+		rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev);
 	char *ptr;
 	unsigned left;
 	unsigned n;
 	enum usnic_vnic_res_type res_type;
 
-	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
-
 	/* Buffer space limit is 1 page */
 	ptr = buf;
 	left = PAGE_SIZE;
@@ -94,7 +92,7 @@
 
 		n = scnprintf(ptr, left,
 			"%s: %s:%d.%d, %s, %pM, %u VFs\n Per VF:",
-			us_ibdev->ib_dev.name,
+			dev_name(&us_ibdev->ib_dev.dev),
 			busname,
 			PCI_SLOT(us_ibdev->pdev->devfn),
 			PCI_FUNC(us_ibdev->pdev->devfn),
@@ -119,79 +117,75 @@
 		UPDATE_PTR_LEFT(n, ptr, left);
 	} else {
 		n = scnprintf(ptr, left, "%s: no VFs\n",
-				us_ibdev->ib_dev.name);
+				dev_name(&us_ibdev->ib_dev.dev));
 		UPDATE_PTR_LEFT(n, ptr, left);
 	}
 	mutex_unlock(&us_ibdev->usdev_lock);
 
 	return ptr - buf;
 }
+static DEVICE_ATTR_RO(config);
 
 static ssize_t
-usnic_ib_show_iface(struct device *device, struct device_attribute *attr,
-			char *buf)
+iface_show(struct device *device, struct device_attribute *attr, char *buf)
 {
-	struct usnic_ib_dev *us_ibdev;
-
-	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+	struct usnic_ib_dev *us_ibdev =
+		rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev);
 
 	return scnprintf(buf, PAGE_SIZE, "%s\n",
 			netdev_name(us_ibdev->netdev));
 }
+static DEVICE_ATTR_RO(iface);
 
 static ssize_t
-usnic_ib_show_max_vf(struct device *device, struct device_attribute *attr,
-			char *buf)
+max_vf_show(struct device *device, struct device_attribute *attr, char *buf)
 {
-	struct usnic_ib_dev *us_ibdev;
-
-	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+	struct usnic_ib_dev *us_ibdev =
+		rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev);
 
 	return scnprintf(buf, PAGE_SIZE, "%u\n",
 			kref_read(&us_ibdev->vf_cnt));
 }
+static DEVICE_ATTR_RO(max_vf);
 
 static ssize_t
-usnic_ib_show_qp_per_vf(struct device *device, struct device_attribute *attr,
-			char *buf)
+qp_per_vf_show(struct device *device, struct device_attribute *attr, char *buf)
 {
-	struct usnic_ib_dev *us_ibdev;
+	struct usnic_ib_dev *us_ibdev =
+		rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev);
 	int qp_per_vf;
 
-	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
 	qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ],
 			us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]);
 
 	return scnprintf(buf, PAGE_SIZE,
 				"%d\n", qp_per_vf);
 }
+static DEVICE_ATTR_RO(qp_per_vf);
 
 static ssize_t
-usnic_ib_show_cq_per_vf(struct device *device, struct device_attribute *attr,
-			char *buf)
+cq_per_vf_show(struct device *device, struct device_attribute *attr, char *buf)
 {
-	struct usnic_ib_dev *us_ibdev;
-
-	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+	struct usnic_ib_dev *us_ibdev =
+		rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev);
 
 	return scnprintf(buf, PAGE_SIZE, "%d\n",
 			us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]);
 }
+static DEVICE_ATTR_RO(cq_per_vf);
 
-static DEVICE_ATTR(board_id, S_IRUGO, usnic_ib_show_board, NULL);
-static DEVICE_ATTR(config, S_IRUGO, usnic_ib_show_config, NULL);
-static DEVICE_ATTR(iface, S_IRUGO, usnic_ib_show_iface, NULL);
-static DEVICE_ATTR(max_vf, S_IRUGO, usnic_ib_show_max_vf, NULL);
-static DEVICE_ATTR(qp_per_vf, S_IRUGO, usnic_ib_show_qp_per_vf, NULL);
-static DEVICE_ATTR(cq_per_vf, S_IRUGO, usnic_ib_show_cq_per_vf, NULL);
+static struct attribute *usnic_class_attributes[] = {
+	&dev_attr_board_id.attr,
+	&dev_attr_config.attr,
+	&dev_attr_iface.attr,
+	&dev_attr_max_vf.attr,
+	&dev_attr_qp_per_vf.attr,
+	&dev_attr_cq_per_vf.attr,
+	NULL
+};
 
-static struct device_attribute *usnic_class_attributes[] = {
-	&dev_attr_board_id,
-	&dev_attr_config,
-	&dev_attr_iface,
-	&dev_attr_max_vf,
-	&dev_attr_qp_per_vf,
-	&dev_attr_cq_per_vf,
+const struct attribute_group usnic_attr_group = {
+	.attrs = usnic_class_attributes,
 };
 
 struct qpn_attribute {
@@ -278,18 +272,6 @@
 
 int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev)
 {
-	int i;
-	int err;
-	for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) {
-		err = device_create_file(&us_ibdev->ib_dev.dev,
-						usnic_class_attributes[i]);
-		if (err) {
-			usnic_err("Failed to create device file %d for %s eith err %d",
-				i, us_ibdev->ib_dev.name, err);
-			return -EINVAL;
-		}
-	}
-
 	/* create kernel object for looking at individual QPs */
 	kobject_get(&us_ibdev->ib_dev.dev.kobj);
 	us_ibdev->qpn_kobj = kobject_create_and_add("qpn",
@@ -304,12 +286,6 @@
 
 void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev)
 {
-	int i;
-	for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) {
-		device_remove_file(&us_ibdev->ib_dev.dev,
-					usnic_class_attributes[i]);
-	}
-
 	kobject_put(us_ibdev->qpn_kobj);
 }
 
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h
index 3d98e16..b1f064c 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h
+++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h
@@ -41,4 +41,6 @@
 void usnic_ib_sysfs_qpn_add(struct usnic_ib_qp_grp *qp_grp);
 void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp);
 
+extern const struct attribute_group usnic_attr_group;
+
 #endif /* !USNIC_IB_SYSFS_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
index 9973ac8..556b8e4 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
@@ -37,6 +37,7 @@
 
 #include <rdma/ib_user_verbs.h>
 #include <rdma/ib_addr.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "usnic_abi.h"
 #include "usnic_ib.h"
@@ -159,7 +160,8 @@
 
 	err = ib_copy_to_udata(udata, &resp, sizeof(resp));
 	if (err) {
-		usnic_err("Failed to copy udata for %s", us_ibdev->ib_dev.name);
+		usnic_err("Failed to copy udata for %s",
+			  dev_name(&us_ibdev->ib_dev.dev));
 		return err;
 	}
 
@@ -192,12 +194,12 @@
 			return ERR_CAST(dev_list);
 		for (i = 0; dev_list[i]; i++) {
 			dev = dev_list[i];
-			vf = pci_get_drvdata(to_pci_dev(dev));
+			vf = dev_get_drvdata(dev);
 			spin_lock(&vf->lock);
 			vnic = vf->vnic;
 			if (!usnic_vnic_check_room(vnic, res_spec)) {
 				usnic_dbg("Found used vnic %s from %s\n",
-						us_ibdev->ib_dev.name,
+						dev_name(&us_ibdev->ib_dev.dev),
 						pci_name(usnic_vnic_get_pdev(
 									vnic)));
 				qp_grp = usnic_ib_qp_grp_create(us_ibdev->ufdev,
@@ -230,7 +232,8 @@
 		spin_unlock(&vf->lock);
 	}
 
-	usnic_info("No free qp grp found on %s\n", us_ibdev->ib_dev.name);
+	usnic_info("No free qp grp found on %s\n",
+		   dev_name(&us_ibdev->ib_dev.dev));
 	return ERR_PTR(-ENOMEM);
 
 qp_grp_check:
@@ -334,13 +337,16 @@
 
 	usnic_dbg("\n");
 
-	mutex_lock(&us_ibdev->usdev_lock);
 	if (ib_get_eth_speed(ibdev, port, &props->active_speed,
-			     &props->active_width)) {
-		mutex_unlock(&us_ibdev->usdev_lock);
+			     &props->active_width))
 		return -EINVAL;
-	}
 
+	/*
+	 * usdev_lock is acquired after (and not before) ib_get_eth_speed call
+	 * because acquiring rtnl_lock in ib_get_eth_speed, while holding
+	 * usdev_lock could lead to a deadlock.
+	 */
+	mutex_lock(&us_ibdev->usdev_lock);
 	/* props being zeroed by the caller, avoid zeroing it here */
 
 	props->lid = 0;
@@ -350,13 +356,14 @@
 
 	if (!us_ibdev->ufdev->link_up) {
 		props->state = IB_PORT_DOWN;
-		props->phys_state = 3;
+		props->phys_state = IB_PORT_PHYS_STATE_DISABLED;
 	} else if (!us_ibdev->ufdev->inaddr) {
 		props->state = IB_PORT_INIT;
-		props->phys_state = 4;
+		props->phys_state =
+			IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING;
 	} else {
 		props->state = IB_PORT_ACTIVE;
-		props->phys_state = 5;
+		props->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
 	}
 
 	props->port_cap_flags = 0;
@@ -431,57 +438,32 @@
 	return 0;
 }
 
-struct net_device *usnic_get_netdev(struct ib_device *device, u8 port_num)
-{
-	struct usnic_ib_dev *us_ibdev = to_usdev(device);
-
-	if (us_ibdev->netdev)
-		dev_hold(us_ibdev->netdev);
-
-	return us_ibdev->netdev;
-}
-
 int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
 				u16 *pkey)
 {
-	if (index > 1)
+	if (index > 0)
 		return -EINVAL;
 
 	*pkey = 0xffff;
 	return 0;
 }
 
-struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev,
-					struct ib_ucontext *context,
-					struct ib_udata *udata)
+int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
-	struct usnic_ib_pd *pd;
+	struct usnic_ib_pd *pd = to_upd(ibpd);
 	void *umem_pd;
 
-	usnic_dbg("\n");
-
-	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd)
-		return ERR_PTR(-ENOMEM);
-
 	umem_pd = pd->umem_pd = usnic_uiom_alloc_pd();
 	if (IS_ERR_OR_NULL(umem_pd)) {
-		kfree(pd);
-		return ERR_PTR(umem_pd ? PTR_ERR(umem_pd) : -ENOMEM);
+		return umem_pd ? PTR_ERR(umem_pd) : -ENOMEM;
 	}
 
-	usnic_info("domain 0x%p allocated for context 0x%p and device %s\n",
-			pd, context, ibdev->name);
-	return &pd->ibpd;
+	return 0;
 }
 
-int usnic_ib_dealloc_pd(struct ib_pd *pd)
+void usnic_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
-	usnic_info("freeing domain 0x%p\n", pd);
-
 	usnic_uiom_dealloc_pd((to_upd(pd))->umem_pd);
-	kfree(pd);
-	return 0;
 }
 
 struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
@@ -491,7 +473,8 @@
 	int err;
 	struct usnic_ib_dev *us_ibdev;
 	struct usnic_ib_qp_grp *qp_grp;
-	struct usnic_ib_ucontext *ucontext;
+	struct usnic_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct usnic_ib_ucontext, ibucontext);
 	int cq_cnt;
 	struct usnic_vnic_res_spec res_spec;
 	struct usnic_ib_create_qp_cmd cmd;
@@ -499,7 +482,6 @@
 
 	usnic_dbg("\n");
 
-	ucontext = to_uucontext(pd->uobject->context);
 	us_ibdev = to_usdev(pd->device);
 
 	if (init_attr->create_flags)
@@ -508,20 +490,20 @@
 	err = ib_copy_from_udata(&cmd, udata, sizeof(cmd));
 	if (err) {
 		usnic_err("%s: cannot copy udata for create_qp\n",
-				us_ibdev->ib_dev.name);
+			  dev_name(&us_ibdev->ib_dev.dev));
 		return ERR_PTR(-EINVAL);
 	}
 
 	err = create_qp_validate_user_data(cmd);
 	if (err) {
 		usnic_err("%s: Failed to validate user data\n",
-				us_ibdev->ib_dev.name);
+			  dev_name(&us_ibdev->ib_dev.dev));
 		return ERR_PTR(-EINVAL);
 	}
 
 	if (init_attr->qp_type != IB_QPT_UD) {
 		usnic_err("%s asked to make a non-UD QP: %d\n",
-				us_ibdev->ib_dev.name, init_attr->qp_type);
+			  dev_name(&us_ibdev->ib_dev.dev), init_attr->qp_type);
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -557,7 +539,7 @@
 	return ERR_PTR(err);
 }
 
-int usnic_ib_destroy_qp(struct ib_qp *qp)
+int usnic_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
 {
 	struct usnic_ib_qp_grp *qp_grp;
 	struct usnic_ib_vf *vf;
@@ -606,29 +588,18 @@
 	return status;
 }
 
-struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev,
-				 const struct ib_cq_init_attr *attr,
-				 struct ib_ucontext *context,
-				 struct ib_udata *udata)
+int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		       struct ib_udata *udata)
 {
-	struct ib_cq *cq;
-
-	usnic_dbg("\n");
 	if (attr->flags)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
-	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-	if (!cq)
-		return ERR_PTR(-EBUSY);
-
-	return cq;
+	return 0;
 }
 
-int usnic_ib_destroy_cq(struct ib_cq *cq)
+void usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
-	usnic_dbg("\n");
-	kfree(cq);
-	return 0;
+	return;
 }
 
 struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
@@ -660,48 +631,42 @@
 	return ERR_PTR(err);
 }
 
-int usnic_ib_dereg_mr(struct ib_mr *ibmr)
+int usnic_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
 	struct usnic_ib_mr *mr = to_umr(ibmr);
 
 	usnic_dbg("va 0x%lx length 0x%zx\n", mr->umem->va, mr->umem->length);
 
-	usnic_uiom_reg_release(mr->umem, ibmr->uobject->context);
+	usnic_uiom_reg_release(mr->umem);
 	kfree(mr);
 	return 0;
 }
 
-struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev,
-							struct ib_udata *udata)
+int usnic_ib_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 {
-	struct usnic_ib_ucontext *context;
+	struct ib_device *ibdev = uctx->device;
+	struct usnic_ib_ucontext *context = to_ucontext(uctx);
 	struct usnic_ib_dev *us_ibdev = to_usdev(ibdev);
 	usnic_dbg("\n");
 
-	context = kmalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return ERR_PTR(-ENOMEM);
-
 	INIT_LIST_HEAD(&context->qp_grp_list);
 	mutex_lock(&us_ibdev->usdev_lock);
 	list_add_tail(&context->link, &us_ibdev->ctx_list);
 	mutex_unlock(&us_ibdev->usdev_lock);
 
-	return &context->ibucontext;
+	return 0;
 }
 
-int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+void usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 {
 	struct usnic_ib_ucontext *context = to_uucontext(ibcontext);
 	struct usnic_ib_dev *us_ibdev = to_usdev(ibcontext->device);
 	usnic_dbg("\n");
 
 	mutex_lock(&us_ibdev->usdev_lock);
-	BUG_ON(!list_empty(&context->qp_grp_list));
+	WARN_ON_ONCE(!list_empty(&context->qp_grp_list));
 	list_del(&context->link);
 	mutex_unlock(&us_ibdev->usdev_lock);
-	kfree(context);
-	return 0;
 }
 
 int usnic_ib_mmap(struct ib_ucontext *context,
@@ -755,56 +720,3 @@
 	return -EINVAL;
 }
 
-/* In ib callbacks section -  Start of stub funcs */
-struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd,
-				 struct rdma_ah_attr *ah_attr,
-				 struct ib_udata *udata)
-
-{
-	usnic_dbg("\n");
-	return ERR_PTR(-EPERM);
-}
-
-int usnic_ib_destroy_ah(struct ib_ah *ah)
-{
-	usnic_dbg("\n");
-	return -EINVAL;
-}
-
-int usnic_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
-		       const struct ib_send_wr **bad_wr)
-{
-	usnic_dbg("\n");
-	return -EINVAL;
-}
-
-int usnic_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
-		       const struct ib_recv_wr **bad_wr)
-{
-	usnic_dbg("\n");
-	return -EINVAL;
-}
-
-int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries,
-				struct ib_wc *wc)
-{
-	usnic_dbg("\n");
-	return -EINVAL;
-}
-
-int usnic_ib_req_notify_cq(struct ib_cq *cq,
-					enum ib_cq_notify_flags flags)
-{
-	usnic_dbg("\n");
-	return -EINVAL;
-}
-
-struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc)
-{
-	usnic_dbg("\n");
-	return ERR_PTR(-ENOMEM);
-}
-
-
-/* In ib callbacks section - End of stub funcs */
-/* End of ib callbacks section */
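
The usnic conversion above follows the pattern used throughout this series: verbs callbacks no longer receive an explicit ib_ucontext, so a driver that needs its private context recovers it from the udata. A minimal sketch of that pattern, with hypothetical foo_* names (only rdma_udata_to_drv_context() and the embedding requirement come from the real API):

#include <rdma/ib_verbs.h>
#include <rdma/uverbs_ioctl.h>

/* Illustrative driver context wrapping the core object. */
struct foo_ucontext {
	struct ib_ucontext ibucontext;	/* must be embedded */
	u32 uar_index;
};

static int foo_some_verb(struct ib_pd *pd, struct ib_udata *udata)
{
	/* Resolves to NULL for in-kernel callers (no udata). */
	struct foo_ucontext *uctx = rdma_udata_to_drv_context(
		udata, struct foo_ucontext, ibucontext);

	if (!uctx)
		return -EOPNOTSUPP;	/* user-space only in this sketch */
	return 0;
}
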
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
index 2a2c9be..2aedf78 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
@@ -48,45 +48,25 @@
 				struct ib_qp_init_attr *qp_init_attr);
 int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
 				union ib_gid *gid);
-struct net_device *usnic_get_netdev(struct ib_device *device, u8 port_num);
 int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
 				u16 *pkey);
-struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev,
-				struct ib_ucontext *context,
-				struct ib_udata *udata);
-int usnic_ib_dealloc_pd(struct ib_pd *pd);
+int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
+void usnic_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
 					struct ib_qp_init_attr *init_attr,
 					struct ib_udata *udata);
-int usnic_ib_destroy_qp(struct ib_qp *qp);
+int usnic_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata);
 int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 				int attr_mask, struct ib_udata *udata);
-struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev,
-				 const struct ib_cq_init_attr *attr,
-				 struct ib_ucontext *context,
-				 struct ib_udata *udata);
-int usnic_ib_destroy_cq(struct ib_cq *cq);
+int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		       struct ib_udata *udata);
+void usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
 				u64 virt_addr, int access_flags,
 				struct ib_udata *udata);
-int usnic_ib_dereg_mr(struct ib_mr *ibmr);
-struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev,
-						struct ib_udata *udata);
-int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext);
+int usnic_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
+int usnic_ib_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
+void usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext);
 int usnic_ib_mmap(struct ib_ucontext *context,
 			struct vm_area_struct *vma);
-struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd,
-				 struct rdma_ah_attr *ah_attr,
-				 struct ib_udata *udata);
-
-int usnic_ib_destroy_ah(struct ib_ah *ah);
-int usnic_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
-			const struct ib_send_wr **bad_wr);
-int usnic_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
-		       const struct ib_recv_wr **bad_wr);
-int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries,
-			struct ib_wc *wc);
-int usnic_ib_req_notify_cq(struct ib_cq *cq,
-				enum ib_cq_notify_flags flags);
-struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc);
 #endif /* !USNIC_IB_VERBS_H */
diff --git a/drivers/infiniband/hw/usnic/usnic_transport.c b/drivers/infiniband/hw/usnic/usnic_transport.c
index e0a9553..82dd810 100644
--- a/drivers/infiniband/hw/usnic/usnic_transport.c
+++ b/drivers/infiniband/hw/usnic/usnic_transport.c
@@ -121,7 +121,7 @@
 	if (type == USNIC_TRANSPORT_ROCE_CUSTOM) {
 		spin_lock(&roce_bitmap_lock);
 		if (!port_num) {
-			usnic_err("Unreserved unvalid port num 0 for %s\n",
+			usnic_err("Unreserved invalid port num 0 for %s\n",
 					usnic_transport_to_str(type));
 			goto out_roce_custom;
 		}
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index 9dd39da..62e6ffa 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -47,25 +47,11 @@
 #include "usnic_uiom.h"
 #include "usnic_uiom_interval_tree.h"
 
-static struct workqueue_struct *usnic_uiom_wq;
-
 #define USNIC_UIOM_PAGE_CHUNK						\
 	((PAGE_SIZE - offsetof(struct usnic_uiom_chunk, page_list))	/\
 	((void *) &((struct usnic_uiom_chunk *) 0)->page_list[1] -	\
 	(void *) &((struct usnic_uiom_chunk *) 0)->page_list[0]))
 
-static void usnic_uiom_reg_account(struct work_struct *work)
-{
-	struct usnic_uiom_reg *umem = container_of(work,
-						struct usnic_uiom_reg, work);
-
-	down_write(&umem->mm->mmap_sem);
-	umem->mm->locked_vm -= umem->diff;
-	up_write(&umem->mm->mmap_sem);
-	mmput(umem->mm);
-	kfree(umem);
-}
-
 static int usnic_uiom_dma_fault(struct iommu_domain *domain,
 				struct device *dev,
 				unsigned long iova, int flags,
@@ -89,9 +75,7 @@
 		for_each_sg(chunk->page_list, sg, chunk->nents, i) {
 			page = sg_page(sg);
 			pa = sg_phys(sg);
-			if (!PageDirty(page) && dirty)
-				set_page_dirty_lock(page);
-			put_page(page);
+			put_user_pages_dirty_lock(&page, 1, dirty);
 			usnic_dbg("pa: %pa\n", &pa);
 		}
 		kfree(chunk);
@@ -99,8 +83,9 @@
 }
 
 static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
-				int dmasync, struct list_head *chunk_list)
+				int dmasync, struct usnic_uiom_reg *uiomr)
 {
+	struct list_head *chunk_list = &uiomr->chunk_list;
 	struct page **page_list;
 	struct scatterlist *sg;
 	struct usnic_uiom_chunk *chunk;
@@ -114,6 +99,7 @@
 	int flags;
 	dma_addr_t pa;
 	unsigned int gup_flags;
+	struct mm_struct *mm;
 
 	/*
 	 * If the combination of the addr and size requested for this memory
@@ -136,9 +122,10 @@
 
 	npages = PAGE_ALIGN(size + (addr & ~PAGE_MASK)) >> PAGE_SHIFT;
 
-	down_write(&current->mm->mmap_sem);
+	uiomr->owning_mm = mm = current->mm;
+	down_read(&mm->mmap_sem);
 
-	locked = npages + current->mm->pinned_vm;
+	locked = atomic64_add_return(npages, &current->mm->pinned_vm);
 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 
 	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
@@ -154,10 +141,11 @@
 	ret = 0;
 
 	while (npages) {
-		ret = get_user_pages_longterm(cur_base,
-					min_t(unsigned long, npages,
-					PAGE_SIZE / sizeof(struct page *)),
-					gup_flags, page_list, NULL);
+		ret = get_user_pages(cur_base,
+				     min_t(unsigned long, npages,
+				     PAGE_SIZE / sizeof(struct page *)),
+				     gup_flags | FOLL_LONGTERM,
+				     page_list, NULL);
 
 		if (ret < 0)
 			goto out;
@@ -166,9 +154,8 @@
 		off = 0;
 
 		while (ret) {
-			chunk = kmalloc(sizeof(*chunk) +
-					sizeof(struct scatterlist) *
-					min_t(int, ret, USNIC_UIOM_PAGE_CHUNK),
+			chunk = kmalloc(struct_size(chunk, page_list,
+					min_t(int, ret, USNIC_UIOM_PAGE_CHUNK)),
 					GFP_KERNEL);
 			if (!chunk) {
 				ret = -ENOMEM;
@@ -194,12 +181,13 @@
 	}
 
 out:
-	if (ret < 0)
+	if (ret < 0) {
 		usnic_uiom_put_pages(chunk_list, 0);
-	else
-		current->mm->pinned_vm = locked;
+		atomic64_sub(npages, &current->mm->pinned_vm);
+	} else
+		mmgrab(uiomr->owning_mm);
 
-	up_write(&current->mm->mmap_sem);
+	up_read(&mm->mmap_sem);
 	free_page((unsigned long) page_list);
 	return ret;
 }
@@ -379,7 +367,7 @@
 	uiomr->pd = pd;
 
 	err = usnic_uiom_get_pages(addr, size, writable, dmasync,
-					&uiomr->chunk_list);
+				   uiomr);
 	if (err) {
 		usnic_err("Failed get_pages vpn [0x%lx,0x%lx] err %d\n",
 				vpn_start, vpn_last, err);
@@ -426,55 +414,29 @@
 out_put_pages:
 	usnic_uiom_put_pages(&uiomr->chunk_list, 0);
 	spin_unlock(&pd->lock);
+	mmdrop(uiomr->owning_mm);
 out_free_uiomr:
 	kfree(uiomr);
 	return ERR_PTR(err);
 }
 
-void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr,
-			    struct ib_ucontext *ucontext)
+static void __usnic_uiom_release_tail(struct usnic_uiom_reg *uiomr)
 {
-	struct task_struct *task;
-	struct mm_struct *mm;
-	unsigned long diff;
+	mmdrop(uiomr->owning_mm);
+	kfree(uiomr);
+}
 
+static inline size_t usnic_uiom_num_pages(struct usnic_uiom_reg *uiomr)
+{
+	return PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT;
+}
+
+void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr)
+{
 	__usnic_uiom_reg_release(uiomr->pd, uiomr, 1);
 
-	task = get_pid_task(ucontext->tgid, PIDTYPE_PID);
-	if (!task)
-		goto out;
-	mm = get_task_mm(task);
-	put_task_struct(task);
-	if (!mm)
-		goto out;
-
-	diff = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT;
-
-	/*
-	 * We may be called with the mm's mmap_sem already held.  This
-	 * can happen when a userspace munmap() is the call that drops
-	 * the last reference to our file and calls our release
-	 * method.  If there are memory regions to destroy, we'll end
-	 * up here and not be able to take the mmap_sem.  In that case
-	 * we defer the vm_locked accounting to the system workqueue.
-	 */
-	if (ucontext->closing) {
-		if (!down_write_trylock(&mm->mmap_sem)) {
-			INIT_WORK(&uiomr->work, usnic_uiom_reg_account);
-			uiomr->mm = mm;
-			uiomr->diff = diff;
-
-			queue_work(usnic_uiom_wq, &uiomr->work);
-			return;
-		}
-	} else
-		down_write(&mm->mmap_sem);
-
-	mm->pinned_vm -= diff;
-	up_write(&mm->mmap_sem);
-	mmput(mm);
-out:
-	kfree(uiomr);
+	atomic64_sub(usnic_uiom_num_pages(uiomr), &uiomr->owning_mm->pinned_vm);
+	__usnic_uiom_release_tail(uiomr);
 }
 
 struct usnic_uiom_pd *usnic_uiom_alloc_pd(void)
@@ -602,17 +564,5 @@
 		return -EPERM;
 	}
 
-	usnic_uiom_wq = create_workqueue(drv_name);
-	if (!usnic_uiom_wq) {
-		usnic_err("Unable to alloc wq for drv %s\n", drv_name);
-		return -ENOMEM;
-	}
-
 	return 0;
 }
-
-void usnic_uiom_fini(void)
-{
-	flush_workqueue(usnic_uiom_wq);
-	destroy_workqueue(usnic_uiom_wq);
-}
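
The workqueue-based deferred accounting goes away because the pin count now lives in mm->pinned_vm as an atomic64_t and the unpin path uses put_user_pages_dirty_lock(), so no mmap_sem is needed at release time. A condensed sketch of the scheme the driver now follows (illustrative helper names; error handling trimmed):

#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>

static long pin_and_account(unsigned long start, long npages,
			    struct page **pages)
{
	struct mm_struct *mm = current->mm;
	unsigned long lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	long pinned;

	/* Account first; pinned_vm is an atomic64_t since v5.0. */
	if (atomic64_add_return(npages, &mm->pinned_vm) > lock_limit &&
	    !capable(CAP_IPC_LOCK)) {
		atomic64_sub(npages, &mm->pinned_vm);
		return -ENOMEM;
	}

	down_read(&mm->mmap_sem);
	pinned = get_user_pages(start, npages,
				FOLL_WRITE | FOLL_LONGTERM, pages, NULL);
	up_read(&mm->mmap_sem);

	if (pinned < 0)
		atomic64_sub(npages, &mm->pinned_vm);
	else
		mmgrab(mm);		/* keep mm alive until release */
	return pinned;
}

static void unpin_and_unaccount(struct mm_struct *mm, struct page **pages,
				long npages, bool dirty)
{
	put_user_pages_dirty_lock(pages, npages, dirty);
	atomic64_sub(npages, &mm->pinned_vm);
	mmdrop(mm);
}
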
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h
index 8c096ac..70be49b 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.h
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.h
@@ -71,8 +71,7 @@
 	int				writable;
 	struct list_head		chunk_list;
 	struct work_struct		work;
-	struct mm_struct		*mm;
-	unsigned long			diff;
+	struct mm_struct		*owning_mm;
 };
 
 struct usnic_uiom_chunk {
@@ -91,8 +90,6 @@
 struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd,
 						unsigned long addr, size_t size,
 						int access, int dmasync);
-void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr,
-			    struct ib_ucontext *ucontext);
+void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr);
 int usnic_uiom_init(char *drv_name);
-void usnic_uiom_fini(void);
 #endif /* USNIC_UIOM_H_ */
diff --git a/drivers/infiniband/hw/vmw_pvrdma/Kconfig b/drivers/infiniband/hw/vmw_pvrdma/Kconfig
index 5a9790a..b99c9f0 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/Kconfig
+++ b/drivers/infiniband/hw/vmw_pvrdma/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_VMWARE_PVRDMA
 	tristate "VMware Paravirtualized RDMA Driver"
 	depends on NETDEVICES && ETHERNET && PCI && INET && VMXNET3
diff --git a/drivers/infiniband/hw/vmw_pvrdma/Makefile b/drivers/infiniband/hw/vmw_pvrdma/Makefile
index 2f52e0a..0f5fa4e 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/Makefile
+++ b/drivers/infiniband/hw/vmw_pvrdma/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_INFINIBAND_VMWARE_PVRDMA) += vmw_pvrdma.o
 
 vmw_pvrdma-y := pvrdma_cmd.o pvrdma_cq.o pvrdma_doorbell.o pvrdma_main.o pvrdma_misc.o pvrdma_mr.o pvrdma_qp.o pvrdma_srq.o pvrdma_verbs.o
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
index 42b8685..c142f5e 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
@@ -427,7 +427,40 @@
 
 static inline enum pvrdma_wr_opcode ib_wr_opcode_to_pvrdma(enum ib_wr_opcode op)
 {
-	return (enum pvrdma_wr_opcode)op;
+	switch (op) {
+	case IB_WR_RDMA_WRITE:
+		return PVRDMA_WR_RDMA_WRITE;
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		return PVRDMA_WR_RDMA_WRITE_WITH_IMM;
+	case IB_WR_SEND:
+		return PVRDMA_WR_SEND;
+	case IB_WR_SEND_WITH_IMM:
+		return PVRDMA_WR_SEND_WITH_IMM;
+	case IB_WR_RDMA_READ:
+		return PVRDMA_WR_RDMA_READ;
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+		return PVRDMA_WR_ATOMIC_CMP_AND_SWP;
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+		return PVRDMA_WR_ATOMIC_FETCH_AND_ADD;
+	case IB_WR_LSO:
+		return PVRDMA_WR_LSO;
+	case IB_WR_SEND_WITH_INV:
+		return PVRDMA_WR_SEND_WITH_INV;
+	case IB_WR_RDMA_READ_WITH_INV:
+		return PVRDMA_WR_RDMA_READ_WITH_INV;
+	case IB_WR_LOCAL_INV:
+		return PVRDMA_WR_LOCAL_INV;
+	case IB_WR_REG_MR:
+		return PVRDMA_WR_FAST_REG_MR;
+	case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+		return PVRDMA_WR_MASKED_ATOMIC_CMP_AND_SWP;
+	case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
+		return PVRDMA_WR_MASKED_ATOMIC_FETCH_AND_ADD;
+	case IB_WR_REG_MR_INTEGRITY:
+		return PVRDMA_WR_REG_SIG_MR;
+	default:
+		return PVRDMA_WR_ERROR;
+	}
 }
 
 static inline enum ib_wc_status pvrdma_wc_status_to_ib(
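
The straight cast was only valid while enum ib_wr_opcode and enum pvrdma_wr_opcode stayed numerically identical; the explicit switch survives core-side renumbering and maps anything unknown to PVRDMA_WR_ERROR. The post-send path later in this patch rejects that sentinel before building a WQE, roughly like this (wqe_hdr/bad_wr as in pvrdma_post_send):

	wqe_hdr->opcode = ib_wr_opcode_to_pvrdma(wr->opcode);
	if (unlikely(wqe_hdr->opcode == PVRDMA_WR_ERROR)) {
		*bad_wr = wr;		/* report the offending WR */
		ret = -EINVAL;		/* unsupported opcode */
		goto out;
	}
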
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
index 0f004c7..7800e69 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
@@ -49,6 +49,7 @@
 #include <rdma/ib_addr.h>
 #include <rdma/ib_smi.h>
 #include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "pvrdma.h"
 
@@ -91,22 +92,19 @@
 
 /**
  * pvrdma_create_cq - create completion queue
- * @ibdev: the device
+ * @ibcq: Allocated CQ
  * @attr: completion queue attributes
- * @context: user context
  * @udata: user data
  *
- * @return: ib_cq completion queue pointer on success,
- *          otherwise returns negative errno.
+ * @return: 0 on success
  */
-struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
-			       const struct ib_cq_init_attr *attr,
-			       struct ib_ucontext *context,
-			       struct ib_udata *udata)
+int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		     struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	int entries = attr->cqe;
 	struct pvrdma_dev *dev = to_vdev(ibdev);
-	struct pvrdma_cq *cq;
+	struct pvrdma_cq *cq = to_vcq(ibcq);
 	int ret;
 	int npages;
 	unsigned long flags;
@@ -114,26 +112,22 @@
 	union pvrdma_cmd_resp rsp;
 	struct pvrdma_cmd_create_cq *cmd = &req.create_cq;
 	struct pvrdma_cmd_create_cq_resp *resp = &rsp.create_cq_resp;
-	struct pvrdma_create_cq_resp cq_resp = {0};
+	struct pvrdma_create_cq_resp cq_resp = {};
 	struct pvrdma_create_cq ucmd;
+	struct pvrdma_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct pvrdma_ucontext, ibucontext);
 
 	BUILD_BUG_ON(sizeof(struct pvrdma_cqe) != 64);
 
 	entries = roundup_pow_of_two(entries);
 	if (entries < 1 || entries > dev->dsr->caps.max_cqe)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (!atomic_add_unless(&dev->num_cqs, 1, dev->dsr->caps.max_cq))
-		return ERR_PTR(-ENOMEM);
-
-	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-	if (!cq) {
-		atomic_dec(&dev->num_cqs);
-		return ERR_PTR(-ENOMEM);
-	}
+		return -ENOMEM;
 
 	cq->ibcq.cqe = entries;
-	cq->is_kernel = !context;
+	cq->is_kernel = !udata;
 
 	if (!cq->is_kernel) {
 		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
@@ -141,7 +135,7 @@
 			goto err_cq;
 		}
 
-		cq->umem = ib_umem_get(context, ucmd.buf_addr, ucmd.buf_size,
+		cq->umem = ib_umem_get(udata, ucmd.buf_addr, ucmd.buf_size,
 				       IB_ACCESS_LOCAL_WRITE, 1);
 		if (IS_ERR(cq->umem)) {
 			ret = PTR_ERR(cq->umem);
@@ -185,8 +179,7 @@
 	memset(cmd, 0, sizeof(*cmd));
 	cmd->hdr.cmd = PVRDMA_CMD_CREATE_CQ;
 	cmd->nchunks = npages;
-	cmd->ctx_handle = (context) ?
-		(u64)to_vucontext(context)->ctx_handle : 0;
+	cmd->ctx_handle = context ? context->ctx_handle : 0;
 	cmd->cqe = entries;
 	cmd->pdir_dma = cq->pdir.dir_dma;
 	ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_CQ_RESP);
@@ -204,29 +197,26 @@
 	spin_unlock_irqrestore(&dev->cq_tbl_lock, flags);
 
 	if (!cq->is_kernel) {
-		cq->uar = &(to_vucontext(context)->uar);
+		cq->uar = &context->uar;
 
 		/* Copy udata back. */
 		if (ib_copy_to_udata(udata, &cq_resp, sizeof(cq_resp))) {
 			dev_warn(&dev->pdev->dev,
 				 "failed to copy back udata\n");
-			pvrdma_destroy_cq(&cq->ibcq);
-			return ERR_PTR(-EINVAL);
+			pvrdma_destroy_cq(&cq->ibcq, udata);
+			return -EINVAL;
 		}
 	}
 
-	return &cq->ibcq;
+	return 0;
 
 err_page_dir:
 	pvrdma_page_dir_cleanup(dev, &cq->pdir);
 err_umem:
-	if (!cq->is_kernel)
-		ib_umem_release(cq->umem);
+	ib_umem_release(cq->umem);
 err_cq:
 	atomic_dec(&dev->num_cqs);
-	kfree(cq);
-
-	return ERR_PTR(ret);
+	return ret;
 }
 
 static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq)
@@ -235,20 +225,17 @@
 		complete(&cq->free);
 	wait_for_completion(&cq->free);
 
-	if (!cq->is_kernel)
-		ib_umem_release(cq->umem);
+	ib_umem_release(cq->umem);
 
 	pvrdma_page_dir_cleanup(dev, &cq->pdir);
-	kfree(cq);
 }
 
 /**
  * pvrdma_destroy_cq - destroy completion queue
  * @cq: the completion queue to destroy.
- *
- * @return: 0 for success.
+ * @udata: user data or null for kernel object
  */
-int pvrdma_destroy_cq(struct ib_cq *cq)
+void pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
 	struct pvrdma_cq *vcq = to_vcq(cq);
 	union pvrdma_cmd_req req;
@@ -274,8 +261,6 @@
 
 	pvrdma_free_cq(dev, vcq);
 	atomic_dec(&dev->num_cqs);
-
-	return ret;
 }
 
 static inline struct pvrdma_cqe *get_cqe(struct pvrdma_cq *cq, int i)
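
The kzalloc()/kfree() pairs disappear because the core now allocates the driver CQ using the size declared with INIT_RDMA_OBJ_SIZE() and frees it after ->destroy_cq() returns; the driver only initializes the embedded object and reports success or failure. A minimal sketch of that contract with hypothetical foo_* names:

#include <rdma/ib_verbs.h>

struct foo_cq {
	struct ib_cq ibcq;		/* core object must be embedded */
	u32 cq_handle;
};

static int foo_create_cq(struct ib_cq *ibcq,
			 const struct ib_cq_init_attr *attr,
			 struct ib_udata *udata)
{
	struct foo_cq *cq = container_of(ibcq, struct foo_cq, ibcq);

	if (attr->flags)
		return -EINVAL;		/* core frees the object on error */

	cq->cq_handle = 0;
	return 0;
}

static const struct ib_device_ops foo_cq_ops = {
	.create_cq = foo_create_cq,
	INIT_RDMA_OBJ_SIZE(ib_cq, foo_cq, ibcq),
};
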
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h
index 6fd5a8f..8f9749d 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h
@@ -57,7 +57,8 @@
 
 #define PVRDMA_ROCEV1_VERSION		17
 #define PVRDMA_ROCEV2_VERSION		18
-#define PVRDMA_VERSION			PVRDMA_ROCEV2_VERSION
+#define PVRDMA_PPN64_VERSION		19
+#define PVRDMA_VERSION			PVRDMA_PPN64_VERSION
 
 #define PVRDMA_BOARD_ID			1
 #define PVRDMA_REV_ID			1
@@ -279,8 +280,10 @@
 						/* W: Async ring page info. */
 	struct pvrdma_ring_page_info cq_ring_pages;
 						/* W: CQ ring page info. */
-	u32 uar_pfn;				/* W: UAR pageframe. */
-	u32 pad2;				/* Pad to 8-byte align. */
+	union {
+		u32 uar_pfn;			/* W: UAR pageframe. */
+		u64 uar_pfn64;			/* W: 64-bit UAR page frame. */
+	};
 	struct pvrdma_device_caps caps;		/* R: Device capabilities. */
 };
 
@@ -411,8 +414,10 @@
 
 struct pvrdma_cmd_create_uc {
 	struct pvrdma_cmd_hdr hdr;
-	u32 pfn; /* UAR page frame number */
-	u8 reserved[4];
+	union {
+		u32 pfn; /* UAR page frame number */
+		u64 pfn64; /* 64-bit UAR page frame number */
+	};
 };
 
 struct pvrdma_cmd_create_uc_resp {
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
index a571989..e580ae9 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
@@ -65,32 +65,36 @@
 static int pvrdma_add_gid(const struct ib_gid_attr *attr, void **context);
 static int pvrdma_del_gid(const struct ib_gid_attr *attr, void **context);
 
-static ssize_t show_hca(struct device *device, struct device_attribute *attr,
-			char *buf)
+static ssize_t hca_type_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
 {
 	return sprintf(buf, "VMW_PVRDMA-%s\n", DRV_VERSION);
 }
+static DEVICE_ATTR_RO(hca_type);
 
-static ssize_t show_rev(struct device *device, struct device_attribute *attr,
-			char *buf)
+static ssize_t hw_rev_show(struct device *device,
+			   struct device_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%d\n", PVRDMA_REV_ID);
 }
+static DEVICE_ATTR_RO(hw_rev);
 
-static ssize_t show_board(struct device *device, struct device_attribute *attr,
-			  char *buf)
+static ssize_t board_id_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%d\n", PVRDMA_BOARD_ID);
 }
+static DEVICE_ATTR_RO(board_id);
 
-static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,	   NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,	   NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
+static struct attribute *pvrdma_class_attributes[] = {
+	&dev_attr_hw_rev.attr,
+	&dev_attr_hca_type.attr,
+	&dev_attr_board_id.attr,
+	NULL,
+};
 
-static struct device_attribute *pvrdma_class_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_hca_type,
-	&dev_attr_board_id
+static const struct attribute_group pvrdma_attr_group = {
+	.attrs = pvrdma_class_attributes,
 };
 
 static void pvrdma_get_fw_ver_str(struct ib_device *device, char *str)
@@ -139,37 +143,68 @@
 	return 0;
 }
 
-static struct net_device *pvrdma_get_netdev(struct ib_device *ibdev,
-					    u8 port_num)
-{
-	struct net_device *netdev;
-	struct pvrdma_dev *dev = to_vdev(ibdev);
+static const struct ib_device_ops pvrdma_dev_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_VMW_PVRDMA,
+	.uverbs_abi_ver = PVRDMA_UVERBS_ABI_VERSION,
 
-	if (port_num != 1)
-		return NULL;
+	.add_gid = pvrdma_add_gid,
+	.alloc_mr = pvrdma_alloc_mr,
+	.alloc_pd = pvrdma_alloc_pd,
+	.alloc_ucontext = pvrdma_alloc_ucontext,
+	.create_ah = pvrdma_create_ah,
+	.create_cq = pvrdma_create_cq,
+	.create_qp = pvrdma_create_qp,
+	.dealloc_pd = pvrdma_dealloc_pd,
+	.dealloc_ucontext = pvrdma_dealloc_ucontext,
+	.del_gid = pvrdma_del_gid,
+	.dereg_mr = pvrdma_dereg_mr,
+	.destroy_ah = pvrdma_destroy_ah,
+	.destroy_cq = pvrdma_destroy_cq,
+	.destroy_qp = pvrdma_destroy_qp,
+	.get_dev_fw_str = pvrdma_get_fw_ver_str,
+	.get_dma_mr = pvrdma_get_dma_mr,
+	.get_link_layer = pvrdma_port_link_layer,
+	.get_port_immutable = pvrdma_port_immutable,
+	.map_mr_sg = pvrdma_map_mr_sg,
+	.mmap = pvrdma_mmap,
+	.modify_port = pvrdma_modify_port,
+	.modify_qp = pvrdma_modify_qp,
+	.poll_cq = pvrdma_poll_cq,
+	.post_recv = pvrdma_post_recv,
+	.post_send = pvrdma_post_send,
+	.query_device = pvrdma_query_device,
+	.query_gid = pvrdma_query_gid,
+	.query_pkey = pvrdma_query_pkey,
+	.query_port = pvrdma_query_port,
+	.query_qp = pvrdma_query_qp,
+	.reg_user_mr = pvrdma_reg_user_mr,
+	.req_notify_cq = pvrdma_req_notify_cq,
 
-	rcu_read_lock();
-	netdev = dev->netdev;
-	if (netdev)
-		dev_hold(netdev);
-	rcu_read_unlock();
+	INIT_RDMA_OBJ_SIZE(ib_ah, pvrdma_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, pvrdma_cq, ibcq),
+	INIT_RDMA_OBJ_SIZE(ib_pd, pvrdma_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, pvrdma_ucontext, ibucontext),
+};
 
-	return netdev;
-}
+static const struct ib_device_ops pvrdma_dev_srq_ops = {
+	.create_srq = pvrdma_create_srq,
+	.destroy_srq = pvrdma_destroy_srq,
+	.modify_srq = pvrdma_modify_srq,
+	.query_srq = pvrdma_query_srq,
+
+	INIT_RDMA_OBJ_SIZE(ib_srq, pvrdma_srq, ibsrq),
+};
 
 static int pvrdma_register_device(struct pvrdma_dev *dev)
 {
 	int ret = -1;
-	int i = 0;
 
-	strlcpy(dev->ib_dev.name, "vmw_pvrdma%d", IB_DEVICE_NAME_MAX);
 	dev->ib_dev.node_guid = dev->dsr->caps.node_guid;
 	dev->sys_image_guid = dev->dsr->caps.sys_image_guid;
 	dev->flags = 0;
-	dev->ib_dev.owner = THIS_MODULE;
 	dev->ib_dev.num_comp_vectors = 1;
 	dev->ib_dev.dev.parent = &dev->pdev->dev;
-	dev->ib_dev.uverbs_abi_ver = PVRDMA_UVERBS_ABI_VERSION;
 	dev->ib_dev.uverbs_cmd_mask =
 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
 		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
@@ -195,39 +230,7 @@
 	dev->ib_dev.node_type = RDMA_NODE_IB_CA;
 	dev->ib_dev.phys_port_cnt = dev->dsr->caps.phys_port_cnt;
 
-	dev->ib_dev.query_device = pvrdma_query_device;
-	dev->ib_dev.query_port = pvrdma_query_port;
-	dev->ib_dev.query_gid = pvrdma_query_gid;
-	dev->ib_dev.query_pkey = pvrdma_query_pkey;
-	dev->ib_dev.modify_port	= pvrdma_modify_port;
-	dev->ib_dev.alloc_ucontext = pvrdma_alloc_ucontext;
-	dev->ib_dev.dealloc_ucontext = pvrdma_dealloc_ucontext;
-	dev->ib_dev.mmap = pvrdma_mmap;
-	dev->ib_dev.alloc_pd = pvrdma_alloc_pd;
-	dev->ib_dev.dealloc_pd = pvrdma_dealloc_pd;
-	dev->ib_dev.create_ah = pvrdma_create_ah;
-	dev->ib_dev.destroy_ah = pvrdma_destroy_ah;
-	dev->ib_dev.create_qp = pvrdma_create_qp;
-	dev->ib_dev.modify_qp = pvrdma_modify_qp;
-	dev->ib_dev.query_qp = pvrdma_query_qp;
-	dev->ib_dev.destroy_qp = pvrdma_destroy_qp;
-	dev->ib_dev.post_send = pvrdma_post_send;
-	dev->ib_dev.post_recv = pvrdma_post_recv;
-	dev->ib_dev.create_cq = pvrdma_create_cq;
-	dev->ib_dev.destroy_cq = pvrdma_destroy_cq;
-	dev->ib_dev.poll_cq = pvrdma_poll_cq;
-	dev->ib_dev.req_notify_cq = pvrdma_req_notify_cq;
-	dev->ib_dev.get_dma_mr = pvrdma_get_dma_mr;
-	dev->ib_dev.reg_user_mr	= pvrdma_reg_user_mr;
-	dev->ib_dev.dereg_mr = pvrdma_dereg_mr;
-	dev->ib_dev.alloc_mr = pvrdma_alloc_mr;
-	dev->ib_dev.map_mr_sg = pvrdma_map_mr_sg;
-	dev->ib_dev.add_gid = pvrdma_add_gid;
-	dev->ib_dev.del_gid = pvrdma_del_gid;
-	dev->ib_dev.get_netdev = pvrdma_get_netdev;
-	dev->ib_dev.get_port_immutable = pvrdma_port_immutable;
-	dev->ib_dev.get_link_layer = pvrdma_port_link_layer;
-	dev->ib_dev.get_dev_fw_str = pvrdma_get_fw_ver_str;
+	ib_set_device_ops(&dev->ib_dev, &pvrdma_dev_ops);
 
 	mutex_init(&dev->port_mutex);
 	spin_lock_init(&dev->desc_lock);
@@ -253,10 +256,7 @@
 			(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)	|
 			(1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
 
-		dev->ib_dev.create_srq = pvrdma_create_srq;
-		dev->ib_dev.modify_srq = pvrdma_modify_srq;
-		dev->ib_dev.query_srq = pvrdma_query_srq;
-		dev->ib_dev.destroy_srq = pvrdma_destroy_srq;
+		ib_set_device_ops(&dev->ib_dev, &pvrdma_dev_srq_ops);
 
 		dev->srq_tbl = kcalloc(dev->dsr->caps.max_srq,
 				       sizeof(struct pvrdma_srq *),
@@ -264,26 +264,20 @@
 		if (!dev->srq_tbl)
 			goto err_qp_free;
 	}
-	dev->ib_dev.driver_id = RDMA_DRIVER_VMW_PVRDMA;
+	ret = ib_device_set_netdev(&dev->ib_dev, dev->netdev, 1);
+	if (ret)
+		return ret;
 	spin_lock_init(&dev->srq_tbl_lock);
+	rdma_set_device_sysfs_group(&dev->ib_dev, &pvrdma_attr_group);
 
-	ret = ib_register_device(&dev->ib_dev, NULL);
+	ret = ib_register_device(&dev->ib_dev, "vmw_pvrdma%d");
 	if (ret)
 		goto err_srq_free;
 
-	for (i = 0; i < ARRAY_SIZE(pvrdma_class_attributes); ++i) {
-		ret = device_create_file(&dev->ib_dev.dev,
-					 pvrdma_class_attributes[i]);
-		if (ret)
-			goto err_class;
-	}
-
 	dev->ib_active = true;
 
 	return 0;
 
-err_class:
-	ib_unregister_device(&dev->ib_dev);
 err_srq_free:
 	kfree(dev->srq_tbl);
 err_qp_free:
@@ -716,6 +710,7 @@
 			pvrdma_dispatch_event(dev, 1, IB_EVENT_PORT_ACTIVE);
 		break;
 	case NETDEV_UNREGISTER:
+		ib_device_set_netdev(&dev->ib_dev, NULL, 1);
 		dev_put(dev->netdev);
 		dev->netdev = NULL;
 		break;
@@ -727,6 +722,7 @@
 		if ((dev->netdev == NULL) &&
 		    (pci_get_drvdata(pdev_net) == ndev)) {
 			/* this is our netdev */
+			ib_device_set_netdev(&dev->ib_dev, ndev, 1);
 			dev->netdev = ndev;
 			dev_hold(ndev);
 		}
@@ -735,7 +731,7 @@
 
 	default:
 		dev_dbg(&dev->pdev->dev, "ignore netdevice event %ld on %s\n",
-			event, dev->ib_dev.name);
+			event, dev_name(&dev->ib_dev.dev));
 		break;
 	}
 }
@@ -793,7 +789,7 @@
 	dev_dbg(&pdev->dev, "initializing driver %s\n", pci_name(pdev));
 
 	/* Allocate zero-out device */
-	dev = (struct pvrdma_dev *)ib_alloc_device(sizeof(*dev));
+	dev = ib_alloc_device(pvrdma_dev, ib_dev);
 	if (!dev) {
 		dev_err(&pdev->dev, "failed to allocate IB device\n");
 		return -ENOMEM;
@@ -888,8 +884,8 @@
 	dev_info(&pdev->dev, "device version %d, driver version %d\n",
 		 dev->dsr_version, PVRDMA_VERSION);
 
-	dev->dsr = dma_zalloc_coherent(&pdev->dev, sizeof(*dev->dsr),
-				       &dev->dsrbase, GFP_KERNEL);
+	dev->dsr = dma_alloc_coherent(&pdev->dev, sizeof(*dev->dsr),
+				      &dev->dsrbase, GFP_KERNEL);
 	if (!dev->dsr) {
 		dev_err(&pdev->dev, "failed to allocate shared region\n");
 		ret = -ENOMEM;
@@ -903,7 +899,11 @@
 		PVRDMA_GOS_BITS_64;
 	dev->dsr->gos_info.gos_type = PVRDMA_GOS_TYPE_LINUX;
 	dev->dsr->gos_info.gos_ver = 1;
-	dev->dsr->uar_pfn = dev->driver_uar.pfn;
+
+	if (dev->dsr_version < PVRDMA_PPN64_VERSION)
+		dev->dsr->uar_pfn = dev->driver_uar.pfn;
+	else
+		dev->dsr->uar_pfn64 = dev->driver_uar.pfn;
 
 	/* Command slot. */
 	dev->cmd_slot = dma_alloc_coherent(&pdev->dev, PAGE_SIZE,
@@ -1123,6 +1123,8 @@
 	pvrdma_page_dir_cleanup(dev, &dev->cq_pdir);
 	pvrdma_page_dir_cleanup(dev, &dev->async_pdir);
 	pvrdma_free_slots(dev);
+	dma_free_coherent(&pdev->dev, sizeof(*dev->dsr), dev->dsr,
+			  dev->dsrbase);
 
 	iounmap(dev->regs);
 	kfree(dev->sgid_tbl);
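
ib_alloc_device() is now a type-checked macro that takes the driver structure and the name of its embedded ib_device member, and registration takes the printf-style device name directly, so both the cast and the strlcpy() of the name go away. A small hedged sketch of the resulting probe skeleton (foo_* names are illustrative):

#include <linux/module.h>
#include <rdma/ib_verbs.h>

struct foo_dev {
	struct ib_device ibdev;		/* must be the embedded core object */
	/* driver-private state ... */
};

static const struct ib_device_ops foo_dev_ops = {
	.owner = THIS_MODULE,
	.driver_id = RDMA_DRIVER_UNKNOWN,	/* placeholder for the sketch */
	.uverbs_abi_ver = 1,
};

static int foo_probe(void)
{
	struct foo_dev *dev = ib_alloc_device(foo_dev, ibdev);
	int ret;

	if (!dev)
		return -ENOMEM;

	ib_set_device_ops(&dev->ibdev, &foo_dev_ops);
	ret = ib_register_device(&dev->ibdev, "foo%d");
	if (ret)
		ib_dealloc_device(&dev->ibdev);
	return ret;
}
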
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c
index fb0c5c0..7944c58 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c
@@ -183,25 +183,20 @@
 				struct ib_umem *umem, u64 offset)
 {
 	u64 i = offset;
-	int j, entry;
-	int ret = 0, len = 0;
-	struct scatterlist *sg;
+	int ret = 0;
+	struct sg_dma_page_iter sg_iter;
 
 	if (offset >= pdir->npages)
 		return -EINVAL;
 
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		len = sg_dma_len(sg) >> PAGE_SHIFT;
-		for (j = 0; j < len; j++) {
-			dma_addr_t addr = sg_dma_address(sg) +
-					  (j << umem->page_shift);
+	for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+		dma_addr_t addr = sg_page_iter_dma_address(&sg_iter);
 
-			ret = pvrdma_page_dir_insert_dma(pdir, i, addr);
-			if (ret)
-				goto exit;
+		ret = pvrdma_page_dir_insert_dma(pdir, i, addr);
+		if (ret)
+			goto exit;
 
-			i++;
-		}
+		i++;
 	}
 
 exit:
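
for_each_sg_dma_page() walks the mapped scatterlist in PAGE_SIZE steps regardless of how many pages each DMA segment covers, which is why the nested length/page-shift arithmetic can be dropped. A minimal sketch of the iterator on an already-mapped umem (foo_* name and the tbl output array are illustrative):

#include <linux/scatterlist.h>
#include <rdma/ib_umem.h>

static void foo_fill_dma_pages(struct ib_umem *umem, dma_addr_t *tbl)
{
	struct sg_dma_page_iter sg_iter;
	unsigned long i = 0;

	/* One iteration per PAGE_SIZE chunk of the mapped umem. */
	for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0)
		tbl[i++] = sg_page_iter_dma_address(&sg_iter);
}
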
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
index fa96fa4..f3a3d22 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
@@ -119,24 +119,24 @@
 	union pvrdma_cmd_resp rsp;
 	struct pvrdma_cmd_create_mr *cmd = &req.create_mr;
 	struct pvrdma_cmd_create_mr_resp *resp = &rsp.create_mr_resp;
-	int ret;
+	int ret, npages;
 
 	if (length == 0 || length > dev->dsr->caps.max_mr_size) {
 		dev_warn(&dev->pdev->dev, "invalid mem region length\n");
 		return ERR_PTR(-EINVAL);
 	}
 
-	umem = ib_umem_get(pd->uobject->context, start,
-			   length, access_flags, 0);
+	umem = ib_umem_get(udata, start, length, access_flags, 0);
 	if (IS_ERR(umem)) {
 		dev_warn(&dev->pdev->dev,
 			 "could not get umem for mem region\n");
 		return ERR_CAST(umem);
 	}
 
-	if (umem->npages < 0 || umem->npages > PVRDMA_PAGE_DIR_MAX_PAGES) {
+	npages = ib_umem_num_pages(umem);
+	if (npages < 0 || npages > PVRDMA_PAGE_DIR_MAX_PAGES) {
 		dev_warn(&dev->pdev->dev, "overflow %d pages in mem region\n",
-			 umem->npages);
+			 npages);
 		ret = -EINVAL;
 		goto err_umem;
 	}
@@ -151,7 +151,7 @@
 	mr->mmr.size = length;
 	mr->umem = umem;
 
-	ret = pvrdma_page_dir_init(dev, &mr->pdir, umem->npages, false);
+	ret = pvrdma_page_dir_init(dev, &mr->pdir, npages, false);
 	if (ret) {
 		dev_warn(&dev->pdev->dev,
 			 "could not allocate page directory\n");
@@ -168,7 +168,7 @@
 	cmd->length = length;
 	cmd->pd_handle = to_vpd(pd)->pd_handle;
 	cmd->access_flags = access_flags;
-	cmd->nchunks = umem->npages;
+	cmd->nchunks = npages;
 	cmd->pdir_dma = mr->pdir.dir_dma;
 
 	ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_MR_RESP);
@@ -202,7 +202,7 @@
  * @return: ib_mr pointer on success, otherwise returns an errno.
  */
 struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			      u32 max_num_sg)
+			      u32 max_num_sg, struct ib_udata *udata)
 {
 	struct pvrdma_dev *dev = to_vdev(pd->device);
 	struct pvrdma_user_mr *mr;
@@ -273,7 +273,7 @@
  *
  * @return: 0 on success.
  */
-int pvrdma_dereg_mr(struct ib_mr *ibmr)
+int pvrdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
 	struct pvrdma_user_mr *mr = to_vmr(ibmr);
 	struct pvrdma_dev *dev = to_vdev(ibmr->device);
@@ -290,8 +290,7 @@
 			 "could not deregister mem region, error: %d\n", ret);
 
 	pvrdma_page_dir_cleanup(dev, &mr->pdir);
-	if (mr->umem)
-		ib_umem_release(mr->umem);
+	ib_umem_release(mr->umem);
 
 	kfree(mr->pages);
 	kfree(mr);
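
ib_umem_get() now takes the udata instead of an ib_ucontext, the npages field is gone from struct ib_umem (replaced by ib_umem_num_pages()), and ib_umem_release() is NULL-safe, which is why the "if (mr->umem)" checks are dropped. A minimal sketch of the updated usage (foo_* name and access flags are illustrative):

#include <rdma/ib_umem.h>

static int foo_map_user_buf(struct ib_udata *udata, u64 start, u64 length)
{
	struct ib_umem *umem;
	int npages;

	umem = ib_umem_get(udata, start, length, IB_ACCESS_LOCAL_WRITE, 0);
	if (IS_ERR(umem))
		return PTR_ERR(umem);

	npages = ib_umem_num_pages(umem);	/* replaces umem->npages */
	/* ... program npages translation entries ... */

	ib_umem_release(umem);			/* NULL-safe */
	return npages;
}
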
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
index 60083c0..bca6a58 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
@@ -249,7 +249,7 @@
 		init_completion(&qp->free);
 
 		qp->state = IB_QPS_RESET;
-		qp->is_kernel = !(pd->uobject && udata);
+		qp->is_kernel = !udata;
 
 		if (!qp->is_kernel) {
 			dev_dbg(&dev->pdev->dev,
@@ -262,8 +262,7 @@
 
 			if (!is_srq) {
 				/* set qp->sq.wqe_cnt, shift, buf_size.. */
-				qp->rumem = ib_umem_get(pd->uobject->context,
-							ucmd.rbuf_addr,
+				qp->rumem = ib_umem_get(udata, ucmd.rbuf_addr,
 							ucmd.rbuf_size, 0, 0);
 				if (IS_ERR(qp->rumem)) {
 					ret = PTR_ERR(qp->rumem);
@@ -275,8 +274,7 @@
 				qp->srq = to_vsrq(init_attr->srq);
 			}
 
-			qp->sumem = ib_umem_get(pd->uobject->context,
-						ucmd.sbuf_addr,
+			qp->sumem = ib_umem_get(udata, ucmd.sbuf_addr,
 						ucmd.sbuf_size, 0, 0);
 			if (IS_ERR(qp->sumem)) {
 				if (!is_srq)
@@ -393,12 +391,8 @@
 err_pdir:
 	pvrdma_page_dir_cleanup(dev, &qp->pdir);
 err_umem:
-	if (!qp->is_kernel) {
-		if (qp->rumem)
-			ib_umem_release(qp->rumem);
-		if (qp->sumem)
-			ib_umem_release(qp->sumem);
-	}
+	ib_umem_release(qp->rumem);
+	ib_umem_release(qp->sumem);
 err_qp:
 	kfree(qp);
 	atomic_dec(&dev->num_qps);
@@ -431,12 +425,8 @@
 		complete(&qp->free);
 	wait_for_completion(&qp->free);
 
-	if (!qp->is_kernel) {
-		if (qp->rumem)
-			ib_umem_release(qp->rumem);
-		if (qp->sumem)
-			ib_umem_release(qp->sumem);
-	}
+	ib_umem_release(qp->rumem);
+	ib_umem_release(qp->sumem);
 
 	pvrdma_page_dir_cleanup(dev, &qp->pdir);
 
@@ -448,10 +438,11 @@
 /**
  * pvrdma_destroy_qp - destroy a queue pair
  * @qp: the queue pair to destroy
+ * @udata: user data or null for kernel object
  *
  * @return: 0 on success.
  */
-int pvrdma_destroy_qp(struct ib_qp *qp)
+int pvrdma_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
 {
 	struct pvrdma_qp *vqp = to_vqp(qp);
 	union pvrdma_cmd_req req;
@@ -499,7 +490,7 @@
 	next_state = (attr_mask & IB_QP_STATE) ? attr->qp_state : cur_state;
 
 	if (!ib_modify_qp_is_ok(cur_state, next_state, ibqp->qp_type,
-				attr_mask, IB_LINK_LAYER_ETHERNET)) {
+				attr_mask)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -721,6 +712,12 @@
 		    wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
 			wqe_hdr->ex.imm_data = wr->ex.imm_data;
 
+		if (unlikely(wqe_hdr->opcode == PVRDMA_WR_ERROR)) {
+			*bad_wr = wr;
+			ret = -EINVAL;
+			goto out;
+		}
+
 		switch (qp->ibqp.qp_type) {
 		case IB_QPT_GSI:
 		case IB_QPT_UD:
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
index dc0ce87..36cdfbd 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
@@ -94,52 +94,45 @@
  * @init_attr: shared receive queue attributes
  * @udata: user data
  *
- * @return: the ib_srq pointer on success, otherwise returns an errno.
+ * @return: 0 on success, otherwise returns an errno.
  */
-struct ib_srq *pvrdma_create_srq(struct ib_pd *pd,
-				 struct ib_srq_init_attr *init_attr,
-				 struct ib_udata *udata)
+int pvrdma_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init_attr,
+		      struct ib_udata *udata)
 {
-	struct pvrdma_srq *srq = NULL;
-	struct pvrdma_dev *dev = to_vdev(pd->device);
+	struct pvrdma_srq *srq = to_vsrq(ibsrq);
+	struct pvrdma_dev *dev = to_vdev(ibsrq->device);
 	union pvrdma_cmd_req req;
 	union pvrdma_cmd_resp rsp;
 	struct pvrdma_cmd_create_srq *cmd = &req.create_srq;
 	struct pvrdma_cmd_create_srq_resp *resp = &rsp.create_srq_resp;
-	struct pvrdma_create_srq_resp srq_resp = {0};
+	struct pvrdma_create_srq_resp srq_resp = {};
 	struct pvrdma_create_srq ucmd;
 	unsigned long flags;
 	int ret;
 
-	if (!(pd->uobject && udata)) {
+	if (!udata) {
 		/* No support for kernel clients. */
 		dev_warn(&dev->pdev->dev,
 			 "no shared receive queue support for kernel client\n");
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 	}
 
 	if (init_attr->srq_type != IB_SRQT_BASIC) {
 		dev_warn(&dev->pdev->dev,
 			 "shared receive queue type %d not supported\n",
 			 init_attr->srq_type);
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
 	if (init_attr->attr.max_wr  > dev->dsr->caps.max_srq_wr ||
 	    init_attr->attr.max_sge > dev->dsr->caps.max_srq_sge) {
 		dev_warn(&dev->pdev->dev,
 			 "shared receive queue size invalid\n");
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
 	if (!atomic_add_unless(&dev->num_srqs, 1, dev->dsr->caps.max_srq))
-		return ERR_PTR(-ENOMEM);
-
-	srq = kmalloc(sizeof(*srq), GFP_KERNEL);
-	if (!srq) {
-		ret = -ENOMEM;
-		goto err_srq;
-	}
+		return -ENOMEM;
 
 	spin_lock_init(&srq->lock);
 	refcount_set(&srq->refcnt, 1);
@@ -153,9 +146,7 @@
 		goto err_srq;
 	}
 
-	srq->umem = ib_umem_get(pd->uobject->context,
-				ucmd.buf_addr,
-				ucmd.buf_size, 0, 0);
+	srq->umem = ib_umem_get(udata, ucmd.buf_addr, ucmd.buf_size, 0, 0);
 	if (IS_ERR(srq->umem)) {
 		ret = PTR_ERR(srq->umem);
 		goto err_srq;
@@ -183,7 +174,7 @@
 	cmd->hdr.cmd = PVRDMA_CMD_CREATE_SRQ;
 	cmd->srq_type = init_attr->srq_type;
 	cmd->nchunks = srq->npages;
-	cmd->pd_handle = to_vpd(pd)->pd_handle;
+	cmd->pd_handle = to_vpd(ibsrq->pd)->pd_handle;
 	cmd->attrs.max_wr = init_attr->attr.max_wr;
 	cmd->attrs.max_sge = init_attr->attr.max_sge;
 	cmd->attrs.srq_limit = init_attr->attr.srq_limit;
@@ -206,21 +197,20 @@
 	/* Copy udata back. */
 	if (ib_copy_to_udata(udata, &srq_resp, sizeof(srq_resp))) {
 		dev_warn(&dev->pdev->dev, "failed to copy back udata\n");
-		pvrdma_destroy_srq(&srq->ibsrq);
-		return ERR_PTR(-EINVAL);
+		pvrdma_destroy_srq(&srq->ibsrq, udata);
+		return -EINVAL;
 	}
 
-	return &srq->ibsrq;
+	return 0;
 
 err_page_dir:
 	pvrdma_page_dir_cleanup(dev, &srq->pdir);
 err_umem:
 	ib_umem_release(srq->umem);
 err_srq:
-	kfree(srq);
 	atomic_dec(&dev->num_srqs);
 
-	return ERR_PTR(ret);
+	return ret;
 }
 
 static void pvrdma_free_srq(struct pvrdma_dev *dev, struct pvrdma_srq *srq)
@@ -240,18 +230,17 @@
 
 	pvrdma_page_dir_cleanup(dev, &srq->pdir);
 
-	kfree(srq);
-
 	atomic_dec(&dev->num_srqs);
 }
 
 /**
  * pvrdma_destroy_srq - destroy shared receive queue
  * @srq: the shared receive queue to destroy
+ * @udata: user data or null for kernel object
  *
  * @return: 0 for success.
  */
-int pvrdma_destroy_srq(struct ib_srq *srq)
+void pvrdma_destroy_srq(struct ib_srq *srq, struct ib_udata *udata)
 {
 	struct pvrdma_srq *vsrq = to_vsrq(srq);
 	union pvrdma_cmd_req req;
@@ -270,8 +259,6 @@
 			 ret);
 
 	pvrdma_free_srq(dev, vsrq);
-
-	return 0;
 }
 
 /**
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
index b65d10b..faf7ecd 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
@@ -50,6 +50,7 @@
 #include <rdma/ib_smi.h>
 #include <rdma/ib_user_verbs.h>
 #include <rdma/vmw_pvrdma-abi.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "pvrdma.h"
 
@@ -70,8 +71,6 @@
 	if (uhw->inlen || uhw->outlen)
 		return -EINVAL;
 
-	memset(props, 0, sizeof(*props));
-
 	props->fw_ver = dev->dsr->caps.fw_ver;
 	props->sys_image_guid = dev->dsr->caps.sys_image_guid;
 	props->max_mr_size = dev->dsr->caps.max_mr_size;
@@ -306,47 +305,42 @@
 
 /**
  * pvrdma_alloc_ucontext - allocate ucontext
- * @ibdev: the IB device
+ * @uctx: the uverbs ucontext
  * @udata: user data
  *
- * @return: the ib_ucontext pointer on success, otherwise errno.
+ * @return:  zero on success, otherwise errno.
  */
-struct ib_ucontext *pvrdma_alloc_ucontext(struct ib_device *ibdev,
-					  struct ib_udata *udata)
+int pvrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 {
+	struct ib_device *ibdev = uctx->device;
 	struct pvrdma_dev *vdev = to_vdev(ibdev);
-	struct pvrdma_ucontext *context;
-	union pvrdma_cmd_req req;
-	union pvrdma_cmd_resp rsp;
+	struct pvrdma_ucontext *context = to_vucontext(uctx);
+	union pvrdma_cmd_req req = {};
+	union pvrdma_cmd_resp rsp = {};
 	struct pvrdma_cmd_create_uc *cmd = &req.create_uc;
 	struct pvrdma_cmd_create_uc_resp *resp = &rsp.create_uc_resp;
-	struct pvrdma_alloc_ucontext_resp uresp = {0};
+	struct pvrdma_alloc_ucontext_resp uresp = {};
 	int ret;
-	void *ptr;
 
 	if (!vdev->ib_active)
-		return ERR_PTR(-EAGAIN);
-
-	context = kmalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return ERR_PTR(-ENOMEM);
+		return -EAGAIN;
 
 	context->dev = vdev;
 	ret = pvrdma_uar_alloc(vdev, &context->uar);
-	if (ret) {
-		kfree(context);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (ret)
+		return -ENOMEM;
 
 	/* get ctx_handle from host */
-	memset(cmd, 0, sizeof(*cmd));
-	cmd->pfn = context->uar.pfn;
+	if (vdev->dsr_version < PVRDMA_PPN64_VERSION)
+		cmd->pfn = context->uar.pfn;
+	else
+		cmd->pfn64 = context->uar.pfn;
+
 	cmd->hdr.cmd = PVRDMA_CMD_CREATE_UC;
 	ret = pvrdma_cmd_post(vdev, &req, &rsp, PVRDMA_CMD_CREATE_UC_RESP);
 	if (ret < 0) {
 		dev_warn(&vdev->pdev->dev,
 			 "could not create ucontext, error: %d\n", ret);
-		ptr = ERR_PTR(ret);
 		goto err;
 	}
 
@@ -357,33 +351,28 @@
 	ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
 	if (ret) {
 		pvrdma_uar_free(vdev, &context->uar);
-		context->ibucontext.device = ibdev;
 		pvrdma_dealloc_ucontext(&context->ibucontext);
-		return ERR_PTR(-EFAULT);
+		return -EFAULT;
 	}
 
-	return &context->ibucontext;
+	return 0;
 
 err:
 	pvrdma_uar_free(vdev, &context->uar);
-	kfree(context);
-	return ptr;
+	return ret;
 }
 
 /**
  * pvrdma_dealloc_ucontext - deallocate ucontext
  * @ibcontext: the ucontext
- *
- * @return: 0 on success, otherwise errno.
  */
-int pvrdma_dealloc_ucontext(struct ib_ucontext *ibcontext)
+void pvrdma_dealloc_ucontext(struct ib_ucontext *ibcontext)
 {
 	struct pvrdma_ucontext *context = to_vucontext(ibcontext);
-	union pvrdma_cmd_req req;
+	union pvrdma_cmd_req req = {};
 	struct pvrdma_cmd_destroy_uc *cmd = &req.destroy_uc;
 	int ret;
 
-	memset(cmd, 0, sizeof(*cmd));
 	cmd->hdr.cmd = PVRDMA_CMD_DESTROY_UC;
 	cmd->ctx_handle = context->ctx_handle;
 
@@ -394,9 +383,6 @@
 
 	/* Free the UAR even if the device command failed */
 	pvrdma_uar_free(to_vdev(ibcontext->device), &context->uar);
-	kfree(context);
-
-	return ret;
 }
 
 /**
@@ -433,86 +419,75 @@
 
 /**
  * pvrdma_alloc_pd - allocate protection domain
- * @ibdev: the IB device
- * @context: user context
+ * @ibpd: PD pointer
  * @udata: user data
  *
  * @return: the ib_pd protection domain pointer on success, otherwise errno.
  */
-struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev,
-			      struct ib_ucontext *context,
-			      struct ib_udata *udata)
+int pvrdma_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
-	struct pvrdma_pd *pd;
+	struct ib_device *ibdev = ibpd->device;
+	struct pvrdma_pd *pd = to_vpd(ibpd);
 	struct pvrdma_dev *dev = to_vdev(ibdev);
-	union pvrdma_cmd_req req;
-	union pvrdma_cmd_resp rsp;
+	union pvrdma_cmd_req req = {};
+	union pvrdma_cmd_resp rsp = {};
 	struct pvrdma_cmd_create_pd *cmd = &req.create_pd;
 	struct pvrdma_cmd_create_pd_resp *resp = &rsp.create_pd_resp;
 	struct pvrdma_alloc_pd_resp pd_resp = {0};
 	int ret;
-	void *ptr;
+	struct pvrdma_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct pvrdma_ucontext, ibucontext);
 
 	/* Check allowed max pds */
 	if (!atomic_add_unless(&dev->num_pds, 1, dev->dsr->caps.max_pd))
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
-	pd = kmalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd) {
-		ptr = ERR_PTR(-ENOMEM);
-		goto err;
-	}
-
-	memset(cmd, 0, sizeof(*cmd));
 	cmd->hdr.cmd = PVRDMA_CMD_CREATE_PD;
-	cmd->ctx_handle = (context) ? to_vucontext(context)->ctx_handle : 0;
+	cmd->ctx_handle = context ? context->ctx_handle : 0;
 	ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_PD_RESP);
 	if (ret < 0) {
 		dev_warn(&dev->pdev->dev,
 			 "failed to allocate protection domain, error: %d\n",
 			 ret);
-		ptr = ERR_PTR(ret);
-		goto freepd;
+		goto err;
 	}
 
-	pd->privileged = !context;
+	pd->privileged = !udata;
 	pd->pd_handle = resp->pd_handle;
 	pd->pdn = resp->pd_handle;
 	pd_resp.pdn = resp->pd_handle;
 
-	if (context) {
+	if (udata) {
 		if (ib_copy_to_udata(udata, &pd_resp, sizeof(pd_resp))) {
 			dev_warn(&dev->pdev->dev,
 				 "failed to copy back protection domain\n");
-			pvrdma_dealloc_pd(&pd->ibpd);
-			return ERR_PTR(-EFAULT);
+			pvrdma_dealloc_pd(&pd->ibpd, udata);
+			return -EFAULT;
 		}
 	}
 
 	/* u32 pd handle */
-	return &pd->ibpd;
+	return 0;
 
-freepd:
-	kfree(pd);
 err:
 	atomic_dec(&dev->num_pds);
-	return ptr;
+	return ret;
 }
 
 /**
  * pvrdma_dealloc_pd - deallocate protection domain
  * @pd: the protection domain to be released
+ * @udata: user data or null for kernel object
  *
  * @return: 0 on success, otherwise errno.
  */
-int pvrdma_dealloc_pd(struct ib_pd *pd)
+void pvrdma_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
 	struct pvrdma_dev *dev = to_vdev(pd->device);
-	union pvrdma_cmd_req req;
+	union pvrdma_cmd_req req = {};
 	struct pvrdma_cmd_destroy_pd *cmd = &req.destroy_pd;
 	int ret;
 
-	memset(cmd, 0, sizeof(*cmd));
 	cmd->hdr.cmd = PVRDMA_CMD_DESTROY_PD;
 	cmd->pd_handle = to_vpd(pd)->pd_handle;
 
@@ -522,10 +497,7 @@
 			 "could not dealloc protection domain, error: %d\n",
 			 ret);
 
-	kfree(to_vpd(pd));
 	atomic_dec(&dev->num_pds);
-
-	return 0;
 }
 
 /**
@@ -533,35 +505,30 @@
  * @pd: the protection domain
  * @ah_attr: the attributes of the AH
  * @udata: user data blob
+ * @flags: create address handle flags (see enum rdma_create_ah_flags)
  *
- * @return: the ib_ah pointer on success, otherwise errno.
+ * @return: 0 on success, otherwise errno.
  */
-struct ib_ah *pvrdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
-			       struct ib_udata *udata)
+int pvrdma_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr,
+		     u32 flags, struct ib_udata *udata)
 {
-	struct pvrdma_dev *dev = to_vdev(pd->device);
-	struct pvrdma_ah *ah;
+	struct pvrdma_dev *dev = to_vdev(ibah->device);
+	struct pvrdma_ah *ah = to_vah(ibah);
 	const struct ib_global_route *grh;
 	u8 port_num = rdma_ah_get_port_num(ah_attr);
 
 	if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	grh = rdma_ah_read_grh(ah_attr);
 	if ((ah_attr->type != RDMA_AH_ATTR_TYPE_ROCE)  ||
 	    rdma_is_multicast_addr((struct in6_addr *)grh->dgid.raw))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (!atomic_add_unless(&dev->num_ahs, 1, dev->dsr->caps.max_ah))
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
-	ah = kzalloc(sizeof(*ah), GFP_KERNEL);
-	if (!ah) {
-		atomic_dec(&dev->num_ahs);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	ah->av.port_pd = to_vpd(pd)->pd_handle | (port_num << 24);
+	ah->av.port_pd = to_vpd(ibah->pd)->pd_handle | (port_num << 24);
 	ah->av.src_path_bits = rdma_ah_get_path_bits(ah_attr);
 	ah->av.src_path_bits |= 0x80;
 	ah->av.gid_index = grh->sgid_index;
@@ -571,25 +538,18 @@
 	memcpy(ah->av.dgid, grh->dgid.raw, 16);
 	memcpy(ah->av.dmac, ah_attr->roce.dmac, ETH_ALEN);
 
-	ah->ibah.device = pd->device;
-	ah->ibah.pd = pd;
-	ah->ibah.uobject = NULL;
-
-	return &ah->ibah;
+	return 0;
 }
 
 /**
  * pvrdma_destroy_ah - destroy an address handle
  * @ah: the address handle to destroyed
+ * @flags: destroy address handle flags (see enum rdma_destroy_ah_flags)
  *
- * @return: 0 on success.
  */
-int pvrdma_destroy_ah(struct ib_ah *ah)
+void pvrdma_destroy_ah(struct ib_ah *ah, u32 flags)
 {
 	struct pvrdma_dev *dev = to_vdev(ah->device);
 
-	kfree(to_vah(ah));
 	atomic_dec(&dev->num_ahs);
-
-	return 0;
 }
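
Both pvrdma and rdmavt (below) now take a flags argument on the AH create/destroy paths. The creation flag defined at this point is RDMA_CREATE_AH_SLEEPABLE, which tells the driver whether it may sleep; a driver that still needs to allocate on this path would typically derive its GFP flags from it, roughly (illustrative, not part of this patch):

	gfp_t gfp = (create_flags & RDMA_CREATE_AH_SLEEPABLE) ?
			GFP_KERNEL : GFP_ATOMIC;
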
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
index b2e3ab5..e4a48f5 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
@@ -396,40 +396,34 @@
 int pvrdma_modify_port(struct ib_device *ibdev, u8 port,
 		       int mask, struct ib_port_modify *props);
 int pvrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
-struct ib_ucontext *pvrdma_alloc_ucontext(struct ib_device *ibdev,
-					  struct ib_udata *udata);
-int pvrdma_dealloc_ucontext(struct ib_ucontext *context);
-struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev,
-			      struct ib_ucontext *context,
-			      struct ib_udata *udata);
-int pvrdma_dealloc_pd(struct ib_pd *ibpd);
+int pvrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
+void pvrdma_dealloc_ucontext(struct ib_ucontext *context);
+int pvrdma_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+void pvrdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
 struct ib_mr *pvrdma_get_dma_mr(struct ib_pd *pd, int acc);
 struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				 u64 virt_addr, int access_flags,
 				 struct ib_udata *udata);
-int pvrdma_dereg_mr(struct ib_mr *mr);
+int pvrdma_dereg_mr(struct ib_mr *mr, struct ib_udata *udata);
 struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			      u32 max_num_sg);
+			      u32 max_num_sg, struct ib_udata *udata);
 int pvrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
 		     int sg_nents, unsigned int *sg_offset);
-struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
-			       const struct ib_cq_init_attr *attr,
-			       struct ib_ucontext *context,
-			       struct ib_udata *udata);
-int pvrdma_destroy_cq(struct ib_cq *cq);
+int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		     struct ib_udata *udata);
+void pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int pvrdma_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
-struct ib_ah *pvrdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
-			       struct ib_udata *udata);
-int pvrdma_destroy_ah(struct ib_ah *ah);
+int pvrdma_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags,
+		     struct ib_udata *udata);
+void pvrdma_destroy_ah(struct ib_ah *ah, u32 flags);
 
-struct ib_srq *pvrdma_create_srq(struct ib_pd *pd,
-				 struct ib_srq_init_attr *init_attr,
-				 struct ib_udata *udata);
+int pvrdma_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *init_attr,
+		      struct ib_udata *udata);
 int pvrdma_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 		      enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
 int pvrdma_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
-int pvrdma_destroy_srq(struct ib_srq *srq);
+void pvrdma_destroy_srq(struct ib_srq *srq, struct ib_udata *udata);
 
 struct ib_qp *pvrdma_create_qp(struct ib_pd *pd,
 			       struct ib_qp_init_attr *init_attr,
@@ -438,7 +432,7 @@
 		     int attr_mask, struct ib_udata *udata);
 int pvrdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
 		    int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
-int pvrdma_destroy_qp(struct ib_qp *qp);
+int pvrdma_destroy_qp(struct ib_qp *qp, struct ib_udata *udata);
 int pvrdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 		     const struct ib_send_wr **bad_wr);
 int pvrdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
diff --git a/drivers/infiniband/sw/Makefile b/drivers/infiniband/sw/Makefile
index 8b095b2..68e0230 100644
--- a/drivers/infiniband/sw/Makefile
+++ b/drivers/infiniband/sw/Makefile
@@ -1,2 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_INFINIBAND_RDMAVT)		+= rdmavt/
 obj-$(CONFIG_RDMA_RXE)			+= rxe/
+obj-$(CONFIG_RDMA_SIW)			+= siw/
diff --git a/drivers/infiniband/sw/rdmavt/Kconfig b/drivers/infiniband/sw/rdmavt/Kconfig
index 98e7980..1f2759c 100644
--- a/drivers/infiniband/sw/rdmavt/Kconfig
+++ b/drivers/infiniband/sw/rdmavt/Kconfig
@@ -1,6 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_RDMAVT
 	tristate "RDMA verbs transport library"
-	depends on 64BIT && ARCH_DMA_ADDR_T_64BIT
+	depends on X86_64 && ARCH_DMA_ADDR_T_64BIT
 	depends on PCI
 	select DMA_VIRT_OPS
 	---help---
diff --git a/drivers/infiniband/sw/rdmavt/Makefile b/drivers/infiniband/sw/rdmavt/Makefile
index 78b276a..b21962d 100644
--- a/drivers/infiniband/sw/rdmavt/Makefile
+++ b/drivers/infiniband/sw/rdmavt/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # rdmavt driver
 #
diff --git a/drivers/infiniband/sw/rdmavt/ah.c b/drivers/infiniband/sw/rdmavt/ah.c
index 084bb4b..fe99da0 100644
--- a/drivers/infiniband/sw/rdmavt/ah.c
+++ b/drivers/infiniband/sw/rdmavt/ah.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2016 Intel Corporation.
+ * Copyright(c) 2016 - 2019 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -89,34 +89,29 @@
 
 /**
  * rvt_create_ah - create an address handle
- * @pd: the protection domain
+ * @ibah: the IB address handle
  * @ah_attr: the attributes of the AH
+ * @create_flags: create address handle flags (see enum rdma_create_ah_flags)
  * @udata: pointer to user's input output buffer information.
  *
  * This may be called from interrupt context.
  *
- * Return: newly allocated ah
+ * Return: 0 on success
  */
-struct ib_ah *rvt_create_ah(struct ib_pd *pd,
-			    struct rdma_ah_attr *ah_attr,
-			    struct ib_udata *udata)
+int rvt_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr,
+		  u32 create_flags, struct ib_udata *udata)
 {
-	struct rvt_ah *ah;
-	struct rvt_dev_info *dev = ib_to_rvt(pd->device);
+	struct rvt_ah *ah = ibah_to_rvtah(ibah);
+	struct rvt_dev_info *dev = ib_to_rvt(ibah->device);
 	unsigned long flags;
 
-	if (rvt_check_ah(pd->device, ah_attr))
-		return ERR_PTR(-EINVAL);
-
-	ah = kmalloc(sizeof(*ah), GFP_ATOMIC);
-	if (!ah)
-		return ERR_PTR(-ENOMEM);
+	if (rvt_check_ah(ibah->device, ah_attr))
+		return -EINVAL;
 
 	spin_lock_irqsave(&dev->n_ahs_lock, flags);
 	if (dev->n_ahs_allocated == dev->dparms.props.max_ah) {
 		spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
-		kfree(ah);
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	}
 
 	dev->n_ahs_allocated++;
@@ -124,37 +119,31 @@
 
 	rdma_copy_ah_attr(&ah->attr, ah_attr);
 
-	atomic_set(&ah->refcount, 0);
-
 	if (dev->driver_f.notify_new_ah)
-		dev->driver_f.notify_new_ah(pd->device, ah_attr, ah);
+		dev->driver_f.notify_new_ah(ibah->device, ah_attr, ah);
 
-	return &ah->ibah;
+	return 0;
 }
 
 /**
  * rvt_destory_ah - Destory an address handle
  * @ibah: address handle
+ * @destroy_flags: destroy address handle flags (see enum rdma_destroy_ah_flags)
+ * @udata: user data or NULL for kernel object
  *
  * Return: 0 on success
  */
-int rvt_destroy_ah(struct ib_ah *ibah)
+void rvt_destroy_ah(struct ib_ah *ibah, u32 destroy_flags)
 {
 	struct rvt_dev_info *dev = ib_to_rvt(ibah->device);
 	struct rvt_ah *ah = ibah_to_rvtah(ibah);
 	unsigned long flags;
 
-	if (atomic_read(&ah->refcount) != 0)
-		return -EBUSY;
-
 	spin_lock_irqsave(&dev->n_ahs_lock, flags);
 	dev->n_ahs_allocated--;
 	spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
 
 	rdma_destroy_ah_attr(&ah->attr);
-	kfree(ah);
-
-	return 0;
 }
 
 /**
diff --git a/drivers/infiniband/sw/rdmavt/ah.h b/drivers/infiniband/sw/rdmavt/ah.h
index 25271b4..bbb4d3b 100644
--- a/drivers/infiniband/sw/rdmavt/ah.h
+++ b/drivers/infiniband/sw/rdmavt/ah.h
@@ -50,10 +50,9 @@
 
 #include <rdma/rdma_vt.h>
 
-struct ib_ah *rvt_create_ah(struct ib_pd *pd,
-			    struct rdma_ah_attr *ah_attr,
-			    struct ib_udata *udata);
-int rvt_destroy_ah(struct ib_ah *ibah);
+int rvt_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr,
+		  u32 create_flags, struct ib_udata *udata);
+void rvt_destroy_ah(struct ib_ah *ibah, u32 destroy_flags);
 int rvt_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
 int rvt_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
 
diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c
index 4f1544a..a85571a 100644
--- a/drivers/infiniband/sw/rdmavt/cq.c
+++ b/drivers/infiniband/sw/rdmavt/cq.c
@@ -60,22 +60,39 @@
  * @solicited: true if @entry is solicited
  *
  * This may be called with qp->s_lock held.
+ *
+ * Return: true on success, or false
+ * if the CQ is full.
  */
-void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
+bool rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
 {
-	struct rvt_cq_wc *wc;
+	struct ib_uverbs_wc *uqueue = NULL;
+	struct ib_wc *kqueue = NULL;
+	struct rvt_cq_wc *u_wc = NULL;
+	struct rvt_k_cq_wc *k_wc = NULL;
 	unsigned long flags;
 	u32 head;
 	u32 next;
+	u32 tail;
 
 	spin_lock_irqsave(&cq->lock, flags);
 
+	if (cq->ip) {
+		u_wc = cq->queue;
+		uqueue = &u_wc->uqueue[0];
+		head = RDMA_READ_UAPI_ATOMIC(u_wc->head);
+		tail = RDMA_READ_UAPI_ATOMIC(u_wc->tail);
+	} else {
+		k_wc = cq->kqueue;
+		kqueue = &k_wc->kqueue[0];
+		head = k_wc->head;
+		tail = k_wc->tail;
+	}
+
 	/*
-	 * Note that the head pointer might be writable by user processes.
-	 * Take care to verify it is a sane value.
+	 * Note that the head pointer might be writable by
+	 * user processes. Take care to verify it is a sane value.
 	 */
-	wc = cq->queue;
-	head = wc->head;
 	if (head >= (unsigned)cq->ibcq.cqe) {
 		head = cq->ibcq.cqe;
 		next = 0;
@@ -83,7 +100,12 @@
 		next = head + 1;
 	}
 
-	if (unlikely(next == wc->tail)) {
+	if (unlikely(next == tail || cq->cq_full)) {
+		struct rvt_dev_info *rdi = cq->rdi;
+
+		if (!cq->cq_full)
+			rvt_pr_err_ratelimited(rdi, "CQ is full!\n");
+		cq->cq_full = true;
 		spin_unlock_irqrestore(&cq->lock, flags);
 		if (cq->ibcq.event_handler) {
 			struct ib_event ev;
@@ -93,30 +115,30 @@
 			ev.event = IB_EVENT_CQ_ERR;
 			cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
 		}
-		return;
+		return false;
 	}
 	trace_rvt_cq_enter(cq, entry, head);
-	if (cq->ip) {
-		wc->uqueue[head].wr_id = entry->wr_id;
-		wc->uqueue[head].status = entry->status;
-		wc->uqueue[head].opcode = entry->opcode;
-		wc->uqueue[head].vendor_err = entry->vendor_err;
-		wc->uqueue[head].byte_len = entry->byte_len;
-		wc->uqueue[head].ex.imm_data = entry->ex.imm_data;
-		wc->uqueue[head].qp_num = entry->qp->qp_num;
-		wc->uqueue[head].src_qp = entry->src_qp;
-		wc->uqueue[head].wc_flags = entry->wc_flags;
-		wc->uqueue[head].pkey_index = entry->pkey_index;
-		wc->uqueue[head].slid = ib_lid_cpu16(entry->slid);
-		wc->uqueue[head].sl = entry->sl;
-		wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
-		wc->uqueue[head].port_num = entry->port_num;
+	if (uqueue) {
+		uqueue[head].wr_id = entry->wr_id;
+		uqueue[head].status = entry->status;
+		uqueue[head].opcode = entry->opcode;
+		uqueue[head].vendor_err = entry->vendor_err;
+		uqueue[head].byte_len = entry->byte_len;
+		uqueue[head].ex.imm_data = entry->ex.imm_data;
+		uqueue[head].qp_num = entry->qp->qp_num;
+		uqueue[head].src_qp = entry->src_qp;
+		uqueue[head].wc_flags = entry->wc_flags;
+		uqueue[head].pkey_index = entry->pkey_index;
+		uqueue[head].slid = ib_lid_cpu16(entry->slid);
+		uqueue[head].sl = entry->sl;
+		uqueue[head].dlid_path_bits = entry->dlid_path_bits;
+		uqueue[head].port_num = entry->port_num;
 		/* Make sure entry is written before the head index. */
-		smp_wmb();
+		RDMA_WRITE_UAPI_ATOMIC(u_wc->head, next);
 	} else {
-		wc->kqueue[head] = *entry;
+		kqueue[head] = *entry;
+		k_wc->head = next;
 	}
-	wc->head = next;
 
 	if (cq->notify == IB_CQ_NEXT_COMP ||
 	    (cq->notify == IB_CQ_SOLICITED &&
@@ -132,6 +154,7 @@
 	}
 
 	spin_unlock_irqrestore(&cq->lock, flags);
+	return true;
 }
 EXPORT_SYMBOL(rvt_cq_enter);
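
For reference, the producer logic rvt_cq_enter() now implements for both queue flavours is the usual single-producer ring: compute the next head, declare the ring full when it would meet the consumer's tail, write the entry, then publish the head. A self-contained userspace analogue with stand-in types (the kernel orders the publish with smp_wmb() or RDMA_WRITE_UAPI_ATOMIC(), which the sketch only notes in a comment):

    #include <stdbool.h>

    struct wc_entry { unsigned long wr_id; int status; };

    struct cq_ring {
        unsigned int cqe;        /* highest valid index: capacity is cqe + 1 */
        unsigned int head;       /* producer writes, consumer reads */
        unsigned int tail;       /* consumer writes, producer reads */
        struct wc_entry *queue;
    };

    /* Mirrors the new bool return: false means the completion was dropped. */
    static bool cq_ring_enter(struct cq_ring *cq, const struct wc_entry *entry)
    {
        unsigned int head = cq->head;
        unsigned int next;

        if (head > cq->cqe)          /* head may be user-writable: sanitize */
            head = cq->cqe;
        next = (head == cq->cqe) ? 0 : head + 1;

        if (next == cq->tail)        /* ring full -> caller raises CQ error */
            return false;

        cq->queue[head] = *entry;
        /* kernel publishes here with smp_wmb()/RDMA_WRITE_UAPI_ATOMIC() */
        cq->head = next;
        return true;
    }
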
 
@@ -166,45 +189,38 @@
 
 /**
  * rvt_create_cq - create a completion queue
- * @ibdev: the device this completion queue is attached to
+ * @ibcq: Allocated CQ
  * @attr: creation attributes
- * @context: unused by the QLogic_IB driver
  * @udata: user data for libibverbs.so
  *
  * Called by ib_create_cq() in the generic verbs code.
  *
- * Return: pointer to the completion queue or negative errno values
- * for failure.
+ * Return: 0 on success
  */
-struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
-			    const struct ib_cq_init_attr *attr,
-			    struct ib_ucontext *context,
-			    struct ib_udata *udata)
+int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		  struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibcq->device;
 	struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
-	struct rvt_cq *cq;
-	struct rvt_cq_wc *wc;
-	struct ib_cq *ret;
+	struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
+	struct rvt_cq_wc *u_wc = NULL;
+	struct rvt_k_cq_wc *k_wc = NULL;
 	u32 sz;
 	unsigned int entries = attr->cqe;
 	int comp_vector = attr->comp_vector;
+	int err;
 
 	if (attr->flags)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (entries < 1 || entries > rdi->dparms.props.max_cqe)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (comp_vector < 0)
 		comp_vector = 0;
 
 	comp_vector = comp_vector % rdi->ibdev.num_comp_vectors;
 
-	/* Allocate the completion queue structure. */
-	cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, rdi->dparms.node);
-	if (!cq)
-		return ERR_PTR(-ENOMEM);
-
 	/*
 	 * Allocate the completion queue entries and head/tail pointers.
 	 * This is allocated separately so that it can be resized and
@@ -212,17 +228,18 @@
 	 * We need to use vmalloc() in order to support mmap and large
 	 * numbers of entries.
 	 */
-	sz = sizeof(*wc);
-	if (udata && udata->outlen >= sizeof(__u64))
-		sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
-	else
-		sz += sizeof(struct ib_wc) * (entries + 1);
-	wc = udata ?
-		vmalloc_user(sz) :
-		vzalloc_node(sz, rdi->dparms.node);
-	if (!wc) {
-		ret = ERR_PTR(-ENOMEM);
-		goto bail_cq;
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		sz = sizeof(struct ib_uverbs_wc) * (entries + 1);
+		sz += sizeof(*u_wc);
+		u_wc = vmalloc_user(sz);
+		if (!u_wc)
+			return -ENOMEM;
+	} else {
+		sz = sizeof(struct ib_wc) * (entries + 1);
+		sz += sizeof(*k_wc);
+		k_wc = vzalloc_node(sz, rdi->dparms.node);
+		if (!k_wc)
+			return -ENOMEM;
 	}
 
 	/*
@@ -230,26 +247,22 @@
 	 * See rvt_mmap() for details.
 	 */
 	if (udata && udata->outlen >= sizeof(__u64)) {
-		int err;
-
-		cq->ip = rvt_create_mmap_info(rdi, sz, context, wc);
+		cq->ip = rvt_create_mmap_info(rdi, sz, udata, u_wc);
 		if (!cq->ip) {
-			ret = ERR_PTR(-ENOMEM);
+			err = -ENOMEM;
 			goto bail_wc;
 		}
 
 		err = ib_copy_to_udata(udata, &cq->ip->offset,
 				       sizeof(cq->ip->offset));
-		if (err) {
-			ret = ERR_PTR(err);
+		if (err)
 			goto bail_ip;
-		}
 	}
 
 	spin_lock_irq(&rdi->n_cqs_lock);
 	if (rdi->n_cqs_allocated == rdi->dparms.props.max_cq) {
 		spin_unlock_irq(&rdi->n_cqs_lock);
-		ret = ERR_PTR(-ENOMEM);
+		err = -ENOMEM;
 		goto bail_ip;
 	}
 
@@ -279,32 +292,30 @@
 	cq->notify = RVT_CQ_NONE;
 	spin_lock_init(&cq->lock);
 	INIT_WORK(&cq->comptask, send_complete);
-	cq->queue = wc;
-
-	ret = &cq->ibcq;
+	if (u_wc)
+		cq->queue = u_wc;
+	else
+		cq->kqueue = k_wc;
 
 	trace_rvt_create_cq(cq, attr);
-	goto done;
+	return 0;
 
 bail_ip:
 	kfree(cq->ip);
 bail_wc:
-	vfree(wc);
-bail_cq:
-	kfree(cq);
-done:
-	return ret;
+	vfree(u_wc);
+	vfree(k_wc);
+	return err;
 }
 
 /**
  * rvt_destroy_cq - destroy a completion queue
  * @ibcq: the completion queue to destroy.
+ * @udata: user data or NULL for kernel object
  *
  * Called by ib_destroy_cq() in the generic verbs code.
- *
- * Return: always 0
  */
-int rvt_destroy_cq(struct ib_cq *ibcq)
+void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
 	struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
 	struct rvt_dev_info *rdi = cq->rdi;
@@ -317,9 +328,6 @@
 		kref_put(&cq->ip->ref, rvt_release_mmap_info);
 	else
 		vfree(cq->queue);
-	kfree(cq);
-
-	return 0;
 }
 
 /**
@@ -346,9 +354,16 @@
 	if (cq->notify != IB_CQ_NEXT_COMP)
 		cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
 
-	if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
-	    cq->queue->head != cq->queue->tail)
-		ret = 1;
+	if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
+		if (cq->queue) {
+			if (RDMA_READ_UAPI_ATOMIC(cq->queue->head) !=
+				RDMA_READ_UAPI_ATOMIC(cq->queue->tail))
+				ret = 1;
+		} else {
+			if (cq->kqueue->head != cq->kqueue->tail)
+				ret = 1;
+		}
+	}
 
 	spin_unlock_irqrestore(&cq->lock, flags);
 
@@ -364,12 +379,14 @@
 int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
 {
 	struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
-	struct rvt_cq_wc *old_wc;
-	struct rvt_cq_wc *wc;
 	u32 head, tail, n;
 	int ret;
 	u32 sz;
 	struct rvt_dev_info *rdi = cq->rdi;
+	struct rvt_cq_wc *u_wc = NULL;
+	struct rvt_cq_wc *old_u_wc = NULL;
+	struct rvt_k_cq_wc *k_wc = NULL;
+	struct rvt_k_cq_wc *old_k_wc = NULL;
 
 	if (cqe < 1 || cqe > rdi->dparms.props.max_cqe)
 		return -EINVAL;
@@ -377,17 +394,19 @@
 	/*
 	 * Need to use vmalloc() if we want to support large #s of entries.
 	 */
-	sz = sizeof(*wc);
-	if (udata && udata->outlen >= sizeof(__u64))
-		sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
-	else
-		sz += sizeof(struct ib_wc) * (cqe + 1);
-	wc = udata ?
-		vmalloc_user(sz) :
-		vzalloc_node(sz, rdi->dparms.node);
-	if (!wc)
-		return -ENOMEM;
-
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		sz = sizeof(struct ib_uverbs_wc) * (cqe + 1);
+		sz += sizeof(*u_wc);
+		u_wc = vmalloc_user(sz);
+		if (!u_wc)
+			return -ENOMEM;
+	} else {
+		sz = sizeof(struct ib_wc) * (cqe + 1);
+		sz += sizeof(*k_wc);
+		k_wc = vzalloc_node(sz, rdi->dparms.node);
+		if (!k_wc)
+			return -ENOMEM;
+	}
 	/* Check that we can write the offset to mmap. */
 	if (udata && udata->outlen >= sizeof(__u64)) {
 		__u64 offset = 0;
@@ -402,11 +421,18 @@
 	 * Make sure head and tail are sane since they
 	 * might be user writable.
 	 */
-	old_wc = cq->queue;
-	head = old_wc->head;
+	if (u_wc) {
+		old_u_wc = cq->queue;
+		head = RDMA_READ_UAPI_ATOMIC(old_u_wc->head);
+		tail = RDMA_READ_UAPI_ATOMIC(old_u_wc->tail);
+	} else {
+		old_k_wc = cq->kqueue;
+		head = old_k_wc->head;
+		tail = old_k_wc->tail;
+	}
+
 	if (head > (u32)cq->ibcq.cqe)
 		head = (u32)cq->ibcq.cqe;
-	tail = old_wc->tail;
 	if (tail > (u32)cq->ibcq.cqe)
 		tail = (u32)cq->ibcq.cqe;
 	if (head < tail)
@@ -418,27 +444,36 @@
 		goto bail_unlock;
 	}
 	for (n = 0; tail != head; n++) {
-		if (cq->ip)
-			wc->uqueue[n] = old_wc->uqueue[tail];
+		if (u_wc)
+			u_wc->uqueue[n] = old_u_wc->uqueue[tail];
 		else
-			wc->kqueue[n] = old_wc->kqueue[tail];
+			k_wc->kqueue[n] = old_k_wc->kqueue[tail];
 		if (tail == (u32)cq->ibcq.cqe)
 			tail = 0;
 		else
 			tail++;
 	}
 	cq->ibcq.cqe = cqe;
-	wc->head = n;
-	wc->tail = 0;
-	cq->queue = wc;
+	if (u_wc) {
+		RDMA_WRITE_UAPI_ATOMIC(u_wc->head, n);
+		RDMA_WRITE_UAPI_ATOMIC(u_wc->tail, 0);
+		cq->queue = u_wc;
+	} else {
+		k_wc->head = n;
+		k_wc->tail = 0;
+		cq->kqueue = k_wc;
+	}
 	spin_unlock_irq(&cq->lock);
 
-	vfree(old_wc);
+	if (u_wc)
+		vfree(old_u_wc);
+	else
+		vfree(old_k_wc);
 
 	if (cq->ip) {
 		struct rvt_mmap_info *ip = cq->ip;
 
-		rvt_update_mmap_info(rdi, ip, sz, wc);
+		rvt_update_mmap_info(rdi, ip, sz, u_wc);
 
 		/*
 		 * Return the offset to mmap.
@@ -462,7 +497,9 @@
 bail_unlock:
 	spin_unlock_irq(&cq->lock);
 bail_free:
-	vfree(wc);
+	vfree(u_wc);
+	vfree(k_wc);
+
 	return ret;
 }
 
@@ -480,7 +517,7 @@
 int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
 {
 	struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
-	struct rvt_cq_wc *wc;
+	struct rvt_k_cq_wc *wc;
 	unsigned long flags;
 	int npolled;
 	u32 tail;
@@ -491,7 +528,7 @@
 
 	spin_lock_irqsave(&cq->lock, flags);
 
-	wc = cq->queue;
+	wc = cq->kqueue;
 	tail = wc->tail;
 	if (tail > (u32)cq->ibcq.cqe)
 		tail = (u32)cq->ibcq.cqe;
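
rvt_resize_cq() preserves entries across the user/kernel queue split by walking the old ring from tail to head, packing the surviving entries at the start of the new ring, and then setting the new head to the copied count and the new tail to 0. A small self-contained sketch of that circular compaction, with made-up types:

    #include <assert.h>

    struct entry { unsigned long wr_id; };

    /*
     * Copy the occupied slots [tail, head) of a ring whose capacity is
     * old_cqe + 1 into the start of a new buffer.  Returns the number of
     * entries copied, which becomes the new head (the new tail is 0).
     */
    static unsigned int ring_compact(const struct entry *old, unsigned int old_cqe,
                                     unsigned int tail, unsigned int head,
                                     struct entry *new_buf, unsigned int new_cqe)
    {
        unsigned int n;

        for (n = 0; tail != head; n++) {
            assert(n <= new_cqe);      /* caller verified the new ring is large enough */
            new_buf[n] = old[tail];
            if (tail == old_cqe)       /* wrap at the last valid index */
                tail = 0;
            else
                tail++;
        }
        return n;
    }
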
diff --git a/drivers/infiniband/sw/rdmavt/cq.h b/drivers/infiniband/sw/rdmavt/cq.h
index 72184b1..5e26a2e 100644
--- a/drivers/infiniband/sw/rdmavt/cq.h
+++ b/drivers/infiniband/sw/rdmavt/cq.h
@@ -51,11 +51,9 @@
 #include <rdma/rdma_vt.h>
 #include <rdma/rdmavt_cq.h>
 
-struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
-			    const struct ib_cq_init_attr *attr,
-			    struct ib_ucontext *context,
-			    struct ib_udata *udata);
-int rvt_destroy_cq(struct ib_cq *ibcq);
+int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		  struct ib_udata *udata);
+void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags);
 int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
 int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
diff --git a/drivers/infiniband/sw/rdmavt/mad.c b/drivers/infiniband/sw/rdmavt/mad.c
index d6981dc..108c71e 100644
--- a/drivers/infiniband/sw/rdmavt/mad.c
+++ b/drivers/infiniband/sw/rdmavt/mad.c
@@ -160,7 +160,8 @@
 			ib_unregister_mad_agent(agent);
 		}
 		if (rvp->sm_ah) {
-			rdma_destroy_ah(&rvp->sm_ah->ibah);
+			rdma_destroy_ah(&rvp->sm_ah->ibah,
+					RDMA_DESTROY_AH_SLEEPABLE);
 			rvp->sm_ah = NULL;
 		}
 
diff --git a/drivers/infiniband/sw/rdmavt/mmap.c b/drivers/infiniband/sw/rdmavt/mmap.c
index 6b712ee..652f4a7 100644
--- a/drivers/infiniband/sw/rdmavt/mmap.c
+++ b/drivers/infiniband/sw/rdmavt/mmap.c
@@ -49,6 +49,7 @@
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
 #include <asm/pgtable.h>
+#include <rdma/uverbs_ioctl.h>
 #include "mmap.h"
 
 /**
@@ -150,18 +151,19 @@
  * rvt_create_mmap_info - allocate information for hfi1_mmap
  * @rdi: rvt dev struct
  * @size: size in bytes to map
- * @context: user context
+ * @udata: user data (must be valid!)
  * @obj: opaque pointer to a cq, wq etc
  *
  * Return: rvt_mmap struct on success
  */
-struct rvt_mmap_info *rvt_create_mmap_info(struct rvt_dev_info *rdi,
-					   u32 size,
-					   struct ib_ucontext *context,
-					   void *obj)
+struct rvt_mmap_info *rvt_create_mmap_info(struct rvt_dev_info *rdi, u32 size,
+					   struct ib_udata *udata, void *obj)
 {
 	struct rvt_mmap_info *ip;
 
+	if (!udata)
+		return ERR_PTR(-EINVAL);
+
 	ip = kmalloc_node(sizeof(*ip), GFP_KERNEL, rdi->dparms.node);
 	if (!ip)
 		return ip;
@@ -177,7 +179,9 @@
 
 	INIT_LIST_HEAD(&ip->pending_mmaps);
 	ip->size = size;
-	ip->context = context;
+	ip->context =
+		container_of(udata, struct uverbs_attr_bundle, driver_udata)
+			->context;
 	ip->obj = obj;
 	kref_init(&ip->ref);
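
rvt_create_mmap_info() no longer receives the ib_ucontext directly; it recovers it from the ib_udata by exploiting the fact that the udata passed to driver callbacks is embedded in a struct uverbs_attr_bundle as driver_udata. The recovery is the ordinary container_of() idiom; a self-contained illustration with stand-in types:

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct ucontext { int id; };
    struct udata   { int inlen, outlen; };

    /* Stand-in for struct uverbs_attr_bundle: the udata lives inside it. */
    struct bundle {
        struct ucontext *context;
        struct udata driver_udata;
    };

    static struct ucontext *udata_to_context(struct udata *udata)
    {
        return container_of(udata, struct bundle, driver_udata)->context;
    }

    int main(void)
    {
        struct ucontext uc = { .id = 42 };
        struct bundle b = { .context = &uc };

        /* Given only &b.driver_udata, we climb back up to the bundle. */
        printf("%d\n", udata_to_context(&b.driver_udata)->id);
        return 0;
    }
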
 
diff --git a/drivers/infiniband/sw/rdmavt/mmap.h b/drivers/infiniband/sw/rdmavt/mmap.h
index fab0e7b..02466c4 100644
--- a/drivers/infiniband/sw/rdmavt/mmap.h
+++ b/drivers/infiniband/sw/rdmavt/mmap.h
@@ -53,10 +53,8 @@
 void rvt_mmap_init(struct rvt_dev_info *rdi);
 void rvt_release_mmap_info(struct kref *ref);
 int rvt_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
-struct rvt_mmap_info *rvt_create_mmap_info(struct rvt_dev_info *rdi,
-					   u32 size,
-					   struct ib_ucontext *context,
-					   void *obj);
+struct rvt_mmap_info *rvt_create_mmap_info(struct rvt_dev_info *rdi, u32 size,
+					   struct ib_udata *udata, void *obj);
 void rvt_update_mmap_info(struct rvt_dev_info *rdi, struct rvt_mmap_info *ip,
 			  u32 size, void *obj);
 
diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c
index 49c9541..a6a39f0 100644
--- a/drivers/infiniband/sw/rdmavt/mr.c
+++ b/drivers/infiniband/sw/rdmavt/mr.c
@@ -96,6 +96,8 @@
 	for (i = 0; i < rdi->lkey_table.max; i++)
 		RCU_INIT_POINTER(rdi->lkey_table.table[i], NULL);
 
+	rdi->dparms.props.max_mr = rdi->lkey_table.max;
+	rdi->dparms.props.max_fmr = rdi->lkey_table.max;
 	return 0;
 }
 
@@ -381,19 +383,18 @@
 {
 	struct rvt_mr *mr;
 	struct ib_umem *umem;
-	struct scatterlist *sg;
-	int n, m, entry;
+	struct sg_page_iter sg_iter;
+	int n, m;
 	struct ib_mr *ret;
 
 	if (length == 0)
 		return ERR_PTR(-EINVAL);
 
-	umem = ib_umem_get(pd->uobject->context, start, length,
-			   mr_access_flags, 0);
+	umem = ib_umem_get(udata, start, length, mr_access_flags, 0);
 	if (IS_ERR(umem))
 		return (void *)umem;
 
-	n = umem->nmap;
+	n = ib_umem_num_pages(umem);
 
 	mr = __rvt_alloc_mr(n, pd);
 	if (IS_ERR(mr)) {
@@ -408,23 +409,21 @@
 	mr->mr.access_flags = mr_access_flags;
 	mr->umem = umem;
 
-	mr->mr.page_shift = umem->page_shift;
+	mr->mr.page_shift = PAGE_SHIFT;
 	m = 0;
 	n = 0;
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+	for_each_sg_page (umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
 		void *vaddr;
 
-		vaddr = page_address(sg_page(sg));
+		vaddr = page_address(sg_page_iter_page(&sg_iter));
 		if (!vaddr) {
 			ret = ERR_PTR(-EINVAL);
 			goto bail_inval;
 		}
 		mr->mr.map[m]->segs[n].vaddr = vaddr;
-		mr->mr.map[m]->segs[n].length = BIT(umem->page_shift);
-		trace_rvt_mr_user_seg(&mr->mr, m, n, vaddr,
-				      BIT(umem->page_shift));
-		n++;
-		if (n == RVT_SEGSZ) {
+		mr->mr.map[m]->segs[n].length = PAGE_SIZE;
+		trace_rvt_mr_user_seg(&mr->mr, m, n, vaddr, PAGE_SIZE);
+		if (++n == RVT_SEGSZ) {
 			m++;
 			n = 0;
 		}
@@ -551,7 +550,7 @@
  *
  * Returns 0 on success.
  */
-int rvt_dereg_mr(struct ib_mr *ibmr)
+int rvt_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
 	struct rvt_mr *mr = to_imr(ibmr);
 	int ret;
@@ -563,8 +562,7 @@
 	if (ret)
 		goto out;
 	rvt_deinit_mregion(&mr->mr);
-	if (mr->umem)
-		ib_umem_release(mr->umem);
+	ib_umem_release(mr->umem);
 	kfree(mr);
 out:
 	return ret;
@@ -578,9 +576,8 @@
  *
  * Return: the memory region on success, otherwise return an errno.
  */
-struct ib_mr *rvt_alloc_mr(struct ib_pd *pd,
-			   enum ib_mr_type mr_type,
-			   u32 max_num_sg)
+struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+			   u32 max_num_sg, struct ib_udata *udata)
 {
 	struct rvt_mr *mr;
 
@@ -611,17 +608,12 @@
 	if (unlikely(mapped_segs == mr->mr.max_segs))
 		return -ENOMEM;
 
-	if (mr->mr.length == 0) {
-		mr->mr.user_base = addr;
-		mr->mr.iova = addr;
-	}
-
 	m = mapped_segs / RVT_SEGSZ;
 	n = mapped_segs % RVT_SEGSZ;
 	mr->mr.map[m]->segs[n].vaddr = (void *)addr;
 	mr->mr.map[m]->segs[n].length = ps;
-	trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps);
 	mr->mr.length += ps;
+	trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps);
 
 	return 0;
 }
@@ -633,17 +625,25 @@
  * @sg_nents: number of entries in sg
  * @sg_offset: offset in bytes into sg
  *
+ * Overwrite rvt_mr length with mr length calculated by ib_sg_to_pages.
+ *
  * Return: number of sg elements mapped to the memory region
  */
 int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
 		  int sg_nents, unsigned int *sg_offset)
 {
 	struct rvt_mr *mr = to_imr(ibmr);
+	int ret;
 
 	mr->mr.length = 0;
 	mr->mr.page_shift = PAGE_SHIFT;
-	return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
-			      rvt_set_page);
+	ret = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rvt_set_page);
+	mr->mr.user_base = ibmr->iova;
+	mr->mr.iova = ibmr->iova;
+	mr->mr.offset = ibmr->iova - (u64)mr->mr.map[0]->segs[0].vaddr;
+	mr->mr.length = (size_t)ibmr->length;
+	trace_rvt_map_mr_sg(ibmr, sg_nents, sg_offset);
+	return ret;
 }
 
 /**
@@ -674,6 +674,7 @@
 	ibmr->rkey = key;
 	mr->mr.lkey = key;
 	mr->mr.access_flags = access;
+	mr->mr.iova = ibmr->iova;
 	atomic_set(&mr->mr.lkey_invalid, 0);
 
 	return 0;
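
The registration path now walks the umem with for_each_sg_page(), one PAGE_SIZE chunk at a time, and each chunk lands in the two-level segment array mr->mr.map[m]->segs[n], where m advances every RVT_SEGSZ entries. A tiny self-contained sketch of that index arithmetic (SEGSZ is a stand-in value, not the real RVT_SEGSZ):

    #include <stdio.h>

    #define SEGSZ 16   /* stand-in for RVT_SEGSZ: segments per map block */

    /* Translate a flat segment count into the (map, seg) pair used above. */
    static void seg_index(unsigned int mapped_segs, unsigned int *m, unsigned int *n)
    {
        *m = mapped_segs / SEGSZ;
        *n = mapped_segs % SEGSZ;
    }

    int main(void)
    {
        unsigned int m, n;

        /* e.g. the 36th PAGE_SIZE chunk of a user registration ... */
        seg_index(35, &m, &n);
        printf("map[%u]->segs[%u]\n", m, n);   /* prints map[2]->segs[3] */
        return 0;
    }
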
diff --git a/drivers/infiniband/sw/rdmavt/mr.h b/drivers/infiniband/sw/rdmavt/mr.h
index 132800e..2c8d075 100644
--- a/drivers/infiniband/sw/rdmavt/mr.h
+++ b/drivers/infiniband/sw/rdmavt/mr.h
@@ -78,10 +78,9 @@
 struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 			      u64 virt_addr, int mr_access_flags,
 			      struct ib_udata *udata);
-int rvt_dereg_mr(struct ib_mr *ibmr);
-struct ib_mr *rvt_alloc_mr(struct ib_pd *pd,
-			   enum ib_mr_type mr_type,
-			   u32 max_num_sg);
+int rvt_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
+struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+			   u32 max_num_sg, struct ib_udata *udata);
 int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
 		  int sg_nents, unsigned int *sg_offset);
 struct ib_fmr *rvt_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
diff --git a/drivers/infiniband/sw/rdmavt/pd.c b/drivers/infiniband/sw/rdmavt/pd.c
index 8a89aff..a403718 100644
--- a/drivers/infiniband/sw/rdmavt/pd.c
+++ b/drivers/infiniband/sw/rdmavt/pd.c
@@ -50,27 +50,20 @@
 
 /**
  * rvt_alloc_pd - allocate a protection domain
- * @ibdev: ib device
- * @context: optional user context
+ * @ibpd: PD
  * @udata: optional user data
  *
  * Allocate and keep track of a PD.
  *
  * Return: 0 on success
  */
-struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev,
-			   struct ib_ucontext *context,
-			   struct ib_udata *udata)
+int rvt_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
+	struct ib_device *ibdev = ibpd->device;
 	struct rvt_dev_info *dev = ib_to_rvt(ibdev);
-	struct rvt_pd *pd;
-	struct ib_pd *ret;
+	struct rvt_pd *pd = ibpd_to_rvtpd(ibpd);
+	int ret = 0;
 
-	pd = kmalloc(sizeof(*pd), GFP_KERNEL);
-	if (!pd) {
-		ret = ERR_PTR(-ENOMEM);
-		goto bail;
-	}
 	/*
 	 * While we could continue allocating protection domains, being
 	 * constrained only by system resources. The IBTA spec defines that
@@ -81,8 +74,7 @@
 	spin_lock(&dev->n_pds_lock);
 	if (dev->n_pds_allocated == dev->dparms.props.max_pd) {
 		spin_unlock(&dev->n_pds_lock);
-		kfree(pd);
-		ret = ERR_PTR(-ENOMEM);
+		ret = -ENOMEM;
 		goto bail;
 	}
 
@@ -92,8 +84,6 @@
 	/* ib_alloc_pd() will initialize pd->ibpd. */
 	pd->user = !!udata;
 
-	ret = &pd->ibpd;
-
 bail:
 	return ret;
 }
@@ -101,19 +91,15 @@
 /**
  * rvt_dealloc_pd - Free PD
  * @ibpd: Free up PD
+ * @udata: Valid user data or NULL for kernel object
  *
  * Return: always 0
  */
-int rvt_dealloc_pd(struct ib_pd *ibpd)
+void rvt_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
-	struct rvt_pd *pd = ibpd_to_rvtpd(ibpd);
 	struct rvt_dev_info *dev = ib_to_rvt(ibpd->device);
 
 	spin_lock(&dev->n_pds_lock);
 	dev->n_pds_allocated--;
 	spin_unlock(&dev->n_pds_lock);
-
-	kfree(pd);
-
-	return 0;
 }
diff --git a/drivers/infiniband/sw/rdmavt/pd.h b/drivers/infiniband/sw/rdmavt/pd.h
index 1892ca4..71ba76d 100644
--- a/drivers/infiniband/sw/rdmavt/pd.h
+++ b/drivers/infiniband/sw/rdmavt/pd.h
@@ -50,9 +50,7 @@
 
 #include <rdma/rdma_vt.h>
 
-struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev,
-			   struct ib_ucontext *context,
-			   struct ib_udata *udata);
-int rvt_dealloc_pd(struct ib_pd *ibpd);
+int rvt_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+void rvt_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
 
 #endif          /* DEF_RDMAVTPD_H */
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 5ce403c..0b0a241 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2016, 2017 Intel Corporation.
+ * Copyright(c) 2016 - 2019 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -53,10 +53,13 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_hdrs.h>
 #include <rdma/opa_addr.h>
+#include <rdma/uverbs_ioctl.h>
 #include "qp.h"
 #include "vt.h"
 #include "trace.h"
 
+#define RVT_RWQ_COUNT_THRESHOLD 16
+
 static void rvt_rc_timeout(struct timer_list *t);
 
 /*
@@ -118,6 +121,187 @@
 };
 EXPORT_SYMBOL(ib_rvt_state_ops);
 
+/* platform specific: return the last level cache (llc) size, in KiB */
+static int rvt_wss_llc_size(void)
+{
+	/* assume that the boot CPU value is universal for all CPUs */
+	return boot_cpu_data.x86_cache_size;
+}
+
+/* platform specific: cacheless copy */
+static void cacheless_memcpy(void *dst, void *src, size_t n)
+{
+	/*
+	 * Use the only available X64 cacheless copy.  Add a __user cast
+	 * to quiet sparse.  The src agument is already in the kernel so
+	 * there are no security issues.  The extra fault recovery machinery
+	 * is not invoked.
+	 */
+	__copy_user_nocache(dst, (void __user *)src, n, 0);
+}
+
+void rvt_wss_exit(struct rvt_dev_info *rdi)
+{
+	struct rvt_wss *wss = rdi->wss;
+
+	if (!wss)
+		return;
+
+	/* coded to handle partially initialized and repeat callers */
+	kfree(wss->entries);
+	wss->entries = NULL;
+	kfree(rdi->wss);
+	rdi->wss = NULL;
+}
+
+/**
+ * rvt_wss_init - Init wss data structures
+ *
+ * Return: 0 on success
+ */
+int rvt_wss_init(struct rvt_dev_info *rdi)
+{
+	unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode;
+	unsigned int wss_threshold = rdi->dparms.wss_threshold;
+	unsigned int wss_clean_period = rdi->dparms.wss_clean_period;
+	long llc_size;
+	long llc_bits;
+	long table_size;
+	long table_bits;
+	struct rvt_wss *wss;
+	int node = rdi->dparms.node;
+
+	if (sge_copy_mode != RVT_SGE_COPY_ADAPTIVE) {
+		rdi->wss = NULL;
+		return 0;
+	}
+
+	rdi->wss = kzalloc_node(sizeof(*rdi->wss), GFP_KERNEL, node);
+	if (!rdi->wss)
+		return -ENOMEM;
+	wss = rdi->wss;
+
+	/* check for a valid percent range - default to 80 if none or invalid */
+	if (wss_threshold < 1 || wss_threshold > 100)
+		wss_threshold = 80;
+
+	/* reject a wildly large period */
+	if (wss_clean_period > 1000000)
+		wss_clean_period = 256;
+
+	/* reject a zero period */
+	if (wss_clean_period == 0)
+		wss_clean_period = 1;
+
+	/*
+	 * Calculate the table size - the next power of 2 larger than the
+	 * LLC size.  LLC size is in KiB.
+	 */
+	llc_size = rvt_wss_llc_size() * 1024;
+	table_size = roundup_pow_of_two(llc_size);
+
+	/* one bit per page in rounded up table */
+	llc_bits = llc_size / PAGE_SIZE;
+	table_bits = table_size / PAGE_SIZE;
+	wss->pages_mask = table_bits - 1;
+	wss->num_entries = table_bits / BITS_PER_LONG;
+
+	wss->threshold = (llc_bits * wss_threshold) / 100;
+	if (wss->threshold == 0)
+		wss->threshold = 1;
+
+	wss->clean_period = wss_clean_period;
+	atomic_set(&wss->clean_counter, wss_clean_period);
+
+	wss->entries = kcalloc_node(wss->num_entries, sizeof(*wss->entries),
+				    GFP_KERNEL, node);
+	if (!wss->entries) {
+		rvt_wss_exit(rdi);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Advance the clean counter.  When the clean period has expired,
+ * clean an entry.
+ *
+ * This is implemented in atomics to avoid locking.  Because multiple
+ * variables are involved, it can be racy which can lead to slightly
+ * inaccurate information.  Since this is only a heuristic, this is
+ * OK.  Any inaccuracies will clean themselves out as the counter
+ * advances.  That said, it is unlikely the entry clean operation will
+ * race - the next possible racer will not start until the next clean
+ * period.
+ *
+ * The clean counter is implemented as a decrement to zero.  When zero
+ * is reached an entry is cleaned.
+ */
+static void wss_advance_clean_counter(struct rvt_wss *wss)
+{
+	int entry;
+	int weight;
+	unsigned long bits;
+
+	/* become the cleaner if we decrement the counter to zero */
+	if (atomic_dec_and_test(&wss->clean_counter)) {
+		/*
+		 * Set, not add, the clean period.  This avoids an issue
+		 * where the counter could decrement below the clean period.
+		 * Doing a set can result in lost decrements, slowing the
+		 * clean advance.  Since this is a heuristic, this possible
+		 * slowdown is OK.
+		 *
+		 * An alternative is to loop, advancing the counter by a
+		 * clean period until the result is > 0. However, this could
+		 * lead to several threads keeping another in the clean loop.
+		 * This could be mitigated by limiting the number of times
+		 * we stay in the loop.
+		 */
+		atomic_set(&wss->clean_counter, wss->clean_period);
+
+		/*
+		 * Uniquely grab the entry to clean and move to next.
+		 * The current entry is always the lower bits of
+		 * wss.clean_entry.  The table size, wss.num_entries,
+		 * is always a power-of-2.
+		 */
+		entry = (atomic_inc_return(&wss->clean_entry) - 1)
+			& (wss->num_entries - 1);
+
+		/* clear the entry and count the bits */
+		bits = xchg(&wss->entries[entry], 0);
+		weight = hweight64((u64)bits);
+		/* only adjust the contended total count if needed */
+		if (weight)
+			atomic_sub(weight, &wss->total_count);
+	}
+}
+
+/*
+ * Insert the given address into the working set array.
+ */
+static void wss_insert(struct rvt_wss *wss, void *address)
+{
+	u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss->pages_mask;
+	u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
+	u32 nr = page & (BITS_PER_LONG - 1);
+
+	if (!test_and_set_bit(nr, &wss->entries[entry]))
+		atomic_inc(&wss->total_count);
+
+	wss_advance_clean_counter(wss);
+}
+
+/*
+ * Is the working set larger than the threshold?
+ */
+static inline bool wss_exceeds_threshold(struct rvt_wss *wss)
+{
+	return atomic_read(&wss->total_count) >= wss->threshold;
+}
+
 static void get_map_page(struct rvt_qpn_table *qpt,
 			 struct rvt_qpn_map *map)
 {
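
The wss_* helpers above keep one bit per page in a table sized to the next power of two above the last-level cache, count how many distinct pages were recently touched, and periodically clear one table slot so stale pages age out; crossing the threshold flips the copy path to the cacheless variant. A self-contained, single-threaded sketch of the same bookkeeping (the kernel uses atomics and xchg(), omitted here; a GCC-style __builtin_popcountl() is assumed):

    #include <stdbool.h>
    #include <stdint.h>

    #define PAGE_SHIFT    12
    #define BITS_PER_LONG (8 * sizeof(unsigned long))

    struct wss {
        unsigned long *entries;    /* one bit per page slot */
        unsigned int num_entries;  /* length of entries[], a power of two */
        unsigned int pages_mask;   /* num_entries * BITS_PER_LONG - 1 */
        unsigned int clean_period; /* inserts between cleaning passes */
        unsigned int clean_counter;
        unsigned int clean_entry;  /* next slot to wipe */
        long total_count;          /* bits currently set */
        long threshold;            /* switch to cacheless copy above this */
    };

    static void wss_advance_clean_counter(struct wss *wss)
    {
        if (--wss->clean_counter == 0) {
            unsigned int entry = wss->clean_entry++ & (wss->num_entries - 1);
            unsigned long bits = wss->entries[entry];

            wss->clean_counter = wss->clean_period;
            wss->entries[entry] = 0;                 /* age out one slot */
            wss->total_count -= __builtin_popcountl(bits);
        }
    }

    static void wss_insert(struct wss *wss, const void *address)
    {
        uint32_t page = ((uintptr_t)address >> PAGE_SHIFT) & wss->pages_mask;
        uint32_t entry = page / BITS_PER_LONG;
        uint32_t nr = page % BITS_PER_LONG;
        unsigned long mask = 1UL << nr;

        if (!(wss->entries[entry] & mask)) {         /* first touch of this page */
            wss->entries[entry] |= mask;
            wss->total_count++;
        }
        wss_advance_clean_counter(wss);
    }

    static bool wss_exceeds_threshold(const struct wss *wss)
    {
        return wss->total_count >= wss->threshold;
    }
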
@@ -412,7 +596,8 @@
 			offset = qpt->incr | ((offset & 1) ^ 1);
 		}
 		/* there can be no set bits in low-order QoS bits */
-		WARN_ON(offset & (BIT(rdi->dparms.qos_shift) - 1));
+		WARN_ON(rdi->dparms.qos_shift > 1 &&
+			offset & ((BIT(rdi->dparms.qos_shift - 1) - 1) << 1));
 		qpn = mk_qpn(qpt, map, offset);
 	}
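
The relaxed WARN_ON only applies when qos_shift > 1 and masks the QPN offset with ((BIT(qos_shift - 1) - 1) << 1): bits 1 through qos_shift - 1 must stay clear, while bit 0, which the allocator deliberately toggles a few lines above, is now tolerated. A quick worked example, assuming qos_shift = 3:

    #include <stdio.h>

    #define BIT(n) (1UL << (n))

    int main(void)
    {
        unsigned int qos_shift = 3;                          /* example value */
        unsigned long old_mask = BIT(qos_shift) - 1;         /* 0b111: bits 0-2 */
        unsigned long new_mask = (BIT(qos_shift - 1) - 1) << 1; /* 0b110: bits 1-2 */
        unsigned long offset = 0x1;   /* tripped the old check, fine with the new */

        printf("old: %s  new: %s\n",
               (offset & old_mask) ? "WARN" : "ok",
               (offset & new_mask) ? "WARN" : "ok");
        return 0;
    }
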
 
@@ -441,13 +626,7 @@
 		while (qp->s_last != qp->s_head) {
 			struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_last);
 
-			rvt_put_swqe(wqe);
-
-			if (qp->ibqp.qp_type == IB_QPT_UD ||
-			    qp->ibqp.qp_type == IB_QPT_SMI ||
-			    qp->ibqp.qp_type == IB_QPT_GSI)
-				atomic_dec(&ibah_to_rvtah(
-						wqe->ud_wr.ah)->refcount);
+			rvt_put_qp_swqe(qp, wqe);
 			if (++qp->s_last >= qp->s_size)
 				qp->s_last = 0;
 			smp_wmb(); /* see qp_set_savail */
@@ -626,6 +805,47 @@
 }
 
 /**
+ * rvt_alloc_rq - allocate memory for user or kernel buffer
+ * @rq: receive queue data structure
+ * @size: size of the WQE array to allocate, in bytes
+ * @node: the NUMA node
+ * @udata: user data; non-NULL when allocating a user-mapped queue
+ *
+ * This function is used by both shared receive queues and
+ * non-shared receive queues to allocate their memory.
+ *
+ * Return: 0 on success, -ENOMEM if memory allocation failed
+ */
+int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node,
+		 struct ib_udata *udata)
+{
+	if (udata) {
+		rq->wq = vmalloc_user(sizeof(struct rvt_rwq) + size);
+		if (!rq->wq)
+			goto bail;
+		/* need kwq with no buffers */
+		rq->kwq = kzalloc_node(sizeof(*rq->kwq), GFP_KERNEL, node);
+		if (!rq->kwq)
+			goto bail;
+		rq->kwq->curr_wq = rq->wq->wq;
+	} else {
+		/* need kwq with buffers */
+		rq->kwq =
+			vzalloc_node(sizeof(struct rvt_krwq) + size, node);
+		if (!rq->kwq)
+			goto bail;
+		rq->kwq->curr_wq = rq->kwq->wq;
+	}
+
+	spin_lock_init(&rq->kwq->p_lock);
+	spin_lock_init(&rq->kwq->c_lock);
+	return 0;
+bail:
+	rvt_free_rq(rq);
+	return -ENOMEM;
+}
+
+/**
  * rvt_init_qp - initialize the QP state to the reset state
  * @qp: the QP to init or reinit
  * @type: the QP type
@@ -673,11 +893,10 @@
 	qp->s_mig_state = IB_MIG_MIGRATED;
 	qp->r_head_ack_queue = 0;
 	qp->s_tail_ack_queue = 0;
+	qp->s_acked_ack_queue = 0;
 	qp->s_num_rd_atomic = 0;
-	if (qp->r_rq.wq) {
-		qp->r_rq.wq->head = 0;
-		qp->r_rq.wq->tail = 0;
-	}
+	if (qp->r_rq.kwq)
+		qp->r_rq.kwq->count = qp->r_rq.size;
 	qp->r_sge.num_sge = 0;
 	atomic_set(&qp->s_reserved_used, 0);
 }
@@ -750,6 +969,61 @@
 }
 
 /**
+ * get_allowed_ops - Given a QP type return the appropriate allowed OP
+ * @type: valid, supported, QP type
+ */
+static u8 get_allowed_ops(enum ib_qp_type type)
+{
+	return type == IB_QPT_RC ? IB_OPCODE_RC : type == IB_QPT_UC ?
+		IB_OPCODE_UC : IB_OPCODE_UD;
+}
+
+/**
+ * free_ud_wq_attr - Clean up AH attribute cache for UD QPs
+ * @qp: Valid QP with allowed_ops set
+ *
+ * The rvt_swqe data structure being used is a union, so this is
+ * only valid for UD QPs.
+ */
+static void free_ud_wq_attr(struct rvt_qp *qp)
+{
+	struct rvt_swqe *wqe;
+	int i;
+
+	for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
+		wqe = rvt_get_swqe_ptr(qp, i);
+		kfree(wqe->ud_wr.attr);
+		wqe->ud_wr.attr = NULL;
+	}
+}
+
+/**
+ * alloc_ud_wq_attr - AH attribute cache for UD QPs
+ * @qp: Valid QP with allowed_ops set
+ * @node: Numa node for allocation
+ *
+ * The rvt_swqe data structure being used is a union, so this is
+ * only valid for UD QPs.
+ */
+static int alloc_ud_wq_attr(struct rvt_qp *qp, int node)
+{
+	struct rvt_swqe *wqe;
+	int i;
+
+	for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
+		wqe = rvt_get_swqe_ptr(qp, i);
+		wqe->ud_wr.attr = kzalloc_node(sizeof(*wqe->ud_wr.attr),
+					       GFP_KERNEL, node);
+		if (!wqe->ud_wr.attr) {
+			free_ud_wq_attr(qp);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+/**
  * rvt_create_qp - create a queue pair for a device
  * @ibpd: the protection domain who's device we create the queue pair for
  * @init_attr: the attributes of the queue pair
@@ -811,9 +1085,7 @@
 	case IB_QPT_UC:
 	case IB_QPT_RC:
 	case IB_QPT_UD:
-		sz = sizeof(struct rvt_sge) *
-			init_attr->cap.max_send_sge +
-			sizeof(struct rvt_swqe);
+		sz = struct_size(swq, sg_list, init_attr->cap.max_send_sge);
 		swq = vzalloc_node(array_size(sz, sqsize), rdi->dparms.node);
 		if (!swq)
 			return ERR_PTR(-ENOMEM);
@@ -833,6 +1105,7 @@
 				  rdi->dparms.node);
 		if (!qp)
 			goto bail_swq;
+		qp->allowed_ops = get_allowed_ops(init_attr->qp_type);
 
 		RCU_INIT_POINTER(qp->next, NULL);
 		if (init_attr->qp_type == IB_QPT_RC) {
@@ -870,17 +1143,12 @@
 			qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
 			sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
 				sizeof(struct rvt_rwqe);
-			if (udata)
-				qp->r_rq.wq = vmalloc_user(
-						sizeof(struct rvt_rwq) +
-						qp->r_rq.size * sz);
-			else
-				qp->r_rq.wq = vzalloc_node(
-						sizeof(struct rvt_rwq) +
-						qp->r_rq.size * sz,
-						rdi->dparms.node);
-			if (!qp->r_rq.wq)
+			err = rvt_alloc_rq(&qp->r_rq, qp->r_rq.size * sz,
+					   rdi->dparms.node, udata);
+			if (err) {
+				ret = ERR_PTR(err);
 				goto bail_driver_priv;
+			}
 		}
 
 		/*
@@ -890,7 +1158,6 @@
 		spin_lock_init(&qp->r_lock);
 		spin_lock_init(&qp->s_hlock);
 		spin_lock_init(&qp->s_lock);
-		spin_lock_init(&qp->r_rq.lock);
 		atomic_set(&qp->refcount, 0);
 		atomic_set(&qp->local_ops_pending, 0);
 		init_waitqueue_head(&qp->wait);
@@ -902,6 +1169,11 @@
 		qp->s_max_sge = init_attr->cap.max_send_sge;
 		if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
 			qp->s_flags = RVT_S_SIGNAL_REQ_WR;
+		err = alloc_ud_wq_attr(qp, rdi->dparms.node);
+		if (err) {
+			ret = (ERR_PTR(err));
+			goto bail_driver_priv;
+		}
 
 		err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table,
 				init_attr->qp_type,
@@ -913,6 +1185,13 @@
 		qp->ibqp.qp_num = err;
 		qp->port_num = init_attr->port_num;
 		rvt_init_qp(rdi, qp, init_attr->qp_type);
+		if (rdi->driver_f.qp_priv_init) {
+			err = rdi->driver_f.qp_priv_init(rdi, qp, init_attr);
+			if (err) {
+				ret = ERR_PTR(err);
+				goto bail_rq_wq;
+			}
+		}
 		break;
 
 	default:
@@ -939,8 +1218,7 @@
 		} else {
 			u32 s = sizeof(struct rvt_rwq) + qp->r_rq.size * sz;
 
-			qp->ip = rvt_create_mmap_info(rdi, s,
-						      ibpd->uobject->context,
+			qp->ip = rvt_create_mmap_info(rdi, s, udata,
 						      qp->r_rq.wq);
 			if (!qp->ip) {
 				ret = ERR_PTR(-ENOMEM);
@@ -988,28 +1266,6 @@
 
 	ret = &qp->ibqp;
 
-	/*
-	 * We have our QP and its good, now keep track of what types of opcodes
-	 * can be processed on this QP. We do this by keeping track of what the
-	 * 3 high order bits of the opcode are.
-	 */
-	switch (init_attr->qp_type) {
-	case IB_QPT_SMI:
-	case IB_QPT_GSI:
-	case IB_QPT_UD:
-		qp->allowed_ops = IB_OPCODE_UD;
-		break;
-	case IB_QPT_RC:
-		qp->allowed_ops = IB_OPCODE_RC;
-		break;
-	case IB_QPT_UC:
-		qp->allowed_ops = IB_OPCODE_UC;
-		break;
-	default:
-		ret = ERR_PTR(-EINVAL);
-		goto bail_ip;
-	}
-
 	return ret;
 
 bail_ip:
@@ -1020,8 +1276,8 @@
 	rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
 
 bail_rq_wq:
-	if (!qp->ip)
-		vfree(qp->r_rq.wq);
+	rvt_free_rq(&qp->r_rq);
+	free_ud_wq_attr(qp);
 
 bail_driver_priv:
 	rdi->driver_f.qp_priv_free(rdi, qp);
@@ -1087,19 +1343,26 @@
 	}
 	wc.status = IB_WC_WR_FLUSH_ERR;
 
-	if (qp->r_rq.wq) {
-		struct rvt_rwq *wq;
+	if (qp->r_rq.kwq) {
 		u32 head;
 		u32 tail;
+		struct rvt_rwq *wq = NULL;
+		struct rvt_krwq *kwq = NULL;
 
-		spin_lock(&qp->r_rq.lock);
-
+		spin_lock(&qp->r_rq.kwq->c_lock);
+		/* qp->ip is used to check whether a user buffer is mmapped */
+		if (qp->ip) {
+			wq = qp->r_rq.wq;
+			head = RDMA_READ_UAPI_ATOMIC(wq->head);
+			tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
+		} else {
+			kwq = qp->r_rq.kwq;
+			head = kwq->head;
+			tail = kwq->tail;
+		}
 		/* sanity check pointers before trusting them */
-		wq = qp->r_rq.wq;
-		head = wq->head;
 		if (head >= qp->r_rq.size)
 			head = 0;
-		tail = wq->tail;
 		if (tail >= qp->r_rq.size)
 			tail = 0;
 		while (tail != head) {
@@ -1108,9 +1371,11 @@
 				tail = 0;
 			rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
 		}
-		wq->tail = tail;
-
-		spin_unlock(&qp->r_rq.lock);
+		if (qp->ip)
+			RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
+		else
+			kwq->tail = tail;
+		spin_unlock(&qp->r_rq.kwq->c_lock);
 	} else if (qp->ibqp.event_handler) {
 		ret = 1;
 	}
@@ -1164,11 +1429,8 @@
 	int lastwqe = 0;
 	int mig = 0;
 	int pmtu = 0; /* for gcc warning only */
-	enum rdma_link_layer link;
 	int opa_ah;
 
-	link = rdma_port_get_link_layer(ibqp->device, qp->port_num);
-
 	spin_lock_irq(&qp->r_lock);
 	spin_lock(&qp->s_hlock);
 	spin_lock(&qp->s_lock);
@@ -1179,7 +1441,7 @@
 	opa_ah = rdma_cap_opa_ah(ibqp->device, qp->port_num);
 
 	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
-				attr_mask, link))
+				attr_mask))
 		goto inval;
 
 	if (rdi->driver_f.check_modify_qp &&
@@ -1428,7 +1690,7 @@
  *
  * Return: 0 on success.
  */
-int rvt_destroy_qp(struct ib_qp *ibqp)
+int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 {
 	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
 	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
@@ -1455,13 +1717,13 @@
 
 	if (qp->ip)
 		kref_put(&qp->ip->ref, rvt_release_mmap_info);
-	else
-		vfree(qp->r_rq.wq);
-	vfree(qp->s_wq);
+	kvfree(qp->r_rq.kwq);
 	rdi->driver_f.qp_priv_free(rdi, qp);
 	kfree(qp->s_ack_queue);
 	rdma_destroy_ah_attr(&qp->remote_ah_attr);
 	rdma_destroy_ah_attr(&qp->alt_ah_attr);
+	free_ud_wq_attr(qp);
+	vfree(qp->s_wq);
 	kfree(qp);
 	return 0;
 }
@@ -1542,7 +1804,7 @@
 		  const struct ib_recv_wr **bad_wr)
 {
 	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
-	struct rvt_rwq *wq = qp->r_rq.wq;
+	struct rvt_krwq *wq = qp->r_rq.kwq;
 	unsigned long flags;
 	int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) &&
 				!qp->ibqp.srq;
@@ -1563,12 +1825,12 @@
 			return -EINVAL;
 		}
 
-		spin_lock_irqsave(&qp->r_rq.lock, flags);
+		spin_lock_irqsave(&qp->r_rq.kwq->p_lock, flags);
 		next = wq->head + 1;
 		if (next >= qp->r_rq.size)
 			next = 0;
-		if (next == wq->tail) {
-			spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+		if (next == READ_ONCE(wq->tail)) {
+			spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
 			*bad_wr = wr;
 			return -ENOMEM;
 		}
@@ -1585,16 +1847,18 @@
 			wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head);
 			wqe->wr_id = wr->wr_id;
 			wqe->num_sge = wr->num_sge;
-			for (i = 0; i < wr->num_sge; i++)
-				wqe->sg_list[i] = wr->sg_list[i];
+			for (i = 0; i < wr->num_sge; i++) {
+				wqe->sg_list[i].addr = wr->sg_list[i].addr;
+				wqe->sg_list[i].length = wr->sg_list[i].length;
+				wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
+			}
 			/*
 			 * Make sure queue entry is written
 			 * before the head index.
 			 */
-			smp_wmb();
-			wq->head = next;
+			smp_store_release(&wq->head, next);
 		}
-		spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+		spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
 	}
 	return 0;
 }
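
The receive-queue producer now publishes a new head with smp_store_release() after filling the WQE, and the opposing index is read with READ_ONCE() (here) or smp_rmb()/smp_load_acquire() on the consuming side, replacing the bare smp_wmb() plus plain store. This is the standard message-passing idiom: the entry must be visible before the index that advertises it. A self-contained C11 analogue with stand-in types:

    #include <stdatomic.h>

    struct rwqe { unsigned long wr_id; };

    struct rq_ring {
        unsigned int size;
        _Atomic unsigned int head;   /* advanced by the posting side */
        _Atomic unsigned int tail;   /* advanced by the consuming side */
        struct rwqe *wq;
    };

    /* Producer: fill the WQE first, then publish the new head (release),
     * mirroring smp_store_release(&wq->head, next) above. */
    static int rq_ring_post(struct rq_ring *rq, unsigned long wr_id)
    {
        unsigned int head = atomic_load_explicit(&rq->head, memory_order_relaxed);
        unsigned int next = (head + 1 == rq->size) ? 0 : head + 1;

        if (next == atomic_load_explicit(&rq->tail, memory_order_acquire))
            return -1;                               /* ring full */

        rq->wq[head].wr_id = wr_id;
        atomic_store_explicit(&rq->head, next, memory_order_release);
        return 0;
    }

    /* Consumer: the acquire load of head pairs with the release store above,
     * so the WQE contents are visible once the index says they exist. */
    static int rq_ring_pop(struct rq_ring *rq, unsigned long *wr_id)
    {
        unsigned int tail = atomic_load_explicit(&rq->tail, memory_order_relaxed);

        if (tail == atomic_load_explicit(&rq->head, memory_order_acquire))
            return 0;                                /* nothing posted */

        *wr_id = rq->wq[tail].wr_id;
        atomic_store_explicit(&rq->tail, (tail + 1 == rq->size) ? 0 : tail + 1,
                              memory_order_release);
        return 1;
    }
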
@@ -1675,10 +1939,9 @@
 
 	/* see rvt_qp_wqe_unreserve() */
 	smp_mb__before_atomic();
-	reserved_used = atomic_read(&qp->s_reserved_used);
 	if (unlikely(reserved_op)) {
 		/* see rvt_qp_wqe_unreserve() */
-		smp_mb__before_atomic();
+		reserved_used = atomic_read(&qp->s_reserved_used);
 		if (reserved_used >= rdi->dparms.reserved_operations)
 			return -ENOMEM;
 		return 0;
@@ -1686,14 +1949,13 @@
 	/* non-reserved operations */
 	if (likely(qp->s_avail))
 		return 0;
-	slast = READ_ONCE(qp->s_last);
+	/* See rvt_qp_complete_swqe() */
+	slast = smp_load_acquire(&qp->s_last);
 	if (qp->s_head >= slast)
 		avail = qp->s_size - (qp->s_head - slast);
 	else
 		avail = slast - qp->s_head;
 
-	/* see rvt_qp_wqe_unreserve() */
-	smp_mb__before_atomic();
 	reserved_used = atomic_read(&qp->s_reserved_used);
 	avail =  avail - 1 -
 		(rdi->dparms.reserved_operations - reserved_used);
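
rvt_qp_is_avail() now reads s_last with smp_load_acquire() (pairing with the completion path noted in the comment above) and only samples the reserved-operation counter when it is actually needed. The availability arithmetic itself is unchanged: the usable depth is the ring space between s_head and s_last, minus one slot to tell full from empty, minus whatever part of the reserved budget is still unused. A self-contained sketch with example numbers:

    #include <stdio.h>

    /* Free slots a normal (non-reserved) post may still consume. */
    static int sq_avail(unsigned int s_size, unsigned int s_head, unsigned int s_last,
                        unsigned int reserved_ops, unsigned int reserved_used)
    {
        unsigned int avail;

        if (s_head >= s_last)
            avail = s_size - (s_head - s_last);
        else
            avail = s_last - s_head;

        /* one slot keeps head != tail when full; unused reserve stays off-limits */
        return (int)avail - 1 - (int)(reserved_ops - reserved_used);
    }

    int main(void)
    {
        /* 32-entry ring, 5 WQEs outstanding, 2-deep reserve with 1 in use */
        printf("%d\n", sq_avail(32, 12, 7, 2, 1));   /* prints 25 */
        return 0;
    }
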
@@ -1718,7 +1980,7 @@
  */
 static int rvt_post_one_wr(struct rvt_qp *qp,
 			   const struct ib_send_wr *wr,
-			   int *call_send)
+			   bool *call_send)
 {
 	struct rvt_swqe *wqe;
 	u32 next;
@@ -1823,22 +2085,17 @@
 		wqe->wr.num_sge = j;
 	}
 
-	/* general part of wqe valid - allow for driver checks */
-	if (rdi->driver_f.check_send_wqe) {
-		ret = rdi->driver_f.check_send_wqe(qp, wqe);
-		if (ret < 0)
-			goto bail_inval_free;
-		if (ret)
-			*call_send = ret;
-	}
-
+	/*
+	 * Calculate and set SWQE PSN values prior to handing it off
+	 * to the driver's check routine. This gives the driver the
+	 * opportunity to adjust PSN values based on internal checks.
+	 */
 	log_pmtu = qp->log_pmtu;
-	if (qp->ibqp.qp_type != IB_QPT_UC &&
-	    qp->ibqp.qp_type != IB_QPT_RC) {
-		struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah);
+	if (qp->allowed_ops == IB_OPCODE_UD) {
+		struct rvt_ah *ah = rvt_get_swqe_ah(wqe);
 
 		log_pmtu = ah->log_pmtu;
-		atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
+		rdma_copy_ah_attr(wqe->ud_wr.attr, &ah->attr);
 	}
 
 	if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) {
@@ -1856,8 +2113,18 @@
 				(wqe->length ?
 					((wqe->length - 1) >> log_pmtu) :
 					0);
-		qp->s_next_psn = wqe->lpsn + 1;
 	}
+
+	/* general part of wqe valid - allow for driver checks */
+	if (rdi->driver_f.setup_wqe) {
+		ret = rdi->driver_f.setup_wqe(qp, wqe, call_send);
+		if (ret < 0)
+			goto bail_inval_free_ref;
+	}
+
+	if (!(rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL))
+		qp->s_next_psn = wqe->lpsn + 1;
+
 	if (unlikely(reserved_op)) {
 		wqe->wr.send_flags |= RVT_SEND_RESERVE_USED;
 		rvt_qp_wqe_reserve(qp, wqe);
@@ -1871,6 +2138,9 @@
 
 	return 0;
 
+bail_inval_free_ref:
+	if (qp->allowed_ops == IB_OPCODE_UD)
+		rdma_destroy_ah_attr(wqe->ud_wr.attr);
 bail_inval_free:
 	/* release mr holds */
 	while (j) {
@@ -1897,7 +2167,7 @@
 	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
 	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
 	unsigned long flags = 0;
-	int call_send;
+	bool call_send;
 	unsigned nreq = 0;
 	int err = 0;
 
@@ -1930,7 +2200,11 @@
 bail:
 	spin_unlock_irqrestore(&qp->s_hlock, flags);
 	if (nreq) {
-		if (call_send)
+		/*
+		 * Only call do_send if there is exactly one packet, and the
+		 * driver said it was ok.
+		 */
+		if (nreq == 1 && call_send)
 			rdi->driver_f.do_send(qp);
 		else
 			rdi->driver_f.schedule_send_no_lock(qp);
@@ -1952,7 +2226,7 @@
 		      const struct ib_recv_wr **bad_wr)
 {
 	struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
-	struct rvt_rwq *wq;
+	struct rvt_krwq *wq;
 	unsigned long flags;
 
 	for (; wr; wr = wr->next) {
@@ -1965,13 +2239,13 @@
 			return -EINVAL;
 		}
 
-		spin_lock_irqsave(&srq->rq.lock, flags);
-		wq = srq->rq.wq;
+		spin_lock_irqsave(&srq->rq.kwq->p_lock, flags);
+		wq = srq->rq.kwq;
 		next = wq->head + 1;
 		if (next >= srq->rq.size)
 			next = 0;
-		if (next == wq->tail) {
-			spin_unlock_irqrestore(&srq->rq.lock, flags);
+		if (next == READ_ONCE(wq->tail)) {
+			spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
 			*bad_wr = wr;
 			return -ENOMEM;
 		}
@@ -1979,17 +2253,35 @@
 		wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head);
 		wqe->wr_id = wr->wr_id;
 		wqe->num_sge = wr->num_sge;
-		for (i = 0; i < wr->num_sge; i++)
-			wqe->sg_list[i] = wr->sg_list[i];
+		for (i = 0; i < wr->num_sge; i++) {
+			wqe->sg_list[i].addr = wr->sg_list[i].addr;
+			wqe->sg_list[i].length = wr->sg_list[i].length;
+			wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
+		}
 		/* Make sure queue entry is written before the head index. */
-		smp_wmb();
-		wq->head = next;
-		spin_unlock_irqrestore(&srq->rq.lock, flags);
+		smp_store_release(&wq->head, next);
+		spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
 	}
 	return 0;
 }
 
 /*
+ * rvt used the internal kernel struct as part of its ABI, for now make sure
+ * the kernel struct does not change layout. FIXME: rvt should never cast the
+ * user struct to a kernel struct.
+ */
+static struct ib_sge *rvt_cast_sge(struct rvt_wqe_sge *sge)
+{
+	BUILD_BUG_ON(offsetof(struct ib_sge, addr) !=
+		     offsetof(struct rvt_wqe_sge, addr));
+	BUILD_BUG_ON(offsetof(struct ib_sge, length) !=
+		     offsetof(struct rvt_wqe_sge, length));
+	BUILD_BUG_ON(offsetof(struct ib_sge, lkey) !=
+		     offsetof(struct rvt_wqe_sge, lkey));
+	return (struct ib_sge *)sge;
+}
+
+/*
  * Validate a RWQE and fill in the SGE state.
  * Return 1 if OK.
  */
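
rvt_cast_sge() papers over the fact that receive WQEs now store struct rvt_wqe_sge (a UAPI-stable layout) while rvt_lkey_ok() takes struct ib_sge; the BUILD_BUG_ON()s make the cast safe by breaking the build if the two layouts ever diverge. The same compile-time technique in self-contained form, using C11 _Static_assert and stand-in structs in place of the kernel's BUILD_BUG_ON():

    #include <stddef.h>
    #include <stdint.h>

    struct ib_sge_like      { uint64_t addr; uint32_t length; uint32_t lkey; };
    struct rvt_wqe_sge_like { uint64_t addr; uint32_t length; uint32_t lkey; };

    /* Compilation fails here the moment either struct is rearranged. */
    _Static_assert(offsetof(struct ib_sge_like, addr) ==
                   offsetof(struct rvt_wqe_sge_like, addr), "addr moved");
    _Static_assert(offsetof(struct ib_sge_like, length) ==
                   offsetof(struct rvt_wqe_sge_like, length), "length moved");
    _Static_assert(offsetof(struct ib_sge_like, lkey) ==
                   offsetof(struct rvt_wqe_sge_like, lkey), "lkey moved");
    _Static_assert(sizeof(struct ib_sge_like) == sizeof(struct rvt_wqe_sge_like),
                   "size changed");

    static struct ib_sge_like *cast_sge(struct rvt_wqe_sge_like *sge)
    {
        return (struct ib_sge_like *)sge;   /* safe only because of the asserts */
    }
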
@@ -2012,7 +2304,7 @@
 			continue;
 		/* Check LKEY */
 		ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
-				  NULL, &wqe->sg_list[i],
+				  NULL, rvt_cast_sge(&wqe->sg_list[i]),
 				  IB_ACCESS_LOCAL_WRITE);
 		if (unlikely(ret <= 0))
 			goto bad_lkey;
@@ -2041,6 +2333,50 @@
 }
 
 /**
+ * get_count - count the number of request work queue entries
+ * in the circular buffer
+ * @rq: the receive queue
+ * @tail: tail index of the circular buffer
+ * @head: head index of the circular buffer
+ *
+ * Return: the number of valid entries in the circular buffer
+ */
+static u32 get_count(struct rvt_rq *rq, u32 tail, u32 head)
+{
+	u32 count;
+
+	count = head;
+
+	if (count >= rq->size)
+		count = 0;
+	if (count < tail)
+		count += rq->size - tail;
+	else
+		count -= tail;
+
+	return count;
+}
+
+/**
+ * get_rvt_head - get the head index of the circular buffer
+ * @rq: the receive queue
+ * @ip: the mmap info pointer; non-NULL for a user-mapped queue
+ *
+ * Return: the head index value
+ */
+static inline u32 get_rvt_head(struct rvt_rq *rq, void *ip)
+{
+	u32 head;
+
+	if (ip)
+		head = RDMA_READ_UAPI_ATOMIC(rq->wq->head);
+	else
+		head = rq->kwq->head;
+
+	return head;
+}
+
+/**
  * rvt_get_rwqe - copy the next RWQE into the QP's RWQE
  * @qp: the QP
  * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
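
get_count() above returns how many WQEs currently sit between tail and head, clamping a corrupt (user-writable) head to 0 first; caching that value in kwq->count lets rvt_get_rwqe() re-read the user-visible head only when the count drops below RVT_RWQ_COUNT_THRESHOLD instead of on every consumption. Two worked examples of the arithmetic:

    #include <stdio.h>

    static unsigned int get_count(unsigned int size, unsigned int tail, unsigned int head)
    {
        unsigned int count = head;

        if (count >= size)      /* head is user-writable: clamp nonsense */
            count = 0;
        if (count < tail)       /* wrapped: entries span the end of the ring */
            count += size - tail;
        else
            count -= tail;
        return count;
    }

    int main(void)
    {
        printf("%u\n", get_count(8, 2, 6));   /* no wrap:  6 - 2       = 4 */
        printf("%u\n", get_count(8, 6, 2));   /* wrapped:  2 + (8 - 6) = 4 */
        return 0;
    }
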
@@ -2054,39 +2390,54 @@
 {
 	unsigned long flags;
 	struct rvt_rq *rq;
+	struct rvt_krwq *kwq = NULL;
 	struct rvt_rwq *wq;
 	struct rvt_srq *srq;
 	struct rvt_rwqe *wqe;
 	void (*handler)(struct ib_event *, void *);
 	u32 tail;
+	u32 head;
 	int ret;
+	void *ip = NULL;
 
 	if (qp->ibqp.srq) {
 		srq = ibsrq_to_rvtsrq(qp->ibqp.srq);
 		handler = srq->ibsrq.event_handler;
 		rq = &srq->rq;
+		ip = srq->ip;
 	} else {
 		srq = NULL;
 		handler = NULL;
 		rq = &qp->r_rq;
+		ip = qp->ip;
 	}
 
-	spin_lock_irqsave(&rq->lock, flags);
+	spin_lock_irqsave(&rq->kwq->c_lock, flags);
 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
 		ret = 0;
 		goto unlock;
 	}
+	kwq = rq->kwq;
+	if (ip) {
+		wq = rq->wq;
+		tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
+	} else {
+		tail = kwq->tail;
+	}
 
-	wq = rq->wq;
-	tail = wq->tail;
 	/* Validate tail before using it since it is user writable. */
 	if (tail >= rq->size)
 		tail = 0;
-	if (unlikely(tail == wq->head)) {
+
+	if (kwq->count < RVT_RWQ_COUNT_THRESHOLD) {
+		head = get_rvt_head(rq, ip);
+		kwq->count = get_count(rq, tail, head);
+	}
+	if (unlikely(kwq->count == 0)) {
 		ret = 0;
 		goto unlock;
 	}
-	/* Make sure entry is read after head index is read. */
+	/* Make sure entry is read after the count is read. */
 	smp_rmb();
 	wqe = rvt_get_rwqe_ptr(rq, tail);
 	/*
@@ -2096,43 +2447,41 @@
 	 */
 	if (++tail >= rq->size)
 		tail = 0;
-	wq->tail = tail;
+	if (ip)
+		RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
+	else
+		kwq->tail = tail;
 	if (!wr_id_only && !init_sge(qp, wqe)) {
 		ret = -1;
 		goto unlock;
 	}
 	qp->r_wr_id = wqe->wr_id;
 
+	kwq->count--;
 	ret = 1;
 	set_bit(RVT_R_WRID_VALID, &qp->r_aflags);
 	if (handler) {
-		u32 n;
-
 		/*
 		 * Validate head pointer value and compute
 		 * the number of remaining WQEs.
 		 */
-		n = wq->head;
-		if (n >= rq->size)
-			n = 0;
-		if (n < tail)
-			n += rq->size - tail;
-		else
-			n -= tail;
-		if (n < srq->limit) {
-			struct ib_event ev;
+		if (kwq->count < srq->limit) {
+			kwq->count = get_count(rq, tail, get_rvt_head(rq, ip));
+			if (kwq->count < srq->limit) {
+				struct ib_event ev;
 
-			srq->limit = 0;
-			spin_unlock_irqrestore(&rq->lock, flags);
-			ev.device = qp->ibqp.device;
-			ev.element.srq = qp->ibqp.srq;
-			ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
-			handler(&ev, srq->ibsrq.srq_context);
-			goto bail;
+				srq->limit = 0;
+				spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
+				ev.device = qp->ibqp.device;
+				ev.element.srq = qp->ibqp.srq;
+				ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+				handler(&ev, srq->ibsrq.srq_context);
+				goto bail;
+			}
 		}
 	}
 unlock:
-	spin_unlock_irqrestore(&rq->lock, flags);
+	spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
 bail:
 	return ret;
 }
@@ -2194,11 +2543,12 @@
 }
 
 /*
- *  rvt_add_retry_timer - add/start a retry timer
+ *  rvt_add_retry_timer_ext - add/start a retry timer
  *  @qp - the QP
+ *  @shift - timeout shift to wait for multiple packets
  *  add a retry timer on the QP
  */
-void rvt_add_retry_timer(struct rvt_qp *qp)
+void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift)
 {
 	struct ib_qp *ibqp = &qp->ibqp;
 	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
@@ -2206,11 +2556,11 @@
 	lockdep_assert_held(&qp->s_lock);
 	qp->s_flags |= RVT_S_TIMER;
        /* 4.096 usec. * (1 << qp->timeout) */
-	qp->s_timer.expires = jiffies + qp->timeout_jiffies +
-			     rdi->busy_jiffies;
+	qp->s_timer.expires = jiffies + rdi->busy_jiffies +
+			      (qp->timeout_jiffies << shift);
 	add_timer(&qp->s_timer);
 }
-EXPORT_SYMBOL(rvt_add_retry_timer);
+EXPORT_SYMBOL(rvt_add_retry_timer_ext);
 
 /**
  * rvt_add_rnr_timer - add/start an rnr timer
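
rvt_add_retry_timer_ext() scales the per-QP retransmit timeout by a caller-supplied shift so a driver can arm one timer that covers several outstanding packets: the expiry becomes jiffies + busy_jiffies + (timeout_jiffies << shift), where timeout_jiffies itself encodes the 4.096 usec * 2^timeout local ACK timeout noted in the comment above. A small numeric sketch; the HZ value and the timeout encoding used here are example assumptions, and the round-up conversion only approximates the kernel's jiffies helpers:

    #include <stdio.h>

    #define HZ 250                          /* example tick rate, an assumption */

    int main(void)
    {
        unsigned int timeout = 14;              /* example IBTA timeout encoding */
        unsigned long nsecs = 4096UL << timeout;            /* 4.096 us * 2^14 */
        unsigned long usecs = nsecs / 1000;
        unsigned long timeout_jiffies = (usecs * HZ + 999999) / 1000000;

        /* each shift step doubles the wait, e.g. to cover multiple packets */
        for (unsigned int shift = 0; shift <= 2; shift++)
            printf("shift %u -> expires in %lu jiffies\n",
                   shift, timeout_jiffies << shift);
        return 0;
    }
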
@@ -2465,3 +2815,440 @@
 	rcu_read_unlock();
 }
 EXPORT_SYMBOL(rvt_qp_iter);
+
+/*
+ * This should be called with s_lock held.
+ */
+void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
+		       enum ib_wc_status status)
+{
+	u32 old_last, last;
+	struct rvt_dev_info *rdi;
+
+	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
+		return;
+	rdi = ib_to_rvt(qp->ibqp.device);
+
+	old_last = qp->s_last;
+	trace_rvt_qp_send_completion(qp, wqe, old_last);
+	last = rvt_qp_complete_swqe(qp, wqe, rdi->wc_opcode[wqe->wr.opcode],
+				    status);
+	if (qp->s_acked == old_last)
+		qp->s_acked = last;
+	if (qp->s_cur == old_last)
+		qp->s_cur = last;
+	if (qp->s_tail == old_last)
+		qp->s_tail = last;
+	if (qp->state == IB_QPS_SQD && last == qp->s_cur)
+		qp->s_draining = 0;
+}
+EXPORT_SYMBOL(rvt_send_complete);
+
+/**
+ * rvt_copy_sge - copy data to SGE memory
+ * @qp: associated QP
+ * @ss: the SGE state
+ * @data: the data to copy
+ * @length: the length of the data
+ * @release: boolean to release MR
+ * @copy_last: do a separate copy of the last 8 bytes
+ */
+void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss,
+		  void *data, u32 length,
+		  bool release, bool copy_last)
+{
+	struct rvt_sge *sge = &ss->sge;
+	int i;
+	bool in_last = false;
+	bool cacheless_copy = false;
+	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+	struct rvt_wss *wss = rdi->wss;
+	unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode;
+
+	if (sge_copy_mode == RVT_SGE_COPY_CACHELESS) {
+		cacheless_copy = length >= PAGE_SIZE;
+	} else if (sge_copy_mode == RVT_SGE_COPY_ADAPTIVE) {
+		if (length >= PAGE_SIZE) {
+			/*
+			 * NOTE: this *assumes*:
+			 * o The first vaddr is the dest.
+			 * o If multiple pages, then vaddr is sequential.
+			 */
+			wss_insert(wss, sge->vaddr);
+			if (length >= (2 * PAGE_SIZE))
+				wss_insert(wss, (sge->vaddr + PAGE_SIZE));
+
+			cacheless_copy = wss_exceeds_threshold(wss);
+		} else {
+			wss_advance_clean_counter(wss);
+		}
+	}
+
+	if (copy_last) {
+		if (length > 8) {
+			length -= 8;
+		} else {
+			copy_last = false;
+			in_last = true;
+		}
+	}
+
+again:
+	while (length) {
+		u32 len = rvt_get_sge_length(sge, length);
+
+		WARN_ON_ONCE(len == 0);
+		if (unlikely(in_last)) {
+			/* enforce byte transfer ordering */
+			for (i = 0; i < len; i++)
+				((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
+		} else if (cacheless_copy) {
+			cacheless_memcpy(sge->vaddr, data, len);
+		} else {
+			memcpy(sge->vaddr, data, len);
+		}
+		rvt_update_sge(ss, len, release);
+		data += len;
+		length -= len;
+	}
+
+	if (copy_last) {
+		copy_last = false;
+		in_last = true;
+		length = 8;
+		goto again;
+	}
+}
+EXPORT_SYMBOL(rvt_copy_sge);
+
+static enum ib_wc_status loopback_qp_drop(struct rvt_ibport *rvp,
+					  struct rvt_qp *sqp)
+{
+	rvp->n_pkt_drops++;
+	/*
+	 * For RC, the requester would timeout and retry so
+	 * shortcut the timeouts and just signal too many retries.
+	 */
+	return sqp->ibqp.qp_type == IB_QPT_RC ?
+		IB_WC_RETRY_EXC_ERR : IB_WC_SUCCESS;
+}
+
+/**
+ * ruc_loopback - handle UC and RC loopback requests
+ * @sqp: the sending QP
+ *
+ * This is called from rvt_do_send() to forward a WQE addressed to the same HFI
+ * Note that although we are single threaded due to the send engine, we still
+ * have to protect against post_send().  We don't have to worry about
+ * receive interrupts since this is a connected protocol and all packets
+ * will pass through here.
+ */
+void rvt_ruc_loopback(struct rvt_qp *sqp)
+{
+	struct rvt_ibport *rvp =  NULL;
+	struct rvt_dev_info *rdi = ib_to_rvt(sqp->ibqp.device);
+	struct rvt_qp *qp;
+	struct rvt_swqe *wqe;
+	struct rvt_sge *sge;
+	unsigned long flags;
+	struct ib_wc wc;
+	u64 sdata;
+	atomic64_t *maddr;
+	enum ib_wc_status send_status;
+	bool release;
+	int ret;
+	bool copy_last = false;
+	int local_ops = 0;
+
+	rcu_read_lock();
+	rvp = rdi->ports[sqp->port_num - 1];
+
+	/*
+	 * Note that we check the responder QP state after
+	 * checking the requester's state.
+	 */
+
+	qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), rvp,
+			    sqp->remote_qpn);
+
+	spin_lock_irqsave(&sqp->s_lock, flags);
+
+	/* Return if we are already busy processing a work request. */
+	if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) ||
+	    !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND))
+		goto unlock;
+
+	sqp->s_flags |= RVT_S_BUSY;
+
+again:
+	if (sqp->s_last == READ_ONCE(sqp->s_head))
+		goto clr_busy;
+	wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
+
+	/* Return if it is not OK to start a new work request. */
+	if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
+		if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND))
+			goto clr_busy;
+		/* We are in the error state, flush the work request. */
+		send_status = IB_WC_WR_FLUSH_ERR;
+		goto flush_send;
+	}
+
+	/*
+	 * We can rely on the entry not changing without the s_lock
+	 * being held until we update s_last.
+	 * We increment s_cur to indicate s_last is in progress.
+	 */
+	if (sqp->s_last == sqp->s_cur) {
+		if (++sqp->s_cur >= sqp->s_size)
+			sqp->s_cur = 0;
+	}
+	spin_unlock_irqrestore(&sqp->s_lock, flags);
+
+	if (!qp) {
+		send_status = loopback_qp_drop(rvp, sqp);
+		goto serr_no_r_lock;
+	}
+	spin_lock_irqsave(&qp->r_lock, flags);
+	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
+	    qp->ibqp.qp_type != sqp->ibqp.qp_type) {
+		send_status = loopback_qp_drop(rvp, sqp);
+		goto serr;
+	}
+
+	memset(&wc, 0, sizeof(wc));
+	send_status = IB_WC_SUCCESS;
+
+	release = true;
+	sqp->s_sge.sge = wqe->sg_list[0];
+	sqp->s_sge.sg_list = wqe->sg_list + 1;
+	sqp->s_sge.num_sge = wqe->wr.num_sge;
+	sqp->s_len = wqe->length;
+	switch (wqe->wr.opcode) {
+	case IB_WR_REG_MR:
+		goto send_comp;
+
+	case IB_WR_LOCAL_INV:
+		if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
+			if (rvt_invalidate_rkey(sqp,
+						wqe->wr.ex.invalidate_rkey))
+				send_status = IB_WC_LOC_PROT_ERR;
+			local_ops = 1;
+		}
+		goto send_comp;
+
+	case IB_WR_SEND_WITH_INV:
+	case IB_WR_SEND_WITH_IMM:
+	case IB_WR_SEND:
+		ret = rvt_get_rwqe(qp, false);
+		if (ret < 0)
+			goto op_err;
+		if (!ret)
+			goto rnr_nak;
+		if (wqe->length > qp->r_len)
+			goto inv_err;
+		switch (wqe->wr.opcode) {
+		case IB_WR_SEND_WITH_INV:
+			if (!rvt_invalidate_rkey(qp,
+						 wqe->wr.ex.invalidate_rkey)) {
+				wc.wc_flags = IB_WC_WITH_INVALIDATE;
+				wc.ex.invalidate_rkey =
+					wqe->wr.ex.invalidate_rkey;
+			}
+			break;
+		case IB_WR_SEND_WITH_IMM:
+			wc.wc_flags = IB_WC_WITH_IMM;
+			wc.ex.imm_data = wqe->wr.ex.imm_data;
+			break;
+		default:
+			break;
+		}
+		break;
+
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+			goto inv_err;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		wc.ex.imm_data = wqe->wr.ex.imm_data;
+		ret = rvt_get_rwqe(qp, true);
+		if (ret < 0)
+			goto op_err;
+		if (!ret)
+			goto rnr_nak;
+		/* skip copy_last set and qp_access_flags recheck */
+		goto do_write;
+	case IB_WR_RDMA_WRITE:
+		copy_last = rvt_is_user_qp(qp);
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+			goto inv_err;
+do_write:
+		if (wqe->length == 0)
+			break;
+		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
+					  wqe->rdma_wr.remote_addr,
+					  wqe->rdma_wr.rkey,
+					  IB_ACCESS_REMOTE_WRITE)))
+			goto acc_err;
+		qp->r_sge.sg_list = NULL;
+		qp->r_sge.num_sge = 1;
+		qp->r_sge.total_len = wqe->length;
+		break;
+
+	case IB_WR_RDMA_READ:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+			goto inv_err;
+		if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
+					  wqe->rdma_wr.remote_addr,
+					  wqe->rdma_wr.rkey,
+					  IB_ACCESS_REMOTE_READ)))
+			goto acc_err;
+		release = false;
+		sqp->s_sge.sg_list = NULL;
+		sqp->s_sge.num_sge = 1;
+		qp->r_sge.sge = wqe->sg_list[0];
+		qp->r_sge.sg_list = wqe->sg_list + 1;
+		qp->r_sge.num_sge = wqe->wr.num_sge;
+		qp->r_sge.total_len = wqe->length;
+		break;
+
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+			goto inv_err;
+		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
+					  wqe->atomic_wr.remote_addr,
+					  wqe->atomic_wr.rkey,
+					  IB_ACCESS_REMOTE_ATOMIC)))
+			goto acc_err;
+		/* Perform atomic OP and save result. */
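+		/*
+		 * FETCH_AND_ADD must hand back the value the target held
+		 * *before* the add; atomic64_add_return() returns the value
+		 * after it, hence the "- sdata" correction below.  cmpxchg()
+		 * already returns the old value.
+		 */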
+		maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
+		sdata = wqe->atomic_wr.compare_add;
+		*(u64 *)sqp->s_sge.sge.vaddr =
+			(wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
+			(u64)atomic64_add_return(sdata, maddr) - sdata :
+			(u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
+				      sdata, wqe->atomic_wr.swap);
+		rvt_put_mr(qp->r_sge.sge.mr);
+		qp->r_sge.num_sge = 0;
+		goto send_comp;
+
+	default:
+		send_status = IB_WC_LOC_QP_OP_ERR;
+		goto serr;
+	}
+
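+	/*
+	 * Copy the payload one contiguous SGE chunk at a time;
+	 * rvt_copy_sge() advances the receiver's SGE state and
+	 * rvt_update_sge() advances the sender's by the same length.
+	 */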
+	sge = &sqp->s_sge.sge;
+	while (sqp->s_len) {
+		u32 len = rvt_get_sge_length(sge, sqp->s_len);
+
+		WARN_ON_ONCE(len == 0);
+		rvt_copy_sge(qp, &qp->r_sge, sge->vaddr,
+			     len, release, copy_last);
+		rvt_update_sge(&sqp->s_sge, len, !release);
+		sqp->s_len -= len;
+	}
+	if (release)
+		rvt_put_ss(&qp->r_sge);
+
+	if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
+		goto send_comp;
+
+	if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+	else
+		wc.opcode = IB_WC_RECV;
+	wc.wr_id = qp->r_wr_id;
+	wc.status = IB_WC_SUCCESS;
+	wc.byte_len = wqe->length;
+	wc.qp = &qp->ibqp;
+	wc.src_qp = qp->remote_qpn;
+	wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX;
+	wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
+	wc.port_num = 1;
+	/* Signal completion event if the solicited bit is set. */
+	rvt_recv_cq(qp, &wc, wqe->wr.send_flags & IB_SEND_SOLICITED);
+
+send_comp:
+	spin_unlock_irqrestore(&qp->r_lock, flags);
+	spin_lock_irqsave(&sqp->s_lock, flags);
+	rvp->n_loop_pkts++;
+flush_send:
+	sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
+	rvt_send_complete(sqp, wqe, send_status);
+	if (local_ops) {
+		atomic_dec(&sqp->local_ops_pending);
+		local_ops = 0;
+	}
+	goto again;
+
+rnr_nak:
+	/* Handle RNR NAK */
+	if (qp->ibqp.qp_type == IB_QPT_UC)
+		goto send_comp;
+	rvp->n_rnr_naks++;
+	/*
+	 * Note: we don't need the s_lock held since the BUSY flag
+	 * makes this single threaded.
+	 */
+	if (sqp->s_rnr_retry == 0) {
+		send_status = IB_WC_RNR_RETRY_EXC_ERR;
+		goto serr;
+	}
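+	/* An RNR retry count of 7 means "retry forever"; don't decrement it. */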
+	if (sqp->s_rnr_retry_cnt < 7)
+		sqp->s_rnr_retry--;
+	spin_unlock_irqrestore(&qp->r_lock, flags);
+	spin_lock_irqsave(&sqp->s_lock, flags);
+	if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
+		goto clr_busy;
+	rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer <<
+				IB_AETH_CREDIT_SHIFT);
+	goto clr_busy;
+
+op_err:
+	send_status = IB_WC_REM_OP_ERR;
+	wc.status = IB_WC_LOC_QP_OP_ERR;
+	goto err;
+
+inv_err:
+	send_status =
+		sqp->ibqp.qp_type == IB_QPT_RC ?
+			IB_WC_REM_INV_REQ_ERR :
+			IB_WC_SUCCESS;
+	wc.status = IB_WC_LOC_QP_OP_ERR;
+	goto err;
+
+acc_err:
+	send_status = IB_WC_REM_ACCESS_ERR;
+	wc.status = IB_WC_LOC_PROT_ERR;
+err:
+	/* responder goes to error state */
+	rvt_rc_error(qp, wc.status);
+
+serr:
+	spin_unlock_irqrestore(&qp->r_lock, flags);
+serr_no_r_lock:
+	spin_lock_irqsave(&sqp->s_lock, flags);
+	rvt_send_complete(sqp, wqe, send_status);
+	if (sqp->ibqp.qp_type == IB_QPT_RC) {
+		int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
+
+		sqp->s_flags &= ~RVT_S_BUSY;
+		spin_unlock_irqrestore(&sqp->s_lock, flags);
+		if (lastwqe) {
+			struct ib_event ev;
+
+			ev.device = sqp->ibqp.device;
+			ev.element.qp = &sqp->ibqp;
+			ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+			sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
+		}
+		goto done;
+	}
+clr_busy:
+	sqp->s_flags &= ~RVT_S_BUSY;
+unlock:
+	spin_unlock_irqrestore(&sqp->s_lock, flags);
+done:
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(rvt_ruc_loopback);
diff --git a/drivers/infiniband/sw/rdmavt/qp.h b/drivers/infiniband/sw/rdmavt/qp.h
index 264811f..2cdba12 100644
--- a/drivers/infiniband/sw/rdmavt/qp.h
+++ b/drivers/infiniband/sw/rdmavt/qp.h
@@ -48,7 +48,7 @@
  *
  */
 
-#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
 
 int rvt_driver_qp_init(struct rvt_dev_info *rdi);
 void rvt_qp_exit(struct rvt_dev_info *rdi);
@@ -57,7 +57,7 @@
 			    struct ib_udata *udata);
 int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		  int attr_mask, struct ib_udata *udata);
-int rvt_destroy_qp(struct ib_qp *ibqp);
+int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
 int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		 int attr_mask, struct ib_qp_init_attr *init_attr);
 int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
@@ -66,4 +66,8 @@
 		  const struct ib_send_wr **bad_wr);
 int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
 		      const struct ib_recv_wr **bad_wr);
+int rvt_wss_init(struct rvt_dev_info *rdi);
+void rvt_wss_exit(struct rvt_dev_info *rdi);
+int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node,
+		 struct ib_udata *udata);
 #endif          /* DEF_RVTQP_H */
diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c
index 6131cc5..890d7b7 100644
--- a/drivers/infiniband/sw/rdmavt/rc.c
+++ b/drivers/infiniband/sw/rdmavt/rc.c
@@ -45,7 +45,7 @@
  *
  */
 
-#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
 #include <rdma/ib_hdrs.h>
 
 /*
@@ -104,26 +104,33 @@
 	} else {
 		u32 min, max, x;
 		u32 credits;
-		struct rvt_rwq *wq = qp->r_rq.wq;
 		u32 head;
 		u32 tail;
 
-		/* sanity check pointers before trusting them */
-		head = wq->head;
-		if (head >= qp->r_rq.size)
-			head = 0;
-		tail = wq->tail;
-		if (tail >= qp->r_rq.size)
-			tail = 0;
-		/*
-		 * Compute the number of credits available (RWQEs).
-		 * There is a small chance that the pair of reads are
-		 * not atomic, which is OK, since the fuzziness is
-		 * resolved as further ACKs go out.
-		 */
-		credits = head - tail;
-		if ((int)credits < 0)
-			credits += qp->r_rq.size;
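+		/*
+		 * The kernel receive queue keeps an explicit RWQE count;
+		 * fall back to the head/tail distance only when it is zero.
+		 */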
+		credits = READ_ONCE(qp->r_rq.kwq->count);
+		if (credits == 0) {
+			/* sanity check pointers before trusting them */
+			if (qp->ip) {
+				head = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->head);
+				tail = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->tail);
+			} else {
+				head = READ_ONCE(qp->r_rq.kwq->head);
+				tail = READ_ONCE(qp->r_rq.kwq->tail);
+			}
+			if (head >= qp->r_rq.size)
+				head = 0;
+			if (tail >= qp->r_rq.size)
+				tail = 0;
+			/*
+			 * Compute the number of credits available (RWQEs).
+			 * There is a small chance that the pair of reads are
+			 * not atomic, which is OK, since the fuzziness is
+			 * resolved as further ACKs go out.
+			 */
+			credits = head - tail;
+			if ((int)credits < 0)
+				credits += qp->r_rq.size;
+		}
 		/*
 		 * Binary search the credit table to find the code to
 		 * use.
@@ -187,3 +194,16 @@
 	}
 }
 EXPORT_SYMBOL(rvt_get_credit);
+
+/**
+ * rvt_restart_sge - rewind the sge state for a wqe
+ * @ss: the sge state to rewind
+ * @wqe: the wqe whose sg_list is replayed
+ * @len: the number of bytes already sent, from the start of the wqe
+ *
+ * Return: the remaining data length of the wqe
+ */
+u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len)
+{
+	ss->sge = wqe->sg_list[0];
+	ss->sg_list = wqe->sg_list + 1;
+	ss->num_sge = wqe->wr.num_sge;
+	ss->total_len = wqe->length;
+	rvt_skip_sge(ss, len, false);
+	return wqe->length - len;
+}
+EXPORT_SYMBOL(rvt_restart_sge);
+
diff --git a/drivers/infiniband/sw/rdmavt/srq.c b/drivers/infiniband/sw/rdmavt/srq.c
index 78e06fc..24fef02 100644
--- a/drivers/infiniband/sw/rdmavt/srq.c
+++ b/drivers/infiniband/sw/rdmavt/srq.c
@@ -48,10 +48,11 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "srq.h"
 #include "vt.h"
-
+#include "qp.h"
 /**
  * rvt_driver_srq_init - init srq resources on a per driver basis
  * @rdi: rvt dev structure
@@ -70,29 +71,24 @@
  * @srq_init_attr: the attributes of the SRQ
  * @udata: data from libibverbs when creating a user SRQ
  *
- * Return: Allocated srq object
+ * Return: 0 on success
  */
-struct ib_srq *rvt_create_srq(struct ib_pd *ibpd,
-			      struct ib_srq_init_attr *srq_init_attr,
-			      struct ib_udata *udata)
+int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr,
+		   struct ib_udata *udata)
 {
-	struct rvt_dev_info *dev = ib_to_rvt(ibpd->device);
-	struct rvt_srq *srq;
+	struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device);
+	struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
 	u32 sz;
-	struct ib_srq *ret;
+	int ret;
 
 	if (srq_init_attr->srq_type != IB_SRQT_BASIC)
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 
 	if (srq_init_attr->attr.max_sge == 0 ||
 	    srq_init_attr->attr.max_sge > dev->dparms.props.max_srq_sge ||
 	    srq_init_attr->attr.max_wr == 0 ||
 	    srq_init_attr->attr.max_wr > dev->dparms.props.max_srq_wr)
-		return ERR_PTR(-EINVAL);
-
-	srq = kzalloc_node(sizeof(*srq), GFP_KERNEL, dev->dparms.node);
-	if (!srq)
-		return ERR_PTR(-ENOMEM);
+		return -EINVAL;
 
 	/*
 	 * Need to use vmalloc() if we want to support large #s of entries.
@@ -101,12 +97,9 @@
 	srq->rq.max_sge = srq_init_attr->attr.max_sge;
 	sz = sizeof(struct ib_sge) * srq->rq.max_sge +
 		sizeof(struct rvt_rwqe);
-	srq->rq.wq = udata ?
-		vmalloc_user(sizeof(struct rvt_rwq) + srq->rq.size * sz) :
-		vzalloc_node(sizeof(struct rvt_rwq) + srq->rq.size * sz,
-			     dev->dparms.node);
-	if (!srq->rq.wq) {
-		ret = ERR_PTR(-ENOMEM);
+	if (rvt_alloc_rq(&srq->rq, srq->rq.size * sz,
+			 dev->dparms.node, udata)) {
+		ret = -ENOMEM;
 		goto bail_srq;
 	}
 
@@ -115,23 +108,18 @@
 	 * See rvt_mmap() for details.
 	 */
 	if (udata && udata->outlen >= sizeof(__u64)) {
-		int err;
 		u32 s = sizeof(struct rvt_rwq) + srq->rq.size * sz;
 
-		srq->ip =
-		    rvt_create_mmap_info(dev, s, ibpd->uobject->context,
-					 srq->rq.wq);
+		srq->ip = rvt_create_mmap_info(dev, s, udata, srq->rq.wq);
 		if (!srq->ip) {
-			ret = ERR_PTR(-ENOMEM);
+			ret = -ENOMEM;
 			goto bail_wq;
 		}
 
-		err = ib_copy_to_udata(udata, &srq->ip->offset,
+		ret = ib_copy_to_udata(udata, &srq->ip->offset,
 				       sizeof(srq->ip->offset));
-		if (err) {
-			ret = ERR_PTR(err);
+		if (ret)
 			goto bail_ip;
-		}
 	}
 
 	/*
@@ -143,7 +131,7 @@
 	spin_lock(&dev->n_srqs_lock);
 	if (dev->n_srqs_allocated == dev->dparms.props.max_srq) {
 		spin_unlock(&dev->n_srqs_lock);
-		ret = ERR_PTR(-ENOMEM);
+		ret = -ENOMEM;
 		goto bail_ip;
 	}
 
@@ -156,14 +144,13 @@
 		spin_unlock_irq(&dev->pending_lock);
 	}
 
-	return &srq->ibsrq;
+	return 0;
 
 bail_ip:
 	kfree(srq->ip);
 bail_wq:
-	vfree(srq->rq.wq);
+	rvt_free_rq(&srq->rq);
 bail_srq:
-	kfree(srq);
 	return ret;
 }
 
@@ -182,11 +169,12 @@
 {
 	struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
 	struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device);
-	struct rvt_rwq *wq;
+	struct rvt_rq tmp_rq = {};
 	int ret = 0;
 
 	if (attr_mask & IB_SRQ_MAX_WR) {
-		struct rvt_rwq *owq;
+		struct rvt_krwq *okwq = NULL;
+		struct rvt_rwq *owq = NULL;
 		struct rvt_rwqe *p;
 		u32 sz, size, n, head, tail;
 
@@ -195,17 +183,12 @@
 		    ((attr_mask & IB_SRQ_LIMIT) ?
 		     attr->srq_limit : srq->limit) > attr->max_wr)
 			return -EINVAL;
-
 		sz = sizeof(struct rvt_rwqe) +
 			srq->rq.max_sge * sizeof(struct ib_sge);
 		size = attr->max_wr + 1;
-		wq = udata ?
-			vmalloc_user(sizeof(struct rvt_rwq) + size * sz) :
-			vzalloc_node(sizeof(struct rvt_rwq) + size * sz,
-				     dev->dparms.node);
-		if (!wq)
+		if (rvt_alloc_rq(&tmp_rq, size * sz, dev->dparms.node,
+				 udata))
 			return -ENOMEM;
-
 		/* Check that we can write the offset to mmap. */
 		if (udata && udata->inlen >= sizeof(__u64)) {
 			__u64 offset_addr;
@@ -223,14 +206,20 @@
 				goto bail_free;
 		}
 
-		spin_lock_irq(&srq->rq.lock);
+		spin_lock_irq(&srq->rq.kwq->c_lock);
 		/*
 		 * validate head and tail pointer values and compute
 		 * the number of remaining WQEs.
 		 */
-		owq = srq->rq.wq;
-		head = owq->head;
-		tail = owq->tail;
+		if (udata) {
+			owq = srq->rq.wq;
+			head = RDMA_READ_UAPI_ATOMIC(owq->head);
+			tail = RDMA_READ_UAPI_ATOMIC(owq->tail);
+		} else {
+			okwq = srq->rq.kwq;
+			head = okwq->head;
+			tail = okwq->tail;
+		}
 		if (head >= srq->rq.size || tail >= srq->rq.size) {
 			ret = -EINVAL;
 			goto bail_unlock;
@@ -245,7 +234,7 @@
 			goto bail_unlock;
 		}
 		n = 0;
-		p = wq->wq;
+		p = tmp_rq.kwq->curr_wq;
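+		/* Replay the outstanding WQEs from the old ring into the
+		 * new one, counting them in 'n' for the new head index.
+		 */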
 		while (tail != head) {
 			struct rvt_rwqe *wqe;
 			int i;
@@ -260,22 +249,29 @@
 			if (++tail >= srq->rq.size)
 				tail = 0;
 		}
-		srq->rq.wq = wq;
+		srq->rq.kwq = tmp_rq.kwq;
+		if (udata) {
+			srq->rq.wq = tmp_rq.wq;
+			RDMA_WRITE_UAPI_ATOMIC(tmp_rq.wq->head, n);
+			RDMA_WRITE_UAPI_ATOMIC(tmp_rq.wq->tail, 0);
+		} else {
+			tmp_rq.kwq->head = n;
+			tmp_rq.kwq->tail = 0;
+		}
 		srq->rq.size = size;
-		wq->head = n;
-		wq->tail = 0;
 		if (attr_mask & IB_SRQ_LIMIT)
 			srq->limit = attr->srq_limit;
-		spin_unlock_irq(&srq->rq.lock);
+		spin_unlock_irq(&srq->rq.kwq->c_lock);
 
 		vfree(owq);
+		kvfree(okwq);
 
 		if (srq->ip) {
 			struct rvt_mmap_info *ip = srq->ip;
 			struct rvt_dev_info *dev = ib_to_rvt(srq->ibsrq.device);
 			u32 s = sizeof(struct rvt_rwq) + size * sz;
 
-			rvt_update_mmap_info(dev, ip, s, wq);
+			rvt_update_mmap_info(dev, ip, s, tmp_rq.wq);
 
 			/*
 			 * Return the offset to mmap.
@@ -299,19 +295,19 @@
 			spin_unlock_irq(&dev->pending_lock);
 		}
 	} else if (attr_mask & IB_SRQ_LIMIT) {
-		spin_lock_irq(&srq->rq.lock);
+		spin_lock_irq(&srq->rq.kwq->c_lock);
 		if (attr->srq_limit >= srq->rq.size)
 			ret = -EINVAL;
 		else
 			srq->limit = attr->srq_limit;
-		spin_unlock_irq(&srq->rq.lock);
+		spin_unlock_irq(&srq->rq.kwq->c_lock);
 	}
 	return ret;
 
 bail_unlock:
-	spin_unlock_irq(&srq->rq.lock);
+	spin_unlock_irq(&srq->rq.kwq->c_lock);
 bail_free:
-	vfree(wq);
+	rvt_free_rq(&tmp_rq);
 	return ret;
 }
 
@@ -335,9 +331,8 @@
  * rvt_destroy_srq - destroy an srq
  * @ibsrq: srq object to destroy
  *
- * Return always 0
  */
-int rvt_destroy_srq(struct ib_srq *ibsrq)
+void rvt_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 {
 	struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
 	struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device);
@@ -347,9 +342,5 @@
 	spin_unlock(&dev->n_srqs_lock);
 	if (srq->ip)
 		kref_put(&srq->ip->ref, rvt_release_mmap_info);
-	else
-		vfree(srq->rq.wq);
-	kfree(srq);
-
-	return 0;
+	kvfree(srq->rq.kwq);
 }
diff --git a/drivers/infiniband/sw/rdmavt/srq.h b/drivers/infiniband/sw/rdmavt/srq.h
index bf0eaaf..6427d7d 100644
--- a/drivers/infiniband/sw/rdmavt/srq.h
+++ b/drivers/infiniband/sw/rdmavt/srq.h
@@ -50,13 +50,12 @@
 
 #include <rdma/rdma_vt.h>
 void rvt_driver_srq_init(struct rvt_dev_info *rdi);
-struct ib_srq *rvt_create_srq(struct ib_pd *ibpd,
-			      struct ib_srq_init_attr *srq_init_attr,
-			      struct ib_udata *udata);
+int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr,
+		   struct ib_udata *udata);
 int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 		   enum ib_srq_attr_mask attr_mask,
 		   struct ib_udata *udata);
 int rvt_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
-int rvt_destroy_srq(struct ib_srq *ibsrq);
+void rvt_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata);
 
 #endif          /* DEF_RVTSRQ_H */
diff --git a/drivers/infiniband/sw/rdmavt/trace_cq.h b/drivers/infiniband/sw/rdmavt/trace_cq.h
index df8e1ad..e3c416c 100644
--- a/drivers/infiniband/sw/rdmavt/trace_cq.h
+++ b/drivers/infiniband/sw/rdmavt/trace_cq.h
@@ -105,7 +105,7 @@
 	     TP_ARGS(cq, attr));
 
 #define CQ_PRN \
-"[%s] idx %u wr_id %llx status %u opcode %u,%s length %u qpn %x"
+"[%s] idx %u wr_id %llx status %u opcode %u,%s length %u qpn %x flags %x imm %x"
 
 DECLARE_EVENT_CLASS(
 	rvt_cq_entry_template,
@@ -119,6 +119,8 @@
 		__field(u32, qpn)
 		__field(u32, length)
 		__field(u32, idx)
+		__field(u32, flags)
+		__field(u32, imm)
 	),
 	TP_fast_assign(
 		RDI_DEV_ASSIGN(cq->rdi)
@@ -128,6 +130,8 @@
 		__entry->length = wc->byte_len;
 		__entry->qpn = wc->qp->qp_num;
 		__entry->idx = idx;
+		__entry->flags = wc->wc_flags;
+		__entry->imm = be32_to_cpu(wc->ex.imm_data);
 	),
 	TP_printk(
 		CQ_PRN,
@@ -137,7 +141,9 @@
 		__entry->status,
 		__entry->opcode, show_wc_opcode(__entry->opcode),
 		__entry->length,
-		__entry->qpn
+		__entry->qpn,
+		__entry->flags,
+		__entry->imm
 	)
 );
 
diff --git a/drivers/infiniband/sw/rdmavt/trace_mr.h b/drivers/infiniband/sw/rdmavt/trace_mr.h
index 976e482..95b8a0e 100644
--- a/drivers/infiniband/sw/rdmavt/trace_mr.h
+++ b/drivers/infiniband/sw/rdmavt/trace_mr.h
@@ -54,6 +54,8 @@
 #include <rdma/rdma_vt.h>
 #include <rdma/rdmavt_mr.h>
 
+#include "mr.h"
+
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM rvt_mr
 DECLARE_EVENT_CLASS(
@@ -64,8 +66,12 @@
 		RDI_DEV_ENTRY(ib_to_rvt(mr->pd->device))
 		__field(void *, vaddr)
 		__field(struct page *, page)
+		__field(u64, iova)
+		__field(u64, user_base)
 		__field(size_t, len)
+		__field(size_t, length)
 		__field(u32, lkey)
+		__field(u32, offset)
 		__field(u16, m)
 		__field(u16, n)
 	),
@@ -73,18 +79,28 @@
 		RDI_DEV_ASSIGN(ib_to_rvt(mr->pd->device));
 		__entry->vaddr = v;
 		__entry->page = virt_to_page(v);
+		__entry->iova = mr->iova;
+		__entry->user_base = mr->user_base;
+		__entry->lkey = mr->lkey;
 		__entry->m = m;
 		__entry->n = n;
 		__entry->len = len;
+		__entry->length = mr->length;
+		__entry->offset = mr->offset;
 	),
 	TP_printk(
-		"[%s] vaddr %p page %p m %u n %u len %ld",
+		"[%s] lkey %x iova %llx user_base %llx mr_len %lu vaddr %llx page %p m %u n %u len %lu off %u",
 		__get_str(dev),
-		__entry->vaddr,
+		__entry->lkey,
+		__entry->iova,
+		__entry->user_base,
+		__entry->length,
+		(unsigned long long)__entry->vaddr,
 		__entry->page,
 		__entry->m,
 		__entry->n,
-		__entry->len
+		__entry->len,
+		__entry->offset
 	)
 );
 
@@ -165,6 +181,40 @@
 	TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge),
 	TP_ARGS(sge, isge));
 
+TRACE_EVENT(
+	rvt_map_mr_sg,
+	TP_PROTO(struct ib_mr *ibmr, int sg_nents, unsigned int *sg_offset),
+	TP_ARGS(ibmr, sg_nents, sg_offset),
+	TP_STRUCT__entry(
+		RDI_DEV_ENTRY(ib_to_rvt(to_imr(ibmr)->mr.pd->device))
+		__field(u64, iova)
+		__field(u64, ibmr_iova)
+		__field(u64, user_base)
+		__field(u64, ibmr_length)
+		__field(int, sg_nents)
+		__field(uint, sg_offset)
+	),
+	TP_fast_assign(
+		RDI_DEV_ASSIGN(ib_to_rvt(to_imr(ibmr)->mr.pd->device))
+		__entry->ibmr_iova = ibmr->iova;
+		__entry->iova = to_imr(ibmr)->mr.iova;
+		__entry->user_base = to_imr(ibmr)->mr.user_base;
+		__entry->ibmr_length = to_imr(ibmr)->mr.length;
+		__entry->sg_nents = sg_nents;
+		__entry->sg_offset = sg_offset ? *sg_offset : 0;
+	),
+	TP_printk(
+		"[%s] ibmr_iova %llx iova %llx user_base %llx length %llx sg_nents %d sg_offset %u",
+		__get_str(dev),
+		__entry->ibmr_iova,
+		__entry->iova,
+		__entry->user_base,
+		__entry->ibmr_length,
+		__entry->sg_nents,
+		__entry->sg_offset
+	)
+);
+
 #endif /* __RVT_TRACE_MR_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/drivers/infiniband/sw/rdmavt/trace_qp.h b/drivers/infiniband/sw/rdmavt/trace_qp.h
index efc9d81..c32d21c 100644
--- a/drivers/infiniband/sw/rdmavt/trace_qp.h
+++ b/drivers/infiniband/sw/rdmavt/trace_qp.h
@@ -51,7 +51,7 @@
 #include <linux/trace_seq.h>
 
 #include <rdma/ib_verbs.h>
-#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM rvt_qp
diff --git a/drivers/infiniband/sw/rdmavt/trace_rc.h b/drivers/infiniband/sw/rdmavt/trace_rc.h
index 9952769..c47357a 100644
--- a/drivers/infiniband/sw/rdmavt/trace_rc.h
+++ b/drivers/infiniband/sw/rdmavt/trace_rc.h
@@ -51,7 +51,7 @@
 #include <linux/trace_seq.h>
 
 #include <rdma/ib_verbs.h>
-#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM rvt_rc
diff --git a/drivers/infiniband/sw/rdmavt/trace_tx.h b/drivers/infiniband/sw/rdmavt/trace_tx.h
index 0ef25fc..d963ca7 100644
--- a/drivers/infiniband/sw/rdmavt/trace_tx.h
+++ b/drivers/infiniband/sw/rdmavt/trace_tx.h
@@ -51,7 +51,7 @@
 #include <linux/trace_seq.h>
 
 #include <rdma/ib_verbs.h>
-#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM rvt_tx
@@ -153,6 +153,48 @@
 	)
 );
 
+TRACE_EVENT(
+	rvt_qp_send_completion,
+	TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe, u32 idx),
+	TP_ARGS(qp, wqe, idx),
+	TP_STRUCT__entry(
+		RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device))
+		__field(struct rvt_swqe *, wqe)
+		__field(u64, wr_id)
+		__field(u32, qpn)
+		__field(u32, qpt)
+		__field(u32, length)
+		__field(u32, idx)
+		__field(u32, ssn)
+		__field(enum ib_wr_opcode, opcode)
+		__field(int, send_flags)
+	),
+	TP_fast_assign(
+		RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device))
+		__entry->wqe = wqe;
+		__entry->wr_id = wqe->wr.wr_id;
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->qpt = qp->ibqp.qp_type;
+		__entry->length = wqe->length;
+		__entry->idx = idx;
+		__entry->ssn = wqe->ssn;
+		__entry->opcode = wqe->wr.opcode;
+		__entry->send_flags = wqe->wr.send_flags;
+	),
+	TP_printk(
+		"[%s] qpn 0x%x qpt %u wqe %p idx %u wr_id %llx length %u ssn %u opcode %x send_flags %x",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->qpt,
+		__entry->wqe,
+		__entry->idx,
+		__entry->wr_id,
+		__entry->length,
+		__entry->ssn,
+		__entry->opcode,
+		__entry->send_flags
+	)
+);
 #endif /* __RVT_TRACE_TX_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c
index 17e4abc..18da1e1 100644
--- a/drivers/infiniband/sw/rdmavt/vt.c
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@@ -91,7 +91,7 @@
 {
 	struct rvt_dev_info *rdi;
 
-	rdi = (struct rvt_dev_info *)ib_alloc_device(size);
+	rdi = container_of(_ib_alloc_device(size), struct rvt_dev_info, ibdev);
 	if (!rdi)
 		return rdi;
 
@@ -284,10 +284,6 @@
 					 &gid->global.interface_id);
 }
 
-struct rvt_ucontext {
-	struct ib_ucontext ibucontext;
-};
-
 static inline struct rvt_ucontext *to_iucontext(struct ib_ucontext
 						*ibucontext)
 {
@@ -296,28 +292,21 @@
 
 /**
  * rvt_alloc_ucontext - Allocate a user context
- * @ibdev: Verbs IB dev
+ * @uctx: Verbs context
  * @udata: User data allocated
  */
-static struct ib_ucontext *rvt_alloc_ucontext(struct ib_device *ibdev,
-					      struct ib_udata *udata)
+static int rvt_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 {
-	struct rvt_ucontext *context;
-
-	context = kmalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
-		return ERR_PTR(-ENOMEM);
-	return &context->ibucontext;
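+	/* Nothing to do: the core allocates struct rvt_ucontext itself. */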
+	return 0;
 }
 
 /**
- *rvt_dealloc_ucontext - Free a user context
- *@context - Free this
+ * rvt_dealloc_ucontext - Free a user context
+ * @context - Free this
  */
-static int rvt_dealloc_ucontext(struct ib_ucontext *context)
+static void rvt_dealloc_ucontext(struct ib_ucontext *context)
 {
-	kfree(to_iucontext(context));
-	return 0;
+	return;
 }
 
 static int rvt_get_port_immutable(struct ib_device *ibdev, u8 port_num,
@@ -392,16 +381,59 @@
 	_VERB_IDX_MAX /* Must always be last! */
 };
 
-static inline int check_driver_override(struct rvt_dev_info *rdi,
-					size_t offset, void *func)
-{
-	if (!*(void **)((void *)&rdi->ibdev + offset)) {
-		*(void **)((void *)&rdi->ibdev + offset) = func;
-		return 0;
-	}
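+/*
+ * rdmavt's default verbs table.  ib_set_device_ops() only fills in ops
+ * the driver has not already set, so this table preserves the driver
+ * override behavior that check_driver_override() used to open-code.
+ */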
+static const struct ib_device_ops rvt_dev_ops = {
+	.uverbs_abi_ver = RVT_UVERBS_ABI_VERSION,
 
-	return 1;
-}
+	.alloc_fmr = rvt_alloc_fmr,
+	.alloc_mr = rvt_alloc_mr,
+	.alloc_pd = rvt_alloc_pd,
+	.alloc_ucontext = rvt_alloc_ucontext,
+	.attach_mcast = rvt_attach_mcast,
+	.create_ah = rvt_create_ah,
+	.create_cq = rvt_create_cq,
+	.create_qp = rvt_create_qp,
+	.create_srq = rvt_create_srq,
+	.dealloc_fmr = rvt_dealloc_fmr,
+	.dealloc_pd = rvt_dealloc_pd,
+	.dealloc_ucontext = rvt_dealloc_ucontext,
+	.dereg_mr = rvt_dereg_mr,
+	.destroy_ah = rvt_destroy_ah,
+	.destroy_cq = rvt_destroy_cq,
+	.destroy_qp = rvt_destroy_qp,
+	.destroy_srq = rvt_destroy_srq,
+	.detach_mcast = rvt_detach_mcast,
+	.get_dma_mr = rvt_get_dma_mr,
+	.get_port_immutable = rvt_get_port_immutable,
+	.map_mr_sg = rvt_map_mr_sg,
+	.map_phys_fmr = rvt_map_phys_fmr,
+	.mmap = rvt_mmap,
+	.modify_ah = rvt_modify_ah,
+	.modify_device = rvt_modify_device,
+	.modify_port = rvt_modify_port,
+	.modify_qp = rvt_modify_qp,
+	.modify_srq = rvt_modify_srq,
+	.poll_cq = rvt_poll_cq,
+	.post_recv = rvt_post_recv,
+	.post_send = rvt_post_send,
+	.post_srq_recv = rvt_post_srq_recv,
+	.query_ah = rvt_query_ah,
+	.query_device = rvt_query_device,
+	.query_gid = rvt_query_gid,
+	.query_pkey = rvt_query_pkey,
+	.query_port = rvt_query_port,
+	.query_qp = rvt_query_qp,
+	.query_srq = rvt_query_srq,
+	.reg_user_mr = rvt_reg_user_mr,
+	.req_notify_cq = rvt_req_notify_cq,
+	.resize_cq = rvt_resize_cq,
+	.unmap_fmr = rvt_unmap_fmr,
+
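+	/*
+	 * Tell the core the size of each containing rdmavt structure so
+	 * it can allocate (and free) these objects itself.
+	 */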
+	INIT_RDMA_OBJ_SIZE(ib_ah, rvt_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, rvt_cq, ibcq),
+	INIT_RDMA_OBJ_SIZE(ib_pd, rvt_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_srq, rvt_srq, ibsrq),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, rvt_ucontext, ibucontext),
+};
 
 static noinline int check_support(struct rvt_dev_info *rdi, int verb)
 {
@@ -411,81 +443,41 @@
 		 * These functions are not part of verbs specifically but are
 		 * required for rdmavt to function.
 		 */
-		if ((!rdi->driver_f.port_callback) ||
+		if ((!rdi->ibdev.ops.init_port) ||
 		    (!rdi->driver_f.get_pci_dev))
 			return -EINVAL;
 		break;
 
-	case QUERY_DEVICE:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    query_device),
-						    rvt_query_device);
-		break;
-
 	case MODIFY_DEVICE:
 		/*
 		 * rdmavt does not support modify device currently drivers must
 		 * provide.
 		 */
-		if (!check_driver_override(rdi, offsetof(struct ib_device,
-							 modify_device),
-					   rvt_modify_device))
+		if (!rdi->ibdev.ops.modify_device)
 			return -EOPNOTSUPP;
 		break;
 
 	case QUERY_PORT:
-		if (!check_driver_override(rdi, offsetof(struct ib_device,
-							 query_port),
-					   rvt_query_port))
+		if (!rdi->ibdev.ops.query_port)
 			if (!rdi->driver_f.query_port_state)
 				return -EINVAL;
 		break;
 
 	case MODIFY_PORT:
-		if (!check_driver_override(rdi, offsetof(struct ib_device,
-							 modify_port),
-					   rvt_modify_port))
+		if (!rdi->ibdev.ops.modify_port)
 			if (!rdi->driver_f.cap_mask_chg ||
 			    !rdi->driver_f.shut_down_port)
 				return -EINVAL;
 		break;
 
-	case QUERY_PKEY:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    query_pkey),
-				      rvt_query_pkey);
-		break;
-
 	case QUERY_GID:
-		if (!check_driver_override(rdi, offsetof(struct ib_device,
-							 query_gid),
-					   rvt_query_gid))
+		if (!rdi->ibdev.ops.query_gid)
 			if (!rdi->driver_f.get_guid_be)
 				return -EINVAL;
 		break;
 
-	case ALLOC_UCONTEXT:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    alloc_ucontext),
-				      rvt_alloc_ucontext);
-		break;
-
-	case DEALLOC_UCONTEXT:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    dealloc_ucontext),
-				      rvt_dealloc_ucontext);
-		break;
-
-	case GET_PORT_IMMUTABLE:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    get_port_immutable),
-				      rvt_get_port_immutable);
-		break;
-
 	case CREATE_QP:
-		if (!check_driver_override(rdi, offsetof(struct ib_device,
-							 create_qp),
-					   rvt_create_qp))
+		if (!rdi->ibdev.ops.create_qp)
 			if (!rdi->driver_f.qp_priv_alloc ||
 			    !rdi->driver_f.qp_priv_free ||
 			    !rdi->driver_f.notify_qp_reset ||
@@ -496,9 +488,7 @@
 		break;
 
 	case MODIFY_QP:
-		if (!check_driver_override(rdi, offsetof(struct ib_device,
-							 modify_qp),
-					   rvt_modify_qp))
+		if (!rdi->ibdev.ops.modify_qp)
 			if (!rdi->driver_f.notify_qp_reset ||
 			    !rdi->driver_f.schedule_send ||
 			    !rdi->driver_f.get_pmtu_from_attr ||
@@ -512,9 +502,7 @@
 		break;
 
 	case DESTROY_QP:
-		if (!check_driver_override(rdi, offsetof(struct ib_device,
-							 destroy_qp),
-					   rvt_destroy_qp))
+		if (!rdi->ibdev.ops.destroy_qp)
 			if (!rdi->driver_f.qp_priv_free ||
 			    !rdi->driver_f.notify_qp_reset ||
 			    !rdi->driver_f.flush_qp_waiters ||
@@ -523,197 +511,14 @@
 				return -EINVAL;
 		break;
 
-	case QUERY_QP:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    query_qp),
-						    rvt_query_qp);
-		break;
-
 	case POST_SEND:
-		if (!check_driver_override(rdi, offsetof(struct ib_device,
-							 post_send),
-					   rvt_post_send))
+		if (!rdi->ibdev.ops.post_send)
 			if (!rdi->driver_f.schedule_send ||
 			    !rdi->driver_f.do_send ||
 			    !rdi->post_parms)
 				return -EINVAL;
 		break;
 
-	case POST_RECV:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    post_recv),
-				      rvt_post_recv);
-		break;
-	case POST_SRQ_RECV:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    post_srq_recv),
-				      rvt_post_srq_recv);
-		break;
-
-	case CREATE_AH:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    create_ah),
-				      rvt_create_ah);
-		break;
-
-	case DESTROY_AH:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    destroy_ah),
-				      rvt_destroy_ah);
-		break;
-
-	case MODIFY_AH:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    modify_ah),
-				      rvt_modify_ah);
-		break;
-
-	case QUERY_AH:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    query_ah),
-				      rvt_query_ah);
-		break;
-
-	case CREATE_SRQ:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    create_srq),
-				      rvt_create_srq);
-		break;
-
-	case MODIFY_SRQ:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    modify_srq),
-				      rvt_modify_srq);
-		break;
-
-	case DESTROY_SRQ:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    destroy_srq),
-				      rvt_destroy_srq);
-		break;
-
-	case QUERY_SRQ:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    query_srq),
-				      rvt_query_srq);
-		break;
-
-	case ATTACH_MCAST:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    attach_mcast),
-				      rvt_attach_mcast);
-		break;
-
-	case DETACH_MCAST:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    detach_mcast),
-				      rvt_detach_mcast);
-		break;
-
-	case GET_DMA_MR:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    get_dma_mr),
-				      rvt_get_dma_mr);
-		break;
-
-	case REG_USER_MR:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    reg_user_mr),
-				      rvt_reg_user_mr);
-		break;
-
-	case DEREG_MR:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    dereg_mr),
-				      rvt_dereg_mr);
-		break;
-
-	case ALLOC_FMR:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    alloc_fmr),
-				      rvt_alloc_fmr);
-		break;
-
-	case ALLOC_MR:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    alloc_mr),
-				      rvt_alloc_mr);
-		break;
-
-	case MAP_MR_SG:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    map_mr_sg),
-				      rvt_map_mr_sg);
-		break;
-
-	case MAP_PHYS_FMR:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    map_phys_fmr),
-				      rvt_map_phys_fmr);
-		break;
-
-	case UNMAP_FMR:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    unmap_fmr),
-				      rvt_unmap_fmr);
-		break;
-
-	case DEALLOC_FMR:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    dealloc_fmr),
-				      rvt_dealloc_fmr);
-		break;
-
-	case MMAP:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    mmap),
-				      rvt_mmap);
-		break;
-
-	case CREATE_CQ:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    create_cq),
-				      rvt_create_cq);
-		break;
-
-	case DESTROY_CQ:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    destroy_cq),
-				      rvt_destroy_cq);
-		break;
-
-	case POLL_CQ:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    poll_cq),
-				      rvt_poll_cq);
-		break;
-
-	case REQ_NOTFIY_CQ:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    req_notify_cq),
-				      rvt_req_notify_cq);
-		break;
-
-	case RESIZE_CQ:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    resize_cq),
-				      rvt_resize_cq);
-		break;
-
-	case ALLOC_PD:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    alloc_pd),
-				      rvt_alloc_pd);
-		break;
-
-	case DEALLOC_PD:
-		check_driver_override(rdi, offsetof(struct ib_device,
-						    dealloc_pd),
-				      rvt_dealloc_pd);
-		break;
-
-	default:
-		return -EINVAL;
 	}
 
 	return 0;
@@ -728,7 +533,7 @@
  *
  * Return: 0 on success otherwise an errno.
  */
-int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id)
+int rvt_register_device(struct rvt_dev_info *rdi)
 {
 	int ret = 0, i;
 
@@ -745,6 +550,7 @@
 			return -EINVAL;
 		}
 
+	ib_set_device_ops(&rdi->ibdev, &rvt_dev_ops);
 
 	/* Once we get past here we can use rvt_pr macros and tracepoints */
 	trace_rvt_dbg(rdi, "Driver attempting registration");
@@ -774,6 +580,13 @@
 		goto bail_no_mr;
 	}
 
+	/* Memory Working Set Size */
+	ret = rvt_wss_init(rdi);
+	if (ret) {
+		rvt_pr_err(rdi, "Error in WSS init.\n");
+		goto bail_mr;
+	}
+
 	/* Completion queues */
 	spin_lock_init(&rdi->n_cqs_lock);
 
@@ -790,7 +603,6 @@
 	 * exactly which functions rdmavt supports, nor do they know the ABI
 	 * version, so we do all of this sort of stuff here.
 	 */
-	rdi->ibdev.uverbs_abi_ver = RVT_UVERBS_ABI_VERSION;
 	rdi->ibdev.uverbs_cmd_mask =
 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
 		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
@@ -826,12 +638,11 @@
 	if (!rdi->ibdev.num_comp_vectors)
 		rdi->ibdev.num_comp_vectors = 1;
 
-	rdi->ibdev.driver_id = driver_id;
 	/* We are now good to announce we exist */
-	ret =  ib_register_device(&rdi->ibdev, rdi->driver_f.port_callback);
+	ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev));
 	if (ret) {
 		rvt_pr_err(rdi, "Failed to register driver with ib core.\n");
-		goto bail_mr;
+		goto bail_wss;
 	}
 
 	rvt_create_mad_agents(rdi);
@@ -839,6 +650,8 @@
 	rvt_pr_info(rdi, "Registration with rdmavt done.\n");
 	return ret;
 
+bail_wss:
+	rvt_wss_exit(rdi);
 bail_mr:
 	rvt_mr_exit(rdi);
 
@@ -862,6 +675,7 @@
 	rvt_free_mad_agents(rdi);
 
 	ib_unregister_device(&rdi->ibdev);
+	rvt_wss_exit(rdi);
 	rvt_mr_exit(rdi);
 	rvt_qp_exit(rdi);
 }
diff --git a/drivers/infiniband/sw/rdmavt/vt.h b/drivers/infiniband/sw/rdmavt/vt.h
index 0675ea6..d19ff81 100644
--- a/drivers/infiniband/sw/rdmavt/vt.h
+++ b/drivers/infiniband/sw/rdmavt/vt.h
@@ -78,6 +78,12 @@
 		     fmt, \
 		     ##__VA_ARGS__)
 
+#define rvt_pr_err_ratelimited(rdi, fmt, ...) \
+	__rvt_pr_err_ratelimited((rdi)->driver_f.get_pci_dev(rdi), \
+				 rvt_get_ibdev_name(rdi), \
+				 fmt, \
+				 ##__VA_ARGS__)
+
 #define __rvt_pr_info(pdev, name, fmt, ...) \
 	dev_info(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__)
 
@@ -87,6 +93,9 @@
 #define __rvt_pr_err(pdev, name, fmt, ...) \
 	dev_err(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__)
 
+#define __rvt_pr_err_ratelimited(pdev, name, fmt, ...) \
+	dev_err_ratelimited(&(pdev)->dev, "%s: " fmt, name, ##__VA_ARGS__)
+
 static inline int ibport_num_to_idx(struct ib_device *ibdev, u8 port_num)
 {
 	struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig
index 67ae960..d9bcfe7 100644
--- a/drivers/infiniband/sw/rxe/Kconfig
+++ b/drivers/infiniband/sw/rxe/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config RDMA_RXE
 	tristate "Software RDMA over Ethernet (RoCE) driver"
 	depends on INET && PCI && INFINIBAND
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 10999fa..a8c11b5 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -31,6 +31,7 @@
  * SOFTWARE.
  */
 
+#include <rdma/rdma_netlink.h>
 #include <net/addrconf.h>
 #include "rxe.h"
 #include "rxe_loc.h"
@@ -50,8 +51,10 @@
 /* free resources for a rxe device; all objects created for this device must
  * have been destroyed
  */
-static void rxe_cleanup(struct rxe_dev *rxe)
+void rxe_dealloc(struct ib_device *ib_dev)
 {
+	struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev);
+
 	rxe_pool_cleanup(&rxe->uc_pool);
 	rxe_pool_cleanup(&rxe->pd_pool);
 	rxe_pool_cleanup(&rxe->ah_pool);
@@ -65,16 +68,8 @@
 
 	rxe_cleanup_ports(rxe);
 
-	crypto_free_shash(rxe->tfm);
-}
-
-/* called when all references have been dropped */
-void rxe_release(struct kref *kref)
-{
-	struct rxe_dev *rxe = container_of(kref, struct rxe_dev, ref_cnt);
-
-	rxe_cleanup(rxe);
-	ib_dealloc_device(&rxe->ib_dev);
+	if (rxe->tfm)
+		crypto_free_shash(rxe->tfm);
 }
 
 /* initialize rxe device parameters */
@@ -103,7 +98,7 @@
 	rxe->attr.max_res_rd_atom		= RXE_MAX_RES_RD_ATOM;
 	rxe->attr.max_qp_init_rd_atom		= RXE_MAX_QP_INIT_RD_ATOM;
 	rxe->attr.max_ee_init_rd_atom		= RXE_MAX_EE_INIT_RD_ATOM;
-	rxe->attr.atomic_cap			= RXE_ATOMIC_CAP;
+	rxe->attr.atomic_cap			= IB_ATOMIC_HCA;
 	rxe->attr.max_ee			= RXE_MAX_EE;
 	rxe->attr.max_rdd			= RXE_MAX_RDD;
 	rxe->attr.max_mw			= RXE_MAX_MW;
@@ -128,9 +123,9 @@
 /* initialize port attributes */
 static int rxe_init_port_param(struct rxe_port *port)
 {
-	port->attr.state		= RXE_PORT_STATE;
-	port->attr.max_mtu		= RXE_PORT_MAX_MTU;
-	port->attr.active_mtu		= RXE_PORT_ACTIVE_MTU;
+	port->attr.state		= IB_PORT_DOWN;
+	port->attr.max_mtu		= IB_MTU_4096;
+	port->attr.active_mtu		= IB_MTU_256;
 	port->attr.gid_tbl_len		= RXE_PORT_GID_TBL_LEN;
 	port->attr.port_cap_flags	= RXE_PORT_PORT_CAP_FLAGS;
 	port->attr.max_msg_sz		= RXE_PORT_MAX_MSG_SZ;
@@ -147,8 +142,7 @@
 	port->attr.active_width		= RXE_PORT_ACTIVE_WIDTH;
 	port->attr.active_speed		= RXE_PORT_ACTIVE_SPEED;
 	port->attr.phys_state		= RXE_PORT_PHYS_STATE;
-	port->mtu_cap			=
-				ib_mtu_enum_to_int(RXE_PORT_ACTIVE_MTU);
+	port->mtu_cap			= ib_mtu_enum_to_int(IB_MTU_256);
 	port->subnet_prefix		= cpu_to_be64(RXE_PORT_SUBNET_PREFIX);
 
 	return 0;
@@ -280,7 +274,6 @@
 	spin_lock_init(&rxe->mmap_offset_lock);
 	spin_lock_init(&rxe->pending_lock);
 	INIT_LIST_HEAD(&rxe->pending_mmaps);
-	INIT_LIST_HEAD(&rxe->list);
 
 	mutex_init(&rxe->usdev_lock);
 
@@ -300,7 +293,7 @@
 	mtu = eth_mtu_int_to_enum(ndev_mtu);
 
 	/* Make sure that the new MTU is in range */
-	mtu = mtu ? min_t(enum ib_mtu, mtu, RXE_PORT_MAX_MTU) : IB_MTU_256;
+	mtu = mtu ? min_t(enum ib_mtu, mtu, IB_MTU_4096) : IB_MTU_256;
 
 	port->attr.active_mtu = mtu;
 	port->mtu_cap = ib_mtu_enum_to_int(mtu);
@@ -309,36 +302,45 @@
 /* called by ifc layer to create new rxe device.
  * The caller should allocate memory for rxe by calling ib_alloc_device.
  */
-int rxe_add(struct rxe_dev *rxe, unsigned int mtu)
+int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
 {
 	int err;
 
-	kref_init(&rxe->ref_cnt);
-
 	err = rxe_init(rxe);
 	if (err)
-		goto err1;
+		return err;
 
 	rxe_set_mtu(rxe, mtu);
 
-	err = rxe_register_device(rxe);
-	if (err)
-		goto err1;
+	return rxe_register_device(rxe, ibdev_name);
+}
 
-	return 0;
+static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
+{
+	struct rxe_dev *exists;
+	int err = 0;
 
-err1:
-	rxe_dev_put(rxe);
+	exists = rxe_get_dev_from_net(ndev);
+	if (exists) {
+		ib_device_put(&exists->ib_dev);
+		pr_err("already configured on %s\n", ndev->name);
+		err = -EEXIST;
+		goto err;
+	}
+
+	err = rxe_net_add(ibdev_name, ndev);
+	if (err) {
+		pr_err("failed to add %s\n", ndev->name);
+		goto err;
+	}
+err:
 	return err;
 }
 
-/* called by the ifc layer to remove a device */
-void rxe_remove(struct rxe_dev *rxe)
-{
-	rxe_unregister_device(rxe);
-
-	rxe_dev_put(rxe);
-}
+static struct rdma_link_ops rxe_link_ops = {
+	.type = "rxe",
+	.newlink = rxe_newlink,
+};
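+
+/*
+ * Once the link ops are registered, user space can create a rxe device
+ * on top of a netdev with, e.g.:
+ *
+ *   rdma link add rxe0 type rxe netdev eth0
+ */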
 
 static int __init rxe_module_init(void)
 {
@@ -355,13 +357,15 @@
 	if (err)
 		return err;
 
+	rdma_link_register(&rxe_link_ops);
 	pr_info("loaded\n");
 	return 0;
 }
 
 static void __exit rxe_module_exit(void)
 {
-	rxe_remove_all();
+	rdma_link_unregister(&rxe_link_ops);
+	ib_unregister_driver(RDMA_DRIVER_RXE);
 	rxe_net_exit();
 	rxe_cache_exit();
 
@@ -370,3 +374,5 @@
 
 late_initcall(rxe_module_init);
 module_exit(rxe_module_exit);
+
+MODULE_ALIAS_RDMA_LINK("rxe");
diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h
index d9ec2de..fb07eed 100644
--- a/drivers/infiniband/sw/rxe/rxe.h
+++ b/drivers/infiniband/sw/rxe/rxe.h
@@ -65,9 +65,6 @@
  */
 #define RXE_UVERBS_ABI_VERSION		2
 
-#define IB_PHYS_STATE_LINK_UP		(5)
-#define IB_PHYS_STATE_LINK_DOWN		(3)
-
 #define RXE_ROCE_V2_SPORT		(0xc000)
 
 static inline u32 rxe_crc32(struct rxe_dev *rxe,
@@ -79,7 +76,6 @@
 	SHASH_DESC_ON_STACK(shash, rxe->tfm);
 
 	shash->tfm = rxe->tfm;
-	shash->flags = 0;
 	*(u32 *)shash_desc_ctx(shash) = crc;
 	err = crypto_shash_update(shash, next, len);
 	if (unlikely(err)) {
@@ -94,20 +90,23 @@
 
 void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
 
-int rxe_add(struct rxe_dev *rxe, unsigned int mtu);
-void rxe_remove(struct rxe_dev *rxe);
-void rxe_remove_all(void);
+int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name);
 
 void rxe_rcv(struct sk_buff *skb);
 
-static inline void rxe_dev_put(struct rxe_dev *rxe)
+/* The caller must do a matching ib_device_put(&dev->ib_dev) */
+static inline struct rxe_dev *rxe_get_dev_from_net(struct net_device *ndev)
 {
-	kref_put(&rxe->ref_cnt, rxe_release);
+	struct ib_device *ibdev =
+		ib_device_get_by_netdev(ndev, RDMA_DRIVER_RXE);
+
+	if (!ibdev)
+		return NULL;
+	return container_of(ibdev, struct rxe_dev, ib_dev);
 }
-struct rxe_dev *net_to_rxe(struct net_device *ndev);
-struct rxe_dev *get_rxe_by_name(const char *name);
 
 void rxe_port_up(struct rxe_dev *rxe);
 void rxe_port_down(struct rxe_dev *rxe);
+void rxe_set_port_state(struct rxe_dev *rxe);
 
 #endif /* RXE_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c
index 26fe8d7..81ee756 100644
--- a/drivers/infiniband/sw/rxe/rxe_av.c
+++ b/drivers/infiniband/sw/rxe/rxe_av.c
@@ -34,6 +34,13 @@
 #include "rxe.h"
 #include "rxe_loc.h"
 
+void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av)
+{
+	rxe_av_from_attr(rdma_ah_get_port_num(attr), av, attr);
+	rxe_av_fill_ip_info(av, attr);
+	memcpy(av->dmac, attr->roce.dmac, ETH_ALEN);
+}
+
 int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr)
 {
 	struct rxe_port *port;
diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
index 83311dd..116cafc 100644
--- a/drivers/infiniband/sw/rxe/rxe_comp.c
+++ b/drivers/infiniband/sw/rxe/rxe_comp.c
@@ -146,8 +146,7 @@
 	}
 }
 
-void rxe_comp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp,
-			struct sk_buff *skb)
+void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb)
 {
 	int must_sched;
 
@@ -155,7 +154,8 @@
 
 	must_sched = skb_queue_len(&qp->resp_pkts) > 1;
 	if (must_sched != 0)
-		rxe_counter_inc(rxe, RXE_CNT_COMPLETER_SCHED);
+		rxe_counter_inc(SKB_TO_PKT(skb)->rxe, RXE_CNT_COMPLETER_SCHED);
+
 	rxe_run_task(&qp->comp.task, must_sched);
 }
 
@@ -191,6 +191,7 @@
 {
 	qp->comp.retry_cnt = qp->attr.retry_cnt;
 	qp->comp.rnr_retry = qp->attr.rnr_retry;
+	qp->comp.started_retry = 0;
 }
 
 static inline enum comp_state check_psn(struct rxe_qp *qp,
@@ -253,6 +254,17 @@
 	case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE:
 		if (pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE &&
 		    pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) {
+			/* read retries of partial data may restart from
+			 * read response first or response only.
+			 */
+			if ((pkt->psn == wqe->first_psn &&
+			     pkt->opcode ==
+			     IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) ||
+			    (wqe->first_psn == wqe->last_psn &&
+			     pkt->opcode ==
+			     IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY))
+				break;
+
 			return COMPST_ERROR;
 		}
 		break;
@@ -427,6 +439,7 @@
  */
 static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
 {
+	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
 	struct rxe_cqe cqe;
 
 	if ((qp->sq_sig_type == IB_SIGNAL_ALL_WR) ||
@@ -439,6 +452,11 @@
 		advance_consumer(qp->sq.queue);
 	}
 
+	if (wqe->wr.opcode == IB_WR_SEND ||
+	    wqe->wr.opcode == IB_WR_SEND_WITH_IMM ||
+	    wqe->wr.opcode == IB_WR_SEND_WITH_INV)
+		rxe_counter_inc(rxe, RXE_CNT_RDMA_SEND);
+
 	/*
 	 * we completed something so let req run again
 	 * if it is trying to fence
@@ -499,11 +517,11 @@
 					   struct rxe_pkt_info *pkt,
 					   struct rxe_send_wqe *wqe)
 {
-	qp->comp.opcode = -1;
-
-	if (pkt) {
-		if (psn_compare(pkt->psn, qp->comp.psn) >= 0)
-			qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+	if (pkt && wqe->state == wqe_state_pending) {
+		if (psn_compare(wqe->last_psn, qp->comp.psn) >= 0) {
+			qp->comp.psn = (wqe->last_psn + 1) & BTH_PSN_MASK;
+			qp->comp.opcode = -1;
+		}
 
 		if (qp->req.wait_psn) {
 			qp->req.wait_psn = 0;
@@ -540,7 +558,7 @@
 {
 	struct rxe_qp *qp = (struct rxe_qp *)arg;
 	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
-	struct rxe_send_wqe *wqe = wqe;
+	struct rxe_send_wqe *wqe = NULL;
 	struct sk_buff *skb = NULL;
 	struct rxe_pkt_info *pkt = NULL;
 	enum comp_state state;
@@ -676,6 +694,20 @@
 				goto exit;
 			}
 
+			/* if we've started a retry, don't start another
+			 * retry sequence, unless this is a timeout.
+			 */
+			if (qp->comp.started_retry &&
+			    !qp->comp.timeout_retry) {
+				if (pkt) {
+					rxe_drop_ref(pkt->qp);
+					kfree_skb(skb);
+					skb = NULL;
+				}
+
+				goto done;
+			}
+
 			if (qp->comp.retry_cnt > 0) {
 				if (qp->comp.retry_cnt != 7)
 					qp->comp.retry_cnt--;
@@ -692,6 +724,7 @@
 					rxe_counter_inc(rxe,
 							RXE_CNT_COMP_RETRY);
 					qp->req.need_retry = 1;
+					qp->comp.started_retry = 1;
 					rxe_run_task(&qp->req.task, 1);
 				}
 
@@ -701,7 +734,7 @@
 					skb = NULL;
 				}
 
-				goto exit;
+				goto done;
 
 			} else {
 				rxe_counter_inc(rxe, RXE_CNT_RETRY_EXCEEDED);
diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c
index 2ee4b08..ad30901 100644
--- a/drivers/infiniband/sw/rxe/rxe_cq.c
+++ b/drivers/infiniband/sw/rxe/rxe_cq.c
@@ -30,7 +30,7 @@
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-
+#include <linux/vmalloc.h>
 #include "rxe.h"
 #include "rxe_loc.h"
 #include "rxe_queue.h"
@@ -82,7 +82,7 @@
 }
 
 int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
-		     int comp_vector, struct ib_ucontext *context,
+		     int comp_vector, struct ib_udata *udata,
 		     struct rxe_create_cq_resp __user *uresp)
 {
 	int err;
@@ -94,10 +94,10 @@
 		return -ENOMEM;
 	}
 
-	err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, context,
+	err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, udata,
 			   cq->queue->buf, cq->queue->buf_size, &cq->queue->ip);
 	if (err) {
-		kvfree(cq->queue->buf);
+		vfree(cq->queue->buf);
 		kfree(cq->queue);
 		return err;
 	}
@@ -115,13 +115,13 @@
 }
 
 int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe,
-			struct rxe_resize_cq_resp __user *uresp)
+			struct rxe_resize_cq_resp __user *uresp,
+			struct ib_udata *udata)
 {
 	int err;
 
 	err = rxe_queue_resize(cq->queue, (unsigned int *)&cqe,
-			       sizeof(struct rxe_cqe),
-			       cq->queue->ip ? cq->queue->ip->context : NULL,
+			       sizeof(struct rxe_cqe), udata,
 			       uresp ? &uresp->mi : NULL, NULL, &cq->cq_lock);
 	if (!err)
 		cq->ibcq.cqe = cqe;
diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h
index 6cb1840..ce00366 100644
--- a/drivers/infiniband/sw/rxe/rxe_hdr.h
+++ b/drivers/infiniband/sw/rxe/rxe_hdr.h
@@ -643,7 +643,7 @@
 	__be32			rkey;
 	__be64			swap_add;
 	__be64			comp;
-} __attribute__((__packed__));
+} __packed;
 
 static inline u64 __atmeth_va(void *arg)
 {
diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.c b/drivers/infiniband/sw/rxe/rxe_hw_counters.c
index 6aeb7a1..636edb5 100644
--- a/drivers/infiniband/sw/rxe/rxe_hw_counters.c
+++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.c
@@ -37,15 +37,18 @@
 	[RXE_CNT_SENT_PKTS]           =  "sent_pkts",
 	[RXE_CNT_RCVD_PKTS]           =  "rcvd_pkts",
 	[RXE_CNT_DUP_REQ]             =  "duplicate_request",
-	[RXE_CNT_OUT_OF_SEQ_REQ]      =  "out_of_sequence",
+	[RXE_CNT_OUT_OF_SEQ_REQ]      =  "out_of_seq_request",
 	[RXE_CNT_RCV_RNR]             =  "rcvd_rnr_err",
 	[RXE_CNT_SND_RNR]             =  "send_rnr_err",
 	[RXE_CNT_RCV_SEQ_ERR]         =  "rcvd_seq_err",
-	[RXE_CNT_COMPLETER_SCHED]     =  "ack_deffered",
+	[RXE_CNT_COMPLETER_SCHED]     =  "ack_deferred",
 	[RXE_CNT_RETRY_EXCEEDED]      =  "retry_exceeded_err",
 	[RXE_CNT_RNR_RETRY_EXCEEDED]  =  "retry_rnr_exceeded_err",
 	[RXE_CNT_COMP_RETRY]          =  "completer_retry_err",
 	[RXE_CNT_SEND_ERR]            =  "send_err",
+	[RXE_CNT_LINK_DOWNED]         =  "link_downed",
+	[RXE_CNT_RDMA_SEND]           =  "rdma_sends",
+	[RXE_CNT_RDMA_RECV]           =  "rdma_recvs",
 };
 
 int rxe_ib_get_hw_stats(struct ib_device *ibdev,
@@ -59,7 +62,7 @@
 		return -EINVAL;
 
 	for (cnt = 0; cnt < ARRAY_SIZE(rxe_counter_name); cnt++)
-		stats->value[cnt] = dev->stats_counters[cnt];
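+		/* atomic64_read() avoids torn reads of 64-bit counters
+		 * that are bumped concurrently from packet processing.
+		 */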
+		stats->value[cnt] = atomic64_read(&dev->stats_counters[cnt]);
 
 	return ARRAY_SIZE(rxe_counter_name);
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.h b/drivers/infiniband/sw/rxe/rxe_hw_counters.h
index f44df1b..72c0d63 100644
--- a/drivers/infiniband/sw/rxe/rxe_hw_counters.h
+++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.h
@@ -50,6 +50,9 @@
 	RXE_CNT_RNR_RETRY_EXCEEDED,
 	RXE_CNT_COMP_RETRY,
 	RXE_CNT_SEND_ERR,
+	RXE_CNT_LINK_DOWNED,
+	RXE_CNT_RDMA_SEND,
+	RXE_CNT_RDMA_RECV,
 	RXE_NUM_OF_COUNTERS
 };
 
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index 87d14f7..775c23b 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -35,6 +35,7 @@
 #define RXE_LOC_H
 
 /* rxe_av.c */
+void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av);
 
 int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr);
 
@@ -52,11 +53,12 @@
 		    int cqe, int comp_vector);
 
 int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
-		     int comp_vector, struct ib_ucontext *context,
+		     int comp_vector, struct ib_udata *udata,
 		     struct rxe_create_cq_resp __user *uresp);
 
 int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe,
-			struct rxe_resize_cq_resp __user *uresp);
+			struct rxe_resize_cq_resp __user *uresp,
+			struct ib_udata *udata);
 
 int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited);
 
@@ -90,10 +92,8 @@
 
 void rxe_mmap_release(struct kref *ref);
 
-struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *dev,
-					   u32 size,
-					   struct ib_ucontext *context,
-					   void *obj);
+struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *dev, u32 size,
+					   struct ib_udata *udata, void *obj);
 
 int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
 
@@ -144,8 +144,7 @@
 int rxe_send(struct rxe_pkt_info *pkt, struct sk_buff *skb);
 struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av,
 				int paylen, struct rxe_pkt_info *pkt);
-int rxe_prepare(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
-		struct sk_buff *skb, u32 *crc);
+int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc);
 enum rdma_link_layer rxe_link_layer(struct rxe_dev *rxe, unsigned int port_num);
 const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num);
 struct device *rxe_dma_device(struct rxe_dev *rxe);
@@ -158,7 +157,7 @@
 int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
 		     struct ib_qp_init_attr *init,
 		     struct rxe_create_qp_resp __user *uresp,
-		     struct ib_pd *ibpd);
+		     struct ib_pd *ibpd, struct ib_udata *udata);
 
 int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init);
 
@@ -196,7 +195,7 @@
 	if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC)
 		return qp->attr.path_mtu;
 	else
-		return RXE_PORT_MAX_MTU;
+		return IB_MTU_4096;
 }
 
 static inline int rcv_wqe_size(int max_sge)
@@ -224,15 +223,14 @@
 		     struct ib_srq_attr *attr, enum ib_srq_attr_mask mask);
 
 int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq,
-		      struct ib_srq_init_attr *init,
-		      struct ib_ucontext *context,
+		      struct ib_srq_init_attr *init, struct ib_udata *udata,
 		      struct rxe_create_srq_resp __user *uresp);
 
 int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
 		      struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
-		      struct rxe_modify_srq_cmd *ucmd);
+		      struct rxe_modify_srq_cmd *ucmd, struct ib_udata *udata);
 
-void rxe_release(struct kref *kref);
+void rxe_dealloc(struct ib_device *ib_dev);
 
 int rxe_completer(void *arg);
 int rxe_requester(void *arg);
@@ -240,22 +238,21 @@
 
 u32 rxe_icrc_hdr(struct rxe_pkt_info *pkt, struct sk_buff *skb);
 
-void rxe_resp_queue_pkt(struct rxe_dev *rxe,
-			struct rxe_qp *qp, struct sk_buff *skb);
+void rxe_resp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb);
 
-void rxe_comp_queue_pkt(struct rxe_dev *rxe,
-			struct rxe_qp *qp, struct sk_buff *skb);
+void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb);
 
 static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp)
 {
 	return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type];
 }
 
-static inline int rxe_xmit_packet(struct rxe_dev *rxe, struct rxe_qp *qp,
-				  struct rxe_pkt_info *pkt, struct sk_buff *skb)
+static inline int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
+				  struct sk_buff *skb)
 {
 	int err;
 	int is_request = pkt->mask & RXE_REQ_MASK;
+	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
 
 	if ((is_request && (qp->req.state != QP_STATE_READY)) ||
 	    (!is_request && (qp->resp.state != QP_STATE_READY))) {
diff --git a/drivers/infiniband/sw/rxe/rxe_mmap.c b/drivers/infiniband/sw/rxe/rxe_mmap.c
index d22431e..48f4812 100644
--- a/drivers/infiniband/sw/rxe/rxe_mmap.c
+++ b/drivers/infiniband/sw/rxe/rxe_mmap.c
@@ -36,6 +36,7 @@
 #include <linux/mm.h>
 #include <linux/errno.h>
 #include <asm/pgtable.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "rxe.h"
 #include "rxe_loc.h"
@@ -140,13 +141,14 @@
 /*
  * Allocate information for rxe_mmap
  */
-struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *rxe,
-					   u32 size,
-					   struct ib_ucontext *context,
-					   void *obj)
+struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *rxe, u32 size,
+					   struct ib_udata *udata, void *obj)
 {
 	struct rxe_mmap_info *ip;
 
+	if (!udata)
+		return ERR_PTR(-EINVAL);
+
 	ip = kmalloc(sizeof(*ip), GFP_KERNEL);
 	if (!ip)
 		return NULL;
@@ -165,7 +167,9 @@
 
 	INIT_LIST_HEAD(&ip->pending_mmaps);
 	ip->info.size = size;
-	ip->context = context;
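+	/*
+	 * The udata here is always embedded in a uverbs_attr_bundle, so
+	 * the issuing ucontext can be recovered from it instead of being
+	 * passed down explicitly.
+	 */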
+	ip->context =
+		container_of(udata, struct uverbs_attr_bundle, driver_udata)
+			->context;
 	ip->obj = obj;
 	kref_init(&ip->ref);
 
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index dff605f..ea6a819 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -96,8 +96,7 @@
 	struct rxe_mem *mem = container_of(arg, typeof(*mem), pelem);
 	int i;
 
-	if (mem->umem)
-		ib_umem_release(mem->umem);
+	ib_umem_release(mem->umem);
 
 	if (mem->map) {
 		for (i = 0; i < mem->num_map; i++)
@@ -162,16 +161,15 @@
 		      u64 length, u64 iova, int access, struct ib_udata *udata,
 		      struct rxe_mem *mem)
 {
-	int			entry;
 	struct rxe_map		**map;
 	struct rxe_phys_buf	*buf = NULL;
 	struct ib_umem		*umem;
-	struct scatterlist	*sg;
+	struct sg_page_iter	sg_iter;
 	int			num_buf;
 	void			*vaddr;
 	int err;
 
-	umem = ib_umem_get(pd->ibpd.uobject->context, start, length, access, 0);
+	umem = ib_umem_get(udata, start, length, access, 0);
 	if (IS_ERR(umem)) {
 		pr_warn("err %d from rxe_umem_get\n",
 			(int)PTR_ERR(umem));
@@ -180,7 +178,7 @@
 	}
 
 	mem->umem = umem;
-	num_buf = umem->nmap;
+	num_buf = ib_umem_num_pages(umem);
 
 	rxe_mem_init(access, mem);
 
@@ -191,16 +189,22 @@
 		goto err1;
 	}
 
-	mem->page_shift		= umem->page_shift;
-	mem->page_mask		= BIT(umem->page_shift) - 1;
+	mem->page_shift		= PAGE_SHIFT;
+	mem->page_mask		= PAGE_SIZE - 1;
 
 	num_buf			= 0;
 	map			= mem->map;
 	if (length > 0) {
 		buf = map[0]->buf;
 
-		for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-			vaddr = page_address(sg_page(sg));
+		for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+			if (num_buf >= RXE_BUF_PER_MAP) {
+				map++;
+				buf = map[0]->buf;
+				num_buf = 0;
+			}
+
+			vaddr = page_address(sg_page_iter_page(&sg_iter));
 			if (!vaddr) {
 				pr_warn("null vaddr\n");
 				err = -ENOMEM;
@@ -208,15 +212,10 @@
 			}
 
 			buf->addr = (uintptr_t)vaddr;
-			buf->size = BIT(umem->page_shift);
+			buf->size = PAGE_SIZE;
 			num_buf++;
 			buf++;
 
-			if (num_buf >= RXE_BUF_PER_MAP) {
-				map++;
-				buf = map[0]->buf;
-				num_buf = 0;
-			}
 		}
 	}
 
@@ -573,33 +572,20 @@
 	struct rxe_dev *rxe = to_rdev(pd->ibpd.device);
 	int index = key >> 8;
 
-	if (index >= RXE_MIN_MR_INDEX && index <= RXE_MAX_MR_INDEX) {
-		mem = rxe_pool_get_index(&rxe->mr_pool, index);
-		if (!mem)
-			goto err1;
-	} else {
-		goto err1;
+	mem = rxe_pool_get_index(&rxe->mr_pool, index);
+	if (!mem)
+		return NULL;
+
+	if (unlikely((type == lookup_local && mem->lkey != key) ||
+		     (type == lookup_remote && mem->rkey != key) ||
+		     mem->pd != pd ||
+		     (access && !(access & mem->access)) ||
+		     mem->state != RXE_MEM_STATE_VALID)) {
+		rxe_drop_ref(mem);
+		mem = NULL;
 	}
 
-	if ((type == lookup_local && mem->lkey != key) ||
-	    (type == lookup_remote && mem->rkey != key))
-		goto err2;
-
-	if (mem->pd != pd)
-		goto err2;
-
-	if (access && !(access & mem->access))
-		goto err2;
-
-	if (mem->state != RXE_MEM_STATE_VALID)
-		goto err2;
-
 	return mem;
-
-err2:
-	rxe_drop_ref(mem);
-err1:
-	return NULL;
 }
 
 int rxe_mem_map_pages(struct rxe_dev *rxe, struct rxe_mem *mem,
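
Two details of the umem conversion are easy to miss: num_buf now counts pages (ib_umem_num_pages()) rather than DMA-mapped SG entries, and the rollover to the next map happens before a page is stored instead of after it, so the final page can no longer step one map past the allocation. A small runnable model of the fill loop, with made-up sizes:

#include <stdio.h>

#define BUFS_PER_MAP 4	/* stands in for RXE_BUF_PER_MAP */

int main(void)
{
	int pages = 10, map = 0, num_buf = 0;

	for (int page = 0; page < pages; page++) {
		if (num_buf >= BUFS_PER_MAP) {	/* roll over first */
			map++;
			num_buf = 0;
		}
		printf("page %2d -> map %d slot %d\n", page, map, num_buf);
		num_buf++;	/* store, then advance within the map */
	}
	return 0;
}
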
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index 8094cba..5a3474f 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -45,43 +45,6 @@
 #include "rxe_net.h"
 #include "rxe_loc.h"
 
-static LIST_HEAD(rxe_dev_list);
-static DEFINE_SPINLOCK(dev_list_lock); /* spinlock for device list */
-
-struct rxe_dev *net_to_rxe(struct net_device *ndev)
-{
-	struct rxe_dev *rxe;
-	struct rxe_dev *found = NULL;
-
-	spin_lock_bh(&dev_list_lock);
-	list_for_each_entry(rxe, &rxe_dev_list, list) {
-		if (rxe->ndev == ndev) {
-			found = rxe;
-			break;
-		}
-	}
-	spin_unlock_bh(&dev_list_lock);
-
-	return found;
-}
-
-struct rxe_dev *get_rxe_by_name(const char *name)
-{
-	struct rxe_dev *rxe;
-	struct rxe_dev *found = NULL;
-
-	spin_lock_bh(&dev_list_lock);
-	list_for_each_entry(rxe, &rxe_dev_list, list) {
-		if (!strcmp(name, rxe->ib_dev.name)) {
-			found = rxe;
-			break;
-		}
-	}
-	spin_unlock_bh(&dev_list_lock);
-	return found;
-}
-
-
 static struct rxe_recv_sockets recv_sockets;
 
 struct device *rxe_dma_device(struct rxe_dev *rxe)
@@ -182,19 +145,11 @@
 
 #endif
 
-static struct dst_entry *rxe_find_route(struct rxe_dev *rxe,
+static struct dst_entry *rxe_find_route(struct net_device *ndev,
 					struct rxe_qp *qp,
 					struct rxe_av *av)
 {
-	const struct ib_gid_attr *attr;
 	struct dst_entry *dst = NULL;
-	struct net_device *ndev;
-
-	attr = rdma_get_gid_attr(&rxe->ib_dev, qp->attr.port_num,
-				 av->grh.sgid_index);
-	if (IS_ERR(attr))
-		return NULL;
-	ndev = attr->ndev;
 
 	if (qp_type(qp) == IB_QPT_RC)
 		dst = sk_dst_get(qp->sk->sk);
@@ -229,7 +184,6 @@
 			sk_dst_set(qp->sk->sk, dst);
 		}
 	}
-	rdma_put_gid_attr(attr);
 	return dst;
 }
 
@@ -238,18 +192,19 @@
 	struct udphdr *udph;
 	struct net_device *ndev = skb->dev;
 	struct net_device *rdev = ndev;
-	struct rxe_dev *rxe = net_to_rxe(ndev);
+	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
 	struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
 
 	if (!rxe && is_vlan_dev(rdev)) {
 		rdev = vlan_dev_real_dev(ndev);
-		rxe = net_to_rxe(rdev);
+		rxe = rxe_get_dev_from_net(rdev);
 	}
 	if (!rxe)
 		goto drop;
 
 	if (skb_linearize(skb)) {
 		pr_err("skb_linearize failed\n");
+		ib_device_put(&rxe->ib_dev);
 		goto drop;
 	}
 
@@ -262,6 +217,12 @@
 
 	rxe_rcv(skb);
 
+	/*
+	 * FIXME: this is in the wrong place, it needs to be done when pkt is
+	 * destroyed
+	 */
+	ib_device_put(&rxe->ib_dev);
+
 	return 0;
 drop:
 	kfree_skb(skb);
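
rxe_get_dev_from_net() returns the device with an ib_device reference already held, which is why the skb_linearize() failure path gains an ib_device_put() and why the FIXME above notes that the put after rxe_rcv() really belongs to the packet's lifetime. The contract in miniature (simplified; the real receive path consumes the skb in rxe_rcv()):

/* Miniature of the lookup/use/put contract: one successful lookup
 * means exactly one ib_device_put() on every path out.
 */
static int demo_handle(struct net_device *ndev, struct sk_buff *skb)
{
	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);

	if (!rxe)
		return -ENODEV;

	rxe_rcv(skb);			/* consumes the skb */
	ib_device_put(&rxe->ib_dev);	/* balances the lookup */
	return 0;
}
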
@@ -377,27 +338,24 @@
 	ip6h->payload_len = htons(skb->len - sizeof(*ip6h));
 }
 
-static int prepare4(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
-		    struct sk_buff *skb, struct rxe_av *av)
+static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb)
 {
 	struct rxe_qp *qp = pkt->qp;
 	struct dst_entry *dst;
 	bool xnet = false;
 	__be16 df = htons(IP_DF);
+	struct rxe_av *av = rxe_get_av(pkt);
 	struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr;
 	struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr;
 
-	dst = rxe_find_route(rxe, qp, av);
+	dst = rxe_find_route(skb->dev, qp, av);
 	if (!dst) {
 		pr_err("Host not reachable\n");
 		return -EHOSTUNREACH;
 	}
 
-	if (!memcmp(saddr, daddr, sizeof(*daddr)))
-		pkt->mask |= RXE_LOOPBACK_MASK;
-
-	prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT),
-			htons(ROCE_V2_UDP_DPORT));
+	prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
+			cpu_to_be16(ROCE_V2_UDP_DPORT));
 
 	prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP,
 			 av->grh.traffic_class, av->grh.hop_limit, df, xnet);
@@ -406,25 +364,22 @@
 	return 0;
 }
 
-static int prepare6(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
-		    struct sk_buff *skb, struct rxe_av *av)
+static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb)
 {
 	struct rxe_qp *qp = pkt->qp;
 	struct dst_entry *dst;
+	struct rxe_av *av = rxe_get_av(pkt);
 	struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr;
 	struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr;
 
-	dst = rxe_find_route(rxe, qp, av);
+	dst = rxe_find_route(skb->dev, qp, av);
 	if (!dst) {
 		pr_err("Host not reachable\n");
 		return -EHOSTUNREACH;
 	}
 
-	if (!memcmp(saddr, daddr, sizeof(*daddr)))
-		pkt->mask |= RXE_LOOPBACK_MASK;
-
-	prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT),
-			htons(ROCE_V2_UDP_DPORT));
+	prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
+			cpu_to_be16(ROCE_V2_UDP_DPORT));
 
 	prepare_ipv6_hdr(dst, skb, saddr, daddr, IPPROTO_UDP,
 			 av->grh.traffic_class,
@@ -434,19 +389,20 @@
 	return 0;
 }
 
-int rxe_prepare(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
-		struct sk_buff *skb, u32 *crc)
+int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc)
 {
 	int err = 0;
-	struct rxe_av *av = rxe_get_av(pkt);
 
-	if (av->network_type == RDMA_NETWORK_IPV4)
-		err = prepare4(rxe, pkt, skb, av);
-	else if (av->network_type == RDMA_NETWORK_IPV6)
-		err = prepare6(rxe, pkt, skb, av);
+	if (skb->protocol == htons(ETH_P_IP))
+		err = prepare4(pkt, skb);
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		err = prepare6(pkt, skb);
 
 	*crc = rxe_icrc_hdr(pkt, skb);
 
+	if (ether_addr_equal(skb->dev->dev_addr, rxe_get_av(pkt)->dmac))
+		pkt->mask |= RXE_LOOPBACK_MASK;
+
 	return err;
 }
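
The rewritten rxe_prepare() keys off skb->protocol, which rxe_init_packet() set when the skb was built, instead of re-deriving the AV's network type, and it flags loopback by matching the destination MAC against the egress device. One byte-order detail matters: skb->protocol is stored big-endian, so the constant side of the comparison is the one passed through htons(). Both tests in isolation, with illustrative helper names:

/* Byte-order and loopback details of the dispatch above, shown in
 * isolation.
 */
static bool demo_is_v4(const struct sk_buff *skb)
{
	return skb->protocol == htons(ETH_P_IP);	/* swap the constant */
}

static bool demo_is_loopback(const struct sk_buff *skb,
			     const struct rxe_av *av)
{
	/* same test rxe_prepare() uses to set RXE_LOOPBACK_MASK */
	return ether_addr_equal(skb->dev->dev_addr, av->dmac);
}
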
 
@@ -465,23 +421,20 @@
 
 int rxe_send(struct rxe_pkt_info *pkt, struct sk_buff *skb)
 {
-	struct rxe_av *av;
 	int err;
 
-	av = rxe_get_av(pkt);
-
 	skb->destructor = rxe_skb_tx_dtor;
 	skb->sk = pkt->qp->sk->sk;
 
 	rxe_add_ref(pkt->qp);
 	atomic_inc(&pkt->qp->skb_out);
 
-	if (av->network_type == RDMA_NETWORK_IPV4) {
+	if (skb->protocol == htons(ETH_P_IP)) {
 		err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
-	} else if (av->network_type == RDMA_NETWORK_IPV6) {
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
 		err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
 	} else {
-		pr_err("Unknown layer 3 protocol: %d\n", av->network_type);
+		pr_err("Unknown layer 3 protocol: %d\n", skb->protocol);
 		atomic_dec(&pkt->qp->skb_out);
 		rxe_drop_ref(pkt->qp);
 		kfree_skb(skb);
@@ -501,16 +454,11 @@
 	rxe_rcv(skb);
 }
 
-static inline int addr_same(struct rxe_dev *rxe, struct rxe_av *av)
-{
-	return rxe->port.port_guid == av->grh.dgid.global.interface_id;
-}
-
 struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av,
 				int paylen, struct rxe_pkt_info *pkt)
 {
 	unsigned int hdr_len;
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
 	struct net_device *ndev;
 	const struct ib_gid_attr *attr;
 	const int port_num = 1;
@@ -518,7 +466,6 @@
 	attr = rdma_get_gid_attr(&rxe->ib_dev, port_num, av->grh.sgid_index);
 	if (IS_ERR(attr))
 		return NULL;
-	ndev = attr->ndev;
 
 	if (av->network_type == RDMA_NETWORK_IPV4)
 		hdr_len = ETH_HLEN + sizeof(struct udphdr) +
@@ -527,15 +474,26 @@
 		hdr_len = ETH_HLEN + sizeof(struct udphdr) +
 			sizeof(struct ipv6hdr);
 
+	rcu_read_lock();
+	ndev = rdma_read_gid_attr_ndev_rcu(attr);
+	if (IS_ERR(ndev)) {
+		rcu_read_unlock();
+		goto out;
+	}
 	skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(ndev),
 			GFP_ATOMIC);
 
-	if (unlikely(!skb))
+	if (unlikely(!skb)) {
+		rcu_read_unlock();
 		goto out;
+	}
 
-	skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(rxe->ndev));
+	skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(ndev));
 
+	/* FIXME: hold reference to this netdev until life of this skb. */
 	skb->dev	= ndev;
+	rcu_read_unlock();
+
 	if (av->network_type == RDMA_NETWORK_IPV4)
 		skb->protocol = htons(ETH_P_IP);
 	else
@@ -565,42 +523,24 @@
 	return IB_LINK_LAYER_ETHERNET;
 }
 
-struct rxe_dev *rxe_net_add(struct net_device *ndev)
+int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
 {
 	int err;
 	struct rxe_dev *rxe = NULL;
 
-	rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe));
+	rxe = ib_alloc_device(rxe_dev, ib_dev);
 	if (!rxe)
-		return NULL;
+		return -ENOMEM;
 
 	rxe->ndev = ndev;
 
-	err = rxe_add(rxe, ndev->mtu);
+	err = rxe_add(rxe, ndev->mtu, ibdev_name);
 	if (err) {
 		ib_dealloc_device(&rxe->ib_dev);
-		return NULL;
+		return err;
 	}
 
-	spin_lock_bh(&dev_list_lock);
-	list_add_tail(&rxe->list, &rxe_dev_list);
-	spin_unlock_bh(&dev_list_lock);
-	return rxe;
-}
-
-void rxe_remove_all(void)
-{
-	spin_lock_bh(&dev_list_lock);
-	while (!list_empty(&rxe_dev_list)) {
-		struct rxe_dev *rxe =
-			list_first_entry(&rxe_dev_list, struct rxe_dev, list);
-
-		list_del(&rxe->list);
-		spin_unlock_bh(&dev_list_lock);
-		rxe_remove(rxe);
-		spin_lock_bh(&dev_list_lock);
-	}
-	spin_unlock_bh(&dev_list_lock);
+	return 0;
 }
 
 static void rxe_port_event(struct rxe_dev *rxe,
@@ -622,10 +562,9 @@
 
 	port = &rxe->port;
 	port->attr.state = IB_PORT_ACTIVE;
-	port->attr.phys_state = IB_PHYS_STATE_LINK_UP;
 
 	rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE);
-	pr_info("set %s active\n", rxe->ib_dev.name);
+	dev_info(&rxe->ib_dev.dev, "set active\n");
 }
 
 /* Caller must hold net_info_lock */
@@ -635,10 +574,18 @@
 
 	port = &rxe->port;
 	port->attr.state = IB_PORT_DOWN;
-	port->attr.phys_state = IB_PHYS_STATE_LINK_DOWN;
 
 	rxe_port_event(rxe, IB_EVENT_PORT_ERR);
-	pr_info("set %s down\n", rxe->ib_dev.name);
+	rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED);
+	dev_info(&rxe->ib_dev.dev, "set down\n");
+}
+
+void rxe_set_port_state(struct rxe_dev *rxe)
+{
+	if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev))
+		rxe_port_up(rxe);
+	else
+		rxe_port_down(rxe);
 }
 
 static int rxe_notify(struct notifier_block *not_blk,
@@ -646,15 +593,14 @@
 		      void *arg)
 {
 	struct net_device *ndev = netdev_notifier_info_to_dev(arg);
-	struct rxe_dev *rxe = net_to_rxe(ndev);
+	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
 
 	if (!rxe)
-		goto out;
+		return NOTIFY_OK;
 
 	switch (event) {
 	case NETDEV_UNREGISTER:
-		list_del(&rxe->list);
-		rxe_remove(rxe);
+		ib_unregister_device_queued(&rxe->ib_dev);
 		break;
 	case NETDEV_UP:
 		rxe_port_up(rxe);
@@ -667,10 +613,7 @@
 		rxe_set_mtu(rxe, ndev->mtu);
 		break;
 	case NETDEV_CHANGE:
-		if (netif_running(ndev) && netif_carrier_ok(ndev))
-			rxe_port_up(rxe);
-		else
-			rxe_port_down(rxe);
+		rxe_set_port_state(rxe);
 		break;
 	case NETDEV_REBOOT:
 	case NETDEV_GOING_DOWN:
@@ -682,7 +625,8 @@
 			event, ndev->name);
 		break;
 	}
-out:
+
+	ib_device_put(&rxe->ib_dev);
 	return NOTIFY_OK;
 }
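
rxe_notify() now pins the device for the whole event and funnels NETDEV_UNREGISTER through ib_unregister_device_queued(), so the final teardown runs outside the notifier instead of synchronously inside it. The shape of that notifier, as a sketch with illustrative names:

/* Sketch of the notifier shape above: the reference taken by the
 * lookup is dropped once, at the end, on every event.
 */
static int demo_notify(struct notifier_block *nb, unsigned long event,
		       void *arg)
{
	struct net_device *ndev = netdev_notifier_info_to_dev(arg);
	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);

	if (!rxe)
		return NOTIFY_OK;

	switch (event) {
	case NETDEV_UNREGISTER:
		/* queued: unregistration completes outside the notifier */
		ib_unregister_device_queued(&rxe->ib_dev);
		break;
	case NETDEV_CHANGE:
		rxe_set_port_state(rxe);	/* follow carrier state */
		break;
	}

	ib_device_put(&rxe->ib_dev);
	return NOTIFY_OK;
}
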
 
diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
index 106c586..2ca71d3 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.h
+++ b/drivers/infiniband/sw/rxe/rxe_net.h
@@ -43,7 +43,7 @@
 	struct socket *sk6;
 };
 
-struct rxe_dev *rxe_net_add(struct net_device *ndev);
+int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
 
 int rxe_net_init(void);
 void rxe_net_exit(void);
diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h
index 4555510..fe52073 100644
--- a/drivers/infiniband/sw/rxe/rxe_param.h
+++ b/drivers/infiniband/sw/rxe/rxe_param.h
@@ -78,7 +78,8 @@
 					| IB_DEVICE_SYS_IMAGE_GUID
 					| IB_DEVICE_RC_RNR_NAK_GEN
 					| IB_DEVICE_SRQ_RESIZE
-					| IB_DEVICE_MEM_MGT_EXTENSIONS,
+					| IB_DEVICE_MEM_MGT_EXTENSIONS
+					| IB_DEVICE_ALLOW_USER_UNREG,
 	RXE_MAX_SGE			= 32,
 	RXE_MAX_SGE_RD			= 32,
 	RXE_MAX_CQ			= 16384,
@@ -90,7 +91,6 @@
 	RXE_MAX_RES_RD_ATOM		= 0x3f000,
 	RXE_MAX_QP_INIT_RD_ATOM		= 128,
 	RXE_MAX_EE_INIT_RD_ATOM		= 0,
-	RXE_ATOMIC_CAP			= 1,
 	RXE_MAX_EE			= 0,
 	RXE_MAX_RDD			= 0,
 	RXE_MAX_MW			= 0,
@@ -139,9 +139,6 @@
 
 /* default/initial rxe port parameters */
 enum rxe_port_param {
-	RXE_PORT_STATE			= IB_PORT_DOWN,
-	RXE_PORT_MAX_MTU		= IB_MTU_4096,
-	RXE_PORT_ACTIVE_MTU		= IB_MTU_256,
 	RXE_PORT_GID_TBL_LEN		= 1024,
 	RXE_PORT_PORT_CAP_FLAGS		= RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP,
 	RXE_PORT_MAX_MSG_SZ		= 0x800000,
@@ -157,7 +154,7 @@
 	RXE_PORT_ACTIVE_WIDTH		= IB_WIDTH_1X,
 	RXE_PORT_ACTIVE_SPEED		= 1,
 	RXE_PORT_PKEY_TBL_LEN		= 64,
-	RXE_PORT_PHYS_STATE		= 2,
+	RXE_PORT_PHYS_STATE		= IB_PORT_PHYS_STATE_POLLING,
 	RXE_PORT_SUBNET_PREFIX		= 0xfe80000000000000ULL,
 };
 
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index b4a8acc..fbcbac5 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -42,20 +42,22 @@
 	[RXE_TYPE_UC] = {
 		.name		= "rxe-uc",
 		.size		= sizeof(struct rxe_ucontext),
+		.flags          = RXE_POOL_NO_ALLOC,
 	},
 	[RXE_TYPE_PD] = {
 		.name		= "rxe-pd",
 		.size		= sizeof(struct rxe_pd),
+		.flags		= RXE_POOL_NO_ALLOC,
 	},
 	[RXE_TYPE_AH] = {
 		.name		= "rxe-ah",
 		.size		= sizeof(struct rxe_ah),
-		.flags		= RXE_POOL_ATOMIC,
+		.flags		= RXE_POOL_ATOMIC | RXE_POOL_NO_ALLOC,
 	},
 	[RXE_TYPE_SRQ] = {
 		.name		= "rxe-srq",
 		.size		= sizeof(struct rxe_srq),
-		.flags		= RXE_POOL_INDEX,
+		.flags		= RXE_POOL_INDEX | RXE_POOL_NO_ALLOC,
 		.min_index	= RXE_MIN_SRQ_INDEX,
 		.max_index	= RXE_MAX_SRQ_INDEX,
 	},
@@ -70,6 +72,7 @@
 	[RXE_TYPE_CQ] = {
 		.name		= "rxe-cq",
 		.size		= sizeof(struct rxe_cq),
+		.flags          = RXE_POOL_NO_ALLOC,
 		.cleanup	= rxe_cq_cleanup,
 	},
 	[RXE_TYPE_MR] = {
@@ -112,6 +115,20 @@
 	return rxe_type_info[pool->type].cache;
 }
 
+static void rxe_cache_clean(size_t cnt)
+{
+	int i;
+	struct rxe_type_info *type;
+
+	for (i = 0; i < cnt; i++) {
+		type = &rxe_type_info[i];
+		if (!(type->flags & RXE_POOL_NO_ALLOC)) {
+			kmem_cache_destroy(type->cache);
+			type->cache = NULL;
+		}
+	}
+}
+
 int rxe_cache_init(void)
 {
 	int err;
@@ -122,38 +139,31 @@
 	for (i = 0; i < RXE_NUM_TYPES; i++) {
 		type = &rxe_type_info[i];
 		size = ALIGN(type->size, RXE_POOL_ALIGN);
-		type->cache = kmem_cache_create(type->name, size,
-				RXE_POOL_ALIGN,
-				RXE_POOL_CACHE_FLAGS, NULL);
-		if (!type->cache) {
-			pr_err("Unable to init kmem cache for %s\n",
-			       type->name);
-			err = -ENOMEM;
-			goto err1;
+		if (!(type->flags & RXE_POOL_NO_ALLOC)) {
+			type->cache =
+				kmem_cache_create(type->name, size,
+						  RXE_POOL_ALIGN,
+						  RXE_POOL_CACHE_FLAGS, NULL);
+			if (!type->cache) {
+				pr_err("Unable to init kmem cache for %s\n",
+				       type->name);
+				err = -ENOMEM;
+				goto err1;
+			}
 		}
 	}
 
 	return 0;
 
 err1:
-	while (--i >= 0) {
-		kmem_cache_destroy(type->cache);
-		type->cache = NULL;
-	}
+	rxe_cache_clean(i);
 
 	return err;
 }
 
 void rxe_cache_exit(void)
 {
-	int i;
-	struct rxe_type_info *type;
-
-	for (i = 0; i < RXE_NUM_TYPES; i++) {
-		type = &rxe_type_info[i];
-		kmem_cache_destroy(type->cache);
-		type->cache = NULL;
-	}
+	rxe_cache_clean(RXE_NUM_TYPES);
 }
 
 static int rxe_pool_init_index(struct rxe_pool *pool, u32 max, u32 min)
@@ -207,7 +217,7 @@
 
 	kref_init(&pool->ref_cnt);
 
-	spin_lock_init(&pool->pool_lock);
+	rwlock_init(&pool->pool_lock);
 
 	if (rxe_type_info[type].flags & RXE_POOL_INDEX) {
 		err = rxe_pool_init_index(pool,
@@ -222,7 +232,7 @@
 		pool->key_size = rxe_type_info[type].key_size;
 	}
 
-	pool->state = rxe_pool_valid;
+	pool->state = RXE_POOL_STATE_VALID;
 
 out:
 	return err;
@@ -232,7 +242,7 @@
 {
 	struct rxe_pool *pool = container_of(kref, struct rxe_pool, ref_cnt);
 
-	pool->state = rxe_pool_invalid;
+	pool->state = RXE_POOL_STATE_INVALID;
 	kfree(pool->table);
 }
 
@@ -241,20 +251,18 @@
 	kref_put(&pool->ref_cnt, rxe_pool_release);
 }
 
-int rxe_pool_cleanup(struct rxe_pool *pool)
+void rxe_pool_cleanup(struct rxe_pool *pool)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&pool->pool_lock, flags);
-	pool->state = rxe_pool_invalid;
+	write_lock_irqsave(&pool->pool_lock, flags);
+	pool->state = RXE_POOL_STATE_INVALID;
 	if (atomic_read(&pool->num_elem) > 0)
 		pr_warn("%s pool destroyed with unfree'd elem\n",
 			pool_name(pool));
-	spin_unlock_irqrestore(&pool->pool_lock, flags);
+	write_unlock_irqrestore(&pool->pool_lock, flags);
 
 	rxe_pool_put(pool);
-
-	return 0;
 }
 
 static u32 alloc_index(struct rxe_pool *pool)
@@ -336,10 +344,10 @@
 	struct rxe_pool *pool = elem->pool;
 	unsigned long flags;
 
-	spin_lock_irqsave(&pool->pool_lock, flags);
+	write_lock_irqsave(&pool->pool_lock, flags);
 	memcpy((u8 *)elem + pool->key_offset, key, pool->key_size);
 	insert_key(pool, elem);
-	spin_unlock_irqrestore(&pool->pool_lock, flags);
+	write_unlock_irqrestore(&pool->pool_lock, flags);
 }
 
 void rxe_drop_key(void *arg)
@@ -348,9 +356,9 @@
 	struct rxe_pool *pool = elem->pool;
 	unsigned long flags;
 
-	spin_lock_irqsave(&pool->pool_lock, flags);
+	write_lock_irqsave(&pool->pool_lock, flags);
 	rb_erase(&elem->node, &pool->tree);
-	spin_unlock_irqrestore(&pool->pool_lock, flags);
+	write_unlock_irqrestore(&pool->pool_lock, flags);
 }
 
 void rxe_add_index(void *arg)
@@ -359,10 +367,10 @@
 	struct rxe_pool *pool = elem->pool;
 	unsigned long flags;
 
-	spin_lock_irqsave(&pool->pool_lock, flags);
+	write_lock_irqsave(&pool->pool_lock, flags);
 	elem->index = alloc_index(pool);
 	insert_index(pool, elem);
-	spin_unlock_irqrestore(&pool->pool_lock, flags);
+	write_unlock_irqrestore(&pool->pool_lock, flags);
 }
 
 void rxe_drop_index(void *arg)
@@ -371,10 +379,10 @@
 	struct rxe_pool *pool = elem->pool;
 	unsigned long flags;
 
-	spin_lock_irqsave(&pool->pool_lock, flags);
+	write_lock_irqsave(&pool->pool_lock, flags);
 	clear_bit(elem->index - pool->min_index, pool->table);
 	rb_erase(&elem->node, &pool->tree);
-	spin_unlock_irqrestore(&pool->pool_lock, flags);
+	write_unlock_irqrestore(&pool->pool_lock, flags);
 }
 
 void *rxe_alloc(struct rxe_pool *pool)
@@ -384,37 +392,72 @@
 
 	might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC));
 
-	spin_lock_irqsave(&pool->pool_lock, flags);
-	if (pool->state != rxe_pool_valid) {
-		spin_unlock_irqrestore(&pool->pool_lock, flags);
+	read_lock_irqsave(&pool->pool_lock, flags);
+	if (pool->state != RXE_POOL_STATE_VALID) {
+		read_unlock_irqrestore(&pool->pool_lock, flags);
 		return NULL;
 	}
 	kref_get(&pool->ref_cnt);
-	spin_unlock_irqrestore(&pool->pool_lock, flags);
+	read_unlock_irqrestore(&pool->pool_lock, flags);
 
-	kref_get(&pool->rxe->ref_cnt);
+	if (!ib_device_try_get(&pool->rxe->ib_dev))
+		goto out_put_pool;
 
 	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
-		goto out_put_pool;
+		goto out_cnt;
 
 	elem = kmem_cache_zalloc(pool_cache(pool),
 				 (pool->flags & RXE_POOL_ATOMIC) ?
 				 GFP_ATOMIC : GFP_KERNEL);
 	if (!elem)
-		goto out_put_pool;
+		goto out_cnt;
 
 	elem->pool = pool;
 	kref_init(&elem->ref_cnt);
 
 	return elem;
 
-out_put_pool:
+out_cnt:
 	atomic_dec(&pool->num_elem);
-	rxe_dev_put(pool->rxe);
+	ib_device_put(&pool->rxe->ib_dev);
+out_put_pool:
 	rxe_pool_put(pool);
 	return NULL;
 }
 
+int rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem)
+{
+	unsigned long flags;
+
+	might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC));
+
+	read_lock_irqsave(&pool->pool_lock, flags);
+	if (pool->state != RXE_POOL_STATE_VALID) {
+		read_unlock_irqrestore(&pool->pool_lock, flags);
+		return -EINVAL;
+	}
+	kref_get(&pool->ref_cnt);
+	read_unlock_irqrestore(&pool->pool_lock, flags);
+
+	if (!ib_device_try_get(&pool->rxe->ib_dev))
+		goto out_put_pool;
+
+	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
+		goto out_cnt;
+
+	elem->pool = pool;
+	kref_init(&elem->ref_cnt);
+
+	return 0;
+
+out_cnt:
+	atomic_dec(&pool->num_elem);
+	ib_device_put(&pool->rxe->ib_dev);
+out_put_pool:
+	rxe_pool_put(pool);
+	return -EINVAL;
+}
+
 void rxe_elem_release(struct kref *kref)
 {
 	struct rxe_pool_entry *elem =
@@ -424,9 +467,10 @@
 	if (pool->cleanup)
 		pool->cleanup(elem);
 
-	kmem_cache_free(pool_cache(pool), elem);
+	if (!(pool->flags & RXE_POOL_NO_ALLOC))
+		kmem_cache_free(pool_cache(pool), elem);
 	atomic_dec(&pool->num_elem);
-	rxe_dev_put(pool->rxe);
+	ib_device_put(&pool->rxe->ib_dev);
 	rxe_pool_put(pool);
 }
 
@@ -436,9 +480,9 @@
 	struct rxe_pool_entry *elem = NULL;
 	unsigned long flags;
 
-	spin_lock_irqsave(&pool->pool_lock, flags);
+	read_lock_irqsave(&pool->pool_lock, flags);
 
-	if (pool->state != rxe_pool_valid)
+	if (pool->state != RXE_POOL_STATE_VALID)
 		goto out;
 
 	node = pool->tree.rb_node;
@@ -450,15 +494,14 @@
 			node = node->rb_left;
 		else if (elem->index < index)
 			node = node->rb_right;
-		else
+		else {
+			kref_get(&elem->ref_cnt);
 			break;
+		}
 	}
 
-	if (node)
-		kref_get(&elem->ref_cnt);
-
 out:
-	spin_unlock_irqrestore(&pool->pool_lock, flags);
+	read_unlock_irqrestore(&pool->pool_lock, flags);
 	return node ? elem : NULL;
 }
 
@@ -469,9 +512,9 @@
 	int cmp;
 	unsigned long flags;
 
-	spin_lock_irqsave(&pool->pool_lock, flags);
+	read_lock_irqsave(&pool->pool_lock, flags);
 
-	if (pool->state != rxe_pool_valid)
+	if (pool->state != RXE_POOL_STATE_VALID)
 		goto out;
 
 	node = pool->tree.rb_node;
@@ -494,6 +537,6 @@
 		kref_get(&elem->ref_cnt);
 
 out:
-	spin_unlock_irqrestore(&pool->pool_lock, flags);
+	read_unlock_irqrestore(&pool->pool_lock, flags);
 	return node ? elem : NULL;
 }
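
Converting pool_lock to an rwlock lets index and key lookups run concurrently on the hot receive path while inserts and erases still serialize. The split in miniature (demo_find() and demo_rb_insert() are stand-ins for the rb-tree walks):

/* Readers share the lock, writers exclude everyone; IRQ-saving
 * variants because pools are also touched from softirq context.
 */
static void *demo_lookup(struct rxe_pool *pool, u32 index)
{
	unsigned long flags;
	void *obj;

	read_lock_irqsave(&pool->pool_lock, flags);	/* shared */
	obj = demo_find(pool, index);
	read_unlock_irqrestore(&pool->pool_lock, flags);
	return obj;
}

static void demo_insert(struct rxe_pool *pool, struct rxe_pool_entry *e)
{
	unsigned long flags;

	write_lock_irqsave(&pool->pool_lock, flags);	/* exclusive */
	demo_rb_insert(pool, e);
	write_unlock_irqrestore(&pool->pool_lock, flags);
}
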
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
index 47df28e..2f2cff1 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.h
+++ b/drivers/infiniband/sw/rxe/rxe_pool.h
@@ -41,6 +41,7 @@
 	RXE_POOL_ATOMIC		= BIT(0),
 	RXE_POOL_INDEX		= BIT(1),
 	RXE_POOL_KEY		= BIT(2),
+	RXE_POOL_NO_ALLOC	= BIT(4),
 };
 
 enum rxe_elem_type {
@@ -74,8 +75,8 @@
 extern struct rxe_type_info rxe_type_info[];
 
 enum rxe_pool_state {
-	rxe_pool_invalid,
-	rxe_pool_valid,
+	RXE_POOL_STATE_INVALID,
+	RXE_POOL_STATE_VALID,
 };
 
 struct rxe_pool_entry {
@@ -90,7 +91,7 @@
 
 struct rxe_pool {
 	struct rxe_dev		*rxe;
-	spinlock_t              pool_lock; /* pool spinlock */
+	rwlock_t		pool_lock; /* protects pool add/del/search */
 	size_t			elem_size;
 	struct kref		ref_cnt;
 	void			(*cleanup)(struct rxe_pool_entry *obj);
@@ -126,11 +127,14 @@
 		  enum rxe_elem_type type, u32 max_elem);
 
 /* free resources from object pool */
-int rxe_pool_cleanup(struct rxe_pool *pool);
+void rxe_pool_cleanup(struct rxe_pool *pool);
 
 /* allocate an object from pool */
 void *rxe_alloc(struct rxe_pool *pool);
 
+/* connect already allocated object to pool */
+int rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem);
+
 /* assign an index to an indexed object and insert object into
  *  pool's rb tree
  */
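
RXE_POOL_NO_ALLOC marks pools whose objects the uverbs core now allocates itself (sized via the INIT_RDMA_OBJ_SIZE() entries added in rxe_verbs.c below), so rxe_add_to_pool() only wires up the embedded entry and rxe_elem_release() must skip kmem_cache_free() for them. The consumer shape, mirroring what rxe_alloc_pd() becomes:

/* Consumer shape for a RXE_POOL_NO_ALLOC pool: the object arrives
 * pre-allocated by the core in the driver's layout, so the driver
 * only links the embedded rxe_pool_entry.
 */
static int demo_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
{
	struct rxe_dev *rxe = to_rdev(ibpd->device);
	struct rxe_pd *pd = to_rpd(ibpd);	/* container_of wrapper */

	return rxe_add_to_pool(&rxe->pd_pool, &pd->pelem);
}
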
diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index c58452d..e2c6d1c 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -34,6 +34,8 @@
 #include <linux/skbuff.h>
 #include <linux/delay.h>
 #include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "rxe.h"
 #include "rxe_loc.h"
@@ -96,7 +98,7 @@
 		goto err1;
 
 	if (init->qp_type == IB_QPT_SMI || init->qp_type == IB_QPT_GSI) {
-		if (port_num != 1) {
+		if (!rdma_is_port_valid(&rxe->ib_dev, port_num)) {
 			pr_warn("invalid port = %d\n", port_num);
 			goto err1;
 		}
@@ -215,8 +217,7 @@
 }
 
 static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
-			   struct ib_qp_init_attr *init,
-			   struct ib_ucontext *context,
+			   struct ib_qp_init_attr *init, struct ib_udata *udata,
 			   struct rxe_create_qp_resp __user *uresp)
 {
 	int err;
@@ -227,6 +228,16 @@
 		return err;
 	qp->sk->sk->sk_user_data = qp;
 
+	/* pick a source UDP port number for this QP based on
+	 * the source QPN. this spreads traffic for different QPs
+	 * across different NIC RX queues (while using a single
+	 * flow for a given QP to maintain packet order).
+	 * the port number must be in the Dynamic Ports range
+	 * (0xc000 - 0xffff).
+	 */
+	qp->src_port = RXE_ROCE_V2_SPORT +
+		(hash_32_generic(qp_num(qp), 14) & 0x3fff);
+
 	qp->sq.max_wr		= init->cap.max_send_wr;
 	qp->sq.max_sge		= init->cap.max_send_sge;
 	qp->sq.max_inline	= init->cap.max_inline_data;
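
The source-port arithmetic is worth checking once: RXE_ROCE_V2_SPORT is 0xc000 and the hash is masked to 14 bits, so the largest possible result is 0xc000 + 0x3fff = 0xffff and every QP lands inside the dynamic-port range the comment cites. A runnable check, with mix32() as an arbitrary stand-in for the kernel hash:

#include <stdint.h>
#include <stdio.h>

#define RXE_ROCE_V2_SPORT 0xc000	/* the driver's fixed base port */

static uint32_t mix32(uint32_t v) { return v * 0x61C88647u; }

int main(void)
{
	for (uint32_t qpn = 0; qpn < (1u << 20); qpn++) {
		uint16_t sport = RXE_ROCE_V2_SPORT + (mix32(qpn) & 0x3fff);

		if (sport < 0xc000) {	/* impossible by construction */
			printf("out of range: %#x\n", sport);
			return 1;
		}
	}
	puts("all source ports in 0xc000-0xffff");
	return 0;
}
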
@@ -242,12 +253,12 @@
 	if (!qp->sq.queue)
 		return -ENOMEM;
 
-	err = do_mmap_info(rxe, uresp ? &uresp->sq_mi : NULL, context,
+	err = do_mmap_info(rxe, uresp ? &uresp->sq_mi : NULL, udata,
 			   qp->sq.queue->buf, qp->sq.queue->buf_size,
 			   &qp->sq.queue->ip);
 
 	if (err) {
-		kvfree(qp->sq.queue->buf);
+		vfree(qp->sq.queue->buf);
 		kfree(qp->sq.queue);
 		return err;
 	}
@@ -275,7 +286,7 @@
 
 static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp,
 			    struct ib_qp_init_attr *init,
-			    struct ib_ucontext *context,
+			    struct ib_udata *udata,
 			    struct rxe_create_qp_resp __user *uresp)
 {
 	int err;
@@ -296,11 +307,11 @@
 		if (!qp->rq.queue)
 			return -ENOMEM;
 
-		err = do_mmap_info(rxe, uresp ? &uresp->rq_mi : NULL, context,
+		err = do_mmap_info(rxe, uresp ? &uresp->rq_mi : NULL, udata,
 				   qp->rq.queue->buf, qp->rq.queue->buf_size,
 				   &qp->rq.queue->ip);
 		if (err) {
-			kvfree(qp->rq.queue->buf);
+			vfree(qp->rq.queue->buf);
 			kfree(qp->rq.queue);
 			return err;
 		}
@@ -325,13 +336,13 @@
 int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
 		     struct ib_qp_init_attr *init,
 		     struct rxe_create_qp_resp __user *uresp,
-		     struct ib_pd *ibpd)
+		     struct ib_pd *ibpd,
+		     struct ib_udata *udata)
 {
 	int err;
 	struct rxe_cq *rcq = to_rcq(init->recv_cq);
 	struct rxe_cq *scq = to_rcq(init->send_cq);
 	struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL;
-	struct ib_ucontext *context = ibpd->uobject ? ibpd->uobject->context : NULL;
 
 	rxe_add_ref(pd);
 	rxe_add_ref(rcq);
@@ -346,11 +357,11 @@
 
 	rxe_qp_init_misc(rxe, qp, init);
 
-	err = rxe_qp_init_req(rxe, qp, init, context, uresp);
+	err = rxe_qp_init_req(rxe, qp, init, udata, uresp);
 	if (err)
 		goto err1;
 
-	err = rxe_qp_init_resp(rxe, qp, init, context, uresp);
+	err = rxe_qp_init_resp(rxe, qp, init, udata, uresp);
 	if (err)
 		goto err2;
 
@@ -408,8 +419,7 @@
 	enum ib_qp_state new_state = (mask & IB_QP_STATE) ?
 					attr->qp_state : cur_state;
 
-	if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask,
-				IB_LINK_LAYER_ETHERNET)) {
+	if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask)) {
 		pr_warn("invalid mask or state for qp\n");
 		goto err1;
 	}
@@ -423,7 +433,7 @@
 	}
 
 	if (mask & IB_QP_PORT) {
-		if (attr->port_num != 1) {
+		if (!rdma_is_port_valid(&rxe->ib_dev, attr->port_num)) {
 			pr_warn("invalid port %d\n", attr->port_num);
 			goto err1;
 		}
@@ -438,7 +448,7 @@
 	if (mask & IB_QP_ALT_PATH) {
 		if (rxe_av_chk_attr(rxe, &attr->alt_ah_attr))
 			goto err1;
-		if (attr->alt_port_num != 1) {
+		if (!rdma_is_port_valid(&rxe->ib_dev, attr->alt_port_num)) {
 			pr_warn("invalid alt port %d\n", attr->alt_port_num);
 			goto err1;
 		}
@@ -620,14 +630,11 @@
 		qp->attr.qkey = attr->qkey;
 
 	if (mask & IB_QP_AV) {
-		rxe_av_from_attr(attr->port_num, &qp->pri_av, &attr->ah_attr);
-		rxe_av_fill_ip_info(&qp->pri_av, &attr->ah_attr);
+		rxe_init_av(&attr->ah_attr, &qp->pri_av);
 	}
 
 	if (mask & IB_QP_ALT_PATH) {
-		rxe_av_from_attr(attr->alt_port_num, &qp->alt_av,
-				 &attr->alt_ah_attr);
-		rxe_av_fill_ip_info(&qp->alt_av, &attr->alt_ah_attr);
+		rxe_init_av(&attr->alt_ah_attr, &qp->alt_av);
 		qp->attr.alt_port_num = attr->alt_port_num;
 		qp->attr.alt_pkey_index = attr->alt_pkey_index;
 		qp->attr.alt_timeout = attr->alt_timeout;
diff --git a/drivers/infiniband/sw/rxe/rxe_queue.c b/drivers/infiniband/sw/rxe/rxe_queue.c
index f84ab44..ff92704 100644
--- a/drivers/infiniband/sw/rxe/rxe_queue.c
+++ b/drivers/infiniband/sw/rxe/rxe_queue.c
@@ -36,18 +36,15 @@
 #include "rxe_loc.h"
 #include "rxe_queue.h"
 
-int do_mmap_info(struct rxe_dev *rxe,
-		 struct mminfo __user *outbuf,
-		 struct ib_ucontext *context,
-		 struct rxe_queue_buf *buf,
-		 size_t buf_size,
-		 struct rxe_mmap_info **ip_p)
+int do_mmap_info(struct rxe_dev *rxe, struct mminfo __user *outbuf,
+		 struct ib_udata *udata, struct rxe_queue_buf *buf,
+		 size_t buf_size, struct rxe_mmap_info **ip_p)
 {
 	int err;
 	struct rxe_mmap_info *ip = NULL;
 
 	if (outbuf) {
-		ip = rxe_create_mmap_info(rxe, buf_size, context, buf);
+		ip = rxe_create_mmap_info(rxe, buf_size, udata, buf);
 		if (!ip)
 			goto err1;
 
@@ -153,12 +150,9 @@
 	return 0;
 }
 
-int rxe_queue_resize(struct rxe_queue *q,
-		     unsigned int *num_elem_p,
-		     unsigned int elem_size,
-		     struct ib_ucontext *context,
-		     struct mminfo __user *outbuf,
-		     spinlock_t *producer_lock,
+int rxe_queue_resize(struct rxe_queue *q, unsigned int *num_elem_p,
+		     unsigned int elem_size, struct ib_udata *udata,
+		     struct mminfo __user *outbuf, spinlock_t *producer_lock,
 		     spinlock_t *consumer_lock)
 {
 	struct rxe_queue *new_q;
@@ -170,7 +164,7 @@
 	if (!new_q)
 		return -ENOMEM;
 
-	err = do_mmap_info(new_q->rxe, outbuf, context, new_q->buf,
+	err = do_mmap_info(new_q->rxe, outbuf, udata, new_q->buf,
 			   new_q->buf_size, &new_q->ip);
 	if (err) {
 		vfree(new_q->buf);
diff --git a/drivers/infiniband/sw/rxe/rxe_queue.h b/drivers/infiniband/sw/rxe/rxe_queue.h
index 79ba4b3..acd0a92 100644
--- a/drivers/infiniband/sw/rxe/rxe_queue.h
+++ b/drivers/infiniband/sw/rxe/rxe_queue.h
@@ -76,12 +76,9 @@
 	unsigned int		index_mask;
 };
 
-int do_mmap_info(struct rxe_dev *rxe,
-		 struct mminfo __user *outbuf,
-		 struct ib_ucontext *context,
-		 struct rxe_queue_buf *buf,
-		 size_t buf_size,
-		 struct rxe_mmap_info **ip_p);
+int do_mmap_info(struct rxe_dev *rxe, struct mminfo __user *outbuf,
+		 struct ib_udata *udata, struct rxe_queue_buf *buf,
+		 size_t buf_size, struct rxe_mmap_info **ip_p);
 
 void rxe_queue_reset(struct rxe_queue *q);
 
@@ -89,10 +86,8 @@
 				 int *num_elem,
 				 unsigned int elem_size);
 
-int rxe_queue_resize(struct rxe_queue *q,
-		     unsigned int *num_elem_p,
-		     unsigned int elem_size,
-		     struct ib_ucontext *context,
+int rxe_queue_resize(struct rxe_queue *q, unsigned int *num_elem_p,
+		     unsigned int elem_size, struct ib_udata *udata,
 		     struct mminfo __user *outbuf,
 		     /* Protect producers while resizing queue */
 		     spinlock_t *producer_lock,
diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c
index d30dbac..f9a492e 100644
--- a/drivers/infiniband/sw/rxe/rxe_recv.c
+++ b/drivers/infiniband/sw/rxe/rxe_recv.c
@@ -122,7 +122,7 @@
 			set_bad_pkey_cntr(port);
 			goto err1;
 		}
-	} else if (qpn != 0) {
+	} else {
 		if (unlikely(!pkey_match(pkey,
 					 port->pkey_tbl[qp->attr.pkey_index]
 					))) {
@@ -134,7 +134,7 @@
 	}
 
 	if ((qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_GSI) &&
-	    qpn != 0 && pkt->mask) {
+	    pkt->mask) {
 		u32 qkey = (qpn == 1) ? GSI_QKEY : qp->attr.qkey;
 
 		if (unlikely(deth_qkey(pkt) != qkey)) {
@@ -266,14 +266,12 @@
 	return -EINVAL;
 }
 
-static inline void rxe_rcv_pkt(struct rxe_dev *rxe,
-			       struct rxe_pkt_info *pkt,
-			       struct sk_buff *skb)
+static inline void rxe_rcv_pkt(struct rxe_pkt_info *pkt, struct sk_buff *skb)
 {
 	if (pkt->mask & RXE_REQ_MASK)
-		rxe_resp_queue_pkt(rxe, pkt->qp, skb);
+		rxe_resp_queue_pkt(pkt->qp, skb);
 	else
-		rxe_comp_queue_pkt(rxe, pkt->qp, skb);
+		rxe_comp_queue_pkt(pkt->qp, skb);
 }
 
 static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb)
@@ -319,7 +317,7 @@
 
 		pkt->qp = qp;
 		rxe_add_ref(qp);
-		rxe_rcv_pkt(rxe, pkt, skb);
+		rxe_rcv_pkt(pkt, skb);
 	}
 
 	spin_unlock_bh(&mcg->mcg_lock);
@@ -411,7 +409,7 @@
 	if (unlikely(bth_qpn(pkt) == IB_MULTICAST_QPN))
 		rxe_rcv_mcast_pkt(rxe, skb);
 	else
-		rxe_rcv_pkt(rxe, pkt, skb);
+		rxe_rcv_pkt(pkt, skb);
 
 	return;
 
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
index 8be2723..c5d9b55 100644
--- a/drivers/infiniband/sw/rxe/rxe_req.c
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -73,9 +73,6 @@
 	int npsn;
 	int first = 1;
 
-	wqe = queue_head(qp->sq.queue);
-	npsn = (qp->comp.psn - wqe->first_psn) & BTH_PSN_MASK;
-
 	qp->req.wqe_index	= consumer_index(qp->sq.queue);
 	qp->req.psn		= qp->comp.psn;
 	qp->req.opcode		= -1;
@@ -107,11 +104,17 @@
 		if (first) {
 			first = 0;
 
-			if (mask & WR_WRITE_OR_SEND_MASK)
+			if (mask & WR_WRITE_OR_SEND_MASK) {
+				npsn = (qp->comp.psn - wqe->first_psn) &
+					BTH_PSN_MASK;
 				retry_first_write_send(qp, wqe, mask, npsn);
+			}
 
-			if (mask & WR_READ_MASK)
+			if (mask & WR_READ_MASK) {
+				npsn = (wqe->dma.length - wqe->dma.resid) /
+					qp->mtu;
 				wqe->iova += npsn * qp->mtu;
+			}
 		}
 
 		wqe->state = wqe_state_posted;
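
The hunk above fixes the retry path by computing npsn only once the WQE class is known: for sends and writes it is the PSN distance (qp->comp.psn - wqe->first_psn) & BTH_PSN_MASK, but for a partially completed read it must be (dma.length - dma.resid) / mtu, the number of response packets already absorbed, so the retried read resumes its iova past the data that arrived. With made-up numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* hypothetical retried RDMA read */
	uint32_t length = 8192;		/* wqe->dma.length */
	uint32_t resid = 3072;		/* bytes still outstanding */
	uint32_t mtu = 1024;
	uint64_t iova = 0x100000;	/* original target address */

	uint32_t npsn = (length - resid) / mtu;	/* 5120 / 1024 = 5 */

	iova += (uint64_t)npsn * mtu;	/* resume at 0x101400 */
	printf("restart read: npsn=%u iova=%#llx\n",
	       npsn, (unsigned long long)iova);
	return 0;
}
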
@@ -435,7 +438,7 @@
 	if (pkt->mask & RXE_RETH_MASK) {
 		reth_set_rkey(pkt, ibwr->wr.rdma.rkey);
 		reth_set_va(pkt, wqe->iova);
-		reth_set_len(pkt, wqe->dma.length);
+		reth_set_len(pkt, wqe->dma.resid);
 	}
 
 	if (pkt->mask & RXE_IMMDT_MASK)
@@ -476,7 +479,7 @@
 	u32 *p;
 	int err;
 
-	err = rxe_prepare(rxe, pkt, skb, &crc);
+	err = rxe_prepare(pkt, skb, &crc);
 	if (err)
 		return err;
 
@@ -640,6 +643,7 @@
 			rmr->access = wqe->wr.wr.reg.access;
 			rmr->lkey = wqe->wr.wr.reg.key;
 			rmr->rkey = wqe->wr.wr.reg.key;
+			rmr->iova = wqe->wr.wr.reg.mr->iova;
 			wqe->state = wqe_state_done;
 			wqe->status = IB_WC_SUCCESS;
 		} else {
@@ -725,7 +729,7 @@
 	save_state(wqe, qp, &rollback_wqe, &rollback_psn);
 	update_wqe_state(qp, wqe, &pkt);
 	update_wqe_psn(qp, wqe, &pkt, payload);
-	ret = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp, &pkt, skb);
+	ret = rxe_xmit_packet(qp, &pkt, skb);
 	if (ret) {
 		qp->need_req_skb = 1;
 
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index fc6c880..1cbfbd9 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -104,8 +104,7 @@
 };
 
 /* rxe_recv calls here to add a request packet to the input queue */
-void rxe_resp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp,
-			struct sk_buff *skb)
+void rxe_resp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb)
 {
 	int must_sched;
 	struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
@@ -124,12 +123,9 @@
 	struct sk_buff *skb;
 
 	if (qp->resp.state == QP_STATE_ERROR) {
-		skb = skb_dequeue(&qp->req_pkts);
-		if (skb) {
-			/* drain request packet queue */
+		while ((skb = skb_dequeue(&qp->req_pkts))) {
 			rxe_drop_ref(qp);
 			kfree_skb(skb);
-			return RESPST_GET_REQ;
 		}
 
 		/* go drain recv wr queue */
@@ -435,6 +431,7 @@
 			qp->resp.va = reth_va(pkt);
 			qp->resp.rkey = reth_rkey(pkt);
 			qp->resp.resid = reth_len(pkt);
+			qp->resp.length = reth_len(pkt);
 		}
 		access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ
 						     : IB_ACCESS_REMOTE_WRITE;
@@ -637,7 +634,7 @@
 	if (ack->mask & RXE_ATMACK_MASK)
 		atmack_set_orig(ack, qp->resp.atomic_orig);
 
-	err = rxe_prepare(rxe, ack, skb, &crc);
+	err = rxe_prepare(ack, skb, &crc);
 	if (err) {
 		kfree_skb(skb);
 		return NULL;
@@ -660,7 +657,6 @@
 static enum resp_states read_reply(struct rxe_qp *qp,
 				   struct rxe_pkt_info *req_pkt)
 {
-	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
 	struct rxe_pkt_info ack_pkt;
 	struct sk_buff *skb;
 	int mtu = qp->mtu;
@@ -739,7 +735,7 @@
 	p = payload_addr(&ack_pkt) + payload + bth_pad(&ack_pkt);
 	*p = ~icrc;
 
-	err = rxe_xmit_packet(rxe, qp, &ack_pkt, skb);
+	err = rxe_xmit_packet(qp, &ack_pkt, skb);
 	if (err) {
 		pr_err("Failed sending RDMA reply.\n");
 		return RESPST_ERR_RNR;
@@ -838,23 +834,32 @@
 	struct ib_wc *wc = &cqe.ibwc;
 	struct ib_uverbs_wc *uwc = &cqe.uibwc;
 	struct rxe_recv_wqe *wqe = qp->resp.wqe;
+	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
 
 	if (unlikely(!wqe))
 		return RESPST_CLEANUP;
 
 	memset(&cqe, 0, sizeof(cqe));
 
-	wc->wr_id		= wqe->wr_id;
-	wc->status		= qp->resp.status;
-	wc->qp			= &qp->ibqp;
+	if (qp->rcq->is_user) {
+		uwc->status             = qp->resp.status;
+		uwc->qp_num             = qp->ibqp.qp_num;
+		uwc->wr_id              = wqe->wr_id;
+	} else {
+		wc->status              = qp->resp.status;
+		wc->qp                  = &qp->ibqp;
+		wc->wr_id               = wqe->wr_id;
+	}
 
-	/* fields after status are not required for errors */
 	if (wc->status == IB_WC_SUCCESS) {
+		rxe_counter_inc(rxe, RXE_CNT_RDMA_RECV);
 		wc->opcode = (pkt->mask & RXE_IMMDT_MASK &&
 				pkt->mask & RXE_WRITE_MASK) ?
 					IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV;
 		wc->vendor_err = 0;
-		wc->byte_len = wqe->dma.length - wqe->dma.resid;
+		wc->byte_len = (pkt->mask & RXE_IMMDT_MASK &&
+				pkt->mask & RXE_WRITE_MASK) ?
+					qp->resp.length : wqe->dma.length - wqe->dma.resid;
 
 		/* fields after byte_len are different between kernel and user
 		 * space
@@ -898,7 +903,6 @@
 			}
 
 			if (pkt->mask & RXE_IETH_MASK) {
-				struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
 				struct rxe_mem *rmr;
 
 				wc->wc_flags |= IB_WC_WITH_INVALIDATE;
@@ -950,7 +954,6 @@
 	int err = 0;
 	struct rxe_pkt_info ack_pkt;
 	struct sk_buff *skb;
-	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
 
 	skb = prepare_ack_packet(qp, pkt, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE,
 				 0, psn, syndrome, NULL);
@@ -959,7 +962,7 @@
 		goto err1;
 	}
 
-	err = rxe_xmit_packet(rxe, qp, &ack_pkt, skb);
+	err = rxe_xmit_packet(qp, &ack_pkt, skb);
 	if (err)
 		pr_err_ratelimited("Failed sending ack\n");
 
@@ -973,7 +976,6 @@
 	int rc = 0;
 	struct rxe_pkt_info ack_pkt;
 	struct sk_buff *skb;
-	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
 	struct resp_res *res;
 
 	skb = prepare_ack_packet(qp, pkt, &ack_pkt,
@@ -1001,7 +1003,7 @@
 	res->last_psn  = ack_pkt.psn;
 	res->cur_psn   = ack_pkt.psn;
 
-	rc = rxe_xmit_packet(rxe, qp, &ack_pkt, skb);
+	rc = rxe_xmit_packet(qp, &ack_pkt, skb);
 	if (rc) {
 		pr_err_ratelimited("Failed sending ack\n");
 		rxe_drop_ref(qp);
@@ -1131,8 +1133,7 @@
 		if (res) {
 			skb_get(res->atomic.skb);
 			/* Resend the result. */
-			rc = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp,
-					     pkt, res->atomic.skb);
+			rc = rxe_xmit_packet(qp, pkt, res->atomic.skb);
 			if (rc) {
 				pr_err("Failed resending result. This flow is not handled - skb ignored\n");
 				rc = RESPST_CLEANUP;
diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c
index 0d6c04b..d845943 100644
--- a/drivers/infiniband/sw/rxe/rxe_srq.c
+++ b/drivers/infiniband/sw/rxe/rxe_srq.c
@@ -31,6 +31,7 @@
  * SOFTWARE.
  */
 
+#include <linux/vmalloc.h>
 #include "rxe.h"
 #include "rxe_loc.h"
 #include "rxe_queue.h"
@@ -98,8 +99,7 @@
 }
 
 int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq,
-		      struct ib_srq_init_attr *init,
-		      struct ib_ucontext *context,
+		      struct ib_srq_init_attr *init, struct ib_udata *udata,
 		      struct rxe_create_srq_resp __user *uresp)
 {
 	int err;
@@ -127,15 +127,20 @@
 
 	srq->rq.queue = q;
 
-	err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, context, q->buf,
+	err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, udata, q->buf,
 			   q->buf_size, &q->ip);
-	if (err)
+	if (err) {
+		vfree(q->buf);
+		kfree(q);
 		return err;
+	}
 
 	if (uresp) {
 		if (copy_to_user(&uresp->srq_num, &srq->srq_num,
-				 sizeof(uresp->srq_num)))
+				 sizeof(uresp->srq_num))) {
+			rxe_queue_cleanup(q);
 			return -EFAULT;
+		}
 	}
 
 	return 0;
@@ -143,7 +148,7 @@
 
 int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
 		      struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
-		      struct rxe_modify_srq_cmd *ucmd)
+		      struct rxe_modify_srq_cmd *ucmd, struct ib_udata *udata)
 {
 	int err;
 	struct rxe_queue *q = srq->rq.queue;
@@ -157,11 +162,8 @@
 		mi = u64_to_user_ptr(ucmd->mmap_info_addr);
 
 		err = rxe_queue_resize(q, &attr->max_wr,
-				       rcv_wqe_size(srq->rq.max_sge),
-				       srq->rq.queue->ip ?
-						srq->rq.queue->ip->context :
-						NULL,
-				       mi, &srq->rq.producer_lock,
+				       rcv_wqe_size(srq->rq.max_sge), udata, mi,
+				       &srq->rq.producer_lock,
 				       &srq->rq.consumer_lock);
 		if (err)
 			goto err2;
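
The SRQ changes close two leaks: when do_mmap_info() fails nothing has registered the queue buffer yet, so it is freed by hand, and once it has succeeded a failed copy_to_user() must unwind through rxe_queue_cleanup(), which also releases the mmap info. The ordering as a sketch, with an illustrative function name:

/* Unwind ordering in isolation: before do_mmap_info() succeeds,
 * free the vmalloc'ed buffer and the queue by hand; afterwards
 * rxe_queue_cleanup() owns the teardown.
 */
static int demo_setup(struct rxe_dev *rxe, struct rxe_queue *q,
		      struct mminfo __user *outbuf, struct ib_udata *udata)
{
	int err = do_mmap_info(rxe, outbuf, udata, q->buf, q->buf_size,
			       &q->ip);
	if (err) {
		vfree(q->buf);
		kfree(q);
		return err;
	}

	/* any later failure goes through rxe_queue_cleanup(q) */
	return 0;
}
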
diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c
index d5ed757..ccda5f5 100644
--- a/drivers/infiniband/sw/rxe/rxe_sysfs.c
+++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c
@@ -53,62 +53,42 @@
 	return len;
 }
 
-static void rxe_set_port_state(struct net_device *ndev)
-{
-	struct rxe_dev *rxe = net_to_rxe(ndev);
-	bool is_up = netif_running(ndev) && netif_carrier_ok(ndev);
-
-	if (!rxe)
-		goto out;
-
-	if (is_up)
-		rxe_port_up(rxe);
-	else
-		rxe_port_down(rxe); /* down for unknown state */
-out:
-	return;
-}
-
 static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
 {
 	int len;
 	int err = 0;
 	char intf[32];
-	struct net_device *ndev = NULL;
-	struct rxe_dev *rxe;
+	struct net_device *ndev;
+	struct rxe_dev *exists;
 
 	len = sanitize_arg(val, intf, sizeof(intf));
 	if (!len) {
 		pr_err("add: invalid interface name\n");
-		err = -EINVAL;
-		goto err;
+		return -EINVAL;
 	}
 
 	ndev = dev_get_by_name(&init_net, intf);
 	if (!ndev) {
 		pr_err("interface %s not found\n", intf);
-		err = -EINVAL;
-		goto err;
+		return -EINVAL;
 	}
 
-	if (net_to_rxe(ndev)) {
+	exists = rxe_get_dev_from_net(ndev);
+	if (exists) {
+		ib_device_put(&exists->ib_dev);
 		pr_err("already configured on %s\n", intf);
 		err = -EINVAL;
 		goto err;
 	}
 
-	rxe = rxe_net_add(ndev);
-	if (!rxe) {
+	err = rxe_net_add("rxe%d", ndev);
+	if (err) {
 		pr_err("failed to add %s\n", intf);
-		err = -EINVAL;
 		goto err;
 	}
 
-	rxe_set_port_state(ndev);
-	pr_info("added %s to %s\n", rxe->ib_dev.name, intf);
 err:
-	if (ndev)
-		dev_put(ndev);
+	dev_put(ndev);
 	return err;
 }
 
@@ -116,7 +96,7 @@
 {
 	int len;
 	char intf[32];
-	struct rxe_dev *rxe;
+	struct ib_device *ib_dev;
 
 	len = sanitize_arg(val, intf, sizeof(intf));
 	if (!len) {
@@ -126,19 +106,17 @@
 
 	if (strncmp("all", intf, len) == 0) {
 		pr_info("rxe_sys: remove all");
-		rxe_remove_all();
+		ib_unregister_driver(RDMA_DRIVER_RXE);
 		return 0;
 	}
 
-	rxe = get_rxe_by_name(intf);
-
-	if (!rxe) {
+	ib_dev = ib_device_get_by_name(intf, RDMA_DRIVER_RXE);
+	if (!ib_dev) {
 		pr_err("not configured on %s\n", intf);
 		return -EINVAL;
 	}
 
-	list_del(&rxe->list);
-	rxe_remove(rxe);
+	ib_unregister_device_and_put(ib_dev);
 
 	return 0;
 }
@@ -152,6 +130,6 @@
 };
 
 module_param_cb(add, &rxe_add_ops, NULL, 0200);
-MODULE_PARM_DESC(add, "Create RXE device over network interface");
+MODULE_PARM_DESC(add, "DEPRECATED.  Create RXE device over network interface");
 module_param_cb(remove, &rxe_remove_ops, NULL, 0200);
-MODULE_PARM_DESC(remove, "Remove RXE device over network interface");
+MODULE_PARM_DESC(remove, "DEPRECATED.  Remove RXE device over network interface");
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index f5b1e0a..623129f 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -33,6 +33,7 @@
 
 #include <linux/dma-mapping.h>
 #include <net/addrconf.h>
+#include <rdma/uverbs_ioctl.h>
 #include "rxe.h"
 #include "rxe_loc.h"
 #include "rxe_queue.h"
@@ -56,12 +57,7 @@
 {
 	struct rxe_dev *rxe = to_rdev(dev);
 	struct rxe_port *port;
-	int rc = -EINVAL;
-
-	if (unlikely(port_num != 1)) {
-		pr_warn("invalid port_number %d\n", port_num);
-		goto out;
-	}
+	int rc;
 
 	port = &rxe->port;
 
@@ -71,37 +67,25 @@
 	mutex_lock(&rxe->usdev_lock);
 	rc = ib_get_eth_speed(dev, port_num, &attr->active_speed,
 			      &attr->active_width);
+
+	if (attr->state == IB_PORT_ACTIVE)
+		attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
+	else if (dev_get_flags(rxe->ndev) & IFF_UP)
+		attr->phys_state = IB_PORT_PHYS_STATE_POLLING;
+	else
+		attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
+
 	mutex_unlock(&rxe->usdev_lock);
 
-out:
 	return rc;
 }
 
-static struct net_device *rxe_get_netdev(struct ib_device *device,
-					 u8 port_num)
-{
-	struct rxe_dev *rxe = to_rdev(device);
-
-	if (rxe->ndev) {
-		dev_hold(rxe->ndev);
-		return rxe->ndev;
-	}
-
-	return NULL;
-}
-
 static int rxe_query_pkey(struct ib_device *device,
 			  u8 port_num, u16 index, u16 *pkey)
 {
 	struct rxe_dev *rxe = to_rdev(device);
 	struct rxe_port *port;
 
-	if (unlikely(port_num != 1)) {
-		dev_warn(device->dev.parent, "invalid port_num = %d\n",
-			 port_num);
-		goto err1;
-	}
-
 	port = &rxe->port;
 
 	if (unlikely(index >= port->attr.pkey_tbl_len)) {
@@ -139,11 +123,6 @@
 	struct rxe_dev *rxe = to_rdev(dev);
 	struct rxe_port *port;
 
-	if (unlikely(port_num != 1)) {
-		pr_warn("invalid port_num = %d\n", port_num);
-		goto err1;
-	}
-
 	port = &rxe->port;
 
 	port->attr.port_cap_flags |= attr->set_port_cap_mask;
@@ -153,9 +132,6 @@
 		port->attr.qkey_viol_cntr = 0;
 
 	return 0;
-
-err1:
-	return -EINVAL;
 }
 
 static enum rdma_link_layer rxe_get_link_layer(struct ib_device *dev,
@@ -166,22 +142,19 @@
 	return rxe_link_layer(rxe, port_num);
 }
 
-static struct ib_ucontext *rxe_alloc_ucontext(struct ib_device *dev,
-					      struct ib_udata *udata)
+static int rxe_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 {
-	struct rxe_dev *rxe = to_rdev(dev);
-	struct rxe_ucontext *uc;
+	struct rxe_dev *rxe = to_rdev(uctx->device);
+	struct rxe_ucontext *uc = to_ruc(uctx);
 
-	uc = rxe_alloc(&rxe->uc_pool);
-	return uc ? &uc->ibuc : ERR_PTR(-ENOMEM);
+	return rxe_add_to_pool(&rxe->uc_pool, &uc->pelem);
 }
 
-static int rxe_dealloc_ucontext(struct ib_ucontext *ibuc)
+static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc)
 {
 	struct rxe_ucontext *uc = to_ruc(ibuc);
 
 	rxe_drop_ref(uc);
-	return 0;
 }
 
 static int rxe_port_immutable(struct ib_device *dev, u8 port_num,
@@ -203,55 +176,39 @@
 	return 0;
 }
 
-static struct ib_pd *rxe_alloc_pd(struct ib_device *dev,
-				  struct ib_ucontext *context,
-				  struct ib_udata *udata)
+static int rxe_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
-	struct rxe_dev *rxe = to_rdev(dev);
-	struct rxe_pd *pd;
+	struct rxe_dev *rxe = to_rdev(ibpd->device);
+	struct rxe_pd *pd = to_rpd(ibpd);
 
-	pd = rxe_alloc(&rxe->pd_pool);
-	return pd ? &pd->ibpd : ERR_PTR(-ENOMEM);
+	return rxe_add_to_pool(&rxe->pd_pool, &pd->pelem);
 }
 
-static int rxe_dealloc_pd(struct ib_pd *ibpd)
+static void rxe_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
 	struct rxe_pd *pd = to_rpd(ibpd);
 
 	rxe_drop_ref(pd);
-	return 0;
 }
 
-static void rxe_init_av(struct rxe_dev *rxe, struct rdma_ah_attr *attr,
-			struct rxe_av *av)
-{
-	rxe_av_from_attr(rdma_ah_get_port_num(attr), av, attr);
-	rxe_av_fill_ip_info(av, attr);
-}
-
-static struct ib_ah *rxe_create_ah(struct ib_pd *ibpd,
-				   struct rdma_ah_attr *attr,
-				   struct ib_udata *udata)
+static int rxe_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr,
+			 u32 flags, struct ib_udata *udata)
 
 {
 	int err;
-	struct rxe_dev *rxe = to_rdev(ibpd->device);
-	struct rxe_pd *pd = to_rpd(ibpd);
-	struct rxe_ah *ah;
+	struct rxe_dev *rxe = to_rdev(ibah->device);
+	struct rxe_ah *ah = to_rah(ibah);
 
 	err = rxe_av_chk_attr(rxe, attr);
 	if (err)
-		return ERR_PTR(err);
+		return err;
 
-	ah = rxe_alloc(&rxe->ah_pool);
-	if (!ah)
-		return ERR_PTR(-ENOMEM);
+	err = rxe_add_to_pool(&rxe->ah_pool, &ah->pelem);
+	if (err)
+		return err;
 
-	rxe_add_ref(pd);
-	ah->pd = pd;
-
-	rxe_init_av(rxe, attr, &ah->av);
-	return &ah->ibah;
+	rxe_init_av(attr, &ah->av);
+	return 0;
 }
 
 static int rxe_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr)
@@ -264,7 +221,7 @@
 	if (err)
 		return err;
 
-	rxe_init_av(rxe, attr, &ah->av);
+	rxe_init_av(attr, &ah->av);
 	return 0;
 }
 
@@ -278,13 +235,11 @@
 	return 0;
 }
 
-static int rxe_destroy_ah(struct ib_ah *ibah)
+static void rxe_destroy_ah(struct ib_ah *ibah, u32 flags)
 {
 	struct rxe_ah *ah = to_rah(ibah);
 
-	rxe_drop_ref(ah->pd);
 	rxe_drop_ref(ah);
-	return 0;
 }
 
 static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr)
@@ -334,20 +289,18 @@
 	return err;
 }
 
-static struct ib_srq *rxe_create_srq(struct ib_pd *ibpd,
-				     struct ib_srq_init_attr *init,
-				     struct ib_udata *udata)
+static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init,
+			  struct ib_udata *udata)
 {
 	int err;
-	struct rxe_dev *rxe = to_rdev(ibpd->device);
-	struct rxe_pd *pd = to_rpd(ibpd);
-	struct rxe_srq *srq;
-	struct ib_ucontext *context = udata ? ibpd->uobject->context : NULL;
+	struct rxe_dev *rxe = to_rdev(ibsrq->device);
+	struct rxe_pd *pd = to_rpd(ibsrq->pd);
+	struct rxe_srq *srq = to_rsrq(ibsrq);
 	struct rxe_create_srq_resp __user *uresp = NULL;
 
 	if (udata) {
 		if (udata->outlen < sizeof(*uresp))
-			return ERR_PTR(-EINVAL);
+			return -EINVAL;
 		uresp = udata->outbuf;
 	}
 
@@ -355,28 +308,24 @@
 	if (err)
 		goto err1;
 
-	srq = rxe_alloc(&rxe->srq_pool);
-	if (!srq) {
-		err = -ENOMEM;
+	err = rxe_add_to_pool(&rxe->srq_pool, &srq->pelem);
+	if (err)
 		goto err1;
-	}
 
-	rxe_add_index(srq);
 	rxe_add_ref(pd);
 	srq->pd = pd;
 
-	err = rxe_srq_from_init(rxe, srq, init, context, uresp);
+	err = rxe_srq_from_init(rxe, srq, init, udata, uresp);
 	if (err)
 		goto err2;
 
-	return &srq->ibsrq;
+	return 0;
 
 err2:
 	rxe_drop_ref(pd);
-	rxe_drop_index(srq);
 	rxe_drop_ref(srq);
 err1:
-	return ERR_PTR(err);
+	return err;
 }
 
 static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
@@ -401,7 +350,7 @@
 	if (err)
 		goto err1;
 
-	err = rxe_srq_from_attr(rxe, srq, attr, mask, &ucmd);
+	err = rxe_srq_from_attr(rxe, srq, attr, mask, &ucmd, udata);
 	if (err)
 		goto err1;
 
@@ -424,7 +373,7 @@
 	return 0;
 }
 
-static int rxe_destroy_srq(struct ib_srq *ibsrq)
+static void rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 {
 	struct rxe_srq *srq = to_rsrq(ibsrq);
 
@@ -432,10 +381,7 @@
 		rxe_queue_cleanup(srq->rq.queue);
 
 	rxe_drop_ref(srq->pd);
-	rxe_drop_index(srq);
 	rxe_drop_ref(srq);
-
-	return 0;
 }
 
 static int rxe_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
@@ -498,7 +444,7 @@
 
 	rxe_add_index(qp);
 
-	err = rxe_qp_from_init(rxe, qp, pd, init, uresp, ibpd);
+	err = rxe_qp_from_init(rxe, qp, pd, init, uresp, ibpd, udata);
 	if (err)
 		goto err3;
 
@@ -544,7 +490,7 @@
 	return 0;
 }
 
-static int rxe_destroy_qp(struct ib_qp *ibqp)
+static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 {
 	struct rxe_qp *qp = to_rqp(ibqp);
 
@@ -832,56 +778,43 @@
 	return err;
 }
 
-static struct ib_cq *rxe_create_cq(struct ib_device *dev,
-				   const struct ib_cq_init_attr *attr,
-				   struct ib_ucontext *context,
-				   struct ib_udata *udata)
+static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+			 struct ib_udata *udata)
 {
 	int err;
+	struct ib_device *dev = ibcq->device;
 	struct rxe_dev *rxe = to_rdev(dev);
-	struct rxe_cq *cq;
+	struct rxe_cq *cq = to_rcq(ibcq);
 	struct rxe_create_cq_resp __user *uresp = NULL;
 
 	if (udata) {
 		if (udata->outlen < sizeof(*uresp))
-			return ERR_PTR(-EINVAL);
+			return -EINVAL;
 		uresp = udata->outbuf;
 	}
 
 	if (attr->flags)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector);
 	if (err)
-		goto err1;
+		return err;
 
-	cq = rxe_alloc(&rxe->cq_pool);
-	if (!cq) {
-		err = -ENOMEM;
-		goto err1;
-	}
-
-	err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector,
-			       context, uresp);
+	err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, udata,
+			       uresp);
 	if (err)
-		goto err2;
+		return err;
 
-	return &cq->ibcq;
-
-err2:
-	rxe_drop_ref(cq);
-err1:
-	return ERR_PTR(err);
+	return rxe_add_to_pool(&rxe->cq_pool, &cq->pelem);
 }
 
-static int rxe_destroy_cq(struct ib_cq *ibcq)
+static void rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
 	struct rxe_cq *cq = to_rcq(ibcq);
 
 	rxe_cq_disable(cq);
 
 	rxe_drop_ref(cq);
-	return 0;
 }
 
 static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
@@ -901,7 +834,7 @@
 	if (err)
 		goto err1;
 
-	err = rxe_cq_resize_queue(cq, cqe, uresp);
+	err = rxe_cq_resize_queue(cq, cqe, uresp, udata);
 	if (err)
 		goto err1;
 
@@ -1025,7 +958,7 @@
 	return ERR_PTR(err);
 }
 
-static int rxe_dereg_mr(struct ib_mr *ibmr)
+static int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
 	struct rxe_mem *mr = to_rmr(ibmr);
 
@@ -1036,9 +969,8 @@
 	return 0;
 }
 
-static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd,
-				  enum ib_mr_type mr_type,
-				  u32 max_num_sg)
+static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
+				  u32 max_num_sg, struct ib_udata *udata)
 {
 	struct rxe_dev *rxe = to_rdev(ibpd->device);
 	struct rxe_pd *pd = to_rpd(ibpd);
@@ -1140,29 +1072,97 @@
 static ssize_t parent_show(struct device *device,
 			   struct device_attribute *attr, char *buf)
 {
-	struct rxe_dev *rxe = container_of(device, struct rxe_dev,
-					   ib_dev.dev);
+	struct rxe_dev *rxe =
+		rdma_device_to_drv_device(device, struct rxe_dev, ib_dev);
 
 	return snprintf(buf, 16, "%s\n", rxe_parent_name(rxe, 1));
 }
 
 static DEVICE_ATTR_RO(parent);
 
-static struct device_attribute *rxe_dev_attributes[] = {
-	&dev_attr_parent,
+static struct attribute *rxe_dev_attributes[] = {
+	&dev_attr_parent.attr,
+	NULL
 };
 
-int rxe_register_device(struct rxe_dev *rxe)
+static const struct attribute_group rxe_attr_group = {
+	.attrs = rxe_dev_attributes,
+};
+
+static int rxe_enable_driver(struct ib_device *ib_dev)
+{
+	struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev);
+
+	rxe_set_port_state(rxe);
+	dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(rxe->ndev));
+	return 0;
+}
+
+static const struct ib_device_ops rxe_dev_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_RXE,
+	.uverbs_abi_ver = RXE_UVERBS_ABI_VERSION,
+
+	.alloc_hw_stats = rxe_ib_alloc_hw_stats,
+	.alloc_mr = rxe_alloc_mr,
+	.alloc_pd = rxe_alloc_pd,
+	.alloc_ucontext = rxe_alloc_ucontext,
+	.attach_mcast = rxe_attach_mcast,
+	.create_ah = rxe_create_ah,
+	.create_cq = rxe_create_cq,
+	.create_qp = rxe_create_qp,
+	.create_srq = rxe_create_srq,
+	.dealloc_driver = rxe_dealloc,
+	.dealloc_pd = rxe_dealloc_pd,
+	.dealloc_ucontext = rxe_dealloc_ucontext,
+	.dereg_mr = rxe_dereg_mr,
+	.destroy_ah = rxe_destroy_ah,
+	.destroy_cq = rxe_destroy_cq,
+	.destroy_qp = rxe_destroy_qp,
+	.destroy_srq = rxe_destroy_srq,
+	.detach_mcast = rxe_detach_mcast,
+	.enable_driver = rxe_enable_driver,
+	.get_dma_mr = rxe_get_dma_mr,
+	.get_hw_stats = rxe_ib_get_hw_stats,
+	.get_link_layer = rxe_get_link_layer,
+	.get_port_immutable = rxe_port_immutable,
+	.map_mr_sg = rxe_map_mr_sg,
+	.mmap = rxe_mmap,
+	.modify_ah = rxe_modify_ah,
+	.modify_device = rxe_modify_device,
+	.modify_port = rxe_modify_port,
+	.modify_qp = rxe_modify_qp,
+	.modify_srq = rxe_modify_srq,
+	.peek_cq = rxe_peek_cq,
+	.poll_cq = rxe_poll_cq,
+	.post_recv = rxe_post_recv,
+	.post_send = rxe_post_send,
+	.post_srq_recv = rxe_post_srq_recv,
+	.query_ah = rxe_query_ah,
+	.query_device = rxe_query_device,
+	.query_pkey = rxe_query_pkey,
+	.query_port = rxe_query_port,
+	.query_qp = rxe_query_qp,
+	.query_srq = rxe_query_srq,
+	.reg_user_mr = rxe_reg_user_mr,
+	.req_notify_cq = rxe_req_notify_cq,
+	.resize_cq = rxe_resize_cq,
+
+	INIT_RDMA_OBJ_SIZE(ib_ah, rxe_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_cq, rxe_cq, ibcq),
+	INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc),
+};
+
+int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
 {
 	int err;
-	int i;
 	struct ib_device *dev = &rxe->ib_dev;
 	struct crypto_shash *tfm;
 
-	strlcpy(dev->name, "rxe%d", IB_DEVICE_NAME_MAX);
 	strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc));
 
-	dev->owner = THIS_MODULE;
 	dev->node_type = RDMA_NODE_IB_CA;
 	dev->phys_port_cnt = 1;
 	dev->num_comp_vectors = num_possible_cpus();
@@ -1174,7 +1174,6 @@
 	dma_coerce_mask_and_coherent(&dev->dev,
 				     dma_get_required_mask(&dev->dev));
 
-	dev->uverbs_abi_ver = RXE_UVERBS_ABI_VERSION;
 	dev->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT)
 	    | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)
 	    | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE)
@@ -1208,49 +1207,10 @@
 	    | BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST)
 	    ;
 
-	dev->query_device = rxe_query_device;
-	dev->modify_device = rxe_modify_device;
-	dev->query_port = rxe_query_port;
-	dev->modify_port = rxe_modify_port;
-	dev->get_link_layer = rxe_get_link_layer;
-	dev->get_netdev = rxe_get_netdev;
-	dev->query_pkey = rxe_query_pkey;
-	dev->alloc_ucontext = rxe_alloc_ucontext;
-	dev->dealloc_ucontext = rxe_dealloc_ucontext;
-	dev->mmap = rxe_mmap;
-	dev->get_port_immutable = rxe_port_immutable;
-	dev->alloc_pd = rxe_alloc_pd;
-	dev->dealloc_pd = rxe_dealloc_pd;
-	dev->create_ah = rxe_create_ah;
-	dev->modify_ah = rxe_modify_ah;
-	dev->query_ah = rxe_query_ah;
-	dev->destroy_ah = rxe_destroy_ah;
-	dev->create_srq = rxe_create_srq;
-	dev->modify_srq = rxe_modify_srq;
-	dev->query_srq = rxe_query_srq;
-	dev->destroy_srq = rxe_destroy_srq;
-	dev->post_srq_recv = rxe_post_srq_recv;
-	dev->create_qp = rxe_create_qp;
-	dev->modify_qp = rxe_modify_qp;
-	dev->query_qp = rxe_query_qp;
-	dev->destroy_qp = rxe_destroy_qp;
-	dev->post_send = rxe_post_send;
-	dev->post_recv = rxe_post_recv;
-	dev->create_cq = rxe_create_cq;
-	dev->destroy_cq = rxe_destroy_cq;
-	dev->resize_cq = rxe_resize_cq;
-	dev->poll_cq = rxe_poll_cq;
-	dev->peek_cq = rxe_peek_cq;
-	dev->req_notify_cq = rxe_req_notify_cq;
-	dev->get_dma_mr = rxe_get_dma_mr;
-	dev->reg_user_mr = rxe_reg_user_mr;
-	dev->dereg_mr = rxe_dereg_mr;
-	dev->alloc_mr = rxe_alloc_mr;
-	dev->map_mr_sg = rxe_map_mr_sg;
-	dev->attach_mcast = rxe_attach_mcast;
-	dev->detach_mcast = rxe_detach_mcast;
-	dev->get_hw_stats = rxe_ib_get_hw_stats;
-	dev->alloc_hw_stats = rxe_ib_alloc_hw_stats;
+	ib_set_device_ops(dev, &rxe_dev_ops);
+	err = ib_device_set_netdev(&rxe->ib_dev, rxe->ndev, 1);
+	if (err)
+		return err;
 
 	tfm = crypto_alloc_shash("crc32", 0, 0);
 	if (IS_ERR(tfm)) {
@@ -1260,41 +1220,14 @@
 	}
 	rxe->tfm = tfm;
 
-	dev->driver_id = RDMA_DRIVER_RXE;
-	err = ib_register_device(dev, NULL);
-	if (err) {
+	rdma_set_device_sysfs_group(dev, &rxe_attr_group);
+	err = ib_register_device(dev, ibdev_name);
+	if (err)
 		pr_warn("%s failed with error %d\n", __func__, err);
-		goto err1;
-	}
 
-	for (i = 0; i < ARRAY_SIZE(rxe_dev_attributes); ++i) {
-		err = device_create_file(&dev->dev, rxe_dev_attributes[i]);
-		if (err) {
-			pr_warn("%s failed with error %d for attr number %d\n",
-				__func__, err, i);
-			goto err2;
-		}
-	}
-
-	return 0;
-
-err2:
-	ib_unregister_device(dev);
-err1:
-	crypto_free_shash(rxe->tfm);
-
+	/*
+	 * Note that rxe may be invalid at this point if another thread
+	 * unregistered it.
+	 */
 	return err;
 }
-
-int rxe_unregister_device(struct rxe_dev *rxe)
-{
-	int i;
-	struct ib_device *dev = &rxe->ib_dev;
-
-	for (i = 0; i < ARRAY_SIZE(rxe_dev_attributes); ++i)
-		device_remove_file(&dev->dev, rxe_dev_attributes[i]);
-
-	ib_unregister_device(dev);
-
-	return 0;
-}
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index 332a16d..5c4b223 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -61,18 +61,18 @@
 }
 
 struct rxe_ucontext {
+	struct ib_ucontext ibuc;
 	struct rxe_pool_entry	pelem;
-	struct ib_ucontext	ibuc;
 };
 
 struct rxe_pd {
+	struct ib_pd            ibpd;
 	struct rxe_pool_entry	pelem;
-	struct ib_pd		ibpd;
 };
 
 struct rxe_ah {
-	struct rxe_pool_entry	pelem;
 	struct ib_ah		ibah;
+	struct rxe_pool_entry	pelem;
 	struct rxe_pd		*pd;
 	struct rxe_av		av;
 };
@@ -85,8 +85,8 @@
 };
 
 struct rxe_cq {
-	struct rxe_pool_entry	pelem;
 	struct ib_cq		ibcq;
+	struct rxe_pool_entry	pelem;
 	struct rxe_queue	*queue;
 	spinlock_t		cq_lock;
 	u8			notify;
@@ -120,8 +120,8 @@
 };
 
 struct rxe_srq {
-	struct rxe_pool_entry	pelem;
 	struct ib_srq		ibsrq;
+	struct rxe_pool_entry	pelem;
 	struct rxe_pd		*pd;
 	struct rxe_rq		rq;
 	u32			srq_num;
@@ -158,6 +158,7 @@
 	int			opcode;
 	int			timeout;
 	int			timeout_retry;
+	int			started_retry;
 	u32			retry_cnt;
 	u32			rnr_retry;
 	struct rxe_task		task;
@@ -212,6 +213,7 @@
 	struct rxe_mem		*mr;
 	u32			resid;
 	u32			rkey;
+	u32			length;
 	u64			atomic_orig;
 
 	/* SRQ only */
@@ -250,6 +252,7 @@
 
 	struct socket		*sk;
 	u32			dst_cookie;
+	u16			src_port;
 
 	struct rxe_av		pri_av;
 	struct rxe_av		alt_av;
@@ -383,7 +386,6 @@
 	struct ib_device_attr	attr;
 	int			max_ucontext;
 	int			max_inline_data;
-	struct kref		ref_cnt;
 	struct mutex	usdev_lock;
 
 	struct net_device	*ndev;
@@ -407,16 +409,15 @@
 	spinlock_t		mmap_offset_lock; /* guard mmap_offset */
 	int			mmap_offset;
 
-	u64			stats_counters[RXE_NUM_OF_COUNTERS];
+	atomic64_t		stats_counters[RXE_NUM_OF_COUNTERS];
 
 	struct rxe_port		port;
-	struct list_head	list;
 	struct crypto_shash	*tfm;
 };
 
-static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters cnt)
+static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index)
 {
-	rxe->stats_counters[cnt]++;
+	atomic64_inc(&rxe->stats_counters[index]);
 }
 
 static inline struct rxe_dev *to_rdev(struct ib_device *dev)
@@ -464,8 +465,7 @@
 	return mw ? container_of(mw, struct rxe_mem, ibmw) : NULL;
 }
 
-int rxe_register_device(struct rxe_dev *rxe);
-int rxe_unregister_device(struct rxe_dev *rxe);
+int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name);
 
 void rxe_mc_cleanup(struct rxe_pool_entry *arg);
 
diff --git a/drivers/infiniband/sw/siw/Kconfig b/drivers/infiniband/sw/siw/Kconfig
new file mode 100644
index 0000000..b622fc6
--- /dev/null
+++ b/drivers/infiniband/sw/siw/Kconfig
@@ -0,0 +1,18 @@
+config RDMA_SIW
+	tristate "Software RDMA over TCP/IP (iWARP) driver"
+	depends on INET && INFINIBAND && LIBCRC32C
+	select DMA_VIRT_OPS
+	help
+	This driver implements the iWARP RDMA transport over
+	the Linux TCP/IP network stack. It enables a system with a
+	standard Ethernet adapter to interoperate with an iWARP
+	adapter or with another system running the SIW driver.
+	(See also RXE, which is a similar software driver for RoCE.)
+
+	The driver interfaces with the Linux RDMA stack and
+	implements both a kernel and user space RDMA verbs API.
+	The user space verbs API requires a support
+	library named libsiw which is loaded by the generic user
+	space verbs API, libibverbs. To implement RDMA over
+	TCP/IP, the driver further interfaces with the Linux
+	in-kernel TCP socket layer.
diff --git a/drivers/infiniband/sw/siw/Makefile b/drivers/infiniband/sw/siw/Makefile
new file mode 100644
index 0000000..f5f7e38
--- /dev/null
+++ b/drivers/infiniband/sw/siw/Makefile
@@ -0,0 +1,11 @@
+obj-$(CONFIG_RDMA_SIW) += siw.o
+
+siw-y := \
+	siw_cm.o \
+	siw_cq.o \
+	siw_main.o \
+	siw_mem.o \
+	siw_qp.o \
+	siw_qp_tx.o \
+	siw_qp_rx.o \
+	siw_verbs.o
diff --git a/drivers/infiniband/sw/siw/iwarp.h b/drivers/infiniband/sw/siw/iwarp.h
new file mode 100644
index 0000000..e8a04d9
--- /dev/null
+++ b/drivers/infiniband/sw/siw/iwarp.h
@@ -0,0 +1,380 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _IWARP_H
+#define _IWARP_H
+
+#include <rdma/rdma_user_cm.h> /* RDMA_MAX_PRIVATE_DATA */
+#include <linux/types.h>
+#include <asm/byteorder.h>
+
+#define RDMAP_VERSION 1
+#define DDP_VERSION 1
+#define MPA_REVISION_1 1
+#define MPA_REVISION_2 2
+#define MPA_MAX_PRIVDATA RDMA_MAX_PRIVATE_DATA
+#define MPA_KEY_REQ "MPA ID Req Frame"
+#define MPA_KEY_REP "MPA ID Rep Frame"
+#define MPA_IRD_ORD_MASK 0x3fff
+
+struct mpa_rr_params {
+	__be16 bits;
+	__be16 pd_len;
+};
+
+/*
+ * MPA request/response header bits & fields
+ */
+enum {
+	MPA_RR_FLAG_MARKERS = cpu_to_be16(0x8000),
+	MPA_RR_FLAG_CRC = cpu_to_be16(0x4000),
+	MPA_RR_FLAG_REJECT = cpu_to_be16(0x2000),
+	MPA_RR_FLAG_ENHANCED = cpu_to_be16(0x1000),
+	MPA_RR_FLAG_GSO_EXP = cpu_to_be16(0x0800),
+	MPA_RR_MASK_REVISION = cpu_to_be16(0x00ff)
+};
+
+/*
+ * MPA request/reply header
+ */
+struct mpa_rr {
+	__u8 key[16];
+	struct mpa_rr_params params;
+};
+
+static inline void __mpa_rr_set_revision(__be16 *bits, u8 rev)
+{
+	*bits = (*bits & ~MPA_RR_MASK_REVISION) |
+		(cpu_to_be16(rev) & MPA_RR_MASK_REVISION);
+}
+
+static inline u8 __mpa_rr_revision(__be16 mpa_rr_bits)
+{
+	__be16 rev = mpa_rr_bits & MPA_RR_MASK_REVISION;
+
+	return be16_to_cpu(rev);
+}
+
+enum mpa_v2_ctrl {
+	MPA_V2_PEER_TO_PEER = cpu_to_be16(0x8000),
+	MPA_V2_ZERO_LENGTH_RTR = cpu_to_be16(0x4000),
+	MPA_V2_RDMA_WRITE_RTR = cpu_to_be16(0x8000),
+	MPA_V2_RDMA_READ_RTR = cpu_to_be16(0x4000),
+	MPA_V2_RDMA_NO_RTR = cpu_to_be16(0x0000),
+	MPA_V2_MASK_IRD_ORD = cpu_to_be16(0x3fff)
+};
+
+struct mpa_v2_data {
+	__be16 ird;
+	__be16 ord;
+};
+
+struct mpa_marker {
+	__be16 rsvd;
+	__be16 fpdu_hmd; /* FPDU header-marker distance (= MPA's FPDUPTR) */
+};
+
+/*
+ * maximum MPA trailer
+ */
+struct mpa_trailer {
+	__u8 pad[4];
+	__be32 crc;
+};
+
+#define MPA_HDR_SIZE 2
+#define MPA_CRC_SIZE 4
+
+/*
+ * Common portion of iWARP headers (MPA, DDP, RDMAP)
+ * for any FPDU
+ */
+struct iwarp_ctrl {
+	__be16 mpa_len;
+	__be16 ddp_rdmap_ctrl;
+};
+
+/*
+ * DDP/RDMAP Hdr bits & fields
+ */
+enum {
+	DDP_FLAG_TAGGED = cpu_to_be16(0x8000),
+	DDP_FLAG_LAST = cpu_to_be16(0x4000),
+	DDP_MASK_RESERVED = cpu_to_be16(0x3C00),
+	DDP_MASK_VERSION = cpu_to_be16(0x0300),
+	RDMAP_MASK_VERSION = cpu_to_be16(0x00C0),
+	RDMAP_MASK_RESERVED = cpu_to_be16(0x0030),
+	RDMAP_MASK_OPCODE = cpu_to_be16(0x000f)
+};
+
+static inline u8 __ddp_get_version(struct iwarp_ctrl *ctrl)
+{
+	return be16_to_cpu(ctrl->ddp_rdmap_ctrl & DDP_MASK_VERSION) >> 8;
+}
+
+static inline void __ddp_set_version(struct iwarp_ctrl *ctrl, u8 version)
+{
+	ctrl->ddp_rdmap_ctrl =
+		(ctrl->ddp_rdmap_ctrl & ~DDP_MASK_VERSION) |
+		(cpu_to_be16((u16)version << 8) & DDP_MASK_VERSION);
+}
+
+static inline u8 __rdmap_get_version(struct iwarp_ctrl *ctrl)
+{
+	__be16 ver = ctrl->ddp_rdmap_ctrl & RDMAP_MASK_VERSION;
+
+	return be16_to_cpu(ver) >> 6;
+}
+
+static inline void __rdmap_set_version(struct iwarp_ctrl *ctrl, u8 version)
+{
+	ctrl->ddp_rdmap_ctrl = (ctrl->ddp_rdmap_ctrl & ~RDMAP_MASK_VERSION) |
+			       (cpu_to_be16(version << 6) & RDMAP_MASK_VERSION);
+}
+
+static inline u8 __rdmap_get_opcode(struct iwarp_ctrl *ctrl)
+{
+	return be16_to_cpu(ctrl->ddp_rdmap_ctrl & RDMAP_MASK_OPCODE);
+}
+
+static inline void __rdmap_set_opcode(struct iwarp_ctrl *ctrl, u8 opcode)
+{
+	ctrl->ddp_rdmap_ctrl = (ctrl->ddp_rdmap_ctrl & ~RDMAP_MASK_OPCODE) |
+			       (cpu_to_be16(opcode) & RDMAP_MASK_OPCODE);
+}
+
+struct iwarp_rdma_write {
+	struct iwarp_ctrl ctrl;
+	__be32 sink_stag;
+	__be64 sink_to;
+};
+
+struct iwarp_rdma_rreq {
+	struct iwarp_ctrl ctrl;
+	__be32 rsvd;
+	__be32 ddp_qn;
+	__be32 ddp_msn;
+	__be32 ddp_mo;
+	__be32 sink_stag;
+	__be64 sink_to;
+	__be32 read_size;
+	__be32 source_stag;
+	__be64 source_to;
+};
+
+struct iwarp_rdma_rresp {
+	struct iwarp_ctrl ctrl;
+	__be32 sink_stag;
+	__be64 sink_to;
+};
+
+struct iwarp_send {
+	struct iwarp_ctrl ctrl;
+	__be32 rsvd;
+	__be32 ddp_qn;
+	__be32 ddp_msn;
+	__be32 ddp_mo;
+};
+
+struct iwarp_send_inv {
+	struct iwarp_ctrl ctrl;
+	__be32 inval_stag;
+	__be32 ddp_qn;
+	__be32 ddp_msn;
+	__be32 ddp_mo;
+};
+
+struct iwarp_terminate {
+	struct iwarp_ctrl ctrl;
+	__be32 rsvd;
+	__be32 ddp_qn;
+	__be32 ddp_msn;
+	__be32 ddp_mo;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__be32 layer : 4;
+	__be32 etype : 4;
+	__be32 ecode : 8;
+	__be32 flag_m : 1;
+	__be32 flag_d : 1;
+	__be32 flag_r : 1;
+	__be32 reserved : 13;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__be32 reserved : 13;
+	__be32 flag_r : 1;
+	__be32 flag_d : 1;
+	__be32 flag_m : 1;
+	__be32 ecode : 8;
+	__be32 etype : 4;
+	__be32 layer : 4;
+#else
+#error "undefined byte order"
+#endif
+};
+
+/*
+ * Terminate Hdr bits & fields
+ */
+enum {
+	TERM_MASK_LAYER = cpu_to_be32(0xf0000000),
+	TERM_MASK_ETYPE = cpu_to_be32(0x0f000000),
+	TERM_MASK_ECODE = cpu_to_be32(0x00ff0000),
+	TERM_FLAG_M = cpu_to_be32(0x00008000),
+	TERM_FLAG_D = cpu_to_be32(0x00004000),
+	TERM_FLAG_R = cpu_to_be32(0x00002000),
+	TERM_MASK_RESVD = cpu_to_be32(0x00001fff)
+};
+
+static inline u8 __rdmap_term_layer(struct iwarp_terminate *term)
+{
+	return term->layer;
+}
+
+static inline void __rdmap_term_set_layer(struct iwarp_terminate *term,
+					  u8 layer)
+{
+	term->layer = layer & 0xf;
+}
+
+static inline u8 __rdmap_term_etype(struct iwarp_terminate *term)
+{
+	return term->etype;
+}
+
+static inline void __rdmap_term_set_etype(struct iwarp_terminate *term,
+					  u8 etype)
+{
+	term->etype = etype & 0xf;
+}
+
+static inline u8 __rdmap_term_ecode(struct iwarp_terminate *term)
+{
+	return term->ecode;
+}
+
+static inline void __rdmap_term_set_ecode(struct iwarp_terminate *term,
+					  u8 ecode)
+{
+	term->ecode = ecode;
+}
+
+/*
+ * Common portion of iWARP headers (MPA, DDP, RDMAP)
+ * for an FPDU carrying an untagged DDP segment
+ */
+struct iwarp_ctrl_untagged {
+	struct iwarp_ctrl ctrl;
+	__be32 rsvd;
+	__be32 ddp_qn;
+	__be32 ddp_msn;
+	__be32 ddp_mo;
+};
+
+/*
+ * Common portion of iWARP headers (MPA, DDP, RDMAP)
+ * for an FPDU carrying a tagged DDP segment
+ */
+struct iwarp_ctrl_tagged {
+	struct iwarp_ctrl ctrl;
+	__be32 ddp_stag;
+	__be64 ddp_to;
+};
+
+union iwarp_hdr {
+	struct iwarp_ctrl ctrl;
+	struct iwarp_ctrl_untagged c_untagged;
+	struct iwarp_ctrl_tagged c_tagged;
+	struct iwarp_rdma_write rwrite;
+	struct iwarp_rdma_rreq rreq;
+	struct iwarp_rdma_rresp rresp;
+	struct iwarp_terminate terminate;
+	struct iwarp_send send;
+	struct iwarp_send_inv send_inv;
+};
+
+enum term_elayer {
+	TERM_ERROR_LAYER_RDMAP = 0x00,
+	TERM_ERROR_LAYER_DDP = 0x01,
+	TERM_ERROR_LAYER_LLP = 0x02 /* e.g., MPA */
+};
+
+enum ddp_etype {
+	DDP_ETYPE_CATASTROPHIC = 0x0,
+	DDP_ETYPE_TAGGED_BUF = 0x1,
+	DDP_ETYPE_UNTAGGED_BUF = 0x2,
+	DDP_ETYPE_RSVD = 0x3
+};
+
+enum ddp_ecode {
+	/* unspecified, set to zero */
+	DDP_ECODE_CATASTROPHIC = 0x00,
+	/* Tagged Buffer Errors */
+	DDP_ECODE_T_INVALID_STAG = 0x00,
+	DDP_ECODE_T_BASE_BOUNDS = 0x01,
+	DDP_ECODE_T_STAG_NOT_ASSOC = 0x02,
+	DDP_ECODE_T_TO_WRAP = 0x03,
+	DDP_ECODE_T_VERSION = 0x04,
+	/* Untagged Buffer Errors */
+	DDP_ECODE_UT_INVALID_QN = 0x01,
+	DDP_ECODE_UT_INVALID_MSN_NOBUF = 0x02,
+	DDP_ECODE_UT_INVALID_MSN_RANGE = 0x03,
+	DDP_ECODE_UT_INVALID_MO = 0x04,
+	DDP_ECODE_UT_MSG_TOOLONG = 0x05,
+	DDP_ECODE_UT_VERSION = 0x06
+};
+
+enum rdmap_untagged_qn {
+	RDMAP_UNTAGGED_QN_SEND = 0,
+	RDMAP_UNTAGGED_QN_RDMA_READ = 1,
+	RDMAP_UNTAGGED_QN_TERMINATE = 2,
+	RDMAP_UNTAGGED_QN_COUNT = 3
+};
+
+enum rdmap_etype {
+	RDMAP_ETYPE_CATASTROPHIC = 0x0,
+	RDMAP_ETYPE_REMOTE_PROTECTION = 0x1,
+	RDMAP_ETYPE_REMOTE_OPERATION = 0x2
+};
+
+enum rdmap_ecode {
+	RDMAP_ECODE_INVALID_STAG = 0x00,
+	RDMAP_ECODE_BASE_BOUNDS = 0x01,
+	RDMAP_ECODE_ACCESS_RIGHTS = 0x02,
+	RDMAP_ECODE_STAG_NOT_ASSOC = 0x03,
+	RDMAP_ECODE_TO_WRAP = 0x04,
+	RDMAP_ECODE_VERSION = 0x05,
+	RDMAP_ECODE_OPCODE = 0x06,
+	RDMAP_ECODE_CATASTROPHIC_STREAM = 0x07,
+	RDMAP_ECODE_CATASTROPHIC_GLOBAL = 0x08,
+	RDMAP_ECODE_CANNOT_INVALIDATE = 0x09,
+	RDMAP_ECODE_UNSPECIFIED = 0xff
+};
+
+enum llp_ecode {
+	LLP_ECODE_TCP_STREAM_LOST = 0x01, /* How to transfer this ?? */
+	LLP_ECODE_RECEIVED_CRC = 0x02,
+	LLP_ECODE_FPDU_START = 0x03,
+	LLP_ECODE_INVALID_REQ_RESP = 0x04,
+
+	/* Errors for Enhanced Connection Establishment only */
+	LLP_ECODE_LOCAL_CATASTROPHIC = 0x05,
+	LLP_ECODE_INSUFFICIENT_IRD = 0x06,
+	LLP_ECODE_NO_MATCHING_RTR = 0x07
+};
+
+enum llp_etype { LLP_ETYPE_MPA = 0x00 };
+
+enum rdma_opcode {
+	RDMAP_RDMA_WRITE = 0x0,
+	RDMAP_RDMA_READ_REQ = 0x1,
+	RDMAP_RDMA_READ_RESP = 0x2,
+	RDMAP_SEND = 0x3,
+	RDMAP_SEND_INVAL = 0x4,
+	RDMAP_SEND_SE = 0x5,
+	RDMAP_SEND_SE_INVAL = 0x6,
+	RDMAP_TERMINATE = 0x7,
+	RDMAP_NOT_SUPPORTED = RDMAP_TERMINATE + 1
+};
+
+#endif
diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h
new file mode 100644
index 0000000..dba4535
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw.h
@@ -0,0 +1,745 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _SIW_H
+#define _SIW_H
+
+#include <rdma/ib_verbs.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <crypto/hash.h>
+#include <linux/crc32.h>
+#include <linux/crc32c.h>
+
+#include <rdma/siw-abi.h>
+#include "iwarp.h"
+
+#define SIW_VENDOR_ID 0x626d74 /* ascii 'bmt' for now */
+#define SIW_VENDORT_PART_ID 0
+#define SIW_MAX_QP (1024 * 100)
+#define SIW_MAX_QP_WR (1024 * 32)
+#define SIW_MAX_ORD_QP 128
+#define SIW_MAX_IRD_QP 128
+#define SIW_MAX_SGE_PBL 256 /* max num sge's for PBL */
+#define SIW_MAX_SGE_RD 1 /* iwarp limitation. we could relax */
+#define SIW_MAX_CQ (1024 * 100)
+#define SIW_MAX_CQE (SIW_MAX_QP_WR * 100)
+#define SIW_MAX_MR (SIW_MAX_QP * 10)
+#define SIW_MAX_PD SIW_MAX_QP
+#define SIW_MAX_MW 0 /* to be set if MW's are supported */
+#define SIW_MAX_FMR SIW_MAX_MR
+#define SIW_MAX_SRQ SIW_MAX_QP
+#define SIW_MAX_SRQ_WR (SIW_MAX_QP_WR * 10)
+#define SIW_MAX_CONTEXT SIW_MAX_PD
+
+/* Min number of bytes for using zero copy transmit */
+#define SENDPAGE_THRESH PAGE_SIZE
+
+/* Maximum number of frames which can be sent in one SQ processing */
+#define SQ_USER_MAXBURST 100
+
+/* Maximum number of consecutive IRQ elements which get served
+ * if SQ has pending work. Prevents starving local SQ processing
+ * by serving peer Read Requests.
+ */
+#define SIW_IRQ_MAXBURST_SQ_ACTIVE 4
+
+struct siw_dev_cap {
+	int max_qp;
+	int max_qp_wr;
+	int max_ord; /* max. outbound read queue depth */
+	int max_ird; /* max. inbound read queue depth */
+	int max_sge;
+	int max_sge_rd;
+	int max_cq;
+	int max_cqe;
+	int max_mr;
+	int max_pd;
+	int max_mw;
+	int max_fmr;
+	int max_srq;
+	int max_srq_wr;
+	int max_srq_sge;
+};
+
+struct siw_pd {
+	struct ib_pd base_pd;
+};
+
+struct siw_device {
+	struct ib_device base_dev;
+	struct net_device *netdev;
+	struct siw_dev_cap attrs;
+
+	u32 vendor_part_id;
+	int numa_node;
+
+	/* physical port state (only one port per device) */
+	enum ib_port_state state;
+
+	spinlock_t lock;
+
+	struct xarray qp_xa;
+	struct xarray mem_xa;
+
+	struct list_head cep_list;
+	struct list_head qp_list;
+
+	/* active objects statistics to enforce limits */
+	atomic_t num_qp;
+	atomic_t num_cq;
+	atomic_t num_pd;
+	atomic_t num_mr;
+	atomic_t num_srq;
+	atomic_t num_ctx;
+
+	struct work_struct netdev_down;
+};
+
+struct siw_uobj {
+	void *addr;
+	u32 size;
+};
+
+struct siw_ucontext {
+	struct ib_ucontext base_ucontext;
+	struct siw_device *sdev;
+
+	/* xarray of user mappable objects */
+	struct xarray xa;
+	u32 uobj_nextkey;
+};
+
+/*
+ * The RDMA core does not define LOCAL_READ access, which is always
+ * enabled implicitly.
+ */
+#define IWARP_ACCESS_MASK					\
+	(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE	|	\
+	 IB_ACCESS_REMOTE_READ)
+
+/*
+ * siw presentation of user memory registered as source
+ * or target of RDMA operations.
+ */
+
+struct siw_page_chunk {
+	struct page **plist;
+};
+
+struct siw_umem {
+	struct siw_page_chunk *page_chunk;
+	int num_pages;
+	bool writable;
+	u64 fp_addr; /* First page base address */
+	struct mm_struct *owning_mm;
+};
+
+struct siw_pble {
+	dma_addr_t addr; /* Address of assigned buffer */
+	unsigned int size; /* Size of this entry */
+	unsigned long pbl_off; /* Total offset from start of PBL */
+};
+
+struct siw_pbl {
+	unsigned int num_buf;
+	unsigned int max_buf;
+	struct siw_pble pbe[1];
+};
+
+struct siw_mr;
+
+/*
+ * Generic memory representation for registered siw memory.
+ * Memory lookup always via higher 24 bit of STag (STag index).
+ */
+struct siw_mem {
+	struct siw_device *sdev;
+	struct kref ref;
+	u64 va; /* VA of memory */
+	u64 len; /* length of the memory buffer in bytes */
+	u32 stag; /* iWarp memory access steering tag */
+	u8 stag_valid; /* VALID or INVALID */
+	u8 is_pbl; /* PBL or user space mem */
+	u8 is_mw; /* Memory Region or Memory Window */
+	enum ib_access_flags perms; /* local/remote READ & WRITE */
+	union {
+		struct siw_umem *umem;
+		struct siw_pbl *pbl;
+		void *mem_obj;
+	};
+	struct ib_pd *pd;
+};
+
+struct siw_mr {
+	struct ib_mr base_mr;
+	struct siw_mem *mem;
+	struct rcu_head rcu;
+};
+
+/*
+ * Error codes for local or remote
+ * access to registered memory
+ */
+enum siw_access_state {
+	E_ACCESS_OK,
+	E_STAG_INVALID,
+	E_BASE_BOUNDS,
+	E_ACCESS_PERM,
+	E_PD_MISMATCH
+};
+
+enum siw_wr_state {
+	SIW_WR_IDLE,
+	SIW_WR_QUEUED, /* processing has not started yet */
+	SIW_WR_INPROGRESS /* initiated processing of the WR */
+};
+
+/* The WQE currently being processed (RX or TX) */
+struct siw_wqe {
+	/* Copy of the application's SQE or RQE */
+	union {
+		struct siw_sqe sqe;
+		struct siw_rqe rqe;
+	};
+	struct siw_mem *mem[SIW_MAX_SGE]; /* per sge's resolved mem */
+	enum siw_wr_state wr_status;
+	enum siw_wc_status wc_status;
+	u32 bytes; /* total bytes to process */
+	u32 processed; /* bytes processed */
+};
+
+struct siw_cq {
+	struct ib_cq base_cq;
+	spinlock_t lock;
+	struct siw_cq_ctrl *notify;
+	struct siw_cqe *queue;
+	u32 cq_put;
+	u32 cq_get;
+	u32 num_cqe;
+	bool kernel_verbs;
+	u32 xa_cq_index; /* mmap information for CQE array */
+	u32 id; /* For debugging only */
+};
+
+enum siw_qp_state {
+	SIW_QP_STATE_IDLE,
+	SIW_QP_STATE_RTR,
+	SIW_QP_STATE_RTS,
+	SIW_QP_STATE_CLOSING,
+	SIW_QP_STATE_TERMINATE,
+	SIW_QP_STATE_ERROR,
+	SIW_QP_STATE_COUNT
+};
+
+enum siw_qp_flags {
+	SIW_RDMA_BIND_ENABLED = (1 << 0),
+	SIW_RDMA_WRITE_ENABLED = (1 << 1),
+	SIW_RDMA_READ_ENABLED = (1 << 2),
+	SIW_SIGNAL_ALL_WR = (1 << 3),
+	SIW_MPA_CRC = (1 << 4),
+	SIW_QP_IN_DESTROY = (1 << 5)
+};
+
+enum siw_qp_attr_mask {
+	SIW_QP_ATTR_STATE = (1 << 0),
+	SIW_QP_ATTR_ACCESS_FLAGS = (1 << 1),
+	SIW_QP_ATTR_LLP_HANDLE = (1 << 2),
+	SIW_QP_ATTR_ORD = (1 << 3),
+	SIW_QP_ATTR_IRD = (1 << 4),
+	SIW_QP_ATTR_SQ_SIZE = (1 << 5),
+	SIW_QP_ATTR_RQ_SIZE = (1 << 6),
+	SIW_QP_ATTR_MPA = (1 << 7)
+};
+
+struct siw_srq {
+	struct ib_srq base_srq;
+	spinlock_t lock;
+	u32 max_sge;
+	u32 limit; /* low watermark for async event */
+	struct siw_rqe *recvq;
+	u32 rq_put;
+	u32 rq_get;
+	u32 num_rqe; /* max # of wqe's allowed */
+	u32 xa_srq_index; /* mmap information for SRQ array */
+	char armed; /* inform user if limit hit */
+	char kernel_verbs; /* '1' if kernel client */
+};
+
+struct siw_qp_attrs {
+	enum siw_qp_state state;
+	u32 sq_size;
+	u32 rq_size;
+	u32 orq_size;
+	u32 irq_size;
+	u32 sq_max_sges;
+	u32 rq_max_sges;
+	enum siw_qp_flags flags;
+
+	struct socket *sk;
+};
+
+enum siw_tx_ctx {
+	SIW_SEND_HDR, /* start or continue sending HDR */
+	SIW_SEND_DATA, /* start or continue sending DDP payload */
+	SIW_SEND_TRAILER, /* start or continue sending TRAILER */
+	SIW_SEND_SHORT_FPDU /* send whole FPDU hdr|data|trailer at once */
+};
+
+enum siw_rx_state {
+	SIW_GET_HDR, /* await new hdr or within hdr */
+	SIW_GET_DATA_START, /* start of inbound DDP payload */
+	SIW_GET_DATA_MORE, /* continuation of (misaligned) DDP payload */
+	SIW_GET_TRAILER /* await new trailer or within trailer */
+};
+
+struct siw_rx_stream {
+	struct sk_buff *skb;
+	int skb_new; /* pending unread bytes in skb */
+	int skb_offset; /* offset in skb */
+	int skb_copied; /* processed bytes in skb */
+
+	union iwarp_hdr hdr;
+	struct mpa_trailer trailer;
+
+	enum siw_rx_state state;
+
+	/*
+	 * For each FPDU, main RX loop runs through 3 stages:
+	 * Receiving protocol headers, placing DDP payload and receiving
+	 * trailer information (CRC + possibly padding).
+	 * Next two variables keep state on receive status of the
+	 * current FPDU part (hdr, data, trailer).
+	 */
+	int fpdu_part_rcvd; /* bytes in pkt part copied */
+	int fpdu_part_rem; /* bytes in pkt part not seen */
+
+	/*
+	 * Next expected DDP MSN for each QN +
+	 * expected steering tag +
+	 * expected DDP tagged offset (all HBO)
+	 */
+	u32 ddp_msn[RDMAP_UNTAGGED_QN_COUNT];
+	u32 ddp_stag;
+	u64 ddp_to;
+	u32 inval_stag; /* Stag to be invalidated */
+
+	struct shash_desc *mpa_crc_hd;
+	u8 rx_suspend : 1;
+	u8 pad : 2; /* # of pad bytes expected */
+	u8 rdmap_op : 4; /* opcode of current frame */
+};
+
+struct siw_rx_fpdu {
+	/*
+	 * Local destination memory of inbound RDMA operation.
+	 * Valid, according to wqe->wr_status
+	 */
+	struct siw_wqe wqe_active;
+
+	unsigned int pbl_idx; /* Index into current PBL */
+	unsigned int sge_idx; /* current sge in rx */
+	unsigned int sge_off; /* already rcvd in curr. sge */
+
+	char first_ddp_seg; /* this is the first DDP seg */
+	char more_ddp_segs; /* more DDP segs expected */
+	u8 prev_rdmap_op : 4; /* opcode of prev frame */
+};
+
+/*
+ * Shorthands for short packets w/o payload
+ * to be transmitted more efficient.
+ */
+struct siw_send_pkt {
+	struct iwarp_send send;
+	__be32 crc;
+};
+
+struct siw_write_pkt {
+	struct iwarp_rdma_write write;
+	__be32 crc;
+};
+
+struct siw_rreq_pkt {
+	struct iwarp_rdma_rreq rreq;
+	__be32 crc;
+};
+
+struct siw_rresp_pkt {
+	struct iwarp_rdma_rresp rresp;
+	__be32 crc;
+};
+
+struct siw_iwarp_tx {
+	union {
+		union iwarp_hdr hdr;
+
+		/* Generic part of FPDU header */
+		struct iwarp_ctrl ctrl;
+		struct iwarp_ctrl_untagged c_untagged;
+		struct iwarp_ctrl_tagged c_tagged;
+
+		/* FPDU headers */
+		struct iwarp_rdma_write rwrite;
+		struct iwarp_rdma_rreq rreq;
+		struct iwarp_rdma_rresp rresp;
+		struct iwarp_terminate terminate;
+		struct iwarp_send send;
+		struct iwarp_send_inv send_inv;
+
+		/* complete short FPDUs */
+		struct siw_send_pkt send_pkt;
+		struct siw_write_pkt write_pkt;
+		struct siw_rreq_pkt rreq_pkt;
+		struct siw_rresp_pkt rresp_pkt;
+	} pkt;
+
+	struct mpa_trailer trailer;
+	/* DDP MSN for untagged messages */
+	u32 ddp_msn[RDMAP_UNTAGGED_QN_COUNT];
+
+	enum siw_tx_ctx state;
+	u16 ctrl_len; /* ddp+rdmap hdr */
+	u16 ctrl_sent;
+	int burst;
+	int bytes_unsent; /* ddp payload bytes */
+
+	struct shash_desc *mpa_crc_hd;
+
+	u8 do_crc : 1; /* do crc for segment */
+	u8 use_sendpage : 1; /* send w/o copy */
+	u8 tx_suspend : 1; /* stop sending DDP segs. */
+	u8 pad : 2; /* # pad in current fpdu */
+	u8 orq_fence : 1; /* ORQ full or Send fenced */
+	u8 in_syscall : 1; /* TX out of user context */
+	u8 zcopy_tx : 1; /* Use TCP_SENDPAGE if possible */
+	u8 gso_seg_limit; /* Maximum segments for GSO, 0 = unbound */
+
+	u16 fpdu_len; /* len of FPDU to tx */
+	unsigned int tcp_seglen; /* remaining tcp seg space */
+
+	struct siw_wqe wqe_active;
+
+	int pbl_idx; /* Index into current PBL */
+	int sge_idx; /* current sge in tx */
+	u32 sge_off; /* already sent in curr. sge */
+};
+
+struct siw_qp {
+	struct siw_device *sdev;
+	struct ib_qp *ib_qp;
+	struct kref ref;
+	u32 qp_num;
+	struct list_head devq;
+	int tx_cpu;
+	bool kernel_verbs;
+	struct siw_qp_attrs attrs;
+
+	struct siw_cep *cep;
+	struct rw_semaphore state_lock;
+
+	struct ib_pd *pd;
+	struct siw_cq *scq;
+	struct siw_cq *rcq;
+	struct siw_srq *srq;
+
+	struct siw_iwarp_tx tx_ctx; /* Transmit context */
+	spinlock_t sq_lock;
+	struct siw_sqe *sendq; /* send queue element array */
+	uint32_t sq_get; /* consumer index into sq array */
+	uint32_t sq_put; /* kernel prod. index into sq array */
+	struct llist_node tx_list;
+
+	struct siw_sqe *orq; /* outbound read queue element array */
+	spinlock_t orq_lock;
+	uint32_t orq_get; /* consumer index into orq array */
+	uint32_t orq_put; /* shared producer index for ORQ */
+
+	struct siw_rx_stream rx_stream;
+	struct siw_rx_fpdu *rx_fpdu;
+	struct siw_rx_fpdu rx_tagged;
+	struct siw_rx_fpdu rx_untagged;
+	spinlock_t rq_lock;
+	struct siw_rqe *recvq; /* recv queue element array */
+	uint32_t rq_get; /* consumer index into rq array */
+	uint32_t rq_put; /* kernel prod. index into rq array */
+
+	struct siw_sqe *irq; /* inbound read queue element array */
+	uint32_t irq_get; /* consumer index into irq array */
+	uint32_t irq_put; /* producer index into irq array */
+	int irq_burst;
+
+	struct { /* information to be carried in TERMINATE pkt, if valid */
+		u8 valid;
+		u8 in_tx;
+		u8 layer : 4, etype : 4;
+		u8 ecode;
+	} term_info;
+	u32 xa_sq_index; /* mmap information for SQE array */
+	u32 xa_rq_index; /* mmap information for RQE array */
+	struct rcu_head rcu;
+};
+
+struct siw_base_qp {
+	struct ib_qp base_qp;
+	struct siw_qp *qp;
+};
+
+/* helper macros */
+#define rx_qp(rx) container_of(rx, struct siw_qp, rx_stream)
+#define tx_qp(tx) container_of(tx, struct siw_qp, tx_ctx)
+#define tx_wqe(qp) (&(qp)->tx_ctx.wqe_active)
+#define rx_wqe(rctx) (&(rctx)->wqe_active)
+#define rx_mem(rctx) ((rctx)->wqe_active.mem[0])
+#define tx_type(wqe) ((wqe)->sqe.opcode)
+#define rx_type(wqe) ((wqe)->rqe.opcode)
+#define tx_flags(wqe) ((wqe)->sqe.flags)
+
+struct iwarp_msg_info {
+	int hdr_len;
+	struct iwarp_ctrl ctrl;
+	int (*rx_data)(struct siw_qp *qp);
+};
+
+/* Global siw parameters. Currently set in siw_main.c */
+extern const bool zcopy_tx;
+extern const bool try_gso;
+extern const bool loopback_enabled;
+extern const bool mpa_crc_required;
+extern const bool mpa_crc_strict;
+extern const bool siw_tcp_nagle;
+extern u_char mpa_version;
+extern const bool peer_to_peer;
+extern struct task_struct *siw_tx_thread[];
+
+extern struct crypto_shash *siw_crypto_shash;
+extern struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1];
+
+/* QP general functions */
+int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attr,
+		  enum siw_qp_attr_mask mask);
+int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl);
+void siw_qp_llp_close(struct siw_qp *qp);
+void siw_qp_cm_drop(struct siw_qp *qp, int schedule);
+void siw_send_terminate(struct siw_qp *qp);
+
+void siw_qp_get_ref(struct ib_qp *qp);
+void siw_qp_put_ref(struct ib_qp *qp);
+int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp);
+void siw_free_qp(struct kref *ref);
+
+void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer,
+			u8 etype, u8 ecode, int in_tx);
+enum ddp_ecode siw_tagged_error(enum siw_access_state state);
+enum rdmap_ecode siw_rdmap_error(enum siw_access_state state);
+
+void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe);
+int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
+		     enum siw_wc_status status);
+int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes,
+		     u32 inval_stag, enum siw_wc_status status);
+void siw_qp_llp_data_ready(struct sock *sk);
+void siw_qp_llp_write_space(struct sock *sk);
+
+/* QP TX path functions */
+int siw_run_sq(void *arg);
+int siw_qp_sq_process(struct siw_qp *qp);
+int siw_sq_start(struct siw_qp *qp);
+int siw_activate_tx(struct siw_qp *qp);
+void siw_stop_tx_thread(int nr_cpu);
+int siw_get_tx_cpu(struct siw_device *sdev);
+void siw_put_tx_cpu(int cpu);
+
+/* QP RX path functions */
+int siw_proc_send(struct siw_qp *qp);
+int siw_proc_rreq(struct siw_qp *qp);
+int siw_proc_rresp(struct siw_qp *qp);
+int siw_proc_write(struct siw_qp *qp);
+int siw_proc_terminate(struct siw_qp *qp);
+
+int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
+		    unsigned int off, size_t len);
+
+static inline void set_rx_fpdu_context(struct siw_qp *qp, u8 opcode)
+{
+	if (opcode == RDMAP_RDMA_WRITE || opcode == RDMAP_RDMA_READ_RESP)
+		qp->rx_fpdu = &qp->rx_tagged;
+	else
+		qp->rx_fpdu = &qp->rx_untagged;
+
+	qp->rx_stream.rdmap_op = opcode;
+}
+
+static inline struct siw_ucontext *to_siw_ctx(struct ib_ucontext *base_ctx)
+{
+	return container_of(base_ctx, struct siw_ucontext, base_ucontext);
+}
+
+static inline struct siw_base_qp *to_siw_base_qp(struct ib_qp *base_qp)
+{
+	return container_of(base_qp, struct siw_base_qp, base_qp);
+}
+
+static inline struct siw_qp *to_siw_qp(struct ib_qp *base_qp)
+{
+	return to_siw_base_qp(base_qp)->qp;
+}
+
+static inline struct siw_cq *to_siw_cq(struct ib_cq *base_cq)
+{
+	return container_of(base_cq, struct siw_cq, base_cq);
+}
+
+static inline struct siw_srq *to_siw_srq(struct ib_srq *base_srq)
+{
+	return container_of(base_srq, struct siw_srq, base_srq);
+}
+
+static inline struct siw_device *to_siw_dev(struct ib_device *base_dev)
+{
+	return container_of(base_dev, struct siw_device, base_dev);
+}
+
+static inline struct siw_mr *to_siw_mr(struct ib_mr *base_mr)
+{
+	return container_of(base_mr, struct siw_mr, base_mr);
+}
+
+static inline struct siw_qp *siw_qp_id2obj(struct siw_device *sdev, int id)
+{
+	struct siw_qp *qp;
+
+	rcu_read_lock();
+	qp = xa_load(&sdev->qp_xa, id);
+	if (likely(qp && kref_get_unless_zero(&qp->ref))) {
+		rcu_read_unlock();
+		return qp;
+	}
+	rcu_read_unlock();
+	return NULL;
+}
+
+static inline u32 qp_id(struct siw_qp *qp)
+{
+	return qp->qp_num;
+}
+
+static inline void siw_qp_get(struct siw_qp *qp)
+{
+	kref_get(&qp->ref);
+}
+
+static inline void siw_qp_put(struct siw_qp *qp)
+{
+	kref_put(&qp->ref, siw_free_qp);
+}
+
+static inline int siw_sq_empty(struct siw_qp *qp)
+{
+	struct siw_sqe *sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
+
+	return READ_ONCE(sqe->flags) == 0;
+}
+
+static inline struct siw_sqe *sq_get_next(struct siw_qp *qp)
+{
+	struct siw_sqe *sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
+
+	if (READ_ONCE(sqe->flags) & SIW_WQE_VALID)
+		return sqe;
+
+	return NULL;
+}
+
+static inline struct siw_sqe *orq_get_current(struct siw_qp *qp)
+{
+	return &qp->orq[qp->orq_get % qp->attrs.orq_size];
+}
+
+static inline struct siw_sqe *orq_get_tail(struct siw_qp *qp)
+{
+	return &qp->orq[qp->orq_put % qp->attrs.orq_size];
+}
+
+static inline struct siw_sqe *orq_get_free(struct siw_qp *qp)
+{
+	struct siw_sqe *orq_e = orq_get_tail(qp);
+
+	if (orq_e && READ_ONCE(orq_e->flags) == 0)
+		return orq_e;
+
+	return NULL;
+}
+
+static inline int siw_orq_empty(struct siw_qp *qp)
+{
+	return qp->orq[qp->orq_get % qp->attrs.orq_size].flags == 0 ? 1 : 0;
+}
+
+static inline struct siw_sqe *irq_alloc_free(struct siw_qp *qp)
+{
+	struct siw_sqe *irq_e = &qp->irq[qp->irq_put % qp->attrs.irq_size];
+
+	if (READ_ONCE(irq_e->flags) == 0) {
+		qp->irq_put++;
+		return irq_e;
+	}
+	return NULL;
+}
+
+static inline __wsum siw_csum_update(const void *buff, int len, __wsum sum)
+{
+	return (__force __wsum)crc32c((__force __u32)sum, buff, len);
+}
+
+static inline __wsum siw_csum_combine(__wsum csum, __wsum csum2, int offset,
+				      int len)
+{
+	return (__force __wsum)__crc32c_le_combine((__force __u32)csum,
+						   (__force __u32)csum2, len);
+}
+
+static inline void siw_crc_skb(struct siw_rx_stream *srx, unsigned int len)
+{
+	const struct skb_checksum_ops siw_cs_ops = {
+		.update = siw_csum_update,
+		.combine = siw_csum_combine,
+	};
+	__wsum crc = *(u32 *)shash_desc_ctx(srx->mpa_crc_hd);
+
+	crc = __skb_checksum(srx->skb, srx->skb_offset, len, crc,
+			     &siw_cs_ops);
+	*(u32 *)shash_desc_ctx(srx->mpa_crc_hd) = crc;
+}
+
+#define siw_dbg(ibdev, fmt, ...)                                               \
+	ibdev_dbg(ibdev, "%s: " fmt, __func__, ##__VA_ARGS__)
+
+#define siw_dbg_qp(qp, fmt, ...)                                               \
+	ibdev_dbg(&qp->sdev->base_dev, "QP[%u] %s: " fmt, qp_id(qp), __func__, \
+		  ##__VA_ARGS__)
+
+#define siw_dbg_cq(cq, fmt, ...)                                               \
+	ibdev_dbg(cq->base_cq.device, "CQ[%u] %s: " fmt, cq->id, __func__,     \
+		  ##__VA_ARGS__)
+
+#define siw_dbg_pd(pd, fmt, ...)                                               \
+	ibdev_dbg(pd->device, "PD[%u] %s: " fmt, pd->res.id, __func__,         \
+		  ##__VA_ARGS__)
+
+#define siw_dbg_mem(mem, fmt, ...)                                             \
+	ibdev_dbg(&mem->sdev->base_dev,                                        \
+		  "MEM[0x%08x] %s: " fmt, mem->stag, __func__, ##__VA_ARGS__)
+
+#define siw_dbg_cep(cep, fmt, ...)                                             \
+	ibdev_dbg(&cep->sdev->base_dev, "CEP[0x%pK] %s: " fmt,                  \
+		  cep, __func__, ##__VA_ARGS__)
+
+void siw_cq_flush(struct siw_cq *cq);
+void siw_sq_flush(struct siw_qp *qp);
+void siw_rq_flush(struct siw_qp *qp);
+int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc);
+
+#endif
diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c
new file mode 100644
index 0000000..8c1931a
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_cm.c
@@ -0,0 +1,2074 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/*          Fredy Neeser */
+/*          Greg Joyce <greg@opengridcomputing.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+/* Copyright (c) 2017, Open Grid Computing, Inc. */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#include <linux/workqueue.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/inet.h>
+#include <linux/tcp.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_cm.h"
+
+/*
+ * Set to any combination of
+ * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR
+ */
+static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR;
+static const bool relaxed_ird_negotiation = 1;
+
+static void siw_cm_llp_state_change(struct sock *s);
+static void siw_cm_llp_data_ready(struct sock *s);
+static void siw_cm_llp_write_space(struct sock *s);
+static void siw_cm_llp_error_report(struct sock *s);
+static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
+			 int status);
+
+static void siw_sk_assign_cm_upcalls(struct sock *sk)
+{
+	write_lock_bh(&sk->sk_callback_lock);
+	sk->sk_state_change = siw_cm_llp_state_change;
+	sk->sk_data_ready = siw_cm_llp_data_ready;
+	sk->sk_write_space = siw_cm_llp_write_space;
+	sk->sk_error_report = siw_cm_llp_error_report;
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void siw_sk_save_upcalls(struct sock *sk)
+{
+	struct siw_cep *cep = sk_to_cep(sk);
+
+	write_lock_bh(&sk->sk_callback_lock);
+	cep->sk_state_change = sk->sk_state_change;
+	cep->sk_data_ready = sk->sk_data_ready;
+	cep->sk_write_space = sk->sk_write_space;
+	cep->sk_error_report = sk->sk_error_report;
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep)
+{
+	sk->sk_state_change = cep->sk_state_change;
+	sk->sk_data_ready = cep->sk_data_ready;
+	sk->sk_write_space = cep->sk_write_space;
+	sk->sk_error_report = cep->sk_error_report;
+	sk->sk_user_data = NULL;
+}
+
+static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp)
+{
+	struct socket *s = cep->sock;
+	struct sock *sk = s->sk;
+
+	write_lock_bh(&sk->sk_callback_lock);
+
+	qp->attrs.sk = s;
+	sk->sk_data_ready = siw_qp_llp_data_ready;
+	sk->sk_write_space = siw_qp_llp_write_space;
+
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void siw_socket_disassoc(struct socket *s)
+{
+	struct sock *sk = s->sk;
+	struct siw_cep *cep;
+
+	if (sk) {
+		write_lock_bh(&sk->sk_callback_lock);
+		cep = sk_to_cep(sk);
+		if (cep) {
+			siw_sk_restore_upcalls(sk, cep);
+			siw_cep_put(cep);
+		} else {
+			pr_warn("siw: cannot restore sk callbacks: no ep\n");
+		}
+		write_unlock_bh(&sk->sk_callback_lock);
+	} else {
+		pr_warn("siw: cannot restore sk callbacks: no sk\n");
+	}
+}
+
+static void siw_rtr_data_ready(struct sock *sk)
+{
+	struct siw_cep *cep;
+	struct siw_qp *qp = NULL;
+	read_descriptor_t rd_desc;
+
+	read_lock(&sk->sk_callback_lock);
+
+	cep = sk_to_cep(sk);
+	if (!cep) {
+		WARN(1, "No connection endpoint\n");
+		goto out;
+	}
+	qp = sk_to_qp(sk);
+
+	memset(&rd_desc, 0, sizeof(rd_desc));
+	rd_desc.arg.data = qp;
+	rd_desc.count = 1;
+
+	tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
+	/*
+	 * Check if the first frame was successfully processed.
+	 * If so, signal full connection establishment.
+	 * Failed data processing would have already scheduled
+	 * connection drop.
+	 */
+	if (!qp->rx_stream.rx_suspend)
+		siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
+out:
+	read_unlock(&sk->sk_callback_lock);
+	if (qp)
+		siw_qp_socket_assoc(cep, qp);
+}
+
+static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep)
+{
+	struct sock *sk = cep->sock->sk;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	sk->sk_data_ready = siw_rtr_data_ready;
+	sk->sk_write_space = siw_qp_llp_write_space;
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s)
+{
+	cep->sock = s;
+	siw_cep_get(cep);
+	s->sk->sk_user_data = cep;
+
+	siw_sk_save_upcalls(s->sk);
+	siw_sk_assign_cm_upcalls(s->sk);
+}
+
+static struct siw_cep *siw_cep_alloc(struct siw_device *sdev)
+{
+	struct siw_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL);
+	unsigned long flags;
+
+	if (!cep)
+		return NULL;
+
+	INIT_LIST_HEAD(&cep->listenq);
+	INIT_LIST_HEAD(&cep->devq);
+	INIT_LIST_HEAD(&cep->work_freelist);
+
+	kref_init(&cep->ref);
+	cep->state = SIW_EPSTATE_IDLE;
+	init_waitqueue_head(&cep->waitq);
+	spin_lock_init(&cep->lock);
+	cep->sdev = sdev;
+	cep->enhanced_rdma_conn_est = false;
+
+	spin_lock_irqsave(&sdev->lock, flags);
+	list_add_tail(&cep->devq, &sdev->cep_list);
+	spin_unlock_irqrestore(&sdev->lock, flags);
+
+	siw_dbg_cep(cep, "new endpoint\n");
+	return cep;
+}
+
+static void siw_cm_free_work(struct siw_cep *cep)
+{
+	struct list_head *w, *tmp;
+	struct siw_cm_work *work;
+
+	list_for_each_safe(w, tmp, &cep->work_freelist) {
+		work = list_entry(w, struct siw_cm_work, list);
+		list_del(&work->list);
+		kfree(work);
+	}
+}
+
+static void siw_cancel_mpatimer(struct siw_cep *cep)
+{
+	spin_lock_bh(&cep->lock);
+	if (cep->mpa_timer) {
+		if (cancel_delayed_work(&cep->mpa_timer->work)) {
+			siw_cep_put(cep);
+			kfree(cep->mpa_timer); /* not needed again */
+		}
+		cep->mpa_timer = NULL;
+	}
+	spin_unlock_bh(&cep->lock);
+}
+
+static void siw_put_work(struct siw_cm_work *work)
+{
+	INIT_LIST_HEAD(&work->list);
+	spin_lock_bh(&work->cep->lock);
+	list_add(&work->list, &work->cep->work_freelist);
+	spin_unlock_bh(&work->cep->lock);
+}
+
+static void siw_cep_set_inuse(struct siw_cep *cep)
+{
+	unsigned long flags;
+retry:
+	spin_lock_irqsave(&cep->lock, flags);
+
+	if (cep->in_use) {
+		spin_unlock_irqrestore(&cep->lock, flags);
+		wait_event_interruptible(cep->waitq, !cep->in_use);
+		if (signal_pending(current))
+			flush_signals(current);
+		goto retry;
+	} else {
+		cep->in_use = 1;
+		spin_unlock_irqrestore(&cep->lock, flags);
+	}
+}
+
+static void siw_cep_set_free(struct siw_cep *cep)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cep->lock, flags);
+	cep->in_use = 0;
+	spin_unlock_irqrestore(&cep->lock, flags);
+
+	wake_up(&cep->waitq);
+}
+
+static void __siw_cep_dealloc(struct kref *ref)
+{
+	struct siw_cep *cep = container_of(ref, struct siw_cep, ref);
+	struct siw_device *sdev = cep->sdev;
+	unsigned long flags;
+
+	WARN_ON(cep->listen_cep);
+
+	/* kfree(NULL) is safe */
+	kfree(cep->mpa.pdata);
+	spin_lock_bh(&cep->lock);
+	if (!list_empty(&cep->work_freelist))
+		siw_cm_free_work(cep);
+	spin_unlock_bh(&cep->lock);
+
+	spin_lock_irqsave(&sdev->lock, flags);
+	list_del(&cep->devq);
+	spin_unlock_irqrestore(&sdev->lock, flags);
+
+	siw_dbg_cep(cep, "free endpoint\n");
+	kfree(cep);
+}
+
+static struct siw_cm_work *siw_get_work(struct siw_cep *cep)
+{
+	struct siw_cm_work *work = NULL;
+
+	spin_lock_bh(&cep->lock);
+	if (!list_empty(&cep->work_freelist)) {
+		work = list_entry(cep->work_freelist.next, struct siw_cm_work,
+				  list);
+		list_del_init(&work->list);
+	}
+	spin_unlock_bh(&cep->lock);
+	return work;
+}
+
+static int siw_cm_alloc_work(struct siw_cep *cep, int num)
+{
+	struct siw_cm_work *work;
+
+	while (num--) {
+		work = kmalloc(sizeof(*work), GFP_KERNEL);
+		if (!work) {
+			if (!(list_empty(&cep->work_freelist)))
+				siw_cm_free_work(cep);
+			return -ENOMEM;
+		}
+		work->cep = cep;
+		INIT_LIST_HEAD(&work->list);
+		list_add(&work->list, &cep->work_freelist);
+	}
+	return 0;
+}
+
+/*
+ * siw_cm_upcall()
+ *
+ * Upcall to IWCM to inform about async connection events
+ */
+static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
+			 int status)
+{
+	struct iw_cm_event event;
+	struct iw_cm_id *id;
+
+	memset(&event, 0, sizeof(event));
+	event.status = status;
+	event.event = reason;
+
+	if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
+		event.provider_data = cep;
+		id = cep->listen_cep->cm_id;
+	} else {
+		id = cep->cm_id;
+	}
+	/* Signal IRD and ORD */
+	if (reason == IW_CM_EVENT_ESTABLISHED ||
+	    reason == IW_CM_EVENT_CONNECT_REPLY) {
+		/* Signal negotiated IRD/ORD values we will use */
+		event.ird = cep->ird;
+		event.ord = cep->ord;
+	} else if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
+		event.ird = cep->ord;
+		event.ord = cep->ird;
+	}
+	/* Signal private data and address information */
+	if (reason == IW_CM_EVENT_CONNECT_REQUEST ||
+	    reason == IW_CM_EVENT_CONNECT_REPLY) {
+		u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len);
+
+		if (pd_len) {
+			/*
+			 * hand over MPA private data
+			 */
+			event.private_data_len = pd_len;
+			event.private_data = cep->mpa.pdata;
+
+			/* Hide MPA V2 IRD/ORD control */
+			if (cep->enhanced_rdma_conn_est) {
+				event.private_data_len -=
+					sizeof(struct mpa_v2_data);
+				event.private_data +=
+					sizeof(struct mpa_v2_data);
+			}
+		}
+		getname_local(cep->sock, &event.local_addr);
+		getname_peer(cep->sock, &event.remote_addr);
+	}
+	siw_dbg_cep(cep, "[QP %u]: reason=%d, status=%d\n",
+		    cep->qp ? qp_id(cep->qp) : UINT_MAX, reason, status);
+
+	return id->event_handler(id, &event);
+}
+
+/*
+ * siw_qp_cm_drop()
+ *
+ * Drops established LLP connection if present and not already
+ * scheduled for dropping. Called from user context, SQ workqueue
+ * or receive IRQ. Caller signals if socket can be immediately
+ * closed (basically, if not in IRQ).
+ */
+void siw_qp_cm_drop(struct siw_qp *qp, int schedule)
+{
+	struct siw_cep *cep = qp->cep;
+
+	qp->rx_stream.rx_suspend = 1;
+	qp->tx_ctx.tx_suspend = 1;
+
+	if (!qp->cep)
+		return;
+
+	if (schedule) {
+		siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP);
+	} else {
+		siw_cep_set_inuse(cep);
+
+		if (cep->state == SIW_EPSTATE_CLOSED) {
+			siw_dbg_cep(cep, "already closed\n");
+			goto out;
+		}
+		siw_dbg_cep(cep, "immediate close, state %d\n", cep->state);
+
+		if (qp->term_info.valid)
+			siw_send_terminate(qp);
+
+		if (cep->cm_id) {
+			switch (cep->state) {
+			case SIW_EPSTATE_AWAIT_MPAREP:
+				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+					      -EINVAL);
+				break;
+
+			case SIW_EPSTATE_RDMA_MODE:
+				siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
+				break;
+
+			case SIW_EPSTATE_IDLE:
+			case SIW_EPSTATE_LISTENING:
+			case SIW_EPSTATE_CONNECTING:
+			case SIW_EPSTATE_AWAIT_MPAREQ:
+			case SIW_EPSTATE_RECVD_MPAREQ:
+			case SIW_EPSTATE_CLOSED:
+			default:
+				break;
+			}
+			cep->cm_id->rem_ref(cep->cm_id);
+			cep->cm_id = NULL;
+			siw_cep_put(cep);
+		}
+		cep->state = SIW_EPSTATE_CLOSED;
+
+		if (cep->sock) {
+			siw_socket_disassoc(cep->sock);
+			/*
+			 * Immediately close socket
+			 */
+			sock_release(cep->sock);
+			cep->sock = NULL;
+		}
+		if (cep->qp) {
+			cep->qp = NULL;
+			siw_qp_put(qp);
+		}
+out:
+		siw_cep_set_free(cep);
+	}
+}
+
+void siw_cep_put(struct siw_cep *cep)
+{
+	WARN_ON(kref_read(&cep->ref) < 1);
+	kref_put(&cep->ref, __siw_cep_dealloc);
+}
+
+void siw_cep_get(struct siw_cep *cep)
+{
+	kref_get(&cep->ref);
+}
+
+/*
+ * Expects params->pd_len in host byte order
+ */
+static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len)
+{
+	struct socket *s = cep->sock;
+	struct mpa_rr *rr = &cep->mpa.hdr;
+	struct kvec iov[3];
+	struct msghdr msg;
+	int rv;
+	int iovec_num = 0;
+	int mpa_len;
+
+	memset(&msg, 0, sizeof(msg));
+
+	iov[iovec_num].iov_base = rr;
+	iov[iovec_num].iov_len = sizeof(*rr);
+	mpa_len = sizeof(*rr);
+
+	if (cep->enhanced_rdma_conn_est) {
+		iovec_num++;
+		iov[iovec_num].iov_base = &cep->mpa.v2_ctrl;
+		iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl);
+		mpa_len += sizeof(cep->mpa.v2_ctrl);
+	}
+	if (pd_len) {
+		iovec_num++;
+		iov[iovec_num].iov_base = (char *)pdata;
+		iov[iovec_num].iov_len = pd_len;
+		mpa_len += pd_len;
+	}
+	if (cep->enhanced_rdma_conn_est)
+		pd_len += sizeof(cep->mpa.v2_ctrl);
+
+	rr->params.pd_len = cpu_to_be16(pd_len);
+
+	rv = kernel_sendmsg(s, &msg, iov, iovec_num + 1, mpa_len);
+
+	return rv < 0 ? rv : 0;
+}
+
+/*
+ * Receive MPA Request/Reply header.
+ *
+ * Returns 0 if complete MPA Request/Reply header including
+ * any private data was received. Returns -EAGAIN if the
+ * header was only partially received, or a negative error code otherwise.
+ *
+ * Context: May be called in process context only
+ */
+static int siw_recv_mpa_rr(struct siw_cep *cep)
+{
+	struct mpa_rr *hdr = &cep->mpa.hdr;
+	struct socket *s = cep->sock;
+	u16 pd_len;
+	int rcvd, to_rcv;
+
+	if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) {
+		rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd,
+				  sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd,
+				  0);
+		if (rcvd <= 0)
+			return -ECONNABORTED;
+
+		cep->mpa.bytes_rcvd += rcvd;
+
+		if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr))
+			return -EAGAIN;
+
+		if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA)
+			return -EPROTO;
+	}
+	pd_len = be16_to_cpu(hdr->params.pd_len);
+
+	/*
+	 * At least the MPA Request/Reply header (frame not including
+	 * private data) has been received.
+	 * Receive (or continue receiving) any private data.
+	 */
+	to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr));
+
+	if (!to_rcv) {
+		/*
+		 * We must have hdr->params.pd_len == 0 and thus received a
+		 * complete MPA Request/Reply frame.
+		 * Check against peer protocol violation.
+		 */
+		u32 word;
+
+		rcvd = ksock_recv(s, (char *)&word, sizeof(word), MSG_DONTWAIT);
+		if (rcvd == -EAGAIN)
+			return 0;
+
+		if (rcvd == 0) {
+			siw_dbg_cep(cep, "peer EOF\n");
+			return -EPIPE;
+		}
+		if (rcvd < 0) {
+			siw_dbg_cep(cep, "error: %d\n", rcvd);
+			return rcvd;
+		}
+		siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd);
+
+		return -EPROTO;
+	}
+
+	/*
+	 * At this point, we must have hdr->params.pd_len != 0.
+	 * A private data buffer gets allocated if hdr->params.pd_len != 0.
+	 */
+	if (!cep->mpa.pdata) {
+		cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL);
+		if (!cep->mpa.pdata)
+			return -ENOMEM;
+	}
+	rcvd = ksock_recv(
+		s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr),
+		to_rcv + 4, MSG_DONTWAIT);
+
+	if (rcvd < 0)
+		return rcvd;
+
+	if (rcvd > to_rcv)
+		return -EPROTO;
+
+	cep->mpa.bytes_rcvd += rcvd;
+
+	if (to_rcv == rcvd) {
+		siw_dbg_cep(cep, "%d bytes private data received\n", pd_len);
+		return 0;
+	}
+	return -EAGAIN;
+}
+
+/*
+ * siw_proc_mpareq()
+ *
+ * Read MPA Request from socket and signal new connection to IWCM
+ * if success. Caller must hold lock on corresponding listening CEP.
+ */
+static int siw_proc_mpareq(struct siw_cep *cep)
+{
+	struct mpa_rr *req;
+	int version, rv;
+	u16 pd_len;
+
+	rv = siw_recv_mpa_rr(cep);
+	if (rv)
+		return rv;
+
+	req = &cep->mpa.hdr;
+
+	version = __mpa_rr_revision(req->params.bits);
+	pd_len = be16_to_cpu(req->params.pd_len);
+
+	if (version > MPA_REVISION_2)
+		/* allow for 0, 1, and 2 only */
+		return -EPROTO;
+
+	if (memcmp(req->key, MPA_KEY_REQ, 16))
+		return -EPROTO;
+
+	/* Prepare for sending MPA reply */
+	memcpy(req->key, MPA_KEY_REP, 16);
+
+	if (version == MPA_REVISION_2 &&
+	    (req->params.bits & MPA_RR_FLAG_ENHANCED)) {
+		/*
+		 * MPA version 2 must signal IRD/ORD values and P2P mode
+		 * in private data if header flag MPA_RR_FLAG_ENHANCED
+		 * is set.
+		 */
+		if (pd_len < sizeof(struct mpa_v2_data))
+			goto reject_conn;
+
+		cep->enhanced_rdma_conn_est = true;
+	}
+
+	/* MPA Markers: currently not supported. Marker TX to be added. */
+	if (req->params.bits & MPA_RR_FLAG_MARKERS)
+		goto reject_conn;
+
+	if (req->params.bits & MPA_RR_FLAG_CRC) {
+		/*
+		 * RFC 5044, page 27: CRC MUST be used if peer requests it.
+		 * siw specific: reject the connection if the peer requests
+		 * CRC, local CRC support is off, and the 'mpa_crc_strict'
+		 * module parameter enforces strict matching.
+		 */
+		if (!mpa_crc_required && mpa_crc_strict)
+			goto reject_conn;
+
+		/* Enable CRC if requested by module parameter */
+		if (mpa_crc_required)
+			req->params.bits |= MPA_RR_FLAG_CRC;
+	}
+	if (cep->enhanced_rdma_conn_est) {
+		struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata;
+
+		/*
+		 * Peer requested ORD becomes requested local IRD,
+		 * peer requested IRD becomes requested local ORD.
+		 * IRD and ORD get limited by global maximum values.
+		 */
+		cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
+		cep->ord = min(cep->ord, SIW_MAX_ORD_QP);
+		cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
+		cep->ird = min(cep->ird, SIW_MAX_IRD_QP);
+
+		/* May get overwritten by locally negotiated values */
+		cep->mpa.v2_ctrl.ird = htons(cep->ird);
+		cep->mpa.v2_ctrl.ord = htons(cep->ord);
+
+		/*
+		 * Support for peer sent zero length Write or Read to
+		 * let local side enter RTS. Writes are preferred.
+		 * Sends would require pre-posting a Receive and are
+		 * not supported.
+		 * Propose zero length Write if none of Read and Write
+		 * is indicated.
+		 */
+		if (v2->ird & MPA_V2_PEER_TO_PEER) {
+			cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
+
+			if (v2->ord & MPA_V2_RDMA_WRITE_RTR)
+				cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
+			else if (v2->ord & MPA_V2_RDMA_READ_RTR)
+				cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR;
+			else
+				cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
+		}
+	}
+
+	cep->state = SIW_EPSTATE_RECVD_MPAREQ;
+
+	/* Keep reference until IWCM accepts/rejects */
+	siw_cep_get(cep);
+	rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0);
+	if (rv)
+		siw_cep_put(cep);
+
+	return rv;
+
+reject_conn:
+	siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n",
+		    req->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
+		    mpa_crc_required, mpa_crc_strict,
+		    req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);
+
+	req->params.bits &= ~MPA_RR_FLAG_MARKERS;
+	req->params.bits |= MPA_RR_FLAG_REJECT;
+
+	if (!mpa_crc_required && mpa_crc_strict)
+		req->params.bits &= ~MPA_RR_FLAG_CRC;
+
+	if (pd_len)
+		kfree(cep->mpa.pdata);
+
+	cep->mpa.pdata = NULL;
+
+	siw_send_mpareqrep(cep, NULL, 0);
+
+	return -EOPNOTSUPP;
+}
+
+static int siw_proc_mpareply(struct siw_cep *cep)
+{
+	struct siw_qp_attrs qp_attrs;
+	enum siw_qp_attr_mask qp_attr_mask;
+	struct siw_qp *qp = cep->qp;
+	struct mpa_rr *rep;
+	int rv;
+	u16 rep_ord;
+	u16 rep_ird;
+	bool ird_insufficient = false;
+	enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR;
+
+	rv = siw_recv_mpa_rr(cep);
+	if (rv != -EAGAIN)
+		siw_cancel_mpatimer(cep);
+	if (rv)
+		goto out_err;
+
+	rep = &cep->mpa.hdr;
+
+	if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) {
+		/* allow for 0, 1,  and 2 only */
+		rv = -EPROTO;
+		goto out_err;
+	}
+	if (memcmp(rep->key, MPA_KEY_REP, 16)) {
+		siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA,
+				   LLP_ECODE_INVALID_REQ_RESP, 0);
+		siw_send_terminate(qp);
+		rv = -EPROTO;
+		goto out_err;
+	}
+	if (rep->params.bits & MPA_RR_FLAG_REJECT) {
+		siw_dbg_cep(cep, "got mpa reject\n");
+		siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET);
+
+		return -ECONNRESET;
+	}
+	if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) {
+		siw_dbg_cep(cep, "peer allows GSO on TX\n");
+		qp->tx_ctx.gso_seg_limit = 0;
+	}
+	if ((rep->params.bits & MPA_RR_FLAG_MARKERS) ||
+	    (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) ||
+	    (mpa_crc_strict && !mpa_crc_required &&
+	     (rep->params.bits & MPA_RR_FLAG_CRC))) {
+		siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n",
+			    rep->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
+			    mpa_crc_required, mpa_crc_strict,
+			    rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);
+
+		siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED);
+
+		return -EINVAL;
+	}
+	if (cep->enhanced_rdma_conn_est) {
+		struct mpa_v2_data *v2;
+
+		if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 ||
+		    !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) {
+			/*
+			 * Protocol failure: The responder MUST reply with
+			 * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED.
+			 */
+			siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n",
+				    __mpa_rr_revision(rep->params.bits),
+				    rep->params.bits & MPA_RR_FLAG_ENHANCED ?
+					    1 :
+					    0);
+
+			siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+				      -ECONNRESET);
+			return -EINVAL;
+		}
+		v2 = (struct mpa_v2_data *)cep->mpa.pdata;
+		rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
+		rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
+
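+		/*
+		 * The peer's ORD must not exceed the local IRD, and
+		 * the local ORD must not exceed the peer's IRD, unless
+		 * relaxed IRD negotiation tolerates the mismatch.
+		 */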
+		if (cep->ird < rep_ord &&
+		    (relaxed_ird_negotiation == false ||
+		     rep_ord > cep->sdev->attrs.max_ird)) {
+			siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n",
+				    cep->ird, rep_ord,
+				    cep->sdev->attrs.max_ord);
+			ird_insufficient = true;
+		}
+		if (cep->ord > rep_ird && relaxed_ird_negotiation == false) {
+			siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord,
+				    rep_ird);
+			ird_insufficient = true;
+		}
+		/*
+		 * Always report negotiated peer values to user,
+		 * even if IRD/ORD negotiation failed
+		 */
+		cep->ird = rep_ord;
+		cep->ord = rep_ird;
+
+		if (ird_insufficient) {
+			/*
+			 * If the initiator IRD is insufficient for the
+			 * responder ORD, send a TERM.
+			 */
+			siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
+					   LLP_ETYPE_MPA,
+					   LLP_ECODE_INSUFFICIENT_IRD, 0);
+			siw_send_terminate(qp);
+			rv = -ENOMEM;
+			goto out_err;
+		}
+		if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER)
+			mpa_p2p_mode =
+				cep->mpa.v2_ctrl_req.ord &
+				(MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR);
+
+		/*
+		 * Check if we requested P2P mode, and if peer agrees
+		 */
+		if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
+			if ((mpa_p2p_mode & v2->ord) == 0) {
+				/*
+				 * We requested RTR mode(s), but the peer
+				 * did not pick any mode we support.
+				 */
+				siw_dbg_cep(cep,
+					    "rtr mode:  req %2x, got %2x\n",
+					    mpa_p2p_mode,
+					    v2->ord & (MPA_V2_RDMA_WRITE_RTR |
+						       MPA_V2_RDMA_READ_RTR));
+
+				siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
+						   LLP_ETYPE_MPA,
+						   LLP_ECODE_NO_MATCHING_RTR,
+						   0);
+				siw_send_terminate(qp);
+				rv = -EPROTO;
+				goto out_err;
+			}
+			mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR |
+						  MPA_V2_RDMA_READ_RTR);
+		}
+	}
+	memset(&qp_attrs, 0, sizeof(qp_attrs));
+
+	if (rep->params.bits & MPA_RR_FLAG_CRC)
+		qp_attrs.flags = SIW_MPA_CRC;
+
+	qp_attrs.irq_size = cep->ird;
+	qp_attrs.orq_size = cep->ord;
+	qp_attrs.sk = cep->sock;
+	qp_attrs.state = SIW_QP_STATE_RTS;
+
+	qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
+		       SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA;
+
+	/* Move socket RX/TX under QP control */
+	down_write(&qp->state_lock);
+	if (qp->attrs.state > SIW_QP_STATE_RTR) {
+		rv = -EINVAL;
+		up_write(&qp->state_lock);
+		goto out_err;
+	}
+	rv = siw_qp_modify(qp, &qp_attrs, qp_attr_mask);
+
+	siw_qp_socket_assoc(cep, qp);
+
+	up_write(&qp->state_lock);
+
+	/* Send extra RDMA frame to trigger peer RTS if negotiated */
+	if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
+		rv = siw_qp_mpa_rts(qp, mpa_p2p_mode);
+		if (rv)
+			goto out_err;
+	}
+	if (!rv) {
+		rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0);
+		if (!rv)
+			cep->state = SIW_EPSTATE_RDMA_MODE;
+
+		return 0;
+	}
+
+out_err:
+	siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL);
+
+	return rv;
+}
+
+/*
+ * siw_accept_newconn - accept an incoming pending connection
+ *
+ */
+static void siw_accept_newconn(struct siw_cep *cep)
+{
+	struct socket *s = cep->sock;
+	struct socket *new_s = NULL;
+	struct siw_cep *new_cep = NULL;
+	int rv = 0; /* debug only. should disappear */
+
+	if (cep->state != SIW_EPSTATE_LISTENING)
+		goto error;
+
+	new_cep = siw_cep_alloc(cep->sdev);
+	if (!new_cep)
+		goto error;
+
+	/*
+	 * 4: Allocate a sufficient number of work elements
+	 * to allow concurrent handling of local + peer close
+	 * events, MPA header processing + MPA timeout.
+	 */
+	if (siw_cm_alloc_work(new_cep, 4) != 0)
+		goto error;
+
+	/*
+	 * Copy saved socket callbacks from listening CEP
+	 * and assign new socket with new CEP
+	 */
+	new_cep->sk_state_change = cep->sk_state_change;
+	new_cep->sk_data_ready = cep->sk_data_ready;
+	new_cep->sk_write_space = cep->sk_write_space;
+	new_cep->sk_error_report = cep->sk_error_report;
+
+	rv = kernel_accept(s, &new_s, O_NONBLOCK);
+	if (rv != 0) {
+		/*
+		 * Connection already aborted by peer?
+		 */
+		siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv);
+		goto error;
+	}
+	new_cep->sock = new_s;
+	siw_cep_get(new_cep);
+	new_s->sk->sk_user_data = new_cep;
+
+	if (siw_tcp_nagle == false) {
+		int val = 1;
+
+		rv = kernel_setsockopt(new_s, SOL_TCP, TCP_NODELAY,
+				       (char *)&val, sizeof(val));
+		if (rv) {
+			siw_dbg_cep(cep, "setsockopt NODELAY error: %d\n", rv);
+			goto error;
+		}
+	}
+	new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ;
+
+	rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT);
+	if (rv)
+		goto error;
+	/*
+	 * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep.
+	 */
+	new_cep->listen_cep = cep;
+	siw_cep_get(cep);
+
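+	/*
+	 * Data may have arrived before the socket callbacks were
+	 * redirected to siw: check the receive queue and process
+	 * an already queued MPA request inline.
+	 */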
+	if (atomic_read(&new_s->sk->sk_rmem_alloc)) {
+		/*
+		 * MPA REQ already queued
+		 */
+		siw_dbg_cep(cep, "immediate mpa request\n");
+
+		siw_cep_set_inuse(new_cep);
+		rv = siw_proc_mpareq(new_cep);
+		siw_cep_set_free(new_cep);
+
+		if (rv != -EAGAIN) {
+			siw_cep_put(cep);
+			new_cep->listen_cep = NULL;
+			if (rv)
+				goto error;
+		}
+	}
+	return;
+
+error:
+	if (new_cep)
+		siw_cep_put(new_cep);
+
+	if (new_s) {
+		siw_socket_disassoc(new_s);
+		sock_release(new_s);
+		new_cep->sock = NULL;
+	}
+	siw_dbg_cep(cep, "error %d\n", rv);
+}
+
+static void siw_cm_work_handler(struct work_struct *w)
+{
+	struct siw_cm_work *work;
+	struct siw_cep *cep;
+	int release_cep = 0, rv = 0;
+
+	work = container_of(w, struct siw_cm_work, work.work);
+	cep = work->cep;
+
+	siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n",
+		    cep->qp ? qp_id(cep->qp) : UINT_MAX,
+		    work->type, cep->state);
+
+	siw_cep_set_inuse(cep);
+
+	switch (work->type) {
+	case SIW_CM_WORK_ACCEPT:
+		siw_accept_newconn(cep);
+		break;
+
+	case SIW_CM_WORK_READ_MPAHDR:
+		if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
+			if (cep->listen_cep) {
+				siw_cep_set_inuse(cep->listen_cep);
+
+				if (cep->listen_cep->state ==
+				    SIW_EPSTATE_LISTENING)
+					rv = siw_proc_mpareq(cep);
+				else
+					rv = -EFAULT;
+
+				siw_cep_set_free(cep->listen_cep);
+
+				if (rv != -EAGAIN) {
+					siw_cep_put(cep->listen_cep);
+					cep->listen_cep = NULL;
+					if (rv)
+						siw_cep_put(cep);
+				}
+			}
+		} else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
+			rv = siw_proc_mpareply(cep);
+		} else {
+			/*
+			 * CEP already moved out of MPA handshake.
+			 * Any connection management was already done;
+			 * silently ignore the MPA packet.
+			 */
+			if (cep->state == SIW_EPSTATE_RDMA_MODE) {
+				cep->sock->sk->sk_data_ready(cep->sock->sk);
+				siw_dbg_cep(cep, "already in RDMA mode");
+			} else {
+				siw_dbg_cep(cep, "out of state: %d\n",
+					    cep->state);
+			}
+		}
+		if (rv && rv != -EAGAIN)
+			release_cep = 1;
+		break;
+
+	case SIW_CM_WORK_CLOSE_LLP:
+		/*
+		 * QP scheduled LLP close
+		 */
+		if (cep->qp && cep->qp->term_info.valid)
+			siw_send_terminate(cep->qp);
+
+		if (cep->cm_id)
+			siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
+
+		release_cep = 1;
+		break;
+
+	case SIW_CM_WORK_PEER_CLOSE:
+		if (cep->cm_id) {
+			if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
+				/*
+				 * MPA reply not received, but connection drop
+				 */
+				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+					      -ECONNRESET);
+			} else if (cep->state == SIW_EPSTATE_RDMA_MODE) {
+				/*
+				 * NOTE: IW_CM_EVENT_DISCONNECT is given just
+				 *       to transition IWCM into CLOSING.
+				 */
+				siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0);
+				siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
+			}
+			/*
+			 * for other states there is no connection
+			 * known to the IWCM.
+			 */
+		} else {
+			if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) {
+				/*
+				 * Wait for the ulp/CM to call accept/reject
+				 */
+				siw_dbg_cep(cep,
+					    "mpa req recvd, wait for ULP\n");
+			} else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
+				/*
+				 * Socket close before MPA request received.
+				 */
+				siw_dbg_cep(cep, "no mpareq: drop listener\n");
+				siw_cep_put(cep->listen_cep);
+				cep->listen_cep = NULL;
+			}
+		}
+		release_cep = 1;
+		break;
+
+	case SIW_CM_WORK_MPATIMEOUT:
+		cep->mpa_timer = NULL;
+
+		if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
+			/*
+			 * MPA request timed out:
+			 * Hide any partially received private data and signal
+			 * timeout
+			 */
+			cep->mpa.hdr.params.pd_len = 0;
+
+			if (cep->cm_id)
+				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+					      -ETIMEDOUT);
+			release_cep = 1;
+
+		} else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
+			/*
+			 * No MPA request received after peer TCP stream setup.
+			 */
+			if (cep->listen_cep) {
+				siw_cep_put(cep->listen_cep);
+				cep->listen_cep = NULL;
+			}
+			release_cep = 1;
+		}
+		break;
+
+	default:
+		WARN(1, "Undefined CM work type: %d\n", work->type);
+	}
+	if (release_cep) {
+		siw_dbg_cep(cep,
+			    "release: timer=%s, QP[%u]\n",
+			    cep->mpa_timer ? "y" : "n",
+			    cep->qp ? qp_id(cep->qp) : UINT_MAX);
+
+		siw_cancel_mpatimer(cep);
+
+		cep->state = SIW_EPSTATE_CLOSED;
+
+		if (cep->qp) {
+			struct siw_qp *qp = cep->qp;
+			/*
+			 * Serialize a potential race with application
+			 * closing the QP and calling siw_qp_cm_drop()
+			 */
+			siw_qp_get(qp);
+			siw_cep_set_free(cep);
+
+			siw_qp_llp_close(qp);
+			siw_qp_put(qp);
+
+			siw_cep_set_inuse(cep);
+			cep->qp = NULL;
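+			/* drop the CEP's own reference on the QP */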
+			siw_qp_put(qp);
+		}
+		if (cep->sock) {
+			siw_socket_disassoc(cep->sock);
+			sock_release(cep->sock);
+			cep->sock = NULL;
+		}
+		if (cep->cm_id) {
+			cep->cm_id->rem_ref(cep->cm_id);
+			cep->cm_id = NULL;
+			siw_cep_put(cep);
+		}
+	}
+	siw_cep_set_free(cep);
+	siw_put_work(work);
+	siw_cep_put(cep);
+}
+
+static struct workqueue_struct *siw_cm_wq;
+
+int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type)
+{
+	struct siw_cm_work *work = siw_get_work(cep);
+	unsigned long delay = 0;
+
+	if (!work) {
+		siw_dbg_cep(cep, "failed with no work available\n");
+		return -ENOMEM;
+	}
+	work->type = type;
+	work->cep = cep;
+
+	siw_cep_get(cep);
+
+	INIT_DELAYED_WORK(&work->work, siw_cm_work_handler);
+
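+	/*
+	 * Active side: allow MPAREQ_TIMEOUT (10s) while awaiting the
+	 * peer's MPA reply; passive side: allow MPAREP_TIMEOUT (5s)
+	 * for the peer's MPA request to arrive.
+	 */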
+	if (type == SIW_CM_WORK_MPATIMEOUT) {
+		cep->mpa_timer = work;
+
+		if (cep->state == SIW_EPSTATE_AWAIT_MPAREP)
+			delay = MPAREQ_TIMEOUT;
+		else
+			delay = MPAREP_TIMEOUT;
+	}
+	siw_dbg_cep(cep, "[QP %u]: work type: %d, timeout %lu\n",
+		    cep->qp ? qp_id(cep->qp) : UINT_MAX, type, delay);
+
+	queue_delayed_work(siw_cm_wq, &work->work, delay);
+
+	return 0;
+}
+
+static void siw_cm_llp_data_ready(struct sock *sk)
+{
+	struct siw_cep *cep;
+
+	read_lock(&sk->sk_callback_lock);
+
+	cep = sk_to_cep(sk);
+	if (!cep) {
+		WARN_ON(1);
+		goto out;
+	}
+	siw_dbg_cep(cep, "state: %d\n", cep->state);
+
+	switch (cep->state) {
+	case SIW_EPSTATE_RDMA_MODE:
+		/* fall through */
+	case SIW_EPSTATE_LISTENING:
+		break;
+
+	case SIW_EPSTATE_AWAIT_MPAREQ:
+		/* fall through */
+	case SIW_EPSTATE_AWAIT_MPAREP:
+		siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR);
+		break;
+
+	default:
+		siw_dbg_cep(cep, "unexpected data, state %d\n", cep->state);
+		break;
+	}
+out:
+	read_unlock(&sk->sk_callback_lock);
+}
+
+static void siw_cm_llp_write_space(struct sock *sk)
+{
+	struct siw_cep *cep = sk_to_cep(sk);
+
+	if (cep)
+		siw_dbg_cep(cep, "state: %d\n", cep->state);
+}
+
+static void siw_cm_llp_error_report(struct sock *sk)
+{
+	struct siw_cep *cep = sk_to_cep(sk);
+
+	if (cep) {
+		siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n",
+			    sk->sk_err, sk->sk_state, cep->state);
+		cep->sk_error_report(sk);
+	}
+}
+
+static void siw_cm_llp_state_change(struct sock *sk)
+{
+	struct siw_cep *cep;
+	void (*orig_state_change)(struct sock *s);
+
+	read_lock(&sk->sk_callback_lock);
+
+	cep = sk_to_cep(sk);
+	if (!cep) {
+		/* endpoint already disassociated */
+		read_unlock(&sk->sk_callback_lock);
+		return;
+	}
+	orig_state_change = cep->sk_state_change;
+
+	siw_dbg_cep(cep, "state: %d\n", cep->state);
+
+	switch (sk->sk_state) {
+	case TCP_ESTABLISHED:
+		/*
+		 * handle accepting socket as special case where only
+		 * new connection is possible
+		 */
+		siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT);
+		break;
+
+	case TCP_CLOSE:
+	case TCP_CLOSE_WAIT:
+		if (cep->qp)
+			cep->qp->tx_ctx.tx_suspend = 1;
+		siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE);
+		break;
+
+	default:
+		siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state);
+	}
+	read_unlock(&sk->sk_callback_lock);
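+	/* chain to TCP's original upcall outside the callback lock */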
+	orig_state_change(sk);
+}
+
+static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
+			      struct sockaddr *raddr)
+{
+	int rv, flags = 0, s_val = 1;
+	size_t size = laddr->sa_family == AF_INET ?
+		sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+
+	/*
+	 * Make address available again asap.
+	 */
+	rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val,
+			       sizeof(s_val));
+	if (rv < 0)
+		return rv;
+
+	rv = s->ops->bind(s, laddr, size);
+	if (rv < 0)
+		return rv;
+
+	rv = s->ops->connect(s, raddr, size, flags);
+
+	return rv < 0 ? rv : 0;
+}
+
+int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
+{
+	struct siw_device *sdev = to_siw_dev(id->device);
+	struct siw_qp *qp;
+	struct siw_cep *cep = NULL;
+	struct socket *s = NULL;
+	struct sockaddr *laddr = (struct sockaddr *)&id->local_addr,
+			*raddr = (struct sockaddr *)&id->remote_addr;
+	bool p2p_mode = peer_to_peer, v4 = true;
+	u16 pd_len = params->private_data_len;
+	int version = mpa_version, rv;
+
+	if (pd_len > MPA_MAX_PRIVDATA)
+		return -EINVAL;
+
+	if (params->ird > sdev->attrs.max_ird ||
+	    params->ord > sdev->attrs.max_ord)
+		return -ENOMEM;
+
+	if (laddr->sa_family == AF_INET6)
+		v4 = false;
+	else if (laddr->sa_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	/*
+	 * Respect any iwarp port mapping: Use mapped remote address
+	 * if valid. Local address must not be mapped, since siw
+	 * uses kernel TCP stack.
+	 */
+	if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) ||
+	     to_sockaddr_in6(id->remote_addr).sin6_port != 0)
+		raddr = (struct sockaddr *)&id->m_remote_addr;
+
+	qp = siw_qp_id2obj(sdev, params->qpn);
+	if (!qp) {
+		WARN(1, "[QP %u] does not exist\n", params->qpn);
+		rv = -EINVAL;
+		goto error;
+	}
+	if (v4)
+		siw_dbg_qp(qp,
+			   "pd_len %d, laddr %pI4 %d, raddr %pI4 %d\n",
+			   pd_len,
+			   &((struct sockaddr_in *)(laddr))->sin_addr,
+			   ntohs(((struct sockaddr_in *)(laddr))->sin_port),
+			   &((struct sockaddr_in *)(raddr))->sin_addr,
+			   ntohs(((struct sockaddr_in *)(raddr))->sin_port));
+	else
+		siw_dbg_qp(qp,
+			   "pd_len %d, laddr %pI6 %d, raddr %pI6 %d\n",
+			   pd_len,
+			   &((struct sockaddr_in6 *)(laddr))->sin6_addr,
+			   ntohs(((struct sockaddr_in6 *)(laddr))->sin6_port),
+			   &((struct sockaddr_in6 *)(raddr))->sin6_addr,
+			   ntohs(((struct sockaddr_in6 *)(raddr))->sin6_port));
+
+	rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s);
+	if (rv < 0)
+		goto error;
+
+	/*
+	 * NOTE: For simplification, connect() is called in blocking
+	 * mode. Might be reconsidered for async connection setup at
+	 * TCP level.
+	 */
+	rv = kernel_bindconnect(s, laddr, raddr);
+	if (rv != 0) {
+		siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv);
+		goto error;
+	}
+	if (siw_tcp_nagle == false) {
+		int val = 1;
+
+		rv = kernel_setsockopt(s, SOL_TCP, TCP_NODELAY, (char *)&val,
+				       sizeof(val));
+		if (rv) {
+			siw_dbg_qp(qp, "setsockopt NODELAY error: %d\n", rv);
+			goto error;
+		}
+	}
+	cep = siw_cep_alloc(sdev);
+	if (!cep) {
+		rv = -ENOMEM;
+		goto error;
+	}
+	siw_cep_set_inuse(cep);
+
+	/* Associate QP with CEP */
+	siw_cep_get(cep);
+	qp->cep = cep;
+
+	/* siw_qp_get(qp) already done by QP lookup */
+	cep->qp = qp;
+
+	id->add_ref(id);
+	cep->cm_id = id;
+
+	/*
+	 * 4: Allocate a sufficient number of work elements
+	 * to allow concurrent handling of local + peer close
+	 * events, MPA header processing + MPA timeout.
+	 */
+	rv = siw_cm_alloc_work(cep, 4);
+	if (rv != 0) {
+		rv = -ENOMEM;
+		goto error;
+	}
+	cep->ird = params->ird;
+	cep->ord = params->ord;
+
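+	/*
+	 * P2P mode may use a zero length READ as RTR message,
+	 * which needs an outbound read credit; make sure ORD
+	 * is at least 1.
+	 */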
+	if (p2p_mode && cep->ord == 0)
+		cep->ord = 1;
+
+	cep->state = SIW_EPSTATE_CONNECTING;
+
+	/*
+	 * Associate CEP with socket
+	 */
+	siw_cep_socket_assoc(cep, s);
+
+	cep->state = SIW_EPSTATE_AWAIT_MPAREP;
+
+	/*
+	 * Set MPA Request bits: CRC if required, no MPA Markers,
+	 * MPA Rev. according to module parameter 'mpa_version', Key 'Request'.
+	 */
+	cep->mpa.hdr.params.bits = 0;
+	if (version > MPA_REVISION_2) {
+		pr_warn("Setting MPA version to %u\n", MPA_REVISION_2);
+		version = MPA_REVISION_2;
+		/* Adjust also module parameter */
+		mpa_version = MPA_REVISION_2;
+	}
+	__mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version);
+
+	if (try_gso)
+		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP;
+
+	if (mpa_crc_required)
+		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC;
+
+	/*
+	 * If MPA version == 2:
+	 * o Include ORD and IRD.
+	 * o Indicate peer-to-peer mode, if required by module
+	 *   parameter 'peer_to_peer'.
+	 */
+	if (version == MPA_REVISION_2) {
+		cep->enhanced_rdma_conn_est = true;
+		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED;
+
+		cep->mpa.v2_ctrl.ird = htons(cep->ird);
+		cep->mpa.v2_ctrl.ord = htons(cep->ord);
+
+		if (p2p_mode) {
+			cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
+			cep->mpa.v2_ctrl.ord |= rtr_type;
+		}
+		/* Remember own P2P mode requested */
+		cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird;
+		cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord;
+	}
+	memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16);
+
+	rv = siw_send_mpareqrep(cep, params->private_data, pd_len);
+	/*
+	 * Reset private data.
+	 */
+	cep->mpa.hdr.params.pd_len = 0;
+
+	if (rv >= 0) {
+		rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT);
+		if (!rv) {
+			siw_dbg_cep(cep, "[QP %u]: exit\n", qp_id(qp));
+			siw_cep_set_free(cep);
+			return 0;
+		}
+	}
+error:
+	siw_dbg(id->device, "failed: %d\n", rv);
+
+	if (cep) {
+		siw_socket_disassoc(s);
+		sock_release(s);
+		cep->sock = NULL;
+
+		cep->qp = NULL;
+
+		cep->cm_id = NULL;
+		id->rem_ref(id);
+		siw_cep_put(cep);
+
+		qp->cep = NULL;
+		siw_cep_put(cep);
+
+		cep->state = SIW_EPSTATE_CLOSED;
+
+		siw_cep_set_free(cep);
+
+		siw_cep_put(cep);
+
+	} else if (s) {
+		sock_release(s);
+	}
+	if (qp)
+		siw_qp_put(qp);
+
+	return rv;
+}
+
+/*
+ * siw_accept - Let SoftiWARP accept an RDMA connection request
+ *
+ * @id:		New connection management id to be used for accepted
+ *		connection request
+ * @params:	Connection parameters provided by ULP for accepting connection
+ *
+ * Transition QP to RTS state, associate new CM id @id with accepted CEP
+ * and get prepared for TCP input by installing socket callbacks.
+ * Then send MPA Reply and generate the "connection established" event.
+ * Socket callbacks must be installed before sending MPA Reply, because
+ * the latter may cause a first RDMA message to arrive from the RDMA Initiator
+ * side very quickly, at which time the socket callbacks must be ready.
+ */
+int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
+{
+	struct siw_device *sdev = to_siw_dev(id->device);
+	struct siw_cep *cep = (struct siw_cep *)id->provider_data;
+	struct siw_qp *qp;
+	struct siw_qp_attrs qp_attrs;
+	int rv, max_priv_data = MPA_MAX_PRIVDATA;
+	bool wait_for_peer_rts = false;
+
+	siw_cep_set_inuse(cep);
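+	/* drop the reference taken at CONNECT_REQUEST upcall time */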
+	siw_cep_put(cep);
+
+	/* Free lingering inbound private data */
+	if (cep->mpa.hdr.params.pd_len) {
+		cep->mpa.hdr.params.pd_len = 0;
+		kfree(cep->mpa.pdata);
+		cep->mpa.pdata = NULL;
+	}
+	siw_cancel_mpatimer(cep);
+
+	if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
+		siw_dbg_cep(cep, "out of state\n");
+
+		siw_cep_set_free(cep);
+		siw_cep_put(cep);
+
+		return -ECONNRESET;
+	}
+	qp = siw_qp_id2obj(sdev, params->qpn);
+	if (!qp) {
+		WARN(1, "[QP %d] does not exist\n", params->qpn);
+		siw_cep_set_free(cep);
+		siw_cep_put(cep);
+
+		return -EINVAL;
+	}
+	down_write(&qp->state_lock);
+	if (qp->attrs.state > SIW_QP_STATE_RTR) {
+		rv = -EINVAL;
+		up_write(&qp->state_lock);
+		goto error;
+	}
+	siw_dbg_cep(cep, "[QP %d]\n", params->qpn);
+
+	if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) {
+		siw_dbg_cep(cep, "peer allows GSO on TX\n");
+		qp->tx_ctx.gso_seg_limit = 0;
+	}
+	if (params->ord > sdev->attrs.max_ord ||
+	    params->ird > sdev->attrs.max_ird) {
+		siw_dbg_cep(
+			cep,
+			"[QP %u]: ord %d (max %d), ird %d (max %d)\n",
+			qp_id(qp), params->ord, sdev->attrs.max_ord,
+			params->ird, sdev->attrs.max_ird);
+		rv = -EINVAL;
+		up_write(&qp->state_lock);
+		goto error;
+	}
+	if (cep->enhanced_rdma_conn_est)
+		max_priv_data -= sizeof(struct mpa_v2_data);
+
+	if (params->private_data_len > max_priv_data) {
+		siw_dbg_cep(
+			cep,
+			"[QP %u]: private data length: %d (max %d)\n",
+			qp_id(qp), params->private_data_len, max_priv_data);
+		rv = -EINVAL;
+		up_write(&qp->state_lock);
+		goto error;
+	}
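+	/*
+	 * MPA v2: match the ORD/IRD values provided by the ULP
+	 * against what the peer announced, clamping only where the
+	 * relaxed negotiation policy allows it.
+	 */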
+	if (cep->enhanced_rdma_conn_est) {
+		if (params->ord > cep->ord) {
+			if (relaxed_ird_negotiation) {
+				params->ord = cep->ord;
+			} else {
+				cep->ird = params->ird;
+				cep->ord = params->ord;
+				rv = -EINVAL;
+				up_write(&qp->state_lock);
+				goto error;
+			}
+		}
+		if (params->ird < cep->ird) {
+			if (relaxed_ird_negotiation &&
+			    cep->ird <= sdev->attrs.max_ird)
+				params->ird = cep->ird;
+			else {
+				rv = -ENOMEM;
+				up_write(&qp->state_lock);
+				goto error;
+			}
+		}
+		if (cep->mpa.v2_ctrl.ord &
+		    (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR))
+			wait_for_peer_rts = true;
+		/*
+		 * Signal back negotiated IRD and ORD values
+		 */
+		cep->mpa.v2_ctrl.ord =
+			htons(params->ord & MPA_IRD_ORD_MASK) |
+			(cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD);
+		cep->mpa.v2_ctrl.ird =
+			htons(params->ird & MPA_IRD_ORD_MASK) |
+			(cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD);
+	}
+	cep->ird = params->ird;
+	cep->ord = params->ord;
+
+	cep->cm_id = id;
+	id->add_ref(id);
+
+	memset(&qp_attrs, 0, sizeof(qp_attrs));
+	qp_attrs.orq_size = cep->ord;
+	qp_attrs.irq_size = cep->ird;
+	qp_attrs.sk = cep->sock;
+	if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC)
+		qp_attrs.flags = SIW_MPA_CRC;
+	qp_attrs.state = SIW_QP_STATE_RTS;
+
+	siw_dbg_cep(cep, "[QP%u]: moving to rts\n", qp_id(qp));
+
+	/* Associate QP with CEP */
+	siw_cep_get(cep);
+	qp->cep = cep;
+
+	/* siw_qp_get(qp) already done by QP lookup */
+	cep->qp = qp;
+
+	cep->state = SIW_EPSTATE_RDMA_MODE;
+
+	/* Move socket RX/TX under QP control */
+	rv = siw_qp_modify(qp, &qp_attrs,
+			   SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
+				   SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD |
+				   SIW_QP_ATTR_MPA);
+	up_write(&qp->state_lock);
+
+	if (rv)
+		goto error;
+
+	siw_dbg_cep(cep, "[QP %u]: send mpa reply, %d byte pdata\n",
+		    qp_id(qp), params->private_data_len);
+
+	rv = siw_send_mpareqrep(cep, params->private_data,
+				params->private_data_len);
+	if (rv != 0)
+		goto error;
+
+	if (wait_for_peer_rts) {
+		siw_sk_assign_rtr_upcalls(cep);
+	} else {
+		siw_qp_socket_assoc(cep, qp);
+		rv = siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
+		if (rv)
+			goto error;
+	}
+	siw_cep_set_free(cep);
+
+	return 0;
+error:
+	siw_socket_disassoc(cep->sock);
+	sock_release(cep->sock);
+	cep->sock = NULL;
+
+	cep->state = SIW_EPSTATE_CLOSED;
+
+	if (cep->cm_id) {
+		cep->cm_id->rem_ref(id);
+		cep->cm_id = NULL;
+	}
+	if (qp->cep) {
+		siw_cep_put(cep);
+		qp->cep = NULL;
+	}
+	cep->qp = NULL;
+	siw_qp_put(qp);
+
+	siw_cep_set_free(cep);
+	siw_cep_put(cep);
+
+	return rv;
+}
+
+/*
+ * siw_reject()
+ *
+ * Local connection reject case. Send private data back to peer,
+ * close connection and dereference connection id.
+ */
+int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len)
+{
+	struct siw_cep *cep = (struct siw_cep *)id->provider_data;
+
+	siw_cep_set_inuse(cep);
+	siw_cep_put(cep);
+
+	siw_cancel_mpatimer(cep);
+
+	if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
+		siw_dbg_cep(cep, "out of state\n");
+
+		siw_cep_set_free(cep);
+		siw_cep_put(cep); /* put last reference */
+
+		return -ECONNRESET;
+	}
+	siw_dbg_cep(cep, "cep->state %d, pd_len %d\n", cep->state,
+		    pd_len);
+
+	if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) {
+		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */
+		siw_send_mpareqrep(cep, pdata, pd_len);
+	}
+	siw_socket_disassoc(cep->sock);
+	sock_release(cep->sock);
+	cep->sock = NULL;
+
+	cep->state = SIW_EPSTATE_CLOSED;
+
+	siw_cep_set_free(cep);
+	siw_cep_put(cep);
+
+	return 0;
+}
+
+static int siw_listen_address(struct iw_cm_id *id, int backlog,
+			      struct sockaddr *laddr, int addr_family)
+{
+	struct socket *s;
+	struct siw_cep *cep = NULL;
+	struct siw_device *sdev = to_siw_dev(id->device);
+	int rv = 0, s_val;
+
+	rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s);
+	if (rv < 0)
+		return rv;
+
+	/*
+	 * Allow binding local port when still in TIME_WAIT from last close.
+	 */
+	s_val = 1;
+	rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val,
+			       sizeof(s_val));
+	if (rv) {
+		siw_dbg(id->device, "setsockopt error: %d\n", rv);
+		goto error;
+	}
+	rv = s->ops->bind(s, laddr, addr_family == AF_INET ?
+				    sizeof(struct sockaddr_in) :
+				    sizeof(struct sockaddr_in6));
+	if (rv) {
+		siw_dbg(id->device, "socket bind error: %d\n", rv);
+		goto error;
+	}
+	cep = siw_cep_alloc(sdev);
+	if (!cep) {
+		rv = -ENOMEM;
+		goto error;
+	}
+	siw_cep_socket_assoc(cep, s);
+
+	rv = siw_cm_alloc_work(cep, backlog);
+	if (rv) {
+		siw_dbg(id->device,
+			"alloc_work error %d, backlog %d\n",
+			rv, backlog);
+		goto error;
+	}
+	rv = s->ops->listen(s, backlog);
+	if (rv) {
+		siw_dbg(id->device, "listen error %d\n", rv);
+		goto error;
+	}
+	cep->cm_id = id;
+	id->add_ref(id);
+
+	/*
+	 * In case of a wildcard rdma_listen on a multi-homed device,
+	 * a listener's IWCM id is associated with more than one listening CEP.
+	 *
+	 * We currently use id->provider_data in three different ways:
+	 *
+	 * o For a listener's IWCM id, id->provider_data points to
+	 *   the list_head of the list of listening CEPs.
+	 *   Uses: siw_create_listen(), siw_destroy_listen()
+	 *
+	 * o For each accepted passive-side IWCM id, id->provider_data
+	 *   points to the CEP itself. This is a consequence of
+	 *   - siw_cm_upcall() setting event.provider_data = cep and
+	 *   - the IWCM's cm_conn_req_handler() setting provider_data of the
+	 *     new passive-side IWCM id equal to event.provider_data
+	 *   Uses: siw_accept(), siw_reject()
+	 *
+	 * o For an active-side IWCM id, id->provider_data is not used at all.
+	 *
+	 */
+	if (!id->provider_data) {
+		id->provider_data =
+			kmalloc(sizeof(struct list_head), GFP_KERNEL);
+		if (!id->provider_data) {
+			rv = -ENOMEM;
+			goto error;
+		}
+		INIT_LIST_HEAD((struct list_head *)id->provider_data);
+	}
+	list_add_tail(&cep->listenq, (struct list_head *)id->provider_data);
+	cep->state = SIW_EPSTATE_LISTENING;
+
+	if (addr_family == AF_INET)
+		siw_dbg(id->device, "Listen at laddr %pI4 %u\n",
+			&(((struct sockaddr_in *)laddr)->sin_addr),
+			((struct sockaddr_in *)laddr)->sin_port);
+	else
+		siw_dbg(id->device, "Listen at laddr %pI6 %u\n",
+			&(((struct sockaddr_in6 *)laddr)->sin6_addr),
+			((struct sockaddr_in6 *)laddr)->sin6_port);
+
+	return 0;
+
+error:
+	siw_dbg(id->device, "failed: %d\n", rv);
+
+	if (cep) {
+		siw_cep_set_inuse(cep);
+
+		if (cep->cm_id) {
+			cep->cm_id->rem_ref(cep->cm_id);
+			cep->cm_id = NULL;
+		}
+		cep->sock = NULL;
+		siw_socket_disassoc(s);
+		cep->state = SIW_EPSTATE_CLOSED;
+
+		siw_cep_set_free(cep);
+		siw_cep_put(cep);
+	}
+	sock_release(s);
+
+	return rv;
+}
+
+static void siw_drop_listeners(struct iw_cm_id *id)
+{
+	struct list_head *p, *tmp;
+
+	/*
+	 * In case of a wildcard rdma_listen on a multi-homed device,
+	 * a listener's IWCM id is associated with more than one listening CEP.
+	 */
+	list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) {
+		struct siw_cep *cep = list_entry(p, struct siw_cep, listenq);
+
+		list_del(p);
+
+		siw_dbg_cep(cep, "drop cep, state %d\n", cep->state);
+
+		siw_cep_set_inuse(cep);
+
+		if (cep->cm_id) {
+			cep->cm_id->rem_ref(cep->cm_id);
+			cep->cm_id = NULL;
+		}
+		if (cep->sock) {
+			siw_socket_disassoc(cep->sock);
+			sock_release(cep->sock);
+			cep->sock = NULL;
+		}
+		cep->state = SIW_EPSTATE_CLOSED;
+		siw_cep_set_free(cep);
+		siw_cep_put(cep);
+	}
+}
+
+/*
+ * siw_create_listen - Create resources for a listener's IWCM ID @id
+ *
+ * Listens on the socket addresses id->local_addr and id->remote_addr.
+ *
+ * If the listener's @id provides a specific local IP address, at most one
+ * listening socket is created and associated with @id.
+ *
+ * If the listener's @id provides the wildcard (zero) local IP address,
+ * a separate listen is performed for each local IP address of the device
+ * by creating a listening socket and binding to that local IP address.
+ *
+ */
+int siw_create_listen(struct iw_cm_id *id, int backlog)
+{
+	struct net_device *dev = to_siw_dev(id->device)->netdev;
+	int rv = 0, listeners = 0;
+
+	siw_dbg(id->device, "backlog %d\n", backlog);
+
+	/*
+	 * For each attached address of the interface, create a
+	 * listening socket, if id->local_addr is the wildcard
+	 * IP address or matches one of the interface addresses.
+	 */
+	if (id->local_addr.ss_family == AF_INET) {
+		struct in_device *in_dev = in_dev_get(dev);
+		struct sockaddr_in s_laddr, *s_raddr;
+		const struct in_ifaddr *ifa;
+
+		if (!in_dev) {
+			rv = -ENODEV;
+			goto out;
+		}
+		memcpy(&s_laddr, &id->local_addr, sizeof(s_laddr));
+		s_raddr = (struct sockaddr_in *)&id->remote_addr;
+
+		siw_dbg(id->device,
+			"laddr %pI4:%d, raddr %pI4:%d\n",
+			&s_laddr.sin_addr, ntohs(s_laddr.sin_port),
+			&s_raddr->sin_addr, ntohs(s_raddr->sin_port));
+
+		rtnl_lock();
+		in_dev_for_each_ifa_rtnl(ifa, in_dev) {
+			if (ipv4_is_zeronet(s_laddr.sin_addr.s_addr) ||
+			    s_laddr.sin_addr.s_addr == ifa->ifa_address) {
+				s_laddr.sin_addr.s_addr = ifa->ifa_address;
+
+				rv = siw_listen_address(id, backlog,
+						(struct sockaddr *)&s_laddr,
+						AF_INET);
+				if (!rv)
+					listeners++;
+			}
+		}
+		rtnl_unlock();
+		in_dev_put(in_dev);
+	} else if (id->local_addr.ss_family == AF_INET6) {
+		struct inet6_dev *in6_dev = in6_dev_get(dev);
+		struct inet6_ifaddr *ifp;
+		struct sockaddr_in6 *s_laddr = &to_sockaddr_in6(id->local_addr),
+			*s_raddr = &to_sockaddr_in6(id->remote_addr);
+
+		if (!in6_dev) {
+			rv = -ENODEV;
+			goto out;
+		}
+		siw_dbg(id->device,
+			"laddr %pI6:%d, raddr %pI6:%d\n",
+			&s_laddr->sin6_addr, ntohs(s_laddr->sin6_port),
+			&s_raddr->sin6_addr, ntohs(s_raddr->sin6_port));
+
+		rtnl_lock();
+		list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+			if (ifp->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
+				continue;
+			if (ipv6_addr_any(&s_laddr->sin6_addr) ||
+			    ipv6_addr_equal(&s_laddr->sin6_addr, &ifp->addr)) {
+				struct sockaddr_in6 bind_addr  = {
+					.sin6_family = AF_INET6,
+					.sin6_port = s_laddr->sin6_port,
+					.sin6_flowinfo = 0,
+					.sin6_addr = ifp->addr,
+					.sin6_scope_id = dev->ifindex };
+
+				rv = siw_listen_address(id, backlog,
+						(struct sockaddr *)&bind_addr,
+						AF_INET6);
+				if (!rv)
+					listeners++;
+			}
+		}
+		rtnl_unlock();
+		in6_dev_put(in6_dev);
+	} else {
+		rv = -EAFNOSUPPORT;
+	}
+out:
+	if (listeners)
+		rv = 0;
+	else if (!rv)
+		rv = -EINVAL;
+
+	siw_dbg(id->device, "%s\n", rv ? "FAIL" : "OK");
+
+	return rv;
+}
+
+int siw_destroy_listen(struct iw_cm_id *id)
+{
+	if (!id->provider_data) {
+		siw_dbg(id->device, "no cep(s)\n");
+		return 0;
+	}
+	siw_drop_listeners(id);
+	kfree(id->provider_data);
+	id->provider_data = NULL;
+
+	return 0;
+}
+
+int siw_cm_init(void)
+{
+	/*
+	 * Use a single threaded workqueue for strict ordering
+	 * of CM work items.
+	 */
+	siw_cm_wq = create_singlethread_workqueue("siw_cm_wq");
+	if (!siw_cm_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void siw_cm_exit(void)
+{
+	if (siw_cm_wq) {
+		flush_workqueue(siw_cm_wq);
+		destroy_workqueue(siw_cm_wq);
+	}
+}
diff --git a/drivers/infiniband/sw/siw/siw_cm.h b/drivers/infiniband/sw/siw/siw_cm.h
new file mode 100644
index 0000000..8c59cb3
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_cm.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/*          Greg Joyce <greg@opengridcomputing.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+/* Copyright (c) 2017, Open Grid Computing, Inc. */
+
+#ifndef _SIW_CM_H
+#define _SIW_CM_H
+
+#include <net/sock.h>
+#include <linux/tcp.h>
+
+#include <rdma/iw_cm.h>
+
+enum siw_cep_state {
+	SIW_EPSTATE_IDLE = 1,
+	SIW_EPSTATE_LISTENING,
+	SIW_EPSTATE_CONNECTING,
+	SIW_EPSTATE_AWAIT_MPAREQ,
+	SIW_EPSTATE_RECVD_MPAREQ,
+	SIW_EPSTATE_AWAIT_MPAREP,
+	SIW_EPSTATE_RDMA_MODE,
+	SIW_EPSTATE_CLOSED
+};
+
+struct siw_mpa_info {
+	struct mpa_rr hdr; /* peer mpa hdr in host byte order */
+	struct mpa_v2_data v2_ctrl;
+	struct mpa_v2_data v2_ctrl_req;
+	char *pdata;
+	int bytes_rcvd;
+};
+
+struct siw_device;
+
+struct siw_cep {
+	struct iw_cm_id *cm_id;
+	struct siw_device *sdev;
+	struct list_head devq;
+	spinlock_t lock;
+	struct kref ref;
+	int in_use;
+	wait_queue_head_t waitq;
+	enum siw_cep_state state;
+
+	struct list_head listenq;
+	struct siw_cep *listen_cep;
+
+	struct siw_qp *qp;
+	struct socket *sock;
+
+	struct siw_cm_work *mpa_timer;
+	struct list_head work_freelist;
+
+	struct siw_mpa_info mpa;
+	int ord;
+	int ird;
+	bool enhanced_rdma_conn_est;
+
+	/* Saved upcalls of socket */
+	void (*sk_state_change)(struct sock *sk);
+	void (*sk_data_ready)(struct sock *sk);
+	void (*sk_write_space)(struct sock *sk);
+	void (*sk_error_report)(struct sock *sk);
+};
+
+/*
+ * The connection initiator waits 10 seconds to receive an
+ * MPA reply after sending out an MPA request. The responder waits
+ * 5 seconds for the MPA request to arrive after a new TCP
+ * connection was set up.
+ */
+#define MPAREQ_TIMEOUT (HZ * 10)
+#define MPAREP_TIMEOUT (HZ * 5)
+
+enum siw_work_type {
+	SIW_CM_WORK_ACCEPT = 1,
+	SIW_CM_WORK_READ_MPAHDR,
+	SIW_CM_WORK_CLOSE_LLP, /* close socket */
+	SIW_CM_WORK_PEER_CLOSE, /* socket indicated peer close */
+	SIW_CM_WORK_MPATIMEOUT
+};
+
+struct siw_cm_work {
+	struct delayed_work work;
+	struct list_head list;
+	enum siw_work_type type;
+	struct siw_cep *cep;
+};
+
+#define to_sockaddr_in(a) (*(struct sockaddr_in *)(&(a)))
+#define to_sockaddr_in6(a) (*(struct sockaddr_in6 *)(&(a)))
+
+static inline int getname_peer(struct socket *s, struct sockaddr_storage *a)
+{
+	return s->ops->getname(s, (struct sockaddr *)a, 1);
+}
+
+static inline int getname_local(struct socket *s, struct sockaddr_storage *a)
+{
+	return s->ops->getname(s, (struct sockaddr *)a, 0);
+}
+
+static inline int ksock_recv(struct socket *sock, char *buf, size_t size,
+			     int flags)
+{
+	struct kvec iov = { buf, size };
+	struct msghdr msg = { .msg_name = NULL, .msg_flags = flags };
+
+	return kernel_recvmsg(sock, &msg, &iov, 1, size, flags);
+}
+
+int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *parm);
+int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *param);
+int siw_reject(struct iw_cm_id *id, const void *data, u8 len);
+int siw_create_listen(struct iw_cm_id *id, int backlog);
+int siw_destroy_listen(struct iw_cm_id *id);
+
+void siw_cep_get(struct siw_cep *cep);
+void siw_cep_put(struct siw_cep *cep);
+int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type);
+
+int siw_cm_init(void);
+void siw_cm_exit(void);
+
+/*
+ * TCP socket interface
+ */
+#define sk_to_qp(sk) (((struct siw_cep *)((sk)->sk_user_data))->qp)
+#define sk_to_cep(sk) ((struct siw_cep *)((sk)->sk_user_data))
+
+#endif
diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c
new file mode 100644
index 0000000..d8db3be
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_cq.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "siw.h"
+
+static int map_wc_opcode[SIW_NUM_OPCODES] = {
+	[SIW_OP_WRITE] = IB_WC_RDMA_WRITE,
+	[SIW_OP_SEND] = IB_WC_SEND,
+	[SIW_OP_SEND_WITH_IMM] = IB_WC_SEND,
+	[SIW_OP_READ] = IB_WC_RDMA_READ,
+	[SIW_OP_READ_LOCAL_INV] = IB_WC_RDMA_READ,
+	[SIW_OP_COMP_AND_SWAP] = IB_WC_COMP_SWAP,
+	[SIW_OP_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
+	[SIW_OP_INVAL_STAG] = IB_WC_LOCAL_INV,
+	[SIW_OP_REG_MR] = IB_WC_REG_MR,
+	[SIW_OP_RECEIVE] = IB_WC_RECV,
+	[SIW_OP_READ_RESPONSE] = -1 /* not used */
+};
+
+static struct {
+	enum siw_wc_status siw;
+	enum ib_wc_status ib;
+} map_cqe_status[SIW_NUM_WC_STATUS] = {
+	{ SIW_WC_SUCCESS, IB_WC_SUCCESS },
+	{ SIW_WC_LOC_LEN_ERR, IB_WC_LOC_LEN_ERR },
+	{ SIW_WC_LOC_PROT_ERR, IB_WC_LOC_PROT_ERR },
+	{ SIW_WC_LOC_QP_OP_ERR, IB_WC_LOC_QP_OP_ERR },
+	{ SIW_WC_WR_FLUSH_ERR, IB_WC_WR_FLUSH_ERR },
+	{ SIW_WC_BAD_RESP_ERR, IB_WC_BAD_RESP_ERR },
+	{ SIW_WC_LOC_ACCESS_ERR, IB_WC_LOC_ACCESS_ERR },
+	{ SIW_WC_REM_ACCESS_ERR, IB_WC_REM_ACCESS_ERR },
+	{ SIW_WC_REM_INV_REQ_ERR, IB_WC_REM_INV_REQ_ERR },
+	{ SIW_WC_GENERAL_ERR, IB_WC_GENERAL_ERR }
+};
+
+/*
+ * Reap one CQE from the CQ. Only used by kernel clients
+ * during normal CQ operation. Might also be called during
+ * CQ flush for a user mapped CQE array.
+ */
+int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc)
+{
+	struct siw_cqe *cqe;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cq->lock, flags);
+
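+	/* the CQE to reap next lives at cq_get modulo the ring size */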
+	cqe = &cq->queue[cq->cq_get % cq->num_cqe];
+	if (READ_ONCE(cqe->flags) & SIW_WQE_VALID) {
+		memset(wc, 0, sizeof(*wc));
+		wc->wr_id = cqe->id;
+		wc->status = map_cqe_status[cqe->status].ib;
+		wc->opcode = map_wc_opcode[cqe->opcode];
+		wc->byte_len = cqe->bytes;
+
+		/*
+		 * During CQ flush, user land CQEs may also get
+		 * reaped here; they do not hold a QP reference
+		 * and do not qualify for memory extension verbs.
+		 */
+		if (likely(cq->kernel_verbs)) {
+			if (cqe->flags & SIW_WQE_REM_INVAL) {
+				wc->ex.invalidate_rkey = cqe->inval_stag;
+				wc->wc_flags = IB_WC_WITH_INVALIDATE;
+			}
+			wc->qp = cqe->base_qp;
+			siw_dbg_cq(cq,
+				   "idx %u, type %d, flags %2x, id 0x%pK\n",
+				   cq->cq_get % cq->num_cqe, cqe->opcode,
+				   cqe->flags, (void *)(uintptr_t)cqe->id);
+		}
+		WRITE_ONCE(cqe->flags, 0);
+		cq->cq_get++;
+
+		spin_unlock_irqrestore(&cq->lock, flags);
+
+		return 1;
+	}
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	return 0;
+}
+
+/*
+ * siw_cq_flush()
+ *
+ * Flush all CQ elements.
+ */
+void siw_cq_flush(struct siw_cq *cq)
+{
+	struct ib_wc wc;
+
+	while (siw_reap_cqe(cq, &wc))
+		;
+}
diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c
new file mode 100644
index 0000000..05a92f9
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_main.c
@@ -0,0 +1,684 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <net/net_namespace.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_arp.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/dma-mapping.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/rdma_netlink.h>
+#include <linux/kthread.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+
+MODULE_AUTHOR("Bernard Metzler");
+MODULE_DESCRIPTION("Software iWARP Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/* transmit from user buffer, if possible */
+const bool zcopy_tx = true;
+
+/* Restrict usage of GSO, if hardware peer iwarp is unable to process
+ * large packets. try_gso = true lets siw try to use local GSO,
+ * if the peer agrees. Not using GSO severely limits siw maximum tx
+ * bandwidth.
+ */
+const bool try_gso;
+
+/* Attach siw also with loopback devices */
+const bool loopback_enabled = true;
+
+/* We try to negotiate CRC on, if true */
+const bool mpa_crc_required;
+
+/* MPA CRC on/off enforced */
+const bool mpa_crc_strict;
+
+/* Control TCP_NODELAY socket option */
+const bool siw_tcp_nagle;
+
+/* Select MPA version to be used during connection setup */
+u_char mpa_version = MPA_REVISION_2;
+
+/* Select MPA P2P mode (additional handshake during connection
+ * setup), if true.
+ */
+const bool peer_to_peer;
+
+struct task_struct *siw_tx_thread[NR_CPUS];
+struct crypto_shash *siw_crypto_shash;
+
+static int siw_device_register(struct siw_device *sdev, const char *name)
+{
+	struct ib_device *base_dev = &sdev->base_dev;
+	static int dev_id = 1;
+	int rv;
+
+	rv = ib_register_device(base_dev, name);
+	if (rv) {
+		pr_warn("siw: device registration error %d\n", rv);
+		return rv;
+	}
+	sdev->vendor_part_id = dev_id++;
+
+	siw_dbg(base_dev, "HWaddr=%pM\n", sdev->netdev->dev_addr);
+
+	return 0;
+}
+
+static void siw_device_cleanup(struct ib_device *base_dev)
+{
+	struct siw_device *sdev = to_siw_dev(base_dev);
+
+	xa_destroy(&sdev->qp_xa);
+	xa_destroy(&sdev->mem_xa);
+}
+
+static int siw_create_tx_threads(void)
+{
+	int cpu, assigned = 0;
+
+	for_each_online_cpu(cpu) {
+		/* One TX thread per physical core: skip HT siblings */
+		if (cpu % cpumask_weight(topology_sibling_cpumask(cpu)))
+			continue;
+
+		siw_tx_thread[cpu] =
+			kthread_create(siw_run_sq, (unsigned long *)(long)cpu,
+				       "siw_tx/%d", cpu);
+		if (IS_ERR(siw_tx_thread[cpu])) {
+			siw_tx_thread[cpu] = NULL;
+			continue;
+		}
+		kthread_bind(siw_tx_thread[cpu], cpu);
+
+		wake_up_process(siw_tx_thread[cpu]);
+		assigned++;
+	}
+	return assigned;
+}
+
+static int siw_dev_qualified(struct net_device *netdev)
+{
+	/*
+	 * Additional hardware support can be added here
+	 * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see
+	 * <linux/if_arp.h> for type identifiers.
+	 */
+	if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 ||
+	    (netdev->type == ARPHRD_LOOPBACK && loopback_enabled))
+		return 1;
+
+	return 0;
+}
+
+static DEFINE_PER_CPU(atomic_t, siw_use_cnt);
+
+static struct {
+	struct cpumask **tx_valid_cpus;
+	int num_nodes;
+} siw_cpu_info;
+
+static int siw_init_cpulist(void)
+{
+	int i, num_nodes = num_possible_nodes();
+
+	memset(siw_tx_thread, 0, sizeof(siw_tx_thread));
+
+	siw_cpu_info.num_nodes = num_nodes;
+
+	siw_cpu_info.tx_valid_cpus =
+		kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL);
+	if (!siw_cpu_info.tx_valid_cpus) {
+		siw_cpu_info.num_nodes = 0;
+		return -ENOMEM;
+	}
+	for (i = 0; i < siw_cpu_info.num_nodes; i++) {
+		siw_cpu_info.tx_valid_cpus[i] =
+			kzalloc(sizeof(struct cpumask), GFP_KERNEL);
+		if (!siw_cpu_info.tx_valid_cpus[i])
+			goto out_err;
+
+		cpumask_clear(siw_cpu_info.tx_valid_cpus[i]);
+	}
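+	/* record each possible CPU in the mask of its NUMA node */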
+	for_each_possible_cpu(i)
+		cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]);
+
+	return 0;
+
+out_err:
+	siw_cpu_info.num_nodes = 0;
+	while (--i >= 0)
+		kfree(siw_cpu_info.tx_valid_cpus[i]);
+	kfree(siw_cpu_info.tx_valid_cpus);
+	siw_cpu_info.tx_valid_cpus = NULL;
+
+	return -ENOMEM;
+}
+
+static void siw_destroy_cpulist(void)
+{
+	int i = 0;
+
+	while (i < siw_cpu_info.num_nodes)
+		kfree(siw_cpu_info.tx_valid_cpus[i++]);
+
+	kfree(siw_cpu_info.tx_valid_cpus);
+}
+
+/*
+ * Choose the CPU with the lowest number of active QPs from the
+ * NUMA node of the TX interface.
+ */
+int siw_get_tx_cpu(struct siw_device *sdev)
+{
+	const struct cpumask *tx_cpumask;
+	int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1;
+
+	if (node < 0)
+		tx_cpumask = cpu_online_mask;
+	else
+		tx_cpumask = siw_cpu_info.tx_valid_cpus[node];
+
+	num_cpus = cpumask_weight(tx_cpumask);
+	if (!num_cpus) {
+		/* no CPU on this NUMA node */
+		tx_cpumask = cpu_online_mask;
+		num_cpus = cpumask_weight(tx_cpumask);
+	}
+	if (!num_cpus)
+		goto out;
+
+	cpu = cpumask_first(tx_cpumask);
+
+	for (i = 0, min_use = SIW_MAX_QP; i < num_cpus;
+	     i++, cpu = cpumask_next(cpu, tx_cpumask)) {
+		int usage;
+
+		/* Skip any cores which have no TX thread */
+		if (!siw_tx_thread[cpu])
+			continue;
+
+		usage = atomic_read(&per_cpu(siw_use_cnt, cpu));
+		if (usage <= min_use) {
+			tx_cpu = cpu;
+			min_use = usage;
+		}
+	}
+	siw_dbg(&sdev->base_dev,
+		"tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use);
+
+out:
+	if (tx_cpu >= 0)
+		atomic_inc(&per_cpu(siw_use_cnt, tx_cpu));
+	else
+		pr_warn("siw: no tx cpu found\n");
+
+	return tx_cpu;
+}
+
+void siw_put_tx_cpu(int cpu)
+{
+	atomic_dec(&per_cpu(siw_use_cnt, cpu));
+}
+
+static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
+{
+	struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id);
+
+	if (qp) {
+		/*
+		 * siw_qp_id2obj() increments object reference count
+		 */
+		siw_qp_put(qp);
+		return qp->ib_qp;
+	}
+	return NULL;
+}
+
+static void siw_verbs_sq_flush(struct ib_qp *base_qp)
+{
+	struct siw_qp *qp = to_siw_qp(base_qp);
+
+	down_write(&qp->state_lock);
+	siw_sq_flush(qp);
+	up_write(&qp->state_lock);
+}
+
+static void siw_verbs_rq_flush(struct ib_qp *base_qp)
+{
+	struct siw_qp *qp = to_siw_qp(base_qp);
+
+	down_write(&qp->state_lock);
+	siw_rq_flush(qp);
+	up_write(&qp->state_lock);
+}
+
+static const struct ib_device_ops siw_device_ops = {
+	.owner = THIS_MODULE,
+	.uverbs_abi_ver = SIW_ABI_VERSION,
+	.driver_id = RDMA_DRIVER_SIW,
+
+	.alloc_mr = siw_alloc_mr,
+	.alloc_pd = siw_alloc_pd,
+	.alloc_ucontext = siw_alloc_ucontext,
+	.create_cq = siw_create_cq,
+	.create_qp = siw_create_qp,
+	.create_srq = siw_create_srq,
+	.dealloc_driver = siw_device_cleanup,
+	.dealloc_pd = siw_dealloc_pd,
+	.dealloc_ucontext = siw_dealloc_ucontext,
+	.dereg_mr = siw_dereg_mr,
+	.destroy_cq = siw_destroy_cq,
+	.destroy_qp = siw_destroy_qp,
+	.destroy_srq = siw_destroy_srq,
+	.drain_rq = siw_verbs_rq_flush,
+	.drain_sq = siw_verbs_sq_flush,
+	.get_dma_mr = siw_get_dma_mr,
+	.get_port_immutable = siw_get_port_immutable,
+	.iw_accept = siw_accept,
+	.iw_add_ref = siw_qp_get_ref,
+	.iw_connect = siw_connect,
+	.iw_create_listen = siw_create_listen,
+	.iw_destroy_listen = siw_destroy_listen,
+	.iw_get_qp = siw_get_base_qp,
+	.iw_reject = siw_reject,
+	.iw_rem_ref = siw_qp_put_ref,
+	.map_mr_sg = siw_map_mr_sg,
+	.mmap = siw_mmap,
+	.modify_qp = siw_verbs_modify_qp,
+	.modify_srq = siw_modify_srq,
+	.poll_cq = siw_poll_cq,
+	.post_recv = siw_post_receive,
+	.post_send = siw_post_send,
+	.post_srq_recv = siw_post_srq_recv,
+	.query_device = siw_query_device,
+	.query_gid = siw_query_gid,
+	.query_pkey = siw_query_pkey,
+	.query_port = siw_query_port,
+	.query_qp = siw_query_qp,
+	.query_srq = siw_query_srq,
+	.req_notify_cq = siw_req_notify_cq,
+	.reg_user_mr = siw_reg_user_mr,
+
+	INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq),
+	INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd),
+	INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext),
+};
+
+static struct siw_device *siw_device_create(struct net_device *netdev)
+{
+	struct siw_device *sdev = NULL;
+	struct ib_device *base_dev;
+	struct device *parent = netdev->dev.parent;
+	int rv;
+
+	if (!parent) {
+		/*
+		 * The loopback device has no parent device,
+		 * so it appears as a top-level device. To support
+		 * loopback device connectivity, take this device
+		 * as the parent device. Skip all other devices
+		 * without a parent device.
+		 */
+		if (netdev->type != ARPHRD_LOOPBACK) {
+			pr_warn("siw: device %s error: no parent device\n",
+				netdev->name);
+			return NULL;
+		}
+		parent = &netdev->dev;
+	}
+	sdev = ib_alloc_device(siw_device, base_dev);
+	if (!sdev)
+		return NULL;
+
+	base_dev = &sdev->base_dev;
+
+	sdev->netdev = netdev;
+
+	if (netdev->type != ARPHRD_LOOPBACK) {
+		memcpy(&base_dev->node_guid, netdev->dev_addr, 6);
+	} else {
+		/*
+		 * The loopback device does not have a HW address,
+		 * but the connection management library expects
+		 * a non-zero GID.
+		 */
+		size_t gidlen = min_t(size_t, strlen(base_dev->name), 6);
+
+		memcpy(&base_dev->node_guid, base_dev->name, gidlen);
+	}
+	base_dev->uverbs_cmd_mask =
+		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+		(1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+		(1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+		(1ull << IB_USER_VERBS_CMD_REG_MR) |
+		(1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+		(1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+		(1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+		(1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+		(1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+		(1ull << IB_USER_VERBS_CMD_POST_SEND) |
+		(1ull << IB_USER_VERBS_CMD_POST_RECV) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+		(1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV) |
+		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
+
+	base_dev->node_type = RDMA_NODE_RNIC;
+	memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON,
+	       sizeof(SIW_NODE_DESC_COMMON));
+
+	/*
+	 * Current model (one-to-one device association):
+	 * One Softiwarp device per net_device or, equivalently,
+	 * per physical port.
+	 */
+	base_dev->phys_port_cnt = 1;
+	base_dev->dev.parent = parent;
+	base_dev->dev.dma_ops = &dma_virt_ops;
+	base_dev->num_comp_vectors = num_possible_cpus();
+
+	ib_set_device_ops(base_dev, &siw_device_ops);
+	rv = ib_device_set_netdev(base_dev, netdev, 1);
+	if (rv)
+		goto error;
+
+	memcpy(base_dev->iw_ifname, netdev->name,
+	       sizeof(base_dev->iw_ifname));
+
+	/* Disable TCP port mapping */
+	base_dev->iw_driver_flags = IW_F_NO_PORT_MAP;
+
+	sdev->attrs.max_qp = SIW_MAX_QP;
+	sdev->attrs.max_qp_wr = SIW_MAX_QP_WR;
+	sdev->attrs.max_ord = SIW_MAX_ORD_QP;
+	sdev->attrs.max_ird = SIW_MAX_IRD_QP;
+	sdev->attrs.max_sge = SIW_MAX_SGE;
+	sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD;
+	sdev->attrs.max_cq = SIW_MAX_CQ;
+	sdev->attrs.max_cqe = SIW_MAX_CQE;
+	sdev->attrs.max_mr = SIW_MAX_MR;
+	sdev->attrs.max_pd = SIW_MAX_PD;
+	sdev->attrs.max_mw = SIW_MAX_MW;
+	sdev->attrs.max_fmr = SIW_MAX_FMR;
+	sdev->attrs.max_srq = SIW_MAX_SRQ;
+	sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR;
+	sdev->attrs.max_srq_sge = SIW_MAX_SGE;
+
+	xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1);
+	xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1);
+
+	INIT_LIST_HEAD(&sdev->cep_list);
+	INIT_LIST_HEAD(&sdev->qp_list);
+
+	atomic_set(&sdev->num_ctx, 0);
+	atomic_set(&sdev->num_srq, 0);
+	atomic_set(&sdev->num_qp, 0);
+	atomic_set(&sdev->num_cq, 0);
+	atomic_set(&sdev->num_mr, 0);
+	atomic_set(&sdev->num_pd, 0);
+
+	sdev->numa_node = dev_to_node(parent);
+	spin_lock_init(&sdev->lock);
+
+	return sdev;
+error:
+	ib_dealloc_device(base_dev);
+
+	return NULL;
+}
+
+/*
+ * Network link becomes unavailable. Mark all
+ * affected QPs accordingly.
+ */
+static void siw_netdev_down(struct work_struct *work)
+{
+	struct siw_device *sdev =
+		container_of(work, struct siw_device, netdev_down);
+
+	struct siw_qp_attrs qp_attrs;
+	struct list_head *pos, *tmp;
+
+	memset(&qp_attrs, 0, sizeof(qp_attrs));
+	qp_attrs.state = SIW_QP_STATE_ERROR;
+
+	list_for_each_safe(pos, tmp, &sdev->qp_list) {
+		struct siw_qp *qp = list_entry(pos, struct siw_qp, devq);
+
+		down_write(&qp->state_lock);
+		WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE));
+		up_write(&qp->state_lock);
+	}
+	ib_device_put(&sdev->base_dev);
+}
+
+static void siw_device_goes_down(struct siw_device *sdev)
+{
+	if (ib_device_try_get(&sdev->base_dev)) {
+		INIT_WORK(&sdev->netdev_down, siw_netdev_down);
+		schedule_work(&sdev->netdev_down);
+	}
+}
+
+static int siw_netdev_event(struct notifier_block *nb, unsigned long event,
+			    void *arg)
+{
+	struct net_device *netdev = netdev_notifier_info_to_dev(arg);
+	struct ib_device *base_dev;
+	struct siw_device *sdev;
+
+	dev_dbg(&netdev->dev, "siw: event %lu\n", event);
+
+	if (dev_net(netdev) != &init_net)
+		return NOTIFY_OK;
+
+	base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
+	if (!base_dev)
+		return NOTIFY_OK;
+
+	sdev = to_siw_dev(base_dev);
+
+	switch (event) {
+	case NETDEV_UP:
+		sdev->state = IB_PORT_ACTIVE;
+		siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE);
+		break;
+
+	case NETDEV_GOING_DOWN:
+		siw_device_goes_down(sdev);
+		break;
+
+	case NETDEV_DOWN:
+		sdev->state = IB_PORT_DOWN;
+		siw_port_event(sdev, 1, IB_EVENT_PORT_ERR);
+		break;
+
+	case NETDEV_REGISTER:
+		/*
+		 * Device registration is now handled only by
+		 * rdma netlink commands, so it should be impossible
+		 * to end up here with a valid siw device.
+		 */
+		siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n");
+		break;
+
+	case NETDEV_UNREGISTER:
+		ib_unregister_device_queued(&sdev->base_dev);
+		break;
+
+	case NETDEV_CHANGEADDR:
+		siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE);
+		break;
+	/*
+	 * Todo: The netdev events below are currently not handled.
+	 */
+	case NETDEV_CHANGEMTU:
+	case NETDEV_CHANGE:
+		break;
+
+	default:
+		break;
+	}
+	ib_device_put(&sdev->base_dev);
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block siw_netdev_nb = {
+	.notifier_call = siw_netdev_event,
+};
+
+static int siw_newlink(const char *basedev_name, struct net_device *netdev)
+{
+	struct ib_device *base_dev;
+	struct siw_device *sdev = NULL;
+	int rv = -ENOMEM;
+
+	if (!siw_dev_qualified(netdev))
+		return -EINVAL;
+
+	base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
+	if (base_dev) {
+		ib_device_put(base_dev);
+		return -EEXIST;
+	}
+	sdev = siw_device_create(netdev);
+	if (sdev) {
+		dev_dbg(&netdev->dev, "siw: new device\n");
+
+		if (netif_running(netdev) && netif_carrier_ok(netdev))
+			sdev->state = IB_PORT_ACTIVE;
+		else
+			sdev->state = IB_PORT_DOWN;
+
+		rv = siw_device_register(sdev, basedev_name);
+		if (rv)
+			ib_dealloc_device(&sdev->base_dev);
+	}
+	return rv;
+}
+
+static struct rdma_link_ops siw_link_ops = {
+	.type = "siw",
+	.newlink = siw_newlink,
+};
+
+/*
+ * siw_init_module - Initialize SoftiWARP module and register with netdev
+ *                   subsystem.
+ */
+static __init int siw_init_module(void)
+{
+	int rv;
+	int nr_cpu;
+
+	if (SENDPAGE_THRESH < SIW_MAX_INLINE) {
+		pr_info("siw: sendpage threshold too small: %u\n",
+			(unsigned int)SENDPAGE_THRESH);
+		rv = -EINVAL;
+		goto out_error;
+	}
+	rv = siw_init_cpulist();
+	if (rv)
+		goto out_error;
+
+	rv = siw_cm_init();
+	if (rv)
+		goto out_error;
+
+	if (!siw_create_tx_threads()) {
+		pr_info("siw: Could not start any TX thread\n");
+		rv = -ENOMEM;
+		goto out_error;
+	}
+	/*
+	 * Locate the CRC32c algorithm. If unsuccessful, fail
+	 * loading siw only if CRC is required.
+	 */
+	siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0);
+	if (IS_ERR(siw_crypto_shash)) {
+		pr_info("siw: Loading CRC32c failed: %ld\n",
+			PTR_ERR(siw_crypto_shash));
+		siw_crypto_shash = NULL;
+		if (mpa_crc_required) {
+			rv = -EOPNOTSUPP;
+			goto out_error;
+		}
+	}
+	rv = register_netdevice_notifier(&siw_netdev_nb);
+	if (rv)
+		goto out_error;
+
+	rdma_link_register(&siw_link_ops);
+
+	pr_info("SoftiWARP attached\n");
+	return 0;
+
+out_error:
+	for (nr_cpu = 0; nr_cpu < nr_cpu_ids; nr_cpu++) {
+		if (siw_tx_thread[nr_cpu]) {
+			siw_stop_tx_thread(nr_cpu);
+			siw_tx_thread[nr_cpu] = NULL;
+		}
+	}
+	if (siw_crypto_shash)
+		crypto_free_shash(siw_crypto_shash);
+
+	pr_info("SoftIWARP attach failed. Error: %d\n", rv);
+
+	siw_cm_exit();
+	siw_destroy_cpulist();
+
+	return rv;
+}
+
+static void __exit siw_exit_module(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		if (siw_tx_thread[cpu]) {
+			siw_stop_tx_thread(cpu);
+			siw_tx_thread[cpu] = NULL;
+		}
+	}
+	unregister_netdevice_notifier(&siw_netdev_nb);
+	rdma_link_unregister(&siw_link_ops);
+	ib_unregister_driver(RDMA_DRIVER_SIW);
+
+	siw_cm_exit();
+
+	siw_destroy_cpulist();
+
+	if (siw_crypto_shash)
+		crypto_free_shash(siw_crypto_shash);
+
+	pr_info("SoftiWARP detached\n");
+}
+
+module_init(siw_init_module);
+module_exit(siw_exit_module);
+
+MODULE_ALIAS_RDMA_LINK("siw");
diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c
new file mode 100644
index 0000000..e99983f
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_mem.c
@@ -0,0 +1,452 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/gfp.h>
+#include <rdma/ib_verbs.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/sched/mm.h>
+#include <linux/resource.h>
+
+#include "siw.h"
+#include "siw_mem.h"
+
+/*
+ * STag lookup is based on its index part only (24 bits).
+ * The code avoids the special STag of zero and tries to randomize
+ * STag values between 1 and SIW_STAG_MAX_INDEX.
+ */
+int siw_mem_add(struct siw_device *sdev, struct siw_mem *m)
+{
+	struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
+	u32 id, next;
+
+	get_random_bytes(&next, 4);
+	next &= 0x00ffffff;
+
+	if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next,
+	    GFP_KERNEL) < 0)
+		return -ENOMEM;
+
+	/* Set the STag index part */
+	m->stag = id << 8;
+
+	siw_dbg_mem(m, "new MEM object\n");
+
+	return 0;
+}
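+
+/*
+ * Layout sketch (assumed, consistent with the shift above): the 24-bit
+ * xarray index forms the upper bits of the STag, while the low byte is
+ * the consumer-owned STag key (left zero here). Users of the STag recover
+ * the index by shifting right by 8, e.g.
+ *
+ *	stag  = id << 8;	// compose STag from 24-bit index
+ *	index = stag >> 8;	// recover index for xarray lookup
+ */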
+
+/*
+ * siw_mem_id2obj()
+ *
+ * Resolves memory from an STag given by its index part. Might be called from:
+ * o process context before sending out an SGL, or
+ * o softirq context when resolving target memory
+ */
+struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index)
+{
+	struct siw_mem *mem;
+
+	rcu_read_lock();
+	mem = xa_load(&sdev->mem_xa, stag_index);
+	if (likely(mem && kref_get_unless_zero(&mem->ref))) {
+		rcu_read_unlock();
+		return mem;
+	}
+	rcu_read_unlock();
+
+	return NULL;
+}
+
+static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
+			   bool dirty)
+{
+	put_user_pages_dirty_lock(chunk->plist, num_pages, dirty);
+}
+
+void siw_umem_release(struct siw_umem *umem, bool dirty)
+{
+	struct mm_struct *mm_s = umem->owning_mm;
+	int i, num_pages = umem->num_pages;
+
+	for (i = 0; num_pages; i++) {
+		int to_free = min_t(int, PAGES_PER_CHUNK, num_pages);
+
+		siw_free_plist(&umem->page_chunk[i], to_free,
+			       umem->writable && dirty);
+		kfree(umem->page_chunk[i].plist);
+		num_pages -= to_free;
+	}
+	atomic64_sub(umem->num_pages, &mm_s->pinned_vm);
+
+	mmdrop(mm_s);
+	kfree(umem->page_chunk);
+	kfree(umem);
+}
+
+int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
+		   u64 start, u64 len, int rights)
+{
+	struct siw_device *sdev = to_siw_dev(pd->device);
+	struct siw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+	struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
+	u32 id, next;
+
+	if (!mem)
+		return -ENOMEM;
+
+	mem->mem_obj = mem_obj;
+	mem->stag_valid = 0;
+	mem->sdev = sdev;
+	mem->va = start;
+	mem->len = len;
+	mem->pd = pd;
+	mem->perms = rights & IWARP_ACCESS_MASK;
+	kref_init(&mem->ref);
+
+	mr->mem = mem;
+
+	get_random_bytes(&next, 4);
+	next &= 0x00ffffff;
+
+	if (xa_alloc_cyclic(&sdev->mem_xa, &id, mem, limit, &next,
+	    GFP_KERNEL) < 0) {
+		kfree(mem);
+		return -ENOMEM;
+	}
+	/* Set the STag index part */
+	mem->stag = id << 8;
+	mr->base_mr.lkey = mr->base_mr.rkey = mem->stag;
+
+	return 0;
+}
+
+void siw_mr_drop_mem(struct siw_mr *mr)
+{
+	struct siw_mem *mem = mr->mem, *found;
+
+	mem->stag_valid = 0;
+
+	/* make STag invalid visible asap */
+	smp_mb();
+
+	found = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8);
+	WARN_ON(found != mem);
+	siw_mem_put(mem);
+}
+
+void siw_free_mem(struct kref *ref)
+{
+	struct siw_mem *mem = container_of(ref, struct siw_mem, ref);
+
+	siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? "y" : "n");
+
+	if (!mem->is_mw && mem->mem_obj) {
+		if (mem->is_pbl == 0)
+			siw_umem_release(mem->umem, true);
+		else
+			kfree(mem->pbl);
+	}
+	kfree(mem);
+}
+
+/*
+ * siw_check_mem()
+ *
+ * Check protection domain, STag state, access permissions and
+ * address range for memory object.
+ *
+ * @pd:		Protection Domain memory should belong to
+ * @mem:	memory to be checked
+ * @addr:	starting addr of mem
+ * @perms:	requested access permissions
+ * @len:	len of memory interval to be checked
+ *
+ */
+int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
+		  enum ib_access_flags perms, int len)
+{
+	if (!mem->stag_valid) {
+		siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag);
+		return -E_STAG_INVALID;
+	}
+	if (mem->pd != pd) {
+		siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag);
+		return -E_PD_MISMATCH;
+	}
+	/*
+	 * check access permissions
+	 */
+	if ((mem->perms & perms) < perms) {
+		siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n",
+			   mem->perms, perms);
+		return -E_ACCESS_PERM;
+	}
+	/*
+	 * Check if access falls into valid memory interval.
+	 */
+	if (addr < mem->va || addr + len > mem->va + mem->len) {
+		siw_dbg_pd(pd, "MEM interval len %d\n", len);
+		siw_dbg_pd(pd, "[0x%pK, 0x%pK] out of bounds\n",
+			   (void *)(uintptr_t)addr,
+			   (void *)(uintptr_t)(addr + len));
+		siw_dbg_pd(pd, "[0x%pK, 0x%pK] STag=0x%08x\n",
+			   (void *)(uintptr_t)mem->va,
+			   (void *)(uintptr_t)(mem->va + mem->len),
+			   mem->stag);
+
+		return -E_BASE_BOUNDS;
+	}
+	return E_ACCESS_OK;
+}
+
+/*
+ * siw_check_sge()
+ *
+ * Check SGE for access rights in given interval
+ *
+ * @pd:		Protection Domain memory should belong to
+ * @sge:	SGE to be checked
+ * @mem:	location of memory reference within array
+ * @perms:	requested access permissions
+ * @off:	starting offset in SGE
+ * @len:	len of memory interval to be checked
+ *
+ * NOTE: Function references the SGE's memory object (mem->obj)
+ * if not yet done. The new reference is kept if the check went ok and
+ * released if the check failed. If mem->obj is already valid, no new
+ * lookup is done and mem is not released if the check fails.
+ */
+int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[],
+		  enum ib_access_flags perms, u32 off, int len)
+{
+	struct siw_device *sdev = to_siw_dev(pd->device);
+	struct siw_mem *new = NULL;
+	int rv = E_ACCESS_OK;
+
+	if (len + off > sge->length) {
+		rv = -E_BASE_BOUNDS;
+		goto fail;
+	}
+	if (*mem == NULL) {
+		new = siw_mem_id2obj(sdev, sge->lkey >> 8);
+		if (unlikely(!new)) {
+			siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey);
+			rv = -E_STAG_INVALID;
+			goto fail;
+		}
+		*mem = new;
+	}
+	/* Check if user re-registered with different STag key */
+	if (unlikely((*mem)->stag != sge->lkey)) {
+		siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey);
+		rv = -E_STAG_INVALID;
+		goto fail;
+	}
+	rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
+	if (unlikely(rv))
+		goto fail;
+
+	return 0;
+
+fail:
+	if (new) {
+		*mem = NULL;
+		siw_mem_put(new);
+	}
+	return rv;
+}
+
+void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op)
+{
+	switch (op) {
+	case SIW_OP_SEND:
+	case SIW_OP_WRITE:
+	case SIW_OP_SEND_WITH_IMM:
+	case SIW_OP_SEND_REMOTE_INV:
+	case SIW_OP_READ:
+	case SIW_OP_READ_LOCAL_INV:
+		if (!(wqe->sqe.flags & SIW_WQE_INLINE))
+			siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge);
+		break;
+
+	case SIW_OP_RECEIVE:
+		siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge);
+		break;
+
+	case SIW_OP_READ_RESPONSE:
+		siw_unref_mem_sgl(wqe->mem, 1);
+		break;
+
+	default:
+		/*
+		 * SIW_OP_INVAL_STAG and SIW_OP_REG_MR
+		 * do not hold memory references
+		 */
+		break;
+	}
+}
+
+int siw_invalidate_stag(struct ib_pd *pd, u32 stag)
+{
+	struct siw_device *sdev = to_siw_dev(pd->device);
+	struct siw_mem *mem = siw_mem_id2obj(sdev, stag >> 8);
+	int rv = 0;
+
+	if (unlikely(!mem)) {
+		siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag);
+		return -EINVAL;
+	}
+	if (unlikely(mem->pd != pd)) {
+		siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag);
+		rv = -EACCES;
+		goto out;
+	}
+	/*
+	 * Per RDMA verbs definition, an STag may already be in invalid
+	 * state if invalidation is requested. So no state check here.
+	 */
+	mem->stag_valid = 0;
+
+	siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag);
+out:
+	siw_mem_put(mem);
+	return rv;
+}
+
+/*
+ * Gets the physical address backed by a PBL element. The address is
+ * referenced by a linear byte offset into the list of variably sized PBL
+ * elements. Optionally provides the remaining length within the current
+ * element, and the current PBL index for later resumption at the same element.
+ */
+dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx)
+{
+	int i = idx ? *idx : 0;
+
+	while (i < pbl->num_buf) {
+		struct siw_pble *pble = &pbl->pbe[i];
+
+		if (pble->pbl_off + pble->size > off) {
+			u64 pble_off = off - pble->pbl_off;
+
+			if (len)
+				*len = pble->size - pble_off;
+			if (idx)
+				*idx = i;
+
+			return pble->addr + pble_off;
+		}
+		i++;
+	}
+	if (len)
+		*len = 0;
+	return 0;
+}
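+
+/*
+ * Usage sketch (illustrative only): callers walking a PBL linearly keep
+ * the returned index across calls to resume at the same element, e.g.
+ *
+ *	int idx = 0, len;
+ *	dma_addr_t pa = siw_pbl_get_buffer(pbl, off, &len, &idx);
+ *
+ * as siw_rx_pbl() does during receive processing.
+ */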
+
+struct siw_pbl *siw_pbl_alloc(u32 num_buf)
+{
+	struct siw_pbl *pbl;
+	int buf_size = sizeof(*pbl);
+
+	if (num_buf == 0)
+		return ERR_PTR(-EINVAL);
+
+	buf_size += ((num_buf - 1) * sizeof(struct siw_pble));
+
+	pbl = kzalloc(buf_size, GFP_KERNEL);
+	if (!pbl)
+		return ERR_PTR(-ENOMEM);
+
+	pbl->max_buf = num_buf;
+
+	return pbl;
+}
+
+struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
+{
+	struct siw_umem *umem;
+	struct mm_struct *mm_s;
+	u64 first_page_va;
+	unsigned long mlock_limit;
+	unsigned int foll_flags = FOLL_WRITE;
+	int num_pages, num_chunks, i, rv = 0;
+
+	if (!can_do_mlock())
+		return ERR_PTR(-EPERM);
+
+	if (!len)
+		return ERR_PTR(-EINVAL);
+
+	first_page_va = start & PAGE_MASK;
+	num_pages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT;
+	num_chunks = (num_pages >> CHUNK_SHIFT) + 1;
+
+	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
+
+	mm_s = current->mm;
+	umem->owning_mm = mm_s;
+	umem->writable = writable;
+
+	mmgrab(mm_s);
+
+	if (!writable)
+		foll_flags |= FOLL_FORCE;
+
+	down_read(&mm_s->mmap_sem);
+
+	mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) {
+		rv = -ENOMEM;
+		goto out_sem_up;
+	}
+	umem->fp_addr = first_page_va;
+
+	umem->page_chunk =
+		kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL);
+	if (!umem->page_chunk) {
+		rv = -ENOMEM;
+		goto out_sem_up;
+	}
+	for (i = 0; num_pages; i++) {
+		int got, nents = min_t(int, num_pages, PAGES_PER_CHUNK);
+
+		umem->page_chunk[i].plist =
+			kcalloc(nents, sizeof(struct page *), GFP_KERNEL);
+		if (!umem->page_chunk[i].plist) {
+			rv = -ENOMEM;
+			goto out_sem_up;
+		}
+		got = 0;
+		while (nents) {
+			struct page **plist = &umem->page_chunk[i].plist[got];
+
+			rv = get_user_pages(first_page_va, nents,
+					    foll_flags | FOLL_LONGTERM,
+					    plist, NULL);
+			if (rv < 0)
+				goto out_sem_up;
+
+			umem->num_pages += rv;
+			atomic64_add(rv, &mm_s->pinned_vm);
+			first_page_va += rv * PAGE_SIZE;
+			nents -= rv;
+			got += rv;
+		}
+		num_pages -= got;
+	}
+out_sem_up:
+	up_read(&mm_s->mmap_sem);
+
+	if (rv > 0)
+		return umem;
+
+	siw_umem_release(umem, false);
+
+	return ERR_PTR(rv);
+}
diff --git a/drivers/infiniband/sw/siw/siw_mem.h b/drivers/infiniband/sw/siw/siw_mem.h
new file mode 100644
index 0000000..db138c8
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_mem.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _SIW_MEM_H
+#define _SIW_MEM_H
+
+struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable);
+void siw_umem_release(struct siw_umem *umem, bool dirty);
+struct siw_pbl *siw_pbl_alloc(u32 num_buf);
+dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx);
+struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index);
+int siw_mem_add(struct siw_device *sdev, struct siw_mem *m);
+int siw_invalidate_stag(struct ib_pd *pd, u32 stag);
+int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
+		  enum ib_access_flags perms, int len);
+int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge,
+		  struct siw_mem *mem[], enum ib_access_flags perms,
+		  u32 off, int len);
+void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op);
+int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
+		   u64 start, u64 len, int rights);
+void siw_mr_drop_mem(struct siw_mr *mr);
+void siw_free_mem(struct kref *ref);
+
+static inline void siw_mem_put(struct siw_mem *mem)
+{
+	kref_put(&mem->ref, siw_free_mem);
+}
+
+static inline struct siw_mr *siw_mem2mr(struct siw_mem *m)
+{
+	return container_of(m, struct siw_mr, mem);
+}
+
+static inline void siw_unref_mem_sgl(struct siw_mem **mem, unsigned int num_sge)
+{
+	while (num_sge) {
+		if (*mem == NULL)
+			break;
+
+		siw_mem_put(*mem);
+		*mem = NULL;
+		mem++;
+		num_sge--;
+	}
+}
+
+#define CHUNK_SHIFT 9 /* sets number of pages per chunk */
+#define PAGES_PER_CHUNK (_AC(1, UL) << CHUNK_SHIFT)
+#define CHUNK_MASK (~(PAGES_PER_CHUNK - 1))
+#define PAGE_CHUNK_SIZE (PAGES_PER_CHUNK * sizeof(struct page *))
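+
+/*
+ * Worked numbers (illustrative; assumes 4 KiB pages and 8-byte pointers):
+ * CHUNK_SHIFT == 9 yields 512 page pointers per chunk, i.e. a 4 KiB plist
+ * allocation per chunk covering 2 MiB of pinned user memory.
+ */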
+
+/*
+ * siw_get_upage()
+ *
+ * Get page pointer for address on given umem.
+ *
+ * @umem: two dimensional list of page pointers
+ * @addr: user virtual address
+ */
+static inline struct page *siw_get_upage(struct siw_umem *umem, u64 addr)
+{
+	unsigned int page_idx = (addr - umem->fp_addr) >> PAGE_SHIFT,
+		     chunk_idx = page_idx >> CHUNK_SHIFT,
+		     page_in_chunk = page_idx & ~CHUNK_MASK;
+
+	if (likely(page_idx < umem->num_pages))
+		return umem->page_chunk[chunk_idx].plist[page_in_chunk];
+
+	return NULL;
+}
+#endif
diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c
new file mode 100644
index 0000000..b431748
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_qp.c
@@ -0,0 +1,1339 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/llist.h>
+#include <asm/barrier.h>
+#include <net/tcp.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+
+static char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof "TERMINATE"] = {
+	[SIW_QP_STATE_IDLE] = "IDLE",
+	[SIW_QP_STATE_RTR] = "RTR",
+	[SIW_QP_STATE_RTS] = "RTS",
+	[SIW_QP_STATE_CLOSING] = "CLOSING",
+	[SIW_QP_STATE_TERMINATE] = "TERMINATE",
+	[SIW_QP_STATE_ERROR] = "ERROR"
+};
+
+/*
+ * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp settings on a
+ * per-RDMAP message basis. Please keep the initializer order. All MPA
+ * lengths are initialized to the minimum packet size.
+ */
+struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] = {
+	{ /* RDMAP_RDMA_WRITE */
+	  .hdr_len = sizeof(struct iwarp_rdma_write),
+	  .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2),
+	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST |
+				 cpu_to_be16(DDP_VERSION << 8) |
+				 cpu_to_be16(RDMAP_VERSION << 6) |
+				 cpu_to_be16(RDMAP_RDMA_WRITE),
+	  .rx_data = siw_proc_write },
+	{ /* RDMAP_RDMA_READ_REQ */
+	  .hdr_len = sizeof(struct iwarp_rdma_rreq),
+	  .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2),
+	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+				 cpu_to_be16(RDMAP_VERSION << 6) |
+				 cpu_to_be16(RDMAP_RDMA_READ_REQ),
+	  .rx_data = siw_proc_rreq },
+	{ /* RDMAP_RDMA_READ_RESP */
+	  .hdr_len = sizeof(struct iwarp_rdma_rresp),
+	  .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2),
+	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST |
+				 cpu_to_be16(DDP_VERSION << 8) |
+				 cpu_to_be16(RDMAP_VERSION << 6) |
+				 cpu_to_be16(RDMAP_RDMA_READ_RESP),
+	  .rx_data = siw_proc_rresp },
+	{ /* RDMAP_SEND */
+	  .hdr_len = sizeof(struct iwarp_send),
+	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
+	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+				 cpu_to_be16(RDMAP_VERSION << 6) |
+				 cpu_to_be16(RDMAP_SEND),
+	  .rx_data = siw_proc_send },
+	{ /* RDMAP_SEND_INVAL */
+	  .hdr_len = sizeof(struct iwarp_send_inv),
+	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
+	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+				 cpu_to_be16(RDMAP_VERSION << 6) |
+				 cpu_to_be16(RDMAP_SEND_INVAL),
+	  .rx_data = siw_proc_send },
+	{ /* RDMAP_SEND_SE */
+	  .hdr_len = sizeof(struct iwarp_send),
+	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
+	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+				 cpu_to_be16(RDMAP_VERSION << 6) |
+				 cpu_to_be16(RDMAP_SEND_SE),
+	  .rx_data = siw_proc_send },
+	{ /* RDMAP_SEND_SE_INVAL */
+	  .hdr_len = sizeof(struct iwarp_send_inv),
+	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
+	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+				 cpu_to_be16(RDMAP_VERSION << 6) |
+				 cpu_to_be16(RDMAP_SEND_SE_INVAL),
+	  .rx_data = siw_proc_send },
+	{ /* RDMAP_TERMINATE */
+	  .hdr_len = sizeof(struct iwarp_terminate),
+	  .ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2),
+	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+				 cpu_to_be16(RDMAP_VERSION << 6) |
+				 cpu_to_be16(RDMAP_TERMINATE),
+	  .rx_data = siw_proc_terminate }
+};
+
+void siw_qp_llp_data_ready(struct sock *sk)
+{
+	struct siw_qp *qp;
+
+	read_lock(&sk->sk_callback_lock);
+
+	if (unlikely(!sk->sk_user_data || !sk_to_qp(sk)))
+		goto done;
+
+	qp = sk_to_qp(sk);
+
+	if (likely(!qp->rx_stream.rx_suspend &&
+		   down_read_trylock(&qp->state_lock))) {
+		read_descriptor_t rd_desc = { .arg.data = qp, .count = 1 };
+
+		if (likely(qp->attrs.state == SIW_QP_STATE_RTS))
+			/*
+			 * Implements data receive operation during
+			 * socket callback. TCP gracefully catches
+			 * the case where there is nothing to receive
+			 * (not calling siw_tcp_rx_data() then).
+			 */
+			tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
+
+		up_read(&qp->state_lock);
+	} else {
+		siw_dbg_qp(qp, "unable to process RX, suspend: %d\n",
+			   qp->rx_stream.rx_suspend);
+	}
+done:
+	read_unlock(&sk->sk_callback_lock);
+}
+
+void siw_qp_llp_close(struct siw_qp *qp)
+{
+	siw_dbg_qp(qp, "enter llp close, state = %s\n",
+		   siw_qp_state_to_string[qp->attrs.state]);
+
+	down_write(&qp->state_lock);
+
+	qp->rx_stream.rx_suspend = 1;
+	qp->tx_ctx.tx_suspend = 1;
+	qp->attrs.sk = NULL;
+
+	switch (qp->attrs.state) {
+	case SIW_QP_STATE_RTS:
+	case SIW_QP_STATE_RTR:
+	case SIW_QP_STATE_IDLE:
+	case SIW_QP_STATE_TERMINATE:
+		qp->attrs.state = SIW_QP_STATE_ERROR;
+		break;
+	/*
+	 * SIW_QP_STATE_CLOSING:
+	 *
+	 * This is a forced close. Shall the QP be moved to
+	 * ERROR or IDLE?
+	 */
+	case SIW_QP_STATE_CLOSING:
+		if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
+			qp->attrs.state = SIW_QP_STATE_ERROR;
+		else
+			qp->attrs.state = SIW_QP_STATE_IDLE;
+		break;
+
+	default:
+		siw_dbg_qp(qp, "llp close: no state transition needed: %s\n",
+			   siw_qp_state_to_string[qp->attrs.state]);
+		break;
+	}
+	siw_sq_flush(qp);
+	siw_rq_flush(qp);
+
+	/*
+	 * Dereference closing CEP
+	 */
+	if (qp->cep) {
+		siw_cep_put(qp->cep);
+		qp->cep = NULL;
+	}
+
+	up_write(&qp->state_lock);
+
+	siw_dbg_qp(qp, "llp close exit: state %s\n",
+		   siw_qp_state_to_string[qp->attrs.state]);
+}
+
+/*
+ * socket callback routine informing about newly available send space.
+ * Function schedules SQ work for processing SQ items.
+ */
+void siw_qp_llp_write_space(struct sock *sk)
+{
+	struct siw_cep *cep;
+
+	read_lock(&sk->sk_callback_lock);
+
+	cep  = sk_to_cep(sk);
+	if (cep) {
+		cep->sk_write_space(sk);
+
+		if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+			(void)siw_sq_start(cep->qp);
+	}
+
+	read_unlock(&sk->sk_callback_lock);
+}
+
+static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size)
+{
+	irq_size = roundup_pow_of_two(irq_size);
+	orq_size = roundup_pow_of_two(orq_size);
+
+	qp->attrs.irq_size = irq_size;
+	qp->attrs.orq_size = orq_size;
+
+	qp->irq = vzalloc(irq_size * sizeof(struct siw_sqe));
+	if (!qp->irq) {
+		siw_dbg_qp(qp, "irq malloc for %d failed\n", irq_size);
+		qp->attrs.irq_size = 0;
+		return -ENOMEM;
+	}
+	qp->orq = vzalloc(orq_size * sizeof(struct siw_sqe));
+	if (!qp->orq) {
+		siw_dbg_qp(qp, "orq malloc for %d failed\n", orq_size);
+		qp->attrs.orq_size = 0;
+		qp->attrs.irq_size = 0;
+		vfree(qp->irq);
+		return -ENOMEM;
+	}
+	siw_dbg_qp(qp, "ORD %d, IRD %d\n", orq_size, irq_size);
+	return 0;
+}
+
+static int siw_qp_enable_crc(struct siw_qp *qp)
+{
+	struct siw_rx_stream *c_rx = &qp->rx_stream;
+	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
+	int size;
+
+	if (siw_crypto_shash == NULL)
+		return -ENOENT;
+
+	size = crypto_shash_descsize(siw_crypto_shash) +
+		sizeof(struct shash_desc);
+
+	c_tx->mpa_crc_hd = kzalloc(size, GFP_KERNEL);
+	c_rx->mpa_crc_hd = kzalloc(size, GFP_KERNEL);
+	if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) {
+		kfree(c_tx->mpa_crc_hd);
+		kfree(c_rx->mpa_crc_hd);
+		c_tx->mpa_crc_hd = NULL;
+		c_rx->mpa_crc_hd = NULL;
+		return -ENOMEM;
+	}
+	c_tx->mpa_crc_hd->tfm = siw_crypto_shash;
+	c_rx->mpa_crc_hd->tfm = siw_crypto_shash;
+
+	return 0;
+}
+
+/*
+ * Send a non-signalled READ or WRITE to the peer side as negotiated
+ * with the MPAv2 P2P setup protocol. The work request is only created
+ * as the current active WR and does not consume Send Queue space.
+ *
+ * Caller must hold QP state lock.
+ */
+int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl)
+{
+	struct siw_wqe *wqe = tx_wqe(qp);
+	unsigned long flags;
+	int rv = 0;
+
+	spin_lock_irqsave(&qp->sq_lock, flags);
+
+	if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
+		spin_unlock_irqrestore(&qp->sq_lock, flags);
+		return -EIO;
+	}
+	memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
+
+	wqe->wr_status = SIW_WR_QUEUED;
+	wqe->sqe.flags = 0;
+	wqe->sqe.num_sge = 1;
+	wqe->sqe.sge[0].length = 0;
+	wqe->sqe.sge[0].laddr = 0;
+	wqe->sqe.sge[0].lkey = 0;
+	/*
+	 * While it must not be checked for inbound zero length
+	 * READ/WRITE, some HW may treat STag 0 specially.
+	 */
+	wqe->sqe.rkey = 1;
+	wqe->sqe.raddr = 0;
+	wqe->processed = 0;
+
+	if (ctrl & MPA_V2_RDMA_WRITE_RTR)
+		wqe->sqe.opcode = SIW_OP_WRITE;
+	else if (ctrl & MPA_V2_RDMA_READ_RTR) {
+		struct siw_sqe *rreq;
+
+		wqe->sqe.opcode = SIW_OP_READ;
+
+		spin_lock(&qp->orq_lock);
+
+		rreq = orq_get_free(qp);
+		if (rreq) {
+			siw_read_to_orq(rreq, &wqe->sqe);
+			qp->orq_put++;
+		} else
+			rv = -EIO;
+
+		spin_unlock(&qp->orq_lock);
+	} else
+		rv = -EINVAL;
+
+	if (rv)
+		wqe->wr_status = SIW_WR_IDLE;
+
+	spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+	if (!rv)
+		rv = siw_sq_start(qp);
+
+	return rv;
+}
+
+/*
+ * Map memory access error to DDP tagged error
+ */
+enum ddp_ecode siw_tagged_error(enum siw_access_state state)
+{
+	switch (state) {
+	case E_STAG_INVALID:
+		return DDP_ECODE_T_INVALID_STAG;
+	case E_BASE_BOUNDS:
+		return DDP_ECODE_T_BASE_BOUNDS;
+	case E_PD_MISMATCH:
+		return DDP_ECODE_T_STAG_NOT_ASSOC;
+	case E_ACCESS_PERM:
+		/*
+		 * RFC 5041 (DDP) lacks an ecode for insufficient access
+		 * permissions. 'Invalid STag' seems to be the closest
+		 * match, though.
+		 */
+		return DDP_ECODE_T_INVALID_STAG;
+	default:
+		WARN_ON(1);
+		return DDP_ECODE_T_INVALID_STAG;
+	}
+}
+
+/*
+ * Map memory access error to RDMAP protection error
+ */
+enum rdmap_ecode siw_rdmap_error(enum siw_access_state state)
+{
+	switch (state) {
+	case E_STAG_INVALID:
+		return RDMAP_ECODE_INVALID_STAG;
+	case E_BASE_BOUNDS:
+		return RDMAP_ECODE_BASE_BOUNDS;
+	case E_PD_MISMATCH:
+		return RDMAP_ECODE_STAG_NOT_ASSOC;
+	case E_ACCESS_PERM:
+		return RDMAP_ECODE_ACCESS_RIGHTS;
+	default:
+		return RDMAP_ECODE_UNSPECIFIED;
+	}
+}
+
+void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer, u8 etype,
+			u8 ecode, int in_tx)
+{
+	if (!qp->term_info.valid) {
+		memset(&qp->term_info, 0, sizeof(qp->term_info));
+		qp->term_info.layer = layer;
+		qp->term_info.etype = etype;
+		qp->term_info.ecode = ecode;
+		qp->term_info.in_tx = in_tx;
+		qp->term_info.valid = 1;
+	}
+	siw_dbg_qp(qp, "init TERM: layer %d, type %d, code %d, in tx %s\n",
+		   layer, etype, ecode, in_tx ? "yes" : "no");
+}
+
+/*
+ * Send a TERMINATE message, as defined in RFCs 5040/5041/5044/6581.
+ * Sending TERMINATE messages is best effort - such messages
+ * can only be sent if the QP is still connected and does
+ * not have another outbound message in progress, i.e. the
+ * TERMINATE message must not interfere with an incomplete current
+ * transmit operation.
+ */
+void siw_send_terminate(struct siw_qp *qp)
+{
+	struct kvec iov[3];
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
+	struct iwarp_terminate *term = NULL;
+	union iwarp_hdr *err_hdr = NULL;
+	struct socket *s = qp->attrs.sk;
+	struct siw_rx_stream *srx = &qp->rx_stream;
+	union iwarp_hdr *rx_hdr = &srx->hdr;
+	u32 crc = 0;
+	int num_frags, len_terminate, rv;
+
+	if (!qp->term_info.valid)
+		return;
+
+	qp->term_info.valid = 0;
+
+	if (tx_wqe(qp)->wr_status == SIW_WR_INPROGRESS) {
+		siw_dbg_qp(qp, "cannot send TERMINATE: op %d in progress\n",
+			   tx_type(tx_wqe(qp)));
+		return;
+	}
+	if (!s && qp->cep)
+		/* QP not yet in RTS. Take socket from connection end point */
+		s = qp->cep->sock;
+
+	if (!s) {
+		siw_dbg_qp(qp, "cannot send TERMINATE: not connected\n");
+		return;
+	}
+
+	term = kzalloc(sizeof(*term), GFP_KERNEL);
+	if (!term)
+		return;
+
+	term->ddp_qn = cpu_to_be32(RDMAP_UNTAGGED_QN_TERMINATE);
+	term->ddp_mo = 0;
+	term->ddp_msn = cpu_to_be32(1);
+
+	iov[0].iov_base = term;
+	iov[0].iov_len = sizeof(*term);
+
+	if ((qp->term_info.layer == TERM_ERROR_LAYER_DDP) ||
+	    ((qp->term_info.layer == TERM_ERROR_LAYER_RDMAP) &&
+	     (qp->term_info.etype != RDMAP_ETYPE_CATASTROPHIC))) {
+		err_hdr = kzalloc(sizeof(*err_hdr), GFP_KERNEL);
+		if (!err_hdr) {
+			kfree(term);
+			return;
+		}
+	}
+	memcpy(&term->ctrl, &iwarp_pktinfo[RDMAP_TERMINATE].ctrl,
+	       sizeof(struct iwarp_ctrl));
+
+	__rdmap_term_set_layer(term, qp->term_info.layer);
+	__rdmap_term_set_etype(term, qp->term_info.etype);
+	__rdmap_term_set_ecode(term, qp->term_info.ecode);
+
+	switch (qp->term_info.layer) {
+	case TERM_ERROR_LAYER_RDMAP:
+		if (qp->term_info.etype == RDMAP_ETYPE_CATASTROPHIC)
+			/* No additional DDP/RDMAP header to be included */
+			break;
+
+		if (qp->term_info.etype == RDMAP_ETYPE_REMOTE_PROTECTION) {
+			/*
+			 * Complete RDMAP frame will get attached, and
+			 * DDP segment length is valid
+			 */
+			term->flag_m = 1;
+			term->flag_d = 1;
+			term->flag_r = 1;
+
+			if (qp->term_info.in_tx) {
+				struct iwarp_rdma_rreq *rreq;
+				struct siw_wqe *wqe = tx_wqe(qp);
+
+				/* Inbound RREQ error, detected during
+				 * RRESP creation. Take state from
+				 * current TX work queue element to
+				 * reconstruct the peer's RREQ.
+				 */
+				rreq = (struct iwarp_rdma_rreq *)err_hdr;
+
+				memcpy(&rreq->ctrl,
+				       &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
+				       sizeof(struct iwarp_ctrl));
+
+				rreq->rsvd = 0;
+				rreq->ddp_qn =
+					htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
+
+				/* Provide RREQ's MSN as kept aside */
+				rreq->ddp_msn = htonl(wqe->sqe.sge[0].length);
+
+				rreq->ddp_mo = htonl(wqe->processed);
+				rreq->sink_stag = htonl(wqe->sqe.rkey);
+				rreq->sink_to = cpu_to_be64(wqe->sqe.raddr);
+				rreq->read_size = htonl(wqe->sqe.sge[0].length);
+				rreq->source_stag = htonl(wqe->sqe.sge[0].lkey);
+				rreq->source_to =
+					cpu_to_be64(wqe->sqe.sge[0].laddr);
+
+				iov[1].iov_base = rreq;
+				iov[1].iov_len = sizeof(*rreq);
+
+				rx_hdr = (union iwarp_hdr *)rreq;
+			} else {
+				/* Take RDMAP/DDP information from
+				 * current (failed) inbound frame.
+				 */
+				iov[1].iov_base = rx_hdr;
+
+				if (__rdmap_get_opcode(&rx_hdr->ctrl) ==
+				    RDMAP_RDMA_READ_REQ)
+					iov[1].iov_len =
+						sizeof(struct iwarp_rdma_rreq);
+				else /* SEND type */
+					iov[1].iov_len =
+						sizeof(struct iwarp_send);
+			}
+		} else {
+			/* Do not report DDP hdr information if packet
+			 * layout is unknown
+			 */
+			if ((qp->term_info.ecode == RDMAP_ECODE_VERSION) ||
+			    (qp->term_info.ecode == RDMAP_ECODE_OPCODE))
+				break;
+
+			iov[1].iov_base = rx_hdr;
+
+			/* Only DDP frame will get attached */
+			if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
+				iov[1].iov_len =
+					sizeof(struct iwarp_rdma_write);
+			else
+				iov[1].iov_len = sizeof(struct iwarp_send);
+
+			term->flag_m = 1;
+			term->flag_d = 1;
+		}
+		term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len);
+		break;
+
+	case TERM_ERROR_LAYER_DDP:
+		/* Report error encountered during DDP processing.
+		 * This can only happen as a result of inbound
+		 * DDP processing
+		 */
+
+		/* Do not report DDP hdr information if packet
+		 * layout is unknown
+		 */
+		if (((qp->term_info.etype == DDP_ETYPE_TAGGED_BUF) &&
+		     (qp->term_info.ecode == DDP_ECODE_T_VERSION)) ||
+		    ((qp->term_info.etype == DDP_ETYPE_UNTAGGED_BUF) &&
+		     (qp->term_info.ecode == DDP_ECODE_UT_VERSION)))
+			break;
+
+		iov[1].iov_base = rx_hdr;
+
+		if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
+			iov[1].iov_len = sizeof(struct iwarp_ctrl_tagged);
+		else
+			iov[1].iov_len = sizeof(struct iwarp_ctrl_untagged);
+
+		term->flag_m = 1;
+		term->flag_d = 1;
+		break;
+
+	default:
+		break;
+	}
+	if (term->flag_m || term->flag_d || term->flag_r) {
+		iov[2].iov_base = &crc;
+		iov[2].iov_len = sizeof(crc);
+		len_terminate = sizeof(*term) + iov[1].iov_len + MPA_CRC_SIZE;
+		num_frags = 3;
+	} else {
+		iov[1].iov_base = &crc;
+		iov[1].iov_len = sizeof(crc);
+		len_terminate = sizeof(*term) + MPA_CRC_SIZE;
+		num_frags = 2;
+	}
+
+	/* Adjust DDP Segment Length parameter, if valid */
+	if (term->flag_m) {
+		u32 real_ddp_len = be16_to_cpu(rx_hdr->ctrl.mpa_len);
+		enum rdma_opcode op = __rdmap_get_opcode(&rx_hdr->ctrl);
+
+		real_ddp_len -= iwarp_pktinfo[op].hdr_len - MPA_HDR_SIZE;
+		rx_hdr->ctrl.mpa_len = cpu_to_be16(real_ddp_len);
+	}
+
+	term->ctrl.mpa_len =
+		cpu_to_be16(len_terminate - (MPA_HDR_SIZE + MPA_CRC_SIZE));
+	if (qp->tx_ctx.mpa_crc_hd) {
+		crypto_shash_init(qp->tx_ctx.mpa_crc_hd);
+		if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd,
+					(u8 *)iov[0].iov_base,
+					iov[0].iov_len))
+			goto out;
+
+		if (num_frags == 3) {
+			if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd,
+						(u8 *)iov[1].iov_base,
+						iov[1].iov_len))
+				goto out;
+		}
+		crypto_shash_final(qp->tx_ctx.mpa_crc_hd, (u8 *)&crc);
+	}
+
+	rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate);
+	siw_dbg_qp(qp, "sent TERM: %s, layer %d, type %d, code %d (%d bytes)\n",
+		   rv == len_terminate ? "success" : "failure",
+		   __rdmap_term_layer(term), __rdmap_term_etype(term),
+		   __rdmap_term_ecode(term), rv);
+out:
+	kfree(term);
+	kfree(err_hdr);
+}
+
+/*
+ * Handle all attrs other than state
+ */
+static void siw_qp_modify_nonstate(struct siw_qp *qp,
+				   struct siw_qp_attrs *attrs,
+				   enum siw_qp_attr_mask mask)
+{
+	if (mask & SIW_QP_ATTR_ACCESS_FLAGS) {
+		if (attrs->flags & SIW_RDMA_BIND_ENABLED)
+			qp->attrs.flags |= SIW_RDMA_BIND_ENABLED;
+		else
+			qp->attrs.flags &= ~SIW_RDMA_BIND_ENABLED;
+
+		if (attrs->flags & SIW_RDMA_WRITE_ENABLED)
+			qp->attrs.flags |= SIW_RDMA_WRITE_ENABLED;
+		else
+			qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED;
+
+		if (attrs->flags & SIW_RDMA_READ_ENABLED)
+			qp->attrs.flags |= SIW_RDMA_READ_ENABLED;
+		else
+			qp->attrs.flags &= ~SIW_RDMA_READ_ENABLED;
+	}
+}
+
+static int siw_qp_nextstate_from_idle(struct siw_qp *qp,
+				      struct siw_qp_attrs *attrs,
+				      enum siw_qp_attr_mask mask)
+{
+	int rv = 0;
+
+	switch (attrs->state) {
+	case SIW_QP_STATE_RTS:
+		if (attrs->flags & SIW_MPA_CRC) {
+			rv = siw_qp_enable_crc(qp);
+			if (rv)
+				break;
+		}
+		if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) {
+			siw_dbg_qp(qp, "no socket\n");
+			rv = -EINVAL;
+			break;
+		}
+		if (!(mask & SIW_QP_ATTR_MPA)) {
+			siw_dbg_qp(qp, "no MPA\n");
+			rv = -EINVAL;
+			break;
+		}
+		/*
+		 * Initialize iWARP TX state
+		 */
+		qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0;
+		qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0;
+		qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0;
+
+		/*
+		 * Initialize iWARP RX state
+		 */
+		qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1;
+		qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1;
+		qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1;
+
+		/*
+		 * init IRD free queue, caller has already checked
+		 * limits.
+		 */
+		rv = siw_qp_readq_init(qp, attrs->irq_size,
+				       attrs->orq_size);
+		if (rv)
+			break;
+
+		qp->attrs.sk = attrs->sk;
+		qp->attrs.state = SIW_QP_STATE_RTS;
+
+		siw_dbg_qp(qp, "enter RTS: crc=%s, ord=%u, ird=%u\n",
+			   attrs->flags & SIW_MPA_CRC ? "y" : "n",
+			   qp->attrs.orq_size, qp->attrs.irq_size);
+		break;
+
+	case SIW_QP_STATE_ERROR:
+		siw_rq_flush(qp);
+		qp->attrs.state = SIW_QP_STATE_ERROR;
+		if (qp->cep) {
+			siw_cep_put(qp->cep);
+			qp->cep = NULL;
+		}
+		break;
+
+	default:
+		break;
+	}
+	return rv;
+}
+
+static int siw_qp_nextstate_from_rts(struct siw_qp *qp,
+				     struct siw_qp_attrs *attrs)
+{
+	int drop_conn = 0;
+
+	switch (attrs->state) {
+	case SIW_QP_STATE_CLOSING:
+		/*
+		 * Verbs: move to IDLE if SQ and ORQ are empty.
+		 * Move to ERROR otherwise. But first of all we must
+		 * close the connection. So we keep CLOSING or ERROR
+		 * as a transient state, schedule connection drop work
+		 * and wait for the socket state change upcall to
+		 * come back closed.
+		 */
+		if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) {
+			qp->attrs.state = SIW_QP_STATE_CLOSING;
+		} else {
+			qp->attrs.state = SIW_QP_STATE_ERROR;
+			siw_sq_flush(qp);
+		}
+		siw_rq_flush(qp);
+
+		drop_conn = 1;
+		break;
+
+	case SIW_QP_STATE_TERMINATE:
+		qp->attrs.state = SIW_QP_STATE_TERMINATE;
+
+		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+				   RDMAP_ETYPE_CATASTROPHIC,
+				   RDMAP_ECODE_UNSPECIFIED, 1);
+		drop_conn = 1;
+		break;
+
+	case SIW_QP_STATE_ERROR:
+		/*
+		 * This is an emergency close.
+		 *
+		 * Any in-progress transmit operation will get
+		 * cancelled.
+		 * This will likely result in a protocol failure,
+		 * if a TX operation is in transit. The caller
+		 * could unconditionally wait to give the current
+		 * operation a chance to complete.
+		 * Esp., how to handle the non-empty IRQ case?
+		 * The peer was asking for data transfer at a valid
+		 * point in time.
+		 */
+		siw_sq_flush(qp);
+		siw_rq_flush(qp);
+		qp->attrs.state = SIW_QP_STATE_ERROR;
+		drop_conn = 1;
+		break;
+
+	default:
+		break;
+	}
+	return drop_conn;
+}
+
+static void siw_qp_nextstate_from_term(struct siw_qp *qp,
+				       struct siw_qp_attrs *attrs)
+{
+	switch (attrs->state) {
+	case SIW_QP_STATE_ERROR:
+		siw_rq_flush(qp);
+		qp->attrs.state = SIW_QP_STATE_ERROR;
+
+		if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
+			siw_sq_flush(qp);
+		break;
+
+	default:
+		break;
+	}
+}
+
+static int siw_qp_nextstate_from_close(struct siw_qp *qp,
+				       struct siw_qp_attrs *attrs)
+{
+	int rv = 0;
+
+	switch (attrs->state) {
+	case SIW_QP_STATE_IDLE:
+		WARN_ON(tx_wqe(qp)->wr_status != SIW_WR_IDLE);
+		qp->attrs.state = SIW_QP_STATE_IDLE;
+		break;
+
+	case SIW_QP_STATE_CLOSING:
+		/*
+		 * The LLP may have already moved the QP to CLOSING
+		 * due to graceful peer close initiation
+		 */
+		break;
+
+	case SIW_QP_STATE_ERROR:
+		/*
+		 * QP was moved to CLOSING by LLP event
+		 * not yet seen by user.
+		 */
+		qp->attrs.state = SIW_QP_STATE_ERROR;
+
+		if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
+			siw_sq_flush(qp);
+
+		siw_rq_flush(qp);
+		break;
+
+	default:
+		siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
+			   siw_qp_state_to_string[qp->attrs.state],
+			   siw_qp_state_to_string[attrs->state]);
+
+		rv = -ECONNABORTED;
+	}
+	return rv;
+}
+
+/*
+ * Caller must hold qp->state_lock
+ */
+int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs,
+		  enum siw_qp_attr_mask mask)
+{
+	int drop_conn = 0, rv = 0;
+
+	if (!mask)
+		return 0;
+
+	siw_dbg_qp(qp, "state: %s => %s\n",
+		   siw_qp_state_to_string[qp->attrs.state],
+		   siw_qp_state_to_string[attrs->state]);
+
+	if (mask != SIW_QP_ATTR_STATE)
+		siw_qp_modify_nonstate(qp, attrs, mask);
+
+	if (!(mask & SIW_QP_ATTR_STATE))
+		return 0;
+
+	switch (qp->attrs.state) {
+	case SIW_QP_STATE_IDLE:
+	case SIW_QP_STATE_RTR:
+		rv = siw_qp_nextstate_from_idle(qp, attrs, mask);
+		break;
+
+	case SIW_QP_STATE_RTS:
+		drop_conn = siw_qp_nextstate_from_rts(qp, attrs);
+		break;
+
+	case SIW_QP_STATE_TERMINATE:
+		siw_qp_nextstate_from_term(qp, attrs);
+		break;
+
+	case SIW_QP_STATE_CLOSING:
+		siw_qp_nextstate_from_close(qp, attrs);
+		break;
+	default:
+		break;
+	}
+	if (drop_conn)
+		siw_qp_cm_drop(qp, 0);
+
+	return rv;
+}
+
+void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe)
+{
+	rreq->id = sqe->id;
+	rreq->opcode = sqe->opcode;
+	rreq->sge[0].laddr = sqe->sge[0].laddr;
+	rreq->sge[0].length = sqe->sge[0].length;
+	rreq->sge[0].lkey = sqe->sge[0].lkey;
+	rreq->sge[1].lkey = sqe->sge[1].lkey;
+	rreq->flags = sqe->flags | SIW_WQE_VALID;
+	rreq->num_sge = 1;
+}
+
+/*
+ * Must be called with SQ locked.
+ * To avoid complete SQ starvation by constant inbound READ requests,
+ * the active IRQ will not be served after qp->irq_burst, if the
+ * SQ has pending work.
+ */
+int siw_activate_tx(struct siw_qp *qp)
+{
+	struct siw_sqe *irqe, *sqe;
+	struct siw_wqe *wqe = tx_wqe(qp);
+	int rv = 1;
+
+	irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];
+
+	if (irqe->flags & SIW_WQE_VALID) {
+		sqe = sq_get_next(qp);
+
+		/*
+		 * Avoid local WQE processing starvation in case
+		 * of constant inbound READ request stream
+		 */
+		if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
+			qp->irq_burst = 0;
+			goto skip_irq;
+		}
+		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
+		wqe->wr_status = SIW_WR_QUEUED;
+
+		/* start READ RESPONSE */
+		wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
+		wqe->sqe.flags = 0;
+		if (irqe->num_sge) {
+			wqe->sqe.num_sge = 1;
+			wqe->sqe.sge[0].length = irqe->sge[0].length;
+			wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
+			wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
+		} else {
+			wqe->sqe.num_sge = 0;
+		}
+
+		/* Retain original RREQ's message sequence number for
+		 * potential error reporting cases.
+		 */
+		wqe->sqe.sge[1].length = irqe->sge[1].length;
+
+		wqe->sqe.rkey = irqe->rkey;
+		wqe->sqe.raddr = irqe->raddr;
+
+		wqe->processed = 0;
+		qp->irq_get++;
+
+		/* mark current IRQ entry free */
+		smp_store_mb(irqe->flags, 0);
+
+		goto out;
+	}
+	sqe = sq_get_next(qp);
+	if (sqe) {
+skip_irq:
+		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
+		wqe->wr_status = SIW_WR_QUEUED;
+
+		/* First copy SQE to kernel private memory */
+		memcpy(&wqe->sqe, sqe, sizeof(*sqe));
+
+		if (wqe->sqe.opcode >= SIW_NUM_OPCODES) {
+			rv = -EINVAL;
+			goto out;
+		}
+		if (wqe->sqe.flags & SIW_WQE_INLINE) {
+			if (wqe->sqe.opcode != SIW_OP_SEND &&
+			    wqe->sqe.opcode != SIW_OP_WRITE) {
+				rv = -EINVAL;
+				goto out;
+			}
+			if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) {
+				rv = -EINVAL;
+				goto out;
+			}
+			wqe->sqe.sge[0].laddr = (uintptr_t)&wqe->sqe.sge[1];
+			wqe->sqe.sge[0].lkey = 0;
+			wqe->sqe.num_sge = 1;
+		}
+		if (wqe->sqe.flags & SIW_WQE_READ_FENCE) {
+			/* A READ cannot be fenced */
+			if (unlikely(wqe->sqe.opcode == SIW_OP_READ ||
+				     wqe->sqe.opcode ==
+					     SIW_OP_READ_LOCAL_INV)) {
+				siw_dbg_qp(qp, "cannot fence read\n");
+				rv = -EINVAL;
+				goto out;
+			}
+			spin_lock(&qp->orq_lock);
+
+			if (!siw_orq_empty(qp)) {
+				qp->tx_ctx.orq_fence = 1;
+				rv = 0;
+			}
+			spin_unlock(&qp->orq_lock);
+
+		} else if (wqe->sqe.opcode == SIW_OP_READ ||
+			   wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
+			struct siw_sqe *rreq;
+
+			wqe->sqe.num_sge = 1;
+
+			spin_lock(&qp->orq_lock);
+
+			rreq = orq_get_free(qp);
+			if (rreq) {
+				/*
+				 * Make an immediate copy in ORQ to be ready
+				 * to process loopback READ reply
+				 */
+				siw_read_to_orq(rreq, &wqe->sqe);
+				qp->orq_put++;
+			} else {
+				qp->tx_ctx.orq_fence = 1;
+				rv = 0;
+			}
+			spin_unlock(&qp->orq_lock);
+		}
+
+		/* Clear SQE, can be re-used by application */
+		smp_store_mb(sqe->flags, 0);
+		qp->sq_get++;
+	} else {
+		rv = 0;
+	}
+out:
+	if (unlikely(rv < 0)) {
+		siw_dbg_qp(qp, "error %d\n", rv);
+		wqe->wr_status = SIW_WR_IDLE;
+	}
+	return rv;
+}
+
+/*
+ * Check if current CQ state qualifies for calling CQ completion
+ * handler. Must be called with CQ lock held.
+ */
+static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags)
+{
+	u32 cq_notify;
+
+	if (!cq->base_cq.comp_handler)
+		return false;
+
+	/* Read application shared notification state */
+	cq_notify = READ_ONCE(cq->notify->flags);
+
+	if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) ||
+	    ((cq_notify & SIW_NOTIFY_SOLICITED) &&
+	     (flags & SIW_WQE_SOLICITED))) {
+		/*
+		 * CQ notification is one-shot: Since the
+		 * current CQE causes user notification,
+		 * the CQ gets disarmed and must be re-armed
+		 * by the user for a new notification.
+		 */
+		WRITE_ONCE(cq->notify->flags, SIW_NOTIFY_NOT);
+
+		return true;
+	}
+	return false;
+}
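+
+/*
+ * Re-arm sketch (assumed consumer-side usage; the notify verb handler
+ * itself lives in siw_verbs.c and is not shown here): after a one-shot
+ * notification fired, a kernel consumer re-arms the CQ via the notify
+ * verb, e.g.
+ *
+ *	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ *
+ * which records the requested mode in the shared cq->notify->flags state
+ * read above.
+ */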
+
+int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
+		     enum siw_wc_status status)
+{
+	struct siw_cq *cq = qp->scq;
+	int rv = 0;
+
+	if (cq) {
+		u32 sqe_flags = sqe->flags;
+		struct siw_cqe *cqe;
+		u32 idx;
+		unsigned long flags;
+
+		spin_lock_irqsave(&cq->lock, flags);
+
+		idx = cq->cq_put % cq->num_cqe;
+		cqe = &cq->queue[idx];
+
+		if (!READ_ONCE(cqe->flags)) {
+			bool notify;
+
+			cqe->id = sqe->id;
+			cqe->opcode = sqe->opcode;
+			cqe->status = status;
+			cqe->imm_data = 0;
+			cqe->bytes = bytes;
+
+			if (cq->kernel_verbs)
+				cqe->base_qp = qp->ib_qp;
+			else
+				cqe->qp_id = qp_id(qp);
+
+			/* mark CQE valid for application */
+			WRITE_ONCE(cqe->flags, SIW_WQE_VALID);
+			/* recycle SQE */
+			smp_store_mb(sqe->flags, 0);
+
+			cq->cq_put++;
+			notify = siw_cq_notify_now(cq, sqe_flags);
+
+			spin_unlock_irqrestore(&cq->lock, flags);
+
+			if (notify) {
+				siw_dbg_cq(cq, "Call completion handler\n");
+				cq->base_cq.comp_handler(&cq->base_cq,
+						cq->base_cq.cq_context);
+			}
+		} else {
+			spin_unlock_irqrestore(&cq->lock, flags);
+			rv = -ENOMEM;
+			siw_cq_event(cq, IB_EVENT_CQ_ERR);
+		}
+	} else {
+		/* recycle SQE */
+		smp_store_mb(sqe->flags, 0);
+	}
+	return rv;
+}
+
+int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes,
+		     u32 inval_stag, enum siw_wc_status status)
+{
+	struct siw_cq *cq = qp->rcq;
+	int rv = 0;
+
+	if (cq) {
+		struct siw_cqe *cqe;
+		u32 idx;
+		unsigned long flags;
+
+		spin_lock_irqsave(&cq->lock, flags);
+
+		idx = cq->cq_put % cq->num_cqe;
+		cqe = &cq->queue[idx];
+
+		if (!READ_ONCE(cqe->flags)) {
+			bool notify;
+			u8 cqe_flags = SIW_WQE_VALID;
+
+			cqe->id = rqe->id;
+			cqe->opcode = SIW_OP_RECEIVE;
+			cqe->status = status;
+			cqe->imm_data = 0;
+			cqe->bytes = bytes;
+
+			if (cq->kernel_verbs) {
+				cqe->base_qp = qp->ib_qp;
+				if (inval_stag) {
+					cqe_flags |= SIW_WQE_REM_INVAL;
+					cqe->inval_stag = inval_stag;
+				}
+			} else {
+				cqe->qp_id = qp_id(qp);
+			}
+			/* mark CQE valid for application */
+			WRITE_ONCE(cqe->flags, cqe_flags);
+			/* recycle RQE */
+			smp_store_mb(rqe->flags, 0);
+
+			cq->cq_put++;
+			notify = siw_cq_notify_now(cq, SIW_WQE_SIGNALLED);
+
+			spin_unlock_irqrestore(&cq->lock, flags);
+
+			if (notify) {
+				siw_dbg_cq(cq, "Call completion handler\n");
+				cq->base_cq.comp_handler(&cq->base_cq,
+						cq->base_cq.cq_context);
+			}
+		} else {
+			spin_unlock_irqrestore(&cq->lock, flags);
+			rv = -ENOMEM;
+			siw_cq_event(cq, IB_EVENT_CQ_ERR);
+		}
+	} else {
+		/* recycle RQE */
+		smp_store_mb(rqe->flags, 0);
+	}
+	return rv;
+}
+
+/*
+ * siw_sq_flush()
+ *
+ * Flush SQ and ORQ entries to CQ.
+ *
+ * Must be called with QP state write lock held.
+ * Therefore, SQ and ORQ lock must not be taken.
+ */
+void siw_sq_flush(struct siw_qp *qp)
+{
+	struct siw_sqe *sqe;
+	struct siw_wqe *wqe = tx_wqe(qp);
+	int async_event = 0;
+
+	/*
+	 * Start with completing any work currently on the ORQ
+	 */
+	while (qp->attrs.orq_size) {
+		sqe = &qp->orq[qp->orq_get % qp->attrs.orq_size];
+		if (!READ_ONCE(sqe->flags))
+			break;
+
+		if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
+			break;
+
+		WRITE_ONCE(sqe->flags, 0);
+		qp->orq_get++;
+	}
+	/*
+	 * Flush an in-progress WQE if present
+	 */
+	if (wqe->wr_status != SIW_WR_IDLE) {
+		siw_dbg_qp(qp, "flush current SQE, type %d, status %d\n",
+			   tx_type(wqe), wqe->wr_status);
+
+		siw_wqe_put_mem(wqe, tx_type(wqe));
+
+		if (tx_type(wqe) != SIW_OP_READ_RESPONSE &&
+		    ((tx_type(wqe) != SIW_OP_READ &&
+		      tx_type(wqe) != SIW_OP_READ_LOCAL_INV) ||
+		     wqe->wr_status == SIW_WR_QUEUED))
+			/*
+			 * An in-progress Read Request is already in
+			 * the ORQ
+			 */
+			siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
+					 SIW_WC_WR_FLUSH_ERR);
+
+		wqe->wr_status = SIW_WR_IDLE;
+	}
+	/*
+	 * Flush the Send Queue
+	 */
+	while (qp->attrs.sq_size) {
+		sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
+		if (!READ_ONCE(sqe->flags))
+			break;
+
+		async_event = 1;
+		if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
+			/*
+			 * Shall IB_EVENT_SQ_DRAINED be suppressed if work
+			 * completion fails?
+			 */
+			break;
+
+		WRITE_ONCE(sqe->flags, 0);
+		qp->sq_get++;
+	}
+	if (async_event)
+		siw_qp_event(qp, IB_EVENT_SQ_DRAINED);
+}
+
+/*
+ * siw_rq_flush()
+ *
+ * Flush recv queue entries to CQ. Also
+ * takes care of pending active tagged and untagged
+ * inbound transfers, which reference target memory.
+ *
+ * Must be called with QP state write lock held.
+ * Therefore, RQ lock must not be taken.
+ */
+void siw_rq_flush(struct siw_qp *qp)
+{
+	struct siw_wqe *wqe = &qp->rx_untagged.wqe_active;
+
+	/*
+	 * Flush an in-progress untagged operation if present
+	 */
+	if (wqe->wr_status != SIW_WR_IDLE) {
+		siw_dbg_qp(qp, "flush current rqe, type %d, status %d\n",
+			   rx_type(wqe), wqe->wr_status);
+
+		siw_wqe_put_mem(wqe, rx_type(wqe));
+
+		if (rx_type(wqe) == SIW_OP_RECEIVE) {
+			siw_rqe_complete(qp, &wqe->rqe, wqe->bytes,
+					 0, SIW_WC_WR_FLUSH_ERR);
+		} else if (rx_type(wqe) != SIW_OP_READ &&
+			   rx_type(wqe) != SIW_OP_READ_RESPONSE &&
+			   rx_type(wqe) != SIW_OP_WRITE) {
+			siw_sqe_complete(qp, &wqe->sqe, 0, SIW_WC_WR_FLUSH_ERR);
+		}
+		wqe->wr_status = SIW_WR_IDLE;
+	}
+	wqe = &qp->rx_tagged.wqe_active;
+
+	if (wqe->wr_status != SIW_WR_IDLE) {
+		siw_wqe_put_mem(wqe, rx_type(wqe));
+		wqe->wr_status = SIW_WR_IDLE;
+	}
+	/*
+	 * Flush the Receive Queue
+	 */
+	while (qp->attrs.rq_size) {
+		struct siw_rqe *rqe =
+			&qp->recvq[qp->rq_get % qp->attrs.rq_size];
+
+		if (!READ_ONCE(rqe->flags))
+			break;
+
+		if (siw_rqe_complete(qp, rqe, 0, 0, SIW_WC_WR_FLUSH_ERR) != 0)
+			break;
+
+		WRITE_ONCE(rqe->flags, 0);
+		qp->rq_get++;
+	}
+}
+
+int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp)
+{
+	int rv = xa_alloc(&sdev->qp_xa, &qp->ib_qp->qp_num, qp, xa_limit_32b,
+			  GFP_KERNEL);
+
+	if (!rv) {
+		kref_init(&qp->ref);
+		qp->sdev = sdev;
+		qp->qp_num = qp->ib_qp->qp_num;
+		siw_dbg_qp(qp, "new QP\n");
+	}
+	return rv;
+}
+
+void siw_free_qp(struct kref *ref)
+{
+	struct siw_qp *found, *qp = container_of(ref, struct siw_qp, ref);
+	struct siw_base_qp *siw_base_qp = to_siw_base_qp(qp->ib_qp);
+	struct siw_device *sdev = qp->sdev;
+	unsigned long flags;
+
+	if (qp->cep)
+		siw_cep_put(qp->cep);
+
+	found = xa_erase(&sdev->qp_xa, qp_id(qp));
+	WARN_ON(found != qp);
+	spin_lock_irqsave(&sdev->lock, flags);
+	list_del(&qp->devq);
+	spin_unlock_irqrestore(&sdev->lock, flags);
+
+	vfree(qp->sendq);
+	vfree(qp->recvq);
+	vfree(qp->irq);
+	vfree(qp->orq);
+
+	siw_put_tx_cpu(qp->tx_cpu);
+
+	atomic_dec(&sdev->num_qp);
+	siw_dbg_qp(qp, "free QP\n");
+	kfree_rcu(qp, rcu);
+	kfree(siw_base_qp);
+}
diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c
new file mode 100644
index 0000000..c0a8872
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_qp_rx.c
@@ -0,0 +1,1460 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+
+/*
+ * siw_rx_umem()
+ *
+ * Receive data of @len into target referenced by @dest_addr.
+ *
+ * @srx:	Receive Context
+ * @umem:	siw representation of target memory
+ * @dest_addr:	user virtual address
+ * @len:	number of bytes to place
+ */
+static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
+		       u64 dest_addr, int len)
+{
+	int copied = 0;
+
+	while (len) {
+		struct page *p;
+		int pg_off, bytes, rv;
+		void *dest;
+
+		p = siw_get_upage(umem, dest_addr);
+		if (unlikely(!p)) {
+			pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n",
+				__func__, qp_id(rx_qp(srx)),
+				(void *)(uintptr_t)dest_addr,
+				(void *)(uintptr_t)umem->fp_addr);
+			/* siw internal error */
+			srx->skb_copied += copied;
+			srx->skb_new -= copied;
+
+			return -EFAULT;
+		}
+		pg_off = dest_addr & ~PAGE_MASK;
+		bytes = min(len, (int)PAGE_SIZE - pg_off);
+
+		siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", p, bytes);
+
+		dest = kmap_atomic(p);
+		rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
+				   bytes);
+
+		if (unlikely(rv)) {
+			kunmap_atomic(dest);
+			srx->skb_copied += copied;
+			srx->skb_new -= copied;
+
+			pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
+				qp_id(rx_qp(srx)), __func__, len, p, rv);
+
+			return -EFAULT;
+		}
+		if (srx->mpa_crc_hd) {
+			if (rx_qp(srx)->kernel_verbs) {
+				crypto_shash_update(srx->mpa_crc_hd,
+					(u8 *)(dest + pg_off), bytes);
+				kunmap_atomic(dest);
+			} else {
+				kunmap_atomic(dest);
+				/*
+				 * Do CRC on original, not target buffer.
+				 * Some user land applications may
+				 * concurrently write the target buffer,
+				 * which would yield a broken CRC.
+				 * Walking the skb twice is very inefficient.
+				 * Folding the CRC into skb_copy_bits()
+				 * would be much better, but is currently
+				 * not supported.
+				 */
+				siw_crc_skb(srx, bytes);
+			}
+		} else {
+			kunmap_atomic(dest);
+		}
+		srx->skb_offset += bytes;
+		copied += bytes;
+		len -= bytes;
+		dest_addr += bytes;
+		pg_off = 0;
+	}
+	srx->skb_copied += copied;
+	srx->skb_new -= copied;
+
+	return copied;
+}
+
+static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
+{
+	int rv;
+
+	siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len);
+
+	rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
+	if (unlikely(rv)) {
+		pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n",
+			qp_id(rx_qp(srx)), __func__, len, kva, rv);
+
+		return rv;
+	}
+	if (srx->mpa_crc_hd)
+		crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);
+
+	srx->skb_offset += len;
+	srx->skb_copied += len;
+	srx->skb_new -= len;
+
+	return len;
+}
+
+static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
+		      struct siw_mem *mem, u64 addr, int len)
+{
+	struct siw_pbl *pbl = mem->pbl;
+	u64 offset = addr - mem->va;
+	int copied = 0;
+
+	while (len) {
+		int bytes;
+		dma_addr_t buf_addr =
+			siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx);
+		if (!buf_addr)
+			break;
+
+		bytes = min(bytes, len);
+		if (siw_rx_kva(srx, (void *)buf_addr, bytes) == bytes) {
+			copied += bytes;
+			offset += bytes;
+			len -= bytes;
+		} else {
+			break;
+		}
+	}
+	return copied;
+}
+
+/*
+ * siw_rresp_check_ntoh()
+ *
+ * Check incoming RRESP fragment header against expected
+ * header values and update expected values for potential next
+ * fragment.
+ *
+ * NOTE: This function must be called only if an RRESP DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segment.
+ */
+static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
+				struct siw_rx_fpdu *frx)
+{
+	struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp;
+	struct siw_wqe *wqe = &frx->wqe_active;
+	enum ddp_ecode ecode;
+
+	u32 sink_stag = be32_to_cpu(rresp->sink_stag);
+	u64 sink_to = be64_to_cpu(rresp->sink_to);
+
+	if (frx->first_ddp_seg) {
+		srx->ddp_stag = wqe->sqe.sge[0].lkey;
+		srx->ddp_to = wqe->sqe.sge[0].laddr;
+		frx->pbl_idx = 0;
+	}
+	/* Below checks extend beyond the semantics of DDP, and
+	 * into RDMAP:
+	 * We check if the read response matches exactly the
+	 * read request which was sent to the remote peer to
+	 * trigger this read response. RFC5040/5041 do not
+	 * always have a proper error code for the detected
+	 * error cases. We choose 'base or bounds error' for
+	 * cases where the inbound STag is valid, but offset
+	 * or length do not match our response receive state.
+	 */
+	if (unlikely(srx->ddp_stag != sink_stag)) {
+		pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
+			qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag);
+		ecode = DDP_ECODE_T_INVALID_STAG;
+		goto error;
+	}
+	if (unlikely(srx->ddp_to != sink_to)) {
+		pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
+			qp_id(rx_qp(srx)), (unsigned long long)sink_to,
+			(unsigned long long)srx->ddp_to);
+		ecode = DDP_ECODE_T_BASE_BOUNDS;
+		goto error;
+	}
+	if (unlikely(!frx->more_ddp_segs &&
+		     (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) {
+		pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
+			qp_id(rx_qp(srx)),
+			wqe->processed + srx->fpdu_part_rem, wqe->bytes);
+		ecode = DDP_ECODE_T_BASE_BOUNDS;
+		goto error;
+	}
+	return 0;
+error:
+	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
+	return -EINVAL;
+}
+
+/*
+ * siw_write_check_ntoh()
+ *
+ * Check incoming WRITE fragment header against expected
+ * header values and update expected values for potential next
+ * fragment
+ *
+ * NOTE: This function must be called only if a WRITE DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segment.
+ */
+static int siw_write_check_ntoh(struct siw_rx_stream *srx,
+				struct siw_rx_fpdu *frx)
+{
+	struct iwarp_rdma_write *write = &srx->hdr.rwrite;
+	enum ddp_ecode ecode;
+
+	u32 sink_stag = be32_to_cpu(write->sink_stag);
+	u64 sink_to = be64_to_cpu(write->sink_to);
+
+	if (frx->first_ddp_seg) {
+		srx->ddp_stag = sink_stag;
+		srx->ddp_to = sink_to;
+		frx->pbl_idx = 0;
+	} else {
+		if (unlikely(srx->ddp_stag != sink_stag)) {
+			pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
+				qp_id(rx_qp(srx)), sink_stag,
+				srx->ddp_stag);
+			ecode = DDP_ECODE_T_INVALID_STAG;
+			goto error;
+		}
+		if (unlikely(srx->ddp_to != sink_to)) {
+			pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
+				qp_id(rx_qp(srx)),
+				(unsigned long long)sink_to,
+				(unsigned long long)srx->ddp_to);
+			ecode = DDP_ECODE_T_BASE_BOUNDS;
+			goto error;
+		}
+	}
+	return 0;
+error:
+	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
+	return -EINVAL;
+}
+
+/*
+ * siw_send_check_ntoh()
+ *
+ * Check incoming SEND fragment header against expected
+ * header values and update expected MSN if no next
+ * fragment expected
+ *
+ * NOTE: This function must be called only if a SEND DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segment.
+ */
+static int siw_send_check_ntoh(struct siw_rx_stream *srx,
+			       struct siw_rx_fpdu *frx)
+{
+	struct iwarp_send_inv *send = &srx->hdr.send_inv;
+	struct siw_wqe *wqe = &frx->wqe_active;
+	enum ddp_ecode ecode;
+
+	u32 ddp_msn = be32_to_cpu(send->ddp_msn);
+	u32 ddp_mo = be32_to_cpu(send->ddp_mo);
+	u32 ddp_qn = be32_to_cpu(send->ddp_qn);
+
+	if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
+		pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
+			qp_id(rx_qp(srx)), ddp_qn);
+		ecode = DDP_ECODE_UT_INVALID_QN;
+		goto error;
+	}
+	if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
+		pr_warn("siw: [QP %u]: send msn: %u != %u\n",
+			qp_id(rx_qp(srx)), ddp_msn,
+			srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
+		ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
+		goto error;
+	}
+	if (unlikely(ddp_mo != wqe->processed)) {
+		pr_warn("siw: [QP %u], send mo: %u != %u\n",
+			qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
+		ecode = DDP_ECODE_UT_INVALID_MO;
+		goto error;
+	}
+	if (frx->first_ddp_seg) {
+		/* initialize user memory write position */
+		frx->sge_idx = 0;
+		frx->sge_off = 0;
+		frx->pbl_idx = 0;
+
+		/* only valid for SEND_INV and SEND_SE_INV operations */
+		srx->inval_stag = be32_to_cpu(send->inval_stag);
+	}
+	if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
+		siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
+			   wqe->bytes, wqe->processed, srx->fpdu_part_rem);
+		wqe->wc_status = SIW_WC_LOC_LEN_ERR;
+		ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
+		goto error;
+	}
+	return 0;
+error:
+	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+			   DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
+	return -EINVAL;
+}
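+
+/*
+ * Recap of the DDP untagged ordering rules enforced above (an
+ * illustrative sketch, not normative text): the MSN numbers complete
+ * SEND messages per untagged queue, while the MO counts bytes within
+ * the current message. A SEND split into two DDP segments thus
+ * arrives as
+ *
+ *	seg 1: ddp_msn = N, ddp_mo = 0
+ *	seg 2: ddp_msn = N, ddp_mo = <bytes of seg 1> (== wqe->processed)
+ *
+ * ddp_msn[RDMAP_UNTAGGED_QN_SEND] advances to N + 1 only at message
+ * completion, in siw_rdmap_complete().
+ */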
+
+static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
+{
+	struct siw_rqe *rqe;
+	struct siw_srq *srq;
+	struct siw_wqe *wqe = NULL;
+	bool srq_event = false;
+	unsigned long flags;
+
+	srq = qp->srq;
+	if (srq) {
+		spin_lock_irqsave(&srq->lock, flags);
+		if (unlikely(!srq->num_rqe))
+			goto out;
+
+		rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
+	} else {
+		if (unlikely(!qp->recvq))
+			goto out;
+
+		rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
+	}
+	if (likely(rqe->flags == SIW_WQE_VALID)) {
+		int num_sge = rqe->num_sge;
+
+		if (likely(num_sge <= SIW_MAX_SGE)) {
+			int i = 0;
+
+			wqe = rx_wqe(&qp->rx_untagged);
+			rx_type(wqe) = SIW_OP_RECEIVE;
+			wqe->wr_status = SIW_WR_INPROGRESS;
+			wqe->bytes = 0;
+			wqe->processed = 0;
+
+			wqe->rqe.id = rqe->id;
+			wqe->rqe.num_sge = num_sge;
+
+			while (i < num_sge) {
+				wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
+				wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
+				wqe->rqe.sge[i].length = rqe->sge[i].length;
+				wqe->bytes += wqe->rqe.sge[i].length;
+				wqe->mem[i] = NULL;
+				i++;
+			}
+			/* can be re-used by appl */
+			smp_store_mb(rqe->flags, 0);
+		} else {
+			siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
+			if (srq)
+				spin_unlock_irqrestore(&srq->lock, flags);
+			return NULL;
+		}
+		if (!srq) {
+			qp->rq_get++;
+		} else {
+			if (srq->armed) {
+				/* Test SRQ limit */
+				u32 off = (srq->rq_get + srq->limit) %
+					  srq->num_rqe;
+				struct siw_rqe *rqe2 = &srq->recvq[off];
+
+				if (!(rqe2->flags & SIW_WQE_VALID)) {
+					srq->armed = 0;
+					srq_event = true;
+				}
+			}
+			srq->rq_get++;
+		}
+	}
+out:
+	if (srq) {
+		spin_unlock_irqrestore(&srq->lock, flags);
+		if (srq_event)
+			siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
+	}
+	return wqe;
+}
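+
+/*
+ * Note on the ring protocol of siw_rqe_get(): it is the consumer side
+ * of a single-producer/single-consumer receive ring. The producer
+ * (the application posting a receive) must publish all RQE fields
+ * before setting SIW_WQE_VALID, mirroring the smp_store_mb() release
+ * of the slot above. A hypothetical producer side could look like
+ * (sketch only, names assumed):
+ *
+ *	rqe = &recvq[rq_put % num_rqe];
+ *	if (READ_ONCE(rqe->flags) & SIW_WQE_VALID)
+ *		return -ENOMEM;		// ring full
+ *	rqe->id = wr_id;		// fill id and sge[] first
+ *	rqe->num_sge = num_sge;
+ *	smp_store_mb(rqe->flags, SIW_WQE_VALID);	// publish last
+ *	rq_put++;
+ */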
+
+/*
+ * siw_proc_send:
+ *
+ * Process one incoming SEND and place data into memory referenced by
+ * receive wqe.
+ *
+ * Function supports partially received sends (suspending/resuming
+ * current receive wqe processing)
+ *
+ * return value:
+ *	0:       reached the end of a DDP segment
+ *	-EAGAIN: to be called again to finish the DDP segment
+ */
+int siw_proc_send(struct siw_qp *qp)
+{
+	struct siw_rx_stream *srx = &qp->rx_stream;
+	struct siw_rx_fpdu *frx = &qp->rx_untagged;
+	struct siw_wqe *wqe;
+	u32 data_bytes; /* all data bytes available */
+	u32 rcvd_bytes; /* sum of data bytes rcvd */
+	int rv = 0;
+
+	if (frx->first_ddp_seg) {
+		wqe = siw_rqe_get(qp);
+		if (unlikely(!wqe)) {
+			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+					   DDP_ETYPE_UNTAGGED_BUF,
+					   DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
+			return -ENOENT;
+		}
+	} else {
+		wqe = rx_wqe(frx);
+	}
+	if (srx->state == SIW_GET_DATA_START) {
+		rv = siw_send_check_ntoh(srx, frx);
+		if (unlikely(rv)) {
+			siw_qp_event(qp, IB_EVENT_QP_FATAL);
+			return rv;
+		}
+		if (!srx->fpdu_part_rem) /* zero length SEND */
+			return 0;
+	}
+	data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
+	rcvd_bytes = 0;
+
+	/* A zero length SEND will skip below loop */
+	while (data_bytes) {
+		struct ib_pd *pd;
+		struct siw_mem **mem, *mem_p;
+		struct siw_sge *sge;
+		u32 sge_bytes; /* data bytes avail for SGE */
+
+		sge = &wqe->rqe.sge[frx->sge_idx];
+
+		if (!sge->length) {
+			/* just skip empty sge's */
+			frx->sge_idx++;
+			frx->sge_off = 0;
+			frx->pbl_idx = 0;
+			continue;
+		}
+		sge_bytes = min(data_bytes, sge->length - frx->sge_off);
+		mem = &wqe->mem[frx->sge_idx];
+
+		/*
+		 * check with QP's PD if no SRQ present, SRQ's PD otherwise
+		 */
+		pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;
+
+		rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
+				   frx->sge_off, sge_bytes);
+		if (unlikely(rv)) {
+			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+					   DDP_ETYPE_CATASTROPHIC,
+					   DDP_ECODE_CATASTROPHIC, 0);
+
+			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
+			break;
+		}
+		mem_p = *mem;
+		if (mem_p->mem_obj == NULL)
+			rv = siw_rx_kva(srx,
+				(void *)(uintptr_t)(sge->laddr + frx->sge_off),
+				sge_bytes);
+		else if (!mem_p->is_pbl)
+			rv = siw_rx_umem(srx, mem_p->umem,
+					 sge->laddr + frx->sge_off, sge_bytes);
+		else
+			rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
+					sge->laddr + frx->sge_off, sge_bytes);
+
+		if (unlikely(rv != sge_bytes)) {
+			wqe->processed += rcvd_bytes;
+
+			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+					   DDP_ETYPE_CATASTROPHIC,
+					   DDP_ECODE_CATASTROPHIC, 0);
+			return -EINVAL;
+		}
+		frx->sge_off += rv;
+
+		if (frx->sge_off == sge->length) {
+			frx->sge_idx++;
+			frx->sge_off = 0;
+			frx->pbl_idx = 0;
+		}
+		data_bytes -= rv;
+		rcvd_bytes += rv;
+
+		srx->fpdu_part_rem -= rv;
+		srx->fpdu_part_rcvd += rv;
+	}
+	wqe->processed += rcvd_bytes;
+
+	if (!srx->fpdu_part_rem)
+		return 0;
+
+	return (rv < 0) ? rv : -EAGAIN;
+}
+
+/*
+ * siw_proc_write:
+ *
+ * Place incoming WRITE after referencing and checking target buffer
+ *
+ * Function supports partially received WRITEs (suspending/resuming
+ * current receive processing)
+ *
+ * return value:
+ *	0:       reached the end of a DDP segment
+ *	-EAGAIN: to be called again to finish the DDP segment
+ */
+int siw_proc_write(struct siw_qp *qp)
+{
+	struct siw_rx_stream *srx = &qp->rx_stream;
+	struct siw_rx_fpdu *frx = &qp->rx_tagged;
+	struct siw_mem *mem;
+	int bytes, rv;
+
+	if (srx->state == SIW_GET_DATA_START) {
+		if (!srx->fpdu_part_rem) /* zero length WRITE */
+			return 0;
+
+		rv = siw_write_check_ntoh(srx, frx);
+		if (unlikely(rv)) {
+			siw_qp_event(qp, IB_EVENT_QP_FATAL);
+			return rv;
+		}
+	}
+	bytes = min(srx->fpdu_part_rem, srx->skb_new);
+
+	if (frx->first_ddp_seg) {
+		struct siw_wqe *wqe = rx_wqe(frx);
+
+		rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
+		if (unlikely(!rx_mem(frx))) {
+			siw_dbg_qp(qp,
+				   "sink stag not found/invalid, stag 0x%08x\n",
+				   srx->ddp_stag);
+
+			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+					   DDP_ETYPE_TAGGED_BUF,
+					   DDP_ECODE_T_INVALID_STAG, 0);
+			return -EINVAL;
+		}
+		wqe->rqe.num_sge = 1;
+		rx_type(wqe) = SIW_OP_WRITE;
+		wqe->wr_status = SIW_WR_INPROGRESS;
+	}
+	mem = rx_mem(frx);
+
+	/*
+	 * Check if application re-registered memory with different
+	 * key field of STag.
+	 */
+	if (unlikely(mem->stag != srx->ddp_stag)) {
+		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+				   DDP_ETYPE_TAGGED_BUF,
+				   DDP_ECODE_T_INVALID_STAG, 0);
+		return -EINVAL;
+	}
+	rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd,
+			   IB_ACCESS_REMOTE_WRITE, bytes);
+	if (unlikely(rv)) {
+		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+				   DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
+				   0);
+
+		siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
+
+		return -EINVAL;
+	}
+
+	if (mem->mem_obj == NULL)
+		rv = siw_rx_kva(srx,
+			(void *)(uintptr_t)(srx->ddp_to + srx->fpdu_part_rcvd),
+			bytes);
+	else if (!mem->is_pbl)
+		rv = siw_rx_umem(srx, mem->umem,
+				 srx->ddp_to + srx->fpdu_part_rcvd, bytes);
+	else
+		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem,
+				srx->ddp_to + srx->fpdu_part_rcvd, bytes);
+
+	if (unlikely(rv != bytes)) {
+		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+				   DDP_ETYPE_CATASTROPHIC,
+				   DDP_ECODE_CATASTROPHIC, 0);
+		return -EINVAL;
+	}
+	srx->fpdu_part_rem -= rv;
+	srx->fpdu_part_rcvd += rv;
+
+	if (!srx->fpdu_part_rem) {
+		srx->ddp_to += srx->fpdu_part_rcvd;
+		return 0;
+	}
+	return -EAGAIN;
+}
+
+/*
+ * Inbound RREQ's cannot carry user data.
+ */
+int siw_proc_rreq(struct siw_qp *qp)
+{
+	struct siw_rx_stream *srx = &qp->rx_stream;
+
+	if (!srx->fpdu_part_rem)
+		return 0;
+
+	pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
+		be16_to_cpu(srx->hdr.ctrl.mpa_len));
+
+	return -EPROTO;
+}
+
+/*
+ * siw_init_rresp:
+ *
+ * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
+ * Put it at the tail of the IRQ, if there is another WQE currently in
+ * transmit processing. If not, make it the current WQE to be processed
+ * and schedule transmit processing.
+ *
+ * Can be called from softirq context and from process
+ * context (RREAD socket loopback case!)
+ *
+ * return value:
+ *	0:      success,
+ *		failure code otherwise
+ */
+
+static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
+{
+	struct siw_wqe *tx_work = tx_wqe(qp);
+	struct siw_sqe *resp;
+
+	uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
+		 laddr = be64_to_cpu(srx->hdr.rreq.source_to);
+	uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
+		 lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
+		 rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
+		 msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);
+
+	int run_sq = 1, rv = 0;
+	unsigned long flags;
+
+	if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
+		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+				   DDP_ETYPE_UNTAGGED_BUF,
+				   DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
+		return -EPROTO;
+	}
+	spin_lock_irqsave(&qp->sq_lock, flags);
+
+	if (tx_work->wr_status == SIW_WR_IDLE) {
+		/*
+		 * immediately schedule READ response w/o
+		 * consuming IRQ entry: IRQ must be empty.
+		 */
+		tx_work->processed = 0;
+		tx_work->mem[0] = NULL;
+		tx_work->wr_status = SIW_WR_QUEUED;
+		resp = &tx_work->sqe;
+	} else {
+		resp = irq_alloc_free(qp);
+		run_sq = 0;
+	}
+	if (likely(resp)) {
+		resp->opcode = SIW_OP_READ_RESPONSE;
+
+		resp->sge[0].length = length;
+		resp->sge[0].laddr = laddr;
+		resp->sge[0].lkey = lkey;
+
+		/* Keep aside message sequence number for potential
+		 * error reporting during Read Response generation.
+		 */
+		resp->sge[1].length = msn;
+
+		resp->raddr = raddr;
+		resp->rkey = rkey;
+		resp->num_sge = length ? 1 : 0;
+
+		/* RRESP now valid as current TX wqe or placed into IRQ */
+		smp_store_mb(resp->flags, SIW_WQE_VALID);
+	} else {
+		pr_warn("siw: [QP %u]: irq %d exceeded %d\n", qp_id(qp),
+			qp->irq_put % qp->attrs.irq_size, qp->attrs.irq_size);
+
+		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+				   RDMAP_ETYPE_REMOTE_OPERATION,
+				   RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
+		rv = -EPROTO;
+	}
+
+	spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+	if (run_sq)
+		rv = siw_sq_start(qp);
+
+	return rv;
+}
+
+/*
+ * Only called at start of Read.Response processing.
+ * Transfer pending Read from tip of ORQ into current rx wqe,
+ * but keep the ORQ entry valid until Read.Response processing is done.
+ * No queue locking needed.
+ */
+static int siw_orqe_start_rx(struct siw_qp *qp)
+{
+	struct siw_sqe *orqe;
+	struct siw_wqe *wqe = NULL;
+
+	/* make sure ORQ indices are current */
+	smp_mb();
+
+	orqe = orq_get_current(qp);
+	if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) {
+		/* RRESP is a TAGGED RDMAP operation */
+		wqe = rx_wqe(&qp->rx_tagged);
+		wqe->sqe.id = orqe->id;
+		wqe->sqe.opcode = orqe->opcode;
+		wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
+		wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
+		wqe->sqe.sge[0].length = orqe->sge[0].length;
+		wqe->sqe.flags = orqe->flags;
+		wqe->sqe.num_sge = 1;
+		wqe->bytes = orqe->sge[0].length;
+		wqe->processed = 0;
+		wqe->mem[0] = NULL;
+		/* make sure WQE is completely written before valid */
+		smp_wmb();
+		wqe->wr_status = SIW_WR_INPROGRESS;
+
+		return 0;
+	}
+	return -EPROTO;
+}
+
+/*
+ * siw_proc_rresp:
+ *
+ * Place incoming RRESP data into memory referenced by RREQ WQE
+ * which is at the tip of the ORQ
+ *
+ * Function supports partially received RRESP's (suspending/resuming
+ * current receive processing)
+ */
+int siw_proc_rresp(struct siw_qp *qp)
+{
+	struct siw_rx_stream *srx = &qp->rx_stream;
+	struct siw_rx_fpdu *frx = &qp->rx_tagged;
+	struct siw_wqe *wqe = rx_wqe(frx);
+	struct siw_mem **mem, *mem_p;
+	struct siw_sge *sge;
+	int bytes, rv;
+
+	if (frx->first_ddp_seg) {
+		if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
+			pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
+				qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
+			rv = -EPROTO;
+			goto error_term;
+		}
+		/*
+		 * fetch pending RREQ from orq
+		 */
+		rv = siw_orqe_start_rx(qp);
+		if (rv) {
+			pr_warn("siw: [QP %u]: ORQ empty at idx %d\n",
+				qp_id(qp), qp->orq_get % qp->attrs.orq_size);
+			goto error_term;
+		}
+		rv = siw_rresp_check_ntoh(srx, frx);
+		if (unlikely(rv)) {
+			siw_qp_event(qp, IB_EVENT_QP_FATAL);
+			return rv;
+		}
+	} else {
+		if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
+			pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
+				qp_id(qp), wqe->wr_status);
+			rv = -EPROTO;
+			goto error_term;
+		}
+	}
+	if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
+		return 0;
+
+	sge = wqe->sqe.sge; /* there is only one */
+	mem = &wqe->mem[0];
+
+	if (!(*mem)) {
+		/*
+		 * check target memory which resolves memory on first fragment
+		 */
+		rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
+				   wqe->bytes);
+		if (unlikely(rv)) {
+			siw_dbg_qp(qp, "target mem check: %d\n", rv);
+			wqe->wc_status = SIW_WC_LOC_PROT_ERR;
+
+			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+					   DDP_ETYPE_TAGGED_BUF,
+					   siw_tagged_error(-rv), 0);
+
+			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
+
+			return -EINVAL;
+		}
+	}
+	mem_p = *mem;
+
+	bytes = min(srx->fpdu_part_rem, srx->skb_new);
+
+	if (mem_p->mem_obj == NULL)
+		rv = siw_rx_kva(srx,
+			(void *)(uintptr_t)(sge->laddr + wqe->processed),
+			bytes);
+	else if (!mem_p->is_pbl)
+		rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed,
+				 bytes);
+	else
+		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
+				sge->laddr + wqe->processed, bytes);
+	if (rv != bytes) {
+		wqe->wc_status = SIW_WC_GENERAL_ERR;
+		rv = -EINVAL;
+		goto error_term;
+	}
+	srx->fpdu_part_rem -= rv;
+	srx->fpdu_part_rcvd += rv;
+	wqe->processed += rv;
+
+	if (!srx->fpdu_part_rem) {
+		srx->ddp_to += srx->fpdu_part_rcvd;
+		return 0;
+	}
+	return -EAGAIN;
+
+error_term:
+	siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
+			   DDP_ECODE_CATASTROPHIC, 0);
+	return rv;
+}
+
+int siw_proc_terminate(struct siw_qp *qp)
+{
+	struct siw_rx_stream *srx = &qp->rx_stream;
+	struct sk_buff *skb = srx->skb;
+	struct iwarp_terminate *term = &srx->hdr.terminate;
+	union iwarp_hdr term_info;
+	u8 *infop = (u8 *)&term_info;
+	enum rdma_opcode op;
+	u16 to_copy = sizeof(struct iwarp_ctrl);
+
+	pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
+		__rdmap_term_layer(term), __rdmap_term_etype(term),
+		__rdmap_term_ecode(term));
+
+	if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
+	    be32_to_cpu(term->ddp_msn) !=
+		    qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
+	    be32_to_cpu(term->ddp_mo) != 0) {
+		pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
+			be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
+			be32_to_cpu(term->ddp_mo));
+		return -ECONNRESET;
+	}
+	/*
+	 * Receive remaining pieces of TERM if indicated
+	 */
+	if (!term->flag_m)
+		return -ECONNRESET;
+
+	/* Do not take the effort to reassemble a network fragmented
+	 * TERM message
+	 */
+	if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
+		return -ECONNRESET;
+
+	memset(infop, 0, sizeof(term_info));
+
+	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
+
+	op = __rdmap_get_opcode(&term_info.ctrl);
+	if (op >= RDMAP_TERMINATE)
+		goto out;
+
+	infop += to_copy;
+	srx->skb_offset += to_copy;
+	srx->skb_new -= to_copy;
+	srx->skb_copied += to_copy;
+	srx->fpdu_part_rcvd += to_copy;
+	srx->fpdu_part_rem -= to_copy;
+
+	to_copy = iwarp_pktinfo[op].hdr_len - to_copy;
+
+	/* Again, no network fragmented TERM's */
+	if (to_copy + MPA_CRC_SIZE > srx->skb_new)
+		return -ECONNRESET;
+
+	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
+
+	if (term->flag_r) {
+		siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
+			   op, be16_to_cpu(term_info.ctrl.mpa_len),
+			   term->flag_m ? "valid" : "invalid");
+	} else if (term->flag_d) {
+		siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
+			   op, be16_to_cpu(term_info.ctrl.mpa_len),
+			   term->flag_m ? "valid" : "invalid");
+	}
+out:
+	srx->skb_new -= to_copy;
+	srx->skb_offset += to_copy;
+	srx->skb_copied += to_copy;
+	srx->fpdu_part_rcvd += to_copy;
+	srx->fpdu_part_rem -= to_copy;
+
+	return -ECONNRESET;
+}
+
+static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
+{
+	struct sk_buff *skb = srx->skb;
+	u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad;
+	__wsum crc_in, crc_own = 0;
+
+	siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
+		   srx->fpdu_part_rem, srx->skb_new, srx->pad);
+
+	if (srx->skb_new < srx->fpdu_part_rem)
+		return -EAGAIN;
+
+	skb_copy_bits(skb, srx->skb_offset, tbuf, srx->fpdu_part_rem);
+
+	if (srx->mpa_crc_hd && srx->pad)
+		crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad);
+
+	srx->skb_new -= srx->fpdu_part_rem;
+	srx->skb_offset += srx->fpdu_part_rem;
+	srx->skb_copied += srx->fpdu_part_rem;
+
+	if (!srx->mpa_crc_hd)
+		return 0;
+
+	/*
+	 * CRC32 is computed, transmitted and received directly in NBO,
+	 * so there's never a reason to convert byte order.
+	 */
+	crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
+	crc_in = (__force __wsum)srx->trailer.crc;
+
+	if (unlikely(crc_in != crc_own)) {
+		pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
+			crc_in, crc_own, qp->rx_stream.rdmap_op);
+
+		siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
+				   LLP_ETYPE_MPA,
+				   LLP_ECODE_RECEIVED_CRC, 0);
+		return -EINVAL;
+	}
+	return 0;
+}
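+
+/*
+ * Worked example of the MPA FPDU framing consumed above (illustrative;
+ * assumes MPA_HDR_SIZE == 2 and MPA_CRC_SIZE == 4):
+ *
+ *	| mpa_len (2) | DDP/RDMAP hdr + payload (mpa_len bytes) | pad | CRC |
+ *
+ * mpa_len counts everything after the 2-byte length field up to, but
+ * not including, pad and CRC. The pad brings the FPDU to a 4-byte
+ * multiple: pad = -(mpa_len + MPA_HDR_SIZE) & 0x3. E.g., a 20-byte
+ * untagged SEND header carrying 5 payload bytes gives mpa_len = 23,
+ * pad = 3, and a 32-byte FPDU on the wire including the CRC.
+ */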
+
+#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)
+
+static int siw_get_hdr(struct siw_rx_stream *srx)
+{
+	struct sk_buff *skb = srx->skb;
+	struct siw_qp *qp = rx_qp(srx);
+	struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
+	struct siw_rx_fpdu *frx;
+	u8 opcode;
+	int bytes;
+
+	if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
+		/*
+		 * copy a minimum-sized (tagged) DDP frame control part
+		 */
+		bytes = min_t(int, srx->skb_new,
+			      MIN_DDP_HDR - srx->fpdu_part_rcvd);
+
+		skb_copy_bits(skb, srx->skb_offset,
+			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
+
+		srx->fpdu_part_rcvd += bytes;
+
+		srx->skb_new -= bytes;
+		srx->skb_offset += bytes;
+		srx->skb_copied += bytes;
+
+		if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
+			return -EAGAIN;
+
+		if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
+			enum ddp_etype etype;
+			enum ddp_ecode ecode;
+
+			pr_warn("siw: received ddp version unsupported %d\n",
+				__ddp_get_version(c_hdr));
+
+			if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
+				etype = DDP_ETYPE_TAGGED_BUF;
+				ecode = DDP_ECODE_T_VERSION;
+			} else {
+				etype = DDP_ETYPE_UNTAGGED_BUF;
+				ecode = DDP_ECODE_UT_VERSION;
+			}
+			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+					   etype, ecode, 0);
+			return -EINVAL;
+		}
+		if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
+			pr_warn("siw: received rdmap version unsupported %d\n",
+				__rdmap_get_version(c_hdr));
+
+			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
+					   RDMAP_ETYPE_REMOTE_OPERATION,
+					   RDMAP_ECODE_VERSION, 0);
+			return -EINVAL;
+		}
+		opcode = __rdmap_get_opcode(c_hdr);
+
+		if (opcode > RDMAP_TERMINATE) {
+			pr_warn("siw: received unknown packet type %u\n",
+				opcode);
+
+			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
+					   RDMAP_ETYPE_REMOTE_OPERATION,
+					   RDMAP_ECODE_OPCODE, 0);
+			return -EINVAL;
+		}
+		siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode);
+	} else {
+		opcode = __rdmap_get_opcode(c_hdr);
+	}
+	set_rx_fpdu_context(qp, opcode);
+	frx = qp->rx_fpdu;
+
+	/*
+	 * Figure out the length of the current header: the variable
+	 * length of an iWARP header may force us to copy the header
+	 * information in two steps. Only tagged DDP messages have
+	 * already been completely received at this point.
+	 */
+	if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
+		bytes = iwarp_pktinfo[opcode].hdr_len - MIN_DDP_HDR;
+
+		if (srx->skb_new < bytes)
+			return -EAGAIN;
+
+		skb_copy_bits(skb, srx->skb_offset,
+			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
+
+		srx->fpdu_part_rcvd += bytes;
+
+		srx->skb_new -= bytes;
+		srx->skb_offset += bytes;
+		srx->skb_copied += bytes;
+	}
+
+	/*
+	 * DDP/RDMAP header receive completed. Check if the current
+	 * DDP segment starts a new RDMAP message or continues a previously
+	 * started RDMAP message.
+	 *
+	 * Alternating reception of DDP segments (or FPDUs) from incomplete
+	 * tagged and untagged RDMAP messages is supported, as long as
+	 * the current tagged or untagged message gets eventually completed
+	 * w/o intersection from another message of the same type
+	 * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
+	 * but not by a READ RESPONSE etc.
+	 */
+	if (srx->mpa_crc_hd) {
+		/*
+		 * Restart CRC computation
+		 */
+		crypto_shash_init(srx->mpa_crc_hd);
+		crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
+				    srx->fpdu_part_rcvd);
+	}
+	if (frx->more_ddp_segs) {
+		frx->first_ddp_seg = 0;
+		if (frx->prev_rdmap_op != opcode) {
+			pr_warn("siw: packet intersection: %u : %u\n",
+				frx->prev_rdmap_op, opcode);
+			/*
+			 * The last inbound RDMA operation of same type
+			 * (tagged or untagged) is left unfinished.
+			 * To complete it in error, make it the current
+			 * operation again, even with the header already
+			 * overwritten. For error handling, only the opcode
+			 * and current rx context are relevant.
+			 */
+			set_rx_fpdu_context(qp, frx->prev_rdmap_op);
+			__rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);
+			return -EPROTO;
+		}
+	} else {
+		frx->prev_rdmap_op = opcode;
+		frx->first_ddp_seg = 1;
+	}
+	frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;
+
+	return 0;
+}
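+
+/*
+ * Illustration of the DDP segment interleaving permitted by the checks
+ * above (a sketch). Tagged (WRITE, READ.response) and untagged (SEND*)
+ * messages keep separate rx contexts, so this sequence is legal:
+ *
+ *	WRITE seg 1 | SEND seg 1 | WRITE seg 2 (last) | SEND seg 2 (last)
+ *
+ * while two unfinished messages of the same class collide and take the
+ * -EPROTO "packet intersection" path:
+ *
+ *	WRITE A seg 1 | WRITE B seg 1	-> protocol error
+ */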
+
+static int siw_check_tx_fence(struct siw_qp *qp)
+{
+	struct siw_wqe *tx_waiting = tx_wqe(qp);
+	struct siw_sqe *rreq;
+	int resume_tx = 0, rv = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->orq_lock, flags);
+
+	rreq = orq_get_current(qp);
+
+	/* free current orq entry */
+	WRITE_ONCE(rreq->flags, 0);
+
+	if (qp->tx_ctx.orq_fence) {
+		if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
+			pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
+				qp_id(qp), tx_waiting->wr_status);
+			rv = -EPROTO;
+			goto out;
+		}
+		/* resume SQ processing */
+		if (tx_waiting->sqe.opcode == SIW_OP_READ ||
+		    tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
+			rreq = orq_get_tail(qp);
+			if (unlikely(!rreq)) {
+				pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
+				rv = -EPROTO;
+				goto out;
+			}
+			siw_read_to_orq(rreq, &tx_waiting->sqe);
+
+			qp->orq_put++;
+			qp->tx_ctx.orq_fence = 0;
+			resume_tx = 1;
+
+		} else if (siw_orq_empty(qp)) {
+			qp->tx_ctx.orq_fence = 0;
+			resume_tx = 1;
+		} else {
+			pr_warn("siw: [QP %u]: fence resume: orq idx: %d:%d\n",
+				qp_id(qp), qp->orq_get, qp->orq_put);
+			rv = -EPROTO;
+		}
+	}
+	qp->orq_get++;
+out:
+	spin_unlock_irqrestore(&qp->orq_lock, flags);
+
+	if (resume_tx)
+		rv = siw_sq_start(qp);
+
+	return rv;
+}
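+
+/*
+ * Note (assumption, the fence is armed outside this file): orq_fence is
+ * presumably set by the SQ activation path when a new READ finds the
+ * ORQ full. siw_check_tx_fence() then handles the two resume cases once
+ * a Read.Response retires its ORQ entry: move a fenced READ into the
+ * freed ORQ slot, or resume a fenced non-READ once the ORQ has drained
+ * empty.
+ */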
+
+/*
+ * siw_rdmap_complete()
+ *
+ * Complete processing of an RDMA message after receiving all
+ * DDP segments, or abort processing after encountering an error case.
+ *
+ *   o SENDs + RRESPs need completion processing,
+ *   o RREQs need READ RESPONSE initialization,
+ *   o WRITEs need memory dereferencing.
+ *
+ * TODO: Failed WRITEs need local error to be surfaced.
+ */
+static int siw_rdmap_complete(struct siw_qp *qp, int error)
+{
+	struct siw_rx_stream *srx = &qp->rx_stream;
+	struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
+	enum siw_wc_status wc_status = wqe->wc_status;
+	u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
+	int rv = 0;
+
+	switch (opcode) {
+	case RDMAP_SEND_SE:
+	case RDMAP_SEND_SE_INVAL:
+		wqe->rqe.flags |= SIW_WQE_SOLICITED;
+		/* Fall through */
+
+	case RDMAP_SEND:
+	case RDMAP_SEND_INVAL:
+		if (wqe->wr_status == SIW_WR_IDLE)
+			break;
+
+		srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
+
+		if (error != 0 && wc_status == SIW_WC_SUCCESS)
+			wc_status = SIW_WC_GENERAL_ERR;
+		/*
+		 * Handle STag invalidation request
+		 */
+		if (wc_status == SIW_WC_SUCCESS &&
+		    (opcode == RDMAP_SEND_INVAL ||
+		     opcode == RDMAP_SEND_SE_INVAL)) {
+			rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
+			if (rv) {
+				siw_init_terminate(
+					qp, TERM_ERROR_LAYER_RDMAP,
+					rv == -EACCES ?
+						RDMAP_ETYPE_REMOTE_PROTECTION :
+						RDMAP_ETYPE_REMOTE_OPERATION,
+					RDMAP_ECODE_CANNOT_INVALIDATE, 0);
+
+				wc_status = SIW_WC_REM_INV_REQ_ERR;
+			}
+			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
+					      rv ? 0 : srx->inval_stag,
+					      wc_status);
+		} else {
+			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
+					      0, wc_status);
+		}
+		siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
+		break;
+
+	case RDMAP_RDMA_READ_RESP:
+		if (wqe->wr_status == SIW_WR_IDLE)
+			break;
+
+		if (error != 0) {
+			if ((srx->state == SIW_GET_HDR &&
+			     qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
+				/* possible RREQ in ORQ left untouched */
+				break;
+
+			if (wc_status == SIW_WC_SUCCESS)
+				wc_status = SIW_WC_GENERAL_ERR;
+		} else if (qp->kernel_verbs &&
+			   rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
+			/*
+			 * Handle any STag invalidation request
+			 */
+			rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
+			if (rv) {
+				siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+						   RDMAP_ETYPE_CATASTROPHIC,
+						   RDMAP_ECODE_UNSPECIFIED, 0);
+
+				if (wc_status == SIW_WC_SUCCESS) {
+					wc_status = SIW_WC_GENERAL_ERR;
+					error = rv;
+				}
+			}
+		}
+		/*
+		 * All errors turn the wqe into signalled.
+		 */
+		if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
+			rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
+					      wc_status);
+		siw_wqe_put_mem(wqe, SIW_OP_READ);
+
+		if (!error)
+			rv = siw_check_tx_fence(qp);
+		else
+			/* Disable current ORQ element */
+			WRITE_ONCE(orq_get_current(qp)->flags, 0);
+		break;
+
+	case RDMAP_RDMA_READ_REQ:
+		if (!error) {
+			rv = siw_init_rresp(qp, srx);
+			srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
+		}
+		break;
+
+	case RDMAP_RDMA_WRITE:
+		if (wqe->wr_status == SIW_WR_IDLE)
+			break;
+
+		/*
+		 * Free References from memory object if
+		 * attached to receive context (inbound WRITE).
+		 * While a zero-length WRITE is allowed,
+		 * no memory reference got created.
+		 */
+		if (rx_mem(&qp->rx_tagged)) {
+			siw_mem_put(rx_mem(&qp->rx_tagged));
+			rx_mem(&qp->rx_tagged) = NULL;
+		}
+		break;
+
+	default:
+		break;
+	}
+	wqe->wr_status = SIW_WR_IDLE;
+
+	return rv;
+}
+
+/*
+ * siw_tcp_rx_data()
+ *
+ * Main routine to consume inbound TCP payload
+ *
+ * @rd_desc:	read descriptor
+ * @skb:	socket buffer
+ * @off:	offset in skb
+ * @len:	skb->len - offset : payload in skb
+ */
+int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
+		    unsigned int off, size_t len)
+{
+	struct siw_qp *qp = rd_desc->arg.data;
+	struct siw_rx_stream *srx = &qp->rx_stream;
+	int rv;
+
+	srx->skb = skb;
+	srx->skb_new = skb->len - off;
+	srx->skb_offset = off;
+	srx->skb_copied = 0;
+
+	siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);
+
+	while (srx->skb_new) {
+		int run_completion = 1;
+
+		if (unlikely(srx->rx_suspend)) {
+			/* Do not process any more data */
+			srx->skb_copied += srx->skb_new;
+			break;
+		}
+		switch (srx->state) {
+		case SIW_GET_HDR:
+			rv = siw_get_hdr(srx);
+			if (!rv) {
+				srx->fpdu_part_rem =
+					be16_to_cpu(srx->hdr.ctrl.mpa_len) -
+					srx->fpdu_part_rcvd + MPA_HDR_SIZE;
+
+				if (srx->fpdu_part_rem)
+					srx->pad = -srx->fpdu_part_rem & 0x3;
+				else
+					srx->pad = 0;
+
+				srx->state = SIW_GET_DATA_START;
+				srx->fpdu_part_rcvd = 0;
+			}
+			break;
+
+		case SIW_GET_DATA_MORE:
+			/*
+			 * Another data fragment of the same DDP segment.
+			 * Setting first_ddp_seg = 0 avoids repeating
+			 * initializations that shall occur only once per
+			 * DDP segment.
+			 */
+			qp->rx_fpdu->first_ddp_seg = 0;
+			/* Fall through */
+
+		case SIW_GET_DATA_START:
+			/*
+			 * Headers will be checked by the opcode-specific
+			 * data receive function below.
+			 */
+			rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
+			if (!rv) {
+				int mpa_len =
+					be16_to_cpu(srx->hdr.ctrl.mpa_len)
+					+ MPA_HDR_SIZE;
+
+				srx->fpdu_part_rem = (-mpa_len & 0x3)
+						      + MPA_CRC_SIZE;
+				srx->fpdu_part_rcvd = 0;
+				srx->state = SIW_GET_TRAILER;
+			} else {
+				if (unlikely(rv == -ECONNRESET))
+					run_completion = 0;
+				else
+					srx->state = SIW_GET_DATA_MORE;
+			}
+			break;
+
+		case SIW_GET_TRAILER:
+			/*
+			 * read CRC + any padding
+			 */
+			rv = siw_get_trailer(qp, srx);
+			if (likely(!rv)) {
+				/*
+				 * FPDU completed.
+				 * complete RDMAP message if last fragment
+				 */
+				srx->state = SIW_GET_HDR;
+				srx->fpdu_part_rcvd = 0;
+
+				if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
+				      DDP_FLAG_LAST))
+					/* more frags */
+					break;
+
+				rv = siw_rdmap_complete(qp, 0);
+				run_completion = 0;
+			}
+			break;
+
+		default:
+			pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
+			rv = -EPROTO;
+			run_completion = 0;
+		}
+		if (unlikely(rv != 0 && rv != -EAGAIN)) {
+			if ((srx->state > SIW_GET_HDR ||
+			     qp->rx_fpdu->more_ddp_segs) && run_completion)
+				siw_rdmap_complete(qp, rv);
+
+			siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
+				   srx->state);
+
+			siw_qp_cm_drop(qp, 1);
+
+			break;
+		}
+		if (rv) {
+			siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
+				   srx->state, srx->fpdu_part_rem);
+			break;
+		}
+	}
+	return srx->skb_copied;
+}
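+
+/*
+ * Sketch of the rx state machine driven by siw_tcp_rx_data() above:
+ *
+ *	SIW_GET_HDR -> SIW_GET_DATA_START -> [SIW_GET_DATA_MORE ...]
+ *		    -> SIW_GET_TRAILER -> SIW_GET_HDR
+ *
+ * A short read (-EAGAIN) suspends the machine until more TCP payload
+ * arrives, with partial data reception parked in SIW_GET_DATA_MORE.
+ * Completing the trailer of a DDP_FLAG_LAST segment triggers
+ * siw_rdmap_complete().
+ */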
diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c
new file mode 100644
index 0000000..5d97bba
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_qp_tx.c
@@ -0,0 +1,1262 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <net/tcp.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+
+#define MAX_HDR_INLINE					\
+	(((uint32_t)(sizeof(struct siw_rreq_pkt) -	\
+		     sizeof(struct iwarp_send))) & 0xF8)
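+
+/*
+ * Illustrative arithmetic only (actual struct sizes depend on iwarp.h):
+ * siw_rreq_pkt is the largest short-packet buffer, so MAX_HDR_INLINE is
+ * the payload room left behind a SEND header, rounded down to a
+ * multiple of 8 by the 0xF8 mask. Supposing sizeof(struct siw_rreq_pkt)
+ * == 52 and sizeof(struct iwarp_send) == 20, this would give 32 inline
+ * bytes.
+ */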
+
+static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx)
+{
+	struct siw_pbl *pbl = mem->pbl;
+	u64 offset = addr - mem->va;
+	dma_addr_t paddr = siw_pbl_get_buffer(pbl, offset, NULL, idx);
+
+	if (paddr)
+		return virt_to_page(paddr);
+
+	return NULL;
+}
+
+/*
+ * Copy short payload at provided destination payload address
+ */
+static int siw_try_1seg(struct siw_iwarp_tx *c_tx, void *paddr)
+{
+	struct siw_wqe *wqe = &c_tx->wqe_active;
+	struct siw_sge *sge = &wqe->sqe.sge[0];
+	u32 bytes = sge->length;
+
+	if (bytes > MAX_HDR_INLINE || wqe->sqe.num_sge != 1)
+		return MAX_HDR_INLINE + 1;
+
+	if (!bytes)
+		return 0;
+
+	if (tx_flags(wqe) & SIW_WQE_INLINE) {
+		memcpy(paddr, &wqe->sqe.sge[1], bytes);
+	} else {
+		struct siw_mem *mem = wqe->mem[0];
+
+		if (!mem->mem_obj) {
+			/* Kernel client using kva */
+			memcpy(paddr,
+			       (const void *)(uintptr_t)sge->laddr, bytes);
+		} else if (c_tx->in_syscall) {
+			if (copy_from_user(paddr, u64_to_user_ptr(sge->laddr),
+					   bytes))
+				return -EFAULT;
+		} else {
+			unsigned int off = sge->laddr & ~PAGE_MASK;
+			struct page *p;
+			char *buffer;
+			int pbl_idx = 0;
+
+			if (!mem->is_pbl)
+				p = siw_get_upage(mem->umem, sge->laddr);
+			else
+				p = siw_get_pblpage(mem, sge->laddr, &pbl_idx);
+
+			if (unlikely(!p))
+				return -EFAULT;
+
+			buffer = kmap(p);
+
+			if (likely(PAGE_SIZE - off >= bytes)) {
+				memcpy(paddr, buffer + off, bytes);
+			} else {
+				unsigned long part = bytes - (PAGE_SIZE - off);
+
+				memcpy(paddr, buffer + off, part);
+				kunmap(p);
+
+				if (!mem->is_pbl)
+					p = siw_get_upage(mem->umem,
+							  sge->laddr + part);
+				else
+					p = siw_get_pblpage(mem,
+							    sge->laddr + part,
+							    &pbl_idx);
+				if (unlikely(!p))
+					return -EFAULT;
+
+				buffer = kmap(p);
+				memcpy(paddr + part, buffer, bytes - part);
+			}
+			kunmap(p);
+		}
+	}
+	return (int)bytes;
+}
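+
+/*
+ * Return convention of siw_try_1seg(), as relied upon by
+ * siw_qp_prepare_tx() below: 0..MAX_HDR_INLINE is the number of bytes
+ * copied behind the header, MAX_HDR_INLINE + 1 flags a payload not
+ * eligible for inlining (too long or more than one SGE), and a negative
+ * errno reports a page lookup/copy failure.
+ */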
+
+#define PKT_FRAGMENTED 1
+#define PKT_COMPLETE 0
+
+/*
+ * siw_qp_prepare_tx()
+ *
+ * Prepare tx state for sending out one fpdu. Builds complete pkt
+ * if no user data or only immediate data are present.
+ *
+ * returns PKT_COMPLETE if complete pkt built, PKT_FRAGMENTED otherwise.
+ */
+static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx)
+{
+	struct siw_wqe *wqe = &c_tx->wqe_active;
+	char *crc = NULL;
+	int data = 0;
+
+	switch (tx_type(wqe)) {
+	case SIW_OP_READ:
+	case SIW_OP_READ_LOCAL_INV:
+		memcpy(&c_tx->pkt.ctrl,
+		       &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
+		       sizeof(struct iwarp_ctrl));
+
+		c_tx->pkt.rreq.rsvd = 0;
+		c_tx->pkt.rreq.ddp_qn = htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
+		c_tx->pkt.rreq.ddp_msn =
+			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]);
+		c_tx->pkt.rreq.ddp_mo = 0;
+		c_tx->pkt.rreq.sink_stag = htonl(wqe->sqe.sge[0].lkey);
+		c_tx->pkt.rreq.sink_to =
+			cpu_to_be64(wqe->sqe.sge[0].laddr);
+		c_tx->pkt.rreq.source_stag = htonl(wqe->sqe.rkey);
+		c_tx->pkt.rreq.source_to = cpu_to_be64(wqe->sqe.raddr);
+		c_tx->pkt.rreq.read_size = htonl(wqe->sqe.sge[0].length);
+
+		c_tx->ctrl_len = sizeof(struct iwarp_rdma_rreq);
+		crc = (char *)&c_tx->pkt.rreq_pkt.crc;
+		break;
+
+	case SIW_OP_SEND:
+		if (tx_flags(wqe) & SIW_WQE_SOLICITED)
+			memcpy(&c_tx->pkt.ctrl,
+			       &iwarp_pktinfo[RDMAP_SEND_SE].ctrl,
+			       sizeof(struct iwarp_ctrl));
+		else
+			memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_SEND].ctrl,
+			       sizeof(struct iwarp_ctrl));
+
+		c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND;
+		c_tx->pkt.send.ddp_msn =
+			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
+		c_tx->pkt.send.ddp_mo = 0;
+
+		c_tx->pkt.send_inv.inval_stag = 0;
+
+		c_tx->ctrl_len = sizeof(struct iwarp_send);
+
+		crc = (char *)&c_tx->pkt.send_pkt.crc;
+		data = siw_try_1seg(c_tx, crc);
+		break;
+
+	case SIW_OP_SEND_REMOTE_INV:
+		if (tx_flags(wqe) & SIW_WQE_SOLICITED)
+			memcpy(&c_tx->pkt.ctrl,
+			       &iwarp_pktinfo[RDMAP_SEND_SE_INVAL].ctrl,
+			       sizeof(struct iwarp_ctrl));
+		else
+			memcpy(&c_tx->pkt.ctrl,
+			       &iwarp_pktinfo[RDMAP_SEND_INVAL].ctrl,
+			       sizeof(struct iwarp_ctrl));
+
+		c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND;
+		c_tx->pkt.send.ddp_msn =
+			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
+		c_tx->pkt.send.ddp_mo = 0;
+
+		c_tx->pkt.send_inv.inval_stag = cpu_to_be32(wqe->sqe.rkey);
+
+		c_tx->ctrl_len = sizeof(struct iwarp_send_inv);
+
+		crc = (char *)&c_tx->pkt.send_pkt.crc;
+		data = siw_try_1seg(c_tx, crc);
+		break;
+
+	case SIW_OP_WRITE:
+		memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_RDMA_WRITE].ctrl,
+		       sizeof(struct iwarp_ctrl));
+
+		c_tx->pkt.rwrite.sink_stag = htonl(wqe->sqe.rkey);
+		c_tx->pkt.rwrite.sink_to = cpu_to_be64(wqe->sqe.raddr);
+		c_tx->ctrl_len = sizeof(struct iwarp_rdma_write);
+
+		crc = (char *)&c_tx->pkt.write_pkt.crc;
+		data = siw_try_1seg(c_tx, crc);
+		break;
+
+	case SIW_OP_READ_RESPONSE:
+		memcpy(&c_tx->pkt.ctrl,
+		       &iwarp_pktinfo[RDMAP_RDMA_READ_RESP].ctrl,
+		       sizeof(struct iwarp_ctrl));
+
+		/* NBO */
+		c_tx->pkt.rresp.sink_stag = cpu_to_be32(wqe->sqe.rkey);
+		c_tx->pkt.rresp.sink_to = cpu_to_be64(wqe->sqe.raddr);
+
+		c_tx->ctrl_len = sizeof(struct iwarp_rdma_rresp);
+
+		crc = (char *)&c_tx->pkt.write_pkt.crc;
+		data = siw_try_1seg(c_tx, crc);
+		break;
+
+	default:
+		siw_dbg_qp(tx_qp(c_tx), "stale wqe type %d\n", tx_type(wqe));
+		return -EOPNOTSUPP;
+	}
+	if (unlikely(data < 0))
+		return data;
+
+	c_tx->ctrl_sent = 0;
+
+	if (data <= MAX_HDR_INLINE) {
+		if (data) {
+			wqe->processed = data;
+
+			c_tx->pkt.ctrl.mpa_len =
+				htons(c_tx->ctrl_len + data - MPA_HDR_SIZE);
+
+			/* Add pad, if needed */
+			data += -(int)data & 0x3;
+			/* advance CRC location after payload */
+			crc += data;
+			c_tx->ctrl_len += data;
+
+			if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED))
+				c_tx->pkt.c_untagged.ddp_mo = 0;
+			else
+				c_tx->pkt.c_tagged.ddp_to =
+					cpu_to_be64(wqe->sqe.raddr);
+		}
+
+		*(u32 *)crc = 0;
+		/*
+		 * Do complete CRC if enabled and short packet
+		 */
+		if (c_tx->mpa_crc_hd) {
+			crypto_shash_init(c_tx->mpa_crc_hd);
+			if (crypto_shash_update(c_tx->mpa_crc_hd,
+						(u8 *)&c_tx->pkt,
+						c_tx->ctrl_len))
+				return -EINVAL;
+			crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)crc);
+		}
+		c_tx->ctrl_len += MPA_CRC_SIZE;
+
+		return PKT_COMPLETE;
+	}
+	c_tx->ctrl_len += MPA_CRC_SIZE;
+	c_tx->sge_idx = 0;
+	c_tx->sge_off = 0;
+	c_tx->pbl_idx = 0;
+
+	/*
+	 * Allow direct sending out of the user buffer if the WR is
+	 * non-signalled and the payload is over the threshold.
+	 * Per RDMA verbs, the application should not change the send buffer
+	 * until the work is completed. In iWARP, work completion is only
+	 * local delivery to TCP. TCP may reuse the buffer for
+	 * retransmission. Changing unsent data also breaks the CRC,
+	 * if applied.
+	 */
+	if (c_tx->zcopy_tx && wqe->bytes >= SENDPAGE_THRESH &&
+	    !(tx_flags(wqe) & SIW_WQE_SIGNALLED))
+		c_tx->use_sendpage = 1;
+	else
+		c_tx->use_sendpage = 0;
+
+	return PKT_FRAGMENTED;
+}
+
+/*
+ * Send out one complete control-type FPDU, or the header of an FPDU
+ * carrying data. Used for fixed-sized packets like Read.Requests or
+ * zero-length SENDs, WRITEs and READ.Responses, or for header-only
+ * transmission.
+ */
+static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s,
+			      int flags)
+{
+	struct msghdr msg = { .msg_flags = flags };
+	struct kvec iov = { .iov_base =
+				    (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent,
+			    .iov_len = c_tx->ctrl_len - c_tx->ctrl_sent };
+
+	int rv = kernel_sendmsg(s, &msg, &iov, 1,
+				c_tx->ctrl_len - c_tx->ctrl_sent);
+
+	if (rv >= 0) {
+		c_tx->ctrl_sent += rv;
+
+		if (c_tx->ctrl_sent == c_tx->ctrl_len)
+			rv = 0;
+		else
+			rv = -EAGAIN;
+	}
+	return rv;
+}
+
+/*
+ * 0copy TCP transmit interface: Use do_tcp_sendpages.
+ *
+ * Using sendpage to push page by page appears to be less efficient
+ * than using sendmsg, even if data are copied.
+ *
+ * A general performance limitation might be the extra four-byte
+ * trailer checksum segment to be pushed after the user data.
+ */
+static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset,
+			     size_t size)
+{
+	struct sock *sk = s->sk;
+	int i = 0, rv = 0, sent = 0,
+	    flags = MSG_MORE | MSG_DONTWAIT | MSG_SENDPAGE_NOTLAST;
+
+	while (size) {
+		size_t bytes = min_t(size_t, PAGE_SIZE - offset, size);
+
+		if (size + offset <= PAGE_SIZE)
+			flags = MSG_MORE | MSG_DONTWAIT;
+
+		tcp_rate_check_app_limited(sk);
+try_page_again:
+		lock_sock(sk);
+		rv = do_tcp_sendpages(sk, page[i], offset, bytes, flags);
+		release_sock(sk);
+
+		if (rv > 0) {
+			size -= rv;
+			sent += rv;
+			if (rv != bytes) {
+				offset += rv;
+				bytes -= rv;
+				goto try_page_again;
+			}
+			offset = 0;
+		} else {
+			if (rv == -EAGAIN || rv == 0)
+				break;
+			return rv;
+		}
+		i++;
+	}
+	return sent;
+}
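+
+/*
+ * Design note on the flag handling above: MSG_SENDPAGE_NOTLAST stays
+ * set while further pages follow and is dropped for the final page of
+ * the sequence (size + offset <= PAGE_SIZE). MSG_MORE remains set even
+ * then, since the MPA trailer still follows via kernel_sendmsg().
+ */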
+
+/*
+ * siw_0copy_tx()
+ *
+ * Pushes list of pages to TCP socket. If pages from multiple
+ * SGE's, all referenced pages of each SGE are pushed in one
+ * shot.
+ */
+static int siw_0copy_tx(struct socket *s, struct page **page,
+			struct siw_sge *sge, unsigned int offset,
+			unsigned int size)
+{
+	int i = 0, sent = 0, rv;
+	int sge_bytes = min(sge->length - offset, size);
+
+	offset = (sge->laddr + offset) & ~PAGE_MASK;
+
+	while (sent != size) {
+		rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes);
+		if (rv >= 0) {
+			sent += rv;
+			if (size == sent || sge_bytes > rv)
+				break;
+
+			i += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT;
+			sge++;
+			sge_bytes = min(sge->length, size - sent);
+			offset = sge->laddr & ~PAGE_MASK;
+		} else {
+			sent = rv;
+			break;
+		}
+	}
+	return sent;
+}
+
+#define MAX_TRAILER (MPA_CRC_SIZE + 4)
+
+static void siw_unmap_pages(struct page **pp, unsigned long kmap_mask)
+{
+	while (kmap_mask) {
+		if (kmap_mask & BIT(0))
+			kunmap(*pp);
+		pp++;
+		kmap_mask >>= 1;
+	}
+}
+
+/*
+ * siw_tx_hdt() tries to push a complete packet to TCP where all
+ * packet fragments are referenced by the elements of one iovec.
+ * For the data portion, each involved page must be referenced by
+ * one extra element. All sge's data can be non-aligned to page
+ * boundaries. Two more elements are referencing iWARP header
+ * and trailer:
+ * MAX_ARRAY = 64KB/PAGE_SIZE + 1 + (2 * (SIW_MAX_SGE - 1)) + HDR + TRL
+ */
+#define MAX_ARRAY ((0xffff / PAGE_SIZE) + 1 + (2 * (SIW_MAX_SGE - 1) + 2))
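+
+/*
+ * Illustration, assuming 4 KiB pages and SIW_MAX_SGE == 6 (the value is
+ * defined elsewhere in this driver): MAX_ARRAY = 15 + 1 + (2 * 5 + 2)
+ * = 28 kvec/page slots per FPDU.
+ */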
+
+/*
+ * Write out iov referencing hdr, data and trailer of current FPDU.
+ * Update transmit state dependent on write return status
+ */
+static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
+{
+	struct siw_wqe *wqe = &c_tx->wqe_active;
+	struct siw_sge *sge = &wqe->sqe.sge[c_tx->sge_idx];
+	struct kvec iov[MAX_ARRAY];
+	struct page *page_array[MAX_ARRAY];
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
+
+	int seg = 0, do_crc = c_tx->do_crc, is_kva = 0, rv;
+	unsigned int data_len = c_tx->bytes_unsent, hdr_len = 0, trl_len = 0,
+		     sge_off = c_tx->sge_off, sge_idx = c_tx->sge_idx,
+		     pbl_idx = c_tx->pbl_idx;
+	unsigned long kmap_mask = 0L;
+
+	if (c_tx->state == SIW_SEND_HDR) {
+		if (c_tx->use_sendpage) {
+			rv = siw_tx_ctrl(c_tx, s, MSG_DONTWAIT | MSG_MORE);
+			if (rv)
+				goto done;
+
+			c_tx->state = SIW_SEND_DATA;
+		} else {
+			iov[0].iov_base =
+				(char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent;
+			iov[0].iov_len = hdr_len =
+				c_tx->ctrl_len - c_tx->ctrl_sent;
+			seg = 1;
+		}
+	}
+
+	wqe->processed += data_len;
+
+	while (data_len) { /* walk the list of SGE's */
+		unsigned int sge_len = min(sge->length - sge_off, data_len);
+		unsigned int fp_off = (sge->laddr + sge_off) & ~PAGE_MASK;
+		struct siw_mem *mem;
+
+		if (!(tx_flags(wqe) & SIW_WQE_INLINE)) {
+			mem = wqe->mem[sge_idx];
+			is_kva = mem->mem_obj == NULL ? 1 : 0;
+		} else {
+			is_kva = 1;
+		}
+		if (is_kva && !c_tx->use_sendpage) {
+			/*
+			 * tx from kernel virtual address: either inline data
+			 * or memory region with assigned kernel buffer
+			 */
+			iov[seg].iov_base =
+				(void *)(uintptr_t)(sge->laddr + sge_off);
+			iov[seg].iov_len = sge_len;
+
+			if (do_crc)
+				crypto_shash_update(c_tx->mpa_crc_hd,
+						    iov[seg].iov_base,
+						    sge_len);
+			sge_off += sge_len;
+			data_len -= sge_len;
+			seg++;
+			goto sge_done;
+		}
+
+		while (sge_len) {
+			size_t plen = min((int)PAGE_SIZE - fp_off, sge_len);
+
+			if (!is_kva) {
+				struct page *p;
+
+				if (mem->is_pbl)
+					p = siw_get_pblpage(
+						mem, sge->laddr + sge_off,
+						&pbl_idx);
+				else
+					p = siw_get_upage(mem->umem,
+							  sge->laddr + sge_off);
+				if (unlikely(!p)) {
+					siw_unmap_pages(page_array, kmap_mask);
+					wqe->processed -= c_tx->bytes_unsent;
+					rv = -EFAULT;
+					goto done_crc;
+				}
+				page_array[seg] = p;
+
+				if (!c_tx->use_sendpage) {
+					iov[seg].iov_base = kmap(p) + fp_off;
+					iov[seg].iov_len = plen;
+
+					/* Remember for later kunmap() */
+					kmap_mask |= BIT(seg);
+
+					if (do_crc)
+						crypto_shash_update(
+							c_tx->mpa_crc_hd,
+							iov[seg].iov_base,
+							plen);
+				} else if (do_crc) {
+					crypto_shash_update(c_tx->mpa_crc_hd,
+							    kmap(p) + fp_off,
+							    plen);
+					kunmap(p);
+				}
+			} else {
+				u64 va = sge->laddr + sge_off;
+
+				page_array[seg] = virt_to_page(va & PAGE_MASK);
+				if (do_crc)
+					crypto_shash_update(
+						c_tx->mpa_crc_hd,
+						(void *)(uintptr_t)va,
+						plen);
+			}
+
+			sge_len -= plen;
+			sge_off += plen;
+			data_len -= plen;
+			fp_off = 0;
+
+			if (++seg > (int)MAX_ARRAY) {
+				siw_dbg_qp(tx_qp(c_tx), "too many fragments\n");
+				siw_unmap_pages(page_array, kmap_mask);
+				wqe->processed -= c_tx->bytes_unsent;
+				rv = -EMSGSIZE;
+				goto done_crc;
+			}
+		}
+sge_done:
+		/* Update SGE variables at end of SGE */
+		if (sge_off == sge->length &&
+		    (data_len != 0 || wqe->processed < wqe->bytes)) {
+			sge_idx++;
+			sge++;
+			sge_off = 0;
+		}
+	}
+	/* trailer */
+	if (likely(c_tx->state != SIW_SEND_TRAILER)) {
+		iov[seg].iov_base = &c_tx->trailer.pad[4 - c_tx->pad];
+		iov[seg].iov_len = trl_len = MAX_TRAILER - (4 - c_tx->pad);
+	} else {
+		iov[seg].iov_base = &c_tx->trailer.pad[c_tx->ctrl_sent];
+		iov[seg].iov_len = trl_len = MAX_TRAILER - c_tx->ctrl_sent;
+	}
+
+	if (c_tx->pad) {
+		*(u32 *)c_tx->trailer.pad = 0;
+		if (do_crc)
+			crypto_shash_update(c_tx->mpa_crc_hd,
+				(u8 *)&c_tx->trailer.crc - c_tx->pad,
+				c_tx->pad);
+	}
+	if (!c_tx->mpa_crc_hd)
+		c_tx->trailer.crc = 0;
+	else if (do_crc)
+		crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)&c_tx->trailer.crc);
+
+	data_len = c_tx->bytes_unsent;
+
+	if (c_tx->use_sendpage) {
+		rv = siw_0copy_tx(s, page_array, &wqe->sqe.sge[c_tx->sge_idx],
+				  c_tx->sge_off, data_len);
+		if (rv == data_len) {
+			rv = kernel_sendmsg(s, &msg, &iov[seg], 1, trl_len);
+			if (rv > 0)
+				rv += data_len;
+			else
+				rv = data_len;
+		}
+	} else {
+		rv = kernel_sendmsg(s, &msg, iov, seg + 1,
+				    hdr_len + data_len + trl_len);
+		siw_unmap_pages(page_array, kmap_mask);
+	}
+	if (rv < (int)hdr_len) {
+		/* Not even complete hdr pushed or negative rv */
+		wqe->processed -= data_len;
+		if (rv >= 0) {
+			c_tx->ctrl_sent += rv;
+			rv = -EAGAIN;
+		}
+		goto done_crc;
+	}
+	rv -= hdr_len;
+
+	if (rv >= (int)data_len) {
+		/* all user data pushed to TCP or no data to push */
+		if (data_len > 0 && wqe->processed < wqe->bytes) {
+			/* Save the current state for next tx */
+			c_tx->sge_idx = sge_idx;
+			c_tx->sge_off = sge_off;
+			c_tx->pbl_idx = pbl_idx;
+		}
+		rv -= data_len;
+
+		if (rv == trl_len) /* all pushed */
+			rv = 0;
+		else {
+			c_tx->state = SIW_SEND_TRAILER;
+			c_tx->ctrl_len = MAX_TRAILER;
+			c_tx->ctrl_sent = rv + 4 - c_tx->pad;
+			c_tx->bytes_unsent = 0;
+			rv = -EAGAIN;
+		}
+
+	} else if (data_len > 0) {
+		/* Maybe some user data pushed to TCP */
+		c_tx->state = SIW_SEND_DATA;
+		wqe->processed -= data_len - rv;
+
+		if (rv) {
+			/*
+			 * Some bytes out. Recompute tx state based
+			 * on old state and bytes pushed
+			 */
+			unsigned int sge_unsent;
+
+			c_tx->bytes_unsent -= rv;
+			sge = &wqe->sqe.sge[c_tx->sge_idx];
+			sge_unsent = sge->length - c_tx->sge_off;
+
+			while (sge_unsent <= rv) {
+				rv -= sge_unsent;
+				c_tx->sge_idx++;
+				c_tx->sge_off = 0;
+				sge++;
+				sge_unsent = sge->length;
+			}
+			c_tx->sge_off += rv;
+		}
+		rv = -EAGAIN;
+	}
+done_crc:
+	c_tx->do_crc = 0;
+done:
+	return rv;
+}
+
+static void siw_update_tcpseg(struct siw_iwarp_tx *c_tx,
+				     struct socket *s)
+{
+	struct tcp_sock *tp = tcp_sk(s->sk);
+
+	if (tp->gso_segs) {
+		if (c_tx->gso_seg_limit == 0)
+			c_tx->tcp_seglen = tp->mss_cache * tp->gso_segs;
+		else
+			c_tx->tcp_seglen =
+				tp->mss_cache *
+				min_t(u16, c_tx->gso_seg_limit, tp->gso_segs);
+	} else {
+		c_tx->tcp_seglen = tp->mss_cache;
+	}
+	/* Loopback may give odd numbers */
+	c_tx->tcp_seglen &= 0xfffffff8;
+}
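+
+/*
+ * Example with hypothetical numbers: for mss_cache = 1448 and
+ * gso_segs = 4, an unlimited gso_seg_limit yields tcp_seglen =
+ * 4 * 1448 = 5792 (already a multiple of 8), while gso_seg_limit = 1
+ * limits transmission to a single segment of 1448 bytes.
+ */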
+
+/*
+ * siw_prepare_fpdu()
+ *
+ * Prepares the transmit context to send out one FPDU, if the FPDU
+ * will carry user data which is not immediate data.
+ * Computes maximum FPDU length to fill up TCP MSS if possible.
+ *
+ * @qp:		QP from which to transmit
+ * @wqe:	Current WQE causing transmission
+ *
+ * TODO: Take into account real available sendspace on socket
+ *       to avoid header misalignment due to send pausing within
+ *       fpdu transmission
+ */
+static void siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe)
+{
+	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
+	int data_len;
+
+	c_tx->ctrl_len =
+		iwarp_pktinfo[__rdmap_get_opcode(&c_tx->pkt.ctrl)].hdr_len;
+	c_tx->ctrl_sent = 0;
+
+	/*
+	 * Update target buffer offset if any
+	 */
+	if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED))
+		/* Untagged message */
+		c_tx->pkt.c_untagged.ddp_mo = cpu_to_be32(wqe->processed);
+	else /* Tagged message */
+		c_tx->pkt.c_tagged.ddp_to =
+			cpu_to_be64(wqe->sqe.raddr + wqe->processed);
+
+	data_len = wqe->bytes - wqe->processed;
+	if (data_len + c_tx->ctrl_len + MPA_CRC_SIZE > c_tx->tcp_seglen) {
+		/* Trim DDP payload to fit into current TCP segment */
+		data_len = c_tx->tcp_seglen - (c_tx->ctrl_len + MPA_CRC_SIZE);
+		c_tx->pkt.ctrl.ddp_rdmap_ctrl &= ~DDP_FLAG_LAST;
+		c_tx->pad = 0;
+	} else {
+		c_tx->pkt.ctrl.ddp_rdmap_ctrl |= DDP_FLAG_LAST;
+		c_tx->pad = -data_len & 0x3;
+	}
+	c_tx->bytes_unsent = data_len;
+
+	c_tx->pkt.ctrl.mpa_len =
+		htons(c_tx->ctrl_len + data_len - MPA_HDR_SIZE);
+
+	/*
+	 * Init MPA CRC computation
+	 */
+	if (c_tx->mpa_crc_hd) {
+		crypto_shash_init(c_tx->mpa_crc_hd);
+		crypto_shash_update(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt,
+				    c_tx->ctrl_len);
+		c_tx->do_crc = 1;
+	}
+}
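+
+/*
+ * Worked example with hypothetical numbers: for a WRITE with
+ * wqe->bytes - wqe->processed = 8192, ctrl_len = 28 and
+ * tcp_seglen = 5792, the first FPDU carries 5792 - 28 - 4 = 5760
+ * payload bytes with DDP_FLAG_LAST cleared and no pad; the next call
+ * sends the remaining 2432 bytes with DDP_FLAG_LAST set and pad = 0.
+ */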
+
+/*
+ * siw_check_sgl_tx()
+ *
+ * Check permissions for a list of SGE's (SGL).
+ * A successful check will have all memory referenced
+ * for transmission resolved and assigned to the WQE.
+ *
+ * @pd:		Protection Domain SGL should belong to
+ * @wqe:	WQE to be checked
+ * @perms:	requested access permissions
+ *
+ */
+
+static int siw_check_sgl_tx(struct ib_pd *pd, struct siw_wqe *wqe,
+			    enum ib_access_flags perms)
+{
+	struct siw_sge *sge = &wqe->sqe.sge[0];
+	int i, len, num_sge = wqe->sqe.num_sge;
+
+	if (unlikely(num_sge > SIW_MAX_SGE))
+		return -EINVAL;
+
+	for (i = 0, len = 0; num_sge; num_sge--, i++, sge++) {
+		/*
+		 * rdma verbs: do not check stag for a zero length sge
+		 */
+		if (sge->length) {
+			int rv = siw_check_sge(pd, sge, &wqe->mem[i], perms, 0,
+					       sge->length);
+
+			if (unlikely(rv != E_ACCESS_OK))
+				return rv;
+		}
+		len += sge->length;
+	}
+	return len;
+}
+
+/*
+ * siw_qp_sq_proc_tx()
+ *
+ * Process one WQE which needs transmission on the wire.
+ */
+static int siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe)
+{
+	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
+	struct socket *s = qp->attrs.sk;
+	int rv = 0, burst_len = qp->tx_ctx.burst;
+	enum rdmap_ecode ecode = RDMAP_ECODE_CATASTROPHIC_STREAM;
+
+	if (unlikely(wqe->wr_status == SIW_WR_IDLE))
+		return 0;
+
+	if (!burst_len)
+		burst_len = SQ_USER_MAXBURST;
+
+	if (wqe->wr_status == SIW_WR_QUEUED) {
+		if (!(wqe->sqe.flags & SIW_WQE_INLINE)) {
+			if (tx_type(wqe) == SIW_OP_READ_RESPONSE)
+				wqe->sqe.num_sge = 1;
+
+			if (tx_type(wqe) != SIW_OP_READ &&
+			    tx_type(wqe) != SIW_OP_READ_LOCAL_INV) {
+				/*
+				 * Reference memory to be tx'd w/o checking
+				 * access for LOCAL_READ permission, since
+				 * not defined in RDMA core.
+				 */
+				rv = siw_check_sgl_tx(qp->pd, wqe, 0);
+				if (rv < 0) {
+					if (tx_type(wqe) ==
+					    SIW_OP_READ_RESPONSE)
+						ecode = siw_rdmap_error(-rv);
+					rv = -EINVAL;
+					goto tx_error;
+				}
+				wqe->bytes = rv;
+			} else {
+				wqe->bytes = 0;
+			}
+		} else {
+			wqe->bytes = wqe->sqe.sge[0].length;
+			if (!qp->kernel_verbs) {
+				if (wqe->bytes > SIW_MAX_INLINE) {
+					rv = -EINVAL;
+					goto tx_error;
+				}
+				wqe->sqe.sge[0].laddr =
+					(u64)(uintptr_t)&wqe->sqe.sge[1];
+			}
+		}
+		wqe->wr_status = SIW_WR_INPROGRESS;
+		wqe->processed = 0;
+
+		siw_update_tcpseg(c_tx, s);
+
+		rv = siw_qp_prepare_tx(c_tx);
+		if (rv == PKT_FRAGMENTED) {
+			c_tx->state = SIW_SEND_HDR;
+			siw_prepare_fpdu(qp, wqe);
+		} else if (rv == PKT_COMPLETE) {
+			c_tx->state = SIW_SEND_SHORT_FPDU;
+		} else {
+			goto tx_error;
+		}
+	}
+
+next_segment:
+	siw_dbg_qp(qp, "wr type %d, state %d, data %u, sent %u, id %llx\n",
+		   tx_type(wqe), wqe->wr_status, wqe->bytes, wqe->processed,
+		   wqe->sqe.id);
+
+	if (--burst_len == 0) {
+		rv = -EINPROGRESS;
+		goto tx_done;
+	}
+	if (c_tx->state == SIW_SEND_SHORT_FPDU) {
+		enum siw_opcode tx_type = tx_type(wqe);
+		unsigned int msg_flags;
+
+		if (siw_sq_empty(qp) || !siw_tcp_nagle || burst_len == 1)
+			/*
+			 * End current TCP segment, if SQ runs empty,
+			 * or siw_tcp_nagle is not set, or we bail out
+			 * soon due to no burst credit left.
+			 */
+			msg_flags = MSG_DONTWAIT;
+		else
+			msg_flags = MSG_DONTWAIT | MSG_MORE;
+
+		rv = siw_tx_ctrl(c_tx, s, msg_flags);
+
+		if (!rv && tx_type != SIW_OP_READ &&
+		    tx_type != SIW_OP_READ_LOCAL_INV)
+			wqe->processed = wqe->bytes;
+
+		goto tx_done;
+
+	} else {
+		rv = siw_tx_hdt(c_tx, s);
+	}
+	if (!rv) {
+		/*
+		 * One segment sent. Processing completed if last
+		 * segment, Do next segment otherwise.
+		 */
+		if (unlikely(c_tx->tx_suspend)) {
+			/*
+			 * Verbs, 6.4.: Try stopping sending after a full
+			 * DDP segment if the connection goes down
+			 * (== peer halfclose)
+			 */
+			rv = -ECONNABORTED;
+			goto tx_done;
+		}
+		if (c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_LAST) {
+			siw_dbg_qp(qp, "WQE completed\n");
+			goto tx_done;
+		}
+		c_tx->state = SIW_SEND_HDR;
+
+		siw_update_tcpseg(c_tx, s);
+
+		siw_prepare_fpdu(qp, wqe);
+		goto next_segment;
+	}
+tx_done:
+	qp->tx_ctx.burst = burst_len;
+	return rv;
+
+tx_error:
+	if (ecode != RDMAP_ECODE_CATASTROPHIC_STREAM)
+		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+				   RDMAP_ETYPE_REMOTE_PROTECTION, ecode, 1);
+	else
+		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+				   RDMAP_ETYPE_CATASTROPHIC,
+				   RDMAP_ECODE_UNSPECIFIED, 1);
+	return rv;
+}
+
+static int siw_fastreg_mr(struct ib_pd *pd, struct siw_sqe *sqe)
+{
+	struct ib_mr *base_mr = (struct ib_mr *)(uintptr_t)sqe->base_mr;
+	struct siw_device *sdev = to_siw_dev(pd->device);
+	struct siw_mem *mem = siw_mem_id2obj(sdev, sqe->rkey  >> 8);
+	int rv = 0;
+
+	siw_dbg_pd(pd, "STag 0x%08x\n", sqe->rkey);
+
+	if (unlikely(!mem || !base_mr)) {
+		pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey);
+		return -EINVAL;
+	}
+	if (unlikely(base_mr->rkey >> 8 != sqe->rkey  >> 8)) {
+		pr_warn("siw: fastreg: STag 0x%08x: bad MR\n", sqe->rkey);
+		rv = -EINVAL;
+		goto out;
+	}
+	if (unlikely(mem->pd != pd)) {
+		pr_warn("siw: fastreg: PD mismatch\n");
+		rv = -EINVAL;
+		goto out;
+	}
+	if (unlikely(mem->stag_valid)) {
+		pr_warn("siw: fastreg: STag 0x%08x already valid\n", sqe->rkey);
+		rv = -EINVAL;
+		goto out;
+	}
+	/* Refresh STag since user may have changed key part */
+	mem->stag = sqe->rkey;
+	mem->perms = sqe->access;
+
+	siw_dbg_mem(mem, "STag 0x%08x now valid\n", sqe->rkey);
+	mem->va = base_mr->iova;
+	mem->stag_valid = 1;
+out:
+	siw_mem_put(mem);
+	return rv;
+}
+
+static int siw_qp_sq_proc_local(struct siw_qp *qp, struct siw_wqe *wqe)
+{
+	int rv;
+
+	switch (tx_type(wqe)) {
+	case SIW_OP_REG_MR:
+		rv = siw_fastreg_mr(qp->pd, &wqe->sqe);
+		break;
+
+	case SIW_OP_INVAL_STAG:
+		rv = siw_invalidate_stag(qp->pd, wqe->sqe.rkey);
+		break;
+
+	default:
+		rv = -EINVAL;
+	}
+	return rv;
+}
+
+/*
+ * siw_qp_sq_process()
+ *
+ * Core TX path routine for RDMAP/DDP/MPA using a TCP kernel socket.
+ * Sends RDMAP payload for the current SQ WR @wqe of @qp in one or more
+ * MPA FPDUs, each containing a DDP segment.
+ *
+ * SQ processing may occur in user context as a result of posting
+ * new WQE's or from siw_sq_work_handler() context. Processing in
+ * user context is limited to non-kernel verbs users.
+ *
+ * SQ processing may get paused anytime, possibly in the middle of a WR
+ * or FPDU, if insufficient send space is available. SQ processing
+ * gets resumed from siw_sq_work_handler(), if send space becomes
+ * available again.
+ *
+ * Must be called with the QP state read-locked.
+ *
+ * Note:
+ * An outbound RREQ can be satisfied by the corresponding RRESP
+ * _before_ it gets assigned to the ORQ. This happens regularly
+ * in RDMA READ via loopback case. Since both outbound RREQ and
+ * inbound RRESP can be handled by the same CPU, locking the ORQ
+ * is dead-lock prone and thus not an option. With that, the
+ * RREQ gets assigned to the ORQ _before_ being sent - see
+ * siw_activate_tx() - and pulled back in case of send failure.
+ */
+int siw_qp_sq_process(struct siw_qp *qp)
+{
+	struct siw_wqe *wqe = tx_wqe(qp);
+	enum siw_opcode tx_type;
+	unsigned long flags;
+	int rv = 0;
+
+	siw_dbg_qp(qp, "enter for type %d\n", tx_type(wqe));
+
+next_wqe:
+	/*
+	 * Stop QP processing if SQ state changed
+	 */
+	if (unlikely(qp->tx_ctx.tx_suspend)) {
+		siw_dbg_qp(qp, "tx suspended\n");
+		goto done;
+	}
+	tx_type = tx_type(wqe);
+
+	if (tx_type <= SIW_OP_READ_RESPONSE)
+		rv = siw_qp_sq_proc_tx(qp, wqe);
+	else
+		rv = siw_qp_sq_proc_local(qp, wqe);
+
+	if (!rv) {
+		/*
+		 * WQE processing done
+		 */
+		switch (tx_type) {
+		case SIW_OP_SEND:
+		case SIW_OP_SEND_REMOTE_INV:
+		case SIW_OP_WRITE:
+			siw_wqe_put_mem(wqe, tx_type);
+			/* Fall through */
+
+		case SIW_OP_INVAL_STAG:
+		case SIW_OP_REG_MR:
+			if (tx_flags(wqe) & SIW_WQE_SIGNALLED)
+				siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
+						 SIW_WC_SUCCESS);
+			break;
+
+		case SIW_OP_READ:
+		case SIW_OP_READ_LOCAL_INV:
+			/*
+			 * already enqueued to ORQ queue
+			 */
+			break;
+
+		case SIW_OP_READ_RESPONSE:
+			siw_wqe_put_mem(wqe, tx_type);
+			break;
+
+		default:
+			WARN(1, "undefined WQE type %d\n", tx_type);
+			rv = -EINVAL;
+			goto done;
+		}
+
+		spin_lock_irqsave(&qp->sq_lock, flags);
+		wqe->wr_status = SIW_WR_IDLE;
+		rv = siw_activate_tx(qp);
+		spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+		if (rv <= 0)
+			goto done;
+
+		goto next_wqe;
+
+	} else if (rv == -EAGAIN) {
+		siw_dbg_qp(qp, "sq paused: hd/tr %d of %d, data %d\n",
+			   qp->tx_ctx.ctrl_sent, qp->tx_ctx.ctrl_len,
+			   qp->tx_ctx.bytes_unsent);
+		rv = 0;
+		goto done;
+	} else if (rv == -EINPROGRESS) {
+		rv = siw_sq_start(qp);
+		goto done;
+	} else {
+		/*
+		 * WQE processing failed.
+		 * Verbs 8.3.2:
+		 * o It turns any WQE into a signalled WQE.
+		 * o Local catastrophic error must be surfaced
+		 * o QP must be moved into Terminate state: done by code
+		 *   doing socket state change processing
+		 *
+		 * o TODO: Termination message must be sent.
+		 * o TODO: Implement more precise work completion errors,
+		 *         see enum ib_wc_status in ib_verbs.h
+		 */
+		siw_dbg_qp(qp, "wqe type %d processing failed: %d\n",
+			   tx_type(wqe), rv);
+
+		spin_lock_irqsave(&qp->sq_lock, flags);
+		/*
+		 * RREQ may have already been completed by inbound RRESP!
+		 */
+		if (tx_type == SIW_OP_READ ||
+		    tx_type == SIW_OP_READ_LOCAL_INV) {
+			/* Cleanup pending entry in ORQ */
+			qp->orq_put--;
+			qp->orq[qp->orq_put % qp->attrs.orq_size].flags = 0;
+		}
+		spin_unlock_irqrestore(&qp->sq_lock, flags);
+		/*
+		 * immediately suspends further TX processing
+		 */
+		if (!qp->tx_ctx.tx_suspend)
+			siw_qp_cm_drop(qp, 0);
+
+		switch (tx_type) {
+		case SIW_OP_SEND:
+		case SIW_OP_SEND_REMOTE_INV:
+		case SIW_OP_SEND_WITH_IMM:
+		case SIW_OP_WRITE:
+		case SIW_OP_READ:
+		case SIW_OP_READ_LOCAL_INV:
+			siw_wqe_put_mem(wqe, tx_type);
+			/* Fall through */
+
+		case SIW_OP_INVAL_STAG:
+		case SIW_OP_REG_MR:
+			siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
+					 SIW_WC_LOC_QP_OP_ERR);
+
+			siw_qp_event(qp, IB_EVENT_QP_FATAL);
+
+			break;
+
+		case SIW_OP_READ_RESPONSE:
+			siw_dbg_qp(qp, "proc. read.response failed: %d\n", rv);
+
+			siw_qp_event(qp, IB_EVENT_QP_REQ_ERR);
+
+			siw_wqe_put_mem(wqe, SIW_OP_READ_RESPONSE);
+
+			break;
+
+		default:
+			WARN(1, "undefined WQE type %d\n", tx_type);
+			rv = -EINVAL;
+		}
+		wqe->wr_status = SIW_WR_IDLE;
+	}
+done:
+	return rv;
+}
+
+static void siw_sq_resume(struct siw_qp *qp)
+{
+	if (down_read_trylock(&qp->state_lock)) {
+		if (likely(qp->attrs.state == SIW_QP_STATE_RTS &&
+			   !qp->tx_ctx.tx_suspend)) {
+			int rv = siw_qp_sq_process(qp);
+
+			up_read(&qp->state_lock);
+
+			if (unlikely(rv < 0)) {
+				siw_dbg_qp(qp, "SQ task failed: err %d\n", rv);
+
+				if (!qp->tx_ctx.tx_suspend)
+					siw_qp_cm_drop(qp, 0);
+			}
+		} else {
+			up_read(&qp->state_lock);
+		}
+	} else {
+		siw_dbg_qp(qp, "Resume SQ while QP locked\n");
+	}
+	siw_qp_put(qp);
+}
+
+struct tx_task_t {
+	struct llist_head active;
+	wait_queue_head_t waiting;
+};
+
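+/* Per-CPU list of QPs with pending SQ work, processed by siw_run_sq() */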
+static DEFINE_PER_CPU(struct tx_task_t, siw_tx_task_g);
+
+void siw_stop_tx_thread(int nr_cpu)
+{
+	kthread_stop(siw_tx_thread[nr_cpu]);
+	wake_up(&per_cpu(siw_tx_task_g, nr_cpu).waiting);
+}
+
+int siw_run_sq(void *data)
+{
+	const int nr_cpu = (unsigned int)(long)data;
+	struct llist_node *active;
+	struct siw_qp *qp;
+	struct tx_task_t *tx_task = &per_cpu(siw_tx_task_g, nr_cpu);
+
+	init_llist_head(&tx_task->active);
+	init_waitqueue_head(&tx_task->waiting);
+
+	while (1) {
+		struct llist_node *fifo_list = NULL;
+
+		wait_event_interruptible(tx_task->waiting,
+					 !llist_empty(&tx_task->active) ||
+						 kthread_should_stop());
+
+		if (kthread_should_stop())
+			break;
+
+		active = llist_del_all(&tx_task->active);
+		/*
+		 * llist_del_all returns a list with newest entry first.
+		 * Re-order list for fairness among QP's.
+		 */
+		while (active) {
+			struct llist_node *tmp = active;
+
+			active = llist_next(active);
+			tmp->next = fifo_list;
+			fifo_list = tmp;
+		}
+		while (fifo_list) {
+			qp = container_of(fifo_list, struct siw_qp, tx_list);
+			fifo_list = llist_next(fifo_list);
+			qp->tx_list.next = NULL;
+
+			siw_sq_resume(qp);
+		}
+	}
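+	/*
+	 * Stopping: resume any QPs still queued, which also drops the
+	 * reference taken in siw_sq_start().
+	 */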
+	active = llist_del_all(&tx_task->active);
+	if (active) {
+		llist_for_each_entry(qp, active, tx_list) {
+			qp->tx_list.next = NULL;
+			siw_sq_resume(qp);
+		}
+	}
+	return 0;
+}
+
+int siw_sq_start(struct siw_qp *qp)
+{
+	if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
+		return 0;
+
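+	/* Re-assign the QP if its TX CPU went offline meanwhile */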
+	if (unlikely(!cpu_online(qp->tx_cpu))) {
+		siw_put_tx_cpu(qp->tx_cpu);
+		qp->tx_cpu = siw_get_tx_cpu(qp->sdev);
+		if (qp->tx_cpu < 0) {
+			pr_warn("siw: no tx cpu available\n");
+
+			return -EIO;
+		}
+	}
+	siw_qp_get(qp);
+
+	llist_add(&qp->tx_list, &per_cpu(siw_tx_task_g, qp->tx_cpu).active);
+
+	wake_up(&per_cpu(siw_tx_task_g, qp->tx_cpu).waiting);
+
+	return 0;
+}
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
new file mode 100644
index 0000000..b18a677
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -0,0 +1,1765 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/xarray.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_ioctl.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+
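+/*
+ * Map IB QP states onto the smaller siw state set: RESET and INIT
+ * both map to IDLE, SQD to CLOSING, and SQE to TERMINATE.
+ */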
+static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = {
+	[IB_QPS_RESET] = SIW_QP_STATE_IDLE,
+	[IB_QPS_INIT] = SIW_QP_STATE_IDLE,
+	[IB_QPS_RTR] = SIW_QP_STATE_RTR,
+	[IB_QPS_RTS] = SIW_QP_STATE_RTS,
+	[IB_QPS_SQD] = SIW_QP_STATE_CLOSING,
+	[IB_QPS_SQE] = SIW_QP_STATE_TERMINATE,
+	[IB_QPS_ERR] = SIW_QP_STATE_ERROR
+};
+
+static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = {
+	[IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR",
+	[IB_QPS_RTS] = "RTS",     [IB_QPS_SQD] = "SQD",   [IB_QPS_SQE] = "SQE",
+	[IB_QPS_ERR] = "ERR"
+};
+
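+/*
+ * siw_create_uobj()
+ *
+ * Register a user mmap object and return its xarray key. The key,
+ * scaled to a page offset by the caller, is passed to user space
+ * and later matched against vm_pgoff in siw_mmap().
+ */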
+static u32 siw_create_uobj(struct siw_ucontext *uctx, void *vaddr, u32 size)
+{
+	struct siw_uobj *uobj;
+	struct xa_limit limit = XA_LIMIT(0, SIW_UOBJ_MAX_KEY);
+	u32 key;
+
+	uobj = kzalloc(sizeof(*uobj), GFP_KERNEL);
+	if (!uobj)
+		return SIW_INVAL_UOBJ_KEY;
+
+	if (xa_alloc_cyclic(&uctx->xa, &key, uobj, limit, &uctx->uobj_nextkey,
+			    GFP_KERNEL) < 0) {
+		kfree(uobj);
+		return SIW_INVAL_UOBJ_KEY;
+	}
+	uobj->size = PAGE_ALIGN(size);
+	uobj->addr = vaddr;
+
+	return key;
+}
+
+static struct siw_uobj *siw_get_uobj(struct siw_ucontext *uctx,
+				     unsigned long off, u32 size)
+{
+	struct siw_uobj *uobj = xa_load(&uctx->xa, off);
+
+	if (uobj && uobj->size == size)
+		return uobj;
+
+	return NULL;
+}
+
+int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
+{
+	struct siw_ucontext *uctx = to_siw_ctx(ctx);
+	struct siw_uobj *uobj;
+	unsigned long off = vma->vm_pgoff;
+	int size = vma->vm_end - vma->vm_start;
+	int rv = -EINVAL;
+
+	/*
+	 * Must be page aligned
+	 */
+	if (vma->vm_start & (PAGE_SIZE - 1)) {
+		pr_warn("siw: mmap not page aligned\n");
+		goto out;
+	}
+	uobj = siw_get_uobj(uctx, off, size);
+	if (!uobj) {
+		siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %u\n",
+			off, size);
+		goto out;
+	}
+	rv = remap_vmalloc_range(vma, uobj->addr, 0);
+	if (rv)
+		pr_warn("remap_vmalloc_range failed: %lu, %u\n", off, size);
+out:
+	return rv;
+}
+
+int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(base_ctx->device);
+	struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
+	struct siw_uresp_alloc_ctx uresp = {};
+	int rv;
+
+	if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	xa_init_flags(&ctx->xa, XA_FLAGS_ALLOC);
+	ctx->uobj_nextkey = 0;
+	ctx->sdev = sdev;
+
+	uresp.dev_id = sdev->vendor_part_id;
+
+	if (udata->outlen < sizeof(uresp)) {
+		rv = -EINVAL;
+		goto err_out;
+	}
+	rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+	if (rv)
+		goto err_out;
+
+	siw_dbg(base_ctx->device, "success. now %d context(s)\n",
+		atomic_read(&sdev->num_ctx));
+
+	return 0;
+
+err_out:
+	atomic_dec(&sdev->num_ctx);
+	siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv,
+		atomic_read(&sdev->num_ctx));
+
+	return rv;
+}
+
+void siw_dealloc_ucontext(struct ib_ucontext *base_ctx)
+{
+	struct siw_ucontext *uctx = to_siw_ctx(base_ctx);
+	void *entry;
+	unsigned long index;
+
+	/*
+	 * Make sure all user mmap objects are gone. Since QP, CQ
+	 * and SRQ destroy routines destroy related objects, nothing
+	 * should be found here.
+	 */
+	xa_for_each(&uctx->xa, index, entry) {
+		kfree(xa_erase(&uctx->xa, index));
+		pr_warn("siw: dropping orphaned uobj at %lu\n", index);
+	}
+	xa_destroy(&uctx->xa);
+	atomic_dec(&uctx->sdev->num_ctx);
+}
+
+int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
+		     struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(base_dev);
+
+	if (udata->inlen || udata->outlen)
+		return -EINVAL;
+
+	memset(attr, 0, sizeof(*attr));
+
+	/* Revisit atomic caps if RFC 7306 gets supported */
+	attr->atomic_cap = 0;
+	attr->device_cap_flags =
+		IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_ALLOW_USER_UNREG;
+	attr->max_cq = sdev->attrs.max_cq;
+	attr->max_cqe = sdev->attrs.max_cqe;
+	attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL;
+	attr->max_fmr = sdev->attrs.max_fmr;
+	attr->max_mr = sdev->attrs.max_mr;
+	attr->max_mw = sdev->attrs.max_mw;
+	attr->max_mr_size = ~0ull;
+	attr->max_pd = sdev->attrs.max_pd;
+	attr->max_qp = sdev->attrs.max_qp;
+	attr->max_qp_init_rd_atom = sdev->attrs.max_ird;
+	attr->max_qp_rd_atom = sdev->attrs.max_ord;
+	attr->max_qp_wr = sdev->attrs.max_qp_wr;
+	attr->max_recv_sge = sdev->attrs.max_sge;
+	attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird;
+	attr->max_send_sge = sdev->attrs.max_sge;
+	attr->max_sge_rd = sdev->attrs.max_sge_rd;
+	attr->max_srq = sdev->attrs.max_srq;
+	attr->max_srq_sge = sdev->attrs.max_srq_sge;
+	attr->max_srq_wr = sdev->attrs.max_srq_wr;
+	attr->page_size_cap = PAGE_SIZE;
+	attr->vendor_id = SIW_VENDOR_ID;
+	attr->vendor_part_id = sdev->vendor_part_id;
+
+	memcpy(&attr->sys_image_guid, sdev->netdev->dev_addr, 6);
+
+	return 0;
+}
+
+int siw_query_port(struct ib_device *base_dev, u8 port,
+		   struct ib_port_attr *attr)
+{
+	struct siw_device *sdev = to_siw_dev(base_dev);
+
+	memset(attr, 0, sizeof(*attr));
+
+	attr->active_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
+	attr->active_speed = 2;
+	attr->active_width = 2;
+	attr->gid_tbl_len = 1;
+	attr->max_msg_sz = -1;
+	attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
+	attr->phys_state = sdev->state == IB_PORT_ACTIVE ?
+		IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
+	attr->pkey_tbl_len = 1;
+	attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
+	attr->state = sdev->state;
+	/*
+	 * All zero
+	 *
+	 * attr->lid = 0;
+	 * attr->bad_pkey_cntr = 0;
+	 * attr->qkey_viol_cntr = 0;
+	 * attr->sm_lid = 0;
+	 * attr->lmc = 0;
+	 * attr->max_vl_num = 0;
+	 * attr->sm_sl = 0;
+	 * attr->subnet_timeout = 0;
+	 * attr->init_type_reply = 0;
+	 */
+	return 0;
+}
+
+int siw_get_port_immutable(struct ib_device *base_dev, u8 port,
+			   struct ib_port_immutable *port_immutable)
+{
+	struct ib_port_attr attr;
+	int rv = siw_query_port(base_dev, port, &attr);
+
+	if (rv)
+		return rv;
+
+	port_immutable->pkey_tbl_len = attr.pkey_tbl_len;
+	port_immutable->gid_tbl_len = attr.gid_tbl_len;
+	port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
+
+	return 0;
+}
+
+int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey)
+{
+	/* Report the default pkey */
+	*pkey = 0xffff;
+	return 0;
+}
+
+int siw_query_gid(struct ib_device *base_dev, u8 port, int idx,
+		  union ib_gid *gid)
+{
+	struct siw_device *sdev = to_siw_dev(base_dev);
+
+	/* subnet_prefix == interface_id == 0; */
+	memset(gid, 0, sizeof(*gid));
+	memcpy(&gid->raw[0], sdev->netdev->dev_addr, 6);
+
+	return 0;
+}
+
+int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(pd->device);
+
+	if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) {
+		atomic_dec(&sdev->num_pd);
+		return -ENOMEM;
+	}
+	siw_dbg_pd(pd, "now %d PD(s)\n", atomic_read(&sdev->num_pd));
+
+	return 0;
+}
+
+void siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(pd->device);
+
+	siw_dbg_pd(pd, "free PD\n");
+	atomic_dec(&sdev->num_pd);
+}
+
+void siw_qp_get_ref(struct ib_qp *base_qp)
+{
+	siw_qp_get(to_siw_qp(base_qp));
+}
+
+void siw_qp_put_ref(struct ib_qp *base_qp)
+{
+	siw_qp_put(to_siw_qp(base_qp));
+}
+
+/*
+ * siw_create_qp()
+ *
+ * Create QP of requested size on given device.
+ *
+ * @pd:		Protection Domain
+ * @attrs:	Initial QP attributes.
+ * @udata:	used to provide QP ID, SQ and RQ size back to user.
+ */
+
+struct ib_qp *siw_create_qp(struct ib_pd *pd,
+			    struct ib_qp_init_attr *attrs,
+			    struct ib_udata *udata)
+{
+	struct siw_qp *qp = NULL;
+	struct siw_base_qp *siw_base_qp = NULL;
+	struct ib_device *base_dev = pd->device;
+	struct siw_device *sdev = to_siw_dev(base_dev);
+	struct siw_ucontext *uctx =
+		rdma_udata_to_drv_context(udata, struct siw_ucontext,
+					  base_ucontext);
+	struct siw_cq *scq = NULL, *rcq = NULL;
+	unsigned long flags;
+	int num_sqe, num_rqe, rv = 0;
+
+	siw_dbg(base_dev, "create new QP\n");
+
+	if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) {
+		siw_dbg(base_dev, "too many QP's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (attrs->qp_type != IB_QPT_RC) {
+		siw_dbg(base_dev, "only RC QP's supported\n");
+		rv = -EINVAL;
+		goto err_out;
+	}
+	if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||
+	    (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) ||
+	    (attrs->cap.max_send_sge > SIW_MAX_SGE) ||
+	    (attrs->cap.max_recv_sge > SIW_MAX_SGE)) {
+		siw_dbg(base_dev, "QP size error\n");
+		rv = -EINVAL;
+		goto err_out;
+	}
+	if (attrs->cap.max_inline_data > SIW_MAX_INLINE) {
+		siw_dbg(base_dev, "max inline send: %d > %d\n",
+			attrs->cap.max_inline_data, (int)SIW_MAX_INLINE);
+		rv = -EINVAL;
+		goto err_out;
+	}
+	/*
+	 * NOTE: SQ and RQ WQE's may carry zero-element SGL's,
+	 * but the QP must be able to hold at least one WQE (SQ + RQ)
+	 */
+	if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) {
+		siw_dbg(base_dev, "QP must have send or receive queue\n");
+		rv = -EINVAL;
+		goto err_out;
+	}
+	scq = to_siw_cq(attrs->send_cq);
+	rcq = to_siw_cq(attrs->recv_cq);
+
+	if (!scq || (!rcq && !attrs->srq)) {
+		siw_dbg(base_dev, "send CQ or receive CQ invalid\n");
+		rv = -EINVAL;
+		goto err_out;
+	}
+	siw_base_qp = kzalloc(sizeof(*siw_base_qp), GFP_KERNEL);
+	if (!siw_base_qp) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+	if (!qp) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	siw_base_qp->qp = qp;
+	qp->ib_qp = &siw_base_qp->base_qp;
+
+	init_rwsem(&qp->state_lock);
+	spin_lock_init(&qp->sq_lock);
+	spin_lock_init(&qp->rq_lock);
+	spin_lock_init(&qp->orq_lock);
+
+	qp->kernel_verbs = !udata;
+	qp->xa_sq_index = SIW_INVAL_UOBJ_KEY;
+	qp->xa_rq_index = SIW_INVAL_UOBJ_KEY;
+
+	rv = siw_qp_add(sdev, qp);
+	if (rv)
+		goto err_out;
+
+	/* All queue indices are derived from modulo operations
+	 * on a free running 'get' (consumer) and 'put' (producer)
+	 * unsigned counter. Having queue sizes at power of two
+	 * avoids handling counter wrap around.
+	 */
+	num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr);
+	num_rqe = roundup_pow_of_two(attrs->cap.max_recv_wr);
+
+	if (qp->kernel_verbs)
+		qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe));
+	else
+		qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe));
+
+	if (qp->sendq == NULL) {
+		siw_dbg(base_dev, "SQ size %d alloc failed\n", num_sqe);
+		rv = -ENOMEM;
+		goto err_out_xa;
+	}
+	if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) {
+		if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR)
+			qp->attrs.flags |= SIW_SIGNAL_ALL_WR;
+		else {
+			rv = -EINVAL;
+			goto err_out_xa;
+		}
+	}
+	qp->pd = pd;
+	qp->scq = scq;
+	qp->rcq = rcq;
+
+	if (attrs->srq) {
+		/*
+		 * SRQ support.
+		 * Verbs 6.3.7: ignore RQ size, if SRQ present
+		 * Verbs 6.3.5: do not check PD of SRQ against PD of QP
+		 */
+		qp->srq = to_siw_srq(attrs->srq);
+		qp->attrs.rq_size = 0;
+		siw_dbg(base_dev, "QP [%u]: SRQ attached\n", qp->qp_num);
+	} else if (num_rqe) {
+		if (qp->kernel_verbs)
+			qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe));
+		else
+			qp->recvq =
+				vmalloc_user(num_rqe * sizeof(struct siw_rqe));
+
+		if (qp->recvq == NULL) {
+			siw_dbg(base_dev, "RQ size %d alloc failed\n", num_rqe);
+			rv = -ENOMEM;
+			goto err_out_xa;
+		}
+		qp->attrs.rq_size = num_rqe;
+	}
+	qp->attrs.sq_size = num_sqe;
+	qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
+	qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;
+
+	/* Make those two tunables fixed for now. */
+	qp->tx_ctx.gso_seg_limit = 1;
+	qp->tx_ctx.zcopy_tx = zcopy_tx;
+
+	qp->attrs.state = SIW_QP_STATE_IDLE;
+
+	if (udata) {
+		struct siw_uresp_create_qp uresp = {};
+
+		uresp.num_sqe = num_sqe;
+		uresp.num_rqe = num_rqe;
+		uresp.qp_id = qp_id(qp);
+
+		if (qp->sendq) {
+			qp->xa_sq_index =
+				siw_create_uobj(uctx, qp->sendq,
+					num_sqe * sizeof(struct siw_sqe));
+		}
+		if (qp->recvq) {
+			qp->xa_rq_index =
+				 siw_create_uobj(uctx, qp->recvq,
+					num_rqe * sizeof(struct siw_rqe));
+		}
+		if (qp->xa_sq_index == SIW_INVAL_UOBJ_KEY ||
+		    qp->xa_rq_index == SIW_INVAL_UOBJ_KEY) {
+			rv = -ENOMEM;
+			goto err_out_xa;
+		}
+		uresp.sq_key = qp->xa_sq_index << PAGE_SHIFT;
+		uresp.rq_key = qp->xa_rq_index << PAGE_SHIFT;
+
+		if (udata->outlen < sizeof(uresp)) {
+			rv = -EINVAL;
+			goto err_out_xa;
+		}
+		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+		if (rv)
+			goto err_out_xa;
+	}
+	qp->tx_cpu = siw_get_tx_cpu(sdev);
+	if (qp->tx_cpu < 0) {
+		rv = -EINVAL;
+		goto err_out_xa;
+	}
+	INIT_LIST_HEAD(&qp->devq);
+	spin_lock_irqsave(&sdev->lock, flags);
+	list_add_tail(&qp->devq, &sdev->qp_list);
+	spin_unlock_irqrestore(&sdev->lock, flags);
+
+	return qp->ib_qp;
+
+err_out_xa:
+	xa_erase(&sdev->qp_xa, qp_id(qp));
+err_out:
+	kfree(siw_base_qp);
+
+	if (qp) {
+		if (qp->xa_sq_index != SIW_INVAL_UOBJ_KEY)
+			kfree(xa_erase(&uctx->xa, qp->xa_sq_index));
+		if (qp->xa_rq_index != SIW_INVAL_UOBJ_KEY)
+			kfree(xa_erase(&uctx->xa, qp->xa_rq_index));
+
+		vfree(qp->sendq);
+		vfree(qp->recvq);
+		kfree(qp);
+	}
+	atomic_dec(&sdev->num_qp);
+
+	return ERR_PTR(rv);
+}
+
+/*
+ * Minimum siw_query_qp() verb interface.
+ *
+ * @qp_attr_mask is not used but all available information is provided
+ */
+int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
+		 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+	struct siw_qp *qp;
+	struct siw_device *sdev;
+
+	if (base_qp && qp_attr && qp_init_attr) {
+		qp = to_siw_qp(base_qp);
+		sdev = to_siw_dev(base_qp->device);
+	} else {
+		return -EINVAL;
+	}
+	qp_attr->cap.max_inline_data = SIW_MAX_INLINE;
+	qp_attr->cap.max_send_wr = qp->attrs.sq_size;
+	qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges;
+	qp_attr->cap.max_recv_wr = qp->attrs.rq_size;
+	qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges;
+	qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
+	qp_attr->max_rd_atomic = qp->attrs.irq_size;
+	qp_attr->max_dest_rd_atomic = qp->attrs.orq_size;
+
+	qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
+				   IB_ACCESS_REMOTE_WRITE |
+				   IB_ACCESS_REMOTE_READ;
+
+	qp_init_attr->qp_type = base_qp->qp_type;
+	qp_init_attr->send_cq = base_qp->send_cq;
+	qp_init_attr->recv_cq = base_qp->recv_cq;
+	qp_init_attr->srq = base_qp->srq;
+
+	qp_init_attr->cap = qp_attr->cap;
+
+	return 0;
+}
+
+int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
+			int attr_mask, struct ib_udata *udata)
+{
+	struct siw_qp_attrs new_attrs;
+	enum siw_qp_attr_mask siw_attr_mask = 0;
+	struct siw_qp *qp = to_siw_qp(base_qp);
+	int rv = 0;
+
+	if (!attr_mask)
+		return 0;
+
+	memset(&new_attrs, 0, sizeof(new_attrs));
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS) {
+		siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS;
+
+		if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
+			new_attrs.flags |= SIW_RDMA_READ_ENABLED;
+		if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
+			new_attrs.flags |= SIW_RDMA_WRITE_ENABLED;
+		if (attr->qp_access_flags & IB_ACCESS_MW_BIND)
+			new_attrs.flags |= SIW_RDMA_BIND_ENABLED;
+	}
+	if (attr_mask & IB_QP_STATE) {
+		siw_dbg_qp(qp, "desired IB QP state: %s\n",
+			   ib_qp_state_to_string[attr->qp_state]);
+
+		new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state];
+
+		if (new_attrs.state > SIW_QP_STATE_RTS)
+			qp->tx_ctx.tx_suspend = 1;
+
+		siw_attr_mask |= SIW_QP_ATTR_STATE;
+	}
+	if (!siw_attr_mask)
+		goto out;
+
+	down_write(&qp->state_lock);
+
+	rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask);
+
+	up_write(&qp->state_lock);
+out:
+	return rv;
+}
+
+int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
+{
+	struct siw_qp *qp = to_siw_qp(base_qp);
+	struct siw_ucontext *uctx =
+		rdma_udata_to_drv_context(udata, struct siw_ucontext,
+					  base_ucontext);
+	struct siw_qp_attrs qp_attrs;
+
+	siw_dbg_qp(qp, "state %d\n", qp->attrs.state);
+
+	/*
+	 * Mark the QP as being destroyed to prevent any further
+	 * async callbacks into the RDMA core
+	 */
+	qp->attrs.flags |= SIW_QP_IN_DESTROY;
+	qp->rx_stream.rx_suspend = 1;
+
+	if (uctx && qp->xa_sq_index != SIW_INVAL_UOBJ_KEY)
+		kfree(xa_erase(&uctx->xa, qp->xa_sq_index));
+	if (uctx && qp->xa_rq_index != SIW_INVAL_UOBJ_KEY)
+		kfree(xa_erase(&uctx->xa, qp->xa_rq_index));
+
+	down_write(&qp->state_lock);
+
+	qp_attrs.state = SIW_QP_STATE_ERROR;
+	siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);
+
+	if (qp->cep) {
+		siw_cep_put(qp->cep);
+		qp->cep = NULL;
+	}
+	up_write(&qp->state_lock);
+
+	kfree(qp->tx_ctx.mpa_crc_hd);
+	kfree(qp->rx_stream.mpa_crc_hd);
+
+	qp->scq = qp->rcq = NULL;
+
+	siw_qp_put(qp);
+
+	return 0;
+}
+
+/*
+ * siw_copy_inline_sgl()
+ *
+ * Prepare sgl of inlined data for sending. For userland callers,
+ * the function checks whether the given buffer addresses and lengths
+ * are within process context bounds.
+ * Data from all provided sge's is copied together into the wqe,
+ * referenced by a single sge.
+ */
+static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
+			       struct siw_sqe *sqe)
+{
+	struct ib_sge *core_sge = core_wr->sg_list;
+	void *kbuf = &sqe->sge[1];
+	int num_sge = core_wr->num_sge, bytes = 0;
+
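+	/*
+	 * Inline data is copied into the SQE right after the first SGE
+	 * descriptor; sge[0] then references that buffer.
+	 */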
+	sqe->sge[0].laddr = (uintptr_t)kbuf;
+	sqe->sge[0].lkey = 0;
+
+	while (num_sge--) {
+		if (!core_sge->length) {
+			core_sge++;
+			continue;
+		}
+		bytes += core_sge->length;
+		if (bytes > SIW_MAX_INLINE) {
+			bytes = -EINVAL;
+			break;
+		}
+		memcpy(kbuf, (void *)(uintptr_t)core_sge->addr,
+		       core_sge->length);
+
+		kbuf += core_sge->length;
+		core_sge++;
+	}
+	sqe->sge[0].length = bytes > 0 ? bytes : 0;
+	sqe->num_sge = bytes > 0 ? 1 : 0;
+
+	return bytes;
+}
+
+/*
+ * siw_post_send()
+ *
+ * Post a list of S-WR's to a SQ.
+ *
+ * @base_qp:	Base QP contained in siw QP
+ * @wr:		Null terminated list of user WR's
+ * @bad_wr:	Points to failing WR in case of synchronous failure.
+ */
+int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
+		  const struct ib_send_wr **bad_wr)
+{
+	struct siw_qp *qp = to_siw_qp(base_qp);
+	struct siw_wqe *wqe = tx_wqe(qp);
+
+	unsigned long flags;
+	int rv = 0;
+
+	/*
+	 * Try to acquire QP state lock. Must be non-blocking
+	 * to accommodate kernel clients' needs.
+	 */
+	if (!down_read_trylock(&qp->state_lock)) {
+		*bad_wr = wr;
+		siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state);
+		return -ENOTCONN;
+	}
+	if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
+		up_read(&qp->state_lock);
+		*bad_wr = wr;
+		siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state);
+		return -ENOTCONN;
+	}
+	if (wr && !qp->kernel_verbs) {
+		siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
+		up_read(&qp->state_lock);
+		*bad_wr = wr;
+		return -EINVAL;
+	}
+	spin_lock_irqsave(&qp->sq_lock, flags);
+
+	while (wr) {
+		u32 idx = qp->sq_put % qp->attrs.sq_size;
+		struct siw_sqe *sqe = &qp->sendq[idx];
+
+		if (sqe->flags) {
+			siw_dbg_qp(qp, "sq full\n");
+			rv = -ENOMEM;
+			break;
+		}
+		if (wr->num_sge > qp->attrs.sq_max_sges) {
+			siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
+			rv = -EINVAL;
+			break;
+		}
+		sqe->id = wr->wr_id;
+
+		if ((wr->send_flags & IB_SEND_SIGNALED) ||
+		    (qp->attrs.flags & SIW_SIGNAL_ALL_WR))
+			sqe->flags |= SIW_WQE_SIGNALLED;
+
+		if (wr->send_flags & IB_SEND_FENCE)
+			sqe->flags |= SIW_WQE_READ_FENCE;
+
+		switch (wr->opcode) {
+		case IB_WR_SEND:
+		case IB_WR_SEND_WITH_INV:
+			if (wr->send_flags & IB_SEND_SOLICITED)
+				sqe->flags |= SIW_WQE_SOLICITED;
+
+			if (!(wr->send_flags & IB_SEND_INLINE)) {
+				siw_copy_sgl(wr->sg_list, sqe->sge,
+					     wr->num_sge);
+				sqe->num_sge = wr->num_sge;
+			} else {
+				rv = siw_copy_inline_sgl(wr, sqe);
+				if (rv <= 0) {
+					rv = -EINVAL;
+					break;
+				}
+				sqe->flags |= SIW_WQE_INLINE;
+				sqe->num_sge = 1;
+			}
+			if (wr->opcode == IB_WR_SEND)
+				sqe->opcode = SIW_OP_SEND;
+			else {
+				sqe->opcode = SIW_OP_SEND_REMOTE_INV;
+				sqe->rkey = wr->ex.invalidate_rkey;
+			}
+			break;
+
+		case IB_WR_RDMA_READ_WITH_INV:
+		case IB_WR_RDMA_READ:
+			/*
+			 * iWarp restricts the RREAD sink to an SGL with
+			 * 1 SGE only. We could relax to an SGL with multiple
+			 * elements referring to the SAME ltag, or even send
+			 * a private per-rreq tag referring to a checked
+			 * local sgl with MULTIPLE ltag's.
+			 */
+			if (unlikely(wr->num_sge != 1)) {
+				rv = -EINVAL;
+				break;
+			}
+			siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1);
+			/*
+			 * NOTE: zero length RREAD is allowed!
+			 */
+			sqe->raddr = rdma_wr(wr)->remote_addr;
+			sqe->rkey = rdma_wr(wr)->rkey;
+			sqe->num_sge = 1;
+
+			if (wr->opcode == IB_WR_RDMA_READ)
+				sqe->opcode = SIW_OP_READ;
+			else
+				sqe->opcode = SIW_OP_READ_LOCAL_INV;
+			break;
+
+		case IB_WR_RDMA_WRITE:
+			if (!(wr->send_flags & IB_SEND_INLINE)) {
+				siw_copy_sgl(wr->sg_list, &sqe->sge[0],
+					     wr->num_sge);
+				sqe->num_sge = wr->num_sge;
+			} else {
+				rv = siw_copy_inline_sgl(wr, sqe);
+				if (unlikely(rv < 0)) {
+					rv = -EINVAL;
+					break;
+				}
+				sqe->flags |= SIW_WQE_INLINE;
+				sqe->num_sge = 1;
+			}
+			sqe->raddr = rdma_wr(wr)->remote_addr;
+			sqe->rkey = rdma_wr(wr)->rkey;
+			sqe->opcode = SIW_OP_WRITE;
+			break;
+
+		case IB_WR_REG_MR:
+			sqe->base_mr = (uintptr_t)reg_wr(wr)->mr;
+			sqe->rkey = reg_wr(wr)->key;
+			sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK;
+			sqe->opcode = SIW_OP_REG_MR;
+			break;
+
+		case IB_WR_LOCAL_INV:
+			sqe->rkey = wr->ex.invalidate_rkey;
+			sqe->opcode = SIW_OP_INVAL_STAG;
+			break;
+
+		default:
+			siw_dbg_qp(qp, "ib wr type %d unsupported\n",
+				   wr->opcode);
+			rv = -EINVAL;
+			break;
+		}
+		siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n",
+			   sqe->opcode, sqe->flags,
+			   (void *)(uintptr_t)sqe->id);
+
+		if (unlikely(rv < 0))
+			break;
+
+		/* make SQE only valid after completely written */
+		smp_wmb();
+		sqe->flags |= SIW_WQE_VALID;
+
+		qp->sq_put++;
+		wr = wr->next;
+	}
+
+	/*
+	 * Send directly if SQ processing is not in progress.
+	 * Eventual immediate errors (rv < 0) do not affect the involved
+	 * RI resources (Verbs, 8.3.1) and thus do not prevent SQ
+	 * processing, if new work is already pending. But rv must be
+	 * passed back to the caller.
+	 */
+	if (wqe->wr_status != SIW_WR_IDLE) {
+		spin_unlock_irqrestore(&qp->sq_lock, flags);
+		goto skip_direct_sending;
+	}
+	rv = siw_activate_tx(qp);
+	spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+	if (rv <= 0)
+		goto skip_direct_sending;
+
+	if (qp->kernel_verbs) {
+		rv = siw_sq_start(qp);
+	} else {
+		qp->tx_ctx.in_syscall = 1;
+
+		if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend))
+			siw_qp_cm_drop(qp, 0);
+
+		qp->tx_ctx.in_syscall = 0;
+	}
+skip_direct_sending:
+
+	up_read(&qp->state_lock);
+
+	if (rv >= 0)
+		return 0;
+	/*
+	 * Immediate error
+	 */
+	siw_dbg_qp(qp, "error %d\n", rv);
+
+	*bad_wr = wr;
+	return rv;
+}
+
+/*
+ * siw_post_receive()
+ *
+ * Post a list of R-WR's to a RQ.
+ *
+ * @base_qp:	Base QP contained in siw QP
+ * @wr:		Null terminated list of user WR's
+ * @bad_wr:	Points to failing WR in case of synchronous failure.
+ */
+int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
+		     const struct ib_recv_wr **bad_wr)
+{
+	struct siw_qp *qp = to_siw_qp(base_qp);
+	unsigned long flags;
+	int rv = 0;
+
+	if (qp->srq) {
+		*bad_wr = wr;
+		return -EOPNOTSUPP; /* what else from errno.h? */
+	}
+	/*
+	 * Try to acquire QP state lock. Must be non-blocking
+	 * to accommodate kernel clients' needs.
+	 */
+	if (!down_read_trylock(&qp->state_lock)) {
+		*bad_wr = wr;
+		return -ENOTCONN;
+	}
+	if (!qp->kernel_verbs) {
+		siw_dbg_qp(qp, "no kernel post_recv for user mapped rq\n");
+		up_read(&qp->state_lock);
+		*bad_wr = wr;
+		return -EINVAL;
+	}
+	if (qp->attrs.state > SIW_QP_STATE_RTS) {
+		up_read(&qp->state_lock);
+		*bad_wr = wr;
+		return -EINVAL;
+	}
+	/*
+	 * Serialize potentially multiple producers.
+	 * Not needed for single threaded consumer side.
+	 */
+	spin_lock_irqsave(&qp->rq_lock, flags);
+
+	while (wr) {
+		u32 idx = qp->rq_put % qp->attrs.rq_size;
+		struct siw_rqe *rqe = &qp->recvq[idx];
+
+		if (rqe->flags) {
+			siw_dbg_qp(qp, "RQ full\n");
+			rv = -ENOMEM;
+			break;
+		}
+		if (wr->num_sge > qp->attrs.rq_max_sges) {
+			siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
+			rv = -EINVAL;
+			break;
+		}
+		rqe->id = wr->wr_id;
+		rqe->num_sge = wr->num_sge;
+		siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
+
+		/* make sure RQE is completely written before valid */
+		smp_wmb();
+
+		rqe->flags = SIW_WQE_VALID;
+
+		qp->rq_put++;
+		wr = wr->next;
+	}
+	spin_unlock_irqrestore(&qp->rq_lock, flags);
+
+	up_read(&qp->state_lock);
+
+	if (rv < 0) {
+		siw_dbg_qp(qp, "error %d\n", rv);
+		*bad_wr = wr;
+	}
+	return rv > 0 ? 0 : rv;
+}
+
+void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata)
+{
+	struct siw_cq *cq = to_siw_cq(base_cq);
+	struct siw_device *sdev = to_siw_dev(base_cq->device);
+	struct siw_ucontext *ctx =
+		rdma_udata_to_drv_context(udata, struct siw_ucontext,
+					  base_ucontext);
+
+	siw_dbg_cq(cq, "free CQ resources\n");
+
+	siw_cq_flush(cq);
+
+	if (ctx && cq->xa_cq_index != SIW_INVAL_UOBJ_KEY)
+		kfree(xa_erase(&ctx->xa, cq->xa_cq_index));
+
+	atomic_dec(&sdev->num_cq);
+
+	vfree(cq->queue);
+}
+
+/*
+ * siw_create_cq()
+ *
+ * Populate CQ of requested size
+ *
+ * @base_cq: CQ as allocated by RDMA midlayer
+ * @attr: Initial CQ attributes
+ * @udata: relates to user context
+ */
+
+int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
+		  struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(base_cq->device);
+	struct siw_cq *cq = to_siw_cq(base_cq);
+	int rv, size = attr->cqe;
+
+	if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) {
+		siw_dbg(base_cq->device, "too many CQ's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (size < 1 || size > sdev->attrs.max_cqe) {
+		siw_dbg(base_cq->device, "CQ size error: %d\n", size);
+		rv = -EINVAL;
+		goto err_out;
+	}
+	size = roundup_pow_of_two(size);
+	cq->base_cq.cqe = size;
+	cq->num_cqe = size;
+	cq->xa_cq_index = SIW_INVAL_UOBJ_KEY;
+
+	if (!udata) {
+		cq->kernel_verbs = 1;
+		cq->queue = vzalloc(size * sizeof(struct siw_cqe) +
+				    sizeof(struct siw_cq_ctrl));
+	} else {
+		cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) +
+					 sizeof(struct siw_cq_ctrl));
+	}
+	if (cq->queue == NULL) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	get_random_bytes(&cq->id, 4);
+	siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id);
+
+	spin_lock_init(&cq->lock);
+
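+	/* Notification control block lives right behind the CQE array */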
+	cq->notify = (struct siw_cq_ctrl *)&cq->queue[size];
+
+	if (udata) {
+		struct siw_uresp_create_cq uresp = {};
+		struct siw_ucontext *ctx =
+			rdma_udata_to_drv_context(udata, struct siw_ucontext,
+						  base_ucontext);
+
+		cq->xa_cq_index =
+			siw_create_uobj(ctx, cq->queue,
+					size * sizeof(struct siw_cqe) +
+						sizeof(struct siw_cq_ctrl));
+		if (cq->xa_cq_index == SIW_INVAL_UOBJ_KEY) {
+			rv = -ENOMEM;
+			goto err_out;
+		}
+		uresp.cq_key = cq->xa_cq_index << PAGE_SHIFT;
+		uresp.cq_id = cq->id;
+		uresp.num_cqe = size;
+
+		if (udata->outlen < sizeof(uresp)) {
+			rv = -EINVAL;
+			goto err_out;
+		}
+		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+		if (rv)
+			goto err_out;
+	}
+	return 0;
+
+err_out:
+	siw_dbg(base_cq->device, "CQ creation failed: %d\n", rv);
+
+	if (cq && cq->queue) {
+		struct siw_ucontext *ctx =
+			rdma_udata_to_drv_context(udata, struct siw_ucontext,
+						  base_ucontext);
+		if (cq->xa_cq_index != SIW_INVAL_UOBJ_KEY)
+			kfree(xa_erase(&ctx->xa, cq->xa_cq_index));
+		vfree(cq->queue);
+	}
+	atomic_dec(&sdev->num_cq);
+
+	return rv;
+}
+
+/*
+ * siw_poll_cq()
+ *
+ * Reap CQ entries if available and copy work completion status into
+ * array of WC's provided by caller. Returns number of reaped CQE's.
+ *
+ * @base_cq:	Base CQ contained in siw CQ.
+ * @num_cqe:	Maximum number of CQE's to reap.
+ * @wc:		Array of work completions to be filled by siw.
+ */
+int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc)
+{
+	struct siw_cq *cq = to_siw_cq(base_cq);
+	int i;
+
+	for (i = 0; i < num_cqe; i++) {
+		if (!siw_reap_cqe(cq, wc))
+			break;
+		wc++;
+	}
+	return i;
+}
+
+/*
+ * siw_req_notify_cq()
+ *
+ * Request notification for new CQE's added to that CQ.
+ * Defined flags:
+ * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification
+ *   event if a WQE with notification flag set enters the CQ
+ * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification
+ *   event if a WQE enters the CQ.
+ * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the
+ *   number of unreaped CQE's regardless of their notification
+ *   type and the current or new CQ notification settings.
+ *
+ * @base_cq:	Base CQ contained in siw CQ.
+ * @flags:	Requested notification flags.
+ */
+int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags)
+{
+	struct siw_cq *cq = to_siw_cq(base_cq);
+
+	siw_dbg_cq(cq, "flags: 0x%02x\n", flags);
+
+	if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
+		/*
+		 * Enable CQ event for next solicited completion.
+		 * and make it visible to all associated producers.
+		 */
+		smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED);
+	else
+		/*
+		 * Enable CQ event for any signalled completion.
+		 * and make it visible to all associated producers.
+		 */
+		smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL);
+
+	if (flags & IB_CQ_REPORT_MISSED_EVENTS)
+		return cq->cq_put - cq->cq_get;
+
+	return 0;
+}
+
+/*
+ * siw_dereg_mr()
+ *
+ * Release Memory Region.
+ *
+ * @base_mr: Base MR contained in siw MR.
+ * @udata: points to user context, unused.
+ */
+int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata)
+{
+	struct siw_mr *mr = to_siw_mr(base_mr);
+	struct siw_device *sdev = to_siw_dev(base_mr->device);
+
+	siw_dbg_mem(mr->mem, "deregister MR\n");
+
+	atomic_dec(&sdev->num_mr);
+
+	siw_mr_drop_mem(mr);
+	kfree_rcu(mr, rcu);
+
+	return 0;
+}
+
+/*
+ * siw_reg_user_mr()
+ *
+ * Register Memory Region.
+ *
+ * @pd:		Protection Domain
+ * @start:	starting address of MR (virtual address)
+ * @len:	length of MR
+ * @rnic_va:	not used by siw
+ * @rights:	MR access rights
+ * @udata:	user buffer to communicate STag and Key.
+ */
+struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
+			      u64 rnic_va, int rights, struct ib_udata *udata)
+{
+	struct siw_mr *mr = NULL;
+	struct siw_umem *umem = NULL;
+	struct siw_ureq_reg_mr ureq;
+	struct siw_device *sdev = to_siw_dev(pd->device);
+
+	unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK);
+	int rv;
+
+	siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n",
+		   (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va,
+		   (unsigned long long)len);
+
+	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
+		siw_dbg_pd(pd, "too many mr's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (!len) {
+		rv = -EINVAL;
+		goto err_out;
+	}
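+	/* Check the number of pages to pin against RLIMIT_MEMLOCK */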
+	if (mem_limit != RLIM_INFINITY) {
+		unsigned long num_pages =
+			(PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT;
+		mem_limit >>= PAGE_SHIFT;
+
+		if (num_pages > mem_limit - current->mm->locked_vm) {
+			siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n",
+				   num_pages, mem_limit,
+				   current->mm->locked_vm);
+			rv = -ENOMEM;
+			goto err_out;
+		}
+	}
+	umem = siw_umem_get(start, len, ib_access_writable(rights));
+	if (IS_ERR(umem)) {
+		rv = PTR_ERR(umem);
+		siw_dbg_pd(pd, "getting user memory failed: %d\n", rv);
+		umem = NULL;
+		goto err_out;
+	}
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	rv = siw_mr_add_mem(mr, pd, umem, start, len, rights);
+	if (rv)
+		goto err_out;
+
+	if (udata) {
+		struct siw_uresp_reg_mr uresp = {};
+		struct siw_mem *mem = mr->mem;
+
+		if (udata->inlen < sizeof(ureq)) {
+			rv = -EINVAL;
+			goto err_out;
+		}
+		rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
+		if (rv)
+			goto err_out;
+
+		mr->base_mr.lkey |= ureq.stag_key;
+		mr->base_mr.rkey |= ureq.stag_key;
+		mem->stag |= ureq.stag_key;
+		uresp.stag = mem->stag;
+
+		if (udata->outlen < sizeof(uresp)) {
+			rv = -EINVAL;
+			goto err_out;
+		}
+		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+		if (rv)
+			goto err_out;
+	}
+	mr->mem->stag_valid = 1;
+
+	return &mr->base_mr;
+
+err_out:
+	atomic_dec(&sdev->num_mr);
+	if (mr) {
+		if (mr->mem)
+			siw_mr_drop_mem(mr);
+		kfree_rcu(mr, rcu);
+	} else {
+		if (umem)
+			siw_umem_release(umem, false);
+	}
+	return ERR_PTR(rv);
+}
+
+struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+			   u32 max_sge, struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(pd->device);
+	struct siw_mr *mr = NULL;
+	struct siw_pbl *pbl = NULL;
+	int rv;
+
+	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
+		siw_dbg_pd(pd, "too many mr's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (mr_type != IB_MR_TYPE_MEM_REG) {
+		siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type);
+		rv = -EOPNOTSUPP;
+		goto err_out;
+	}
+	if (max_sge > SIW_MAX_SGE_PBL) {
+		siw_dbg_pd(pd, "too many sge's: %d\n", max_sge);
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	pbl = siw_pbl_alloc(max_sge);
+	if (IS_ERR(pbl)) {
+		rv = PTR_ERR(pbl);
+		siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv);
+		pbl = NULL;
+		goto err_out;
+	}
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0);
+	if (rv)
+		goto err_out;
+
+	mr->mem->is_pbl = 1;
+
+	siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
+
+	return &mr->base_mr;
+
+err_out:
+	atomic_dec(&sdev->num_mr);
+
+	if (!mr) {
+		kfree(pbl);
+	} else {
+		if (mr->mem)
+			siw_mr_drop_mem(mr);
+		kfree_rcu(mr, rcu);
+	}
+	siw_dbg_pd(pd, "failed: %d\n", rv);
+
+	return ERR_PTR(rv);
+}
+
+/* Just used to count number of pages being mapped */
+static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr)
+{
+	return 0;
+}
+
+int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
+		  unsigned int *sg_off)
+{
+	struct scatterlist *slp;
+	struct siw_mr *mr = to_siw_mr(base_mr);
+	struct siw_mem *mem = mr->mem;
+	struct siw_pbl *pbl = mem->pbl;
+	struct siw_pble *pble;
+	unsigned long pbl_size;
+	int i, rv;
+
+	if (!pbl) {
+		siw_dbg_mem(mem, "no PBL allocated\n");
+		return -EINVAL;
+	}
+	pble = pbl->pbe;
+
+	if (pbl->max_buf < num_sle) {
+		siw_dbg_mem(mem, "too many SGE's: %d > %d\n",
+			    num_sle, mem->pbl->max_buf);
+		return -ENOMEM;
+	}
+	for_each_sg(sl, slp, num_sle, i) {
+		if (sg_dma_len(slp) == 0) {
+			siw_dbg_mem(mem, "empty SGE\n");
+			return -EINVAL;
+		}
+		if (i == 0) {
+			pble->addr = sg_dma_address(slp);
+			pble->size = sg_dma_len(slp);
+			pble->pbl_off = 0;
+			pbl_size = pble->size;
+			pbl->num_buf = 1;
+		} else {
+			/* Merge PBL entries if adjacent */
+			if (pble->addr + pble->size == sg_dma_address(slp)) {
+				pble->size += sg_dma_len(slp);
+			} else {
+				pble++;
+				pbl->num_buf++;
+				pble->addr = sg_dma_address(slp);
+				pble->size = sg_dma_len(slp);
+				pble->pbl_off = pbl_size;
+			}
+			pbl_size += sg_dma_len(slp);
+		}
+		siw_dbg_mem(mem,
+			"sge[%d], size %u, addr 0x%p, total %lu\n",
+			i, pble->size, (void *)(uintptr_t)pble->addr,
+			pbl_size);
+	}
+	rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page);
+	if (rv > 0) {
+		mem->len = base_mr->length;
+		mem->va = base_mr->iova;
+		siw_dbg_mem(mem,
+			"%llu bytes, start 0x%pK, %u SLE to %u entries\n",
+			mem->len, (void *)(uintptr_t)mem->va, num_sle,
+			pbl->num_buf);
+	}
+	return rv;
+}
+
+/*
+ * siw_get_dma_mr()
+ *
+ * Create an (empty) DMA memory region, where no umem is attached.
+ */
+struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights)
+{
+	struct siw_device *sdev = to_siw_dev(pd->device);
+	struct siw_mr *mr = NULL;
+	int rv;
+
+	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
+		siw_dbg_pd(pd, "too many mr's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights);
+	if (rv)
+		goto err_out;
+
+	mr->mem->stag_valid = 1;
+
+	siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
+
+	return &mr->base_mr;
+
+err_out:
+	if (rv)
+		kfree(mr);
+
+	atomic_dec(&sdev->num_mr);
+
+	return ERR_PTR(rv);
+}
+
+/*
+ * siw_create_srq()
+ *
+ * Create Shared Receive Queue of attributes @init_attrs
+ * within the protection domain of @base_srq.
+ *
+ * @base_srq:	Base SRQ contained in siw SRQ.
+ * @init_attrs:	SRQ init attributes.
+ * @udata:	points to user context
+ */
+int siw_create_srq(struct ib_srq *base_srq,
+		   struct ib_srq_init_attr *init_attrs, struct ib_udata *udata)
+{
+	struct siw_srq *srq = to_siw_srq(base_srq);
+	struct ib_srq_attr *attrs = &init_attrs->attr;
+	struct siw_device *sdev = to_siw_dev(base_srq->device);
+	struct siw_ucontext *ctx =
+		rdma_udata_to_drv_context(udata, struct siw_ucontext,
+					  base_ucontext);
+	int rv;
+
+	if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) {
+		siw_dbg_pd(base_srq->pd, "too many SRQ's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR ||
+	    attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) {
+		rv = -EINVAL;
+		goto err_out;
+	}
+	srq->max_sge = attrs->max_sge;
+	srq->num_rqe = roundup_pow_of_two(attrs->max_wr);
+	srq->xa_srq_index = SIW_INVAL_UOBJ_KEY;
+	srq->limit = attrs->srq_limit;
+	if (srq->limit)
+		srq->armed = 1;
+
+	srq->kernel_verbs = !udata;
+
+	if (udata)
+		srq->recvq =
+			vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe));
+	else
+		srq->recvq = vzalloc(srq->num_rqe * sizeof(struct siw_rqe));
+
+	if (srq->recvq == NULL) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (udata) {
+		struct siw_uresp_create_srq uresp = {};
+
+		srq->xa_srq_index = siw_create_uobj(
+			ctx, srq->recvq, srq->num_rqe * sizeof(struct siw_rqe));
+
+		if (srq->xa_srq_index == SIW_INVAL_UOBJ_KEY) {
+			rv = -ENOMEM;
+			goto err_out;
+		}
+		uresp.srq_key = srq->xa_srq_index << PAGE_SHIFT;
+		uresp.num_rqe = srq->num_rqe;
+
+		if (udata->outlen < sizeof(uresp)) {
+			rv = -EINVAL;
+			goto err_out;
+		}
+		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+		if (rv)
+			goto err_out;
+	}
+	spin_lock_init(&srq->lock);
+
+	siw_dbg_pd(base_srq->pd, "[SRQ]: success\n");
+
+	return 0;
+
+err_out:
+	if (srq->recvq) {
+		if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY)
+			kfree(xa_erase(&ctx->xa, srq->xa_srq_index));
+		vfree(srq->recvq);
+	}
+	atomic_dec(&sdev->num_srq);
+
+	return rv;
+}
+
+/*
+ * siw_modify_srq()
+ *
+ * Modify SRQ. The caller may resize SRQ and/or set/reset notification
+ * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification.
+ *
+ * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE
+ * parameter. siw_modify_srq() does not check the attrs->max_sge param.
+ */
+int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs,
+		   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+	struct siw_srq *srq = to_siw_srq(base_srq);
+	unsigned long flags;
+	int rv = 0;
+
+	spin_lock_irqsave(&srq->lock, flags);
+
+	if (attr_mask & IB_SRQ_MAX_WR) {
+		/* resize request not yet supported */
+		rv = -EOPNOTSUPP;
+		goto out;
+	}
+	if (attr_mask & IB_SRQ_LIMIT) {
+		if (attrs->srq_limit) {
+			if (unlikely(attrs->srq_limit > srq->num_rqe)) {
+				rv = -EINVAL;
+				goto out;
+			}
+			srq->armed = 1;
+		} else {
+			srq->armed = 0;
+		}
+		srq->limit = attrs->srq_limit;
+	}
+out:
+	spin_unlock_irqrestore(&srq->lock, flags);
+
+	return rv;
+}
+
+/*
+ * siw_query_srq()
+ *
+ * Query SRQ attributes.
+ */
+int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs)
+{
+	struct siw_srq *srq = to_siw_srq(base_srq);
+	unsigned long flags;
+
+	spin_lock_irqsave(&srq->lock, flags);
+
+	attrs->max_wr = srq->num_rqe;
+	attrs->max_sge = srq->max_sge;
+	attrs->srq_limit = srq->limit;
+
+	spin_unlock_irqrestore(&srq->lock, flags);
+
+	return 0;
+}
+
+/*
+ * siw_destroy_srq()
+ *
+ * Destroy SRQ.
+ * It is assumed that the SRQ is not referenced by any
+ * QP anymore - the code trusts the RDMA core environment to keep track
+ * of QP references.
+ */
+void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata)
+{
+	struct siw_srq *srq = to_siw_srq(base_srq);
+	struct siw_device *sdev = to_siw_dev(base_srq->device);
+	struct siw_ucontext *ctx =
+		rdma_udata_to_drv_context(udata, struct siw_ucontext,
+					  base_ucontext);
+
+	if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY)
+		kfree(xa_erase(&ctx->xa, srq->xa_srq_index));
+
+	vfree(srq->recvq);
+	atomic_dec(&sdev->num_srq);
+}
+
+/*
+ * siw_post_srq_recv()
+ *
+ * Post a list of receive queue elements to SRQ.
+ * NOTE: The function does not check or lock a certain SRQ state
+ *       during the post operation. The code simply trusts the
+ *       RDMA core environment.
+ *
+ * @base_srq:	Base SRQ contained in siw SRQ
+ * @wr:		List of R-WR's
+ * @bad_wr:	Updated to failing WR if posting fails.
+ */
+int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
+		      const struct ib_recv_wr **bad_wr)
+{
+	struct siw_srq *srq = to_siw_srq(base_srq);
+	unsigned long flags;
+	int rv = 0;
+
+	if (unlikely(!srq->kernel_verbs)) {
+		siw_dbg_pd(base_srq->pd,
+			   "[SRQ]: no kernel post_recv for mapped srq\n");
+		rv = -EINVAL;
+		goto out;
+	}
+	/*
+	 * Serialize potentially multiple producers.
+	 * Also needed to serialize potentially multiple
+	 * consumers.
+	 */
+	spin_lock_irqsave(&srq->lock, flags);
+
+	while (wr) {
+		u32 idx = srq->rq_put % srq->num_rqe;
+		struct siw_rqe *rqe = &srq->recvq[idx];
+
+		if (rqe->flags) {
+			siw_dbg_pd(base_srq->pd, "SRQ full\n");
+			rv = -ENOMEM;
+			break;
+		}
+		if (unlikely(wr->num_sge > srq->max_sge)) {
+			siw_dbg_pd(base_srq->pd,
+				   "[SRQ]: too many sge's: %d\n", wr->num_sge);
+			rv = -EINVAL;
+			break;
+		}
+		rqe->id = wr->wr_id;
+		rqe->num_sge = wr->num_sge;
+		siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
+
+		/* Make sure S-RQE is completely written before valid */
+		smp_wmb();
+
+		rqe->flags = SIW_WQE_VALID;
+
+		srq->rq_put++;
+		wr = wr->next;
+	}
+	spin_unlock_irqrestore(&srq->lock, flags);
+out:
+	if (unlikely(rv < 0)) {
+		siw_dbg_pd(base_srq->pd, "[SRQ]: error %d\n", rv);
+		*bad_wr = wr;
+	}
+	return rv;
+}
+
+void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype)
+{
+	struct ib_event event;
+	struct ib_qp *base_qp = qp->ib_qp;
+
+	/*
+	 * Do not report asynchronous errors on QP which gets
+	 * destroyed via verbs interface (siw_destroy_qp())
+	 */
+	if (qp->attrs.flags & SIW_QP_IN_DESTROY)
+		return;
+
+	event.event = etype;
+	event.device = base_qp->device;
+	event.element.qp = base_qp;
+
+	if (base_qp->event_handler) {
+		siw_dbg_qp(qp, "reporting event %d\n", etype);
+		base_qp->event_handler(&event, base_qp->qp_context);
+	}
+}
+
+void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype)
+{
+	struct ib_event event;
+	struct ib_cq *base_cq = &cq->base_cq;
+
+	event.event = etype;
+	event.device = base_cq->device;
+	event.element.cq = base_cq;
+
+	if (base_cq->event_handler) {
+		siw_dbg_cq(cq, "reporting CQ event %d\n", etype);
+		base_cq->event_handler(&event, base_cq->cq_context);
+	}
+}
+
+void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype)
+{
+	struct ib_event event;
+	struct ib_srq *base_srq = &srq->base_srq;
+
+	event.event = etype;
+	event.device = base_srq->device;
+	event.element.srq = base_srq;
+
+	if (base_srq->event_handler) {
+		siw_dbg_pd(srq->base_srq.pd,
+			   "reporting SRQ event %d\n", etype);
+		base_srq->event_handler(&event, base_srq->srq_context);
+	}
+}
+
+void siw_port_event(struct siw_device *sdev, u8 port, enum ib_event_type etype)
+{
+	struct ib_event event;
+
+	event.event = etype;
+	event.device = &sdev->base_dev;
+	event.element.port_num = port;
+
+	siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype);
+
+	ib_dispatch_event(&event);
+}
diff --git a/drivers/infiniband/sw/siw/siw_verbs.h b/drivers/infiniband/sw/siw/siw_verbs.h
new file mode 100644
index 0000000..1910869
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_verbs.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _SIW_VERBS_H
+#define _SIW_VERBS_H
+
+#include <linux/errno.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_cm.h"
+
+/*
+ * siw_copy_sgl()
+ *
+ * Copy SGL from RDMA core representation to local
+ * representation.
+ */
+static inline void siw_copy_sgl(struct ib_sge *sge, struct siw_sge *siw_sge,
+				int num_sge)
+{
+	while (num_sge--) {
+		siw_sge->laddr = sge->addr;
+		siw_sge->length = sge->length;
+		siw_sge->lkey = sge->lkey;
+
+		siw_sge++;
+		sge++;
+	}
+}
+
+int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata);
+void siw_dealloc_ucontext(struct ib_ucontext *base_ctx);
+int siw_query_port(struct ib_device *base_dev, u8 port,
+		   struct ib_port_attr *attr);
+int siw_get_port_immutable(struct ib_device *base_dev, u8 port,
+			   struct ib_port_immutable *port_immutable);
+int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
+		     struct ib_udata *udata);
+int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
+		  struct ib_udata *udata);
+int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey);
+int siw_query_gid(struct ib_device *base_dev, u8 port, int idx,
+		  union ib_gid *gid);
+int siw_alloc_pd(struct ib_pd *base_pd, struct ib_udata *udata);
+void siw_dealloc_pd(struct ib_pd *base_pd, struct ib_udata *udata);
+struct ib_qp *siw_create_qp(struct ib_pd *base_pd,
+			    struct ib_qp_init_attr *attr,
+			    struct ib_udata *udata);
+int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
+		 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
+int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
+			int attr_mask, struct ib_udata *udata);
+int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata);
+int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
+		  const struct ib_send_wr **bad_wr);
+int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
+		     const struct ib_recv_wr **bad_wr);
+void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata);
+int siw_poll_cq(struct ib_cq *base_cq, int num_entries, struct ib_wc *wc);
+int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags);
+struct ib_mr *siw_reg_user_mr(struct ib_pd *base_pd, u64 start, u64 len,
+			      u64 rnic_va, int rights, struct ib_udata *udata);
+struct ib_mr *siw_alloc_mr(struct ib_pd *base_pd, enum ib_mr_type mr_type,
+			   u32 max_sge, struct ib_udata *udata);
+struct ib_mr *siw_get_dma_mr(struct ib_pd *base_pd, int rights);
+int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
+		  unsigned int *sg_off);
+int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata);
+int siw_create_srq(struct ib_srq *base_srq, struct ib_srq_init_attr *attr,
+		   struct ib_udata *udata);
+int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr,
+		   enum ib_srq_attr_mask mask, struct ib_udata *udata);
+int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr);
+void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata);
+int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
+		      const struct ib_recv_wr **bad_wr);
+int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma);
+void siw_qp_event(struct siw_qp *qp, enum ib_event_type type);
+void siw_cq_event(struct siw_cq *cq, enum ib_event_type type);
+void siw_srq_event(struct siw_srq *srq, enum ib_event_type type);
+void siw_port_event(struct siw_device *dev, u8 port, enum ib_event_type type);
+
+#endif
diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig
index cda8eac..7af6860 100644
--- a/drivers/infiniband/ulp/ipoib/Kconfig
+++ b/drivers/infiniband/ulp/ipoib/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_IPOIB
 	tristate "IP-over-InfiniBand"
 	depends on NETDEVICES && INET && (IPV6 || IPV6=n)
@@ -6,7 +7,7 @@
 	  transports IP packets over InfiniBand so you can use your IB
 	  device as a fancy NIC.
 
-	  See Documentation/infiniband/ipoib.txt for more information
+	  See Documentation/infiniband/ipoib.rst for more information
 
 config INFINIBAND_IPOIB_CM
 	bool "IP-over-InfiniBand Connected Mode support"
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 1abe3c6..2aa3457 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -248,7 +248,6 @@
 	struct list_head     list;
 	struct net_device   *dev;
 	struct ipoib_neigh  *neigh;
-	struct ipoib_path   *path;
 	struct ipoib_tx_buf *tx_ring;
 	unsigned int	     tx_head;
 	unsigned int	     tx_tail;
@@ -499,8 +498,10 @@
 struct ipoib_path *__path_find(struct net_device *dev, void *gid);
 void ipoib_mark_paths_invalid(struct net_device *dev);
 void ipoib_flush_paths(struct net_device *dev);
-struct ipoib_dev_priv *ipoib_intf_alloc(struct ib_device *hca, u8 port,
-					const char *format);
+struct net_device *ipoib_intf_alloc(struct ib_device *hca, u8 port,
+				    const char *format);
+int ipoib_intf_init(struct ib_device *hca, u8 port, const char *format,
+		    struct net_device *dev);
 void ipoib_ib_tx_timer_func(struct timer_list *t);
 void ipoib_ib_dev_flush_light(struct work_struct *work);
 void ipoib_ib_dev_flush_normal(struct work_struct *work);
@@ -531,6 +532,8 @@
 void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv,
 			struct ipoib_tx_buf *tx_req);
 
+struct rtnl_link_ops *ipoib_get_link_ops(void);
+
 static inline void ipoib_build_sge(struct ipoib_dev_priv *priv,
 				   struct ipoib_tx_buf *tx_req)
 {
@@ -777,12 +780,12 @@
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 void ipoib_create_debug_files(struct net_device *dev);
 void ipoib_delete_debug_files(struct net_device *dev);
-int ipoib_register_debugfs(void);
+void ipoib_register_debugfs(void);
 void ipoib_unregister_debugfs(void);
 #else
 static inline void ipoib_create_debug_files(struct net_device *dev) { }
 static inline void ipoib_delete_debug_files(struct net_device *dev) { }
-static inline int ipoib_register_debugfs(void) { return 0; }
+static inline void ipoib_register_debugfs(void) { }
 static inline void ipoib_unregister_debugfs(void) { }
 #endif
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 0428e01..c59e00a 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -1153,7 +1153,6 @@
 		ret = -ENOMEM;
 		goto err_tx;
 	}
-	memset(p->tx_ring, 0, ipoib_sendq_size * sizeof(*p->tx_ring));
 
 	p->qp = ipoib_cm_create_tx_qp(p->dev, p);
 	memalloc_noio_restore(noio_flag);
@@ -1312,7 +1311,6 @@
 
 	neigh->cm = tx;
 	tx->neigh = neigh;
-	tx->path = path;
 	tx->dev = dev;
 	list_add(&tx->list, &priv->cm.start_list);
 	set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
@@ -1371,7 +1369,7 @@
 				neigh->daddr + QPN_AND_OPTIONS_OFFSET);
 			goto free_neigh;
 		}
-		memcpy(&pathrec, &p->path->pathrec, sizeof(pathrec));
+		memcpy(&pathrec, &path->pathrec, sizeof(pathrec));
 
 		spin_unlock_irqrestore(&priv->lock, flags);
 		netif_tx_unlock_bh(dev);
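
The dropped memset is redundant: the ring buffer comes from a zeroing allocator and is already zero-filled. A minimal sketch of the allocation as it appears in the surrounding ipoib_cm_tx_init() code (the call itself is outside this hunk, so treat the exact form as an assumption):

	p->tx_ring = vzalloc(array_size(ipoib_sendq_size, sizeof(*p->tx_ring)));
	if (!p->tx_ring) {
		ret = -ENOMEM;
		goto err_tx;
	}
	/* vzalloc() returns zeroed memory, no memset() required */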
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
index 8342992..63e4f9d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -138,7 +138,6 @@
 			p += ETH_GSTRING_LEN;
 		}
 		break;
-	case ETH_SS_TEST:
 	default:
 		break;
 	}
@@ -149,7 +148,6 @@
 	switch (sset) {
 	case ETH_SS_STATS:
 		return IPOIB_GLOBAL_STATS_LEN;
-	case ETH_SS_TEST:
 	default:
 		break;
 	}
@@ -222,6 +220,7 @@
 	.get_strings		= ipoib_get_strings,
 	.get_ethtool_stats	= ipoib_get_ethtool_stats,
 	.get_sset_count		= ipoib_get_sset_count,
+	.get_link		= ethtool_op_get_link,
 };
 
 void ipoib_set_ethtool_ops(struct net_device *dev)
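
The newly wired .get_link hook is the stock ethtool helper; it simply reports the carrier state that ipoib already maintains via netif_carrier_on()/netif_carrier_off(). Roughly:

	u32 ethtool_op_get_link(struct net_device *dev)
	{
		return netif_carrier_ok(dev) ? 1 : 0;
	}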
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
index 1784880..64c19f6 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
@@ -267,14 +267,10 @@
 	snprintf(name, sizeof(name), "%s_mcg", dev->name);
 	priv->mcg_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO,
 					       ipoib_root, dev, &ipoib_mcg_fops);
-	if (!priv->mcg_dentry)
-		ipoib_warn(priv, "failed to create mcg debug file\n");
 
 	snprintf(name, sizeof(name), "%s_path", dev->name);
 	priv->path_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO,
 						ipoib_root, dev, &ipoib_path_fops);
-	if (!priv->path_dentry)
-		ipoib_warn(priv, "failed to create path debug file\n");
 }
 
 void ipoib_delete_debug_files(struct net_device *dev)
@@ -286,10 +282,9 @@
 	priv->mcg_dentry = priv->path_dentry = NULL;
 }
 
-int ipoib_register_debugfs(void)
+void ipoib_register_debugfs(void)
 {
 	ipoib_root = debugfs_create_dir("ipoib", NULL);
-	return ipoib_root ? 0 : -ENOMEM;
 }
 
 void ipoib_unregister_debugfs(void)
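
These hunks adopt the kernel-wide convention that debugfs creation results are not checked: the debugfs core tolerates NULL or error-valued dentries in later calls, and teardown is a single remove of the root. A minimal sketch of the pattern with hypothetical example_* names (example_state_fops stands in for a real struct file_operations):

	#include <linux/debugfs.h>

	static struct dentry *example_root;

	static void example_register_debugfs(void)
	{
		/* no error checking: a NULL/ERR_PTR parent is accepted by
		 * every subsequent debugfs_create_*() call */
		example_root = debugfs_create_dir("example", NULL);
		debugfs_create_file("state", 0444, example_root, NULL,
				    &example_state_fops);
	}

	static void example_unregister_debugfs(void)
	{
		debugfs_remove_recursive(example_root);
	}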
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 9006a13..c332b47 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -66,7 +66,7 @@
 	ah->last_send = 0;
 	kref_init(&ah->ref);
 
-	vah = rdma_create_ah(pd, attr);
+	vah = rdma_create_ah(pd, attr, RDMA_CREATE_AH_SLEEPABLE);
 	if (IS_ERR(vah)) {
 		kfree(ah);
 		ah = (struct ipoib_ah *)vah;
@@ -293,7 +293,8 @@
 		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 		mapping[i + off] = ib_dma_map_page(ca,
 						 skb_frag_page(frag),
-						 frag->page_offset, skb_frag_size(frag),
+						 skb_frag_off(frag),
+						 skb_frag_size(frag),
 						 DMA_TO_DEVICE);
 		if (unlikely(ib_dma_mapping_error(ca, mapping[i + off])))
 			goto partial_error;
@@ -669,7 +670,6 @@
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
 	struct ipoib_ah *ah, *tah;
-	LIST_HEAD(remove_list);
 	unsigned long flags;
 
 	netif_tx_lock_bh(dev);
@@ -678,7 +678,7 @@
 	list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
 		if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
 			list_del(&ah->list);
-			rdma_destroy_ah(ah->ah);
+			rdma_destroy_ah(ah->ah, 0);
 			kfree(ah);
 		}
 
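
Two API transitions run through this file: skb fragments are accessed through the skb_frag_off()/skb_frag_size() helpers instead of touching page_offset directly, and the address-handle calls take an explicit flags argument saying whether the call may sleep. A condensed sketch of both, using the variables from the hunks above:

	/* fragment mapping via the opaque accessors */
	mapping[i + off] = ib_dma_map_page(ca, skb_frag_page(frag),
					   skb_frag_off(frag),
					   skb_frag_size(frag),
					   DMA_TO_DEVICE);

	/* AH calls: RDMA_CREATE_AH_SLEEPABLE means the caller may sleep,
	 * a flags value of 0 means it may not */
	vah = rdma_create_ah(pd, attr, RDMA_CREATE_AH_SLEEPABLE);
	rdma_destroy_ah(ah->ah, 0);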
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 30f840f..ac0583f 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -167,7 +167,7 @@
 			if (flags & IFF_UP)
 				continue;
 
-			dev_change_flags(cpriv->dev, flags | IFF_UP);
+			dev_change_flags(cpriv->dev, flags | IFF_UP, NULL);
 		}
 		up_read(&priv->vlan_rwsem);
 	}
@@ -207,7 +207,7 @@
 			if (!(flags & IFF_UP))
 				continue;
 
-			dev_change_flags(cpriv->dev, flags & ~IFF_UP);
+			dev_change_flags(cpriv->dev, flags & ~IFF_UP, NULL);
 		}
 		up_read(&priv->vlan_rwsem);
 	}
@@ -243,7 +243,8 @@
 		return 0;
 	}
 
-	if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
+	if (new_mtu < (ETH_MIN_MTU + IPOIB_ENCAP_LEN) ||
+	    new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
 		return -EINVAL;
 
 	priv->admin_mtu = new_mtu;
@@ -612,7 +613,7 @@
 	while ((skb = __skb_dequeue(&path->queue)))
 		dev_kfree_skb_irq(skb);
 
-	ipoib_dbg(ipoib_priv(dev), "path_free\n");
+	ipoib_dbg(ipoib_priv(dev), "%s\n", __func__);
 
 	/* remove all neigh connected to this path */
 	ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
@@ -1640,7 +1641,7 @@
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
 
-	ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n");
+	ipoib_dbg(priv, "%s\n", __func__);
 	init_completion(&priv->ntbl.deleted);
 
 	cancel_delayed_work_sync(&priv->neigh_reap_task);
@@ -1822,7 +1823,7 @@
 	 * running ensures that it will not add more work.
 	 */
 	rtnl_lock();
-	dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
+	dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP, NULL);
 	rtnl_unlock();
 
 	/* ipoib_event() cannot be running once this returns */
@@ -1892,12 +1893,6 @@
 	struct ipoib_dev_priv *priv = ipoib_priv(ndev);
 	struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
 
-	dev_hold(priv->parent);
-
-	down_write(&ppriv->vlan_rwsem);
-	list_add_tail(&priv->list, &ppriv->child_intfs);
-	up_write(&ppriv->vlan_rwsem);
-
 	priv->max_ib_mtu = ppriv->max_ib_mtu;
 	set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
 	memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN);
@@ -1940,6 +1935,17 @@
 	if (rc) {
 		pr_warn("%s: failed to initialize device: %s port %d (ret = %d)\n",
 			priv->ca->name, priv->dev->name, priv->port, rc);
+		return rc;
+	}
+
+	if (priv->parent) {
+		struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
+
+		dev_hold(priv->parent);
+
+		down_write(&ppriv->vlan_rwsem);
+		list_add_tail(&priv->list, &ppriv->child_intfs);
+		up_write(&ppriv->vlan_rwsem);
 	}
 
 	return 0;
@@ -1957,6 +1963,14 @@
 	 */
 	WARN_ON(!list_empty(&priv->child_intfs));
 
+	if (priv->parent) {
+		struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
+
+		down_write(&ppriv->vlan_rwsem);
+		list_del(&priv->list);
+		up_write(&ppriv->vlan_rwsem);
+	}
+
 	ipoib_neigh_hash_uninit(dev);
 
 	ipoib_ib_dev_cleanup(dev);
@@ -1968,15 +1982,8 @@
 		priv->wq = NULL;
 	}
 
-	if (priv->parent) {
-		struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
-
-		down_write(&ppriv->vlan_rwsem);
-		list_del(&priv->list);
-		up_write(&ppriv->vlan_rwsem);
-
+	if (priv->parent)
 		dev_put(priv->parent);
-	}
 }
 
 static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state)
@@ -1997,6 +2004,7 @@
 		return err;
 
 	ivf->vf = vf;
+	memcpy(ivf->mac, dev->dev_addr, dev->addr_len);
 
 	return 0;
 }
@@ -2117,82 +2125,58 @@
 	.ndo_stop		 = ipoib_ib_dev_stop_default,
 };
 
-static struct net_device
-*ipoib_create_netdev_default(struct ib_device *hca,
-			     const char *name,
-			     unsigned char name_assign_type,
-			     void (*setup)(struct net_device *))
+static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u8 port,
+					     const char *name)
 {
 	struct net_device *dev;
-	struct rdma_netdev *rn;
 
-	dev = alloc_netdev((int)sizeof(struct rdma_netdev),
-			   name,
-			   name_assign_type, setup);
+	dev = rdma_alloc_netdev(hca, port, RDMA_NETDEV_IPOIB, name,
+				NET_NAME_UNKNOWN, ipoib_setup_common);
+	if (!IS_ERR(dev) || PTR_ERR(dev) != -EOPNOTSUPP)
+		return dev;
+
+	dev = alloc_netdev(sizeof(struct rdma_netdev), name, NET_NAME_UNKNOWN,
+			   ipoib_setup_common);
 	if (!dev)
-		return NULL;
-
-	rn = netdev_priv(dev);
-
-	rn->send = ipoib_send;
-	rn->attach_mcast = ipoib_mcast_attach;
-	rn->detach_mcast = ipoib_mcast_detach;
-	rn->hca = hca;
-	dev->netdev_ops = &ipoib_netdev_default_pf;
-
+		return ERR_PTR(-ENOMEM);
 	return dev;
 }
 
-static struct net_device *ipoib_get_netdev(struct ib_device *hca, u8 port,
-					   const char *name)
+int ipoib_intf_init(struct ib_device *hca, u8 port, const char *name,
+		    struct net_device *dev)
 {
-	struct net_device *dev;
-
-	if (hca->alloc_rdma_netdev) {
-		dev = hca->alloc_rdma_netdev(hca, port,
-					     RDMA_NETDEV_IPOIB, name,
-					     NET_NAME_UNKNOWN,
-					     ipoib_setup_common);
-		if (IS_ERR_OR_NULL(dev) && PTR_ERR(dev) != -EOPNOTSUPP)
-			return NULL;
-	}
-
-	if (!hca->alloc_rdma_netdev || PTR_ERR(dev) == -EOPNOTSUPP)
-		dev = ipoib_create_netdev_default(hca, name, NET_NAME_UNKNOWN,
-						  ipoib_setup_common);
-
-	return dev;
-}
-
-struct ipoib_dev_priv *ipoib_intf_alloc(struct ib_device *hca, u8 port,
-					const char *name)
-{
-	struct net_device *dev;
+	struct rdma_netdev *rn = netdev_priv(dev);
 	struct ipoib_dev_priv *priv;
-	struct rdma_netdev *rn;
+	int rc;
 
 	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
 	if (!priv)
-		return NULL;
+		return -ENOMEM;
 
 	priv->ca = hca;
 	priv->port = port;
 
-	dev = ipoib_get_netdev(hca, port, name);
-	if (!dev)
-		goto free_priv;
+	rc = rdma_init_netdev(hca, port, RDMA_NETDEV_IPOIB, name,
+			      NET_NAME_UNKNOWN, ipoib_setup_common, dev);
+	if (rc) {
+		if (rc != -EOPNOTSUPP)
+			goto out;
+
+		dev->netdev_ops = &ipoib_netdev_default_pf;
+		rn->send = ipoib_send;
+		rn->attach_mcast = ipoib_mcast_attach;
+		rn->detach_mcast = ipoib_mcast_detach;
+		rn->hca = hca;
+	}
 
 	priv->rn_ops = dev->netdev_ops;
 
-	/* fixme : should be after the query_cap */
-	if (priv->hca_caps & IB_DEVICE_VIRTUAL_FUNCTION)
+	if (hca->attrs.device_cap_flags & IB_DEVICE_VIRTUAL_FUNCTION)
 		dev->netdev_ops	= &ipoib_netdev_ops_vf;
 	else
 		dev->netdev_ops	= &ipoib_netdev_ops_pf;
 
-	rn = netdev_priv(dev);
 	rn->clnt_priv = priv;
-
 	/*
 	 * Only the child register_netdev flows can handle priv_destructor
 	 * being set, so we force it to NULL here and handle manually until it
@@ -2203,10 +2187,35 @@
 
 	ipoib_build_priv(dev);
 
-	return priv;
-free_priv:
+	return 0;
+
+out:
 	kfree(priv);
-	return NULL;
+	return rc;
+}
+
+struct net_device *ipoib_intf_alloc(struct ib_device *hca, u8 port,
+				    const char *name)
+{
+	struct net_device *dev;
+	int rc;
+
+	dev = ipoib_alloc_netdev(hca, port, name);
+	if (IS_ERR(dev))
+		return dev;
+
+	rc = ipoib_intf_init(hca, port, name, dev);
+	if (rc) {
+		free_netdev(dev);
+		return ERR_PTR(rc);
+	}
+
+	/*
+	 * Upon success the caller must ensure ipoib_intf_free is called or
+	 * register_netdevice succeeded and priv_destructor is set to
+	 * ipoib_intf_free.
+	 */
+	return dev;
 }
 
 void ipoib_intf_free(struct net_device *dev)
@@ -2386,19 +2395,62 @@
 	return device_create_file(&dev->dev, &dev_attr_pkey);
 }
 
+/*
+ * We erroneously exposed the iface's port number in the dev_id
+ * sysfs field long after dev_port was introduced for that purpose[1],
+ * and we need to stop everyone from relying on that.
+ * Let's overload the show routine for the dev_id file here
+ * to gently bring the issue up.
+ *
+ * [1] https://www.spinics.net/lists/netdev/msg272123.html
+ */
+static ssize_t dev_id_show(struct device *dev,
+			   struct device_attribute *attr, char *buf)
+{
+	struct net_device *ndev = to_net_dev(dev);
+
+	/*
+	 * ndev->dev_port will be equal to 0 in old kernels prior to commit
+	 * 9b8b2a323008 ("IB/ipoib: Use dev_port to expose network interface
+	 * port numbers"). Zero was chosen as a special case for user space
+	 * applications to fall back and query dev_id to check if it has a
+	 * different value or not.
+	 *
+	 * Don't print warning in such scenario.
+	 *
+	 * https://github.com/systemd/systemd/blob/master/src/udev/udev-builtin-net_id.c#L358
+	 */
+	if (ndev->dev_port && ndev->dev_id == ndev->dev_port)
+		netdev_info_once(ndev,
+			"\"%s\" wants to know my dev_id. Should it look at dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more info.\n",
+			current->comm);
+
+	return sprintf(buf, "%#x\n", ndev->dev_id);
+}
+static DEVICE_ATTR_RO(dev_id);
+
+static int ipoib_intercept_dev_id_attr(struct net_device *dev)
+{
+	device_remove_file(&dev->dev, &dev_attr_dev_id);
+	return device_create_file(&dev->dev, &dev_attr_dev_id);
+}
+
 static struct net_device *ipoib_add_port(const char *format,
 					 struct ib_device *hca, u8 port)
 {
+	struct rtnl_link_ops *ops = ipoib_get_link_ops();
+	struct rdma_netdev_alloc_params params;
 	struct ipoib_dev_priv *priv;
 	struct net_device *ndev;
 	int result;
 
-	priv = ipoib_intf_alloc(hca, port, format);
-	if (!priv) {
-		pr_warn("%s, %d: ipoib_intf_alloc failed\n", hca->name, port);
-		return ERR_PTR(-ENOMEM);
+	ndev = ipoib_intf_alloc(hca, port, format);
+	if (IS_ERR(ndev)) {
+		pr_warn("%s, %d: ipoib_intf_alloc failed %ld\n", hca->name, port,
+			PTR_ERR(ndev));
+		return ndev;
 	}
-	ndev = priv->dev;
+	priv = ipoib_priv(ndev);
 
 	INIT_IB_EVENT_HANDLER(&priv->event_handler,
 			      priv->ca, ipoib_event);
@@ -2419,6 +2471,14 @@
 		return ERR_PTR(result);
 	}
 
+	if (hca->ops.rdma_netdev_get_params) {
+		int rc = hca->ops.rdma_netdev_get_params(hca, port,
+						     RDMA_NETDEV_IPOIB,
+						     &params);
+
+		if (!rc && ops->priv_size < params.sizeof_priv)
+			ops->priv_size = params.sizeof_priv;
+	}
 	/*
 	 * We cannot set priv_destructor before register_netdev because we
 	 * need priv to be always valid during the error flow to execute
@@ -2427,6 +2487,8 @@
 	 */
 	ndev->priv_destructor = ipoib_intf_free;
 
+	if (ipoib_intercept_dev_id_attr(ndev))
+		goto sysfs_failed;
 	if (ipoib_cm_add_mode_attr(ndev))
 		goto sysfs_failed;
 	if (ipoib_add_pkey_attr(ndev))
@@ -2451,7 +2513,7 @@
 	struct list_head *dev_list;
 	struct net_device *dev;
 	struct ipoib_dev_priv *priv;
-	int p;
+	unsigned int p;
 	int count = 0;
 
 	dev_list = kmalloc(sizeof(*dev_list), GFP_KERNEL);
@@ -2460,7 +2522,7 @@
 
 	INIT_LIST_HEAD(dev_list);
 
-	for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
+	rdma_for_each_port (device, p) {
 		if (!rdma_protocol_ib(device, p))
 			continue;
 		dev = ipoib_add_port("ib%d", device, p);
@@ -2533,9 +2595,7 @@
 	 */
 	BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);
 
-	ret = ipoib_register_debugfs();
-	if (ret)
-		return ret;
+	ipoib_register_debugfs();
 
 	/*
 	 * We create a global workqueue here that is used for all flush
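
ipoib_intf_alloc() now reports failure through the returned pointer (ERR_PTR) instead of NULL, so callers propagate a real errno and own the netdev until either free_netdev() runs or priv_destructor takes over. A caller-side sketch of the idiom, mirroring ipoib_add_port() above:

	ndev = ipoib_intf_alloc(hca, port, format);
	if (IS_ERR(ndev))
		return PTR_ERR(ndev);	/* or hand the ERR_PTR straight up */

	/* later failure paths must free_netdev(ndev), or rely on
	 * ndev->priv_destructor == ipoib_intf_free once it has been set */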
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
index d4d553a..38c984d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
@@ -122,12 +122,26 @@
 	} else
 		child_pkey  = nla_get_u16(data[IFLA_IPOIB_PKEY]);
 
+	err = ipoib_intf_init(ppriv->ca, ppriv->port, dev->name, dev);
+	if (err) {
+		ipoib_warn(ppriv, "failed to initialize pkey device\n");
+		return err;
+	}
+
 	err = __ipoib_vlan_add(ppriv, ipoib_priv(dev),
 			       child_pkey, IPOIB_RTNL_CHILD);
+	if (err)
+		return err;
 
-	if (!err && data)
+	if (data) {
 		err = ipoib_changelink(dev, tb, data, extack);
-	return err;
+		if (err) {
+			unregister_netdevice(dev);
+			return err;
+		}
+	}
+
+	return 0;
 }
 
 static size_t ipoib_get_size(const struct net_device *dev)
@@ -149,6 +163,11 @@
 	.fill_info	= ipoib_fill_info,
 };
 
+struct rtnl_link_ops *ipoib_get_link_ops(void)
+{
+	return &ipoib_link_ops;
+}
+
 int __init ipoib_netlink_init(void)
 {
 	return rtnl_link_register(&ipoib_link_ops);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 9f36ca7..b69304d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -260,11 +260,8 @@
 		priv->qp = NULL;
 	}
 
-	if (ib_destroy_cq(priv->send_cq))
-		ipoib_warn(priv, "ib_cq_destroy (send) failed\n");
-
-	if (ib_destroy_cq(priv->recv_cq))
-		ipoib_warn(priv, "ib_cq_destroy (recv) failed\n");
+	ib_destroy_cq(priv->send_cq);
+	ib_destroy_cq(priv->recv_cq);
 }
 
 void ipoib_event(struct ib_event_handler *handler,
@@ -277,10 +274,9 @@
 		return;
 
 	ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event,
-		  record->device->name, record->element.port_num);
+		  dev_name(&record->device->dev), record->element.port_num);
 
-	if (record->event == IB_EVENT_SM_CHANGE ||
-	    record->event == IB_EVENT_CLIENT_REREGISTER) {
+	if (record->event == IB_EVENT_CLIENT_REREGISTER) {
 		queue_work(ipoib_workqueue, &priv->flush_light);
 	} else if (record->event == IB_EVENT_PORT_ERR ||
 		   record->event == IB_EVENT_PORT_ACTIVE ||
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index 341753f..8ac8e18 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -85,7 +85,7 @@
 
 /*
  * NOTE: If this function fails then the priv->dev will remain valid, however
- * priv can have been freed and must not be touched by caller in the error
+ * priv will have been freed and must not be touched by caller in the error
  * case.
  *
  * If (ndev->reg_state == NETREG_UNINITIALIZED) then it is up to the caller to
@@ -101,6 +101,12 @@
 	ASSERT_RTNL();
 
 	/*
+	 * We do not need to touch priv if register_netdevice fails, so just
+	 * always use this flow.
+	 */
+	ndev->priv_destructor = ipoib_intf_free;
+
+	/*
 	 * Racing with unregister of the parent must be prevented by the
 	 * caller.
 	 */
@@ -120,9 +126,6 @@
 		goto out_early;
 	}
 
-	/* We do not need to touch priv if register_netdevice fails */
-	ndev->priv_destructor = ipoib_intf_free;
-
 	result = register_netdevice(ndev);
 	if (result) {
 		ipoib_warn(priv, "failed to initialize; error %i", result);
@@ -182,12 +185,12 @@
 	snprintf(intf_name, sizeof(intf_name), "%s.%04x",
 		 ppriv->dev->name, pkey);
 
-	priv = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name);
-	if (!priv) {
-		result = -ENOMEM;
+	ndev = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name);
+	if (IS_ERR(ndev)) {
+		result = PTR_ERR(ndev);
 		goto out;
 	}
-	ndev = priv->dev;
+	priv = ipoib_priv(ndev);
 
 	result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD);
 
diff --git a/drivers/infiniband/ulp/iser/Kconfig b/drivers/infiniband/ulp/iser/Kconfig
index d00af71..1d29dff 100644
--- a/drivers/infiniband/ulp/iser/Kconfig
+++ b/drivers/infiniband/ulp/iser/Kconfig
@@ -1,11 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_ISER
 	tristate "iSCSI Extensions for RDMA (iSER)"
 	depends on SCSI && INET && INFINIBAND_ADDR_TRANS
 	select SCSI_ISCSI_ATTRS
 	---help---
 	  Support for the iSCSI Extensions for RDMA (iSER) Protocol
-          over InfiniBand. This allows you to access storage devices
-          that speak iSCSI over iSER over InfiniBand.
+	  over InfiniBand. This allows you to access storage devices
+	  that speak iSCSI over iSER over InfiniBand.
 
 	  The iSER protocol is defined by IETF.
 	  See <http://www.ietf.org/rfc/rfc5046.txt>
diff --git a/drivers/infiniband/ulp/iser/Makefile b/drivers/infiniband/ulp/iser/Makefile
index fe6cd15..2f3e788 100644
--- a/drivers/infiniband/ulp/iser/Makefile
+++ b/drivers/infiniband/ulp/iser/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_INFINIBAND_ISER)	+= ib_iser.o
 
 ib_iser-y			:= iser_verbs.o iser_initiator.o iser_memory.o \
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 3fecd87..2e72fc5 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -205,7 +205,8 @@
 		goto out;
 	}
 
-	tx_desc->wr_idx = 0;
+	tx_desc->inv_wr.next = NULL;
+	tx_desc->reg_wr.wr.next = NULL;
 	tx_desc->mapped = true;
 	tx_desc->dma_addr = dma_addr;
 	tx_desc->tx_sg[0].addr   = tx_desc->dma_addr;
@@ -406,13 +407,10 @@
 iscsi_iser_check_protection(struct iscsi_task *task, sector_t *sector)
 {
 	struct iscsi_iser_task *iser_task = task->dd_data;
+	enum iser_data_dir dir = iser_task->dir[ISER_DIR_IN] ?
+					ISER_DIR_IN : ISER_DIR_OUT;
 
-	if (iser_task->dir[ISER_DIR_IN])
-		return iser_check_task_pi_status(iser_task, ISER_DIR_IN,
-						 sector);
-	else
-		return iser_check_task_pi_status(iser_task, ISER_DIR_OUT,
-						 sector);
+	return iser_check_task_pi_status(iser_task, dir, sector);
 }
 
 /**
@@ -613,6 +611,7 @@
 	struct Scsi_Host *shost;
 	struct iser_conn *iser_conn = NULL;
 	struct ib_conn *ib_conn;
+	struct ib_device *ib_dev;
 	u32 max_fr_sectors;
 
 	shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0);
@@ -643,16 +642,19 @@
 		}
 
 		ib_conn = &iser_conn->ib_conn;
+		ib_dev = ib_conn->device->ib_device;
 		if (ib_conn->pi_support) {
-			u32 sig_caps = ib_conn->device->ib_device->attrs.sig_prot_cap;
+			u32 sig_caps = ib_dev->attrs.sig_prot_cap;
 
 			scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps));
 			scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP |
 						   SHOST_DIX_GUARD_CRC);
 		}
 
-		if (iscsi_host_add(shost,
-				   ib_conn->device->ib_device->dev.parent)) {
+		if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG))
+			shost->virt_boundary_mask = ~MASK_4K;
+
+		if (iscsi_host_add(shost, ib_dev->dev.parent)) {
 			mutex_unlock(&iser_conn->state_mutex);
 			goto free_host;
 		}
@@ -763,7 +765,6 @@
 				   enum iscsi_param param, char *buf)
 {
 	struct iser_conn *iser_conn = ep->dd_data;
-	int len;
 
 	switch (param) {
 	case ISCSI_PARAM_CONN_PORT:
@@ -774,12 +775,10 @@
 		return iscsi_conn_get_addr_param((struct sockaddr_storage *)
 				&iser_conn->ib_conn.cma_id->route.addr.dst_addr,
 				param, buf);
-		break;
 	default:
-		return -ENOSYS;
+		break;
 	}
-
-	return len;
+	return -ENOSYS;
 }
 
 /**
@@ -961,30 +960,6 @@
 	return 0;
 }
 
-static int iscsi_iser_slave_alloc(struct scsi_device *sdev)
-{
-	struct iscsi_session *session;
-	struct iser_conn *iser_conn;
-	struct ib_device *ib_dev;
-
-	mutex_lock(&unbind_iser_conn_mutex);
-
-	session = starget_to_session(scsi_target(sdev))->dd_data;
-	iser_conn = session->leadconn->dd_data;
-	if (!iser_conn) {
-		mutex_unlock(&unbind_iser_conn_mutex);
-		return -ENOTCONN;
-	}
-	ib_dev = iser_conn->ib_conn.device->ib_device;
-
-	if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG))
-		blk_queue_virt_boundary(sdev->request_queue, ~MASK_4K);
-
-	mutex_unlock(&unbind_iser_conn_mutex);
-
-	return 0;
-}
-
 static struct scsi_host_template iscsi_iser_sht = {
 	.module                 = THIS_MODULE,
 	.name                   = "iSCSI Initiator over iSER",
@@ -997,8 +972,6 @@
 	.eh_device_reset_handler= iscsi_eh_device_reset,
 	.eh_target_reset_handler = iscsi_eh_recover_target,
 	.target_alloc		= iscsi_target_alloc,
-	.use_clustering         = ENABLE_CLUSTERING,
-	.slave_alloc            = iscsi_iser_slave_alloc,
 	.proc_name              = "iscsi_iser",
 	.this_id                = -1,
 	.track_queue_depth	= 1,
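
The per-device ->slave_alloc hook that called blk_queue_virt_boundary() is replaced by setting virt_boundary_mask once on the Scsi_Host before it is added: the block layer then avoids building SG elements that cross the 4K boundary for every device behind that host. The essential shape, with ib_dev and shost as in the hunk above:

	/* devices without SG_GAPS support cannot register SGs with gaps,
	 * so constrain segments to 4K boundaries up front */
	if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG))
		shost->virt_boundary_mask = ~MASK_4K;

	if (iscsi_host_add(shost, ib_dev->dev.parent))
		goto free_host;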
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 120b408..52ce635 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -102,9 +102,10 @@
 
 /* Default support is 512KB I/O size */
 #define ISER_DEF_MAX_SECTORS		1024
-#define ISCSI_ISER_DEF_SG_TABLESIZE	((ISER_DEF_MAX_SECTORS * 512) >> SHIFT_4K)
-/* Maximum support is 8MB I/O size */
-#define ISCSI_ISER_MAX_SG_TABLESIZE	((16384 * 512) >> SHIFT_4K)
+#define ISCSI_ISER_DEF_SG_TABLESIZE                                            \
+	((ISER_DEF_MAX_SECTORS * SECTOR_SIZE) >> SHIFT_4K)
+/* Maximum support is 16MB I/O size */
+#define ISCSI_ISER_MAX_SG_TABLESIZE	((32768 * SECTOR_SIZE) >> SHIFT_4K)
 
 #define ISER_DEF_XMIT_CMDS_DEFAULT		512
 #if ISCSI_DEF_XMIT_CMDS_MAX > ISER_DEF_XMIT_CMDS_DEFAULT
@@ -197,7 +198,7 @@
 	struct scatterlist *sg;
 	int                size;
 	unsigned long      data_len;
-	unsigned int       dma_nents;
+	int                dma_nents;
 };
 
 /* fwd declarations */
@@ -225,14 +226,6 @@
 	ISCSI_TX_DATAOUT
 };
 
-/* Maximum number of work requests per task:
- * Data memory region local invalidate + fast registration
- * Protection memory region local invalidate + fast registration
- * Signature memory region local invalidate + fast registration
- * PDU send
- */
-#define ISER_MAX_WRS 7
-
 /**
  * struct iser_tx_desc - iSER TX descriptor
  *
@@ -245,11 +238,9 @@
  *                 unsolicited data-out or control
  * @num_sge:       number sges used on this TX task
  * @mapped:        Is the task header mapped
- * @wr_idx:        Current WR index
- * @wrs:           Array of WRs per task
- * @data_reg:      Data buffer registration details
- * @prot_reg:      Protection buffer registration details
- * @sig_attrs:     Signature attributes
+ * @reg_wr:        registration WR
+ * @send_wr:       send WR
+ * @inv_wr:        invalidate WR
  */
 struct iser_tx_desc {
 	struct iser_ctrl             iser_header;
@@ -260,15 +251,9 @@
 	int                          num_sge;
 	struct ib_cqe		     cqe;
 	bool			     mapped;
-	u8                           wr_idx;
-	union iser_wr {
-		struct ib_send_wr		send;
-		struct ib_reg_wr		fast_reg;
-		struct ib_sig_handover_wr	sig;
-	} wrs[ISER_MAX_WRS];
-	struct iser_mem_reg          data_reg;
-	struct iser_mem_reg          prot_reg;
-	struct ib_sig_attrs          sig_attrs;
+	struct ib_reg_wr	     reg_wr;
+	struct ib_send_wr	     send_wr;
+	struct ib_send_wr	     inv_wr;
 };
 
 #define ISER_RX_PAD_SIZE	(256 - (ISER_RX_PAYLOAD_SIZE + \
@@ -311,7 +296,7 @@
 	u64                          rsp_dma;
 	struct ib_sge                sge;
 	struct ib_cqe		     cqe;
-} __attribute__((packed));
+} __packed;
 
 struct iser_conn;
 struct ib_conn;
@@ -388,6 +373,7 @@
  *
  * @mr:         memory region
  * @fmr_pool:   pool of fmrs
+ * @sig_mr:     signature memory region
  * @page_vec:   fast reg page list used by fmr pool
  * @mr_valid:   is mr valid indicator
  */
@@ -396,36 +382,22 @@
 		struct ib_mr             *mr;
 		struct ib_fmr_pool       *fmr_pool;
 	};
+	struct ib_mr                     *sig_mr;
 	struct iser_page_vec             *page_vec;
 	u8				  mr_valid:1;
 };
 
 /**
- * struct iser_pi_context - Protection information context
- *
- * @rsc:             protection buffer registration resources
- * @sig_mr:          signature enable memory region
- * @sig_mr_valid:    is sig_mr valid indicator
- * @sig_protected:   is region protected indicator
- */
-struct iser_pi_context {
-	struct iser_reg_resources	rsc;
-	struct ib_mr                   *sig_mr;
-	u8                              sig_mr_valid:1;
-	u8                              sig_protected:1;
-};
-
-/**
  * struct iser_fr_desc - Fast registration descriptor
  *
  * @list:           entry in connection fastreg pool
  * @rsc:            data buffer registration resources
- * @pi_ctx:         protection information context
+ * @sig_protected:  is region protected indicator
  */
 struct iser_fr_desc {
 	struct list_head		  list;
 	struct iser_reg_resources	  rsc;
-	struct iser_pi_context		 *pi_ctx;
+	bool				  sig_protected;
 	struct list_head                  all_list;
 };
 
@@ -674,21 +646,6 @@
 iser_reg_desc_put_fmr(struct ib_conn *ib_conn,
 		      struct iser_fr_desc *desc);
 
-static inline struct ib_send_wr *
-iser_tx_next_wr(struct iser_tx_desc *tx_desc)
-{
-	struct ib_send_wr *cur_wr = &tx_desc->wrs[tx_desc->wr_idx].send;
-	struct ib_send_wr *last_wr;
-
-	if (tx_desc->wr_idx) {
-		last_wr = &tx_desc->wrs[tx_desc->wr_idx - 1].send;
-		last_wr->next = cur_wr;
-	}
-	tx_desc->wr_idx++;
-
-	return cur_wr;
-}
-
 static inline struct iser_conn *
 to_iser_conn(struct ib_conn *ib_conn)
 {
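
With the wrs[] array and wr_idx cursor gone, a TX descriptor carries at most three fixed work requests that are chained explicitly when needed: an optional local invalidate, an optional registration, and the final send. A sketch of how the chain is walked when posting, matching the iser_post_send() hunk later in this patch:

	/* chain built by the registration path:
	 *   inv_wr (optional) -> reg_wr.wr (optional) -> send_wr (always)
	 */
	if (tx_desc->inv_wr.next)
		first_wr = &tx_desc->inv_wr;
	else if (tx_desc->reg_wr.wr.next)
		first_wr = &tx_desc->reg_wr.wr;
	else
		first_wr = &tx_desc->send_wr;
	ib_post_send(ib_conn->qp, first_wr, NULL);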
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index 2f63885..5cbb4b3 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -589,13 +589,18 @@
 	ib_conn->post_recv_buf_count--;
 }
 
-static inline void
+static inline int
 iser_inv_desc(struct iser_fr_desc *desc, u32 rkey)
 {
-	if (likely(rkey == desc->rsc.mr->rkey))
-		desc->rsc.mr_valid = 0;
-	else if (likely(rkey == desc->pi_ctx->sig_mr->rkey))
-		desc->pi_ctx->sig_mr_valid = 0;
+	if (unlikely((!desc->sig_protected && rkey != desc->rsc.mr->rkey) ||
+		     (desc->sig_protected && rkey != desc->rsc.sig_mr->rkey))) {
+		iser_err("Bogus remote invalidation for rkey %#x\n", rkey);
+		return -EINVAL;
+	}
+
+	desc->rsc.mr_valid = 0;
+
+	return 0;
 }
 
 static int
@@ -623,12 +628,14 @@
 
 			if (iser_task->dir[ISER_DIR_IN]) {
 				desc = iser_task->rdma_reg[ISER_DIR_IN].mem_h;
-				iser_inv_desc(desc, rkey);
+				if (unlikely(iser_inv_desc(desc, rkey)))
+					return -EINVAL;
 			}
 
 			if (iser_task->dir[ISER_DIR_OUT]) {
 				desc = iser_task->rdma_reg[ISER_DIR_OUT].mem_h;
-				iser_inv_desc(desc, rkey);
+				if (unlikely(iser_inv_desc(desc, rkey)))
+					return -EINVAL;
 			}
 		} else {
 			iser_err("failed to get task for itt=%d\n", hdr->itt);
@@ -742,6 +749,9 @@
 	iser_task->prot[ISER_DIR_IN].data_len  = 0;
 	iser_task->prot[ISER_DIR_OUT].data_len = 0;
 
+	iser_task->prot[ISER_DIR_IN].dma_nents = 0;
+	iser_task->prot[ISER_DIR_OUT].dma_nents = 0;
+
 	memset(&iser_task->rdma_reg[ISER_DIR_IN], 0,
 	       sizeof(struct iser_mem_reg));
 	memset(&iser_task->rdma_reg[ISER_DIR_OUT], 0,
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 009be88..2cc89a9 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -77,8 +77,8 @@
 	struct ib_device *ib_dev = device->ib_device;
 
 	/* Assign function handles  - based on FMR support */
-	if (ib_dev->alloc_fmr && ib_dev->dealloc_fmr &&
-	    ib_dev->map_phys_fmr && ib_dev->unmap_fmr) {
+	if (ib_dev->ops.alloc_fmr && ib_dev->ops.dealloc_fmr &&
+	    ib_dev->ops.map_phys_fmr && ib_dev->ops.unmap_fmr) {
 		iser_info("FMR supported, using FMR for registration\n");
 		device->reg_ops = &fmr_ops;
 	} else if (ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
@@ -145,9 +145,8 @@
 	for_each_sg(data->sg, sg, data->dma_nents, i)
 		iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p "
 			 "off:0x%x sz:0x%x dma_len:0x%x\n",
-			 i, (unsigned long)ib_sg_dma_address(ibdev, sg),
-			 sg_page(sg), sg->offset,
-			 sg->length, ib_sg_dma_len(ibdev, sg));
+			 i, (unsigned long)sg_dma_address(sg),
+			 sg_page(sg), sg->offset, sg->length, sg_dma_len(sg));
 }
 
 static void iser_dump_page_vec(struct iser_page_vec *page_vec)
@@ -204,8 +203,8 @@
 		reg->rkey = device->pd->unsafe_global_rkey;
 	else
 		reg->rkey = 0;
-	reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]);
-	reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]);
+	reg->sge.addr = sg_dma_address(&sg[0]);
+	reg->sge.length = sg_dma_len(&sg[0]);
 
 	iser_dbg("Single DMA entry: lkey=0x%x, rkey=0x%x, addr=0x%llx,"
 		 " length=0x%x\n", reg->sge.lkey, reg->rkey,
@@ -240,8 +239,8 @@
 	page_vec->npages = 0;
 	page_vec->fake_mr.page_size = SIZE_4K;
 	plen = ib_sg_to_pages(&page_vec->fake_mr, mem->sg,
-			      mem->size, NULL, iser_set_page);
-	if (unlikely(plen < mem->size)) {
+			      mem->dma_nents, NULL, iser_set_page);
+	if (unlikely(plen < mem->dma_nents)) {
 		iser_err("page vec too short to hold this SG\n");
 		iser_data_buf_dump(mem, device->ib_device);
 		iser_dump_page_vec(page_vec);
@@ -277,16 +276,13 @@
 			enum iser_data_dir cmd_dir)
 {
 	struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir];
-	int ret;
 
 	if (!reg->mem_h)
 		return;
 
 	iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n", reg->mem_h);
 
-	ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h);
-	if (ret)
-		iser_err("ib_fmr_pool_unmap failed %d\n", ret);
+	ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h);
 
 	reg->mem_h = NULL;
 }
@@ -306,8 +302,7 @@
 }
 
 static void
-iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
-		    struct ib_sig_domain *domain)
+iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_domain *domain)
 {
 	domain->sig_type = IB_SIG_TYPE_T10_DIF;
 	domain->sig.dif.pi_interval = scsi_prot_interval(sc);
@@ -330,21 +325,21 @@
 	case SCSI_PROT_WRITE_INSERT:
 	case SCSI_PROT_READ_STRIP:
 		sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
-		iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire);
+		iser_set_dif_domain(sc, &sig_attrs->wire);
 		sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC;
 		break;
 	case SCSI_PROT_READ_INSERT:
 	case SCSI_PROT_WRITE_STRIP:
 		sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE;
-		iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem);
+		iser_set_dif_domain(sc, &sig_attrs->mem);
 		sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ?
 						IB_T10DIF_CSUM : IB_T10DIF_CRC;
 		break;
 	case SCSI_PROT_READ_PASS:
 	case SCSI_PROT_WRITE_PASS:
-		iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire);
+		iser_set_dif_domain(sc, &sig_attrs->wire);
 		sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC;
-		iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem);
+		iser_set_dif_domain(sc, &sig_attrs->mem);
 		sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ?
 						IB_T10DIF_CSUM : IB_T10DIF_CRC;
 		break;
@@ -370,27 +365,29 @@
 static inline void
 iser_inv_rkey(struct ib_send_wr *inv_wr,
 	      struct ib_mr *mr,
-	      struct ib_cqe *cqe)
+	      struct ib_cqe *cqe,
+	      struct ib_send_wr *next_wr)
 {
 	inv_wr->opcode = IB_WR_LOCAL_INV;
 	inv_wr->wr_cqe = cqe;
 	inv_wr->ex.invalidate_rkey = mr->rkey;
 	inv_wr->send_flags = 0;
 	inv_wr->num_sge = 0;
+	inv_wr->next = next_wr;
 }
 
 static int
 iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
-		struct iser_pi_context *pi_ctx,
-		struct iser_mem_reg *data_reg,
-		struct iser_mem_reg *prot_reg,
+		struct iser_data_buf *mem,
+		struct iser_data_buf *sig_mem,
+		struct iser_reg_resources *rsc,
 		struct iser_mem_reg *sig_reg)
 {
 	struct iser_tx_desc *tx_desc = &iser_task->desc;
-	struct ib_sig_attrs *sig_attrs = &tx_desc->sig_attrs;
 	struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe;
-	struct ib_sig_handover_wr *wr;
-	struct ib_mr *mr = pi_ctx->sig_mr;
+	struct ib_mr *mr = rsc->sig_mr;
+	struct ib_sig_attrs *sig_attrs = mr->sig_attrs;
+	struct ib_reg_wr *wr = &tx_desc->reg_wr;
 	int ret;
 
 	memset(sig_attrs, 0, sizeof(*sig_attrs));
@@ -400,33 +397,36 @@
 
 	iser_set_prot_checks(iser_task->sc, &sig_attrs->check_mask);
 
-	if (pi_ctx->sig_mr_valid)
-		iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe);
+	if (rsc->mr_valid)
+		iser_inv_rkey(&tx_desc->inv_wr, mr, cqe, &wr->wr);
 
 	ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey));
 
-	wr = container_of(iser_tx_next_wr(tx_desc), struct ib_sig_handover_wr,
-			  wr);
-	wr->wr.opcode = IB_WR_REG_SIG_MR;
+	ret = ib_map_mr_sg_pi(mr, mem->sg, mem->dma_nents, NULL,
+			      sig_mem->sg, sig_mem->dma_nents, NULL, SZ_4K);
+	if (unlikely(ret)) {
+		iser_err("failed to map PI sg (%d)\n",
+			 mem->dma_nents + sig_mem->dma_nents);
+		goto err;
+	}
+
+	memset(wr, 0, sizeof(*wr));
+	wr->wr.next = &tx_desc->send_wr;
+	wr->wr.opcode = IB_WR_REG_MR_INTEGRITY;
 	wr->wr.wr_cqe = cqe;
-	wr->wr.sg_list = &data_reg->sge;
-	wr->wr.num_sge = 1;
+	wr->wr.num_sge = 0;
 	wr->wr.send_flags = 0;
-	wr->sig_attrs = sig_attrs;
-	wr->sig_mr = mr;
-	if (scsi_prot_sg_count(iser_task->sc))
-		wr->prot = &prot_reg->sge;
-	else
-		wr->prot = NULL;
-	wr->access_flags = IB_ACCESS_LOCAL_WRITE |
-			   IB_ACCESS_REMOTE_READ |
-			   IB_ACCESS_REMOTE_WRITE;
-	pi_ctx->sig_mr_valid = 1;
+	wr->mr = mr;
+	wr->key = mr->rkey;
+	wr->access = IB_ACCESS_LOCAL_WRITE |
+		     IB_ACCESS_REMOTE_READ |
+		     IB_ACCESS_REMOTE_WRITE;
+	rsc->mr_valid = 1;
 
 	sig_reg->sge.lkey = mr->lkey;
 	sig_reg->rkey = mr->rkey;
-	sig_reg->sge.addr = 0;
-	sig_reg->sge.length = scsi_transfer_length(iser_task->sc);
+	sig_reg->sge.addr = mr->iova;
+	sig_reg->sge.length = mr->length;
 
 	iser_dbg("lkey=0x%x rkey=0x%x addr=0x%llx length=%u\n",
 		 sig_reg->sge.lkey, sig_reg->rkey, sig_reg->sge.addr,
@@ -443,22 +443,22 @@
 	struct iser_tx_desc *tx_desc = &iser_task->desc;
 	struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe;
 	struct ib_mr *mr = rsc->mr;
-	struct ib_reg_wr *wr;
+	struct ib_reg_wr *wr = &tx_desc->reg_wr;
 	int n;
 
 	if (rsc->mr_valid)
-		iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe);
+		iser_inv_rkey(&tx_desc->inv_wr, mr, cqe, &wr->wr);
 
 	ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey));
 
-	n = ib_map_mr_sg(mr, mem->sg, mem->size, NULL, SIZE_4K);
-	if (unlikely(n != mem->size)) {
+	n = ib_map_mr_sg(mr, mem->sg, mem->dma_nents, NULL, SIZE_4K);
+	if (unlikely(n != mem->dma_nents)) {
 		iser_err("failed to map sg (%d/%d)\n",
-			 n, mem->size);
+			 n, mem->dma_nents);
 		return n < 0 ? n : -EINVAL;
 	}
 
-	wr = container_of(iser_tx_next_wr(tx_desc), struct ib_reg_wr, wr);
+	wr->wr.next = &tx_desc->send_wr;
 	wr->wr.opcode = IB_WR_REG_MR;
 	wr->wr.wr_cqe = cqe;
 	wr->wr.send_flags = 0;
@@ -483,21 +483,6 @@
 }
 
 static int
-iser_reg_prot_sg(struct iscsi_iser_task *task,
-		 struct iser_data_buf *mem,
-		 struct iser_fr_desc *desc,
-		 bool use_dma_key,
-		 struct iser_mem_reg *reg)
-{
-	struct iser_device *device = task->iser_conn->ib_conn.device;
-
-	if (use_dma_key)
-		return iser_reg_dma(device, mem, reg);
-
-	return device->reg_ops->reg_mem(task, mem, &desc->pi_ctx->rsc, reg);
-}
-
-static int
 iser_reg_data_sg(struct iscsi_iser_task *task,
 		 struct iser_data_buf *mem,
 		 struct iser_fr_desc *desc,
@@ -520,7 +505,6 @@
 	struct iser_device *device = ib_conn->device;
 	struct iser_data_buf *mem = &task->data[dir];
 	struct iser_mem_reg *reg = &task->rdma_reg[dir];
-	struct iser_mem_reg *data_reg;
 	struct iser_fr_desc *desc = NULL;
 	bool use_dma_key;
 	int err;
@@ -533,32 +517,17 @@
 		reg->mem_h = desc;
 	}
 
-	if (scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL)
-		data_reg = reg;
-	else
-		data_reg = &task->desc.data_reg;
-
-	err = iser_reg_data_sg(task, mem, desc, use_dma_key, data_reg);
-	if (unlikely(err))
-		goto err_reg;
-
-	if (scsi_get_prot_op(task->sc) != SCSI_PROT_NORMAL) {
-		struct iser_mem_reg *prot_reg = &task->desc.prot_reg;
-
-		if (scsi_prot_sg_count(task->sc)) {
-			mem = &task->prot[dir];
-			err = iser_reg_prot_sg(task, mem, desc,
-					       use_dma_key, prot_reg);
-			if (unlikely(err))
-				goto err_reg;
-		}
-
-		err = iser_reg_sig_mr(task, desc->pi_ctx, data_reg,
-				      prot_reg, reg);
+	if (scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL) {
+		err = iser_reg_data_sg(task, mem, desc, use_dma_key, reg);
+		if (unlikely(err))
+			goto err_reg;
+	} else {
+		err = iser_reg_sig_mr(task, mem, &task->prot[dir],
+				      &desc->rsc, reg);
 		if (unlikely(err))
 			goto err_reg;
 
-		desc->pi_ctx->sig_protected = 1;
+		desc->sig_protected = 1;
 	}
 
 	return 0;
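
The old IB_WR_REG_SIG_MR handover (separate data, protection and signature registrations) is replaced by the integrity-MR flow: the data and protection scatterlists are mapped into one MR with ib_map_mr_sg_pi() and registered with a single IB_WR_REG_MR_INTEGRITY work request. A compressed sketch of the core calls, error handling elided and variables as in iser_reg_sig_mr() above:

	ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey));

	ret = ib_map_mr_sg_pi(mr, mem->sg, mem->dma_nents, NULL,
			      sig_mem->sg, sig_mem->dma_nents, NULL, SZ_4K);

	/* one WR now covers data + protection; mr->sig_attrs holds the
	 * T10-DIF domains filled in just before this point */
	wr->wr.opcode = IB_WR_REG_MR_INTEGRITY;
	wr->mr = mr;
	wr->key = mr->rkey;
	wr->access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
		     IB_ACCESS_REMOTE_WRITE;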
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index bee8c0b..a6548de 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -55,7 +55,7 @@
 {
 	iser_err("async event %s (%d) on device %s port %d\n",
 		 ib_event_msg(event->event), event->event,
-		 event->device->name, event->element.port_num);
+		dev_name(&event->device->dev), event->element.port_num);
 }
 
 /**
@@ -85,7 +85,7 @@
 	max_cqe = min(ISER_MAX_CQ_LEN, ib_dev->attrs.max_cqe);
 
 	iser_info("using %d CQs, device %s supports %d vectors max_cqe %d\n",
-		  device->comps_used, ib_dev->name,
+		  device->comps_used, dev_name(&ib_dev->dev),
 		  ib_dev->num_comp_vectors, max_cqe);
 
 	device->pd = ib_alloc_pd(ib_dev,
@@ -233,85 +233,6 @@
 	kfree(desc);
 }
 
-static int
-iser_alloc_reg_res(struct iser_device *device,
-		   struct ib_pd *pd,
-		   struct iser_reg_resources *res,
-		   unsigned int size)
-{
-	struct ib_device *ib_dev = device->ib_device;
-	enum ib_mr_type mr_type;
-	int ret;
-
-	if (ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)
-		mr_type = IB_MR_TYPE_SG_GAPS;
-	else
-		mr_type = IB_MR_TYPE_MEM_REG;
-
-	res->mr = ib_alloc_mr(pd, mr_type, size);
-	if (IS_ERR(res->mr)) {
-		ret = PTR_ERR(res->mr);
-		iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
-		return ret;
-	}
-	res->mr_valid = 0;
-
-	return 0;
-}
-
-static void
-iser_free_reg_res(struct iser_reg_resources *rsc)
-{
-	ib_dereg_mr(rsc->mr);
-}
-
-static int
-iser_alloc_pi_ctx(struct iser_device *device,
-		  struct ib_pd *pd,
-		  struct iser_fr_desc *desc,
-		  unsigned int size)
-{
-	struct iser_pi_context *pi_ctx = NULL;
-	int ret;
-
-	desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL);
-	if (!desc->pi_ctx)
-		return -ENOMEM;
-
-	pi_ctx = desc->pi_ctx;
-
-	ret = iser_alloc_reg_res(device, pd, &pi_ctx->rsc, size);
-	if (ret) {
-		iser_err("failed to allocate reg_resources\n");
-		goto alloc_reg_res_err;
-	}
-
-	pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2);
-	if (IS_ERR(pi_ctx->sig_mr)) {
-		ret = PTR_ERR(pi_ctx->sig_mr);
-		goto sig_mr_failure;
-	}
-	pi_ctx->sig_mr_valid = 0;
-	desc->pi_ctx->sig_protected = 0;
-
-	return 0;
-
-sig_mr_failure:
-	iser_free_reg_res(&pi_ctx->rsc);
-alloc_reg_res_err:
-	kfree(desc->pi_ctx);
-
-	return ret;
-}
-
-static void
-iser_free_pi_ctx(struct iser_pi_context *pi_ctx)
-{
-	iser_free_reg_res(&pi_ctx->rsc);
-	ib_dereg_mr(pi_ctx->sig_mr);
-	kfree(pi_ctx);
-}
-
 static struct iser_fr_desc *
 iser_create_fastreg_desc(struct iser_device *device,
 			 struct ib_pd *pd,
@@ -319,32 +240,58 @@
 			 unsigned int size)
 {
 	struct iser_fr_desc *desc;
+	struct ib_device *ib_dev = device->ib_device;
+	enum ib_mr_type mr_type;
 	int ret;
 
 	desc = kzalloc(sizeof(*desc), GFP_KERNEL);
 	if (!desc)
 		return ERR_PTR(-ENOMEM);
 
-	ret = iser_alloc_reg_res(device, pd, &desc->rsc, size);
-	if (ret)
-		goto reg_res_alloc_failure;
+	if (ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)
+		mr_type = IB_MR_TYPE_SG_GAPS;
+	else
+		mr_type = IB_MR_TYPE_MEM_REG;
+
+	desc->rsc.mr = ib_alloc_mr(pd, mr_type, size);
+	if (IS_ERR(desc->rsc.mr)) {
+		ret = PTR_ERR(desc->rsc.mr);
+		iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
+		goto err_alloc_mr;
+	}
 
 	if (pi_enable) {
-		ret = iser_alloc_pi_ctx(device, pd, desc, size);
-		if (ret)
-			goto pi_ctx_alloc_failure;
+		desc->rsc.sig_mr = ib_alloc_mr_integrity(pd, size, size);
+		if (IS_ERR(desc->rsc.sig_mr)) {
+			ret = PTR_ERR(desc->rsc.sig_mr);
+			iser_err("Failed to allocate sig_mr err=%d\n", ret);
+			goto err_alloc_mr_integrity;
+		}
 	}
+	desc->rsc.mr_valid = 0;
 
 	return desc;
 
-pi_ctx_alloc_failure:
-	iser_free_reg_res(&desc->rsc);
-reg_res_alloc_failure:
+err_alloc_mr_integrity:
+	ib_dereg_mr(desc->rsc.mr);
+err_alloc_mr:
 	kfree(desc);
 
 	return ERR_PTR(ret);
 }
 
+static void iser_destroy_fastreg_desc(struct iser_fr_desc *desc)
+{
+	struct iser_reg_resources *res = &desc->rsc;
+
+	ib_dereg_mr(res->mr);
+	if (res->sig_mr) {
+		ib_dereg_mr(res->sig_mr);
+		res->sig_mr = NULL;
+	}
+	kfree(desc);
+}
+
 /**
  * iser_alloc_fastreg_pool - Creates pool of fast_reg descriptors
  * for fast registration work requests.
@@ -399,10 +346,7 @@
 
 	list_for_each_entry_safe(desc, tmp, &fr_pool->all_list, all_list) {
 		list_del(&desc->all_list);
-		iser_free_reg_res(&desc->rsc);
-		if (desc->pi_ctx)
-			iser_free_pi_ctx(desc->pi_ctx);
-		kfree(desc);
+		iser_destroy_fastreg_desc(desc);
 		++i;
 	}
 
@@ -455,7 +399,7 @@
 	init_attr.qp_type	= IB_QPT_RC;
 	if (ib_conn->pi_support) {
 		init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1;
-		init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN;
+		init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
 		iser_conn->max_cmds =
 			ISER_GET_MAX_XMIT_CMDS(ISER_QP_SIG_MAX_REQ_DTOS);
 	} else {
@@ -468,7 +412,8 @@
 			iser_conn->max_cmds =
 				ISER_GET_MAX_XMIT_CMDS(ib_dev->attrs.max_qp_wr);
 			iser_dbg("device %s supports max_send_wr %d\n",
-				 device->ib_device->name, ib_dev->attrs.max_qp_wr);
+				 dev_name(&device->ib_device->dev),
+				 ib_dev->attrs.max_qp_wr);
 		}
 	}
 
@@ -706,6 +651,7 @@
 	struct ib_device_attr *attr = &device->ib_device->attrs;
 	unsigned short sg_tablesize, sup_sg_tablesize;
 	unsigned short reserved_mr_pages;
+	u32 max_num_sg;
 
 	/*
 	 * FRs without SG_GAPS or FMRs can only map up to a (device) page per
@@ -719,12 +665,17 @@
 	else
 		reserved_mr_pages = 1;
 
+	if (iser_conn->ib_conn.pi_support)
+		max_num_sg = attr->max_pi_fast_reg_page_list_len;
+	else
+		max_num_sg = attr->max_fast_reg_page_list_len;
+
 	sg_tablesize = DIV_ROUND_UP(max_sectors * 512, SIZE_4K);
 	if (attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)
 		sup_sg_tablesize =
 			min_t(
 			 uint, ISCSI_ISER_MAX_SG_TABLESIZE,
-			 attr->max_fast_reg_page_list_len - reserved_mr_pages);
+			 max_num_sg - reserved_mr_pages);
 	else
 		sup_sg_tablesize = ISCSI_ISER_MAX_SG_TABLESIZE;
 
@@ -761,10 +712,10 @@
 	/* connection T10-PI support */
 	if (iser_pi_enable) {
 		if (!(device->ib_device->attrs.device_cap_flags &
-		      IB_DEVICE_SIGNATURE_HANDOVER)) {
+		      IB_DEVICE_INTEGRITY_HANDOVER)) {
 			iser_warn("T10-PI requested but not supported on %s, "
 				  "continue without T10-PI\n",
-				  ib_conn->device->ib_device->name);
+				  dev_name(&ib_conn->device->ib_device->dev));
 			ib_conn->pi_support = false;
 		} else {
 			ib_conn->pi_support = true;
@@ -1086,7 +1037,8 @@
 int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
 		   bool signal)
 {
-	struct ib_send_wr *wr = iser_tx_next_wr(tx_desc);
+	struct ib_send_wr *wr = &tx_desc->send_wr;
+	struct ib_send_wr *first_wr;
 	int ib_ret;
 
 	ib_dma_sync_single_for_device(ib_conn->device->ib_device,
@@ -1100,7 +1052,14 @@
 	wr->opcode = IB_WR_SEND;
 	wr->send_flags = signal ? IB_SEND_SIGNALED : 0;
 
-	ib_ret = ib_post_send(ib_conn->qp, &tx_desc->wrs[0].send, NULL);
+	if (tx_desc->inv_wr.next)
+		first_wr = &tx_desc->inv_wr;
+	else if (tx_desc->reg_wr.wr.next)
+		first_wr = &tx_desc->reg_wr.wr;
+	else
+		first_wr = wr;
+
+	ib_ret = ib_post_send(ib_conn->qp, first_wr, NULL);
 	if (ib_ret)
 		iser_err("ib_post_send failed, ret:%d opcode:%d\n",
 			 ib_ret, wr->opcode);
@@ -1117,9 +1076,9 @@
 	struct ib_mr_status mr_status;
 	int ret;
 
-	if (desc && desc->pi_ctx->sig_protected) {
-		desc->pi_ctx->sig_protected = 0;
-		ret = ib_check_mr_status(desc->pi_ctx->sig_mr,
+	if (desc && desc->sig_protected) {
+		desc->sig_protected = 0;
+		ret = ib_check_mr_status(desc->rsc.sig_mr,
 					 IB_MR_CHECK_SIG_STATUS, &mr_status);
 		if (ret) {
 			pr_err("ib_check_mr_status failed, ret %d\n", ret);
diff --git a/drivers/infiniband/ulp/isert/Kconfig b/drivers/infiniband/ulp/isert/Kconfig
index 02f9759..1a3f5ca 100644
--- a/drivers/infiniband/ulp/isert/Kconfig
+++ b/drivers/infiniband/ulp/isert/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_ISERT
 	tristate "iSCSI Extensions for RDMA (iSER) target support"
 	depends on INET && INFINIBAND_ADDR_TRANS && TARGET_CORE && ISCSI_TARGET
diff --git a/drivers/infiniband/ulp/isert/Makefile b/drivers/infiniband/ulp/isert/Makefile
index c8bf242..e19b16c 100644
--- a/drivers/infiniband/ulp/isert/Makefile
+++ b/drivers/infiniband/ulp/isert/Makefile
@@ -1,2 +1,2 @@
-ccflags-y		:= -Idrivers/target -Idrivers/target/iscsi
+# SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_INFINIBAND_ISERT)	+= ib_isert.o
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index f39670c..a1a0352 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*******************************************************************************
  * This file contains iSCSI extensions for RDMA (iSER) Verbs
  *
@@ -5,15 +6,6 @@
  *
  * Nicholas A. Bellinger <nab@linux-iscsi.org>
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  ****************************************************************************/
 
 #include <linux/string.h>
@@ -141,7 +133,7 @@
 	attr.sq_sig_type = IB_SIGNAL_REQ_WR;
 	attr.qp_type = IB_QPT_RC;
 	if (device->pi_capable)
-		attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN;
+		attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
 
 	ret = rdma_create_qp(cma_id, device->pd, &attr);
 	if (ret) {
@@ -262,7 +254,7 @@
 
 	isert_info("Using %d CQs, %s supports %d vectors support "
 		   "pi_capable %d\n",
-		   device->comps_used, device->ib_device->name,
+		   device->comps_used, dev_name(&device->ib_device->dev),
 		   device->ib_device->num_comp_vectors,
 		   device->pi_capable);
 
@@ -317,7 +309,7 @@
 
 	/* Check signature cap */
 	device->pi_capable = ib_dev->attrs.device_cap_flags &
-			     IB_DEVICE_SIGNATURE_HANDOVER ? true : false;
+			     IB_DEVICE_INTEGRITY_HANDOVER ? true : false;
 
 	return 0;
 
@@ -1186,7 +1178,7 @@
 	rc = iscsit_sequence_cmd(conn, cmd, buf, hdr->cmdsn);
 
 	if (!rc && dump_payload == false && unsol_data)
-		iscsit_set_unsoliticed_dataout(cmd);
+		iscsit_set_unsolicited_dataout(cmd);
 	else if (dump_payload && imm_data)
 		target_put_sess_cmd(&cmd->se_cmd);
 
@@ -1677,7 +1669,7 @@
 
 	isert_dbg("Cmd %p\n", isert_cmd);
 
-	ret = isert_check_pi_status(cmd, isert_cmd->rw.sig->sig_mr);
+	ret = isert_check_pi_status(cmd, isert_cmd->rw.reg->mr);
 	isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn);
 
 	if (ret) {
@@ -1723,7 +1715,7 @@
 	iscsit_stop_dataout_timer(cmd);
 
 	if (isert_prot_cmd(isert_conn, se_cmd))
-		ret = isert_check_pi_status(se_cmd, isert_cmd->rw.sig->sig_mr);
+		ret = isert_check_pi_status(se_cmd, isert_cmd->rw.reg->mr);
 	isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn);
 	cmd->write_data_done = 0;
 
@@ -2067,8 +2059,7 @@
 }
 
 static inline void
-isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs,
-		     struct ib_sig_domain *domain)
+isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_domain *domain)
 {
 	domain->sig_type = IB_SIG_TYPE_T10_DIF;
 	domain->sig.dif.bg_type = IB_T10DIF_CRC;
@@ -2096,17 +2087,17 @@
 	case TARGET_PROT_DIN_INSERT:
 	case TARGET_PROT_DOUT_STRIP:
 		sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
-		isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->wire);
+		isert_set_dif_domain(se_cmd, &sig_attrs->wire);
 		break;
 	case TARGET_PROT_DOUT_INSERT:
 	case TARGET_PROT_DIN_STRIP:
 		sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE;
-		isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->mem);
+		isert_set_dif_domain(se_cmd, &sig_attrs->mem);
 		break;
 	case TARGET_PROT_DIN_PASS:
 	case TARGET_PROT_DOUT_PASS:
-		isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->wire);
-		isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->mem);
+		isert_set_dif_domain(se_cmd, &sig_attrs->wire);
+		isert_set_dif_domain(se_cmd, &sig_attrs->mem);
 		break;
 	default:
 		isert_err("Unsupported PI operation %d\n", se_cmd->prot_op);
diff --git a/drivers/infiniband/ulp/opa_vnic/Kconfig b/drivers/infiniband/ulp/opa_vnic/Kconfig
index 48132ab5..a1f266b 100644
--- a/drivers/infiniband/ulp/opa_vnic/Kconfig
+++ b/drivers/infiniband/ulp/opa_vnic/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_OPA_VNIC
 	tristate "Intel OPA VNIC support"
 	depends on X86_64 && INFINIBAND
diff --git a/drivers/infiniband/ulp/opa_vnic/Makefile b/drivers/infiniband/ulp/opa_vnic/Makefile
index 8061b28..a8c21d1 100644
--- a/drivers/infiniband/ulp/opa_vnic/Makefile
+++ b/drivers/infiniband/ulp/opa_vnic/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 # Makefile - Intel Omni-Path Virtual Network Controller driver
 # Copyright(c) 2017, Intel Corporation.
 #
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c
index 267da82..31cd361 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c
@@ -351,7 +351,8 @@
 			if (unlikely(!dlid))
 				v_warn("Null dlid in MAC address\n");
 		} else if (def_port != OPA_VNIC_INVALID_PORT) {
-			dlid = info->vesw.u_ucast_dlid[def_port];
+			if (def_port < OPA_VESW_MAX_NUM_DEF_PORT)
+				dlid = info->vesw.u_ucast_dlid[def_port];
 		}
 	}
 
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
index 6155878..aeff68f 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
@@ -95,8 +95,7 @@
 }
 
 static u16 opa_vnic_select_queue(struct net_device *netdev, struct sk_buff *skb,
-				 struct net_device *sb_dev,
-				 select_queue_fallback_t fallback)
+				 struct net_device *sb_dev)
 {
 	struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev);
 	struct opa_vnic_skb_mdata *mdata;
@@ -106,8 +105,7 @@
 	mdata = skb_push(skb, sizeof(*mdata));
 	mdata->entropy = opa_vnic_calc_entropy(skb);
 	mdata->vl = opa_vnic_get_vl(adapter, skb);
-	rc = adapter->rn_ops->ndo_select_queue(netdev, skb,
-					       sb_dev, fallback);
+	rc = adapter->rn_ops->ndo_select_queue(netdev, skb, sb_dev);
 	skb_pull(skb, sizeof(*mdata));
 	return rc;
 }
@@ -330,10 +328,10 @@
 	struct rdma_netdev *rn;
 	int rc;
 
-	netdev = ibdev->alloc_rdma_netdev(ibdev, port_num,
-					  RDMA_NETDEV_OPA_VNIC,
-					  "veth%d", NET_NAME_UNKNOWN,
-					  ether_setup);
+	netdev = ibdev->ops.alloc_rdma_netdev(ibdev, port_num,
+					      RDMA_NETDEV_OPA_VNIC,
+					      "veth%d", NET_NAME_UNKNOWN,
+					      ether_setup);
 	if (!netdev)
 		return ERR_PTR(-ENOMEM);
 	else if (IS_ERR(netdev))
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c
index 15711dc..be5befd 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c
@@ -51,6 +51,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/xarray.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/opa_smi.h>
@@ -97,7 +98,7 @@
  * @class_port_info: Class port info information.
  * @tid: Transaction id
  * @port_num: OPA port number
- * @vport_idr: vnic ports idr
+ * @vports: vnic ports
  * @event_handler: ib event handler
  * @lock: adapter interface lock
  */
@@ -107,7 +108,7 @@
 	struct opa_class_port_info      class_port_info;
 	u64                             tid;
 	u8                              port_num;
-	struct idr                      vport_idr;
+	struct xarray                   vports;
 	struct ib_event_handler         event_handler;
 
 	/* Lock to query/update network adapter */
@@ -148,7 +149,7 @@
 {
 	u8 vport_num = vema_get_vport_num(recvd_mad);
 
-	return idr_find(&port->vport_idr, vport_num);
+	return xa_load(&port->vports, vport_num);
 }
 
 /**
@@ -207,8 +208,7 @@
 		int rc;
 
 		adapter->cport = cport;
-		rc = idr_alloc(&port->vport_idr, adapter, vport_num,
-			       vport_num + 1, GFP_NOWAIT);
+		rc = xa_insert(&port->vports, vport_num, adapter, GFP_KERNEL);
 		if (rc < 0) {
 			opa_vnic_rem_netdev(adapter);
 			adapter = ERR_PTR(rc);
@@ -606,7 +606,7 @@
 static void vema_send(struct ib_mad_agent *mad_agent,
 		      struct ib_mad_send_wc *mad_wc)
 {
-	rdma_destroy_ah(mad_wc->send_buf->ah);
+	rdma_destroy_ah(mad_wc->send_buf->ah, RDMA_DESTROY_AH_SLEEPABLE);
 	ib_free_send_mad(mad_wc->send_buf);
 }
 
@@ -680,7 +680,7 @@
 	ib_free_send_mad(rsp);
 
 err_rsp:
-	rdma_destroy_ah(ah);
+	rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE);
 free_recv_mad:
 	ib_free_recv_mad(mad_wc);
 }
@@ -777,7 +777,7 @@
 	}
 
 	rdma_ah_set_dlid(&ah_attr, trap_lid);
-	ah = rdma_create_ah(port->mad_agent->qp->pd, &ah_attr);
+	ah = rdma_create_ah(port->mad_agent->qp->pd, &ah_attr, 0);
 	if (IS_ERR(ah)) {
 		c_err("%s:Couldn't create new AH = %p\n", __func__, ah);
 		c_err("%s:dlid = %d, sl = %d, port = %d\n", __func__,
@@ -848,52 +848,37 @@
 	}
 
 err_sndbuf:
-	rdma_destroy_ah(ah);
+	rdma_destroy_ah(ah, 0);
 err_exit:
 	v_err("Aborting trap\n");
 }
 
-static int vema_rem_vport(int id, void *p, void *data)
-{
-	struct opa_vnic_adapter *adapter = p;
-
-	opa_vnic_rem_netdev(adapter);
-	return 0;
-}
-
-static int vema_enable_vport(int id, void *p, void *data)
-{
-	struct opa_vnic_adapter *adapter = p;
-
-	netif_carrier_on(adapter->netdev);
-	return 0;
-}
-
-static int vema_disable_vport(int id, void *p, void *data)
-{
-	struct opa_vnic_adapter *adapter = p;
-
-	netif_carrier_off(adapter->netdev);
-	return 0;
-}
-
 static void opa_vnic_event(struct ib_event_handler *handler,
 			   struct ib_event *record)
 {
 	struct opa_vnic_vema_port *port =
 		container_of(handler, struct opa_vnic_vema_port, event_handler);
 	struct opa_vnic_ctrl_port *cport = port->cport;
+	struct opa_vnic_adapter *adapter;
+	unsigned long index;
 
 	if (record->element.port_num != port->port_num)
 		return;
 
 	c_dbg("OPA_VNIC received event %d on device %s port %d\n",
-	      record->event, record->device->name, record->element.port_num);
+	      record->event, dev_name(&record->device->dev),
+	      record->element.port_num);
 
-	if (record->event == IB_EVENT_PORT_ERR)
-		idr_for_each(&port->vport_idr, vema_disable_vport, NULL);
-	if (record->event == IB_EVENT_PORT_ACTIVE)
-		idr_for_each(&port->vport_idr, vema_enable_vport, NULL);
+	if (record->event != IB_EVENT_PORT_ERR &&
+	    record->event != IB_EVENT_PORT_ACTIVE)
+		return;
+
+	xa_for_each(&port->vports, index, adapter) {
+		if (record->event == IB_EVENT_PORT_ACTIVE)
+			netif_carrier_on(adapter->netdev);
+		else
+			netif_carrier_off(adapter->netdev);
+	}
 }
 
 /**
@@ -904,6 +889,8 @@
  */
 static void vema_unregister(struct opa_vnic_ctrl_port *cport)
 {
+	struct opa_vnic_adapter *adapter;
+	unsigned long index;
 	int i;
 
 	for (i = 1; i <= cport->num_ports; i++) {
@@ -914,13 +901,14 @@
 
 		/* Lock ensures no MAD is being processed */
 		mutex_lock(&port->lock);
-		idr_for_each(&port->vport_idr, vema_rem_vport, NULL);
+		xa_for_each(&port->vports, index, adapter)
+			opa_vnic_rem_netdev(adapter);
 		mutex_unlock(&port->lock);
 
 		ib_unregister_mad_agent(port->mad_agent);
 		port->mad_agent = NULL;
 		mutex_destroy(&port->lock);
-		idr_destroy(&port->vport_idr);
+		xa_destroy(&port->vports);
 		ib_unregister_event_handler(&port->event_handler);
 	}
 }
@@ -957,7 +945,7 @@
 				      cport->ibdev, opa_vnic_event);
 		ib_register_event_handler(&port->event_handler);
 
-		idr_init(&port->vport_idr);
+		xa_init(&port->vports);
 		mutex_init(&port->lock);
 		port->mad_agent = ib_register_mad_agent(cport->ibdev, i,
 							IB_QPT_GSI, &reg_req,
@@ -968,7 +956,6 @@
 			ret = PTR_ERR(port->mad_agent);
 			port->mad_agent = NULL;
 			mutex_destroy(&port->lock);
-			idr_destroy(&port->vport_idr);
 			vema_unregister(cport);
 			return ret;
 		}
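
The hunks above convert the opa_vnic VEMA port from an IDR to an XArray. A minimal, hedged sketch of that pattern follows; demo_vport and demo() are illustrative names, not driver symbols: xa_insert() stores an entry at a caller-chosen index (failing with -EBUSY if the slot is occupied), xa_load() looks it up, and xa_for_each() walks every populated slot.

/* Sketch only, assuming a made-up demo_vport type. */
#include <linux/slab.h>
#include <linux/xarray.h>

struct demo_vport {
	int id;
};

static int demo(void)
{
	struct demo_vport *vport, *entry;
	struct xarray vports;
	unsigned long index;
	int err;

	xa_init(&vports);

	vport = kzalloc(sizeof(*vport), GFP_KERNEL);
	if (!vport)
		return -ENOMEM;
	vport->id = 3;

	/* Store at index 3; -EBUSY if something is already there. */
	err = xa_insert(&vports, vport->id, vport, GFP_KERNEL);
	if (err) {
		kfree(vport);
		goto out;
	}

	entry = xa_load(&vports, 3);		/* entry == vport */

	xa_for_each(&vports, index, entry)	/* visits index 3 */
		kfree(entry);
out:
	xa_destroy(&vports);
	return err;
}
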
diff --git a/drivers/infiniband/ulp/srp/Kbuild b/drivers/infiniband/ulp/srp/Kbuild
index a16c73c..d1f4e51 100644
--- a/drivers/infiniband/ulp/srp/Kbuild
+++ b/drivers/infiniband/ulp/srp/Kbuild
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_INFINIBAND_SRP)			+= ib_srp.o
diff --git a/drivers/infiniband/ulp/srp/Kconfig b/drivers/infiniband/ulp/srp/Kconfig
index 99db8fe..6f5e7b3 100644
--- a/drivers/infiniband/ulp/srp/Kconfig
+++ b/drivers/infiniband/ulp/srp/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_SRP
 	tristate "InfiniBand SCSI RDMA Protocol"
 	depends on SCSI && INFINIBAND_ADDR_TRANS
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 0b34e90..b596035 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -132,6 +132,15 @@
 		 " if fast_io_fail_tmo has not been set. \"off\" means that"
 		 " this functionality is disabled.");
 
+static bool srp_use_imm_data = true;
+module_param_named(use_imm_data, srp_use_imm_data, bool, 0644);
+MODULE_PARM_DESC(use_imm_data,
+		 "Whether or not to request permission to use immediate data during SRP login.");
+
+static unsigned int srp_max_imm_data = 8 * 1024;
+module_param_named(max_imm_data, srp_max_imm_data, uint, 0644);
+MODULE_PARM_DESC(max_imm_data, "Maximum immediate data size.");
+
 static unsigned ch_count;
 module_param(ch_count, uint, 0444);
 MODULE_PARM_DESC(ch_count,
@@ -139,6 +148,7 @@
 
 static void srp_add_one(struct ib_device *device);
 static void srp_remove_one(struct ib_device *device, void *client_data);
+static void srp_rename_dev(struct ib_device *device, void *client_data);
 static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc);
 static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc,
 		const char *opname);
@@ -153,7 +163,8 @@
 static struct ib_client srp_client = {
 	.name   = "srp",
 	.add    = srp_add_one,
-	.remove = srp_remove_one
+	.remove = srp_remove_one,
+	.rename = srp_rename_dev
 };
 
 static struct ib_sa_client srp_sa_client;
@@ -434,8 +445,7 @@
 	if (pool_size <= 0)
 		goto err;
 	ret = -ENOMEM;
-	pool = kzalloc(sizeof(struct srp_fr_pool) +
-		       pool_size * sizeof(struct srp_fr_desc), GFP_KERNEL);
+	pool = kzalloc(struct_size(pool, desc, pool_size), GFP_KERNEL);
 	if (!pool)
 		goto err;
 	pool->size = pool_size;
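
The hunk above replaces the open-coded sizeof(*pool) + pool_size * sizeof(desc) with struct_size(). A short sketch of why, using a made-up demo_pool type: struct_size(p, member, n) computes sizeof(*p) + n * sizeof(p->member[0]) and saturates to SIZE_MAX on overflow, so the allocation fails instead of returning a short buffer.

/* Illustrative only; demo_pool and demo_desc are not driver types. */
#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/types.h>

struct demo_desc {
	u64 addr;
	u32 len;
};

struct demo_pool {
	int size;
	struct demo_desc desc[];	/* flexible array member */
};

static struct demo_pool *demo_pool_alloc(int n)
{
	struct demo_pool *pool;

	pool = kzalloc(struct_size(pool, desc, n), GFP_KERNEL);
	if (!pool)
		return NULL;
	pool->size = n;
	return pool;
}
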
@@ -573,7 +583,7 @@
 	init_attr->cap.max_send_wr     = m * target->queue_size;
 	init_attr->cap.max_recv_wr     = target->queue_size + 1;
 	init_attr->cap.max_recv_sge    = 1;
-	init_attr->cap.max_send_sge    = 1;
+	init_attr->cap.max_send_sge    = SRP_MAX_SGE;
 	init_attr->sq_sig_type         = IB_SIGNAL_REQ_WR;
 	init_attr->qp_type             = IB_QPT_RC;
 	init_attr->send_cq             = send_cq;
@@ -823,7 +833,8 @@
 	return subnet_timeout;
 }
 
-static int srp_send_req(struct srp_rdma_ch *ch, bool multich)
+static int srp_send_req(struct srp_rdma_ch *ch, uint32_t max_iu_len,
+			bool multich)
 {
 	struct srp_target_port *target = ch->target;
 	struct {
@@ -852,11 +863,15 @@
 
 	req->ib_req.opcode = SRP_LOGIN_REQ;
 	req->ib_req.tag = 0;
-	req->ib_req.req_it_iu_len = cpu_to_be32(target->max_iu_len);
+	req->ib_req.req_it_iu_len = cpu_to_be32(max_iu_len);
 	req->ib_req.req_buf_fmt	= cpu_to_be16(SRP_BUF_FORMAT_DIRECT |
 					      SRP_BUF_FORMAT_INDIRECT);
 	req->ib_req.req_flags = (multich ? SRP_MULTICHAN_MULTI :
 				 SRP_MULTICHAN_SINGLE);
+	if (srp_use_imm_data) {
+		req->ib_req.req_flags |= SRP_IMMED_REQUESTED;
+		req->ib_req.imm_data_offset = cpu_to_be16(SRP_IMM_DATA_OFFSET);
+	}
 
 	if (target->using_rdma_cm) {
 		req->rdma_param.flow_control = req->ib_param.flow_control;
@@ -873,6 +888,7 @@
 		req->rdma_req.req_it_iu_len = req->ib_req.req_it_iu_len;
 		req->rdma_req.req_buf_fmt = req->ib_req.req_buf_fmt;
 		req->rdma_req.req_flags	= req->ib_req.req_flags;
+		req->rdma_req.imm_data_offset = req->ib_req.imm_data_offset;
 
 		ipi = req->rdma_req.initiator_port_id;
 		tpi = req->rdma_req.target_port_id;
@@ -1145,7 +1161,8 @@
 	return c;
 }
 
-static int srp_connect_ch(struct srp_rdma_ch *ch, bool multich)
+static int srp_connect_ch(struct srp_rdma_ch *ch, uint32_t max_iu_len,
+			  bool multich)
 {
 	struct srp_target_port *target = ch->target;
 	int ret;
@@ -1158,7 +1175,7 @@
 
 	while (1) {
 		init_completion(&ch->done);
-		ret = srp_send_req(ch, multich);
+		ret = srp_send_req(ch, max_iu_len, multich);
 		if (ret)
 			goto out;
 		ret = wait_for_completion_interruptible(&ch->done);
@@ -1330,17 +1347,8 @@
 {
 	struct srp_target_port *target = rport->lld_data;
 	struct srp_rdma_ch *ch;
-	struct Scsi_Host *shost = target->scsi_host;
-	struct scsi_device *sdev;
 	int i, j;
 
-	/*
-	 * Invoking srp_terminate_io() while srp_queuecommand() is running
-	 * is not safe. Hence the warning statement below.
-	 */
-	shost_for_each_device(sdev, shost)
-		WARN_ON_ONCE(sdev->request_queue->request_fn_active);
-
 	for (i = 0; i < target->ch_count; i++) {
 		ch = &target->ch[i];
 
@@ -1353,6 +1361,20 @@
 	}
 }
 
+/* Calculate maximum initiator to target information unit length. */
+static uint32_t srp_max_it_iu_len(int cmd_sg_cnt, bool use_imm_data)
+{
+	uint32_t max_iu_len = sizeof(struct srp_cmd) + SRP_MAX_ADD_CDB_LEN +
+		sizeof(struct srp_indirect_buf) +
+		cmd_sg_cnt * sizeof(struct srp_direct_buf);
+
+	if (use_imm_data)
+		max_iu_len = max(max_iu_len, SRP_IMM_DATA_OFFSET +
+				 srp_max_imm_data);
+
+	return max_iu_len;
+}
+
 /*
  * It is up to the caller to ensure that srp_rport_reconnect() calls are
  * serialized and that no concurrent srp_queuecommand(), srp_abort(),
@@ -1366,6 +1388,8 @@
 {
 	struct srp_target_port *target = rport->lld_data;
 	struct srp_rdma_ch *ch;
+	uint32_t max_iu_len = srp_max_it_iu_len(target->cmd_sg_cnt,
+						srp_use_imm_data);
 	int i, j, ret = 0;
 	bool multich = false;
 
@@ -1411,7 +1435,7 @@
 		ch = &target->ch[i];
 		if (ret)
 			break;
-		ret = srp_connect_ch(ch, multich);
+		ret = srp_connect_ch(ch, max_iu_len, multich);
 		multich = true;
 	}
 
@@ -1578,9 +1602,8 @@
 {
 	struct srp_target_port *target = ch->target;
 	struct srp_device *dev = target->srp_host->srp_dev;
-	struct ib_device *ibdev = dev->dev;
-	dma_addr_t dma_addr = ib_sg_dma_address(ibdev, sg);
-	unsigned int dma_len = ib_sg_dma_len(ibdev, sg);
+	dma_addr_t dma_addr = sg_dma_address(sg);
+	unsigned int dma_len = sg_dma_len(sg);
 	unsigned int len = 0;
 	int ret;
 
@@ -1674,13 +1697,11 @@
 			  int count)
 {
 	struct srp_target_port *target = ch->target;
-	struct srp_device *dev = target->srp_host->srp_dev;
 	struct scatterlist *sg;
 	int i;
 
 	for_each_sg(scat, sg, count, i) {
-		srp_map_desc(state, ib_sg_dma_address(dev->dev, sg),
-			     ib_sg_dma_len(dev->dev, sg),
+		srp_map_desc(state, sg_dma_address(sg), sg_dma_len(sg),
 			     target->global_rkey);
 	}
 
@@ -1773,25 +1794,29 @@
  * @req: SRP request
  *
  * Returns the length in bytes of the SRP_CMD IU or a negative value if
- * mapping failed.
+ * mapping failed. The size of any immediate data is not included in the
+ * return value.
  */
 static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
 			struct srp_request *req)
 {
 	struct srp_target_port *target = ch->target;
-	struct scatterlist *scat;
+	struct scatterlist *scat, *sg;
 	struct srp_cmd *cmd = req->cmd->buf;
-	int len, nents, count, ret;
+	int i, len, nents, count, ret;
 	struct srp_device *dev;
 	struct ib_device *ibdev;
 	struct srp_map_state state;
 	struct srp_indirect_buf *indirect_hdr;
+	u64 data_len;
 	u32 idb_len, table_len;
 	__be32 idb_rkey;
 	u8 fmt;
 
+	req->cmd->num_sge = 1;
+
 	if (!scsi_sglist(scmnd) || scmnd->sc_data_direction == DMA_NONE)
-		return sizeof (struct srp_cmd);
+		return sizeof(struct srp_cmd) + cmd->add_cdb_len;
 
 	if (scmnd->sc_data_direction != DMA_FROM_DEVICE &&
 	    scmnd->sc_data_direction != DMA_TO_DEVICE) {
@@ -1803,6 +1828,7 @@
 
 	nents = scsi_sg_count(scmnd);
 	scat  = scsi_sglist(scmnd);
+	data_len = scsi_bufflen(scmnd);
 
 	dev = target->srp_host->srp_dev;
 	ibdev = dev->dev;
@@ -1811,8 +1837,31 @@
 	if (unlikely(count == 0))
 		return -EIO;
 
+	if (ch->use_imm_data &&
+	    count <= SRP_MAX_IMM_SGE &&
+	    SRP_IMM_DATA_OFFSET + data_len <= ch->max_it_iu_len &&
+	    scmnd->sc_data_direction == DMA_TO_DEVICE) {
+		struct srp_imm_buf *buf;
+		struct ib_sge *sge = &req->cmd->sge[1];
+
+		fmt = SRP_DATA_DESC_IMM;
+		len = SRP_IMM_DATA_OFFSET;
+		req->nmdesc = 0;
+		buf = (void *)cmd->add_data + cmd->add_cdb_len;
+		buf->len = cpu_to_be32(data_len);
+		WARN_ON_ONCE((void *)(buf + 1) > (void *)cmd + len);
+		for_each_sg(scat, sg, count, i) {
+			sge[i].addr   = sg_dma_address(sg);
+			sge[i].length = sg_dma_len(sg);
+			sge[i].lkey   = target->lkey;
+		}
+		req->cmd->num_sge += count;
+		goto map_complete;
+	}
+
 	fmt = SRP_DATA_DESC_DIRECT;
-	len = sizeof (struct srp_cmd) +	sizeof (struct srp_direct_buf);
+	len = sizeof(struct srp_cmd) + cmd->add_cdb_len +
+		sizeof(struct srp_direct_buf);
 
 	if (count == 1 && target->global_rkey) {
 		/*
@@ -1821,11 +1870,12 @@
 		 * single entry.  So a direct descriptor along with
 		 * the DMA MR suffices.
 		 */
-		struct srp_direct_buf *buf = (void *) cmd->add_data;
+		struct srp_direct_buf *buf;
 
-		buf->va  = cpu_to_be64(ib_sg_dma_address(ibdev, scat));
+		buf = (void *)cmd->add_data + cmd->add_cdb_len;
+		buf->va  = cpu_to_be64(sg_dma_address(scat));
 		buf->key = cpu_to_be32(target->global_rkey);
-		buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat));
+		buf->len = cpu_to_be32(sg_dma_len(scat));
 
 		req->nmdesc = 0;
 		goto map_complete;
@@ -1835,7 +1885,7 @@
 	 * We have more than one scatter/gather entry, so build our indirect
 	 * descriptor table, trying to merge as many entries as we can.
 	 */
-	indirect_hdr = (void *) cmd->add_data;
+	indirect_hdr = (void *)cmd->add_data + cmd->add_cdb_len;
 
 	ib_dma_sync_single_for_cpu(ibdev, req->indirect_dma_addr,
 				   target->indirect_size, DMA_TO_DEVICE);
@@ -1870,8 +1920,9 @@
 		 * Memory registration collapsed the sg-list into one entry,
 		 * so use a direct descriptor.
 		 */
-		struct srp_direct_buf *buf = (void *) cmd->add_data;
+		struct srp_direct_buf *buf;
 
+		buf = (void *)cmd->add_data + cmd->add_cdb_len;
 		*buf = req->indirect_desc[0];
 		goto map_complete;
 	}
@@ -1889,7 +1940,8 @@
 	idb_len = sizeof(struct srp_indirect_buf) + table_len;
 
 	fmt = SRP_DATA_DESC_INDIRECT;
-	len = sizeof(struct srp_cmd) + sizeof (struct srp_indirect_buf);
+	len = sizeof(struct srp_cmd) + cmd->add_cdb_len +
+		sizeof(struct srp_indirect_buf);
 	len += count * sizeof (struct srp_direct_buf);
 
 	memcpy(indirect_hdr->desc_list, req->indirect_desc,
@@ -2010,22 +2062,30 @@
 	list_add(&iu->list, &ch->free_tx);
 }
 
+/**
+ * srp_post_send() - send an SRP information unit
+ * @ch: RDMA channel over which to send the information unit.
+ * @iu: Information unit to send.
+ * @len: Length of the information unit excluding immediate data.
+ */
 static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len)
 {
 	struct srp_target_port *target = ch->target;
-	struct ib_sge list;
 	struct ib_send_wr wr;
 
-	list.addr   = iu->dma;
-	list.length = len;
-	list.lkey   = target->lkey;
+	if (WARN_ON_ONCE(iu->num_sge > SRP_MAX_SGE))
+		return -EINVAL;
+
+	iu->sge[0].addr   = iu->dma;
+	iu->sge[0].length = len;
+	iu->sge[0].lkey   = target->lkey;
 
 	iu->cqe.done = srp_send_done;
 
 	wr.next       = NULL;
 	wr.wr_cqe     = &iu->cqe;
-	wr.sg_list    = &list;
-	wr.num_sge    = 1;
+	wr.sg_list    = &iu->sge[0];
+	wr.num_sge    = iu->num_sge;
 	wr.opcode     = IB_WR_SEND;
 	wr.send_flags = IB_SEND_SIGNALED;
 
@@ -2138,6 +2198,7 @@
 		return 1;
 	}
 
+	iu->num_sge = 1;
 	ib_dma_sync_single_for_cpu(dev, iu->dma, len, DMA_TO_DEVICE);
 	memcpy(iu->buf, rsp, len);
 	ib_dma_sync_single_for_device(dev, iu->dma, len, DMA_TO_DEVICE);
@@ -2279,7 +2340,6 @@
 static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
 {
 	struct srp_target_port *target = host_to_target(shost);
-	struct srp_rport *rport = target->rport;
 	struct srp_rdma_ch *ch;
 	struct srp_request *req;
 	struct srp_iu *iu;
@@ -2289,16 +2349,6 @@
 	u32 tag;
 	u16 idx;
 	int len, ret;
-	const bool in_scsi_eh = !in_interrupt() && current == shost->ehandler;
-
-	/*
-	 * The SCSI EH thread is the only context from which srp_queuecommand()
-	 * can get invoked for blocked devices (SDEV_BLOCK /
-	 * SDEV_CREATED_BLOCK). Avoid racing with srp_reconnect_rport() by
-	 * locking the rport mutex if invoked from inside the SCSI EH.
-	 */
-	if (in_scsi_eh)
-		mutex_lock(&rport->mutex);
 
 	scmnd->result = srp_chkready(target->rport);
 	if (unlikely(scmnd->result))
@@ -2321,7 +2371,7 @@
 
 	req = &ch->req_ring[idx];
 	dev = target->srp_host->srp_dev->dev;
-	ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_iu_len,
+	ib_dma_sync_single_for_cpu(dev, iu->dma, ch->max_it_iu_len,
 				   DMA_TO_DEVICE);
 
 	scmnd->host_scribble = (void *) req;
@@ -2333,6 +2383,12 @@
 	int_to_scsilun(scmnd->device->lun, &cmd->lun);
 	cmd->tag    = tag;
 	memcpy(cmd->cdb, scmnd->cmnd, scmnd->cmd_len);
+	if (unlikely(scmnd->cmd_len > sizeof(cmd->cdb))) {
+		cmd->add_cdb_len = round_up(scmnd->cmd_len - sizeof(cmd->cdb),
+					    4);
+		if (WARN_ON_ONCE(cmd->add_cdb_len > SRP_MAX_ADD_CDB_LEN))
+			goto err_iu;
+	}
 
 	req->scmnd    = scmnd;
 	req->cmd      = iu;
@@ -2352,21 +2408,16 @@
 		goto err_iu;
 	}
 
-	ib_dma_sync_single_for_device(dev, iu->dma, target->max_iu_len,
+	ib_dma_sync_single_for_device(dev, iu->dma, ch->max_it_iu_len,
 				      DMA_TO_DEVICE);
 
 	if (srp_post_send(ch, iu, len)) {
 		shost_printk(KERN_ERR, target->scsi_host, PFX "Send failed\n");
+		scmnd->result = DID_ERROR << 16;
 		goto err_unmap;
 	}
 
-	ret = 0;
-
-unlock_rport:
-	if (in_scsi_eh)
-		mutex_unlock(&rport->mutex);
-
-	return ret;
+	return 0;
 
 err_unmap:
 	srp_unmap_data(scmnd, ch, req);
@@ -2388,7 +2439,7 @@
 		ret = SCSI_MLQUEUE_HOST_BUSY;
 	}
 
-	goto unlock_rport;
+	return ret;
 }
 
 /*
@@ -2419,7 +2470,7 @@
 
 	for (i = 0; i < target->queue_size; ++i) {
 		ch->tx_ring[i] = srp_alloc_iu(target->srp_host,
-					      target->max_iu_len,
+					      ch->max_it_iu_len,
 					      GFP_KERNEL, DMA_TO_DEVICE);
 		if (!ch->tx_ring[i])
 			goto err;
@@ -2485,6 +2536,15 @@
 	if (lrsp->opcode == SRP_LOGIN_RSP) {
 		ch->max_ti_iu_len = be32_to_cpu(lrsp->max_ti_iu_len);
 		ch->req_lim       = be32_to_cpu(lrsp->req_lim_delta);
+		ch->use_imm_data  = lrsp->rsp_flags & SRP_LOGIN_RSP_IMMED_SUPP;
+		ch->max_it_iu_len = srp_max_it_iu_len(target->cmd_sg_cnt,
+						      ch->use_imm_data);
+		WARN_ON_ONCE(ch->max_it_iu_len >
+			     be32_to_cpu(lrsp->max_it_iu_len));
+
+		if (ch->use_imm_data)
+			shost_printk(KERN_DEBUG, target->scsi_host,
+				     PFX "using immediate data\n");
 
 		/*
 		 * Reserve credits for task management so we don't
@@ -2873,6 +2933,8 @@
 		return -1;
 	}
 
+	iu->num_sge = 1;
+
 	ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt,
 				   DMA_TO_DEVICE);
 	tsk_mgmt = iu->buf;
@@ -2951,7 +3013,6 @@
 {
 	struct srp_target_port *target = host_to_target(scmnd->device->host);
 	struct srp_rdma_ch *ch;
-	int i, j;
 	u8 status;
 
 	shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n");
@@ -2963,15 +3024,6 @@
 	if (status)
 		return FAILED;
 
-	for (i = 0; i < target->ch_count; i++) {
-		ch = &target->ch[i];
-		for (j = 0; j < target->req_ring_size; ++j) {
-			struct srp_request *req = &ch->req_ring[j];
-
-			srp_finish_req(ch, req, scmnd->device, DID_RESET << 16);
-		}
-	}
-
 	return SUCCESS;
 }
 
@@ -2994,20 +3046,6 @@
 	return 0;
 }
 
-static int srp_slave_alloc(struct scsi_device *sdev)
-{
-	struct Scsi_Host *shost = sdev->host;
-	struct srp_target_port *target = host_to_target(shost);
-	struct srp_device *srp_dev = target->srp_host->srp_dev;
-	struct ib_device *ibdev = srp_dev->dev;
-
-	if (!(ibdev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG))
-		blk_queue_virt_boundary(sdev->request_queue,
-					~srp_dev->mr_page_mask);
-
-	return 0;
-}
-
 static int srp_slave_configure(struct scsi_device *sdev)
 {
 	struct Scsi_Host *shost = sdev->host;
@@ -3124,7 +3162,8 @@
 {
 	struct srp_target_port *target = host_to_target(class_to_shost(dev));
 
-	return sprintf(buf, "%s\n", target->srp_host->srp_dev->dev->name);
+	return sprintf(buf, "%s\n",
+		       dev_name(&target->srp_host->srp_dev->dev->dev));
 }
 
 static ssize_t show_ch_count(struct device *dev, struct device_attribute *attr,
@@ -3209,7 +3248,6 @@
 	.name				= "InfiniBand SRP initiator",
 	.proc_name			= DRV_NAME,
 	.target_alloc			= srp_target_alloc,
-	.slave_alloc			= srp_slave_alloc,
 	.slave_configure		= srp_slave_configure,
 	.info				= srp_target_info,
 	.queuecommand			= srp_queuecommand,
@@ -3223,7 +3261,6 @@
 	.can_queue			= SRP_DEFAULT_CMD_SQ_SIZE,
 	.this_id			= -1,
 	.cmd_per_lun			= SRP_DEFAULT_CMD_SQ_SIZE,
-	.use_clustering			= ENABLE_CLUSTERING,
 	.shost_attrs			= srp_host_attrs,
 	.track_queue_depth		= 1,
 };
@@ -3411,13 +3448,17 @@
 
 /**
  * srp_parse_in - parse an IP address and port number combination
+ * @net:	   [in]  Network namespace.
+ * @sa:		   [out] Address family, IP address and port number.
+ * @addr_port_str: [in]  IP address and port number.
+ * @has_port:	   [out] Whether or not @addr_port_str includes a port number.
  *
  * Parse the following address formats:
  * - IPv4: <ip_address>:<port>, e.g. 1.2.3.4:5.
  * - IPv6: \[<ipv6_address>\]:<port>, e.g. [1::2:3%4]:5.
  */
 static int srp_parse_in(struct net *net, struct sockaddr_storage *sa,
-			const char *addr_port_str)
+			const char *addr_port_str, bool *has_port)
 {
 	char *addr_end, *addr = kstrdup(addr_port_str, GFP_KERNEL);
 	char *port_str;
@@ -3426,9 +3467,12 @@
 	if (!addr)
 		return -ENOMEM;
 	port_str = strrchr(addr, ':');
-	if (!port_str)
-		return -EINVAL;
-	*port_str++ = '\0';
+	if (port_str && strchr(port_str, ']'))
+		port_str = NULL;
+	if (port_str)
+		*port_str++ = '\0';
+	if (has_port)
+		*has_port = port_str != NULL;
 	ret = inet_pton_with_scope(net, AF_INET, addr, port_str, sa);
 	if (ret && addr[0]) {
 		addr_end = addr + strlen(addr) - 1;
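
The change above stops treating the last ':' as a port separator when a ']' follows it, so a bare IPv6 destination such as "[1::2:3%4]" no longer gets split as if it carried a port. A small sketch of that splitting rule; demo_split_port() is an illustrative helper, not a driver function.

/* Sketch of the address/port split, assuming a writable string. */
#include <linux/string.h>
#include <linux/types.h>

static bool demo_split_port(char *addr, char **port)
{
	char *p = strrchr(addr, ':');

	if (p && strchr(p, ']'))	/* ':' sits inside "[...]", no port */
		p = NULL;
	if (p)
		*p++ = '\0';		/* terminate the address part */
	*port = p;
	return p != NULL;
}
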
@@ -3450,6 +3494,7 @@
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
 	unsigned long long ull;
+	bool has_port;
 	int opt_mask = 0;
 	int token;
 	int ret = -EINVAL;
@@ -3548,7 +3593,8 @@
 				ret = -ENOMEM;
 				goto out;
 			}
-			ret = srp_parse_in(net, &target->rdma_cm.src.ss, p);
+			ret = srp_parse_in(net, &target->rdma_cm.src.ss, p,
+					   NULL);
 			if (ret < 0) {
 				pr_warn("bad source parameter '%s'\n", p);
 				kfree(p);
@@ -3564,7 +3610,10 @@
 				ret = -ENOMEM;
 				goto out;
 			}
-			ret = srp_parse_in(net, &target->rdma_cm.dst.ss, p);
+			ret = srp_parse_in(net, &target->rdma_cm.dst.ss, p,
+					   &has_port);
+			if (!has_port)
+				ret = -EINVAL;
 			if (ret < 0) {
 				pr_warn("bad dest parameter '%s'\n", p);
 				kfree(p);
@@ -3728,6 +3777,7 @@
 	int ret, node_idx, node, cpu, i;
 	unsigned int max_sectors_per_mr, mr_per_cmd = 0;
 	bool multich = false;
+	uint32_t max_iu_len;
 
 	target_host = scsi_host_alloc(&srp_template,
 				      sizeof (struct srp_target_port));
@@ -3739,6 +3789,10 @@
 	target_host->max_id      = 1;
 	target_host->max_lun     = -1LL;
 	target_host->max_cmd_len = sizeof ((struct srp_cmd *) (void *) 0L)->cdb;
+	target_host->max_segment_size = ib_dma_max_seg_size(ibdev);
+
+	if (!(ibdev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG))
+		target_host->virt_boundary_mask = ~srp_dev->mr_page_mask;
 
 	target = host_to_target(target_host);
 
@@ -3833,9 +3887,7 @@
 	target->mr_per_cmd = mr_per_cmd;
 	target->indirect_size = target->sg_tablesize *
 				sizeof (struct srp_direct_buf);
-	target->max_iu_len = sizeof (struct srp_cmd) +
-			     sizeof (struct srp_indirect_buf) +
-			     target->cmd_sg_cnt * sizeof (struct srp_direct_buf);
+	max_iu_len = srp_max_it_iu_len(target->cmd_sg_cnt, srp_use_imm_data);
 
 	INIT_WORK(&target->tl_err_work, srp_tl_err_work);
 	INIT_WORK(&target->remove_work, srp_remove_work);
@@ -3890,7 +3942,7 @@
 			if (ret)
 				goto err_disconnect;
 
-			ret = srp_connect_ch(ch, multich);
+			ret = srp_connect_ch(ch, max_iu_len, multich);
 			if (ret) {
 				char dst[64];
 
@@ -3987,7 +4039,7 @@
 {
 	struct srp_host *host = container_of(dev, struct srp_host, dev);
 
-	return sprintf(buf, "%s\n", host->srp_dev->dev->name);
+	return sprintf(buf, "%s\n", dev_name(&host->srp_dev->dev->dev));
 }
 
 static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
@@ -4019,7 +4071,8 @@
 
 	host->dev.class = &srp_class;
 	host->dev.parent = device->dev->dev.parent;
-	dev_set_name(&host->dev, "srp-%s-%d", device->dev->name, port);
+	dev_set_name(&host->dev, "srp-%s-%d", dev_name(&device->dev->dev),
+		     port);
 
 	if (device_register(&host->dev))
 		goto free_host;
@@ -4041,12 +4094,27 @@
 	return NULL;
 }
 
+static void srp_rename_dev(struct ib_device *device, void *client_data)
+{
+	struct srp_device *srp_dev = client_data;
+	struct srp_host *host, *tmp_host;
+
+	list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) {
+		char name[IB_DEVICE_NAME_MAX + 8];
+
+		snprintf(name, sizeof(name), "srp-%s-%d",
+			 dev_name(&device->dev), host->port);
+		device_rename(&host->dev, name);
+	}
+}
+
 static void srp_add_one(struct ib_device *device)
 {
 	struct srp_device *srp_dev;
 	struct ib_device_attr *attr = &device->attrs;
 	struct srp_host *host;
-	int mr_page_shift, p;
+	int mr_page_shift;
+	unsigned int p;
 	u64 max_pages_per_mr;
 	unsigned int flags = 0;
 
@@ -4070,8 +4138,10 @@
 	srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR,
 					  max_pages_per_mr);
 
-	srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
-			    device->map_phys_fmr && device->unmap_fmr);
+	srp_dev->has_fmr = (device->ops.alloc_fmr &&
+			    device->ops.dealloc_fmr &&
+			    device->ops.map_phys_fmr &&
+			    device->ops.unmap_fmr);
 	srp_dev->has_fr = (attr->device_cap_flags &
 			   IB_DEVICE_MEM_MGT_EXTENSIONS);
 	if (!never_register && !srp_dev->has_fmr && !srp_dev->has_fr) {
@@ -4095,7 +4165,7 @@
 	srp_dev->mr_max_size	= srp_dev->mr_page_size *
 				   srp_dev->max_pages_per_mr;
 	pr_debug("%s: mr_page_shift = %d, device->max_mr_size = %#llx, device->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n",
-		 device->name, mr_page_shift, attr->max_mr_size,
+		 dev_name(&device->dev), mr_page_shift, attr->max_mr_size,
 		 attr->max_fast_reg_page_list_len,
 		 srp_dev->max_pages_per_mr, srp_dev->mr_max_size);
 
@@ -4111,7 +4181,7 @@
 		WARN_ON_ONCE(srp_dev->global_rkey == 0);
 	}
 
-	for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
+	rdma_for_each_port (device, p) {
 		host = srp_add_port(srp_dev, p);
 		if (host)
 			list_add_tail(&host->list, &srp_dev->dev_list);
@@ -4179,6 +4249,11 @@
 {
 	int ret;
 
+	BUILD_BUG_ON(sizeof(struct srp_imm_buf) != 4);
+	BUILD_BUG_ON(sizeof(struct srp_login_req) != 64);
+	BUILD_BUG_ON(sizeof(struct srp_login_req_rdma) != 56);
+	BUILD_BUG_ON(sizeof(struct srp_cmd) != 48);
+
 	if (srp_sg_tablesize) {
 		pr_warn("srp_sg_tablesize is deprecated, please use cmd_sg_entries\n");
 		if (!cmd_sg_entries)
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index a270608..b2861cd 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -67,6 +67,17 @@
 	SRP_TAG_TSK_MGMT	= 1U << 31,
 
 	SRP_MAX_PAGES_PER_MR	= 512,
+
+	SRP_MAX_ADD_CDB_LEN	= 16,
+
+	SRP_MAX_IMM_SGE		= 2,
+	SRP_MAX_SGE		= SRP_MAX_IMM_SGE + 1,
+	/*
+	 * Choose the immediate data offset such that a 32 byte CDB still fits.
+	 */
+	SRP_IMM_DATA_OFFSET	= sizeof(struct srp_cmd) +
+				  SRP_MAX_ADD_CDB_LEN +
+				  sizeof(struct srp_imm_buf),
 };
 
 enum srp_target_state {
@@ -130,6 +141,8 @@
 /**
  * struct srp_rdma_ch
  * @comp_vector: Completion vector used by this RDMA channel.
+ * @max_it_iu_len: Maximum initiator-to-target information unit length.
+ * @max_ti_iu_len: Maximum target-to-initiator information unit length.
  */
 struct srp_rdma_ch {
 	/* These are RW in the hot path, and commonly used together */
@@ -146,6 +159,9 @@
 		struct ib_fmr_pool     *fmr_pool;
 		struct srp_fr_pool     *fr_pool;
 	};
+	uint32_t		max_it_iu_len;
+	uint32_t		max_ti_iu_len;
+	bool			use_imm_data;
 
 	/* Everything above this point is used in the hot path of
 	 * command processing. Try to keep them packed into cachelines.
@@ -169,7 +185,6 @@
 	struct srp_iu	      **tx_ring;
 	struct srp_iu	      **rx_ring;
 	struct srp_request     *req_ring;
-	int			max_ti_iu_len;
 	int			comp_vector;
 
 	u64			tsk_mgmt_tag;
@@ -194,7 +209,6 @@
 	u32			ch_count;
 	u32			lkey;
 	enum srp_target_state	state;
-	unsigned int		max_iu_len;
 	unsigned int		cmd_sg_cnt;
 	unsigned int		indirect_size;
 	bool			allow_ext_sg;
@@ -259,6 +273,8 @@
 	void		       *buf;
 	size_t			size;
 	enum dma_data_direction	direction;
+	u32			num_sge;
+	struct ib_sge		sge[SRP_MAX_SGE];
 	struct ib_cqe		cqe;
 };
 
diff --git a/drivers/infiniband/ulp/srpt/Kconfig b/drivers/infiniband/ulp/srpt/Kconfig
index fb8b718..ce7567c 100644
--- a/drivers/infiniband/ulp/srpt/Kconfig
+++ b/drivers/infiniband/ulp/srpt/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_SRPT
 	tristate "InfiniBand SCSI RDMA Protocol target support"
 	depends on INFINIBAND && INFINIBAND_ADDR_TRANS && TARGET_CORE
diff --git a/drivers/infiniband/ulp/srpt/Makefile b/drivers/infiniband/ulp/srpt/Makefile
index e3ee4bd..2d13792 100644
--- a/drivers/infiniband/ulp/srpt/Makefile
+++ b/drivers/infiniband/ulp/srpt/Makefile
@@ -1,2 +1,2 @@
-ccflags-y			:= -Idrivers/target
+# SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_INFINIBAND_SRPT)	+= ib_srpt.o
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index f37cbad..e25c70a 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -51,8 +51,6 @@
 
 /* Name of this kernel module. */
 #define DRV_NAME		"ib_srpt"
-#define DRV_VERSION		"2.0.0"
-#define DRV_RELDATE		"2011-02-14"
 
 #define SRPT_ID_STRING	"Linux SRP target"
 
@@ -60,8 +58,7 @@
 #define pr_fmt(fmt) DRV_NAME " " fmt
 
 MODULE_AUTHOR("Vu Pham and Bart Van Assche");
-MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol target "
-		   "v" DRV_VERSION " (" DRV_RELDATE ")");
+MODULE_DESCRIPTION("SCSI RDMA Protocol target driver");
 MODULE_LICENSE("Dual BSD/GPL");
 
 /*
@@ -89,8 +86,7 @@
 module_param_call(srpt_service_guid, NULL, srpt_get_u64_x, &srpt_service_guid,
 		  0444);
 MODULE_PARM_DESC(srpt_service_guid,
-		 "Using this value for ioc_guid, id_ext, and cm_listen_id"
-		 " instead of using the node_guid of the first HCA.");
+		 "Using this value for ioc_guid, id_ext, and cm_listen_id instead of using the node_guid of the first HCA.");
 
 static struct ib_client srpt_client;
 /* Protects both rdma_cm_port and rdma_cm_id. */
@@ -148,7 +144,7 @@
 		return;
 
 	pr_debug("ASYNC event= %d on device= %s\n", event->event,
-		 sdev->device->name);
+		 dev_name(&sdev->device->dev));
 
 	switch (event->event) {
 	case IB_EVENT_PORT_ERR:
@@ -462,7 +458,7 @@
 static void srpt_mad_send_handler(struct ib_mad_agent *mad_agent,
 				  struct ib_mad_send_wc *mad_wc)
 {
-	rdma_destroy_ah(mad_wc->send_buf->ah);
+	rdma_destroy_ah(mad_wc->send_buf->ah, RDMA_DESTROY_AH_SLEEPABLE);
 	ib_free_send_mad(mad_wc->send_buf);
 }
 
@@ -529,7 +525,7 @@
 	ib_free_send_mad(rsp);
 
 err_rsp:
-	rdma_destroy_ah(ah);
+	rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE);
 err:
 	ib_free_recv_mad(mad_wc);
 }
@@ -652,31 +648,33 @@
  * srpt_alloc_ioctx - allocate a SRPT I/O context structure
  * @sdev: SRPT HCA pointer.
  * @ioctx_size: I/O context size.
- * @dma_size: Size of I/O context DMA buffer.
+ * @buf_cache: I/O buffer cache.
  * @dir: DMA data direction.
  */
 static struct srpt_ioctx *srpt_alloc_ioctx(struct srpt_device *sdev,
-					   int ioctx_size, int dma_size,
+					   int ioctx_size,
+					   struct kmem_cache *buf_cache,
 					   enum dma_data_direction dir)
 {
 	struct srpt_ioctx *ioctx;
 
-	ioctx = kmalloc(ioctx_size, GFP_KERNEL);
+	ioctx = kzalloc(ioctx_size, GFP_KERNEL);
 	if (!ioctx)
 		goto err;
 
-	ioctx->buf = kmalloc(dma_size, GFP_KERNEL);
+	ioctx->buf = kmem_cache_alloc(buf_cache, GFP_KERNEL);
 	if (!ioctx->buf)
 		goto err_free_ioctx;
 
-	ioctx->dma = ib_dma_map_single(sdev->device, ioctx->buf, dma_size, dir);
+	ioctx->dma = ib_dma_map_single(sdev->device, ioctx->buf,
+				       kmem_cache_size(buf_cache), dir);
 	if (ib_dma_mapping_error(sdev->device, ioctx->dma))
 		goto err_free_buf;
 
 	return ioctx;
 
 err_free_buf:
-	kfree(ioctx->buf);
+	kmem_cache_free(buf_cache, ioctx->buf);
 err_free_ioctx:
 	kfree(ioctx);
 err:
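
srpt_alloc_ioctx() above now draws its DMA buffer from a kmem_cache instead of kmalloc(), so buffer size and alignment are decided once at cache creation and kmem_cache_size() recovers the mapping length at free time. A hedged sketch of that pattern; demo_cache and demo_setup() are illustrative names, and the 512-byte alignment mirrors the patch's choice for immediate data.

/* Sketch only; error handling kept minimal. */
#include <linux/slab.h>

static struct kmem_cache *demo_cache;

static int demo_setup(unsigned int buf_size)
{
	void *buf;

	demo_cache = kmem_cache_create("demo-req-buf", buf_size, 512, 0, NULL);
	if (!demo_cache)
		return -ENOMEM;

	buf = kmem_cache_alloc(demo_cache, GFP_KERNEL);
	if (!buf) {
		kmem_cache_destroy(demo_cache);
		return -ENOMEM;
	}

	/* ... map buf for DMA over kmem_cache_size(demo_cache) bytes ... */

	kmem_cache_free(demo_cache, buf);
	kmem_cache_destroy(demo_cache);
	return 0;
}
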
@@ -687,17 +685,19 @@
  * srpt_free_ioctx - free a SRPT I/O context structure
  * @sdev: SRPT HCA pointer.
  * @ioctx: I/O context pointer.
- * @dma_size: Size of I/O context DMA buffer.
+ * @buf_cache: I/O buffer cache.
  * @dir: DMA data direction.
  */
 static void srpt_free_ioctx(struct srpt_device *sdev, struct srpt_ioctx *ioctx,
-			    int dma_size, enum dma_data_direction dir)
+			    struct kmem_cache *buf_cache,
+			    enum dma_data_direction dir)
 {
 	if (!ioctx)
 		return;
 
-	ib_dma_unmap_single(sdev->device, ioctx->dma, dma_size, dir);
-	kfree(ioctx->buf);
+	ib_dma_unmap_single(sdev->device, ioctx->dma,
+			    kmem_cache_size(buf_cache), dir);
+	kmem_cache_free(buf_cache, ioctx->buf);
 	kfree(ioctx);
 }
 
@@ -706,33 +706,38 @@
  * @sdev:       Device to allocate the I/O context ring for.
  * @ring_size:  Number of elements in the I/O context ring.
  * @ioctx_size: I/O context size.
- * @dma_size:   DMA buffer size.
+ * @buf_cache:  I/O buffer cache.
+ * @alignment_offset: Offset in each ring buffer at which the SRP information
+ *		unit starts.
  * @dir:        DMA data direction.
  */
 static struct srpt_ioctx **srpt_alloc_ioctx_ring(struct srpt_device *sdev,
 				int ring_size, int ioctx_size,
-				int dma_size, enum dma_data_direction dir)
+				struct kmem_cache *buf_cache,
+				int alignment_offset,
+				enum dma_data_direction dir)
 {
 	struct srpt_ioctx **ring;
 	int i;
 
-	WARN_ON(ioctx_size != sizeof(struct srpt_recv_ioctx)
-		&& ioctx_size != sizeof(struct srpt_send_ioctx));
+	WARN_ON(ioctx_size != sizeof(struct srpt_recv_ioctx) &&
+		ioctx_size != sizeof(struct srpt_send_ioctx));
 
 	ring = kvmalloc_array(ring_size, sizeof(ring[0]), GFP_KERNEL);
 	if (!ring)
 		goto out;
 	for (i = 0; i < ring_size; ++i) {
-		ring[i] = srpt_alloc_ioctx(sdev, ioctx_size, dma_size, dir);
+		ring[i] = srpt_alloc_ioctx(sdev, ioctx_size, buf_cache, dir);
 		if (!ring[i])
 			goto err;
 		ring[i]->index = i;
+		ring[i]->offset = alignment_offset;
 	}
 	goto out;
 
 err:
 	while (--i >= 0)
-		srpt_free_ioctx(sdev, ring[i], dma_size, dir);
+		srpt_free_ioctx(sdev, ring[i], buf_cache, dir);
 	kvfree(ring);
 	ring = NULL;
 out:
@@ -744,12 +749,13 @@
  * @ioctx_ring: I/O context ring to be freed.
  * @sdev: SRPT HCA pointer.
  * @ring_size: Number of ring elements.
- * @dma_size: Size of I/O context DMA buffer.
+ * @buf_cache: I/O buffer cache.
  * @dir: DMA data direction.
  */
 static void srpt_free_ioctx_ring(struct srpt_ioctx **ioctx_ring,
 				 struct srpt_device *sdev, int ring_size,
-				 int dma_size, enum dma_data_direction dir)
+				 struct kmem_cache *buf_cache,
+				 enum dma_data_direction dir)
 {
 	int i;
 
@@ -757,7 +763,7 @@
 		return;
 
 	for (i = 0; i < ring_size; ++i)
-		srpt_free_ioctx(sdev, ioctx_ring[i], dma_size, dir);
+		srpt_free_ioctx(sdev, ioctx_ring[i], buf_cache, dir);
 	kvfree(ioctx_ring);
 }
 
@@ -819,7 +825,7 @@
 	struct ib_recv_wr wr;
 
 	BUG_ON(!sdev);
-	list.addr = ioctx->ioctx.dma;
+	list.addr = ioctx->ioctx.dma + ioctx->ioctx.offset;
 	list.length = srp_max_req_size;
 	list.lkey = sdev->lkey;
 
@@ -985,23 +991,28 @@
 
 /**
  * srpt_get_desc_tbl - parse the data descriptors of a SRP_CMD request
- * @ioctx: Pointer to the I/O context associated with the request.
+ * @recv_ioctx: I/O context associated with the received command @srp_cmd.
+ * @ioctx: I/O context that will be used for responding to the initiator.
  * @srp_cmd: Pointer to the SRP_CMD request data.
  * @dir: Pointer to the variable to which the transfer direction will be
  *   written.
- * @sg: [out] scatterlist allocated for the parsed SRP_CMD.
+ * @sg: [out] scatterlist for the parsed SRP_CMD.
  * @sg_cnt: [out] length of @sg.
  * @data_len: Pointer to the variable to which the total data length of all
  *   descriptors in the SRP_CMD request will be written.
+ * @imm_data_offset: [in] Offset in SRP_CMD requests at which immediate data
+ *   starts.
  *
  * This function initializes ioctx->nrbuf and ioctx->r_bufs.
  *
  * Returns -EINVAL when the SRP_CMD request contains inconsistent descriptors;
  * -ENOMEM when memory allocation fails and zero upon success.
  */
-static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx,
+static int srpt_get_desc_tbl(struct srpt_recv_ioctx *recv_ioctx,
+		struct srpt_send_ioctx *ioctx,
 		struct srp_cmd *srp_cmd, enum dma_data_direction *dir,
-		struct scatterlist **sg, unsigned *sg_cnt, u64 *data_len)
+		struct scatterlist **sg, unsigned int *sg_cnt, u64 *data_len,
+		u16 imm_data_offset)
 {
 	BUG_ON(!dir);
 	BUG_ON(!data_len);
@@ -1025,7 +1036,7 @@
 
 	if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_DIRECT) ||
 	    ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_DIRECT)) {
-	    	struct srp_direct_buf *db = srpt_get_desc_buf(srp_cmd);
+		struct srp_direct_buf *db = srpt_get_desc_buf(srp_cmd);
 
 		*data_len = be32_to_cpu(db->len);
 		return srpt_alloc_rw_ctxs(ioctx, db, 1, sg, sg_cnt);
@@ -1037,8 +1048,7 @@
 
 		if (nbufs >
 		    (srp_cmd->data_out_desc_cnt + srp_cmd->data_in_desc_cnt)) {
-			pr_err("received unsupported SRP_CMD request"
-			       " type (%u out + %u in != %u / %zu)\n",
+			pr_err("received unsupported SRP_CMD request type (%u out + %u in != %u / %zu)\n",
 			       srp_cmd->data_out_desc_cnt,
 			       srp_cmd->data_in_desc_cnt,
 			       be32_to_cpu(idb->table_desc.len),
@@ -1049,6 +1059,40 @@
 		*data_len = be32_to_cpu(idb->len);
 		return srpt_alloc_rw_ctxs(ioctx, idb->desc_list, nbufs,
 				sg, sg_cnt);
+	} else if ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_IMM) {
+		struct srp_imm_buf *imm_buf = srpt_get_desc_buf(srp_cmd);
+		void *data = (void *)srp_cmd + imm_data_offset;
+		uint32_t len = be32_to_cpu(imm_buf->len);
+		uint32_t req_size = imm_data_offset + len;
+
+		if (req_size > srp_max_req_size) {
+			pr_err("Immediate data (length %d + %d) exceeds request size %d\n",
+			       imm_data_offset, len, srp_max_req_size);
+			return -EINVAL;
+		}
+		if (recv_ioctx->byte_len < req_size) {
+			pr_err("Received too few data - %d < %d\n",
+			       recv_ioctx->byte_len, req_size);
+			return -EIO;
+		}
+		/*
+		 * The immediate data buffer descriptor must occur before the
+		 * immediate data itself.
+		 */
+		if ((void *)(imm_buf + 1) > (void *)data) {
+			pr_err("Received invalid write request\n");
+			return -EINVAL;
+		}
+		*data_len = len;
+		ioctx->recv_ioctx = recv_ioctx;
+		if ((uintptr_t)data & 511) {
+			pr_warn_once("Internal error - the receive buffers are not aligned properly.\n");
+			return -EINVAL;
+		}
+		sg_init_one(&ioctx->imm_sg, data, len);
+		*sg = &ioctx->imm_sg;
+		*sg_cnt = 1;
+		return 0;
 	} else {
 		*data_len = 0;
 		return 0;
@@ -1173,24 +1217,18 @@
 static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch)
 {
 	struct srpt_send_ioctx *ioctx;
-	unsigned long flags;
+	int tag, cpu;
 
 	BUG_ON(!ch);
 
-	ioctx = NULL;
-	spin_lock_irqsave(&ch->spinlock, flags);
-	if (!list_empty(&ch->free_list)) {
-		ioctx = list_first_entry(&ch->free_list,
-					 struct srpt_send_ioctx, free_list);
-		list_del(&ioctx->free_list);
-	}
-	spin_unlock_irqrestore(&ch->spinlock, flags);
+	tag = sbitmap_queue_get(&ch->sess->sess_tag_pool, &cpu);
+	if (tag < 0)
+		return NULL;
 
-	if (!ioctx)
-		return ioctx;
-
+	ioctx = ch->ioctx_ring[tag];
 	BUG_ON(ioctx->ch != ch);
 	ioctx->state = SRPT_STATE_NEW;
+	WARN_ON_ONCE(ioctx->recv_ioctx);
 	ioctx->n_rdma = 0;
 	ioctx->n_rw_ctx = 0;
 	ioctx->queue_status_only = false;
@@ -1200,6 +1238,8 @@
 	 */
 	memset(&ioctx->cmd, 0, sizeof(ioctx->cmd));
 	memset(&ioctx->sense_data, 0, sizeof(ioctx->sense_data));
+	ioctx->cmd.map_tag = tag;
+	ioctx->cmd.map_cpu = cpu;
 
 	return ioctx;
 }
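
srpt_get_send_ioctx() above switches from a driver-private free list to tags drawn from the session's sess_tag_pool. The allocate/free cycle behind such a pool looks roughly like the sketch below, which uses sbitmap_queue directly for illustration (the driver itself goes through target-core); demo_tags() and the depth of 64 are assumptions, not driver values.

/* Sketch of a sbitmap-backed tag pool. */
#include <linux/numa.h>
#include <linux/sbitmap.h>
#include <linux/slab.h>

static int demo_tags(void)
{
	struct sbitmap_queue pool;
	unsigned int cpu;
	int tag, ret;

	ret = sbitmap_queue_init_node(&pool, 64, -1, false, GFP_KERNEL,
				      NUMA_NO_NODE);
	if (ret)
		return ret;

	tag = sbitmap_queue_get(&pool, &cpu);	/* -1 when the pool is empty */
	if (tag >= 0)
		sbitmap_queue_clear(&pool, tag, cpu);

	sbitmap_queue_free(&pool);
	return 0;
}
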
@@ -1352,8 +1392,8 @@
 		BUILD_BUG_ON(MIN_MAX_RSP_SIZE <= sizeof(*srp_rsp));
 		max_sense_len = ch->max_ti_iu_len - sizeof(*srp_rsp);
 		if (sense_data_len > max_sense_len) {
-			pr_warn("truncated sense data from %d to %d"
-				" bytes\n", sense_data_len, max_sense_len);
+			pr_warn("truncated sense data from %d to %d bytes\n",
+				sense_data_len, max_sense_len);
 			sense_data_len = max_sense_len;
 		}
 
@@ -1433,7 +1473,7 @@
 
 	BUG_ON(!send_ioctx);
 
-	srp_cmd = recv_ioctx->ioctx.buf;
+	srp_cmd = recv_ioctx->ioctx.buf + recv_ioctx->ioctx.offset;
 	cmd = &send_ioctx->cmd;
 	cmd->tag = srp_cmd->tag;
 
@@ -1453,14 +1493,14 @@
 		break;
 	}
 
-	rc = srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &sg, &sg_cnt,
-			&data_len);
+	rc = srpt_get_desc_tbl(recv_ioctx, send_ioctx, srp_cmd, &dir,
+			       &sg, &sg_cnt, &data_len, ch->imm_data_offset);
 	if (rc) {
 		if (rc != -EAGAIN) {
 			pr_err("0x%llx: parsing SRP descriptor table failed.\n",
 			       srp_cmd->tag);
 		}
-		goto release_ioctx;
+		goto busy;
 	}
 
 	rc = target_submit_cmd_map_sgls(cmd, ch->sess, srp_cmd->cdb,
@@ -1471,13 +1511,12 @@
 	if (rc != 0) {
 		pr_debug("target_submit_cmd() returned %d for tag %#llx\n", rc,
 			 srp_cmd->tag);
-		goto release_ioctx;
+		goto busy;
 	}
 	return;
 
-release_ioctx:
-	send_ioctx->state = SRPT_STATE_DONE;
-	srpt_release_cmd(cmd);
+busy:
+	target_send_busy(cmd);
 }
 
 static int srp_tmr_to_tcm(int fn)
@@ -1521,7 +1560,7 @@
 
 	BUG_ON(!send_ioctx);
 
-	srp_tsk = recv_ioctx->ioctx.buf;
+	srp_tsk = recv_ioctx->ioctx.buf + recv_ioctx->ioctx.offset;
 	cmd = &send_ioctx->cmd;
 
 	pr_debug("recv tsk_mgmt fn %d for task_tag %lld and cmd tag %lld ch %p sess %p\n",
@@ -1537,11 +1576,9 @@
 			       TARGET_SCF_ACK_KREF);
 	if (rc != 0) {
 		send_ioctx->cmd.se_tmr_req->response = TMR_FUNCTION_REJECTED;
-		goto fail;
+		cmd->se_tfo->queue_tm_rsp(cmd);
 	}
 	return;
-fail:
-	transport_send_check_condition_and_sense(cmd, 0, 0); // XXX:
 }
 
 /**
@@ -1564,10 +1601,11 @@
 		goto push;
 
 	ib_dma_sync_single_for_cpu(ch->sport->sdev->device,
-				   recv_ioctx->ioctx.dma, srp_max_req_size,
+				   recv_ioctx->ioctx.dma,
+				   recv_ioctx->ioctx.offset + srp_max_req_size,
 				   DMA_FROM_DEVICE);
 
-	srp_cmd = recv_ioctx->ioctx.buf;
+	srp_cmd = recv_ioctx->ioctx.buf + recv_ioctx->ioctx.offset;
 	opcode = srp_cmd->opcode;
 	if (opcode == SRP_CMD || opcode == SRP_TSK_MGMT) {
 		send_ioctx = srpt_get_send_ioctx(ch);
@@ -1604,7 +1642,8 @@
 		break;
 	}
 
-	srpt_post_recv(ch->sport->sdev, ch, recv_ioctx);
+	if (!send_ioctx || !send_ioctx->recv_ioctx)
+		srpt_post_recv(ch->sport->sdev, ch, recv_ioctx);
 	res = true;
 
 out:
@@ -1630,6 +1669,7 @@
 		req_lim = atomic_dec_return(&ch->req_lim);
 		if (unlikely(req_lim < 0))
 			pr_err("req_lim = %d < 0\n", req_lim);
+		ioctx->byte_len = wc->byte_len;
 		srpt_handle_new_iu(ch, ioctx);
 	} else {
 		pr_info_ratelimited("receiving failed for ioctx %p with status %d\n",
@@ -1693,14 +1733,14 @@
 	atomic_add(1 + ioctx->n_rdma, &ch->sq_wr_avail);
 
 	if (wc->status != IB_WC_SUCCESS)
-		pr_info("sending response for ioctx 0x%p failed"
-			" with status %d\n", ioctx, wc->status);
+		pr_info("sending response for ioctx 0x%p failed with status %d\n",
+			ioctx, wc->status);
 
 	if (state != SRPT_STATE_DONE) {
 		transport_generic_free_cmd(&ioctx->cmd, 0);
 	} else {
-		pr_err("IB completion has been received too late for"
-		       " wr_id = %u.\n", ioctx->ioctx.index);
+		pr_err("IB completion has been received too late for wr_id = %u.\n",
+		       ioctx->ioctx.index);
 	}
 
 	srpt_process_wait_list(ch);
@@ -1727,8 +1767,8 @@
 		goto out;
 
 retry:
-	ch->cq = ib_alloc_cq(sdev->device, ch, ch->rq_size + sq_size,
-			0 /* XXX: spread CQs */, IB_POLL_WORKQUEUE);
+	ch->cq = ib_alloc_cq_any(sdev->device, ch, ch->rq_size + sq_size,
+				 IB_POLL_WORKQUEUE);
 	if (IS_ERR(ch->cq)) {
 		ret = PTR_ERR(ch->cq);
 		pr_err("failed to create CQ cqe= %d ret= %d\n",
@@ -1754,6 +1794,8 @@
 	qp_init->cap.max_rdma_ctxs = sq_size / 2;
 	qp_init->cap.max_send_sge = min(attrs->max_send_sge,
 					SRPT_MAX_SG_PER_WQE);
+	qp_init->cap.max_recv_sge = min(attrs->max_recv_sge,
+					SRPT_MAX_SG_PER_WQE);
 	qp_init->port_num = ch->sport->port;
 	if (sdev->use_srq) {
 		qp_init->srq = sdev->srq;
@@ -1941,7 +1983,8 @@
 			if (srpt_disconnect_ch(ch) >= 0)
 				pr_info("Closing channel %s because target %s_%d has been disabled\n",
 					ch->sess_name,
-					sport->sdev->device->name, sport->port);
+					dev_name(&sport->sdev->device->dev),
+					sport->port);
 			srpt_close_ch(ch);
 		}
 	}
@@ -2009,6 +2052,14 @@
 	kfree_rcu(ch, rcu);
 }
 
+/*
+ * Shut down the SCSI target session, tell the connection manager to
+ * disconnect the associated RDMA channel, transition the QP to the error
+ * state and remove the channel from the channel list. This function is
+ * typically called from inside srpt_zerolength_write_done(). Concurrent
+ * srpt_zerolength_write() calls from inside srpt_close_ch() are possible
+ * as long as the channel is on sport->nexus_list.
+ */
 static void srpt_release_channel_work(struct work_struct *w)
 {
 	struct srpt_rdma_ch *ch;
@@ -2036,21 +2087,25 @@
 	else
 		ib_destroy_cm_id(ch->ib_cm.cm_id);
 
-	srpt_destroy_ch_ib(ch);
-
-	srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring,
-			     ch->sport->sdev, ch->rq_size,
-			     ch->max_rsp_size, DMA_TO_DEVICE);
-
-	srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_recv_ring,
-			     sdev, ch->rq_size,
-			     srp_max_req_size, DMA_FROM_DEVICE);
-
 	sport = ch->sport;
 	mutex_lock(&sport->mutex);
 	list_del_rcu(&ch->list);
 	mutex_unlock(&sport->mutex);
 
+	srpt_destroy_ch_ib(ch);
+
+	srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring,
+			     ch->sport->sdev, ch->rq_size,
+			     ch->rsp_buf_cache, DMA_TO_DEVICE);
+
+	kmem_cache_destroy(ch->rsp_buf_cache);
+
+	srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_recv_ring,
+			     sdev, ch->rq_size,
+			     ch->req_buf_cache, DMA_FROM_DEVICE);
+
+	kmem_cache_destroy(ch->req_buf_cache);
+
 	wake_up(&sport->ch_releaseQ);
 
 	kref_put(&ch->kref, srpt_free_ch);
@@ -2088,7 +2143,7 @@
 	struct srpt_rdma_ch *ch = NULL;
 	char i_port_id[36];
 	u32 it_iu_len;
-	int i, ret;
+	int i, tag_num, tag_size, ret;
 
 	WARN_ON_ONCE(irqs_disabled());
 
@@ -2127,7 +2182,7 @@
 	if (!sport->enabled) {
 		rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES);
 		pr_info("rejected SRP_LOGIN_REQ because target port %s_%d has not yet been enabled\n",
-			sport->sdev->device->name, port_num);
+			dev_name(&sport->sdev->device->dev), port_num);
 		goto reject;
 	}
 
@@ -2173,32 +2228,57 @@
 	INIT_LIST_HEAD(&ch->cmd_wait_list);
 	ch->max_rsp_size = ch->sport->port_attrib.srp_max_rsp_size;
 
+	ch->rsp_buf_cache = kmem_cache_create("srpt-rsp-buf", ch->max_rsp_size,
+					      512, 0, NULL);
+	if (!ch->rsp_buf_cache)
+		goto free_ch;
+
 	ch->ioctx_ring = (struct srpt_send_ioctx **)
 		srpt_alloc_ioctx_ring(ch->sport->sdev, ch->rq_size,
 				      sizeof(*ch->ioctx_ring[0]),
-				      ch->max_rsp_size, DMA_TO_DEVICE);
+				      ch->rsp_buf_cache, 0, DMA_TO_DEVICE);
 	if (!ch->ioctx_ring) {
 		pr_err("rejected SRP_LOGIN_REQ because creating a new QP SQ ring failed.\n");
 		rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES);
-		goto free_ch;
+		goto free_rsp_cache;
 	}
 
-	INIT_LIST_HEAD(&ch->free_list);
-	for (i = 0; i < ch->rq_size; i++) {
+	for (i = 0; i < ch->rq_size; i++)
 		ch->ioctx_ring[i]->ch = ch;
-		list_add_tail(&ch->ioctx_ring[i]->free_list, &ch->free_list);
-	}
 	if (!sdev->use_srq) {
+		u16 imm_data_offset = req->req_flags & SRP_IMMED_REQUESTED ?
+			be16_to_cpu(req->imm_data_offset) : 0;
+		u16 alignment_offset;
+		u32 req_sz;
+
+		if (req->req_flags & SRP_IMMED_REQUESTED)
+			pr_debug("imm_data_offset = %d\n",
+				 be16_to_cpu(req->imm_data_offset));
+		if (imm_data_offset >= sizeof(struct srp_cmd)) {
+			ch->imm_data_offset = imm_data_offset;
+			rsp->rsp_flags |= SRP_LOGIN_RSP_IMMED_SUPP;
+		} else {
+			ch->imm_data_offset = 0;
+		}
+		alignment_offset = round_up(imm_data_offset, 512) -
+			imm_data_offset;
+		req_sz = alignment_offset + imm_data_offset + srp_max_req_size;
+		ch->req_buf_cache = kmem_cache_create("srpt-req-buf", req_sz,
+						      512, 0, NULL);
+		if (!ch->req_buf_cache)
+			goto free_rsp_ring;
+
 		ch->ioctx_recv_ring = (struct srpt_recv_ioctx **)
 			srpt_alloc_ioctx_ring(ch->sport->sdev, ch->rq_size,
 					      sizeof(*ch->ioctx_recv_ring[0]),
-					      srp_max_req_size,
+					      ch->req_buf_cache,
+					      alignment_offset,
 					      DMA_FROM_DEVICE);
 		if (!ch->ioctx_recv_ring) {
 			pr_err("rejected SRP_LOGIN_REQ because creating a new QP RQ ring failed.\n");
 			rej->reason =
 			    cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES);
-			goto free_ring;
+			goto free_recv_cache;
 		}
 		for (i = 0; i < ch->rq_size; i++)
 			INIT_LIST_HEAD(&ch->ioctx_recv_ring[i]->wait_list);
@@ -2218,18 +2298,20 @@
 
 	pr_debug("registering session %s\n", ch->sess_name);
 
+	tag_num = ch->rq_size;
+	tag_size = 1; /* ib_srpt does not use se_sess->sess_cmd_map */
 	if (sport->port_guid_tpg.se_tpg_wwn)
-		ch->sess = target_setup_session(&sport->port_guid_tpg, 0, 0,
-						TARGET_PROT_NORMAL,
+		ch->sess = target_setup_session(&sport->port_guid_tpg, tag_num,
+						tag_size, TARGET_PROT_NORMAL,
 						ch->sess_name, ch, NULL);
 	if (sport->port_gid_tpg.se_tpg_wwn && IS_ERR_OR_NULL(ch->sess))
-		ch->sess = target_setup_session(&sport->port_gid_tpg, 0, 0,
-					TARGET_PROT_NORMAL, i_port_id, ch,
-					NULL);
+		ch->sess = target_setup_session(&sport->port_gid_tpg, tag_num,
+					tag_size, TARGET_PROT_NORMAL, i_port_id,
+					ch, NULL);
 	/* Retry without leading "0x" */
 	if (sport->port_gid_tpg.se_tpg_wwn && IS_ERR_OR_NULL(ch->sess))
-		ch->sess = target_setup_session(&sport->port_gid_tpg, 0, 0,
-						TARGET_PROT_NORMAL,
+		ch->sess = target_setup_session(&sport->port_gid_tpg, tag_num,
+						tag_size, TARGET_PROT_NORMAL,
 						i_port_id + 2, ch, NULL);
 	if (IS_ERR_OR_NULL(ch->sess)) {
 		WARN_ON_ONCE(ch->sess == NULL);
@@ -2248,17 +2330,15 @@
 	if ((req->req_flags & SRP_MTCH_ACTION) == SRP_MULTICHAN_SINGLE) {
 		struct srpt_rdma_ch *ch2;
 
-		rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_NO_CHAN;
-
 		list_for_each_entry(ch2, &nexus->ch_list, list) {
 			if (srpt_disconnect_ch(ch2) < 0)
 				continue;
 			pr_info("Relogin - closed existing channel %s\n",
 				ch2->sess_name);
-			rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_TERMINATED;
+			rsp->rsp_flags |= SRP_LOGIN_RSP_MULTICHAN_TERMINATED;
 		}
 	} else {
-		rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_MAINTAINED;
+		rsp->rsp_flags |= SRP_LOGIN_RSP_MULTICHAN_MAINTAINED;
 	}
 
 	list_add_tail_rcu(&ch->list, &nexus->ch_list);
@@ -2267,7 +2347,7 @@
 		rej->reason = cpu_to_be32(
 				SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES);
 		pr_info("rejected SRP_LOGIN_REQ because target %s_%d is not enabled\n",
-			sdev->device->name, port_num);
+			dev_name(&sdev->device->dev), port_num);
 		mutex_unlock(&sport->mutex);
 		goto reject;
 	}
@@ -2288,7 +2368,7 @@
 	/* create srp_login_response */
 	rsp->opcode = SRP_LOGIN_RSP;
 	rsp->tag = req->tag;
-	rsp->max_it_iu_len = req->req_it_iu_len;
+	rsp->max_it_iu_len = cpu_to_be32(srp_max_req_size);
 	rsp->max_ti_iu_len = req->req_it_iu_len;
 	ch->max_ti_iu_len = it_iu_len;
 	rsp->buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT |
@@ -2352,12 +2432,18 @@
 free_recv_ring:
 	srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_recv_ring,
 			     ch->sport->sdev, ch->rq_size,
-			     srp_max_req_size, DMA_FROM_DEVICE);
+			     ch->req_buf_cache, DMA_FROM_DEVICE);
 
-free_ring:
+free_recv_cache:
+	kmem_cache_destroy(ch->req_buf_cache);
+
+free_rsp_ring:
 	srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring,
 			     ch->sport->sdev, ch->rq_size,
-			     ch->max_rsp_size, DMA_TO_DEVICE);
+			     ch->rsp_buf_cache, DMA_TO_DEVICE);
+
+free_rsp_cache:
+	kmem_cache_destroy(ch->rsp_buf_cache);
 
 free_ch:
 	if (rdma_cm_id)
@@ -2438,6 +2524,7 @@
 	req.req_flags		= req_rdma->req_flags;
 	memcpy(req.initiator_port_id, req_rdma->initiator_port_id, 16);
 	memcpy(req.target_port_id, req_rdma->target_port_id, 16);
+	req.imm_data_offset	= req_rdma->imm_data_offset;
 
 	snprintf(src_addr, sizeof(src_addr), "%pIS",
 		 &cm_id->route.addr.src_addr);
@@ -2607,14 +2694,6 @@
 	return ret;
 }
 
-static int srpt_write_pending_status(struct se_cmd *se_cmd)
-{
-	struct srpt_send_ioctx *ioctx;
-
-	ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd);
-	return ioctx->state == SRPT_STATE_NEED_DATA;
-}
-
 /*
  * srpt_write_pending - Start data transfer from initiator to target (write).
  */
@@ -2628,6 +2707,12 @@
 	enum srpt_command_state new_state;
 	int ret, i;
 
+	if (ioctx->recv_ioctx) {
+		srpt_set_cmd_state(ioctx, SRPT_STATE_DATA_IN);
+		target_execute_cmd(&ioctx->cmd);
+		return 0;
+	}
+
 	new_state = srpt_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA);
 	WARN_ON(new_state == SRPT_STATE_DONE);
 
@@ -2708,7 +2793,7 @@
 		break;
 	}
 
-	if (unlikely(WARN_ON_ONCE(state == SRPT_STATE_CMD_RSP_SENT)))
+	if (WARN_ON_ONCE(state == SRPT_STATE_CMD_RSP_SENT))
 		return;
 
 	/* For read commands, transfer the data to the initiator. */
@@ -2785,8 +2870,19 @@
 	srpt_queue_response(cmd);
 }
 
+/*
+ * This function is called for aborted commands if no response is sent to the
+ * initiator. Make sure that the credits freed by aborting a command are
+ * returned to the initiator the next time a response is sent by incrementing
+ * ch->req_lim_delta.
+ */
 static void srpt_aborted_task(struct se_cmd *cmd)
 {
+	struct srpt_send_ioctx *ioctx = container_of(cmd,
+				struct srpt_send_ioctx, cmd);
+	struct srpt_rdma_ch *ch = ioctx->ch;
+
+	atomic_inc(&ch->req_lim_delta);
 }
 
 static int srpt_queue_status(struct se_cmd *cmd)
@@ -2842,7 +2938,7 @@
 	while (wait_event_timeout(sport->ch_releaseQ,
 				  srpt_ch_list_empty(sport), 5 * HZ) <= 0) {
 		pr_info("%s_%d: waiting for session unregistration ...\n",
-			sport->sdev->device->name, sport->port);
+			dev_name(&sport->sdev->device->dev), sport->port);
 		rcu_read_lock();
 		list_for_each_entry(nexus, &sport->nexus_list, entry) {
 			list_for_each_entry(ch, &nexus->ch_list, list) {
@@ -2907,7 +3003,9 @@
 
 	ib_destroy_srq(sdev->srq);
 	srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev,
-			     sdev->srq_size, srp_max_req_size, DMA_FROM_DEVICE);
+			     sdev->srq_size, sdev->req_buf_cache,
+			     DMA_FROM_DEVICE);
+	kmem_cache_destroy(sdev->req_buf_cache);
 	sdev->srq = NULL;
 }
 
@@ -2932,16 +3030,19 @@
 	}
 
 	pr_debug("create SRQ #wr= %d max_allow=%d dev= %s\n", sdev->srq_size,
-		 sdev->device->attrs.max_srq_wr, device->name);
+		 sdev->device->attrs.max_srq_wr, dev_name(&device->dev));
+
+	sdev->req_buf_cache = kmem_cache_create("srpt-srq-req-buf",
+						srp_max_req_size, 0, 0, NULL);
+	if (!sdev->req_buf_cache)
+		goto free_srq;
 
 	sdev->ioctx_ring = (struct srpt_recv_ioctx **)
 		srpt_alloc_ioctx_ring(sdev, sdev->srq_size,
 				      sizeof(*sdev->ioctx_ring[0]),
-				      srp_max_req_size, DMA_FROM_DEVICE);
-	if (!sdev->ioctx_ring) {
-		ib_destroy_srq(srq);
-		return -ENOMEM;
-	}
+				      sdev->req_buf_cache, 0, DMA_FROM_DEVICE);
+	if (!sdev->ioctx_ring)
+		goto free_cache;
 
 	sdev->use_srq = true;
 	sdev->srq = srq;
@@ -2952,6 +3053,13 @@
 	}
 
 	return 0;
+
+free_cache:
+	kmem_cache_destroy(sdev->req_buf_cache);
+
+free_srq:
+	ib_destroy_srq(srq);
+	return -ENOMEM;
 }
 
 static int srpt_use_srq(struct srpt_device *sdev, bool use_srq)
@@ -2965,8 +3073,8 @@
 	} else if (use_srq && !sdev->srq) {
 		ret = srpt_alloc_srq(sdev);
 	}
-	pr_debug("%s(%s): use_srq = %d; ret = %d\n", __func__, device->name,
-		 sdev->use_srq, ret);
+	pr_debug("%s(%s): use_srq = %d; ret = %d\n", __func__,
+		 dev_name(&device->dev), sdev->use_srq, ret);
 	return ret;
 }
 
@@ -3014,9 +3122,8 @@
 	}
 
 	/* print out target login information */
-	pr_debug("Target login info: id_ext=%016llx,ioc_guid=%016llx,"
-		 "pkey=ffff,service_id=%016llx\n", srpt_service_guid,
-		 srpt_service_guid, srpt_service_guid);
+	pr_debug("Target login info: id_ext=%016llx,ioc_guid=%016llx,pkey=ffff,service_id=%016llx\n",
+		 srpt_service_guid, srpt_service_guid, srpt_service_guid);
 
 	/*
 	 * We do not have a consistent service_id (ie. also id_ext of target_id)
@@ -3052,7 +3159,7 @@
 
 		if (srpt_refresh_port(sport)) {
 			pr_err("MAD registration failed for %s-%d.\n",
-			       sdev->device->name, i);
+			       dev_name(&sdev->device->dev), i);
 			goto err_event;
 		}
 	}
@@ -3063,7 +3170,7 @@
 
 out:
 	ib_set_client_data(device, &srpt_client, sdev);
-	pr_debug("added %s.\n", device->name);
+	pr_debug("added %s.\n", dev_name(&device->dev));
 	return;
 
 err_event:
@@ -3078,7 +3185,7 @@
 	kfree(sdev);
 err:
 	sdev = NULL;
-	pr_info("%s(%s) failed.\n", __func__, device->name);
+	pr_info("%s(%s) failed.\n", __func__, dev_name(&device->dev));
 	goto out;
 }
 
@@ -3093,7 +3200,8 @@
 	int i;
 
 	if (!sdev) {
-		pr_info("%s(%s): nothing to do.\n", __func__, device->name);
+		pr_info("%s(%s): nothing to do.\n", __func__,
+			dev_name(&device->dev));
 		return;
 	}
 
@@ -3145,11 +3253,6 @@
 	return 0;
 }
 
-static char *srpt_get_fabric_name(void)
-{
-	return "srpt";
-}
-
 static struct srpt_port *srpt_tpg_to_sport(struct se_portal_group *tpg)
 {
 	return tpg->se_tpg_wwn->priv;
@@ -3180,19 +3283,23 @@
 	struct srpt_send_ioctx *ioctx = container_of(se_cmd,
 				struct srpt_send_ioctx, cmd);
 	struct srpt_rdma_ch *ch = ioctx->ch;
-	unsigned long flags;
+	struct srpt_recv_ioctx *recv_ioctx = ioctx->recv_ioctx;
 
 	WARN_ON_ONCE(ioctx->state != SRPT_STATE_DONE &&
 		     !(ioctx->cmd.transport_state & CMD_T_ABORTED));
 
+	if (recv_ioctx) {
+		WARN_ON_ONCE(!list_empty(&recv_ioctx->wait_list));
+		ioctx->recv_ioctx = NULL;
+		srpt_post_recv(ch->sport->sdev, ch, recv_ioctx);
+	}
+
 	if (ioctx->n_rw_ctx) {
 		srpt_free_rw_ctxs(ch, ioctx);
 		ioctx->n_rw_ctx = 0;
 	}
 
-	spin_lock_irqsave(&ch->spinlock, flags);
-	list_add(&ioctx->free_list, &ch->free_list);
-	spin_unlock_irqrestore(&ch->spinlock, flags);
+	target_free_tag(se_cmd->se_sess, se_cmd);
 }
 
 /**
@@ -3570,7 +3677,7 @@
 	struct se_portal_group *se_tpg = to_tpg(item);
 	struct srpt_port *sport = srpt_tpg_to_sport(se_tpg);
 
-	return snprintf(page, PAGE_SIZE, "%d\n", (sport->enabled) ? 1: 0);
+	return snprintf(page, PAGE_SIZE, "%d\n", sport->enabled);
 }
 
 static ssize_t srpt_tpg_enable_store(struct config_item *item,
@@ -3579,7 +3686,7 @@
 	struct se_portal_group *se_tpg = to_tpg(item);
 	struct srpt_port *sport = srpt_tpg_to_sport(se_tpg);
 	unsigned long tmp;
-        int ret;
+	int ret;
 
 	ret = kstrtoul(page, 0, &tmp);
 	if (ret < 0) {
@@ -3615,7 +3722,7 @@
 					     const char *name)
 {
 	struct srpt_port *sport = wwn->priv;
-	static struct se_portal_group *tpg;
+	struct se_portal_group *tpg;
 	int res;
 
 	WARN_ON_ONCE(wwn != &sport->port_guid_wwn &&
@@ -3664,7 +3771,7 @@
 
 static ssize_t srpt_wwn_version_show(struct config_item *item, char *buf)
 {
-	return scnprintf(buf, PAGE_SIZE, "%s\n", DRV_VERSION);
+	return scnprintf(buf, PAGE_SIZE, "\n");
 }
 
 CONFIGFS_ATTR_RO(srpt_wwn_, version);
@@ -3676,8 +3783,7 @@
 
 static const struct target_core_fabric_ops srpt_template = {
 	.module				= THIS_MODULE,
-	.name				= "srpt",
-	.get_fabric_name		= srpt_get_fabric_name,
+	.fabric_name			= "srpt",
 	.tpg_get_wwn			= srpt_get_fabric_wwn,
 	.tpg_get_tag			= srpt_get_tag,
 	.tpg_check_demo_mode		= srpt_check_false,
@@ -3691,7 +3797,6 @@
 	.sess_get_index			= srpt_sess_get_index,
 	.sess_get_initiator_sid		= NULL,
 	.write_pending			= srpt_write_pending,
-	.write_pending_status		= srpt_write_pending_status,
 	.set_default_node_attributes	= srpt_set_default_node_attrs,
 	.get_cmd_state			= srpt_get_tcm_cmd_state,
 	.queue_data_in			= srpt_queue_data_in,
@@ -3728,16 +3833,14 @@
 
 	ret = -EINVAL;
 	if (srp_max_req_size < MIN_MAX_REQ_SIZE) {
-		pr_err("invalid value %d for kernel module parameter"
-		       " srp_max_req_size -- must be at least %d.\n",
+		pr_err("invalid value %d for kernel module parameter srp_max_req_size -- must be at least %d.\n",
 		       srp_max_req_size, MIN_MAX_REQ_SIZE);
 		goto out;
 	}
 
 	if (srpt_srq_size < MIN_SRPT_SRQ_SIZE
 	    || srpt_srq_size > MAX_SRPT_SRQ_SIZE) {
-		pr_err("invalid value %d for kernel module parameter"
-		       " srpt_srq_size -- must be in the range [%d..%d].\n",
+		pr_err("invalid value %d for kernel module parameter srpt_srq_size -- must be in the range [%d..%d].\n",
 		       srpt_srq_size, MIN_SRPT_SRQ_SIZE, MAX_SRPT_SRQ_SIZE);
 		goto out;
 	}
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h
index 444dfd7..ee9f20e 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.h
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.h
@@ -104,10 +104,6 @@
 	SRP_CMD_ORDERED_Q = 0x2,
 	SRP_CMD_ACA = 0x4,
 
-	SRP_LOGIN_RSP_MULTICHAN_NO_CHAN = 0x0,
-	SRP_LOGIN_RSP_MULTICHAN_TERMINATED = 0x1,
-	SRP_LOGIN_RSP_MULTICHAN_MAINTAINED = 0x2,
-
 	SRPT_DEF_SG_TABLESIZE = 128,
 	/*
 	 * An experimentally determined value that avoids that QP creation
@@ -124,11 +120,18 @@
 	MAX_SRPT_RDMA_SIZE = 1U << 24,
 	MAX_SRPT_RSP_SIZE = 1024,
 
+	SRP_MAX_ADD_CDB_LEN = 16,
+	SRP_MAX_IMM_DATA_OFFSET = 80,
+	SRP_MAX_IMM_DATA = 8 * 1024,
 	MIN_MAX_REQ_SIZE = 996,
-	DEFAULT_MAX_REQ_SIZE
-		= sizeof(struct srp_cmd)/*48*/
-		+ sizeof(struct srp_indirect_buf)/*20*/
-		+ 128 * sizeof(struct srp_direct_buf)/*16*/,
+	DEFAULT_MAX_REQ_SIZE_1 = sizeof(struct srp_cmd)/*48*/ +
+				 SRP_MAX_ADD_CDB_LEN +
+				 sizeof(struct srp_indirect_buf)/*20*/ +
+				 128 * sizeof(struct srp_direct_buf)/*16*/,
+	DEFAULT_MAX_REQ_SIZE_2 = SRP_MAX_IMM_DATA_OFFSET +
+				 sizeof(struct srp_imm_buf) + SRP_MAX_IMM_DATA,
+	DEFAULT_MAX_REQ_SIZE = DEFAULT_MAX_REQ_SIZE_1 > DEFAULT_MAX_REQ_SIZE_2 ?
+			       DEFAULT_MAX_REQ_SIZE_1 : DEFAULT_MAX_REQ_SIZE_2,
 
 	MIN_MAX_RSP_SIZE = sizeof(struct srp_rsp)/*36*/ + 4,
 	DEFAULT_MAX_RSP_SIZE = 256, /* leaves 220 bytes for sense data */
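Working the two candidates out with the element sizes noted in the comments
above, and assuming the 4-byte struct srp_imm_buf from <scsi/srp.h>:

	DEFAULT_MAX_REQ_SIZE_1 = 48 + 16 + 20 + 128 * 16 = 2132
	DEFAULT_MAX_REQ_SIZE_2 = 80 + 4  + 8 * 1024      = 8276

so the immediate-data branch dominates and DEFAULT_MAX_REQ_SIZE resolves to
8276 bytes.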
@@ -165,12 +168,14 @@
  * @cqe:   Completion queue element.
  * @buf:   Pointer to the buffer.
  * @dma:   DMA address of the buffer.
+ * @offset: Offset of the first byte in @buf and @dma that is actually used.
  * @index: Index of the I/O context in its ioctx_ring array.
  */
 struct srpt_ioctx {
 	struct ib_cqe		cqe;
 	void			*buf;
 	dma_addr_t		dma;
+	uint32_t		offset;
 	uint32_t		index;
 };
 
@@ -178,12 +183,14 @@
  * struct srpt_recv_ioctx - SRPT receive I/O context
  * @ioctx:     See above.
  * @wait_list: Node for insertion in srpt_rdma_ch.cmd_wait_list.
+ * @byte_len:  Number of bytes in @ioctx.buf.
  */
 struct srpt_recv_ioctx {
 	struct srpt_ioctx	ioctx;
 	struct list_head	wait_list;
+	int			byte_len;
 };
-	
+
 struct srpt_rw_ctx {
 	struct rdma_rw_ctx	rw;
 	struct scatterlist	*sg;
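The new @byte_len field records how many bytes the HCA actually wrote into
the receive buffer, which the immediate-data parsing needs. A receive
completion handler would typically capture it from the work completion;
a hedged sketch (the handler name is an assumption):

	static void example_recv_done(struct ib_cq *cq, struct ib_wc *wc)
	{
		struct srpt_recv_ioctx *ioctx =
			container_of(wc->wr_cqe, struct srpt_recv_ioctx,
				     ioctx.cqe);

		ioctx->byte_len = wc->byte_len;	/* bytes received */
	}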
@@ -194,10 +201,12 @@
  * struct srpt_send_ioctx - SRPT send I/O context
  * @ioctx:       See above.
  * @ch:          Channel pointer.
+ * @recv_ioctx:  Receive I/O context associated with this send I/O context.
+ *		 Only used for processing immediate data.
  * @s_rw_ctx:    @rw_ctxs points here if only a single rw_ctx is needed.
  * @rw_ctxs:     RDMA read/write contexts.
+ * @imm_sg:      Scatterlist for immediate data.
  * @rdma_cqe:    RDMA completion queue element.
- * @free_list:   Node in srpt_rdma_ch.free_list.
  * @state:       I/O context state.
  * @cmd:         Target core command data structure.
  * @sense_data:  SCSI sense data.
@@ -209,12 +218,14 @@
 struct srpt_send_ioctx {
 	struct srpt_ioctx	ioctx;
 	struct srpt_rdma_ch	*ch;
+	struct srpt_recv_ioctx	*recv_ioctx;
 
 	struct srpt_rw_ctx	s_rw_ctx;
 	struct srpt_rw_ctx	*rw_ctxs;
 
+	struct scatterlist	imm_sg;
+
 	struct ib_cqe		rdma_cqe;
-	struct list_head	free_list;
 	enum srpt_command_state	state;
 	struct se_cmd		cmd;
 	u8			n_rdma;
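The new @imm_sg member lets immediate data that arrived inside the SRP_CMD
buffer be handed to the target core as an ordinary one-entry scatterlist
rather than fetched with an RDMA read. A hedged sketch of setting up such a
single-buffer scatterlist (the recv-buffer pointer arithmetic is illustrative,
not copied from the driver):

	#include <linux/scatterlist.h>

	void *data = recv_ioctx->ioctx.buf + ch->imm_data_offset;

	sg_init_one(&send_ioctx->imm_sg, data, len);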
@@ -245,7 +256,10 @@
  * struct srpt_rdma_ch - RDMA channel
  * @nexus:         I_T nexus this channel is associated with.
  * @qp:            IB queue pair used for communicating over this channel.
- * @cm_id:         IB CM ID associated with the channel.
+ * @ib_cm:	   See below.
+ * @ib_cm.cm_id:   IB CM ID associated with the channel.
+ * @rdma_cm:	   See below.
+ * @rdma_cm.cm_id: RDMA CM ID associated with the channel.
  * @cq:            IB completion queue for this channel.
  * @zw_cqe:	   Zero-length write CQE.
  * @rcu:           RCU head.
@@ -259,12 +273,14 @@
  * @req_lim:       request limit: maximum number of requests that may be sent
  *                 by the initiator without having received a response.
  * @req_lim_delta: Number of credits not yet sent back to the initiator.
+ * @imm_data_offset: Offset from start of SRP_CMD for immediate data.
  * @spinlock:      Protects free_list and state.
- * @free_list:     Head of list with free send I/O contexts.
  * @state:         channel state. See also enum rdma_ch_state.
  * @using_rdma_cm: Whether the RDMA/CM or IB/CM is used for this channel.
  * @processing_wait_list: Whether or not cmd_wait_list is being processed.
+ * @rsp_buf_cache: kmem_cache for @ioctx_ring.
  * @ioctx_ring:    Send ring.
+ * @req_buf_cache: kmem_cache for @ioctx_recv_ring.
  * @ioctx_recv_ring: Receive I/O context ring.
  * @list:          Node in srpt_nexus.ch_list.
  * @cmd_wait_list: List of SCSI commands that arrived before the RTU event. This
@@ -297,10 +313,12 @@
 	int			max_ti_iu_len;
 	atomic_t		req_lim;
 	atomic_t		req_lim_delta;
+	u16			imm_data_offset;
 	spinlock_t		spinlock;
-	struct list_head	free_list;
 	enum rdma_ch_state	state;
+	struct kmem_cache	*rsp_buf_cache;
 	struct srpt_send_ioctx	**ioctx_ring;
+	struct kmem_cache	*req_buf_cache;
 	struct srpt_recv_ioctx	**ioctx_recv_ring;
 	struct list_head	list;
 	struct list_head	cmd_wait_list;
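Each channel now carries dedicated kmem_caches for its send and receive ring
buffers, which lets buffer size follow the negotiated request/response sizes
rather than a fixed kmalloc bucket. The slab API involved, in hedged sketch
form (the cache name, buf_size, and the 512-byte alignment are illustrative):

	struct kmem_cache *cache;
	void *buf;

	cache = kmem_cache_create("example-buf", buf_size, 512, 0, NULL);
	if (!cache)
		return -ENOMEM;

	buf = kmem_cache_alloc(cache, GFP_KERNEL);	/* one ring element */
	if (buf)
		kmem_cache_free(cache, buf);

	kmem_cache_destroy(cache);	/* only after every element is freed */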
@@ -395,6 +413,7 @@
  * @srq_size:      SRQ size.
  * @sdev_mutex:	   Serializes use_srq changes.
  * @use_srq:       Whether or not to use SRQ.
+ * @req_buf_cache: kmem_cache for @ioctx_ring buffers.
  * @ioctx_ring:    Per-HCA SRQ.
  * @event_handler: Per-HCA asynchronous IB event handler.
  * @list:          Node in srpt_dev_list.
@@ -409,6 +428,7 @@
 	int			srq_size;
 	struct mutex		sdev_mutex;
 	bool			use_srq;
+	struct kmem_cache	*req_buf_cache;
 	struct srpt_recv_ioctx	**ioctx_ring;
 	struct ib_event_handler	event_handler;
 	struct list_head	list;