Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
index c64e154..75cd696 100644
--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -3,14 +3,14 @@
config RDS
tristate "The Reliable Datagram Sockets Protocol"
depends on INET
- ---help---
+ help
The RDS (Reliable Datagram Sockets) protocol provides reliable,
sequenced delivery of datagrams over Infiniband or TCP.
config RDS_RDMA
tristate "RDS over Infiniband"
depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
- ---help---
+ help
Allow RDS to use Infiniband as a transport.
This transport supports RDMA operations.
@@ -18,7 +18,7 @@
tristate "RDS over TCP"
depends on RDS
depends on IPV6 || !IPV6
- ---help---
+ help
Allow RDS to use TCP as a transport.
This transport does not support RDMA operations.
diff --git a/net/rds/Makefile b/net/rds/Makefile
index e647f9d..8fdc118 100644
--- a/net/rds/Makefile
+++ b/net/rds/Makefile
@@ -7,7 +7,7 @@
obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
rds_rdma-y := rdma_transport.o \
ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
- ib_sysctl.o ib_rdma.o ib_fmr.o ib_frmr.o
+ ib_sysctl.o ib_rdma.o ib_frmr.o
obj-$(CONFIG_RDS_TCP) += rds_tcp.o
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 1a5bf3f..b239120 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -290,8 +290,7 @@
return 0;
}
-static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
- int len)
+static int rds_cancel_sent_to(struct rds_sock *rs, sockptr_t optval, int len)
{
struct sockaddr_in6 sin6;
struct sockaddr_in sin;
@@ -308,14 +307,15 @@
goto out;
} else if (len < sizeof(struct sockaddr_in6)) {
/* Assume IPv4 */
- if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) {
+ if (copy_from_sockptr(&sin, optval,
+ sizeof(struct sockaddr_in))) {
ret = -EFAULT;
goto out;
}
ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr);
sin6.sin6_port = sin.sin_port;
} else {
- if (copy_from_user(&sin6, optval,
+ if (copy_from_sockptr(&sin6, optval,
sizeof(struct sockaddr_in6))) {
ret = -EFAULT;
goto out;
@@ -327,21 +327,20 @@
return ret;
}
-static int rds_set_bool_option(unsigned char *optvar, char __user *optval,
+static int rds_set_bool_option(unsigned char *optvar, sockptr_t optval,
int optlen)
{
int value;
if (optlen < sizeof(int))
return -EINVAL;
- if (get_user(value, (int __user *) optval))
+ if (copy_from_sockptr(&value, optval, sizeof(int)))
return -EFAULT;
*optvar = !!value;
return 0;
}
-static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
- int optlen)
+static int rds_cong_monitor(struct rds_sock *rs, sockptr_t optval, int optlen)
{
int ret;
@@ -358,8 +357,7 @@
return ret;
}
-static int rds_set_transport(struct rds_sock *rs, char __user *optval,
- int optlen)
+static int rds_set_transport(struct rds_sock *rs, sockptr_t optval, int optlen)
{
int t_type;
@@ -369,7 +367,7 @@
if (optlen != sizeof(int))
return -EINVAL;
- if (copy_from_user(&t_type, (int __user *)optval, sizeof(t_type)))
+ if (copy_from_sockptr(&t_type, optval, sizeof(t_type)))
return -EFAULT;
if (t_type < 0 || t_type >= RDS_TRANS_COUNT)
@@ -380,7 +378,7 @@
return rs->rs_transport ? 0 : -ENOPROTOOPT;
}
-static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
+static int rds_enable_recvtstamp(struct sock *sk, sockptr_t optval,
int optlen, int optname)
{
int val, valbool;
@@ -388,7 +386,7 @@
if (optlen != sizeof(int))
return -EFAULT;
- if (get_user(val, (int __user *)optval))
+ if (copy_from_sockptr(&val, optval, sizeof(int)))
return -EFAULT;
valbool = val ? 1 : 0;
@@ -404,7 +402,7 @@
return 0;
}
-static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval,
+static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval,
int optlen)
{
struct rds_rx_trace_so trace;
@@ -413,7 +411,7 @@
if (optlen != sizeof(struct rds_rx_trace_so))
return -EFAULT;
- if (copy_from_user(&trace, optval, sizeof(trace)))
+ if (copy_from_sockptr(&trace, optval, sizeof(trace)))
return -EFAULT;
if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX)
@@ -432,7 +430,7 @@
}
static int rds_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct rds_sock *rs = rds_sk_to_rs(sock->sk);
int ret;
diff --git a/net/rds/cong.c b/net/rds/cong.c
index ccdff09..8b689eb 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -236,7 +236,7 @@
* tcp_setsockopt and/or tcp_sendmsg will deadlock
* when it tries to get the sock_lock())
* 2. Interrupts are masked so that we can mark the
- * the port congested from both send and recv paths.
+ * port congested from both send and recv paths.
* (See comment around declaration of rdc_cong_lock).
* An attempt to get the sock_lock() here will
* therefore trigger warnings.
diff --git a/net/rds/connection.c b/net/rds/connection.c
index c85bd63..b4cc699 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -253,6 +253,7 @@
* should end up here, but if it
* does, reset/destroy the connection.
*/
+ kfree(conn->c_path);
kmem_cache_free(rds_conn_slab, conn);
conn = ERR_PTR(-EOPNOTSUPP);
goto out;
@@ -916,6 +917,17 @@
}
EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down);
+/* Check connectivity of all paths
+ */
+void rds_check_all_paths(struct rds_connection *conn)
+{
+ int i = 0;
+
+ do {
+ rds_conn_path_connect_if_down(&conn->c_path[i]);
+ } while (++i < conn->c_npaths);
+}
+
void rds_conn_connect_if_down(struct rds_connection *conn)
{
WARN_ON(conn->c_trans->t_mp_capable);
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 9de2ae2..24c9a90 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -125,19 +125,23 @@
queue_work(rds_wq, &rds_ibdev->free_work);
}
-static void rds_ib_add_one(struct ib_device *device)
+static int rds_ib_add_one(struct ib_device *device)
{
struct rds_ib_device *rds_ibdev;
- bool has_fr, has_fmr;
+ int ret;
/* Only handle IB (no iWARP) devices */
if (device->node_type != RDMA_NODE_IB_CA)
- return;
+ return -EOPNOTSUPP;
+
+ /* Device must support FRWR */
+ if (!(device->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
+ return -EOPNOTSUPP;
rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
ibdev_to_node(device));
if (!rds_ibdev)
- return;
+ return -ENOMEM;
spin_lock_init(&rds_ibdev->spinlock);
refcount_set(&rds_ibdev->refcount, 1);
@@ -149,13 +153,14 @@
rds_ibdev->max_wrs = device->attrs.max_qp_wr;
rds_ibdev->max_sge = min(device->attrs.max_send_sge, RDS_IB_MAX_SGE);
- has_fr = (device->attrs.device_cap_flags &
- IB_DEVICE_MEM_MGT_EXTENSIONS);
- has_fmr = (device->ops.alloc_fmr && device->ops.dealloc_fmr &&
- device->ops.map_phys_fmr && device->ops.unmap_fmr);
- rds_ibdev->use_fastreg = (has_fr && !has_fmr);
+ rds_ibdev->odp_capable =
+ !!(device->attrs.device_cap_flags &
+ IB_DEVICE_ON_DEMAND_PAGING) &&
+ !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps &
+ IB_ODP_SUPPORT_WRITE) &&
+ !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps &
+ IB_ODP_SUPPORT_READ);
- rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32;
rds_ibdev->max_1m_mrs = device->attrs.max_mr ?
min_t(unsigned int, (device->attrs.max_mr / 2),
rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size;
@@ -173,12 +178,14 @@
if (!rds_ibdev->vector_load) {
pr_err("RDS/IB: %s failed to allocate vector memory\n",
__func__);
+ ret = -ENOMEM;
goto put_dev;
}
rds_ibdev->dev = device;
rds_ibdev->pd = ib_alloc_pd(device, 0);
if (IS_ERR(rds_ibdev->pd)) {
+ ret = PTR_ERR(rds_ibdev->pd);
rds_ibdev->pd = NULL;
goto put_dev;
}
@@ -186,6 +193,7 @@
rds_ibdev->mr_1m_pool =
rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL);
if (IS_ERR(rds_ibdev->mr_1m_pool)) {
+ ret = PTR_ERR(rds_ibdev->mr_1m_pool);
rds_ibdev->mr_1m_pool = NULL;
goto put_dev;
}
@@ -193,18 +201,16 @@
rds_ibdev->mr_8k_pool =
rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_8K_POOL);
if (IS_ERR(rds_ibdev->mr_8k_pool)) {
+ ret = PTR_ERR(rds_ibdev->mr_8k_pool);
rds_ibdev->mr_8k_pool = NULL;
goto put_dev;
}
- rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_mrs = %d, max_8k_mrs = %d\n",
- device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
- rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_mrs,
- rds_ibdev->max_8k_mrs);
+ rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, max_1m_mrs = %d, max_8k_mrs = %d\n",
+ device->attrs.max_mr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
+ rds_ibdev->max_1m_mrs, rds_ibdev->max_8k_mrs);
- pr_info("RDS/IB: %s: %s supported and preferred\n",
- device->name,
- rds_ibdev->use_fastreg ? "FRMR" : "FMR");
+ pr_info("RDS/IB: %s: added\n", device->name);
down_write(&rds_ib_devices_lock);
list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
@@ -212,12 +218,13 @@
refcount_inc(&rds_ibdev->refcount);
ib_set_client_data(device, &rds_ib_client, rds_ibdev);
- refcount_inc(&rds_ibdev->refcount);
rds_ib_nodev_connect();
+ return 0;
put_dev:
rds_ib_dev_put(rds_ibdev);
+ return ret;
}
/*
@@ -259,9 +266,6 @@
{
struct rds_ib_device *rds_ibdev = client_data;
- if (!rds_ibdev)
- return;
-
rds_ib_dev_shutdown(rds_ibdev);
/* stop connection attempts from getting a reference to this device. */
diff --git a/net/rds/ib.h b/net/rds/ib.h
index f2b558e..2ba7110 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -165,8 +165,8 @@
/* tx */
struct rds_ib_work_ring i_send_ring;
struct rm_data_op *i_data_op;
- struct rds_header *i_send_hdrs;
- dma_addr_t i_send_hdrs_dma;
+ struct rds_header **i_send_hdrs;
+ dma_addr_t *i_send_hdrs_dma;
struct rds_ib_send_work *i_sends;
atomic_t i_signaled_sends;
@@ -175,8 +175,8 @@
struct rds_ib_work_ring i_recv_ring;
struct rds_ib_incoming *i_ibinc;
u32 i_recv_data_rem;
- struct rds_header *i_recv_hdrs;
- dma_addr_t i_recv_hdrs_dma;
+ struct rds_header **i_recv_hdrs;
+ dma_addr_t *i_recv_hdrs_dma;
struct rds_ib_recv_work *i_recvs;
u64 i_ack_recv; /* last ACK received */
struct rds_ib_refill_cache i_cache_incs;
@@ -246,12 +246,11 @@
struct list_head conn_list;
struct ib_device *dev;
struct ib_pd *pd;
- bool use_fastreg;
+ u8 odp_capable:1;
unsigned int max_mrs;
struct rds_ib_mr_pool *mr_1m_pool;
struct rds_ib_mr_pool *mr_8k_pool;
- unsigned int fmr_max_remaps;
unsigned int max_8k_mrs;
unsigned int max_1m_mrs;
int max_sge;
@@ -264,7 +263,6 @@
int *vector_load;
};
-#define ibdev_to_node(ibdev) dev_to_node((ibdev)->dev.parent)
#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
/* bits for i_ack_flags */
@@ -382,7 +380,6 @@
void rds_ib_cm_connect_complete(struct rds_connection *conn,
struct rdma_cm_event *event);
-
#define rds_ib_conn_error(conn, fmt...) \
__rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 18c6fac..f5cbe96 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -36,6 +36,7 @@
#include <linux/vmalloc.h>
#include <linux/ratelimit.h>
#include <net/addrconf.h>
+#include <rdma/ib_cm.h>
#include "rds_single_path.h"
#include "rds.h"
@@ -439,6 +440,95 @@
rds_ibdev->vector_load[index]--;
}
+static void rds_dma_hdr_free(struct ib_device *dev, struct rds_header *hdr,
+ dma_addr_t dma_addr, enum dma_data_direction dir)
+{
+ ib_dma_unmap_single(dev, dma_addr, sizeof(*hdr), dir);
+ kfree(hdr);
+}
+
+static struct rds_header *rds_dma_hdr_alloc(struct ib_device *dev,
+ dma_addr_t *dma_addr, enum dma_data_direction dir)
+{
+ struct rds_header *hdr;
+
+ hdr = kzalloc_node(sizeof(*hdr), GFP_KERNEL, ibdev_to_node(dev));
+ if (!hdr)
+ return NULL;
+
+ *dma_addr = ib_dma_map_single(dev, hdr, sizeof(*hdr),
+ DMA_BIDIRECTIONAL);
+ if (ib_dma_mapping_error(dev, *dma_addr)) {
+ kfree(hdr);
+ return NULL;
+ }
+
+ return hdr;
+}
+
+/* Free the DMA memory used to store struct rds_header.
+ *
+ * @dev: the RDS IB device
+ * @hdrs: pointer to the array storing DMA memory pointers
+ * @dma_addrs: pointer to the array storing DMA addresses
+ * @num_hdars: number of headers to free.
+ */
+static void rds_dma_hdrs_free(struct rds_ib_device *dev,
+ struct rds_header **hdrs, dma_addr_t *dma_addrs, u32 num_hdrs,
+ enum dma_data_direction dir)
+{
+ u32 i;
+
+ for (i = 0; i < num_hdrs; i++)
+ rds_dma_hdr_free(dev->dev, hdrs[i], dma_addrs[i], dir);
+ kvfree(hdrs);
+ kvfree(dma_addrs);
+}
+
+
+/* Allocate DMA coherent memory to be used to store struct rds_header for
+ * sending/receiving packets. The pointers to the DMA memory and the
+ * associated DMA addresses are stored in two arrays.
+ *
+ * @dev: the RDS IB device
+ * @dma_addrs: pointer to the array for storing DMA addresses
+ * @num_hdrs: number of headers to allocate
+ *
+ * It returns the pointer to the array storing the DMA memory pointers. On
+ * error, NULL pointer is returned.
+ */
+static struct rds_header **rds_dma_hdrs_alloc(struct rds_ib_device *dev,
+ dma_addr_t **dma_addrs, u32 num_hdrs,
+ enum dma_data_direction dir)
+{
+ struct rds_header **hdrs;
+ dma_addr_t *hdr_daddrs;
+ u32 i;
+
+ hdrs = kvmalloc_node(sizeof(*hdrs) * num_hdrs, GFP_KERNEL,
+ ibdev_to_node(dev->dev));
+ if (!hdrs)
+ return NULL;
+
+ hdr_daddrs = kvmalloc_node(sizeof(*hdr_daddrs) * num_hdrs, GFP_KERNEL,
+ ibdev_to_node(dev->dev));
+ if (!hdr_daddrs) {
+ kvfree(hdrs);
+ return NULL;
+ }
+
+ for (i = 0; i < num_hdrs; i++) {
+ hdrs[i] = rds_dma_hdr_alloc(dev->dev, &hdr_daddrs[i], dir);
+ if (!hdrs[i]) {
+ rds_dma_hdrs_free(dev, hdrs, hdr_daddrs, i, dir);
+ return NULL;
+ }
+ }
+
+ *dma_addrs = hdr_daddrs;
+ return hdrs;
+}
+
/*
* This needs to be very careful to not leave IS_ERR pointers around for
* cleanup to trip over.
@@ -462,10 +552,10 @@
return -EOPNOTSUPP;
/* The fr_queue_space is currently set to 512, to add extra space on
- * completion queue and send queue. This extra space is used for FRMR
+ * completion queue and send queue. This extra space is used for FRWR
* registration and invalidation work requests
*/
- fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0);
+ fr_queue_space = RDS_IB_DEFAULT_FR_WR;
/* add the conn now so that connection establishment has the dev */
rds_ib_add_conn(rds_ibdev, conn);
@@ -547,31 +637,29 @@
goto recv_cq_out;
}
- ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
- ic->i_send_ring.w_nr *
- sizeof(struct rds_header),
- &ic->i_send_hdrs_dma, GFP_KERNEL);
+ ic->i_send_hdrs = rds_dma_hdrs_alloc(rds_ibdev, &ic->i_send_hdrs_dma,
+ ic->i_send_ring.w_nr,
+ DMA_TO_DEVICE);
if (!ic->i_send_hdrs) {
ret = -ENOMEM;
- rdsdebug("ib_dma_alloc_coherent send failed\n");
+ rdsdebug("DMA send hdrs alloc failed\n");
goto qp_out;
}
- ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
- ic->i_recv_ring.w_nr *
- sizeof(struct rds_header),
- &ic->i_recv_hdrs_dma, GFP_KERNEL);
+ ic->i_recv_hdrs = rds_dma_hdrs_alloc(rds_ibdev, &ic->i_recv_hdrs_dma,
+ ic->i_recv_ring.w_nr,
+ DMA_FROM_DEVICE);
if (!ic->i_recv_hdrs) {
ret = -ENOMEM;
- rdsdebug("ib_dma_alloc_coherent recv failed\n");
+ rdsdebug("DMA recv hdrs alloc failed\n");
goto send_hdrs_dma_out;
}
- ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
- &ic->i_ack_dma, GFP_KERNEL);
+ ic->i_ack = rds_dma_hdr_alloc(rds_ibdev->dev, &ic->i_ack_dma,
+ DMA_TO_DEVICE);
if (!ic->i_ack) {
ret = -ENOMEM;
- rdsdebug("ib_dma_alloc_coherent ack failed\n");
+ rdsdebug("DMA ack header alloc failed\n");
goto recv_hdrs_dma_out;
}
@@ -602,17 +690,24 @@
sends_out:
vfree(ic->i_sends);
+
ack_dma_out:
- ib_dma_free_coherent(dev, sizeof(struct rds_header),
- ic->i_ack, ic->i_ack_dma);
+ rds_dma_hdr_free(rds_ibdev->dev, ic->i_ack, ic->i_ack_dma,
+ DMA_TO_DEVICE);
+ ic->i_ack = NULL;
+
recv_hdrs_dma_out:
- ib_dma_free_coherent(dev, ic->i_recv_ring.w_nr *
- sizeof(struct rds_header),
- ic->i_recv_hdrs, ic->i_recv_hdrs_dma);
+ rds_dma_hdrs_free(rds_ibdev, ic->i_recv_hdrs, ic->i_recv_hdrs_dma,
+ ic->i_recv_ring.w_nr, DMA_FROM_DEVICE);
+ ic->i_recv_hdrs = NULL;
+ ic->i_recv_hdrs_dma = NULL;
+
send_hdrs_dma_out:
- ib_dma_free_coherent(dev, ic->i_send_ring.w_nr *
- sizeof(struct rds_header),
- ic->i_send_hdrs, ic->i_send_hdrs_dma);
+ rds_dma_hdrs_free(rds_ibdev, ic->i_send_hdrs, ic->i_send_hdrs_dma,
+ ic->i_send_ring.w_nr, DMA_TO_DEVICE);
+ ic->i_send_hdrs = NULL;
+ ic->i_send_hdrs_dma = NULL;
+
qp_out:
rdma_destroy_qp(ic->i_cm_id);
recv_cq_out:
@@ -643,7 +738,7 @@
* original size. The only way to tell the difference is by looking at
* the contents, which are initialized to zero.
* If the protocol version fields aren't set, this is a connection attempt
- * from an older version. This could could be 3.0 or 2.0 - we can't tell.
+ * from an older version. This could be 3.0 or 2.0 - we can't tell.
* We really should have changed this for OFED 1.3 :-(
*/
@@ -860,7 +955,8 @@
if (conn)
mutex_unlock(&conn->c_cm_lock);
if (err)
- rdma_reject(cm_id, &err, sizeof(int));
+ rdma_reject(cm_id, &err, sizeof(int),
+ IB_CM_REJ_CONSUMER_DEFINED);
return destroy;
}
@@ -887,9 +983,10 @@
rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
conn->c_proposed_version,
UINT_MAX, UINT_MAX, isv6);
- ret = rdma_connect(cm_id, &conn_param);
+ ret = rdma_connect_locked(cm_id, &conn_param);
if (ret)
- rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
+ rds_ib_conn_error(conn, "rdma_connect_locked failed (%d)\n",
+ ret);
out:
/* Beware - returning non-zero tells the rdma_cm to destroy
@@ -990,8 +1087,6 @@
ic->i_cm_id ? ic->i_cm_id->qp : NULL);
if (ic->i_cm_id) {
- struct ib_device *dev = ic->i_cm_id->device;
-
rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
err = rdma_disconnect(ic->i_cm_id);
if (err) {
@@ -1041,24 +1136,40 @@
ib_destroy_cq(ic->i_recv_cq);
}
- /* then free the resources that ib callbacks use */
- if (ic->i_send_hdrs)
- ib_dma_free_coherent(dev,
- ic->i_send_ring.w_nr *
- sizeof(struct rds_header),
- ic->i_send_hdrs,
- ic->i_send_hdrs_dma);
+ if (ic->rds_ibdev) {
+ /* then free the resources that ib callbacks use */
+ if (ic->i_send_hdrs) {
+ rds_dma_hdrs_free(ic->rds_ibdev,
+ ic->i_send_hdrs,
+ ic->i_send_hdrs_dma,
+ ic->i_send_ring.w_nr,
+ DMA_TO_DEVICE);
+ ic->i_send_hdrs = NULL;
+ ic->i_send_hdrs_dma = NULL;
+ }
- if (ic->i_recv_hdrs)
- ib_dma_free_coherent(dev,
- ic->i_recv_ring.w_nr *
- sizeof(struct rds_header),
- ic->i_recv_hdrs,
- ic->i_recv_hdrs_dma);
+ if (ic->i_recv_hdrs) {
+ rds_dma_hdrs_free(ic->rds_ibdev,
+ ic->i_recv_hdrs,
+ ic->i_recv_hdrs_dma,
+ ic->i_recv_ring.w_nr,
+ DMA_FROM_DEVICE);
+ ic->i_recv_hdrs = NULL;
+ ic->i_recv_hdrs_dma = NULL;
+ }
- if (ic->i_ack)
- ib_dma_free_coherent(dev, sizeof(struct rds_header),
- ic->i_ack, ic->i_ack_dma);
+ if (ic->i_ack) {
+ rds_dma_hdr_free(ic->rds_ibdev->dev, ic->i_ack,
+ ic->i_ack_dma, DMA_TO_DEVICE);
+ ic->i_ack = NULL;
+ }
+ } else {
+ WARN_ON(ic->i_send_hdrs);
+ WARN_ON(ic->i_send_hdrs_dma);
+ WARN_ON(ic->i_recv_hdrs);
+ WARN_ON(ic->i_recv_hdrs_dma);
+ WARN_ON(ic->i_ack);
+ }
if (ic->i_sends)
rds_ib_send_clear_ring(ic);
@@ -1077,9 +1188,6 @@
ic->i_pd = NULL;
ic->i_send_cq = NULL;
ic->i_recv_cq = NULL;
- ic->i_send_hdrs = NULL;
- ic->i_recv_hdrs = NULL;
- ic->i_ack = NULL;
}
BUG_ON(ic->rds_ibdev);
diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c
deleted file mode 100644
index 93c0437..0000000
--- a/net/rds/ib_fmr.c
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * Copyright (c) 2016 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ib_mr.h"
-
-struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages)
-{
- struct rds_ib_mr_pool *pool;
- struct rds_ib_mr *ibmr = NULL;
- struct rds_ib_fmr *fmr;
- int err = 0;
-
- if (npages <= RDS_MR_8K_MSG_SIZE)
- pool = rds_ibdev->mr_8k_pool;
- else
- pool = rds_ibdev->mr_1m_pool;
-
- if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
- queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
-
- /* Switch pools if one of the pool is reaching upper limit */
- if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) {
- if (pool->pool_type == RDS_IB_MR_8K_POOL)
- pool = rds_ibdev->mr_1m_pool;
- else
- pool = rds_ibdev->mr_8k_pool;
- }
-
- ibmr = rds_ib_try_reuse_ibmr(pool);
- if (ibmr)
- return ibmr;
-
- ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL,
- rdsibdev_to_node(rds_ibdev));
- if (!ibmr) {
- err = -ENOMEM;
- goto out_no_cigar;
- }
-
- fmr = &ibmr->u.fmr;
- fmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
- (IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_READ |
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_REMOTE_ATOMIC),
- &pool->fmr_attr);
- if (IS_ERR(fmr->fmr)) {
- err = PTR_ERR(fmr->fmr);
- fmr->fmr = NULL;
- pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, err);
- goto out_no_cigar;
- }
-
- ibmr->pool = pool;
- if (pool->pool_type == RDS_IB_MR_8K_POOL)
- rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
- else
- rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
-
- return ibmr;
-
-out_no_cigar:
- kfree(ibmr);
- atomic_dec(&pool->item_count);
-
- return ERR_PTR(err);
-}
-
-static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev,
- struct rds_ib_mr *ibmr, struct scatterlist *sg,
- unsigned int nents)
-{
- struct ib_device *dev = rds_ibdev->dev;
- struct rds_ib_fmr *fmr = &ibmr->u.fmr;
- struct scatterlist *scat = sg;
- u64 io_addr = 0;
- u64 *dma_pages;
- u32 len;
- int page_cnt, sg_dma_len;
- int i, j;
- int ret;
-
- sg_dma_len = ib_dma_map_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
- if (unlikely(!sg_dma_len)) {
- pr_warn("RDS/IB: %s failed!\n", __func__);
- return -EBUSY;
- }
-
- len = 0;
- page_cnt = 0;
-
- for (i = 0; i < sg_dma_len; ++i) {
- unsigned int dma_len = sg_dma_len(&scat[i]);
- u64 dma_addr = sg_dma_address(&scat[i]);
-
- if (dma_addr & ~PAGE_MASK) {
- if (i > 0) {
- ib_dma_unmap_sg(dev, sg, nents,
- DMA_BIDIRECTIONAL);
- return -EINVAL;
- } else {
- ++page_cnt;
- }
- }
- if ((dma_addr + dma_len) & ~PAGE_MASK) {
- if (i < sg_dma_len - 1) {
- ib_dma_unmap_sg(dev, sg, nents,
- DMA_BIDIRECTIONAL);
- return -EINVAL;
- } else {
- ++page_cnt;
- }
- }
-
- len += dma_len;
- }
-
- page_cnt += len >> PAGE_SHIFT;
- if (page_cnt > ibmr->pool->fmr_attr.max_pages) {
- ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
- return -EINVAL;
- }
-
- dma_pages = kmalloc_array_node(sizeof(u64), page_cnt, GFP_ATOMIC,
- rdsibdev_to_node(rds_ibdev));
- if (!dma_pages) {
- ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
- return -ENOMEM;
- }
-
- page_cnt = 0;
- for (i = 0; i < sg_dma_len; ++i) {
- unsigned int dma_len = sg_dma_len(&scat[i]);
- u64 dma_addr = sg_dma_address(&scat[i]);
-
- for (j = 0; j < dma_len; j += PAGE_SIZE)
- dma_pages[page_cnt++] =
- (dma_addr & PAGE_MASK) + j;
- }
-
- ret = ib_map_phys_fmr(fmr->fmr, dma_pages, page_cnt, io_addr);
- if (ret) {
- ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
- goto out;
- }
-
- /* Success - we successfully remapped the MR, so we can
- * safely tear down the old mapping.
- */
- rds_ib_teardown_mr(ibmr);
-
- ibmr->sg = scat;
- ibmr->sg_len = nents;
- ibmr->sg_dma_len = sg_dma_len;
- ibmr->remap_count++;
-
- if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
- rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
- else
- rds_ib_stats_inc(s_ib_rdma_mr_1m_used);
- ret = 0;
-
-out:
- kfree(dma_pages);
-
- return ret;
-}
-
-struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *rds_ibdev,
- struct scatterlist *sg,
- unsigned long nents,
- u32 *key)
-{
- struct rds_ib_mr *ibmr = NULL;
- struct rds_ib_fmr *fmr;
- int ret;
-
- ibmr = rds_ib_alloc_fmr(rds_ibdev, nents);
- if (IS_ERR(ibmr))
- return ibmr;
-
- ibmr->device = rds_ibdev;
- fmr = &ibmr->u.fmr;
- ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
- if (ret == 0)
- *key = fmr->fmr->rkey;
- else
- rds_ib_free_mr(ibmr, 0);
-
- return ibmr;
-}
-
-void rds_ib_unreg_fmr(struct list_head *list, unsigned int *nfreed,
- unsigned long *unpinned, unsigned int goal)
-{
- struct rds_ib_mr *ibmr, *next;
- struct rds_ib_fmr *fmr;
- LIST_HEAD(fmr_list);
- int ret = 0;
- unsigned int freed = *nfreed;
-
- /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
- list_for_each_entry(ibmr, list, unmap_list) {
- fmr = &ibmr->u.fmr;
- list_add(&fmr->fmr->list, &fmr_list);
- }
-
- ret = ib_unmap_fmr(&fmr_list);
- if (ret)
- pr_warn("RDS/IB: FMR invalidation failed (err=%d)\n", ret);
-
- /* Now we can destroy the DMA mapping and unpin any pages */
- list_for_each_entry_safe(ibmr, next, list, unmap_list) {
- fmr = &ibmr->u.fmr;
- *unpinned += ibmr->sg_len;
- __rds_ib_teardown_mr(ibmr);
- if (freed < goal ||
- ibmr->remap_count >= ibmr->pool->fmr_attr.max_maps) {
- if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
- rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
- else
- rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
- list_del(&ibmr->unmap_list);
- ib_dealloc_fmr(fmr->fmr);
- kfree(ibmr);
- freed++;
- }
- }
- *nfreed = freed;
-}
-
-void rds_ib_free_fmr_list(struct rds_ib_mr *ibmr)
-{
- struct rds_ib_mr_pool *pool = ibmr->pool;
-
- if (ibmr->remap_count >= pool->fmr_attr.max_maps)
- llist_add(&ibmr->llnode, &pool->drop_list);
- else
- llist_add(&ibmr->llnode, &pool->free_list);
-}
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index ef6acd7..28c1b00 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -76,7 +76,7 @@
frmr = &ibmr->u.frmr;
frmr->mr = ib_alloc_mr(rds_ibdev->pd, IB_MR_TYPE_MEM_REG,
- pool->fmr_attr.max_pages);
+ pool->max_pages);
if (IS_ERR(frmr->mr)) {
pr_warn("RDS/IB: %s failed to allocate MR", __func__);
err = PTR_ERR(frmr->mr);
@@ -240,7 +240,7 @@
}
frmr->dma_npages += len >> PAGE_SHIFT;
- if (frmr->dma_npages > ibmr->pool->fmr_attr.max_pages) {
+ if (frmr->dma_npages > ibmr->pool->max_pages) {
ret = -EMSGSIZE;
goto out_unmap;
}
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 9045a8c..ea5e9ae 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -43,10 +43,6 @@
#define RDS_MR_8K_SCALE (256 / (RDS_MR_8K_MSG_SIZE + 1))
#define RDS_MR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2))
-struct rds_ib_fmr {
- struct ib_fmr *fmr;
-};
-
enum rds_ib_fr_state {
FRMR_IS_FREE, /* mr invalidated & ready for use */
FRMR_IS_INUSE, /* mr is in use or used & can be invalidated */
@@ -67,6 +63,7 @@
/* This is stored as mr->r_trans_private. */
struct rds_ib_mr {
+ struct delayed_work work;
struct rds_ib_device *device;
struct rds_ib_mr_pool *pool;
struct rds_ib_connection *ic;
@@ -81,9 +78,10 @@
unsigned int sg_len;
int sg_dma_len;
+ u8 odp:1;
union {
- struct rds_ib_fmr fmr;
struct rds_ib_frmr frmr;
+ struct ib_mr *mr;
} u;
};
@@ -106,8 +104,7 @@
unsigned long max_items;
unsigned long max_items_soft;
unsigned long max_free_pinned;
- struct ib_fmr_attr fmr_attr;
- bool use_fastreg;
+ unsigned int max_pages;
};
extern struct workqueue_struct *rds_ib_mr_wq;
@@ -122,24 +119,20 @@
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
struct rds_sock *rs, u32 *key_ret,
- struct rds_connection *conn);
+ struct rds_connection *conn, u64 start, u64 length,
+ int need_odp);
void rds_ib_sync_mr(void *trans_private, int dir);
void rds_ib_free_mr(void *trans_private, int invalidate);
void rds_ib_flush_mrs(void);
int rds_ib_mr_init(void);
void rds_ib_mr_exit(void);
+u32 rds_ib_get_lkey(void *trans_private);
void __rds_ib_teardown_mr(struct rds_ib_mr *);
void rds_ib_teardown_mr(struct rds_ib_mr *);
-struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *, int);
struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *);
int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *, int, struct rds_ib_mr **);
-struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *, struct scatterlist *,
- unsigned long, u32 *);
struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *);
-void rds_ib_unreg_fmr(struct list_head *, unsigned int *,
- unsigned long *, unsigned int);
-void rds_ib_free_fmr_list(struct rds_ib_mr *);
struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
struct rds_ib_connection *ic,
struct scatterlist *sg,
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index c8c1e3a..8f070ee 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -37,8 +37,15 @@
#include "rds_single_path.h"
#include "ib_mr.h"
+#include "rds.h"
struct workqueue_struct *rds_ib_mr_wq;
+struct rds_ib_dereg_odp_mr {
+ struct work_struct work;
+ struct ib_mr *mr;
+};
+
+static void rds_ib_odp_mr_worker(struct work_struct *work);
static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
{
@@ -174,7 +181,7 @@
struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
iinfo->rdma_mr_max = pool_1m->max_items;
- iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
+ iinfo->rdma_mr_size = pool_1m->max_pages;
}
#if IS_ENABLED(CONFIG_IPV6)
@@ -184,7 +191,7 @@
struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
iinfo6->rdma_mr_max = pool_1m->max_items;
- iinfo6->rdma_mr_size = pool_1m->fmr_attr.max_pages;
+ iinfo6->rdma_mr_size = pool_1m->max_pages;
}
#endif
@@ -213,6 +220,9 @@
struct rds_ib_mr *ibmr = trans_private;
struct rds_ib_device *rds_ibdev = ibmr->device;
+ if (ibmr->odp)
+ return;
+
switch (direction) {
case DMA_FROM_DEVICE:
ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
@@ -396,10 +406,7 @@
if (list_empty(&unmap_list))
goto out;
- if (pool->use_fastreg)
- rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal);
- else
- rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);
+ rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal);
if (!list_empty(&unmap_list)) {
unsigned long flags;
@@ -482,11 +489,18 @@
rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
+ if (ibmr->odp) {
+ /* A MR created and marked as use_once. We use delayed work,
+ * because there is a change that we are in interrupt and can't
+ * call to ib_dereg_mr() directly.
+ */
+ INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker);
+ queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0);
+ return;
+ }
+
/* Return it to the pool's free list */
- if (rds_ibdev->use_fastreg)
- rds_ib_free_frmr_list(ibmr);
- else
- rds_ib_free_fmr_list(ibmr);
+ rds_ib_free_frmr_list(ibmr);
atomic_add(ibmr->sg_len, &pool->free_pinned);
atomic_inc(&pool->dirty_count);
@@ -526,9 +540,17 @@
up_read(&rds_ib_devices_lock);
}
+u32 rds_ib_get_lkey(void *trans_private)
+{
+ struct rds_ib_mr *ibmr = trans_private;
+
+ return ibmr->u.mr->lkey;
+}
+
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
struct rds_sock *rs, u32 *key_ret,
- struct rds_connection *conn)
+ struct rds_connection *conn,
+ u64 start, u64 length, int need_odp)
{
struct rds_ib_device *rds_ibdev;
struct rds_ib_mr *ibmr = NULL;
@@ -541,6 +563,51 @@
goto out;
}
+ if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) {
+ u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start;
+ int access_flags =
+ (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC |
+ IB_ACCESS_ON_DEMAND);
+ struct ib_sge sge = {};
+ struct ib_mr *ib_mr;
+
+ if (!rds_ibdev->odp_capable) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr,
+ access_flags);
+
+ if (IS_ERR(ib_mr)) {
+ rdsdebug("rds_ib_get_user_mr returned %d\n",
+ IS_ERR(ib_mr));
+ ret = PTR_ERR(ib_mr);
+ goto out;
+ }
+ if (key_ret)
+ *key_ret = ib_mr->rkey;
+
+ ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+ if (!ibmr) {
+ ib_dereg_mr(ib_mr);
+ ret = -ENOMEM;
+ goto out;
+ }
+ ibmr->u.mr = ib_mr;
+ ibmr->odp = 1;
+
+ sge.addr = virt_addr;
+ sge.length = length;
+ sge.lkey = ib_mr->lkey;
+
+ ib_advise_mr(rds_ibdev->pd,
+ IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE,
+ IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1);
+ return ibmr;
+ }
+
if (conn)
ic = conn->c_transport_data;
@@ -549,10 +616,7 @@
goto out;
}
- if (rds_ibdev->use_fastreg)
- ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret);
- else
- ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret);
+ ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret);
if (IS_ERR(ibmr)) {
ret = PTR_ERR(ibmr);
pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret);
@@ -596,19 +660,16 @@
if (pool_type == RDS_IB_MR_1M_POOL) {
/* +1 allows for unaligned MRs */
- pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1;
+ pool->max_pages = RDS_MR_1M_MSG_SIZE + 1;
pool->max_items = rds_ibdev->max_1m_mrs;
} else {
/* pool_type == RDS_IB_MR_8K_POOL */
- pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1;
+ pool->max_pages = RDS_MR_8K_MSG_SIZE + 1;
pool->max_items = rds_ibdev->max_8k_mrs;
}
- pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4;
- pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
- pool->fmr_attr.page_shift = PAGE_SHIFT;
+ pool->max_free_pinned = pool->max_items * pool->max_pages / 4;
pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4;
- pool->use_fastreg = rds_ibdev->use_fastreg;
return pool;
}
@@ -629,3 +690,12 @@
{
destroy_workqueue(rds_ib_mr_wq);
}
+
+static void rds_ib_odp_mr_worker(struct work_struct *work)
+{
+ struct rds_ib_mr *ibmr;
+
+ ibmr = container_of(work, struct rds_ib_mr, work.work);
+ ib_dereg_mr(ibmr->u.mr);
+ kfree(ibmr);
+}
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index a0f99bb..6fdedd9 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -61,7 +61,7 @@
recv->r_wr.num_sge = RDS_IB_RECV_SGE;
sge = &recv->r_sge[0];
- sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->addr = ic->i_recv_hdrs_dma[i];
sge->length = sizeof(struct rds_header);
sge->lkey = ic->i_pd->local_dma_lkey;
@@ -310,8 +310,8 @@
struct rds_ib_connection *ic = conn->c_transport_data;
struct ib_sge *sge;
int ret = -ENOMEM;
- gfp_t slab_mask = GFP_NOWAIT;
- gfp_t page_mask = GFP_NOWAIT;
+ gfp_t slab_mask = gfp;
+ gfp_t page_mask = gfp;
if (gfp & __GFP_DIRECT_RECLAIM) {
slab_mask = GFP_KERNEL;
@@ -343,7 +343,7 @@
WARN_ON(ret != 1);
sge = &recv->r_sge[0];
- sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
+ sge->addr = ic->i_recv_hdrs_dma[recv - ic->i_recvs];
sge->length = sizeof(struct rds_header);
sge = &recv->r_sge[1];
@@ -662,10 +662,16 @@
seq = rds_ib_get_ack(ic);
rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
+
+ ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev, ic->i_ack_dma,
+ sizeof(*hdr), DMA_TO_DEVICE);
rds_message_populate_header(hdr, 0, 0, 0);
hdr->h_ack = cpu_to_be64(seq);
hdr->h_credit = adv_credits;
rds_message_make_checksum(hdr);
+ ib_dma_sync_single_for_device(ic->rds_ibdev->dev, ic->i_ack_dma,
+ sizeof(*hdr), DMA_TO_DEVICE);
+
ic->i_ack_queued = jiffies;
ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, NULL);
@@ -845,6 +851,7 @@
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_incoming *ibinc = ic->i_ibinc;
struct rds_header *ihdr, *hdr;
+ dma_addr_t dma_addr = ic->i_recv_hdrs_dma[recv - ic->i_recvs];
/* XXX shut down the connection if port 0,0 are seen? */
@@ -861,8 +868,10 @@
}
data_len -= sizeof(struct rds_header);
- ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
+ ihdr = ic->i_recv_hdrs[recv - ic->i_recvs];
+ ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev, dma_addr,
+ sizeof(*ihdr), DMA_FROM_DEVICE);
/* Validate the checksum. */
if (!rds_message_verify_checksum(ihdr)) {
rds_ib_conn_error(conn, "incoming message "
@@ -870,7 +879,7 @@
"forcing a reconnect\n",
&conn->c_faddr);
rds_stats_inc(s_recv_drop_bad_checksum);
- return;
+ goto done;
}
/* Process the ACK sequence which comes with every packet */
@@ -899,7 +908,7 @@
*/
rds_ib_frag_free(ic, recv->r_frag);
recv->r_frag = NULL;
- return;
+ goto done;
}
/*
@@ -933,7 +942,7 @@
hdr->h_dport != ihdr->h_dport) {
rds_ib_conn_error(conn,
"fragment header mismatch; forcing reconnect\n");
- return;
+ goto done;
}
}
@@ -965,6 +974,9 @@
rds_inc_put(&ibinc->ii_inc);
}
+done:
+ ib_dma_sync_single_for_device(ic->rds_ibdev->dev, dma_addr,
+ sizeof(*ihdr), DMA_FROM_DEVICE);
}
void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
@@ -993,10 +1005,11 @@
} else {
/* We expect errors as the qp is drained during shutdown */
if (rds_conn_up(conn) || rds_conn_connecting(conn))
- rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), disconnecting and reconnecting\n",
+ rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n",
&conn->c_laddr, &conn->c_faddr,
conn->c_tos, wc->status,
- ib_wc_status_msg(wc->status));
+ ib_wc_status_msg(wc->status),
+ wc->vendor_err);
}
/* rds_ib_process_recv() doesn't always consume the frag, and
@@ -1019,7 +1032,7 @@
rds_ib_stats_inc(s_ib_rx_ring_empty);
if (rds_ib_ring_low(&ic->i_recv_ring)) {
- rds_ib_recv_refill(conn, 0, GFP_NOWAIT);
+ rds_ib_recv_refill(conn, 0, GFP_NOWAIT | __GFP_NOWARN);
rds_ib_stats_inc(s_ib_rx_refill_from_cq);
}
}
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index dfe6237..92b4a86 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -39,6 +39,7 @@
#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"
+#include "ib_mr.h"
/*
* Convert IB-specific error message to RDS error message and call core
@@ -201,7 +202,8 @@
send->s_wr.ex.imm_data = 0;
sge = &send->s_sge[0];
- sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->addr = ic->i_send_hdrs_dma[i];
+
sge->length = sizeof(struct rds_header);
sge->lkey = ic->i_pd->local_dma_lkey;
@@ -300,10 +302,10 @@
/* We expect errors as the qp is drained during shutdown */
if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
- rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), disconnecting and reconnecting\n",
+ rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n",
&conn->c_laddr, &conn->c_faddr,
conn->c_tos, wc->status,
- ib_wc_status_msg(wc->status));
+ ib_wc_status_msg(wc->status), wc->vendor_err);
}
}
@@ -631,11 +633,18 @@
send->s_queued = jiffies;
send->s_op = NULL;
- send->s_sge[0].addr = ic->i_send_hdrs_dma
- + (pos * sizeof(struct rds_header));
- send->s_sge[0].length = sizeof(struct rds_header);
+ send->s_sge[0].addr = ic->i_send_hdrs_dma[pos];
- memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
+ send->s_sge[0].length = sizeof(struct rds_header);
+ send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;
+
+ ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev,
+ ic->i_send_hdrs_dma[pos],
+ sizeof(struct rds_header),
+ DMA_TO_DEVICE);
+ memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr,
+ sizeof(struct rds_header));
+
/* Set up the data, if present */
if (i < work_alloc
@@ -647,6 +656,7 @@
send->s_sge[1].addr = sg_dma_address(scat);
send->s_sge[1].addr += rm->data.op_dmaoff;
send->s_sge[1].length = len;
+ send->s_sge[1].lkey = ic->i_pd->local_dma_lkey;
bytes_sent += len;
rm->data.op_dmaoff += len;
@@ -674,7 +684,7 @@
&send->s_wr, send->s_wr.num_sge, send->s_wr.next);
if (ic->i_flowctl && adv_credits) {
- struct rds_header *hdr = &ic->i_send_hdrs[pos];
+ struct rds_header *hdr = ic->i_send_hdrs[pos];
/* add credit and redo the header checksum */
hdr->h_credit = adv_credits;
@@ -682,6 +692,10 @@
adv_credits = 0;
rds_ib_stats_inc(s_ib_tx_credit_updates);
}
+ ib_dma_sync_single_for_device(ic->rds_ibdev->dev,
+ ic->i_send_hdrs_dma[pos],
+ sizeof(struct rds_header),
+ DMA_TO_DEVICE);
if (prev)
prev->s_wr.next = &send->s_wr;
@@ -855,20 +869,29 @@
int ret;
int num_sge;
int nr_sig = 0;
+ u64 odp_addr = op->op_odp_addr;
+ u32 odp_lkey = 0;
/* map the op the first time we see it */
- if (!op->op_mapped) {
- op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
- op->op_sg, op->op_nents, (op->op_write) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE);
- rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
- if (op->op_count == 0) {
- rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
- ret = -ENOMEM; /* XXX ? */
- goto out;
+ if (!op->op_odp_mr) {
+ if (!op->op_mapped) {
+ op->op_count =
+ ib_dma_map_sg(ic->i_cm_id->device, op->op_sg,
+ op->op_nents,
+ (op->op_write) ? DMA_TO_DEVICE :
+ DMA_FROM_DEVICE);
+ rdsdebug("ic %p mapping op %p: %d\n", ic, op,
+ op->op_count);
+ if (op->op_count == 0) {
+ rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+ ret = -ENOMEM; /* XXX ? */
+ goto out;
+ }
+ op->op_mapped = 1;
}
-
- op->op_mapped = 1;
+ } else {
+ op->op_count = op->op_nents;
+ odp_lkey = rds_ib_get_lkey(op->op_odp_mr->r_trans_private);
}
/*
@@ -920,14 +943,20 @@
for (j = 0; j < send->s_rdma_wr.wr.num_sge &&
scat != &op->op_sg[op->op_count]; j++) {
len = sg_dma_len(scat);
- send->s_sge[j].addr = sg_dma_address(scat);
+ if (!op->op_odp_mr) {
+ send->s_sge[j].addr = sg_dma_address(scat);
+ send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
+ } else {
+ send->s_sge[j].addr = odp_addr;
+ send->s_sge[j].lkey = odp_lkey;
+ }
send->s_sge[j].length = len;
- send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
sent += len;
rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
remote_addr += len;
+ odp_addr += len;
scat++;
}
diff --git a/net/rds/info.c b/net/rds/info.c
index 03f6fd5..b6b46a8 100644
--- a/net/rds/info.c
+++ b/net/rds/info.c
@@ -162,7 +162,6 @@
struct rds_info_lengths lens;
unsigned long nr_pages = 0;
unsigned long start;
- unsigned long i;
rds_info_func func;
struct page **pages = NULL;
int ret;
@@ -193,7 +192,7 @@
ret = -ENOMEM;
goto out;
}
- ret = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
+ ret = pin_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
if (ret != nr_pages) {
if (ret > 0)
nr_pages = ret;
@@ -235,8 +234,8 @@
ret = -EFAULT;
out:
- for (i = 0; pages && i < nr_pages; i++)
- put_page(pages[i]);
+ if (pages)
+ unpin_user_pages(pages, nr_pages);
kfree(pages);
return ret;
diff --git a/net/rds/message.c b/net/rds/message.c
index 92b6b22..799034e 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2020 Oracle and/or its affiliates.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -162,12 +162,12 @@
if (rm->rdma.op_active)
rds_rdma_free_op(&rm->rdma);
if (rm->rdma.op_rdma_mr)
- rds_mr_put(rm->rdma.op_rdma_mr);
+ kref_put(&rm->rdma.op_rdma_mr->r_kref, __rds_put_mr_final);
if (rm->atomic.op_active)
rds_atomic_free_op(&rm->atomic);
if (rm->atomic.op_rdma_mr)
- rds_mr_put(rm->atomic.op_rdma_mr);
+ kref_put(&rm->atomic.op_rdma_mr->r_kref, __rds_put_mr_final);
}
void rds_message_put(struct rds_message *rm)
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 1c42a60..6f1a50d 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2007, 2020 Oracle and/or its affiliates.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -84,7 +84,7 @@
if (insert) {
rb_link_node(&insert->r_rb_node, parent, p);
rb_insert_color(&insert->r_rb_node, root);
- refcount_inc(&insert->r_refcount);
+ kref_get(&insert->r_kref);
}
return NULL;
}
@@ -99,10 +99,7 @@
unsigned long flags;
rdsdebug("RDS: destroy mr key is %x refcnt %u\n",
- mr->r_key, refcount_read(&mr->r_refcount));
-
- if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state))
- return;
+ mr->r_key, kref_read(&mr->r_kref));
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
if (!RB_EMPTY_NODE(&mr->r_rb_node))
@@ -115,8 +112,10 @@
mr->r_trans->free_mr(trans_private, mr->r_invalidate);
}
-void __rds_put_mr_final(struct rds_mr *mr)
+void __rds_put_mr_final(struct kref *kref)
{
+ struct rds_mr *mr = container_of(kref, struct rds_mr, r_kref);
+
rds_destroy_mr(mr);
kfree(mr);
}
@@ -140,8 +139,7 @@
rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
RB_CLEAR_NODE(&mr->r_rb_node);
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
- rds_destroy_mr(mr);
- rds_mr_put(mr);
+ kref_put(&mr->r_kref, __rds_put_mr_final);
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
}
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
@@ -156,14 +154,15 @@
static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
struct page **pages, int write)
{
+ unsigned int gup_flags = FOLL_LONGTERM;
int ret;
- ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0,
- pages);
+ if (write)
+ gup_flags |= FOLL_WRITE;
+ ret = pin_user_pages_fast(user_addr, nr_pages, gup_flags, pages);
if (ret >= 0 && ret < nr_pages) {
- while (ret--)
- put_page(pages[ret]);
+ unpin_user_pages(pages, ret);
ret = -EFAULT;
}
@@ -175,13 +174,14 @@
struct rds_conn_path *cp)
{
struct rds_mr *mr = NULL, *found;
+ struct scatterlist *sg = NULL;
unsigned int nr_pages;
struct page **pages = NULL;
- struct scatterlist *sg;
void *trans_private;
unsigned long flags;
rds_rdma_cookie_t cookie;
- unsigned int nents;
+ unsigned int nents = 0;
+ int need_odp = 0;
long i;
int ret;
@@ -195,6 +195,21 @@
goto out;
}
+ /* If the combination of the addr and size requested for this memory
+ * region causes an integer overflow, return error.
+ */
+ if (((args->vec.addr + args->vec.bytes) < args->vec.addr) ||
+ PAGE_ALIGN(args->vec.addr + args->vec.bytes) <
+ (args->vec.addr + args->vec.bytes)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!can_do_mlock()) {
+ ret = -EPERM;
+ goto out;
+ }
+
nr_pages = rds_pages_in_vec(&args->vec);
if (nr_pages == 0) {
ret = -EINVAL;
@@ -225,7 +240,7 @@
goto out;
}
- refcount_set(&mr->r_refcount, 1);
+ kref_init(&mr->r_kref);
RB_CLEAR_NODE(&mr->r_rb_node);
mr->r_trans = rs->rs_transport;
mr->r_sock = rs;
@@ -248,36 +263,43 @@
* the zero page.
*/
ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
- if (ret < 0)
+ if (ret == -EOPNOTSUPP) {
+ need_odp = 1;
+ } else if (ret <= 0) {
goto out;
+ } else {
+ nents = ret;
+ sg = kmalloc_array(nents, sizeof(*sg), GFP_KERNEL);
+ if (!sg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ WARN_ON(!nents);
+ sg_init_table(sg, nents);
- nents = ret;
- sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
- if (!sg) {
- ret = -ENOMEM;
- goto out;
+ /* Stick all pages into the scatterlist */
+ for (i = 0 ; i < nents; i++)
+ sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
+
+ rdsdebug("RDS: trans_private nents is %u\n", nents);
}
- WARN_ON(!nents);
- sg_init_table(sg, nents);
-
- /* Stick all pages into the scatterlist */
- for (i = 0 ; i < nents; i++)
- sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
-
- rdsdebug("RDS: trans_private nents is %u\n", nents);
-
/* Obtain a transport specific MR. If this succeeds, the
* s/g list is now owned by the MR.
* Note that dma_map() implies that pending writes are
* flushed to RAM, so no dma_sync is needed here. */
- trans_private = rs->rs_transport->get_mr(sg, nents, rs,
- &mr->r_key,
- cp ? cp->cp_conn : NULL);
+ trans_private = rs->rs_transport->get_mr(
+ sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL,
+ args->vec.addr, args->vec.bytes,
+ need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED);
if (IS_ERR(trans_private)) {
- for (i = 0 ; i < nents; i++)
- put_page(sg_page(&sg[i]));
- kfree(sg);
+ /* In ODP case, we don't GUP pages, so don't need
+ * to release anything.
+ */
+ if (!need_odp) {
+ unpin_user_pages(pages, nr_pages);
+ kfree(sg);
+ }
ret = PTR_ERR(trans_private);
goto out;
}
@@ -291,11 +313,20 @@
* map page aligned regions. So we keep the offset, and build
* a 64bit cookie containing <R_Key, offset> and pass that
* around. */
- cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK);
+ if (need_odp)
+ cookie = rds_rdma_make_cookie(mr->r_key, 0);
+ else
+ cookie = rds_rdma_make_cookie(mr->r_key,
+ args->vec.addr & ~PAGE_MASK);
if (cookie_ret)
*cookie_ret = cookie;
- if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) {
+ if (args->cookie_addr &&
+ put_user(cookie, (u64 __user *)(unsigned long)args->cookie_addr)) {
+ if (!need_odp) {
+ unpin_user_pages(pages, nr_pages);
+ kfree(sg);
+ }
ret = -EFAULT;
goto out;
}
@@ -310,7 +341,7 @@
rdsdebug("RDS: get_mr key is %x\n", mr->r_key);
if (mr_ret) {
- refcount_inc(&mr->r_refcount);
+ kref_get(&mr->r_kref);
*mr_ret = mr;
}
@@ -318,25 +349,24 @@
out:
kfree(pages);
if (mr)
- rds_mr_put(mr);
+ kref_put(&mr->r_kref, __rds_put_mr_final);
return ret;
}
-int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen)
+int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen)
{
struct rds_get_mr_args args;
if (optlen != sizeof(struct rds_get_mr_args))
return -EINVAL;
- if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval,
- sizeof(struct rds_get_mr_args)))
+ if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_args)))
return -EFAULT;
return __rds_rdma_map(rs, &args, NULL, NULL, NULL);
}
-int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
+int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen)
{
struct rds_get_mr_for_dest_args args;
struct rds_get_mr_args new_args;
@@ -344,7 +374,7 @@
if (optlen != sizeof(struct rds_get_mr_for_dest_args))
return -EINVAL;
- if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval,
+ if (copy_from_sockptr(&args, optval,
sizeof(struct rds_get_mr_for_dest_args)))
return -EFAULT;
@@ -363,7 +393,7 @@
/*
* Free the MR indicated by the given R_Key
*/
-int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen)
+int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen)
{
struct rds_free_mr_args args;
struct rds_mr *mr;
@@ -372,8 +402,7 @@
if (optlen != sizeof(struct rds_free_mr_args))
return -EINVAL;
- if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval,
- sizeof(struct rds_free_mr_args)))
+ if (copy_from_sockptr(&args, optval, sizeof(struct rds_free_mr_args)))
return -EFAULT;
/* Special case - a null cookie means flush all unused MRs */
@@ -401,13 +430,7 @@
if (!mr)
return -EINVAL;
- /*
- * call rds_destroy_mr() ourselves so that we're sure it's done by the time
- * we return. If we let rds_mr_put() do it it might not happen until
- * someone else drops their ref.
- */
- rds_destroy_mr(mr);
- rds_mr_put(mr);
+ kref_put(&mr->r_kref, __rds_put_mr_final);
return 0;
}
@@ -431,6 +454,14 @@
return;
}
+ /* Get a reference so that the MR won't go away before calling
+ * sync_mr() below.
+ */
+ kref_get(&mr->r_kref);
+
+ /* If it is going to be freed, remove it from the tree now so
+ * that no other thread can find it and free it.
+ */
if (mr->r_use_once || force) {
rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
RB_CLEAR_NODE(&mr->r_rb_node);
@@ -444,34 +475,37 @@
if (mr->r_trans->sync_mr)
mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
+ /* Release the reference held above. */
+ kref_put(&mr->r_kref, __rds_put_mr_final);
+
/* If the MR was marked as invalidate, this will
* trigger an async flush. */
- if (zot_me) {
- rds_destroy_mr(mr);
- rds_mr_put(mr);
- }
+ if (zot_me)
+ kref_put(&mr->r_kref, __rds_put_mr_final);
}
void rds_rdma_free_op(struct rm_rdma_op *ro)
{
unsigned int i;
- for (i = 0; i < ro->op_nents; i++) {
- struct page *page = sg_page(&ro->op_sg[i]);
+ if (ro->op_odp_mr) {
+ kref_put(&ro->op_odp_mr->r_kref, __rds_put_mr_final);
+ } else {
+ for (i = 0; i < ro->op_nents; i++) {
+ struct page *page = sg_page(&ro->op_sg[i]);
- /* Mark page dirty if it was possibly modified, which
- * is the case for a RDMA_READ which copies from remote
- * to local memory */
- if (!ro->op_write) {
- WARN_ON(!page->mapping && irqs_disabled());
- set_page_dirty(page);
+ /* Mark page dirty if it was possibly modified, which
+ * is the case for a RDMA_READ which copies from remote
+ * to local memory
+ */
+ unpin_user_pages_dirty_lock(&page, 1, !ro->op_write);
}
- put_page(page);
}
kfree(ro->op_notifier);
ro->op_notifier = NULL;
ro->op_active = 0;
+ ro->op_odp_mr = NULL;
}
void rds_atomic_free_op(struct rm_atomic_op *ao)
@@ -481,8 +515,7 @@
/* Mark page dirty if it was possibly modified, which
* is the case for a RDMA_READ which copies from remote
* to local memory */
- set_page_dirty(page);
- put_page(page);
+ unpin_user_pages_dirty_lock(&page, 1, true);
kfree(ao->op_notifier);
ao->op_notifier = NULL;
@@ -584,6 +617,7 @@
struct rds_iovec *iovs;
unsigned int i, j;
int ret = 0;
+ bool odp_supported = true;
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
|| rm->rdma.op_active)
@@ -605,6 +639,9 @@
ret = -EINVAL;
goto out_ret;
}
+ /* odp-mr is not supported for multiple requests within one message */
+ if (args->nr_local != 1)
+ odp_supported = false;
iovs = vec->iov;
@@ -626,6 +663,8 @@
op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
op->op_active = 1;
op->op_recverr = rs->rs_recverr;
+ op->op_odp_mr = NULL;
+
WARN_ON(!nr_pages);
op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
if (IS_ERR(op->op_sg)) {
@@ -677,10 +716,44 @@
* If it's a READ operation, we need to pin the pages for writing.
*/
ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
- if (ret < 0)
+ if ((!odp_supported && ret <= 0) ||
+ (odp_supported && ret <= 0 && ret != -EOPNOTSUPP))
goto out_pages;
- else
- ret = 0;
+
+ if (ret == -EOPNOTSUPP) {
+ struct rds_mr *local_odp_mr;
+
+ if (!rs->rs_transport->get_mr) {
+ ret = -EOPNOTSUPP;
+ goto out_pages;
+ }
+ local_odp_mr =
+ kzalloc(sizeof(*local_odp_mr), GFP_KERNEL);
+ if (!local_odp_mr) {
+ ret = -ENOMEM;
+ goto out_pages;
+ }
+ RB_CLEAR_NODE(&local_odp_mr->r_rb_node);
+ kref_init(&local_odp_mr->r_kref);
+ local_odp_mr->r_trans = rs->rs_transport;
+ local_odp_mr->r_sock = rs;
+ local_odp_mr->r_trans_private =
+ rs->rs_transport->get_mr(
+ NULL, 0, rs, &local_odp_mr->r_key, NULL,
+ iov->addr, iov->bytes, ODP_VIRTUAL);
+ if (IS_ERR(local_odp_mr->r_trans_private)) {
+ ret = IS_ERR(local_odp_mr->r_trans_private);
+ rdsdebug("get_mr ret %d %p\"", ret,
+ local_odp_mr->r_trans_private);
+ kfree(local_odp_mr);
+ ret = -EOPNOTSUPP;
+ goto out_pages;
+ }
+ rdsdebug("Need odp; local_odp_mr %p trans_private %p\n",
+ local_odp_mr, local_odp_mr->r_trans_private);
+ op->op_odp_mr = local_odp_mr;
+ op->op_odp_addr = iov->addr;
+ }
rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
nr_bytes, nr, iov->bytes, iov->addr);
@@ -696,6 +769,7 @@
min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
offset);
+ sg_dma_len(sg) = sg->length;
rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
sg->offset, sg->length, iov->addr, iov->bytes);
@@ -714,6 +788,7 @@
goto out_pages;
}
op->op_bytes = nr_bytes;
+ ret = 0;
out_pages:
kfree(pages);
@@ -756,11 +831,12 @@
if (!mr)
err = -EINVAL; /* invalid r_key */
else
- refcount_inc(&mr->r_refcount);
+ kref_get(&mr->r_kref);
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
if (mr) {
- mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
+ mr->r_trans->sync_mr(mr->r_trans_private,
+ DMA_TO_DEVICE);
rm->rdma.op_rdma_mr = mr;
}
return err;
@@ -874,7 +950,7 @@
return ret;
err:
if (page)
- put_page(page);
+ unpin_user_page(page);
rm->atomic.op_active = 0;
kfree(rm->atomic.op_notifier);
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
index bfafd4a..ca4c3a6 100644
--- a/net/rds/rdma_transport.h
+++ b/net/rds/rdma_transport.h
@@ -13,7 +13,7 @@
/* Below reject reason is for legacy interoperability issue with non-linux
* RDS endpoints where older version incompatibility is conveyed via value 1.
- * For future version(s), proper encoded reject reason should be be used.
+ * For future version(s), proper encoded reject reason should be used.
*/
#define RDS_RDMA_REJ_INCOMPAT 1
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 2ac5b5e..d35d1fc 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -40,7 +40,6 @@
#ifdef ATOMIC64_INIT
#define KERNEL_HAS_ATOMIC64
#endif
-
#ifdef RDS_DEBUG
#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
#else
@@ -292,7 +291,7 @@
struct rds_mr {
struct rb_node r_rb_node;
- refcount_t r_refcount;
+ struct kref r_kref;
u32 r_key;
/* A copy of the creation flags */
@@ -300,19 +299,11 @@
unsigned int r_invalidate:1;
unsigned int r_write:1;
- /* This is for RDS_MR_DEAD.
- * It would be nice & consistent to make this part of the above
- * bit field here, but we need to use test_and_set_bit.
- */
- unsigned long r_state;
struct rds_sock *r_sock; /* back pointer to the socket that owns us */
struct rds_transport *r_trans;
void *r_trans_private;
};
-/* Flags for mr->r_state */
-#define RDS_MR_DEAD 0
-
static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
{
return r_key | (((u64) offset) << 32);
@@ -478,6 +469,9 @@
struct rds_notifier *op_notifier;
struct rds_mr *op_rdma_mr;
+
+ u64 op_odp_addr;
+ struct rds_mr *op_odp_mr;
} rdma;
struct rm_data_op {
unsigned int op_active:1;
@@ -573,7 +567,8 @@
void (*exit)(void);
void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
struct rds_sock *rs, u32 *key_ret,
- struct rds_connection *conn);
+ struct rds_connection *conn,
+ u64 start, u64 length, int need_odp);
void (*sync_mr)(void *trans_private, int direction);
void (*free_mr)(void *trans_private, int invalidate);
void (*flush_mrs)(void);
@@ -783,6 +778,7 @@
void rds_conn_path_drop(struct rds_conn_path *cpath, bool destroy);
void rds_conn_connect_if_down(struct rds_connection *conn);
void rds_conn_path_connect_if_down(struct rds_conn_path *cp);
+void rds_check_all_paths(struct rds_connection *conn);
void rds_for_each_conn_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens,
@@ -828,6 +824,12 @@
}
static inline int
+rds_conn_path_down(struct rds_conn_path *cp)
+{
+ return atomic_read(&cp->cp_state) == RDS_CONN_DOWN;
+}
+
+static inline int
rds_conn_up(struct rds_connection *conn)
{
WARN_ON(conn->c_trans->t_mp_capable);
@@ -922,9 +924,9 @@
/* rdma.c */
void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
-int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
-int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
-int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen);
+int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen);
+int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen);
void rds_rdma_drop_keys(struct rds_sock *rs);
int rds_rdma_extra_size(struct rds_rdma_args *args,
struct rds_iov_vector *iov);
@@ -942,12 +944,7 @@
int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg);
-void __rds_put_mr_final(struct rds_mr *mr);
-static inline void rds_mr_put(struct rds_mr *mr)
-{
- if (refcount_dec_and_test(&mr->r_refcount))
- __rds_put_mr_final(mr);
-}
+void __rds_put_mr_final(struct kref *kref);
static inline bool rds_destroy_pending(struct rds_connection *conn)
{
@@ -955,6 +952,12 @@
(conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn));
}
+enum {
+ ODP_NOT_NEEDED,
+ ODP_ZEROBASED,
+ ODP_VIRTUAL
+};
+
/* stats.c */
DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
#define rds_stats_inc_which(which, member) do { \
diff --git a/net/rds/send.c b/net/rds/send.c
index 68e2bdb..985d0b7 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -934,7 +934,7 @@
case RDS_CMSG_ZCOPY_COOKIE:
zcopy_cookie = true;
- /* fall through */
+ fallthrough;
case RDS_CMSG_RDMA_DEST:
case RDS_CMSG_RDMA_MAP:
@@ -1340,7 +1340,8 @@
goto out;
}
- rds_conn_path_connect_if_down(cpath);
+ if (rds_conn_path_down(cpath))
+ rds_check_all_paths(conn);
ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
if (ret) {
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 1402e91..5327d13 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -62,8 +62,7 @@
static struct kmem_cache *rds_tcp_conn_slab;
static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *fpos);
+ void *buffer, size_t *lenp, loff_t *fpos);
static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF;
static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF;
@@ -90,15 +89,6 @@
{ }
};
-/* doing it this way avoids calling tcp_sk() */
-void rds_tcp_nonagle(struct socket *sock)
-{
- int val = 1;
-
- kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (void *)&val,
- sizeof(val));
-}
-
u32 rds_tcp_write_seq(struct rds_tcp_connection *tc)
{
/* seq# of the last byte of data in tcp send buffer */
@@ -503,14 +493,14 @@
struct net *net = sock_net(sk);
struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
- rds_tcp_nonagle(sock);
+ tcp_sock_set_nodelay(sock->sk);
lock_sock(sk);
if (rtn->sndbuf_size > 0) {
sk->sk_sndbuf = rtn->sndbuf_size;
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
}
if (rtn->rcvbuf_size > 0) {
- sk->sk_sndbuf = rtn->rcvbuf_size;
+ sk->sk_rcvbuf = rtn->rcvbuf_size;
sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
}
release_sock(sk);
@@ -676,8 +666,7 @@
}
static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *fpos)
+ void *buffer, size_t *lenp, loff_t *fpos)
{
struct net *net = current->nsproxy->net_ns;
int err;
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 4620549..dc8d745 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -50,7 +50,6 @@
/* tcp.c */
void rds_tcp_tune(struct socket *sock);
-void rds_tcp_nonagle(struct socket *sock);
void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp);
void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp);
void rds_tcp_restore_callbacks(struct socket *sock,
@@ -72,9 +71,8 @@
void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
void rds_tcp_listen_data_ready(struct sock *sk);
int rds_tcp_accept_one(struct socket *sock);
-int rds_tcp_keepalive(struct socket *sock);
+void rds_tcp_keepalive(struct socket *sock);
void *rds_tcp_listen_sock_def_readable(struct net *net);
-void rds_tcp_set_linger(struct socket *sock);
/* tcp_recv.c */
int rds_tcp_recv_init(void);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 008f50f..4e64598 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -207,7 +207,7 @@
if (sock) {
if (rds_destroy_pending(cp->cp_conn))
- rds_tcp_set_linger(sock);
+ sock_no_linger(sock->sk);
sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
lock_sock(sock->sk);
rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 26a3e18..09cadd5 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -38,36 +38,19 @@
#include "rds.h"
#include "tcp.h"
-int rds_tcp_keepalive(struct socket *sock)
+void rds_tcp_keepalive(struct socket *sock)
{
/* values below based on xs_udp_default_timeout */
int keepidle = 5; /* send a probe 'keepidle' secs after last data */
int keepcnt = 5; /* number of unack'ed probes before declaring dead */
- int keepalive = 1;
- int ret = 0;
- ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
- (char *)&keepalive, sizeof(keepalive));
- if (ret < 0)
- goto bail;
-
- ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT,
- (char *)&keepcnt, sizeof(keepcnt));
- if (ret < 0)
- goto bail;
-
- ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE,
- (char *)&keepidle, sizeof(keepidle));
- if (ret < 0)
- goto bail;
-
+ sock_set_keepalive(sock->sk);
+ tcp_sock_set_keepcnt(sock->sk, keepcnt);
+ tcp_sock_set_keepidle(sock->sk, keepidle);
/* KEEPINTVL is the interval between successive probes. We follow
* the model in xs_tcp_finish_connecting() and re-use keepidle.
*/
- ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL,
- (char *)&keepidle, sizeof(keepidle));
-bail:
- return ret;
+ tcp_sock_set_keepintvl(sock->sk, keepidle);
}
/* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the
@@ -111,17 +94,6 @@
return NULL;
}
-void rds_tcp_set_linger(struct socket *sock)
-{
- struct linger no_linger = {
- .l_onoff = 1,
- .l_linger = 0,
- };
-
- kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
- (char *)&no_linger, sizeof(no_linger));
-}
-
int rds_tcp_accept_one(struct socket *sock)
{
struct socket *new_sock = NULL;
@@ -160,10 +132,7 @@
new_sock->ops = sock->ops;
__module_get(new_sock->ops->owner);
- ret = rds_tcp_keepalive(new_sock);
- if (ret < 0)
- goto out;
-
+ rds_tcp_keepalive(new_sock);
rds_tcp_tune(new_sock);
inet = inet_sk(new_sock->sk);
@@ -247,7 +216,7 @@
* be pending on it. By setting linger, we achieve the side-effect
* of avoiding TIME_WAIT state on new_sock.
*/
- rds_tcp_set_linger(new_sock);
+ sock_no_linger(new_sock->sk);
kernel_sock_shutdown(new_sock, SHUT_RDWR);
ret = 0;
out:
@@ -309,7 +278,7 @@
}
sock->sk->sk_reuse = SK_CAN_REUSE;
- rds_tcp_nonagle(sock);
+ tcp_sock_set_nodelay(sock->sk);
write_lock_bh(&sock->sk->sk_callback_lock);
sock->sk->sk_user_data = sock->sk->sk_data_ready;
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 78a2554..8c4d1d6 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -38,23 +38,18 @@
#include "rds.h"
#include "tcp.h"
-static void rds_tcp_cork(struct socket *sock, int val)
-{
- kernel_setsockopt(sock, SOL_TCP, TCP_CORK, (void *)&val, sizeof(val));
-}
-
void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp)
{
struct rds_tcp_connection *tc = cp->cp_transport_data;
- rds_tcp_cork(tc->t_sock, 1);
+ tcp_sock_set_cork(tc->t_sock->sk, true);
}
void rds_tcp_xmit_path_complete(struct rds_conn_path *cp)
{
struct rds_tcp_connection *tc = cp->cp_transport_data;
- rds_tcp_cork(tc->t_sock, 0);
+ tcp_sock_set_cork(tc->t_sock->sk, false);
}
/* the core send_sem serializes this with other xmit and shutdown */
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 46f709a..f8001ec 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -38,6 +38,12 @@
#include "rds.h"
#include "loop.h"
+static char * const rds_trans_modules[] = {
+ [RDS_TRANS_IB] = "rds_rdma",
+ [RDS_TRANS_GAP] = NULL,
+ [RDS_TRANS_TCP] = "rds_tcp",
+};
+
static struct rds_transport *transports[RDS_TRANS_COUNT];
static DECLARE_RWSEM(rds_trans_sem);
@@ -110,18 +116,20 @@
{
struct rds_transport *ret = NULL;
struct rds_transport *trans;
- unsigned int i;
down_read(&rds_trans_sem);
- for (i = 0; i < RDS_TRANS_COUNT; i++) {
- trans = transports[i];
-
- if (trans && trans->t_type == t_type &&
- (!trans->t_owner || try_module_get(trans->t_owner))) {
- ret = trans;
- break;
- }
+ trans = transports[t_type];
+ if (!trans) {
+ up_read(&rds_trans_sem);
+ if (rds_trans_modules[t_type])
+ request_module(rds_trans_modules[t_type]);
+ down_read(&rds_trans_sem);
+ trans = transports[t_type];
}
+ if (trans && trans->t_type == t_type &&
+ (!trans->t_owner || try_module_get(trans->t_owner)))
+ ret = trans;
+
up_read(&rds_trans_sem);
return ret;