Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 3bcf985..bbbb5af 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -21,7 +21,6 @@
depends on SUNRPC && CRYPTO
depends on CRYPTO_MD5 && CRYPTO_DES && CRYPTO_CBC && CRYPTO_CTS
depends on CRYPTO_ECB && CRYPTO_HMAC && CRYPTO_SHA1 && CRYPTO_AES
- depends on CRYPTO_ARC4
default y
select SUNRPC_GSS
help
diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c
index 0d4a2bb..d435bff 100644
--- a/net/sunrpc/addr.c
+++ b/net/sunrpc/addr.c
@@ -162,8 +162,10 @@
const size_t buflen, const char *delim,
struct sockaddr_in6 *sin6)
{
- char *p;
+ char p[IPV6_SCOPE_ID_LEN + 1];
size_t len;
+ u32 scope_id = 0;
+ struct net_device *dev;
if ((buf + buflen) == delim)
return 1;
@@ -175,29 +177,23 @@
return 0;
len = (buf + buflen) - delim - 1;
- p = kstrndup(delim + 1, len, GFP_KERNEL);
- if (p) {
- u32 scope_id = 0;
- struct net_device *dev;
+ if (len > IPV6_SCOPE_ID_LEN)
+ return 0;
- dev = dev_get_by_name(net, p);
- if (dev != NULL) {
- scope_id = dev->ifindex;
- dev_put(dev);
- } else {
- if (kstrtou32(p, 10, &scope_id) != 0) {
- kfree(p);
- return 0;
- }
- }
+ memcpy(p, delim + 1, len);
+ p[len] = 0;
- kfree(p);
-
- sin6->sin6_scope_id = scope_id;
- return 1;
+ dev = dev_get_by_name(net, p);
+ if (dev != NULL) {
+ scope_id = dev->ifindex;
+ dev_put(dev);
+ } else {
+ if (kstrtou32(p, 10, &scope_id) != 0)
+ return 0;
}
- return 0;
+ sin6->sin6_scope_id = scope_id;
+ return 1;
}
static size_t rpc_pton6(struct net *net, const char *buf, const size_t buflen,
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index cdb05b4..a9f0d17 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -81,7 +81,7 @@
unsigned int nbits;
nbits = *(unsigned int *)kp->arg;
- return sprintf(buffer, "%u", 1U << nbits);
+ return sprintf(buffer, "%u\n", 1U << nbits);
}
#define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int);
@@ -221,55 +221,6 @@
}
EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo);
-/**
- * rpcauth_list_flavors - discover registered flavors and pseudoflavors
- * @array: array to fill in
- * @size: size of "array"
- *
- * Returns the number of array items filled in, or a negative errno.
- *
- * The returned array is not sorted by any policy. Callers should not
- * rely on the order of the items in the returned array.
- */
-int
-rpcauth_list_flavors(rpc_authflavor_t *array, int size)
-{
- const struct rpc_authops *ops;
- rpc_authflavor_t flavor, pseudos[4];
- int i, len, result = 0;
-
- rcu_read_lock();
- for (flavor = 0; flavor < RPC_AUTH_MAXFLAVOR; flavor++) {
- ops = rcu_dereference(auth_flavors[flavor]);
- if (result >= size) {
- result = -ENOMEM;
- break;
- }
-
- if (ops == NULL)
- continue;
- if (ops->list_pseudoflavors == NULL) {
- array[result++] = ops->au_flavor;
- continue;
- }
- len = ops->list_pseudoflavors(pseudos, ARRAY_SIZE(pseudos));
- if (len < 0) {
- result = len;
- break;
- }
- for (i = 0; i < len; i++) {
- if (result >= size) {
- result = -ENOMEM;
- break;
- }
- array[result++] = pseudos[i];
- }
- }
- rcu_read_unlock();
- return result;
-}
-EXPORT_SYMBOL_GPL(rpcauth_list_flavors);
-
struct rpc_auth *
rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
{
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index b7a7157..5f42aa5 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -226,7 +226,7 @@
if (IS_ERR(p))
goto err;
done:
- trace_rpcgss_context(ctx->gc_expiry, now, timeout,
+ trace_rpcgss_context(window_size, ctx->gc_expiry, now, timeout,
ctx->gc_acceptor.len, ctx->gc_acceptor.data);
err:
return p;
@@ -669,10 +669,12 @@
}
schedule();
}
- if (gss_msg->ctx)
+ if (gss_msg->ctx) {
+ trace_rpcgss_ctx_init(gss_cred);
gss_cred_set_ctx(cred, gss_msg->ctx);
- else
+ } else {
err = gss_msg->msg.errno;
+ }
spin_unlock(&pipe->lock);
out_intr:
finish_wait(&gss_msg->waitqueue, &wait);
@@ -1026,11 +1028,11 @@
auth->au_rslack = GSS_KRB5_MAX_SLACK_NEEDED >> 2;
auth->au_verfsize = GSS_VERF_SLACK >> 2;
auth->au_ralign = GSS_VERF_SLACK >> 2;
- auth->au_flags = 0;
+ __set_bit(RPCAUTH_AUTH_UPDATE_SLACK, &auth->au_flags);
auth->au_ops = &authgss_ops;
auth->au_flavor = flavor;
if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor))
- auth->au_flags |= RPCAUTH_AUTH_DATATOUCH;
+ __set_bit(RPCAUTH_AUTH_DATATOUCH, &auth->au_flags);
refcount_set(&auth->au_count, 1);
kref_init(&gss_auth->kref);
@@ -1256,8 +1258,9 @@
if (new) {
ctx->gc_proc = RPC_GSS_PROC_DESTROY;
+ trace_rpcgss_ctx_destroy(gss_cred);
task = rpc_call_null(gss_auth->client, &new->gc_base,
- RPC_TASK_ASYNC|RPC_TASK_SOFT);
+ RPC_TASK_ASYNC);
if (!IS_ERR(task))
rpc_put_task(task);
@@ -1321,7 +1324,6 @@
static void
gss_destroy_cred(struct rpc_cred *cred)
{
-
if (test_and_clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0)
gss_send_destroy_context(cred);
gss_destroy_nullcred(cred);
@@ -1585,6 +1587,7 @@
new = gss_lookup_cred(auth, &acred, RPCAUTH_LOOKUP_NEW);
if (IS_ERR(new))
return PTR_ERR(new);
+
task->tk_rqstp->rq_cred = new;
put_rpccred(oldcred);
return 0;
@@ -1681,7 +1684,8 @@
/* We leave it to unwrap to calculate au_rslack. For now we just
* calculate the length of the verifier: */
- cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2;
+ if (test_bit(RPCAUTH_AUTH_UPDATE_SLACK, &cred->cr_auth->au_flags))
+ cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2;
status = 0;
out:
gss_put_ctx(ctx);
@@ -1697,8 +1701,9 @@
goto out;
}
-static int gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
- struct rpc_task *task, struct xdr_stream *xdr)
+static noinline_for_stack int
+gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+ struct rpc_task *task, struct xdr_stream *xdr)
{
struct rpc_rqst *rqstp = task->tk_rqstp;
struct xdr_buf integ_buf, *snd_buf = &rqstp->rq_snd_buf;
@@ -1789,8 +1794,9 @@
return -EAGAIN;
}
-static int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
- struct rpc_task *task, struct xdr_stream *xdr)
+static noinline_for_stack int
+gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+ struct rpc_task *task, struct xdr_stream *xdr)
{
struct rpc_rqst *rqstp = task->tk_rqstp;
struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
@@ -1850,7 +1856,7 @@
else
iov = snd_buf->head;
p = iov->iov_base + iov->iov_len;
- pad = 3 - ((snd_buf->len - offset - 1) & 3);
+ pad = xdr_pad_size(snd_buf->len - offset);
memset(p, 0, pad);
iov->iov_len += pad;
snd_buf->len += pad;
@@ -1897,13 +1903,30 @@
return status;
}
-static int
-gss_unwrap_resp_auth(struct rpc_cred *cred)
+/**
+ * gss_update_rslack - Possibly update RPC receive buffer size estimates
+ * @task: rpc_task for incoming RPC Reply being unwrapped
+ * @cred: controlling rpc_cred for @task
+ * @before: XDR words needed before each RPC Reply message
+ * @after: XDR words needed following each RPC Reply message
+ *
+ */
+static void gss_update_rslack(struct rpc_task *task, struct rpc_cred *cred,
+ unsigned int before, unsigned int after)
{
struct rpc_auth *auth = cred->cr_auth;
- auth->au_rslack = auth->au_verfsize;
- auth->au_ralign = auth->au_verfsize;
+ if (test_and_clear_bit(RPCAUTH_AUTH_UPDATE_SLACK, &auth->au_flags)) {
+ auth->au_ralign = auth->au_verfsize + before;
+ auth->au_rslack = auth->au_verfsize + after;
+ trace_rpcgss_update_slack(task, auth);
+ }
+}
+
+static int
+gss_unwrap_resp_auth(struct rpc_task *task, struct rpc_cred *cred)
+{
+ gss_update_rslack(task, cred, 0, 0);
return 0;
}
@@ -1920,13 +1943,12 @@
* proc_req_arg_t arg;
* };
*/
-static int
+static noinline_for_stack int
gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp,
struct xdr_stream *xdr)
{
struct xdr_buf gss_data, *rcv_buf = &rqstp->rq_rcv_buf;
- struct rpc_auth *auth = cred->cr_auth;
u32 len, offset, seqno, maj_stat;
struct xdr_netobj mic;
int ret;
@@ -1975,8 +1997,7 @@
if (maj_stat != GSS_S_COMPLETE)
goto bad_mic;
- auth->au_rslack = auth->au_verfsize + 2 + 1 + XDR_QUADLEN(mic.len);
- auth->au_ralign = auth->au_verfsize + 2;
+ gss_update_rslack(task, cred, 2, 2 + 1 + XDR_QUADLEN(mic.len));
ret = 0;
out:
@@ -1994,14 +2015,13 @@
goto out;
}
-static int
+static noinline_for_stack int
gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred,
struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp,
struct xdr_stream *xdr)
{
struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf;
struct kvec *head = rqstp->rq_rcv_buf.head;
- struct rpc_auth *auth = cred->cr_auth;
u32 offset, opaque_len, maj_stat;
__be32 *p;
@@ -2028,8 +2048,8 @@
*/
xdr_init_decode(xdr, rcv_buf, p, rqstp);
- auth->au_rslack = auth->au_verfsize + 2 + ctx->gc_gss_ctx->slack;
- auth->au_ralign = auth->au_verfsize + 2 + ctx->gc_gss_ctx->align;
+ gss_update_rslack(task, cred, 2 + ctx->gc_gss_ctx->align,
+ 2 + ctx->gc_gss_ctx->slack);
return 0;
unwrap_failed:
@@ -2100,7 +2120,7 @@
goto out_decode;
switch (gss_cred->gc_service) {
case RPC_GSS_SVC_NONE:
- status = gss_unwrap_resp_auth(cred);
+ status = gss_unwrap_resp_auth(task, cred);
break;
case RPC_GSS_SVC_INTEGRITY:
status = gss_unwrap_resp_integ(task, cred, ctx, rqstp, xdr);
@@ -2128,7 +2148,6 @@
.hash_cred = gss_hash_cred,
.lookup_cred = gss_lookup_cred,
.crcreate = gss_create_cred,
- .list_pseudoflavors = gss_mech_list_pseudoflavors,
.info2flavor = gss_mech_info2flavor,
.flavor2info = gss_mech_flavor2info,
};
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index e7180da..634b6c6 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -138,135 +138,6 @@
return crypto_ahash_update(req);
}
-static int
-arcfour_hmac_md5_usage_to_salt(unsigned int usage, u8 salt[4])
-{
- unsigned int ms_usage;
-
- switch (usage) {
- case KG_USAGE_SIGN:
- ms_usage = 15;
- break;
- case KG_USAGE_SEAL:
- ms_usage = 13;
- break;
- default:
- return -EINVAL;
- }
- salt[0] = (ms_usage >> 0) & 0xff;
- salt[1] = (ms_usage >> 8) & 0xff;
- salt[2] = (ms_usage >> 16) & 0xff;
- salt[3] = (ms_usage >> 24) & 0xff;
-
- return 0;
-}
-
-static u32
-make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
- struct xdr_buf *body, int body_offset, u8 *cksumkey,
- unsigned int usage, struct xdr_netobj *cksumout)
-{
- struct scatterlist sg[1];
- int err = -1;
- u8 *checksumdata;
- u8 *rc4salt;
- struct crypto_ahash *md5;
- struct crypto_ahash *hmac_md5;
- struct ahash_request *req;
-
- if (cksumkey == NULL)
- return GSS_S_FAILURE;
-
- if (cksumout->len < kctx->gk5e->cksumlength) {
- dprintk("%s: checksum buffer length, %u, too small for %s\n",
- __func__, cksumout->len, kctx->gk5e->name);
- return GSS_S_FAILURE;
- }
-
- rc4salt = kmalloc_array(4, sizeof(*rc4salt), GFP_NOFS);
- if (!rc4salt)
- return GSS_S_FAILURE;
-
- if (arcfour_hmac_md5_usage_to_salt(usage, rc4salt)) {
- dprintk("%s: invalid usage value %u\n", __func__, usage);
- goto out_free_rc4salt;
- }
-
- checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS);
- if (!checksumdata)
- goto out_free_rc4salt;
-
- md5 = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(md5))
- goto out_free_cksum;
-
- hmac_md5 = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0,
- CRYPTO_ALG_ASYNC);
- if (IS_ERR(hmac_md5))
- goto out_free_md5;
-
- req = ahash_request_alloc(md5, GFP_NOFS);
- if (!req)
- goto out_free_hmac_md5;
-
- ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
-
- err = crypto_ahash_init(req);
- if (err)
- goto out;
- sg_init_one(sg, rc4salt, 4);
- ahash_request_set_crypt(req, sg, NULL, 4);
- err = crypto_ahash_update(req);
- if (err)
- goto out;
-
- sg_init_one(sg, header, hdrlen);
- ahash_request_set_crypt(req, sg, NULL, hdrlen);
- err = crypto_ahash_update(req);
- if (err)
- goto out;
- err = xdr_process_buf(body, body_offset, body->len - body_offset,
- checksummer, req);
- if (err)
- goto out;
- ahash_request_set_crypt(req, NULL, checksumdata, 0);
- err = crypto_ahash_final(req);
- if (err)
- goto out;
-
- ahash_request_free(req);
- req = ahash_request_alloc(hmac_md5, GFP_NOFS);
- if (!req)
- goto out_free_hmac_md5;
-
- ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
-
- err = crypto_ahash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength);
- if (err)
- goto out;
-
- sg_init_one(sg, checksumdata, crypto_ahash_digestsize(md5));
- ahash_request_set_crypt(req, sg, checksumdata,
- crypto_ahash_digestsize(md5));
- err = crypto_ahash_digest(req);
- if (err)
- goto out;
-
- memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength);
- cksumout->len = kctx->gk5e->cksumlength;
-out:
- ahash_request_free(req);
-out_free_hmac_md5:
- crypto_free_ahash(hmac_md5);
-out_free_md5:
- crypto_free_ahash(md5);
-out_free_cksum:
- kfree(checksumdata);
-out_free_rc4salt:
- kfree(rc4salt);
- return err ? GSS_S_FAILURE : 0;
-}
-
/*
* checksum the plaintext data and hdrlen bytes of the token header
* The checksum is performed over the first 8 bytes of the
@@ -284,11 +155,6 @@
u8 *checksumdata;
unsigned int checksumlen;
- if (kctx->gk5e->ctype == CKSUMTYPE_HMAC_MD5_ARCFOUR)
- return make_checksum_hmac_md5(kctx, header, hdrlen,
- body, body_offset,
- cksumkey, usage, cksumout);
-
if (cksumout->len < kctx->gk5e->cksumlength) {
dprintk("%s: checksum buffer length, %u, too small for %s\n",
__func__, cksumout->len, kctx->gk5e->name);
@@ -942,145 +808,3 @@
ret = GSS_S_FAILURE;
return ret;
}
-
-/*
- * Compute Kseq given the initial session key and the checksum.
- * Set the key of the given cipher.
- */
-int
-krb5_rc4_setup_seq_key(struct krb5_ctx *kctx,
- struct crypto_sync_skcipher *cipher,
- unsigned char *cksum)
-{
- struct crypto_shash *hmac;
- struct shash_desc *desc;
- u8 Kseq[GSS_KRB5_MAX_KEYLEN];
- u32 zeroconstant = 0;
- int err;
-
- dprintk("%s: entered\n", __func__);
-
- hmac = crypto_alloc_shash(kctx->gk5e->cksum_name, 0, 0);
- if (IS_ERR(hmac)) {
- dprintk("%s: error %ld, allocating hash '%s'\n",
- __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name);
- return PTR_ERR(hmac);
- }
-
- desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
- GFP_NOFS);
- if (!desc) {
- dprintk("%s: failed to allocate shash descriptor for '%s'\n",
- __func__, kctx->gk5e->cksum_name);
- crypto_free_shash(hmac);
- return -ENOMEM;
- }
-
- desc->tfm = hmac;
-
- /* Compute intermediate Kseq from session key */
- err = crypto_shash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength);
- if (err)
- goto out_err;
-
- err = crypto_shash_digest(desc, (u8 *)&zeroconstant, 4, Kseq);
- if (err)
- goto out_err;
-
- /* Compute final Kseq from the checksum and intermediate Kseq */
- err = crypto_shash_setkey(hmac, Kseq, kctx->gk5e->keylength);
- if (err)
- goto out_err;
-
- err = crypto_shash_digest(desc, cksum, 8, Kseq);
- if (err)
- goto out_err;
-
- err = crypto_sync_skcipher_setkey(cipher, Kseq, kctx->gk5e->keylength);
- if (err)
- goto out_err;
-
- err = 0;
-
-out_err:
- kzfree(desc);
- crypto_free_shash(hmac);
- dprintk("%s: returning %d\n", __func__, err);
- return err;
-}
-
-/*
- * Compute Kcrypt given the initial session key and the plaintext seqnum.
- * Set the key of cipher kctx->enc.
- */
-int
-krb5_rc4_setup_enc_key(struct krb5_ctx *kctx,
- struct crypto_sync_skcipher *cipher,
- s32 seqnum)
-{
- struct crypto_shash *hmac;
- struct shash_desc *desc;
- u8 Kcrypt[GSS_KRB5_MAX_KEYLEN];
- u8 zeroconstant[4] = {0};
- u8 seqnumarray[4];
- int err, i;
-
- dprintk("%s: entered, seqnum %u\n", __func__, seqnum);
-
- hmac = crypto_alloc_shash(kctx->gk5e->cksum_name, 0, 0);
- if (IS_ERR(hmac)) {
- dprintk("%s: error %ld, allocating hash '%s'\n",
- __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name);
- return PTR_ERR(hmac);
- }
-
- desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
- GFP_NOFS);
- if (!desc) {
- dprintk("%s: failed to allocate shash descriptor for '%s'\n",
- __func__, kctx->gk5e->cksum_name);
- crypto_free_shash(hmac);
- return -ENOMEM;
- }
-
- desc->tfm = hmac;
-
- /* Compute intermediate Kcrypt from session key */
- for (i = 0; i < kctx->gk5e->keylength; i++)
- Kcrypt[i] = kctx->Ksess[i] ^ 0xf0;
-
- err = crypto_shash_setkey(hmac, Kcrypt, kctx->gk5e->keylength);
- if (err)
- goto out_err;
-
- err = crypto_shash_digest(desc, zeroconstant, 4, Kcrypt);
- if (err)
- goto out_err;
-
- /* Compute final Kcrypt from the seqnum and intermediate Kcrypt */
- err = crypto_shash_setkey(hmac, Kcrypt, kctx->gk5e->keylength);
- if (err)
- goto out_err;
-
- seqnumarray[0] = (unsigned char) ((seqnum >> 24) & 0xff);
- seqnumarray[1] = (unsigned char) ((seqnum >> 16) & 0xff);
- seqnumarray[2] = (unsigned char) ((seqnum >> 8) & 0xff);
- seqnumarray[3] = (unsigned char) ((seqnum >> 0) & 0xff);
-
- err = crypto_shash_digest(desc, seqnumarray, 4, Kcrypt);
- if (err)
- goto out_err;
-
- err = crypto_sync_skcipher_setkey(cipher, Kcrypt,
- kctx->gk5e->keylength);
- if (err)
- goto out_err;
-
- err = 0;
-
-out_err:
- kzfree(desc);
- crypto_free_shash(hmac);
- dprintk("%s: returning %d\n", __func__, err);
- return err;
-}
diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c
index 3b7f721..726c076 100644
--- a/net/sunrpc/auth_gss/gss_krb5_keys.c
+++ b/net/sunrpc/auth_gss/gss_krb5_keys.c
@@ -228,11 +228,11 @@
ret = 0;
err_free_raw:
- kzfree(rawkey);
+ kfree_sensitive(rawkey);
err_free_out:
- kzfree(outblockdata);
+ kfree_sensitive(outblockdata);
err_free_in:
- kzfree(inblockdata);
+ kfree_sensitive(inblockdata);
err_free_cipher:
crypto_free_sync_skcipher(cipher);
err_return:
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index b552dd4..1c092b0 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -54,27 +54,6 @@
},
#endif /* CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES */
/*
- * RC4-HMAC
- */
- {
- .etype = ENCTYPE_ARCFOUR_HMAC,
- .ctype = CKSUMTYPE_HMAC_MD5_ARCFOUR,
- .name = "rc4-hmac",
- .encrypt_name = "ecb(arc4)",
- .cksum_name = "hmac(md5)",
- .encrypt = krb5_encrypt,
- .decrypt = krb5_decrypt,
- .mk_key = NULL,
- .signalg = SGN_ALG_HMAC_MD5,
- .sealalg = SEAL_ALG_MICROSOFT_RC4,
- .keybytes = 16,
- .keylength = 16,
- .blocksize = 1,
- .conflen = 8,
- .cksumlength = 8,
- .keyed_cksum = 1,
- },
- /*
* 3DES
*/
{
@@ -226,6 +205,7 @@
{
u32 seq_send;
int tmp;
+ u32 time32;
p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate));
if (IS_ERR(p))
@@ -263,9 +243,11 @@
p = ERR_PTR(-ENOSYS);
goto out_err;
}
- p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime));
+ p = simple_get_bytes(p, end, &time32, sizeof(time32));
if (IS_ERR(p))
goto out_err;
+ /* unsigned 32-bit time overflows in year 2106 */
+ ctx->endtime = (time64_t)time32;
p = simple_get_bytes(p, end, &seq_send, sizeof(seq_send));
if (IS_ERR(p))
goto out_err;
@@ -371,78 +353,6 @@
return -EINVAL;
}
-/*
- * Note that RC4 depends on deriving keys using the sequence
- * number or the checksum of a token. Therefore, the final keys
- * cannot be calculated until the token is being constructed!
- */
-static int
-context_derive_keys_rc4(struct krb5_ctx *ctx)
-{
- struct crypto_shash *hmac;
- char sigkeyconstant[] = "signaturekey";
- int slen = strlen(sigkeyconstant) + 1; /* include null terminator */
- struct shash_desc *desc;
- int err;
-
- dprintk("RPC: %s: entered\n", __func__);
- /*
- * derive cksum (aka Ksign) key
- */
- hmac = crypto_alloc_shash(ctx->gk5e->cksum_name, 0, 0);
- if (IS_ERR(hmac)) {
- dprintk("%s: error %ld allocating hash '%s'\n",
- __func__, PTR_ERR(hmac), ctx->gk5e->cksum_name);
- err = PTR_ERR(hmac);
- goto out_err;
- }
-
- err = crypto_shash_setkey(hmac, ctx->Ksess, ctx->gk5e->keylength);
- if (err)
- goto out_err_free_hmac;
-
-
- desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), GFP_NOFS);
- if (!desc) {
- dprintk("%s: failed to allocate hash descriptor for '%s'\n",
- __func__, ctx->gk5e->cksum_name);
- err = -ENOMEM;
- goto out_err_free_hmac;
- }
-
- desc->tfm = hmac;
-
- err = crypto_shash_digest(desc, sigkeyconstant, slen, ctx->cksum);
- kzfree(desc);
- if (err)
- goto out_err_free_hmac;
- /*
- * allocate hash, and skciphers for data and seqnum encryption
- */
- ctx->enc = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0);
- if (IS_ERR(ctx->enc)) {
- err = PTR_ERR(ctx->enc);
- goto out_err_free_hmac;
- }
-
- ctx->seq = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0);
- if (IS_ERR(ctx->seq)) {
- crypto_free_sync_skcipher(ctx->enc);
- err = PTR_ERR(ctx->seq);
- goto out_err_free_hmac;
- }
-
- dprintk("RPC: %s: returning success\n", __func__);
-
- err = 0;
-
-out_err_free_hmac:
- crypto_free_shash(hmac);
-out_err:
- dprintk("RPC: %s: returning %d\n", __func__, err);
- return err;
-}
-
static int
context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask)
{
@@ -560,15 +470,18 @@
{
u64 seq_send64;
int keylen;
+ u32 time32;
p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags));
if (IS_ERR(p))
goto out_err;
ctx->initiate = ctx->flags & KRB5_CTX_FLAG_INITIATOR;
- p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime));
+ p = simple_get_bytes(p, end, &time32, sizeof(time32));
if (IS_ERR(p))
goto out_err;
+ /* unsigned 32-bit time overflows in year 2106 */
+ ctx->endtime = (time64_t)time32;
p = simple_get_bytes(p, end, &seq_send64, sizeof(seq_send64));
if (IS_ERR(p))
goto out_err;
@@ -616,8 +529,6 @@
switch (ctx->enctype) {
case ENCTYPE_DES3_CBC_RAW:
return context_derive_keys_des3(ctx, gfp_mask);
- case ENCTYPE_ARCFOUR_HMAC:
- return context_derive_keys_rc4(ctx);
case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
return context_derive_keys_new(ctx, gfp_mask);
@@ -632,7 +543,7 @@
static int
gss_import_sec_context_kerberos(const void *p, size_t len,
struct gss_ctx *ctx_id,
- time_t *endtime,
+ time64_t *endtime,
gfp_t gfp_mask)
{
const void *end = (const void *)((const char *)p + len);
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index 48fe4a5..3306141 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -131,14 +131,14 @@
struct xdr_netobj md5cksum = {.len = sizeof(cksumdata),
.data = cksumdata};
void *ptr;
- s32 now;
+ time64_t now;
u32 seq_send;
u8 *cksumkey;
dprintk("RPC: %s\n", __func__);
BUG_ON(ctx == NULL);
- now = get_seconds();
+ now = ktime_get_real_seconds();
ptr = setup_token(ctx, token);
@@ -170,7 +170,7 @@
struct xdr_netobj cksumobj = { .len = sizeof(cksumdata),
.data = cksumdata};
void *krb5_hdr;
- s32 now;
+ time64_t now;
u8 *cksumkey;
unsigned int cksum_usage;
__be64 seq_send_be64;
@@ -198,7 +198,7 @@
memcpy(krb5_hdr + GSS_KRB5_TOK_HDR_LEN, cksumobj.data, cksumobj.len);
- now = get_seconds();
+ now = ktime_get_real_seconds();
return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE;
}
@@ -214,7 +214,6 @@
BUG();
case ENCTYPE_DES_CBC_RAW:
case ENCTYPE_DES3_CBC_RAW:
- case ENCTYPE_ARCFOUR_HMAC:
return gss_get_mic_v1(ctx, text, token);
case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
index 5071051..fb11781 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
@@ -39,42 +39,6 @@
# define RPCDBG_FACILITY RPCDBG_AUTH
#endif
-static s32
-krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum,
- unsigned char *cksum, unsigned char *buf)
-{
- struct crypto_sync_skcipher *cipher;
- unsigned char *plain;
- s32 code;
-
- dprintk("RPC: %s:\n", __func__);
- cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, 0, 0);
- if (IS_ERR(cipher))
- return PTR_ERR(cipher);
-
- plain = kmalloc(8, GFP_NOFS);
- if (!plain)
- return -ENOMEM;
-
- plain[0] = (unsigned char) ((seqnum >> 24) & 0xff);
- plain[1] = (unsigned char) ((seqnum >> 16) & 0xff);
- plain[2] = (unsigned char) ((seqnum >> 8) & 0xff);
- plain[3] = (unsigned char) ((seqnum >> 0) & 0xff);
- plain[4] = direction;
- plain[5] = direction;
- plain[6] = direction;
- plain[7] = direction;
-
- code = krb5_rc4_setup_seq_key(kctx, cipher, cksum);
- if (code)
- goto out;
-
- code = krb5_encrypt(cipher, cksum, plain, buf, 8);
-out:
- kfree(plain);
- crypto_free_sync_skcipher(cipher);
- return code;
-}
s32
krb5_make_seq_num(struct krb5_ctx *kctx,
struct crypto_sync_skcipher *key,
@@ -85,10 +49,6 @@
unsigned char *plain;
s32 code;
- if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC)
- return krb5_make_rc4_seq_num(kctx, direction, seqnum,
- cksum, buf);
-
plain = kmalloc(8, GFP_NOFS);
if (!plain)
return -ENOMEM;
@@ -108,50 +68,6 @@
return code;
}
-static s32
-krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum,
- unsigned char *buf, int *direction, s32 *seqnum)
-{
- struct crypto_sync_skcipher *cipher;
- unsigned char *plain;
- s32 code;
-
- dprintk("RPC: %s:\n", __func__);
- cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, 0, 0);
- if (IS_ERR(cipher))
- return PTR_ERR(cipher);
-
- code = krb5_rc4_setup_seq_key(kctx, cipher, cksum);
- if (code)
- goto out;
-
- plain = kmalloc(8, GFP_NOFS);
- if (!plain) {
- code = -ENOMEM;
- goto out;
- }
-
- code = krb5_decrypt(cipher, cksum, buf, plain, 8);
- if (code)
- goto out_plain;
-
- if ((plain[4] != plain[5]) || (plain[4] != plain[6])
- || (plain[4] != plain[7])) {
- code = (s32)KG_BAD_SEQ;
- goto out_plain;
- }
-
- *direction = plain[4];
-
- *seqnum = ((plain[0] << 24) | (plain[1] << 16) |
- (plain[2] << 8) | (plain[3]));
-out_plain:
- kfree(plain);
-out:
- crypto_free_sync_skcipher(cipher);
- return code;
-}
-
s32
krb5_get_seq_num(struct krb5_ctx *kctx,
unsigned char *cksum,
@@ -164,9 +80,6 @@
dprintk("RPC: krb5_get_seq_num:\n");
- if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC)
- return krb5_get_rc4_seq_num(kctx, cksum, buf,
- direction, seqnum);
plain = kmalloc(8, GFP_NOFS);
if (!plain)
return -ENOMEM;
diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c
index ef2b25b..ba04e3e 100644
--- a/net/sunrpc/auth_gss/gss_krb5_unseal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c
@@ -124,7 +124,7 @@
/* it got through unscathed. Make sure the context is unexpired */
- now = get_seconds();
+ now = ktime_get_real_seconds();
if (now > ctx->endtime)
return GSS_S_CONTEXT_EXPIRED;
@@ -149,7 +149,7 @@
char cksumdata[GSS_KRB5_MAX_CKSUM_LEN];
struct xdr_netobj cksumobj = {.len = sizeof(cksumdata),
.data = cksumdata};
- s32 now;
+ time64_t now;
u8 *ptr = read_token->data;
u8 *cksumkey;
u8 flags;
@@ -194,7 +194,7 @@
return GSS_S_BAD_SIG;
/* it got through unscathed. Make sure the context is unexpired */
- now = get_seconds();
+ now = ktime_get_real_seconds();
if (now > ctx->endtime)
return GSS_S_CONTEXT_EXPIRED;
@@ -218,7 +218,6 @@
BUG();
case ENCTYPE_DES_CBC_RAW:
case ENCTYPE_DES3_CBC_RAW:
- case ENCTYPE_ARCFOUR_HMAC:
return gss_verify_mic_v1(ctx, message_buffer, read_token);
case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 78ad416..e95c009 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -137,7 +137,7 @@
switch (conflen) {
case 16:
*q++ = i++;
- /* fall through */
+ fallthrough;
case 8:
*q++ = i++;
break;
@@ -163,7 +163,7 @@
.data = cksumdata};
int blocksize = 0, plainlen;
unsigned char *ptr, *msg_start;
- s32 now;
+ time64_t now;
int headlen;
struct page **tmp_pages;
u32 seq_send;
@@ -172,7 +172,7 @@
dprintk("RPC: %s\n", __func__);
- now = get_seconds();
+ now = ktime_get_real_seconds();
blocksize = crypto_sync_skcipher_blocksize(kctx->enc);
gss_krb5_add_padding(buf, offset, blocksize);
@@ -236,26 +236,9 @@
seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)))
return GSS_S_FAILURE;
- if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {
- struct crypto_sync_skcipher *cipher;
- int err;
- cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name,
- 0, 0);
- if (IS_ERR(cipher))
- return GSS_S_FAILURE;
-
- krb5_rc4_setup_enc_key(kctx, cipher, seq_send);
-
- err = gss_encrypt_xdr_buf(cipher, buf,
- offset + headlen - conflen, pages);
- crypto_free_sync_skcipher(cipher);
- if (err)
- return GSS_S_FAILURE;
- } else {
- if (gss_encrypt_xdr_buf(kctx->enc, buf,
- offset + headlen - conflen, pages))
- return GSS_S_FAILURE;
- }
+ if (gss_encrypt_xdr_buf(kctx->enc, buf,
+ offset + headlen - conflen, pages))
+ return GSS_S_FAILURE;
return (kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE;
}
@@ -270,7 +253,7 @@
char cksumdata[GSS_KRB5_MAX_CKSUM_LEN];
struct xdr_netobj md5cksum = {.len = sizeof(cksumdata),
.data = cksumdata};
- s32 now;
+ time64_t now;
int direction;
s32 seqnum;
unsigned char *ptr;
@@ -316,37 +299,9 @@
crypt_offset = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) -
(unsigned char *)buf->head[0].iov_base;
- /*
- * Need plaintext seqnum to derive encryption key for arcfour-hmac
- */
- if (krb5_get_seq_num(kctx, ptr + GSS_KRB5_TOK_HDR_LEN,
- ptr + 8, &direction, &seqnum))
- return GSS_S_BAD_SIG;
-
- if ((kctx->initiate && direction != 0xff) ||
- (!kctx->initiate && direction != 0))
- return GSS_S_BAD_SIG;
-
buf->len = len;
- if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {
- struct crypto_sync_skcipher *cipher;
- int err;
-
- cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name,
- 0, 0);
- if (IS_ERR(cipher))
- return GSS_S_FAILURE;
-
- krb5_rc4_setup_enc_key(kctx, cipher, seqnum);
-
- err = gss_decrypt_xdr_buf(cipher, buf, crypt_offset);
- crypto_free_sync_skcipher(cipher);
- if (err)
- return GSS_S_DEFECTIVE_TOKEN;
- } else {
- if (gss_decrypt_xdr_buf(kctx->enc, buf, crypt_offset))
- return GSS_S_DEFECTIVE_TOKEN;
- }
+ if (gss_decrypt_xdr_buf(kctx->enc, buf, crypt_offset))
+ return GSS_S_DEFECTIVE_TOKEN;
if (kctx->gk5e->keyed_cksum)
cksumkey = kctx->cksum;
@@ -363,13 +318,21 @@
/* it got through unscathed. Make sure the context is unexpired */
- now = get_seconds();
+ now = ktime_get_real_seconds();
if (now > kctx->endtime)
return GSS_S_CONTEXT_EXPIRED;
/* do sequencing checks */
+ if (krb5_get_seq_num(kctx, ptr + GSS_KRB5_TOK_HDR_LEN,
+ ptr + 8, &direction, &seqnum))
+ return GSS_S_BAD_SIG;
+
+ if ((kctx->initiate && direction != 0xff) ||
+ (!kctx->initiate && direction != 0))
+ return GSS_S_BAD_SIG;
+
/* Copy the data back to the right position. XXX: Would probably be
* better to copy and encrypt at the same time. */
@@ -447,7 +410,7 @@
struct xdr_buf *buf, struct page **pages)
{
u8 *ptr, *plainhdr;
- s32 now;
+ time64_t now;
u8 flags = 0x00;
__be16 *be16ptr;
__be64 *be64ptr;
@@ -489,7 +452,7 @@
if (err)
return err;
- now = get_seconds();
+ now = ktime_get_real_seconds();
return (kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE;
}
@@ -498,7 +461,7 @@
struct xdr_buf *buf, unsigned int *slack,
unsigned int *align)
{
- s32 now;
+ time64_t now;
u8 *ptr;
u8 flags = 0x00;
u16 ec, rrc;
@@ -567,7 +530,7 @@
/* do sequencing checks */
/* it got through unscathed. Make sure the context is unexpired */
- now = get_seconds();
+ now = ktime_get_real_seconds();
if (now > kctx->endtime)
return GSS_S_CONTEXT_EXPIRED;
@@ -605,7 +568,6 @@
BUG();
case ENCTYPE_DES_CBC_RAW:
case ENCTYPE_DES3_CBC_RAW:
- case ENCTYPE_ARCFOUR_HMAC:
return gss_wrap_kerberos_v1(kctx, offset, buf, pages);
case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
@@ -624,7 +586,6 @@
BUG();
case ENCTYPE_DES_CBC_RAW:
case ENCTYPE_DES3_CBC_RAW:
- case ENCTYPE_ARCFOUR_HMAC:
return gss_unwrap_kerberos_v1(kctx, offset, len, buf,
&gctx->slack, &gctx->align);
case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 9314999..fae632d 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -20,6 +20,7 @@
#include <linux/sunrpc/sched.h>
#include <linux/sunrpc/gss_api.h>
#include <linux/sunrpc/clnt.h>
+#include <trace/events/rpcgss.h>
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY RPCDBG_AUTH
@@ -164,7 +165,6 @@
if (sprint_oid(obj->data, obj->len, buf, sizeof(buf)) < 0)
return NULL;
- dprintk("RPC: %s(%s)\n", __func__, buf);
request_module("rpc-auth-gss-%s", buf);
rcu_read_lock();
@@ -178,6 +178,8 @@
}
}
rcu_read_unlock();
+ if (!gm)
+ trace_rpcgss_oid_to_mech(buf);
return gm;
}
@@ -224,35 +226,6 @@
}
/**
- * gss_mech_list_pseudoflavors - Discover registered GSS pseudoflavors
- * @array_ptr: array to fill in
- * @size: size of "array"
- *
- * Returns the number of array items filled in, or a negative errno.
- *
- * The returned array is not sorted by any policy. Callers should not
- * rely on the order of the items in the returned array.
- */
-int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size)
-{
- struct gss_api_mech *pos = NULL;
- int j, i = 0;
-
- rcu_read_lock();
- list_for_each_entry_rcu(pos, &registered_mechs, gm_list) {
- for (j = 0; j < pos->gm_pf_num; j++) {
- if (i >= size) {
- spin_unlock(&registered_mechs_lock);
- return -ENOMEM;
- }
- array_ptr[i++] = pos->gm_pfs[j].pseudoflavor;
- }
- }
- rcu_read_unlock();
- return i;
-}
-
-/**
* gss_svc_to_pseudoflavor - map a GSS service number to a pseudoflavor
* @gm: GSS mechanism handle
* @qop: GSS quality-of-protection value
@@ -380,7 +353,7 @@
gss_import_sec_context(const void *input_token, size_t bufsize,
struct gss_api_mech *mech,
struct gss_ctx **ctx_id,
- time_t *endtime,
+ time64_t *endtime,
gfp_t gfp_mask)
{
if (!(*ctx_id = kzalloc(sizeof(**ctx_id), gfp_mask)))
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
index 0349f45..af9c7f4 100644
--- a/net/sunrpc/auth_gss/gss_rpc_upcall.c
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -223,7 +223,7 @@
static char *gssp_stringify(struct xdr_netobj *netobj)
{
- return kstrndup(netobj->data, netobj->len, GFP_KERNEL);
+ return kmemdup_nul(netobj->data, netobj->len, GFP_KERNEL);
}
static void gssp_hostbased_service(char **principal)
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index c001647..f5111d6 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -49,13 +49,12 @@
#include <linux/sunrpc/svcauth.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/cache.h>
+
+#include <trace/events/rpcgss.h>
+
#include "gss_rpc_upcall.h"
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY RPCDBG_AUTH
-#endif
-
/* The rpcsec_init cache is used for mapping RPCSEC_GSS_{,CONT_}INIT requests
* into replies.
*
@@ -181,6 +180,11 @@
return NULL;
}
+static int rsi_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall_timeout(cd, h);
+}
+
static void rsi_request(struct cache_detail *cd,
struct cache_head *h,
char **bpp, int *blen)
@@ -200,7 +204,7 @@
char *ep;
int len;
struct rsi rsii, *rsip = NULL;
- time_t expiry;
+ time64_t expiry;
int status = -EINVAL;
memset(&rsii, 0, sizeof(rsii));
@@ -279,6 +283,7 @@
.hash_size = RSI_HASHMAX,
.name = "auth.rpcsec.init",
.cache_put = rsi_put,
+ .cache_upcall = rsi_upcall,
.cache_request = rsi_request,
.cache_parse = rsi_parse,
.match = rsi_match,
@@ -327,7 +332,7 @@
struct gss_svc_seq_data {
/* highest seq number seen so far: */
- int sd_max;
+ u32 sd_max;
/* for i such that sd_max-GSS_SEQ_WIN < i <= sd_max, the i-th bit of
* sd_win is nonzero iff sequence number i has been seen already: */
unsigned long sd_win[GSS_SEQ_WIN/BITS_PER_LONG];
@@ -425,6 +430,11 @@
return NULL;
}
+static int rsc_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return -EINVAL;
+}
+
static int rsc_parse(struct cache_detail *cd,
char *mesg, int mlen)
{
@@ -433,7 +443,7 @@
int id;
int len, rv;
struct rsc rsci, *rscp = NULL;
- time_t expiry;
+ time64_t expiry;
int status = -EINVAL;
struct gss_api_mech *gm = NULL;
@@ -551,6 +561,7 @@
.hash_size = RSC_HASHMAX,
.name = "auth.rpcsec.context",
.cache_put = rsc_put,
+ .cache_upcall = rsc_upcall,
.cache_parse = rsc_parse,
.match = rsc_match,
.init = rsc_init,
@@ -602,16 +613,29 @@
return found;
}
-/* Implements sequence number algorithm as specified in RFC 2203. */
-static int
-gss_check_seq_num(struct rsc *rsci, int seq_num)
+/**
+ * gss_check_seq_num - GSS sequence number window check
+ * @rqstp: RPC Call to use when reporting errors
+ * @rsci: cached GSS context state (updated on return)
+ * @seq_num: sequence number to check
+ *
+ * Implements sequence number algorithm as specified in
+ * RFC 2203, Section 5.3.3.1. "Context Management".
+ *
+ * Return values:
+ * %true: @rqstp's GSS sequence number is inside the window
+ * %false: @rqstp's GSS sequence number is outside the window
+ */
+static bool gss_check_seq_num(const struct svc_rqst *rqstp, struct rsc *rsci,
+ u32 seq_num)
{
struct gss_svc_seq_data *sd = &rsci->seqdata;
+ bool result = false;
spin_lock(&sd->sd_lock);
if (seq_num > sd->sd_max) {
if (seq_num >= sd->sd_max + GSS_SEQ_WIN) {
- memset(sd->sd_win,0,sizeof(sd->sd_win));
+ memset(sd->sd_win, 0, sizeof(sd->sd_win));
sd->sd_max = seq_num;
} else while (sd->sd_max < seq_num) {
sd->sd_max++;
@@ -619,18 +643,26 @@
}
__set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win);
goto ok;
- } else if (seq_num <= sd->sd_max - GSS_SEQ_WIN) {
- goto drop;
+ } else if (seq_num + GSS_SEQ_WIN <= sd->sd_max) {
+ goto toolow;
}
- /* sd_max - GSS_SEQ_WIN < seq_num <= sd_max */
if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win))
- goto drop;
+ goto alreadyseen;
+
ok:
+ result = true;
+out:
spin_unlock(&sd->sd_lock);
- return 1;
-drop:
- spin_unlock(&sd->sd_lock);
- return 0;
+ return result;
+
+toolow:
+ trace_rpcgss_svc_seqno_low(rqstp, seq_num,
+ sd->sd_max - GSS_SEQ_WIN,
+ sd->sd_max);
+ goto out;
+alreadyseen:
+ trace_rpcgss_svc_seqno_seen(rqstp, seq_num);
+ goto out;
}
static inline u32 round_up_to_quad(u32 i)
@@ -710,16 +742,12 @@
}
if (gc->gc_seq > MAXSEQ) {
- dprintk("RPC: svcauth_gss: discarding request with "
- "large sequence number %d\n", gc->gc_seq);
+ trace_rpcgss_svc_seqno_large(rqstp, gc->gc_seq);
*authp = rpcsec_gsserr_ctxproblem;
return SVC_DENIED;
}
- if (!gss_check_seq_num(rsci, gc->gc_seq)) {
- dprintk("RPC: svcauth_gss: discarding request with "
- "old sequence number %d\n", gc->gc_seq);
+ if (!gss_check_seq_num(rqstp, rsci, gc->gc_seq))
return SVC_DROP;
- }
return SVC_OK;
}
@@ -857,11 +885,13 @@
static int
unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx)
{
+ u32 integ_len, rseqno, maj_stat;
int stat = -EINVAL;
- u32 integ_len, maj_stat;
struct xdr_netobj mic;
struct xdr_buf integ_buf;
+ mic.data = NULL;
+
/* NFS READ normally uses splice to send data in-place. However
* the data in cache can change after the reply's MIC is computed
* but before the RPC reply is sent. To prevent the client from
@@ -876,34 +906,44 @@
integ_len = svc_getnl(&buf->head[0]);
if (integ_len & 3)
- return stat;
+ goto unwrap_failed;
if (integ_len > buf->len)
- return stat;
- if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len)) {
- WARN_ON_ONCE(1);
- return stat;
- }
+ goto unwrap_failed;
+ if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len))
+ goto unwrap_failed;
+
/* copy out mic... */
if (read_u32_from_xdr_buf(buf, integ_len, &mic.len))
- return stat;
+ goto unwrap_failed;
if (mic.len > RPC_MAX_AUTH_SIZE)
- return stat;
+ goto unwrap_failed;
mic.data = kmalloc(mic.len, GFP_KERNEL);
if (!mic.data)
- return stat;
+ goto unwrap_failed;
if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len))
- goto out;
+ goto unwrap_failed;
maj_stat = gss_verify_mic(ctx, &integ_buf, &mic);
if (maj_stat != GSS_S_COMPLETE)
- goto out;
- if (svc_getnl(&buf->head[0]) != seq)
- goto out;
+ goto bad_mic;
+ rseqno = svc_getnl(&buf->head[0]);
+ if (rseqno != seq)
+ goto bad_seqno;
/* trim off the mic and padding at the end before returning */
xdr_buf_trim(buf, round_up_to_quad(mic.len) + 4);
stat = 0;
out:
kfree(mic.data);
return stat;
+
+unwrap_failed:
+ trace_rpcgss_svc_unwrap_failed(rqstp);
+ goto out;
+bad_seqno:
+ trace_rpcgss_svc_seqno_bad(rqstp, seq, rseqno);
+ goto out;
+bad_mic:
+ trace_rpcgss_svc_mic(rqstp, maj_stat);
+ goto out;
}
static inline int
@@ -928,6 +968,7 @@
{
u32 priv_len, maj_stat;
int pad, remaining_len, offset;
+ u32 rseqno;
clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
@@ -942,7 +983,7 @@
* not yet read from the head, so these two values are different: */
remaining_len = total_buf_len(buf);
if (priv_len > remaining_len)
- return -EINVAL;
+ goto unwrap_failed;
pad = remaining_len - priv_len;
buf->len -= pad;
fix_priv_head(buf, pad);
@@ -955,18 +996,29 @@
/* XXX: This is very inefficient. It would be better to either do
* this while we encrypt, or maybe in the receive code, if we can peak
* ahead and work out the service and mechanism there. */
- offset = buf->head[0].iov_len % 4;
+ offset = xdr_pad_size(buf->head[0].iov_len);
if (offset) {
buf->buflen = RPCSVC_MAXPAYLOAD;
xdr_shift_buf(buf, offset);
fix_priv_head(buf, pad);
}
if (maj_stat != GSS_S_COMPLETE)
- return -EINVAL;
+ goto bad_unwrap;
out_seq:
- if (svc_getnl(&buf->head[0]) != seq)
- return -EINVAL;
+ rseqno = svc_getnl(&buf->head[0]);
+ if (rseqno != seq)
+ goto bad_seqno;
return 0;
+
+unwrap_failed:
+ trace_rpcgss_svc_unwrap_failed(rqstp);
+ return -EINVAL;
+bad_seqno:
+ trace_rpcgss_svc_seqno_bad(rqstp, seq, rseqno);
+ return -EINVAL;
+bad_unwrap:
+ trace_rpcgss_svc_unwrap(rqstp, maj_stat);
+ return -EINVAL;
}
struct gss_svc_data {
@@ -1222,7 +1274,7 @@
static atomic64_t ctxhctr;
long long ctxh;
struct gss_api_mech *gm = NULL;
- time_t expiry;
+ time64_t expiry;
int status = -EINVAL;
memset(&rsci, 0, sizeof(rsci));
@@ -1246,7 +1298,6 @@
if (!ud->found_creds) {
/* userspace seem buggy, we should always get at least a
* mapping to nobody */
- dprintk("RPC: No creds found!\n");
goto out;
} else {
struct timespec64 boot;
@@ -1312,9 +1363,7 @@
if (status)
goto out;
- dprintk("RPC: svcauth_gss: gss major status = %d "
- "minor status = %d\n",
- ud.major_status, ud.minor_status);
+ trace_rpcgss_svc_accept_upcall(rqstp, ud.major_status, ud.minor_status);
switch (ud.major_status) {
case GSS_S_CONTINUE_NEEDED:
@@ -1322,31 +1371,23 @@
break;
case GSS_S_COMPLETE:
status = gss_proxy_save_rsc(sn->rsc_cache, &ud, &handle);
- if (status) {
- pr_info("%s: gss_proxy_save_rsc failed (%d)\n",
- __func__, status);
+ if (status)
goto out;
- }
cli_handle.data = (u8 *)&handle;
cli_handle.len = sizeof(handle);
break;
default:
- ret = SVC_CLOSE;
goto out;
}
/* Got an answer to the upcall; use it: */
if (gss_write_init_verf(sn->rsc_cache, rqstp,
- &cli_handle, &ud.major_status)) {
- pr_info("%s: gss_write_init_verf failed\n", __func__);
+ &cli_handle, &ud.major_status))
goto out;
- }
if (gss_write_resv(resv, PAGE_SIZE,
&cli_handle, &ud.out_token,
- ud.major_status, ud.minor_status)) {
- pr_info("%s: gss_write_resv failed\n", __func__);
+ ud.major_status, ud.minor_status))
goto out;
- }
ret = SVC_COMPLETE;
out:
@@ -1434,10 +1475,10 @@
return len;
}
-static const struct file_operations use_gss_proxy_ops = {
- .open = nonseekable_open,
- .write = write_gssp,
- .read = read_gssp,
+static const struct proc_ops use_gss_proxy_proc_ops = {
+ .proc_open = nonseekable_open,
+ .proc_write = write_gssp,
+ .proc_read = read_gssp,
};
static int create_use_gss_proxy_proc_entry(struct net *net)
@@ -1448,7 +1489,7 @@
sn->use_gss_proxy = -1;
*p = proc_create_data("use-gss-proxy", S_IFREG | 0600,
sn->proc_net_rpc,
- &use_gss_proxy_ops, net);
+ &use_gss_proxy_proc_ops, net);
if (!*p)
return -ENOMEM;
init_gssp_clnt(sn);
@@ -1497,9 +1538,6 @@
int ret;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
- dprintk("RPC: svcauth_gss: argv->iov_len = %zd\n",
- argv->iov_len);
-
*authp = rpc_autherr_badcred;
if (!svcdata)
svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL);
@@ -1616,6 +1654,7 @@
GSS_C_QOP_DEFAULT,
gc->gc_svc);
ret = SVC_OK;
+ trace_rpcgss_svc_authenticate(rqstp, gc);
goto out;
}
garbage_args:
@@ -1682,7 +1721,8 @@
goto out;
integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base;
integ_len = resbuf->len - integ_offset;
- BUG_ON(integ_len % 4);
+ if (integ_len & 3)
+ goto out;
*p++ = htonl(integ_len);
*p++ = htonl(gc->gc_seq);
if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, integ_len)) {
@@ -1706,7 +1746,8 @@
resv->iov_len += XDR_QUADLEN(mic.len) << 2;
/* not strictly required: */
resbuf->len += XDR_QUADLEN(mic.len) << 2;
- BUG_ON(resv->iov_len > PAGE_SIZE);
+ if (resv->iov_len > PAGE_SIZE)
+ goto out_err;
out:
stat = 0;
out_err:
@@ -1742,9 +1783,11 @@
* both the head and tail.
*/
if (resbuf->tail[0].iov_base) {
- BUG_ON(resbuf->tail[0].iov_base >= resbuf->head[0].iov_base
- + PAGE_SIZE);
- BUG_ON(resbuf->tail[0].iov_base < resbuf->head[0].iov_base);
+ if (resbuf->tail[0].iov_base >=
+ resbuf->head[0].iov_base + PAGE_SIZE)
+ return -EINVAL;
+ if (resbuf->tail[0].iov_base < resbuf->head[0].iov_base)
+ return -EINVAL;
if (resbuf->tail[0].iov_len + resbuf->head[0].iov_len
+ 2 * RPC_MAX_AUTH_SIZE > PAGE_SIZE)
return -ENOMEM;
diff --git a/net/sunrpc/auth_gss/trace.c b/net/sunrpc/auth_gss/trace.c
index 5576f1e..76685ab 100644
--- a/net/sunrpc/auth_gss/trace.c
+++ b/net/sunrpc/auth_gss/trace.c
@@ -5,6 +5,9 @@
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/sunrpc/auth_gss.h>
#include <linux/sunrpc/gss_err.h>
#define CREATE_TRACE_POINTS
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 195b40c..22a2c23 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -5,7 +5,7 @@
NetApp provides this source code under the GPL v2 License.
The GPL v2 license is available at
-http://opensource.org/licenses/gpl-license.php.
+https://opensource.org/licenses/gpl-license.php.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@@ -111,7 +111,7 @@
* by the backchannel. This function can be called multiple times
* when creating new sessions that use the same rpc_xprt. The
* preallocated buffers are added to the pool of resources used by
- * the rpc_xprt. Anyone of these resources may be used used by an
+ * the rpc_xprt. Any one of these resources may be used by an
* incoming callback request. It's up to the higher levels in the
* stack to enforce that the maximum number of session slots is not
* being exceeded.
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 7ede1e5..20c93b6 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -32,17 +32,17 @@
#include <linux/sunrpc/cache.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <trace/events/sunrpc.h>
#include "netns.h"
#define RPCDBG_FACILITY RPCDBG_CACHE
static bool cache_defer_req(struct cache_req *req, struct cache_head *item);
static void cache_revisit_request(struct cache_head *item);
-static bool cache_listeners_exist(struct cache_detail *detail);
static void cache_init(struct cache_head *h, struct cache_detail *detail)
{
- time_t now = seconds_since_boot();
+ time64_t now = seconds_since_boot();
INIT_HLIST_NODE(&h->cache_list);
h->flags = 0;
kref_init(&h->ref);
@@ -65,18 +65,35 @@
rcu_read_lock();
hlist_for_each_entry_rcu(tmp, head, cache_list) {
- if (detail->match(tmp, key)) {
- if (cache_is_expired(detail, tmp))
- continue;
- tmp = cache_get_rcu(tmp);
- rcu_read_unlock();
- return tmp;
- }
+ if (!detail->match(tmp, key))
+ continue;
+ if (test_bit(CACHE_VALID, &tmp->flags) &&
+ cache_is_expired(detail, tmp))
+ continue;
+ tmp = cache_get_rcu(tmp);
+ rcu_read_unlock();
+ return tmp;
}
rcu_read_unlock();
return NULL;
}
+static void sunrpc_begin_cache_remove_entry(struct cache_head *ch,
+ struct cache_detail *cd)
+{
+ /* Must be called under cd->hash_lock */
+ hlist_del_init_rcu(&ch->cache_list);
+ set_bit(CACHE_CLEANED, &ch->flags);
+ cd->entries --;
+}
+
+static void sunrpc_end_cache_remove_entry(struct cache_head *ch,
+ struct cache_detail *cd)
+{
+ cache_fresh_unlocked(ch, cd);
+ cache_put(ch, cd);
+}
+
static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
struct cache_head *key,
int hash)
@@ -97,19 +114,21 @@
spin_lock(&detail->hash_lock);
/* check if entry appeared while we slept */
- hlist_for_each_entry_rcu(tmp, head, cache_list) {
- if (detail->match(tmp, key)) {
- if (cache_is_expired(detail, tmp)) {
- hlist_del_init_rcu(&tmp->cache_list);
- detail->entries --;
- freeme = tmp;
- break;
- }
- cache_get(tmp);
- spin_unlock(&detail->hash_lock);
- cache_put(new, detail);
- return tmp;
+ hlist_for_each_entry_rcu(tmp, head, cache_list,
+ lockdep_is_held(&detail->hash_lock)) {
+ if (!detail->match(tmp, key))
+ continue;
+ if (test_bit(CACHE_VALID, &tmp->flags) &&
+ cache_is_expired(detail, tmp)) {
+ sunrpc_begin_cache_remove_entry(tmp, detail);
+ trace_cache_entry_expired(detail, tmp);
+ freeme = tmp;
+ break;
}
+ cache_get(tmp);
+ spin_unlock(&detail->hash_lock);
+ cache_put(new, detail);
+ return tmp;
}
hlist_add_head_rcu(&new->cache_list, head);
@@ -117,10 +136,8 @@
cache_get(new);
spin_unlock(&detail->hash_lock);
- if (freeme) {
- cache_fresh_unlocked(freeme, detail);
- cache_put(freeme, detail);
- }
+ if (freeme)
+ sunrpc_end_cache_remove_entry(freeme, detail);
return new;
}
@@ -139,10 +156,10 @@
static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch);
-static void cache_fresh_locked(struct cache_head *head, time_t expiry,
+static void cache_fresh_locked(struct cache_head *head, time64_t expiry,
struct cache_detail *detail)
{
- time_t now = seconds_since_boot();
+ time64_t now = seconds_since_boot();
if (now <= detail->flush_time)
/* ensure it isn't immediately treated as expired */
now = detail->flush_time + 1;
@@ -161,6 +178,25 @@
}
}
+static void cache_make_negative(struct cache_detail *detail,
+ struct cache_head *h)
+{
+ set_bit(CACHE_NEGATIVE, &h->flags);
+ trace_cache_entry_make_negative(detail, h);
+}
+
+static void cache_entry_update(struct cache_detail *detail,
+ struct cache_head *h,
+ struct cache_head *new)
+{
+ if (!test_bit(CACHE_NEGATIVE, &new->flags)) {
+ detail->update(h, new);
+ trace_cache_entry_update(detail, h);
+ } else {
+ cache_make_negative(detail, h);
+ }
+}
+
struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
struct cache_head *new, struct cache_head *old, int hash)
{
@@ -173,10 +209,7 @@
if (!test_bit(CACHE_VALID, &old->flags)) {
spin_lock(&detail->hash_lock);
if (!test_bit(CACHE_VALID, &old->flags)) {
- if (test_bit(CACHE_NEGATIVE, &new->flags))
- set_bit(CACHE_NEGATIVE, &old->flags);
- else
- detail->update(old, new);
+ cache_entry_update(detail, old, new);
cache_fresh_locked(old, new->expiry_time, detail);
spin_unlock(&detail->hash_lock);
cache_fresh_unlocked(old, detail);
@@ -194,10 +227,7 @@
detail->init(tmp, old);
spin_lock(&detail->hash_lock);
- if (test_bit(CACHE_NEGATIVE, &new->flags))
- set_bit(CACHE_NEGATIVE, &tmp->flags);
- else
- detail->update(tmp, new);
+ cache_entry_update(detail, tmp, new);
hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]);
detail->entries++;
cache_get(tmp);
@@ -211,13 +241,6 @@
}
EXPORT_SYMBOL_GPL(sunrpc_cache_update);
-static int cache_make_upcall(struct cache_detail *cd, struct cache_head *h)
-{
- if (cd->cache_upcall)
- return cd->cache_upcall(cd, h);
- return sunrpc_cache_pipe_upcall(cd, h);
-}
-
static inline int cache_is_valid(struct cache_head *h)
{
if (!test_bit(CACHE_VALID, &h->flags))
@@ -246,7 +269,7 @@
spin_lock(&detail->hash_lock);
rv = cache_is_valid(h);
if (rv == -EAGAIN) {
- set_bit(CACHE_NEGATIVE, &h->flags);
+ cache_make_negative(detail, h);
cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY,
detail);
rv = -ENOENT;
@@ -274,7 +297,7 @@
struct cache_head *h, struct cache_req *rqstp)
{
int rv;
- long refresh_age, age;
+ time64_t refresh_age, age;
/* First decide return status as best we can */
rv = cache_is_valid(h);
@@ -288,19 +311,16 @@
rv = -ENOENT;
} else if (rv == -EAGAIN ||
(h->expiry_time != 0 && age > refresh_age/2)) {
- dprintk("RPC: Want update, refage=%ld, age=%ld\n",
+ dprintk("RPC: Want update, refage=%lld, age=%lld\n",
refresh_age, age);
- if (!test_and_set_bit(CACHE_PENDING, &h->flags)) {
- switch (cache_make_upcall(detail, h)) {
- case -EINVAL:
- rv = try_to_negate_entry(detail, h);
- break;
- case -EAGAIN:
- cache_fresh_unlocked(h, detail);
- break;
- }
- } else if (!cache_listeners_exist(detail))
+ switch (detail->cache_upcall(detail, h)) {
+ case -EINVAL:
rv = try_to_negate_entry(detail, h);
+ break;
+ case -EAGAIN:
+ cache_fresh_unlocked(h, detail);
+ break;
+ }
}
if (rv == -EAGAIN) {
@@ -454,8 +474,8 @@
if (!cache_is_expired(current_detail, ch))
continue;
- hlist_del_init_rcu(&ch->cache_list);
- current_detail->entries--;
+ sunrpc_begin_cache_remove_entry(ch, current_detail);
+ trace_cache_entry_expired(current_detail, ch);
rv = 1;
break;
}
@@ -465,11 +485,8 @@
if (!ch)
current_index ++;
spin_unlock(&cache_list_lock);
- if (ch) {
- set_bit(CACHE_CLEANED, &ch->flags);
- cache_fresh_unlocked(ch, d);
- cache_put(ch, d);
- }
+ if (ch)
+ sunrpc_end_cache_remove_entry(ch, d);
} else
spin_unlock(&cache_list_lock);
@@ -481,16 +498,17 @@
*/
static void do_cache_clean(struct work_struct *work)
{
- int delay = 5;
- if (cache_clean() == -1)
- delay = round_jiffies_relative(30*HZ);
+ int delay;
if (list_empty(&cache_list))
- delay = 0;
+ return;
- if (delay)
- queue_delayed_work(system_power_efficient_wq,
- &cache_cleaner, delay);
+ if (cache_clean() == -1)
+ delay = round_jiffies_relative(30*HZ);
+ else
+ delay = 5;
+
+ queue_delayed_work(system_power_efficient_wq, &cache_cleaner, delay);
}
@@ -512,7 +530,6 @@
{
struct cache_head *ch = NULL;
struct hlist_head *head = NULL;
- struct hlist_node *tmp = NULL;
int i = 0;
spin_lock(&detail->hash_lock);
@@ -524,14 +541,12 @@
dprintk("RPC: %d entries in %s cache\n", detail->entries, detail->name);
for (i = 0; i < detail->hash_size; i++) {
head = &detail->hash_table[i];
- hlist_for_each_entry_safe(ch, tmp, head, cache_list) {
- hlist_del_init_rcu(&ch->cache_list);
- detail->entries--;
-
- set_bit(CACHE_CLEANED, &ch->flags);
+ while (!hlist_empty(head)) {
+ ch = hlist_entry(head->first, struct cache_head,
+ cache_list);
+ sunrpc_begin_cache_remove_entry(ch, detail);
spin_unlock(&detail->hash_lock);
- cache_fresh_unlocked(ch, detail);
- cache_put(ch, detail);
+ sunrpc_end_cache_remove_entry(ch, detail);
spin_lock(&detail->hash_lock);
}
}
@@ -894,7 +909,7 @@
static ssize_t cache_slow_downcall(const char __user *buf,
size_t count, struct cache_detail *cd)
{
- static char write_buf[8192]; /* protected by queue_io_mutex */
+ static char write_buf[32768]; /* protected by queue_io_mutex */
ssize_t ret = -EINVAL;
if (count >= sizeof(write_buf))
@@ -1190,20 +1205,12 @@
*
* Each request is at most one page long.
*/
-int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h)
+static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h)
{
-
char *buf;
struct cache_request *crq;
int ret = 0;
- if (!detail->cache_request)
- return -EINVAL;
-
- if (!cache_listeners_exist(detail)) {
- warn_no_listener(detail);
- return -EINVAL;
- }
if (test_bit(CACHE_CLEANED, &h->flags))
/* Too late to make an upcall */
return -EAGAIN;
@@ -1226,6 +1233,7 @@
if (test_bit(CACHE_PENDING, &h->flags)) {
crq->item = cache_get(h);
list_add_tail(&crq->q.list, &detail->queue);
+ trace_cache_entry_upcall(detail, h);
} else
/* Lost a race, no longer PENDING, so don't enqueue */
ret = -EAGAIN;
@@ -1237,8 +1245,27 @@
}
return ret;
}
+
+int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h)
+{
+ if (test_and_set_bit(CACHE_PENDING, &h->flags))
+ return 0;
+ return cache_pipe_upcall(detail, h);
+}
EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall);
+int sunrpc_cache_pipe_upcall_timeout(struct cache_detail *detail,
+ struct cache_head *h)
+{
+ if (!cache_listeners_exist(detail)) {
+ warn_no_listener(detail);
+ trace_cache_entry_no_listener(detail, h);
+ return -EINVAL;
+ }
+ return sunrpc_cache_pipe_upcall(detail, h);
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall_timeout);
+
/*
* parse a message from user-space and pass it
* to an appropriate cache
@@ -1404,16 +1431,16 @@
return cd->cache_show(m, cd, NULL);
ifdebug(CACHE)
- seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n",
+ seq_printf(m, "# expiry=%lld refcnt=%d flags=%lx\n",
convert_to_wallclock(cp->expiry_time),
kref_read(&cp->ref), cp->flags);
cache_get(cp);
if (cache_check(cd, cp, NULL))
/* cache_check does a cache_put on failure */
- seq_printf(m, "# ");
+ seq_puts(m, "# ");
else {
if (cache_is_expired(cd, cp))
- seq_printf(m, "# ");
+ seq_puts(m, "# ");
cache_put(cp, cd);
}
@@ -1477,7 +1504,7 @@
char tbuf[22];
size_t len;
- len = snprintf(tbuf, sizeof(tbuf), "%lu\n",
+ len = snprintf(tbuf, sizeof(tbuf), "%llu\n",
convert_to_wallclock(cd->flush_time));
return simple_read_from_buffer(buf, count, ppos, tbuf, len);
}
@@ -1488,7 +1515,7 @@
{
char tbuf[20];
char *ep;
- time_t now;
+ time64_t now;
if (*ppos || count > sizeof(tbuf)-1)
return -EINVAL;
@@ -1571,15 +1598,14 @@
return cache_release(inode, filp, cd);
}
-static const struct file_operations cache_file_operations_procfs = {
- .owner = THIS_MODULE,
- .llseek = no_llseek,
- .read = cache_read_procfs,
- .write = cache_write_procfs,
- .poll = cache_poll_procfs,
- .unlocked_ioctl = cache_ioctl_procfs, /* for FIONREAD */
- .open = cache_open_procfs,
- .release = cache_release_procfs,
+static const struct proc_ops cache_channel_proc_ops = {
+ .proc_lseek = no_llseek,
+ .proc_read = cache_read_procfs,
+ .proc_write = cache_write_procfs,
+ .proc_poll = cache_poll_procfs,
+ .proc_ioctl = cache_ioctl_procfs, /* for FIONREAD */
+ .proc_open = cache_open_procfs,
+ .proc_release = cache_release_procfs,
};
static int content_open_procfs(struct inode *inode, struct file *filp)
@@ -1596,11 +1622,11 @@
return content_release(inode, filp, cd);
}
-static const struct file_operations content_file_operations_procfs = {
- .open = content_open_procfs,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = content_release_procfs,
+static const struct proc_ops content_proc_ops = {
+ .proc_open = content_open_procfs,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = content_release_procfs,
};
static int open_flush_procfs(struct inode *inode, struct file *filp)
@@ -1634,12 +1660,12 @@
return write_flush(filp, buf, count, ppos, cd);
}
-static const struct file_operations cache_flush_operations_procfs = {
- .open = open_flush_procfs,
- .read = read_flush_procfs,
- .write = write_flush_procfs,
- .release = release_flush_procfs,
- .llseek = no_llseek,
+static const struct proc_ops cache_flush_proc_ops = {
+ .proc_open = open_flush_procfs,
+ .proc_read = read_flush_procfs,
+ .proc_write = write_flush_procfs,
+ .proc_release = release_flush_procfs,
+ .proc_lseek = no_llseek,
};
static void remove_cache_proc_entries(struct cache_detail *cd)
@@ -1662,19 +1688,19 @@
goto out_nomem;
p = proc_create_data("flush", S_IFREG | 0600,
- cd->procfs, &cache_flush_operations_procfs, cd);
+ cd->procfs, &cache_flush_proc_ops, cd);
if (p == NULL)
goto out_nomem;
if (cd->cache_request || cd->cache_parse) {
p = proc_create_data("channel", S_IFREG | 0600, cd->procfs,
- &cache_file_operations_procfs, cd);
+ &cache_channel_proc_ops, cd);
if (p == NULL)
goto out_nomem;
}
if (cd->cache_show) {
p = proc_create_data("content", S_IFREG | 0400, cd->procfs,
- &content_file_operations_procfs, cd);
+ &content_proc_ops, cd);
if (p == NULL)
goto out_nomem;
}
@@ -1886,12 +1912,9 @@
{
spin_lock(&cd->hash_lock);
if (!hlist_unhashed(&h->cache_list)){
- hlist_del_init_rcu(&h->cache_list);
- cd->entries--;
- set_bit(CACHE_CLEANED, &h->flags);
+ sunrpc_begin_cache_remove_entry(h, cd);
spin_unlock(&cd->hash_lock);
- cache_fresh_unlocked(h, cd);
- cache_put(h, cd);
+ sunrpc_end_cache_remove_entry(h, cd);
} else
spin_unlock(&cd->hash_lock);
}
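Both cache.c hunks above (the hash-table purge loop and the sunrpc_cache_unhash() hunk) replace the open-coded unhash/release sequence with a shared pair of helpers. The helper definitions sit earlier in cache.c and are not visible in this excerpt; the sketch below reconstructs them from the exact lines the hunks delete, so the bodies are an inference, not quoted from the patch.

static void sunrpc_begin_cache_remove_entry(struct cache_head *ch,
					    struct cache_detail *cd)
{
	/* Caller holds cd->hash_lock. */
	hlist_del_init_rcu(&ch->cache_list);
	cd->entries--;
	set_bit(CACHE_CLEANED, &ch->flags);
}

static void sunrpc_end_cache_remove_entry(struct cache_head *ch,
					  struct cache_detail *cd)
{
	/* Called after the caller has dropped cd->hash_lock. */
	cache_fresh_unlocked(ch, cd);
	cache_put(ch, cd);
}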
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index b603964..84c8a53 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -47,10 +47,6 @@
# define RPCDBG_FACILITY RPCDBG_CALL
#endif
-#define dprint_status(t) \
- dprintk("RPC: %5u %s (status %d)\n", t->tk_pid, \
- __func__, t->tk_status)
-
/*
* All RPC clients are linked into this list
*/
@@ -370,10 +366,6 @@
const char *nodename = args->nodename;
int err;
- /* sanity check the name before trying to print it */
- dprintk("RPC: creating %s client for %s (xprt %p)\n",
- program->name, args->servername, xprt);
-
err = rpciod_up();
if (err)
goto out_no_rpciod;
@@ -436,6 +428,8 @@
goto out_no_path;
if (parent)
atomic_inc(&parent->cl_count);
+
+ trace_rpc_clnt_new(clnt, xprt, program->name, args->servername);
return clnt;
out_no_path:
@@ -450,6 +444,7 @@
out_no_rpciod:
xprt_switch_put(xps);
xprt_put(xprt);
+ trace_rpc_clnt_new_err(program->name, args->servername, err);
return ERR_PTR(err);
}
@@ -591,6 +586,9 @@
xprt->resvport = 1;
if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
xprt->resvport = 0;
+ xprt->reuseport = 0;
+ if (args->flags & RPC_CLNT_CREATE_REUSEPORT)
+ xprt->reuseport = 1;
clnt = rpc_create_xprt(args, xprt);
if (IS_ERR(clnt) || args->nconnect <= 1)
@@ -631,10 +629,8 @@
args->nodename = clnt->cl_nodename;
new = rpc_new_client(args, xps, xprt, clnt);
- if (IS_ERR(new)) {
- err = PTR_ERR(new);
- goto out_err;
- }
+ if (IS_ERR(new))
+ return new;
/* Turn off autobind on clones */
new->cl_autobind = 0;
@@ -647,7 +643,7 @@
return new;
out_err:
- dprintk("RPC: %s: returned error %d\n", __func__, err);
+ trace_rpc_clnt_clone_err(clnt, err);
return ERR_PTR(err);
}
@@ -720,11 +716,8 @@
int err;
xprt = xprt_create_transport(args);
- if (IS_ERR(xprt)) {
- dprintk("RPC: failed to create new xprt for clnt %p\n",
- clnt);
+ if (IS_ERR(xprt))
return PTR_ERR(xprt);
- }
xps = xprt_switch_alloc(xprt, GFP_KERNEL);
if (xps == NULL) {
@@ -764,7 +757,7 @@
rpc_release_client(parent);
xprt_switch_put(oldxps);
xprt_put(old);
- dprintk("RPC: replaced xprt for clnt %p\n", clnt);
+ trace_rpc_clnt_replace_xprt(clnt);
return 0;
out_revert:
@@ -774,7 +767,7 @@
rpc_client_register(clnt, pseudoflavor, NULL);
xprt_switch_put(xps);
xprt_put(xprt);
- dprintk("RPC: failed to switch xprt for clnt %p\n", clnt);
+ trace_rpc_clnt_replace_xprt_err(clnt);
return err;
}
EXPORT_SYMBOL_GPL(rpc_switch_client_transport);
@@ -841,10 +834,11 @@
if (list_empty(&clnt->cl_tasks))
return;
- dprintk("RPC: killing all tasks for client %p\n", clnt);
+
/*
* Spin lock all_tasks to prevent changes...
*/
+ trace_rpc_clnt_killall(clnt);
spin_lock(&clnt->cl_lock);
list_for_each_entry(rovr, &clnt->cl_tasks, tk_task)
rpc_signal_task(rovr);
@@ -860,9 +854,7 @@
{
might_sleep();
- dprintk_rcu("RPC: shutting down %s client for %s\n",
- clnt->cl_program->name,
- rcu_dereference(clnt->cl_xprt)->servername);
+ trace_rpc_clnt_shutdown(clnt);
while (!list_empty(&clnt->cl_tasks)) {
rpc_killall_tasks(clnt);
@@ -877,27 +869,40 @@
/*
* Free an RPC client
*/
+static void rpc_free_client_work(struct work_struct *work)
+{
+ struct rpc_clnt *clnt = container_of(work, struct rpc_clnt, cl_work);
+
+ trace_rpc_clnt_free(clnt);
+
+ /* These might block on processes that might allocate memory,
+ * so they cannot be called in rpciod, so they are handled separately
+ * here.
+ */
+ rpc_clnt_debugfs_unregister(clnt);
+ rpc_free_clid(clnt);
+ rpc_clnt_remove_pipedir(clnt);
+ xprt_put(rcu_dereference_raw(clnt->cl_xprt));
+
+ kfree(clnt);
+ rpciod_down();
+}
static struct rpc_clnt *
rpc_free_client(struct rpc_clnt *clnt)
{
struct rpc_clnt *parent = NULL;
- dprintk_rcu("RPC: destroying %s client for %s\n",
- clnt->cl_program->name,
- rcu_dereference(clnt->cl_xprt)->servername);
+ trace_rpc_clnt_release(clnt);
if (clnt->cl_parent != clnt)
parent = clnt->cl_parent;
- rpc_clnt_debugfs_unregister(clnt);
- rpc_clnt_remove_pipedir(clnt);
rpc_unregister_client(clnt);
rpc_free_iostats(clnt->cl_metrics);
clnt->cl_metrics = NULL;
- xprt_put(rcu_dereference_raw(clnt->cl_xprt));
xprt_iter_destroy(&clnt->cl_xpi);
- rpciod_down();
put_cred(clnt->cl_cred);
- rpc_free_clid(clnt);
- kfree(clnt);
+
+ INIT_WORK(&clnt->cl_work, rpc_free_client_work);
+ schedule_work(&clnt->cl_work);
return parent;
}
@@ -929,8 +934,6 @@
void
rpc_release_client(struct rpc_clnt *clnt)
{
- dprintk("RPC: rpc_release_client(%p)\n", clnt);
-
do {
if (list_empty(&clnt->cl_tasks))
wake_up(&destroy_wait);
@@ -1096,8 +1099,9 @@
task->tk_msg.rpc_proc = msg->rpc_proc;
task->tk_msg.rpc_argp = msg->rpc_argp;
task->tk_msg.rpc_resp = msg->rpc_resp;
- if (msg->rpc_cred != NULL)
- task->tk_msg.rpc_cred = get_cred(msg->rpc_cred);
+ task->tk_msg.rpc_cred = msg->rpc_cred;
+ if (!(task->tk_flags & RPC_TASK_CRED_NOREF))
+ get_cred(task->tk_msg.rpc_cred);
}
}
@@ -1123,6 +1127,9 @@
task = rpc_new_task(task_setup_data);
+ if (!RPC_IS_ASYNC(task))
+ task->tk_flags |= RPC_TASK_CRED_NOREF;
+
rpc_task_set_client(task, task_setup_data->rpc_client);
rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
@@ -1250,7 +1257,7 @@
hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign - 1;
xdr_inline_pages(&req->rq_rcv_buf, hdrsize << 2, pages, base, len);
- trace_rpc_reply_pages(req);
+ trace_rpc_xdr_reply_pages(req->rq_task, &req->rq_rcv_buf);
}
EXPORT_SYMBOL_GPL(rpc_prepare_reply_pages);
@@ -1604,6 +1611,7 @@
static void
__rpc_call_rpcerror(struct rpc_task *task, int tk_status, int rpc_status)
{
+ trace_rpc_call_rpcerror(task, tk_status, rpc_status);
task->tk_rpc_status = rpc_status;
rpc_exit(task, tk_status);
}
@@ -1627,10 +1635,6 @@
int idx = task->tk_msg.rpc_proc->p_statidx;
trace_rpc_request(task);
- dprintk("RPC: %5u call_start %s%d proc %s (%s)\n", task->tk_pid,
- clnt->cl_program->name, clnt->cl_vers,
- rpc_proc_name(task),
- (RPC_IS_ASYNC(task) ? "async" : "sync"));
/* Increment call count (version might not be valid for ping) */
if (clnt->cl_program->version[clnt->cl_vers])
@@ -1646,8 +1650,6 @@
static void
call_reserve(struct rpc_task *task)
{
- dprint_status(task);
-
task->tk_status = 0;
task->tk_action = call_reserveresult;
xprt_reserve(task);
@@ -1663,8 +1665,6 @@
{
int status = task->tk_status;
- dprint_status(task);
-
/*
* After a call to xprt_reserve(), we must have either
* a request slot or else an error status.
@@ -1676,37 +1676,20 @@
return;
}
- printk(KERN_ERR "%s: status=%d, but no request slot, exiting\n",
- __func__, status);
rpc_call_rpcerror(task, -EIO);
return;
}
- /*
- * Even though there was an error, we may have acquired
- * a request slot somehow. Make sure not to leak it.
- */
- if (task->tk_rqstp) {
- printk(KERN_ERR "%s: status=%d, request allocated anyway\n",
- __func__, status);
- xprt_release(task);
- }
-
switch (status) {
case -ENOMEM:
rpc_delay(task, HZ >> 2);
- /* fall through */
+ fallthrough;
case -EAGAIN: /* woken up; retry */
task->tk_action = call_retry_reserve;
return;
- case -EIO: /* probably a shutdown */
- break;
default:
- printk(KERN_ERR "%s: unrecognized error %d, exiting\n",
- __func__, status);
- break;
+ rpc_call_rpcerror(task, status);
}
- rpc_call_rpcerror(task, status);
}
/*
@@ -1715,8 +1698,6 @@
static void
call_retry_reserve(struct rpc_task *task)
{
- dprint_status(task);
-
task->tk_status = 0;
task->tk_action = call_reserveresult;
xprt_retry_reserve(task);
@@ -1728,8 +1709,6 @@
static void
call_refresh(struct rpc_task *task)
{
- dprint_status(task);
-
task->tk_action = call_refreshresult;
task->tk_status = 0;
task->tk_client->cl_stats->rpcauthrefresh++;
@@ -1744,8 +1723,6 @@
{
int status = task->tk_status;
- dprint_status(task);
-
task->tk_status = 0;
task->tk_action = call_refresh;
switch (status) {
@@ -1757,23 +1734,21 @@
/* Use rate-limiting and a max number of retries if refresh
* had status 0 but failed to update the cred.
*/
- /* fall through */
+ fallthrough;
case -ETIMEDOUT:
rpc_delay(task, 3*HZ);
- /* fall through */
+ fallthrough;
case -EAGAIN:
status = -EACCES;
- /* fall through */
+ fallthrough;
case -EKEYEXPIRED:
if (!task->tk_cred_retry)
break;
task->tk_cred_retry--;
- dprintk("RPC: %5u %s: retry refresh creds\n",
- task->tk_pid, __func__);
+ trace_rpc_retry_refresh_status(task);
return;
}
- dprintk("RPC: %5u %s: refresh creds failed with error %d\n",
- task->tk_pid, __func__, status);
+ trace_rpc_refresh_status(task);
rpc_call_rpcerror(task, status);
}
@@ -1790,8 +1765,6 @@
const struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
int status;
- dprint_status(task);
-
task->tk_status = 0;
task->tk_action = call_encode;
@@ -1821,7 +1794,7 @@
req->rq_rcvsize <<= 2;
status = xprt->ops->buf_alloc(task);
- xprt_inject_disconnect(xprt);
+ trace_rpc_buf_alloc(task, status);
if (status == 0)
return;
if (status != -ENOMEM) {
@@ -1829,8 +1802,6 @@
return;
}
- dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
-
if (RPC_IS_ASYNC(task) || !fatal_signal_pending(current)) {
task->tk_action = call_allocate;
rpc_delay(task, HZ>>4);
@@ -1881,7 +1852,7 @@
{
if (!rpc_task_need_encode(task))
goto out;
- dprint_status(task);
+
/* Dequeue task from the receive queue while we're encoding */
xprt_request_dequeue_xprt(task);
/* Encode here so that rpcsec_gss can use correct sequence number. */
@@ -1900,8 +1871,7 @@
} else {
task->tk_action = call_refresh;
task->tk_cred_retry--;
- dprintk("RPC: %5u %s: retry refresh creds\n",
- task->tk_pid, __func__);
+ trace_rpc_retry_refresh_status(task);
}
break;
default:
@@ -1958,8 +1928,6 @@
return;
}
- dprint_status(task);
-
task->tk_action = call_bind_status;
if (!xprt_prepare_transmit(task))
return;
@@ -1981,8 +1949,6 @@
return;
}
- dprint_status(task);
- trace_rpc_bind_status(task);
if (task->tk_status >= 0)
goto out_next;
if (xprt_bound(xprt)) {
@@ -1992,12 +1958,10 @@
switch (task->tk_status) {
case -ENOMEM:
- dprintk("RPC: %5u rpcbind out of memory\n", task->tk_pid);
rpc_delay(task, HZ >> 2);
goto retry_timeout;
case -EACCES:
- dprintk("RPC: %5u remote rpcbind: RPC program/version "
- "unavailable\n", task->tk_pid);
+ trace_rpcb_prog_unavail_err(task);
/* fail immediately if this is an RPC ping */
if (task->tk_msg.rpc_proc->p_proc == 0) {
status = -EOPNOTSUPP;
@@ -2014,17 +1978,14 @@
case -EAGAIN:
goto retry_timeout;
case -ETIMEDOUT:
- dprintk("RPC: %5u rpcbind request timed out\n",
- task->tk_pid);
+ trace_rpcb_timeout_err(task);
goto retry_timeout;
case -EPFNOSUPPORT:
/* server doesn't support any rpcbind version we know of */
- dprintk("RPC: %5u unrecognized remote rpcbind service\n",
- task->tk_pid);
+ trace_rpcb_bind_version_err(task);
break;
case -EPROTONOSUPPORT:
- dprintk("RPC: %5u remote rpcbind version unavailable, retrying\n",
- task->tk_pid);
+ trace_rpcb_bind_version_err(task);
goto retry_timeout;
case -ECONNREFUSED: /* connection problems */
case -ECONNRESET:
@@ -2035,8 +1996,7 @@
case -EHOSTUNREACH:
case -ENETUNREACH:
case -EPIPE:
- dprintk("RPC: %5u remote rpcbind unreachable: %d\n",
- task->tk_pid, task->tk_status);
+ trace_rpcb_unreachable_err(task);
if (!RPC_IS_SOFTCONN(task)) {
rpc_delay(task, 5*HZ);
goto retry_timeout;
@@ -2044,8 +2004,7 @@
status = task->tk_status;
break;
default:
- dprintk("RPC: %5u unrecognized rpcbind error (%d)\n",
- task->tk_pid, -task->tk_status);
+ trace_rpcb_unrecognized_err(task);
}
rpc_call_rpcerror(task, status);
@@ -2077,10 +2036,6 @@
return;
}
- dprintk("RPC: %5u call_connect xprt %p %s connected\n",
- task->tk_pid, xprt,
- (xprt_connected(xprt) ? "is" : "is not"));
-
task->tk_action = call_connect_status;
if (task->tk_status < 0)
return;
@@ -2108,7 +2063,6 @@
return;
}
- dprint_status(task);
trace_rpc_connect_status(task);
if (task->tk_status == 0) {
@@ -2130,20 +2084,21 @@
rpc_force_rebind(clnt);
goto out_retry;
}
- /* fall through */
+ fallthrough;
case -ECONNRESET:
case -ECONNABORTED:
case -ENETDOWN:
case -ENETUNREACH:
case -EHOSTUNREACH:
case -EPIPE:
+ case -EPROTO:
xprt_conditional_disconnect(task->tk_rqstp->rq_xprt,
task->tk_rqstp->rq_connect_cookie);
if (RPC_IS_SOFTCONN(task))
break;
/* retry with existing socket, after a delay */
rpc_delay(task, 3*HZ);
- /* fall through */
+ fallthrough;
case -EADDRINUSE:
case -ENOTCONN:
case -EAGAIN:
@@ -2175,8 +2130,6 @@
return;
}
- dprint_status(task);
-
task->tk_action = call_transmit_status;
if (!xprt_prepare_transmit(task))
return;
@@ -2211,7 +2164,6 @@
switch (task->tk_status) {
default:
- dprint_status(task);
break;
case -EBADMSG:
task->tk_status = 0;
@@ -2225,7 +2177,7 @@
*/
case -ENOBUFS:
rpc_delay(task, HZ>>2);
- /* fall through */
+ fallthrough;
case -EBADSLT:
case -EAGAIN:
task->tk_action = call_transmit;
@@ -2244,7 +2196,7 @@
rpc_call_rpcerror(task, task->tk_status);
return;
}
- /* fall through */
+ fallthrough;
case -ECONNRESET:
case -ECONNABORTED:
case -EADDRINUSE:
@@ -2293,8 +2245,6 @@
if (rpc_task_transmitted(task))
task->tk_status = 0;
- dprint_status(task);
-
switch (task->tk_status) {
case 0:
/* Success */
@@ -2310,7 +2260,7 @@
break;
case -ENOBUFS:
rpc_delay(task, HZ>>2);
- /* fall through */
+ fallthrough;
case -EBADSLT:
case -EAGAIN:
task->tk_status = 0;
@@ -2354,8 +2304,6 @@
if (!task->tk_msg.rpc_proc->p_proc)
trace_xprt_ping(task->tk_xprt, task->tk_status);
- dprint_status(task);
-
status = task->tk_status;
if (status >= 0) {
task->tk_action = call_decode;
@@ -2377,7 +2325,7 @@
* were a timeout.
*/
rpc_delay(task, 3*HZ);
- /* fall through */
+ fallthrough;
case -ETIMEDOUT:
break;
case -ECONNREFUSED:
@@ -2388,7 +2336,7 @@
break;
case -EADDRINUSE:
rpc_delay(task, 3*HZ);
- /* fall through */
+ fallthrough;
case -EPIPE:
case -EAGAIN:
break;
@@ -2402,7 +2350,8 @@
goto out_exit;
}
task->tk_action = call_encode;
- rpc_check_timeout(task);
+ if (status != -ECONNRESET && status != -ECONNABORTED)
+ rpc_check_timeout(task);
return;
out_exit:
rpc_call_rpcerror(task, status);
@@ -2430,7 +2379,7 @@
if (xprt_adjust_timeout(task->tk_rqstp) == 0)
return;
- dprintk("RPC: %5u call_timeout (major)\n", task->tk_pid);
+ trace_rpc_timeout_status(task);
task->tk_timeouts++;
if (RPC_IS_SOFTCONN(task) && !rpc_check_connected(task->tk_rqstp)) {
@@ -2489,8 +2438,6 @@
struct xdr_stream xdr;
int err;
- dprint_status(task);
-
if (!task->tk_msg.rpc_proc->p_decode) {
task->tk_action = rpc_exit_task;
return;
@@ -2519,6 +2466,7 @@
smp_rmb();
req->rq_rcv_buf.len = req->rq_private_buf.len;
+ trace_rpc_xdr_recvfrom(task, &req->rq_rcv_buf);
/* Check that the softirq receive buffer is valid */
WARN_ON(memcmp(&req->rq_rcv_buf, &req->rq_private_buf,
@@ -2532,8 +2480,6 @@
case 0:
task->tk_action = rpc_exit_task;
task->tk_status = rpcauth_unwrap_resp(task, &xdr);
- dprintk("RPC: %5u %s result %d\n",
- task->tk_pid, __func__, task->tk_status);
return;
case -EAGAIN:
task->tk_status = 0;
@@ -2747,7 +2693,8 @@
.rpc_op_cred = cred,
.callback_ops = (ops != NULL) ? ops : &rpc_default_ops,
.callback_data = data,
- .flags = flags | RPC_TASK_NULLCREDS,
+ .flags = flags | RPC_TASK_SOFT | RPC_TASK_SOFTCONN |
+ RPC_TASK_NULLCREDS,
};
return rpc_run_task(&task_setup_data);
@@ -2810,11 +2757,9 @@
goto success;
}
- task = rpc_call_null_helper(clnt, xprt, NULL,
- RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC|RPC_TASK_NULLCREDS,
+ task = rpc_call_null_helper(clnt, xprt, NULL, RPC_TASK_ASYNC,
&rpc_cb_add_xprt_call_ops, data);
- if (IS_ERR(task))
- return PTR_ERR(task);
+
rpc_put_task(task);
success:
return 1;
@@ -2855,9 +2800,7 @@
goto out_err;
/* Test the connection */
- task = rpc_call_null_helper(clnt, xprt, NULL,
- RPC_TASK_SOFT | RPC_TASK_SOFTCONN | RPC_TASK_NULLCREDS,
- NULL, NULL);
+ task = rpc_call_null_helper(clnt, xprt, NULL, 0, NULL, NULL);
if (IS_ERR(task)) {
status = PTR_ERR(task);
goto out_err;
@@ -2910,7 +2853,7 @@
struct rpc_xprt *xprt;
unsigned long connect_timeout;
unsigned long reconnect_timeout;
- unsigned char resvport;
+ unsigned char resvport, reuseport;
int ret = 0;
rcu_read_lock();
@@ -2922,6 +2865,7 @@
return -EAGAIN;
}
resvport = xprt->resvport;
+ reuseport = xprt->reuseport;
connect_timeout = xprt->connect_timeout;
reconnect_timeout = xprt->max_reconnect_timeout;
rcu_read_unlock();
@@ -2932,6 +2876,7 @@
goto out_put_switch;
}
xprt->resvport = resvport;
+ xprt->reuseport = reuseport;
if (xprt->ops->set_connect_timeout != NULL)
xprt->ops->set_connect_timeout(xprt,
connect_timeout,
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 3779267..5f854ff 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -51,7 +51,7 @@
int rpc_pipefs_notifier_register(struct notifier_block *nb)
{
- return blocking_notifier_chain_cond_register(&rpc_pipefs_notifier_list, nb);
+ return blocking_notifier_chain_register(&rpc_pipefs_notifier_list, nb);
}
EXPORT_SYMBOL_GPL(rpc_pipefs_notifier_register);
@@ -599,9 +599,9 @@
dget(dentry);
ret = simple_rmdir(dir, dentry);
+ d_drop(dentry);
if (!ret)
fsnotify_rmdir(dir, dentry);
- d_delete(dentry);
dput(dentry);
return ret;
}
@@ -612,9 +612,9 @@
dget(dentry);
ret = simple_unlink(dir, dentry);
+ d_drop(dentry);
if (!ret)
fsnotify_unlink(dir, dentry);
- d_delete(dentry);
dput(dentry);
return ret;
}
@@ -1510,6 +1510,6 @@
void unregister_rpc_pipefs(void)
{
rpc_clients_notifier_unregister();
- kmem_cache_destroy(rpc_inode_cachep);
unregister_filesystem(&rpc_pipe_fs_type);
+ kmem_cache_destroy(rpc_inode_cachep);
}
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 1db9f62..38fe2ce 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -31,11 +31,9 @@
#include <linux/sunrpc/sched.h>
#include <linux/sunrpc/xprtsock.h>
-#include "netns.h"
+#include <trace/events/sunrpc.h>
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY RPCDBG_BIND
-#endif
+#include "netns.h"
#define RPCBIND_SOCK_PATHNAME "/var/run/rpcbind.sock"
@@ -216,10 +214,6 @@
sn->rpcb_is_af_local = is_af_local ? 1 : 0;
smp_wmb();
sn->rpcb_users = 1;
- dprintk("RPC: created new rpcb local clients (rpcb_local_clnt: "
- "%p, rpcb_local_clnt4: %p) for net %x%s\n",
- sn->rpcb_local_clnt, sn->rpcb_local_clnt4,
- net->ns.inum, (net == &init_net) ? " (init_net)" : "");
}
/*
@@ -261,19 +255,13 @@
*/
clnt = rpc_create(&args);
if (IS_ERR(clnt)) {
- dprintk("RPC: failed to create AF_LOCAL rpcbind "
- "client (errno %ld).\n", PTR_ERR(clnt));
result = PTR_ERR(clnt);
goto out;
}
clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4);
- if (IS_ERR(clnt4)) {
- dprintk("RPC: failed to bind second program to "
- "rpcbind v4 client (errno %ld).\n",
- PTR_ERR(clnt4));
+ if (IS_ERR(clnt4))
clnt4 = NULL;
- }
rpcb_set_local(net, clnt, clnt4, true);
@@ -309,8 +297,6 @@
clnt = rpc_create(&args);
if (IS_ERR(clnt)) {
- dprintk("RPC: failed to create local rpcbind "
- "client (errno %ld).\n", PTR_ERR(clnt));
result = PTR_ERR(clnt);
goto out;
}
@@ -321,12 +307,8 @@
* v4 upcalls.
*/
clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4);
- if (IS_ERR(clnt4)) {
- dprintk("RPC: failed to bind second program to "
- "rpcbind v4 client (errno %ld).\n",
- PTR_ERR(clnt4));
+ if (IS_ERR(clnt4))
clnt4 = NULL;
- }
rpcb_set_local(net, clnt, clnt4, false);
@@ -403,11 +385,8 @@
msg->rpc_resp = &result;
error = rpc_call_sync(clnt, msg, flags);
- if (error < 0) {
- dprintk("RPC: failed to contact local rpcbind "
- "server (errno %d).\n", -error);
+ if (error < 0)
return error;
- }
if (!result)
return -EACCES;
@@ -461,9 +440,7 @@
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
bool is_set = false;
- dprintk("RPC: %sregistering (%u, %u, %d, %u) with local "
- "rpcbind\n", (port ? "" : "un"),
- prog, vers, prot, port);
+ trace_pmap_register(prog, vers, prot, port);
msg.rpc_proc = &rpcb_procedures2[RPCBPROC_UNSET];
if (port != 0) {
@@ -489,11 +466,6 @@
map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL);
- dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with "
- "local rpcbind\n", (port ? "" : "un"),
- map->r_prog, map->r_vers,
- map->r_addr, map->r_netid);
-
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
if (port != 0) {
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET];
@@ -520,11 +492,6 @@
map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL);
- dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with "
- "local rpcbind\n", (port ? "" : "un"),
- map->r_prog, map->r_vers,
- map->r_addr, map->r_netid);
-
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
if (port != 0) {
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET];
@@ -541,9 +508,7 @@
{
struct rpcbind_args *map = msg->rpc_argp;
- dprintk("RPC: unregistering [%u, %u, '%s'] with "
- "local rpcbind\n",
- map->r_prog, map->r_vers, map->r_netid);
+ trace_rpcb_unregister(map->r_prog, map->r_vers, map->r_netid);
map->r_addr = "";
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
@@ -615,6 +580,8 @@
if (address == NULL)
return rpcb_unregister_all_protofamilies(sn, &msg);
+ trace_rpcb_register(map.r_prog, map.r_vers, map.r_addr, map.r_netid);
+
switch (address->sa_family) {
case AF_INET:
return rpcb_register_inet4(sn, address, &msg);
@@ -693,18 +660,12 @@
rcu_read_unlock();
xprt = xprt_get(task->tk_xprt);
- dprintk("RPC: %5u %s(%s, %u, %u, %d)\n",
- task->tk_pid, __func__,
- xprt->servername, clnt->cl_prog, clnt->cl_vers, xprt->prot);
-
/* Put self on the wait queue to ensure we get notified if
* some other task is already attempting to bind the port */
rpc_sleep_on_timeout(&xprt->binding, task,
NULL, jiffies + xprt->bind_timeout);
if (xprt_test_and_set_binding(xprt)) {
- dprintk("RPC: %5u %s: waiting for another binder\n",
- task->tk_pid, __func__);
xprt_put(xprt);
return;
}
@@ -712,8 +673,6 @@
/* Someone else may have bound if we slept */
if (xprt_bound(xprt)) {
status = 0;
- dprintk("RPC: %5u %s: already bound\n",
- task->tk_pid, __func__);
goto bailout_nofree;
}
@@ -732,20 +691,15 @@
break;
default:
status = -EAFNOSUPPORT;
- dprintk("RPC: %5u %s: bad address family\n",
- task->tk_pid, __func__);
goto bailout_nofree;
}
if (proc == NULL) {
xprt->bind_index = 0;
status = -EPFNOSUPPORT;
- dprintk("RPC: %5u %s: no more getport versions available\n",
- task->tk_pid, __func__);
goto bailout_nofree;
}
- dprintk("RPC: %5u %s: trying rpcbind version %u\n",
- task->tk_pid, __func__, bind_version);
+ trace_rpcb_getport(clnt, task, bind_version);
rpcb_clnt = rpcb_create(xprt->xprt_net,
clnt->cl_nodename,
@@ -754,16 +708,12 @@
clnt->cl_cred);
if (IS_ERR(rpcb_clnt)) {
status = PTR_ERR(rpcb_clnt);
- dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n",
- task->tk_pid, __func__, PTR_ERR(rpcb_clnt));
goto bailout_nofree;
}
map = kzalloc(sizeof(struct rpcbind_args), GFP_NOFS);
if (!map) {
status = -ENOMEM;
- dprintk("RPC: %5u %s: no memory available\n",
- task->tk_pid, __func__);
goto bailout_release_client;
}
map->r_prog = clnt->cl_prog;
@@ -780,8 +730,6 @@
map->r_addr = rpc_sockaddr2uaddr(sap, GFP_NOFS);
if (!map->r_addr) {
status = -ENOMEM;
- dprintk("RPC: %5u %s: no memory available\n",
- task->tk_pid, __func__);
goto bailout_free_args;
}
map->r_owner = "";
@@ -795,12 +743,6 @@
child = rpcb_call_async(rpcb_clnt, map, proc);
rpc_release_client(rpcb_clnt);
- if (IS_ERR(child)) {
- /* rpcb_map_release() has freed the arguments */
- dprintk("RPC: %5u %s: rpc_run_task failed\n",
- task->tk_pid, __func__);
- return;
- }
xprt->stat.bind_count++;
rpc_put_task(child);
@@ -824,34 +766,33 @@
{
struct rpcbind_args *map = data;
struct rpc_xprt *xprt = map->r_xprt;
- int status = child->tk_status;
+
+ map->r_status = child->tk_status;
/* Garbage reply: retry with a lesser rpcbind version */
- if (status == -EIO)
- status = -EPROTONOSUPPORT;
+ if (map->r_status == -EIO)
+ map->r_status = -EPROTONOSUPPORT;
/* rpcbind server doesn't support this rpcbind protocol version */
- if (status == -EPROTONOSUPPORT)
+ if (map->r_status == -EPROTONOSUPPORT)
xprt->bind_index++;
- if (status < 0) {
+ if (map->r_status < 0) {
/* rpcbind server not available on remote host? */
- xprt->ops->set_port(xprt, 0);
+ map->r_port = 0;
+
} else if (map->r_port == 0) {
/* Requested RPC service wasn't registered on remote host */
- xprt->ops->set_port(xprt, 0);
- status = -EACCES;
+ map->r_status = -EACCES;
} else {
/* Succeeded */
- xprt->ops->set_port(xprt, map->r_port);
- xprt_set_bound(xprt);
- status = 0;
+ map->r_status = 0;
}
- dprintk("RPC: %5u rpcb_getport_done(status %d, port %u)\n",
- child->tk_pid, status, map->r_port);
-
- map->r_status = status;
+ trace_rpcb_setport(child, map->r_status, map->r_port);
+ xprt->ops->set_port(xprt, map->r_port);
+ if (map->r_port)
+ xprt_set_bound(xprt);
}
/*
@@ -864,11 +805,6 @@
const struct rpcbind_args *rpcb = data;
__be32 *p;
- dprintk("RPC: %5u encoding PMAP_%s call (%u, %u, %d, %u)\n",
- req->rq_task->tk_pid,
- req->rq_task->tk_msg.rpc_proc->p_name,
- rpcb->r_prog, rpcb->r_vers, rpcb->r_prot, rpcb->r_port);
-
p = xdr_reserve_space(xdr, RPCB_mappingargs_sz << 2);
*p++ = cpu_to_be32(rpcb->r_prog);
*p++ = cpu_to_be32(rpcb->r_vers);
@@ -890,8 +826,6 @@
return -EIO;
port = be32_to_cpup(p);
- dprintk("RPC: %5u PMAP_%s result: %lu\n", req->rq_task->tk_pid,
- req->rq_task->tk_msg.rpc_proc->p_name, port);
if (unlikely(port > USHRT_MAX))
return -EIO;
@@ -912,11 +846,6 @@
*boolp = 0;
if (*p != xdr_zero)
*boolp = 1;
-
- dprintk("RPC: %5u RPCB_%s call %s\n",
- req->rq_task->tk_pid,
- req->rq_task->tk_msg.rpc_proc->p_name,
- (*boolp ? "succeeded" : "failed"));
return 0;
}
@@ -941,12 +870,6 @@
const struct rpcbind_args *rpcb = data;
__be32 *p;
- dprintk("RPC: %5u encoding RPCB_%s call (%u, %u, '%s', '%s')\n",
- req->rq_task->tk_pid,
- req->rq_task->tk_msg.rpc_proc->p_name,
- rpcb->r_prog, rpcb->r_vers,
- rpcb->r_netid, rpcb->r_addr);
-
p = xdr_reserve_space(xdr, (RPCB_program_sz + RPCB_version_sz) << 2);
*p++ = cpu_to_be32(rpcb->r_prog);
*p = cpu_to_be32(rpcb->r_vers);
@@ -976,11 +899,8 @@
* If the returned universal address is a null string,
* the requested RPC service was not registered.
*/
- if (len == 0) {
- dprintk("RPC: %5u RPCB reply: program not registered\n",
- req->rq_task->tk_pid);
+ if (len == 0)
return 0;
- }
if (unlikely(len > RPCBIND_MAXUADDRLEN))
goto out_fail;
@@ -988,8 +908,6 @@
p = xdr_inline_decode(xdr, len);
if (unlikely(p == NULL))
goto out_fail;
- dprintk("RPC: %5u RPCB_%s reply: %*pE\n", req->rq_task->tk_pid,
- req->rq_task->tk_msg.rpc_proc->p_name, len, (char *)p);
if (rpc_uaddr2sockaddr(req->rq_xprt->xprt_net, (char *)p, len,
sap, sizeof(address)) == 0)
@@ -999,9 +917,6 @@
return 0;
out_fail:
- dprintk("RPC: %5u malformed RPCB_%s reply\n",
- req->rq_task->tk_pid,
- req->rq_task->tk_msg.rpc_proc->p_name);
return -EIO;
}
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 8fc4a6b..c045f63 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -27,10 +27,6 @@
#include "sunrpc.h"
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-#define RPCDBG_FACILITY RPCDBG_SCHED
-#endif
-
#define CREATE_TRACE_POINTS
#include <trace/events/sunrpc.h>
@@ -85,7 +81,6 @@
{
if (list_empty(&task->u.tk_wait.timer_list))
return;
- dprintk("RPC: %5u disabling timer\n", task->tk_pid);
task->tk_timeout = 0;
list_del(&task->u.tk_wait.timer_list);
if (list_empty(&queue->timer_list.list))
@@ -111,9 +106,6 @@
__rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task,
unsigned long timeout)
{
- dprintk("RPC: %5u setting alarm for %u ms\n",
- task->tk_pid, jiffies_to_msecs(timeout - jiffies));
-
task->tk_timeout = timeout;
if (list_empty(&queue->timer_list.list) || time_before(timeout, queue->timer_list.expires))
rpc_set_queue_timer(queue, timeout);
@@ -216,9 +208,6 @@
/* barrier matches the read in rpc_wake_up_task_queue_locked() */
smp_wmb();
rpc_set_queued(task);
-
- dprintk("RPC: %5u added to queue %p \"%s\"\n",
- task->tk_pid, queue, rpc_qname(queue));
}
/*
@@ -241,8 +230,6 @@
else
list_del(&task->u.tk_wait.list);
queue->qlen--;
- dprintk("RPC: %5u removed from queue %p \"%s\"\n",
- task->tk_pid, queue, rpc_qname(queue));
}
static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname, unsigned char nr_queues)
@@ -382,13 +369,9 @@
struct rpc_task *task,
unsigned char queue_priority)
{
- dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n",
- task->tk_pid, rpc_qname(q), jiffies);
-
trace_rpc_task_sleep(task, q);
__rpc_add_wait_queue(q, task, queue_priority);
-
}
static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
@@ -510,9 +493,6 @@
struct rpc_wait_queue *queue,
struct rpc_task *task)
{
- dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n",
- task->tk_pid, jiffies);
-
/* Has the task been executed yet? If not, we cannot wake it up! */
if (!RPC_IS_ACTIVATED(task)) {
printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task);
@@ -524,8 +504,6 @@
__rpc_remove_wait_queue(queue, task);
rpc_make_runnable(wq, task);
-
- dprintk("RPC: __rpc_wake_up_task done\n");
}
/*
@@ -673,8 +651,6 @@
{
struct rpc_task *task = NULL;
- dprintk("RPC: wake_up_first(%p \"%s\")\n",
- queue, rpc_qname(queue));
spin_lock(&queue->lock);
task = __rpc_find_next_queued(queue);
if (task != NULL)
@@ -785,7 +761,7 @@
list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) {
timeo = task->tk_timeout;
if (time_after_eq(now, timeo)) {
- dprintk("RPC: %5u timeout\n", task->tk_pid);
+ trace_rpc_task_timeout(task, task->tk_action);
task->tk_status = -ETIMEDOUT;
rpc_wake_up_task_queue_locked(queue, task);
continue;
@@ -868,6 +844,8 @@
if (!RPC_IS_ACTIVATED(task))
return;
+
+ trace_rpc_task_signalled(task, task->tk_action);
set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate);
smp_mb__after_atomic();
queue = READ_ONCE(task->tk_waitqueue);
@@ -898,9 +876,6 @@
int task_is_async = RPC_IS_ASYNC(task);
int status = 0;
- dprintk("RPC: %5u __rpc_execute flags=0x%x\n",
- task->tk_pid, task->tk_flags);
-
WARN_ON_ONCE(RPC_IS_QUEUED(task));
if (RPC_IS_QUEUED(task))
return;
@@ -960,7 +935,7 @@
return;
/* sync task: sleep here */
- dprintk("RPC: %5u sync task going to sleep\n", task->tk_pid);
+ trace_rpc_task_sync_sleep(task, task->tk_action);
status = out_of_line_wait_on_bit(&task->tk_runstate,
RPC_TASK_QUEUED, rpc_wait_bit_killable,
TASK_KILLABLE);
@@ -971,16 +946,14 @@
* clean up after sleeping on some queue, we don't
* break the loop here, but go around once more.
*/
- dprintk("RPC: %5u got signal\n", task->tk_pid);
+ trace_rpc_task_signalled(task, task->tk_action);
set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate);
task->tk_rpc_status = -ERESTARTSYS;
rpc_exit(task, -ERESTARTSYS);
}
- dprintk("RPC: %5u sync task resuming\n", task->tk_pid);
+ trace_rpc_task_sync_wake(task, task->tk_action);
}
- dprintk("RPC: %5u return %d, status %d\n", task->tk_pid, status,
- task->tk_status);
/* Release all resources associated with the task */
rpc_release_task(task);
}
@@ -1052,8 +1025,6 @@
return -ENOMEM;
buf->len = size;
- dprintk("RPC: %5u allocated buffer of size %zu at %p\n",
- task->tk_pid, size, buf);
rqst->rq_buffer = buf->data;
rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize;
return 0;
@@ -1074,9 +1045,6 @@
buf = container_of(buffer, struct rpc_buffer, data);
size = buf->len;
- dprintk("RPC: freeing buffer of size %zu at %p\n",
- size, buf);
-
if (size <= RPC_BUFFER_MAXSIZE)
mempool_free(buf, rpc_buffer_mempool);
else
@@ -1111,9 +1079,6 @@
task->tk_action = rpc_prepare_task;
rpc_init_task_statistics(task);
-
- dprintk("RPC: new task initialized, procpid %u\n",
- task_pid_nr(current));
}
static struct rpc_task *
@@ -1137,7 +1102,6 @@
rpc_init_task(task, setup_data);
task->tk_flags |= flags;
- dprintk("RPC: allocated task %p\n", task);
return task;
}
@@ -1167,10 +1131,8 @@
put_rpccred(task->tk_op_cred);
rpc_release_calldata(task->tk_ops, task->tk_calldata);
- if (tk_flags & RPC_TASK_DYNAMIC) {
- dprintk("RPC: %5u freeing task\n", task->tk_pid);
+ if (tk_flags & RPC_TASK_DYNAMIC)
mempool_free(task, rpc_task_mempool);
- }
}
static void rpc_async_release(struct work_struct *work)
@@ -1185,7 +1147,8 @@
{
xprt_release(task);
if (task->tk_msg.rpc_cred) {
- put_cred(task->tk_msg.rpc_cred);
+ if (!(task->tk_flags & RPC_TASK_CRED_NOREF))
+ put_cred(task->tk_msg.rpc_cred);
task->tk_msg.rpc_cred = NULL;
}
rpc_task_release_client(task);
@@ -1223,8 +1186,6 @@
static void rpc_release_task(struct rpc_task *task)
{
- dprintk("RPC: %5u release task\n", task->tk_pid);
-
WARN_ON_ONCE(RPC_IS_QUEUED(task));
rpc_release_resources_task(task);
@@ -1265,7 +1226,6 @@
/*
* Create the rpciod thread and wait for it to start.
*/
- dprintk("RPC: creating workqueue rpciod\n");
wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
if (!wq)
goto out_failed;
@@ -1290,7 +1250,6 @@
if (rpciod_workqueue == NULL)
return;
- dprintk("RPC: destroying workqueue rpciod\n");
wq = rpciod_workqueue;
rpciod_workqueue = NULL;
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index 1a864f1..d52313a 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -14,9 +14,24 @@
#include <linux/types.h>
#include <linux/pagemap.h>
#include <linux/udp.h>
+#include <linux/sunrpc/msg_prot.h>
#include <linux/sunrpc/xdr.h>
#include <linux/export.h>
+#include "socklib.h"
+
+/*
+ * Helper structure for copying from an sk_buff.
+ */
+struct xdr_skb_reader {
+ struct sk_buff *skb;
+ unsigned int offset;
+ size_t count;
+ __wsum csum;
+};
+
+typedef size_t (*xdr_skb_read_actor)(struct xdr_skb_reader *desc, void *to,
+ size_t len);
/**
* xdr_skb_read_bits - copy some data bits from skb to internal buffer
@@ -55,7 +70,7 @@
if (len > desc->count)
len = desc->count;
pos = desc->offset;
- csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len, 0);
+ csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len);
desc->csum = csum_block_add(desc->csum, csum2, pos);
desc->count -= len;
desc->offset += len;
@@ -186,3 +201,129 @@
return 0;
}
EXPORT_SYMBOL_GPL(csum_partial_copy_to_xdr);
+
+static inline int xprt_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t seek)
+{
+ if (seek)
+ iov_iter_advance(&msg->msg_iter, seek);
+ return sock_sendmsg(sock, msg);
+}
+
+static int xprt_send_kvec(struct socket *sock, struct msghdr *msg,
+ struct kvec *vec, size_t seek)
+{
+ iov_iter_kvec(&msg->msg_iter, WRITE, vec, 1, vec->iov_len);
+ return xprt_sendmsg(sock, msg, seek);
+}
+
+static int xprt_send_pagedata(struct socket *sock, struct msghdr *msg,
+ struct xdr_buf *xdr, size_t base)
+{
+ int err;
+
+ err = xdr_alloc_bvec(xdr, GFP_KERNEL);
+ if (err < 0)
+ return err;
+
+ iov_iter_bvec(&msg->msg_iter, WRITE, xdr->bvec, xdr_buf_pagecount(xdr),
+ xdr->page_len + xdr->page_base);
+ return xprt_sendmsg(sock, msg, base + xdr->page_base);
+}
+
+/* Common case:
+ * - stream transport
+ * - sending from byte 0 of the message
+ * - the message is wholly contained in @xdr's head iovec
+ */
+static int xprt_send_rm_and_kvec(struct socket *sock, struct msghdr *msg,
+ rpc_fraghdr marker, struct kvec *vec,
+ size_t base)
+{
+ struct kvec iov[2] = {
+ [0] = {
+ .iov_base = &marker,
+ .iov_len = sizeof(marker)
+ },
+ [1] = *vec,
+ };
+ size_t len = iov[0].iov_len + iov[1].iov_len;
+
+ iov_iter_kvec(&msg->msg_iter, WRITE, iov, 2, len);
+ return xprt_sendmsg(sock, msg, base);
+}
+
+/**
+ * xprt_sock_sendmsg - write an xdr_buf directly to a socket
+ * @sock: open socket to send on
+ * @msg: socket message metadata
+ * @xdr: xdr_buf containing this request
+ * @base: starting position in the buffer
+ * @marker: stream record marker field
+ * @sent_p: return the total number of bytes successfully queued for sending
+ *
+ * Return values:
+ * On success, returns zero and fills in @sent_p.
+ * %-ENOTSOCK if @sock is not a struct socket.
+ */
+int xprt_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ struct xdr_buf *xdr, unsigned int base,
+ rpc_fraghdr marker, unsigned int *sent_p)
+{
+ unsigned int rmsize = marker ? sizeof(marker) : 0;
+ unsigned int remainder = rmsize + xdr->len - base;
+ unsigned int want;
+ int err = 0;
+
+ *sent_p = 0;
+
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ msg->msg_flags |= MSG_MORE;
+ want = xdr->head[0].iov_len + rmsize;
+ if (base < want) {
+ unsigned int len = want - base;
+
+ remainder -= len;
+ if (remainder == 0)
+ msg->msg_flags &= ~MSG_MORE;
+ if (rmsize)
+ err = xprt_send_rm_and_kvec(sock, msg, marker,
+ &xdr->head[0], base);
+ else
+ err = xprt_send_kvec(sock, msg, &xdr->head[0], base);
+ if (remainder == 0 || err != len)
+ goto out;
+ *sent_p += err;
+ base = 0;
+ } else {
+ base -= want;
+ }
+
+ if (base < xdr->page_len) {
+ unsigned int len = xdr->page_len - base;
+
+ remainder -= len;
+ if (remainder == 0)
+ msg->msg_flags &= ~MSG_MORE;
+ err = xprt_send_pagedata(sock, msg, xdr, base);
+ if (remainder == 0 || err != len)
+ goto out;
+ *sent_p += err;
+ base = 0;
+ } else {
+ base -= xdr->page_len;
+ }
+
+ if (base >= xdr->tail[0].iov_len)
+ return 0;
+ msg->msg_flags &= ~MSG_MORE;
+ err = xprt_send_kvec(sock, msg, &xdr->tail[0], base);
+out:
+ if (err > 0) {
+ *sent_p += err;
+ err = 0;
+ }
+ return err;
+}
diff --git a/net/sunrpc/socklib.h b/net/sunrpc/socklib.h
new file mode 100644
index 0000000..c48114a
--- /dev/null
+++ b/net/sunrpc/socklib.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
+ * Copyright (C) 2020, Oracle.
+ */
+
+#ifndef _NET_SUNRPC_SOCKLIB_H_
+#define _NET_SUNRPC_SOCKLIB_H_
+
+int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb);
+int xprt_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ struct xdr_buf *xdr, unsigned int base,
+ rpc_fraghdr marker, unsigned int *sent_p);
+
+#endif /* _NET_SUNRPC_SOCKLIB_H_ */
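The new socklib.h pairs with the xprt_sock_sendmsg() helper added to socklib.c above, giving the socket transports one shared iov_iter-based send path. A minimal caller sketch follows, assuming the usual RFC 1831 record marking done by the stream transports; the wrapper name and the partial-send handling are illustrative, not taken from the patch.

static int example_stream_send(struct socket *sock, struct xdr_buf *xdr)
{
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
	rpc_fraghdr marker;
	unsigned int sent = 0;
	int err;

	/* Record marker: last-fragment bit plus the message length. */
	marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | xdr->len);

	err = xprt_sock_sendmsg(sock, &msg, xdr, 0, marker, &sent);
	if (err == 0 && sent < sizeof(marker) + xdr->len)
		err = -EAGAIN;	/* partial send; caller resumes from @sent */
	return err;
}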
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 7c74197..c964b48 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -69,12 +69,11 @@
return single_open(file, rpc_proc_show, PDE_DATA(inode));
}
-static const struct file_operations rpc_proc_fops = {
- .owner = THIS_MODULE,
- .open = rpc_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
+static const struct proc_ops rpc_proc_ops = {
+ .proc_open = rpc_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
};
/*
@@ -281,19 +280,19 @@
*/
static inline struct proc_dir_entry *
do_register(struct net *net, const char *name, void *data,
- const struct file_operations *fops)
+ const struct proc_ops *proc_ops)
{
struct sunrpc_net *sn;
dprintk("RPC: registering /proc/net/rpc/%s\n", name);
sn = net_generic(net, sunrpc_net_id);
- return proc_create_data(name, 0, sn->proc_net_rpc, fops, data);
+ return proc_create_data(name, 0, sn->proc_net_rpc, proc_ops, data);
}
struct proc_dir_entry *
rpc_proc_register(struct net *net, struct rpc_stat *statp)
{
- return do_register(net, statp->program->name, statp, &rpc_proc_fops);
+ return do_register(net, statp->program->name, statp, &rpc_proc_ops);
}
EXPORT_SYMBOL_GPL(rpc_proc_register);
@@ -308,9 +307,9 @@
EXPORT_SYMBOL_GPL(rpc_proc_unregister);
struct proc_dir_entry *
-svc_proc_register(struct net *net, struct svc_stat *statp, const struct file_operations *fops)
+svc_proc_register(struct net *net, struct svc_stat *statp, const struct proc_ops *proc_ops)
{
- return do_register(net, statp->program->pg_name, statp, fops);
+ return do_register(net, statp->program->pg_name, statp, proc_ops);
}
EXPORT_SYMBOL_GPL(svc_proc_register);
diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
index 82035fa..2f59464 100644
--- a/net/sunrpc/sunrpc.h
+++ b/net/sunrpc/sunrpc.h
@@ -4,7 +4,7 @@
NetApp provides this source code under the GPL v2 License.
The GPL v2 license is available at
-http://opensource.org/licenses/gpl-license.php.
+https://opensource.org/licenses/gpl-license.php.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@@ -50,10 +50,6 @@
return loopback;
}
-int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
- struct page *headpage, unsigned long headoffset,
- struct page *tailpage, unsigned long tailoffset);
-
int rpc_clients_notifier_register(void);
void rpc_clients_notifier_unregister(void);
void auth_domain_cleanup(void);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 1741f11..d38788c 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -88,15 +88,15 @@
switch (*ip)
{
case SVC_POOL_AUTO:
- return strlcpy(buf, "auto", 20);
+ return strlcpy(buf, "auto\n", 20);
case SVC_POOL_GLOBAL:
- return strlcpy(buf, "global", 20);
+ return strlcpy(buf, "global\n", 20);
case SVC_POOL_PERCPU:
- return strlcpy(buf, "percpu", 20);
+ return strlcpy(buf, "percpu\n", 20);
case SVC_POOL_PERNODE:
- return strlcpy(buf, "pernode", 20);
+ return strlcpy(buf, "pernode\n", 20);
default:
- return sprintf(buf, "%d", *ip);
+ return sprintf(buf, "%d\n", *ip);
}
}
@@ -991,6 +991,7 @@
#endif
}
+ trace_svc_register(progname, version, protocol, port, family, error);
return error;
}
@@ -1000,11 +1001,6 @@
unsigned short proto,
unsigned short port)
{
- dprintk("svc: svc_register(%sv%d, %s, %u, %u)\n",
- progp->pg_name, version,
- proto == IPPROTO_UDP? "udp" : "tcp",
- port, family);
-
return __svc_register(net, progp->pg_name, progp->pg_prog,
version, family, proto, port);
@@ -1024,11 +1020,8 @@
return 0;
if (vers->vs_hidden) {
- dprintk("svc: svc_register(%sv%d, %s, %u, %u)"
- " (but not telling portmap)\n",
- progp->pg_name, version,
- proto == IPPROTO_UDP? "udp" : "tcp",
- port, family);
+ trace_svc_noregister(progp->pg_name, version, proto,
+ port, family, 0);
return 0;
}
@@ -1106,8 +1099,7 @@
if (error == -EPROTONOSUPPORT)
error = rpcb_register(net, program, version, 0, 0);
- dprintk("svc: %s(%sv%u), error %d\n",
- __func__, progname, version, error);
+ trace_svc_unregister(progname, version, error);
}
/*
@@ -1132,9 +1124,6 @@
continue;
if (progp->pg_vers[i]->vs_hidden)
continue;
-
- dprintk("svc: attempting to unregister %sv%u\n",
- progp->pg_name, i);
__svc_unregister(net, progp->pg_prog, i, progp->pg_name);
}
}
@@ -1337,6 +1326,8 @@
auth_stat = rpc_autherr_badcred;
auth_res = progp->pg_authenticate(rqstp);
}
+ if (auth_res != SVC_OK)
+ trace_svc_authenticate(rqstp, auth_res, auth_stat);
switch (auth_res) {
case SVC_OK:
break;
@@ -1529,10 +1520,6 @@
goto out_drop;
}
- /* Reserve space for the record marker */
- if (rqstp->rq_prot == IPPROTO_TCP)
- svc_putnl(resv, 0);
-
/* Returns 1 for send, 0 for drop */
if (likely(svc_process_common(rqstp, argv, resv)))
return svc_send(rqstp);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index f911153..06e5034 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -153,6 +153,7 @@
xprt_put(xprt->xpt_bc_xprt);
if (xprt->xpt_bc_xps)
xprt_switch_put(xprt->xpt_bc_xps);
+ trace_svc_xprt_free(xprt);
xprt->xpt_ops->xpo_free(xprt);
module_put(owner);
}
@@ -206,6 +207,7 @@
.sin6_port = htons(port),
};
#endif
+ struct svc_xprt *xprt;
struct sockaddr *sap;
size_t len;
@@ -224,7 +226,11 @@
return ERR_PTR(-EAFNOSUPPORT);
}
- return xcl->xcl_ops->xpo_create(serv, net, sap, len, flags);
+ xprt = xcl->xcl_ops->xpo_create(serv, net, sap, len, flags);
+ if (IS_ERR(xprt))
+ trace_svc_xprt_create_err(serv->sv_program->pg_name,
+ xcl->xcl_name, sap, xprt);
+ return xprt;
}
/*
@@ -304,15 +310,11 @@
{
int err;
- dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
err = _svc_create_xprt(serv, xprt_name, net, family, port, flags, cred);
if (err == -EPROTONOSUPPORT) {
request_module("svc%s", xprt_name);
err = _svc_create_xprt(serv, xprt_name, net, family, port, flags, cred);
}
- if (err < 0)
- dprintk("svc: transport %s not found, err %d\n",
- xprt_name, -err);
return err;
}
EXPORT_SYMBOL_GPL(svc_create_xprt);
@@ -780,7 +782,6 @@
int len = 0;
if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) {
- dprintk("svc_recv: found XPT_CLOSE\n");
if (test_and_clear_bit(XPT_KILL_TEMP, &xprt->xpt_flags))
xprt->xpt_ops->xpo_kill_temp_xprt(xprt);
svc_delete_xprt(xprt);
@@ -799,6 +800,7 @@
if (newxpt) {
newxpt->xpt_cred = get_cred(xprt->xpt_cred);
svc_add_new_temp_xprt(serv, newxpt);
+ trace_svc_xprt_accept(newxpt, serv->sv_name);
} else
module_put(xprt->xpt_class->xcl_owner);
} else if (svc_xprt_reserve_slot(rqstp, xprt)) {
@@ -811,6 +813,8 @@
len = svc_deferred_recv(rqstp);
else
len = xprt->xpt_ops->xpo_recvfrom(rqstp);
+ if (len > 0)
+ trace_svc_xdr_recvfrom(rqstp, &rqstp->rq_arg);
rqstp->rq_stime = ktime_get();
rqstp->rq_reserved = serv->sv_max_mesg;
atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
@@ -833,14 +837,6 @@
struct svc_serv *serv = rqstp->rq_server;
int len, err;
- dprintk("svc: server %p waiting for data (to = %ld)\n",
- rqstp, timeout);
-
- if (rqstp->rq_xprt)
- printk(KERN_ERR
- "svc_recv: service %p, transport not NULL!\n",
- rqstp);
-
err = svc_alloc_arg(rqstp);
if (err)
goto out;
@@ -888,7 +884,6 @@
void svc_drop(struct svc_rqst *rqstp)
{
trace_svc_drop(rqstp);
- dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt);
svc_xprt_release(rqstp);
}
EXPORT_SYMBOL_GPL(svc_drop);
@@ -911,16 +906,11 @@
xb->len = xb->head[0].iov_len +
xb->page_len +
xb->tail[0].iov_len;
-
- /* Grab mutex to serialize outgoing data. */
- mutex_lock(&xprt->xpt_mutex);
+ trace_svc_xdr_sendto(rqstp, xb);
trace_svc_stats_latency(rqstp);
- if (test_bit(XPT_DEAD, &xprt->xpt_flags)
- || test_bit(XPT_CLOSE, &xprt->xpt_flags))
- len = -ENOTCONN;
- else
- len = xprt->xpt_ops->xpo_sendto(rqstp);
- mutex_unlock(&xprt->xpt_mutex);
+
+ len = xprt->xpt_ops->xpo_sendto(rqstp);
+
trace_svc_send(rqstp, len);
svc_xprt_release(rqstp);
@@ -1028,11 +1018,10 @@
struct svc_serv *serv = xprt->xpt_server;
struct svc_deferred_req *dr;
- /* Only do this once */
if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags))
- BUG();
+ return;
- dprintk("svc: svc_delete_xprt(%p)\n", xprt);
+ trace_svc_xprt_detach(xprt);
xprt->xpt_ops->xpo_detach(xprt);
if (xprt->xpt_bc_xprt)
xprt->xpt_bc_xprt->ops->close(xprt->xpt_bc_xprt);
@@ -1053,6 +1042,7 @@
void svc_close_xprt(struct svc_xprt *xprt)
{
+ trace_svc_xprt_close(xprt);
set_bit(XPT_CLOSE, &xprt->xpt_flags);
if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags))
/* someone else will have to effect the close */
@@ -1155,16 +1145,15 @@
set_bit(XPT_DEFERRED, &xprt->xpt_flags);
if (too_many || test_bit(XPT_DEAD, &xprt->xpt_flags)) {
spin_unlock(&xprt->xpt_lock);
- dprintk("revisit canceled\n");
+ trace_svc_defer_drop(dr);
svc_xprt_put(xprt);
- trace_svc_drop_deferred(dr);
kfree(dr);
return;
}
- dprintk("revisit queued\n");
dr->xprt = NULL;
list_add(&dr->handle.recent, &xprt->xpt_deferred);
spin_unlock(&xprt->xpt_lock);
+ trace_svc_defer_queue(dr);
svc_xprt_enqueue(xprt);
svc_xprt_put(xprt);
}
@@ -1210,22 +1199,24 @@
memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip,
dr->argslen << 2);
}
+ trace_svc_defer(rqstp);
svc_xprt_get(rqstp->rq_xprt);
dr->xprt = rqstp->rq_xprt;
set_bit(RQ_DROPME, &rqstp->rq_flags);
dr->handle.revisit = svc_revisit;
- trace_svc_defer(rqstp);
return &dr->handle;
}
/*
* recv data from a deferred request into an active one
*/
-static int svc_deferred_recv(struct svc_rqst *rqstp)
+static noinline int svc_deferred_recv(struct svc_rqst *rqstp)
{
struct svc_deferred_req *dr = rqstp->rq_deferred;
+ trace_svc_defer_recv(dr);
+
/* setup iov_base past transport header */
rqstp->rq_arg.head[0].iov_base = dr->args + (dr->xprt_hlen>>2);
/* The iov_len does not include the transport header bytes */
@@ -1256,7 +1247,6 @@
struct svc_deferred_req,
handle.recent);
list_del_init(&dr->handle.recent);
- trace_svc_revisit_deferred(dr);
} else
clear_bit(XPT_DEFERRED, &xprt->xpt_flags);
spin_unlock(&xprt->xpt_lock);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 5c04ba7..97c0bdd 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -148,6 +148,11 @@
return NULL;
}
+static int ip_map_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall(cd, h);
+}
+
static void ip_map_request(struct cache_detail *cd,
struct cache_head *h,
char **bpp, int *blen)
@@ -166,7 +171,7 @@
}
static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class, struct in6_addr *addr);
-static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, struct unix_domain *udom, time_t expiry);
+static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, struct unix_domain *udom, time64_t expiry);
static int ip_map_parse(struct cache_detail *cd,
char *mesg, int mlen)
@@ -187,7 +192,7 @@
struct ip_map *ipmp;
struct auth_domain *dom;
- time_t expiry;
+ time64_t expiry;
if (mesg[mlen-1] != '\n')
return -EINVAL;
@@ -308,7 +313,7 @@
}
static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm,
- struct unix_domain *udom, time_t expiry)
+ struct unix_domain *udom, time64_t expiry)
{
struct ip_map ip;
struct cache_head *ch;
@@ -327,15 +332,6 @@
return 0;
}
-static inline int ip_map_update(struct net *net, struct ip_map *ipm,
- struct unix_domain *udom, time_t expiry)
-{
- struct sunrpc_net *sn;
-
- sn = net_generic(net, sunrpc_net_id);
- return __ip_map_update(sn->ip_map_cache, ipm, udom, expiry);
-}
-
void svcauth_unix_purge(struct net *net)
{
struct sunrpc_net *sn;
@@ -467,6 +463,11 @@
return NULL;
}
+static int unix_gid_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall_timeout(cd, h);
+}
+
static void unix_gid_request(struct cache_detail *cd,
struct cache_head *h,
char **bpp, int *blen)
@@ -491,7 +492,7 @@
int rv;
int i;
int err;
- time_t expiry;
+ time64_t expiry;
struct unix_gid ug, *ugp;
if (mesg[mlen - 1] != '\n')
@@ -584,6 +585,7 @@
.hash_size = GID_HASHMAX,
.name = "auth.unix.gid",
.cache_put = unix_gid_put,
+ .cache_upcall = unix_gid_upcall,
.cache_request = unix_gid_request,
.cache_parse = unix_gid_parse,
.cache_show = unix_gid_show,
@@ -881,6 +883,7 @@
.hash_size = IP_HASHMAX,
.name = "auth.unix.ip",
.cache_put = ip_map_put,
+ .cache_upcall = ip_map_upcall,
.cache_request = ip_map_request,
.cache_parse = ip_map_parse,
.cache_show = ip_map_show,
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index d52abde..eba1714 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -44,8 +44,8 @@
#include <net/tcp.h>
#include <net/tcp_states.h>
#include <linux/uaccess.h>
+#include <linux/highmem.h>
#include <asm/ioctls.h>
-#include <trace/events/skb.h>
#include <linux/sunrpc/types.h>
#include <linux/sunrpc/clnt.h>
@@ -55,6 +55,9 @@
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/xprt.h>
+#include <trace/events/sunrpc.h>
+
+#include "socklib.h"
#include "sunrpc.h"
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
@@ -107,31 +110,35 @@
}
#endif
-/*
- * Release an skbuff after use
+/**
+ * svc_tcp_release_rqst - Release transport-related resources
+ * @rqstp: request structure with resources to be released
+ *
*/
-static void svc_release_skb(struct svc_rqst *rqstp)
+static void svc_tcp_release_rqst(struct svc_rqst *rqstp)
{
struct sk_buff *skb = rqstp->rq_xprt_ctxt;
if (skb) {
struct svc_sock *svsk =
container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
- rqstp->rq_xprt_ctxt = NULL;
- dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
+ rqstp->rq_xprt_ctxt = NULL;
skb_free_datagram_locked(svsk->sk_sk, skb);
}
}
-static void svc_release_udp_skb(struct svc_rqst *rqstp)
+/**
+ * svc_udp_release_rqst - Release transport-related resources
+ * @rqstp: request structure with resources to be released
+ *
+ */
+static void svc_udp_release_rqst(struct svc_rqst *rqstp)
{
struct sk_buff *skb = rqstp->rq_xprt_ctxt;
if (skb) {
rqstp->rq_xprt_ctxt = NULL;
-
- dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
consume_skb(skb);
}
}
@@ -174,111 +181,6 @@
}
}
-/*
- * send routine intended to be shared by the fore- and back-channel
- */
-int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
- struct page *headpage, unsigned long headoffset,
- struct page *tailpage, unsigned long tailoffset)
-{
- int result;
- int size;
- struct page **ppage = xdr->pages;
- size_t base = xdr->page_base;
- unsigned int pglen = xdr->page_len;
- unsigned int flags = MSG_MORE | MSG_SENDPAGE_NOTLAST;
- int slen;
- int len = 0;
-
- slen = xdr->len;
-
- /* send head */
- if (slen == xdr->head[0].iov_len)
- flags = 0;
- len = kernel_sendpage(sock, headpage, headoffset,
- xdr->head[0].iov_len, flags);
- if (len != xdr->head[0].iov_len)
- goto out;
- slen -= xdr->head[0].iov_len;
- if (slen == 0)
- goto out;
-
- /* send page data */
- size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
- while (pglen > 0) {
- if (slen == size)
- flags = 0;
- result = kernel_sendpage(sock, *ppage, base, size, flags);
- if (result > 0)
- len += result;
- if (result != size)
- goto out;
- slen -= size;
- pglen -= size;
- size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
- base = 0;
- ppage++;
- }
-
- /* send tail */
- if (xdr->tail[0].iov_len) {
- result = kernel_sendpage(sock, tailpage, tailoffset,
- xdr->tail[0].iov_len, 0);
- if (result > 0)
- len += result;
- }
-
-out:
- return len;
-}
-
-
-/*
- * Generic sendto routine
- */
-static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
-{
- struct svc_sock *svsk =
- container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
- struct socket *sock = svsk->sk_sock;
- union {
- struct cmsghdr hdr;
- long all[SVC_PKTINFO_SPACE / sizeof(long)];
- } buffer;
- struct cmsghdr *cmh = &buffer.hdr;
- int len = 0;
- unsigned long tailoff;
- unsigned long headoff;
- RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
-
- if (rqstp->rq_prot == IPPROTO_UDP) {
- struct msghdr msg = {
- .msg_name = &rqstp->rq_addr,
- .msg_namelen = rqstp->rq_addrlen,
- .msg_control = cmh,
- .msg_controllen = sizeof(buffer),
- .msg_flags = MSG_MORE,
- };
-
- svc_set_cmsg_data(rqstp, cmh);
-
- if (sock_sendmsg(sock, &msg) < 0)
- goto out;
- }
-
- tailoff = ((unsigned long)xdr->tail[0].iov_base) & (PAGE_SIZE-1);
- headoff = 0;
- len = svc_send_common(sock, xdr, rqstp->rq_respages[0], headoff,
- rqstp->rq_respages[0], tailoff);
-
-out:
- dprintk("svc: socket %p sendto([%p %zu... ], %d) = %d (addr %s)\n",
- svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
- xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
-
- return len;
-}
-
static int svc_sock_read_payload(struct svc_rqst *rqstp, unsigned int offset,
unsigned int length)
{
@@ -322,34 +224,68 @@
return len;
}
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+static void svc_flush_bvec(const struct bio_vec *bvec, size_t size, size_t seek)
+{
+ struct bvec_iter bi = {
+ .bi_size = size + seek,
+ };
+ struct bio_vec bv;
+
+ bvec_iter_advance(bvec, &bi, seek & PAGE_MASK);
+ for_each_bvec(bv, bvec, bi, bi)
+ flush_dcache_page(bv.bv_page);
+}
+#else
+static inline void svc_flush_bvec(const struct bio_vec *bvec, size_t size,
+ size_t seek)
+{
+}
+#endif
+
/*
- * Generic recvfrom routine.
+ * Read from @rqstp's transport socket. The incoming message fills whole
+ * pages in @rqstp's rq_pages array until the last page of the message
+ * has been received into a partial page.
*/
-static ssize_t svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov,
- unsigned int nr, size_t buflen, unsigned int base)
+static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen,
+ size_t seek)
{
struct svc_sock *svsk =
container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+ struct bio_vec *bvec = rqstp->rq_bvec;
struct msghdr msg = { NULL };
+ unsigned int i;
ssize_t len;
+ size_t t;
rqstp->rq_xprt_hlen = 0;
clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- iov_iter_kvec(&msg.msg_iter, READ, iov, nr, buflen);
- if (base != 0) {
- iov_iter_advance(&msg.msg_iter, base);
- buflen -= base;
+
+ for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE) {
+ bvec[i].bv_page = rqstp->rq_pages[i];
+ bvec[i].bv_len = PAGE_SIZE;
+ bvec[i].bv_offset = 0;
+ }
+ rqstp->rq_respages = &rqstp->rq_pages[i];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+ iov_iter_bvec(&msg.msg_iter, READ, bvec, i, buflen);
+ if (seek) {
+ iov_iter_advance(&msg.msg_iter, seek);
+ buflen -= seek;
}
len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT);
+ if (len > 0)
+ svc_flush_bvec(bvec, len, seek);
+
/* If we read a full record, then assume there may be more
* data to read (stream based sockets only!)
*/
if (len == buflen)
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- dprintk("svc: socket %p recvfrom(%p, %zu) = %zd\n",
- svsk, iov[0].iov_base, iov[0].iov_len, len);
return len;
}
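
The loop in svc_tcp_read_msg() above maps the incoming message onto rq_pages one PAGE_SIZE bio_vec at a time, and rq_respages ends up pointing just past the last page used for receiving. A minimal, self-contained sketch of that page arithmetic (illustrative only; assumes 4 KiB pages and uses a hypothetical pages_for() helper):

    #include <stdio.h>
    #include <stddef.h>

    #define PAGE_SIZE 4096UL    /* assumption: 4 KiB pages */

    /* Count the PAGE_SIZE bio_vecs needed for buflen bytes, mirroring the
     * "for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE)" loop above. */
    static size_t pages_for(size_t buflen)
    {
        size_t i, t;

        for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE)
            ;
        return i;    /* rq_respages would be &rq_pages[i] */
    }

    int main(void)
    {
        printf("%zu\n", pages_for(1));        /* 1   */
        printf("%zu\n", pages_for(4096));     /* 1   */
        printf("%zu\n", pages_for(4097));     /* 2   */
        printf("%zu\n", pages_for(1048576));  /* 256 */
        return 0;
    }

Equivalently, the number of pages consumed is DIV_ROUND_UP(buflen, PAGE_SIZE).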
@@ -386,13 +322,10 @@
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
if (svsk) {
- dprintk("svc: socket %p(inet %p), busy=%d\n",
- svsk, sk,
- test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
-
/* Refer to svc_setup_socket() for details. */
rmb();
svsk->sk_odata(sk);
+ trace_svcsock_data_ready(&svsk->sk_xprt, 0);
if (!test_and_set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags))
svc_xprt_enqueue(&svsk->sk_xprt);
}
@@ -406,11 +339,9 @@
struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
if (svsk) {
- dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
- svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
-
/* Refer to svc_setup_socket() for details. */
rmb();
+ trace_svcsock_write_space(&svsk->sk_xprt, 0);
svsk->sk_owspace(sk);
svc_xprt_enqueue(&svsk->sk_xprt);
}
@@ -427,17 +358,9 @@
static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt)
{
- struct svc_sock *svsk;
- struct socket *sock;
- struct linger no_linger = {
- .l_onoff = 1,
- .l_linger = 0,
- };
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
- svsk = container_of(xprt, struct svc_sock, sk_xprt);
- sock = svsk->sk_sock;
- kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
- (char *)&no_linger, sizeof(no_linger));
+ sock_no_linger(svsk->sk_sock->sk);
}
/*
@@ -495,8 +418,15 @@
return 0;
}
-/*
- * Receive a datagram from a UDP socket.
+/**
+ * svc_udp_recvfrom - Receive a datagram from a UDP socket.
+ * @rqstp: request structure into which to receive an RPC Call
+ *
+ * Called in a loop when XPT_DATA has been set.
+ *
+ * Returns:
+ * On success, the number of bytes in a received RPC Call, or
+ * %0 if a complete RPC Call message was not ready to return
*/
static int svc_udp_recvfrom(struct svc_rqst *rqstp)
{
@@ -530,20 +460,14 @@
svc_sock_setbufsize(svsk, serv->sv_nrthreads + 3);
clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- skb = NULL;
err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
0, 0, MSG_PEEK | MSG_DONTWAIT);
- if (err >= 0)
- skb = skb_recv_udp(svsk->sk_sk, 0, 1, &err);
+ if (err < 0)
+ goto out_recv_err;
+ skb = skb_recv_udp(svsk->sk_sk, 0, 1, &err);
+ if (!skb)
+ goto out_recv_err;
- if (skb == NULL) {
- if (err != -EAGAIN) {
- /* possibly an icmp error */
- dprintk("svc: recvfrom returned error %d\n", -err);
- set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- }
- return 0;
- }
len = svc_addr_len(svc_addr(rqstp));
rqstp->rq_addrlen = len;
if (skb->tstamp == 0) {
@@ -554,26 +478,21 @@
sock_write_timestamp(svsk->sk_sk, skb->tstamp);
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */
- len = skb->len;
+ len = skb->len;
rqstp->rq_arg.len = len;
+ trace_svcsock_udp_recv(&svsk->sk_xprt, len);
rqstp->rq_prot = IPPROTO_UDP;
- if (!svc_udp_get_dest_address(rqstp, cmh)) {
- net_warn_ratelimited("svc: received unknown control message %d/%d; dropping RPC reply datagram\n",
- cmh->cmsg_level, cmh->cmsg_type);
- goto out_free;
- }
+ if (!svc_udp_get_dest_address(rqstp, cmh))
+ goto out_cmsg_err;
rqstp->rq_daddrlen = svc_addr_len(svc_daddr(rqstp));
if (skb_is_nonlinear(skb)) {
/* we have to copy */
local_bh_disable();
- if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) {
- local_bh_enable();
- /* checksum error */
- goto out_free;
- }
+ if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb))
+ goto out_bh_enable;
local_bh_enable();
consume_skb(skb);
} else {
@@ -601,24 +520,79 @@
serv->sv_stats->netudpcnt++;
return len;
+
+out_recv_err:
+ if (err != -EAGAIN) {
+ /* possibly an icmp error */
+ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+ }
+ trace_svcsock_udp_recv_err(&svsk->sk_xprt, err);
+ return 0;
+out_cmsg_err:
+ net_warn_ratelimited("svc: received unknown control message %d/%d; dropping RPC reply datagram\n",
+ cmh->cmsg_level, cmh->cmsg_type);
+ goto out_free;
+out_bh_enable:
+ local_bh_enable();
out_free:
kfree_skb(skb);
return 0;
}
-static int
-svc_udp_sendto(struct svc_rqst *rqstp)
+/**
+ * svc_udp_sendto - Send out a reply on a UDP socket
+ * @rqstp: completed svc_rqst
+ *
+ * xpt_mutex ensures @rqstp's whole message is written to the socket
+ * without interruption.
+ *
+ * Returns the number of bytes sent, or a negative errno.
+ */
+static int svc_udp_sendto(struct svc_rqst *rqstp)
{
- int error;
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ struct xdr_buf *xdr = &rqstp->rq_res;
+ union {
+ struct cmsghdr hdr;
+ long all[SVC_PKTINFO_SPACE / sizeof(long)];
+ } buffer;
+ struct cmsghdr *cmh = &buffer.hdr;
+ struct msghdr msg = {
+ .msg_name = &rqstp->rq_addr,
+ .msg_namelen = rqstp->rq_addrlen,
+ .msg_control = cmh,
+ .msg_controllen = sizeof(buffer),
+ };
+ unsigned int sent;
+ int err;
- svc_release_udp_skb(rqstp);
+ svc_udp_release_rqst(rqstp);
- error = svc_sendto(rqstp, &rqstp->rq_res);
- if (error == -ECONNREFUSED)
+ svc_set_cmsg_data(rqstp, cmh);
+
+ mutex_lock(&xprt->xpt_mutex);
+
+ if (svc_xprt_is_dead(xprt))
+ goto out_notconn;
+
+ err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent);
+ xdr_free_bvec(xdr);
+ if (err == -ECONNREFUSED) {
/* ICMP error on earlier request. */
- error = svc_sendto(rqstp, &rqstp->rq_res);
+ err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent);
+ xdr_free_bvec(xdr);
+ }
+ trace_svcsock_udp_send(xprt, err);
- return error;
+ mutex_unlock(&xprt->xpt_mutex);
+ if (err < 0)
+ return err;
+ return sent;
+
+out_notconn:
+ mutex_unlock(&xprt->xpt_mutex);
+ return -ENOTCONN;
}
static int svc_udp_has_wspace(struct svc_xprt *xprt)
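
svc_udp_sendto() above attaches destination address information to the reply as ancillary data (via svc_set_cmsg_data()) so the datagram leaves from the address the request arrived on. A rough userspace analogue of that msghdr/cmsghdr setup for the IPv4 case (sketch only: the helper name and the IP_PKTINFO-based approach here are illustrative, not part of this patch):

    #define _GNU_SOURCE
    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>
    #include <netinet/in.h>

    /* Userspace analogue of the reply-side cmsg setup: pin the datagram's
     * source address with IP_PKTINFO before calling sendmsg(). */
    static ssize_t send_udp_reply(int sock, const struct sockaddr_in *peer,
                                  struct in_addr local_addr,
                                  const void *buf, size_t len)
    {
        struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
        union {
            struct cmsghdr hdr;
            char bytes[CMSG_SPACE(sizeof(struct in_pktinfo))];
        } control;
        struct msghdr msg = {
            .msg_name = (void *)peer,
            .msg_namelen = sizeof(*peer),
            .msg_iov = &iov,
            .msg_iovlen = 1,
            .msg_control = &control,
            .msg_controllen = sizeof(control),
        };
        struct cmsghdr *cmh;
        struct in_pktinfo *pki;

        memset(&control, 0, sizeof(control));
        cmh = CMSG_FIRSTHDR(&msg);
        cmh->cmsg_level = IPPROTO_IP;
        cmh->cmsg_type = IP_PKTINFO;
        cmh->cmsg_len = CMSG_LEN(sizeof(*pki));
        pki = (struct in_pktinfo *)CMSG_DATA(cmh);
        pki->ipi_spec_dst = local_addr;  /* reply leaves from this address */

        return sendmsg(sock, &msg, 0);
    }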
@@ -662,7 +636,7 @@
.xpo_recvfrom = svc_udp_recvfrom,
.xpo_sendto = svc_udp_sendto,
.xpo_read_payload = svc_sock_read_payload,
- .xpo_release_rqst = svc_release_udp_skb,
+ .xpo_release_rqst = svc_udp_release_rqst,
.xpo_detach = svc_sock_detach,
.xpo_free = svc_sock_free,
.xpo_has_wspace = svc_udp_has_wspace,
@@ -681,8 +655,6 @@
static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
{
- int err, level, optname, one = 1;
-
svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class,
&svsk->sk_xprt, serv);
clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
@@ -702,19 +674,14 @@
/* make sure we get destination address info */
switch (svsk->sk_sk->sk_family) {
case AF_INET:
- level = SOL_IP;
- optname = IP_PKTINFO;
+ ip_sock_set_pktinfo(svsk->sk_sock->sk);
break;
case AF_INET6:
- level = SOL_IPV6;
- optname = IPV6_RECVPKTINFO;
+ ip6_sock_set_recvpktinfo(svsk->sk_sock->sk);
break;
default:
BUG();
}
- err = kernel_setsockopt(svsk->sk_sock, level, optname,
- (char *)&one, sizeof(one));
- dprintk("svc: kernel_setsockopt returned %d\n", err);
}
/*
@@ -725,9 +692,6 @@
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
- dprintk("svc: socket %p TCP (listen) state change %d\n",
- sk, sk->sk_state);
-
if (svsk) {
/* Refer to svc_setup_socket() for details. */
rmb();
@@ -748,8 +712,7 @@
if (svsk) {
set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
- } else
- printk("svc: socket %p: no user data\n", sk);
+ }
}
}
@@ -760,15 +723,11 @@
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
- dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n",
- sk, sk->sk_state, sk->sk_user_data);
-
- if (!svsk)
- printk("svc: socket %p: no user data\n", sk);
- else {
+ if (svsk) {
/* Refer to svc_setup_socket() for details. */
rmb();
svsk->sk_ostate(sk);
+ trace_svcsock_tcp_state(&svsk->sk_xprt, svsk->sk_sock);
if (sk->sk_state != TCP_ESTABLISHED) {
set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
@@ -789,9 +748,7 @@
struct socket *newsock;
struct svc_sock *newsvsk;
int err, slen;
- RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
- dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
if (!sock)
return NULL;
@@ -804,30 +761,18 @@
else if (err != -EAGAIN)
net_warn_ratelimited("%s: accept failed (err %d)!\n",
serv->sv_name, -err);
+ trace_svcsock_accept_err(xprt, serv->sv_name, err);
return NULL;
}
set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
err = kernel_getpeername(newsock, sin);
if (err < 0) {
- net_warn_ratelimited("%s: peername failed (err %d)!\n",
- serv->sv_name, -err);
+ trace_svcsock_getpeername_err(xprt, serv->sv_name, err);
goto failed; /* aborted connection or whatever */
}
slen = err;
- /* Ideally, we would want to reject connections from unauthorized
- * hosts here, but when we get encryption, the IP of the host won't
- * tell us anything. For now just warn about unpriv connections.
- */
- if (!svc_port_is_privileged(sin)) {
- dprintk("%s: connect from unprivileged port: %s\n",
- serv->sv_name,
- __svc_print_addr(sin, buf, sizeof(buf)));
- }
- dprintk("%s: connect from %s\n", serv->sv_name,
- __svc_print_addr(sin, buf, sizeof(buf)));
-
/* Reset the inherited callbacks before calling svc_setup_socket */
newsock->sk->sk_state_change = svsk->sk_ostate;
newsock->sk->sk_data_ready = svsk->sk_odata;
@@ -845,10 +790,8 @@
svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen);
err = kernel_getsockname(newsock, sin);
slen = err;
- if (unlikely(err < 0)) {
- dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err);
+ if (unlikely(err < 0))
slen = offsetof(struct sockaddr, sa_data);
- }
svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen);
if (sock_is_loopback(newsock->sk))
@@ -865,13 +808,14 @@
return NULL;
}
-static unsigned int svc_tcp_restore_pages(struct svc_sock *svsk, struct svc_rqst *rqstp)
+static size_t svc_tcp_restore_pages(struct svc_sock *svsk,
+ struct svc_rqst *rqstp)
{
- unsigned int i, len, npages;
+ size_t len = svsk->sk_datalen;
+ unsigned int i, npages;
- if (svsk->sk_datalen == 0)
+ if (!len)
return 0;
- len = svsk->sk_datalen;
npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
for (i = 0; i < npages; i++) {
if (rqstp->rq_pages[i] != NULL)
@@ -920,47 +864,45 @@
}
/*
- * Receive fragment record header.
- * If we haven't gotten the record length yet, get the next four bytes.
+ * Receive fragment record header into sk_marker.
*/
-static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
+static ssize_t svc_tcp_read_marker(struct svc_sock *svsk,
+ struct svc_rqst *rqstp)
{
- struct svc_serv *serv = svsk->sk_xprt.xpt_server;
- unsigned int want;
- int len;
+ ssize_t want, len;
+ /* If we haven't gotten the record length yet,
+ * get the next four bytes.
+ */
if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) {
+ struct msghdr msg = { NULL };
struct kvec iov;
want = sizeof(rpc_fraghdr) - svsk->sk_tcplen;
- iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen;
+ iov.iov_base = ((char *)&svsk->sk_marker) + svsk->sk_tcplen;
iov.iov_len = want;
- len = svc_recvfrom(rqstp, &iov, 1, want, 0);
+ iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, want);
+ len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT);
if (len < 0)
- goto error;
+ return len;
svsk->sk_tcplen += len;
-
if (len < want) {
- dprintk("svc: short recvfrom while reading record "
- "length (%d of %d)\n", len, want);
- return -EAGAIN;
+ /* call again to read the remaining bytes */
+ goto err_short;
}
-
- dprintk("svc: TCP record, %d bytes\n", svc_sock_reclen(svsk));
+ trace_svcsock_marker(&svsk->sk_xprt, svsk->sk_marker);
if (svc_sock_reclen(svsk) + svsk->sk_datalen >
- serv->sv_max_mesg) {
- net_notice_ratelimited("RPC: fragment too large: %d\n",
- svc_sock_reclen(svsk));
- goto err_delete;
- }
+ svsk->sk_xprt.xpt_server->sv_max_mesg)
+ goto err_too_large;
}
-
return svc_sock_reclen(svsk);
-error:
- dprintk("RPC: TCP recv_record got %d\n", len);
- return len;
-err_delete:
+
+err_too_large:
+ net_notice_ratelimited("svc: %s %s RPC fragment too large: %d\n",
+ __func__, svsk->sk_xprt.xpt_server->sv_name,
+ svc_sock_reclen(svsk));
set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+err_short:
return -EAGAIN;
}
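
The four bytes accumulated into sk_marker follow the RPC-over-TCP record marking of RFC 5531: the most significant bit flags the final fragment and the remaining 31 bits carry the fragment length, which is what svc_sock_reclen() extracts. A small, self-contained decode sketch (userspace C, illustrative values):

    #include <stdint.h>
    #include <stdio.h>
    #include <arpa/inet.h>  /* ntohl()/htonl() */

    #define RPC_LAST_STREAM_FRAGMENT 0x80000000u

    /* Decode a 4-byte RPC-over-TCP record marker as read off the wire. */
    static void decode_marker(uint32_t marker_be, int *last, uint32_t *len)
    {
        uint32_t m = ntohl(marker_be);

        *last = (m & RPC_LAST_STREAM_FRAGMENT) != 0;
        *len  = m & ~RPC_LAST_STREAM_FRAGMENT;  /* low 31 bits: fragment length */
    }

    int main(void)
    {
        int last;
        uint32_t len;

        /* 0x80000064 on the wire: final fragment, 100 bytes follow */
        decode_marker(htonl(0x80000064), &last, &len);
        printf("last=%d len=%u\n", last, len);  /* last=1 len=100 */
        return 0;
    }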
@@ -1009,87 +951,58 @@
return -EAGAIN;
}
-static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len)
-{
- int i = 0;
- int t = 0;
-
- while (t < len) {
- vec[i].iov_base = page_address(pages[i]);
- vec[i].iov_len = PAGE_SIZE;
- i++;
- t += PAGE_SIZE;
- }
- return i;
-}
-
static void svc_tcp_fragment_received(struct svc_sock *svsk)
{
/* If we have more data, signal svc_xprt_enqueue() to try again */
- dprintk("svc: TCP %s record (%d bytes)\n",
- svc_sock_final_rec(svsk) ? "final" : "nonfinal",
- svc_sock_reclen(svsk));
svsk->sk_tcplen = 0;
- svsk->sk_reclen = 0;
+ svsk->sk_marker = xdr_zero;
}
-/*
- * Receive data from a TCP socket.
+/**
+ * svc_tcp_recvfrom - Receive data from a TCP socket
+ * @rqstp: request structure into which to receive an RPC Call
+ *
+ * Called in a loop when XPT_DATA has been set.
+ *
+ * Read the 4-byte stream record marker, then use the record length
+ * in that marker to set up exactly the resources needed to receive
+ * the next RPC message into @rqstp.
+ *
+ * Returns:
+ * On success, the number of bytes in a received RPC Call, or
+ * %0 if a complete RPC Call message was not ready to return
+ *
+ * The zero return case handles partial receives and callback Replies.
+ * The state of a partial receive is preserved in the svc_sock for
+ * the next call to svc_tcp_recvfrom.
*/
static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
{
struct svc_sock *svsk =
container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
struct svc_serv *serv = svsk->sk_xprt.xpt_server;
- int len;
- struct kvec *vec;
- unsigned int want, base;
+ size_t want, base;
+ ssize_t len;
__be32 *p;
__be32 calldir;
- int pnum;
- dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
- svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
- test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags),
- test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));
-
- len = svc_tcp_recv_record(svsk, rqstp);
+ clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+ len = svc_tcp_read_marker(svsk, rqstp);
if (len < 0)
goto error;
base = svc_tcp_restore_pages(svsk, rqstp);
- want = svc_sock_reclen(svsk) - (svsk->sk_tcplen - sizeof(rpc_fraghdr));
-
- vec = rqstp->rq_vec;
-
- pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0], base + want);
-
- rqstp->rq_respages = &rqstp->rq_pages[pnum];
- rqstp->rq_next_page = rqstp->rq_respages + 1;
-
- /* Now receive data */
- len = svc_recvfrom(rqstp, vec, pnum, base + want, base);
+ want = len - (svsk->sk_tcplen - sizeof(rpc_fraghdr));
+ len = svc_tcp_read_msg(rqstp, base + want, base);
if (len >= 0) {
+ trace_svcsock_tcp_recv(&svsk->sk_xprt, len);
svsk->sk_tcplen += len;
svsk->sk_datalen += len;
}
- if (len != want || !svc_sock_final_rec(svsk)) {
- svc_tcp_save_pages(svsk, rqstp);
- if (len < 0 && len != -EAGAIN)
- goto err_delete;
- if (len == want)
- svc_tcp_fragment_received(svsk);
- else
- dprintk("svc: incomplete TCP record (%d of %d)\n",
- (int)(svsk->sk_tcplen - sizeof(rpc_fraghdr)),
- svc_sock_reclen(svsk));
- goto err_noclose;
- }
-
- if (svsk->sk_datalen < 8) {
- svsk->sk_datalen = 0;
- goto err_delete; /* client is nuts. */
- }
+ if (len != want || !svc_sock_final_rec(svsk))
+ goto err_incomplete;
+ if (svsk->sk_datalen < 8)
+ goto err_nuts;
rqstp->rq_arg.len = svsk->sk_datalen;
rqstp->rq_arg.page_base = 0;
@@ -1124,50 +1037,163 @@
return rqstp->rq_arg.len;
+err_incomplete:
+ svc_tcp_save_pages(svsk, rqstp);
+ if (len < 0 && len != -EAGAIN)
+ goto err_delete;
+ if (len == want)
+ svc_tcp_fragment_received(svsk);
+ else
+ trace_svcsock_tcp_recv_short(&svsk->sk_xprt,
+ svc_sock_reclen(svsk),
+ svsk->sk_tcplen - sizeof(rpc_fraghdr));
+ goto err_noclose;
error:
if (len != -EAGAIN)
goto err_delete;
- dprintk("RPC: TCP recvfrom got EAGAIN\n");
+ trace_svcsock_tcp_recv_eagain(&svsk->sk_xprt, 0);
return 0;
+err_nuts:
+ svsk->sk_datalen = 0;
err_delete:
- printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
- svsk->sk_xprt.xpt_server->sv_name, -len);
+ trace_svcsock_tcp_recv_err(&svsk->sk_xprt, len);
set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
err_noclose:
return 0; /* record not complete */
}
+static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec,
+ int flags)
+{
+ return kernel_sendpage(sock, virt_to_page(vec->iov_base),
+ offset_in_page(vec->iov_base),
+ vec->iov_len, flags);
+}
+
/*
- * Send out data on TCP socket.
+ * kernel_sendpage() is used exclusively to reduce the number of
+ * copy operations in this path. Therefore the caller must ensure
+ * that the pages backing @xdr are unchanging.
+ *
+ * In addition, the logic assumes that each bvec's .bv_len is never larger
+ * than PAGE_SIZE.
+ */
+static int svc_tcp_sendmsg(struct socket *sock, struct msghdr *msg,
+ struct xdr_buf *xdr, rpc_fraghdr marker,
+ unsigned int *sentp)
+{
+ const struct kvec *head = xdr->head;
+ const struct kvec *tail = xdr->tail;
+ struct kvec rm = {
+ .iov_base = &marker,
+ .iov_len = sizeof(marker),
+ };
+ int flags, ret;
+
+ *sentp = 0;
+ xdr_alloc_bvec(xdr, GFP_KERNEL);
+
+ msg->msg_flags = MSG_MORE;
+ ret = kernel_sendmsg(sock, msg, &rm, 1, rm.iov_len);
+ if (ret < 0)
+ return ret;
+ *sentp += ret;
+ if (ret != rm.iov_len)
+ return -EAGAIN;
+
+ flags = head->iov_len < xdr->len ? MSG_MORE | MSG_SENDPAGE_NOTLAST : 0;
+ ret = svc_tcp_send_kvec(sock, head, flags);
+ if (ret < 0)
+ return ret;
+ *sentp += ret;
+ if (ret != head->iov_len)
+ goto out;
+
+ if (xdr->page_len) {
+ unsigned int offset, len, remaining;
+ struct bio_vec *bvec;
+
+ bvec = xdr->bvec + (xdr->page_base >> PAGE_SHIFT);
+ offset = offset_in_page(xdr->page_base);
+ remaining = xdr->page_len;
+ flags = MSG_MORE | MSG_SENDPAGE_NOTLAST;
+ while (remaining > 0) {
+ if (remaining <= PAGE_SIZE && tail->iov_len == 0)
+ flags = 0;
+
+ len = min(remaining, bvec->bv_len - offset);
+ ret = kernel_sendpage(sock, bvec->bv_page,
+ bvec->bv_offset + offset,
+ len, flags);
+ if (ret < 0)
+ return ret;
+ *sentp += ret;
+ if (ret != len)
+ goto out;
+ remaining -= len;
+ offset = 0;
+ bvec++;
+ }
+ }
+
+ if (tail->iov_len) {
+ ret = svc_tcp_send_kvec(sock, tail, 0);
+ if (ret < 0)
+ return ret;
+ *sentp += ret;
+ }
+
+out:
+ return 0;
+}
+
+/**
+ * svc_tcp_sendto - Send out a reply on a TCP socket
+ * @rqstp: completed svc_rqst
+ *
+ * xpt_mutex ensures @rqstp's whole message is written to the socket
+ * without interruption.
+ *
+ * Returns the number of bytes sent, or a negative errno.
*/
static int svc_tcp_sendto(struct svc_rqst *rqstp)
{
- struct xdr_buf *xbufp = &rqstp->rq_res;
- int sent;
- __be32 reclen;
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ struct xdr_buf *xdr = &rqstp->rq_res;
+ rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT |
+ (u32)xdr->len);
+ struct msghdr msg = {
+ .msg_flags = 0,
+ };
+ unsigned int sent;
+ int err;
- svc_release_skb(rqstp);
+ svc_tcp_release_rqst(rqstp);
- /* Set up the first element of the reply kvec.
- * Any other kvecs that may be in use have been taken
- * care of by the server implementation itself.
- */
- reclen = htonl(0x80000000|((xbufp->len ) - 4));
- memcpy(xbufp->head[0].iov_base, &reclen, 4);
-
- sent = svc_sendto(rqstp, &rqstp->rq_res);
- if (sent != xbufp->len) {
- printk(KERN_NOTICE
- "rpc-srv/tcp: %s: %s %d when sending %d bytes "
- "- shutting down socket\n",
- rqstp->rq_xprt->xpt_server->sv_name,
- (sent<0)?"got error":"sent only",
- sent, xbufp->len);
- set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags);
- svc_xprt_enqueue(rqstp->rq_xprt);
- sent = -EAGAIN;
- }
+ mutex_lock(&xprt->xpt_mutex);
+ if (svc_xprt_is_dead(xprt))
+ goto out_notconn;
+ err = svc_tcp_sendmsg(svsk->sk_sock, &msg, xdr, marker, &sent);
+ xdr_free_bvec(xdr);
+ trace_svcsock_tcp_send(xprt, err < 0 ? (long)err : sent);
+ if (err < 0 || sent != (xdr->len + sizeof(marker)))
+ goto out_close;
+ mutex_unlock(&xprt->xpt_mutex);
return sent;
+
+out_notconn:
+ mutex_unlock(&xprt->xpt_mutex);
+ return -ENOTCONN;
+out_close:
+ pr_notice("rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n",
+ xprt->xpt_server->sv_name,
+ (err < 0) ? "got error" : "sent",
+ (err < 0) ? err : sent, xdr->len);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ svc_xprt_enqueue(xprt);
+ mutex_unlock(&xprt->xpt_mutex);
+ return -EAGAIN;
}
static struct svc_xprt *svc_tcp_create(struct svc_serv *serv,
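
svc_tcp_sendto() frames each reply as a single final fragment: the marker is the payload length with the top bit set, and the send only counts as complete when the byte count equals the payload plus the 4-byte marker. A short standalone sketch of that framing arithmetic (hypothetical helper name, illustrative payload size):

    #include <stdint.h>
    #include <stdio.h>
    #include <arpa/inet.h>

    #define RPC_LAST_STREAM_FRAGMENT 0x80000000u

    /* Build the marker for a single-fragment reply of payload_len bytes and
     * return the total byte count expected on the wire (marker + payload). */
    static size_t frame_reply(uint32_t payload_len, uint32_t *marker_be)
    {
        *marker_be = htonl(RPC_LAST_STREAM_FRAGMENT | payload_len);
        return sizeof(*marker_be) + payload_len;
    }

    int main(void)
    {
        uint32_t marker;
        size_t wire = frame_reply(472, &marker);

        /* svc_tcp_sendto() treats the send as complete only when the number
         * of bytes actually sent equals this total. */
        printf("marker=0x%08x, wire bytes=%zu\n", ntohl(marker), wire);
        return 0;
    }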
@@ -1183,7 +1209,7 @@
.xpo_recvfrom = svc_tcp_recvfrom,
.xpo_sendto = svc_tcp_sendto,
.xpo_read_payload = svc_sock_read_payload,
- .xpo_release_rqst = svc_release_skb,
+ .xpo_release_rqst = svc_tcp_release_rqst,
.xpo_detach = svc_tcp_sock_detach,
.xpo_free = svc_sock_free,
.xpo_has_wspace = svc_tcp_has_wspace,
@@ -1221,18 +1247,16 @@
set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
if (sk->sk_state == TCP_LISTEN) {
- dprintk("setting up TCP socket for listening\n");
strcpy(svsk->sk_xprt.xpt_remotebuf, "listener");
set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
sk->sk_data_ready = svc_tcp_listen_data_ready;
set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
} else {
- dprintk("setting up TCP socket for reading\n");
sk->sk_state_change = svc_tcp_state_change;
sk->sk_data_ready = svc_data_ready;
sk->sk_write_space = svc_write_space;
- svsk->sk_reclen = 0;
+ svsk->sk_marker = xdr_zero;
svsk->sk_tcplen = 0;
svsk->sk_datalen = 0;
memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages));
@@ -1277,7 +1301,6 @@
int pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
int err = 0;
- dprintk("svc: svc_setup_socket %p\n", sock);
svsk = kzalloc(sizeof(*svsk), GFP_KERNEL);
if (!svsk)
return ERR_PTR(-ENOMEM);
@@ -1314,12 +1337,7 @@
else
svc_tcp_init(svsk, serv);
- dprintk("svc: svc_setup_socket created %p (inet %p), "
- "listen %d close %d\n",
- svsk, svsk->sk_sk,
- test_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags),
- test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));
-
+ trace_svcsock_new_socket(sock);
return svsk;
}
@@ -1411,12 +1429,6 @@
struct sockaddr *newsin = (struct sockaddr *)&addr;
int newlen;
int family;
- int val;
- RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
-
- dprintk("svc: svc_create_socket(%s, %d, %s)\n",
- serv->sv_program->pg_name, protocol,
- __svc_print_addr(sin, buf, sizeof(buf)));
if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) {
printk(KERN_WARNING "svc: only UDP and TCP "
@@ -1447,11 +1459,8 @@
* getting requests from IPv4 remotes. Those should
* be shunted to a PF_INET listener via rpcbind.
*/
- val = 1;
if (family == PF_INET6)
- kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
- (char *)&val, sizeof(val));
-
+ ip6_sock_set_v6only(sock->sk);
if (type == SOCK_STREAM)
sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */
error = kernel_bind(sock, sin, len);
@@ -1476,7 +1485,6 @@
svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen);
return (struct svc_xprt *)svsk;
bummer:
- dprintk("svc: svc_create_socket error = %d\n", -error);
sock_release(sock);
return ERR_PTR(error);
}
@@ -1490,8 +1498,6 @@
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
struct sock *sk = svsk->sk_sk;
- dprintk("svc: svc_sock_detach(%p)\n", svsk);
-
/* put back the old socket callbacks */
lock_sock(sk);
sk->sk_state_change = svsk->sk_ostate;
@@ -1508,8 +1514,6 @@
{
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
- dprintk("svc: svc_tcp_sock_detach(%p)\n", svsk);
-
svc_sock_detach(xprt);
if (!test_bit(XPT_LISTENER, &xprt->xpt_flags)) {
@@ -1524,7 +1528,6 @@
static void svc_sock_free(struct svc_xprt *xprt)
{
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
- dprintk("svc: svc_sock_free(%p)\n", svsk);
if (svsk->sk_sock->file)
sockfd_put(svsk->sk_sock);
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index d75f17b..3aad6ef 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -60,25 +60,32 @@
}
static int proc_do_xprt(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
char tmpbuf[256];
- size_t len;
+ ssize_t len;
- if ((*ppos && !write) || !*lenp) {
+ if (write || *ppos) {
*lenp = 0;
return 0;
}
len = svc_print_xprts(tmpbuf, sizeof(tmpbuf));
- return simple_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len);
+ len = memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len);
+
+ if (len < 0) {
+ *lenp = 0;
+ return -EINVAL;
+ }
+ *lenp = len;
+ return 0;
}
static int
-proc_dodebug(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+proc_dodebug(struct ctl_table *table, int write, void *buffer, size_t *lenp,
+ loff_t *ppos)
{
- char tmpbuf[20], c, *s = NULL;
- char __user *p;
+ char tmpbuf[20], *s = NULL;
+ char *p;
unsigned int value;
size_t left, len;
@@ -90,18 +97,17 @@
left = *lenp;
if (write) {
- if (!access_ok(buffer, left))
- return -EFAULT;
p = buffer;
- while (left && __get_user(c, p) >= 0 && isspace(c))
- left--, p++;
+ while (left && isspace(*p)) {
+ left--;
+ p++;
+ }
if (!left)
goto done;
if (left > sizeof(tmpbuf) - 1)
return -EINVAL;
- if (copy_from_user(tmpbuf, p, left))
- return -EFAULT;
+ memcpy(tmpbuf, p, left);
tmpbuf[left] = '\0';
value = simple_strtol(tmpbuf, &s, 0);
@@ -109,8 +115,10 @@
left -= (s - tmpbuf);
if (left && !isspace(*s))
return -EINVAL;
- while (left && isspace(*s))
- left--, s++;
+ while (left && isspace(*s)) {
+ left--;
+ s++;
+ }
} else
left = 0;
*(unsigned int *) table->data = value;
@@ -121,11 +129,9 @@
len = sprintf(tmpbuf, "0x%04x", *(unsigned int *) table->data);
if (len > left)
len = left;
- if (copy_to_user(buffer, tmpbuf, len))
- return -EFAULT;
+ memcpy(buffer, tmpbuf, len);
if ((left -= len) > 0) {
- if (put_user('\n', (char __user *)buffer + len))
- return -EFAULT;
+ *((char *)buffer + len) = '\n';
left--;
}
}
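
The rewritten proc_dodebug() now parses the buffer it is handed directly: skip leading whitespace, convert the number with base 0 (so "0x..." hex masks work), then tolerate trailing whitespace. A userspace sketch of parsing such a mask with strtoul(); note this sketch is deliberately stricter than the kernel helper in that it rejects any non-space text after the number:

    #include <ctype.h>
    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Parse strings like "  0x0010\n" or "16": leading whitespace, a number
     * in any base strtoul() recognizes with base 0, optional trailing
     * whitespace. Rejects other trailing text (stricter than the kernel). */
    static int parse_debug_mask(const char *s, unsigned int *value)
    {
        char *end;
        unsigned long v;

        while (*s && isspace((unsigned char)*s))
            s++;
        if (!*s)
            return -EINVAL;
        errno = 0;
        v = strtoul(s, &end, 0);  /* base 0: "0x..." hex, "0..." octal, decimal */
        if (errno || end == s)
            return -EINVAL;
        while (*end && isspace((unsigned char)*end))
            end++;
        if (*end)
            return -EINVAL;
        *value = (unsigned int)v;
        return 0;
    }

    int main(void)
    {
        unsigned int mask;

        printf("%d\n", parse_debug_mask(" 0x0010\n", &mask));  /* 0, mask == 16 */
        printf("%d\n", parse_debug_mask("0x10 junk", &mask));  /* -EINVAL (-22) */
        return 0;
    }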
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 7ef3705..71e03b9 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -19,6 +19,9 @@
#include <linux/bvec.h>
#include <trace/events/sunrpc.h>
+static void _copy_to_pages(struct page **, size_t, const char *, size_t);
+
+
/*
* XDR functions for basic NFS types
*/
@@ -202,6 +205,88 @@
*/
/**
+ * _shift_data_left_pages
+ * @pages: vector of pages containing both the source and dest memory area.
+ * @pgto_base: page vector address of destination
+ * @pgfrom_base: page vector address of source
+ * @len: number of bytes to copy
+ *
+ * Note: the addresses pgto_base and pgfrom_base are both calculated in
+ * the same way:
+ * if a memory area starts at byte 'base' in page 'pages[i]',
+ * then its address is given as (i << PAGE_SHIFT) + base
+ * Also note: pgto_base must be < pgfrom_base, but the memory areas
+ * they point to may overlap.
+ */
+static void
+_shift_data_left_pages(struct page **pages, size_t pgto_base,
+ size_t pgfrom_base, size_t len)
+{
+ struct page **pgfrom, **pgto;
+ char *vfrom, *vto;
+ size_t copy;
+
+ BUG_ON(pgfrom_base <= pgto_base);
+
+ pgto = pages + (pgto_base >> PAGE_SHIFT);
+ pgfrom = pages + (pgfrom_base >> PAGE_SHIFT);
+
+ pgto_base &= ~PAGE_MASK;
+ pgfrom_base &= ~PAGE_MASK;
+
+ do {
+ if (pgto_base >= PAGE_SIZE) {
+ pgto_base = 0;
+ pgto++;
+ }
+ if (pgfrom_base >= PAGE_SIZE) {
+ pgfrom_base = 0;
+ pgfrom++;
+ }
+
+ copy = len;
+ if (copy > (PAGE_SIZE - pgto_base))
+ copy = PAGE_SIZE - pgto_base;
+ if (copy > (PAGE_SIZE - pgfrom_base))
+ copy = PAGE_SIZE - pgfrom_base;
+
+ vto = kmap_atomic(*pgto);
+ if (*pgto != *pgfrom) {
+ vfrom = kmap_atomic(*pgfrom);
+ memcpy(vto + pgto_base, vfrom + pgfrom_base, copy);
+ kunmap_atomic(vfrom);
+ } else
+ memmove(vto + pgto_base, vto + pgfrom_base, copy);
+ flush_dcache_page(*pgto);
+ kunmap_atomic(vto);
+
+ pgto_base += copy;
+ pgfrom_base += copy;
+
+ } while ((len -= copy) != 0);
+}
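
The comment above defines a page-vector address as (i << PAGE_SHIFT) + base, and the first lines of the function split such an address back into a page index (base >> PAGE_SHIFT) and an in-page offset (base & ~PAGE_MASK). A standalone sketch of that split, assuming 4 KiB pages:

    #include <stdio.h>
    #include <stddef.h>

    #define PAGE_SHIFT 12                     /* assumption: 4 KiB pages */
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))

    /* Split a page-vector address into (page index, offset within page),
     * as the "pages + (base >> PAGE_SHIFT)" / "base &= ~PAGE_MASK" pair at
     * the top of _shift_data_left_pages() does. */
    static void split_pgaddr(size_t addr, size_t *index, size_t *offset)
    {
        *index  = addr >> PAGE_SHIFT;
        *offset = addr & ~PAGE_MASK;
    }

    int main(void)
    {
        size_t idx, off;

        split_pgaddr(3 * PAGE_SIZE + 100, &idx, &off);
        printf("page %zu, offset %zu\n", idx, off);  /* page 3, offset 100 */
        return 0;
    }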
+
+static void
+_shift_data_left_tail(struct xdr_buf *buf, unsigned int pgto, size_t len)
+{
+ struct kvec *tail = buf->tail;
+
+ if (len > tail->iov_len)
+ len = tail->iov_len;
+
+ _copy_to_pages(buf->pages,
+ buf->page_base + pgto,
+ (char *)tail->iov_base,
+ len);
+ tail->iov_len -= len;
+
+ if (tail->iov_len > 0)
+ memmove((char *)tail->iov_base,
+ tail->iov_base + len,
+ tail->iov_len);
+}
+
+/**
* _shift_data_right_pages
* @pages: vector of pages containing both the source and dest memory area.
* @pgto_base: page vector address of destination
@@ -266,6 +351,46 @@
} while ((len -= copy) != 0);
}
+static unsigned int
+_shift_data_right_tail(struct xdr_buf *buf, unsigned int pgfrom, size_t len)
+{
+ struct kvec *tail = buf->tail;
+ unsigned int tailbuf_len;
+ unsigned int result = 0;
+ size_t copy;
+
+ tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len;
+
+ /* Shift the tail first */
+ if (tailbuf_len != 0) {
+ unsigned int free_space = tailbuf_len - tail->iov_len;
+
+ if (len < free_space)
+ free_space = len;
+ if (len > free_space)
+ len = free_space;
+
+ tail->iov_len += free_space;
+ copy = len;
+
+ if (tail->iov_len > len) {
+ char *p = (char *)tail->iov_base + len;
+ memmove(p, tail->iov_base, tail->iov_len - free_space);
+ result += tail->iov_len - free_space;
+ } else
+ copy = tail->iov_len;
+
+ /* Copy from the inlined pages into the tail */
+ _copy_from_pages((char *)tail->iov_base,
+ buf->pages,
+ buf->page_base + pgfrom,
+ copy);
+ result += copy;
+ }
+
+ return result;
+}
+
/**
* _copy_to_pages
* @pages: array of pages
@@ -351,6 +476,38 @@
EXPORT_SYMBOL_GPL(_copy_from_pages);
/**
+ * _zero_pages
+ * @pages: array of pages
+ * @pgbase: beginning page vector address
+ * @len: length
+ */
+static void
+_zero_pages(struct page **pages, size_t pgbase, size_t len)
+{
+ struct page **page;
+ char *vpage;
+ size_t zero;
+
+ page = pages + (pgbase >> PAGE_SHIFT);
+ pgbase &= ~PAGE_MASK;
+
+ do {
+ zero = PAGE_SIZE - pgbase;
+ if (zero > len)
+ zero = len;
+
+ vpage = kmap_atomic(*page);
+ memset(vpage + pgbase, 0, zero);
+ kunmap_atomic(vpage);
+
+ flush_dcache_page(*page);
+ pgbase = 0;
+ page++;
+
+ } while ((len -= zero) != 0);
+}
+
+/**
* xdr_shrink_bufhead
* @buf: xdr_buf
* @len: bytes to remove from buf->head[0]
@@ -446,39 +603,13 @@
static unsigned int
xdr_shrink_pagelen(struct xdr_buf *buf, size_t len)
{
- struct kvec *tail;
- size_t copy;
unsigned int pglen = buf->page_len;
- unsigned int tailbuf_len;
unsigned int result;
- result = 0;
- tail = buf->tail;
if (len > buf->page_len)
len = buf-> page_len;
- tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len;
- /* Shift the tail first */
- if (tailbuf_len != 0) {
- unsigned int free_space = tailbuf_len - tail->iov_len;
-
- if (len < free_space)
- free_space = len;
- tail->iov_len += free_space;
-
- copy = len;
- if (tail->iov_len > len) {
- char *p = (char *)tail->iov_base + len;
- memmove(p, tail->iov_base, tail->iov_len - len);
- result += tail->iov_len - len;
- } else
- copy = tail->iov_len;
- /* Copy from the inlined pages into the tail */
- _copy_from_pages((char *)tail->iov_base,
- buf->pages, buf->page_base + pglen - len,
- copy);
- result += copy;
- }
+ result = _shift_data_right_tail(buf, pglen - len, len);
buf->page_len -= len;
buf->buflen -= len;
/* Have we truncated the message? */
@@ -506,6 +637,19 @@
EXPORT_SYMBOL_GPL(xdr_stream_pos);
/**
+ * xdr_page_pos - Return the current offset from the start of the xdr pages
+ * @xdr: pointer to struct xdr_stream
+ */
+unsigned int xdr_page_pos(const struct xdr_stream *xdr)
+{
+ unsigned int pos = xdr_stream_pos(xdr);
+
+ WARN_ON(pos < xdr->buf->head[0].iov_len);
+ return pos - xdr->buf->head[0].iov_len;
+}
+EXPORT_SYMBOL_GPL(xdr_page_pos);
+
+/**
* xdr_init_encode - Initialize a struct xdr_stream for sending data.
* @xdr: pointer to xdr_stream struct
* @buf: pointer to XDR buffer in which to encode data
@@ -648,6 +792,51 @@
}
EXPORT_SYMBOL_GPL(xdr_reserve_space);
+
+/**
+ * xdr_reserve_space_vec - Reserves a large amount of buffer space for sending
+ * @xdr: pointer to xdr_stream
+ * @vec: pointer to a kvec array
+ * @nbytes: number of bytes to reserve
+ *
+ * Reserves enough buffer space to encode 'nbytes' of data and stores the
+ * pointers in 'vec'. The size argument passed to xdr_reserve_space() is
+ * determined based on the number of bytes remaining in the current page to
+ * avoid invalidating iov_base pointers when xdr_commit_encode() is called.
+ */
+int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbytes)
+{
+ int thislen;
+ int v = 0;
+ __be32 *p;
+
+ /*
+ * svcrdma requires every READ payload to start somewhere
+ * in xdr->pages.
+ */
+ if (xdr->iov == xdr->buf->head) {
+ xdr->iov = NULL;
+ xdr->end = xdr->p;
+ }
+
+ while (nbytes) {
+ thislen = xdr->buf->page_len % PAGE_SIZE;
+ thislen = min_t(size_t, nbytes, PAGE_SIZE - thislen);
+
+ p = xdr_reserve_space(xdr, thislen);
+ if (!p)
+ return -EIO;
+
+ vec[v].iov_base = p;
+ vec[v].iov_len = thislen;
+ v++;
+ nbytes -= thislen;
+ }
+
+ return v;
+}
+EXPORT_SYMBOL_GPL(xdr_reserve_space_vec);
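
xdr_reserve_space_vec() sizes each reservation so it never crosses a page boundary, which keeps the returned iov_base pointers stable across xdr_commit_encode(). A simplified model of just that chunking arithmetic (the real function also performs the reservation via xdr_reserve_space() and can fail with -EIO; a PAGE_SIZE of 4096 is assumed):

    #include <stdio.h>
    #include <stddef.h>

    #define PAGE_SIZE 4096UL  /* assumption */

    static size_t min_sz(size_t a, size_t b) { return a < b ? a : b; }

    /* Model the chunk sizes the loop in xdr_reserve_space_vec() produces:
     * each kvec stops at the next page boundary of the growing page_len. */
    static int reserve_chunks(size_t page_len, size_t nbytes)
    {
        int v = 0;

        while (nbytes) {
            size_t used = page_len % PAGE_SIZE;
            size_t thislen = min_sz(nbytes, PAGE_SIZE - used);

            printf("vec[%d].iov_len = %zu\n", v, thislen);
            page_len += thislen;
            nbytes -= thislen;
            v++;
        }
        return v;  /* number of kvecs consumed */
    }

    int main(void)
    {
        /* 1000 bytes already encoded into the page area, reserve 10000 more:
         * chunks of 3096, 4096 and 2808 bytes across three kvecs. */
        reserve_chunks(1000, 10000);
        return 0;
    }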
+
/**
* xdr_truncate_encode - truncate an encode buffer
* @xdr: pointer to xdr_stream
@@ -658,7 +847,7 @@
* head, tail, and page lengths are adjusted to correspond.
*
* If this means moving xdr->p to a different buffer, we assume that
- * that the end pointer should be set to the end of the current page,
+ * the end pointer should be set to the end of the current page,
* except in the case of the head buffer when we assume the head
* buffer's current length represents the end of the available buffer.
*
@@ -825,6 +1014,13 @@
return 0;
}
+static void xdr_set_page(struct xdr_stream *xdr, unsigned int base,
+ unsigned int len)
+{
+ if (xdr_set_page_base(xdr, base, len) < 0)
+ xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
+}
+
static void xdr_set_next_page(struct xdr_stream *xdr)
{
unsigned int newbase;
@@ -832,8 +1028,7 @@
newbase = (1 + xdr->page_ptr - xdr->buf->pages) << PAGE_SHIFT;
newbase -= xdr->buf->page_base;
- if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0)
- xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
+ xdr_set_page(xdr, newbase, PAGE_SIZE);
}
static bool xdr_set_next_buffer(struct xdr_stream *xdr)
@@ -841,8 +1036,7 @@
if (xdr->page_ptr != NULL)
xdr_set_next_page(xdr);
else if (xdr->iov == xdr->buf->head) {
- if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0)
- xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
+ xdr_set_page(xdr, 0, PAGE_SIZE);
}
return xdr->p != xdr->end;
}
@@ -979,10 +1173,25 @@
}
EXPORT_SYMBOL_GPL(xdr_inline_decode);
+static void xdr_realign_pages(struct xdr_stream *xdr)
+{
+ struct xdr_buf *buf = xdr->buf;
+ struct kvec *iov = buf->head;
+ unsigned int cur = xdr_stream_pos(xdr);
+ unsigned int copied, offset;
+
+ /* Realign pages to current pointer position */
+ if (iov->iov_len > cur) {
+ offset = iov->iov_len - cur;
+ copied = xdr_shrink_bufhead(buf, offset);
+ trace_rpc_xdr_alignment(xdr, offset, copied);
+ xdr->nwords = XDR_QUADLEN(buf->len - cur);
+ }
+}
+
static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
{
struct xdr_buf *buf = xdr->buf;
- struct kvec *iov;
unsigned int nwords = XDR_QUADLEN(len);
unsigned int cur = xdr_stream_pos(xdr);
unsigned int copied, offset;
@@ -990,15 +1199,7 @@
if (xdr->nwords == 0)
return 0;
- /* Realign pages to current pointer position */
- iov = buf->head;
- if (iov->iov_len > cur) {
- offset = iov->iov_len - cur;
- copied = xdr_shrink_bufhead(buf, offset);
- trace_rpc_xdr_alignment(xdr, offset, copied);
- xdr->nwords = XDR_QUADLEN(buf->len - cur);
- }
-
+ xdr_realign_pages(xdr);
if (nwords > xdr->nwords) {
nwords = xdr->nwords;
len = nwords << 2;
@@ -1057,6 +1258,79 @@
}
EXPORT_SYMBOL_GPL(xdr_read_pages);
+uint64_t xdr_align_data(struct xdr_stream *xdr, uint64_t offset, uint32_t length)
+{
+ struct xdr_buf *buf = xdr->buf;
+ unsigned int from, bytes;
+ unsigned int shift = 0;
+
+ if ((offset + length) < offset ||
+ (offset + length) > buf->page_len)
+ length = buf->page_len - offset;
+
+ xdr_realign_pages(xdr);
+ from = xdr_page_pos(xdr);
+ bytes = xdr->nwords << 2;
+ if (length < bytes)
+ bytes = length;
+
+ /* Move page data to the left */
+ if (from > offset) {
+ shift = min_t(unsigned int, bytes, buf->page_len - from);
+ _shift_data_left_pages(buf->pages,
+ buf->page_base + offset,
+ buf->page_base + from,
+ shift);
+ bytes -= shift;
+
+ /* Move tail data into the pages, if necessary */
+ if (bytes > 0)
+ _shift_data_left_tail(buf, offset + shift, bytes);
+ }
+
+ xdr->nwords -= XDR_QUADLEN(length);
+ xdr_set_page(xdr, from + length, PAGE_SIZE);
+ return length;
+}
+EXPORT_SYMBOL_GPL(xdr_align_data);
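
Both xdr_align_data() above and xdr_expand_hole() below start by clamping the (offset, length) pair against the page buffer, with the (offset + length) < offset comparison catching unsigned wrap-around. A standalone sketch of that guard (it assumes, as the in-kernel callers are expected to ensure, that offset itself lies within the buffer):

    #include <stdint.h>
    #include <stdio.h>

    /* Clamp length so that [offset, offset + length) stays inside buffer_len,
     * treating unsigned wrap-around of offset + length as "too large". */
    static uint64_t clamp_range(uint64_t offset, uint64_t length,
                                uint64_t buffer_len)
    {
        if (offset + length < offset ||      /* wrapped around */
            offset + length > buffer_len)
            length = buffer_len - offset;
        return length;
    }

    int main(void)
    {
        printf("%llu\n", (unsigned long long)clamp_range(100, 50, 4096));
        /* 50: already in range */
        printf("%llu\n", (unsigned long long)clamp_range(100, ~0ULL, 4096));
        /* 3996: the addition wrapped, so clamp to what fits after offset */
        return 0;
    }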
+
+uint64_t xdr_expand_hole(struct xdr_stream *xdr, uint64_t offset, uint64_t length)
+{
+ struct xdr_buf *buf = xdr->buf;
+ unsigned int bytes;
+ unsigned int from;
+ unsigned int truncated = 0;
+
+ if ((offset + length) < offset ||
+ (offset + length) > buf->page_len)
+ length = buf->page_len - offset;
+
+ xdr_realign_pages(xdr);
+ from = xdr_page_pos(xdr);
+ bytes = xdr->nwords << 2;
+
+ if (offset + length + bytes > buf->page_len) {
+ unsigned int shift = (offset + length + bytes) - buf->page_len;
+ unsigned int res = _shift_data_right_tail(buf, from + bytes - shift, shift);
+ truncated = shift - res;
+ xdr->nwords -= XDR_QUADLEN(truncated);
+ bytes -= shift;
+ }
+
+ /* Now move the page data over and zero pages */
+ if (bytes > 0)
+ _shift_data_right_pages(buf->pages,
+ buf->page_base + offset + length,
+ buf->page_base + from,
+ bytes);
+ _zero_pages(buf->pages, buf->page_base + offset, length);
+
+ buf->len += length - (from - offset) - truncated;
+ xdr_set_page(xdr, offset + length, PAGE_SIZE);
+ return length;
+}
+EXPORT_SYMBOL_GPL(xdr_expand_hole);
+
/**
* xdr_enter_page - decode data from the XDR page
* @xdr: pointer to xdr_stream struct
@@ -1079,7 +1353,7 @@
}
EXPORT_SYMBOL_GPL(xdr_enter_page);
-static struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0};
+static const struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0};
void
xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf)
@@ -1280,61 +1554,6 @@
}
EXPORT_SYMBOL_GPL(xdr_encode_word);
-/**
- * xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf
- * @buf: pointer to buffer containing a mic
- * @mic: on success, returns the address of the mic
- * @offset: the offset in buf where mic may be found
- *
- * This function may modify the xdr buf if the mic is found to be straddling
- * a boundary between head, pages, and tail. On success the mic can be read
- * from the address returned. There is no need to free the mic.
- *
- * Return: Success returns 0, otherwise an integer error.
- */
-int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset)
-{
- struct xdr_buf subbuf;
- unsigned int boundary;
-
- if (xdr_decode_word(buf, offset, &mic->len))
- return -EFAULT;
- offset += 4;
-
- /* Is the mic partially in the head? */
- boundary = buf->head[0].iov_len;
- if (offset < boundary && (offset + mic->len) > boundary)
- xdr_shift_buf(buf, boundary - offset);
-
- /* Is the mic partially in the pages? */
- boundary += buf->page_len;
- if (offset < boundary && (offset + mic->len) > boundary)
- xdr_shrink_pagelen(buf, boundary - offset);
-
- if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len))
- return -EFAULT;
-
- /* Is the mic contained entirely in the head? */
- mic->data = subbuf.head[0].iov_base;
- if (subbuf.head[0].iov_len == mic->len)
- return 0;
- /* ..or is the mic contained entirely in the tail? */
- mic->data = subbuf.tail[0].iov_base;
- if (subbuf.tail[0].iov_len == mic->len)
- return 0;
-
- /* Find a contiguous area in @buf to hold all of @mic */
- if (mic->len > buf->buflen - buf->len)
- return -ENOMEM;
- if (buf->tail[0].iov_len != 0)
- mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len;
- else
- mic->data = buf->head[0].iov_base + buf->head[0].iov_len;
- __read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len);
- return 0;
-}
-EXPORT_SYMBOL_GPL(xdr_buf_read_mic);
-
/* Returns 0 on success, or else a negative error code. */
static int
xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 3653898..04aaca4 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -70,6 +70,7 @@
static void xprt_init(struct rpc_xprt *xprt, struct net *net);
static __be32 xprt_alloc_xid(struct rpc_xprt *xprt);
static void xprt_destroy(struct rpc_xprt *xprt);
+static void xprt_request_init(struct rpc_task *task);
static DEFINE_SPINLOCK(xprt_list_lock);
static LIST_HEAD(xprt_list);
@@ -238,20 +239,20 @@
if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
if (task == xprt->snd_task)
- return 1;
+ goto out_locked;
goto out_sleep;
}
if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
goto out_unlock;
xprt->snd_task = task;
+out_locked:
+ trace_xprt_reserve_xprt(xprt, task);
return 1;
out_unlock:
xprt_clear_locked(xprt);
out_sleep:
- dprintk("RPC: %5u failed to lock transport %p\n",
- task->tk_pid, xprt);
task->tk_status = -EAGAIN;
if (RPC_IS_SOFT(task))
rpc_sleep_on_timeout(&xprt->sending, task, NULL,
@@ -302,23 +303,22 @@
if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
if (task == xprt->snd_task)
- return 1;
+ goto out_locked;
goto out_sleep;
}
if (req == NULL) {
xprt->snd_task = task;
- return 1;
+ goto out_locked;
}
if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
goto out_unlock;
if (!xprt_need_congestion_window_wait(xprt)) {
xprt->snd_task = task;
- return 1;
+ goto out_locked;
}
out_unlock:
xprt_clear_locked(xprt);
out_sleep:
- dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt);
task->tk_status = -EAGAIN;
if (RPC_IS_SOFT(task))
rpc_sleep_on_timeout(&xprt->sending, task, NULL,
@@ -326,6 +326,9 @@
else
rpc_sleep_on(&xprt->sending, task, NULL);
return 0;
+out_locked:
+ trace_xprt_reserve_cong(xprt, task);
+ return 1;
}
EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong);
@@ -390,6 +393,7 @@
xprt_clear_locked(xprt);
__xprt_lock_write_next(xprt);
}
+ trace_xprt_release_xprt(xprt, task);
}
EXPORT_SYMBOL_GPL(xprt_release_xprt);
@@ -407,6 +411,7 @@
xprt_clear_locked(xprt);
__xprt_lock_write_next_cong(xprt);
}
+ trace_xprt_release_cong(xprt, task);
}
EXPORT_SYMBOL_GPL(xprt_release_xprt_cong);
@@ -428,8 +433,7 @@
{
if (req->rq_cong)
return 1;
- dprintk("RPC: %5u xprt_cwnd_limited cong = %lu cwnd = %lu\n",
- req->rq_task->tk_pid, xprt->cong, xprt->cwnd);
+ trace_xprt_get_cong(xprt, req->rq_task);
if (RPCXPRT_CONGESTED(xprt)) {
xprt_set_congestion_window_wait(xprt);
return 0;
@@ -451,6 +455,7 @@
req->rq_cong = 0;
xprt->cong -= RPC_CWNDSCALE;
xprt_test_and_clear_congestion_window_wait(xprt);
+ trace_xprt_put_cong(xprt, req->rq_task);
__xprt_lock_write_next_cong(xprt);
}
@@ -636,6 +641,11 @@
req->rq_majortimeo += xprt_calc_majortimeo(req);
}
+static void xprt_reset_minortimeo(struct rpc_rqst *req)
+{
+ req->rq_minortimeo += req->rq_timeout;
+}
+
static void xprt_init_majortimeo(struct rpc_task *task, struct rpc_rqst *req)
{
unsigned long time_init;
@@ -647,6 +657,7 @@
time_init = xprt_abs_ktime_to_jiffies(task->tk_start);
req->rq_timeout = task->tk_client->cl_timeout->to_initval;
req->rq_majortimeo = time_init + xprt_calc_majortimeo(req);
+ req->rq_minortimeo = time_init + req->rq_timeout;
}
/**
@@ -661,6 +672,8 @@
int status = 0;
if (time_before(jiffies, req->rq_majortimeo)) {
+ if (time_before(jiffies, req->rq_minortimeo))
+ return status;
if (to->to_exponential)
req->rq_timeout <<= 1;
else
@@ -678,6 +691,7 @@
spin_unlock(&xprt->transport_lock);
status = -ETIMEDOUT;
}
+ xprt_reset_minortimeo(req);
if (req->rq_timeout == 0) {
printk(KERN_WARNING "xprt_adjust_timeout: rq_timeout = 0!\n");
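
With rq_minortimeo in place, xprt_adjust_timeout() grows rq_timeout at most once per minor-timeout interval: the retransmit timeout doubles when to_exponential is set (or grows by to_increment otherwise), is capped at to_maxval, and the call returns -ETIMEDOUT only once the major timeout has passed. A simplified model of that backoff sequence (hypothetical struct and example values; the real code also tracks jiffies and the major/minor timeout stamps):

    #include <stdio.h>

    /* Simplified model of the per-retransmit timeout growth performed by
     * xprt_adjust_timeout(). */
    struct rpc_timeout_model {
        unsigned long to_maxval;      /* upper bound on a single timeout */
        unsigned long to_increment;   /* linear step when not exponential */
        int to_exponential;           /* double instead of adding */
    };

    static unsigned long next_timeout(const struct rpc_timeout_model *to,
                                      unsigned long cur)
    {
        unsigned long next = to->to_exponential ? cur << 1
                                                : cur + to->to_increment;
        return next > to->to_maxval ? to->to_maxval : next;
    }

    int main(void)
    {
        /* example values: 5 s initial timeout, doubling, 60 s ceiling */
        struct rpc_timeout_model to = { .to_maxval = 60, .to_exponential = 1 };
        unsigned long t = 5;
        int i;

        for (i = 0; i < 6; i++) {
            printf("retransmit %d after %lus\n", i + 1, t);
            t = next_timeout(&to, t);  /* 5, 10, 20, 40, 60, 60 */
        }
        return 0;
    }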
@@ -692,6 +706,7 @@
container_of(work, struct rpc_xprt, task_cleanup);
unsigned int pflags = memalloc_nofs_save();
+ trace_xprt_disconnect_auto(xprt);
clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
xprt->ops->close(xprt);
xprt_release_write(xprt, NULL);
@@ -706,7 +721,7 @@
*/
void xprt_disconnect_done(struct rpc_xprt *xprt)
{
- dprintk("RPC: disconnected transport %p\n", xprt);
+ trace_xprt_disconnect_done(xprt);
spin_lock(&xprt->transport_lock);
xprt_clear_connected(xprt);
xprt_clear_write_space_locked(xprt);
@@ -723,6 +738,8 @@
*/
void xprt_force_disconnect(struct rpc_xprt *xprt)
{
+ trace_xprt_disconnect_force(xprt);
+
/* Don't race with the test_bit() in xprt_clear_locked() */
spin_lock(&xprt->transport_lock);
set_bit(XPRT_CLOSE_WAIT, &xprt->state);
@@ -827,6 +844,7 @@
spin_unlock(&xprt->transport_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(xprt_lock_connect);
void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie)
{
@@ -843,6 +861,7 @@
spin_unlock(&xprt->transport_lock);
wake_up_bit(&xprt->state, XPRT_LOCKED);
}
+EXPORT_SYMBOL_GPL(xprt_unlock_connect);
/**
* xprt_connect - schedule a transport connect operation
@@ -853,8 +872,7 @@
{
struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
- dprintk("RPC: %5u xprt_connect xprt %p %s connected\n", task->tk_pid,
- xprt, (xprt_connected(xprt) ? "is" : "is not"));
+ trace_xprt_connect(xprt);
if (!xprt_bound(xprt)) {
task->tk_status = -EAGAIN;
@@ -863,8 +881,10 @@
if (!xprt_lock_write(xprt, task))
return;
- if (test_and_clear_bit(XPRT_CLOSE_WAIT, &xprt->state))
+ if (test_and_clear_bit(XPRT_CLOSE_WAIT, &xprt->state)) {
+ trace_xprt_disconnect_cleanup(xprt);
xprt->ops->close(xprt);
+ }
if (!xprt_connected(xprt)) {
task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie;
@@ -1148,10 +1168,6 @@
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
- dprintk("RPC: %5u xid %08x complete (%d bytes received)\n",
- task->tk_pid, ntohl(req->rq_xid), copied);
- trace_xprt_complete_rqst(xprt, req->rq_xid, copied);
-
xprt->stat.recvs++;
req->rq_private_buf.len = copied;
@@ -1288,7 +1304,6 @@
/* Note: req is added _before_ pos */
list_add_tail(&req->rq_xmit, &pos->rq_xmit);
INIT_LIST_HEAD(&req->rq_xmit2);
- trace_xprt_enq_xmit(task, 1);
goto out;
}
} else if (RPC_IS_SWAPPER(task)) {
@@ -1300,7 +1315,6 @@
/* Note: req is added _before_ pos */
list_add_tail(&req->rq_xmit, &pos->rq_xmit);
INIT_LIST_HEAD(&req->rq_xmit2);
- trace_xprt_enq_xmit(task, 2);
goto out;
}
} else if (!req->rq_seqno) {
@@ -1309,13 +1323,11 @@
continue;
list_add_tail(&req->rq_xmit2, &pos->rq_xmit2);
INIT_LIST_HEAD(&req->rq_xmit);
- trace_xprt_enq_xmit(task, 3);
goto out;
}
}
list_add_tail(&req->rq_xmit, &xprt->xmit_queue);
INIT_LIST_HEAD(&req->rq_xmit2);
- trace_xprt_enq_xmit(task, 4);
out:
set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate);
spin_unlock(&xprt->queue_lock);
@@ -1433,8 +1445,6 @@
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
- dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid);
-
if (!xprt_lock_write(xprt, task)) {
/* Race breaker: someone may have transmitted us */
if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
@@ -1448,7 +1458,10 @@
void xprt_end_transmit(struct rpc_task *task)
{
- xprt_release_write(task->tk_rqstp->rq_xprt, task);
+ struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
+
+ xprt_inject_disconnect(xprt);
+ xprt_release_write(xprt, task);
}
/**
@@ -1493,6 +1506,7 @@
*/
req->rq_ntrans++;
+ trace_rpc_xdr_sendto(task, &req->rq_snd_buf);
connect_cookie = xprt->connect_cookie;
status = xprt->ops->send_request(req);
if (status != 0) {
@@ -1538,15 +1552,14 @@
{
struct rpc_rqst *next, *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
- int counter, status;
+ int status;
spin_lock(&xprt->queue_lock);
- counter = 0;
- while (!list_empty(&xprt->xmit_queue)) {
- if (++counter == 20)
+ for (;;) {
+ next = list_first_entry_or_null(&xprt->xmit_queue,
+ struct rpc_rqst, rq_xmit);
+ if (!next)
break;
- next = list_first_entry(&xprt->xmit_queue,
- struct rpc_rqst, rq_xmit);
xprt_pin_rqst(next);
spin_unlock(&xprt->queue_lock);
status = xprt_request_transmit(next, task);
@@ -1554,28 +1567,54 @@
status = 0;
spin_lock(&xprt->queue_lock);
xprt_unpin_rqst(next);
- if (status == 0) {
- if (!xprt_request_data_received(task) ||
- test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
- continue;
- } else if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
- task->tk_status = status;
- break;
+ if (status < 0) {
+ if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
+ task->tk_status = status;
+ break;
+ }
+ /* Was @task transmitted, and has it received a reply? */
+ if (xprt_request_data_received(task) &&
+ !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
+ break;
+ cond_resched_lock(&xprt->queue_lock);
}
spin_unlock(&xprt->queue_lock);
}
-static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task)
+static void xprt_complete_request_init(struct rpc_task *task)
{
- set_bit(XPRT_CONGESTED, &xprt->state);
- rpc_sleep_on(&xprt->backlog, task, NULL);
+ if (task->tk_rqstp)
+ xprt_request_init(task);
}
-static void xprt_wake_up_backlog(struct rpc_xprt *xprt)
+void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task)
{
- if (rpc_wake_up_next(&xprt->backlog) == NULL)
- clear_bit(XPRT_CONGESTED, &xprt->state);
+ set_bit(XPRT_CONGESTED, &xprt->state);
+ rpc_sleep_on(&xprt->backlog, task, xprt_complete_request_init);
}
+EXPORT_SYMBOL_GPL(xprt_add_backlog);
+
+static bool __xprt_set_rq(struct rpc_task *task, void *data)
+{
+ struct rpc_rqst *req = data;
+
+ if (task->tk_rqstp == NULL) {
+ memset(req, 0, sizeof(*req)); /* mark unused */
+ task->tk_rqstp = req;
+ return true;
+ }
+ return false;
+}
+
+bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req)
+{
+ if (rpc_wake_up_first(&xprt->backlog, __xprt_set_rq, req) == NULL) {
+ clear_bit(XPRT_CONGESTED, &xprt->state);
+ return false;
+ }
+ return true;
+}
+EXPORT_SYMBOL_GPL(xprt_wake_up_backlog);
static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task)
{
@@ -1585,7 +1624,7 @@
goto out;
spin_lock(&xprt->reserve_lock);
if (test_bit(XPRT_CONGESTED, &xprt->state)) {
- rpc_sleep_on(&xprt->backlog, task, NULL);
+ xprt_add_backlog(xprt, task);
ret = true;
}
spin_unlock(&xprt->reserve_lock);
@@ -1643,7 +1682,7 @@
case -EAGAIN:
xprt_add_backlog(xprt, task);
dprintk("RPC: waiting for request slot\n");
- /* fall through */
+ fallthrough;
default:
task->tk_status = -EAGAIN;
}
@@ -1662,11 +1701,11 @@
void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
spin_lock(&xprt->reserve_lock);
- if (!xprt_dynamic_free_slot(xprt, req)) {
+ if (!xprt_wake_up_backlog(xprt, req) &&
+ !xprt_dynamic_free_slot(xprt, req)) {
memset(req, 0, sizeof(*req)); /* mark unused */
list_add(&req->rq_list, &xprt->free);
}
- xprt_wake_up_backlog(xprt);
spin_unlock(&xprt->reserve_lock);
}
EXPORT_SYMBOL_GPL(xprt_free_slot);
@@ -1767,8 +1806,8 @@
req->rq_rcv_buf.bvec = NULL;
req->rq_release_snd_buf = NULL;
xprt_init_majortimeo(task, req);
- dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
- req, ntohl(req->rq_xid));
+
+ trace_xprt_reserve(req);
}
static void
@@ -1849,16 +1888,14 @@
spin_unlock(&xprt->transport_lock);
if (req->rq_buffer)
xprt->ops->buf_free(task);
- xprt_inject_disconnect(xprt);
xdr_free_bvec(&req->rq_rcv_buf);
xdr_free_bvec(&req->rq_snd_buf);
if (req->rq_cred != NULL)
put_rpccred(req->rq_cred);
- task->tk_rqstp = NULL;
if (req->rq_release_snd_buf)
req->rq_release_snd_buf(req);
- dprintk("RPC: %5u release request %p\n", task->tk_pid, req);
+ task->tk_rqstp = NULL;
if (likely(!bc_prealloc(req)))
xprt->ops->free_slot(xprt, req);
else
@@ -1937,11 +1974,8 @@
found:
xprt = t->setup(args);
- if (IS_ERR(xprt)) {
- dprintk("RPC: xprt_create_transport: failed, %ld\n",
- -PTR_ERR(xprt));
+ if (IS_ERR(xprt))
goto out;
- }
if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT)
xprt->idle_timeout = 0;
INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
@@ -1962,8 +1996,7 @@
rpc_xprt_debugfs_register(xprt);
- dprintk("RPC: created transport %p with %u slots\n", xprt,
- xprt->max_reqs);
+ trace_xprt_create(xprt);
out:
return xprt;
}
@@ -1973,6 +2006,8 @@
struct rpc_xprt *xprt =
container_of(work, struct rpc_xprt, task_cleanup);
+ trace_xprt_destroy(xprt);
+
rpc_xprt_debugfs_unregister(xprt);
rpc_destroy_wait_queue(&xprt->binding);
rpc_destroy_wait_queue(&xprt->pending);
@@ -1997,8 +2032,6 @@
*/
static void xprt_destroy(struct rpc_xprt *xprt)
{
- dprintk("RPC: destroying transport %p\n", xprt);
-
/*
* Exclude transport connect/disconnect handlers and autoclose
*/
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index b458bf5..c92c1aa 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -44,10 +44,10 @@
size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
size_t maxmsg;
- maxmsg = min_t(unsigned int, ep->rep_inline_send, ep->rep_inline_recv);
+ maxmsg = min_t(unsigned int, ep->re_inline_send, ep->re_inline_recv);
maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE);
return maxmsg - RPCRDMA_HDRLEN_MIN;
}
@@ -79,7 +79,7 @@
*p = xdr_zero;
if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN,
- &rqst->rq_snd_buf, rpcrdma_noch))
+ &rqst->rq_snd_buf, rpcrdma_noch_pullup))
return -EIO;
trace_xprtrdma_cb_reply(rqst);
@@ -115,7 +115,7 @@
if (rc < 0)
goto failed_marshal;
- if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+ if (rpcrdma_post_sends(r_xprt, req))
goto drop_connection;
return 0;
@@ -190,10 +190,14 @@
if (xprt->bc_alloc_count >= RPCRDMA_BACKWARD_WRS)
return NULL;
- size = min_t(size_t, r_xprt->rx_ep.rep_inline_recv, PAGE_SIZE);
+ size = min_t(size_t, r_xprt->rx_ep->re_inline_recv, PAGE_SIZE);
req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL);
if (!req)
return NULL;
+ if (rpcrdma_req_setup(r_xprt, req)) {
+ rpcrdma_req_destroy(req);
+ return NULL;
+ }
xprt->bc_alloc_count++;
rqst = &req->rl_slot;
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 0ad45a8..bf3627d 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -36,11 +36,10 @@
* connect worker from running concurrently.
*
* When the underlying transport disconnects, MRs that are in flight
- * are flushed and are likely unusable. Thus all flushed MRs are
- * destroyed. New MRs are created on demand.
+ * are flushed and are likely unusable. Thus all MRs are destroyed.
+ * New MRs are created on demand.
*/
-#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_rdma.h>
#include "xprt_rdma.h"
@@ -51,30 +50,8 @@
#endif
/**
- * frwr_is_supported - Check if device supports FRWR
- * @device: interface adapter to check
- *
- * Returns true if device supports FRWR, otherwise false
- */
-bool frwr_is_supported(struct ib_device *device)
-{
- struct ib_device_attr *attrs = &device->attrs;
-
- if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
- goto out_not_supported;
- if (attrs->max_fast_reg_page_list_len == 0)
- goto out_not_supported;
- return true;
-
-out_not_supported:
- pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n",
- device->name);
- return false;
-}
-
-/**
* frwr_release_mr - Destroy one MR
- * @mr: MR allocated by frwr_init_mr
+ * @mr: MR allocated by frwr_mr_init
*
*/
void frwr_release_mr(struct rpcrdma_mr *mr)
@@ -88,13 +65,15 @@
kfree(mr);
}
-static void frwr_mr_recycle(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
+static void frwr_mr_recycle(struct rpcrdma_mr *mr)
{
+ struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
+
trace_xprtrdma_mr_recycle(mr);
if (mr->mr_dir != DMA_NONE) {
trace_xprtrdma_mr_unmap(mr);
- ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device,
+ ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
mr->mr_sg, mr->mr_nents, mr->mr_dir);
mr->mr_dir = DMA_NONE;
}
@@ -107,32 +86,6 @@
frwr_release_mr(mr);
}
-/* MRs are dynamically allocated, so simply clean up and release the MR.
- * A replacement MR will subsequently be allocated on demand.
- */
-static void
-frwr_mr_recycle_worker(struct work_struct *work)
-{
- struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr,
- mr_recycle);
-
- frwr_mr_recycle(mr->mr_xprt, mr);
-}
-
-/* frwr_recycle - Discard MRs
- * @req: request to reset
- *
- * Used after a reconnect. These MRs could be in flight, we can't
- * tell. Safe thing to do is release them.
- */
-void frwr_recycle(struct rpcrdma_req *req)
-{
- struct rpcrdma_mr *mr;
-
- while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
- frwr_mr_recycle(mr->mr_xprt, mr);
-}
-
/* frwr_reset - Place MRs back on the free list
* @req: request to reset
*
@@ -152,35 +105,33 @@
}
/**
- * frwr_init_mr - Initialize one MR
- * @ia: interface adapter
+ * frwr_mr_init - Initialize one MR
+ * @r_xprt: controlling transport instance
* @mr: generic MR to prepare for FRWR
*
* Returns zero if successful. Otherwise a negative errno
* is returned.
*/
-int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
+int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
{
- unsigned int depth = ia->ri_max_frwr_depth;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ unsigned int depth = ep->re_max_fr_depth;
struct scatterlist *sg;
struct ib_mr *frmr;
int rc;
- /* NB: ib_alloc_mr and device drivers typically allocate
- * memory with GFP_KERNEL.
- */
- frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
+ frmr = ib_alloc_mr(ep->re_pd, ep->re_mrtype, depth);
if (IS_ERR(frmr))
goto out_mr_err;
- sg = kcalloc(depth, sizeof(*sg), GFP_NOFS);
+ sg = kmalloc_array(depth, sizeof(*sg), GFP_NOFS);
if (!sg)
goto out_list_err;
+ mr->mr_xprt = r_xprt;
mr->frwr.fr_mr = frmr;
mr->mr_dir = DMA_NONE;
INIT_LIST_HEAD(&mr->mr_list);
- INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker);
init_completion(&mr->frwr.fr_linv_done);
sg_init_table(sg, depth);
@@ -198,43 +149,58 @@
}
/**
- * frwr_open - Prepare an endpoint for use with FRWR
- * @ia: interface adapter this endpoint will use
- * @ep: endpoint to prepare
+ * frwr_query_device - Prepare a transport for use with FRWR
+ * @ep: endpoint to fill in
+ * @device: RDMA device to query
*
* On success, sets:
- * ep->rep_attr.cap.max_send_wr
- * ep->rep_attr.cap.max_recv_wr
- * ep->rep_max_requests
- * ia->ri_max_segs
+ * ep->re_attr
+ * ep->re_max_requests
+ * ep->re_max_rdma_segs
+ * ep->re_max_fr_depth
+ * ep->re_mrtype
*
- * And these FRWR-related fields:
- * ia->ri_max_frwr_depth
- * ia->ri_mrtype
- *
- * On failure, a negative errno is returned.
+ * Return values:
+ * On success, returns zero.
+ * %-EINVAL - the device does not support FRWR memory registration
+ * %-ENOMEM - the device is not sufficiently capable for NFS/RDMA
*/
-int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep)
+int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
{
- struct ib_device_attr *attrs = &ia->ri_id->device->attrs;
+ const struct ib_device_attr *attrs = &device->attrs;
int max_qp_wr, depth, delta;
+ unsigned int max_sge;
- ia->ri_mrtype = IB_MR_TYPE_MEM_REG;
+ if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) ||
+ attrs->max_fast_reg_page_list_len == 0) {
+ pr_err("rpcrdma: 'frwr' mode is not supported by device %s\n",
+ device->name);
+ return -EINVAL;
+ }
+
+ max_sge = min_t(unsigned int, attrs->max_send_sge,
+ RPCRDMA_MAX_SEND_SGES);
+ if (max_sge < RPCRDMA_MIN_SEND_SGES) {
+ pr_err("rpcrdma: HCA provides only %u send SGEs\n", max_sge);
+ return -ENOMEM;
+ }
+ ep->re_attr.cap.max_send_sge = max_sge;
+ ep->re_attr.cap.max_recv_sge = 1;
+
+ ep->re_mrtype = IB_MR_TYPE_MEM_REG;
if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
- ia->ri_mrtype = IB_MR_TYPE_SG_GAPS;
+ ep->re_mrtype = IB_MR_TYPE_SG_GAPS;
/* Quirk: Some devices advertise a large max_fast_reg_page_list_len
* capability, but perform optimally when the MRs are not larger
* than a page.
*/
- if (attrs->max_sge_rd > 1)
- ia->ri_max_frwr_depth = attrs->max_sge_rd;
+ if (attrs->max_sge_rd > RPCRDMA_MAX_HDR_SEGS)
+ ep->re_max_fr_depth = attrs->max_sge_rd;
else
- ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len;
- if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS)
- ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS;
- dprintk("RPC: %s: max FR page list depth = %u\n",
- __func__, ia->ri_max_frwr_depth);
+ ep->re_max_fr_depth = attrs->max_fast_reg_page_list_len;
+ if (ep->re_max_fr_depth > RPCRDMA_MAX_DATA_SEGS)
+ ep->re_max_fr_depth = RPCRDMA_MAX_DATA_SEGS;
/* Add room for frwr register and invalidate WRs.
* 1. FRWR reg WR for head
@@ -250,61 +216,54 @@
/* Calculate N if the device max FRWR depth is smaller than
* RPCRDMA_MAX_DATA_SEGS.
*/
- if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) {
- delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth;
+ if (ep->re_max_fr_depth < RPCRDMA_MAX_DATA_SEGS) {
+ delta = RPCRDMA_MAX_DATA_SEGS - ep->re_max_fr_depth;
do {
depth += 2; /* FRWR reg + invalidate */
- delta -= ia->ri_max_frwr_depth;
+ delta -= ep->re_max_fr_depth;
} while (delta > 0);
}
- max_qp_wr = ia->ri_id->device->attrs.max_qp_wr;
+ max_qp_wr = attrs->max_qp_wr;
max_qp_wr -= RPCRDMA_BACKWARD_WRS;
max_qp_wr -= 1;
if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE)
return -ENOMEM;
- if (ep->rep_max_requests > max_qp_wr)
- ep->rep_max_requests = max_qp_wr;
- ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
- if (ep->rep_attr.cap.max_send_wr > max_qp_wr) {
- ep->rep_max_requests = max_qp_wr / depth;
- if (!ep->rep_max_requests)
- return -EINVAL;
- ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
+ if (ep->re_max_requests > max_qp_wr)
+ ep->re_max_requests = max_qp_wr;
+ ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
+ if (ep->re_attr.cap.max_send_wr > max_qp_wr) {
+ ep->re_max_requests = max_qp_wr / depth;
+ if (!ep->re_max_requests)
+ return -ENOMEM;
+ ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
}
- ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
- ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
- ep->rep_attr.cap.max_recv_wr = ep->rep_max_requests;
- ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
- ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
+ ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
+ ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
+ ep->re_attr.cap.max_recv_wr = ep->re_max_requests;
+ ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+ ep->re_attr.cap.max_recv_wr += RPCRDMA_MAX_RECV_BATCH;
+ ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
- ia->ri_max_segs =
- DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth);
+ ep->re_max_rdma_segs =
+ DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ep->re_max_fr_depth);
/* Reply chunks require segments for head and tail buffers */
- ia->ri_max_segs += 2;
- if (ia->ri_max_segs > RPCRDMA_MAX_HDR_SEGS)
- ia->ri_max_segs = RPCRDMA_MAX_HDR_SEGS;
+ ep->re_max_rdma_segs += 2;
+ if (ep->re_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS)
+ ep->re_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS;
+
+ /* Ensure the underlying device is capable of conveying the
+ * largest r/wsize NFS will ask for. This guarantees that
+ * failing over from one RDMA device to another will not
+ * break NFS I/O.
+ */
+ if ((ep->re_max_rdma_segs * ep->re_max_fr_depth) < RPCRDMA_MAX_SEGS)
+ return -ENOMEM;
+
return 0;
}
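The renamed fields do not change the Send Queue sizing logic: the per-RPC WR depth is padded with extra FastReg/LocalInv pairs whenever one MR cannot cover the largest payload, and the credit count is then scaled down until the queue fits under the device's max_qp_wr. A rough user-space sketch of that budgeting follows; the base depth of 7 and every limit are assumptions standing in for values the kernel reads from the device attributes and rpcrdma headers.

#include <stdio.h>

enum {
        RPCRDMA_MAX_DATA_SEGS = 64,   /* assumed */
        RPCRDMA_BACKWARD_WRS  = 8,    /* assumed */
};

int main(void)
{
        unsigned int max_fr_depth = 16;   /* device max_sge_rd, assumed */
        unsigned int max_qp_wr    = 4096; /* device QP limit, assumed */
        unsigned int max_requests = 128;  /* requested credit/slot count */
        unsigned int depth        = 7;    /* base reg/inv/send WRs per RPC, assumed */
        int delta;

        /* Extra reg + invalidate pairs when one MR cannot cover the
         * largest payload. */
        if (max_fr_depth < RPCRDMA_MAX_DATA_SEGS) {
                delta = RPCRDMA_MAX_DATA_SEGS - max_fr_depth;
                do {
                        depth += 2;
                        delta -= (int)max_fr_depth;
                } while (delta > 0);
        }

        /* Leave room for backchannel WRs and the drain WR, then scale
         * the request count down until the Send Queue fits. */
        max_qp_wr -= RPCRDMA_BACKWARD_WRS + 1;
        if (max_requests > max_qp_wr)
                max_requests = max_qp_wr;
        if (max_requests * depth > max_qp_wr)
                max_requests = max_qp_wr / depth;

        printf("depth=%u requests=%u max_send_wr=%u\n",
               depth, max_requests, max_requests * depth);
        return 0;
}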
/**
- * frwr_maxpages - Compute size of largest payload
- * @r_xprt: transport
- *
- * Returns maximum size of an RPC message, in pages.
- *
- * FRWR mode conveys a list of pages per chunk segment. The
- * maximum length of that list is the FRWR page list depth.
- */
-size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
- return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- (ia->ri_max_segs - 2) * ia->ri_max_frwr_depth);
-}
-
-/**
* frwr_map - Register a memory region
* @r_xprt: controlling transport
* @seg: memory region co-ordinates
@@ -324,14 +283,14 @@
int nsegs, bool writing, __be32 xid,
struct rpcrdma_mr *mr)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
struct ib_reg_wr *reg_wr;
int i, n, dma_nents;
struct ib_mr *ibmr;
u8 key;
- if (nsegs > ia->ri_max_frwr_depth)
- nsegs = ia->ri_max_frwr_depth;
+ if (nsegs > ep->re_max_fr_depth)
+ nsegs = ep->re_max_fr_depth;
for (i = 0; i < nsegs;) {
if (seg->mr_page)
sg_set_page(&mr->mr_sg[i],
@@ -344,7 +303,7 @@
++seg;
++i;
- if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS)
+ if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS)
continue;
if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
@@ -353,7 +312,7 @@
mr->mr_dir = rpcrdma_data_dir(writing);
mr->mr_nents = i;
- dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, mr->mr_nents,
+ dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents,
mr->mr_dir);
if (!dma_nents)
goto out_dmamap_err;
@@ -394,8 +353,8 @@
/**
* frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed FastReg WR
*
*/
static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
@@ -407,25 +366,30 @@
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_fastreg(wc, frwr);
/* The MR will get recycled when the associated req is retransmitted */
+
+ rpcrdma_flush_disconnect(cq->cq_context, wc);
}
/**
- * frwr_send - post Send WR containing the RPC Call message
- * @ia: interface adapter
- * @req: Prepared RPC Call
+ * frwr_send - post Send WRs containing the RPC Call message
+ * @r_xprt: controlling transport instance
+ * @req: prepared RPC Call
*
* For FRWR, chain any FastReg WRs to the Send WR. Only a
* single ib_post_send call is needed to register memory
* and then post the Send WR.
*
- * Returns the result of ib_post_send.
+ * Returns the return code from ib_post_send.
+ *
+ * Caller must hold the transport send lock to ensure that the
+ * pointers to the transport's rdma_cm_id and QP are stable.
*/
-int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *post_wr;
struct rpcrdma_mr *mr;
- post_wr = &req->rl_sendctx->sc_wr;
+ post_wr = &req->rl_wr;
list_for_each_entry(mr, &req->rl_registered, mr_list) {
struct rpcrdma_frwr *frwr;
@@ -441,10 +405,7 @@
post_wr = &frwr->fr_regwr.wr;
}
- /* If ib_post_send fails, the next ->send_request for
- * @req will queue these MRs for recovery.
- */
- return ib_post_send(ia->ri_id->qp, post_wr, NULL);
+ return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL);
}
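A minimal sketch of the WR chaining described in the comment above: each registered MR contributes a FastReg WR that is linked ahead of the Send WR, so a single post operation registers memory and sends the Call. The toy_wr structure below is a stand-in for ib_send_wr, not the real verbs API.

#include <stdio.h>

struct toy_wr {
        const char *kind;
        struct toy_wr *next;
};

int main(void)
{
        struct toy_wr send = { "SEND",        NULL };
        struct toy_wr reg2 = { "FASTREG mr2", &send };
        struct toy_wr reg1 = { "FASTREG mr1", &reg2 };
        struct toy_wr *post_wr = &reg1, *wr;

        /* A single "post" walks the whole chain in order. */
        for (wr = post_wr; wr; wr = wr->next)
                printf("post %s\n", wr->kind);
        return 0;
}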
/**
@@ -460,7 +421,7 @@
list_for_each_entry(mr, mrs, mr_list)
if (mr->mr_handle == rep->rr_inv_rkey) {
list_del_init(&mr->mr_list);
- trace_xprtrdma_mr_remoteinv(mr);
+ trace_xprtrdma_mr_reminv(mr);
rpcrdma_mr_put(mr);
break; /* only one invalidated MR per RPC */
}
@@ -469,15 +430,15 @@
static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
{
if (wc->status != IB_WC_SUCCESS)
- rpcrdma_mr_recycle(mr);
+ frwr_mr_recycle(mr);
else
rpcrdma_mr_put(mr);
}
/**
* frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
*
*/
static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
@@ -490,12 +451,14 @@
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_li(wc, frwr);
__frwr_release_mr(wc, mr);
+
+ rpcrdma_flush_disconnect(cq->cq_context, wc);
}
/**
* frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
*
* Awaken anyone waiting for an MR to finish being fenced.
*/
@@ -510,6 +473,8 @@
trace_xprtrdma_wc_li_wake(wc, frwr);
__frwr_release_mr(wc, mr);
complete(&frwr->fr_linv_done);
+
+ rpcrdma_flush_disconnect(cq->cq_context, wc);
}
/**
@@ -567,10 +532,10 @@
/* Transport disconnect drains the receive CQ before it
* replaces the QP. The RPC reply handler won't call us
- * unless ri_id->qp is a valid pointer.
+ * unless re_id->qp is a valid pointer.
*/
bad_wr = NULL;
- rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+ rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr);
/* The final LOCAL_INV WR in the chain is supposed to
* do the wake. If it was never posted, the wake will
@@ -590,15 +555,14 @@
mr = container_of(frwr, struct rpcrdma_mr, frwr);
bad_wr = bad_wr->next;
- list_del_init(&mr->mr_list);
- rpcrdma_mr_recycle(mr);
+ frwr_mr_recycle(mr);
}
}
/**
* frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
*
*/
static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
@@ -616,6 +580,8 @@
/* Ensure @rep is generated before __frwr_release_mr */
smp_rmb();
rpcrdma_complete_rqst(rep);
+
+ rpcrdma_flush_disconnect(cq->cq_context, wc);
}
/**
@@ -670,10 +636,10 @@
/* Transport disconnect drains the receive CQ before it
* replaces the QP. The RPC reply handler won't call us
- * unless ri_id->qp is a valid pointer.
+ * unless re_id->qp is a valid pointer.
*/
bad_wr = NULL;
- rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+ rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr);
if (!rc)
return;
@@ -685,7 +651,7 @@
mr = container_of(frwr, struct rpcrdma_mr, frwr);
bad_wr = bad_wr->next;
- rpcrdma_mr_recycle(mr);
+ frwr_mr_recycle(mr);
}
/* The final LOCAL_INV WR in the chain is supposed to
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index c091417..ca267a8 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -78,8 +78,6 @@
size += rpcrdma_segment_maxsz * sizeof(__be32);
size += sizeof(__be32); /* list discriminator */
- dprintk("RPC: %s: max call header size = %u\n",
- __func__, size);
return size;
}
@@ -100,28 +98,25 @@
size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32);
size += sizeof(__be32); /* list discriminator */
- dprintk("RPC: %s: max reply header size = %u\n",
- __func__, size);
return size;
}
/**
* rpcrdma_set_max_header_sizes - Initialize inline payload sizes
- * @r_xprt: transport instance to initialize
+ * @ep: endpoint to initialize
*
* The max_inline fields contain the maximum size of an RPC message
* so the marshaling code doesn't have to repeat this calculation
* for every RPC.
*/
-void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep)
{
- unsigned int maxsegs = r_xprt->rx_ia.ri_max_segs;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ unsigned int maxsegs = ep->re_max_rdma_segs;
- ep->rep_max_inline_send =
- ep->rep_inline_send - rpcrdma_max_call_header_size(maxsegs);
- ep->rep_max_inline_recv =
- ep->rep_inline_recv - rpcrdma_max_reply_header_size(maxsegs);
+ ep->re_max_inline_send =
+ ep->re_inline_send - rpcrdma_max_call_header_size(maxsegs);
+ ep->re_max_inline_recv =
+ ep->re_inline_recv - rpcrdma_max_reply_header_size(maxsegs);
}
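A rough sketch of this budgeting, under assumed header-size arithmetic: the usable inline space is the negotiated inline threshold minus a worst-case transport header for the configured number of RDMA segments. The kernel's exact formula is in rpcrdma_max_call_header_size(), only partially visible in this hunk, so the fixed-word counts below are assumptions for illustration.

#include <stdio.h>

enum {
        XDR_WORD  = 4,   /* sizeof(__be32) */
        SEG_WORDS = 4,   /* handle, length, 64-bit offset */
};

/* Very rough worst-case Call header: fixed words, one Read segment
 * (with position) per RDMA segment, and three list discriminators. */
static unsigned int approx_call_hdr(unsigned int maxsegs)
{
        unsigned int size = 7 * XDR_WORD;            /* fixed header, assumed */

        size += maxsegs * (SEG_WORDS + 1) * XDR_WORD;
        size += 3 * XDR_WORD;
        return size;
}

int main(void)
{
        unsigned int inline_send = 4096, maxsegs = 8;

        printf("usable inline send space: %u bytes\n",
               inline_send - approx_call_hdr(maxsegs));
        return 0;
}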
/* The client can send a request inline as long as the RPCRDMA header
@@ -136,9 +131,10 @@
struct rpc_rqst *rqst)
{
struct xdr_buf *xdr = &rqst->rq_snd_buf;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
unsigned int count, remaining, offset;
- if (xdr->len > r_xprt->rx_ep.rep_max_inline_send)
+ if (xdr->len > ep->re_max_inline_send)
return false;
if (xdr->page_len) {
@@ -149,7 +145,7 @@
remaining -= min_t(unsigned int,
PAGE_SIZE - offset, remaining);
offset = 0;
- if (++count > r_xprt->rx_ia.ri_max_send_sges)
+ if (++count > ep->re_attr.cap.max_send_sge)
return false;
}
}
@@ -166,7 +162,7 @@
static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
struct rpc_rqst *rqst)
{
- return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.rep_max_inline_recv;
+ return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep->re_max_inline_recv;
}
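Both predicates reduce to simple threshold checks. The sketch below mirrors the Call-direction test: the message must fit the inline send space, and its page list must not need more Send SGEs than the endpoint advertises. The starting SGE count of 1 for the head iovec and the 4 KiB page size are assumptions for illustration.

#include <stdbool.h>
#include <stdio.h>

enum { PAGE_SZ = 4096 };

static bool args_inline(unsigned int msg_len, unsigned int page_len,
                        unsigned int page_base,
                        unsigned int max_inline_send,
                        unsigned int max_send_sge)
{
        unsigned int count, remaining, offset, chunk;

        if (msg_len > max_inline_send)
                return false;

        if (page_len) {
                remaining = page_len;
                offset = page_base & (PAGE_SZ - 1);
                count = 1;                      /* head iovec, assumed */
                while (remaining) {
                        chunk = PAGE_SZ - offset;
                        if (chunk > remaining)
                                chunk = remaining;
                        remaining -= chunk;
                        offset = 0;
                        if (++count > max_send_sge)
                                return false;
                }
        }
        return true;
}

int main(void)
{
        printf("fits inline? %s\n",
               args_inline(3000, 2048, 100, 4096, 16) ? "yes" : "no");
        return 0;
}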
/* The client is required to provide a Reply chunk if the maximum
@@ -180,7 +176,7 @@
const struct xdr_buf *buf = &rqst->rq_rcv_buf;
return (buf->head[0].iov_len + buf->tail[0].iov_len) <
- r_xprt->rx_ep.rep_max_inline_recv;
+ r_xprt->rx_ep->re_max_inline_recv;
}
/* ACL likes to be lazy in allocating pages. For TCP, these
@@ -275,7 +271,7 @@
/* When encoding a Read chunk, the tail iovec contains an
* XDR pad and may be omitted.
*/
- if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup)
+ if (type == rpcrdma_readch && r_xprt->rx_ep->re_implicit_roundup)
goto out;
/* When encoding a Write chunk, some servers need to see an
@@ -283,7 +279,7 @@
* layer provides space in the tail iovec that may be used
* for this purpose.
*/
- if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup)
+ if (type == rpcrdma_writech && r_xprt->rx_ep->re_implicit_roundup)
goto out;
if (xdrbuf->tail[0].iov_len)
@@ -295,40 +291,6 @@
return n;
}
-static inline int
-encode_item_present(struct xdr_stream *xdr)
-{
- __be32 *p;
-
- p = xdr_reserve_space(xdr, sizeof(*p));
- if (unlikely(!p))
- return -EMSGSIZE;
-
- *p = xdr_one;
- return 0;
-}
-
-static inline int
-encode_item_not_present(struct xdr_stream *xdr)
-{
- __be32 *p;
-
- p = xdr_reserve_space(xdr, sizeof(*p));
- if (unlikely(!p))
- return -EMSGSIZE;
-
- *p = xdr_zero;
- return 0;
-}
-
-static void
-xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr)
-{
- *iptr++ = cpu_to_be32(mr->mr_handle);
- *iptr++ = cpu_to_be32(mr->mr_length);
- xdr_encode_hyper(iptr, mr->mr_offset);
-}
-
static int
encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr)
{
@@ -338,7 +300,7 @@
if (unlikely(!p))
return -EMSGSIZE;
- xdr_encode_rdma_segment(p, mr);
+ xdr_encode_rdma_segment(p, mr->mr_handle, mr->mr_length, mr->mr_offset);
return 0;
}
@@ -353,8 +315,8 @@
return -EMSGSIZE;
*p++ = xdr_one; /* Item present */
- *p++ = cpu_to_be32(position);
- xdr_encode_rdma_segment(p, mr);
+ xdr_encode_read_segment(p, position, mr->mr_handle, mr->mr_length,
+ mr->mr_offset);
return 0;
}
@@ -379,8 +341,7 @@
out_getmr_err:
trace_xprtrdma_nomrs(req);
xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
- if (r_xprt->rx_ep.rep_connected != -ENODEV)
- schedule_work(&r_xprt->rx_buf.rb_refresh_worker);
+ rpcrdma_mrs_refresh(r_xprt);
return ERR_PTR(-EAGAIN);
}
@@ -409,7 +370,7 @@
unsigned int pos;
int nsegs;
- if (rtype == rpcrdma_noch)
+ if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped)
goto done;
pos = rqst->rq_snd_buf.head[0].iov_len;
@@ -435,7 +396,9 @@
} while (nsegs);
done:
- return encode_item_not_present(xdr);
+ if (xdr_stream_encode_item_absent(xdr) < 0)
+ return -EMSGSIZE;
+ return 0;
}
/* Register and XDR encode the Write list. Supports encoding a list
@@ -474,7 +437,7 @@
if (nsegs < 0)
return nsegs;
- if (encode_item_present(xdr) < 0)
+ if (xdr_stream_encode_item_present(xdr) < 0)
return -EMSGSIZE;
segcount = xdr_reserve_space(xdr, sizeof(*segcount));
if (unlikely(!segcount))
@@ -501,7 +464,9 @@
*segcount = cpu_to_be32(nchunks);
done:
- return encode_item_not_present(xdr);
+ if (xdr_stream_encode_item_absent(xdr) < 0)
+ return -EMSGSIZE;
+ return 0;
}
/* Register and XDR encode the Reply chunk. Supports encoding an array
@@ -527,15 +492,18 @@
int nsegs, nchunks;
__be32 *segcount;
- if (wtype != rpcrdma_replych)
- return encode_item_not_present(xdr);
+ if (wtype != rpcrdma_replych) {
+ if (xdr_stream_encode_item_absent(xdr) < 0)
+ return -EMSGSIZE;
+ return 0;
+ }
seg = req->rl_segments;
nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
if (nsegs < 0)
return nsegs;
- if (encode_item_present(xdr) < 0)
+ if (xdr_stream_encode_item_present(xdr) < 0)
return -EMSGSIZE;
segcount = xdr_reserve_space(xdr, sizeof(*segcount));
if (unlikely(!segcount))
@@ -581,6 +549,7 @@
*/
void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
{
+ struct rpcrdma_regbuf *rb = sc->sc_req->rl_sendbuf;
struct ib_sge *sge;
if (!sc->sc_unmap_count)
@@ -592,7 +561,7 @@
*/
for (sge = &sc->sc_sges[2]; sc->sc_unmap_count;
++sge, --sc->sc_unmap_count)
- ib_dma_unmap_page(sc->sc_device, sge->addr, sge->length,
+ ib_dma_unmap_page(rdmab_device(rb), sge->addr, sge->length,
DMA_TO_DEVICE);
kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done);
@@ -600,154 +569,230 @@
/* Prepare an SGE for the RPC-over-RDMA transport header.
*/
-static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt,
+static void rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_req *req, u32 len)
{
struct rpcrdma_sendctx *sc = req->rl_sendctx;
struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
- struct ib_sge *sge = sc->sc_sges;
+ struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
- if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
- goto out_regbuf;
sge->addr = rdmab_addr(rb);
sge->length = len;
sge->lkey = rdmab_lkey(rb);
ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
DMA_TO_DEVICE);
- sc->sc_wr.num_sge++;
+}
+
+/* The head iovec is straightforward, as it is usually already
+ * DMA-mapped. Sync the content that has changed.
+ */
+static bool rpcrdma_prepare_head_iov(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req, unsigned int len)
+{
+ struct rpcrdma_sendctx *sc = req->rl_sendctx;
+ struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
+ struct rpcrdma_regbuf *rb = req->rl_sendbuf;
+
+ if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
+ return false;
+
+ sge->addr = rdmab_addr(rb);
+ sge->length = len;
+ sge->lkey = rdmab_lkey(rb);
+
+ ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
+ DMA_TO_DEVICE);
+ return true;
+}
+
+/* If there is a page list present, DMA map and prepare an
+ * SGE for each page to be sent.
+ */
+static bool rpcrdma_prepare_pagelist(struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ struct rpcrdma_sendctx *sc = req->rl_sendctx;
+ struct rpcrdma_regbuf *rb = req->rl_sendbuf;
+ unsigned int page_base, len, remaining;
+ struct page **ppages;
+ struct ib_sge *sge;
+
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ page_base = offset_in_page(xdr->page_base);
+ remaining = xdr->page_len;
+ while (remaining) {
+ sge = &sc->sc_sges[req->rl_wr.num_sge++];
+ len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
+ sge->addr = ib_dma_map_page(rdmab_device(rb), *ppages,
+ page_base, len, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdmab_device(rb), sge->addr))
+ goto out_mapping_err;
+
+ sge->length = len;
+ sge->lkey = rdmab_lkey(rb);
+
+ sc->sc_unmap_count++;
+ ppages++;
+ remaining -= len;
+ page_base = 0;
+ }
+
return true;
-out_regbuf:
- pr_err("rpcrdma: failed to DMA map a Send buffer\n");
+out_mapping_err:
+ trace_xprtrdma_dma_maperr(sge->addr);
return false;
}
-/* Prepare the Send SGEs. The head and tail iovec, and each entry
- * in the page list, gets its own SGE.
+/* The tail iovec may include an XDR pad for the page list,
+ * as well as additional content, and may not reside in the
+ * same page as the head iovec.
*/
-static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt,
- struct rpcrdma_req *req,
+static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req,
struct xdr_buf *xdr,
- enum rpcrdma_chunktype rtype)
+ unsigned int page_base, unsigned int len)
{
struct rpcrdma_sendctx *sc = req->rl_sendctx;
- unsigned int sge_no, page_base, len, remaining;
+ struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
struct rpcrdma_regbuf *rb = req->rl_sendbuf;
- struct ib_sge *sge = sc->sc_sges;
- struct page *page, **ppages;
+ struct page *page = virt_to_page(xdr->tail[0].iov_base);
- /* The head iovec is straightforward, as it is already
- * DMA-mapped. Sync the content that has changed.
- */
- if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
- goto out_regbuf;
- sc->sc_device = rdmab_device(rb);
- sge_no = 1;
- sge[sge_no].addr = rdmab_addr(rb);
- sge[sge_no].length = xdr->head[0].iov_len;
- sge[sge_no].lkey = rdmab_lkey(rb);
- ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr,
- sge[sge_no].length, DMA_TO_DEVICE);
+ sge->addr = ib_dma_map_page(rdmab_device(rb), page, page_base, len,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdmab_device(rb), sge->addr))
+ goto out_mapping_err;
- /* If there is a Read chunk, the page list is being handled
- * via explicit RDMA, and thus is skipped here. However, the
- * tail iovec may include an XDR pad for the page list, as
- * well as additional content, and may not reside in the
- * same page as the head iovec.
- */
- if (rtype == rpcrdma_readch) {
- len = xdr->tail[0].iov_len;
-
- /* Do not include the tail if it is only an XDR pad */
- if (len < 4)
- goto out;
-
- page = virt_to_page(xdr->tail[0].iov_base);
- page_base = offset_in_page(xdr->tail[0].iov_base);
-
- /* If the content in the page list is an odd length,
- * xdr_write_pages() has added a pad at the beginning
- * of the tail iovec. Force the tail's non-pad content
- * to land at the next XDR position in the Send message.
- */
- page_base += len & 3;
- len -= len & 3;
- goto map_tail;
- }
-
- /* If there is a page list present, temporarily DMA map
- * and prepare an SGE for each page to be sent.
- */
- if (xdr->page_len) {
- ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
- page_base = offset_in_page(xdr->page_base);
- remaining = xdr->page_len;
- while (remaining) {
- sge_no++;
- if (sge_no > RPCRDMA_MAX_SEND_SGES - 2)
- goto out_mapping_overflow;
-
- len = min_t(u32, PAGE_SIZE - page_base, remaining);
- sge[sge_no].addr =
- ib_dma_map_page(rdmab_device(rb), *ppages,
- page_base, len, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(rdmab_device(rb),
- sge[sge_no].addr))
- goto out_mapping_err;
- sge[sge_no].length = len;
- sge[sge_no].lkey = rdmab_lkey(rb);
-
- sc->sc_unmap_count++;
- ppages++;
- remaining -= len;
- page_base = 0;
- }
- }
-
- /* The tail iovec is not always constructed in the same
- * page where the head iovec resides (see, for example,
- * gss_wrap_req_priv). To neatly accommodate that case,
- * DMA map it separately.
- */
- if (xdr->tail[0].iov_len) {
- page = virt_to_page(xdr->tail[0].iov_base);
- page_base = offset_in_page(xdr->tail[0].iov_base);
- len = xdr->tail[0].iov_len;
-
-map_tail:
- sge_no++;
- sge[sge_no].addr =
- ib_dma_map_page(rdmab_device(rb), page, page_base, len,
- DMA_TO_DEVICE);
- if (ib_dma_mapping_error(rdmab_device(rb), sge[sge_no].addr))
- goto out_mapping_err;
- sge[sge_no].length = len;
- sge[sge_no].lkey = rdmab_lkey(rb);
- sc->sc_unmap_count++;
- }
-
-out:
- sc->sc_wr.num_sge += sge_no;
- if (sc->sc_unmap_count)
- kref_get(&req->rl_kref);
+ sge->length = len;
+ sge->lkey = rdmab_lkey(rb);
+ ++sc->sc_unmap_count;
return true;
-out_regbuf:
- pr_err("rpcrdma: failed to DMA map a Send buffer\n");
- return false;
-
-out_mapping_overflow:
- rpcrdma_sendctx_unmap(sc);
- pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
- return false;
-
out_mapping_err:
- rpcrdma_sendctx_unmap(sc);
- trace_xprtrdma_dma_maperr(sge[sge_no].addr);
+ trace_xprtrdma_dma_maperr(sge->addr);
return false;
}
+/* Copy the tail to the end of the head buffer.
+ */
+static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ unsigned char *dst;
+
+ dst = (unsigned char *)xdr->head[0].iov_base;
+ dst += xdr->head[0].iov_len + xdr->page_len;
+ memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len);
+ r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len;
+}
+
+/* Copy pagelist content into the head buffer.
+ */
+static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ unsigned int len, page_base, remaining;
+ struct page **ppages;
+ unsigned char *src, *dst;
+
+ dst = (unsigned char *)xdr->head[0].iov_base;
+ dst += xdr->head[0].iov_len;
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ page_base = offset_in_page(xdr->page_base);
+ remaining = xdr->page_len;
+ while (remaining) {
+ src = page_address(*ppages);
+ src += page_base;
+ len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
+ memcpy(dst, src, len);
+ r_xprt->rx_stats.pullup_copy_count += len;
+
+ ppages++;
+ dst += len;
+ remaining -= len;
+ page_base = 0;
+ }
+}
+
+/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it.
+ * When the head, pagelist, and tail are small, a pull-up copy
+ * is considerably less costly than DMA mapping the components
+ * of @xdr.
+ *
+ * Assumptions:
+ * - the caller has already verified that the total length
+ * of the RPC Call body will fit into @rl_sendbuf.
+ */
+static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ if (unlikely(xdr->tail[0].iov_len))
+ rpcrdma_pullup_tail_iov(r_xprt, req, xdr);
+
+ if (unlikely(xdr->page_len))
+ rpcrdma_pullup_pagelist(r_xprt, req, xdr);
+
+ /* The whole RPC message resides in the head iovec now */
+ return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len);
+}
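A user-space sketch of the pull-up idea: when head + page list + tail fit in the pre-registered send buffer, flattening them into one contiguous region lets the Call go out in a single Send SGE instead of several per-page DMA mappings. Buffer sizes and layout here are illustrative only.

#include <stdio.h>
#include <string.h>

struct xdr_like {
        char   head[256];
        size_t head_len;
        char   pages[256];
        size_t page_len;
        char   tail[64];
        size_t tail_len;
};

/* Flatten pages and tail behind the head, in XDR order. */
static size_t pullup(const struct xdr_like *x, char *sendbuf, size_t buflen)
{
        size_t total = x->head_len + x->page_len + x->tail_len;

        if (total > buflen)
                return 0;       /* caller would DMA-map the pieces instead */

        memcpy(sendbuf, x->head, x->head_len);
        memcpy(sendbuf + x->head_len, x->pages, x->page_len);
        memcpy(sendbuf + x->head_len + x->page_len, x->tail, x->tail_len);
        return total;
}

int main(void)
{
        struct xdr_like x = { "HEAD", 4, "PAGES", 5, "TL", 2 };
        char sendbuf[128];
        size_t n = pullup(&x, sendbuf, sizeof(sendbuf));

        printf("pulled up %zu bytes -> one Send SGE\n", n);
        return 0;
}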
+
+static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ struct kvec *tail = &xdr->tail[0];
+
+ if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len))
+ return false;
+ if (xdr->page_len)
+ if (!rpcrdma_prepare_pagelist(req, xdr))
+ return false;
+ if (tail->iov_len)
+ if (!rpcrdma_prepare_tail_iov(req, xdr,
+ offset_in_page(tail->iov_base),
+ tail->iov_len))
+ return false;
+
+ if (req->rl_sendctx->sc_unmap_count)
+ kref_get(&req->rl_kref);
+ return true;
+}
+
+static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len))
+ return false;
+
+ /* If there is a Read chunk, the page list is being handled
+ * via explicit RDMA, and thus is skipped here.
+ */
+
+ /* Do not include the tail if it is only an XDR pad */
+ if (xdr->tail[0].iov_len > 3) {
+ unsigned int page_base, len;
+
+ /* If the content in the page list is an odd length,
+ * xdr_write_pages() adds a pad at the beginning of
+ * the tail iovec. Force the tail's non-pad content to
+ * land at the next XDR position in the Send message.
+ */
+ page_base = offset_in_page(xdr->tail[0].iov_base);
+ len = xdr->tail[0].iov_len;
+ page_base += len & 3;
+ len -= len & 3;
+ if (!rpcrdma_prepare_tail_iov(req, xdr, page_base, len))
+ return false;
+ kref_get(&req->rl_kref);
+ }
+
+ return true;
+}
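The pad trimming above can be seen in isolation: XDR rounds the page list up to a 4-byte boundary, so a tail that follows an odd-length page list begins with up to three pad bytes that must not be sent again inline. A small sketch of that adjustment:

#include <stdio.h>

int main(void)
{
        unsigned int tail_off = 0;      /* offset of tail within its page */
        unsigned int tail_len = 7;      /* 3 pad bytes + 4 bytes of content */

        if (tail_len > 3) {             /* a pad-only tail is skipped entirely */
                tail_off += tail_len & 3;
                tail_len -= tail_len & 3;
                printf("map tail at offset %u, length %u\n",
                       tail_off, tail_len);
        } else {
                printf("tail is only an XDR pad; not sent\n");
        }
        return 0;
}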
+
/**
* rpcrdma_prepare_send_sges - Construct SGEs for a Send WR
* @r_xprt: controlling transport
@@ -758,31 +803,52 @@
*
* Returns 0 on success; otherwise a negative errno is returned.
*/
-int
-rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
- struct rpcrdma_req *req, u32 hdrlen,
- struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
+inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req, u32 hdrlen,
+ struct xdr_buf *xdr,
+ enum rpcrdma_chunktype rtype)
{
int ret;
ret = -EAGAIN;
req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt);
if (!req->rl_sendctx)
- goto err;
- req->rl_sendctx->sc_wr.num_sge = 0;
+ goto out_nosc;
req->rl_sendctx->sc_unmap_count = 0;
req->rl_sendctx->sc_req = req;
kref_init(&req->rl_kref);
+ req->rl_wr.wr_cqe = &req->rl_sendctx->sc_cqe;
+ req->rl_wr.sg_list = req->rl_sendctx->sc_sges;
+ req->rl_wr.num_sge = 0;
+ req->rl_wr.opcode = IB_WR_SEND;
+
+ rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen);
ret = -EIO;
- if (!rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen))
- goto err;
- if (rtype != rpcrdma_areadch)
- if (!rpcrdma_prepare_msg_sges(r_xprt, req, xdr, rtype))
- goto err;
+ switch (rtype) {
+ case rpcrdma_noch_pullup:
+ if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr))
+ goto out_unmap;
+ break;
+ case rpcrdma_noch_mapped:
+ if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr))
+ goto out_unmap;
+ break;
+ case rpcrdma_readch:
+ if (!rpcrdma_prepare_readch(r_xprt, req, xdr))
+ goto out_unmap;
+ break;
+ case rpcrdma_areadch:
+ break;
+ default:
+ goto out_unmap;
+ }
+
return 0;
-err:
+out_unmap:
+ rpcrdma_sendctx_unmap(req->rl_sendctx);
+out_nosc:
trace_xprtrdma_prepsend_failed(&req->rl_slot, ret);
return ret;
}
@@ -812,6 +878,7 @@
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
struct xdr_stream *xdr = &req->rl_stream;
enum rpcrdma_chunktype rtype, wtype;
+ struct xdr_buf *buf = &rqst->rq_snd_buf;
bool ddp_allowed;
__be32 *p;
int ret;
@@ -833,14 +900,14 @@
goto out_err;
*p++ = rqst->rq_xid;
*p++ = rpcrdma_version;
- *p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
+ *p++ = r_xprt->rx_buf.rb_max_requests;
/* When the ULP employs a GSS flavor that guarantees integrity
* or privacy, direct data placement of individual data items
* is not allowed.
*/
- ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags &
- RPCAUTH_AUTH_DATATOUCH);
+ ddp_allowed = !test_bit(RPCAUTH_AUTH_DATATOUCH,
+ &rqst->rq_cred->cr_auth->au_flags);
/*
* Chunks needed for results?
@@ -875,8 +942,9 @@
*/
if (rpcrdma_args_inline(r_xprt, rqst)) {
*p++ = rdma_msg;
- rtype = rpcrdma_noch;
- } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
+ rtype = buf->len < rdmab_length(req->rl_sendbuf) ?
+ rpcrdma_noch_pullup : rpcrdma_noch_mapped;
+ } else if (ddp_allowed && buf->flags & XDRBUF_WRITE) {
*p++ = rdma_msg;
rtype = rpcrdma_readch;
} else {
@@ -885,12 +953,6 @@
rtype = rpcrdma_areadch;
}
- /* If this is a retransmit, discard previously registered
- * chunks. Very likely the connection has been replaced,
- * so these registrations are invalid and unusable.
- */
- frwr_recycle(req);
-
/* This implementation supports the following combinations
* of chunk lists in one RPC-over-RDMA Call message:
*
@@ -924,7 +986,7 @@
goto out_err;
ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len,
- &rqst->rq_snd_buf, rtype);
+ buf, rtype);
if (ret)
goto out_err;
@@ -938,6 +1000,40 @@
return ret;
}
+static void __rpcrdma_update_cwnd_locked(struct rpc_xprt *xprt,
+ struct rpcrdma_buffer *buf,
+ u32 grant)
+{
+ buf->rb_credits = grant;
+ xprt->cwnd = grant << RPC_CWNDSHIFT;
+}
+
+static void rpcrdma_update_cwnd(struct rpcrdma_xprt *r_xprt, u32 grant)
+{
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+
+ spin_lock(&xprt->transport_lock);
+ __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, grant);
+ spin_unlock(&xprt->transport_lock);
+}
+
+/**
+ * rpcrdma_reset_cwnd - Reset the xprt's congestion window
+ * @r_xprt: controlling transport instance
+ *
+ * Prepare @r_xprt for the next connection by reinitializing
+ * its credit grant to one (see RFC 8166, Section 3.3.3).
+ */
+void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+
+ spin_lock(&xprt->transport_lock);
+ xprt->cong = 0;
+ __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, 1);
+ spin_unlock(&xprt->transport_lock);
+}
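A sketch of the credit-to-cwnd mapping these helpers implement; RPC_CWNDSHIFT is assumed to be 8 here (the RPC layer defines the real value). The point is simply that the congestion window is the server's credit grant scaled into the RPC layer's fixed-point units, and that it restarts at one credit after a reconnect (RFC 8166, Section 3.3.3).

#include <stdio.h>

enum { RPC_CWNDSHIFT = 8 };   /* assumed; defined by the RPC layer */

static unsigned long cwnd_from_credits(unsigned int grant)
{
        return (unsigned long)grant << RPC_CWNDSHIFT;
}

int main(void)
{
        printf("fresh connection (1 credit): cwnd=%lu\n",
               cwnd_from_credits(1));
        printf("server grants 128 credits:   cwnd=%lu\n",
               cwnd_from_credits(128));
        return 0;
}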
+
/**
* rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
* @rqst: controlling RPC request
@@ -977,7 +1073,6 @@
curlen = rqst->rq_rcv_buf.head[0].iov_len;
if (curlen > copy_len)
curlen = copy_len;
- trace_xprtrdma_fixup(rqst, copy_len, curlen);
srcp += curlen;
copy_len -= curlen;
@@ -997,8 +1092,6 @@
if (curlen > pagelist_len)
curlen = pagelist_len;
- trace_xprtrdma_fixup_pg(rqst, i, srcp,
- copy_len, curlen);
destp = kmap_atomic(ppages[i]);
memcpy(destp + page_base, srcp, curlen);
flush_dcache_page(ppages[i]);
@@ -1030,6 +1123,8 @@
rqst->rq_private_buf.tail[0].iov_base = srcp;
}
+ if (fixup_copy_count)
+ trace_xprtrdma_fixup(rqst, fixup_copy_count);
return fixup_copy_count;
}
@@ -1052,11 +1147,11 @@
p = xdr_inline_decode(xdr, 0);
/* Chunk lists */
- if (*p++ != xdr_zero)
+ if (xdr_item_is_present(p++))
return false;
- if (*p++ != xdr_zero)
+ if (xdr_item_is_present(p++))
return false;
- if (*p++ != xdr_zero)
+ if (xdr_item_is_present(p++))
return false;
/* RPC header */
@@ -1095,10 +1190,7 @@
if (unlikely(!p))
return -EIO;
- handle = be32_to_cpup(p++);
- *length = be32_to_cpup(p++);
- xdr_decode_hyper(p, &offset);
-
+ xdr_decode_rdma_segment(p, &handle, length, &offset);
trace_xprtrdma_decode_seg(handle, *length, offset);
return 0;
}
@@ -1134,7 +1226,7 @@
p = xdr_inline_decode(xdr, sizeof(*p));
if (unlikely(!p))
return -EIO;
- if (unlikely(*p != xdr_zero))
+ if (unlikely(xdr_item_is_present(p)))
return -EIO;
return 0;
}
@@ -1153,7 +1245,7 @@
p = xdr_inline_decode(xdr, sizeof(*p));
if (unlikely(!p))
return -EIO;
- if (*p == xdr_zero)
+ if (xdr_item_is_absent(p))
break;
if (!first)
return -EIO;
@@ -1175,7 +1267,7 @@
return -EIO;
*length = 0;
- if (*p != xdr_zero)
+ if (xdr_item_is_present(p))
if (decode_write_chunk(xdr, length))
return -EIO;
return 0;
@@ -1373,15 +1465,12 @@
if (credits == 0)
credits = 1; /* don't deadlock */
- else if (credits > buf->rb_max_requests)
- credits = buf->rb_max_requests;
- if (buf->rb_credits != credits) {
- spin_lock(&xprt->transport_lock);
- buf->rb_credits = credits;
- xprt->cwnd = credits << RPC_CWNDSHIFT;
- spin_unlock(&xprt->transport_lock);
- }
- rpcrdma_post_recvs(r_xprt, false);
+ else if (credits > r_xprt->rx_ep->re_max_requests)
+ credits = r_xprt->rx_ep->re_max_requests;
+ rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1),
+ false);
+ if (buf->rb_credits != credits)
+ rpcrdma_update_cwnd(r_xprt, credits);
req = rpcr_to_rdmar(rqst);
if (req->rl_reply) {
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 97bca50..526da5d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -80,8 +80,7 @@
* current value.
*/
static int read_reset_stat(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
atomic_t *stat = (atomic_t *)table->data;
@@ -103,8 +102,8 @@
len -= *ppos;
if (len > *lenp)
len = *lenp;
- if (len && copy_to_user(buffer, str_buf, len))
- return -EFAULT;
+ if (len)
+ memcpy(buffer, str_buf, len);
*lenp = len;
*ppos += len;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index 0ff5c59..c5154bc 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -10,10 +10,6 @@
#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>
-#define RPCDBG_FACILITY RPCDBG_SVCXPRT
-
-#undef SVCRDMA_BACKCHANNEL_DEBUG
-
/**
* svc_rdma_handle_bc_reply - Process incoming backchannel Reply
* @rqstp: resources for handling the Reply
@@ -31,33 +27,17 @@
__be32 *rdma_resp = rctxt->rc_recv_buf;
struct rpc_rqst *req;
u32 credits;
- size_t len;
- __be32 xid;
- __be32 *p;
-
- p = (__be32 *)src->iov_base;
- len = src->iov_len;
- xid = *rdma_resp;
-
-#ifdef SVCRDMA_BACKCHANNEL_DEBUG
- pr_info("%s: xid=%08x, length=%zu\n",
- __func__, be32_to_cpu(xid), len);
- pr_info("%s: RPC/RDMA: %*ph\n",
- __func__, (int)RPCRDMA_HDRLEN_MIN, rdma_resp);
- pr_info("%s: RPC: %*ph\n",
- __func__, (int)len, p);
-#endif
spin_lock(&xprt->queue_lock);
- req = xprt_lookup_rqst(xprt, xid);
+ req = xprt_lookup_rqst(xprt, *rdma_resp);
if (!req)
goto out_unlock;
dst = &req->rq_private_buf.head[0];
memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf));
- if (dst->iov_len < len)
+ if (dst->iov_len < src->iov_len)
goto out_unlock;
- memcpy(dst->iov_base, p, len);
+ memcpy(dst->iov_base, src->iov_base, src->iov_len);
xprt_pin_rqst(req);
spin_unlock(&xprt->queue_lock);
@@ -66,7 +46,6 @@
credits = 1; /* don't deadlock */
else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
credits = r_xprt->rx_buf.rb_bc_max_requests;
-
spin_lock(&xprt->transport_lock);
xprt->cwnd = credits << RPC_CWNDSHIFT;
spin_unlock(&xprt->transport_lock);
@@ -99,7 +78,7 @@
{
int ret;
- ret = svc_rdma_map_reply_msg(rdma, ctxt, &rqst->rq_snd_buf, NULL);
+ ret = svc_rdma_map_reply_msg(rdma, ctxt, NULL, &rqst->rq_snd_buf);
if (ret < 0)
return -EIO;
@@ -108,7 +87,7 @@
*/
get_page(virt_to_page(rqst->rq_buffer));
ctxt->sc_send_wr.opcode = IB_WR_SEND;
- return svc_rdma_send(rdma, &ctxt->sc_send_wr);
+ return svc_rdma_send(rdma, ctxt);
}
/* Server-side transport endpoint wants a whole page for its send
@@ -163,7 +142,9 @@
if (!ctxt)
goto drop_connection;
- p = ctxt->sc_xprt_buf;
+ p = xdr_reserve_space(&ctxt->sc_stream, RPCRDMA_HDRLEN_MIN);
+ if (!p)
+ goto put_ctxt;
*p++ = rqst->rq_xid;
*p++ = rpcrdma_version;
*p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
@@ -171,60 +152,47 @@
*p++ = xdr_zero;
*p++ = xdr_zero;
*p = xdr_zero;
- svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_MIN);
-
-#ifdef SVCRDMA_BACKCHANNEL_DEBUG
- pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer);
-#endif
rqst->rq_xtime = ktime_get();
rc = svc_rdma_bc_sendto(rdma, rqst, ctxt);
- if (rc) {
- svc_rdma_send_ctxt_put(rdma, ctxt);
- goto drop_connection;
- }
+ if (rc)
+ goto put_ctxt;
return 0;
+put_ctxt:
+ svc_rdma_send_ctxt_put(rdma, ctxt);
+
drop_connection:
- dprintk("svcrdma: failed to send bc call\n");
return -ENOTCONN;
}
-/* Send an RPC call on the passive end of a transport
- * connection.
+/**
+ * xprt_rdma_bc_send_request - Send a reverse-direction Call
+ * @rqst: rpc_rqst containing Call message to be sent
+ *
+ * Return values:
+ * %0 if the message was sent successfully
+ * %ENOTCONN if the message was not sent
*/
-static int
-xprt_rdma_bc_send_request(struct rpc_rqst *rqst)
+static int xprt_rdma_bc_send_request(struct rpc_rqst *rqst)
{
struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
- struct svcxprt_rdma *rdma;
+ struct svcxprt_rdma *rdma =
+ container_of(sxprt, struct svcxprt_rdma, sc_xprt);
int ret;
- dprintk("svcrdma: sending bc call with xid: %08x\n",
- be32_to_cpu(rqst->rq_xid));
+ if (test_bit(XPT_DEAD, &sxprt->xpt_flags))
+ return -ENOTCONN;
- mutex_lock(&sxprt->xpt_mutex);
-
- ret = -ENOTCONN;
- rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
- if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) {
- ret = rpcrdma_bc_send_request(rdma, rqst);
- if (ret == -ENOTCONN)
- svc_close_xprt(sxprt);
- }
-
- mutex_unlock(&sxprt->xpt_mutex);
-
- if (ret < 0)
- return ret;
- return 0;
+ ret = rpcrdma_bc_send_request(rdma, rqst);
+ if (ret == -ENOTCONN)
+ svc_close_xprt(sxprt);
+ return ret;
}
static void
xprt_rdma_bc_close(struct rpc_xprt *xprt)
{
- dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
-
xprt_disconnect_done(xprt);
xprt->cwnd = RPC_CWNDSHIFT;
}
@@ -232,8 +200,6 @@
static void
xprt_rdma_bc_put(struct rpc_xprt *xprt)
{
- dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
-
xprt_rdma_free_addresses(xprt);
xprt_free(xprt);
}
@@ -268,19 +234,14 @@
struct rpc_xprt *xprt;
struct rpcrdma_xprt *new_xprt;
- if (args->addrlen > sizeof(xprt->addr)) {
- dprintk("RPC: %s: address too large\n", __func__);
+ if (args->addrlen > sizeof(xprt->addr))
return ERR_PTR(-EBADF);
- }
xprt = xprt_alloc(args->net, sizeof(*new_xprt),
RPCRDMA_MAX_BC_REQUESTS,
RPCRDMA_MAX_BC_REQUESTS);
- if (!xprt) {
- dprintk("RPC: %s: couldn't allocate rpc_xprt\n",
- __func__);
+ if (!xprt)
return ERR_PTR(-ENOMEM);
- }
xprt->timeout = &xprt_rdma_bc_timeout;
xprt_set_bound(xprt);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index fd5c1f1..c6ea290 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -117,6 +117,13 @@
rc_list);
}
+static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma,
+ struct rpc_rdma_cid *cid)
+{
+ cid->ci_queue_id = rdma->sc_rq_cq->res.id;
+ cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
+}
+
static struct svc_rdma_recv_ctxt *
svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
{
@@ -135,6 +142,8 @@
if (ib_dma_mapping_error(rdma->sc_pd->device, addr))
goto fail2;
+ svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid);
+
ctxt->rc_recv_wr.next = NULL;
ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe;
ctxt->rc_recv_wr.sg_list = &ctxt->rc_recv_sge;
@@ -248,16 +257,15 @@
{
int ret;
- svc_xprt_get(&rdma->sc_xprt);
+ trace_svcrdma_post_recv(ctxt);
ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL);
- trace_svcrdma_post_recv(&ctxt->rc_recv_wr, ret);
if (ret)
goto err_post;
return 0;
err_post:
+ trace_svcrdma_rq_post_err(rdma, ret);
svc_rdma_recv_ctxt_put(rdma, ctxt);
- svc_xprt_put(&rdma->sc_xprt);
return ret;
}
@@ -311,11 +319,10 @@
struct ib_cqe *cqe = wc->wr_cqe;
struct svc_rdma_recv_ctxt *ctxt;
- trace_svcrdma_wc_receive(wc);
-
/* WARNING: Only wc->wr_cqe and wc->status are reliable */
ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe);
+ trace_svcrdma_wc_receive(wc, &ctxt->rc_cid);
if (wc->status != IB_WC_SUCCESS)
goto flushed;
@@ -335,15 +342,13 @@
spin_unlock(&rdma->sc_rq_dto_lock);
if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags))
svc_xprt_enqueue(&rdma->sc_xprt);
- goto out;
+ return;
flushed:
post_err:
svc_rdma_recv_ctxt_put(rdma, ctxt);
set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
svc_xprt_enqueue(&rdma->sc_xprt);
-out:
- svc_xprt_put(&rdma->sc_xprt);
}
/**
@@ -380,15 +385,14 @@
arg->len = ctxt->rc_byte_len;
}
-/* This accommodates the largest possible Write chunk,
- * in one segment.
+/* This accommodates the largest possible Write chunk.
*/
-#define MAX_BYTES_WRITE_SEG ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT))
+#define MAX_BYTES_WRITE_CHUNK ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT))
/* This accommodates the largest possible Position-Zero
- * Read chunk or Reply chunk, in one segment.
+ * Read chunk or Reply chunk.
*/
-#define MAX_BYTES_SPECIAL_SEG ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT))
+#define MAX_BYTES_SPECIAL_CHUNK ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT))
/* Sanity check the Read list.
*
@@ -396,7 +400,7 @@
* - This implementation supports only one Read chunk.
*
* Sanity checks:
- * - Read list does not overflow buffer.
+ * - Read list does not overflow Receive buffer.
* - Segment size limited by largest NFS data payload.
*
* The segment count is limited to how many segments can
@@ -404,30 +408,44 @@
* buffer. That's about 40 Read segments for a 1KB inline
* threshold.
*
- * Returns pointer to the following Write list.
+ * Return values:
+ * %true: Read list is valid. @rctxt's xdr_stream is updated
+ * to point to the first byte past the Read list.
+ * %false: Read list is corrupt. @rctxt's xdr_stream is left
+ * in an unknown state.
*/
-static __be32 *xdr_check_read_list(__be32 *p, const __be32 *end)
+static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt)
{
- u32 position;
+ u32 position, len;
bool first;
+ __be32 *p;
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+ if (!p)
+ return false;
+
+ len = 0;
first = true;
- while (*p++ != xdr_zero) {
- if (first) {
- position = be32_to_cpup(p++);
- first = false;
- } else if (be32_to_cpup(p++) != position) {
- return NULL;
- }
- p++; /* handle */
- if (be32_to_cpup(p++) > MAX_BYTES_SPECIAL_SEG)
- return NULL;
- p += 2; /* offset */
+ while (xdr_item_is_present(p)) {
+ p = xdr_inline_decode(&rctxt->rc_stream,
+ rpcrdma_readseg_maxsz * sizeof(*p));
+ if (!p)
+ return false;
- if (p > end)
- return NULL;
+ if (first) {
+ position = be32_to_cpup(p);
+ first = false;
+ } else if (be32_to_cpup(p) != position) {
+ return false;
+ }
+ p += 2;
+ len += be32_to_cpup(p);
+
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+ if (!p)
+ return false;
}
- return p;
+ return len <= MAX_BYTES_SPECIAL_CHUNK;
}
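The streamed check above enforces two properties: every segment of the Read chunk carries the same position, and the summed segment lengths stay under the largest supported chunk. A stand-alone sketch over a flat array (the wire XDR framing is omitted):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct read_seg {
        uint32_t position;
        uint32_t handle;
        uint32_t length;
        uint64_t offset;
};

static bool check_read_list(const struct read_seg *segs, unsigned int nsegs,
                            uint32_t max_chunk_bytes)
{
        uint32_t position = 0, total = 0;
        bool first = true;
        unsigned int i;

        for (i = 0; i < nsegs; i++) {
                if (first) {
                        position = segs[i].position;
                        first = false;
                } else if (segs[i].position != position) {
                        return false;   /* mixed positions: reject */
                }
                total += segs[i].length;
        }
        return total <= max_chunk_bytes;
}

int main(void)
{
        struct read_seg segs[] = {
                { 0, 0x1234, 4096, 0 },
                { 0, 0x1235, 4096, 4096 },
        };

        printf("read list ok? %s\n",
               check_read_list(segs, 2, 1 << 20) ? "yes" : "no");
        return 0;
}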
/* The segment count is limited to how many segments can
@@ -435,67 +453,97 @@
* buffer. That's about 60 Write segments for a 1KB inline
* threshold.
*/
-static __be32 *xdr_check_write_chunk(__be32 *p, const __be32 *end,
- u32 maxlen)
+static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen)
{
- u32 i, segcount;
+ u32 i, segcount, total;
+ __be32 *p;
- segcount = be32_to_cpup(p++);
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+ if (!p)
+ return false;
+ segcount = be32_to_cpup(p);
+
+ total = 0;
for (i = 0; i < segcount; i++) {
- p++; /* handle */
- if (be32_to_cpup(p++) > maxlen)
- return NULL;
- p += 2; /* offset */
+ u32 handle, length;
+ u64 offset;
- if (p > end)
- return NULL;
+ p = xdr_inline_decode(&rctxt->rc_stream,
+ rpcrdma_segment_maxsz * sizeof(*p));
+ if (!p)
+ return false;
+
+ xdr_decode_rdma_segment(p, &handle, &length, &offset);
+ trace_svcrdma_decode_wseg(handle, length, offset);
+
+ total += length;
}
-
- return p;
+ return total <= maxlen;
}
/* Sanity check the Write list.
*
* Implementation limits:
- * - This implementation supports only one Write chunk.
+ * - This implementation currently supports only one Write chunk.
*
* Sanity checks:
- * - Write list does not overflow buffer.
- * - Segment size limited by largest NFS data payload.
+ * - Write list does not overflow Receive buffer.
+ * - Chunk size limited by largest NFS data payload.
*
- * Returns pointer to the following Reply chunk.
+ * Return values:
+ * %true: Write list is valid. @rctxt's xdr_stream is updated
+ * to point to the first byte past the Write list.
+ * %false: Write list is corrupt. @rctxt's xdr_stream is left
+ * in an unknown state.
*/
-static __be32 *xdr_check_write_list(__be32 *p, const __be32 *end)
+static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt)
{
- u32 chcount;
+ u32 chcount = 0;
+ __be32 *p;
- chcount = 0;
- while (*p++ != xdr_zero) {
- p = xdr_check_write_chunk(p, end, MAX_BYTES_WRITE_SEG);
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+ if (!p)
+ return false;
+ rctxt->rc_write_list = p;
+ while (xdr_item_is_present(p)) {
+ if (!xdr_check_write_chunk(rctxt, MAX_BYTES_WRITE_CHUNK))
+ return false;
+ ++chcount;
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
if (!p)
- return NULL;
- if (chcount++ > 1)
- return NULL;
+ return false;
}
- return p;
+ if (!chcount)
+ rctxt->rc_write_list = NULL;
+ return chcount < 2;
}
/* Sanity check the Reply chunk.
*
* Sanity checks:
- * - Reply chunk does not overflow buffer.
- * - Segment size limited by largest NFS data payload.
+ * - Reply chunk does not overflow Receive buffer.
+ * - Chunk size limited by largest NFS data payload.
*
- * Returns pointer to the following RPC header.
+ * Return values:
+ * %true: Reply chunk is valid. @rctxt's xdr_stream is updated
+ * to point to the first byte past the Reply chunk.
+ * %false: Reply chunk is corrupt. @rctxt's xdr_stream is left
+ * in an unknown state.
*/
-static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end)
+static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt)
{
- if (*p++ != xdr_zero) {
- p = xdr_check_write_chunk(p, end, MAX_BYTES_SPECIAL_SEG);
- if (!p)
- return NULL;
+ __be32 *p;
+
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+ if (!p)
+ return false;
+ rctxt->rc_reply_chunk = NULL;
+ if (xdr_item_is_present(p)) {
+ if (!xdr_check_write_chunk(rctxt, MAX_BYTES_SPECIAL_CHUNK))
+ return false;
+ rctxt->rc_reply_chunk = p;
}
- return p;
+ return true;
}
/* RPC-over-RDMA Version One private extension: Remote Invalidation.
@@ -524,7 +572,7 @@
p += rpcrdma_fixed_maxsz;
/* Read list */
- while (*p++ != xdr_zero) {
+ while (xdr_item_is_present(p++)) {
p++; /* position */
if (inv_rkey == xdr_zero)
inv_rkey = *p;
@@ -534,7 +582,7 @@
}
/* Write list */
- while (*p++ != xdr_zero) {
+ while (xdr_item_is_present(p++)) {
segcount = be32_to_cpup(p++);
for (i = 0; i < segcount; i++) {
if (inv_rkey == xdr_zero)
@@ -546,7 +594,7 @@
}
/* Reply chunk */
- if (*p++ != xdr_zero) {
+ if (xdr_item_is_present(p++)) {
segcount = be32_to_cpup(p++);
for (i = 0; i < segcount; i++) {
if (inv_rkey == xdr_zero)
@@ -560,83 +608,84 @@
ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey);
}
-/* On entry, xdr->head[0].iov_base points to first byte in the
- * RPC-over-RDMA header.
+/**
+ * svc_rdma_xdr_decode_req - Decode the transport header
+ * @rq_arg: xdr_buf containing ingress RPC/RDMA message
+ * @rctxt: state of decoding
+ *
+ * On entry, xdr->head[0].iov_base points to first byte of the
+ * RPC-over-RDMA transport header.
*
* On successful exit, head[0] points to first byte past the
* RPC-over-RDMA header. For RDMA_MSG, this is the RPC message.
+ *
* The length of the RPC-over-RDMA header is returned.
*
* Assumptions:
* - The transport header is entirely contained in the head iovec.
*/
-static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
+static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg,
+ struct svc_rdma_recv_ctxt *rctxt)
{
- __be32 *p, *end, *rdma_argp;
+ __be32 *p, *rdma_argp;
unsigned int hdr_len;
- /* Verify that there's enough bytes for header + something */
- if (rq_arg->len <= RPCRDMA_HDRLEN_ERR)
- goto out_short;
-
rdma_argp = rq_arg->head[0].iov_base;
- if (*(rdma_argp + 1) != rpcrdma_version)
- goto out_version;
+ xdr_init_decode(&rctxt->rc_stream, rq_arg, rdma_argp, NULL);
- switch (*(rdma_argp + 3)) {
+ p = xdr_inline_decode(&rctxt->rc_stream,
+ rpcrdma_fixed_maxsz * sizeof(*p));
+ if (unlikely(!p))
+ goto out_short;
+ p++;
+ if (*p != rpcrdma_version)
+ goto out_version;
+ p += 2;
+ switch (*p) {
case rdma_msg:
break;
case rdma_nomsg:
break;
-
case rdma_done:
goto out_drop;
-
case rdma_error:
goto out_drop;
-
default:
goto out_proc;
}
- end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len);
- p = xdr_check_read_list(rdma_argp + 4, end);
- if (!p)
+ if (!xdr_check_read_list(rctxt))
goto out_inval;
- p = xdr_check_write_list(p, end);
- if (!p)
+ if (!xdr_check_write_list(rctxt))
goto out_inval;
- p = xdr_check_reply_chunk(p, end);
- if (!p)
- goto out_inval;
- if (p > end)
+ if (!xdr_check_reply_chunk(rctxt))
goto out_inval;
- rq_arg->head[0].iov_base = p;
- hdr_len = (unsigned long)p - (unsigned long)rdma_argp;
+ rq_arg->head[0].iov_base = rctxt->rc_stream.p;
+ hdr_len = xdr_stream_pos(&rctxt->rc_stream);
rq_arg->head[0].iov_len -= hdr_len;
rq_arg->len -= hdr_len;
- trace_svcrdma_decode_rqst(rdma_argp, hdr_len);
+ trace_svcrdma_decode_rqst(rctxt, rdma_argp, hdr_len);
return hdr_len;
out_short:
- trace_svcrdma_decode_short(rq_arg->len);
+ trace_svcrdma_decode_short_err(rctxt, rq_arg->len);
return -EINVAL;
out_version:
- trace_svcrdma_decode_badvers(rdma_argp);
+ trace_svcrdma_decode_badvers_err(rctxt, rdma_argp);
return -EPROTONOSUPPORT;
out_drop:
- trace_svcrdma_decode_drop(rdma_argp);
+ trace_svcrdma_decode_drop_err(rctxt, rdma_argp);
return 0;
out_proc:
- trace_svcrdma_decode_badproc(rdma_argp);
+ trace_svcrdma_decode_badproc_err(rctxt, rdma_argp);
return -EINVAL;
out_inval:
- trace_svcrdma_decode_parse(rdma_argp);
+ trace_svcrdma_decode_parse_err(rctxt, rdma_argp);
return -EINVAL;
}
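The decode path now reads the fixed header through the xdr_stream and triages on version and procedure before touching the chunk lists. Below is a simplified host-order sketch of that triage; the procedure numbering follows RPC-over-RDMA version 1 and is an assumption here rather than something shown in this hunk.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

enum { rdma_msg = 0, rdma_nomsg = 1, rdma_done = 3, rdma_error = 4 };
enum { RPCRDMA_VERSION = 1 };

/* hdr[] = { xid, version, credits, proc }; returns 0 to keep parsing,
 * 1 to drop the message silently, or a negative errno. */
static int decode_hdr(const uint32_t hdr[4])
{
        if (hdr[1] != RPCRDMA_VERSION)
                return -EPROTONOSUPPORT;        /* out_version */

        switch (hdr[3]) {
        case rdma_msg:
        case rdma_nomsg:
                return 0;                       /* chunk lists follow */
        case rdma_done:
        case rdma_error:
                return 1;                       /* out_drop */
        default:
                return -EINVAL;                 /* out_proc */
        }
}

int main(void)
{
        uint32_t hdr[4] = { 0xdeadbeef, RPCRDMA_VERSION, 128, rdma_msg };

        printf("decode: %d\n", decode_hdr(hdr));
        return 0;
}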
@@ -669,41 +718,16 @@
rqstp->rq_arg.buflen = head->rc_arg.buflen;
}
-static void svc_rdma_send_error(struct svcxprt_rdma *xprt,
- __be32 *rdma_argp, int status)
+static void svc_rdma_send_error(struct svcxprt_rdma *rdma,
+ struct svc_rdma_recv_ctxt *rctxt,
+ int status)
{
- struct svc_rdma_send_ctxt *ctxt;
- unsigned int length;
- __be32 *p;
- int ret;
+ struct svc_rdma_send_ctxt *sctxt;
- ctxt = svc_rdma_send_ctxt_get(xprt);
- if (!ctxt)
+ sctxt = svc_rdma_send_ctxt_get(rdma);
+ if (!sctxt)
return;
-
- p = ctxt->sc_xprt_buf;
- *p++ = *rdma_argp;
- *p++ = *(rdma_argp + 1);
- *p++ = xprt->sc_fc_credits;
- *p++ = rdma_error;
- switch (status) {
- case -EPROTONOSUPPORT:
- *p++ = err_vers;
- *p++ = rpcrdma_version;
- *p++ = rpcrdma_version;
- trace_svcrdma_err_vers(*rdma_argp);
- break;
- default:
- *p++ = err_chunk;
- trace_svcrdma_err_chunk(*rdma_argp);
- }
- length = (unsigned long)p - (unsigned long)ctxt->sc_xprt_buf;
- svc_rdma_sync_reply_hdr(xprt, ctxt, length);
-
- ctxt->sc_send_wr.opcode = IB_WR_SEND;
- ret = svc_rdma_send(xprt, &ctxt->sc_send_wr);
- if (ret)
- svc_rdma_send_ctxt_put(xprt, ctxt);
+ svc_rdma_send_error_msg(rdma, sctxt, rctxt, status);
}
/* By convention, backchannel calls arrive via rdma_msg type
@@ -810,7 +834,7 @@
rqstp->rq_next_page = rqstp->rq_respages;
p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
- ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg);
+ ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt);
if (ret < 0)
goto out_err;
if (ret == 0)
@@ -839,13 +863,13 @@
return 0;
out_err:
- svc_rdma_send_error(rdma_xprt, p, ret);
+ svc_rdma_send_error(rdma_xprt, ctxt, ret);
svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
return 0;
out_postfail:
if (ret == -EINVAL)
- svc_rdma_send_error(rdma_xprt, p, ret);
+ svc_rdma_send_error(rdma_xprt, ctxt, ret);
svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
return ret;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 0bb3f0d..80a0c0e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -7,15 +7,13 @@
#include <rdma/rw.h>
+#include <linux/sunrpc/xdr.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_rdma.h>
-#include <linux/sunrpc/debug.h>
#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>
-#define RPCDBG_FACILITY RPCDBG_SVCXPRT
-
static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc);
static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc);
@@ -39,9 +37,9 @@
struct svc_rdma_rw_ctxt {
struct list_head rw_list;
struct rdma_rw_ctx rw_ctx;
- int rw_nents;
+ unsigned int rw_nents;
struct sg_table rw_sg_table;
- struct scatterlist rw_first_sgl[0];
+ struct scatterlist rw_first_sgl[];
};
static inline struct svc_rdma_rw_ctxt *
@@ -67,19 +65,22 @@
ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE),
GFP_KERNEL);
if (!ctxt)
- goto out;
+ goto out_noctx;
INIT_LIST_HEAD(&ctxt->rw_list);
}
ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
ctxt->rw_sg_table.sgl,
- SG_CHUNK_SIZE)) {
- kfree(ctxt);
- ctxt = NULL;
- }
-out:
+ SG_CHUNK_SIZE))
+ goto out_free;
return ctxt;
+
+out_free:
+ kfree(ctxt);
+out_noctx:
+ trace_svcrdma_no_rwctx_err(rdma, sges);
+ return NULL;
}
static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
@@ -107,8 +108,36 @@
}
}
+/**
+ * svc_rdma_rw_ctx_init - Prepare a R/W context for I/O
+ * @rdma: controlling transport instance
+ * @ctxt: R/W context to prepare
+ * @offset: RDMA offset
+ * @handle: RDMA tag/handle
+ * @direction: I/O direction
+ *
+ * Returns, on success, the number of WQEs that will be needed
+ * on the workqueue, or a negative errno.
+ */
+static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma,
+ struct svc_rdma_rw_ctxt *ctxt,
+ u64 offset, u32 handle,
+ enum dma_data_direction direction)
+{
+ int ret;
+
+ ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num,
+ ctxt->rw_sg_table.sgl, ctxt->rw_nents,
+ 0, offset, handle, direction);
+ if (unlikely(ret < 0)) {
+ svc_rdma_put_rw_ctxt(rdma, ctxt);
+ trace_svcrdma_dma_map_rw_err(rdma, ctxt->rw_nents, ret);
+ }
+ return ret;
+}
+
/* A chunk context tracks all I/O for moving one Read or Write
- * chunk. This is a a set of rdma_rw's that handle data movement
+ * chunk. This is a set of rdma_rw's that handle data movement
* for all segments of one chunk.
*
* These are small, acquired with a single allocator call, and
@@ -116,17 +145,25 @@
* demand, and not cached.
*/
struct svc_rdma_chunk_ctxt {
+ struct rpc_rdma_cid cc_cid;
struct ib_cqe cc_cqe;
struct svcxprt_rdma *cc_rdma;
struct list_head cc_rwctxts;
int cc_sqecount;
};
+static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma,
+ struct rpc_rdma_cid *cid)
+{
+ cid->ci_queue_id = rdma->sc_sq_cq->res.id;
+ cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
+}
+
static void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
struct svc_rdma_chunk_ctxt *cc)
{
+ svc_rdma_cc_cid_init(rdma, &cc->cc_cid);
cc->cc_rdma = rdma;
- svc_xprt_get(&rdma->sc_xprt);
INIT_LIST_HEAD(&cc->cc_rwctxts);
cc->cc_sqecount = 0;
@@ -146,7 +183,6 @@
ctxt->rw_nents, dir);
svc_rdma_put_rw_ctxt(rdma, ctxt);
}
- svc_xprt_put(&rdma->sc_xprt);
}
/* State for sending a Write or Reply chunk.
@@ -208,7 +244,7 @@
struct svc_rdma_write_info *info =
container_of(cc, struct svc_rdma_write_info, wi_cc);
- trace_svcrdma_wc_write(wc);
+ trace_svcrdma_wc_write(wc, &cc->cc_cid);
atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
wake_up(&rdma->sc_send_wait);
@@ -266,7 +302,7 @@
struct svc_rdma_read_info *info =
container_of(cc, struct svc_rdma_read_info, ri_cc);
- trace_svcrdma_wc_read(wc);
+ trace_svcrdma_wc_read(wc, &cc->cc_cid);
atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
wake_up(&rdma->sc_send_wait);
@@ -322,6 +358,7 @@
do {
if (atomic_sub_return(cc->cc_sqecount,
&rdma->sc_sq_avail) > 0) {
+ trace_svcrdma_post_chunk(&cc->cc_cid, cc->cc_sqecount);
ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
if (ret)
break;
@@ -413,35 +450,32 @@
seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz;
do {
unsigned int write_len;
- u32 seg_length, seg_handle;
- u64 seg_offset;
+ u32 handle, length;
+ u64 offset;
if (info->wi_seg_no >= info->wi_nsegs)
goto out_overflow;
- seg_handle = be32_to_cpup(seg);
- seg_length = be32_to_cpup(seg + 1);
- xdr_decode_hyper(seg + 2, &seg_offset);
- seg_offset += info->wi_seg_off;
+ xdr_decode_rdma_segment(seg, &handle, &length, &offset);
+ offset += info->wi_seg_off;
- write_len = min(remaining, seg_length - info->wi_seg_off);
+ write_len = min(remaining, length - info->wi_seg_off);
ctxt = svc_rdma_get_rw_ctxt(rdma,
(write_len >> PAGE_SHIFT) + 2);
if (!ctxt)
- goto out_noctx;
+ return -ENOMEM;
constructor(info, write_len, ctxt);
- ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp,
- rdma->sc_port_num, ctxt->rw_sg_table.sgl,
- ctxt->rw_nents, 0, seg_offset,
- seg_handle, DMA_TO_DEVICE);
+ ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, handle,
+ DMA_TO_DEVICE);
if (ret < 0)
- goto out_initerr;
+ return -EIO;
- trace_svcrdma_encode_wseg(seg_handle, write_len, seg_offset);
+ trace_svcrdma_send_wseg(handle, write_len, offset);
+
list_add(&ctxt->rw_list, &cc->cc_rwctxts);
cc->cc_sqecount += ret;
- if (write_len == seg_length - info->wi_seg_off) {
+ if (write_len == length - info->wi_seg_off) {
seg += 4;
info->wi_seg_no++;
info->wi_seg_off = 0;
@@ -454,18 +488,9 @@
return 0;
out_overflow:
- dprintk("svcrdma: inadequate space in Write chunk (%u)\n",
- info->wi_nsegs);
+ trace_svcrdma_small_wrch_err(rdma, remaining, info->wi_seg_no,
+ info->wi_nsegs);
return -E2BIG;
-
-out_noctx:
- dprintk("svcrdma: no R/W ctxs available\n");
- return -ENOMEM;
-
-out_initerr:
- svc_rdma_put_rw_ctxt(rdma, ctxt);
- trace_svcrdma_dma_map_rwctx(rdma, ret);
- return -EIO;
}
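The out_noctx and out_initerr exit labels disappear here because svc_rdma_get_rw_ctxt() and svc_rdma_rw_ctx_init() now trace and clean up their own failures. Condensed from the hunks above, the resulting call-site shape is roughly the following (the function name is invented for illustration; the error values are the ones used in the patch):

static int example_build_one_write_segment(struct svcxprt_rdma *rdma,
					   struct svc_rdma_chunk_ctxt *cc,
					   u64 offset, u32 handle,
					   unsigned int nsges)
{
	struct svc_rdma_rw_ctxt *ctxt;
	int ret;

	ctxt = svc_rdma_get_rw_ctxt(rdma, nsges);
	if (!ctxt)
		return -ENOMEM;	/* trace_svcrdma_no_rwctx_err already fired */

	ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, handle, DMA_TO_DEVICE);
	if (ret < 0)
		return -EIO;	/* ctxt was released and the error traced */

	list_add(&ctxt->rw_list, &cc->cc_rwctxts);
	cc->cc_sqecount += ret;
	return 0;
}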
/* Send one of an xdr_buf's kvecs by itself. To send a Reply
@@ -533,7 +558,7 @@
if (ret < 0)
goto out_err;
- trace_svcrdma_encode_write(xdr->page_len);
+ trace_svcrdma_send_write_chunk(xdr->page_len);
return length;
out_err:
@@ -544,8 +569,7 @@
/**
* svc_rdma_send_reply_chunk - Write all segments in the Reply chunk
* @rdma: controlling RDMA transport
- * @rp_ch: Reply chunk provided by client
- * @writelist: true if client provided a Write list
+ * @rctxt: Write and Reply chunks from client
* @xdr: xdr_buf containing an RPC Reply
*
* Returns a non-negative number of bytes the chunk consumed, or
@@ -555,13 +579,14 @@
* %-ENOTCONN if posting failed (connection is lost),
* %-EIO if rdma_rw initialization failed (DMA mapping, etc).
*/
-int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch,
- bool writelist, struct xdr_buf *xdr)
+int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma,
+ const struct svc_rdma_recv_ctxt *rctxt,
+ struct xdr_buf *xdr)
{
struct svc_rdma_write_info *info;
int consumed, ret;
- info = svc_rdma_write_info_alloc(rdma, rp_ch);
+ info = svc_rdma_write_info_alloc(rdma, rctxt->rc_reply_chunk);
if (!info)
return -ENOMEM;
@@ -573,7 +598,7 @@
/* Send the page list in the Reply chunk only if the
* client did not provide Write chunks.
*/
- if (!writelist && xdr->page_len) {
+ if (!rctxt->rc_write_list && xdr->page_len) {
ret = svc_rdma_send_xdr_pagelist(info, xdr,
xdr->head[0].iov_len,
xdr->page_len);
@@ -593,7 +618,7 @@
if (ret < 0)
goto out_err;
- trace_svcrdma_encode_reply(consumed);
+ trace_svcrdma_send_reply_chunk(consumed);
return consumed;
out_err:
@@ -615,7 +640,7 @@
sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT;
ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no);
if (!ctxt)
- goto out_noctx;
+ return -ENOMEM;
ctxt->rw_nents = sge_no;
sg = ctxt->rw_sg_table.sgl;
@@ -645,29 +670,18 @@
goto out_overrun;
}
- ret = rdma_rw_ctx_init(&ctxt->rw_ctx, cc->cc_rdma->sc_qp,
- cc->cc_rdma->sc_port_num,
- ctxt->rw_sg_table.sgl, ctxt->rw_nents,
- 0, offset, rkey, DMA_FROM_DEVICE);
+ ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, offset, rkey,
+ DMA_FROM_DEVICE);
if (ret < 0)
- goto out_initerr;
+ return -EIO;
list_add(&ctxt->rw_list, &cc->cc_rwctxts);
cc->cc_sqecount += ret;
return 0;
-out_noctx:
- dprintk("svcrdma: no R/W ctxs available\n");
- return -ENOMEM;
-
out_overrun:
- dprintk("svcrdma: request overruns rq_pages\n");
+ trace_svcrdma_page_overrun_err(cc->cc_rdma, rqstp, info->ri_pageno);
return -EINVAL;
-
-out_initerr:
- trace_svcrdma_dma_map_rwctx(cc->cc_rdma, ret);
- svc_rdma_put_rw_ctxt(cc->cc_rdma, ctxt);
- return -EIO;
}
/* Walk the segments in the Read chunk starting at @p and construct
@@ -682,21 +696,17 @@
ret = -EINVAL;
info->ri_chunklen = 0;
while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) {
- u32 rs_handle, rs_length;
- u64 rs_offset;
+ u32 handle, length;
+ u64 offset;
- rs_handle = be32_to_cpup(p++);
- rs_length = be32_to_cpup(p++);
- p = xdr_decode_hyper(p, &rs_offset);
-
- ret = svc_rdma_build_read_segment(info, rqstp,
- rs_handle, rs_length,
- rs_offset);
+ p = xdr_decode_rdma_segment(p, &handle, &length, &offset);
+ ret = svc_rdma_build_read_segment(info, rqstp, handle, length,
+ offset);
if (ret < 0)
break;
- trace_svcrdma_encode_rseg(rs_handle, rs_length, rs_offset);
- info->ri_chunklen += rs_length;
+ trace_svcrdma_send_rseg(handle, length, offset);
+ info->ri_chunklen += length;
}
return ret;
@@ -720,7 +730,7 @@
if (ret < 0)
goto out;
- trace_svcrdma_encode_read(info->ri_chunklen, info->ri_position);
+ trace_svcrdma_send_read_chunk(info->ri_chunklen, info->ri_position);
head->rc_hdr_count = 0;
@@ -776,7 +786,7 @@
if (ret < 0)
goto out;
- trace_svcrdma_encode_pzr(info->ri_chunklen);
+ trace_svcrdma_send_pzr(info->ri_chunklen);
head->rc_arg.len += info->ri_chunklen;
head->rc_arg.buflen += info->ri_chunklen;
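Both the Write-list and Read-chunk loops above now call xdr_decode_rdma_segment() instead of open-coding the segment decode. The helper's body is not part of this diff; judging from the removed lines, it should be equivalent to the following sketch:

/* Assumed behaviour only: decode one RDMA segment (handle, length,
 * 64-bit offset) and return the position just past it, mirroring the
 * be32_to_cpup()/xdr_decode_hyper() sequence the old code ran inline.
 */
static inline __be32 *example_decode_rdma_segment(__be32 *p, u32 *handle,
						  u32 *length, u64 *offset)
{
	*handle = be32_to_cpup(p++);
	*length = be32_to_cpup(p++);
	return xdr_decode_hyper(p, offset);
}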
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 25e8922..c3d588b 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -106,7 +106,6 @@
#include <rdma/rdma_cm.h>
#include <linux/sunrpc/debug.h>
-#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_rdma.h>
#include "xprt_rdma.h"
@@ -123,6 +122,13 @@
sc_list);
}
+static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma,
+ struct rpc_rdma_cid *cid)
+{
+ cid->ci_queue_id = rdma->sc_sq_cq->res.id;
+ cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
+}
+
static struct svc_rdma_send_ctxt *
svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
{
@@ -145,12 +151,16 @@
if (ib_dma_mapping_error(rdma->sc_pd->device, addr))
goto fail2;
+ svc_rdma_send_cid_init(rdma, &ctxt->sc_cid);
+
ctxt->sc_send_wr.next = NULL;
ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe;
ctxt->sc_send_wr.sg_list = ctxt->sc_sges;
ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED;
ctxt->sc_cqe.done = svc_rdma_wc_send;
ctxt->sc_xprt_buf = buffer;
+ xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf,
+ rdma->sc_max_req_size);
ctxt->sc_sges[0].addr = addr;
for (i = 0; i < rdma->sc_max_send_sges; i++)
@@ -204,6 +214,10 @@
spin_unlock(&rdma->sc_send_lock);
out:
+ rpcrdma_set_xdrlen(&ctxt->sc_hdrbuf, 0);
+ xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf,
+ ctxt->sc_xprt_buf, NULL);
+
ctxt->sc_send_wr.num_sge = 0;
ctxt->sc_cur_sge_no = 0;
ctxt->sc_page_count = 0;
@@ -233,11 +247,15 @@
/* The first SGE contains the transport header, which
* remains mapped until @ctxt is destroyed.
*/
- for (i = 1; i < ctxt->sc_send_wr.num_sge; i++)
+ for (i = 1; i < ctxt->sc_send_wr.num_sge; i++) {
ib_dma_unmap_page(device,
ctxt->sc_sges[i].addr,
ctxt->sc_sges[i].length,
DMA_TO_DEVICE);
+ trace_svcrdma_dma_unmap_page(rdma,
+ ctxt->sc_sges[i].addr,
+ ctxt->sc_sges[i].length);
+ }
for (i = 0; i < ctxt->sc_page_count; ++i)
put_page(ctxt->sc_pages[i]);
@@ -259,38 +277,43 @@
{
struct svcxprt_rdma *rdma = cq->cq_context;
struct ib_cqe *cqe = wc->wr_cqe;
- struct svc_rdma_send_ctxt *ctxt;
+ struct svc_rdma_send_ctxt *ctxt =
+ container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe);
- trace_svcrdma_wc_send(wc);
+ trace_svcrdma_wc_send(wc, &ctxt->sc_cid);
atomic_inc(&rdma->sc_sq_avail);
wake_up(&rdma->sc_send_wait);
- ctxt = container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe);
svc_rdma_send_ctxt_put(rdma, ctxt);
if (unlikely(wc->status != IB_WC_SUCCESS)) {
set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
svc_xprt_enqueue(&rdma->sc_xprt);
}
-
- svc_xprt_put(&rdma->sc_xprt);
}
/**
* svc_rdma_send - Post a single Send WR
* @rdma: transport on which to post the WR
- * @wr: prepared Send WR to post
+ * @ctxt: send ctxt with a Send WR ready to post
*
* Returns zero if the Send WR was posted successfully. Otherwise, a
* negative errno is returned.
*/
-int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr)
+int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt)
{
+ struct ib_send_wr *wr = &ctxt->sc_send_wr;
int ret;
might_sleep();
+ /* Sync the transport header buffer */
+ ib_dma_sync_single_for_device(rdma->sc_pd->device,
+ wr->sg_list[0].addr,
+ wr->sg_list[0].length,
+ DMA_TO_DEVICE);
+
/* If the SQ is full, wait until an SQ entry is available */
while (1) {
if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) {
@@ -305,8 +328,7 @@
continue;
}
- svc_xprt_get(&rdma->sc_xprt);
- trace_svcrdma_post_send(wr);
+ trace_svcrdma_post_send(ctxt);
ret = ib_post_send(rdma->sc_qp, wr, NULL);
if (ret)
break;
@@ -315,171 +337,173 @@
trace_svcrdma_sq_post_err(rdma, ret);
set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
- svc_xprt_put(&rdma->sc_xprt);
wake_up(&rdma->sc_send_wait);
return ret;
}
-static u32 xdr_padsize(u32 len)
-{
- return (len & 3) ? (4 - (len & 3)) : 0;
-}
-
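The local xdr_padsize() helper above is dropped in favor of the generic xdr_pad_size(), which the later hunks call from svc_rdma_pull_up_reply_msg() and svc_rdma_map_reply_msg(). The working assumption is that both compute the same value: the number of pad bytes needed to round a length up to the 4-byte XDR unit.

/* Assumed equivalence, shown for illustration only: */
static inline u32 example_xdr_pad_size(u32 len)
{
	return (len & 3) ? (4 - (len & 3)) : 0;
}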
-/* Returns length of transport header, in bytes.
- */
-static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp)
-{
- unsigned int nsegs;
- __be32 *p;
-
- p = rdma_resp;
-
- /* RPC-over-RDMA V1 replies never have a Read list. */
- p += rpcrdma_fixed_maxsz + 1;
-
- /* Skip Write list. */
- while (*p++ != xdr_zero) {
- nsegs = be32_to_cpup(p++);
- p += nsegs * rpcrdma_segment_maxsz;
- }
-
- /* Skip Reply chunk. */
- if (*p++ != xdr_zero) {
- nsegs = be32_to_cpup(p++);
- p += nsegs * rpcrdma_segment_maxsz;
- }
-
- return (unsigned long)p - (unsigned long)rdma_resp;
-}
-
-/* One Write chunk is copied from Call transport header to Reply
- * transport header. Each segment's length field is updated to
- * reflect number of bytes consumed in the segment.
+/**
+ * svc_rdma_encode_read_list - Encode RPC Reply's Read chunk list
+ * @sctxt: Send context for the RPC Reply
*
- * Returns number of segments in this chunk.
+ * Return values:
+ * On success, returns length in bytes of the Reply XDR buffer
+ * that was consumed by the Reply Read list
+ * %-EMSGSIZE on XDR buffer overflow
*/
-static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src,
+static ssize_t svc_rdma_encode_read_list(struct svc_rdma_send_ctxt *sctxt)
+{
+ /* RPC-over-RDMA version 1 replies never have a Read list. */
+ return xdr_stream_encode_item_absent(&sctxt->sc_stream);
+}
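xdr_stream_encode_item_absent() and its counterpart xdr_stream_encode_item_present() come from the generic XDR layer and are not shown in this patch. The encoders below assume that each one reserves a single 32-bit word, writes the list discriminator (xdr_zero or xdr_one), and returns the number of bytes consumed or -EMSGSIZE, which is why they simply accumulate the returned lengths. A sketch of that assumed behaviour:

static inline ssize_t example_encode_item_absent(struct xdr_stream *xdr)
{
	__be32 *p = xdr_reserve_space(xdr, sizeof(*p));

	if (!p)
		return -EMSGSIZE;
	*p = xdr_zero;		/* xdr_one for the "present" variant */
	return sizeof(*p);
}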
+
+/**
+ * svc_rdma_encode_write_segment - Encode one Write segment
+ * @src: matching Write chunk in the RPC Call header
+ * @sctxt: Send context for the RPC Reply
+ * @remaining: remaining bytes of the payload left in the Write chunk
+ *
+ * Return values:
+ * On success, returns length in bytes of the Reply XDR buffer
+ * that was consumed by the Write segment
+ * %-EMSGSIZE on XDR buffer overflow
+ */
+static ssize_t svc_rdma_encode_write_segment(__be32 *src,
+ struct svc_rdma_send_ctxt *sctxt,
+ unsigned int *remaining)
+{
+ __be32 *p;
+ const size_t len = rpcrdma_segment_maxsz * sizeof(*p);
+ u32 handle, length;
+ u64 offset;
+
+ p = xdr_reserve_space(&sctxt->sc_stream, len);
+ if (!p)
+ return -EMSGSIZE;
+
+ xdr_decode_rdma_segment(src, &handle, &length, &offset);
+
+ if (*remaining < length) {
+ /* segment only partly filled */
+ length = *remaining;
+ *remaining = 0;
+ } else {
+ /* entire segment was consumed */
+ *remaining -= length;
+ }
+ xdr_encode_rdma_segment(p, handle, length, offset);
+
+ trace_svcrdma_encode_wseg(handle, length, offset);
+ return len;
+}
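xdr_encode_rdma_segment() is the encoding counterpart of the decode helper used in svc_rdma_rw.c. Its body is likewise outside this diff; based on the open-coded copy loop it replaces (removed further down in this hunk), it presumably packs a handle, a length, and a 64-bit offset back into four XDR words:

/* Assumed layout only -- handle, length, then the offset as a hyper: */
static inline __be32 *example_encode_rdma_segment(__be32 *p, u32 handle,
						  u32 length, u64 offset)
{
	*p++ = cpu_to_be32(handle);
	*p++ = cpu_to_be32(length);
	return xdr_encode_hyper(p, offset);
}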
+
+/**
+ * svc_rdma_encode_write_chunk - Encode one Write chunk
+ * @src: matching Write chunk in the RPC Call header
+ * @sctxt: Send context for the RPC Reply
+ * @remaining: size in bytes of the payload in the Write chunk
+ *
+ * Copy a Write chunk from the Call transport header to the
+ * Reply transport header. Update each segment's length field
+ * to reflect the number of bytes written in that segment.
+ *
+ * Return values:
+ * On success, returns length in bytes of the Reply XDR buffer
+ * that was consumed by the Write chunk
+ * %-EMSGSIZE on XDR buffer overflow
+ */
+static ssize_t svc_rdma_encode_write_chunk(__be32 *src,
+ struct svc_rdma_send_ctxt *sctxt,
unsigned int remaining)
{
unsigned int i, nsegs;
- u32 seg_len;
+ ssize_t len, ret;
- /* Write list discriminator */
- *dst++ = *src++;
+ len = 0;
+ trace_svcrdma_encode_write_chunk(remaining);
- /* number of segments in this chunk */
- nsegs = be32_to_cpup(src);
- *dst++ = *src++;
+ src++;
+ ret = xdr_stream_encode_item_present(&sctxt->sc_stream);
+ if (ret < 0)
+ return -EMSGSIZE;
+ len += ret;
+
+ nsegs = be32_to_cpup(src++);
+ ret = xdr_stream_encode_u32(&sctxt->sc_stream, nsegs);
+ if (ret < 0)
+ return -EMSGSIZE;
+ len += ret;
for (i = nsegs; i; i--) {
- /* segment's RDMA handle */
- *dst++ = *src++;
-
- /* bytes returned in this segment */
- seg_len = be32_to_cpu(*src);
- if (remaining >= seg_len) {
- /* entire segment was consumed */
- *dst = *src;
- remaining -= seg_len;
- } else {
- /* segment only partly filled */
- *dst = cpu_to_be32(remaining);
- remaining = 0;
- }
- dst++; src++;
-
- /* segment's RDMA offset */
- *dst++ = *src++;
- *dst++ = *src++;
+ ret = svc_rdma_encode_write_segment(src, sctxt, &remaining);
+ if (ret < 0)
+ return -EMSGSIZE;
+ src += rpcrdma_segment_maxsz;
+ len += ret;
}
- return nsegs;
+ return len;
}
-/* The client provided a Write list in the Call message. Fill in
- * the segments in the first Write chunk in the Reply's transport
+/**
+ * svc_rdma_encode_write_list - Encode RPC Reply's Write chunk list
+ * @rctxt: Reply context with information about the RPC Call
+ * @sctxt: Send context for the RPC Reply
+ * @length: size in bytes of the payload in the first Write chunk
+ *
+ * The client provides a Write chunk list in the Call message. Fill
+ * in the segments in the first Write chunk in the Reply's transport
* header with the number of bytes consumed in each segment.
* Remaining chunks are returned unused.
*
* Assumptions:
* - Client has provided only one Write chunk
+ *
+ * Return values:
+ * On success, returns length in bytes of the Reply XDR buffer
+ * that was consumed by the Reply's Write list
+ * %-EMSGSIZE on XDR buffer overflow
*/
-static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch,
- unsigned int consumed)
+static ssize_t
+svc_rdma_encode_write_list(const struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_send_ctxt *sctxt,
+ unsigned int length)
{
- unsigned int nsegs;
- __be32 *p, *q;
+ ssize_t len, ret;
- /* RPC-over-RDMA V1 replies never have a Read list. */
- p = rdma_resp + rpcrdma_fixed_maxsz + 1;
+ ret = svc_rdma_encode_write_chunk(rctxt->rc_write_list, sctxt, length);
+ if (ret < 0)
+ return ret;
+ len = ret;
- q = wr_ch;
- while (*q != xdr_zero) {
- nsegs = xdr_encode_write_chunk(p, q, consumed);
- q += 2 + nsegs * rpcrdma_segment_maxsz;
- p += 2 + nsegs * rpcrdma_segment_maxsz;
- consumed = 0;
- }
+ /* Terminate the Write list */
+ ret = xdr_stream_encode_item_absent(&sctxt->sc_stream);
+ if (ret < 0)
+ return ret;
- /* Terminate Write list */
- *p++ = xdr_zero;
-
- /* Reply chunk discriminator; may be replaced later */
- *p = xdr_zero;
+ return len + ret;
}
-/* The client provided a Reply chunk in the Call message. Fill in
- * the segments in the Reply chunk in the Reply message with the
- * number of bytes consumed in each segment.
+/**
+ * svc_rdma_encode_reply_chunk - Encode RPC Reply's Reply chunk
+ * @rctxt: Reply context with information about the RPC Call
+ * @sctxt: Send context for the RPC Reply
+ * @length: size in bytes of the payload in the Reply chunk
*
* Assumptions:
- * - Reply can always fit in the provided Reply chunk
+ * - Reply can always fit in the client-provided Reply chunk
+ *
+ * Return values:
+ * On success, returns length in bytes of the Reply XDR buffer
+ * that was consumed by the Reply's Reply chunk
+ * %-EMSGSIZE on XDR buffer overflow
*/
-static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch,
- unsigned int consumed)
+static ssize_t
+svc_rdma_encode_reply_chunk(const struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_send_ctxt *sctxt,
+ unsigned int length)
{
- __be32 *p;
-
- /* Find the Reply chunk in the Reply's xprt header.
- * RPC-over-RDMA V1 replies never have a Read list.
- */
- p = rdma_resp + rpcrdma_fixed_maxsz + 1;
-
- /* Skip past Write list */
- while (*p++ != xdr_zero)
- p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
-
- xdr_encode_write_chunk(p, rp_ch, consumed);
-}
-
-/* Parse the RPC Call's transport header.
- */
-static void svc_rdma_get_write_arrays(__be32 *rdma_argp,
- __be32 **write, __be32 **reply)
-{
- __be32 *p;
-
- p = rdma_argp + rpcrdma_fixed_maxsz;
-
- /* Read list */
- while (*p++ != xdr_zero)
- p += 5;
-
- /* Write list */
- if (*p != xdr_zero) {
- *write = p;
- while (*p++ != xdr_zero)
- p += 1 + be32_to_cpu(*p) * 4;
- } else {
- *write = NULL;
- p++;
- }
-
- /* Reply chunk */
- if (*p != xdr_zero)
- *reply = p;
- else
- *reply = NULL;
+ return svc_rdma_encode_write_chunk(rctxt->rc_reply_chunk, sctxt,
+ length);
}
static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
@@ -492,6 +516,7 @@
dma_addr_t dma_addr;
dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE);
+ trace_svcrdma_dma_map_page(rdma, dma_addr, len);
if (ib_dma_mapping_error(dev, dma_addr))
goto out_maperr;
@@ -501,7 +526,6 @@
return 0;
out_maperr:
- trace_svcrdma_dma_map_page(rdma, page);
return -EIO;
}
@@ -518,38 +542,36 @@
}
/**
- * svc_rdma_sync_reply_hdr - DMA sync the transport header buffer
+ * svc_rdma_pull_up_needed - Determine whether to use pull-up
* @rdma: controlling transport
- * @ctxt: send_ctxt for the Send WR
- * @len: length of transport header
+ * @sctxt: send_ctxt for the Send WR
+ * @rctxt: Write and Reply chunks provided by client
+ * @xdr: xdr_buf containing RPC message to transmit
*
- */
-void svc_rdma_sync_reply_hdr(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt,
- unsigned int len)
-{
- ctxt->sc_sges[0].length = len;
- ctxt->sc_send_wr.num_sge++;
- ib_dma_sync_single_for_device(rdma->sc_pd->device,
- ctxt->sc_sges[0].addr, len,
- DMA_TO_DEVICE);
-}
-
-/* If the xdr_buf has more elements than the device can
- * transmit in a single RDMA Send, then the reply will
- * have to be copied into a bounce buffer.
+ * Returns:
+ * %true if pull-up must be used
+ * %false otherwise
*/
static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
- struct xdr_buf *xdr,
- __be32 *wr_lst)
+ struct svc_rdma_send_ctxt *sctxt,
+ const struct svc_rdma_recv_ctxt *rctxt,
+ struct xdr_buf *xdr)
{
int elements;
+ /* For small messages, copying bytes is cheaper than DMA mapping.
+ */
+ if (sctxt->sc_hdrbuf.len + xdr->len < RPCRDMA_PULLUP_THRESH)
+ return true;
+
+ /* Check whether the xdr_buf has more elements than can
+ * fit in a single RDMA Send.
+ */
/* xdr->head */
elements = 1;
/* xdr->pages */
- if (!wr_lst) {
+ if (!rctxt || !rctxt->rc_write_list) {
unsigned int remaining;
unsigned long pageoff;
@@ -571,29 +593,36 @@
return elements >= rdma->sc_max_send_sges;
}
-/* The device is not capable of sending the reply directly.
- * Assemble the elements of @xdr into the transport header
- * buffer.
+/**
+ * svc_rdma_pull_up_reply_msg - Copy Reply into a single buffer
+ * @rdma: controlling transport
+ * @sctxt: send_ctxt for the Send WR; xprt hdr is already prepared
+ * @rctxt: Write and Reply chunks provided by client
+ * @xdr: prepared xdr_buf containing RPC message
+ *
+ * The device is not capable of sending the reply directly.
+ * Assemble the elements of @xdr into the transport header buffer.
+ *
+ * Returns zero on success, or a negative errno on failure.
*/
static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt,
- struct xdr_buf *xdr, __be32 *wr_lst)
+ struct svc_rdma_send_ctxt *sctxt,
+ const struct svc_rdma_recv_ctxt *rctxt,
+ const struct xdr_buf *xdr)
{
unsigned char *dst, *tailbase;
unsigned int taillen;
- dst = ctxt->sc_xprt_buf;
- dst += ctxt->sc_sges[0].length;
-
+ dst = sctxt->sc_xprt_buf + sctxt->sc_hdrbuf.len;
memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len);
dst += xdr->head[0].iov_len;
tailbase = xdr->tail[0].iov_base;
taillen = xdr->tail[0].iov_len;
- if (wr_lst) {
+ if (rctxt && rctxt->rc_write_list) {
u32 xdrpad;
- xdrpad = xdr_padsize(xdr->page_len);
+ xdrpad = xdr_pad_size(xdr->page_len);
if (taillen && xdrpad) {
tailbase += xdrpad;
taillen -= xdrpad;
@@ -620,29 +649,26 @@
if (taillen)
memcpy(dst, tailbase, taillen);
- ctxt->sc_sges[0].length += xdr->len;
- ib_dma_sync_single_for_device(rdma->sc_pd->device,
- ctxt->sc_sges[0].addr,
- ctxt->sc_sges[0].length,
- DMA_TO_DEVICE);
-
+ sctxt->sc_sges[0].length += xdr->len;
+ trace_svcrdma_send_pullup(sctxt->sc_sges[0].length);
return 0;
}
-/* svc_rdma_map_reply_msg - Map the buffer holding RPC message
+/* svc_rdma_map_reply_msg - DMA map the buffer holding RPC message
* @rdma: controlling transport
- * @ctxt: send_ctxt for the Send WR
+ * @sctxt: send_ctxt for the Send WR
+ * @rctxt: Write and Reply chunks provided by client
* @xdr: prepared xdr_buf containing RPC message
- * @wr_lst: pointer to Call header's Write list, or NULL
*
* Load the xdr_buf into the ctxt's sge array, and DMA map each
- * element as it is added.
+ * element as it is added. The Send WR's num_sge field is set.
*
* Returns zero on success, or a negative errno on failure.
*/
int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt,
- struct xdr_buf *xdr, __be32 *wr_lst)
+ struct svc_rdma_send_ctxt *sctxt,
+ const struct svc_rdma_recv_ctxt *rctxt,
+ struct xdr_buf *xdr)
{
unsigned int len, remaining;
unsigned long page_off;
@@ -651,11 +677,24 @@
u32 xdr_pad;
int ret;
- if (svc_rdma_pull_up_needed(rdma, xdr, wr_lst))
- return svc_rdma_pull_up_reply_msg(rdma, ctxt, xdr, wr_lst);
+ /* Set up the (persistently-mapped) transport header SGE. */
+ sctxt->sc_send_wr.num_sge = 1;
+ sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len;
- ++ctxt->sc_cur_sge_no;
- ret = svc_rdma_dma_map_buf(rdma, ctxt,
+ /* If there is a Reply chunk, nothing follows the transport
+ * header, and we're done here.
+ */
+ if (rctxt && rctxt->rc_reply_chunk)
+ return 0;
+
+ /* For pull-up, svc_rdma_send() will sync the transport header.
+ * No additional DMA mapping is necessary.
+ */
+ if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr))
+ return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr);
+
+ ++sctxt->sc_cur_sge_no;
+ ret = svc_rdma_dma_map_buf(rdma, sctxt,
xdr->head[0].iov_base,
xdr->head[0].iov_len);
if (ret < 0)
@@ -666,10 +705,10 @@
* have added XDR padding in the tail buffer, and that
* should not be included inline.
*/
- if (wr_lst) {
+ if (rctxt && rctxt->rc_write_list) {
base = xdr->tail[0].iov_base;
len = xdr->tail[0].iov_len;
- xdr_pad = xdr_padsize(xdr->page_len);
+ xdr_pad = xdr_pad_size(xdr->page_len);
if (len && xdr_pad) {
base += xdr_pad;
@@ -685,8 +724,8 @@
while (remaining) {
len = min_t(u32, PAGE_SIZE - page_off, remaining);
- ++ctxt->sc_cur_sge_no;
- ret = svc_rdma_dma_map_page(rdma, ctxt, *ppages++,
+ ++sctxt->sc_cur_sge_no;
+ ret = svc_rdma_dma_map_page(rdma, sctxt, *ppages++,
page_off, len);
if (ret < 0)
return ret;
@@ -699,8 +738,8 @@
len = xdr->tail[0].iov_len;
tail:
if (len) {
- ++ctxt->sc_cur_sge_no;
- ret = svc_rdma_dma_map_buf(rdma, ctxt, base, len);
+ ++sctxt->sc_cur_sge_no;
+ ret = svc_rdma_dma_map_buf(rdma, sctxt, base, len);
if (ret < 0)
return ret;
}
@@ -747,18 +786,14 @@
*/
static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *sctxt,
- struct svc_rdma_recv_ctxt *rctxt,
- struct svc_rqst *rqstp,
- __be32 *wr_lst, __be32 *rp_ch)
+ const struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rqst *rqstp)
{
int ret;
- if (!rp_ch) {
- ret = svc_rdma_map_reply_msg(rdma, sctxt,
- &rqstp->rq_res, wr_lst);
- if (ret < 0)
- return ret;
- }
+ ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqstp->rq_res);
+ if (ret < 0)
+ return ret;
svc_rdma_save_io_pages(rqstp, sctxt);
@@ -768,42 +803,76 @@
} else {
sctxt->sc_send_wr.opcode = IB_WR_SEND;
}
- dprintk("svcrdma: posting Send WR with %u sge(s)\n",
- sctxt->sc_send_wr.num_sge);
- return svc_rdma_send(rdma, &sctxt->sc_send_wr);
+ return svc_rdma_send(rdma, sctxt);
}
-/* Given the client-provided Write and Reply chunks, the server was not
- * able to form a complete reply. Return an RDMA_ERROR message so the
- * client can retire this RPC transaction. As above, the Send completion
- * routine releases payload pages that were part of a previous RDMA Write.
+/**
+ * svc_rdma_send_error_msg - Send an RPC/RDMA v1 error response
+ * @rdma: controlling transport context
+ * @sctxt: Send context for the response
+ * @rctxt: Receive context for incoming bad message
+ * @status: negative errno indicating error that occurred
*
- * Remote Invalidation is skipped for simplicity.
+ * Given the client-provided Read, Write, and Reply chunks, the
+ * server was not able to parse the Call or form a complete Reply.
+ * Return an RDMA_ERROR message so the client can retire the RPC
+ * transaction.
+ *
+ * The caller does not have to release @sctxt. It is released by
+ * Send completion, or by this function on error.
*/
-static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt,
- struct svc_rqst *rqstp)
+void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *sctxt,
+ struct svc_rdma_recv_ctxt *rctxt,
+ int status)
{
+ __be32 *rdma_argp = rctxt->rc_recv_buf;
__be32 *p;
- int ret;
- p = ctxt->sc_xprt_buf;
- trace_svcrdma_err_chunk(*p);
- p += 3;
- *p++ = rdma_error;
- *p = err_chunk;
- svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_ERR);
+ rpcrdma_set_xdrlen(&sctxt->sc_hdrbuf, 0);
+ xdr_init_encode(&sctxt->sc_stream, &sctxt->sc_hdrbuf,
+ sctxt->sc_xprt_buf, NULL);
- svc_rdma_save_io_pages(rqstp, ctxt);
+ p = xdr_reserve_space(&sctxt->sc_stream,
+ rpcrdma_fixed_maxsz * sizeof(*p));
+ if (!p)
+ goto put_ctxt;
- ctxt->sc_send_wr.opcode = IB_WR_SEND;
- ret = svc_rdma_send(rdma, &ctxt->sc_send_wr);
- if (ret) {
- svc_rdma_send_ctxt_put(rdma, ctxt);
- return ret;
+ *p++ = *rdma_argp;
+ *p++ = *(rdma_argp + 1);
+ *p++ = rdma->sc_fc_credits;
+ *p = rdma_error;
+
+ switch (status) {
+ case -EPROTONOSUPPORT:
+ p = xdr_reserve_space(&sctxt->sc_stream, 3 * sizeof(*p));
+ if (!p)
+ goto put_ctxt;
+
+ *p++ = err_vers;
+ *p++ = rpcrdma_version;
+ *p = rpcrdma_version;
+ trace_svcrdma_err_vers(*rdma_argp);
+ break;
+ default:
+ p = xdr_reserve_space(&sctxt->sc_stream, sizeof(*p));
+ if (!p)
+ goto put_ctxt;
+
+ *p = err_chunk;
+ trace_svcrdma_err_chunk(*rdma_argp);
}
- return 0;
+ /* Remote Invalidation is skipped for simplicity. */
+ sctxt->sc_send_wr.num_sge = 1;
+ sctxt->sc_send_wr.opcode = IB_WR_SEND;
+ sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len;
+ if (svc_rdma_send(rdma, sctxt))
+ goto put_ctxt;
+ return;
+
+put_ctxt:
+ svc_rdma_send_ctxt_put(rdma, sctxt);
}
/**
@@ -824,37 +893,34 @@
struct svcxprt_rdma *rdma =
container_of(xprt, struct svcxprt_rdma, sc_xprt);
struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
- __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch;
+ __be32 *rdma_argp = rctxt->rc_recv_buf;
+ __be32 *wr_lst = rctxt->rc_write_list;
+ __be32 *rp_ch = rctxt->rc_reply_chunk;
struct xdr_buf *xdr = &rqstp->rq_res;
struct svc_rdma_send_ctxt *sctxt;
+ __be32 *p;
int ret;
- rdma_argp = rctxt->rc_recv_buf;
- svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch);
+ ret = -ENOTCONN;
+ if (svc_xprt_is_dead(xprt))
+ goto err0;
- /* Create the RDMA response header. xprt->xpt_mutex,
- * acquired in svc_send(), serializes RPC replies. The
- * code path below that inserts the credit grant value
- * into each transport header runs only inside this
- * critical section.
- */
ret = -ENOMEM;
sctxt = svc_rdma_send_ctxt_get(rdma);
if (!sctxt)
goto err0;
- rdma_resp = sctxt->sc_xprt_buf;
- p = rdma_resp;
+ p = xdr_reserve_space(&sctxt->sc_stream,
+ rpcrdma_fixed_maxsz * sizeof(*p));
+ if (!p)
+ goto err0;
*p++ = *rdma_argp;
*p++ = *(rdma_argp + 1);
*p++ = rdma->sc_fc_credits;
- *p++ = rp_ch ? rdma_nomsg : rdma_msg;
+ *p = rp_ch ? rdma_nomsg : rdma_msg;
- /* Start with empty chunks */
- *p++ = xdr_zero;
- *p++ = xdr_zero;
- *p = xdr_zero;
-
+ if (svc_rdma_encode_read_list(sctxt) < 0)
+ goto err0;
if (wr_lst) {
/* XXX: Presume the client sent only one Write chunk */
unsigned long offset;
@@ -871,18 +937,24 @@
length);
if (ret < 0)
goto err2;
- svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret);
+ if (svc_rdma_encode_write_list(rctxt, sctxt, length) < 0)
+ goto err0;
+ } else {
+ if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0)
+ goto err0;
}
if (rp_ch) {
- ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr);
+ ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res);
if (ret < 0)
goto err2;
- svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret);
+ if (svc_rdma_encode_reply_chunk(rctxt, sctxt, ret) < 0)
+ goto err0;
+ } else {
+ if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0)
+ goto err0;
}
- svc_rdma_sync_reply_hdr(rdma, sctxt, svc_rdma_reply_hdr_len(rdma_resp));
- ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp,
- wr_lst, rp_ch);
+ ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp);
if (ret < 0)
goto err1;
return 0;
@@ -891,15 +963,17 @@
if (ret != -E2BIG && ret != -EINVAL)
goto err1;
- ret = svc_rdma_send_error_msg(rdma, sctxt, rqstp);
- if (ret < 0)
- goto err1;
+ /* Send completion releases payload pages that were part
+ * of previously posted RDMA Writes.
+ */
+ svc_rdma_save_io_pages(rqstp, sctxt);
+ svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret);
return 0;
err1:
svc_rdma_send_ctxt_put(rdma, sctxt);
err0:
- trace_svcrdma_send_failed(rqstp, ret);
+ trace_svcrdma_send_err(rqstp, ret);
set_bit(XPT_CLOSE, &xprt->xpt_flags);
return -ENOTCONN;
}
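Taken together, the svc_rdma_sendto.c hunks replace hand-maintained pointer arithmetic over sc_xprt_buf with the send context's xdr_stream. A condensed, illustrative view of how the fixed portion of the Reply transport header is now produced (every call below appears in the hunks above; the function name itself is invented for this sketch):

static ssize_t example_encode_fixed_header(struct svcxprt_rdma *rdma,
					   struct svc_rdma_send_ctxt *sctxt,
					   const struct svc_rdma_recv_ctxt *rctxt,
					   __be32 *rdma_argp)
{
	__be32 *p;

	p = xdr_reserve_space(&sctxt->sc_stream,
			      rpcrdma_fixed_maxsz * sizeof(*p));
	if (!p)
		return -EMSGSIZE;

	*p++ = *rdma_argp;		/* echo the Call's XID */
	*p++ = *(rdma_argp + 1);	/* RPC/RDMA version */
	*p++ = rdma->sc_fc_credits;	/* credit grant */
	*p = rctxt->rc_reply_chunk ? rdma_nomsg : rdma_msg;

	/* An RPC/RDMA v1 Reply never carries a Read list */
	return xdr_stream_encode_item_absent(&sctxt->sc_stream);
}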
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 89a1267..5f7e3d1 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -55,7 +55,6 @@
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/debug.h>
-#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_xprt.h>
#include <linux/sunrpc/svc_rdma.h>
@@ -211,7 +210,12 @@
newxprt->sc_ord = param->initiator_depth;
sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
- svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
+ newxprt->sc_xprt.xpt_remotelen = svc_addr_len(sa);
+ memcpy(&newxprt->sc_xprt.xpt_remote, sa,
+ newxprt->sc_xprt.xpt_remotelen);
+ snprintf(newxprt->sc_xprt.xpt_remotebuf,
+ sizeof(newxprt->sc_xprt.xpt_remotebuf) - 1, "%pISc", sa);
+
/* The remote port is arbitrary and not under the control of the
* client ULP. Set it to a fixed value so that the DRC continues
* to be effective after a reconnect.
@@ -233,72 +237,56 @@
svc_xprt_enqueue(&listen_xprt->sc_xprt);
}
-/*
- * Handles events generated on the listening endpoint. These events will be
- * either be incoming connect requests or adapter removal events.
+/**
+ * svc_rdma_listen_handler - Handle CM events generated on a listening endpoint
+ * @cma_id: the server's listener rdma_cm_id
+ * @event: details of the event
+ *
+ * Return values:
+ * %0: Do not destroy @cma_id
+ * %1: Destroy @cma_id (never returned here)
+ *
+ * NB: There is never a DEVICE_REMOVAL event for INADDR_ANY listeners.
*/
-static int rdma_listen_handler(struct rdma_cm_id *cma_id,
- struct rdma_cm_event *event)
+static int svc_rdma_listen_handler(struct rdma_cm_id *cma_id,
+ struct rdma_cm_event *event)
{
- struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.src_addr;
-
- trace_svcrdma_cm_event(event, sap);
-
switch (event->event) {
case RDMA_CM_EVENT_CONNECT_REQUEST:
- dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
- "event = %s (%d)\n", cma_id, cma_id->context,
- rdma_event_msg(event->event), event->event);
handle_connect_req(cma_id, &event->param.conn);
break;
default:
- /* NB: No device removal upcall for INADDR_ANY listeners */
- dprintk("svcrdma: Unexpected event on listening endpoint %p, "
- "event = %s (%d)\n", cma_id,
- rdma_event_msg(event->event), event->event);
break;
}
-
return 0;
}
-static int rdma_cma_handler(struct rdma_cm_id *cma_id,
- struct rdma_cm_event *event)
+/**
+ * svc_rdma_cma_handler - Handle CM events on client connections
+ * @cma_id: the server's listener rdma_cm_id
+ * @event: details of the event
+ *
+ * Return values:
+ * %0: Do not destroy @cma_id
+ * %1: Destroy @cma_id (never returned here)
+ */
+static int svc_rdma_cma_handler(struct rdma_cm_id *cma_id,
+ struct rdma_cm_event *event)
{
- struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.dst_addr;
struct svcxprt_rdma *rdma = cma_id->context;
struct svc_xprt *xprt = &rdma->sc_xprt;
- trace_svcrdma_cm_event(event, sap);
-
switch (event->event) {
case RDMA_CM_EVENT_ESTABLISHED:
- /* Accept complete */
- svc_xprt_get(xprt);
- dprintk("svcrdma: Connection completed on DTO xprt=%p, "
- "cm_id=%p\n", xprt, cma_id);
clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags);
svc_xprt_enqueue(xprt);
break;
case RDMA_CM_EVENT_DISCONNECTED:
- dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n",
- xprt, cma_id);
- set_bit(XPT_CLOSE, &xprt->xpt_flags);
- svc_xprt_enqueue(xprt);
- svc_xprt_put(xprt);
- break;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
- dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, "
- "event = %s (%d)\n", cma_id, xprt,
- rdma_event_msg(event->event), event->event);
set_bit(XPT_CLOSE, &xprt->xpt_flags);
svc_xprt_enqueue(xprt);
- svc_xprt_put(xprt);
break;
default:
- dprintk("svcrdma: Unexpected event on DTO endpoint %p, "
- "event = %s (%d)\n", cma_id,
- rdma_event_msg(event->event), event->event);
break;
}
return 0;
@@ -316,22 +304,18 @@
struct svcxprt_rdma *cma_xprt;
int ret;
- dprintk("svcrdma: Creating RDMA listener\n");
- if ((sa->sa_family != AF_INET) && (sa->sa_family != AF_INET6)) {
- dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family);
+ if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6)
return ERR_PTR(-EAFNOSUPPORT);
- }
cma_xprt = svc_rdma_create_xprt(serv, net);
if (!cma_xprt)
return ERR_PTR(-ENOMEM);
set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
strcpy(cma_xprt->sc_xprt.xpt_remotebuf, "listener");
- listen_id = rdma_create_id(net, rdma_listen_handler, cma_xprt,
+ listen_id = rdma_create_id(net, svc_rdma_listen_handler, cma_xprt,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(listen_id)) {
ret = PTR_ERR(listen_id);
- dprintk("svcrdma: rdma_create_id failed = %d\n", ret);
goto err0;
}
@@ -340,23 +324,17 @@
*/
#if IS_ENABLED(CONFIG_IPV6)
ret = rdma_set_afonly(listen_id, 1);
- if (ret) {
- dprintk("svcrdma: rdma_set_afonly failed = %d\n", ret);
+ if (ret)
goto err1;
- }
#endif
ret = rdma_bind_addr(listen_id, sa);
- if (ret) {
- dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
+ if (ret)
goto err1;
- }
cma_xprt->sc_cm_id = listen_id;
ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
- if (ret) {
- dprintk("svcrdma: rdma_listen failed = %d\n", ret);
+ if (ret)
goto err1;
- }
/*
* We need to use the address from the cm_id in case the
@@ -412,9 +390,6 @@
if (!newxprt)
return NULL;
- dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
- newxprt, newxprt->sc_cm_id);
-
dev = newxprt->sc_cm_id->device;
newxprt->sc_port_num = newxprt->sc_cm_id->port_num;
@@ -450,21 +425,17 @@
newxprt->sc_pd = ib_alloc_pd(dev, 0);
if (IS_ERR(newxprt->sc_pd)) {
- dprintk("svcrdma: error creating PD for connect request\n");
+ trace_svcrdma_pd_err(newxprt, PTR_ERR(newxprt->sc_pd));
goto errout;
}
newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, newxprt->sc_sq_depth,
IB_POLL_WORKQUEUE);
- if (IS_ERR(newxprt->sc_sq_cq)) {
- dprintk("svcrdma: error creating SQ CQ for connect request\n");
+ if (IS_ERR(newxprt->sc_sq_cq))
goto errout;
- }
newxprt->sc_rq_cq =
ib_alloc_cq_any(dev, newxprt, rq_depth, IB_POLL_WORKQUEUE);
- if (IS_ERR(newxprt->sc_rq_cq)) {
- dprintk("svcrdma: error creating RQ CQ for connect request\n");
+ if (IS_ERR(newxprt->sc_rq_cq))
goto errout;
- }
memset(&qp_attr, 0, sizeof qp_attr);
qp_attr.event_handler = qp_event_handler;
@@ -488,7 +459,7 @@
ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
if (ret) {
- dprintk("svcrdma: failed to create QP, ret=%d\n", ret);
+ trace_svcrdma_qp_err(newxprt, ret);
goto errout;
}
newxprt->sc_qp = newxprt->sc_cm_id->qp;
@@ -496,15 +467,14 @@
if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
newxprt->sc_snd_w_inv = false;
if (!rdma_protocol_iwarp(dev, newxprt->sc_port_num) &&
- !rdma_ib_or_roce(dev, newxprt->sc_port_num))
+ !rdma_ib_or_roce(dev, newxprt->sc_port_num)) {
+ trace_svcrdma_fabric_err(newxprt, -EINVAL);
goto errout;
+ }
if (!svc_rdma_post_recvs(newxprt))
goto errout;
- /* Swap out the handler */
- newxprt->sc_cm_id->event_handler = rdma_cma_handler;
-
/* Construct RDMA-CM private message */
pmsg.cp_magic = rpcrdma_cmp_magic;
pmsg.cp_version = RPCRDMA_CMP_VERSION;
@@ -519,15 +489,20 @@
conn_param.initiator_depth = min_t(int, newxprt->sc_ord,
dev->attrs.max_qp_init_rd_atom);
if (!conn_param.initiator_depth) {
- dprintk("svcrdma: invalid ORD setting\n");
ret = -EINVAL;
+ trace_svcrdma_initdepth_err(newxprt, ret);
goto errout;
}
conn_param.private_data = &pmsg;
conn_param.private_data_len = sizeof(pmsg);
+ rdma_lock_handler(newxprt->sc_cm_id);
+ newxprt->sc_cm_id->event_handler = svc_rdma_cma_handler;
ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
- if (ret)
+ rdma_unlock_handler(newxprt->sc_cm_id);
+ if (ret) {
+ trace_svcrdma_accept_err(newxprt, ret);
goto errout;
+ }
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
dprintk("svcrdma: new connection %p accepted:\n", newxprt);
@@ -542,12 +517,9 @@
dprintk(" ord : %d\n", conn_param.initiator_depth);
#endif
- trace_svcrdma_xprt_accept(&newxprt->sc_xprt);
return &newxprt->sc_xprt;
errout:
- dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret);
- trace_svcrdma_xprt_fail(&newxprt->sc_xprt);
/* Take a reference in case the DTO handler runs */
svc_xprt_get(&newxprt->sc_xprt);
if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
@@ -558,24 +530,11 @@
return NULL;
}
-/*
- * When connected, an svc_xprt has at least two references:
- *
- * - A reference held by the cm_id between the ESTABLISHED and
- * DISCONNECTED events. If the remote peer disconnected first, this
- * reference could be gone.
- *
- * - A reference held by the svc_recv code that called this function
- * as part of close processing.
- *
- * At a minimum one references should still be held.
- */
static void svc_rdma_detach(struct svc_xprt *xprt)
{
struct svcxprt_rdma *rdma =
container_of(xprt, struct svcxprt_rdma, sc_xprt);
- /* Disconnect and flush posted WQE */
rdma_disconnect(rdma->sc_cm_id);
}
@@ -585,8 +544,7 @@
container_of(work, struct svcxprt_rdma, sc_work);
struct svc_xprt *xprt = &rdma->sc_xprt;
- trace_svcrdma_xprt_free(xprt);
-
+ /* This blocks until the Completion Queues are empty */
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
ib_drain_qp(rdma->sc_qp);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 2f21e3c..8e2368a 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -68,7 +68,7 @@
* tunables
*/
-unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
+static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR;
@@ -240,27 +240,29 @@
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
int rc;
- rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+ rc = rpcrdma_xprt_connect(r_xprt);
xprt_clear_connecting(xprt);
- if (r_xprt->rx_ep.rep_connected > 0) {
- if (!xprt_test_and_set_connected(xprt)) {
- xprt->stat.connect_count++;
- xprt->stat.connect_time += (long)jiffies -
- xprt->stat.connect_start;
- xprt_wake_pending_tasks(xprt, -EAGAIN);
- }
- } else {
- if (xprt_test_and_clear_connected(xprt))
- xprt_wake_pending_tasks(xprt, rc);
- }
+ if (!rc) {
+ xprt->connect_cookie++;
+ xprt->stat.connect_count++;
+ xprt->stat.connect_time += (long)jiffies -
+ xprt->stat.connect_start;
+ xprt_set_connected(xprt);
+ rc = -EAGAIN;
+ } else
+ rpcrdma_xprt_disconnect(r_xprt);
+ xprt_unlock_connect(xprt, r_xprt);
+ xprt_wake_pending_tasks(xprt, rc);
}
/**
* xprt_rdma_inject_disconnect - inject a connection fault
* @xprt: transport context
*
- * If @xprt is connected, disconnect it to simulate spurious connection
- * loss.
+ * If @xprt is connected, disconnect it to simulate spurious
+ * connection loss. Caller must hold @xprt's send lock to
+ * ensure that data structures and hardware resources are
+ * stable during the rdma_disconnect() call.
*/
static void
xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
@@ -268,7 +270,7 @@
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
trace_xprtrdma_op_inject_dsc(r_xprt);
- rdma_disconnect(r_xprt->rx_ia.ri_id);
+ rdma_disconnect(r_xprt->rx_ep->re_id);
}
/**
@@ -283,13 +285,10 @@
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- trace_xprtrdma_op_destroy(r_xprt);
-
cancel_delayed_work_sync(&r_xprt->rx_connect_worker);
- rpcrdma_ep_destroy(r_xprt);
+ rpcrdma_xprt_disconnect(r_xprt);
rpcrdma_buffer_destroy(&r_xprt->rx_buf);
- rpcrdma_ia_close(&r_xprt->rx_ia);
xprt_rdma_free_addresses(xprt);
xprt_free(xprt);
@@ -319,9 +318,15 @@
if (args->addrlen > sizeof(xprt->addr))
return ERR_PTR(-EBADF);
- xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0, 0);
- if (!xprt)
+ if (!try_module_get(THIS_MODULE))
+ return ERR_PTR(-EIO);
+
+ xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0,
+ xprt_rdma_slot_table_entries);
+ if (!xprt) {
+ module_put(THIS_MODULE);
return ERR_PTR(-ENOMEM);
+ }
xprt->timeout = &xprt_rdma_default_timeout;
xprt->connect_timeout = xprt->timeout->to_initval;
@@ -349,49 +354,20 @@
xprt_rdma_format_addresses(xprt, sap);
new_xprt = rpcx_to_rdmax(xprt);
- rc = rpcrdma_ia_open(new_xprt);
- if (rc)
- goto out1;
-
- rc = rpcrdma_ep_create(new_xprt);
- if (rc)
- goto out2;
-
rc = rpcrdma_buffer_create(new_xprt);
- if (rc)
- goto out3;
+ if (rc) {
+ xprt_rdma_free_addresses(xprt);
+ xprt_free(xprt);
+ module_put(THIS_MODULE);
+ return ERR_PTR(rc);
+ }
INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
xprt_rdma_connect_worker);
- xprt->max_payload = frwr_maxpages(new_xprt);
- if (xprt->max_payload == 0)
- goto out4;
- xprt->max_payload <<= PAGE_SHIFT;
- dprintk("RPC: %s: transport data payload maximum: %zu bytes\n",
- __func__, xprt->max_payload);
+ xprt->max_payload = RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
- if (!try_module_get(THIS_MODULE))
- goto out4;
-
- dprintk("RPC: %s: %s:%s\n", __func__,
- xprt->address_strings[RPC_DISPLAY_ADDR],
- xprt->address_strings[RPC_DISPLAY_PORT]);
- trace_xprtrdma_create(new_xprt);
return xprt;
-
-out4:
- rpcrdma_buffer_destroy(&new_xprt->rx_buf);
- rc = -ENODEV;
-out3:
- rpcrdma_ep_destroy(new_xprt);
-out2:
- rpcrdma_ia_close(&new_xprt->rx_ia);
-out1:
- trace_xprtrdma_op_destroy(new_xprt);
- xprt_rdma_free_addresses(xprt);
- xprt_free(xprt);
- return ERR_PTR(rc);
}
/**
@@ -406,35 +382,9 @@
void xprt_rdma_close(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- might_sleep();
+ rpcrdma_xprt_disconnect(r_xprt);
- trace_xprtrdma_op_close(r_xprt);
-
- /* Prevent marshaling and sending of new requests */
- xprt_clear_connected(xprt);
-
- if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) {
- rpcrdma_ia_remove(ia);
- goto out;
- }
-
- if (ep->rep_connected == -ENODEV)
- return;
- rpcrdma_ep_disconnect(ep, ia);
-
- /* Prepare @xprt for the next connection by reinitializing
- * its credit grant to one (see RFC 8166, Section 3.3.3).
- */
- spin_lock(&xprt->transport_lock);
- r_xprt->rx_buf.rb_credits = 1;
- xprt->cong = 0;
- xprt->cwnd = RPC_CWNDSHIFT;
- spin_unlock(&xprt->transport_lock);
-
-out:
xprt->reestablish_timeout = 0;
++xprt->connect_cookie;
xprt_disconnect_done(xprt);
@@ -453,12 +403,6 @@
struct sockaddr *sap = (struct sockaddr *)&xprt->addr;
char buf[8];
- dprintk("RPC: %s: setting port for xprt %p (%s:%s) to %u\n",
- __func__, xprt,
- xprt->address_strings[RPC_DISPLAY_ADDR],
- xprt->address_strings[RPC_DISPLAY_PORT],
- port);
-
rpc_set_port(sap, port);
kfree(xprt->address_strings[RPC_DISPLAY_PORT]);
@@ -537,15 +481,17 @@
xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
unsigned long delay;
- trace_xprtrdma_op_connect(r_xprt);
+ WARN_ON_ONCE(!xprt_lock_connect(xprt, task, r_xprt));
delay = 0;
- if (r_xprt->rx_ep.rep_connected != 0) {
+ if (ep && ep->re_connect_status != 0) {
delay = xprt_reconnect_delay(xprt);
xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO);
}
+ trace_xprtrdma_op_connect(r_xprt, delay);
queue_delayed_work(xprtiod_workqueue, &r_xprt->rx_connect_worker,
delay);
}
@@ -573,9 +519,8 @@
return;
out_sleep:
- set_bit(XPRT_CONGESTED, &xprt->state);
- rpc_sleep_on(&xprt->backlog, task, NULL);
task->tk_status = -EAGAIN;
+ xprt_add_backlog(xprt, task);
}
/**
@@ -590,10 +535,11 @@
struct rpcrdma_xprt *r_xprt =
container_of(xprt, struct rpcrdma_xprt, rx_xprt);
- memset(rqst, 0, sizeof(*rqst));
- rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
- if (unlikely(!rpc_wake_up_next(&xprt->backlog)))
- clear_bit(XPRT_CONGESTED, &xprt->state);
+ rpcrdma_reply_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
+ if (!xprt_wake_up_backlog(xprt, rqst)) {
+ memset(rqst, 0, sizeof(*rqst));
+ rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
+ }
}
static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt,
@@ -638,11 +584,9 @@
rqst->rq_buffer = rdmab_data(req->rl_sendbuf);
rqst->rq_rbuffer = rdmab_data(req->rl_recvbuf);
- trace_xprtrdma_op_allocate(task, req);
return 0;
out_fail:
- trace_xprtrdma_op_allocate(task, NULL);
return -ENOMEM;
}
@@ -659,8 +603,6 @@
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- trace_xprtrdma_op_free(task, req);
-
if (!list_empty(&req->rl_registered))
frwr_unmap_sync(r_xprt, req);
@@ -715,7 +657,7 @@
goto drop_connection;
rqst->rq_xtime = ktime_get();
- if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+ if (rpcrdma_post_sends(r_xprt, req))
goto drop_connection;
rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 0f4d39f..dcc1992 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -74,11 +74,18 @@
/*
* internal functions
*/
-static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc);
+static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_sendctx *sc);
+static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep);
static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
-static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf);
+static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_ep_get(struct rpcrdma_ep *ep);
+static int rpcrdma_ep_put(struct rpcrdma_ep *ep);
static struct rpcrdma_regbuf *
rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction,
gfp_t flags);
@@ -91,17 +98,20 @@
*/
static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ struct rdma_cm_id *id = ep->re_id;
/* Flush Receives, then wait for deferred Reply work
* to complete.
*/
- ib_drain_rq(ia->ri_id->qp);
+ ib_drain_rq(id->qp);
/* Deferred Reply processing might have scheduled
* local invalidations.
*/
- ib_drain_sq(ia->ri_id->qp);
+ ib_drain_sq(id->qp);
+
+ rpcrdma_ep_put(ep);
}
/**
@@ -110,53 +120,74 @@
* @context: ep that owns QP where event occurred
*
* Called from the RDMA provider (device driver) possibly in an interrupt
- * context.
+ * context. The QP is always destroyed before the ID, so the ID will be
+ * reliably available when this handler is invoked.
*/
-static void
-rpcrdma_qp_event_handler(struct ib_event *event, void *context)
+static void rpcrdma_qp_event_handler(struct ib_event *event, void *context)
{
struct rpcrdma_ep *ep = context;
- struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
- rx_ep);
- trace_xprtrdma_qp_event(r_xprt, event);
+ trace_xprtrdma_qp_event(ep, event);
+}
+
+/* Ensure xprt_force_disconnect() is invoked exactly once when a
+ * connection is closed or lost. (The important thing is it needs
+ * to be invoked "at least" once).
+ */
+static void rpcrdma_force_disconnect(struct rpcrdma_ep *ep)
+{
+ if (atomic_add_unless(&ep->re_force_disconnect, 1, 1))
+ xprt_force_disconnect(ep->re_xprt);
+}
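The atomic_add_unless() call above is what makes the disconnect a one-shot: the counter starts at zero, the add is refused once the value is already 1, and only the caller that moves it 0 -> 1 sees a non-zero return. A standalone sketch of the same idiom:

/* Illustration of the one-shot guard pattern (names are made up): */
static atomic_t once_guard = ATOMIC_INIT(0);

static void run_once(void (*action)(void))
{
	if (atomic_add_unless(&once_guard, 1, 1))
		action();	/* executes for the first caller only */
}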
+
+/**
+ * rpcrdma_flush_disconnect - Disconnect on flushed completion
+ * @r_xprt: transport to disconnect
+ * @wc: work completion entry
+ *
+ * Must be called in process context.
+ */
+void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc)
+{
+ if (wc->status != IB_WC_SUCCESS)
+ rpcrdma_force_disconnect(r_xprt->rx_ep);
}
/**
* rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed Send WR
*
*/
-static void
-rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
+static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
{
struct ib_cqe *cqe = wc->wr_cqe;
struct rpcrdma_sendctx *sc =
container_of(cqe, struct rpcrdma_sendctx, sc_cqe);
+ struct rpcrdma_xprt *r_xprt = cq->cq_context;
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_send(sc, wc);
- rpcrdma_sendctx_put_locked(sc);
+ rpcrdma_sendctx_put_locked(r_xprt, sc);
+ rpcrdma_flush_disconnect(r_xprt, wc);
}
/**
* rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed Receive WR
*
*/
-static void
-rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
+static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
{
struct ib_cqe *cqe = wc->wr_cqe;
struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
rr_cqe);
- struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
+ struct rpcrdma_xprt *r_xprt = cq->cq_context;
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_receive(wc);
- --r_xprt->rx_ep.rep_receive_count;
+ --r_xprt->rx_ep->re_receive_count;
if (wc->status != IB_WC_SUCCESS)
goto out_flushed;
@@ -173,37 +204,35 @@
return;
out_flushed:
- rpcrdma_recv_buffer_put(rep);
+ rpcrdma_flush_disconnect(r_xprt, wc);
+ rpcrdma_rep_destroy(rep);
}
-static void
-rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
- struct rdma_conn_param *param)
+static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep,
+ struct rdma_conn_param *param)
{
const struct rpcrdma_connect_private *pmsg = param->private_data;
unsigned int rsize, wsize;
/* Default settings for RPC-over-RDMA Version One */
- r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize;
+ ep->re_implicit_roundup = xprt_rdma_pad_optimize;
rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
if (pmsg &&
pmsg->cp_magic == rpcrdma_cmp_magic &&
pmsg->cp_version == RPCRDMA_CMP_VERSION) {
- r_xprt->rx_ia.ri_implicit_roundup = true;
+ ep->re_implicit_roundup = true;
rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
}
- if (rsize < r_xprt->rx_ep.rep_inline_recv)
- r_xprt->rx_ep.rep_inline_recv = rsize;
- if (wsize < r_xprt->rx_ep.rep_inline_send)
- r_xprt->rx_ep.rep_inline_send = wsize;
- dprintk("RPC: %s: max send %u, max recv %u\n", __func__,
- r_xprt->rx_ep.rep_inline_send,
- r_xprt->rx_ep.rep_inline_recv);
- rpcrdma_set_max_header_sizes(r_xprt);
+ if (rsize < ep->re_inline_recv)
+ ep->re_inline_recv = rsize;
+ if (wsize < ep->re_inline_send)
+ ep->re_inline_send = wsize;
+
+ rpcrdma_set_max_header_sizes(ep);
}
/**
@@ -217,121 +246,104 @@
static int
rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
- struct rpcrdma_xprt *r_xprt = id->context;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+ struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr;
+ struct rpcrdma_ep *ep = id->context;
might_sleep();
- trace_xprtrdma_cm_event(r_xprt, event);
switch (event->event) {
case RDMA_CM_EVENT_ADDR_RESOLVED:
case RDMA_CM_EVENT_ROUTE_RESOLVED:
- ia->ri_async_rc = 0;
- complete(&ia->ri_done);
+ ep->re_async_rc = 0;
+ complete(&ep->re_done);
return 0;
case RDMA_CM_EVENT_ADDR_ERROR:
- ia->ri_async_rc = -EPROTO;
- complete(&ia->ri_done);
+ ep->re_async_rc = -EPROTO;
+ complete(&ep->re_done);
return 0;
case RDMA_CM_EVENT_ROUTE_ERROR:
- ia->ri_async_rc = -ENETUNREACH;
- complete(&ia->ri_done);
+ ep->re_async_rc = -ENETUNREACH;
+ complete(&ep->re_done);
return 0;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
- pr_info("rpcrdma: removing device %s for %s:%s\n",
- ia->ri_id->device->name,
- rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt));
-#endif
- init_completion(&ia->ri_remove_done);
- set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
- ep->rep_connected = -ENODEV;
- xprt_force_disconnect(xprt);
- wait_for_completion(&ia->ri_remove_done);
-
- ia->ri_id = NULL;
- /* Return 1 to ensure the core destroys the id. */
- return 1;
+ pr_info("rpcrdma: removing device %s for %pISpc\n",
+ ep->re_id->device->name, sap);
+ fallthrough;
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ ep->re_connect_status = -ENODEV;
+ goto disconnected;
case RDMA_CM_EVENT_ESTABLISHED:
- ++xprt->connect_cookie;
- ep->rep_connected = 1;
- rpcrdma_update_connect_private(r_xprt, &event->param.conn);
- wake_up_all(&ep->rep_connect_wait);
+ rpcrdma_ep_get(ep);
+ ep->re_connect_status = 1;
+ rpcrdma_update_cm_private(ep, &event->param.conn);
+ trace_xprtrdma_inline_thresh(ep);
+ wake_up_all(&ep->re_connect_wait);
break;
case RDMA_CM_EVENT_CONNECT_ERROR:
- ep->rep_connected = -ENOTCONN;
- goto disconnected;
+ ep->re_connect_status = -ENOTCONN;
+ goto wake_connect_worker;
case RDMA_CM_EVENT_UNREACHABLE:
- ep->rep_connected = -ENETUNREACH;
- goto disconnected;
+ ep->re_connect_status = -ENETUNREACH;
+ goto wake_connect_worker;
case RDMA_CM_EVENT_REJECTED:
- dprintk("rpcrdma: connection to %s:%s rejected: %s\n",
- rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
- rdma_reject_msg(id, event->status));
- ep->rep_connected = -ECONNREFUSED;
+ dprintk("rpcrdma: connection to %pISpc rejected: %s\n",
+ sap, rdma_reject_msg(id, event->status));
+ ep->re_connect_status = -ECONNREFUSED;
if (event->status == IB_CM_REJ_STALE_CONN)
- ep->rep_connected = -EAGAIN;
- goto disconnected;
+ ep->re_connect_status = -ENOTCONN;
+wake_connect_worker:
+ wake_up_all(&ep->re_connect_wait);
+ return 0;
case RDMA_CM_EVENT_DISCONNECTED:
- ep->rep_connected = -ECONNABORTED;
+ ep->re_connect_status = -ECONNABORTED;
disconnected:
- xprt_force_disconnect(xprt);
- wake_up_all(&ep->rep_connect_wait);
- break;
+ rpcrdma_force_disconnect(ep);
+ return rpcrdma_ep_put(ep);
default:
break;
}
- dprintk("RPC: %s: %s:%s on %s/frwr: %s\n", __func__,
- rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
- ia->ri_id->device->name, rdma_event_msg(event->event));
+ dprintk("RPC: %s: %pISpc on %s/frwr: %s\n", __func__, sap,
+ ep->re_id->device->name, rdma_event_msg(event->event));
return 0;
}
-static struct rdma_cm_id *
-rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia)
+static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_ep *ep)
{
unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1;
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
struct rdma_cm_id *id;
int rc;
- trace_xprtrdma_conn_start(xprt);
+ init_completion(&ep->re_done);
- init_completion(&ia->ri_done);
-
- id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler,
- xprt, RDMA_PS_TCP, IB_QPT_RC);
+ id = rdma_create_id(xprt->xprt_net, rpcrdma_cm_event_handler, ep,
+ RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(id))
return id;
- ia->ri_async_rc = -ETIMEDOUT;
- rc = rdma_resolve_addr(id, NULL,
- (struct sockaddr *)&xprt->rx_xprt.addr,
+ ep->re_async_rc = -ETIMEDOUT;
+ rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr,
RDMA_RESOLVE_TIMEOUT);
if (rc)
goto out;
- rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
- if (rc < 0) {
- trace_xprtrdma_conn_tout(xprt);
+ rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout);
+ if (rc < 0)
goto out;
- }
- rc = ia->ri_async_rc;
+ rc = ep->re_async_rc;
if (rc)
goto out;
- ia->ri_async_rc = -ETIMEDOUT;
+ ep->re_async_rc = -ETIMEDOUT;
rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
if (rc)
goto out;
- rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
- if (rc < 0) {
- trace_xprtrdma_conn_tout(xprt);
+ rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout);
+ if (rc < 0)
goto out;
- }
- rc = ia->ri_async_rc;
+ rc = ep->re_async_rc;
if (rc)
goto out;
@@ -342,445 +354,256 @@
return ERR_PTR(rc);
}
-/*
- * Exported functions.
- */
-
-/**
- * rpcrdma_ia_open - Open and initialize an Interface Adapter.
- * @xprt: transport with IA to (re)initialize
- *
- * Returns 0 on success, negative errno if an appropriate
- * Interface Adapter could not be found and opened.
- */
-int
-rpcrdma_ia_open(struct rpcrdma_xprt *xprt)
+static void rpcrdma_ep_destroy(struct kref *kref)
{
- struct rpcrdma_ia *ia = &xprt->rx_ia;
+ struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref);
+
+ if (ep->re_id->qp) {
+ rdma_destroy_qp(ep->re_id);
+ ep->re_id->qp = NULL;
+ }
+
+ if (ep->re_attr.recv_cq)
+ ib_free_cq(ep->re_attr.recv_cq);
+ ep->re_attr.recv_cq = NULL;
+ if (ep->re_attr.send_cq)
+ ib_free_cq(ep->re_attr.send_cq);
+ ep->re_attr.send_cq = NULL;
+
+ if (ep->re_pd)
+ ib_dealloc_pd(ep->re_pd);
+ ep->re_pd = NULL;
+
+ kfree(ep);
+ module_put(THIS_MODULE);
+}
+
+static noinline void rpcrdma_ep_get(struct rpcrdma_ep *ep)
+{
+ kref_get(&ep->re_kref);
+}
+
+/* Returns:
+ * %0 if @ep still has a positive kref count, or
+ * %1 if @ep was destroyed successfully.
+ */
+static noinline int rpcrdma_ep_put(struct rpcrdma_ep *ep)
+{
+ return kref_put(&ep->re_kref, rpcrdma_ep_destroy);
+}
+
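The endpoint now owns its lifetime through a kref: rpcrdma_ep_get() takes a reference while a connection or outstanding Receives exist, and rpcrdma_ep_put() frees the endpoint when the last reference drops, returning 1 so the CM callback knows rdma_destroy_id() is safe. A minimal user-space sketch of that pattern follows, using C11 atomics in place of the kernel's kref; all names here are illustrative, not part of the patch.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct endpoint {
	atomic_int refcount;		/* plays the role of ep->re_kref */
	/* ... connection state would live here ... */
};

static struct endpoint *endpoint_create(void)
{
	struct endpoint *ep = calloc(1, sizeof(*ep));

	if (!ep)
		return NULL;
	atomic_init(&ep->refcount, 1);	/* caller holds the initial reference */
	return ep;
}

static void endpoint_get(struct endpoint *ep)
{
	atomic_fetch_add(&ep->refcount, 1);
}

/* Returns 1 if this put released the last reference and freed @ep. */
static int endpoint_put(struct endpoint *ep)
{
	if (atomic_fetch_sub(&ep->refcount, 1) == 1) {
		free(ep);
		return 1;
	}
	return 0;
}

int main(void)
{
	struct endpoint *ep = endpoint_create();

	if (!ep)
		return 1;
	endpoint_get(ep);	/* e.g. taken while Receives are posted */
	printf("first put destroyed ep? %d\n", endpoint_put(ep));	/* 0 */
	printf("last put destroyed ep? %d\n", endpoint_put(ep));	/* 1 */
	return 0;
}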
+static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_connect_private *pmsg;
+ struct ib_device *device;
+ struct rdma_cm_id *id;
+ struct rpcrdma_ep *ep;
int rc;
- ia->ri_id = rpcrdma_create_id(xprt, ia);
- if (IS_ERR(ia->ri_id)) {
- rc = PTR_ERR(ia->ri_id);
- goto out_err;
+ ep = kzalloc(sizeof(*ep), GFP_NOFS);
+ if (!ep)
+ return -ENOTCONN;
+ ep->re_xprt = &r_xprt->rx_xprt;
+ kref_init(&ep->re_kref);
+
+ id = rpcrdma_create_id(r_xprt, ep);
+ if (IS_ERR(id)) {
+ kfree(ep);
+ return PTR_ERR(id);
}
+ __module_get(THIS_MODULE);
+ device = id->device;
+ ep->re_id = id;
- ia->ri_pd = ib_alloc_pd(ia->ri_id->device, 0);
- if (IS_ERR(ia->ri_pd)) {
- rc = PTR_ERR(ia->ri_pd);
- pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
- goto out_err;
- }
-
- switch (xprt_rdma_memreg_strategy) {
- case RPCRDMA_FRWR:
- if (frwr_is_supported(ia->ri_id->device))
- break;
- /*FALLTHROUGH*/
- default:
- pr_err("rpcrdma: Device %s does not support memreg mode %d\n",
- ia->ri_id->device->name, xprt_rdma_memreg_strategy);
- rc = -EINVAL;
- goto out_err;
- }
-
- return 0;
-
-out_err:
- rpcrdma_ia_close(ia);
- return rc;
-}
-
-/**
- * rpcrdma_ia_remove - Handle device driver unload
- * @ia: interface adapter being removed
- *
- * Divest transport H/W resources associated with this adapter,
- * but allow it to be restored later.
- */
-void
-rpcrdma_ia_remove(struct rpcrdma_ia *ia)
-{
- struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
- rx_ia);
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_req *req;
-
- cancel_work_sync(&buf->rb_refresh_worker);
-
- /* This is similar to rpcrdma_ep_destroy, but:
- * - Don't cancel the connect worker.
- * - Don't call rpcrdma_ep_disconnect, which waits
- * for another conn upcall, which will deadlock.
- * - rdma_disconnect is unneeded, the underlying
- * connection is already gone.
- */
- if (ia->ri_id->qp) {
- rpcrdma_xprt_drain(r_xprt);
- rdma_destroy_qp(ia->ri_id);
- ia->ri_id->qp = NULL;
- }
- ib_free_cq(ep->rep_attr.recv_cq);
- ep->rep_attr.recv_cq = NULL;
- ib_free_cq(ep->rep_attr.send_cq);
- ep->rep_attr.send_cq = NULL;
-
- /* The ULP is responsible for ensuring all DMA
- * mappings and MRs are gone.
- */
- rpcrdma_reps_unmap(r_xprt);
- list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
- rpcrdma_regbuf_dma_unmap(req->rl_rdmabuf);
- rpcrdma_regbuf_dma_unmap(req->rl_sendbuf);
- rpcrdma_regbuf_dma_unmap(req->rl_recvbuf);
- }
- rpcrdma_mrs_destroy(buf);
- ib_dealloc_pd(ia->ri_pd);
- ia->ri_pd = NULL;
-
- /* Allow waiters to continue */
- complete(&ia->ri_remove_done);
-
- trace_xprtrdma_remove(r_xprt);
-}
-
-/**
- * rpcrdma_ia_close - Clean up/close an IA.
- * @ia: interface adapter to close
- *
- */
-void
-rpcrdma_ia_close(struct rpcrdma_ia *ia)
-{
- if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
- if (ia->ri_id->qp)
- rdma_destroy_qp(ia->ri_id);
- rdma_destroy_id(ia->ri_id);
- }
- ia->ri_id = NULL;
-
- /* If the pd is still busy, xprtrdma missed freeing a resource */
- if (ia->ri_pd && !IS_ERR(ia->ri_pd))
- ib_dealloc_pd(ia->ri_pd);
- ia->ri_pd = NULL;
-}
-
-/**
- * rpcrdma_ep_create - Create unconnected endpoint
- * @r_xprt: transport to instantiate
- *
- * Returns zero on success, or a negative errno.
- */
-int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
- struct ib_cq *sendcq, *recvcq;
- unsigned int max_sge;
- int rc;
-
- ep->rep_max_requests = xprt_rdma_slot_table_entries;
- ep->rep_inline_send = xprt_rdma_max_inline_write;
- ep->rep_inline_recv = xprt_rdma_max_inline_read;
-
- max_sge = min_t(unsigned int, ia->ri_id->device->attrs.max_send_sge,
- RPCRDMA_MAX_SEND_SGES);
- if (max_sge < RPCRDMA_MIN_SEND_SGES) {
- pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge);
- return -ENOMEM;
- }
- ia->ri_max_send_sges = max_sge;
-
- rc = frwr_open(ia, ep);
+ ep->re_max_requests = r_xprt->rx_xprt.max_reqs;
+ ep->re_inline_send = xprt_rdma_max_inline_write;
+ ep->re_inline_recv = xprt_rdma_max_inline_read;
+ rc = frwr_query_device(ep, device);
if (rc)
- return rc;
+ goto out_destroy;
- ep->rep_attr.event_handler = rpcrdma_qp_event_handler;
- ep->rep_attr.qp_context = ep;
- ep->rep_attr.srq = NULL;
- ep->rep_attr.cap.max_send_sge = max_sge;
- ep->rep_attr.cap.max_recv_sge = 1;
- ep->rep_attr.cap.max_inline_data = 0;
- ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
- ep->rep_attr.qp_type = IB_QPT_RC;
- ep->rep_attr.port_num = ~0;
+ r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->re_max_requests);
+
+ ep->re_attr.event_handler = rpcrdma_qp_event_handler;
+ ep->re_attr.qp_context = ep;
+ ep->re_attr.srq = NULL;
+ ep->re_attr.cap.max_inline_data = 0;
+ ep->re_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ ep->re_attr.qp_type = IB_QPT_RC;
+ ep->re_attr.port_num = ~0;
dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
"iovs: send %d recv %d\n",
__func__,
- ep->rep_attr.cap.max_send_wr,
- ep->rep_attr.cap.max_recv_wr,
- ep->rep_attr.cap.max_send_sge,
- ep->rep_attr.cap.max_recv_sge);
+ ep->re_attr.cap.max_send_wr,
+ ep->re_attr.cap.max_recv_wr,
+ ep->re_attr.cap.max_send_sge,
+ ep->re_attr.cap.max_recv_sge);
- ep->rep_send_batch = ep->rep_max_requests >> 3;
- ep->rep_send_count = ep->rep_send_batch;
- init_waitqueue_head(&ep->rep_connect_wait);
- ep->rep_receive_count = 0;
+ ep->re_send_batch = ep->re_max_requests >> 3;
+ ep->re_send_count = ep->re_send_batch;
+ init_waitqueue_head(&ep->re_connect_wait);
- sendcq = ib_alloc_cq_any(ia->ri_id->device, NULL,
- ep->rep_attr.cap.max_send_wr + 1,
- IB_POLL_WORKQUEUE);
- if (IS_ERR(sendcq)) {
- rc = PTR_ERR(sendcq);
- goto out1;
+ ep->re_attr.send_cq = ib_alloc_cq_any(device, r_xprt,
+ ep->re_attr.cap.max_send_wr,
+ IB_POLL_WORKQUEUE);
+ if (IS_ERR(ep->re_attr.send_cq)) {
+ rc = PTR_ERR(ep->re_attr.send_cq);
+ ep->re_attr.send_cq = NULL;
+ goto out_destroy;
}
- recvcq = ib_alloc_cq_any(ia->ri_id->device, NULL,
- ep->rep_attr.cap.max_recv_wr + 1,
- IB_POLL_WORKQUEUE);
- if (IS_ERR(recvcq)) {
- rc = PTR_ERR(recvcq);
- goto out2;
+ ep->re_attr.recv_cq = ib_alloc_cq_any(device, r_xprt,
+ ep->re_attr.cap.max_recv_wr,
+ IB_POLL_WORKQUEUE);
+ if (IS_ERR(ep->re_attr.recv_cq)) {
+ rc = PTR_ERR(ep->re_attr.recv_cq);
+ ep->re_attr.recv_cq = NULL;
+ goto out_destroy;
}
-
- ep->rep_attr.send_cq = sendcq;
- ep->rep_attr.recv_cq = recvcq;
+ ep->re_receive_count = 0;
/* Initialize cma parameters */
- memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
+ memset(&ep->re_remote_cma, 0, sizeof(ep->re_remote_cma));
/* Prepare RDMA-CM private message */
+ pmsg = &ep->re_cm_private;
pmsg->cp_magic = rpcrdma_cmp_magic;
pmsg->cp_version = RPCRDMA_CMP_VERSION;
pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK;
- pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->rep_inline_send);
- pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->rep_inline_recv);
- ep->rep_remote_cma.private_data = pmsg;
- ep->rep_remote_cma.private_data_len = sizeof(*pmsg);
+ pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->re_inline_send);
+ pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->re_inline_recv);
+ ep->re_remote_cma.private_data = pmsg;
+ ep->re_remote_cma.private_data_len = sizeof(*pmsg);
/* Client offers RDMA Read but does not initiate */
- ep->rep_remote_cma.initiator_depth = 0;
- ep->rep_remote_cma.responder_resources =
- min_t(int, U8_MAX, ia->ri_id->device->attrs.max_qp_rd_atom);
+ ep->re_remote_cma.initiator_depth = 0;
+ ep->re_remote_cma.responder_resources =
+ min_t(int, U8_MAX, device->attrs.max_qp_rd_atom);
/* Limit transport retries so client can detect server
* GID changes quickly. RPC layer handles re-establishing
* transport connection and retransmission.
*/
- ep->rep_remote_cma.retry_count = 6;
+ ep->re_remote_cma.retry_count = 6;
/* RPC-over-RDMA handles its own flow control. In addition,
* make all RNR NAKs visible so we know that RPC-over-RDMA
* flow control is working correctly (no NAKs should be seen).
*/
- ep->rep_remote_cma.flow_control = 0;
- ep->rep_remote_cma.rnr_retry_count = 0;
+ ep->re_remote_cma.flow_control = 0;
+ ep->re_remote_cma.rnr_retry_count = 0;
+ ep->re_pd = ib_alloc_pd(device, 0);
+ if (IS_ERR(ep->re_pd)) {
+ rc = PTR_ERR(ep->re_pd);
+ ep->re_pd = NULL;
+ goto out_destroy;
+ }
+
+ rc = rdma_create_qp(id, ep->re_pd, &ep->re_attr);
+ if (rc)
+ goto out_destroy;
+
+ r_xprt->rx_ep = ep;
return 0;
-out2:
- ib_free_cq(sendcq);
-out1:
+out_destroy:
+ rpcrdma_ep_put(ep);
+ rdma_destroy_id(id);
return rc;
}
/**
- * rpcrdma_ep_destroy - Disconnect and destroy endpoint.
- * @r_xprt: transport instance to shut down
+ * rpcrdma_xprt_connect - Connect an unconnected transport
+ * @r_xprt: controlling transport instance
*
+ * Returns 0 on success or a negative errno.
*/
-void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt)
+int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
- if (ia->ri_id && ia->ri_id->qp) {
- rpcrdma_ep_disconnect(ep, ia);
- rdma_destroy_qp(ia->ri_id);
- ia->ri_id->qp = NULL;
- }
-
- if (ep->rep_attr.recv_cq)
- ib_free_cq(ep->rep_attr.recv_cq);
- if (ep->rep_attr.send_cq)
- ib_free_cq(ep->rep_attr.send_cq);
-}
-
-/* Re-establish a connection after a device removal event.
- * Unlike a normal reconnection, a fresh PD and a new set
- * of MRs and buffers is needed.
- */
-static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
- struct ib_qp_init_attr *qp_init_attr)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- int rc, err;
-
- trace_xprtrdma_reinsert(r_xprt);
-
- rc = -EHOSTUNREACH;
- if (rpcrdma_ia_open(r_xprt))
- goto out1;
-
- rc = -ENOMEM;
- err = rpcrdma_ep_create(r_xprt);
- if (err) {
- pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err);
- goto out2;
- }
- memcpy(qp_init_attr, &ep->rep_attr, sizeof(*qp_init_attr));
-
- rc = -ENETUNREACH;
- err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr);
- if (err) {
- pr_err("rpcrdma: rdma_create_qp returned %d\n", err);
- goto out3;
- }
-
- rpcrdma_mrs_create(r_xprt);
- return 0;
-
-out3:
- rpcrdma_ep_destroy(r_xprt);
-out2:
- rpcrdma_ia_close(ia);
-out1:
- return rc;
-}
-
-static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt,
- struct ib_qp_init_attr *qp_init_attr)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rdma_cm_id *id, *old;
- int err, rc;
-
- trace_xprtrdma_reconnect(r_xprt);
-
- rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia);
-
- rc = -EHOSTUNREACH;
- id = rpcrdma_create_id(r_xprt, ia);
- if (IS_ERR(id))
- goto out;
-
- /* As long as the new ID points to the same device as the
- * old ID, we can reuse the transport's existing PD and all
- * previously allocated MRs. Also, the same device means
- * the transport's previous DMA mappings are still valid.
- *
- * This is a sanity check only. There should be no way these
- * point to two different devices here.
- */
- old = id;
- rc = -ENETUNREACH;
- if (ia->ri_id->device != id->device) {
- pr_err("rpcrdma: can't reconnect on different device!\n");
- goto out_destroy;
- }
-
- err = rdma_create_qp(id, ia->ri_pd, qp_init_attr);
- if (err)
- goto out_destroy;
-
- /* Atomically replace the transport's ID and QP. */
- rc = 0;
- old = ia->ri_id;
- ia->ri_id = id;
- rdma_destroy_qp(old);
-
-out_destroy:
- rdma_destroy_id(old);
-out:
- return rc;
-}
-
-/*
- * Connect unconnected endpoint.
- */
-int
-rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
-{
- struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
- rx_ia);
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
- struct ib_qp_init_attr qp_init_attr;
+ struct rpcrdma_ep *ep;
int rc;
-retry:
- memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr));
- switch (ep->rep_connected) {
- case 0:
- dprintk("RPC: %s: connecting...\n", __func__);
- rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr);
- if (rc) {
- rc = -ENETUNREACH;
- goto out_noupdate;
- }
- break;
- case -ENODEV:
- rc = rpcrdma_ep_recreate_xprt(r_xprt, &qp_init_attr);
- if (rc)
- goto out_noupdate;
- break;
- default:
- rc = rpcrdma_ep_reconnect(r_xprt, &qp_init_attr);
- if (rc)
- goto out;
- }
+ rc = rpcrdma_ep_create(r_xprt);
+ if (rc)
+ return rc;
+ ep = r_xprt->rx_ep;
- ep->rep_connected = 0;
xprt_clear_connected(xprt);
+ rpcrdma_reset_cwnd(r_xprt);
- rpcrdma_post_recvs(r_xprt, true);
+ /* Bump the ep's reference count while there are
+ * outstanding Receives.
+ */
+ rpcrdma_ep_get(ep);
+ rpcrdma_post_recvs(r_xprt, 1, true);
- rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
+ rc = rdma_connect(ep->re_id, &ep->re_remote_cma);
if (rc)
goto out;
if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
- wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
- if (ep->rep_connected <= 0) {
- if (ep->rep_connected == -EAGAIN)
- goto retry;
- rc = ep->rep_connected;
+ wait_event_interruptible(ep->re_connect_wait,
+ ep->re_connect_status != 0);
+ if (ep->re_connect_status <= 0) {
+ rc = ep->re_connect_status;
goto out;
}
- dprintk("RPC: %s: connected\n", __func__);
+ rc = rpcrdma_sendctxs_create(r_xprt);
+ if (rc) {
+ rc = -ENOTCONN;
+ goto out;
+ }
+
+ rc = rpcrdma_reqs_setup(r_xprt);
+ if (rc) {
+ rc = -ENOTCONN;
+ goto out;
+ }
+ rpcrdma_mrs_create(r_xprt);
out:
- if (rc)
- ep->rep_connected = rc;
-
-out_noupdate:
+ trace_xprtrdma_connect(r_xprt, rc);
return rc;
}
/**
- * rpcrdma_ep_disconnect - Disconnect underlying transport
- * @ep: endpoint to disconnect
- * @ia: associated interface adapter
+ * rpcrdma_xprt_disconnect - Disconnect underlying transport
+ * @r_xprt: controlling transport instance
*
- * This is separate from destroy to facilitate the ability
- * to reconnect without recreating the endpoint.
+ * Caller serializes. Either the transport send lock is held,
+ * or we're being called to destroy the transport.
*
- * This call is not reentrant, and must not be made in parallel
- * on the same endpoint.
+ * On return, @r_xprt is completely divested of all hardware
+ * resources and prepared for the next ->connect operation.
*/
-void
-rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
- rx_ep);
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ struct rdma_cm_id *id;
int rc;
- /* returns without wait if ID is not connected */
- rc = rdma_disconnect(ia->ri_id);
- if (!rc)
- wait_event_interruptible(ep->rep_connect_wait,
- ep->rep_connected != 1);
- else
- ep->rep_connected = rc;
+ if (!ep)
+ return;
+
+ id = ep->re_id;
+ rc = rdma_disconnect(id);
trace_xprtrdma_disconnect(r_xprt, rc);
rpcrdma_xprt_drain(r_xprt);
+ rpcrdma_reps_unmap(r_xprt);
rpcrdma_reqs_reset(r_xprt);
+ rpcrdma_mrs_destroy(r_xprt);
+ rpcrdma_sendctxs_destroy(r_xprt);
+
+ if (rpcrdma_ep_put(ep))
+ rdma_destroy_id(id);
+
+ r_xprt->rx_ep = NULL;
}
/* Fixed-size circular FIFO queue. This implementation is wait-free and
@@ -800,27 +623,28 @@
* queue activity, and rpcrdma_xprt_drain has flushed all remaining
* Send requests.
*/
-static void rpcrdma_sendctxs_destroy(struct rpcrdma_buffer *buf)
+static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt)
{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
unsigned long i;
+ if (!buf->rb_sc_ctxs)
+ return;
for (i = 0; i <= buf->rb_sc_last; i++)
kfree(buf->rb_sc_ctxs[i]);
kfree(buf->rb_sc_ctxs);
+ buf->rb_sc_ctxs = NULL;
}
-static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ia *ia)
+static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep)
{
struct rpcrdma_sendctx *sc;
- sc = kzalloc(struct_size(sc, sc_sges, ia->ri_max_send_sges),
+ sc = kzalloc(struct_size(sc, sc_sges, ep->re_attr.cap.max_send_sge),
GFP_KERNEL);
if (!sc)
return NULL;
- sc->sc_wr.wr_cqe = &sc->sc_cqe;
- sc->sc_wr.sg_list = sc->sc_sges;
- sc->sc_wr.opcode = IB_WR_SEND;
sc->sc_cqe.done = rpcrdma_wc_send;
return sc;
}
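The send context now sizes its SGE array from the device's reported max_send_sge (ep->re_attr.cap.max_send_sge) instead of a value cached in the old rpcrdma_ia; struct_size() computes sizeof(*sc) plus that many trailing sc_sges entries with overflow checking. A user-space sketch of the same flexible-array allocation, with the overflow check open-coded and all names illustrative:

#include <stdint.h>
#include <stdlib.h>

struct sge {
	uint64_t addr;
	uint32_t length;
	uint32_t lkey;
};

struct sendctx {
	unsigned int unmap_count;
	struct sge sges[];		/* flexible array member, like sc_sges[] */
};

/* Allocate a zeroed sendctx with room for @nsges trailing SGEs. */
static struct sendctx *sendctx_create(unsigned int nsges)
{
	size_t size;

	/* struct_size()-style overflow check before the multiply/add */
	if (nsges > (SIZE_MAX - sizeof(struct sendctx)) / sizeof(struct sge))
		return NULL;
	size = sizeof(struct sendctx) + (size_t)nsges * sizeof(struct sge);
	return calloc(1, size);
}

int main(void)
{
	struct sendctx *sc = sendctx_create(4);	/* e.g. room for 4 SGEs */

	if (!sc)
		return 1;
	sc->sges[3].length = 1024;	/* last slot is valid */
	free(sc);
	return 0;
}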
@@ -836,22 +660,22 @@
* the ->send_request call to fail temporarily before too many
* Sends are posted.
*/
- i = buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS;
- dprintk("RPC: %s: allocating %lu send_ctxs\n", __func__, i);
+ i = r_xprt->rx_ep->re_max_requests + RPCRDMA_MAX_BC_REQUESTS;
buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL);
if (!buf->rb_sc_ctxs)
return -ENOMEM;
buf->rb_sc_last = i - 1;
for (i = 0; i <= buf->rb_sc_last; i++) {
- sc = rpcrdma_sendctx_create(&r_xprt->rx_ia);
+ sc = rpcrdma_sendctx_create(r_xprt->rx_ep);
if (!sc)
return -ENOMEM;
- sc->sc_xprt = r_xprt;
buf->rb_sc_ctxs[i] = sc;
}
+ buf->rb_sc_head = 0;
+ buf->rb_sc_tail = 0;
return 0;
}
@@ -911,6 +735,7 @@
/**
* rpcrdma_sendctx_put_locked - Release a send context
+ * @r_xprt: controlling transport instance
* @sc: send context to release
*
* Usage: Called from Send completion to return a sendctxt
@@ -918,10 +743,10 @@
*
* The caller serializes calls to this function (per transport).
*/
-static void
-rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc)
+static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_sendctx *sc)
{
- struct rpcrdma_buffer *buf = &sc->sc_xprt->rx_buf;
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
unsigned long next_tail;
/* Unmap SGEs of previously completed but unsignaled
@@ -939,17 +764,17 @@
/* Paired with READ_ONCE */
smp_store_release(&buf->rb_sc_tail, next_tail);
- xprt_write_space(&sc->sc_xprt->rx_xprt);
+ xprt_write_space(&r_xprt->rx_xprt);
}
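The sendctx pool is a fixed-size circular FIFO: the send path advances rb_sc_head when it takes a free context, and the Send completion path above advances rb_sc_tail when contexts are recycled. The smp_store_release() on rb_sc_tail pairs with a READ_ONCE() on the other side, so a context is never reused before its unmapping is visible. Below is a minimal single-producer/single-consumer ring in user-space C11, with release/acquire standing in for smp_store_release()/READ_ONCE(); it illustrates the pattern only and is not the kernel code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define RING_SIZE 8			/* power of two keeps the index wrap cheap */

struct spsc_ring {
	void *slots[RING_SIZE];
	_Atomic size_t head;		/* advanced only by the producer */
	_Atomic size_t tail;		/* advanced only by the consumer */
};

/* Producer: publish @item, then move head with a release store so the
 * consumer's acquire load of head also sees the filled slot. */
static bool ring_put(struct spsc_ring *r, void *item)
{
	size_t head = atomic_load_explicit(&r->head, memory_order_relaxed);
	size_t tail = atomic_load_explicit(&r->tail, memory_order_acquire);

	if (((head + 1) & (RING_SIZE - 1)) == tail)
		return false;		/* full */
	r->slots[head] = item;
	atomic_store_explicit(&r->head, (head + 1) & (RING_SIZE - 1),
			      memory_order_release);
	return true;
}

/* Consumer: read the slot, then move tail with a release store so the
 * producer's acquire load of tail knows the slot may be reused. */
static void *ring_get(struct spsc_ring *r)
{
	size_t tail = atomic_load_explicit(&r->tail, memory_order_relaxed);
	size_t head = atomic_load_explicit(&r->head, memory_order_acquire);
	void *item;

	if (tail == head)
		return NULL;		/* empty */
	item = r->slots[tail];
	atomic_store_explicit(&r->tail, (tail + 1) & (RING_SIZE - 1),
			      memory_order_release);
	return item;
}

int main(void)
{
	struct spsc_ring ring = { .head = 0, .tail = 0 };
	int x = 42;

	ring_put(&ring, &x);
	return ring_get(&ring) == &x ? 0 : 1;
}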
static void
rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
unsigned int count;
- for (count = 0; count < ia->ri_max_segs; count++) {
+ for (count = 0; count < ep->re_max_rdma_segs; count++) {
struct rpcrdma_mr *mr;
int rc;
@@ -957,14 +782,12 @@
if (!mr)
break;
- rc = frwr_init_mr(ia, mr);
+ rc = frwr_mr_init(r_xprt, mr);
if (rc) {
kfree(mr);
break;
}
- mr->mr_xprt = r_xprt;
-
spin_lock(&buf->rb_lock);
rpcrdma_mr_push(mr, &buf->rb_mrs);
list_add(&mr->mr_all, &buf->rb_all_mrs);
@@ -988,6 +811,28 @@
}
/**
+ * rpcrdma_mrs_refresh - Wake the MR refresh worker
+ * @r_xprt: controlling transport instance
+ *
+ */
+void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+
+ /* If there is no underlying connection, it's no use
+ * to wake the refresh worker.
+ */
+ if (ep->re_connect_status == 1) {
+ /* The work is scheduled on a WQ_MEM_RECLAIM
+ * workqueue in order to prevent MR allocation
+ * from recursing into NFS during direct reclaim.
+ */
+ queue_work(xprtiod_workqueue, &buf->rb_refresh_worker);
+ }
+}
+
+/**
* rpcrdma_req_create - Allocate an rpcrdma_req object
* @r_xprt: controlling r_xprt
* @size: initial size, in bytes, of send and receive buffers
@@ -999,32 +844,19 @@
gfp_t flags)
{
struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
- struct rpcrdma_regbuf *rb;
struct rpcrdma_req *req;
- size_t maxhdrsize;
req = kzalloc(sizeof(*req), flags);
if (req == NULL)
goto out1;
- /* Compute maximum header buffer size in bytes */
- maxhdrsize = rpcrdma_fixed_maxsz + 3 +
- r_xprt->rx_ia.ri_max_segs * rpcrdma_readchunk_maxsz;
- maxhdrsize *= sizeof(__be32);
- rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize),
- DMA_TO_DEVICE, flags);
- if (!rb)
- goto out2;
- req->rl_rdmabuf = rb;
- xdr_buf_init(&req->rl_hdrbuf, rdmab_data(rb), rdmab_length(rb));
-
req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE, flags);
if (!req->rl_sendbuf)
- goto out3;
+ goto out2;
req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE, flags);
if (!req->rl_recvbuf)
- goto out4;
+ goto out3;
INIT_LIST_HEAD(&req->rl_free_mrs);
INIT_LIST_HEAD(&req->rl_registered);
@@ -1033,10 +865,8 @@
spin_unlock(&buffer->rb_lock);
return req;
-out4:
- kfree(req->rl_sendbuf);
out3:
- kfree(req->rl_rdmabuf);
+ kfree(req->rl_sendbuf);
out2:
kfree(req);
out1:
@@ -1044,10 +874,73 @@
}
/**
- * rpcrdma_reqs_reset - Reset all reqs owned by a transport
+ * rpcrdma_req_setup - Per-connection instance setup of an rpcrdma_req object
* @r_xprt: controlling transport instance
+ * @req: rpcrdma_req object to set up
*
- * ASSUMPTION: the rb_allreqs list is stable for the duration,
+ * Returns zero on success, and a negative errno on failure.
+ */
+int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+ struct rpcrdma_regbuf *rb;
+ size_t maxhdrsize;
+
+ /* Compute maximum header buffer size in bytes */
+ maxhdrsize = rpcrdma_fixed_maxsz + 3 +
+ r_xprt->rx_ep->re_max_rdma_segs * rpcrdma_readchunk_maxsz;
+ maxhdrsize *= sizeof(__be32);
+ rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize),
+ DMA_TO_DEVICE, GFP_KERNEL);
+ if (!rb)
+ goto out;
+
+ if (!__rpcrdma_regbuf_dma_map(r_xprt, rb))
+ goto out_free;
+
+ req->rl_rdmabuf = rb;
+ xdr_buf_init(&req->rl_hdrbuf, rdmab_data(rb), rdmab_length(rb));
+ return 0;
+
+out_free:
+ rpcrdma_regbuf_free(rb);
+out:
+ return -ENOMEM;
+}
+
+/* ASSUMPTION: the rb_allreqs list is stable for the duration,
+ * and thus can be walked without holding rb_lock. Eg. the
+ * caller is holding the transport send lock to exclude
+ * device removal or disconnection.
+ */
+static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_req *req;
+ int rc;
+
+ list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
+ rc = rpcrdma_req_setup(r_xprt, req);
+ if (rc)
+ return rc;
+ }
+ return 0;
+}
+
+static void rpcrdma_req_reset(struct rpcrdma_req *req)
+{
+ /* Credits are valid for only one connection */
+ req->rl_slot.rq_cong = 0;
+
+ rpcrdma_regbuf_free(req->rl_rdmabuf);
+ req->rl_rdmabuf = NULL;
+
+ rpcrdma_regbuf_dma_unmap(req->rl_sendbuf);
+ rpcrdma_regbuf_dma_unmap(req->rl_recvbuf);
+
+ frwr_reset(req);
+}
+
+/* ASSUMPTION: the rb_allreqs list is stable for the duration,
* and thus can be walked without holding rb_lock. Eg. the
* caller is holding the transport send lock to exclude
* device removal or disconnection.
@@ -1057,14 +950,16 @@
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_req *req;
- list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
- /* Credits are valid only for one connection */
- req->rl_slot.rq_cong = 0;
- }
+ list_for_each_entry(req, &buf->rb_allreqs, rl_all)
+ rpcrdma_req_reset(req);
}
-static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
- bool temp)
+/* No locking needed here. This function is called only by the
+ * Receive completion handler.
+ */
+static noinline
+struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
+ bool temp)
{
struct rpcrdma_rep *rep;
@@ -1072,11 +967,14 @@
if (rep == NULL)
goto out;
- rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep.rep_inline_recv,
+ rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv,
DMA_FROM_DEVICE, GFP_KERNEL);
if (!rep->rr_rdmabuf)
goto out_free;
+ if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
+ goto out_free_regbuf;
+
xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf),
rdmab_length(rep->rr_rdmabuf));
rep->rr_cqe.done = rpcrdma_wc_receive;
@@ -1089,12 +987,17 @@
list_add(&rep->rr_all, &r_xprt->rx_buf.rb_all_reps);
return rep;
+out_free_regbuf:
+ rpcrdma_regbuf_free(rep->rr_rdmabuf);
out_free:
kfree(rep);
out:
return NULL;
}
+/* No locking needed here. This function is invoked only by the
+ * Receive completion handler, or during transport shutdown.
+ */
static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
{
list_del(&rep->rr_all);
@@ -1124,8 +1027,10 @@
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_rep *rep;
- list_for_each_entry(rep, &buf->rb_all_reps, rr_all)
+ list_for_each_entry(rep, &buf->rb_all_reps, rr_all) {
rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf);
+ rep->rr_temp = true;
+ }
}
static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf)
@@ -1147,37 +1052,29 @@
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
int i, rc;
- buf->rb_max_requests = r_xprt->rx_ep.rep_max_requests;
buf->rb_bc_srv_max_requests = 0;
spin_lock_init(&buf->rb_lock);
INIT_LIST_HEAD(&buf->rb_mrs);
INIT_LIST_HEAD(&buf->rb_all_mrs);
INIT_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker);
- rpcrdma_mrs_create(r_xprt);
-
INIT_LIST_HEAD(&buf->rb_send_bufs);
INIT_LIST_HEAD(&buf->rb_allreqs);
INIT_LIST_HEAD(&buf->rb_all_reps);
rc = -ENOMEM;
- for (i = 0; i < buf->rb_max_requests; i++) {
+ for (i = 0; i < r_xprt->rx_xprt.max_reqs; i++) {
struct rpcrdma_req *req;
- req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE,
+ req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2,
GFP_KERNEL);
if (!req)
goto out;
list_add(&req->rl_list, &buf->rb_send_bufs);
}
- buf->rb_credits = 1;
init_llist_head(&buf->rb_free_reps);
- rc = rpcrdma_sendctxs_create(r_xprt);
- if (rc)
- goto out;
-
return 0;
out:
rpcrdma_buffer_destroy(buf);
@@ -1188,8 +1085,8 @@
* rpcrdma_req_destroy - Destroy an rpcrdma_req object
* @req: unused object to be destroyed
*
- * This function assumes that the caller prevents concurrent device
- * unload and transport tear-down.
+ * Relies on caller holding the transport send lock to protect
+ * removing req->rl_all from buf->rb_all_reqs safely.
*/
void rpcrdma_req_destroy(struct rpcrdma_req *req)
{
@@ -1215,17 +1112,18 @@
/**
* rpcrdma_mrs_destroy - Release all of a transport's MRs
- * @buf: controlling buffer instance
+ * @r_xprt: controlling transport instance
*
* Relies on caller holding the transport send lock to protect
* removing mr->mr_list from req->rl_free_mrs safely.
*/
-static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf)
+static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
- rx_buf);
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_mr *mr;
+ cancel_work_sync(&buf->rb_refresh_worker);
+
spin_lock(&buf->rb_lock);
while ((mr = list_first_entry_or_null(&buf->rb_all_mrs,
struct rpcrdma_mr,
@@ -1235,10 +1133,10 @@
spin_unlock(&buf->rb_lock);
frwr_release_mr(mr);
+
spin_lock(&buf->rb_lock);
}
spin_unlock(&buf->rb_lock);
- r_xprt->rx_stats.mrs_allocated = 0;
}
/**
@@ -1252,9 +1150,6 @@
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
- cancel_work_sync(&buf->rb_refresh_worker);
-
- rpcrdma_sendctxs_destroy(buf);
rpcrdma_reps_destroy(buf);
while (!list_empty(&buf->rb_send_bufs)) {
@@ -1265,8 +1160,6 @@
list_del(&req->rl_list);
rpcrdma_req_destroy(req);
}
-
- rpcrdma_mrs_destroy(buf);
}
/**
@@ -1299,7 +1192,7 @@
if (mr->mr_dir != DMA_NONE) {
trace_xprtrdma_mr_unmap(mr);
- ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device,
+ ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
mr->mr_sg, mr->mr_nents, mr->mr_dir);
mr->mr_dir = DMA_NONE;
}
@@ -1308,6 +1201,20 @@
}
/**
+ * rpcrdma_reply_put - Put reply buffers back into pool
+ * @buffers: buffer pool
+ * @req: object to return
+ *
+ */
+void rpcrdma_reply_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
+{
+ if (req->rl_reply) {
+ rpcrdma_rep_put(buffers, req->rl_reply);
+ req->rl_reply = NULL;
+ }
+}
+
+/**
* rpcrdma_buffer_get - Get a request buffer
* @buffers: Buffer pool from which to obtain a buffer
*
@@ -1335,9 +1242,7 @@
*/
void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
{
- if (req->rl_reply)
- rpcrdma_rep_put(buffers, req->rl_reply);
- req->rl_reply = NULL;
+ rpcrdma_reply_put(buffers, req);
spin_lock(&buffers->rb_lock);
list_add(&req->rl_list, &buffers->rb_send_bufs);
@@ -1417,7 +1322,7 @@
bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_regbuf *rb)
{
- struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+ struct ib_device *device = r_xprt->rx_ep->re_id->device;
if (rb->rg_direction == DMA_NONE)
return false;
@@ -1430,7 +1335,7 @@
}
rb->rg_device = device;
- rb->rg_iov.lkey = r_xprt->rx_ia.ri_pd->local_dma_lkey;
+ rb->rg_iov.lkey = r_xprt->rx_ep->re_pd->local_dma_lkey;
return true;
}
@@ -1456,32 +1361,29 @@
}
/**
- * rpcrdma_ep_post - Post WRs to a transport's Send Queue
- * @ia: transport's device information
- * @ep: transport's RDMA endpoint information
+ * rpcrdma_post_sends - Post WRs to a transport's Send Queue
+ * @r_xprt: controlling transport instance
* @req: rpcrdma_req containing the Send WR to post
*
* Returns 0 if the post was successful, otherwise -ENOTCONN
* is returned.
*/
-int
-rpcrdma_ep_post(struct rpcrdma_ia *ia,
- struct rpcrdma_ep *ep,
- struct rpcrdma_req *req)
+int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
- struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr;
+ struct ib_send_wr *send_wr = &req->rl_wr;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
int rc;
- if (!ep->rep_send_count || kref_read(&req->rl_kref) > 1) {
+ if (!ep->re_send_count || kref_read(&req->rl_kref) > 1) {
send_wr->send_flags |= IB_SEND_SIGNALED;
- ep->rep_send_count = ep->rep_send_batch;
+ ep->re_send_count = ep->re_send_batch;
} else {
send_wr->send_flags &= ~IB_SEND_SIGNALED;
- --ep->rep_send_count;
+ --ep->re_send_count;
}
- rc = frwr_send(ia, req);
- trace_xprtrdma_post_send(req, rc);
+ trace_xprtrdma_post_send(req);
+ rc = frwr_send(r_xprt, req);
if (rc)
return -ENOTCONN;
return 0;
@@ -1490,24 +1392,24 @@
/**
* rpcrdma_post_recvs - Refill the Receive Queue
* @r_xprt: controlling transport instance
- * @temp: mark Receive buffers to be deleted after use
+ * @needed: current credit grant
+ * @temp: mark Receive buffers to be deleted after one use
*
*/
-void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
+void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct ib_recv_wr *i, *wr, *bad_wr;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ struct ib_recv_wr *wr, *bad_wr;
struct rpcrdma_rep *rep;
- int needed, count, rc;
+ int count, rc;
rc = 0;
count = 0;
- needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
- if (likely(ep->rep_receive_count > needed))
+ if (likely(ep->re_receive_count > needed))
goto out;
- needed -= ep->rep_receive_count;
+ needed -= ep->re_receive_count;
if (!temp)
needed += RPCRDMA_MAX_RECV_BATCH;
@@ -1524,24 +1426,16 @@
if (!rep)
break;
+ trace_xprtrdma_post_recv(rep);
rep->rr_recv_wr.next = wr;
wr = &rep->rr_recv_wr;
--needed;
+ ++count;
}
if (!wr)
goto out;
- for (i = wr; i; i = i->next) {
- rep = container_of(i, struct rpcrdma_rep, rr_recv_wr);
-
- if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
- goto release_wrs;
-
- trace_xprtrdma_post_recv(rep);
- ++count;
- }
-
- rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr,
+ rc = ib_post_recv(ep->re_id->qp, wr,
(const struct ib_recv_wr **)&bad_wr);
out:
trace_xprtrdma_post_recvs(r_xprt, count, rc);
@@ -1555,13 +1449,6 @@
--count;
}
}
- ep->rep_receive_count += count;
+ ep->re_receive_count += count;
return;
-
-release_wrs:
- for (i = wr; i;) {
- rep = container_of(i, struct rpcrdma_rep, rr_recv_wr);
- i = i->next;
- rpcrdma_recv_buffer_put(rep);
- }
}
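With this change rpcrdma_post_recvs() takes the credit grant from its caller in @needed instead of reading rb_credits itself: it skips the post entirely when re_receive_count already covers the grant, and otherwise tops the Receive Queue up to the grant plus a batch of extras so it does not have to repost after every completion; single-use (temp) Receives posted at connect time skip the batch. A rough sketch of that arithmetic, where RECV_BATCH stands in for RPCRDMA_MAX_RECV_BATCH (whose actual value is not shown in this patch):

#define RECV_BATCH 32	/* assumed stand-in for RPCRDMA_MAX_RECV_BATCH */

/* How many new Receive WRs to post, given the current credit grant
 * (@needed), the number already posted (@posted), and whether the
 * buffers are single-use (@temp). Returns 0 when no post is required. */
int recvs_to_post(int needed, int posted, int temp)
{
	if (posted > needed)
		return 0;
	needed -= posted;
	if (!temp)
		needed += RECV_BATCH;
	return needed;
}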
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index fc76167..702f034 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -65,44 +65,34 @@
#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
/*
- * Interface Adapter -- one per transport instance
+ * RDMA Endpoint -- connection endpoint details
*/
-struct rpcrdma_ia {
- struct rdma_cm_id *ri_id;
- struct ib_pd *ri_pd;
- int ri_async_rc;
- unsigned int ri_max_segs;
- unsigned int ri_max_frwr_depth;
- unsigned int ri_max_send_sges;
- bool ri_implicit_roundup;
- enum ib_mr_type ri_mrtype;
- unsigned long ri_flags;
- struct completion ri_done;
- struct completion ri_remove_done;
-};
-
-enum {
- RPCRDMA_IAF_REMOVING = 0,
-};
-
-/*
- * RDMA Endpoint -- one per transport instance
- */
-
struct rpcrdma_ep {
- unsigned int rep_send_count;
- unsigned int rep_send_batch;
- unsigned int rep_max_inline_send;
- unsigned int rep_max_inline_recv;
- int rep_connected;
- struct ib_qp_init_attr rep_attr;
- wait_queue_head_t rep_connect_wait;
- struct rpcrdma_connect_private rep_cm_private;
- struct rdma_conn_param rep_remote_cma;
- unsigned int rep_max_requests; /* set by /proc */
- unsigned int rep_inline_send; /* negotiated */
- unsigned int rep_inline_recv; /* negotiated */
- int rep_receive_count;
+ struct kref re_kref;
+ struct rdma_cm_id *re_id;
+ struct ib_pd *re_pd;
+ unsigned int re_max_rdma_segs;
+ unsigned int re_max_fr_depth;
+ bool re_implicit_roundup;
+ enum ib_mr_type re_mrtype;
+ struct completion re_done;
+ unsigned int re_send_count;
+ unsigned int re_send_batch;
+ unsigned int re_max_inline_send;
+ unsigned int re_max_inline_recv;
+ int re_async_rc;
+ int re_connect_status;
+ atomic_t re_force_disconnect;
+ struct ib_qp_init_attr re_attr;
+ wait_queue_head_t re_connect_wait;
+ struct rpc_xprt *re_xprt;
+ struct rpcrdma_connect_private
+ re_cm_private;
+ struct rdma_conn_param re_remote_cma;
+ int re_receive_count;
+ unsigned int re_max_requests; /* depends on device */
+ unsigned int re_inline_send; /* negotiated */
+ unsigned int re_inline_recv; /* negotiated */
};
/* Pre-allocate extra Work Requests for handling backward receives
@@ -219,12 +209,8 @@
/* struct rpcrdma_sendctx - DMA mapped SGEs to unmap after Send completes
*/
struct rpcrdma_req;
-struct rpcrdma_xprt;
struct rpcrdma_sendctx {
- struct ib_send_wr sc_wr;
struct ib_cqe sc_cqe;
- struct ib_device *sc_device;
- struct rpcrdma_xprt *sc_xprt;
struct rpcrdma_req *sc_req;
unsigned int sc_unmap_count;
struct ib_sge sc_sges[];
@@ -258,7 +244,6 @@
u32 mr_handle;
u32 mr_length;
u64 mr_offset;
- struct work_struct mr_recycle;
struct list_head mr_all;
};
@@ -319,6 +304,7 @@
struct rpcrdma_rep *rl_reply;
struct xdr_stream rl_stream;
struct xdr_buf rl_hdrbuf;
+ struct ib_send_wr rl_wr;
struct rpcrdma_sendctx *rl_sendctx;
struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */
struct rpcrdma_regbuf *rl_sendbuf; /* rq_snd_buf */
@@ -377,7 +363,7 @@
struct llist_head rb_free_reps;
- u32 rb_max_requests;
+ __be32 rb_max_requests;
u32 rb_credits; /* most recent credit grant */
u32 rb_bc_srv_max_requests;
@@ -427,8 +413,7 @@
*/
struct rpcrdma_xprt {
struct rpc_xprt rx_xprt;
- struct rpcrdma_ia rx_ia;
- struct rpcrdma_ep rx_ep;
+ struct rpcrdma_ep *rx_ep;
struct rpcrdma_buffer rx_buf;
struct delayed_work rx_connect_worker;
struct rpc_timeout rx_timeout;
@@ -460,29 +445,21 @@
extern unsigned int xprt_rdma_memreg_strategy;
/*
- * Interface Adapter calls - xprtrdma/verbs.c
- */
-int rpcrdma_ia_open(struct rpcrdma_xprt *xprt);
-void rpcrdma_ia_remove(struct rpcrdma_ia *ia);
-void rpcrdma_ia_close(struct rpcrdma_ia *);
-
-/*
* Endpoint calls - xprtrdma/verbs.c
*/
-int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt);
-void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt);
-int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
-void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc);
+int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt);
+void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt);
-int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
- struct rpcrdma_req *);
-void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp);
+int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
+void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp);
/*
* Buffer calls - xprtrdma/verbs.c
*/
struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size,
gfp_t flags);
+int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void rpcrdma_req_destroy(struct rpcrdma_req *req);
int rpcrdma_buffer_create(struct rpcrdma_xprt *);
void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
@@ -490,16 +467,12 @@
struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt);
void rpcrdma_mr_put(struct rpcrdma_mr *mr);
-
-static inline void
-rpcrdma_mr_recycle(struct rpcrdma_mr *mr)
-{
- schedule_work(&mr->mr_recycle);
-}
+void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt);
struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers,
struct rpcrdma_req *req);
+void rpcrdma_reply_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req);
void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size,
@@ -544,18 +517,15 @@
/* Memory registration calls xprtrdma/frwr_ops.c
*/
-bool frwr_is_supported(struct ib_device *device);
-void frwr_recycle(struct rpcrdma_req *req);
void frwr_reset(struct rpcrdma_req *req);
-int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep);
-int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr);
+int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device);
+int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr);
void frwr_release_mr(struct rpcrdma_mr *mr);
-size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt);
struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_mr_seg *seg,
int nsegs, bool writing, __be32 xid,
struct rpcrdma_mr *mr);
-int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req);
+int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
@@ -566,6 +536,8 @@
enum rpcrdma_chunktype {
rpcrdma_noch = 0,
+ rpcrdma_noch_pullup,
+ rpcrdma_noch_mapped,
rpcrdma_readch,
rpcrdma_areadch,
rpcrdma_writech,
@@ -578,7 +550,8 @@
enum rpcrdma_chunktype rtype);
void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc);
int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
-void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep);
+void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt);
void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);
void rpcrdma_reply_handler(struct rpcrdma_rep *rep);
@@ -590,7 +563,6 @@
/* RPC/RDMA module init - xprtrdma/transport.c
*/
-extern unsigned int xprt_rdma_slot_table_entries;
extern unsigned int xprt_rdma_max_inline_read;
extern unsigned int xprt_rdma_max_inline_write;
void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 8ffc54b..16c7758 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -54,6 +54,7 @@
#include <trace/events/sunrpc.h>
+#include "socklib.h"
#include "sunrpc.h"
static void xs_close(struct rpc_xprt *xprt);
@@ -496,8 +497,8 @@
int flags, struct rpc_rqst *req)
{
struct xdr_buf *buf = &req->rq_private_buf;
- size_t want, uninitialized_var(read);
- ssize_t uninitialized_var(ret);
+ size_t want, read;
+ ssize_t ret;
xs_read_header(transport, buf);
@@ -750,125 +751,6 @@
#define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL)
-static int xs_sendmsg(struct socket *sock, struct msghdr *msg, size_t seek)
-{
- if (seek)
- iov_iter_advance(&msg->msg_iter, seek);
- return sock_sendmsg(sock, msg);
-}
-
-static int xs_send_kvec(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t seek)
-{
- iov_iter_kvec(&msg->msg_iter, WRITE, vec, 1, vec->iov_len);
- return xs_sendmsg(sock, msg, seek);
-}
-
-static int xs_send_pagedata(struct socket *sock, struct msghdr *msg, struct xdr_buf *xdr, size_t base)
-{
- int err;
-
- err = xdr_alloc_bvec(xdr, GFP_KERNEL);
- if (err < 0)
- return err;
-
- iov_iter_bvec(&msg->msg_iter, WRITE, xdr->bvec,
- xdr_buf_pagecount(xdr),
- xdr->page_len + xdr->page_base);
- return xs_sendmsg(sock, msg, base + xdr->page_base);
-}
-
-#define xs_record_marker_len() sizeof(rpc_fraghdr)
-
-/* Common case:
- * - stream transport
- * - sending from byte 0 of the message
- * - the message is wholly contained in @xdr's head iovec
- */
-static int xs_send_rm_and_kvec(struct socket *sock, struct msghdr *msg,
- rpc_fraghdr marker, struct kvec *vec, size_t base)
-{
- struct kvec iov[2] = {
- [0] = {
- .iov_base = &marker,
- .iov_len = sizeof(marker)
- },
- [1] = *vec,
- };
- size_t len = iov[0].iov_len + iov[1].iov_len;
-
- iov_iter_kvec(&msg->msg_iter, WRITE, iov, 2, len);
- return xs_sendmsg(sock, msg, base);
-}
-
-/**
- * xs_sendpages - write pages directly to a socket
- * @sock: socket to send on
- * @addr: UDP only -- address of destination
- * @addrlen: UDP only -- length of destination address
- * @xdr: buffer containing this request
- * @base: starting position in the buffer
- * @rm: stream record marker field
- * @sent_p: return the total number of bytes successfully queued for sending
- *
- */
-static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, rpc_fraghdr rm, int *sent_p)
-{
- struct msghdr msg = {
- .msg_name = addr,
- .msg_namelen = addrlen,
- .msg_flags = XS_SENDMSG_FLAGS | MSG_MORE,
- };
- unsigned int rmsize = rm ? sizeof(rm) : 0;
- unsigned int remainder = rmsize + xdr->len - base;
- unsigned int want;
- int err = 0;
-
- if (unlikely(!sock))
- return -ENOTSOCK;
-
- want = xdr->head[0].iov_len + rmsize;
- if (base < want) {
- unsigned int len = want - base;
- remainder -= len;
- if (remainder == 0)
- msg.msg_flags &= ~MSG_MORE;
- if (rmsize)
- err = xs_send_rm_and_kvec(sock, &msg, rm,
- &xdr->head[0], base);
- else
- err = xs_send_kvec(sock, &msg, &xdr->head[0], base);
- if (remainder == 0 || err != len)
- goto out;
- *sent_p += err;
- base = 0;
- } else
- base -= want;
-
- if (base < xdr->page_len) {
- unsigned int len = xdr->page_len - base;
- remainder -= len;
- if (remainder == 0)
- msg.msg_flags &= ~MSG_MORE;
- err = xs_send_pagedata(sock, &msg, xdr, base);
- if (remainder == 0 || err != len)
- goto out;
- *sent_p += err;
- base = 0;
- } else
- base -= xdr->page_len;
-
- if (base >= xdr->tail[0].iov_len)
- return 0;
- msg.msg_flags &= ~MSG_MORE;
- err = xs_send_kvec(sock, &msg, &xdr->tail[0], base);
-out:
- if (err > 0) {
- *sent_p += err;
- err = 0;
- }
- return err;
-}
-
/**
* xs_nospace - handle transmit was incomplete
* @req: pointer to RPC request
@@ -881,10 +763,7 @@
struct sock *sk = transport->inet;
int ret = -EAGAIN;
- dprintk("RPC: %5u xmit incomplete (%u left of %u)\n",
- req->rq_task->tk_pid,
- req->rq_slen - transport->xmit.offset,
- req->rq_slen);
+ trace_rpc_socket_nospace(req, transport);
/* Protect against races with write_space */
spin_lock(&xprt->transport_lock);
@@ -960,8 +839,11 @@
struct xdr_buf *xdr = &req->rq_snd_buf;
rpc_fraghdr rm = xs_stream_record_marker(xdr);
unsigned int msglen = rm ? req->rq_slen + sizeof(rm) : req->rq_slen;
+ struct msghdr msg = {
+ .msg_flags = XS_SENDMSG_FLAGS,
+ };
+ unsigned int sent;
int status;
- int sent = 0;
/* Close the stream if the previous transmission was incomplete */
if (xs_send_request_was_aborted(transport, req)) {
@@ -973,8 +855,8 @@
req->rq_svec->iov_base, req->rq_svec->iov_len);
req->rq_xtime = ktime_get();
- status = xs_sendpages(transport->sock, NULL, 0, xdr,
- transport->xmit.offset, rm, &sent);
+ status = xprt_sock_sendmsg(transport->sock, &msg, xdr,
+ transport->xmit.offset, rm, &sent);
dprintk("RPC: %s(%u) = %d\n",
__func__, xdr->len - transport->xmit.offset, status);
@@ -1001,7 +883,7 @@
default:
dprintk("RPC: sendmsg returned unrecognized error %d\n",
-status);
- /* fall through */
+ fallthrough;
case -EPIPE:
xs_close(xprt);
status = -ENOTCONN;
@@ -1026,7 +908,12 @@
struct rpc_xprt *xprt = req->rq_xprt;
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
struct xdr_buf *xdr = &req->rq_snd_buf;
- int sent = 0;
+ struct msghdr msg = {
+ .msg_name = xs_addr(xprt),
+ .msg_namelen = xprt->addrlen,
+ .msg_flags = XS_SENDMSG_FLAGS,
+ };
+ unsigned int sent;
int status;
xs_pktdump("packet data:",
@@ -1040,8 +927,7 @@
return -EBADSLT;
req->rq_xtime = ktime_get();
- status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen,
- xdr, 0, 0, &sent);
+ status = xprt_sock_sendmsg(transport->sock, &msg, xdr, 0, 0, &sent);
dprintk("RPC: xs_udp_send_request(%u) = %d\n",
xdr->len, status);
@@ -1107,9 +993,12 @@
struct xdr_buf *xdr = &req->rq_snd_buf;
rpc_fraghdr rm = xs_stream_record_marker(xdr);
unsigned int msglen = rm ? req->rq_slen + sizeof(rm) : req->rq_slen;
+ struct msghdr msg = {
+ .msg_flags = XS_SENDMSG_FLAGS,
+ };
bool vm_wait = false;
+ unsigned int sent;
int status;
- int sent;
/* Close the stream if the previous transmission was incomplete */
if (xs_send_request_was_aborted(transport, req)) {
@@ -1130,9 +1019,8 @@
* called sendmsg(). */
req->rq_xtime = ktime_get();
while (1) {
- sent = 0;
- status = xs_sendpages(transport->sock, NULL, 0, xdr,
- transport->xmit.offset, rm, &sent);
+ status = xprt_sock_sendmsg(transport->sock, &msg, xdr,
+ transport->xmit.offset, rm, &sent);
dprintk("RPC: xs_tcp_send_request(%u) = %d\n",
xdr->len - transport->xmit.offset, status);
@@ -1546,7 +1434,7 @@
xprt->connect_cookie++;
clear_bit(XPRT_CONNECTED, &xprt->state);
xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT);
- /* fall through */
+ fallthrough;
case TCP_CLOSING:
/*
* If the server closed down the connection, make sure that
@@ -1704,21 +1592,6 @@
return rand + min;
}
-/**
- * xs_set_reuseaddr_port - set the socket's port and address reuse options
- * @sock: socket
- *
- * Note that this function has to be called on all sockets that share the
- * same port, and it must be called before binding.
- */
-static void xs_sock_set_reuseport(struct socket *sock)
-{
- int opt = 1;
-
- kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEPORT,
- (char *)&opt, sizeof(opt));
-}
-
static unsigned short xs_sock_getport(struct socket *sock)
{
struct sockaddr_storage buf;
@@ -1753,7 +1626,7 @@
static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock)
{
- if (transport->srcport == 0)
+ if (transport->srcport == 0 && transport->xprt.reuseport)
transport->srcport = xs_sock_getport(sock);
}
@@ -1766,6 +1639,13 @@
return port;
}
+unsigned short get_srcport(struct rpc_xprt *xprt)
+{
+ struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt);
+ return xs_sock_getport(sock->sock);
+}
+EXPORT_SYMBOL(get_srcport);
+
static unsigned short xs_next_srcport(struct sock_xprt *transport, unsigned short port)
{
if (transport->srcport != 0)
@@ -1807,7 +1687,8 @@
err = kernel_bind(sock, (struct sockaddr *)&myaddr,
transport->xprt.addrlen);
if (err == 0) {
- transport->srcport = port;
+ if (transport->xprt.reuseport)
+ transport->srcport = port;
break;
}
last = port;
@@ -1911,7 +1792,7 @@
xs_reclassify_socket(family, sock);
if (reuseport)
- xs_sock_set_reuseport(sock);
+ sock_set_reuseport(sock->sk);
err = xs_bind(transport, sock);
if (err) {
@@ -1971,7 +1852,7 @@
struct rpc_xprt *xprt = &transport->xprt;
struct file *filp;
struct socket *sock;
- int status = -EIO;
+ int status;
status = __sock_create(xprt->xprt_net, AF_LOCAL,
SOCK_STREAM, 0, &sock, 1);
@@ -2220,7 +2101,6 @@
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
unsigned int keepidle;
unsigned int keepcnt;
- unsigned int opt_on = 1;
unsigned int timeo;
spin_lock(&xprt->transport_lock);
@@ -2232,18 +2112,13 @@
spin_unlock(&xprt->transport_lock);
/* TCP Keepalive options */
- kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
- (char *)&opt_on, sizeof(opt_on));
- kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
- (char *)&keepidle, sizeof(keepidle));
- kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
- (char *)&keepidle, sizeof(keepidle));
- kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
- (char *)&keepcnt, sizeof(keepcnt));
+ sock_set_keepalive(sock->sk);
+ tcp_sock_set_keepidle(sock->sk, keepidle);
+ tcp_sock_set_keepintvl(sock->sk, keepidle);
+ tcp_sock_set_keepcnt(sock->sk, keepcnt);
/* TCP user timeout (see RFC5482) */
- kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
- (char *)&timeo, sizeof(timeo));
+ tcp_sock_set_user_timeout(sock->sk, timeo);
}
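The keepalive and user-timeout settings previously applied through kernel_setsockopt() are now set with the dedicated in-kernel helpers (sock_set_keepalive(), tcp_sock_set_keepidle(), tcp_sock_set_keepintvl(), tcp_sock_set_keepcnt(), tcp_sock_set_user_timeout()). The user-space equivalents are the familiar setsockopt() options; a sketch for a connected TCP socket, with error handling trimmed:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Mirror of the kernel-side settings: enable keepalive, probe after
 * @idle seconds of silence, re-probe every @idle seconds (the kernel
 * code reuses keepidle for the interval), give up after @cnt failed
 * probes, and abort after @timeo_ms of unacknowledged transmitted
 * data (RFC 5482 TCP user timeout). */
void set_tcp_timeouts(int fd, int idle, int cnt, unsigned int timeo_ms)
{
	int on = 1;

	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &idle, sizeof(idle));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
	setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &timeo_ms, sizeof(timeo_ms));
}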
static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt,
@@ -2281,7 +2156,6 @@
if (!transport->inet) {
struct sock *sk = sock->sk;
- unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC;
/* Avoid temporary address, they are bad for long-lived
* connections such as NFS mounts.
@@ -2290,8 +2164,10 @@
* knowledge about the normal duration of connections,
* MAY override this as appropriate.
*/
- kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES,
- (char *)&addr_pref, sizeof(addr_pref));
+ if (xs_addr(xprt)->sa_family == PF_INET6) {
+ ip6_sock_set_addr_preferences(sk,
+ IPV6_PREFER_SRC_PUBLIC);
+ }
xs_tcp_set_socket_timeouts(xprt, sock);
@@ -2332,7 +2208,7 @@
switch (ret) {
case 0:
xs_set_srcport(transport, sock);
- /* fall through */
+ fallthrough;
case -EINPROGRESS:
/* SYN_SENT! */
if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
@@ -2385,7 +2261,7 @@
default:
printk("%s: connect returned unhandled error %d\n",
__func__, status);
- /* fall through */
+ fallthrough;
case -EADDRNOTAVAIL:
/* We're probably in TIME_WAIT. Get rid of existing socket,
* and retry
@@ -2637,50 +2513,37 @@
free_page((unsigned long)buf);
}
-/*
- * Use the svc_sock to send the callback. Must be called with svsk->sk_mutex
- * held. Borrows heavily from svc_tcp_sendto and xs_tcp_send_request.
- */
static int bc_sendto(struct rpc_rqst *req)
{
- int len;
- struct xdr_buf *xbufp = &req->rq_snd_buf;
+ struct xdr_buf *xdr = &req->rq_snd_buf;
struct sock_xprt *transport =
container_of(req->rq_xprt, struct sock_xprt, xprt);
- unsigned long headoff;
- unsigned long tailoff;
- struct page *tailpage;
struct msghdr msg = {
- .msg_flags = MSG_MORE
+ .msg_flags = 0,
};
rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT |
- (u32)xbufp->len);
- struct kvec iov = {
- .iov_base = &marker,
- .iov_len = sizeof(marker),
- };
+ (u32)xdr->len);
+ unsigned int sent = 0;
+ int err;
req->rq_xtime = ktime_get();
-
- len = kernel_sendmsg(transport->sock, &msg, &iov, 1, iov.iov_len);
- if (len != iov.iov_len)
+ err = xprt_sock_sendmsg(transport->sock, &msg, xdr, 0, marker, &sent);
+ xdr_free_bvec(xdr);
+ if (err < 0 || sent != (xdr->len + sizeof(marker)))
return -EAGAIN;
-
- tailpage = NULL;
- if (xbufp->tail[0].iov_len)
- tailpage = virt_to_page(xbufp->tail[0].iov_base);
- tailoff = (unsigned long)xbufp->tail[0].iov_base & ~PAGE_MASK;
- headoff = (unsigned long)xbufp->head[0].iov_base & ~PAGE_MASK;
- len = svc_send_common(transport->sock, xbufp,
- virt_to_page(xbufp->head[0].iov_base), headoff,
- tailpage, tailoff);
- if (len != xbufp->len)
- return -EAGAIN;
- return len;
+ return sent;
}
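bc_sendto() now hands the whole backchannel Call, record marker included, to xprt_sock_sendmsg() in one call. The marker is the standard RPC record mark used on stream transports: the high bit flags the last fragment and the remaining 31 bits carry the fragment length, transmitted in network byte order. A small user-space sketch of building it (the constant value matches the kernel's RPC_LAST_STREAM_FRAGMENT; the function name is illustrative):

#include <arpa/inet.h>
#include <stdint.h>

#define RPC_LAST_STREAM_FRAGMENT 0x80000000U

/* Build the 4-byte record mark that precedes an RPC message on a
 * stream transport: last-fragment bit plus the message length. */
uint32_t rpc_record_marker(uint32_t msg_len)
{
	return htonl(RPC_LAST_STREAM_FRAGMENT | (msg_len & 0x7fffffffU));
}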
-/*
- * The send routine. Borrows from svc_send
+/**
+ * bc_send_request - Send a backchannel Call on a TCP socket
+ * @req: rpc_rqst containing Call message to be sent
+ *
+ * xpt_mutex ensures @req's whole message is written to the socket
+ * without interruption.
+ *
+ * Return values:
+ * %0 if the message was sent successfully
+ * %ENOTCONN if the message was not sent
*/
static int bc_send_request(struct rpc_rqst *req)
{