Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index ac09ca8..3bcf985 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config SUNRPC
tristate
depends on MULTIUSER
@@ -34,6 +35,22 @@
If unsure, say Y.
+config SUNRPC_DISABLE_INSECURE_ENCTYPES
+ bool "Secure RPC: Disable insecure Kerberos encryption types"
+ depends on RPCSEC_GSS_KRB5
+ default n
+ help
+ Choose Y here to disable the use of deprecated encryption types
+ with the Kerberos version 5 GSS-API mechanism (RFC 1964). The
+ deprecated encryption types include DES-CBC-MD5, DES-CBC-CRC,
+ and DES-CBC-MD4. These types were deprecated by RFC 6649 because
+ they were found to be insecure.
+
+ N is the default because many sites have deployed KDCs and
+ keytabs that contain only these deprecated encryption types.
+ Choosing Y prevents the use of known-insecure encryption types
+ but might result in compatibility problems.
+
config SUNRPC_DEBUG
bool "RPC: Enable dprintk debugging"
depends on SUNRPC && SYSCTL
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 090658c..9488600 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -9,7 +9,7 @@
obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/
sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
- auth.o auth_null.o auth_unix.o auth_generic.o \
+ auth.o auth_null.o auth_unix.o \
svc.o svcsock.o svcauth.o svcauth_unix.o \
addr.o rpcb_clnt.o timer.o xdr.o \
sunrpc_syms.o cache.o rpc_pipe.o \
diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c
index 2e0a6f9..d024af4 100644
--- a/net/sunrpc/addr.c
+++ b/net/sunrpc/addr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2009, Oracle. All rights reserved.
*
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 305ecea..cdb05b4 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/net/sunrpc/auth.c
*
@@ -17,9 +18,7 @@
#include <linux/sunrpc/gss_api.h>
#include <linux/spinlock.h>
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY RPCDBG_AUTH
-#endif
+#include <trace/events/sunrpc.h>
#define RPC_CREDCACHE_DEFAULT_HASHBITS (4)
struct rpc_cred_cache {
@@ -30,16 +29,32 @@
static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS;
-static DEFINE_SPINLOCK(rpc_authflavor_lock);
-static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = {
- &authnull_ops, /* AUTH_NULL */
- &authunix_ops, /* AUTH_UNIX */
+static const struct rpc_authops __rcu *auth_flavors[RPC_AUTH_MAXFLAVOR] = {
+ [RPC_AUTH_NULL] = (const struct rpc_authops __force __rcu *)&authnull_ops,
+ [RPC_AUTH_UNIX] = (const struct rpc_authops __force __rcu *)&authunix_ops,
NULL, /* others can be loadable modules */
};
static LIST_HEAD(cred_unused);
static unsigned long number_cred_unused;
+static struct cred machine_cred = {
+ .usage = ATOMIC_INIT(1),
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ .magic = CRED_MAGIC,
+#endif
+};
+
+/*
+ * Return the machine_cred pointer to be used whenever
+ * the a generic machine credential is needed.
+ */
+const struct cred *rpc_machine_cred(void)
+{
+ return &machine_cred;
+}
+EXPORT_SYMBOL_GPL(rpc_machine_cred);
+
#define MAX_HASHTABLE_BITS (14)
static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
{
@@ -93,39 +108,65 @@
int
rpcauth_register(const struct rpc_authops *ops)
{
+ const struct rpc_authops *old;
rpc_authflavor_t flavor;
- int ret = -EPERM;
if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR)
return -EINVAL;
- spin_lock(&rpc_authflavor_lock);
- if (auth_flavors[flavor] == NULL) {
- auth_flavors[flavor] = ops;
- ret = 0;
- }
- spin_unlock(&rpc_authflavor_lock);
- return ret;
+ old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], NULL, ops);
+ if (old == NULL || old == ops)
+ return 0;
+ return -EPERM;
}
EXPORT_SYMBOL_GPL(rpcauth_register);
int
rpcauth_unregister(const struct rpc_authops *ops)
{
+ const struct rpc_authops *old;
rpc_authflavor_t flavor;
- int ret = -EPERM;
if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR)
return -EINVAL;
- spin_lock(&rpc_authflavor_lock);
- if (auth_flavors[flavor] == ops) {
- auth_flavors[flavor] = NULL;
- ret = 0;
- }
- spin_unlock(&rpc_authflavor_lock);
- return ret;
+
+ old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], ops, NULL);
+ if (old == ops || old == NULL)
+ return 0;
+ return -EPERM;
}
EXPORT_SYMBOL_GPL(rpcauth_unregister);
+static const struct rpc_authops *
+rpcauth_get_authops(rpc_authflavor_t flavor)
+{
+ const struct rpc_authops *ops;
+
+ if (flavor >= RPC_AUTH_MAXFLAVOR)
+ return NULL;
+
+ rcu_read_lock();
+ ops = rcu_dereference(auth_flavors[flavor]);
+ if (ops == NULL) {
+ rcu_read_unlock();
+ request_module("rpc-auth-%u", flavor);
+ rcu_read_lock();
+ ops = rcu_dereference(auth_flavors[flavor]);
+ if (ops == NULL)
+ goto out;
+ }
+ if (!try_module_get(ops->owner))
+ ops = NULL;
+out:
+ rcu_read_unlock();
+ return ops;
+}
+
+static void
+rpcauth_put_authops(const struct rpc_authops *ops)
+{
+ module_put(ops->owner);
+}
+
/**
* rpcauth_get_pseudoflavor - check if security flavor is supported
* @flavor: a security flavor
@@ -138,25 +179,16 @@
rpc_authflavor_t
rpcauth_get_pseudoflavor(rpc_authflavor_t flavor, struct rpcsec_gss_info *info)
{
- const struct rpc_authops *ops;
+ const struct rpc_authops *ops = rpcauth_get_authops(flavor);
rpc_authflavor_t pseudoflavor;
- ops = auth_flavors[flavor];
- if (ops == NULL)
- request_module("rpc-auth-%u", flavor);
- spin_lock(&rpc_authflavor_lock);
- ops = auth_flavors[flavor];
- if (ops == NULL || !try_module_get(ops->owner)) {
- spin_unlock(&rpc_authflavor_lock);
+ if (!ops)
return RPC_AUTH_MAXFLAVOR;
- }
- spin_unlock(&rpc_authflavor_lock);
-
pseudoflavor = flavor;
if (ops->info2flavor != NULL)
pseudoflavor = ops->info2flavor(info);
- module_put(ops->owner);
+ rpcauth_put_authops(ops);
return pseudoflavor;
}
EXPORT_SYMBOL_GPL(rpcauth_get_pseudoflavor);
@@ -176,25 +208,15 @@
const struct rpc_authops *ops;
int result;
- if (flavor >= RPC_AUTH_MAXFLAVOR)
- return -EINVAL;
-
- ops = auth_flavors[flavor];
+ ops = rpcauth_get_authops(flavor);
if (ops == NULL)
- request_module("rpc-auth-%u", flavor);
- spin_lock(&rpc_authflavor_lock);
- ops = auth_flavors[flavor];
- if (ops == NULL || !try_module_get(ops->owner)) {
- spin_unlock(&rpc_authflavor_lock);
return -ENOENT;
- }
- spin_unlock(&rpc_authflavor_lock);
result = -ENOENT;
if (ops->flavor2info != NULL)
result = ops->flavor2info(pseudoflavor, info);
- module_put(ops->owner);
+ rpcauth_put_authops(ops);
return result;
}
EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo);
@@ -212,15 +234,13 @@
int
rpcauth_list_flavors(rpc_authflavor_t *array, int size)
{
- rpc_authflavor_t flavor;
- int result = 0;
+ const struct rpc_authops *ops;
+ rpc_authflavor_t flavor, pseudos[4];
+ int i, len, result = 0;
- spin_lock(&rpc_authflavor_lock);
+ rcu_read_lock();
for (flavor = 0; flavor < RPC_AUTH_MAXFLAVOR; flavor++) {
- const struct rpc_authops *ops = auth_flavors[flavor];
- rpc_authflavor_t pseudos[4];
- int i, len;
-
+ ops = rcu_dereference(auth_flavors[flavor]);
if (result >= size) {
result = -ENOMEM;
break;
@@ -245,9 +265,7 @@
array[result++] = pseudos[i];
}
}
- spin_unlock(&rpc_authflavor_lock);
-
- dprintk("RPC: %s returns %d\n", __func__, result);
+ rcu_read_unlock();
return result;
}
EXPORT_SYMBOL_GPL(rpcauth_list_flavors);
@@ -255,25 +273,17 @@
struct rpc_auth *
rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
{
- struct rpc_auth *auth;
+ struct rpc_auth *auth = ERR_PTR(-EINVAL);
const struct rpc_authops *ops;
- u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor);
+ u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor);
- auth = ERR_PTR(-EINVAL);
- if (flavor >= RPC_AUTH_MAXFLAVOR)
+ ops = rpcauth_get_authops(flavor);
+ if (ops == NULL)
goto out;
- if ((ops = auth_flavors[flavor]) == NULL)
- request_module("rpc-auth-%u", flavor);
- spin_lock(&rpc_authflavor_lock);
- ops = auth_flavors[flavor];
- if (ops == NULL || !try_module_get(ops->owner)) {
- spin_unlock(&rpc_authflavor_lock);
- goto out;
- }
- spin_unlock(&rpc_authflavor_lock);
auth = ops->create(args, clnt);
- module_put(ops->owner);
+
+ rpcauth_put_authops(ops);
if (IS_ERR(auth))
return auth;
if (clnt->cl_auth)
@@ -288,32 +298,37 @@
void
rpcauth_release(struct rpc_auth *auth)
{
- if (!atomic_dec_and_test(&auth->au_count))
+ if (!refcount_dec_and_test(&auth->au_count))
return;
auth->au_ops->destroy(auth);
}
static DEFINE_SPINLOCK(rpc_credcache_lock);
-static void
+/*
+ * On success, the caller is responsible for freeing the reference
+ * held by the hashtable
+ */
+static bool
rpcauth_unhash_cred_locked(struct rpc_cred *cred)
{
+ if (!test_and_clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags))
+ return false;
hlist_del_rcu(&cred->cr_hash);
- smp_mb__before_atomic();
- clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags);
+ return true;
}
-static int
+static bool
rpcauth_unhash_cred(struct rpc_cred *cred)
{
spinlock_t *cache_lock;
- int ret;
+ bool ret;
+ if (!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags))
+ return false;
cache_lock = &cred->cr_auth->au_credcache->lock;
spin_lock(cache_lock);
- ret = atomic_read(&cred->cr_count) == 0;
- if (ret)
- rpcauth_unhash_cred_locked(cred);
+ ret = rpcauth_unhash_cred_locked(cred);
spin_unlock(cache_lock);
return ret;
}
@@ -345,29 +360,6 @@
}
EXPORT_SYMBOL_GPL(rpcauth_init_credcache);
-/*
- * Setup a credential key lifetime timeout notification
- */
-int
-rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred)
-{
- if (!cred->cr_auth->au_ops->key_timeout)
- return 0;
- return cred->cr_auth->au_ops->key_timeout(auth, cred);
-}
-EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify);
-
-bool
-rpcauth_cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred)
-{
- if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT)
- return false;
- if (!cred->cr_ops->crkey_to_expire)
- return false;
- return cred->cr_ops->crkey_to_expire(cred);
-}
-EXPORT_SYMBOL_GPL(rpcauth_cred_key_to_expire);
-
char *
rpcauth_stringify_acceptor(struct rpc_cred *cred)
{
@@ -392,6 +384,44 @@
}
}
+static void
+rpcauth_lru_add_locked(struct rpc_cred *cred)
+{
+ if (!list_empty(&cred->cr_lru))
+ return;
+ number_cred_unused++;
+ list_add_tail(&cred->cr_lru, &cred_unused);
+}
+
+static void
+rpcauth_lru_add(struct rpc_cred *cred)
+{
+ if (!list_empty(&cred->cr_lru))
+ return;
+ spin_lock(&rpc_credcache_lock);
+ rpcauth_lru_add_locked(cred);
+ spin_unlock(&rpc_credcache_lock);
+}
+
+static void
+rpcauth_lru_remove_locked(struct rpc_cred *cred)
+{
+ if (list_empty(&cred->cr_lru))
+ return;
+ number_cred_unused--;
+ list_del_init(&cred->cr_lru);
+}
+
+static void
+rpcauth_lru_remove(struct rpc_cred *cred)
+{
+ if (list_empty(&cred->cr_lru))
+ return;
+ spin_lock(&rpc_credcache_lock);
+ rpcauth_lru_remove_locked(cred);
+ spin_unlock(&rpc_credcache_lock);
+}
+
/*
* Clear the RPC credential cache, and delete those credentials
* that are not referenced.
@@ -411,13 +441,10 @@
head = &cache->hashtable[i];
while (!hlist_empty(head)) {
cred = hlist_entry(head->first, struct rpc_cred, cr_hash);
- get_rpccred(cred);
- if (!list_empty(&cred->cr_lru)) {
- list_del(&cred->cr_lru);
- number_cred_unused--;
- }
- list_add_tail(&cred->cr_lru, &free);
rpcauth_unhash_cred_locked(cred);
+ /* Note: We now hold a reference to cred */
+ rpcauth_lru_remove_locked(cred);
+ list_add_tail(&cred->cr_lru, &free);
}
}
spin_unlock(&cache->lock);
@@ -451,7 +478,6 @@
static long
rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
{
- spinlock_t *cache_lock;
struct rpc_cred *cred, *next;
unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM;
long freed = 0;
@@ -460,32 +486,24 @@
if (nr_to_scan-- == 0)
break;
+ if (refcount_read(&cred->cr_count) > 1) {
+ rpcauth_lru_remove_locked(cred);
+ continue;
+ }
/*
* Enforce a 60 second garbage collection moratorium
* Note that the cred_unused list must be time-ordered.
*/
- if (time_in_range(cred->cr_expire, expired, jiffies) &&
- test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) {
- freed = SHRINK_STOP;
- break;
- }
-
- list_del_init(&cred->cr_lru);
- number_cred_unused--;
- freed++;
- if (atomic_read(&cred->cr_count) != 0)
+ if (!time_in_range(cred->cr_expire, expired, jiffies))
+ continue;
+ if (!rpcauth_unhash_cred(cred))
continue;
- cache_lock = &cred->cr_auth->au_credcache->lock;
- spin_lock(cache_lock);
- if (atomic_read(&cred->cr_count) == 0) {
- get_rpccred(cred);
- list_add_tail(&cred->cr_lru, free);
- rpcauth_unhash_cred_locked(cred);
- }
- spin_unlock(cache_lock);
+ rpcauth_lru_remove_locked(cred);
+ freed++;
+ list_add_tail(&cred->cr_lru, free);
}
- return freed;
+ return freed ? freed : SHRINK_STOP;
}
static unsigned long
@@ -560,29 +578,15 @@
hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) {
if (!entry->cr_ops->crmatch(acred, entry, flags))
continue;
- if (flags & RPCAUTH_LOOKUP_RCU) {
- if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) &&
- !test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags))
- cred = entry;
- break;
- }
- spin_lock(&cache->lock);
- if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) {
- spin_unlock(&cache->lock);
- continue;
- }
cred = get_rpccred(entry);
- spin_unlock(&cache->lock);
- break;
+ if (cred)
+ break;
}
rcu_read_unlock();
if (cred != NULL)
goto found;
- if (flags & RPCAUTH_LOOKUP_RCU)
- return ERR_PTR(-ECHILD);
-
new = auth->au_ops->crcreate(auth, acred, flags, gfp);
if (IS_ERR(new)) {
cred = new;
@@ -594,11 +598,13 @@
if (!entry->cr_ops->crmatch(acred, entry, flags))
continue;
cred = get_rpccred(entry);
- break;
+ if (cred)
+ break;
}
if (cred == NULL) {
cred = new;
set_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags);
+ refcount_inc(&cred->cr_count);
hlist_add_head_rcu(&cred->cr_hash, &cache->hashtable[nr]);
} else
list_add_tail(&new->cr_lru, &free);
@@ -627,13 +633,8 @@
struct rpc_cred *ret;
const struct cred *cred = current_cred();
- dprintk("RPC: looking up %s cred\n",
- auth->au_ops->au_name);
-
memset(&acred, 0, sizeof(acred));
- acred.uid = cred->fsuid;
- acred.gid = cred->fsgid;
- acred.group_info = cred->group_info;
+ acred.cred = cred;
ret = auth->au_ops->lookup_cred(auth, &acred, flags);
return ret;
}
@@ -645,34 +646,40 @@
{
INIT_HLIST_NODE(&cred->cr_hash);
INIT_LIST_HEAD(&cred->cr_lru);
- atomic_set(&cred->cr_count, 1);
+ refcount_set(&cred->cr_count, 1);
cred->cr_auth = auth;
+ cred->cr_flags = 0;
cred->cr_ops = ops;
cred->cr_expire = jiffies;
- cred->cr_uid = acred->uid;
+ cred->cr_cred = get_cred(acred->cred);
}
EXPORT_SYMBOL_GPL(rpcauth_init_cred);
-struct rpc_cred *
-rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred, int lookupflags)
-{
- dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid,
- cred->cr_auth->au_ops->au_name, cred);
- return get_rpccred(cred);
-}
-EXPORT_SYMBOL_GPL(rpcauth_generic_bind_cred);
-
static struct rpc_cred *
rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags)
{
struct rpc_auth *auth = task->tk_client->cl_auth;
struct auth_cred acred = {
- .uid = GLOBAL_ROOT_UID,
- .gid = GLOBAL_ROOT_GID,
+ .cred = get_task_cred(&init_task),
+ };
+ struct rpc_cred *ret;
+
+ ret = auth->au_ops->lookup_cred(auth, &acred, lookupflags);
+ put_cred(acred.cred);
+ return ret;
+}
+
+static struct rpc_cred *
+rpcauth_bind_machine_cred(struct rpc_task *task, int lookupflags)
+{
+ struct rpc_auth *auth = task->tk_client->cl_auth;
+ struct auth_cred acred = {
+ .principal = task->tk_client->cl_principal,
+ .cred = init_task.cred,
};
- dprintk("RPC: %5u looking up %s cred\n",
- task->tk_pid, task->tk_client->cl_auth->au_ops->au_name);
+ if (!acred.principal)
+ return NULL;
return auth->au_ops->lookup_cred(auth, &acred, lookupflags);
}
@@ -681,24 +688,37 @@
{
struct rpc_auth *auth = task->tk_client->cl_auth;
- dprintk("RPC: %5u looking up %s cred\n",
- task->tk_pid, auth->au_ops->au_name);
return rpcauth_lookupcred(auth, lookupflags);
}
static int
-rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
+rpcauth_bindcred(struct rpc_task *task, const struct cred *cred, int flags)
{
struct rpc_rqst *req = task->tk_rqstp;
- struct rpc_cred *new;
+ struct rpc_cred *new = NULL;
int lookupflags = 0;
+ struct rpc_auth *auth = task->tk_client->cl_auth;
+ struct auth_cred acred = {
+ .cred = cred,
+ };
if (flags & RPC_TASK_ASYNC)
lookupflags |= RPCAUTH_LOOKUP_NEW;
- if (cred != NULL)
- new = cred->cr_ops->crbind(task, cred, lookupflags);
- else if (flags & RPC_TASK_ROOTCREDS)
+ if (task->tk_op_cred)
+ /* Task must use exactly this rpc_cred */
+ new = get_rpccred(task->tk_op_cred);
+ else if (cred != NULL && cred != &machine_cred)
+ new = auth->au_ops->lookup_cred(auth, &acred, lookupflags);
+ else if (cred == &machine_cred)
+ new = rpcauth_bind_machine_cred(task, lookupflags);
+
+ /* If machine cred couldn't be bound, try a root cred */
+ if (new)
+ ;
+ else if (cred == &machine_cred || (flags & RPC_TASK_ROOTCREDS))
new = rpcauth_bind_root_cred(task, lookupflags);
+ else if (flags & RPC_TASK_NULLCREDS)
+ new = authnull_ops.lookup_cred(NULL, NULL, 0);
else
new = rpcauth_bind_new_cred(task, lookupflags);
if (IS_ERR(new))
@@ -713,108 +733,138 @@
{
if (cred == NULL)
return;
- /* Fast path for unhashed credentials */
- if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) {
- if (atomic_dec_and_test(&cred->cr_count))
- cred->cr_ops->crdestroy(cred);
- return;
+ rcu_read_lock();
+ if (refcount_dec_and_test(&cred->cr_count))
+ goto destroy;
+ if (refcount_read(&cred->cr_count) != 1 ||
+ !test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags))
+ goto out;
+ if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) {
+ cred->cr_expire = jiffies;
+ rpcauth_lru_add(cred);
+ /* Race breaker */
+ if (unlikely(!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)))
+ rpcauth_lru_remove(cred);
+ } else if (rpcauth_unhash_cred(cred)) {
+ rpcauth_lru_remove(cred);
+ if (refcount_dec_and_test(&cred->cr_count))
+ goto destroy;
}
-
- if (!atomic_dec_and_lock(&cred->cr_count, &rpc_credcache_lock))
- return;
- if (!list_empty(&cred->cr_lru)) {
- number_cred_unused--;
- list_del_init(&cred->cr_lru);
- }
- if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) {
- if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) {
- cred->cr_expire = jiffies;
- list_add_tail(&cred->cr_lru, &cred_unused);
- number_cred_unused++;
- goto out_nodestroy;
- }
- if (!rpcauth_unhash_cred(cred)) {
- /* We were hashed and someone looked us up... */
- goto out_nodestroy;
- }
- }
- spin_unlock(&rpc_credcache_lock);
- cred->cr_ops->crdestroy(cred);
+out:
+ rcu_read_unlock();
return;
-out_nodestroy:
- spin_unlock(&rpc_credcache_lock);
+destroy:
+ rcu_read_unlock();
+ cred->cr_ops->crdestroy(cred);
}
EXPORT_SYMBOL_GPL(put_rpccred);
-__be32 *
-rpcauth_marshcred(struct rpc_task *task, __be32 *p)
+/**
+ * rpcauth_marshcred - Append RPC credential to end of @xdr
+ * @task: controlling RPC task
+ * @xdr: xdr_stream containing initial portion of RPC Call header
+ *
+ * On success, an appropriate verifier is added to @xdr, @xdr is
+ * updated to point past the verifier, and zero is returned.
+ * Otherwise, @xdr is in an undefined state and a negative errno
+ * is returned.
+ */
+int rpcauth_marshcred(struct rpc_task *task, struct xdr_stream *xdr)
{
- struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+ const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops;
- dprintk("RPC: %5u marshaling %s cred %p\n",
- task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
-
- return cred->cr_ops->crmarshal(task, p);
+ return ops->crmarshal(task, xdr);
}
-__be32 *
-rpcauth_checkverf(struct rpc_task *task, __be32 *p)
+/**
+ * rpcauth_wrap_req_encode - XDR encode the RPC procedure
+ * @task: controlling RPC task
+ * @xdr: stream where on-the-wire bytes are to be marshalled
+ *
+ * On success, @xdr contains the encoded and wrapped message.
+ * Otherwise, @xdr is in an undefined state.
+ */
+int rpcauth_wrap_req_encode(struct rpc_task *task, struct xdr_stream *xdr)
{
- struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+ kxdreproc_t encode = task->tk_msg.rpc_proc->p_encode;
- dprintk("RPC: %5u validating %s cred %p\n",
- task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
-
- return cred->cr_ops->crvalidate(task, p);
-}
-
-static void rpcauth_wrap_req_encode(kxdreproc_t encode, struct rpc_rqst *rqstp,
- __be32 *data, void *obj)
-{
- struct xdr_stream xdr;
-
- xdr_init_encode(&xdr, &rqstp->rq_snd_buf, data);
- encode(rqstp, &xdr, obj);
-}
-
-int
-rpcauth_wrap_req(struct rpc_task *task, kxdreproc_t encode, void *rqstp,
- __be32 *data, void *obj)
-{
- struct rpc_cred *cred = task->tk_rqstp->rq_cred;
-
- dprintk("RPC: %5u using %s cred %p to wrap rpc data\n",
- task->tk_pid, cred->cr_ops->cr_name, cred);
- if (cred->cr_ops->crwrap_req)
- return cred->cr_ops->crwrap_req(task, encode, rqstp, data, obj);
- /* By default, we encode the arguments normally. */
- rpcauth_wrap_req_encode(encode, rqstp, data, obj);
+ encode(task->tk_rqstp, xdr, task->tk_msg.rpc_argp);
return 0;
}
+EXPORT_SYMBOL_GPL(rpcauth_wrap_req_encode);
-static int
-rpcauth_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp,
- __be32 *data, void *obj)
+/**
+ * rpcauth_wrap_req - XDR encode and wrap the RPC procedure
+ * @task: controlling RPC task
+ * @xdr: stream where on-the-wire bytes are to be marshalled
+ *
+ * On success, @xdr contains the encoded and wrapped message,
+ * and zero is returned. Otherwise, @xdr is in an undefined
+ * state and a negative errno is returned.
+ */
+int rpcauth_wrap_req(struct rpc_task *task, struct xdr_stream *xdr)
{
- struct xdr_stream xdr;
+ const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops;
- xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, data);
- return decode(rqstp, &xdr, obj);
+ return ops->crwrap_req(task, xdr);
}
+/**
+ * rpcauth_checkverf - Validate verifier in RPC Reply header
+ * @task: controlling RPC task
+ * @xdr: xdr_stream containing RPC Reply header
+ *
+ * On success, @xdr is updated to point past the verifier and
+ * zero is returned. Otherwise, @xdr is in an undefined state
+ * and a negative errno is returned.
+ */
int
-rpcauth_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp,
- __be32 *data, void *obj)
+rpcauth_checkverf(struct rpc_task *task, struct xdr_stream *xdr)
+{
+ const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops;
+
+ return ops->crvalidate(task, xdr);
+}
+
+/**
+ * rpcauth_unwrap_resp_decode - Invoke XDR decode function
+ * @task: controlling RPC task
+ * @xdr: stream where the Reply message resides
+ *
+ * Returns zero on success; otherwise a negative errno is returned.
+ */
+int
+rpcauth_unwrap_resp_decode(struct rpc_task *task, struct xdr_stream *xdr)
+{
+ kxdrdproc_t decode = task->tk_msg.rpc_proc->p_decode;
+
+ return decode(task->tk_rqstp, xdr, task->tk_msg.rpc_resp);
+}
+EXPORT_SYMBOL_GPL(rpcauth_unwrap_resp_decode);
+
+/**
+ * rpcauth_unwrap_resp - Invoke unwrap and decode function for the cred
+ * @task: controlling RPC task
+ * @xdr: stream where the Reply message resides
+ *
+ * Returns zero on success; otherwise a negative errno is returned.
+ */
+int
+rpcauth_unwrap_resp(struct rpc_task *task, struct xdr_stream *xdr)
+{
+ const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops;
+
+ return ops->crunwrap_resp(task, xdr);
+}
+
+bool
+rpcauth_xmit_need_reencode(struct rpc_task *task)
{
struct rpc_cred *cred = task->tk_rqstp->rq_cred;
- dprintk("RPC: %5u using %s cred %p to unwrap rpc data\n",
- task->tk_pid, cred->cr_ops->cr_name, cred);
- if (cred->cr_ops->crunwrap_resp)
- return cred->cr_ops->crunwrap_resp(task, decode, rqstp,
- data, obj);
- /* By default, we decode the arguments normally. */
- return rpcauth_unwrap_req_decode(decode, rqstp, data, obj);
+ if (!cred || !cred->cr_ops->crneed_reencode)
+ return false;
+ return cred->cr_ops->crneed_reencode(task);
}
int
@@ -830,8 +880,6 @@
goto out;
cred = task->tk_rqstp->rq_cred;
}
- dprintk("RPC: %5u refreshing %s cred %p\n",
- task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
err = cred->cr_ops->crrefresh(task);
out:
@@ -845,8 +893,6 @@
{
struct rpc_cred *cred = task->tk_rqstp->rq_cred;
- dprintk("RPC: %5u invalidating %s cred %p\n",
- task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
if (cred)
clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
}
@@ -873,15 +919,10 @@
err = rpc_init_authunix();
if (err < 0)
goto out1;
- err = rpc_init_generic_auth();
- if (err < 0)
- goto out2;
err = register_shrinker(&rpc_cred_shrinker);
if (err < 0)
- goto out3;
+ goto out2;
return 0;
-out3:
- rpc_destroy_generic_auth();
out2:
rpc_destroy_authunix();
out1:
@@ -891,6 +932,5 @@
void rpcauth_remove_module(void)
{
rpc_destroy_authunix();
- rpc_destroy_generic_auth();
unregister_shrinker(&rpc_cred_shrinker);
}
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
deleted file mode 100644
index 1ac08dc..0000000
--- a/net/sunrpc/auth_generic.c
+++ /dev/null
@@ -1,293 +0,0 @@
-/*
- * Generic RPC credential
- *
- * Copyright (C) 2008, Trond Myklebust <Trond.Myklebust@netapp.com>
- */
-
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/sunrpc/auth.h>
-#include <linux/sunrpc/clnt.h>
-#include <linux/sunrpc/debug.h>
-#include <linux/sunrpc/sched.h>
-
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY RPCDBG_AUTH
-#endif
-
-#define RPC_MACHINE_CRED_USERID GLOBAL_ROOT_UID
-#define RPC_MACHINE_CRED_GROUPID GLOBAL_ROOT_GID
-
-struct generic_cred {
- struct rpc_cred gc_base;
- struct auth_cred acred;
-};
-
-static struct rpc_auth generic_auth;
-static const struct rpc_credops generic_credops;
-
-/*
- * Public call interface
- */
-struct rpc_cred *rpc_lookup_cred(void)
-{
- return rpcauth_lookupcred(&generic_auth, 0);
-}
-EXPORT_SYMBOL_GPL(rpc_lookup_cred);
-
-struct rpc_cred *
-rpc_lookup_generic_cred(struct auth_cred *acred, int flags, gfp_t gfp)
-{
- return rpcauth_lookup_credcache(&generic_auth, acred, flags, gfp);
-}
-EXPORT_SYMBOL_GPL(rpc_lookup_generic_cred);
-
-struct rpc_cred *rpc_lookup_cred_nonblock(void)
-{
- return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU);
-}
-EXPORT_SYMBOL_GPL(rpc_lookup_cred_nonblock);
-
-/*
- * Public call interface for looking up machine creds.
- */
-struct rpc_cred *rpc_lookup_machine_cred(const char *service_name)
-{
- struct auth_cred acred = {
- .uid = RPC_MACHINE_CRED_USERID,
- .gid = RPC_MACHINE_CRED_GROUPID,
- .principal = service_name,
- .machine_cred = 1,
- };
-
- dprintk("RPC: looking up machine cred for service %s\n",
- service_name);
- return generic_auth.au_ops->lookup_cred(&generic_auth, &acred, 0);
-}
-EXPORT_SYMBOL_GPL(rpc_lookup_machine_cred);
-
-static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
- struct rpc_cred *cred, int lookupflags)
-{
- struct rpc_auth *auth = task->tk_client->cl_auth;
- struct auth_cred *acred = &container_of(cred, struct generic_cred, gc_base)->acred;
-
- return auth->au_ops->lookup_cred(auth, acred, lookupflags);
-}
-
-static int
-generic_hash_cred(struct auth_cred *acred, unsigned int hashbits)
-{
- return hash_64(from_kgid(&init_user_ns, acred->gid) |
- ((u64)from_kuid(&init_user_ns, acred->uid) <<
- (sizeof(gid_t) * 8)), hashbits);
-}
-
-/*
- * Lookup generic creds for current process
- */
-static struct rpc_cred *
-generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
-{
- return rpcauth_lookup_credcache(&generic_auth, acred, flags, GFP_KERNEL);
-}
-
-static struct rpc_cred *
-generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
-{
- struct generic_cred *gcred;
-
- gcred = kmalloc(sizeof(*gcred), gfp);
- if (gcred == NULL)
- return ERR_PTR(-ENOMEM);
-
- rpcauth_init_cred(&gcred->gc_base, acred, &generic_auth, &generic_credops);
- gcred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE;
-
- gcred->acred.uid = acred->uid;
- gcred->acred.gid = acred->gid;
- gcred->acred.group_info = acred->group_info;
- gcred->acred.ac_flags = 0;
- if (gcred->acred.group_info != NULL)
- get_group_info(gcred->acred.group_info);
- gcred->acred.machine_cred = acred->machine_cred;
- gcred->acred.principal = acred->principal;
-
- dprintk("RPC: allocated %s cred %p for uid %d gid %d\n",
- gcred->acred.machine_cred ? "machine" : "generic",
- gcred,
- from_kuid(&init_user_ns, acred->uid),
- from_kgid(&init_user_ns, acred->gid));
- return &gcred->gc_base;
-}
-
-static void
-generic_free_cred(struct rpc_cred *cred)
-{
- struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base);
-
- dprintk("RPC: generic_free_cred %p\n", gcred);
- if (gcred->acred.group_info != NULL)
- put_group_info(gcred->acred.group_info);
- kfree(gcred);
-}
-
-static void
-generic_free_cred_callback(struct rcu_head *head)
-{
- struct rpc_cred *cred = container_of(head, struct rpc_cred, cr_rcu);
- generic_free_cred(cred);
-}
-
-static void
-generic_destroy_cred(struct rpc_cred *cred)
-{
- call_rcu(&cred->cr_rcu, generic_free_cred_callback);
-}
-
-static int
-machine_cred_match(struct auth_cred *acred, struct generic_cred *gcred, int flags)
-{
- if (!gcred->acred.machine_cred ||
- gcred->acred.principal != acred->principal ||
- !uid_eq(gcred->acred.uid, acred->uid) ||
- !gid_eq(gcred->acred.gid, acred->gid))
- return 0;
- return 1;
-}
-
-/*
- * Match credentials against current process creds.
- */
-static int
-generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags)
-{
- struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base);
- int i;
-
- if (acred->machine_cred)
- return machine_cred_match(acred, gcred, flags);
-
- if (!uid_eq(gcred->acred.uid, acred->uid) ||
- !gid_eq(gcred->acred.gid, acred->gid) ||
- gcred->acred.machine_cred != 0)
- goto out_nomatch;
-
- /* Optimisation in the case where pointers are identical... */
- if (gcred->acred.group_info == acred->group_info)
- goto out_match;
-
- /* Slow path... */
- if (gcred->acred.group_info->ngroups != acred->group_info->ngroups)
- goto out_nomatch;
- for (i = 0; i < gcred->acred.group_info->ngroups; i++) {
- if (!gid_eq(gcred->acred.group_info->gid[i],
- acred->group_info->gid[i]))
- goto out_nomatch;
- }
-out_match:
- return 1;
-out_nomatch:
- return 0;
-}
-
-int __init rpc_init_generic_auth(void)
-{
- return rpcauth_init_credcache(&generic_auth);
-}
-
-void rpc_destroy_generic_auth(void)
-{
- rpcauth_destroy_credcache(&generic_auth);
-}
-
-/*
- * Test the the current time (now) against the underlying credential key expiry
- * minus a timeout and setup notification.
- *
- * The normal case:
- * If 'now' is before the key expiry minus RPC_KEY_EXPIRE_TIMEO, set
- * the RPC_CRED_NOTIFY_TIMEOUT flag to setup the underlying credential
- * rpc_credops crmatch routine to notify this generic cred when it's key
- * expiration is within RPC_KEY_EXPIRE_TIMEO, and return 0.
- *
- * The error case:
- * If the underlying cred lookup fails, return -EACCES.
- *
- * The 'almost' error case:
- * If 'now' is within key expiry minus RPC_KEY_EXPIRE_TIMEO, but not within
- * key expiry minus RPC_KEY_EXPIRE_FAIL, set the RPC_CRED_EXPIRE_SOON bit
- * on the acred ac_flags and return 0.
- */
-static int
-generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
-{
- struct auth_cred *acred = &container_of(cred, struct generic_cred,
- gc_base)->acred;
- struct rpc_cred *tcred;
- int ret = 0;
-
-
- /* Fast track for non crkey_timeout (no key) underlying credentials */
- if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT)
- return 0;
-
- /* Fast track for the normal case */
- if (test_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags))
- return 0;
-
- /* lookup_cred either returns a valid referenced rpc_cred, or PTR_ERR */
- tcred = auth->au_ops->lookup_cred(auth, acred, 0);
- if (IS_ERR(tcred))
- return -EACCES;
-
- /* Test for the almost error case */
- ret = tcred->cr_ops->crkey_timeout(tcred);
- if (ret != 0) {
- set_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags);
- ret = 0;
- } else {
- /* In case underlying cred key has been reset */
- if (test_and_clear_bit(RPC_CRED_KEY_EXPIRE_SOON,
- &acred->ac_flags))
- dprintk("RPC: UID %d Credential key reset\n",
- from_kuid(&init_user_ns, tcred->cr_uid));
- /* set up fasttrack for the normal case */
- set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags);
- }
-
- put_rpccred(tcred);
- return ret;
-}
-
-static const struct rpc_authops generic_auth_ops = {
- .owner = THIS_MODULE,
- .au_name = "Generic",
- .hash_cred = generic_hash_cred,
- .lookup_cred = generic_lookup_cred,
- .crcreate = generic_create_cred,
- .key_timeout = generic_key_timeout,
-};
-
-static struct rpc_auth generic_auth = {
- .au_ops = &generic_auth_ops,
- .au_count = ATOMIC_INIT(0),
-};
-
-static bool generic_key_to_expire(struct rpc_cred *cred)
-{
- struct auth_cred *acred = &container_of(cred, struct generic_cred,
- gc_base)->acred;
- return test_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags);
-}
-
-static const struct rpc_credops generic_credops = {
- .cr_name = "Generic cred",
- .crdestroy = generic_destroy_cred,
- .crbind = generic_bind_cred,
- .crmatch = generic_match,
- .crkey_to_expire = generic_key_to_expire,
-};
diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile
index c374268..4a29f4c 100644
--- a/net/sunrpc/auth_gss/Makefile
+++ b/net/sunrpc/auth_gss/Makefile
@@ -7,7 +7,7 @@
auth_rpcgss-y := auth_gss.o gss_generic_token.o \
gss_mech_switch.o svcauth_gss.o \
- gss_rpc_upcall.o gss_rpc_xdr.o
+ gss_rpc_upcall.o gss_rpc_xdr.o trace.o
obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 8cb7d81..d75fddc 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: BSD-3-Clause
/*
* linux/net/sunrpc/auth_gss/auth_gss.c
*
@@ -8,34 +9,8 @@
*
* Dug Song <dugsong@monkey.org>
* Andy Adamson <andros@umich.edu>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
@@ -55,6 +30,8 @@
#include "../netns.h"
+#include <trace/events/rpcgss.h>
+
static const struct rpc_authops authgss_ops;
static const struct rpc_credops gss_credops;
@@ -260,6 +237,7 @@
}
ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, NULL, GFP_NOFS);
if (ret < 0) {
+ trace_rpcgss_import_ctx(ret);
p = ERR_PTR(ret);
goto err;
}
@@ -275,12 +253,9 @@
if (IS_ERR(p))
goto err;
done:
- dprintk("RPC: %s Success. gc_expiry %lu now %lu timeout %u acceptor %.*s\n",
- __func__, ctx->gc_expiry, now, timeout, ctx->gc_acceptor.len,
- ctx->gc_acceptor.data);
- return p;
+ trace_rpcgss_context(ctx->gc_expiry, now, timeout,
+ ctx->gc_acceptor.len, ctx->gc_acceptor.data);
err:
- dprintk("RPC: %s returns error %ld\n", __func__, -PTR_ERR(p));
return p;
}
@@ -294,6 +269,7 @@
struct gss_upcall_msg {
refcount_t count;
kuid_t uid;
+ const char *service_name;
struct rpc_pipe_msg msg;
struct list_head list;
struct gss_auth *auth;
@@ -341,6 +317,7 @@
gss_put_ctx(gss_msg->ctx);
rpc_destroy_wait_queue(&gss_msg->rpc_waitqueue);
gss_put_auth(gss_msg->auth);
+ kfree_const(gss_msg->service_name);
kfree(gss_msg);
}
@@ -354,10 +331,8 @@
if (auth && pos->auth->service != auth->service)
continue;
refcount_inc(&pos->count);
- dprintk("RPC: %s found msg %p\n", __func__, pos);
return pos;
}
- dprintk("RPC: %s found nothing\n", __func__);
return NULL;
}
@@ -437,9 +412,12 @@
gss_release_msg(gss_msg);
}
-static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg)
+static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg,
+ const struct cred *cred)
{
- uid_t uid = from_kuid(&init_user_ns, gss_msg->uid);
+ struct user_namespace *userns = cred->user_ns;
+
+ uid_t uid = from_kuid_munged(userns, gss_msg->uid);
memcpy(gss_msg->databuf, &uid, sizeof(uid));
gss_msg->msg.data = gss_msg->databuf;
gss_msg->msg.len = sizeof(uid);
@@ -447,17 +425,31 @@
BUILD_BUG_ON(sizeof(uid) > sizeof(gss_msg->databuf));
}
+static ssize_t
+gss_v0_upcall(struct file *file, struct rpc_pipe_msg *msg,
+ char __user *buf, size_t buflen)
+{
+ struct gss_upcall_msg *gss_msg = container_of(msg,
+ struct gss_upcall_msg,
+ msg);
+ if (msg->copied == 0)
+ gss_encode_v0_msg(gss_msg, file->f_cred);
+ return rpc_pipe_generic_upcall(file, msg, buf, buflen);
+}
+
static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg,
const char *service_name,
- const char *target_name)
+ const char *target_name,
+ const struct cred *cred)
{
+ struct user_namespace *userns = cred->user_ns;
struct gss_api_mech *mech = gss_msg->auth->mech;
char *p = gss_msg->databuf;
size_t buflen = sizeof(gss_msg->databuf);
int len;
- len = scnprintf(p, buflen, "mech=%s uid=%d ", mech->gm_name,
- from_kuid(&init_user_ns, gss_msg->uid));
+ len = scnprintf(p, buflen, "mech=%s uid=%d", mech->gm_name,
+ from_kuid_munged(userns, gss_msg->uid));
buflen -= len;
p += len;
gss_msg->msg.len = len;
@@ -467,7 +459,7 @@
* identity that we are authenticating to.
*/
if (target_name) {
- len = scnprintf(p, buflen, "target=%s ", target_name);
+ len = scnprintf(p, buflen, " target=%s", target_name);
buflen -= len;
p += len;
gss_msg->msg.len += len;
@@ -487,11 +479,11 @@
char *c = strchr(service_name, '@');
if (!c)
- len = scnprintf(p, buflen, "service=%s ",
+ len = scnprintf(p, buflen, " service=%s",
service_name);
else
len = scnprintf(p, buflen,
- "service=%.*s srchost=%s ",
+ " service=%.*s srchost=%s",
(int)(c - service_name),
service_name, c + 1);
buflen -= len;
@@ -500,17 +492,17 @@
}
if (mech->gm_upcall_enctypes) {
- len = scnprintf(p, buflen, "enctypes=%s ",
+ len = scnprintf(p, buflen, " enctypes=%s",
mech->gm_upcall_enctypes);
buflen -= len;
p += len;
gss_msg->msg.len += len;
}
+ trace_rpcgss_upcall_msg(gss_msg->databuf);
len = scnprintf(p, buflen, "\n");
if (len == 0)
goto out_overflow;
gss_msg->msg.len += len;
-
gss_msg->msg.data = gss_msg->databuf;
return 0;
out_overflow:
@@ -518,6 +510,25 @@
return -ENOMEM;
}
+static ssize_t
+gss_v1_upcall(struct file *file, struct rpc_pipe_msg *msg,
+ char __user *buf, size_t buflen)
+{
+ struct gss_upcall_msg *gss_msg = container_of(msg,
+ struct gss_upcall_msg,
+ msg);
+ int err;
+ if (msg->copied == 0) {
+ err = gss_encode_v1_msg(gss_msg,
+ gss_msg->service_name,
+ gss_msg->auth->target_name,
+ file->f_cred);
+ if (err)
+ return err;
+ }
+ return rpc_pipe_generic_upcall(file, msg, buf, buflen);
+}
+
static struct gss_upcall_msg *
gss_alloc_msg(struct gss_auth *gss_auth,
kuid_t uid, const char *service_name)
@@ -540,16 +551,14 @@
refcount_set(&gss_msg->count, 1);
gss_msg->uid = uid;
gss_msg->auth = gss_auth;
- switch (vers) {
- case 0:
- gss_encode_v0_msg(gss_msg);
- break;
- default:
- err = gss_encode_v1_msg(gss_msg, service_name, gss_auth->target_name);
- if (err)
- goto err_put_pipe_version;
- }
kref_get(&gss_auth->kref);
+ if (service_name) {
+ gss_msg->service_name = kstrdup_const(service_name, GFP_NOFS);
+ if (!gss_msg->service_name) {
+ err = -ENOMEM;
+ goto err_put_pipe_version;
+ }
+ }
return gss_msg;
err_put_pipe_version:
put_pipe_version(gss_auth->net);
@@ -565,7 +574,7 @@
struct gss_cred *gss_cred = container_of(cred,
struct gss_cred, gc_base);
struct gss_upcall_msg *gss_new, *gss_msg;
- kuid_t uid = cred->cr_uid;
+ kuid_t uid = cred->cr_cred->fsuid;
gss_new = gss_alloc_msg(gss_auth, uid, gss_cred->gc_principal);
if (IS_ERR(gss_new))
@@ -603,16 +612,15 @@
struct rpc_pipe *pipe;
int err = 0;
- dprintk("RPC: %5u %s for uid %u\n",
- task->tk_pid, __func__, from_kuid(&init_user_ns, cred->cr_uid));
gss_msg = gss_setup_upcall(gss_auth, cred);
if (PTR_ERR(gss_msg) == -EAGAIN) {
/* XXX: warning on the first, under the assumption we
* shouldn't normally hit this case on a refresh. */
warn_gssd();
- task->tk_timeout = 15*HZ;
- rpc_sleep_on(&pipe_version_rpc_waitqueue, task, NULL);
- return -EAGAIN;
+ rpc_sleep_on_timeout(&pipe_version_rpc_waitqueue,
+ task, NULL, jiffies + (15 * HZ));
+ err = -EAGAIN;
+ goto out;
}
if (IS_ERR(gss_msg)) {
err = PTR_ERR(gss_msg);
@@ -623,7 +631,6 @@
if (gss_cred->gc_upcall != NULL)
rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL);
else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) {
- task->tk_timeout = 0;
gss_cred->gc_upcall = gss_msg;
/* gss_upcall_callback will release the reference to gss_upcall_msg */
refcount_inc(&gss_msg->count);
@@ -635,9 +642,8 @@
spin_unlock(&pipe->lock);
gss_release_msg(gss_msg);
out:
- dprintk("RPC: %5u %s for uid %u result %d\n",
- task->tk_pid, __func__,
- from_kuid(&init_user_ns, cred->cr_uid), err);
+ trace_rpcgss_upcall_result(from_kuid(&init_user_ns,
+ cred->cr_cred->fsuid), err);
return err;
}
@@ -652,14 +658,13 @@
DEFINE_WAIT(wait);
int err;
- dprintk("RPC: %s for uid %u\n",
- __func__, from_kuid(&init_user_ns, cred->cr_uid));
retry:
err = 0;
/* if gssd is down, just skip upcalling altogether */
if (!gssd_running(net)) {
warn_gssd();
- return -EACCES;
+ err = -EACCES;
+ goto out;
}
gss_msg = gss_setup_upcall(gss_auth, cred);
if (PTR_ERR(gss_msg) == -EAGAIN) {
@@ -700,8 +705,8 @@
finish_wait(&gss_msg->waitqueue, &wait);
gss_release_msg(gss_msg);
out:
- dprintk("RPC: %s for uid %u result %d\n",
- __func__, from_kuid(&init_user_ns, cred->cr_uid), err);
+ trace_rpcgss_upcall_result(from_kuid(&init_user_ns,
+ cred->cr_cred->fsuid), err);
return err;
}
@@ -737,7 +742,7 @@
goto err;
}
- uid = make_kuid(&init_user_ns, id);
+ uid = make_kuid(current_user_ns(), id);
if (!uid_valid(uid)) {
err = -EINVAL;
goto err;
@@ -794,7 +799,6 @@
err:
kfree(buf);
out:
- dprintk("RPC: %s returning %zd\n", __func__, err);
return err;
}
@@ -863,8 +867,6 @@
struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg);
if (msg->errno < 0) {
- dprintk("RPC: %s releasing msg %p\n",
- __func__, gss_msg);
refcount_inc(&gss_msg->count);
gss_unhash_msg(gss_msg);
if (msg->errno == -ETIMEDOUT)
@@ -1024,8 +1026,6 @@
struct rpc_auth * auth;
int err = -ENOMEM; /* XXX? */
- dprintk("RPC: creating GSS authenticator for client %p\n", clnt);
-
if (!try_module_get(THIS_MODULE))
return ERR_PTR(err);
if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL)))
@@ -1041,10 +1041,8 @@
gss_auth->net = get_net(rpc_net_ns(clnt));
err = -EINVAL;
gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor);
- if (!gss_auth->mech) {
- dprintk("RPC: Pseudoflavor %d not found!\n", flavor);
+ if (!gss_auth->mech)
goto err_put_net;
- }
gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor);
if (gss_auth->service == 0)
goto err_put_mech;
@@ -1053,12 +1051,14 @@
auth = &gss_auth->rpc_auth;
auth->au_cslack = GSS_CRED_SLACK >> 2;
auth->au_rslack = GSS_VERF_SLACK >> 2;
+ auth->au_verfsize = GSS_VERF_SLACK >> 2;
+ auth->au_ralign = GSS_VERF_SLACK >> 2;
auth->au_flags = 0;
auth->au_ops = &authgss_ops;
auth->au_flavor = flavor;
if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor))
auth->au_flags |= RPCAUTH_AUTH_DATATOUCH;
- atomic_set(&auth->au_count, 1);
+ refcount_set(&auth->au_count, 1);
kref_init(&gss_auth->kref);
err = rpcauth_init_credcache(auth);
@@ -1099,6 +1099,7 @@
kfree(gss_auth);
out_dec:
module_put(THIS_MODULE);
+ trace_rpcgss_createauth(flavor, err);
return ERR_PTR(err);
}
@@ -1135,9 +1136,6 @@
struct gss_auth *gss_auth = container_of(auth,
struct gss_auth, rpc_auth);
- dprintk("RPC: destroying GSS authenticator %p flavor %d\n",
- auth, auth->au_flavor);
-
if (hash_hashed(&gss_auth->hash)) {
spin_lock(&gss_auth_hash_lock);
hash_del(&gss_auth->hash);
@@ -1187,7 +1185,7 @@
if (strcmp(gss_auth->target_name, args->target_name))
continue;
}
- if (!atomic_inc_not_zero(&gss_auth->rpc_auth.au_count))
+ if (!refcount_inc_not_zero(&gss_auth->rpc_auth.au_count))
continue;
goto out;
}
@@ -1239,36 +1237,59 @@
return &gss_auth->rpc_auth;
}
+static struct gss_cred *
+gss_dup_cred(struct gss_auth *gss_auth, struct gss_cred *gss_cred)
+{
+ struct gss_cred *new;
+
+ /* Make a copy of the cred so that we can reference count it */
+ new = kzalloc(sizeof(*gss_cred), GFP_NOFS);
+ if (new) {
+ struct auth_cred acred = {
+ .cred = gss_cred->gc_base.cr_cred,
+ };
+ struct gss_cl_ctx *ctx =
+ rcu_dereference_protected(gss_cred->gc_ctx, 1);
+
+ rpcauth_init_cred(&new->gc_base, &acred,
+ &gss_auth->rpc_auth,
+ &gss_nullops);
+ new->gc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE;
+ new->gc_service = gss_cred->gc_service;
+ new->gc_principal = gss_cred->gc_principal;
+ kref_get(&gss_auth->kref);
+ rcu_assign_pointer(new->gc_ctx, ctx);
+ gss_get_ctx(ctx);
+ }
+ return new;
+}
+
/*
- * gss_destroying_context will cause the RPCSEC_GSS to send a NULL RPC call
+ * gss_send_destroy_context will cause the RPCSEC_GSS to send a NULL RPC call
* to the server with the GSS control procedure field set to
* RPC_GSS_PROC_DESTROY. This should normally cause the server to release
* all RPCSEC_GSS state associated with that context.
*/
-static int
-gss_destroying_context(struct rpc_cred *cred)
+static void
+gss_send_destroy_context(struct rpc_cred *cred)
{
struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth);
struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1);
+ struct gss_cred *new;
struct rpc_task *task;
- if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
- return 0;
+ new = gss_dup_cred(gss_auth, gss_cred);
+ if (new) {
+ ctx->gc_proc = RPC_GSS_PROC_DESTROY;
- ctx->gc_proc = RPC_GSS_PROC_DESTROY;
- cred->cr_ops = &gss_nullops;
+ task = rpc_call_null(gss_auth->client, &new->gc_base,
+ RPC_TASK_ASYNC|RPC_TASK_SOFT);
+ if (!IS_ERR(task))
+ rpc_put_task(task);
- /* Take a reference to ensure the cred will be destroyed either
- * by the RPC call or by the put_rpccred() below */
- get_rpccred(cred);
-
- task = rpc_call_null(gss_auth->client, cred, RPC_TASK_ASYNC|RPC_TASK_SOFT);
- if (!IS_ERR(task))
- rpc_put_task(task);
-
- put_rpccred(cred);
- return 1;
+ put_rpccred(&new->gc_base);
+ }
}
/* gss_destroy_cred (and gss_free_ctx) are used to clean up after failure
@@ -1277,8 +1298,6 @@
static void
gss_do_free_ctx(struct gss_cl_ctx *ctx)
{
- dprintk("RPC: %s\n", __func__);
-
gss_delete_sec_context(&ctx->gc_gss_ctx);
kfree(ctx->gc_wire_ctx.data);
kfree(ctx->gc_acceptor.data);
@@ -1301,7 +1320,6 @@
static void
gss_free_cred(struct gss_cred *gss_cred)
{
- dprintk("RPC: %s cred=%p\n", __func__, gss_cred);
kfree(gss_cred);
}
@@ -1320,6 +1338,7 @@
struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1);
RCU_INIT_POINTER(gss_cred->gc_ctx, NULL);
+ put_cred(cred->cr_cred);
call_rcu(&cred->cr_rcu, gss_free_cred_callback);
if (ctx)
gss_put_ctx(ctx);
@@ -1330,15 +1349,15 @@
gss_destroy_cred(struct rpc_cred *cred)
{
- if (gss_destroying_context(cred))
- return;
+ if (test_and_clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0)
+ gss_send_destroy_context(cred);
gss_destroy_nullcred(cred);
}
static int
gss_hash_cred(struct auth_cred *acred, unsigned int hashbits)
{
- return hash_64(from_kuid(&init_user_ns, acred->uid), hashbits);
+ return hash_64(from_kuid(&init_user_ns, acred->cred->fsuid), hashbits);
}
/*
@@ -1357,10 +1376,6 @@
struct gss_cred *cred = NULL;
int err = -ENOMEM;
- dprintk("RPC: %s for uid %d, flavor %d\n",
- __func__, from_kuid(&init_user_ns, acred->uid),
- auth->au_flavor);
-
if (!(cred = kzalloc(sizeof(*cred), gfp)))
goto out_err;
@@ -1371,14 +1386,11 @@
*/
cred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_NEW;
cred->gc_service = gss_auth->service;
- cred->gc_principal = NULL;
- if (acred->machine_cred)
- cred->gc_principal = acred->principal;
+ cred->gc_principal = acred->principal;
kref_get(&gss_auth->kref);
return &cred->gc_base;
out_err:
- dprintk("RPC: %s failed with error %d\n", __func__, err);
return ERR_PTR(err);
}
@@ -1495,85 +1507,93 @@
if (gss_cred->gc_principal == NULL)
return 0;
ret = strcmp(acred->principal, gss_cred->gc_principal) == 0;
- goto check_expire;
- }
- if (gss_cred->gc_principal != NULL)
- return 0;
- ret = uid_eq(rc->cr_uid, acred->uid);
-
-check_expire:
- if (ret == 0)
- return ret;
-
- /* Notify acred users of GSS context expiration timeout */
- if (test_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags) &&
- (gss_key_timeout(rc) != 0)) {
- /* test will now be done from generic cred */
- test_and_clear_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags);
- /* tell NFS layer that key will expire soon */
- set_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags);
+ } else {
+ if (gss_cred->gc_principal != NULL)
+ return 0;
+ ret = uid_eq(rc->cr_cred->fsuid, acred->cred->fsuid);
}
return ret;
}
/*
-* Marshal credentials.
-* Maybe we should keep a cached credential for performance reasons.
-*/
-static __be32 *
-gss_marshal(struct rpc_task *task, __be32 *p)
+ * Marshal credentials.
+ *
+ * The expensive part is computing the verifier. We can't cache a
+ * pre-computed version of the verifier because the seqno, which
+ * is different every time, is included in the MIC.
+ */
+static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr)
{
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_cred *cred = req->rq_cred;
struct gss_cred *gss_cred = container_of(cred, struct gss_cred,
gc_base);
struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
- __be32 *cred_len;
+ __be32 *p, *cred_len;
u32 maj_stat = 0;
struct xdr_netobj mic;
struct kvec iov;
struct xdr_buf verf_buf;
+ int status;
- dprintk("RPC: %5u %s\n", task->tk_pid, __func__);
+ /* Credential */
- *p++ = htonl(RPC_AUTH_GSS);
+ p = xdr_reserve_space(xdr, 7 * sizeof(*p) +
+ ctx->gc_wire_ctx.len);
+ if (!p)
+ goto marshal_failed;
+ *p++ = rpc_auth_gss;
cred_len = p++;
spin_lock(&ctx->gc_seq_lock);
- req->rq_seqno = ctx->gc_seq++;
+ req->rq_seqno = (ctx->gc_seq < MAXSEQ) ? ctx->gc_seq++ : MAXSEQ;
spin_unlock(&ctx->gc_seq_lock);
+ if (req->rq_seqno == MAXSEQ)
+ goto expired;
+ trace_rpcgss_seqno(task);
- *p++ = htonl((u32) RPC_GSS_VERSION);
- *p++ = htonl((u32) ctx->gc_proc);
- *p++ = htonl((u32) req->rq_seqno);
- *p++ = htonl((u32) gss_cred->gc_service);
+ *p++ = cpu_to_be32(RPC_GSS_VERSION);
+ *p++ = cpu_to_be32(ctx->gc_proc);
+ *p++ = cpu_to_be32(req->rq_seqno);
+ *p++ = cpu_to_be32(gss_cred->gc_service);
p = xdr_encode_netobj(p, &ctx->gc_wire_ctx);
- *cred_len = htonl((p - (cred_len + 1)) << 2);
+ *cred_len = cpu_to_be32((p - (cred_len + 1)) << 2);
+
+ /* Verifier */
/* We compute the checksum for the verifier over the xdr-encoded bytes
* starting with the xid and ending at the end of the credential: */
- iov.iov_base = xprt_skip_transport_header(req->rq_xprt,
- req->rq_snd_buf.head[0].iov_base);
+ iov.iov_base = req->rq_snd_buf.head[0].iov_base;
iov.iov_len = (u8 *)p - (u8 *)iov.iov_base;
xdr_buf_from_iov(&iov, &verf_buf);
- /* set verifier flavor*/
- *p++ = htonl(RPC_AUTH_GSS);
-
+ p = xdr_reserve_space(xdr, sizeof(*p));
+ if (!p)
+ goto marshal_failed;
+ *p++ = rpc_auth_gss;
mic.data = (u8 *)(p + 1);
maj_stat = gss_get_mic(ctx->gc_gss_ctx, &verf_buf, &mic);
- if (maj_stat == GSS_S_CONTEXT_EXPIRED) {
- clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
- } else if (maj_stat != 0) {
- printk("gss_marshal: gss_get_mic FAILED (%d)\n", maj_stat);
- goto out_put_ctx;
- }
- p = xdr_encode_opaque(p, NULL, mic.len);
+ if (maj_stat == GSS_S_CONTEXT_EXPIRED)
+ goto expired;
+ else if (maj_stat != 0)
+ goto bad_mic;
+ if (xdr_stream_encode_opaque_inline(xdr, (void **)&p, mic.len) < 0)
+ goto marshal_failed;
+ status = 0;
+out:
gss_put_ctx(ctx);
- return p;
-out_put_ctx:
- gss_put_ctx(ctx);
- return NULL;
+ return status;
+expired:
+ clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+ status = -EKEYEXPIRED;
+ goto out;
+marshal_failed:
+ status = -EMSGSIZE;
+ goto out;
+bad_mic:
+ trace_rpcgss_get_mic(task, maj_stat);
+ status = -EIO;
+ goto out;
}
static int gss_renew_cred(struct rpc_task *task)
@@ -1584,9 +1604,8 @@
gc_base);
struct rpc_auth *auth = oldcred->cr_auth;
struct auth_cred acred = {
- .uid = oldcred->cr_uid,
+ .cred = oldcred->cr_cred,
.principal = gss_cred->gc_principal,
- .machine_cred = (gss_cred->gc_principal != NULL ? 1 : 0),
};
struct rpc_cred *new;
@@ -1648,116 +1667,105 @@
return 0;
}
-static __be32 *
-gss_validate(struct rpc_task *task, __be32 *p)
+static int
+gss_validate(struct rpc_task *task, struct xdr_stream *xdr)
{
struct rpc_cred *cred = task->tk_rqstp->rq_cred;
struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
- __be32 *seq = NULL;
+ __be32 *p, *seq = NULL;
struct kvec iov;
struct xdr_buf verf_buf;
struct xdr_netobj mic;
- u32 flav,len;
- u32 maj_stat;
- __be32 *ret = ERR_PTR(-EIO);
+ u32 len, maj_stat;
+ int status;
- dprintk("RPC: %5u %s\n", task->tk_pid, __func__);
+ p = xdr_inline_decode(xdr, 2 * sizeof(*p));
+ if (!p)
+ goto validate_failed;
+ if (*p++ != rpc_auth_gss)
+ goto validate_failed;
+ len = be32_to_cpup(p);
+ if (len > RPC_MAX_AUTH_SIZE)
+ goto validate_failed;
+ p = xdr_inline_decode(xdr, len);
+ if (!p)
+ goto validate_failed;
- flav = ntohl(*p++);
- if ((len = ntohl(*p++)) > RPC_MAX_AUTH_SIZE)
- goto out_bad;
- if (flav != RPC_AUTH_GSS)
- goto out_bad;
seq = kmalloc(4, GFP_NOFS);
if (!seq)
- goto out_bad;
- *seq = htonl(task->tk_rqstp->rq_seqno);
+ goto validate_failed;
+ *seq = cpu_to_be32(task->tk_rqstp->rq_seqno);
iov.iov_base = seq;
iov.iov_len = 4;
xdr_buf_from_iov(&iov, &verf_buf);
mic.data = (u8 *)p;
mic.len = len;
-
- ret = ERR_PTR(-EACCES);
maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic);
if (maj_stat == GSS_S_CONTEXT_EXPIRED)
clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
- if (maj_stat) {
- dprintk("RPC: %5u %s: gss_verify_mic returned error 0x%08x\n",
- task->tk_pid, __func__, maj_stat);
- goto out_bad;
- }
+ if (maj_stat)
+ goto bad_mic;
+
/* We leave it to unwrap to calculate au_rslack. For now we just
* calculate the length of the verifier: */
cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2;
+ status = 0;
+out:
gss_put_ctx(ctx);
- dprintk("RPC: %5u %s: gss_verify_mic succeeded.\n",
- task->tk_pid, __func__);
kfree(seq);
- return p + XDR_QUADLEN(len);
-out_bad:
- gss_put_ctx(ctx);
- dprintk("RPC: %5u %s failed ret %ld.\n", task->tk_pid, __func__,
- PTR_ERR(ret));
- kfree(seq);
- return ret;
+ return status;
+
+validate_failed:
+ status = -EIO;
+ goto out;
+bad_mic:
+ trace_rpcgss_verify_mic(task, maj_stat);
+ status = -EACCES;
+ goto out;
}
-static void gss_wrap_req_encode(kxdreproc_t encode, struct rpc_rqst *rqstp,
- __be32 *p, void *obj)
+static int gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+ struct rpc_task *task, struct xdr_stream *xdr)
{
- struct xdr_stream xdr;
-
- xdr_init_encode(&xdr, &rqstp->rq_snd_buf, p);
- encode(rqstp, &xdr, obj);
-}
-
-static inline int
-gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
- kxdreproc_t encode, struct rpc_rqst *rqstp,
- __be32 *p, void *obj)
-{
- struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
- struct xdr_buf integ_buf;
- __be32 *integ_len = NULL;
+ struct rpc_rqst *rqstp = task->tk_rqstp;
+ struct xdr_buf integ_buf, *snd_buf = &rqstp->rq_snd_buf;
struct xdr_netobj mic;
- u32 offset;
- __be32 *q;
- struct kvec *iov;
- u32 maj_stat = 0;
- int status = -EIO;
+ __be32 *p, *integ_len;
+ u32 offset, maj_stat;
+ p = xdr_reserve_space(xdr, 2 * sizeof(*p));
+ if (!p)
+ goto wrap_failed;
integ_len = p++;
+ *p = cpu_to_be32(rqstp->rq_seqno);
+
+ if (rpcauth_wrap_req_encode(task, xdr))
+ goto wrap_failed;
+
offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
- *p++ = htonl(rqstp->rq_seqno);
-
- gss_wrap_req_encode(encode, rqstp, p, obj);
-
if (xdr_buf_subsegment(snd_buf, &integ_buf,
offset, snd_buf->len - offset))
- return status;
- *integ_len = htonl(integ_buf.len);
+ goto wrap_failed;
+ *integ_len = cpu_to_be32(integ_buf.len);
- /* guess whether we're in the head or the tail: */
- if (snd_buf->page_len || snd_buf->tail[0].iov_len)
- iov = snd_buf->tail;
- else
- iov = snd_buf->head;
- p = iov->iov_base + iov->iov_len;
+ p = xdr_reserve_space(xdr, 0);
+ if (!p)
+ goto wrap_failed;
mic.data = (u8 *)(p + 1);
-
maj_stat = gss_get_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
- status = -EIO; /* XXX? */
if (maj_stat == GSS_S_CONTEXT_EXPIRED)
clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
else if (maj_stat)
- return status;
- q = xdr_encode_opaque(p, NULL, mic.len);
-
- offset = (u8 *)q - (u8 *)p;
- iov->iov_len += offset;
- snd_buf->len += offset;
+ goto bad_mic;
+ /* Check that the trailing MIC fit in the buffer, after the fact */
+ if (xdr_stream_encode_opaque_inline(xdr, (void **)&p, mic.len) < 0)
+ goto wrap_failed;
return 0;
+wrap_failed:
+ return -EMSGSIZE;
+bad_mic:
+ trace_rpcgss_get_mic(task, maj_stat);
+ return -EIO;
}
static void
@@ -1808,61 +1816,62 @@
return -EAGAIN;
}
-static inline int
-gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
- kxdreproc_t encode, struct rpc_rqst *rqstp,
- __be32 *p, void *obj)
+static int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+ struct rpc_task *task, struct xdr_stream *xdr)
{
+ struct rpc_rqst *rqstp = task->tk_rqstp;
struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
- u32 offset;
- u32 maj_stat;
+ u32 pad, offset, maj_stat;
int status;
- __be32 *opaque_len;
+ __be32 *p, *opaque_len;
struct page **inpages;
int first;
- int pad;
struct kvec *iov;
- char *tmp;
+ status = -EIO;
+ p = xdr_reserve_space(xdr, 2 * sizeof(*p));
+ if (!p)
+ goto wrap_failed;
opaque_len = p++;
- offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
- *p++ = htonl(rqstp->rq_seqno);
+ *p = cpu_to_be32(rqstp->rq_seqno);
- gss_wrap_req_encode(encode, rqstp, p, obj);
+ if (rpcauth_wrap_req_encode(task, xdr))
+ goto wrap_failed;
status = alloc_enc_pages(rqstp);
- if (status)
- return status;
+ if (unlikely(status))
+ goto wrap_failed;
first = snd_buf->page_base >> PAGE_SHIFT;
inpages = snd_buf->pages + first;
snd_buf->pages = rqstp->rq_enc_pages;
snd_buf->page_base -= first << PAGE_SHIFT;
/*
- * Give the tail its own page, in case we need extra space in the
- * head when wrapping:
+ * Move the tail into its own page, in case gss_wrap needs
+ * more space in the head when wrapping.
*
- * call_allocate() allocates twice the slack space required
- * by the authentication flavor to rq_callsize.
- * For GSS, slack is GSS_CRED_SLACK.
+ * Still... Why can't gss_wrap just slide the tail down?
*/
if (snd_buf->page_len || snd_buf->tail[0].iov_len) {
+ char *tmp;
+
tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]);
memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len);
snd_buf->tail[0].iov_base = tmp;
}
+ offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages);
/* slack space should prevent this ever happening: */
- BUG_ON(snd_buf->len > snd_buf->buflen);
- status = -EIO;
+ if (unlikely(snd_buf->len > snd_buf->buflen))
+ goto wrap_failed;
/* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was
* done anyway, so it's safe to put the request on the wire: */
if (maj_stat == GSS_S_CONTEXT_EXPIRED)
clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
else if (maj_stat)
- return status;
+ goto bad_wrap;
- *opaque_len = htonl(snd_buf->len - offset);
- /* guess whether we're in the head or the tail: */
+ *opaque_len = cpu_to_be32(snd_buf->len - offset);
+ /* guess whether the pad goes into the head or the tail: */
if (snd_buf->page_len || snd_buf->tail[0].iov_len)
iov = snd_buf->tail;
else
@@ -1874,158 +1883,229 @@
snd_buf->len += pad;
return 0;
+wrap_failed:
+ return status;
+bad_wrap:
+ trace_rpcgss_wrap(task, maj_stat);
+ return -EIO;
}
-static int
-gss_wrap_req(struct rpc_task *task,
- kxdreproc_t encode, void *rqstp, __be32 *p, void *obj)
+static int gss_wrap_req(struct rpc_task *task, struct xdr_stream *xdr)
{
struct rpc_cred *cred = task->tk_rqstp->rq_cred;
struct gss_cred *gss_cred = container_of(cred, struct gss_cred,
gc_base);
struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
- int status = -EIO;
+ int status;
- dprintk("RPC: %5u %s\n", task->tk_pid, __func__);
+ status = -EIO;
if (ctx->gc_proc != RPC_GSS_PROC_DATA) {
/* The spec seems a little ambiguous here, but I think that not
* wrapping context destruction requests makes the most sense.
*/
- gss_wrap_req_encode(encode, rqstp, p, obj);
- status = 0;
+ status = rpcauth_wrap_req_encode(task, xdr);
goto out;
}
switch (gss_cred->gc_service) {
case RPC_GSS_SVC_NONE:
- gss_wrap_req_encode(encode, rqstp, p, obj);
- status = 0;
+ status = rpcauth_wrap_req_encode(task, xdr);
break;
case RPC_GSS_SVC_INTEGRITY:
- status = gss_wrap_req_integ(cred, ctx, encode, rqstp, p, obj);
+ status = gss_wrap_req_integ(cred, ctx, task, xdr);
break;
case RPC_GSS_SVC_PRIVACY:
- status = gss_wrap_req_priv(cred, ctx, encode, rqstp, p, obj);
+ status = gss_wrap_req_priv(cred, ctx, task, xdr);
break;
+ default:
+ status = -EIO;
}
out:
gss_put_ctx(ctx);
- dprintk("RPC: %5u %s returning %d\n", task->tk_pid, __func__, status);
return status;
}
-static inline int
-gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
- struct rpc_rqst *rqstp, __be32 **p)
+static int
+gss_unwrap_resp_auth(struct rpc_cred *cred)
{
- struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf;
- struct xdr_buf integ_buf;
- struct xdr_netobj mic;
- u32 data_offset, mic_offset;
- u32 integ_len;
- u32 maj_stat;
- int status = -EIO;
+ struct rpc_auth *auth = cred->cr_auth;
- integ_len = ntohl(*(*p)++);
+ auth->au_rslack = auth->au_verfsize;
+ auth->au_ralign = auth->au_verfsize;
+ return 0;
+}
+
+static int
+gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
+ struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr)
+{
+ struct xdr_buf integ_buf, *rcv_buf = &rqstp->rq_rcv_buf;
+ u32 data_offset, mic_offset, integ_len, maj_stat;
+ struct rpc_auth *auth = cred->cr_auth;
+ struct xdr_netobj mic;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 2 * sizeof(*p));
+ if (unlikely(!p))
+ goto unwrap_failed;
+ integ_len = be32_to_cpup(p++);
if (integ_len & 3)
- return status;
- data_offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base;
+ goto unwrap_failed;
+ data_offset = (u8 *)(p) - (u8 *)rcv_buf->head[0].iov_base;
mic_offset = integ_len + data_offset;
if (mic_offset > rcv_buf->len)
- return status;
- if (ntohl(*(*p)++) != rqstp->rq_seqno)
- return status;
+ goto unwrap_failed;
+ if (be32_to_cpup(p) != rqstp->rq_seqno)
+ goto bad_seqno;
- if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset,
- mic_offset - data_offset))
- return status;
-
- if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset))
- return status;
-
+ if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len))
+ goto unwrap_failed;
+ if (xdr_buf_read_mic(rcv_buf, &mic, mic_offset))
+ goto unwrap_failed;
maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
if (maj_stat == GSS_S_CONTEXT_EXPIRED)
clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
if (maj_stat != GSS_S_COMPLETE)
- return status;
+ goto bad_mic;
+
+ auth->au_rslack = auth->au_verfsize + 2 + 1 + XDR_QUADLEN(mic.len);
+ auth->au_ralign = auth->au_verfsize + 2;
return 0;
+unwrap_failed:
+ trace_rpcgss_unwrap_failed(task);
+ return -EIO;
+bad_seqno:
+ trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(p));
+ return -EIO;
+bad_mic:
+ trace_rpcgss_verify_mic(task, maj_stat);
+ return -EIO;
}
-static inline int
-gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
- struct rpc_rqst *rqstp, __be32 **p)
+static int
+gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred,
+ struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr)
{
- struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf;
- u32 offset;
- u32 opaque_len;
- u32 maj_stat;
- int status = -EIO;
+ struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf;
+ struct kvec *head = rqstp->rq_rcv_buf.head;
+ struct rpc_auth *auth = cred->cr_auth;
+ unsigned int savedlen = rcv_buf->len;
+ u32 offset, opaque_len, maj_stat;
+ __be32 *p;
- opaque_len = ntohl(*(*p)++);
- offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base;
+ p = xdr_inline_decode(xdr, 2 * sizeof(*p));
+ if (unlikely(!p))
+ goto unwrap_failed;
+ opaque_len = be32_to_cpup(p++);
+ offset = (u8 *)(p) - (u8 *)head->iov_base;
if (offset + opaque_len > rcv_buf->len)
- return status;
- /* remove padding: */
+ goto unwrap_failed;
rcv_buf->len = offset + opaque_len;
maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset, rcv_buf);
if (maj_stat == GSS_S_CONTEXT_EXPIRED)
clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
if (maj_stat != GSS_S_COMPLETE)
- return status;
- if (ntohl(*(*p)++) != rqstp->rq_seqno)
- return status;
+ goto bad_unwrap;
+ /* gss_unwrap decrypted the sequence number */
+ if (be32_to_cpup(p++) != rqstp->rq_seqno)
+ goto bad_seqno;
+ /* gss_unwrap redacts the opaque blob from the head iovec.
+ * rcv_buf has changed, thus the stream needs to be reset.
+ */
+ xdr_init_decode(xdr, rcv_buf, p, rqstp);
+
+ auth->au_rslack = auth->au_verfsize + 2 +
+ XDR_QUADLEN(savedlen - rcv_buf->len);
+ auth->au_ralign = auth->au_verfsize + 2 +
+ XDR_QUADLEN(savedlen - rcv_buf->len);
return 0;
+unwrap_failed:
+ trace_rpcgss_unwrap_failed(task);
+ return -EIO;
+bad_seqno:
+ trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(--p));
+ return -EIO;
+bad_unwrap:
+ trace_rpcgss_unwrap(task, maj_stat);
+ return -EIO;
+}
+
+static bool
+gss_seq_is_newer(u32 new, u32 old)
+{
+ return (s32)(new - old) > 0;
+}
+
+static bool
+gss_xmit_need_reencode(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_cred *cred = req->rq_cred;
+ struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
+ u32 win, seq_xmit = 0;
+ bool ret = true;
+
+ if (!ctx)
+ goto out;
+
+ if (gss_seq_is_newer(req->rq_seqno, READ_ONCE(ctx->gc_seq)))
+ goto out_ctx;
+
+ seq_xmit = READ_ONCE(ctx->gc_seq_xmit);
+ while (gss_seq_is_newer(req->rq_seqno, seq_xmit)) {
+ u32 tmp = seq_xmit;
+
+ seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, req->rq_seqno);
+ if (seq_xmit == tmp) {
+ ret = false;
+ goto out_ctx;
+ }
+ }
+
+ win = ctx->gc_win;
+ if (win > 0)
+ ret = !gss_seq_is_newer(req->rq_seqno, seq_xmit - win);
+
+out_ctx:
+ gss_put_ctx(ctx);
+out:
+ trace_rpcgss_need_reencode(task, seq_xmit, ret);
+ return ret;
}
static int
-gss_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp,
- __be32 *p, void *obj)
+gss_unwrap_resp(struct rpc_task *task, struct xdr_stream *xdr)
{
- struct xdr_stream xdr;
-
- xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
- return decode(rqstp, &xdr, obj);
-}
-
-static int
-gss_unwrap_resp(struct rpc_task *task,
- kxdrdproc_t decode, void *rqstp, __be32 *p, void *obj)
-{
- struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+ struct rpc_rqst *rqstp = task->tk_rqstp;
+ struct rpc_cred *cred = rqstp->rq_cred;
struct gss_cred *gss_cred = container_of(cred, struct gss_cred,
gc_base);
struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
- __be32 *savedp = p;
- struct kvec *head = ((struct rpc_rqst *)rqstp)->rq_rcv_buf.head;
- int savedlen = head->iov_len;
- int status = -EIO;
+ int status = -EIO;
if (ctx->gc_proc != RPC_GSS_PROC_DATA)
goto out_decode;
switch (gss_cred->gc_service) {
case RPC_GSS_SVC_NONE:
+ status = gss_unwrap_resp_auth(cred);
break;
case RPC_GSS_SVC_INTEGRITY:
- status = gss_unwrap_resp_integ(cred, ctx, rqstp, &p);
- if (status)
- goto out;
+ status = gss_unwrap_resp_integ(task, cred, ctx, rqstp, xdr);
break;
case RPC_GSS_SVC_PRIVACY:
- status = gss_unwrap_resp_priv(cred, ctx, rqstp, &p);
- if (status)
- goto out;
+ status = gss_unwrap_resp_priv(task, cred, ctx, rqstp, xdr);
break;
}
- /* take into account extra slack for integrity and privacy cases: */
- cred->cr_auth->au_rslack = cred->cr_auth->au_verfsize + (p - savedp)
- + (savedlen - head->iov_len);
+ if (status)
+ goto out;
+
out_decode:
- status = gss_unwrap_req_decode(decode, rqstp, p, obj);
+ status = rpcauth_unwrap_resp_decode(task, xdr);
out:
gss_put_ctx(ctx);
- dprintk("RPC: %5u %s returning %d\n",
- task->tk_pid, __func__, status);
return status;
}
@@ -2047,7 +2127,6 @@
.cr_name = "AUTH_GSS",
.crdestroy = gss_destroy_cred,
.cr_init = gss_cred_init,
- .crbind = rpcauth_generic_bind_cred,
.crmatch = gss_match,
.crmarshal = gss_marshal,
.crrefresh = gss_refresh,
@@ -2056,12 +2135,12 @@
.crunwrap_resp = gss_unwrap_resp,
.crkey_timeout = gss_key_timeout,
.crstringify_acceptor = gss_stringify_acceptor,
+ .crneed_reencode = gss_xmit_need_reencode,
};
static const struct rpc_credops gss_nullops = {
.cr_name = "AUTH_GSS",
.crdestroy = gss_destroy_nullcred,
- .crbind = rpcauth_generic_bind_cred,
.crmatch = gss_match,
.crmarshal = gss_marshal,
.crrefresh = gss_refresh_null,
@@ -2072,7 +2151,7 @@
};
static const struct rpc_pipe_ops gss_upcall_ops_v0 = {
- .upcall = rpc_pipe_generic_upcall,
+ .upcall = gss_v0_upcall,
.downcall = gss_pipe_downcall,
.destroy_msg = gss_pipe_destroy_msg,
.open_pipe = gss_pipe_open_v0,
@@ -2080,7 +2159,7 @@
};
static const struct rpc_pipe_ops gss_upcall_ops_v1 = {
- .upcall = rpc_pipe_generic_upcall,
+ .upcall = gss_v1_upcall,
.downcall = gss_pipe_downcall,
.destroy_msg = gss_pipe_destroy_msg,
.open_pipe = gss_pipe_open_v1,
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 0220e1c..6f2d30d 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -53,7 +53,7 @@
u32
krb5_encrypt(
- struct crypto_skcipher *tfm,
+ struct crypto_sync_skcipher *tfm,
void * iv,
void * in,
void * out,
@@ -62,24 +62,24 @@
u32 ret = -EINVAL;
struct scatterlist sg[1];
u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0};
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
- if (length % crypto_skcipher_blocksize(tfm) != 0)
+ if (length % crypto_sync_skcipher_blocksize(tfm) != 0)
goto out;
- if (crypto_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
+ if (crypto_sync_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
dprintk("RPC: gss_k5encrypt: tfm iv size too large %d\n",
- crypto_skcipher_ivsize(tfm));
+ crypto_sync_skcipher_ivsize(tfm));
goto out;
}
if (iv)
- memcpy(local_iv, iv, crypto_skcipher_ivsize(tfm));
+ memcpy(local_iv, iv, crypto_sync_skcipher_ivsize(tfm));
memcpy(out, in, length);
sg_init_one(sg, out, length);
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, length, local_iv);
@@ -92,7 +92,7 @@
u32
krb5_decrypt(
- struct crypto_skcipher *tfm,
+ struct crypto_sync_skcipher *tfm,
void * iv,
void * in,
void * out,
@@ -101,23 +101,23 @@
u32 ret = -EINVAL;
struct scatterlist sg[1];
u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0};
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
- if (length % crypto_skcipher_blocksize(tfm) != 0)
+ if (length % crypto_sync_skcipher_blocksize(tfm) != 0)
goto out;
- if (crypto_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
+ if (crypto_sync_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
dprintk("RPC: gss_k5decrypt: tfm iv size too large %d\n",
- crypto_skcipher_ivsize(tfm));
+ crypto_sync_skcipher_ivsize(tfm));
goto out;
}
if (iv)
- memcpy(local_iv,iv, crypto_skcipher_ivsize(tfm));
+ memcpy(local_iv, iv, crypto_sync_skcipher_ivsize(tfm));
memcpy(out, in, length);
sg_init_one(sg, out, length);
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, length, local_iv);
@@ -466,7 +466,8 @@
{
struct encryptor_desc *desc = data;
struct xdr_buf *outbuf = desc->outbuf;
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(desc->req);
+ struct crypto_sync_skcipher *tfm =
+ crypto_sync_skcipher_reqtfm(desc->req);
struct page *in_page;
int thislen = desc->fraglen + sg->length;
int fraglen, ret;
@@ -492,7 +493,7 @@
desc->fraglen += sg->length;
desc->pos += sg->length;
- fraglen = thislen & (crypto_skcipher_blocksize(tfm) - 1);
+ fraglen = thislen & (crypto_sync_skcipher_blocksize(tfm) - 1);
thislen -= fraglen;
if (thislen == 0)
@@ -526,16 +527,16 @@
}
int
-gss_encrypt_xdr_buf(struct crypto_skcipher *tfm, struct xdr_buf *buf,
+gss_encrypt_xdr_buf(struct crypto_sync_skcipher *tfm, struct xdr_buf *buf,
int offset, struct page **pages)
{
int ret;
struct encryptor_desc desc;
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
- BUG_ON((buf->len - offset) % crypto_skcipher_blocksize(tfm) != 0);
+ BUG_ON((buf->len - offset) % crypto_sync_skcipher_blocksize(tfm) != 0);
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_callback(req, 0, NULL, NULL);
memset(desc.iv, 0, sizeof(desc.iv));
@@ -567,7 +568,8 @@
{
struct decryptor_desc *desc = data;
int thislen = desc->fraglen + sg->length;
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(desc->req);
+ struct crypto_sync_skcipher *tfm =
+ crypto_sync_skcipher_reqtfm(desc->req);
int fraglen, ret;
/* Worst case is 4 fragments: head, end of page 1, start
@@ -578,7 +580,7 @@
desc->fragno++;
desc->fraglen += sg->length;
- fraglen = thislen & (crypto_skcipher_blocksize(tfm) - 1);
+ fraglen = thislen & (crypto_sync_skcipher_blocksize(tfm) - 1);
thislen -= fraglen;
if (thislen == 0)
@@ -608,17 +610,17 @@
}
int
-gss_decrypt_xdr_buf(struct crypto_skcipher *tfm, struct xdr_buf *buf,
+gss_decrypt_xdr_buf(struct crypto_sync_skcipher *tfm, struct xdr_buf *buf,
int offset)
{
int ret;
struct decryptor_desc desc;
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
/* XXXJBF: */
- BUG_ON((buf->len - offset) % crypto_skcipher_blocksize(tfm) != 0);
+ BUG_ON((buf->len - offset) % crypto_sync_skcipher_blocksize(tfm) != 0);
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_callback(req, 0, NULL, NULL);
memset(desc.iv, 0, sizeof(desc.iv));
@@ -672,12 +674,12 @@
}
static u32
-gss_krb5_cts_crypt(struct crypto_skcipher *cipher, struct xdr_buf *buf,
+gss_krb5_cts_crypt(struct crypto_sync_skcipher *cipher, struct xdr_buf *buf,
u32 offset, u8 *iv, struct page **pages, int encrypt)
{
u32 ret;
struct scatterlist sg[1];
- SKCIPHER_REQUEST_ON_STACK(req, cipher);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, cipher);
u8 *data;
struct page **save_pages;
u32 len = buf->len - offset;
@@ -706,7 +708,7 @@
sg_init_one(sg, data, len);
- skcipher_request_set_tfm(req, cipher);
+ skcipher_request_set_sync_tfm(req, cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, len, iv);
@@ -735,7 +737,7 @@
struct xdr_netobj hmac;
u8 *cksumkey;
u8 *ecptr;
- struct crypto_skcipher *cipher, *aux_cipher;
+ struct crypto_sync_skcipher *cipher, *aux_cipher;
int blocksize;
struct page **save_pages;
int nblocks, nbytes;
@@ -754,7 +756,7 @@
cksumkey = kctx->acceptor_integ;
usage = KG_USAGE_ACCEPTOR_SEAL;
}
- blocksize = crypto_skcipher_blocksize(cipher);
+ blocksize = crypto_sync_skcipher_blocksize(cipher);
/* hide the gss token header and insert the confounder */
offset += GSS_KRB5_TOK_HDR_LEN;
@@ -807,7 +809,7 @@
memset(desc.iv, 0, sizeof(desc.iv));
if (cbcbytes) {
- SKCIPHER_REQUEST_ON_STACK(req, aux_cipher);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, aux_cipher);
desc.pos = offset + GSS_KRB5_TOK_HDR_LEN;
desc.fragno = 0;
@@ -816,7 +818,7 @@
desc.outbuf = buf;
desc.req = req;
- skcipher_request_set_tfm(req, aux_cipher);
+ skcipher_request_set_sync_tfm(req, aux_cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
sg_init_table(desc.infrags, 4);
@@ -855,7 +857,7 @@
struct xdr_buf subbuf;
u32 ret = 0;
u8 *cksum_key;
- struct crypto_skcipher *cipher, *aux_cipher;
+ struct crypto_sync_skcipher *cipher, *aux_cipher;
struct xdr_netobj our_hmac_obj;
u8 our_hmac[GSS_KRB5_MAX_CKSUM_LEN];
u8 pkt_hmac[GSS_KRB5_MAX_CKSUM_LEN];
@@ -874,7 +876,7 @@
cksum_key = kctx->initiator_integ;
usage = KG_USAGE_INITIATOR_SEAL;
}
- blocksize = crypto_skcipher_blocksize(cipher);
+ blocksize = crypto_sync_skcipher_blocksize(cipher);
/* create a segment skipping the header and leaving out the checksum */
@@ -891,13 +893,13 @@
memset(desc.iv, 0, sizeof(desc.iv));
if (cbcbytes) {
- SKCIPHER_REQUEST_ON_STACK(req, aux_cipher);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, aux_cipher);
desc.fragno = 0;
desc.fraglen = 0;
desc.req = req;
- skcipher_request_set_tfm(req, aux_cipher);
+ skcipher_request_set_sync_tfm(req, aux_cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
sg_init_table(desc.frags, 4);
@@ -946,7 +948,8 @@
* Set the key of the given cipher.
*/
int
-krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher,
+krb5_rc4_setup_seq_key(struct krb5_ctx *kctx,
+ struct crypto_sync_skcipher *cipher,
unsigned char *cksum)
{
struct crypto_shash *hmac;
@@ -974,7 +977,6 @@
}
desc->tfm = hmac;
- desc->flags = 0;
/* Compute intermediate Kseq from session key */
err = crypto_shash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength);
@@ -994,7 +996,7 @@
if (err)
goto out_err;
- err = crypto_skcipher_setkey(cipher, Kseq, kctx->gk5e->keylength);
+ err = crypto_sync_skcipher_setkey(cipher, Kseq, kctx->gk5e->keylength);
if (err)
goto out_err;
@@ -1012,7 +1014,8 @@
* Set the key of cipher kctx->enc.
*/
int
-krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher,
+krb5_rc4_setup_enc_key(struct krb5_ctx *kctx,
+ struct crypto_sync_skcipher *cipher,
s32 seqnum)
{
struct crypto_shash *hmac;
@@ -1041,7 +1044,6 @@
}
desc->tfm = hmac;
- desc->flags = 0;
/* Compute intermediate Kcrypt from session key */
for (i = 0; i < kctx->gk5e->keylength; i++)
@@ -1069,7 +1071,8 @@
if (err)
goto out_err;
- err = crypto_skcipher_setkey(cipher, Kcrypt, kctx->gk5e->keylength);
+ err = crypto_sync_skcipher_setkey(cipher, Kcrypt,
+ kctx->gk5e->keylength);
if (err)
goto out_err;
diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c
index f7fe2d2..3b7f721 100644
--- a/net/sunrpc/auth_gss/gss_krb5_keys.c
+++ b/net/sunrpc/auth_gss/gss_krb5_keys.c
@@ -147,7 +147,7 @@
size_t blocksize, keybytes, keylength, n;
unsigned char *inblockdata, *outblockdata, *rawkey;
struct xdr_netobj inblock, outblock;
- struct crypto_skcipher *cipher;
+ struct crypto_sync_skcipher *cipher;
u32 ret = EINVAL;
blocksize = gk5e->blocksize;
@@ -157,11 +157,10 @@
if ((inkey->len != keylength) || (outkey->len != keylength))
goto err_return;
- cipher = crypto_alloc_skcipher(gk5e->encrypt_name, 0,
- CRYPTO_ALG_ASYNC);
+ cipher = crypto_alloc_sync_skcipher(gk5e->encrypt_name, 0, 0);
if (IS_ERR(cipher))
goto err_return;
- if (crypto_skcipher_setkey(cipher, inkey->data, inkey->len))
+ if (crypto_sync_skcipher_setkey(cipher, inkey->data, inkey->len))
goto err_return;
/* allocate and set up buffers */
@@ -229,16 +228,13 @@
ret = 0;
err_free_raw:
- memset(rawkey, 0, keybytes);
- kfree(rawkey);
+ kzfree(rawkey);
err_free_out:
- memset(outblockdata, 0, blocksize);
- kfree(outblockdata);
+ kzfree(outblockdata);
err_free_in:
- memset(inblockdata, 0, blocksize);
- kfree(inblockdata);
+ kzfree(inblockdata);
err_free_cipher:
- crypto_free_skcipher(cipher);
+ crypto_free_sync_skcipher(cipher);
err_return:
return ret;
}
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 7bb2514..6e5d6d2 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: BSD-3-Clause
/*
* linux/net/sunrpc/gss_krb5_mech.c
*
@@ -6,32 +7,6 @@
*
* Andy Adamson <andros@umich.edu>
* J. Bruce Fields <bfields@umich.edu>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
*/
#include <crypto/hash.h>
@@ -53,6 +28,7 @@
static struct gss_api_mech gss_kerberos_mech; /* forward declaration */
static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = {
+#ifndef CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES
/*
* DES (All DES enctypes are mapped to the same gss functionality)
*/
@@ -74,6 +50,7 @@
.cksumlength = 8,
.keyed_cksum = 0,
},
+#endif /* CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES */
/*
* RC4-HMAC
*/
@@ -218,7 +195,7 @@
static inline const void *
get_key(const void *p, const void *end,
- struct krb5_ctx *ctx, struct crypto_skcipher **res)
+ struct krb5_ctx *ctx, struct crypto_sync_skcipher **res)
{
struct xdr_netobj key;
int alg;
@@ -246,15 +223,14 @@
if (IS_ERR(p))
goto out_err;
- *res = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0,
- CRYPTO_ALG_ASYNC);
+ *res = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0);
if (IS_ERR(*res)) {
printk(KERN_WARNING "gss_kerberos_mech: unable to initialize "
"crypto algorithm %s\n", ctx->gk5e->encrypt_name);
*res = NULL;
goto out_err_free_key;
}
- if (crypto_skcipher_setkey(*res, key.data, key.len)) {
+ if (crypto_sync_skcipher_setkey(*res, key.data, key.len)) {
printk(KERN_WARNING "gss_kerberos_mech: error setting key for "
"crypto algorithm %s\n", ctx->gk5e->encrypt_name);
goto out_err_free_tfm;
@@ -264,7 +240,7 @@
return p;
out_err_free_tfm:
- crypto_free_skcipher(*res);
+ crypto_free_sync_skcipher(*res);
out_err_free_key:
kfree(key.data);
p = ERR_PTR(-EINVAL);
@@ -275,6 +251,7 @@
static int
gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx)
{
+ u32 seq_send;
int tmp;
p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate));
@@ -316,9 +293,10 @@
p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime));
if (IS_ERR(p))
goto out_err;
- p = simple_get_bytes(p, end, &ctx->seq_send, sizeof(ctx->seq_send));
+ p = simple_get_bytes(p, end, &seq_send, sizeof(seq_send));
if (IS_ERR(p))
goto out_err;
+ atomic_set(&ctx->seq_send, seq_send);
p = simple_get_netobj(p, end, &ctx->mech_used);
if (IS_ERR(p))
goto out_err;
@@ -336,30 +314,30 @@
return 0;
out_err_free_key2:
- crypto_free_skcipher(ctx->seq);
+ crypto_free_sync_skcipher(ctx->seq);
out_err_free_key1:
- crypto_free_skcipher(ctx->enc);
+ crypto_free_sync_skcipher(ctx->enc);
out_err_free_mech:
kfree(ctx->mech_used.data);
out_err:
return PTR_ERR(p);
}
-static struct crypto_skcipher *
+static struct crypto_sync_skcipher *
context_v2_alloc_cipher(struct krb5_ctx *ctx, const char *cname, u8 *key)
{
- struct crypto_skcipher *cp;
+ struct crypto_sync_skcipher *cp;
- cp = crypto_alloc_skcipher(cname, 0, CRYPTO_ALG_ASYNC);
+ cp = crypto_alloc_sync_skcipher(cname, 0, 0);
if (IS_ERR(cp)) {
dprintk("gss_kerberos_mech: unable to initialize "
"crypto algorithm %s\n", cname);
return NULL;
}
- if (crypto_skcipher_setkey(cp, key, ctx->gk5e->keylength)) {
+ if (crypto_sync_skcipher_setkey(cp, key, ctx->gk5e->keylength)) {
dprintk("gss_kerberos_mech: error setting key for "
"crypto algorithm %s\n", cname);
- crypto_free_skcipher(cp);
+ crypto_free_sync_skcipher(cp);
return NULL;
}
return cp;
@@ -413,9 +391,9 @@
return 0;
out_free_enc:
- crypto_free_skcipher(ctx->enc);
+ crypto_free_sync_skcipher(ctx->enc);
out_free_seq:
- crypto_free_skcipher(ctx->seq);
+ crypto_free_sync_skcipher(ctx->seq);
out_err:
return -EINVAL;
}
@@ -460,7 +438,6 @@
}
desc->tfm = hmac;
- desc->flags = 0;
err = crypto_shash_digest(desc, sigkeyconstant, slen, ctx->cksum);
kzfree(desc);
@@ -469,17 +446,15 @@
/*
* allocate hash, and skciphers for data and seqnum encryption
*/
- ctx->enc = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0,
- CRYPTO_ALG_ASYNC);
+ ctx->enc = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0);
if (IS_ERR(ctx->enc)) {
err = PTR_ERR(ctx->enc);
goto out_err_free_hmac;
}
- ctx->seq = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0,
- CRYPTO_ALG_ASYNC);
+ ctx->seq = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0);
if (IS_ERR(ctx->seq)) {
- crypto_free_skcipher(ctx->enc);
+ crypto_free_sync_skcipher(ctx->enc);
err = PTR_ERR(ctx->seq);
goto out_err_free_hmac;
}
@@ -591,7 +566,7 @@
context_v2_alloc_cipher(ctx, "cbc(aes)",
ctx->acceptor_seal);
if (ctx->acceptor_enc_aux == NULL) {
- crypto_free_skcipher(ctx->initiator_enc_aux);
+ crypto_free_sync_skcipher(ctx->initiator_enc_aux);
goto out_free_acceptor_enc;
}
}
@@ -599,9 +574,9 @@
return 0;
out_free_acceptor_enc:
- crypto_free_skcipher(ctx->acceptor_enc);
+ crypto_free_sync_skcipher(ctx->acceptor_enc);
out_free_initiator_enc:
- crypto_free_skcipher(ctx->initiator_enc);
+ crypto_free_sync_skcipher(ctx->initiator_enc);
out_err:
return -EINVAL;
}
@@ -610,6 +585,7 @@
gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx,
gfp_t gfp_mask)
{
+ u64 seq_send64;
int keylen;
p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags));
@@ -620,14 +596,15 @@
p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime));
if (IS_ERR(p))
goto out_err;
- p = simple_get_bytes(p, end, &ctx->seq_send64, sizeof(ctx->seq_send64));
+ p = simple_get_bytes(p, end, &seq_send64, sizeof(seq_send64));
if (IS_ERR(p))
goto out_err;
+ atomic64_set(&ctx->seq_send64, seq_send64);
/* set seq_send for use by "older" enctypes */
- ctx->seq_send = ctx->seq_send64;
- if (ctx->seq_send64 != ctx->seq_send) {
- dprintk("%s: seq_send64 %lx, seq_send %x overflow?\n", __func__,
- (unsigned long)ctx->seq_send64, ctx->seq_send);
+ atomic_set(&ctx->seq_send, seq_send64);
+ if (seq_send64 != atomic_read(&ctx->seq_send)) {
+ dprintk("%s: seq_send64 %llx, seq_send %x overflow?\n", __func__,
+ seq_send64, atomic_read(&ctx->seq_send));
p = ERR_PTR(-EINVAL);
goto out_err;
}
@@ -713,12 +690,12 @@
gss_delete_sec_context_kerberos(void *internal_ctx) {
struct krb5_ctx *kctx = internal_ctx;
- crypto_free_skcipher(kctx->seq);
- crypto_free_skcipher(kctx->enc);
- crypto_free_skcipher(kctx->acceptor_enc);
- crypto_free_skcipher(kctx->initiator_enc);
- crypto_free_skcipher(kctx->acceptor_enc_aux);
- crypto_free_skcipher(kctx->initiator_enc_aux);
+ crypto_free_sync_skcipher(kctx->seq);
+ crypto_free_sync_skcipher(kctx->enc);
+ crypto_free_sync_skcipher(kctx->acceptor_enc);
+ crypto_free_sync_skcipher(kctx->initiator_enc);
+ crypto_free_sync_skcipher(kctx->acceptor_enc_aux);
+ crypto_free_sync_skcipher(kctx->initiator_enc_aux);
kfree(kctx->mech_used.data);
kfree(kctx);
}
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index eaad9bc..48fe4a5 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -63,13 +63,12 @@
#include <linux/sunrpc/gss_krb5.h>
#include <linux/random.h>
#include <linux/crypto.h>
+#include <linux/atomic.h>
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY RPCDBG_AUTH
#endif
-DEFINE_SPINLOCK(krb5_seq_lock);
-
static void *
setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token)
{
@@ -154,9 +153,7 @@
memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len);
- spin_lock(&krb5_seq_lock);
- seq_send = ctx->seq_send++;
- spin_unlock(&krb5_seq_lock);
+ seq_send = atomic_fetch_inc(&ctx->seq_send);
if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff,
seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8))
@@ -174,7 +171,6 @@
.data = cksumdata};
void *krb5_hdr;
s32 now;
- u64 seq_send;
u8 *cksumkey;
unsigned int cksum_usage;
__be64 seq_send_be64;
@@ -185,11 +181,7 @@
/* Set up the sequence number. Now 64-bits in clear
* text and w/o direction indicator */
- spin_lock(&krb5_seq_lock);
- seq_send = ctx->seq_send64++;
- spin_unlock(&krb5_seq_lock);
-
- seq_send_be64 = cpu_to_be64(seq_send);
+ seq_send_be64 = cpu_to_be64(atomic64_fetch_inc(&ctx->seq_send64));
memcpy(krb5_hdr + 8, (char *) &seq_send_be64, 8);
if (ctx->initiate) {
diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
index c8b9082..5071051 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
@@ -43,16 +43,19 @@
krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum,
unsigned char *cksum, unsigned char *buf)
{
- struct crypto_skcipher *cipher;
- unsigned char plain[8];
+ struct crypto_sync_skcipher *cipher;
+ unsigned char *plain;
s32 code;
dprintk("RPC: %s:\n", __func__);
- cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0,
- CRYPTO_ALG_ASYNC);
+ cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, 0, 0);
if (IS_ERR(cipher))
return PTR_ERR(cipher);
+ plain = kmalloc(8, GFP_NOFS);
+ if (!plain)
+ return -ENOMEM;
+
plain[0] = (unsigned char) ((seqnum >> 24) & 0xff);
plain[1] = (unsigned char) ((seqnum >> 16) & 0xff);
plain[2] = (unsigned char) ((seqnum >> 8) & 0xff);
@@ -68,22 +71,28 @@
code = krb5_encrypt(cipher, cksum, plain, buf, 8);
out:
- crypto_free_skcipher(cipher);
+ kfree(plain);
+ crypto_free_sync_skcipher(cipher);
return code;
}
s32
krb5_make_seq_num(struct krb5_ctx *kctx,
- struct crypto_skcipher *key,
+ struct crypto_sync_skcipher *key,
int direction,
u32 seqnum,
unsigned char *cksum, unsigned char *buf)
{
- unsigned char plain[8];
+ unsigned char *plain;
+ s32 code;
if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC)
return krb5_make_rc4_seq_num(kctx, direction, seqnum,
cksum, buf);
+ plain = kmalloc(8, GFP_NOFS);
+ if (!plain)
+ return -ENOMEM;
+
plain[0] = (unsigned char) (seqnum & 0xff);
plain[1] = (unsigned char) ((seqnum >> 8) & 0xff);
plain[2] = (unsigned char) ((seqnum >> 16) & 0xff);
@@ -94,20 +103,21 @@
plain[6] = direction;
plain[7] = direction;
- return krb5_encrypt(key, cksum, plain, buf, 8);
+ code = krb5_encrypt(key, cksum, plain, buf, 8);
+ kfree(plain);
+ return code;
}
static s32
krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum,
unsigned char *buf, int *direction, s32 *seqnum)
{
- struct crypto_skcipher *cipher;
- unsigned char plain[8];
+ struct crypto_sync_skcipher *cipher;
+ unsigned char *plain;
s32 code;
dprintk("RPC: %s:\n", __func__);
- cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0,
- CRYPTO_ALG_ASYNC);
+ cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, 0, 0);
if (IS_ERR(cipher))
return PTR_ERR(cipher);
@@ -115,22 +125,30 @@
if (code)
goto out;
+ plain = kmalloc(8, GFP_NOFS);
+ if (!plain) {
+ code = -ENOMEM;
+ goto out;
+ }
+
code = krb5_decrypt(cipher, cksum, buf, plain, 8);
if (code)
- goto out;
+ goto out_plain;
if ((plain[4] != plain[5]) || (plain[4] != plain[6])
|| (plain[4] != plain[7])) {
code = (s32)KG_BAD_SEQ;
- goto out;
+ goto out_plain;
}
*direction = plain[4];
*seqnum = ((plain[0] << 24) | (plain[1] << 16) |
(plain[2] << 8) | (plain[3]));
+out_plain:
+ kfree(plain);
out:
- crypto_free_skcipher(cipher);
+ crypto_free_sync_skcipher(cipher);
return code;
}
@@ -141,26 +159,33 @@
int *direction, u32 *seqnum)
{
s32 code;
- unsigned char plain[8];
- struct crypto_skcipher *key = kctx->seq;
+ unsigned char *plain;
+ struct crypto_sync_skcipher *key = kctx->seq;
dprintk("RPC: krb5_get_seq_num:\n");
if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC)
return krb5_get_rc4_seq_num(kctx, cksum, buf,
direction, seqnum);
+ plain = kmalloc(8, GFP_NOFS);
+ if (!plain)
+ return -ENOMEM;
if ((code = krb5_decrypt(key, cksum, buf, plain, 8)))
- return code;
+ goto out;
if ((plain[4] != plain[5]) || (plain[4] != plain[6]) ||
- (plain[4] != plain[7]))
- return (s32)KG_BAD_SEQ;
+ (plain[4] != plain[7])) {
+ code = (s32)KG_BAD_SEQ;
+ goto out;
+ }
*direction = plain[4];
*seqnum = ((plain[0]) |
(plain[1] << 8) | (plain[2] << 16) | (plain[3] << 24));
- return 0;
+out:
+ kfree(plain);
+ return code;
}
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 39a2e67..14a0aff 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -174,7 +174,7 @@
now = get_seconds();
- blocksize = crypto_skcipher_blocksize(kctx->enc);
+ blocksize = crypto_sync_skcipher_blocksize(kctx->enc);
gss_krb5_add_padding(buf, offset, blocksize);
BUG_ON((buf->len - offset) % blocksize);
plainlen = conflen + buf->len - offset;
@@ -228,9 +228,7 @@
memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len);
- spin_lock(&krb5_seq_lock);
- seq_send = kctx->seq_send++;
- spin_unlock(&krb5_seq_lock);
+ seq_send = atomic_fetch_inc(&kctx->seq_send);
/* XXX would probably be more efficient to compute checksum
* and encrypt at the same time: */
@@ -239,10 +237,10 @@
return GSS_S_FAILURE;
if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {
- struct crypto_skcipher *cipher;
+ struct crypto_sync_skcipher *cipher;
int err;
- cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0,
- CRYPTO_ALG_ASYNC);
+ cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name,
+ 0, 0);
if (IS_ERR(cipher))
return GSS_S_FAILURE;
@@ -250,7 +248,7 @@
err = gss_encrypt_xdr_buf(cipher, buf,
offset + headlen - conflen, pages);
- crypto_free_skcipher(cipher);
+ crypto_free_sync_skcipher(cipher);
if (err)
return GSS_S_FAILURE;
} else {
@@ -327,18 +325,18 @@
return GSS_S_BAD_SIG;
if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {
- struct crypto_skcipher *cipher;
+ struct crypto_sync_skcipher *cipher;
int err;
- cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0,
- CRYPTO_ALG_ASYNC);
+ cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name,
+ 0, 0);
if (IS_ERR(cipher))
return GSS_S_FAILURE;
krb5_rc4_setup_enc_key(kctx, cipher, seqnum);
err = gss_decrypt_xdr_buf(cipher, buf, crypt_offset);
- crypto_free_skcipher(cipher);
+ crypto_free_sync_skcipher(cipher);
if (err)
return GSS_S_DEFECTIVE_TOKEN;
} else {
@@ -371,7 +369,7 @@
/* Copy the data back to the right position. XXX: Would probably be
* better to copy and encrypt at the same time. */
- blocksize = crypto_skcipher_blocksize(kctx->enc);
+ blocksize = crypto_sync_skcipher_blocksize(kctx->enc);
data_start = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) +
conflen;
orig_start = buf->head[0].iov_base + offset;
@@ -477,9 +475,7 @@
*be16ptr++ = 0;
be64ptr = (__be64 *)be16ptr;
- spin_lock(&krb5_seq_lock);
- *be64ptr = cpu_to_be64(kctx->seq_send64++);
- spin_unlock(&krb5_seq_lock);
+ *be64ptr = cpu_to_be64(atomic64_fetch_inc(&kctx->seq_send64));
err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages);
if (err)
@@ -574,14 +570,16 @@
*/
movelen = min_t(unsigned int, buf->head[0].iov_len, buf->len);
movelen -= offset + GSS_KRB5_TOK_HDR_LEN + headskip;
- BUG_ON(offset + GSS_KRB5_TOK_HDR_LEN + headskip + movelen >
- buf->head[0].iov_len);
+ if (offset + GSS_KRB5_TOK_HDR_LEN + headskip + movelen >
+ buf->head[0].iov_len)
+ return GSS_S_FAILURE;
memmove(ptr, ptr + GSS_KRB5_TOK_HDR_LEN + headskip, movelen);
buf->head[0].iov_len -= GSS_KRB5_TOK_HDR_LEN + headskip;
buf->len -= GSS_KRB5_TOK_HDR_LEN + headskip;
/* Trim off the trailing "extra count" and checksum blob */
- xdr_buf_trim(buf, ec + GSS_KRB5_TOK_HDR_LEN + tailskip);
+ buf->len -= ec + GSS_KRB5_TOK_HDR_LEN + tailskip;
+
return GSS_S_COMPLETE;
}
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 5fec3ab..8206009 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: BSD-3-Clause
/*
* linux/net/sunrpc/gss_mech_switch.c
*
@@ -5,32 +6,6 @@
* All rights reserved.
*
* J. Bruce Fields <bfields@umich.edu>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
*/
#include <linux/types.h>
@@ -117,7 +92,7 @@
if (status)
return status;
spin_lock(®istered_mechs_lock);
- list_add(&gm->gm_list, ®istered_mechs);
+ list_add_rcu(&gm->gm_list, ®istered_mechs);
spin_unlock(®istered_mechs_lock);
dprintk("RPC: registered gss mechanism %s\n", gm->gm_name);
return 0;
@@ -132,7 +107,7 @@
void gss_mech_unregister(struct gss_api_mech *gm)
{
spin_lock(®istered_mechs_lock);
- list_del(&gm->gm_list);
+ list_del_rcu(&gm->gm_list);
spin_unlock(®istered_mechs_lock);
dprintk("RPC: unregistered gss mechanism %s\n", gm->gm_name);
gss_mech_free(gm);
@@ -151,15 +126,15 @@
{
struct gss_api_mech *pos, *gm = NULL;
- spin_lock(®istered_mechs_lock);
- list_for_each_entry(pos, ®istered_mechs, gm_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pos, ®istered_mechs, gm_list) {
if (0 == strcmp(name, pos->gm_name)) {
if (try_module_get(pos->gm_owner))
gm = pos;
break;
}
}
- spin_unlock(®istered_mechs_lock);
+ rcu_read_unlock();
return gm;
}
@@ -186,8 +161,8 @@
dprintk("RPC: %s(%s)\n", __func__, buf);
request_module("rpc-auth-gss-%s", buf);
- spin_lock(®istered_mechs_lock);
- list_for_each_entry(pos, ®istered_mechs, gm_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pos, ®istered_mechs, gm_list) {
if (obj->len == pos->gm_oid.len) {
if (0 == memcmp(obj->data, pos->gm_oid.data, obj->len)) {
if (try_module_get(pos->gm_owner))
@@ -196,7 +171,7 @@
}
}
}
- spin_unlock(®istered_mechs_lock);
+ rcu_read_unlock();
return gm;
}
@@ -216,15 +191,15 @@
{
struct gss_api_mech *gm = NULL, *pos;
- spin_lock(®istered_mechs_lock);
- list_for_each_entry(pos, ®istered_mechs, gm_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pos, ®istered_mechs, gm_list) {
if (!mech_supports_pseudoflavor(pos, pseudoflavor))
continue;
if (try_module_get(pos->gm_owner))
gm = pos;
break;
}
- spin_unlock(®istered_mechs_lock);
+ rcu_read_unlock();
return gm;
}
@@ -244,7 +219,7 @@
/**
* gss_mech_list_pseudoflavors - Discover registered GSS pseudoflavors
- * @array: array to fill in
+ * @array_ptr: array to fill in
* @size: size of "array"
*
* Returns the number of array items filled in, or a negative errno.
@@ -257,8 +232,8 @@
struct gss_api_mech *pos = NULL;
int j, i = 0;
- spin_lock(®istered_mechs_lock);
- list_for_each_entry(pos, ®istered_mechs, gm_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pos, ®istered_mechs, gm_list) {
for (j = 0; j < pos->gm_pf_num; j++) {
if (i >= size) {
spin_unlock(®istered_mechs_lock);
@@ -267,7 +242,7 @@
array_ptr[i++] = pos->gm_pfs[j].pseudoflavor;
}
}
- spin_unlock(®istered_mechs_lock);
+ rcu_read_unlock();
return i;
}
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
index 73dcda0..0349f45 100644
--- a/net/sunrpc/auth_gss/gss_rpc_upcall.c
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -1,21 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* linux/net/sunrpc/gss_rpc_upcall.c
*
* Copyright (C) 2012 Simo Sorce <simo@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/types.h>
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.h b/net/sunrpc/auth_gss/gss_rpc_upcall.h
index 1e542ad..31e9634 100644
--- a/net/sunrpc/auth_gss/gss_rpc_upcall.h
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.h
@@ -1,21 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* linux/net/sunrpc/gss_rpc_upcall.h
*
* Copyright (C) 2012 Simo Sorce <simo@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _GSS_RPC_UPCALL_H
@@ -45,4 +32,5 @@
void init_gssp_clnt(struct sunrpc_net *);
int set_gssp_clnt(struct net *);
void clear_gssp_clnt(struct sunrpc_net *);
+
#endif /* _GSS_RPC_UPCALL_H */
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index 444380f..2ff7b70 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -1,21 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* GSS Proxy upcall module
*
* Copyright (C) 2012 Simo Sorce <simo@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/sunrpc/svcauth.h>
@@ -784,6 +771,7 @@
xdr_inline_pages(&req->rq_rcv_buf,
PAGE_SIZE/2 /* pretty arbitrary */,
arg->pages, 0 /* page base */, arg->npages * PAGE_SIZE);
+ req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
done:
if (err)
dprintk("RPC: gssx_enc_accept_sec_context: %d\n", err);
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.h b/net/sunrpc/auth_gss/gss_rpc_xdr.h
index 146c310..3f17411 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.h
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.h
@@ -1,21 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* GSS Proxy upcall module
*
* Copyright (C) 2012 Simo Sorce <simo@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _LINUX_GSS_RPC_XDR_H
@@ -262,6 +249,4 @@
#define GSSX_ARG_wrap_size_limit_sz 0
#define GSSX_RES_wrap_size_limit_sz 0
-
-
#endif /* _LINUX_GSS_RPC_XDR_H */
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 860f2a1..8be2f20 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Neil Brown <neilb@cse.unsw.edu.au>
* J. Bruce Fields <bfields@umich.edu>
@@ -76,6 +77,7 @@
struct xdr_netobj in_handle, in_token;
struct xdr_netobj out_handle, out_token;
int major_status, minor_status;
+ struct rcu_head rcu_head;
};
static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct rsi *old);
@@ -89,11 +91,19 @@
kfree(rsii->out_token.data);
}
+static void rsi_free_rcu(struct rcu_head *head)
+{
+ struct rsi *rsii = container_of(head, struct rsi, rcu_head);
+
+ rsi_free(rsii);
+ kfree(rsii);
+}
+
static void rsi_put(struct kref *ref)
{
struct rsi *rsii = container_of(ref, struct rsi, h.ref);
- rsi_free(rsii);
- kfree(rsii);
+
+ call_rcu(&rsii->rcu_head, rsi_free_rcu);
}
static inline int rsi_hash(struct rsi *item)
@@ -282,7 +292,7 @@
struct cache_head *ch;
int hash = rsi_hash(item);
- ch = sunrpc_cache_lookup(cd, &item->h, hash);
+ ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash);
if (ch)
return container_of(ch, struct rsi, h);
else
@@ -330,6 +340,7 @@
struct svc_cred cred;
struct gss_svc_seq_data seqdata;
struct gss_ctx *mechctx;
+ struct rcu_head rcu_head;
};
static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old);
@@ -343,12 +354,22 @@
free_svc_cred(&rsci->cred);
}
+static void rsc_free_rcu(struct rcu_head *head)
+{
+ struct rsc *rsci = container_of(head, struct rsc, rcu_head);
+
+ kfree(rsci->handle.data);
+ kfree(rsci);
+}
+
static void rsc_put(struct kref *ref)
{
struct rsc *rsci = container_of(ref, struct rsc, h.ref);
- rsc_free(rsci);
- kfree(rsci);
+ if (rsci->mechctx)
+ gss_delete_sec_context(&rsci->mechctx);
+ free_svc_cred(&rsci->cred);
+ call_rcu(&rsci->rcu_head, rsc_free_rcu);
}
static inline int
@@ -453,12 +474,12 @@
* treatment so are checked for validity here.)
*/
/* uid */
- rsci.cred.cr_uid = make_kuid(&init_user_ns, id);
+ rsci.cred.cr_uid = make_kuid(current_user_ns(), id);
/* gid */
if (get_int(&mesg, &id))
goto out;
- rsci.cred.cr_gid = make_kgid(&init_user_ns, id);
+ rsci.cred.cr_gid = make_kgid(current_user_ns(), id);
/* number of additional gid's */
if (get_int(&mesg, &N))
@@ -476,7 +497,7 @@
kgid_t kgid;
if (get_int(&mesg, &id))
goto out;
- kgid = make_kgid(&init_user_ns, id);
+ kgid = make_kgid(current_user_ns(), id);
if (!gid_valid(kgid))
goto out;
rsci.cred.cr_group_info->gid[i] = kgid;
@@ -542,7 +563,7 @@
struct cache_head *ch;
int hash = rsc_hash(item);
- ch = sunrpc_cache_lookup(cd, &item->h, hash);
+ ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash);
if (ch)
return container_of(ch, struct rsc, h);
else
@@ -876,7 +897,7 @@
if (svc_getnl(&buf->head[0]) != seq)
goto out;
/* trim off the mic and padding at the end before returning */
- xdr_buf_trim(buf, round_up_to_quad(mic.len) + 4);
+ buf->len -= 4 + round_up_to_quad(mic.len);
stat = 0;
out:
kfree(mic.data);
@@ -1122,7 +1143,7 @@
struct kvec *resv = &rqstp->rq_res.head[0];
struct rsi *rsip, rsikey;
int ret;
- struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id);
+ struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
memset(&rsikey, 0, sizeof(rsikey));
ret = gss_read_verf(gc, argv, authp,
@@ -1233,7 +1254,7 @@
uint64_t handle;
int status;
int ret;
- struct net *net = rqstp->rq_xprt->xpt_net;
+ struct net *net = SVC_NET(rqstp);
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
memset(&ud, 0, sizeof(ud));
@@ -1424,7 +1445,7 @@
__be32 *rpcstart;
__be32 *reject_stat = resv->iov_base + resv->iov_len;
int ret;
- struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id);
+ struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
dprintk("RPC: svcauth_gss: argv->iov_len = %zd\n",
argv->iov_len);
@@ -1714,7 +1735,7 @@
struct rpc_gss_wire_cred *gc = &gsd->clcred;
struct xdr_buf *resbuf = &rqstp->rq_res;
int stat = -EINVAL;
- struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id);
+ struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
if (gc->gc_proc != RPC_GSS_PROC_DATA)
goto out;
@@ -1764,14 +1785,21 @@
}
static void
-svcauth_gss_domain_release(struct auth_domain *dom)
+svcauth_gss_domain_release_rcu(struct rcu_head *head)
{
+ struct auth_domain *dom = container_of(head, struct auth_domain, rcu_head);
struct gss_domain *gd = container_of(dom, struct gss_domain, h);
kfree(dom->name);
kfree(gd);
}
+static void
+svcauth_gss_domain_release(struct auth_domain *dom)
+{
+ call_rcu(&dom->rcu_head, svcauth_gss_domain_release_rcu);
+}
+
static struct auth_ops svcauthops_gss = {
.name = "rpcsec_gss",
.owner = THIS_MODULE,
diff --git a/net/sunrpc/auth_gss/trace.c b/net/sunrpc/auth_gss/trace.c
new file mode 100644
index 0000000..5576f1e
--- /dev/null
+++ b/net/sunrpc/auth_gss/trace.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018, 2019 Oracle. All rights reserved.
+ */
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/gss_err.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/rpcgss.h>
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 4b48228..41a633a 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -21,7 +21,7 @@
static struct rpc_auth *
nul_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
{
- atomic_inc(&null_auth.au_count);
+ refcount_inc(&null_auth.au_count);
return &null_auth;
}
@@ -36,8 +36,6 @@
static struct rpc_cred *
nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
{
- if (flags & RPCAUTH_LOOKUP_RCU)
- return &null_cred;
return get_rpccred(&null_cred);
}
@@ -61,15 +59,21 @@
/*
* Marshal credential.
*/
-static __be32 *
-nul_marshal(struct rpc_task *task, __be32 *p)
+static int
+nul_marshal(struct rpc_task *task, struct xdr_stream *xdr)
{
- *p++ = htonl(RPC_AUTH_NULL);
- *p++ = 0;
- *p++ = htonl(RPC_AUTH_NULL);
- *p++ = 0;
+ __be32 *p;
- return p;
+ p = xdr_reserve_space(xdr, 4 * sizeof(*p));
+ if (!p)
+ return -EMSGSIZE;
+ /* Credential */
+ *p++ = rpc_auth_null;
+ *p++ = xdr_zero;
+ /* Verifier */
+ *p++ = rpc_auth_null;
+ *p = xdr_zero;
+ return 0;
}
/*
@@ -82,25 +86,19 @@
return 0;
}
-static __be32 *
-nul_validate(struct rpc_task *task, __be32 *p)
+static int
+nul_validate(struct rpc_task *task, struct xdr_stream *xdr)
{
- rpc_authflavor_t flavor;
- u32 size;
+ __be32 *p;
- flavor = ntohl(*p++);
- if (flavor != RPC_AUTH_NULL) {
- printk("RPC: bad verf flavor: %u\n", flavor);
- return ERR_PTR(-EIO);
- }
-
- size = ntohl(*p++);
- if (size != 0) {
- printk("RPC: bad verf size: %u\n", size);
- return ERR_PTR(-EIO);
- }
-
- return p;
+ p = xdr_inline_decode(xdr, 2 * sizeof(*p));
+ if (!p)
+ return -EIO;
+ if (*p++ != rpc_auth_null)
+ return -EIO;
+ if (*p != xdr_zero)
+ return -EIO;
+ return 0;
}
const struct rpc_authops authnull_ops = {
@@ -116,21 +114,23 @@
struct rpc_auth null_auth = {
.au_cslack = NUL_CALLSLACK,
.au_rslack = NUL_REPLYSLACK,
- .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
+ .au_verfsize = NUL_REPLYSLACK,
+ .au_ralign = NUL_REPLYSLACK,
.au_ops = &authnull_ops,
.au_flavor = RPC_AUTH_NULL,
- .au_count = ATOMIC_INIT(0),
+ .au_count = REFCOUNT_INIT(1),
};
static
const struct rpc_credops null_credops = {
.cr_name = "AUTH_NULL",
.crdestroy = nul_destroy_cred,
- .crbind = rpcauth_generic_bind_cred,
.crmatch = nul_match,
.crmarshal = nul_marshal,
+ .crwrap_req = rpcauth_wrap_req_encode,
.crrefresh = nul_refresh,
.crvalidate = nul_validate,
+ .crunwrap_resp = rpcauth_unwrap_resp_decode,
};
static
@@ -138,6 +138,6 @@
.cr_lru = LIST_HEAD_INIT(null_cred.cr_lru),
.cr_auth = &null_auth,
.cr_ops = &null_credops,
- .cr_count = ATOMIC_INIT(1),
+ .cr_count = REFCOUNT_INIT(2),
.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE,
};
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 185e56d..e7df1f7 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -11,16 +11,11 @@
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/module.h>
+#include <linux/mempool.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/auth.h>
#include <linux/user_namespace.h>
-struct unx_cred {
- struct rpc_cred uc_base;
- kgid_t uc_gid;
- kgid_t uc_gids[UNX_NGROUPS];
-};
-#define uc_uid uc_base.cr_uid
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY RPCDBG_AUTH
@@ -28,29 +23,18 @@
static struct rpc_auth unix_auth;
static const struct rpc_credops unix_credops;
+static mempool_t *unix_pool;
static struct rpc_auth *
unx_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
{
- dprintk("RPC: creating UNIX authenticator for client %p\n",
- clnt);
- atomic_inc(&unix_auth.au_count);
+ refcount_inc(&unix_auth.au_count);
return &unix_auth;
}
static void
unx_destroy(struct rpc_auth *auth)
{
- dprintk("RPC: destroying UNIX authenticator %p\n", auth);
- rpcauth_clear_credcache(auth->au_credcache);
-}
-
-static int
-unx_hash_cred(struct auth_cred *acred, unsigned int hashbits)
-{
- return hash_64(from_kgid(&init_user_ns, acred->gid) |
- ((u64)from_kuid(&init_user_ns, acred->uid) <<
- (sizeof(gid_t) * 8)), hashbits);
}
/*
@@ -59,52 +43,20 @@
static struct rpc_cred *
unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
{
- return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
-}
+ struct rpc_cred *ret = mempool_alloc(unix_pool, GFP_NOFS);
-static struct rpc_cred *
-unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
-{
- struct unx_cred *cred;
- unsigned int groups = 0;
- unsigned int i;
-
- dprintk("RPC: allocating UNIX cred for uid %d gid %d\n",
- from_kuid(&init_user_ns, acred->uid),
- from_kgid(&init_user_ns, acred->gid));
-
- if (!(cred = kmalloc(sizeof(*cred), gfp)))
- return ERR_PTR(-ENOMEM);
-
- rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops);
- cred->uc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE;
-
- if (acred->group_info != NULL)
- groups = acred->group_info->ngroups;
- if (groups > UNX_NGROUPS)
- groups = UNX_NGROUPS;
-
- cred->uc_gid = acred->gid;
- for (i = 0; i < groups; i++)
- cred->uc_gids[i] = acred->group_info->gid[i];
- if (i < UNX_NGROUPS)
- cred->uc_gids[i] = INVALID_GID;
-
- return &cred->uc_base;
-}
-
-static void
-unx_free_cred(struct unx_cred *unx_cred)
-{
- dprintk("RPC: unx_free_cred %p\n", unx_cred);
- kfree(unx_cred);
+ rpcauth_init_cred(ret, acred, auth, &unix_credops);
+ ret->cr_flags = 1UL << RPCAUTH_CRED_UPTODATE;
+ return ret;
}
static void
unx_free_cred_callback(struct rcu_head *head)
{
- struct unx_cred *unx_cred = container_of(head, struct unx_cred, uc_base.cr_rcu);
- unx_free_cred(unx_cred);
+ struct rpc_cred *rpc_cred = container_of(head, struct rpc_cred, cr_rcu);
+
+ put_cred(rpc_cred->cr_cred);
+ mempool_free(rpc_cred, unix_pool);
}
static void
@@ -114,30 +66,32 @@
}
/*
- * Match credentials against current process creds.
- * The root_override argument takes care of cases where the caller may
- * request root creds (e.g. for NFS swapping).
+ * Match credentials against current the auth_cred.
*/
static int
-unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags)
+unx_match(struct auth_cred *acred, struct rpc_cred *cred, int flags)
{
- struct unx_cred *cred = container_of(rcred, struct unx_cred, uc_base);
unsigned int groups = 0;
unsigned int i;
+ if (cred->cr_cred == acred->cred)
+ return 1;
- if (!uid_eq(cred->uc_uid, acred->uid) || !gid_eq(cred->uc_gid, acred->gid))
+ if (!uid_eq(cred->cr_cred->fsuid, acred->cred->fsuid) || !gid_eq(cred->cr_cred->fsgid, acred->cred->fsgid))
return 0;
- if (acred->group_info != NULL)
- groups = acred->group_info->ngroups;
+ if (acred->cred->group_info != NULL)
+ groups = acred->cred->group_info->ngroups;
if (groups > UNX_NGROUPS)
groups = UNX_NGROUPS;
- for (i = 0; i < groups ; i++)
- if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i]))
- return 0;
- if (groups < UNX_NGROUPS && gid_valid(cred->uc_gids[groups]))
+ if (cred->cr_cred->group_info == NULL)
+ return groups == 0;
+ if (groups != cred->cr_cred->group_info->ngroups)
return 0;
+
+ for (i = 0; i < groups ; i++)
+ if (!gid_eq(cred->cr_cred->group_info->gid[i], acred->cred->group_info->gid[i]))
+ return 0;
return 1;
}
@@ -145,35 +99,56 @@
* Marshal credentials.
* Maybe we should keep a cached credential for performance reasons.
*/
-static __be32 *
-unx_marshal(struct rpc_task *task, __be32 *p)
+static int
+unx_marshal(struct rpc_task *task, struct xdr_stream *xdr)
{
struct rpc_clnt *clnt = task->tk_client;
- struct unx_cred *cred = container_of(task->tk_rqstp->rq_cred, struct unx_cred, uc_base);
- __be32 *base, *hold;
+ struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+ __be32 *p, *cred_len, *gidarr_len;
int i;
+ struct group_info *gi = cred->cr_cred->group_info;
+ struct user_namespace *userns = clnt->cl_cred ?
+ clnt->cl_cred->user_ns : &init_user_ns;
- *p++ = htonl(RPC_AUTH_UNIX);
- base = p++;
- *p++ = htonl(jiffies/HZ);
+ /* Credential */
- /*
- * Copy the UTS nodename captured when the client was created.
- */
- p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen);
+ p = xdr_reserve_space(xdr, 3 * sizeof(*p));
+ if (!p)
+ goto marshal_failed;
+ *p++ = rpc_auth_unix;
+ cred_len = p++;
+ *p++ = xdr_zero; /* stamp */
+ if (xdr_stream_encode_opaque(xdr, clnt->cl_nodename,
+ clnt->cl_nodelen) < 0)
+ goto marshal_failed;
+ p = xdr_reserve_space(xdr, 3 * sizeof(*p));
+ if (!p)
+ goto marshal_failed;
+ *p++ = cpu_to_be32(from_kuid_munged(userns, cred->cr_cred->fsuid));
+ *p++ = cpu_to_be32(from_kgid_munged(userns, cred->cr_cred->fsgid));
- *p++ = htonl((u32) from_kuid(&init_user_ns, cred->uc_uid));
- *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gid));
- hold = p++;
- for (i = 0; i < UNX_NGROUPS && gid_valid(cred->uc_gids[i]); i++)
- *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gids[i]));
- *hold = htonl(p - hold - 1); /* gid array length */
- *base = htonl((p - base - 1) << 2); /* cred length */
+ gidarr_len = p++;
+ if (gi)
+ for (i = 0; i < UNX_NGROUPS && i < gi->ngroups; i++)
+ *p++ = cpu_to_be32(from_kgid_munged(userns, gi->gid[i]));
+ *gidarr_len = cpu_to_be32(p - gidarr_len - 1);
+ *cred_len = cpu_to_be32((p - cred_len - 1) << 2);
+ p = xdr_reserve_space(xdr, (p - gidarr_len - 1) << 2);
+ if (!p)
+ goto marshal_failed;
- *p++ = htonl(RPC_AUTH_NULL);
- *p++ = htonl(0);
+ /* Verifier */
- return p;
+ p = xdr_reserve_space(xdr, 2 * sizeof(*p));
+ if (!p)
+ goto marshal_failed;
+ *p++ = rpc_auth_null;
+ *p = xdr_zero;
+
+ return 0;
+
+marshal_failed:
+ return -EMSGSIZE;
}
/*
@@ -186,39 +161,46 @@
return 0;
}
-static __be32 *
-unx_validate(struct rpc_task *task, __be32 *p)
+static int
+unx_validate(struct rpc_task *task, struct xdr_stream *xdr)
{
- rpc_authflavor_t flavor;
- u32 size;
+ struct rpc_auth *auth = task->tk_rqstp->rq_cred->cr_auth;
+ __be32 *p;
+ u32 size;
- flavor = ntohl(*p++);
- if (flavor != RPC_AUTH_NULL &&
- flavor != RPC_AUTH_UNIX &&
- flavor != RPC_AUTH_SHORT) {
- printk("RPC: bad verf flavor: %u\n", flavor);
- return ERR_PTR(-EIO);
+ p = xdr_inline_decode(xdr, 2 * sizeof(*p));
+ if (!p)
+ return -EIO;
+ switch (*p++) {
+ case rpc_auth_null:
+ case rpc_auth_unix:
+ case rpc_auth_short:
+ break;
+ default:
+ return -EIO;
}
+ size = be32_to_cpup(p);
+ if (size > RPC_MAX_AUTH_SIZE)
+ return -EIO;
+ p = xdr_inline_decode(xdr, size);
+ if (!p)
+ return -EIO;
- size = ntohl(*p++);
- if (size > RPC_MAX_AUTH_SIZE) {
- printk("RPC: giant verf size: %u\n", size);
- return ERR_PTR(-EIO);
- }
- task->tk_rqstp->rq_cred->cr_auth->au_rslack = (size >> 2) + 2;
- p += (size >> 2);
-
- return p;
+ auth->au_verfsize = XDR_QUADLEN(size) + 2;
+ auth->au_rslack = XDR_QUADLEN(size) + 2;
+ auth->au_ralign = XDR_QUADLEN(size) + 2;
+ return 0;
}
int __init rpc_init_authunix(void)
{
- return rpcauth_init_credcache(&unix_auth);
+ unix_pool = mempool_create_kmalloc_pool(16, sizeof(struct rpc_cred));
+ return unix_pool ? 0 : -ENOMEM;
}
void rpc_destroy_authunix(void)
{
- rpcauth_destroy_credcache(&unix_auth);
+ mempool_destroy(unix_pool);
}
const struct rpc_authops authunix_ops = {
@@ -227,28 +209,27 @@
.au_name = "UNIX",
.create = unx_create,
.destroy = unx_destroy,
- .hash_cred = unx_hash_cred,
.lookup_cred = unx_lookup_cred,
- .crcreate = unx_create_cred,
};
static
struct rpc_auth unix_auth = {
.au_cslack = UNX_CALLSLACK,
.au_rslack = NUL_REPLYSLACK,
- .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
+ .au_verfsize = NUL_REPLYSLACK,
.au_ops = &authunix_ops,
.au_flavor = RPC_AUTH_UNIX,
- .au_count = ATOMIC_INIT(0),
+ .au_count = REFCOUNT_INIT(1),
};
static
const struct rpc_credops unix_credops = {
.cr_name = "AUTH_UNIX",
.crdestroy = unx_destroy_cred,
- .crbind = rpcauth_generic_bind_cred,
.crmatch = unx_match,
.crmarshal = unx_marshal,
+ .crwrap_req = rpcauth_wrap_req_encode,
.crrefresh = unx_refresh,
.crvalidate = unx_validate,
+ .crunwrap_resp = rpcauth_unwrap_resp_decode,
};
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 3c15a99..195b40c 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -31,25 +31,20 @@
#define RPCDBG_FACILITY RPCDBG_TRANS
#endif
+#define BC_MAX_SLOTS 64U
+
+unsigned int xprt_bc_max_slots(struct rpc_xprt *xprt)
+{
+ return BC_MAX_SLOTS;
+}
+
/*
* Helper routines that track the number of preallocation elements
* on the transport.
*/
static inline int xprt_need_to_requeue(struct rpc_xprt *xprt)
{
- return xprt->bc_alloc_count < atomic_read(&xprt->bc_free_slots);
-}
-
-static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n)
-{
- atomic_add(n, &xprt->bc_free_slots);
- xprt->bc_alloc_count += n;
-}
-
-static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n)
-{
- atomic_sub(n, &xprt->bc_free_slots);
- return xprt->bc_alloc_count -= n;
+ return xprt->bc_alloc_count < xprt->bc_alloc_max;
}
/*
@@ -91,7 +86,6 @@
return NULL;
req->rq_xprt = xprt;
- INIT_LIST_HEAD(&req->rq_list);
INIT_LIST_HEAD(&req->rq_bc_list);
/* Preallocate one XDR receive buffer */
@@ -146,6 +140,9 @@
dprintk("RPC: setup backchannel transport\n");
+ if (min_reqs > BC_MAX_SLOTS)
+ min_reqs = BC_MAX_SLOTS;
+
/*
* We use a temporary list to keep track of the preallocated
* buffers. Once we're done building the list we splice it
@@ -173,7 +170,9 @@
*/
spin_lock(&xprt->bc_pa_lock);
list_splice(&tmp_list, &xprt->bc_pa_list);
- xprt_inc_alloc_count(xprt, min_reqs);
+ xprt->bc_alloc_count += min_reqs;
+ xprt->bc_alloc_max += min_reqs;
+ atomic_add(min_reqs, &xprt->bc_slot_count);
spin_unlock(&xprt->bc_pa_lock);
dprintk("RPC: setup backchannel transport done\n");
@@ -198,7 +197,7 @@
/**
* xprt_destroy_backchannel - Destroys the backchannel preallocated structures.
* @xprt: the transport holding the preallocated strucures
- * @max_reqs the maximum number of preallocated structures to destroy
+ * @max_reqs: the maximum number of preallocated structures to destroy
*
* Since these structures may have been allocated by multiple calls
* to xprt_setup_backchannel, we only destroy up to the maximum number
@@ -221,11 +220,13 @@
goto out;
spin_lock_bh(&xprt->bc_pa_lock);
- xprt_dec_alloc_count(xprt, max_reqs);
+ xprt->bc_alloc_max -= min(max_reqs, xprt->bc_alloc_max);
list_for_each_entry_safe(req, tmp, &xprt->bc_pa_list, rq_bc_pa_list) {
dprintk("RPC: req=%p\n", req);
list_del(&req->rq_bc_pa_list);
xprt_free_allocation(req);
+ xprt->bc_alloc_count--;
+ atomic_dec(&xprt->bc_slot_count);
if (--max_reqs == 0)
break;
}
@@ -236,30 +237,30 @@
list_empty(&xprt->bc_pa_list) ? "true" : "false");
}
-static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid)
+static struct rpc_rqst *xprt_get_bc_request(struct rpc_xprt *xprt, __be32 xid,
+ struct rpc_rqst *new)
{
struct rpc_rqst *req = NULL;
dprintk("RPC: allocate a backchannel request\n");
- if (atomic_read(&xprt->bc_free_slots) <= 0)
- goto not_found;
if (list_empty(&xprt->bc_pa_list)) {
- req = xprt_alloc_bc_req(xprt, GFP_ATOMIC);
- if (!req)
+ if (!new)
goto not_found;
- list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
+ if (atomic_read(&xprt->bc_slot_count) >= BC_MAX_SLOTS)
+ goto not_found;
+ list_add_tail(&new->rq_bc_pa_list, &xprt->bc_pa_list);
xprt->bc_alloc_count++;
+ atomic_inc(&xprt->bc_slot_count);
}
req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
rq_bc_pa_list);
req->rq_reply_bytes_recvd = 0;
- req->rq_bytes_sent = 0;
memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
sizeof(req->rq_private_buf));
req->rq_xid = xid;
req->rq_connect_cookie = xprt->connect_cookie;
-not_found:
dprintk("RPC: backchannel req=%p\n", req);
+not_found:
return req;
}
@@ -293,6 +294,7 @@
if (xprt_need_to_requeue(xprt)) {
list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
xprt->bc_alloc_count++;
+ atomic_inc(&xprt->bc_slot_count);
req = NULL;
}
spin_unlock_bh(&xprt->bc_pa_lock);
@@ -305,8 +307,8 @@
*/
dprintk("RPC: Last session removed req=%p\n", req);
xprt_free_allocation(req);
- return;
}
+ xprt_put(xprt);
}
/*
@@ -322,18 +324,27 @@
*/
struct rpc_rqst *xprt_lookup_bc_request(struct rpc_xprt *xprt, __be32 xid)
{
- struct rpc_rqst *req;
+ struct rpc_rqst *req, *new = NULL;
- spin_lock(&xprt->bc_pa_lock);
- list_for_each_entry(req, &xprt->bc_pa_list, rq_bc_pa_list) {
- if (req->rq_connect_cookie != xprt->connect_cookie)
- continue;
- if (req->rq_xid == xid)
- goto found;
- }
- req = xprt_alloc_bc_request(xprt, xid);
+ do {
+ spin_lock(&xprt->bc_pa_lock);
+ list_for_each_entry(req, &xprt->bc_pa_list, rq_bc_pa_list) {
+ if (req->rq_connect_cookie != xprt->connect_cookie)
+ continue;
+ if (req->rq_xid == xid)
+ goto found;
+ }
+ req = xprt_get_bc_request(xprt, xid, new);
found:
- spin_unlock(&xprt->bc_pa_lock);
+ spin_unlock(&xprt->bc_pa_lock);
+ if (new) {
+ if (req != new)
+ xprt_free_allocation(new);
+ break;
+ } else if (req)
+ break;
+ new = xprt_alloc_bc_req(xprt, GFP_KERNEL);
+ } while (new);
return req;
}
@@ -350,13 +361,14 @@
spin_lock(&xprt->bc_pa_lock);
list_del(&req->rq_bc_pa_list);
- xprt_dec_alloc_count(xprt, 1);
+ xprt->bc_alloc_count--;
spin_unlock(&xprt->bc_pa_lock);
req->rq_private_buf.len = copied;
set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
dprintk("RPC: add callback request to list\n");
+ xprt_get(xprt);
spin_lock(&bc_serv->sv_cb_lock);
list_add(&req->rq_bc_list, &bc_serv->sv_cb_list);
wake_up(&bc_serv->sv_cb_waitq);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 109fbe5..a349094 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sunrpc/cache.c
*
@@ -5,9 +6,6 @@
* used by sunrpc clients and servers.
*
* Copyright (C) 2002 Neil Brown <neilb@cse.unsw.edu.au>
- *
- * Released under terms in GPL version 2. See COPYING.
- *
*/
#include <linux/types.h>
@@ -40,6 +38,7 @@
static bool cache_defer_req(struct cache_req *req, struct cache_head *item);
static void cache_revisit_request(struct cache_head *item);
+static bool cache_listeners_exist(struct cache_detail *detail);
static void cache_init(struct cache_head *h, struct cache_detail *detail)
{
@@ -54,28 +53,39 @@
h->last_refresh = now;
}
-struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
- struct cache_head *key, int hash)
+static inline int cache_is_valid(struct cache_head *h);
+static void cache_fresh_locked(struct cache_head *head, time_t expiry,
+ struct cache_detail *detail);
+static void cache_fresh_unlocked(struct cache_head *head,
+ struct cache_detail *detail);
+
+static struct cache_head *sunrpc_cache_find_rcu(struct cache_detail *detail,
+ struct cache_head *key,
+ int hash)
{
- struct cache_head *new = NULL, *freeme = NULL, *tmp = NULL;
- struct hlist_head *head;
+ struct hlist_head *head = &detail->hash_table[hash];
+ struct cache_head *tmp;
- head = &detail->hash_table[hash];
-
- read_lock(&detail->hash_lock);
-
- hlist_for_each_entry(tmp, head, cache_list) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp, head, cache_list) {
if (detail->match(tmp, key)) {
if (cache_is_expired(detail, tmp))
- /* This entry is expired, we will discard it. */
- break;
- cache_get(tmp);
- read_unlock(&detail->hash_lock);
+ continue;
+ tmp = cache_get_rcu(tmp);
+ rcu_read_unlock();
return tmp;
}
}
- read_unlock(&detail->hash_lock);
- /* Didn't find anything, insert an empty entry */
+ rcu_read_unlock();
+ return NULL;
+}
+
+static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
+ struct cache_head *key,
+ int hash)
+{
+ struct cache_head *new, *tmp, *freeme = NULL;
+ struct hlist_head *head = &detail->hash_table[hash];
new = detail->alloc();
if (!new)
@@ -87,35 +97,51 @@
cache_init(new, detail);
detail->init(new, key);
- write_lock(&detail->hash_lock);
+ spin_lock(&detail->hash_lock);
/* check if entry appeared while we slept */
- hlist_for_each_entry(tmp, head, cache_list) {
+ hlist_for_each_entry_rcu(tmp, head, cache_list) {
if (detail->match(tmp, key)) {
if (cache_is_expired(detail, tmp)) {
- hlist_del_init(&tmp->cache_list);
+ hlist_del_init_rcu(&tmp->cache_list);
detail->entries --;
+ if (cache_is_valid(tmp) == -EAGAIN)
+ set_bit(CACHE_NEGATIVE, &tmp->flags);
+ cache_fresh_locked(tmp, 0, detail);
freeme = tmp;
break;
}
cache_get(tmp);
- write_unlock(&detail->hash_lock);
+ spin_unlock(&detail->hash_lock);
cache_put(new, detail);
return tmp;
}
}
- hlist_add_head(&new->cache_list, head);
+ hlist_add_head_rcu(&new->cache_list, head);
detail->entries++;
cache_get(new);
- write_unlock(&detail->hash_lock);
+ spin_unlock(&detail->hash_lock);
- if (freeme)
+ if (freeme) {
+ cache_fresh_unlocked(freeme, detail);
cache_put(freeme, detail);
+ }
return new;
}
-EXPORT_SYMBOL_GPL(sunrpc_cache_lookup);
+struct cache_head *sunrpc_cache_lookup_rcu(struct cache_detail *detail,
+ struct cache_head *key, int hash)
+{
+ struct cache_head *ret;
+
+ ret = sunrpc_cache_find_rcu(detail, key, hash);
+ if (ret)
+ return ret;
+ /* Didn't find anything, insert an empty entry */
+ return sunrpc_cache_add_entry(detail, key, hash);
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_lookup_rcu);
static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch);
@@ -151,18 +177,18 @@
struct cache_head *tmp;
if (!test_bit(CACHE_VALID, &old->flags)) {
- write_lock(&detail->hash_lock);
+ spin_lock(&detail->hash_lock);
if (!test_bit(CACHE_VALID, &old->flags)) {
if (test_bit(CACHE_NEGATIVE, &new->flags))
set_bit(CACHE_NEGATIVE, &old->flags);
else
detail->update(old, new);
cache_fresh_locked(old, new->expiry_time, detail);
- write_unlock(&detail->hash_lock);
+ spin_unlock(&detail->hash_lock);
cache_fresh_unlocked(old, detail);
return old;
}
- write_unlock(&detail->hash_lock);
+ spin_unlock(&detail->hash_lock);
}
/* We need to insert a new entry */
tmp = detail->alloc();
@@ -173,7 +199,7 @@
cache_init(tmp, detail);
detail->init(tmp, old);
- write_lock(&detail->hash_lock);
+ spin_lock(&detail->hash_lock);
if (test_bit(CACHE_NEGATIVE, &new->flags))
set_bit(CACHE_NEGATIVE, &tmp->flags);
else
@@ -183,7 +209,7 @@
cache_get(tmp);
cache_fresh_locked(tmp, new->expiry_time, detail);
cache_fresh_locked(old, 0, detail);
- write_unlock(&detail->hash_lock);
+ spin_unlock(&detail->hash_lock);
cache_fresh_unlocked(tmp, detail);
cache_fresh_unlocked(old, detail);
cache_put(old, detail);
@@ -223,7 +249,7 @@
{
int rv;
- write_lock(&detail->hash_lock);
+ spin_lock(&detail->hash_lock);
rv = cache_is_valid(h);
if (rv == -EAGAIN) {
set_bit(CACHE_NEGATIVE, &h->flags);
@@ -231,7 +257,7 @@
detail);
rv = -ENOENT;
}
- write_unlock(&detail->hash_lock);
+ spin_unlock(&detail->hash_lock);
cache_fresh_unlocked(h, detail);
return rv;
}
@@ -279,7 +305,8 @@
cache_fresh_unlocked(h, detail);
break;
}
- }
+ } else if (!cache_listeners_exist(detail))
+ rv = try_to_negate_entry(detail, h);
}
if (rv == -EAGAIN) {
@@ -341,12 +368,12 @@
void sunrpc_init_cache_detail(struct cache_detail *cd)
{
- rwlock_init(&cd->hash_lock);
+ spin_lock_init(&cd->hash_lock);
INIT_LIST_HEAD(&cd->queue);
spin_lock(&cache_list_lock);
cd->nextcheck = 0;
cd->entries = 0;
- atomic_set(&cd->readers, 0);
+ atomic_set(&cd->writers, 0);
cd->last_close = 0;
cd->last_warn = -1;
list_add(&cd->others, &cache_list);
@@ -361,11 +388,11 @@
{
cache_purge(cd);
spin_lock(&cache_list_lock);
- write_lock(&cd->hash_lock);
+ spin_lock(&cd->hash_lock);
if (current_detail == cd)
current_detail = NULL;
list_del_init(&cd->others);
- write_unlock(&cd->hash_lock);
+ spin_unlock(&cd->hash_lock);
spin_unlock(&cache_list_lock);
if (list_empty(&cache_list)) {
/* module must be being unloaded so its safe to kill the worker */
@@ -422,7 +449,7 @@
struct hlist_head *head;
struct hlist_node *tmp;
- write_lock(¤t_detail->hash_lock);
+ spin_lock(¤t_detail->hash_lock);
/* Ok, now to clean this strand */
@@ -433,13 +460,13 @@
if (!cache_is_expired(current_detail, ch))
continue;
- hlist_del_init(&ch->cache_list);
+ hlist_del_init_rcu(&ch->cache_list);
current_detail->entries--;
rv = 1;
break;
}
- write_unlock(¤t_detail->hash_lock);
+ spin_unlock(¤t_detail->hash_lock);
d = current_detail;
if (!ch)
current_index ++;
@@ -494,9 +521,9 @@
struct hlist_node *tmp = NULL;
int i = 0;
- write_lock(&detail->hash_lock);
+ spin_lock(&detail->hash_lock);
if (!detail->entries) {
- write_unlock(&detail->hash_lock);
+ spin_unlock(&detail->hash_lock);
return;
}
@@ -504,17 +531,17 @@
for (i = 0; i < detail->hash_size; i++) {
head = &detail->hash_table[i];
hlist_for_each_entry_safe(ch, tmp, head, cache_list) {
- hlist_del_init(&ch->cache_list);
+ hlist_del_init_rcu(&ch->cache_list);
detail->entries--;
set_bit(CACHE_CLEANED, &ch->flags);
- write_unlock(&detail->hash_lock);
+ spin_unlock(&detail->hash_lock);
cache_fresh_unlocked(ch, detail);
cache_put(ch, detail);
- write_lock(&detail->hash_lock);
+ spin_lock(&detail->hash_lock);
}
}
- write_unlock(&detail->hash_lock);
+ spin_unlock(&detail->hash_lock);
}
EXPORT_SYMBOL_GPL(cache_purge);
@@ -1002,11 +1029,13 @@
}
rp->offset = 0;
rp->q.reader = 1;
- atomic_inc(&cd->readers);
+
spin_lock(&queue_lock);
list_add(&rp->q.list, &cd->queue);
spin_unlock(&queue_lock);
}
+ if (filp->f_mode & FMODE_WRITE)
+ atomic_inc(&cd->writers);
filp->private_data = rp;
return 0;
}
@@ -1035,8 +1064,10 @@
filp->private_data = NULL;
kfree(rp);
+ }
+ if (filp->f_mode & FMODE_WRITE) {
+ atomic_dec(&cd->writers);
cd->last_close = seconds_since_boot();
- atomic_dec(&cd->readers);
}
module_put(cd->owner);
return 0;
@@ -1144,7 +1175,7 @@
static bool cache_listeners_exist(struct cache_detail *detail)
{
- if (atomic_read(&detail->readers))
+ if (atomic_read(&detail->writers))
return true;
if (detail->last_close == 0)
/* This cache was never opened */
@@ -1289,21 +1320,19 @@
* get a header, then pass each real item in the cache
*/
-void *cache_seq_start(struct seq_file *m, loff_t *pos)
- __acquires(cd->hash_lock)
+static void *__cache_seq_start(struct seq_file *m, loff_t *pos)
{
loff_t n = *pos;
unsigned int hash, entry;
struct cache_head *ch;
struct cache_detail *cd = m->private;
- read_lock(&cd->hash_lock);
if (!n--)
return SEQ_START_TOKEN;
hash = n >> 32;
entry = n & ((1LL<<32) - 1);
- hlist_for_each_entry(ch, &cd->hash_table[hash], cache_list)
+ hlist_for_each_entry_rcu(ch, &cd->hash_table[hash], cache_list)
if (!entry--)
return ch;
n &= ~((1LL<<32) - 1);
@@ -1315,12 +1344,12 @@
if (hash >= cd->hash_size)
return NULL;
*pos = n+1;
- return hlist_entry_safe(cd->hash_table[hash].first,
+ return hlist_entry_safe(rcu_dereference_raw(
+ hlist_first_rcu(&cd->hash_table[hash])),
struct cache_head, cache_list);
}
-EXPORT_SYMBOL_GPL(cache_seq_start);
-void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
+static void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos)
{
struct cache_head *ch = p;
int hash = (*pos >> 32);
@@ -1333,7 +1362,8 @@
*pos += 1LL<<32;
} else {
++*pos;
- return hlist_entry_safe(ch->cache_list.next,
+ return hlist_entry_safe(rcu_dereference_raw(
+ hlist_next_rcu(&ch->cache_list)),
struct cache_head, cache_list);
}
*pos &= ~((1LL<<32) - 1);
@@ -1345,18 +1375,31 @@
if (hash >= cd->hash_size)
return NULL;
++*pos;
- return hlist_entry_safe(cd->hash_table[hash].first,
+ return hlist_entry_safe(rcu_dereference_raw(
+ hlist_first_rcu(&cd->hash_table[hash])),
struct cache_head, cache_list);
}
-EXPORT_SYMBOL_GPL(cache_seq_next);
-void cache_seq_stop(struct seq_file *m, void *p)
- __releases(cd->hash_lock)
+void *cache_seq_start_rcu(struct seq_file *m, loff_t *pos)
+ __acquires(RCU)
{
- struct cache_detail *cd = m->private;
- read_unlock(&cd->hash_lock);
+ rcu_read_lock();
+ return __cache_seq_start(m, pos);
}
-EXPORT_SYMBOL_GPL(cache_seq_stop);
+EXPORT_SYMBOL_GPL(cache_seq_start_rcu);
+
+void *cache_seq_next_rcu(struct seq_file *file, void *p, loff_t *pos)
+{
+ return cache_seq_next(file, p, pos);
+}
+EXPORT_SYMBOL_GPL(cache_seq_next_rcu);
+
+void cache_seq_stop_rcu(struct seq_file *m, void *p)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(cache_seq_stop_rcu);
static int c_show(struct seq_file *m, void *p)
{
@@ -1384,9 +1427,9 @@
}
static const struct seq_operations cache_content_op = {
- .start = cache_seq_start,
- .next = cache_seq_next,
- .stop = cache_seq_stop,
+ .start = cache_seq_start_rcu,
+ .next = cache_seq_next_rcu,
+ .stop = cache_seq_stop_rcu,
.show = c_show,
};
@@ -1481,6 +1524,9 @@
cd->nextcheck = now;
cache_flush();
+ if (cd->flush)
+ cd->flush();
+
*ppos += count;
return count;
}
@@ -1844,13 +1890,13 @@
void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h)
{
- write_lock(&cd->hash_lock);
+ spin_lock(&cd->hash_lock);
if (!hlist_unhashed(&h->cache_list)){
- hlist_del_init(&h->cache_list);
+ hlist_del_init_rcu(&h->cache_list);
cd->entries--;
- write_unlock(&cd->hash_lock);
+ spin_unlock(&cd->hash_lock);
cache_put(h, cd);
} else
- write_unlock(&cd->hash_lock);
+ spin_unlock(&cd->hash_lock);
}
EXPORT_SYMBOL_GPL(sunrpc_cache_unhash);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 8ea2f5f..f7f7856 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/net/sunrpc/clnt.c
*
@@ -61,24 +62,24 @@
static void call_reserve(struct rpc_task *task);
static void call_reserveresult(struct rpc_task *task);
static void call_allocate(struct rpc_task *task);
+static void call_encode(struct rpc_task *task);
static void call_decode(struct rpc_task *task);
static void call_bind(struct rpc_task *task);
static void call_bind_status(struct rpc_task *task);
static void call_transmit(struct rpc_task *task);
-#if defined(CONFIG_SUNRPC_BACKCHANNEL)
-static void call_bc_transmit(struct rpc_task *task);
-#endif /* CONFIG_SUNRPC_BACKCHANNEL */
static void call_status(struct rpc_task *task);
static void call_transmit_status(struct rpc_task *task);
static void call_refresh(struct rpc_task *task);
static void call_refreshresult(struct rpc_task *task);
-static void call_timeout(struct rpc_task *task);
static void call_connect(struct rpc_task *task);
static void call_connect_status(struct rpc_task *task);
-static __be32 *rpc_encode_header(struct rpc_task *task);
-static __be32 *rpc_verify_header(struct rpc_task *task);
+static int rpc_encode_header(struct rpc_task *task,
+ struct xdr_stream *xdr);
+static int rpc_decode_header(struct rpc_task *task,
+ struct xdr_stream *xdr);
static int rpc_ping(struct rpc_clnt *clnt);
+static void rpc_check_timeout(struct rpc_task *task);
static void rpc_register_client(struct rpc_clnt *clnt)
{
@@ -394,6 +395,7 @@
if (err)
goto out_no_clid;
+ clnt->cl_cred = get_cred(args->cred);
clnt->cl_procinfo = version->procs;
clnt->cl_maxproc = version->nrprocs;
clnt->cl_prog = args->prognumber ? : program->number;
@@ -439,6 +441,7 @@
out_no_path:
rpc_free_iostats(clnt->cl_metrics);
out_no_stats:
+ put_cred(clnt->cl_cred);
rpc_free_clid(clnt);
out_no_clid:
kfree(clnt);
@@ -484,8 +487,11 @@
}
clnt->cl_softrtry = 1;
- if (args->flags & RPC_CLNT_CREATE_HARDRTRY)
+ if (args->flags & (RPC_CLNT_CREATE_HARDRTRY|RPC_CLNT_CREATE_SOFTERR)) {
clnt->cl_softrtry = 0;
+ if (args->flags & RPC_CLNT_CREATE_SOFTERR)
+ clnt->cl_softerr = 1;
+ }
if (args->flags & RPC_CLNT_CREATE_AUTOBIND)
clnt->cl_autobind = 1;
@@ -522,6 +528,8 @@
.bc_xprt = args->bc_xprt,
};
char servername[48];
+ struct rpc_clnt *clnt;
+ int i;
if (args->bc_xprt) {
WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC));
@@ -584,7 +592,15 @@
if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
xprt->resvport = 0;
- return rpc_create_xprt(args, xprt);
+ clnt = rpc_create_xprt(args, xprt);
+ if (IS_ERR(clnt) || args->nconnect <= 1)
+ return clnt;
+
+ for (i = 0; i < args->nconnect - 1; i++) {
+ if (rpc_clnt_add_xprt(clnt, &xprtargs, NULL, NULL) < 0)
+ break;
+ }
+ return clnt;
}
EXPORT_SYMBOL_GPL(rpc_create);
@@ -623,9 +639,11 @@
/* Turn off autobind on clones */
new->cl_autobind = 0;
new->cl_softrtry = clnt->cl_softrtry;
+ new->cl_softerr = clnt->cl_softerr;
new->cl_noretranstimeo = clnt->cl_noretranstimeo;
new->cl_discrtry = clnt->cl_discrtry;
new->cl_chatty = clnt->cl_chatty;
+ new->cl_principal = clnt->cl_principal;
return new;
out_err:
@@ -647,6 +665,7 @@
.prognumber = clnt->cl_prog,
.version = clnt->cl_vers,
.authflavor = clnt->cl_auth->au_flavor,
+ .cred = clnt->cl_cred,
};
return __rpc_clone_client(&args, clnt);
}
@@ -668,6 +687,7 @@
.prognumber = clnt->cl_prog,
.version = clnt->cl_vers,
.authflavor = flavor,
+ .cred = clnt->cl_cred,
};
return __rpc_clone_client(&args, clnt);
}
@@ -826,17 +846,8 @@
* Spin lock all_tasks to prevent changes...
*/
spin_lock(&clnt->cl_lock);
- list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) {
- if (!RPC_IS_ACTIVATED(rovr))
- continue;
- if (!(rovr->tk_flags & RPC_TASK_KILLED)) {
- rovr->tk_flags |= RPC_TASK_KILLED;
- rpc_exit(rovr, -EIO);
- if (RPC_IS_QUEUED(rovr))
- rpc_wake_up_queued_task(rovr->tk_waitqueue,
- rovr);
- }
- }
+ list_for_each_entry(rovr, &clnt->cl_tasks, tk_task)
+ rpc_signal_task(rovr);
spin_unlock(&clnt->cl_lock);
}
EXPORT_SYMBOL_GPL(rpc_killall_tasks);
@@ -884,6 +895,7 @@
xprt_put(rcu_dereference_raw(clnt->cl_xprt));
xprt_iter_destroy(&clnt->cl_xpi);
rpciod_down();
+ put_cred(clnt->cl_cred);
rpc_free_clid(clnt);
kfree(clnt);
return parent;
@@ -948,6 +960,7 @@
.prognumber = program->number,
.version = vers,
.authflavor = old->cl_auth->au_flavor,
+ .cred = old->cl_cred,
};
struct rpc_clnt *clnt;
int err;
@@ -965,13 +978,46 @@
}
EXPORT_SYMBOL_GPL(rpc_bind_new_program);
+struct rpc_xprt *
+rpc_task_get_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
+{
+ struct rpc_xprt_switch *xps;
+
+ if (!xprt)
+ return NULL;
+ rcu_read_lock();
+ xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+ atomic_long_inc(&xps->xps_queuelen);
+ rcu_read_unlock();
+ atomic_long_inc(&xprt->queuelen);
+
+ return xprt;
+}
+
+static void
+rpc_task_release_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
+{
+ struct rpc_xprt_switch *xps;
+
+ atomic_long_dec(&xprt->queuelen);
+ rcu_read_lock();
+ xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+ atomic_long_dec(&xps->xps_queuelen);
+ rcu_read_unlock();
+
+ xprt_put(xprt);
+}
+
void rpc_task_release_transport(struct rpc_task *task)
{
struct rpc_xprt *xprt = task->tk_xprt;
if (xprt) {
task->tk_xprt = NULL;
- xprt_put(xprt);
+ if (task->tk_client)
+ rpc_task_release_xprt(task->tk_client, xprt);
+ else
+ xprt_put(xprt);
}
}
EXPORT_SYMBOL_GPL(rpc_task_release_transport);
@@ -980,6 +1026,7 @@
{
struct rpc_clnt *clnt = task->tk_client;
+ rpc_task_release_transport(task);
if (clnt != NULL) {
/* Remove from client task list */
spin_lock(&clnt->cl_lock);
@@ -989,14 +1036,34 @@
rpc_release_client(clnt);
}
- rpc_task_release_transport(task);
+}
+
+static struct rpc_xprt *
+rpc_task_get_first_xprt(struct rpc_clnt *clnt)
+{
+ struct rpc_xprt *xprt;
+
+ rcu_read_lock();
+ xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+ rcu_read_unlock();
+ return rpc_task_get_xprt(clnt, xprt);
+}
+
+static struct rpc_xprt *
+rpc_task_get_next_xprt(struct rpc_clnt *clnt)
+{
+ return rpc_task_get_xprt(clnt, xprt_iter_get_next(&clnt->cl_xpi));
}
static
void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
{
- if (!task->tk_xprt)
- task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi);
+ if (task->tk_xprt)
+ return;
+ if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN)
+ task->tk_xprt = rpc_task_get_first_xprt(clnt);
+ else
+ task->tk_xprt = rpc_task_get_next_xprt(clnt);
}
static
@@ -1009,6 +1076,8 @@
atomic_inc(&clnt->cl_count);
if (clnt->cl_softrtry)
task->tk_flags |= RPC_TASK_SOFT;
+ if (clnt->cl_softerr)
+ task->tk_flags |= RPC_TASK_TIMEOUT;
if (clnt->cl_noretranstimeo)
task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT;
if (atomic_read(&clnt->cl_swapper))
@@ -1028,7 +1097,7 @@
task->tk_msg.rpc_argp = msg->rpc_argp;
task->tk_msg.rpc_resp = msg->rpc_resp;
if (msg->rpc_cred != NULL)
- task->tk_msg.rpc_cred = get_rpccred(msg->rpc_cred);
+ task->tk_msg.rpc_cred = get_cred(msg->rpc_cred);
}
}
@@ -1129,6 +1198,8 @@
EXPORT_SYMBOL_GPL(rpc_call_async);
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+static void call_bc_encode(struct rpc_task *task);
+
/**
* rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run
* rpc_execute against it
@@ -1137,10 +1208,10 @@
struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
{
struct rpc_task *task;
- struct xdr_buf *xbufp = &req->rq_snd_buf;
struct rpc_task_setup task_setup_data = {
.callback_ops = &rpc_default_ops,
- .flags = RPC_TASK_SOFTCONN,
+ .flags = RPC_TASK_SOFTCONN |
+ RPC_TASK_NO_RETRANS_TIMEOUT,
};
dprintk("RPC: rpc_run_bc_task req= %p\n", req);
@@ -1148,16 +1219,9 @@
* Create an rpc_task to send the data
*/
task = rpc_new_task(&task_setup_data);
- task->tk_rqstp = req;
+ xprt_init_bc_request(req, task);
- /*
- * Set up the xdr_buf length.
- * This also indicates that the buffer is XDR encoded already.
- */
- xbufp->len = xbufp->head[0].iov_len + xbufp->page_len +
- xbufp->tail[0].iov_len;
-
- task->tk_action = call_bc_transmit;
+ task->tk_action = call_bc_encode;
atomic_inc(&task->tk_count);
WARN_ON_ONCE(atomic_read(&task->tk_count) != 2);
rpc_execute(task);
@@ -1167,6 +1231,29 @@
}
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+/**
+ * rpc_prepare_reply_pages - Prepare to receive a reply data payload into pages
+ * @req: RPC request to prepare
+ * @pages: vector of struct page pointers
+ * @base: offset in first page where receive should start, in bytes
+ * @len: expected size of the upper layer data payload, in bytes
+ * @hdrsize: expected size of upper layer reply header, in XDR words
+ *
+ */
+void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages,
+ unsigned int base, unsigned int len,
+ unsigned int hdrsize)
+{
+ /* Subtract one to force an extra word of buffer space for the
+ * payload's XDR pad to fall into the rcv_buf's tail iovec.
+ */
+ hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign - 1;
+
+ xdr_inline_pages(&req->rq_rcv_buf, hdrsize << 2, pages, base, len);
+ trace_rpc_reply_pages(req);
+}
+EXPORT_SYMBOL_GPL(rpc_prepare_reply_pages);
+
void
rpc_call_start(struct rpc_task *task)
{
@@ -1439,6 +1526,19 @@
}
EXPORT_SYMBOL_GPL(rpc_max_bc_payload);
+unsigned int rpc_num_bc_slots(struct rpc_clnt *clnt)
+{
+ struct rpc_xprt *xprt;
+ unsigned int ret;
+
+ rcu_read_lock();
+ xprt = rcu_dereference(clnt->cl_xprt);
+ ret = xprt->ops->bc_num_slots(xprt);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_num_bc_slots);
+
/**
* rpc_force_rebind - force transport to check that remote port is unchanged
* @clnt: client to rebind
@@ -1454,22 +1554,14 @@
}
EXPORT_SYMBOL_GPL(rpc_force_rebind);
-/*
- * Restart an (async) RPC call from the call_prepare state.
- * Usually called from within the exit handler.
- */
-int
-rpc_restart_call_prepare(struct rpc_task *task)
+static int
+__rpc_restart_call(struct rpc_task *task, void (*action)(struct rpc_task *))
{
- if (RPC_ASSASSINATED(task))
- return 0;
- task->tk_action = call_start;
task->tk_status = 0;
- if (task->tk_ops->rpc_call_prepare != NULL)
- task->tk_action = rpc_prepare_task;
+ task->tk_rpc_status = 0;
+ task->tk_action = action;
return 1;
}
-EXPORT_SYMBOL_GPL(rpc_restart_call_prepare);
/*
* Restart an (async) RPC call. Usually called from within the
@@ -1478,14 +1570,23 @@
int
rpc_restart_call(struct rpc_task *task)
{
- if (RPC_ASSASSINATED(task))
- return 0;
- task->tk_action = call_start;
- task->tk_status = 0;
- return 1;
+ return __rpc_restart_call(task, call_start);
}
EXPORT_SYMBOL_GPL(rpc_restart_call);
+/*
+ * Restart an (async) RPC call from the call_prepare state.
+ * Usually called from within the exit handler.
+ */
+int
+rpc_restart_call_prepare(struct rpc_task *task)
+{
+ if (task->tk_ops->rpc_call_prepare != NULL)
+ return __rpc_restart_call(task, rpc_prepare_task);
+ return rpc_restart_call(task);
+}
+EXPORT_SYMBOL_GPL(rpc_restart_call_prepare);
+
const char
*rpc_proc_name(const struct rpc_task *task)
{
@@ -1500,6 +1601,19 @@
return "no proc";
}
+static void
+__rpc_call_rpcerror(struct rpc_task *task, int tk_status, int rpc_status)
+{
+ task->tk_rpc_status = rpc_status;
+ rpc_exit(task, tk_status);
+}
+
+static void
+rpc_call_rpcerror(struct rpc_task *task, int status)
+{
+ __rpc_call_rpcerror(task, status, status);
+}
+
/*
* 0. Initial state
*
@@ -1558,14 +1672,13 @@
task->tk_status = 0;
if (status >= 0) {
if (task->tk_rqstp) {
- xprt_request_init(task);
task->tk_action = call_refresh;
return;
}
printk(KERN_ERR "%s: status=%d, but no request slot, exiting\n",
__func__, status);
- rpc_exit(task, -EIO);
+ rpc_call_rpcerror(task, -EIO);
return;
}
@@ -1593,7 +1706,7 @@
__func__, status);
break;
}
- rpc_exit(task, status);
+ rpc_call_rpcerror(task, status);
}
/*
@@ -1661,7 +1774,7 @@
}
dprintk("RPC: %5u %s: refresh creds failed with error %d\n",
task->tk_pid, __func__, status);
- rpc_exit(task, status);
+ rpc_call_rpcerror(task, status);
}
/*
@@ -1671,7 +1784,7 @@
static void
call_allocate(struct rpc_task *task)
{
- unsigned int slack = task->tk_rqstp->rq_cred->cr_auth->au_cslack;
+ const struct rpc_auth *auth = task->tk_rqstp->rq_cred->cr_auth;
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
const struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
@@ -1680,7 +1793,7 @@
dprint_status(task);
task->tk_status = 0;
- task->tk_action = call_bind;
+ task->tk_action = call_encode;
if (req->rq_buffer)
return;
@@ -1696,9 +1809,15 @@
* and reply headers, and convert both values
* to byte sizes.
*/
- req->rq_callsize = RPC_CALLHDRSIZE + (slack << 1) + proc->p_arglen;
+ req->rq_callsize = RPC_CALLHDRSIZE + (auth->au_cslack << 1) +
+ proc->p_arglen;
req->rq_callsize <<= 2;
- req->rq_rcvsize = RPC_REPHDRSIZE + slack + proc->p_replen;
+ /*
+ * Note: the reply buffer must at minimum allocate enough space
+ * for the 'struct accepted_reply' from RFC5531.
+ */
+ req->rq_rcvsize = RPC_REPHDRSIZE + auth->au_rslack + \
+ max_t(size_t, proc->p_replen, 2);
req->rq_rcvsize <<= 2;
status = xprt->ops->buf_alloc(task);
@@ -1706,7 +1825,7 @@
if (status == 0)
return;
if (status != -ENOMEM) {
- rpc_exit(task, status);
+ rpc_call_rpcerror(task, status);
return;
}
@@ -1718,33 +1837,23 @@
return;
}
- rpc_exit(task, -ERESTARTSYS);
+ rpc_call_rpcerror(task, -ERESTARTSYS);
}
-static inline int
+static int
rpc_task_need_encode(struct rpc_task *task)
{
- return task->tk_rqstp->rq_snd_buf.len == 0;
+ return test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) == 0 &&
+ (!(task->tk_flags & RPC_TASK_SENT) ||
+ !(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) ||
+ xprt_request_need_retransmit(task));
}
-static inline void
-rpc_task_force_reencode(struct rpc_task *task)
-{
- task->tk_rqstp->rq_snd_buf.len = 0;
- task->tk_rqstp->rq_bytes_sent = 0;
-}
-
-/*
- * 3. Encode arguments of an RPC call
- */
static void
rpc_xdr_encode(struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
- kxdreproc_t encode;
- __be32 *p;
-
- dprint_status(task);
+ struct xdr_stream xdr;
xdr_buf_init(&req->rq_snd_buf,
req->rq_buffer,
@@ -1753,19 +1862,82 @@
req->rq_rbuffer,
req->rq_rcvsize);
- p = rpc_encode_header(task);
- if (p == NULL) {
- printk(KERN_INFO "RPC: couldn't encode RPC header, exit EIO\n");
- rpc_exit(task, -EIO);
+ req->rq_reply_bytes_recvd = 0;
+ req->rq_snd_buf.head[0].iov_len = 0;
+ xdr_init_encode(&xdr, &req->rq_snd_buf,
+ req->rq_snd_buf.head[0].iov_base, req);
+ xdr_free_bvec(&req->rq_snd_buf);
+ if (rpc_encode_header(task, &xdr))
+ return;
+
+ task->tk_status = rpcauth_wrap_req(task, &xdr);
+}
+
+/*
+ * 3. Encode arguments of an RPC call
+ */
+static void
+call_encode(struct rpc_task *task)
+{
+ if (!rpc_task_need_encode(task))
+ goto out;
+ dprint_status(task);
+ /* Dequeue task from the receive queue while we're encoding */
+ xprt_request_dequeue_xprt(task);
+ /* Encode here so that rpcsec_gss can use correct sequence number. */
+ rpc_xdr_encode(task);
+ /* Did the encode result in an error condition? */
+ if (task->tk_status != 0) {
+ /* Was the error nonfatal? */
+ switch (task->tk_status) {
+ case -EAGAIN:
+ case -ENOMEM:
+ rpc_delay(task, HZ >> 4);
+ break;
+ case -EKEYEXPIRED:
+ if (!task->tk_cred_retry) {
+ rpc_exit(task, task->tk_status);
+ } else {
+ task->tk_action = call_refresh;
+ task->tk_cred_retry--;
+ dprintk("RPC: %5u %s: retry refresh creds\n",
+ task->tk_pid, __func__);
+ }
+ break;
+ default:
+ rpc_call_rpcerror(task, task->tk_status);
+ }
return;
}
- encode = task->tk_msg.rpc_proc->p_encode;
- if (encode == NULL)
- return;
+ /* Add task to reply queue before transmission to avoid races */
+ if (rpc_reply_expected(task))
+ xprt_request_enqueue_receive(task);
+ xprt_request_enqueue_transmit(task);
+out:
+ task->tk_action = call_transmit;
+ /* Check that the connection is OK */
+ if (!xprt_bound(task->tk_xprt))
+ task->tk_action = call_bind;
+ else if (!xprt_connected(task->tk_xprt))
+ task->tk_action = call_connect;
+}
- task->tk_status = rpcauth_wrap_req(task, encode, req, p,
- task->tk_msg.rpc_argp);
+/*
+ * Helpers to check if the task was already transmitted, and
+ * to take action when that is the case.
+ */
+static bool
+rpc_task_transmitted(struct rpc_task *task)
+{
+ return !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate);
+}
+
+static void
+rpc_task_handle_transmitted(struct rpc_task *task)
+{
+ xprt_end_transmit(task);
+ task->tk_action = call_transmit_status;
}
/*
@@ -1776,14 +1948,23 @@
{
struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
+ if (rpc_task_transmitted(task)) {
+ rpc_task_handle_transmitted(task);
+ return;
+ }
+
+ if (xprt_bound(xprt)) {
+ task->tk_action = call_connect;
+ return;
+ }
+
dprint_status(task);
- task->tk_action = call_connect;
- if (!xprt_bound(xprt)) {
- task->tk_action = call_bind_status;
- task->tk_timeout = xprt->bind_timeout;
- xprt->ops->rpcbind(task);
- }
+ task->tk_action = call_bind_status;
+ if (!xprt_prepare_transmit(task))
+ return;
+
+ xprt->ops->rpcbind(task);
}
/*
@@ -1792,16 +1973,23 @@
static void
call_bind_status(struct rpc_task *task)
{
+ struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
int status = -EIO;
- if (task->tk_status >= 0) {
- dprint_status(task);
- task->tk_status = 0;
- task->tk_action = call_connect;
+ if (rpc_task_transmitted(task)) {
+ rpc_task_handle_transmitted(task);
return;
}
+ dprint_status(task);
trace_rpc_bind_status(task);
+ if (task->tk_status >= 0)
+ goto out_next;
+ if (xprt_bound(xprt)) {
+ task->tk_status = 0;
+ goto out_next;
+ }
+
switch (task->tk_status) {
case -ENOMEM:
dprintk("RPC: %5u rpcbind out of memory\n", task->tk_pid);
@@ -1820,6 +2008,11 @@
task->tk_rebind_retry--;
rpc_delay(task, 3*HZ);
goto retry_timeout;
+ case -ENOBUFS:
+ rpc_delay(task, HZ >> 2);
+ goto retry_timeout;
+ case -EAGAIN:
+ goto retry_timeout;
case -ETIMEDOUT:
dprintk("RPC: %5u rpcbind request timed out\n",
task->tk_pid);
@@ -1841,7 +2034,6 @@
case -ENETDOWN:
case -EHOSTUNREACH:
case -ENETUNREACH:
- case -ENOBUFS:
case -EPIPE:
dprintk("RPC: %5u remote rpcbind unreachable: %d\n",
task->tk_pid, task->tk_status);
@@ -1856,12 +2048,15 @@
task->tk_pid, -task->tk_status);
}
- rpc_exit(task, status);
+ rpc_call_rpcerror(task, status);
return;
-
+out_next:
+ task->tk_action = call_connect;
+ return;
retry_timeout:
task->tk_status = 0;
- task->tk_action = call_timeout;
+ task->tk_action = call_bind;
+ rpc_check_timeout(task);
}
/*
@@ -1872,21 +2067,30 @@
{
struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
+ if (rpc_task_transmitted(task)) {
+ rpc_task_handle_transmitted(task);
+ return;
+ }
+
+ if (xprt_connected(xprt)) {
+ task->tk_action = call_transmit;
+ return;
+ }
+
dprintk("RPC: %5u call_connect xprt %p %s connected\n",
task->tk_pid, xprt,
(xprt_connected(xprt) ? "is" : "is not"));
- task->tk_action = call_transmit;
- if (!xprt_connected(xprt)) {
- task->tk_action = call_connect_status;
- if (task->tk_status < 0)
- return;
- if (task->tk_flags & RPC_TASK_NOCONNECT) {
- rpc_exit(task, -ENOTCONN);
- return;
- }
- xprt_connect(task);
+ task->tk_action = call_connect_status;
+ if (task->tk_status < 0)
+ return;
+ if (task->tk_flags & RPC_TASK_NOCONNECT) {
+ rpc_call_rpcerror(task, -ENOTCONN);
+ return;
}
+ if (!xprt_prepare_transmit(task))
+ return;
+ xprt_connect(task);
}
/*
@@ -1895,12 +2099,27 @@
static void
call_connect_status(struct rpc_task *task)
{
+ struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
struct rpc_clnt *clnt = task->tk_client;
int status = task->tk_status;
- dprint_status(task);
+ if (rpc_task_transmitted(task)) {
+ rpc_task_handle_transmitted(task);
+ return;
+ }
+ dprint_status(task);
trace_rpc_connect_status(task);
+
+ if (task->tk_status == 0) {
+ clnt->cl_stats->netreconn++;
+ goto out_next;
+ }
+ if (xprt_connected(xprt)) {
+ task->tk_status = 0;
+ goto out_next;
+ }
+
task->tk_status = 0;
switch (status) {
case -ECONNREFUSED:
@@ -1909,8 +2128,7 @@
break;
if (clnt->cl_autobind) {
rpc_force_rebind(clnt);
- task->tk_action = call_bind;
- return;
+ goto out_retry;
}
/* fall through */
case -ECONNRESET:
@@ -1918,8 +2136,6 @@
case -ENETDOWN:
case -ENETUNREACH:
case -EHOSTUNREACH:
- case -EADDRINUSE:
- case -ENOBUFS:
case -EPIPE:
xprt_conditional_disconnect(task->tk_rqstp->rq_xprt,
task->tk_rqstp->rq_connect_cookie);
@@ -1928,17 +2144,24 @@
/* retry with existing socket, after a delay */
rpc_delay(task, 3*HZ);
/* fall through */
+ case -EADDRINUSE:
+ case -ENOTCONN:
case -EAGAIN:
- /* Check for timeouts before looping back to call_bind */
case -ETIMEDOUT:
- task->tk_action = call_timeout;
- return;
- case 0:
- clnt->cl_stats->netreconn++;
- task->tk_action = call_transmit;
- return;
+ goto out_retry;
+ case -ENOBUFS:
+ rpc_delay(task, HZ >> 2);
+ goto out_retry;
}
- rpc_exit(task, status);
+ rpc_call_rpcerror(task, status);
+ return;
+out_next:
+ task->tk_action = call_transmit;
+ return;
+out_retry:
+ /* Check for timeouts before looping back to call_bind */
+ task->tk_action = call_bind;
+ rpc_check_timeout(task);
}
/*
@@ -1947,43 +2170,25 @@
static void
call_transmit(struct rpc_task *task)
{
- int is_retrans = RPC_WAS_SENT(task);
+ if (rpc_task_transmitted(task)) {
+ rpc_task_handle_transmitted(task);
+ return;
+ }
dprint_status(task);
- task->tk_action = call_status;
- if (task->tk_status < 0)
- return;
+ task->tk_action = call_transmit_status;
if (!xprt_prepare_transmit(task))
return;
- task->tk_action = call_transmit_status;
- /* Encode here so that rpcsec_gss can use correct sequence number. */
- if (rpc_task_need_encode(task)) {
- rpc_xdr_encode(task);
- /* Did the encode result in an error condition? */
- if (task->tk_status != 0) {
- /* Was the error nonfatal? */
- if (task->tk_status == -EAGAIN)
- rpc_delay(task, HZ >> 4);
- else
- rpc_exit(task, task->tk_status);
+ task->tk_status = 0;
+ if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) {
+ if (!xprt_connected(task->tk_xprt)) {
+ task->tk_status = -ENOTCONN;
return;
}
+ xprt_transmit(task);
}
- xprt_transmit(task);
- if (task->tk_status < 0)
- return;
- if (is_retrans)
- task->tk_client->cl_stats->rpcretrans++;
- /*
- * On success, ensure that we call xprt_end_transmit() before sleeping
- * in order to allow access to the socket to other RPC requests.
- */
- call_transmit_status(task);
- if (rpc_reply_expected(task))
- return;
- task->tk_action = rpc_exit_task;
- rpc_wake_up_queued_task(&task->tk_rqstp->rq_xprt->pending, task);
+ xprt_end_transmit(task);
}
/*
@@ -1998,20 +2203,19 @@
* Common case: success. Force the compiler to put this
* test first.
*/
- if (task->tk_status == 0) {
- xprt_end_transmit(task);
- rpc_task_force_reencode(task);
+ if (rpc_task_transmitted(task)) {
+ task->tk_status = 0;
+ xprt_request_wait_receive(task);
return;
}
switch (task->tk_status) {
- case -EAGAIN:
- case -ENOBUFS:
- break;
default:
dprint_status(task);
- xprt_end_transmit(task);
- rpc_task_force_reencode(task);
+ break;
+ case -EBADMSG:
+ task->tk_status = 0;
+ task->tk_action = call_encode;
break;
/*
* Special cases: if we've been waiting on the
@@ -2019,6 +2223,14 @@
* socket just returned a connection error,
* then hold onto the transport lock.
*/
+ case -ENOBUFS:
+ rpc_delay(task, HZ>>2);
+ /* fall through */
+ case -EBADSLT:
+ case -EAGAIN:
+ task->tk_action = call_transmit;
+ task->tk_status = 0;
+ break;
case -ECONNREFUSED:
case -EHOSTDOWN:
case -ENETDOWN:
@@ -2026,12 +2238,11 @@
case -ENETUNREACH:
case -EPERM:
if (RPC_IS_SOFTCONN(task)) {
- xprt_end_transmit(task);
if (!task->tk_msg.rpc_proc->p_proc)
trace_xprt_ping(task->tk_xprt,
task->tk_status);
- rpc_exit(task, task->tk_status);
- break;
+ rpc_call_rpcerror(task, task->tk_status);
+ return;
}
/* fall through */
case -ECONNRESET:
@@ -2039,11 +2250,24 @@
case -EADDRINUSE:
case -ENOTCONN:
case -EPIPE:
- rpc_task_force_reencode(task);
+ task->tk_action = call_bind;
+ task->tk_status = 0;
+ break;
}
+ rpc_check_timeout(task);
}
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+static void call_bc_transmit(struct rpc_task *task);
+static void call_bc_transmit_status(struct rpc_task *task);
+
+static void
+call_bc_encode(struct rpc_task *task)
+{
+ xprt_request_enqueue_transmit(task);
+ task->tk_action = call_bc_transmit;
+}
+
/*
* 5b. Send the backchannel RPC reply. On error, drop the reply. In
* addition, disconnect on connectivity errors.
@@ -2051,26 +2275,26 @@
static void
call_bc_transmit(struct rpc_task *task)
{
+ task->tk_action = call_bc_transmit_status;
+ if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) {
+ if (!xprt_prepare_transmit(task))
+ return;
+ task->tk_status = 0;
+ xprt_transmit(task);
+ }
+ xprt_end_transmit(task);
+}
+
+static void
+call_bc_transmit_status(struct rpc_task *task)
+{
struct rpc_rqst *req = task->tk_rqstp;
- if (!xprt_prepare_transmit(task))
- goto out_retry;
+ if (rpc_task_transmitted(task))
+ task->tk_status = 0;
- if (task->tk_status < 0) {
- printk(KERN_NOTICE "RPC: Could not send backchannel reply "
- "error: %d\n", task->tk_status);
- goto out_done;
- }
- if (req->rq_connect_cookie != req->rq_xprt->connect_cookie)
- req->rq_bytes_sent = 0;
-
- xprt_transmit(task);
-
- if (task->tk_status == -EAGAIN)
- goto out_nospace;
-
- xprt_end_transmit(task);
dprint_status(task);
+
switch (task->tk_status) {
case 0:
/* Success */
@@ -2084,6 +2308,14 @@
case -ENOTCONN:
case -EPIPE:
break;
+ case -ENOBUFS:
+ rpc_delay(task, HZ>>2);
+ /* fall through */
+ case -EBADSLT:
+ case -EAGAIN:
+ task->tk_status = 0;
+ task->tk_action = call_bc_transmit;
+ return;
case -ETIMEDOUT:
/*
* Problem reaching the server. Disconnect and let the
@@ -2102,19 +2334,11 @@
* We were unable to reply and will have to drop the
* request. The server should reconnect and retransmit.
*/
- WARN_ON_ONCE(task->tk_status == -EAGAIN);
printk(KERN_NOTICE "RPC: Could not send backchannel reply "
"error: %d\n", task->tk_status);
break;
}
- rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
-out_done:
task->tk_action = rpc_exit_task;
- return;
-out_nospace:
- req->rq_connect_cookie = req->rq_xprt->connect_cookie;
-out_retry:
- task->tk_status = 0;
}
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
@@ -2125,15 +2349,11 @@
call_status(struct rpc_task *task)
{
struct rpc_clnt *clnt = task->tk_client;
- struct rpc_rqst *req = task->tk_rqstp;
int status;
if (!task->tk_msg.rpc_proc->p_proc)
trace_xprt_ping(task->tk_xprt, task->tk_status);
- if (req->rq_reply_bytes_recvd > 0 && !req->rq_bytes_sent)
- task->tk_status = req->rq_reply_bytes_recvd;
-
dprint_status(task);
status = task->tk_status;
@@ -2150,10 +2370,8 @@
case -EHOSTUNREACH:
case -ENETUNREACH:
case -EPERM:
- if (RPC_IS_SOFTCONN(task)) {
- rpc_exit(task, status);
- break;
- }
+ if (RPC_IS_SOFTCONN(task))
+ goto out_exit;
/*
* Delay any retries for 3 seconds, then handle as if it
* were a timeout.
@@ -2161,79 +2379,90 @@
rpc_delay(task, 3*HZ);
/* fall through */
case -ETIMEDOUT:
- task->tk_action = call_timeout;
break;
case -ECONNREFUSED:
case -ECONNRESET:
case -ECONNABORTED:
+ case -ENOTCONN:
rpc_force_rebind(clnt);
- /* fall through */
+ break;
case -EADDRINUSE:
rpc_delay(task, 3*HZ);
/* fall through */
case -EPIPE:
- case -ENOTCONN:
- task->tk_action = call_bind;
- break;
- case -ENOBUFS:
- rpc_delay(task, HZ>>2);
- /* fall through */
case -EAGAIN:
- task->tk_action = call_transmit;
break;
case -EIO:
/* shutdown or soft timeout */
- rpc_exit(task, status);
- break;
+ goto out_exit;
default:
if (clnt->cl_chatty)
printk("%s: RPC call returned error %d\n",
clnt->cl_program->name, -status);
- rpc_exit(task, status);
+ goto out_exit;
}
+ task->tk_action = call_encode;
+ rpc_check_timeout(task);
+ return;
+out_exit:
+ rpc_call_rpcerror(task, status);
}
-/*
- * 6a. Handle RPC timeout
- * We do not release the request slot, so we keep using the
- * same XID for all retransmits.
- */
+static bool
+rpc_check_connected(const struct rpc_rqst *req)
+{
+ /* No allocated request or transport? return true */
+ if (!req || !req->rq_xprt)
+ return true;
+ return xprt_connected(req->rq_xprt);
+}
+
static void
-call_timeout(struct rpc_task *task)
+rpc_check_timeout(struct rpc_task *task)
{
struct rpc_clnt *clnt = task->tk_client;
- if (xprt_adjust_timeout(task->tk_rqstp) == 0) {
- dprintk("RPC: %5u call_timeout (minor)\n", task->tk_pid);
- goto retry;
- }
+ if (xprt_adjust_timeout(task->tk_rqstp) == 0)
+ return;
dprintk("RPC: %5u call_timeout (major)\n", task->tk_pid);
task->tk_timeouts++;
- if (RPC_IS_SOFTCONN(task)) {
- rpc_exit(task, -ETIMEDOUT);
+ if (RPC_IS_SOFTCONN(task) && !rpc_check_connected(task->tk_rqstp)) {
+ rpc_call_rpcerror(task, -ETIMEDOUT);
return;
}
+
if (RPC_IS_SOFT(task)) {
+ /*
+ * Once a "no retrans timeout" soft tasks (a.k.a NFSv4) has
+ * been sent, it should time out only if the transport
+ * connection gets terminally broken.
+ */
+ if ((task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) &&
+ rpc_check_connected(task->tk_rqstp))
+ return;
+
if (clnt->cl_chatty) {
- printk(KERN_NOTICE "%s: server %s not responding, timed out\n",
+ pr_notice_ratelimited(
+ "%s: server %s not responding, timed out\n",
clnt->cl_program->name,
task->tk_xprt->servername);
}
if (task->tk_flags & RPC_TASK_TIMEOUT)
- rpc_exit(task, -ETIMEDOUT);
+ rpc_call_rpcerror(task, -ETIMEDOUT);
else
- rpc_exit(task, -EIO);
+ __rpc_call_rpcerror(task, -EIO, -ETIMEDOUT);
return;
}
if (!(task->tk_flags & RPC_CALL_MAJORSEEN)) {
task->tk_flags |= RPC_CALL_MAJORSEEN;
if (clnt->cl_chatty) {
- printk(KERN_NOTICE "%s: server %s not responding, still trying\n",
- clnt->cl_program->name,
- task->tk_xprt->servername);
+ pr_notice_ratelimited(
+ "%s: server %s not responding, still trying\n",
+ clnt->cl_program->name,
+ task->tk_xprt->servername);
}
}
rpc_force_rebind(clnt);
@@ -2242,10 +2471,6 @@
* event? RFC2203 requires the server to drop all such requests.
*/
rpcauth_invalcred(task);
-
-retry:
- task->tk_action = call_bind;
- task->tk_status = 0;
}
/*
@@ -2256,14 +2481,19 @@
{
struct rpc_clnt *clnt = task->tk_client;
struct rpc_rqst *req = task->tk_rqstp;
- kxdrdproc_t decode = task->tk_msg.rpc_proc->p_decode;
- __be32 *p;
+ struct xdr_stream xdr;
+ int err;
dprint_status(task);
+ if (!task->tk_msg.rpc_proc->p_decode) {
+ task->tk_action = rpc_exit_task;
+ return;
+ }
+
if (task->tk_flags & RPC_CALL_MAJORSEEN) {
if (clnt->cl_chatty) {
- printk(KERN_NOTICE "%s: server %s OK\n",
+ pr_notice_ratelimited("%s: server %s OK\n",
clnt->cl_program->name,
task->tk_xprt->servername);
}
@@ -2275,229 +2505,200 @@
* before it changed req->rq_reply_bytes_recvd.
*/
smp_rmb();
+
+ /*
+ * Did we ever call xprt_complete_rqst()? If not, we should assume
+ * the message is incomplete.
+ */
+ err = -EAGAIN;
+ if (!req->rq_reply_bytes_recvd)
+ goto out;
+
req->rq_rcv_buf.len = req->rq_private_buf.len;
/* Check that the softirq receive buffer is valid */
WARN_ON(memcmp(&req->rq_rcv_buf, &req->rq_private_buf,
sizeof(req->rq_rcv_buf)) != 0);
- if (req->rq_rcv_buf.len < 12) {
- if (!RPC_IS_SOFT(task)) {
- task->tk_action = call_bind;
- goto out_retry;
- }
- dprintk("RPC: %s: too small RPC reply size (%d bytes)\n",
- clnt->cl_program->name, task->tk_status);
- task->tk_action = call_timeout;
- goto out_retry;
- }
-
- p = rpc_verify_header(task);
- if (IS_ERR(p)) {
- if (p == ERR_PTR(-EAGAIN))
- goto out_retry;
+ xdr_init_decode(&xdr, &req->rq_rcv_buf,
+ req->rq_rcv_buf.head[0].iov_base, req);
+ err = rpc_decode_header(task, &xdr);
+out:
+ switch (err) {
+ case 0:
+ task->tk_action = rpc_exit_task;
+ task->tk_status = rpcauth_unwrap_resp(task, &xdr);
+ dprintk("RPC: %5u %s result %d\n",
+ task->tk_pid, __func__, task->tk_status);
return;
- }
-
- task->tk_action = rpc_exit_task;
-
- if (decode) {
- task->tk_status = rpcauth_unwrap_resp(task, decode, req, p,
- task->tk_msg.rpc_resp);
- }
- dprintk("RPC: %5u call_decode result %d\n", task->tk_pid,
- task->tk_status);
- return;
-out_retry:
- task->tk_status = 0;
- /* Note: rpc_verify_header() may have freed the RPC slot */
- if (task->tk_rqstp == req) {
- req->rq_reply_bytes_recvd = req->rq_rcv_buf.len = 0;
+ case -EAGAIN:
+ task->tk_status = 0;
if (task->tk_client->cl_discrtry)
xprt_conditional_disconnect(req->rq_xprt,
- req->rq_connect_cookie);
+ req->rq_connect_cookie);
+ task->tk_action = call_encode;
+ rpc_check_timeout(task);
+ break;
+ case -EKEYREJECTED:
+ task->tk_action = call_reserve;
+ rpc_check_timeout(task);
+ rpcauth_invalcred(task);
+ /* Ensure we obtain a new XID if we retry! */
+ xprt_release(task);
}
}
-static __be32 *
-rpc_encode_header(struct rpc_task *task)
+static int
+rpc_encode_header(struct rpc_task *task, struct xdr_stream *xdr)
{
struct rpc_clnt *clnt = task->tk_client;
struct rpc_rqst *req = task->tk_rqstp;
- __be32 *p = req->rq_svec[0].iov_base;
+ __be32 *p;
+ int error;
- /* FIXME: check buffer size? */
+ error = -EMSGSIZE;
+ p = xdr_reserve_space(xdr, RPC_CALLHDRSIZE << 2);
+ if (!p)
+ goto out_fail;
+ *p++ = req->rq_xid;
+ *p++ = rpc_call;
+ *p++ = cpu_to_be32(RPC_VERSION);
+ *p++ = cpu_to_be32(clnt->cl_prog);
+ *p++ = cpu_to_be32(clnt->cl_vers);
+ *p = cpu_to_be32(task->tk_msg.rpc_proc->p_proc);
- p = xprt_skip_transport_header(req->rq_xprt, p);
- *p++ = req->rq_xid; /* XID */
- *p++ = htonl(RPC_CALL); /* CALL */
- *p++ = htonl(RPC_VERSION); /* RPC version */
- *p++ = htonl(clnt->cl_prog); /* program number */
- *p++ = htonl(clnt->cl_vers); /* program version */
- *p++ = htonl(task->tk_msg.rpc_proc->p_proc); /* procedure */
- p = rpcauth_marshcred(task, p);
- req->rq_slen = xdr_adjust_iovec(&req->rq_svec[0], p);
- return p;
+ error = rpcauth_marshcred(task, xdr);
+ if (error < 0)
+ goto out_fail;
+ return 0;
+out_fail:
+ trace_rpc_bad_callhdr(task);
+ rpc_call_rpcerror(task, error);
+ return error;
}
-static __be32 *
-rpc_verify_header(struct rpc_task *task)
+static noinline int
+rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr)
{
struct rpc_clnt *clnt = task->tk_client;
- struct kvec *iov = &task->tk_rqstp->rq_rcv_buf.head[0];
- int len = task->tk_rqstp->rq_rcv_buf.len >> 2;
- __be32 *p = iov->iov_base;
- u32 n;
- int error = -EACCES;
+ int error;
+ __be32 *p;
- if ((task->tk_rqstp->rq_rcv_buf.len & 3) != 0) {
- /* RFC-1014 says that the representation of XDR data must be a
- * multiple of four bytes
- * - if it isn't pointer subtraction in the NFS client may give
- * undefined results
- */
- dprintk("RPC: %5u %s: XDR representation not a multiple of"
- " 4 bytes: 0x%x\n", task->tk_pid, __func__,
- task->tk_rqstp->rq_rcv_buf.len);
- error = -EIO;
- goto out_err;
- }
- if ((len -= 3) < 0)
- goto out_overflow;
+ /* RFC-1014 says that the representation of XDR data must be a
+ * multiple of four bytes
+ * - if it isn't pointer subtraction in the NFS client may give
+ * undefined results
+ */
+ if (task->tk_rqstp->rq_rcv_buf.len & 3)
+ goto out_unparsable;
- p += 1; /* skip XID */
- if ((n = ntohl(*p++)) != RPC_REPLY) {
- dprintk("RPC: %5u %s: not an RPC reply: %x\n",
- task->tk_pid, __func__, n);
- error = -EIO;
- goto out_garbage;
- }
+ p = xdr_inline_decode(xdr, 3 * sizeof(*p));
+ if (!p)
+ goto out_unparsable;
+ p++; /* skip XID */
+ if (*p++ != rpc_reply)
+ goto out_unparsable;
+ if (*p++ != rpc_msg_accepted)
+ goto out_msg_denied;
- if ((n = ntohl(*p++)) != RPC_MSG_ACCEPTED) {
- if (--len < 0)
- goto out_overflow;
- switch ((n = ntohl(*p++))) {
- case RPC_AUTH_ERROR:
- break;
- case RPC_MISMATCH:
- dprintk("RPC: %5u %s: RPC call version mismatch!\n",
- task->tk_pid, __func__);
- error = -EPROTONOSUPPORT;
- goto out_err;
- default:
- dprintk("RPC: %5u %s: RPC call rejected, "
- "unknown error: %x\n",
- task->tk_pid, __func__, n);
- error = -EIO;
- goto out_err;
- }
- if (--len < 0)
- goto out_overflow;
- switch ((n = ntohl(*p++))) {
- case RPC_AUTH_REJECTEDCRED:
- case RPC_AUTH_REJECTEDVERF:
- case RPCSEC_GSS_CREDPROBLEM:
- case RPCSEC_GSS_CTXPROBLEM:
- if (!task->tk_cred_retry)
- break;
- task->tk_cred_retry--;
- dprintk("RPC: %5u %s: retry stale creds\n",
- task->tk_pid, __func__);
- rpcauth_invalcred(task);
- /* Ensure we obtain a new XID! */
- xprt_release(task);
- task->tk_action = call_reserve;
- goto out_retry;
- case RPC_AUTH_BADCRED:
- case RPC_AUTH_BADVERF:
- /* possibly garbled cred/verf? */
- if (!task->tk_garb_retry)
- break;
- task->tk_garb_retry--;
- dprintk("RPC: %5u %s: retry garbled creds\n",
- task->tk_pid, __func__);
- task->tk_action = call_bind;
- goto out_retry;
- case RPC_AUTH_TOOWEAK:
- printk(KERN_NOTICE "RPC: server %s requires stronger "
- "authentication.\n",
- task->tk_xprt->servername);
- break;
- default:
- dprintk("RPC: %5u %s: unknown auth error: %x\n",
- task->tk_pid, __func__, n);
- error = -EIO;
- }
- dprintk("RPC: %5u %s: call rejected %d\n",
- task->tk_pid, __func__, n);
- goto out_err;
- }
- p = rpcauth_checkverf(task, p);
- if (IS_ERR(p)) {
- error = PTR_ERR(p);
- dprintk("RPC: %5u %s: auth check failed with %d\n",
- task->tk_pid, __func__, error);
- goto out_garbage; /* bad verifier, retry */
- }
- len = p - (__be32 *)iov->iov_base - 1;
- if (len < 0)
- goto out_overflow;
- switch ((n = ntohl(*p++))) {
- case RPC_SUCCESS:
- return p;
- case RPC_PROG_UNAVAIL:
- dprintk("RPC: %5u %s: program %u is unsupported "
- "by server %s\n", task->tk_pid, __func__,
- (unsigned int)clnt->cl_prog,
- task->tk_xprt->servername);
+ error = rpcauth_checkverf(task, xdr);
+ if (error)
+ goto out_verifier;
+
+ p = xdr_inline_decode(xdr, sizeof(*p));
+ if (!p)
+ goto out_unparsable;
+ switch (*p) {
+ case rpc_success:
+ return 0;
+ case rpc_prog_unavail:
+ trace_rpc__prog_unavail(task);
error = -EPFNOSUPPORT;
goto out_err;
- case RPC_PROG_MISMATCH:
- dprintk("RPC: %5u %s: program %u, version %u unsupported "
- "by server %s\n", task->tk_pid, __func__,
- (unsigned int)clnt->cl_prog,
- (unsigned int)clnt->cl_vers,
- task->tk_xprt->servername);
+ case rpc_prog_mismatch:
+ trace_rpc__prog_mismatch(task);
error = -EPROTONOSUPPORT;
goto out_err;
- case RPC_PROC_UNAVAIL:
- dprintk("RPC: %5u %s: proc %s unsupported by program %u, "
- "version %u on server %s\n",
- task->tk_pid, __func__,
- rpc_proc_name(task),
- clnt->cl_prog, clnt->cl_vers,
- task->tk_xprt->servername);
+ case rpc_proc_unavail:
+ trace_rpc__proc_unavail(task);
error = -EOPNOTSUPP;
goto out_err;
- case RPC_GARBAGE_ARGS:
- dprintk("RPC: %5u %s: server saw garbage\n",
- task->tk_pid, __func__);
- break; /* retry */
+ case rpc_garbage_args:
+ case rpc_system_err:
+ trace_rpc__garbage_args(task);
+ error = -EIO;
+ break;
default:
- dprintk("RPC: %5u %s: server accept status: %x\n",
- task->tk_pid, __func__, n);
- /* Also retry */
+ goto out_unparsable;
}
out_garbage:
clnt->cl_stats->rpcgarbage++;
if (task->tk_garb_retry) {
task->tk_garb_retry--;
- dprintk("RPC: %5u %s: retrying\n",
- task->tk_pid, __func__);
- task->tk_action = call_bind;
-out_retry:
- return ERR_PTR(-EAGAIN);
+ task->tk_action = call_encode;
+ return -EAGAIN;
}
out_err:
- rpc_exit(task, error);
- dprintk("RPC: %5u %s: call failed with error %d\n", task->tk_pid,
- __func__, error);
- return ERR_PTR(error);
-out_overflow:
- dprintk("RPC: %5u %s: server reply was truncated.\n", task->tk_pid,
- __func__);
+ rpc_call_rpcerror(task, error);
+ return error;
+
+out_unparsable:
+ trace_rpc__unparsable(task);
+ error = -EIO;
goto out_garbage;
+
+out_verifier:
+ trace_rpc_bad_verifier(task);
+ goto out_garbage;
+
+out_msg_denied:
+ error = -EACCES;
+ p = xdr_inline_decode(xdr, sizeof(*p));
+ if (!p)
+ goto out_unparsable;
+ switch (*p++) {
+ case rpc_auth_error:
+ break;
+ case rpc_mismatch:
+ trace_rpc__mismatch(task);
+ error = -EPROTONOSUPPORT;
+ goto out_err;
+ default:
+ goto out_unparsable;
+ }
+
+ p = xdr_inline_decode(xdr, sizeof(*p));
+ if (!p)
+ goto out_unparsable;
+ switch (*p++) {
+ case rpc_autherr_rejectedcred:
+ case rpc_autherr_rejectedverf:
+ case rpcsec_gsserr_credproblem:
+ case rpcsec_gsserr_ctxproblem:
+ if (!task->tk_cred_retry)
+ break;
+ task->tk_cred_retry--;
+ trace_rpc__stale_creds(task);
+ return -EKEYREJECTED;
+ case rpc_autherr_badcred:
+ case rpc_autherr_badverf:
+ /* possibly garbled cred/verf? */
+ if (!task->tk_garb_retry)
+ break;
+ task->tk_garb_retry--;
+ trace_rpc__bad_creds(task);
+ task->tk_action = call_encode;
+ return -EAGAIN;
+ case rpc_autherr_tooweak:
+ trace_rpc__auth_tooweak(task);
+ pr_warn("RPC: server %s requires stronger authentication.\n",
+ task->tk_xprt->servername);
+ break;
+ default:
+ goto out_unparsable;
+ }
+ goto out_err;
}
static void rpcproc_encode_null(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
@@ -2522,9 +2723,8 @@
.rpc_proc = &rpcproc_null,
};
int err;
- msg.rpc_cred = authnull_ops.lookup_cred(NULL, NULL, 0);
- err = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN);
- put_rpccred(msg.rpc_cred);
+ err = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN |
+ RPC_TASK_NULLCREDS);
return err;
}
@@ -2535,15 +2735,15 @@
{
struct rpc_message msg = {
.rpc_proc = &rpcproc_null,
- .rpc_cred = cred,
};
struct rpc_task_setup task_setup_data = {
.rpc_client = clnt,
.rpc_xprt = xprt,
.rpc_message = &msg,
+ .rpc_op_cred = cred,
.callback_ops = (ops != NULL) ? ops : &rpc_default_ops,
.callback_data = data,
- .flags = flags,
+ .flags = flags | RPC_TASK_NULLCREDS,
};
return rpc_run_task(&task_setup_data);
@@ -2594,7 +2794,6 @@
void *dummy)
{
struct rpc_cb_add_xprt_calldata *data;
- struct rpc_cred *cred;
struct rpc_task *task;
data = kmalloc(sizeof(*data), GFP_NOFS);
@@ -2602,15 +2801,18 @@
return -ENOMEM;
data->xps = xprt_switch_get(xps);
data->xprt = xprt_get(xprt);
+ if (rpc_xprt_switch_has_addr(data->xps, (struct sockaddr *)&xprt->addr)) {
+ rpc_cb_add_xprt_release(data);
+ goto success;
+ }
- cred = authnull_ops.lookup_cred(NULL, NULL, 0);
- task = rpc_call_null_helper(clnt, xprt, cred,
- RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC,
+ task = rpc_call_null_helper(clnt, xprt, NULL,
+ RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC|RPC_TASK_NULLCREDS,
&rpc_cb_add_xprt_call_ops, data);
- put_rpccred(cred);
if (IS_ERR(task))
return PTR_ERR(task);
rpc_put_task(task);
+success:
return 1;
}
EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt);
@@ -2638,7 +2840,6 @@
struct rpc_xprt *xprt,
void *data)
{
- struct rpc_cred *cred;
struct rpc_task *task;
struct rpc_add_xprt_test *xtest = (struct rpc_add_xprt_test *)data;
int status = -EADDRINUSE;
@@ -2650,11 +2851,9 @@
goto out_err;
/* Test the connection */
- cred = authnull_ops.lookup_cred(NULL, NULL, 0);
- task = rpc_call_null_helper(clnt, xprt, cred,
- RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
+ task = rpc_call_null_helper(clnt, xprt, NULL,
+ RPC_TASK_SOFT | RPC_TASK_SOFTCONN | RPC_TASK_NULLCREDS,
NULL, NULL);
- put_rpccred(cred);
if (IS_ERR(task)) {
status = PTR_ERR(task);
goto out_err;
@@ -2668,6 +2867,9 @@
/* rpc_xprt_switch and rpc_xprt are deferrenced by add_xprt_test() */
xtest->add_xprt_test(clnt, xprt, xtest->data);
+ xprt_put(xprt);
+ xprt_switch_put(xps);
+
/* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */
return 1;
out_err:
@@ -2712,6 +2914,7 @@
xprt = xprt_iter_xprt(&clnt->cl_xpi);
if (xps == NULL || xprt == NULL) {
rcu_read_unlock();
+ xprt_switch_put(xps);
return -EAGAIN;
}
resvport = xprt->resvport;
@@ -2827,7 +3030,7 @@
printk(KERN_INFO "%5u %04x %6d %8p %8p %8ld %8p %sv%u %s a:%ps q:%s\n",
task->tk_pid, task->tk_flags, task->tk_status,
- clnt, task->tk_rqstp, task->tk_timeout, task->tk_ops,
+ clnt, task->tk_rqstp, rpc_task_timeout(task), task->tk_ops,
clnt->cl_program->name, clnt->cl_vers, rpc_proc_name(task),
task->tk_action, rpc_waitq);
}
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index 45a0333..fd9bca2 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/**
+/*
* debugfs interface for sunrpc
*
* (c) 2014 Jeff Layton <jlayton@primarydata.com>
@@ -11,7 +11,6 @@
#include "netns.h"
static struct dentry *topdir;
-static struct dentry *rpc_fault_dir;
static struct dentry *rpc_clnt_dir;
static struct dentry *rpc_xprt_dir;
@@ -33,7 +32,7 @@
seq_printf(f, "%5u %04x %6d 0x%x 0x%x %8ld %ps %sv%u %s a:%ps q:%s\n",
task->tk_pid, task->tk_flags, task->tk_status,
- clnt->cl_clid, xid, task->tk_timeout, task->tk_ops,
+ clnt->cl_clid, xid, rpc_task_timeout(task), task->tk_ops,
clnt->cl_program->name, clnt->cl_vers, rpc_proc_name(task),
task->tk_action, rpc_waitq);
return 0;
@@ -118,16 +117,37 @@
.release = tasks_release,
};
+static int do_xprt_debugfs(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *numv)
+{
+ int len;
+ char name[24]; /* enough for "../../rpc_xprt/ + 8 hex digits + NULL */
+ char link[9]; /* enough for 8 hex digits + NULL */
+ int *nump = numv;
+
+ if (IS_ERR_OR_NULL(xprt->debugfs))
+ return 0;
+ len = snprintf(name, sizeof(name), "../../rpc_xprt/%s",
+ xprt->debugfs->d_name.name);
+ if (len > sizeof(name))
+ return -1;
+ if (*nump == 0)
+ strcpy(link, "xprt");
+ else {
+ len = snprintf(link, sizeof(link), "xprt%d", *nump);
+ if (len > sizeof(link))
+ return -1;
+ }
+ debugfs_create_symlink(link, clnt->cl_debugfs, name);
+ (*nump)++;
+ return 0;
+}
+
void
rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
{
int len;
- char name[24]; /* enough for "../../rpc_xprt/ + 8 hex digits + NULL */
- struct rpc_xprt *xprt;
-
- /* Already registered? */
- if (clnt->cl_debugfs || !rpc_clnt_dir)
- return;
+ char name[9]; /* enough for 8 hex digits + NULL */
+ int xprtnum = 0;
len = snprintf(name, sizeof(name), "%x", clnt->cl_clid);
if (len >= sizeof(name))
@@ -135,35 +155,12 @@
/* make the per-client dir */
clnt->cl_debugfs = debugfs_create_dir(name, rpc_clnt_dir);
- if (!clnt->cl_debugfs)
- return;
/* make tasks file */
- if (!debugfs_create_file("tasks", S_IFREG | 0400, clnt->cl_debugfs,
- clnt, &tasks_fops))
- goto out_err;
+ debugfs_create_file("tasks", S_IFREG | 0400, clnt->cl_debugfs, clnt,
+ &tasks_fops);
- rcu_read_lock();
- xprt = rcu_dereference(clnt->cl_xprt);
- /* no "debugfs" dentry? Don't bother with the symlink. */
- if (!xprt->debugfs) {
- rcu_read_unlock();
- return;
- }
- len = snprintf(name, sizeof(name), "../../rpc_xprt/%s",
- xprt->debugfs->d_name.name);
- rcu_read_unlock();
-
- if (len >= sizeof(name))
- goto out_err;
-
- if (!debugfs_create_symlink("xprt", clnt->cl_debugfs, name))
- goto out_err;
-
- return;
-out_err:
- debugfs_remove_recursive(clnt->cl_debugfs);
- clnt->cl_debugfs = NULL;
+ rpc_clnt_iterate_for_each_xprt(clnt, do_xprt_debugfs, &xprtnum);
}
void
@@ -226,9 +223,6 @@
static atomic_t cur_id;
char name[9]; /* 8 hex digits + NULL term */
- if (!rpc_xprt_dir)
- return;
-
id = (unsigned int)atomic_inc_return(&cur_id);
len = snprintf(name, sizeof(name), "%x", id);
@@ -237,15 +231,10 @@
/* make the per-client dir */
xprt->debugfs = debugfs_create_dir(name, rpc_xprt_dir);
- if (!xprt->debugfs)
- return;
/* make tasks file */
- if (!debugfs_create_file("info", S_IFREG | 0400, xprt->debugfs,
- xprt, &xprt_info_fops)) {
- debugfs_remove_recursive(xprt->debugfs);
- xprt->debugfs = NULL;
- }
+ debugfs_create_file("info", S_IFREG | 0400, xprt->debugfs, xprt,
+ &xprt_info_fops);
atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect);
}
@@ -308,28 +297,11 @@
.release = fault_release,
};
-static struct dentry *
-inject_fault_dir(struct dentry *topdir)
-{
- struct dentry *faultdir;
-
- faultdir = debugfs_create_dir("inject_fault", topdir);
- if (!faultdir)
- return NULL;
-
- if (!debugfs_create_file("disconnect", S_IFREG | 0400, faultdir,
- NULL, &fault_disconnect_fops))
- return NULL;
-
- return faultdir;
-}
-
void __exit
sunrpc_debugfs_exit(void)
{
debugfs_remove_recursive(topdir);
topdir = NULL;
- rpc_fault_dir = NULL;
rpc_clnt_dir = NULL;
rpc_xprt_dir = NULL;
}
@@ -337,26 +309,16 @@
void __init
sunrpc_debugfs_init(void)
{
- topdir = debugfs_create_dir("sunrpc", NULL);
- if (!topdir)
- return;
+ struct dentry *rpc_fault_dir;
- rpc_fault_dir = inject_fault_dir(topdir);
- if (!rpc_fault_dir)
- goto out_remove;
+ topdir = debugfs_create_dir("sunrpc", NULL);
rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir);
- if (!rpc_clnt_dir)
- goto out_remove;
rpc_xprt_dir = debugfs_create_dir("rpc_xprt", topdir);
- if (!rpc_xprt_dir)
- goto out_remove;
- return;
-out_remove:
- debugfs_remove_recursive(topdir);
- topdir = NULL;
- rpc_fault_dir = NULL;
- rpc_clnt_dir = NULL;
+ rpc_fault_dir = debugfs_create_dir("inject_fault", topdir);
+
+ debugfs_create_file("disconnect", S_IFREG | 0400, rpc_fault_dir, NULL,
+ &fault_disconnect_fops);
}
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 4fda18d..b71a39d 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sunrpc/rpc_pipe.c
*
@@ -13,6 +14,7 @@
#include <linux/string.h>
#include <linux/pagemap.h>
#include <linux/mount.h>
+#include <linux/fs_context.h>
#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/kernel.h>
@@ -202,18 +204,11 @@
}
static void
-rpc_i_callback(struct rcu_head *head)
+rpc_free_inode(struct inode *inode)
{
- struct inode *inode = container_of(head, struct inode, i_rcu);
kmem_cache_free(rpc_inode_cachep, RPC_I(inode));
}
-static void
-rpc_destroy_inode(struct inode *inode)
-{
- call_rcu(&inode->i_rcu, rpc_i_callback);
-}
-
static int
rpc_pipe_open(struct inode *inode, struct file *filp)
{
@@ -604,6 +599,8 @@
dget(dentry);
ret = simple_rmdir(dir, dentry);
+ if (!ret)
+ fsnotify_rmdir(dir, dentry);
d_delete(dentry);
dput(dentry);
return ret;
@@ -615,6 +612,8 @@
dget(dentry);
ret = simple_unlink(dir, dentry);
+ if (!ret)
+ fsnotify_unlink(dir, dentry);
d_delete(dentry);
dput(dentry);
return ret;
@@ -1123,7 +1122,7 @@
*/
static const struct super_operations s_ops = {
.alloc_inode = rpc_alloc_inode,
- .destroy_inode = rpc_destroy_inode,
+ .free_inode = rpc_free_inode,
.statfs = simple_statfs,
};
@@ -1266,7 +1265,7 @@
* that this file will be there and have a certain format.
*/
static int
-rpc_show_dummy_info(struct seq_file *m, void *v)
+rpc_dummy_info_show(struct seq_file *m, void *v)
{
seq_printf(m, "RPC server: %s\n", utsname()->nodename);
seq_printf(m, "service: foo (1) version 0\n");
@@ -1275,25 +1274,12 @@
seq_printf(m, "port: 0\n");
return 0;
}
-
-static int
-rpc_dummy_info_open(struct inode *inode, struct file *file)
-{
- return single_open(file, rpc_show_dummy_info, NULL);
-}
-
-static const struct file_operations rpc_dummy_info_operations = {
- .owner = THIS_MODULE,
- .open = rpc_dummy_info_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(rpc_dummy_info);
static const struct rpc_filelist gssd_dummy_info_file[] = {
[0] = {
.name = "info",
- .i_fop = &rpc_dummy_info_operations,
+ .i_fop = &rpc_dummy_info_fops,
.mode = S_IFREG | 0400,
},
};
@@ -1367,11 +1353,11 @@
}
static int
-rpc_fill_super(struct super_block *sb, void *data, int silent)
+rpc_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct inode *inode;
struct dentry *root, *gssd_dentry;
- struct net *net = get_net(sb->s_fs_info);
+ struct net *net = sb->s_fs_info;
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
int err;
@@ -1428,12 +1414,28 @@
}
EXPORT_SYMBOL_GPL(gssd_running);
-static struct dentry *
-rpc_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int rpc_fs_get_tree(struct fs_context *fc)
{
- struct net *net = current->nsproxy->net_ns;
- return mount_ns(fs_type, flags, data, net, net->user_ns, rpc_fill_super);
+ return get_tree_keyed(fc, rpc_fill_super, get_net(fc->net_ns));
+}
+
+static void rpc_fs_free_fc(struct fs_context *fc)
+{
+ if (fc->s_fs_info)
+ put_net(fc->s_fs_info);
+}
+
+static const struct fs_context_operations rpc_fs_context_ops = {
+ .free = rpc_fs_free_fc,
+ .get_tree = rpc_fs_get_tree,
+};
+
+static int rpc_init_fs_context(struct fs_context *fc)
+{
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(fc->net_ns->user_ns);
+ fc->ops = &rpc_fs_context_ops;
+ return 0;
}
static void rpc_kill_sb(struct super_block *sb)
@@ -1461,7 +1463,7 @@
static struct file_system_type rpc_pipe_fs_type = {
.owner = THIS_MODULE,
.name = "rpc_pipefs",
- .mount = rpc_mount,
+ .init_fs_context = rpc_init_fs_context,
.kill_sb = rpc_kill_sb,
};
MODULE_ALIAS_FS("rpc_pipefs");
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index c7872bc..4a020b6 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* In-kernel rpcbind client supporting versions 2, 3, and 4 of the rpcbind
* protocol
@@ -240,6 +241,7 @@
.program = &rpcb_program,
.version = RPCBVERS_2,
.authflavor = RPC_AUTH_NULL,
+ .cred = current_cred(),
/*
* We turn off the idle timeout to prevent the kernel
* from automatically disconnecting the socket.
@@ -299,6 +301,7 @@
.program = &rpcb_program,
.version = RPCBVERS_2,
.authflavor = RPC_AUTH_UNIX,
+ .cred = current_cred(),
.flags = RPC_CLNT_CREATE_NOPING,
};
struct rpc_clnt *clnt, *clnt4;
@@ -358,7 +361,8 @@
static struct rpc_clnt *rpcb_create(struct net *net, const char *nodename,
const char *hostname,
struct sockaddr *srvaddr, size_t salen,
- int proto, u32 version)
+ int proto, u32 version,
+ const struct cred *cred)
{
struct rpc_create_args args = {
.net = net,
@@ -370,6 +374,7 @@
.program = &rpcb_program,
.version = version,
.authflavor = RPC_AUTH_UNIX,
+ .cred = cred,
.flags = (RPC_CLNT_CREATE_NOPING |
RPC_CLNT_CREATE_NONPRIVPORT),
};
@@ -694,7 +699,8 @@
/* Put self on the wait queue to ensure we get notified if
* some other task is already attempting to bind the port */
- rpc_sleep_on(&xprt->binding, task, NULL);
+ rpc_sleep_on_timeout(&xprt->binding, task,
+ NULL, jiffies + xprt->bind_timeout);
if (xprt_test_and_set_binding(xprt)) {
dprintk("RPC: %5u %s: waiting for another binder\n",
@@ -744,7 +750,8 @@
rpcb_clnt = rpcb_create(xprt->xprt_net,
clnt->cl_nodename,
xprt->servername, sap, salen,
- xprt->prot, bind_version);
+ xprt->prot, bind_version,
+ clnt->cl_cred);
if (IS_ERR(rpcb_clnt)) {
status = PTR_ERR(rpcb_clnt);
dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n",
@@ -752,7 +759,7 @@
goto bailout_nofree;
}
- map = kzalloc(sizeof(struct rpcbind_args), GFP_ATOMIC);
+ map = kzalloc(sizeof(struct rpcbind_args), GFP_NOFS);
if (!map) {
status = -ENOMEM;
dprintk("RPC: %5u %s: no memory available\n",
@@ -770,7 +777,13 @@
case RPCBVERS_4:
case RPCBVERS_3:
map->r_netid = xprt->address_strings[RPC_DISPLAY_NETID];
- map->r_addr = rpc_sockaddr2uaddr(sap, GFP_ATOMIC);
+ map->r_addr = rpc_sockaddr2uaddr(sap, GFP_NOFS);
+ if (!map->r_addr) {
+ status = -ENOMEM;
+ dprintk("RPC: %5u %s: no memory available\n",
+ task->tk_pid, __func__);
+ goto bailout_free_args;
+ }
map->r_owner = "";
break;
case RPCBVERS_2:
@@ -793,6 +806,8 @@
rpc_put_task(child);
return;
+bailout_free_args:
+ kfree(map);
bailout_release_client:
rpc_release_client(rpcb_clnt);
bailout_nofree:
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 3fe5d60..360afe1 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/net/sunrpc/sched.c
*
@@ -19,8 +20,10 @@
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/freezer.h>
+#include <linux/sched/mm.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/metrics.h>
#include "sunrpc.h"
@@ -44,7 +47,7 @@
static void rpc_async_schedule(struct work_struct *);
static void rpc_release_task(struct rpc_task *task);
-static void __rpc_queue_timer_fn(struct timer_list *t);
+static void __rpc_queue_timer_fn(struct work_struct *);
/*
* RPC tasks sit here while waiting for conditions to improve.
@@ -56,6 +59,21 @@
*/
struct workqueue_struct *rpciod_workqueue __read_mostly;
struct workqueue_struct *xprtiod_workqueue __read_mostly;
+EXPORT_SYMBOL_GPL(xprtiod_workqueue);
+
+unsigned long
+rpc_task_timeout(const struct rpc_task *task)
+{
+ unsigned long timeout = READ_ONCE(task->tk_timeout);
+
+ if (timeout != 0) {
+ unsigned long now = jiffies;
+ if (time_before(now, timeout))
+ return timeout - now;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rpc_task_timeout);
/*
* Disable the timer for a given RPC task. Should be called with
@@ -65,71 +83,101 @@
static void
__rpc_disable_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
{
- if (task->tk_timeout == 0)
+ if (list_empty(&task->u.tk_wait.timer_list))
return;
dprintk("RPC: %5u disabling timer\n", task->tk_pid);
task->tk_timeout = 0;
list_del(&task->u.tk_wait.timer_list);
if (list_empty(&queue->timer_list.list))
- del_timer(&queue->timer_list.timer);
+ cancel_delayed_work(&queue->timer_list.dwork);
}
static void
rpc_set_queue_timer(struct rpc_wait_queue *queue, unsigned long expires)
{
+ unsigned long now = jiffies;
queue->timer_list.expires = expires;
- mod_timer(&queue->timer_list.timer, expires);
+ if (time_before_eq(expires, now))
+ expires = 0;
+ else
+ expires -= now;
+ mod_delayed_work(rpciod_workqueue, &queue->timer_list.dwork, expires);
}
/*
* Set up a timer for the current task.
*/
static void
-__rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
+__rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task,
+ unsigned long timeout)
{
- if (!task->tk_timeout)
- return;
-
dprintk("RPC: %5u setting alarm for %u ms\n",
- task->tk_pid, jiffies_to_msecs(task->tk_timeout));
+ task->tk_pid, jiffies_to_msecs(timeout - jiffies));
- task->u.tk_wait.expires = jiffies + task->tk_timeout;
- if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires))
- rpc_set_queue_timer(queue, task->u.tk_wait.expires);
+ task->tk_timeout = timeout;
+ if (list_empty(&queue->timer_list.list) || time_before(timeout, queue->timer_list.expires))
+ rpc_set_queue_timer(queue, timeout);
list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list);
}
-static void rpc_rotate_queue_owner(struct rpc_wait_queue *queue)
-{
- struct list_head *q = &queue->tasks[queue->priority];
- struct rpc_task *task;
-
- if (!list_empty(q)) {
- task = list_first_entry(q, struct rpc_task, u.tk_wait.list);
- if (task->tk_owner == queue->owner)
- list_move_tail(&task->u.tk_wait.list, q);
- }
-}
-
static void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority)
{
if (queue->priority != priority) {
- /* Fairness: rotate the list when changing priority */
- rpc_rotate_queue_owner(queue);
queue->priority = priority;
+ queue->nr = 1U << priority;
}
}
-static void rpc_set_waitqueue_owner(struct rpc_wait_queue *queue, pid_t pid)
-{
- queue->owner = pid;
- queue->nr = RPC_BATCH_COUNT;
-}
-
static void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue)
{
rpc_set_waitqueue_priority(queue, queue->maxpriority);
- rpc_set_waitqueue_owner(queue, 0);
+}
+
+/*
+ * Add a request to a queue list
+ */
+static void
+__rpc_list_enqueue_task(struct list_head *q, struct rpc_task *task)
+{
+ struct rpc_task *t;
+
+ list_for_each_entry(t, q, u.tk_wait.list) {
+ if (t->tk_owner == task->tk_owner) {
+ list_add_tail(&task->u.tk_wait.links,
+ &t->u.tk_wait.links);
+ /* Cache the queue head in task->u.tk_wait.list */
+ task->u.tk_wait.list.next = q;
+ task->u.tk_wait.list.prev = NULL;
+ return;
+ }
+ }
+ INIT_LIST_HEAD(&task->u.tk_wait.links);
+ list_add_tail(&task->u.tk_wait.list, q);
+}
+
+/*
+ * Remove request from a queue list
+ */
+static void
+__rpc_list_dequeue_task(struct rpc_task *task)
+{
+ struct list_head *q;
+ struct rpc_task *t;
+
+ if (task->u.tk_wait.list.prev == NULL) {
+ list_del(&task->u.tk_wait.links);
+ return;
+ }
+ if (!list_empty(&task->u.tk_wait.links)) {
+ t = list_first_entry(&task->u.tk_wait.links,
+ struct rpc_task,
+ u.tk_wait.links);
+ /* Assume __rpc_list_enqueue_task() cached the queue head */
+ q = t->u.tk_wait.list.next;
+ list_add_tail(&t->u.tk_wait.list, q);
+ list_del(&task->u.tk_wait.links);
+ }
+ list_del(&task->u.tk_wait.list);
}
/*
@@ -139,22 +187,9 @@
struct rpc_task *task,
unsigned char queue_priority)
{
- struct list_head *q;
- struct rpc_task *t;
-
- INIT_LIST_HEAD(&task->u.tk_wait.links);
if (unlikely(queue_priority > queue->maxpriority))
queue_priority = queue->maxpriority;
- if (queue_priority > queue->priority)
- rpc_set_waitqueue_priority(queue, queue_priority);
- q = &queue->tasks[queue_priority];
- list_for_each_entry(t, q, u.tk_wait.list) {
- if (t->tk_owner == task->tk_owner) {
- list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links);
- return;
- }
- }
- list_add_tail(&task->u.tk_wait.list, q);
+ __rpc_list_enqueue_task(&queue->tasks[queue_priority], task);
}
/*
@@ -173,6 +208,7 @@
if (RPC_IS_QUEUED(task))
return;
+ INIT_LIST_HEAD(&task->u.tk_wait.timer_list);
if (RPC_IS_PRIORITY(queue))
__rpc_add_wait_queue_priority(queue, task, queue_priority);
else if (RPC_IS_SWAPPER(task))
@@ -194,13 +230,7 @@
*/
static void __rpc_remove_wait_queue_priority(struct rpc_task *task)
{
- struct rpc_task *t;
-
- if (!list_empty(&task->u.tk_wait.links)) {
- t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list);
- list_move(&t->u.tk_wait.list, &task->u.tk_wait.list);
- list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links);
- }
+ __rpc_list_dequeue_task(task);
}
/*
@@ -212,7 +242,8 @@
__rpc_disable_timer(queue, task);
if (RPC_IS_PRIORITY(queue))
__rpc_remove_wait_queue_priority(task);
- list_del(&task->u.tk_wait.list);
+ else
+ list_del(&task->u.tk_wait.list);
queue->qlen--;
dprintk("RPC: %5u removed from queue %p \"%s\"\n",
task->tk_pid, queue, rpc_qname(queue));
@@ -228,7 +259,8 @@
queue->maxpriority = nr_queues - 1;
rpc_reset_waitqueue_priority(queue);
queue->qlen = 0;
- timer_setup(&queue->timer_list.timer, __rpc_queue_timer_fn, 0);
+ queue->timer_list.expires = 0;
+ INIT_DEFERRABLE_WORK(&queue->timer_list.dwork, __rpc_queue_timer_fn);
INIT_LIST_HEAD(&queue->timer_list.list);
rpc_assign_waitqueue_name(queue, qname);
}
@@ -247,7 +279,7 @@
void rpc_destroy_wait_queue(struct rpc_wait_queue *queue)
{
- del_timer_sync(&queue->timer_list.timer);
+ cancel_delayed_work_sync(&queue->timer_list.dwork);
}
EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue);
@@ -352,7 +384,6 @@
*/
static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
struct rpc_task *task,
- rpc_action action,
unsigned char queue_priority)
{
dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n",
@@ -362,48 +393,101 @@
__rpc_add_wait_queue(q, task, queue_priority);
- WARN_ON_ONCE(task->tk_callback != NULL);
- task->tk_callback = action;
- __rpc_add_timer(q, task);
}
+static void __rpc_sleep_on_priority_timeout(struct rpc_wait_queue *q,
+ struct rpc_task *task, unsigned long timeout,
+ unsigned char queue_priority)
+{
+ if (time_is_after_jiffies(timeout)) {
+ __rpc_sleep_on_priority(q, task, queue_priority);
+ __rpc_add_timer(q, task, timeout);
+ } else
+ task->tk_status = -ETIMEDOUT;
+}
+
+static void rpc_set_tk_callback(struct rpc_task *task, rpc_action action)
+{
+ if (action && !WARN_ON_ONCE(task->tk_callback != NULL))
+ task->tk_callback = action;
+}
+
+static bool rpc_sleep_check_activated(struct rpc_task *task)
+{
+ /* We shouldn't ever put an inactive task to sleep */
+ if (WARN_ON_ONCE(!RPC_IS_ACTIVATED(task))) {
+ task->tk_status = -EIO;
+ rpc_put_task_async(task);
+ return false;
+ }
+ return true;
+}
+
+void rpc_sleep_on_timeout(struct rpc_wait_queue *q, struct rpc_task *task,
+ rpc_action action, unsigned long timeout)
+{
+ if (!rpc_sleep_check_activated(task))
+ return;
+
+ rpc_set_tk_callback(task, action);
+
+ /*
+ * Protect the queue operations.
+ */
+ spin_lock(&q->lock);
+ __rpc_sleep_on_priority_timeout(q, task, timeout, task->tk_priority);
+ spin_unlock(&q->lock);
+}
+EXPORT_SYMBOL_GPL(rpc_sleep_on_timeout);
+
void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
rpc_action action)
{
- /* We shouldn't ever put an inactive task to sleep */
- WARN_ON_ONCE(!RPC_IS_ACTIVATED(task));
- if (!RPC_IS_ACTIVATED(task)) {
- task->tk_status = -EIO;
- rpc_put_task_async(task);
+ if (!rpc_sleep_check_activated(task))
return;
- }
+ rpc_set_tk_callback(task, action);
+
+ WARN_ON_ONCE(task->tk_timeout != 0);
/*
* Protect the queue operations.
*/
- spin_lock_bh(&q->lock);
- __rpc_sleep_on_priority(q, task, action, task->tk_priority);
- spin_unlock_bh(&q->lock);
+ spin_lock(&q->lock);
+ __rpc_sleep_on_priority(q, task, task->tk_priority);
+ spin_unlock(&q->lock);
}
EXPORT_SYMBOL_GPL(rpc_sleep_on);
-void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task,
- rpc_action action, int priority)
+void rpc_sleep_on_priority_timeout(struct rpc_wait_queue *q,
+ struct rpc_task *task, unsigned long timeout, int priority)
{
- /* We shouldn't ever put an inactive task to sleep */
- WARN_ON_ONCE(!RPC_IS_ACTIVATED(task));
- if (!RPC_IS_ACTIVATED(task)) {
- task->tk_status = -EIO;
- rpc_put_task_async(task);
+ if (!rpc_sleep_check_activated(task))
return;
- }
+ priority -= RPC_PRIORITY_LOW;
/*
* Protect the queue operations.
*/
- spin_lock_bh(&q->lock);
- __rpc_sleep_on_priority(q, task, action, priority - RPC_PRIORITY_LOW);
- spin_unlock_bh(&q->lock);
+ spin_lock(&q->lock);
+ __rpc_sleep_on_priority_timeout(q, task, timeout, priority);
+ spin_unlock(&q->lock);
+}
+EXPORT_SYMBOL_GPL(rpc_sleep_on_priority_timeout);
+
+void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task,
+ int priority)
+{
+ if (!rpc_sleep_check_activated(task))
+ return;
+
+ WARN_ON_ONCE(task->tk_timeout != 0);
+ priority -= RPC_PRIORITY_LOW;
+ /*
+ * Protect the queue operations.
+ */
+ spin_lock(&q->lock);
+ __rpc_sleep_on_priority(q, task, priority);
+ spin_unlock(&q->lock);
}
EXPORT_SYMBOL_GPL(rpc_sleep_on_priority);
@@ -440,34 +524,31 @@
/*
* Wake up a queued task while the queue lock is being held
*/
-static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq,
- struct rpc_wait_queue *queue, struct rpc_task *task)
+static struct rpc_task *
+rpc_wake_up_task_on_wq_queue_action_locked(struct workqueue_struct *wq,
+ struct rpc_wait_queue *queue, struct rpc_task *task,
+ bool (*action)(struct rpc_task *, void *), void *data)
{
if (RPC_IS_QUEUED(task)) {
smp_rmb();
- if (task->tk_waitqueue == queue)
- __rpc_do_wake_up_task_on_wq(wq, queue, task);
+ if (task->tk_waitqueue == queue) {
+ if (action == NULL || action(task, data)) {
+ __rpc_do_wake_up_task_on_wq(wq, queue, task);
+ return task;
+ }
+ }
}
+ return NULL;
}
/*
* Wake up a queued task while the queue lock is being held
*/
-static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
+static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue,
+ struct rpc_task *task)
{
- rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task);
-}
-
-/*
- * Wake up a task on a specific queue
- */
-void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq,
- struct rpc_wait_queue *queue,
- struct rpc_task *task)
-{
- spin_lock_bh(&queue->lock);
- rpc_wake_up_task_on_wq_queue_locked(wq, queue, task);
- spin_unlock_bh(&queue->lock);
+ rpc_wake_up_task_on_wq_queue_action_locked(rpciod_workqueue, queue,
+ task, NULL, NULL);
}
/*
@@ -475,12 +556,48 @@
*/
void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task)
{
- spin_lock_bh(&queue->lock);
+ if (!RPC_IS_QUEUED(task))
+ return;
+ spin_lock(&queue->lock);
rpc_wake_up_task_queue_locked(queue, task);
- spin_unlock_bh(&queue->lock);
+ spin_unlock(&queue->lock);
}
EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task);
+static bool rpc_task_action_set_status(struct rpc_task *task, void *status)
+{
+ task->tk_status = *(int *)status;
+ return true;
+}
+
+static void
+rpc_wake_up_task_queue_set_status_locked(struct rpc_wait_queue *queue,
+ struct rpc_task *task, int status)
+{
+ rpc_wake_up_task_on_wq_queue_action_locked(rpciod_workqueue, queue,
+ task, rpc_task_action_set_status, &status);
+}
+
+/**
+ * rpc_wake_up_queued_task_set_status - wake up a task and set task->tk_status
+ * @queue: pointer to rpc_wait_queue
+ * @task: pointer to rpc_task
+ * @status: integer error value
+ *
+ * If @task is queued on @queue, then it is woken up, and @task->tk_status is
+ * set to the value of @status.
+ */
+void
+rpc_wake_up_queued_task_set_status(struct rpc_wait_queue *queue,
+ struct rpc_task *task, int status)
+{
+ if (!RPC_IS_QUEUED(task))
+ return;
+ spin_lock(&queue->lock);
+ rpc_wake_up_task_queue_set_status_locked(queue, task, status);
+ spin_unlock(&queue->lock);
+}
+
/*
* Wake up the next task on a priority queue.
*/
@@ -493,17 +610,9 @@
* Service a batch of tasks from a single owner.
*/
q = &queue->tasks[queue->priority];
- if (!list_empty(q)) {
- task = list_entry(q->next, struct rpc_task, u.tk_wait.list);
- if (queue->owner == task->tk_owner) {
- if (--queue->nr)
- goto out;
- list_move_tail(&task->u.tk_wait.list, q);
- }
- /*
- * Check if we need to switch queues.
- */
- goto new_owner;
+ if (!list_empty(q) && --queue->nr) {
+ task = list_first_entry(q, struct rpc_task, u.tk_wait.list);
+ goto out;
}
/*
@@ -515,7 +624,7 @@
else
q = q - 1;
if (!list_empty(q)) {
- task = list_entry(q->next, struct rpc_task, u.tk_wait.list);
+ task = list_first_entry(q, struct rpc_task, u.tk_wait.list);
goto new_queue;
}
} while (q != &queue->tasks[queue->priority]);
@@ -525,8 +634,6 @@
new_queue:
rpc_set_waitqueue_priority(queue, (unsigned int)(q - &queue->tasks[0]));
-new_owner:
- rpc_set_waitqueue_owner(queue, task->tk_owner);
out:
return task;
}
@@ -551,15 +658,12 @@
dprintk("RPC: wake_up_first(%p \"%s\")\n",
queue, rpc_qname(queue));
- spin_lock_bh(&queue->lock);
+ spin_lock(&queue->lock);
task = __rpc_find_next_queued(queue);
- if (task != NULL) {
- if (func(task, data))
- rpc_wake_up_task_on_wq_queue_locked(wq, queue, task);
- else
- task = NULL;
- }
- spin_unlock_bh(&queue->lock);
+ if (task != NULL)
+ task = rpc_wake_up_task_on_wq_queue_action_locked(wq, queue,
+ task, func, data);
+ spin_unlock(&queue->lock);
return task;
}
@@ -598,7 +702,7 @@
{
struct list_head *head;
- spin_lock_bh(&queue->lock);
+ spin_lock(&queue->lock);
head = &queue->tasks[queue->maxpriority];
for (;;) {
while (!list_empty(head)) {
@@ -612,7 +716,7 @@
break;
head--;
}
- spin_unlock_bh(&queue->lock);
+ spin_unlock(&queue->lock);
}
EXPORT_SYMBOL_GPL(rpc_wake_up);
@@ -627,7 +731,7 @@
{
struct list_head *head;
- spin_lock_bh(&queue->lock);
+ spin_lock(&queue->lock);
head = &queue->tasks[queue->maxpriority];
for (;;) {
while (!list_empty(head)) {
@@ -642,20 +746,22 @@
break;
head--;
}
- spin_unlock_bh(&queue->lock);
+ spin_unlock(&queue->lock);
}
EXPORT_SYMBOL_GPL(rpc_wake_up_status);
-static void __rpc_queue_timer_fn(struct timer_list *t)
+static void __rpc_queue_timer_fn(struct work_struct *work)
{
- struct rpc_wait_queue *queue = from_timer(queue, t, timer_list.timer);
+ struct rpc_wait_queue *queue = container_of(work,
+ struct rpc_wait_queue,
+ timer_list.dwork.work);
struct rpc_task *task, *n;
unsigned long expires, now, timeo;
spin_lock(&queue->lock);
expires = now = jiffies;
list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) {
- timeo = task->u.tk_wait.expires;
+ timeo = task->tk_timeout;
if (time_after_eq(now, timeo)) {
dprintk("RPC: %5u timeout\n", task->tk_pid);
task->tk_status = -ETIMEDOUT;
@@ -681,8 +787,7 @@
*/
void rpc_delay(struct rpc_task *task, unsigned long delay)
{
- task->tk_timeout = delay;
- rpc_sleep_on(&delay_queue, task, __rpc_atrun);
+ rpc_sleep_on_timeout(&delay_queue, task, __rpc_atrun, jiffies + delay);
}
EXPORT_SYMBOL_GPL(rpc_delay);
@@ -710,8 +815,7 @@
rpc_reset_task_statistics(struct rpc_task *task)
{
task->tk_timeouts = 0;
- task->tk_flags &= ~(RPC_CALL_MAJORSEEN|RPC_TASK_KILLED|RPC_TASK_SENT);
-
+ task->tk_flags &= ~(RPC_CALL_MAJORSEEN|RPC_TASK_SENT);
rpc_init_task_statistics(task);
}
@@ -721,10 +825,13 @@
void rpc_exit_task(struct rpc_task *task)
{
task->tk_action = NULL;
+ if (task->tk_ops->rpc_count_stats)
+ task->tk_ops->rpc_count_stats(task, task->tk_calldata);
+ else if (task->tk_client)
+ rpc_count_iostats(task, task->tk_client->cl_metrics);
if (task->tk_ops->rpc_call_done != NULL) {
task->tk_ops->rpc_call_done(task, task->tk_calldata);
if (task->tk_action != NULL) {
- WARN_ON(RPC_ASSASSINATED(task));
/* Always release the RPC slot and buffer memory */
xprt_release(task);
rpc_reset_task_statistics(task);
@@ -732,12 +839,24 @@
}
}
+void rpc_signal_task(struct rpc_task *task)
+{
+ struct rpc_wait_queue *queue;
+
+ if (!RPC_IS_ACTIVATED(task))
+ return;
+ set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate);
+ smp_mb__after_atomic();
+ queue = READ_ONCE(task->tk_waitqueue);
+ if (queue)
+ rpc_wake_up_queued_task_set_status(queue, task, -ERESTARTSYS);
+}
+
void rpc_exit(struct rpc_task *task, int status)
{
task->tk_status = status;
task->tk_action = rpc_exit_task;
- if (RPC_IS_QUEUED(task))
- rpc_wake_up_queued_task(task->tk_waitqueue, task);
+ rpc_wake_up_queued_task(task->tk_waitqueue, task);
}
EXPORT_SYMBOL_GPL(rpc_exit);
@@ -788,6 +907,15 @@
*/
if (!RPC_IS_QUEUED(task))
continue;
+
+ /*
+ * Signalled tasks should exit rather than sleep.
+ */
+ if (RPC_SIGNALLED(task)) {
+ task->tk_rpc_status = -ERESTARTSYS;
+ rpc_exit(task, -ERESTARTSYS);
+ }
+
/*
* The queue->lock protects against races with
* rpc_make_runnable().
@@ -798,13 +926,13 @@
* rpc_task pointer may still be dereferenced.
*/
queue = task->tk_waitqueue;
- spin_lock_bh(&queue->lock);
+ spin_lock(&queue->lock);
if (!RPC_IS_QUEUED(task)) {
- spin_unlock_bh(&queue->lock);
+ spin_unlock(&queue->lock);
continue;
}
rpc_clear_running(task);
- spin_unlock_bh(&queue->lock);
+ spin_unlock(&queue->lock);
if (task_is_async)
return;
@@ -813,7 +941,7 @@
status = out_of_line_wait_on_bit(&task->tk_runstate,
RPC_TASK_QUEUED, rpc_wait_bit_killable,
TASK_KILLABLE);
- if (status == -ERESTARTSYS) {
+ if (status < 0) {
/*
* When a sync task receives a signal, it exits with
* -ERESTARTSYS. In order to catch any callbacks that
@@ -821,7 +949,8 @@
* break the loop here, but go around once more.
*/
dprintk("RPC: %5u got signal\n", task->tk_pid);
- task->tk_flags |= RPC_TASK_KILLED;
+ set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate);
+ task->tk_rpc_status = -ERESTARTSYS;
rpc_exit(task, -ERESTARTSYS);
}
dprintk("RPC: %5u sync task resuming\n", task->tk_pid);
@@ -854,7 +983,10 @@
static void rpc_async_schedule(struct work_struct *work)
{
+ unsigned int pflags = memalloc_nofs_save();
+
__rpc_execute(container_of(work, struct rpc_task, u.tk_work));
+ memalloc_nofs_restore(pflags);
}
/**
@@ -873,16 +1005,13 @@
* Most requests are 'small' (under 2KiB) and can be serviced from a
* mempool, ensuring that NFS reads and writes can always proceed,
* and that there is good locality of reference for these buffers.
- *
- * In order to avoid memory starvation triggering more writebacks of
- * NFS requests, we avoid using GFP_KERNEL.
*/
int rpc_malloc(struct rpc_task *task)
{
struct rpc_rqst *rqst = task->tk_rqstp;
size_t size = rqst->rq_callsize + rqst->rq_rcvsize;
struct rpc_buffer *buf;
- gfp_t gfp = GFP_NOIO | __GFP_NOWARN;
+ gfp_t gfp = GFP_NOFS;
if (RPC_IS_SWAPPER(task))
gfp = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
@@ -947,7 +1076,10 @@
/* Initialize workqueue for async tasks */
task->tk_workqueue = task_setup_data->workqueue;
- task->tk_xprt = xprt_get(task_setup_data->rpc_xprt);
+ task->tk_xprt = rpc_task_get_xprt(task_setup_data->rpc_client,
+ xprt_get(task_setup_data->rpc_xprt));
+
+ task->tk_op_cred = get_rpccred(task_setup_data->rpc_op_cred);
if (task->tk_ops->rpc_call_prepare != NULL)
task->tk_action = rpc_prepare_task;
@@ -961,7 +1093,7 @@
static struct rpc_task *
rpc_alloc_task(void)
{
- return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOIO);
+ return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS);
}
/*
@@ -1006,6 +1138,7 @@
{
unsigned short tk_flags = task->tk_flags;
+ put_rpccred(task->tk_op_cred);
rpc_release_calldata(task->tk_ops, task->tk_calldata);
if (tk_flags & RPC_TASK_DYNAMIC) {
@@ -1016,14 +1149,17 @@
static void rpc_async_release(struct work_struct *work)
{
+ unsigned int pflags = memalloc_nofs_save();
+
rpc_free_task(container_of(work, struct rpc_task, u.tk_work));
+ memalloc_nofs_restore(pflags);
}
static void rpc_release_resources_task(struct rpc_task *task)
{
xprt_release(task);
if (task->tk_msg.rpc_cred) {
- put_rpccred(task->tk_msg.rpc_cred);
+ put_cred(task->tk_msg.rpc_cred);
task->tk_msg.rpc_cred = NULL;
}
rpc_task_release_client(task);
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index f217c34..1a864f1 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/net/sunrpc/socklib.c
*
@@ -26,7 +27,8 @@
* Possibly called several times to iterate over an sk_buff and copy
* data out of it.
*/
-size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len)
+static size_t
+xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len)
{
if (len > desc->count)
len = desc->count;
@@ -36,7 +38,6 @@
desc->offset += len;
return len;
}
-EXPORT_SYMBOL_GPL(xdr_skb_read_bits);
/**
* xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer
@@ -69,7 +70,8 @@
* @copy_actor: virtual method for copying data
*
*/
-ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor)
+static ssize_t
+xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor)
{
struct page **ppage = xdr->pages;
unsigned int len, pglen = xdr->page_len;
@@ -104,8 +106,8 @@
/* ACL likes to be lazy in allocating pages - ACLs
* are small by default but can get huge. */
- if (unlikely(*ppage == NULL)) {
- *ppage = alloc_page(GFP_ATOMIC);
+ if ((xdr->flags & XDRBUF_SPARSE_PAGES) && *ppage == NULL) {
+ *ppage = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
if (unlikely(*ppage == NULL)) {
if (copied == 0)
copied = -ENOMEM;
@@ -140,7 +142,6 @@
out:
return copied;
}
-EXPORT_SYMBOL_GPL(xdr_partial_copy_from_skb);
/**
* csum_partial_copy_to_xdr - checksum and copy data
@@ -175,7 +176,7 @@
return -1;
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
!skb->csum_complete_sw)
- netdev_rx_csum_fault(skb->dev);
+ netdev_rx_csum_fault(skb->dev, skb);
return 0;
no_checksum:
if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0)
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 71166b3..7c74197 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/net/sunrpc/stats.c
*
@@ -176,6 +177,8 @@
execute = ktime_sub(now, task->tk_start);
op_metrics->om_execute = ktime_add(op_metrics->om_execute, execute);
+ if (task->tk_status < 0)
+ op_metrics->om_error_status++;
spin_unlock(&op_metrics->om_lock);
@@ -218,13 +221,14 @@
a->om_queue = ktime_add(a->om_queue, b->om_queue);
a->om_rtt = ktime_add(a->om_rtt, b->om_rtt);
a->om_execute = ktime_add(a->om_execute, b->om_execute);
+ a->om_error_status += b->om_error_status;
}
static void _print_rpc_iostats(struct seq_file *seq, struct rpc_iostats *stats,
int op, const struct rpc_procinfo *procs)
{
_print_name(seq, op, procs);
- seq_printf(seq, "%lu %lu %lu %Lu %Lu %Lu %Lu %Lu\n",
+ seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %llu %lu\n",
stats->om_ops,
stats->om_ntrans,
stats->om_timeouts,
@@ -232,12 +236,20 @@
stats->om_bytes_recv,
ktime_to_ms(stats->om_queue),
ktime_to_ms(stats->om_rtt),
- ktime_to_ms(stats->om_execute));
+ ktime_to_ms(stats->om_execute),
+ stats->om_error_status);
+}
+
+static int do_print_stats(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *seqv)
+{
+ struct seq_file *seq = seqv;
+
+ xprt->ops->print_stats(xprt, seq);
+ return 0;
}
void rpc_clnt_show_stats(struct seq_file *seq, struct rpc_clnt *clnt)
{
- struct rpc_xprt *xprt;
unsigned int op, maxproc = clnt->cl_maxproc;
if (!clnt->cl_metrics)
@@ -247,11 +259,7 @@
seq_printf(seq, "p/v: %u/%u (%s)\n",
clnt->cl_prog, clnt->cl_vers, clnt->cl_program->name);
- rcu_read_lock();
- xprt = rcu_dereference(clnt->cl_xprt);
- if (xprt)
- xprt->ops->print_stats(xprt, seq);
- rcu_read_unlock();
+ rpc_clnt_iterate_for_each_xprt(clnt, do_print_stats, seq);
seq_printf(seq, "\tper-op statistics\n");
for (op = 0; op < maxproc; op++) {
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 56f9eff..f9edaa9 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/net/sunrpc/sunrpc_syms.c
*
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index d13e05f..d11b705 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/net/sunrpc/svc.c
*
@@ -993,6 +994,58 @@
return error;
}
+int svc_rpcbind_set_version(struct net *net,
+ const struct svc_program *progp,
+ u32 version, int family,
+ unsigned short proto,
+ unsigned short port)
+{
+ dprintk("svc: svc_register(%sv%d, %s, %u, %u)\n",
+ progp->pg_name, version,
+ proto == IPPROTO_UDP? "udp" : "tcp",
+ port, family);
+
+ return __svc_register(net, progp->pg_name, progp->pg_prog,
+ version, family, proto, port);
+
+}
+EXPORT_SYMBOL_GPL(svc_rpcbind_set_version);
+
+int svc_generic_rpcbind_set(struct net *net,
+ const struct svc_program *progp,
+ u32 version, int family,
+ unsigned short proto,
+ unsigned short port)
+{
+ const struct svc_version *vers = progp->pg_vers[version];
+ int error;
+
+ if (vers == NULL)
+ return 0;
+
+ if (vers->vs_hidden) {
+ dprintk("svc: svc_register(%sv%d, %s, %u, %u)"
+ " (but not telling portmap)\n",
+ progp->pg_name, version,
+ proto == IPPROTO_UDP? "udp" : "tcp",
+ port, family);
+ return 0;
+ }
+
+ /*
+ * Don't register a UDP port if we need congestion
+ * control.
+ */
+ if (vers->vs_need_cong_ctrl && proto == IPPROTO_UDP)
+ return 0;
+
+ error = svc_rpcbind_set_version(net, progp, version,
+ family, proto, port);
+
+ return (vers->vs_rpcb_optnl) ? 0 : error;
+}
+EXPORT_SYMBOL_GPL(svc_generic_rpcbind_set);
+
/**
* svc_register - register an RPC service with the local portmapper
* @serv: svc_serv struct for the service to register
@@ -1008,7 +1061,6 @@
const unsigned short port)
{
struct svc_program *progp;
- const struct svc_version *vers;
unsigned int i;
int error = 0;
@@ -1018,37 +1070,9 @@
for (progp = serv->sv_program; progp; progp = progp->pg_next) {
for (i = 0; i < progp->pg_nvers; i++) {
- vers = progp->pg_vers[i];
- if (vers == NULL)
- continue;
- dprintk("svc: svc_register(%sv%d, %s, %u, %u)%s\n",
- progp->pg_name,
- i,
- proto == IPPROTO_UDP? "udp" : "tcp",
- port,
- family,
- vers->vs_hidden ?
- " (but not telling portmap)" : "");
-
- if (vers->vs_hidden)
- continue;
-
- /*
- * Don't register a UDP port if we need congestion
- * control.
- */
- if (vers->vs_need_cong_ctrl && proto == IPPROTO_UDP)
- continue;
-
- error = __svc_register(net, progp->pg_name, progp->pg_prog,
- i, family, proto, port);
-
- if (vers->vs_rpcb_optnl) {
- error = 0;
- continue;
- }
-
+ error = progp->pg_rpcbind_set(net, progp, i,
+ family, proto, port);
if (error < 0) {
printk(KERN_WARNING "svc: failed to register "
"%sv%u RPC service (errno %d).\n",
@@ -1144,6 +1168,114 @@
static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) {}
#endif
+__be32
+svc_return_autherr(struct svc_rqst *rqstp, __be32 auth_err)
+{
+ set_bit(RQ_AUTHERR, &rqstp->rq_flags);
+ return auth_err;
+}
+EXPORT_SYMBOL_GPL(svc_return_autherr);
+
+static __be32
+svc_get_autherr(struct svc_rqst *rqstp, __be32 *statp)
+{
+ if (test_and_clear_bit(RQ_AUTHERR, &rqstp->rq_flags))
+ return *statp;
+ return rpc_auth_ok;
+}
+
+static int
+svc_generic_dispatch(struct svc_rqst *rqstp, __be32 *statp)
+{
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ struct kvec *resv = &rqstp->rq_res.head[0];
+ const struct svc_procedure *procp = rqstp->rq_procinfo;
+
+ /*
+ * Decode arguments
+ * XXX: why do we ignore the return value?
+ */
+ if (procp->pc_decode &&
+ !procp->pc_decode(rqstp, argv->iov_base)) {
+ *statp = rpc_garbage_args;
+ return 1;
+ }
+
+ *statp = procp->pc_func(rqstp);
+
+ if (*statp == rpc_drop_reply ||
+ test_bit(RQ_DROPME, &rqstp->rq_flags))
+ return 0;
+
+ if (test_bit(RQ_AUTHERR, &rqstp->rq_flags))
+ return 1;
+
+ if (*statp != rpc_success)
+ return 1;
+
+ /* Encode reply */
+ if (procp->pc_encode &&
+ !procp->pc_encode(rqstp, resv->iov_base + resv->iov_len)) {
+ dprintk("svc: failed to encode reply\n");
+ /* serv->sv_stats->rpcsystemerr++; */
+ *statp = rpc_system_err;
+ }
+ return 1;
+}
+
+__be32
+svc_generic_init_request(struct svc_rqst *rqstp,
+ const struct svc_program *progp,
+ struct svc_process_info *ret)
+{
+ const struct svc_version *versp = NULL; /* compiler food */
+ const struct svc_procedure *procp = NULL;
+
+ if (rqstp->rq_vers >= progp->pg_nvers )
+ goto err_bad_vers;
+ versp = progp->pg_vers[rqstp->rq_vers];
+ if (!versp)
+ goto err_bad_vers;
+
+ /*
+ * Some protocol versions (namely NFSv4) require some form of
+ * congestion control. (See RFC 7530 section 3.1 paragraph 2)
+ * In other words, UDP is not allowed. We mark those when setting
+ * up the svc_xprt, and verify that here.
+ *
+ * The spec is not very clear about what error should be returned
+ * when someone tries to access a server that is listening on UDP
+ * for lower versions. RPC_PROG_MISMATCH seems to be the closest
+ * fit.
+ */
+ if (versp->vs_need_cong_ctrl && rqstp->rq_xprt &&
+ !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags))
+ goto err_bad_vers;
+
+ if (rqstp->rq_proc >= versp->vs_nproc)
+ goto err_bad_proc;
+ rqstp->rq_procinfo = procp = &versp->vs_proc[rqstp->rq_proc];
+ if (!procp)
+ goto err_bad_proc;
+
+ /* Initialize storage for argp and resp */
+ memset(rqstp->rq_argp, 0, procp->pc_argsize);
+ memset(rqstp->rq_resp, 0, procp->pc_ressize);
+
+ /* Bump per-procedure stats counter */
+ versp->vs_count[rqstp->rq_proc]++;
+
+ ret->dispatch = versp->vs_dispatch;
+ return rpc_success;
+err_bad_vers:
+ ret->mismatch.lovers = progp->pg_lovers;
+ ret->mismatch.hivers = progp->pg_hivers;
+ return rpc_prog_mismatch;
+err_bad_proc:
+ return rpc_proc_unavail;
+}
+EXPORT_SYMBOL_GPL(svc_generic_init_request);
+
/*
* Common routine for processing the RPC request.
*/
@@ -1151,11 +1283,11 @@
svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
{
struct svc_program *progp;
- const struct svc_version *versp = NULL; /* compiler food */
const struct svc_procedure *procp = NULL;
struct svc_serv *serv = rqstp->rq_server;
+ struct svc_process_info process;
__be32 *statp;
- u32 prog, vers, proc;
+ u32 prog, vers;
__be32 auth_stat, rpc_stat;
int auth_res;
__be32 *reply_statp;
@@ -1171,9 +1303,6 @@
set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
clear_bit(RQ_DROPME, &rqstp->rq_flags);
- /* Setup reply header */
- rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
-
svc_putu32(resv, rqstp->rq_xid);
vers = svc_getnl(argv);
@@ -1190,8 +1319,8 @@
svc_putnl(resv, 0); /* ACCEPT */
rqstp->rq_prog = prog = svc_getnl(argv); /* program number */
- rqstp->rq_vers = vers = svc_getnl(argv); /* version number */
- rqstp->rq_proc = proc = svc_getnl(argv); /* procedure number */
+ rqstp->rq_vers = svc_getnl(argv); /* version number */
+ rqstp->rq_proc = svc_getnl(argv); /* procedure number */
for (progp = serv->sv_program; progp; progp = progp->pg_next)
if (prog == progp->pg_prog)
@@ -1229,29 +1358,22 @@
if (progp == NULL)
goto err_bad_prog;
- if (vers >= progp->pg_nvers ||
- !(versp = progp->pg_vers[vers]))
+ rpc_stat = progp->pg_init_request(rqstp, progp, &process);
+ switch (rpc_stat) {
+ case rpc_success:
+ break;
+ case rpc_prog_unavail:
+ goto err_bad_prog;
+ case rpc_prog_mismatch:
goto err_bad_vers;
-
- /*
- * Some protocol versions (namely NFSv4) require some form of
- * congestion control. (See RFC 7530 section 3.1 paragraph 2)
- * In other words, UDP is not allowed. We mark those when setting
- * up the svc_xprt, and verify that here.
- *
- * The spec is not very clear about what error should be returned
- * when someone tries to access a server that is listening on UDP
- * for lower versions. RPC_PROG_MISMATCH seems to be the closest
- * fit.
- */
- if (versp->vs_need_cong_ctrl &&
- !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags))
- goto err_bad_vers;
-
- procp = versp->vs_proc + proc;
- if (proc >= versp->vs_nproc || !procp->pc_func)
+ case rpc_proc_unavail:
goto err_bad_proc;
- rqstp->rq_procinfo = procp;
+ }
+
+ procp = rqstp->rq_procinfo;
+ /* Should this check go into the dispatcher? */
+ if (!procp || !procp->pc_func)
+ goto err_bad_proc;
/* Syntactic check complete */
serv->sv_stats->rpccnt++;
@@ -1261,13 +1383,6 @@
statp = resv->iov_base +resv->iov_len;
svc_putnl(resv, RPC_SUCCESS);
- /* Bump per-procedure stats counter */
- versp->vs_count[proc]++;
-
- /* Initialize storage for argp and resp */
- memset(rqstp->rq_argp, 0, procp->pc_argsize);
- memset(rqstp->rq_resp, 0, procp->pc_ressize);
-
/* un-reserve some of the out-queue now that we have a
* better idea of reply size
*/
@@ -1275,43 +1390,18 @@
svc_reserve_auth(rqstp, procp->pc_xdrressize<<2);
/* Call the function that processes the request. */
- if (!versp->vs_dispatch) {
- /*
- * Decode arguments
- * XXX: why do we ignore the return value?
- */
- if (procp->pc_decode &&
- !procp->pc_decode(rqstp, argv->iov_base))
+ if (!process.dispatch) {
+ if (!svc_generic_dispatch(rqstp, statp))
+ goto release_dropit;
+ if (*statp == rpc_garbage_args)
goto err_garbage;
-
- *statp = procp->pc_func(rqstp);
-
- /* Encode reply */
- if (*statp == rpc_drop_reply ||
- test_bit(RQ_DROPME, &rqstp->rq_flags)) {
- if (procp->pc_release)
- procp->pc_release(rqstp);
- goto dropit;
- }
- if (*statp == rpc_autherr_badcred) {
- if (procp->pc_release)
- procp->pc_release(rqstp);
- goto err_bad_auth;
- }
- if (*statp == rpc_success && procp->pc_encode &&
- !procp->pc_encode(rqstp, resv->iov_base + resv->iov_len)) {
- dprintk("svc: failed to encode reply\n");
- /* serv->sv_stats->rpcsystemerr++; */
- *statp = rpc_system_err;
- }
+ auth_stat = svc_get_autherr(rqstp, statp);
+ if (auth_stat != rpc_auth_ok)
+ goto err_release_bad_auth;
} else {
dprintk("svc: calling dispatcher\n");
- if (!versp->vs_dispatch(rqstp, statp)) {
- /* Release reply info */
- if (procp->pc_release)
- procp->pc_release(rqstp);
- goto dropit;
- }
+ if (!process.dispatch(rqstp, statp))
+ goto release_dropit; /* Release reply info */
}
/* Check RPC status result */
@@ -1330,13 +1420,16 @@
goto close;
return 1; /* Caller can now send it */
+release_dropit:
+ if (procp->pc_release)
+ procp->pc_release(rqstp);
dropit:
svc_authorise(rqstp); /* doesn't hurt to call this twice */
dprintk("svc: svc_process dropit\n");
return 0;
close:
- if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
+ if (rqstp->rq_xprt && test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
svc_close_xprt(rqstp->rq_xprt);
dprintk("svc: svc_process close\n");
return 0;
@@ -1354,6 +1447,9 @@
svc_putnl(resv, 2);
goto sendit;
+err_release_bad_auth:
+ if (procp->pc_release)
+ procp->pc_release(rqstp);
err_bad_auth:
dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat));
serv->sv_stats->rpcbadauth++;
@@ -1372,16 +1468,16 @@
err_bad_vers:
svc_printk(rqstp, "unknown version (%d for prog %d, %s)\n",
- vers, prog, progp->pg_name);
+ rqstp->rq_vers, rqstp->rq_prog, progp->pg_name);
serv->sv_stats->rpcbadfmt++;
svc_putnl(resv, RPC_PROG_MISMATCH);
- svc_putnl(resv, progp->pg_lovers);
- svc_putnl(resv, progp->pg_hivers);
+ svc_putnl(resv, process.mismatch.lovers);
+ svc_putnl(resv, process.mismatch.hivers);
goto sendit;
err_bad_proc:
- svc_printk(rqstp, "unknown procedure (%d)\n", proc);
+ svc_printk(rqstp, "unknown procedure (%d)\n", rqstp->rq_proc);
serv->sv_stats->rpcbadfmt++;
svc_putnl(resv, RPC_PROC_UNAVAIL);
@@ -1431,6 +1527,10 @@
goto out_drop;
}
+ /* Reserve space for the record marker */
+ if (rqstp->rq_prot == IPPROTO_TCP)
+ svc_putnl(resv, 0);
+
/* Returns 1 for send, 0 for drop */
if (likely(svc_process_common(rqstp, argv, resv)))
return svc_send(rqstp);
@@ -1459,10 +1559,10 @@
dprintk("svc: %s(%p)\n", __func__, req);
/* Build the svc_rqst used by the common processing routine */
- rqstp->rq_xprt = serv->sv_bc_xprt;
rqstp->rq_xid = req->rq_xid;
rqstp->rq_prot = req->rq_xprt->prot;
rqstp->rq_server = serv;
+ rqstp->rq_bc_net = req->rq_xprt->xprt_net;
rqstp->rq_addrlen = sizeof(req->rq_xprt->addr);
memcpy(&rqstp->rq_addr, &req->rq_xprt->addr, rqstp->rq_addrlen);
@@ -1495,13 +1595,13 @@
/* Parse and execute the bc call */
proc_error = svc_process_common(rqstp, argv, resv);
- atomic_inc(&req->rq_xprt->bc_free_slots);
+ atomic_dec(&req->rq_xprt->bc_slot_count);
if (!proc_error) {
/* Processing error: drop the request */
xprt_free_bc_request(req);
- return 0;
+ error = -EINVAL;
+ goto out;
}
-
/* Finally, send the reply synchronously */
memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
task = rpc_run_bc_task(req);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 83ccd02..de3c077 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/net/sunrpc/svc_xprt.c
*
@@ -34,7 +35,7 @@
/* apparently the "standard" is that clients close
* idle connections after 5 minutes, servers after
* 6 minutes
- * http://www.connectathon.org/talks96/nfstcp.pdf
+ * http://nfsv4bat.org/Documents/ConnectAThon/1996/nfstcp.pdf
*/
static int svc_conn_age_period = 6*60;
@@ -136,6 +137,7 @@
struct module *owner = xprt->xpt_class->xcl_owner;
if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags))
svcauth_unix_info_release(xprt);
+ put_cred(xprt->xpt_cred);
put_net(xprt->xpt_net);
/* See comment on corresponding get in xs_setup_bc_tcp(): */
if (xprt->xpt_bc_xprt)
@@ -171,7 +173,6 @@
mutex_init(&xprt->xpt_mutex);
spin_lock_init(&xprt->xpt_lock);
set_bit(XPT_BUSY, &xprt->xpt_flags);
- rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending");
xprt->xpt_net = get_net(net);
strcpy(xprt->xpt_remotebuf, "uninitialized");
}
@@ -253,7 +254,8 @@
static int _svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
struct net *net, const int family,
- const unsigned short port, int flags)
+ const unsigned short port, int flags,
+ const struct cred *cred)
{
struct svc_xprt_class *xcl;
@@ -274,6 +276,7 @@
module_put(xcl->xcl_owner);
return PTR_ERR(newxprt);
}
+ newxprt->xpt_cred = get_cred(cred);
svc_add_new_perm_xprt(serv, newxprt);
newport = svc_xprt_local_port(newxprt);
return newport;
@@ -287,19 +290,20 @@
int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
struct net *net, const int family,
- const unsigned short port, int flags)
+ const unsigned short port, int flags,
+ const struct cred *cred)
{
int err;
dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
- err = _svc_create_xprt(serv, xprt_name, net, family, port, flags);
+ err = _svc_create_xprt(serv, xprt_name, net, family, port, flags, cred);
if (err == -EPROTONOSUPPORT) {
request_module("svc%s", xprt_name);
- err = _svc_create_xprt(serv, xprt_name, net, family, port, flags);
+ err = _svc_create_xprt(serv, xprt_name, net, family, port, flags, cred);
}
- if (err)
+ if (err < 0)
dprintk("svc: transport %s not found, err %d\n",
- xprt_name, err);
+ xprt_name, -err);
return err;
}
EXPORT_SYMBOL_GPL(svc_create_xprt);
@@ -358,15 +362,29 @@
struct svc_xprt *xprt = rqstp->rq_xprt;
if (test_and_clear_bit(RQ_DATA, &rqstp->rq_flags)) {
atomic_dec(&xprt->xpt_nr_rqsts);
+ smp_wmb(); /* See smp_rmb() in svc_xprt_ready() */
svc_xprt_enqueue(xprt);
}
}
-static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
+static bool svc_xprt_ready(struct svc_xprt *xprt)
{
- if (xprt->xpt_flags & ((1<<XPT_CONN)|(1<<XPT_CLOSE)))
+ unsigned long xpt_flags;
+
+ /*
+ * If another cpu has recently updated xpt_flags,
+ * sk_sock->flags, xpt_reserved, or xpt_nr_rqsts, we need to
+ * know about it; otherwise it's possible that both that cpu and
+ * this one could call svc_xprt_enqueue() without either
+ * svc_xprt_enqueue() recognizing that the conditions below
+ * are satisfied, and we could stall indefinitely:
+ */
+ smp_rmb();
+ xpt_flags = READ_ONCE(xprt->xpt_flags);
+
+ if (xpt_flags & (BIT(XPT_CONN) | BIT(XPT_CLOSE)))
return true;
- if (xprt->xpt_flags & ((1<<XPT_DATA)|(1<<XPT_DEFERRED))) {
+ if (xpt_flags & (BIT(XPT_DATA) | BIT(XPT_DEFERRED))) {
if (xprt->xpt_ops->xpo_has_wspace(xprt) &&
svc_xprt_slots_in_range(xprt))
return true;
@@ -382,7 +400,7 @@
struct svc_rqst *rqstp = NULL;
int cpu;
- if (!svc_xprt_has_something_to_do(xprt))
+ if (!svc_xprt_ready(xprt))
return;
/* Mark transport as busy. It will remain in this state until
@@ -469,13 +487,14 @@
*/
void svc_reserve(struct svc_rqst *rqstp, int space)
{
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+
space += rqstp->rq_res.head[0].iov_len;
- if (space < rqstp->rq_reserved) {
- struct svc_xprt *xprt = rqstp->rq_xprt;
+ if (xprt && space < rqstp->rq_reserved) {
atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved);
rqstp->rq_reserved = space;
-
+ smp_wmb(); /* See smp_rmb() in svc_xprt_ready() */
svc_xprt_enqueue(xprt);
}
}
@@ -768,9 +787,10 @@
__module_get(xprt->xpt_class->xcl_owner);
svc_check_conn_limits(xprt->xpt_server);
newxpt = xprt->xpt_ops->xpo_accept(xprt);
- if (newxpt)
+ if (newxpt) {
+ newxpt->xpt_cred = get_cred(xprt->xpt_cred);
svc_add_new_temp_xprt(serv, newxpt);
- else
+ } else
module_put(xprt->xpt_class->xcl_owner);
} else if (svc_xprt_reserve_slot(rqstp, xprt)) {
/* XPT_DATA|XPT_DEFERRED case: */
@@ -895,7 +915,6 @@
else
len = xprt->xpt_ops->xpo_sendto(rqstp);
mutex_unlock(&xprt->xpt_mutex);
- rpc_wake_up(&xprt->xpt_bc_pending);
trace_svc_send(rqstp, len);
svc_xprt_release(rqstp);
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index bb8db3c..550b214 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/net/sunrpc/svcauth.c
*
@@ -27,12 +28,32 @@
extern struct auth_ops svcauth_null;
extern struct auth_ops svcauth_unix;
-static DEFINE_SPINLOCK(authtab_lock);
-static struct auth_ops *authtab[RPC_AUTH_MAXFLAVOR] = {
- [0] = &svcauth_null,
- [1] = &svcauth_unix,
+static struct auth_ops __rcu *authtab[RPC_AUTH_MAXFLAVOR] = {
+ [RPC_AUTH_NULL] = (struct auth_ops __force __rcu *)&svcauth_null,
+ [RPC_AUTH_UNIX] = (struct auth_ops __force __rcu *)&svcauth_unix,
};
+static struct auth_ops *
+svc_get_auth_ops(rpc_authflavor_t flavor)
+{
+ struct auth_ops *aops;
+
+ if (flavor >= RPC_AUTH_MAXFLAVOR)
+ return NULL;
+ rcu_read_lock();
+ aops = rcu_dereference(authtab[flavor]);
+ if (aops != NULL && !try_module_get(aops->owner))
+ aops = NULL;
+ rcu_read_unlock();
+ return aops;
+}
+
+static void
+svc_put_auth_ops(struct auth_ops *aops)
+{
+ module_put(aops->owner);
+}
+
int
svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
{
@@ -45,14 +66,11 @@
dprintk("svc: svc_authenticate (%d)\n", flavor);
- spin_lock(&authtab_lock);
- if (flavor >= RPC_AUTH_MAXFLAVOR || !(aops = authtab[flavor]) ||
- !try_module_get(aops->owner)) {
- spin_unlock(&authtab_lock);
+ aops = svc_get_auth_ops(flavor);
+ if (aops == NULL) {
*authp = rpc_autherr_badcred;
return SVC_DENIED;
}
- spin_unlock(&authtab_lock);
rqstp->rq_auth_slack = 0;
init_svc_cred(&rqstp->rq_cred);
@@ -82,7 +100,7 @@
if (aops) {
rv = aops->release(rqstp);
- module_put(aops->owner);
+ svc_put_auth_ops(aops);
}
return rv;
}
@@ -90,13 +108,14 @@
int
svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops)
{
+ struct auth_ops *old;
int rv = -EINVAL;
- spin_lock(&authtab_lock);
- if (flavor < RPC_AUTH_MAXFLAVOR && authtab[flavor] == NULL) {
- authtab[flavor] = aops;
- rv = 0;
+
+ if (flavor < RPC_AUTH_MAXFLAVOR) {
+ old = cmpxchg((struct auth_ops ** __force)&authtab[flavor], NULL, aops);
+ if (old == NULL || old == aops)
+ rv = 0;
}
- spin_unlock(&authtab_lock);
return rv;
}
EXPORT_SYMBOL_GPL(svc_auth_register);
@@ -104,10 +123,8 @@
void
svc_auth_unregister(rpc_authflavor_t flavor)
{
- spin_lock(&authtab_lock);
if (flavor < RPC_AUTH_MAXFLAVOR)
- authtab[flavor] = NULL;
- spin_unlock(&authtab_lock);
+ rcu_assign_pointer(authtab[flavor], NULL);
}
EXPORT_SYMBOL_GPL(svc_auth_unregister);
@@ -127,10 +144,11 @@
static DEFINE_SPINLOCK(auth_domain_lock);
static void auth_domain_release(struct kref *kref)
+ __releases(&auth_domain_lock)
{
struct auth_domain *dom = container_of(kref, struct auth_domain, ref);
- hlist_del(&dom->hash);
+ hlist_del_rcu(&dom->hash);
dom->flavour->domain_release(dom);
spin_unlock(&auth_domain_lock);
}
@@ -159,7 +177,7 @@
}
}
if (new)
- hlist_add_head(&new->hash, head);
+ hlist_add_head_rcu(&new->hash, head);
spin_unlock(&auth_domain_lock);
return new;
}
@@ -167,6 +185,21 @@
struct auth_domain *auth_domain_find(char *name)
{
- return auth_domain_lookup(name, NULL);
+ struct auth_domain *hp;
+ struct hlist_head *head;
+
+ head = &auth_domain_table[hash_str(name, DN_HASHBITS)];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(hp, head, hash) {
+ if (strcmp(hp->name, name)==0) {
+ if (!kref_get_unless_zero(&hp->ref))
+ hp = NULL;
+ rcu_read_unlock();
+ return hp;
+ }
+ }
+ rcu_read_unlock();
+ return NULL;
}
EXPORT_SYMBOL_GPL(auth_domain_find);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index af7f28f..5c04ba7 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/module.h>
@@ -37,20 +38,26 @@
extern struct auth_ops svcauth_null;
extern struct auth_ops svcauth_unix;
-static void svcauth_unix_domain_release(struct auth_domain *dom)
+static void svcauth_unix_domain_release_rcu(struct rcu_head *head)
{
+ struct auth_domain *dom = container_of(head, struct auth_domain, rcu_head);
struct unix_domain *ud = container_of(dom, struct unix_domain, h);
kfree(dom->name);
kfree(ud);
}
+static void svcauth_unix_domain_release(struct auth_domain *dom)
+{
+ call_rcu(&dom->rcu_head, svcauth_unix_domain_release_rcu);
+}
+
struct auth_domain *unix_domain_find(char *name)
{
struct auth_domain *rv;
struct unix_domain *new = NULL;
- rv = auth_domain_lookup(name, NULL);
+ rv = auth_domain_find(name);
while(1) {
if (rv) {
if (new && rv != &new->h)
@@ -91,6 +98,7 @@
char m_class[8]; /* e.g. "nfsd" */
struct in6_addr m_addr;
struct unix_domain *m_client;
+ struct rcu_head m_rcu;
};
static void ip_map_put(struct kref *kref)
@@ -101,7 +109,7 @@
if (test_bit(CACHE_VALID, &item->flags) &&
!test_bit(CACHE_NEGATIVE, &item->flags))
auth_domain_put(&im->m_client->h);
- kfree(im);
+ kfree_rcu(im, m_rcu);
}
static inline int hash_ip6(const struct in6_addr *ip)
@@ -280,9 +288,9 @@
strcpy(ip.m_class, class);
ip.m_addr = *addr;
- ch = sunrpc_cache_lookup(cd, &ip.h,
- hash_str(class, IP_HASHBITS) ^
- hash_ip6(addr));
+ ch = sunrpc_cache_lookup_rcu(cd, &ip.h,
+ hash_str(class, IP_HASHBITS) ^
+ hash_ip6(addr));
if (ch)
return container_of(ch, struct ip_map, h);
@@ -412,6 +420,7 @@
struct cache_head h;
kuid_t uid;
struct group_info *gi;
+ struct rcu_head rcu;
};
static int unix_gid_hash(kuid_t uid)
@@ -426,7 +435,7 @@
if (test_bit(CACHE_VALID, &item->flags) &&
!test_bit(CACHE_NEGATIVE, &item->flags))
put_group_info(ug->gi);
- kfree(ug);
+ kfree_rcu(ug, rcu);
}
static int unix_gid_match(struct cache_head *corig, struct cache_head *cnew)
@@ -492,7 +501,7 @@
rv = get_int(&mesg, &id);
if (rv)
return -EINVAL;
- uid = make_kuid(&init_user_ns, id);
+ uid = make_kuid(current_user_ns(), id);
ug.uid = uid;
expiry = get_expiry(&mesg);
@@ -514,7 +523,7 @@
err = -EINVAL;
if (rv)
goto out;
- kgid = make_kgid(&init_user_ns, gid);
+ kgid = make_kgid(current_user_ns(), gid);
if (!gid_valid(kgid))
goto out;
ug.gi->gid[i] = kgid;
@@ -547,7 +556,7 @@
struct cache_detail *cd,
struct cache_head *h)
{
- struct user_namespace *user_ns = &init_user_ns;
+ struct user_namespace *user_ns = m->file->f_cred->user_ns;
struct unix_gid *ug;
int i;
int glen;
@@ -619,7 +628,7 @@
struct cache_head *ch;
ug.uid = uid;
- ch = sunrpc_cache_lookup(cd, &ug.h, unix_gid_hash(uid));
+ ch = sunrpc_cache_lookup_rcu(cd, &ug.h, unix_gid_hash(uid));
if (ch)
return container_of(ch, struct unix_gid, h);
else
@@ -788,6 +797,7 @@
struct kvec *argv = &rqstp->rq_arg.head[0];
struct kvec *resv = &rqstp->rq_res.head[0];
struct svc_cred *cred = &rqstp->rq_cred;
+ struct user_namespace *userns;
u32 slen, i;
int len = argv->iov_len;
@@ -808,8 +818,10 @@
* (export-specific) anonymous id by nfsd_setuser.
* Supplementary gid's will be left alone.
*/
- cred->cr_uid = make_kuid(&init_user_ns, svc_getnl(argv)); /* uid */
- cred->cr_gid = make_kgid(&init_user_ns, svc_getnl(argv)); /* gid */
+ userns = (rqstp->rq_xprt && rqstp->rq_xprt->xpt_cred) ?
+ rqstp->rq_xprt->xpt_cred->user_ns : &init_user_ns;
+ cred->cr_uid = make_kuid(userns, svc_getnl(argv)); /* uid */
+ cred->cr_gid = make_kgid(userns, svc_getnl(argv)); /* gid */
slen = svc_getnl(argv); /* gids length */
if (slen > UNX_NGROUPS || (len -= (slen + 2)*4) < 0)
goto badcred;
@@ -817,7 +829,7 @@
if (cred->cr_group_info == NULL)
return SVC_CLOSE;
for (i = 0; i < slen; i++) {
- kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv));
+ kgid_t kgid = make_kgid(userns, svc_getnl(argv));
cred->cr_group_info->gid[i] = kgid;
}
groups_sort(cred->cr_group_info);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 5445145..2934dd7 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/net/sunrpc/svcsock.c
*
@@ -70,13 +71,6 @@
static struct svc_xprt *svc_create_socket(struct svc_serv *, int,
struct net *, struct sockaddr *,
int, int);
-#if defined(CONFIG_SUNRPC_BACKCHANNEL)
-static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int,
- struct net *, struct sockaddr *,
- int, int);
-static void svc_bc_sock_free(struct svc_xprt *xprt);
-#endif /* CONFIG_SUNRPC_BACKCHANNEL */
-
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key svc_key[2];
static struct lock_class_key svc_slock_key[2];
@@ -325,68 +319,47 @@
/*
* Generic recvfrom routine.
*/
-static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
- int buflen)
+static ssize_t svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov,
+ unsigned int nr, size_t buflen, unsigned int base)
{
struct svc_sock *svsk =
container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
- struct msghdr msg = {
- .msg_flags = MSG_DONTWAIT,
- };
- int len;
+ struct msghdr msg = { NULL };
+ ssize_t len;
rqstp->rq_xprt_hlen = 0;
clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, iov, nr, buflen);
- len = sock_recvmsg(svsk->sk_sock, &msg, msg.msg_flags);
+ iov_iter_kvec(&msg.msg_iter, READ, iov, nr, buflen);
+ if (base != 0) {
+ iov_iter_advance(&msg.msg_iter, base);
+ buflen -= base;
+ }
+ len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT);
/* If we read a full record, then assume there may be more
* data to read (stream based sockets only!)
*/
if (len == buflen)
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- dprintk("svc: socket %p recvfrom(%p, %zu) = %d\n",
+ dprintk("svc: socket %p recvfrom(%p, %zu) = %zd\n",
svsk, iov[0].iov_base, iov[0].iov_len, len);
return len;
}
-static int svc_partial_recvfrom(struct svc_rqst *rqstp,
- struct kvec *iov, int nr,
- int buflen, unsigned int base)
-{
- size_t save_iovlen;
- void *save_iovbase;
- unsigned int i;
- int ret;
-
- if (base == 0)
- return svc_recvfrom(rqstp, iov, nr, buflen);
-
- for (i = 0; i < nr; i++) {
- if (iov[i].iov_len > base)
- break;
- base -= iov[i].iov_len;
- }
- save_iovlen = iov[i].iov_len;
- save_iovbase = iov[i].iov_base;
- iov[i].iov_len -= base;
- iov[i].iov_base += base;
- ret = svc_recvfrom(rqstp, &iov[i], nr - i, buflen);
- iov[i].iov_len = save_iovlen;
- iov[i].iov_base = save_iovbase;
- return ret;
-}
-
/*
* Set socket snd and rcv buffer lengths
*/
-static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
- unsigned int rcv)
+static void svc_sock_setbufsize(struct svc_sock *svsk, unsigned int nreqs)
{
+ unsigned int max_mesg = svsk->sk_xprt.xpt_server->sv_max_mesg;
+ struct socket *sock = svsk->sk_sock;
+
+ nreqs = min(nreqs, INT_MAX / 2 / max_mesg);
+
lock_sock(sock->sk);
- sock->sk->sk_sndbuf = snd * 2;
- sock->sk->sk_rcvbuf = rcv * 2;
+ sock->sk->sk_sndbuf = nreqs * max_mesg * 2;
+ sock->sk->sk_rcvbuf = nreqs * max_mesg * 2;
sock->sk->sk_write_space(sock->sk);
release_sock(sock->sk);
}
@@ -548,9 +521,7 @@
* provides an upper bound on the number of threads
* which will access the socket.
*/
- svc_sock_setbufsize(svsk->sk_sock,
- (serv->sv_nrthreads+3) * serv->sv_max_mesg,
- (serv->sv_nrthreads+3) * serv->sv_max_mesg);
+ svc_sock_setbufsize(svsk, serv->sv_nrthreads + 3);
clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
skb = NULL;
@@ -574,7 +545,7 @@
/* Don't enable netstamp, sunrpc doesn't
need that much accuracy */
}
- svsk->sk_sk->sk_stamp = skb->tstamp;
+ sock_write_timestamp(svsk->sk_sk, skb->tstamp);
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */
len = skb->len;
@@ -642,10 +613,6 @@
return error;
}
-static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp)
-{
-}
-
static int svc_udp_has_wspace(struct svc_xprt *xprt)
{
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
@@ -689,7 +656,6 @@
.xpo_release_rqst = svc_release_udp_skb,
.xpo_detach = svc_sock_detach,
.xpo_free = svc_sock_free,
- .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
.xpo_has_wspace = svc_udp_has_wspace,
.xpo_accept = svc_udp_accept,
.xpo_secure_port = svc_sock_secure_port,
@@ -718,9 +684,7 @@
* receive and respond to one request.
* svc_udp_recvfrom will re-adjust if necessary
*/
- svc_sock_setbufsize(svsk->sk_sock,
- 3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
- 3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
+ svc_sock_setbufsize(svsk, 3);
/* data might have come in before data_ready set up */
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
@@ -962,7 +926,8 @@
want = sizeof(rpc_fraghdr) - svsk->sk_tcplen;
iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen;
iov.iov_len = want;
- if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0)
+ len = svc_recvfrom(rqstp, &iov, 1, want, 0);
+ if (len < 0)
goto error;
svsk->sk_tcplen += len;
@@ -1004,7 +969,7 @@
if (!bc_xprt)
return -EAGAIN;
- spin_lock(&bc_xprt->recv_lock);
+ spin_lock(&bc_xprt->queue_lock);
req = xprt_lookup_rqst(bc_xprt, xid);
if (!req)
goto unlock_notfound;
@@ -1022,7 +987,7 @@
memcpy(dst->iov_base, src->iov_base, src->iov_len);
xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len);
rqstp->rq_arg.len = 0;
- spin_unlock(&bc_xprt->recv_lock);
+ spin_unlock(&bc_xprt->queue_lock);
return 0;
unlock_notfound:
printk(KERN_NOTICE
@@ -1031,7 +996,7 @@
__func__, ntohl(calldir),
bc_xprt, ntohl(xid));
unlock_eagain:
- spin_unlock(&bc_xprt->recv_lock);
+ spin_unlock(&bc_xprt->queue_lock);
return -EAGAIN;
}
@@ -1088,14 +1053,13 @@
vec = rqstp->rq_vec;
- pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0],
- svsk->sk_datalen + want);
+ pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0], base + want);
rqstp->rq_respages = &rqstp->rq_pages[pnum];
rqstp->rq_next_page = rqstp->rq_respages + 1;
/* Now receive data */
- len = svc_partial_recvfrom(rqstp, vec, pnum, want, base);
+ len = svc_recvfrom(rqstp, vec, pnum, base + want, base);
if (len >= 0) {
svsk->sk_tcplen += len;
svsk->sk_datalen += len;
@@ -1195,17 +1159,6 @@
return sent;
}
-/*
- * Setup response header. TCP has a 4B record length field.
- */
-static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
-{
- struct kvec *resv = &rqstp->rq_res.head[0];
-
- /* tcp needs a space for the record length... */
- svc_putnl(resv, 0);
-}
-
static struct svc_xprt *svc_tcp_create(struct svc_serv *serv,
struct net *net,
struct sockaddr *sa, int salen,
@@ -1214,58 +1167,6 @@
return svc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags);
}
-#if defined(CONFIG_SUNRPC_BACKCHANNEL)
-static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int,
- struct net *, struct sockaddr *,
- int, int);
-static void svc_bc_sock_free(struct svc_xprt *xprt);
-
-static struct svc_xprt *svc_bc_tcp_create(struct svc_serv *serv,
- struct net *net,
- struct sockaddr *sa, int salen,
- int flags)
-{
- return svc_bc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags);
-}
-
-static void svc_bc_tcp_sock_detach(struct svc_xprt *xprt)
-{
-}
-
-static const struct svc_xprt_ops svc_tcp_bc_ops = {
- .xpo_create = svc_bc_tcp_create,
- .xpo_detach = svc_bc_tcp_sock_detach,
- .xpo_free = svc_bc_sock_free,
- .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
- .xpo_secure_port = svc_sock_secure_port,
-};
-
-static struct svc_xprt_class svc_tcp_bc_class = {
- .xcl_name = "tcp-bc",
- .xcl_owner = THIS_MODULE,
- .xcl_ops = &svc_tcp_bc_ops,
- .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
-};
-
-static void svc_init_bc_xprt_sock(void)
-{
- svc_reg_xprt_class(&svc_tcp_bc_class);
-}
-
-static void svc_cleanup_bc_xprt_sock(void)
-{
- svc_unreg_xprt_class(&svc_tcp_bc_class);
-}
-#else /* CONFIG_SUNRPC_BACKCHANNEL */
-static void svc_init_bc_xprt_sock(void)
-{
-}
-
-static void svc_cleanup_bc_xprt_sock(void)
-{
-}
-#endif /* CONFIG_SUNRPC_BACKCHANNEL */
-
static const struct svc_xprt_ops svc_tcp_ops = {
.xpo_create = svc_tcp_create,
.xpo_recvfrom = svc_tcp_recvfrom,
@@ -1273,7 +1174,6 @@
.xpo_release_rqst = svc_release_skb,
.xpo_detach = svc_tcp_sock_detach,
.xpo_free = svc_sock_free,
- .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
.xpo_has_wspace = svc_tcp_has_wspace,
.xpo_accept = svc_tcp_accept,
.xpo_secure_port = svc_sock_secure_port,
@@ -1292,14 +1192,12 @@
{
svc_reg_xprt_class(&svc_tcp_class);
svc_reg_xprt_class(&svc_udp_class);
- svc_init_bc_xprt_sock();
}
void svc_cleanup_xprt_sock(void)
{
svc_unreg_xprt_class(&svc_tcp_class);
svc_unreg_xprt_class(&svc_udp_class);
- svc_cleanup_bc_xprt_sock();
}
static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
@@ -1435,13 +1333,14 @@
* @fd: file descriptor of the new listener
* @name_return: pointer to buffer to fill in with name of listener
* @len: size of the buffer
+ * @cred: credential
*
* Fills in socket name and returns positive length of name if successful.
* Name is terminated with '\n'. On error, returns a negative errno
* value.
*/
int svc_addsock(struct svc_serv *serv, const int fd, char *name_return,
- const size_t len)
+ const size_t len, const struct cred *cred)
{
int err = 0;
struct socket *so = sockfd_lookup(fd, &err);
@@ -1474,6 +1373,7 @@
salen = kernel_getsockname(svsk->sk_sock, sin);
if (salen >= 0)
svc_xprt_set_local(&svsk->sk_xprt, sin, salen);
+ svsk->sk_xprt.xpt_cred = get_cred(cred);
svc_add_new_perm_xprt(serv, &svsk->sk_xprt);
return svc_one_sock_name(svsk, name_return, len);
out:
@@ -1620,45 +1520,3 @@
sock_release(svsk->sk_sock);
kfree(svsk);
}
-
-#if defined(CONFIG_SUNRPC_BACKCHANNEL)
-/*
- * Create a back channel svc_xprt which shares the fore channel socket.
- */
-static struct svc_xprt *svc_bc_create_socket(struct svc_serv *serv,
- int protocol,
- struct net *net,
- struct sockaddr *sin, int len,
- int flags)
-{
- struct svc_sock *svsk;
- struct svc_xprt *xprt;
-
- if (protocol != IPPROTO_TCP) {
- printk(KERN_WARNING "svc: only TCP sockets"
- " supported on shared back channel\n");
- return ERR_PTR(-EINVAL);
- }
-
- svsk = kzalloc(sizeof(*svsk), GFP_KERNEL);
- if (!svsk)
- return ERR_PTR(-ENOMEM);
-
- xprt = &svsk->sk_xprt;
- svc_xprt_init(net, &svc_tcp_bc_class, xprt, serv);
- set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
-
- serv->sv_bc_xprt = xprt;
-
- return xprt;
-}
-
-/*
- * Free a back channel svc_sock.
- */
-static void svc_bc_sock_free(struct svc_xprt *xprt)
-{
- if (xprt)
- kfree(container_of(xprt, struct svc_sock, sk_xprt));
-}
-#endif /* CONFIG_SUNRPC_BACKCHANNEL */
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index 8c39364..d75f17b 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/net/sunrpc/sysctl.c
*
@@ -89,7 +90,7 @@
left = *lenp;
if (write) {
- if (!access_ok(VERIFY_READ, buffer, left))
+ if (!access_ok(buffer, left))
return -EFAULT;
p = buffer;
while (left && __get_user(c, p) >= 0 && isspace(c))
diff --git a/net/sunrpc/timer.c b/net/sunrpc/timer.c
index 08881d0..81ae35b 100644
--- a/net/sunrpc/timer.c
+++ b/net/sunrpc/timer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/net/sunrpc/timer.c
*
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index ac16f50..14ba9e7 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/net/sunrpc/xdr.c
*
@@ -15,6 +16,8 @@
#include <linux/errno.h>
#include <linux/sunrpc/xdr.h>
#include <linux/sunrpc/msg_prot.h>
+#include <linux/bvec.h>
+#include <trace/events/sunrpc.h>
/*
* XDR functions for basic NFS types
@@ -128,6 +131,48 @@
}
EXPORT_SYMBOL_GPL(xdr_terminate_string);
+size_t
+xdr_buf_pagecount(struct xdr_buf *buf)
+{
+ if (!buf->page_len)
+ return 0;
+ return (buf->page_base + buf->page_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+}
+
+int
+xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp)
+{
+ size_t i, n = xdr_buf_pagecount(buf);
+
+ if (n != 0 && buf->bvec == NULL) {
+ buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp);
+ if (!buf->bvec)
+ return -ENOMEM;
+ for (i = 0; i < n; i++) {
+ buf->bvec[i].bv_page = buf->pages[i];
+ buf->bvec[i].bv_len = PAGE_SIZE;
+ buf->bvec[i].bv_offset = 0;
+ }
+ }
+ return 0;
+}
+
+void
+xdr_free_bvec(struct xdr_buf *buf)
+{
+ kfree(buf->bvec);
+ buf->bvec = NULL;
+}
+
+/**
+ * xdr_inline_pages - Prepare receive buffer for a large reply
+ * @xdr: xdr_buf into which reply will be placed
+ * @offset: expected offset where data payload will start, in bytes
+ * @pages: vector of struct page pointers
+ * @base: offset in first page where receive should start, in bytes
+ * @len: expected size of the upper layer data payload, in bytes
+ *
+ */
void
xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
struct page **pages, unsigned int base, unsigned int len)
@@ -145,6 +190,8 @@
tail->iov_base = buf + offset;
tail->iov_len = buflen - offset;
+ if ((xdr->page_len & 3) == 0)
+ tail->iov_len -= sizeof(__be32);
xdr->buflen += len;
}
@@ -312,13 +359,15 @@
* 'len' bytes. The extra data is not lost, but is instead
* moved into the inlined pages and/or the tail.
*/
-static void
+static unsigned int
xdr_shrink_bufhead(struct xdr_buf *buf, size_t len)
{
struct kvec *head, *tail;
size_t copy, offs;
unsigned int pglen = buf->page_len;
+ unsigned int result;
+ result = 0;
tail = buf->tail;
head = buf->head;
@@ -332,6 +381,7 @@
copy = tail->iov_len - len;
memmove((char *)tail->iov_base + len,
tail->iov_base, copy);
+ result += copy;
}
/* Copy from the inlined pages into the tail */
copy = len;
@@ -342,11 +392,13 @@
copy = 0;
else if (copy > tail->iov_len - offs)
copy = tail->iov_len - offs;
- if (copy != 0)
+ if (copy != 0) {
_copy_from_pages((char *)tail->iov_base + offs,
buf->pages,
buf->page_base + pglen + offs - len,
copy);
+ result += copy;
+ }
/* Do we also need to copy data from the head into the tail ? */
if (len > pglen) {
offs = copy = len - pglen;
@@ -356,6 +408,7 @@
(char *)head->iov_base +
head->iov_len - offs,
copy);
+ result += copy;
}
}
/* Now handle pages */
@@ -371,12 +424,15 @@
_copy_to_pages(buf->pages, buf->page_base,
(char *)head->iov_base + head->iov_len - len,
copy);
+ result += copy;
}
head->iov_len -= len;
buf->buflen -= len;
/* Have we truncated the message? */
if (buf->len > buf->buflen)
buf->len = buf->buflen;
+
+ return result;
}
/**
@@ -388,14 +444,16 @@
* 'len' bytes. The extra data is not lost, but is instead
* moved into the tail.
*/
-static void
+static unsigned int
xdr_shrink_pagelen(struct xdr_buf *buf, size_t len)
{
struct kvec *tail;
size_t copy;
unsigned int pglen = buf->page_len;
unsigned int tailbuf_len;
+ unsigned int result;
+ result = 0;
tail = buf->tail;
BUG_ON (len > pglen);
@@ -413,18 +471,22 @@
if (tail->iov_len > len) {
char *p = (char *)tail->iov_base + len;
memmove(p, tail->iov_base, tail->iov_len - len);
+ result += tail->iov_len - len;
} else
copy = tail->iov_len;
/* Copy from the inlined pages into the tail */
_copy_from_pages((char *)tail->iov_base,
buf->pages, buf->page_base + pglen - len,
copy);
+ result += copy;
}
buf->page_len -= len;
buf->buflen -= len;
/* Have we truncated the message? */
if (buf->len > buf->buflen)
buf->len = buf->buflen;
+
+ return result;
}
void
@@ -449,6 +511,7 @@
* @xdr: pointer to xdr_stream struct
* @buf: pointer to XDR buffer in which to encode data
* @p: current pointer inside XDR buffer
+ * @rqst: pointer to controlling rpc_rqst, for debugging
*
* Note: at the moment the RPC client only passes the length of our
* scratch buffer in the xdr_buf's header kvec. Previously this
@@ -457,7 +520,8 @@
* of the buffer length, and takes care of adjusting the kvec
* length for us.
*/
-void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
+void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p,
+ struct rpc_rqst *rqst)
{
struct kvec *iov = buf->head;
int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len;
@@ -479,6 +543,7 @@
buf->len += len;
iov->iov_len += len;
}
+ xdr->rqst = rqst;
}
EXPORT_SYMBOL_GPL(xdr_init_encode);
@@ -495,7 +560,7 @@
* required at the end of encoding, or any other time when the xdr_buf
* data might be read.
*/
-void xdr_commit_encode(struct xdr_stream *xdr)
+inline void xdr_commit_encode(struct xdr_stream *xdr)
{
int shift = xdr->scratch.iov_len;
void *page;
@@ -517,9 +582,9 @@
int frag1bytes, frag2bytes;
if (nbytes > PAGE_SIZE)
- return NULL; /* Bigger buffers require special handling */
+ goto out_overflow; /* Bigger buffers require special handling */
if (xdr->buf->len + nbytes > xdr->buf->buflen)
- return NULL; /* Sorry, we're totally out of space */
+ goto out_overflow; /* Sorry, we're totally out of space */
frag1bytes = (xdr->end - xdr->p) << 2;
frag2bytes = nbytes - frag1bytes;
if (xdr->iov)
@@ -548,6 +613,9 @@
xdr->buf->page_len += frag2bytes;
xdr->buf->len += nbytes;
return p;
+out_overflow:
+ trace_rpc_xdr_overflow(xdr, nbytes);
+ return NULL;
}
/**
@@ -785,8 +853,10 @@
* @xdr: pointer to xdr_stream struct
* @buf: pointer to XDR buffer from which to decode data
* @p: current pointer inside XDR buffer
+ * @rqst: pointer to controlling rpc_rqst, for debugging
*/
-void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
+void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p,
+ struct rpc_rqst *rqst)
{
xdr->buf = buf;
xdr->scratch.iov_base = NULL;
@@ -802,6 +872,7 @@
xdr->nwords -= p - xdr->p;
xdr->p = p;
}
+ xdr->rqst = rqst;
}
EXPORT_SYMBOL_GPL(xdr_init_decode);
@@ -820,7 +891,7 @@
buf->page_len = len;
buf->buflen = len;
buf->len = len;
- xdr_init_decode(xdr, buf, NULL);
+ xdr_init_decode(xdr, buf, NULL, NULL);
}
EXPORT_SYMBOL_GPL(xdr_init_decode_pages);
@@ -862,20 +933,23 @@
size_t cplen = (char *)xdr->end - (char *)xdr->p;
if (nbytes > xdr->scratch.iov_len)
- return NULL;
+ goto out_overflow;
p = __xdr_inline_decode(xdr, cplen);
if (p == NULL)
return NULL;
memcpy(cpdest, p, cplen);
+ if (!xdr_set_next_buffer(xdr))
+ goto out_overflow;
cpdest += cplen;
nbytes -= cplen;
- if (!xdr_set_next_buffer(xdr))
- return NULL;
p = __xdr_inline_decode(xdr, nbytes);
if (p == NULL)
return NULL;
memcpy(cpdest, p, nbytes);
return xdr->scratch.iov_base;
+out_overflow:
+ trace_rpc_xdr_overflow(xdr, nbytes);
+ return NULL;
}
/**
@@ -892,14 +966,17 @@
{
__be32 *p;
- if (nbytes == 0)
+ if (unlikely(nbytes == 0))
return xdr->p;
if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr))
- return NULL;
+ goto out_overflow;
p = __xdr_inline_decode(xdr, nbytes);
if (p != NULL)
return p;
return xdr_copy_to_scratch(xdr, nbytes);
+out_overflow:
+ trace_rpc_xdr_overflow(xdr, nbytes);
+ return NULL;
}
EXPORT_SYMBOL_GPL(xdr_inline_decode);
@@ -909,13 +986,17 @@
struct kvec *iov;
unsigned int nwords = XDR_QUADLEN(len);
unsigned int cur = xdr_stream_pos(xdr);
+ unsigned int copied, offset;
if (xdr->nwords == 0)
return 0;
+
/* Realign pages to current pointer position */
- iov = buf->head;
+ iov = buf->head;
if (iov->iov_len > cur) {
- xdr_shrink_bufhead(buf, iov->iov_len - cur);
+ offset = iov->iov_len - cur;
+ copied = xdr_shrink_bufhead(buf, offset);
+ trace_rpc_xdr_alignment(xdr, offset, copied);
xdr->nwords = XDR_QUADLEN(buf->len - cur);
}
@@ -927,7 +1008,9 @@
len = buf->page_len;
else if (nwords < xdr->nwords) {
/* Truncate page data and move it into the tail */
- xdr_shrink_pagelen(buf, buf->page_len - len);
+ offset = buf->page_len - len;
+ copied = xdr_shrink_pagelen(buf, offset);
+ trace_rpc_xdr_alignment(xdr, offset, copied);
xdr->nwords = XDR_QUADLEN(buf->len - cur);
}
return len;
@@ -1068,47 +1151,6 @@
}
EXPORT_SYMBOL_GPL(xdr_buf_subsegment);
-/**
- * xdr_buf_trim - lop at most "len" bytes off the end of "buf"
- * @buf: buf to be trimmed
- * @len: number of bytes to reduce "buf" by
- *
- * Trim an xdr_buf by the given number of bytes by fixing up the lengths. Note
- * that it's possible that we'll trim less than that amount if the xdr_buf is
- * too small, or if (for instance) it's all in the head and the parser has
- * already read too far into it.
- */
-void xdr_buf_trim(struct xdr_buf *buf, unsigned int len)
-{
- size_t cur;
- unsigned int trim = len;
-
- if (buf->tail[0].iov_len) {
- cur = min_t(size_t, buf->tail[0].iov_len, trim);
- buf->tail[0].iov_len -= cur;
- trim -= cur;
- if (!trim)
- goto fix_len;
- }
-
- if (buf->page_len) {
- cur = min_t(unsigned int, buf->page_len, trim);
- buf->page_len -= cur;
- trim -= cur;
- if (!trim)
- goto fix_len;
- }
-
- if (buf->head[0].iov_len) {
- cur = min_t(size_t, buf->head[0].iov_len, trim);
- buf->head[0].iov_len -= cur;
- trim -= cur;
- }
-fix_len:
- buf->len -= (len - trim);
-}
-EXPORT_SYMBOL_GPL(xdr_buf_trim);
-
static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len)
{
unsigned int this_len;
@@ -1194,43 +1236,60 @@
}
EXPORT_SYMBOL_GPL(xdr_encode_word);
-/* If the netobj starting offset bytes from the start of xdr_buf is contained
- * entirely in the head or the tail, set object to point to it; otherwise
- * try to find space for it at the end of the tail, copy it there, and
- * set obj to point to it. */
-int xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, unsigned int offset)
+/**
+ * xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf
+ * @buf: pointer to buffer containing a mic
+ * @mic: on success, returns the address of the mic
+ * @offset: the offset in buf where mic may be found
+ *
+ * This function may modify the xdr buf if the mic is found to be straddling
+ * a boundary between head, pages, and tail. On success the mic can be read
+ * from the address returned. There is no need to free the mic.
+ *
+ * Return: Success returns 0, otherwise an integer error.
+ */
+int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset)
{
struct xdr_buf subbuf;
+ unsigned int boundary;
- if (xdr_decode_word(buf, offset, &obj->len))
+ if (xdr_decode_word(buf, offset, &mic->len))
return -EFAULT;
- if (xdr_buf_subsegment(buf, &subbuf, offset + 4, obj->len))
+ offset += 4;
+
+ /* Is the mic partially in the head? */
+ boundary = buf->head[0].iov_len;
+ if (offset < boundary && (offset + mic->len) > boundary)
+ xdr_shift_buf(buf, boundary - offset);
+
+ /* Is the mic partially in the pages? */
+ boundary += buf->page_len;
+ if (offset < boundary && (offset + mic->len) > boundary)
+ xdr_shrink_pagelen(buf, boundary - offset);
+
+ if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len))
return -EFAULT;
- /* Is the obj contained entirely in the head? */
- obj->data = subbuf.head[0].iov_base;
- if (subbuf.head[0].iov_len == obj->len)
+ /* Is the mic contained entirely in the head? */
+ mic->data = subbuf.head[0].iov_base;
+ if (subbuf.head[0].iov_len == mic->len)
return 0;
- /* ..or is the obj contained entirely in the tail? */
- obj->data = subbuf.tail[0].iov_base;
- if (subbuf.tail[0].iov_len == obj->len)
+ /* ..or is the mic contained entirely in the tail? */
+ mic->data = subbuf.tail[0].iov_base;
+ if (subbuf.tail[0].iov_len == mic->len)
return 0;
- /* use end of tail as storage for obj:
- * (We don't copy to the beginning because then we'd have
- * to worry about doing a potentially overlapping copy.
- * This assumes the object is at most half the length of the
- * tail.) */
- if (obj->len > buf->buflen - buf->len)
+ /* Find a contiguous area in @buf to hold all of @mic */
+ if (mic->len > buf->buflen - buf->len)
return -ENOMEM;
if (buf->tail[0].iov_len != 0)
- obj->data = buf->tail[0].iov_base + buf->tail[0].iov_len;
+ mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len;
else
- obj->data = buf->head[0].iov_base + buf->head[0].iov_len;
- __read_bytes_from_xdr_buf(&subbuf, obj->data, obj->len);
+ mic->data = buf->head[0].iov_base + buf->head[0].iov_len;
+ __read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len);
return 0;
}
-EXPORT_SYMBOL_GPL(xdr_buf_read_netobj);
+EXPORT_SYMBOL_GPL(xdr_buf_read_mic);
/* Returns 0 on success, or else a negative error code. */
static int
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index d066aae..41df4c5 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/net/sunrpc/xprt.c
*
@@ -49,6 +50,7 @@
#include <linux/sunrpc/metrics.h>
#include <linux/sunrpc/bc_xprt.h>
#include <linux/rcupdate.h>
+#include <linux/sched/mm.h>
#include <trace/events/sunrpc.h>
@@ -67,14 +69,20 @@
*/
static void xprt_init(struct rpc_xprt *xprt, struct net *net);
static __be32 xprt_alloc_xid(struct rpc_xprt *xprt);
-static void xprt_connect_status(struct rpc_task *task);
-static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
-static void __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *);
static void xprt_destroy(struct rpc_xprt *xprt);
static DEFINE_SPINLOCK(xprt_list_lock);
static LIST_HEAD(xprt_list);
+static unsigned long xprt_request_timeout(const struct rpc_rqst *req)
+{
+ unsigned long timeout = jiffies + req->rq_timeout;
+
+ if (time_before(timeout, req->rq_majortimeo))
+ return timeout;
+ return req->rq_majortimeo;
+}
+
/**
* xprt_register_transport - register a transport implementation
* @transport: transport to register
@@ -171,6 +179,17 @@
}
EXPORT_SYMBOL_GPL(xprt_load_transport);
+static void xprt_clear_locked(struct rpc_xprt *xprt)
+{
+ xprt->snd_task = NULL;
+ if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) {
+ smp_mb__before_atomic();
+ clear_bit(XPRT_LOCKED, &xprt->state);
+ smp_mb__after_atomic();
+ } else
+ queue_work(xprtiod_workqueue, &xprt->task_cleanup);
+}
+
/**
* xprt_reserve_xprt - serialize write access to transports
* @task: task that is requesting access to the transport
@@ -183,44 +202,56 @@
int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
- int priority;
if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
if (task == xprt->snd_task)
return 1;
goto out_sleep;
}
+ if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
+ goto out_unlock;
xprt->snd_task = task;
- if (req != NULL)
- req->rq_ntrans++;
return 1;
+out_unlock:
+ xprt_clear_locked(xprt);
out_sleep:
dprintk("RPC: %5u failed to lock transport %p\n",
task->tk_pid, xprt);
- task->tk_timeout = 0;
task->tk_status = -EAGAIN;
- if (req == NULL)
- priority = RPC_PRIORITY_LOW;
- else if (!req->rq_ntrans)
- priority = RPC_PRIORITY_NORMAL;
+ if (RPC_IS_SOFT(task))
+ rpc_sleep_on_timeout(&xprt->sending, task, NULL,
+ xprt_request_timeout(req));
else
- priority = RPC_PRIORITY_HIGH;
- rpc_sleep_on_priority(&xprt->sending, task, NULL, priority);
+ rpc_sleep_on(&xprt->sending, task, NULL);
return 0;
}
EXPORT_SYMBOL_GPL(xprt_reserve_xprt);
-static void xprt_clear_locked(struct rpc_xprt *xprt)
+static bool
+xprt_need_congestion_window_wait(struct rpc_xprt *xprt)
{
- xprt->snd_task = NULL;
- if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) {
- smp_mb__before_atomic();
- clear_bit(XPRT_LOCKED, &xprt->state);
- smp_mb__after_atomic();
- } else
- queue_work(xprtiod_workqueue, &xprt->task_cleanup);
+ return test_bit(XPRT_CWND_WAIT, &xprt->state);
+}
+
+static void
+xprt_set_congestion_window_wait(struct rpc_xprt *xprt)
+{
+ if (!list_empty(&xprt->xmit_queue)) {
+ /* Peek at head of queue to see if it can make progress */
+ if (list_first_entry(&xprt->xmit_queue, struct rpc_rqst,
+ rq_xmit)->rq_cong)
+ return;
+ }
+ set_bit(XPRT_CWND_WAIT, &xprt->state);
+}
+
+static void
+xprt_test_and_clear_congestion_window_wait(struct rpc_xprt *xprt)
+{
+ if (!RPCXPRT_CONGESTED(xprt))
+ clear_bit(XPRT_CWND_WAIT, &xprt->state);
}
/*
@@ -230,11 +261,11 @@
* Same as xprt_reserve_xprt, but Van Jacobson congestion control is
* integrated into the decision of whether a request is allowed to be
* woken up and given access to the transport.
+ * Note that the lock is only granted if we know there are free slots.
*/
int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
- int priority;
if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
if (task == xprt->snd_task)
@@ -245,25 +276,22 @@
xprt->snd_task = task;
return 1;
}
- if (__xprt_get_cong(xprt, task)) {
+ if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
+ goto out_unlock;
+ if (!xprt_need_congestion_window_wait(xprt)) {
xprt->snd_task = task;
- req->rq_ntrans++;
return 1;
}
+out_unlock:
xprt_clear_locked(xprt);
out_sleep:
- if (req)
- __xprt_put_cong(xprt, req);
dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt);
- task->tk_timeout = 0;
task->tk_status = -EAGAIN;
- if (req == NULL)
- priority = RPC_PRIORITY_LOW;
- else if (!req->rq_ntrans)
- priority = RPC_PRIORITY_NORMAL;
+ if (RPC_IS_SOFT(task))
+ rpc_sleep_on_timeout(&xprt->sending, task, NULL,
+ xprt_request_timeout(req));
else
- priority = RPC_PRIORITY_HIGH;
- rpc_sleep_on_priority(&xprt->sending, task, NULL, priority);
+ rpc_sleep_on(&xprt->sending, task, NULL);
return 0;
}
EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong);
@@ -272,21 +300,19 @@
{
int retval;
- spin_lock_bh(&xprt->transport_lock);
+ if (test_bit(XPRT_LOCKED, &xprt->state) && xprt->snd_task == task)
+ return 1;
+ spin_lock(&xprt->transport_lock);
retval = xprt->ops->reserve_xprt(xprt, task);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
return retval;
}
static bool __xprt_lock_write_func(struct rpc_task *task, void *data)
{
struct rpc_xprt *xprt = data;
- struct rpc_rqst *req;
- req = task->tk_rqstp;
xprt->snd_task = task;
- if (req)
- req->rq_ntrans++;
return true;
}
@@ -294,53 +320,30 @@
{
if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
return;
-
+ if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
+ goto out_unlock;
if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
__xprt_lock_write_func, xprt))
return;
+out_unlock:
xprt_clear_locked(xprt);
}
-static bool __xprt_lock_write_cong_func(struct rpc_task *task, void *data)
-{
- struct rpc_xprt *xprt = data;
- struct rpc_rqst *req;
-
- req = task->tk_rqstp;
- if (req == NULL) {
- xprt->snd_task = task;
- return true;
- }
- if (__xprt_get_cong(xprt, task)) {
- xprt->snd_task = task;
- req->rq_ntrans++;
- return true;
- }
- return false;
-}
-
static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt)
{
if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
return;
- if (RPCXPRT_CONGESTED(xprt))
+ if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
+ goto out_unlock;
+ if (xprt_need_congestion_window_wait(xprt))
goto out_unlock;
if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
- __xprt_lock_write_cong_func, xprt))
+ __xprt_lock_write_func, xprt))
return;
out_unlock:
xprt_clear_locked(xprt);
}
-static void xprt_task_clear_bytes_sent(struct rpc_task *task)
-{
- if (task != NULL) {
- struct rpc_rqst *req = task->tk_rqstp;
- if (req != NULL)
- req->rq_bytes_sent = 0;
- }
-}
-
/**
* xprt_release_xprt - allow other requests to use a transport
* @xprt: transport with other tasks potentially waiting
@@ -351,7 +354,6 @@
void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
{
if (xprt->snd_task == task) {
- xprt_task_clear_bytes_sent(task);
xprt_clear_locked(xprt);
__xprt_lock_write_next(xprt);
}
@@ -369,7 +371,6 @@
void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
{
if (xprt->snd_task == task) {
- xprt_task_clear_bytes_sent(task);
xprt_clear_locked(xprt);
__xprt_lock_write_next_cong(xprt);
}
@@ -378,9 +379,11 @@
static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task)
{
- spin_lock_bh(&xprt->transport_lock);
+ if (xprt->snd_task != task)
+ return;
+ spin_lock(&xprt->transport_lock);
xprt->ops->release_xprt(xprt, task);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
}
/*
@@ -388,16 +391,16 @@
* overflowed. Put the task to sleep if this is the case.
*/
static int
-__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_task *task)
+__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
- struct rpc_rqst *req = task->tk_rqstp;
-
if (req->rq_cong)
return 1;
dprintk("RPC: %5u xprt_cwnd_limited cong = %lu cwnd = %lu\n",
- task->tk_pid, xprt->cong, xprt->cwnd);
- if (RPCXPRT_CONGESTED(xprt))
+ req->rq_task->tk_pid, xprt->cong, xprt->cwnd);
+ if (RPCXPRT_CONGESTED(xprt)) {
+ xprt_set_congestion_window_wait(xprt);
return 0;
+ }
req->rq_cong = 1;
xprt->cong += RPC_CWNDSCALE;
return 1;
@@ -414,10 +417,32 @@
return;
req->rq_cong = 0;
xprt->cong -= RPC_CWNDSCALE;
+ xprt_test_and_clear_congestion_window_wait(xprt);
__xprt_lock_write_next_cong(xprt);
}
/**
+ * xprt_request_get_cong - Request congestion control credits
+ * @xprt: pointer to transport
+ * @req: pointer to RPC request
+ *
+ * Useful for transports that require congestion control.
+ */
+bool
+xprt_request_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
+{
+ bool ret = false;
+
+ if (req->rq_cong)
+ return true;
+ spin_lock(&xprt->transport_lock);
+ ret = __xprt_get_cong(xprt, req) != 0;
+ spin_unlock(&xprt->transport_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xprt_request_get_cong);
+
+/**
* xprt_release_rqst_cong - housekeeping when request is complete
* @task: RPC request that recently completed
*
@@ -431,6 +456,26 @@
}
EXPORT_SYMBOL_GPL(xprt_release_rqst_cong);
+static void xprt_clear_congestion_window_wait_locked(struct rpc_xprt *xprt)
+{
+ if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state))
+ __xprt_lock_write_next_cong(xprt);
+}
+
+/*
+ * Clear the congestion window wait flag and wake up the next
+ * entry on xprt->sending
+ */
+static void
+xprt_clear_congestion_window_wait(struct rpc_xprt *xprt)
+{
+ if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state)) {
+ spin_lock(&xprt->transport_lock);
+ __xprt_lock_write_next_cong(xprt);
+ spin_unlock(&xprt->transport_lock);
+ }
+}
+
/**
* xprt_adjust_cwnd - adjust transport congestion window
* @xprt: pointer to xprt
@@ -488,89 +533,87 @@
/**
* xprt_wait_for_buffer_space - wait for transport output buffer to clear
- * @task: task to be put to sleep
- * @action: function pointer to be executed after wait
+ * @xprt: transport
*
* Note that we only set the timer for the case of RPC_IS_SOFT(), since
* we don't in general want to force a socket disconnection due to
* an incomplete RPC call transmission.
*/
-void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action)
+void xprt_wait_for_buffer_space(struct rpc_xprt *xprt)
{
- struct rpc_rqst *req = task->tk_rqstp;
- struct rpc_xprt *xprt = req->rq_xprt;
-
- task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0;
- rpc_sleep_on(&xprt->pending, task, action);
+ set_bit(XPRT_WRITE_SPACE, &xprt->state);
}
EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
+static bool
+xprt_clear_write_space_locked(struct rpc_xprt *xprt)
+{
+ if (test_and_clear_bit(XPRT_WRITE_SPACE, &xprt->state)) {
+ __xprt_lock_write_next(xprt);
+ dprintk("RPC: write space: waking waiting task on "
+ "xprt %p\n", xprt);
+ return true;
+ }
+ return false;
+}
+
/**
* xprt_write_space - wake the task waiting for transport output buffer space
* @xprt: transport with waiting tasks
*
* Can be called in a soft IRQ context, so xprt_write_space never sleeps.
*/
-void xprt_write_space(struct rpc_xprt *xprt)
+bool xprt_write_space(struct rpc_xprt *xprt)
{
- spin_lock_bh(&xprt->transport_lock);
- if (xprt->snd_task) {
- dprintk("RPC: write space: waking waiting task on "
- "xprt %p\n", xprt);
- rpc_wake_up_queued_task_on_wq(xprtiod_workqueue,
- &xprt->pending, xprt->snd_task);
- }
- spin_unlock_bh(&xprt->transport_lock);
+ bool ret;
+
+ if (!test_bit(XPRT_WRITE_SPACE, &xprt->state))
+ return false;
+ spin_lock(&xprt->transport_lock);
+ ret = xprt_clear_write_space_locked(xprt);
+ spin_unlock(&xprt->transport_lock);
+ return ret;
}
EXPORT_SYMBOL_GPL(xprt_write_space);
-/**
- * xprt_set_retrans_timeout_def - set a request's retransmit timeout
- * @task: task whose timeout is to be set
- *
- * Set a request's retransmit timeout based on the transport's
- * default timeout parameters. Used by transports that don't adjust
- * the retransmit timeout based on round-trip time estimation.
- */
-void xprt_set_retrans_timeout_def(struct rpc_task *task)
+static unsigned long xprt_abs_ktime_to_jiffies(ktime_t abstime)
{
- task->tk_timeout = task->tk_rqstp->rq_timeout;
+ s64 delta = ktime_to_ns(ktime_get() - abstime);
+ return likely(delta >= 0) ?
+ jiffies - nsecs_to_jiffies(delta) :
+ jiffies + nsecs_to_jiffies(-delta);
}
-EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def);
-/**
- * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout
- * @task: task whose timeout is to be set
- *
- * Set a request's retransmit timeout using the RTT estimator.
- */
-void xprt_set_retrans_timeout_rtt(struct rpc_task *task)
+static unsigned long xprt_calc_majortimeo(struct rpc_rqst *req)
{
- int timer = task->tk_msg.rpc_proc->p_timer;
- struct rpc_clnt *clnt = task->tk_client;
- struct rpc_rtt *rtt = clnt->cl_rtt;
- struct rpc_rqst *req = task->tk_rqstp;
- unsigned long max_timeout = clnt->cl_timeout->to_maxval;
+ const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout;
+ unsigned long majortimeo = req->rq_timeout;
- task->tk_timeout = rpc_calc_rto(rtt, timer);
- task->tk_timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries;
- if (task->tk_timeout > max_timeout || task->tk_timeout == 0)
- task->tk_timeout = max_timeout;
+ if (to->to_exponential)
+ majortimeo <<= to->to_retries;
+ else
+ majortimeo += to->to_increment * to->to_retries;
+ if (majortimeo > to->to_maxval || majortimeo == 0)
+ majortimeo = to->to_maxval;
+ return majortimeo;
}
-EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_rtt);
static void xprt_reset_majortimeo(struct rpc_rqst *req)
{
- const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout;
+ req->rq_majortimeo += xprt_calc_majortimeo(req);
+}
- req->rq_majortimeo = req->rq_timeout;
- if (to->to_exponential)
- req->rq_majortimeo <<= to->to_retries;
+static void xprt_init_majortimeo(struct rpc_task *task, struct rpc_rqst *req)
+{
+ unsigned long time_init;
+ struct rpc_xprt *xprt = req->rq_xprt;
+
+ if (likely(xprt && xprt_connected(xprt)))
+ time_init = jiffies;
else
- req->rq_majortimeo += to->to_increment * to->to_retries;
- if (req->rq_majortimeo > to->to_maxval || req->rq_majortimeo == 0)
- req->rq_majortimeo = to->to_maxval;
- req->rq_majortimeo += jiffies;
+ time_init = xprt_abs_ktime_to_jiffies(task->tk_start);
+ req->rq_timeout = task->tk_client->cl_timeout->to_initval;
+ req->rq_majortimeo = time_init + xprt_calc_majortimeo(req);
}
/**
@@ -597,9 +640,9 @@
req->rq_retries = 0;
xprt_reset_majortimeo(req);
/* Reset the RTT counters == "slow start" */
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
rpc_init_rtt(req->rq_task->tk_client->cl_rtt, to->to_initval);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
status = -ETIMEDOUT;
}
@@ -614,11 +657,13 @@
{
struct rpc_xprt *xprt =
container_of(work, struct rpc_xprt, task_cleanup);
+ unsigned int pflags = memalloc_nofs_save();
clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
xprt->ops->close(xprt);
xprt_release_write(xprt, NULL);
wake_up_bit(&xprt->state, XPRT_LOCKED);
+ memalloc_nofs_restore(pflags);
}
/**
@@ -629,10 +674,12 @@
void xprt_disconnect_done(struct rpc_xprt *xprt)
{
dprintk("RPC: disconnected transport %p\n", xprt);
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
xprt_clear_connected(xprt);
- xprt_wake_pending_tasks(xprt, -EAGAIN);
- spin_unlock_bh(&xprt->transport_lock);
+ xprt_clear_write_space_locked(xprt);
+ xprt_clear_congestion_window_wait_locked(xprt);
+ xprt_wake_pending_tasks(xprt, -ENOTCONN);
+ spin_unlock(&xprt->transport_lock);
}
EXPORT_SYMBOL_GPL(xprt_disconnect_done);
@@ -644,16 +691,34 @@
void xprt_force_disconnect(struct rpc_xprt *xprt)
{
/* Don't race with the test_bit() in xprt_clear_locked() */
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
set_bit(XPRT_CLOSE_WAIT, &xprt->state);
/* Try to schedule an autoclose RPC call */
if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
queue_work(xprtiod_workqueue, &xprt->task_cleanup);
- xprt_wake_pending_tasks(xprt, -EAGAIN);
- spin_unlock_bh(&xprt->transport_lock);
+ else if (xprt->snd_task)
+ rpc_wake_up_queued_task_set_status(&xprt->pending,
+ xprt->snd_task, -ENOTCONN);
+ spin_unlock(&xprt->transport_lock);
}
EXPORT_SYMBOL_GPL(xprt_force_disconnect);
+static unsigned int
+xprt_connect_cookie(struct rpc_xprt *xprt)
+{
+ return READ_ONCE(xprt->connect_cookie);
+}
+
+static bool
+xprt_request_retransmit_after_disconnect(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+
+ return req->rq_connect_cookie != xprt_connect_cookie(xprt) ||
+ !xprt_connected(xprt);
+}
+
/**
* xprt_conditional_disconnect - force a transport to disconnect
* @xprt: transport to disconnect
@@ -668,7 +733,7 @@
void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
{
/* Don't race with the test_bit() in xprt_clear_locked() */
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
if (cookie != xprt->connect_cookie)
goto out;
if (test_bit(XPRT_CLOSING, &xprt->state))
@@ -679,7 +744,7 @@
queue_work(xprtiod_workqueue, &xprt->task_cleanup);
xprt_wake_pending_tasks(xprt, -EAGAIN);
out:
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
}
static bool
@@ -692,7 +757,8 @@
xprt_schedule_autodisconnect(struct rpc_xprt *xprt)
__must_hold(&xprt->transport_lock)
{
- if (list_empty(&xprt->recv) && xprt_has_timer(xprt))
+ xprt->last_used = jiffies;
+ if (RB_EMPTY_ROOT(&xprt->recv_queue) && xprt_has_timer(xprt))
mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout);
}
@@ -701,18 +767,13 @@
{
struct rpc_xprt *xprt = from_timer(xprt, t, timer);
- spin_lock(&xprt->transport_lock);
- if (!list_empty(&xprt->recv))
- goto out_abort;
+ if (!RB_EMPTY_ROOT(&xprt->recv_queue))
+ return;
/* Reset xprt->last_used to avoid connect/autodisconnect cycling */
xprt->last_used = jiffies;
if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
- goto out_abort;
- spin_unlock(&xprt->transport_lock);
+ return;
queue_work(xprtiod_workqueue, &xprt->task_cleanup);
- return;
-out_abort:
- spin_unlock(&xprt->transport_lock);
}
bool xprt_lock_connect(struct rpc_xprt *xprt,
@@ -721,22 +782,21 @@
{
bool ret = false;
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
if (!test_bit(XPRT_LOCKED, &xprt->state))
goto out;
if (xprt->snd_task != task)
goto out;
- xprt_task_clear_bytes_sent(task);
xprt->snd_task = cookie;
ret = true;
out:
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
return ret;
}
void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie)
{
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
if (xprt->snd_task != cookie)
goto out;
if (!test_bit(XPRT_LOCKED, &xprt->state))
@@ -745,7 +805,7 @@
xprt->ops->release_xprt(xprt, NULL);
xprt_schedule_autodisconnect(xprt);
out:
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
wake_up_bit(&xprt->state, XPRT_LOCKED);
}
@@ -772,10 +832,9 @@
xprt->ops->close(xprt);
if (!xprt_connected(xprt)) {
- task->tk_rqstp->rq_bytes_sent = 0;
- task->tk_timeout = task->tk_rqstp->rq_timeout;
task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie;
- rpc_sleep_on(&xprt->pending, task, xprt_connect_status);
+ rpc_sleep_on_timeout(&xprt->pending, task, NULL,
+ xprt_request_timeout(task->tk_rqstp));
if (test_bit(XPRT_CLOSING, &xprt->state))
return;
@@ -794,38 +853,105 @@
xprt_release_write(xprt, task);
}
-static void xprt_connect_status(struct rpc_task *task)
+/**
+ * xprt_reconnect_delay - compute the wait before scheduling a connect
+ * @xprt: transport instance
+ *
+ */
+unsigned long xprt_reconnect_delay(const struct rpc_xprt *xprt)
{
- struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
+ unsigned long start, now = jiffies;
- if (task->tk_status == 0) {
- xprt->stat.connect_count++;
- xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start;
- dprintk("RPC: %5u xprt_connect_status: connection established\n",
- task->tk_pid);
- return;
- }
+ start = xprt->stat.connect_start + xprt->reestablish_timeout;
+ if (time_after(start, now))
+ return start - now;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xprt_reconnect_delay);
- switch (task->tk_status) {
- case -ECONNREFUSED:
- case -ECONNRESET:
- case -ECONNABORTED:
- case -ENETUNREACH:
- case -EHOSTUNREACH:
- case -EPIPE:
- case -EAGAIN:
- dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid);
- break;
- case -ETIMEDOUT:
- dprintk("RPC: %5u xprt_connect_status: connect attempt timed "
- "out\n", task->tk_pid);
- break;
- default:
- dprintk("RPC: %5u xprt_connect_status: error %d connecting to "
- "server %s\n", task->tk_pid, -task->tk_status,
- xprt->servername);
- task->tk_status = -EIO;
+/**
+ * xprt_reconnect_backoff - compute the new re-establish timeout
+ * @xprt: transport instance
+ * @init_to: initial reestablish timeout
+ *
+ */
+void xprt_reconnect_backoff(struct rpc_xprt *xprt, unsigned long init_to)
+{
+ xprt->reestablish_timeout <<= 1;
+ if (xprt->reestablish_timeout > xprt->max_reconnect_timeout)
+ xprt->reestablish_timeout = xprt->max_reconnect_timeout;
+ if (xprt->reestablish_timeout < init_to)
+ xprt->reestablish_timeout = init_to;
+}
+EXPORT_SYMBOL_GPL(xprt_reconnect_backoff);
+
+enum xprt_xid_rb_cmp {
+ XID_RB_EQUAL,
+ XID_RB_LEFT,
+ XID_RB_RIGHT,
+};
+static enum xprt_xid_rb_cmp
+xprt_xid_cmp(__be32 xid1, __be32 xid2)
+{
+ if (xid1 == xid2)
+ return XID_RB_EQUAL;
+ if ((__force u32)xid1 < (__force u32)xid2)
+ return XID_RB_LEFT;
+ return XID_RB_RIGHT;
+}
+
+static struct rpc_rqst *
+xprt_request_rb_find(struct rpc_xprt *xprt, __be32 xid)
+{
+ struct rb_node *n = xprt->recv_queue.rb_node;
+ struct rpc_rqst *req;
+
+ while (n != NULL) {
+ req = rb_entry(n, struct rpc_rqst, rq_recv);
+ switch (xprt_xid_cmp(xid, req->rq_xid)) {
+ case XID_RB_LEFT:
+ n = n->rb_left;
+ break;
+ case XID_RB_RIGHT:
+ n = n->rb_right;
+ break;
+ case XID_RB_EQUAL:
+ return req;
+ }
}
+ return NULL;
+}
+
+static void
+xprt_request_rb_insert(struct rpc_xprt *xprt, struct rpc_rqst *new)
+{
+ struct rb_node **p = &xprt->recv_queue.rb_node;
+ struct rb_node *n = NULL;
+ struct rpc_rqst *req;
+
+ while (*p != NULL) {
+ n = *p;
+ req = rb_entry(n, struct rpc_rqst, rq_recv);
+ switch(xprt_xid_cmp(new->rq_xid, req->rq_xid)) {
+ case XID_RB_LEFT:
+ p = &n->rb_left;
+ break;
+ case XID_RB_RIGHT:
+ p = &n->rb_right;
+ break;
+ case XID_RB_EQUAL:
+ WARN_ON_ONCE(new != req);
+ return;
+ }
+ }
+ rb_link_node(&new->rq_recv, n, p);
+ rb_insert_color(&new->rq_recv, &xprt->recv_queue);
+}
+
+static void
+xprt_request_rb_remove(struct rpc_xprt *xprt, struct rpc_rqst *req)
+{
+ rb_erase(&req->rq_recv, &xprt->recv_queue);
}
/**
@@ -833,18 +959,18 @@
* @xprt: transport on which the original request was transmitted
* @xid: RPC XID of incoming reply
*
- * Caller holds xprt->recv_lock.
+ * Caller holds xprt->queue_lock.
*/
struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
{
struct rpc_rqst *entry;
- list_for_each_entry(entry, &xprt->recv, rq_list)
- if (entry->rq_xid == xid) {
- trace_xprt_lookup_rqst(xprt, xid, 0);
- entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime);
- return entry;
- }
+ entry = xprt_request_rb_find(xprt, xid);
+ if (entry != NULL) {
+ trace_xprt_lookup_rqst(xprt, xid, 0);
+ entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime);
+ return entry;
+ }
dprintk("RPC: xprt_lookup_rqst did not find xid %08x\n",
ntohl(xid));
@@ -854,16 +980,22 @@
}
EXPORT_SYMBOL_GPL(xprt_lookup_rqst);
+static bool
+xprt_is_pinned_rqst(struct rpc_rqst *req)
+{
+ return atomic_read(&req->rq_pin) != 0;
+}
+
/**
* xprt_pin_rqst - Pin a request on the transport receive list
* @req: Request to pin
*
* Caller must ensure this is atomic with the call to xprt_lookup_rqst()
- * so should be holding the xprt transport lock.
+ * so should be holding xprt->queue_lock.
*/
void xprt_pin_rqst(struct rpc_rqst *req)
{
- set_bit(RPC_TASK_MSG_RECV, &req->rq_task->tk_runstate);
+ atomic_inc(&req->rq_pin);
}
EXPORT_SYMBOL_GPL(xprt_pin_rqst);
@@ -871,38 +1003,88 @@
* xprt_unpin_rqst - Unpin a request on the transport receive list
* @req: Request to pin
*
- * Caller should be holding the xprt transport lock.
+ * Caller should be holding xprt->queue_lock.
*/
void xprt_unpin_rqst(struct rpc_rqst *req)
{
- struct rpc_task *task = req->rq_task;
-
- clear_bit(RPC_TASK_MSG_RECV, &task->tk_runstate);
- if (test_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate))
- wake_up_bit(&task->tk_runstate, RPC_TASK_MSG_RECV);
+ if (!test_bit(RPC_TASK_MSG_PIN_WAIT, &req->rq_task->tk_runstate)) {
+ atomic_dec(&req->rq_pin);
+ return;
+ }
+ if (atomic_dec_and_test(&req->rq_pin))
+ wake_up_var(&req->rq_pin);
}
EXPORT_SYMBOL_GPL(xprt_unpin_rqst);
static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req)
-__must_hold(&req->rq_xprt->recv_lock)
{
- struct rpc_task *task = req->rq_task;
+ wait_var_event(&req->rq_pin, !xprt_is_pinned_rqst(req));
+}
- if (task && test_bit(RPC_TASK_MSG_RECV, &task->tk_runstate)) {
- spin_unlock(&req->rq_xprt->recv_lock);
- set_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate);
- wait_on_bit(&task->tk_runstate, RPC_TASK_MSG_RECV,
- TASK_UNINTERRUPTIBLE);
- clear_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate);
- spin_lock(&req->rq_xprt->recv_lock);
- }
+static bool
+xprt_request_data_received(struct rpc_task *task)
+{
+ return !test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) &&
+ READ_ONCE(task->tk_rqstp->rq_reply_bytes_recvd) != 0;
+}
+
+static bool
+xprt_request_need_enqueue_receive(struct rpc_task *task, struct rpc_rqst *req)
+{
+ return !test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) &&
+ READ_ONCE(task->tk_rqstp->rq_reply_bytes_recvd) == 0;
+}
+
+/**
+ * xprt_request_enqueue_receive - Add an request to the receive queue
+ * @task: RPC task
+ *
+ */
+void
+xprt_request_enqueue_receive(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+
+ if (!xprt_request_need_enqueue_receive(task, req))
+ return;
+
+ xprt_request_prepare(task->tk_rqstp);
+ spin_lock(&xprt->queue_lock);
+
+ /* Update the softirq receive buffer */
+ memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
+ sizeof(req->rq_private_buf));
+
+ /* Add request to the receive list */
+ xprt_request_rb_insert(xprt, req);
+ set_bit(RPC_TASK_NEED_RECV, &task->tk_runstate);
+ spin_unlock(&xprt->queue_lock);
+
+ /* Turn off autodisconnect */
+ del_singleshot_timer_sync(&xprt->timer);
+}
+
+/**
+ * xprt_request_dequeue_receive_locked - Remove a request from the receive queue
+ * @task: RPC task
+ *
+ * Caller must hold xprt->queue_lock.
+ */
+static void
+xprt_request_dequeue_receive_locked(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+
+ if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate))
+ xprt_request_rb_remove(req->rq_xprt, req);
}
/**
* xprt_update_rtt - Update RPC RTT statistics
* @task: RPC request that recently completed
*
- * Caller holds xprt->recv_lock.
+ * Caller holds xprt->queue_lock.
*/
void xprt_update_rtt(struct rpc_task *task)
{
@@ -924,7 +1106,7 @@
* @task: RPC request that recently completed
* @copied: actual number of bytes received from the transport
*
- * Caller holds xprt->recv_lock.
+ * Caller holds xprt->queue_lock.
*/
void xprt_complete_rqst(struct rpc_task *task, int copied)
{
@@ -937,12 +1119,12 @@
xprt->stat.recvs++;
- list_del_init(&req->rq_list);
req->rq_private_buf.len = copied;
/* Ensure all writes are done before we update */
/* req->rq_reply_bytes_recvd */
smp_wmb();
req->rq_reply_bytes_recvd = copied;
+ xprt_request_dequeue_receive_locked(task);
rpc_wake_up_queued_task(&xprt->pending, task);
}
EXPORT_SYMBOL_GPL(xprt_complete_rqst);
@@ -964,6 +1146,249 @@
}
/**
+ * xprt_wait_for_reply_request_def - wait for reply
+ * @task: pointer to rpc_task
+ *
+ * Set a request's retransmit timeout based on the transport's
+ * default timeout parameters. Used by transports that don't adjust
+ * the retransmit timeout based on round-trip time estimation,
+ * and put the task to sleep on the pending queue.
+ */
+void xprt_wait_for_reply_request_def(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+
+ rpc_sleep_on_timeout(&req->rq_xprt->pending, task, xprt_timer,
+ xprt_request_timeout(req));
+}
+EXPORT_SYMBOL_GPL(xprt_wait_for_reply_request_def);
+
+/**
+ * xprt_wait_for_reply_request_rtt - wait for reply using RTT estimator
+ * @task: pointer to rpc_task
+ *
+ * Set a request's retransmit timeout using the RTT estimator,
+ * and put the task to sleep on the pending queue.
+ */
+void xprt_wait_for_reply_request_rtt(struct rpc_task *task)
+{
+ int timer = task->tk_msg.rpc_proc->p_timer;
+ struct rpc_clnt *clnt = task->tk_client;
+ struct rpc_rtt *rtt = clnt->cl_rtt;
+ struct rpc_rqst *req = task->tk_rqstp;
+ unsigned long max_timeout = clnt->cl_timeout->to_maxval;
+ unsigned long timeout;
+
+ timeout = rpc_calc_rto(rtt, timer);
+ timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries;
+ if (timeout > max_timeout || timeout == 0)
+ timeout = max_timeout;
+ rpc_sleep_on_timeout(&req->rq_xprt->pending, task, xprt_timer,
+ jiffies + timeout);
+}
+EXPORT_SYMBOL_GPL(xprt_wait_for_reply_request_rtt);
+
+/**
+ * xprt_request_wait_receive - wait for the reply to an RPC request
+ * @task: RPC task about to send a request
+ *
+ */
+void xprt_request_wait_receive(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+
+ if (!test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate))
+ return;
+ /*
+ * Sleep on the pending queue if we're expecting a reply.
+ * The spinlock ensures atomicity between the test of
+ * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on().
+ */
+ spin_lock(&xprt->queue_lock);
+ if (test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) {
+ xprt->ops->wait_for_reply_request(task);
+ /*
+ * Send an extra queue wakeup call if the
+ * connection was dropped in case the call to
+ * rpc_sleep_on() raced.
+ */
+ if (xprt_request_retransmit_after_disconnect(task))
+ rpc_wake_up_queued_task_set_status(&xprt->pending,
+ task, -ENOTCONN);
+ }
+ spin_unlock(&xprt->queue_lock);
+}
+
+static bool
+xprt_request_need_enqueue_transmit(struct rpc_task *task, struct rpc_rqst *req)
+{
+ return !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate);
+}
+
+/**
+ * xprt_request_enqueue_transmit - queue a task for transmission
+ * @task: pointer to rpc_task
+ *
+ * Add a task to the transmission queue.
+ */
+void
+xprt_request_enqueue_transmit(struct rpc_task *task)
+{
+ struct rpc_rqst *pos, *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+
+ if (xprt_request_need_enqueue_transmit(task, req)) {
+ req->rq_bytes_sent = 0;
+ spin_lock(&xprt->queue_lock);
+ /*
+ * Requests that carry congestion control credits are added
+ * to the head of the list to avoid starvation issues.
+ */
+ if (req->rq_cong) {
+ xprt_clear_congestion_window_wait(xprt);
+ list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) {
+ if (pos->rq_cong)
+ continue;
+ /* Note: req is added _before_ pos */
+ list_add_tail(&req->rq_xmit, &pos->rq_xmit);
+ INIT_LIST_HEAD(&req->rq_xmit2);
+ trace_xprt_enq_xmit(task, 1);
+ goto out;
+ }
+ } else if (RPC_IS_SWAPPER(task)) {
+ list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) {
+ if (pos->rq_cong || pos->rq_bytes_sent)
+ continue;
+ if (RPC_IS_SWAPPER(pos->rq_task))
+ continue;
+ /* Note: req is added _before_ pos */
+ list_add_tail(&req->rq_xmit, &pos->rq_xmit);
+ INIT_LIST_HEAD(&req->rq_xmit2);
+ trace_xprt_enq_xmit(task, 2);
+ goto out;
+ }
+ } else if (!req->rq_seqno) {
+ list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) {
+ if (pos->rq_task->tk_owner != task->tk_owner)
+ continue;
+ list_add_tail(&req->rq_xmit2, &pos->rq_xmit2);
+ INIT_LIST_HEAD(&req->rq_xmit);
+ trace_xprt_enq_xmit(task, 3);
+ goto out;
+ }
+ }
+ list_add_tail(&req->rq_xmit, &xprt->xmit_queue);
+ INIT_LIST_HEAD(&req->rq_xmit2);
+ trace_xprt_enq_xmit(task, 4);
+out:
+ set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate);
+ spin_unlock(&xprt->queue_lock);
+ }
+}
+
+/**
+ * xprt_request_dequeue_transmit_locked - remove a task from the transmission queue
+ * @task: pointer to rpc_task
+ *
+ * Remove a task from the transmission queue
+ * Caller must hold xprt->queue_lock
+ */
+static void
+xprt_request_dequeue_transmit_locked(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+
+ if (!test_and_clear_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
+ return;
+ if (!list_empty(&req->rq_xmit)) {
+ list_del(&req->rq_xmit);
+ if (!list_empty(&req->rq_xmit2)) {
+ struct rpc_rqst *next = list_first_entry(&req->rq_xmit2,
+ struct rpc_rqst, rq_xmit2);
+ list_del(&req->rq_xmit2);
+ list_add_tail(&next->rq_xmit, &next->rq_xprt->xmit_queue);
+ }
+ } else
+ list_del(&req->rq_xmit2);
+}
+
+/**
+ * xprt_request_dequeue_transmit - remove a task from the transmission queue
+ * @task: pointer to rpc_task
+ *
+ * Remove a task from the transmission queue
+ */
+static void
+xprt_request_dequeue_transmit(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+
+ spin_lock(&xprt->queue_lock);
+ xprt_request_dequeue_transmit_locked(task);
+ spin_unlock(&xprt->queue_lock);
+}
+
+/**
+ * xprt_request_dequeue_xprt - remove a task from the transmit+receive queue
+ * @task: pointer to rpc_task
+ *
+ * Remove a task from the transmit and receive queues, and ensure that
+ * it is not pinned by the receive work item.
+ */
+void
+xprt_request_dequeue_xprt(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+
+ if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) ||
+ test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) ||
+ xprt_is_pinned_rqst(req)) {
+ spin_lock(&xprt->queue_lock);
+ xprt_request_dequeue_transmit_locked(task);
+ xprt_request_dequeue_receive_locked(task);
+ while (xprt_is_pinned_rqst(req)) {
+ set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
+ spin_unlock(&xprt->queue_lock);
+ xprt_wait_on_pinned_rqst(req);
+ spin_lock(&xprt->queue_lock);
+ clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
+ }
+ spin_unlock(&xprt->queue_lock);
+ }
+}
+
+/**
+ * xprt_request_prepare - prepare an encoded request for transport
+ * @req: pointer to rpc_rqst
+ *
+ * Calls into the transport layer to do whatever is needed to prepare
+ * the request for transmission or receive.
+ */
+void
+xprt_request_prepare(struct rpc_rqst *req)
+{
+ struct rpc_xprt *xprt = req->rq_xprt;
+
+ if (xprt->ops->prepare_request)
+ xprt->ops->prepare_request(req);
+}
+
+/**
+ * xprt_request_need_retransmit - Test if a task needs retransmission
+ * @task: pointer to rpc_task
+ *
+ * Test for whether a connection breakage requires the task to retransmit
+ */
+bool
+xprt_request_need_retransmit(struct rpc_task *task)
+{
+ return xprt_request_retransmit_after_disconnect(task);
+}
+
+/**
* xprt_prepare_transmit - reserve the transport before sending a request
* @task: RPC task about to send a request
*
@@ -972,32 +1397,18 @@
{
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
- bool ret = false;
dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid);
- spin_lock_bh(&xprt->transport_lock);
- if (!req->rq_bytes_sent) {
- if (req->rq_reply_bytes_recvd) {
- task->tk_status = req->rq_reply_bytes_recvd;
- goto out_unlock;
- }
- if ((task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT)
- && xprt_connected(xprt)
- && req->rq_connect_cookie == xprt->connect_cookie) {
- xprt->ops->set_retrans_timeout(task);
- rpc_sleep_on(&xprt->pending, task, xprt_timer);
- goto out_unlock;
- }
+ if (!xprt_lock_write(xprt, task)) {
+ /* Race breaker: someone may have transmitted us */
+ if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
+ rpc_wake_up_queued_task_set_status(&xprt->sending,
+ task, 0);
+ return false;
+
}
- if (!xprt->ops->reserve_xprt(xprt, task)) {
- task->tk_status = -EAGAIN;
- goto out_unlock;
- }
- ret = true;
-out_unlock:
- spin_unlock_bh(&xprt->transport_lock);
- return ret;
+ return true;
}
void xprt_end_transmit(struct rpc_task *task)
@@ -1006,81 +1417,115 @@
}
/**
- * xprt_transmit - send an RPC request on a transport
- * @task: controlling RPC task
+ * xprt_request_transmit - send an RPC request on a transport
+ * @req: pointer to request to transmit
+ * @snd_task: RPC task that owns the transport lock
*
- * We have to copy the iovec because sendmsg fiddles with its contents.
+ * This performs the transmission of a single request.
+ * Note that if the request is not the same as snd_task, then it
+ * does need to be pinned.
+ * Returns '0' on success.
*/
-void xprt_transmit(struct rpc_task *task)
+static int
+xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task)
{
- struct rpc_rqst *req = task->tk_rqstp;
- struct rpc_xprt *xprt = req->rq_xprt;
+ struct rpc_xprt *xprt = req->rq_xprt;
+ struct rpc_task *task = req->rq_task;
unsigned int connect_cookie;
+ int is_retrans = RPC_WAS_SENT(task);
int status;
- dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen);
-
- if (!req->rq_reply_bytes_recvd) {
- if (list_empty(&req->rq_list) && rpc_reply_expected(task)) {
- /*
- * Add to the list only if we're expecting a reply
- */
- /* Update the softirq receive buffer */
- memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
- sizeof(req->rq_private_buf));
- /* Add request to the receive list */
- spin_lock(&xprt->recv_lock);
- list_add_tail(&req->rq_list, &xprt->recv);
- spin_unlock(&xprt->recv_lock);
- xprt_reset_majortimeo(req);
- /* Turn off autodisconnect */
- del_singleshot_timer_sync(&xprt->timer);
+ if (!req->rq_bytes_sent) {
+ if (xprt_request_data_received(task)) {
+ status = 0;
+ goto out_dequeue;
}
- } else if (!req->rq_bytes_sent)
- return;
+ /* Verify that our message lies in the RPCSEC_GSS window */
+ if (rpcauth_xmit_need_reencode(task)) {
+ status = -EBADMSG;
+ goto out_dequeue;
+ }
+ if (RPC_SIGNALLED(task)) {
+ status = -ERESTARTSYS;
+ goto out_dequeue;
+ }
+ }
+
+ /*
+ * Update req->rq_ntrans before transmitting to avoid races with
+ * xprt_update_rtt(), which needs to know that it is recording a
+ * reply to the first transmission.
+ */
+ req->rq_ntrans++;
connect_cookie = xprt->connect_cookie;
- status = xprt->ops->send_request(task);
- trace_xprt_transmit(xprt, req->rq_xid, status);
+ status = xprt->ops->send_request(req);
if (status != 0) {
- task->tk_status = status;
- return;
+ req->rq_ntrans--;
+ trace_xprt_transmit(req, status);
+ return status;
}
+
+ if (is_retrans)
+ task->tk_client->cl_stats->rpcretrans++;
+
xprt_inject_disconnect(xprt);
- dprintk("RPC: %5u xmit complete\n", task->tk_pid);
task->tk_flags |= RPC_TASK_SENT;
- spin_lock_bh(&xprt->transport_lock);
-
- xprt->ops->set_retrans_timeout(task);
+ spin_lock(&xprt->transport_lock);
xprt->stat.sends++;
xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs;
xprt->stat.bklog_u += xprt->backlog.qlen;
xprt->stat.sending_u += xprt->sending.qlen;
xprt->stat.pending_u += xprt->pending.qlen;
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
req->rq_connect_cookie = connect_cookie;
- if (rpc_reply_expected(task) && !READ_ONCE(req->rq_reply_bytes_recvd)) {
- /*
- * Sleep on the pending queue if we're expecting a reply.
- * The spinlock ensures atomicity between the test of
- * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on().
- */
- spin_lock(&xprt->recv_lock);
- if (!req->rq_reply_bytes_recvd) {
- rpc_sleep_on(&xprt->pending, task, xprt_timer);
- /*
- * Send an extra queue wakeup call if the
- * connection was dropped in case the call to
- * rpc_sleep_on() raced.
- */
- if (!xprt_connected(xprt))
- xprt_wake_pending_tasks(xprt, -ENOTCONN);
- }
- spin_unlock(&xprt->recv_lock);
+out_dequeue:
+ trace_xprt_transmit(req, status);
+ xprt_request_dequeue_transmit(task);
+ rpc_wake_up_queued_task_set_status(&xprt->sending, task, status);
+ return status;
+}
+
+/**
+ * xprt_transmit - send an RPC request on a transport
+ * @task: controlling RPC task
+ *
+ * Attempts to drain the transmit queue. On exit, either the transport
+ * signalled an error that needs to be handled before transmission can
+ * resume, or @task finished transmitting, and detected that it already
+ * received a reply.
+ */
+void
+xprt_transmit(struct rpc_task *task)
+{
+ struct rpc_rqst *next, *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+ int status;
+
+ spin_lock(&xprt->queue_lock);
+ while (!list_empty(&xprt->xmit_queue)) {
+ next = list_first_entry(&xprt->xmit_queue,
+ struct rpc_rqst, rq_xmit);
+ xprt_pin_rqst(next);
+ spin_unlock(&xprt->queue_lock);
+ status = xprt_request_transmit(next, task);
+ if (status == -EBADMSG && next != req)
+ status = 0;
+ cond_resched();
+ spin_lock(&xprt->queue_lock);
+ xprt_unpin_rqst(next);
+ if (status == 0) {
+ if (!xprt_request_data_received(task) ||
+ test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
+ continue;
+ } else if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
+ task->tk_status = status;
+ break;
}
+ spin_unlock(&xprt->queue_lock);
}
static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task)
@@ -1177,20 +1622,6 @@
}
EXPORT_SYMBOL_GPL(xprt_alloc_slot);
-void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
-{
- /* Note: grabbing the xprt_lock_write() ensures that we throttle
- * new slot allocation if the transport is congested (i.e. when
- * reconnecting a stream transport or when out of socket write
- * buffer space).
- */
- if (xprt_lock_write(xprt, task)) {
- xprt_alloc_slot(xprt, task);
- xprt_release_write(xprt, task);
- }
-}
-EXPORT_SYMBOL_GPL(xprt_lock_and_alloc_slot);
-
void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
spin_lock(&xprt->reserve_lock);
@@ -1257,6 +1688,60 @@
}
EXPORT_SYMBOL_GPL(xprt_free);
+static void
+xprt_init_connect_cookie(struct rpc_rqst *req, struct rpc_xprt *xprt)
+{
+ req->rq_connect_cookie = xprt_connect_cookie(xprt) - 1;
+}
+
+static __be32
+xprt_alloc_xid(struct rpc_xprt *xprt)
+{
+ __be32 xid;
+
+ spin_lock(&xprt->reserve_lock);
+ xid = (__force __be32)xprt->xid++;
+ spin_unlock(&xprt->reserve_lock);
+ return xid;
+}
+
+static void
+xprt_init_xid(struct rpc_xprt *xprt)
+{
+ xprt->xid = prandom_u32();
+}
+
+static void
+xprt_request_init(struct rpc_task *task)
+{
+ struct rpc_xprt *xprt = task->tk_xprt;
+ struct rpc_rqst *req = task->tk_rqstp;
+
+ req->rq_task = task;
+ req->rq_xprt = xprt;
+ req->rq_buffer = NULL;
+ req->rq_xid = xprt_alloc_xid(xprt);
+ xprt_init_connect_cookie(req, xprt);
+ req->rq_snd_buf.len = 0;
+ req->rq_snd_buf.buflen = 0;
+ req->rq_rcv_buf.len = 0;
+ req->rq_rcv_buf.buflen = 0;
+ req->rq_snd_buf.bvec = NULL;
+ req->rq_rcv_buf.bvec = NULL;
+ req->rq_release_snd_buf = NULL;
+ xprt_init_majortimeo(task, req);
+ dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
+ req, ntohl(req->rq_xid));
+}
+
+static void
+xprt_do_reserve(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ xprt->ops->alloc_slot(xprt, task);
+ if (task->tk_rqstp != NULL)
+ xprt_request_init(task);
+}
+
/**
* xprt_reserve - allocate an RPC request slot
* @task: RPC task requesting a slot allocation
@@ -1273,10 +1758,9 @@
if (task->tk_rqstp != NULL)
return;
- task->tk_timeout = 0;
task->tk_status = -EAGAIN;
if (!xprt_throttle_congested(xprt, task))
- xprt->ops->alloc_slot(xprt, task);
+ xprt_do_reserve(xprt, task);
}
/**
@@ -1296,47 +1780,8 @@
if (task->tk_rqstp != NULL)
return;
- task->tk_timeout = 0;
task->tk_status = -EAGAIN;
- xprt->ops->alloc_slot(xprt, task);
-}
-
-static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt)
-{
- __be32 xid;
-
- spin_lock(&xprt->reserve_lock);
- xid = (__force __be32)xprt->xid++;
- spin_unlock(&xprt->reserve_lock);
- return xid;
-}
-
-static inline void xprt_init_xid(struct rpc_xprt *xprt)
-{
- xprt->xid = prandom_u32();
-}
-
-void xprt_request_init(struct rpc_task *task)
-{
- struct rpc_xprt *xprt = task->tk_xprt;
- struct rpc_rqst *req = task->tk_rqstp;
-
- INIT_LIST_HEAD(&req->rq_list);
- req->rq_timeout = task->tk_client->cl_timeout->to_initval;
- req->rq_task = task;
- req->rq_xprt = xprt;
- req->rq_buffer = NULL;
- req->rq_xid = xprt_alloc_xid(xprt);
- req->rq_connect_cookie = xprt->connect_cookie - 1;
- req->rq_bytes_sent = 0;
- req->rq_snd_buf.len = 0;
- req->rq_snd_buf.buflen = 0;
- req->rq_rcv_buf.len = 0;
- req->rq_rcv_buf.buflen = 0;
- req->rq_release_snd_buf = NULL;
- xprt_reset_majortimeo(req);
- dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
- req, ntohl(req->rq_xid));
+ xprt_do_reserve(xprt, task);
}
/**
@@ -1352,33 +1797,24 @@
if (req == NULL) {
if (task->tk_client) {
xprt = task->tk_xprt;
- if (xprt->snd_task == task)
- xprt_release_write(xprt, task);
+ xprt_release_write(xprt, task);
}
return;
}
xprt = req->rq_xprt;
- if (task->tk_ops->rpc_count_stats != NULL)
- task->tk_ops->rpc_count_stats(task, task->tk_calldata);
- else if (task->tk_client)
- rpc_count_iostats(task, task->tk_client->cl_metrics);
- spin_lock(&xprt->recv_lock);
- if (!list_empty(&req->rq_list)) {
- list_del_init(&req->rq_list);
- xprt_wait_on_pinned_rqst(req);
- }
- spin_unlock(&xprt->recv_lock);
- spin_lock_bh(&xprt->transport_lock);
+ xprt_request_dequeue_xprt(task);
+ spin_lock(&xprt->transport_lock);
xprt->ops->release_xprt(xprt, task);
if (xprt->ops->release_request)
xprt->ops->release_request(task);
- xprt->last_used = jiffies;
xprt_schedule_autodisconnect(xprt);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
if (req->rq_buffer)
xprt->ops->buf_free(task);
xprt_inject_disconnect(xprt);
+ xdr_free_bvec(&req->rq_rcv_buf);
+ xdr_free_bvec(&req->rq_snd_buf);
if (req->rq_cred != NULL)
put_rpccred(req->rq_cred);
task->tk_rqstp = NULL;
@@ -1392,16 +1828,35 @@
xprt_free_bc_request(req);
}
+#ifdef CONFIG_SUNRPC_BACKCHANNEL
+void
+xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task)
+{
+ struct xdr_buf *xbufp = &req->rq_snd_buf;
+
+ task->tk_rqstp = req;
+ req->rq_task = task;
+ xprt_init_connect_cookie(req, req->rq_xprt);
+ /*
+ * Set up the xdr_buf length.
+ * This also indicates that the buffer is XDR encoded already.
+ */
+ xbufp->len = xbufp->head[0].iov_len + xbufp->page_len +
+ xbufp->tail[0].iov_len;
+}
+#endif
+
static void xprt_init(struct rpc_xprt *xprt, struct net *net)
{
kref_init(&xprt->kref);
spin_lock_init(&xprt->transport_lock);
spin_lock_init(&xprt->reserve_lock);
- spin_lock_init(&xprt->recv_lock);
+ spin_lock_init(&xprt->queue_lock);
INIT_LIST_HEAD(&xprt->free);
- INIT_LIST_HEAD(&xprt->recv);
+ xprt->recv_queue = RB_ROOT;
+ INIT_LIST_HEAD(&xprt->xmit_queue);
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
spin_lock_init(&xprt->bc_pa_lock);
INIT_LIST_HEAD(&xprt->bc_pa_list);
@@ -1414,7 +1869,7 @@
rpc_init_wait_queue(&xprt->binding, "xprt_binding");
rpc_init_wait_queue(&xprt->pending, "xprt_pending");
- rpc_init_priority_wait_queue(&xprt->sending, "xprt_sending");
+ rpc_init_wait_queue(&xprt->sending, "xprt_sending");
rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog");
xprt_init_xid(xprt);
@@ -1488,6 +1943,11 @@
rpc_destroy_wait_queue(&xprt->backlog);
kfree(xprt->servername);
/*
+ * Destroy any existing back channel
+ */
+ xprt_destroy_backchannel(xprt, UINT_MAX);
+
+ /*
* Tear down transport state and free the rpc_xprt
*/
xprt->ops->destroy(xprt);
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index e2d64c7..78c075a 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -19,7 +19,7 @@
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/xprtmultipath.h>
-typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct list_head *head,
+typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct rpc_xprt_switch *xps,
const struct rpc_xprt *cur);
static const struct rpc_xprt_iter_ops rpc_xprt_iter_singular;
@@ -36,6 +36,7 @@
if (xps->xps_nxprts == 0)
xps->xps_net = xprt->xprt_net;
xps->xps_nxprts++;
+ xps->xps_nactive++;
}
/**
@@ -51,8 +52,7 @@
if (xprt == NULL)
return;
spin_lock(&xps->xps_lock);
- if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) &&
- !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
+ if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL)
xprt_switch_add_xprt_locked(xps, xprt);
spin_unlock(&xps->xps_lock);
}
@@ -62,6 +62,7 @@
{
if (unlikely(xprt == NULL))
return;
+ xps->xps_nactive--;
xps->xps_nxprts--;
if (xps->xps_nxprts == 0)
xps->xps_net = NULL;
@@ -102,7 +103,9 @@
if (xps != NULL) {
spin_lock_init(&xps->xps_lock);
kref_init(&xps->xps_kref);
- xps->xps_nxprts = 0;
+ xps->xps_nxprts = xps->xps_nactive = 0;
+ atomic_long_set(&xps->xps_queuelen, 0);
+ xps->xps_net = NULL;
INIT_LIST_HEAD(&xps->xps_xprt_list);
xps->xps_iter_ops = &rpc_xprt_iter_singular;
xprt_switch_add_xprt_locked(xps, xprt);
@@ -193,9 +196,21 @@
}
static
+bool xprt_is_active(const struct rpc_xprt *xprt)
+{
+ return kref_read(&xprt->kref) != 0;
+}
+
+static
struct rpc_xprt *xprt_switch_find_first_entry(struct list_head *head)
{
- return list_first_or_null_rcu(head, struct rpc_xprt, xprt_switch);
+ struct rpc_xprt *pos;
+
+ list_for_each_entry_rcu(pos, head, xprt_switch) {
+ if (xprt_is_active(pos))
+ return pos;
+ }
+ return NULL;
}
static
@@ -213,9 +228,12 @@
const struct rpc_xprt *cur)
{
struct rpc_xprt *pos;
+ bool found = false;
list_for_each_entry_rcu(pos, head, xprt_switch) {
if (cur == pos)
+ found = true;
+ if (found && xprt_is_active(pos))
return pos;
}
return NULL;
@@ -260,9 +278,12 @@
const struct rpc_xprt *cur)
{
struct rpc_xprt *pos, *prev = NULL;
+ bool found = false;
list_for_each_entry_rcu(pos, head, xprt_switch) {
if (cur == prev)
+ found = true;
+ if (found && xprt_is_active(pos))
return pos;
prev = pos;
}
@@ -270,22 +291,15 @@
}
static
-struct rpc_xprt *xprt_switch_set_next_cursor(struct list_head *head,
+struct rpc_xprt *xprt_switch_set_next_cursor(struct rpc_xprt_switch *xps,
struct rpc_xprt **cursor,
xprt_switch_find_xprt_t find_next)
{
- struct rpc_xprt *cur, *pos, *old;
+ struct rpc_xprt *pos, *old;
- cur = READ_ONCE(*cursor);
- for (;;) {
- old = cur;
- pos = find_next(head, old);
- if (pos == NULL)
- break;
- cur = cmpxchg_relaxed(cursor, old, pos);
- if (cur == old)
- break;
- }
+ old = smp_load_acquire(cursor);
+ pos = find_next(xps, old);
+ smp_store_release(cursor, pos);
return pos;
}
@@ -297,13 +311,11 @@
if (xps == NULL)
return NULL;
- return xprt_switch_set_next_cursor(&xps->xps_xprt_list,
- &xpi->xpi_cursor,
- find_next);
+ return xprt_switch_set_next_cursor(xps, &xpi->xpi_cursor, find_next);
}
static
-struct rpc_xprt *xprt_switch_find_next_entry_roundrobin(struct list_head *head,
+struct rpc_xprt *__xprt_switch_find_next_entry_roundrobin(struct list_head *head,
const struct rpc_xprt *cur)
{
struct rpc_xprt *ret;
@@ -315,6 +327,31 @@
}
static
+struct rpc_xprt *xprt_switch_find_next_entry_roundrobin(struct rpc_xprt_switch *xps,
+ const struct rpc_xprt *cur)
+{
+ struct list_head *head = &xps->xps_xprt_list;
+ struct rpc_xprt *xprt;
+ unsigned int nactive;
+
+ for (;;) {
+ unsigned long xprt_queuelen, xps_queuelen;
+
+ xprt = __xprt_switch_find_next_entry_roundrobin(head, cur);
+ if (!xprt)
+ break;
+ xprt_queuelen = atomic_long_read(&xprt->queuelen);
+ xps_queuelen = atomic_long_read(&xps->xps_queuelen);
+ nactive = READ_ONCE(xps->xps_nactive);
+ /* Exit loop if xprt_queuelen <= average queue length */
+ if (xprt_queuelen * nactive <= xps_queuelen)
+ break;
+ cur = xprt;
+ }
+ return xprt;
+}
+
+static
struct rpc_xprt *xprt_iter_next_entry_roundrobin(struct rpc_xprt_iter *xpi)
{
return xprt_iter_next_entry_multiple(xpi,
@@ -322,9 +359,17 @@
}
static
+struct rpc_xprt *xprt_switch_find_next_entry_all(struct rpc_xprt_switch *xps,
+ const struct rpc_xprt *cur)
+{
+ return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur);
+}
+
+static
struct rpc_xprt *xprt_iter_next_entry_all(struct rpc_xprt_iter *xpi)
{
- return xprt_iter_next_entry_multiple(xpi, xprt_switch_find_next_entry);
+ return xprt_iter_next_entry_multiple(xpi,
+ xprt_switch_find_next_entry_all);
}
/*
@@ -383,7 +428,7 @@
/**
* xprt_iter_xchg_switch - Atomically swap out the rpc_xprt_switch
* @xpi: pointer to rpc_xprt_iter
- * @xps: pointer to a new rpc_xprt_switch or NULL
+ * @newswitch: pointer to a new rpc_xprt_switch or NULL
*
* Swaps out the existing xpi->xpi_xpswitch with a new value.
*/
@@ -401,7 +446,7 @@
/**
* xprt_iter_destroy - Destroys the xprt iterator
- * @xpi pointer to rpc_xprt_iter
+ * @xpi: pointer to rpc_xprt_iter
*/
void xprt_iter_destroy(struct rpc_xprt_iter *xpi)
{
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 8bf19e1..8ed0377 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,8 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
-rpcrdma-y := transport.o rpc_rdma.o verbs.o \
- fmr_ops.o frwr_ops.o \
+rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o \
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \
module.o
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 90adeff..b458bf5 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -5,7 +5,6 @@
* Support for backward direction RPCs on RPC/RDMA.
*/
-#include <linux/module.h>
#include <linux/sunrpc/xprt.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svc_xprt.h>
@@ -20,59 +19,6 @@
#undef RPCRDMA_BACKCHANNEL_DEBUG
-static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt,
- struct rpc_rqst *rqst)
-{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
-
- spin_lock(&buf->rb_reqslock);
- list_del(&req->rl_all);
- spin_unlock(&buf->rb_reqslock);
-
- rpcrdma_destroy_req(req);
-}
-
-static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt,
- unsigned int count)
-{
- struct rpc_xprt *xprt = &r_xprt->rx_xprt;
- struct rpc_rqst *rqst;
- unsigned int i;
-
- for (i = 0; i < (count << 1); i++) {
- struct rpcrdma_regbuf *rb;
- struct rpcrdma_req *req;
- size_t size;
-
- req = rpcrdma_create_req(r_xprt);
- if (IS_ERR(req))
- return PTR_ERR(req);
- rqst = &req->rl_slot;
-
- rqst->rq_xprt = xprt;
- INIT_LIST_HEAD(&rqst->rq_list);
- INIT_LIST_HEAD(&rqst->rq_bc_list);
- __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
- spin_lock_bh(&xprt->bc_pa_lock);
- list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list);
- spin_unlock_bh(&xprt->bc_pa_lock);
-
- size = r_xprt->rx_data.inline_rsize;
- rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL);
- if (IS_ERR(rb))
- goto out_fail;
- req->rl_sendbuf = rb;
- xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base,
- min_t(size_t, size, PAGE_SIZE));
- }
- return 0;
-
-out_fail:
- rpcrdma_bc_free_rqst(r_xprt, rqst);
- return -ENOMEM;
-}
-
/**
* xprt_rdma_bc_setup - Pre-allocate resources for handling backchannel requests
* @xprt: transport associated with these backchannel resources
@@ -83,55 +29,10 @@
int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- int rc;
- /* The backchannel reply path returns each rpc_rqst to the
- * bc_pa_list _after_ the reply is sent. If the server is
- * faster than the client, it can send another backward
- * direction request before the rpc_rqst is returned to the
- * list. The client rejects the request in this case.
- *
- * Twice as many rpc_rqsts are prepared to ensure there is
- * always an rpc_rqst available as soon as a reply is sent.
- */
- if (reqs > RPCRDMA_BACKWARD_WRS >> 1)
- goto out_err;
-
- rc = rpcrdma_bc_setup_reqs(r_xprt, reqs);
- if (rc)
- goto out_free;
-
- r_xprt->rx_buf.rb_bc_srv_max_requests = reqs;
- request_module("svcrdma");
+ r_xprt->rx_buf.rb_bc_srv_max_requests = RPCRDMA_BACKWARD_WRS >> 1;
trace_xprtrdma_cb_setup(r_xprt, reqs);
return 0;
-
-out_free:
- xprt_rdma_bc_destroy(xprt, reqs);
-
-out_err:
- pr_err("RPC: %s: setup backchannel transport failed\n", __func__);
- return -ENOMEM;
-}
-
-/**
- * xprt_rdma_bc_up - Create transport endpoint for backchannel service
- * @serv: server endpoint
- * @net: network namespace
- *
- * The "xprt" is an implied argument: it supplies the name of the
- * backchannel transport class.
- *
- * Returns zero on success, negative errno on failure
- */
-int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net)
-{
- int ret;
-
- ret = svc_create_xprt(serv, "rdma-bc", net, PF_INET, 0, 0);
- if (ret < 0)
- return ret;
- return 0;
}
/**
@@ -143,14 +44,19 @@
size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
size_t maxmsg;
- maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize);
+ maxmsg = min_t(unsigned int, ep->rep_inline_send, ep->rep_inline_recv);
maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE);
return maxmsg - RPCRDMA_HDRLEN_MIN;
}
+unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *xprt)
+{
+ return RPCRDMA_BACKWARD_WRS >> 1;
+}
+
static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
@@ -159,7 +65,7 @@
rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
xdr_init_encode(&req->rl_stream, &req->rl_hdrbuf,
- req->rl_rdmabuf->rg_base);
+ rdmab_data(req->rl_rdmabuf), rqst);
p = xdr_reserve_space(&req->rl_stream, 28);
if (unlikely(!p))
@@ -194,18 +100,21 @@
*/
int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst)
{
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+ struct rpc_xprt *xprt = rqst->rq_xprt;
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
int rc;
- if (!xprt_connected(rqst->rq_xprt))
- goto drop_connection;
+ if (!xprt_connected(xprt))
+ return -ENOTCONN;
+
+ if (!xprt_request_get_cong(xprt, rqst))
+ return -EBADSLT;
rc = rpcrdma_bc_marshal_reply(rqst);
if (rc < 0)
goto failed_marshal;
- rpcrdma_post_recvs(r_xprt, true);
if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
goto drop_connection;
return 0;
@@ -214,7 +123,7 @@
if (rc != -ENOTCONN)
return rc;
drop_connection:
- xprt_disconnect_done(rqst->rq_xprt);
+ xprt_rdma_close(xprt);
return -ENOTCONN;
}
@@ -225,19 +134,18 @@
*/
void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs)
{
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpc_rqst *rqst, *tmp;
- spin_lock_bh(&xprt->bc_pa_lock);
+ spin_lock(&xprt->bc_pa_lock);
list_for_each_entry_safe(rqst, tmp, &xprt->bc_pa_list, rq_bc_pa_list) {
list_del(&rqst->rq_bc_pa_list);
- spin_unlock_bh(&xprt->bc_pa_lock);
+ spin_unlock(&xprt->bc_pa_lock);
- rpcrdma_bc_free_rqst(r_xprt, rqst);
+ rpcrdma_req_destroy(rpcr_to_rdmar(rqst));
- spin_lock_bh(&xprt->bc_pa_lock);
+ spin_lock(&xprt->bc_pa_lock);
}
- spin_unlock_bh(&xprt->bc_pa_lock);
+ spin_unlock(&xprt->bc_pa_lock);
}
/**
@@ -249,15 +157,50 @@
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
struct rpc_xprt *xprt = rqst->rq_xprt;
- dprintk("RPC: %s: freeing rqst %p (req %p)\n",
- __func__, rqst, req);
-
rpcrdma_recv_buffer_put(req->rl_reply);
req->rl_reply = NULL;
- spin_lock_bh(&xprt->bc_pa_lock);
+ spin_lock(&xprt->bc_pa_lock);
list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list);
- spin_unlock_bh(&xprt->bc_pa_lock);
+ spin_unlock(&xprt->bc_pa_lock);
+ xprt_put(xprt);
+}
+
+static struct rpc_rqst *rpcrdma_bc_rqst_get(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+ struct rpcrdma_req *req;
+ struct rpc_rqst *rqst;
+ size_t size;
+
+ spin_lock(&xprt->bc_pa_lock);
+ rqst = list_first_entry_or_null(&xprt->bc_pa_list, struct rpc_rqst,
+ rq_bc_pa_list);
+ if (!rqst)
+ goto create_req;
+ list_del(&rqst->rq_bc_pa_list);
+ spin_unlock(&xprt->bc_pa_lock);
+ return rqst;
+
+create_req:
+ spin_unlock(&xprt->bc_pa_lock);
+
+ /* Set a limit to prevent a remote from overrunning our resources.
+ */
+ if (xprt->bc_alloc_count >= RPCRDMA_BACKWARD_WRS)
+ return NULL;
+
+ size = min_t(size_t, r_xprt->rx_ep.rep_inline_recv, PAGE_SIZE);
+ req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL);
+ if (!req)
+ return NULL;
+
+ xprt->bc_alloc_count++;
+ rqst = &req->rl_slot;
+ rqst->rq_xprt = xprt;
+ __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
+ xdr_buf_init(&rqst->rq_snd_buf, rdmab_data(req->rl_sendbuf), size);
+ return rqst;
}
/**
@@ -291,20 +234,11 @@
pr_info("RPC: %s: %*ph\n", __func__, size, p);
#endif
- /* Grab a free bc rqst */
- spin_lock(&xprt->bc_pa_lock);
- if (list_empty(&xprt->bc_pa_list)) {
- spin_unlock(&xprt->bc_pa_lock);
+ rqst = rpcrdma_bc_rqst_get(r_xprt);
+ if (!rqst)
goto out_overflow;
- }
- rqst = list_first_entry(&xprt->bc_pa_list,
- struct rpc_rqst, rq_bc_pa_list);
- list_del(&rqst->rq_bc_pa_list);
- spin_unlock(&xprt->bc_pa_lock);
- /* Prepare rqst */
rqst->rq_reply_bytes_recvd = 0;
- rqst->rq_bytes_sent = 0;
rqst->rq_xid = *p;
rqst->rq_private_buf.len = size;
@@ -326,6 +260,7 @@
/* Queue rqst for ULP's callback service */
bc_serv = xprt->bc_serv;
+ xprt_get(xprt);
spin_lock(&bc_serv->sv_cb_lock);
list_add(&rqst->rq_bc_list, &bc_serv->sv_cb_list);
spin_unlock(&bc_serv->sv_cb_lock);
@@ -337,7 +272,7 @@
out_overflow:
pr_warn("RPC/RDMA backchannel overflow\n");
- xprt_disconnect_done(xprt);
+ xprt_force_disconnect(xprt);
/* This receive buffer gets reposted automatically
* when the connection is re-established.
*/
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
deleted file mode 100644
index 0f7c465..0000000
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ /dev/null
@@ -1,348 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2015, 2017 Oracle. All rights reserved.
- * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
- */
-
-/* Lightweight memory registration using Fast Memory Regions (FMR).
- * Referred to sometimes as MTHCAFMR mode.
- *
- * FMR uses synchronous memory registration and deregistration.
- * FMR registration is known to be fast, but FMR deregistration
- * can take tens of usecs to complete.
- */
-
-/* Normal operation
- *
- * A Memory Region is prepared for RDMA READ or WRITE using the
- * ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is
- * finished, the Memory Region is unmapped using the ib_unmap_fmr
- * verb (fmr_op_unmap).
- */
-
-#include <linux/sunrpc/svc_rdma.h>
-
-#include "xprt_rdma.h"
-#include <trace/events/rpcrdma.h>
-
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY RPCDBG_TRANS
-#endif
-
-/* Maximum scatter/gather per FMR */
-#define RPCRDMA_MAX_FMR_SGES (64)
-
-/* Access mode of externally registered pages */
-enum {
- RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_REMOTE_READ,
-};
-
-bool
-fmr_is_supported(struct rpcrdma_ia *ia)
-{
- if (!ia->ri_device->alloc_fmr) {
- pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n",
- ia->ri_device->name);
- return false;
- }
- return true;
-}
-
-static int
-fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
-{
- static struct ib_fmr_attr fmr_attr = {
- .max_pages = RPCRDMA_MAX_FMR_SGES,
- .max_maps = 1,
- .page_shift = PAGE_SHIFT
- };
-
- mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
- sizeof(u64), GFP_KERNEL);
- if (!mr->fmr.fm_physaddrs)
- goto out_free;
-
- mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
- sizeof(*mr->mr_sg), GFP_KERNEL);
- if (!mr->mr_sg)
- goto out_free;
-
- sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES);
-
- mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
- &fmr_attr);
- if (IS_ERR(mr->fmr.fm_mr))
- goto out_fmr_err;
-
- INIT_LIST_HEAD(&mr->mr_list);
- return 0;
-
-out_fmr_err:
- dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__,
- PTR_ERR(mr->fmr.fm_mr));
-
-out_free:
- kfree(mr->mr_sg);
- kfree(mr->fmr.fm_physaddrs);
- return -ENOMEM;
-}
-
-static int
-__fmr_unmap(struct rpcrdma_mr *mr)
-{
- LIST_HEAD(l);
- int rc;
-
- list_add(&mr->fmr.fm_mr->list, &l);
- rc = ib_unmap_fmr(&l);
- list_del(&mr->fmr.fm_mr->list);
- return rc;
-}
-
-static void
-fmr_op_release_mr(struct rpcrdma_mr *mr)
-{
- LIST_HEAD(unmap_list);
- int rc;
-
- kfree(mr->fmr.fm_physaddrs);
- kfree(mr->mr_sg);
-
- /* In case this one was left mapped, try to unmap it
- * to prevent dealloc_fmr from failing with EBUSY
- */
- rc = __fmr_unmap(mr);
- if (rc)
- pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
- mr, rc);
-
- rc = ib_dealloc_fmr(mr->fmr.fm_mr);
- if (rc)
- pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
- mr, rc);
-
- kfree(mr);
-}
-
-/* Reset of a single FMR.
- */
-static void
-fmr_op_recover_mr(struct rpcrdma_mr *mr)
-{
- struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
- int rc;
-
- /* ORDER: invalidate first */
- rc = __fmr_unmap(mr);
- if (rc)
- goto out_release;
-
- /* ORDER: then DMA unmap */
- rpcrdma_mr_unmap_and_put(mr);
-
- r_xprt->rx_stats.mrs_recovered++;
- return;
-
-out_release:
- pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr);
- r_xprt->rx_stats.mrs_orphaned++;
-
- trace_xprtrdma_dma_unmap(mr);
- ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
- mr->mr_sg, mr->mr_nents, mr->mr_dir);
-
- spin_lock(&r_xprt->rx_buf.rb_mrlock);
- list_del(&mr->mr_all);
- spin_unlock(&r_xprt->rx_buf.rb_mrlock);
-
- fmr_op_release_mr(mr);
-}
-
-/* On success, sets:
- * ep->rep_attr.cap.max_send_wr
- * ep->rep_attr.cap.max_recv_wr
- * cdata->max_requests
- * ia->ri_max_segs
- */
-static int
-fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
- struct rpcrdma_create_data_internal *cdata)
-{
- int max_qp_wr;
-
- max_qp_wr = ia->ri_device->attrs.max_qp_wr;
- max_qp_wr -= RPCRDMA_BACKWARD_WRS;
- max_qp_wr -= 1;
- if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE)
- return -ENOMEM;
- if (cdata->max_requests > max_qp_wr)
- cdata->max_requests = max_qp_wr;
- ep->rep_attr.cap.max_send_wr = cdata->max_requests;
- ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
- ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
- ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
- ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
- ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
-
- ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
- RPCRDMA_MAX_FMR_SGES);
- return 0;
-}
-
-/* FMR mode conveys up to 64 pages of payload per chunk segment.
- */
-static size_t
-fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
-{
- return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
-}
-
-/* Use the ib_map_phys_fmr() verb to register a memory region
- * for remote access via RDMA READ or RDMA WRITE.
- */
-static struct rpcrdma_mr_seg *
-fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
- int nsegs, bool writing, struct rpcrdma_mr **out)
-{
- struct rpcrdma_mr_seg *seg1 = seg;
- int len, pageoff, i, rc;
- struct rpcrdma_mr *mr;
- u64 *dma_pages;
-
- mr = rpcrdma_mr_get(r_xprt);
- if (!mr)
- return ERR_PTR(-EAGAIN);
-
- pageoff = offset_in_page(seg1->mr_offset);
- seg1->mr_offset -= pageoff; /* start of page */
- seg1->mr_len += pageoff;
- len = -pageoff;
- if (nsegs > RPCRDMA_MAX_FMR_SGES)
- nsegs = RPCRDMA_MAX_FMR_SGES;
- for (i = 0; i < nsegs;) {
- if (seg->mr_page)
- sg_set_page(&mr->mr_sg[i],
- seg->mr_page,
- seg->mr_len,
- offset_in_page(seg->mr_offset));
- else
- sg_set_buf(&mr->mr_sg[i], seg->mr_offset,
- seg->mr_len);
- len += seg->mr_len;
- ++seg;
- ++i;
- /* Check for holes */
- if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
- offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
- break;
- }
- mr->mr_dir = rpcrdma_data_dir(writing);
-
- mr->mr_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device,
- mr->mr_sg, i, mr->mr_dir);
- if (!mr->mr_nents)
- goto out_dmamap_err;
- trace_xprtrdma_dma_map(mr);
-
- for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++)
- dma_pages[i] = sg_dma_address(&mr->mr_sg[i]);
- rc = ib_map_phys_fmr(mr->fmr.fm_mr, dma_pages, mr->mr_nents,
- dma_pages[0]);
- if (rc)
- goto out_maperr;
-
- mr->mr_handle = mr->fmr.fm_mr->rkey;
- mr->mr_length = len;
- mr->mr_offset = dma_pages[0] + pageoff;
-
- *out = mr;
- return seg;
-
-out_dmamap_err:
- pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n",
- mr->mr_sg, i);
- rpcrdma_mr_put(mr);
- return ERR_PTR(-EIO);
-
-out_maperr:
- pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
- len, (unsigned long long)dma_pages[0],
- pageoff, mr->mr_nents, rc);
- rpcrdma_mr_unmap_and_put(mr);
- return ERR_PTR(-EIO);
-}
-
-/* Post Send WR containing the RPC Call message.
- */
-static int
-fmr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
-{
- return ib_post_send(ia->ri_id->qp, &req->rl_sendctx->sc_wr, NULL);
-}
-
-/* Invalidate all memory regions that were registered for "req".
- *
- * Sleeps until it is safe for the host CPU to access the
- * previously mapped memory regions.
- *
- * Caller ensures that @mrs is not empty before the call. This
- * function empties the list.
- */
-static void
-fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
-{
- struct rpcrdma_mr *mr;
- LIST_HEAD(unmap_list);
- int rc;
-
- /* ORDER: Invalidate all of the req's MRs first
- *
- * ib_unmap_fmr() is slow, so use a single call instead
- * of one call per mapped FMR.
- */
- list_for_each_entry(mr, mrs, mr_list) {
- dprintk("RPC: %s: unmapping fmr %p\n",
- __func__, &mr->fmr);
- trace_xprtrdma_localinv(mr);
- list_add_tail(&mr->fmr.fm_mr->list, &unmap_list);
- }
- r_xprt->rx_stats.local_inv_needed++;
- rc = ib_unmap_fmr(&unmap_list);
- if (rc)
- goto out_reset;
-
- /* ORDER: Now DMA unmap all of the req's MRs, and return
- * them to the free MW list.
- */
- while (!list_empty(mrs)) {
- mr = rpcrdma_mr_pop(mrs);
- list_del(&mr->fmr.fm_mr->list);
- rpcrdma_mr_unmap_and_put(mr);
- }
-
- return;
-
-out_reset:
- pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);
-
- while (!list_empty(mrs)) {
- mr = rpcrdma_mr_pop(mrs);
- list_del(&mr->fmr.fm_mr->list);
- fmr_op_recover_mr(mr);
- }
-}
-
-const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
- .ro_map = fmr_op_map,
- .ro_send = fmr_op_send,
- .ro_unmap_sync = fmr_op_unmap_sync,
- .ro_recover_mr = fmr_op_recover_mr,
- .ro_open = fmr_op_open,
- .ro_maxpages = fmr_op_maxpages,
- .ro_init_mr = fmr_op_init_mr,
- .ro_release_mr = fmr_op_release_mr,
- .ro_displayname = "fmr",
- .ro_send_w_inv_ok = 0,
-};
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 1bb00dd..30065a2 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -7,67 +7,37 @@
/* Lightweight memory registration using Fast Registration Work
* Requests (FRWR).
*
- * FRWR features ordered asynchronous registration and deregistration
- * of arbitrarily sized memory regions. This is the fastest and safest
+ * FRWR features ordered asynchronous registration and invalidation
+ * of arbitrarily-sized memory regions. This is the fastest and safest
* but most complex memory registration mode.
*/
/* Normal operation
*
- * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG
- * Work Request (frwr_op_map). When the RDMA operation is finished, this
+ * A Memory Region is prepared for RDMA Read or Write using a FAST_REG
+ * Work Request (frwr_map). When the RDMA operation is finished, this
* Memory Region is invalidated using a LOCAL_INV Work Request
- * (frwr_op_unmap_sync).
+ * (frwr_unmap_async and frwr_unmap_sync).
*
- * Typically these Work Requests are not signaled, and neither are RDMA
- * SEND Work Requests (with the exception of signaling occasionally to
- * prevent provider work queue overflows). This greatly reduces HCA
+ * Typically FAST_REG Work Requests are not signaled, and neither are
+ * RDMA Send Work Requests (with the exception of signaling occasionally
+ * to prevent provider work queue overflows). This greatly reduces HCA
* interrupt workload.
- *
- * As an optimization, frwr_op_unmap marks MRs INVALID before the
- * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on
- * rb_mrs immediately so that no work (like managing a linked list
- * under a spinlock) is needed in the completion upcall.
- *
- * But this means that frwr_op_map() can occasionally encounter an MR
- * that is INVALID but the LOCAL_INV WR has not completed. Work Queue
- * ordering prevents a subsequent FAST_REG WR from executing against
- * that MR while it is still being invalidated.
*/
/* Transport recovery
*
- * ->op_map and the transport connect worker cannot run at the same
- * time, but ->op_unmap can fire while the transport connect worker
- * is running. Thus MR recovery is handled in ->op_map, to guarantee
- * that recovered MRs are owned by a sending RPC, and not one where
- * ->op_unmap could fire at the same time transport reconnect is
- * being done.
+ * frwr_map and frwr_unmap_* cannot run at the same time the transport
+ * connect worker is running. The connect worker holds the transport
+ * send lock, just as ->send_request does. This prevents frwr_map and
+ * the connect worker from running concurrently. When a connection is
+ * closed, the Receive completion queue is drained before the allowing
+ * the connect worker to get control. This prevents frwr_unmap and the
+ * connect worker from running concurrently.
*
- * When the underlying transport disconnects, MRs are left in one of
- * four states:
- *
- * INVALID: The MR was not in use before the QP entered ERROR state.
- *
- * VALID: The MR was registered before the QP entered ERROR state.
- *
- * FLUSHED_FR: The MR was being registered when the QP entered ERROR
- * state, and the pending WR was flushed.
- *
- * FLUSHED_LI: The MR was being invalidated when the QP entered ERROR
- * state, and the pending WR was flushed.
- *
- * When frwr_op_map encounters FLUSHED and VALID MRs, they are recovered
- * with ib_dereg_mr and then are re-initialized. Because MR recovery
- * allocates fresh resources, it is deferred to a workqueue, and the
- * recovered MRs are placed back on the rb_mrs list when recovery is
- * complete. frwr_op_map allocates another MR for the current RPC while
- * the broken MR is reset.
- *
- * To ensure that frwr_op_map doesn't encounter an MR that is marked
- * INVALID but that is about to be flushed due to a previous transport
- * disconnect, the transport connect worker attempts to drain all
- * pending send queue WRs before the transport is reconnected.
+ * When the underlying transport disconnects, MRs that are in flight
+ * are flushed and are likely unusable. Thus all flushed MRs are
+ * destroyed. New MRs are created on demand.
*/
#include <linux/sunrpc/rpc_rdma.h>
@@ -80,10 +50,15 @@
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
-bool
-frwr_is_supported(struct rpcrdma_ia *ia)
+/**
+ * frwr_is_supported - Check if device supports FRWR
+ * @device: interface adapter to check
+ *
+ * Returns true if device supports FRWR, otherwise false
+ */
+bool frwr_is_supported(struct ib_device *device)
{
- struct ib_device_attr *attrs = &ia->ri_device->attrs;
+ struct ib_device_attr *attrs = &device->attrs;
if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
goto out_not_supported;
@@ -93,142 +68,172 @@
out_not_supported:
pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n",
- ia->ri_device->name);
+ device->name);
return false;
}
-static int
-frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
-{
- unsigned int depth = ia->ri_max_frwr_depth;
- struct rpcrdma_frwr *frwr = &mr->frwr;
- int rc;
-
- frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
- if (IS_ERR(frwr->fr_mr))
- goto out_mr_err;
-
- mr->mr_sg = kcalloc(depth, sizeof(*mr->mr_sg), GFP_KERNEL);
- if (!mr->mr_sg)
- goto out_list_err;
-
- INIT_LIST_HEAD(&mr->mr_list);
- sg_init_table(mr->mr_sg, depth);
- init_completion(&frwr->fr_linv_done);
- return 0;
-
-out_mr_err:
- rc = PTR_ERR(frwr->fr_mr);
- dprintk("RPC: %s: ib_alloc_mr status %i\n",
- __func__, rc);
- return rc;
-
-out_list_err:
- rc = -ENOMEM;
- dprintk("RPC: %s: sg allocation failure\n",
- __func__);
- ib_dereg_mr(frwr->fr_mr);
- return rc;
-}
-
-static void
-frwr_op_release_mr(struct rpcrdma_mr *mr)
+/**
+ * frwr_release_mr - Destroy one MR
+ * @mr: MR allocated by frwr_init_mr
+ *
+ */
+void frwr_release_mr(struct rpcrdma_mr *mr)
{
int rc;
rc = ib_dereg_mr(mr->frwr.fr_mr);
if (rc)
- pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n",
- mr, rc);
+ trace_xprtrdma_frwr_dereg(mr, rc);
kfree(mr->mr_sg);
kfree(mr);
}
-static int
-__frwr_mr_reset(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
+static void frwr_mr_recycle(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
{
- struct rpcrdma_frwr *frwr = &mr->frwr;
- int rc;
+ trace_xprtrdma_mr_recycle(mr);
- rc = ib_dereg_mr(frwr->fr_mr);
- if (rc) {
- pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n",
- rc, mr);
- return rc;
+ if (mr->mr_dir != DMA_NONE) {
+ trace_xprtrdma_mr_unmap(mr);
+ ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device,
+ mr->mr_sg, mr->mr_nents, mr->mr_dir);
+ mr->mr_dir = DMA_NONE;
}
- frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype,
- ia->ri_max_frwr_depth);
- if (IS_ERR(frwr->fr_mr)) {
- pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n",
- PTR_ERR(frwr->fr_mr), mr);
- return PTR_ERR(frwr->fr_mr);
- }
+ spin_lock(&r_xprt->rx_buf.rb_lock);
+ list_del(&mr->mr_all);
+ r_xprt->rx_stats.mrs_recycled++;
+ spin_unlock(&r_xprt->rx_buf.rb_lock);
- dprintk("RPC: %s: recovered FRWR %p\n", __func__, frwr);
- frwr->fr_state = FRWR_IS_INVALID;
- return 0;
+ frwr_release_mr(mr);
}
-/* Reset of a single FRWR. Generate a fresh rkey by replacing the MR.
+/* MRs are dynamically allocated, so simply clean up and release the MR.
+ * A replacement MR will subsequently be allocated on demand.
*/
static void
-frwr_op_recover_mr(struct rpcrdma_mr *mr)
+frwr_mr_recycle_worker(struct work_struct *work)
{
- enum rpcrdma_frwr_state state = mr->frwr.fr_state;
- struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- int rc;
+ struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr,
+ mr_recycle);
- rc = __frwr_mr_reset(ia, mr);
- if (state != FRWR_FLUSHED_LI) {
- trace_xprtrdma_dma_unmap(mr);
- ib_dma_unmap_sg(ia->ri_device,
- mr->mr_sg, mr->mr_nents, mr->mr_dir);
- }
- if (rc)
- goto out_release;
-
- rpcrdma_mr_put(mr);
- r_xprt->rx_stats.mrs_recovered++;
- return;
-
-out_release:
- pr_err("rpcrdma: FRWR reset failed %d, %p released\n", rc, mr);
- r_xprt->rx_stats.mrs_orphaned++;
-
- spin_lock(&r_xprt->rx_buf.rb_mrlock);
- list_del(&mr->mr_all);
- spin_unlock(&r_xprt->rx_buf.rb_mrlock);
-
- frwr_op_release_mr(mr);
+ frwr_mr_recycle(mr->mr_xprt, mr);
}
-/* On success, sets:
+/* frwr_recycle - Discard MRs
+ * @req: request to reset
+ *
+ * Used after a reconnect. These MRs could be in flight, we can't
+ * tell. Safe thing to do is release them.
+ */
+void frwr_recycle(struct rpcrdma_req *req)
+{
+ struct rpcrdma_mr *mr;
+
+ while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
+ frwr_mr_recycle(mr->mr_xprt, mr);
+}
+
+/* frwr_reset - Place MRs back on the free list
+ * @req: request to reset
+ *
+ * Used after a failed marshal. For FRWR, this means the MRs
+ * don't have to be fully released and recreated.
+ *
+ * NB: This is safe only as long as none of @req's MRs are
+ * involved with an ongoing asynchronous FAST_REG or LOCAL_INV
+ * Work Request.
+ */
+void frwr_reset(struct rpcrdma_req *req)
+{
+ struct rpcrdma_mr *mr;
+
+ while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
+ rpcrdma_mr_put(mr);
+}
+
+/**
+ * frwr_init_mr - Initialize one MR
+ * @ia: interface adapter
+ * @mr: generic MR to prepare for FRWR
+ *
+ * Returns zero if successful. Otherwise a negative errno
+ * is returned.
+ */
+int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
+{
+ unsigned int depth = ia->ri_max_frwr_depth;
+ struct scatterlist *sg;
+ struct ib_mr *frmr;
+ int rc;
+
+ /* NB: ib_alloc_mr and device drivers typically allocate
+ * memory with GFP_KERNEL.
+ */
+ frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
+ if (IS_ERR(frmr))
+ goto out_mr_err;
+
+ sg = kcalloc(depth, sizeof(*sg), GFP_NOFS);
+ if (!sg)
+ goto out_list_err;
+
+ mr->frwr.fr_mr = frmr;
+ mr->mr_dir = DMA_NONE;
+ INIT_LIST_HEAD(&mr->mr_list);
+ INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker);
+ init_completion(&mr->frwr.fr_linv_done);
+
+ sg_init_table(sg, depth);
+ mr->mr_sg = sg;
+ return 0;
+
+out_mr_err:
+ rc = PTR_ERR(frmr);
+ trace_xprtrdma_frwr_alloc(mr, rc);
+ return rc;
+
+out_list_err:
+ ib_dereg_mr(frmr);
+ return -ENOMEM;
+}
+
+/**
+ * frwr_open - Prepare an endpoint for use with FRWR
+ * @ia: interface adapter this endpoint will use
+ * @ep: endpoint to prepare
+ *
+ * On success, sets:
* ep->rep_attr.cap.max_send_wr
* ep->rep_attr.cap.max_recv_wr
- * cdata->max_requests
+ * ep->rep_max_requests
* ia->ri_max_segs
*
* And these FRWR-related fields:
* ia->ri_max_frwr_depth
* ia->ri_mrtype
+ *
+ * On failure, a negative errno is returned.
*/
-static int
-frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
- struct rpcrdma_create_data_internal *cdata)
+int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep)
{
- struct ib_device_attr *attrs = &ia->ri_device->attrs;
+ struct ib_device_attr *attrs = &ia->ri_id->device->attrs;
int max_qp_wr, depth, delta;
ia->ri_mrtype = IB_MR_TYPE_MEM_REG;
if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
ia->ri_mrtype = IB_MR_TYPE_SG_GAPS;
- ia->ri_max_frwr_depth =
- min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- attrs->max_fast_reg_page_list_len);
- dprintk("RPC: %s: device's max FR page list len = %u\n",
+ /* Quirk: Some devices advertise a large max_fast_reg_page_list_len
+ * capability, but perform optimally when the MRs are not larger
+ * than a page.
+ */
+ if (attrs->max_sge_rd > 1)
+ ia->ri_max_frwr_depth = attrs->max_sge_rd;
+ else
+ ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len;
+ if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS)
+ ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS;
+ dprintk("RPC: %s: max FR page list depth = %u\n",
__func__, ia->ri_max_frwr_depth);
/* Add room for frwr register and invalidate WRs.
@@ -253,145 +258,78 @@
} while (delta > 0);
}
- max_qp_wr = ia->ri_device->attrs.max_qp_wr;
+ max_qp_wr = ia->ri_id->device->attrs.max_qp_wr;
max_qp_wr -= RPCRDMA_BACKWARD_WRS;
max_qp_wr -= 1;
if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE)
return -ENOMEM;
- if (cdata->max_requests > max_qp_wr)
- cdata->max_requests = max_qp_wr;
- ep->rep_attr.cap.max_send_wr = cdata->max_requests * depth;
+ if (ep->rep_max_requests > max_qp_wr)
+ ep->rep_max_requests = max_qp_wr;
+ ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
if (ep->rep_attr.cap.max_send_wr > max_qp_wr) {
- cdata->max_requests = max_qp_wr / depth;
- if (!cdata->max_requests)
+ ep->rep_max_requests = max_qp_wr / depth;
+ if (!ep->rep_max_requests)
return -EINVAL;
- ep->rep_attr.cap.max_send_wr = cdata->max_requests *
- depth;
+ ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
}
ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
- ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
+ ep->rep_attr.cap.max_recv_wr = ep->rep_max_requests;
ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
- ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
- ia->ri_max_frwr_depth);
+ ia->ri_max_segs =
+ DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth);
+ /* Reply chunks require segments for head and tail buffers */
+ ia->ri_max_segs += 2;
+ if (ia->ri_max_segs > RPCRDMA_MAX_HDR_SEGS)
+ ia->ri_max_segs = RPCRDMA_MAX_HDR_SEGS;
return 0;
}
-/* FRWR mode conveys a list of pages per chunk segment. The
+/**
+ * frwr_maxpages - Compute size of largest payload
+ * @r_xprt: transport
+ *
+ * Returns maximum size of an RPC message, in pages.
+ *
+ * FRWR mode conveys a list of pages per chunk segment. The
* maximum length of that list is the FRWR page list depth.
*/
-static size_t
-frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
+size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frwr_depth);
-}
-
-static void
-__frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr)
-{
- if (wc->status != IB_WC_WR_FLUSH_ERR)
- pr_err("rpcrdma: %s: %s (%u/0x%x)\n",
- wr, ib_wc_status_msg(wc->status),
- wc->status, wc->vendor_err);
+ (ia->ri_max_segs - 2) * ia->ri_max_frwr_depth);
}
/**
- * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * frwr_map - Register a memory region
+ * @r_xprt: controlling transport
+ * @seg: memory region co-ordinates
+ * @nsegs: number of segments remaining
+ * @writing: true when RDMA Write will be used
+ * @xid: XID of RPC using the registered memory
+ * @mr: MR to fill in
*
- */
-static void
-frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
-{
- struct ib_cqe *cqe = wc->wr_cqe;
- struct rpcrdma_frwr *frwr =
- container_of(cqe, struct rpcrdma_frwr, fr_cqe);
-
- /* WARNING: Only wr_cqe and status are reliable at this point */
- if (wc->status != IB_WC_SUCCESS) {
- frwr->fr_state = FRWR_FLUSHED_FR;
- __frwr_sendcompletion_flush(wc, "fastreg");
- }
- trace_xprtrdma_wc_fastreg(wc, frwr);
-}
-
-/**
- * frwr_wc_localinv - Invoked by RDMA provider for a flushed LocalInv WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
- *
- */
-static void
-frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
-{
- struct ib_cqe *cqe = wc->wr_cqe;
- struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr,
- fr_cqe);
-
- /* WARNING: Only wr_cqe and status are reliable at this point */
- if (wc->status != IB_WC_SUCCESS) {
- frwr->fr_state = FRWR_FLUSHED_LI;
- __frwr_sendcompletion_flush(wc, "localinv");
- }
- trace_xprtrdma_wc_li(wc, frwr);
-}
-
-/**
- * frwr_wc_localinv_wake - Invoked by RDMA provider for a signaled LocalInv WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
- *
- * Awaken anyone waiting for an MR to finish being fenced.
- */
-static void
-frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
-{
- struct ib_cqe *cqe = wc->wr_cqe;
- struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr,
- fr_cqe);
-
- /* WARNING: Only wr_cqe and status are reliable at this point */
- if (wc->status != IB_WC_SUCCESS) {
- frwr->fr_state = FRWR_FLUSHED_LI;
- __frwr_sendcompletion_flush(wc, "localinv");
- }
- complete(&frwr->fr_linv_done);
- trace_xprtrdma_wc_li_wake(wc, frwr);
-}
-
-/* Post a REG_MR Work Request to register a memory region
+ * Prepare a REG_MR Work Request to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
+ *
+ * Returns the next segment or a negative errno pointer.
+ * On success, @mr is filled in.
*/
-static struct rpcrdma_mr_seg *
-frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
- int nsegs, bool writing, struct rpcrdma_mr **out)
+struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_mr_seg *seg,
+ int nsegs, bool writing, __be32 xid,
+ struct rpcrdma_mr *mr)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS;
- struct rpcrdma_frwr *frwr;
- struct rpcrdma_mr *mr;
- struct ib_mr *ibmr;
struct ib_reg_wr *reg_wr;
+ struct ib_mr *ibmr;
int i, n;
u8 key;
- mr = NULL;
- do {
- if (mr)
- rpcrdma_mr_defer_recovery(mr);
- mr = rpcrdma_mr_get(r_xprt);
- if (!mr)
- return ERR_PTR(-EAGAIN);
- } while (mr->frwr.fr_state != FRWR_IS_INVALID);
- frwr = &mr->frwr;
- frwr->fr_state = FRWR_IS_VALID;
-
if (nsegs > ia->ri_max_frwr_depth)
nsegs = ia->ri_max_frwr_depth;
for (i = 0; i < nsegs;) {
@@ -406,7 +344,7 @@
++seg;
++i;
- if (holes_ok)
+ if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS)
continue;
if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
@@ -414,20 +352,22 @@
}
mr->mr_dir = rpcrdma_data_dir(writing);
- mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir);
+ mr->mr_nents =
+ ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir);
if (!mr->mr_nents)
goto out_dmamap_err;
- trace_xprtrdma_dma_map(mr);
- ibmr = frwr->fr_mr;
+ ibmr = mr->frwr.fr_mr;
n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE);
if (unlikely(n != mr->mr_nents))
goto out_mapmr_err;
+ ibmr->iova &= 0x00000000ffffffff;
+ ibmr->iova |= ((u64)be32_to_cpu(xid)) << 32;
key = (u8)(ibmr->rkey & 0x000000FF);
ib_update_fast_reg_key(ibmr, ++key);
- reg_wr = &frwr->fr_regwr;
+ reg_wr = &mr->frwr.fr_regwr;
reg_wr->mr = ibmr;
reg_wr->key = ibmr->rkey;
reg_wr->access = writing ?
@@ -437,32 +377,49 @@
mr->mr_handle = ibmr->rkey;
mr->mr_length = ibmr->length;
mr->mr_offset = ibmr->iova;
+ trace_xprtrdma_mr_map(mr);
- *out = mr;
return seg;
out_dmamap_err:
- pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n",
- mr->mr_sg, i);
- frwr->fr_state = FRWR_IS_INVALID;
- rpcrdma_mr_put(mr);
+ mr->mr_dir = DMA_NONE;
+ trace_xprtrdma_frwr_sgerr(mr, i);
return ERR_PTR(-EIO);
out_mapmr_err:
- pr_err("rpcrdma: failed to map mr %p (%d/%d)\n",
- frwr->fr_mr, n, mr->mr_nents);
- rpcrdma_mr_defer_recovery(mr);
+ trace_xprtrdma_frwr_maperr(mr, n);
return ERR_PTR(-EIO);
}
-/* Post Send WR containing the RPC Call message.
+/**
+ * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC
+ * @cq: completion queue (ignored)
+ * @wc: completed WR
*
- * For FRMR, chain any FastReg WRs to the Send WR. Only a
+ */
+static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_cqe *cqe = wc->wr_cqe;
+ struct rpcrdma_frwr *frwr =
+ container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+
+ /* WARNING: Only wr_cqe and status are reliable at this point */
+ trace_xprtrdma_wc_fastreg(wc, frwr);
+ /* The MR will get recycled when the associated req is retransmitted */
+}
+
+/**
+ * frwr_send - post Send WR containing the RPC Call message
+ * @ia: interface adapter
+ * @req: Prepared RPC Call
+ *
+ * For FRWR, chain any FastReg WRs to the Send WR. Only a
* single ib_post_send call is needed to register memory
* and then post the Send WR.
+ *
+ * Returns the result of ib_post_send.
*/
-static int
-frwr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
{
struct ib_send_wr *post_wr;
struct rpcrdma_mr *mr;
@@ -484,45 +441,94 @@
}
/* If ib_post_send fails, the next ->send_request for
- * @req will queue these MWs for recovery.
+ * @req will queue these MRs for recovery.
*/
return ib_post_send(ia->ri_id->qp, post_wr, NULL);
}
-/* Handle a remotely invalidated mr on the @mrs list
+/**
+ * frwr_reminv - handle a remotely invalidated mr on the @mrs list
+ * @rep: Received reply
+ * @mrs: list of MRs to check
+ *
*/
-static void
-frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
+void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
{
struct rpcrdma_mr *mr;
list_for_each_entry(mr, mrs, mr_list)
if (mr->mr_handle == rep->rr_inv_rkey) {
list_del_init(&mr->mr_list);
- trace_xprtrdma_remoteinv(mr);
- mr->frwr.fr_state = FRWR_IS_INVALID;
- rpcrdma_mr_unmap_and_put(mr);
+ trace_xprtrdma_mr_remoteinv(mr);
+ rpcrdma_mr_put(mr);
break; /* only one invalidated MR per RPC */
}
}
-/* Invalidate all memory regions that were registered for "req".
+static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
+{
+ if (wc->status != IB_WC_SUCCESS)
+ rpcrdma_mr_recycle(mr);
+ else
+ rpcrdma_mr_put(mr);
+}
+
+/**
+ * frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC
+ * @cq: completion queue (ignored)
+ * @wc: completed WR
*
- * Sleeps until it is safe for the host CPU to access the
- * previously mapped memory regions.
- *
- * Caller ensures that @mrs is not empty before the call. This
- * function empties the list.
*/
-static void
-frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
+static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_cqe *cqe = wc->wr_cqe;
+ struct rpcrdma_frwr *frwr =
+ container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+ struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+
+ /* WARNING: Only wr_cqe and status are reliable at this point */
+ trace_xprtrdma_wc_li(wc, frwr);
+ __frwr_release_mr(wc, mr);
+}
+
+/**
+ * frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC
+ * @cq: completion queue (ignored)
+ * @wc: completed WR
+ *
+ * Awaken anyone waiting for an MR to finish being fenced.
+ */
+static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_cqe *cqe = wc->wr_cqe;
+ struct rpcrdma_frwr *frwr =
+ container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+ struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+
+ /* WARNING: Only wr_cqe and status are reliable at this point */
+ trace_xprtrdma_wc_li_wake(wc, frwr);
+ __frwr_release_mr(wc, mr);
+ complete(&frwr->fr_linv_done);
+}
+
+/**
+ * frwr_unmap_sync - invalidate memory regions that were registered for @req
+ * @r_xprt: controlling transport instance
+ * @req: rpcrdma_req with a non-empty list of MRs to process
+ *
+ * Sleeps until it is safe for the host CPU to access the previously mapped
+ * memory regions. This guarantees that registered MRs are properly fenced
+ * from the server before the RPC consumer accesses the data in them. It
+ * also ensures proper Send flow control: waking the next RPC waits until
+ * this RPC has relinquished all its Send Queue entries.
+ */
+void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *first, **prev, *last;
const struct ib_send_wr *bad_wr;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_frwr *frwr;
struct rpcrdma_mr *mr;
- int count, rc;
+ int rc;
/* ORDER: Invalidate all of the MRs first
*
@@ -530,33 +536,31 @@
* a single ib_post_send() call.
*/
frwr = NULL;
- count = 0;
prev = &first;
- list_for_each_entry(mr, mrs, mr_list) {
- mr->frwr.fr_state = FRWR_IS_INVALID;
+ while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
+
+ trace_xprtrdma_mr_localinv(mr);
+ r_xprt->rx_stats.local_inv_needed++;
frwr = &mr->frwr;
- trace_xprtrdma_localinv(mr);
-
frwr->fr_cqe.done = frwr_wc_localinv;
last = &frwr->fr_invwr;
- memset(last, 0, sizeof(*last));
+ last->next = NULL;
last->wr_cqe = &frwr->fr_cqe;
+ last->sg_list = NULL;
+ last->num_sge = 0;
last->opcode = IB_WR_LOCAL_INV;
+ last->send_flags = IB_SEND_SIGNALED;
last->ex.invalidate_rkey = mr->mr_handle;
- count++;
*prev = last;
prev = &last->next;
}
- if (!frwr)
- goto unmap;
/* Strong send queue ordering guarantees that when the
* last WR in the chain completes, all WRs in the chain
* are complete.
*/
- last->send_flags = IB_SEND_SIGNALED;
frwr->fr_cqe.done = frwr_wc_localinv_wake;
reinit_completion(&frwr->fr_linv_done);
@@ -564,52 +568,128 @@
* replaces the QP. The RPC reply handler won't call us
* unless ri_id->qp is a valid pointer.
*/
- r_xprt->rx_stats.local_inv_needed++;
bad_wr = NULL;
- rc = ib_post_send(ia->ri_id->qp, first, &bad_wr);
+ rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+ trace_xprtrdma_post_send(req, rc);
+
+ /* The final LOCAL_INV WR in the chain is supposed to
+ * do the wake. If it was never posted, the wake will
+ * not happen, so don't wait in that case.
+ */
if (bad_wr != first)
wait_for_completion(&frwr->fr_linv_done);
- if (rc)
- goto reset_mrs;
+ if (!rc)
+ return;
- /* ORDER: Now DMA unmap all of the MRs, and return
- * them to the free MR list.
- */
-unmap:
- while (!list_empty(mrs)) {
- mr = rpcrdma_mr_pop(mrs);
- rpcrdma_mr_unmap_and_put(mr);
- }
- return;
-
-reset_mrs:
- pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc);
-
- /* Find and reset the MRs in the LOCAL_INV WRs that did not
- * get posted.
+ /* Recycle MRs in the LOCAL_INV chain that did not get posted.
*/
while (bad_wr) {
frwr = container_of(bad_wr, struct rpcrdma_frwr,
fr_invwr);
mr = container_of(frwr, struct rpcrdma_mr, frwr);
-
- __frwr_mr_reset(ia, mr);
-
bad_wr = bad_wr->next;
+
+ list_del_init(&mr->mr_list);
+ rpcrdma_mr_recycle(mr);
}
- goto unmap;
}
-const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
- .ro_map = frwr_op_map,
- .ro_send = frwr_op_send,
- .ro_reminv = frwr_op_reminv,
- .ro_unmap_sync = frwr_op_unmap_sync,
- .ro_recover_mr = frwr_op_recover_mr,
- .ro_open = frwr_op_open,
- .ro_maxpages = frwr_op_maxpages,
- .ro_init_mr = frwr_op_init_mr,
- .ro_release_mr = frwr_op_release_mr,
- .ro_displayname = "frwr",
- .ro_send_w_inv_ok = RPCRDMA_CMP_F_SND_W_INV_OK,
-};
+/**
+ * frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC
+ * @cq: completion queue (ignored)
+ * @wc: completed WR
+ *
+ */
+static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_cqe *cqe = wc->wr_cqe;
+ struct rpcrdma_frwr *frwr =
+ container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+ struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+ struct rpcrdma_rep *rep = mr->mr_req->rl_reply;
+
+ /* WARNING: Only wr_cqe and status are reliable at this point */
+ trace_xprtrdma_wc_li_done(wc, frwr);
+ __frwr_release_mr(wc, mr);
+
+ /* Ensure @rep is generated before __frwr_release_mr */
+ smp_rmb();
+ rpcrdma_complete_rqst(rep);
+}
+
+/**
+ * frwr_unmap_async - invalidate memory regions that were registered for @req
+ * @r_xprt: controlling transport instance
+ * @req: rpcrdma_req with a non-empty list of MRs to process
+ *
+ * This guarantees that registered MRs are properly fenced from the
+ * server before the RPC consumer accesses the data in them. It also
+ * ensures proper Send flow control: waking the next RPC waits until
+ * this RPC has relinquished all its Send Queue entries.
+ */
+void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+ struct ib_send_wr *first, *last, **prev;
+ const struct ib_send_wr *bad_wr;
+ struct rpcrdma_frwr *frwr;
+ struct rpcrdma_mr *mr;
+ int rc;
+
+ /* Chain the LOCAL_INV Work Requests and post them with
+ * a single ib_post_send() call.
+ */
+ frwr = NULL;
+ prev = &first;
+ while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
+
+ trace_xprtrdma_mr_localinv(mr);
+ r_xprt->rx_stats.local_inv_needed++;
+
+ frwr = &mr->frwr;
+ frwr->fr_cqe.done = frwr_wc_localinv;
+ last = &frwr->fr_invwr;
+ last->next = NULL;
+ last->wr_cqe = &frwr->fr_cqe;
+ last->sg_list = NULL;
+ last->num_sge = 0;
+ last->opcode = IB_WR_LOCAL_INV;
+ last->send_flags = IB_SEND_SIGNALED;
+ last->ex.invalidate_rkey = mr->mr_handle;
+
+ *prev = last;
+ prev = &last->next;
+ }
+
+ /* Strong send queue ordering guarantees that when the
+ * last WR in the chain completes, all WRs in the chain
+ * are complete. The last completion will wake up the
+ * RPC waiter.
+ */
+ frwr->fr_cqe.done = frwr_wc_localinv_done;
+
+ /* Transport disconnect drains the receive CQ before it
+ * replaces the QP. The RPC reply handler won't call us
+ * unless ri_id->qp is a valid pointer.
+ */
+ bad_wr = NULL;
+ rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+ trace_xprtrdma_post_send(req, rc);
+ if (!rc)
+ return;
+
+ /* Recycle MRs in the LOCAL_INV chain that did not get posted.
+ */
+ while (bad_wr) {
+ frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr);
+ mr = container_of(frwr, struct rpcrdma_mr, frwr);
+ bad_wr = bad_wr->next;
+
+ rpcrdma_mr_recycle(mr);
+ }
+
+ /* The final LOCAL_INV WR in the chain is supposed to
+ * do the wake. If it was never posted, the wake will
+ * not happen, so wake here in that case.
+ */
+ rpcrdma_complete_rqst(req->rl_reply);
+}
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index c8ae983..b86b5fd 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -71,7 +71,6 @@
size = RPCRDMA_HDRLEN_MIN;
/* Maximum Read list size */
- maxsegs += 2; /* segment for head and tail buffers */
size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32);
/* Minimal Read chunk size */
@@ -97,7 +96,6 @@
size = RPCRDMA_HDRLEN_MIN;
/* Maximum Write list size */
- maxsegs += 2; /* segment for head and tail buffers */
size = sizeof(__be32); /* segment count */
size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32);
size += sizeof(__be32); /* list discriminator */
@@ -107,16 +105,23 @@
return size;
}
+/**
+ * rpcrdma_set_max_header_sizes - Initialize inline payload sizes
+ * @r_xprt: transport instance to initialize
+ *
+ * The max_inline fields contain the maximum size of an RPC message
+ * so the marshaling code doesn't have to repeat this calculation
+ * for every RPC.
+ */
void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- unsigned int maxsegs = ia->ri_max_segs;
+ unsigned int maxsegs = r_xprt->rx_ia.ri_max_segs;
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- ia->ri_max_inline_write = cdata->inline_wsize -
- rpcrdma_max_call_header_size(maxsegs);
- ia->ri_max_inline_read = cdata->inline_rsize -
- rpcrdma_max_reply_header_size(maxsegs);
+ ep->rep_max_inline_send =
+ ep->rep_inline_send - rpcrdma_max_call_header_size(maxsegs);
+ ep->rep_max_inline_recv =
+ ep->rep_inline_recv - rpcrdma_max_reply_header_size(maxsegs);
}
/* The client can send a request inline as long as the RPCRDMA header
@@ -133,7 +138,7 @@
struct xdr_buf *xdr = &rqst->rq_snd_buf;
unsigned int count, remaining, offset;
- if (xdr->len > r_xprt->rx_ia.ri_max_inline_write)
+ if (xdr->len > r_xprt->rx_ep.rep_max_inline_send)
return false;
if (xdr->page_len) {
@@ -161,9 +166,21 @@
static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
struct rpc_rqst *rqst)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.rep_max_inline_recv;
+}
- return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
+/* The client is required to provide a Reply chunk if the maximum
+ * size of the non-payload part of the RPC Reply is larger than
+ * the inline threshold.
+ */
+static bool
+rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt,
+ const struct rpc_rqst *rqst)
+{
+ const struct xdr_buf *buf = &rqst->rq_rcv_buf;
+
+ return (buf->head[0].iov_len + buf->tail[0].iov_len) <
+ r_xprt->rx_ep.rep_max_inline_recv;
}
/* Split @vec on page boundaries into SGEs. FMR registers pages, not
@@ -220,11 +237,12 @@
ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
page_base = offset_in_page(xdrbuf->page_base);
while (len) {
- if (unlikely(!*ppages)) {
- /* XXX: Certain upper layer operations do
- * not provide receive buffer pages.
- */
- *ppages = alloc_page(GFP_ATOMIC);
+ /* ACL likes to be lazy in allocating pages - ACLs
+ * are small by default but can get huge.
+ */
+ if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) {
+ if (!*ppages)
+ *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
if (!*ppages)
return -ENOBUFS;
}
@@ -324,6 +342,32 @@
return 0;
}
+static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct rpcrdma_mr_seg *seg,
+ int nsegs, bool writing,
+ struct rpcrdma_mr **mr)
+{
+ *mr = rpcrdma_mr_pop(&req->rl_free_mrs);
+ if (!*mr) {
+ *mr = rpcrdma_mr_get(r_xprt);
+ if (!*mr)
+ goto out_getmr_err;
+ trace_xprtrdma_mr_get(req);
+ (*mr)->mr_req = req;
+ }
+
+ rpcrdma_mr_push(*mr, &req->rl_registered);
+ return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr);
+
+out_getmr_err:
+ trace_xprtrdma_nomrs(req);
+ xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
+ if (r_xprt->rx_ep.rep_connected != -ENODEV)
+ schedule_work(&r_xprt->rx_buf.rb_refresh_worker);
+ return ERR_PTR(-EAGAIN);
+}
+
/* Register and XDR encode the Read list. Supports encoding a list of read
* segments that belong to a single read chunk.
*
@@ -338,9 +382,10 @@
*
* Only a single @pos value is currently supported.
*/
-static noinline int
-rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
- struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype)
+static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct rpc_rqst *rqst,
+ enum rpcrdma_chunktype rtype)
{
struct xdr_stream *xdr = &req->rl_stream;
struct rpcrdma_mr_seg *seg;
@@ -348,6 +393,9 @@
unsigned int pos;
int nsegs;
+ if (rtype == rpcrdma_noch)
+ goto done;
+
pos = rqst->rq_snd_buf.head[0].iov_len;
if (rtype == rpcrdma_areadch)
pos = 0;
@@ -358,21 +406,20 @@
return nsegs;
do {
- seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
- false, &mr);
+ seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr);
if (IS_ERR(seg))
return PTR_ERR(seg);
- rpcrdma_mr_push(mr, &req->rl_registered);
if (encode_read_segment(xdr, mr, pos) < 0)
return -EMSGSIZE;
- trace_xprtrdma_read_chunk(rqst->rq_task, pos, mr, nsegs);
+ trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs);
r_xprt->rx_stats.read_chunk_count++;
nsegs -= mr->mr_nents;
} while (nsegs);
- return 0;
+done:
+ return encode_item_not_present(xdr);
}
/* Register and XDR encode the Write list. Supports encoding a list
@@ -390,9 +437,10 @@
*
* Only a single Write chunk is currently supported.
*/
-static noinline int
-rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
- struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype)
+static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct rpc_rqst *rqst,
+ enum rpcrdma_chunktype wtype)
{
struct xdr_stream *xdr = &req->rl_stream;
struct rpcrdma_mr_seg *seg;
@@ -400,6 +448,9 @@
int nsegs, nchunks;
__be32 *segcount;
+ if (wtype != rpcrdma_writech)
+ goto done;
+
seg = req->rl_segments;
nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
rqst->rq_rcv_buf.head[0].iov_len,
@@ -416,16 +467,14 @@
nchunks = 0;
do {
- seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
- true, &mr);
+ seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
if (IS_ERR(seg))
return PTR_ERR(seg);
- rpcrdma_mr_push(mr, &req->rl_registered);
if (encode_rdma_segment(xdr, mr) < 0)
return -EMSGSIZE;
- trace_xprtrdma_write_chunk(rqst->rq_task, mr, nsegs);
+ trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs);
r_xprt->rx_stats.write_chunk_count++;
r_xprt->rx_stats.total_rdma_request += mr->mr_length;
nchunks++;
@@ -435,7 +484,8 @@
/* Update count of segments in this Write chunk */
*segcount = cpu_to_be32(nchunks);
- return 0;
+done:
+ return encode_item_not_present(xdr);
}
/* Register and XDR encode the Reply chunk. Supports encoding an array
@@ -450,9 +500,10 @@
* Returns zero on success, or a negative errno if a failure occurred.
* @xdr is advanced to the next position in the stream.
*/
-static noinline int
-rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
- struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype)
+static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct rpc_rqst *rqst,
+ enum rpcrdma_chunktype wtype)
{
struct xdr_stream *xdr = &req->rl_stream;
struct rpcrdma_mr_seg *seg;
@@ -460,6 +511,9 @@
int nsegs, nchunks;
__be32 *segcount;
+ if (wtype != rpcrdma_replych)
+ return encode_item_not_present(xdr);
+
seg = req->rl_segments;
nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
if (nsegs < 0)
@@ -474,16 +528,14 @@
nchunks = 0;
do {
- seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
- true, &mr);
+ seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
if (IS_ERR(seg))
return PTR_ERR(seg);
- rpcrdma_mr_push(mr, &req->rl_registered);
if (encode_rdma_segment(xdr, mr) < 0)
return -EMSGSIZE;
- trace_xprtrdma_reply_chunk(rqst->rq_task, mr, nsegs);
+ trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs);
r_xprt->rx_stats.reply_chunk_count++;
r_xprt->rx_stats.total_rdma_request += mr->mr_length;
nchunks++;
@@ -496,51 +548,57 @@
return 0;
}
+static void rpcrdma_sendctx_done(struct kref *kref)
+{
+ struct rpcrdma_req *req =
+ container_of(kref, struct rpcrdma_req, rl_kref);
+ struct rpcrdma_rep *rep = req->rl_reply;
+
+ rpcrdma_complete_rqst(rep);
+ rep->rr_rxprt->rx_stats.reply_waits_for_send++;
+}
+
/**
- * rpcrdma_unmap_sendctx - DMA-unmap Send buffers
+ * rpcrdma_sendctx_unmap - DMA-unmap Send buffer
* @sc: sendctx containing SGEs to unmap
*
*/
-void
-rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc)
+void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
{
- struct rpcrdma_ia *ia = &sc->sc_xprt->rx_ia;
struct ib_sge *sge;
- unsigned int count;
+
+ if (!sc->sc_unmap_count)
+ return;
/* The first two SGEs contain the transport header and
* the inline buffer. These are always left mapped so
* they can be cheaply re-used.
*/
- sge = &sc->sc_sges[2];
- for (count = sc->sc_unmap_count; count; ++sge, --count)
- ib_dma_unmap_page(ia->ri_device,
- sge->addr, sge->length, DMA_TO_DEVICE);
+ for (sge = &sc->sc_sges[2]; sc->sc_unmap_count;
+ ++sge, --sc->sc_unmap_count)
+ ib_dma_unmap_page(sc->sc_device, sge->addr, sge->length,
+ DMA_TO_DEVICE);
- if (test_and_clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &sc->sc_req->rl_flags)) {
- smp_mb__after_atomic();
- wake_up_bit(&sc->sc_req->rl_flags, RPCRDMA_REQ_F_TX_RESOURCES);
- }
+ kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done);
}
/* Prepare an SGE for the RPC-over-RDMA transport header.
*/
-static bool
-rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
- u32 len)
+static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req, u32 len)
{
struct rpcrdma_sendctx *sc = req->rl_sendctx;
struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
struct ib_sge *sge = sc->sc_sges;
- if (!rpcrdma_dma_map_regbuf(ia, rb))
+ if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
goto out_regbuf;
sge->addr = rdmab_addr(rb);
sge->length = len;
sge->lkey = rdmab_lkey(rb);
- ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr,
- sge->length, DMA_TO_DEVICE);
+ ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
+ DMA_TO_DEVICE);
sc->sc_wr.num_sge++;
return true;
@@ -552,23 +610,23 @@
/* Prepare the Send SGEs. The head and tail iovec, and each entry
* in the page list, gets its own SGE.
*/
-static bool
-rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
- struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
+static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct xdr_buf *xdr,
+ enum rpcrdma_chunktype rtype)
{
struct rpcrdma_sendctx *sc = req->rl_sendctx;
unsigned int sge_no, page_base, len, remaining;
struct rpcrdma_regbuf *rb = req->rl_sendbuf;
- struct ib_device *device = ia->ri_device;
struct ib_sge *sge = sc->sc_sges;
- u32 lkey = ia->ri_pd->local_dma_lkey;
struct page *page, **ppages;
/* The head iovec is straightforward, as it is already
* DMA-mapped. Sync the content that has changed.
*/
- if (!rpcrdma_dma_map_regbuf(ia, rb))
+ if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
goto out_regbuf;
+ sc->sc_device = rdmab_device(rb);
sge_no = 1;
sge[sge_no].addr = rdmab_addr(rb);
sge[sge_no].length = xdr->head[0].iov_len;
@@ -615,13 +673,14 @@
goto out_mapping_overflow;
len = min_t(u32, PAGE_SIZE - page_base, remaining);
- sge[sge_no].addr = ib_dma_map_page(device, *ppages,
- page_base, len,
- DMA_TO_DEVICE);
- if (ib_dma_mapping_error(device, sge[sge_no].addr))
+ sge[sge_no].addr =
+ ib_dma_map_page(rdmab_device(rb), *ppages,
+ page_base, len, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdmab_device(rb),
+ sge[sge_no].addr))
goto out_mapping_err;
sge[sge_no].length = len;
- sge[sge_no].lkey = lkey;
+ sge[sge_no].lkey = rdmab_lkey(rb);
sc->sc_unmap_count++;
ppages++;
@@ -642,20 +701,20 @@
map_tail:
sge_no++;
- sge[sge_no].addr = ib_dma_map_page(device, page,
- page_base, len,
- DMA_TO_DEVICE);
- if (ib_dma_mapping_error(device, sge[sge_no].addr))
+ sge[sge_no].addr =
+ ib_dma_map_page(rdmab_device(rb), page, page_base, len,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdmab_device(rb), sge[sge_no].addr))
goto out_mapping_err;
sge[sge_no].length = len;
- sge[sge_no].lkey = lkey;
+ sge[sge_no].lkey = rdmab_lkey(rb);
sc->sc_unmap_count++;
}
out:
sc->sc_wr.num_sge += sge_no;
if (sc->sc_unmap_count)
- __set_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags);
+ kref_get(&req->rl_kref);
return true;
out_regbuf:
@@ -663,13 +722,13 @@
return false;
out_mapping_overflow:
- rpcrdma_unmap_sendctx(sc);
+ rpcrdma_sendctx_unmap(sc);
pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
return false;
out_mapping_err:
- rpcrdma_unmap_sendctx(sc);
- pr_err("rpcrdma: Send mapping error\n");
+ rpcrdma_sendctx_unmap(sc);
+ trace_xprtrdma_dma_maperr(sge[sge_no].addr);
return false;
}
@@ -688,22 +747,28 @@
struct rpcrdma_req *req, u32 hdrlen,
struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
{
- req->rl_sendctx = rpcrdma_sendctx_get_locked(&r_xprt->rx_buf);
+ int ret;
+
+ ret = -EAGAIN;
+ req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt);
if (!req->rl_sendctx)
- return -EAGAIN;
+ goto err;
req->rl_sendctx->sc_wr.num_sge = 0;
req->rl_sendctx->sc_unmap_count = 0;
req->rl_sendctx->sc_req = req;
- __clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags);
+ kref_init(&req->rl_kref);
- if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen))
- return -EIO;
-
+ ret = -EIO;
+ if (!rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen))
+ goto err;
if (rtype != rpcrdma_areadch)
- if (!rpcrdma_prepare_msg_sges(&r_xprt->rx_ia, req, xdr, rtype))
- return -EIO;
-
+ if (!rpcrdma_prepare_msg_sges(r_xprt, req, xdr, rtype))
+ goto err;
return 0;
+
+err:
+ trace_xprtrdma_prepsend_failed(&req->rl_slot, ret);
+ return ret;
}
/**
@@ -736,8 +801,8 @@
int ret;
rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
- xdr_init_encode(xdr, &req->rl_hdrbuf,
- req->rl_rdmabuf->rg_base);
+ xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf),
+ rqst);
/* Fixed header fields */
ret = -EMSGSIZE;
@@ -766,7 +831,8 @@
*/
if (rpcrdma_results_inline(r_xprt, rqst))
wtype = rpcrdma_noch;
- else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ)
+ else if ((ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) &&
+ rpcrdma_nonpayload_inline(r_xprt, rqst))
wtype = rpcrdma_writech;
else
wtype = rpcrdma_replych;
@@ -801,12 +867,7 @@
* chunks. Very likely the connection has been replaced,
* so these registrations are invalid and unusable.
*/
- while (unlikely(!list_empty(&req->rl_registered))) {
- struct rpcrdma_mr *mr;
-
- mr = rpcrdma_mr_pop(&req->rl_registered);
- rpcrdma_mr_defer_recovery(mr);
- }
+ frwr_recycle(req);
/* This implementation supports the following combinations
* of chunk lists in one RPC-over-RDMA Call message:
@@ -830,49 +891,28 @@
* send a Call message with a Position Zero Read chunk and a
* regular Read chunk at the same time.
*/
- if (rtype != rpcrdma_noch) {
- ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype);
- if (ret)
- goto out_err;
- }
- ret = encode_item_not_present(xdr);
+ ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype);
+ if (ret)
+ goto out_err;
+ ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype);
+ if (ret)
+ goto out_err;
+ ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype);
if (ret)
goto out_err;
- if (wtype == rpcrdma_writech) {
- ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype);
- if (ret)
- goto out_err;
- }
- ret = encode_item_not_present(xdr);
- if (ret)
- goto out_err;
-
- if (wtype != rpcrdma_replych)
- ret = encode_item_not_present(xdr);
- else
- ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype);
- if (ret)
- goto out_err;
-
- trace_xprtrdma_marshal(rqst, xdr_stream_pos(xdr), rtype, wtype);
-
- ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr),
+ ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len,
&rqst->rq_snd_buf, rtype);
if (ret)
goto out_err;
+
+ trace_xprtrdma_marshal(req, rtype, wtype);
return 0;
out_err:
- switch (ret) {
- case -EAGAIN:
- xprt_wait_for_buffer_space(rqst->rq_task, NULL);
- break;
- case -ENOBUFS:
- break;
- default:
- r_xprt->rx_stats.failed_marshal_count++;
- }
+ trace_xprtrdma_marshal_failed(rqst, ret);
+ r_xprt->rx_stats.failed_marshal_count++;
+ frwr_reset(req);
return ret;
}
@@ -1190,17 +1230,20 @@
p = xdr_inline_decode(xdr, 2 * sizeof(*p));
if (!p)
break;
- dprintk("RPC: %5u: %s: server reports version error (%u-%u)\n",
- rqst->rq_task->tk_pid, __func__,
- be32_to_cpup(p), be32_to_cpu(*(p + 1)));
+ dprintk("RPC: %s: server reports "
+ "version error (%u-%u), xid %08x\n", __func__,
+ be32_to_cpup(p), be32_to_cpu(*(p + 1)),
+ be32_to_cpu(rep->rr_xid));
break;
case err_chunk:
- dprintk("RPC: %5u: %s: server reports header decoding error\n",
- rqst->rq_task->tk_pid, __func__);
+ dprintk("RPC: %s: server reports "
+ "header decoding error, xid %08x\n", __func__,
+ be32_to_cpu(rep->rr_xid));
break;
default:
- dprintk("RPC: %5u: %s: server reports unrecognized error %d\n",
- rqst->rq_task->tk_pid, __func__, be32_to_cpup(p));
+ dprintk("RPC: %s: server reports "
+ "unrecognized error %d, xid %08x\n", __func__,
+ be32_to_cpup(p), be32_to_cpu(rep->rr_xid));
}
r_xprt->rx_stats.bad_reply_count++;
@@ -1216,11 +1259,8 @@
struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
struct rpc_rqst *rqst = rep->rr_rqst;
- unsigned long cwnd;
int status;
- xprt->reestablish_timeout = 0;
-
switch (rep->rr_proc) {
case rdma_msg:
status = rpcrdma_decode_msg(r_xprt, rep, rqst);
@@ -1238,15 +1278,10 @@
goto out_badheader;
out:
- spin_lock(&xprt->recv_lock);
- cwnd = xprt->cwnd;
- xprt->cwnd = r_xprt->rx_buf.rb_credits << RPC_CWNDSHIFT;
- if (xprt->cwnd > cwnd)
- xprt_release_rqst_cong(rqst->rq_task);
-
+ spin_lock(&xprt->queue_lock);
xprt_complete_rqst(rqst->rq_task, status);
xprt_unpin_rqst(rqst);
- spin_unlock(&xprt->recv_lock);
+ spin_unlock(&xprt->queue_lock);
return;
/* If the incoming reply terminated a pending RPC, the next
@@ -1256,56 +1291,20 @@
out_badheader:
trace_xprtrdma_reply_hdr(rep);
r_xprt->rx_stats.bad_reply_count++;
- status = -EIO;
goto out;
}
-void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+static void rpcrdma_reply_done(struct kref *kref)
{
- /* Invalidate and unmap the data payloads before waking
- * the waiting application. This guarantees the memory
- * regions are properly fenced from the server before the
- * application accesses the data. It also ensures proper
- * send flow control: waking the next RPC waits until this
- * RPC has relinquished all its Send Queue entries.
- */
- if (!list_empty(&req->rl_registered))
- r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt,
- &req->rl_registered);
+ struct rpcrdma_req *req =
+ container_of(kref, struct rpcrdma_req, rl_kref);
- /* Ensure that any DMA mapped pages associated with
- * the Send of the RPC Call have been unmapped before
- * allowing the RPC to complete. This protects argument
- * memory not controlled by the RPC client from being
- * re-used before we're done with it.
- */
- if (test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
- r_xprt->rx_stats.reply_waits_for_send++;
- out_of_line_wait_on_bit(&req->rl_flags,
- RPCRDMA_REQ_F_TX_RESOURCES,
- bit_wait,
- TASK_UNINTERRUPTIBLE);
- }
+ rpcrdma_complete_rqst(req->rl_reply);
}
-/* Reply handling runs in the poll worker thread. Anything that
- * might wait is deferred to a separate workqueue.
- */
-void rpcrdma_deferred_completion(struct work_struct *work)
-{
- struct rpcrdma_rep *rep =
- container_of(work, struct rpcrdma_rep, rr_work);
- struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst);
- struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
-
- trace_xprtrdma_defer_cmp(rep);
- if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
- r_xprt->rx_ia.ri_ops->ro_reminv(rep, &req->rl_registered);
- rpcrdma_release_rqst(r_xprt, req);
- rpcrdma_complete_rqst(rep);
-}
-
-/* Process received RPC/RDMA messages.
+/**
+ * rpcrdma_reply_handler - Process received RPC/RDMA messages
+ * @rep: Incoming rpcrdma_rep object to process
*
* Errors must result in the RPC task either being awakened, or
* allowed to timeout, to discover the errors at that time.
@@ -1320,14 +1319,15 @@
u32 credits;
__be32 *p;
- --buf->rb_posted_receives;
-
- if (rep->rr_hdrbuf.head[0].iov_len == 0)
- goto out_badstatus;
+ /* Any data means we had a useful conversation, so
+ * then we don't need to delay the next reconnect.
+ */
+ if (xprt->reestablish_timeout)
+ xprt->reestablish_timeout = 0;
/* Fixed transport header fields */
xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf,
- rep->rr_hdrbuf.head[0].iov_base);
+ rep->rr_hdrbuf.head[0].iov_base, NULL);
p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p));
if (unlikely(!p))
goto out_shortreply;
@@ -1345,51 +1345,55 @@
/* Match incoming rpcrdma_rep to an rpcrdma_req to
* get context for handling any incoming chunks.
*/
- spin_lock(&xprt->recv_lock);
+ spin_lock(&xprt->queue_lock);
rqst = xprt_lookup_rqst(xprt, rep->rr_xid);
if (!rqst)
goto out_norqst;
xprt_pin_rqst(rqst);
+ spin_unlock(&xprt->queue_lock);
if (credits == 0)
credits = 1; /* don't deadlock */
else if (credits > buf->rb_max_requests)
credits = buf->rb_max_requests;
- buf->rb_credits = credits;
-
- spin_unlock(&xprt->recv_lock);
+ if (buf->rb_credits != credits) {
+ spin_lock(&xprt->transport_lock);
+ buf->rb_credits = credits;
+ xprt->cwnd = credits << RPC_CWNDSHIFT;
+ spin_unlock(&xprt->transport_lock);
+ }
req = rpcr_to_rdmar(rqst);
+ if (req->rl_reply) {
+ trace_xprtrdma_leaked_rep(rqst, req->rl_reply);
+ rpcrdma_recv_buffer_put(req->rl_reply);
+ }
req->rl_reply = rep;
rep->rr_rqst = rqst;
- clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags);
trace_xprtrdma_reply(rqst->rq_task, rep, req, credits);
- rpcrdma_post_recvs(r_xprt, false);
- queue_work(rpcrdma_receive_wq, &rep->rr_work);
+ if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
+ frwr_reminv(rep, &req->rl_registered);
+ if (!list_empty(&req->rl_registered))
+ frwr_unmap_async(r_xprt, req);
+ /* LocalInv completion will complete the RPC */
+ else
+ kref_put(&req->rl_kref, rpcrdma_reply_done);
return;
out_badversion:
trace_xprtrdma_reply_vers(rep);
- goto repost;
+ goto out;
-/* The RPC transaction has already been terminated, or the header
- * is corrupt.
- */
out_norqst:
- spin_unlock(&xprt->recv_lock);
+ spin_unlock(&xprt->queue_lock);
trace_xprtrdma_reply_rqst(rep);
- goto repost;
+ goto out;
out_shortreply:
trace_xprtrdma_reply_short(rep);
-/* If no pending RPC transaction was matched, post a replacement
- * receive buffer before returning.
- */
-repost:
- rpcrdma_post_recvs(r_xprt, false);
-out_badstatus:
+out:
rpcrdma_recv_buffer_put(rep);
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 134bef6..97bca50 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -73,8 +73,6 @@
atomic_t rdma_stat_sq_poll;
atomic_t rdma_stat_sq_prod;
-struct workqueue_struct *svc_rdma_wq;
-
/*
* This function implements reading and resetting an atomic_t stat
* variable through read/write to a proc file. Any write to the file
@@ -230,14 +228,10 @@
void svc_rdma_cleanup(void)
{
dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
- destroy_workqueue(svc_rdma_wq);
if (svcrdma_table_header) {
unregister_sysctl_table(svcrdma_table_header);
svcrdma_table_header = NULL;
}
-#if defined(CONFIG_SUNRPC_BACKCHANNEL)
- svc_unreg_xprt_class(&svc_rdma_bc_class);
-#endif
svc_unreg_xprt_class(&svc_rdma_class);
}
@@ -249,18 +243,11 @@
dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests);
dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
- svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0);
- if (!svc_rdma_wq)
- return -ENOMEM;
-
if (!svcrdma_table_header)
svcrdma_table_header =
register_sysctl_table(svcrdma_root_table);
/* Register RDMA with the SVC transport switch */
svc_reg_xprt_class(&svc_rdma_class);
-#if defined(CONFIG_SUNRPC_BACKCHANNEL)
- svc_reg_xprt_class(&svc_rdma_bc_class);
-#endif
return 0;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index b982766..d1fcc41 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -5,8 +5,6 @@
* Support for backward direction RPCs on RPC/RDMA (server-side).
*/
-#include <linux/module.h>
-
#include <linux/sunrpc/svc_rdma.h>
#include "xprt_rdma.h"
@@ -32,7 +30,6 @@
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct kvec *dst, *src = &rcvbuf->head[0];
struct rpc_rqst *req;
- unsigned long cwnd;
u32 credits;
size_t len;
__be32 xid;
@@ -56,7 +53,7 @@
if (src->iov_len < 24)
goto out_shortreply;
- spin_lock(&xprt->recv_lock);
+ spin_lock(&xprt->queue_lock);
req = xprt_lookup_rqst(xprt, xid);
if (!req)
goto out_notfound;
@@ -66,6 +63,8 @@
if (dst->iov_len < len)
goto out_unlock;
memcpy(dst->iov_base, p, len);
+ xprt_pin_rqst(req);
+ spin_unlock(&xprt->queue_lock);
credits = be32_to_cpup(rdma_resp + 2);
if (credits == 0)
@@ -73,20 +72,18 @@
else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
credits = r_xprt->rx_buf.rb_bc_max_requests;
- spin_lock_bh(&xprt->transport_lock);
- cwnd = xprt->cwnd;
+ spin_lock(&xprt->transport_lock);
xprt->cwnd = credits << RPC_CWNDSHIFT;
- if (xprt->cwnd > cwnd)
- xprt_release_rqst_cong(req->rq_task);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
-
+ spin_lock(&xprt->queue_lock);
ret = 0;
xprt_complete_rqst(req->rq_task, rcvbuf->len);
+ xprt_unpin_rqst(req);
rcvbuf->len = 0;
out_unlock:
- spin_unlock(&xprt->recv_lock);
+ spin_unlock(&xprt->queue_lock);
out:
return ret;
@@ -203,11 +200,10 @@
svc_rdma_send_ctxt_put(rdma, ctxt);
goto drop_connection;
}
- return rc;
+ return 0;
drop_connection:
dprintk("svcrdma: failed to send bc call\n");
- xprt_disconnect_done(xprt);
return -ENOTCONN;
}
@@ -215,9 +211,8 @@
* connection.
*/
static int
-xprt_rdma_bc_send_request(struct rpc_task *task)
+xprt_rdma_bc_send_request(struct rpc_rqst *rqst)
{
- struct rpc_rqst *rqst = task->tk_rqstp;
struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
struct svcxprt_rdma *rdma;
int ret;
@@ -225,17 +220,15 @@
dprintk("svcrdma: sending bc call with xid: %08x\n",
be32_to_cpu(rqst->rq_xid));
- if (!mutex_trylock(&sxprt->xpt_mutex)) {
- rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL);
- if (!mutex_trylock(&sxprt->xpt_mutex))
- return -EAGAIN;
- rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task);
- }
+ mutex_lock(&sxprt->xpt_mutex);
ret = -ENOTCONN;
rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
- if (!test_bit(XPT_DEAD, &sxprt->xpt_flags))
+ if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) {
ret = rpcrdma_bc_send_request(rdma, rqst);
+ if (ret == -ENOTCONN)
+ svc_close_xprt(sxprt);
+ }
mutex_unlock(&sxprt->xpt_mutex);
@@ -257,7 +250,6 @@
dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
xprt_free(xprt);
- module_put(THIS_MODULE);
}
static const struct rpc_xprt_ops xprt_rdma_bc_procs = {
@@ -269,7 +261,7 @@
.buf_alloc = xprt_rdma_bc_allocate,
.buf_free = xprt_rdma_bc_free,
.send_request = xprt_rdma_bc_send_request,
- .set_retrans_timeout = xprt_set_retrans_timeout_def,
+ .wait_for_reply_request = xprt_wait_for_reply_request_def,
.close = xprt_rdma_bc_close,
.destroy = xprt_rdma_bc_put,
.print_stats = xprt_rdma_print_stats
@@ -312,7 +304,6 @@
xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
xprt->prot = XPRT_TRANSPORT_BC_RDMA;
- xprt->tsh_size = 0;
xprt->ops = &xprt_rdma_bc_procs;
memcpy(&xprt->addr, args->dstaddr, args->addrlen);
@@ -329,20 +320,9 @@
args->bc_xprt->xpt_bc_xprt = xprt;
xprt->bc_xprt = args->bc_xprt;
- if (!try_module_get(THIS_MODULE))
- goto out_fail;
-
/* Final put for backchannel xprt is in __svc_rdma_free */
xprt_get(xprt);
return xprt;
-
-out_fail:
- xprt_rdma_free_addresses(xprt);
- args->bc_xprt->xpt_bc_xprt = NULL;
- args->bc_xprt->xpt_bc_xps = NULL;
- xprt_put(xprt);
- xprt_free(xprt);
- return ERR_PTR(-EINVAL);
}
struct xprt_class xprt_rdma_bc = {
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index b24d5b8..96bccd3 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -172,9 +172,10 @@
void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma)
{
struct svc_rdma_recv_ctxt *ctxt;
+ struct llist_node *node;
- while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts))) {
- list_del(&ctxt->rc_list);
+ while ((node = llist_del_first(&rdma->sc_recv_ctxts))) {
+ ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
svc_rdma_recv_ctxt_destroy(rdma, ctxt);
}
}
@@ -183,21 +184,18 @@
svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
{
struct svc_rdma_recv_ctxt *ctxt;
+ struct llist_node *node;
- spin_lock(&rdma->sc_recv_lock);
- ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts);
- if (!ctxt)
+ node = llist_del_first(&rdma->sc_recv_ctxts);
+ if (!node)
goto out_empty;
- list_del(&ctxt->rc_list);
- spin_unlock(&rdma->sc_recv_lock);
+ ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
out:
ctxt->rc_page_count = 0;
return ctxt;
out_empty:
- spin_unlock(&rdma->sc_recv_lock);
-
ctxt = svc_rdma_recv_ctxt_alloc(rdma);
if (!ctxt)
return NULL;
@@ -218,11 +216,9 @@
for (i = 0; i < ctxt->rc_page_count; i++)
put_page(ctxt->rc_pages[i]);
- if (!ctxt->rc_temp) {
- spin_lock(&rdma->sc_recv_lock);
- list_add(&ctxt->rc_list, &rdma->sc_recv_ctxts);
- spin_unlock(&rdma->sc_recv_lock);
- } else
+ if (!ctxt->rc_temp)
+ llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
+ else
svc_rdma_recv_ctxt_destroy(rdma, ctxt);
}
@@ -272,11 +268,8 @@
return false;
ctxt->rc_temp = true;
ret = __svc_rdma_post_recv(rdma, ctxt);
- if (ret) {
- pr_err("svcrdma: failure posting recv buffers: %d\n",
- ret);
+ if (ret)
return false;
- }
}
return true;
}
@@ -314,17 +307,14 @@
spin_lock(&rdma->sc_rq_dto_lock);
list_add_tail(&ctxt->rc_list, &rdma->sc_rq_dto_q);
- spin_unlock(&rdma->sc_rq_dto_lock);
+ /* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */
set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
+ spin_unlock(&rdma->sc_rq_dto_lock);
if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags))
svc_xprt_enqueue(&rdma->sc_xprt);
goto out;
flushed:
- if (wc->status != IB_WC_WR_FLUSH_ERR)
- pr_err("svcrdma: Recv: %s (%u/0x%x)\n",
- ib_wc_status_msg(wc->status),
- wc->status, wc->vendor_err);
post_err:
svc_rdma_recv_ctxt_put(rdma, ctxt);
set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
@@ -485,6 +475,68 @@
return p;
}
+/* RPC-over-RDMA Version One private extension: Remote Invalidation.
+ * Responder's choice: requester signals it can handle Send With
+ * Invalidate, and responder chooses one R_key to invalidate.
+ *
+ * If there is exactly one distinct R_key in the received transport
+ * header, set rc_inv_rkey to that R_key. Otherwise, set it to zero.
+ *
+ * Perform this operation while the received transport header is
+ * still in the CPU cache.
+ */
+static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma,
+ struct svc_rdma_recv_ctxt *ctxt)
+{
+ __be32 inv_rkey, *p;
+ u32 i, segcount;
+
+ ctxt->rc_inv_rkey = 0;
+
+ if (!rdma->sc_snd_w_inv)
+ return;
+
+ inv_rkey = xdr_zero;
+ p = ctxt->rc_recv_buf;
+ p += rpcrdma_fixed_maxsz;
+
+ /* Read list */
+ while (*p++ != xdr_zero) {
+ p++; /* position */
+ if (inv_rkey == xdr_zero)
+ inv_rkey = *p;
+ else if (inv_rkey != *p)
+ return;
+ p += 4;
+ }
+
+ /* Write list */
+ while (*p++ != xdr_zero) {
+ segcount = be32_to_cpup(p++);
+ for (i = 0; i < segcount; i++) {
+ if (inv_rkey == xdr_zero)
+ inv_rkey = *p;
+ else if (inv_rkey != *p)
+ return;
+ p += 4;
+ }
+ }
+
+ /* Reply chunk */
+ if (*p++ != xdr_zero) {
+ segcount = be32_to_cpup(p++);
+ for (i = 0; i < segcount; i++) {
+ if (inv_rkey == xdr_zero)
+ inv_rkey = *p;
+ else if (inv_rkey != *p)
+ return;
+ p += 4;
+ }
+ }
+
+ ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey);
+}
+
/* On entry, xdr->head[0].iov_base points to first byte in the
* RPC-over-RDMA header.
*
@@ -746,6 +798,7 @@
svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
return ret;
}
+ svc_rdma_get_inv_rkey(rdma_xprt, ctxt);
p += rpcrdma_fixed_maxsz;
if (*p != xdr_zero)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index dc19517..48fe3b1 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -64,8 +64,7 @@
spin_unlock(&rdma->sc_rw_ctxt_lock);
} else {
spin_unlock(&rdma->sc_rw_ctxt_lock);
- ctxt = kmalloc(sizeof(*ctxt) +
- SG_CHUNK_SIZE * sizeof(struct scatterlist),
+ ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE),
GFP_KERNEL);
if (!ctxt)
goto out;
@@ -74,7 +73,8 @@
ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
- ctxt->rw_sg_table.sgl)) {
+ ctxt->rw_sg_table.sgl,
+ SG_CHUNK_SIZE)) {
kfree(ctxt);
ctxt = NULL;
}
@@ -85,7 +85,7 @@
static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
struct svc_rdma_rw_ctxt *ctxt)
{
- sg_free_table_chained(&ctxt->rw_sg_table, true);
+ sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE);
spin_lock(&rdma->sc_rw_ctxt_lock);
list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts);
@@ -213,13 +213,8 @@
atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
wake_up(&rdma->sc_send_wait);
- if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ if (unlikely(wc->status != IB_WC_SUCCESS))
set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
- if (wc->status != IB_WC_WR_FLUSH_ERR)
- pr_err("svcrdma: write ctx: %s (%u/0x%x)\n",
- ib_wc_status_msg(wc->status),
- wc->status, wc->vendor_err);
- }
svc_rdma_write_info_free(info);
}
@@ -278,18 +273,15 @@
if (unlikely(wc->status != IB_WC_SUCCESS)) {
set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
- if (wc->status != IB_WC_WR_FLUSH_ERR)
- pr_err("svcrdma: read ctx: %s (%u/0x%x)\n",
- ib_wc_status_msg(wc->status),
- wc->status, wc->vendor_err);
svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt);
} else {
spin_lock(&rdma->sc_rq_dto_lock);
list_add_tail(&info->ri_readctxt->rc_list,
&rdma->sc_read_complete_q);
+ /* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */
+ set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
spin_unlock(&rdma->sc_rq_dto_lock);
- set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
svc_xprt_enqueue(&rdma->sc_xprt);
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 8602a5f..6fdba72 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -272,10 +272,6 @@
if (unlikely(wc->status != IB_WC_SUCCESS)) {
set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
svc_xprt_enqueue(&rdma->sc_xprt);
- if (wc->status != IB_WC_WR_FLUSH_ERR)
- pr_err("svcrdma: Send: %s (%u/0x%x)\n",
- ib_wc_status_msg(wc->status),
- wc->status, wc->vendor_err);
}
svc_xprt_put(&rdma->sc_xprt);
@@ -484,32 +480,6 @@
*reply = NULL;
}
-/* RPC-over-RDMA Version One private extension: Remote Invalidation.
- * Responder's choice: requester signals it can handle Send With
- * Invalidate, and responder chooses one rkey to invalidate.
- *
- * Find a candidate rkey to invalidate when sending a reply. Picks the
- * first R_key it finds in the chunk lists.
- *
- * Returns zero if RPC's chunk lists are empty.
- */
-static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp,
- __be32 *wr_lst, __be32 *rp_ch)
-{
- __be32 *p;
-
- p = rdma_argp + rpcrdma_fixed_maxsz;
- if (*p != xdr_zero)
- p += 2;
- else if (wr_lst && be32_to_cpup(wr_lst + 1))
- p = wr_lst + 2;
- else if (rp_ch && be32_to_cpup(rp_ch + 1))
- p = rp_ch + 2;
- else
- return 0;
- return be32_to_cpup(p);
-}
-
static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *ctxt,
struct page *page,
@@ -563,6 +533,99 @@
DMA_TO_DEVICE);
}
+/* If the xdr_buf has more elements than the device can
+ * transmit in a single RDMA Send, then the reply will
+ * have to be copied into a bounce buffer.
+ */
+static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
+ struct xdr_buf *xdr,
+ __be32 *wr_lst)
+{
+ int elements;
+
+ /* xdr->head */
+ elements = 1;
+
+ /* xdr->pages */
+ if (!wr_lst) {
+ unsigned int remaining;
+ unsigned long pageoff;
+
+ pageoff = xdr->page_base & ~PAGE_MASK;
+ remaining = xdr->page_len;
+ while (remaining) {
+ ++elements;
+ remaining -= min_t(u32, PAGE_SIZE - pageoff,
+ remaining);
+ pageoff = 0;
+ }
+ }
+
+ /* xdr->tail */
+ if (xdr->tail[0].iov_len)
+ ++elements;
+
+ /* assume 1 SGE is needed for the transport header */
+ return elements >= rdma->sc_max_send_sges;
+}
+
+/* The device is not capable of sending the reply directly.
+ * Assemble the elements of @xdr into the transport header
+ * buffer.
+ */
+static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *ctxt,
+ struct xdr_buf *xdr, __be32 *wr_lst)
+{
+ unsigned char *dst, *tailbase;
+ unsigned int taillen;
+
+ dst = ctxt->sc_xprt_buf;
+ dst += ctxt->sc_sges[0].length;
+
+ memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len);
+ dst += xdr->head[0].iov_len;
+
+ tailbase = xdr->tail[0].iov_base;
+ taillen = xdr->tail[0].iov_len;
+ if (wr_lst) {
+ u32 xdrpad;
+
+ xdrpad = xdr_padsize(xdr->page_len);
+ if (taillen && xdrpad) {
+ tailbase += xdrpad;
+ taillen -= xdrpad;
+ }
+ } else {
+ unsigned int len, remaining;
+ unsigned long pageoff;
+ struct page **ppages;
+
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ pageoff = xdr->page_base & ~PAGE_MASK;
+ remaining = xdr->page_len;
+ while (remaining) {
+ len = min_t(u32, PAGE_SIZE - pageoff, remaining);
+
+ memcpy(dst, page_address(*ppages), len);
+ remaining -= len;
+ dst += len;
+ pageoff = 0;
+ }
+ }
+
+ if (taillen)
+ memcpy(dst, tailbase, taillen);
+
+ ctxt->sc_sges[0].length += xdr->len;
+ ib_dma_sync_single_for_device(rdma->sc_pd->device,
+ ctxt->sc_sges[0].addr,
+ ctxt->sc_sges[0].length,
+ DMA_TO_DEVICE);
+
+ return 0;
+}
+
/* svc_rdma_map_reply_msg - Map the buffer holding RPC message
* @rdma: controlling transport
* @ctxt: send_ctxt for the Send WR
@@ -585,8 +648,10 @@
u32 xdr_pad;
int ret;
- if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
- return -EIO;
+ if (svc_rdma_pull_up_needed(rdma, xdr, wr_lst))
+ return svc_rdma_pull_up_reply_msg(rdma, ctxt, xdr, wr_lst);
+
+ ++ctxt->sc_cur_sge_no;
ret = svc_rdma_dma_map_buf(rdma, ctxt,
xdr->head[0].iov_base,
xdr->head[0].iov_len);
@@ -617,8 +682,7 @@
while (remaining) {
len = min_t(u32, PAGE_SIZE - page_off, remaining);
- if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
- return -EIO;
+ ++ctxt->sc_cur_sge_no;
ret = svc_rdma_dma_map_page(rdma, ctxt, *ppages++,
page_off, len);
if (ret < 0)
@@ -632,8 +696,7 @@
len = xdr->tail[0].iov_len;
tail:
if (len) {
- if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
- return -EIO;
+ ++ctxt->sc_cur_sge_no;
ret = svc_rdma_dma_map_buf(rdma, ctxt, base, len);
if (ret < 0)
return ret;
@@ -672,7 +735,7 @@
*
* RDMA Send is the last step of transmitting an RPC reply. Pages
* involved in the earlier RDMA Writes are here transferred out
- * of the rqstp and into the ctxt's page array. These pages are
+ * of the rqstp and into the sctxt's page array. These pages are
* DMA unmapped by each Write completion, but the subsequent Send
* completion finally releases these pages.
*
@@ -680,32 +743,31 @@
* - The Reply's transport header will never be larger than a page.
*/
static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt,
- __be32 *rdma_argp,
+ struct svc_rdma_send_ctxt *sctxt,
+ struct svc_rdma_recv_ctxt *rctxt,
struct svc_rqst *rqstp,
__be32 *wr_lst, __be32 *rp_ch)
{
int ret;
if (!rp_ch) {
- ret = svc_rdma_map_reply_msg(rdma, ctxt,
+ ret = svc_rdma_map_reply_msg(rdma, sctxt,
&rqstp->rq_res, wr_lst);
if (ret < 0)
return ret;
}
- svc_rdma_save_io_pages(rqstp, ctxt);
+ svc_rdma_save_io_pages(rqstp, sctxt);
- ctxt->sc_send_wr.opcode = IB_WR_SEND;
- if (rdma->sc_snd_w_inv) {
- ctxt->sc_send_wr.ex.invalidate_rkey =
- svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch);
- if (ctxt->sc_send_wr.ex.invalidate_rkey)
- ctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV;
+ if (rctxt->rc_inv_rkey) {
+ sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV;
+ sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey;
+ } else {
+ sctxt->sc_send_wr.opcode = IB_WR_SEND;
}
dprintk("svcrdma: posting Send WR with %u sge(s)\n",
- ctxt->sc_send_wr.num_sge);
- return svc_rdma_send(rdma, &ctxt->sc_send_wr);
+ sctxt->sc_send_wr.num_sge);
+ return svc_rdma_send(rdma, &sctxt->sc_send_wr);
}
/* Given the client-provided Write and Reply chunks, the server was not
@@ -741,10 +803,6 @@
return 0;
}
-void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
-{
-}
-
/**
* svc_rdma_sendto - Transmit an RPC reply
* @rqstp: processed RPC request, reply XDR already in ::rq_res
@@ -809,7 +867,7 @@
}
svc_rdma_sync_reply_hdr(rdma, sctxt, svc_rdma_reply_hdr_len(rdma_resp));
- ret = svc_rdma_send_reply_msg(rdma, sctxt, rdma_argp, rqstp,
+ ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp,
wr_lst, rp_ch);
if (ret < 0)
goto err1;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 2848caf..145a361 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -85,7 +85,6 @@
.xpo_release_rqst = svc_rdma_release_rqst,
.xpo_detach = svc_rdma_detach,
.xpo_free = svc_rdma_free,
- .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
.xpo_has_wspace = svc_rdma_has_wspace,
.xpo_accept = svc_rdma_accept,
.xpo_secure_port = svc_rdma_secure_port,
@@ -100,64 +99,6 @@
.xcl_ident = XPRT_TRANSPORT_RDMA,
};
-#if defined(CONFIG_SUNRPC_BACKCHANNEL)
-static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *, struct net *,
- struct sockaddr *, int, int);
-static void svc_rdma_bc_detach(struct svc_xprt *);
-static void svc_rdma_bc_free(struct svc_xprt *);
-
-static const struct svc_xprt_ops svc_rdma_bc_ops = {
- .xpo_create = svc_rdma_bc_create,
- .xpo_detach = svc_rdma_bc_detach,
- .xpo_free = svc_rdma_bc_free,
- .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
- .xpo_secure_port = svc_rdma_secure_port,
-};
-
-struct svc_xprt_class svc_rdma_bc_class = {
- .xcl_name = "rdma-bc",
- .xcl_owner = THIS_MODULE,
- .xcl_ops = &svc_rdma_bc_ops,
- .xcl_max_payload = (1024 - RPCRDMA_HDRLEN_MIN)
-};
-
-static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *serv,
- struct net *net,
- struct sockaddr *sa, int salen,
- int flags)
-{
- struct svcxprt_rdma *cma_xprt;
- struct svc_xprt *xprt;
-
- cma_xprt = svc_rdma_create_xprt(serv, net);
- if (!cma_xprt)
- return ERR_PTR(-ENOMEM);
- xprt = &cma_xprt->sc_xprt;
-
- svc_xprt_init(net, &svc_rdma_bc_class, xprt, serv);
- set_bit(XPT_CONG_CTRL, &xprt->xpt_flags);
- serv->sv_bc_xprt = xprt;
-
- dprintk("svcrdma: %s(%p)\n", __func__, xprt);
- return xprt;
-}
-
-static void svc_rdma_bc_detach(struct svc_xprt *xprt)
-{
- dprintk("svcrdma: %s(%p)\n", __func__, xprt);
-}
-
-static void svc_rdma_bc_free(struct svc_xprt *xprt)
-{
- struct svcxprt_rdma *rdma =
- container_of(xprt, struct svcxprt_rdma, sc_xprt);
-
- dprintk("svcrdma: %s(%p)\n", __func__, xprt);
- if (xprt)
- kfree(rdma);
-}
-#endif /* CONFIG_SUNRPC_BACKCHANNEL */
-
/* QP event handler */
static void qp_event_handler(struct ib_event *event, void *context)
{
@@ -199,14 +140,13 @@
INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts);
- INIT_LIST_HEAD(&cma_xprt->sc_recv_ctxts);
+ init_llist_head(&cma_xprt->sc_recv_ctxts);
INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts);
init_waitqueue_head(&cma_xprt->sc_send_wait);
spin_lock_init(&cma_xprt->sc_lock);
spin_lock_init(&cma_xprt->sc_rq_dto_lock);
spin_lock_init(&cma_xprt->sc_send_lock);
- spin_lock_init(&cma_xprt->sc_recv_lock);
spin_lock_init(&cma_xprt->sc_rw_ctxt_lock);
/*
@@ -270,9 +210,14 @@
/* Save client advertised inbound read limit for use later in accept. */
newxprt->sc_ord = param->initiator_depth;
- /* Set the local and remote addresses in the transport */
sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
+ /* The remote port is arbitrary and not under the control of the
+ * client ULP. Set it to a fixed value so that the DRC continues
+ * to be effective after a reconnect.
+ */
+ rpc_set_port((struct sockaddr *)&newxprt->sc_xprt.xpt_remote, 0);
+
sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa));
@@ -280,9 +225,9 @@
* Enqueue the new transport on the accept queue of the listening
* transport
*/
- spin_lock_bh(&listen_xprt->sc_lock);
+ spin_lock(&listen_xprt->sc_lock);
list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q);
- spin_unlock_bh(&listen_xprt->sc_lock);
+ spin_unlock(&listen_xprt->sc_lock);
set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags);
svc_xprt_enqueue(&listen_xprt->sc_xprt);
@@ -449,13 +394,13 @@
struct ib_qp_init_attr qp_attr;
unsigned int ctxts, rq_depth;
struct ib_device *dev;
- struct sockaddr *sap;
int ret = 0;
+ RPC_IFDEBUG(struct sockaddr *sap);
listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
clear_bit(XPT_CONN, &xprt->xpt_flags);
/* Get the next entry off the accept list */
- spin_lock_bh(&listen_rdma->sc_lock);
+ spin_lock(&listen_rdma->sc_lock);
if (!list_empty(&listen_rdma->sc_accept_q)) {
newxprt = list_entry(listen_rdma->sc_accept_q.next,
struct svcxprt_rdma, sc_accept_q);
@@ -463,7 +408,7 @@
}
if (!list_empty(&listen_rdma->sc_accept_q))
set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags);
- spin_unlock_bh(&listen_rdma->sc_lock);
+ spin_unlock(&listen_rdma->sc_lock);
if (!newxprt)
return NULL;
@@ -475,13 +420,12 @@
/* Qualify the transport resource defaults with the
* capabilities of this particular device */
- newxprt->sc_max_send_sges = dev->attrs.max_send_sge;
- /* transport hdr, head iovec, one page list entry, tail iovec */
- if (newxprt->sc_max_send_sges < 4) {
- pr_err("svcrdma: too few Send SGEs available (%d)\n",
- newxprt->sc_max_send_sges);
- goto errout;
- }
+ /* Transport header, head iovec, tail iovec */
+ newxprt->sc_max_send_sges = 3;
+ /* Add one SGE per page list entry */
+ newxprt->sc_max_send_sges += (svcrdma_max_req_size / PAGE_SIZE) + 1;
+ if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge)
+ newxprt->sc_max_send_sges = dev->attrs.max_send_sge;
newxprt->sc_max_req_size = svcrdma_max_req_size;
newxprt->sc_max_requests = svcrdma_max_requests;
newxprt->sc_max_bc_requests = svcrdma_max_bc_requests;
@@ -509,14 +453,14 @@
dprintk("svcrdma: error creating PD for connect request\n");
goto errout;
}
- newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth,
- 0, IB_POLL_WORKQUEUE);
+ newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, newxprt->sc_sq_depth,
+ IB_POLL_WORKQUEUE);
if (IS_ERR(newxprt->sc_sq_cq)) {
dprintk("svcrdma: error creating SQ CQ for connect request\n");
goto errout;
}
- newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, rq_depth,
- 0, IB_POLL_WORKQUEUE);
+ newxprt->sc_rq_cq =
+ ib_alloc_cq_any(dev, newxprt, rq_depth, IB_POLL_WORKQUEUE);
if (IS_ERR(newxprt->sc_rq_cq)) {
dprintk("svcrdma: error creating RQ CQ for connect request\n");
goto errout;
@@ -585,6 +529,7 @@
if (ret)
goto errout;
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
dprintk("svcrdma: new connection %p accepted:\n", newxprt);
sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap));
@@ -595,6 +540,7 @@
dprintk(" rdma_rw_ctxs : %d\n", ctxts);
dprintk(" max_requests : %d\n", newxprt->sc_max_requests);
dprintk(" ord : %d\n", conn_param.initiator_depth);
+#endif
trace_svcrdma_xprt_accept(&newxprt->sc_xprt);
return &newxprt->sc_xprt;
@@ -648,11 +594,6 @@
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
ib_drain_qp(rdma->sc_qp);
- /* We should only be called from kref_put */
- if (kref_read(&xprt->xpt_ref) != 0)
- pr_err("svcrdma: sc_xprt still in use? (%d)\n",
- kref_read(&xprt->xpt_ref));
-
svc_rdma_flush_recv_queues(rdma);
/* Final put of backchannel client transport */
@@ -688,8 +629,9 @@
{
struct svcxprt_rdma *rdma =
container_of(xprt, struct svcxprt_rdma, sc_xprt);
+
INIT_WORK(&rdma->sc_work, __svc_rdma_free);
- queue_work(svc_rdma_wq, &rdma->sc_work);
+ schedule_work(&rdma->sc_work);
}
static int svc_rdma_has_wspace(struct svc_xprt *xprt)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 98cbc7b..160558b 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -68,9 +68,9 @@
* tunables
*/
-static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
+unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
-static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
+unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR;
int xprt_rdma_pad_optimize;
@@ -80,7 +80,6 @@
static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
static unsigned int min_inline_size = RPCRDMA_MIN_INLINE;
static unsigned int max_inline_size = RPCRDMA_MAX_INLINE;
-static unsigned int zero;
static unsigned int max_padding = PAGE_SIZE;
static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
static unsigned int max_memreg = RPCRDMA_LAST - 1;
@@ -122,7 +121,7 @@
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &max_padding,
},
{
@@ -225,82 +224,70 @@
}
}
-void
-rpcrdma_conn_func(struct rpcrdma_ep *ep)
-{
- schedule_delayed_work(&ep->rep_connect_worker, 0);
-}
-
-void
-rpcrdma_connect_worker(struct work_struct *work)
-{
- struct rpcrdma_ep *ep =
- container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
- struct rpcrdma_xprt *r_xprt =
- container_of(ep, struct rpcrdma_xprt, rx_ep);
- struct rpc_xprt *xprt = &r_xprt->rx_xprt;
-
- spin_lock_bh(&xprt->transport_lock);
- if (ep->rep_connected > 0) {
- if (!xprt_test_and_set_connected(xprt))
- xprt_wake_pending_tasks(xprt, 0);
- } else {
- if (xprt_test_and_clear_connected(xprt))
- xprt_wake_pending_tasks(xprt, -ENOTCONN);
- }
- spin_unlock_bh(&xprt->transport_lock);
-}
-
+/**
+ * xprt_rdma_connect_worker - establish connection in the background
+ * @work: worker thread context
+ *
+ * Requester holds the xprt's send lock to prevent activity on this
+ * transport while a fresh connection is being established. RPC tasks
+ * sleep on the xprt's pending queue waiting for connect to complete.
+ */
static void
xprt_rdma_connect_worker(struct work_struct *work)
{
struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt,
rx_connect_worker.work);
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
- int rc = 0;
-
- xprt_clear_connected(xprt);
+ int rc;
rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
- if (rc)
- xprt_wake_pending_tasks(xprt, rc);
-
xprt_clear_connecting(xprt);
+ if (r_xprt->rx_ep.rep_connected > 0) {
+ if (!xprt_test_and_set_connected(xprt)) {
+ xprt->stat.connect_count++;
+ xprt->stat.connect_time += (long)jiffies -
+ xprt->stat.connect_start;
+ xprt_wake_pending_tasks(xprt, -EAGAIN);
+ }
+ } else {
+ if (xprt_test_and_clear_connected(xprt))
+ xprt_wake_pending_tasks(xprt, rc);
+ }
}
+/**
+ * xprt_rdma_inject_disconnect - inject a connection fault
+ * @xprt: transport context
+ *
+ * If @xprt is connected, disconnect it to simulate spurious connection
+ * loss.
+ */
static void
xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
{
- struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt,
- rx_xprt);
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- trace_xprtrdma_inject_dsc(r_xprt);
+ trace_xprtrdma_op_inject_dsc(r_xprt);
rdma_disconnect(r_xprt->rx_ia.ri_id);
}
-/*
- * xprt_rdma_destroy
+/**
+ * xprt_rdma_destroy - Full tear down of transport
+ * @xprt: doomed transport context
*
- * Destroy the xprt.
- * Free all memory associated with the object, including its own.
- * NOTE: none of the *destroy methods free memory for their top-level
- * objects, even though they may have allocated it (they do free
- * private memory). It's up to the caller to handle it. In this
- * case (RDMA transport), all structure memory is inlined with the
- * struct rpcrdma_xprt.
+ * Caller guarantees there will be no more calls to us with
+ * this @xprt.
*/
static void
xprt_rdma_destroy(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- trace_xprtrdma_destroy(r_xprt);
+ trace_xprtrdma_op_destroy(r_xprt);
cancel_delayed_work_sync(&r_xprt->rx_connect_worker);
- xprt_clear_connected(xprt);
-
- rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
+ rpcrdma_ep_destroy(r_xprt);
rpcrdma_buffer_destroy(&r_xprt->rx_buf);
rpcrdma_ia_close(&r_xprt->rx_ia);
@@ -310,6 +297,7 @@
module_put(THIS_MODULE);
}
+/* 60 second timeout, no retries */
static const struct rpc_timeout xprt_rdma_default_timeout = {
.to_initval = 60 * HZ,
.to_maxval = 60 * HZ,
@@ -323,33 +311,26 @@
static struct rpc_xprt *
xprt_setup_rdma(struct xprt_create *args)
{
- struct rpcrdma_create_data_internal cdata;
struct rpc_xprt *xprt;
struct rpcrdma_xprt *new_xprt;
- struct rpcrdma_ep *new_ep;
struct sockaddr *sap;
int rc;
- if (args->addrlen > sizeof(xprt->addr)) {
- dprintk("RPC: %s: address too large\n", __func__);
+ if (args->addrlen > sizeof(xprt->addr))
return ERR_PTR(-EBADF);
- }
xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0, 0);
- if (xprt == NULL) {
- dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n",
- __func__);
+ if (!xprt)
return ERR_PTR(-ENOMEM);
- }
- /* 60 second timeout, no retries */
xprt->timeout = &xprt_rdma_default_timeout;
+ xprt->connect_timeout = xprt->timeout->to_initval;
+ xprt->max_reconnect_timeout = xprt->timeout->to_maxval;
xprt->bind_timeout = RPCRDMA_BIND_TO;
xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
xprt->resvport = 0; /* privileged port not needed */
- xprt->tsh_size = 0; /* RPC-RDMA handles framing */
xprt->ops = &xprt_rdma_procs;
/*
@@ -367,40 +348,12 @@
xprt_set_bound(xprt);
xprt_rdma_format_addresses(xprt, sap);
- cdata.max_requests = xprt_rdma_slot_table_entries;
-
- cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
- cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
-
- cdata.inline_wsize = xprt_rdma_max_inline_write;
- if (cdata.inline_wsize > cdata.wsize)
- cdata.inline_wsize = cdata.wsize;
-
- cdata.inline_rsize = xprt_rdma_max_inline_read;
- if (cdata.inline_rsize > cdata.rsize)
- cdata.inline_rsize = cdata.rsize;
-
- /*
- * Create new transport instance, which includes initialized
- * o ia
- * o endpoint
- * o buffers
- */
-
new_xprt = rpcx_to_rdmax(xprt);
-
rc = rpcrdma_ia_open(new_xprt);
if (rc)
goto out1;
- /*
- * initialize and create ep
- */
- new_xprt->rx_data = cdata;
- new_ep = &new_xprt->rx_ep;
-
- rc = rpcrdma_ep_create(&new_xprt->rx_ep,
- &new_xprt->rx_ia, &new_xprt->rx_data);
+ rc = rpcrdma_ep_create(new_xprt);
if (rc)
goto out2;
@@ -411,7 +364,7 @@
INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
xprt_rdma_connect_worker);
- xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
+ xprt->max_payload = frwr_maxpages(new_xprt);
if (xprt->max_payload == 0)
goto out4;
xprt->max_payload <<= PAGE_SHIFT;
@@ -431,42 +384,45 @@
rpcrdma_buffer_destroy(&new_xprt->rx_buf);
rc = -ENODEV;
out3:
- rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
+ rpcrdma_ep_destroy(new_xprt);
out2:
rpcrdma_ia_close(&new_xprt->rx_ia);
out1:
- trace_xprtrdma_destroy(new_xprt);
+ trace_xprtrdma_op_destroy(new_xprt);
xprt_rdma_free_addresses(xprt);
xprt_free(xprt);
return ERR_PTR(rc);
}
/**
- * xprt_rdma_close - Close down RDMA connection
- * @xprt: generic transport to be closed
+ * xprt_rdma_close - close a transport connection
+ * @xprt: transport context
*
- * Called during transport shutdown reconnect, or device
- * removal. Caller holds the transport's write lock.
+ * Called during autoclose or device removal.
+ *
+ * Caller holds @xprt's send lock to prevent activity on this
+ * transport while the connection is torn down.
*/
-static void
-xprt_rdma_close(struct rpc_xprt *xprt)
+void xprt_rdma_close(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_ep *ep = &r_xprt->rx_ep;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- dprintk("RPC: %s: closing xprt %p\n", __func__, xprt);
+ might_sleep();
+
+ trace_xprtrdma_op_close(r_xprt);
+
+ /* Prevent marshaling and sending of new requests */
+ xprt_clear_connected(xprt);
if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) {
- xprt_clear_connected(xprt);
rpcrdma_ia_remove(ia);
- return;
+ goto out;
}
+
if (ep->rep_connected == -ENODEV)
return;
- if (ep->rep_connected > 0)
- xprt->reestablish_timeout = 0;
- xprt_disconnect_done(xprt);
rpcrdma_ep_disconnect(ep, ia);
/* Prepare @xprt for the next connection by reinitializing
@@ -474,6 +430,11 @@
*/
r_xprt->rx_buf.rb_credits = 1;
xprt->cwnd = RPC_CWNDSHIFT;
+
+out:
+ xprt->reestablish_timeout = 0;
+ ++xprt->connect_cookie;
+ xprt_disconnect_done(xprt);
}
/**
@@ -525,25 +486,65 @@
xprt_force_disconnect(xprt);
}
+/**
+ * xprt_rdma_set_connect_timeout - set timeouts for establishing a connection
+ * @xprt: controlling transport instance
+ * @connect_timeout: reconnect timeout after client disconnects
+ * @reconnect_timeout: reconnect timeout after server disconnects
+ *
+ */
+static void xprt_rdma_set_connect_timeout(struct rpc_xprt *xprt,
+ unsigned long connect_timeout,
+ unsigned long reconnect_timeout)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+ trace_xprtrdma_op_set_cto(r_xprt, connect_timeout, reconnect_timeout);
+
+ spin_lock(&xprt->transport_lock);
+
+ if (connect_timeout < xprt->connect_timeout) {
+ struct rpc_timeout to;
+ unsigned long initval;
+
+ to = *xprt->timeout;
+ initval = connect_timeout;
+ if (initval < RPCRDMA_INIT_REEST_TO << 1)
+ initval = RPCRDMA_INIT_REEST_TO << 1;
+ to.to_initval = initval;
+ to.to_maxval = initval;
+ r_xprt->rx_timeout = to;
+ xprt->timeout = &r_xprt->rx_timeout;
+ xprt->connect_timeout = connect_timeout;
+ }
+
+ if (reconnect_timeout < xprt->max_reconnect_timeout)
+ xprt->max_reconnect_timeout = reconnect_timeout;
+
+ spin_unlock(&xprt->transport_lock);
+}
+
+/**
+ * xprt_rdma_connect - schedule an attempt to reconnect
+ * @xprt: transport state
+ * @task: RPC scheduler context (unused)
+ *
+ */
static void
xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ unsigned long delay;
+ trace_xprtrdma_op_connect(r_xprt);
+
+ delay = 0;
if (r_xprt->rx_ep.rep_connected != 0) {
- /* Reconnect */
- schedule_delayed_work(&r_xprt->rx_connect_worker,
- xprt->reestablish_timeout);
- xprt->reestablish_timeout <<= 1;
- if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO)
- xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO;
- else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
- xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
- } else {
- schedule_delayed_work(&r_xprt->rx_connect_worker, 0);
- if (!RPC_IS_ASYNC(task))
- flush_delayed_work(&r_xprt->rx_connect_worker);
+ delay = xprt_reconnect_delay(xprt);
+ xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO);
}
+ queue_delayed_work(xprtiod_workqueue, &r_xprt->rx_connect_worker,
+ delay);
}
/**
@@ -569,6 +570,7 @@
return;
out_sleep:
+ set_bit(XPRT_CONGESTED, &xprt->state);
rpc_sleep_on(&xprt->backlog, task, NULL);
task->tk_status = -EAGAIN;
}
@@ -582,57 +584,24 @@
static void
xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst)
{
+ struct rpcrdma_xprt *r_xprt =
+ container_of(xprt, struct rpcrdma_xprt, rx_xprt);
+
memset(rqst, 0, sizeof(*rqst));
- rpcrdma_buffer_put(rpcr_to_rdmar(rqst));
- rpc_wake_up_next(&xprt->backlog);
+ rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
+ if (unlikely(!rpc_wake_up_next(&xprt->backlog)))
+ clear_bit(XPRT_CONGESTED, &xprt->state);
}
-static bool
-rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
- size_t size, gfp_t flags)
+static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_regbuf *rb, size_t size,
+ gfp_t flags)
{
- struct rpcrdma_regbuf *rb;
-
- if (req->rl_sendbuf && rdmab_length(req->rl_sendbuf) >= size)
- return true;
-
- rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags);
- if (IS_ERR(rb))
- return false;
-
- rpcrdma_free_regbuf(req->rl_sendbuf);
- r_xprt->rx_stats.hardway_register_count += size;
- req->rl_sendbuf = rb;
- return true;
-}
-
-/* The rq_rcv_buf is used only if a Reply chunk is necessary.
- * The decision to use a Reply chunk is made later in
- * rpcrdma_marshal_req. This buffer is registered at that time.
- *
- * Otherwise, the associated RPC Reply arrives in a separate
- * Receive buffer, arbitrarily chosen by the HCA. The buffer
- * allocated here for the RPC Reply is not utilized in that
- * case. See rpcrdma_inline_fixup.
- *
- * A regbuf is used here to remember the buffer size.
- */
-static bool
-rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
- size_t size, gfp_t flags)
-{
- struct rpcrdma_regbuf *rb;
-
- if (req->rl_recvbuf && rdmab_length(req->rl_recvbuf) >= size)
- return true;
-
- rb = rpcrdma_alloc_regbuf(size, DMA_NONE, flags);
- if (IS_ERR(rb))
- return false;
-
- rpcrdma_free_regbuf(req->rl_recvbuf);
- r_xprt->rx_stats.hardway_register_count += size;
- req->rl_recvbuf = rb;
+ if (unlikely(rdmab_length(rb) < size)) {
+ if (!rpcrdma_regbuf_realloc(rb, size, flags))
+ return false;
+ r_xprt->rx_stats.hardway_register_count += size;
+ }
return true;
}
@@ -644,13 +613,6 @@
* 0: Success; rq_buffer points to RPC buffer to use
* ENOMEM: Out of memory, call again later
* EIO: A permanent error occurred, do not retry
- *
- * The RDMA allocate/free functions need the task structure as a place
- * to hide the struct rpcrdma_req, which is necessary for the actual
- * send/recv sequence.
- *
- * xprt_rdma_allocate provides buffers that are already mapped for
- * DMA, and a local DMA lkey is provided for each.
*/
static int
xprt_rdma_allocate(struct rpc_task *task)
@@ -664,18 +626,20 @@
if (RPC_IS_SWAPPER(task))
flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
- if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags))
+ if (!rpcrdma_check_regbuf(r_xprt, req->rl_sendbuf, rqst->rq_callsize,
+ flags))
goto out_fail;
- if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags))
+ if (!rpcrdma_check_regbuf(r_xprt, req->rl_recvbuf, rqst->rq_rcvsize,
+ flags))
goto out_fail;
- rqst->rq_buffer = req->rl_sendbuf->rg_base;
- rqst->rq_rbuffer = req->rl_recvbuf->rg_base;
- trace_xprtrdma_allocate(task, req);
+ rqst->rq_buffer = rdmab_data(req->rl_sendbuf);
+ rqst->rq_rbuffer = rdmab_data(req->rl_recvbuf);
+ trace_xprtrdma_op_allocate(task, req);
return 0;
out_fail:
- trace_xprtrdma_allocate(task, NULL);
+ trace_xprtrdma_op_allocate(task, NULL);
return -ENOMEM;
}
@@ -692,14 +656,21 @@
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- if (test_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags))
- rpcrdma_release_rqst(r_xprt, req);
- trace_xprtrdma_rpc_done(task, req);
+ trace_xprtrdma_op_free(task, req);
+
+ if (!list_empty(&req->rl_registered))
+ frwr_unmap_sync(r_xprt, req);
+
+ /* XXX: If the RPC is completing because of a signal and
+ * not because a reply was received, we ought to ensure
+ * that the Send completion has fired, so that memory
+ * involved with the Send is not still visible to the NIC.
+ */
}
/**
* xprt_rdma_send_request - marshal and send an RPC request
- * @task: RPC task with an RPC message in rq_snd_buf
+ * @rqst: RPC message in rq_snd_buf
*
* Caller holds the transport's write lock.
*
@@ -708,13 +679,14 @@
* %-ENOTCONN if the caller should reconnect and call again
* %-EAGAIN if the caller should call again
* %-ENOBUFS if the caller should call again after a delay
- * %-EIO if a permanent error occurred and the request was not
- * sent. Do not try to send this message again.
+ * %-EMSGSIZE if encoding ran out of buffer space. The request
+ * was not sent. Do not try to send this message again.
+ * %-EIO if an I/O error occurred. The request was not sent.
+ * Do not try to send this message again.
*/
static int
-xprt_rdma_send_request(struct rpc_task *task)
+xprt_rdma_send_request(struct rpc_rqst *rqst)
{
- struct rpc_rqst *rqst = task->tk_rqstp;
struct rpc_xprt *xprt = rqst->rq_xprt;
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
@@ -726,7 +698,10 @@
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
if (!xprt_connected(xprt))
- goto drop_connection;
+ return -ENOTCONN;
+
+ if (!xprt_request_get_cong(xprt, rqst))
+ return -EBADSLT;
rc = rpcrdma_marshal_req(r_xprt, rqst);
if (rc < 0)
@@ -737,17 +712,15 @@
goto drop_connection;
rqst->rq_xtime = ktime_get();
- __set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags);
if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
goto drop_connection;
rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len;
- rqst->rq_bytes_sent = 0;
/* An RPC with no reply will throw off credit accounting,
* so drop the connection to reset the credit grant.
*/
- if (!rpc_reply_expected(task))
+ if (!rpc_reply_expected(rqst->rq_task))
goto drop_connection;
return 0;
@@ -755,8 +728,8 @@
if (rc != -ENOTCONN)
return rc;
drop_connection:
- xprt_disconnect_done(xprt);
- return -ENOTCONN; /* implies disconnect */
+ xprt_rdma_close(xprt);
+ return -ENOTCONN;
}
void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
@@ -772,7 +745,7 @@
0, /* need a local port? */
xprt->stat.bind_count,
xprt->stat.connect_count,
- xprt->stat.connect_time,
+ xprt->stat.connect_time / HZ,
idle_time,
xprt->stat.sends,
xprt->stat.recvs,
@@ -792,7 +765,7 @@
r_xprt->rx_stats.bad_reply_count,
r_xprt->rx_stats.nomsg_call_count);
seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n",
- r_xprt->rx_stats.mrs_recovered,
+ r_xprt->rx_stats.mrs_recycled,
r_xprt->rx_stats.mrs_orphaned,
r_xprt->rx_stats.mrs_allocated,
r_xprt->rx_stats.local_inv_needed,
@@ -821,7 +794,7 @@
.alloc_slot = xprt_rdma_alloc_slot,
.free_slot = xprt_rdma_free_slot,
.release_request = xprt_release_rqst_cong, /* ditto */
- .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */
+ .wait_for_reply_request = xprt_wait_for_reply_request_def, /* ditto */
.timer = xprt_rdma_timer,
.rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */
.set_port = xprt_rdma_set_port,
@@ -831,14 +804,15 @@
.send_request = xprt_rdma_send_request,
.close = xprt_rdma_close,
.destroy = xprt_rdma_destroy,
+ .set_connect_timeout = xprt_rdma_set_connect_timeout,
.print_stats = xprt_rdma_print_stats,
.enable_swap = xprt_rdma_enable_swap,
.disable_swap = xprt_rdma_disable_swap,
.inject_disconnect = xprt_rdma_inject_disconnect,
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
.bc_setup = xprt_rdma_bc_setup,
- .bc_up = xprt_rdma_bc_up,
.bc_maxpayload = xprt_rdma_bc_maxpayload,
+ .bc_num_slots = xprt_rdma_bc_max_slots,
.bc_free_rqst = xprt_rdma_bc_free_rqst,
.bc_destroy = xprt_rdma_bc_destroy,
#endif
@@ -854,58 +828,31 @@
void xprt_rdma_cleanup(void)
{
- int rc;
-
- dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
if (sunrpc_table_header) {
unregister_sysctl_table(sunrpc_table_header);
sunrpc_table_header = NULL;
}
#endif
- rc = xprt_unregister_transport(&xprt_rdma);
- if (rc)
- dprintk("RPC: %s: xprt_unregister returned %i\n",
- __func__, rc);
- rpcrdma_destroy_wq();
-
- rc = xprt_unregister_transport(&xprt_rdma_bc);
- if (rc)
- dprintk("RPC: %s: xprt_unregister(bc) returned %i\n",
- __func__, rc);
+ xprt_unregister_transport(&xprt_rdma);
+ xprt_unregister_transport(&xprt_rdma_bc);
}
int xprt_rdma_init(void)
{
int rc;
- rc = rpcrdma_alloc_wq();
+ rc = xprt_register_transport(&xprt_rdma);
if (rc)
return rc;
- rc = xprt_register_transport(&xprt_rdma);
- if (rc) {
- rpcrdma_destroy_wq();
- return rc;
- }
-
rc = xprt_register_transport(&xprt_rdma_bc);
if (rc) {
xprt_unregister_transport(&xprt_rdma);
- rpcrdma_destroy_wq();
return rc;
}
- dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
-
- dprintk("Defaults:\n");
- dprintk("\tSlots %d\n"
- "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
- xprt_rdma_slot_table_entries,
- xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
- dprintk("\tPadding 0\n\tMemreg %d\n", xprt_rdma_memreg_strategy);
-
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
if (!sunrpc_table_header)
sunrpc_table_header = register_sysctl_table(sunrpc_table);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 956a5ea..3a90753 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -53,6 +53,7 @@
#include <linux/slab.h>
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/svc_rdma.h>
+#include <linux/log2.h>
#include <asm-generic/barrier.h>
#include <asm/bitops.h>
@@ -74,56 +75,52 @@
* internal functions
*/
static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc);
+static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf);
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf);
-static int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp);
-static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb);
+static void rpcrdma_mr_free(struct rpcrdma_mr *mr);
+static struct rpcrdma_regbuf *
+rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction,
+ gfp_t flags);
+static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb);
+static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb);
+static void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp);
-struct workqueue_struct *rpcrdma_receive_wq __read_mostly;
-
-int
-rpcrdma_alloc_wq(void)
+/* Wait for outstanding transport work to finish. ib_drain_qp
+ * handles the drains in the wrong order for us, so open code
+ * them here.
+ */
+static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
{
- struct workqueue_struct *recv_wq;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- recv_wq = alloc_workqueue("xprtrdma_receive",
- WQ_MEM_RECLAIM | WQ_HIGHPRI,
- 0);
- if (!recv_wq)
- return -ENOMEM;
+ /* Flush Receives, then wait for deferred Reply work
+ * to complete.
+ */
+ ib_drain_rq(ia->ri_id->qp);
- rpcrdma_receive_wq = recv_wq;
- return 0;
+ /* Deferred Reply processing might have scheduled
+ * local invalidations.
+ */
+ ib_drain_sq(ia->ri_id->qp);
}
-void
-rpcrdma_destroy_wq(void)
-{
- struct workqueue_struct *wq;
-
- if (rpcrdma_receive_wq) {
- wq = rpcrdma_receive_wq;
- rpcrdma_receive_wq = NULL;
- destroy_workqueue(wq);
- }
-}
-
+/**
+ * rpcrdma_qp_event_handler - Handle one QP event (error notification)
+ * @event: details of the event
+ * @context: ep that owns QP where event occurred
+ *
+ * Called from the RDMA provider (device driver) possibly in an interrupt
+ * context.
+ */
static void
-rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
+rpcrdma_qp_event_handler(struct ib_event *event, void *context)
{
struct rpcrdma_ep *ep = context;
struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
rx_ep);
- trace_xprtrdma_qp_error(r_xprt, event);
- pr_err("rpcrdma: %s on device %s ep %p\n",
- ib_event_msg(event->event), event->device->name, context);
-
- if (ep->rep_connected == 1) {
- ep->rep_connected = -EIO;
- rpcrdma_conn_func(ep);
- wake_up_all(&ep->rep_connect_wait);
- }
+ trace_xprtrdma_qp_event(r_xprt, event);
}
/**
@@ -141,11 +138,6 @@
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_send(sc, wc);
- if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR)
- pr_err("rpcrdma: Send: %s (%u/0x%x)\n",
- ib_wc_status_msg(wc->status),
- wc->status, wc->vendor_err);
-
rpcrdma_sendctx_put_locked(sc);
}
@@ -161,11 +153,13 @@
struct ib_cqe *cqe = wc->wr_cqe;
struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
rr_cqe);
+ struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
- /* WARNING: Only wr_id and status are reliable at this point */
+ /* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_receive(wc);
+ --r_xprt->rx_ep.rep_receive_count;
if (wc->status != IB_WC_SUCCESS)
- goto out_fail;
+ goto out_flushed;
/* status == SUCCESS means all fields in wc are trustworthy */
rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len);
@@ -176,24 +170,18 @@
rdmab_addr(rep->rr_rdmabuf),
wc->byte_len, DMA_FROM_DEVICE);
-out_schedule:
+ rpcrdma_post_recvs(r_xprt, false);
rpcrdma_reply_handler(rep);
return;
-out_fail:
- if (wc->status != IB_WC_WR_FLUSH_ERR)
- pr_err("rpcrdma: Recv: %s (%u/0x%x)\n",
- ib_wc_status_msg(wc->status),
- wc->status, wc->vendor_err);
- rpcrdma_set_xdrlen(&rep->rr_hdrbuf, 0);
- goto out_schedule;
+out_flushed:
+ rpcrdma_recv_buffer_put(rep);
}
static void
rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
struct rdma_conn_param *param)
{
- struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
const struct rpcrdma_connect_private *pmsg = param->private_data;
unsigned int rsize, wsize;
@@ -210,89 +198,96 @@
wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
}
- if (rsize < cdata->inline_rsize)
- cdata->inline_rsize = rsize;
- if (wsize < cdata->inline_wsize)
- cdata->inline_wsize = wsize;
- dprintk("RPC: %s: max send %u, max recv %u\n",
- __func__, cdata->inline_wsize, cdata->inline_rsize);
+ if (rsize < r_xprt->rx_ep.rep_inline_recv)
+ r_xprt->rx_ep.rep_inline_recv = rsize;
+ if (wsize < r_xprt->rx_ep.rep_inline_send)
+ r_xprt->rx_ep.rep_inline_send = wsize;
+ dprintk("RPC: %s: max send %u, max recv %u\n", __func__,
+ r_xprt->rx_ep.rep_inline_send,
+ r_xprt->rx_ep.rep_inline_recv);
rpcrdma_set_max_header_sizes(r_xprt);
}
+/**
+ * rpcrdma_cm_event_handler - Handle RDMA CM events
+ * @id: rdma_cm_id on which an event has occurred
+ * @event: details of the event
+ *
+ * Called with @id's mutex held. Returns 1 if caller should
+ * destroy @id, otherwise 0.
+ */
static int
-rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
+rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
- struct rpcrdma_xprt *xprt = id->context;
- struct rpcrdma_ia *ia = &xprt->rx_ia;
- struct rpcrdma_ep *ep = &xprt->rx_ep;
- int connstate = 0;
+ struct rpcrdma_xprt *r_xprt = id->context;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
- trace_xprtrdma_conn_upcall(xprt, event);
+ might_sleep();
+
+ trace_xprtrdma_cm_event(r_xprt, event);
switch (event->event) {
case RDMA_CM_EVENT_ADDR_RESOLVED:
case RDMA_CM_EVENT_ROUTE_RESOLVED:
ia->ri_async_rc = 0;
complete(&ia->ri_done);
- break;
+ return 0;
case RDMA_CM_EVENT_ADDR_ERROR:
ia->ri_async_rc = -EPROTO;
complete(&ia->ri_done);
- break;
+ return 0;
case RDMA_CM_EVENT_ROUTE_ERROR:
ia->ri_async_rc = -ENETUNREACH;
complete(&ia->ri_done);
- break;
+ return 0;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
pr_info("rpcrdma: removing device %s for %s:%s\n",
- ia->ri_device->name,
- rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt));
+ ia->ri_id->device->name,
+ rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt));
#endif
set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
ep->rep_connected = -ENODEV;
- xprt_force_disconnect(&xprt->rx_xprt);
+ xprt_force_disconnect(xprt);
wait_for_completion(&ia->ri_remove_done);
ia->ri_id = NULL;
- ia->ri_device = NULL;
/* Return 1 to ensure the core destroys the id. */
return 1;
case RDMA_CM_EVENT_ESTABLISHED:
- ++xprt->rx_xprt.connect_cookie;
- connstate = 1;
- rpcrdma_update_connect_private(xprt, &event->param.conn);
- goto connected;
+ ++xprt->connect_cookie;
+ ep->rep_connected = 1;
+ rpcrdma_update_connect_private(r_xprt, &event->param.conn);
+ wake_up_all(&ep->rep_connect_wait);
+ break;
case RDMA_CM_EVENT_CONNECT_ERROR:
- connstate = -ENOTCONN;
- goto connected;
+ ep->rep_connected = -ENOTCONN;
+ goto disconnected;
case RDMA_CM_EVENT_UNREACHABLE:
- connstate = -ENETUNREACH;
- goto connected;
+ ep->rep_connected = -ENETUNREACH;
+ goto disconnected;
case RDMA_CM_EVENT_REJECTED:
dprintk("rpcrdma: connection to %s:%s rejected: %s\n",
- rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt),
+ rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
rdma_reject_msg(id, event->status));
- connstate = -ECONNREFUSED;
+ ep->rep_connected = -ECONNREFUSED;
if (event->status == IB_CM_REJ_STALE_CONN)
- connstate = -EAGAIN;
- goto connected;
+ ep->rep_connected = -EAGAIN;
+ goto disconnected;
case RDMA_CM_EVENT_DISCONNECTED:
- ++xprt->rx_xprt.connect_cookie;
- connstate = -ECONNABORTED;
-connected:
- ep->rep_connected = connstate;
- rpcrdma_conn_func(ep);
+ ep->rep_connected = -ECONNABORTED;
+disconnected:
+ xprt_force_disconnect(xprt);
wake_up_all(&ep->rep_connect_wait);
- /*FALLTHROUGH*/
+ break;
default:
- dprintk("RPC: %s: %s:%s on %s/%s (ep 0x%p): %s\n",
- __func__,
- rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt),
- ia->ri_device->name, ia->ri_ops->ro_displayname,
- ep, rdma_event_msg(event->event));
break;
}
+ dprintk("RPC: %s: %s:%s on %s/frwr: %s\n", __func__,
+ rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
+ ia->ri_id->device->name, rdma_event_msg(event->event));
return 0;
}
@@ -308,24 +303,17 @@
init_completion(&ia->ri_done);
init_completion(&ia->ri_remove_done);
- id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_conn_upcall,
+ id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler,
xprt, RDMA_PS_TCP, IB_QPT_RC);
- if (IS_ERR(id)) {
- rc = PTR_ERR(id);
- dprintk("RPC: %s: rdma_create_id() failed %i\n",
- __func__, rc);
+ if (IS_ERR(id))
return id;
- }
ia->ri_async_rc = -ETIMEDOUT;
rc = rdma_resolve_addr(id, NULL,
(struct sockaddr *)&xprt->rx_xprt.addr,
RDMA_RESOLVE_TIMEOUT);
- if (rc) {
- dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
- __func__, rc);
+ if (rc)
goto out;
- }
rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
if (rc < 0) {
trace_xprtrdma_conn_tout(xprt);
@@ -338,11 +326,8 @@
ia->ri_async_rc = -ETIMEDOUT;
rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
- if (rc) {
- dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
- __func__, rc);
+ if (rc)
goto out;
- }
rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
if (rc < 0) {
trace_xprtrdma_conn_tout(xprt);
@@ -381,9 +366,8 @@
rc = PTR_ERR(ia->ri_id);
goto out_err;
}
- ia->ri_device = ia->ri_id->device;
- ia->ri_pd = ib_alloc_pd(ia->ri_device, 0);
+ ia->ri_pd = ib_alloc_pd(ia->ri_id->device, 0);
if (IS_ERR(ia->ri_pd)) {
rc = PTR_ERR(ia->ri_pd);
pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
@@ -392,20 +376,12 @@
switch (xprt_rdma_memreg_strategy) {
case RPCRDMA_FRWR:
- if (frwr_is_supported(ia)) {
- ia->ri_ops = &rpcrdma_frwr_memreg_ops;
+ if (frwr_is_supported(ia->ri_id->device))
break;
- }
- /*FALLTHROUGH*/
- case RPCRDMA_MTHCAFMR:
- if (fmr_is_supported(ia)) {
- ia->ri_ops = &rpcrdma_fmr_memreg_ops;
- break;
- }
/*FALLTHROUGH*/
default:
pr_err("rpcrdma: Device %s does not support memreg mode %d\n",
- ia->ri_device->name, xprt_rdma_memreg_strategy);
+ ia->ri_id->device->name, xprt_rdma_memreg_strategy);
rc = -EINVAL;
goto out_err;
}
@@ -432,9 +408,8 @@
struct rpcrdma_ep *ep = &r_xprt->rx_ep;
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_req *req;
- struct rpcrdma_rep *rep;
- cancel_delayed_work_sync(&buf->rb_refresh_worker);
+ cancel_work_sync(&buf->rb_refresh_worker);
/* This is similar to rpcrdma_ep_destroy, but:
* - Don't cancel the connect worker.
@@ -444,7 +419,7 @@
* connection is already gone.
*/
if (ia->ri_id->qp) {
- ib_drain_qp(ia->ri_id->qp);
+ rpcrdma_xprt_drain(r_xprt);
rdma_destroy_qp(ia->ri_id);
ia->ri_id->qp = NULL;
}
@@ -456,12 +431,11 @@
/* The ULP is responsible for ensuring all DMA
* mappings and MRs are gone.
*/
- list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list)
- rpcrdma_dma_unmap_regbuf(rep->rr_rdmabuf);
+ rpcrdma_reps_destroy(buf);
list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
- rpcrdma_dma_unmap_regbuf(req->rl_rdmabuf);
- rpcrdma_dma_unmap_regbuf(req->rl_sendbuf);
- rpcrdma_dma_unmap_regbuf(req->rl_recvbuf);
+ rpcrdma_regbuf_dma_unmap(req->rl_rdmabuf);
+ rpcrdma_regbuf_dma_unmap(req->rl_sendbuf);
+ rpcrdma_regbuf_dma_unmap(req->rl_recvbuf);
}
rpcrdma_mrs_destroy(buf);
ib_dealloc_pd(ia->ri_pd);
@@ -487,7 +461,6 @@
rdma_destroy_id(ia->ri_id);
}
ia->ri_id = NULL;
- ia->ri_device = NULL;
/* If the pd is still busy, xprtrdma missed freeing a resource */
if (ia->ri_pd && !IS_ERR(ia->ri_pd))
@@ -495,19 +468,26 @@
ia->ri_pd = NULL;
}
-/*
- * Create unconnected endpoint.
+/**
+ * rpcrdma_ep_create - Create unconnected endpoint
+ * @r_xprt: transport to instantiate
+ *
+ * Returns zero on success, or a negative errno.
*/
-int
-rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
- struct rpcrdma_create_data_internal *cdata)
+int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
{
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
struct ib_cq *sendcq, *recvcq;
unsigned int max_sge;
int rc;
- max_sge = min_t(unsigned int, ia->ri_device->attrs.max_send_sge,
+ ep->rep_max_requests = xprt_rdma_slot_table_entries;
+ ep->rep_inline_send = xprt_rdma_max_inline_write;
+ ep->rep_inline_recv = xprt_rdma_max_inline_read;
+
+ max_sge = min_t(unsigned int, ia->ri_id->device->attrs.max_send_sge,
RPCRDMA_MAX_SEND_SGES);
if (max_sge < RPCRDMA_MIN_SEND_SGES) {
pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge);
@@ -515,11 +495,11 @@
}
ia->ri_max_send_sges = max_sge;
- rc = ia->ri_ops->ro_open(ia, ep, cdata);
+ rc = frwr_open(ia, ep);
if (rc)
return rc;
- ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
+ ep->rep_attr.event_handler = rpcrdma_qp_event_handler;
ep->rep_attr.qp_context = ep;
ep->rep_attr.srq = NULL;
ep->rep_attr.cap.max_send_sge = max_sge;
@@ -537,30 +517,24 @@
ep->rep_attr.cap.max_send_sge,
ep->rep_attr.cap.max_recv_sge);
- /* set trigger for requesting send completion */
- ep->rep_send_batch = min_t(unsigned int, RPCRDMA_MAX_SEND_BATCH,
- cdata->max_requests >> 2);
+ ep->rep_send_batch = ep->rep_max_requests >> 3;
ep->rep_send_count = ep->rep_send_batch;
init_waitqueue_head(&ep->rep_connect_wait);
- INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
+ ep->rep_receive_count = 0;
- sendcq = ib_alloc_cq(ia->ri_device, NULL,
- ep->rep_attr.cap.max_send_wr + 1,
- 1, IB_POLL_WORKQUEUE);
+ sendcq = ib_alloc_cq_any(ia->ri_id->device, NULL,
+ ep->rep_attr.cap.max_send_wr + 1,
+ IB_POLL_WORKQUEUE);
if (IS_ERR(sendcq)) {
rc = PTR_ERR(sendcq);
- dprintk("RPC: %s: failed to create send CQ: %i\n",
- __func__, rc);
goto out1;
}
- recvcq = ib_alloc_cq(ia->ri_device, NULL,
- ep->rep_attr.cap.max_recv_wr + 1,
- 0, IB_POLL_WORKQUEUE);
+ recvcq = ib_alloc_cq_any(ia->ri_id->device, NULL,
+ ep->rep_attr.cap.max_recv_wr + 1,
+ IB_POLL_WORKQUEUE);
if (IS_ERR(recvcq)) {
rc = PTR_ERR(recvcq);
- dprintk("RPC: %s: failed to create recv CQ: %i\n",
- __func__, rc);
goto out2;
}
@@ -573,16 +547,16 @@
/* Prepare RDMA-CM private message */
pmsg->cp_magic = rpcrdma_cmp_magic;
pmsg->cp_version = RPCRDMA_CMP_VERSION;
- pmsg->cp_flags |= ia->ri_ops->ro_send_w_inv_ok;
- pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize);
- pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize);
+ pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK;
+ pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->rep_inline_send);
+ pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->rep_inline_recv);
ep->rep_remote_cma.private_data = pmsg;
ep->rep_remote_cma.private_data_len = sizeof(*pmsg);
/* Client offers RDMA Read but does not initiate */
ep->rep_remote_cma.initiator_depth = 0;
ep->rep_remote_cma.responder_resources =
- min_t(int, U8_MAX, ia->ri_device->attrs.max_qp_rd_atom);
+ min_t(int, U8_MAX, ia->ri_id->device->attrs.max_qp_rd_atom);
/* Limit transport retries so client can detect server
* GID changes quickly. RPC layer handles re-establishing
@@ -605,17 +579,15 @@
return rc;
}
-/*
- * rpcrdma_ep_destroy
+/**
+ * rpcrdma_ep_destroy - Disconnect and destroy endpoint.
+ * @r_xprt: transport instance to shut down
*
- * Disconnect and destroy endpoint. After this, the only
- * valid operations on the ep are to free it (if dynamically
- * allocated) or re-create it.
*/
-void
-rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt)
{
- cancel_delayed_work_sync(&ep->rep_connect_worker);
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
if (ia->ri_id && ia->ri_id->qp) {
rpcrdma_ep_disconnect(ep, ia);
@@ -633,10 +605,10 @@
* Unlike a normal reconnection, a fresh PD and a new set
* of MRs and buffers is needed.
*/
-static int
-rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
- struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
+ struct ib_qp_init_attr *qp_init_attr)
{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
int rc, err;
trace_xprtrdma_reinsert(r_xprt);
@@ -646,14 +618,14 @@
goto out1;
rc = -ENOMEM;
- err = rpcrdma_ep_create(ep, ia, &r_xprt->rx_data);
+ err = rpcrdma_ep_create(r_xprt);
if (err) {
pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err);
goto out2;
}
rc = -ENETUNREACH;
- err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+ err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr);
if (err) {
pr_err("rpcrdma: rdma_create_qp returned %d\n", err);
goto out3;
@@ -663,23 +635,23 @@
return 0;
out3:
- rpcrdma_ep_destroy(ep, ia);
+ rpcrdma_ep_destroy(r_xprt);
out2:
rpcrdma_ia_close(ia);
out1:
return rc;
}
-static int
-rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep,
- struct rpcrdma_ia *ia)
+static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt,
+ struct ib_qp_init_attr *qp_init_attr)
{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rdma_cm_id *id, *old;
int err, rc;
trace_xprtrdma_reconnect(r_xprt);
- rpcrdma_ep_disconnect(ep, ia);
+ rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia);
rc = -EHOSTUNREACH;
id = rpcrdma_create_id(r_xprt, ia);
@@ -696,17 +668,14 @@
*/
old = id;
rc = -ENETUNREACH;
- if (ia->ri_device != id->device) {
+ if (ia->ri_id->device != id->device) {
pr_err("rpcrdma: can't reconnect on different device!\n");
goto out_destroy;
}
- err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
- if (err) {
- dprintk("RPC: %s: rdma_create_qp returned %d\n",
- __func__, err);
+ err = rdma_create_qp(id, ia->ri_pd, qp_init_attr);
+ if (err)
goto out_destroy;
- }
/* Atomically replace the transport's ID and QP. */
rc = 0;
@@ -728,41 +697,43 @@
{
struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
rx_ia);
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+ struct ib_qp_init_attr qp_init_attr;
int rc;
retry:
+ memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr));
switch (ep->rep_connected) {
case 0:
dprintk("RPC: %s: connecting...\n", __func__);
- rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+ rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr);
if (rc) {
- dprintk("RPC: %s: rdma_create_qp failed %i\n",
- __func__, rc);
rc = -ENETUNREACH;
goto out_noupdate;
}
break;
case -ENODEV:
- rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia);
+ rc = rpcrdma_ep_recreate_xprt(r_xprt, &qp_init_attr);
if (rc)
goto out_noupdate;
break;
default:
- rc = rpcrdma_ep_reconnect(r_xprt, ep, ia);
+ rc = rpcrdma_ep_reconnect(r_xprt, &qp_init_attr);
if (rc)
goto out;
}
ep->rep_connected = 0;
+ xprt_clear_connected(xprt);
+
rpcrdma_post_recvs(r_xprt, true);
rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
- if (rc) {
- dprintk("RPC: %s: rdma_connect() failed with %i\n",
- __func__, rc);
+ if (rc)
goto out;
- }
+ if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
+ xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
if (ep->rep_connected <= 0) {
if (ep->rep_connected == -EAGAIN)
@@ -781,8 +752,10 @@
return rc;
}
-/*
- * rpcrdma_ep_disconnect
+/**
+ * rpcrdma_ep_disconnect - Disconnect underlying transport
+ * @ep: endpoint to disconnect
+ * @ia: associated interface adapter
*
* This is separate from destroy to facilitate the ability
* to reconnect without recreating the endpoint.
@@ -793,19 +766,20 @@
void
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
+ struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
+ rx_ep);
int rc;
+ /* returns without wait if ID is not connected */
rc = rdma_disconnect(ia->ri_id);
if (!rc)
- /* returns without wait if not connected */
wait_event_interruptible(ep->rep_connect_wait,
ep->rep_connected != 1);
else
ep->rep_connected = rc;
- trace_xprtrdma_disconnect(container_of(ep, struct rpcrdma_xprt,
- rx_ep), rc);
+ trace_xprtrdma_disconnect(r_xprt, rc);
- ib_drain_qp(ia->ri_id->qp);
+ rpcrdma_xprt_drain(r_xprt);
}
/* Fixed-size circular FIFO queue. This implementation is wait-free and
@@ -822,8 +796,8 @@
*/
/* rpcrdma_sendctxs_destroy() assumes caller has already quiesced
- * queue activity, and ib_drain_qp has flushed all remaining Send
- * requests.
+ * queue activity, and rpcrdma_xprt_drain has flushed all remaining
+ * Send requests.
*/
static void rpcrdma_sendctxs_destroy(struct rpcrdma_buffer *buf)
{
@@ -838,8 +812,7 @@
{
struct rpcrdma_sendctx *sc;
- sc = kzalloc(sizeof(*sc) +
- ia->ri_max_send_sges * sizeof(struct ib_sge),
+ sc = kzalloc(struct_size(sc, sc_sges, ia->ri_max_send_sges),
GFP_KERNEL);
if (!sc)
return NULL;
@@ -872,18 +845,13 @@
for (i = 0; i <= buf->rb_sc_last; i++) {
sc = rpcrdma_sendctx_create(&r_xprt->rx_ia);
if (!sc)
- goto out_destroy;
+ return -ENOMEM;
sc->sc_xprt = r_xprt;
buf->rb_sc_ctxs[i] = sc;
}
- buf->rb_flags = 0;
return 0;
-
-out_destroy:
- rpcrdma_sendctxs_destroy(buf);
- return -ENOMEM;
}
/* The sendctx queue is not guaranteed to have a size that is a
@@ -898,20 +866,20 @@
/**
* rpcrdma_sendctx_get_locked - Acquire a send context
- * @buf: transport buffers from which to acquire an unused context
+ * @r_xprt: controlling transport instance
*
* Returns pointer to a free send completion context; or NULL if
* the queue is empty.
*
* Usage: Called to acquire an SGE array before preparing a Send WR.
*
- * The caller serializes calls to this function (per rpcrdma_buffer),
- * and provides an effective memory barrier that flushes the new value
+ * The caller serializes calls to this function (per transport), and
+ * provides an effective memory barrier that flushes the new value
* of rb_sc_head.
*/
-struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf)
+struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_xprt *r_xprt;
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_sendctx *sc;
unsigned long next_head;
@@ -935,8 +903,7 @@
* completions recently. This is a sign the Send Queue is
* backing up. Cause the caller to pause and try again.
*/
- set_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags);
- r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf);
+ xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
r_xprt->rx_stats.empty_sendctx_q++;
return NULL;
}
@@ -948,7 +915,7 @@
* Usage: Called from Send completion to return a sendctxt
* to the queue.
*
- * The caller serializes calls to this function (per rpcrdma_buffer).
+ * The caller serializes calls to this function (per transport).
*/
static void
rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc)
@@ -956,7 +923,7 @@
struct rpcrdma_buffer *buf = &sc->sc_xprt->rx_buf;
unsigned long next_tail;
- /* Unmap SGEs of previously completed by unsignaled
+ /* Unmap SGEs of previously completed but unsignaled
* Sends by walking up the queue until @sc is found.
*/
next_tail = buf->rb_sc_tail;
@@ -964,50 +931,14 @@
next_tail = rpcrdma_sendctx_next(buf, next_tail);
/* ORDER: item must be accessed _before_ tail is updated */
- rpcrdma_unmap_sendctx(buf->rb_sc_ctxs[next_tail]);
+ rpcrdma_sendctx_unmap(buf->rb_sc_ctxs[next_tail]);
} while (buf->rb_sc_ctxs[next_tail] != sc);
/* Paired with READ_ONCE */
smp_store_release(&buf->rb_sc_tail, next_tail);
- if (test_and_clear_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags)) {
- smp_mb__after_atomic();
- xprt_write_space(&sc->sc_xprt->rx_xprt);
- }
-}
-
-static void
-rpcrdma_mr_recovery_worker(struct work_struct *work)
-{
- struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
- rb_recovery_worker.work);
- struct rpcrdma_mr *mr;
-
- spin_lock(&buf->rb_recovery_lock);
- while (!list_empty(&buf->rb_stale_mrs)) {
- mr = rpcrdma_mr_pop(&buf->rb_stale_mrs);
- spin_unlock(&buf->rb_recovery_lock);
-
- trace_xprtrdma_recover_mr(mr);
- mr->mr_xprt->rx_ia.ri_ops->ro_recover_mr(mr);
-
- spin_lock(&buf->rb_recovery_lock);
- }
- spin_unlock(&buf->rb_recovery_lock);
-}
-
-void
-rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr)
-{
- struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-
- spin_lock(&buf->rb_recovery_lock);
- rpcrdma_mr_push(mr, &buf->rb_stale_mrs);
- spin_unlock(&buf->rb_recovery_lock);
-
- schedule_delayed_work(&buf->rb_recovery_worker, 0);
+ xprt_write_space(&sc->sc_xprt->rx_xprt);
}
static void
@@ -1016,18 +947,16 @@
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
unsigned int count;
- LIST_HEAD(free);
- LIST_HEAD(all);
- for (count = 0; count < 3; count++) {
+ for (count = 0; count < ia->ri_max_segs; count++) {
struct rpcrdma_mr *mr;
int rc;
- mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ mr = kzalloc(sizeof(*mr), GFP_NOFS);
if (!mr)
break;
- rc = ia->ri_ops->ro_init_mr(ia, mr);
+ rc = frwr_init_mr(ia, mr);
if (rc) {
kfree(mr);
break;
@@ -1035,143 +964,185 @@
mr->mr_xprt = r_xprt;
- list_add(&mr->mr_list, &free);
- list_add(&mr->mr_all, &all);
+ spin_lock(&buf->rb_lock);
+ list_add(&mr->mr_list, &buf->rb_mrs);
+ list_add(&mr->mr_all, &buf->rb_all_mrs);
+ spin_unlock(&buf->rb_lock);
}
- spin_lock(&buf->rb_mrlock);
- list_splice(&free, &buf->rb_mrs);
- list_splice(&all, &buf->rb_all);
r_xprt->rx_stats.mrs_allocated += count;
- spin_unlock(&buf->rb_mrlock);
trace_xprtrdma_createmrs(r_xprt, count);
-
- xprt_write_space(&r_xprt->rx_xprt);
}
static void
rpcrdma_mr_refresh_worker(struct work_struct *work)
{
struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
- rb_refresh_worker.work);
+ rb_refresh_worker);
struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
rx_buf);
rpcrdma_mrs_create(r_xprt);
+ xprt_write_space(&r_xprt->rx_xprt);
}
-struct rpcrdma_req *
-rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
+/**
+ * rpcrdma_req_create - Allocate an rpcrdma_req object
+ * @r_xprt: controlling r_xprt
+ * @size: initial size, in bytes, of send and receive buffers
+ * @flags: GFP flags passed to memory allocators
+ *
+ * Returns an allocated and fully initialized rpcrdma_req or NULL.
+ */
+struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size,
+ gfp_t flags)
{
struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
struct rpcrdma_regbuf *rb;
struct rpcrdma_req *req;
+ size_t maxhdrsize;
- req = kzalloc(sizeof(*req), GFP_KERNEL);
+ req = kzalloc(sizeof(*req), flags);
if (req == NULL)
- return ERR_PTR(-ENOMEM);
+ goto out1;
- rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE,
- DMA_TO_DEVICE, GFP_KERNEL);
- if (IS_ERR(rb)) {
- kfree(req);
- return ERR_PTR(-ENOMEM);
- }
+ /* Compute maximum header buffer size in bytes */
+ maxhdrsize = rpcrdma_fixed_maxsz + 3 +
+ r_xprt->rx_ia.ri_max_segs * rpcrdma_readchunk_maxsz;
+ maxhdrsize *= sizeof(__be32);
+ rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize),
+ DMA_TO_DEVICE, flags);
+ if (!rb)
+ goto out2;
req->rl_rdmabuf = rb;
- xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb));
- req->rl_buffer = buffer;
- INIT_LIST_HEAD(&req->rl_registered);
+ xdr_buf_init(&req->rl_hdrbuf, rdmab_data(rb), rdmab_length(rb));
- spin_lock(&buffer->rb_reqslock);
+ req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE, flags);
+ if (!req->rl_sendbuf)
+ goto out3;
+
+ req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE, flags);
+ if (!req->rl_recvbuf)
+ goto out4;
+
+ INIT_LIST_HEAD(&req->rl_free_mrs);
+ INIT_LIST_HEAD(&req->rl_registered);
+ spin_lock(&buffer->rb_lock);
list_add(&req->rl_all, &buffer->rb_allreqs);
- spin_unlock(&buffer->rb_reqslock);
+ spin_unlock(&buffer->rb_lock);
return req;
+
+out4:
+ kfree(req->rl_sendbuf);
+out3:
+ kfree(req->rl_rdmabuf);
+out2:
+ kfree(req);
+out1:
+ return NULL;
}
-static int
-rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp)
+static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
+ bool temp)
{
- struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_rep *rep;
- int rc;
- rc = -ENOMEM;
rep = kzalloc(sizeof(*rep), GFP_KERNEL);
if (rep == NULL)
goto out;
- rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize,
+ rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep.rep_inline_recv,
DMA_FROM_DEVICE, GFP_KERNEL);
- if (IS_ERR(rep->rr_rdmabuf)) {
- rc = PTR_ERR(rep->rr_rdmabuf);
+ if (!rep->rr_rdmabuf)
goto out_free;
- }
- xdr_buf_init(&rep->rr_hdrbuf, rep->rr_rdmabuf->rg_base,
- rdmab_length(rep->rr_rdmabuf));
+ xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf),
+ rdmab_length(rep->rr_rdmabuf));
rep->rr_cqe.done = rpcrdma_wc_receive;
rep->rr_rxprt = r_xprt;
- INIT_WORK(&rep->rr_work, rpcrdma_deferred_completion);
rep->rr_recv_wr.next = NULL;
rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
rep->rr_recv_wr.num_sge = 1;
rep->rr_temp = temp;
-
- spin_lock(&buf->rb_lock);
- list_add(&rep->rr_list, &buf->rb_recv_bufs);
- spin_unlock(&buf->rb_lock);
- return 0;
+ return rep;
out_free:
kfree(rep);
out:
- dprintk("RPC: %s: reply buffer %d alloc failed\n",
- __func__, rc);
- return rc;
+ return NULL;
}
-int
-rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
+static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
+{
+ rpcrdma_regbuf_free(rep->rr_rdmabuf);
+ kfree(rep);
+}
+
+static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf)
+{
+ struct llist_node *node;
+
+ /* Calls to llist_del_first are required to be serialized */
+ node = llist_del_first(&buf->rb_free_reps);
+ if (!node)
+ return NULL;
+ return llist_entry(node, struct rpcrdma_rep, rr_node);
+}
+
+static void rpcrdma_rep_put(struct rpcrdma_buffer *buf,
+ struct rpcrdma_rep *rep)
+{
+ if (!rep->rr_temp)
+ llist_add(&rep->rr_node, &buf->rb_free_reps);
+ else
+ rpcrdma_rep_destroy(rep);
+}
+
+static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_rep *rep;
+
+ while ((rep = rpcrdma_rep_get_locked(buf)) != NULL)
+ rpcrdma_rep_destroy(rep);
+}
+
+/**
+ * rpcrdma_buffer_create - Create initial set of req/rep objects
+ * @r_xprt: transport instance to (re)initialize
+ *
+ * Returns zero on success, otherwise a negative errno.
+ */
+int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
int i, rc;
- buf->rb_max_requests = r_xprt->rx_data.max_requests;
+ buf->rb_max_requests = r_xprt->rx_ep.rep_max_requests;
buf->rb_bc_srv_max_requests = 0;
- spin_lock_init(&buf->rb_mrlock);
spin_lock_init(&buf->rb_lock);
- spin_lock_init(&buf->rb_recovery_lock);
INIT_LIST_HEAD(&buf->rb_mrs);
- INIT_LIST_HEAD(&buf->rb_all);
- INIT_LIST_HEAD(&buf->rb_stale_mrs);
- INIT_DELAYED_WORK(&buf->rb_refresh_worker,
- rpcrdma_mr_refresh_worker);
- INIT_DELAYED_WORK(&buf->rb_recovery_worker,
- rpcrdma_mr_recovery_worker);
+ INIT_LIST_HEAD(&buf->rb_all_mrs);
+ INIT_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker);
rpcrdma_mrs_create(r_xprt);
INIT_LIST_HEAD(&buf->rb_send_bufs);
INIT_LIST_HEAD(&buf->rb_allreqs);
- spin_lock_init(&buf->rb_reqslock);
+
+ rc = -ENOMEM;
for (i = 0; i < buf->rb_max_requests; i++) {
struct rpcrdma_req *req;
- req = rpcrdma_create_req(r_xprt);
- if (IS_ERR(req)) {
- dprintk("RPC: %s: request buffer %d alloc"
- " failed\n", __func__, i);
- rc = PTR_ERR(req);
+ req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE,
+ GFP_KERNEL);
+ if (!req)
goto out;
- }
list_add(&req->rl_list, &buf->rb_send_bufs);
}
buf->rb_credits = 1;
- buf->rb_posted_receives = 0;
- INIT_LIST_HEAD(&buf->rb_recv_bufs);
+ init_llist_head(&buf->rb_free_reps);
rc = rpcrdma_sendctxs_create(r_xprt);
if (rc)
@@ -1183,19 +1154,23 @@
return rc;
}
-static void
-rpcrdma_destroy_rep(struct rpcrdma_rep *rep)
+/**
+ * rpcrdma_req_destroy - Destroy an rpcrdma_req object
+ * @req: unused object to be destroyed
+ *
+ * This function assumes that the caller prevents concurrent device
+ * unload and transport tear-down.
+ */
+void rpcrdma_req_destroy(struct rpcrdma_req *req)
{
- rpcrdma_free_regbuf(rep->rr_rdmabuf);
- kfree(rep);
-}
+ list_del(&req->rl_all);
-void
-rpcrdma_destroy_req(struct rpcrdma_req *req)
-{
- rpcrdma_free_regbuf(req->rl_recvbuf);
- rpcrdma_free_regbuf(req->rl_sendbuf);
- rpcrdma_free_regbuf(req->rl_rdmabuf);
+ while (!list_empty(&req->rl_free_mrs))
+ rpcrdma_mr_free(rpcrdma_mr_pop(&req->rl_free_mrs));
+
+ rpcrdma_regbuf_free(req->rl_recvbuf);
+ rpcrdma_regbuf_free(req->rl_sendbuf);
+ rpcrdma_regbuf_free(req->rl_rdmabuf);
kfree(req);
}
@@ -1204,62 +1179,49 @@
{
struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
rx_buf);
- struct rpcrdma_ia *ia = rdmab_to_ia(buf);
struct rpcrdma_mr *mr;
unsigned int count;
count = 0;
- spin_lock(&buf->rb_mrlock);
- while (!list_empty(&buf->rb_all)) {
- mr = list_entry(buf->rb_all.next, struct rpcrdma_mr, mr_all);
+ spin_lock(&buf->rb_lock);
+ while ((mr = list_first_entry_or_null(&buf->rb_all_mrs,
+ struct rpcrdma_mr,
+ mr_all)) != NULL) {
list_del(&mr->mr_all);
+ spin_unlock(&buf->rb_lock);
- spin_unlock(&buf->rb_mrlock);
-
- /* Ensure MW is not on any rl_registered list */
- if (!list_empty(&mr->mr_list))
- list_del(&mr->mr_list);
-
- ia->ri_ops->ro_release_mr(mr);
+ frwr_release_mr(mr);
count++;
- spin_lock(&buf->rb_mrlock);
+ spin_lock(&buf->rb_lock);
}
- spin_unlock(&buf->rb_mrlock);
+ spin_unlock(&buf->rb_lock);
r_xprt->rx_stats.mrs_allocated = 0;
-
- dprintk("RPC: %s: released %u MRs\n", __func__, count);
}
+/**
+ * rpcrdma_buffer_destroy - Release all hw resources
+ * @buf: root control block for resources
+ *
+ * ORDERING: relies on a prior rpcrdma_xprt_drain :
+ * - No more Send or Receive completions can occur
+ * - All MRs, reps, and reqs are returned to their free lists
+ */
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
- cancel_delayed_work_sync(&buf->rb_recovery_worker);
- cancel_delayed_work_sync(&buf->rb_refresh_worker);
+ cancel_work_sync(&buf->rb_refresh_worker);
rpcrdma_sendctxs_destroy(buf);
+ rpcrdma_reps_destroy(buf);
- while (!list_empty(&buf->rb_recv_bufs)) {
- struct rpcrdma_rep *rep;
-
- rep = list_first_entry(&buf->rb_recv_bufs,
- struct rpcrdma_rep, rr_list);
- list_del(&rep->rr_list);
- rpcrdma_destroy_rep(rep);
- }
-
- spin_lock(&buf->rb_reqslock);
- while (!list_empty(&buf->rb_allreqs)) {
+ while (!list_empty(&buf->rb_send_bufs)) {
struct rpcrdma_req *req;
- req = list_first_entry(&buf->rb_allreqs,
- struct rpcrdma_req, rl_all);
- list_del(&req->rl_all);
-
- spin_unlock(&buf->rb_reqslock);
- rpcrdma_destroy_req(req);
- spin_lock(&buf->rb_reqslock);
+ req = list_first_entry(&buf->rb_send_bufs,
+ struct rpcrdma_req, rl_list);
+ list_del(&req->rl_list);
+ rpcrdma_req_destroy(req);
}
- spin_unlock(&buf->rb_reqslock);
rpcrdma_mrs_destroy(buf);
}
@@ -1275,61 +1237,42 @@
rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_mr *mr = NULL;
+ struct rpcrdma_mr *mr;
- spin_lock(&buf->rb_mrlock);
- if (!list_empty(&buf->rb_mrs))
- mr = rpcrdma_mr_pop(&buf->rb_mrs);
- spin_unlock(&buf->rb_mrlock);
-
- if (!mr)
- goto out_nomrs;
+ spin_lock(&buf->rb_lock);
+ mr = rpcrdma_mr_pop(&buf->rb_mrs);
+ spin_unlock(&buf->rb_lock);
return mr;
-
-out_nomrs:
- trace_xprtrdma_nomrs(r_xprt);
- if (r_xprt->rx_ep.rep_connected != -ENODEV)
- schedule_delayed_work(&buf->rb_refresh_worker, 0);
-
- /* Allow the reply handler and refresh worker to run */
- cond_resched();
-
- return NULL;
-}
-
-static void
-__rpcrdma_mr_put(struct rpcrdma_buffer *buf, struct rpcrdma_mr *mr)
-{
- spin_lock(&buf->rb_mrlock);
- rpcrdma_mr_push(mr, &buf->rb_mrs);
- spin_unlock(&buf->rb_mrlock);
}
/**
- * rpcrdma_mr_put - Release an rpcrdma_mr object
- * @mr: object to release
+ * rpcrdma_mr_put - DMA unmap an MR and release it
+ * @mr: MR to release
*
*/
-void
-rpcrdma_mr_put(struct rpcrdma_mr *mr)
-{
- __rpcrdma_mr_put(&mr->mr_xprt->rx_buf, mr);
-}
-
-/**
- * rpcrdma_mr_unmap_and_put - DMA unmap an MR and release it
- * @mr: object to release
- *
- */
-void
-rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr)
+void rpcrdma_mr_put(struct rpcrdma_mr *mr)
{
struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
- trace_xprtrdma_dma_unmap(mr);
- ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
- mr->mr_sg, mr->mr_nents, mr->mr_dir);
- __rpcrdma_mr_put(&r_xprt->rx_buf, mr);
+ if (mr->mr_dir != DMA_NONE) {
+ trace_xprtrdma_mr_unmap(mr);
+ ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device,
+ mr->mr_sg, mr->mr_nents, mr->mr_dir);
+ mr->mr_dir = DMA_NONE;
+ }
+
+ rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
+}
+
+static void rpcrdma_mr_free(struct rpcrdma_mr *mr)
+{
+ struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+
+ mr->mr_req = NULL;
+ spin_lock(&buf->rb_lock);
+ rpcrdma_mr_push(mr, &buf->rb_mrs);
+ spin_unlock(&buf->rb_lock);
}
/**
@@ -1354,105 +1297,112 @@
/**
* rpcrdma_buffer_put - Put request/reply buffers back into pool
+ * @buffers: buffer pool
* @req: object to return
*
*/
-void
-rpcrdma_buffer_put(struct rpcrdma_req *req)
+void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
{
- struct rpcrdma_buffer *buffers = req->rl_buffer;
- struct rpcrdma_rep *rep = req->rl_reply;
-
+ if (req->rl_reply)
+ rpcrdma_rep_put(buffers, req->rl_reply);
req->rl_reply = NULL;
spin_lock(&buffers->rb_lock);
list_add(&req->rl_list, &buffers->rb_send_bufs);
- if (rep) {
- if (!rep->rr_temp) {
- list_add(&rep->rr_list, &buffers->rb_recv_bufs);
- rep = NULL;
- }
- }
spin_unlock(&buffers->rb_lock);
- if (rep)
- rpcrdma_destroy_rep(rep);
-}
-
-/*
- * Put reply buffers back into pool when not attached to
- * request. This happens in error conditions.
- */
-void
-rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
-{
- struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
-
- if (!rep->rr_temp) {
- spin_lock(&buffers->rb_lock);
- list_add(&rep->rr_list, &buffers->rb_recv_bufs);
- spin_unlock(&buffers->rb_lock);
- } else {
- rpcrdma_destroy_rep(rep);
- }
}
/**
- * rpcrdma_alloc_regbuf - allocate and DMA-map memory for SEND/RECV buffers
- * @size: size of buffer to be allocated, in bytes
- * @direction: direction of data movement
- * @flags: GFP flags
+ * rpcrdma_recv_buffer_put - Release rpcrdma_rep back to free list
+ * @rep: rep to release
*
- * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that
- * can be persistently DMA-mapped for I/O.
+ * Used after error conditions.
+ */
+void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
+{
+ rpcrdma_rep_put(&rep->rr_rxprt->rx_buf, rep);
+}
+
+/* Returns a pointer to a rpcrdma_regbuf object, or NULL.
*
* xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
* receiving the payload of RDMA RECV operations. During Long Calls
- * or Replies they may be registered externally via ro_map.
+ * or Replies they may be registered externally via frwr_map.
*/
-struct rpcrdma_regbuf *
-rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction,
+static struct rpcrdma_regbuf *
+rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction,
gfp_t flags)
{
struct rpcrdma_regbuf *rb;
- rb = kmalloc(sizeof(*rb) + size, flags);
- if (rb == NULL)
- return ERR_PTR(-ENOMEM);
+ rb = kmalloc(sizeof(*rb), flags);
+ if (!rb)
+ return NULL;
+ rb->rg_data = kmalloc(size, flags);
+ if (!rb->rg_data) {
+ kfree(rb);
+ return NULL;
+ }
rb->rg_device = NULL;
rb->rg_direction = direction;
rb->rg_iov.length = size;
-
return rb;
}
/**
- * __rpcrdma_map_regbuf - DMA-map a regbuf
- * @ia: controlling rpcrdma_ia
- * @rb: regbuf to be mapped
+ * rpcrdma_regbuf_realloc - re-allocate a SEND/RECV buffer
+ * @rb: regbuf to reallocate
+ * @size: size of buffer to be allocated, in bytes
+ * @flags: GFP flags
+ *
+ * Returns true if reallocation was successful. If false is
+ * returned, @rb is left untouched.
*/
-bool
-__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags)
{
- struct ib_device *device = ia->ri_device;
+ void *buf;
+
+ buf = kmalloc(size, flags);
+ if (!buf)
+ return false;
+
+ rpcrdma_regbuf_dma_unmap(rb);
+ kfree(rb->rg_data);
+
+ rb->rg_data = buf;
+ rb->rg_iov.length = size;
+ return true;
+}
+
+/**
+ * __rpcrdma_regbuf_dma_map - DMA-map a regbuf
+ * @r_xprt: controlling transport instance
+ * @rb: regbuf to be mapped
+ *
+ * Returns true if the buffer is now DMA mapped to @r_xprt's device
+ */
+bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_regbuf *rb)
+{
+ struct ib_device *device = r_xprt->rx_ia.ri_id->device;
if (rb->rg_direction == DMA_NONE)
return false;
- rb->rg_iov.addr = ib_dma_map_single(device,
- (void *)rb->rg_base,
- rdmab_length(rb),
- rb->rg_direction);
- if (ib_dma_mapping_error(device, rdmab_addr(rb)))
+ rb->rg_iov.addr = ib_dma_map_single(device, rdmab_data(rb),
+ rdmab_length(rb), rb->rg_direction);
+ if (ib_dma_mapping_error(device, rdmab_addr(rb))) {
+ trace_xprtrdma_dma_maperr(rdmab_addr(rb));
return false;
+ }
rb->rg_device = device;
- rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey;
+ rb->rg_iov.lkey = r_xprt->rx_ia.ri_pd->local_dma_lkey;
return true;
}
-static void
-rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb)
+static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb)
{
if (!rb)
return;
@@ -1460,26 +1410,27 @@
if (!rpcrdma_regbuf_is_mapped(rb))
return;
- ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb),
- rdmab_length(rb), rb->rg_direction);
+ ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb), rdmab_length(rb),
+ rb->rg_direction);
rb->rg_device = NULL;
}
-/**
- * rpcrdma_free_regbuf - deregister and free registered buffer
- * @rb: regbuf to be deregistered and freed
- */
-void
-rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb)
+static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb)
{
- rpcrdma_dma_unmap_regbuf(rb);
+ rpcrdma_regbuf_dma_unmap(rb);
+ if (rb)
+ kfree(rb->rg_data);
kfree(rb);
}
-/*
- * Prepost any receive buffer, then post send.
+/**
+ * rpcrdma_ep_post - Post WRs to a transport's Send Queue
+ * @ia: transport's device information
+ * @ep: transport's RDMA endpoint information
+ * @req: rpcrdma_req containing the Send WR to post
*
- * Receive buffer is donated to hardware, reclaimed upon recv completion.
+ * Returns 0 if the post was successful, otherwise -ENOTCONN
+ * is returned.
*/
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
@@ -1489,8 +1440,7 @@
struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr;
int rc;
- if (!ep->rep_send_count ||
- test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
+ if (!ep->rep_send_count || kref_read(&req->rl_kref) > 1) {
send_wr->send_flags |= IB_SEND_SIGNALED;
ep->rep_send_count = ep->rep_send_batch;
} else {
@@ -1498,77 +1448,79 @@
--ep->rep_send_count;
}
- rc = ia->ri_ops->ro_send(ia, req);
+ rc = frwr_send(ia, req);
trace_xprtrdma_post_send(req, rc);
if (rc)
return -ENOTCONN;
return 0;
}
-/**
- * rpcrdma_post_recvs - Maybe post some Receive buffers
- * @r_xprt: controlling transport
- * @temp: when true, allocate temp rpcrdma_rep objects
- *
- */
-void
+static void
rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct ib_recv_wr *wr, *bad_wr;
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct ib_recv_wr *i, *wr, *bad_wr;
+ struct rpcrdma_rep *rep;
int needed, count, rc;
- needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
- if (buf->rb_posted_receives > needed)
- return;
- needed -= buf->rb_posted_receives;
-
+ rc = 0;
count = 0;
+
+ needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
+ if (likely(ep->rep_receive_count > needed))
+ goto out;
+ needed -= ep->rep_receive_count;
+ if (!temp)
+ needed += RPCRDMA_MAX_RECV_BATCH;
+
+ /* fast path: all needed reps can be found on the free list */
wr = NULL;
while (needed) {
- struct rpcrdma_regbuf *rb;
- struct rpcrdma_rep *rep;
+ rep = rpcrdma_rep_get_locked(buf);
+ if (!rep)
+ rep = rpcrdma_rep_create(r_xprt, temp);
+ if (!rep)
+ break;
- spin_lock(&buf->rb_lock);
- rep = list_first_entry_or_null(&buf->rb_recv_bufs,
- struct rpcrdma_rep, rr_list);
- if (likely(rep))
- list_del(&rep->rr_list);
- spin_unlock(&buf->rb_lock);
- if (!rep) {
- if (rpcrdma_create_rep(r_xprt, temp))
- break;
- continue;
- }
-
- rb = rep->rr_rdmabuf;
- if (!rpcrdma_regbuf_is_mapped(rb)) {
- if (!__rpcrdma_dma_map_regbuf(&r_xprt->rx_ia, rb)) {
- rpcrdma_recv_buffer_put(rep);
- break;
- }
- }
-
- trace_xprtrdma_post_recv(rep->rr_recv_wr.wr_cqe);
rep->rr_recv_wr.next = wr;
wr = &rep->rr_recv_wr;
- ++count;
--needed;
}
- if (!count)
- return;
+ if (!wr)
+ goto out;
+
+ for (i = wr; i; i = i->next) {
+ rep = container_of(i, struct rpcrdma_rep, rr_recv_wr);
+
+ if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
+ goto release_wrs;
+
+ trace_xprtrdma_post_recv(rep);
+ ++count;
+ }
rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr,
(const struct ib_recv_wr **)&bad_wr);
+out:
+ trace_xprtrdma_post_recvs(r_xprt, count, rc);
if (rc) {
- for (wr = bad_wr; wr; wr = wr->next) {
+ for (wr = bad_wr; wr;) {
struct rpcrdma_rep *rep;
rep = container_of(wr, struct rpcrdma_rep, rr_recv_wr);
+ wr = wr->next;
rpcrdma_recv_buffer_put(rep);
--count;
}
}
- buf->rb_posted_receives += count;
- trace_xprtrdma_post_recvs(r_xprt, count, rc);
+ ep->rep_receive_count += count;
+ return;
+
+release_wrs:
+ for (i = wr; i;) {
+ rep = container_of(i, struct rpcrdma_rep, rr_recv_wr);
+ i = i->next;
+ rpcrdma_recv_buffer_put(rep);
+ }
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 2ca14f7..65e6b0e 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -44,8 +44,10 @@
#include <linux/wait.h> /* wait_queue_head_t, etc */
#include <linux/spinlock.h> /* spinlock_t, etc */
-#include <linux/atomic.h> /* atomic_t, etc */
+#include <linux/atomic.h> /* atomic_t, etc */
+#include <linux/kref.h> /* struct kref */
#include <linux/workqueue.h> /* struct work_struct */
+#include <linux/llist.h>
#include <rdma/rdma_cm.h> /* RDMA connection api */
#include <rdma/ib_verbs.h> /* RDMA verbs api */
@@ -66,23 +68,17 @@
* Interface Adapter -- one per transport instance
*/
struct rpcrdma_ia {
- const struct rpcrdma_memreg_ops *ri_ops;
- struct ib_device *ri_device;
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
- struct completion ri_done;
- struct completion ri_remove_done;
int ri_async_rc;
unsigned int ri_max_segs;
unsigned int ri_max_frwr_depth;
- unsigned int ri_max_inline_write;
- unsigned int ri_max_inline_read;
unsigned int ri_max_send_sges;
bool ri_implicit_roundup;
enum ib_mr_type ri_mrtype;
unsigned long ri_flags;
- struct ib_qp_attr ri_qp_attr;
- struct ib_qp_init_attr ri_qp_init_attr;
+ struct completion ri_done;
+ struct completion ri_remove_done;
};
enum {
@@ -96,84 +92,86 @@
struct rpcrdma_ep {
unsigned int rep_send_count;
unsigned int rep_send_batch;
+ unsigned int rep_max_inline_send;
+ unsigned int rep_max_inline_recv;
int rep_connected;
struct ib_qp_init_attr rep_attr;
wait_queue_head_t rep_connect_wait;
struct rpcrdma_connect_private rep_cm_private;
struct rdma_conn_param rep_remote_cma;
- struct delayed_work rep_connect_worker;
+ unsigned int rep_max_requests; /* set by /proc */
+ unsigned int rep_inline_send; /* negotiated */
+ unsigned int rep_inline_recv; /* negotiated */
+ int rep_receive_count;
};
/* Pre-allocate extra Work Requests for handling backward receives
* and sends. This is a fixed value because the Work Queues are
- * allocated when the forward channel is set up.
+ * allocated when the forward channel is set up, long before the
+ * backchannel is provisioned. This value is two times
+ * NFS4_DEF_CB_SLOT_TABLE_SIZE.
*/
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
-#define RPCRDMA_BACKWARD_WRS (8)
+#define RPCRDMA_BACKWARD_WRS (32)
#else
-#define RPCRDMA_BACKWARD_WRS (0)
+#define RPCRDMA_BACKWARD_WRS (0)
#endif
/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
- *
- * The below structure appears at the front of a large region of kmalloc'd
- * memory, which always starts on a good alignment boundary.
*/
struct rpcrdma_regbuf {
struct ib_sge rg_iov;
struct ib_device *rg_device;
enum dma_data_direction rg_direction;
- __be32 rg_base[0] __attribute__ ((aligned(256)));
+ void *rg_data;
};
-static inline u64
-rdmab_addr(struct rpcrdma_regbuf *rb)
+static inline u64 rdmab_addr(struct rpcrdma_regbuf *rb)
{
return rb->rg_iov.addr;
}
-static inline u32
-rdmab_length(struct rpcrdma_regbuf *rb)
+static inline u32 rdmab_length(struct rpcrdma_regbuf *rb)
{
return rb->rg_iov.length;
}
-static inline u32
-rdmab_lkey(struct rpcrdma_regbuf *rb)
+static inline u32 rdmab_lkey(struct rpcrdma_regbuf *rb)
{
return rb->rg_iov.lkey;
}
-static inline struct ib_device *
-rdmab_device(struct rpcrdma_regbuf *rb)
+static inline struct ib_device *rdmab_device(struct rpcrdma_regbuf *rb)
{
return rb->rg_device;
}
+static inline void *rdmab_data(const struct rpcrdma_regbuf *rb)
+{
+ return rb->rg_data;
+}
+
#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN)
/* To ensure a transport can always make forward progress,
* the number of RDMA segments allowed in header chunk lists
- * is capped at 8. This prevents less-capable devices and
- * memory registrations from overrunning the Send buffer
- * while building chunk lists.
+ * is capped at 16. This prevents less-capable devices from
+ * overrunning the Send buffer while building chunk lists.
*
* Elements of the Read list take up more room than the
- * Write list or Reply chunk. 8 read segments means the Read
- * list (or Write list or Reply chunk) cannot consume more
- * than
+ * Write list or Reply chunk. 16 read segments means the
+ * chunk lists cannot consume more than
*
- * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes.
+ * ((16 + 2) * read segment size) + 1 XDR words,
*
- * And the fixed part of the header is another 24 bytes.
- *
- * The smallest inline threshold is 1024 bytes, ensuring that
- * at least 750 bytes are available for RPC messages.
+ * or about 400 bytes. The fixed part of the header is
+ * another 24 bytes. Thus when the inline threshold is
+ * 1024 bytes, at least 600 bytes are available for RPC
+ * message bodies.
*/
enum {
- RPCRDMA_MAX_HDR_SEGS = 8,
- RPCRDMA_HDRBUF_SIZE = 256,
+ RPCRDMA_MAX_HDR_SEGS = 16,
};
/*
@@ -200,14 +198,23 @@
bool rr_temp;
struct rpcrdma_regbuf *rr_rdmabuf;
struct rpcrdma_xprt *rr_rxprt;
- struct work_struct rr_work;
+ struct rpc_rqst *rr_rqst;
struct xdr_buf rr_hdrbuf;
struct xdr_stream rr_stream;
- struct rpc_rqst *rr_rqst;
- struct list_head rr_list;
+ struct llist_node rr_node;
struct ib_recv_wr rr_recv_wr;
};
+/* To reduce the rate at which a transport invokes ib_post_recv
+ * (and thus the hardware doorbell rate), xprtrdma posts Receive
+ * WRs in batches.
+ *
+ * Setting this to zero disables Receive post batching.
+ */
+enum {
+ RPCRDMA_MAX_RECV_BATCH = 7,
+};
+
/* struct rpcrdma_sendctx - DMA mapped SGEs to unmap after Send completes
*/
struct rpcrdma_req;
@@ -215,46 +222,22 @@
struct rpcrdma_sendctx {
struct ib_send_wr sc_wr;
struct ib_cqe sc_cqe;
+ struct ib_device *sc_device;
struct rpcrdma_xprt *sc_xprt;
struct rpcrdma_req *sc_req;
unsigned int sc_unmap_count;
struct ib_sge sc_sges[];
};
-/* Limit the number of SGEs that can be unmapped during one
- * Send completion. This caps the amount of work a single
- * completion can do before returning to the provider.
- *
- * Setting this to zero disables Send completion batching.
- */
-enum {
- RPCRDMA_MAX_SEND_BATCH = 7,
-};
-
/*
* struct rpcrdma_mr - external memory region metadata
*
* An external memory region is any buffer or page that is registered
* on the fly (ie, not pre-registered).
- *
- * Each rpcrdma_buffer has a list of free MWs anchored in rb_mrs. During
- * call_allocate, rpcrdma_buffer_get() assigns one to each segment in
- * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep
- * track of registration metadata while each RPC is pending.
- * rpcrdma_deregister_external() uses this metadata to unmap and
- * release these resources when an RPC is complete.
*/
-enum rpcrdma_frwr_state {
- FRWR_IS_INVALID, /* ready to be used */
- FRWR_IS_VALID, /* in use */
- FRWR_FLUSHED_FR, /* flushed FASTREG WR */
- FRWR_FLUSHED_LI, /* flushed LOCALINV WR */
-};
-
struct rpcrdma_frwr {
struct ib_mr *fr_mr;
struct ib_cqe fr_cqe;
- enum rpcrdma_frwr_state fr_state;
struct completion fr_linv_done;
union {
struct ib_reg_wr fr_regwr;
@@ -262,24 +245,19 @@
};
};
-struct rpcrdma_fmr {
- struct ib_fmr *fm_mr;
- u64 *fm_physaddrs;
-};
-
+struct rpcrdma_req;
struct rpcrdma_mr {
struct list_head mr_list;
+ struct rpcrdma_req *mr_req;
struct scatterlist *mr_sg;
int mr_nents;
enum dma_data_direction mr_dir;
- union {
- struct rpcrdma_fmr fmr;
- struct rpcrdma_frwr frwr;
- };
+ struct rpcrdma_frwr frwr;
struct rpcrdma_xprt *mr_xprt;
u32 mr_handle;
u32 mr_length;
u64 mr_offset;
+ struct work_struct mr_recycle;
struct list_head mr_all;
};
@@ -337,7 +315,6 @@
struct rpcrdma_req {
struct list_head rl_list;
struct rpc_rqst rl_slot;
- struct rpcrdma_buffer *rl_buffer;
struct rpcrdma_rep *rl_reply;
struct xdr_stream rl_stream;
struct xdr_buf rl_hdrbuf;
@@ -347,18 +324,13 @@
struct rpcrdma_regbuf *rl_recvbuf; /* rq_rcv_buf */
struct list_head rl_all;
- unsigned long rl_flags;
+ struct kref rl_kref;
- struct list_head rl_registered; /* registered segments */
+ struct list_head rl_free_mrs;
+ struct list_head rl_registered;
struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
};
-/* rl_flags */
-enum {
- RPCRDMA_REQ_F_PENDING = 0,
- RPCRDMA_REQ_F_TX_RESOURCES,
-};
-
static inline struct rpcrdma_req *
rpcr_to_rdmar(const struct rpc_rqst *rqst)
{
@@ -368,7 +340,7 @@
static inline void
rpcrdma_mr_push(struct rpcrdma_mr *mr, struct list_head *list)
{
- list_add_tail(&mr->mr_list, list);
+ list_add(&mr->mr_list, list);
}
static inline struct rpcrdma_mr *
@@ -376,8 +348,9 @@
{
struct rpcrdma_mr *mr;
- mr = list_first_entry(list, struct rpcrdma_mr, mr_list);
- list_del_init(&mr->mr_list);
+ mr = list_first_entry_or_null(list, struct rpcrdma_mr, mr_list);
+ if (mr)
+ list_del_init(&mr->mr_list);
return mr;
}
@@ -388,53 +361,27 @@
* One of these is associated with a transport instance
*/
struct rpcrdma_buffer {
- spinlock_t rb_mrlock; /* protect rb_mrs list */
+ spinlock_t rb_lock;
+ struct list_head rb_send_bufs;
struct list_head rb_mrs;
- struct list_head rb_all;
unsigned long rb_sc_head;
unsigned long rb_sc_tail;
unsigned long rb_sc_last;
struct rpcrdma_sendctx **rb_sc_ctxs;
- spinlock_t rb_lock; /* protect buf lists */
- struct list_head rb_send_bufs;
- struct list_head rb_recv_bufs;
- unsigned long rb_flags;
+ struct list_head rb_allreqs;
+ struct list_head rb_all_mrs;
+
+ struct llist_head rb_free_reps;
+
u32 rb_max_requests;
u32 rb_credits; /* most recent credit grant */
- int rb_posted_receives;
u32 rb_bc_srv_max_requests;
- spinlock_t rb_reqslock; /* protect rb_allreqs */
- struct list_head rb_allreqs;
-
u32 rb_bc_max_requests;
- spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */
- struct list_head rb_stale_mrs;
- struct delayed_work rb_recovery_worker;
- struct delayed_work rb_refresh_worker;
-};
-#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
-
-/* rb_flags */
-enum {
- RPCRDMA_BUF_F_EMPTY_SCQ = 0,
-};
-
-/*
- * Internal structure for transport instance creation. This
- * exists primarily for modularity.
- *
- * This data should be set with mount options
- */
-struct rpcrdma_create_data_internal {
- unsigned int max_requests; /* max requests (slots) in flight */
- unsigned int rsize; /* mount rsize - max read hdr+data */
- unsigned int wsize; /* mount wsize - max write hdr+data */
- unsigned int inline_rsize; /* max non-rdma read data payload */
- unsigned int inline_wsize; /* max non-rdma write data payload */
+ struct work_struct rb_refresh_worker;
};
/*
@@ -452,7 +399,7 @@
unsigned long hardway_register_count;
unsigned long failed_marshal_count;
unsigned long bad_reply_count;
- unsigned long mrs_recovered;
+ unsigned long mrs_recycled;
unsigned long mrs_orphaned;
unsigned long mrs_allocated;
unsigned long empty_sendctx_q;
@@ -467,36 +414,6 @@
};
/*
- * Per-registration mode operations
- */
-struct rpcrdma_xprt;
-struct rpcrdma_memreg_ops {
- struct rpcrdma_mr_seg *
- (*ro_map)(struct rpcrdma_xprt *,
- struct rpcrdma_mr_seg *, int, bool,
- struct rpcrdma_mr **);
- int (*ro_send)(struct rpcrdma_ia *ia,
- struct rpcrdma_req *req);
- void (*ro_reminv)(struct rpcrdma_rep *rep,
- struct list_head *mrs);
- void (*ro_unmap_sync)(struct rpcrdma_xprt *,
- struct list_head *);
- void (*ro_recover_mr)(struct rpcrdma_mr *mr);
- int (*ro_open)(struct rpcrdma_ia *,
- struct rpcrdma_ep *,
- struct rpcrdma_create_data_internal *);
- size_t (*ro_maxpages)(struct rpcrdma_xprt *);
- int (*ro_init_mr)(struct rpcrdma_ia *,
- struct rpcrdma_mr *);
- void (*ro_release_mr)(struct rpcrdma_mr *mr);
- const char *ro_displayname;
- const int ro_send_w_inv_ok;
-};
-
-extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
-extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops;
-
-/*
* RPCRDMA transport -- encapsulates the structures above for
* integration with RPC.
*
@@ -511,13 +428,12 @@
struct rpcrdma_ia rx_ia;
struct rpcrdma_ep rx_ep;
struct rpcrdma_buffer rx_buf;
- struct rpcrdma_create_data_internal rx_data;
struct delayed_work rx_connect_worker;
+ struct rpc_timeout rx_timeout;
struct rpcrdma_stats rx_stats;
};
#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt)
-#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
static inline const char *
rpcrdma_addrstr(const struct rpcrdma_xprt *r_xprt)
@@ -547,65 +463,72 @@
int rpcrdma_ia_open(struct rpcrdma_xprt *xprt);
void rpcrdma_ia_remove(struct rpcrdma_ia *ia);
void rpcrdma_ia_close(struct rpcrdma_ia *);
-bool frwr_is_supported(struct rpcrdma_ia *);
-bool fmr_is_supported(struct rpcrdma_ia *);
-
-extern struct workqueue_struct *rpcrdma_receive_wq;
/*
* Endpoint calls - xprtrdma/verbs.c
*/
-int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
- struct rpcrdma_create_data_internal *);
-void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
+int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt);
+void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt);
int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
-void rpcrdma_conn_func(struct rpcrdma_ep *ep);
void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
struct rpcrdma_req *);
-void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp);
/*
* Buffer calls - xprtrdma/verbs.c
*/
-struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *);
-void rpcrdma_destroy_req(struct rpcrdma_req *);
+struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size,
+ gfp_t flags);
+void rpcrdma_req_destroy(struct rpcrdma_req *req);
int rpcrdma_buffer_create(struct rpcrdma_xprt *);
void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
-struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf);
+struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt);
struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt);
void rpcrdma_mr_put(struct rpcrdma_mr *mr);
-void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr);
-void rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr);
+
+static inline void
+rpcrdma_mr_recycle(struct rpcrdma_mr *mr)
+{
+ schedule_work(&mr->mr_recycle);
+}
struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
-void rpcrdma_buffer_put(struct rpcrdma_req *);
+void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers,
+ struct rpcrdma_req *req);
void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
-struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction,
- gfp_t);
-bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *);
-void rpcrdma_free_regbuf(struct rpcrdma_regbuf *);
+bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size,
+ gfp_t flags);
+bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_regbuf *rb);
-static inline bool
-rpcrdma_regbuf_is_mapped(struct rpcrdma_regbuf *rb)
+/**
+ * rpcrdma_regbuf_is_mapped - check if buffer is DMA mapped
+ *
+ * Returns true if the buffer is now mapped to rb->rg_device.
+ */
+static inline bool rpcrdma_regbuf_is_mapped(struct rpcrdma_regbuf *rb)
{
return rb->rg_device != NULL;
}
-static inline bool
-rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+/**
+ * rpcrdma_regbuf_dma_map - DMA-map a regbuf
+ * @r_xprt: controlling transport instance
+ * @rb: regbuf to be mapped
+ *
+ * Returns true if the buffer is currently DMA mapped.
+ */
+static inline bool rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_regbuf *rb)
{
if (likely(rpcrdma_regbuf_is_mapped(rb)))
return true;
- return __rpcrdma_dma_map_regbuf(ia, rb);
+ return __rpcrdma_regbuf_dma_map(r_xprt, rb);
}
-int rpcrdma_alloc_wq(void);
-void rpcrdma_destroy_wq(void);
-
/*
* Wrappers for chunk registration, shared by read/write chunk code.
*/
@@ -616,6 +539,24 @@
return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
}
+/* Memory registration calls xprtrdma/frwr_ops.c
+ */
+bool frwr_is_supported(struct ib_device *device);
+void frwr_recycle(struct rpcrdma_req *req);
+void frwr_reset(struct rpcrdma_req *req);
+int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep);
+int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr);
+void frwr_release_mr(struct rpcrdma_mr *mr);
+size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt);
+struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_mr_seg *seg,
+ int nsegs, bool writing, __be32 xid,
+ struct rpcrdma_mr *mr);
+int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req);
+void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
+void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
+void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
+
/*
* RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
*/
@@ -632,14 +573,11 @@
struct rpcrdma_req *req, u32 hdrlen,
struct xdr_buf *xdr,
enum rpcrdma_chunktype rtype);
-void rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc);
+void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc);
int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);
void rpcrdma_reply_handler(struct rpcrdma_rep *rep);
-void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt,
- struct rpcrdma_req *req);
-void rpcrdma_deferred_completion(struct work_struct *work);
static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
{
@@ -649,10 +587,12 @@
/* RPC/RDMA module init - xprtrdma/transport.c
*/
+extern unsigned int xprt_rdma_slot_table_entries;
extern unsigned int xprt_rdma_max_inline_read;
+extern unsigned int xprt_rdma_max_inline_write;
void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
-void rpcrdma_connect_worker(struct work_struct *work);
+void xprt_rdma_close(struct rpc_xprt *xprt);
void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
int xprt_rdma_init(void);
void xprt_rdma_cleanup(void);
@@ -661,8 +601,8 @@
*/
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
-int xprt_rdma_bc_up(struct svc_serv *, struct net *);
size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
+unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *);
int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 6b7539c..70e52f5 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -47,13 +47,15 @@
#include <net/checksum.h>
#include <net/udp.h>
#include <net/tcp.h>
+#include <linux/bvec.h>
+#include <linux/highmem.h>
+#include <linux/uio.h>
+#include <linux/sched/mm.h>
#include <trace/events/sunrpc.h>
#include "sunrpc.h"
-#define RPC_TCP_READ_CHUNK_SZ (3*512*1024)
-
static void xs_close(struct rpc_xprt *xprt);
static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
struct socket *sock);
@@ -68,8 +70,6 @@
static unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT;
static unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT;
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-
#define XS_TCP_LINGER_TO (15U * HZ)
static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO;
@@ -129,7 +129,7 @@
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &xprt_min_resvport_limit,
- .extra2 = &xprt_max_resvport
+ .extra2 = &xprt_max_resvport_limit
},
{
.procname = "max_resvport",
@@ -137,7 +137,7 @@
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &xprt_min_resvport,
+ .extra1 = &xprt_min_resvport_limit,
.extra2 = &xprt_max_resvport_limit
},
{
@@ -159,8 +159,6 @@
{ },
};
-#endif
-
/*
* Wait duration for a reply from the RPC portmapper.
*/
@@ -325,61 +323,480 @@
}
}
-#define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL)
-
-static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen, struct kvec *vec, unsigned int base, int more)
+static size_t
+xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp)
{
- struct msghdr msg = {
- .msg_name = addr,
- .msg_namelen = addrlen,
- .msg_flags = XS_SENDMSG_FLAGS | (more ? MSG_MORE : 0),
- };
- struct kvec iov = {
- .iov_base = vec->iov_base + base,
- .iov_len = vec->iov_len - base,
- };
+ size_t i,n;
- if (iov.iov_len != 0)
- return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len);
- return kernel_sendmsg(sock, &msg, NULL, 0, 0);
+ if (!want || !(buf->flags & XDRBUF_SPARSE_PAGES))
+ return want;
+ n = (buf->page_base + want + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ for (i = 0; i < n; i++) {
+ if (buf->pages[i])
+ continue;
+ buf->bvec[i].bv_page = buf->pages[i] = alloc_page(gfp);
+ if (!buf->pages[i]) {
+ i *= PAGE_SIZE;
+ return i > buf->page_base ? i - buf->page_base : 0;
+ }
+ }
+ return want;
}
-static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy, int *sent_p)
+static ssize_t
+xs_sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags, size_t seek)
{
- ssize_t (*do_sendpage)(struct socket *sock, struct page *page,
- int offset, size_t size, int flags);
- struct page **ppage;
- unsigned int remainder;
+ ssize_t ret;
+ if (seek != 0)
+ iov_iter_advance(&msg->msg_iter, seek);
+ ret = sock_recvmsg(sock, msg, flags);
+ return ret > 0 ? ret + seek : ret;
+}
+
+static ssize_t
+xs_read_kvec(struct socket *sock, struct msghdr *msg, int flags,
+ struct kvec *kvec, size_t count, size_t seek)
+{
+ iov_iter_kvec(&msg->msg_iter, READ, kvec, 1, count);
+ return xs_sock_recvmsg(sock, msg, flags, seek);
+}
+
+static ssize_t
+xs_read_bvec(struct socket *sock, struct msghdr *msg, int flags,
+ struct bio_vec *bvec, unsigned long nr, size_t count,
+ size_t seek)
+{
+ iov_iter_bvec(&msg->msg_iter, READ, bvec, nr, count);
+ return xs_sock_recvmsg(sock, msg, flags, seek);
+}
+
+static ssize_t
+xs_read_discard(struct socket *sock, struct msghdr *msg, int flags,
+ size_t count)
+{
+ iov_iter_discard(&msg->msg_iter, READ, count);
+ return sock_recvmsg(sock, msg, flags);
+}
+
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+static void
+xs_flush_bvec(const struct bio_vec *bvec, size_t count, size_t seek)
+{
+ struct bvec_iter bi = {
+ .bi_size = count,
+ };
+ struct bio_vec bv;
+
+ bvec_iter_advance(bvec, &bi, seek & PAGE_MASK);
+ for_each_bvec(bv, bvec, bi, bi)
+ flush_dcache_page(bv.bv_page);
+}
+#else
+static inline void
+xs_flush_bvec(const struct bio_vec *bvec, size_t count, size_t seek)
+{
+}
+#endif
+
+static ssize_t
+xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags,
+ struct xdr_buf *buf, size_t count, size_t seek, size_t *read)
+{
+ size_t want, seek_init = seek, offset = 0;
+ ssize_t ret;
+
+ want = min_t(size_t, count, buf->head[0].iov_len);
+ if (seek < want) {
+ ret = xs_read_kvec(sock, msg, flags, &buf->head[0], want, seek);
+ if (ret <= 0)
+ goto sock_err;
+ offset += ret;
+ if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC))
+ goto out;
+ if (ret != want)
+ goto out;
+ seek = 0;
+ } else {
+ seek -= want;
+ offset += want;
+ }
+
+ want = xs_alloc_sparse_pages(buf,
+ min_t(size_t, count - offset, buf->page_len),
+ GFP_KERNEL);
+ if (seek < want) {
+ ret = xs_read_bvec(sock, msg, flags, buf->bvec,
+ xdr_buf_pagecount(buf),
+ want + buf->page_base,
+ seek + buf->page_base);
+ if (ret <= 0)
+ goto sock_err;
+ xs_flush_bvec(buf->bvec, ret, seek + buf->page_base);
+ offset += ret - buf->page_base;
+ if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC))
+ goto out;
+ if (ret != want)
+ goto out;
+ seek = 0;
+ } else {
+ seek -= want;
+ offset += want;
+ }
+
+ want = min_t(size_t, count - offset, buf->tail[0].iov_len);
+ if (seek < want) {
+ ret = xs_read_kvec(sock, msg, flags, &buf->tail[0], want, seek);
+ if (ret <= 0)
+ goto sock_err;
+ offset += ret;
+ if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC))
+ goto out;
+ if (ret != want)
+ goto out;
+ } else if (offset < seek_init)
+ offset = seek_init;
+ ret = -EMSGSIZE;
+out:
+ *read = offset - seek_init;
+ return ret;
+sock_err:
+ offset += seek;
+ goto out;
+}
+
+static void
+xs_read_header(struct sock_xprt *transport, struct xdr_buf *buf)
+{
+ if (!transport->recv.copied) {
+ if (buf->head[0].iov_len >= transport->recv.offset)
+ memcpy(buf->head[0].iov_base,
+ &transport->recv.xid,
+ transport->recv.offset);
+ transport->recv.copied = transport->recv.offset;
+ }
+}
+
+static bool
+xs_read_stream_request_done(struct sock_xprt *transport)
+{
+ return transport->recv.fraghdr & cpu_to_be32(RPC_LAST_STREAM_FRAGMENT);
+}
+
+static void
+xs_read_stream_check_eor(struct sock_xprt *transport,
+ struct msghdr *msg)
+{
+ if (xs_read_stream_request_done(transport))
+ msg->msg_flags |= MSG_EOR;
+}
+
+static ssize_t
+xs_read_stream_request(struct sock_xprt *transport, struct msghdr *msg,
+ int flags, struct rpc_rqst *req)
+{
+ struct xdr_buf *buf = &req->rq_private_buf;
+ size_t want, uninitialized_var(read);
+ ssize_t uninitialized_var(ret);
+
+ xs_read_header(transport, buf);
+
+ want = transport->recv.len - transport->recv.offset;
+ if (want != 0) {
+ ret = xs_read_xdr_buf(transport->sock, msg, flags, buf,
+ transport->recv.copied + want,
+ transport->recv.copied,
+ &read);
+ transport->recv.offset += read;
+ transport->recv.copied += read;
+ }
+
+ if (transport->recv.offset == transport->recv.len)
+ xs_read_stream_check_eor(transport, msg);
+
+ if (want == 0)
+ return 0;
+
+ switch (ret) {
+ default:
+ break;
+ case -EFAULT:
+ case -EMSGSIZE:
+ msg->msg_flags |= MSG_TRUNC;
+ return read;
+ case 0:
+ return -ESHUTDOWN;
+ }
+ return ret < 0 ? ret : read;
+}
+
+static size_t
+xs_read_stream_headersize(bool isfrag)
+{
+ if (isfrag)
+ return sizeof(__be32);
+ return 3 * sizeof(__be32);
+}
+
+static ssize_t
+xs_read_stream_header(struct sock_xprt *transport, struct msghdr *msg,
+ int flags, size_t want, size_t seek)
+{
+ struct kvec kvec = {
+ .iov_base = &transport->recv.fraghdr,
+ .iov_len = want,
+ };
+ return xs_read_kvec(transport->sock, msg, flags, &kvec, want, seek);
+}
+
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+static ssize_t
+xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags)
+{
+ struct rpc_xprt *xprt = &transport->xprt;
+ struct rpc_rqst *req;
+ ssize_t ret;
+
+ /* Look up and lock the request corresponding to the given XID */
+ req = xprt_lookup_bc_request(xprt, transport->recv.xid);
+ if (!req) {
+ printk(KERN_WARNING "Callback slot table overflowed\n");
+ return -ESHUTDOWN;
+ }
+ if (transport->recv.copied && !req->rq_private_buf.len)
+ return -ESHUTDOWN;
+
+ ret = xs_read_stream_request(transport, msg, flags, req);
+ if (msg->msg_flags & (MSG_EOR|MSG_TRUNC))
+ xprt_complete_bc_request(req, transport->recv.copied);
+ else
+ req->rq_private_buf.len = transport->recv.copied;
+
+ return ret;
+}
+#else /* CONFIG_SUNRPC_BACKCHANNEL */
+static ssize_t
+xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags)
+{
+ return -ESHUTDOWN;
+}
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
+static ssize_t
+xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags)
+{
+ struct rpc_xprt *xprt = &transport->xprt;
+ struct rpc_rqst *req;
+ ssize_t ret = 0;
+
+ /* Look up and lock the request corresponding to the given XID */
+ spin_lock(&xprt->queue_lock);
+ req = xprt_lookup_rqst(xprt, transport->recv.xid);
+ if (!req || (transport->recv.copied && !req->rq_private_buf.len)) {
+ msg->msg_flags |= MSG_TRUNC;
+ goto out;
+ }
+ xprt_pin_rqst(req);
+ spin_unlock(&xprt->queue_lock);
+
+ ret = xs_read_stream_request(transport, msg, flags, req);
+
+ spin_lock(&xprt->queue_lock);
+ if (msg->msg_flags & (MSG_EOR|MSG_TRUNC))
+ xprt_complete_rqst(req->rq_task, transport->recv.copied);
+ else
+ req->rq_private_buf.len = transport->recv.copied;
+ xprt_unpin_rqst(req);
+out:
+ spin_unlock(&xprt->queue_lock);
+ return ret;
+}
+
+static ssize_t
+xs_read_stream(struct sock_xprt *transport, int flags)
+{
+ struct msghdr msg = { 0 };
+ size_t want, read = 0;
+ ssize_t ret = 0;
+
+ if (transport->recv.len == 0) {
+ want = xs_read_stream_headersize(transport->recv.copied != 0);
+ ret = xs_read_stream_header(transport, &msg, flags, want,
+ transport->recv.offset);
+ if (ret <= 0)
+ goto out_err;
+ transport->recv.offset = ret;
+ if (transport->recv.offset != want)
+ return transport->recv.offset;
+ transport->recv.len = be32_to_cpu(transport->recv.fraghdr) &
+ RPC_FRAGMENT_SIZE_MASK;
+ transport->recv.offset -= sizeof(transport->recv.fraghdr);
+ read = ret;
+ }
+
+ switch (be32_to_cpu(transport->recv.calldir)) {
+ default:
+ msg.msg_flags |= MSG_TRUNC;
+ break;
+ case RPC_CALL:
+ ret = xs_read_stream_call(transport, &msg, flags);
+ break;
+ case RPC_REPLY:
+ ret = xs_read_stream_reply(transport, &msg, flags);
+ }
+ if (msg.msg_flags & MSG_TRUNC) {
+ transport->recv.calldir = cpu_to_be32(-1);
+ transport->recv.copied = -1;
+ }
+ if (ret < 0)
+ goto out_err;
+ read += ret;
+ if (transport->recv.offset < transport->recv.len) {
+ if (!(msg.msg_flags & MSG_TRUNC))
+ return read;
+ msg.msg_flags = 0;
+ ret = xs_read_discard(transport->sock, &msg, flags,
+ transport->recv.len - transport->recv.offset);
+ if (ret <= 0)
+ goto out_err;
+ transport->recv.offset += ret;
+ read += ret;
+ if (transport->recv.offset != transport->recv.len)
+ return read;
+ }
+ if (xs_read_stream_request_done(transport)) {
+ trace_xs_stream_read_request(transport);
+ transport->recv.copied = 0;
+ }
+ transport->recv.offset = 0;
+ transport->recv.len = 0;
+ return read;
+out_err:
+ return ret != 0 ? ret : -ESHUTDOWN;
+}
+
+static __poll_t xs_poll_socket(struct sock_xprt *transport)
+{
+ return transport->sock->ops->poll(transport->file, transport->sock,
+ NULL);
+}
+
+static bool xs_poll_socket_readable(struct sock_xprt *transport)
+{
+ __poll_t events = xs_poll_socket(transport);
+
+ return (events & (EPOLLIN | EPOLLRDNORM)) && !(events & EPOLLRDHUP);
+}
+
+static void xs_poll_check_readable(struct sock_xprt *transport)
+{
+
+ clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
+ if (!xs_poll_socket_readable(transport))
+ return;
+ if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
+ queue_work(xprtiod_workqueue, &transport->recv_worker);
+}
+
+static void xs_stream_data_receive(struct sock_xprt *transport)
+{
+ size_t read = 0;
+ ssize_t ret = 0;
+
+ mutex_lock(&transport->recv_mutex);
+ if (transport->sock == NULL)
+ goto out;
+ for (;;) {
+ ret = xs_read_stream(transport, MSG_DONTWAIT);
+ if (ret < 0)
+ break;
+ read += ret;
+ cond_resched();
+ }
+ if (ret == -ESHUTDOWN)
+ kernel_sock_shutdown(transport->sock, SHUT_RDWR);
+ else
+ xs_poll_check_readable(transport);
+out:
+ mutex_unlock(&transport->recv_mutex);
+ trace_xs_stream_read_data(&transport->xprt, ret, read);
+}
+
+static void xs_stream_data_receive_workfn(struct work_struct *work)
+{
+ struct sock_xprt *transport =
+ container_of(work, struct sock_xprt, recv_worker);
+ unsigned int pflags = memalloc_nofs_save();
+
+ xs_stream_data_receive(transport);
+ memalloc_nofs_restore(pflags);
+}
+
+static void
+xs_stream_reset_connect(struct sock_xprt *transport)
+{
+ transport->recv.offset = 0;
+ transport->recv.len = 0;
+ transport->recv.copied = 0;
+ transport->xmit.offset = 0;
+}
+
+static void
+xs_stream_start_connect(struct sock_xprt *transport)
+{
+ transport->xprt.stat.connect_count++;
+ transport->xprt.stat.connect_start = jiffies;
+}
+
+#define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL)
+
+static int xs_sendmsg(struct socket *sock, struct msghdr *msg, size_t seek)
+{
+ if (seek)
+ iov_iter_advance(&msg->msg_iter, seek);
+ return sock_sendmsg(sock, msg);
+}
+
+static int xs_send_kvec(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t seek)
+{
+ iov_iter_kvec(&msg->msg_iter, WRITE, vec, 1, vec->iov_len);
+ return xs_sendmsg(sock, msg, seek);
+}
+
+static int xs_send_pagedata(struct socket *sock, struct msghdr *msg, struct xdr_buf *xdr, size_t base)
+{
int err;
- remainder = xdr->page_len - base;
- base += xdr->page_base;
- ppage = xdr->pages + (base >> PAGE_SHIFT);
- base &= ~PAGE_MASK;
- do_sendpage = sock->ops->sendpage;
- if (!zerocopy)
- do_sendpage = sock_no_sendpage;
- for(;;) {
- unsigned int len = min_t(unsigned int, PAGE_SIZE - base, remainder);
- int flags = XS_SENDMSG_FLAGS;
+ err = xdr_alloc_bvec(xdr, GFP_KERNEL);
+ if (err < 0)
+ return err;
- remainder -= len;
- if (more)
- flags |= MSG_MORE;
- if (remainder != 0)
- flags |= MSG_SENDPAGE_NOTLAST | MSG_MORE;
- err = do_sendpage(sock, *ppage, base, len, flags);
- if (remainder == 0 || err != len)
- break;
- *sent_p += err;
- ppage++;
- base = 0;
- }
- if (err > 0) {
- *sent_p += err;
- err = 0;
- }
- return err;
+ iov_iter_bvec(&msg->msg_iter, WRITE, xdr->bvec,
+ xdr_buf_pagecount(xdr),
+ xdr->page_len + xdr->page_base);
+ return xs_sendmsg(sock, msg, base + xdr->page_base);
+}
+
+#define xs_record_marker_len() sizeof(rpc_fraghdr)
+
+/* Common case:
+ * - stream transport
+ * - sending from byte 0 of the message
+ * - the message is wholly contained in @xdr's head iovec
+ */
+static int xs_send_rm_and_kvec(struct socket *sock, struct msghdr *msg,
+ rpc_fraghdr marker, struct kvec *vec, size_t base)
+{
+ struct kvec iov[2] = {
+ [0] = {
+ .iov_base = &marker,
+ .iov_len = sizeof(marker)
+ },
+ [1] = *vec,
+ };
+ size_t len = iov[0].iov_len + iov[1].iov_len;
+
+ iov_iter_kvec(&msg->msg_iter, WRITE, iov, 2, len);
+ return xs_sendmsg(sock, msg, base);
}
/**
@@ -389,49 +806,60 @@
* @addrlen: UDP only -- length of destination address
* @xdr: buffer containing this request
* @base: starting position in the buffer
- * @zerocopy: true if it is safe to use sendpage()
+ * @rm: stream record marker field
* @sent_p: return the total number of bytes successfully queued for sending
*
*/
-static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy, int *sent_p)
+static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, rpc_fraghdr rm, int *sent_p)
{
- unsigned int remainder = xdr->len - base;
+ struct msghdr msg = {
+ .msg_name = addr,
+ .msg_namelen = addrlen,
+ .msg_flags = XS_SENDMSG_FLAGS | MSG_MORE,
+ };
+ unsigned int rmsize = rm ? sizeof(rm) : 0;
+ unsigned int remainder = rmsize + xdr->len - base;
+ unsigned int want;
int err = 0;
- int sent = 0;
if (unlikely(!sock))
return -ENOTSOCK;
- if (base != 0) {
- addr = NULL;
- addrlen = 0;
- }
-
- if (base < xdr->head[0].iov_len || addr != NULL) {
- unsigned int len = xdr->head[0].iov_len - base;
+ want = xdr->head[0].iov_len + rmsize;
+ if (base < want) {
+ unsigned int len = want - base;
remainder -= len;
- err = xs_send_kvec(sock, addr, addrlen, &xdr->head[0], base, remainder != 0);
+ if (remainder == 0)
+ msg.msg_flags &= ~MSG_MORE;
+ if (rmsize)
+ err = xs_send_rm_and_kvec(sock, &msg, rm,
+ &xdr->head[0], base);
+ else
+ err = xs_send_kvec(sock, &msg, &xdr->head[0], base);
if (remainder == 0 || err != len)
goto out;
*sent_p += err;
base = 0;
} else
- base -= xdr->head[0].iov_len;
+ base -= want;
if (base < xdr->page_len) {
unsigned int len = xdr->page_len - base;
remainder -= len;
- err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy, &sent);
- *sent_p += sent;
- if (remainder == 0 || sent != len)
+ if (remainder == 0)
+ msg.msg_flags &= ~MSG_MORE;
+ err = xs_send_pagedata(sock, &msg, xdr, base);
+ if (remainder == 0 || err != len)
goto out;
+ *sent_p += err;
base = 0;
} else
base -= xdr->page_len;
if (base >= xdr->tail[0].iov_len)
return 0;
- err = xs_send_kvec(sock, NULL, 0, &xdr->tail[0], base, 0);
+ msg.msg_flags &= ~MSG_MORE;
+ err = xs_send_kvec(sock, &msg, &xdr->tail[0], base);
out:
if (err > 0) {
*sent_p += err;
@@ -440,42 +868,35 @@
return err;
}
-static void xs_nospace_callback(struct rpc_task *task)
-{
- struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt);
-
- transport->inet->sk_write_pending--;
-}
-
/**
- * xs_nospace - place task on wait queue if transmit was incomplete
- * @task: task to put to sleep
+ * xs_nospace - handle transmit was incomplete
+ * @req: pointer to RPC request
*
*/
-static int xs_nospace(struct rpc_task *task)
+static int xs_nospace(struct rpc_rqst *req)
{
- struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
struct sock *sk = transport->inet;
int ret = -EAGAIN;
dprintk("RPC: %5u xmit incomplete (%u left of %u)\n",
- task->tk_pid, req->rq_slen - req->rq_bytes_sent,
+ req->rq_task->tk_pid,
+ req->rq_slen - transport->xmit.offset,
req->rq_slen);
/* Protect against races with write_space */
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
/* Don't race with disconnect */
if (xprt_connected(xprt)) {
/* wait for more buffer space */
sk->sk_write_pending++;
- xprt_wait_for_buffer_space(task, xs_nospace_callback);
+ xprt_wait_for_buffer_space(xprt);
} else
ret = -ENOTCONN;
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
/* Race breaker in case memory is freed before above code is called */
if (ret == -EAGAIN) {
@@ -491,19 +912,37 @@
return ret;
}
-/*
- * Construct a stream transport record marker in @buf.
- */
-static inline void xs_encode_stream_record_marker(struct xdr_buf *buf)
+static void
+xs_stream_prepare_request(struct rpc_rqst *req)
{
- u32 reclen = buf->len - sizeof(rpc_fraghdr);
- rpc_fraghdr *base = buf->head[0].iov_base;
- *base = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | reclen);
+ xdr_free_bvec(&req->rq_rcv_buf);
+ req->rq_task->tk_status = xdr_alloc_bvec(&req->rq_rcv_buf, GFP_KERNEL);
+}
+
+/*
+ * Determine if the previous message in the stream was aborted before it
+ * could complete transmission.
+ */
+static bool
+xs_send_request_was_aborted(struct sock_xprt *transport, struct rpc_rqst *req)
+{
+ return transport->xmit.offset != 0 && req->rq_bytes_sent == 0;
+}
+
+/*
+ * Return the stream record marker field for a record of length < 2^31-1
+ */
+static rpc_fraghdr
+xs_stream_record_marker(struct xdr_buf *xdr)
+{
+ if (!xdr->len)
+ return 0;
+ return cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | (u32)xdr->len);
}
/**
* xs_local_send_request - write an RPC request to an AF_LOCAL socket
- * @task: RPC task that manages the state of an RPC request
+ * @req: pointer to RPC request
*
* Return values:
* 0: The request has been sent
@@ -512,35 +951,41 @@
* ENOTCONN: Caller needs to invoke connect logic then call again
* other: Some other error occured, the request was not sent
*/
-static int xs_local_send_request(struct rpc_task *task)
+static int xs_local_send_request(struct rpc_rqst *req)
{
- struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
struct sock_xprt *transport =
container_of(xprt, struct sock_xprt, xprt);
struct xdr_buf *xdr = &req->rq_snd_buf;
+ rpc_fraghdr rm = xs_stream_record_marker(xdr);
+ unsigned int msglen = rm ? req->rq_slen + sizeof(rm) : req->rq_slen;
int status;
int sent = 0;
- xs_encode_stream_record_marker(&req->rq_snd_buf);
+ /* Close the stream if the previous transmission was incomplete */
+ if (xs_send_request_was_aborted(transport, req)) {
+ xs_close(xprt);
+ return -ENOTCONN;
+ }
xs_pktdump("packet data:",
req->rq_svec->iov_base, req->rq_svec->iov_len);
req->rq_xtime = ktime_get();
- status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent,
- true, &sent);
+ status = xs_sendpages(transport->sock, NULL, 0, xdr,
+ transport->xmit.offset, rm, &sent);
dprintk("RPC: %s(%u) = %d\n",
- __func__, xdr->len - req->rq_bytes_sent, status);
+ __func__, xdr->len - transport->xmit.offset, status);
if (status == -EAGAIN && sock_writeable(transport->inet))
status = -ENOBUFS;
if (likely(sent > 0) || status == 0) {
- req->rq_bytes_sent += sent;
- req->rq_xmit_bytes_sent += sent;
- if (likely(req->rq_bytes_sent >= req->rq_slen)) {
- req->rq_bytes_sent = 0;
+ transport->xmit.offset += sent;
+ req->rq_bytes_sent = transport->xmit.offset;
+ if (likely(req->rq_bytes_sent >= msglen)) {
+ req->rq_xmit_bytes_sent += transport->xmit.offset;
+ transport->xmit.offset = 0;
return 0;
}
status = -EAGAIN;
@@ -550,7 +995,7 @@
case -ENOBUFS:
break;
case -EAGAIN:
- status = xs_nospace(task);
+ status = xs_nospace(req);
break;
default:
dprintk("RPC: sendmsg returned unrecognized error %d\n",
@@ -566,7 +1011,7 @@
/**
* xs_udp_send_request - write an RPC request to a UDP socket
- * @task: address of RPC task that manages the state of an RPC request
+ * @req: pointer to RPC request
*
* Return values:
* 0: The request has been sent
@@ -575,9 +1020,8 @@
* ENOTCONN: Caller needs to invoke connect logic then call again
* other: Some other error occurred, the request was not sent
*/
-static int xs_udp_send_request(struct rpc_task *task)
+static int xs_udp_send_request(struct rpc_rqst *req)
{
- struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
struct xdr_buf *xdr = &req->rq_snd_buf;
@@ -590,12 +1034,16 @@
if (!xprt_bound(xprt))
return -ENOTCONN;
+
+ if (!xprt_request_get_cong(xprt, req))
+ return -EBADSLT;
+
req->rq_xtime = ktime_get();
status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen,
- xdr, req->rq_bytes_sent, true, &sent);
+ xdr, 0, 0, &sent);
dprintk("RPC: xs_udp_send_request(%u) = %d\n",
- xdr->len - req->rq_bytes_sent, status);
+ xdr->len, status);
/* firewall is blocking us, don't return -EAGAIN or we end up looping */
if (status == -EPERM)
@@ -619,7 +1067,7 @@
/* Should we call xs_close() here? */
break;
case -EAGAIN:
- status = xs_nospace(task);
+ status = xs_nospace(req);
break;
case -ENETUNREACH:
case -ENOBUFS:
@@ -639,7 +1087,7 @@
/**
* xs_tcp_send_request - write an RPC request to a TCP socket
- * @task: address of RPC task that manages the state of an RPC request
+ * @req: pointer to RPC request
*
* Return values:
* 0: The request has been sent
@@ -651,28 +1099,27 @@
* XXX: In the case of soft timeouts, should we eventually give up
* if sendmsg is not able to make progress?
*/
-static int xs_tcp_send_request(struct rpc_task *task)
+static int xs_tcp_send_request(struct rpc_rqst *req)
{
- struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
struct xdr_buf *xdr = &req->rq_snd_buf;
- bool zerocopy = true;
+ rpc_fraghdr rm = xs_stream_record_marker(xdr);
+ unsigned int msglen = rm ? req->rq_slen + sizeof(rm) : req->rq_slen;
bool vm_wait = false;
int status;
int sent;
- xs_encode_stream_record_marker(&req->rq_snd_buf);
+ /* Close the stream if the previous transmission was incomplete */
+ if (xs_send_request_was_aborted(transport, req)) {
+ if (transport->sock != NULL)
+ kernel_sock_shutdown(transport->sock, SHUT_RDWR);
+ return -ENOTCONN;
+ }
xs_pktdump("packet data:",
req->rq_svec->iov_base,
req->rq_svec->iov_len);
- /* Don't use zero copy if this is a resend. If the RPC call
- * completes while the socket holds a reference to the pages,
- * then we may end up resending corrupted data.
- */
- if (task->tk_flags & RPC_TASK_SENT)
- zerocopy = false;
if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state))
xs_tcp_set_socket_timeouts(xprt, transport->sock);
@@ -684,17 +1131,18 @@
while (1) {
sent = 0;
status = xs_sendpages(transport->sock, NULL, 0, xdr,
- req->rq_bytes_sent, zerocopy, &sent);
+ transport->xmit.offset, rm, &sent);
dprintk("RPC: xs_tcp_send_request(%u) = %d\n",
- xdr->len - req->rq_bytes_sent, status);
+ xdr->len - transport->xmit.offset, status);
/* If we've sent the entire packet, immediately
* reset the count of bytes sent. */
- req->rq_bytes_sent += sent;
- req->rq_xmit_bytes_sent += sent;
- if (likely(req->rq_bytes_sent >= req->rq_slen)) {
- req->rq_bytes_sent = 0;
+ transport->xmit.offset += sent;
+ req->rq_bytes_sent = transport->xmit.offset;
+ if (likely(req->rq_bytes_sent >= msglen)) {
+ req->rq_xmit_bytes_sent += transport->xmit.offset;
+ transport->xmit.offset = 0;
return 0;
}
@@ -732,7 +1180,7 @@
/* Should we call xs_close() here? */
break;
case -EAGAIN:
- status = xs_nospace(task);
+ status = xs_nospace(req);
break;
case -ECONNRESET:
case -ECONNREFUSED:
@@ -749,35 +1197,6 @@
return status;
}
-/**
- * xs_tcp_release_xprt - clean up after a tcp transmission
- * @xprt: transport
- * @task: rpc task
- *
- * This cleans up if an error causes us to abort the transmission of a request.
- * In this case, the socket may need to be reset in order to avoid confusing
- * the server.
- */
-static void xs_tcp_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
-{
- struct rpc_rqst *req;
-
- if (task != xprt->snd_task)
- return;
- if (task == NULL)
- goto out_release;
- req = task->tk_rqstp;
- if (req == NULL)
- goto out_release;
- if (req->rq_bytes_sent == 0)
- goto out_release;
- if (req->rq_bytes_sent == req->rq_snd_buf.len)
- goto out_release;
- set_bit(XPRT_CLOSE_WAIT, &xprt->state);
-out_release:
- xprt_release_xprt(xprt, task);
-}
-
static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk)
{
transport->old_data_ready = sk->sk_data_ready;
@@ -799,6 +1218,15 @@
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
+ clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state);
+ clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state);
+ clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state);
+}
+
+static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr)
+{
+ set_bit(nr, &transport->sock_state);
+ queue_work(xprtiod_workqueue, &transport->error_worker);
}
static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
@@ -819,20 +1247,24 @@
*/
static void xs_error_report(struct sock *sk)
{
+ struct sock_xprt *transport;
struct rpc_xprt *xprt;
- int err;
read_lock_bh(&sk->sk_callback_lock);
if (!(xprt = xprt_from_sock(sk)))
goto out;
- err = -sk->sk_err;
- if (err == 0)
+ transport = container_of(xprt, struct sock_xprt, xprt);
+ transport->xprt_err = -sk->sk_err;
+ if (transport->xprt_err == 0)
goto out;
dprintk("RPC: xs_error_report client %p, error=%d...\n",
- xprt, -err);
- trace_rpc_socket_error(xprt, sk->sk_socket, err);
- xprt_wake_pending_tasks(xprt, err);
+ xprt, -transport->xprt_err);
+ trace_rpc_socket_error(xprt, sk->sk_socket, transport->xprt_err);
+
+ /* barrier ensures xprt_err is set before XPRT_SOCK_WAKE_ERROR */
+ smp_mb__before_atomic();
+ xs_run_error_worker(transport, XPRT_SOCK_WAKE_ERROR);
out:
read_unlock_bh(&sk->sk_callback_lock);
}
@@ -842,6 +1274,7 @@
struct socket *sock = transport->sock;
struct sock *sk = transport->inet;
struct rpc_xprt *xprt = &transport->xprt;
+ struct file *filp = transport->file;
if (sk == NULL)
return;
@@ -855,6 +1288,7 @@
write_lock_bh(&sk->sk_callback_lock);
transport->inet = NULL;
transport->sock = NULL;
+ transport->file = NULL;
sk->sk_user_data = NULL;
@@ -862,10 +1296,14 @@
xprt_clear_connected(xprt);
write_unlock_bh(&sk->sk_callback_lock);
xs_sock_reset_connection_flags(xprt);
+ /* Reset stream record info */
+ xs_stream_reset_connect(transport);
mutex_unlock(&transport->recv_mutex);
trace_rpc_socket_close(xprt, sock);
- sock_release(sock);
+ fput(filp);
+
+ xprt_disconnect_done(xprt);
}
/**
@@ -886,8 +1324,6 @@
xs_reset_transport(transport);
xprt->reestablish_timeout = 0;
-
- xprt_disconnect_done(xprt);
}
static void xs_inject_disconnect(struct rpc_xprt *xprt)
@@ -917,118 +1353,11 @@
cancel_delayed_work_sync(&transport->connect_worker);
xs_close(xprt);
cancel_work_sync(&transport->recv_worker);
+ cancel_work_sync(&transport->error_worker);
xs_xprt_free(xprt);
module_put(THIS_MODULE);
}
-static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
-{
- struct xdr_skb_reader desc = {
- .skb = skb,
- .offset = sizeof(rpc_fraghdr),
- .count = skb->len - sizeof(rpc_fraghdr),
- };
-
- if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0)
- return -1;
- if (desc.count)
- return -1;
- return 0;
-}
-
-/**
- * xs_local_data_read_skb
- * @xprt: transport
- * @sk: socket
- * @skb: skbuff
- *
- * Currently this assumes we can read the whole reply in a single gulp.
- */
-static void xs_local_data_read_skb(struct rpc_xprt *xprt,
- struct sock *sk,
- struct sk_buff *skb)
-{
- struct rpc_task *task;
- struct rpc_rqst *rovr;
- int repsize, copied;
- u32 _xid;
- __be32 *xp;
-
- repsize = skb->len - sizeof(rpc_fraghdr);
- if (repsize < 4) {
- dprintk("RPC: impossible RPC reply size %d\n", repsize);
- return;
- }
-
- /* Copy the XID from the skb... */
- xp = skb_header_pointer(skb, sizeof(rpc_fraghdr), sizeof(_xid), &_xid);
- if (xp == NULL)
- return;
-
- /* Look up and lock the request corresponding to the given XID */
- spin_lock(&xprt->recv_lock);
- rovr = xprt_lookup_rqst(xprt, *xp);
- if (!rovr)
- goto out_unlock;
- xprt_pin_rqst(rovr);
- spin_unlock(&xprt->recv_lock);
- task = rovr->rq_task;
-
- copied = rovr->rq_private_buf.buflen;
- if (copied > repsize)
- copied = repsize;
-
- if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) {
- dprintk("RPC: sk_buff copy failed\n");
- spin_lock(&xprt->recv_lock);
- goto out_unpin;
- }
-
- spin_lock(&xprt->recv_lock);
- xprt_complete_rqst(task, copied);
-out_unpin:
- xprt_unpin_rqst(rovr);
- out_unlock:
- spin_unlock(&xprt->recv_lock);
-}
-
-static void xs_local_data_receive(struct sock_xprt *transport)
-{
- struct sk_buff *skb;
- struct sock *sk;
- int err;
-
-restart:
- mutex_lock(&transport->recv_mutex);
- sk = transport->inet;
- if (sk == NULL)
- goto out;
- for (;;) {
- skb = skb_recv_datagram(sk, 0, 1, &err);
- if (skb != NULL) {
- xs_local_data_read_skb(&transport->xprt, sk, skb);
- skb_free_datagram(sk, skb);
- continue;
- }
- if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
- break;
- if (need_resched()) {
- mutex_unlock(&transport->recv_mutex);
- cond_resched();
- goto restart;
- }
- }
-out:
- mutex_unlock(&transport->recv_mutex);
-}
-
-static void xs_local_data_receive_workfn(struct work_struct *work)
-{
- struct sock_xprt *transport =
- container_of(work, struct sock_xprt, recv_worker);
- xs_local_data_receive(transport);
-}
-
/**
* xs_udp_data_read_skb - receive callback for UDP sockets
* @xprt: transport
@@ -1058,13 +1387,13 @@
return;
/* Look up and lock the request corresponding to the given XID */
- spin_lock(&xprt->recv_lock);
+ spin_lock(&xprt->queue_lock);
rovr = xprt_lookup_rqst(xprt, *xp);
if (!rovr)
goto out_unlock;
xprt_pin_rqst(rovr);
xprt_update_rtt(rovr->rq_task);
- spin_unlock(&xprt->recv_lock);
+ spin_unlock(&xprt->queue_lock);
task = rovr->rq_task;
if ((copied = rovr->rq_private_buf.buflen) > repsize)
@@ -1072,22 +1401,22 @@
/* Suck it into the iovec, verify checksum if not done by hw. */
if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) {
- spin_lock(&xprt->recv_lock);
+ spin_lock(&xprt->queue_lock);
__UDPX_INC_STATS(sk, UDP_MIB_INERRORS);
goto out_unpin;
}
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
xprt_adjust_cwnd(xprt, task, copied);
- spin_unlock_bh(&xprt->transport_lock);
- spin_lock(&xprt->recv_lock);
+ spin_unlock(&xprt->transport_lock);
+ spin_lock(&xprt->queue_lock);
xprt_complete_rqst(task, copied);
__UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS);
out_unpin:
xprt_unpin_rqst(rovr);
out_unlock:
- spin_unlock(&xprt->recv_lock);
+ spin_unlock(&xprt->queue_lock);
}
static void xs_udp_data_receive(struct sock_xprt *transport)
@@ -1096,26 +1425,19 @@
struct sock *sk;
int err;
-restart:
mutex_lock(&transport->recv_mutex);
sk = transport->inet;
if (sk == NULL)
goto out;
for (;;) {
skb = skb_recv_udp(sk, 0, 1, &err);
- if (skb != NULL) {
- xs_udp_data_read_skb(&transport->xprt, sk, skb);
- consume_skb(skb);
- continue;
- }
- if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
+ if (skb == NULL)
break;
- if (need_resched()) {
- mutex_unlock(&transport->recv_mutex);
- cond_resched();
- goto restart;
- }
+ xs_udp_data_read_skb(&transport->xprt, sk, skb);
+ consume_skb(skb);
+ cond_resched();
}
+ xs_poll_check_readable(transport);
out:
mutex_unlock(&transport->recv_mutex);
}
@@ -1124,7 +1446,10 @@
{
struct sock_xprt *transport =
container_of(work, struct sock_xprt, recv_worker);
+ unsigned int pflags = memalloc_nofs_save();
+
xs_udp_data_receive(transport);
+ memalloc_nofs_restore(pflags);
}
/**
@@ -1163,417 +1488,13 @@
xprt_force_disconnect(xprt);
}
-static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc)
-{
- struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
- size_t len, used;
- char *p;
-
- p = ((char *) &transport->tcp_fraghdr) + transport->tcp_offset;
- len = sizeof(transport->tcp_fraghdr) - transport->tcp_offset;
- used = xdr_skb_read_bits(desc, p, len);
- transport->tcp_offset += used;
- if (used != len)
- return;
-
- transport->tcp_reclen = ntohl(transport->tcp_fraghdr);
- if (transport->tcp_reclen & RPC_LAST_STREAM_FRAGMENT)
- transport->tcp_flags |= TCP_RCV_LAST_FRAG;
- else
- transport->tcp_flags &= ~TCP_RCV_LAST_FRAG;
- transport->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK;
-
- transport->tcp_flags &= ~TCP_RCV_COPY_FRAGHDR;
- transport->tcp_offset = 0;
-
- /* Sanity check of the record length */
- if (unlikely(transport->tcp_reclen < 8)) {
- dprintk("RPC: invalid TCP record fragment length\n");
- xs_tcp_force_close(xprt);
- return;
- }
- dprintk("RPC: reading TCP record fragment of length %d\n",
- transport->tcp_reclen);
-}
-
-static void xs_tcp_check_fraghdr(struct sock_xprt *transport)
-{
- if (transport->tcp_offset == transport->tcp_reclen) {
- transport->tcp_flags |= TCP_RCV_COPY_FRAGHDR;
- transport->tcp_offset = 0;
- if (transport->tcp_flags & TCP_RCV_LAST_FRAG) {
- transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
- transport->tcp_flags |= TCP_RCV_COPY_XID;
- transport->tcp_copied = 0;
- }
- }
-}
-
-static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_reader *desc)
-{
- size_t len, used;
- char *p;
-
- len = sizeof(transport->tcp_xid) - transport->tcp_offset;
- dprintk("RPC: reading XID (%zu bytes)\n", len);
- p = ((char *) &transport->tcp_xid) + transport->tcp_offset;
- used = xdr_skb_read_bits(desc, p, len);
- transport->tcp_offset += used;
- if (used != len)
- return;
- transport->tcp_flags &= ~TCP_RCV_COPY_XID;
- transport->tcp_flags |= TCP_RCV_READ_CALLDIR;
- transport->tcp_copied = 4;
- dprintk("RPC: reading %s XID %08x\n",
- (transport->tcp_flags & TCP_RPC_REPLY) ? "reply for"
- : "request with",
- ntohl(transport->tcp_xid));
- xs_tcp_check_fraghdr(transport);
-}
-
-static inline void xs_tcp_read_calldir(struct sock_xprt *transport,
- struct xdr_skb_reader *desc)
-{
- size_t len, used;
- u32 offset;
- char *p;
-
- /*
- * We want transport->tcp_offset to be 8 at the end of this routine
- * (4 bytes for the xid and 4 bytes for the call/reply flag).
- * When this function is called for the first time,
- * transport->tcp_offset is 4 (after having already read the xid).
- */
- offset = transport->tcp_offset - sizeof(transport->tcp_xid);
- len = sizeof(transport->tcp_calldir) - offset;
- dprintk("RPC: reading CALL/REPLY flag (%zu bytes)\n", len);
- p = ((char *) &transport->tcp_calldir) + offset;
- used = xdr_skb_read_bits(desc, p, len);
- transport->tcp_offset += used;
- if (used != len)
- return;
- transport->tcp_flags &= ~TCP_RCV_READ_CALLDIR;
- /*
- * We don't yet have the XDR buffer, so we will write the calldir
- * out after we get the buffer from the 'struct rpc_rqst'
- */
- switch (ntohl(transport->tcp_calldir)) {
- case RPC_REPLY:
- transport->tcp_flags |= TCP_RCV_COPY_CALLDIR;
- transport->tcp_flags |= TCP_RCV_COPY_DATA;
- transport->tcp_flags |= TCP_RPC_REPLY;
- break;
- case RPC_CALL:
- transport->tcp_flags |= TCP_RCV_COPY_CALLDIR;
- transport->tcp_flags |= TCP_RCV_COPY_DATA;
- transport->tcp_flags &= ~TCP_RPC_REPLY;
- break;
- default:
- dprintk("RPC: invalid request message type\n");
- xs_tcp_force_close(&transport->xprt);
- }
- xs_tcp_check_fraghdr(transport);
-}
-
-static inline void xs_tcp_read_common(struct rpc_xprt *xprt,
- struct xdr_skb_reader *desc,
- struct rpc_rqst *req)
-{
- struct sock_xprt *transport =
- container_of(xprt, struct sock_xprt, xprt);
- struct xdr_buf *rcvbuf;
- size_t len;
- ssize_t r;
-
- rcvbuf = &req->rq_private_buf;
-
- if (transport->tcp_flags & TCP_RCV_COPY_CALLDIR) {
- /*
- * Save the RPC direction in the XDR buffer
- */
- memcpy(rcvbuf->head[0].iov_base + transport->tcp_copied,
- &transport->tcp_calldir,
- sizeof(transport->tcp_calldir));
- transport->tcp_copied += sizeof(transport->tcp_calldir);
- transport->tcp_flags &= ~TCP_RCV_COPY_CALLDIR;
- }
-
- len = desc->count;
- if (len > transport->tcp_reclen - transport->tcp_offset)
- desc->count = transport->tcp_reclen - transport->tcp_offset;
- r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied,
- desc, xdr_skb_read_bits);
-
- if (desc->count) {
- /* Error when copying to the receive buffer,
- * usually because we weren't able to allocate
- * additional buffer pages. All we can do now
- * is turn off TCP_RCV_COPY_DATA, so the request
- * will not receive any additional updates,
- * and time out.
- * Any remaining data from this record will
- * be discarded.
- */
- transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
- dprintk("RPC: XID %08x truncated request\n",
- ntohl(transport->tcp_xid));
- dprintk("RPC: xprt = %p, tcp_copied = %lu, "
- "tcp_offset = %u, tcp_reclen = %u\n",
- xprt, transport->tcp_copied,
- transport->tcp_offset, transport->tcp_reclen);
- return;
- }
-
- transport->tcp_copied += r;
- transport->tcp_offset += r;
- desc->count = len - r;
-
- dprintk("RPC: XID %08x read %zd bytes\n",
- ntohl(transport->tcp_xid), r);
- dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, "
- "tcp_reclen = %u\n", xprt, transport->tcp_copied,
- transport->tcp_offset, transport->tcp_reclen);
-
- if (transport->tcp_copied == req->rq_private_buf.buflen)
- transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
- else if (transport->tcp_offset == transport->tcp_reclen) {
- if (transport->tcp_flags & TCP_RCV_LAST_FRAG)
- transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
- }
-}
-
-/*
- * Finds the request corresponding to the RPC xid and invokes the common
- * tcp read code to read the data.
- */
-static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
- struct xdr_skb_reader *desc)
-{
- struct sock_xprt *transport =
- container_of(xprt, struct sock_xprt, xprt);
- struct rpc_rqst *req;
-
- dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid));
-
- /* Find and lock the request corresponding to this xid */
- spin_lock(&xprt->recv_lock);
- req = xprt_lookup_rqst(xprt, transport->tcp_xid);
- if (!req) {
- dprintk("RPC: XID %08x request not found!\n",
- ntohl(transport->tcp_xid));
- spin_unlock(&xprt->recv_lock);
- return -1;
- }
- xprt_pin_rqst(req);
- spin_unlock(&xprt->recv_lock);
-
- xs_tcp_read_common(xprt, desc, req);
-
- spin_lock(&xprt->recv_lock);
- if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
- xprt_complete_rqst(req->rq_task, transport->tcp_copied);
- xprt_unpin_rqst(req);
- spin_unlock(&xprt->recv_lock);
- return 0;
-}
-
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
-/*
- * Obtains an rpc_rqst previously allocated and invokes the common
- * tcp read code to read the data. The result is placed in the callback
- * queue.
- * If we're unable to obtain the rpc_rqst we schedule the closing of the
- * connection and return -1.
- */
-static int xs_tcp_read_callback(struct rpc_xprt *xprt,
- struct xdr_skb_reader *desc)
-{
- struct sock_xprt *transport =
- container_of(xprt, struct sock_xprt, xprt);
- struct rpc_rqst *req;
-
- /* Look up the request corresponding to the given XID */
- req = xprt_lookup_bc_request(xprt, transport->tcp_xid);
- if (req == NULL) {
- printk(KERN_WARNING "Callback slot table overflowed\n");
- xprt_force_disconnect(xprt);
- return -1;
- }
-
- dprintk("RPC: read callback XID %08x\n", ntohl(req->rq_xid));
- xs_tcp_read_common(xprt, desc, req);
-
- if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
- xprt_complete_bc_request(req, transport->tcp_copied);
-
- return 0;
-}
-
-static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
- struct xdr_skb_reader *desc)
-{
- struct sock_xprt *transport =
- container_of(xprt, struct sock_xprt, xprt);
-
- return (transport->tcp_flags & TCP_RPC_REPLY) ?
- xs_tcp_read_reply(xprt, desc) :
- xs_tcp_read_callback(xprt, desc);
-}
-
-static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net)
-{
- int ret;
-
- ret = svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0,
- SVC_SOCK_ANONYMOUS);
- if (ret < 0)
- return ret;
- return 0;
-}
-
static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt)
{
return PAGE_SIZE;
}
-#else
-static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
- struct xdr_skb_reader *desc)
-{
- return xs_tcp_read_reply(xprt, desc);
-}
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
-/*
- * Read data off the transport. This can be either an RPC_CALL or an
- * RPC_REPLY. Relay the processing to helper functions.
- */
-static void xs_tcp_read_data(struct rpc_xprt *xprt,
- struct xdr_skb_reader *desc)
-{
- struct sock_xprt *transport =
- container_of(xprt, struct sock_xprt, xprt);
-
- if (_xs_tcp_read_data(xprt, desc) == 0)
- xs_tcp_check_fraghdr(transport);
- else {
- /*
- * The transport_lock protects the request handling.
- * There's no need to hold it to update the tcp_flags.
- */
- transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
- }
-}
-
-static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_skb_reader *desc)
-{
- size_t len;
-
- len = transport->tcp_reclen - transport->tcp_offset;
- if (len > desc->count)
- len = desc->count;
- desc->count -= len;
- desc->offset += len;
- transport->tcp_offset += len;
- dprintk("RPC: discarded %zu bytes\n", len);
- xs_tcp_check_fraghdr(transport);
-}
-
-static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len)
-{
- struct rpc_xprt *xprt = rd_desc->arg.data;
- struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
- struct xdr_skb_reader desc = {
- .skb = skb,
- .offset = offset,
- .count = len,
- };
- size_t ret;
-
- dprintk("RPC: xs_tcp_data_recv started\n");
- do {
- trace_xs_tcp_data_recv(transport);
- /* Read in a new fragment marker if necessary */
- /* Can we ever really expect to get completely empty fragments? */
- if (transport->tcp_flags & TCP_RCV_COPY_FRAGHDR) {
- xs_tcp_read_fraghdr(xprt, &desc);
- continue;
- }
- /* Read in the xid if necessary */
- if (transport->tcp_flags & TCP_RCV_COPY_XID) {
- xs_tcp_read_xid(transport, &desc);
- continue;
- }
- /* Read in the call/reply flag */
- if (transport->tcp_flags & TCP_RCV_READ_CALLDIR) {
- xs_tcp_read_calldir(transport, &desc);
- continue;
- }
- /* Read in the request data */
- if (transport->tcp_flags & TCP_RCV_COPY_DATA) {
- xs_tcp_read_data(xprt, &desc);
- continue;
- }
- /* Skip over any trailing bytes on short reads */
- xs_tcp_read_discard(transport, &desc);
- } while (desc.count);
- ret = len - desc.count;
- if (ret < rd_desc->count)
- rd_desc->count -= ret;
- else
- rd_desc->count = 0;
- trace_xs_tcp_data_recv(transport);
- dprintk("RPC: xs_tcp_data_recv done\n");
- return ret;
-}
-
-static void xs_tcp_data_receive(struct sock_xprt *transport)
-{
- struct rpc_xprt *xprt = &transport->xprt;
- struct sock *sk;
- read_descriptor_t rd_desc = {
- .arg.data = xprt,
- };
- unsigned long total = 0;
- int read = 0;
-
-restart:
- mutex_lock(&transport->recv_mutex);
- sk = transport->inet;
- if (sk == NULL)
- goto out;
-
- /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
- for (;;) {
- rd_desc.count = RPC_TCP_READ_CHUNK_SZ;
- lock_sock(sk);
- read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
- if (rd_desc.count != 0 || read < 0) {
- clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
- release_sock(sk);
- break;
- }
- release_sock(sk);
- total += read;
- if (need_resched()) {
- mutex_unlock(&transport->recv_mutex);
- cond_resched();
- goto restart;
- }
- }
- if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
- queue_work(xprtiod_workqueue, &transport->recv_worker);
-out:
- mutex_unlock(&transport->recv_mutex);
- trace_xs_tcp_data_ready(xprt, read, total);
-}
-
-static void xs_tcp_data_receive_workfn(struct work_struct *work)
-{
- struct sock_xprt *transport =
- container_of(work, struct sock_xprt, recv_worker);
- xs_tcp_data_receive(transport);
-}
-
/**
* xs_tcp_state_change - callback to handle TCP socket state changes
* @sk: socket whose state has changed
@@ -1598,22 +1519,16 @@
trace_rpc_socket_state_change(xprt, sk->sk_socket);
switch (sk->sk_state) {
case TCP_ESTABLISHED:
- spin_lock(&xprt->transport_lock);
if (!xprt_test_and_set_connected(xprt)) {
-
- /* Reset TCP record info */
- transport->tcp_offset = 0;
- transport->tcp_reclen = 0;
- transport->tcp_copied = 0;
- transport->tcp_flags =
- TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID;
xprt->connect_cookie++;
clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state);
xprt_clear_connecting(xprt);
- xprt_wake_pending_tasks(xprt, -EAGAIN);
+ xprt->stat.connect_count++;
+ xprt->stat.connect_time += (long)jiffies -
+ xprt->stat.connect_start;
+ xs_run_error_worker(transport, XPRT_SOCK_WAKE_PENDING);
}
- spin_unlock(&xprt->transport_lock);
break;
case TCP_FIN_WAIT1:
/* The client initiated a shutdown of the socket */
@@ -1629,7 +1544,7 @@
/* The server initiated a shutdown of the socket */
xprt->connect_cookie++;
clear_bit(XPRT_CONNECTED, &xprt->state);
- xs_tcp_force_close(xprt);
+ xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT);
/* fall through */
case TCP_CLOSING:
/*
@@ -1650,10 +1565,8 @@
&transport->sock_state))
xprt_clear_connecting(xprt);
clear_bit(XPRT_CLOSING, &xprt->state);
- if (sk->sk_err)
- xprt_wake_pending_tasks(xprt, -sk->sk_err);
/* Trigger the socket release */
- xs_tcp_force_close(xprt);
+ xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT);
}
out:
read_unlock_bh(&sk->sk_callback_lock);
@@ -1662,6 +1575,7 @@
static void xs_write_space(struct sock *sk)
{
struct socket_wq *wq;
+ struct sock_xprt *transport;
struct rpc_xprt *xprt;
if (!sk->sk_socket)
@@ -1670,12 +1584,14 @@
if (unlikely(!(xprt = xprt_from_sock(sk))))
return;
+ transport = container_of(xprt, struct sock_xprt, xprt);
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0)
goto out;
- xprt_write_space(xprt);
+ xs_run_error_worker(transport, XPRT_SOCK_WAKE_WRITE);
+ sk->sk_write_pending--;
out:
rcu_read_unlock();
}
@@ -1762,22 +1678,29 @@
/**
* xs_udp_timer - called when a retransmit timeout occurs on a UDP transport
+ * @xprt: controlling transport
* @task: task that timed out
*
* Adjust the congestion window after a retransmit timeout has occurred.
*/
static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task)
{
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
xprt_adjust_cwnd(xprt, task, -ETIMEDOUT);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
}
-static unsigned short xs_get_random_port(void)
+static int xs_get_random_port(void)
{
- unsigned short range = xprt_max_resvport - xprt_min_resvport + 1;
- unsigned short rand = (unsigned short) prandom_u32() % range;
- return rand + xprt_min_resvport;
+ unsigned short min = xprt_min_resvport, max = xprt_max_resvport;
+ unsigned short range;
+ unsigned short rand;
+
+ if (max < min)
+ return -EADDRINUSE;
+ range = max - min + 1;
+ rand = (unsigned short) prandom_u32() % range;
+ return rand + min;
}
/**
@@ -1833,9 +1756,9 @@
transport->srcport = xs_sock_getport(sock);
}
-static unsigned short xs_get_srcport(struct sock_xprt *transport)
+static int xs_get_srcport(struct sock_xprt *transport)
{
- unsigned short port = transport->srcport;
+ int port = transport->srcport;
if (port == 0 && transport->xprt.resvport)
port = xs_get_random_port();
@@ -1856,7 +1779,7 @@
{
struct sockaddr_storage myaddr;
int err, nloop = 0;
- unsigned short port = xs_get_srcport(transport);
+ int port = xs_get_srcport(transport);
unsigned short last;
/*
@@ -1874,8 +1797,8 @@
* transport->xprt.resvport == 1) xs_get_srcport above will
* ensure that port is non-zero and we will bind as needed.
*/
- if (port == 0)
- return 0;
+ if (port <= 0)
+ return port;
memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen);
do {
@@ -1974,6 +1897,7 @@
struct sock_xprt *transport, int family, int type,
int protocol, bool reuseport)
{
+ struct file *filp;
struct socket *sock;
int err;
@@ -1994,6 +1918,11 @@
goto out;
}
+ filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ if (IS_ERR(filp))
+ return ERR_CAST(filp);
+ transport->file = filp;
+
return sock;
out:
return ERR_PTR(err);
@@ -2017,7 +1946,6 @@
sk->sk_write_space = xs_udp_write_space;
sock_set_flag(sk, SOCK_FASYNC);
sk->sk_error_report = xs_error_report;
- sk->sk_allocation = GFP_NOIO;
xprt_clear_connected(xprt);
@@ -2028,9 +1956,8 @@
write_unlock_bh(&sk->sk_callback_lock);
}
- /* Tell the socket layer to start connecting... */
- xprt->stat.connect_count++;
- xprt->stat.connect_start = jiffies;
+ xs_stream_start_connect(transport);
+
return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0);
}
@@ -2041,6 +1968,7 @@
static int xs_local_setup_socket(struct sock_xprt *transport)
{
struct rpc_xprt *xprt = &transport->xprt;
+ struct file *filp;
struct socket *sock;
int status = -EIO;
@@ -2053,6 +1981,13 @@
}
xs_reclassify_socket(AF_LOCAL, sock);
+ filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ if (IS_ERR(filp)) {
+ status = PTR_ERR(filp);
+ goto out;
+ }
+ transport->file = filp;
+
dprintk("RPC: worker connecting xprt %p via AF_LOCAL to %s\n",
xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
@@ -2062,6 +1997,9 @@
case 0:
dprintk("RPC: xprt %p connected to %s\n",
xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
+ xprt->stat.connect_count++;
+ xprt->stat.connect_time += (long)jiffies -
+ xprt->stat.connect_start;
xprt_set_connected(xprt);
case -ENOBUFS:
break;
@@ -2100,6 +2038,7 @@
* we'll need to figure out how to pass a namespace to
* connect.
*/
+ task->tk_rpc_status = -ENOTCONN;
rpc_exit(task, -ENOTCONN);
return;
}
@@ -2203,7 +2142,6 @@
sk->sk_data_ready = xs_data_ready;
sk->sk_write_space = xs_udp_write_space;
sock_set_flag(sk, SOCK_FASYNC);
- sk->sk_allocation = GFP_NOIO;
xprt_set_connected(xprt);
@@ -2244,8 +2182,8 @@
trace_rpc_socket_connect(xprt, sock, 0);
status = 0;
out:
- xprt_unlock_connect(xprt, transport);
xprt_clear_connecting(xprt);
+ xprt_unlock_connect(xprt, transport);
xprt_wake_pending_tasks(xprt, status);
}
@@ -2284,13 +2222,13 @@
unsigned int opt_on = 1;
unsigned int timeo;
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ);
keepcnt = xprt->timeout->to_retries + 1;
timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
(xprt->timeout->to_retries + 1);
clear_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
/* TCP Keepalive options */
kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
@@ -2315,7 +2253,7 @@
struct rpc_timeout to;
unsigned long initval;
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
if (reconnect_timeout < xprt->max_reconnect_timeout)
xprt->max_reconnect_timeout = reconnect_timeout;
if (connect_timeout < xprt->connect_timeout) {
@@ -2332,7 +2270,7 @@
xprt->connect_timeout = connect_timeout;
}
set_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
}
static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
@@ -2366,7 +2304,6 @@
sk->sk_write_space = xs_tcp_write_space;
sock_set_flag(sk, SOCK_FASYNC);
sk->sk_error_report = xs_error_report;
- sk->sk_allocation = GFP_NOIO;
/* socket options */
sock_reset_flag(sk, SOCK_LINGER);
@@ -2386,9 +2323,9 @@
xs_set_memalloc(xprt);
+ xs_stream_start_connect(transport);
+
/* Tell the socket layer to start connecting... */
- xprt->stat.connect_count++;
- xprt->stat.connect_start = jiffies;
set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state);
ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK);
switch (ret) {
@@ -2410,6 +2347,7 @@
/**
* xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint
+ * @work: queued work item
*
* Invoked by a work queue tasklet.
*/
@@ -2480,30 +2418,11 @@
}
status = -EAGAIN;
out:
- xprt_unlock_connect(xprt, transport);
xprt_clear_connecting(xprt);
+ xprt_unlock_connect(xprt, transport);
xprt_wake_pending_tasks(xprt, status);
}
-static unsigned long xs_reconnect_delay(const struct rpc_xprt *xprt)
-{
- unsigned long start, now = jiffies;
-
- start = xprt->stat.connect_start + xprt->reestablish_timeout;
- if (time_after(start, now))
- return start - now;
- return 0;
-}
-
-static void xs_reconnect_backoff(struct rpc_xprt *xprt)
-{
- xprt->reestablish_timeout <<= 1;
- if (xprt->reestablish_timeout > xprt->max_reconnect_timeout)
- xprt->reestablish_timeout = xprt->max_reconnect_timeout;
- if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
- xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
-}
-
/**
* xs_connect - connect a socket to a remote endpoint
* @xprt: pointer to transport structure
@@ -2533,8 +2452,8 @@
/* Start by resetting any existing state */
xs_reset_transport(transport);
- delay = xs_reconnect_delay(xprt);
- xs_reconnect_backoff(xprt);
+ delay = xprt_reconnect_delay(xprt);
+ xprt_reconnect_backoff(xprt, XS_TCP_INIT_REEST_TO);
} else
dprintk("RPC: xs_connect scheduled xprt %p\n", xprt);
@@ -2544,6 +2463,53 @@
delay);
}
+static void xs_wake_disconnect(struct sock_xprt *transport)
+{
+ if (test_and_clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state))
+ xs_tcp_force_close(&transport->xprt);
+}
+
+static void xs_wake_write(struct sock_xprt *transport)
+{
+ if (test_and_clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state))
+ xprt_write_space(&transport->xprt);
+}
+
+static void xs_wake_error(struct sock_xprt *transport)
+{
+ int sockerr;
+
+ if (!test_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state))
+ return;
+ mutex_lock(&transport->recv_mutex);
+ if (transport->sock == NULL)
+ goto out;
+ if (!test_and_clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state))
+ goto out;
+ sockerr = xchg(&transport->xprt_err, 0);
+ if (sockerr < 0)
+ xprt_wake_pending_tasks(&transport->xprt, sockerr);
+out:
+ mutex_unlock(&transport->recv_mutex);
+}
+
+static void xs_wake_pending(struct sock_xprt *transport)
+{
+ if (test_and_clear_bit(XPRT_SOCK_WAKE_PENDING, &transport->sock_state))
+ xprt_wake_pending_tasks(&transport->xprt, -EAGAIN);
+}
+
+static void xs_error_handle(struct work_struct *work)
+{
+ struct sock_xprt *transport = container_of(work,
+ struct sock_xprt, error_worker);
+
+ xs_wake_disconnect(transport);
+ xs_wake_write(transport);
+ xs_wake_error(transport);
+ xs_wake_pending(transport);
+}
+
/**
* xs_local_print_stats - display AF_LOCAL socket-specifc stats
* @xprt: rpc_xprt struct containing statistics
@@ -2561,7 +2527,7 @@
"%llu %llu %lu %llu %llu\n",
xprt->stat.bind_count,
xprt->stat.connect_count,
- xprt->stat.connect_time,
+ xprt->stat.connect_time / HZ,
idle_time,
xprt->stat.sends,
xprt->stat.recvs,
@@ -2616,7 +2582,7 @@
transport->srcport,
xprt->stat.bind_count,
xprt->stat.connect_count,
- xprt->stat.connect_time,
+ xprt->stat.connect_time / HZ,
idle_time,
xprt->stat.sends,
xprt->stat.recvs,
@@ -2678,35 +2644,43 @@
{
int len;
struct xdr_buf *xbufp = &req->rq_snd_buf;
- struct rpc_xprt *xprt = req->rq_xprt;
struct sock_xprt *transport =
- container_of(xprt, struct sock_xprt, xprt);
- struct socket *sock = transport->sock;
+ container_of(req->rq_xprt, struct sock_xprt, xprt);
unsigned long headoff;
unsigned long tailoff;
+ struct page *tailpage;
+ struct msghdr msg = {
+ .msg_flags = MSG_MORE
+ };
+ rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT |
+ (u32)xbufp->len);
+ struct kvec iov = {
+ .iov_base = &marker,
+ .iov_len = sizeof(marker),
+ };
- xs_encode_stream_record_marker(xbufp);
+ len = kernel_sendmsg(transport->sock, &msg, &iov, 1, iov.iov_len);
+ if (len != iov.iov_len)
+ return -EAGAIN;
+ tailpage = NULL;
+ if (xbufp->tail[0].iov_len)
+ tailpage = virt_to_page(xbufp->tail[0].iov_base);
tailoff = (unsigned long)xbufp->tail[0].iov_base & ~PAGE_MASK;
headoff = (unsigned long)xbufp->head[0].iov_base & ~PAGE_MASK;
- len = svc_send_common(sock, xbufp,
+ len = svc_send_common(transport->sock, xbufp,
virt_to_page(xbufp->head[0].iov_base), headoff,
- xbufp->tail[0].iov_base, tailoff);
-
- if (len != xbufp->len) {
- printk(KERN_NOTICE "Error sending entire callback!\n");
- len = -EAGAIN;
- }
-
+ tailpage, tailoff);
+ if (len != xbufp->len)
+ return -EAGAIN;
return len;
}
/*
* The send routine. Borrows from svc_send
*/
-static int bc_send_request(struct rpc_task *task)
+static int bc_send_request(struct rpc_rqst *req)
{
- struct rpc_rqst *req = task->tk_rqstp;
struct svc_xprt *xprt;
int len;
@@ -2720,12 +2694,7 @@
* Grab the mutex to serialize data as the connection is shared
* with the fore channel
*/
- if (!mutex_trylock(&xprt->xpt_mutex)) {
- rpc_sleep_on(&xprt->xpt_bc_pending, task, NULL);
- if (!mutex_trylock(&xprt->xpt_mutex))
- return -EAGAIN;
- rpc_wake_up_queued_task(&xprt->xpt_bc_pending, task);
- }
+ mutex_lock(&xprt->xpt_mutex);
if (test_bit(XPT_DEAD, &xprt->xpt_flags))
len = -ENOTCONN;
else
@@ -2761,7 +2730,7 @@
static const struct rpc_xprt_ops xs_local_ops = {
.reserve_xprt = xprt_reserve_xprt,
- .release_xprt = xs_tcp_release_xprt,
+ .release_xprt = xprt_release_xprt,
.alloc_slot = xprt_alloc_slot,
.free_slot = xprt_free_slot,
.rpcbind = xs_local_rpcbind,
@@ -2769,8 +2738,9 @@
.connect = xs_local_connect,
.buf_alloc = rpc_malloc,
.buf_free = rpc_free,
+ .prepare_request = xs_stream_prepare_request,
.send_request = xs_local_send_request,
- .set_retrans_timeout = xprt_set_retrans_timeout_def,
+ .wait_for_reply_request = xprt_wait_for_reply_request_def,
.close = xs_close,
.destroy = xs_destroy,
.print_stats = xs_local_print_stats,
@@ -2790,7 +2760,7 @@
.buf_alloc = rpc_malloc,
.buf_free = rpc_free,
.send_request = xs_udp_send_request,
- .set_retrans_timeout = xprt_set_retrans_timeout_rtt,
+ .wait_for_reply_request = xprt_wait_for_reply_request_rtt,
.timer = xs_udp_timer,
.release_request = xprt_release_rqst_cong,
.close = xs_close,
@@ -2803,16 +2773,17 @@
static const struct rpc_xprt_ops xs_tcp_ops = {
.reserve_xprt = xprt_reserve_xprt,
- .release_xprt = xs_tcp_release_xprt,
- .alloc_slot = xprt_lock_and_alloc_slot,
+ .release_xprt = xprt_release_xprt,
+ .alloc_slot = xprt_alloc_slot,
.free_slot = xprt_free_slot,
.rpcbind = rpcb_getport_async,
.set_port = xs_set_port,
.connect = xs_connect,
.buf_alloc = rpc_malloc,
.buf_free = rpc_free,
+ .prepare_request = xs_stream_prepare_request,
.send_request = xs_tcp_send_request,
- .set_retrans_timeout = xprt_set_retrans_timeout_def,
+ .wait_for_reply_request = xprt_wait_for_reply_request_def,
.close = xs_tcp_shutdown,
.destroy = xs_destroy,
.set_connect_timeout = xs_tcp_set_connect_timeout,
@@ -2822,8 +2793,8 @@
.inject_disconnect = xs_inject_disconnect,
#ifdef CONFIG_SUNRPC_BACKCHANNEL
.bc_setup = xprt_setup_bc,
- .bc_up = xs_tcp_bc_up,
.bc_maxpayload = xs_tcp_bc_maxpayload,
+ .bc_num_slots = xprt_bc_max_slots,
.bc_free_rqst = xprt_free_bc_rqst,
.bc_destroy = xprt_destroy_bc,
#endif
@@ -2841,7 +2812,7 @@
.buf_alloc = bc_malloc,
.buf_free = bc_free,
.send_request = bc_send_request,
- .set_retrans_timeout = xprt_set_retrans_timeout_def,
+ .wait_for_reply_request = xprt_wait_for_reply_request_def,
.close = bc_close,
.destroy = bc_destroy,
.print_stats = xs_tcp_print_stats,
@@ -2942,7 +2913,6 @@
transport = container_of(xprt, struct sock_xprt, xprt);
xprt->prot = 0;
- xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
xprt->bind_timeout = XS_BIND_TO;
@@ -2952,9 +2922,9 @@
xprt->ops = &xs_local_ops;
xprt->timeout = &xs_local_default_timeout;
- INIT_WORK(&transport->recv_worker, xs_local_data_receive_workfn);
- INIT_DELAYED_WORK(&transport->connect_worker,
- xs_dummy_setup_socket);
+ INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn);
+ INIT_WORK(&transport->error_worker, xs_error_handle);
+ INIT_DELAYED_WORK(&transport->connect_worker, xs_dummy_setup_socket);
switch (sun->sun_family) {
case AF_LOCAL:
@@ -3012,7 +2982,6 @@
transport = container_of(xprt, struct sock_xprt, xprt);
xprt->prot = IPPROTO_UDP;
- xprt->tsh_size = 0;
/* XXX: header size can vary due to auth type, IPv6, etc. */
xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
@@ -3025,6 +2994,7 @@
xprt->timeout = &xs_udp_default_timeout;
INIT_WORK(&transport->recv_worker, xs_udp_data_receive_workfn);
+ INIT_WORK(&transport->error_worker, xs_error_handle);
INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_setup_socket);
switch (addr->sa_family) {
@@ -3092,7 +3062,6 @@
transport = container_of(xprt, struct sock_xprt, xprt);
xprt->prot = IPPROTO_TCP;
- xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
xprt->bind_timeout = XS_BIND_TO;
@@ -3106,7 +3075,8 @@
xprt->connect_timeout = xprt->timeout->to_initval *
(xprt->timeout->to_retries + 1);
- INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn);
+ INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn);
+ INIT_WORK(&transport->error_worker, xs_error_handle);
INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket);
switch (addr->sa_family) {
@@ -3165,7 +3135,6 @@
transport = container_of(xprt, struct sock_xprt, xprt);
xprt->prot = IPPROTO_TCP;
- xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
xprt->timeout = &xs_tcp_default_timeout;
@@ -3265,10 +3234,8 @@
*/
int init_socket_xprt(void)
{
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
if (!sunrpc_table_header)
sunrpc_table_header = register_sysctl_table(sunrpc_table);
-#endif
xprt_register_transport(&xs_local_transport);
xprt_register_transport(&xs_udp_transport);
@@ -3284,12 +3251,10 @@
*/
void cleanup_socket_xprt(void)
{
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
if (sunrpc_table_header) {
unregister_sysctl_table(sunrpc_table_header);
sunrpc_table_header = NULL;
}
-#endif
xprt_unregister_transport(&xs_local_transport);
xprt_unregister_transport(&xs_udp_transport);
@@ -3317,12 +3282,8 @@
static int param_set_portnr(const char *val, const struct kernel_param *kp)
{
- if (kp->arg == &xprt_min_resvport)
- return param_set_uint_minmax(val, kp,
- RPC_MIN_RESVPORT,
- xprt_max_resvport);
return param_set_uint_minmax(val, kp,
- xprt_min_resvport,
+ RPC_MIN_RESVPORT,
RPC_MAX_RESVPORT);
}