Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 119a427..f2d1e57 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* xfrm_policy.c
*
@@ -26,10 +27,15 @@
#include <linux/cache.h>
#include <linux/cpu.h>
#include <linux/audit.h>
+#include <linux/rhashtable.h>
+#include <linux/if_tunnel.h>
#include <net/dst.h>
#include <net/flow.h>
#include <net/xfrm.h>
#include <net/ip.h>
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+#include <net/mip6.h>
+#endif
#ifdef CONFIG_XFRM_STATISTICS
#include <net/snmp.h>
#endif
@@ -45,6 +51,99 @@
u8 flags;
};
+/* prefixes smaller than this are stored in lists, not trees. */
+#define INEXACT_PREFIXLEN_IPV4 16
+#define INEXACT_PREFIXLEN_IPV6 48
+
+struct xfrm_pol_inexact_node {
+ struct rb_node node; /* linkage in the rb-tree this node lives in */
+ union {
+ xfrm_address_t addr; /* address part of the node's prefix key */
+ struct rcu_head rcu; /* handed to kfree_rcu(); shares storage with addr */
+ };
+ u8 prefixlen; /* number of leading addr bits that are compared */
+
+ struct rb_root root; /* nested saddr tree under a daddr node, else unused */
+
+ /* the policies matching this node, can be empty list */
+ struct hlist_head hhead;
+};
+
+/* xfrm inexact policy search tree:
+ * xfrm_pol_inexact_bin = hash(dir,type,family,if_id);
+ * |
+ * +---- root_d: sorted by daddr:prefix
+ * | |
+ * | xfrm_pol_inexact_node
+ * | |
+ * | +- root: sorted by saddr/prefix
+ * | | |
+ * | | xfrm_pol_inexact_node
+ * | | |
+ * | | + root: unused
+ * | | |
+ * | | + hhead: saddr:daddr policies
+ * | |
+ * | +- coarse policies and all any:daddr policies
+ * |
+ * +---- root_s: sorted by saddr:prefix
+ * | |
+ * | xfrm_pol_inexact_node
+ * | |
+ * | + root: unused
+ * | |
+ * | + hhead: saddr:any policies
+ * |
+ * +---- coarse policies and all any:any policies
+ *
+ * Lookups return four candidate lists:
+ * 1. any:any list from top-level xfrm_pol_inexact_bin
+ * 2. any:daddr list from daddr tree
+ * 3. saddr:daddr list from 2nd level daddr tree
+ * 4. saddr:any list from saddr tree
+ *
+ * This result set then needs to be searched for the policy with
+ * the lowest priority. If two results have same prio, youngest one wins.
+ */
+
+struct xfrm_pol_inexact_key { /* hash key selecting one xfrm_pol_inexact_bin */
+ possible_net_t net; /* network namespace the bin belongs to */
+ u32 if_id; /* filled from the policy's if_id */
+ u16 family; /* filled from the policy's family */
+ u8 dir, type; /* policy direction and policy type */
+};
+
+struct xfrm_pol_inexact_bin {
+ struct xfrm_pol_inexact_key k; /* key this bin is hashed under */
+ struct rhash_head head; /* linkage in xfrm_policy_inexact_table */
+ /* list containing '*:*' policies */
+ struct hlist_head hhead;
+
+ seqcount_t count; /* written around tree updates; tree lookups retry on change */
+ /* tree sorted by daddr/prefix */
+ struct rb_root root_d;
+
+ /* tree sorted by saddr/prefix */
+ struct rb_root root_s;
+
+ /* slow path below */
+ struct list_head inexact_bins; /* entry in net->xfrm.inexact_bins */
+ struct rcu_head rcu; /* for kfree_rcu() when the bin is pruned */
+};
+
+enum xfrm_pol_inexact_candidate_type {
+ XFRM_POL_CAND_BOTH, /* saddr:daddr list, from the tree nested in a daddr node */
+ XFRM_POL_CAND_SADDR, /* saddr:any list, from the bin's root_s tree */
+ XFRM_POL_CAND_DADDR, /* any:daddr list, from the bin's root_d tree */
+ XFRM_POL_CAND_ANY, /* any:any list, the bin's own hhead */
+
+ XFRM_POL_CAND_MAX, /* number of candidate slots; sizes the res[] array */
+};
+
+struct xfrm_pol_inexact_candidates {
+ struct hlist_head *res[XFRM_POL_CAND_MAX]; /* one list per candidate type; NULL when none was found */
+};
+
static DEFINE_SPINLOCK(xfrm_if_cb_lock);
static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly;
@@ -55,6 +154,9 @@
static struct kmem_cache *xfrm_dst_cache __ro_after_init;
static __read_mostly seqcount_t xfrm_policy_hash_generation;
+static struct rhashtable xfrm_policy_inexact_table;
+static const struct rhashtable_params xfrm_pol_inexact_params;
+
static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr);
static int stale_bundle(struct dst_entry *dst);
static int xfrm_bundle_ok(struct xfrm_dst *xdst);
@@ -64,6 +166,25 @@
static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
int dir);
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir,
+ u32 if_id);
+
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_lookup_rcu(struct net *net,
+ u8 type, u16 family, u8 dir, u32 if_id);
+static struct xfrm_policy *
+xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy,
+ bool excl);
+static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
+ struct xfrm_policy *policy);
+
+static bool
+xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
+ struct xfrm_pol_inexact_bin *b,
+ const xfrm_address_t *saddr,
+ const xfrm_address_t *daddr);
+
static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy)
{
return refcount_inc_not_zero(&policy->refcnt);
@@ -269,6 +390,7 @@
if (policy) {
write_pnet(&policy->xp_net, net);
INIT_LIST_HEAD(&policy->walk.all);
+ INIT_HLIST_NODE(&policy->bydst_inexact_list);
INIT_HLIST_NODE(&policy->bydst);
INIT_HLIST_NODE(&policy->byidx);
rwlock_init(&policy->lock);
@@ -365,7 +487,7 @@
hash = __sel_hash(sel, family, hmask, dbits, sbits);
if (hash == hmask + 1)
- return &net->xfrm.policy_inexact[dir];
+ return NULL;
return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
@@ -463,9 +585,6 @@
odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
lockdep_is_held(&net->xfrm.xfrm_policy_lock));
- odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
- lockdep_is_held(&net->xfrm.xfrm_policy_lock));
-
for (i = hmask; i >= 0; i--)
xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir);
@@ -563,6 +682,526 @@
mutex_unlock(&hash_resize_mutex);
}
+/* Make sure *pol can be inserted into fastbin.
+ * Useful to check that later insert requests will be successful
+ * (provided xfrm_policy_lock is held throughout).
+ *
+ * Returns the bin keyed by the net, family, type and if_id of *pol and by
+ * dir: either the one already hashed or a freshly allocated one.
+ * Returns NULL if the allocation or the hash table insertion fails.
+ */
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir)
+{
+ struct xfrm_pol_inexact_bin *bin, *prev;
+ struct xfrm_pol_inexact_key k = {
+ .family = pol->family,
+ .type = pol->type,
+ .dir = dir,
+ .if_id = pol->if_id,
+ };
+ struct net *net = xp_net(pol);
+
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+ write_pnet(&k.net, net);
+ bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k,
+ xfrm_pol_inexact_params);
+ if (bin) /* already hashed: nothing to allocate */
+ return bin;
+
+ bin = kzalloc(sizeof(*bin), GFP_ATOMIC);
+ if (!bin)
+ return NULL;
+
+ bin->k = k;
+ INIT_HLIST_HEAD(&bin->hhead);
+ bin->root_d = RB_ROOT;
+ bin->root_s = RB_ROOT;
+ seqcount_init(&bin->count);
+
+ prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table,
+ &bin->k, &bin->head,
+ xfrm_pol_inexact_params);
+ if (!prev) { /* NULL: the new bin went into the table */
+ list_add(&bin->inexact_bins, &net->xfrm.inexact_bins);
+ return bin;
+ }
+
+ kfree(bin); /* not inserted: prev is an existing bin or an error pointer */
+
+ return IS_ERR(prev) ? NULL : prev;
+}
+
+static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr,
+ int family, u8 prefixlen)
+{ /* Returns true when addr/prefixlen is to be treated as "any": the
+ * address is the wildcard address, or the prefix is shorter than the
+ * per-family INEXACT_PREFIXLEN_* threshold. Such prefixes are kept in
+ * lists, not in the address trees.
+ */
+ if (xfrm_addr_any(addr, family))
+ return true;
+
+ if (family == AF_INET6 && prefixlen < INEXACT_PREFIXLEN_IPV6)
+ return true;
+
+ if (family == AF_INET && prefixlen < INEXACT_PREFIXLEN_IPV4)
+ return true;
+
+ return false;
+}
+
+static bool
+xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy)
+{ /* Returns true only when both the source and the destination prefix of
+ * the policy selector qualify as "any", i.e. the policy belongs on the
+ * bin's own any:any list.
+ */
+ const xfrm_address_t *addr;
+ bool saddr_any, daddr_any;
+ u8 prefixlen;
+
+ addr = &policy->selector.saddr;
+ prefixlen = policy->selector.prefixlen_s;
+
+ saddr_any = xfrm_pol_inexact_addr_use_any_list(addr,
+ policy->family,
+ prefixlen);
+ addr = &policy->selector.daddr;
+ prefixlen = policy->selector.prefixlen_d;
+ daddr_any = xfrm_pol_inexact_addr_use_any_list(addr,
+ policy->family,
+ prefixlen);
+ return saddr_any && daddr_any;
+}
+
+static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node,
+ const xfrm_address_t *addr, u8 prefixlen)
+{ /* (Re)key @node with addr/prefixlen; no other member is touched. */
+ node->addr = *addr;
+ node->prefixlen = prefixlen;
+}
+
+static struct xfrm_pol_inexact_node *
+xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen)
+{ /* Allocate a zeroed node with GFP_ATOMIC and key it with addr/prefixlen.
+ * Returns NULL if the allocation fails.
+ */
+ struct xfrm_pol_inexact_node *node;
+
+ node = kzalloc(sizeof(*node), GFP_ATOMIC);
+ if (node)
+ xfrm_pol_inexact_node_init(node, addr, prefixlen);
+
+ return node;
+}
+
+/* Three-way compare of the first @prefixlen bits of addresses @a and @b.
+ *
+ * Returns a negative value, zero or a positive value when the masked
+ * prefix of @a sorts before, equal to or after the masked prefix of @b.
+ * Callers only rely on the sign (or on the result being zero).
+ *
+ * The result orders the inexact policy rb-trees, so it has to be a real
+ * three-way comparison. Deriving it from a subtraction of the masked
+ * 32bit words wraps as soon as the difference exceeds INT_MAX, which
+ * flips the sign and yields an inconsistent ordering.
+ *
+ * A @prefixlen of 0 compares equal for every pair of addresses, and so
+ * does an unknown @family.
+ */
+static int xfrm_policy_addr_delta(const xfrm_address_t *a,
+				  const xfrm_address_t *b,
+				  u8 prefixlen, u16 family)
+{
+	u32 ma, mb, mask;
+	unsigned int pdw, pbi;
+	int delta = 0;
+
+	switch (family) {
+	case AF_INET:
+		/* nothing to compare; also avoids a 32bit shift by 32 */
+		if (prefixlen == 0)
+			return 0;
+		mask = ~0U << (32 - prefixlen);
+		ma = ntohl(a->a4) & mask;
+		mb = ntohl(b->a4) & mask;
+		if (ma < mb)
+			delta = -1;
+		else if (ma > mb)
+			delta = 1;
+		break;
+	case AF_INET6:
+		pdw = prefixlen >> 5;	/* number of complete 32bit words */
+		pbi = prefixlen & 0x1f;	/* remaining bits in the next word */
+
+		if (pdw) {
+			delta = memcmp(a->a6, b->a6, pdw << 2);
+			if (delta)
+				return delta;
+		}
+		if (pbi) {
+			mask = ~0U << (32 - pbi);
+			ma = ntohl(a->a6[pdw]) & mask;
+			mb = ntohl(b->a6[pdw]) & mask;
+			if (ma < mb)
+				delta = -1;
+			else if (ma > mb)
+				delta = 1;
+		}
+		break;
+	default:
+		break;
+	}
+
+	return delta;
+}
+
+static void xfrm_policy_inexact_list_reinsert(struct net *net,
+ struct xfrm_pol_inexact_node *n,
+ u16 family)
+{ /* Link every policy flagged with bydst_reinsert into n->hhead.
+ *
+ * Candidates are found by walking net->xfrm.policy_all in reverse. The
+ * flag is cleared and the policy is placed behind the last list entry
+ * that has a lower priority value, or the same priority and a lower pos.
+ */
+ unsigned int matched_s, matched_d;
+ struct xfrm_policy *policy, *p;
+
+ matched_s = 0;
+ matched_d = 0;
+
+ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
+ struct hlist_node *newpos = NULL;
+ bool matches_s, matches_d;
+
+ if (!policy->bydst_reinsert)
+ continue;
+
+ WARN_ON_ONCE(policy->family != family);
+
+ policy->bydst_reinsert = false;
+ hlist_for_each_entry(p, &n->hhead, bydst) {
+ if (policy->priority > p->priority)
+ newpos = &p->bydst;
+ else if (policy->priority == p->priority &&
+ policy->pos > p->pos)
+ newpos = &p->bydst;
+ else
+ break;
+ }
+
+ if (newpos)
+ hlist_add_behind_rcu(&policy->bydst, newpos);
+ else
+ hlist_add_head_rcu(&policy->bydst, &n->hhead);
+
+ /* paranoia checks follow.
+ * Check that the reinserted policy matches at least
+ * saddr or daddr for current node prefix.
+ *
+ * Matching both is fine, matching saddr in one policy
+ * (but not daddr) and then matching only daddr in another
+ * is a bug.
+ */
+ matches_s = xfrm_policy_addr_delta(&policy->selector.saddr,
+ &n->addr,
+ n->prefixlen,
+ family) == 0;
+ matches_d = xfrm_policy_addr_delta(&policy->selector.daddr,
+ &n->addr,
+ n->prefixlen,
+ family) == 0;
+ if (matches_s && matches_d)
+ continue;
+
+ WARN_ON_ONCE(!matches_s && !matches_d);
+ if (matches_s)
+ matched_s++;
+ if (matches_d)
+ matched_d++;
+ WARN_ON_ONCE(matched_s && matched_d);
+ }
+}
+
+static void xfrm_policy_inexact_node_reinsert(struct net *net,
+ struct xfrm_pol_inexact_node *n,
+ struct rb_root *new,
+ u16 family)
+{ /* Insert node @n into the tree @new.
+ *
+ * If @n and an existing node compare equal under the shorter of their
+ * two prefix lengths, the policies of @n are moved to the existing
+ * node, that node is narrowed to the shorter prefix and @n is freed.
+ * When the prefix lengths differed, the existing node is then taken
+ * out of the tree and inserted again from the root.
+ */
+ struct xfrm_pol_inexact_node *node;
+ struct rb_node **p, *parent;
+
+ /* we should not have another subtree here */
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root));
+restart:
+ parent = NULL;
+ p = &new->rb_node;
+ while (*p) {
+ u8 prefixlen;
+ int delta;
+
+ parent = *p;
+ node = rb_entry(*p, struct xfrm_pol_inexact_node, node);
+
+ prefixlen = min(node->prefixlen, n->prefixlen);
+
+ delta = xfrm_policy_addr_delta(&n->addr, &node->addr,
+ prefixlen, family);
+ if (delta < 0) {
+ p = &parent->rb_left;
+ } else if (delta > 0) {
+ p = &parent->rb_right;
+ } else {
+ bool same_prefixlen = node->prefixlen == n->prefixlen;
+ struct xfrm_policy *tmp;
+
+ hlist_for_each_entry(tmp, &n->hhead, bydst) { /* flag for list_reinsert() below */
+ tmp->bydst_reinsert = true;
+ hlist_del_rcu(&tmp->bydst);
+ }
+
+ node->prefixlen = prefixlen;
+
+ xfrm_policy_inexact_list_reinsert(net, node, family);
+
+ if (same_prefixlen) {
+ kfree_rcu(n, rcu);
+ return;
+ }
+
+ rb_erase(*p, new); /* node's key changed: re-insert it from the root */
+ kfree_rcu(n, rcu);
+ n = node;
+ goto restart;
+ }
+ }
+
+ rb_link_node_rcu(&n->node, parent, p);
+ rb_insert_color(&n->node, new);
+}
+
+/* merge nodes v and n
+ *
+ * The nodes of v's subtree are re-inserted into n->root and the
+ * policies on v's list are moved to n's list. v itself is neither
+ * unlinked from its tree nor freed here.
+ */
+static void xfrm_policy_inexact_node_merge(struct net *net,
+ struct xfrm_pol_inexact_node *v,
+ struct xfrm_pol_inexact_node *n,
+ u16 family)
+{
+ struct xfrm_pol_inexact_node *node;
+ struct xfrm_policy *tmp;
+ struct rb_node *rnode;
+
+ /* To-be-merged node v has a subtree.
+ *
+ * Dismantle it and insert its nodes to n->root.
+ */
+ while ((rnode = rb_first(&v->root)) != NULL) {
+ node = rb_entry(rnode, struct xfrm_pol_inexact_node, node);
+ rb_erase(&node->node, &v->root);
+ xfrm_policy_inexact_node_reinsert(net, node, &n->root,
+ family);
+ }
+
+ hlist_for_each_entry(tmp, &v->hhead, bydst) { /* flag for list_reinsert() below */
+ tmp->bydst_reinsert = true;
+ hlist_del_rcu(&tmp->bydst);
+ }
+
+ xfrm_policy_inexact_list_reinsert(net, n, family);
+}
+
+static struct xfrm_pol_inexact_node *
+xfrm_policy_inexact_insert_node(struct net *net,
+ struct rb_root *root,
+ xfrm_address_t *addr,
+ u16 family, u8 prefixlen, u8 dir)
+{ /* Find or create the node for addr/prefixlen in the tree @root.
+ *
+ * An existing node is returned when it matches @addr under its own
+ * prefix and that prefix is not longer than @prefixlen. Existing nodes
+ * that are subnets of the new prefix are taken out of the tree: the
+ * first one is re-keyed to addr/prefixlen and reused, later ones are
+ * merged into it and freed. Otherwise a new node is allocated.
+ *
+ * Returns NULL only if that allocation fails. @dir is not used here.
+ */
+ struct xfrm_pol_inexact_node *cached = NULL;
+ struct rb_node **p, *parent = NULL;
+ struct xfrm_pol_inexact_node *node;
+
+ p = &root->rb_node;
+ while (*p) {
+ int delta;
+
+ parent = *p;
+ node = rb_entry(*p, struct xfrm_pol_inexact_node, node);
+
+ delta = xfrm_policy_addr_delta(addr, &node->addr,
+ node->prefixlen,
+ family);
+ if (delta == 0 && prefixlen >= node->prefixlen) {
+ WARN_ON_ONCE(cached); /* ipsec policies got lost */
+ return node;
+ }
+
+ if (delta < 0)
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+
+ if (prefixlen < node->prefixlen) {
+ delta = xfrm_policy_addr_delta(addr, &node->addr,
+ prefixlen,
+ family);
+ if (delta)
+ continue;
+
+ /* This node is a subnet of the new prefix. It needs
+ * to be removed and re-inserted with the smaller
+ * prefix and all nodes that are now also covered
+ * by the reduced prefixlen.
+ */
+ rb_erase(&node->node, root);
+
+ if (!cached) {
+ xfrm_pol_inexact_node_init(node, addr,
+ prefixlen);
+ cached = node;
+ } else {
+ /* This node also falls within the new
+ * prefixlen. Merge the to-be-reinserted
+ * node and this one.
+ */
+ xfrm_policy_inexact_node_merge(net, node,
+ cached, family);
+ kfree_rcu(node, rcu);
+ }
+
+ /* restart */
+ p = &root->rb_node;
+ parent = NULL;
+ }
+ }
+
+ node = cached; /* reuse the re-keyed node if one was collected above */
+ if (!node) {
+ node = xfrm_pol_inexact_node_alloc(addr, prefixlen);
+ if (!node)
+ return NULL;
+ }
+
+ rb_link_node_rcu(&node->node, parent, p);
+ rb_insert_color(&node->node, root);
+
+ return node;
+}
+
+static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm)
+{ /* Recursively free the nodes of tree @r that hold no policies and have
+ * an empty subtree. With @rm set, a node that has to be kept triggers
+ * a one-time warning.
+ */
+ struct xfrm_pol_inexact_node *node;
+ struct rb_node *rn = rb_first(r);
+
+ while (rn) {
+ node = rb_entry(rn, struct xfrm_pol_inexact_node, node);
+
+ xfrm_policy_inexact_gc_tree(&node->root, rm);
+ rn = rb_next(rn); /* advance before the node may be erased */
+
+ if (!hlist_empty(&node->hhead) || !RB_EMPTY_ROOT(&node->root)) {
+ WARN_ON_ONCE(rm);
+ continue;
+ }
+
+ rb_erase(&node->node, r);
+ kfree_rcu(node, rcu);
+ }
+}
+
+static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit)
+{ /* Garbage-collect both address trees of @b. If the bin then holds no
+ * tree nodes and no policies, it is removed from the hash table and
+ * from the per-net bin list, then freed after an RCU grace period.
+ * With @net_exit set, anything left behind triggers a one-time warning.
+ */
+ write_seqcount_begin(&b->count);
+ xfrm_policy_inexact_gc_tree(&b->root_d, net_exit);
+ xfrm_policy_inexact_gc_tree(&b->root_s, net_exit);
+ write_seqcount_end(&b->count);
+
+ if (!RB_EMPTY_ROOT(&b->root_d) || !RB_EMPTY_ROOT(&b->root_s) ||
+ !hlist_empty(&b->hhead)) {
+ WARN_ON_ONCE(net_exit);
+ return;
+ }
+
+ if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head,
+ xfrm_pol_inexact_params) == 0) {
+ list_del(&b->inexact_bins);
+ kfree_rcu(b, rcu);
+ }
+}
+
+static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b)
+{ /* Prune @b while holding the xfrm_policy_lock of the bin's netns. */
+ struct net *net = read_pnet(&b->k.net);
+
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+ __xfrm_policy_inexact_prune_bin(b, false);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+}
+
+static void __xfrm_policy_inexact_flush(struct net *net)
+{ /* Prune every bin on net->xfrm.inexact_bins; the caller must hold
+ * xfrm_policy_lock. The _safe iterator is used because pruning can
+ * delete the current bin from the list.
+ */
+ struct xfrm_pol_inexact_bin *bin, *t;
+
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+ list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins)
+ __xfrm_policy_inexact_prune_bin(bin, false);
+}
+
+static struct hlist_head *
+xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin,
+ struct xfrm_policy *policy, u8 dir)
+{ /* Select the list inside @bin that @policy belongs on, creating tree
+ * nodes as needed:
+ * - both prefixes "any": the bin's own list
+ * - only daddr "any": list of the saddr node in root_s
+ * - only saddr "any": list of the daddr node in root_d
+ * - neither: list of the saddr node nested in that daddr node
+ *
+ * Each tree update is wrapped in a write section of bin->count.
+ * Returns NULL if a node could not be set up.
+ */
+ struct xfrm_pol_inexact_node *n;
+ struct net *net;
+
+ net = xp_net(policy);
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+ if (xfrm_policy_inexact_insert_use_any_list(policy))
+ return &bin->hhead;
+
+ if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr,
+ policy->family,
+ policy->selector.prefixlen_d)) {
+ write_seqcount_begin(&bin->count);
+ n = xfrm_policy_inexact_insert_node(net,
+ &bin->root_s,
+ &policy->selector.saddr,
+ policy->family,
+ policy->selector.prefixlen_s,
+ dir);
+ write_seqcount_end(&bin->count);
+ if (!n)
+ return NULL;
+
+ return &n->hhead;
+ }
+
+ /* daddr is fixed */
+ write_seqcount_begin(&bin->count);
+ n = xfrm_policy_inexact_insert_node(net,
+ &bin->root_d,
+ &policy->selector.daddr,
+ policy->family,
+ policy->selector.prefixlen_d, dir);
+ write_seqcount_end(&bin->count);
+ if (!n)
+ return NULL;
+
+ /* saddr is wildcard */
+ if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr,
+ policy->family,
+ policy->selector.prefixlen_s))
+ return &n->hhead;
+
+ write_seqcount_begin(&bin->count);
+ n = xfrm_policy_inexact_insert_node(net,
+ &n->root,
+ &policy->selector.saddr,
+ policy->family,
+ policy->selector.prefixlen_s, dir);
+ write_seqcount_end(&bin->count);
+ if (!n)
+ return NULL;
+
+ return &n->hhead;
+}
+
+static struct xfrm_policy *
+xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl)
+{ /* Link @policy into the matching list of its inexact bin and into
+ * the per-direction net->xfrm.policy_inexact[dir] list.
+ *
+ * Returns what xfrm_policy_insert_list() found as an already present
+ * equivalent policy (NULL if none), ERR_PTR(-ENOMEM) if the bin or
+ * the chain cannot be set up, or ERR_PTR(-EEXIST) for an @excl insert
+ * that hits an existing policy.
+ */
+ struct xfrm_pol_inexact_bin *bin;
+ struct xfrm_policy *delpol;
+ struct hlist_head *chain;
+ struct net *net;
+
+ bin = xfrm_policy_inexact_alloc_bin(policy, dir);
+ if (!bin)
+ return ERR_PTR(-ENOMEM);
+
+ net = xp_net(policy);
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+ chain = xfrm_policy_inexact_alloc_chain(bin, policy, dir);
+ if (!chain) {
+ __xfrm_policy_inexact_prune_bin(bin, false); /* drop what was set up so far, if unused */
+ return ERR_PTR(-ENOMEM);
+ }
+
+ delpol = xfrm_policy_insert_list(chain, policy, excl);
+ if (delpol && excl) {
+ __xfrm_policy_inexact_prune_bin(bin, false);
+ return ERR_PTR(-EEXIST);
+ }
+
+ chain = &net->xfrm.policy_inexact[dir];
+ xfrm_policy_insert_inexact_list(chain, policy);
+
+ if (delpol)
+ __xfrm_policy_inexact_prune_bin(bin, false);
+
+ return delpol;
+}
+
static void xfrm_hash_rebuild(struct work_struct *work)
{
struct net *net = container_of(work, struct net,
@@ -591,14 +1230,66 @@
} while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));
spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+ write_seqcount_begin(&xfrm_policy_hash_generation);
+
+ /* make sure that we can insert the indirect policies again before
+ * we start with destructive action.
+ */
+ list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) {
+ struct xfrm_pol_inexact_bin *bin;
+ u8 dbits, sbits;
+
+ dir = xfrm_policy_id2dir(policy->index);
+ if (policy->walk.dead || dir >= XFRM_POLICY_MAX)
+ continue;
+
+ if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
+ if (policy->family == AF_INET) {
+ dbits = rbits4;
+ sbits = lbits4;
+ } else {
+ dbits = rbits6;
+ sbits = lbits6;
+ }
+ } else {
+ if (policy->family == AF_INET) {
+ dbits = lbits4;
+ sbits = rbits4;
+ } else {
+ dbits = lbits6;
+ sbits = rbits6;
+ }
+ }
+
+ if (policy->selector.prefixlen_d < dbits ||
+ policy->selector.prefixlen_s < sbits)
+ continue;
+
+ bin = xfrm_policy_inexact_alloc_bin(policy, dir);
+ if (!bin)
+ goto out_unlock;
+
+ if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir))
+ goto out_unlock;
+ }
/* reset the bydst and inexact table in all directions */
for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
- INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(policy, n,
+ &net->xfrm.policy_inexact[dir],
+ bydst_inexact_list) {
+ hlist_del_rcu(&policy->bydst);
+ hlist_del_init(&policy->bydst_inexact_list);
+ }
+
hmask = net->xfrm.policy_bydst[dir].hmask;
odst = net->xfrm.policy_bydst[dir].table;
- for (i = hmask; i >= 0; i--)
- INIT_HLIST_HEAD(odst + i);
+ for (i = hmask; i >= 0; i--) {
+ hlist_for_each_entry_safe(policy, n, odst + i, bydst)
+ hlist_del_rcu(&policy->bydst);
+ }
if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
/* dir out => dst = remote, src = local */
net->xfrm.policy_bydst[dir].dbits4 = rbits4;
@@ -616,15 +1307,24 @@
/* re-insert all policies by order of creation */
list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
- if (policy->walk.dead ||
- xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) {
+ if (policy->walk.dead)
+ continue;
+ dir = xfrm_policy_id2dir(policy->index);
+ if (dir >= XFRM_POLICY_MAX) {
/* skip socket policies */
continue;
}
newpos = NULL;
chain = policy_hash_bysel(net, &policy->selector,
- policy->family,
- xfrm_policy_id2dir(policy->index));
+ policy->family, dir);
+
+ if (!chain) {
+ void *p = xfrm_policy_inexact_insert(policy, dir, 0);
+
+ WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p));
+ continue;
+ }
+
hlist_for_each_entry(pol, chain, bydst) {
if (policy->priority >= pol->priority)
newpos = &pol->bydst;
@@ -637,6 +1337,9 @@
hlist_add_head_rcu(&policy->bydst, chain);
}
+out_unlock:
+ __xfrm_policy_inexact_flush(net);
+ write_seqcount_end(&xfrm_policy_hash_generation);
spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
mutex_unlock(&hash_resize_mutex);
@@ -740,18 +1443,97 @@
return false;
}
-int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
+static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed)
{
- struct net *net = xp_net(policy);
- struct xfrm_policy *pol;
- struct xfrm_policy *delpol;
- struct hlist_head *chain;
- struct hlist_node *newpos;
+ const struct xfrm_pol_inexact_key *k = data;
+ u32 a = k->type << 24 | k->dir << 16 | k->family;
- spin_lock_bh(&net->xfrm.xfrm_policy_lock);
- chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
- delpol = NULL;
- newpos = NULL;
+ return jhash_3words(a, k->if_id, net_hash_mix(read_pnet(&k->net)),
+ seed);
+}
+
+static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed)
+{ /* Object hash callback: hashes the key embedded in the bin so that it
+ * agrees with xfrm_pol_bin_key(). @len is not used.
+ */
+ const struct xfrm_pol_inexact_bin *b = data;
+
+ return xfrm_pol_bin_key(&b->k, 0, seed);
+}
+
+static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{ /* Compare callback: returns 0 when the bin at @ptr has the same
+ * netns, dir, type, family and if_id as the lookup key in @arg, and
+ * a non-zero value otherwise.
+ */
+ const struct xfrm_pol_inexact_key *key = arg->key;
+ const struct xfrm_pol_inexact_bin *b = ptr;
+ int ret;
+
+ if (!net_eq(read_pnet(&b->k.net), read_pnet(&key->net)))
+ return -1;
+
+ ret = b->k.dir ^ key->dir; /* XOR is zero only for equal values */
+ if (ret)
+ return ret;
+
+ ret = b->k.type ^ key->type;
+ if (ret)
+ return ret;
+
+ ret = b->k.family ^ key->family;
+ if (ret)
+ return ret;
+
+ return b->k.if_id ^ key->if_id;
+}
+
+static const struct rhashtable_params xfrm_pol_inexact_params = { /* for xfrm_policy_inexact_table */
+ .head_offset = offsetof(struct xfrm_pol_inexact_bin, head),
+ .hashfn = xfrm_pol_bin_key, /* hashes a struct xfrm_pol_inexact_key */
+ .obj_hashfn = xfrm_pol_bin_obj, /* hashes a struct xfrm_pol_inexact_bin */
+ .obj_cmpfn = xfrm_pol_bin_cmp,
+ .automatic_shrinking = true,
+};
+
+static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
+ struct xfrm_policy *policy)
+{ /* Link @policy into @chain via its bydst_inexact_list member, placed
+ * by priority, then renumber ->pos of every entry in list order.
+ *
+ * An equivalent entry (same type, if_id, selector, mark and security
+ * context) is only remembered to decide where the scan stops; it is
+ * not unlinked here.
+ */
+ struct xfrm_policy *pol, *delpol = NULL;
+ struct hlist_node *newpos = NULL;
+ int i = 0;
+
+ hlist_for_each_entry(pol, chain, bydst_inexact_list) {
+ if (pol->type == policy->type &&
+ pol->if_id == policy->if_id &&
+ !selector_cmp(&pol->selector, &policy->selector) &&
+ xfrm_policy_mark_match(policy, pol) &&
+ xfrm_sec_ctx_match(pol->security, policy->security) &&
+ !WARN_ON(delpol)) {
+ delpol = pol;
+ if (policy->priority > pol->priority)
+ continue;
+ } else if (policy->priority >= pol->priority) {
+ newpos = &pol->bydst_inexact_list;
+ continue;
+ }
+ if (delpol)
+ break;
+ }
+
+ if (newpos)
+ hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos);
+ else
+ hlist_add_head_rcu(&policy->bydst_inexact_list, chain);
+
+ hlist_for_each_entry(pol, chain, bydst_inexact_list) { /* pos = index in this list */
+ pol->pos = i;
+ i++;
+ }
+}
+
+static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain,
+ struct xfrm_policy *policy,
+ bool excl)
+{
+ struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL;
+
hlist_for_each_entry(pol, chain, bydst) {
if (pol->type == policy->type &&
pol->if_id == policy->if_id &&
@@ -759,24 +1541,45 @@
xfrm_policy_mark_match(policy, pol) &&
xfrm_sec_ctx_match(pol->security, policy->security) &&
!WARN_ON(delpol)) {
- if (excl) {
- spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
- return -EEXIST;
- }
+ if (excl)
+ return ERR_PTR(-EEXIST);
delpol = pol;
if (policy->priority > pol->priority)
continue;
} else if (policy->priority >= pol->priority) {
- newpos = &pol->bydst;
+ newpos = pol;
continue;
}
if (delpol)
break;
}
+
if (newpos)
- hlist_add_behind_rcu(&policy->bydst, newpos);
+ hlist_add_behind_rcu(&policy->bydst, &newpos->bydst);
else
hlist_add_head_rcu(&policy->bydst, chain);
+
+ return delpol;
+}
+
+int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
+{
+ struct net *net = xp_net(policy);
+ struct xfrm_policy *delpol;
+ struct hlist_head *chain;
+
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+ chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
+ if (chain)
+ delpol = xfrm_policy_insert_list(chain, policy, excl);
+ else
+ delpol = xfrm_policy_inexact_insert(policy, dir, excl);
+
+ if (IS_ERR(delpol)) {
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ return PTR_ERR(delpol);
+ }
+
__xfrm_policy_link(policy, dir);
/* After previous checking, family can either be AF_INET or AF_INET6 */
@@ -806,43 +1609,96 @@
}
EXPORT_SYMBOL(xfrm_policy_insert);
+static struct xfrm_policy *
+__xfrm_policy_bysel_ctx(struct hlist_head *chain, u32 mark, u32 if_id,
+ u8 type, int dir,
+ struct xfrm_selector *sel,
+ struct xfrm_sec_ctx *ctx)
+{ /* Return the first policy on @chain whose type, if_id, mark, selector
+ * and security context all match, or NULL if there is none or @chain
+ * is NULL. No reference is taken. @dir is not used here.
+ */
+ struct xfrm_policy *pol;
+
+ if (!chain)
+ return NULL;
+
+ hlist_for_each_entry(pol, chain, bydst) {
+ if (pol->type == type &&
+ pol->if_id == if_id &&
+ (mark & pol->mark.m) == pol->mark.v &&
+ !selector_cmp(sel, &pol->selector) &&
+ xfrm_sec_ctx_match(ctx, pol->security))
+ return pol;
+ }
+
+ return NULL;
+}
+
struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
u8 type, int dir,
struct xfrm_selector *sel,
struct xfrm_sec_ctx *ctx, int delete,
int *err)
{
- struct xfrm_policy *pol, *ret;
+ struct xfrm_pol_inexact_bin *bin = NULL;
+ struct xfrm_policy *pol, *ret = NULL;
struct hlist_head *chain;
*err = 0;
spin_lock_bh(&net->xfrm.xfrm_policy_lock);
chain = policy_hash_bysel(net, sel, sel->family, dir);
- ret = NULL;
- hlist_for_each_entry(pol, chain, bydst) {
- if (pol->type == type &&
- pol->if_id == if_id &&
- (mark & pol->mark.m) == pol->mark.v &&
- !selector_cmp(sel, &pol->selector) &&
- xfrm_sec_ctx_match(ctx, pol->security)) {
- xfrm_pol_hold(pol);
- if (delete) {
- *err = security_xfrm_policy_delete(
- pol->security);
- if (*err) {
- spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
- return pol;
- }
- __xfrm_policy_unlink(pol, dir);
- }
- ret = pol;
- break;
+ if (!chain) {
+ struct xfrm_pol_inexact_candidates cand;
+ int i;
+
+ bin = xfrm_policy_inexact_lookup(net, type,
+ sel->family, dir, if_id);
+ if (!bin) {
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ return NULL;
}
+
+ if (!xfrm_policy_find_inexact_candidates(&cand, bin,
+ &sel->saddr,
+ &sel->daddr)) {
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ return NULL;
+ }
+
+ pol = NULL;
+ for (i = 0; i < ARRAY_SIZE(cand.res); i++) {
+ struct xfrm_policy *tmp;
+
+ tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark,
+ if_id, type, dir,
+ sel, ctx);
+ if (!tmp)
+ continue;
+
+ if (!pol || tmp->pos < pol->pos)
+ pol = tmp;
+ }
+ } else {
+ pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir,
+ sel, ctx);
+ }
+
+ if (pol) {
+ xfrm_pol_hold(pol);
+ if (delete) {
+ *err = security_xfrm_policy_delete(pol->security);
+ if (*err) {
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ return pol;
+ }
+ __xfrm_policy_unlink(pol, dir);
+ }
+ ret = pol;
}
spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
if (ret && delete)
xfrm_policy_kill(ret);
+ if (bin && delete)
+ xfrm_policy_inexact_prune_bin(bin);
return ret;
}
EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
@@ -892,36 +1748,19 @@
static inline int
xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
{
- int dir, err = 0;
+ struct xfrm_policy *pol;
+ int err = 0;
- for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
- struct xfrm_policy *pol;
- int i;
+ list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
+ if (pol->walk.dead ||
+ xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX ||
+ pol->type != type)
+ continue;
- hlist_for_each_entry(pol,
- &net->xfrm.policy_inexact[dir], bydst) {
- if (pol->type != type)
- continue;
- err = security_xfrm_policy_delete(pol->security);
- if (err) {
- xfrm_audit_policy_delete(pol, 0, task_valid);
- return err;
- }
- }
- for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
- hlist_for_each_entry(pol,
- net->xfrm.policy_bydst[dir].table + i,
- bydst) {
- if (pol->type != type)
- continue;
- err = security_xfrm_policy_delete(
- pol->security);
- if (err) {
- xfrm_audit_policy_delete(pol, 0,
- task_valid);
- return err;
- }
- }
+ err = security_xfrm_policy_delete(pol->security);
+ if (err) {
+ xfrm_audit_policy_delete(pol, 0, task_valid);
+ return err;
}
}
return err;
@@ -937,6 +1776,7 @@
int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
{
int dir, err = 0, cnt = 0;
+ struct xfrm_policy *pol;
spin_lock_bh(&net->xfrm.xfrm_policy_lock);
@@ -944,48 +1784,25 @@
if (err)
goto out;
- for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
- struct xfrm_policy *pol;
- int i;
+again:
+ list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
+ dir = xfrm_policy_id2dir(pol->index);
+ if (pol->walk.dead ||
+ dir >= XFRM_POLICY_MAX ||
+ pol->type != type)
+ continue;
- again1:
- hlist_for_each_entry(pol,
- &net->xfrm.policy_inexact[dir], bydst) {
- if (pol->type != type)
- continue;
- __xfrm_policy_unlink(pol, dir);
- spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
- cnt++;
-
- xfrm_audit_policy_delete(pol, 1, task_valid);
-
- xfrm_policy_kill(pol);
-
- spin_lock_bh(&net->xfrm.xfrm_policy_lock);
- goto again1;
- }
-
- for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
- again2:
- hlist_for_each_entry(pol,
- net->xfrm.policy_bydst[dir].table + i,
- bydst) {
- if (pol->type != type)
- continue;
- __xfrm_policy_unlink(pol, dir);
- spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
- cnt++;
-
- xfrm_audit_policy_delete(pol, 1, task_valid);
- xfrm_policy_kill(pol);
-
- spin_lock_bh(&net->xfrm.xfrm_policy_lock);
- goto again2;
- }
- }
-
+ __xfrm_policy_unlink(pol, dir);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ cnt++;
+ xfrm_audit_policy_delete(pol, 1, task_valid);
+ xfrm_policy_kill(pol);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+ goto again;
}
- if (!cnt)
+ if (cnt)
+ __xfrm_policy_inexact_flush(net);
+ else
err = -ESRCH;
out:
spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
@@ -1084,21 +1901,188 @@
if (match)
ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid,
dir);
-
return ret;
}
+static struct xfrm_pol_inexact_node *
+xfrm_policy_lookup_inexact_addr(const struct rb_root *r,
+ seqcount_t *count,
+ const xfrm_address_t *addr, u16 family)
+{ /* Search tree @r for the node whose prefix contains @addr.
+ *
+ * A hit is returned right away. A miss is only reported as NULL when
+ * @count did not change during the walk; otherwise the walk restarts.
+ */
+ const struct rb_node *parent;
+ int seq;
+
+again:
+ seq = read_seqcount_begin(count);
+
+ parent = rcu_dereference_raw(r->rb_node);
+ while (parent) {
+ struct xfrm_pol_inexact_node *node;
+ int delta;
+
+ node = rb_entry(parent, struct xfrm_pol_inexact_node, node);
+
+ delta = xfrm_policy_addr_delta(addr, &node->addr,
+ node->prefixlen, family);
+ if (delta < 0) {
+ parent = rcu_dereference_raw(parent->rb_left);
+ continue;
+ } else if (delta > 0) {
+ parent = rcu_dereference_raw(parent->rb_right);
+ continue;
+ }
+
+ return node;
+ }
+
+ if (read_seqcount_retry(count, seq)) /* tree changed under us: the miss may be bogus */
+ goto again;
+
+ return NULL;
+}
+
+static bool
+xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
+ struct xfrm_pol_inexact_bin *b,
+ const xfrm_address_t *saddr,
+ const xfrm_address_t *daddr)
+{ /* Fill @cand with the policy lists of bin @b that can match the
+ * saddr/daddr pair. The any:any slot is always set; the other slots
+ * stay NULL when no tree node covers the address.
+ * Returns false (leaving @cand untouched) only when @b is NULL.
+ */
+ struct xfrm_pol_inexact_node *n;
+ u16 family;
+
+ if (!b)
+ return false;
+
+ family = b->k.family;
+ memset(cand, 0, sizeof(*cand));
+ cand->res[XFRM_POL_CAND_ANY] = &b->hhead;
+
+ n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr,
+ family);
+ if (n) {
+ cand->res[XFRM_POL_CAND_DADDR] = &n->hhead;
+ n = xfrm_policy_lookup_inexact_addr(&n->root, &b->count, saddr,
+ family); /* saddr tree nested in the daddr node */
+ if (n)
+ cand->res[XFRM_POL_CAND_BOTH] = &n->hhead;
+ }
+
+ n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr,
+ family);
+ if (n)
+ cand->res[XFRM_POL_CAND_SADDR] = &n->hhead;
+
+ return true;
+}
+
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family,
+ u8 dir, u32 if_id)
+{ /* Look up the bin for (net, type, family, dir, if_id) in the inexact
+ * table; whatever rhashtable_lookup() yields is returned as is.
+ * No locking is done here (see xfrm_policy_inexact_lookup()).
+ */
+ struct xfrm_pol_inexact_key k = {
+ .family = family,
+ .type = type,
+ .dir = dir,
+ .if_id = if_id,
+ };
+
+ write_pnet(&k.net, net);
+
+ return rhashtable_lookup(&xfrm_policy_inexact_table, &k,
+ xfrm_pol_inexact_params);
+}
+
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family,
+ u8 dir, u32 if_id)
+{ /* Variant of the _rcu lookup for callers holding xfrm_policy_lock:
+ * it takes the RCU read lock around the table lookup itself.
+ */
+ struct xfrm_pol_inexact_bin *bin;
+
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+ rcu_read_lock();
+ bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
+ rcu_read_unlock();
+
+ return bin;
+}
+
+static struct xfrm_policy *
+__xfrm_policy_eval_candidates(struct hlist_head *chain,
+ struct xfrm_policy *prefer,
+ const struct flowi *fl,
+ u8 type, u16 family, int dir, u32 if_id)
+{ /* Scan @chain for the first policy matching flow @fl.
+ *
+ * The scan stops at the first entry whose priority value exceeds that
+ * of @prefer (no limit when @prefer is NULL). Returns the policy to
+ * use: the match, or @prefer when the match has the same priority and
+ * a higher pos than @prefer. Returns NULL when @chain is NULL or
+ * nothing matches, and an ERR_PTR() when xfrm_policy_match() fails
+ * with anything other than -ESRCH.
+ */
+ u32 priority = prefer ? prefer->priority : ~0u;
+ struct xfrm_policy *pol;
+
+ if (!chain)
+ return NULL;
+
+ hlist_for_each_entry_rcu(pol, chain, bydst) {
+ int err;
+
+ if (pol->priority > priority)
+ break;
+
+ err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
+ if (err) {
+ if (err != -ESRCH)
+ return ERR_PTR(err);
+
+ continue;
+ }
+
+ if (prefer) {
+ /* matches. Is it older than *prefer? */
+ if (pol->priority == priority &&
+ prefer->pos < pol->pos)
+ return prefer;
+ }
+
+ return pol;
+ }
+
+ return NULL;
+}
+
+static struct xfrm_policy *
+xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand,
+ struct xfrm_policy *prefer,
+ const struct flowi *fl,
+ u8 type, u16 family, int dir, u32 if_id)
+{ /* Evaluate every candidate list in @cand in turn, carrying the best
+ * policy found so far forward as the new @prefer.
+ * Returns that policy (possibly the @prefer passed in, possibly NULL)
+ * or the first ERR_PTR() reported for a list.
+ */
+ struct xfrm_policy *tmp;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(cand->res); i++) {
+ tmp = __xfrm_policy_eval_candidates(cand->res[i],
+ prefer,
+ fl, type, family, dir,
+ if_id);
+ if (!tmp)
+ continue;
+
+ if (IS_ERR(tmp))
+ return tmp;
+ prefer = tmp;
+ }
+
+ return prefer;
+}
+
static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
const struct flowi *fl,
u16 family, u8 dir,
u32 if_id)
{
- int err;
- struct xfrm_policy *pol, *ret;
+ struct xfrm_pol_inexact_candidates cand;
const xfrm_address_t *daddr, *saddr;
+ struct xfrm_pol_inexact_bin *bin;
+ struct xfrm_policy *pol, *ret;
struct hlist_head *chain;
unsigned int sequence;
- u32 priority;
+ int err;
daddr = xfrm_flowi_daddr(fl, family);
saddr = xfrm_flowi_saddr(fl, family);
@@ -1112,7 +2096,6 @@
chain = policy_hash_direct(net, daddr, saddr, family, dir);
} while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
- priority = ~0U;
ret = NULL;
hlist_for_each_entry_rcu(pol, chain, bydst) {
err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
@@ -1125,29 +2108,23 @@
}
} else {
ret = pol;
- priority = ret->priority;
break;
}
}
- chain = &net->xfrm.policy_inexact[dir];
- hlist_for_each_entry_rcu(pol, chain, bydst) {
- if ((pol->priority >= priority) && ret)
- break;
+ bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
+ if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr,
+ daddr))
+ goto skip_inexact;
- err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
- if (err) {
- if (err == -ESRCH)
- continue;
- else {
- ret = ERR_PTR(err);
- goto fail;
- }
- } else {
- ret = pol;
- break;
- }
+ pol = xfrm_policy_eval_candidates(&cand, ret, fl, type,
+ family, dir, if_id);
+ if (pol) {
+ ret = pol;
+ if (IS_ERR(pol))
+ goto fail;
}
+skip_inexact:
if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence))
goto retry;
@@ -1239,6 +2216,7 @@
/* Socket policies are not hashed. */
if (!hlist_unhashed(&pol->bydst)) {
hlist_del_rcu(&pol->bydst);
+ hlist_del_init(&pol->bydst_inexact_list);
hlist_del(&pol->byidx);
}
@@ -1478,18 +2456,10 @@
+/* Return the TOS value for @fl: the IPv4 flow's flowi4_tos masked with
+ * IPTOS_RT_MASK when @family is AF_INET, and 0 for every other family.
+ */
static int xfrm_get_tos(const struct flowi *fl, int family)
{
- const struct xfrm_policy_afinfo *afinfo;
- int tos;
+ if (family == AF_INET)
+ return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos;
- afinfo = xfrm_policy_get_afinfo(family);
- if (!afinfo)
- return 0;
-
- tos = afinfo->get_tos(fl);
-
- rcu_read_unlock();
-
- return tos;
+ return 0;
}
static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
@@ -1527,21 +2497,14 @@
return xdst;
}
-static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
- int nfheader_len)
+/* Initialise the family specific part of @path from @dst.
+ *
+ * Only AF_INET6 has work to do: @dst is treated as a struct rt6_info, its
+ * cookie is stored in path->path_cookie and @nfheader_len is recorded in
+ * path->u.rt6.rt6i_nfheader_len.  Any other family is left untouched.
+ */
+static void xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
+ int nfheader_len)
{
- const struct xfrm_policy_afinfo *afinfo =
- xfrm_policy_get_afinfo(dst->ops->family);
- int err;
-
- if (!afinfo)
- return -EINVAL;
-
- err = afinfo->init_path(path, dst, nfheader_len);
-
- rcu_read_unlock();
-
- return err;
+ if (dst->ops->family == AF_INET6) {
+ struct rt6_info *rt = (struct rt6_info *)dst;
+ path->path_cookie = rt6_get_cookie(rt);
+ path->u.rt6.rt6i_nfheader_len = nfheader_len;
+ }
}
static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
@@ -1573,10 +2536,11 @@
const struct flowi *fl,
struct dst_entry *dst)
{
+ const struct xfrm_state_afinfo *afinfo;
+ const struct xfrm_mode *inner_mode;
struct net *net = xp_net(policy);
unsigned long now = jiffies;
struct net_device *dev;
- struct xfrm_mode *inner_mode;
struct xfrm_dst *xdst_prev = NULL;
struct xfrm_dst *xdst0 = NULL;
int i = 0;
@@ -1622,13 +2586,16 @@
goto put_states;
}
} else
- inner_mode = xfrm[i]->inner_mode;
+ inner_mode = &xfrm[i]->inner_mode;
xdst->route = dst;
dst_copy_metrics(dst1, dst);
if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
- __u32 mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]);
+ __u32 mark = 0;
+
+ if (xfrm[i]->props.smark.v || xfrm[i]->props.smark.m)
+ mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]);
family = xfrm[i]->props.family;
dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif,
@@ -1647,7 +2614,14 @@
dst1->lastuse = now;
dst1->input = dst_discard;
- dst1->output = inner_mode->afinfo->output;
+
+ rcu_read_lock();
+ afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family);
+ if (likely(afinfo))
+ dst1->output = afinfo->output;
+ else
+ dst1->output = dst_discard_out;
+ rcu_read_unlock();
xdst_prev = xdst;
@@ -1811,7 +2785,7 @@
pq->timeout = pq->timeout << 1;
if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout))
xfrm_pol_hold(pol);
- goto out;
+ goto out;
}
dst_release(dst);
@@ -1834,7 +2808,7 @@
continue;
}
- nf_reset(skb);
+ nf_reset_ct(skb);
skb_dst_drop(skb);
skb_dst_set(skb, dst);
@@ -2225,11 +3199,12 @@
static inline int
xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl)
{
+ struct sec_path *sp = skb_sec_path(skb);
struct xfrm_state *x;
- if (!skb->sp || idx < 0 || idx >= skb->sp->len)
+ if (!sp || idx < 0 || idx >= sp->len)
return 0;
- x = skb->sp->xvec[idx];
+ x = sp->xvec[idx];
if (!x->type->reject)
return 0;
return x->type->reject(x, skb, fl);
@@ -2287,20 +3262,231 @@
return start;
}
+/*
+ * Fill fl->u.ip4 from the IPv4 header of @skb.
+ *
+ * @skb:     packet; its network header is read through ip_hdr()
+ * @fl:      flow to populate; only the ip4 member is written
+ * @reverse: when true, source/destination addresses and ports are swapped
+ *           and skb->skb_iif is used as flowi4_oif instead of the ifindex
+ *           of the skb's dst device
+ *
+ * The flowi4 is zeroed first, then mark, oif, protocol, addresses and tos
+ * are copied.  For packets that are not fragments, protocol specific
+ * selectors (ports, ICMP type/code, IPsec SPI/CPI, GRE key) are decoded
+ * as well, provided the needed transport header bytes are available.
+ */
+static void
+decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ int ihl = iph->ihl;
+ /* transport header start: ihl is scaled by 4 to get a byte offset */
+ u8 *xprth = skb_network_header(skb) + ihl * 4;
+ struct flowi4 *fl4 = &fl->u.ip4;
+ int oif = 0;
+
+ /* oif stays 0 when the skb has no dst or the dst has no device */
+ if (skb_dst(skb) && skb_dst(skb)->dev)
+ oif = skb_dst(skb)->dev->ifindex;
+
+ memset(fl4, 0, sizeof(struct flowi4));
+ fl4->flowi4_mark = skb->mark;
+ fl4->flowi4_oif = reverse ? skb->skb_iif : oif;
+
+ fl4->flowi4_proto = iph->protocol;
+ fl4->daddr = reverse ? iph->saddr : iph->daddr;
+ fl4->saddr = reverse ? iph->daddr : iph->saddr;
+ fl4->flowi4_tos = iph->tos;
+
+ /* transport level selectors are only decoded for non-fragments */
+ if (!ip_is_fragment(iph)) {
+ switch (iph->protocol) {
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ case IPPROTO_TCP:
+ case IPPROTO_SCTP:
+ case IPPROTO_DCCP:
+ /* needs the first 4 transport bytes: two 16-bit ports */
+ if (xprth + 4 < skb->data ||
+ pskb_may_pull(skb, xprth + 4 - skb->data)) {
+ __be16 *ports;
+
+ /* re-derived after the pull; presumably because the
+ * pull can relocate the header -- TODO confirm
+ */
+ xprth = skb_network_header(skb) + ihl * 4;
+ ports = (__be16 *)xprth;
+
+ /* word 0 -> sport, word 1 -> dport; swapped if reverse */
+ fl4->fl4_sport = ports[!!reverse];
+ fl4->fl4_dport = ports[!reverse];
+ }
+ break;
+ case IPPROTO_ICMP:
+ /* needs 2 bytes: type and code */
+ if (xprth + 2 < skb->data ||
+ pskb_may_pull(skb, xprth + 2 - skb->data)) {
+ u8 *icmp;
+
+ xprth = skb_network_header(skb) + ihl * 4;
+ icmp = xprth;
+
+ fl4->fl4_icmp_type = icmp[0];
+ fl4->fl4_icmp_code = icmp[1];
+ }
+ break;
+ case IPPROTO_ESP:
+ /* SPI is taken from the first 32-bit word */
+ if (xprth + 4 < skb->data ||
+ pskb_may_pull(skb, xprth + 4 - skb->data)) {
+ __be32 *ehdr;
+
+ xprth = skb_network_header(skb) + ihl * 4;
+ ehdr = (__be32 *)xprth;
+
+ fl4->fl4_ipsec_spi = ehdr[0];
+ }
+ break;
+ case IPPROTO_AH:
+ /* SPI is taken from the second 32-bit word */
+ if (xprth + 8 < skb->data ||
+ pskb_may_pull(skb, xprth + 8 - skb->data)) {
+ __be32 *ah_hdr;
+
+ xprth = skb_network_header(skb) + ihl * 4;
+ ah_hdr = (__be32 *)xprth;
+
+ fl4->fl4_ipsec_spi = ah_hdr[1];
+ }
+ break;
+ case IPPROTO_COMP:
+ /* second 16-bit word, widened to a 32-bit network-order value */
+ if (xprth + 4 < skb->data ||
+ pskb_may_pull(skb, xprth + 4 - skb->data)) {
+ __be16 *ipcomp_hdr;
+
+ xprth = skb_network_header(skb) + ihl * 4;
+ ipcomp_hdr = (__be16 *)xprth;
+
+ fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
+ }
+ break;
+ case IPPROTO_GRE:
+ /* 12 bytes cover flags word, an optional extra word and the key */
+ if (xprth + 12 < skb->data ||
+ pskb_may_pull(skb, xprth + 12 - skb->data)) {
+ __be16 *greflags;
+ __be32 *gre_hdr;
+
+ xprth = skb_network_header(skb) + ihl * 4;
+ greflags = (__be16 *)xprth;
+ gre_hdr = (__be32 *)xprth;
+
+ /* the key is only read when GRE_KEY is set; one more
+ * 32-bit word is skipped when GRE_CSUM is set too
+ */
+ if (greflags[0] & GRE_KEY) {
+ if (greflags[0] & GRE_CSUM)
+ gre_hdr++;
+ fl4->fl4_gre_key = gre_hdr[1];
+ }
+ }
+ break;
+ default:
+ /* no selector decoded for other protocols */
+ fl4->fl4_ipsec_spi = 0;
+ break;
+ }
+ }
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/*
+ * Fill fl->u.ip6 from the IPv6 header chain of @skb.
+ *
+ * @skb:     packet; its network header is read through ipv6_hdr()
+ * @fl:      flow to populate; only the ip6 member is written
+ * @reverse: when true, source/destination addresses and ports are swapped
+ *           and skb->skb_iif is used as flowi6_oif instead of the ifindex
+ *           of the skb's dst device
+ *
+ * The flowi6 is zeroed, mark/oif/addresses are copied, then the header
+ * chain is walked starting right after the fixed IPv6 header.  Hop-by-hop,
+ * routing, destination and fragment headers are stepped over; the first
+ * other header type ends the walk, sets flowi6_proto and returns.  Once a
+ * fragment header has been seen, ports, ICMPv6 type/code and MH type are
+ * no longer decoded.  If the loop condition fails before such a header is
+ * reached, the function returns with flowi6_proto still zero.
+ */
+static void
+decode_session6(struct sk_buff *skb, struct flowi *fl, bool reverse)
+{
+ struct flowi6 *fl6 = &fl->u.ip6;
+ int onlyproto = 0;
+ const struct ipv6hdr *hdr = ipv6_hdr(skb);
+ u32 offset = sizeof(*hdr);
+ struct ipv6_opt_hdr *exthdr;
+ const unsigned char *nh = skb_network_header(skb);
+ u16 nhoff = IP6CB(skb)->nhoff;
+ int oif = 0;
+ u8 nexthdr;
+
+ /* no recorded offset: use the nexthdr field of the fixed header */
+ if (!nhoff)
+ nhoff = offsetof(struct ipv6hdr, nexthdr);
+
+ nexthdr = nh[nhoff];
+
+ /* oif stays 0 when the skb has no dst or the dst has no device */
+ if (skb_dst(skb) && skb_dst(skb)->dev)
+ oif = skb_dst(skb)->dev->ifindex;
+
+ memset(fl6, 0, sizeof(struct flowi6));
+ fl6->flowi6_mark = skb->mark;
+ fl6->flowi6_oif = reverse ? skb->skb_iif : oif;
+
+ fl6->daddr = reverse ? hdr->saddr : hdr->daddr;
+ fl6->saddr = reverse ? hdr->daddr : hdr->saddr;
+
+ /* each round needs at least a generic option header at 'offset' */
+ while (nh + offset + sizeof(*exthdr) < skb->data ||
+ pskb_may_pull(skb, nh + offset + sizeof(*exthdr) - skb->data)) {
+ /* re-derived every round; presumably because the pull can
+ * relocate the header -- TODO confirm
+ */
+ nh = skb_network_header(skb);
+ exthdr = (struct ipv6_opt_hdr *)(nh + offset);
+
+ switch (nexthdr) {
+ case NEXTHDR_FRAGMENT:
+ /* from now on only the protocol number is recorded */
+ onlyproto = 1;
+ /* fall through */
+ case NEXTHDR_ROUTING:
+ case NEXTHDR_HOP:
+ case NEXTHDR_DEST:
+ /* step over this extension header and keep walking */
+ offset += ipv6_optlen(exthdr);
+ nexthdr = exthdr->nexthdr;
+ exthdr = (struct ipv6_opt_hdr *)(nh + offset);
+ break;
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ case IPPROTO_TCP:
+ case IPPROTO_SCTP:
+ case IPPROTO_DCCP:
+ /* needs the first 4 transport bytes: two 16-bit ports */
+ if (!onlyproto && (nh + offset + 4 < skb->data ||
+ pskb_may_pull(skb, nh + offset + 4 - skb->data))) {
+ __be16 *ports;
+
+ nh = skb_network_header(skb);
+ ports = (__be16 *)(nh + offset);
+ /* word 0 -> sport, word 1 -> dport; swapped if reverse */
+ fl6->fl6_sport = ports[!!reverse];
+ fl6->fl6_dport = ports[!reverse];
+ }
+ fl6->flowi6_proto = nexthdr;
+ return;
+ case IPPROTO_ICMPV6:
+ /* needs 2 bytes: type and code */
+ if (!onlyproto && (nh + offset + 2 < skb->data ||
+ pskb_may_pull(skb, nh + offset + 2 - skb->data))) {
+ u8 *icmp;
+
+ nh = skb_network_header(skb);
+ icmp = (u8 *)(nh + offset);
+ fl6->fl6_icmp_type = icmp[0];
+ fl6->fl6_icmp_code = icmp[1];
+ }
+ fl6->flowi6_proto = nexthdr;
+ return;
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ case IPPROTO_MH:
+ /* NOTE(review): offset is advanced by the length of the
+ * header at the current position before the MH type is
+ * read from the new offset -- confirm this is intended.
+ */
+ offset += ipv6_optlen(exthdr);
+ if (!onlyproto && (nh + offset + 3 < skb->data ||
+ pskb_may_pull(skb, nh + offset + 3 - skb->data))) {
+ struct ip6_mh *mh;
+
+ nh = skb_network_header(skb);
+ mh = (struct ip6_mh *)(nh + offset);
+ fl6->fl6_mh_type = mh->ip6mh_type;
+ }
+ fl6->flowi6_proto = nexthdr;
+ return;
+#endif
+ /* XXX Why are there these headers? */
+ case IPPROTO_AH:
+ case IPPROTO_ESP:
+ case IPPROTO_COMP:
+ default:
+ /* no selector decoded here: SPI is zero, protocol recorded */
+ fl6->fl6_ipsec_spi = 0;
+ fl6->flowi6_proto = nexthdr;
+ return;
+ }
+ }
+}
+#endif
+
+/* Decode @skb into @fl according to @family.
+ *
+ * AF_INET is handled by decode_session4() and, when CONFIG_IPV6 is
+ * enabled, AF_INET6 by decode_session6(); @reverse is passed through to
+ * them.  Any other family yields -EAFNOSUPPORT.  On the supported paths
+ * the return value is that of security_xfrm_decode_session(), which is
+ * handed @skb and &fl->flowi_secid.
+ */
int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
unsigned int family, int reverse)
{
- const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
- int err;
-
- if (unlikely(afinfo == NULL))
+ switch (family) {
+ case AF_INET:
+ decode_session4(skb, fl, reverse);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ decode_session6(skb, fl, reverse);
+ break;
+#endif
+ default:
return -EAFNOSUPPORT;
+ }
- afinfo->decode_session(skb, fl, reverse);
-
- err = security_xfrm_decode_session(skb, &fl->flowi_secid);
- rcu_read_unlock();
- return err;
+ return security_xfrm_decode_session(skb, &fl->flowi_secid);
}
EXPORT_SYMBOL(__xfrm_decode_session);
@@ -2329,6 +3515,7 @@
struct flowi fl;
int xerr_idx = -1;
const struct xfrm_if_cb *ifcb;
+ struct sec_path *sp;
struct xfrm_if *xi;
u32 if_id = 0;
@@ -2336,9 +3523,11 @@
ifcb = xfrm_if_get_cb();
if (ifcb) {
- xi = ifcb->decode_session(skb);
- if (xi)
+ xi = ifcb->decode_session(skb, family);
+ if (xi) {
if_id = xi->p.if_id;
+ net = xi->net;
+ }
}
rcu_read_unlock();
@@ -2353,11 +3542,12 @@
nf_nat_decode_session(skb, &fl, family);
/* First, check used SA against their selectors. */
- if (skb->sp) {
+ sp = skb_sec_path(skb);
+ if (sp) {
int i;
- for (i = skb->sp->len-1; i >= 0; i--) {
- struct xfrm_state *x = skb->sp->xvec[i];
+ for (i = sp->len - 1; i >= 0; i--) {
+ struct xfrm_state *x = sp->xvec[i];
if (!xfrm_selector_match(&x->sel, &fl, family)) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH);
return 0;
@@ -2384,7 +3574,7 @@
}
if (!pol) {
- if (skb->sp && secpath_has_nontransport(skb->sp, 0, &xerr_idx)) {
+ if (sp && secpath_has_nontransport(sp, 0, &xerr_idx)) {
xfrm_secpath_reject(xerr_idx, skb, &fl);
XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
return 0;
@@ -2413,7 +3603,6 @@
#endif
if (pol->action == XFRM_POLICY_ALLOW) {
- struct sec_path *sp;
static struct sec_path dummy;
struct xfrm_tmpl *tp[XFRM_MAX_DEPTH];
struct xfrm_tmpl *stp[XFRM_MAX_DEPTH];
@@ -2421,7 +3610,8 @@
int ti = 0;
int i, k;
- if ((sp = skb->sp) == NULL)
+ sp = skb_sec_path(skb);
+ if (!sp)
sp = &dummy;
for (pi = 0; pi < npols; pi++) {
@@ -2439,7 +3629,7 @@
}
xfrm_nr = ti;
if (npols > 1) {
- xfrm_tmpl_sort(stp, tpp, xfrm_nr, family, net);
+ xfrm_tmpl_sort(stp, tpp, xfrm_nr, family);
tpp = stp;
}
@@ -2816,13 +4006,17 @@
static int __net_init xfrm_policy_init(struct net *net)
{
unsigned int hmask, sz;
- int dir;
+ int dir, err;
- if (net_eq(net, &init_net))
+ if (net_eq(net, &init_net)) {
xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
sizeof(struct xfrm_dst),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
+ err = rhashtable_init(&xfrm_policy_inexact_table,
+ &xfrm_pol_inexact_params);
+ BUG_ON(err);
+ }
hmask = 8 - 1;
sz = (hmask+1) * sizeof(struct hlist_head);
@@ -2857,6 +4051,7 @@
seqlock_init(&net->xfrm.policy_hthresh.lock);
INIT_LIST_HEAD(&net->xfrm.policy_all);
+ INIT_LIST_HEAD(&net->xfrm.inexact_bins);
INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild);
return 0;
@@ -2875,6 +4070,7 @@
static void xfrm_policy_fini(struct net *net)
{
+ struct xfrm_pol_inexact_bin *b, *t;
unsigned int sz;
int dir;
@@ -2900,6 +4096,11 @@
sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head);
WARN_ON(!hlist_empty(net->xfrm.policy_byidx));
xfrm_hash_free(net->xfrm.policy_byidx, sz);
+
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+ list_for_each_entry_safe(b, t, &net->xfrm.inexact_bins, inexact_bins)
+ __xfrm_policy_inexact_prune_bin(b, true);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
}
static int __net_init xfrm_net_init(struct net *net)
@@ -3065,7 +4266,7 @@
}
}
chain = &net->xfrm.policy_inexact[dir];
- hlist_for_each_entry(pol, chain, bydst) {
+ hlist_for_each_entry(pol, chain, bydst_inexact_list) {
if ((pol->priority >= priority) && ret)
break;