Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index e957413..2985509 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Traffic control configuration.
#
@@ -194,6 +195,17 @@
To compile this code as a module, choose M here: the
module will be called sch_etf.
+config NET_SCH_TAPRIO
+ tristate "Time Aware Priority (taprio) Scheduler"
+ help
+ Say Y here if you want to use the Time Aware Priority (taprio) packet
+ scheduling algorithm.
+
+ See the top of <file:net/sched/sch_taprio.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_taprio.
+
config NET_SCH_GRED
tristate "Generic Random Early Detection (GRED)"
---help---
@@ -312,7 +324,7 @@
tristate "Common Applications Kept Enhanced (CAKE)"
help
Say Y here if you want to use the Common Applications Kept Enhanced
- (CAKE) queue management algorithm.
+ (CAKE) queue management algorithm.
To compile this driver as a module, choose M here: the module
will be called sch_cake.
@@ -347,8 +359,7 @@
help
Say Y here if you want to use the Proportional Integral controller
Enhanced scheduler packet scheduling algorithm.
- For more information, please see
- http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
+ For more information, please see https://tools.ietf.org/html/rfc8033
To compile this driver as a module, choose M here: the module
will be called sch_pie.
@@ -719,8 +730,8 @@
config NET_ACT_POLICE
tristate "Traffic Policing"
- depends on NET_CLS_ACT
- ---help---
+ depends on NET_CLS_ACT
+ ---help---
Say Y here if you want to do traffic policing, i.e. strict
bandwidth limiting. This action replaces the existing policing
module.
@@ -729,9 +740,9 @@
module will be called act_police.
config NET_ACT_GACT
- tristate "Generic actions"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Generic actions"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to take generic actions such as dropping and
accepting packets.
@@ -739,15 +750,15 @@
module will be called act_gact.
config GACT_PROB
- bool "Probability support"
- depends on NET_ACT_GACT
- ---help---
+ bool "Probability support"
+ depends on NET_ACT_GACT
+ ---help---
Say Y here to use the generic action randomly or deterministically.
config NET_ACT_MIRRED
- tristate "Redirecting and Mirroring"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Redirecting and Mirroring"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to allow packets to be mirrored or redirected to
other devices.
@@ -755,10 +766,10 @@
module will be called act_mirred.
config NET_ACT_SAMPLE
- tristate "Traffic Sampling"
- depends on NET_CLS_ACT
- select PSAMPLE
- ---help---
+ tristate "Traffic Sampling"
+ depends on NET_CLS_ACT
+ select PSAMPLE
+ ---help---
Say Y here to allow packet sampling tc action. The packet sample
action consists of statistically choosing packets and sampling
them using the psample module.
@@ -767,9 +778,9 @@
module will be called act_sample.
config NET_ACT_IPT
- tristate "IPtables targets"
- depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
- ---help---
+ tristate "IPtables targets"
+ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ ---help---
Say Y here to be able to invoke iptables targets after successful
classification.
@@ -777,9 +788,9 @@
module will be called act_ipt.
config NET_ACT_NAT
- tristate "Stateless NAT"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Stateless NAT"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to do stateless NAT on IPv4 packets. You should use
netfilter for NAT unless you know what you are doing.
@@ -787,18 +798,18 @@
module will be called act_nat.
config NET_ACT_PEDIT
- tristate "Packet Editing"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Packet Editing"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here if you want to mangle the content of packets.
To compile this code as a module, choose M here: the
module will be called act_pedit.
config NET_ACT_SIMP
- tristate "Simple Example (Debug)"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Simple Example (Debug)"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to add a simple action for demonstration purposes.
It is meant as an example and for debugging purposes. It will
print a configured policy string followed by the packet count
@@ -810,9 +821,9 @@
module will be called act_simple.
config NET_ACT_SKBEDIT
- tristate "SKB Editing"
- depends on NET_CLS_ACT
- ---help---
+ tristate "SKB Editing"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to change skb priority or queue_mapping settings.
If unsure, say N.
@@ -821,20 +832,31 @@
module will be called act_skbedit.
config NET_ACT_CSUM
- tristate "Checksum Updating"
- depends on NET_CLS_ACT && INET
- select LIBCRC32C
- ---help---
+ tristate "Checksum Updating"
+ depends on NET_CLS_ACT && INET
+ select LIBCRC32C
+ ---help---
Say Y here to update some common checksum after some direct
packet alterations.
To compile this code as a module, choose M here: the
module will be called act_csum.
+config NET_ACT_MPLS
+ tristate "MPLS manipulation"
+ depends on NET_CLS_ACT
+ help
+ Say Y here to push or pop MPLS headers.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_mpls.
+
config NET_ACT_VLAN
- tristate "Vlan manipulation"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Vlan manipulation"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to push or pop vlan headers.
If unsure, say N.
@@ -843,9 +865,9 @@
module will be called act_vlan.
config NET_ACT_BPF
- tristate "BPF based action"
- depends on NET_CLS_ACT
- ---help---
+ tristate "BPF based action"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to execute BPF code on packets. The BPF code will decide
if the packet should be dropped or not.
@@ -855,10 +877,10 @@
module will be called act_bpf.
config NET_ACT_CONNMARK
- tristate "Netfilter Connection Mark Retriever"
- depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
- depends on NF_CONNTRACK && NF_CONNTRACK_MARK
- ---help---
+ tristate "Netfilter Connection Mark Retriever"
+ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+ ---help---
Say Y here to allow retrieving of conn mark
If unsure, say N.
@@ -866,22 +888,39 @@
To compile this code as a module, choose M here: the
module will be called act_connmark.
+config NET_ACT_CTINFO
+ tristate "Netfilter Connection Mark Actions"
+ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+ help
+ Say Y here to allow transfer of information stored in a connmark.
+ Current actions transfer the DSCP stored in the connmark into the
+ ipv4/v6 diffserv field and/or transfer the connmark to the packet
+ mark. Both are useful for restoring egress based marks
+ back onto ingress connections for qdisc priority mapping
+ purposes.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_ctinfo.
+
config NET_ACT_SKBMOD
- tristate "skb data modification action"
- depends on NET_CLS_ACT
- ---help---
- Say Y here to allow modification of skb data
+ tristate "skb data modification action"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to allow modification of skb data
- If unsure, say N.
+ If unsure, say N.
- To compile this code as a module, choose M here: the
- module will be called act_skbmod.
+ To compile this code as a module, choose M here: the
+ module will be called act_skbmod.
config NET_ACT_IFE
- tristate "Inter-FE action based on IETF ForCES InterFE LFB"
- depends on NET_CLS_ACT
- select NET_IFE
- ---help---
+ tristate "Inter-FE action based on IETF ForCES InterFE LFB"
+ depends on NET_CLS_ACT
+ select NET_IFE
+ ---help---
Say Y here to allow for sourcing and terminating metadata
For details refer to netdev01 paper:
"Distributing Linux Traffic Control Classifier-Action Subsystem"
@@ -891,9 +930,9 @@
module will be called act_ife.
config NET_ACT_TUNNEL_KEY
- tristate "IP tunnel metadata manipulation"
- depends on NET_CLS_ACT
- ---help---
+ tristate "IP tunnel metadata manipulation"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to set/release ip tunnel metadata.
If unsure, say N.
@@ -901,25 +940,40 @@
To compile this code as a module, choose M here: the
module will be called act_tunnel_key.
+config NET_ACT_CT
+ tristate "connection tracking tc action"
+ depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT
+ help
+ Say Y here to allow sending the packets to conntrack module.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_ct.
+
config NET_IFE_SKBMARK
- tristate "Support to encoding decoding skb mark on IFE action"
- depends on NET_ACT_IFE
+ tristate "Support to encoding decoding skb mark on IFE action"
+ depends on NET_ACT_IFE
config NET_IFE_SKBPRIO
- tristate "Support to encoding decoding skb prio on IFE action"
- depends on NET_ACT_IFE
+ tristate "Support to encoding decoding skb prio on IFE action"
+ depends on NET_ACT_IFE
config NET_IFE_SKBTCINDEX
- tristate "Support to encoding decoding skb tcindex on IFE action"
- depends on NET_ACT_IFE
+ tristate "Support to encoding decoding skb tcindex on IFE action"
+ depends on NET_ACT_IFE
-config NET_CLS_IND
- bool "Incoming device classification"
- depends on NET_CLS_U32 || NET_CLS_FW
- ---help---
- Say Y here to extend the u32 and fw classifier to support
- classification based on the incoming device. This option is
- likely to disappear in favour of the metadata ematch.
+config NET_TC_SKB_EXT
+ bool "TC recirculation support"
+ depends on NET_CLS_ACT
+ select SKB_EXTENSIONS
+
+ help
+ Say Y here to allow tc chain misses to continue in OvS datapath in
+ the correct recirc_id, and hardware chain misses to continue in
+ the correct chain in tc software datapath.
+
+ Say N here if you won't be using tc<->ovs offload or tc chains offload.
endif # NET_SCHED
diff --git a/net/sched/Makefile b/net/sched/Makefile
index f0403f4..415d1e1 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -18,15 +18,18 @@
obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o
obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o
obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
+obj-$(CONFIG_NET_ACT_MPLS) += act_mpls.o
obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o
obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o
obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o
+obj-$(CONFIG_NET_ACT_CTINFO) += act_ctinfo.o
obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o
obj-$(CONFIG_NET_ACT_IFE) += act_ife.o
obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o
obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o
obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o
obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
+obj-$(CONFIG_NET_ACT_CT) += act_ct.o
obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
@@ -57,6 +60,7 @@
obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o
obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o
obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o
+obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index e12f8ef..69d4676 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -1,14 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_api.c Packet action API.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Author: Jamal Hadi Salim
- *
- *
*/
#include <linux/types.h>
@@ -21,8 +15,6 @@
#include <linux/kmod.h>
#include <linux/err.h>
#include <linux/module.h>
-#include <linux/rhashtable.h>
-#include <linux/list.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/sch_generic.h>
@@ -30,27 +22,10 @@
#include <net/act_api.h>
#include <net/netlink.h>
-static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp)
-{
- u32 chain_index = a->tcfa_action & TC_ACT_EXT_VAL_MASK;
-
- if (!tp)
- return -EINVAL;
- a->goto_chain = tcf_chain_get_by_act(tp->chain->block, chain_index);
- if (!a->goto_chain)
- return -ENOMEM;
- return 0;
-}
-
-static void tcf_action_goto_chain_fini(struct tc_action *a)
-{
- tcf_chain_put_by_act(a->goto_chain);
-}
-
static void tcf_action_goto_chain_exec(const struct tc_action *a,
struct tcf_result *res)
{
- const struct tcf_chain *chain = a->goto_chain;
+ const struct tcf_chain *chain = rcu_dereference_bh(a->goto_chain);
res->goto_tp = rcu_dereference_bh(chain->filter_chain);
}
@@ -73,6 +48,51 @@
call_rcu(&old->rcu, tcf_free_cookie_rcu);
}
+int tcf_action_check_ctrlact(int action, struct tcf_proto *tp,
+ struct tcf_chain **newchain,
+ struct netlink_ext_ack *extack)
+{ /* Validate control action @action; for goto-chain actions also fetch the target chain into *@newchain. Returns 0 or a negative errno, with a message set on @extack. */
+ int opcode = TC_ACT_EXT_OPCODE(action), ret = -EINVAL; /* ret stays -EINVAL unless one of the checks below accepts the action */
+ u32 chain_index;
+
+ if (!opcode) /* no extended opcode: accept only values up to TC_ACT_VALUE_MAX */
+ ret = action > TC_ACT_VALUE_MAX ? -EINVAL : 0;
+ else if (opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC) /* known extended opcode, or the TC_ACT_UNSPEC special value */
+ ret = 0;
+ if (ret) {
+ NL_SET_ERR_MSG(extack, "invalid control action");
+ goto end;
+ }
+
+ if (TC_ACT_EXT_CMP(action, TC_ACT_GOTO_CHAIN)) { /* only goto-chain actions need a chain lookup; *newchain is left untouched otherwise */
+ chain_index = action & TC_ACT_EXT_VAL_MASK; /* bits selected by TC_ACT_EXT_VAL_MASK carry the chain index */
+ if (!tp || !newchain) { /* need both a proto (to reach its block) and somewhere to store the chain */
+ ret = -EINVAL;
+ NL_SET_ERR_MSG(extack,
+ "can't goto NULL proto/chain");
+ goto end;
+ }
+ *newchain = tcf_chain_get_by_act(tp->chain->block, chain_index); /* NOTE(review): presumably takes a chain reference the caller must release - confirm */
+ if (!*newchain) { /* a NULL result is reported as an allocation failure */
+ ret = -ENOMEM;
+ NL_SET_ERR_MSG(extack,
+ "can't allocate goto_chain");
+ }
+ }
+end:
+ return ret;
+}
+EXPORT_SYMBOL(tcf_action_check_ctrlact);
+
+struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action,
+ struct tcf_chain *goto_chain)
+{ /* Store control action @action and chain @goto_chain on @a; returns whatever the swap below leaves in the local goto_chain. */
+ a->tcfa_action = action;
+ rcu_swap_protected(a->goto_chain, goto_chain, 1); /* exchanges a->goto_chain with the local pointer, so goto_chain should now hold the previous chain (may be NULL) */
+ return goto_chain; /* NOTE(review): caller presumably has to release this old chain - confirm against callers */
+}
+EXPORT_SYMBOL(tcf_action_set_ctrlact);
+
/* XXX: For standalone actions, we don't need a RCU grace period either, because
* actions are always connected to filters and filters are already destroyed in
* RCU callbacks, so after a RCU grace period actions are already disconnected
@@ -80,12 +100,15 @@
*/
static void free_tcf(struct tc_action *p)
{
+ struct tcf_chain *chain = rcu_dereference_protected(p->goto_chain, 1);
+
free_percpu(p->cpu_bstats);
+ free_percpu(p->cpu_bstats_hw);
free_percpu(p->cpu_qstats);
tcf_set_action_cookie(&p->act_cookie, NULL);
- if (p->goto_chain)
- tcf_action_goto_chain_fini(p);
+ if (chain)
+ tcf_chain_put_by_act(chain);
kfree(p);
}
@@ -103,11 +126,11 @@
{
struct tcf_idrinfo *idrinfo = p->idrinfo;
- if (refcount_dec_and_lock(&p->tcfa_refcnt, &idrinfo->lock)) {
+ if (refcount_dec_and_mutex_lock(&p->tcfa_refcnt, &idrinfo->lock)) {
if (bind)
atomic_dec(&p->tcfa_bindcnt);
idr_remove(&idrinfo->action_idr, p->tcfa_index);
- spin_unlock(&idrinfo->lock);
+ mutex_unlock(&idrinfo->lock);
tcf_action_cleanup(p);
return 1;
@@ -198,12 +221,13 @@
struct idr *idr = &idrinfo->action_idr;
struct tc_action *p;
unsigned long id = 1;
+ unsigned long tmp;
- spin_lock(&idrinfo->lock);
+ mutex_lock(&idrinfo->lock);
s_i = cb->args[0];
- idr_for_each_entry_ul(idr, p, id) {
+ idr_for_each_entry_ul(idr, p, tmp, id) {
index++;
if (index < s_i)
continue;
@@ -213,7 +237,7 @@
(unsigned long)p->tcfa_tm.lastuse))
continue;
- nest = nla_nest_start(skb, n_i);
+ nest = nla_nest_start_noflag(skb, n_i);
if (!nest) {
index--;
goto nla_put_failure;
@@ -234,7 +258,7 @@
if (index >= 0)
cb->args[0] = index + 1;
- spin_unlock(&idrinfo->lock);
+ mutex_unlock(&idrinfo->lock);
if (n_i) {
if (act_flags & TCA_FLAG_LARGE_DUMP_ON)
cb->args[1] = n_i;
@@ -246,6 +270,20 @@
goto done;
}
+static int tcf_idr_release_unsafe(struct tc_action *p)
+{ /* Drop one reference on @p without taking idrinfo->lock here; the tcf_del_walker() caller holds that mutex around the call. */
+ if (atomic_read(&p->tcfa_bindcnt) > 0) /* action still has binders: refuse to release it */
+ return -EPERM;
+
+ if (refcount_dec_and_test(&p->tcfa_refcnt)) { /* that was the last reference: unpublish and clean up */
+ idr_remove(&p->idrinfo->action_idr, p->tcfa_index);
+ tcf_action_cleanup(p);
+ return ACT_P_DELETED; /* lets the caller account for the deleted action */
+ }
+
+ return 0; /* reference dropped, but the action is still referenced elsewhere */
+}
+
static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
const struct tc_action_ops *ops)
{
@@ -255,22 +293,27 @@
struct idr *idr = &idrinfo->action_idr;
struct tc_action *p;
unsigned long id = 1;
+ unsigned long tmp;
- nest = nla_nest_start(skb, 0);
+ nest = nla_nest_start_noflag(skb, 0);
if (nest == NULL)
goto nla_put_failure;
if (nla_put_string(skb, TCA_KIND, ops->kind))
goto nla_put_failure;
- idr_for_each_entry_ul(idr, p, id) {
- ret = __tcf_idr_release(p, false, true);
+ mutex_lock(&idrinfo->lock);
+ idr_for_each_entry_ul(idr, p, tmp, id) {
+ ret = tcf_idr_release_unsafe(p);
if (ret == ACT_P_DELETED) {
module_put(ops->owner);
n_i++;
} else if (ret < 0) {
+ mutex_unlock(&idrinfo->lock);
goto nla_put_failure;
}
}
+ mutex_unlock(&idrinfo->lock);
+
if (nla_put_u32(skb, TCA_FCNT, n_i))
goto nla_put_failure;
nla_nest_end(skb, nest);
@@ -305,13 +348,13 @@
struct tcf_idrinfo *idrinfo = tn->idrinfo;
struct tc_action *p;
- spin_lock(&idrinfo->lock);
+ mutex_lock(&idrinfo->lock);
p = idr_find(&idrinfo->action_idr, index);
if (IS_ERR(p))
p = NULL;
else if (p)
refcount_inc(&p->tcfa_refcnt);
- spin_unlock(&idrinfo->lock);
+ mutex_unlock(&idrinfo->lock);
if (p) {
*a = p;
@@ -326,10 +369,10 @@
struct tc_action *p;
int ret = 0;
- spin_lock(&idrinfo->lock);
+ mutex_lock(&idrinfo->lock);
p = idr_find(&idrinfo->action_idr, index);
if (!p) {
- spin_unlock(&idrinfo->lock);
+ mutex_unlock(&idrinfo->lock);
return -ENOENT;
}
@@ -339,7 +382,7 @@
WARN_ON(p != idr_remove(&idrinfo->action_idr,
p->tcfa_index));
- spin_unlock(&idrinfo->lock);
+ mutex_unlock(&idrinfo->lock);
tcf_action_cleanup(p);
module_put(owner);
@@ -350,7 +393,7 @@
ret = -EPERM;
}
- spin_unlock(&idrinfo->lock);
+ mutex_unlock(&idrinfo->lock);
return ret;
}
@@ -372,9 +415,12 @@
p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
if (!p->cpu_bstats)
goto err1;
+ p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+ if (!p->cpu_bstats_hw)
+ goto err2;
p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
if (!p->cpu_qstats)
- goto err2;
+ goto err3;
}
spin_lock_init(&p->tcfa_lock);
p->tcfa_index = index;
@@ -386,15 +432,17 @@
&p->tcfa_rate_est,
&p->tcfa_lock, NULL, est);
if (err)
- goto err3;
+ goto err4;
}
p->idrinfo = idrinfo;
p->ops = ops;
*a = p;
return 0;
-err3:
+err4:
free_percpu(p->cpu_qstats);
+err3:
+ free_percpu(p->cpu_bstats_hw);
err2:
free_percpu(p->cpu_bstats);
err1:
@@ -407,10 +455,10 @@
{
struct tcf_idrinfo *idrinfo = tn->idrinfo;
- spin_lock(&idrinfo->lock);
+ mutex_lock(&idrinfo->lock);
/* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */
WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index)));
- spin_unlock(&idrinfo->lock);
+ mutex_unlock(&idrinfo->lock);
}
EXPORT_SYMBOL(tcf_idr_insert);
@@ -420,10 +468,10 @@
{
struct tcf_idrinfo *idrinfo = tn->idrinfo;
- spin_lock(&idrinfo->lock);
+ mutex_lock(&idrinfo->lock);
/* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */
WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index)));
- spin_unlock(&idrinfo->lock);
+ mutex_unlock(&idrinfo->lock);
}
EXPORT_SYMBOL(tcf_idr_cleanup);
@@ -441,14 +489,14 @@
int ret;
again:
- spin_lock(&idrinfo->lock);
+ mutex_lock(&idrinfo->lock);
if (*index) {
p = idr_find(&idrinfo->action_idr, *index);
if (IS_ERR(p)) {
/* This means that another process allocated
* index but did not assign the pointer yet.
*/
- spin_unlock(&idrinfo->lock);
+ mutex_unlock(&idrinfo->lock);
goto again;
}
@@ -461,7 +509,7 @@
} else {
*a = NULL;
ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index,
- *index, GFP_ATOMIC);
+ *index, GFP_KERNEL);
if (!ret)
idr_replace(&idrinfo->action_idr,
ERR_PTR(-EBUSY), *index);
@@ -470,12 +518,12 @@
*index = 1;
*a = NULL;
ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index,
- UINT_MAX, GFP_ATOMIC);
+ UINT_MAX, GFP_KERNEL);
if (!ret)
idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY),
*index);
}
- spin_unlock(&idrinfo->lock);
+ mutex_unlock(&idrinfo->lock);
return ret;
}
EXPORT_SYMBOL(tcf_idr_check_alloc);
@@ -487,8 +535,9 @@
struct tc_action *p;
int ret;
unsigned long id = 1;
+ unsigned long tmp;
- idr_for_each_entry_ul(idr, p, id) {
+ idr_for_each_entry_ul(idr, p, tmp, id) {
ret = __tcf_idr_release(p, false, true);
if (ret == ACT_P_DELETED)
module_put(ops->owner);
@@ -521,7 +570,7 @@
write_lock(&act_mod_lock);
list_for_each_entry(a, &act_base, head) {
- if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
+ if (act->id == a->id || (strcmp(act->kind, a->kind) == 0)) {
write_unlock(&act_mod_lock);
unregister_pernet_subsys(ops);
return -EEXIST;
@@ -632,6 +681,10 @@
return TC_ACT_OK;
}
} else if (TC_ACT_EXT_CMP(ret, TC_ACT_GOTO_CHAIN)) {
+ if (unlikely(!rcu_access_pointer(a->goto_chain))) {
+ net_warn_ratelimited("can't go to NULL chain!\n");
+ return TC_ACT_SHOT;
+ }
tcf_action_goto_chain_exec(a, res);
}
@@ -720,7 +773,7 @@
}
rcu_read_unlock();
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
err = tcf_action_dump_old(skb, a, bind, ref);
@@ -744,7 +797,7 @@
for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
a = actions[i];
- nest = nla_nest_start(skb, a->order);
+ nest = nla_nest_start_noflag(skb, i + 1);
if (nest == NULL)
goto nla_put_failure;
err = tcf_action_dump_1(skb, a, bind, ref);
@@ -778,14 +831,13 @@
return c;
}
-static bool tcf_action_valid(int action)
-{
- int opcode = TC_ACT_EXT_OPCODE(action);
-
- if (!opcode)
- return action <= TC_ACT_VALUE_MAX;
- return opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC;
-}
+static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { /* attribute policy handed to nla_parse_nested_deprecated() when parsing a single action's TCA_ACT_* attributes */
+ [TCA_ACT_KIND] = { .type = NLA_STRING }, /* action kind name */
+ [TCA_ACT_INDEX] = { .type = NLA_U32 },
+ [TCA_ACT_COOKIE] = { .type = NLA_BINARY, /* opaque cookie blob */
+ .len = TC_COOKIE_MAX_SIZE }, /* cookie length bound is enforced by the policy */
+ [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, /* nested attributes passed on to the action's init callback */
+};
struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
struct nlattr *nla, struct nlattr *est,
@@ -802,7 +854,8 @@
int err;
if (name == NULL) {
- err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
+ tcf_action_policy, extack);
if (err < 0)
goto err_out;
err = -EINVAL;
@@ -816,13 +869,6 @@
goto err_out;
}
if (tb[TCA_ACT_COOKIE]) {
- int cklen = nla_len(tb[TCA_ACT_COOKIE]);
-
- if (cklen > TC_COOKIE_MAX_SIZE) {
- NL_SET_ERR_MSG(extack, "TC cookie size above the maximum");
- goto err_out;
- }
-
cookie = nla_memdup_cookie(tb);
if (!cookie) {
NL_SET_ERR_MSG(extack, "No memory to generate TC cookie");
@@ -868,10 +914,10 @@
/* backward compatibility for policer */
if (name == NULL)
err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind,
- rtnl_held, extack);
+ rtnl_held, tp, extack);
else
err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held,
- extack);
+ tp, extack);
if (err < 0)
goto err_mod;
@@ -885,18 +931,10 @@
if (err != ACT_P_CREATED)
module_put(a_o->owner);
- if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) {
- err = tcf_action_goto_chain_init(a, tp);
- if (err) {
- tcf_action_destroy_1(a, bind);
- NL_SET_ERR_MSG(extack, "Failed to init TC action chain");
- return ERR_PTR(err);
- }
- }
-
- if (!tcf_action_valid(a->tcfa_action)) {
+ if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN) &&
+ !rcu_access_pointer(a->goto_chain)) {
tcf_action_destroy_1(a, bind);
- NL_SET_ERR_MSG(extack, "Invalid control action value");
+ NL_SET_ERR_MSG(extack, "can't use goto chain with NULL chain");
return ERR_PTR(-EINVAL);
}
@@ -925,7 +963,8 @@
int err;
int i;
- err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO, nla, NULL,
+ extack);
if (err < 0)
return err;
@@ -979,6 +1018,8 @@
goto errout;
if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 ||
+ gnet_stats_copy_basic_hw(NULL, &d, p->cpu_bstats_hw,
+ &p->tcfa_bstats_hw) < 0 ||
gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 ||
gnet_stats_copy_queue(&d, p->cpu_qstats,
&p->tcfa_qstats,
@@ -1011,7 +1052,7 @@
t->tca__pad1 = 0;
t->tca__pad2 = 0;
- nest = nla_nest_start(skb, TCA_ACT_TAB);
+ nest = nla_nest_start_noflag(skb, TCA_ACT_TAB);
if (!nest)
goto out_nlmsg_trim;
@@ -1058,7 +1099,8 @@
int index;
int err;
- err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
+ tcf_action_policy, extack);
if (err < 0)
goto err_out;
@@ -1073,12 +1115,14 @@
err = -EINVAL;
ops = tc_lookup_action(tb[TCA_ACT_KIND]);
if (!ops) { /* could happen in batch of actions */
- NL_SET_ERR_MSG(extack, "Specified TC action not found");
+ NL_SET_ERR_MSG(extack, "Specified TC action kind not found");
goto err_out;
}
err = -ENOENT;
- if (ops->lookup(net, &a, index, extack) == 0)
+ if (ops->lookup(net, &a, index) == 0) {
+ NL_SET_ERR_MSG(extack, "TC action with specified index not found");
goto err_mod;
+ }
module_put(ops->owner);
return a;
@@ -1110,7 +1154,8 @@
b = skb_tail_pointer(skb);
- err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
+ tcf_action_policy, extack);
if (err < 0)
goto err_out;
@@ -1133,7 +1178,7 @@
t->tca__pad1 = 0;
t->tca__pad2 = 0;
- nest = nla_nest_start(skb, TCA_ACT_TAB);
+ nest = nla_nest_start_noflag(skb, TCA_ACT_TAB);
if (!nest) {
NL_SET_ERR_MSG(extack, "Failed to add new netlink message");
goto out_module_put;
@@ -1239,7 +1284,8 @@
size_t attr_size = 0;
struct tc_action *actions[TCA_ACT_MAX_PRIO] = {};
- ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack);
+ ret = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO, nla, NULL,
+ extack);
if (ret < 0)
return ret;
@@ -1257,7 +1303,6 @@
ret = PTR_ERR(act);
goto err;
}
- act->order = i;
attr_size += tcf_action_fill_size(act);
actions[i - 1] = act;
}
@@ -1308,11 +1353,16 @@
struct netlink_ext_ack *extack)
{
size_t attr_size = 0;
- int ret = 0;
+ int loop, ret;
struct tc_action *actions[TCA_ACT_MAX_PRIO] = {};
- ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, actions,
- &attr_size, true, extack);
+ for (loop = 0; loop < 10; loop++) {
+ ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0,
+ actions, &attr_size, true, extack);
+ if (ret != -EAGAIN)
+ break;
+ }
+
if (ret < 0)
return ret;
ret = tcf_add_notify(net, n, actions, portid, attr_size, extack);
@@ -1341,8 +1391,8 @@
!netlink_capable(skb, CAP_NET_ADMIN))
return -EPERM;
- ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ROOT_MAX, NULL,
- extack);
+ ret = nlmsg_parse_deprecated(n, sizeof(struct tcamsg), tca,
+ TCA_ROOT_MAX, NULL, extack);
if (ret < 0)
return ret;
@@ -1362,11 +1412,8 @@
*/
if (n->nlmsg_flags & NLM_F_REPLACE)
ovr = 1;
-replay:
ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr,
extack);
- if (ret == -EAGAIN)
- goto replay;
break;
case RTM_DELACTION:
ret = tca_action_gd(net, tca[TCA_ACT_TAB], n,
@@ -1393,13 +1440,12 @@
if (tb1 == NULL)
return NULL;
- if (nla_parse(tb, TCA_ACT_MAX_PRIO, nla_data(tb1),
- NLMSG_ALIGN(nla_len(tb1)), NULL, NULL) < 0)
+ if (nla_parse_deprecated(tb, TCA_ACT_MAX_PRIO, nla_data(tb1), NLMSG_ALIGN(nla_len(tb1)), NULL, NULL) < 0)
return NULL;
if (tb[1] == NULL)
return NULL;
- if (nla_parse_nested(tb2, TCA_ACT_MAX, tb[1], NULL, NULL) < 0)
+ if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], tcf_action_policy, NULL) < 0)
return NULL;
kind = tb2[TCA_ACT_KIND];
@@ -1423,8 +1469,8 @@
u32 msecs_since = 0;
u32 act_count = 0;
- ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX,
- tcaa_policy, NULL);
+ ret = nlmsg_parse_deprecated(cb->nlh, sizeof(struct tcamsg), tb,
+ TCA_ROOT_MAX, tcaa_policy, cb->extack);
if (ret < 0)
return ret;
@@ -1465,7 +1511,7 @@
if (!count_attr)
goto out_module_put;
- nest = nla_nest_start(skb, TCA_ACT_TAB);
+ nest = nla_nest_start_noflag(skb, TCA_ACT_TAB);
if (nest == NULL)
goto out_module_put;
@@ -1494,227 +1540,8 @@
return skb->len;
}
-struct tcf_action_net {
- struct rhashtable egdev_ht;
-};
-
-static unsigned int tcf_action_net_id;
-
-struct tcf_action_egdev_cb {
- struct list_head list;
- tc_setup_cb_t *cb;
- void *cb_priv;
-};
-
-struct tcf_action_egdev {
- struct rhash_head ht_node;
- const struct net_device *dev;
- unsigned int refcnt;
- struct list_head cb_list;
-};
-
-static const struct rhashtable_params tcf_action_egdev_ht_params = {
- .key_offset = offsetof(struct tcf_action_egdev, dev),
- .head_offset = offsetof(struct tcf_action_egdev, ht_node),
- .key_len = sizeof(const struct net_device *),
-};
-
-static struct tcf_action_egdev *
-tcf_action_egdev_lookup(const struct net_device *dev)
-{
- struct net *net = dev_net(dev);
- struct tcf_action_net *tan = net_generic(net, tcf_action_net_id);
-
- return rhashtable_lookup_fast(&tan->egdev_ht, &dev,
- tcf_action_egdev_ht_params);
-}
-
-static struct tcf_action_egdev *
-tcf_action_egdev_get(const struct net_device *dev)
-{
- struct tcf_action_egdev *egdev;
- struct tcf_action_net *tan;
-
- egdev = tcf_action_egdev_lookup(dev);
- if (egdev)
- goto inc_ref;
-
- egdev = kzalloc(sizeof(*egdev), GFP_KERNEL);
- if (!egdev)
- return NULL;
- INIT_LIST_HEAD(&egdev->cb_list);
- egdev->dev = dev;
- tan = net_generic(dev_net(dev), tcf_action_net_id);
- rhashtable_insert_fast(&tan->egdev_ht, &egdev->ht_node,
- tcf_action_egdev_ht_params);
-
-inc_ref:
- egdev->refcnt++;
- return egdev;
-}
-
-static void tcf_action_egdev_put(struct tcf_action_egdev *egdev)
-{
- struct tcf_action_net *tan;
-
- if (--egdev->refcnt)
- return;
- tan = net_generic(dev_net(egdev->dev), tcf_action_net_id);
- rhashtable_remove_fast(&tan->egdev_ht, &egdev->ht_node,
- tcf_action_egdev_ht_params);
- kfree(egdev);
-}
-
-static struct tcf_action_egdev_cb *
-tcf_action_egdev_cb_lookup(struct tcf_action_egdev *egdev,
- tc_setup_cb_t *cb, void *cb_priv)
-{
- struct tcf_action_egdev_cb *egdev_cb;
-
- list_for_each_entry(egdev_cb, &egdev->cb_list, list)
- if (egdev_cb->cb == cb && egdev_cb->cb_priv == cb_priv)
- return egdev_cb;
- return NULL;
-}
-
-static int tcf_action_egdev_cb_call(struct tcf_action_egdev *egdev,
- enum tc_setup_type type,
- void *type_data, bool err_stop)
-{
- struct tcf_action_egdev_cb *egdev_cb;
- int ok_count = 0;
- int err;
-
- list_for_each_entry(egdev_cb, &egdev->cb_list, list) {
- err = egdev_cb->cb(type, type_data, egdev_cb->cb_priv);
- if (err) {
- if (err_stop)
- return err;
- } else {
- ok_count++;
- }
- }
- return ok_count;
-}
-
-static int tcf_action_egdev_cb_add(struct tcf_action_egdev *egdev,
- tc_setup_cb_t *cb, void *cb_priv)
-{
- struct tcf_action_egdev_cb *egdev_cb;
-
- egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv);
- if (WARN_ON(egdev_cb))
- return -EEXIST;
- egdev_cb = kzalloc(sizeof(*egdev_cb), GFP_KERNEL);
- if (!egdev_cb)
- return -ENOMEM;
- egdev_cb->cb = cb;
- egdev_cb->cb_priv = cb_priv;
- list_add(&egdev_cb->list, &egdev->cb_list);
- return 0;
-}
-
-static void tcf_action_egdev_cb_del(struct tcf_action_egdev *egdev,
- tc_setup_cb_t *cb, void *cb_priv)
-{
- struct tcf_action_egdev_cb *egdev_cb;
-
- egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv);
- if (WARN_ON(!egdev_cb))
- return;
- list_del(&egdev_cb->list);
- kfree(egdev_cb);
-}
-
-static int __tc_setup_cb_egdev_register(const struct net_device *dev,
- tc_setup_cb_t *cb, void *cb_priv)
-{
- struct tcf_action_egdev *egdev = tcf_action_egdev_get(dev);
- int err;
-
- if (!egdev)
- return -ENOMEM;
- err = tcf_action_egdev_cb_add(egdev, cb, cb_priv);
- if (err)
- goto err_cb_add;
- return 0;
-
-err_cb_add:
- tcf_action_egdev_put(egdev);
- return err;
-}
-int tc_setup_cb_egdev_register(const struct net_device *dev,
- tc_setup_cb_t *cb, void *cb_priv)
-{
- int err;
-
- rtnl_lock();
- err = __tc_setup_cb_egdev_register(dev, cb, cb_priv);
- rtnl_unlock();
- return err;
-}
-EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_register);
-
-static void __tc_setup_cb_egdev_unregister(const struct net_device *dev,
- tc_setup_cb_t *cb, void *cb_priv)
-{
- struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev);
-
- if (WARN_ON(!egdev))
- return;
- tcf_action_egdev_cb_del(egdev, cb, cb_priv);
- tcf_action_egdev_put(egdev);
-}
-void tc_setup_cb_egdev_unregister(const struct net_device *dev,
- tc_setup_cb_t *cb, void *cb_priv)
-{
- rtnl_lock();
- __tc_setup_cb_egdev_unregister(dev, cb, cb_priv);
- rtnl_unlock();
-}
-EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_unregister);
-
-int tc_setup_cb_egdev_call(const struct net_device *dev,
- enum tc_setup_type type, void *type_data,
- bool err_stop)
-{
- struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev);
-
- if (!egdev)
- return 0;
- return tcf_action_egdev_cb_call(egdev, type, type_data, err_stop);
-}
-EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_call);
-
-static __net_init int tcf_action_net_init(struct net *net)
-{
- struct tcf_action_net *tan = net_generic(net, tcf_action_net_id);
-
- return rhashtable_init(&tan->egdev_ht, &tcf_action_egdev_ht_params);
-}
-
-static void __net_exit tcf_action_net_exit(struct net *net)
-{
- struct tcf_action_net *tan = net_generic(net, tcf_action_net_id);
-
- rhashtable_destroy(&tan->egdev_ht);
-}
-
-static struct pernet_operations tcf_action_net_ops = {
- .init = tcf_action_net_init,
- .exit = tcf_action_net_exit,
- .id = &tcf_action_net_id,
- .size = sizeof(struct tcf_action_net),
-};
-
static int __init tc_action_init(void)
{
- int err;
-
- err = register_pernet_subsys(&tcf_action_net_ops);
- if (err)
- return err;
-
rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action,
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 0c68bc9..04b7bd4 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/module.h>
@@ -17,6 +13,7 @@
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
#include <linux/tc_act/tc_bpf.h>
#include <net/tc_act/tc_bpf.h>
@@ -278,20 +275,23 @@
static int tcf_bpf_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **act,
int replace, int bind, bool rtnl_held,
- struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, bpf_net_id);
struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tcf_bpf_cfg cfg, old;
struct tc_act_bpf *parm;
struct tcf_bpf *prog;
bool is_bpf, is_ebpf;
int ret, res = 0;
+ u32 index;
if (!nla)
return -EINVAL;
- ret = nla_parse_nested(tb, TCA_ACT_BPF_MAX, nla, act_bpf_policy, NULL);
+ ret = nla_parse_nested_deprecated(tb, TCA_ACT_BPF_MAX, nla,
+ act_bpf_policy, NULL);
if (ret < 0)
return ret;
@@ -299,13 +299,13 @@
return -EINVAL;
parm = nla_data(tb[TCA_ACT_BPF_PARMS]);
-
- ret = tcf_idr_check_alloc(tn, &parm->index, act, bind);
+ index = parm->index;
+ ret = tcf_idr_check_alloc(tn, &index, act, bind);
if (!ret) {
- ret = tcf_idr_create(tn, parm->index, est, act,
+ ret = tcf_idr_create(tn, index, est, act,
&act_bpf_ops, bind, true);
if (ret < 0) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
@@ -323,12 +323,16 @@
return ret;
}
+ ret = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (ret < 0)
+ goto release_idr;
+
is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];
is_ebpf = tb[TCA_ACT_BPF_FD];
if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) {
ret = -EINVAL;
- goto out;
+ goto put_chain;
}
memset(&cfg, 0, sizeof(cfg));
@@ -336,7 +340,7 @@
ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) :
tcf_bpf_init_from_efd(tb, &cfg);
if (ret < 0)
- goto out;
+ goto put_chain;
prog = to_bpf(*act);
@@ -350,10 +354,13 @@
if (cfg.bpf_num_ops)
prog->bpf_num_ops = cfg.bpf_num_ops;
- prog->tcf_action = parm->action;
+ goto_ch = tcf_action_set_ctrlact(*act, parm->action, goto_ch);
rcu_assign_pointer(prog->filter, cfg.filter);
spin_unlock_bh(&prog->tcf_lock);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+
if (res == ACT_P_CREATED) {
tcf_idr_insert(tn, *act);
} else {
@@ -363,9 +370,13 @@
}
return res;
-out:
- tcf_idr_release(*act, bind);
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+
+release_idr:
+ tcf_idr_release(*act, bind);
return ret;
}
@@ -387,8 +398,7 @@
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, bpf_net_id);
@@ -397,7 +407,7 @@
static struct tc_action_ops act_bpf_ops __read_mostly = {
.kind = "bpf",
- .type = TCA_ACT_BPF,
+ .id = TCA_ID_BPF,
.owner = THIS_MODULE,
.act = tcf_bpf_act,
.dump = tcf_bpf_dump,
@@ -412,7 +422,7 @@
{
struct tc_action_net *tn = net_generic(net, bpf_net_id);
- return tc_action_net_init(tn, &act_bpf_ops);
+ return tc_action_net_init(net, tn, &act_bpf_ops);
}
static void __net_exit bpf_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 6f0f273..2b43cac 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_connmark.c netfilter connmark retriever action
* skb mark is over-written
*
* Copyright (c) 2011 Felix Fietkau <nbd@openwrt.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/module.h>
@@ -21,6 +17,7 @@
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/act_api.h>
+#include <net/pkt_cls.h>
#include <uapi/linux/tc_act/tc_connmark.h>
#include <net/tc_act/tc_connmark.h>
@@ -97,19 +94,22 @@
static int tcf_connmark_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, connmark_net_id);
struct nlattr *tb[TCA_CONNMARK_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tcf_connmark_info *ci;
struct tc_connmark *parm;
- int ret = 0;
+ int ret = 0, err;
+ u32 index;
if (!nla)
return -EINVAL;
- ret = nla_parse_nested(tb, TCA_CONNMARK_MAX, nla, connmark_policy,
- NULL);
+ ret = nla_parse_nested_deprecated(tb, TCA_CONNMARK_MAX, nla,
+ connmark_policy, NULL);
if (ret < 0)
return ret;
@@ -117,18 +117,22 @@
return -EINVAL;
parm = nla_data(tb[TCA_CONNMARK_PARMS]);
-
- ret = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ ret = tcf_idr_check_alloc(tn, &index, a, bind);
if (!ret) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_connmark_ops, bind, false);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ci = to_connmark(*a);
- ci->tcf_action = parm->action;
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch,
+ extack);
+ if (err < 0)
+ goto release_idr;
+ tcf_action_set_ctrlact(*a, parm->action, goto_ch);
ci->net = net;
ci->zone = parm->zone;
@@ -142,13 +146,24 @@
tcf_idr_release(*a, bind);
return -EEXIST;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch,
+ extack);
+ if (err < 0)
+ goto release_idr;
/* replacing action and zone */
- ci->tcf_action = parm->action;
+ spin_lock_bh(&ci->tcf_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
ci->zone = parm->zone;
+ spin_unlock_bh(&ci->tcf_lock);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
ret = 0;
}
return ret;
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
@@ -156,16 +171,16 @@
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_connmark_info *ci = to_connmark(a);
-
struct tc_connmark opt = {
.index = ci->tcf_index,
.refcnt = refcount_read(&ci->tcf_refcnt) - ref,
.bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
- .action = ci->tcf_action,
- .zone = ci->zone,
};
struct tcf_t t;
+ spin_lock_bh(&ci->tcf_lock);
+ opt.action = ci->tcf_action;
+ opt.zone = ci->zone;
if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -173,9 +188,12 @@
if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t,
TCA_CONNMARK_PAD))
goto nla_put_failure;
+ spin_unlock_bh(&ci->tcf_lock);
return skb->len;
+
nla_put_failure:
+ spin_unlock_bh(&ci->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
@@ -190,8 +208,7 @@
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, connmark_net_id);
@@ -200,7 +217,7 @@
static struct tc_action_ops act_connmark_ops = {
.kind = "connmark",
- .type = TCA_ACT_CONNMARK,
+ .id = TCA_ID_CONNMARK,
.owner = THIS_MODULE,
.act = tcf_connmark_act,
.dump = tcf_connmark_dump,
@@ -214,7 +231,7 @@
{
struct tc_action_net *tn = net_generic(net, connmark_net_id);
- return tc_action_net_init(tn, &act_connmark_ops);
+ return tc_action_net_init(net, tn, &act_connmark_ops);
}
static void __net_exit connmark_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index b8a67ae..d3cfad8 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Checksum updating actions
*
* Copyright (c) 2010 Gregoire Baron <baronchon@n7mm.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
*/
#include <linux/types.h>
@@ -33,6 +28,7 @@
#include <net/sctp/checksum.h>
#include <net/act_api.h>
+#include <net/pkt_cls.h>
#include <linux/tc_act/tc_csum.h>
#include <net/tc_act/tc_csum.h>
@@ -46,33 +42,36 @@
static int tcf_csum_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
- int bind, bool rtnl_held,
+ int bind, bool rtnl_held, struct tcf_proto *tp,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, csum_net_id);
struct tcf_csum_params *params_new;
struct nlattr *tb[TCA_CSUM_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tc_csum *parm;
struct tcf_csum *p;
int ret = 0, err;
+ u32 index;
if (nla == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_CSUM_MAX, nla, csum_policy,
+ NULL);
if (err < 0)
return err;
if (tb[TCA_CSUM_PARMS] == NULL)
return -EINVAL;
parm = nla_data(tb[TCA_CSUM_PARMS]);
-
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_csum_ops, bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
@@ -87,21 +86,27 @@
return err;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
p = to_tcf_csum(*a);
params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
if (unlikely(!params_new)) {
- tcf_idr_release(*a, bind);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto put_chain;
}
params_new->update_flags = parm->update_flags;
spin_lock_bh(&p->tcf_lock);
- p->tcf_action = parm->action;
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
rcu_swap_protected(p->params, params_new,
lockdep_is_held(&p->tcf_lock));
spin_unlock_bh(&p->tcf_lock);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
if (params_new)
kfree_rcu(params_new, rcu);
@@ -109,6 +114,12 @@
tcf_idr_insert(tn, *a);
return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
/**
@@ -559,8 +570,11 @@
struct tcf_result *res)
{
struct tcf_csum *p = to_tcf_csum(a);
+ bool orig_vlan_tag_present = false;
+ unsigned int vlan_hdr_count = 0;
struct tcf_csum_params *params;
u32 update_flags;
+ __be16 protocol;
int action;
params = rcu_dereference_bh(p->params);
@@ -573,7 +587,9 @@
goto drop;
update_flags = params->update_flags;
- switch (tc_skb_protocol(skb)) {
+ protocol = tc_skb_protocol(skb);
+again:
+ switch (protocol) {
case cpu_to_be16(ETH_P_IP):
if (!tcf_csum_ipv4(skb, update_flags))
goto drop;
@@ -582,13 +598,35 @@
if (!tcf_csum_ipv6(skb, update_flags))
goto drop;
break;
+ case cpu_to_be16(ETH_P_8021AD): /* fall through */
+ case cpu_to_be16(ETH_P_8021Q):
+ if (skb_vlan_tag_present(skb) && !orig_vlan_tag_present) {
+ protocol = skb->protocol;
+ orig_vlan_tag_present = true;
+ } else {
+ struct vlan_hdr *vlan = (struct vlan_hdr *)skb->data;
+
+ protocol = vlan->h_vlan_encapsulated_proto;
+ skb_pull(skb, VLAN_HLEN);
+ skb_reset_network_header(skb);
+ vlan_hdr_count++;
+ }
+ goto again;
+ }
+
+out:
+ /* Restore the skb for the pulled VLAN tags */
+ while (vlan_hdr_count--) {
+ skb_push(skb, VLAN_HLEN);
+ skb_reset_network_header(skb);
}
return action;
drop:
qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats));
- return TC_ACT_SHOT;
+ action = TC_ACT_SHOT;
+ goto out;
}
static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
@@ -646,8 +684,7 @@
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, csum_net_id);
@@ -661,7 +698,7 @@
static struct tc_action_ops act_csum_ops = {
.kind = "csum",
- .type = TCA_ACT_CSUM,
+ .id = TCA_ID_CSUM,
.owner = THIS_MODULE,
.act = tcf_csum_act,
.dump = tcf_csum_dump,
@@ -677,7 +714,7 @@
{
struct tc_action_net *tn = net_generic(net, csum_net_id);
- return tc_action_net_init(tn, &act_csum_ops);
+ return tc_action_net_init(net, tn, &act_csum_ops);
}
static void __net_exit csum_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
new file mode 100644
index 0000000..fcc4602
--- /dev/null
+++ b/net/sched/act_ct.c
@@ -0,0 +1,985 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* -
+ * net/sched/act_ct.c Connection Tracking action
+ *
+ * Authors: Paul Blakey <paulb@mellanox.com>
+ * Yossi Kuperman <yossiku@mellanox.com>
+ * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_cls.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/act_api.h>
+#include <net/ip.h>
+#include <net/ipv6_frag.h>
+#include <uapi/linux/tc_act/tc_ct.h>
+#include <net/tc_act/tc_ct.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <uapi/linux/netfilter/nf_nat.h>
+
+static struct tc_action_ops act_ct_ops;
+static unsigned int ct_net_id;
+
+/* Per-netns state of the ct action, fetched with net_generic(net, ct_net_id).
+ * tcf_ct_init() reads the same pointer as a struct tc_action_net, which is
+ * why @tn has to stay the first member.
+ */
+struct tc_ct_action_net {
+ struct tc_action_net tn; /* Must be first */
+ bool labels; /* tcf_ct_fill_params() rejects TCA_CT_LABELS when false */
+};
+
+/* Determine whether skb->_nfct is equal to the result of conntrack lookup.
+ *
+ * Returns true only if the skb already carries a conntrack entry whose netns
+ * is @net and whose zone id is @zone_id. With @force set, an entry that is
+ * not in the original direction is detached from the skb (and killed first
+ * if it is already confirmed) and false is returned.
+ */
+static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb,
+ u16 zone_id, bool force)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return false;
+ if (!net_eq(net, read_pnet(&ct->ct_net)))
+ return false;
+ if (nf_ct_zone(ct)->id != zone_id)
+ return false;
+
+ /* Force conntrack entry direction. */
+ if (force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
+ if (nf_ct_is_confirmed(ct))
+ nf_ct_kill(ct);
+
+ nf_conntrack_put(&ct->ct_general);
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+
+ return false;
+ }
+
+ return true;
+}
+
+/* Cut the skb down to the length announced by its IPv4/IPv6 header so that
+ * trailing lower-layer padding is gone before higher-layer code that assumes
+ * skb->len excludes padding (such as nf_ip_checksum) sees it. For any other
+ * @family the current skb->len is kept. The caller must already have pulled
+ * the skb to the network header and made ip_hdr/ipv6_hdr point to valid data.
+ *
+ * Returns the result of pskb_trim_rcsum().
+ */
+static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family)
+{
+	unsigned int len = skb->len;
+
+	if (family == NFPROTO_IPV4)
+		len = ntohs(ip_hdr(skb)->tot_len);
+	else if (family == NFPROTO_IPV6)
+		len = sizeof(struct ipv6hdr) +
+		      ntohs(ipv6_hdr(skb)->payload_len);
+
+	return pskb_trim_rcsum(skb, len);
+}
+
+/* Map skb->protocol to a netfilter family: NFPROTO_IPV4 for ETH_P_IP,
+ * NFPROTO_IPV6 for ETH_P_IPV6 and NFPROTO_UNSPEC for everything else.
+ */
+static u8 tcf_ct_skb_nf_family(struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IP))
+		return NFPROTO_IPV4;
+	if (skb->protocol == htons(ETH_P_IPV6))
+		return NFPROTO_IPV6;
+
+	return NFPROTO_UNSPEC;
+}
+
+/* Report through @frag whether the IPv4 packet in @skb is a fragment.
+ *
+ * Returns 0 on success, -EINVAL if the skb is too short to hold an IPv4
+ * header at the network offset, or -ENOMEM if that header cannot be pulled.
+ */
+static int tcf_ct_ipv4_is_fragment(struct sk_buff *skb, bool *frag)
+{
+	unsigned int need = skb_network_offset(skb) + sizeof(struct iphdr);
+
+	if (unlikely(skb->len < need))
+		return -EINVAL;
+	if (unlikely(!pskb_may_pull(skb, need)))
+		return -ENOMEM;
+
+	*frag = ip_is_fragment(ip_hdr(skb));
+	return 0;
+}
+
+/* Report through @frag whether the IPv6 packet in @skb carries a fragment
+ * header, as flagged by ipv6_find_hdr().
+ *
+ * Returns 0 on success, -EINVAL if the skb is too short to hold an IPv6
+ * header at the network offset, -ENOMEM if that header cannot be pulled, or
+ * -EPROTO if walking the extension headers fails.
+ */
+static int tcf_ct_ipv6_is_fragment(struct sk_buff *skb, bool *frag)
+{
+	unsigned int need = skb_network_offset(skb) + sizeof(struct ipv6hdr);
+	unsigned int payload_ofs = 0;
+	unsigned int flags = 0;
+	unsigned short frag_off;
+	int nexthdr;
+
+	if (unlikely(skb->len < need))
+		return -EINVAL;
+	if (unlikely(!pskb_may_pull(skb, need)))
+		return -ENOMEM;
+
+	nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags);
+	if (unlikely(nexthdr < 0))
+		return -EPROTO;
+
+	*frag = flags & IP6_FH_F_FRAG;
+	return 0;
+}
+
+/* Hand an IP fragment to the IPv4/IPv6 defragmentation engine.
+ *
+ * Packets that already carry a non-template conntrack entry, or that are
+ * marked untracked, are left alone, as are packets that are not fragments.
+ *
+ * Returns 0 when there is nothing to do or defragmentation completed,
+ * -EINPROGRESS when the fragment was queued (tcf_ct_act() turns that into
+ * TC_ACT_STOLEN), or another negative errno, in which case tcf_ct_act()
+ * drops the packet.
+ */
+static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
+				   u8 family, u16 zone)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	int err = 0;
+	bool frag;
+
+	/* Previously seen (loopback)? Ignore. */
+	ct = nf_ct_get(skb, &ctinfo);
+	if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
+		return 0;
+
+	if (family == NFPROTO_IPV4)
+		err = tcf_ct_ipv4_is_fragment(skb, &frag);
+	else
+		err = tcf_ct_ipv6_is_fragment(skb, &frag);
+	if (err || !frag)
+		return err;
+
+	/* Extra reference for the defrag engine; the caller keeps its own. */
+	skb_get(skb);
+
+	if (family == NFPROTO_IPV4) {
+		enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
+
+		memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+		local_bh_disable();
+		err = ip_defrag(net, skb, user);
+		local_bh_enable();
+		/* ip_defrag() has already dropped the reference taken above
+		 * when it fails. Freeing again here, on top of the free the
+		 * caller performs for a dropped packet, would be a double
+		 * free, so just report the error.
+		 */
+		if (err && err != -EINPROGRESS)
+			return err;
+	} else { /* NFPROTO_IPV6 */
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+		enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
+
+		memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+		err = nf_ct_frag6_gather(net, skb, user);
+		if (err && err != -EINPROGRESS)
+			goto out_free;
+#else
+		err = -EOPNOTSUPP;
+		goto out_free;
+#endif
+	}
+
+	skb_clear_hash(skb);
+	skb->ignore_df = 1;
+	return err;
+
+out_free:
+	/* Drop the reference taken by skb_get() above. */
+	kfree_skb(skb);
+	return err;
+}
+
+/* RCU callback that releases a struct tcf_ct_params: drops the reference on
+ * the conntrack template, if one is attached, and frees the structure.
+ */
+static void tcf_ct_params_free(struct rcu_head *head)
+{
+	struct tcf_ct_params *params = container_of(head,
+						    struct tcf_ct_params, rcu);
+
+	if (params->tmpl)
+		nf_conntrack_put(&params->tmpl->ct_general);
+	kfree(params);
+}
+
+#if IS_ENABLED(CONFIG_NF_NAT)
+/* Modelled after nf_nat_ipv[46]_fn().
+ * range is only used for new, uninitialized NAT state.
+ * Returns either NF_ACCEPT or NF_DROP.
+ *
+ * ICMP/ICMPv6 packets of a related connection are handed to the
+ * nf_nat_icmp{,v6}_reply_translation() helpers. Everything else gets a NAT
+ * binding set up the first time it is seen and is then translated by
+ * nf_nat_packet().
+ */
+static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_nat_range2 *range,
+ enum nf_nat_manip_type maniptype)
+{
+ int hooknum, err = NF_ACCEPT;
+
+ /* See HOOK2MANIP(). */
+ if (maniptype == NF_NAT_MANIP_SRC)
+ hooknum = NF_INET_LOCAL_IN; /* Source NAT */
+ else
+ hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
+
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ if (skb->protocol == htons(ETH_P_IP) &&
+ ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+ if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+ hooknum))
+ err = NF_DROP;
+ goto out;
+ } else if (IS_ENABLED(CONFIG_IPV6) &&
+ skb->protocol == htons(ETH_P_IPV6)) {
+ __be16 frag_off;
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ int hdrlen = ipv6_skip_exthdr(skb,
+ sizeof(struct ipv6hdr),
+ &nexthdr, &frag_off);
+
+ if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+ if (!nf_nat_icmpv6_reply_translation(skb, ct,
+ ctinfo,
+ hooknum,
+ hdrlen))
+ err = NF_DROP;
+ goto out;
+ }
+ }
+ /* Non-ICMP, fall thru to initialize if needed. */
+ /* fall through */
+ case IP_CT_NEW:
+ /* Seen it before? This can happen for loopback, retrans,
+ * or local packets.
+ */
+ if (!nf_nat_initialized(ct, maniptype)) {
+ /* Initialize according to the NAT action. */
+ err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
+ /* Action is set up to establish a new
+ * mapping.
+ */
+ ? nf_nat_setup_info(ct, range, maniptype)
+ : nf_nat_alloc_null_binding(ct, hooknum);
+ if (err != NF_ACCEPT)
+ goto out;
+ }
+ break;
+
+ case IP_CT_ESTABLISHED:
+ case IP_CT_ESTABLISHED_REPLY:
+ break;
+
+ default:
+ err = NF_DROP;
+ goto out;
+ }
+
+ err = nf_nat_packet(ct, ctinfo, hooknum, skb);
+out:
+ return err;
+}
+#endif /* CONFIG_NF_NAT */
+
+/* Merge @mark into ct->mark, replacing only the bits selected by @mask.
+ * A zero @mask leaves the entry alone. A mark-change event is cached only
+ * when the value really changed and the entry is already confirmed.
+ * Compiles to nothing without CONFIG_NF_CONNTRACK_MARK.
+ */
+static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+	u32 merged;
+
+	if (!mask)
+		return;
+
+	merged = mark | (ct->mark & ~mask);
+	if (merged == ct->mark)
+		return;
+
+	ct->mark = merged;
+	if (nf_ct_is_confirmed(ct))
+		nf_conntrack_event_cache(IPCT_MARK, ct);
+#endif
+}
+
+/* Apply @labels under @labels_m to the conntrack entry. An all-zero mask
+ * means there is nothing to change. Compiles to nothing without
+ * CONFIG_NF_CONNTRACK_LABELS.
+ */
+static void tcf_ct_act_set_labels(struct nf_conn *ct,
+				  u32 *labels,
+				  u32 *labels_m)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)
+	size_t mask_len = FIELD_SIZEOF(struct tcf_ct_params, labels);
+
+	if (memchr_inv(labels_m, 0, mask_len))
+		nf_connlabels_replace(ct, labels, labels_m, 4);
+#endif
+}
+
+/* Pick the NAT manipulation type for this packet and run ct_nat_execute().
+ *
+ * Returns NF_ACCEPT without touching the packet when TCA_CT_ACT_NAT is not
+ * requested, when neither an existing NAT binding nor a src/dst NAT request
+ * applies, or when the kernel is built without CONFIG_NF_NAT. Returns
+ * NF_DROP if the NAT extension cannot be added to an unconfirmed entry.
+ */
+static int tcf_ct_act_nat(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ int ct_action,
+ struct nf_nat_range2 *range,
+ bool commit)
+{
+#if IS_ENABLED(CONFIG_NF_NAT)
+ enum nf_nat_manip_type maniptype;
+
+ if (!(ct_action & TCA_CT_ACT_NAT))
+ return NF_ACCEPT;
+
+ /* Add NAT extension if not confirmed yet. */
+ if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
+ return NF_DROP; /* Can't NAT. */
+
+ if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) &&
+ (ctinfo != IP_CT_RELATED || commit)) {
+ /* NAT an established or related connection like before. */
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
+ /* This is the REPLY direction for a connection
+ * for which NAT was applied in the forward
+ * direction. Do the reverse NAT.
+ */
+ maniptype = ct->status & IPS_SRC_NAT
+ ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
+ else
+ maniptype = ct->status & IPS_SRC_NAT
+ ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
+ } else if (ct_action & TCA_CT_ACT_NAT_SRC) {
+ maniptype = NF_NAT_MANIP_SRC;
+ } else if (ct_action & TCA_CT_ACT_NAT_DST) {
+ maniptype = NF_NAT_MANIP_DST;
+ } else {
+ return NF_ACCEPT;
+ }
+
+ return ct_nat_execute(skb, ct, ctinfo, range, maniptype);
+#else
+ return NF_ACCEPT;
+#endif
+}
+
+/* Packet-path handler of the ct action: run @skb through conntrack as
+ * described by c->params.
+ *
+ * Returns the configured control action, TC_ACT_STOLEN when the skb was
+ * queued for defragmentation, or TC_ACT_SHOT when the packet has to be
+ * dropped (unsupported protocol, defrag/trim failure, NAT verdict other than
+ * NF_ACCEPT).
+ */
+static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
+		      struct tcf_result *res)
+{
+	struct net *net = dev_net(skb->dev);
+	bool cached, commit, clear, force;
+	enum ip_conntrack_info ctinfo;
+	struct tcf_ct *c = to_ct(a);
+	struct nf_conn *tmpl = NULL;
+	struct nf_hook_state state;
+	int nh_ofs, err, retval;
+	struct tcf_ct_params *p;
+	struct nf_conn *ct;
+	u8 family;
+
+	p = rcu_dereference_bh(c->params);
+
+	retval = READ_ONCE(c->tcf_action);
+	commit = p->ct_action & TCA_CT_ACT_COMMIT;
+	clear = p->ct_action & TCA_CT_ACT_CLEAR;
+	force = p->ct_action & TCA_CT_ACT_FORCE;
+	tmpl = p->tmpl;
+
+	if (clear) {
+		/* Only detach whatever conntrack state the skb carries. */
+		ct = nf_ct_get(skb, &ctinfo);
+		if (ct) {
+			nf_conntrack_put(&ct->ct_general);
+			nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+		}
+
+		goto out;
+	}
+
+	family = tcf_ct_skb_nf_family(skb);
+	if (family == NFPROTO_UNSPEC)
+		goto drop;
+
+	/* The conntrack module expects to be working at L3.
+	 * We also try to pull the IPv4/6 header to linear area
+	 */
+	nh_ofs = skb_network_offset(skb);
+	skb_pull_rcsum(skb, nh_ofs);
+	err = tcf_ct_handle_fragments(net, skb, family, p->zone);
+	if (err == -EINPROGRESS) {
+		retval = TC_ACT_STOLEN;
+		goto out;
+	}
+	if (err)
+		goto drop;
+
+	err = tcf_ct_skb_network_trim(skb, family);
+	if (err)
+		goto drop;
+
+	/* If we are recirculating packets to match on ct fields and
+	 * committing with a separate ct action, then we don't need to
+	 * actually run the packet through conntrack twice unless it's for a
+	 * different zone.
+	 */
+	cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force);
+	if (!cached) {
+		/* Associate skb with specified zone. */
+		if (tmpl) {
+			if (skb_nfct(skb))
+				nf_conntrack_put(skb_nfct(skb));
+			nf_conntrack_get(&tmpl->ct_general);
+			nf_ct_set(skb, tmpl, IP_CT_NEW);
+		}
+
+		state.hook = NF_INET_PRE_ROUTING;
+		state.net = net;
+		state.pf = family;
+		err = nf_conntrack_in(skb, &state);
+		if (err != NF_ACCEPT)
+			goto out_push;
+	}
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct)
+		goto out_push;
+	nf_ct_deliver_cached_events(ct);
+
+	err = tcf_ct_act_nat(skb, ct, ctinfo, p->ct_action, &p->range, commit);
+	if (err != NF_ACCEPT)
+		goto drop;
+
+	if (commit) {
+		tcf_ct_act_set_mark(ct, p->mark, p->mark_mask);
+		tcf_ct_act_set_labels(ct, p->labels, p->labels_mask);
+
+		/* This will take care of sending queued events
+		 * even if the connection is already confirmed.
+		 */
+		nf_conntrack_confirm(skb);
+	}
+
+out_push:
+	/* Undo the pull to the network header done above. */
+	skb_push_rcsum(skb, nh_ofs);
+
+out:
+	bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb);
+	return retval;
+
+drop:
+	qstats_drop_inc(this_cpu_ptr(a->cpu_qstats));
+	return TC_ACT_SHOT;
+}
+
+/* Netlink attribute policy that tcf_ct_init() passes to nla_parse_nested()
+ * to validate the nested TCA_CT_* attributes.
+ */
+static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
+ [TCA_CT_UNSPEC] = { .strict_start_type = TCA_CT_UNSPEC + 1 },
+ [TCA_CT_ACTION] = { .type = NLA_U16 },
+ [TCA_CT_PARMS] = { .type = NLA_EXACT_LEN, .len = sizeof(struct tc_ct) },
+ [TCA_CT_ZONE] = { .type = NLA_U16 },
+ [TCA_CT_MARK] = { .type = NLA_U32 },
+ [TCA_CT_MARK_MASK] = { .type = NLA_U32 },
+ [TCA_CT_LABELS] = { .type = NLA_BINARY,
+ .len = 128 / BITS_PER_BYTE },
+ [TCA_CT_LABELS_MASK] = { .type = NLA_BINARY,
+ .len = 128 / BITS_PER_BYTE },
+ [TCA_CT_NAT_IPV4_MIN] = { .type = NLA_U32 },
+ [TCA_CT_NAT_IPV4_MAX] = { .type = NLA_U32 },
+ [TCA_CT_NAT_IPV6_MIN] = { .type = NLA_EXACT_LEN,
+ .len = sizeof(struct in6_addr) },
+ [TCA_CT_NAT_IPV6_MAX] = { .type = NLA_EXACT_LEN,
+ .len = sizeof(struct in6_addr) },
+ [TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 },
+ [TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 },
+};
+
+/* Translate the TCA_CT_NAT_* attributes in @tb into p->range.
+ *
+ * Does nothing unless TCA_CT_ACT_NAT plus one of TCA_CT_ACT_NAT_SRC/DST is
+ * set in p->ct_action. A missing *_MAX attribute defaults to the matching
+ * *_MIN value. @parm is not used here.
+ *
+ * Returns 0 on success, or -EOPNOTSUPP (with an extack message) when NAT
+ * support is not built in or both src and dst NAT are requested.
+ */
+static int tcf_ct_fill_params_nat(struct tcf_ct_params *p,
+ struct tc_ct *parm,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct nf_nat_range2 *range;
+
+ if (!(p->ct_action & TCA_CT_ACT_NAT))
+ return 0;
+
+ if (!IS_ENABLED(CONFIG_NF_NAT)) {
+ NL_SET_ERR_MSG_MOD(extack, "Netfilter nat isn't enabled in kernel");
+ return -EOPNOTSUPP;
+ }
+
+ if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
+ return 0;
+
+ if ((p->ct_action & TCA_CT_ACT_NAT_SRC) &&
+ (p->ct_action & TCA_CT_ACT_NAT_DST)) {
+ NL_SET_ERR_MSG_MOD(extack, "dnat and snat can't be enabled at the same time");
+ return -EOPNOTSUPP;
+ }
+
+ range = &p->range;
+ if (tb[TCA_CT_NAT_IPV4_MIN]) {
+ struct nlattr *max_attr = tb[TCA_CT_NAT_IPV4_MAX];
+
+ p->ipv4_range = true;
+ range->flags |= NF_NAT_RANGE_MAP_IPS;
+ range->min_addr.ip =
+ nla_get_in_addr(tb[TCA_CT_NAT_IPV4_MIN]);
+
+ range->max_addr.ip = max_attr ?
+ nla_get_in_addr(max_attr) :
+ range->min_addr.ip;
+ } else if (tb[TCA_CT_NAT_IPV6_MIN]) {
+ struct nlattr *max_attr = tb[TCA_CT_NAT_IPV6_MAX];
+
+ p->ipv4_range = false;
+ range->flags |= NF_NAT_RANGE_MAP_IPS;
+ range->min_addr.in6 =
+ nla_get_in6_addr(tb[TCA_CT_NAT_IPV6_MIN]);
+
+ range->max_addr.in6 = max_attr ?
+ nla_get_in6_addr(max_attr) :
+ range->min_addr.in6;
+ }
+
+ if (tb[TCA_CT_NAT_PORT_MIN]) {
+ range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ range->min_proto.all = nla_get_be16(tb[TCA_CT_NAT_PORT_MIN]);
+
+ range->max_proto.all = tb[TCA_CT_NAT_PORT_MAX] ?
+ nla_get_be16(tb[TCA_CT_NAT_PORT_MAX]) :
+ range->min_proto.all;
+ }
+
+ return 0;
+}
+
+/* Copy attribute @val_type from @tb into @val (@len bytes) and, when @mask is
+ * given, fill it from attribute @mask_type. If no mask attribute applies
+ * (@mask_type is TCA_CT_UNSPEC or the attribute is absent) the mask is set to
+ * all ones. Nothing is written when the value attribute is absent.
+ */
+static void tcf_ct_set_key_val(struct nlattr **tb,
+			       void *val, int val_type,
+			       void *mask, int mask_type,
+			       int len)
+{
+	struct nlattr *val_attr = tb[val_type];
+
+	if (!val_attr)
+		return;
+	nla_memcpy(val, val_attr, len);
+
+	if (!mask)
+		return;
+
+	if (mask_type != TCA_CT_UNSPEC && tb[mask_type])
+		nla_memcpy(mask, tb[mask_type], len);
+	else
+		memset(mask, 0xff, len);
+}
+
+/* Fill @p from the parsed attributes in @tb.
+ *
+ * With TCA_CT_ACT_CLEAR set, only p->ct_action and the default zone are
+ * filled in. Otherwise NAT, mark, labels and zone are copied, each rejected
+ * with -EOPNOTSUPP when the matching conntrack feature is not enabled. For a
+ * non-default zone a conntrack template is allocated and stored in p->tmpl
+ * with a reference held; tcf_ct_params_free() drops that reference.
+ *
+ * Returns 0 on success or a negative errno (with an extack message).
+ */
+static int tcf_ct_fill_params(struct net *net,
+ struct tcf_ct_params *p,
+ struct tc_ct *parm,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_ct_action_net *tn = net_generic(net, ct_net_id);
+ struct nf_conntrack_zone zone;
+ struct nf_conn *tmpl;
+ int err;
+
+ p->zone = NF_CT_DEFAULT_ZONE_ID;
+
+ tcf_ct_set_key_val(tb,
+ &p->ct_action, TCA_CT_ACTION,
+ NULL, TCA_CT_UNSPEC,
+ sizeof(p->ct_action));
+
+ if (p->ct_action & TCA_CT_ACT_CLEAR)
+ return 0;
+
+ err = tcf_ct_fill_params_nat(p, parm, tb, extack);
+ if (err)
+ return err;
+
+ if (tb[TCA_CT_MARK]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) {
+ NL_SET_ERR_MSG_MOD(extack, "Conntrack mark isn't enabled.");
+ return -EOPNOTSUPP;
+ }
+ tcf_ct_set_key_val(tb,
+ &p->mark, TCA_CT_MARK,
+ &p->mark_mask, TCA_CT_MARK_MASK,
+ sizeof(p->mark));
+ }
+
+ if (tb[TCA_CT_LABELS]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) {
+ NL_SET_ERR_MSG_MOD(extack, "Conntrack labels isn't enabled.");
+ return -EOPNOTSUPP;
+ }
+
+ if (!tn->labels) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to set connlabel length");
+ return -EOPNOTSUPP;
+ }
+ tcf_ct_set_key_val(tb,
+ p->labels, TCA_CT_LABELS,
+ p->labels_mask, TCA_CT_LABELS_MASK,
+ sizeof(p->labels));
+ }
+
+ if (tb[TCA_CT_ZONE]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) {
+ NL_SET_ERR_MSG_MOD(extack, "Conntrack zones isn't enabled.");
+ return -EOPNOTSUPP;
+ }
+
+ tcf_ct_set_key_val(tb,
+ &p->zone, TCA_CT_ZONE,
+ NULL, TCA_CT_UNSPEC,
+ sizeof(p->zone));
+ }
+
+ /* The default zone needs no template. */
+ if (p->zone == NF_CT_DEFAULT_ZONE_ID)
+ return 0;
+
+ nf_ct_zone_init(&zone, p->zone, NF_CT_DEFAULT_ZONE_DIR, 0);
+ tmpl = nf_ct_tmpl_alloc(net, &zone, GFP_KERNEL);
+ if (!tmpl) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template");
+ return -ENOMEM;
+ }
+ __set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
+ nf_conntrack_get(&tmpl->ct_general);
+ p->tmpl = tmpl;
+
+ return 0;
+}
+
+/* Create a ct action or replace the parameters of an existing one.
+ *
+ * The nested attributes in @nla are validated against ct_policy, a new
+ * struct tcf_ct_params is built from them and swapped into the action under
+ * tcf_lock. The previous parameters, if any, are released after an RCU grace
+ * period.
+ *
+ * Returns ACT_P_CREATED when a new action was created, 0 when an existing one
+ * was bound or replaced, or a negative errno.
+ */
+static int tcf_ct_init(struct net *net, struct nlattr *nla,
+		       struct nlattr *est, struct tc_action **a,
+		       int replace, int bind, bool rtnl_held,
+		       struct tcf_proto *tp,
+		       struct netlink_ext_ack *extack)
+{
+	struct tc_action_net *tn = net_generic(net, ct_net_id);
+	struct tcf_ct_params *params = NULL;
+	struct nlattr *tb[TCA_CT_MAX + 1];
+	struct tcf_chain *goto_ch = NULL;
+	struct tc_ct *parm;
+	struct tcf_ct *c;
+	int err, res = 0;
+	u32 index;
+
+	if (!nla) {
+		NL_SET_ERR_MSG_MOD(extack, "Ct requires attributes to be passed");
+		return -EINVAL;
+	}
+
+	err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_CT_PARMS]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing required ct parameters");
+		return -EINVAL;
+	}
+	parm = nla_data(tb[TCA_CT_PARMS]);
+	index = parm->index;
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
+	if (err < 0)
+		return err;
+
+	if (!err) {
+		err = tcf_idr_create(tn, index, est, a,
+				     &act_ct_ops, bind, true);
+		if (err) {
+			tcf_idr_cleanup(tn, index);
+			return err;
+		}
+		res = ACT_P_CREATED;
+	} else {
+		if (bind)
+			return 0;
+
+		if (!replace) {
+			tcf_idr_release(*a, bind);
+			return -EEXIST;
+		}
+	}
+	err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+	if (err < 0)
+		goto cleanup;
+
+	c = to_ct(*a);
+
+	params = kzalloc(sizeof(*params), GFP_KERNEL);
+	if (unlikely(!params)) {
+		err = -ENOMEM;
+		goto cleanup;
+	}
+
+	err = tcf_ct_fill_params(net, params, parm, tb, extack);
+	if (err)
+		goto cleanup;
+
+	/* After the swap @params holds the previous parameters (or NULL). */
+	spin_lock_bh(&c->tcf_lock);
+	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+	rcu_swap_protected(c->params, params, lockdep_is_held(&c->tcf_lock));
+	spin_unlock_bh(&c->tcf_lock);
+
+	if (goto_ch)
+		tcf_chain_put_by_act(goto_ch);
+	/* The old parameters may own a conntrack template reference, so they
+	 * must go through tcf_ct_params_free() rather than a bare kfree.
+	 */
+	if (params)
+		call_rcu(&params->rcu, tcf_ct_params_free);
+	if (res == ACT_P_CREATED)
+		tcf_idr_insert(tn, *a);
+
+	return res;
+
+cleanup:
+	if (goto_ch)
+		tcf_chain_put_by_act(goto_ch);
+	/* @params was never published here and tcf_ct_fill_params() attaches
+	 * the template only as its last step, so a plain kfree is enough.
+	 */
+	kfree(params);
+	tcf_idr_release(*a, bind);
+	return err;
+}
+
+/* Release the parameters still attached to action @a, if any, by queueing
+ * them for tcf_ct_params_free() after an RCU grace period.
+ */
+static void tcf_ct_cleanup(struct tc_action *a)
+{
+	struct tcf_ct_params *params;
+	struct tcf_ct *c = to_ct(a);
+
+	params = rcu_dereference_protected(c->params, 1);
+	if (params)
+		call_rcu(&params->rcu, tcf_ct_params_free);
+}
+
+/* Put a value attribute and, optionally, its mask attribute into @skb.
+ *
+ * Nothing is emitted when @mask is non-NULL and consists only of zero bytes.
+ * The mask attribute is emitted only when @mask_type is not TCA_CT_UNSPEC.
+ * Returns 0 on success or the error returned by nla_put().
+ */
+static int tcf_ct_dump_key_val(struct sk_buff *skb,
+			       void *val, int val_type,
+			       void *mask, int mask_type,
+			       int len)
+{
+	int ret;
+
+	/* memchr_inv() finds no non-zero byte: the mask is all zeroes. */
+	if (mask && memchr_inv(mask, 0, len) == NULL)
+		return 0;
+
+	ret = nla_put(skb, val_type, len, val);
+	if (ret)
+		return ret;
+
+	if (mask_type == TCA_CT_UNSPEC)
+		return 0;
+
+	return nla_put(skb, mask_type, len, mask);
+}
+
+/* Emit the NAT related attributes of @p into @skb.
+ *
+ * Nothing is emitted unless TCA_CT_ACT_NAT and at least one of the
+ * TCA_CT_ACT_NAT_SRC / TCA_CT_ACT_NAT_DST bits are set in p->ct_action.
+ * Returns 0 on success and -1 as soon as one attribute cannot be added.
+ */
+static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p)
+{
+	struct nf_nat_range2 *r = &p->range;
+
+	if (!(p->ct_action & TCA_CT_ACT_NAT) ||
+	    !(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
+		return 0;
+
+	/* Address range: IPv4 or IPv6 attributes depending on p->ipv4_range. */
+	if (r->flags & NF_NAT_RANGE_MAP_IPS) {
+		if (p->ipv4_range) {
+			if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MIN,
+					    r->min_addr.ip) ||
+			    nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MAX,
+					    r->max_addr.ip))
+				return -1;
+		} else {
+			if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MIN,
+					     &r->min_addr.in6) ||
+			    nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MAX,
+					     &r->max_addr.in6))
+				return -1;
+		}
+	}
+
+	/* Port range. */
+	if (r->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
+		if (nla_put_be16(skb, TCA_CT_NAT_PORT_MIN,
+				 r->min_proto.all) ||
+		    nla_put_be16(skb, TCA_CT_NAT_PORT_MAX,
+				 r->max_proto.all))
+			return -1;
+	}
+
+	return 0;
+}
+
+/* Dump callback of the ct action: append this action's configuration to @skb
+ * as netlink attributes. @bind and @ref are subtracted from the bind and
+ * reference counters reported in the TCA_CT_PARMS attribute.
+ * Returns skb->len on success; on any attribute failure the skb is trimmed
+ * back to its tail position on entry and -1 is returned.
+ */
+static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ /* Tail on entry, used by nlmsg_trim() on the failure path. */
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_ct *c = to_ct(a);
+ struct tcf_ct_params *p;
+
+ struct tc_ct opt = {
+ .index = c->tcf_index,
+ .refcnt = refcount_read(&c->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&c->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ /* tcf_lock is held for the whole dump of params, action and timestamps. */
+ spin_lock_bh(&c->tcf_lock);
+ p = rcu_dereference_protected(c->params,
+ lockdep_is_held(&c->tcf_lock));
+ opt.action = c->tcf_action;
+
+ if (tcf_ct_dump_key_val(skb,
+ &p->ct_action, TCA_CT_ACTION,
+ NULL, TCA_CT_UNSPEC,
+ sizeof(p->ct_action)))
+ goto nla_put_failure;
+
+ /* With TCA_CT_ACT_CLEAR set, mark, labels, zone and NAT are not dumped. */
+ if (p->ct_action & TCA_CT_ACT_CLEAR)
+ goto skip_dump;
+
+ /* Each of mark, labels and zone is dumped only if its CONFIG_ option is on. */
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
+ tcf_ct_dump_key_val(skb,
+ &p->mark, TCA_CT_MARK,
+ &p->mark_mask, TCA_CT_MARK_MASK,
+ sizeof(p->mark)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
+ tcf_ct_dump_key_val(skb,
+ p->labels, TCA_CT_LABELS,
+ p->labels_mask, TCA_CT_LABELS_MASK,
+ sizeof(p->labels)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
+ tcf_ct_dump_key_val(skb,
+ &p->zone, TCA_CT_ZONE,
+ NULL, TCA_CT_UNSPEC,
+ sizeof(p->zone)))
+ goto nla_put_failure;
+
+ if (tcf_ct_dump_nat(skb, p))
+ goto nla_put_failure;
+
+skip_dump:
+ if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &c->tcf_tm);
+ if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD))
+ goto nla_put_failure;
+ spin_unlock_bh(&c->tcf_lock);
+
+ return skb->len;
+nla_put_failure:
+ spin_unlock_bh(&c->tcf_lock);
+ /* Drop everything this function appended. */
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+/* Walk callback: hand the per-netns ct action table to tcf_generic_walker(). */
+static int tcf_ct_walker(struct net *net, struct sk_buff *skb,
+			 struct netlink_callback *cb, int type,
+			 const struct tc_action_ops *ops,
+			 struct netlink_ext_ack *extack)
+{
+	return tcf_generic_walker(net_generic(net, ct_net_id), skb, cb, type,
+				  ops, extack);
+}
+
+/* Lookup callback: search the per-netns ct action table for @index. */
+static int tcf_ct_search(struct net *net, struct tc_action **a, u32 index)
+{
+	return tcf_idr_search(net_generic(net, ct_net_id), a, index);
+}
+
+/* Stats callback: add @bytes/@packets to this CPU's counters of @a (and to
+ * the hardware counters too when @hw is set) and move the action's lastuse
+ * timestamp forward to @lastuse if that is newer.
+ */
+static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
+			     u64 lastuse, bool hw)
+{
+	struct tcf_t *tm = &to_ct(a)->tcf_tm;
+
+	_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+	if (hw)
+		_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
+				   bytes, packets);
+
+	tm->lastuse = max_t(u64, tm->lastuse, lastuse);
+}
+
+/* Operations table of the "ct" action; registered in ct_init_module() and
+ * passed to tcf_idr_create() / tc_action_net_init() elsewhere in this file.
+ */
+static struct tc_action_ops act_ct_ops = {
+ .kind = "ct",
+ .id = TCA_ID_CT,
+ .owner = THIS_MODULE,
+ .act = tcf_ct_act,
+ .dump = tcf_ct_dump,
+ .init = tcf_ct_init,
+ .cleanup = tcf_ct_cleanup,
+ .walk = tcf_ct_walker,
+ .lookup = tcf_ct_search,
+ .stats_update = tcf_stats_update,
+ .size = sizeof(struct tcf_ct),
+};
+
+/* Per-netns init of the ct action.
+ *
+ * Requests connlabels sized after tcf_ct_params.labels. A failure is logged
+ * and recorded in tn->labels (read again in ct_exit_net()) but does not fail
+ * the namespace init. Returns the result of tc_action_net_init().
+ */
+static __net_init int ct_init_net(struct net *net)
+{
+	unsigned int n_bits = FIELD_SIZEOF(struct tcf_ct_params, labels) * 8;
+	struct tc_ct_action_net *tn = net_generic(net, ct_net_id);
+
+	if (nf_connlabels_get(net, n_bits - 1)) {
+		tn->labels = false;
+		/* Newline-terminated so the record is a complete log line. */
+		pr_err("act_ct: Failed to set connlabels length\n");
+	} else {
+		tn->labels = true;
+	}
+
+	return tc_action_net_init(net, &tn->tn, &act_ct_ops);
+}
+
+/* Batched per-netns exit: under RTNL, call nf_connlabels_put() for every
+ * namespace on @net_list whose tn->labels flag is set, then tear down the
+ * action tables of all listed namespaces.
+ */
+static void __net_exit ct_exit_net(struct list_head *net_list)
+{
+	struct net *net;
+
+	rtnl_lock();
+	list_for_each_entry(net, net_list, exit_list) {
+		struct tc_ct_action_net *tn = net_generic(net, ct_net_id);
+
+		if (!tn->labels)
+			continue;
+		nf_connlabels_put(net);
+	}
+	rtnl_unlock();
+
+	tc_action_net_exit(net_list, ct_net_id);
+}
+
+/* Per-network-namespace hooks of the ct action; .size is the per-netns
+ * private area retrieved with net_generic(net, ct_net_id).
+ */
+static struct pernet_operations ct_net_ops = {
+ .init = ct_init_net,
+ .exit_batch = ct_exit_net,
+ .id = &ct_net_id,
+ .size = sizeof(struct tc_ct_action_net),
+};
+
+/* Module entry point: register the ct action ops together with its pernet ops. */
+static int __init ct_init_module(void)
+{
+ return tcf_register_action(&act_ct_ops, &ct_net_ops);
+}
+
+/* Module exit point: unregister what ct_init_module() registered. */
+static void __exit ct_cleanup_module(void)
+{
+ tcf_unregister_action(&act_ct_ops, &ct_net_ops);
+}
+
+module_init(ct_init_module);
+module_exit(ct_cleanup_module);
+MODULE_AUTHOR("Paul Blakey <paulb@mellanox.com>");
+MODULE_AUTHOR("Yossi Kuperman <yossiku@mellanox.com>");
+MODULE_AUTHOR("Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>");
+MODULE_DESCRIPTION("Connection tracking action");
+MODULE_LICENSE("GPL v2");
+
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
new file mode 100644
index 0000000..0dbcfd1
--- /dev/null
+++ b/net/sched/act_ctinfo.c
@@ -0,0 +1,408 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* net/sched/act_ctinfo.c netfilter ctinfo connmark actions
+ *
+ * Copyright (c) 2019 Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_cls.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+#include <uapi/linux/tc_act/tc_ctinfo.h>
+#include <net/tc_act/tc_ctinfo.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+static struct tc_action_ops act_ctinfo_ops;
+static unsigned int ctinfo_net_id;
+
+/* Rewrite the DSCP bits of the IPv4/IPv6 header of @skb from the conntrack
+ * mark of @ct.
+ *
+ * The new value is the mark masked with cp->dscpmask, shifted down by
+ * cp->dscpmaskshift and moved above the two ECN bits. The header is only
+ * touched when the value differs; ca->stats_dscp_set counts rewrites and
+ * ca->stats_dscp_error counts packets that could not be made writable up to
+ * @wlen bytes. Any @proto other than NFPROTO_IPV4/NFPROTO_IPV6 is ignored.
+ */
+static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
+				struct tcf_ctinfo_params *cp,
+				struct sk_buff *skb, int wlen, int proto)
+{
+	u8 dscp, newdscp;
+
+	newdscp = (((ct->mark & cp->dscpmask) >> cp->dscpmaskshift) << 2) &
+		  ~INET_ECN_MASK;
+
+	switch (proto) {
+	case NFPROTO_IPV4:
+		dscp = ipv4_get_dsfield(ip_hdr(skb)) & ~INET_ECN_MASK;
+		if (dscp == newdscp)
+			break;
+		if (unlikely(skb_try_make_writable(skb, wlen))) {
+			ca->stats_dscp_error++;
+			break;
+		}
+		/* INET_ECN_MASK as the keep-mask preserves the ECN bits. */
+		ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, newdscp);
+		ca->stats_dscp_set++;
+		break;
+	case NFPROTO_IPV6:
+		dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK;
+		if (dscp == newdscp)
+			break;
+		if (unlikely(skb_try_make_writable(skb, wlen))) {
+			ca->stats_dscp_error++;
+			break;
+		}
+		ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, newdscp);
+		ca->stats_dscp_set++;
+		break;
+	default:
+		break;
+	}
+}
+
+/* Copy the conntrack mark of @ct, filtered through cp->cpmarkmask, into
+ * skb->mark and count the operation in ca->stats_cpmark_set.
+ */
+static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
+				  struct tcf_ctinfo_params *cp,
+				  struct sk_buff *skb)
+{
+	skb->mark = ct->mark & cp->cpmarkmask;
+	ca->stats_cpmark_set++;
+}
+
+/* Per-packet handler of the ctinfo action.
+ *
+ * Updates the action's lastuse timestamp and byte/packet stats, finds the
+ * conntrack entry of @skb and, depending on cp->mode, restores DSCP and/or
+ * skb->mark from the conntrack mark. Packets that are neither IPv4 nor IPv6,
+ * whose IP header cannot be pulled, or for which no conntrack entry is found
+ * are left untouched. The configured control action is returned in all cases.
+ */
+static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ const struct nf_conntrack_tuple_hash *thash = NULL;
+ struct tcf_ctinfo *ca = to_ctinfo(a);
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_zone zone;
+ enum ip_conntrack_info ctinfo;
+ struct tcf_ctinfo_params *cp;
+ struct nf_conn *ct;
+ int proto, wlen;
+ int action;
+
+ cp = rcu_dereference_bh(ca->params);
+
+ tcf_lastuse_update(&ca->tcf_tm);
+ bstats_update(&ca->tcf_bstats, skb);
+ action = READ_ONCE(ca->tcf_action);
+
+ /* wlen: network header offset plus the fixed IPv4/IPv6 header size. */
+ wlen = skb_network_offset(skb);
+ if (tc_skb_protocol(skb) == htons(ETH_P_IP)) {
+ wlen += sizeof(struct iphdr);
+ if (!pskb_may_pull(skb, wlen))
+ goto out;
+
+ proto = NFPROTO_IPV4;
+ } else if (tc_skb_protocol(skb) == htons(ETH_P_IPV6)) {
+ wlen += sizeof(struct ipv6hdr);
+ if (!pskb_may_pull(skb, wlen))
+ goto out;
+
+ proto = NFPROTO_IPV6;
+ } else {
+ goto out;
+ }
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct) { /* look harder, usually ingress */
+ /* No entry attached to the skb: build the tuple from the packet
+ * and look it up in zone cp->zone of namespace cp->net.
+ */
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+ proto, cp->net, &tuple))
+ goto out;
+ zone.id = cp->zone;
+ zone.dir = NF_CT_DEFAULT_ZONE_DIR;
+
+ thash = nf_conntrack_find_get(cp->net, &zone, &tuple);
+ if (!thash)
+ goto out;
+
+ ct = nf_ct_tuplehash_to_ctrack(thash);
+ }
+
+ /* DSCP restore is skipped when a statemask is configured and none of
+ * its bits are set in the conntrack mark.
+ */
+ if (cp->mode & CTINFO_MODE_DSCP)
+ if (!cp->dscpstatemask || (ct->mark & cp->dscpstatemask))
+ tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto);
+
+ if (cp->mode & CTINFO_MODE_CPMARK)
+ tcf_ctinfo_cpmark_set(ct, ca, cp, skb);
+
+ /* Only an entry obtained through the lookup above (thash set) is put
+ * here; presumably nf_conntrack_find_get() took that reference.
+ */
+ if (thash)
+ nf_ct_put(ct);
+out:
+ return action;
+}
+
+/* Netlink attribute policy used by tcf_ctinfo_init(): TCA_CTINFO_ACT must be
+ * exactly one struct tc_ctinfo, the zone is a u16 and the masks are u32.
+ */
+static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = {
+	[TCA_CTINFO_ACT] = {
+		.type = NLA_EXACT_LEN,
+		.len = sizeof(struct tc_ctinfo),
+	},
+	[TCA_CTINFO_ZONE] = { .type = NLA_U16 },
+	[TCA_CTINFO_PARMS_DSCP_MASK] = { .type = NLA_U32 },
+	[TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 },
+	[TCA_CTINFO_PARMS_CPMARK_MASK] = { .type = NLA_U32 },
+};
+
+/* Init callback of the ctinfo action: create a new action or replace the
+ * parameters of an existing one from the nested attributes in @nla.
+ *
+ * Returns ACT_P_CREATED when a new action was created, 0 when an existing
+ * action was updated (or merely bound to, when @bind is set), -EEXIST when
+ * the index already exists and @ovr is not set, and another negative errno
+ * on validation or allocation failure.
+ */
+static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+ u32 dscpmask = 0, dscpstatemask, index;
+ struct nlattr *tb[TCA_CTINFO_MAX + 1];
+ struct tcf_ctinfo_params *cp_new;
+ struct tcf_chain *goto_ch = NULL;
+ struct tc_ctinfo *actparm;
+ struct tcf_ctinfo *ci;
+ u8 dscpmaskshift;
+ int ret = 0, err;
+
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "ctinfo requires attributes to be passed");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_CTINFO_MAX, nla, ctinfo_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_CTINFO_ACT]) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Missing required TCA_CTINFO_ACT attribute");
+ return -EINVAL;
+ }
+ actparm = nla_data(tb[TCA_CTINFO_ACT]);
+
+ /* do some basic validation here before dynamically allocating things */
+ /* that we would otherwise have to clean up. */
+ if (tb[TCA_CTINFO_PARMS_DSCP_MASK]) {
+ dscpmask = nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_MASK]);
+ /* need contiguous 6 bit mask */
+ dscpmaskshift = dscpmask ? __ffs(dscpmask) : 0;
+ if ((~0 & (dscpmask >> dscpmaskshift)) != 0x3f) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_CTINFO_PARMS_DSCP_MASK],
+ "dscp mask must be 6 contiguous bits");
+ return -EINVAL;
+ }
+ dscpstatemask = tb[TCA_CTINFO_PARMS_DSCP_STATEMASK] ?
+ nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK]) : 0;
+ /* mask & statemask must not overlap */
+ if (dscpmask & dscpstatemask) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_CTINFO_PARMS_DSCP_STATEMASK],
+ "dscp statemask must not overlap dscp mask");
+ return -EINVAL;
+ }
+ }
+
+ /* done the validation:now to the actual action allocation */
+ index = actparm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (!err) {
+ /* err == 0: no action with this index yet, create one. */
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_ctinfo_ops, bind, false);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+ ret = ACT_P_CREATED;
+ } else if (err > 0) {
+ /* err > 0: an action with this index already exists. */
+ if (bind) /* don't override defaults */
+ return 0;
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+ } else {
+ return err;
+ }
+
+ err = tcf_action_check_ctrlact(actparm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
+ ci = to_ctinfo(*a);
+
+ cp_new = kzalloc(sizeof(*cp_new), GFP_KERNEL);
+ if (unlikely(!cp_new)) {
+ err = -ENOMEM;
+ goto put_chain;
+ }
+
+ cp_new->net = net;
+ cp_new->zone = tb[TCA_CTINFO_ZONE] ?
+ nla_get_u16(tb[TCA_CTINFO_ZONE]) : 0;
+ /* dscpmaskshift/dscpstatemask are only set when a DSCP mask attribute
+ * was given, which is also the only way dscpmask can be non-zero.
+ */
+ if (dscpmask) {
+ cp_new->dscpmask = dscpmask;
+ cp_new->dscpmaskshift = dscpmaskshift;
+ cp_new->dscpstatemask = dscpstatemask;
+ cp_new->mode |= CTINFO_MODE_DSCP;
+ }
+
+ if (tb[TCA_CTINFO_PARMS_CPMARK_MASK]) {
+ cp_new->cpmarkmask =
+ nla_get_u32(tb[TCA_CTINFO_PARMS_CPMARK_MASK]);
+ cp_new->mode |= CTINFO_MODE_CPMARK;
+ }
+
+ /* Install the new control action and params under tcf_lock; after the
+ * swap, goto_ch and cp_new hold the previous values (if any).
+ */
+ spin_lock_bh(&ci->tcf_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch);
+ rcu_swap_protected(ci->params, cp_new,
+ lockdep_is_held(&ci->tcf_lock));
+ spin_unlock_bh(&ci->tcf_lock);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ if (cp_new)
+ kfree_rcu(cp_new, rcu);
+
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+
+ return ret;
+
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
+}
+
+/* Dump callback of the ctinfo action: append timestamps, the tc_ctinfo
+ * header, zone, the configured masks and the three statistics counters to
+ * @skb. @bind and @ref are subtracted from the reported bind and reference
+ * counters. Returns skb->len on success; on any attribute failure the skb is
+ * trimmed back to its tail position on entry and -1 is returned.
+ */
+static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ struct tcf_ctinfo *ci = to_ctinfo(a);
+ struct tc_ctinfo opt = {
+ .index = ci->tcf_index,
+ .refcnt = refcount_read(&ci->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
+ };
+ /* Tail on entry, used by nlmsg_trim() on the failure path. */
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_ctinfo_params *cp;
+ struct tcf_t t;
+
+ /* tcf_lock is held until all attributes have been added. */
+ spin_lock_bh(&ci->tcf_lock);
+ cp = rcu_dereference_protected(ci->params,
+ lockdep_is_held(&ci->tcf_lock));
+
+ tcf_tm_dump(&t, &ci->tcf_tm);
+ if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ opt.action = ci->tcf_action;
+ if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, TCA_CTINFO_ZONE, cp->zone))
+ goto nla_put_failure;
+
+ /* Mask attributes are only dumped for the modes that are enabled. */
+ if (cp->mode & CTINFO_MODE_DSCP) {
+ if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_MASK,
+ cp->dscpmask))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_STATEMASK,
+ cp->dscpstatemask))
+ goto nla_put_failure;
+ }
+
+ if (cp->mode & CTINFO_MODE_CPMARK) {
+ if (nla_put_u32(skb, TCA_CTINFO_PARMS_CPMARK_MASK,
+ cp->cpmarkmask))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET,
+ ci->stats_dscp_set, TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR,
+ ci->stats_dscp_error, TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET,
+ ci->stats_cpmark_set, TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ spin_unlock_bh(&ci->tcf_lock);
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&ci->tcf_lock);
+ /* Drop everything this function appended. */
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+/* Walk callback: hand the per-netns ctinfo action table to
+ * tcf_generic_walker().
+ */
+static int tcf_ctinfo_walker(struct net *net, struct sk_buff *skb,
+			     struct netlink_callback *cb, int type,
+			     const struct tc_action_ops *ops,
+			     struct netlink_ext_ack *extack)
+{
+	return tcf_generic_walker(net_generic(net, ctinfo_net_id), skb, cb,
+				  type, ops, extack);
+}
+
+/* Lookup callback: search the per-netns ctinfo action table for @index. */
+static int tcf_ctinfo_search(struct net *net, struct tc_action **a, u32 index)
+{
+	return tcf_idr_search(net_generic(net, ctinfo_net_id), a, index);
+}
+
+/* Action destructor: free the parameter block still attached to @a.
+ *
+ * tcf_ctinfo_init() only frees the block it replaces, so without this
+ * callback the last installed tcf_ctinfo_params is never released when the
+ * action is destroyed. The free is deferred with kfree_rcu(), matching how
+ * tcf_ctinfo_init() disposes of replaced blocks.
+ */
+static void tcf_ctinfo_cleanup(struct tc_action *a)
+{
+	struct tcf_ctinfo *ci = to_ctinfo(a);
+	struct tcf_ctinfo_params *cp;
+
+	/* The lockdep condition is hard-wired to 1: no lock is checked here. */
+	cp = rcu_dereference_protected(ci->params, 1);
+	if (cp)
+		kfree_rcu(cp, rcu);
+}
+
+/* Operations table of the "ctinfo" action; registered in
+ * ctinfo_init_module().
+ */
+static struct tc_action_ops act_ctinfo_ops = {
+	.kind = "ctinfo",
+	.id = TCA_ID_CTINFO,
+	.owner = THIS_MODULE,
+	.act = tcf_ctinfo_act,
+	.dump = tcf_ctinfo_dump,
+	.init = tcf_ctinfo_init,
+	.cleanup = tcf_ctinfo_cleanup,
+	.walk = tcf_ctinfo_walker,
+	.lookup = tcf_ctinfo_search,
+	.size = sizeof(struct tcf_ctinfo),
+};
+
+/* Per-netns init: set up the ctinfo action table of @net. */
+static __net_init int ctinfo_init_net(struct net *net)
+{
+	return tc_action_net_init(net, net_generic(net, ctinfo_net_id),
+				  &act_ctinfo_ops);
+}
+
+/* Batched per-netns exit: tear down the ctinfo action tables of every
+ * namespace on @net_list.
+ */
+static void __net_exit ctinfo_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, ctinfo_net_id);
+}
+
+/* Per-network-namespace hooks of the ctinfo action; .size is the per-netns
+ * private area retrieved with net_generic(net, ctinfo_net_id).
+ */
+static struct pernet_operations ctinfo_net_ops = {
+ .init = ctinfo_init_net,
+ .exit_batch = ctinfo_exit_net,
+ .id = &ctinfo_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+/* Module entry point: register the ctinfo action ops together with its
+ * pernet ops.
+ */
+static int __init ctinfo_init_module(void)
+{
+ return tcf_register_action(&act_ctinfo_ops, &ctinfo_net_ops);
+}
+
+/* Module exit point: unregister what ctinfo_init_module() registered. */
+static void __exit ctinfo_cleanup_module(void)
+{
+ tcf_unregister_action(&act_ctinfo_ops, &ctinfo_net_ops);
+}
+
+module_init(ctinfo_init_module);
+module_exit(ctinfo_cleanup_module);
+MODULE_AUTHOR("Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>");
+MODULE_DESCRIPTION("Connection tracking mark actions");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index cd1d9bd..324f1d1 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_gact.c Generic actions
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* copyright Jamal Hadi Salim (2002-4)
- *
*/
#include <linux/types.h>
@@ -20,6 +15,7 @@
#include <linux/init.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
#include <linux/tc_act/tc_gact.h>
#include <net/tc_act/tc_gact.h>
@@ -57,13 +53,15 @@
static int tcf_gact_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, gact_net_id);
struct nlattr *tb[TCA_GACT_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tc_gact *parm;
struct tcf_gact *gact;
int ret = 0;
+ u32 index;
int err;
#ifdef CONFIG_GACT_PROB
struct tc_gact_p *p_parm = NULL;
@@ -72,13 +70,15 @@
if (nla == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_GACT_MAX, nla, gact_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_GACT_MAX, nla, gact_policy,
+ NULL);
if (err < 0)
return err;
if (tb[TCA_GACT_PARMS] == NULL)
return -EINVAL;
parm = nla_data(tb[TCA_GACT_PARMS]);
+ index = parm->index;
#ifndef CONFIG_GACT_PROB
if (tb[TCA_GACT_PROB] != NULL)
@@ -88,15 +88,20 @@
p_parm = nla_data(tb[TCA_GACT_PROB]);
if (p_parm->ptype >= MAX_RAND)
return -EINVAL;
+ if (TC_ACT_EXT_CMP(p_parm->paction, TC_ACT_GOTO_CHAIN)) {
+ NL_SET_ERR_MSG(extack,
+ "goto chain not allowed on fallback");
+ return -EINVAL;
+ }
}
#endif
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_gact_ops, bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
@@ -111,10 +116,13 @@
return err;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
gact = to_gact(*a);
spin_lock_bh(&gact->tcf_lock);
- gact->tcf_action = parm->action;
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
#ifdef CONFIG_GACT_PROB
if (p_parm) {
gact->tcfg_paction = p_parm->paction;
@@ -128,9 +136,15 @@
#endif
spin_unlock_bh(&gact->tcf_lock);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
return ret;
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a,
@@ -157,7 +171,7 @@
}
static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
- u64 lastuse)
+ u64 lastuse, bool hw)
{
struct tcf_gact *gact = to_gact(a);
int action = READ_ONCE(gact->tcf_action);
@@ -168,6 +182,10 @@
if (action == TC_ACT_SHOT)
this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
+ if (hw)
+ _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats_hw),
+ bytes, packets);
+
tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
@@ -222,8 +240,7 @@
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, gact_net_id);
@@ -245,7 +262,7 @@
static struct tc_action_ops act_gact_ops = {
.kind = "gact",
- .type = TCA_ACT_GACT,
+ .id = TCA_ID_GACT,
.owner = THIS_MODULE,
.act = tcf_gact_act,
.stats_update = tcf_gact_stats_update,
@@ -261,7 +278,7 @@
{
struct tc_action_net *tn = net_generic(net, gact_net_id);
- return tc_action_net_init(tn, &act_gact_ops);
+ return tc_action_net_init(net, tn, &act_gact_ops);
}
static void __net_exit gact_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 06a3d48..3a31e24 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/ife.c Inter-FE action based on ForCES WG InterFE LFB
*
@@ -9,13 +10,7 @@
* Subsystem"
* Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* copyright Jamal Hadi Salim (2015)
- *
*/
#include <linux/types.h>
@@ -29,6 +24,7 @@
#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
#include <uapi/linux/tc_act/tc_ife.h>
#include <net/tc_act/tc_ife.h>
#include <linux/etherdevice.h>
@@ -386,7 +382,7 @@
if (list_empty(&ife->metalist))
return 0;
- nest = nla_nest_start(skb, TCA_IFE_METALST);
+ nest = nla_nest_start_noflag(skb, TCA_IFE_METALST);
if (!nest)
goto out_nlmsg_trim;
@@ -469,11 +465,12 @@
static int tcf_ife_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, ife_net_id);
struct nlattr *tb[TCA_IFE_MAX + 1];
struct nlattr *tb2[IFE_META_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tcf_ife_params *p;
struct tcf_ife_info *ife;
u16 ife_type = ETH_P_IFE;
@@ -482,9 +479,16 @@
u8 *saddr = NULL;
bool exists = false;
int ret = 0;
+ u32 index;
int err;
- err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy, NULL);
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "IFE requires attributes to be passed");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested_deprecated(tb, TCA_IFE_MAX, nla, ife_policy,
+ NULL);
if (err < 0)
return err;
@@ -504,7 +508,8 @@
if (!p)
return -ENOMEM;
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0) {
kfree(p);
return err;
@@ -516,10 +521,10 @@
}
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops,
+ ret = tcf_idr_create(tn, index, est, a, &act_ife_ops,
bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
kfree(p);
return ret;
}
@@ -531,6 +536,10 @@
}
ife = to_ife(*a);
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
p->flags = parm->flags;
if (parm->flags & IFE_ENCODE) {
@@ -561,15 +570,11 @@
INIT_LIST_HEAD(&ife->metalist);
if (tb[TCA_IFE_METALST]) {
- err = nla_parse_nested(tb2, IFE_META_MAX, tb[TCA_IFE_METALST],
- NULL, NULL);
- if (err) {
-metadata_parse_err:
- tcf_idr_release(*a, bind);
- kfree(p);
- return err;
- }
-
+ err = nla_parse_nested_deprecated(tb2, IFE_META_MAX,
+ tb[TCA_IFE_METALST], NULL,
+ NULL);
+ if (err)
+ goto metadata_parse_err;
err = populate_metalist(ife, tb2, exists, rtnl_held);
if (err)
goto metadata_parse_err;
@@ -581,21 +586,20 @@
* going to bail out
*/
err = use_all_metadata(ife, exists);
- if (err) {
- tcf_idr_release(*a, bind);
- kfree(p);
- return err;
- }
+ if (err)
+ goto metadata_parse_err;
}
if (exists)
spin_lock_bh(&ife->tcf_lock);
- ife->tcf_action = parm->action;
/* protected by tcf_lock when modifying existing action */
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
rcu_swap_protected(ife->params, p, 1);
if (exists)
spin_unlock_bh(&ife->tcf_lock);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
if (p)
kfree_rcu(p, rcu);
@@ -603,6 +607,13 @@
tcf_idr_insert(tn, *a);
return ret;
+metadata_parse_err:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ kfree(p);
+ tcf_idr_release(*a, bind);
+ return err;
}
static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
@@ -855,8 +866,7 @@
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, ife_net_id);
@@ -865,7 +875,7 @@
static struct tc_action_ops act_ife_ops = {
.kind = "ife",
- .type = TCA_ACT_IFE,
+ .id = TCA_ID_IFE,
.owner = THIS_MODULE,
.act = tcf_ife_act,
.dump = tcf_ife_dump,
@@ -880,7 +890,7 @@
{
struct tc_action_net *tn = net_generic(net, ife_net_id);
- return tc_action_net_init(tn, &act_ife_ops);
+ return tc_action_net_init(net, tn, &act_ife_ops);
}
static void __net_exit ife_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 8525de8..214a03d 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_ipt.c iptables target interface
*
*TODO: Add other tables. For now we only support the ipv4 table targets
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Copyright: Jamal Hadi Salim (2002-13)
*/
@@ -65,12 +61,13 @@
return 0;
}
-static void ipt_destroy_target(struct xt_entry_target *t)
+static void ipt_destroy_target(struct xt_entry_target *t, struct net *net)
{
struct xt_tgdtor_param par = {
.target = t->u.kernel.target,
.targinfo = t->data,
.family = NFPROTO_IPV4,
+ .net = net,
};
if (par.target->destroy != NULL)
par.target->destroy(&par);
@@ -82,7 +79,7 @@
struct tcf_ipt *ipt = to_ipt(a);
if (ipt->tcfi_t) {
- ipt_destroy_target(ipt->tcfi_t);
+ ipt_destroy_target(ipt->tcfi_t, a->idrinfo->net);
kfree(ipt->tcfi_t);
}
kfree(ipt->tcfi_tname);
@@ -97,7 +94,8 @@
static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- const struct tc_action_ops *ops, int ovr, int bind)
+ const struct tc_action_ops *ops, int ovr, int bind,
+ struct tcf_proto *tp)
{
struct tc_action_net *tn = net_generic(net, id);
struct nlattr *tb[TCA_IPT_MAX + 1];
@@ -112,7 +110,8 @@
if (nla == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_IPT_MAX, nla, ipt_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_IPT_MAX, nla, ipt_policy,
+ NULL);
if (err < 0)
return err;
@@ -182,7 +181,7 @@
spin_lock_bh(&ipt->tcf_lock);
if (ret != ACT_P_CREATED) {
- ipt_destroy_target(ipt->tcfi_t);
+ ipt_destroy_target(ipt->tcfi_t, net);
kfree(ipt->tcfi_tname);
kfree(ipt->tcfi_t);
}
@@ -199,27 +198,26 @@
err2:
kfree(tname);
err1:
- if (ret == ACT_P_CREATED)
- tcf_idr_release(*a, bind);
+ tcf_idr_release(*a, bind);
return err;
}
static int tcf_ipt_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
- int bind, bool rtnl_held,
+ int bind, bool rtnl_held, struct tcf_proto *tp,
struct netlink_ext_ack *extack)
{
return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr,
- bind);
+ bind, tp);
}
static int tcf_xt_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
- int bind, bool unlocked,
+ int bind, bool unlocked, struct tcf_proto *tp,
struct netlink_ext_ack *extack)
{
return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr,
- bind);
+ bind, tp);
}
static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a,
@@ -329,8 +327,7 @@
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, ipt_net_id);
@@ -339,7 +336,7 @@
static struct tc_action_ops act_ipt_ops = {
.kind = "ipt",
- .type = TCA_ACT_IPT,
+ .id = TCA_ID_IPT,
.owner = THIS_MODULE,
.act = tcf_ipt_act,
.dump = tcf_ipt_dump,
@@ -354,7 +351,7 @@
{
struct tc_action_net *tn = net_generic(net, ipt_net_id);
- return tc_action_net_init(tn, &act_ipt_ops);
+ return tc_action_net_init(net, tn, &act_ipt_ops);
}
static void __net_exit ipt_exit_net(struct list_head *net_list)
@@ -379,8 +376,7 @@
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, xt_net_id);
@@ -389,7 +385,7 @@
static struct tc_action_ops act_xt_ops = {
.kind = "xt",
- .type = TCA_ACT_XT,
+ .id = TCA_ID_XT,
.owner = THIS_MODULE,
.act = tcf_ipt_act,
.dump = tcf_ipt_dump,
@@ -404,7 +400,7 @@
{
struct tc_action_net *tn = net_generic(net, xt_net_id);
- return tc_action_net_init(tn, &act_xt_ops);
+ return tc_action_net_init(net, tn, &act_xt_ops);
}
static void __net_exit xt_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_meta_mark.c b/net/sched/act_meta_mark.c
index 6445184..ea0573c 100644
--- a/net/sched/act_meta_mark.c
+++ b/net/sched/act_meta_mark.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_meta_mark.c IFE skb->mark metadata module
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* copyright Jamal Hadi Salim (2015)
- *
*/
#include <linux/types.h>
diff --git a/net/sched/act_meta_skbprio.c b/net/sched/act_meta_skbprio.c
index 4033f9f..2df3133 100644
--- a/net/sched/act_meta_skbprio.c
+++ b/net/sched/act_meta_skbprio.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_meta_prio.c IFE skb->priority metadata module
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* copyright Jamal Hadi Salim (2015)
- *
*/
#include <linux/types.h>
diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c
index 7221437..44547ca 100644
--- a/net/sched/act_meta_skbtcindex.c
+++ b/net/sched/act_meta_skbtcindex.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_meta_tc_index.c IFE skb->tc_index metadata module
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* copyright Jamal Hadi Salim (2016)
- *
*/
#include <linux/types.h>
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 8bf66d0..08923b2 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -1,15 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_mirred.c packet mirroring and redirect actions
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Jamal Hadi Salim (2002-4)
*
* TODO: Add ingress support (and socket redirect support)
- *
*/
#include <linux/types.h>
@@ -32,6 +27,9 @@
static LIST_HEAD(mirred_list);
static DEFINE_SPINLOCK(mirred_list_lock);
+#define MIRRED_RECURSION_LIMIT 4
+static DEFINE_PER_CPU(unsigned int, mirred_rec_level);
+
static bool tcf_mirred_is_act_redirect(int action)
{
return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR;
@@ -94,22 +92,26 @@
static int tcf_mirred_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, mirred_net_id);
struct nlattr *tb[TCA_MIRRED_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
bool mac_header_xmit = false;
struct tc_mirred *parm;
struct tcf_mirred *m;
struct net_device *dev;
bool exists = false;
int ret, err;
+ u32 index;
if (!nla) {
NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed");
return -EINVAL;
}
- ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy, extack);
+ ret = nla_parse_nested_deprecated(tb, TCA_MIRRED_MAX, nla,
+ mirred_policy, extack);
if (ret < 0)
return ret;
if (!tb[TCA_MIRRED_PARMS]) {
@@ -117,8 +119,8 @@
return -EINVAL;
}
parm = nla_data(tb[TCA_MIRRED_PARMS]);
-
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
@@ -135,21 +137,21 @@
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option");
return -EINVAL;
}
if (!exists) {
if (!parm->ifindex) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist");
return -EINVAL;
}
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_mirred_ops, bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
@@ -157,18 +159,23 @@
tcf_idr_release(*a, bind);
return -EEXIST;
}
+
m = to_mirred(*a);
+ if (ret == ACT_P_CREATED)
+ INIT_LIST_HEAD(&m->tcfm_list);
+
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
spin_lock_bh(&m->tcf_lock);
- m->tcf_action = parm->action;
- m->tcfm_eaction = parm->eaction;
if (parm->ifindex) {
dev = dev_get_by_index(net, parm->ifindex);
if (!dev) {
spin_unlock_bh(&m->tcf_lock);
- tcf_idr_release(*a, bind);
- return -ENODEV;
+ err = -ENODEV;
+ goto put_chain;
}
mac_header_xmit = dev_is_mac_header_xmit(dev);
rcu_swap_protected(m->tcfm_dev, dev,
@@ -177,7 +184,11 @@
dev_put(dev);
m->tcfm_mac_header_xmit = mac_header_xmit;
}
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ m->tcfm_eaction = parm->eaction;
spin_unlock_bh(&m->tcf_lock);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
if (ret == ACT_P_CREATED) {
spin_lock(&mirred_list_lock);
@@ -188,6 +199,12 @@
}
return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
@@ -197,6 +214,7 @@
struct sk_buff *skb2 = skb;
bool m_mac_header_xmit;
struct net_device *dev;
+ unsigned int rec_level;
int retval, err = 0;
bool use_reinsert;
bool want_ingress;
@@ -204,6 +222,14 @@
int m_eaction;
int mac_len;
+ rec_level = __this_cpu_inc_return(mirred_rec_level);
+ if (unlikely(rec_level > MIRRED_RECURSION_LIMIT)) {
+ net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
+ netdev_name(skb->dev));
+ __this_cpu_dec(mirred_rec_level);
+ return TC_ACT_SHOT;
+ }
+
tcf_lastuse_update(&m->tcf_tm);
bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
@@ -258,12 +284,15 @@
if (is_redirect) {
skb2->tc_redirected = 1;
skb2->tc_from_ingress = skb2->tc_at_ingress;
-
+ if (skb2->tc_from_ingress)
+ skb2->tstamp = 0;
/* let's the caller reinsert the packet, if possible */
if (use_reinsert) {
res->ingress = want_ingress;
res->qstats = this_cpu_ptr(m->common.cpu_qstats);
- return TC_ACT_REINSERT;
+ skb_tc_reinsert(skb, res);
+ __this_cpu_dec(mirred_rec_level);
+ return TC_ACT_CONSUMED;
}
}
@@ -278,17 +307,21 @@
if (tcf_mirred_is_act_redirect(m_eaction))
retval = TC_ACT_SHOT;
}
+ __this_cpu_dec(mirred_rec_level);
return retval;
}
static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
- u64 lastuse)
+ u64 lastuse, bool hw)
{
struct tcf_mirred *m = to_mirred(a);
struct tcf_t *tm = &m->tcf_tm;
_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+ if (hw)
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
+ bytes, packets);
tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
@@ -338,8 +371,7 @@
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, mirred_net_id);
@@ -376,28 +408,39 @@
.notifier_call = mirred_device_event,
};
-static struct net_device *tcf_mirred_get_dev(const struct tc_action *a)
+static void tcf_mirred_dev_put(void *priv)
+{
+ struct net_device *dev = priv;
+
+ dev_put(dev);
+}
+
+static struct net_device *
+tcf_mirred_get_dev(const struct tc_action *a,
+ tc_action_priv_destructor *destructor)
{
struct tcf_mirred *m = to_mirred(a);
struct net_device *dev;
rcu_read_lock();
dev = rcu_dereference(m->tcfm_dev);
- if (dev)
+ if (dev) {
dev_hold(dev);
+ *destructor = tcf_mirred_dev_put;
+ }
rcu_read_unlock();
return dev;
}
-static void tcf_mirred_put_dev(struct net_device *dev)
+static size_t tcf_mirred_get_fill_size(const struct tc_action *act)
{
- dev_put(dev);
+ return nla_total_size(sizeof(struct tc_mirred));
}
static struct tc_action_ops act_mirred_ops = {
.kind = "mirred",
- .type = TCA_ACT_MIRRED,
+ .id = TCA_ID_MIRRED,
.owner = THIS_MODULE,
.act = tcf_mirred_act,
.stats_update = tcf_stats_update,
@@ -406,16 +449,16 @@
.init = tcf_mirred_init,
.walk = tcf_mirred_walker,
.lookup = tcf_mirred_search,
+ .get_fill_size = tcf_mirred_get_fill_size,
.size = sizeof(struct tcf_mirred),
.get_dev = tcf_mirred_get_dev,
- .put_dev = tcf_mirred_put_dev,
};
static __net_init int mirred_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, mirred_net_id);
- return tc_action_net_init(tn, &act_mirred_ops);
+ return tc_action_net_init(net, tn, &act_mirred_ops);
}
static void __net_exit mirred_exit_net(struct list_head *net_list)
@@ -441,7 +484,11 @@
return err;
pr_info("Mirror/redirect action on\n");
- return tcf_register_action(&act_mirred_ops, &mirred_net_ops);
+ err = tcf_register_action(&act_mirred_ops, &mirred_net_ops);
+ if (err)
+ unregister_netdevice_notifier(&mirred_device_notifier);
+
+ return err;
}
static void __exit mirred_cleanup_module(void)
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
new file mode 100644
index 0000000..4cf6c55
--- /dev/null
+++ b/net/sched/act_mpls.c
@@ -0,0 +1,412 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (C) 2019 Netronome Systems, Inc. */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mpls.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/tc_act/tc_mpls.h>
+#include <net/mpls.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_mpls.h>
+
+static unsigned int mpls_net_id;
+static struct tc_action_ops act_mpls_ops;
+
+#define ACT_MPLS_TTL_DEFAULT 255
+
+static __be32 tcf_mpls_get_lse(struct mpls_shim_hdr *lse,
+ struct tcf_mpls_params *p, bool set_bos)
+{
+ u32 new_lse = 0;
+
+ if (lse)
+ new_lse = be32_to_cpu(lse->label_stack_entry);
+
+ if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET) {
+ new_lse &= ~MPLS_LS_LABEL_MASK;
+ new_lse |= p->tcfm_label << MPLS_LS_LABEL_SHIFT;
+ }
+ if (p->tcfm_ttl) {
+ new_lse &= ~MPLS_LS_TTL_MASK;
+ new_lse |= p->tcfm_ttl << MPLS_LS_TTL_SHIFT;
+ }
+ if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET) {
+ new_lse &= ~MPLS_LS_TC_MASK;
+ new_lse |= p->tcfm_tc << MPLS_LS_TC_SHIFT;
+ }
+ if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET) {
+ new_lse &= ~MPLS_LS_S_MASK;
+ new_lse |= p->tcfm_bos << MPLS_LS_S_SHIFT;
+ } else if (set_bos) {
+ new_lse |= 1 << MPLS_LS_S_SHIFT;
+ }
+
+ return cpu_to_be32(new_lse);
+}
+
+static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_mpls *m = to_mpls(a);
+ struct tcf_mpls_params *p;
+ __be32 new_lse;
+ int ret, mac_len;
+
+ tcf_lastuse_update(&m->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
+
+ /* Ensure 'data' points at mac_header prior to calling the MPLS
+ * manipulation functions.
+ */
+ if (skb_at_tc_ingress(skb)) {
+ skb_push_rcsum(skb, skb->mac_len);
+ mac_len = skb->mac_len;
+ } else {
+ mac_len = skb_network_header(skb) - skb_mac_header(skb);
+ }
+
+ ret = READ_ONCE(m->tcf_action);
+
+ p = rcu_dereference_bh(m->mpls_p);
+
+ switch (p->tcfm_action) {
+ case TCA_MPLS_ACT_POP:
+ if (skb_mpls_pop(skb, p->tcfm_proto, mac_len))
+ goto drop;
+ break;
+ case TCA_MPLS_ACT_PUSH:
+ new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb->protocol));
+ if (skb_mpls_push(skb, new_lse, p->tcfm_proto, mac_len))
+ goto drop;
+ break;
+ case TCA_MPLS_ACT_MODIFY:
+ new_lse = tcf_mpls_get_lse(mpls_hdr(skb), p, false);
+ if (skb_mpls_update_lse(skb, new_lse))
+ goto drop;
+ break;
+ case TCA_MPLS_ACT_DEC_TTL:
+ if (skb_mpls_dec_ttl(skb))
+ goto drop;
+ break;
+ }
+
+ if (skb_at_tc_ingress(skb))
+ skb_pull_rcsum(skb, skb->mac_len);
+
+ return ret;
+
+drop:
+ qstats_drop_inc(this_cpu_ptr(m->common.cpu_qstats));
+ return TC_ACT_SHOT;
+}
+
+static int valid_label(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const u32 *label = nla_data(attr);
+
+ if (*label & ~MPLS_LABEL_MASK || *label == MPLS_LABEL_IMPLNULL) {
+ NL_SET_ERR_MSG_MOD(extack, "MPLS label out of range");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = {
+ [TCA_MPLS_UNSPEC] = { .strict_start_type = TCA_MPLS_UNSPEC + 1 },
+ [TCA_MPLS_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)),
+ [TCA_MPLS_PROTO] = { .type = NLA_U16 },
+ [TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_U32, valid_label),
+ [TCA_MPLS_TC] = NLA_POLICY_RANGE(NLA_U8, 0, 7),
+ [TCA_MPLS_TTL] = NLA_POLICY_MIN(NLA_U8, 1),
+ [TCA_MPLS_BOS] = NLA_POLICY_RANGE(NLA_U8, 0, 1),
+};
+
+static int tcf_mpls_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp, struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, mpls_net_id);
+ struct nlattr *tb[TCA_MPLS_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
+ struct tcf_mpls_params *p;
+ struct tc_mpls *parm;
+ bool exists = false;
+ struct tcf_mpls *m;
+ int ret = 0, err;
+ u8 mpls_ttl = 0;
+ u32 index;
+
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing netlink attributes");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_MPLS_MAX, nla, mpls_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_MPLS_PARMS]) {
+ NL_SET_ERR_MSG_MOD(extack, "No MPLS params");
+ return -EINVAL;
+ }
+ parm = nla_data(tb[TCA_MPLS_PARMS]);
+ index = parm->index;
+
+ /* Verify parameters against action type. */
+ switch (parm->m_action) {
+ case TCA_MPLS_ACT_POP:
+ if (!tb[TCA_MPLS_PROTO]) {
+ NL_SET_ERR_MSG_MOD(extack, "Protocol must be set for MPLS pop");
+ return -EINVAL;
+ }
+ if (!eth_proto_is_802_3(nla_get_be16(tb[TCA_MPLS_PROTO]))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid protocol type for MPLS pop");
+ return -EINVAL;
+ }
+ if (tb[TCA_MPLS_LABEL] || tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] ||
+ tb[TCA_MPLS_BOS]) {
+ NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC or BOS cannot be used with MPLS pop");
+ return -EINVAL;
+ }
+ break;
+ case TCA_MPLS_ACT_DEC_TTL:
+ if (tb[TCA_MPLS_PROTO] || tb[TCA_MPLS_LABEL] ||
+ tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] || tb[TCA_MPLS_BOS]) {
+ NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC, BOS or protocol cannot be used with MPLS dec_ttl");
+ return -EINVAL;
+ }
+ break;
+ case TCA_MPLS_ACT_PUSH:
+ if (!tb[TCA_MPLS_LABEL]) {
+ NL_SET_ERR_MSG_MOD(extack, "Label is required for MPLS push");
+ return -EINVAL;
+ }
+ if (tb[TCA_MPLS_PROTO] &&
+ !eth_p_mpls(nla_get_be16(tb[TCA_MPLS_PROTO]))) {
+ NL_SET_ERR_MSG_MOD(extack, "Protocol must be an MPLS type for MPLS push");
+ return -EPROTONOSUPPORT;
+ }
+ /* Push needs a TTL - if not specified, set a default value. */
+ if (!tb[TCA_MPLS_TTL]) {
+#if IS_ENABLED(CONFIG_MPLS)
+ mpls_ttl = net->mpls.default_ttl ?
+ net->mpls.default_ttl : ACT_MPLS_TTL_DEFAULT;
+#else
+ mpls_ttl = ACT_MPLS_TTL_DEFAULT;
+#endif
+ }
+ break;
+ case TCA_MPLS_ACT_MODIFY:
+ if (tb[TCA_MPLS_PROTO]) {
+ NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be used with MPLS modify");
+ return -EINVAL;
+ }
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unknown MPLS action");
+ return -EINVAL;
+ }
+
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
+ if (exists && bind)
+ return 0;
+
+ if (!exists) {
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_mpls_ops, bind, true);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+
+ ret = ACT_P_CREATED;
+ } else if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
+ m = to_mpls(*a);
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p) {
+ err = -ENOMEM;
+ goto put_chain;
+ }
+
+ p->tcfm_action = parm->m_action;
+ p->tcfm_label = tb[TCA_MPLS_LABEL] ? nla_get_u32(tb[TCA_MPLS_LABEL]) :
+ ACT_MPLS_LABEL_NOT_SET;
+ p->tcfm_tc = tb[TCA_MPLS_TC] ? nla_get_u8(tb[TCA_MPLS_TC]) :
+ ACT_MPLS_TC_NOT_SET;
+ p->tcfm_ttl = tb[TCA_MPLS_TTL] ? nla_get_u8(tb[TCA_MPLS_TTL]) :
+ mpls_ttl;
+ p->tcfm_bos = tb[TCA_MPLS_BOS] ? nla_get_u8(tb[TCA_MPLS_BOS]) :
+ ACT_MPLS_BOS_NOT_SET;
+ p->tcfm_proto = tb[TCA_MPLS_PROTO] ? nla_get_be16(tb[TCA_MPLS_PROTO]) :
+ htons(ETH_P_MPLS_UC);
+
+ spin_lock_bh(&m->tcf_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ rcu_swap_protected(m->mpls_p, p, lockdep_is_held(&m->tcf_lock));
+ spin_unlock_bh(&m->tcf_lock);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ if (p)
+ kfree_rcu(p, rcu);
+
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+ return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
+}
+
+static void tcf_mpls_cleanup(struct tc_action *a)
+{
+ struct tcf_mpls *m = to_mpls(a);
+ struct tcf_mpls_params *p;
+
+ p = rcu_dereference_protected(m->mpls_p, 1);
+ if (p)
+ kfree_rcu(p, rcu);
+}
+
+static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_mpls *m = to_mpls(a);
+ struct tcf_mpls_params *p;
+ struct tc_mpls opt = {
+ .index = m->tcf_index,
+ .refcnt = refcount_read(&m->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&m->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ spin_lock_bh(&m->tcf_lock);
+ opt.action = m->tcf_action;
+ p = rcu_dereference_protected(m->mpls_p, lockdep_is_held(&m->tcf_lock));
+ opt.m_action = p->tcfm_action;
+
+ if (nla_put(skb, TCA_MPLS_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET &&
+ nla_put_u32(skb, TCA_MPLS_LABEL, p->tcfm_label))
+ goto nla_put_failure;
+
+ if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET &&
+ nla_put_u8(skb, TCA_MPLS_TC, p->tcfm_tc))
+ goto nla_put_failure;
+
+ if (p->tcfm_ttl && nla_put_u8(skb, TCA_MPLS_TTL, p->tcfm_ttl))
+ goto nla_put_failure;
+
+ if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET &&
+ nla_put_u8(skb, TCA_MPLS_BOS, p->tcfm_bos))
+ goto nla_put_failure;
+
+ if (nla_put_be16(skb, TCA_MPLS_PROTO, p->tcfm_proto))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &m->tcf_tm);
+
+ if (nla_put_64bit(skb, TCA_MPLS_TM, sizeof(t), &t, TCA_MPLS_PAD))
+ goto nla_put_failure;
+
+ spin_unlock_bh(&m->tcf_lock);
+
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&m->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -EMSGSIZE;
+}
+
+static int tcf_mpls_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, mpls_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_mpls_search(struct net *net, struct tc_action **a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, mpls_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_mpls_ops = {
+ .kind = "mpls",
+ .id = TCA_ID_MPLS,
+ .owner = THIS_MODULE,
+ .act = tcf_mpls_act,
+ .dump = tcf_mpls_dump,
+ .init = tcf_mpls_init,
+ .cleanup = tcf_mpls_cleanup,
+ .walk = tcf_mpls_walker,
+ .lookup = tcf_mpls_search,
+ .size = sizeof(struct tcf_mpls),
+};
+
+static __net_init int mpls_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, mpls_net_id);
+
+ return tc_action_net_init(net, tn, &act_mpls_ops);
+}
+
+static void __net_exit mpls_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, mpls_net_id);
+}
+
+static struct pernet_operations mpls_net_ops = {
+ .init = mpls_init_net,
+ .exit_batch = mpls_exit_net,
+ .id = &mpls_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init mpls_init_module(void)
+{
+ return tcf_register_action(&act_mpls_ops, &mpls_net_ops);
+}
+
+static void __exit mpls_cleanup_module(void)
+{
+ tcf_unregister_action(&act_mpls_ops, &mpls_net_ops);
+}
+
+module_init(mpls_init_module);
+module_exit(mpls_cleanup_module);
+
+MODULE_AUTHOR("Netronome Systems <oss-drivers@netronome.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MPLS manipulation actions");
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 4313aa1..ea4c535 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Stateless NAT actions
*
* Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
*/
#include <linux/errno.h>
@@ -21,6 +17,7 @@
#include <linux/string.h>
#include <linux/tc_act/tc_nat.h>
#include <net/act_api.h>
+#include <net/pkt_cls.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/netlink.h>
@@ -38,31 +35,35 @@
static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
struct tc_action **a, int ovr, int bind,
- bool rtnl_held, struct netlink_ext_ack *extack)
+ bool rtnl_held, struct tcf_proto *tp,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, nat_net_id);
struct nlattr *tb[TCA_NAT_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tc_nat *parm;
int ret = 0, err;
struct tcf_nat *p;
+ u32 index;
if (nla == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_NAT_MAX, nla, nat_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_NAT_MAX, nla, nat_policy,
+ NULL);
if (err < 0)
return err;
if (tb[TCA_NAT_PARMS] == NULL)
return -EINVAL;
parm = nla_data(tb[TCA_NAT_PARMS]);
-
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_nat_ops, bind, false);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
@@ -76,6 +77,9 @@
} else {
return err;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
p = to_tcf_nat(*a);
spin_lock_bh(&p->tcf_lock);
@@ -84,13 +88,18 @@
p->mask = parm->mask;
p->flags = parm->flags;
- p->tcf_action = parm->action;
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
spin_unlock_bh(&p->tcf_lock);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
return ret;
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a,
@@ -256,28 +265,31 @@
unsigned char *b = skb_tail_pointer(skb);
struct tcf_nat *p = to_tcf_nat(a);
struct tc_nat opt = {
- .old_addr = p->old_addr,
- .new_addr = p->new_addr,
- .mask = p->mask,
- .flags = p->flags,
-
.index = p->tcf_index,
- .action = p->tcf_action,
.refcnt = refcount_read(&p->tcf_refcnt) - ref,
.bindcnt = atomic_read(&p->tcf_bindcnt) - bind,
};
struct tcf_t t;
+ spin_lock_bh(&p->tcf_lock);
+ opt.old_addr = p->old_addr;
+ opt.new_addr = p->new_addr;
+ opt.mask = p->mask;
+ opt.flags = p->flags;
+ opt.action = p->tcf_action;
+
if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
tcf_tm_dump(&t, &p->tcf_tm);
if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD))
goto nla_put_failure;
+ spin_unlock_bh(&p->tcf_lock);
return skb->len;
nla_put_failure:
+ spin_unlock_bh(&p->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
@@ -292,8 +304,7 @@
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, nat_net_id);
@@ -302,7 +313,7 @@
static struct tc_action_ops act_nat_ops = {
.kind = "nat",
- .type = TCA_ACT_NAT,
+ .id = TCA_ID_NAT,
.owner = THIS_MODULE,
.act = tcf_nat_act,
.dump = tcf_nat_dump,
@@ -316,7 +327,7 @@
{
struct tc_action_net *tn = net_generic(net, nat_net_id);
- return tc_action_net_init(tn, &act_nat_ops);
+ return tc_action_net_init(net, tn, &act_nat_ops);
}
static void __net_exit nat_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index ca535a8..b5bc631 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_pedit.c Generic packet editor
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Jamal Hadi Salim (2002-4)
*/
@@ -23,6 +19,7 @@
#include <linux/tc_act/tc_pedit.h>
#include <net/tc_act/tc_pedit.h>
#include <uapi/linux/tc_act/tc_pedit.h>
+#include <net/pkt_cls.h>
static unsigned int pedit_net_id;
static struct tc_action_ops act_pedit_ops;
@@ -46,7 +43,7 @@
int err = -EINVAL;
int rem;
- if (!nla || !n)
+ if (!nla)
return NULL;
keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL);
@@ -69,8 +66,9 @@
goto err_out;
}
- err = nla_parse_nested(tb, TCA_PEDIT_KEY_EX_MAX, ka,
- pedit_key_ex_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_PEDIT_KEY_EX_MAX,
+ ka, pedit_key_ex_policy,
+ NULL);
if (err)
goto err_out;
@@ -107,14 +105,15 @@
static int tcf_pedit_key_ex_dump(struct sk_buff *skb,
struct tcf_pedit_key_ex *keys_ex, int n)
{
- struct nlattr *keys_start = nla_nest_start(skb, TCA_PEDIT_KEYS_EX);
+ struct nlattr *keys_start = nla_nest_start_noflag(skb,
+ TCA_PEDIT_KEYS_EX);
if (!keys_start)
goto nla_failure;
for (; n > 0; n--) {
struct nlattr *key_start;
- key_start = nla_nest_start(skb, TCA_PEDIT_KEY_EX);
+ key_start = nla_nest_start_noflag(skb, TCA_PEDIT_KEY_EX);
if (!key_start)
goto nla_failure;
@@ -138,10 +137,11 @@
static int tcf_pedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, pedit_net_id);
struct nlattr *tb[TCA_PEDIT_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tc_pedit_key *keys = NULL;
struct tcf_pedit_key_ex *keys_ex;
struct tc_pedit *parm;
@@ -149,13 +149,15 @@
struct tcf_pedit *p;
int ret = 0, err;
int ksize;
+ u32 index;
if (!nla) {
NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed");
return -EINVAL;
}
- err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_PEDIT_MAX, nla,
+ pedit_policy, NULL);
if (err < 0)
return err;
@@ -168,6 +170,10 @@
}
parm = nla_data(pattr);
+ if (!parm->nkeys) {
+ NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed");
+ return -EINVAL;
+ }
ksize = parm->nkeys * sizeof(struct tc_pedit_key);
if (nla_len(pattr) < sizeof(*parm) + ksize) {
NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid");
@@ -178,18 +184,13 @@
if (IS_ERR(keys_ex))
return PTR_ERR(keys_ex);
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- if (!parm->nkeys) {
- tcf_idr_cleanup(tn, parm->index);
- NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed");
- ret = -EINVAL;
- goto out_free;
- }
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_pedit_ops, bind, false);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
goto out_free;
}
ret = ACT_P_CREATED;
@@ -205,6 +206,11 @@
goto out_free;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0) {
+ ret = err;
+ goto out_release;
+ }
p = to_pedit(*a);
spin_lock_bh(&p->tcf_lock);
@@ -214,7 +220,7 @@
if (!keys) {
spin_unlock_bh(&p->tcf_lock);
ret = -ENOMEM;
- goto out_release;
+ goto put_chain;
}
kfree(p->tcfp_keys);
p->tcfp_keys = keys;
@@ -223,16 +229,21 @@
memcpy(p->tcfp_keys, parm->keys, ksize);
p->tcfp_flags = parm->flags;
- p->tcf_action = parm->action;
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
kfree(p->tcfp_keys_ex);
p->tcfp_keys_ex = keys_ex;
spin_unlock_bh(&p->tcf_lock);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
out_release:
tcf_idr_release(*a, bind);
out_free:
@@ -406,7 +417,7 @@
struct tcf_t t;
int s;
- s = sizeof(*opt) + p->tcfp_nkeys * sizeof(struct tc_pedit_key);
+ s = struct_size(opt, keys, p->tcfp_nkeys);
/* netlink spinlocks held above us - must use ATOMIC */
opt = kzalloc(s, GFP_ATOMIC);
@@ -461,8 +472,7 @@
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, pedit_net_id);
@@ -471,7 +481,7 @@
static struct tc_action_ops act_pedit_ops = {
.kind = "pedit",
- .type = TCA_ACT_PEDIT,
+ .id = TCA_ID_PEDIT,
.owner = THIS_MODULE,
.act = tcf_pedit_act,
.dump = tcf_pedit_dump,
@@ -486,7 +496,7 @@
{
struct tc_action_net *tn = net_generic(net, pedit_net_id);
- return tc_action_net_init(tn, &act_pedit_ops);
+ return tc_action_net_init(net, tn, &act_pedit_ops);
}
static void __net_exit pedit_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 5d8bfa8..89c04c5 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_police.c Input police filter
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* J Hadi Salim (action changes)
*/
@@ -21,35 +17,8 @@
#include <linux/slab.h>
#include <net/act_api.h>
#include <net/netlink.h>
-
-struct tcf_police {
- struct tc_action common;
- int tcfp_result;
- u32 tcfp_ewma_rate;
- s64 tcfp_burst;
- u32 tcfp_mtu;
- s64 tcfp_toks;
- s64 tcfp_ptoks;
- s64 tcfp_mtu_ptoks;
- s64 tcfp_t_c;
- struct psched_ratecfg rate;
- bool rate_present;
- struct psched_ratecfg peak;
- bool peak_present;
-};
-
-#define to_police(pc) ((struct tcf_police *)pc)
-
-/* old policer structure from before tc actions */
-struct tc_police_compat {
- u32 index;
- int action;
- u32 limit;
- u32 burst;
- u32 mtu;
- struct tc_ratespec rate;
- struct tc_ratespec peakrate;
-};
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_police.h>
/* Each policer is serialized by its individual spinlock */
@@ -71,26 +40,33 @@
[TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE },
[TCA_POLICE_AVRATE] = { .type = NLA_U32 },
[TCA_POLICE_RESULT] = { .type = NLA_U32 },
+ [TCA_POLICE_RATE64] = { .type = NLA_U64 },
+ [TCA_POLICE_PEAKRATE64] = { .type = NLA_U64 },
};
static int tcf_police_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp,
struct netlink_ext_ack *extack)
{
- int ret = 0, err;
+ int ret = 0, tcfp_result = TC_ACT_OK, err, size;
struct nlattr *tb[TCA_POLICE_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tc_police *parm;
struct tcf_police *police;
struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
struct tc_action_net *tn = net_generic(net, police_net_id);
+ struct tcf_police_params *new;
bool exists = false;
- int size;
+ u32 index;
+ u64 rate64, prate64;
if (nla == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_POLICE_MAX, nla, police_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_POLICE_MAX, nla,
+ police_policy, NULL);
if (err < 0)
return err;
@@ -101,7 +77,8 @@
return -EINVAL;
parm = nla_data(tb[TCA_POLICE_TBF]);
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
@@ -109,17 +86,21 @@
return 0;
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, NULL, a,
- &act_police_ops, bind, false);
+ ret = tcf_idr_create(tn, index, NULL, a,
+ &act_police_ops, bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
+ spin_lock_init(&(to_police(*a)->tcfp_lock));
} else if (!ovr) {
tcf_idr_release(*a, bind);
return -EEXIST;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
police = to_police(*a);
if (parm->rate.rate) {
@@ -137,7 +118,8 @@
}
if (est) {
- err = gen_replace_estimator(&police->tcf_bstats, NULL,
+ err = gen_replace_estimator(&police->tcf_bstats,
+ police->common.cpu_bstats,
&police->tcf_rate_est,
&police->tcf_lock,
NULL, est);
@@ -150,55 +132,85 @@
goto failure;
}
- spin_lock_bh(&police->tcf_lock);
+ if (tb[TCA_POLICE_RESULT]) {
+ tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]);
+ if (TC_ACT_EXT_CMP(tcfp_result, TC_ACT_GOTO_CHAIN)) {
+ NL_SET_ERR_MSG(extack,
+ "goto chain not allowed on fallback");
+ err = -EINVAL;
+ goto failure;
+ }
+ }
+
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (unlikely(!new)) {
+ err = -ENOMEM;
+ goto failure;
+ }
+
/* No failure allowed after this point */
- police->tcfp_mtu = parm->mtu;
- if (police->tcfp_mtu == 0) {
- police->tcfp_mtu = ~0;
+ new->tcfp_result = tcfp_result;
+ new->tcfp_mtu = parm->mtu;
+ if (!new->tcfp_mtu) {
+ new->tcfp_mtu = ~0;
if (R_tab)
- police->tcfp_mtu = 255 << R_tab->rate.cell_log;
+ new->tcfp_mtu = 255 << R_tab->rate.cell_log;
}
if (R_tab) {
- police->rate_present = true;
- psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0);
+ new->rate_present = true;
+ rate64 = tb[TCA_POLICE_RATE64] ?
+ nla_get_u64(tb[TCA_POLICE_RATE64]) : 0;
+ psched_ratecfg_precompute(&new->rate, &R_tab->rate, rate64);
qdisc_put_rtab(R_tab);
} else {
- police->rate_present = false;
+ new->rate_present = false;
}
if (P_tab) {
- police->peak_present = true;
- psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0);
+ new->peak_present = true;
+ prate64 = tb[TCA_POLICE_PEAKRATE64] ?
+ nla_get_u64(tb[TCA_POLICE_PEAKRATE64]) : 0;
+ psched_ratecfg_precompute(&new->peak, &P_tab->rate, prate64);
qdisc_put_rtab(P_tab);
} else {
- police->peak_present = false;
+ new->peak_present = false;
}
- if (tb[TCA_POLICE_RESULT])
- police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]);
- police->tcfp_burst = PSCHED_TICKS2NS(parm->burst);
- police->tcfp_toks = police->tcfp_burst;
- if (police->peak_present) {
- police->tcfp_mtu_ptoks = (s64) psched_l2t_ns(&police->peak,
- police->tcfp_mtu);
- police->tcfp_ptoks = police->tcfp_mtu_ptoks;
- }
- police->tcf_action = parm->action;
+ new->tcfp_burst = PSCHED_TICKS2NS(parm->burst);
+ if (new->peak_present)
+ new->tcfp_mtu_ptoks = (s64)psched_l2t_ns(&new->peak,
+ new->tcfp_mtu);
if (tb[TCA_POLICE_AVRATE])
- police->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
+ new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
- spin_unlock_bh(&police->tcf_lock);
- if (ret != ACT_P_CREATED)
- return ret;
-
+ spin_lock_bh(&police->tcf_lock);
+ spin_lock_bh(&police->tcfp_lock);
police->tcfp_t_c = ktime_get_ns();
- tcf_idr_insert(tn, *a);
+ police->tcfp_toks = new->tcfp_burst;
+ if (new->peak_present)
+ police->tcfp_ptoks = new->tcfp_mtu_ptoks;
+ spin_unlock_bh(&police->tcfp_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ rcu_swap_protected(police->params,
+ new,
+ lockdep_is_held(&police->tcf_lock));
+ spin_unlock_bh(&police->tcf_lock);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ if (new)
+ kfree_rcu(new, rcu);
+
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
return ret;
failure:
qdisc_put_rtab(P_tab);
qdisc_put_rtab(R_tab);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
tcf_idr_release(*a, bind);
return err;
}
@@ -207,64 +219,86 @@
struct tcf_result *res)
{
struct tcf_police *police = to_police(a);
- s64 now;
- s64 toks;
- s64 ptoks = 0;
+ struct tcf_police_params *p;
+ s64 now, toks, ptoks = 0;
+ int ret;
- spin_lock(&police->tcf_lock);
-
- bstats_update(&police->tcf_bstats, skb);
tcf_lastuse_update(&police->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(police->common.cpu_bstats), skb);
- if (police->tcfp_ewma_rate) {
+ ret = READ_ONCE(police->tcf_action);
+ p = rcu_dereference_bh(police->params);
+
+ if (p->tcfp_ewma_rate) {
struct gnet_stats_rate_est64 sample;
if (!gen_estimator_read(&police->tcf_rate_est, &sample) ||
- sample.bps >= police->tcfp_ewma_rate) {
- police->tcf_qstats.overlimits++;
- if (police->tcf_action == TC_ACT_SHOT)
- police->tcf_qstats.drops++;
- spin_unlock(&police->tcf_lock);
- return police->tcf_action;
- }
+ sample.bps >= p->tcfp_ewma_rate)
+ goto inc_overlimits;
}
- if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
- if (!police->rate_present) {
- spin_unlock(&police->tcf_lock);
- return police->tcfp_result;
+ if (qdisc_pkt_len(skb) <= p->tcfp_mtu) {
+ if (!p->rate_present) {
+ ret = p->tcfp_result;
+ goto end;
}
now = ktime_get_ns();
- toks = min_t(s64, now - police->tcfp_t_c,
- police->tcfp_burst);
- if (police->peak_present) {
+ spin_lock_bh(&police->tcfp_lock);
+ toks = min_t(s64, now - police->tcfp_t_c, p->tcfp_burst);
+ if (p->peak_present) {
ptoks = toks + police->tcfp_ptoks;
- if (ptoks > police->tcfp_mtu_ptoks)
- ptoks = police->tcfp_mtu_ptoks;
- ptoks -= (s64) psched_l2t_ns(&police->peak,
- qdisc_pkt_len(skb));
+ if (ptoks > p->tcfp_mtu_ptoks)
+ ptoks = p->tcfp_mtu_ptoks;
+ ptoks -= (s64)psched_l2t_ns(&p->peak,
+ qdisc_pkt_len(skb));
}
toks += police->tcfp_toks;
- if (toks > police->tcfp_burst)
- toks = police->tcfp_burst;
- toks -= (s64) psched_l2t_ns(&police->rate, qdisc_pkt_len(skb));
+ if (toks > p->tcfp_burst)
+ toks = p->tcfp_burst;
+ toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb));
if ((toks|ptoks) >= 0) {
police->tcfp_t_c = now;
police->tcfp_toks = toks;
police->tcfp_ptoks = ptoks;
- if (police->tcfp_result == TC_ACT_SHOT)
- police->tcf_qstats.drops++;
- spin_unlock(&police->tcf_lock);
- return police->tcfp_result;
+ spin_unlock_bh(&police->tcfp_lock);
+ ret = p->tcfp_result;
+ goto inc_drops;
}
+ spin_unlock_bh(&police->tcfp_lock);
}
- police->tcf_qstats.overlimits++;
- if (police->tcf_action == TC_ACT_SHOT)
- police->tcf_qstats.drops++;
- spin_unlock(&police->tcf_lock);
- return police->tcf_action;
+inc_overlimits:
+ qstats_overlimit_inc(this_cpu_ptr(police->common.cpu_qstats));
+inc_drops:
+ if (ret == TC_ACT_SHOT)
+ qstats_drop_inc(this_cpu_ptr(police->common.cpu_qstats));
+end:
+ return ret;
+}
+
+static void tcf_police_cleanup(struct tc_action *a)
+{
+ struct tcf_police *police = to_police(a);
+ struct tcf_police_params *p;
+
+ p = rcu_dereference_protected(police->params, 1);
+ if (p)
+ kfree_rcu(p, rcu);
+}
+
+static void tcf_police_stats_update(struct tc_action *a,
+ u64 bytes, u32 packets,
+ u64 lastuse, bool hw)
+{
+ struct tcf_police *police = to_police(a);
+ struct tcf_t *tm = &police->tcf_tm;
+
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+ if (hw)
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
+ bytes, packets);
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
@@ -272,6 +306,7 @@
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_police *police = to_police(a);
+ struct tcf_police_params *p;
struct tc_police opt = {
.index = police->tcf_index,
.refcnt = refcount_read(&police->tcf_refcnt) - ref,
@@ -281,19 +316,33 @@
spin_lock_bh(&police->tcf_lock);
opt.action = police->tcf_action;
- opt.mtu = police->tcfp_mtu;
- opt.burst = PSCHED_NS2TICKS(police->tcfp_burst);
- if (police->rate_present)
- psched_ratecfg_getrate(&opt.rate, &police->rate);
- if (police->peak_present)
- psched_ratecfg_getrate(&opt.peakrate, &police->peak);
+ p = rcu_dereference_protected(police->params,
+ lockdep_is_held(&police->tcf_lock));
+ opt.mtu = p->tcfp_mtu;
+ opt.burst = PSCHED_NS2TICKS(p->tcfp_burst);
+ if (p->rate_present) {
+ psched_ratecfg_getrate(&opt.rate, &p->rate);
+ if ((police->params->rate.rate_bytes_ps >= (1ULL << 32)) &&
+ nla_put_u64_64bit(skb, TCA_POLICE_RATE64,
+ police->params->rate.rate_bytes_ps,
+ TCA_POLICE_PAD))
+ goto nla_put_failure;
+ }
+ if (p->peak_present) {
+ psched_ratecfg_getrate(&opt.peakrate, &p->peak);
+ if ((police->params->peak.rate_bytes_ps >= (1ULL << 32)) &&
+ nla_put_u64_64bit(skb, TCA_POLICE_PEAKRATE64,
+ police->params->peak.rate_bytes_ps,
+ TCA_POLICE_PAD))
+ goto nla_put_failure;
+ }
if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt))
goto nla_put_failure;
- if (police->tcfp_result &&
- nla_put_u32(skb, TCA_POLICE_RESULT, police->tcfp_result))
+ if (p->tcfp_result &&
+ nla_put_u32(skb, TCA_POLICE_RESULT, p->tcfp_result))
goto nla_put_failure;
- if (police->tcfp_ewma_rate &&
- nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate))
+ if (p->tcfp_ewma_rate &&
+ nla_put_u32(skb, TCA_POLICE_AVRATE, p->tcfp_ewma_rate))
goto nla_put_failure;
t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install);
@@ -312,8 +361,7 @@
return -1;
}
-static int tcf_police_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_police_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, police_net_id);
@@ -326,13 +374,15 @@
static struct tc_action_ops act_police_ops = {
.kind = "police",
- .type = TCA_ID_POLICE,
+ .id = TCA_ID_POLICE,
.owner = THIS_MODULE,
+ .stats_update = tcf_police_stats_update,
.act = tcf_police_act,
.dump = tcf_police_dump,
.init = tcf_police_init,
.walk = tcf_police_walker,
.lookup = tcf_police_search,
+ .cleanup = tcf_police_cleanup,
.size = sizeof(struct tcf_police),
};
@@ -340,7 +390,7 @@
{
struct tc_action_net *tn = net_generic(net, police_net_id);
- return tc_action_net_init(tn, &act_police_ops);
+ return tc_action_net_init(net, tn, &act_police_ops);
}
static void __net_exit police_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 6b67aa1..514456a 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/act_sample.c - Packet sampling tc action
* Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/types.h>
@@ -22,6 +19,7 @@
#include <linux/tc_act/tc_sample.h>
#include <net/tc_act/tc_sample.h>
#include <net/psample.h>
+#include <net/pkt_cls.h>
#include <linux/if_arp.h>
@@ -37,21 +35,23 @@
static int tcf_sample_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
- int bind, bool rtnl_held,
+ int bind, bool rtnl_held, struct tcf_proto *tp,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, sample_net_id);
struct nlattr *tb[TCA_SAMPLE_MAX + 1];
struct psample_group *psample_group;
+ u32 psample_group_num, rate, index;
+ struct tcf_chain *goto_ch = NULL;
struct tc_sample *parm;
- u32 psample_group_num;
struct tcf_sample *s;
bool exists = false;
int ret, err;
if (!nla)
return -EINVAL;
- ret = nla_parse_nested(tb, TCA_SAMPLE_MAX, nla, sample_policy, NULL);
+ ret = nla_parse_nested_deprecated(tb, TCA_SAMPLE_MAX, nla,
+ sample_policy, NULL);
if (ret < 0)
return ret;
if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] ||
@@ -59,8 +59,8 @@
return -EINVAL;
parm = nla_data(tb[TCA_SAMPLE_PARMS]);
-
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
@@ -68,10 +68,10 @@
return 0;
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_sample_ops, bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
@@ -79,21 +79,31 @@
tcf_idr_release(*a, bind);
return -EEXIST;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+ rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
+ if (!rate) {
+ NL_SET_ERR_MSG(extack, "invalid sample rate");
+ err = -EINVAL;
+ goto put_chain;
+ }
psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
psample_group = psample_group_get(net, psample_group_num);
if (!psample_group) {
- tcf_idr_release(*a, bind);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto put_chain;
}
s = to_sample(*a);
spin_lock_bh(&s->tcf_lock);
- s->tcf_action = parm->action;
- s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ s->rate = rate;
s->psample_group_num = psample_group_num;
- RCU_INIT_POINTER(s->psample_group, psample_group);
+ rcu_swap_protected(s->psample_group, psample_group,
+ lockdep_is_held(&s->tcf_lock));
if (tb[TCA_SAMPLE_TRUNC_SIZE]) {
s->truncate = true;
@@ -101,9 +111,20 @@
}
spin_unlock_bh(&s->tcf_lock);
+ if (psample_group)
+ psample_group_put(psample_group);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
static void tcf_sample_cleanup(struct tc_action *a)
@@ -125,6 +146,7 @@
case ARPHRD_TUNNEL6:
case ARPHRD_SIT:
case ARPHRD_IPGRE:
+ case ARPHRD_IP6GRE:
case ARPHRD_VOID:
case ARPHRD_NONE:
return false;
@@ -224,17 +246,42 @@
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, sample_net_id);
return tcf_idr_search(tn, a, index);
}
+static void tcf_psample_group_put(void *priv)
+{
+ struct psample_group *group = priv;
+
+ psample_group_put(group);
+}
+
+static struct psample_group *
+tcf_sample_get_group(const struct tc_action *a,
+ tc_action_priv_destructor *destructor)
+{
+ struct tcf_sample *s = to_sample(a);
+ struct psample_group *group;
+
+ spin_lock_bh(&s->tcf_lock);
+ group = rcu_dereference_protected(s->psample_group,
+ lockdep_is_held(&s->tcf_lock));
+ if (group) {
+ psample_group_take(group);
+ *destructor = tcf_psample_group_put;
+ }
+ spin_unlock_bh(&s->tcf_lock);
+
+ return group;
+}
+
static struct tc_action_ops act_sample_ops = {
.kind = "sample",
- .type = TCA_ACT_SAMPLE,
+ .id = TCA_ID_SAMPLE,
.owner = THIS_MODULE,
.act = tcf_sample_act,
.dump = tcf_sample_dump,
@@ -242,6 +289,7 @@
.cleanup = tcf_sample_cleanup,
.walk = tcf_sample_walker,
.lookup = tcf_sample_search,
+ .get_psample_group = tcf_sample_get_group,
.size = sizeof(struct tcf_sample),
};
@@ -249,7 +297,7 @@
{
struct tc_action_net *tn = net_generic(net, sample_net_id);
- return tc_action_net_init(tn, &act_sample_ops);
+ return tc_action_net_init(net, tn, &act_sample_ops);
}
static void __net_exit sample_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 52400d4..6120e56 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_simple.c Simple example of an action
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Jamal Hadi Salim (2005-8)
- *
*/
#include <linux/module.h>
@@ -18,8 +13,7 @@
#include <linux/rtnetlink.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
-
-#define TCA_ACT_SIMP 22
+#include <net/pkt_cls.h>
#include <linux/tc_act/tc_defact.h>
#include <net/tc_act/tc_defact.h>
@@ -62,14 +56,26 @@
return 0;
}
-static void reset_policy(struct tcf_defact *d, const struct nlattr *defdata,
- struct tc_defact *p)
+static int reset_policy(struct tc_action *a, const struct nlattr *defdata,
+ struct tc_defact *p, struct tcf_proto *tp,
+ struct netlink_ext_ack *extack)
{
+ struct tcf_chain *goto_ch = NULL;
+ struct tcf_defact *d;
+ int err;
+
+ err = tcf_action_check_ctrlact(p->action, tp, &goto_ch, extack);
+ if (err < 0)
+ return err;
+ d = to_defact(a);
spin_lock_bh(&d->tcf_lock);
- d->tcf_action = p->action;
+ goto_ch = tcf_action_set_ctrlact(a, p->action, goto_ch);
memset(d->tcfd_defdata, 0, SIMP_MAX_DATA);
nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
spin_unlock_bh(&d->tcf_lock);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ return 0;
}
static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
@@ -80,19 +86,22 @@
static int tcf_simp_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, simp_net_id);
struct nlattr *tb[TCA_DEF_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tc_defact *parm;
struct tcf_defact *d;
bool exists = false;
int ret = 0, err;
+ u32 index;
if (nla == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_DEF_MAX, nla, simple_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_DEF_MAX, nla, simple_policy,
+ NULL);
if (err < 0)
return err;
@@ -100,7 +109,8 @@
return -EINVAL;
parm = nla_data(tb[TCA_DEF_PARMS]);
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
@@ -111,40 +121,50 @@
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_simp_ops, bind, false);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
d = to_defact(*a);
- ret = alloc_defdata(d, tb[TCA_DEF_DATA]);
- if (ret < 0) {
- tcf_idr_release(*a, bind);
- return ret;
- }
- d->tcf_action = parm->action;
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch,
+ extack);
+ if (err < 0)
+ goto release_idr;
+
+ err = alloc_defdata(d, tb[TCA_DEF_DATA]);
+ if (err < 0)
+ goto put_chain;
+
+ tcf_action_set_ctrlact(*a, parm->action, goto_ch);
ret = ACT_P_CREATED;
} else {
- d = to_defact(*a);
-
if (!ovr) {
- tcf_idr_release(*a, bind);
- return -EEXIST;
+ err = -EEXIST;
+ goto release_idr;
}
- reset_policy(d, tb[TCA_DEF_DATA], parm);
+ err = reset_policy(*a, tb[TCA_DEF_DATA], parm, tp, extack);
+ if (err)
+ goto release_idr;
}
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
@@ -188,8 +208,7 @@
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, simp_net_id);
@@ -198,7 +217,7 @@
static struct tc_action_ops act_simp_ops = {
.kind = "simple",
- .type = TCA_ACT_SIMP,
+ .id = TCA_ID_SIMP,
.owner = THIS_MODULE,
.act = tcf_simp_act,
.dump = tcf_simp_dump,
@@ -213,7 +232,7 @@
{
struct tc_action_net *tn = net_generic(net, simp_net_id);
- return tc_action_net_init(tn, &act_simp_ops);
+ return tc_action_net_init(net, tn, &act_simp_ops);
}
static void __net_exit simp_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 73e44ce..6a8d333 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -1,18 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008, Intel Corporation.
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
* Author: Alexander Duyck <alexander.h.duyck@intel.com>
*/
@@ -26,6 +15,7 @@
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/dsfield.h>
+#include <net/pkt_cls.h>
#include <linux/tc_act/tc_skbedit.h>
#include <net/tc_act/tc_skbedit.h>
@@ -96,22 +86,26 @@
static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, skbedit_net_id);
- struct tcf_skbedit_params *params_old, *params_new;
+ struct tcf_skbedit_params *params_new;
struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tc_skbedit *parm;
struct tcf_skbedit *d;
u32 flags = 0, *priority = NULL, *mark = NULL, *mask = NULL;
u16 *queue_mapping = NULL, *ptype = NULL;
bool exists = false;
int ret = 0, err;
+ u32 index;
if (nla == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_SKBEDIT_MAX, nla,
+ skbedit_policy, NULL);
if (err < 0)
return err;
@@ -153,8 +147,8 @@
}
parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
-
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
@@ -165,15 +159,15 @@
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_skbedit_ops, bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
@@ -186,14 +180,14 @@
return -EEXIST;
}
}
-
- ASSERT_RTNL();
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
if (unlikely(!params_new)) {
- if (ret == ACT_P_CREATED)
- tcf_idr_release(*a, bind);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto put_chain;
}
params_new->flags = flags;
@@ -210,15 +204,25 @@
if (flags & SKBEDIT_F_MASK)
params_new->mask = *mask;
- d->tcf_action = parm->action;
- params_old = rtnl_dereference(d->params);
- rcu_assign_pointer(d->params, params_new);
- if (params_old)
- kfree_rcu(params_old, rcu);
+ spin_lock_bh(&d->tcf_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ rcu_swap_protected(d->params, params_new,
+ lockdep_is_held(&d->tcf_lock));
+ spin_unlock_bh(&d->tcf_lock);
+ if (params_new)
+ kfree_rcu(params_new, rcu);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
@@ -231,12 +235,14 @@
.index = d->tcf_index,
.refcnt = refcount_read(&d->tcf_refcnt) - ref,
.bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
- .action = d->tcf_action,
};
u64 pure_flags = 0;
struct tcf_t t;
- params = rtnl_dereference(d->params);
+ spin_lock_bh(&d->tcf_lock);
+ params = rcu_dereference_protected(d->params,
+ lockdep_is_held(&d->tcf_lock));
+ opt.action = d->tcf_action;
if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -264,9 +270,12 @@
tcf_tm_dump(&t, &d->tcf_tm);
if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))
goto nla_put_failure;
+ spin_unlock_bh(&d->tcf_lock);
+
return skb->len;
nla_put_failure:
+ spin_unlock_bh(&d->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
@@ -291,23 +300,34 @@
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, skbedit_net_id);
return tcf_idr_search(tn, a, index);
}
+static size_t tcf_skbedit_get_fill_size(const struct tc_action *act)
+{
+ return nla_total_size(sizeof(struct tc_skbedit))
+ + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_PRIORITY */
+ + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_QUEUE_MAPPING */
+ + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_MARK */
+ + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_PTYPE */
+ + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_MASK */
+ + nla_total_size_64bit(sizeof(u64)); /* TCA_SKBEDIT_FLAGS */
+}
+
static struct tc_action_ops act_skbedit_ops = {
.kind = "skbedit",
- .type = TCA_ACT_SKBEDIT,
+ .id = TCA_ID_SKBEDIT,
.owner = THIS_MODULE,
.act = tcf_skbedit_act,
.dump = tcf_skbedit_dump,
.init = tcf_skbedit_init,
.cleanup = tcf_skbedit_cleanup,
.walk = tcf_skbedit_walker,
+ .get_fill_size = tcf_skbedit_get_fill_size,
.lookup = tcf_skbedit_search,
.size = sizeof(struct tcf_skbedit),
};
@@ -316,7 +336,7 @@
{
struct tc_action_net *tn = net_generic(net, skbedit_net_id);
- return tc_action_net_init(tn, &act_skbedit_ops);
+ return tc_action_net_init(net, tn, &act_skbedit_ops);
}
static void __net_exit skbedit_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 588077f..888437f 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/act_skbmod.c skb data modifier
*
* Copyright (c) 2016 Jamal Hadi Salim <jhs@mojatatu.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/module.h>
@@ -16,6 +12,7 @@
#include <linux/rtnetlink.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
#include <linux/tc_act/tc_skbmod.h>
#include <net/tc_act/tc_skbmod.h>
@@ -82,24 +79,27 @@
static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, skbmod_net_id);
struct nlattr *tb[TCA_SKBMOD_MAX + 1];
struct tcf_skbmod_params *p, *p_old;
+ struct tcf_chain *goto_ch = NULL;
struct tc_skbmod *parm;
+ u32 lflags = 0, index;
struct tcf_skbmod *d;
bool exists = false;
u8 *daddr = NULL;
u8 *saddr = NULL;
u16 eth_type = 0;
- u32 lflags = 0;
int ret = 0, err;
if (!nla)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_SKBMOD_MAX, nla, skbmod_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_SKBMOD_MAX, nla,
+ skbmod_policy, NULL);
if (err < 0)
return err;
@@ -122,10 +122,11 @@
}
parm = nla_data(tb[TCA_SKBMOD_PARMS]);
+ index = parm->index;
if (parm->flags & SKBMOD_F_SWAPMAC)
lflags = SKBMOD_F_SWAPMAC;
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
@@ -136,15 +137,15 @@
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_skbmod_ops, bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
@@ -153,21 +154,24 @@
tcf_idr_release(*a, bind);
return -EEXIST;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
d = to_skbmod(*a);
p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL);
if (unlikely(!p)) {
- tcf_idr_release(*a, bind);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto put_chain;
}
p->flags = lflags;
- d->tcf_action = parm->action;
if (ovr)
spin_lock_bh(&d->tcf_lock);
/* Protected by tcf_lock if overwriting existing action. */
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
p_old = rcu_dereference_protected(d->skbmod_p, 1);
if (lflags & SKBMOD_F_DMAC)
@@ -183,10 +187,18 @@
if (p_old)
kfree_rcu(p_old, rcu);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
static void tcf_skbmod_cleanup(struct tc_action *a)
@@ -251,8 +263,7 @@
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, skbmod_net_id);
@@ -261,7 +272,7 @@
static struct tc_action_ops act_skbmod_ops = {
.kind = "skbmod",
- .type = TCA_ACT_SKBMOD,
+ .id = TCA_ACT_SKBMOD,
.owner = THIS_MODULE,
.act = tcf_skbmod_act,
.dump = tcf_skbmod_dump,
@@ -276,7 +287,7 @@
{
struct tc_action_net *tn = net_generic(net, skbmod_net_id);
- return tc_action_net_init(tn, &act_skbmod_ops);
+ return tc_action_net_init(net, tn, &act_skbmod_ops);
}
static void __net_exit skbmod_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 681f6f0..d55669e 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2016, Amir Vadai <amir@vadai.me>
* Copyright (c) 2016, Mellanox Technologies. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/module.h>
@@ -17,6 +13,7 @@
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
+#include <net/pkt_cls.h>
#include <linux/tc_act/tc_tunnel_key.h>
#include <net/tc_act/tc_tunnel_key.h>
@@ -75,8 +72,9 @@
int err, data_len, opt_len;
u8 *data;
- err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX,
- nla, geneve_opt_policy, extack);
+ err = nla_parse_nested_deprecated(tb,
+ TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX,
+ nla, geneve_opt_policy, extack);
if (err < 0)
return err;
@@ -124,8 +122,8 @@
int err, rem, opt_len, len = nla_len(nla), opts_len = 0;
const struct nlattr *attr, *head = nla_data(nla);
- err = nla_validate(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX,
- enc_opts_policy, extack);
+ err = nla_validate_deprecated(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX,
+ enc_opts_policy, extack);
if (err)
return err;
@@ -137,6 +135,10 @@
if (opt_len < 0)
return opt_len;
opts_len += opt_len;
+ if (opts_len > IP_TUNNEL_OPTS_MAX) {
+ NL_SET_ERR_MSG(extack, "Tunnel options exceeds max size");
+ return -EINVAL;
+ }
if (dst) {
dst_len -= opt_len;
dst += opt_len;
@@ -197,24 +199,37 @@
[TCA_TUNNEL_KEY_ENC_TTL] = { .type = NLA_U8 },
};
+static void tunnel_key_release_params(struct tcf_tunnel_key_params *p)
+{
+ if (!p)
+ return;
+ if (p->tcft_action == TCA_TUNNEL_KEY_ACT_SET)
+ dst_release(&p->tcft_enc_metadata->dst);
+
+ kfree_rcu(p, rcu);
+}
+
static int tunnel_key_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1];
struct tcf_tunnel_key_params *params_new;
struct metadata_dst *metadata = NULL;
+ struct tcf_chain *goto_ch = NULL;
struct tc_tunnel_key *parm;
struct tcf_tunnel_key *t;
bool exists = false;
__be16 dst_port = 0;
+ __be64 key_id = 0;
int opts_len = 0;
- __be64 key_id;
- __be16 flags;
+ __be16 flags = 0;
u8 tos, ttl;
int ret = 0;
+ u32 index;
int err;
if (!nla) {
@@ -222,8 +237,8 @@
return -EINVAL;
}
- err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy,
- extack);
+ err = nla_parse_nested_deprecated(tb, TCA_TUNNEL_KEY_MAX, nla,
+ tunnel_key_policy, extack);
if (err < 0) {
NL_SET_ERR_MSG(extack, "Failed to parse nested tunnel key attributes");
return err;
@@ -235,7 +250,8 @@
}
parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]);
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
@@ -246,15 +262,15 @@
case TCA_TUNNEL_KEY_ACT_RELEASE:
break;
case TCA_TUNNEL_KEY_ACT_SET:
- if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) {
- NL_SET_ERR_MSG(extack, "Missing tunnel key id");
- ret = -EINVAL;
- goto err_out;
+ if (tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) {
+ __be32 key32;
+
+ key32 = nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]);
+ key_id = key32_to_tunnel_id(key32);
+ flags = TUNNEL_KEY;
}
- key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]));
-
- flags = TUNNEL_KEY | TUNNEL_CSUM;
+ flags |= TUNNEL_CSUM;
if (tb[TCA_TUNNEL_KEY_NO_CSUM] &&
nla_get_u8(tb[TCA_TUNNEL_KEY_NO_CSUM]))
flags &= ~TUNNEL_CSUM;
@@ -312,6 +328,12 @@
goto err_out;
}
+#ifdef CONFIG_DST_CACHE
+ ret = dst_cache_init(&metadata->u.tun_info.dst_cache, GFP_KERNEL);
+ if (ret)
+ goto release_tun_meta;
+#endif
+
if (opts_len) {
ret = tunnel_key_opts_set(tb[TCA_TUNNEL_KEY_ENC_OPTS],
&metadata->u.tun_info,
@@ -329,7 +351,7 @@
}
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_tunnel_key_ops, bind, true);
if (ret) {
NL_SET_ERR_MSG(extack, "Cannot create TC IDR");
@@ -343,6 +365,12 @@
goto release_tun_meta;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0) {
+ ret = err;
+ exists = true;
+ goto release_tun_meta;
+ }
t = to_tunnel_key(*a);
params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
@@ -350,32 +378,38 @@
NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters");
ret = -ENOMEM;
exists = true;
- goto release_tun_meta;
+ goto put_chain;
}
params_new->tcft_action = parm->t_action;
params_new->tcft_enc_metadata = metadata;
spin_lock_bh(&t->tcf_lock);
- t->tcf_action = parm->action;
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
rcu_swap_protected(t->params, params_new,
lockdep_is_held(&t->tcf_lock));
spin_unlock_bh(&t->tcf_lock);
- if (params_new)
- kfree_rcu(params_new, rcu);
+ tunnel_key_release_params(params_new);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+
release_tun_meta:
- dst_release(&metadata->dst);
+ if (metadata)
+ dst_release(&metadata->dst);
err_out:
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
@@ -385,12 +419,7 @@
struct tcf_tunnel_key_params *params;
params = rcu_dereference_protected(t->params, 1);
- if (params) {
- if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET)
- dst_release(&params->tcft_enc_metadata->dst);
-
- kfree_rcu(params, rcu);
- }
+ tunnel_key_release_params(params);
}
static int tunnel_key_geneve_opts_dump(struct sk_buff *skb,
@@ -400,7 +429,7 @@
u8 *src = (u8 *)(info + 1);
struct nlattr *start;
- start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE);
+ start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE);
if (!start)
return -EMSGSIZE;
@@ -434,7 +463,7 @@
if (!info->options_len)
return 0;
- start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS);
+ start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS);
if (!start)
return -EMSGSIZE;
@@ -508,10 +537,13 @@
struct ip_tunnel_key *key = &info->key;
__be32 key_id = tunnel_id_to_key32(key->tun_id);
- if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) ||
+ if (((key->tun_flags & TUNNEL_KEY) &&
+ nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id)) ||
tunnel_key_dump_addresses(skb,
&params->tcft_enc_metadata->u.tun_info) ||
- nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst) ||
+ (key->tp_dst &&
+ nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT,
+ key->tp_dst)) ||
nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM,
!(key->tun_flags & TUNNEL_CSUM)) ||
tunnel_key_opts_dump(skb, info))
@@ -548,8 +580,7 @@
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
@@ -558,7 +589,7 @@
static struct tc_action_ops act_tunnel_key_ops = {
.kind = "tunnel_key",
- .type = TCA_ACT_TUNNEL_KEY,
+ .id = TCA_ID_TUNNEL_KEY,
.owner = THIS_MODULE,
.act = tunnel_key_act,
.dump = tunnel_key_dump,
@@ -573,7 +604,7 @@
{
struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
- return tc_action_net_init(tn, &act_tunnel_key_ops);
+ return tc_action_net_init(net, tn, &act_tunnel_key_ops);
}
static void __net_exit tunnel_key_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 033d273..08aaf71 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2014 Jiri Pirko <jiri@resnulli.us>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/module.h>
@@ -15,6 +11,7 @@
#include <linux/if_vlan.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
#include <linux/tc_act/tc_vlan.h>
#include <net/tc_act/tc_vlan.h>
@@ -63,7 +60,7 @@
/* extract existing tag (and guarantee no hw-accel tag) */
if (skb_vlan_tag_present(skb)) {
tci = skb_vlan_tag_get(skb);
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
} else {
/* in-payload vlan tag, pop it */
err = __skb_vlan_pop(skb, &tci);
@@ -105,10 +102,11 @@
static int tcf_vlan_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, vlan_net_id);
struct nlattr *tb[TCA_VLAN_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
struct tcf_vlan_params *p;
struct tc_vlan *parm;
struct tcf_vlan *v;
@@ -118,18 +116,21 @@
u8 push_prio = 0;
bool exists = false;
int ret = 0, err;
+ u32 index;
if (!nla)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_VLAN_MAX, nla, vlan_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_VLAN_MAX, nla, vlan_policy,
+ NULL);
if (err < 0)
return err;
if (!tb[TCA_VLAN_PARMS])
return -EINVAL;
parm = nla_data(tb[TCA_VLAN_PARMS]);
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
@@ -145,7 +146,7 @@
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]);
@@ -153,7 +154,7 @@
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -ERANGE;
}
@@ -167,7 +168,7 @@
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -EPROTONOSUPPORT;
}
} else {
@@ -181,16 +182,16 @@
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
action = parm->v_action;
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_vlan_ops, bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
@@ -200,12 +201,16 @@
return -EEXIST;
}
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
v = to_vlan(*a);
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p) {
- tcf_idr_release(*a, bind);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto put_chain;
}
p->tcfv_action = action;
@@ -214,16 +219,24 @@
p->tcfv_push_proto = push_proto;
spin_lock_bh(&v->tcf_lock);
- v->tcf_action = parm->action;
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
spin_unlock_bh(&v->tcf_lock);
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
if (p)
kfree_rcu(p, rcu);
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
}
static void tcf_vlan_cleanup(struct tc_action *a)
@@ -288,23 +301,45 @@
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u32 packets,
+ u64 lastuse, bool hw)
+{
+ struct tcf_vlan *v = to_vlan(a);
+ struct tcf_t *tm = &v->tcf_tm;
+
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+ if (hw)
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
+ bytes, packets);
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse);
+}
+
+static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, vlan_net_id);
return tcf_idr_search(tn, a, index);
}
+static size_t tcf_vlan_get_fill_size(const struct tc_action *act)
+{
+ return nla_total_size(sizeof(struct tc_vlan))
+ + nla_total_size(sizeof(u16)) /* TCA_VLAN_PUSH_VLAN_ID */
+ + nla_total_size(sizeof(u16)) /* TCA_VLAN_PUSH_VLAN_PROTOCOL */
+ + nla_total_size(sizeof(u8)); /* TCA_VLAN_PUSH_VLAN_PRIORITY */
+}
+
static struct tc_action_ops act_vlan_ops = {
.kind = "vlan",
- .type = TCA_ACT_VLAN,
+ .id = TCA_ID_VLAN,
.owner = THIS_MODULE,
.act = tcf_vlan_act,
.dump = tcf_vlan_dump,
.init = tcf_vlan_init,
.cleanup = tcf_vlan_cleanup,
.walk = tcf_vlan_walker,
+ .stats_update = tcf_vlan_stats_update,
+ .get_fill_size = tcf_vlan_get_fill_size,
.lookup = tcf_vlan_search,
.size = sizeof(struct tcf_vlan),
};
@@ -313,7 +348,7 @@
{
struct tc_action_net *tn = net_generic(net, vlan_net_id);
- return tc_action_net_init(tn, &act_vlan_ops);
+ return tc_action_net_init(net, tn, &act_vlan_ops);
}
static void __net_exit vlan_exit_net(struct list_head *net_list)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 70f144a..20d60b8 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1,17 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_api.c Packet classifier API.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
*
* Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
- *
*/
#include <linux/module.h>
@@ -25,11 +20,25 @@
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/idr.h>
+#include <linux/rhashtable.h>
+#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
+#include <net/tc_act/tc_pedit.h>
+#include <net/tc_act/tc_mirred.h>
+#include <net/tc_act/tc_vlan.h>
+#include <net/tc_act/tc_tunnel_key.h>
+#include <net/tc_act/tc_csum.h>
+#include <net/tc_act/tc_gact.h>
+#include <net/tc_act/tc_police.h>
+#include <net/tc_act/tc_sample.h>
+#include <net/tc_act/tc_skbedit.h>
+#include <net/tc_act/tc_ct.h>
+#include <net/tc_act/tc_mpls.h>
+#include <net/flow_offload.h>
extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
@@ -39,6 +48,62 @@
/* Protects list of registered TC modules. It is pure SMP lock. */
static DEFINE_RWLOCK(cls_mod_lock);
+static u32 destroy_obj_hashfn(const struct tcf_proto *tp)
+{
+ return jhash_3words(tp->chain->index, tp->prio,
+ (__force __u32)tp->protocol, 0);
+}
+
+static void tcf_proto_signal_destroying(struct tcf_chain *chain,
+ struct tcf_proto *tp)
+{
+ struct tcf_block *block = chain->block;
+
+ mutex_lock(&block->proto_destroy_lock);
+ hash_add_rcu(block->proto_destroy_ht, &tp->destroy_ht_node,
+ destroy_obj_hashfn(tp));
+ mutex_unlock(&block->proto_destroy_lock);
+}
+
+static bool tcf_proto_cmp(const struct tcf_proto *tp1,
+ const struct tcf_proto *tp2)
+{
+ return tp1->chain->index == tp2->chain->index &&
+ tp1->prio == tp2->prio &&
+ tp1->protocol == tp2->protocol;
+}
+
+static bool tcf_proto_exists_destroying(struct tcf_chain *chain,
+ struct tcf_proto *tp)
+{
+ u32 hash = destroy_obj_hashfn(tp);
+ struct tcf_proto *iter;
+ bool found = false;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu(chain->block->proto_destroy_ht, iter,
+ destroy_ht_node, hash) {
+ if (tcf_proto_cmp(tp, iter)) {
+ found = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return found;
+}
+
+static void
+tcf_proto_signal_destroyed(struct tcf_chain *chain, struct tcf_proto *tp)
+{
+ struct tcf_block *block = chain->block;
+
+ mutex_lock(&block->proto_destroy_lock);
+ if (hash_hashed(&tp->destroy_ht_node))
+ hash_del_rcu(&tp->destroy_ht_node);
+ mutex_unlock(&block->proto_destroy_lock);
+}
+
/* Find classifier type by string name */
static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind)
@@ -60,7 +125,8 @@
}
static const struct tcf_proto_ops *
-tcf_proto_lookup_ops(const char *kind, struct netlink_ext_ack *extack)
+tcf_proto_lookup_ops(const char *kind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
const struct tcf_proto_ops *ops;
@@ -68,9 +134,11 @@
if (ops)
return ops;
#ifdef CONFIG_MODULES
- rtnl_unlock();
+ if (rtnl_held)
+ rtnl_unlock();
request_module("cls_%s", kind);
- rtnl_lock();
+ if (rtnl_held)
+ rtnl_lock();
ops = __tcf_proto_lookup_ops(kind);
/* We dropped the RTNL semaphore in order to perform
* the module load. So, even if we succeeded in loading
@@ -151,8 +219,37 @@
return TC_H_MAJ(first);
}
+static bool tcf_proto_check_kind(struct nlattr *kind, char *name)
+{
+ if (kind)
+ return nla_strlcpy(name, kind, IFNAMSIZ) >= IFNAMSIZ;
+ memset(name, 0, IFNAMSIZ);
+ return false;
+}
+
+static bool tcf_proto_is_unlocked(const char *kind)
+{
+ const struct tcf_proto_ops *ops;
+ bool ret;
+
+ if (strlen(kind) == 0)
+ return false;
+
+ ops = tcf_proto_lookup_ops(kind, false, NULL);
+ /* On error return false to take rtnl lock. Proto lookup/create
+ * functions will perform lookup again and properly handle errors.
+ */
+ if (IS_ERR(ops))
+ return false;
+
+ ret = !!(ops->flags & TCF_PROTO_OPS_DOIT_UNLOCKED);
+ module_put(ops->owner);
+ return ret;
+}
+
static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
u32 prio, struct tcf_chain *chain,
+ bool rtnl_held,
struct netlink_ext_ack *extack)
{
struct tcf_proto *tp;
@@ -162,7 +259,7 @@
if (!tp)
return ERR_PTR(-ENOBUFS);
- tp->ops = tcf_proto_lookup_ops(kind, extack);
+ tp->ops = tcf_proto_lookup_ops(kind, rtnl_held, extack);
if (IS_ERR(tp->ops)) {
err = PTR_ERR(tp->ops);
goto errout;
@@ -171,6 +268,8 @@
tp->protocol = protocol;
tp->prio = prio;
tp->chain = chain;
+ spin_lock_init(&tp->lock);
+ refcount_set(&tp->refcnt, 1);
err = tp->ops->init(tp);
if (err) {
@@ -184,14 +283,82 @@
return ERR_PTR(err);
}
-static void tcf_proto_destroy(struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+static void tcf_proto_get(struct tcf_proto *tp)
{
- tp->ops->destroy(tp, extack);
+ refcount_inc(&tp->refcnt);
+}
+
+static void tcf_chain_put(struct tcf_chain *chain);
+
+static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held,
+ bool sig_destroy, struct netlink_ext_ack *extack)
+{
+ tp->ops->destroy(tp, rtnl_held, extack);
+ if (sig_destroy)
+ tcf_proto_signal_destroyed(tp->chain, tp);
+ tcf_chain_put(tp->chain);
module_put(tp->ops->owner);
kfree_rcu(tp, rcu);
}
+static void tcf_proto_put(struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ if (refcount_dec_and_test(&tp->refcnt))
+ tcf_proto_destroy(tp, rtnl_held, true, extack);
+}
+
+static int walker_check_empty(struct tcf_proto *tp, void *fh,
+ struct tcf_walker *arg)
+{
+ if (fh) {
+ arg->nonempty = true;
+ return -1;
+ }
+ return 0;
+}
+
+static bool tcf_proto_is_empty(struct tcf_proto *tp, bool rtnl_held)
+{
+ struct tcf_walker walker = { .fn = walker_check_empty, };
+
+ if (tp->ops->walk) {
+ tp->ops->walk(tp, &walker, rtnl_held);
+ return !walker.nonempty;
+ }
+ return true;
+}
+
+static bool tcf_proto_check_delete(struct tcf_proto *tp, bool rtnl_held)
+{
+ spin_lock(&tp->lock);
+ if (tcf_proto_is_empty(tp, rtnl_held))
+ tp->deleting = true;
+ spin_unlock(&tp->lock);
+ return tp->deleting;
+}
+
+static void tcf_proto_mark_delete(struct tcf_proto *tp)
+{
+ spin_lock(&tp->lock);
+ tp->deleting = true;
+ spin_unlock(&tp->lock);
+}
+
+static bool tcf_proto_is_deleting(struct tcf_proto *tp)
+{
+ bool deleting;
+
+ spin_lock(&tp->lock);
+ deleting = tp->deleting;
+ spin_unlock(&tp->lock);
+
+ return deleting;
+}
+
+#define ASSERT_BLOCK_LOCKED(block) \
+ lockdep_assert_held(&(block)->lock)
+
struct tcf_filter_chain_list_item {
struct list_head list;
tcf_chain_head_change_t *chain_head_change;
@@ -203,10 +370,13 @@
{
struct tcf_chain *chain;
+ ASSERT_BLOCK_LOCKED(block);
+
chain = kzalloc(sizeof(*chain), GFP_KERNEL);
if (!chain)
return NULL;
list_add_tail(&chain->list, &block->chain_list);
+ mutex_init(&chain->filter_chain_lock);
chain->block = block;
chain->index = chain_index;
chain->refcnt = 1;
@@ -230,29 +400,60 @@
if (chain->index)
return;
+
+ mutex_lock(&block->lock);
list_for_each_entry(item, &block->chain0.filter_chain_list, list)
tcf_chain_head_change_item(item, tp_head);
+ mutex_unlock(&block->lock);
}
-static void tcf_chain_destroy(struct tcf_chain *chain)
+/* Returns true if block can be safely freed. */
+
+static bool tcf_chain_detach(struct tcf_chain *chain)
{
struct tcf_block *block = chain->block;
+ ASSERT_BLOCK_LOCKED(block);
+
list_del(&chain->list);
if (!chain->index)
block->chain0.chain = NULL;
- kfree(chain);
- if (list_empty(&block->chain_list) && block->refcnt == 0)
- kfree(block);
+
+ if (list_empty(&block->chain_list) &&
+ refcount_read(&block->refcnt) == 0)
+ return true;
+
+ return false;
+}
+
+static void tcf_block_destroy(struct tcf_block *block)
+{
+ mutex_destroy(&block->lock);
+ mutex_destroy(&block->proto_destroy_lock);
+ kfree_rcu(block, rcu);
+}
+
+static void tcf_chain_destroy(struct tcf_chain *chain, bool free_block)
+{
+ struct tcf_block *block = chain->block;
+
+ mutex_destroy(&chain->filter_chain_lock);
+ kfree_rcu(chain, rcu);
+ if (free_block)
+ tcf_block_destroy(block);
}
static void tcf_chain_hold(struct tcf_chain *chain)
{
+ ASSERT_BLOCK_LOCKED(chain->block);
+
++chain->refcnt;
}
static bool tcf_chain_held_by_acts_only(struct tcf_chain *chain)
{
+ ASSERT_BLOCK_LOCKED(chain->block);
+
/* In case all the references are action references, this
* chain should not be shown to the user.
*/
@@ -264,6 +465,8 @@
{
struct tcf_chain *chain;
+ ASSERT_BLOCK_LOCKED(block);
+
list_for_each_entry(chain, &block->chain_list, list) {
if (chain->index == chain_index)
return chain;
@@ -278,31 +481,40 @@
u32 chain_index, bool create,
bool by_act)
{
- struct tcf_chain *chain = tcf_chain_lookup(block, chain_index);
+ struct tcf_chain *chain = NULL;
+ bool is_first_reference;
+ mutex_lock(&block->lock);
+ chain = tcf_chain_lookup(block, chain_index);
if (chain) {
tcf_chain_hold(chain);
} else {
if (!create)
- return NULL;
+ goto errout;
chain = tcf_chain_create(block, chain_index);
if (!chain)
- return NULL;
+ goto errout;
}
if (by_act)
++chain->action_refcnt;
+ is_first_reference = chain->refcnt - chain->action_refcnt == 1;
+ mutex_unlock(&block->lock);
/* Send notification only in case we got the first
* non-action reference. Until then, the chain acts only as
* a placeholder for actions pointing to it and user ought
* not know about them.
*/
- if (chain->refcnt - chain->action_refcnt == 1 && !by_act)
+ if (is_first_reference && !by_act)
tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL,
RTM_NEWCHAIN, false);
return chain;
+
+errout:
+ mutex_unlock(&block->lock);
+ return chain;
}
static struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
@@ -317,72 +529,208 @@
}
EXPORT_SYMBOL(tcf_chain_get_by_act);
-static void tc_chain_tmplt_del(struct tcf_chain *chain);
+static void tc_chain_tmplt_del(const struct tcf_proto_ops *tmplt_ops,
+ void *tmplt_priv);
+static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops,
+ void *tmplt_priv, u32 chain_index,
+ struct tcf_block *block, struct sk_buff *oskb,
+ u32 seq, u16 flags, bool unicast);
-static void __tcf_chain_put(struct tcf_chain *chain, bool by_act)
+static void __tcf_chain_put(struct tcf_chain *chain, bool by_act,
+ bool explicitly_created)
{
+ struct tcf_block *block = chain->block;
+ const struct tcf_proto_ops *tmplt_ops;
+ bool free_block = false;
+ unsigned int refcnt;
+ void *tmplt_priv;
+
+ mutex_lock(&block->lock);
+ if (explicitly_created) {
+ if (!chain->explicitly_created) {
+ mutex_unlock(&block->lock);
+ return;
+ }
+ chain->explicitly_created = false;
+ }
+
if (by_act)
chain->action_refcnt--;
- chain->refcnt--;
+
+ /* tc_chain_notify_delete can't be called while holding block lock.
+ * However, when block is unlocked chain can be changed concurrently, so
+ * save these to temporary variables.
+ */
+ refcnt = --chain->refcnt;
+ tmplt_ops = chain->tmplt_ops;
+ tmplt_priv = chain->tmplt_priv;
/* The last dropped non-action reference will trigger notification. */
- if (chain->refcnt - chain->action_refcnt == 0 && !by_act)
- tc_chain_notify(chain, NULL, 0, 0, RTM_DELCHAIN, false);
+ if (refcnt - chain->action_refcnt == 0 && !by_act) {
+ tc_chain_notify_delete(tmplt_ops, tmplt_priv, chain->index,
+ block, NULL, 0, 0, false);
+ /* Last reference to chain, no need to lock. */
+ chain->flushing = false;
+ }
- if (chain->refcnt == 0) {
- tc_chain_tmplt_del(chain);
- tcf_chain_destroy(chain);
+ if (refcnt == 0)
+ free_block = tcf_chain_detach(chain);
+ mutex_unlock(&block->lock);
+
+ if (refcnt == 0) {
+ tc_chain_tmplt_del(tmplt_ops, tmplt_priv);
+ tcf_chain_destroy(chain, free_block);
}
}
static void tcf_chain_put(struct tcf_chain *chain)
{
- __tcf_chain_put(chain, false);
+ __tcf_chain_put(chain, false, false);
}
void tcf_chain_put_by_act(struct tcf_chain *chain)
{
- __tcf_chain_put(chain, true);
+ __tcf_chain_put(chain, true, false);
}
EXPORT_SYMBOL(tcf_chain_put_by_act);
static void tcf_chain_put_explicitly_created(struct tcf_chain *chain)
{
- if (chain->explicitly_created)
- tcf_chain_put(chain);
+ __tcf_chain_put(chain, false, true);
}
-static void tcf_chain_flush(struct tcf_chain *chain)
+static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held)
{
- struct tcf_proto *tp = rtnl_dereference(chain->filter_chain);
+ struct tcf_proto *tp, *tp_next;
- tcf_chain0_head_change(chain, NULL);
+ mutex_lock(&chain->filter_chain_lock);
+ tp = tcf_chain_dereference(chain->filter_chain, chain);
while (tp) {
- RCU_INIT_POINTER(chain->filter_chain, tp->next);
- tcf_proto_destroy(tp, NULL);
- tp = rtnl_dereference(chain->filter_chain);
- tcf_chain_put(chain);
+ tp_next = rcu_dereference_protected(tp->next, 1);
+ tcf_proto_signal_destroying(chain, tp);
+ tp = tp_next;
}
+ tp = tcf_chain_dereference(chain->filter_chain, chain);
+ RCU_INIT_POINTER(chain->filter_chain, NULL);
+ tcf_chain0_head_change(chain, NULL);
+ chain->flushing = true;
+ mutex_unlock(&chain->filter_chain_lock);
+
+ while (tp) {
+ tp_next = rcu_dereference_protected(tp->next, 1);
+ tcf_proto_put(tp, rtnl_held, NULL);
+ tp = tp_next;
+ }
+}
+
+static int tcf_block_setup(struct tcf_block *block,
+ struct flow_block_offload *bo);
+
+static void tc_indr_block_ing_cmd(struct net_device *dev,
+ struct tcf_block *block,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_priv,
+ enum flow_block_command command)
+{
+ struct flow_block_offload bo = {
+ .command = command,
+ .binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
+ .net = dev_net(dev),
+ .block_shared = tcf_block_non_null_shared(block),
+ };
+ INIT_LIST_HEAD(&bo.cb_list);
+
+ if (!block)
+ return;
+
+ bo.block = &block->flow_block;
+
+ down_write(&block->cb_lock);
+ cb(dev, cb_priv, TC_SETUP_BLOCK, &bo);
+
+ tcf_block_setup(block, &bo);
+ up_write(&block->cb_lock);
+}
+
+static struct tcf_block *tc_dev_ingress_block(struct net_device *dev)
+{
+ const struct Qdisc_class_ops *cops;
+ struct Qdisc *qdisc;
+
+ if (!dev_ingress_queue(dev))
+ return NULL;
+
+ qdisc = dev_ingress_queue(dev)->qdisc_sleeping;
+ if (!qdisc)
+ return NULL;
+
+ cops = qdisc->ops->cl_ops;
+ if (!cops)
+ return NULL;
+
+ if (!cops->tcf_block)
+ return NULL;
+
+ return cops->tcf_block(qdisc, TC_H_MIN_INGRESS, NULL);
+}
+
+static void tc_indr_block_get_and_ing_cmd(struct net_device *dev,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_priv,
+ enum flow_block_command command)
+{
+ struct tcf_block *block = tc_dev_ingress_block(dev);
+
+ tc_indr_block_ing_cmd(dev, block, cb, cb_priv, command);
+}
+
+static void tc_indr_block_call(struct tcf_block *block,
+ struct net_device *dev,
+ struct tcf_block_ext_info *ei,
+ enum flow_block_command command,
+ struct netlink_ext_ack *extack)
+{
+ struct flow_block_offload bo = {
+ .command = command,
+ .binder_type = ei->binder_type,
+ .net = dev_net(dev),
+ .block = &block->flow_block,
+ .block_shared = tcf_block_shared(block),
+ .extack = extack,
+ };
+ INIT_LIST_HEAD(&bo.cb_list);
+
+ flow_indr_block_call(dev, &bo, command);
+ tcf_block_setup(block, &bo);
}
static bool tcf_block_offload_in_use(struct tcf_block *block)
{
- return block->offloadcnt;
+ return atomic_read(&block->offloadcnt);
}
static int tcf_block_offload_cmd(struct tcf_block *block,
struct net_device *dev,
struct tcf_block_ext_info *ei,
- enum tc_block_command command,
+ enum flow_block_command command,
struct netlink_ext_ack *extack)
{
- struct tc_block_offload bo = {};
+ struct flow_block_offload bo = {};
+ int err;
+ bo.net = dev_net(dev);
bo.command = command;
bo.binder_type = ei->binder_type;
- bo.block = block;
+ bo.block = &block->flow_block;
+ bo.block_shared = tcf_block_shared(block);
bo.extack = extack;
- return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
+ INIT_LIST_HEAD(&bo.cb_list);
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
+ if (err < 0)
+ return err;
+
+ return tcf_block_setup(block, &bo);
}
static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
@@ -392,6 +740,7 @@
struct net_device *dev = q->dev_queue->dev;
int err;
+ down_write(&block->cb_lock);
if (!dev->netdev_ops->ndo_setup_tc)
goto no_offload_dev_inc;
@@ -400,19 +749,31 @@
*/
if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) {
NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled");
- return -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
+ goto err_unlock;
}
- err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack);
+ err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_BIND, extack);
if (err == -EOPNOTSUPP)
goto no_offload_dev_inc;
- return err;
+ if (err)
+ goto err_unlock;
+
+ tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack);
+ up_write(&block->cb_lock);
+ return 0;
no_offload_dev_inc:
- if (tcf_block_offload_in_use(block))
- return -EOPNOTSUPP;
+ if (tcf_block_offload_in_use(block)) {
+ err = -EOPNOTSUPP;
+ goto err_unlock;
+ }
+ err = 0;
block->nooffloaddevcnt++;
- return 0;
+ tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack);
+err_unlock:
+ up_write(&block->cb_lock);
+ return err;
}
static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
@@ -421,15 +782,20 @@
struct net_device *dev = q->dev_queue->dev;
int err;
+ down_write(&block->cb_lock);
+ tc_indr_block_call(block, dev, ei, FLOW_BLOCK_UNBIND, NULL);
+
if (!dev->netdev_ops->ndo_setup_tc)
goto no_offload_dev_dec;
- err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL);
+ err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_UNBIND, NULL);
if (err == -EOPNOTSUPP)
goto no_offload_dev_dec;
+ up_write(&block->cb_lock);
return;
no_offload_dev_dec:
WARN_ON(block->nooffloaddevcnt-- == 0);
+ up_write(&block->cb_lock);
}
static int
@@ -437,8 +803,8 @@
struct tcf_block_ext_info *ei,
struct netlink_ext_ack *extack)
{
- struct tcf_chain *chain0 = block->chain0.chain;
struct tcf_filter_chain_list_item *item;
+ struct tcf_chain *chain0;
item = kmalloc(sizeof(*item), GFP_KERNEL);
if (!item) {
@@ -447,9 +813,32 @@
}
item->chain_head_change = ei->chain_head_change;
item->chain_head_change_priv = ei->chain_head_change_priv;
- if (chain0 && chain0->filter_chain)
- tcf_chain_head_change_item(item, chain0->filter_chain);
- list_add(&item->list, &block->chain0.filter_chain_list);
+
+ mutex_lock(&block->lock);
+ chain0 = block->chain0.chain;
+ if (chain0)
+ tcf_chain_hold(chain0);
+ else
+ list_add(&item->list, &block->chain0.filter_chain_list);
+ mutex_unlock(&block->lock);
+
+ if (chain0) {
+ struct tcf_proto *tp_head;
+
+ mutex_lock(&chain0->filter_chain_lock);
+
+ tp_head = tcf_chain_dereference(chain0->filter_chain, chain0);
+ if (tp_head)
+ tcf_chain_head_change_item(item, tp_head);
+
+ mutex_lock(&block->lock);
+ list_add(&item->list, &block->chain0.filter_chain_list);
+ mutex_unlock(&block->lock);
+
+ mutex_unlock(&chain0->filter_chain_lock);
+ tcf_chain_put(chain0);
+ }
+
return 0;
}
@@ -457,24 +846,28 @@
tcf_chain0_head_change_cb_del(struct tcf_block *block,
struct tcf_block_ext_info *ei)
{
- struct tcf_chain *chain0 = block->chain0.chain;
struct tcf_filter_chain_list_item *item;
+ mutex_lock(&block->lock);
list_for_each_entry(item, &block->chain0.filter_chain_list, list) {
if ((!ei->chain_head_change && !ei->chain_head_change_priv) ||
(item->chain_head_change == ei->chain_head_change &&
item->chain_head_change_priv == ei->chain_head_change_priv)) {
- if (chain0)
+ if (block->chain0.chain)
tcf_chain_head_change_item(item, NULL);
list_del(&item->list);
+ mutex_unlock(&block->lock);
+
kfree(item);
return;
}
}
+ mutex_unlock(&block->lock);
WARN_ON(1);
}
struct tcf_net {
+ spinlock_t idr_lock; /* Protects idr */
struct idr idr;
};
@@ -484,16 +877,25 @@
struct netlink_ext_ack *extack)
{
struct tcf_net *tn = net_generic(net, tcf_net_id);
+ int err;
- return idr_alloc_u32(&tn->idr, block, &block->index, block->index,
- GFP_KERNEL);
+ idr_preload(GFP_KERNEL);
+ spin_lock(&tn->idr_lock);
+ err = idr_alloc_u32(&tn->idr, block, &block->index, block->index,
+ GFP_NOWAIT);
+ spin_unlock(&tn->idr_lock);
+ idr_preload_end();
+
+ return err;
}
static void tcf_block_remove(struct tcf_block *block, struct net *net)
{
struct tcf_net *tn = net_generic(net, tcf_net_id);
+ spin_lock(&tn->idr_lock);
idr_remove(&tn->idr, block->index);
+ spin_unlock(&tn->idr_lock);
}
static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
@@ -507,12 +909,15 @@
NL_SET_ERR_MSG(extack, "Memory allocation for block failed");
return ERR_PTR(-ENOMEM);
}
+ mutex_init(&block->lock);
+ mutex_init(&block->proto_destroy_lock);
+ init_rwsem(&block->cb_lock);
+ flow_block_init(&block->flow_block);
INIT_LIST_HEAD(&block->chain_list);
- INIT_LIST_HEAD(&block->cb_list);
INIT_LIST_HEAD(&block->owner_list);
INIT_LIST_HEAD(&block->chain0.filter_chain_list);
- block->refcnt = 1;
+ refcount_set(&block->refcnt, 1);
block->net = net;
block->index = block_index;
@@ -529,6 +934,301 @@
return idr_find(&tn->idr, block_index);
}
+static struct tcf_block *tcf_block_refcnt_get(struct net *net, u32 block_index)
+{
+ struct tcf_block *block;
+
+ rcu_read_lock();
+ block = tcf_block_lookup(net, block_index);
+ if (block && !refcount_inc_not_zero(&block->refcnt))
+ block = NULL;
+ rcu_read_unlock();
+
+ return block;
+}
+
+static struct tcf_chain *
+__tcf_get_next_chain(struct tcf_block *block, struct tcf_chain *chain)
+{
+ mutex_lock(&block->lock);
+ if (chain)
+ chain = list_is_last(&chain->list, &block->chain_list) ?
+ NULL : list_next_entry(chain, list);
+ else
+ chain = list_first_entry_or_null(&block->chain_list,
+ struct tcf_chain, list);
+
+ /* skip all action-only chains */
+ while (chain && tcf_chain_held_by_acts_only(chain))
+ chain = list_is_last(&chain->list, &block->chain_list) ?
+ NULL : list_next_entry(chain, list);
+
+ if (chain)
+ tcf_chain_hold(chain);
+ mutex_unlock(&block->lock);
+
+ return chain;
+}
+
+/* Function to be used by all clients that want to iterate over all chains on
+ * block. It properly obtains block->lock and takes reference to chain before
+ * returning it. Users of this function must be tolerant to concurrent chain
+ * insertion/deletion or ensure that no concurrent chain modification is
+ * possible. Note that all netlink dump callbacks cannot guarantee to provide
+ * consistent dump because rtnl lock is released each time skb is filled with
+ * data and sent to user-space.
+ *
+ * Pass chain == NULL to start the iteration. When chain is non-NULL, the
+ * reference held on it is released with tcf_chain_put() after the next chain
+ * has been looked up. Chains for which tcf_chain_held_by_acts_only() is true
+ * are skipped. Returns NULL when no further chain is found.
+ */
+
+struct tcf_chain *
+tcf_get_next_chain(struct tcf_block *block, struct tcf_chain *chain)
+{
+ struct tcf_chain *chain_next = __tcf_get_next_chain(block, chain);
+
+ if (chain)
+ tcf_chain_put(chain);
+
+ return chain_next;
+}
+EXPORT_SYMBOL(tcf_get_next_chain);
+
+static struct tcf_proto *
+__tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp)
+{
+ u32 prio = 0;
+
+ ASSERT_RTNL();
+ mutex_lock(&chain->filter_chain_lock);
+
+ if (!tp) {
+ tp = tcf_chain_dereference(chain->filter_chain, chain);
+ } else if (tcf_proto_is_deleting(tp)) {
+ /* 'deleting' flag is set and chain->filter_chain_lock was
+ * unlocked, which means next pointer could be invalid. Restart
+ * search from the head of the chain and stop at the first tp
+ * that is not marked deleting and whose prio is greater than
+ * the prio of the tp that was passed in.
+ */
+ prio = tp->prio + 1;
+ tp = tcf_chain_dereference(chain->filter_chain, chain);
+
+ for (; tp; tp = tcf_chain_dereference(tp->next, chain))
+ if (!tp->deleting && tp->prio >= prio)
+ break;
+ } else {
+ tp = tcf_chain_dereference(tp->next, chain);
+ }
+
+ if (tp)
+ tcf_proto_get(tp);
+
+ mutex_unlock(&chain->filter_chain_lock);
+
+ return tp;
+}
+
+/* Function to be used by all clients that want to iterate over all tp's on
+ * chain. Users of this function must be tolerant to concurrent tp
+ * insertion/deletion or ensure that no concurrent chain modification is
+ * possible. Note that all netlink dump callbacks cannot guarantee to provide
+ * consistent dump because rtnl lock is released each time skb is filled with
+ * data and sent to user-space.
+ *
+ * Pass tp == NULL to start from the first tp on chain. A reference is taken
+ * on the returned tp; when tp is non-NULL, the reference held on it is
+ * released with tcf_proto_put() after the next tp has been looked up.
+ * Returns NULL when no further tp is found.
+ */
+
+struct tcf_proto *
+tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp,
+ bool rtnl_held)
+{
+ struct tcf_proto *tp_next = __tcf_get_next_proto(chain, tp);
+
+ if (tp)
+ tcf_proto_put(tp, rtnl_held, NULL);
+
+ return tp_next;
+}
+EXPORT_SYMBOL(tcf_get_next_proto);
+
+static void tcf_block_flush_all_chains(struct tcf_block *block, bool rtnl_held)
+{
+ struct tcf_chain *chain;
+
+ /* Last reference to block. At this point chains cannot be added or
+ * removed concurrently.
+ */
+ for (chain = tcf_get_next_chain(block, NULL);
+ chain;
+ chain = tcf_get_next_chain(block, chain)) {
+ tcf_chain_put_explicitly_created(chain);
+ tcf_chain_flush(chain, rtnl_held);
+ }
+}
+
+/* Look up the Qdisc and increment its reference counter.
+ * Set parent, if necessary.
+ *
+ * Returns 0 immediately, without touching *q, when ifindex is
+ * TCM_IFINDEX_MAGIC_BLOCK. Otherwise returns 0 with *q set to the qdisc
+ * returned by qdisc_refcount_inc_nz(), or a negative errno:
+ * -ENODEV if the device is not found, -EINVAL if the qdisc is not found,
+ * qdisc_refcount_inc_nz() returns NULL or the qdisc has no class ops,
+ * -EOPNOTSUPP if the class ops have no tcf_block callback. On the -EINVAL
+ * and -EOPNOTSUPP paths *q is left NULL.
+ */
+
+static int __tcf_qdisc_find(struct net *net, struct Qdisc **q,
+ u32 *parent, int ifindex, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ const struct Qdisc_class_ops *cops;
+ struct net_device *dev;
+ int err = 0;
+
+ if (ifindex == TCM_IFINDEX_MAGIC_BLOCK)
+ return 0;
+
+ rcu_read_lock();
+
+ /* Find link */
+ dev = dev_get_by_index_rcu(net, ifindex);
+ if (!dev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+
+ /* Find qdisc */
+ if (!*parent) {
+ *q = dev->qdisc;
+ *parent = (*q)->handle;
+ } else {
+ *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent));
+ if (!*q) {
+ NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
+ err = -EINVAL;
+ goto errout_rcu;
+ }
+ }
+
+ *q = qdisc_refcount_inc_nz(*q);
+ if (!*q) {
+ NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
+ err = -EINVAL;
+ goto errout_rcu;
+ }
+
+ /* Is it classful? */
+ cops = (*q)->ops->cl_ops;
+ if (!cops) {
+ NL_SET_ERR_MSG(extack, "Qdisc not classful");
+ err = -EINVAL;
+ goto errout_qdisc;
+ }
+
+ if (!cops->tcf_block) {
+ NL_SET_ERR_MSG(extack, "Class doesn't support blocks");
+ err = -EOPNOTSUPP;
+ goto errout_qdisc;
+ }
+
+errout_rcu:
+ /* When reached by falling through (success), we know that qdisc is
+ * not noop_qdisc, which means that qdisc holds a reference to
+ * net_device and we hold a reference to qdisc, so it is safe to
+ * release rcu read lock. The two lookup-failure gotos above also land
+ * here, with *q == NULL and err already set.
+ */
+ rcu_read_unlock();
+ return err;
+
+errout_qdisc:
+ rcu_read_unlock();
+
+ if (rtnl_held)
+ qdisc_put(*q);
+ else
+ qdisc_put_unlocked(*q);
+ *q = NULL;
+
+ return err;
+}
+
+static int __tcf_qdisc_cl_find(struct Qdisc *q, u32 parent, unsigned long *cl,
+ int ifindex, struct netlink_ext_ack *extack)
+{
+ if (ifindex == TCM_IFINDEX_MAGIC_BLOCK)
+ return 0;
+
+ /* Do we search for filter, attached to class? */
+ if (TC_H_MIN(parent)) {
+ const struct Qdisc_class_ops *cops = q->ops->cl_ops;
+
+ *cl = cops->find(q, parent);
+ if (*cl == 0) {
+ NL_SET_ERR_MSG(extack, "Specified class doesn't exist");
+ return -ENOENT;
+ }
+ }
+
+ return 0;
+}
+
+static struct tcf_block *__tcf_block_find(struct net *net, struct Qdisc *q,
+ unsigned long cl, int ifindex,
+ u32 block_index,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_block *block;
+
+ if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
+ block = tcf_block_refcnt_get(net, block_index);
+ if (!block) {
+ NL_SET_ERR_MSG(extack, "Block of given index was not found");
+ return ERR_PTR(-EINVAL);
+ }
+ } else {
+ const struct Qdisc_class_ops *cops = q->ops->cl_ops;
+
+ block = cops->tcf_block(q, cl, extack);
+ if (!block)
+ return ERR_PTR(-EINVAL);
+
+ if (tcf_block_shared(block)) {
+ NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters");
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ /* Always take a reference to block in order to support
+ * execution of rules update path of cls API without rtnl lock.
+ * Caller must release block when it is finished using it. The
+ * 'if' branch of this conditional obtains its reference to
+ * block by calling tcf_block_refcnt_get().
+ */
+ refcount_inc(&block->refcnt);
+ }
+
+ return block;
+}
+
+static void __tcf_block_put(struct tcf_block *block, struct Qdisc *q,
+ struct tcf_block_ext_info *ei, bool rtnl_held)
+{
+ if (refcount_dec_and_mutex_lock(&block->refcnt, &block->lock)) {
+ /* Flushing/putting all chains will cause the block to be
+ * deallocated when last chain is freed. However, if chain_list
+ * is empty, block has to be manually deallocated. After block
+ * reference counter reached 0, it is no longer possible to
+ * increment it or add new chains to block.
+ */
+ bool free_block = list_empty(&block->chain_list);
+
+ mutex_unlock(&block->lock);
+ if (tcf_block_shared(block))
+ tcf_block_remove(block, block->net);
+
+ if (q)
+ tcf_block_offload_unbind(block, q, ei);
+
+ if (free_block)
+ tcf_block_destroy(block);
+ else
+ tcf_block_flush_all_chains(block, rtnl_held);
+ } else if (q) {
+ tcf_block_offload_unbind(block, q, ei);
+ }
+}
+
+static void tcf_block_refcnt_put(struct tcf_block *block, bool rtnl_held)
+{
+ __tcf_block_put(block, NULL, NULL, rtnl_held);
+}
+
/* Find tcf block.
* Set q, parent, cl when appropriate.
*/
@@ -539,82 +1239,62 @@
struct netlink_ext_ack *extack)
{
struct tcf_block *block;
+ int err = 0;
- if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
- block = tcf_block_lookup(net, block_index);
- if (!block) {
- NL_SET_ERR_MSG(extack, "Block of given index was not found");
- return ERR_PTR(-EINVAL);
- }
- } else {
- const struct Qdisc_class_ops *cops;
- struct net_device *dev;
+ ASSERT_RTNL();
- /* Find link */
- dev = __dev_get_by_index(net, ifindex);
- if (!dev)
- return ERR_PTR(-ENODEV);
+ err = __tcf_qdisc_find(net, q, parent, ifindex, true, extack);
+ if (err)
+ goto errout;
- /* Find qdisc */
- if (!*parent) {
- *q = dev->qdisc;
- *parent = (*q)->handle;
- } else {
- *q = qdisc_lookup(dev, TC_H_MAJ(*parent));
- if (!*q) {
- NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
- return ERR_PTR(-EINVAL);
- }
- }
+ err = __tcf_qdisc_cl_find(*q, *parent, cl, ifindex, extack);
+ if (err)
+ goto errout_qdisc;
- /* Is it classful? */
- cops = (*q)->ops->cl_ops;
- if (!cops) {
- NL_SET_ERR_MSG(extack, "Qdisc not classful");
- return ERR_PTR(-EINVAL);
- }
-
- if (!cops->tcf_block) {
- NL_SET_ERR_MSG(extack, "Class doesn't support blocks");
- return ERR_PTR(-EOPNOTSUPP);
- }
-
- /* Do we search for filter, attached to class? */
- if (TC_H_MIN(*parent)) {
- *cl = cops->find(*q, *parent);
- if (*cl == 0) {
- NL_SET_ERR_MSG(extack, "Specified class doesn't exist");
- return ERR_PTR(-ENOENT);
- }
- }
-
- /* And the last stroke */
- block = cops->tcf_block(*q, *cl, extack);
- if (!block)
- return ERR_PTR(-EINVAL);
- if (tcf_block_shared(block)) {
- NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters");
- return ERR_PTR(-EOPNOTSUPP);
- }
+ block = __tcf_block_find(net, *q, *cl, ifindex, block_index, extack);
+ if (IS_ERR(block)) {
+ err = PTR_ERR(block);
+ goto errout_qdisc;
}
return block;
+
+errout_qdisc:
+ if (*q)
+ qdisc_put(*q);
+errout:
+ *q = NULL;
+ return ERR_PTR(err);
+}
+
+static void tcf_block_release(struct Qdisc *q, struct tcf_block *block,
+ bool rtnl_held)
+{
+ if (!IS_ERR_OR_NULL(block))
+ tcf_block_refcnt_put(block, rtnl_held);
+
+ if (q) {
+ if (rtnl_held)
+ qdisc_put(q);
+ else
+ qdisc_put_unlocked(q);
+ }
}
struct tcf_block_owner_item {
struct list_head list;
struct Qdisc *q;
- enum tcf_block_binder_type binder_type;
+ enum flow_block_binder_type binder_type;
};
static void
tcf_block_owner_netif_keep_dst(struct tcf_block *block,
struct Qdisc *q,
- enum tcf_block_binder_type binder_type)
+ enum flow_block_binder_type binder_type)
{
if (block->keep_dst &&
- binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS &&
- binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
+ binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS &&
+ binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
netif_keep_dst(qdisc_dev(q));
}
@@ -631,7 +1311,7 @@
static int tcf_block_owner_add(struct tcf_block *block,
struct Qdisc *q,
- enum tcf_block_binder_type binder_type)
+ enum flow_block_binder_type binder_type)
{
struct tcf_block_owner_item *item;
@@ -646,7 +1326,7 @@
static void tcf_block_owner_del(struct tcf_block *block,
struct Qdisc *q,
- enum tcf_block_binder_type binder_type)
+ enum flow_block_binder_type binder_type)
{
struct tcf_block_owner_item *item;
@@ -666,21 +1346,16 @@
{
struct net *net = qdisc_net(q);
struct tcf_block *block = NULL;
- bool created = false;
int err;
- if (ei->block_index) {
+ if (ei->block_index)
/* block_index not 0 means the shared block is requested */
- block = tcf_block_lookup(net, ei->block_index);
- if (block)
- block->refcnt++;
- }
+ block = tcf_block_refcnt_get(net, ei->block_index);
if (!block) {
block = tcf_block_create(net, q, ei->block_index, extack);
if (IS_ERR(block))
return PTR_ERR(block);
- created = true;
if (tcf_block_shared(block)) {
err = tcf_block_insert(block, net, extack);
if (err)
@@ -710,14 +1385,8 @@
err_chain0_head_change_cb_add:
tcf_block_owner_del(block, q, ei->binder_type);
err_block_owner_add:
- if (created) {
- if (tcf_block_shared(block))
- tcf_block_remove(block, net);
err_block_insert:
- kfree(block);
- } else {
- block->refcnt--;
- }
+ tcf_block_refcnt_put(block, true);
return err;
}
EXPORT_SYMBOL(tcf_block_get_ext);
@@ -749,42 +1418,12 @@
void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
struct tcf_block_ext_info *ei)
{
- struct tcf_chain *chain, *tmp;
-
if (!block)
return;
tcf_chain0_head_change_cb_del(block, ei);
tcf_block_owner_del(block, q, ei->binder_type);
- if (block->refcnt == 1) {
- if (tcf_block_shared(block))
- tcf_block_remove(block, block->net);
-
- /* Hold a refcnt for all chains, so that they don't disappear
- * while we are iterating.
- */
- list_for_each_entry(chain, &block->chain_list, list)
- tcf_chain_hold(chain);
-
- list_for_each_entry(chain, &block->chain_list, list)
- tcf_chain_flush(chain);
- }
-
- tcf_block_offload_unbind(block, q, ei);
-
- if (block->refcnt == 1) {
- /* At this point, all the chains should have refcnt >= 1. */
- list_for_each_entry_safe(chain, tmp, &block->chain_list, list) {
- tcf_chain_put_explicitly_created(chain);
- tcf_chain_put(chain);
- }
-
- block->refcnt--;
- if (list_empty(&block->chain_list))
- kfree(block);
- } else {
- block->refcnt--;
- }
+ __tcf_block_put(block, q, ei, true);
}
EXPORT_SYMBOL(tcf_block_put_ext);
@@ -799,55 +1438,26 @@
EXPORT_SYMBOL(tcf_block_put);
-struct tcf_block_cb {
- struct list_head list;
- tc_setup_cb_t *cb;
- void *cb_ident;
- void *cb_priv;
- unsigned int refcnt;
-};
-
-void *tcf_block_cb_priv(struct tcf_block_cb *block_cb)
-{
- return block_cb->cb_priv;
-}
-EXPORT_SYMBOL(tcf_block_cb_priv);
-
-struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block,
- tc_setup_cb_t *cb, void *cb_ident)
-{ struct tcf_block_cb *block_cb;
-
- list_for_each_entry(block_cb, &block->cb_list, list)
- if (block_cb->cb == cb && block_cb->cb_ident == cb_ident)
- return block_cb;
- return NULL;
-}
-EXPORT_SYMBOL(tcf_block_cb_lookup);
-
-void tcf_block_cb_incref(struct tcf_block_cb *block_cb)
-{
- block_cb->refcnt++;
-}
-EXPORT_SYMBOL(tcf_block_cb_incref);
-
-unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb)
-{
- return --block_cb->refcnt;
-}
-EXPORT_SYMBOL(tcf_block_cb_decref);
-
static int
-tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb,
+tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb,
void *cb_priv, bool add, bool offload_in_use,
struct netlink_ext_ack *extack)
{
- struct tcf_chain *chain;
- struct tcf_proto *tp;
+ struct tcf_chain *chain, *chain_prev;
+ struct tcf_proto *tp, *tp_prev;
int err;
- list_for_each_entry(chain, &block->chain_list, list) {
- for (tp = rtnl_dereference(chain->filter_chain); tp;
- tp = rtnl_dereference(tp->next)) {
+ lockdep_assert_held(&block->cb_lock);
+
+ for (chain = __tcf_get_next_chain(block, NULL);
+ chain;
+ chain_prev = chain,
+ chain = __tcf_get_next_chain(block, chain),
+ tcf_chain_put(chain_prev)) {
+ for (tp = __tcf_get_next_proto(chain, NULL); tp;
+ tp_prev = tp,
+ tp = __tcf_get_next_proto(chain, tp),
+ tcf_proto_put(tp_prev, true, NULL)) {
if (tp->ops->reoffload) {
err = tp->ops->reoffload(tp, add, cb, cb_priv,
extack);
@@ -864,93 +1474,92 @@
return 0;
err_playback_remove:
+ tcf_proto_put(tp, true, NULL);
+ tcf_chain_put(chain);
tcf_block_playback_offloads(block, cb, cb_priv, false, offload_in_use,
extack);
return err;
}
-struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
- tc_setup_cb_t *cb, void *cb_ident,
- void *cb_priv,
- struct netlink_ext_ack *extack)
+static int tcf_block_bind(struct tcf_block *block,
+ struct flow_block_offload *bo)
{
- struct tcf_block_cb *block_cb;
- int err;
+ struct flow_block_cb *block_cb, *next;
+ int err, i = 0;
- /* Replay any already present rules */
- err = tcf_block_playback_offloads(block, cb, cb_priv, true,
- tcf_block_offload_in_use(block),
- extack);
- if (err)
- return ERR_PTR(err);
+ lockdep_assert_held(&block->cb_lock);
- block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL);
- if (!block_cb)
- return ERR_PTR(-ENOMEM);
- block_cb->cb = cb;
- block_cb->cb_ident = cb_ident;
- block_cb->cb_priv = cb_priv;
- list_add(&block_cb->list, &block->cb_list);
- return block_cb;
-}
-EXPORT_SYMBOL(__tcf_block_cb_register);
+ list_for_each_entry(block_cb, &bo->cb_list, list) {
+ err = tcf_block_playback_offloads(block, block_cb->cb,
+ block_cb->cb_priv, true,
+ tcf_block_offload_in_use(block),
+ bo->extack);
+ if (err)
+ goto err_unroll;
+ if (!bo->unlocked_driver_cb)
+ block->lockeddevcnt++;
-int tcf_block_cb_register(struct tcf_block *block,
- tc_setup_cb_t *cb, void *cb_ident,
- void *cb_priv, struct netlink_ext_ack *extack)
-{
- struct tcf_block_cb *block_cb;
-
- block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv,
- extack);
- return PTR_ERR_OR_ZERO(block_cb);
-}
-EXPORT_SYMBOL(tcf_block_cb_register);
-
-void __tcf_block_cb_unregister(struct tcf_block *block,
- struct tcf_block_cb *block_cb)
-{
- tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv,
- false, tcf_block_offload_in_use(block),
- NULL);
- list_del(&block_cb->list);
- kfree(block_cb);
-}
-EXPORT_SYMBOL(__tcf_block_cb_unregister);
-
-void tcf_block_cb_unregister(struct tcf_block *block,
- tc_setup_cb_t *cb, void *cb_ident)
-{
- struct tcf_block_cb *block_cb;
-
- block_cb = tcf_block_cb_lookup(block, cb, cb_ident);
- if (!block_cb)
- return;
- __tcf_block_cb_unregister(block, block_cb);
-}
-EXPORT_SYMBOL(tcf_block_cb_unregister);
-
-static int tcf_block_cb_call(struct tcf_block *block, enum tc_setup_type type,
- void *type_data, bool err_stop)
-{
- struct tcf_block_cb *block_cb;
- int ok_count = 0;
- int err;
-
- /* Make sure all netdevs sharing this block are offload-capable. */
- if (block->nooffloaddevcnt && err_stop)
- return -EOPNOTSUPP;
-
- list_for_each_entry(block_cb, &block->cb_list, list) {
- err = block_cb->cb(type, type_data, block_cb->cb_priv);
- if (err) {
- if (err_stop)
- return err;
- } else {
- ok_count++;
- }
+ i++;
}
- return ok_count;
+ list_splice(&bo->cb_list, &block->flow_block.cb_list);
+
+ return 0;
+
+err_unroll:
+ list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
+ if (i-- > 0) {
+ list_del(&block_cb->list);
+ tcf_block_playback_offloads(block, block_cb->cb,
+ block_cb->cb_priv, false,
+ tcf_block_offload_in_use(block),
+ NULL);
+ if (!bo->unlocked_driver_cb)
+ block->lockeddevcnt--;
+ }
+ flow_block_cb_free(block_cb);
+ }
+
+ return err;
+}
+
+static void tcf_block_unbind(struct tcf_block *block,
+ struct flow_block_offload *bo)
+{
+ struct flow_block_cb *block_cb, *next;
+
+ lockdep_assert_held(&block->cb_lock);
+
+ list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
+ tcf_block_playback_offloads(block, block_cb->cb,
+ block_cb->cb_priv, false,
+ tcf_block_offload_in_use(block),
+ NULL);
+ list_del(&block_cb->list);
+ flow_block_cb_free(block_cb);
+ if (!bo->unlocked_driver_cb)
+ block->lockeddevcnt--;
+ }
+}
+
+static int tcf_block_setup(struct tcf_block *block,
+ struct flow_block_offload *bo)
+{
+ int err;
+
+ switch (bo->command) {
+ case FLOW_BLOCK_BIND:
+ err = tcf_block_bind(block, bo);
+ break;
+ case FLOW_BLOCK_UNBIND:
+ err = 0;
+ tcf_block_unbind(block, bo);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ err = -EOPNOTSUPP;
+ }
+
+ return err;
}
/* Main classifier routine: scans classifier chain attached
@@ -960,7 +1569,6 @@
int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res, bool compat_mode)
{
- __be16 protocol = tc_skb_protocol(skb);
#ifdef CONFIG_NET_CLS_ACT
const int max_reclassify_loop = 4;
const struct tcf_proto *orig_tp = tp;
@@ -970,6 +1578,7 @@
reclassify:
#endif
for (; tp; tp = rcu_dereference_bh(tp->next)) {
+ __be16 protocol = tc_skb_protocol(skb);
int err;
if (tp->protocol != protocol &&
@@ -983,6 +1592,18 @@
goto reset;
} else if (unlikely(TC_ACT_EXT_CMP(err, TC_ACT_GOTO_CHAIN))) {
first_tp = res->goto_tp;
+
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ {
+ struct tc_skb_ext *ext;
+
+ ext = skb_ext_add(skb, TC_SKB_EXT);
+ if (WARN_ON_ONCE(!ext))
+ return TC_ACT_SHOT;
+
+ ext->chain = err & TC_ACT_EXT_VAL_MASK;
+ }
+#endif
goto reset;
}
#endif
@@ -1002,7 +1623,6 @@
}
tp = first_tp;
- protocol = tc_skb_protocol(skb);
goto reclassify;
#endif
}
@@ -1013,32 +1633,123 @@
struct tcf_proto __rcu *next;
};
-static struct tcf_proto *tcf_chain_tp_prev(struct tcf_chain_info *chain_info)
+static struct tcf_proto *tcf_chain_tp_prev(struct tcf_chain *chain,
+ struct tcf_chain_info *chain_info)
{
- return rtnl_dereference(*chain_info->pprev);
+ return tcf_chain_dereference(*chain_info->pprev, chain);
}
-static void tcf_chain_tp_insert(struct tcf_chain *chain,
- struct tcf_chain_info *chain_info,
- struct tcf_proto *tp)
+static int tcf_chain_tp_insert(struct tcf_chain *chain,
+ struct tcf_chain_info *chain_info,
+ struct tcf_proto *tp)
{
+ if (chain->flushing)
+ return -EAGAIN;
+
if (*chain_info->pprev == chain->filter_chain)
tcf_chain0_head_change(chain, tp);
- RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info));
+ tcf_proto_get(tp);
+ RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain, chain_info));
rcu_assign_pointer(*chain_info->pprev, tp);
- tcf_chain_hold(chain);
+
+ return 0;
}
static void tcf_chain_tp_remove(struct tcf_chain *chain,
struct tcf_chain_info *chain_info,
struct tcf_proto *tp)
{
- struct tcf_proto *next = rtnl_dereference(chain_info->next);
+ struct tcf_proto *next = tcf_chain_dereference(chain_info->next, chain);
+ tcf_proto_mark_delete(tp);
if (tp == chain->filter_chain)
tcf_chain0_head_change(chain, next);
RCU_INIT_POINTER(*chain_info->pprev, next);
- tcf_chain_put(chain);
+}
+
+static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain,
+ struct tcf_chain_info *chain_info,
+ u32 protocol, u32 prio,
+ bool prio_allocate);
+
+/* Try to insert new proto.
+ * If proto with specified priority already exists, free new proto
+ * and return existing one.
+ *
+ * tp_new is also destroyed in two error cases: when
+ * tcf_proto_exists_destroying() returns true for it (ERR_PTR(-EAGAIN) is
+ * returned), and when tcf_chain_tp_insert() fails (its error code is
+ * returned as ERR_PTR()). chain->filter_chain_lock is taken and released
+ * inside this function; tcf_proto_destroy() is always called after the
+ * lock has been dropped.
+ */
+
+static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain,
+ struct tcf_proto *tp_new,
+ u32 protocol, u32 prio,
+ bool rtnl_held)
+{
+ struct tcf_chain_info chain_info;
+ struct tcf_proto *tp;
+ int err = 0;
+
+ mutex_lock(&chain->filter_chain_lock);
+
+ if (tcf_proto_exists_destroying(chain, tp_new)) {
+ mutex_unlock(&chain->filter_chain_lock);
+ tcf_proto_destroy(tp_new, rtnl_held, false, NULL);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ tp = tcf_chain_tp_find(chain, &chain_info,
+ protocol, prio, false);
+ if (!tp)
+ err = tcf_chain_tp_insert(chain, &chain_info, tp_new);
+ mutex_unlock(&chain->filter_chain_lock);
+
+ if (tp) {
+ tcf_proto_destroy(tp_new, rtnl_held, false, NULL);
+ tp_new = tp;
+ } else if (err) {
+ tcf_proto_destroy(tp_new, rtnl_held, false, NULL);
+ tp_new = ERR_PTR(err);
+ }
+
+ return tp_new;
+}
+
+static void tcf_chain_tp_delete_empty(struct tcf_chain *chain,
+ struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_chain_info chain_info;
+ struct tcf_proto *tp_iter;
+ struct tcf_proto **pprev;
+ struct tcf_proto *next;
+
+ mutex_lock(&chain->filter_chain_lock);
+
+ /* Atomically find and remove tp from chain. */
+ for (pprev = &chain->filter_chain;
+ (tp_iter = tcf_chain_dereference(*pprev, chain));
+ pprev = &tp_iter->next) {
+ if (tp_iter == tp) {
+ chain_info.pprev = pprev;
+ chain_info.next = tp_iter->next;
+ WARN_ON(tp_iter->deleting);
+ break;
+ }
+ }
+ /* Verify that tp still exists and no new filters were inserted
+ * concurrently.
+ * Mark tp for deletion if it is empty.
+ */
+ if (!tp_iter || !tcf_proto_check_delete(tp, rtnl_held)) {
+ mutex_unlock(&chain->filter_chain_lock);
+ return;
+ }
+
+ tcf_proto_signal_destroying(chain, tp);
+ next = tcf_chain_dereference(chain_info.next, chain);
+ if (tp == chain->filter_chain)
+ tcf_chain0_head_change(chain, next);
+ RCU_INIT_POINTER(*chain_info.pprev, next);
+ mutex_unlock(&chain->filter_chain_lock);
+
+ tcf_proto_put(tp, rtnl_held, extack);
}
static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain,
@@ -1051,7 +1762,8 @@
/* Check the chain for existence of proto-tcf with this priority */
for (pprev = &chain->filter_chain;
- (tp = rtnl_dereference(*pprev)); pprev = &tp->next) {
+ (tp = tcf_chain_dereference(*pprev, chain));
+ pprev = &tp->next) {
if (tp->prio >= prio) {
if (tp->prio == prio) {
if (prio_allocate ||
@@ -1064,14 +1776,20 @@
}
}
chain_info->pprev = pprev;
- chain_info->next = tp ? tp->next : NULL;
+ if (tp) {
+ chain_info->next = tp->next;
+ tcf_proto_get(tp);
+ } else {
+ chain_info->next = NULL;
+ }
return tp;
}
static int tcf_fill_node(struct net *net, struct sk_buff *skb,
struct tcf_proto *tp, struct tcf_block *block,
struct Qdisc *q, u32 parent, void *fh,
- u32 portid, u32 seq, u16 flags, int event)
+ u32 portid, u32 seq, u16 flags, int event,
+ bool rtnl_held)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
@@ -1099,7 +1817,8 @@
if (!fh) {
tcm->tcm_handle = 0;
} else {
- if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
+ if (tp->ops->dump &&
+ tp->ops->dump(net, tp, fh, skb, tcm, rtnl_held) < 0)
goto nla_put_failure;
}
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
@@ -1114,33 +1833,40 @@
static int tfilter_notify(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n, struct tcf_proto *tp,
struct tcf_block *block, struct Qdisc *q,
- u32 parent, void *fh, int event, bool unicast)
+ u32 parent, void *fh, int event, bool unicast,
+ bool rtnl_held)
{
struct sk_buff *skb;
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+ int err = 0;
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
- n->nlmsg_seq, n->nlmsg_flags, event) <= 0) {
+ n->nlmsg_seq, n->nlmsg_flags, event,
+ rtnl_held) <= 0) {
kfree_skb(skb);
return -EINVAL;
}
if (unicast)
- return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
+ err = netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
+ else
+ err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
- return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
- n->nlmsg_flags & NLM_F_ECHO);
+ if (err > 0)
+ err = 0;
+ return err;
}
static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n, struct tcf_proto *tp,
struct tcf_block *block, struct Qdisc *q,
u32 parent, void *fh, bool unicast, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct sk_buff *skb;
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
@@ -1151,39 +1877,50 @@
return -ENOBUFS;
if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
- n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER) <= 0) {
+ n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER,
+ rtnl_held) <= 0) {
NL_SET_ERR_MSG(extack, "Failed to build del event notification");
kfree_skb(skb);
return -EINVAL;
}
- err = tp->ops->delete(tp, fh, last, extack);
+ err = tp->ops->delete(tp, fh, last, rtnl_held, extack);
if (err) {
kfree_skb(skb);
return err;
}
if (unicast)
- return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
-
- err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
- n->nlmsg_flags & NLM_F_ECHO);
+ err = netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
+ else
+ err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
if (err < 0)
NL_SET_ERR_MSG(extack, "Failed to send filter delete notification");
+
+ if (err > 0)
+ err = 0;
return err;
}
static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
struct tcf_block *block, struct Qdisc *q,
u32 parent, struct nlmsghdr *n,
- struct tcf_chain *chain, int event)
+ struct tcf_chain *chain, int event,
+ bool rtnl_held)
{
struct tcf_proto *tp;
- for (tp = rtnl_dereference(chain->filter_chain);
- tp; tp = rtnl_dereference(tp->next))
+ for (tp = tcf_get_next_proto(chain, NULL, rtnl_held);
+ tp; tp = tcf_get_next_proto(chain, tp, rtnl_held))
tfilter_notify(net, oskb, n, tp, block,
- q, parent, NULL, event, false);
+ q, parent, NULL, event, false, rtnl_held);
+}
+
+static void tfilter_put(struct tcf_proto *tp, void *fh)
+{
+ if (tp->ops->put && fh)
+ tp->ops->put(tp, fh);
}
static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
@@ -1191,6 +1928,7 @@
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
+ char name[IFNAMSIZ];
struct tcmsg *t;
u32 protocol;
u32 prio;
@@ -1206,6 +1944,7 @@
void *fh;
int err;
int tp_created;
+ bool rtnl_held = false;
if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;
@@ -1213,7 +1952,8 @@
replay:
tp_created = 0;
- err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack);
+ err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX,
+ rtm_tca_policy, extack);
if (err < 0)
return err;
@@ -1222,7 +1962,9 @@
prio = TC_H_MAJ(t->tcm_info);
prio_allocate = false;
parent = t->tcm_parent;
+ tp = NULL;
cl = 0;
+ block = NULL;
if (prio == 0) {
/* If no priority is provided by the user,
@@ -1239,8 +1981,33 @@
/* Find head of filter chain. */
- block = tcf_block_find(net, &q, &parent, &cl,
- t->tcm_ifindex, t->tcm_block_index, extack);
+ err = __tcf_qdisc_find(net, &q, &parent, t->tcm_ifindex, false, extack);
+ if (err)
+ return err;
+
+ if (tcf_proto_check_kind(tca[TCA_KIND], name)) {
+ NL_SET_ERR_MSG(extack, "Specified TC filter name too long");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ /* Take rtnl mutex if rtnl_held was set to true on previous iteration,
+ * block is shared (no qdisc found), qdisc is not unlocked, classifier
+ * type is not specified, classifier is not unlocked.
+ */
+ if (rtnl_held ||
+ (q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) ||
+ !tcf_proto_is_unlocked(name)) {
+ rtnl_held = true;
+ rtnl_lock();
+ }
+
+ err = __tcf_qdisc_cl_find(q, parent, &cl, t->tcm_ifindex, extack);
+ if (err)
+ goto errout;
+
+ block = __tcf_block_find(net, q, cl, t->tcm_ifindex, t->tcm_block_index,
+ extack);
if (IS_ERR(block)) {
err = PTR_ERR(block);
goto errout;
@@ -1259,40 +2026,62 @@
goto errout;
}
+ mutex_lock(&chain->filter_chain_lock);
tp = tcf_chain_tp_find(chain, &chain_info, protocol,
prio, prio_allocate);
if (IS_ERR(tp)) {
NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
err = PTR_ERR(tp);
- goto errout;
+ goto errout_locked;
}
if (tp == NULL) {
+ struct tcf_proto *tp_new = NULL;
+
+ if (chain->flushing) {
+ err = -EAGAIN;
+ goto errout_locked;
+ }
+
/* Proto-tcf does not exist, create new one */
if (tca[TCA_KIND] == NULL || !protocol) {
NL_SET_ERR_MSG(extack, "Filter kind and protocol must be specified");
err = -EINVAL;
- goto errout;
+ goto errout_locked;
}
if (!(n->nlmsg_flags & NLM_F_CREATE)) {
NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter");
err = -ENOENT;
- goto errout;
+ goto errout_locked;
}
if (prio_allocate)
- prio = tcf_auto_prio(tcf_chain_tp_prev(&chain_info));
+ prio = tcf_auto_prio(tcf_chain_tp_prev(chain,
+ &chain_info));
- tp = tcf_proto_create(nla_data(tca[TCA_KIND]),
- protocol, prio, chain, extack);
+ mutex_unlock(&chain->filter_chain_lock);
+ tp_new = tcf_proto_create(nla_data(tca[TCA_KIND]),
+ protocol, prio, chain, rtnl_held,
+ extack);
+ if (IS_ERR(tp_new)) {
+ err = PTR_ERR(tp_new);
+ goto errout_tp;
+ }
+
+ tp_created = 1;
+ tp = tcf_chain_tp_insert_unique(chain, tp_new, protocol, prio,
+ rtnl_held);
if (IS_ERR(tp)) {
err = PTR_ERR(tp);
- goto errout;
+ goto errout_tp;
}
- tp_created = 1;
- } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
+ } else {
+ mutex_unlock(&chain->filter_chain_lock);
+ }
+
+ if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
err = -EINVAL;
goto errout;
@@ -1307,6 +2096,7 @@
goto errout;
}
} else if (n->nlmsg_flags & NLM_F_EXCL) {
+ tfilter_put(tp, fh);
NL_SET_ERR_MSG(extack, "Filter already exists");
err = -EEXIST;
goto errout;
@@ -1320,24 +2110,44 @@
err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE,
- extack);
+ rtnl_held, extack);
if (err == 0) {
- if (tp_created)
- tcf_chain_tp_insert(chain, &chain_info, tp);
tfilter_notify(net, skb, n, tp, block, q, parent, fh,
- RTM_NEWTFILTER, false);
- } else {
- if (tp_created)
- tcf_proto_destroy(tp, NULL);
+ RTM_NEWTFILTER, false, rtnl_held);
+ tfilter_put(tp, fh);
+ /* q pointer is NULL for shared blocks */
+ if (q)
+ q->flags &= ~TCQ_F_CAN_BYPASS;
}
errout:
- if (chain)
- tcf_chain_put(chain);
- if (err == -EAGAIN)
+ if (err && tp_created)
+ tcf_chain_tp_delete_empty(chain, tp, rtnl_held, NULL);
+errout_tp:
+ if (chain) {
+ if (tp && !IS_ERR(tp))
+ tcf_proto_put(tp, rtnl_held, NULL);
+ if (!tp_created)
+ tcf_chain_put(chain);
+ }
+ tcf_block_release(q, block, rtnl_held);
+
+ if (rtnl_held)
+ rtnl_unlock();
+
+ if (err == -EAGAIN) {
+ /* Take rtnl lock in case EAGAIN is caused by concurrent flush
+ * of target chain.
+ */
+ rtnl_held = true;
/* Replay the request. */
goto replay;
+ }
return err;
+
+errout_locked:
+ mutex_unlock(&chain->filter_chain_lock);
+ goto errout;
}
static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
@@ -1345,6 +2155,7 @@
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
+ char name[IFNAMSIZ];
struct tcmsg *t;
u32 protocol;
u32 prio;
@@ -1353,16 +2164,18 @@
struct Qdisc *q = NULL;
struct tcf_chain_info chain_info;
struct tcf_chain *chain = NULL;
- struct tcf_block *block;
+ struct tcf_block *block = NULL;
struct tcf_proto *tp = NULL;
unsigned long cl = 0;
void *fh = NULL;
int err;
+ bool rtnl_held = false;
if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack);
+ err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX,
+ rtm_tca_policy, extack);
if (err < 0)
return err;
@@ -1378,8 +2191,32 @@
/* Find head of filter chain. */
- block = tcf_block_find(net, &q, &parent, &cl,
- t->tcm_ifindex, t->tcm_block_index, extack);
+ err = __tcf_qdisc_find(net, &q, &parent, t->tcm_ifindex, false, extack);
+ if (err)
+ return err;
+
+ if (tcf_proto_check_kind(tca[TCA_KIND], name)) {
+ NL_SET_ERR_MSG(extack, "Specified TC filter name too long");
+ err = -EINVAL;
+ goto errout;
+ }
+ /* Take rtnl mutex if flushing whole chain, block is shared (no qdisc
+ * found), qdisc is not unlocked, classifier type is not specified,
+ * classifier is not unlocked.
+ */
+ if (!prio ||
+ (q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) ||
+ !tcf_proto_is_unlocked(name)) {
+ rtnl_held = true;
+ rtnl_lock();
+ }
+
+ err = __tcf_qdisc_cl_find(q, parent, &cl, t->tcm_ifindex, extack);
+ if (err)
+ goto errout;
+
+ block = __tcf_block_find(net, q, cl, t->tcm_ifindex, t->tcm_block_index,
+ extack);
if (IS_ERR(block)) {
err = PTR_ERR(block);
goto errout;
@@ -1407,55 +2244,70 @@
if (prio == 0) {
tfilter_notify_chain(net, skb, block, q, parent, n,
- chain, RTM_DELTFILTER);
- tcf_chain_flush(chain);
+ chain, RTM_DELTFILTER, rtnl_held);
+ tcf_chain_flush(chain, rtnl_held);
err = 0;
goto errout;
}
+ mutex_lock(&chain->filter_chain_lock);
tp = tcf_chain_tp_find(chain, &chain_info, protocol,
prio, false);
if (!tp || IS_ERR(tp)) {
NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
err = tp ? PTR_ERR(tp) : -ENOENT;
- goto errout;
+ goto errout_locked;
} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
err = -EINVAL;
+ goto errout_locked;
+ } else if (t->tcm_handle == 0) {
+ tcf_proto_signal_destroying(chain, tp);
+ tcf_chain_tp_remove(chain, &chain_info, tp);
+ mutex_unlock(&chain->filter_chain_lock);
+
+ tcf_proto_put(tp, rtnl_held, NULL);
+ tfilter_notify(net, skb, n, tp, block, q, parent, fh,
+ RTM_DELTFILTER, false, rtnl_held);
+ err = 0;
goto errout;
}
+ mutex_unlock(&chain->filter_chain_lock);
fh = tp->ops->get(tp, t->tcm_handle);
if (!fh) {
- if (t->tcm_handle == 0) {
- tcf_chain_tp_remove(chain, &chain_info, tp);
- tfilter_notify(net, skb, n, tp, block, q, parent, fh,
- RTM_DELTFILTER, false);
- tcf_proto_destroy(tp, extack);
- err = 0;
- } else {
- NL_SET_ERR_MSG(extack, "Specified filter handle not found");
- err = -ENOENT;
- }
+ NL_SET_ERR_MSG(extack, "Specified filter handle not found");
+ err = -ENOENT;
} else {
bool last;
err = tfilter_del_notify(net, skb, n, tp, block,
q, parent, fh, false, &last,
- extack);
+ rtnl_held, extack);
+
if (err)
goto errout;
- if (last) {
- tcf_chain_tp_remove(chain, &chain_info, tp);
- tcf_proto_destroy(tp, extack);
- }
+ if (last)
+ tcf_chain_tp_delete_empty(chain, tp, rtnl_held, extack);
}
errout:
- if (chain)
+ if (chain) {
+ if (tp && !IS_ERR(tp))
+ tcf_proto_put(tp, rtnl_held, NULL);
tcf_chain_put(chain);
+ }
+ tcf_block_release(q, block, rtnl_held);
+
+ if (rtnl_held)
+ rtnl_unlock();
+
return err;
+
+errout_locked:
+ mutex_unlock(&chain->filter_chain_lock);
+ goto errout;
}
static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
@@ -1463,6 +2315,7 @@
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
+ char name[IFNAMSIZ];
struct tcmsg *t;
u32 protocol;
u32 prio;
@@ -1471,13 +2324,15 @@
struct Qdisc *q = NULL;
struct tcf_chain_info chain_info;
struct tcf_chain *chain = NULL;
- struct tcf_block *block;
+ struct tcf_block *block = NULL;
struct tcf_proto *tp = NULL;
unsigned long cl = 0;
void *fh = NULL;
int err;
+ bool rtnl_held = false;
- err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack);
+ err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX,
+ rtm_tca_policy, extack);
if (err < 0)
return err;
@@ -1493,8 +2348,31 @@
/* Find head of filter chain. */
- block = tcf_block_find(net, &q, &parent, &cl,
- t->tcm_ifindex, t->tcm_block_index, extack);
+ err = __tcf_qdisc_find(net, &q, &parent, t->tcm_ifindex, false, extack);
+ if (err)
+ return err;
+
+ if (tcf_proto_check_kind(tca[TCA_KIND], name)) {
+ NL_SET_ERR_MSG(extack, "Specified TC filter name too long");
+ err = -EINVAL;
+ goto errout;
+ }
+ /* Take rtnl mutex if block is shared (no qdisc found), qdisc is not
+ * unlocked, classifier type is not specified, classifier is not
+ * unlocked.
+ */
+ if ((q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) ||
+ !tcf_proto_is_unlocked(name)) {
+ rtnl_held = true;
+ rtnl_lock();
+ }
+
+ err = __tcf_qdisc_cl_find(q, parent, &cl, t->tcm_ifindex, extack);
+ if (err)
+ goto errout;
+
+ block = __tcf_block_find(net, q, cl, t->tcm_ifindex, t->tcm_block_index,
+ extack);
if (IS_ERR(block)) {
err = PTR_ERR(block);
goto errout;
@@ -1513,8 +2391,10 @@
goto errout;
}
+ mutex_lock(&chain->filter_chain_lock);
tp = tcf_chain_tp_find(chain, &chain_info, protocol,
prio, false);
+ mutex_unlock(&chain->filter_chain_lock);
if (!tp || IS_ERR(tp)) {
NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
err = tp ? PTR_ERR(tp) : -ENOENT;
@@ -1532,14 +2412,23 @@
err = -ENOENT;
} else {
err = tfilter_notify(net, skb, n, tp, block, q, parent,
- fh, RTM_NEWTFILTER, true);
+ fh, RTM_NEWTFILTER, true, rtnl_held);
if (err < 0)
NL_SET_ERR_MSG(extack, "Failed to send filter notify message");
}
+ tfilter_put(tp, fh);
errout:
- if (chain)
+ if (chain) {
+ if (tp && !IS_ERR(tp))
+ tcf_proto_put(tp, rtnl_held, NULL);
tcf_chain_put(chain);
+ }
+ tcf_block_release(q, block, rtnl_held);
+
+ if (rtnl_held)
+ rtnl_unlock();
+
return err;
}
@@ -1560,7 +2449,7 @@
return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent,
n, NETLINK_CB(a->cb->skb).portid,
a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWTFILTER);
+ RTM_NEWTFILTER, true);
}
static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
@@ -1570,11 +2459,15 @@
struct net *net = sock_net(skb->sk);
struct tcf_block *block = chain->block;
struct tcmsg *tcm = nlmsg_data(cb->nlh);
+ struct tcf_proto *tp, *tp_prev;
struct tcf_dump_args arg;
- struct tcf_proto *tp;
- for (tp = rtnl_dereference(chain->filter_chain);
- tp; tp = rtnl_dereference(tp->next), (*p_index)++) {
+ for (tp = __tcf_get_next_proto(chain, NULL);
+ tp;
+ tp_prev = tp,
+ tp = __tcf_get_next_proto(chain, tp),
+ tcf_proto_put(tp_prev, true, NULL),
+ (*p_index)++) {
if (*p_index < index_start)
continue;
if (TC_H_MAJ(tcm->tcm_info) &&
@@ -1590,9 +2483,8 @@
if (tcf_fill_node(net, skb, tp, block, q, parent, NULL,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWTFILTER) <= 0)
- return false;
-
+ RTM_NEWTFILTER, true) <= 0)
+ goto errout;
cb->args[1] = 1;
}
if (!tp->ops->walk)
@@ -1607,23 +2499,27 @@
arg.w.skip = cb->args[1] - 1;
arg.w.count = 0;
arg.w.cookie = cb->args[2];
- tp->ops->walk(tp, &arg.w);
+ tp->ops->walk(tp, &arg.w, true);
cb->args[2] = arg.w.cookie;
cb->args[1] = arg.w.count + 1;
if (arg.w.stop)
- return false;
+ goto errout;
}
return true;
+
+errout:
+ tcf_proto_put(tp, true, NULL);
+ return false;
}
/* called with RTNL */
static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct tcf_chain *chain, *chain_prev;
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
struct Qdisc *q = NULL;
struct tcf_block *block;
- struct tcf_chain *chain;
struct tcmsg *tcm = nlmsg_data(cb->nlh);
long index_start;
long index;
@@ -1633,12 +2529,13 @@
if (nlmsg_len(cb->nlh) < sizeof(*tcm))
return skb->len;
- err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL);
+ err = nlmsg_parse_deprecated(cb->nlh, sizeof(*tcm), tca, TCA_MAX,
+ NULL, cb->extack);
if (err)
return err;
if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
- block = tcf_block_lookup(net, tcm->tcm_block_index);
+ block = tcf_block_refcnt_get(net, tcm->tcm_block_index);
if (!block)
goto out;
/* If we work with block index, q is NULL and parent value
@@ -1686,17 +2583,24 @@
index_start = cb->args[0];
index = 0;
- list_for_each_entry(chain, &block->chain_list, list) {
+ for (chain = __tcf_get_next_chain(block, NULL);
+ chain;
+ chain_prev = chain,
+ chain = __tcf_get_next_chain(block, chain),
+ tcf_chain_put(chain_prev)) {
if (tca[TCA_CHAIN] &&
nla_get_u32(tca[TCA_CHAIN]) != chain->index)
continue;
if (!tcf_chain_dump(chain, q, parent, skb, cb,
index_start, &index)) {
+ tcf_chain_put(chain);
err = -EMSGSIZE;
break;
}
}
+ if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK)
+ tcf_block_refcnt_put(block, true);
cb->args[0] = index;
out:
@@ -1706,8 +2610,10 @@
return skb->len;
}
-static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net,
- struct sk_buff *skb, struct tcf_block *block,
+static int tc_chain_fill_node(const struct tcf_proto_ops *tmplt_ops,
+ void *tmplt_priv, u32 chain_index,
+ struct net *net, struct sk_buff *skb,
+ struct tcf_block *block,
u32 portid, u32 seq, u16 flags, int event)
{
unsigned char *b = skb_tail_pointer(skb);
@@ -1716,8 +2622,8 @@
struct tcmsg *tcm;
void *priv;
- ops = chain->tmplt_ops;
- priv = chain->tmplt_priv;
+ ops = tmplt_ops;
+ priv = tmplt_priv;
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
if (!nlh)
@@ -1735,7 +2641,7 @@
tcm->tcm_block_index = block->index;
}
- if (nla_put_u32(skb, TCA_CHAIN, chain->index))
+ if (nla_put_u32(skb, TCA_CHAIN, chain_index))
goto nla_put_failure;
if (ops) {
@@ -1761,18 +2667,50 @@
struct tcf_block *block = chain->block;
struct net *net = block->net;
struct sk_buff *skb;
+ int err = 0;
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
- if (tc_chain_fill_node(chain, net, skb, block, portid,
+ if (tc_chain_fill_node(chain->tmplt_ops, chain->tmplt_priv,
+ chain->index, net, skb, block, portid,
seq, flags, event) <= 0) {
kfree_skb(skb);
return -EINVAL;
}
if (unicast)
+ err = netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
+ else
+ err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ flags & NLM_F_ECHO);
+
+ if (err > 0)
+ err = 0;
+ return err;
+}
+
+static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops,
+ void *tmplt_priv, u32 chain_index,
+ struct tcf_block *block, struct sk_buff *oskb,
+ u32 seq, u16 flags, bool unicast)
+{
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+ struct net *net = block->net;
+ struct sk_buff *skb;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tc_chain_fill_node(tmplt_ops, tmplt_priv, chain_index, net, skb,
+ block, portid, seq, flags, RTM_DELCHAIN) <= 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ if (unicast)
return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO);
@@ -1789,7 +2727,7 @@
if (!tca[TCA_KIND])
return 0;
- ops = tcf_proto_lookup_ops(nla_data(tca[TCA_KIND]), extack);
+ ops = tcf_proto_lookup_ops(nla_data(tca[TCA_KIND]), true, extack);
if (IS_ERR(ops))
return PTR_ERR(ops);
if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) {
@@ -1807,16 +2745,15 @@
return 0;
}
-static void tc_chain_tmplt_del(struct tcf_chain *chain)
+static void tc_chain_tmplt_del(const struct tcf_proto_ops *tmplt_ops,
+ void *tmplt_priv)
{
- const struct tcf_proto_ops *ops = chain->tmplt_ops;
-
/* If template ops are set, no work to do for us. */
- if (!ops)
+ if (!tmplt_ops)
return;
- ops->tmplt_destroy(chain->tmplt_priv);
- module_put(ops->owner);
+ tmplt_ops->tmplt_destroy(tmplt_priv);
+ module_put(tmplt_ops->owner);
}
/* Add/delete/get a chain */
@@ -1840,7 +2777,8 @@
return -EPERM;
replay:
- err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack);
+ err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX,
+ rtm_tca_policy, extack);
if (err < 0)
return err;
@@ -1856,8 +2794,11 @@
chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
if (chain_index > TC_ACT_EXT_VAL_MASK) {
NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
- return -EINVAL;
+ err = -EINVAL;
+ goto errout_block;
}
+
+ mutex_lock(&block->lock);
chain = tcf_chain_lookup(block, chain_index);
if (n->nlmsg_type == RTM_NEWCHAIN) {
if (chain) {
@@ -1868,51 +2809,62 @@
tcf_chain_hold(chain);
} else {
NL_SET_ERR_MSG(extack, "Filter chain already exists");
- return -EEXIST;
+ err = -EEXIST;
+ goto errout_block_locked;
}
} else {
if (!(n->nlmsg_flags & NLM_F_CREATE)) {
NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain");
- return -ENOENT;
+ err = -ENOENT;
+ goto errout_block_locked;
}
chain = tcf_chain_create(block, chain_index);
if (!chain) {
NL_SET_ERR_MSG(extack, "Failed to create filter chain");
- return -ENOMEM;
+ err = -ENOMEM;
+ goto errout_block_locked;
}
}
} else {
if (!chain || tcf_chain_held_by_acts_only(chain)) {
NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
- return -EINVAL;
+ err = -EINVAL;
+ goto errout_block_locked;
}
tcf_chain_hold(chain);
}
- switch (n->nlmsg_type) {
- case RTM_NEWCHAIN:
- err = tc_chain_tmplt_add(chain, net, tca, extack);
- if (err)
- goto errout;
- /* In case the chain was successfully added, take a reference
- * to the chain. This ensures that an empty chain
- * does not disappear at the end of this function.
+ if (n->nlmsg_type == RTM_NEWCHAIN) {
+ /* Modifying chain requires holding parent block lock. In case
+ * the chain was successfully added, take a reference to the
+ * chain. This ensures that an empty chain does not disappear at
+ * the end of this function.
*/
tcf_chain_hold(chain);
chain->explicitly_created = true;
+ }
+ mutex_unlock(&block->lock);
+
+ switch (n->nlmsg_type) {
+ case RTM_NEWCHAIN:
+ err = tc_chain_tmplt_add(chain, net, tca, extack);
+ if (err) {
+ tcf_chain_put_explicitly_created(chain);
+ goto errout;
+ }
+
tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL,
RTM_NEWCHAIN, false);
break;
case RTM_DELCHAIN:
tfilter_notify_chain(net, skb, block, q, parent, n,
- chain, RTM_DELTFILTER);
+ chain, RTM_DELTFILTER, true);
/* Flush the chain first as the user requested chain removal. */
- tcf_chain_flush(chain);
+ tcf_chain_flush(chain, true);
/* In case the chain was successfully deleted, put a reference
* to the chain previously taken during addition.
*/
tcf_chain_put_explicitly_created(chain);
- chain->explicitly_created = false;
break;
case RTM_GETCHAIN:
err = tc_chain_notify(chain, skb, n->nlmsg_seq,
@@ -1928,10 +2880,16 @@
errout:
tcf_chain_put(chain);
+errout_block:
+ tcf_block_release(q, block, true);
if (err == -EAGAIN)
/* Replay the request. */
goto replay;
return err;
+
+errout_block_locked:
+ mutex_unlock(&block->lock);
+ goto errout_block;
}
/* called with RTNL */
@@ -1941,8 +2899,8 @@
struct nlattr *tca[TCA_MAX + 1];
struct Qdisc *q = NULL;
struct tcf_block *block;
- struct tcf_chain *chain;
struct tcmsg *tcm = nlmsg_data(cb->nlh);
+ struct tcf_chain *chain;
long index_start;
long index;
u32 parent;
@@ -1951,13 +2909,13 @@
if (nlmsg_len(cb->nlh) < sizeof(*tcm))
return skb->len;
- err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
- NULL);
+ err = nlmsg_parse_deprecated(cb->nlh, sizeof(*tcm), tca, TCA_MAX,
+ rtm_tca_policy, cb->extack);
if (err)
return err;
if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
- block = tcf_block_lookup(net, tcm->tcm_block_index);
+ block = tcf_block_refcnt_get(net, tcm->tcm_block_index);
if (!block)
goto out;
/* If we work with block index, q is NULL and parent value
@@ -2005,6 +2963,7 @@
index_start = cb->args[0];
index = 0;
+ mutex_lock(&block->lock);
list_for_each_entry(chain, &block->chain_list, list) {
if ((tca[TCA_CHAIN] &&
nla_get_u32(tca[TCA_CHAIN]) != chain->index))
@@ -2015,7 +2974,8 @@
}
if (tcf_chain_held_by_acts_only(chain))
continue;
- err = tc_chain_fill_node(chain, net, skb, block,
+ err = tc_chain_fill_node(chain->tmplt_ops, chain->tmplt_priv,
+ chain->index, net, skb, block,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
RTM_NEWCHAIN);
@@ -2023,7 +2983,10 @@
break;
index++;
}
+ mutex_unlock(&block->lock);
+ if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK)
+ tcf_block_refcnt_put(block, true);
cb->args[0] = index;
out:
@@ -2036,8 +2999,10 @@
void tcf_exts_destroy(struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
- tcf_action_destroy(exts->actions, TCA_ACT_UNBIND);
- kfree(exts->actions);
+ if (exts->actions) {
+ tcf_action_destroy(exts->actions, TCA_ACT_UNBIND);
+ kfree(exts->actions);
+ }
exts->nr_actions = 0;
#endif
}
@@ -2045,7 +3010,7 @@
int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
#ifdef CONFIG_NET_CLS_ACT
{
@@ -2055,7 +3020,8 @@
if (exts->police && tb[exts->police]) {
act = tcf_action_init_1(net, tp, tb[exts->police],
rate_tlv, "police", ovr,
- TCA_ACT_BIND, true, extack);
+ TCA_ACT_BIND, rtnl_held,
+ extack);
if (IS_ERR(act))
return PTR_ERR(act);
@@ -2067,13 +3033,12 @@
err = tcf_action_init(net, tp, tb[exts->action],
rate_tlv, NULL, ovr, TCA_ACT_BIND,
- exts->actions, &attr_size, true,
- extack);
+ exts->actions, &attr_size,
+ rtnl_held, extack);
if (err < 0)
return err;
exts->nr_actions = err;
}
- exts->net = net;
}
#else
if ((exts->action && tb[exts->action]) ||
@@ -2120,7 +3085,7 @@
* tc data even if iproute2 was newer - jhs
*/
if (exts->type != TCA_OLD_COMPAT) {
- nest = nla_nest_start(skb, exts->action);
+ nest = nla_nest_start_noflag(skb, exts->action);
if (nest == NULL)
goto nla_put_failure;
@@ -2129,7 +3094,7 @@
nla_nest_end(skb, nest);
} else if (exts->police) {
struct tc_action *act = tcf_exts_first_act(exts);
- nest = nla_nest_start(skb, exts->police);
+ nest = nla_nest_start_noflag(skb, exts->police);
if (nest == NULL || !act)
goto nla_put_failure;
if (tcf_action_dump_old(skb, act, 0, 0) < 0)
@@ -2160,62 +3125,489 @@
}
EXPORT_SYMBOL(tcf_exts_dump_stats);
-static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts,
- enum tc_setup_type type,
- void *type_data, bool err_stop)
+static void tcf_block_offload_inc(struct tcf_block *block, u32 *flags)
{
- int ok_count = 0;
-#ifdef CONFIG_NET_CLS_ACT
- const struct tc_action *a;
- struct net_device *dev;
- int i, ret;
+ if (*flags & TCA_CLS_FLAGS_IN_HW)
+ return;
+ *flags |= TCA_CLS_FLAGS_IN_HW;
+ atomic_inc(&block->offloadcnt);
+}
- if (!tcf_exts_has_actions(exts))
- return 0;
+static void tcf_block_offload_dec(struct tcf_block *block, u32 *flags)
+{
+ if (!(*flags & TCA_CLS_FLAGS_IN_HW))
+ return;
+ *flags &= ~TCA_CLS_FLAGS_IN_HW;
+ atomic_dec(&block->offloadcnt);
+}
- for (i = 0; i < exts->nr_actions; i++) {
- a = exts->actions[i];
- if (!a->ops->get_dev)
- continue;
- dev = a->ops->get_dev(a);
- if (!dev)
- continue;
- ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop);
- a->ops->put_dev(dev);
- if (ret < 0)
- return ret;
- ok_count += ret;
+static void tc_cls_offload_cnt_update(struct tcf_block *block,
+ struct tcf_proto *tp, u32 *cnt,
+ u32 *flags, u32 diff, bool add)
+{
+ lockdep_assert_held(&block->cb_lock);
+
+ spin_lock(&tp->lock);
+ if (add) {
+ if (!*cnt)
+ tcf_block_offload_inc(block, flags);
+ *cnt += diff;
+ } else {
+ *cnt -= diff;
+ if (!*cnt)
+ tcf_block_offload_dec(block, flags);
}
-#endif
+ spin_unlock(&tp->lock);
+}
+
+static void
+tc_cls_offload_cnt_reset(struct tcf_block *block, struct tcf_proto *tp,
+ u32 *cnt, u32 *flags)
+{
+ lockdep_assert_held(&block->cb_lock);
+
+ spin_lock(&tp->lock);
+ tcf_block_offload_dec(block, flags);
+ *cnt = 0;
+ spin_unlock(&tp->lock);
+}
+
+static int
+__tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
+ void *type_data, bool err_stop)
+{
+ struct flow_block_cb *block_cb;
+ int ok_count = 0;
+ int err;
+
+ list_for_each_entry(block_cb, &block->flow_block.cb_list, list) {
+ err = block_cb->cb(type, type_data, block_cb->cb_priv);
+ if (err) {
+ if (err_stop)
+ return err;
+ } else {
+ ok_count++;
+ }
+ }
return ok_count;
}
-int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts,
- enum tc_setup_type type, void *type_data, bool err_stop)
+int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
+ void *type_data, bool err_stop, bool rtnl_held)
{
+ bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
int ok_count;
- int ret;
- ret = tcf_block_cb_call(block, type, type_data, err_stop);
- if (ret < 0)
- return ret;
- ok_count = ret;
+retry:
+ if (take_rtnl)
+ rtnl_lock();
+ down_read(&block->cb_lock);
+ /* Need to obtain rtnl lock if block is bound to devs that require it.
+ * In block bind code cb_lock is obtained while holding rtnl, so we must
+ * obtain the locks in same order here.
+ */
+ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+ up_read(&block->cb_lock);
+ take_rtnl = true;
+ goto retry;
+ }
- if (!exts || ok_count)
- return ok_count;
- ret = tc_exts_setup_cb_egdev_call(exts, type, type_data, err_stop);
- if (ret < 0)
- return ret;
- ok_count += ret;
+ ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+ up_read(&block->cb_lock);
+ if (take_rtnl)
+ rtnl_unlock();
return ok_count;
}
EXPORT_SYMBOL(tc_setup_cb_call);
+/* Non-destructive filter add. If filter that wasn't already in hardware is
+ * successfully offloaded, increment block offloads counter. On failure,
+ * previously offloaded filter is considered to be intact and offloads counter
+ * is not decremented.
+ */
+
+int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp,
+ enum tc_setup_type type, void *type_data, bool err_stop,
+ u32 *flags, unsigned int *in_hw_count, bool rtnl_held)
+{
+ bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
+ int ok_count;
+
+retry:
+ if (take_rtnl)
+ rtnl_lock();
+ down_read(&block->cb_lock);
+ /* Need to obtain rtnl lock if block is bound to devs that require it.
+ * In block bind code cb_lock is obtained while holding rtnl, so we must
+ * obtain the locks in same order here.
+ */
+ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+ up_read(&block->cb_lock);
+ take_rtnl = true;
+ goto retry;
+ }
+
+ /* Make sure all netdevs sharing this block are offload-capable. */
+ if (block->nooffloaddevcnt && err_stop) {
+ ok_count = -EOPNOTSUPP;
+ goto err_unlock;
+ }
+
+ ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+ if (ok_count < 0)
+ goto err_unlock;
+
+ if (tp->ops->hw_add)
+ tp->ops->hw_add(tp, type_data);
+ if (ok_count > 0)
+ tc_cls_offload_cnt_update(block, tp, in_hw_count, flags,
+ ok_count, true);
+err_unlock:
+ up_read(&block->cb_lock);
+ if (take_rtnl)
+ rtnl_unlock();
+ return ok_count < 0 ? ok_count : 0;
+}
+EXPORT_SYMBOL(tc_setup_cb_add);
+
+/* Destructive filter replace. If filter that wasn't already in hardware is
+ * successfully offloaded, increment block offload counter. On failure,
+ * previously offloaded filter is considered to be destroyed and offload counter
+ * is decremented.
+ */
+
+int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp,
+ enum tc_setup_type type, void *type_data, bool err_stop,
+ u32 *old_flags, unsigned int *old_in_hw_count,
+ u32 *new_flags, unsigned int *new_in_hw_count,
+ bool rtnl_held)
+{
+ bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
+ int ok_count;
+
+retry:
+ if (take_rtnl)
+ rtnl_lock();
+ down_read(&block->cb_lock);
+ /* Need to obtain rtnl lock if block is bound to devs that require it.
+ * In block bind code cb_lock is obtained while holding rtnl, so we must
+ * obtain the locks in same order here.
+ */
+ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+ up_read(&block->cb_lock);
+ take_rtnl = true;
+ goto retry;
+ }
+
+ /* Make sure all netdevs sharing this block are offload-capable. */
+ if (block->nooffloaddevcnt && err_stop) {
+ ok_count = -EOPNOTSUPP;
+ goto err_unlock;
+ }
+
+ tc_cls_offload_cnt_reset(block, tp, old_in_hw_count, old_flags);
+ if (tp->ops->hw_del)
+ tp->ops->hw_del(tp, type_data);
+
+ ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+ if (ok_count < 0)
+ goto err_unlock;
+
+ if (tp->ops->hw_add)
+ tp->ops->hw_add(tp, type_data);
+ if (ok_count > 0)
+ tc_cls_offload_cnt_update(block, tp, new_in_hw_count,
+ new_flags, ok_count, true);
+err_unlock:
+ up_read(&block->cb_lock);
+ if (take_rtnl)
+ rtnl_unlock();
+ return ok_count < 0 ? ok_count : 0;
+}
+EXPORT_SYMBOL(tc_setup_cb_replace);
+
+/* Destroy filter and decrement block offload counter, if filter was previously
+ * offloaded.
+ */
+
+int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp,
+ enum tc_setup_type type, void *type_data, bool err_stop,
+ u32 *flags, unsigned int *in_hw_count, bool rtnl_held)
+{
+ bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
+ int ok_count;
+
+retry:
+ if (take_rtnl)
+ rtnl_lock();
+ down_read(&block->cb_lock);
+ /* Need to obtain rtnl lock if block is bound to devs that require it.
+ * In block bind code cb_lock is obtained while holding rtnl, so we must
+ * obtain the locks in same order here.
+ */
+ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+ up_read(&block->cb_lock);
+ take_rtnl = true;
+ goto retry;
+ }
+
+ ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+
+ tc_cls_offload_cnt_reset(block, tp, in_hw_count, flags);
+ if (tp->ops->hw_del)
+ tp->ops->hw_del(tp, type_data);
+
+ up_read(&block->cb_lock);
+ if (take_rtnl)
+ rtnl_unlock();
+ return ok_count < 0 ? ok_count : 0;
+}
+EXPORT_SYMBOL(tc_setup_cb_destroy);
+
+int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp,
+ bool add, flow_setup_cb_t *cb,
+ enum tc_setup_type type, void *type_data,
+ void *cb_priv, u32 *flags, unsigned int *in_hw_count)
+{
+ int err = cb(type, type_data, cb_priv);
+
+ if (err) {
+ if (add && tc_skip_sw(*flags))
+ return err;
+ } else {
+ tc_cls_offload_cnt_update(block, tp, in_hw_count, flags, 1,
+ add);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(tc_setup_cb_reoffload);
+
+void tc_cleanup_flow_action(struct flow_action *flow_action)
+{
+ struct flow_action_entry *entry;
+ int i;
+
+ flow_action_for_each(i, entry, flow_action)
+ if (entry->destructor)
+ entry->destructor(entry->destructor_priv);
+}
+EXPORT_SYMBOL(tc_cleanup_flow_action);
+
+static void tcf_mirred_get_dev(struct flow_action_entry *entry,
+ const struct tc_action *act)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ entry->dev = act->ops->get_dev(act, &entry->destructor);
+ if (!entry->dev)
+ return;
+ entry->destructor_priv = entry->dev;
+#endif
+}
+
+static void tcf_tunnel_encap_put_tunnel(void *priv)
+{
+ struct ip_tunnel_info *tunnel = priv;
+
+ kfree(tunnel);
+}
+
+static int tcf_tunnel_encap_get_tunnel(struct flow_action_entry *entry,
+ const struct tc_action *act)
+{
+ entry->tunnel = tcf_tunnel_info_copy(act);
+ if (!entry->tunnel)
+ return -ENOMEM;
+ entry->destructor = tcf_tunnel_encap_put_tunnel;
+ entry->destructor_priv = entry->tunnel;
+ return 0;
+}
+
+static void tcf_sample_get_group(struct flow_action_entry *entry,
+ const struct tc_action *act)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ entry->sample.psample_group =
+ act->ops->get_psample_group(act, &entry->destructor);
+ entry->destructor_priv = entry->sample.psample_group;
+#endif
+}
+
+int tc_setup_flow_action(struct flow_action *flow_action,
+ const struct tcf_exts *exts, bool rtnl_held)
+{
+ const struct tc_action *act;
+ int i, j, k, err = 0;
+
+ if (!exts)
+ return 0;
+
+ if (!rtnl_held)
+ rtnl_lock();
+
+ j = 0;
+ tcf_exts_for_each_action(i, act, exts) {
+ struct flow_action_entry *entry;
+
+ entry = &flow_action->entries[j];
+ if (is_tcf_gact_ok(act)) {
+ entry->id = FLOW_ACTION_ACCEPT;
+ } else if (is_tcf_gact_shot(act)) {
+ entry->id = FLOW_ACTION_DROP;
+ } else if (is_tcf_gact_trap(act)) {
+ entry->id = FLOW_ACTION_TRAP;
+ } else if (is_tcf_gact_goto_chain(act)) {
+ entry->id = FLOW_ACTION_GOTO;
+ entry->chain_index = tcf_gact_goto_chain_index(act);
+ } else if (is_tcf_mirred_egress_redirect(act)) {
+ entry->id = FLOW_ACTION_REDIRECT;
+ tcf_mirred_get_dev(entry, act);
+ } else if (is_tcf_mirred_egress_mirror(act)) {
+ entry->id = FLOW_ACTION_MIRRED;
+ tcf_mirred_get_dev(entry, act);
+ } else if (is_tcf_mirred_ingress_redirect(act)) {
+ entry->id = FLOW_ACTION_REDIRECT_INGRESS;
+ tcf_mirred_get_dev(entry, act);
+ } else if (is_tcf_mirred_ingress_mirror(act)) {
+ entry->id = FLOW_ACTION_MIRRED_INGRESS;
+ tcf_mirred_get_dev(entry, act);
+ } else if (is_tcf_vlan(act)) {
+ switch (tcf_vlan_action(act)) {
+ case TCA_VLAN_ACT_PUSH:
+ entry->id = FLOW_ACTION_VLAN_PUSH;
+ entry->vlan.vid = tcf_vlan_push_vid(act);
+ entry->vlan.proto = tcf_vlan_push_proto(act);
+ entry->vlan.prio = tcf_vlan_push_prio(act);
+ break;
+ case TCA_VLAN_ACT_POP:
+ entry->id = FLOW_ACTION_VLAN_POP;
+ break;
+ case TCA_VLAN_ACT_MODIFY:
+ entry->id = FLOW_ACTION_VLAN_MANGLE;
+ entry->vlan.vid = tcf_vlan_push_vid(act);
+ entry->vlan.proto = tcf_vlan_push_proto(act);
+ entry->vlan.prio = tcf_vlan_push_prio(act);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ goto err_out;
+ }
+ } else if (is_tcf_tunnel_set(act)) {
+ entry->id = FLOW_ACTION_TUNNEL_ENCAP;
+ err = tcf_tunnel_encap_get_tunnel(entry, act);
+ if (err)
+ goto err_out;
+ } else if (is_tcf_tunnel_release(act)) {
+ entry->id = FLOW_ACTION_TUNNEL_DECAP;
+ } else if (is_tcf_pedit(act)) {
+ for (k = 0; k < tcf_pedit_nkeys(act); k++) {
+ switch (tcf_pedit_cmd(act, k)) {
+ case TCA_PEDIT_KEY_EX_CMD_SET:
+ entry->id = FLOW_ACTION_MANGLE;
+ break;
+ case TCA_PEDIT_KEY_EX_CMD_ADD:
+ entry->id = FLOW_ACTION_ADD;
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ goto err_out;
+ }
+ entry->mangle.htype = tcf_pedit_htype(act, k);
+ entry->mangle.mask = tcf_pedit_mask(act, k);
+ entry->mangle.val = tcf_pedit_val(act, k);
+ entry->mangle.offset = tcf_pedit_offset(act, k);
+ entry = &flow_action->entries[++j];
+ }
+ } else if (is_tcf_csum(act)) {
+ entry->id = FLOW_ACTION_CSUM;
+ entry->csum_flags = tcf_csum_update_flags(act);
+ } else if (is_tcf_skbedit_mark(act)) {
+ entry->id = FLOW_ACTION_MARK;
+ entry->mark = tcf_skbedit_mark(act);
+ } else if (is_tcf_sample(act)) {
+ entry->id = FLOW_ACTION_SAMPLE;
+ entry->sample.trunc_size = tcf_sample_trunc_size(act);
+ entry->sample.truncate = tcf_sample_truncate(act);
+ entry->sample.rate = tcf_sample_rate(act);
+ tcf_sample_get_group(entry, act);
+ } else if (is_tcf_police(act)) {
+ entry->id = FLOW_ACTION_POLICE;
+ entry->police.burst = tcf_police_tcfp_burst(act);
+ entry->police.rate_bytes_ps =
+ tcf_police_rate_bytes_ps(act);
+ } else if (is_tcf_ct(act)) {
+ entry->id = FLOW_ACTION_CT;
+ entry->ct.action = tcf_ct_action(act);
+ entry->ct.zone = tcf_ct_zone(act);
+ } else if (is_tcf_mpls(act)) {
+ switch (tcf_mpls_action(act)) {
+ case TCA_MPLS_ACT_PUSH:
+ entry->id = FLOW_ACTION_MPLS_PUSH;
+ entry->mpls_push.proto = tcf_mpls_proto(act);
+ entry->mpls_push.label = tcf_mpls_label(act);
+ entry->mpls_push.tc = tcf_mpls_tc(act);
+ entry->mpls_push.bos = tcf_mpls_bos(act);
+ entry->mpls_push.ttl = tcf_mpls_ttl(act);
+ break;
+ case TCA_MPLS_ACT_POP:
+ entry->id = FLOW_ACTION_MPLS_POP;
+ entry->mpls_pop.proto = tcf_mpls_proto(act);
+ break;
+ case TCA_MPLS_ACT_MODIFY:
+ entry->id = FLOW_ACTION_MPLS_MANGLE;
+ entry->mpls_mangle.label = tcf_mpls_label(act);
+ entry->mpls_mangle.tc = tcf_mpls_tc(act);
+ entry->mpls_mangle.bos = tcf_mpls_bos(act);
+ entry->mpls_mangle.ttl = tcf_mpls_ttl(act);
+ break;
+ default:
+ goto err_out;
+ }
+ } else if (is_tcf_skbedit_ptype(act)) {
+ entry->id = FLOW_ACTION_PTYPE;
+ entry->ptype = tcf_skbedit_ptype(act);
+ } else {
+ err = -EOPNOTSUPP;
+ goto err_out;
+ }
+
+ if (!is_tcf_pedit(act))
+ j++;
+ }
+
+err_out:
+ if (!rtnl_held)
+ rtnl_unlock();
+
+ if (err)
+ tc_cleanup_flow_action(flow_action);
+
+ return err;
+}
+EXPORT_SYMBOL(tc_setup_flow_action);
+
+unsigned int tcf_exts_num_actions(struct tcf_exts *exts)
+{
+ unsigned int num_acts = 0;
+ struct tc_action *act;
+ int i;
+
+ tcf_exts_for_each_action(i, act, exts) {
+ if (is_tcf_pedit(act))
+ num_acts += tcf_pedit_nkeys(act);
+ else
+ num_acts++;
+ }
+ return num_acts;
+}
+EXPORT_SYMBOL(tcf_exts_num_actions);
+
static __net_init int tcf_net_init(struct net *net)
{
struct tcf_net *tn = net_generic(net, tcf_net_id);
+ spin_lock_init(&tn->idr_lock);
idr_init(&tn->idr);
return 0;
}
@@ -2234,6 +3626,11 @@
.size = sizeof(struct tcf_net),
};
+static struct flow_indr_block_ing_entry block_ing_entry = {
+ .cb = tc_indr_block_get_and_ing_cmd,
+ .list = LIST_HEAD_INIT(block_ing_entry.list),
+};
+
static int __init tc_filter_init(void)
{
int err;
@@ -2246,10 +3643,14 @@
if (err)
goto err_register_pernet_subsys;
- rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0);
+ flow_indr_add_block_ing_cb(&block_ing_entry);
+
+ rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL,
+ RTNL_FLAG_DOIT_UNLOCKED);
+ rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL,
+ RTNL_FLAG_DOIT_UNLOCKED);
rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter,
- tc_dump_tfilter, 0);
+ tc_dump_tfilter, RTNL_FLAG_DOIT_UNLOCKED);
rtnl_register(PF_UNSPEC, RTM_NEWCHAIN, tc_ctl_chain, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_DELCHAIN, tc_ctl_chain, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_GETCHAIN, tc_ctl_chain,
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 6a5dce8..4aafbe3 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_basic.c Basic Packet Classifier.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*/
@@ -18,6 +14,7 @@
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <linux/idr.h>
+#include <linux/percpu.h>
#include <net/netlink.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
@@ -35,6 +32,7 @@
struct tcf_result res;
struct tcf_proto *tp;
struct list_head link;
+ struct tc_basic_pcnt __percpu *pf;
struct rcu_work rwork;
};
@@ -46,8 +44,10 @@
struct basic_filter *f;
list_for_each_entry_rcu(f, &head->flist, link) {
+ __this_cpu_inc(f->pf->rcnt);
if (!tcf_em_tree_match(skb, &f->ematches, NULL))
continue;
+ __this_cpu_inc(f->pf->rhit);
*res = f->res;
r = tcf_exts_exec(skb, &f->exts, res);
if (r < 0)
@@ -89,6 +89,7 @@
tcf_exts_destroy(&f->exts);
tcf_em_tree_destroy(&f->ematches);
tcf_exts_put_net(&f->exts);
+ free_percpu(f->pf);
kfree(f);
}
@@ -102,7 +103,8 @@
rtnl_unlock();
}
-static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+static void basic_destroy(struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct basic_head *head = rtnl_dereference(tp->root);
struct basic_filter *f, *n;
@@ -121,7 +123,7 @@
}
static int basic_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct basic_head *head = rtnl_dereference(tp->root);
struct basic_filter *f = arg;
@@ -148,7 +150,7 @@
{
int err;
- err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack);
+ err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, true, extack);
if (err < 0)
return err;
@@ -168,7 +170,7 @@
static int basic_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base, u32 handle,
struct nlattr **tca, void **arg, bool ovr,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
int err;
struct basic_head *head = rtnl_dereference(tp->root);
@@ -179,8 +181,8 @@
if (tca[TCA_OPTIONS] == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS],
- basic_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS],
+ basic_policy, NULL);
if (err < 0)
return err;
@@ -193,7 +195,7 @@
if (!fnew)
return -ENOBUFS;
- err = tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
+ err = tcf_exts_init(&fnew->exts, net, TCA_BASIC_ACT, TCA_BASIC_POLICE);
if (err < 0)
goto errout;
@@ -208,6 +210,11 @@
if (err)
goto errout;
fnew->handle = handle;
+ fnew->pf = alloc_percpu(struct tc_basic_pcnt);
+ if (!fnew->pf) {
+ err = -ENOMEM;
+ goto errout;
+ }
err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr,
extack);
@@ -231,12 +238,14 @@
return 0;
errout:
+ free_percpu(fnew->pf);
tcf_exts_destroy(&fnew->exts);
kfree(fnew);
return err;
}
-static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
struct basic_head *head = rtnl_dereference(tp->root);
struct basic_filter *f;
@@ -263,17 +272,19 @@
}
static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
{
+ struct tc_basic_pcnt gpf = {};
struct basic_filter *f = fh;
struct nlattr *nest;
+ int cpu;
if (f == NULL)
return skb->len;
t->tcm_handle = f->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -281,6 +292,18 @@
nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid))
goto nla_put_failure;
+ for_each_possible_cpu(cpu) {
+ struct tc_basic_pcnt *pf = per_cpu_ptr(f->pf, cpu);
+
+ gpf.rcnt += pf->rcnt;
+ gpf.rhit += pf->rhit;
+ }
+
+ if (nla_put_64bit(skb, TCA_BASIC_PCNT,
+ sizeof(struct tc_basic_pcnt),
+ &gpf, TCA_BASIC_PAD))
+ goto nla_put_failure;
+
if (tcf_exts_dump(skb, &f->exts) < 0 ||
tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0)
goto nla_put_failure;
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index fa6fe2f..8229ed4 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Berkeley Packet Filter based traffic classifier
*
@@ -6,10 +7,6 @@
* ematches.
*
* (C) 2013 Daniel Borkmann <dborkman@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -157,8 +154,7 @@
skip_sw = prog && tc_skip_sw(prog->gen_flags);
obj = prog ?: oldprog;
- tc_cls_common_offload_init(&cls_bpf.common, tp, obj->gen_flags,
- extack);
+ tc_cls_common_offload_init(&cls_bpf.common, tp, obj->gen_flags, extack);
cls_bpf.command = TC_CLSBPF_OFFLOAD;
cls_bpf.exts = &obj->exts;
cls_bpf.prog = prog ? prog->filter : NULL;
@@ -166,18 +162,24 @@
cls_bpf.name = obj->bpf_name;
cls_bpf.exts_integrated = obj->exts_integrated;
- if (oldprog)
- tcf_block_offload_dec(block, &oldprog->gen_flags);
+ if (oldprog && prog)
+ err = tc_setup_cb_replace(block, tp, TC_SETUP_CLSBPF, &cls_bpf,
+ skip_sw, &oldprog->gen_flags,
+ &oldprog->in_hw_count,
+ &prog->gen_flags, &prog->in_hw_count,
+ true);
+ else if (prog)
+ err = tc_setup_cb_add(block, tp, TC_SETUP_CLSBPF, &cls_bpf,
+ skip_sw, &prog->gen_flags,
+ &prog->in_hw_count, true);
+ else
+ err = tc_setup_cb_destroy(block, tp, TC_SETUP_CLSBPF, &cls_bpf,
+ skip_sw, &oldprog->gen_flags,
+ &oldprog->in_hw_count, true);
- err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, skip_sw);
- if (prog) {
- if (err < 0) {
- cls_bpf_offload_cmd(tp, oldprog, prog, extack);
- return err;
- } else if (err > 0) {
- prog->in_hw_count = err;
- tcf_block_offload_inc(block, &prog->gen_flags);
- }
+ if (prog && err) {
+ cls_bpf_offload_cmd(tp, oldprog, prog, extack);
+ return err;
}
if (prog && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW))
@@ -234,7 +236,7 @@
cls_bpf.name = prog->bpf_name;
cls_bpf.exts_integrated = prog->exts_integrated;
- tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, false, true);
}
static int cls_bpf_init(struct tcf_proto *tp)
@@ -298,7 +300,7 @@
}
static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct cls_bpf_head *head = rtnl_dereference(tp->root);
@@ -307,7 +309,7 @@
return 0;
}
-static void cls_bpf_destroy(struct tcf_proto *tp,
+static void cls_bpf_destroy(struct tcf_proto *tp, bool rtnl_held,
struct netlink_ext_ack *extack)
{
struct cls_bpf_head *head = rtnl_dereference(tp->root);
@@ -417,7 +419,8 @@
if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf))
return -EINVAL;
- ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr, extack);
+ ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr, true,
+ extack);
if (ret < 0)
return ret;
@@ -455,7 +458,8 @@
static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
- void **arg, bool ovr, struct netlink_ext_ack *extack)
+ void **arg, bool ovr, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct cls_bpf_head *head = rtnl_dereference(tp->root);
struct cls_bpf_prog *oldprog = *arg;
@@ -466,8 +470,8 @@
if (tca[TCA_OPTIONS] == NULL)
return -EINVAL;
- ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy,
- NULL);
+ ret = nla_parse_nested_deprecated(tb, TCA_BPF_MAX, tca[TCA_OPTIONS],
+ bpf_policy, NULL);
if (ret < 0)
return ret;
@@ -475,7 +479,7 @@
if (!prog)
return -ENOBUFS;
- ret = tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE);
+ ret = tcf_exts_init(&prog->exts, net, TCA_BPF_ACT, TCA_BPF_POLICE);
if (ret < 0)
goto errout;
@@ -575,7 +579,7 @@
}
static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *tm)
+ struct sk_buff *skb, struct tcmsg *tm, bool rtnl_held)
{
struct cls_bpf_prog *prog = fh;
struct nlattr *nest;
@@ -589,7 +593,7 @@
cls_bpf_offload_update_stats(tp, prog);
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -635,7 +639,8 @@
prog->res.class = cl;
}
-static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
struct cls_bpf_head *head = rtnl_dereference(tp->root);
struct cls_bpf_prog *prog;
@@ -652,7 +657,7 @@
}
}
-static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
void *cb_priv, struct netlink_ext_ack *extack)
{
struct cls_bpf_head *head = rtnl_dereference(tp->root);
@@ -674,15 +679,11 @@
cls_bpf.name = prog->bpf_name;
cls_bpf.exts_integrated = prog->exts_integrated;
- err = cb(TC_SETUP_CLSBPF, &cls_bpf, cb_priv);
- if (err) {
- if (add && tc_skip_sw(prog->gen_flags))
- return err;
- continue;
- }
-
- tc_cls_offload_cnt_update(block, &prog->in_hw_count,
- &prog->gen_flags, add);
+ err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSBPF,
+ &cls_bpf, cb_priv, &prog->gen_flags,
+ &prog->in_hw_count);
+ if (err)
+ return err;
}
return 0;
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 3bc01bd..fb88114 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_cgroup.c Control Group Classifier
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*/
@@ -32,6 +28,8 @@
struct cls_cgroup_head *head = rcu_dereference_bh(tp->root);
u32 classid = task_get_classid(skb);
+ if (unlikely(!head))
+ return -1;
if (!classid)
return -1;
if (!tcf_em_tree_match(skb, &head->ematches, NULL))
@@ -78,7 +76,7 @@
static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
- void **arg, bool ovr,
+ void **arg, bool ovr, bool rtnl_held,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[TCA_CGROUP_MAX + 1];
@@ -99,18 +97,19 @@
if (!new)
return -ENOBUFS;
- err = tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
+ err = tcf_exts_init(&new->exts, net, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
if (err < 0)
goto errout;
new->handle = handle;
new->tp = tp;
- err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS],
- cgroup_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_CGROUP_MAX,
+ tca[TCA_OPTIONS], cgroup_policy,
+ NULL);
if (err < 0)
goto errout;
err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, ovr,
- extack);
+ true, extack);
if (err < 0)
goto errout;
@@ -130,7 +129,7 @@
return err;
}
-static void cls_cgroup_destroy(struct tcf_proto *tp,
+static void cls_cgroup_destroy(struct tcf_proto *tp, bool rtnl_held,
struct netlink_ext_ack *extack)
{
struct cls_cgroup_head *head = rtnl_dereference(tp->root);
@@ -145,18 +144,21 @@
}
static int cls_cgroup_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
return -EOPNOTSUPP;
}
-static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
struct cls_cgroup_head *head = rtnl_dereference(tp->root);
if (arg->count < arg->skip)
goto skip;
+ if (!head)
+ return;
if (arg->fn(tp, head, arg) < 0) {
arg->stop = 1;
return;
@@ -166,14 +168,14 @@
}
static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
{
struct cls_cgroup_head *head = rtnl_dereference(tp->root);
struct nlattr *nest;
t->tcm_handle = head->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 2bb043c..80ae7b9 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_flow.c Generic flow classifier
*
* Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -391,7 +387,8 @@
static int flow_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
- void **arg, bool ovr, struct netlink_ext_ack *extack)
+ void **arg, bool ovr, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct flow_head *head = rtnl_dereference(tp->root);
struct flow_filter *fold, *fnew;
@@ -407,7 +404,8 @@
if (opt == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_FLOW_MAX, opt, flow_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_FLOW_MAX, opt, flow_policy,
+ NULL);
if (err < 0)
return err;
@@ -440,12 +438,12 @@
if (err < 0)
goto err1;
- err = tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);
+ err = tcf_exts_init(&fnew->exts, net, TCA_FLOW_ACT, TCA_FLOW_POLICE);
if (err < 0)
goto err2;
err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, ovr,
- extack);
+ true, extack);
if (err < 0)
goto err2;
@@ -566,7 +564,7 @@
}
static int flow_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct flow_head *head = rtnl_dereference(tp->root);
struct flow_filter *f = arg;
@@ -590,7 +588,8 @@
return 0;
}
-static void flow_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+static void flow_destroy(struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct flow_head *head = rtnl_dereference(tp->root);
struct flow_filter *f, *next;
@@ -617,7 +616,7 @@
}
static int flow_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
{
struct flow_filter *f = fh;
struct nlattr *nest;
@@ -627,7 +626,7 @@
t->tcm_handle = f->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -677,7 +676,8 @@
return -1;
}
-static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
struct flow_head *head = rtnl_dereference(tp->root);
struct flow_filter *f;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 7fade71..74221e3 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_flower.c Flower classifier
*
* Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -14,6 +10,7 @@
#include <linux/module.h>
#include <linux/rhashtable.h>
#include <linux/workqueue.h>
+#include <linux/refcount.h>
#include <linux/if_ether.h>
#include <linux/in6.h>
@@ -29,8 +26,10 @@
#include <net/dst.h>
#include <net/dst_metadata.h>
+#include <uapi/linux/netfilter/nf_conntrack_common.h>
+
struct fl_flow_key {
- int indev_ifindex;
+ struct flow_dissector_key_meta meta;
struct flow_dissector_key_control control;
struct flow_dissector_key_control enc_control;
struct flow_dissector_key_basic basic;
@@ -55,6 +54,9 @@
struct flow_dissector_key_ip ip;
struct flow_dissector_key_ip enc_ip;
struct flow_dissector_key_enc_opts enc_opts;
+ struct flow_dissector_key_ports tp_min;
+ struct flow_dissector_key_ports tp_max;
+ struct flow_dissector_key_ct ct;
} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
struct fl_flow_mask_range {
@@ -65,6 +67,7 @@
struct fl_flow_mask {
struct fl_flow_key key;
struct fl_flow_mask_range range;
+ u32 flags;
struct rhash_head ht_node;
struct rhashtable ht;
struct rhashtable_params filter_ht_params;
@@ -72,6 +75,7 @@
struct list_head filters;
struct rcu_work rwork;
struct list_head list;
+ refcount_t refcnt;
};
struct fl_flow_tmplt {
@@ -83,7 +87,9 @@
struct cls_fl_head {
struct rhashtable ht;
+ spinlock_t masks_lock; /* Protect masks list */
struct list_head masks;
+ struct list_head hw_filters;
struct rcu_work rwork;
struct idr handle_idr;
};
@@ -96,11 +102,18 @@
struct tcf_result res;
struct fl_flow_key key;
struct list_head list;
+ struct list_head hw_list;
u32 handle;
u32 flags;
- unsigned int in_hw_count;
+ u32 in_hw_count;
struct rcu_work rwork;
struct net_device *hw_dev;
+ /* Flower classifier is unlocked, which means that its reference counter
+ * can be changed concurrently without any kind of external
+ * synchronization. Use atomic reference counter to be concurrency-safe.
+ */
+ refcount_t refcnt;
+ bool deleted;
};
static const struct rhashtable_params mask_ht_params = {
@@ -179,36 +192,128 @@
memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
}
-static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
- struct fl_flow_key *mkey)
+static bool fl_range_port_dst_cmp(struct cls_fl_filter *filter,
+ struct fl_flow_key *key,
+ struct fl_flow_key *mkey)
+{
+ __be16 min_mask, max_mask, min_val, max_val;
+
+ min_mask = htons(filter->mask->key.tp_min.dst);
+ max_mask = htons(filter->mask->key.tp_max.dst);
+ min_val = htons(filter->key.tp_min.dst);
+ max_val = htons(filter->key.tp_max.dst);
+
+ if (min_mask && max_mask) {
+ if (htons(key->tp.dst) < min_val ||
+ htons(key->tp.dst) > max_val)
+ return false;
+
+ /* skb does not have min and max values */
+ mkey->tp_min.dst = filter->mkey.tp_min.dst;
+ mkey->tp_max.dst = filter->mkey.tp_max.dst;
+ }
+ return true;
+}
+
+static bool fl_range_port_src_cmp(struct cls_fl_filter *filter,
+ struct fl_flow_key *key,
+ struct fl_flow_key *mkey)
+{
+ __be16 min_mask, max_mask, min_val, max_val;
+
+ min_mask = htons(filter->mask->key.tp_min.src);
+ max_mask = htons(filter->mask->key.tp_max.src);
+ min_val = htons(filter->key.tp_min.src);
+ max_val = htons(filter->key.tp_max.src);
+
+ if (min_mask && max_mask) {
+ if (htons(key->tp.src) < min_val ||
+ htons(key->tp.src) > max_val)
+ return false;
+
+ /* skb does not have min and max values */
+ mkey->tp_min.src = filter->mkey.tp_min.src;
+ mkey->tp_max.src = filter->mkey.tp_max.src;
+ }
+ return true;
+}
+
+static struct cls_fl_filter *__fl_lookup(struct fl_flow_mask *mask,
+ struct fl_flow_key *mkey)
{
return rhashtable_lookup_fast(&mask->ht, fl_key_get_start(mkey, mask),
mask->filter_ht_params);
}
+static struct cls_fl_filter *fl_lookup_range(struct fl_flow_mask *mask,
+ struct fl_flow_key *mkey,
+ struct fl_flow_key *key)
+{
+ struct cls_fl_filter *filter, *f;
+
+ list_for_each_entry_rcu(filter, &mask->filters, list) {
+ if (!fl_range_port_dst_cmp(filter, key, mkey))
+ continue;
+
+ if (!fl_range_port_src_cmp(filter, key, mkey))
+ continue;
+
+ f = __fl_lookup(mask, mkey);
+ if (f)
+ return f;
+ }
+ return NULL;
+}
+
+static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
+ struct fl_flow_key *mkey,
+ struct fl_flow_key *key)
+{
+ if ((mask->flags & TCA_FLOWER_MASK_FLAGS_RANGE))
+ return fl_lookup_range(mask, mkey, key);
+
+ return __fl_lookup(mask, mkey);
+}
+
+static u16 fl_ct_info_to_flower_map[] = {
+ [IP_CT_ESTABLISHED] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED,
+ [IP_CT_RELATED] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_RELATED,
+ [IP_CT_ESTABLISHED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED,
+ [IP_CT_RELATED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_RELATED,
+ [IP_CT_NEW] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_NEW,
+};
+
static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct cls_fl_head *head = rcu_dereference_bh(tp->root);
- struct cls_fl_filter *f;
- struct fl_flow_mask *mask;
- struct fl_flow_key skb_key;
struct fl_flow_key skb_mkey;
+ struct fl_flow_key skb_key;
+ struct fl_flow_mask *mask;
+ struct cls_fl_filter *f;
list_for_each_entry_rcu(mask, &head->masks, list) {
fl_clear_masked_range(&skb_key, mask);
- skb_key.indev_ifindex = skb->skb_iif;
+ skb_flow_dissect_meta(skb, &mask->dissector, &skb_key);
/* skb_flow_dissect() does not set n_proto in case an unknown
* protocol, so do it rather here.
*/
skb_key.basic.n_proto = skb->protocol;
skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key);
+ skb_flow_dissect_ct(skb, &mask->dissector, &skb_key,
+ fl_ct_info_to_flower_map,
+ ARRAY_SIZE(fl_ct_info_to_flower_map));
skb_flow_dissect(skb, &mask->dissector, &skb_key, 0);
fl_set_masked_key(&skb_mkey, &skb_key, mask);
- f = fl_lookup(mask, &skb_mkey);
+ f = fl_lookup(mask, &skb_mkey, &skb_key);
if (f && !tc_skip_sw(f->flags)) {
*res = f->res;
return tcf_exts_exec(skb, &f->exts, res);
@@ -225,16 +330,22 @@
if (!head)
return -ENOBUFS;
+ spin_lock_init(&head->masks_lock);
INIT_LIST_HEAD_RCU(&head->masks);
+ INIT_LIST_HEAD(&head->hw_filters);
rcu_assign_pointer(tp->root, head);
idr_init(&head->handle_idr);
return rhashtable_init(&head->ht, &mask_ht_params);
}
-static void fl_mask_free(struct fl_flow_mask *mask)
+static void fl_mask_free(struct fl_flow_mask *mask, bool mask_init_done)
{
- rhashtable_destroy(&mask->ht);
+ /* temporary masks don't have their filters list and ht initialized */
+ if (mask_init_done) {
+ WARN_ON(!list_empty(&mask->filters));
+ rhashtable_destroy(&mask->ht);
+ }
kfree(mask);
}
@@ -243,25 +354,43 @@
struct fl_flow_mask *mask = container_of(to_rcu_work(work),
struct fl_flow_mask, rwork);
- fl_mask_free(mask);
+ fl_mask_free(mask, true);
}
-static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask,
- bool async)
+static void fl_uninit_mask_free_work(struct work_struct *work)
{
- if (!list_empty(&mask->filters))
+ struct fl_flow_mask *mask = container_of(to_rcu_work(work),
+ struct fl_flow_mask, rwork);
+
+ fl_mask_free(mask, false);
+}
+
+static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask)
+{
+ if (!refcount_dec_and_test(&mask->refcnt))
return false;
rhashtable_remove_fast(&head->ht, &mask->ht_node, mask_ht_params);
+
+ spin_lock(&head->masks_lock);
list_del_rcu(&mask->list);
- if (async)
- tcf_queue_work(&mask->rwork, fl_mask_free_work);
- else
- fl_mask_free(mask);
+ spin_unlock(&head->masks_lock);
+
+ tcf_queue_work(&mask->rwork, fl_mask_free_work);
return true;
}
+static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
+{
+ /* Flower classifier only changes root pointer during init and destroy.
+ * Users must obtain reference to tcf_proto instance before calling its
+ * API, so tp->root pointer is protected from concurrent call to
+ * fl_destroy() by reference counting.
+ */
+ return rcu_dereference_raw(tp->root);
+}
+
static void __fl_destroy_filter(struct cls_fl_filter *f)
{
tcf_exts_destroy(&f->exts);
@@ -274,52 +403,64 @@
struct cls_fl_filter *f = container_of(to_rcu_work(work),
struct cls_fl_filter, rwork);
- rtnl_lock();
__fl_destroy_filter(f);
- rtnl_unlock();
}
static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
- struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
+ struct flow_cls_offload cls_flower = {};
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
- cls_flower.command = TC_CLSFLOWER_DESTROY;
+ cls_flower.command = FLOW_CLS_DESTROY;
cls_flower.cookie = (unsigned long) f;
- tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER,
- &cls_flower, false);
- tcf_block_offload_dec(block, &f->flags);
+ tc_setup_cb_destroy(block, tp, TC_SETUP_CLSFLOWER, &cls_flower, false,
+ &f->flags, &f->in_hw_count, rtnl_held);
+
}
static int fl_hw_replace_filter(struct tcf_proto *tp,
- struct cls_fl_filter *f,
+ struct cls_fl_filter *f, bool rtnl_held,
struct netlink_ext_ack *extack)
{
- struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
+ struct flow_cls_offload cls_flower = {};
bool skip_sw = tc_skip_sw(f->flags);
- int err;
+ int err = 0;
+
+ cls_flower.rule = flow_rule_alloc(tcf_exts_num_actions(&f->exts));
+ if (!cls_flower.rule)
+ return -ENOMEM;
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
- cls_flower.command = TC_CLSFLOWER_REPLACE;
+ cls_flower.command = FLOW_CLS_REPLACE;
cls_flower.cookie = (unsigned long) f;
- cls_flower.dissector = &f->mask->dissector;
- cls_flower.mask = &f->mask->key;
- cls_flower.key = &f->mkey;
- cls_flower.exts = &f->exts;
+ cls_flower.rule->match.dissector = &f->mask->dissector;
+ cls_flower.rule->match.mask = &f->mask->key;
+ cls_flower.rule->match.key = &f->mkey;
cls_flower.classid = f->res.classid;
- err = tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER,
- &cls_flower, skip_sw);
- if (err < 0) {
- fl_hw_destroy_filter(tp, f, NULL);
+ err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts,
+ rtnl_held);
+ if (err) {
+ kfree(cls_flower.rule);
+ if (skip_sw) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to setup flow action");
+ return err;
+ }
+ return 0;
+ }
+
+ err = tc_setup_cb_add(block, tp, TC_SETUP_CLSFLOWER, &cls_flower,
+ skip_sw, &f->flags, &f->in_hw_count, rtnl_held);
+ tc_cleanup_flow_action(&cls_flower.rule->action);
+ kfree(cls_flower.rule);
+
+ if (err) {
+ fl_hw_destroy_filter(tp, f, rtnl_held, NULL);
return err;
- } else if (err > 0) {
- f->in_hw_count = err;
- tcf_block_offload_inc(block, &f->flags);
}
if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW))
@@ -328,40 +469,77 @@
return 0;
}
-static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
+static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
+ bool rtnl_held)
{
- struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
+ struct flow_cls_offload cls_flower = {};
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL);
- cls_flower.command = TC_CLSFLOWER_STATS;
+ cls_flower.command = FLOW_CLS_STATS;
cls_flower.cookie = (unsigned long) f;
- cls_flower.exts = &f->exts;
cls_flower.classid = f->res.classid;
- tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER,
- &cls_flower, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false,
+ rtnl_held);
+
+ tcf_exts_stats_update(&f->exts, cls_flower.stats.bytes,
+ cls_flower.stats.pkts,
+ cls_flower.stats.lastused);
}
-static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
- struct netlink_ext_ack *extack)
+static void __fl_put(struct cls_fl_filter *f)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
- bool async = tcf_exts_get_net(&f->exts);
- bool last;
+ if (!refcount_dec_and_test(&f->refcnt))
+ return;
- idr_remove(&head->handle_idr, f->handle);
- list_del_rcu(&f->list);
- last = fl_mask_put(head, f->mask, async);
- if (!tc_skip_hw(f->flags))
- fl_hw_destroy_filter(tp, f, extack);
- tcf_unbind_filter(tp, &f->res);
- if (async)
+ if (tcf_exts_get_net(&f->exts))
tcf_queue_work(&f->rwork, fl_destroy_filter_work);
else
__fl_destroy_filter(f);
+}
- return last;
+static struct cls_fl_filter *__fl_get(struct cls_fl_head *head, u32 handle)
+{
+ struct cls_fl_filter *f;
+
+ rcu_read_lock();
+ f = idr_find(&head->handle_idr, handle);
+ if (f && !refcount_inc_not_zero(&f->refcnt))
+ f = NULL;
+ rcu_read_unlock();
+
+ return f;
+}
+
+static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
+ bool *last, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct cls_fl_head *head = fl_head_dereference(tp);
+
+ *last = false;
+
+ spin_lock(&tp->lock);
+ if (f->deleted) {
+ spin_unlock(&tp->lock);
+ return -ENOENT;
+ }
+
+ f->deleted = true;
+ rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
+ f->mask->filter_ht_params);
+ idr_remove(&head->handle_idr, f->handle);
+ list_del_rcu(&f->list);
+ spin_unlock(&tp->lock);
+
+ *last = fl_mask_put(head, f->mask);
+ if (!tc_skip_hw(f->flags))
+ fl_hw_destroy_filter(tp, f, rtnl_held, extack);
+ tcf_unbind_filter(tp, &f->res);
+ __fl_put(f);
+
+ return 0;
}
static void fl_destroy_sleepable(struct work_struct *work)
@@ -375,15 +553,18 @@
module_put(THIS_MODULE);
}
-static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+static void fl_destroy(struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_head *head = fl_head_dereference(tp);
struct fl_flow_mask *mask, *next_mask;
struct cls_fl_filter *f, *next;
+ bool last;
list_for_each_entry_safe(mask, next_mask, &head->masks, list) {
list_for_each_entry_safe(f, next, &mask->filters, list) {
- if (__fl_delete(tp, f, extack))
+ __fl_delete(tp, f, &last, rtnl_held, extack);
+ if (last)
break;
}
}
@@ -393,11 +574,18 @@
tcf_queue_work(&head->rwork, fl_destroy_sleepable);
}
+static void fl_put(struct tcf_proto *tp, void *arg)
+{
+ struct cls_fl_filter *f = arg;
+
+ __fl_put(f);
+}
+
static void *fl_get(struct tcf_proto *tp, u32 handle)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_head *head = fl_head_dereference(tp);
- return idr_find(&head->handle_idr, handle);
+ return __fl_get(head, handle);
}
static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
@@ -486,6 +674,16 @@
[TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_ENC_OPTS] = { .type = NLA_NESTED },
[TCA_FLOWER_KEY_ENC_OPTS_MASK] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_CT_STATE] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CT_STATE_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CT_ZONE] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CT_ZONE_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CT_MARK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_CT_MARK_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_CT_LABELS] = { .type = NLA_BINARY,
+ .len = 128 / BITS_PER_BYTE },
+ [TCA_FLOWER_KEY_CT_LABELS_MASK] = { .type = NLA_BINARY,
+ .len = 128 / BITS_PER_BYTE },
};
static const struct nla_policy
@@ -507,11 +705,36 @@
{
if (!tb[val_type])
return;
- memcpy(val, nla_data(tb[val_type]), len);
+ nla_memcpy(val, tb[val_type], len);
if (mask_type == TCA_FLOWER_UNSPEC || !tb[mask_type])
memset(mask, 0xff, len);
else
- memcpy(mask, nla_data(tb[mask_type]), len);
+ nla_memcpy(mask, tb[mask_type], len);
+}
+
+/* Read the optional source/destination port range bounds from @tb into
+ * @key->tp_min/tp_max and the matching fields of @mask.
+ *
+ * Every bound is passed to fl_set_key_val() with TCA_FLOWER_UNSPEC as mask
+ * attribute, i.e. there is no separate mask attribute for these bounds.
+ *
+ * Returns -EINVAL when both bounds of a range have a non-zero mask and the
+ * maximum is not strictly greater than the minimum, 0 otherwise.
+ */
+static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key,
+ struct fl_flow_key *mask)
+{
+ fl_set_key_val(tb, &key->tp_min.dst,
+ TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_min.dst,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_min.dst));
+ fl_set_key_val(tb, &key->tp_max.dst,
+ TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_max.dst,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_max.dst));
+ fl_set_key_val(tb, &key->tp_min.src,
+ TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_min.src,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_min.src));
+ fl_set_key_val(tb, &key->tp_max.src,
+ TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_max.src,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_max.src));
+
+ /* A range is validated only when both of its bounds carry a non-zero
+ * mask; min == max is rejected as well.
+ * NOTE(review): the bounds look like network-order values, for which
+ * ntohs() would name the conversion more accurately than htons() -
+ * confirm; the comparison result is the same either way.
+ */
+ if ((mask->tp_min.dst && mask->tp_max.dst &&
+ htons(key->tp_max.dst) <= htons(key->tp_min.dst)) ||
+ (mask->tp_min.src && mask->tp_max.src &&
+ htons(key->tp_max.src) <= htons(key->tp_min.src)))
+ return -EINVAL;
+
+ return 0;
}
static int fl_set_key_mpls(struct nlattr **tb,
@@ -648,8 +871,9 @@
return -EINVAL;
}
- err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX,
- nla, geneve_opt_policy, extack);
+ err = nla_parse_nested_deprecated(tb,
+ TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX,
+ nla, geneve_opt_policy, extack);
if (err < 0)
return err;
@@ -711,18 +935,18 @@
const struct nlattr *nla_enc_key, *nla_opt_key, *nla_opt_msk = NULL;
int err, option_len, key_depth, msk_depth = 0;
- err = nla_validate_nested(tb[TCA_FLOWER_KEY_ENC_OPTS],
- TCA_FLOWER_KEY_ENC_OPTS_MAX,
- enc_opts_policy, extack);
+ err = nla_validate_nested_deprecated(tb[TCA_FLOWER_KEY_ENC_OPTS],
+ TCA_FLOWER_KEY_ENC_OPTS_MAX,
+ enc_opts_policy, extack);
if (err)
return err;
nla_enc_key = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS]);
if (tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]) {
- err = nla_validate_nested(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK],
- TCA_FLOWER_KEY_ENC_OPTS_MAX,
- enc_opts_policy, extack);
+ err = nla_validate_nested_deprecated(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK],
+ TCA_FLOWER_KEY_ENC_OPTS_MAX,
+ enc_opts_policy, extack);
if (err)
return err;
@@ -771,21 +995,65 @@
return 0;
}
+/* Copy the conntrack match fields (state, zone, mark, labels) that are
+ * present in @tb into @key and @mask.
+ *
+ * Each field is guarded by its own kernel config option: when the attribute
+ * is present but the option is not enabled, an extack message is set and
+ * -EOPNOTSUPP is returned. Returns 0 when every supplied attribute was
+ * accepted (including the case where none was supplied).
+ */
+static int fl_set_key_ct(struct nlattr **tb,
+ struct flow_dissector_key_ct *key,
+ struct flow_dissector_key_ct *mask,
+ struct netlink_ext_ack *extack)
+{
+ if (tb[TCA_FLOWER_KEY_CT_STATE]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK)) {
+ NL_SET_ERR_MSG(extack, "Conntrack isn't enabled");
+ return -EOPNOTSUPP;
+ }
+ fl_set_key_val(tb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE,
+ &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK,
+ sizeof(key->ct_state));
+ }
+ if (tb[TCA_FLOWER_KEY_CT_ZONE]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) {
+ NL_SET_ERR_MSG(extack, "Conntrack zones isn't enabled");
+ return -EOPNOTSUPP;
+ }
+ fl_set_key_val(tb, &key->ct_zone, TCA_FLOWER_KEY_CT_ZONE,
+ &mask->ct_zone, TCA_FLOWER_KEY_CT_ZONE_MASK,
+ sizeof(key->ct_zone));
+ }
+ if (tb[TCA_FLOWER_KEY_CT_MARK]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) {
+ NL_SET_ERR_MSG(extack, "Conntrack mark isn't enabled");
+ return -EOPNOTSUPP;
+ }
+ fl_set_key_val(tb, &key->ct_mark, TCA_FLOWER_KEY_CT_MARK,
+ &mask->ct_mark, TCA_FLOWER_KEY_CT_MARK_MASK,
+ sizeof(key->ct_mark));
+ }
+ if (tb[TCA_FLOWER_KEY_CT_LABELS]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) {
+ NL_SET_ERR_MSG(extack, "Conntrack labels aren't enabled");
+ return -EOPNOTSUPP;
+ }
+ /* ct_labels is passed without '&', unlike the scalar fields. */
+ fl_set_key_val(tb, key->ct_labels, TCA_FLOWER_KEY_CT_LABELS,
+ mask->ct_labels, TCA_FLOWER_KEY_CT_LABELS_MASK,
+ sizeof(key->ct_labels));
+ }
+
+ return 0;
+}
+
static int fl_set_key(struct net *net, struct nlattr **tb,
struct fl_flow_key *key, struct fl_flow_key *mask,
struct netlink_ext_ack *extack)
{
__be16 ethertype;
int ret = 0;
-#ifdef CONFIG_NET_CLS_IND
+
if (tb[TCA_FLOWER_INDEV]) {
int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV], extack);
if (err < 0)
return err;
- key->indev_ifindex = err;
- mask->indev_ifindex = 0xffffffff;
+ key->meta.ingress_ifindex = err;
+ mask->meta.ingress_ifindex = 0xffffffff;
}
-#endif
fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
@@ -921,6 +1189,14 @@
sizeof(key->arp.tha));
}
+ if (key->basic.ip_proto == IPPROTO_TCP ||
+ key->basic.ip_proto == IPPROTO_UDP ||
+ key->basic.ip_proto == IPPROTO_SCTP) {
+ ret = fl_set_key_port_range(tb, key, mask);
+ if (ret)
+ return ret;
+ }
+
if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) {
key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
@@ -973,6 +1249,10 @@
return ret;
}
+ ret = fl_set_key_ct(tb, &key->ct, &mask->ct, extack);
+ if (ret)
+ return ret;
+
if (tb[TCA_FLOWER_KEY_FLAGS])
ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
@@ -1005,7 +1285,7 @@
}
#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member)
-#define FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member))
+#define FL_KEY_MEMBER_SIZE(member) FIELD_SIZEOF(struct fl_flow_key, member)
#define FL_KEY_IS_MASKED(mask, member) \
memchr_inv(((char *)mask) + FL_KEY_MEMBER_OFFSET(member), \
@@ -1030,6 +1310,8 @@
struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX];
size_t cnt = 0;
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_META, meta);
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control);
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
@@ -1038,8 +1320,9 @@
FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
- FL_KEY_SET_IF_MASKED(mask, keys, cnt,
- FLOW_DISSECTOR_KEY_PORTS, tp);
+ if (FL_KEY_IS_MASKED(mask, tp) ||
+ FL_KEY_IS_MASKED(mask, tp_min) || FL_KEY_IS_MASKED(mask, tp_max))
+ FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_IP, ip);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
@@ -1070,6 +1353,8 @@
FLOW_DISSECTOR_KEY_ENC_IP, enc_ip);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_CT, ct);
skb_flow_dissector_init(dissector, keys, cnt);
}
@@ -1086,6 +1371,10 @@
fl_mask_copy(newmask, mask);
+ if ((newmask->key.tp_min.dst && newmask->key.tp_max.dst) ||
+ (newmask->key.tp_min.src && newmask->key.tp_max.src))
+ newmask->flags |= TCA_FLOWER_MASK_FLAGS_RANGE;
+
err = fl_init_mask_hashtable(newmask);
if (err)
goto errout_free;
@@ -1094,12 +1383,15 @@
INIT_LIST_HEAD_RCU(&newmask->filters);
- err = rhashtable_insert_fast(&head->ht, &newmask->ht_node,
- mask_ht_params);
+ refcount_set(&newmask->refcnt, 1);
+ err = rhashtable_replace_fast(&head->ht, &mask->ht_node,
+ &newmask->ht_node, mask_ht_params);
if (err)
goto errout_destroy;
+ spin_lock(&head->masks_lock);
list_add_tail_rcu(&newmask->list, &head->masks);
+ spin_unlock(&head->masks_lock);
return newmask;
@@ -1117,40 +1409,71 @@
struct fl_flow_mask *mask)
{
struct fl_flow_mask *newmask;
+ int ret = 0;
- fnew->mask = rhashtable_lookup_fast(&head->ht, mask, mask_ht_params);
+ rcu_read_lock();
+
+ /* Insert mask as temporary node to prevent concurrent creation of mask
+ * with same key. Any concurrent lookups with same key will return
+ * -EAGAIN because mask's refcnt is zero.
+ */
+ fnew->mask = rhashtable_lookup_get_insert_fast(&head->ht,
+ &mask->ht_node,
+ mask_ht_params);
if (!fnew->mask) {
- if (fold)
- return -EINVAL;
+ rcu_read_unlock();
+
+ if (fold) {
+ ret = -EINVAL;
+ goto errout_cleanup;
+ }
newmask = fl_create_new_mask(head, mask);
- if (IS_ERR(newmask))
- return PTR_ERR(newmask);
+ if (IS_ERR(newmask)) {
+ ret = PTR_ERR(newmask);
+ goto errout_cleanup;
+ }
fnew->mask = newmask;
+ return 0;
+ } else if (IS_ERR(fnew->mask)) {
+ ret = PTR_ERR(fnew->mask);
} else if (fold && fold->mask != fnew->mask) {
- return -EINVAL;
+ ret = -EINVAL;
+ } else if (!refcount_inc_not_zero(&fnew->mask->refcnt)) {
+ /* Mask was deleted concurrently, try again */
+ ret = -EAGAIN;
}
+ rcu_read_unlock();
+ return ret;
- return 0;
+errout_cleanup:
+ rhashtable_remove_fast(&head->ht, &mask->ht_node,
+ mask_ht_params);
+ return ret;
}
static int fl_set_parms(struct net *net, struct tcf_proto *tp,
struct cls_fl_filter *f, struct fl_flow_mask *mask,
unsigned long base, struct nlattr **tb,
struct nlattr *est, bool ovr,
- struct fl_flow_tmplt *tmplt,
+ struct fl_flow_tmplt *tmplt, bool rtnl_held,
struct netlink_ext_ack *extack)
{
int err;
- err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack);
+ err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, rtnl_held,
+ extack);
if (err < 0)
return err;
if (tb[TCA_FLOWER_CLASSID]) {
f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]);
+ if (!rtnl_held)
+ rtnl_lock();
tcf_bind_filter(tp, &f->res, base);
+ if (!rtnl_held)
+ rtnl_unlock();
}
err = fl_set_key(net, tb, &f->key, &mask->key, extack);
@@ -1168,27 +1491,61 @@
return 0;
}
+/* Try to insert @fnew into the hash table of its mask, refusing duplicates.
+ *
+ * *in_ht is set to true only when @fnew was actually inserted. An -EEXIST
+ * result is turned into success when @fold is non-NULL (overwrite case);
+ * *in_ht stays false then. Any other error from
+ * rhashtable_lookup_insert_fast() is returned unchanged.
+ */
+static int fl_ht_insert_unique(struct cls_fl_filter *fnew,
+ struct cls_fl_filter *fold,
+ bool *in_ht)
+{
+ struct fl_flow_mask *mask = fnew->mask;
+ int err;
+
+ err = rhashtable_lookup_insert_fast(&mask->ht,
+ &fnew->ht_node,
+ mask->filter_ht_params);
+ if (err) {
+ *in_ht = false;
+ /* It is okay if filter with same key exists when
+ * overwriting.
+ */
+ return fold && err == -EEXIST ? 0 : err;
+ }
+
+ *in_ht = true;
+ return 0;
+}
+
static int fl_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
- void **arg, bool ovr, struct netlink_ext_ack *extack)
+ void **arg, bool ovr, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_head *head = fl_head_dereference(tp);
struct cls_fl_filter *fold = *arg;
struct cls_fl_filter *fnew;
+ struct fl_flow_mask *mask;
struct nlattr **tb;
- struct fl_flow_mask mask = {};
+ bool in_ht;
int err;
- if (!tca[TCA_OPTIONS])
- return -EINVAL;
+ if (!tca[TCA_OPTIONS]) {
+ err = -EINVAL;
+ goto errout_fold;
+ }
+
+ mask = kzalloc(sizeof(struct fl_flow_mask), GFP_KERNEL);
+ if (!mask) {
+ err = -ENOBUFS;
+ goto errout_fold;
+ }
tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL);
- if (!tb)
- return -ENOBUFS;
+ if (!tb) {
+ err = -ENOBUFS;
+ goto errout_mask_alloc;
+ }
- err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS],
- fl_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_FLOWER_MAX,
+ tca[TCA_OPTIONS], fl_policy, NULL);
if (err < 0)
goto errout_tb;
@@ -1202,208 +1559,340 @@
err = -ENOBUFS;
goto errout_tb;
}
+ INIT_LIST_HEAD(&fnew->hw_list);
+ refcount_set(&fnew->refcnt, 1);
- err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0);
+ err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0);
if (err < 0)
goto errout;
- if (!handle) {
- handle = 1;
- err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
- INT_MAX, GFP_KERNEL);
- } else if (!fold) {
- /* user specifies a handle and it doesn't exist */
- err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
- handle, GFP_KERNEL);
- }
- if (err)
- goto errout;
- fnew->handle = handle;
-
if (tb[TCA_FLOWER_FLAGS]) {
fnew->flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
if (!tc_flags_valid(fnew->flags)) {
err = -EINVAL;
- goto errout_idr;
+ goto errout;
}
}
- err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr,
- tp->chain->tmplt_priv, extack);
+ err = fl_set_parms(net, tp, fnew, mask, base, tb, tca[TCA_RATE], ovr,
+ tp->chain->tmplt_priv, rtnl_held, extack);
if (err)
- goto errout_idr;
+ goto errout;
- err = fl_check_assign_mask(head, fnew, fold, &mask);
+ err = fl_check_assign_mask(head, fnew, fold, mask);
if (err)
- goto errout_idr;
+ goto errout;
- if (!tc_skip_sw(fnew->flags)) {
- if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) {
- err = -EEXIST;
- goto errout_mask;
- }
-
- err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node,
- fnew->mask->filter_ht_params);
- if (err)
- goto errout_mask;
- }
+ err = fl_ht_insert_unique(fnew, fold, &in_ht);
+ if (err)
+ goto errout_mask;
if (!tc_skip_hw(fnew->flags)) {
- err = fl_hw_replace_filter(tp, fnew, extack);
+ err = fl_hw_replace_filter(tp, fnew, rtnl_held, extack);
if (err)
- goto errout_mask;
+ goto errout_ht;
}
if (!tc_in_hw(fnew->flags))
fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+ spin_lock(&tp->lock);
+
+ /* tp was deleted concurrently. -EAGAIN will cause caller to lookup
+ * proto again or create new one, if necessary.
+ */
+ if (tp->deleting) {
+ err = -EAGAIN;
+ goto errout_hw;
+ }
+
if (fold) {
- if (!tc_skip_sw(fold->flags))
- rhashtable_remove_fast(&fold->mask->ht,
- &fold->ht_node,
- fold->mask->filter_ht_params);
+ /* Fold filter was deleted concurrently. Retry lookup. */
+ if (fold->deleted) {
+ err = -EAGAIN;
+ goto errout_hw;
+ }
+
+ fnew->handle = handle;
+
+ if (!in_ht) {
+ struct rhashtable_params params =
+ fnew->mask->filter_ht_params;
+
+ err = rhashtable_insert_fast(&fnew->mask->ht,
+ &fnew->ht_node,
+ params);
+ if (err)
+ goto errout_hw;
+ in_ht = true;
+ }
+
+ refcount_inc(&fnew->refcnt);
+ rhashtable_remove_fast(&fold->mask->ht,
+ &fold->ht_node,
+ fold->mask->filter_ht_params);
+ idr_replace(&head->handle_idr, fnew, fnew->handle);
+ list_replace_rcu(&fold->list, &fnew->list);
+ fold->deleted = true;
+
+ spin_unlock(&tp->lock);
+
+ fl_mask_put(head, fold->mask);
if (!tc_skip_hw(fold->flags))
- fl_hw_destroy_filter(tp, fold, NULL);
+ fl_hw_destroy_filter(tp, fold, rtnl_held, NULL);
+ tcf_unbind_filter(tp, &fold->res);
+ /* Caller holds reference to fold, so refcnt is always > 0
+ * after this.
+ */
+ refcount_dec(&fold->refcnt);
+ __fl_put(fold);
+ } else {
+ if (handle) {
+ /* user specifies a handle and it doesn't exist */
+ err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
+ handle, GFP_ATOMIC);
+
+ /* Filter with specified handle was concurrently
+ * inserted after initial check in cls_api. This is not
+ * necessarily an error if NLM_F_EXCL is not set in
+ * message flags. Returning EAGAIN will cause cls_api to
+ * try to update concurrently inserted rule.
+ */
+ if (err == -ENOSPC)
+ err = -EAGAIN;
+ } else {
+ handle = 1;
+ err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
+ INT_MAX, GFP_ATOMIC);
+ }
+ if (err)
+ goto errout_hw;
+
+ refcount_inc(&fnew->refcnt);
+ fnew->handle = handle;
+ list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
+ spin_unlock(&tp->lock);
}
*arg = fnew;
- if (fold) {
- idr_replace(&head->handle_idr, fnew, fnew->handle);
- list_replace_rcu(&fold->list, &fnew->list);
- tcf_unbind_filter(tp, &fold->res);
- tcf_exts_get_net(&fold->exts);
- tcf_queue_work(&fold->rwork, fl_destroy_filter_work);
- } else {
- list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
- }
-
kfree(tb);
+ tcf_queue_work(&mask->rwork, fl_uninit_mask_free_work);
return 0;
+errout_ht:
+ spin_lock(&tp->lock);
+errout_hw:
+ fnew->deleted = true;
+ spin_unlock(&tp->lock);
+ if (!tc_skip_hw(fnew->flags))
+ fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
+ if (in_ht)
+ rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
+ fnew->mask->filter_ht_params);
errout_mask:
- fl_mask_put(head, fnew->mask, false);
-
-errout_idr:
- if (!fold)
- idr_remove(&head->handle_idr, fnew->handle);
+ fl_mask_put(head, fnew->mask);
errout:
- tcf_exts_destroy(&fnew->exts);
- kfree(fnew);
+ __fl_put(fnew);
errout_tb:
kfree(tb);
+errout_mask_alloc:
+ tcf_queue_work(&mask->rwork, fl_uninit_mask_free_work);
+errout_fold:
+ if (fold)
+ __fl_put(fold);
return err;
}
+/* Delete the filter passed in @arg via __fl_delete() and drop one more
+ * reference on it with __fl_put().
+ *
+ * *last reports whether the head's mask list is empty afterwards; the
+ * per-mask result stored in last_on_mask is not used for it. Returns the
+ * result of __fl_delete().
+ */
static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_head *head = fl_head_dereference(tp);
struct cls_fl_filter *f = arg;
+ bool last_on_mask;
+ int err = 0;
- if (!tc_skip_sw(f->flags))
- rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
- f->mask->filter_ht_params);
- __fl_delete(tp, f, extack);
+ err = __fl_delete(tp, f, &last_on_mask, rtnl_held, extack);
*last = list_empty(&head->masks);
- return 0;
+ __fl_put(f);
+
+ return err;
}
-static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+/* Call arg->fn for the filters in the handle IDR, starting from the
+ * position stored in arg->cookie.
+ *
+ * A reference is held on each filter only for the duration of the callback.
+ * The walk stops (arg->stop = 1) as soon as the callback returns a negative
+ * value. arg->cookie is updated from the iteration variable on exit.
+ * @rtnl_held is not used here.
+ */
+static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_head *head = fl_head_dereference(tp);
+ unsigned long id = arg->cookie, tmp;
struct cls_fl_filter *f;
arg->count = arg->skip;
- while ((f = idr_get_next_ul(&head->handle_idr,
- &arg->cookie)) != NULL) {
+ idr_for_each_entry_continue_ul(&head->handle_idr, f, tmp, id) {
+ /* don't return filters that are being deleted */
+ if (!refcount_inc_not_zero(&f->refcnt))
+ continue;
if (arg->fn(tp, f, arg) < 0) {
+ __fl_put(f);
arg->stop = 1;
break;
}
- arg->cookie = f->handle + 1;
+ __fl_put(f);
arg->count++;
}
+ arg->cookie = id;
}
-static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+/* Return the next filter on head->hw_filters after @f with a reference
+ * taken, or NULL when there is none.
+ *
+ * A NULL @f starts the search at the beginning of the list. Filters whose
+ * refcount cannot be raised are skipped; when @add is true, filters marked
+ * deleted are skipped as well. The list is inspected under tp->lock, which
+ * is released before returning.
+ */
+static struct cls_fl_filter *
+fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
+{
+ struct cls_fl_head *head = fl_head_dereference(tp);
+
+ spin_lock(&tp->lock);
+ if (list_empty(&head->hw_filters)) {
+ spin_unlock(&tp->lock);
+ return NULL;
+ }
+
+ /* With no starting filter, use the list head itself as the cursor so
+ * that the "continue" iteration below begins with the first element.
+ */
+ if (!f)
+ f = list_entry(&head->hw_filters, struct cls_fl_filter,
+ hw_list);
+ list_for_each_entry_continue(f, &head->hw_filters, hw_list) {
+ if (!(add && f->deleted) && refcount_inc_not_zero(&f->refcnt)) {
+ spin_unlock(&tp->lock);
+ return f;
+ }
+ }
+
+ spin_unlock(&tp->lock);
+ return NULL;
+}
+
+/* Offer every filter on the hardware-offload list to callback @cb, as a
+ * FLOW_CLS_REPLACE when @add is true and as a FLOW_CLS_DESTROY otherwise.
+ *
+ * For each filter returned by fl_get_next_hw_filter() a temporary flow rule
+ * is allocated, filled from the filter's mask, key and actions and handed
+ * to tc_setup_cb_reoffload(). The filter reference is dropped with
+ * __fl_put() on every path.
+ *
+ * Returns 0 once the list is exhausted, -ENOMEM when a rule cannot be
+ * allocated, the error of tc_setup_flow_action() for a skip_sw filter, or
+ * the error of tc_setup_cb_reoffload(). A tc_setup_flow_action() failure on
+ * a filter without skip_sw only skips that filter.
+ */
+static int fl_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
void *cb_priv, struct netlink_ext_ack *extack)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
- struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
- struct fl_flow_mask *mask;
- struct cls_fl_filter *f;
+ struct flow_cls_offload cls_flower = {};
+ struct cls_fl_filter *f = NULL;
int err;
- list_for_each_entry(mask, &head->masks, list) {
- list_for_each_entry(f, &mask->filters, list) {
- if (tc_skip_hw(f->flags))
- continue;
+ /* hw_filters list can only be changed by hw offload functions after
+ * obtaining rtnl lock. Make sure it is not changed while reoffload is
+ * iterating it.
+ */
+ ASSERT_RTNL();
- tc_cls_common_offload_init(&cls_flower.common, tp,
- f->flags, extack);
- cls_flower.command = add ?
- TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY;
- cls_flower.cookie = (unsigned long)f;
- cls_flower.dissector = &mask->dissector;
- cls_flower.mask = &mask->key;
- cls_flower.key = &f->mkey;
- cls_flower.exts = &f->exts;
- cls_flower.classid = f->res.classid;
-
- err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv);
- if (err) {
- if (add && tc_skip_sw(f->flags))
- return err;
- continue;
- }
-
- tc_cls_offload_cnt_update(block, &f->in_hw_count,
- &f->flags, add);
+ while ((f = fl_get_next_hw_filter(tp, f, add))) {
+ cls_flower.rule =
+ flow_rule_alloc(tcf_exts_num_actions(&f->exts));
+ if (!cls_flower.rule) {
+ __fl_put(f);
+ return -ENOMEM;
}
+
+ tc_cls_common_offload_init(&cls_flower.common, tp, f->flags,
+ extack);
+ cls_flower.command = add ?
+ FLOW_CLS_REPLACE : FLOW_CLS_DESTROY;
+ cls_flower.cookie = (unsigned long)f;
+ cls_flower.rule->match.dissector = &f->mask->dissector;
+ cls_flower.rule->match.mask = &f->mask->key;
+ cls_flower.rule->match.key = &f->mkey;
+
+ err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts,
+ true);
+ if (err) {
+ kfree(cls_flower.rule);
+ if (tc_skip_sw(f->flags)) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to setup flow action");
+ __fl_put(f);
+ return err;
+ }
+ goto next_flow;
+ }
+
+ cls_flower.classid = f->res.classid;
+
+ err = tc_setup_cb_reoffload(block, tp, add, cb,
+ TC_SETUP_CLSFLOWER, &cls_flower,
+ cb_priv, &f->flags,
+ &f->in_hw_count);
+ tc_cleanup_flow_action(&cls_flower.rule->action);
+ kfree(cls_flower.rule);
+
+ if (err) {
+ __fl_put(f);
+ return err;
+ }
+next_flow:
+ __fl_put(f);
}
return 0;
}
-static void fl_hw_create_tmplt(struct tcf_chain *chain,
- struct fl_flow_tmplt *tmplt)
+/* Put the filter referenced by the offload cookie in @type_data on the
+ * head's hw_filters list. The list is modified under tp->lock.
+ */
+static void fl_hw_add(struct tcf_proto *tp, void *type_data)
{
- struct tc_cls_flower_offload cls_flower = {};
+ struct flow_cls_offload *cls_flower = type_data;
+ struct cls_fl_filter *f =
+ (struct cls_fl_filter *) cls_flower->cookie;
+ struct cls_fl_head *head = fl_head_dereference(tp);
+
+ spin_lock(&tp->lock);
+ list_add(&f->hw_list, &head->hw_filters);
+ spin_unlock(&tp->lock);
+}
+
+/* Take the filter referenced by the offload cookie in @type_data off the
+ * hardware filter list, if it is on one. The list is modified under
+ * tp->lock.
+ */
+static void fl_hw_del(struct tcf_proto *tp, void *type_data)
+{
+ struct flow_cls_offload *cls_flower = type_data;
+ struct cls_fl_filter *f =
+ (struct cls_fl_filter *) cls_flower->cookie;
+
+ spin_lock(&tp->lock);
+ /* list_del_init() leaves hw_list empty, so a repeated call is a no-op. */
+ if (!list_empty(&f->hw_list))
+ list_del_init(&f->hw_list);
+ spin_unlock(&tp->lock);
+}
+
+/* Announce template @tmplt to the callbacks of @chain's block with a
+ * FLOW_CLS_TMPLT_CREATE command.
+ *
+ * A temporary flow rule without actions carries the template's dissector,
+ * mask and dummy key; it is freed again before returning. Returns -ENOMEM
+ * when that rule cannot be allocated and 0 otherwise - the result of
+ * tc_setup_cb_call() is not propagated.
+ */
+static int fl_hw_create_tmplt(struct tcf_chain *chain,
+ struct fl_flow_tmplt *tmplt)
+{
+ struct flow_cls_offload cls_flower = {};
struct tcf_block *block = chain->block;
- struct tcf_exts dummy_exts = { 0, };
+
+ cls_flower.rule = flow_rule_alloc(0);
+ if (!cls_flower.rule)
+ return -ENOMEM;
cls_flower.common.chain_index = chain->index;
- cls_flower.command = TC_CLSFLOWER_TMPLT_CREATE;
+ cls_flower.command = FLOW_CLS_TMPLT_CREATE;
cls_flower.cookie = (unsigned long) tmplt;
- cls_flower.dissector = &tmplt->dissector;
- cls_flower.mask = &tmplt->mask;
- cls_flower.key = &tmplt->dummy_key;
- cls_flower.exts = &dummy_exts;
+ cls_flower.rule->match.dissector = &tmplt->dissector;
+ cls_flower.rule->match.mask = &tmplt->mask;
+ cls_flower.rule->match.key = &tmplt->dummy_key;
/* We don't care if driver (any of them) fails to handle this
* call. It serves just as a hint for it.
*/
- tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER,
- &cls_flower, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false, true);
+ kfree(cls_flower.rule);
+
+ return 0;
}
+/* Tell the callbacks of @chain's block that template @tmplt goes away by
+ * issuing a FLOW_CLS_TMPLT_DESTROY command. Only the chain index, command
+ * and cookie are filled in; the result of tc_setup_cb_call() is ignored.
+ */
static void fl_hw_destroy_tmplt(struct tcf_chain *chain,
struct fl_flow_tmplt *tmplt)
{
- struct tc_cls_flower_offload cls_flower = {};
+ struct flow_cls_offload cls_flower = {};
struct tcf_block *block = chain->block;
cls_flower.common.chain_index = chain->index;
- cls_flower.command = TC_CLSFLOWER_TMPLT_DESTROY;
+ cls_flower.command = FLOW_CLS_TMPLT_DESTROY;
cls_flower.cookie = (unsigned long) tmplt;
- tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER,
- &cls_flower, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false, true);
}
static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain,
@@ -1420,8 +1909,8 @@
tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL);
if (!tb)
return ERR_PTR(-ENOBUFS);
- err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS],
- fl_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_FLOWER_MAX,
+ tca[TCA_OPTIONS], fl_policy, NULL);
if (err)
goto errout_tb;
@@ -1434,12 +1923,14 @@
err = fl_set_key(net, tb, &tmplt->dummy_key, &tmplt->mask, extack);
if (err)
goto errout_tmplt;
- kfree(tb);
fl_init_dissector(&tmplt->dissector, &tmplt->mask);
- fl_hw_create_tmplt(chain, tmplt);
+ err = fl_hw_create_tmplt(chain, tmplt);
+ if (err)
+ goto errout_tmplt;
+ kfree(tb);
return tmplt;
errout_tmplt:
@@ -1476,6 +1967,26 @@
return 0;
}
+/* Dump the four port range bounds (dst min/max, src min/max) of @key into
+ * @skb through fl_dump_key_val(), passing TCA_FLOWER_UNSPEC as mask
+ * attribute type for each of them.
+ *
+ * Returns 0 on success and -1 as soon as one fl_dump_key_val() call returns
+ * non-zero; the remaining bounds are not attempted in that case.
+ */
+static int fl_dump_key_port_range(struct sk_buff *skb, struct fl_flow_key *key,
+ struct fl_flow_key *mask)
+{
+ if (fl_dump_key_val(skb, &key->tp_min.dst, TCA_FLOWER_KEY_PORT_DST_MIN,
+ &mask->tp_min.dst, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_min.dst)) ||
+ fl_dump_key_val(skb, &key->tp_max.dst, TCA_FLOWER_KEY_PORT_DST_MAX,
+ &mask->tp_max.dst, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_max.dst)) ||
+ fl_dump_key_val(skb, &key->tp_min.src, TCA_FLOWER_KEY_PORT_SRC_MIN,
+ &mask->tp_min.src, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_min.src)) ||
+ fl_dump_key_val(skb, &key->tp_max.src, TCA_FLOWER_KEY_PORT_SRC_MAX,
+ &mask->tp_max.src, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_max.src)))
+ return -1;
+
+ return 0;
+}
+
static int fl_dump_key_mpls(struct sk_buff *skb,
struct flow_dissector_key_mpls *mpls_key,
struct flow_dissector_key_mpls *mpls_mask)
@@ -1597,7 +2108,7 @@
struct nlattr *nest;
int opt_off = 0;
- nest = nla_nest_start(skb, TCA_FLOWER_KEY_ENC_OPTS_GENEVE);
+ nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_GENEVE);
if (!nest)
goto nla_put_failure;
@@ -1624,6 +2135,40 @@
return -EMSGSIZE;
}
+/* Dump the conntrack match fields (state, zone, mark, labels) of @key and
+ * @mask into @skb.
+ *
+ * A field is handed to fl_dump_key_val() only when the corresponding kernel
+ * config option is enabled. Returns 0 on success and -EMSGSIZE when any
+ * fl_dump_key_val() call returns non-zero.
+ */
+static int fl_dump_key_ct(struct sk_buff *skb,
+ struct flow_dissector_key_ct *key,
+ struct flow_dissector_key_ct *mask)
+{
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK) &&
+ fl_dump_key_val(skb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE,
+ &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK,
+ sizeof(key->ct_state)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
+ fl_dump_key_val(skb, &key->ct_zone, TCA_FLOWER_KEY_CT_ZONE,
+ &mask->ct_zone, TCA_FLOWER_KEY_CT_ZONE_MASK,
+ sizeof(key->ct_zone)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
+ fl_dump_key_val(skb, &key->ct_mark, TCA_FLOWER_KEY_CT_MARK,
+ &mask->ct_mark, TCA_FLOWER_KEY_CT_MARK_MASK,
+ sizeof(key->ct_mark)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
+ fl_dump_key_val(skb, &key->ct_labels, TCA_FLOWER_KEY_CT_LABELS,
+ &mask->ct_labels, TCA_FLOWER_KEY_CT_LABELS_MASK,
+ sizeof(key->ct_labels)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
struct flow_dissector_key_enc_opts *enc_opts)
{
@@ -1633,7 +2178,7 @@
if (!enc_opts->len)
return 0;
- nest = nla_nest_start(skb, enc_opt_type);
+ nest = nla_nest_start_noflag(skb, enc_opt_type);
if (!nest)
goto nla_put_failure;
@@ -1670,10 +2215,10 @@
static int fl_dump_key(struct sk_buff *skb, struct net *net,
struct fl_flow_key *key, struct fl_flow_key *mask)
{
- if (mask->indev_ifindex) {
+ if (mask->meta.ingress_ifindex) {
struct net_device *dev;
- dev = __dev_get_by_index(net, key->indev_ifindex);
+ dev = __dev_get_by_index(net, key->meta.ingress_ifindex);
if (dev && nla_put_string(skb, TCA_FLOWER_INDEV, dev->name))
goto nla_put_failure;
}
@@ -1812,6 +2357,12 @@
sizeof(key->arp.tha))))
goto nla_put_failure;
+ if ((key->basic.ip_proto == IPPROTO_TCP ||
+ key->basic.ip_proto == IPPROTO_UDP ||
+ key->basic.ip_proto == IPPROTO_SCTP) &&
+ fl_dump_key_port_range(skb, key, mask))
+ goto nla_put_failure;
+
if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
(fl_dump_key_val(skb, &key->enc_ipv4.src,
TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src,
@@ -1851,6 +2402,9 @@
fl_dump_key_enc_opt(skb, &key->enc_opts, &mask->enc_opts))
goto nla_put_failure;
+ if (fl_dump_key_ct(skb, &key->ct, &mask->ct))
+ goto nla_put_failure;
+
if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
goto nla_put_failure;
@@ -1861,35 +2415,44 @@
}
static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
{
struct cls_fl_filter *f = fh;
struct nlattr *nest;
struct fl_flow_key *key, *mask;
+ bool skip_hw;
if (!f)
return skb->len;
t->tcm_handle = f->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!nest)
goto nla_put_failure;
+ spin_lock(&tp->lock);
+
if (f->res.classid &&
nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid))
- goto nla_put_failure;
+ goto nla_put_failure_locked;
key = &f->key;
mask = &f->mask->key;
+ skip_hw = tc_skip_hw(f->flags);
if (fl_dump_key(skb, net, key, mask))
- goto nla_put_failure;
-
- if (!tc_skip_hw(f->flags))
- fl_hw_update_stats(tp, f);
+ goto nla_put_failure_locked;
if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
+ goto nla_put_failure_locked;
+
+ spin_unlock(&tp->lock);
+
+ if (!skip_hw)
+ fl_hw_update_stats(tp, f, rtnl_held);
+
+ if (nla_put_u32(skb, TCA_FLOWER_IN_HW_COUNT, f->in_hw_count))
goto nla_put_failure;
if (tcf_exts_dump(skb, &f->exts))
@@ -1902,6 +2465,8 @@
return skb->len;
+nla_put_failure_locked:
+ spin_unlock(&tp->lock);
nla_put_failure:
nla_nest_cancel(skb, nest);
return -1;
@@ -1913,7 +2478,7 @@
struct fl_flow_key *key, *mask;
struct nlattr *nest;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!nest)
goto nla_put_failure;
@@ -1946,16 +2511,20 @@
.init = fl_init,
.destroy = fl_destroy,
.get = fl_get,
+ .put = fl_put,
.change = fl_change,
.delete = fl_delete,
.walk = fl_walk,
.reoffload = fl_reoffload,
+ .hw_add = fl_hw_add,
+ .hw_del = fl_hw_del,
.dump = fl_dump,
.bind_class = fl_bind_class,
.tmplt_create = fl_tmplt_create,
.tmplt_destroy = fl_tmplt_destroy,
.tmplt_dump = fl_tmplt_dump,
.owner = THIS_MODULE,
+ .flags = TCF_PROTO_OPS_DOIT_UNLOCKED,
};
static int __init cls_fl_init(void)
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 29eeeaf..c9496c9 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -1,21 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel).
* Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension
- *
- * JHS: We should remove the CONFIG_NET_CLS_IND from here
- * eventually when the meta match extension is made available
- *
*/
#include <linux/module.h>
@@ -42,9 +34,7 @@
struct fw_filter __rcu *next;
u32 id;
struct tcf_result res;
-#ifdef CONFIG_NET_CLS_IND
int ifindex;
-#endif /* CONFIG_NET_CLS_IND */
struct tcf_exts exts;
struct tcf_proto *tp;
struct rcu_work rwork;
@@ -72,10 +62,8 @@
f = rcu_dereference_bh(f->next)) {
if (f->id == id) {
*res = f->res;
-#ifdef CONFIG_NET_CLS_IND
if (!tcf_match_indev(skb, f->ifindex))
continue;
-#endif /* CONFIG_NET_CLS_IND */
r = tcf_exts_exec(skb, &f->exts, res);
if (r < 0)
continue;
@@ -139,7 +127,8 @@
rtnl_unlock();
}
-static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+static void fw_destroy(struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct fw_filter *f;
@@ -163,7 +152,7 @@
}
static int fw_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct fw_filter *f = arg;
@@ -217,7 +206,7 @@
int err;
err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, ovr,
- extack);
+ true, extack);
if (err < 0)
return err;
@@ -226,7 +215,6 @@
tcf_bind_filter(tp, &f->res, base);
}
-#ifdef CONFIG_NET_CLS_IND
if (tb[TCA_FW_INDEV]) {
int ret;
ret = tcf_change_indev(net, tb[TCA_FW_INDEV], extack);
@@ -234,7 +222,6 @@
return ret;
f->ifindex = ret;
}
-#endif /* CONFIG_NET_CLS_IND */
err = -EINVAL;
if (tb[TCA_FW_MASK]) {
@@ -250,7 +237,8 @@
static int fw_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca, void **arg,
- bool ovr, struct netlink_ext_ack *extack)
+ bool ovr, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct fw_filter *f = *arg;
@@ -261,7 +249,8 @@
if (!opt)
return handle ? -EINVAL : 0; /* Succeed if it is old method. */
- err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_FW_MAX, opt, fw_policy,
+ NULL);
if (err < 0)
return err;
@@ -278,12 +267,11 @@
fnew->id = f->id;
fnew->res = f->res;
-#ifdef CONFIG_NET_CLS_IND
fnew->ifindex = f->ifindex;
-#endif /* CONFIG_NET_CLS_IND */
fnew->tp = f->tp;
- err = tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE);
+ err = tcf_exts_init(&fnew->exts, net, TCA_FW_ACT,
+ TCA_FW_POLICE);
if (err < 0) {
kfree(fnew);
return err;
@@ -332,7 +320,7 @@
if (f == NULL)
return -ENOBUFS;
- err = tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE);
+ err = tcf_exts_init(&f->exts, net, TCA_FW_ACT, TCA_FW_POLICE);
if (err < 0)
goto errout;
f->id = handle;
@@ -354,7 +342,8 @@
return err;
}
-static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
struct fw_head *head = rtnl_dereference(tp->root);
int h;
@@ -384,7 +373,7 @@
}
static int fw_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct fw_filter *f = fh;
@@ -398,21 +387,19 @@
if (!f->res.classid && !tcf_exts_has_actions(&f->exts))
return skb->len;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (f->res.classid &&
nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid))
goto nla_put_failure;
-#ifdef CONFIG_NET_CLS_IND
if (f->ifindex) {
struct net_device *dev;
dev = __dev_get_by_index(net, f->ifindex);
if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name))
goto nla_put_failure;
}
-#endif /* CONFIG_NET_CLS_IND */
if (head->mask != 0xFFFFFFFF &&
nla_put_u32(skb, TCA_FW_MASK, head->mask))
goto nla_put_failure;
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 856fa79..7fc2eb6 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -1,17 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_matchll.c Match-all classifier
*
* Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/percpu.h>
#include <net/sch_generic.h>
#include <net/pkt_cls.h>
@@ -22,7 +19,9 @@
u32 handle;
u32 flags;
unsigned int in_hw_count;
+ struct tc_matchall_pcnt __percpu *pf;
struct rcu_work rwork;
+ bool deleting;
};
static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -30,10 +29,14 @@
{
struct cls_mall_head *head = rcu_dereference_bh(tp->root);
+ if (unlikely(!head))
+ return -1;
+
if (tc_skip_sw(head->flags))
return -1;
*res = head->res;
+ __this_cpu_inc(head->pf->rhit);
return tcf_exts_exec(skb, &head->exts, res);
}
@@ -46,6 +49,7 @@
{
tcf_exts_destroy(&head->exts);
tcf_exts_put_net(&head->exts);
+ free_percpu(head->pf);
kfree(head);
}
@@ -71,8 +75,8 @@
cls_mall.command = TC_CLSMATCHALL_DESTROY;
cls_mall.cookie = cookie;
- tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL, &cls_mall, false);
- tcf_block_offload_dec(block, &head->flags);
+ tc_setup_cb_destroy(block, tp, TC_SETUP_CLSMATCHALL, &cls_mall, false,
+ &head->flags, &head->in_hw_count, true);
}
static int mall_replace_hw_filter(struct tcf_proto *tp,
@@ -85,19 +89,34 @@
bool skip_sw = tc_skip_sw(head->flags);
int err;
+ cls_mall.rule = flow_rule_alloc(tcf_exts_num_actions(&head->exts));
+ if (!cls_mall.rule)
+ return -ENOMEM;
+
tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack);
cls_mall.command = TC_CLSMATCHALL_REPLACE;
- cls_mall.exts = &head->exts;
cls_mall.cookie = cookie;
- err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL,
- &cls_mall, skip_sw);
- if (err < 0) {
+ err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts, true);
+ if (err) {
+ kfree(cls_mall.rule);
+ mall_destroy_hw_filter(tp, head, cookie, NULL);
+ if (skip_sw)
+ NL_SET_ERR_MSG_MOD(extack, "Failed to setup flow action");
+ else
+ err = 0;
+
+ return err;
+ }
+
+ err = tc_setup_cb_add(block, tp, TC_SETUP_CLSMATCHALL, &cls_mall,
+ skip_sw, &head->flags, &head->in_hw_count, true);
+ tc_cleanup_flow_action(&cls_mall.rule->action);
+ kfree(cls_mall.rule);
+
+ if (err) {
mall_destroy_hw_filter(tp, head, cookie, NULL);
return err;
- } else if (err > 0) {
- head->in_hw_count = err;
- tcf_block_offload_inc(block, &head->flags);
}
if (skip_sw && !(head->flags & TCA_CLS_FLAGS_IN_HW))
@@ -106,7 +125,8 @@
return 0;
}
-static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+static void mall_destroy(struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct cls_mall_head *head = rtnl_dereference(tp->root);
@@ -126,6 +146,11 @@
static void *mall_get(struct tcf_proto *tp, u32 handle)
{
+ struct cls_mall_head *head = rtnl_dereference(tp->root);
+
+ if (head && head->handle == handle)
+ return head;
+
return NULL;
}
@@ -142,7 +167,8 @@
{
int err;
- err = tcf_exts_validate(net, tp, tb, est, &head->exts, ovr, extack);
+ err = tcf_exts_validate(net, tp, tb, est, &head->exts, ovr, true,
+ extack);
if (err < 0)
return err;
@@ -156,7 +182,8 @@
static int mall_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
- void **arg, bool ovr, struct netlink_ext_ack *extack)
+ void **arg, bool ovr, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct cls_mall_head *head = rtnl_dereference(tp->root);
struct nlattr *tb[TCA_MATCHALL_MAX + 1];
@@ -170,8 +197,8 @@
if (head)
return -EEXIST;
- err = nla_parse_nested(tb, TCA_MATCHALL_MAX, tca[TCA_OPTIONS],
- mall_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_MATCHALL_MAX,
+ tca[TCA_OPTIONS], mall_policy, NULL);
if (err < 0)
return err;
@@ -185,7 +212,7 @@
if (!new)
return -ENOBUFS;
- err = tcf_exts_init(&new->exts, TCA_MATCHALL_ACT, 0);
+ err = tcf_exts_init(&new->exts, net, TCA_MATCHALL_ACT, 0);
if (err)
goto err_exts_init;
@@ -193,6 +220,11 @@
handle = 1;
new->handle = handle;
new->flags = flags;
+ new->pf = alloc_percpu(struct tc_matchall_pcnt);
+ if (!new->pf) {
+ err = -ENOMEM;
+ goto err_alloc_percpu;
+ }
err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr,
extack);
@@ -215,6 +247,8 @@
err_replace_hw_filter:
err_set_parms:
+ free_percpu(new->pf);
+err_alloc_percpu:
tcf_exts_destroy(&new->exts);
err_exts_init:
kfree(new);
@@ -222,24 +256,32 @@
}
static int mall_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
- return -EOPNOTSUPP;
+ struct cls_mall_head *head = rtnl_dereference(tp->root);
+
+ head->deleting = true;
+ *last = true;
+ return 0;
}
-static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
struct cls_mall_head *head = rtnl_dereference(tp->root);
if (arg->count < arg->skip)
goto skip;
+
+ if (!head || head->deleting)
+ return;
if (arg->fn(tp, head, arg) < 0)
arg->stop = 1;
skip:
arg->count++;
}
-static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+static int mall_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
void *cb_priv, struct netlink_ext_ack *extack)
{
struct cls_mall_head *head = rtnl_dereference(tp->root);
@@ -250,36 +292,71 @@
if (tc_skip_hw(head->flags))
return 0;
+ cls_mall.rule = flow_rule_alloc(tcf_exts_num_actions(&head->exts));
+ if (!cls_mall.rule)
+ return -ENOMEM;
+
tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack);
cls_mall.command = add ?
TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY;
- cls_mall.exts = &head->exts;
cls_mall.cookie = (unsigned long)head;
- err = cb(TC_SETUP_CLSMATCHALL, &cls_mall, cb_priv);
+ err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts, true);
if (err) {
- if (add && tc_skip_sw(head->flags))
+ kfree(cls_mall.rule);
+ if (add && tc_skip_sw(head->flags)) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to setup flow action");
return err;
+ }
return 0;
}
- tc_cls_offload_cnt_update(block, &head->in_hw_count, &head->flags, add);
+ err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSMATCHALL,
+ &cls_mall, cb_priv, &head->flags,
+ &head->in_hw_count);
+ tc_cleanup_flow_action(&cls_mall.rule->action);
+ kfree(cls_mall.rule);
+
+ if (err)
+ return err;
return 0;
}
-static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+static void mall_stats_hw_filter(struct tcf_proto *tp,
+ struct cls_mall_head *head,
+ unsigned long cookie)
{
+ struct tc_cls_matchall_offload cls_mall = {};
+ struct tcf_block *block = tp->chain->block;
+
+ tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, NULL);
+ cls_mall.command = TC_CLSMATCHALL_STATS;
+ cls_mall.cookie = cookie;
+
+ tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false, true);
+
+ tcf_exts_stats_update(&head->exts, cls_mall.stats.bytes,
+ cls_mall.stats.pkts, cls_mall.stats.lastused);
+}
+
+static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh,
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
+{
+ struct tc_matchall_pcnt gpf = {};
struct cls_mall_head *head = fh;
struct nlattr *nest;
+ int cpu;
if (!head)
return skb->len;
+ if (!tc_skip_hw(head->flags))
+ mall_stats_hw_filter(tp, head, (unsigned long)head);
+
t->tcm_handle = head->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!nest)
goto nla_put_failure;
@@ -290,6 +367,17 @@
if (head->flags && nla_put_u32(skb, TCA_MATCHALL_FLAGS, head->flags))
goto nla_put_failure;
+ for_each_possible_cpu(cpu) {
+ struct tc_matchall_pcnt *pf = per_cpu_ptr(head->pf, cpu);
+
+ gpf.rhit += pf->rhit;
+ }
+
+ if (nla_put_64bit(skb, TCA_MATCHALL_PCNT,
+ sizeof(struct tc_matchall_pcnt),
+ &gpf, TCA_MATCHALL_PAD))
+ goto nla_put_failure;
+
if (tcf_exts_dump(skb, &head->exts))
goto nla_put_failure;
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 0404aa5..2d9e0b4 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_route.c ROUTE4 classifier.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
@@ -276,7 +272,8 @@
tcf_queue_work(&f->rwork, route4_delete_filter_work);
}
-static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+static void route4_destroy(struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct route4_head *head = rtnl_dereference(tp->root);
int h1, h2;
@@ -312,7 +309,7 @@
}
static int route4_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct route4_head *head = rtnl_dereference(tp->root);
struct route4_filter *f = arg;
@@ -393,7 +390,7 @@
struct route4_bucket *b;
int err;
- err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, extack);
+ err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, true, extack);
if (err < 0)
return err;
@@ -468,7 +465,7 @@
static int route4_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base, u32 handle,
struct nlattr **tca, void **arg, bool ovr,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct route4_head *head = rtnl_dereference(tp->root);
struct route4_filter __rcu **fp;
@@ -483,7 +480,8 @@
if (opt == NULL)
return handle ? -EINVAL : 0;
- err = nla_parse_nested(tb, TCA_ROUTE4_MAX, opt, route4_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_ROUTE4_MAX, opt,
+ route4_policy, NULL);
if (err < 0)
return err;
@@ -496,7 +494,7 @@
if (!f)
goto errout;
- err = tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
+ err = tcf_exts_init(&f->exts, net, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
if (err < 0)
goto errout;
@@ -560,15 +558,13 @@
return err;
}
-static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
struct route4_head *head = rtnl_dereference(tp->root);
unsigned int h, h1;
- if (head == NULL)
- arg->stop = 1;
-
- if (arg->stop)
+ if (head == NULL || arg->stop)
return;
for (h = 0; h <= 256; h++) {
@@ -597,7 +593,7 @@
}
static int route4_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
{
struct route4_filter *f = fh;
struct nlattr *nest;
@@ -608,7 +604,7 @@
t->tcm_handle = f->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c
index cbb5e0d..de1c1d4 100644
--- a/net/sched/cls_rsvp.c
+++ b/net/sched/cls_rsvp.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index e9ccf7d..2f3c03b 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -1,11 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
@@ -312,7 +308,8 @@
__rsvp_delete_filter(f);
}
-static void rsvp_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+static void rsvp_destroy(struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct rsvp_head *data = rtnl_dereference(tp->root);
int h1, h2;
@@ -341,7 +338,7 @@
}
static int rsvp_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct rsvp_head *head = rtnl_dereference(tp->root);
struct rsvp_filter *nfp, *f = arg;
@@ -477,7 +474,8 @@
struct tcf_proto *tp, unsigned long base,
u32 handle,
struct nlattr **tca,
- void **arg, bool ovr, struct netlink_ext_ack *extack)
+ void **arg, bool ovr, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct rsvp_head *data = rtnl_dereference(tp->root);
struct rsvp_filter *f, *nfp;
@@ -495,14 +493,16 @@
if (opt == NULL)
return handle ? -EINVAL : 0;
- err = nla_parse_nested(tb, TCA_RSVP_MAX, opt, rsvp_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_RSVP_MAX, opt, rsvp_policy,
+ NULL);
if (err < 0)
return err;
- err = tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+ err = tcf_exts_init(&e, net, TCA_RSVP_ACT, TCA_RSVP_POLICE);
if (err < 0)
return err;
- err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr, extack);
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr, true,
+ extack);
if (err < 0)
goto errout2;
@@ -520,7 +520,8 @@
goto errout2;
}
- err = tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+ err = tcf_exts_init(&n->exts, net, TCA_RSVP_ACT,
+ TCA_RSVP_POLICE);
if (err < 0) {
kfree(n);
goto errout2;
@@ -548,7 +549,7 @@
if (f == NULL)
goto errout2;
- err = tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+ err = tcf_exts_init(&f->exts, net, TCA_RSVP_ACT, TCA_RSVP_POLICE);
if (err < 0)
goto errout;
h2 = 16;
@@ -654,7 +655,8 @@
return err;
}
-static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
struct rsvp_head *head = rtnl_dereference(tp->root);
unsigned int h, h1;
@@ -688,7 +690,7 @@
}
static int rsvp_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
{
struct rsvp_filter *f = fh;
struct rsvp_session *s;
@@ -701,7 +703,7 @@
t->tcm_handle = f->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c
index dd08aea..6407884 100644
--- a/net/sched/cls_rsvp6.c
+++ b/net/sched/cls_rsvp6.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 9ccc93f..e573e5a 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/cls_tcindex.c Packet classifier for skb->tc_index
*
@@ -48,7 +49,7 @@
u32 hash; /* hash table size; 0 if undefined */
u32 alloc_hash; /* allocated size */
u32 fall_through; /* 0: only classify if explicit match */
- struct rcu_head rcu;
+ struct rcu_work rwork;
};
static inline int tcindex_filter_is_set(struct tcindex_filter_result *r)
@@ -173,7 +174,7 @@
}
static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct tcindex_data *p = rtnl_dereference(tp->root);
struct tcindex_filter_result *r = arg;
@@ -221,17 +222,11 @@
return 0;
}
-static int tcindex_destroy_element(struct tcf_proto *tp,
- void *arg, struct tcf_walker *walker)
+static void tcindex_destroy_work(struct work_struct *work)
{
- bool last;
-
- return tcindex_delete(tp, arg, &last, NULL);
-}
-
-static void __tcindex_destroy(struct rcu_head *head)
-{
- struct tcindex_data *p = container_of(head, struct tcindex_data, rcu);
+ struct tcindex_data *p = container_of(to_rcu_work(work),
+ struct tcindex_data,
+ rwork);
kfree(p->perfect);
kfree(p->h);
@@ -252,15 +247,19 @@
[TCA_TCINDEX_CLASSID] = { .type = NLA_U32 },
};
-static int tcindex_filter_result_init(struct tcindex_filter_result *r)
+static int tcindex_filter_result_init(struct tcindex_filter_result *r,
+ struct net *net)
{
memset(r, 0, sizeof(*r));
- return tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
+ return tcf_exts_init(&r->exts, net, TCA_TCINDEX_ACT,
+ TCA_TCINDEX_POLICE);
}
-static void __tcindex_partial_destroy(struct rcu_head *head)
+static void tcindex_partial_destroy_work(struct work_struct *work)
{
- struct tcindex_data *p = container_of(head, struct tcindex_data, rcu);
+ struct tcindex_data *p = container_of(to_rcu_work(work),
+ struct tcindex_data,
+ rwork);
kfree(p->perfect);
kfree(p);
@@ -275,7 +274,7 @@
kfree(cp->perfect);
}
-static int tcindex_alloc_perfect_hash(struct tcindex_data *cp)
+static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp)
{
int i, err = 0;
@@ -285,7 +284,7 @@
return -ENOMEM;
for (i = 0; i < cp->hash; i++) {
- err = tcf_exts_init(&cp->perfect[i].exts,
+ err = tcf_exts_init(&cp->perfect[i].exts, net,
TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
if (err < 0)
goto errout;
@@ -305,16 +304,16 @@
struct nlattr *est, bool ovr, struct netlink_ext_ack *extack)
{
struct tcindex_filter_result new_filter_result, *old_r = r;
- struct tcindex_filter_result cr;
struct tcindex_data *cp = NULL, *oldp;
struct tcindex_filter *f = NULL; /* make gcc behave */
+ struct tcf_result cr = {};
int err, balloc = 0;
struct tcf_exts e;
- err = tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
+ err = tcf_exts_init(&e, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
if (err < 0)
return err;
- err = tcf_exts_validate(net, tp, tb, est, &e, ovr, extack);
+ err = tcf_exts_validate(net, tp, tb, est, &e, ovr, true, extack);
if (err < 0)
goto errout;
@@ -337,7 +336,7 @@
if (p->perfect) {
int i;
- if (tcindex_alloc_perfect_hash(cp) < 0)
+ if (tcindex_alloc_perfect_hash(net, cp) < 0)
goto errout;
for (i = 0; i < cp->hash; i++)
cp->perfect[i].res = p->perfect[i].res;
@@ -345,14 +344,11 @@
}
cp->h = p->h;
- err = tcindex_filter_result_init(&new_filter_result);
- if (err < 0)
- goto errout1;
- err = tcindex_filter_result_init(&cr);
+ err = tcindex_filter_result_init(&new_filter_result, net);
if (err < 0)
goto errout1;
if (old_r)
- cr.res = r->res;
+ cr = r->res;
if (tb[TCA_TCINDEX_HASH])
cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
@@ -406,7 +402,7 @@
err = -ENOMEM;
if (!cp->perfect && !cp->h) {
if (valid_perfect_hash(cp)) {
- if (tcindex_alloc_perfect_hash(cp) < 0)
+ if (tcindex_alloc_perfect_hash(net, cp) < 0)
goto errout_alloc;
balloc = 1;
} else {
@@ -435,7 +431,7 @@
goto errout_alloc;
f->key = handle;
f->next = NULL;
- err = tcindex_filter_result_init(&f->result);
+ err = tcindex_filter_result_init(&f->result, net);
if (err < 0) {
kfree(f);
goto errout_alloc;
@@ -443,12 +439,12 @@
}
if (tb[TCA_TCINDEX_CLASSID]) {
- cr.res.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]);
- tcf_bind_filter(tp, &cr.res, base);
+ cr.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]);
+ tcf_bind_filter(tp, &cr, base);
}
if (old_r && old_r != r) {
- err = tcindex_filter_result_init(old_r);
+ err = tcindex_filter_result_init(old_r, net);
if (err < 0) {
kfree(f);
goto errout_alloc;
@@ -456,7 +452,7 @@
}
oldp = p;
- r->res = cr.res;
+ r->res = cr;
tcf_exts_change(&r->exts, &e);
rcu_assign_pointer(tp->root, cp);
@@ -475,10 +471,12 @@
; /* nothing */
rcu_assign_pointer(*fp, f);
+ } else {
+ tcf_exts_destroy(&new_filter_result.exts);
}
if (oldp)
- call_rcu(&oldp->rcu, __tcindex_partial_destroy);
+ tcf_queue_work(&oldp->rwork, tcindex_partial_destroy_work);
return 0;
errout_alloc:
@@ -487,7 +485,6 @@
else if (balloc == 2)
kfree(cp->h);
errout1:
- tcf_exts_destroy(&cr.exts);
tcf_exts_destroy(&new_filter_result.exts);
errout:
kfree(cp);
@@ -499,7 +496,7 @@
tcindex_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base, u32 handle,
struct nlattr **tca, void **arg, bool ovr,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_TCINDEX_MAX + 1];
@@ -514,7 +511,8 @@
if (!opt)
return 0;
- err = nla_parse_nested(tb, TCA_TCINDEX_MAX, opt, tcindex_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_TCINDEX_MAX, opt,
+ tcindex_policy, NULL);
if (err < 0)
return err;
@@ -522,7 +520,8 @@
tca[TCA_RATE], ovr, extack);
}
-static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
+static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker,
+ bool rtnl_held)
{
struct tcindex_data *p = rtnl_dereference(tp->root);
struct tcindex_filter *f, *next;
@@ -558,24 +557,43 @@
}
}
-static void tcindex_destroy(struct tcf_proto *tp,
+static void tcindex_destroy(struct tcf_proto *tp, bool rtnl_held,
struct netlink_ext_ack *extack)
{
struct tcindex_data *p = rtnl_dereference(tp->root);
- struct tcf_walker walker;
+ int i;
pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p);
- walker.count = 0;
- walker.skip = 0;
- walker.fn = tcindex_destroy_element;
- tcindex_walk(tp, &walker);
- call_rcu(&p->rcu, __tcindex_destroy);
+ if (p->perfect) {
+ for (i = 0; i < p->hash; i++) {
+ struct tcindex_filter_result *r = p->perfect + i;
+
+ tcf_unbind_filter(tp, &r->res);
+ if (tcf_exts_get_net(&r->exts))
+ tcf_queue_work(&r->rwork,
+ tcindex_destroy_rexts_work);
+ else
+ __tcindex_destroy_rexts(r);
+ }
+ }
+
+ for (i = 0; p->h && i < p->hash; i++) {
+ struct tcindex_filter *f, *next;
+ bool last;
+
+ for (f = rtnl_dereference(p->h[i]); f; f = next) {
+ next = rtnl_dereference(f->next);
+ tcindex_delete(tp, &f->result, &last, rtnl_held, NULL);
+ }
+ }
+
+ tcf_queue_work(&p->rwork, tcindex_destroy_work);
}
static int tcindex_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
{
struct tcindex_data *p = rtnl_dereference(tp->root);
struct tcindex_filter_result *r = fh;
@@ -585,7 +603,7 @@
tp, fh, skb, t, p, r);
pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h);
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index b2c3406..a0e6fac 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/cls_u32.c Ugly (or Universal) 32bit key Packet Classifier.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* The filters are packed to hash tables of key nodes
@@ -24,9 +20,6 @@
* pure RSVP doesn't need such a general approach and can use
* much simpler (and faster) schemes, sort of cls_rsvp.c.
*
- * JHS: We should remove the CONFIG_NET_CLS_IND from here
- * eventually when the meta match extension is made available
- *
* nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro>
*/
@@ -52,9 +45,7 @@
u32 handle;
struct tc_u_hnode __rcu *ht_up;
struct tcf_exts exts;
-#ifdef CONFIG_NET_CLS_IND
int ifindex;
-#endif
u8 fshift;
struct tcf_result res;
struct tc_u_hnode __rcu *ht_down;
@@ -68,7 +59,6 @@
u32 mask;
u32 __percpu *pcpu_success;
#endif
- struct tcf_proto *tp;
struct rcu_work rwork;
/* The 'sel' field MUST be the last field in structure to allow for
* tc_u32_keys allocated at end of structure.
@@ -80,10 +70,10 @@
struct tc_u_hnode __rcu *next;
u32 handle;
u32 prio;
- struct tc_u_common *tp_c;
int refcnt;
unsigned int divisor;
struct idr handle_idr;
+ bool is_root;
struct rcu_head rcu;
u32 flags;
/* The 'ht' field MUST be the last field in structure to allow for
@@ -98,7 +88,7 @@
int refcnt;
struct idr handle_idr;
struct hlist_node hnode;
- struct rcu_head rcu;
+ long knodes;
};
static inline unsigned int u32_hash_fold(__be32 key,
@@ -181,12 +171,10 @@
if (n->sel.flags & TC_U32_TERMINAL) {
*res = n->res;
-#ifdef CONFIG_NET_CLS_IND
if (!tcf_match_indev(skb, n->ifindex)) {
n = rcu_dereference_bh(n->next);
goto next_knode;
}
-#endif
#ifdef CONFIG_CLS_U32_PERF
__this_cpu_inc(n->pf->rhit);
#endif
@@ -344,19 +332,16 @@
return block->q;
}
-static unsigned int tc_u_hash(const struct tcf_proto *tp)
+static struct hlist_head *tc_u_hash(void *key)
{
- return hash_ptr(tc_u_common_ptr(tp), U32_HASH_SHIFT);
+ return tc_u_common_hash + hash_ptr(key, U32_HASH_SHIFT);
}
-static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp)
+static struct tc_u_common *tc_u_common_find(void *key)
{
struct tc_u_common *tc;
- unsigned int h;
-
- h = tc_u_hash(tp);
- hlist_for_each_entry(tc, &tc_u_common_hash[h], hnode) {
- if (tc->ptr == tc_u_common_ptr(tp))
+ hlist_for_each_entry(tc, tc_u_hash(key), hnode) {
+ if (tc->ptr == key)
return tc;
}
return NULL;
@@ -365,10 +350,8 @@
static int u32_init(struct tcf_proto *tp)
{
struct tc_u_hnode *root_ht;
- struct tc_u_common *tp_c;
- unsigned int h;
-
- tp_c = tc_u_common_find(tp);
+ void *key = tc_u_common_ptr(tp);
+ struct tc_u_common *tp_c = tc_u_common_find(key);
root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL);
if (root_ht == NULL)
@@ -377,6 +360,7 @@
root_ht->refcnt++;
root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000;
root_ht->prio = tp->prio;
+ root_ht->is_root = true;
idr_init(&root_ht->handle_idr);
if (tp_c == NULL) {
@@ -385,18 +369,16 @@
kfree(root_ht);
return -ENOBUFS;
}
- tp_c->ptr = tc_u_common_ptr(tp);
+ tp_c->ptr = key;
INIT_HLIST_NODE(&tp_c->hnode);
idr_init(&tp_c->handle_idr);
- h = tc_u_hash(tp);
- hlist_add_head(&tp_c->hnode, &tc_u_common_hash[h]);
+ hlist_add_head(&tp_c->hnode, tc_u_hash(key));
}
tp_c->refcnt++;
RCU_INIT_POINTER(root_ht->next, tp_c->hlist);
rcu_assign_pointer(tp_c->hlist, root_ht);
- root_ht->tp_c = tp_c;
root_ht->refcnt++;
rcu_assign_pointer(tp->root, root_ht);
@@ -404,8 +386,7 @@
return 0;
}
-static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n,
- bool free_pf)
+static int u32_destroy_key(struct tc_u_knode *n, bool free_pf)
{
struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
@@ -439,7 +420,7 @@
struct tc_u_knode,
rwork);
rtnl_lock();
- u32_destroy_key(key->tp, key, false);
+ u32_destroy_key(key, false);
rtnl_unlock();
}
@@ -456,12 +437,13 @@
struct tc_u_knode,
rwork);
rtnl_lock();
- u32_destroy_key(key->tp, key, true);
+ u32_destroy_key(key, true);
rtnl_unlock();
}
static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
{
+ struct tc_u_common *tp_c = tp->data;
struct tc_u_knode __rcu **kp;
struct tc_u_knode *pkp;
struct tc_u_hnode *ht = rtnl_dereference(key->ht_up);
@@ -472,6 +454,7 @@
kp = &pkp->next, pkp = rtnl_dereference(*kp)) {
if (pkp == key) {
RCU_INIT_POINTER(*kp, key->next);
+ tp_c->knodes--;
tcf_unbind_filter(tp, &key->res);
idr_remove(&ht->handle_idr, key->handle);
@@ -497,7 +480,7 @@
cls_u32.hnode.handle = h->handle;
cls_u32.hnode.prio = h->prio;
- tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false, true);
}
static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
@@ -515,7 +498,7 @@
cls_u32.hnode.handle = h->handle;
cls_u32.hnode.prio = h->prio;
- err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw);
+ err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw, true);
if (err < 0) {
u32_clear_hw_hnode(tp, h, NULL);
return err;
@@ -539,8 +522,8 @@
cls_u32.command = TC_CLSU32_DELETE_KNODE;
cls_u32.knode.handle = n->handle;
- tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false);
- tcf_block_offload_dec(block, &n->flags);
+ tc_setup_cb_destroy(block, tp, TC_SETUP_CLSU32, &cls_u32, false,
+ &n->flags, &n->in_hw_count, true);
}
static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
@@ -564,17 +547,16 @@
cls_u32.knode.mask = 0;
#endif
cls_u32.knode.sel = &n->sel;
+ cls_u32.knode.res = &n->res;
cls_u32.knode.exts = &n->exts;
if (n->ht_down)
cls_u32.knode.link_handle = ht->handle;
- err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw);
- if (err < 0) {
+ err = tc_setup_cb_add(block, tp, TC_SETUP_CLSU32, &cls_u32, skip_sw,
+ &n->flags, &n->in_hw_count, true);
+ if (err) {
u32_remove_hw_knode(tp, n, NULL);
return err;
- } else if (err > 0) {
- n->in_hw_count = err;
- tcf_block_offload_inc(block, &n->flags);
}
if (skip_sw && !(n->flags & TCA_CLS_FLAGS_IN_HW))
@@ -586,6 +568,7 @@
static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
struct netlink_ext_ack *extack)
{
+ struct tc_u_common *tp_c = tp->data;
struct tc_u_knode *n;
unsigned int h;
@@ -593,13 +576,14 @@
while ((n = rtnl_dereference(ht->ht[h])) != NULL) {
RCU_INIT_POINTER(ht->ht[h],
rtnl_dereference(n->next));
+ tp_c->knodes--;
tcf_unbind_filter(tp, &n->res);
u32_remove_hw_knode(tp, n, extack);
idr_remove(&ht->handle_idr, n->handle);
if (tcf_exts_get_net(&n->exts))
tcf_queue_work(&n->rwork, u32_delete_key_freepf_work);
else
- u32_destroy_key(n->tp, n, true);
+ u32_destroy_key(n, true);
}
}
}
@@ -632,18 +616,8 @@
return -ENOENT;
}
-static bool ht_empty(struct tc_u_hnode *ht)
-{
- unsigned int h;
-
- for (h = 0; h <= ht->divisor; h++)
- if (rcu_access_pointer(ht->ht[h]))
- return false;
-
- return true;
-}
-
-static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
+static void u32_destroy(struct tcf_proto *tp, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_u_common *tp_c = tp->data;
struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
@@ -677,23 +651,19 @@
}
static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct tc_u_hnode *ht = arg;
- struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
struct tc_u_common *tp_c = tp->data;
int ret = 0;
- if (ht == NULL)
- goto out;
-
if (TC_U32_KEY(ht->handle)) {
u32_remove_hw_knode(tp, (struct tc_u_knode *)ht, extack);
ret = u32_delete_key(tp, (struct tc_u_knode *)ht);
goto out;
}
- if (root_ht == ht) {
+ if (ht->is_root) {
NL_SET_ERR_MSG_MOD(extack, "Not allowed to delete root node");
return -EINVAL;
}
@@ -706,38 +676,7 @@
}
out:
- *last = true;
- if (root_ht) {
- if (root_ht->refcnt > 2) {
- *last = false;
- goto ret;
- }
- if (root_ht->refcnt == 2) {
- if (!ht_empty(root_ht)) {
- *last = false;
- goto ret;
- }
- }
- }
-
- if (tp_c->refcnt > 1) {
- *last = false;
- goto ret;
- }
-
- if (tp_c->refcnt == 1) {
- struct tc_u_hnode *ht;
-
- for (ht = rtnl_dereference(tp_c->hlist);
- ht;
- ht = rtnl_dereference(ht->next))
- if (!ht_empty(ht)) {
- *last = false;
- break;
- }
- }
-
-ret:
+ *last = tp_c->refcnt == 1 && tp_c->knodes == 0;
return ret;
}
@@ -768,14 +707,14 @@
};
static int u32_set_parms(struct net *net, struct tcf_proto *tp,
- unsigned long base, struct tc_u_hnode *ht,
+ unsigned long base,
struct tc_u_knode *n, struct nlattr **tb,
struct nlattr *est, bool ovr,
struct netlink_ext_ack *extack)
{
int err;
- err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr, extack);
+ err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr, true, extack);
if (err < 0)
return err;
@@ -789,12 +728,16 @@
}
if (handle) {
- ht_down = u32_lookup_ht(ht->tp_c, handle);
+ ht_down = u32_lookup_ht(tp->data, handle);
if (!ht_down) {
NL_SET_ERR_MSG_MOD(extack, "Link hash table not found");
return -EINVAL;
}
+ if (ht_down->is_root) {
+ NL_SET_ERR_MSG_MOD(extack, "Not linking to root node");
+ return -EINVAL;
+ }
ht_down->refcnt++;
}
@@ -809,7 +752,6 @@
tcf_bind_filter(tp, &n->res, base);
}
-#ifdef CONFIG_NET_CLS_IND
if (tb[TCA_U32_INDEV]) {
int ret;
ret = tcf_change_indev(net, tb[TCA_U32_INDEV], extack);
@@ -817,7 +759,6 @@
return -EINVAL;
n->ifindex = ret;
}
-#endif
return 0;
}
@@ -848,7 +789,7 @@
rcu_assign_pointer(*ins, n);
}
-static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
+static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp,
struct tc_u_knode *n)
{
struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
@@ -865,9 +806,7 @@
new->handle = n->handle;
RCU_INIT_POINTER(new->ht_up, n->ht_up);
-#ifdef CONFIG_NET_CLS_IND
new->ifindex = n->ifindex;
-#endif
new->fshift = n->fshift;
new->res = n->res;
new->flags = n->flags;
@@ -891,10 +830,9 @@
/* Similarly success statistics must be moved as pointers */
new->pcpu_success = n->pcpu_success;
#endif
- new->tp = tp;
- memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
+ memcpy(&new->sel, s, struct_size(s, keys, s->nkeys));
- if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) {
+ if (tcf_exts_init(&new->exts, net, TCA_U32_ACT, TCA_U32_POLICE)) {
kfree(new);
return NULL;
}
@@ -904,7 +842,7 @@
static int u32_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base, u32 handle,
- struct nlattr **tca, void **arg, bool ovr,
+ struct nlattr **tca, void **arg, bool ovr, bool rtnl_held,
struct netlink_ext_ack *extack)
{
struct tc_u_common *tp_c = tp->data;
@@ -929,7 +867,8 @@
}
}
- err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_U32_MAX, opt, u32_policy,
+ extack);
if (err < 0)
return err;
@@ -956,22 +895,21 @@
return -EINVAL;
}
- new = u32_init_knode(tp, n);
+ new = u32_init_knode(net, tp, n);
if (!new)
return -ENOMEM;
- err = u32_set_parms(net, tp, base,
- rtnl_dereference(n->ht_up), new, tb,
+ err = u32_set_parms(net, tp, base, new, tb,
tca[TCA_RATE], ovr, extack);
if (err) {
- u32_destroy_key(tp, new, false);
+ u32_destroy_key(new, false);
return err;
}
err = u32_replace_hw_knode(tp, new, flags, extack);
if (err) {
- u32_destroy_key(tp, new, false);
+ u32_destroy_key(new, false);
return err;
}
@@ -988,7 +926,11 @@
if (tb[TCA_U32_DIVISOR]) {
unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);
- if (--divisor > 0x100) {
+ if (!is_power_of_2(divisor)) {
+ NL_SET_ERR_MSG_MOD(extack, "Divisor is not a power of 2");
+ return -EINVAL;
+ }
+ if (divisor-- > 0x100) {
NL_SET_ERR_MSG_MOD(extack, "Exceeded maximum 256 hash buckets");
return -EINVAL;
}
@@ -1013,7 +955,6 @@
return err;
}
}
- ht->tp_c = tp_c;
ht->refcnt = 1;
ht->divisor = divisor;
ht->handle = handle;
@@ -1103,9 +1044,8 @@
n->handle = handle;
n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
n->flags = flags;
- n->tp = tp;
- err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
+ err = tcf_exts_init(&n->exts, net, TCA_U32_ACT, TCA_U32_POLICE);
if (err < 0)
goto errout;
@@ -1125,7 +1065,7 @@
}
#endif
- err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr,
+ err = u32_set_parms(net, tp, base, n, tb, tca[TCA_RATE], ovr,
extack);
if (err == 0) {
struct tc_u_knode __rcu **ins;
@@ -1146,6 +1086,7 @@
RCU_INIT_POINTER(n->next, pins);
rcu_assign_pointer(*ins, n);
+ tp_c->knodes++;
*arg = n;
return 0;
}
@@ -1167,7 +1108,8 @@
return err;
}
-static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg,
+ bool rtnl_held)
{
struct tc_u_common *tp_c = tp->data;
struct tc_u_hnode *ht;
@@ -1208,7 +1150,7 @@
}
static int u32_reoffload_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
- bool add, tc_setup_cb_t *cb, void *cb_priv,
+ bool add, flow_setup_cb_t *cb, void *cb_priv,
struct netlink_ext_ack *extack)
{
struct tc_cls_u32_offload cls_u32 = {};
@@ -1228,7 +1170,7 @@
}
static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n,
- bool add, tc_setup_cb_t *cb, void *cb_priv,
+ bool add, flow_setup_cb_t *cb, void *cb_priv,
struct netlink_ext_ack *extack)
{
struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
@@ -1251,24 +1193,22 @@
cls_u32.knode.mask = 0;
#endif
cls_u32.knode.sel = &n->sel;
+ cls_u32.knode.res = &n->res;
cls_u32.knode.exts = &n->exts;
if (n->ht_down)
cls_u32.knode.link_handle = ht->handle;
}
- err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv);
- if (err) {
- if (add && tc_skip_sw(n->flags))
- return err;
- return 0;
- }
-
- tc_cls_offload_cnt_update(block, &n->in_hw_count, &n->flags, add);
+ err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSU32,
+ &cls_u32, cb_priv, &n->flags,
+ &n->in_hw_count);
+ if (err)
+ return err;
return 0;
}
-static int u32_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+static int u32_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
void *cb_priv, struct netlink_ext_ack *extack)
{
struct tc_u_common *tp_c = tp->data;
@@ -1324,7 +1264,7 @@
}
static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
{
struct tc_u_knode *n = fh;
struct tc_u_hnode *ht_up, *ht_down;
@@ -1335,7 +1275,7 @@
t->tcm_handle = n->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -1395,14 +1335,12 @@
if (tcf_exts_dump(skb, &n->exts) < 0)
goto nla_put_failure;
-#ifdef CONFIG_NET_CLS_IND
if (n->ifindex) {
struct net_device *dev;
dev = __dev_get_by_index(net, n->ifindex);
if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name))
goto nla_put_failure;
}
-#endif
#ifdef CONFIG_CLS_U32_PERF
gpf = kzalloc(sizeof(struct tc_u32_pcnt) +
n->sel.nkeys * sizeof(u64),
@@ -1466,9 +1404,7 @@
#ifdef CONFIG_CLS_U32_PERF
pr_info(" Performance counters on\n");
#endif
-#ifdef CONFIG_NET_CLS_IND
pr_info(" input device check on\n");
-#endif
#ifdef CONFIG_NET_CLS_ACT
pr_info(" Actions configured\n");
#endif
diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c
index ddd883c..b9a94fd 100644
--- a/net/sched/em_canid.c
+++ b/net/sched/em_canid.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* em_canid.c Ematch rule to match CAN frames according to their CAN IDs
*
- * This program is free software; you can distribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Idea: Oliver Hartkopp <oliver.hartkopp@volkswagen.de>
* Copyright: (c) 2011 Czech Technical University in Prague
* (c) 2011 Volkswagen Group Research
diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c
index 1c8360a..a4d09b1 100644
--- a/net/sched/em_cmp.c
+++ b/net/sched/em_cmp.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/em_cmp.c Simple packet data comparison ematch
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*/
diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c
index c1b23e3..df00566 100644
--- a/net/sched/em_ipset.c
+++ b/net/sched/em_ipset.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/em_ipset.c ipset ematch
*
* Copyright (c) 2012 Florian Westphal <fw@strlen.de>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
#include <linux/gfp.h>
diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c
index a5f34e9..9fff648 100644
--- a/net/sched/em_ipt.c
+++ b/net/sched/em_ipt.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/em_ipt.c IPtables matches Ematch
*
* (c) 2018 Eyal Birger <eyal.birger@gmail.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/gfp.h>
@@ -25,6 +21,7 @@
struct em_ipt_match {
const struct xt_match *match;
u32 hook;
+ u8 nfproto;
u8 match_data[0] __aligned(8);
};
@@ -75,11 +72,25 @@
return 0;
}
+/*
+ * Validate the attributes of an "addrtype" match. Only the match revision
+ * is checked: revision 1 is accepted, any other revision is logged and
+ * rejected with -EINVAL.
+ */
+static int addrtype_validate_match_data(struct nlattr **tb, u8 mrev)
+{
+ if (mrev != 1) {
+ pr_err("only addrtype match revision 1 supported\n");
+ return -EINVAL;
+ }
+ /* tb is not examined: checking the revision is the whole validation. */
+ return 0;
+}
+}
+
static const struct em_ipt_xt_match em_ipt_xt_matches[] = {
{
.match_name = "policy",
.validate_match_data = policy_validate_match_data
},
+ {
+ .match_name = "addrtype",
+ .validate_match_data = addrtype_validate_match_data
+ },
{}
};
@@ -119,9 +130,10 @@
struct em_ipt_match *im = NULL;
struct xt_match *match;
int mdata_len, ret;
+ u8 nfproto;
- ret = nla_parse(tb, TCA_EM_IPT_MAX, data, data_len, em_ipt_policy,
- NULL);
+ ret = nla_parse_deprecated(tb, TCA_EM_IPT_MAX, data, data_len,
+ em_ipt_policy, NULL);
if (ret < 0)
return ret;
@@ -129,6 +141,15 @@
!tb[TCA_EM_IPT_MATCH_DATA] || !tb[TCA_EM_IPT_NFPROTO])
return -EINVAL;
+ nfproto = nla_get_u8(tb[TCA_EM_IPT_NFPROTO]);
+ switch (nfproto) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ break;
+ default:
+ return -EINVAL;
+ }
+
match = get_xt_match(tb);
if (IS_ERR(match)) {
pr_err("unable to load match\n");
@@ -144,6 +165,7 @@
im->match = match;
im->hook = nla_get_u32(tb[TCA_EM_IPT_HOOK]);
+ im->nfproto = nfproto;
nla_memcpy(im->match_data, tb[TCA_EM_IPT_MATCH_DATA], mdata_len);
ret = check_match(net, im, mdata_len);
@@ -186,15 +208,33 @@
const struct em_ipt_match *im = (const void *)em->data;
struct xt_action_param acpar = {};
struct net_device *indev = NULL;
+ u8 nfproto = im->match->family;
struct nf_hook_state state;
int ret;
+ switch (tc_skb_protocol(skb)) {
+ case htons(ETH_P_IP):
+ if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
+ return 0;
+ if (nfproto == NFPROTO_UNSPEC)
+ nfproto = NFPROTO_IPV4;
+ break;
+ case htons(ETH_P_IPV6):
+ if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
+ return 0;
+ if (nfproto == NFPROTO_UNSPEC)
+ nfproto = NFPROTO_IPV6;
+ break;
+ default:
+ return 0;
+ }
+
rcu_read_lock();
if (skb->skb_iif)
indev = dev_get_by_index_rcu(em->net, skb->skb_iif);
- nf_hook_state_init(&state, im->hook, im->match->family,
+ nf_hook_state_init(&state, im->hook, nfproto,
indev ?: skb->dev, skb->dev, NULL, em->net, NULL);
acpar.match = im->match;
@@ -217,7 +257,7 @@
return -EMSGSIZE;
if (nla_put_u8(skb, TCA_EM_IPT_MATCH_REVISION, im->match->revision) < 0)
return -EMSGSIZE;
- if (nla_put_u8(skb, TCA_EM_IPT_NFPROTO, im->match->family) < 0)
+ if (nla_put_u8(skb, TCA_EM_IPT_NFPROTO, im->nfproto) < 0)
return -EMSGSIZE;
if (nla_put(skb, TCA_EM_IPT_MATCH_DATA,
im->match->usersize ?: im->match->matchsize,
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index d6e9711..3177dcb 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/em_meta.c Metadata ematch
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*
* ==========================================================================
@@ -450,7 +446,7 @@
*err = -1;
return;
}
- dst->value = sk->sk_wmem_queued;
+ dst->value = READ_ONCE(sk->sk_wmem_queued);
}
META_COLLECTOR(int_sk_fwd_alloc)
@@ -558,7 +554,7 @@
*err = -1;
return;
}
- dst->value = sk->sk_rcvlowat;
+ dst->value = READ_ONCE(sk->sk_rcvlowat);
}
META_COLLECTOR(int_sk_rcvtimeo)
@@ -912,7 +908,8 @@
struct tcf_meta_hdr *hdr;
struct meta_match *meta = NULL;
- err = nla_parse(tb, TCA_EM_META_MAX, data, len, meta_policy, NULL);
+ err = nla_parse_deprecated(tb, TCA_EM_META_MAX, data, len,
+ meta_policy, NULL);
if (err < 0)
goto errout;
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
index 07c10ba..88c7ce4 100644
--- a/net/sched/em_nbyte.c
+++ b/net/sched/em_nbyte.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/em_nbyte.c N-Byte ematch
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*/
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
index 73e2ed5..6f3c1fb 100644
--- a/net/sched/em_text.c
+++ b/net/sched/em_text.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/em_text.c Textsearch ematch
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*/
diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c
index 797bdb8..71b070d 100644
--- a/net/sched/em_u32.c
+++ b/net/sched/em_u32.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/em_u32.c U32 Ematch
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
* Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
index 1331a4c..8f2ad70 100644
--- a/net/sched/ematch.c
+++ b/net/sched/ematch.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/ematch.c Extended Match API
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*
* ==========================================================================
@@ -314,7 +310,8 @@
if (!nla)
return 0;
- err = nla_parse_nested(tb, TCA_EMATCH_TREE_MAX, nla, em_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_EMATCH_TREE_MAX, nla,
+ em_policy, NULL);
if (err < 0)
goto errout;
@@ -440,14 +437,14 @@
struct nlattr *top_start;
struct nlattr *list_start;
- top_start = nla_nest_start(skb, tlv);
+ top_start = nla_nest_start_noflag(skb, tlv);
if (top_start == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr))
goto nla_put_failure;
- list_start = nla_nest_start(skb, TCA_EMATCH_TREE_LIST);
+ list_start = nla_nest_start_noflag(skb, TCA_EMATCH_TREE_LIST);
if (list_start == NULL)
goto nla_put_failure;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index be7cd14..1047825 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_api.c Packet scheduler API.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Fixes:
@@ -27,7 +23,6 @@
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
-#include <linux/lockdep.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
@@ -315,10 +310,27 @@
return q;
}
+struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
+{
+ struct netdev_queue *nq;
+ struct Qdisc *q;
+ /*
+ * Find the qdisc with the given handle on dev. A zero handle is
+ * answered with NULL without searching.
+ */
+ if (!handle)
+ return NULL;
+ q = qdisc_match_from_root(dev->qdisc, handle);
+ if (q)
+ goto out;
+ /* No match under dev->qdisc: try the ingress queue's qdisc_sleeping. */
+ nq = dev_ingress_queue_rcu(dev);
+ if (nq)
+ q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
+out:
+ return q;
+}
+
static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
unsigned long cl;
- struct Qdisc *leaf;
const struct Qdisc_class_ops *cops = p->ops->cl_ops;
if (cops == NULL)
@@ -327,8 +339,7 @@
if (cl == 0)
return NULL;
- leaf = cops->leaf(p, cl);
- return leaf;
+ return cops->leaf(p, cl);
}
/* Find queueing discipline by name */
@@ -464,7 +475,8 @@
u16 *tab = NULL;
int err;
- err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
+ extack);
if (err < 0)
return ERR_PTR(err);
if (!tb[TCA_STAB_BASE]) {
@@ -511,11 +523,6 @@
return stab;
}
-static void stab_kfree_rcu(struct rcu_head *head)
-{
- kfree(container_of(head, struct qdisc_size_table, rcu));
-}
-
void qdisc_put_stab(struct qdisc_size_table *tab)
{
if (!tab)
@@ -523,7 +530,7 @@
if (--tab->refcnt == 0) {
list_del(&tab->list);
- call_rcu_bh(&tab->rcu, stab_kfree_rcu);
+ kfree_rcu(tab, rcu);
}
}
EXPORT_SYMBOL(qdisc_put_stab);
@@ -532,7 +539,7 @@
{
struct nlattr *nest;
- nest = nla_nest_start(skb, TCA_STAB);
+ nest = nla_nest_start_noflag(skb, TCA_STAB);
if (nest == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
@@ -743,8 +750,7 @@
return 0;
}
-void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
- unsigned int len)
+void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
const struct Qdisc_class_ops *cops;
@@ -793,6 +799,71 @@
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
+int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
+ void *type_data)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ int err;
+ /*
+ * Recompute TCQ_F_OFFLOADED on sch: the flag is cleared first and set
+ * again only if the device's ndo_setup_tc() call below returns 0.
+ * Returns 0 when offload is unavailable or the callback answers
+ * -EOPNOTSUPP, otherwise the callback's return value.
+ */
+ sch->flags &= ~TCQ_F_OFFLOADED;
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return 0;
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
+ if (err == -EOPNOTSUPP)
+ return 0;
+
+ if (!err)
+ sch->flags |= TCQ_F_OFFLOADED;
+
+ return err;
+}
+EXPORT_SYMBOL(qdisc_offload_dump_helper);
+
+void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
+ struct Qdisc *new, struct Qdisc *old,
+ enum tc_setup_type type, void *type_data,
+ struct netlink_ext_ack *extack)
+{
+ bool any_qdisc_is_offloaded;
+ int err;
+ /*
+ * Hand a graft request to the device through ndo_setup_tc(). A failure
+ * is never returned to the caller; at most an error message is left in
+ * extack, under the conditions checked below.
+ */
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
+
+ /* Don't report error if the graft is part of destroy operation. */
+ if (!err || !new || new == &noop_qdisc)
+ return;
+
+ /* Don't report error if the parent, the old child and the new
+ * one are not offloaded.
+ */
+ any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
+ any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
+ any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
+
+ if (any_qdisc_is_offloaded)
+ NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
+}
+EXPORT_SYMBOL(qdisc_offload_graft_helper);
+
+static void qdisc_offload_graft_root(struct net_device *dev,
+ struct Qdisc *new, struct Qdisc *old,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_root_qopt_offload graft_offload = {
+ .command = TC_ROOT_GRAFT,
+ .handle = new ? new->handle : 0,
+ .ingress = (new && new->flags & TCQ_F_INGRESS) ||
+ (old && old->flags & TCQ_F_INGRESS),
+ };
+ /*
+ * The request built above uses handle 0 when new is NULL and is marked
+ * ingress when either new or old has TCQ_F_INGRESS. It goes to the
+ * common graft helper with a NULL sch argument.
+ */
+ qdisc_offload_graft_helper(dev, NULL, new, old,
+ TC_SETUP_ROOT_QDISC, &graft_offload, extack);
+}
+
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
u32 portid, u32 seq, u16 flags, int event)
{
@@ -921,7 +992,20 @@
qdisc_notify(net, skb, n, clid, old, new);
if (old)
- qdisc_destroy(old);
+ qdisc_put(old);
+}
+
+static void qdisc_clear_nolock(struct Qdisc *sch)
+{
+ sch->flags &= ~TCQ_F_NOLOCK;
+ if (!(sch->flags & TCQ_F_CPUSTATS))
+ return;
+ /*
+ * TCQ_F_CPUSTATS was set: free both per-CPU counters, NULL the
+ * pointers and clear that flag as well.
+ */
+ free_percpu(sch->cpu_bstats);
+ free_percpu(sch->cpu_qstats);
+ sch->cpu_bstats = NULL;
+ sch->cpu_qstats = NULL;
+ sch->flags &= ~TCQ_F_CPUSTATS;
}
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
@@ -940,7 +1024,6 @@
{
struct Qdisc *q = old;
struct net *net = dev_net(dev);
- int err = 0;
if (parent == NULL) {
unsigned int i, num_q, ingress;
@@ -960,6 +1043,8 @@
if (dev->flags & IFF_UP)
dev_deactivate(dev);
+ qdisc_offload_graft_root(dev, new, old, extack);
+
if (new && new->ops->attach)
goto skip;
@@ -974,7 +1059,7 @@
qdisc_refcount_inc(new);
if (!ingress)
- qdisc_destroy(old);
+ qdisc_put(old);
}
skip:
@@ -995,28 +1080,29 @@
dev_activate(dev);
} else {
const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
+ unsigned long cl;
+ int err;
/* Only support running class lockless if parent is lockless */
if (new && (new->flags & TCQ_F_NOLOCK) &&
parent && !(parent->flags & TCQ_F_NOLOCK))
- new->flags &= ~TCQ_F_NOLOCK;
+ qdisc_clear_nolock(new);
- err = -EOPNOTSUPP;
- if (cops && cops->graft) {
- unsigned long cl = cops->find(parent, classid);
+ if (!cops || !cops->graft)
+ return -EOPNOTSUPP;
- if (cl) {
- err = cops->graft(parent, cl, new, &old,
- extack);
- } else {
- NL_SET_ERR_MSG(extack, "Specified class not found");
- err = -ENOENT;
- }
+ cl = cops->find(parent, classid);
+ if (!cl) {
+ NL_SET_ERR_MSG(extack, "Specified class not found");
+ return -ENOENT;
}
- if (!err)
- notify_and_destroy(net, skb, n, classid, old, new);
+
+ err = cops->graft(parent, cl, new, &old, extack);
+ if (err)
+ return err;
+ notify_and_destroy(net, skb, n, classid, old, new);
}
- return err;
+ return 0;
}
static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
@@ -1053,10 +1139,6 @@
return 0;
}
-/* lockdep annotation is needed for ingress; egress gets it only for name */
-static struct lock_class_key qdisc_tx_lock;
-static struct lock_class_key qdisc_rx_lock;
-
/*
Allocate and initialize new qdisc.
@@ -1121,15 +1203,15 @@
if (handle == TC_H_INGRESS) {
sch->flags |= TCQ_F_INGRESS;
handle = TC_H_MAKE(TC_H_INGRESS, 0);
- lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
} else {
if (handle == 0) {
handle = qdisc_alloc_handle(dev);
- err = -ENOMEM;
- if (handle == 0)
+ if (handle == 0) {
+ NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
+ err = -ENOSPC;
goto err_out3;
+ }
}
- lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
if (!netif_is_multiqueue(dev))
sch->flags |= TCQ_F_ONETXQUEUE;
}
@@ -1338,8 +1420,8 @@
!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
- extack);
+ err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
+ rtm_tca_policy, extack);
if (err < 0)
return err;
@@ -1423,8 +1505,8 @@
replay:
/* Reinit, just in case something touches this. */
- err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
- extack);
+ err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
+ rtm_tca_policy, extack);
if (err < 0)
return err;
@@ -1581,7 +1663,7 @@
err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
if (err) {
if (q)
- qdisc_destroy(q);
+ qdisc_put(q);
return err;
}
@@ -1658,8 +1740,8 @@
idx = 0;
ASSERT_RTNL();
- err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
- rtm_tca_policy, NULL);
+ err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
+ rtm_tca_policy, cb->extack);
if (err < 0)
return err;
@@ -1752,6 +1834,7 @@
{
struct sk_buff *skb;
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+ int err = 0;
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
@@ -1762,8 +1845,11 @@
return -EINVAL;
}
- return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
- n->nlmsg_flags & NLM_F_ECHO);
+ err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+ if (err > 0)
+ err = 0;
+ return err;
}
static int tclass_del_notify(struct net *net,
@@ -1794,8 +1880,11 @@
return err;
}
- return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
- n->nlmsg_flags & NLM_F_ECHO);
+ err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+ if (err > 0)
+ err = 0;
+ return err;
}
#ifdef CONFIG_NET_CLS
@@ -1831,20 +1920,24 @@
cl = cops->find(q, portid);
if (!cl)
return;
+ if (!cops->tcf_block)
+ return;
block = cops->tcf_block(q, cl, NULL);
if (!block)
return;
- list_for_each_entry(chain, &block->chain_list, list) {
+ for (chain = tcf_get_next_chain(block, NULL);
+ chain;
+ chain = tcf_get_next_chain(block, chain)) {
struct tcf_proto *tp;
- for (tp = rtnl_dereference(chain->filter_chain);
- tp; tp = rtnl_dereference(tp->next)) {
+ for (tp = tcf_get_next_proto(chain, NULL, true);
+ tp; tp = tcf_get_next_proto(chain, tp, true)) {
struct tcf_bind_args arg = {};
arg.w.fn = tcf_node_bind;
arg.classid = clid;
arg.cl = new_cl;
- tp->ops->walk(tp, &arg.w);
+ tp->ops->walk(tp, &arg.w, true);
}
}
}
@@ -1878,8 +1971,8 @@
!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
- extack);
+ err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
+ rtm_tca_policy, extack);
if (err < 0)
return err;
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index cd49afc..f4f9b8c 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */
/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
@@ -150,7 +151,7 @@
pr_debug("atm_tc_put: destroying\n");
list_del_init(&flow->list);
pr_debug("atm_tc_put: qdisc %p\n", flow->q);
- qdisc_destroy(flow->q);
+ qdisc_put(flow->q);
tcf_block_put(flow->block);
if (flow->sock) {
pr_debug("atm_tc_put: f_count %ld\n",
@@ -223,7 +224,8 @@
if (opt == NULL)
return -EINVAL;
- error = nla_parse_nested(tb, TCA_ATM_MAX, opt, atm_policy, NULL);
+ error = nla_parse_nested_deprecated(tb, TCA_ATM_MAX, opt, atm_policy,
+ NULL);
if (error < 0)
return error;
@@ -609,7 +611,7 @@
tcm->tcm_handle = flow->common.classid;
tcm->tcm_info = flow->q->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
index 9c4c2bb..a7f7667 100644
--- a/net/sched/sch_blackhole.c
+++ b/net/sched/sch_blackhole.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_blackhole.c Black hole queue
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*
* Note: Quantum tunneling is not supported.
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 793016d..53a80bc 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -138,8 +138,8 @@
struct cake_host {
u32 srchost_tag;
u32 dsthost_tag;
- u16 srchost_refcnt;
- u16 dsthost_refcnt;
+ u16 srchost_bulk_flow_count;
+ u16 dsthost_bulk_flow_count;
};
struct cake_heap_entry {
@@ -211,6 +211,9 @@
u8 ack_filter;
u8 atm_mode;
+ u32 fwmark_mask;
+ u16 fwmark_shft;
+
/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
u16 rate_shft;
ktime_t time_next_packet;
@@ -746,8 +749,10 @@
* queue, accept the collision, update the host tags.
*/
q->way_collisions++;
- q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--;
- q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--;
+ if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
+ q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
+ q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
+ }
allocate_src = cake_dsrc(flow_mode);
allocate_dst = cake_ddst(flow_mode);
found:
@@ -767,13 +772,14 @@
}
for (i = 0; i < CAKE_SET_WAYS;
i++, k = (k + 1) % CAKE_SET_WAYS) {
- if (!q->hosts[outer_hash + k].srchost_refcnt)
+ if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
break;
}
q->hosts[outer_hash + k].srchost_tag = srchost_hash;
found_src:
srchost_idx = outer_hash + k;
- q->hosts[srchost_idx].srchost_refcnt++;
+ if (q->flows[reduced_hash].set == CAKE_SET_BULK)
+ q->hosts[srchost_idx].srchost_bulk_flow_count++;
q->flows[reduced_hash].srchost = srchost_idx;
}
@@ -789,13 +795,14 @@
}
for (i = 0; i < CAKE_SET_WAYS;
i++, k = (k + 1) % CAKE_SET_WAYS) {
- if (!q->hosts[outer_hash + k].dsthost_refcnt)
+ if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
break;
}
q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
found_dst:
dsthost_idx = outer_hash + k;
- q->hosts[dsthost_idx].dsthost_refcnt++;
+ if (q->flows[reduced_hash].set == CAKE_SET_BULK)
+ q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
q->flows[reduced_hash].dsthost = dsthost_idx;
}
}
@@ -812,7 +819,7 @@
if (skb) {
flow->head = skb->next;
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
}
return skb;
@@ -1252,7 +1259,7 @@
else
flow->head = elig_ack->next;
- elig_ack->next = NULL;
+ skb_mark_not_on_list(elig_ack);
return elig_ack;
}
@@ -1508,32 +1515,29 @@
return idx + (tin << 16);
}
-static void cake_wash_diffserv(struct sk_buff *skb)
-{
- switch (skb->protocol) {
- case htons(ETH_P_IP):
- ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
- break;
- case htons(ETH_P_IPV6):
- ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
- break;
- default:
- break;
- }
-}
-
static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
{
+ int wlen = skb_network_offset(skb);
u8 dscp;
- switch (skb->protocol) {
+ switch (tc_skb_protocol(skb)) {
case htons(ETH_P_IP):
+ wlen += sizeof(struct iphdr);
+ if (!pskb_may_pull(skb, wlen) ||
+ skb_try_make_writable(skb, wlen))
+ return 0;
+
dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
if (wash && dscp)
ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
return dscp;
case htons(ETH_P_IPV6):
+ wlen += sizeof(struct ipv6hdr);
+ if (!pskb_may_pull(skb, wlen) ||
+ skb_try_make_writable(skb, wlen))
+ return 0;
+
dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
if (wash && dscp)
ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
@@ -1552,26 +1556,32 @@
struct sk_buff *skb)
{
struct cake_sched_data *q = qdisc_priv(sch);
- u32 tin;
+ u32 tin, mark;
+ u8 dscp;
- if (TC_H_MAJ(skb->priority) == sch->handle &&
- TC_H_MIN(skb->priority) > 0 &&
- TC_H_MIN(skb->priority) <= q->tin_cnt) {
+ /* Tin selection: Default to diffserv-based selection, allow overriding
+ * using firewall marks or skb->priority.
+ */
+ dscp = cake_handle_diffserv(skb,
+ q->rate_flags & CAKE_FLAG_WASH);
+ mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
+
+ if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
+ tin = 0;
+
+ else if (mark && mark <= q->tin_cnt)
+ tin = q->tin_order[mark - 1];
+
+ else if (TC_H_MAJ(skb->priority) == sch->handle &&
+ TC_H_MIN(skb->priority) > 0 &&
+ TC_H_MIN(skb->priority) <= q->tin_cnt)
tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
- if (q->rate_flags & CAKE_FLAG_WASH)
- cake_wash_diffserv(skb);
- } else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) {
- /* extract the Diffserv Precedence field, if it exists */
- /* and clear DSCP bits if washing */
- tin = q->tin_index[cake_handle_diffserv(skb,
- q->rate_flags & CAKE_FLAG_WASH)];
+ else {
+ tin = q->tin_index[dscp];
+
if (unlikely(tin >= q->tin_cnt))
tin = 0;
- } else {
- tin = 0;
- if (q->rate_flags & CAKE_FLAG_WASH)
- cake_wash_diffserv(skb);
}
return &q->tins[tin];
@@ -1667,7 +1677,7 @@
if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
struct sk_buff *segs, *nskb;
netdev_features_t features = netif_skb_features(skb);
- unsigned int slen = 0;
+ unsigned int slen = 0, numsegs = 0;
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs))
@@ -1675,7 +1685,7 @@
while (segs) {
nskb = segs->next;
- segs->next = NULL;
+ skb_mark_not_on_list(segs);
qdisc_skb_cb(segs)->pkt_len = segs->len;
cobalt_set_enqueue_time(segs, now);
get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
@@ -1683,6 +1693,7 @@
flow_queue_add(flow, segs);
sch->q.qlen++;
+ numsegs++;
slen += segs->len;
q->buffer_used += segs->truesize;
b->packets++;
@@ -1696,7 +1707,7 @@
sch->qstats.backlog += slen;
q->avg_window_bytes += slen;
- qdisc_tree_reduce_backlog(sch, 1, len);
+ qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
consume_skb(skb);
} else {
/* not splitting */
@@ -1793,20 +1804,30 @@
b->sparse_flow_count++;
if (cake_dsrc(q->flow_mode))
- host_load = max(host_load, srchost->srchost_refcnt);
+ host_load = max(host_load, srchost->srchost_bulk_flow_count);
if (cake_ddst(q->flow_mode))
- host_load = max(host_load, dsthost->dsthost_refcnt);
+ host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
flow->deficit = (b->flow_quantum *
quantum_div[host_load]) >> 16;
} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
+ struct cake_host *srchost = &b->hosts[flow->srchost];
+ struct cake_host *dsthost = &b->hosts[flow->dsthost];
+
/* this flow was empty, accounted as a sparse flow, but actually
* in the bulk rotation.
*/
flow->set = CAKE_SET_BULK;
b->sparse_flow_count--;
b->bulk_flow_count++;
+
+ if (cake_dsrc(q->flow_mode))
+ srchost->srchost_bulk_flow_count++;
+
+ if (cake_ddst(q->flow_mode))
+ dsthost->dsthost_bulk_flow_count++;
+
}
if (q->buffer_used > q->buffer_max_used)
@@ -1974,23 +1995,8 @@
dsthost = &b->hosts[flow->dsthost];
host_load = 1;
- if (cake_dsrc(q->flow_mode))
- host_load = max(host_load, srchost->srchost_refcnt);
-
- if (cake_ddst(q->flow_mode))
- host_load = max(host_load, dsthost->dsthost_refcnt);
-
- WARN_ON(host_load > CAKE_QUEUES);
-
/* flow isolation (DRR++) */
if (flow->deficit <= 0) {
- /* The shifted prandom_u32() is a way to apply dithering to
- * avoid accumulating roundoff errors
- */
- flow->deficit += (b->flow_quantum * quantum_div[host_load] +
- (prandom_u32() >> 16)) >> 16;
- list_move_tail(&flow->flowchain, &b->old_flows);
-
/* Keep all flows with deficits out of the sparse and decaying
* rotations. No non-empty flow can go into the decaying
* rotation, so they can't get deficits
@@ -1999,6 +2005,13 @@
if (flow->head) {
b->sparse_flow_count--;
b->bulk_flow_count++;
+
+ if (cake_dsrc(q->flow_mode))
+ srchost->srchost_bulk_flow_count++;
+
+ if (cake_ddst(q->flow_mode))
+ dsthost->dsthost_bulk_flow_count++;
+
flow->set = CAKE_SET_BULK;
} else {
/* we've moved it to the bulk rotation for
@@ -2008,6 +2021,22 @@
flow->set = CAKE_SET_SPARSE_WAIT;
}
}
+
+ if (cake_dsrc(q->flow_mode))
+ host_load = max(host_load, srchost->srchost_bulk_flow_count);
+
+ if (cake_ddst(q->flow_mode))
+ host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
+
+ WARN_ON(host_load > CAKE_QUEUES);
+
+ /* The shifted prandom_u32() is a way to apply dithering to
+ * avoid accumulating roundoff errors
+ */
+ flow->deficit += (b->flow_quantum * quantum_div[host_load] +
+ (prandom_u32() >> 16)) >> 16;
+ list_move_tail(&flow->flowchain, &b->old_flows);
+
goto retry;
}
@@ -2028,6 +2057,13 @@
&b->decaying_flows);
if (flow->set == CAKE_SET_BULK) {
b->bulk_flow_count--;
+
+ if (cake_dsrc(q->flow_mode))
+ srchost->srchost_bulk_flow_count--;
+
+ if (cake_ddst(q->flow_mode))
+ dsthost->dsthost_bulk_flow_count--;
+
b->decaying_flow_count++;
} else if (flow->set == CAKE_SET_SPARSE ||
flow->set == CAKE_SET_SPARSE_WAIT) {
@@ -2041,14 +2077,19 @@
if (flow->set == CAKE_SET_SPARSE ||
flow->set == CAKE_SET_SPARSE_WAIT)
b->sparse_flow_count--;
- else if (flow->set == CAKE_SET_BULK)
+ else if (flow->set == CAKE_SET_BULK) {
b->bulk_flow_count--;
- else
+
+ if (cake_dsrc(q->flow_mode))
+ srchost->srchost_bulk_flow_count--;
+
+ if (cake_ddst(q->flow_mode))
+ dsthost->dsthost_bulk_flow_count--;
+
+ } else
b->decaying_flow_count--;
flow->set = CAKE_SET_NONE;
- srchost->srchost_refcnt--;
- dsthost->dsthost_refcnt--;
}
goto begin;
}
@@ -2143,6 +2184,7 @@
[TCA_CAKE_MPU] = { .type = NLA_U32 },
[TCA_CAKE_INGRESS] = { .type = NLA_U32 },
[TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 },
+ [TCA_CAKE_FWMARK] = { .type = NLA_U32 },
};
static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
@@ -2489,7 +2531,8 @@
if (!opt)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
+ extack);
if (err < 0)
return err;
@@ -2589,6 +2632,11 @@
q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
}
+ if (tb[TCA_CAKE_FWMARK]) {
+ q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
+ q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0;
+ }
+
if (q->tins) {
sch_tree_lock(sch);
cake_reconfigure(sch);
@@ -2688,7 +2736,7 @@
struct cake_sched_data *q = qdisc_priv(sch);
struct nlattr *opts;
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!opts)
goto nla_put_failure;
@@ -2748,6 +2796,9 @@
!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
+ goto nla_put_failure;
+
return nla_nest_end(skb, opts);
nla_put_failure:
@@ -2756,7 +2807,7 @@
static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
- struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP);
+ struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
struct cake_sched_data *q = qdisc_priv(sch);
struct nlattr *tstats, *ts;
int i;
@@ -2786,7 +2837,7 @@
#undef PUT_STAT_U32
#undef PUT_STAT_U64
- tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS);
+ tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
if (!tstats)
goto nla_put_failure;
@@ -2803,7 +2854,7 @@
for (i = 0; i < q->tin_cnt; i++) {
struct cake_tin_data *b = &q->tins[q->tin_order[i]];
- ts = nla_nest_start(d->skb, i + 1);
+ ts = nla_nest_start_noflag(d->skb, i + 1);
if (!ts)
goto nla_put_failure;
@@ -2923,7 +2974,7 @@
if (flow) {
ktime_t now = ktime_get();
- stats = nla_nest_start(d->skb, TCA_STATS_APP);
+ stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
if (!stats)
return -1;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index f42025d..39b427d 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_cbq.c Class-Based Queueing discipline.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
*/
#include <linux/module.h>
@@ -1132,6 +1127,33 @@
[TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
};
+static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
+ struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+ /* Parse the nested CBQ options in opt into tb[]; negative errno on failure. */
+ if (!opt) {
+ NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
+ cbq_policy, extack);
+ if (err < 0)
+ return err;
+ /* Extra validation after the policy parse: bound the WRR priority. */
+ if (tb[TCA_CBQ_WRROPT]) {
+ const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
+
+ if (wrr->priority > TC_CBQ_MAXPRIO) {
+ NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
+ err = -EINVAL;
+ }
+ }
+ return err; /* parser result on success, -EINVAL if the priority check failed */
+}
+
static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -1144,12 +1166,7 @@
hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
q->delay_timer.function = cbq_undelay;
- if (!opt) {
- NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
- return -EINVAL;
- }
-
- err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, extack);
+ err = cbq_opt_parse(tb, opt, extack);
if (err < 0)
return err;
@@ -1305,7 +1322,7 @@
struct cbq_sched_data *q = qdisc_priv(sch);
struct nlattr *nest;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (cbq_dump_attr(skb, &q->link) < 0)
@@ -1340,7 +1357,7 @@
tcm->tcm_handle = cl->common.classid;
tcm->tcm_info = cl->q->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (cbq_dump_attr(skb, cl) < 0)
@@ -1358,9 +1375,11 @@
{
struct cbq_sched_data *q = qdisc_priv(sch);
struct cbq_class *cl = (struct cbq_class *)arg;
+ __u32 qlen;
cl->xstats.avgidle = cl->avgidle;
cl->xstats.undertime = 0;
+ qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
if (cl->undertime != PSCHED_PASTPERFECT)
cl->xstats.undertime = cl->undertime - q->now;
@@ -1368,7 +1387,7 @@
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
- gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
+ gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
return -1;
return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
@@ -1418,7 +1437,7 @@
WARN_ON(cl->filters);
tcf_block_put(cl->block);
- qdisc_destroy(cl->q);
+ qdisc_put(cl->q);
qdisc_put_rtab(cl->R_tab);
gen_kill_estimator(&cl->rate_est);
if (cl != &q->link)
@@ -1466,12 +1485,7 @@
struct cbq_class *parent;
struct qdisc_rate_table *rtab = NULL;
- if (!opt) {
- NL_SET_ERR_MSG(extack, "Mandatory qdisc options missing");
- return -EINVAL;
- }
-
- err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, extack);
+ err = cbq_opt_parse(tb, opt, extack);
if (err < 0)
return err;
@@ -1665,17 +1679,13 @@
{
struct cbq_sched_data *q = qdisc_priv(sch);
struct cbq_class *cl = (struct cbq_class *)arg;
- unsigned int qlen, backlog;
if (cl->filters || cl->children || cl == &q->link)
return -EBUSY;
sch_tree_lock(sch);
- qlen = cl->q->q.qlen;
- backlog = cl->q->qstats.backlog;
- qdisc_reset(cl->q);
- qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
+ qdisc_purge_queue(cl->q);
if (cl->next_alive)
cbq_deactivate_class(cl);
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index e26a240..b2905b0 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_cbs.c Credit Based Shaper
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Vinicius Costa Gomes <vinicius.gomes@intel.com>
- *
*/
/* Credit Based Shaper (CBS)
@@ -61,16 +56,20 @@
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
+#include <net/netevent.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
+static LIST_HEAD(cbs_list);
+static DEFINE_SPINLOCK(cbs_list_lock);
+
#define BYTES_PER_KBIT (1000LL / 8)
struct cbs_sched_data {
bool offload;
int queue;
- s64 port_rate; /* in bytes/s */
+ atomic64_t port_rate; /* in bytes/s */
s64 last; /* timestamp in ns */
s64 credits; /* in bytes */
s32 locredit; /* in bytes */
@@ -82,19 +81,21 @@
struct sk_buff **to_free);
struct sk_buff *(*dequeue)(struct Qdisc *sch);
struct Qdisc *qdisc;
+ struct list_head cbs_list;
};
static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct Qdisc *child,
struct sk_buff **to_free)
{
+ unsigned int len = qdisc_pkt_len(skb);
int err;
err = child->ops->enqueue(skb, child, to_free);
if (err != NET_XMIT_SUCCESS)
return err;
- qdisc_qstats_backlog_inc(sch, skb);
+ sch->qstats.backlog += len;
sch->q.qlen++;
return NET_XMIT_SUCCESS;
@@ -206,7 +207,8 @@
/* As sendslope is a negative number, this will decrease the
* amount of q->credits.
*/
- credits = credits_from_len(len, q->sendslope, q->port_rate);
+ credits = credits_from_len(len, q->sendslope,
+ atomic64_read(&q->port_rate));
credits += q->credits;
q->credits = max_t(s64, credits, q->locredit);
@@ -293,6 +295,58 @@
return 0;
}
+static void cbs_set_port_rate(struct net_device *dev, struct cbs_sched_data *q)
+{
+ struct ethtool_link_ksettings ecmd = { }; /* zeroed: the debug print reads it even on failure */
+ int speed = SPEED_10;
+ s64 port_rate; /* bytes/s; 64-bit because speeds above ~17 Gbit/s overflow an int */
+ int err;
+ /* Refresh q->port_rate (bytes/s) from dev's link speed; SPEED_10 is the fallback. */
+ err = __ethtool_get_link_ksettings(dev, &ecmd);
+ if (err < 0)
+ goto skip;
+
+ if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN)
+ speed = ecmd.base.speed;
+
+skip:
+ port_rate = speed * 1000 * BYTES_PER_KBIT;
+
+ atomic64_set(&q->port_rate, port_rate);
+ netdev_dbg(dev, "cbs: set %s's port_rate to: %lld, linkspeed: %d\n",
+ dev->name, (long long)atomic64_read(&q->port_rate),
+ ecmd.base.speed);
+}
+
+static int cbs_dev_notifier(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct cbs_sched_data *q;
+ struct net_device *qdev;
+ bool found = false;
+ /* Netdevice notifier: refresh the port rate of a CBS qdisc attached to dev. */
+ ASSERT_RTNL();
+
+ if (event != NETDEV_UP && event != NETDEV_CHANGE)
+ return NOTIFY_DONE; /* every other event is ignored */
+ /* Search the global cbs_list for an instance whose qdisc sits on dev. */
+ spin_lock(&cbs_list_lock);
+ list_for_each_entry(q, &cbs_list, cbs_list) {
+ qdev = qdisc_dev(q->qdisc);
+ if (qdev == dev) {
+ found = true;
+ break; /* NOTE(review): stops at the first match; confirm one instance per dev is intended */
+ }
+ }
+ spin_unlock(&cbs_list_lock);
+ /* q is only meaningful when found; after a full walk it is not a real entry. */
+ if (found)
+ cbs_set_port_rate(dev, q);
+
+ return NOTIFY_DONE;
+}
+
static int cbs_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -302,7 +356,8 @@
struct tc_cbs_qopt *qopt;
int err;
- err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_CBS_MAX, opt, cbs_policy,
+ extack);
if (err < 0)
return err;
@@ -314,16 +369,7 @@
qopt = nla_data(tb[TCA_CBS_PARMS]);
if (!qopt->offload) {
- struct ethtool_link_ksettings ecmd;
- s64 link_speed;
-
- if (!__ethtool_get_link_ksettings(dev, &ecmd))
- link_speed = ecmd.base.speed;
- else
- link_speed = SPEED_1000;
-
- q->port_rate = link_speed * 1000 * BYTES_PER_KBIT;
-
+ cbs_set_port_rate(dev, q);
cbs_disable_offload(dev, q);
} else {
err = cbs_enable_offload(dev, q, qopt, extack);
@@ -357,6 +403,10 @@
if (!q->qdisc)
return -ENOMEM;
+ spin_lock(&cbs_list_lock);
+ list_add(&q->cbs_list, &cbs_list);
+ spin_unlock(&cbs_list_lock);
+
qdisc_hash_add(q->qdisc, false);
q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
@@ -374,12 +424,18 @@
struct cbs_sched_data *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
- qdisc_watchdog_cancel(&q->watchdog);
+ /* Nothing to do if we couldn't create the underlying qdisc */
+ if (!q->qdisc)
+ return;
+ qdisc_watchdog_cancel(&q->watchdog);
cbs_disable_offload(dev, q);
- if (q->qdisc)
- qdisc_destroy(q->qdisc);
+ spin_lock(&cbs_list_lock);
+ list_del(&q->cbs_list);
+ spin_unlock(&cbs_list_lock);
+
+ qdisc_put(q->qdisc);
}
static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -388,7 +444,7 @@
struct tc_cbs_qopt opt = { };
struct nlattr *nest;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!nest)
goto nla_put_failure;
@@ -486,14 +542,29 @@
.owner = THIS_MODULE,
};
+static struct notifier_block cbs_device_notifier = {
+ .notifier_call = cbs_dev_notifier,
+};
+
static int __init cbs_module_init(void)
{
- return register_qdisc(&cbs_qdisc_ops);
+ int err;
+
+ err = register_netdevice_notifier(&cbs_device_notifier);
+ if (err)
+ return err;
+
+ err = register_qdisc(&cbs_qdisc_ops);
+ if (err)
+ unregister_netdevice_notifier(&cbs_device_notifier);
+
+ return err;
}
static void __exit cbs_module_exit(void)
{
unregister_qdisc(&cbs_qdisc_ops);
+ unregister_netdevice_notifier(&cbs_device_notifier);
}
module_init(cbs_module_init)
module_exit(cbs_module_exit)
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index eafc0d1..dba7037 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/sch_choke.c CHOKE scheduler
*
* Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
* Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
*/
#include <linux/module.h>
@@ -358,7 +354,8 @@
if (opt == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_CHOKE_MAX, opt,
+ choke_policy, NULL);
if (err < 0)
return err;
@@ -452,7 +449,7 @@
.Scell_log = q->parms.Scell_log,
};
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 17cd81f..30169b3 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -71,10 +71,10 @@
struct Qdisc *sch = ctx;
struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
- if (skb)
+ if (skb) {
sch->qstats.backlog -= qdisc_pkt_len(skb);
-
- prefetch(&skb->end); /* we'll need skb_shinfo() */
+ prefetch(&skb->end); /* we'll need skb_shinfo() */
+ }
return skb;
}
@@ -141,7 +141,8 @@
if (!opt)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_CODEL_MAX, opt,
+ codel_policy, NULL);
if (err < 0)
return err;
@@ -217,7 +218,7 @@
struct codel_sched_data *q = qdisc_priv(sch);
struct nlattr *opts;
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index e0b0cf8..07a2b0b 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/sch_drr.c Deficit Round Robin scheduler
*
* Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -50,15 +47,6 @@
return container_of(clc, struct drr_class, common);
}
-static void drr_purge_queue(struct drr_class *cl)
-{
- unsigned int len = cl->qdisc->q.qlen;
- unsigned int backlog = cl->qdisc->qstats.backlog;
-
- qdisc_reset(cl->qdisc);
- qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
-}
-
static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = {
[TCA_DRR_QUANTUM] = { .type = NLA_U32 },
};
@@ -79,7 +67,8 @@
return -EINVAL;
}
- err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_DRR_MAX, opt, drr_policy,
+ extack);
if (err < 0)
return err;
@@ -134,7 +123,7 @@
tca[TCA_RATE]);
if (err) {
NL_SET_ERR_MSG(extack, "Failed to replace estimator");
- qdisc_destroy(cl->qdisc);
+ qdisc_put(cl->qdisc);
kfree(cl);
return err;
}
@@ -153,7 +142,7 @@
static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
{
gen_kill_estimator(&cl->rate_est);
- qdisc_destroy(cl->qdisc);
+ qdisc_put(cl->qdisc);
kfree(cl);
}
@@ -167,7 +156,7 @@
sch_tree_lock(sch);
- drr_purge_queue(cl);
+ qdisc_purge_queue(cl->qdisc);
qdisc_class_hash_remove(&q->clhash, &cl->common);
sch_tree_unlock(sch);
@@ -253,7 +242,7 @@
tcm->tcm_handle = cl->common.classid;
tcm->tcm_info = cl->qdisc->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (nla_put_u32(skb, TCA_DRR_QUANTUM, cl->quantum))
@@ -269,7 +258,8 @@
struct gnet_dump *d)
{
struct drr_class *cl = (struct drr_class *)arg;
- __u32 qlen = cl->qdisc->q.qlen;
+ __u32 qlen = qdisc_qlen_sum(cl->qdisc);
+ struct Qdisc *cl_q = cl->qdisc;
struct tc_drr_stats xstats;
memset(&xstats, 0, sizeof(xstats));
@@ -279,7 +269,7 @@
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
- gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0)
+ gnet_stats_copy_queue(d, cl_q->cpu_qstats, &cl_q->qstats, qlen) < 0)
return -1;
return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
@@ -350,9 +340,11 @@
static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
+ unsigned int len = qdisc_pkt_len(skb);
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
int err = 0;
+ bool first;
cl = drr_classify(skb, sch, &err);
if (cl == NULL) {
@@ -362,6 +354,7 @@
return err;
}
+ first = !cl->qdisc->q.qlen;
err = qdisc_enqueue(skb, cl->qdisc, to_free);
if (unlikely(err != NET_XMIT_SUCCESS)) {
if (net_xmit_drop_count(err)) {
@@ -371,12 +364,12 @@
return err;
}
- if (cl->qdisc->q.qlen == 1) {
+ if (first) {
list_add_tail(&cl->alist, &q->active);
cl->deficit = cl->quantum;
}
- qdisc_qstats_backlog_inc(sch, skb);
+ sch->qstats.backlog += len;
sch->q.qlen++;
return err;
}
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 049714c..05605b3 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* net/sched/sch_dsmark.c - Differentiated Services field marker */
/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
@@ -132,7 +133,8 @@
if (!opt)
goto errout;
- err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_DSMARK_MAX, opt,
+ dsmark_policy, NULL);
if (err < 0)
goto errout;
@@ -199,6 +201,7 @@
static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
+ unsigned int len = qdisc_pkt_len(skb);
struct dsmark_qdisc_data *p = qdisc_priv(sch);
int err;
@@ -271,7 +274,7 @@
return err;
}
- qdisc_qstats_backlog_inc(sch, skb);
+ sch->qstats.backlog += len;
sch->q.qlen++;
return NET_XMIT_SUCCESS;
@@ -352,11 +355,14 @@
if (err)
return err;
- err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_DSMARK_MAX, opt,
+ dsmark_policy, NULL);
if (err < 0)
goto errout;
err = -EINVAL;
+ if (!tb[TCA_DSMARK_INDICES])
+ goto errout;
indices = nla_get_u16(tb[TCA_DSMARK_INDICES]);
if (hweight32(indices) != 1)
@@ -412,7 +418,7 @@
pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
tcf_block_put(p->block);
- qdisc_destroy(p->q);
+ qdisc_put(p->q);
if (p->mv != p->embedded)
kfree(p->mv);
}
@@ -431,7 +437,7 @@
tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1);
tcm->tcm_info = p->q->handle;
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mv[cl - 1].mask) ||
@@ -450,7 +456,7 @@
struct dsmark_qdisc_data *p = qdisc_priv(sch);
struct nlattr *opts = NULL;
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices))
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
index 1538d6f..b1da558 100644
--- a/net/sched/sch_etf.c
+++ b/net/sched/sch_etf.c
@@ -22,15 +22,17 @@
#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON)
#define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON)
+#define SKIP_SOCK_CHECK_IS_SET(x) ((x)->flags & TC_ETF_SKIP_SOCK_CHECK)
struct etf_sched_data {
bool offload;
bool deadline_mode;
+ bool skip_sock_check;
int clockid;
int queue;
s32 delta; /* in ns */
ktime_t last; /* The txtime of the last skb sent to the netdevice. */
- struct rb_root head;
+ struct rb_root_cached head;
struct qdisc_watchdog watchdog;
ktime_t (*get_time)(void);
};
@@ -77,6 +79,9 @@
struct sock *sk = nskb->sk;
ktime_t now;
+ if (q->skip_sock_check)
+ goto skip;
+
if (!sk)
return false;
@@ -92,6 +97,7 @@
if (sk->sk_txtime_deadline_mode != q->deadline_mode)
return false;
+skip:
now = q->get_time();
if (ktime_before(txtime, now) || ktime_before(txtime, q->last))
return false;
@@ -104,7 +110,7 @@
struct etf_sched_data *q = qdisc_priv(sch);
struct rb_node *p;
- p = rb_first(&q->head);
+ p = rb_first_cached(&q->head);
if (!p)
return NULL;
@@ -117,8 +123,10 @@
struct sk_buff *skb = etf_peek_timesortedlist(sch);
ktime_t next;
- if (!skb)
+ if (!skb) {
+ qdisc_watchdog_cancel(&q->watchdog);
return;
+ }
next = ktime_sub_ns(skb->tstamp, q->delta);
qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next));
@@ -154,8 +162,9 @@
struct sk_buff **to_free)
{
struct etf_sched_data *q = qdisc_priv(sch);
- struct rb_node **p = &q->head.rb_node, *parent = NULL;
+ struct rb_node **p = &q->head.rb_root.rb_node, *parent = NULL;
ktime_t txtime = nskb->tstamp;
+ bool leftmost = true;
if (!is_packet_valid(sch, nskb)) {
report_sock_error(nskb, EINVAL,
@@ -168,13 +177,15 @@
parent = *p;
skb = rb_to_skb(parent);
- if (ktime_after(txtime, skb->tstamp))
+ if (ktime_compare(txtime, skb->tstamp) >= 0) {
p = &parent->rb_right;
- else
+ leftmost = false;
+ } else {
p = &parent->rb_left;
+ }
}
rb_link_node(&nskb->rbnode, parent, p);
- rb_insert_color(&nskb->rbnode, &q->head);
+ rb_insert_color_cached(&nskb->rbnode, &q->head, leftmost);
qdisc_qstats_backlog_inc(sch, nskb);
sch->q.qlen++;
@@ -185,12 +196,42 @@
return NET_XMIT_SUCCESS;
}
-static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
- bool drop)
+static void timesortedlist_drop(struct Qdisc *sch, struct sk_buff *skb,
+ ktime_t now)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *to_free = NULL;
+ struct sk_buff *tmp = NULL;
+ /* Starting at skb, drop each queued packet whose tstamp is not after now. */
+ skb_rbtree_walk_from_safe(skb, tmp) {
+ if (ktime_after(skb->tstamp, now))
+ break; /* first packet that has not expired ends the walk */
+
+ rb_erase_cached(&skb->rbnode, &q->head);
+
+ /* The rbnode field in the skb re-uses these fields, now that
+ * we are done with the rbnode, reset them.
+ */
+ skb->next = NULL;
+ skb->prev = NULL;
+ skb->dev = qdisc_dev(sch);
+
+ report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED);
+
+ qdisc_qstats_backlog_dec(sch, skb);
+ qdisc_drop(skb, sch, &to_free);
+ qdisc_qstats_overlimit(sch);
+ sch->q.qlen--;
+ }
+ /* Dropped skbs were handed to qdisc_drop() with &to_free; free that list once. */
+ kfree_skb_list(to_free);
+}
+
+static void timesortedlist_remove(struct Qdisc *sch, struct sk_buff *skb)
{
struct etf_sched_data *q = qdisc_priv(sch);
- rb_erase(&skb->rbnode, &q->head);
+ rb_erase_cached(&skb->rbnode, &q->head);
/* The rbnode field in the skb re-uses these fields, now that
* we are done with the rbnode, reset them.
@@ -201,19 +242,9 @@
qdisc_qstats_backlog_dec(sch, skb);
- if (drop) {
- struct sk_buff *to_free = NULL;
+ qdisc_bstats_update(sch, skb);
- report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED);
-
- qdisc_drop(skb, sch, &to_free);
- kfree_skb_list(to_free);
- qdisc_qstats_overlimit(sch);
- } else {
- qdisc_bstats_update(sch, skb);
-
- q->last = skb->tstamp;
- }
+ q->last = skb->tstamp;
sch->q.qlen--;
}
@@ -232,7 +263,7 @@
/* Drop if packet has expired while in queue. */
if (ktime_before(skb->tstamp, now)) {
- timesortedlist_erase(sch, skb, true);
+ timesortedlist_drop(sch, skb, now);
skb = NULL;
goto out;
}
@@ -241,7 +272,7 @@
* txtime from deadline to (now + delta).
*/
if (q->deadline_mode) {
- timesortedlist_erase(sch, skb, false);
+ timesortedlist_remove(sch, skb);
skb->tstamp = now;
goto out;
}
@@ -250,7 +281,7 @@
/* Dequeue only if now is within the [txtime - delta, txtime] range. */
if (ktime_after(now, next))
- timesortedlist_erase(sch, skb, false);
+ timesortedlist_remove(sch, skb);
else
skb = NULL;
@@ -326,7 +357,8 @@
return -EINVAL;
}
- err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_ETF_MAX, opt, etf_policy,
+ extack);
if (err < 0)
return err;
@@ -359,6 +391,7 @@
q->clockid = qopt->clockid;
q->offload = OFFLOAD_IS_ON(qopt);
q->deadline_mode = DEADLINE_MODE_IS_ON(qopt);
+ q->skip_sock_check = SKIP_SOCK_CHECK_IS_SET(qopt);
switch (q->clockid) {
case CLOCK_REALTIME:
@@ -386,14 +419,14 @@
static void timesortedlist_clear(struct Qdisc *sch)
{
struct etf_sched_data *q = qdisc_priv(sch);
- struct rb_node *p = rb_first(&q->head);
+ struct rb_node *p = rb_first_cached(&q->head);
while (p) {
struct sk_buff *skb = rb_to_skb(p);
p = rb_next(p);
- rb_erase(&skb->rbnode, &q->head);
+ rb_erase_cached(&skb->rbnode, &q->head);
rtnl_kfree_skbs(skb, skb);
sch->q.qlen--;
}
@@ -435,7 +468,7 @@
struct tc_etf_qopt opt = { };
struct nlattr *nest;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!nest)
goto nla_put_failure;
@@ -447,6 +480,9 @@
if (q->deadline_mode)
opt.flags |= TC_ETF_DEADLINE_MODE_ON;
+ if (q->skip_sock_check)
+ opt.flags |= TC_ETF_SKIP_SOCK_CHECK;
+
if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 24893d3..37c8aa7 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_fifo.c The simplest FIFO queue.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
@@ -177,7 +173,7 @@
if (q) {
err = fifo_set_limit(q, limit);
if (err < 0) {
- qdisc_destroy(q);
+ qdisc_put(q);
q = NULL;
}
}
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 4808713..98dd87c 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
*
* Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Meant to be mostly used for locally generated traffic :
* Fast classification depends on skb->sk being set before reaching us.
* If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
@@ -54,10 +50,23 @@
#include <net/tcp_states.h>
#include <net/tcp.h>
+struct fq_skb_cb {
+ u64 time_to_send;
+};
+
+static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
+{
+ qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
+ return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
/*
- * Per flow structure, dynamically allocated
+ * Per flow structure, dynamically allocated.
+ * If packets have monotonically increasing time_to_send, they are placed in O(1)
+ * in linear list (head,tail), otherwise are placed in a rbtree (t_root).
*/
struct fq_flow {
+ struct rb_root t_root;
struct sk_buff *head; /* list of skbs for this flow : first skb */
union {
struct sk_buff *tail; /* last skb in the list */
@@ -92,8 +101,9 @@
u32 quantum;
u32 initial_quantum;
u32 flow_refill_delay;
- u32 flow_max_rate; /* optional max rate per flow */
u32 flow_plimit; /* max packets per flow */
+ unsigned long flow_max_rate; /* optional max rate per flow */
+ u64 ce_threshold;
u32 orphan_mask; /* mask for orphaned skb */
u32 low_rate_threshold;
struct rb_root *fq_root;
@@ -106,8 +116,8 @@
u64 stat_gc_flows;
u64 stat_internal_packets;
- u64 stat_tcp_retrans;
u64 stat_throttled;
+ u64 stat_ce_mark;
u64 stat_flows_plimit;
u64 stat_pkts_too_long;
u64 stat_allocation_errors;
@@ -256,6 +266,17 @@
*/
sk = (struct sock *)((hash << 1) | 1UL);
skb_orphan(skb);
+ } else if (sk->sk_state == TCP_CLOSE) {
+ unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
+ /*
+ * Sockets in TCP_CLOSE are non connected.
+ * Typical use case is UDP sockets, they can send packets
+ * with sendto() to many different destinations.
+ * We probably could use a generic bit advertising
+ * non connected sockets, instead of sk_state == TCP_CLOSE,
+ * if we care enough.
+ */
+ sk = (struct sock *)((hash << 1) | 1UL);
}
root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
@@ -276,7 +297,7 @@
* It not, we need to refill credit with
* initial quantum
*/
- if (unlikely(skb->sk &&
+ if (unlikely(skb->sk == sk &&
f->socket_hash != sk->sk_hash)) {
f->credit = q->initial_quantum;
f->socket_hash = sk->sk_hash;
@@ -297,9 +318,11 @@
q->stat_allocation_errors++;
return &q->internal;
}
+ /* f->t_root is already zeroed after kmem_cache_zalloc() */
+
fq_flow_set_detached(f);
f->sk = sk;
- if (skb->sk)
+ if (skb->sk == sk)
f->socket_hash = sk->sk_hash;
f->credit = q->initial_quantum;
@@ -311,15 +334,41 @@
return f;
}
+static struct sk_buff *fq_peek(struct fq_flow *flow)
+{
+ struct sk_buff *skb = skb_rb_first(&flow->t_root);
+ struct sk_buff *head = flow->head;
+
+ if (!skb)
+ return head;
+
+ if (!head)
+ return skb;
+
+ if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
+ return skb;
+ return head;
+}
+
+static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
+ struct sk_buff *skb)
+{
+ if (skb == flow->head) {
+ flow->head = skb->next;
+ } else {
+ rb_erase(&skb->rbnode, &flow->t_root);
+ skb->dev = qdisc_dev(sch);
+ }
+}
/* remove one skb from head of flow queue */
static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
{
- struct sk_buff *skb = flow->head;
+ struct sk_buff *skb = fq_peek(flow);
if (skb) {
- flow->head = skb->next;
- skb->next = NULL;
+ fq_erase_head(sch, flow, skb);
+ skb_mark_not_on_list(skb);
flow->qlen--;
qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
@@ -327,62 +376,38 @@
return skb;
}
-/* We might add in the future detection of retransmits
- * For the time being, just return false
- */
-static bool skb_is_retransmit(struct sk_buff *skb)
-{
- return false;
-}
-
-/* add skb to flow queue
- * flow queue is a linked list, kind of FIFO, except for TCP retransmits
- * We special case tcp retransmits to be transmitted before other packets.
- * We rely on fact that TCP retransmits are unlikely, so we do not waste
- * a separate queue or a pointer.
- * head-> [retrans pkt 1]
- * [retrans pkt 2]
- * [ normal pkt 1]
- * [ normal pkt 2]
- * [ normal pkt 3]
- * tail-> [ normal pkt 4]
- */
static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{
- struct sk_buff *prev, *head = flow->head;
+ struct rb_node **p, *parent;
+ struct sk_buff *head, *aux;
- skb->next = NULL;
- if (!head) {
- flow->head = skb;
- flow->tail = skb;
- return;
- }
- if (likely(!skb_is_retransmit(skb))) {
- flow->tail->next = skb;
- flow->tail = skb;
- return;
- }
+ fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns();
- /* This skb is a tcp retransmit,
- * find the last retrans packet in the queue
- */
- prev = NULL;
- while (skb_is_retransmit(head)) {
- prev = head;
- head = head->next;
+ head = flow->head;
+ if (!head ||
+ fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
if (!head)
- break;
- }
- if (!prev) { /* no rtx packet in queue, become the new head */
- skb->next = flow->head;
- flow->head = skb;
- } else {
- if (prev == flow->tail)
- flow->tail = skb;
+ flow->head = skb;
else
- skb->next = prev->next;
- prev->next = skb;
+ flow->tail->next = skb;
+ flow->tail = skb;
+ skb->next = NULL;
+ return;
}
+
+ p = &flow->t_root.rb_node;
+ parent = NULL;
+
+ while (*p) {
+ parent = *p;
+ aux = rb_to_skb(parent);
+ if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, &flow->t_root);
}
static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -401,8 +426,6 @@
}
f->qlen++;
- if (skb_is_retransmit(skb))
- q->stat_tcp_retrans++;
qdisc_qstats_backlog_inc(sch, skb);
if (fq_flow_is_detached(f)) {
struct sock *sk = skb->sk;
@@ -460,15 +483,21 @@
static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
struct fq_sched_data *q = qdisc_priv(sch);
- u64 now = ktime_get_ns();
struct fq_flow_head *head;
struct sk_buff *skb;
struct fq_flow *f;
- u32 rate, plen;
+ unsigned long rate;
+ u32 plen;
+ u64 now;
+
+ if (!sch->q.qlen)
+ return NULL;
skb = fq_dequeue_head(sch, &q->internal);
if (skb)
goto out;
+
+ now = ktime_get_ns();
fq_check_throttled(q, now);
begin:
head = &q->new_flows;
@@ -490,12 +519,22 @@
goto begin;
}
- skb = f->head;
- if (unlikely(skb && now < f->time_next_packet &&
- !skb_is_tcp_pure_ack(skb))) {
- head->first = f->next;
- fq_flow_set_throttled(q, f);
- goto begin;
+ skb = fq_peek(f);
+ if (skb) {
+ u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
+ f->time_next_packet);
+
+ if (now < time_next_packet) {
+ head->first = f->next;
+ f->time_next_packet = time_next_packet;
+ fq_flow_set_throttled(q, f);
+ goto begin;
+ }
+ if (time_next_packet &&
+ (s64)(now - time_next_packet - q->ce_threshold) > 0) {
+ INET_ECN_set_ce(skb);
+ q->stat_ce_mark++;
+ }
}
skb = fq_dequeue_head(sch, f);
@@ -511,32 +550,35 @@
goto begin;
}
prefetch(&skb->end);
- f->credit -= qdisc_pkt_len(skb);
+ plen = qdisc_pkt_len(skb);
+ f->credit -= plen;
if (!q->rate_enable)
goto out;
- /* Do not pace locally generated ack packets */
- if (skb_is_tcp_pure_ack(skb))
- goto out;
-
rate = q->flow_max_rate;
- if (skb->sk)
- rate = min(skb->sk->sk_pacing_rate, rate);
- if (rate <= q->low_rate_threshold) {
- f->credit = 0;
- plen = qdisc_pkt_len(skb);
- } else {
- plen = max(qdisc_pkt_len(skb), q->quantum);
- if (f->credit > 0)
- goto out;
+ /* If EDT time was provided for this skb, we need to
+ * update f->time_next_packet only if this qdisc enforces
+ * a flow max rate.
+ */
+ if (!skb->tstamp) {
+ if (skb->sk)
+ rate = min(skb->sk->sk_pacing_rate, rate);
+
+ if (rate <= q->low_rate_threshold) {
+ f->credit = 0;
+ } else {
+ plen = max(plen, q->quantum);
+ if (f->credit > 0)
+ goto out;
+ }
}
- if (rate != ~0U) {
+ if (rate != ~0UL) {
u64 len = (u64)plen * NSEC_PER_SEC;
if (likely(rate))
- do_div(len, rate);
+ len = div64_ul(len, rate);
/* Since socket rate can change later,
* clamp the delay to 1 second.
* Really, providers of too big packets should be fixed !
@@ -560,6 +602,15 @@
static void fq_flow_purge(struct fq_flow *flow)
{
+ struct rb_node *p = rb_first(&flow->t_root);
+
+ while (p) {
+ struct sk_buff *skb = rb_to_skb(p);
+
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, &flow->t_root);
+ rtnl_kfree_skbs(skb, skb);
+ }
rtnl_kfree_skbs(flow->head, flow->tail);
flow->head = NULL;
flow->qlen = 0;
@@ -696,6 +747,7 @@
[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
[TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
[TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
+ [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 },
};
static int fq_change(struct Qdisc *sch, struct nlattr *opt,
@@ -710,7 +762,8 @@
if (!opt)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
+ NULL);
if (err < 0)
return err;
@@ -748,9 +801,11 @@
pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
- if (tb[TCA_FQ_FLOW_MAX_RATE])
- q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+ if (tb[TCA_FQ_FLOW_MAX_RATE]) {
+ u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+ q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
+ }
if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
q->low_rate_threshold =
nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
@@ -773,6 +828,10 @@
if (tb[TCA_FQ_ORPHAN_MASK])
q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
+ if (tb[TCA_FQ_CE_THRESHOLD])
+ q->ce_threshold = (u64)NSEC_PER_USEC *
+ nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);
+
if (!err) {
sch_tree_unlock(sch);
err = fq_resize(sch, fq_log);
@@ -813,7 +872,7 @@
q->quantum = 2 * psched_mtu(qdisc_dev(sch));
q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
q->flow_refill_delay = msecs_to_jiffies(40);
- q->flow_max_rate = ~0U;
+ q->flow_max_rate = ~0UL;
q->time_next_delayed_flow = ~0ULL;
q->rate_enable = 1;
q->new_flows.first = NULL;
@@ -823,7 +882,11 @@
q->fq_trees_log = ilog2(1024);
q->orphan_mask = 1024 - 1;
q->low_rate_threshold = 550000 / 8;
- qdisc_watchdog_init(&q->watchdog, sch);
+
+ /* Default ce_threshold of 4294 seconds */
+ q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;
+
+ qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
if (opt)
err = fq_change(sch, opt, extack);
@@ -836,25 +899,30 @@
static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct fq_sched_data *q = qdisc_priv(sch);
+ u64 ce_threshold = q->ce_threshold;
struct nlattr *opts;
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
+ do_div(ce_threshold, NSEC_PER_USEC);
+
if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
- nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
+ nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
+ min_t(unsigned long, q->flow_max_rate, ~0U)) ||
nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
jiffies_to_usecs(q->flow_refill_delay)) ||
nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
q->low_rate_threshold) ||
+ nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
goto nla_put_failure;
@@ -873,7 +941,7 @@
st.gc_flows = q->stat_gc_flows;
st.highprio_packets = q->stat_internal_packets;
- st.tcp_retrans = q->stat_tcp_retrans;
+ st.tcp_retrans = 0;
st.throttled = q->stat_throttled;
st.flows_plimit = q->stat_flows_plimit;
st.pkts_too_long = q->stat_pkts_too_long;
@@ -884,6 +952,7 @@
st.throttled_flows = q->throttled_flows;
st.unthrottle_latency_ns = min_t(unsigned long,
q->unthrottle_latency_ns, ~0U);
+ st.ce_mark = q->stat_ce_mark;
sch_tree_unlock(sch);
return gnet_stats_copy_app(d, &st, sizeof(st));
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 6c0a9d5..c261c0a 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Fair Queue CoDel discipline
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com>
*/
@@ -49,7 +45,6 @@
struct sk_buff *tail;
struct list_head flowchain;
int deficit;
- u32 dropped; /* number of drops (or ECN marks) on this flow */
struct codel_vars cvars;
}; /* please try to keep this structure <= 64 bytes */
@@ -124,7 +119,7 @@
struct sk_buff *skb = flow->head;
flow->head = skb->next;
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
return skb;
}
@@ -177,7 +172,8 @@
__qdisc_drop(skb, to_free);
} while (++i < max_packets && len < threshold);
- flow->dropped += i;
+ /* Tell codel to increase its signal strength also */
+ flow->cvars.count += i;
q->backlogs[idx] -= len;
q->memory_usage -= mem;
sch->qstats.drops += i;
@@ -215,7 +211,6 @@
list_add_tail(&flow->flowchain, &q->new_flows);
q->new_flow_count++;
flow->deficit = q->quantum;
- flow->dropped = 0;
}
get_codel_cb(skb)->mem_usage = skb->truesize;
q->memory_usage += get_codel_cb(skb)->mem_usage;
@@ -290,7 +285,6 @@
struct sk_buff *skb;
struct fq_codel_flow *flow;
struct list_head *head;
- u32 prev_drop_count, prev_ecn_mark;
begin:
head = &q->new_flows;
@@ -307,16 +301,10 @@
goto begin;
}
- prev_drop_count = q->cstats.drop_count;
- prev_ecn_mark = q->cstats.ecn_mark;
-
skb = codel_dequeue(sch, &sch->qstats.backlog, &q->cparams,
&flow->cvars, &q->cstats, qdisc_pkt_len,
codel_get_enqueue_time, drop_func, dequeue_func);
- flow->dropped += q->cstats.drop_count - prev_drop_count;
- flow->dropped += q->cstats.ecn_mark - prev_ecn_mark;
-
if (!skb) {
/* force a pass through old_flows to prevent starvation */
if ((head == &q->new_flows) && !list_empty(&q->old_flows))
@@ -387,8 +375,8 @@
if (!opt)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy,
- NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_FQ_CODEL_MAX, opt,
+ fq_codel_policy, NULL);
if (err < 0)
return err;
if (tb[TCA_FQ_CODEL_FLOWS]) {
@@ -527,7 +515,7 @@
struct fq_codel_sched_data *q = qdisc_priv(sch);
struct nlattr *opts;
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
@@ -600,8 +588,6 @@
static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
- /* we cannot bypass queue discipline anymore */
- sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
@@ -664,7 +650,7 @@
sch_tree_unlock(sch);
}
qs.backlog = q->backlogs[idx];
- qs.drops = flow->dropped;
+ qs.drops = 0;
}
if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
return -1;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 69078c8..8769b4b 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_generic.c Generic packet scheduler routines.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Jamal Hadi Salim, <hadi@cyberus.ca> 990601
* - Ingress support
@@ -32,6 +28,7 @@
#include <net/pkt_sched.h>
#include <net/dst.h>
#include <trace/events/qdisc.h>
+#include <trace/events/net.h>
#include <net/xfrm.h>
/* Qdisc to use by default */
@@ -49,6 +46,8 @@
* - updates to tree and tree walking are only done under the rtnl mutex.
*/
+#define SKB_XOFF_MAGIC ((struct sk_buff *)1UL)
+
static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
{
const struct netdev_queue *txq = q->dev_queue;
@@ -74,7 +73,7 @@
q->q.qlen--;
}
} else {
- skb = NULL;
+ skb = SKB_XOFF_MAGIC;
}
}
@@ -118,52 +117,36 @@
spin_unlock(lock);
}
-static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
- while (skb) {
- struct sk_buff *next = skb->next;
+ spinlock_t *lock = NULL;
- __skb_queue_tail(&q->gso_skb, skb);
- q->qstats.requeues++;
- qdisc_qstats_backlog_inc(q, skb);
- q->q.qlen++; /* it's still part of the queue */
-
- skb = next;
+ if (q->flags & TCQ_F_NOLOCK) {
+ lock = qdisc_lock(q);
+ spin_lock(lock);
}
- __netif_schedule(q);
- return 0;
-}
-
-static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q)
-{
- spinlock_t *lock = qdisc_lock(q);
-
- spin_lock(lock);
while (skb) {
struct sk_buff *next = skb->next;
__skb_queue_tail(&q->gso_skb, skb);
- qdisc_qstats_cpu_requeues_inc(q);
- qdisc_qstats_cpu_backlog_inc(q, skb);
- qdisc_qstats_cpu_qlen_inc(q);
+ /* it's still part of the queue */
+ if (qdisc_is_percpu_stats(q)) {
+ qdisc_qstats_cpu_requeues_inc(q);
+ qdisc_qstats_cpu_backlog_inc(q, skb);
+ qdisc_qstats_cpu_qlen_inc(q);
+ } else {
+ q->qstats.requeues++;
+ qdisc_qstats_backlog_inc(q, skb);
+ q->q.qlen++;
+ }
skb = next;
}
- spin_unlock(lock);
-
+ if (lock)
+ spin_unlock(lock);
__netif_schedule(q);
-
- return 0;
-}
-
-static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
-{
- if (q->flags & TCQ_F_NOLOCK)
- return dev_requeue_skb_locked(skb, q);
- else
- return __dev_requeue_skb(skb, q);
}
static void try_bulk_dequeue_skb(struct Qdisc *q,
@@ -184,7 +167,7 @@
skb = nskb;
(*packets)++; /* GSO counts as one pkt */
}
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
}
/* This variant of try_bulk_dequeue_skb() makes sure
@@ -210,7 +193,7 @@
skb = nskb;
} while (++cnt < 8);
(*packets) += cnt;
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
}
/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
@@ -272,8 +255,11 @@
return skb;
skb = qdisc_dequeue_skb_bad_txq(q);
- if (unlikely(skb))
+ if (unlikely(skb)) {
+ if (skb == SKB_XOFF_MAGIC)
+ return NULL;
goto bulk;
+ }
skb = q->dequeue(q);
if (skb) {
bulk:
@@ -457,6 +443,7 @@
}
if (some_queue_timedout) {
+ trace_net_dev_xmit_timeout(dev, i);
WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
dev->name, netdev_drivername(dev), i);
dev->netdev_ops->ndo_tx_timeout(dev);
@@ -500,7 +487,7 @@
* netif_carrier_on - set carrier
* @dev: network device
*
- * Device has detected that carrier.
+ * Device has detected acquisition of carrier.
*/
void netif_carrier_on(struct net_device *dev)
{
@@ -559,7 +546,7 @@
};
static struct netdev_queue noop_netdev_queue = {
- .qdisc = &noop_qdisc,
+ RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc),
.qdisc_sleeping = &noop_qdisc,
};
@@ -572,6 +559,18 @@
.dev_queue = &noop_netdev_queue,
.running = SEQCNT_ZERO(noop_qdisc.running),
.busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
+ .gso_skb = {
+ .next = (struct sk_buff *)&noop_qdisc.gso_skb,
+ .prev = (struct sk_buff *)&noop_qdisc.gso_skb,
+ .qlen = 0,
+ .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock),
+ },
+ .skb_bad_txq = {
+ .next = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
+ .prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
+ .qlen = 0,
+ .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
+ },
};
EXPORT_SYMBOL(noop_qdisc);
@@ -630,14 +629,14 @@
err = skb_array_produce(q, skb);
- if (unlikely(err))
- return qdisc_drop_cpu(skb, qdisc, to_free);
+ if (unlikely(err)) {
+ if (qdisc_is_percpu_stats(qdisc))
+ return qdisc_drop_cpu(skb, qdisc, to_free);
+ else
+ return qdisc_drop(skb, qdisc, to_free);
+ }
- qdisc_qstats_cpu_qlen_inc(qdisc);
- /* Note: skb can not be used after skb_array_produce(),
- * so we better not use qdisc_qstats_cpu_backlog_inc()
- */
- this_cpu_add(qdisc->cpu_qstats->backlog, pkt_len);
+ qdisc_update_stats_at_enqueue(qdisc, pkt_len);
return NET_XMIT_SUCCESS;
}
@@ -656,9 +655,9 @@
skb = __skb_array_consume(q);
}
if (likely(skb)) {
- qdisc_qstats_cpu_backlog_dec(qdisc, skb);
- qdisc_bstats_cpu_update(qdisc, skb);
- qdisc_qstats_cpu_qlen_dec(qdisc);
+ qdisc_update_stats_at_dequeue(qdisc, skb);
+ } else {
+ qdisc->empty = true;
}
return skb;
@@ -698,11 +697,14 @@
kfree_skb(skb);
}
- for_each_possible_cpu(i) {
- struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i);
+ if (qdisc_is_percpu_stats(qdisc)) {
+ for_each_possible_cpu(i) {
+ struct gnet_stats_queue *q;
- q->backlog = 0;
- q->qlen = 0;
+ q = per_cpu_ptr(qdisc->cpu_qstats, i);
+ q->backlog = 0;
+ q->qlen = 0;
+ }
}
}
@@ -797,9 +799,6 @@
};
EXPORT_SYMBOL(pfifo_fast_ops);
-static struct lock_class_key qdisc_tx_busylock;
-static struct lock_class_key qdisc_running_key;
-
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
const struct Qdisc_ops *ops,
struct netlink_ext_ack *extack)
@@ -852,26 +851,25 @@
}
spin_lock_init(&sch->busylock);
- lockdep_set_class(&sch->busylock,
- dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
-
/* seqlock has the same scope of busylock, for NOLOCK qdisc */
spin_lock_init(&sch->seqlock);
- lockdep_set_class(&sch->busylock,
- dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
-
seqcount_init(&sch->running);
- lockdep_set_class(&sch->running,
- dev->qdisc_running_key ?: &qdisc_running_key);
sch->ops = ops;
sch->flags = ops->static_flags;
sch->enqueue = ops->enqueue;
sch->dequeue = ops->dequeue;
sch->dev_queue = dev_queue;
+ sch->empty = true;
dev_hold(dev);
refcount_set(&sch->refcnt, 1);
+ if (sch != &noop_qdisc) {
+ lockdep_set_class(&sch->busylock, &dev->qdisc_tx_busylock_key);
+ lockdep_set_class(&sch->seqlock, &dev->qdisc_tx_busylock_key);
+ lockdep_set_class(&sch->running, &dev->qdisc_running_key);
+ }
+
return sch;
errout1:
kfree(p);
@@ -901,7 +899,7 @@
if (!ops->init || ops->init(sch, NULL, extack) == 0)
return sch;
- qdisc_destroy(sch);
+ qdisc_put(sch);
return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);
@@ -941,15 +939,18 @@
kfree((char *) qdisc - qdisc->padded);
}
-void qdisc_destroy(struct Qdisc *qdisc)
+static void qdisc_free_cb(struct rcu_head *head)
+{
+ struct Qdisc *q = container_of(head, struct Qdisc, rcu);
+
+ qdisc_free(q);
+}
+
+static void qdisc_destroy(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;
struct sk_buff *skb, *tmp;
- if (qdisc->flags & TCQ_F_BUILTIN ||
- !refcount_dec_and_test(&qdisc->refcnt))
- return;
-
#ifdef CONFIG_NET_SCHED
qdisc_hash_del(qdisc);
@@ -974,9 +975,37 @@
kfree_skb_list(skb);
}
- qdisc_free(qdisc);
+ call_rcu(&qdisc->rcu, qdisc_free_cb);
}
-EXPORT_SYMBOL(qdisc_destroy);
+
+void qdisc_put(struct Qdisc *qdisc)
+{
+ if (!qdisc)
+ return;
+
+ if (qdisc->flags & TCQ_F_BUILTIN ||
+ !refcount_dec_and_test(&qdisc->refcnt))
+ return;
+
+ qdisc_destroy(qdisc);
+}
+EXPORT_SYMBOL(qdisc_put);
+
+/* Version of qdisc_put() that is called with rtnl mutex unlocked.
+ * Intended to be used as optimization, this function only takes rtnl lock if
+ * qdisc reference counter reached zero.
+ */
+
+void qdisc_put_unlocked(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN ||
+ !refcount_dec_and_rtnl_lock(&qdisc->refcnt))
+ return;
+
+ qdisc_destroy(qdisc);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(qdisc_put_unlocked);
/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
@@ -1009,6 +1038,8 @@
if (dev->priv_flags & IFF_NO_QUEUE)
ops = &noqueue_qdisc_ops;
+ else if(dev->type == ARPHRD_CAN)
+ ops = &pfifo_fast_ops;
qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
if (!qdisc) {
@@ -1245,8 +1276,6 @@
rcu_assign_pointer(dev_queue->qdisc, qdisc);
dev_queue->qdisc_sleeping = qdisc;
- __skb_queue_head_init(&qdisc->gso_skb);
- __skb_queue_head_init(&qdisc->skb_bad_txq);
}
void dev_init_scheduler(struct net_device *dev)
@@ -1270,7 +1299,7 @@
rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
dev_queue->qdisc_sleeping = qdisc_default;
- qdisc_destroy(qdisc);
+ qdisc_put(qdisc);
}
}
@@ -1279,7 +1308,7 @@
netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
if (dev_ingress_queue(dev))
shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
- qdisc_destroy(dev->qdisc);
+ qdisc_put(dev->qdisc);
dev->qdisc = &noop_qdisc;
WARN_ON(timer_pending(&dev->watchdog_timer));
@@ -1328,13 +1357,17 @@
void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
struct tcf_proto *tp_head)
{
- struct mini_Qdisc *miniq_old = rtnl_dereference(*miniqp->p_miniq);
+ /* Protected with chain0->filter_chain_lock.
+ * Can't access chain directly because tp_head can be NULL.
+ */
+ struct mini_Qdisc *miniq_old =
+ rcu_dereference_protected(*miniqp->p_miniq, 1);
struct mini_Qdisc *miniq;
if (!tp_head) {
RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
/* Wait for flying RCU callback before it is freed. */
- rcu_barrier_bh();
+ rcu_barrier();
return;
}
@@ -1342,10 +1375,10 @@
&miniqp->miniq1 : &miniqp->miniq2;
/* We need to make sure that readers won't see the miniq
- * we are about to modify. So wait until previous call_rcu_bh callback
+ * we are about to modify. So wait until previous call_rcu callback
* is done.
*/
- rcu_barrier_bh();
+ rcu_barrier();
miniq->filter_list = tp_head;
rcu_assign_pointer(*miniqp->p_miniq, miniq);
@@ -1354,7 +1387,7 @@
* block potential new user of miniq_old until all readers
* are not seeing it.
*/
- call_rcu_bh(&miniq_old->rcu, mini_qdisc_rcu_func);
+ call_rcu(&miniq_old->rcu, mini_qdisc_rcu_func);
}
EXPORT_SYMBOL(mini_qdisc_pair_swap);
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 4a042ab..8599c6f 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -1,12 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_gred.c Generic Random Early Detection queue.
*
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002
*
* 991129: - Bug fix with grio mode
@@ -23,19 +18,23 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
#include <net/pkt_sched.h>
#include <net/red.h>
#define GRED_DEF_PRIO (MAX_DPs / 2)
#define GRED_VQ_MASK (MAX_DPs - 1)
+#define GRED_VQ_RED_FLAGS (TC_RED_ECN | TC_RED_HARDDROP)
+
struct gred_sched_data;
struct gred_sched;
struct gred_sched_data {
u32 limit; /* HARD maximal queue length */
u32 DP; /* the drop parameters */
- u32 bytesin; /* bytes seen on virtualQ so far*/
+ u32 red_flags; /* virtualQ version of red_flags */
+ u64 bytesin; /* bytes seen on virtualQ so far*/
u32 packetsin; /* packets seen on virtualQ so far*/
u32 backlog; /* bytes on the virtualQ */
u8 prio; /* the prio of this vq */
@@ -139,14 +138,27 @@
table->wred_set.qidlestart = q->vars.qidlestart;
}
-static inline int gred_use_ecn(struct gred_sched *t)
+static int gred_use_ecn(struct gred_sched_data *q)
{
- return t->red_flags & TC_RED_ECN;
+ return q->red_flags & TC_RED_ECN;
}
-static inline int gred_use_harddrop(struct gred_sched *t)
+static int gred_use_harddrop(struct gred_sched_data *q)
{
- return t->red_flags & TC_RED_HARDDROP;
+ return q->red_flags & TC_RED_HARDDROP;
+}
+
+static bool gred_per_vq_red_flags_used(struct gred_sched *table)
+{
+ unsigned int i;
+
+ /* Local per-vq flags couldn't have been set unless global are 0 */
+ if (table->red_flags)
+ return false;
+ for (i = 0; i < MAX_DPs; i++)
+ if (table->tab[i] && table->tab[i]->red_flags)
+ return true;
+ return false;
}
static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -212,7 +224,7 @@
case RED_PROB_MARK:
qdisc_qstats_overlimit(sch);
- if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
+ if (!gred_use_ecn(q) || !INET_ECN_set_ce(skb)) {
q->stats.prob_drop++;
goto congestion_drop;
}
@@ -222,7 +234,7 @@
case RED_HARD_MARK:
qdisc_qstats_overlimit(sch);
- if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
+ if (gred_use_harddrop(q) || !gred_use_ecn(q) ||
!INET_ECN_set_ce(skb)) {
q->stats.forced_drop++;
goto congestion_drop;
@@ -295,15 +307,103 @@
}
}
+static void gred_offload(struct Qdisc *sch, enum tc_gred_command command)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_gred_qopt_offload opt = {
+ .command = command,
+ .handle = sch->handle,
+ .parent = sch->parent,
+ };
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ if (command == TC_GRED_REPLACE) {
+ unsigned int i;
+
+ opt.set.grio_on = gred_rio_mode(table);
+ opt.set.wred_on = gred_wred_mode(table);
+ opt.set.dp_cnt = table->DPs;
+ opt.set.dp_def = table->def;
+
+ for (i = 0; i < table->DPs; i++) {
+ struct gred_sched_data *q = table->tab[i];
+
+ if (!q)
+ continue;
+ opt.set.tab[i].present = true;
+ opt.set.tab[i].limit = q->limit;
+ opt.set.tab[i].prio = q->prio;
+ opt.set.tab[i].min = q->parms.qth_min >> q->parms.Wlog;
+ opt.set.tab[i].max = q->parms.qth_max >> q->parms.Wlog;
+ opt.set.tab[i].is_ecn = gred_use_ecn(q);
+ opt.set.tab[i].is_harddrop = gred_use_harddrop(q);
+ opt.set.tab[i].probability = q->parms.max_P;
+ opt.set.tab[i].backlog = &q->backlog;
+ }
+ opt.set.qstats = &sch->qstats;
+ }
+
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, &opt);
+}
+
+static int gred_offload_dump_stats(struct Qdisc *sch)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ struct tc_gred_qopt_offload *hw_stats;
+ unsigned int i;
+ int ret;
+
+ hw_stats = kzalloc(sizeof(*hw_stats), GFP_KERNEL);
+ if (!hw_stats)
+ return -ENOMEM;
+
+ hw_stats->command = TC_GRED_STATS;
+ hw_stats->handle = sch->handle;
+ hw_stats->parent = sch->parent;
+
+ for (i = 0; i < MAX_DPs; i++)
+ if (table->tab[i])
+ hw_stats->stats.xstats[i] = &table->tab[i]->stats;
+
+ ret = qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_GRED, hw_stats);
+ /* Even if driver returns failure adjust the stats - in case offload
+ * ended but driver still wants to adjust the values.
+ */
+ for (i = 0; i < MAX_DPs; i++) {
+ if (!table->tab[i])
+ continue;
+ table->tab[i]->packetsin += hw_stats->stats.bstats[i].packets;
+ table->tab[i]->bytesin += hw_stats->stats.bstats[i].bytes;
+ table->tab[i]->backlog += hw_stats->stats.qstats[i].backlog;
+
+ _bstats_update(&sch->bstats,
+ hw_stats->stats.bstats[i].bytes,
+ hw_stats->stats.bstats[i].packets);
+ sch->qstats.qlen += hw_stats->stats.qstats[i].qlen;
+ sch->qstats.backlog += hw_stats->stats.qstats[i].backlog;
+ sch->qstats.drops += hw_stats->stats.qstats[i].drops;
+ sch->qstats.requeues += hw_stats->stats.qstats[i].requeues;
+ sch->qstats.overlimits += hw_stats->stats.qstats[i].overlimits;
+ }
+
+ kfree(hw_stats);
+ return ret;
+}
+
static inline void gred_destroy_vq(struct gred_sched_data *q)
{
kfree(q);
}
-static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
+static int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps,
+ struct netlink_ext_ack *extack)
{
struct gred_sched *table = qdisc_priv(sch);
struct tc_gred_sopt *sopt;
+ bool red_flags_changed;
int i;
if (!dps)
@@ -311,13 +411,28 @@
sopt = nla_data(dps);
- if (sopt->DPs > MAX_DPs || sopt->DPs == 0 ||
- sopt->def_DP >= sopt->DPs)
+ if (sopt->DPs > MAX_DPs) {
+ NL_SET_ERR_MSG_MOD(extack, "number of virtual queues too high");
return -EINVAL;
+ }
+ if (sopt->DPs == 0) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "number of virtual queues can't be 0");
+ return -EINVAL;
+ }
+ if (sopt->def_DP >= sopt->DPs) {
+ NL_SET_ERR_MSG_MOD(extack, "default virtual queue above virtual queue count");
+ return -EINVAL;
+ }
+ if (sopt->flags && gred_per_vq_red_flags_used(table)) {
+ NL_SET_ERR_MSG_MOD(extack, "can't set per-Qdisc RED flags when per-virtual queue flags are used");
+ return -EINVAL;
+ }
sch_tree_lock(sch);
table->DPs = sopt->DPs;
table->def = sopt->def_DP;
+ red_flags_changed = table->red_flags != sopt->flags;
table->red_flags = sopt->flags;
/*
@@ -337,6 +452,12 @@
gred_disable_wred_mode(table);
}
+ if (red_flags_changed)
+ for (i = 0; i < table->DPs; i++)
+ if (table->tab[i])
+ table->tab[i]->red_flags =
+ table->red_flags & GRED_VQ_RED_FLAGS;
+
for (i = table->DPs; i < MAX_DPs; i++) {
if (table->tab[i]) {
pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n",
@@ -346,25 +467,30 @@
}
}
+ gred_offload(sch, TC_GRED_REPLACE);
return 0;
}
static inline int gred_change_vq(struct Qdisc *sch, int dp,
struct tc_gred_qopt *ctl, int prio,
u8 *stab, u32 max_P,
- struct gred_sched_data **prealloc)
+ struct gred_sched_data **prealloc,
+ struct netlink_ext_ack *extack)
{
struct gred_sched *table = qdisc_priv(sch);
struct gred_sched_data *q = table->tab[dp];
- if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog))
+ if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) {
+ NL_SET_ERR_MSG_MOD(extack, "invalid RED parameters");
return -EINVAL;
+ }
if (!q) {
table->tab[dp] = q = *prealloc;
*prealloc = NULL;
if (!q)
return -ENOMEM;
+ q->red_flags = table->red_flags & GRED_VQ_RED_FLAGS;
}
q->DP = dp;
@@ -384,14 +510,128 @@
return 0;
}
+static const struct nla_policy gred_vq_policy[TCA_GRED_VQ_MAX + 1] = {
+ [TCA_GRED_VQ_DP] = { .type = NLA_U32 },
+ [TCA_GRED_VQ_FLAGS] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy gred_vqe_policy[TCA_GRED_VQ_ENTRY_MAX + 1] = {
+ [TCA_GRED_VQ_ENTRY] = { .type = NLA_NESTED },
+};
+
static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = {
[TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) },
[TCA_GRED_STAB] = { .len = 256 },
[TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) },
[TCA_GRED_MAX_P] = { .type = NLA_U32 },
[TCA_GRED_LIMIT] = { .type = NLA_U32 },
+ [TCA_GRED_VQ_LIST] = { .type = NLA_NESTED },
};
+static void gred_vq_apply(struct gred_sched *table, const struct nlattr *entry)
+{
+ struct nlattr *tb[TCA_GRED_VQ_MAX + 1];
+ u32 dp;
+
+ nla_parse_nested_deprecated(tb, TCA_GRED_VQ_MAX, entry,
+ gred_vq_policy, NULL);
+
+ dp = nla_get_u32(tb[TCA_GRED_VQ_DP]);
+
+ if (tb[TCA_GRED_VQ_FLAGS])
+ table->tab[dp]->red_flags = nla_get_u32(tb[TCA_GRED_VQ_FLAGS]);
+}
+
+static void gred_vqs_apply(struct gred_sched *table, struct nlattr *vqs)
+{
+ const struct nlattr *attr;
+ int rem;
+
+ nla_for_each_nested(attr, vqs, rem) {
+ switch (nla_type(attr)) {
+ case TCA_GRED_VQ_ENTRY:
+ gred_vq_apply(table, attr);
+ break;
+ }
+ }
+}
+
+static int gred_vq_validate(struct gred_sched *table, u32 cdp,
+ const struct nlattr *entry,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_GRED_VQ_MAX + 1];
+ int err;
+ u32 dp;
+
+ err = nla_parse_nested_deprecated(tb, TCA_GRED_VQ_MAX, entry,
+ gred_vq_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_GRED_VQ_DP]) {
+ NL_SET_ERR_MSG_MOD(extack, "Virtual queue with no index specified");
+ return -EINVAL;
+ }
+ dp = nla_get_u32(tb[TCA_GRED_VQ_DP]);
+ if (dp >= table->DPs) {
+ NL_SET_ERR_MSG_MOD(extack, "Virtual queue with index out of bounds");
+ return -EINVAL;
+ }
+ if (dp != cdp && !table->tab[dp]) {
+ NL_SET_ERR_MSG_MOD(extack, "Virtual queue not yet instantiated");
+ return -EINVAL;
+ }
+
+ if (tb[TCA_GRED_VQ_FLAGS]) {
+ u32 red_flags = nla_get_u32(tb[TCA_GRED_VQ_FLAGS]);
+
+ if (table->red_flags && table->red_flags != red_flags) {
+ NL_SET_ERR_MSG_MOD(extack, "can't change per-virtual queue RED flags when per-Qdisc flags are used");
+ return -EINVAL;
+ }
+ if (red_flags & ~GRED_VQ_RED_FLAGS) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "invalid RED flags specified");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int gred_vqs_validate(struct gred_sched *table, u32 cdp,
+ struct nlattr *vqs, struct netlink_ext_ack *extack)
+{
+ const struct nlattr *attr;
+ int rem, err;
+
+ err = nla_validate_nested_deprecated(vqs, TCA_GRED_VQ_ENTRY_MAX,
+ gred_vqe_policy, extack);
+ if (err < 0)
+ return err;
+
+ nla_for_each_nested(attr, vqs, rem) {
+ switch (nla_type(attr)) {
+ case TCA_GRED_VQ_ENTRY:
+ err = gred_vq_validate(table, cdp, attr, extack);
+ if (err)
+ return err;
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "GRED_VQ_LIST can contain only entry attributes");
+ return -EINVAL;
+ }
+ }
+
+ if (rem > 0) {
+ NL_SET_ERR_MSG_MOD(extack, "Trailing data after parsing virtual queue list");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int gred_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -406,29 +646,40 @@
if (opt == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_GRED_MAX, opt, gred_policy,
+ extack);
if (err < 0)
return err;
if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) {
if (tb[TCA_GRED_LIMIT] != NULL)
sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]);
- return gred_change_table_def(sch, tb[TCA_GRED_DPS]);
+ return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack);
}
if (tb[TCA_GRED_PARMS] == NULL ||
tb[TCA_GRED_STAB] == NULL ||
- tb[TCA_GRED_LIMIT] != NULL)
+ tb[TCA_GRED_LIMIT] != NULL) {
+ NL_SET_ERR_MSG_MOD(extack, "can't configure Qdisc and virtual queue at the same time");
return -EINVAL;
+ }
max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0;
- err = -EINVAL;
ctl = nla_data(tb[TCA_GRED_PARMS]);
stab = nla_data(tb[TCA_GRED_STAB]);
- if (ctl->DP >= table->DPs)
- goto errout;
+ if (ctl->DP >= table->DPs) {
+ NL_SET_ERR_MSG_MOD(extack, "virtual queue index above virtual queue count");
+ return -EINVAL;
+ }
+
+ if (tb[TCA_GRED_VQ_LIST]) {
+ err = gred_vqs_validate(table, ctl->DP, tb[TCA_GRED_VQ_LIST],
+ extack);
+ if (err)
+ return err;
+ }
if (gred_rio_mode(table)) {
if (ctl->prio == 0) {
@@ -448,9 +699,13 @@
prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
sch_tree_lock(sch);
- err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc);
+ err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc,
+ extack);
if (err < 0)
- goto errout_locked;
+ goto err_unlock_free;
+
+ if (tb[TCA_GRED_VQ_LIST])
+ gred_vqs_apply(table, tb[TCA_GRED_VQ_LIST]);
if (gred_rio_mode(table)) {
gred_disable_wred_mode(table);
@@ -458,12 +713,15 @@
gred_enable_wred_mode(table);
}
- err = 0;
-
-errout_locked:
sch_tree_unlock(sch);
kfree(prealloc);
-errout:
+
+ gred_offload(sch, TC_GRED_REPLACE);
+ return 0;
+
+err_unlock_free:
+ sch_tree_unlock(sch);
+ kfree(prealloc);
return err;
}
@@ -476,12 +734,16 @@
if (!opt)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_GRED_MAX, opt, gred_policy,
+ extack);
if (err < 0)
return err;
- if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB])
+ if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "virtual queue configuration can't be specified at initialization time");
return -EINVAL;
+ }
if (tb[TCA_GRED_LIMIT])
sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]);
@@ -489,13 +751,13 @@
sch->limit = qdisc_dev(sch)->tx_queue_len
* psched_mtu(qdisc_dev(sch));
- return gred_change_table_def(sch, tb[TCA_GRED_DPS]);
+ return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack);
}
static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct gred_sched *table = qdisc_priv(sch);
- struct nlattr *parms, *opts = NULL;
+ struct nlattr *parms, *vqs, *opts = NULL;
int i;
u32 max_p[MAX_DPs];
struct tc_gred_sopt sopt = {
@@ -505,7 +767,10 @@
.flags = table->red_flags,
};
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (gred_offload_dump_stats(sch))
+ goto nla_put_failure;
+
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_GRED_DPS, sizeof(sopt), &sopt))
@@ -522,7 +787,8 @@
if (nla_put_u32(skb, TCA_GRED_LIMIT, sch->limit))
goto nla_put_failure;
- parms = nla_nest_start(skb, TCA_GRED_PARMS);
+ /* Old style all-in-one dump of VQs */
+ parms = nla_nest_start_noflag(skb, TCA_GRED_PARMS);
if (parms == NULL)
goto nla_put_failure;
@@ -572,6 +838,58 @@
nla_nest_end(skb, parms);
+ /* Dump the VQs again, in more structured way */
+ vqs = nla_nest_start_noflag(skb, TCA_GRED_VQ_LIST);
+ if (!vqs)
+ goto nla_put_failure;
+
+ for (i = 0; i < MAX_DPs; i++) {
+ struct gred_sched_data *q = table->tab[i];
+ struct nlattr *vq;
+
+ if (!q)
+ continue;
+
+ vq = nla_nest_start_noflag(skb, TCA_GRED_VQ_ENTRY);
+ if (!vq)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_GRED_VQ_DP, q->DP))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_GRED_VQ_FLAGS, q->red_flags))
+ goto nla_put_failure;
+
+ /* Stats */
+ if (nla_put_u64_64bit(skb, TCA_GRED_VQ_STAT_BYTES, q->bytesin,
+ TCA_GRED_VQ_PAD))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PACKETS, q->packetsin))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_GRED_VQ_STAT_BACKLOG,
+ gred_backlog(table, q, sch)))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PROB_DROP,
+ q->stats.prob_drop))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PROB_MARK,
+ q->stats.prob_mark))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_GRED_VQ_STAT_FORCED_DROP,
+ q->stats.forced_drop))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_GRED_VQ_STAT_FORCED_MARK,
+ q->stats.forced_mark))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PDROP, q->stats.pdrop))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_GRED_VQ_STAT_OTHER, q->stats.other))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, vq);
+ }
+ nla_nest_end(skb, vqs);
+
return nla_nest_end(skb, opts);
nla_put_failure:
@@ -588,6 +906,7 @@
if (table->tab[i])
gred_destroy_vq(table->tab[i]);
}
+ gred_offload(sch, TC_GRED_DESTROY);
}
static struct Qdisc_ops gred_qdisc_ops __read_mostly = {
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 3278a76..433f219 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -845,16 +845,6 @@
}
static void
-hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl)
-{
- unsigned int len = cl->qdisc->q.qlen;
- unsigned int backlog = cl->qdisc->qstats.backlog;
-
- qdisc_reset(cl->qdisc);
- qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
-}
-
-static void
hfsc_adjust_levels(struct hfsc_class *cl)
{
struct hfsc_class *p;
@@ -936,7 +926,8 @@
if (opt == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_HFSC_MAX, opt, hfsc_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_HFSC_MAX, opt, hfsc_policy,
+ NULL);
if (err < 0)
return err;
@@ -1076,7 +1067,7 @@
qdisc_class_hash_insert(&q->clhash, &cl->cl_common);
list_add_tail(&cl->siblings, &parent->children);
if (parent->level == 0)
- hfsc_purge_queue(sch, parent);
+ qdisc_purge_queue(parent->qdisc);
hfsc_adjust_levels(parent);
sch_tree_unlock(sch);
@@ -1092,7 +1083,7 @@
struct hfsc_sched *q = qdisc_priv(sch);
tcf_block_put(cl->block);
- qdisc_destroy(cl->qdisc);
+ qdisc_put(cl->qdisc);
gen_kill_estimator(&cl->rate_est);
if (cl != &q->root)
kfree(cl);
@@ -1112,7 +1103,7 @@
list_del(&cl->siblings);
hfsc_adjust_levels(cl->cl_parent);
- hfsc_purge_queue(sch, cl);
+ qdisc_purge_queue(cl->qdisc);
qdisc_class_hash_remove(&q->clhash, &cl->cl_common);
sch_tree_unlock(sch);
@@ -1310,7 +1301,7 @@
if (cl->level == 0)
tcm->tcm_info = cl->qdisc->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (hfsc_dump_curves(skb, cl) < 0)
@@ -1328,8 +1319,9 @@
{
struct hfsc_class *cl = (struct hfsc_class *)arg;
struct tc_hfsc_stats xstats;
+ __u32 qlen;
- cl->qstats.backlog = cl->qdisc->qstats.backlog;
+ qdisc_qstats_qlen_backlog(cl->qdisc, &qlen, &cl->qstats.backlog);
xstats.level = cl->level;
xstats.period = cl->cl_vtperiod;
xstats.work = cl->cl_total;
@@ -1337,7 +1329,7 @@
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
- gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0)
+ gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
return -1;
return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
@@ -1539,8 +1531,10 @@
static int
hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
+ unsigned int len = qdisc_pkt_len(skb);
struct hfsc_class *cl;
int uninitialized_var(err);
+ bool first;
cl = hfsc_classify(skb, sch, &err);
if (cl == NULL) {
@@ -1550,6 +1544,7 @@
return err;
}
+ first = !cl->qdisc->q.qlen;
err = qdisc_enqueue(skb, cl->qdisc, to_free);
if (unlikely(err != NET_XMIT_SUCCESS)) {
if (net_xmit_drop_count(err)) {
@@ -1559,9 +1554,7 @@
return err;
}
- if (cl->qdisc->q.qlen == 1) {
- unsigned int len = qdisc_pkt_len(skb);
-
+ if (first) {
if (cl->cl_flags & HFSC_RSC)
init_ed(cl, len);
if (cl->cl_flags & HFSC_FSC)
@@ -1576,7 +1569,7 @@
}
- qdisc_qstats_backlog_inc(sch, skb);
+ sch->qstats.backlog += len;
sch->q.qlen++;
return NET_XMIT_SUCCESS;
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index c3a8388..be35f03 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -1,14 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF)
*
* Copyright (C) 2013 Terry Lam <vtlam@google.com>
* Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com>
*/
-#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
+#include <linux/siphash.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
@@ -125,7 +126,7 @@
struct hhf_sched_data {
struct wdrr_bucket buckets[WDRR_BUCKET_CNT];
- u32 perturbation; /* hash perturbation */
+ siphash_key_t perturbation; /* hash perturbation */
u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
u32 drop_overlimit; /* number of times max qdisc packet
* limit was hit
@@ -263,7 +264,7 @@
}
/* Get hashed flow-id of the skb. */
- hash = skb_get_hash_perturb(skb, q->perturbation);
+ hash = skb_get_hash_perturb(skb, &q->perturbation);
/* Check if this packet belongs to an already established HH flow. */
flow_pos = hash & HHF_BIT_MASK;
@@ -330,7 +331,7 @@
struct sk_buff *skb = bucket->head;
bucket->head = skb->next;
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
return skb;
}
@@ -518,7 +519,8 @@
if (!opt)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_HHF_MAX, opt, hhf_policy,
+ NULL);
if (err < 0)
return err;
@@ -529,7 +531,7 @@
new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]);
non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight;
- if (non_hh_quantum > INT_MAX)
+ if (non_hh_quantum == 0 || non_hh_quantum > INT_MAX)
return -EINVAL;
sch_tree_lock(sch);
@@ -580,7 +582,7 @@
sch->limit = 1000;
q->quantum = psched_mtu(qdisc_dev(sch));
- q->perturbation = prandom_u32();
+ get_random_bytes(&q->perturbation, sizeof(q->perturbation));
INIT_LIST_HEAD(&q->new_buckets);
INIT_LIST_HEAD(&q->old_buckets);
@@ -654,7 +656,7 @@
struct hhf_sched_data *q = qdisc_priv(sch);
struct nlattr *opts;
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 43c4bfe..8184c87 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_htb.c Hierarchical token bucket, feed tree version
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Martin Devera, <devik@cdi.cz>
*
* Credits (in time order) for older HTB versions:
@@ -132,7 +128,7 @@
struct htb_class_inner {
struct htb_prio clprio[TC_HTB_NUMPRIO];
} inner;
- } un;
+ };
s64 pq_key;
int prio_activity; /* for which prios are we active */
@@ -165,7 +161,8 @@
/* non shaped skbs; let them go directly thru */
struct qdisc_skb_head direct_queue;
- long direct_pkts;
+ u32 direct_pkts;
+ u32 overlimits;
struct qdisc_watchdog watchdog;
@@ -411,13 +408,13 @@
int prio = ffz(~m);
m &= ~(1 << prio);
- if (p->un.inner.clprio[prio].feed.rb_node)
+ if (p->inner.clprio[prio].feed.rb_node)
/* parent already has its feed in use so that
* reset bit in mask as parent is already ok
*/
mask &= ~(1 << prio);
- htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
+ htb_add_to_id_tree(&p->inner.clprio[prio].feed, cl, prio);
}
p->prio_activity |= mask;
cl = p;
@@ -447,19 +444,19 @@
int prio = ffz(~m);
m &= ~(1 << prio);
- if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
+ if (p->inner.clprio[prio].ptr == cl->node + prio) {
/* we are removing child which is pointed to from
* parent feed - forget the pointer but remember
* classid
*/
- p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
- p->un.inner.clprio[prio].ptr = NULL;
+ p->inner.clprio[prio].last_ptr_id = cl->common.classid;
+ p->inner.clprio[prio].ptr = NULL;
}
htb_safe_rb_erase(cl->node + prio,
- &p->un.inner.clprio[prio].feed);
+ &p->inner.clprio[prio].feed);
- if (!p->un.inner.clprio[prio].feed.rb_node)
+ if (!p->inner.clprio[prio].feed.rb_node)
mask |= 1 << prio;
}
@@ -533,8 +530,10 @@
if (new_mode == cl->cmode)
return;
- if (new_mode == HTB_CANT_SEND)
+ if (new_mode == HTB_CANT_SEND) {
cl->overlimits++;
+ q->overlimits++;
+ }
if (cl->prio_activity) { /* not necessary: speed optimization */
if (cl->cmode != HTB_CANT_SEND)
@@ -555,7 +554,7 @@
*/
static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
{
- WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
+ WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen);
if (!cl->prio_activity) {
cl->prio_activity = 1 << cl->prio;
@@ -577,33 +576,18 @@
cl->prio_activity = 0;
}
-static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
- struct qdisc_skb_head *qh)
-{
- struct sk_buff *last = qh->tail;
-
- if (last) {
- skb->next = NULL;
- last->next = skb;
- qh->tail = skb;
- } else {
- qh->tail = skb;
- qh->head = skb;
- }
- qh->qlen++;
-}
-
static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
int uninitialized_var(ret);
+ unsigned int len = qdisc_pkt_len(skb);
struct htb_sched *q = qdisc_priv(sch);
struct htb_class *cl = htb_classify(skb, sch, &ret);
if (cl == HTB_DIRECT) {
/* enqueue to helper queue */
if (q->direct_queue.qlen < q->direct_qlen) {
- htb_enqueue_tail(skb, sch, &q->direct_queue);
+ __qdisc_enqueue_tail(skb, &q->direct_queue);
q->direct_pkts++;
} else {
return qdisc_drop(skb, sch, to_free);
@@ -615,7 +599,7 @@
__qdisc_drop(skb, to_free);
return ret;
#endif
- } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q,
+ } else if ((ret = qdisc_enqueue(skb, cl->leaf.q,
to_free)) != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret)) {
qdisc_qstats_drop(sch);
@@ -626,7 +610,7 @@
htb_activate(q, cl);
}
- qdisc_qstats_backlog_inc(sch, skb);
+ sch->qstats.backlog += len;
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -823,7 +807,7 @@
cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
if (!cl->level)
return cl;
- clp = &cl->un.inner.clprio[prio];
+ clp = &cl->inner.clprio[prio];
(++sp)->root = clp->feed.rb_node;
sp->pptr = &clp->ptr;
sp->pid = &clp->last_ptr_id;
@@ -857,7 +841,7 @@
* graft operation on the leaf since last dequeue;
* simply deactivate and skip such class
*/
- if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
+ if (unlikely(cl->leaf.q->q.qlen == 0)) {
struct htb_class *next;
htb_deactivate(q, cl);
@@ -873,12 +857,12 @@
goto next;
}
- skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
+ skb = cl->leaf.q->dequeue(cl->leaf.q);
if (likely(skb != NULL))
break;
- qdisc_warn_nonwc("htb", cl->un.leaf.q);
- htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
+ qdisc_warn_nonwc("htb", cl->leaf.q);
+ htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr:
&q->hlevel[0].hprio[prio].ptr);
cl = htb_lookup_leaf(hprio, prio);
@@ -886,16 +870,16 @@
if (likely(skb != NULL)) {
bstats_update(&cl->bstats, skb);
- cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
- if (cl->un.leaf.deficit[level] < 0) {
- cl->un.leaf.deficit[level] += cl->quantum;
- htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
+ cl->leaf.deficit[level] -= qdisc_pkt_len(skb);
+ if (cl->leaf.deficit[level] < 0) {
+ cl->leaf.deficit[level] += cl->quantum;
+ htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr :
&q->hlevel[0].hprio[prio].ptr);
}
/* this used to be after charge_class but this constelation
* gives us slightly better performance
*/
- if (!cl->un.leaf.q->q.qlen)
+ if (!cl->leaf.q->q.qlen)
htb_deactivate(q, cl);
htb_charge_class(q, cl, level, skb);
}
@@ -952,7 +936,6 @@
goto ok;
}
}
- qdisc_qstats_overlimit(sch);
if (likely(next_event > q->now))
qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
else
@@ -972,10 +955,10 @@
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
if (cl->level)
- memset(&cl->un.inner, 0, sizeof(cl->un.inner));
+ memset(&cl->inner, 0, sizeof(cl->inner));
else {
- if (cl->un.leaf.q)
- qdisc_reset(cl->un.leaf.q);
+ if (cl->leaf.q)
+ qdisc_reset(cl->leaf.q);
}
cl->prio_activity = 0;
cl->cmode = HTB_CAN_SEND;
@@ -1027,7 +1010,8 @@
if (err)
return err;
- err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
+ NULL);
if (err < 0)
return err;
@@ -1062,6 +1046,7 @@
struct nlattr *nest;
struct tc_htb_glob gopt;
+ sch->qstats.overlimits = q->overlimits;
/* Its safe to not acquire qdisc lock. As we hold RTNL,
* no change can happen on the qdisc parameters.
*/
@@ -1072,7 +1057,7 @@
gopt.defcls = q->defcls;
gopt.debug = 0;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
@@ -1098,10 +1083,10 @@
*/
tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
tcm->tcm_handle = cl->common.classid;
- if (!cl->level && cl->un.leaf.q)
- tcm->tcm_info = cl->un.leaf.q->handle;
+ if (!cl->level && cl->leaf.q)
+ tcm->tcm_info = cl->leaf.q->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -1142,10 +1127,9 @@
};
__u32 qlen = 0;
- if (!cl->level && cl->un.leaf.q) {
- qlen = cl->un.leaf.q->q.qlen;
- qs.backlog = cl->un.leaf.q->qstats.backlog;
- }
+ if (!cl->level && cl->leaf.q)
+ qdisc_qstats_qlen_backlog(cl->leaf.q, &qlen, &qs.backlog);
+
cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
INT_MIN, INT_MAX);
cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
@@ -1172,14 +1156,14 @@
cl->common.classid, extack)) == NULL)
return -ENOBUFS;
- *old = qdisc_replace(sch, new, &cl->un.leaf.q);
+ *old = qdisc_replace(sch, new, &cl->leaf.q);
return 0;
}
static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
{
struct htb_class *cl = (struct htb_class *)arg;
- return !cl->level ? cl->un.leaf.q : NULL;
+ return !cl->level ? cl->leaf.q : NULL;
}
static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
@@ -1205,15 +1189,15 @@
{
struct htb_class *parent = cl->parent;
- WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
+ WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity);
if (parent->cmode != HTB_CAN_SEND)
htb_safe_rb_erase(&parent->pq_node,
&q->hlevel[parent->level].wait_pq);
parent->level = 0;
- memset(&parent->un.inner, 0, sizeof(parent->un.inner));
- parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
+ memset(&parent->inner, 0, sizeof(parent->inner));
+ parent->leaf.q = new_q ? new_q : &noop_qdisc;
parent->tokens = parent->buffer;
parent->ctokens = parent->cbuffer;
parent->t_c = ktime_get_ns();
@@ -1223,8 +1207,8 @@
static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
{
if (!cl->level) {
- WARN_ON(!cl->un.leaf.q);
- qdisc_destroy(cl->un.leaf.q);
+ WARN_ON(!cl->leaf.q);
+ qdisc_put(cl->leaf.q);
}
gen_kill_estimator(&cl->rate_est);
tcf_block_put(cl->block);
@@ -1285,13 +1269,8 @@
sch_tree_lock(sch);
- if (!cl->level) {
- unsigned int qlen = cl->un.leaf.q->q.qlen;
- unsigned int backlog = cl->un.leaf.q->qstats.backlog;
-
- qdisc_reset(cl->un.leaf.q);
- qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog);
- }
+ if (!cl->level)
+ qdisc_purge_queue(cl->leaf.q);
/* delete from hash and active; remainder in destroy_class */
qdisc_class_hash_remove(&q->clhash, &cl->common);
@@ -1323,6 +1302,7 @@
struct htb_class *cl = (struct htb_class *)*arg, *parent;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_HTB_MAX + 1];
+ struct Qdisc *parent_qdisc = NULL;
struct tc_htb_opt *hopt;
u64 rate64, ceil64;
int warn = 0;
@@ -1331,7 +1311,8 @@
if (!opt)
goto failure;
- err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
+ NULL);
if (err < 0)
goto failure;
@@ -1419,13 +1400,9 @@
classid, NULL);
sch_tree_lock(sch);
if (parent && !parent->level) {
- unsigned int qlen = parent->un.leaf.q->q.qlen;
- unsigned int backlog = parent->un.leaf.q->qstats.backlog;
-
/* turn parent into inner node */
- qdisc_reset(parent->un.leaf.q);
- qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog);
- qdisc_destroy(parent->un.leaf.q);
+ qdisc_purge_queue(parent->leaf.q);
+ parent_qdisc = parent->leaf.q;
if (parent->prio_activity)
htb_deactivate(q, parent);
@@ -1436,10 +1413,10 @@
}
parent->level = (parent->parent ? parent->parent->level
: TC_HTB_MAXDEPTH) - 1;
- memset(&parent->un.inner, 0, sizeof(parent->un.inner));
+ memset(&parent->inner, 0, sizeof(parent->inner));
}
/* leaf (we) needs elementary qdisc */
- cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
+ cl->leaf.q = new_q ? new_q : &noop_qdisc;
cl->common.classid = classid;
cl->parent = parent;
@@ -1455,8 +1432,8 @@
qdisc_class_hash_insert(&q->clhash, &cl->common);
if (parent)
parent->children++;
- if (cl->un.leaf.q != &noop_qdisc)
- qdisc_hash_add(cl->un.leaf.q, true);
+ if (cl->leaf.q != &noop_qdisc)
+ qdisc_hash_add(cl->leaf.q, true);
} else {
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, NULL,
@@ -1478,7 +1455,7 @@
psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
/* it used to be a nasty bug here, we have to check that node
- * is really leaf before changing cl->un.leaf !
+ * is really leaf before changing cl->leaf !
*/
if (!cl->level) {
u64 quantum = cl->rate.rate_bytes_ps;
@@ -1504,6 +1481,7 @@
cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
sch_tree_unlock(sch);
+ qdisc_put(parent_qdisc);
if (warn)
pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n",
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index ce3f552..bf56aa5 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* net/sched/sch_ingress.c - Ingress and clsact qdisc
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Jamal Hadi Salim 1999
*/
@@ -87,7 +83,7 @@
mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress);
- q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+ q->block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
q->block_info.chain_head_change = clsact_chain_head_change;
q->block_info.chain_head_change_priv = &q->miniqp;
@@ -106,7 +102,7 @@
{
struct nlattr *nest;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -118,6 +114,7 @@
}
static const struct Qdisc_class_ops ingress_class_ops = {
+ .flags = QDISC_CLASS_OPS_DOIT_UNLOCKED,
.leaf = ingress_leaf,
.find = ingress_find,
.walk = ingress_walk,
@@ -220,7 +217,7 @@
mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress);
- q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+ q->ingress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
q->ingress_block_info.chain_head_change = clsact_chain_head_change;
q->ingress_block_info.chain_head_change_priv = &q->miniqp_ingress;
@@ -231,7 +228,7 @@
mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress);
- q->egress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS;
+ q->egress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS;
q->egress_block_info.chain_head_change = clsact_chain_head_change;
q->egress_block_info.chain_head_change_priv = &q->miniqp_egress;
@@ -250,6 +247,7 @@
}
static const struct Qdisc_class_ops clsact_class_ops = {
+ .flags = QDISC_CLASS_OPS_DOIT_UNLOCKED,
.leaf = ingress_leaf,
.find = clsact_find,
.walk = ingress_walk,
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index d6b8ae4..278c0b2 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/sch_mq.c Classful multiqueue dummy scheduler
*
* Copyright (c) 2009 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
#include <linux/types.h>
@@ -38,9 +35,8 @@
return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
}
-static void mq_offload_stats(struct Qdisc *sch)
+static int mq_offload_stats(struct Qdisc *sch)
{
- struct net_device *dev = qdisc_dev(sch);
struct tc_mq_qopt_offload opt = {
.command = TC_MQ_STATS,
.handle = sch->handle,
@@ -50,8 +46,7 @@
},
};
- if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc)
- dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_MQ, &opt);
}
static void mq_destroy(struct Qdisc *sch)
@@ -65,7 +60,7 @@
if (!priv->qdiscs)
return;
for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
- qdisc_destroy(priv->qdiscs[ntx]);
+ qdisc_put(priv->qdiscs[ntx]);
kfree(priv->qdiscs);
}
@@ -119,7 +114,7 @@
qdisc = priv->qdiscs[ntx];
old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
if (old)
- qdisc_destroy(old);
+ qdisc_put(old);
#ifdef CONFIG_NET_SCHED
if (ntx < dev->real_num_tx_queues)
qdisc_hash_add(qdisc, false);
@@ -171,9 +166,8 @@
spin_unlock_bh(qdisc_lock(qdisc));
}
- mq_offload_stats(sch);
- return 0;
+ return mq_offload_stats(sch);
}
static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl)
@@ -196,6 +190,7 @@
struct Qdisc **old, struct netlink_ext_ack *extack)
{
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
+ struct tc_mq_qopt_offload graft_offload;
struct net_device *dev = qdisc_dev(sch);
if (dev->flags & IFF_UP)
@@ -206,6 +201,14 @@
new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
if (dev->flags & IFF_UP)
dev_activate(dev);
+
+ graft_offload.handle = sch->handle;
+ graft_offload.graft_params.queue = cl - 1;
+ graft_offload.graft_params.child_handle = new ? new->handle : 0;
+ graft_offload.command = TC_MQ_GRAFT;
+
+ qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old,
+ TC_SETUP_QDISC_MQ, &graft_offload, extack);
return 0;
}
@@ -242,8 +245,9 @@
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
sch = dev_queue->qdisc_sleeping;
- if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 ||
- gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0)
+ if (gnet_stats_copy_basic(&sch->running, d, sch->cpu_bstats,
+ &sch->bstats) < 0 ||
+ qdisc_qstats_copy(d, sch) < 0)
return -1;
return 0;
}
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 0e9d761..0d0113a 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/sch_mqprio.c
*
* Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
#include <linux/types.h>
@@ -40,7 +37,7 @@
for (ntx = 0;
ntx < dev->num_tx_queues && priv->qdiscs[ntx];
ntx++)
- qdisc_destroy(priv->qdiscs[ntx]);
+ qdisc_put(priv->qdiscs[ntx]);
kfree(priv->qdiscs);
}
@@ -125,8 +122,9 @@
int nested_len = nla_len(nla) - NLA_ALIGN(len);
if (nested_len >= nla_attr_size(0))
- return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
- nested_len, policy, NULL);
+ return nla_parse_deprecated(tb, maxtype,
+ nla_data(nla) + NLA_ALIGN(len),
+ nested_len, policy, NULL);
memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
return 0;
@@ -300,7 +298,7 @@
qdisc = priv->qdiscs[ntx];
old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
if (old)
- qdisc_destroy(old);
+ qdisc_put(old);
if (ntx < dev->real_num_tx_queues)
qdisc_hash_add(qdisc, false);
}
@@ -349,7 +347,7 @@
int i;
if (priv->flags & TC_MQPRIO_F_MIN_RATE) {
- nest = nla_nest_start(skb, TCA_MQPRIO_MIN_RATE64);
+ nest = nla_nest_start_noflag(skb, TCA_MQPRIO_MIN_RATE64);
if (!nest)
goto nla_put_failure;
@@ -363,7 +361,7 @@
}
if (priv->flags & TC_MQPRIO_F_MAX_RATE) {
- nest = nla_nest_start(skb, TCA_MQPRIO_MAX_RATE64);
+ nest = nla_nest_start_noflag(skb, TCA_MQPRIO_MAX_RATE64);
if (!nest)
goto nla_put_failure;
@@ -559,10 +557,9 @@
struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
sch = dev_queue->qdisc_sleeping;
- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
- d, NULL, &sch->bstats) < 0 ||
- gnet_stats_copy_queue(d, NULL,
- &sch->qstats, sch->q.qlen) < 0)
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d,
+ sch->cpu_bstats, &sch->bstats) < 0 ||
+ qdisc_qstats_copy(d, sch) < 0)
return -1;
}
return 0;
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 1da7ea8..1330ad2 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -1,18 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008, Intel Corporation.
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
* Author: Alexander Duyck <alexander.h.duyck@intel.com>
*/
@@ -175,7 +164,7 @@
tcf_block_put(q->block);
for (band = 0; band < q->bands; band++)
- qdisc_destroy(q->queues[band]);
+ qdisc_put(q->queues[band]);
kfree(q->queues);
}
@@ -185,7 +174,8 @@
{
struct multiq_sched_data *q = qdisc_priv(sch);
struct tc_multiq_qopt *qopt;
- int i;
+ struct Qdisc **removed;
+ int i, n_removed = 0;
if (!netif_is_multiqueue(qdisc_dev(sch)))
return -EOPNOTSUPP;
@@ -196,20 +186,29 @@
qopt->bands = qdisc_dev(sch)->real_num_tx_queues;
+ removed = kmalloc(sizeof(*removed) * (q->max_bands - q->bands),
+ GFP_KERNEL);
+ if (!removed)
+ return -ENOMEM;
+
sch_tree_lock(sch);
q->bands = qopt->bands;
for (i = q->bands; i < q->max_bands; i++) {
if (q->queues[i] != &noop_qdisc) {
struct Qdisc *child = q->queues[i];
+
q->queues[i] = &noop_qdisc;
- qdisc_tree_reduce_backlog(child, child->q.qlen,
- child->qstats.backlog);
- qdisc_destroy(child);
+ qdisc_purge_queue(child);
+ removed[n_removed++] = child;
}
}
sch_tree_unlock(sch);
+ for (i = 0; i < n_removed; i++)
+ qdisc_put(removed[i]);
+ kfree(removed);
+
for (i = 0; i < q->bands; i++) {
if (q->queues[i] == &noop_qdisc) {
struct Qdisc *child, *old;
@@ -224,13 +223,10 @@
if (child != &noop_qdisc)
qdisc_hash_add(child, true);
- if (old != &noop_qdisc) {
- qdisc_tree_reduce_backlog(old,
- old->q.qlen,
- old->qstats.backlog);
- qdisc_destroy(old);
- }
+ if (old != &noop_qdisc)
+ qdisc_purge_queue(old);
sch_tree_unlock(sch);
+ qdisc_put(old);
}
}
}
@@ -343,8 +339,8 @@
cl_q = q->queues[cl - 1];
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
- d, NULL, &cl_q->bstats) < 0 ||
- gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0)
+ d, cl_q->cpu_bstats, &cl_q->bstats) < 0 ||
+ qdisc_qstats_copy(d, cl_q) < 0)
return -1;
return 0;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 74c0f65..42e557d 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/sch_netem.c Network emulator
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License.
- *
* Many of the algorithms and ideas for this came from
* NIST Net which is not copyrighted.
*
@@ -77,6 +73,10 @@
/* internal t(ime)fifo qdisc uses t_root and sch->limit */
struct rb_root t_root;
+ /* a linear queue; reduces rbtree rebalancing when jitter is low */
+ struct sk_buff *t_head;
+ struct sk_buff *t_tail;
+
/* optional qdisc for classful handling (NULL at netem init) */
struct Qdisc *qdisc;
@@ -369,26 +369,39 @@
rb_erase(&skb->rbnode, &q->t_root);
rtnl_kfree_skbs(skb, skb);
}
+
+ rtnl_kfree_skbs(q->t_head, q->t_tail);
+ q->t_head = NULL;
+ q->t_tail = NULL;
}
static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
u64 tnext = netem_skb_cb(nskb)->time_to_send;
- struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
- while (*p) {
- struct sk_buff *skb;
-
- parent = *p;
- skb = rb_to_skb(parent);
- if (tnext >= netem_skb_cb(skb)->time_to_send)
- p = &parent->rb_right;
+ if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
+ if (q->t_tail)
+ q->t_tail->next = nskb;
else
- p = &parent->rb_left;
+ q->t_head = nskb;
+ q->t_tail = nskb;
+ } else {
+ struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
+
+ while (*p) {
+ struct sk_buff *skb;
+
+ parent = *p;
+ skb = rb_to_skb(parent);
+ if (tnext >= netem_skb_cb(skb)->time_to_send)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&nskb->rbnode, parent, p);
+ rb_insert_color(&nskb->rbnode, &q->t_root);
}
- rb_link_node(&nskb->rbnode, parent, p);
- rb_insert_color(&nskb->rbnode, &q->t_root);
sch->q.qlen++;
}
@@ -412,16 +425,6 @@
return segs;
}
-static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb)
-{
- skb->next = qh->head;
-
- if (!qh->head)
- qh->tail = skb;
- qh->head = skb;
- qh->qlen++;
-}
-
/*
* Insert one skb into qdisc.
* Note: parent depends on return value to account for queue length.
@@ -436,10 +439,10 @@
struct netem_skb_cb *cb;
struct sk_buff *skb2;
struct sk_buff *segs = NULL;
- unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
- int nb = 0;
+ unsigned int prev_len = qdisc_pkt_len(skb);
int count = 1;
int rc = NET_XMIT_SUCCESS;
+ int rc_drop = NET_XMIT_DROP;
/* Do not fool qdisc_drop_all() */
skb->prev = NULL;
@@ -473,12 +476,13 @@
* skb will be queued.
*/
if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
- struct Qdisc *rootq = qdisc_root(sch);
+ struct Qdisc *rootq = qdisc_root_bh(sch);
u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
q->duplicate = 0;
rootq->enqueue(skb2, rootq, to_free);
q->duplicate = dupsave;
+ rc_drop = NET_XMIT_SUCCESS;
}
/*
@@ -489,16 +493,14 @@
*/
if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
if (skb_is_gso(skb)) {
- segs = netem_segment(skb, sch, to_free);
- if (!segs)
- return NET_XMIT_DROP;
- } else {
- segs = skb;
+ skb = netem_segment(skb, sch, to_free);
+ if (!skb)
+ return rc_drop;
+ segs = skb->next;
+ skb_mark_not_on_list(skb);
+ qdisc_skb_cb(skb)->pkt_len = skb->len;
}
- skb = segs;
- segs = segs->next;
-
skb = skb_unshare(skb, GFP_ATOMIC);
if (unlikely(!skb)) {
qdisc_qstats_drop(sch);
@@ -507,6 +509,7 @@
if (skb->ip_summed == CHECKSUM_PARTIAL &&
skb_checksum_help(skb)) {
qdisc_drop(skb, sch, to_free);
+ skb = NULL;
goto finish_segs;
}
@@ -514,8 +517,12 @@
1<<(prandom_u32() % 8);
}
- if (unlikely(sch->q.qlen >= sch->limit))
- return qdisc_drop_all(skb, sch, to_free);
+ if (unlikely(sch->q.qlen >= sch->limit)) {
+ /* re-link segs, so that qdisc_drop_all() frees them all */
+ skb->next = segs;
+ qdisc_drop_all(skb, sch, to_free);
+ return rc_drop;
+ }
qdisc_qstats_backlog_inc(sch, skb);
@@ -543,9 +550,16 @@
t_skb = skb_rb_last(&q->t_root);
t_last = netem_skb_cb(t_skb);
if (!last ||
- t_last->time_to_send > last->time_to_send) {
+ t_last->time_to_send > last->time_to_send)
last = t_last;
- }
+ }
+ if (q->t_tail) {
+ struct netem_skb_cb *t_last =
+ netem_skb_cb(q->t_tail);
+
+ if (!last ||
+ t_last->time_to_send > last->time_to_send)
+ last = t_last;
}
if (last) {
@@ -573,15 +587,21 @@
cb->time_to_send = ktime_get_ns();
q->counter = 0;
- netem_enqueue_skb_head(&sch->q, skb);
+ __qdisc_enqueue_head(skb, &sch->q);
sch->qstats.requeues++;
}
finish_segs:
if (segs) {
+ unsigned int len, last_len;
+ int nb;
+
+ len = skb ? skb->len : 0;
+ nb = skb ? 1 : 0;
+
while (segs) {
skb2 = segs->next;
- segs->next = NULL;
+ skb_mark_not_on_list(segs);
qdisc_skb_cb(segs)->pkt_len = segs->len;
last_len = segs->len;
rc = qdisc_enqueue(segs, sch, to_free);
@@ -594,9 +614,10 @@
}
segs = skb2;
}
- sch->q.qlen += nb;
- if (nb > 1)
- qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
+ /* Parent qdiscs accounted for 1 skb of size @prev_len */
+ qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
+ } else if (!skb) {
+ return NET_XMIT_DROP;
}
return NET_XMIT_SUCCESS;
}
@@ -624,11 +645,38 @@
q->slot.bytes_left = q->slot_config.max_bytes;
}
+static struct sk_buff *netem_peek(struct netem_sched_data *q)
+{
+ struct sk_buff *skb = skb_rb_first(&q->t_root);
+ u64 t1, t2;
+
+ if (!skb)
+ return q->t_head;
+ if (!q->t_head)
+ return skb;
+
+ t1 = netem_skb_cb(skb)->time_to_send;
+ t2 = netem_skb_cb(q->t_head)->time_to_send;
+ if (t1 < t2)
+ return skb;
+ return q->t_head;
+}
+
+static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
+{
+ if (skb == q->t_head) {
+ q->t_head = skb->next;
+ if (!q->t_head)
+ q->t_tail = NULL;
+ } else {
+ rb_erase(&skb->rbnode, &q->t_root);
+ }
+}
+
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
- struct rb_node *p;
tfifo_dequeue:
skb = __qdisc_dequeue_head(&sch->q);
@@ -638,20 +686,18 @@
qdisc_bstats_update(sch, skb);
return skb;
}
- p = rb_first(&q->t_root);
- if (p) {
+ skb = netem_peek(q);
+ if (skb) {
u64 time_to_send;
u64 now = ktime_get_ns();
- skb = rb_to_skb(p);
-
/* if more time remaining? */
time_to_send = netem_skb_cb(skb)->time_to_send;
if (q->slot.slot_next && q->slot.slot_next < time_to_send)
get_slot_next(q, now);
- if (time_to_send <= now && q->slot.slot_next <= now) {
- rb_erase(p, &q->t_root);
+ if (time_to_send <= now && q->slot.slot_next <= now) {
+ netem_erase_head(q, skb);
sch->q.qlen--;
qdisc_qstats_backlog_dec(sch, skb);
skb->next = NULL;
@@ -661,15 +707,6 @@
*/
skb->dev = qdisc_dev(sch);
-#ifdef CONFIG_NET_CLS_ACT
- /*
- * If it's at ingress let's pretend the delay is
- * from the network (tstamp will be updated).
- */
- if (skb->tc_redirected && skb->tc_from_ingress)
- skb->tstamp = 0;
-#endif
-
if (q->slot.slot_next) {
q->slot.packets_left--;
q->slot.bytes_left -= qdisc_pkt_len(skb);
@@ -745,7 +782,7 @@
struct disttable *d;
int i;
- if (n > NETEM_DIST_MAX)
+ if (!n || n > NETEM_DIST_MAX)
return -EINVAL;
d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
@@ -901,8 +938,9 @@
}
if (nested_len >= nla_attr_size(0))
- return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
- nested_len, policy, NULL);
+ return nla_parse_deprecated(tb, maxtype,
+ nla_data(nla) + NLA_ALIGN(len),
+ nested_len, policy, NULL);
memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
return 0;
@@ -1035,7 +1073,7 @@
qdisc_watchdog_cancel(&q->watchdog);
if (q->qdisc)
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
dist_free(q->delay_dist);
dist_free(q->slot_dist);
}
@@ -1045,7 +1083,7 @@
{
struct nlattr *nest;
- nest = nla_nest_start(skb, TCA_NETEM_LOSS);
+ nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
if (nest == NULL)
goto nla_put_failure;
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index 18d30bb..df98a88 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -1,15 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2013 Cisco Systems, Inc, 2013.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
* Author: Vijay Subramanian <vijaynsu@cisco.com>
* Author: Mythili Prabhu <mysuryan@cisco.com>
*
@@ -17,9 +8,7 @@
* University of Oslo, Norway.
*
* References:
- * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00
- * IEEE Conference on High Performance Switching and Routing 2013 :
- * "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem"
+ * RFC 8033: https://tools.ietf.org/html/rfc8033
*/
#include <linux/module.h>
@@ -31,9 +20,9 @@
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
-#define QUEUE_THRESHOLD 10000
+#define QUEUE_THRESHOLD 16384
#define DQCOUNT_INVALID -1
-#define MAX_PROB 0xffffffff
+#define MAX_PROB 0xffffffffffffffff
#define PIE_SCALE 8
/* parameters used */
@@ -49,14 +38,16 @@
/* variables used */
struct pie_vars {
- u32 prob; /* probability but scaled by u32 limit. */
+ u64 prob; /* probability but scaled by u64 limit. */
psched_time_t burst_time;
psched_time_t qdelay;
psched_time_t qdelay_old;
u64 dq_count; /* measured in bytes */
psched_time_t dq_tstamp; /* drain rate */
+ u64 accu_prob; /* accumulated drop probability */
u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */
u32 qlen_old; /* in bytes */
+ u8 accu_prob_overflows; /* overflows of accu_prob */
};
/* statistics gathering */
@@ -81,9 +72,9 @@
{
params->alpha = 2;
params->beta = 20;
- params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */
+ params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */
params->limit = 1000; /* default of 1000 packets */
- params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */
+ params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */
params->ecn = false;
params->bytemode = false;
}
@@ -91,16 +82,18 @@
static void pie_vars_init(struct pie_vars *vars)
{
vars->dq_count = DQCOUNT_INVALID;
+ vars->accu_prob = 0;
vars->avg_dq_rate = 0;
- /* default of 100 ms in pschedtime */
- vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC);
+ /* default of 150 ms in pschedtime */
+ vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC);
+ vars->accu_prob_overflows = 0;
}
static bool drop_early(struct Qdisc *sch, u32 packet_size)
{
struct pie_sched_data *q = qdisc_priv(sch);
- u32 rnd;
- u32 local_prob = q->vars.prob;
+ u64 rnd;
+ u64 local_prob = q->vars.prob;
u32 mtu = psched_mtu(qdisc_dev(sch));
/* If there is still burst allowance left skip random early drop */
@@ -110,8 +103,8 @@
/* If current delay is less than half of target, and
* if drop prob is low already, disable early_drop
*/
- if ((q->vars.qdelay < q->params.target / 2)
- && (q->vars.prob < MAX_PROB / 5))
+ if ((q->vars.qdelay < q->params.target / 2) &&
+ (q->vars.prob < MAX_PROB / 5))
return false;
/* If we have fewer than 2 mtu-sized packets, disable drop_early,
@@ -124,14 +117,34 @@
* probablity. Smaller packets will have lower drop prob in this case
*/
if (q->params.bytemode && packet_size <= mtu)
- local_prob = (local_prob / mtu) * packet_size;
+ local_prob = (u64)packet_size * div_u64(local_prob, mtu);
else
local_prob = q->vars.prob;
- rnd = prandom_u32();
- if (rnd < local_prob)
+ if (local_prob == 0) {
+ q->vars.accu_prob = 0;
+ q->vars.accu_prob_overflows = 0;
+ }
+
+ if (local_prob > MAX_PROB - q->vars.accu_prob)
+ q->vars.accu_prob_overflows++;
+
+ q->vars.accu_prob += local_prob;
+
+ if (q->vars.accu_prob_overflows == 0 &&
+ q->vars.accu_prob < (MAX_PROB / 100) * 85)
+ return false;
+ if (q->vars.accu_prob_overflows == 8 &&
+ q->vars.accu_prob >= MAX_PROB / 2)
return true;
+ prandom_bytes(&rnd, 8);
+ if (rnd < local_prob) {
+ q->vars.accu_prob = 0;
+ q->vars.accu_prob_overflows = 0;
+ return true;
+ }
+
return false;
}
@@ -168,6 +181,8 @@
out:
q->stats.dropped++;
+ q->vars.accu_prob = 0;
+ q->vars.accu_prob_overflows = 0;
return qdisc_drop(skb, sch, to_free);
}
@@ -192,7 +207,8 @@
if (!opt)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_PIE_MAX, opt, pie_policy,
+ NULL);
if (err < 0)
return err;
@@ -209,7 +225,8 @@
/* tupdate is in jiffies */
if (tb[TCA_PIE_TUPDATE])
- q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE]));
+ q->params.tupdate =
+ usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE]));
if (tb[TCA_PIE_LIMIT]) {
u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]);
@@ -247,7 +264,6 @@
static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
{
-
struct pie_sched_data *q = qdisc_priv(sch);
int qlen = sch->qstats.backlog; /* current queue size in bytes */
@@ -294,9 +310,9 @@
* dq_count to 0 to re-enter the if block when the next
* packet is dequeued
*/
- if (qlen < QUEUE_THRESHOLD)
+ if (qlen < QUEUE_THRESHOLD) {
q->vars.dq_count = DQCOUNT_INVALID;
- else {
+ } else {
q->vars.dq_count = 0;
q->vars.dq_tstamp = psched_get_time();
}
@@ -317,9 +333,10 @@
u32 qlen = sch->qstats.backlog; /* queue size in bytes */
psched_time_t qdelay = 0; /* in pschedtime */
psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */
- s32 delta = 0; /* determines the change in probability */
- u32 oldprob;
- u32 alpha, beta;
+ s64 delta = 0; /* determines the change in probability */
+ u64 oldprob;
+ u64 alpha, beta;
+ u32 power;
bool update_prob = true;
q->vars.qdelay_old = q->vars.qdelay;
@@ -339,38 +356,36 @@
* value for alpha as 0.125. In this implementation, we use values 0-32
* passed from user space to represent this. Also, alpha and beta have
* unit of HZ and need to be scaled before they can used to update
- * probability. alpha/beta are updated locally below by 1) scaling them
- * appropriately 2) scaling down by 16 to come to 0-2 range.
- * Please see paper for details.
- *
- * We scale alpha and beta differently depending on whether we are in
- * light, medium or high dropping mode.
+ * probability. alpha/beta are updated locally below by scaling down
+ * by 16 to come to 0-2 range.
*/
- if (q->vars.prob < MAX_PROB / 100) {
- alpha =
- (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
- beta =
- (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
- } else if (q->vars.prob < MAX_PROB / 10) {
- alpha =
- (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
- beta =
- (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
- } else {
- alpha =
- (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
- beta =
- (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+ alpha = ((u64)q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+ beta = ((u64)q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+
+ /* We scale alpha and beta differently depending on how heavy the
+ * congestion is. Please see RFC 8033 for details.
+ */
+ if (q->vars.prob < MAX_PROB / 10) {
+ alpha >>= 1;
+ beta >>= 1;
+
+ power = 100;
+ while (q->vars.prob < div_u64(MAX_PROB, power) &&
+ power <= 1000000) {
+ alpha >>= 2;
+ beta >>= 2;
+ power *= 10;
+ }
}
/* alpha and beta should be between 0 and 32, in multiples of 1/16 */
- delta += alpha * ((qdelay - q->params.target));
- delta += beta * ((qdelay - qdelay_old));
+ delta += alpha * (u64)(qdelay - q->params.target);
+ delta += beta * (u64)(qdelay - qdelay_old);
oldprob = q->vars.prob;
/* to ensure we increase probability in steps of no more than 2% */
- if (delta > (s32) (MAX_PROB / (100 / 2)) &&
+ if (delta > (s64)(MAX_PROB / (100 / 2)) &&
q->vars.prob >= MAX_PROB / 10)
delta = (MAX_PROB / 100) * 2;
@@ -405,8 +420,9 @@
* delay is 0 for 2 consecutive Tupdate periods.
*/
- if ((qdelay == 0) && (qdelay_old == 0) && update_prob)
- q->vars.prob = (q->vars.prob * 98) / 100;
+ if (qdelay == 0 && qdelay_old == 0 && update_prob)
+ /* Reduce drop probability to 98.4% */
+ q->vars.prob -= q->vars.prob / 64u;
q->vars.qdelay = qdelay;
q->vars.qlen_old = qlen;
@@ -419,8 +435,8 @@
*/
if ((q->vars.qdelay < q->params.target / 2) &&
(q->vars.qdelay_old < q->params.target / 2) &&
- (q->vars.prob == 0) &&
- (q->vars.avg_dq_rate > 0))
+ q->vars.prob == 0 &&
+ q->vars.avg_dq_rate > 0)
pie_vars_init(&q->vars);
}
@@ -437,7 +453,6 @@
if (q->params.tupdate)
mod_timer(&q->adapt_timer, jiffies + q->params.tupdate);
spin_unlock(root_lock);
-
}
static int pie_init(struct Qdisc *sch, struct nlattr *opt,
@@ -468,16 +483,17 @@
struct pie_sched_data *q = qdisc_priv(sch);
struct nlattr *opts;
- opts = nla_nest_start(skb, TCA_OPTIONS);
- if (opts == NULL)
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
+ if (!opts)
goto nla_put_failure;
/* convert target from pschedtime to us */
if (nla_put_u32(skb, TCA_PIE_TARGET,
- ((u32) PSCHED_TICKS2NS(q->params.target)) /
+ ((u32)PSCHED_TICKS2NS(q->params.target)) /
NSEC_PER_USEC) ||
nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) ||
- nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) ||
+ nla_put_u32(skb, TCA_PIE_TUPDATE,
+ jiffies_to_usecs(q->params.tupdate)) ||
nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) ||
nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) ||
nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) ||
@@ -489,7 +505,6 @@
nla_put_failure:
nla_nest_cancel(skb, opts);
return -1;
-
}
static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
@@ -497,7 +512,7 @@
struct pie_sched_data *q = qdisc_priv(sch);
struct tc_pie_xstats st = {
.prob = q->vars.prob,
- .delay = ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) /
+ .delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) /
NSEC_PER_USEC,
/* unscale and return dq_rate in bytes per sec */
.avg_dq_rate = q->vars.avg_dq_rate *
@@ -514,8 +529,7 @@
static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
{
- struct sk_buff *skb;
- skb = qdisc_dequeue_head(sch);
+ struct sk_buff *skb = qdisc_dequeue_head(sch);
if (!skb)
return NULL;
@@ -527,6 +541,7 @@
static void pie_reset(struct Qdisc *sch)
{
struct pie_sched_data *q = qdisc_priv(sch);
+
qdisc_reset_queue(sch);
pie_vars_init(&q->vars);
}
@@ -534,6 +549,7 @@
static void pie_destroy(struct Qdisc *sch)
{
struct pie_sched_data *q = qdisc_priv(sch);
+
q->params.tupdate = 0;
del_timer_sync(&q->adapt_timer);
}
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
index 5619d2e..cbc2ebc 100644
--- a/net/sched/sch_plug.c
+++ b/net/sched/sch_plug.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* sch_plug.c Queue traffic until an explicit release command
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* There are two ways to use this qdisc:
* 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating
* sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands.
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 222e53d..18b884c 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_prio.c Simple 3-band priority "scheduler".
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>:
* Init -- EINVAL when opt undefined
@@ -72,6 +68,7 @@
static int
prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
+ unsigned int len = qdisc_pkt_len(skb);
struct Qdisc *qdisc;
int ret;
@@ -88,7 +85,7 @@
ret = qdisc_enqueue(skb, qdisc, to_free);
if (ret == NET_XMIT_SUCCESS) {
- qdisc_qstats_backlog_inc(sch, skb);
+ sch->qstats.backlog += len;
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -175,7 +172,7 @@
tcf_block_put(q->block);
prio_offload(sch, NULL);
for (prio = 0; prio < q->bands; prio++)
- qdisc_destroy(q->queues[prio]);
+ qdisc_put(q->queues[prio]);
}
static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
@@ -205,7 +202,7 @@
extack);
if (!queues[i]) {
while (i > oldbands)
- qdisc_destroy(queues[--i]);
+ qdisc_put(queues[--i]);
return -ENOMEM;
}
}
@@ -215,13 +212,8 @@
q->bands = qopt->bands;
memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
- for (i = q->bands; i < oldbands; i++) {
- struct Qdisc *child = q->queues[i];
-
- qdisc_tree_reduce_backlog(child, child->q.qlen,
- child->qstats.backlog);
- qdisc_destroy(child);
- }
+ for (i = q->bands; i < oldbands; i++)
+ qdisc_tree_flush_backlog(q->queues[i]);
for (i = oldbands; i < q->bands; i++) {
q->queues[i] = queues[i];
@@ -230,6 +222,9 @@
}
sch_tree_unlock(sch);
+
+ for (i = q->bands; i < oldbands; i++)
+ qdisc_put(q->queues[i]);
return 0;
}
@@ -251,7 +246,6 @@
static int prio_dump_offload(struct Qdisc *sch)
{
- struct net_device *dev = qdisc_dev(sch);
struct tc_prio_qopt_offload hw_stats = {
.command = TC_PRIO_STATS,
.handle = sch->handle,
@@ -263,21 +257,8 @@
},
},
};
- int err;
- sch->flags &= ~TCQ_F_OFFLOADED;
- if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
- return 0;
-
- err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO,
- &hw_stats);
- if (err == -EOPNOTSUPP)
- return 0;
-
- if (!err)
- sch->flags |= TCQ_F_OFFLOADED;
-
- return err;
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_PRIO, &hw_stats);
}
static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -309,43 +290,22 @@
{
struct prio_sched_data *q = qdisc_priv(sch);
struct tc_prio_qopt_offload graft_offload;
- struct net_device *dev = qdisc_dev(sch);
unsigned long band = arg - 1;
- bool any_qdisc_is_offloaded;
- int err;
if (new == NULL)
new = &noop_qdisc;
*old = qdisc_replace(sch, new, &q->queues[band]);
- if (!tc_can_offload(dev))
- return 0;
-
graft_offload.handle = sch->handle;
graft_offload.parent = sch->parent;
graft_offload.graft_params.band = band;
graft_offload.graft_params.child_handle = new->handle;
graft_offload.command = TC_PRIO_GRAFT;
- err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO,
- &graft_offload);
-
- /* Don't report error if the graft is part of destroy operation. */
- if (err && new != &noop_qdisc) {
- /* Don't report error if the parent, the old child and the new
- * one are not offloaded.
- */
- any_qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
- any_qdisc_is_offloaded |= new->flags & TCQ_F_OFFLOADED;
- if (*old)
- any_qdisc_is_offloaded |= (*old)->flags &
- TCQ_F_OFFLOADED;
-
- if (any_qdisc_is_offloaded)
- NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
- }
-
+ qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old,
+ TC_SETUP_QDISC_PRIO, &graft_offload,
+ extack);
return 0;
}
@@ -396,8 +356,8 @@
cl_q = q->queues[cl - 1];
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
- d, NULL, &cl_q->bstats) < 0 ||
- gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0)
+ d, cl_q->cpu_bstats, &cl_q->bstats) < 0 ||
+ qdisc_qstats_copy(d, cl_q) < 0)
return -1;
return 0;
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index bb1a9c1..0b05ac7 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/sch_qfq.c Quick Fair Queueing Plus Scheduler.
*
* Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
* Copyright (c) 2012 Paolo Valente.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -217,15 +214,6 @@
return container_of(clc, struct qfq_class, common);
}
-static void qfq_purge_queue(struct qfq_class *cl)
-{
- unsigned int len = cl->qdisc->q.qlen;
- unsigned int backlog = cl->qdisc->qstats.backlog;
-
- qdisc_reset(cl->qdisc);
- qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
-}
-
static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
[TCA_QFQ_WEIGHT] = { .type = NLA_U32 },
[TCA_QFQ_LMAX] = { .type = NLA_U32 },
@@ -419,8 +407,8 @@
return -EINVAL;
}
- err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy,
- NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS],
+ qfq_policy, NULL);
if (err < 0)
return err;
@@ -526,7 +514,7 @@
return 0;
destroy_class:
- qdisc_destroy(cl->qdisc);
+ qdisc_put(cl->qdisc);
kfree(cl);
return err;
}
@@ -537,7 +525,7 @@
qfq_rm_from_agg(q, cl);
gen_kill_estimator(&cl->rate_est);
- qdisc_destroy(cl->qdisc);
+ qdisc_put(cl->qdisc);
kfree(cl);
}
@@ -551,7 +539,7 @@
sch_tree_lock(sch);
- qfq_purge_queue(cl);
+ qdisc_purge_queue(cl->qdisc);
qdisc_class_hash_remove(&q->clhash, &cl->common);
sch_tree_unlock(sch);
@@ -628,7 +616,7 @@
tcm->tcm_handle = cl->common.classid;
tcm->tcm_info = cl->qdisc->handle;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) ||
@@ -655,8 +643,7 @@
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
- gnet_stats_copy_queue(d, NULL,
- &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
+ qdisc_qstats_copy(d, cl->qdisc) < 0)
return -1;
return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
@@ -1210,10 +1197,12 @@
static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
+ unsigned int len = qdisc_pkt_len(skb), gso_segs;
struct qfq_sched *q = qdisc_priv(sch);
struct qfq_class *cl;
struct qfq_aggregate *agg;
int err = 0;
+ bool first;
cl = qfq_classify(skb, sch, &err);
if (cl == NULL) {
@@ -1224,17 +1213,18 @@
}
pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid);
- if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) {
+ if (unlikely(cl->agg->lmax < len)) {
pr_debug("qfq: increasing maxpkt from %u to %u for class %u",
- cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid);
- err = qfq_change_agg(sch, cl, cl->agg->class_weight,
- qdisc_pkt_len(skb));
+ cl->agg->lmax, len, cl->common.classid);
+ err = qfq_change_agg(sch, cl, cl->agg->class_weight, len);
if (err) {
cl->qstats.drops++;
return qdisc_drop(skb, sch, to_free);
}
}
+ gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
+ first = !cl->qdisc->q.qlen;
err = qdisc_enqueue(skb, cl->qdisc, to_free);
if (unlikely(err != NET_XMIT_SUCCESS)) {
pr_debug("qfq_enqueue: enqueue failed %d\n", err);
@@ -1245,16 +1235,17 @@
return err;
}
- bstats_update(&cl->bstats, skb);
- qdisc_qstats_backlog_inc(sch, skb);
+ cl->bstats.bytes += len;
+ cl->bstats.packets += gso_segs;
+ sch->qstats.backlog += len;
++sch->q.qlen;
agg = cl->agg;
/* if the queue was not empty, then done here */
- if (cl->qdisc->q.qlen != 1) {
+ if (!first) {
if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) &&
list_first_entry(&agg->active, struct qfq_class, alist)
- == cl && cl->deficit < qdisc_pkt_len(skb))
+ == cl && cl->deficit < len)
list_move_tail(&cl->alist, &agg->active);
return err;
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 56c181c..1695421 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_red.c Random Early Detection queue.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
@@ -166,7 +162,9 @@
opt.set.min = q->parms.qth_min >> q->parms.Wlog;
opt.set.max = q->parms.qth_max >> q->parms.Wlog;
opt.set.probability = q->parms.max_P;
+ opt.set.limit = q->limit;
opt.set.is_ecn = red_use_ecn(q);
+ opt.set.is_harddrop = red_use_harddrop(q);
opt.set.qstats = &sch->qstats;
} else {
opt.command = TC_RED_DESTROY;
@@ -181,7 +179,7 @@
del_timer_sync(&q->adapt_timer);
red_offload(sch, false);
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
}
static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
@@ -193,17 +191,18 @@
static int red_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ struct Qdisc *old_child = NULL, *child = NULL;
struct red_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_RED_MAX + 1];
struct tc_red_qopt *ctl;
- struct Qdisc *child = NULL;
int err;
u32 max_P;
if (opt == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_RED_MAX, opt, red_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_RED_MAX, opt, red_policy,
+ NULL);
if (err < 0)
return err;
@@ -231,9 +230,8 @@
q->flags = ctl->flags;
q->limit = ctl->limit;
if (child) {
- qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
- q->qdisc->qstats.backlog);
- qdisc_destroy(q->qdisc);
+ qdisc_tree_flush_backlog(q->qdisc);
+ old_child = q->qdisc;
q->qdisc = child;
}
@@ -252,7 +250,11 @@
red_start_of_idle_period(&q->vars);
sch_tree_unlock(sch);
+
red_offload(sch, true);
+
+ if (old_child)
+ qdisc_put(old_child);
return 0;
}
@@ -279,9 +281,8 @@
return red_change(sch, opt, extack);
}
-static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
+static int red_dump_offload_stats(struct Qdisc *sch)
{
- struct net_device *dev = qdisc_dev(sch);
struct tc_red_qopt_offload hw_stats = {
.command = TC_RED_STATS,
.handle = sch->handle,
@@ -291,22 +292,8 @@
.stats.qstats = &sch->qstats,
},
};
- int err;
- sch->flags &= ~TCQ_F_OFFLOADED;
-
- if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
- return 0;
-
- err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
- &hw_stats);
- if (err == -EOPNOTSUPP)
- return 0;
-
- if (!err)
- sch->flags |= TCQ_F_OFFLOADED;
-
- return err;
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_RED, &hw_stats);
}
static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -324,11 +311,11 @@
};
int err;
- err = red_dump_offload_stats(sch, &opt);
+ err = red_dump_offload_stats(sch);
if (err)
goto nla_put_failure;
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) ||
@@ -377,6 +364,21 @@
return 0;
}
+static void red_graft_offload(struct Qdisc *sch,
+ struct Qdisc *new, struct Qdisc *old,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_red_qopt_offload graft_offload = {
+ .handle = sch->handle,
+ .parent = sch->parent,
+ .child_handle = new->handle,
+ .command = TC_RED_GRAFT,
+ };
+
+ qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old,
+ TC_SETUP_QDISC_RED, &graft_offload, extack);
+}
+
static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct Qdisc **old, struct netlink_ext_ack *extack)
{
@@ -386,6 +388,8 @@
new = &noop_qdisc;
*old = qdisc_replace(sch, new, &q->qdisc);
+
+ red_graft_offload(sch, new, *old, extack);
return 0;
}
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 7cbdad8..4074c50 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -1,19 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/sch_sfb.c Stochastic Fair Blue
*
* Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr>
* Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
* W. Feng, D. Kandlur, D. Saha, K. Shin. Blue:
* A New Class of Active Queue Management Algorithms.
* U. Michigan CSE-TR-387-99, April 1999.
*
* http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
- *
*/
#include <linux/module.h>
@@ -22,7 +18,7 @@
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/random.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <net/ip.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
@@ -49,7 +45,7 @@
* (Section 4.4 of SFB reference : moving hash functions)
*/
struct sfb_bins {
- u32 perturbation; /* jhash perturbation */
+ siphash_key_t perturbation; /* siphash key */
struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS];
};
@@ -221,7 +217,8 @@
static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q)
{
- q->bins[slot].perturbation = prandom_u32();
+ get_random_bytes(&q->bins[slot].perturbation,
+ sizeof(q->bins[slot].perturbation));
}
static void sfb_swap_slot(struct sfb_sched_data *q)
@@ -318,9 +315,9 @@
/* If using external classifiers, get result and record it. */
if (!sfb_classify(skb, fl, &ret, &salt))
goto other_drop;
- sfbhash = jhash_1word(salt, q->bins[slot].perturbation);
+ sfbhash = siphash_1u32(salt, &q->bins[slot].perturbation);
} else {
- sfbhash = skb_get_hash_perturb(skb, q->bins[slot].perturbation);
+ sfbhash = skb_get_hash_perturb(skb, &q->bins[slot].perturbation);
}
@@ -356,7 +353,7 @@
/* Inelastic flow */
if (q->double_buffering) {
sfbhash = skb_get_hash_perturb(skb,
- q->bins[slot].perturbation);
+ &q->bins[slot].perturbation);
if (!sfbhash)
sfbhash = 1;
sfb_skb_cb(skb)->hashes[slot] = sfbhash;
@@ -469,7 +466,7 @@
struct sfb_sched_data *q = qdisc_priv(sch);
tcf_block_put(q->block);
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
}
static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = {
@@ -492,14 +489,15 @@
struct netlink_ext_ack *extack)
{
struct sfb_sched_data *q = qdisc_priv(sch);
- struct Qdisc *child;
+ struct Qdisc *child, *old;
struct nlattr *tb[TCA_SFB_MAX + 1];
const struct tc_sfb_qopt *ctl = &sfb_default_ops;
u32 limit;
int err;
if (opt) {
- err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_SFB_MAX, opt,
+ sfb_policy, NULL);
if (err < 0)
return -EINVAL;
@@ -521,9 +519,8 @@
qdisc_hash_add(child, true);
sch_tree_lock(sch);
- qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
- q->qdisc->qstats.backlog);
- qdisc_destroy(q->qdisc);
+ qdisc_purge_queue(q->qdisc);
+ old = q->qdisc;
q->qdisc = child;
q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
@@ -546,6 +543,7 @@
sfb_init_perturbation(1, q);
sch_tree_unlock(sch);
+ qdisc_put(old);
return 0;
}
@@ -581,7 +579,7 @@
};
sch->qstats.backlog = q->qdisc->qstats.backlog;
- opts = nla_nest_start(skb, TCA_OPTIONS);
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_SFB_PARMS, sizeof(opt), &opt))
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 2f26781..c787d4d 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_sfq.c Stochastic Fairness Queueing discipline.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
@@ -18,7 +14,7 @@
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
@@ -121,7 +117,7 @@
u8 headdrop;
u8 maxdepth; /* limit of packets per flow */
- u32 perturbation;
+ siphash_key_t perturbation;
u8 cur_depth; /* depth of longest slot */
u8 flags;
unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
@@ -161,7 +157,7 @@
static unsigned int sfq_hash(const struct sfq_sched_data *q,
const struct sk_buff *skb)
{
- return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
+ return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
}
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
@@ -611,9 +607,11 @@
struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
struct Qdisc *sch = q->sch;
spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+ siphash_key_t nkey;
+ get_random_bytes(&nkey, sizeof(nkey));
spin_lock(root_lock);
- q->perturbation = prandom_u32();
+ q->perturbation = nkey;
if (!q->filter_list && q->tail)
sfq_rehash(sch);
spin_unlock(root_lock);
@@ -692,7 +690,7 @@
del_timer(&q->perturb_timer);
if (q->perturb_period) {
mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
- q->perturbation = prandom_u32();
+ get_random_bytes(&q->perturbation, sizeof(q->perturbation));
}
sch_tree_unlock(sch);
kfree(p);
@@ -749,7 +747,7 @@
q->quantum = psched_mtu(qdisc_dev(sch));
q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
q->perturb_period = 0;
- q->perturbation = prandom_u32();
+ get_random_bytes(&q->perturbation, sizeof(q->perturbation));
if (opt) {
int err = sfq_change(sch, opt);
@@ -828,8 +826,6 @@
static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
- /* we cannot bypass queue discipline anymore */
- sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c
index 52c0b6d..0fb10ab 100644
--- a/net/sched/sch_skbprio.c
+++ b/net/sched/sch_skbprio.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_skbprio.c SKB Priority Queue.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Nishanth Devarajan, <ndev2021@gmail.com>
* Cody Doucette, <doucette@bu.edu>
* original idea by Michel Machado, Cody Doucette, and Qiaobin Fu
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
new file mode 100644
index 0000000..c609373
--- /dev/null
+++ b/net/sched/sch_taprio.c
@@ -0,0 +1,1933 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* net/sched/sch_taprio.c Time Aware Priority Scheduler
+ *
+ * Authors: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/math64.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/sch_generic.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+
+static LIST_HEAD(taprio_list);
+static DEFINE_SPINLOCK(taprio_list_lock);
+
+#define TAPRIO_ALL_GATES_OPEN -1
+
+#define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)
+#define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)
+
+struct sched_entry {
+ struct list_head list;
+
+ /* The instant at which this entry "closes" and the next one
+ * should open; the qdisc will make some effort so that no
+ * packet leaves after this time.
+ */
+ ktime_t close_time;
+ ktime_t next_txtime;
+ atomic_t budget;
+ int index;
+ u32 gate_mask;
+ u32 interval;
+ u8 command;
+};
+
+struct sched_gate_list {
+ struct rcu_head rcu;
+ struct list_head entries;
+ size_t num_entries;
+ ktime_t cycle_close_time;
+ s64 cycle_time;
+ s64 cycle_time_extension;
+ s64 base_time;
+};
+
+struct taprio_sched {
+ struct Qdisc **qdiscs;
+ struct Qdisc *root;
+ u32 flags;
+ enum tk_offsets tk_offset;
+ int clockid;
+ atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+
+ * speeds it's sub-nanoseconds per byte
+ */
+
+ /* Protects the update side of the RCU protected current_entry */
+ spinlock_t current_entry_lock;
+ struct sched_entry __rcu *current_entry;
+ struct sched_gate_list __rcu *oper_sched;
+ struct sched_gate_list __rcu *admin_sched;
+ struct hrtimer advance_timer;
+ struct list_head taprio_list;
+ struct sk_buff *(*dequeue)(struct Qdisc *sch);
+ struct sk_buff *(*peek)(struct Qdisc *sch);
+ u32 txtime_delay;
+};
+
+struct __tc_taprio_qopt_offload {
+ refcount_t users;
+ struct tc_taprio_qopt_offload offload;
+};
+
+static ktime_t sched_base_time(const struct sched_gate_list *sched)
+{
+ if (!sched)
+ return KTIME_MAX;
+
+ return ns_to_ktime(sched->base_time);
+}
+
+static ktime_t taprio_get_time(struct taprio_sched *q)
+{
+ ktime_t mono = ktime_get();
+
+ switch (q->tk_offset) {
+ case TK_OFFS_MAX:
+ return mono;
+ default:
+ return ktime_mono_to_any(mono, q->tk_offset);
+ }
+
+ return KTIME_MAX;
+}
+
+static void taprio_free_sched_cb(struct rcu_head *head)
+{
+ struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu);
+ struct sched_entry *entry, *n;
+
+ if (!sched)
+ return;
+
+ list_for_each_entry_safe(entry, n, &sched->entries, list) {
+ list_del(&entry->list);
+ kfree(entry);
+ }
+
+ kfree(sched);
+}
+
+static void switch_schedules(struct taprio_sched *q,
+ struct sched_gate_list **admin,
+ struct sched_gate_list **oper)
+{
+ rcu_assign_pointer(q->oper_sched, *admin);
+ rcu_assign_pointer(q->admin_sched, NULL);
+
+ if (*oper)
+ call_rcu(&(*oper)->rcu, taprio_free_sched_cb);
+
+ *oper = *admin;
+ *admin = NULL;
+}
+
+/* Get how much time has already elapsed in the current cycle. */
+static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time)
+{
+ ktime_t time_since_sched_start;
+ s32 time_elapsed;
+
+ time_since_sched_start = ktime_sub(time, sched->base_time);
+ div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed);
+
+ return time_elapsed;
+}
+
+static ktime_t get_interval_end_time(struct sched_gate_list *sched,
+ struct sched_gate_list *admin,
+ struct sched_entry *entry,
+ ktime_t intv_start)
+{
+ s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start);
+ ktime_t intv_end, cycle_ext_end, cycle_end;
+
+ cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed);
+ intv_end = ktime_add_ns(intv_start, entry->interval);
+ cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension);
+
+ if (ktime_before(intv_end, cycle_end))
+ return intv_end;
+ else if (admin && admin != sched &&
+ ktime_after(admin->base_time, cycle_end) &&
+ ktime_before(admin->base_time, cycle_ext_end))
+ return admin->base_time;
+ else
+ return cycle_end;
+}
+
+static int length_to_duration(struct taprio_sched *q, int len)
+{
+ return div_u64(len * atomic64_read(&q->picos_per_byte), 1000);
+}
+
+/* Returns the entry corresponding to the next available interval. If
+ * validate_interval is set, it only validates whether the timestamp occurs
+ * when the gate corresponding to the skb's traffic class is open.
+ */
+static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb,
+ struct Qdisc *sch,
+ struct sched_gate_list *sched,
+ struct sched_gate_list *admin,
+ ktime_t time,
+ ktime_t *interval_start,
+ ktime_t *interval_end,
+ bool validate_interval)
+{
+ ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time;
+ ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time;
+ struct sched_entry *entry = NULL, *entry_found = NULL;
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ bool entry_available = false;
+ s32 cycle_elapsed;
+ int tc, n;
+
+ tc = netdev_get_prio_tc_map(dev, skb->priority);
+ packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb));
+
+ *interval_start = 0;
+ *interval_end = 0;
+
+ if (!sched)
+ return NULL;
+
+ cycle = sched->cycle_time;
+ cycle_elapsed = get_cycle_time_elapsed(sched, time);
+ curr_intv_end = ktime_sub_ns(time, cycle_elapsed);
+ cycle_end = ktime_add_ns(curr_intv_end, cycle);
+
+ list_for_each_entry(entry, &sched->entries, list) {
+ curr_intv_start = curr_intv_end;
+ curr_intv_end = get_interval_end_time(sched, admin, entry,
+ curr_intv_start);
+
+ if (ktime_after(curr_intv_start, cycle_end))
+ break;
+
+ if (!(entry->gate_mask & BIT(tc)) ||
+ packet_transmit_time > entry->interval)
+ continue;
+
+ txtime = entry->next_txtime;
+
+ if (ktime_before(txtime, time) || validate_interval) {
+ transmit_end_time = ktime_add_ns(time, packet_transmit_time);
+ if ((ktime_before(curr_intv_start, time) &&
+ ktime_before(transmit_end_time, curr_intv_end)) ||
+ (ktime_after(curr_intv_start, time) && !validate_interval)) {
+ entry_found = entry;
+ *interval_start = curr_intv_start;
+ *interval_end = curr_intv_end;
+ break;
+ } else if (!entry_available && !validate_interval) {
+ /* Here, we are just trying to find out the
+ * first available interval in the next cycle.
+ */
+ entry_available = 1;
+ entry_found = entry;
+ *interval_start = ktime_add_ns(curr_intv_start, cycle);
+ *interval_end = ktime_add_ns(curr_intv_end, cycle);
+ }
+ } else if (ktime_before(txtime, earliest_txtime) &&
+ !entry_available) {
+ earliest_txtime = txtime;
+ entry_found = entry;
+ n = div_s64(ktime_sub(txtime, curr_intv_start), cycle);
+ *interval_start = ktime_add(curr_intv_start, n * cycle);
+ *interval_end = ktime_add(curr_intv_end, n * cycle);
+ }
+ }
+
+ return entry_found;
+}
+
+static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct sched_gate_list *sched, *admin;
+ ktime_t interval_start, interval_end;
+ struct sched_entry *entry;
+
+ rcu_read_lock();
+ sched = rcu_dereference(q->oper_sched);
+ admin = rcu_dereference(q->admin_sched);
+
+ entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp,
+ &interval_start, &interval_end, true);
+ rcu_read_unlock();
+
+ return entry;
+}
+
+static bool taprio_flags_valid(u32 flags)
+{
+ /* Make sure no other flag bits are set. */
+ if (flags & ~(TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST |
+ TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD))
+ return false;
+ /* txtime-assist and full offload are mutually exclusive */
+ if ((flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) &&
+ (flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD))
+ return false;
+ return true;
+}
+
+/* This returns the tstamp value set by TCP in terms of the set clock. */
+static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb)
+{
+ unsigned int offset = skb_network_offset(skb);
+ const struct ipv6hdr *ipv6h;
+ const struct iphdr *iph;
+ struct ipv6hdr _ipv6h;
+
+ ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
+ if (!ipv6h)
+ return 0;
+
+ if (ipv6h->version == 4) {
+ iph = (struct iphdr *)ipv6h;
+ offset += iph->ihl * 4;
+
+ /* special-case 6in4 tunnelling, as that is a common way to get
+ * v6 connectivity in the home
+ */
+ if (iph->protocol == IPPROTO_IPV6) {
+ ipv6h = skb_header_pointer(skb, offset,
+ sizeof(_ipv6h), &_ipv6h);
+
+ if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
+ return 0;
+ } else if (iph->protocol != IPPROTO_TCP) {
+ return 0;
+ }
+ } else if (ipv6h->version == 6 && ipv6h->nexthdr != IPPROTO_TCP) {
+ return 0;
+ }
+
+ return ktime_mono_to_any(skb->skb_mstamp_ns, q->tk_offset);
+}
+
+/* There are a few scenarios where we will have to modify the txtime from
+ * what is read from next_txtime in sched_entry. They are:
+ * 1. If txtime is in the past,
+ * a. The gate for the traffic class is currently open and packet can be
+ * transmitted before it closes, schedule the packet right away.
+ * b. If the gate corresponding to the traffic class is going to open later
+ * in the cycle, set the txtime of packet to the interval start.
+ * 2. If txtime is in the future, there are packets corresponding to the
+ * current traffic class waiting to be transmitted. So, the following
+ * possibilities exist:
+ * a. We can transmit the packet before the window containing the txtime
+ * closes.
+ * b. The window might close before the transmission can be completed
+ * successfully. So, schedule the packet in the next open window.
+ */
+static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch)
+{
+ ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp;
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct sched_gate_list *sched, *admin;
+ ktime_t minimum_time, now, txtime;
+ int len, packet_transmit_time;
+ struct sched_entry *entry;
+ bool sched_changed;
+
+ now = taprio_get_time(q);
+ minimum_time = ktime_add_ns(now, q->txtime_delay);
+
+ tcp_tstamp = get_tcp_tstamp(q, skb);
+ minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp);
+
+ rcu_read_lock();
+ admin = rcu_dereference(q->admin_sched);
+ sched = rcu_dereference(q->oper_sched);
+ if (admin && ktime_after(minimum_time, admin->base_time))
+ switch_schedules(q, &admin, &sched);
+
+ /* Until the schedule starts, all the queues are open */
+ if (!sched || ktime_before(minimum_time, sched->base_time)) {
+ txtime = minimum_time;
+ goto done;
+ }
+
+ len = qdisc_pkt_len(skb);
+ packet_transmit_time = length_to_duration(q, len);
+
+ do {
+ sched_changed = 0;
+
+ entry = find_entry_to_transmit(skb, sch, sched, admin,
+ minimum_time,
+ &interval_start, &interval_end,
+ false);
+ if (!entry) {
+ txtime = 0;
+ goto done;
+ }
+
+ txtime = entry->next_txtime;
+ txtime = max_t(ktime_t, txtime, minimum_time);
+ txtime = max_t(ktime_t, txtime, interval_start);
+
+ if (admin && admin != sched &&
+ ktime_after(txtime, admin->base_time)) {
+ sched = admin;
+ sched_changed = 1;
+ continue;
+ }
+
+ transmit_end_time = ktime_add(txtime, packet_transmit_time);
+ minimum_time = transmit_end_time;
+
+ /* Update the txtime of the current entry to the next time its
+ * interval starts.
+ */
+ if (ktime_after(transmit_end_time, interval_end))
+ entry->next_txtime = ktime_add(interval_start, sched->cycle_time);
+ } while (sched_changed || ktime_after(transmit_end_time, interval_end));
+
+ entry->next_txtime = transmit_end_time;
+
+done:
+ rcu_read_unlock();
+ return txtime;
+}
+
+static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct Qdisc *child;
+ int queue;
+
+ queue = skb_get_queue_mapping(skb);
+
+ child = q->qdiscs[queue];
+ if (unlikely(!child))
+ return qdisc_drop(skb, sch, to_free);
+
+ if (skb->sk && sock_flag(skb->sk, SOCK_TXTIME)) {
+ if (!is_valid_interval(skb, sch))
+ return qdisc_drop(skb, sch, to_free);
+ } else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
+ skb->tstamp = get_packet_txtime(skb, sch);
+ if (!skb->tstamp)
+ return qdisc_drop(skb, sch, to_free);
+ }
+
+ qdisc_qstats_backlog_inc(sch, skb);
+ sch->q.qlen++;
+
+ return qdisc_enqueue(skb, child, to_free);
+}
+
+static struct sk_buff *taprio_peek_soft(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct sched_entry *entry;
+ struct sk_buff *skb;
+ u32 gate_mask;
+ int i;
+
+ rcu_read_lock();
+ entry = rcu_dereference(q->current_entry);
+ gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
+ rcu_read_unlock();
+
+ if (!gate_mask)
+ return NULL;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct Qdisc *child = q->qdiscs[i];
+ int prio;
+ u8 tc;
+
+ if (unlikely(!child))
+ continue;
+
+ skb = child->ops->peek(child);
+ if (!skb)
+ continue;
+
+ if (TXTIME_ASSIST_IS_ENABLED(q->flags))
+ return skb;
+
+ prio = skb->priority;
+ tc = netdev_get_prio_tc_map(dev, prio);
+
+ if (!(gate_mask & BIT(tc)))
+ continue;
+
+ return skb;
+ }
+
+ return NULL;
+}
+
+static struct sk_buff *taprio_peek_offload(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct sk_buff *skb;
+ int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct Qdisc *child = q->qdiscs[i];
+
+ if (unlikely(!child))
+ continue;
+
+ skb = child->ops->peek(child);
+ if (!skb)
+ continue;
+
+ return skb;
+ }
+
+ return NULL;
+}
+
+static struct sk_buff *taprio_peek(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+
+ return q->peek(sch);
+}
+
+static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
+{
+ atomic_set(&entry->budget,
+ div64_u64((u64)entry->interval * 1000,
+ atomic64_read(&q->picos_per_byte)));
+}
+
+static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct sk_buff *skb = NULL;
+ struct sched_entry *entry;
+ u32 gate_mask;
+ int i;
+
+ rcu_read_lock();
+ entry = rcu_dereference(q->current_entry);
+ /* If there's no entry, it means that the schedule hasn't
+ * started yet, so force all gates to be open; this is in
+ * accordance with IEEE 802.1Qbv-2015 Section 8.6.9.4.5
+ * "AdminGateStates".
+ */
+ gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
+
+ if (!gate_mask)
+ goto done;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct Qdisc *child = q->qdiscs[i];
+ ktime_t guard;
+ int prio;
+ int len;
+ u8 tc;
+
+ if (unlikely(!child))
+ continue;
+
+ if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
+ skb = child->ops->dequeue(child);
+ if (!skb)
+ continue;
+ goto skb_found;
+ }
+
+ skb = child->ops->peek(child);
+ if (!skb)
+ continue;
+
+ prio = skb->priority;
+ tc = netdev_get_prio_tc_map(dev, prio);
+
+ if (!(gate_mask & BIT(tc)))
+ continue;
+
+ len = qdisc_pkt_len(skb);
+ guard = ktime_add_ns(taprio_get_time(q),
+ length_to_duration(q, len));
+
+ /* In the case that there's no gate entry, there's no
+ * guard band ...
+ */
+ if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
+ ktime_after(guard, entry->close_time))
+ continue;
+
+ /* ... and no budget. */
+ if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
+ atomic_sub_return(len, &entry->budget) < 0)
+ continue;
+
+ skb = child->ops->dequeue(child);
+ if (unlikely(!skb))
+ goto done;
+
+skb_found:
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+
+ goto done;
+ }
+
+done:
+ rcu_read_unlock();
+
+ return skb;
+}
+
+static struct sk_buff *taprio_dequeue_offload(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct sk_buff *skb;
+ int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct Qdisc *child = q->qdiscs[i];
+
+ if (unlikely(!child))
+ continue;
+
+ skb = child->ops->dequeue(child);
+ if (unlikely(!skb))
+ continue;
+
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+
+ return skb;
+ }
+
+ return NULL;
+}
+
+static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+
+ return q->dequeue(sch);
+}
+
+static bool should_restart_cycle(const struct sched_gate_list *oper,
+ const struct sched_entry *entry)
+{
+ if (list_is_last(&entry->list, &oper->entries))
+ return true;
+
+ if (ktime_compare(entry->close_time, oper->cycle_close_time) == 0)
+ return true;
+
+ return false;
+}
+
+static bool should_change_schedules(const struct sched_gate_list *admin,
+ const struct sched_gate_list *oper,
+ ktime_t close_time)
+{
+ ktime_t next_base_time, extension_time;
+
+ if (!admin)
+ return false;
+
+ next_base_time = sched_base_time(admin);
+
+ /* This is the simple case: the close_time would fall at or
+ * after the next schedule's base_time.
+ */
+ if (ktime_compare(next_base_time, close_time) <= 0)
+ return true;
+
+ /* This is the cycle_time_extension case: if the close_time
+ * plus the amount that can be extended would fall at or after
+ * the next schedule's base_time, we can extend the current
+ * schedule by that amount.
+ */
+ extension_time = ktime_add_ns(close_time, oper->cycle_time_extension);
+
+ /* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about
+ * how precisely the extension should be made. So after
+ * conformance testing, this logic may change.
+ */
+ if (ktime_compare(next_base_time, extension_time) <= 0)
+ return true;
+
+ return false;
+}
+
+static enum hrtimer_restart advance_sched(struct hrtimer *timer)
+{
+ struct taprio_sched *q = container_of(timer, struct taprio_sched,
+ advance_timer);
+ struct sched_gate_list *oper, *admin;
+ struct sched_entry *entry, *next;
+ struct Qdisc *sch = q->root;
+ ktime_t close_time;
+
+ spin_lock(&q->current_entry_lock);
+ entry = rcu_dereference_protected(q->current_entry,
+ lockdep_is_held(&q->current_entry_lock));
+ oper = rcu_dereference_protected(q->oper_sched,
+ lockdep_is_held(&q->current_entry_lock));
+ admin = rcu_dereference_protected(q->admin_sched,
+ lockdep_is_held(&q->current_entry_lock));
+
+ if (!oper)
+ switch_schedules(q, &admin, &oper);
+
+ /* This can happen in two cases: 1. this is the very first run
+ * of this function (i.e. we weren't running any schedule
+ * previously); 2. the previous schedule just ended. The first
+ * entry of every schedule is pre-calculated during
+ * schedule initialization.
+ */
+ if (unlikely(!entry || entry->close_time == oper->base_time)) {
+ next = list_first_entry(&oper->entries, struct sched_entry,
+ list);
+ close_time = next->close_time;
+ goto first_run;
+ }
+
+ if (should_restart_cycle(oper, entry)) {
+ next = list_first_entry(&oper->entries, struct sched_entry,
+ list);
+ oper->cycle_close_time = ktime_add_ns(oper->cycle_close_time,
+ oper->cycle_time);
+ } else {
+ next = list_next_entry(entry, list);
+ }
+
+ close_time = ktime_add_ns(entry->close_time, next->interval);
+ close_time = min_t(ktime_t, close_time, oper->cycle_close_time);
+
+ if (should_change_schedules(admin, oper, close_time)) {
+ /* Set things so the next time this runs, the new
+ * schedule runs.
+ */
+ close_time = sched_base_time(admin);
+ switch_schedules(q, &admin, &oper);
+ }
+
+ next->close_time = close_time;
+ taprio_set_budget(q, next);
+
+first_run:
+ rcu_assign_pointer(q->current_entry, next);
+ spin_unlock(&q->current_entry_lock);
+
+ hrtimer_set_expires(&q->advance_timer, close_time);
+
+ rcu_read_lock();
+ __netif_schedule(sch);
+ rcu_read_unlock();
+
+ return HRTIMER_RESTART;
+}
+
+static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
+ [TCA_TAPRIO_SCHED_ENTRY_INDEX] = { .type = NLA_U32 },
+ [TCA_TAPRIO_SCHED_ENTRY_CMD] = { .type = NLA_U8 },
+ [TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 },
+ [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
+ [TCA_TAPRIO_ATTR_PRIOMAP] = {
+ .len = sizeof(struct tc_mqprio_qopt)
+ },
+ [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED },
+ [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 },
+ [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED },
+ [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 },
+ [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = { .type = NLA_S64 },
+ [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 },
+};
+
+/* Copy the attributes of one parsed schedule entry from @tb into @entry.
+ * Command and gate mask are optional; a missing or zero interval is
+ * rejected with -EINVAL and a message in @extack.
+ */
+static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry,
+			    struct netlink_ext_ack *extack)
+{
+	struct nlattr *cmd_attr = tb[TCA_TAPRIO_SCHED_ENTRY_CMD];
+	struct nlattr *mask_attr = tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK];
+	struct nlattr *ival_attr = tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL];
+	u32 ns;
+
+	if (cmd_attr)
+		entry->command = nla_get_u8(cmd_attr);
+	if (mask_attr)
+		entry->gate_mask = nla_get_u32(mask_attr);
+
+	ns = ival_attr ? nla_get_u32(ival_attr) : 0;
+	if (!ns) {
+		NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry");
+		return -EINVAL;
+	}
+
+	entry->interval = ns;
+	return 0;
+}
+
+/* Parse the nested attribute @n into @entry and record @index as the
+ * entry's position. Any parse failure is reported as -EINVAL.
+ */
+static int parse_sched_entry(struct nlattr *n, struct sched_entry *entry,
+			     int index, struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { };
+
+	if (nla_parse_nested_deprecated(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n,
+					entry_policy, NULL) < 0) {
+		NL_SET_ERR_MSG(extack, "Could not parse nested entry");
+		return -EINVAL;
+	}
+
+	entry->index = index;
+	return fill_sched_entry(tb, entry, extack);
+}
+
+/* Build @sched's entry list from the nested attribute @list. Returns the
+ * number of entries added or a negative errno; entries queued before a
+ * failure are left on @sched->entries.
+ */
+static int parse_sched_list(struct nlattr *list,
+			    struct sched_gate_list *sched,
+			    struct netlink_ext_ack *extack)
+{
+	struct sched_entry *entry;
+	struct nlattr *attr;
+	int count = 0;
+	int ret, rem;
+
+	if (!list)
+		return -EINVAL;
+
+	nla_for_each_nested(attr, list, rem) {
+		if (nla_type(attr) != TCA_TAPRIO_SCHED_ENTRY) {
+			NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'");
+			continue;
+		}
+
+		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+		if (!entry) {
+			NL_SET_ERR_MSG(extack, "Not enough memory for entry");
+			return -ENOMEM;
+		}
+
+		ret = parse_sched_entry(attr, entry, count, extack);
+		if (ret < 0) {
+			kfree(entry);
+			return ret;
+		}
+
+		list_add_tail(&entry->list, &sched->entries);
+		count++;
+	}
+	sched->num_entries = count;
+	return count;
+}
+
+/* Populate @new from the schedule attributes in @tb; a cycle time left at
+ * zero defaults to the sum of the entry intervals. Returns 0 or -errno.
+ */
+static int parse_taprio_schedule(struct nlattr **tb,
+				 struct sched_gate_list *new,
+				 struct netlink_ext_ack *extack)
+{
+	struct sched_entry *entry;
+	ktime_t total = 0;
+	int ret;
+
+	if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) {
+		NL_SET_ERR_MSG(extack, "Adding a single entry is not supported");
+		return -ENOTSUPP;
+	}
+
+	if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME])
+		new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]);
+	if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION])
+		new->cycle_time_extension = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]);
+	if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME])
+		new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]);
+
+	if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]) {
+		ret = parse_sched_list(tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], new, extack);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (new->cycle_time)
+		return 0;
+
+	list_for_each_entry(entry, &new->entries, list)
+		total = ktime_add_ns(total, entry->interval);
+	new->cycle_time = total;
+	return 0;
+}
+
+/* Validate the mqprio-style traffic class layout in @qopt against @dev.
+ * Nothing is checked when @dev already has traffic classes configured.
+ * Returns 0 when the layout is acceptable, -EINVAL otherwise.
+ */
+static int taprio_parse_mqprio_opt(struct net_device *dev,
+				   struct tc_mqprio_qopt *qopt,
+				   struct netlink_ext_ack *extack,
+				   u32 taprio_flags)
+{
+	int tc, other;
+
+	/* num_tc already set means the mqprio part was configured before */
+	if (dev->num_tc)
+		return 0;
+
+	if (!qopt) {
+		NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary");
+		return -EINVAL;
+	}
+
+	/* The class count must respect both the tc limit and the HW queues;
+	 * taprio imposes that traffic classes map 1:n to tx queues.
+	 */
+	if (qopt->num_tc > TC_MAX_QUEUE) {
+		NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range");
+		return -EINVAL;
+	}
+	if (qopt->num_tc > dev->num_tx_queues) {
+		NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues");
+		return -EINVAL;
+	}
+
+	/* Every priority has to point at an existing traffic class */
+	for (tc = 0; tc <= TC_BITMASK; tc++) {
+		if (qopt->prio_tc_map[tc] < qopt->num_tc)
+			continue;
+		NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping");
+		return -EINVAL;
+	}
+
+	for (tc = 0; tc < qopt->num_tc; tc++) {
+		unsigned int first = qopt->offset[tc];
+		unsigned int last = first + qopt->count[tc];
+
+		/* Non-empty range; last == real_num_tx_queues is still valid */
+		if (first >= dev->num_tx_queues || !qopt->count[tc] ||
+		    last > dev->real_num_tx_queues) {
+			NL_SET_ERR_MSG(extack, "Invalid queue in traffic class to queue mapping");
+			return -EINVAL;
+		}
+
+		/* The overlap check is skipped in txtime-assist mode */
+		if (TXTIME_ASSIST_IS_ENABLED(taprio_flags))
+			continue;
+
+		for (other = tc + 1; other < qopt->num_tc; other++) {
+			if (last <= qopt->offset[other])
+				continue;
+			NL_SET_ERR_MSG(extack, "Detected overlap in the traffic class to queue mapping");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/* Compute in @start when @sched should begin running.
+ * A base time still in the future is used as is; otherwise the start is
+ * moved to the next cycle boundary after the current time.
+ * Returns 0, or -EFAULT if the schedule has a zero cycle time.
+ */
+static int taprio_get_start_time(struct Qdisc *sch,
+				 struct sched_gate_list *sched,
+				 ktime_t *start)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	ktime_t base = sched_base_time(sched);
+	ktime_t now = taprio_get_time(q);
+	ktime_t cycle = sched->cycle_time;
+	s64 elapsed_cycles;
+
+	if (ktime_after(base, now)) {
+		*start = base;
+		return 0;
+	}
+
+	/* The qdisc is expected to have at least one sched_entry, each with
+	 * 'interval' > 0, so a zero cycle time means an inconsistent state:
+	 * warn and bail out.
+	 */
+	if (WARN_ON(!cycle))
+		return -EFAULT;
+
+	/* Start at the beginning of the next cycle. */
+	elapsed_cycles = div64_s64(ktime_sub_ns(now, base), cycle);
+	*start = ktime_add_ns(base, (elapsed_cycles + 1) * cycle);
+
+	return 0;
+}
+
+/* Prepare @sched to start at @base: set the close time of the first cycle
+ * and of the first entry, call taprio_set_budget() on that entry and clear
+ * q->current_entry.
+ */
+static void setup_first_close_time(struct taprio_sched *q,
+				   struct sched_gate_list *sched, ktime_t base)
+{
+	struct sched_entry *head = list_first_entry(&sched->entries,
+						    struct sched_entry, list);
+
+	/* FIXME: find a better place to do this */
+	sched->cycle_close_time = ktime_add_ns(base, sched->cycle_time);
+
+	head->close_time = ktime_add_ns(base, head->interval);
+	taprio_set_budget(q, head);
+
+	rcu_assign_pointer(q->current_entry, NULL);
+}
+
+/* Arm the advance timer for a schedule beginning at @start.
+ * Nothing is armed in full offload mode. If the timer is already due to
+ * fire earlier than @start, that earlier expiry is kept so the admin
+ * schedule is switched to operational at the right time.
+ */
+static void taprio_start_sched(struct Qdisc *sch,
+			       ktime_t start, struct sched_gate_list *new)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	ktime_t pending;
+
+	if (FULL_OFFLOAD_IS_ENABLED(q->flags))
+		return;
+
+	/* An expiry of zero is treated as "no expiry programmed" */
+	pending = hrtimer_get_expires(&q->advance_timer);
+	if (pending != 0 && pending < start)
+		start = pending;
+
+	hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS);
+}
+
+/* Refresh q->picos_per_byte from the link speed of @dev, using SPEED_10 if
+ * it is unavailable. link_speed keeps the log free of uninitialised @ecmd.
+ */
+static void taprio_set_picos_per_byte(struct net_device *dev,
+				      struct taprio_sched *q)
+{
+	struct ethtool_link_ksettings ecmd;
+	int link_speed = SPEED_UNKNOWN;
+	int speed = SPEED_10;
+	int picos_per_byte;
+
+	if (__ethtool_get_link_ksettings(dev, &ecmd) >= 0) {
+		link_speed = ecmd.base.speed;
+		if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN)
+			speed = ecmd.base.speed;
+	}
+
+	picos_per_byte = (USEC_PER_SEC * 8) / speed;
+	atomic64_set(&q->picos_per_byte, picos_per_byte);
+	netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n",
+		   dev->name, (long long)atomic64_read(&q->picos_per_byte),
+		   link_speed);
+}
+
+/* Netdevice notifier: on NETDEV_UP and NETDEV_CHANGE, recompute
+ * picos_per_byte for the taprio instance rooted on the device, if any.
+ */
+static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
+			       void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct taprio_sched *match = NULL;
+	struct taprio_sched *q;
+
+	ASSERT_RTNL();
+	if (!(event == NETDEV_UP || event == NETDEV_CHANGE))
+		return NOTIFY_DONE;
+
+	spin_lock(&taprio_list_lock);
+	list_for_each_entry(q, &taprio_list, taprio_list) {
+		if (qdisc_dev(q->root) != dev)
+			continue;
+		match = q;
+		break;
+	}
+	spin_unlock(&taprio_list_lock);
+
+	if (match)
+		taprio_set_picos_per_byte(dev, match);
+
+	return NOTIFY_DONE;
+}
+
+static void setup_txtime(struct taprio_sched *q,
+			 struct sched_gate_list *sched, ktime_t base)
+{
+	struct sched_entry *entry;
+	u64 interval = 0;	/* 64-bit so the summed entry intervals cannot wrap */
+
+	list_for_each_entry(entry, &sched->entries, list) {
+		entry->next_txtime = ktime_add_ns(base, interval);
+		interval += entry->interval;
+	}
+}
+
+/* Allocate a zeroed offload descriptor with room for @num_entries entries
+ * and a reference count of one. Returns NULL if the allocation fails.
+ */
+static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries)
+{
+	size_t entries_sz = sizeof(struct tc_taprio_sched_entry) * num_entries;
+	struct __tc_taprio_qopt_offload *wrapper;
+
+	wrapper = kzalloc(entries_sz + sizeof(*wrapper), GFP_KERNEL);
+	if (!wrapper)
+		return NULL;
+	refcount_set(&wrapper->users, 1);
+	return &wrapper->offload;
+}
+
+/* Take an additional reference on @offload.
+ * Returns @offload itself so the call can be used inline.
+ */
+struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload
+						  *offload)
+{
+	struct __tc_taprio_qopt_offload *wrapper =
+		container_of(offload, struct __tc_taprio_qopt_offload, offload);
+
+	refcount_inc(&wrapper->users);
+	return offload;
+}
+EXPORT_SYMBOL_GPL(taprio_offload_get);
+
+/* Drop one reference on @offload.
+ * The enclosing allocation is freed once the last reference is gone;
+ * until then this is only a counter decrement.
+ */
+void taprio_offload_free(struct tc_taprio_qopt_offload *offload)
+{
+	struct __tc_taprio_qopt_offload *wrapper =
+		container_of(offload, struct __tc_taprio_qopt_offload, offload);
+
+	if (refcount_dec_and_test(&wrapper->users))
+		kfree(wrapper);
+}
+EXPORT_SYMBOL_GPL(taprio_offload_free);
+
+/* The function will only serve to keep the pointers to the "oper" and "admin"
+ * schedules valid in relation to their base times, so when calling dump() the
+ * user looks at the right schedules.
+ * When using full offload, the admin configuration is promoted to oper at the
+ * base_time in the PHC time domain. But because the system time is not
+ * necessarily in sync with that, we can't just trigger a hrtimer to call
+ * switch_schedules at the right hardware time.
+ * At the moment we call this by hand right away from taprio, but in the future
+ * it will be useful to create a mechanism for drivers to notify taprio of the
+ * offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump().
+ * This is left as TODO.
+ */
+static void taprio_offload_config_changed(struct taprio_sched *q)
+{
+ struct sched_gate_list *oper, *admin;
+
+ spin_lock(&q->current_entry_lock);
+ /* Both pointers are read under current_entry_lock, as lockdep is told */
+ oper = rcu_dereference_protected(q->oper_sched,
+ lockdep_is_held(&q->current_entry_lock));
+ admin = rcu_dereference_protected(q->admin_sched,
+ lockdep_is_held(&q->current_entry_lock));
+
+ switch_schedules(q, &admin, &oper);
+
+ spin_unlock(&q->current_entry_lock);
+}
+
+/* Mirror the timing parameters and every entry of @sched into @offload. */
+static void taprio_sched_to_offload(struct taprio_sched *q,
+				    struct sched_gate_list *sched,
+				    const struct tc_mqprio_qopt *mqprio,
+				    struct tc_taprio_qopt_offload *offload)
+{
+	struct sched_entry *src;
+	int count = 0;
+
+	offload->base_time = sched->base_time;
+	offload->cycle_time = sched->cycle_time;
+	offload->cycle_time_extension = sched->cycle_time_extension;
+
+	list_for_each_entry(src, &sched->entries, list) {
+		struct tc_taprio_sched_entry *dst = &offload->entries[count++];
+
+		dst->command = src->command;
+		dst->interval = src->interval;
+		dst->gate_mask = src->gate_mask;
+	}
+
+	offload->num_entries = count;
+}
+
+/* Ask the driver of @dev to install @sched in hardware: a temporary offload
+ * descriptor is built from @sched, passed to ndo_setup_tc() and this
+ * function's reference to it dropped. Returns 0 or a negative errno.
+ */
+static int taprio_enable_offload(struct net_device *dev,
+				 struct tc_mqprio_qopt *mqprio,
+				 struct taprio_sched *q,
+				 struct sched_gate_list *sched,
+				 struct netlink_ext_ack *extack)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	struct tc_taprio_qopt_offload *offload;
+	int ret;
+
+	if (!ops->ndo_setup_tc) {
+		NL_SET_ERR_MSG(extack,
+			       "Device does not support taprio offload");
+		return -EOPNOTSUPP;
+	}
+
+	offload = taprio_offload_alloc(sched->num_entries);
+	if (!offload) {
+		NL_SET_ERR_MSG(extack,
+			       "Not enough memory for enabling offload mode");
+		return -ENOMEM;
+	}
+
+	offload->enable = 1;
+	taprio_sched_to_offload(q, sched, mqprio, offload);
+
+	ret = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
+	if (ret < 0)
+		NL_SET_ERR_MSG(extack, "Device failed to setup taprio offload");
+
+	taprio_offload_free(offload);
+	return ret;
+}
+
+/* Ask the driver of @dev to tear down a previously installed schedule.
+ * Does nothing (and returns 0) unless full offload is enabled in @q.
+ * Returns 0 on success or a negative errno.
+ */
+static int taprio_disable_offload(struct net_device *dev,
+				  struct taprio_sched *q,
+				  struct netlink_ext_ack *extack)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	struct tc_taprio_qopt_offload *offload;
+	int ret;
+
+	if (!FULL_OFFLOAD_IS_ENABLED(q->flags))
+		return 0;
+
+	if (!ops->ndo_setup_tc)
+		return -EOPNOTSUPP;
+
+	/* The driver gets an entry-less descriptor with ->enable cleared */
+	offload = taprio_offload_alloc(0);
+	if (!offload) {
+		NL_SET_ERR_MSG(extack,
+			       "Not enough memory to disable offload mode");
+		return -ENOMEM;
+	}
+	offload->enable = 0;
+
+	ret = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
+	if (ret < 0)
+		NL_SET_ERR_MSG(extack, "Device failed to disable offload");
+
+	taprio_offload_free(offload);
+	return ret;
+}
+
+/* Validate and record the clock this qdisc runs on.
+ * Full offload always uses the net device's PHC, so passing a clockid via
+ * netlink is an error there and the device must expose a PTP clock.
+ * Software and txtime-assist modes drive the schedule-advancing hrtimer from
+ * the given clockid, which makes it mandatory; for txtime-assist the PHC is
+ * assumed to be kept in sync with it by user space (e.g. phc2sys).
+ */
+static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb,
+				struct netlink_ext_ack *extack)
+{
+	struct nlattr *attr = tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID];
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	int clockid;
+
+	if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
+		const struct ethtool_ops *ops = dev->ethtool_ops;
+		struct ethtool_ts_info info = {
+			.cmd = ETHTOOL_GET_TS_INFO,
+			.phc_index = -1,
+		};
+		/* Stays an error when the driver has no get_ts_info() */
+		int ts_err = -EINVAL;
+
+		if (attr) {
+			NL_SET_ERR_MSG(extack,
+				       "The 'clockid' cannot be specified for full offload");
+			return -EINVAL;
+		}
+
+		if (ops && ops->get_ts_info)
+			ts_err = ops->get_ts_info(dev, &info);
+
+		if (ts_err || info.phc_index < 0) {
+			NL_SET_ERR_MSG(extack,
+				       "Device does not have a PTP clock");
+			return -ENOTSUPP;
+		}
+
+		return 0;
+	}
+
+	if (!attr) {
+		NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory");
+		return -EINVAL;
+	}
+
+	clockid = nla_get_s32(attr);
+
+	/* We only support static clockids and we don't allow
+	 * for it to be modified after the first init.
+	 */
+	if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid)) {
+		NL_SET_ERR_MSG(extack,
+			       "Changing the 'clockid' of a running schedule is not supported");
+		return -ENOTSUPP;
+	}
+
+	/* Record the tk_offset that goes with the chosen clock */
+	switch (clockid) {
+	case CLOCK_REALTIME:
+		q->tk_offset = TK_OFFS_REAL;
+		break;
+	case CLOCK_MONOTONIC:
+		q->tk_offset = TK_OFFS_MAX;
+		break;
+	case CLOCK_BOOTTIME:
+		q->tk_offset = TK_OFFS_BOOT;
+		break;
+	case CLOCK_TAI:
+		q->tk_offset = TK_OFFS_TAI;
+		break;
+	default:
+		NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
+		return -EINVAL;
+	}
+
+	q->clockid = clockid;
+
+	return 0;
+}
+
+/* Return 0 if @mqprio equals the mapping already on @dev, -1 otherwise. */
+static int taprio_mqprio_cmp(const struct net_device *dev,
+			     const struct tc_mqprio_qopt *mqprio)
+{
+	int tc, prio;
+
+	if (!mqprio || mqprio->num_tc != dev->num_tc)
+		return -1;
+
+	for (tc = 0; tc < mqprio->num_tc; tc++)
+		if (dev->tc_to_txq[tc].offset != mqprio->offset[tc] ||
+		    dev->tc_to_txq[tc].count != mqprio->count[tc])
+			return -1;
+	for (prio = 0; prio <= TC_BITMASK; prio++)
+		if (dev->prio_tc_map[prio] != mqprio->prio_tc_map[prio])
+			return -1;
+
+	return 0;
+}
+
+static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { };
+ struct sched_gate_list *oper, *admin, *new_admin;
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_mqprio_qopt *mqprio = NULL;
+ u32 taprio_flags = 0;
+ unsigned long flags;
+ ktime_t start;
+ int i, err;
+
+ err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt,
+ taprio_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
+ mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);
+ /* NOTE(review): taprio_flags stays 0 when the attribute is absent, even if q->flags is already set - confirm */
+ if (tb[TCA_TAPRIO_ATTR_FLAGS]) {
+ taprio_flags = nla_get_u32(tb[TCA_TAPRIO_ATTR_FLAGS]);
+
+ if (q->flags != 0 && q->flags != taprio_flags) {
+ NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported");
+ return -EOPNOTSUPP;
+ } else if (!taprio_flags_valid(taprio_flags)) {
+ NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid");
+ return -EINVAL;
+ }
+
+ q->flags = taprio_flags;
+ }
+
+ err = taprio_parse_mqprio_opt(dev, mqprio, extack, taprio_flags);
+ if (err < 0)
+ return err;
+ /* Candidate admin schedule; released at free_sched unless it gets installed */
+ new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL);
+ if (!new_admin) {
+ NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule");
+ return -ENOMEM;
+ }
+ INIT_LIST_HEAD(&new_admin->entries);
+ /* Look up the schedules that are currently installed */
+ rcu_read_lock();
+ oper = rcu_dereference(q->oper_sched);
+ admin = rcu_dereference(q->admin_sched);
+ rcu_read_unlock();
+
+ /* no changes - no new mqprio settings */
+ if (!taprio_mqprio_cmp(dev, mqprio))
+ mqprio = NULL;
+
+ if (mqprio && (oper || admin)) {
+ NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported");
+ err = -ENOTSUPP;
+ goto free_sched;
+ }
+
+ err = parse_taprio_schedule(tb, new_admin, extack);
+ if (err < 0)
+ goto free_sched;
+
+ if (new_admin->num_entries == 0) {
+ NL_SET_ERR_MSG(extack, "There should be at least one entry in the schedule");
+ err = -EINVAL;
+ goto free_sched;
+ }
+
+ err = taprio_parse_clockid(sch, tb, extack);
+ if (err < 0)
+ goto free_sched;
+
+ taprio_set_picos_per_byte(dev, q);
+
+ if (FULL_OFFLOAD_IS_ENABLED(taprio_flags))
+ err = taprio_enable_offload(dev, mqprio, q, new_admin, extack);
+ else
+ err = taprio_disable_offload(dev, q, extack);
+ if (err)
+ goto free_sched;
+
+ /* Protects against enqueue()/dequeue() */
+ spin_lock_bh(qdisc_lock(sch));
+
+ if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) {
+ if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) {
+ NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled");
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]);
+ }
+ /* Pure software mode: (re)initialise the advance timer on q->clockid unless it is active */
+ if (!TXTIME_ASSIST_IS_ENABLED(taprio_flags) &&
+ !FULL_OFFLOAD_IS_ENABLED(taprio_flags) &&
+ !hrtimer_active(&q->advance_timer)) {
+ hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS);
+ q->advance_timer.function = advance_sched;
+ }
+ /* Push a new traffic class mapping, if one was given, to the device */
+ if (mqprio) {
+ netdev_set_num_tc(dev, mqprio->num_tc);
+ for (i = 0; i < mqprio->num_tc; i++)
+ netdev_set_tc_queue(dev, i,
+ mqprio->count[i],
+ mqprio->offset[i]);
+
+ /* Always use supplied priority mappings */
+ for (i = 0; i <= TC_BITMASK; i++)
+ netdev_set_prio_tc_map(dev, i,
+ mqprio->prio_tc_map[i]);
+ }
+
+ if (FULL_OFFLOAD_IS_ENABLED(taprio_flags)) {
+ q->dequeue = taprio_dequeue_offload;
+ q->peek = taprio_peek_offload;
+ } else {
+ /* Be sure to always keep the function pointers
+ * in a consistent state.
+ */
+ q->dequeue = taprio_dequeue_soft;
+ q->peek = taprio_peek_soft;
+ }
+
+ err = taprio_get_start_time(sch, new_admin, &start);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Internal error: failed get start time");
+ goto unlock;
+ }
+ /* txtime-assist: the first schedule becomes oper directly, later ones become admin */
+ if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) {
+ setup_txtime(q, new_admin, start);
+
+ if (!oper) {
+ rcu_assign_pointer(q->oper_sched, new_admin);
+ err = 0;
+ new_admin = NULL;
+ goto unlock;
+ }
+
+ rcu_assign_pointer(q->admin_sched, new_admin);
+ if (admin)
+ call_rcu(&admin->rcu, taprio_free_sched_cb);
+ } else {
+ setup_first_close_time(q, new_admin, start);
+
+ /* Protects against advance_sched() */
+ spin_lock_irqsave(&q->current_entry_lock, flags);
+
+ taprio_start_sched(sch, start, new_admin);
+
+ rcu_assign_pointer(q->admin_sched, new_admin);
+ if (admin)
+ call_rcu(&admin->rcu, taprio_free_sched_cb);
+
+ spin_unlock_irqrestore(&q->current_entry_lock, flags);
+
+ if (FULL_OFFLOAD_IS_ENABLED(taprio_flags))
+ taprio_offload_config_changed(q);
+ }
+ /* new_admin is now published through q; clear it so free_sched leaves it alone */
+ new_admin = NULL;
+ err = 0;
+
+unlock:
+ spin_unlock_bh(qdisc_lock(sch));
+
+free_sched:
+ if (new_admin)
+ call_rcu(&new_admin->rcu, taprio_free_sched_cb);
+
+ return err;
+}
+
+static void taprio_destroy(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned int i;
+
+ spin_lock(&taprio_list_lock);
+ list_del(&q->taprio_list);
+ spin_unlock(&taprio_list_lock);
+ /* Stop the advance timer before the rest of the state is torn down */
+ hrtimer_cancel(&q->advance_timer);
+
+ taprio_disable_offload(dev, q, NULL);
+ /* Release the per-queue child qdiscs; the loop ends at the first unset slot */
+ if (q->qdiscs) {
+ for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++)
+ qdisc_put(q->qdiscs[i]);
+
+ kfree(q->qdiscs);
+ }
+ q->qdiscs = NULL;
+
+ netdev_set_num_tc(dev, 0);
+ /* Both schedules are handed to call_rcu() rather than freed directly */
+ if (q->oper_sched)
+ call_rcu(&q->oper_sched->rcu, taprio_free_sched_cb);
+
+ if (q->admin_sched)
+ call_rcu(&q->admin_sched->rcu, taprio_free_sched_cb);
+}
+
+static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ int i;
+
+ spin_lock_init(&q->current_entry_lock);
+ /* Provisional clock; taprio_change() may re-initialise the timer on q->clockid */
+ hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS);
+ q->advance_timer.function = advance_sched;
+
+ q->dequeue = taprio_dequeue_soft;
+ q->peek = taprio_peek_soft;
+
+ q->root = sch;
+
+ /* We only support static clockids. Use an invalid value as default
+ * and get the valid one on taprio_change().
+ */
+ q->clockid = -1;
+ /* NOTE(review): linked before the checks below can fail; presumably ->destroy() runs on init failure and unlinks it - confirm */
+ spin_lock(&taprio_list_lock);
+ list_add(&q->taprio_list, &taprio_list);
+ spin_unlock(&taprio_list_lock);
+
+ if (sch->parent != TC_H_ROOT)
+ return -EOPNOTSUPP;
+
+ if (!netif_is_multiqueue(dev))
+ return -EOPNOTSUPP;
+
+ /* pre-allocate qdisc, attachment can't fail */
+ q->qdiscs = kcalloc(dev->num_tx_queues,
+ sizeof(q->qdiscs[0]),
+ GFP_KERNEL);
+
+ if (!q->qdiscs)
+ return -ENOMEM;
+
+ if (!opt)
+ return -EINVAL;
+ /* One default pfifo child per tx queue; queue i gets minor i + 1 */
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *dev_queue;
+ struct Qdisc *qdisc;
+
+ dev_queue = netdev_get_tx_queue(dev, i);
+ qdisc = qdisc_create_dflt(dev_queue,
+ &pfifo_qdisc_ops,
+ TC_H_MAKE(TC_H_MAJ(sch->handle),
+ TC_H_MIN(i + 1)),
+ extack);
+ if (!qdisc)
+ return -ENOMEM;
+
+ if (i < dev->real_num_tx_queues)
+ qdisc_hash_add(qdisc, false);
+
+ q->qdiscs[i] = qdisc;
+ }
+ /* The remaining configuration is parsed and applied by taprio_change() */
+ return taprio_change(sch, opt, extack);
+}
+
+/* Translate the 1-based class id @cl into a tx queue of the device, or NULL
+ * when @cl does not name an existing queue (including @cl == 0).
+ */
+static struct netdev_queue *taprio_queue_get(struct Qdisc *sch,
+					     unsigned long cl)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned long idx = cl - 1;	/* cl == 0 wraps and fails the check */
+
+	return idx < dev->num_tx_queues ? netdev_get_tx_queue(dev, idx) : NULL;
+}
+
+static int taprio_graft(struct Qdisc *sch, unsigned long cl,
+ struct Qdisc *new, struct Qdisc **old,
+ struct netlink_ext_ack *extack)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+
+ if (!dev_queue)
+ return -EINVAL;
+ /* A device that is up is deactivated around the swap and reactivated after it */
+ if (dev->flags & IFF_UP)
+ dev_deactivate(dev);
+ /* The previous child is handed back to the caller through @old */
+ *old = q->qdiscs[cl - 1];
+ q->qdiscs[cl - 1] = new;
+
+ if (new)
+ new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
+
+ if (dev->flags & IFF_UP)
+ dev_activate(dev);
+
+ return 0;
+}
+
+/* Emit @entry as one nested TCA_TAPRIO_SCHED_ENTRY attribute in @msg.
+ * Returns the nla_nest_end() result on success, -ENOSPC if the nest
+ * cannot be started, or -1 if one of the fields cannot be added (the
+ * partially written nest is cancelled first).
+ */
+static int dump_entry(struct sk_buff *msg,
+		      const struct sched_entry *entry)
+{
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(msg, TCA_TAPRIO_SCHED_ENTRY);
+	if (!nest)
+		return -ENOSPC;
+
+	if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX,
+			entry->index) ||
+	    nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD,
+		       entry->command) ||
+	    nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK,
+			entry->gate_mask) ||
+	    nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL,
+			entry->interval)) {
+		/* Roll back whatever part of the nest was written */
+		nla_nest_cancel(msg, nest);
+		return -1;
+	}
+
+	return nla_nest_end(msg, nest);
+}
+
+/* Write @root's base time, cycle time, cycle time extension and nested entry
+ * list into @msg. Returns 0, or -1 if any attribute cannot be added.
+ */
+static int dump_schedule(struct sk_buff *msg,
+			 const struct sched_gate_list *root)
+{
+	struct nlattr *entries_nest;
+	struct sched_entry *cur;
+
+	if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_BASE_TIME,
+			root->base_time, TCA_TAPRIO_PAD) ||
+	    nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME,
+			root->cycle_time, TCA_TAPRIO_PAD) ||
+	    nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION,
+			root->cycle_time_extension, TCA_TAPRIO_PAD))
+		return -1;
+
+	entries_nest = nla_nest_start_noflag(msg,
+					     TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST);
+	if (!entries_nest)
+		goto cancel;
+
+	list_for_each_entry(cur, &root->entries, list) {
+		if (dump_entry(msg, cur) < 0)
+			goto cancel;
+	}
+
+	nla_nest_end(msg, entries_nest);
+	return 0;
+
+cancel:
+	/* entries_nest is NULL here when the nest could not be started */
+	nla_nest_cancel(msg, entries_nest);
+	return -1;
+}
+
+static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct sched_gate_list *oper, *admin;
+ struct tc_mqprio_qopt opt = { 0 };
+ struct nlattr *nest, *sched_nest;
+ unsigned int i;
+
+ rcu_read_lock();
+ oper = rcu_dereference(q->oper_sched);
+ admin = rcu_dereference(q->admin_sched);
+ /* Rebuild the mqprio view from the device's current traffic class setup */
+ opt.num_tc = netdev_get_num_tc(dev);
+ memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
+
+ for (i = 0; i < netdev_get_num_tc(dev); i++) {
+ opt.count[i] = dev->tc_to_txq[i].count;
+ opt.offset[i] = dev->tc_to_txq[i].offset;
+ }
+
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
+ if (!nest)
+ goto start_error;
+
+ if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt))
+ goto options_error;
+ /* The clockid is only reported when full offload is not in use */
+ if (!FULL_OFFLOAD_IS_ENABLED(q->flags) &&
+ nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
+ goto options_error;
+
+ if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags))
+ goto options_error;
+
+ if (q->txtime_delay &&
+ nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
+ goto options_error;
+ /* The oper schedule is dumped inline, the admin schedule inside its own nest */
+ if (oper && dump_schedule(skb, oper))
+ goto options_error;
+
+ if (!admin)
+ goto done;
+
+ sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED);
+ if (!sched_nest)
+ goto options_error;
+
+ if (dump_schedule(skb, admin))
+ goto admin_error;
+
+ nla_nest_end(skb, sched_nest);
+
+done:
+ rcu_read_unlock();
+
+ return nla_nest_end(skb, nest);
+
+admin_error:
+ nla_nest_cancel(skb, sched_nest);
+
+options_error:
+ nla_nest_cancel(skb, nest);
+
+start_error:
+ rcu_read_unlock();
+ return -ENOSPC;
+}
+
+/* Return the qdisc_sleeping of the tx queue selected by class @cl, or NULL
+ * when @cl does not map to a queue.
+ */
+static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl)
+{
+	struct netdev_queue *txq = taprio_queue_get(sch, cl);
+
+	return txq ? txq->qdisc_sleeping : NULL;
+}
+
+/* Return the minor of @classid if it names a valid class, 0 otherwise. */
+static unsigned long taprio_find(struct Qdisc *sch, u32 classid)
+{
+	unsigned int minor = TC_H_MIN(classid);
+	struct netdev_queue *txq = taprio_queue_get(sch, minor);
+
+	return txq ? minor : 0;
+}
+
+/* Describe class @cl in @tcm; @cl must refer to an existing tx queue. */
+static int taprio_dump_class(struct Qdisc *sch, unsigned long cl,
+			     struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct netdev_queue *txq = taprio_queue_get(sch, cl);
+
+	tcm->tcm_parent = TC_H_ROOT;
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	tcm->tcm_info = txq->qdisc_sleeping->handle;
+	return 0;
+}
+
+/* Copy the statistics of class @cl's sleeping qdisc into @d; -1 on error. */
+static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+				   struct gnet_dump *d)
+	__releases(d->lock)
+	__acquires(d->lock)
+{
+	struct Qdisc *leaf = taprio_queue_get(sch, cl)->qdisc_sleeping;
+
+	if (gnet_stats_copy_basic(&leaf->running, d, NULL, &leaf->bstats) < 0)
+		return -1;
+
+	return qdisc_qstats_copy(d, leaf) < 0 ? -1 : 0;
+}
+
+/* Call arg->fn for each class from arg->skip on; stop when it fails. */
+static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned long txq;
+
+	if (arg->stop)
+		return;
+
+	arg->count = arg->skip;
+	for (txq = arg->skip; txq < dev->num_tx_queues; txq++, arg->count++) {
+		if (arg->fn(sch, txq + 1, arg) >= 0)
+			continue;
+		arg->stop = 1;
+		break;
+	}
+}
+
+static struct netdev_queue *taprio_select_queue(struct Qdisc *sch, struct tcmsg *tcm)
+{
+	/* The minor part of the parent handle is the 1-based queue number */
+	return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent));
+}
+
+static const struct Qdisc_class_ops taprio_class_ops = {
+ .graft = taprio_graft,
+ .leaf = taprio_leaf,
+ .find = taprio_find, /* class id is the tx queue index + 1 */
+ .walk = taprio_walk,
+ .dump = taprio_dump_class,
+ .dump_stats = taprio_dump_class_stats, /* stats of the queue's sleeping qdisc */
+ .select_queue = taprio_select_queue,
+};
+
+static struct Qdisc_ops taprio_qdisc_ops __read_mostly = {
+ .cl_ops = &taprio_class_ops,
+ .id = "taprio",
+ .priv_size = sizeof(struct taprio_sched),
+ .init = taprio_init, /* finishes by calling taprio_change() */
+ .change = taprio_change,
+ .destroy = taprio_destroy,
+ .peek = taprio_peek,
+ .dequeue = taprio_dequeue,
+ .enqueue = taprio_enqueue,
+ .dump = taprio_dump,
+ .owner = THIS_MODULE,
+};
+
+static struct notifier_block taprio_device_notifier = {
+ .notifier_call = taprio_dev_notifier, /* acts on NETDEV_UP and NETDEV_CHANGE only */
+};
+
+static int __init taprio_module_init(void)
+{
+ int err = register_netdevice_notifier(&taprio_device_notifier);
+
+ if (err)
+ return err;
+ /* NOTE(review): the notifier registered above is not unregistered if register_qdisc() fails */
+ return register_qdisc(&taprio_qdisc_ops);
+}
+
+static void __exit taprio_module_exit(void)
+{
+ unregister_qdisc(&taprio_qdisc_ops); /* reverse order of taprio_module_init() */
+ unregister_netdevice_notifier(&taprio_device_notifier);
+}
+
+module_init(taprio_module_init);
+module_exit(taprio_module_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 6f74a42..5f72f3f 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -1,15 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_tbf.c Token Bucket Filter queue.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
* original idea by Martin Devera
- *
*/
#include <linux/module.h>
@@ -162,7 +157,7 @@
nb = 0;
while (segs) {
nskb = segs->next;
- segs->next = NULL;
+ skb_mark_not_on_list(segs);
qdisc_skb_cb(segs)->pkt_len = segs->len;
len += segs->len;
ret = qdisc_enqueue(segs, q->qdisc, to_free);
@@ -185,6 +180,7 @@
struct sk_buff **to_free)
{
struct tbf_sched_data *q = qdisc_priv(sch);
+ unsigned int len = qdisc_pkt_len(skb);
int ret;
if (qdisc_pkt_len(skb) > q->max_size) {
@@ -200,7 +196,7 @@
return ret;
}
- qdisc_qstats_backlog_inc(sch, skb);
+ sch->qstats.backlog += len;
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -307,7 +303,8 @@
s64 buffer, mtu;
u64 rate64 = 0, prate64 = 0;
- err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
+ NULL);
if (err < 0)
return err;
@@ -390,9 +387,8 @@
sch_tree_lock(sch);
if (child) {
- qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
- q->qdisc->qstats.backlog);
- qdisc_destroy(q->qdisc);
+ qdisc_tree_flush_backlog(q->qdisc);
+ qdisc_put(q->qdisc);
q->qdisc = child;
}
q->limit = qopt->limit;
@@ -438,7 +434,7 @@
struct tbf_sched_data *q = qdisc_priv(sch);
qdisc_watchdog_cancel(&q->watchdog);
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
}
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -448,7 +444,7 @@
struct tc_tbf_qopt opt;
sch->qstats.backlog = q->qdisc->qstats.backlog;
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 93f04cf..689ef6f 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* net/sched/sch_teql.c "True" (or "trivial") link equalizer.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/