Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 2985509..d762e89 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -6,7 +6,7 @@
menuconfig NET_SCHED
bool "QoS and/or fair queueing"
select NET_SCH_FIFO
- ---help---
+ help
When the kernel has several packets to send out over a network
device, it has to decide which ones to send first, which ones to
delay, and which ones to drop. This is the job of the queueing
@@ -47,7 +47,7 @@
config NET_SCH_CBQ
tristate "Class Based Queueing (CBQ)"
- ---help---
+ help
Say Y here if you want to use the Class-Based Queueing (CBQ) packet
scheduling algorithm. This algorithm classifies the waiting packets
into a tree-like hierarchy of classes; the leaves of this tree are
@@ -64,7 +64,7 @@
config NET_SCH_HTB
tristate "Hierarchical Token Bucket (HTB)"
- ---help---
+ help
Say Y here if you want to use the Hierarchical Token Buckets (HTB)
packet scheduling algorithm. See
<http://luxik.cdi.cz/~devik/qos/htb/> for complete manual and
@@ -78,7 +78,7 @@
config NET_SCH_HFSC
tristate "Hierarchical Fair Service Curve (HFSC)"
- ---help---
+ help
Say Y here if you want to use the Hierarchical Fair Service Curve
(HFSC) packet scheduling algorithm.
@@ -88,7 +88,7 @@
config NET_SCH_ATM
tristate "ATM Virtual Circuits (ATM)"
depends on ATM
- ---help---
+ help
Say Y here if you want to use the ATM pseudo-scheduler. This
provides a framework for invoking classifiers, which in turn
select classes of this queuing discipline. Each class maps
@@ -101,7 +101,7 @@
config NET_SCH_PRIO
tristate "Multi Band Priority Queueing (PRIO)"
- ---help---
+ help
Say Y here if you want to use an n-band priority queue packet
scheduler.
@@ -110,7 +110,7 @@
config NET_SCH_MULTIQ
tristate "Hardware Multiqueue-aware Multi Band Queuing (MULTIQ)"
- ---help---
+ help
Say Y here if you want to use an n-band queue packet scheduler
to support devices that have multiple hardware transmit queues.
@@ -119,7 +119,7 @@
config NET_SCH_RED
tristate "Random Early Detection (RED)"
- ---help---
+ help
Say Y here if you want to use the Random Early Detection (RED)
packet scheduling algorithm.
@@ -130,7 +130,7 @@
config NET_SCH_SFB
tristate "Stochastic Fair Blue (SFB)"
- ---help---
+ help
Say Y here if you want to use the Stochastic Fair Blue (SFB)
packet scheduling algorithm.
@@ -141,7 +141,7 @@
config NET_SCH_SFQ
tristate "Stochastic Fairness Queueing (SFQ)"
- ---help---
+ help
Say Y here if you want to use the Stochastic Fairness Queueing (SFQ)
packet scheduling algorithm.
@@ -152,7 +152,7 @@
config NET_SCH_TEQL
tristate "True Link Equalizer (TEQL)"
- ---help---
+ help
Say Y here if you want to use the True Link Equalizer (TLE) packet
scheduling algorithm. This queueing discipline allows the combination
of several physical devices into one virtual device.
@@ -164,7 +164,7 @@
config NET_SCH_TBF
tristate "Token Bucket Filter (TBF)"
- ---help---
+ help
Say Y here if you want to use the Token Bucket Filter (TBF) packet
scheduling algorithm.
@@ -175,7 +175,7 @@
config NET_SCH_CBS
tristate "Credit Based Shaper (CBS)"
- ---help---
+ help
Say Y here if you want to use the Credit Based Shaper (CBS) packet
scheduling algorithm.
@@ -208,7 +208,7 @@
config NET_SCH_GRED
tristate "Generic Random Early Detection (GRED)"
- ---help---
+ help
Say Y here if you want to use the Generic Random Early Detection
(GRED) packet scheduling algorithm for some of your network devices
(see the top of <file:net/sched/sch_red.c> for details and
@@ -219,7 +219,7 @@
config NET_SCH_DSMARK
tristate "Differentiated Services marker (DSMARK)"
- ---help---
+ help
Say Y if you want to schedule packets according to the
Differentiated Services architecture proposed in RFC 2475.
Technical information on this method, with pointers to associated
@@ -230,7 +230,7 @@
config NET_SCH_NETEM
tristate "Network emulator (NETEM)"
- ---help---
+ help
Say Y if you want to emulate network delay, loss, and packet
re-ordering. This is often useful to simulate networks when
testing applications or protocols.
@@ -366,12 +366,25 @@
If unsure, say N.
+config NET_SCH_FQ_PIE
+ depends on NET_SCH_PIE
+ tristate "Flow Queue Proportional Integral controller Enhanced (FQ-PIE)"
+ help
+ Say Y here if you want to use the Flow Queue Proportional Integral
+ controller Enhanced (FQ-PIE) packet scheduling algorithm.
+ For more information, please see https://tools.ietf.org/html/rfc8033
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_fq_pie.
+
+ If unsure, say N.
+
config NET_SCH_INGRESS
tristate "Ingress/classifier-action Qdisc"
depends on NET_CLS_ACT
select NET_INGRESS
select NET_EGRESS
- ---help---
+ help
Say Y here if you want to use classifiers for incoming and/or outgoing
packets. This qdisc doesn't do anything else besides running classifiers,
which can also have actions attached to them. In case of outgoing packets,
@@ -385,7 +398,7 @@
config NET_SCH_PLUG
tristate "Plug network traffic until release (PLUG)"
- ---help---
+ help
This queuing discipline allows userspace to plug/unplug a network
output queue, using the netlink interface. When it receives an
@@ -409,9 +422,26 @@
To compile this code as a module, choose M here: the
module will be called sch_plug.
+config NET_SCH_ETS
+ tristate "Enhanced transmission selection scheduler (ETS)"
+ help
+ The Enhanced Transmission Selection scheduler is a classful
+ queuing discipline that merges functionality of PRIO and DRR
+ qdiscs in one scheduler. ETS makes it easy to configure a set of
+ strict and bandwidth-sharing bands to implement the transmission
+ selection described in 802.1Qaz.
+
+ Say Y here if you want to use the ETS packet scheduling
+ algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_ets.
+
+ If unsure, say N.
+
menuconfig NET_SCH_DEFAULT
bool "Allow override default queue discipline"
- ---help---
+ help
Support for selection of default queuing discipline.
Nearly all users can safely say no here, and the default
@@ -438,6 +468,9 @@
config DEFAULT_FQ_CODEL
bool "Fair Queue Controlled Delay" if NET_SCH_FQ_CODEL
+ config DEFAULT_FQ_PIE
+ bool "Flow Queue Proportional Integral controller Enhanced" if NET_SCH_FQ_PIE
+
config DEFAULT_SFQ
bool "Stochastic Fair Queue" if NET_SCH_SFQ
@@ -450,6 +483,7 @@
default "pfifo_fast" if DEFAULT_PFIFO_FAST
default "fq" if DEFAULT_FQ
default "fq_codel" if DEFAULT_FQ_CODEL
+ default "fq_pie" if DEFAULT_FQ_PIE
default "sfq" if DEFAULT_SFQ
default "pfifo_fast"
endif
@@ -462,7 +496,7 @@
config NET_CLS_BASIC
tristate "Elementary classification (BASIC)"
select NET_CLS
- ---help---
+ help
Say Y here if you want to be able to classify packets using
only extended matches and actions.
@@ -472,7 +506,7 @@
config NET_CLS_TCINDEX
tristate "Traffic-Control Index (TCINDEX)"
select NET_CLS
- ---help---
+ help
Say Y here if you want to be able to classify packets based on
traffic control indices. You will want this feature if you want
to implement Differentiated Services together with DSMARK.
@@ -485,7 +519,7 @@
depends on INET
select IP_ROUTE_CLASSID
select NET_CLS
- ---help---
+ help
If you say Y here, you will be able to classify packets
according to the route table entry they matched.
@@ -495,7 +529,7 @@
config NET_CLS_FW
tristate "Netfilter mark (FW)"
select NET_CLS
- ---help---
+ help
If you say Y here, you will be able to classify packets
according to netfilter/firewall marks.
@@ -505,7 +539,7 @@
config NET_CLS_U32
tristate "Universal 32bit comparisons w/ hashing (U32)"
select NET_CLS
- ---help---
+ help
Say Y here to be able to classify packets using a universal
32bit pieces based comparison scheme.
@@ -515,20 +549,20 @@
config CLS_U32_PERF
bool "Performance counters support"
depends on NET_CLS_U32
- ---help---
+ help
Say Y here to make u32 gather additional statistics useful for
fine tuning u32 classifiers.
config CLS_U32_MARK
bool "Netfilter marks support"
depends on NET_CLS_U32
- ---help---
+ help
Say Y here to be able to use netfilter marks as u32 key.
config NET_CLS_RSVP
tristate "IPv4 Resource Reservation Protocol (RSVP)"
select NET_CLS
- ---help---
+ help
The Resource Reservation Protocol (RSVP) permits end systems to
request a minimum and maximum data flow rate for a connection; this
is important for real time data such as streaming sound or video.
@@ -542,7 +576,7 @@
config NET_CLS_RSVP6
tristate "IPv6 Resource Reservation Protocol (RSVP6)"
select NET_CLS
- ---help---
+ help
The Resource Reservation Protocol (RSVP) permits end systems to
request a minimum and maximum data flow rate for a connection; this
is important for real time data such as streaming sound or video.
@@ -556,7 +590,7 @@
config NET_CLS_FLOW
tristate "Flow classifier"
select NET_CLS
- ---help---
+ help
If you say Y here, you will be able to classify packets based on
a configurable combination of packet keys. This is mostly useful
in combination with SFQ.
@@ -569,7 +603,7 @@
select NET_CLS
select CGROUP_NET_CLASSID
depends on CGROUPS
- ---help---
+ help
Say Y here if you want to classify packets based on the control
cgroup of their process.
@@ -579,7 +613,7 @@
config NET_CLS_BPF
tristate "BPF-based classifier"
select NET_CLS
- ---help---
+ help
If you say Y here, you will be able to classify packets based on
programmable BPF (JIT'ed) filters as an alternative to ematches.
@@ -589,7 +623,7 @@
config NET_CLS_FLOWER
tristate "Flower classifier"
select NET_CLS
- ---help---
+ help
If you say Y here, you will be able to classify packets based on
a configurable combination of packet keys and masks.
@@ -599,7 +633,7 @@
config NET_CLS_MATCHALL
tristate "Match-all classifier"
select NET_CLS
- ---help---
+ help
If you say Y here, you will be able to classify packets based on
nothing. Every packet will match.
@@ -609,7 +643,7 @@
config NET_EMATCH
bool "Extended Matches"
select NET_CLS
- ---help---
+ help
Say Y here if you want to use extended matches on top of classifiers
and select the extended matches below.
@@ -623,7 +657,7 @@
int "Stack size"
depends on NET_EMATCH
default "32"
- ---help---
+ help
Size of the local stack variable used while evaluating the tree of
ematches. Limits the depth of the tree, i.e. the number of
encapsulated precedences. Every level requires 4 bytes of additional
@@ -632,7 +666,7 @@
config NET_EMATCH_CMP
tristate "Simple packet data comparison"
depends on NET_EMATCH
- ---help---
+ help
Say Y here if you want to be able to classify packets based on
simple packet data comparisons for 8, 16, and 32bit values.
@@ -642,7 +676,7 @@
config NET_EMATCH_NBYTE
tristate "Multi byte comparison"
depends on NET_EMATCH
- ---help---
+ help
Say Y here if you want to be able to classify packets based on
multiple byte comparisons mainly useful for IPv6 address comparisons.
@@ -652,7 +686,7 @@
config NET_EMATCH_U32
tristate "U32 key"
depends on NET_EMATCH
- ---help---
+ help
Say Y here if you want to be able to classify packets using
the famous u32 key in combination with logic relations.
@@ -662,7 +696,7 @@
config NET_EMATCH_META
tristate "Metadata"
depends on NET_EMATCH
- ---help---
+ help
Say Y here if you want to be able to classify packets based on
metadata such as load average, netfilter attributes, socket
attributes and routing decisions.
@@ -677,7 +711,7 @@
select TEXTSEARCH_KMP
select TEXTSEARCH_BM
select TEXTSEARCH_FSM
- ---help---
+ help
Say Y here if you want to be able to classify packets based on
textsearch comparisons.
@@ -687,7 +721,7 @@
config NET_EMATCH_CANID
tristate "CAN Identifier"
depends on NET_EMATCH && (CAN=y || CAN=m)
- ---help---
+ help
Say Y here if you want to be able to classify CAN frames based
on CAN Identifier.
@@ -697,7 +731,7 @@
config NET_EMATCH_IPSET
tristate "IPset"
depends on NET_EMATCH && IP_SET
- ---help---
+ help
Say Y here if you want to be able to classify packets based on
ipset membership.
@@ -707,7 +741,7 @@
config NET_EMATCH_IPT
tristate "IPtables Matches"
depends on NET_EMATCH && NETFILTER && NETFILTER_XTABLES
- ---help---
+ help
Say Y here to be able to classify packets based on iptables
matches.
Current supported match is "policy" which allows packet classification
@@ -719,7 +753,7 @@
config NET_CLS_ACT
bool "Actions"
select NET_CLS
- ---help---
+ help
Say Y here if you want to use traffic control actions. Actions
get attached to classifiers and are invoked after a successful
classification. They are used to overwrite the classification
@@ -731,7 +765,7 @@
config NET_ACT_POLICE
tristate "Traffic Policing"
depends on NET_CLS_ACT
- ---help---
+ help
Say Y here if you want to do traffic policing, i.e. strict
bandwidth limiting. This action replaces the existing policing
module.
@@ -742,7 +776,7 @@
config NET_ACT_GACT
tristate "Generic actions"
depends on NET_CLS_ACT
- ---help---
+ help
Say Y here to take generic actions such as dropping and
accepting packets.
@@ -752,13 +786,13 @@
config GACT_PROB
bool "Probability support"
depends on NET_ACT_GACT
- ---help---
+ help
Say Y here to use the generic action randomly or deterministically.
config NET_ACT_MIRRED
tristate "Redirecting and Mirroring"
depends on NET_CLS_ACT
- ---help---
+ help
Say Y here to allow packets to be mirrored or redirected to
other devices.
@@ -769,7 +803,7 @@
tristate "Traffic Sampling"
depends on NET_CLS_ACT
select PSAMPLE
- ---help---
+ help
Say Y here to allow packet sampling tc action. The packet sample
action consists of statistically choosing packets and sampling
them using the psample module.
@@ -779,8 +813,8 @@
config NET_ACT_IPT
tristate "IPtables targets"
- depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
- ---help---
+ depends on NET_CLS_ACT && NETFILTER && NETFILTER_XTABLES
+ help
Say Y here to be able to invoke iptables targets after successful
classification.
@@ -790,7 +824,7 @@
config NET_ACT_NAT
tristate "Stateless NAT"
depends on NET_CLS_ACT
- ---help---
+ help
Say Y here to do stateless NAT on IPv4 packets. You should use
netfilter for NAT unless you know what you are doing.
@@ -800,7 +834,7 @@
config NET_ACT_PEDIT
tristate "Packet Editing"
depends on NET_CLS_ACT
- ---help---
+ help
Say Y here if you want to mangle the content of packets.
To compile this code as a module, choose M here: the
@@ -809,7 +843,7 @@
config NET_ACT_SIMP
tristate "Simple Example (Debug)"
depends on NET_CLS_ACT
- ---help---
+ help
Say Y here to add a simple action for demonstration purposes.
It is meant as an example and for debugging purposes. It will
print a configured policy string followed by the packet count
@@ -823,7 +857,7 @@
config NET_ACT_SKBEDIT
tristate "SKB Editing"
depends on NET_CLS_ACT
- ---help---
+ help
Say Y here to change skb priority or queue_mapping settings.
If unsure, say N.
@@ -835,7 +869,7 @@
tristate "Checksum Updating"
depends on NET_CLS_ACT && INET
select LIBCRC32C
- ---help---
+ help
Say Y here to update some common checksum after some direct
packet alterations.
@@ -856,7 +890,7 @@
config NET_ACT_VLAN
tristate "Vlan manipulation"
depends on NET_CLS_ACT
- ---help---
+ help
Say Y here to push or pop vlan headers.
If unsure, say N.
@@ -867,7 +901,7 @@
config NET_ACT_BPF
tristate "BPF based action"
depends on NET_CLS_ACT
- ---help---
+ help
Say Y here to execute BPF code on packets. The BPF code will decide
if the packet should be dropped or not.
@@ -878,9 +912,9 @@
config NET_ACT_CONNMARK
tristate "Netfilter Connection Mark Retriever"
- depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ depends on NET_CLS_ACT && NETFILTER
depends on NF_CONNTRACK && NF_CONNTRACK_MARK
- ---help---
+ help
Say Y here to allow retrieving of conn mark
If unsure, say N.
@@ -890,7 +924,7 @@
config NET_ACT_CTINFO
tristate "Netfilter Connection Mark Actions"
- depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ depends on NET_CLS_ACT && NETFILTER
depends on NF_CONNTRACK && NF_CONNTRACK_MARK
help
Say Y here to allow transfer of a connmark stored information.
@@ -908,7 +942,7 @@
config NET_ACT_SKBMOD
tristate "skb data modification action"
depends on NET_CLS_ACT
- ---help---
+ help
Say Y here to allow modification of skb data
If unsure, say N.
@@ -920,7 +954,7 @@
tristate "Inter-FE action based on IETF ForCES InterFE LFB"
depends on NET_CLS_ACT
select NET_IFE
- ---help---
+ help
Say Y here to allow for sourcing and terminating metadata
For details refer to netdev01 paper:
"Distributing Linux Traffic Control Classifier-Action Subsystem"
@@ -932,7 +966,7 @@
config NET_ACT_TUNNEL_KEY
tristate "IP tunnel metadata manipulation"
depends on NET_CLS_ACT
- ---help---
+ help
Say Y here to set/release ip tunnel metadata.
If unsure, say N.
@@ -942,7 +976,7 @@
config NET_ACT_CT
tristate "connection tracking tc action"
- depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT
+ depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE
help
Say Y here to allow sending the packets to conntrack module.
@@ -951,6 +985,18 @@
To compile this code as a module, choose M here: the
module will be called act_ct.
+config NET_ACT_GATE
+ tristate "Frame gate entry list control tc action"
+ depends on NET_CLS_ACT
+ help
+ Say Y here to allow to control the ingress flow to be passed at
+ specific time slot and be dropped at other specific time slot by
+ the gate entry list.
+
+ If unsure, say N.
+ To compile this code as a module, choose M here: the
+ module will be called act_gate.
+
config NET_IFE_SKBMARK
tristate "Support to encoding decoding skb mark on IFE action"
depends on NET_ACT_IFE
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 415d1e1..66bbf9a 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -30,6 +30,7 @@
obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o
obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
obj-$(CONFIG_NET_ACT_CT) += act_ct.o
+obj-$(CONFIG_NET_ACT_GATE) += act_gate.o
obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
@@ -48,6 +49,7 @@
obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
+obj-$(CONFIG_NET_SCH_ETS) += sch_ets.o
obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o
obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
@@ -58,6 +60,7 @@
obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o
obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o
obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o
+obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o
obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o
obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o
obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 75132d0..7b29aa1 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -88,7 +88,7 @@
struct tcf_chain *goto_chain)
{
a->tcfa_action = action;
- rcu_swap_protected(a->goto_chain, goto_chain, 1);
+ goto_chain = rcu_replace_pointer(a->goto_chain, goto_chain, 1);
return goto_chain;
}
EXPORT_SYMBOL(tcf_action_set_ctrlact);
@@ -142,7 +142,7 @@
return 0;
}
-int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
+static int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
{
int ret = 0;
@@ -168,7 +168,18 @@
return ret;
}
-EXPORT_SYMBOL(__tcf_idr_release);
+
+int tcf_idr_release(struct tc_action *a, bool bind)
+{
+ const struct tc_action_ops *ops = a->ops;
+ int ret;
+
+ ret = __tcf_idr_release(a, bind, false);
+ if (ret == ACT_P_DELETED)
+ module_put(ops->owner);
+ return ret;
+}
+EXPORT_SYMBOL(tcf_idr_release);
static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
{
@@ -185,9 +196,13 @@
return nla_total_size(0) /* action number nested */
+ nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */
+ cookie_len /* TCA_ACT_COOKIE */
+ + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_HW_STATS */
+ nla_total_size(0) /* TCA_ACT_STATS nested */
+ + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_FLAGS */
/* TCA_STATS_BASIC */
+ nla_total_size_64bit(sizeof(struct gnet_stats_basic))
+ /* TCA_STATS_PKT64 */
+ + nla_total_size_64bit(sizeof(u64))
/* TCA_STATS_QUEUE */
+ nla_total_size_64bit(sizeof(struct gnet_stats_queue))
+ nla_total_size(0) /* TCA_OPTIONS nested */
@@ -404,7 +419,7 @@
int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
struct tc_action **a, const struct tc_action_ops *ops,
- int bind, bool cpustats)
+ int bind, bool cpustats, u32 flags)
{
struct tc_action *p = kzalloc(ops->size, GFP_KERNEL);
struct tcf_idrinfo *idrinfo = tn->idrinfo;
@@ -432,6 +447,7 @@
p->tcfa_tm.install = jiffies;
p->tcfa_tm.lastuse = jiffies;
p->tcfa_tm.firstuse = 0;
+ p->tcfa_flags = flags;
if (est) {
err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats,
&p->tcfa_rate_est,
@@ -441,6 +457,7 @@
}
p->idrinfo = idrinfo;
+ __module_get(ops->owner);
p->ops = ops;
*a = p;
return 0;
@@ -456,6 +473,17 @@
}
EXPORT_SYMBOL(tcf_idr_create);
+int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index,
+ struct nlattr *est, struct tc_action **a,
+ const struct tc_action_ops *ops, int bind,
+ u32 flags)
+{
+ /* Set cpustats according to actions flags. */
+ return tcf_idr_create(tn, index, est, a, ops, bind,
+ !(flags & TCA_ACT_FLAGS_NO_PERCPU_STATS), flags);
+}
+EXPORT_SYMBOL(tcf_idr_create_from_flags);
+
/* Cleanup idr index that was allocated but not initialized. */
void tcf_idr_cleanup(struct tc_action_net *tn, u32 index)
@@ -652,15 +680,24 @@
restart_act_graph:
for (i = 0; i < nr_actions; i++) {
const struct tc_action *a = actions[i];
+ int repeat_ttl;
if (jmp_prgcnt > 0) {
jmp_prgcnt -= 1;
continue;
}
+
+ repeat_ttl = 32;
repeat:
ret = a->ops->act(skb, a, res);
- if (ret == TC_ACT_REPEAT)
- goto repeat; /* we need a ttl - JHS */
+
+ if (unlikely(ret == TC_ACT_REPEAT)) {
+ if (--repeat_ttl != 0)
+ goto repeat;
+ /* suspicious opcode, stop pipeline */
+ net_warn_ratelimited("TC_ACT_REPEAT abuse ?\n");
+ return TC_ACT_OK;
+ }
if (TC_ACT_EXT_CMP(ret, TC_ACT_JUMP)) {
jmp_prgcnt = ret & TCA_ACT_MAX_PRIO_MASK;
@@ -737,12 +774,10 @@
return a->ops->dump(skb, a, bind, ref);
}
-int
-tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+static int
+tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a)
{
- int err = -EINVAL;
unsigned char *b = skb_tail_pointer(skb);
- struct nlattr *nest;
struct tc_cookie *cookie;
if (nla_put_string(skb, TCA_KIND, a->ops->kind))
@@ -760,6 +795,38 @@
}
rcu_read_unlock();
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+int
+tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+{
+ int err = -EINVAL;
+ unsigned char *b = skb_tail_pointer(skb);
+ struct nlattr *nest;
+
+ if (tcf_action_dump_terse(skb, a))
+ goto nla_put_failure;
+
+ if (a->hw_stats != TCA_ACT_HW_STATS_ANY &&
+ nla_put_bitfield32(skb, TCA_ACT_HW_STATS,
+ a->hw_stats, TCA_ACT_HW_STATS_ANY))
+ goto nla_put_failure;
+
+ if (a->used_hw_stats_valid &&
+ nla_put_bitfield32(skb, TCA_ACT_USED_HW_STATS,
+ a->used_hw_stats, TCA_ACT_HW_STATS_ANY))
+ goto nla_put_failure;
+
+ if (a->tcfa_flags &&
+ nla_put_bitfield32(skb, TCA_ACT_FLAGS,
+ a->tcfa_flags, a->tcfa_flags))
+ goto nla_put_failure;
+
nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -776,7 +843,7 @@
EXPORT_SYMBOL(tcf_action_dump_1);
int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[],
- int bind, int ref)
+ int bind, int ref, bool terse)
{
struct tc_action *a;
int err = -EINVAL, i;
@@ -787,7 +854,8 @@
nest = nla_nest_start_noflag(skb, i + 1);
if (nest == NULL)
goto nla_put_failure;
- err = tcf_action_dump_1(skb, a, bind, ref);
+ err = terse ? tcf_action_dump_terse(skb, a) :
+ tcf_action_dump_1(skb, a, bind, ref);
if (err < 0)
goto errout;
nla_nest_end(skb, nest);
@@ -818,12 +886,28 @@
return c;
}
+static u8 tcf_action_hw_stats_get(struct nlattr *hw_stats_attr)
+{
+ struct nla_bitfield32 hw_stats_bf;
+
+ /* If the user did not pass the attr, that means he does
+ * not care about the type. Return "any" in that case
+ * which is setting on all supported types.
+ */
+ if (!hw_stats_attr)
+ return TCA_ACT_HW_STATS_ANY;
+ hw_stats_bf = nla_get_bitfield32(hw_stats_attr);
+ return hw_stats_bf.value;
+}
+
static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = {
[TCA_ACT_KIND] = { .type = NLA_STRING },
[TCA_ACT_INDEX] = { .type = NLA_U32 },
[TCA_ACT_COOKIE] = { .type = NLA_BINARY,
.len = TC_COOKIE_MAX_SIZE },
[TCA_ACT_OPTIONS] = { .type = NLA_NESTED },
+ [TCA_ACT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_ACT_FLAGS_NO_PERCPU_STATS),
+ [TCA_ACT_HW_STATS] = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY),
};
void tcf_idr_insert_many(struct tc_action *actions[])
@@ -846,17 +930,13 @@
}
}
-struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
- struct nlattr *nla, struct nlattr *est,
- char *name, int ovr, int bind,
- bool rtnl_held,
- struct netlink_ext_ack *extack)
+struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla,
+ bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
- struct tc_action *a;
- struct tc_action_ops *a_o;
- struct tc_cookie *cookie = NULL;
- char act_name[IFNAMSIZ];
struct nlattr *tb[TCA_ACT_MAX + 1];
+ struct tc_action_ops *a_o;
+ char act_name[IFNAMSIZ];
struct nlattr *kind;
int err;
@@ -864,30 +944,21 @@
err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
tcf_action_policy, extack);
if (err < 0)
- goto err_out;
+ return ERR_PTR(err);
err = -EINVAL;
kind = tb[TCA_ACT_KIND];
if (!kind) {
NL_SET_ERR_MSG(extack, "TC action kind must be specified");
- goto err_out;
+ return ERR_PTR(err);
}
if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) {
NL_SET_ERR_MSG(extack, "TC action name too long");
- goto err_out;
- }
- if (tb[TCA_ACT_COOKIE]) {
- cookie = nla_memdup_cookie(tb);
- if (!cookie) {
- NL_SET_ERR_MSG(extack, "No memory to generate TC cookie");
- err = -ENOMEM;
- goto err_out;
- }
+ return ERR_PTR(err);
}
} else {
if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) {
NL_SET_ERR_MSG(extack, "TC action name too long");
- err = -EINVAL;
- goto err_out;
+ return ERR_PTR(-EINVAL);
}
}
@@ -909,42 +980,67 @@
* indicate this using -EAGAIN.
*/
if (a_o != NULL) {
- err = -EAGAIN;
- goto err_mod;
+ module_put(a_o->owner);
+ return ERR_PTR(-EAGAIN);
}
#endif
NL_SET_ERR_MSG(extack, "Failed to load TC action module");
- err = -ENOENT;
- goto err_out;
+ return ERR_PTR(-ENOENT);
}
+ return a_o;
+}
+
+struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
+ struct nlattr *nla, struct nlattr *est,
+ char *name, int ovr, int bind,
+ struct tc_action_ops *a_o, int *init_res,
+ bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct nla_bitfield32 flags = { 0, 0 };
+ u8 hw_stats = TCA_ACT_HW_STATS_ANY;
+ struct nlattr *tb[TCA_ACT_MAX + 1];
+ struct tc_cookie *cookie = NULL;
+ struct tc_action *a;
+ int err;
+
/* backward compatibility for policer */
- if (name == NULL)
+ if (name == NULL) {
+ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
+ tcf_action_policy, extack);
+ if (err < 0)
+ return ERR_PTR(err);
+ if (tb[TCA_ACT_COOKIE]) {
+ cookie = nla_memdup_cookie(tb);
+ if (!cookie) {
+ NL_SET_ERR_MSG(extack, "No memory to generate TC cookie");
+ err = -ENOMEM;
+ goto err_out;
+ }
+ }
+ hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]);
+ if (tb[TCA_ACT_FLAGS])
+ flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]);
+
err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind,
- rtnl_held, tp, extack);
- else
+ rtnl_held, tp, flags.value, extack);
+ } else {
err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held,
- tp, extack);
+ tp, flags.value, extack);
+ }
if (err < 0)
- goto err_mod;
+ goto err_out;
+ *init_res = err;
if (!name && tb[TCA_ACT_COOKIE])
tcf_set_action_cookie(&a->act_cookie, cookie);
- /* module count goes up only when brand new policy is created
- * if it exists and is only bound to in a_o->init() then
- * ACT_P_CREATED is not returned (a zero is).
- */
- if (err != ACT_P_CREATED)
- module_put(a_o->owner);
-
- if (!bind && ovr && err == ACT_P_CREATED)
- refcount_set(&a->tcfa_refcnt, 2);
+ if (!name)
+ a->hw_stats = hw_stats;
return a;
-err_mod:
- module_put(a_o->owner);
err_out:
if (cookie) {
kfree(cookie->data);
@@ -957,9 +1053,10 @@
int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
struct nlattr *est, char *name, int ovr, int bind,
- struct tc_action *actions[], size_t *attr_size,
+ struct tc_action *actions[], int init_res[], size_t *attr_size,
bool rtnl_held, struct netlink_ext_ack *extack)
{
+ struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {};
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
struct tc_action *act;
size_t sz = 0;
@@ -972,13 +1069,24 @@
return err;
for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
+ struct tc_action_ops *a_o;
+
+ a_o = tc_action_load_ops(name, tb[i], rtnl_held, extack);
+ if (IS_ERR(a_o)) {
+ err = PTR_ERR(a_o);
+ goto err_mod;
+ }
+ ops[i - 1] = a_o;
+ }
+
+ for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind,
- rtnl_held, extack);
+ ops[i - 1], &init_res[i - 1], rtnl_held,
+ extack);
if (IS_ERR(act)) {
err = PTR_ERR(act);
goto err;
}
- act->order = i;
sz += tcf_action_fill_size(act);
/* Start from index 0 */
actions[i - 1] = act;
@@ -990,13 +1098,40 @@
tcf_idr_insert_many(actions);
*attr_size = tcf_action_full_attrs_size(sz);
- return i - 1;
+ err = i - 1;
+ goto err_mod;
err:
tcf_action_destroy(actions, bind);
+err_mod:
+ for (i = 0; i < TCA_ACT_MAX_PRIO; i++) {
+ if (ops[i])
+ module_put(ops[i]->owner);
+ }
return err;
}
+void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets,
+ u64 drops, bool hw)
+{
+ if (a->cpu_bstats) {
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+
+ this_cpu_ptr(a->cpu_qstats)->drops += drops;
+
+ if (hw)
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
+ bytes, packets);
+ return;
+ }
+
+ _bstats_update(&a->tcfa_bstats, bytes, packets);
+ a->tcfa_qstats.drops += drops;
+ if (hw)
+ _bstats_update(&a->tcfa_bstats_hw, bytes, packets);
+}
+EXPORT_SYMBOL(tcf_action_update_stats);
+
int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
int compat_mode)
{
@@ -1064,7 +1199,7 @@
if (!nest)
goto out_nlmsg_trim;
- if (tcf_action_dump(skb, actions, bind, ref) < 0)
+ if (tcf_action_dump(skb, actions, bind, ref, false) < 0)
goto out_nlmsg_trim;
nla_nest_end(skb, nest);
@@ -1361,12 +1496,13 @@
struct netlink_ext_ack *extack)
{
size_t attr_size = 0;
- int loop, ret;
+ int loop, ret, i;
struct tc_action *actions[TCA_ACT_MAX_PRIO] = {};
+ int init_res[TCA_ACT_MAX_PRIO] = {};
for (loop = 0; loop < 10; loop++) {
ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0,
- actions, &attr_size, true, extack);
+ actions, init_res, &attr_size, true, extack);
if (ret != -EAGAIN)
break;
}
@@ -1374,16 +1510,18 @@
if (ret < 0)
return ret;
ret = tcf_add_notify(net, n, actions, portid, attr_size, extack);
- if (ovr)
- tcf_action_put_many(actions);
+
+ /* only put existing actions */
+ for (i = 0; i < TCA_ACT_MAX_PRIO; i++)
+ if (init_res[i] == ACT_P_CREATED)
+ actions[i] = NULL;
+ tcf_action_put_many(actions);
return ret;
}
-static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON;
static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = {
- [TCA_ROOT_FLAGS] = { .type = NLA_BITFIELD32,
- .validation_data = &tcaa_root_flags_allowed },
+ [TCA_ROOT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_FLAG_LARGE_DUMP_ON),
[TCA_ROOT_TIME_DELTA] = { .type = NLA_U32 },
};
@@ -1392,7 +1530,7 @@
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_ROOT_MAX + 1];
- u32 portid = skb ? NETLINK_CB(skb).portid : 0;
+ u32 portid = NETLINK_CB(skb).portid;
int ret = 0, ovr = 0;
if ((n->nlmsg_type != RTM_GETACTION) &&
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index b7d83d0..a4c7ba3 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -12,6 +12,7 @@
#include <linux/bpf.h>
#include <net/netlink.h>
+#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
@@ -53,6 +54,8 @@
bpf_compute_data_pointers(skb);
filter_res = BPF_PROG_RUN(filter, skb);
}
+ if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK)
+ skb_orphan(skb);
rcu_read_unlock();
/* A BPF program may overwrite the default action opcode.
@@ -275,7 +278,8 @@
static int tcf_bpf_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **act,
int replace, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, bpf_net_id);
struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
@@ -303,7 +307,7 @@
ret = tcf_idr_check_alloc(tn, &index, act, bind);
if (!ret) {
ret = tcf_idr_create(tn, index, est, act,
- &act_bpf_ops, bind, true);
+ &act_bpf_ops, bind, true, 0);
if (ret < 0) {
tcf_idr_cleanup(tn, index);
return ret;
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index b00105e..e19885d 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -97,7 +97,7 @@
static int tcf_connmark_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, connmark_net_id);
@@ -124,7 +124,7 @@
ret = tcf_idr_check_alloc(tn, &index, a, bind);
if (!ret) {
ret = tcf_idr_create(tn, index, est, a,
- &act_connmark_ops, bind, false);
+ &act_connmark_ops, bind, false, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index fa1b1fd..4fa4fcb 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -43,7 +43,7 @@
static int tcf_csum_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
int bind, bool rtnl_held, struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, csum_net_id);
struct tcf_csum_params *params_new;
@@ -68,8 +68,8 @@
index = parm->index;
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- ret = tcf_idr_create(tn, index, est, a,
- &act_csum_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_csum_ops, bind, flags);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -101,8 +101,8 @@
spin_lock_bh(&p->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(p->params, params_new,
- lockdep_is_held(&p->tcf_lock));
+ params_new = rcu_replace_pointer(p->params, params_new,
+ lockdep_is_held(&p->tcf_lock));
spin_unlock_bh(&p->tcf_lock);
if (goto_ch)
@@ -577,7 +577,7 @@
params = rcu_dereference_bh(p->params);
tcf_lastuse_update(&p->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb);
+ tcf_action_update_bstats(&p->common, skb);
action = READ_ONCE(p->tcf_action);
if (unlikely(action == TC_ACT_SHOT))
@@ -595,7 +595,8 @@
if (!tcf_csum_ipv6(skb, update_flags))
goto drop;
break;
- case cpu_to_be16(ETH_P_8021AD): /* fall through */
+ case cpu_to_be16(ETH_P_8021AD):
+ fallthrough;
case cpu_to_be16(ETH_P_8021Q):
if (skb_vlan_tag_present(skb) && !orig_vlan_tag_present) {
protocol = skb->protocol;
@@ -621,7 +622,7 @@
return action;
drop:
- qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats));
+ tcf_action_inc_drop_qstats(&p->common);
action = TC_ACT_SHOT;
goto out;
}
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 02d4491..825b3e9 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -15,6 +15,7 @@
#include <linux/pkt_cls.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
+#include <linux/rhashtable.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
@@ -24,13 +25,542 @@
#include <uapi/linux/tc_act/tc_ct.h>
#include <net/tc_act/tc_ct.h>
+#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <uapi/linux/netfilter/nf_nat.h>
+static struct workqueue_struct *act_ct_wq;
+static struct rhashtable zones_ht;
+static DEFINE_MUTEX(zones_mutex);
+
+struct tcf_ct_flow_table {
+ struct rhash_head node; /* In zones tables */
+
+ struct rcu_work rwork;
+ struct nf_flowtable nf_ft;
+ refcount_t ref;
+ u16 zone;
+
+ bool dying;
+};
+
+static const struct rhashtable_params zones_params = {
+ .head_offset = offsetof(struct tcf_ct_flow_table, node),
+ .key_offset = offsetof(struct tcf_ct_flow_table, zone),
+ .key_len = sizeof_field(struct tcf_ct_flow_table, zone),
+ .automatic_shrinking = true,
+};
+
+static struct flow_action_entry *
+tcf_ct_flow_table_flow_action_get_next(struct flow_action *flow_action)
+{
+ int i = flow_action->num_entries++;
+
+ return &flow_action->entries[i];
+}
+
+static void tcf_ct_add_mangle_action(struct flow_action *action,
+ enum flow_action_mangle_base htype,
+ u32 offset,
+ u32 mask,
+ u32 val)
+{
+ struct flow_action_entry *entry;
+
+ entry = tcf_ct_flow_table_flow_action_get_next(action);
+ entry->id = FLOW_ACTION_MANGLE;
+ entry->mangle.htype = htype;
+ entry->mangle.mask = ~mask;
+ entry->mangle.offset = offset;
+ entry->mangle.val = val;
+}
+
+/* The following nat helper functions check if the inverted reverse tuple
+ * (target) is different then the current dir tuple - meaning nat for ports
+ * and/or ip is needed, and add the relevant mangle actions.
+ */
+static void
+tcf_ct_flow_table_add_action_nat_ipv4(const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple target,
+ struct flow_action *action)
+{
+ if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3)))
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4,
+ offsetof(struct iphdr, saddr),
+ 0xFFFFFFFF,
+ be32_to_cpu(target.src.u3.ip));
+ if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3)))
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4,
+ offsetof(struct iphdr, daddr),
+ 0xFFFFFFFF,
+ be32_to_cpu(target.dst.u3.ip));
+}
+
+static void
+tcf_ct_add_ipv6_addr_mangle_action(struct flow_action *action,
+ union nf_inet_addr *addr,
+ u32 offset)
+{
+ int i;
+
+ for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++)
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP6,
+ i * sizeof(u32) + offset,
+ 0xFFFFFFFF, be32_to_cpu(addr->ip6[i]));
+}
+
+static void
+tcf_ct_flow_table_add_action_nat_ipv6(const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple target,
+ struct flow_action *action)
+{
+ if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3)))
+ tcf_ct_add_ipv6_addr_mangle_action(action, &target.src.u3,
+ offsetof(struct ipv6hdr,
+ saddr));
+ if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3)))
+ tcf_ct_add_ipv6_addr_mangle_action(action, &target.dst.u3,
+ offsetof(struct ipv6hdr,
+ daddr));
+}
+
+static void
+tcf_ct_flow_table_add_action_nat_tcp(const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple target,
+ struct flow_action *action)
+{
+ __be16 target_src = target.src.u.tcp.port;
+ __be16 target_dst = target.dst.u.tcp.port;
+
+ if (target_src != tuple->src.u.tcp.port)
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
+ offsetof(struct tcphdr, source),
+ 0xFFFF, be16_to_cpu(target_src));
+ if (target_dst != tuple->dst.u.tcp.port)
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
+ offsetof(struct tcphdr, dest),
+ 0xFFFF, be16_to_cpu(target_dst));
+}
+
+static void
+tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple target,
+ struct flow_action *action)
+{
+ __be16 target_src = target.src.u.udp.port;
+ __be16 target_dst = target.dst.u.udp.port;
+
+ if (target_src != tuple->src.u.udp.port)
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP,
+ offsetof(struct udphdr, source),
+ 0xFFFF, be16_to_cpu(target_src));
+ if (target_dst != tuple->dst.u.udp.port)
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP,
+ offsetof(struct udphdr, dest),
+ 0xFFFF, be16_to_cpu(target_dst));
+}
+
+static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ struct flow_action *action)
+{
+ struct nf_conn_labels *ct_labels;
+ struct flow_action_entry *entry;
+ enum ip_conntrack_info ctinfo;
+ u32 *act_ct_labels;
+
+ entry = tcf_ct_flow_table_flow_action_get_next(action);
+ entry->id = FLOW_ACTION_CT_METADATA;
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+ entry->ct_metadata.mark = ct->mark;
+#endif
+ ctinfo = dir == IP_CT_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
+ IP_CT_ESTABLISHED_REPLY;
+ /* aligns with the CT reference on the SKB nf_ct_set */
+ entry->ct_metadata.cookie = (unsigned long)ct | ctinfo;
+
+ act_ct_labels = entry->ct_metadata.labels;
+ ct_labels = nf_ct_labels_find(ct);
+ if (ct_labels)
+ memcpy(act_ct_labels, ct_labels->bits, NF_CT_LABELS_MAX_SIZE);
+ else
+ memset(act_ct_labels, 0, NF_CT_LABELS_MAX_SIZE);
+}
+
+static int tcf_ct_flow_table_add_action_nat(struct net *net,
+ struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ struct flow_action *action)
+{
+ const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
+ struct nf_conntrack_tuple target;
+
+ if (!(ct->status & IPS_NAT_MASK))
+ return 0;
+
+ nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple);
+
+ switch (tuple->src.l3num) {
+ case NFPROTO_IPV4:
+ tcf_ct_flow_table_add_action_nat_ipv4(tuple, target,
+ action);
+ break;
+ case NFPROTO_IPV6:
+ tcf_ct_flow_table_add_action_nat_ipv6(tuple, target,
+ action);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ switch (nf_ct_protonum(ct)) {
+ case IPPROTO_TCP:
+ tcf_ct_flow_table_add_action_nat_tcp(tuple, target, action);
+ break;
+ case IPPROTO_UDP:
+ tcf_ct_flow_table_add_action_nat_udp(tuple, target, action);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int tcf_ct_flow_table_fill_actions(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir tdir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action *action = &flow_rule->rule->action;
+ int num_entries = action->num_entries;
+ struct nf_conn *ct = flow->ct;
+ enum ip_conntrack_dir dir;
+ int i, err;
+
+ switch (tdir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ dir = IP_CT_DIR_ORIGINAL;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ dir = IP_CT_DIR_REPLY;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ err = tcf_ct_flow_table_add_action_nat(net, ct, dir, action);
+ if (err)
+ goto err_nat;
+
+ tcf_ct_flow_table_add_action_meta(ct, dir, action);
+ return 0;
+
+err_nat:
+ /* Clear filled actions */
+ for (i = num_entries; i < action->num_entries; i++)
+ memset(&action->entries[i], 0, sizeof(action->entries[i]));
+ action->num_entries = num_entries;
+
+ return err;
+}
+
+static struct nf_flowtable_type flowtable_ct = {
+ .action = tcf_ct_flow_table_fill_actions,
+ .owner = THIS_MODULE,
+};
+
+static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
+{
+ struct tcf_ct_flow_table *ct_ft;
+ int err = -ENOMEM;
+
+ mutex_lock(&zones_mutex);
+ ct_ft = rhashtable_lookup_fast(&zones_ht, ¶ms->zone, zones_params);
+ if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
+ goto out_unlock;
+
+ ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
+ if (!ct_ft)
+ goto err_alloc;
+ refcount_set(&ct_ft->ref, 1);
+
+ ct_ft->zone = params->zone;
+ err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
+ if (err)
+ goto err_insert;
+
+ ct_ft->nf_ft.type = &flowtable_ct;
+ ct_ft->nf_ft.flags |= NF_FLOWTABLE_HW_OFFLOAD;
+ err = nf_flow_table_init(&ct_ft->nf_ft);
+ if (err)
+ goto err_init;
+
+ __module_get(THIS_MODULE);
+out_unlock:
+ params->ct_ft = ct_ft;
+ params->nf_ft = &ct_ft->nf_ft;
+ mutex_unlock(&zones_mutex);
+
+ return 0;
+
+err_init:
+ rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
+err_insert:
+ kfree(ct_ft);
+err_alloc:
+ mutex_unlock(&zones_mutex);
+ return err;
+}
+
+static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
+{
+ struct flow_block_cb *block_cb, *tmp_cb;
+ struct tcf_ct_flow_table *ct_ft;
+ struct flow_block *block;
+
+ ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
+ rwork);
+ nf_flow_table_free(&ct_ft->nf_ft);
+
+ /* Remove any remaining callbacks before cleanup */
+ block = &ct_ft->nf_ft.flow_block;
+ down_write(&ct_ft->nf_ft.flow_block_lock);
+ list_for_each_entry_safe(block_cb, tmp_cb, &block->cb_list, list) {
+ list_del(&block_cb->list);
+ flow_block_cb_free(block_cb);
+ }
+ up_write(&ct_ft->nf_ft.flow_block_lock);
+ kfree(ct_ft);
+
+ module_put(THIS_MODULE);
+}
+
+static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
+{
+ struct tcf_ct_flow_table *ct_ft = params->ct_ft;
+
+ if (refcount_dec_and_test(¶ms->ct_ft->ref)) {
+ rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
+ INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
+ queue_rcu_work(act_ct_wq, &ct_ft->rwork);
+ }
+}
+
+static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
+ struct nf_conn *ct,
+ bool tcp)
+{
+ struct flow_offload *entry;
+ int err;
+
+ if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
+ return;
+
+ entry = flow_offload_alloc(ct);
+ if (!entry) {
+ WARN_ON_ONCE(1);
+ goto err_alloc;
+ }
+
+ if (tcp) {
+ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ }
+
+ err = flow_offload_add(&ct_ft->nf_ft, entry);
+ if (err)
+ goto err_add;
+
+ return;
+
+err_add:
+ flow_offload_free(entry);
+err_alloc:
+ clear_bit(IPS_OFFLOAD_BIT, &ct->status);
+}
+
+static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ bool tcp = false;
+
+ if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
+ return;
+
+ switch (nf_ct_protonum(ct)) {
+ case IPPROTO_TCP:
+ tcp = true;
+ if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
+ return;
+ break;
+ case IPPROTO_UDP:
+ break;
+ default:
+ return;
+ }
+
+ if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
+ ct->status & IPS_SEQ_ADJUST)
+ return;
+
+ tcf_ct_flow_table_add(ct_ft, ct, tcp);
+}
+
+static bool
+tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb,
+ struct flow_offload_tuple *tuple,
+ struct tcphdr **tcph)
+{
+ struct flow_ports *ports;
+ unsigned int thoff;
+ struct iphdr *iph;
+
+ if (!pskb_network_may_pull(skb, sizeof(*iph)))
+ return false;
+
+ iph = ip_hdr(skb);
+ thoff = iph->ihl * 4;
+
+ if (ip_is_fragment(iph) ||
+ unlikely(thoff != sizeof(struct iphdr)))
+ return false;
+
+ if (iph->protocol != IPPROTO_TCP &&
+ iph->protocol != IPPROTO_UDP)
+ return false;
+
+ if (iph->ttl <= 1)
+ return false;
+
+ if (!pskb_network_may_pull(skb, iph->protocol == IPPROTO_TCP ?
+ thoff + sizeof(struct tcphdr) :
+ thoff + sizeof(*ports)))
+ return false;
+
+ iph = ip_hdr(skb);
+ if (iph->protocol == IPPROTO_TCP)
+ *tcph = (void *)(skb_network_header(skb) + thoff);
+
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ tuple->src_v4.s_addr = iph->saddr;
+ tuple->dst_v4.s_addr = iph->daddr;
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ tuple->l3proto = AF_INET;
+ tuple->l4proto = iph->protocol;
+
+ return true;
+}
+
+static bool
+tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb,
+ struct flow_offload_tuple *tuple,
+ struct tcphdr **tcph)
+{
+ struct flow_ports *ports;
+ struct ipv6hdr *ip6h;
+ unsigned int thoff;
+
+ if (!pskb_network_may_pull(skb, sizeof(*ip6h)))
+ return false;
+
+ ip6h = ipv6_hdr(skb);
+
+ if (ip6h->nexthdr != IPPROTO_TCP &&
+ ip6h->nexthdr != IPPROTO_UDP)
+ return false;
+
+ if (ip6h->hop_limit <= 1)
+ return false;
+
+ thoff = sizeof(*ip6h);
+ if (!pskb_network_may_pull(skb, ip6h->nexthdr == IPPROTO_TCP ?
+ thoff + sizeof(struct tcphdr) :
+ thoff + sizeof(*ports)))
+ return false;
+
+ ip6h = ipv6_hdr(skb);
+ if (ip6h->nexthdr == IPPROTO_TCP)
+ *tcph = (void *)(skb_network_header(skb) + thoff);
+
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ tuple->src_v6 = ip6h->saddr;
+ tuple->dst_v6 = ip6h->daddr;
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ tuple->l3proto = AF_INET6;
+ tuple->l4proto = ip6h->nexthdr;
+
+ return true;
+}
+
+static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
+ struct sk_buff *skb,
+ u8 family)
+{
+ struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft;
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct flow_offload_tuple tuple = {};
+ enum ip_conntrack_info ctinfo;
+ struct tcphdr *tcph = NULL;
+ struct flow_offload *flow;
+ struct nf_conn *ct;
+ u8 dir;
+
+ switch (family) {
+ case NFPROTO_IPV4:
+ if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple, &tcph))
+ return false;
+ break;
+ case NFPROTO_IPV6:
+ if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple, &tcph))
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ tuplehash = flow_offload_lookup(nf_ft, &tuple);
+ if (!tuplehash)
+ return false;
+
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+ ct = flow->ct;
+
+ if (tcph && (unlikely(tcph->fin || tcph->rst))) {
+ flow_offload_teardown(flow);
+ return false;
+ }
+
+ ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
+ IP_CT_ESTABLISHED_REPLY;
+
+ flow_offload_refresh(nf_ft, flow);
+ nf_conntrack_get(&ct->ct_general);
+ nf_ct_set(skb, ct, ctinfo);
+ nf_ct_acct_update(ct, dir, skb->len);
+
+ return true;
+}
+
+static int tcf_ct_flow_tables_init(void)
+{
+ return rhashtable_init(&zones_ht, &zones_params);
+}
+
+static void tcf_ct_flow_tables_uninit(void)
+{
+ rhashtable_destroy(&zones_ht);
+}
+
static struct tc_action_ops act_ct_ops;
static unsigned int ct_net_id;
@@ -149,9 +679,10 @@
}
static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
- u8 family, u16 zone)
+ u8 family, u16 zone, bool *defrag)
{
enum ip_conntrack_info ctinfo;
+ struct qdisc_skb_cb cb;
struct nf_conn *ct;
int err = 0;
bool frag;
@@ -169,6 +700,7 @@
return err;
skb_get(skb);
+ cb = *qdisc_skb_cb(skb);
if (family == NFPROTO_IPV4) {
enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
@@ -178,7 +710,12 @@
err = ip_defrag(net, skb, user);
local_bh_enable();
if (err && err != -EINPROGRESS)
- goto out_free;
+ return err;
+
+ if (!err) {
+ *defrag = true;
+ cb.mru = IPCB(skb)->frag_max_size;
+ }
} else { /* NFPROTO_IPV6 */
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
@@ -186,13 +723,19 @@
memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
err = nf_ct_frag6_gather(net, skb, user);
if (err && err != -EINPROGRESS)
- return err;
+ goto out_free;
+
+ if (!err) {
+ *defrag = true;
+ cb.mru = IP6CB(skb)->frag_max_size;
+ }
#else
err = -EOPNOTSUPP;
goto out_free;
#endif
}
+ *qdisc_skb_cb(skb) = cb;
skb_clear_hash(skb);
skb->ignore_df = 1;
return err;
@@ -207,6 +750,8 @@
struct tcf_ct_params *params = container_of(head,
struct tcf_ct_params, rcu);
+ tcf_ct_flow_table_put(params);
+
if (params->tmpl)
nf_conntrack_put(¶ms->tmpl->ct_general);
kfree(params);
@@ -257,7 +802,7 @@
}
}
/* Non-ICMP, fall thru to initialize if needed. */
- /* fall through */
+ fallthrough;
case IP_CT_NEW:
/* Seen it before? This can happen for loopback, retrans,
* or local packets.
@@ -312,7 +857,7 @@
u32 *labels_m)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)
- size_t labels_sz = FIELD_SIZEOF(struct tcf_ct_params, labels);
+ size_t labels_sz = sizeof_field(struct tcf_ct_params, labels);
if (!memchr_inv(labels_m, 0, labels_sz))
return;
@@ -392,6 +937,8 @@
struct nf_hook_state state;
int nh_ofs, err, retval;
struct tcf_ct_params *p;
+ bool skip_add = false;
+ bool defrag = false;
struct nf_conn *ct;
u8 family;
@@ -403,6 +950,8 @@
force = p->ct_action & TCA_CT_ACT_FORCE;
tmpl = p->tmpl;
+ tcf_lastuse_update(&c->tcf_tm);
+
if (clear) {
ct = nf_ct_get(skb, &ctinfo);
if (ct) {
@@ -422,7 +971,7 @@
*/
nh_ofs = skb_network_offset(skb);
skb_pull_rcsum(skb, nh_ofs);
- err = tcf_ct_handle_fragments(net, skb, family, p->zone);
+ err = tcf_ct_handle_fragments(net, skb, family, p->zone, &defrag);
if (err == -EINPROGRESS) {
retval = TC_ACT_STOLEN;
goto out;
@@ -441,6 +990,11 @@
*/
cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force);
if (!cached) {
+ if (tcf_ct_flow_table_lookup(p, skb, family)) {
+ skip_add = true;
+ goto do_nat;
+ }
+
/* Associate skb with specified zone. */
if (tmpl) {
ct = nf_ct_get(skb, &ctinfo);
@@ -458,6 +1012,7 @@
goto out_push;
}
+do_nat:
ct = nf_ct_get(skb, &ctinfo);
if (!ct)
goto out_push;
@@ -478,22 +1033,26 @@
goto drop;
}
+ if (!skip_add)
+ tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
+
out_push:
skb_push_rcsum(skb, nh_ofs);
out:
- bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb);
+ tcf_action_update_bstats(&c->common, skb);
+ if (defrag)
+ qdisc_skb_cb(skb)->pkt_len = skb->len;
return retval;
drop:
- qstats_drop_inc(this_cpu_ptr(a->cpu_qstats));
+ tcf_action_inc_drop_qstats(&c->common);
return TC_ACT_SHOT;
}
static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
- [TCA_CT_UNSPEC] = { .strict_start_type = TCA_CT_UNSPEC + 1 },
[TCA_CT_ACTION] = { .type = NLA_U16 },
- [TCA_CT_PARMS] = { .type = NLA_EXACT_LEN, .len = sizeof(struct tc_ct) },
+ [TCA_CT_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_ct)),
[TCA_CT_ZONE] = { .type = NLA_U16 },
[TCA_CT_MARK] = { .type = NLA_U32 },
[TCA_CT_MARK_MASK] = { .type = NLA_U32 },
@@ -503,10 +1062,8 @@
.len = 128 / BITS_PER_BYTE },
[TCA_CT_NAT_IPV4_MIN] = { .type = NLA_U32 },
[TCA_CT_NAT_IPV4_MAX] = { .type = NLA_U32 },
- [TCA_CT_NAT_IPV6_MIN] = { .type = NLA_EXACT_LEN,
- .len = sizeof(struct in6_addr) },
- [TCA_CT_NAT_IPV6_MAX] = { .type = NLA_EXACT_LEN,
- .len = sizeof(struct in6_addr) },
+ [TCA_CT_NAT_IPV6_MIN] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+ [TCA_CT_NAT_IPV6_MAX] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
[TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 },
[TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 },
};
@@ -670,7 +1227,7 @@
static int tcf_ct_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int replace, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, ct_net_id);
@@ -702,8 +1259,8 @@
return err;
if (!err) {
- err = tcf_idr_create(tn, index, est, a,
- &act_ct_ops, bind, true);
+ err = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_ct_ops, bind, flags);
if (err) {
tcf_idr_cleanup(tn, index);
return err;
@@ -734,9 +1291,14 @@
if (err)
goto cleanup;
+ err = tcf_ct_flow_table_get(params);
+ if (err)
+ goto cleanup;
+
spin_lock_bh(&c->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(c->params, params, lockdep_is_held(&c->tcf_lock));
+ params = rcu_replace_pointer(c->params, params,
+ lockdep_is_held(&c->tcf_lock));
spin_unlock_bh(&c->tcf_lock);
if (goto_ch)
@@ -912,16 +1474,12 @@
return tcf_idr_search(tn, a, index);
}
-static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
- u64 lastuse, bool hw)
+static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets,
+ u64 drops, u64 lastuse, bool hw)
{
struct tcf_ct *c = to_ct(a);
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
-
- if (hw)
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
- bytes, packets);
+ tcf_action_update_stats(a, bytes, packets, drops, hw);
c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse);
}
@@ -941,7 +1499,7 @@
static __net_init int ct_init_net(struct net *net)
{
- unsigned int n_bits = FIELD_SIZEOF(struct tcf_ct_params, labels) * 8;
+ unsigned int n_bits = sizeof_field(struct tcf_ct_params, labels) * 8;
struct tc_ct_action_net *tn = net_generic(net, ct_net_id);
if (nf_connlabels_get(net, n_bits - 1)) {
@@ -979,12 +1537,34 @@
static int __init ct_init_module(void)
{
- return tcf_register_action(&act_ct_ops, &ct_net_ops);
+ int err;
+
+ act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0);
+ if (!act_ct_wq)
+ return -ENOMEM;
+
+ err = tcf_ct_flow_tables_init();
+ if (err)
+ goto err_tbl_init;
+
+ err = tcf_register_action(&act_ct_ops, &ct_net_ops);
+ if (err)
+ goto err_register;
+
+ return 0;
+
+err_register:
+ tcf_ct_flow_tables_uninit();
+err_tbl_init:
+ destroy_workqueue(act_ct_wq);
+ return err;
}
static void __exit ct_cleanup_module(void)
{
tcf_unregister_action(&act_ct_ops, &ct_net_ops);
+ tcf_ct_flow_tables_uninit();
+ destroy_workqueue(act_ct_wq);
}
module_init(ct_init_module);
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
index 9722204..b20c8ce 100644
--- a/net/sched/act_ctinfo.c
+++ b/net/sched/act_ctinfo.c
@@ -144,9 +144,8 @@
}
static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = {
- [TCA_CTINFO_ACT] = { .type = NLA_EXACT_LEN,
- .len = sizeof(struct
- tc_ctinfo) },
+ [TCA_CTINFO_ACT] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct tc_ctinfo)),
[TCA_CTINFO_ZONE] = { .type = NLA_U16 },
[TCA_CTINFO_PARMS_DSCP_MASK] = { .type = NLA_U32 },
[TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 },
@@ -156,7 +155,7 @@
static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
@@ -213,7 +212,7 @@
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
ret = tcf_idr_create(tn, index, est, a,
- &act_ctinfo_ops, bind, false);
+ &act_ctinfo_ops, bind, false, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -260,8 +259,8 @@
spin_lock_bh(&ci->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch);
- rcu_swap_protected(ci->params, cp_new,
- lockdep_is_held(&ci->tcf_lock));
+ cp_new = rcu_replace_pointer(ci->params, cp_new,
+ lockdep_is_held(&ci->tcf_lock));
spin_unlock_bh(&ci->tcf_lock);
if (goto_ch)
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index faf68a4..73c3926 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -53,7 +53,8 @@
static int tcf_gact_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, gact_net_id);
struct nlattr *tb[TCA_GACT_MAX + 1];
@@ -98,8 +99,8 @@
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- ret = tcf_idr_create(tn, index, est, a,
- &act_gact_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_gact_ops, bind, flags);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -159,31 +160,24 @@
action = gact_rand[ptype](gact);
}
#endif
- bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb);
+ tcf_action_update_bstats(&gact->common, skb);
if (action == TC_ACT_SHOT)
- qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats));
+ tcf_action_inc_drop_qstats(&gact->common);
tcf_lastuse_update(&gact->tcf_tm);
return action;
}
-static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
- u64 lastuse, bool hw)
+static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u64 packets,
+ u64 drops, u64 lastuse, bool hw)
{
struct tcf_gact *gact = to_gact(a);
int action = READ_ONCE(gact->tcf_action);
struct tcf_t *tm = &gact->tcf_tm;
- _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes,
- packets);
- if (action == TC_ACT_SHOT)
- this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
-
- if (hw)
- _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats_hw),
- bytes, packets);
-
+ tcf_action_update_stats(a, bytes, packets,
+ action == TC_ACT_SHOT ? packets : drops, hw);
tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c
new file mode 100644
index 0000000..a78cb79
--- /dev/null
+++ b/net/sched/act_gate.c
@@ -0,0 +1,646 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright 2020 NXP */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/act_api.h>
+#include <net/netlink.h>
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_gate.h>
+
+static unsigned int gate_net_id;
+static struct tc_action_ops act_gate_ops;
+
+static ktime_t gate_get_time(struct tcf_gate *gact)
+{
+ ktime_t mono = ktime_get();
+
+ switch (gact->tk_offset) {
+ case TK_OFFS_MAX:
+ return mono;
+ default:
+ return ktime_mono_to_any(mono, gact->tk_offset);
+ }
+
+ return KTIME_MAX;
+}
+
+static void gate_get_start_time(struct tcf_gate *gact, ktime_t *start)
+{
+ struct tcf_gate_params *param = &gact->param;
+ ktime_t now, base, cycle;
+ u64 n;
+
+ base = ns_to_ktime(param->tcfg_basetime);
+ now = gate_get_time(gact);
+
+ if (ktime_after(base, now)) {
+ *start = base;
+ return;
+ }
+
+ cycle = param->tcfg_cycletime;
+
+ n = div64_u64(ktime_sub_ns(now, base), cycle);
+ *start = ktime_add_ns(base, (n + 1) * cycle);
+}
+
+static void gate_start_timer(struct tcf_gate *gact, ktime_t start)
+{
+ ktime_t expires;
+
+ expires = hrtimer_get_expires(&gact->hitimer);
+ if (expires == 0)
+ expires = KTIME_MAX;
+
+ start = min_t(ktime_t, start, expires);
+
+ hrtimer_start(&gact->hitimer, start, HRTIMER_MODE_ABS_SOFT);
+}
+
+static enum hrtimer_restart gate_timer_func(struct hrtimer *timer)
+{
+ struct tcf_gate *gact = container_of(timer, struct tcf_gate,
+ hitimer);
+ struct tcf_gate_params *p = &gact->param;
+ struct tcfg_gate_entry *next;
+ ktime_t close_time, now;
+
+ spin_lock(&gact->tcf_lock);
+
+ next = gact->next_entry;
+
+ /* cycle start, clear pending bit, clear total octets */
+ gact->current_gate_status = next->gate_state ? GATE_ACT_GATE_OPEN : 0;
+ gact->current_entry_octets = 0;
+ gact->current_max_octets = next->maxoctets;
+
+ gact->current_close_time = ktime_add_ns(gact->current_close_time,
+ next->interval);
+
+ close_time = gact->current_close_time;
+
+ if (list_is_last(&next->list, &p->entries))
+ next = list_first_entry(&p->entries,
+ struct tcfg_gate_entry, list);
+ else
+ next = list_next_entry(next, list);
+
+ now = gate_get_time(gact);
+
+ if (ktime_after(now, close_time)) {
+ ktime_t cycle, base;
+ u64 n;
+
+ cycle = p->tcfg_cycletime;
+ base = ns_to_ktime(p->tcfg_basetime);
+ n = div64_u64(ktime_sub_ns(now, base), cycle);
+ close_time = ktime_add_ns(base, (n + 1) * cycle);
+ }
+
+ gact->next_entry = next;
+
+ hrtimer_set_expires(&gact->hitimer, close_time);
+
+ spin_unlock(&gact->tcf_lock);
+
+ return HRTIMER_RESTART;
+}
+
+static int tcf_gate_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_gate *gact = to_gate(a);
+
+ spin_lock(&gact->tcf_lock);
+
+ tcf_lastuse_update(&gact->tcf_tm);
+ bstats_update(&gact->tcf_bstats, skb);
+
+ if (unlikely(gact->current_gate_status & GATE_ACT_PENDING)) {
+ spin_unlock(&gact->tcf_lock);
+ return gact->tcf_action;
+ }
+
+ if (!(gact->current_gate_status & GATE_ACT_GATE_OPEN))
+ goto drop;
+
+ if (gact->current_max_octets >= 0) {
+ gact->current_entry_octets += qdisc_pkt_len(skb);
+ if (gact->current_entry_octets > gact->current_max_octets) {
+ gact->tcf_qstats.overlimits++;
+ goto drop;
+ }
+ }
+
+ spin_unlock(&gact->tcf_lock);
+
+ return gact->tcf_action;
+drop:
+ gact->tcf_qstats.drops++;
+ spin_unlock(&gact->tcf_lock);
+
+ return TC_ACT_SHOT;
+}
+
+static const struct nla_policy entry_policy[TCA_GATE_ENTRY_MAX + 1] = {
+ [TCA_GATE_ENTRY_INDEX] = { .type = NLA_U32 },
+ [TCA_GATE_ENTRY_GATE] = { .type = NLA_FLAG },
+ [TCA_GATE_ENTRY_INTERVAL] = { .type = NLA_U32 },
+ [TCA_GATE_ENTRY_IPV] = { .type = NLA_S32 },
+ [TCA_GATE_ENTRY_MAX_OCTETS] = { .type = NLA_S32 },
+};
+
+static const struct nla_policy gate_policy[TCA_GATE_MAX + 1] = {
+ [TCA_GATE_PARMS] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct tc_gate)),
+ [TCA_GATE_PRIORITY] = { .type = NLA_S32 },
+ [TCA_GATE_ENTRY_LIST] = { .type = NLA_NESTED },
+ [TCA_GATE_BASE_TIME] = { .type = NLA_U64 },
+ [TCA_GATE_CYCLE_TIME] = { .type = NLA_U64 },
+ [TCA_GATE_CYCLE_TIME_EXT] = { .type = NLA_U64 },
+ [TCA_GATE_FLAGS] = { .type = NLA_U32 },
+ [TCA_GATE_CLOCKID] = { .type = NLA_S32 },
+};
+
+static int fill_gate_entry(struct nlattr **tb, struct tcfg_gate_entry *entry,
+ struct netlink_ext_ack *extack)
+{
+ u32 interval = 0;
+
+ entry->gate_state = nla_get_flag(tb[TCA_GATE_ENTRY_GATE]);
+
+ if (tb[TCA_GATE_ENTRY_INTERVAL])
+ interval = nla_get_u32(tb[TCA_GATE_ENTRY_INTERVAL]);
+
+ if (interval == 0) {
+ NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry");
+ return -EINVAL;
+ }
+
+ entry->interval = interval;
+
+ if (tb[TCA_GATE_ENTRY_IPV])
+ entry->ipv = nla_get_s32(tb[TCA_GATE_ENTRY_IPV]);
+ else
+ entry->ipv = -1;
+
+ if (tb[TCA_GATE_ENTRY_MAX_OCTETS])
+ entry->maxoctets = nla_get_s32(tb[TCA_GATE_ENTRY_MAX_OCTETS]);
+ else
+ entry->maxoctets = -1;
+
+ return 0;
+}
+
+static int parse_gate_entry(struct nlattr *n, struct tcfg_gate_entry *entry,
+ int index, struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_GATE_ENTRY_MAX + 1] = { };
+ int err;
+
+ err = nla_parse_nested(tb, TCA_GATE_ENTRY_MAX, n, entry_policy, extack);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Could not parse nested entry");
+ return -EINVAL;
+ }
+
+ entry->index = index;
+
+ return fill_gate_entry(tb, entry, extack);
+}
+
+static void release_entry_list(struct list_head *entries)
+{
+ struct tcfg_gate_entry *entry, *e;
+
+ list_for_each_entry_safe(entry, e, entries, list) {
+ list_del(&entry->list);
+ kfree(entry);
+ }
+}
+
+static int parse_gate_list(struct nlattr *list_attr,
+ struct tcf_gate_params *sched,
+ struct netlink_ext_ack *extack)
+{
+ struct tcfg_gate_entry *entry;
+ struct nlattr *n;
+ int err, rem;
+ int i = 0;
+
+ if (!list_attr)
+ return -EINVAL;
+
+ nla_for_each_nested(n, list_attr, rem) {
+ if (nla_type(n) != TCA_GATE_ONE_ENTRY) {
+ NL_SET_ERR_MSG(extack, "Attribute isn't type 'entry'");
+ continue;
+ }
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry) {
+ NL_SET_ERR_MSG(extack, "Not enough memory for entry");
+ err = -ENOMEM;
+ goto release_list;
+ }
+
+ err = parse_gate_entry(n, entry, i, extack);
+ if (err < 0) {
+ kfree(entry);
+ goto release_list;
+ }
+
+ list_add_tail(&entry->list, &sched->entries);
+ i++;
+ }
+
+ sched->num_entries = i;
+
+ return i;
+
+release_list:
+ release_entry_list(&sched->entries);
+
+ return err;
+}
+
+static void gate_setup_timer(struct tcf_gate *gact, u64 basetime,
+ enum tk_offsets tko, s32 clockid,
+ bool do_init)
+{
+ if (!do_init) {
+ if (basetime == gact->param.tcfg_basetime &&
+ tko == gact->tk_offset &&
+ clockid == gact->param.tcfg_clockid)
+ return;
+
+ spin_unlock_bh(&gact->tcf_lock);
+ hrtimer_cancel(&gact->hitimer);
+ spin_lock_bh(&gact->tcf_lock);
+ }
+ gact->param.tcfg_basetime = basetime;
+ gact->param.tcfg_clockid = clockid;
+ gact->tk_offset = tko;
+ hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT);
+ gact->hitimer.function = gate_timer_func;
+}
+
+static int tcf_gate_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, gate_net_id);
+ enum tk_offsets tk_offset = TK_OFFS_TAI;
+ struct nlattr *tb[TCA_GATE_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
+ u64 cycletime = 0, basetime = 0;
+ struct tcf_gate_params *p;
+ s32 clockid = CLOCK_TAI;
+ struct tcf_gate *gact;
+ struct tc_gate *parm;
+ int ret = 0, err;
+ u32 gflags = 0;
+ s32 prio = -1;
+ ktime_t start;
+ u32 index;
+
+ if (!nla)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_GATE_MAX, nla, gate_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_GATE_PARMS])
+ return -EINVAL;
+
+ if (tb[TCA_GATE_CLOCKID]) {
+ clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]);
+ switch (clockid) {
+ case CLOCK_REALTIME:
+ tk_offset = TK_OFFS_REAL;
+ break;
+ case CLOCK_MONOTONIC:
+ tk_offset = TK_OFFS_MAX;
+ break;
+ case CLOCK_BOOTTIME:
+ tk_offset = TK_OFFS_BOOT;
+ break;
+ case CLOCK_TAI:
+ tk_offset = TK_OFFS_TAI;
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
+ return -EINVAL;
+ }
+ }
+
+ parm = nla_data(tb[TCA_GATE_PARMS]);
+ index = parm->index;
+
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (err < 0)
+ return err;
+
+ if (err && bind)
+ return 0;
+
+ if (!err) {
+ ret = tcf_idr_create(tn, index, est, a,
+ &act_gate_ops, bind, false, 0);
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
+ return ret;
+ }
+
+ ret = ACT_P_CREATED;
+ } else if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+
+ if (tb[TCA_GATE_PRIORITY])
+ prio = nla_get_s32(tb[TCA_GATE_PRIORITY]);
+
+ if (tb[TCA_GATE_BASE_TIME])
+ basetime = nla_get_u64(tb[TCA_GATE_BASE_TIME]);
+
+ if (tb[TCA_GATE_FLAGS])
+ gflags = nla_get_u32(tb[TCA_GATE_FLAGS]);
+
+ gact = to_gate(*a);
+ if (ret == ACT_P_CREATED)
+ INIT_LIST_HEAD(&gact->param.entries);
+
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
+ spin_lock_bh(&gact->tcf_lock);
+ p = &gact->param;
+
+ if (tb[TCA_GATE_CYCLE_TIME])
+ cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]);
+
+ if (tb[TCA_GATE_ENTRY_LIST]) {
+ err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack);
+ if (err < 0)
+ goto chain_put;
+ }
+
+ if (!cycletime) {
+ struct tcfg_gate_entry *entry;
+ ktime_t cycle = 0;
+
+ list_for_each_entry(entry, &p->entries, list)
+ cycle = ktime_add_ns(cycle, entry->interval);
+ cycletime = cycle;
+ if (!cycletime) {
+ err = -EINVAL;
+ goto chain_put;
+ }
+ }
+ p->tcfg_cycletime = cycletime;
+
+ if (tb[TCA_GATE_CYCLE_TIME_EXT])
+ p->tcfg_cycletime_ext =
+ nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]);
+
+ gate_setup_timer(gact, basetime, tk_offset, clockid,
+ ret == ACT_P_CREATED);
+ p->tcfg_priority = prio;
+ p->tcfg_flags = gflags;
+ gate_get_start_time(gact, &start);
+
+ gact->current_close_time = start;
+ gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING;
+
+ gact->next_entry = list_first_entry(&p->entries,
+ struct tcfg_gate_entry, list);
+
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+
+ gate_start_timer(gact, start);
+
+ spin_unlock_bh(&gact->tcf_lock);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+
+ return ret;
+
+chain_put:
+ spin_unlock_bh(&gact->tcf_lock);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ /* action is not inserted in any list: it's safe to init hitimer
+ * without taking tcf_lock.
+ */
+ if (ret == ACT_P_CREATED)
+ gate_setup_timer(gact, gact->param.tcfg_basetime,
+ gact->tk_offset, gact->param.tcfg_clockid,
+ true);
+ tcf_idr_release(*a, bind);
+ return err;
+}
+
+static void tcf_gate_cleanup(struct tc_action *a)
+{
+ struct tcf_gate *gact = to_gate(a);
+ struct tcf_gate_params *p;
+
+ p = &gact->param;
+ hrtimer_cancel(&gact->hitimer);
+ release_entry_list(&p->entries);
+}
+
+static int dumping_entry(struct sk_buff *skb,
+ struct tcfg_gate_entry *entry)
+{
+ struct nlattr *item;
+
+ item = nla_nest_start_noflag(skb, TCA_GATE_ONE_ENTRY);
+ if (!item)
+ return -ENOSPC;
+
+ if (nla_put_u32(skb, TCA_GATE_ENTRY_INDEX, entry->index))
+ goto nla_put_failure;
+
+ if (entry->gate_state && nla_put_flag(skb, TCA_GATE_ENTRY_GATE))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_GATE_ENTRY_INTERVAL, entry->interval))
+ goto nla_put_failure;
+
+ if (nla_put_s32(skb, TCA_GATE_ENTRY_MAX_OCTETS, entry->maxoctets))
+ goto nla_put_failure;
+
+ if (nla_put_s32(skb, TCA_GATE_ENTRY_IPV, entry->ipv))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, item);
+
+nla_put_failure:
+ nla_nest_cancel(skb, item);
+ return -1;
+}
+
+static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_gate *gact = to_gate(a);
+ struct tc_gate opt = {
+ .index = gact->tcf_index,
+ .refcnt = refcount_read(&gact->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind,
+ };
+ struct tcfg_gate_entry *entry;
+ struct tcf_gate_params *p;
+ struct nlattr *entry_list;
+ struct tcf_t t;
+
+ spin_lock_bh(&gact->tcf_lock);
+ opt.action = gact->tcf_action;
+
+ p = &gact->param;
+
+ if (nla_put(skb, TCA_GATE_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(skb, TCA_GATE_BASE_TIME,
+ p->tcfg_basetime, TCA_GATE_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME,
+ p->tcfg_cycletime, TCA_GATE_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME_EXT,
+ p->tcfg_cycletime_ext, TCA_GATE_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_s32(skb, TCA_GATE_CLOCKID, p->tcfg_clockid))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_GATE_FLAGS, p->tcfg_flags))
+ goto nla_put_failure;
+
+ if (nla_put_s32(skb, TCA_GATE_PRIORITY, p->tcfg_priority))
+ goto nla_put_failure;
+
+ entry_list = nla_nest_start_noflag(skb, TCA_GATE_ENTRY_LIST);
+ if (!entry_list)
+ goto nla_put_failure;
+
+ list_for_each_entry(entry, &p->entries, list) {
+ if (dumping_entry(skb, entry) < 0)
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, entry_list);
+
+ tcf_tm_dump(&t, &gact->tcf_tm);
+ if (nla_put_64bit(skb, TCA_GATE_TM, sizeof(t), &t, TCA_GATE_PAD))
+ goto nla_put_failure;
+ spin_unlock_bh(&gact->tcf_lock);
+
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&gact->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_gate_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, gate_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static void tcf_gate_stats_update(struct tc_action *a, u64 bytes, u64 packets,
+ u64 drops, u64 lastuse, bool hw)
+{
+ struct tcf_gate *gact = to_gate(a);
+ struct tcf_t *tm = &gact->tcf_tm;
+
+ tcf_action_update_stats(a, bytes, packets, drops, hw);
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse);
+}
+
+static int tcf_gate_search(struct net *net, struct tc_action **a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, gate_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static size_t tcf_gate_get_fill_size(const struct tc_action *act)
+{
+ return nla_total_size(sizeof(struct tc_gate));
+}
+
+static struct tc_action_ops act_gate_ops = {
+ .kind = "gate",
+ .id = TCA_ID_GATE,
+ .owner = THIS_MODULE,
+ .act = tcf_gate_act,
+ .dump = tcf_gate_dump,
+ .init = tcf_gate_init,
+ .cleanup = tcf_gate_cleanup,
+ .walk = tcf_gate_walker,
+ .stats_update = tcf_gate_stats_update,
+ .get_fill_size = tcf_gate_get_fill_size,
+ .lookup = tcf_gate_search,
+ .size = sizeof(struct tcf_gate),
+};
+
+static __net_init int gate_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, gate_net_id);
+
+ return tc_action_net_init(net, tn, &act_gate_ops);
+}
+
+static void __net_exit gate_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, gate_net_id);
+}
+
+static struct pernet_operations gate_net_ops = {
+ .init = gate_init_net,
+ .exit_batch = gate_exit_net,
+ .id = &gate_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init gate_init_module(void)
+{
+ return tcf_register_action(&act_gate_ops, &gate_net_ops);
+}
+
+static void __exit gate_cleanup_module(void)
+{
+ tcf_unregister_action(&act_gate_ops, &gate_net_ops);
+}
+
+module_init(gate_init_module);
+module_exit(gate_cleanup_module);
+MODULE_LICENSE("GPL v2");
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 488d104..a2ddea0 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -480,7 +480,8 @@
static int tcf_ife_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, ife_net_id);
struct nlattr *tb[TCA_IFE_MAX + 1];
@@ -552,7 +553,7 @@
if (!exists) {
ret = tcf_idr_create(tn, index, est, a, &act_ife_ops,
- bind, true);
+ bind, true, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
kfree(p);
@@ -617,7 +618,7 @@
spin_lock_bh(&ife->tcf_lock);
/* protected by tcf_lock when modifying existing action */
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(ife->params, p, 1);
+ p = rcu_replace_pointer(ife->params, p, 1);
if (exists)
spin_unlock_bh(&ife->tcf_lock);
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 02b0cb6..8dc3bec 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -95,7 +95,7 @@
static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
const struct tc_action_ops *ops, int ovr, int bind,
- struct tcf_proto *tp)
+ struct tcf_proto *tp, u32 flags)
{
struct tc_action_net *tn = net_generic(net, id);
struct nlattr *tb[TCA_IPT_MAX + 1];
@@ -144,7 +144,7 @@
if (!exists) {
ret = tcf_idr_create(tn, index, est, a, ops, bind,
- false);
+ false, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -203,19 +203,19 @@
static int tcf_ipt_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
int bind, bool rtnl_held, struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr,
- bind, tp);
+ bind, tp, flags);
}
static int tcf_xt_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
int bind, bool unlocked, struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr,
- bind, tp);
+ bind, tp, flags);
}
static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a,
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index e3ff884..24d561d 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -19,6 +19,7 @@
#include <linux/if_arp.h>
#include <net/net_namespace.h>
#include <net/netlink.h>
+#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <linux/tc_act/tc_mirred.h>
@@ -93,7 +94,7 @@
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, mirred_net_id);
struct nlattr *tb[TCA_MIRRED_MAX + 1];
@@ -148,8 +149,8 @@
NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist");
return -EINVAL;
}
- ret = tcf_idr_create(tn, index, est, a,
- &act_mirred_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_mirred_ops, bind, flags);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -178,8 +179,8 @@
goto put_chain;
}
mac_header_xmit = dev_is_mac_header_xmit(dev);
- rcu_swap_protected(m->tcfm_dev, dev,
- lockdep_is_held(&m->tcf_lock));
+ dev = rcu_replace_pointer(m->tcfm_dev, dev,
+ lockdep_is_held(&m->tcf_lock));
if (dev)
dev_put(dev);
m->tcfm_mac_header_xmit = mac_header_xmit;
@@ -218,6 +219,7 @@
bool want_ingress;
bool is_redirect;
bool expects_nh;
+ bool at_ingress;
int m_eaction;
int mac_len;
bool at_nh;
@@ -231,7 +233,7 @@
}
tcf_lastuse_update(&m->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
+ tcf_action_update_bstats(&m->common, skb);
m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
m_eaction = READ_ONCE(m->tcfm_eaction);
@@ -253,7 +255,8 @@
* ingress - that covers the TC S/W datapath.
*/
is_redirect = tcf_mirred_is_act_redirect(m_eaction);
- use_reinsert = skb_at_tc_ingress(skb) && is_redirect &&
+ at_ingress = skb_at_tc_ingress(skb);
+ use_reinsert = at_ingress && is_redirect &&
tcf_mirred_can_reinsert(retval);
if (!use_reinsert) {
skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -261,10 +264,12 @@
goto out;
}
+ want_ingress = tcf_mirred_act_wants_ingress(m_eaction);
+
/* All mirred/redirected skbs should clear previous ct info */
nf_reset_ct(skb2);
-
- want_ingress = tcf_mirred_act_wants_ingress(m_eaction);
+ if (want_ingress && !at_ingress) /* drop dst for egress -> ingress */
+ skb_dst_drop(skb2);
expects_nh = want_ingress || !m_mac_header_xmit;
at_nh = skb->data == skb_network_header(skb);
@@ -290,8 +295,8 @@
/* let's the caller reinsert the packet, if possible */
if (use_reinsert) {
res->ingress = want_ingress;
- res->qstats = this_cpu_ptr(m->common.cpu_qstats);
- skb_tc_reinsert(skb, res);
+ if (skb_tc_reinsert(skb, res))
+ tcf_action_inc_overlimit_qstats(&m->common);
__this_cpu_dec(mirred_rec_level);
return TC_ACT_CONSUMED;
}
@@ -304,7 +309,7 @@
if (err) {
out:
- qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats));
+ tcf_action_inc_overlimit_qstats(&m->common);
if (tcf_mirred_is_act_redirect(m_eaction))
retval = TC_ACT_SHOT;
}
@@ -313,16 +318,13 @@
return retval;
}
-static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
- u64 lastuse, bool hw)
+static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets,
+ u64 drops, u64 lastuse, bool hw)
{
struct tcf_mirred *m = to_mirred(a);
struct tcf_t *tm = &m->tcf_tm;
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
- if (hw)
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
- bytes, packets);
+ tcf_action_update_stats(a, bytes, packets, drops, hw);
tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
index 0fccae3..d1486ea 100644
--- a/net/sched/act_mpls.c
+++ b/net/sched/act_mpls.c
@@ -87,6 +87,23 @@
skb->dev && skb->dev->type == ARPHRD_ETHER))
goto drop;
break;
+ case TCA_MPLS_ACT_MAC_PUSH:
+ if (skb_vlan_tag_present(skb)) {
+ if (__vlan_insert_inner_tag(skb, skb->vlan_proto,
+ skb_vlan_tag_get(skb),
+ ETH_HLEN) < 0)
+ goto drop;
+
+ skb->protocol = skb->vlan_proto;
+ __vlan_hwaccel_clear_tag(skb);
+ }
+
+ new_lse = tcf_mpls_get_lse(NULL, p, mac_len ||
+ !eth_p_mpls(skb->protocol));
+
+ if (skb_mpls_push(skb, new_lse, p->tcfm_proto, 0, false))
+ goto drop;
+ break;
case TCA_MPLS_ACT_MODIFY:
if (!pskb_may_pull(skb,
skb_network_offset(skb) + MPLS_HLEN))
@@ -125,7 +142,6 @@
}
static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = {
- [TCA_MPLS_UNSPEC] = { .strict_start_type = TCA_MPLS_UNSPEC + 1 },
[TCA_MPLS_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)),
[TCA_MPLS_PROTO] = { .type = NLA_U16 },
[TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_U32, valid_label),
@@ -137,7 +153,8 @@
static int tcf_mpls_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, mpls_net_id);
struct nlattr *tb[TCA_MPLS_MAX + 1];
@@ -191,6 +208,7 @@
}
break;
case TCA_MPLS_ACT_PUSH:
+ case TCA_MPLS_ACT_MAC_PUSH:
if (!tb[TCA_MPLS_LABEL]) {
NL_SET_ERR_MSG_MOD(extack, "Label is required for MPLS push");
return -EINVAL;
@@ -230,7 +248,7 @@
if (!exists) {
ret = tcf_idr_create(tn, index, est, a,
- &act_mpls_ops, bind, true);
+ &act_mpls_ops, bind, true, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -268,7 +286,7 @@
spin_lock_bh(&m->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(m->mpls_p, p, lockdep_is_held(&m->tcf_lock));
+ p = rcu_replace_pointer(m->mpls_p, p, lockdep_is_held(&m->tcf_lock));
spin_unlock_bh(&m->tcf_lock);
if (goto_ch)
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 8b5d236..1ebd2a8 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -36,7 +36,7 @@
static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
struct tc_action **a, int ovr, int bind,
bool rtnl_held, struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, nat_net_id);
struct nlattr *tb[TCA_NAT_MAX + 1];
@@ -61,7 +61,7 @@
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
ret = tcf_idr_create(tn, index, est, a,
- &act_nat_ops, bind, false);
+ &act_nat_ops, bind, false, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -203,9 +203,7 @@
icmph = (void *)(skb_network_header(skb) + ihl);
- if ((icmph->type != ICMP_DEST_UNREACH) &&
- (icmph->type != ICMP_TIME_EXCEEDED) &&
- (icmph->type != ICMP_PARAMETERPROB))
+ if (!icmp_is_err(icmph->type))
break;
if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) +
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index ff4f243..b453044 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -137,7 +137,8 @@
static int tcf_pedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, pedit_net_id);
struct nlattr *tb[TCA_PEDIT_MAX + 1];
@@ -188,7 +189,7 @@
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
ret = tcf_idr_create(tn, index, est, a,
- &act_pedit_ops, bind, false);
+ &act_pedit_ops, bind, false, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
goto out_free;
@@ -406,6 +407,16 @@
return p->tcf_action;
}
+static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u64 packets,
+ u64 drops, u64 lastuse, bool hw)
+{
+ struct tcf_pedit *d = to_pedit(a);
+ struct tcf_t *tm = &d->tcf_tm;
+
+ tcf_action_update_stats(a, bytes, packets, drops, hw);
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse);
+}
+
static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
@@ -423,8 +434,7 @@
return -ENOBUFS;
spin_lock_bh(&p->tcf_lock);
- memcpy(opt->keys, p->tcfp_keys,
- p->tcfp_nkeys * sizeof(struct tc_pedit_key));
+ memcpy(opt->keys, p->tcfp_keys, flex_array_size(opt, keys, p->tcfp_nkeys));
opt->index = p->tcf_index;
opt->nkeys = p->tcfp_nkeys;
opt->flags = p->tcfp_flags;
@@ -482,6 +492,7 @@
.id = TCA_ID_PEDIT,
.owner = THIS_MODULE,
.act = tcf_pedit_act,
+ .stats_update = tcf_pedit_stats_update,
.dump = tcf_pedit_dump,
.cleanup = tcf_pedit_cleanup,
.init = tcf_pedit_init,
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 8fd23a8..8d8452b 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -47,7 +47,7 @@
static int tcf_police_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
int ret = 0, tcfp_result = TC_ACT_OK, err, size;
@@ -87,7 +87,7 @@
if (!exists) {
ret = tcf_idr_create(tn, index, NULL, a,
- &act_police_ops, bind, true);
+ &act_police_ops, bind, true, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -191,9 +191,9 @@
police->tcfp_ptoks = new->tcfp_mtu_ptoks;
spin_unlock_bh(&police->tcfp_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(police->params,
- new,
- lockdep_is_held(&police->tcf_lock));
+ new = rcu_replace_pointer(police->params,
+ new,
+ lockdep_is_held(&police->tcf_lock));
spin_unlock_bh(&police->tcf_lock);
if (goto_ch)
@@ -286,16 +286,13 @@
}
static void tcf_police_stats_update(struct tc_action *a,
- u64 bytes, u32 packets,
+ u64 bytes, u64 packets, u64 drops,
u64 lastuse, bool hw)
{
struct tcf_police *police = to_police(a);
struct tcf_t *tm = &police->tcf_tm;
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
- if (hw)
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
- bytes, packets);
+ tcf_action_update_stats(a, bytes, packets, drops, hw);
tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
@@ -343,10 +340,7 @@
nla_put_u32(skb, TCA_POLICE_AVRATE, p->tcfp_ewma_rate))
goto nla_put_failure;
- t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install);
- t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse);
- t.firstuse = jiffies_to_clock_t(jiffies - police->tcf_tm.firstuse);
- t.expires = jiffies_to_clock_t(police->tcf_tm.expires);
+ tcf_tm_dump(&t, &police->tcf_tm);
if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD))
goto nla_put_failure;
spin_unlock_bh(&police->tcf_lock);
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 74450b0..3ebf9ed 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -36,7 +36,7 @@
static int tcf_sample_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
int bind, bool rtnl_held, struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, sample_net_id);
struct nlattr *tb[TCA_SAMPLE_MAX + 1];
@@ -69,7 +69,7 @@
if (!exists) {
ret = tcf_idr_create(tn, index, est, a,
- &act_sample_ops, bind, true);
+ &act_sample_ops, bind, true, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -102,8 +102,8 @@
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
s->rate = rate;
s->psample_group_num = psample_group_num;
- rcu_swap_protected(s->psample_group, psample_group,
- lockdep_is_held(&s->tcf_lock));
+ psample_group = rcu_replace_pointer(s->psample_group, psample_group,
+ lockdep_is_held(&s->tcf_lock));
if (tb[TCA_SAMPLE_TRUNC_SIZE]) {
s->truncate = true;
@@ -265,14 +265,12 @@
struct tcf_sample *s = to_sample(a);
struct psample_group *group;
- spin_lock_bh(&s->tcf_lock);
group = rcu_dereference_protected(s->psample_group,
lockdep_is_held(&s->tcf_lock));
if (group) {
psample_group_take(group);
*destructor = tcf_psample_group_put;
}
- spin_unlock_bh(&s->tcf_lock);
return group;
}
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 6b0617d..a4f3d0f 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -35,7 +35,7 @@
* Example if this was the 3rd packet and the string was "hello"
* then it would look like "hello_3" (without quotes)
*/
- pr_info("simple: %s_%d\n",
+ pr_info("simple: %s_%llu\n",
(char *)d->tcfd_defdata, d->tcf_bstats.packets);
spin_unlock(&d->tcf_lock);
return d->tcf_action;
@@ -86,7 +86,8 @@
static int tcf_simp_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, simp_net_id);
struct nlattr *tb[TCA_DEF_MAX + 1];
@@ -127,7 +128,7 @@
if (!exists) {
ret = tcf_idr_create(tn, index, est, a,
- &act_simp_ops, bind, false);
+ &act_simp_ops, bind, false, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index f736da5..e5f3fb8 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -73,6 +73,17 @@
return TC_ACT_SHOT;
}
+static void tcf_skbedit_stats_update(struct tc_action *a, u64 bytes,
+ u64 packets, u64 drops,
+ u64 lastuse, bool hw)
+{
+ struct tcf_skbedit *d = to_skbedit(a);
+ struct tcf_t *tm = &d->tcf_tm;
+
+ tcf_action_update_stats(a, bytes, packets, drops, hw);
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse);
+}
+
static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
[TCA_SKBEDIT_PARMS] = { .len = sizeof(struct tc_skbedit) },
[TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) },
@@ -86,7 +97,7 @@
static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 act_flags,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, skbedit_net_id);
@@ -165,7 +176,7 @@
if (!exists) {
ret = tcf_idr_create(tn, index, est, a,
- &act_skbedit_ops, bind, true);
+ &act_skbedit_ops, bind, true, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -206,8 +217,8 @@
spin_lock_bh(&d->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(d->params, params_new,
- lockdep_is_held(&d->tcf_lock));
+ params_new = rcu_replace_pointer(d->params, params_new,
+ lockdep_is_held(&d->tcf_lock));
spin_unlock_bh(&d->tcf_lock);
if (params_new)
kfree_rcu(params_new, rcu);
@@ -321,6 +332,7 @@
.id = TCA_ID_SKBEDIT,
.owner = THIS_MODULE,
.act = tcf_skbedit_act,
+ .stats_update = tcf_skbedit_stats_update,
.dump = tcf_skbedit_dump,
.init = tcf_skbedit_init,
.cleanup = tcf_skbedit_cleanup,
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index f60d349..8d17a54 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -83,7 +83,7 @@
static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, skbmod_net_id);
@@ -147,7 +147,7 @@
if (!exists) {
ret = tcf_idr_create(tn, index, est, a,
- &act_skbmod_ops, bind, true);
+ &act_skbmod_ops, bind, true, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index a5a2bf0..85c0d0d 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -10,6 +10,8 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <net/geneve.h>
+#include <net/vxlan.h>
+#include <net/erspan.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
@@ -31,7 +33,7 @@
params = rcu_dereference_bh(t->params);
tcf_lastuse_update(&t->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb);
+ tcf_action_update_bstats(&t->common, skb);
action = READ_ONCE(t->tcf_action);
switch (params->tcft_action) {
@@ -53,7 +55,11 @@
static const struct nla_policy
enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPTS_UNSPEC] = {
+ .strict_start_type = TCA_TUNNEL_KEY_ENC_OPTS_VXLAN },
[TCA_TUNNEL_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED },
+ [TCA_TUNNEL_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED },
+ [TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED },
};
static const struct nla_policy
@@ -64,6 +70,19 @@
.len = 128 },
};
+static const struct nla_policy
+vxlan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+erspan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX] = { .type = NLA_U32 },
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 },
+};
+
static int
tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len,
struct netlink_ext_ack *extack)
@@ -116,10 +135,90 @@
return opt_len;
}
+static int
+tunnel_key_copy_vxlan_opt(const struct nlattr *nla, void *dst, int dst_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX, nla,
+ vxlan_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp");
+ return -EINVAL;
+ }
+
+ if (dst) {
+ struct vxlan_metadata *md = dst;
+
+ md->gbp = nla_get_u32(tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]);
+ md->gbp &= VXLAN_GBP_MASK;
+ }
+
+ return sizeof(struct vxlan_metadata);
+}
+
+static int
+tunnel_key_copy_erspan_opt(const struct nlattr *nla, void *dst, int dst_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1];
+ int err;
+ u8 ver;
+
+ err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX, nla,
+ erspan_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver");
+ return -EINVAL;
+ }
+
+ ver = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]);
+ if (ver == 1) {
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index");
+ return -EINVAL;
+ }
+ } else if (ver == 2) {
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] ||
+ !tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid");
+ return -EINVAL;
+ }
+ } else {
+ NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect");
+ return -EINVAL;
+ }
+
+ if (dst) {
+ struct erspan_metadata *md = dst;
+
+ md->version = ver;
+ if (ver == 1) {
+ nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX];
+ md->u.index = nla_get_be32(nla);
+ } else {
+ nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR];
+ md->u.md2.dir = nla_get_u8(nla);
+ nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID];
+ set_hwid(&md->u.md2, nla_get_u8(nla));
+ }
+ }
+
+ return sizeof(struct erspan_metadata);
+}
+
static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
int dst_len, struct netlink_ext_ack *extack)
{
- int err, rem, opt_len, len = nla_len(nla), opts_len = 0;
+ int err, rem, opt_len, len = nla_len(nla), opts_len = 0, type = 0;
const struct nlattr *attr, *head = nla_data(nla);
err = nla_validate_deprecated(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX,
@@ -130,6 +229,10 @@
nla_for_each_attr(attr, head, len, rem) {
switch (nla_type(attr)) {
case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
+ if (type && type != TUNNEL_GENEVE_OPT) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for geneve options");
+ return -EINVAL;
+ }
opt_len = tunnel_key_copy_geneve_opt(attr, dst,
dst_len, extack);
if (opt_len < 0)
@@ -143,6 +246,31 @@
dst_len -= opt_len;
dst += opt_len;
}
+ type = TUNNEL_GENEVE_OPT;
+ break;
+ case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN:
+ if (type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options");
+ return -EINVAL;
+ }
+ opt_len = tunnel_key_copy_vxlan_opt(attr, dst,
+ dst_len, extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ type = TUNNEL_VXLAN_OPT;
+ break;
+ case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN:
+ if (type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for erspan options");
+ return -EINVAL;
+ }
+ opt_len = tunnel_key_copy_erspan_opt(attr, dst,
+ dst_len, extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ type = TUNNEL_ERSPAN_OPT;
break;
}
}
@@ -179,6 +307,22 @@
#else
return -EAFNOSUPPORT;
#endif
+ case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN:
+#if IS_ENABLED(CONFIG_INET)
+ info->key.tun_flags |= TUNNEL_VXLAN_OPT;
+ return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
+ opts_len, extack);
+#else
+ return -EAFNOSUPPORT;
+#endif
+ case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN:
+#if IS_ENABLED(CONFIG_INET)
+ info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+ return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
+ opts_len, extack);
+#else
+ return -EAFNOSUPPORT;
+#endif
default:
NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type");
return -EINVAL;
@@ -212,7 +356,7 @@
static int tunnel_key_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 act_flags,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
@@ -351,8 +495,9 @@
}
if (!exists) {
- ret = tcf_idr_create(tn, index, est, a,
- &act_tunnel_key_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_tunnel_key_ops, bind,
+ act_flags);
if (ret) {
NL_SET_ERR_MSG(extack, "Cannot create TC IDR");
goto release_tun_meta;
@@ -385,8 +530,8 @@
spin_lock_bh(&t->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(t->params, params_new,
- lockdep_is_held(&t->tcf_lock));
+ params_new = rcu_replace_pointer(t->params, params_new,
+ lockdep_is_held(&t->tcf_lock));
spin_unlock_bh(&t->tcf_lock);
tunnel_key_release_params(params_new);
if (goto_ch)
@@ -451,6 +596,56 @@
return 0;
}
+static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb,
+ const struct ip_tunnel_info *info)
+{
+ struct vxlan_metadata *md = (struct vxlan_metadata *)(info + 1);
+ struct nlattr *start;
+
+ start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_VXLAN);
+ if (!start)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, md->gbp)) {
+ nla_nest_cancel(skb, start);
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(skb, start);
+ return 0;
+}
+
+static int tunnel_key_erspan_opts_dump(struct sk_buff *skb,
+ const struct ip_tunnel_info *info)
+{
+ struct erspan_metadata *md = (struct erspan_metadata *)(info + 1);
+ struct nlattr *start;
+
+ start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN);
+ if (!start)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER, md->version))
+ goto err;
+
+ if (md->version == 1 &&
+ nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX, md->u.index))
+ goto err;
+
+ if (md->version == 2 &&
+ (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR,
+ md->u.md2.dir) ||
+ nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID,
+ get_hwid(&md->u.md2))))
+ goto err;
+
+ nla_nest_end(skb, start);
+ return 0;
+err:
+ nla_nest_cancel(skb, start);
+ return -EMSGSIZE;
+}
+
static int tunnel_key_opts_dump(struct sk_buff *skb,
const struct ip_tunnel_info *info)
{
@@ -468,6 +663,14 @@
err = tunnel_key_geneve_opts_dump(skb, info);
if (err)
goto err_out;
+ } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
+ err = tunnel_key_vxlan_opts_dump(skb, info);
+ if (err)
+ goto err_out;
+ } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) {
+ err = tunnel_key_erspan_opts_dump(skb, info);
+ if (err)
+ goto err_out;
} else {
err_out:
nla_nest_cancel(skb, start);
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 7dc76c6..a108469 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -29,7 +29,7 @@
u16 tci;
tcf_lastuse_update(&v->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(v->common.cpu_bstats), skb);
+ tcf_action_update_bstats(&v->common, skb);
/* Ensure 'data' points at mac_header prior calling vlan manipulating
* functions.
@@ -77,6 +77,16 @@
/* put updated tci as hwaccel tag */
__vlan_hwaccel_put_tag(skb, p->tcfv_push_proto, tci);
break;
+ case TCA_VLAN_ACT_POP_ETH:
+ err = skb_eth_pop(skb);
+ if (err)
+ goto drop;
+ break;
+ case TCA_VLAN_ACT_PUSH_ETH:
+ err = skb_eth_push(skb, p->tcfv_push_dst, p->tcfv_push_src);
+ if (err)
+ goto drop;
+ break;
default:
BUG();
}
@@ -88,21 +98,25 @@
return action;
drop:
- qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats));
+ tcf_action_inc_drop_qstats(&v->common);
return TC_ACT_SHOT;
}
static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
+ [TCA_VLAN_UNSPEC] = { .strict_start_type = TCA_VLAN_PUSH_ETH_DST },
[TCA_VLAN_PARMS] = { .len = sizeof(struct tc_vlan) },
[TCA_VLAN_PUSH_VLAN_ID] = { .type = NLA_U16 },
[TCA_VLAN_PUSH_VLAN_PROTOCOL] = { .type = NLA_U16 },
[TCA_VLAN_PUSH_VLAN_PRIORITY] = { .type = NLA_U8 },
+ [TCA_VLAN_PUSH_ETH_DST] = NLA_POLICY_ETH_ADDR,
+ [TCA_VLAN_PUSH_ETH_SRC] = NLA_POLICY_ETH_ADDR,
};
static int tcf_vlan_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, vlan_net_id);
struct nlattr *tb[TCA_VLAN_MAX + 1];
@@ -180,6 +194,17 @@
if (push_prio_exists)
push_prio = nla_get_u8(tb[TCA_VLAN_PUSH_VLAN_PRIORITY]);
break;
+ case TCA_VLAN_ACT_POP_ETH:
+ break;
+ case TCA_VLAN_ACT_PUSH_ETH:
+ if (!tb[TCA_VLAN_PUSH_ETH_DST] || !tb[TCA_VLAN_PUSH_ETH_SRC]) {
+ if (exists)
+ tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
+ return -EINVAL;
+ }
+ break;
default:
if (exists)
tcf_idr_release(*a, bind);
@@ -190,8 +215,8 @@
action = parm->v_action;
if (!exists) {
- ret = tcf_idr_create(tn, index, est, a,
- &act_vlan_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_vlan_ops, bind, flags);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -221,9 +246,16 @@
p->tcfv_push_prio_exists = push_prio_exists || action == TCA_VLAN_ACT_PUSH;
p->tcfv_push_proto = push_proto;
+ if (action == TCA_VLAN_ACT_PUSH_ETH) {
+ nla_memcpy(&p->tcfv_push_dst, tb[TCA_VLAN_PUSH_ETH_DST],
+ ETH_ALEN);
+ nla_memcpy(&p->tcfv_push_src, tb[TCA_VLAN_PUSH_ETH_SRC],
+ ETH_ALEN);
+ }
+
spin_lock_bh(&v->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
+ p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
spin_unlock_bh(&v->tcf_lock);
if (goto_ch)
@@ -279,6 +311,15 @@
p->tcfv_push_prio))))
goto nla_put_failure;
+ if (p->tcfv_action == TCA_VLAN_ACT_PUSH_ETH) {
+ if (nla_put(skb, TCA_VLAN_PUSH_ETH_DST, ETH_ALEN,
+ p->tcfv_push_dst))
+ goto nla_put_failure;
+ if (nla_put(skb, TCA_VLAN_PUSH_ETH_SRC, ETH_ALEN,
+ p->tcfv_push_src))
+ goto nla_put_failure;
+ }
+
tcf_tm_dump(&t, &v->tcf_tm);
if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD))
goto nla_put_failure;
@@ -302,16 +343,13 @@
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u32 packets,
- u64 lastuse, bool hw)
+static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u64 packets,
+ u64 drops, u64 lastuse, bool hw)
{
struct tcf_vlan *v = to_vlan(a);
struct tcf_t *tm = &v->tcf_tm;
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
- if (hw)
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
- bytes, packets);
+ tcf_action_update_stats(a, bytes, packets, drops, hw);
tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 7f20fd3..9a789a0 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -20,8 +20,8 @@
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/idr.h>
-#include <linux/rhashtable.h>
#include <linux/jhash.h>
+#include <linux/rculist.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
@@ -38,6 +38,7 @@
#include <net/tc_act/tc_skbedit.h>
#include <net/tc_act/tc_ct.h>
#include <net/tc_act/tc_mpls.h>
+#include <net/tc_act/tc_gate.h>
#include <net/flow_offload.h>
extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
@@ -354,7 +355,7 @@
chain = kzalloc(sizeof(*chain), GFP_KERNEL);
if (!chain)
return NULL;
- list_add_tail(&chain->list, &block->chain_list);
+ list_add_tail_rcu(&chain->list, &block->chain_list);
mutex_init(&chain->filter_chain_lock);
chain->block = block;
chain->index = chain_index;
@@ -394,7 +395,7 @@
ASSERT_BLOCK_LOCKED(block);
- list_del(&chain->list);
+ list_del_rcu(&chain->list);
if (!chain->index)
block->chain0.chain = NULL;
@@ -453,6 +454,20 @@
return NULL;
}
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+static struct tcf_chain *tcf_chain_lookup_rcu(const struct tcf_block *block,
+ u32 chain_index)
+{
+ struct tcf_chain *chain;
+
+ list_for_each_entry_rcu(chain, &block->chain_list, list) {
+ if (chain->index == chain_index)
+ return chain;
+ }
+ return NULL;
+}
+#endif
+
static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb,
u32 seq, u16 flags, int event, bool unicast);
@@ -605,96 +620,46 @@
static int tcf_block_setup(struct tcf_block *block,
struct flow_block_offload *bo);
-static void tc_indr_block_cmd(struct net_device *dev, struct tcf_block *block,
- flow_indr_block_bind_cb_t *cb, void *cb_priv,
- enum flow_block_command command, bool ingress)
+static void tcf_block_offload_init(struct flow_block_offload *bo,
+ struct net_device *dev, struct Qdisc *sch,
+ enum flow_block_command command,
+ enum flow_block_binder_type binder_type,
+ struct flow_block *flow_block,
+ bool shared, struct netlink_ext_ack *extack)
{
- struct flow_block_offload bo = {
- .command = command,
- .binder_type = ingress ?
- FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS :
- FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS,
- .net = dev_net(dev),
- .block_shared = tcf_block_non_null_shared(block),
- };
- INIT_LIST_HEAD(&bo.cb_list);
+ bo->net = dev_net(dev);
+ bo->command = command;
+ bo->binder_type = binder_type;
+ bo->block = flow_block;
+ bo->block_shared = shared;
+ bo->extack = extack;
+ bo->sch = sch;
+ bo->cb_list_head = &flow_block->cb_list;
+ INIT_LIST_HEAD(&bo->cb_list);
+}
- if (!block)
- return;
+static void tcf_block_unbind(struct tcf_block *block,
+ struct flow_block_offload *bo);
- bo.block = &block->flow_block;
+static void tc_block_indr_cleanup(struct flow_block_cb *block_cb)
+{
+ struct tcf_block *block = block_cb->indr.data;
+ struct net_device *dev = block_cb->indr.dev;
+ struct Qdisc *sch = block_cb->indr.sch;
+ struct netlink_ext_ack extack = {};
+ struct flow_block_offload bo = {};
+ tcf_block_offload_init(&bo, dev, sch, FLOW_BLOCK_UNBIND,
+ block_cb->indr.binder_type,
+ &block->flow_block, tcf_block_shared(block),
+ &extack);
+ rtnl_lock();
down_write(&block->cb_lock);
- cb(dev, cb_priv, TC_SETUP_BLOCK, &bo);
-
- tcf_block_setup(block, &bo);
+ list_del(&block_cb->driver_list);
+ list_move(&block_cb->list, &bo.cb_list);
+ tcf_block_unbind(block, &bo);
up_write(&block->cb_lock);
-}
-
-static struct tcf_block *tc_dev_block(struct net_device *dev, bool ingress)
-{
- const struct Qdisc_class_ops *cops;
- const struct Qdisc_ops *ops;
- struct Qdisc *qdisc;
-
- if (!dev_ingress_queue(dev))
- return NULL;
-
- qdisc = dev_ingress_queue(dev)->qdisc_sleeping;
- if (!qdisc)
- return NULL;
-
- ops = qdisc->ops;
- if (!ops)
- return NULL;
-
- if (!ingress && !strcmp("ingress", ops->id))
- return NULL;
-
- cops = ops->cl_ops;
- if (!cops)
- return NULL;
-
- if (!cops->tcf_block)
- return NULL;
-
- return cops->tcf_block(qdisc,
- ingress ? TC_H_MIN_INGRESS : TC_H_MIN_EGRESS,
- NULL);
-}
-
-static void tc_indr_block_get_and_cmd(struct net_device *dev,
- flow_indr_block_bind_cb_t *cb,
- void *cb_priv,
- enum flow_block_command command)
-{
- struct tcf_block *block;
-
- block = tc_dev_block(dev, true);
- tc_indr_block_cmd(dev, block, cb, cb_priv, command, true);
-
- block = tc_dev_block(dev, false);
- tc_indr_block_cmd(dev, block, cb, cb_priv, command, false);
-}
-
-static void tc_indr_block_call(struct tcf_block *block,
- struct net_device *dev,
- struct tcf_block_ext_info *ei,
- enum flow_block_command command,
- struct netlink_ext_ack *extack)
-{
- struct flow_block_offload bo = {
- .command = command,
- .binder_type = ei->binder_type,
- .net = dev_net(dev),
- .block = &block->flow_block,
- .block_shared = tcf_block_shared(block),
- .extack = extack,
- };
- INIT_LIST_HEAD(&bo.cb_list);
-
- flow_indr_block_call(dev, &bo, command);
- tcf_block_setup(block, &bo);
+ rtnl_unlock();
}
static bool tcf_block_offload_in_use(struct tcf_block *block)
@@ -703,27 +668,35 @@
}
static int tcf_block_offload_cmd(struct tcf_block *block,
- struct net_device *dev,
+ struct net_device *dev, struct Qdisc *sch,
struct tcf_block_ext_info *ei,
enum flow_block_command command,
struct netlink_ext_ack *extack)
{
struct flow_block_offload bo = {};
- int err;
- bo.net = dev_net(dev);
- bo.command = command;
- bo.binder_type = ei->binder_type;
- bo.block = &block->flow_block;
- bo.block_shared = tcf_block_shared(block);
- bo.extack = extack;
- INIT_LIST_HEAD(&bo.cb_list);
+ tcf_block_offload_init(&bo, dev, sch, command, ei->binder_type,
+ &block->flow_block, tcf_block_shared(block),
+ extack);
- err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
- if (err < 0)
- return err;
+ if (dev->netdev_ops->ndo_setup_tc) {
+ int err;
- return tcf_block_setup(block, &bo);
+ err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
+ if (err < 0) {
+ if (err != -EOPNOTSUPP)
+ NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed");
+ return err;
+ }
+
+ return tcf_block_setup(block, &bo);
+ }
+
+ flow_indr_dev_setup_offload(dev, sch, TC_SETUP_BLOCK, block, &bo,
+ tc_block_indr_cleanup);
+ tcf_block_setup(block, &bo);
+
+ return -EOPNOTSUPP;
}
static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
@@ -734,36 +707,33 @@
int err;
down_write(&block->cb_lock);
- if (!dev->netdev_ops->ndo_setup_tc)
- goto no_offload_dev_inc;
/* If tc offload feature is disabled and the block we try to bind
* to already has some offloaded filters, forbid to bind.
*/
- if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) {
+ if (dev->netdev_ops->ndo_setup_tc &&
+ !tc_can_offload(dev) &&
+ tcf_block_offload_in_use(block)) {
NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled");
err = -EOPNOTSUPP;
goto err_unlock;
}
- err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_BIND, extack);
+ err = tcf_block_offload_cmd(block, dev, q, ei, FLOW_BLOCK_BIND, extack);
if (err == -EOPNOTSUPP)
goto no_offload_dev_inc;
if (err)
goto err_unlock;
- tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack);
up_write(&block->cb_lock);
return 0;
no_offload_dev_inc:
- if (tcf_block_offload_in_use(block)) {
- err = -EOPNOTSUPP;
+ if (tcf_block_offload_in_use(block))
goto err_unlock;
- }
+
err = 0;
block->nooffloaddevcnt++;
- tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack);
err_unlock:
up_write(&block->cb_lock);
return err;
@@ -776,11 +746,7 @@
int err;
down_write(&block->cb_lock);
- tc_indr_block_call(block, dev, ei, FLOW_BLOCK_UNBIND, NULL);
-
- if (!dev->netdev_ops->ndo_setup_tc)
- goto no_offload_dev_dec;
- err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_UNBIND, NULL);
+ err = tcf_block_offload_cmd(block, dev, q, ei, FLOW_BLOCK_UNBIND, NULL);
if (err == -EOPNOTSUPP)
goto no_offload_dev_dec;
up_write(&block->cb_lock);
@@ -1079,7 +1045,7 @@
/* Find qdisc */
if (!*parent) {
- *q = dev->qdisc;
+ *q = rcu_dereference(dev->qdisc);
*parent = (*q)->handle;
} else {
*q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent));
@@ -1559,12 +1525,15 @@
* to this qdisc, (optionally) tests for protocol and asks
* specific classifiers.
*/
-int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res, bool compat_mode)
+static inline int __tcf_classify(struct sk_buff *skb,
+ const struct tcf_proto *tp,
+ const struct tcf_proto *orig_tp,
+ struct tcf_result *res,
+ bool compat_mode,
+ u32 *last_executed_chain)
{
#ifdef CONFIG_NET_CLS_ACT
- const int max_reclassify_loop = 4;
- const struct tcf_proto *orig_tp = tp;
+ const int max_reclassify_loop = 16;
const struct tcf_proto *first_tp;
int limit = 0;
@@ -1582,21 +1551,11 @@
#ifdef CONFIG_NET_CLS_ACT
if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) {
first_tp = orig_tp;
+ *last_executed_chain = first_tp->chain->index;
goto reset;
} else if (unlikely(TC_ACT_EXT_CMP(err, TC_ACT_GOTO_CHAIN))) {
first_tp = res->goto_tp;
-
-#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
- {
- struct tc_skb_ext *ext;
-
- ext = skb_ext_add(skb, TC_SKB_EXT);
- if (WARN_ON_ONCE(!ext))
- return TC_ACT_SHOT;
-
- ext->chain = err & TC_ACT_EXT_VAL_MASK;
- }
-#endif
+ *last_executed_chain = err & TC_ACT_EXT_VAL_MASK;
goto reset;
}
#endif
@@ -1619,8 +1578,66 @@
goto reclassify;
#endif
}
+
+int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res, bool compat_mode)
+{
+ u32 last_executed_chain = 0;
+
+ return __tcf_classify(skb, tp, tp, res, compat_mode,
+ &last_executed_chain);
+}
EXPORT_SYMBOL(tcf_classify);
+int tcf_classify_ingress(struct sk_buff *skb,
+ const struct tcf_block *ingress_block,
+ const struct tcf_proto *tp,
+ struct tcf_result *res, bool compat_mode)
+{
+#if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ u32 last_executed_chain = 0;
+
+ return __tcf_classify(skb, tp, tp, res, compat_mode,
+ &last_executed_chain);
+#else
+ u32 last_executed_chain = tp ? tp->chain->index : 0;
+ const struct tcf_proto *orig_tp = tp;
+ struct tc_skb_ext *ext;
+ int ret;
+
+ ext = skb_ext_find(skb, TC_SKB_EXT);
+
+ if (ext && ext->chain) {
+ struct tcf_chain *fchain;
+
+ fchain = tcf_chain_lookup_rcu(ingress_block, ext->chain);
+ if (!fchain)
+ return TC_ACT_SHOT;
+
+ /* Consume, so cloned/redirect skbs won't inherit ext */
+ skb_ext_del(skb, TC_SKB_EXT);
+
+ tp = rcu_dereference_bh(fchain->filter_chain);
+ last_executed_chain = fchain->index;
+ }
+
+ ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode,
+ &last_executed_chain);
+
+ /* If we missed on some chain */
+ if (ret == TC_ACT_UNSPEC && last_executed_chain) {
+ ext = tc_skb_ext_alloc(skb);
+ if (WARN_ON_ONCE(!ext))
+ return TC_ACT_SHOT;
+ ext->chain = last_executed_chain;
+ ext->mru = qdisc_skb_cb(skb)->mru;
+ }
+
+ return ret;
+#endif
+}
+EXPORT_SYMBOL(tcf_classify_ingress);
+
struct tcf_chain_info {
struct tcf_proto __rcu **pprev;
struct tcf_proto __rcu *next;
@@ -1782,7 +1799,7 @@
struct tcf_proto *tp, struct tcf_block *block,
struct Qdisc *q, u32 parent, void *fh,
u32 portid, u32 seq, u16 flags, int event,
- bool rtnl_held)
+ bool terse_dump, bool rtnl_held)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
@@ -1809,6 +1826,14 @@
goto nla_put_failure;
if (!fh) {
tcm->tcm_handle = 0;
+ } else if (terse_dump) {
+ if (tp->ops->terse_dump) {
+ if (tp->ops->terse_dump(net, tp, fh, skb, tcm,
+ rtnl_held) < 0)
+ goto nla_put_failure;
+ } else {
+ goto cls_op_not_supp;
+ }
} else {
if (tp->ops->dump &&
tp->ops->dump(net, tp, fh, skb, tcm, rtnl_held) < 0)
@@ -1819,6 +1844,7 @@
out_nlmsg_trim:
nla_put_failure:
+cls_op_not_supp:
nlmsg_trim(skb, b);
return -1;
}
@@ -1839,7 +1865,7 @@
if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
n->nlmsg_seq, n->nlmsg_flags, event,
- rtnl_held) <= 0) {
+ false, rtnl_held) <= 0) {
kfree_skb(skb);
return -EINVAL;
}
@@ -1871,7 +1897,7 @@
if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER,
- rtnl_held) <= 0) {
+ false, rtnl_held) <= 0) {
NL_SET_ERR_MSG(extack, "Failed to build del event notification");
kfree_skb(skb);
return -EINVAL;
@@ -1928,9 +1954,9 @@
bool prio_allocate;
u32 parent;
u32 chain_index;
- struct Qdisc *q = NULL;
+ struct Qdisc *q;
struct tcf_chain_info chain_info;
- struct tcf_chain *chain = NULL;
+ struct tcf_chain *chain;
struct tcf_block *block;
struct tcf_proto *tp;
unsigned long cl;
@@ -1958,6 +1984,8 @@
tp = NULL;
cl = 0;
block = NULL;
+ q = NULL;
+ chain = NULL;
if (prio == 0) {
/* If no priority is provided by the user,
@@ -2432,6 +2460,7 @@
struct tcf_block *block;
struct Qdisc *q;
u32 parent;
+ bool terse_dump;
};
static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
@@ -2442,12 +2471,12 @@
return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent,
n, NETLINK_CB(a->cb->skb).portid,
a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWTFILTER, true);
+ RTM_NEWTFILTER, a->terse_dump, true);
}
static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
struct sk_buff *skb, struct netlink_callback *cb,
- long index_start, long *p_index)
+ long index_start, long *p_index, bool terse)
{
struct net *net = sock_net(skb->sk);
struct tcf_block *block = chain->block;
@@ -2476,7 +2505,7 @@
if (tcf_fill_node(net, skb, tp, block, q, parent, NULL,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWTFILTER, true) <= 0)
+ RTM_NEWTFILTER, false, true) <= 0)
goto errout;
cb->args[1] = 1;
}
@@ -2492,6 +2521,7 @@
arg.w.skip = cb->args[1] - 1;
arg.w.count = 0;
arg.w.cookie = cb->args[2];
+ arg.terse_dump = terse;
tp->ops->walk(tp, &arg.w, true);
cb->args[2] = arg.w.cookie;
cb->args[1] = arg.w.count + 1;
@@ -2505,6 +2535,10 @@
return false;
}
+static const struct nla_policy tcf_tfilter_dump_policy[TCA_MAX + 1] = {
+ [TCA_DUMP_FLAGS] = NLA_POLICY_BITFIELD32(TCA_DUMP_FLAGS_TERSE),
+};
+
/* called with RTNL */
static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
{
@@ -2514,6 +2548,7 @@
struct Qdisc *q = NULL;
struct tcf_block *block;
struct tcmsg *tcm = nlmsg_data(cb->nlh);
+ bool terse_dump = false;
long index_start;
long index;
u32 parent;
@@ -2523,10 +2558,17 @@
return skb->len;
err = nlmsg_parse_deprecated(cb->nlh, sizeof(*tcm), tca, TCA_MAX,
- NULL, cb->extack);
+ tcf_tfilter_dump_policy, cb->extack);
if (err)
return err;
+ if (tca[TCA_DUMP_FLAGS]) {
+ struct nla_bitfield32 flags =
+ nla_get_bitfield32(tca[TCA_DUMP_FLAGS]);
+
+ terse_dump = flags.value & TCA_DUMP_FLAGS_TERSE;
+ }
+
if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
block = tcf_block_refcnt_get(net, tcm->tcm_block_index);
if (!block)
@@ -2549,7 +2591,7 @@
parent = tcm->tcm_parent;
if (!parent)
- q = dev->qdisc;
+ q = rtnl_dereference(dev->qdisc);
else
q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
if (!q)
@@ -2584,7 +2626,7 @@
nla_get_u32(tca[TCA_CHAIN]) != chain->index)
continue;
if (!tcf_chain_dump(chain, q, parent, skb, cb,
- index_start, &index)) {
+ index_start, &index, terse_dump)) {
tcf_chain_put(chain);
err = -EMSGSIZE;
break;
@@ -2764,8 +2806,8 @@
struct tcmsg *t;
u32 parent;
u32 chain_index;
- struct Qdisc *q = NULL;
- struct tcf_chain *chain = NULL;
+ struct Qdisc *q;
+ struct tcf_chain *chain;
struct tcf_block *block;
unsigned long cl;
int err;
@@ -2775,6 +2817,7 @@
return -EPERM;
replay:
+ q = NULL;
err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX,
rtm_tca_policy, extack);
if (err < 0)
@@ -2934,7 +2977,7 @@
parent = tcm->tcm_parent;
if (!parent) {
- q = dev->qdisc;
+ q = rtnl_dereference(dev->qdisc);
parent = q->handle;
} else {
q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
@@ -3012,14 +3055,21 @@
{
#ifdef CONFIG_NET_CLS_ACT
{
+ int init_res[TCA_ACT_MAX_PRIO] = {};
struct tc_action *act;
size_t attr_size = 0;
if (exts->police && tb[exts->police]) {
+ struct tc_action_ops *a_o;
+
+ a_o = tc_action_load_ops("police", tb[exts->police], rtnl_held, extack);
+ if (IS_ERR(a_o))
+ return PTR_ERR(a_o);
act = tcf_action_init_1(net, tp, tb[exts->police],
rate_tlv, "police", ovr,
- TCA_ACT_BIND, rtnl_held,
- extack);
+ TCA_ACT_BIND, a_o, init_res,
+ rtnl_held, extack);
+ module_put(a_o->owner);
if (IS_ERR(act))
return PTR_ERR(act);
@@ -3032,8 +3082,8 @@
err = tcf_action_init(net, tp, tb[exts->action],
rate_tlv, NULL, ovr, TCA_ACT_BIND,
- exts->actions, &attr_size,
- rtnl_held, extack);
+ exts->actions, init_res,
+ &attr_size, rtnl_held, extack);
if (err < 0)
return err;
exts->nr_actions = err;
@@ -3088,7 +3138,8 @@
if (nest == NULL)
goto nla_put_failure;
- if (tcf_action_dump(skb, exts->actions, 0, 0) < 0)
+ if (tcf_action_dump(skb, exts->actions, 0, 0, false)
+ < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
} else if (exts->police) {
@@ -3112,6 +3163,31 @@
}
EXPORT_SYMBOL(tcf_exts_dump);
+int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ struct nlattr *nest;
+
+ if (!exts->action || !tcf_exts_has_actions(exts))
+ return 0;
+
+ nest = nla_nest_start_noflag(skb, exts->action);
+ if (!nest)
+ goto nla_put_failure;
+
+ if (tcf_action_dump(skb, exts->actions, 0, 0, true) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+#else
+ return 0;
+#endif
+}
+EXPORT_SYMBOL(tcf_exts_terse_dump);
int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
{
@@ -3383,14 +3459,40 @@
}
EXPORT_SYMBOL(tc_setup_cb_reoffload);
+static int tcf_act_get_cookie(struct flow_action_entry *entry,
+ const struct tc_action *act)
+{
+ struct tc_cookie *cookie;
+ int err = 0;
+
+ rcu_read_lock();
+ cookie = rcu_dereference(act->act_cookie);
+ if (cookie) {
+ entry->cookie = flow_action_cookie_create(cookie->data,
+ cookie->len,
+ GFP_ATOMIC);
+ if (!entry->cookie)
+ err = -ENOMEM;
+ }
+ rcu_read_unlock();
+ return err;
+}
+
+static void tcf_act_put_cookie(struct flow_action_entry *entry)
+{
+ flow_action_cookie_destroy(entry->cookie);
+}
+
void tc_cleanup_flow_action(struct flow_action *flow_action)
{
struct flow_action_entry *entry;
int i;
- flow_action_for_each(i, entry, flow_action)
+ flow_action_for_each(i, entry, flow_action) {
+ tcf_act_put_cookie(entry);
if (entry->destructor)
entry->destructor(entry->destructor_priv);
+ }
}
EXPORT_SYMBOL(tc_cleanup_flow_action);
@@ -3433,23 +3535,62 @@
#endif
}
-int tc_setup_flow_action(struct flow_action *flow_action,
- const struct tcf_exts *exts, bool rtnl_held)
+static void tcf_gate_entry_destructor(void *priv)
{
- const struct tc_action *act;
+ struct action_gate_entry *oe = priv;
+
+ kfree(oe);
+}
+
+static int tcf_gate_get_entries(struct flow_action_entry *entry,
+ const struct tc_action *act)
+{
+ entry->gate.entries = tcf_gate_get_list(act);
+
+ if (!entry->gate.entries)
+ return -EINVAL;
+
+ entry->destructor = tcf_gate_entry_destructor;
+ entry->destructor_priv = entry->gate.entries;
+
+ return 0;
+}
+
+static enum flow_action_hw_stats tc_act_hw_stats(u8 hw_stats)
+{
+ if (WARN_ON_ONCE(hw_stats > TCA_ACT_HW_STATS_ANY))
+ return FLOW_ACTION_HW_STATS_DONT_CARE;
+ else if (!hw_stats)
+ return FLOW_ACTION_HW_STATS_DISABLED;
+
+ return hw_stats;
+}
+
+int tc_setup_flow_action(struct flow_action *flow_action,
+ const struct tcf_exts *exts)
+{
+ struct tc_action *act;
int i, j, k, err = 0;
+ BUILD_BUG_ON(TCA_ACT_HW_STATS_ANY != FLOW_ACTION_HW_STATS_ANY);
+ BUILD_BUG_ON(TCA_ACT_HW_STATS_IMMEDIATE != FLOW_ACTION_HW_STATS_IMMEDIATE);
+ BUILD_BUG_ON(TCA_ACT_HW_STATS_DELAYED != FLOW_ACTION_HW_STATS_DELAYED);
+
if (!exts)
return 0;
- if (!rtnl_held)
- rtnl_lock();
-
j = 0;
tcf_exts_for_each_action(i, act, exts) {
struct flow_action_entry *entry;
entry = &flow_action->entries[j];
+ spin_lock_bh(&act->tcfa_lock);
+ err = tcf_act_get_cookie(entry, act);
+ if (err)
+ goto err_out_locked;
+
+ entry->hw_stats = tc_act_hw_stats(act->hw_stats);
+
if (is_tcf_gact_ok(act)) {
entry->id = FLOW_ACTION_ACCEPT;
} else if (is_tcf_gact_shot(act)) {
@@ -3490,13 +3631,13 @@
break;
default:
err = -EOPNOTSUPP;
- goto err_out;
+ goto err_out_locked;
}
} else if (is_tcf_tunnel_set(act)) {
entry->id = FLOW_ACTION_TUNNEL_ENCAP;
err = tcf_tunnel_encap_get_tunnel(entry, act);
if (err)
- goto err_out;
+ goto err_out_locked;
} else if (is_tcf_tunnel_release(act)) {
entry->id = FLOW_ACTION_TUNNEL_DECAP;
} else if (is_tcf_pedit(act)) {
@@ -3510,12 +3651,13 @@
break;
default:
err = -EOPNOTSUPP;
- goto err_out;
+ goto err_out_locked;
}
entry->mangle.htype = tcf_pedit_htype(act, k);
entry->mangle.mask = tcf_pedit_mask(act, k);
entry->mangle.val = tcf_pedit_val(act, k);
entry->mangle.offset = tcf_pedit_offset(act, k);
+ entry->hw_stats = tc_act_hw_stats(act->hw_stats);
entry = &flow_action->entries[++j];
}
} else if (is_tcf_csum(act)) {
@@ -3532,13 +3674,16 @@
tcf_sample_get_group(entry, act);
} else if (is_tcf_police(act)) {
entry->id = FLOW_ACTION_POLICE;
- entry->police.burst = tcf_police_tcfp_burst(act);
+ entry->police.burst = tcf_police_burst(act);
entry->police.rate_bytes_ps =
tcf_police_rate_bytes_ps(act);
+ entry->police.mtu = tcf_police_tcfp_mtu(act);
+ entry->police.index = act->tcfa_index;
} else if (is_tcf_ct(act)) {
entry->id = FLOW_ACTION_CT;
entry->ct.action = tcf_ct_action(act);
entry->ct.zone = tcf_ct_zone(act);
+ entry->ct.flow_table = tcf_ct_ft(act);
} else if (is_tcf_mpls(act)) {
switch (tcf_mpls_action(act)) {
case TCA_MPLS_ACT_PUSH:
@@ -3561,28 +3706,44 @@
entry->mpls_mangle.ttl = tcf_mpls_ttl(act);
break;
default:
- goto err_out;
+ err = -EOPNOTSUPP;
+ goto err_out_locked;
}
} else if (is_tcf_skbedit_ptype(act)) {
entry->id = FLOW_ACTION_PTYPE;
entry->ptype = tcf_skbedit_ptype(act);
+ } else if (is_tcf_skbedit_priority(act)) {
+ entry->id = FLOW_ACTION_PRIORITY;
+ entry->priority = tcf_skbedit_priority(act);
+ } else if (is_tcf_gate(act)) {
+ entry->id = FLOW_ACTION_GATE;
+ entry->gate.index = tcf_gate_index(act);
+ entry->gate.prio = tcf_gate_prio(act);
+ entry->gate.basetime = tcf_gate_basetime(act);
+ entry->gate.cycletime = tcf_gate_cycletime(act);
+ entry->gate.cycletimeext = tcf_gate_cycletimeext(act);
+ entry->gate.num_entries = tcf_gate_num_entries(act);
+ err = tcf_gate_get_entries(entry, act);
+ if (err)
+ goto err_out_locked;
} else {
err = -EOPNOTSUPP;
- goto err_out;
+ goto err_out_locked;
}
+ spin_unlock_bh(&act->tcfa_lock);
if (!is_tcf_pedit(act))
j++;
}
err_out:
- if (!rtnl_held)
- rtnl_unlock();
-
if (err)
tc_cleanup_flow_action(flow_action);
return err;
+err_out_locked:
+ spin_unlock_bh(&act->tcfa_lock);
+ goto err_out;
}
EXPORT_SYMBOL(tc_setup_flow_action);
@@ -3602,6 +3763,119 @@
}
EXPORT_SYMBOL(tcf_exts_num_actions);
+#ifdef CONFIG_NET_CLS_ACT
+static int tcf_qevent_parse_block_index(struct nlattr *block_index_attr,
+ u32 *p_block_index,
+ struct netlink_ext_ack *extack)
+{
+ *p_block_index = nla_get_u32(block_index_attr);
+ if (!*p_block_index) {
+ NL_SET_ERR_MSG(extack, "Block number may not be zero");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch,
+ enum flow_block_binder_type binder_type,
+ struct nlattr *block_index_attr,
+ struct netlink_ext_ack *extack)
+{
+ u32 block_index;
+ int err;
+
+ if (!block_index_attr)
+ return 0;
+
+ err = tcf_qevent_parse_block_index(block_index_attr, &block_index, extack);
+ if (err)
+ return err;
+
+ if (!block_index)
+ return 0;
+
+ qe->info.binder_type = binder_type;
+ qe->info.chain_head_change = tcf_chain_head_change_dflt;
+ qe->info.chain_head_change_priv = &qe->filter_chain;
+ qe->info.block_index = block_index;
+
+ return tcf_block_get_ext(&qe->block, sch, &qe->info, extack);
+}
+EXPORT_SYMBOL(tcf_qevent_init);
+
+void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch)
+{
+ if (qe->info.block_index)
+ tcf_block_put_ext(qe->block, sch, &qe->info);
+}
+EXPORT_SYMBOL(tcf_qevent_destroy);
+
+int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr,
+ struct netlink_ext_ack *extack)
+{
+ u32 block_index;
+ int err;
+
+ if (!block_index_attr)
+ return 0;
+
+ err = tcf_qevent_parse_block_index(block_index_attr, &block_index, extack);
+ if (err)
+ return err;
+
+ /* Bounce newly-configured block or change in block. */
+ if (block_index != qe->info.block_index) {
+ NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(tcf_qevent_validate_change);
+
+struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb,
+ struct sk_buff **to_free, int *ret)
+{
+ struct tcf_result cl_res;
+ struct tcf_proto *fl;
+
+ if (!qe->info.block_index)
+ return skb;
+
+ fl = rcu_dereference_bh(qe->filter_chain);
+
+ switch (tcf_classify(skb, fl, &cl_res, false)) {
+ case TC_ACT_SHOT:
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ *ret = __NET_XMIT_BYPASS;
+ return NULL;
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ __qdisc_drop(skb, to_free);
+ *ret = __NET_XMIT_STOLEN;
+ return NULL;
+ case TC_ACT_REDIRECT:
+ skb_do_redirect(skb);
+ *ret = __NET_XMIT_STOLEN;
+ return NULL;
+ }
+
+ return skb;
+}
+EXPORT_SYMBOL(tcf_qevent_handle);
+
+int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe)
+{
+ if (!qe->info.block_index)
+ return 0;
+ return nla_put_u32(skb, attr_name, qe->info.block_index);
+}
+EXPORT_SYMBOL(tcf_qevent_dump);
+#endif
+
static __net_init int tcf_net_init(struct net *net)
{
struct tcf_net *tn = net_generic(net, tcf_net_id);
@@ -3625,11 +3899,6 @@
.size = sizeof(struct tcf_net),
};
-static struct flow_indr_block_entry block_entry = {
- .cb = tc_indr_block_get_and_cmd,
- .list = LIST_HEAD_INIT(block_entry.list),
-};
-
static int __init tc_filter_init(void)
{
int err;
@@ -3642,8 +3911,6 @@
if (err)
goto err_register_pernet_subsys;
- flow_indr_add_block_cb(&block_entry);
-
rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL,
RTNL_FLAG_DOIT_UNLOCKED);
rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL,
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index ab53a93..87398af 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -225,7 +225,7 @@
static u32 flow_get_vlan_tag(const struct sk_buff *skb)
{
- u16 uninitialized_var(tag);
+ u16 tag;
if (vlan_get_tag(skb, &tag) < 0)
return 0;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index c5a0f2c..8ff6945 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -22,12 +22,19 @@
#include <net/ip.h>
#include <net/flow_dissector.h>
#include <net/geneve.h>
+#include <net/vxlan.h>
+#include <net/erspan.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <uapi/linux/netfilter/nf_conntrack_common.h>
+#define TCA_FLOWER_KEY_CT_FLAGS_MAX \
+ ((__TCA_FLOWER_KEY_CT_FLAGS_MAX - 1) << 1)
+#define TCA_FLOWER_KEY_CT_FLAGS_MASK \
+ (TCA_FLOWER_KEY_CT_FLAGS_MAX - 1)
+
struct fl_flow_key {
struct flow_dissector_key_meta meta;
struct flow_dissector_key_control control;
@@ -62,6 +69,7 @@
};
} tp_range;
struct flow_dissector_key_ct ct;
+ struct flow_dissector_key_hash hash;
} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
struct fl_flow_mask_range {
@@ -201,16 +209,16 @@
struct fl_flow_key *key,
struct fl_flow_key *mkey)
{
- __be16 min_mask, max_mask, min_val, max_val;
+ u16 min_mask, max_mask, min_val, max_val;
- min_mask = htons(filter->mask->key.tp_range.tp_min.dst);
- max_mask = htons(filter->mask->key.tp_range.tp_max.dst);
- min_val = htons(filter->key.tp_range.tp_min.dst);
- max_val = htons(filter->key.tp_range.tp_max.dst);
+ min_mask = ntohs(filter->mask->key.tp_range.tp_min.dst);
+ max_mask = ntohs(filter->mask->key.tp_range.tp_max.dst);
+ min_val = ntohs(filter->key.tp_range.tp_min.dst);
+ max_val = ntohs(filter->key.tp_range.tp_max.dst);
if (min_mask && max_mask) {
- if (htons(key->tp_range.tp.dst) < min_val ||
- htons(key->tp_range.tp.dst) > max_val)
+ if (ntohs(key->tp_range.tp.dst) < min_val ||
+ ntohs(key->tp_range.tp.dst) > max_val)
return false;
/* skb does not have min and max values */
@@ -224,16 +232,16 @@
struct fl_flow_key *key,
struct fl_flow_key *mkey)
{
- __be16 min_mask, max_mask, min_val, max_val;
+ u16 min_mask, max_mask, min_val, max_val;
- min_mask = htons(filter->mask->key.tp_range.tp_min.src);
- max_mask = htons(filter->mask->key.tp_range.tp_max.src);
- min_val = htons(filter->key.tp_range.tp_min.src);
- max_val = htons(filter->key.tp_range.tp_max.src);
+ min_mask = ntohs(filter->mask->key.tp_range.tp_min.src);
+ max_mask = ntohs(filter->mask->key.tp_range.tp_max.src);
+ min_val = ntohs(filter->key.tp_range.tp_min.src);
+ max_val = ntohs(filter->key.tp_range.tp_max.src);
if (min_mask && max_mask) {
- if (htons(key->tp_range.tp.src) < min_val ||
- htons(key->tp_range.tp.src) > max_val)
+ if (ntohs(key->tp_range.tp.src) < min_val ||
+ ntohs(key->tp_range.tp.src) > max_val)
return false;
/* skb does not have min and max values */
@@ -270,14 +278,16 @@
return NULL;
}
-static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
- struct fl_flow_key *mkey,
- struct fl_flow_key *key)
+static noinline_for_stack
+struct cls_fl_filter *fl_mask_lookup(struct fl_flow_mask *mask, struct fl_flow_key *key)
{
- if ((mask->flags & TCA_FLOWER_MASK_FLAGS_RANGE))
- return fl_lookup_range(mask, mkey, key);
+ struct fl_flow_key mkey;
- return __fl_lookup(mask, mkey);
+ fl_set_masked_key(&mkey, key, mask);
+ if ((mask->flags & TCA_FLOWER_MASK_FLAGS_RANGE))
+ return fl_lookup_range(mask, &mkey, key);
+
+ return __fl_lookup(mask, &mkey);
}
static u16 fl_ct_info_to_flower_map[] = {
@@ -297,7 +307,6 @@
struct tcf_result *res)
{
struct cls_fl_head *head = rcu_dereference_bh(tp->root);
- struct fl_flow_key skb_mkey;
struct fl_flow_key skb_key;
struct fl_flow_mask *mask;
struct cls_fl_filter *f;
@@ -315,11 +324,10 @@
skb_flow_dissect_ct(skb, &mask->dissector, &skb_key,
fl_ct_info_to_flower_map,
ARRAY_SIZE(fl_ct_info_to_flower_map));
+ skb_flow_dissect_hash(skb, &mask->dissector, &skb_key);
skb_flow_dissect(skb, &mask->dissector, &skb_key, 0);
- fl_set_masked_key(&skb_mkey, &skb_key, mask);
-
- f = fl_lookup(mask, &skb_mkey, &skb_key);
+ f = fl_mask_lookup(mask, &skb_key);
if (f && !tc_skip_sw(f->flags)) {
*res = f->res;
return tcf_exts_exec(skb, &f->exts, res);
@@ -448,8 +456,7 @@
cls_flower.rule->match.key = &f->mkey;
cls_flower.classid = f->res.classid;
- err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts,
- rtnl_held);
+ err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts);
if (err) {
kfree(cls_flower.rule);
if (skip_sw) {
@@ -491,7 +498,10 @@
tcf_exts_stats_update(&f->exts, cls_flower.stats.bytes,
cls_flower.stats.pkts,
- cls_flower.stats.lastused);
+ cls_flower.stats.drops,
+ cls_flower.stats.lastused,
+ cls_flower.stats.used_hw_stats,
+ cls_flower.stats.used_hw_stats_valid);
}
static void __fl_put(struct cls_fl_filter *f)
@@ -665,6 +675,7 @@
[TCA_FLOWER_KEY_MPLS_BOS] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_MPLS_TC] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_MPLS_LABEL] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_MPLS_OPTS] = { .type = NLA_NESTED },
[TCA_FLOWER_KEY_TCP_FLAGS] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_TCP_FLAGS_MASK] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_IP_TOS] = { .type = NLA_U8 },
@@ -680,8 +691,10 @@
[TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_ENC_OPTS] = { .type = NLA_NESTED },
[TCA_FLOWER_KEY_ENC_OPTS_MASK] = { .type = NLA_NESTED },
- [TCA_FLOWER_KEY_CT_STATE] = { .type = NLA_U16 },
- [TCA_FLOWER_KEY_CT_STATE_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CT_STATE] =
+ NLA_POLICY_MASK(NLA_U16, TCA_FLOWER_KEY_CT_FLAGS_MASK),
+ [TCA_FLOWER_KEY_CT_STATE_MASK] =
+ NLA_POLICY_MASK(NLA_U16, TCA_FLOWER_KEY_CT_FLAGS_MASK),
[TCA_FLOWER_KEY_CT_ZONE] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_CT_ZONE_MASK] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_CT_MARK] = { .type = NLA_U32 },
@@ -691,11 +704,18 @@
[TCA_FLOWER_KEY_CT_LABELS_MASK] = { .type = NLA_BINARY,
.len = 128 / BITS_PER_BYTE },
[TCA_FLOWER_FLAGS] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_HASH] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_HASH_MASK] = { .type = NLA_U32 },
+
};
static const struct nla_policy
enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPTS_UNSPEC] = {
+ .strict_start_type = TCA_FLOWER_KEY_ENC_OPTS_VXLAN },
[TCA_FLOWER_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED },
};
static const struct nla_policy
@@ -706,6 +726,28 @@
.len = 128 },
};
+static const struct nla_policy
+vxlan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+erspan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 },
+};
+
+static const struct nla_policy
+mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = {
+ [TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_MPLS_OPT_LSE_TC] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL] = { .type = NLA_U32 },
+};
+
static void fl_set_key_val(struct nlattr **tb,
void *val, int val_type,
void *mask, int mask_type, int len)
@@ -720,7 +762,8 @@
}
static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key,
- struct fl_flow_key *mask)
+ struct fl_flow_key *mask,
+ struct netlink_ext_ack *extack)
{
fl_set_key_val(tb, &key->tp_range.tp_min.dst,
TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_range.tp_min.dst,
@@ -735,48 +778,219 @@
TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_range.tp_max.src,
TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.src));
- if ((mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst &&
- htons(key->tp_range.tp_max.dst) <=
- htons(key->tp_range.tp_min.dst)) ||
- (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src &&
- htons(key->tp_range.tp_max.src) <=
- htons(key->tp_range.tp_min.src)))
+ if (mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst &&
+ ntohs(key->tp_range.tp_max.dst) <=
+ ntohs(key->tp_range.tp_min.dst)) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_PORT_DST_MIN],
+ "Invalid destination port range (min must be strictly smaller than max)");
return -EINVAL;
+ }
+ if (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src &&
+ ntohs(key->tp_range.tp_max.src) <=
+ ntohs(key->tp_range.tp_min.src)) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_PORT_SRC_MIN],
+ "Invalid source port range (min must be strictly smaller than max)");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int fl_set_key_mpls_lse(const struct nlattr *nla_lse,
+ struct flow_dissector_key_mpls *key_val,
+ struct flow_dissector_key_mpls *key_mask,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1];
+ struct flow_dissector_mpls_lse *lse_mask;
+ struct flow_dissector_mpls_lse *lse_val;
+ u8 lse_index;
+ u8 depth;
+ int err;
+
+ err = nla_parse_nested(tb, TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX, nla_lse,
+ mpls_stack_entry_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH]) {
+ NL_SET_ERR_MSG(extack, "Missing MPLS option \"depth\"");
+ return -EINVAL;
+ }
+
+ depth = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH]);
+
+ /* LSE depth starts at 1, for consistency with terminology used by
+ * RFC 3031 (section 3.9), where depth 0 refers to unlabeled packets.
+ */
+ if (depth < 1 || depth > FLOW_DIS_MPLS_MAX) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH],
+ "Invalid MPLS depth");
+ return -EINVAL;
+ }
+ lse_index = depth - 1;
+
+ dissector_set_mpls_lse(key_val, lse_index);
+ dissector_set_mpls_lse(key_mask, lse_index);
+
+ lse_val = &key_val->ls[lse_index];
+ lse_mask = &key_mask->ls[lse_index];
+
+ if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL]) {
+ lse_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL]);
+ lse_mask->mpls_ttl = MPLS_TTL_MASK;
+ }
+ if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS]) {
+ u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS]);
+
+ if (bos & ~MPLS_BOS_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS],
+ "Bottom Of Stack (BOS) must be 0 or 1");
+ return -EINVAL;
+ }
+ lse_val->mpls_bos = bos;
+ lse_mask->mpls_bos = MPLS_BOS_MASK;
+ }
+ if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC]) {
+ u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC]);
+
+ if (tc & ~MPLS_TC_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC],
+ "Traffic Class (TC) must be between 0 and 7");
+ return -EINVAL;
+ }
+ lse_val->mpls_tc = tc;
+ lse_mask->mpls_tc = MPLS_TC_MASK;
+ }
+ if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL]) {
+ u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL]);
+
+ if (label & ~MPLS_LABEL_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL],
+ "Label must be between 0 and 1048575");
+ return -EINVAL;
+ }
+ lse_val->mpls_label = label;
+ lse_mask->mpls_label = MPLS_LABEL_MASK;
+ }
+
+ return 0;
+}
+
+static int fl_set_key_mpls_opts(const struct nlattr *nla_mpls_opts,
+ struct flow_dissector_key_mpls *key_val,
+ struct flow_dissector_key_mpls *key_mask,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *nla_lse;
+ int rem;
+ int err;
+
+ if (!(nla_mpls_opts->nla_type & NLA_F_NESTED)) {
+ NL_SET_ERR_MSG_ATTR(extack, nla_mpls_opts,
+ "NLA_F_NESTED is missing");
+ return -EINVAL;
+ }
+
+ nla_for_each_nested(nla_lse, nla_mpls_opts, rem) {
+ if (nla_type(nla_lse) != TCA_FLOWER_KEY_MPLS_OPTS_LSE) {
+ NL_SET_ERR_MSG_ATTR(extack, nla_lse,
+ "Invalid MPLS option type");
+ return -EINVAL;
+ }
+
+ err = fl_set_key_mpls_lse(nla_lse, key_val, key_mask, extack);
+ if (err < 0)
+ return err;
+ }
+ if (rem) {
+ NL_SET_ERR_MSG(extack,
+ "Bytes leftover after parsing MPLS options");
+ return -EINVAL;
+ }
return 0;
}
static int fl_set_key_mpls(struct nlattr **tb,
struct flow_dissector_key_mpls *key_val,
- struct flow_dissector_key_mpls *key_mask)
+ struct flow_dissector_key_mpls *key_mask,
+ struct netlink_ext_ack *extack)
{
+ struct flow_dissector_mpls_lse *lse_mask;
+ struct flow_dissector_mpls_lse *lse_val;
+
+ if (tb[TCA_FLOWER_KEY_MPLS_OPTS]) {
+ if (tb[TCA_FLOWER_KEY_MPLS_TTL] ||
+ tb[TCA_FLOWER_KEY_MPLS_BOS] ||
+ tb[TCA_FLOWER_KEY_MPLS_TC] ||
+ tb[TCA_FLOWER_KEY_MPLS_LABEL]) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_OPTS],
+ "MPLS label, Traffic Class, Bottom Of Stack and Time To Live must be encapsulated in the MPLS options attribute");
+ return -EBADMSG;
+ }
+
+ return fl_set_key_mpls_opts(tb[TCA_FLOWER_KEY_MPLS_OPTS],
+ key_val, key_mask, extack);
+ }
+
+ lse_val = &key_val->ls[0];
+ lse_mask = &key_mask->ls[0];
+
if (tb[TCA_FLOWER_KEY_MPLS_TTL]) {
- key_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]);
- key_mask->mpls_ttl = MPLS_TTL_MASK;
+ lse_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]);
+ lse_mask->mpls_ttl = MPLS_TTL_MASK;
+ dissector_set_mpls_lse(key_val, 0);
+ dissector_set_mpls_lse(key_mask, 0);
}
if (tb[TCA_FLOWER_KEY_MPLS_BOS]) {
u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_BOS]);
- if (bos & ~MPLS_BOS_MASK)
+ if (bos & ~MPLS_BOS_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_BOS],
+ "Bottom Of Stack (BOS) must be 0 or 1");
return -EINVAL;
- key_val->mpls_bos = bos;
- key_mask->mpls_bos = MPLS_BOS_MASK;
+ }
+ lse_val->mpls_bos = bos;
+ lse_mask->mpls_bos = MPLS_BOS_MASK;
+ dissector_set_mpls_lse(key_val, 0);
+ dissector_set_mpls_lse(key_mask, 0);
}
if (tb[TCA_FLOWER_KEY_MPLS_TC]) {
u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TC]);
- if (tc & ~MPLS_TC_MASK)
+ if (tc & ~MPLS_TC_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_TC],
+ "Traffic Class (TC) must be between 0 and 7");
return -EINVAL;
- key_val->mpls_tc = tc;
- key_mask->mpls_tc = MPLS_TC_MASK;
+ }
+ lse_val->mpls_tc = tc;
+ lse_mask->mpls_tc = MPLS_TC_MASK;
+ dissector_set_mpls_lse(key_val, 0);
+ dissector_set_mpls_lse(key_mask, 0);
}
if (tb[TCA_FLOWER_KEY_MPLS_LABEL]) {
u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_LABEL]);
- if (label & ~MPLS_LABEL_MASK)
+ if (label & ~MPLS_LABEL_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_LABEL],
+ "Label must be between 0 and 1048575");
return -EINVAL;
- key_val->mpls_label = label;
- key_mask->mpls_label = MPLS_LABEL_MASK;
+ }
+ lse_val->mpls_label = label;
+ lse_mask->mpls_label = MPLS_LABEL_MASK;
+ dissector_set_mpls_lse(key_val, 0);
+ dissector_set_mpls_lse(key_mask, 0);
}
return 0;
}
@@ -815,14 +1029,16 @@
}
}
-static int fl_set_key_flags(struct nlattr **tb,
- u32 *flags_key, u32 *flags_mask)
+static int fl_set_key_flags(struct nlattr **tb, u32 *flags_key,
+ u32 *flags_mask, struct netlink_ext_ack *extack)
{
u32 key, mask;
/* mask is mandatory for flags */
- if (!tb[TCA_FLOWER_KEY_FLAGS_MASK])
+ if (!tb[TCA_FLOWER_KEY_FLAGS_MASK]) {
+ NL_SET_ERR_MSG(extack, "Missing flags mask");
return -EINVAL;
+ }
key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS]));
mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
@@ -937,6 +1153,108 @@
return sizeof(struct geneve_opt) + data_len;
}
+static int fl_set_vxlan_opt(const struct nlattr *nla, struct fl_flow_key *key,
+ int depth, int option_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1];
+ struct vxlan_metadata *md;
+ int err;
+
+ md = (struct vxlan_metadata *)&key->enc_opts.data[key->enc_opts.len];
+ memset(md, 0xff, sizeof(*md));
+
+ if (!depth)
+ return sizeof(*md);
+
+ if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_VXLAN) {
+ NL_SET_ERR_MSG(extack, "Non-vxlan option type for mask");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX, nla,
+ vxlan_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp");
+ return -EINVAL;
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]) {
+ md->gbp = nla_get_u32(tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]);
+ md->gbp &= VXLAN_GBP_MASK;
+ }
+
+ return sizeof(*md);
+}
+
+static int fl_set_erspan_opt(const struct nlattr *nla, struct fl_flow_key *key,
+ int depth, int option_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1];
+ struct erspan_metadata *md;
+ int err;
+
+ md = (struct erspan_metadata *)&key->enc_opts.data[key->enc_opts.len];
+ memset(md, 0xff, sizeof(*md));
+ md->version = 1;
+
+ if (!depth)
+ return sizeof(*md);
+
+ if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_ERSPAN) {
+ NL_SET_ERR_MSG(extack, "Non-erspan option type for mask");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX, nla,
+ erspan_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver");
+ return -EINVAL;
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER])
+ md->version = nla_get_u8(tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]);
+
+ if (md->version == 1) {
+ if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index");
+ return -EINVAL;
+ }
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]) {
+ nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX];
+ memset(&md->u, 0x00, sizeof(md->u));
+ md->u.index = nla_get_be32(nla);
+ }
+ } else if (md->version == 2) {
+ if (!option_len && (!tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR] ||
+ !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID])) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid");
+ return -EINVAL;
+ }
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR]) {
+ nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR];
+ md->u.md2.dir = nla_get_u8(nla);
+ }
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID]) {
+ nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID];
+ set_hwid(&md->u.md2, nla_get_u8(nla));
+ }
+ } else {
+ NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect");
+ return -EINVAL;
+ }
+
+ return sizeof(*md);
+}
+
static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
struct fl_flow_key *mask,
struct netlink_ext_ack *extack)
@@ -961,12 +1279,21 @@
nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
+ if (!nla_ok(nla_opt_msk, msk_depth)) {
+ NL_SET_ERR_MSG(extack, "Invalid nested attribute for masks");
+ return -EINVAL;
+ }
}
nla_for_each_attr(nla_opt_key, nla_enc_key,
nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS]), key_depth) {
switch (nla_type(nla_opt_key)) {
case TCA_FLOWER_KEY_ENC_OPTS_GENEVE:
+ if (key->enc_opts.dst_opt_type &&
+ key->enc_opts.dst_opt_type != TUNNEL_GENEVE_OPT) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for geneve options");
+ return -EINVAL;
+ }
option_len = 0;
key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT;
option_len = fl_set_geneve_opt(nla_opt_key, key,
@@ -991,14 +1318,99 @@
NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
return -EINVAL;
}
+ break;
+ case TCA_FLOWER_KEY_ENC_OPTS_VXLAN:
+ if (key->enc_opts.dst_opt_type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options");
+ return -EINVAL;
+ }
+ option_len = 0;
+ key->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT;
+ option_len = fl_set_vxlan_opt(nla_opt_key, key,
+ key_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
- if (msk_depth)
- nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
+ key->enc_opts.len += option_len;
+ /* At the same time we need to parse through the mask
+ * in order to verify exact and mask attribute lengths.
+ */
+ mask->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT;
+ option_len = fl_set_vxlan_opt(nla_opt_msk, mask,
+ msk_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ mask->enc_opts.len += option_len;
+ if (key->enc_opts.len != mask->enc_opts.len) {
+ NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
+ return -EINVAL;
+ }
+ break;
+ case TCA_FLOWER_KEY_ENC_OPTS_ERSPAN:
+ if (key->enc_opts.dst_opt_type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for erspan options");
+ return -EINVAL;
+ }
+ option_len = 0;
+ key->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT;
+ option_len = fl_set_erspan_opt(nla_opt_key, key,
+ key_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ key->enc_opts.len += option_len;
+ /* At the same time we need to parse through the mask
+ * in order to verify exact and mask attribute lengths.
+ */
+ mask->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT;
+ option_len = fl_set_erspan_opt(nla_opt_msk, mask,
+ msk_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ mask->enc_opts.len += option_len;
+ if (key->enc_opts.len != mask->enc_opts.len) {
+ NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
+ return -EINVAL;
+ }
break;
default:
NL_SET_ERR_MSG(extack, "Unknown tunnel option type");
return -EINVAL;
}
+
+ if (!msk_depth)
+ continue;
+
+ if (!nla_ok(nla_opt_msk, msk_depth)) {
+ NL_SET_ERR_MSG(extack, "A mask attribute is invalid");
+ return -EINVAL;
+ }
+ nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
+ }
+
+ return 0;
+}
+
+static int fl_validate_ct_state(u16 state, struct nlattr *tb,
+ struct netlink_ext_ack *extack)
+{
+ if (state && !(state & TCA_FLOWER_KEY_CT_FLAGS_TRACKED)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb,
+ "no trk, so no other flag can be set");
+ return -EINVAL;
+ }
+
+ if (state & TCA_FLOWER_KEY_CT_FLAGS_NEW &&
+ state & TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED) {
+ NL_SET_ERR_MSG_ATTR(extack, tb,
+ "new and est are mutually exclusive");
+ return -EINVAL;
}
return 0;
@@ -1010,6 +1422,8 @@
struct netlink_ext_ack *extack)
{
if (tb[TCA_FLOWER_KEY_CT_STATE]) {
+ int err;
+
if (!IS_ENABLED(CONFIG_NF_CONNTRACK)) {
NL_SET_ERR_MSG(extack, "Conntrack isn't enabled");
return -EOPNOTSUPP;
@@ -1017,6 +1431,13 @@
fl_set_key_val(tb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE,
&mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK,
sizeof(key->ct_state));
+
+ err = fl_validate_ct_state(key->ct_state & mask->ct_state,
+ tb[TCA_FLOWER_KEY_CT_STATE_MASK],
+ extack);
+ if (err)
+ return err;
+
}
if (tb[TCA_FLOWER_KEY_CT_ZONE]) {
if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) {
@@ -1176,7 +1597,7 @@
sizeof(key->icmp.code));
} else if (key->basic.n_proto == htons(ETH_P_MPLS_UC) ||
key->basic.n_proto == htons(ETH_P_MPLS_MC)) {
- ret = fl_set_key_mpls(tb, &key->mpls, &mask->mpls);
+ ret = fl_set_key_mpls(tb, &key->mpls, &mask->mpls, extack);
if (ret)
return ret;
} else if (key->basic.n_proto == htons(ETH_P_ARP) ||
@@ -1201,7 +1622,7 @@
if (key->basic.ip_proto == IPPROTO_TCP ||
key->basic.ip_proto == IPPROTO_UDP ||
key->basic.ip_proto == IPPROTO_SCTP) {
- ret = fl_set_key_port_range(tb, key, mask);
+ ret = fl_set_key_port_range(tb, key, mask, extack);
if (ret)
return ret;
}
@@ -1252,6 +1673,10 @@
fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip);
+ fl_set_key_val(tb, &key->hash.hash, TCA_FLOWER_KEY_HASH,
+ &mask->hash.hash, TCA_FLOWER_KEY_HASH_MASK,
+ sizeof(key->hash.hash));
+
if (tb[TCA_FLOWER_KEY_ENC_OPTS]) {
ret = fl_set_enc_opt(tb, key, mask, extack);
if (ret)
@@ -1263,7 +1688,8 @@
return ret;
if (tb[TCA_FLOWER_KEY_FLAGS])
- ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
+ ret = fl_set_key_flags(tb, &key->control.flags,
+ &mask->control.flags, extack);
return ret;
}
@@ -1294,7 +1720,7 @@
}
#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member)
-#define FL_KEY_MEMBER_SIZE(member) FIELD_SIZEOF(struct fl_flow_key, member)
+#define FL_KEY_MEMBER_SIZE(member) sizeof_field(struct fl_flow_key, member)
#define FL_KEY_IS_MASKED(mask, member) \
memchr_inv(((char *)mask) + FL_KEY_MEMBER_OFFSET(member), \
@@ -1365,6 +1791,8 @@
FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_CT, ct);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_HASH, hash);
skb_flow_dissector_init(dissector, keys, cnt);
}
@@ -1741,18 +2169,24 @@
arg->count = arg->skip;
+ rcu_read_lock();
idr_for_each_entry_continue_ul(&head->handle_idr, f, tmp, id) {
/* don't return filters that are being deleted */
if (!refcount_inc_not_zero(&f->refcnt))
continue;
+ rcu_read_unlock();
+
if (arg->fn(tp, f, arg) < 0) {
__fl_put(f);
arg->stop = 1;
+ rcu_read_lock();
break;
}
__fl_put(f);
arg->count++;
+ rcu_read_lock();
}
+ rcu_read_unlock();
arg->cookie = id;
}
@@ -1812,8 +2246,7 @@
cls_flower.rule->match.mask = &f->mask->key;
cls_flower.rule->match.key = &f->mkey;
- err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts,
- true);
+ err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts);
if (err) {
kfree(cls_flower.rule);
if (tc_skip_sw(f->flags)) {
@@ -2003,35 +2436,132 @@
return 0;
}
+static int fl_dump_key_mpls_opt_lse(struct sk_buff *skb,
+ struct flow_dissector_key_mpls *mpls_key,
+ struct flow_dissector_key_mpls *mpls_mask,
+ u8 lse_index)
+{
+ struct flow_dissector_mpls_lse *lse_mask = &mpls_mask->ls[lse_index];
+ struct flow_dissector_mpls_lse *lse_key = &mpls_key->ls[lse_index];
+ int err;
+
+ err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH,
+ lse_index + 1);
+ if (err)
+ return err;
+
+ if (lse_mask->mpls_ttl) {
+ err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL,
+ lse_key->mpls_ttl);
+ if (err)
+ return err;
+ }
+ if (lse_mask->mpls_bos) {
+ err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS,
+ lse_key->mpls_bos);
+ if (err)
+ return err;
+ }
+ if (lse_mask->mpls_tc) {
+ err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_TC,
+ lse_key->mpls_tc);
+ if (err)
+ return err;
+ }
+ if (lse_mask->mpls_label) {
+ err = nla_put_u32(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL,
+ lse_key->mpls_label);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int fl_dump_key_mpls_opts(struct sk_buff *skb,
+ struct flow_dissector_key_mpls *mpls_key,
+ struct flow_dissector_key_mpls *mpls_mask)
+{
+ struct nlattr *opts;
+ struct nlattr *lse;
+ u8 lse_index;
+ int err;
+
+ opts = nla_nest_start(skb, TCA_FLOWER_KEY_MPLS_OPTS);
+ if (!opts)
+ return -EMSGSIZE;
+
+ for (lse_index = 0; lse_index < FLOW_DIS_MPLS_MAX; lse_index++) {
+ if (!(mpls_mask->used_lses & 1 << lse_index))
+ continue;
+
+ lse = nla_nest_start(skb, TCA_FLOWER_KEY_MPLS_OPTS_LSE);
+ if (!lse) {
+ err = -EMSGSIZE;
+ goto err_opts;
+ }
+
+ err = fl_dump_key_mpls_opt_lse(skb, mpls_key, mpls_mask,
+ lse_index);
+ if (err)
+ goto err_opts_lse;
+ nla_nest_end(skb, lse);
+ }
+ nla_nest_end(skb, opts);
+
+ return 0;
+
+err_opts_lse:
+ nla_nest_cancel(skb, lse);
+err_opts:
+ nla_nest_cancel(skb, opts);
+
+ return err;
+}
+
static int fl_dump_key_mpls(struct sk_buff *skb,
struct flow_dissector_key_mpls *mpls_key,
struct flow_dissector_key_mpls *mpls_mask)
{
+ struct flow_dissector_mpls_lse *lse_mask;
+ struct flow_dissector_mpls_lse *lse_key;
int err;
- if (!memchr_inv(mpls_mask, 0, sizeof(*mpls_mask)))
+ if (!mpls_mask->used_lses)
return 0;
- if (mpls_mask->mpls_ttl) {
+
+ lse_mask = &mpls_mask->ls[0];
+ lse_key = &mpls_key->ls[0];
+
+ /* For backward compatibility, don't use the MPLS nested attributes if
+ * the rule can be expressed using the old attributes.
+ */
+ if (mpls_mask->used_lses & ~1 ||
+ (!lse_mask->mpls_ttl && !lse_mask->mpls_bos &&
+ !lse_mask->mpls_tc && !lse_mask->mpls_label))
+ return fl_dump_key_mpls_opts(skb, mpls_key, mpls_mask);
+
+ if (lse_mask->mpls_ttl) {
err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TTL,
- mpls_key->mpls_ttl);
+ lse_key->mpls_ttl);
if (err)
return err;
}
- if (mpls_mask->mpls_tc) {
+ if (lse_mask->mpls_tc) {
err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TC,
- mpls_key->mpls_tc);
+ lse_key->mpls_tc);
if (err)
return err;
}
- if (mpls_mask->mpls_label) {
+ if (lse_mask->mpls_label) {
err = nla_put_u32(skb, TCA_FLOWER_KEY_MPLS_LABEL,
- mpls_key->mpls_label);
+ lse_key->mpls_label);
if (err)
return err;
}
- if (mpls_mask->mpls_bos) {
+ if (lse_mask->mpls_bos) {
err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_BOS,
- mpls_key->mpls_bos);
+ lse_key->mpls_bos);
if (err)
return err;
}
@@ -2151,6 +2681,61 @@
return -EMSGSIZE;
}
+static int fl_dump_key_vxlan_opt(struct sk_buff *skb,
+ struct flow_dissector_key_enc_opts *enc_opts)
+{
+ struct vxlan_metadata *md;
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_VXLAN);
+ if (!nest)
+ goto nla_put_failure;
+
+ md = (struct vxlan_metadata *)&enc_opts->data[0];
+ if (nla_put_u32(skb, TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, md->gbp))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int fl_dump_key_erspan_opt(struct sk_buff *skb,
+ struct flow_dissector_key_enc_opts *enc_opts)
+{
+ struct erspan_metadata *md;
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_ERSPAN);
+ if (!nest)
+ goto nla_put_failure;
+
+ md = (struct erspan_metadata *)&enc_opts->data[0];
+ if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER, md->version))
+ goto nla_put_failure;
+
+ if (md->version == 1 &&
+ nla_put_be32(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX, md->u.index))
+ goto nla_put_failure;
+
+ if (md->version == 2 &&
+ (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR,
+ md->u.md2.dir) ||
+ nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID,
+ get_hwid(&md->u.md2))))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
static int fl_dump_key_ct(struct sk_buff *skb,
struct flow_dissector_key_ct *key,
struct flow_dissector_key_ct *mask)
@@ -2204,6 +2789,16 @@
if (err)
goto nla_put_failure;
break;
+ case TUNNEL_VXLAN_OPT:
+ err = fl_dump_key_vxlan_opt(skb, enc_opts);
+ if (err)
+ goto nla_put_failure;
+ break;
+ case TUNNEL_ERSPAN_OPT:
+ err = fl_dump_key_erspan_opt(skb, enc_opts);
+ if (err)
+ goto nla_put_failure;
+ break;
default:
goto nla_put_failure;
}
@@ -2424,6 +3019,11 @@
if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
goto nla_put_failure;
+ if (fl_dump_key_val(skb, &key->hash.hash, TCA_FLOWER_KEY_HASH,
+ &mask->hash.hash, TCA_FLOWER_KEY_HASH_MASK,
+ sizeof(key->hash.hash)))
+ goto nla_put_failure;
+
return 0;
nla_put_failure:
@@ -2488,6 +3088,48 @@
return -1;
}
+static int fl_terse_dump(struct net *net, struct tcf_proto *tp, void *fh,
+ struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
+{
+ struct cls_fl_filter *f = fh;
+ struct nlattr *nest;
+ bool skip_hw;
+
+ if (!f)
+ return skb->len;
+
+ t->tcm_handle = f->handle;
+
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ spin_lock(&tp->lock);
+
+ skip_hw = tc_skip_hw(f->flags);
+
+ if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
+ goto nla_put_failure_locked;
+
+ spin_unlock(&tp->lock);
+
+ if (!skip_hw)
+ fl_hw_update_stats(tp, f, rtnl_held);
+
+ if (tcf_exts_terse_dump(skb, &f->exts))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ return skb->len;
+
+nla_put_failure_locked:
+ spin_unlock(&tp->lock);
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
static int fl_tmplt_dump(struct sk_buff *skb, struct net *net, void *tmplt_priv)
{
struct fl_flow_tmplt *tmplt = tmplt_priv;
@@ -2552,6 +3194,7 @@
.hw_add = fl_hw_add,
.hw_del = fl_hw_del,
.dump = fl_dump,
+ .terse_dump = fl_terse_dump,
.bind_class = fl_bind_class,
.tmplt_create = fl_tmplt_create,
.tmplt_destroy = fl_tmplt_destroy,
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 610a0b7..cafb844 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -97,7 +97,7 @@
cls_mall.command = TC_CLSMATCHALL_REPLACE;
cls_mall.cookie = cookie;
- err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts, true);
+ err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts);
if (err) {
kfree(cls_mall.rule);
mall_destroy_hw_filter(tp, head, cookie, NULL);
@@ -302,7 +302,7 @@
TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY;
cls_mall.cookie = (unsigned long)head;
- err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts, true);
+ err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts);
if (err) {
kfree(cls_mall.rule);
if (add && tc_skip_sw(head->flags)) {
@@ -338,7 +338,10 @@
tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false, true);
tcf_exts_stats_update(&head->exts, cls_mall.stats.bytes,
- cls_mall.stats.pkts, cls_mall.stats.lastused);
+ cls_mall.stats.pkts, cls_mall.stats.drops,
+ cls_mall.stats.lastused,
+ cls_mall.stats.used_hw_stats,
+ cls_mall.stats.used_hw_stats_valid);
}
static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh,
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 684187a..e9a8a2c 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -540,7 +540,7 @@
pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p,"
"p %p,r %p,*arg %p\n",
- tp, handle, tca, arg, opt, p, r, arg ? *arg : NULL);
+ tp, handle, tca, arg, opt, p, r, *arg);
if (!opt)
return 0;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index e15ff33..54209a1 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -79,7 +79,7 @@
/* The 'ht' field MUST be the last field in structure to allow for
* more entries allocated at end of structure.
*/
- struct tc_u_knode __rcu *ht[1];
+ struct tc_u_knode __rcu *ht[];
};
struct tc_u_common {
@@ -353,7 +353,7 @@
void *key = tc_u_common_ptr(tp);
struct tc_u_common *tp_c = tc_u_common_find(key);
- root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL);
+ root_ht = kzalloc(struct_size(root_ht, ht, 1), GFP_KERNEL);
if (root_ht == NULL)
return -ENOBUFS;
@@ -364,7 +364,7 @@
idr_init(&root_ht->handle_idr);
if (tp_c == NULL) {
- tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL);
+ tp_c = kzalloc(struct_size(tp_c, hlist->ht, 1), GFP_KERNEL);
if (tp_c == NULL) {
kfree(root_ht);
return -ENOBUFS;
@@ -796,9 +796,7 @@
struct tc_u32_sel *s = &n->sel;
struct tc_u_knode *new;
- new = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key),
- GFP_KERNEL);
-
+ new = kzalloc(struct_size(new, sel.keys, s->nkeys), GFP_KERNEL);
if (!new)
return NULL;
@@ -854,9 +852,6 @@
u32 htid, flags = 0;
size_t sel_size;
int err;
-#ifdef CONFIG_CLS_U32_PERF
- size_t size;
-#endif
if (!opt) {
if (handle) {
@@ -938,7 +933,7 @@
NL_SET_ERR_MSG_MOD(extack, "Divisor can only be used on a hash table");
return -EINVAL;
}
- ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
+ ht = kzalloc(struct_size(ht, ht, divisor + 1), GFP_KERNEL);
if (ht == NULL)
return -ENOBUFS;
if (handle == 0) {
@@ -1024,15 +1019,15 @@
goto erridr;
}
- n = kzalloc(offsetof(typeof(*n), sel) + sel_size, GFP_KERNEL);
+ n = kzalloc(struct_size(n, sel.keys, s->nkeys), GFP_KERNEL);
if (n == NULL) {
err = -ENOBUFS;
goto erridr;
}
#ifdef CONFIG_CLS_U32_PERF
- size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64);
- n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt));
+ n->pf = __alloc_percpu(struct_size(n->pf, kcnts, s->nkeys),
+ __alignof__(struct tc_u32_pcnt));
if (!n->pf) {
err = -ENOBUFS;
goto errfree;
@@ -1296,8 +1291,7 @@
int cpu;
#endif
- if (nla_put(skb, TCA_U32_SEL,
- sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
+ if (nla_put(skb, TCA_U32_SEL, struct_size(&n->sel, keys, n->sel.nkeys),
&n->sel))
goto nla_put_failure;
@@ -1347,9 +1341,7 @@
goto nla_put_failure;
}
#ifdef CONFIG_CLS_U32_PERF
- gpf = kzalloc(sizeof(struct tc_u32_pcnt) +
- n->sel.nkeys * sizeof(u64),
- GFP_KERNEL);
+ gpf = kzalloc(struct_size(gpf, kcnts, n->sel.nkeys), GFP_KERNEL);
if (!gpf)
goto nla_put_failure;
@@ -1363,9 +1355,7 @@
gpf->kcnts[i] += pf->kcnts[i];
}
- if (nla_put_64bit(skb, TCA_U32_PCNT,
- sizeof(struct tc_u32_pcnt) +
- n->sel.nkeys * sizeof(u64),
+ if (nla_put_64bit(skb, TCA_U32_PCNT, struct_size(gpf, kcnts, n->sel.nkeys),
gpf, TCA_U32_PAD)) {
kfree(gpf);
goto nla_put_failure;
diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c
index b9a94fd..5ea84de 100644
--- a/net/sched/em_canid.c
+++ b/net/sched/em_canid.c
@@ -40,6 +40,7 @@
/**
* em_canid_get_id() - Extracts Can ID out of the sk_buff structure.
+ * @skb: buffer to extract Can ID from
*/
static canid_t em_canid_get_id(struct sk_buff *skb)
{
diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c
index e2c157d..3650117 100644
--- a/net/sched/em_ipt.c
+++ b/net/sched/em_ipt.c
@@ -22,7 +22,7 @@
const struct xt_match *match;
u32 hook;
u8 nfproto;
- u8 match_data[0] __aligned(8);
+ u8 match_data[] __aligned(8);
};
struct em_ipt_xt_match {
@@ -199,7 +199,7 @@
im->match->destroy(&par);
}
module_put(im->match->me);
- kfree((void *)im);
+ kfree(im);
}
static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em,
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index ad007cd..4625496 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -521,7 +521,7 @@
*err = -1;
return;
}
- dst->value = sk->sk_ack_backlog;
+ dst->value = READ_ONCE(sk->sk_ack_backlog);
}
META_COLLECTOR(int_sk_max_ack_bl)
@@ -532,7 +532,7 @@
*err = -1;
return;
}
- dst->value = sk->sk_max_ack_backlog;
+ dst->value = READ_ONCE(sk->sk_max_ack_backlog);
}
META_COLLECTOR(int_sk_prio)
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
index 88c7ce4..2c1192a 100644
--- a/net/sched/em_nbyte.c
+++ b/net/sched/em_nbyte.c
@@ -16,7 +16,7 @@
struct nbyte_data {
struct tcf_em_nbyte hdr;
- char pattern[0];
+ char pattern[];
};
static int em_nbyte_change(struct net *net, void *data, int data_len,
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
index dd3b8c1..f885bea 100644
--- a/net/sched/ematch.c
+++ b/net/sched/ematch.c
@@ -389,7 +389,6 @@
/**
* tcf_em_tree_destroy - destroy an ematch tree
*
- * @tp: classifier kind handle
* @tree: ematch tree to be deleted
*
* This functions destroys an ematch tree previously created by
@@ -425,7 +424,7 @@
* tcf_em_tree_dump - dump ematch tree into a rtnl message
*
* @skb: skb holding the rtnl message
- * @t: ematch tree to be dumped
+ * @tree: ematch tree to be dumped
* @tlv: TLV type to be used to encapsulate the tree
*
* This function dumps a ematch tree into a rtnl message. It is valid to
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 3b1b5ee..6e18aa4 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -32,6 +32,8 @@
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
+#include <trace/events/qdisc.h>
+
/*
Short review.
@@ -265,7 +267,8 @@
root->handle == handle)
return root;
- hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
+ hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
+ lockdep_rtnl_is_held()) {
if (q->handle == handle)
return q;
}
@@ -298,7 +301,7 @@
if (!handle)
return NULL;
- q = qdisc_match_from_root(dev->qdisc, handle);
+ q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
if (q)
goto out;
@@ -317,7 +320,7 @@
if (!handle)
return NULL;
- q = qdisc_match_from_root(dev->qdisc, handle);
+ q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
if (q)
goto out;
@@ -510,6 +513,12 @@
return stab;
}
+ if (s->size_log > STAB_SIZE_LOG_MAX ||
+ s->cell_log > STAB_SIZE_LOG_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
+ return ERR_PTR(-EINVAL);
+ }
+
stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
if (!stab)
return ERR_PTR(-ENOMEM);
@@ -619,21 +628,28 @@
}
EXPORT_SYMBOL(qdisc_watchdog_init);
-void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
+void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
+ u64 delta_ns)
{
if (test_bit(__QDISC_STATE_DEACTIVATED,
&qdisc_root_sleeping(wd->qdisc)->state))
return;
- if (wd->last_expires == expires)
- return;
+ if (hrtimer_is_queued(&wd->timer)) {
+ /* If timer is already set in [expires, expires + delta_ns],
+ * do not reprogram it.
+ */
+ if (wd->last_expires - expires <= delta_ns)
+ return;
+ }
wd->last_expires = expires;
- hrtimer_start(&wd->timer,
- ns_to_ktime(expires),
- HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start_range_ns(&wd->timer,
+ ns_to_ktime(expires),
+ delta_ns,
+ HRTIMER_MODE_ABS_PINNED);
}
-EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
+EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
@@ -1066,10 +1082,10 @@
skip:
if (!ingress) {
notify_and_destroy(net, skb, n, classid,
- dev->qdisc, new);
+ rtnl_dereference(dev->qdisc), new);
if (new && !new->ops->attach)
qdisc_refcount_inc(new);
- dev->qdisc = new ? : &noop_qdisc;
+ rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
if (new && new->ops->attach)
new->ops->attach(new);
@@ -1085,8 +1101,7 @@
int err;
/* Only support running class lockless if parent is lockless */
- if (new && (new->flags & TCQ_F_NOLOCK) &&
- parent && !(parent->flags & TCQ_F_NOLOCK))
+ if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
qdisc_clear_nolock(new);
if (!cops || !cops->graft)
@@ -1189,7 +1204,7 @@
err = -ENOENT;
if (!ops) {
- NL_SET_ERR_MSG(extack, "Specified qdisc not found");
+ NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
goto err_out;
}
@@ -1277,6 +1292,7 @@
}
qdisc_hash_add(sch, false);
+ trace_qdisc_create(ops, dev, parent);
return sch;
@@ -1444,7 +1460,7 @@
q = dev_ingress_queue(dev)->qdisc_sleeping;
}
} else {
- q = dev->qdisc;
+ q = rtnl_dereference(dev->qdisc);
}
if (!q) {
NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
@@ -1533,7 +1549,7 @@
q = dev_ingress_queue(dev)->qdisc_sleeping;
}
} else {
- q = dev->qdisc;
+ q = rtnl_dereference(dev->qdisc);
}
/* It may be default qdisc, ignore it */
@@ -1755,7 +1771,8 @@
s_q_idx = 0;
q_idx = 0;
- if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
+ if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
+ skb, cb, &q_idx, s_q_idx,
true, tca[TCA_DUMP_INVISIBLE]) < 0)
goto done;
@@ -2031,7 +2048,7 @@
} else if (qid1) {
qid = qid1;
} else if (qid == 0)
- qid = dev->qdisc->handle;
+ qid = rtnl_dereference(dev->qdisc)->handle;
/* Now qid is genuine qdisc handle consistent
* both with parent and child.
@@ -2042,7 +2059,7 @@
portid = TC_H_MAKE(qid, portid);
} else {
if (qid == 0)
- qid = dev->qdisc->handle;
+ qid = rtnl_dereference(dev->qdisc)->handle;
}
/* OK. Locate qdisc */
@@ -2203,7 +2220,8 @@
s_t = cb->args[0];
t = 0;
- if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t, true) < 0)
+ if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
+ skb, tcm, cb, &t, s_t, true) < 0)
goto done;
dev_queue = dev_ingress_queue(dev);
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 6385995..1c281cc 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -58,7 +58,7 @@
struct atm_flow_data *excess; /* flow for excess traffic;
NULL to set CLP instead */
int hdr_len;
- unsigned char hdr[0]; /* header data; MUST BE LAST */
+ unsigned char hdr[]; /* header data; MUST BE LAST */
};
struct atm_qdisc_data {
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index e8eebe4..c580139 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -173,8 +173,7 @@
u64 tin_rate_bps;
u16 tin_rate_shft;
- u16 tin_quantum_prio;
- u16 tin_quantum_band;
+ u16 tin_quantum;
s32 tin_deficit;
u32 tin_backlog;
u32 tin_dropped;
@@ -313,8 +312,8 @@
};
static const u8 diffserv8[] = {
- 2, 5, 1, 2, 4, 2, 2, 2,
- 0, 2, 1, 2, 1, 2, 1, 2,
+ 2, 0, 1, 2, 4, 2, 2, 2,
+ 1, 2, 1, 2, 1, 2, 1, 2,
5, 2, 4, 2, 4, 2, 4, 2,
3, 2, 3, 2, 3, 2, 3, 2,
6, 2, 3, 2, 3, 2, 3, 2,
@@ -324,7 +323,7 @@
};
static const u8 diffserv4[] = {
- 0, 2, 0, 0, 2, 0, 0, 0,
+ 0, 1, 0, 0, 2, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0,
2, 0, 2, 0, 2, 0, 2, 0,
2, 0, 2, 0, 2, 0, 2, 0,
@@ -335,7 +334,7 @@
};
static const u8 diffserv3[] = {
- 0, 0, 0, 0, 2, 0, 0, 0,
+ 0, 1, 0, 0, 2, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
@@ -585,26 +584,48 @@
return drop;
}
-static void cake_update_flowkeys(struct flow_keys *keys,
+static bool cake_update_flowkeys(struct flow_keys *keys,
const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
struct nf_conntrack_tuple tuple = {};
- bool rev = !skb->_nfct;
+ bool rev = !skb->_nfct, upd = false;
+ __be32 ip;
if (skb_protocol(skb, true) != htons(ETH_P_IP))
- return;
+ return false;
if (!nf_ct_get_tuple_skb(&tuple, skb))
- return;
+ return false;
- keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
- keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
+ ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
+ if (ip != keys->addrs.v4addrs.src) {
+ keys->addrs.v4addrs.src = ip;
+ upd = true;
+ }
+ ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
+ if (ip != keys->addrs.v4addrs.dst) {
+ keys->addrs.v4addrs.dst = ip;
+ upd = true;
+ }
if (keys->ports.ports) {
- keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
- keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
+ __be16 port;
+
+ port = rev ? tuple.dst.u.all : tuple.src.u.all;
+ if (port != keys->ports.src) {
+ keys->ports.src = port;
+ upd = true;
+ }
+ port = rev ? tuple.src.u.all : tuple.dst.u.all;
+ if (port != keys->ports.dst) {
+ port = keys->ports.dst;
+ upd = true;
+ }
}
+ return upd;
+#else
+ return false;
#endif
}
@@ -625,23 +646,36 @@
static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
int flow_mode, u16 flow_override, u16 host_override)
{
+ bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
+ bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
+ bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
u16 reduced_hash, srchost_idx, dsthost_idx;
struct flow_keys keys, host_keys;
+ bool use_skbhash = skb->l4_hash;
if (unlikely(flow_mode == CAKE_FLOW_NONE))
return 0;
- /* If both overrides are set we can skip packet dissection entirely */
- if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) &&
- (host_override || !(flow_mode & CAKE_FLOW_HOSTS)))
+ /* If both overrides are set, or we can use the SKB hash and nat mode is
+ * disabled, we can skip packet dissection entirely. If nat mode is
+ * enabled there's another check below after doing the conntrack lookup.
+ */
+ if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
goto skip_hash;
skb_flow_dissect_flow_keys(skb, &keys,
FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
- if (flow_mode & CAKE_FLOW_NAT_FLAG)
- cake_update_flowkeys(&keys, skb);
+ /* Don't use the SKB hash if we change the lookup keys from conntrack */
+ if (nat_enabled && cake_update_flowkeys(&keys, skb))
+ use_skbhash = false;
+
+ /* If we can still use the SKB hash and don't need the host hash, we can
+ * skip the rest of the hashing procedure
+ */
+ if (use_skbhash && !hash_hosts)
+ goto skip_hash;
/* flow_hash_from_keys() sorts the addresses by value, so we have
* to preserve their order in a separate data structure to treat
@@ -680,12 +714,14 @@
/* This *must* be after the above switch, since as a
* side-effect it sorts the src and dst addresses.
*/
- if (flow_mode & CAKE_FLOW_FLOWS)
+ if (hash_flows && !use_skbhash)
flow_hash = flow_hash_from_keys(&keys);
skip_hash:
if (flow_override)
flow_hash = flow_override - 1;
+ else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
+ flow_hash = skb->hash;
if (host_override) {
dsthost_hash = host_override - 1;
srchost_hash = host_override - 1;
@@ -1638,7 +1674,7 @@
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return 0;
}
@@ -1660,7 +1696,7 @@
{
struct cake_sched_data *q = qdisc_priv(sch);
int len = qdisc_pkt_len(skb);
- int uninitialized_var(ret);
+ int ret;
struct sk_buff *ack = NULL;
ktime_t now = ktime_get();
struct cake_tin_data *b;
@@ -1711,8 +1747,7 @@
if (IS_ERR_OR_NULL(segs))
return qdisc_drop(skb, sch, to_free);
- while (segs) {
- nskb = segs->next;
+ skb_list_walk_safe(segs, segs, nskb) {
skb_mark_not_on_list(segs);
qdisc_skb_cb(segs)->pkt_len = segs->len;
cobalt_set_enqueue_time(segs, now);
@@ -1725,7 +1760,6 @@
slen += segs->len;
q->buffer_used += segs->truesize;
b->packets++;
- segs = nskb;
}
/* stats */
@@ -1947,7 +1981,7 @@
while (b->tin_deficit < 0 ||
!(b->sparse_flow_count + b->bulk_flow_count)) {
if (b->tin_deficit <= 0)
- b->tin_deficit += b->tin_quantum_band;
+ b->tin_deficit += b->tin_quantum;
if (b->sparse_flow_count + b->bulk_flow_count)
empty = false;
@@ -2269,8 +2303,7 @@
cake_set_rate(b, rate, mtu,
us_to_ns(q->target), us_to_ns(q->interval));
- b->tin_quantum_band = 65535;
- b->tin_quantum_prio = 65535;
+ b->tin_quantum = 65535;
return 0;
}
@@ -2281,8 +2314,7 @@
struct cake_sched_data *q = qdisc_priv(sch);
u32 mtu = psched_mtu(qdisc_dev(sch));
u64 rate = q->rate_bps;
- u32 quantum1 = 256;
- u32 quantum2 = 256;
+ u32 quantum = 256;
u32 i;
q->tin_cnt = 8;
@@ -2295,18 +2327,14 @@
cake_set_rate(b, rate, mtu, us_to_ns(q->target),
us_to_ns(q->interval));
- b->tin_quantum_prio = max_t(u16, 1U, quantum1);
- b->tin_quantum_band = max_t(u16, 1U, quantum2);
+ b->tin_quantum = max_t(u16, 1U, quantum);
/* calculate next class's parameters */
rate *= 7;
rate >>= 3;
- quantum1 *= 3;
- quantum1 >>= 1;
-
- quantum2 *= 7;
- quantum2 >>= 3;
+ quantum *= 7;
+ quantum >>= 3;
}
return 0;
@@ -2375,8 +2403,7 @@
struct cake_sched_data *q = qdisc_priv(sch);
u32 mtu = psched_mtu(qdisc_dev(sch));
u64 rate = q->rate_bps;
- u32 quantum1 = 256;
- u32 quantum2 = 256;
+ u32 quantum = 256;
u32 i;
q->tin_cnt = 8;
@@ -2392,18 +2419,14 @@
cake_set_rate(b, rate, mtu, us_to_ns(q->target),
us_to_ns(q->interval));
- b->tin_quantum_prio = max_t(u16, 1U, quantum1);
- b->tin_quantum_band = max_t(u16, 1U, quantum2);
+ b->tin_quantum = max_t(u16, 1U, quantum);
/* calculate next class's parameters */
rate *= 7;
rate >>= 3;
- quantum1 *= 3;
- quantum1 >>= 1;
-
- quantum2 *= 7;
- quantum2 >>= 3;
+ quantum *= 7;
+ quantum >>= 3;
}
return 0;
@@ -2442,17 +2465,11 @@
cake_set_rate(&q->tins[3], rate >> 2, mtu,
us_to_ns(q->target), us_to_ns(q->interval));
- /* priority weights */
- q->tins[0].tin_quantum_prio = quantum;
- q->tins[1].tin_quantum_prio = quantum >> 4;
- q->tins[2].tin_quantum_prio = quantum << 2;
- q->tins[3].tin_quantum_prio = quantum << 4;
-
/* bandwidth-sharing weights */
- q->tins[0].tin_quantum_band = quantum;
- q->tins[1].tin_quantum_band = quantum >> 4;
- q->tins[2].tin_quantum_band = quantum >> 1;
- q->tins[3].tin_quantum_band = quantum >> 2;
+ q->tins[0].tin_quantum = quantum;
+ q->tins[1].tin_quantum = quantum >> 4;
+ q->tins[2].tin_quantum = quantum >> 1;
+ q->tins[3].tin_quantum = quantum >> 2;
return 0;
}
@@ -2483,15 +2500,10 @@
cake_set_rate(&q->tins[2], rate >> 2, mtu,
us_to_ns(q->target), us_to_ns(q->interval));
- /* priority weights */
- q->tins[0].tin_quantum_prio = quantum;
- q->tins[1].tin_quantum_prio = quantum >> 4;
- q->tins[2].tin_quantum_prio = quantum << 4;
-
/* bandwidth-sharing weights */
- q->tins[0].tin_quantum_band = quantum;
- q->tins[1].tin_quantum_band = quantum >> 4;
- q->tins[2].tin_quantum_band = quantum >> 2;
+ q->tins[0].tin_quantum = quantum;
+ q->tins[1].tin_quantum = quantum >> 4;
+ q->tins[2].tin_quantum = quantum >> 2;
return 0;
}
@@ -2724,7 +2736,7 @@
q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
GFP_KERNEL);
if (!q->tins)
- goto nomem;
+ return -ENOMEM;
for (i = 0; i < CAKE_MAX_TINS; i++) {
struct cake_tin_data *b = q->tins + i;
@@ -2754,10 +2766,6 @@
q->min_netlen = ~0;
q->min_adjlen = ~0;
return 0;
-
-nomem:
- cake_destroy(sch);
- return -ENOMEM;
}
static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index e597288..4a78fcf 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -250,7 +250,7 @@
case TC_ACT_STOLEN:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return NULL;
case TC_ACT_RECLASSIFY:
@@ -360,7 +360,7 @@
struct sk_buff **to_free)
{
struct cbq_sched_data *q = qdisc_priv(sch);
- int uninitialized_var(ret);
+ int ret;
struct cbq_class *cl = cbq_classify(skb, sch, &ret);
#ifdef CONFIG_NET_CLS_ACT
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index e54f6ea..2adbd94 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -131,7 +131,6 @@
}
struct choke_skb_cb {
- u16 classid;
u8 keys_valid;
struct flow_keys_digest keys;
};
@@ -142,11 +141,6 @@
return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}
-static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
-{
- choke_skb_cb(skb)->classid = classid;
-}
-
/*
* Compare flow of two packets
* Returns true only if source and destination address and port match.
@@ -379,7 +373,7 @@
if (mask != q->tab_mask) {
struct sk_buff **ntab;
- ntab = kvmalloc_array((mask + 1), sizeof(struct sk_buff *), GFP_KERNEL | __GFP_ZERO);
+ ntab = kvcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
if (!ntab)
return -ENOMEM;
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 07a2b0b..dde5646 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -324,7 +324,7 @@
case TC_ACT_STOLEN:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return NULL;
}
diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c
new file mode 100644
index 0000000..9c22487
--- /dev/null
+++ b/net/sched/sch_ets.c
@@ -0,0 +1,837 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * net/sched/sch_ets.c Enhanced Transmission Selection scheduler
+ *
+ * Description
+ * -----------
+ *
+ * The Enhanced Transmission Selection scheduler is a classful queuing
+ * discipline that merges functionality of PRIO and DRR qdiscs in one scheduler.
+ * ETS makes it easy to configure a set of strict and bandwidth-sharing bands to
+ * implement the transmission selection described in 802.1Qaz.
+ *
+ * Although ETS is technically classful, it's not possible to add and remove
+ * classes at will. Instead one specifies number of classes, how many are
+ * PRIO-like and how many DRR-like, and quanta for the latter.
+ *
+ * Algorithm
+ * ---------
+ *
+ * The strict classes, if any, are tried for traffic first: first band 0, if it
+ * has no traffic then band 1, etc.
+ *
+ * When there is no traffic in any of the strict queues, the bandwidth-sharing
+ * ones are tried next. Each band is assigned a deficit counter, initialized to
+ * "quantum" of that band. ETS maintains a list of active bandwidth-sharing
+ * bands whose qdiscs are non-empty. A packet is dequeued from the band at the
+ * head of the list if the packet size is smaller or equal to the deficit
+ * counter. If the counter is too small, it is increased by "quantum" and the
+ * scheduler moves on to the next band in the active list.
+ */
+
+#include <linux/module.h>
+#include <net/gen_stats.h>
+#include <net/netlink.h>
+#include <net/pkt_cls.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+
+struct ets_class {
+ struct list_head alist; /* In struct ets_sched.active. */
+ struct Qdisc *qdisc;
+ u32 quantum;
+ u32 deficit;
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+};
+
+struct ets_sched {
+ struct list_head active;
+ struct tcf_proto __rcu *filter_list;
+ struct tcf_block *block;
+ unsigned int nbands;
+ unsigned int nstrict;
+ u8 prio2band[TC_PRIO_MAX + 1];
+ struct ets_class classes[TCQ_ETS_MAX_BANDS];
+};
+
+static const struct nla_policy ets_policy[TCA_ETS_MAX + 1] = {
+ [TCA_ETS_NBANDS] = { .type = NLA_U8 },
+ [TCA_ETS_NSTRICT] = { .type = NLA_U8 },
+ [TCA_ETS_QUANTA] = { .type = NLA_NESTED },
+ [TCA_ETS_PRIOMAP] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ets_priomap_policy[TCA_ETS_MAX + 1] = {
+ [TCA_ETS_PRIOMAP_BAND] = { .type = NLA_U8 },
+};
+
+static const struct nla_policy ets_quanta_policy[TCA_ETS_MAX + 1] = {
+ [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy ets_class_policy[TCA_ETS_MAX + 1] = {
+ [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 },
+};
+
+static int ets_quantum_parse(struct Qdisc *sch, const struct nlattr *attr,
+ unsigned int *quantum,
+ struct netlink_ext_ack *extack)
+{
+ *quantum = nla_get_u32(attr);
+ if (!*quantum) {
+ NL_SET_ERR_MSG(extack, "ETS quantum cannot be zero");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct ets_class *
+ets_class_from_arg(struct Qdisc *sch, unsigned long arg)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+
+ return &q->classes[arg - 1];
+}
+
+static u32 ets_class_id(struct Qdisc *sch, const struct ets_class *cl)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int band = cl - q->classes;
+
+ return TC_H_MAKE(sch->handle, band + 1);
+}
+
+static void ets_offload_change(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct ets_sched *q = qdisc_priv(sch);
+ struct tc_ets_qopt_offload qopt;
+ unsigned int w_psum_prev = 0;
+ unsigned int q_psum = 0;
+ unsigned int q_sum = 0;
+ unsigned int quantum;
+ unsigned int w_psum;
+ unsigned int weight;
+ unsigned int i;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_ETS_REPLACE;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.replace_params.bands = q->nbands;
+ qopt.replace_params.qstats = &sch->qstats;
+ memcpy(&qopt.replace_params.priomap,
+ q->prio2band, sizeof(q->prio2band));
+
+ for (i = 0; i < q->nbands; i++)
+ q_sum += q->classes[i].quantum;
+
+ for (i = 0; i < q->nbands; i++) {
+ quantum = q->classes[i].quantum;
+ q_psum += quantum;
+ w_psum = quantum ? q_psum * 100 / q_sum : 0;
+ weight = w_psum - w_psum_prev;
+ w_psum_prev = w_psum;
+
+ qopt.replace_params.quanta[i] = quantum;
+ qopt.replace_params.weights[i] = weight;
+ }
+
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt);
+}
+
+static void ets_offload_destroy(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_ets_qopt_offload qopt;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_ETS_DESTROY;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt);
+}
+
+static void ets_offload_graft(struct Qdisc *sch, struct Qdisc *new,
+ struct Qdisc *old, unsigned long arg,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_ets_qopt_offload qopt;
+
+ qopt.command = TC_ETS_GRAFT;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.graft_params.band = arg - 1;
+ qopt.graft_params.child_handle = new->handle;
+
+ qdisc_offload_graft_helper(dev, sch, new, old, TC_SETUP_QDISC_ETS,
+ &qopt, extack);
+}
+
+static int ets_offload_dump(struct Qdisc *sch)
+{
+ struct tc_ets_qopt_offload qopt;
+
+ qopt.command = TC_ETS_STATS;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.stats.bstats = &sch->bstats;
+ qopt.stats.qstats = &sch->qstats;
+
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_ETS, &qopt);
+}
+
+static bool ets_class_is_strict(struct ets_sched *q, const struct ets_class *cl)
+{
+ unsigned int band = cl - q->classes;
+
+ return band < q->nstrict;
+}
+
+static int ets_class_change(struct Qdisc *sch, u32 classid, u32 parentid,
+ struct nlattr **tca, unsigned long *arg,
+ struct netlink_ext_ack *extack)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, *arg);
+ struct ets_sched *q = qdisc_priv(sch);
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_ETS_MAX + 1];
+ unsigned int quantum;
+ int err;
+
+ /* Classes can be added and removed only through Qdisc_ops.change
+ * interface.
+ */
+ if (!cl) {
+ NL_SET_ERR_MSG(extack, "Fine-grained class addition and removal is not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (!opt) {
+ NL_SET_ERR_MSG(extack, "ETS options are required for this operation");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_class_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_ETS_QUANTA_BAND])
+ /* Nothing to configure. */
+ return 0;
+
+ if (ets_class_is_strict(q, cl)) {
+ NL_SET_ERR_MSG(extack, "Strict bands do not have a configurable quantum");
+ return -EINVAL;
+ }
+
+ err = ets_quantum_parse(sch, tb[TCA_ETS_QUANTA_BAND], &quantum,
+ extack);
+ if (err)
+ return err;
+
+ sch_tree_lock(sch);
+ cl->quantum = quantum;
+ sch_tree_unlock(sch);
+
+ ets_offload_change(sch);
+ return 0;
+}
+
+static int ets_class_graft(struct Qdisc *sch, unsigned long arg,
+ struct Qdisc *new, struct Qdisc **old,
+ struct netlink_ext_ack *extack)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+
+ if (!new) {
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ ets_class_id(sch, cl), NULL);
+ if (!new)
+ new = &noop_qdisc;
+ else
+ qdisc_hash_add(new, true);
+ }
+
+ *old = qdisc_replace(sch, new, &cl->qdisc);
+ ets_offload_graft(sch, new, *old, arg, extack);
+ return 0;
+}
+
+static struct Qdisc *ets_class_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+
+ return cl->qdisc;
+}
+
+static unsigned long ets_class_find(struct Qdisc *sch, u32 classid)
+{
+ unsigned long band = TC_H_MIN(classid);
+ struct ets_sched *q = qdisc_priv(sch);
+
+ if (band - 1 >= q->nbands)
+ return 0;
+ return band;
+}
+
+static void ets_class_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+ struct ets_sched *q = qdisc_priv(sch);
+
+ /* We get notified about zero-length child Qdiscs as well if they are
+ * offloaded. Those aren't on the active list though, so don't attempt
+ * to remove them.
+ */
+ if (!ets_class_is_strict(q, cl) && sch->q.qlen)
+ list_del(&cl->alist);
+}
+
+static int ets_class_dump(struct Qdisc *sch, unsigned long arg,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+ struct ets_sched *q = qdisc_priv(sch);
+ struct nlattr *nest;
+
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_handle = ets_class_id(sch, cl);
+ tcm->tcm_info = cl->qdisc->handle;
+
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+ if (!ets_class_is_strict(q, cl)) {
+ if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, cl->quantum))
+ goto nla_put_failure;
+ }
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int ets_class_dump_stats(struct Qdisc *sch, unsigned long arg,
+ struct gnet_dump *d)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+ struct Qdisc *cl_q = cl->qdisc;
+
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, NULL, &cl_q->bstats) < 0 ||
+ qdisc_qstats_copy(d, cl_q) < 0)
+ return -1;
+
+ return 0;
+}
+
+static void ets_qdisc_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->nbands; i++) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, i + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static struct tcf_block *
+ets_qdisc_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+
+ if (cl) {
+ NL_SET_ERR_MSG(extack, "ETS classid must be zero");
+ return NULL;
+ }
+
+ return q->block;
+}
+
+static unsigned long ets_qdisc_bind_tcf(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return ets_class_find(sch, classid);
+}
+
+static void ets_qdisc_unbind_tcf(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ u32 band = skb->priority;
+ struct tcf_result res;
+ struct tcf_proto *fl;
+ int err;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ if (TC_H_MAJ(skb->priority) != sch->handle) {
+ fl = rcu_dereference_bh(q->filter_list);
+ err = tcf_classify(skb, fl, &res, false);
+#ifdef CONFIG_NET_CLS_ACT
+ switch (err) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ fallthrough;
+ case TC_ACT_SHOT:
+ return NULL;
+ }
+#endif
+ if (!fl || err < 0) {
+ if (TC_H_MAJ(band))
+ band = 0;
+ return &q->classes[q->prio2band[band & TC_PRIO_MAX]];
+ }
+ band = res.classid;
+ }
+ band = TC_H_MIN(band) - 1;
+ if (band >= q->nbands)
+ return &q->classes[q->prio2band[0]];
+ return &q->classes[band];
+}
+
+static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ unsigned int len = qdisc_pkt_len(skb);
+ struct ets_sched *q = qdisc_priv(sch);
+ struct ets_class *cl;
+ int err = 0;
+ bool first;
+
+ cl = ets_classify(skb, sch, &err);
+ if (!cl) {
+ if (err & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return err;
+ }
+
+ first = !cl->qdisc->q.qlen;
+ err = qdisc_enqueue(skb, cl->qdisc, to_free);
+ if (unlikely(err != NET_XMIT_SUCCESS)) {
+ if (net_xmit_drop_count(err)) {
+ cl->qstats.drops++;
+ qdisc_qstats_drop(sch);
+ }
+ return err;
+ }
+
+ if (first && !ets_class_is_strict(q, cl)) {
+ list_add_tail(&cl->alist, &q->active);
+ cl->deficit = cl->quantum;
+ }
+
+ sch->qstats.backlog += len;
+ sch->q.qlen++;
+ return err;
+}
+
+static struct sk_buff *
+ets_qdisc_dequeue_skb(struct Qdisc *sch, struct sk_buff *skb)
+{
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+ return skb;
+}
+
+static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ struct ets_class *cl;
+ struct sk_buff *skb;
+ unsigned int band;
+ unsigned int len;
+
+ while (1) {
+ for (band = 0; band < q->nstrict; band++) {
+ cl = &q->classes[band];
+ skb = qdisc_dequeue_peeked(cl->qdisc);
+ if (skb)
+ return ets_qdisc_dequeue_skb(sch, skb);
+ }
+
+ if (list_empty(&q->active))
+ goto out;
+
+ cl = list_first_entry(&q->active, struct ets_class, alist);
+ skb = cl->qdisc->ops->peek(cl->qdisc);
+ if (!skb) {
+ qdisc_warn_nonwc(__func__, cl->qdisc);
+ goto out;
+ }
+
+ len = qdisc_pkt_len(skb);
+ if (len <= cl->deficit) {
+ cl->deficit -= len;
+ skb = qdisc_dequeue_peeked(cl->qdisc);
+ if (unlikely(!skb))
+ goto out;
+ if (cl->qdisc->q.qlen == 0)
+ list_del(&cl->alist);
+ return ets_qdisc_dequeue_skb(sch, skb);
+ }
+
+ cl->deficit += cl->quantum;
+ list_move_tail(&cl->alist, &q->active);
+ }
+out:
+ return NULL;
+}
+
+static int ets_qdisc_priomap_parse(struct nlattr *priomap_attr,
+ unsigned int nbands, u8 *priomap,
+ struct netlink_ext_ack *extack)
+{
+ const struct nlattr *attr;
+ int prio = 0;
+ u8 band;
+ int rem;
+ int err;
+
+ err = __nla_validate_nested(priomap_attr, TCA_ETS_MAX,
+ ets_priomap_policy, NL_VALIDATE_STRICT,
+ extack);
+ if (err)
+ return err;
+
+ nla_for_each_nested(attr, priomap_attr, rem) {
+ switch (nla_type(attr)) {
+ case TCA_ETS_PRIOMAP_BAND:
+ if (prio > TC_PRIO_MAX) {
+ NL_SET_ERR_MSG_MOD(extack, "Too many priorities in ETS priomap");
+ return -EINVAL;
+ }
+ band = nla_get_u8(attr);
+ if (band >= nbands) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid band number in ETS priomap");
+ return -EINVAL;
+ }
+ priomap[prio++] = band;
+ break;
+ default:
+ WARN_ON_ONCE(1); /* Validate should have caught this. */
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int ets_qdisc_quanta_parse(struct Qdisc *sch, struct nlattr *quanta_attr,
+ unsigned int nbands, unsigned int nstrict,
+ unsigned int *quanta,
+ struct netlink_ext_ack *extack)
+{
+ const struct nlattr *attr;
+ int band = nstrict;
+ int rem;
+ int err;
+
+ err = __nla_validate_nested(quanta_attr, TCA_ETS_MAX,
+ ets_quanta_policy, NL_VALIDATE_STRICT,
+ extack);
+ if (err < 0)
+ return err;
+
+ nla_for_each_nested(attr, quanta_attr, rem) {
+ switch (nla_type(attr)) {
+ case TCA_ETS_QUANTA_BAND:
+ if (band >= nbands) {
+ NL_SET_ERR_MSG_MOD(extack, "ETS quanta has more values than bands");
+ return -EINVAL;
+ }
+ err = ets_quantum_parse(sch, attr, &quanta[band++],
+ extack);
+ if (err)
+ return err;
+ break;
+ default:
+ WARN_ON_ONCE(1); /* Validate should have caught this. */
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ unsigned int quanta[TCQ_ETS_MAX_BANDS] = {0};
+ struct Qdisc *queues[TCQ_ETS_MAX_BANDS];
+ struct ets_sched *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_ETS_MAX + 1];
+ unsigned int oldbands = q->nbands;
+ u8 priomap[TC_PRIO_MAX + 1];
+ unsigned int nstrict = 0;
+ unsigned int nbands;
+ unsigned int i;
+ int err;
+
+ if (!opt) {
+ NL_SET_ERR_MSG(extack, "ETS options are required for this operation");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_ETS_NBANDS]) {
+ NL_SET_ERR_MSG_MOD(extack, "Number of bands is a required argument");
+ return -EINVAL;
+ }
+ nbands = nla_get_u8(tb[TCA_ETS_NBANDS]);
+ if (nbands < 1 || nbands > TCQ_ETS_MAX_BANDS) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid number of bands");
+ return -EINVAL;
+ }
+ /* Unless overridden, traffic goes to the last band. */
+ memset(priomap, nbands - 1, sizeof(priomap));
+
+ if (tb[TCA_ETS_NSTRICT]) {
+ nstrict = nla_get_u8(tb[TCA_ETS_NSTRICT]);
+ if (nstrict > nbands) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid number of strict bands");
+ return -EINVAL;
+ }
+ }
+
+ if (tb[TCA_ETS_PRIOMAP]) {
+ err = ets_qdisc_priomap_parse(tb[TCA_ETS_PRIOMAP],
+ nbands, priomap, extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[TCA_ETS_QUANTA]) {
+ err = ets_qdisc_quanta_parse(sch, tb[TCA_ETS_QUANTA],
+ nbands, nstrict, quanta, extack);
+ if (err)
+ return err;
+ }
+ /* If there are more bands than strict + quanta provided, the remaining
+ * ones are ETS with quantum of MTU. Initialize the missing values here.
+ */
+ for (i = nstrict; i < nbands; i++) {
+ if (!quanta[i])
+ quanta[i] = psched_mtu(qdisc_dev(sch));
+ }
+
+ /* Before commit, make sure we can allocate all new qdiscs */
+ for (i = oldbands; i < nbands; i++) {
+ queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ ets_class_id(sch, &q->classes[i]),
+ extack);
+ if (!queues[i]) {
+ while (i > oldbands)
+ qdisc_put(queues[--i]);
+ return -ENOMEM;
+ }
+ }
+
+ sch_tree_lock(sch);
+
+ q->nbands = nbands;
+ for (i = nstrict; i < q->nstrict; i++) {
+ INIT_LIST_HEAD(&q->classes[i].alist);
+ if (q->classes[i].qdisc->q.qlen) {
+ list_add_tail(&q->classes[i].alist, &q->active);
+ q->classes[i].deficit = quanta[i];
+ }
+ }
+ for (i = q->nbands; i < oldbands; i++) {
+ if (i >= q->nstrict && q->classes[i].qdisc->q.qlen)
+ list_del(&q->classes[i].alist);
+ qdisc_tree_flush_backlog(q->classes[i].qdisc);
+ }
+ q->nstrict = nstrict;
+ memcpy(q->prio2band, priomap, sizeof(priomap));
+
+ for (i = 0; i < q->nbands; i++)
+ q->classes[i].quantum = quanta[i];
+
+ for (i = oldbands; i < q->nbands; i++) {
+ q->classes[i].qdisc = queues[i];
+ if (q->classes[i].qdisc != &noop_qdisc)
+ qdisc_hash_add(q->classes[i].qdisc, true);
+ }
+
+ sch_tree_unlock(sch);
+
+ ets_offload_change(sch);
+ for (i = q->nbands; i < oldbands; i++) {
+ qdisc_put(q->classes[i].qdisc);
+ memset(&q->classes[i], 0, sizeof(q->classes[i]));
+ }
+ return 0;
+}
+
+static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ if (err)
+ return err;
+
+ INIT_LIST_HEAD(&q->active);
+ return ets_qdisc_change(sch, opt, extack);
+}
+
+static void ets_qdisc_reset(struct Qdisc *sch)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int band;
+
+ for (band = q->nstrict; band < q->nbands; band++) {
+ if (q->classes[band].qdisc->q.qlen)
+ list_del(&q->classes[band].alist);
+ }
+ for (band = 0; band < q->nbands; band++)
+ qdisc_reset(q->classes[band].qdisc);
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+}
+
+static void ets_qdisc_destroy(struct Qdisc *sch)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int band;
+
+ ets_offload_destroy(sch);
+ tcf_block_put(q->block);
+ for (band = 0; band < q->nbands; band++)
+ qdisc_put(q->classes[band].qdisc);
+}
+
+static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ struct nlattr *opts;
+ struct nlattr *nest;
+ int band;
+ int prio;
+ int err;
+
+ err = ets_offload_dump(sch);
+ if (err)
+ return err;
+
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
+ if (!opts)
+ goto nla_err;
+
+ if (nla_put_u8(skb, TCA_ETS_NBANDS, q->nbands))
+ goto nla_err;
+
+ if (q->nstrict &&
+ nla_put_u8(skb, TCA_ETS_NSTRICT, q->nstrict))
+ goto nla_err;
+
+ if (q->nbands > q->nstrict) {
+ nest = nla_nest_start(skb, TCA_ETS_QUANTA);
+ if (!nest)
+ goto nla_err;
+
+ for (band = q->nstrict; band < q->nbands; band++) {
+ if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND,
+ q->classes[band].quantum))
+ goto nla_err;
+ }
+
+ nla_nest_end(skb, nest);
+ }
+
+ nest = nla_nest_start(skb, TCA_ETS_PRIOMAP);
+ if (!nest)
+ goto nla_err;
+
+ for (prio = 0; prio <= TC_PRIO_MAX; prio++) {
+ if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, q->prio2band[prio]))
+ goto nla_err;
+ }
+
+ nla_nest_end(skb, nest);
+
+ return nla_nest_end(skb, opts);
+
+nla_err:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static const struct Qdisc_class_ops ets_class_ops = {
+ .change = ets_class_change,
+ .graft = ets_class_graft,
+ .leaf = ets_class_leaf,
+ .find = ets_class_find,
+ .qlen_notify = ets_class_qlen_notify,
+ .dump = ets_class_dump,
+ .dump_stats = ets_class_dump_stats,
+ .walk = ets_qdisc_walk,
+ .tcf_block = ets_qdisc_tcf_block,
+ .bind_tcf = ets_qdisc_bind_tcf,
+ .unbind_tcf = ets_qdisc_unbind_tcf,
+};
+
+static struct Qdisc_ops ets_qdisc_ops __read_mostly = {
+ .cl_ops = &ets_class_ops,
+ .id = "ets",
+ .priv_size = sizeof(struct ets_sched),
+ .enqueue = ets_qdisc_enqueue,
+ .dequeue = ets_qdisc_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .change = ets_qdisc_change,
+ .init = ets_qdisc_init,
+ .reset = ets_qdisc_reset,
+ .destroy = ets_qdisc_destroy,
+ .dump = ets_qdisc_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init ets_init(void)
+{
+ return register_qdisc(&ets_qdisc_ops);
+}
+
+static void __exit ets_exit(void)
+{
+ unregister_qdisc(&ets_qdisc_ops);
+}
+
+module_init(ets_init);
+module_exit(ets_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 37c8aa7..e104042 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -12,6 +12,7 @@
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
/* 1 band FIFO pseudo-"scheduler" */
@@ -51,8 +52,49 @@
return NET_XMIT_CN;
}
-static int fifo_init(struct Qdisc *sch, struct nlattr *opt,
- struct netlink_ext_ack *extack)
+static void fifo_offload_init(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_fifo_qopt_offload qopt;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_FIFO_REPLACE;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_FIFO, &qopt);
+}
+
+static void fifo_offload_destroy(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_fifo_qopt_offload qopt;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_FIFO_DESTROY;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_FIFO, &qopt);
+}
+
+static int fifo_offload_dump(struct Qdisc *sch)
+{
+ struct tc_fifo_qopt_offload qopt;
+
+ qopt.command = TC_FIFO_STATS;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.stats.bstats = &sch->bstats;
+ qopt.stats.qstats = &sch->qstats;
+
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_FIFO, &qopt);
+}
+
+static int __fifo_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
{
bool bypass;
bool is_bfifo = sch->ops == &bfifo_qdisc_ops;
@@ -82,10 +124,35 @@
sch->flags |= TCQ_F_CAN_BYPASS;
else
sch->flags &= ~TCQ_F_CAN_BYPASS;
+
return 0;
}
-static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
+static int fifo_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ err = __fifo_init(sch, opt, extack);
+ if (err)
+ return err;
+
+ fifo_offload_init(sch);
+ return 0;
+}
+
+static int fifo_hd_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ return __fifo_init(sch, opt, extack);
+}
+
+static void fifo_destroy(struct Qdisc *sch)
+{
+ fifo_offload_destroy(sch);
+}
+
+static int __fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct tc_fifo_qopt opt = { .limit = sch->limit };
@@ -97,6 +164,22 @@
return -1;
}
+static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ int err;
+
+ err = fifo_offload_dump(sch);
+ if (err)
+ return err;
+
+ return __fifo_dump(sch, skb);
+}
+
+static int fifo_hd_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ return __fifo_dump(sch, skb);
+}
+
struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {
.id = "pfifo",
.priv_size = 0,
@@ -104,6 +187,7 @@
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
.init = fifo_init,
+ .destroy = fifo_destroy,
.reset = qdisc_reset_queue,
.change = fifo_init,
.dump = fifo_dump,
@@ -118,6 +202,7 @@
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
.init = fifo_init,
+ .destroy = fifo_destroy,
.reset = qdisc_reset_queue,
.change = fifo_init,
.dump = fifo_dump,
@@ -131,10 +216,10 @@
.enqueue = pfifo_tail_enqueue,
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
- .init = fifo_init,
+ .init = fifo_hd_init,
.reset = qdisc_reset_queue,
- .change = fifo_init,
- .dump = fifo_dump,
+ .change = fifo_hd_init,
+ .dump = fifo_hd_dump,
.owner = THIS_MODULE,
};
@@ -148,6 +233,9 @@
if (strncmp(q->ops->id + 1, "fifo", 4) != 0)
return 0;
+ if (!q->ops->change)
+ return 0;
+
nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
if (nla) {
nla->nla_type = RTM_NEWQDISC;
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index f757ea9..2fb76fc 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -66,22 +66,27 @@
* in linear list (head,tail), otherwise are placed in a rbtree (t_root).
*/
struct fq_flow {
+/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
struct rb_root t_root;
struct sk_buff *head; /* list of skbs for this flow : first skb */
union {
struct sk_buff *tail; /* last skb in the list */
- unsigned long age; /* jiffies when flow was emptied, for gc */
+ unsigned long age; /* (jiffies | 1UL) when flow was emptied, for gc */
};
struct rb_node fq_node; /* anchor in fq_root[] trees */
struct sock *sk;
- int qlen; /* number of packets in flow queue */
- int credit;
u32 socket_hash; /* sk_hash */
- struct fq_flow *next; /* next pointer in RR lists, or &detached */
+ int qlen; /* number of packets in flow queue */
+
+/* Second cache line, used in fq_dequeue() */
+ int credit;
+ /* 32bit hole on 64bit arches */
+
+ struct fq_flow *next; /* next pointer in RR lists */
struct rb_node rate_node; /* anchor in q->delayed tree */
u64 time_next_packet;
-};
+} ____cacheline_aligned_in_smp;
struct fq_flow_head {
struct fq_flow *first;
@@ -95,6 +100,7 @@
struct rb_root delayed; /* for rate limited flows */
u64 time_next_delayed_flow;
+ u64 ktime_cache; /* copy of last ktime_get_ns() */
unsigned long unthrottle_latency_ns;
struct fq_flow internal; /* for non classified or high prio packets */
@@ -104,12 +110,13 @@
u32 flow_plimit; /* max packets per flow */
unsigned long flow_max_rate; /* optional max rate per flow */
u64 ce_threshold;
+ u64 horizon; /* horizon in ns */
u32 orphan_mask; /* mask for orphaned skb */
u32 low_rate_threshold;
struct rb_root *fq_root;
u8 rate_enable;
u8 fq_trees_log;
-
+ u8 horizon_drop;
u32 flows;
u32 inactive_flows;
u32 throttled_flows;
@@ -118,26 +125,35 @@
u64 stat_internal_packets;
u64 stat_throttled;
u64 stat_ce_mark;
+ u64 stat_horizon_drops;
+ u64 stat_horizon_caps;
u64 stat_flows_plimit;
u64 stat_pkts_too_long;
u64 stat_allocation_errors;
+
+ u32 timer_slack; /* hrtimer slack in ns */
struct qdisc_watchdog watchdog;
};
-/* special value to mark a detached flow (not on old/new list) */
-static struct fq_flow detached, throttled;
-
+/*
+ * f->tail and f->age share the same location.
+ * We can use the low order bit to differentiate if this location points
+ * to a sk_buff or contains a jiffies value, if we force this value to be odd.
+ * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2
+ */
static void fq_flow_set_detached(struct fq_flow *f)
{
- f->next = &detached;
- f->age = jiffies;
+ f->age = jiffies | 1UL;
}
static bool fq_flow_is_detached(const struct fq_flow *f)
{
- return f->next == &detached;
+ return !!(f->age & 1UL);
}
+/* special value to mark a throttled flow (not on old/new list) */
+static struct fq_flow throttled;
+
static bool fq_flow_is_throttled(const struct fq_flow *f)
{
return f->next == &throttled;
@@ -202,9 +218,10 @@
struct rb_root *root,
struct sock *sk)
{
- struct fq_flow *f, *tofree[FQ_GC_MAX];
struct rb_node **p, *parent;
- int fcnt = 0;
+ void *tofree[FQ_GC_MAX];
+ struct fq_flow *f;
+ int i, fcnt = 0;
p = &root->rb_node;
parent = NULL;
@@ -227,15 +244,18 @@
p = &parent->rb_left;
}
+ if (!fcnt)
+ return;
+
+ for (i = fcnt; i > 0; ) {
+ f = tofree[--i];
+ rb_erase(&f->fq_node, root);
+ }
q->flows -= fcnt;
q->inactive_flows -= fcnt;
q->stat_gc_flows += fcnt;
- while (fcnt) {
- struct fq_flow *f = tofree[--fcnt];
- rb_erase(&f->fq_node, root);
- kmem_cache_free(fq_flow_cachep, f);
- }
+ kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
}
static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
@@ -368,19 +388,17 @@
}
}
-/* remove one skb from head of flow queue */
-static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
+/* Remove one skb from flow queue.
+ * This skb must be the return value of prior fq_peek().
+ */
+static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
+ struct sk_buff *skb)
{
- struct sk_buff *skb = fq_peek(flow);
-
- if (skb) {
- fq_erase_head(sch, flow, skb);
- skb_mark_not_on_list(skb);
- flow->qlen--;
- qdisc_qstats_backlog_dec(sch, skb);
- sch->q.qlen--;
- }
- return skb;
+ fq_erase_head(sch, flow, skb);
+ skb_mark_not_on_list(skb);
+ flow->qlen--;
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
}
static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
@@ -388,8 +406,6 @@
struct rb_node **p, *parent;
struct sk_buff *head, *aux;
- fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns();
-
head = flow->head;
if (!head ||
fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
@@ -417,6 +433,12 @@
rb_insert_color(&skb->rbnode, &flow->t_root);
}
+static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
+ const struct fq_sched_data *q)
+{
+ return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon));
+}
+
static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
@@ -426,6 +448,28 @@
if (unlikely(sch->q.qlen >= sch->limit))
return qdisc_drop(skb, sch, to_free);
+ if (!skb->tstamp) {
+ fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns();
+ } else {
+ /* Check if packet timestamp is too far in the future.
+ * Try first if our cached value, to avoid ktime_get_ns()
+ * cost in most cases.
+ */
+ if (fq_packet_beyond_horizon(skb, q)) {
+ /* Refresh our cache and check another time */
+ q->ktime_cache = ktime_get_ns();
+ if (fq_packet_beyond_horizon(skb, q)) {
+ if (q->horizon_drop) {
+ q->stat_horizon_drops++;
+ return qdisc_drop(skb, sch, to_free);
+ }
+ q->stat_horizon_caps++;
+ skb->tstamp = q->ktime_cache + q->horizon;
+ }
+ }
+ fq_skb_cb(skb)->time_to_send = skb->tstamp;
+ }
+
f = fq_classify(skb, q);
if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
q->stat_flows_plimit++;
@@ -492,11 +536,13 @@
if (!sch->q.qlen)
return NULL;
- skb = fq_dequeue_head(sch, &q->internal);
- if (skb)
+ skb = fq_peek(&q->internal);
+ if (unlikely(skb)) {
+ fq_dequeue_skb(sch, &q->internal, skb);
goto out;
+ }
- now = ktime_get_ns();
+ q->ktime_cache = now = ktime_get_ns();
fq_check_throttled(q, now);
begin:
head = &q->new_flows;
@@ -504,8 +550,9 @@
head = &q->old_flows;
if (!head->first) {
if (q->time_next_delayed_flow != ~0ULL)
- qdisc_watchdog_schedule_ns(&q->watchdog,
- q->time_next_delayed_flow);
+ qdisc_watchdog_schedule_range_ns(&q->watchdog,
+ q->time_next_delayed_flow,
+ q->timer_slack);
return NULL;
}
}
@@ -529,15 +576,13 @@
fq_flow_set_throttled(q, f);
goto begin;
}
- if (time_next_packet &&
- (s64)(now - time_next_packet - q->ce_threshold) > 0) {
+ prefetch(&skb->end);
+ if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
INET_ECN_set_ce(skb);
q->stat_ce_mark++;
}
- }
-
- skb = fq_dequeue_head(sch, f);
- if (!skb) {
+ fq_dequeue_skb(sch, f, skb);
+ } else {
head->first = f->next;
/* force a pass through old_flows to prevent starvation */
if ((head == &q->new_flows) && q->old_flows.first) {
@@ -548,7 +593,6 @@
}
goto begin;
}
- prefetch(&skb->end);
plen = qdisc_pkt_len(skb);
f->credit -= plen;
@@ -736,6 +780,8 @@
}
static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
+ [TCA_FQ_UNSPEC] = { .strict_start_type = TCA_FQ_TIMER_SLACK },
+
[TCA_FQ_PLIMIT] = { .type = NLA_U32 },
[TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
[TCA_FQ_QUANTUM] = { .type = NLA_U32 },
@@ -748,6 +794,9 @@
[TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 },
[TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
[TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 },
+ [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 },
+ [TCA_FQ_HORIZON] = { .type = NLA_U32 },
+ [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 },
};
static int fq_change(struct Qdisc *sch, struct nlattr *opt,
@@ -834,7 +883,18 @@
q->ce_threshold = (u64)NSEC_PER_USEC *
nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);
+ if (tb[TCA_FQ_TIMER_SLACK])
+ q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]);
+
+ if (tb[TCA_FQ_HORIZON])
+ q->horizon = (u64)NSEC_PER_USEC *
+ nla_get_u32(tb[TCA_FQ_HORIZON]);
+
+ if (tb[TCA_FQ_HORIZON_DROP])
+ q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]);
+
if (!err) {
+
sch_tree_unlock(sch);
err = fq_resize(sch, fq_log);
sch_tree_lock(sch);
@@ -885,6 +945,11 @@
q->orphan_mask = 1024 - 1;
q->low_rate_threshold = 550000 / 8;
+ q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */
+
+ q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */
+ q->horizon_drop = 1; /* by default, drop packets beyond horizon */
+
/* Default ce_threshold of 4294 seconds */
q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;
@@ -902,6 +967,7 @@
{
struct fq_sched_data *q = qdisc_priv(sch);
u64 ce_threshold = q->ce_threshold;
+ u64 horizon = q->horizon;
struct nlattr *opts;
opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
@@ -911,6 +977,7 @@
/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
do_div(ce_threshold, NSEC_PER_USEC);
+ do_div(horizon, NSEC_PER_USEC);
if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
@@ -925,7 +992,10 @@
nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
q->low_rate_threshold) ||
nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
- nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
+ nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) ||
+ nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) ||
+ nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
+ nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop))
goto nla_put_failure;
return nla_nest_end(skb, opts);
@@ -948,13 +1018,16 @@
st.flows_plimit = q->stat_flows_plimit;
st.pkts_too_long = q->stat_pkts_too_long;
st.allocation_errors = q->stat_allocation_errors;
- st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns();
+ st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
+ ktime_get_ns();
st.flows = q->flows;
st.inactive_flows = q->inactive_flows;
st.throttled_flows = q->throttled_flows;
st.unthrottle_latency_ns = min_t(unsigned long,
q->unthrottle_latency_ns, ~0U);
st.ce_mark = q->stat_ce_mark;
+ st.horizon_drops = q->stat_horizon_drops;
+ st.horizon_caps = q->stat_horizon_caps;
sch_tree_unlock(sch);
return gnet_stats_copy_app(d, &st, sizeof(st));
@@ -1002,3 +1075,4 @@
module_exit(fq_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Fair Queue Packet Scheduler");
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 86fb2f9..99e8db2 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -14,7 +14,6 @@
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
-#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
@@ -100,7 +99,7 @@
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return 0;
}
@@ -188,7 +187,7 @@
struct fq_codel_sched_data *q = qdisc_priv(sch);
unsigned int idx, prev_backlog, prev_qlen;
struct fq_codel_flow *flow;
- int uninitialized_var(ret);
+ int ret;
unsigned int pkt_len;
bool memory_limited;
@@ -730,3 +729,4 @@
module_exit(fq_codel_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Fair Queue CoDel discipline");
diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
new file mode 100644
index 0000000..c708027
--- /dev/null
+++ b/net/sched/sch_fq_pie.c
@@ -0,0 +1,569 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Flow Queue PIE discipline
+ *
+ * Copyright (C) 2019 Mohit P. Tahiliani <tahiliani@nitk.edu.in>
+ * Copyright (C) 2019 Sachin D. Patil <sdp.sachin@gmail.com>
+ * Copyright (C) 2019 V. Saicharan <vsaicharan1998@gmail.com>
+ * Copyright (C) 2019 Mohit Bhasi <mohitbhasi1998@gmail.com>
+ * Copyright (C) 2019 Leslie Monis <lesliemonis@gmail.com>
+ * Copyright (C) 2019 Gautam Ramakrishnan <gautamramk@gmail.com>
+ */
+
+#include <linux/jhash.h>
+#include <linux/sizes.h>
+#include <linux/vmalloc.h>
+#include <net/pkt_cls.h>
+#include <net/pie.h>
+
+/* Flow Queue PIE
+ *
+ * Principles:
+ * - Packets are classified on flows.
+ * - This is a Stochastic model (as we use a hash, several flows might
+ * be hashed to the same slot)
+ * - Each flow has a PIE managed queue.
+ * - Flows are linked onto two (Round Robin) lists,
+ * so that new flows have priority on old ones.
+ * - For a given flow, packets are not reordered.
+ * - Drops during enqueue only.
+ * - ECN capability is off by default.
+ * - ECN threshold (if ECN is enabled) is at 10% by default.
+ * - Uses timestamps to calculate queue delay by default.
+ */
+
+/**
+ * struct fq_pie_flow - contains data for each flow
+ * @vars: pie vars associated with the flow
+ * @deficit: number of remaining byte credits
+ * @backlog: size of data in the flow
+ * @qlen: number of packets in the flow
+ * @flowchain: flowchain for the flow
+ * @head: first packet in the flow
+ * @tail: last packet in the flow
+ */
+struct fq_pie_flow {
+ struct pie_vars vars;
+ s32 deficit;
+ u32 backlog;
+ u32 qlen;
+ struct list_head flowchain;
+ struct sk_buff *head;
+ struct sk_buff *tail;
+};
+
+struct fq_pie_sched_data {
+ struct tcf_proto __rcu *filter_list; /* optional external classifier */
+ struct tcf_block *block;
+ struct fq_pie_flow *flows;
+ struct Qdisc *sch;
+ struct list_head old_flows;
+ struct list_head new_flows;
+ struct pie_params p_params;
+ u32 ecn_prob;
+ u32 flows_cnt;
+ u32 quantum;
+ u32 memory_limit;
+ u32 new_flow_count;
+ u32 memory_usage;
+ u32 overmemory;
+ struct pie_stats stats;
+ struct timer_list adapt_timer;
+};
+
+static unsigned int fq_pie_hash(const struct fq_pie_sched_data *q,
+ struct sk_buff *skb)
+{
+ return reciprocal_scale(skb_get_hash(skb), q->flows_cnt);
+}
+
+static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct tcf_proto *filter;
+ struct tcf_result res;
+ int result;
+
+ if (TC_H_MAJ(skb->priority) == sch->handle &&
+ TC_H_MIN(skb->priority) > 0 &&
+ TC_H_MIN(skb->priority) <= q->flows_cnt)
+ return TC_H_MIN(skb->priority);
+
+ filter = rcu_dereference_bh(q->filter_list);
+ if (!filter)
+ return fq_pie_hash(q, skb) + 1;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ result = tcf_classify(skb, filter, &res, false);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ fallthrough;
+ case TC_ACT_SHOT:
+ return 0;
+ }
+#endif
+ if (TC_H_MIN(res.classid) <= q->flows_cnt)
+ return TC_H_MIN(res.classid);
+ }
+ return 0;
+}
+
+/* add skb to flow queue (tail add) */
+static inline void flow_queue_add(struct fq_pie_flow *flow,
+ struct sk_buff *skb)
+{
+ if (!flow->head)
+ flow->head = skb;
+ else
+ flow->tail->next = skb;
+ flow->tail = skb;
+ skb->next = NULL;
+}
+
+static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct fq_pie_flow *sel_flow;
+ int ret;
+ u8 memory_limited = false;
+ u8 enqueue = false;
+ u32 pkt_len;
+ u32 idx;
+
+ /* Classifies packet into corresponding flow */
+ idx = fq_pie_classify(skb, sch, &ret);
+ if (idx == 0) {
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return ret;
+ }
+ idx--;
+
+ sel_flow = &q->flows[idx];
+ /* Checks whether adding a new packet would exceed memory limit */
+ get_pie_cb(skb)->mem_usage = skb->truesize;
+ memory_limited = q->memory_usage > q->memory_limit + skb->truesize;
+
+ /* Checks if the qdisc is full */
+ if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
+ q->stats.overlimit++;
+ goto out;
+ } else if (unlikely(memory_limited)) {
+ q->overmemory++;
+ }
+
+ if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars,
+ sel_flow->backlog, skb->len)) {
+ enqueue = true;
+ } else if (q->p_params.ecn &&
+ sel_flow->vars.prob <= (MAX_PROB / 100) * q->ecn_prob &&
+ INET_ECN_set_ce(skb)) {
+ /* If packet is ecn capable, mark it if drop probability
+ * is lower than the parameter ecn_prob, else drop it.
+ */
+ q->stats.ecn_mark++;
+ enqueue = true;
+ }
+ if (enqueue) {
+ /* Set enqueue time only when dq_rate_estimator is disabled. */
+ if (!q->p_params.dq_rate_estimator)
+ pie_set_enqueue_time(skb);
+
+ pkt_len = qdisc_pkt_len(skb);
+ q->stats.packets_in++;
+ q->memory_usage += skb->truesize;
+ sch->qstats.backlog += pkt_len;
+ sch->q.qlen++;
+ flow_queue_add(sel_flow, skb);
+ if (list_empty(&sel_flow->flowchain)) {
+ list_add_tail(&sel_flow->flowchain, &q->new_flows);
+ q->new_flow_count++;
+ sel_flow->deficit = q->quantum;
+ sel_flow->qlen = 0;
+ sel_flow->backlog = 0;
+ }
+ sel_flow->qlen++;
+ sel_flow->backlog += pkt_len;
+ return NET_XMIT_SUCCESS;
+ }
+out:
+ q->stats.dropped++;
+ sel_flow->vars.accu_prob = 0;
+ __qdisc_drop(skb, to_free);
+ qdisc_qstats_drop(sch);
+ return NET_XMIT_CN;
+}
+
+static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = {
+ [TCA_FQ_PIE_LIMIT] = {.type = NLA_U32},
+ [TCA_FQ_PIE_FLOWS] = {.type = NLA_U32},
+ [TCA_FQ_PIE_TARGET] = {.type = NLA_U32},
+ [TCA_FQ_PIE_TUPDATE] = {.type = NLA_U32},
+ [TCA_FQ_PIE_ALPHA] = {.type = NLA_U32},
+ [TCA_FQ_PIE_BETA] = {.type = NLA_U32},
+ [TCA_FQ_PIE_QUANTUM] = {.type = NLA_U32},
+ [TCA_FQ_PIE_MEMORY_LIMIT] = {.type = NLA_U32},
+ [TCA_FQ_PIE_ECN_PROB] = {.type = NLA_U32},
+ [TCA_FQ_PIE_ECN] = {.type = NLA_U32},
+ [TCA_FQ_PIE_BYTEMODE] = {.type = NLA_U32},
+ [TCA_FQ_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32},
+};
+
+static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow)
+{
+ struct sk_buff *skb = flow->head;
+
+ flow->head = skb->next;
+ skb->next = NULL;
+ return skb;
+}
+
+static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb = NULL;
+ struct fq_pie_flow *flow;
+ struct list_head *head;
+ u32 pkt_len;
+
+begin:
+ head = &q->new_flows;
+ if (list_empty(head)) {
+ head = &q->old_flows;
+ if (list_empty(head))
+ return NULL;
+ }
+
+ flow = list_first_entry(head, struct fq_pie_flow, flowchain);
+ /* Flow has exhausted all its credits */
+ if (flow->deficit <= 0) {
+ flow->deficit += q->quantum;
+ list_move_tail(&flow->flowchain, &q->old_flows);
+ goto begin;
+ }
+
+ if (flow->head) {
+ skb = dequeue_head(flow);
+ pkt_len = qdisc_pkt_len(skb);
+ sch->qstats.backlog -= pkt_len;
+ sch->q.qlen--;
+ qdisc_bstats_update(sch, skb);
+ }
+
+ if (!skb) {
+ /* force a pass through old_flows to prevent starvation */
+ if (head == &q->new_flows && !list_empty(&q->old_flows))
+ list_move_tail(&flow->flowchain, &q->old_flows);
+ else
+ list_del_init(&flow->flowchain);
+ goto begin;
+ }
+
+ flow->qlen--;
+ flow->deficit -= pkt_len;
+ flow->backlog -= pkt_len;
+ q->memory_usage -= get_pie_cb(skb)->mem_usage;
+ pie_process_dequeue(skb, &q->p_params, &flow->vars, flow->backlog);
+ return skb;
+}
+
+static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_FQ_PIE_MAX + 1];
+ unsigned int len_dropped = 0;
+ unsigned int num_dropped = 0;
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack);
+ if (err < 0)
+ return err;
+
+ sch_tree_lock(sch);
+ if (tb[TCA_FQ_PIE_LIMIT]) {
+ u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]);
+
+ q->p_params.limit = limit;
+ sch->limit = limit;
+ }
+ if (tb[TCA_FQ_PIE_FLOWS]) {
+ if (q->flows) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Number of flows cannot be changed");
+ goto flow_error;
+ }
+ q->flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]);
+ if (!q->flows_cnt || q->flows_cnt > 65536) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Number of flows must range in [1..65536]");
+ goto flow_error;
+ }
+ }
+
+ /* convert from microseconds to pschedtime */
+ if (tb[TCA_FQ_PIE_TARGET]) {
+ /* target is in us */
+ u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]);
+
+ /* convert to pschedtime */
+ q->p_params.target =
+ PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
+ }
+
+ /* tupdate is in jiffies */
+ if (tb[TCA_FQ_PIE_TUPDATE])
+ q->p_params.tupdate =
+ usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE]));
+
+ if (tb[TCA_FQ_PIE_ALPHA])
+ q->p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]);
+
+ if (tb[TCA_FQ_PIE_BETA])
+ q->p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]);
+
+ if (tb[TCA_FQ_PIE_QUANTUM])
+ q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]);
+
+ if (tb[TCA_FQ_PIE_MEMORY_LIMIT])
+ q->memory_limit = nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]);
+
+ if (tb[TCA_FQ_PIE_ECN_PROB])
+ q->ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]);
+
+ if (tb[TCA_FQ_PIE_ECN])
+ q->p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]);
+
+ if (tb[TCA_FQ_PIE_BYTEMODE])
+ q->p_params.bytemode = nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]);
+
+ if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR])
+ q->p_params.dq_rate_estimator =
+ nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]);
+
+ /* Drop excess packets if new limit is lower */
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = fq_pie_qdisc_dequeue(sch);
+
+ len_dropped += qdisc_pkt_len(skb);
+ num_dropped += 1;
+ rtnl_kfree_skbs(skb, skb);
+ }
+ qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped);
+
+ sch_tree_unlock(sch);
+ return 0;
+
+flow_error:
+ sch_tree_unlock(sch);
+ return -EINVAL;
+}
+
+static void fq_pie_timer(struct timer_list *t)
+{
+ struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer);
+ struct Qdisc *sch = q->sch;
+ spinlock_t *root_lock; /* to lock qdisc for probability calculations */
+ u32 idx;
+
+ root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+ spin_lock(root_lock);
+
+ for (idx = 0; idx < q->flows_cnt; idx++)
+ pie_calculate_probability(&q->p_params, &q->flows[idx].vars,
+ q->flows[idx].backlog);
+
+ /* reset the timer to fire after 'tupdate' jiffies. */
+ if (q->p_params.tupdate)
+ mod_timer(&q->adapt_timer, jiffies + q->p_params.tupdate);
+
+ spin_unlock(root_lock);
+}
+
+static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ int err;
+ u32 idx;
+
+ pie_params_init(&q->p_params);
+ sch->limit = 10 * 1024;
+ q->p_params.limit = sch->limit;
+ q->quantum = psched_mtu(qdisc_dev(sch));
+ q->sch = sch;
+ q->ecn_prob = 10;
+ q->flows_cnt = 1024;
+ q->memory_limit = SZ_32M;
+
+ INIT_LIST_HEAD(&q->new_flows);
+ INIT_LIST_HEAD(&q->old_flows);
+ timer_setup(&q->adapt_timer, fq_pie_timer, 0);
+
+ if (opt) {
+ err = fq_pie_change(sch, opt, extack);
+
+ if (err)
+ return err;
+ }
+
+ err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ if (err)
+ goto init_failure;
+
+ q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_pie_flow),
+ GFP_KERNEL);
+ if (!q->flows) {
+ err = -ENOMEM;
+ goto init_failure;
+ }
+ for (idx = 0; idx < q->flows_cnt; idx++) {
+ struct fq_pie_flow *flow = q->flows + idx;
+
+ INIT_LIST_HEAD(&flow->flowchain);
+ pie_vars_init(&flow->vars);
+ }
+
+ mod_timer(&q->adapt_timer, jiffies + HZ / 2);
+
+ return 0;
+
+init_failure:
+ q->flows_cnt = 0;
+
+ return err;
+}
+
+static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (!opts)
+ return -EMSGSIZE;
+
+ /* convert target from pschedtime to us */
+ if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) ||
+ nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->flows_cnt) ||
+ nla_put_u32(skb, TCA_FQ_PIE_TARGET,
+ ((u32)PSCHED_TICKS2NS(q->p_params.target)) /
+ NSEC_PER_USEC) ||
+ nla_put_u32(skb, TCA_FQ_PIE_TUPDATE,
+ jiffies_to_usecs(q->p_params.tupdate)) ||
+ nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->p_params.alpha) ||
+ nla_put_u32(skb, TCA_FQ_PIE_BETA, q->p_params.beta) ||
+ nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) ||
+ nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, q->memory_limit) ||
+ nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->ecn_prob) ||
+ nla_put_u32(skb, TCA_FQ_PIE_ECN, q->p_params.ecn) ||
+ nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, q->p_params.bytemode) ||
+ nla_put_u32(skb, TCA_FQ_PIE_DQ_RATE_ESTIMATOR,
+ q->p_params.dq_rate_estimator))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct tc_fq_pie_xstats st = {
+ .packets_in = q->stats.packets_in,
+ .overlimit = q->stats.overlimit,
+ .overmemory = q->overmemory,
+ .dropped = q->stats.dropped,
+ .ecn_mark = q->stats.ecn_mark,
+ .new_flow_count = q->new_flow_count,
+ .memory_usage = q->memory_usage,
+ };
+ struct list_head *pos;
+
+ sch_tree_lock(sch);
+ list_for_each(pos, &q->new_flows)
+ st.new_flows_len++;
+
+ list_for_each(pos, &q->old_flows)
+ st.old_flows_len++;
+ sch_tree_unlock(sch);
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void fq_pie_reset(struct Qdisc *sch)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ u32 idx;
+
+ INIT_LIST_HEAD(&q->new_flows);
+ INIT_LIST_HEAD(&q->old_flows);
+ for (idx = 0; idx < q->flows_cnt; idx++) {
+ struct fq_pie_flow *flow = q->flows + idx;
+
+ /* Removes all packets from flow */
+ rtnl_kfree_skbs(flow->head, flow->tail);
+ flow->head = NULL;
+
+ INIT_LIST_HEAD(&flow->flowchain);
+ pie_vars_init(&flow->vars);
+ }
+
+ sch->q.qlen = 0;
+ sch->qstats.backlog = 0;
+}
+
+static void fq_pie_destroy(struct Qdisc *sch)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+
+ tcf_block_put(q->block);
+ q->p_params.tupdate = 0;
+ del_timer_sync(&q->adapt_timer);
+ kvfree(q->flows);
+}
+
+static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = {
+ .id = "fq_pie",
+ .priv_size = sizeof(struct fq_pie_sched_data),
+ .enqueue = fq_pie_qdisc_enqueue,
+ .dequeue = fq_pie_qdisc_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = fq_pie_init,
+ .destroy = fq_pie_destroy,
+ .reset = fq_pie_reset,
+ .change = fq_pie_change,
+ .dump = fq_pie_dump,
+ .dump_stats = fq_pie_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init fq_pie_module_init(void)
+{
+ return register_qdisc(&fq_pie_qdisc_ops);
+}
+
+static void __exit fq_pie_module_exit(void)
+{
+ unregister_qdisc(&fq_pie_qdisc_ops);
+}
+
+module_init(fq_pie_module_init);
+module_exit(fq_pie_module_exit);
+
+MODULE_DESCRIPTION("Flow Queue Proportional Integral controller Enhanced (FQ-PIE)");
+MODULE_AUTHOR("Mohit P. Tahiliani");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 9bc5cbe..5d5391a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -407,13 +407,8 @@
int packets;
while (qdisc_restart(q, &packets)) {
- /*
- * Ordered by possible occurrence: Postpone processing if
- * 1. we've exceeded packet quota
- * 2. another process needs the CPU;
- */
quota -= packets;
- if (quota <= 0 || need_resched()) {
+ if (quota <= 0) {
__netif_schedule(q);
break;
}
@@ -471,7 +466,7 @@
trace_net_dev_xmit_timeout(dev, i);
WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
dev->name, netdev_drivername(dev), i);
- dev->netdev_ops->ndo_tx_timeout(dev);
+ dev->netdev_ops->ndo_tx_timeout(dev, i);
}
if (!mod_timer(&dev->watchdog_timer,
round_jiffies(jiffies +
@@ -844,13 +839,15 @@
};
EXPORT_SYMBOL(pfifo_fast_ops);
+static struct lock_class_key qdisc_tx_busylock;
+static struct lock_class_key qdisc_running_key;
+
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
const struct Qdisc_ops *ops,
struct netlink_ext_ack *extack)
{
- void *p;
struct Qdisc *sch;
- unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
+ unsigned int size = sizeof(*sch) + ops->priv_size;
int err = -ENOBUFS;
struct net_device *dev;
@@ -861,22 +858,10 @@
}
dev = dev_queue->dev;
- p = kzalloc_node(size, GFP_KERNEL,
- netdev_queue_numa_node_read(dev_queue));
+ sch = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue));
- if (!p)
+ if (!sch)
goto errout;
- sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
- /* if we got non aligned memory, ask more and do alignment ourself */
- if (sch != p) {
- kfree(p);
- p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
- netdev_queue_numa_node_read(dev_queue));
- if (!p)
- goto errout;
- sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
- sch->padded = (char *) sch - (char *) p;
- }
__skb_queue_head_init(&sch->gso_skb);
__skb_queue_head_init(&sch->skb_bad_txq);
qdisc_skb_head_init(&sch->q);
@@ -896,9 +881,17 @@
}
spin_lock_init(&sch->busylock);
+ lockdep_set_class(&sch->busylock,
+ dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
+
/* seqlock has the same scope of busylock, for NOLOCK qdisc */
spin_lock_init(&sch->seqlock);
+ lockdep_set_class(&sch->seqlock,
+ dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
+
seqcount_init(&sch->running);
+ lockdep_set_class(&sch->running,
+ dev->qdisc_running_key ?: &qdisc_running_key);
sch->ops = ops;
sch->flags = ops->static_flags;
@@ -909,15 +902,9 @@
dev_hold(dev);
refcount_set(&sch->refcnt, 1);
- if (sch != &noop_qdisc) {
- lockdep_set_class(&sch->busylock, &dev->qdisc_tx_busylock_key);
- lockdep_set_class(&sch->seqlock, &dev->qdisc_tx_busylock_key);
- lockdep_set_class(&sch->running, &dev->qdisc_running_key);
- }
-
return sch;
errout1:
- kfree(p);
+ kfree(sch);
errout:
return ERR_PTR(err);
}
@@ -941,8 +928,10 @@
}
sch->parent = parentid;
- if (!ops->init || ops->init(sch, NULL, extack) == 0)
+ if (!ops->init || ops->init(sch, NULL, extack) == 0) {
+ trace_qdisc_create(ops, dev_queue->dev, parentid);
return sch;
+ }
qdisc_put(sch);
return NULL;
@@ -956,6 +945,8 @@
const struct Qdisc_ops *ops = qdisc->ops;
struct sk_buff *skb, *tmp;
+ trace_qdisc_reset(qdisc);
+
if (ops->reset)
ops->reset(qdisc);
@@ -981,7 +972,7 @@
free_percpu(qdisc->cpu_qstats);
}
- kfree((char *) qdisc - qdisc->padded);
+ kfree(qdisc);
}
static void qdisc_free_cb(struct rcu_head *head)
@@ -994,7 +985,6 @@
static void qdisc_destroy(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;
- struct sk_buff *skb, *tmp;
#ifdef CONFIG_NET_SCHED
qdisc_hash_del(qdisc);
@@ -1002,23 +992,16 @@
qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
gen_kill_estimator(&qdisc->rate_est);
- if (ops->reset)
- ops->reset(qdisc);
+
+ qdisc_reset(qdisc);
+
if (ops->destroy)
ops->destroy(qdisc);
module_put(ops->owner);
dev_put(qdisc_dev(qdisc));
- skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
- __skb_unlink(skb, &qdisc->gso_skb);
- kfree_skb_list(skb);
- }
-
- skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
- __skb_unlink(skb, &qdisc->skb_bad_txq);
- kfree_skb_list(skb);
- }
+ trace_qdisc_destroy(qdisc);
call_rcu(&qdisc->rcu, qdisc_free_cb);
}
@@ -1087,10 +1070,9 @@
ops = &pfifo_fast_ops;
qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
- if (!qdisc) {
- netdev_info(dev, "activation failed\n");
+ if (!qdisc)
return;
- }
+
if (!netif_is_multiqueue(dev))
qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
dev_queue->qdisc_sleeping = qdisc;
@@ -1106,18 +1088,33 @@
if (!netif_is_multiqueue(dev) ||
dev->priv_flags & IFF_NO_QUEUE) {
netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
- dev->qdisc = txq->qdisc_sleeping;
- qdisc_refcount_inc(dev->qdisc);
+ qdisc = txq->qdisc_sleeping;
+ rcu_assign_pointer(dev->qdisc, qdisc);
+ qdisc_refcount_inc(qdisc);
} else {
qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL);
if (qdisc) {
- dev->qdisc = qdisc;
+ rcu_assign_pointer(dev->qdisc, qdisc);
qdisc->ops->attach(qdisc);
}
}
+ qdisc = rtnl_dereference(dev->qdisc);
+
+ /* Detect default qdisc setup/init failed and fallback to "noqueue" */
+ if (qdisc == &noop_qdisc) {
+ netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n",
+ default_qdisc_ops->id, noqueue_qdisc_ops.id);
+ dev->priv_flags |= IFF_NO_QUEUE;
+ netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
+ qdisc = txq->qdisc_sleeping;
+ rcu_assign_pointer(dev->qdisc, qdisc);
+ qdisc_refcount_inc(qdisc);
+ dev->priv_flags ^= IFF_NO_QUEUE;
+ }
+
#ifdef CONFIG_NET_SCHED
- if (dev->qdisc != &noop_qdisc)
- qdisc_hash_add(dev->qdisc, false);
+ if (qdisc != &noop_qdisc)
+ qdisc_hash_add(qdisc, false);
#endif
}
@@ -1147,7 +1144,7 @@
* and noqueue_qdisc for virtual interfaces
*/
- if (dev->qdisc == &noop_qdisc)
+ if (rtnl_dereference(dev->qdisc) == &noop_qdisc)
attach_default_qdiscs(dev);
if (!netif_carrier_ok(dev))
@@ -1166,17 +1163,24 @@
}
EXPORT_SYMBOL(dev_activate);
+static void qdisc_deactivate(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN)
+ return;
+
+ set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
+}
+
static void dev_deactivate_queue(struct net_device *dev,
struct netdev_queue *dev_queue,
void *_qdisc_default)
{
- struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc);
struct Qdisc *qdisc_default = _qdisc_default;
+ struct Qdisc *qdisc;
+ qdisc = rtnl_dereference(dev_queue->qdisc);
if (qdisc) {
- if (!(qdisc->flags & TCQ_F_BUILTIN))
- set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
-
+ qdisc_deactivate(qdisc);
rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
}
}
@@ -1234,16 +1238,6 @@
return false;
}
-static void dev_qdisc_reset(struct net_device *dev,
- struct netdev_queue *dev_queue,
- void *none)
-{
- struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
-
- if (qdisc)
- qdisc_reset(qdisc);
-}
-
/**
* dev_deactivate_many - deactivate transmissions on several devices
* @head: list of devices to deactivate
@@ -1281,14 +1275,13 @@
/* Wait for outstanding qdisc_run calls. */
list_for_each_entry(dev, head, close_list) {
- while (some_qdisc_is_busy(dev))
- yield();
- /* The new qdisc is assigned at this point so we can safely
- * unwind stale skb lists and qdisc statistics
- */
- netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL);
- if (dev_ingress_queue(dev))
- dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL);
+ while (some_qdisc_is_busy(dev)) {
+ /* wait_event() would avoid this sleep-loop but would
+ * require expensive checks in the fast paths of packet
+ * processing which isn't worth it.
+ */
+ schedule_timeout_uninterruptible(1);
+ }
}
}
@@ -1313,6 +1306,15 @@
return 0;
}
+void dev_qdisc_change_real_num_tx(struct net_device *dev,
+ unsigned int new_real_tx)
+{
+ struct Qdisc *qdisc = rtnl_dereference(dev->qdisc);
+
+ if (qdisc->ops->change_real_num_tx)
+ qdisc->ops->change_real_num_tx(qdisc, new_real_tx);
+}
+
int dev_qdisc_change_tx_queue_len(struct net_device *dev)
{
bool up = dev->flags & IFF_UP;
@@ -1347,7 +1349,7 @@
void dev_init_scheduler(struct net_device *dev)
{
- dev->qdisc = &noop_qdisc;
+ rcu_assign_pointer(dev->qdisc, &noop_qdisc);
netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
if (dev_ingress_queue(dev))
dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
@@ -1375,8 +1377,8 @@
netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
if (dev_ingress_queue(dev))
shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
- qdisc_put(dev->qdisc);
- dev->qdisc = &noop_qdisc;
+ qdisc_put(rtnl_dereference(dev->qdisc));
+ rcu_assign_pointer(dev->qdisc, &noop_qdisc);
WARN_ON(timer_pending(&dev->watchdog_timer));
}
@@ -1387,6 +1389,7 @@
{
memset(r, 0, sizeof(*r));
r->overhead = conf->overhead;
+ r->mpu = conf->mpu;
r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
r->mult = 1;
@@ -1458,6 +1461,14 @@
}
EXPORT_SYMBOL(mini_qdisc_pair_swap);
+void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp,
+ struct tcf_block *block)
+{
+ miniqp->miniq1.block = block;
+ miniqp->miniq2.block = block;
+}
+EXPORT_SYMBOL(mini_qdisc_pair_block_init);
+
void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
struct mini_Qdisc __rcu **p_miniq)
{
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 433f219..d1902fc 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1136,7 +1136,7 @@
case TC_ACT_STOLEN:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return NULL;
}
@@ -1533,7 +1533,7 @@
{
unsigned int len = qdisc_pkt_len(skb);
struct hfsc_class *cl;
- int uninitialized_var(err);
+ int err;
bool first;
cl = hfsc_classify(skb, sch, &err);
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index be35f03..420ede8 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -721,3 +721,4 @@
MODULE_AUTHOR("Terry Lam");
MODULE_AUTHOR("Nandita Dukkipati");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Heavy-Hitter Filter (HHF)");
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 8184c87..cd70dbc 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -239,7 +239,7 @@
case TC_ACT_STOLEN:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return NULL;
}
@@ -579,7 +579,7 @@
static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
- int uninitialized_var(ret);
+ int ret;
unsigned int len = qdisc_pkt_len(skb);
struct htb_sched *q = qdisc_priv(sch);
struct htb_class *cl = htb_classify(skb, sch, &ret);
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index bf56aa5..8483812 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -78,6 +78,7 @@
{
struct ingress_sched_data *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
+ int err;
net_inc_ingress_queue();
@@ -87,7 +88,13 @@
q->block_info.chain_head_change = clsact_chain_head_change;
q->block_info.chain_head_change_priv = &q->miniqp;
- return tcf_block_get_ext(&q->block, sch, &q->block_info, extack);
+ err = tcf_block_get_ext(&q->block, sch, &q->block_info, extack);
+ if (err)
+ return err;
+
+ mini_qdisc_pair_block_init(&q->miniqp, q->block);
+
+ return 0;
}
static void ingress_destroy(struct Qdisc *sch)
@@ -226,6 +233,8 @@
if (err)
return err;
+ mini_qdisc_pair_block_init(&q->miniqp_ingress, q->ingress_block);
+
mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress);
q->egress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS;
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index e79f1af..db18d8a 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -125,6 +125,29 @@
priv->qdiscs = NULL;
}
+static void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx)
+{
+#ifdef CONFIG_NET_SCHED
+ struct net_device *dev = qdisc_dev(sch);
+ struct Qdisc *qdisc;
+ unsigned int i;
+
+ for (i = new_real_tx; i < dev->real_num_tx_queues; i++) {
+ qdisc = netdev_get_tx_queue(dev, i)->qdisc_sleeping;
+ /* Only update the default qdiscs we created,
+ * qdiscs with handles are always hashed.
+ */
+ if (qdisc != &noop_qdisc && !qdisc->handle)
+ qdisc_hash_del(qdisc);
+ }
+ for (i = dev->real_num_tx_queues; i < new_real_tx; i++) {
+ qdisc = netdev_get_tx_queue(dev, i)->qdisc_sleeping;
+ if (qdisc != &noop_qdisc && !qdisc->handle)
+ qdisc_hash_add(qdisc, false);
+ }
+#endif
+}
+
static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct net_device *dev = qdisc_dev(sch);
@@ -288,6 +311,7 @@
.init = mq_init,
.destroy = mq_destroy,
.attach = mq_attach,
+ .change_real_num_tx = mq_change_real_num_tx,
.dump = mq_dump,
.owner = THIS_MODULE,
};
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 8766ab5..50e15ad 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -306,6 +306,28 @@
priv->qdiscs = NULL;
}
+static void mqprio_change_real_num_tx(struct Qdisc *sch,
+ unsigned int new_real_tx)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct Qdisc *qdisc;
+ unsigned int i;
+
+ for (i = new_real_tx; i < dev->real_num_tx_queues; i++) {
+ qdisc = netdev_get_tx_queue(dev, i)->qdisc_sleeping;
+ /* Only update the default qdiscs we created,
+ * qdiscs with handles are always hashed.
+ */
+ if (qdisc != &noop_qdisc && !qdisc->handle)
+ qdisc_hash_del(qdisc);
+ }
+ for (i = dev->real_num_tx_queues; i < new_real_tx; i++) {
+ qdisc = netdev_get_tx_queue(dev, i)->qdisc_sleeping;
+ if (qdisc != &noop_qdisc && !qdisc->handle)
+ qdisc_hash_add(qdisc, false);
+ }
+}
+
static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch,
unsigned long cl)
{
@@ -529,22 +551,28 @@
for (i = tc.offset; i < tc.offset + tc.count; i++) {
struct netdev_queue *q = netdev_get_tx_queue(dev, i);
struct Qdisc *qdisc = rtnl_dereference(q->qdisc);
- struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
- struct gnet_stats_queue __percpu *cpu_qstats = NULL;
spin_lock_bh(qdisc_lock(qdisc));
- if (qdisc_is_percpu_stats(qdisc)) {
- cpu_bstats = qdisc->cpu_bstats;
- cpu_qstats = qdisc->cpu_qstats;
- }
- qlen = qdisc_qlen_sum(qdisc);
- __gnet_stats_copy_basic(NULL, &sch->bstats,
- cpu_bstats, &qdisc->bstats);
- __gnet_stats_copy_queue(&sch->qstats,
- cpu_qstats,
- &qdisc->qstats,
- qlen);
+ if (qdisc_is_percpu_stats(qdisc)) {
+ qlen = qdisc_qlen_sum(qdisc);
+
+ __gnet_stats_copy_basic(NULL, &bstats,
+ qdisc->cpu_bstats,
+ &qdisc->bstats);
+ __gnet_stats_copy_queue(&qstats,
+ qdisc->cpu_qstats,
+ &qdisc->qstats,
+ qlen);
+ } else {
+ qlen += qdisc->q.qlen;
+ bstats.bytes += qdisc->bstats.bytes;
+ bstats.packets += qdisc->bstats.packets;
+ qstats.backlog += qdisc->qstats.backlog;
+ qstats.drops += qdisc->qstats.drops;
+ qstats.requeues += qdisc->qstats.requeues;
+ qstats.overlimits += qdisc->qstats.overlimits;
+ }
spin_unlock_bh(qdisc_lock(qdisc));
}
@@ -623,6 +651,7 @@
.init = mqprio_init,
.destroy = mqprio_destroy,
.attach = mqprio_attach,
+ .change_real_num_tx = mqprio_change_real_num_tx,
.dump = mqprio_dump,
.owner = THIS_MODULE,
};
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 1330ad2..5c27b42 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -43,7 +43,7 @@
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return NULL;
}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index f4101a9..0c345e4 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -66,7 +66,7 @@
struct disttable {
u32 size;
- s16 table[0];
+ s16 table[];
};
struct netem_sched_data {
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index df98a88..c65077f 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -19,134 +19,68 @@
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
-
-#define QUEUE_THRESHOLD 16384
-#define DQCOUNT_INVALID -1
-#define MAX_PROB 0xffffffffffffffff
-#define PIE_SCALE 8
-
-/* parameters used */
-struct pie_params {
- psched_time_t target; /* user specified target delay in pschedtime */
- u32 tupdate; /* timer frequency (in jiffies) */
- u32 limit; /* number of packets that can be enqueued */
- u32 alpha; /* alpha and beta are between 0 and 32 */
- u32 beta; /* and are used for shift relative to 1 */
- bool ecn; /* true if ecn is enabled */
- bool bytemode; /* to scale drop early prob based on pkt size */
-};
-
-/* variables used */
-struct pie_vars {
- u64 prob; /* probability but scaled by u64 limit. */
- psched_time_t burst_time;
- psched_time_t qdelay;
- psched_time_t qdelay_old;
- u64 dq_count; /* measured in bytes */
- psched_time_t dq_tstamp; /* drain rate */
- u64 accu_prob; /* accumulated drop probability */
- u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */
- u32 qlen_old; /* in bytes */
- u8 accu_prob_overflows; /* overflows of accu_prob */
-};
-
-/* statistics gathering */
-struct pie_stats {
- u32 packets_in; /* total number of packets enqueued */
- u32 dropped; /* packets dropped due to pie_action */
- u32 overlimit; /* dropped due to lack of space in queue */
- u32 maxq; /* maximum queue size */
- u32 ecn_mark; /* packets marked with ECN */
-};
+#include <net/pie.h>
/* private data for the Qdisc */
struct pie_sched_data {
- struct pie_params params;
struct pie_vars vars;
+ struct pie_params params;
struct pie_stats stats;
struct timer_list adapt_timer;
struct Qdisc *sch;
};
-static void pie_params_init(struct pie_params *params)
+bool pie_drop_early(struct Qdisc *sch, struct pie_params *params,
+ struct pie_vars *vars, u32 backlog, u32 packet_size)
{
- params->alpha = 2;
- params->beta = 20;
- params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */
- params->limit = 1000; /* default of 1000 packets */
- params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */
- params->ecn = false;
- params->bytemode = false;
-}
-
-static void pie_vars_init(struct pie_vars *vars)
-{
- vars->dq_count = DQCOUNT_INVALID;
- vars->accu_prob = 0;
- vars->avg_dq_rate = 0;
- /* default of 150 ms in pschedtime */
- vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC);
- vars->accu_prob_overflows = 0;
-}
-
-static bool drop_early(struct Qdisc *sch, u32 packet_size)
-{
- struct pie_sched_data *q = qdisc_priv(sch);
u64 rnd;
- u64 local_prob = q->vars.prob;
+ u64 local_prob = vars->prob;
u32 mtu = psched_mtu(qdisc_dev(sch));
/* If there is still burst allowance left skip random early drop */
- if (q->vars.burst_time > 0)
+ if (vars->burst_time > 0)
return false;
/* If current delay is less than half of target, and
* if drop prob is low already, disable early_drop
*/
- if ((q->vars.qdelay < q->params.target / 2) &&
- (q->vars.prob < MAX_PROB / 5))
+ if ((vars->qdelay < params->target / 2) &&
+ (vars->prob < MAX_PROB / 5))
return false;
- /* If we have fewer than 2 mtu-sized packets, disable drop_early,
+ /* If we have fewer than 2 mtu-sized packets, disable pie_drop_early,
* similar to min_th in RED
*/
- if (sch->qstats.backlog < 2 * mtu)
+ if (backlog < 2 * mtu)
return false;
/* If bytemode is turned on, use packet size to compute new
* probablity. Smaller packets will have lower drop prob in this case
*/
- if (q->params.bytemode && packet_size <= mtu)
+ if (params->bytemode && packet_size <= mtu)
local_prob = (u64)packet_size * div_u64(local_prob, mtu);
else
- local_prob = q->vars.prob;
+ local_prob = vars->prob;
- if (local_prob == 0) {
- q->vars.accu_prob = 0;
- q->vars.accu_prob_overflows = 0;
- }
+ if (local_prob == 0)
+ vars->accu_prob = 0;
+ else
+ vars->accu_prob += local_prob;
- if (local_prob > MAX_PROB - q->vars.accu_prob)
- q->vars.accu_prob_overflows++;
-
- q->vars.accu_prob += local_prob;
-
- if (q->vars.accu_prob_overflows == 0 &&
- q->vars.accu_prob < (MAX_PROB / 100) * 85)
+ if (vars->accu_prob < (MAX_PROB / 100) * 85)
return false;
- if (q->vars.accu_prob_overflows == 8 &&
- q->vars.accu_prob >= MAX_PROB / 2)
+ if (vars->accu_prob >= (MAX_PROB / 2) * 17)
return true;
prandom_bytes(&rnd, 8);
- if (rnd < local_prob) {
- q->vars.accu_prob = 0;
- q->vars.accu_prob_overflows = 0;
+ if ((rnd >> BITS_PER_BYTE) < local_prob) {
+ vars->accu_prob = 0;
return true;
}
return false;
}
+EXPORT_SYMBOL_GPL(pie_drop_early);
static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
@@ -159,7 +93,8 @@
goto out;
}
- if (!drop_early(sch, skb->len)) {
+ if (!pie_drop_early(sch, &q->params, &q->vars, sch->qstats.backlog,
+ skb->len)) {
enqueue = true;
} else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) &&
INET_ECN_set_ce(skb)) {
@@ -172,6 +107,10 @@
/* we can enqueue the packet */
if (enqueue) {
+ /* Set enqueue time only when dq_rate_estimator is disabled. */
+ if (!q->params.dq_rate_estimator)
+ pie_set_enqueue_time(skb);
+
q->stats.packets_in++;
if (qdisc_qlen(sch) > q->stats.maxq)
q->stats.maxq = qdisc_qlen(sch);
@@ -182,18 +121,18 @@
out:
q->stats.dropped++;
q->vars.accu_prob = 0;
- q->vars.accu_prob_overflows = 0;
return qdisc_drop(skb, sch, to_free);
}
static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
- [TCA_PIE_TARGET] = {.type = NLA_U32},
- [TCA_PIE_LIMIT] = {.type = NLA_U32},
- [TCA_PIE_TUPDATE] = {.type = NLA_U32},
- [TCA_PIE_ALPHA] = {.type = NLA_U32},
- [TCA_PIE_BETA] = {.type = NLA_U32},
- [TCA_PIE_ECN] = {.type = NLA_U32},
- [TCA_PIE_BYTEMODE] = {.type = NLA_U32},
+ [TCA_PIE_TARGET] = {.type = NLA_U32},
+ [TCA_PIE_LIMIT] = {.type = NLA_U32},
+ [TCA_PIE_TUPDATE] = {.type = NLA_U32},
+ [TCA_PIE_ALPHA] = {.type = NLA_U32},
+ [TCA_PIE_BETA] = {.type = NLA_U32},
+ [TCA_PIE_ECN] = {.type = NLA_U32},
+ [TCA_PIE_BYTEMODE] = {.type = NLA_U32},
+ [TCA_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32},
};
static int pie_change(struct Qdisc *sch, struct nlattr *opt,
@@ -247,6 +186,10 @@
if (tb[TCA_PIE_BYTEMODE])
q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]);
+ if (tb[TCA_PIE_DQ_RATE_ESTIMATOR])
+ q->params.dq_rate_estimator =
+ nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR]);
+
/* Drop excess packets if new limit is lower */
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
@@ -262,94 +205,126 @@
return 0;
}
-static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
+void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params,
+ struct pie_vars *vars, u32 backlog)
{
- struct pie_sched_data *q = qdisc_priv(sch);
- int qlen = sch->qstats.backlog; /* current queue size in bytes */
+ psched_time_t now = psched_get_time();
+ u32 dtime = 0;
+
+ /* If dq_rate_estimator is disabled, calculate qdelay using the
+ * packet timestamp.
+ */
+ if (!params->dq_rate_estimator) {
+ vars->qdelay = now - pie_get_enqueue_time(skb);
+
+ if (vars->dq_tstamp != DTIME_INVALID)
+ dtime = now - vars->dq_tstamp;
+
+ vars->dq_tstamp = now;
+
+ if (backlog == 0)
+ vars->qdelay = 0;
+
+ if (dtime == 0)
+ return;
+
+ goto burst_allowance_reduction;
+ }
/* If current queue is about 10 packets or more and dq_count is unset
* we have enough packets to calculate the drain rate. Save
* current time as dq_tstamp and start measurement cycle.
*/
- if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) {
- q->vars.dq_tstamp = psched_get_time();
- q->vars.dq_count = 0;
+ if (backlog >= QUEUE_THRESHOLD && vars->dq_count == DQCOUNT_INVALID) {
+ vars->dq_tstamp = psched_get_time();
+ vars->dq_count = 0;
}
- /* Calculate the average drain rate from this value. If queue length
- * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset
+ /* Calculate the average drain rate from this value. If queue length
+ * has receded to a small value viz., <= QUEUE_THRESHOLD bytes, reset
* the dq_count to -1 as we don't have enough packets to calculate the
- * drain rate anymore The following if block is entered only when we
+ * drain rate anymore. The following if block is entered only when we
* have a substantial queue built up (QUEUE_THRESHOLD bytes or more)
* and we calculate the drain rate for the threshold here. dq_count is
* in bytes, time difference in psched_time, hence rate is in
* bytes/psched_time.
*/
- if (q->vars.dq_count != DQCOUNT_INVALID) {
- q->vars.dq_count += skb->len;
+ if (vars->dq_count != DQCOUNT_INVALID) {
+ vars->dq_count += skb->len;
- if (q->vars.dq_count >= QUEUE_THRESHOLD) {
- psched_time_t now = psched_get_time();
- u32 dtime = now - q->vars.dq_tstamp;
- u32 count = q->vars.dq_count << PIE_SCALE;
+ if (vars->dq_count >= QUEUE_THRESHOLD) {
+ u32 count = vars->dq_count << PIE_SCALE;
+
+ dtime = now - vars->dq_tstamp;
if (dtime == 0)
return;
count = count / dtime;
- if (q->vars.avg_dq_rate == 0)
- q->vars.avg_dq_rate = count;
+ if (vars->avg_dq_rate == 0)
+ vars->avg_dq_rate = count;
else
- q->vars.avg_dq_rate =
- (q->vars.avg_dq_rate -
- (q->vars.avg_dq_rate >> 3)) + (count >> 3);
+ vars->avg_dq_rate =
+ (vars->avg_dq_rate -
+ (vars->avg_dq_rate >> 3)) + (count >> 3);
/* If the queue has receded below the threshold, we hold
* on to the last drain rate calculated, else we reset
* dq_count to 0 to re-enter the if block when the next
* packet is dequeued
*/
- if (qlen < QUEUE_THRESHOLD) {
- q->vars.dq_count = DQCOUNT_INVALID;
+ if (backlog < QUEUE_THRESHOLD) {
+ vars->dq_count = DQCOUNT_INVALID;
} else {
- q->vars.dq_count = 0;
- q->vars.dq_tstamp = psched_get_time();
+ vars->dq_count = 0;
+ vars->dq_tstamp = psched_get_time();
}
- if (q->vars.burst_time > 0) {
- if (q->vars.burst_time > dtime)
- q->vars.burst_time -= dtime;
- else
- q->vars.burst_time = 0;
- }
+ goto burst_allowance_reduction;
}
}
-}
-static void calculate_probability(struct Qdisc *sch)
+ return;
+
+burst_allowance_reduction:
+ if (vars->burst_time > 0) {
+ if (vars->burst_time > dtime)
+ vars->burst_time -= dtime;
+ else
+ vars->burst_time = 0;
+ }
+}
+EXPORT_SYMBOL_GPL(pie_process_dequeue);
+
+void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
+ u32 backlog)
{
- struct pie_sched_data *q = qdisc_priv(sch);
- u32 qlen = sch->qstats.backlog; /* queue size in bytes */
psched_time_t qdelay = 0; /* in pschedtime */
- psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */
+ psched_time_t qdelay_old = 0; /* in pschedtime */
s64 delta = 0; /* determines the change in probability */
u64 oldprob;
u64 alpha, beta;
u32 power;
bool update_prob = true;
- q->vars.qdelay_old = q->vars.qdelay;
+ if (params->dq_rate_estimator) {
+ qdelay_old = vars->qdelay;
+ vars->qdelay_old = vars->qdelay;
- if (q->vars.avg_dq_rate > 0)
- qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate;
- else
- qdelay = 0;
+ if (vars->avg_dq_rate > 0)
+ qdelay = (backlog << PIE_SCALE) / vars->avg_dq_rate;
+ else
+ qdelay = 0;
+ } else {
+ qdelay = vars->qdelay;
+ qdelay_old = vars->qdelay_old;
+ }
- /* If qdelay is zero and qlen is not, it means qlen is very small, less
- * than dequeue_rate, so we do not update probabilty in this round
+ /* If qdelay is zero and backlog is not, it means backlog is very small,
+ * so we do not update probabilty in this round.
*/
- if (qdelay == 0 && qlen != 0)
+ if (qdelay == 0 && backlog != 0)
update_prob = false;
/* In the algorithm, alpha and beta are between 0 and 2 with typical
@@ -359,18 +334,18 @@
* probability. alpha/beta are updated locally below by scaling down
* by 16 to come to 0-2 range.
*/
- alpha = ((u64)q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
- beta = ((u64)q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+ alpha = ((u64)params->alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+ beta = ((u64)params->beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
/* We scale alpha and beta differently depending on how heavy the
* congestion is. Please see RFC 8033 for details.
*/
- if (q->vars.prob < MAX_PROB / 10) {
+ if (vars->prob < MAX_PROB / 10) {
alpha >>= 1;
beta >>= 1;
power = 100;
- while (q->vars.prob < div_u64(MAX_PROB, power) &&
+ while (vars->prob < div_u64(MAX_PROB, power) &&
power <= 1000000) {
alpha >>= 2;
beta >>= 2;
@@ -379,14 +354,14 @@
}
/* alpha and beta should be between 0 and 32, in multiples of 1/16 */
- delta += alpha * (u64)(qdelay - q->params.target);
- delta += beta * (u64)(qdelay - qdelay_old);
+ delta += alpha * (qdelay - params->target);
+ delta += beta * (qdelay - qdelay_old);
- oldprob = q->vars.prob;
+ oldprob = vars->prob;
/* to ensure we increase probability in steps of no more than 2% */
if (delta > (s64)(MAX_PROB / (100 / 2)) &&
- q->vars.prob >= MAX_PROB / 10)
+ vars->prob >= MAX_PROB / 10)
delta = (MAX_PROB / 100) * 2;
/* Non-linear drop:
@@ -397,12 +372,12 @@
if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC)))
delta += MAX_PROB / (100 / 2);
- q->vars.prob += delta;
+ vars->prob += delta;
if (delta > 0) {
/* prevent overflow */
- if (q->vars.prob < oldprob) {
- q->vars.prob = MAX_PROB;
+ if (vars->prob < oldprob) {
+ vars->prob = MAX_PROB;
/* Prevent normalization error. If probability is at
* maximum value already, we normalize it here, and
* skip the check to do a non-linear drop in the next
@@ -412,8 +387,8 @@
}
} else {
/* prevent underflow */
- if (q->vars.prob > oldprob)
- q->vars.prob = 0;
+ if (vars->prob > oldprob)
+ vars->prob = 0;
}
/* Non-linear drop in probability: Reduce drop probability quickly if
@@ -422,23 +397,28 @@
if (qdelay == 0 && qdelay_old == 0 && update_prob)
/* Reduce drop probability to 98.4% */
- q->vars.prob -= q->vars.prob / 64u;
+ vars->prob -= vars->prob / 64;
- q->vars.qdelay = qdelay;
- q->vars.qlen_old = qlen;
+ vars->qdelay = qdelay;
+ vars->backlog_old = backlog;
/* We restart the measurement cycle if the following conditions are met
* 1. If the delay has been low for 2 consecutive Tupdate periods
* 2. Calculated drop probability is zero
- * 3. We have atleast one estimate for the avg_dq_rate ie.,
- * is a non-zero value
+ * 3. If average dq_rate_estimator is enabled, we have atleast one
+ * estimate for the avg_dq_rate ie., is a non-zero value
*/
- if ((q->vars.qdelay < q->params.target / 2) &&
- (q->vars.qdelay_old < q->params.target / 2) &&
- q->vars.prob == 0 &&
- q->vars.avg_dq_rate > 0)
- pie_vars_init(&q->vars);
+ if ((vars->qdelay < params->target / 2) &&
+ (vars->qdelay_old < params->target / 2) &&
+ vars->prob == 0 &&
+ (!params->dq_rate_estimator || vars->avg_dq_rate > 0)) {
+ pie_vars_init(vars);
+ }
+
+ if (!params->dq_rate_estimator)
+ vars->qdelay_old = qdelay;
}
+EXPORT_SYMBOL_GPL(pie_calculate_probability);
static void pie_timer(struct timer_list *t)
{
@@ -447,7 +427,7 @@
spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
spin_lock(root_lock);
- calculate_probability(sch);
+ pie_calculate_probability(&q->params, &q->vars, sch->qstats.backlog);
/* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */
if (q->params.tupdate)
@@ -497,7 +477,9 @@
nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) ||
nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) ||
nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) ||
- nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode))
+ nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode) ||
+ nla_put_u32(skb, TCA_PIE_DQ_RATE_ESTIMATOR,
+ q->params.dq_rate_estimator))
goto nla_put_failure;
return nla_nest_end(skb, opts);
@@ -511,12 +493,9 @@
{
struct pie_sched_data *q = qdisc_priv(sch);
struct tc_pie_xstats st = {
- .prob = q->vars.prob,
+ .prob = q->vars.prob << BITS_PER_BYTE,
.delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) /
NSEC_PER_USEC,
- /* unscale and return dq_rate in bytes per sec */
- .avg_dq_rate = q->vars.avg_dq_rate *
- (PSCHED_TICKS_PER_SEC) >> PIE_SCALE,
.packets_in = q->stats.packets_in,
.overlimit = q->stats.overlimit,
.maxq = q->stats.maxq,
@@ -524,17 +503,26 @@
.ecn_mark = q->stats.ecn_mark,
};
+ /* avg_dq_rate is only valid if dq_rate_estimator is enabled */
+ st.dq_rate_estimating = q->params.dq_rate_estimator;
+
+ /* unscale and return dq_rate in bytes per sec */
+ if (q->params.dq_rate_estimator)
+ st.avg_dq_rate = q->vars.avg_dq_rate *
+ (PSCHED_TICKS_PER_SEC) >> PIE_SCALE;
+
return gnet_stats_copy_app(d, &st, sizeof(st));
}
static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
{
+ struct pie_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb = qdisc_dequeue_head(sch);
if (!skb)
return NULL;
- pie_process_dequeue(sch, skb);
+ pie_process_dequeue(skb, &q->params, &q->vars, sch->qstats.backlog);
return skb;
}
@@ -555,7 +543,7 @@
}
static struct Qdisc_ops pie_qdisc_ops __read_mostly = {
- .id = "pie",
+ .id = "pie",
.priv_size = sizeof(struct pie_sched_data),
.enqueue = pie_qdisc_enqueue,
.dequeue = pie_qdisc_dequeue,
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 6479417..3eabb87 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -46,7 +46,7 @@
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return NULL;
}
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index b046fd3..af8c63a 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -697,7 +697,7 @@
case TC_ACT_STOLEN:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return NULL;
}
@@ -1421,10 +1421,8 @@
if (err < 0)
return err;
- if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES)
- max_classes = QFQ_MAX_AGG_CLASSES;
- else
- max_classes = qdisc_dev(sch)->tx_queue_len + 1;
+ max_classes = min_t(u64, (u64)qdisc_dev(sch)->tx_queue_len + 1,
+ QFQ_MAX_AGG_CLASSES);
/* max_cl_shift = floor(log_2(max_classes)) */
max_cl_shift = __fls(max_classes);
q->max_agg_classes = 1<<max_cl_shift;
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 7741f10..40adf1f 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -35,15 +35,23 @@
struct red_sched_data {
u32 limit; /* HARD maximal queue length */
+
unsigned char flags;
+ /* Non-flags in tc_red_qopt.flags. */
+ unsigned char userbits;
+
struct timer_list adapt_timer;
struct Qdisc *sch;
struct red_parms parms;
struct red_vars vars;
struct red_stats stats;
struct Qdisc *qdisc;
+ struct tcf_qevent qe_early_drop;
+ struct tcf_qevent qe_mark;
};
+#define TC_RED_SUPPORTED_FLAGS (TC_RED_HISTORIC_FLAGS | TC_RED_NODROP)
+
static inline int red_use_ecn(struct red_sched_data *q)
{
return q->flags & TC_RED_ECN;
@@ -54,6 +62,11 @@
return q->flags & TC_RED_HARDDROP;
}
+static int red_use_nodrop(struct red_sched_data *q)
+{
+ return q->flags & TC_RED_NODROP;
+}
+
static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
@@ -74,23 +87,42 @@
case RED_PROB_MARK:
qdisc_qstats_overlimit(sch);
- if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
+ if (!red_use_ecn(q)) {
q->stats.prob_drop++;
goto congestion_drop;
}
- q->stats.prob_mark++;
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.prob_mark++;
+ skb = tcf_qevent_handle(&q->qe_mark, sch, skb, to_free, &ret);
+ if (!skb)
+ return NET_XMIT_CN | ret;
+ } else if (!red_use_nodrop(q)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ /* Non-ECT packet in ECN nodrop mode: queue it. */
break;
case RED_HARD_MARK:
qdisc_qstats_overlimit(sch);
- if (red_use_harddrop(q) || !red_use_ecn(q) ||
- !INET_ECN_set_ce(skb)) {
+ if (red_use_harddrop(q) || !red_use_ecn(q)) {
q->stats.forced_drop++;
goto congestion_drop;
}
- q->stats.forced_mark++;
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.forced_mark++;
+ skb = tcf_qevent_handle(&q->qe_mark, sch, skb, to_free, &ret);
+ if (!skb)
+ return NET_XMIT_CN | ret;
+ } else if (!red_use_nodrop(q)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+
+ /* Non-ECT packet in ECN nodrop mode: queue it. */
break;
}
@@ -105,6 +137,10 @@
return ret;
congestion_drop:
+ skb = tcf_qevent_handle(&q->qe_early_drop, sch, skb, to_free, &ret);
+ if (!skb)
+ return NET_XMIT_CN | ret;
+
qdisc_drop(skb, sch, to_free);
return NET_XMIT_CN;
}
@@ -165,6 +201,7 @@
opt.set.limit = q->limit;
opt.set.is_ecn = red_use_ecn(q);
opt.set.is_harddrop = red_use_harddrop(q);
+ opt.set.is_nodrop = red_use_nodrop(q);
opt.set.qstats = &sch->qstats;
} else {
opt.command = TC_RED_DESTROY;
@@ -177,36 +214,36 @@
{
struct red_sched_data *q = qdisc_priv(sch);
+ tcf_qevent_destroy(&q->qe_mark, sch);
+ tcf_qevent_destroy(&q->qe_early_drop, sch);
del_timer_sync(&q->adapt_timer);
red_offload(sch, false);
qdisc_put(q->qdisc);
}
static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
+ [TCA_RED_UNSPEC] = { .strict_start_type = TCA_RED_FLAGS },
[TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) },
[TCA_RED_STAB] = { .len = RED_STAB_SIZE },
[TCA_RED_MAX_P] = { .type = NLA_U32 },
+ [TCA_RED_FLAGS] = NLA_POLICY_BITFIELD32(TC_RED_SUPPORTED_FLAGS),
+ [TCA_RED_EARLY_DROP_BLOCK] = { .type = NLA_U32 },
+ [TCA_RED_MARK_BLOCK] = { .type = NLA_U32 },
};
-static int red_change(struct Qdisc *sch, struct nlattr *opt,
- struct netlink_ext_ack *extack)
+static int __red_change(struct Qdisc *sch, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
struct Qdisc *old_child = NULL, *child = NULL;
struct red_sched_data *q = qdisc_priv(sch);
- struct nlattr *tb[TCA_RED_MAX + 1];
+ struct nla_bitfield32 flags_bf;
struct tc_red_qopt *ctl;
+ unsigned char userbits;
+ unsigned char flags;
int err;
u32 max_P;
u8 *stab;
- if (opt == NULL)
- return -EINVAL;
-
- err = nla_parse_nested_deprecated(tb, TCA_RED_MAX, opt, red_policy,
- NULL);
- if (err < 0)
- return err;
-
if (tb[TCA_RED_PARMS] == NULL ||
tb[TCA_RED_STAB] == NULL)
return -EINVAL;
@@ -219,6 +256,12 @@
ctl->Scell_log, stab))
return -EINVAL;
+ err = red_get_flags(ctl->flags, TC_RED_HISTORIC_FLAGS,
+ tb[TCA_RED_FLAGS], TC_RED_SUPPORTED_FLAGS,
+ &flags_bf, &userbits, extack);
+ if (err)
+ return err;
+
if (ctl->limit > 0) {
child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit,
extack);
@@ -230,7 +273,14 @@
}
sch_tree_lock(sch);
- q->flags = ctl->flags;
+
+ flags = (q->flags & ~flags_bf.selector) | flags_bf.value;
+ err = red_validate_flags(flags, extack);
+ if (err)
+ goto unlock_out;
+
+ q->flags = flags;
+ q->userbits = userbits;
q->limit = ctl->limit;
if (child) {
qdisc_tree_flush_backlog(q->qdisc);
@@ -259,6 +309,12 @@
if (old_child)
qdisc_put(old_child);
return 0;
+
+unlock_out:
+ sch_tree_unlock(sch);
+ if (child)
+ qdisc_put(child);
+ return err;
}
static inline void red_adaptative_timer(struct timer_list *t)
@@ -277,11 +333,62 @@
struct netlink_ext_ack *extack)
{
struct red_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_RED_MAX + 1];
+ int err;
q->qdisc = &noop_qdisc;
q->sch = sch;
timer_setup(&q->adapt_timer, red_adaptative_timer, 0);
- return red_change(sch, opt, extack);
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested_deprecated(tb, TCA_RED_MAX, opt, red_policy,
+ extack);
+ if (err < 0)
+ return err;
+
+ err = __red_change(sch, tb, extack);
+ if (err)
+ return err;
+
+ err = tcf_qevent_init(&q->qe_early_drop, sch,
+ FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP,
+ tb[TCA_RED_EARLY_DROP_BLOCK], extack);
+ if (err)
+ return err;
+
+ return tcf_qevent_init(&q->qe_mark, sch,
+ FLOW_BLOCK_BINDER_TYPE_RED_MARK,
+ tb[TCA_RED_MARK_BLOCK], extack);
+}
+
+static int red_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_RED_MAX + 1];
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested_deprecated(tb, TCA_RED_MAX, opt, red_policy,
+ extack);
+ if (err < 0)
+ return err;
+
+ err = tcf_qevent_validate_change(&q->qe_early_drop,
+ tb[TCA_RED_EARLY_DROP_BLOCK], extack);
+ if (err)
+ return err;
+
+ err = tcf_qevent_validate_change(&q->qe_mark,
+ tb[TCA_RED_MARK_BLOCK], extack);
+ if (err)
+ return err;
+
+ return __red_change(sch, tb, extack);
}
static int red_dump_offload_stats(struct Qdisc *sch)
@@ -305,7 +412,8 @@
struct nlattr *opts = NULL;
struct tc_red_qopt opt = {
.limit = q->limit,
- .flags = q->flags,
+ .flags = (q->flags & TC_RED_HISTORIC_FLAGS) |
+ q->userbits,
.qth_min = q->parms.qth_min >> q->parms.Wlog,
.qth_max = q->parms.qth_max >> q->parms.Wlog,
.Wlog = q->parms.Wlog,
@@ -322,7 +430,11 @@
if (opts == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) ||
- nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P))
+ nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P) ||
+ nla_put_bitfield32(skb, TCA_RED_FLAGS,
+ q->flags, TC_RED_SUPPORTED_FLAGS) ||
+ tcf_qevent_dump(skb, TCA_RED_MARK_BLOCK, &q->qe_mark) ||
+ tcf_qevent_dump(skb, TCA_RED_EARLY_DROP_BLOCK, &q->qe_early_drop))
goto nla_put_failure;
return nla_nest_end(skb, opts);
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 4074c50..da047a3 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -265,7 +265,7 @@
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return false;
}
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index b92bafa..066754a 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -186,7 +186,7 @@
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
- /* fall through */
+ fallthrough;
case TC_ACT_SHOT:
return 0;
}
@@ -349,7 +349,7 @@
unsigned int hash, dropped;
sfq_index x, qlen;
struct sfq_slot *slot;
- int uninitialized_var(ret);
+ int ret;
struct sk_buff *head;
int delta;
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index da9ed06..806babd 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -94,18 +94,22 @@
return ns_to_ktime(sched->base_time);
}
-static ktime_t taprio_get_time(struct taprio_sched *q)
+static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono)
{
- ktime_t mono = ktime_get();
+ /* This pairs with WRITE_ONCE() in taprio_parse_clockid() */
+ enum tk_offsets tk_offset = READ_ONCE(q->tk_offset);
- switch (q->tk_offset) {
+ switch (tk_offset) {
case TK_OFFS_MAX:
return mono;
default:
- return ktime_mono_to_any(mono, q->tk_offset);
+ return ktime_mono_to_any(mono, tk_offset);
}
+}
- return KTIME_MAX;
+static ktime_t taprio_get_time(const struct taprio_sched *q)
+{
+ return taprio_mono_to_any(q, ktime_get());
}
static void taprio_free_sched_cb(struct rcu_head *head)
@@ -321,7 +325,7 @@
return 0;
}
- return ktime_mono_to_any(skb->skb_mstamp_ns, q->tk_offset);
+ return taprio_mono_to_any(q, skb->skb_mstamp_ns);
}
/* There are a few scenarios where we will have to modify the txtime from
@@ -1120,11 +1124,10 @@
static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries)
{
- size_t size = sizeof(struct tc_taprio_sched_entry) * num_entries +
- sizeof(struct __tc_taprio_qopt_offload);
struct __tc_taprio_qopt_offload *__offload;
- __offload = kzalloc(size, GFP_KERNEL);
+ __offload = kzalloc(struct_size(__offload, offload.entries, num_entries),
+ GFP_KERNEL);
if (!__offload)
return NULL;
@@ -1342,6 +1345,7 @@
}
} else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]);
+ enum tk_offsets tk_offset;
/* We only support static clockids and we don't allow
* for it to be modified after the first init.
@@ -1356,22 +1360,24 @@
switch (clockid) {
case CLOCK_REALTIME:
- q->tk_offset = TK_OFFS_REAL;
+ tk_offset = TK_OFFS_REAL;
break;
case CLOCK_MONOTONIC:
- q->tk_offset = TK_OFFS_MAX;
+ tk_offset = TK_OFFS_MAX;
break;
case CLOCK_BOOTTIME:
- q->tk_offset = TK_OFFS_BOOT;
+ tk_offset = TK_OFFS_BOOT;
break;
case CLOCK_TAI:
- q->tk_offset = TK_OFFS_TAI;
+ tk_offset = TK_OFFS_TAI;
break;
default:
NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
err = -EINVAL;
goto out;
}
+ /* This pairs with READ_ONCE() in taprio_mono_to_any */
+ WRITE_ONCE(q->tk_offset, tk_offset);
q->clockid = clockid;
} else {
@@ -1613,8 +1619,9 @@
hrtimer_cancel(&q->advance_timer);
if (q->qdiscs) {
- for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++)
- qdisc_reset(q->qdiscs[i]);
+ for (i = 0; i < dev->num_tx_queues; i++)
+ if (q->qdiscs[i])
+ qdisc_reset(q->qdiscs[i]);
}
sch->qstats.backlog = 0;
sch->q.qlen = 0;
@@ -1630,6 +1637,10 @@
list_del(&q->taprio_list);
spin_unlock(&taprio_list_lock);
+ /* Note that taprio_reset() might not be called if an error
+ * happens in qdisc_create(), after taprio_init() has been called.
+ */
+ hrtimer_cancel(&q->advance_timer);
taprio_disable_offload(dev, q, NULL);
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 5f72f3f..78e7902 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -15,6 +15,7 @@
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
+#include <net/pkt_cls.h>
#include <net/pkt_sched.h>
@@ -137,6 +138,52 @@
return len;
}
+static void tbf_offload_change(struct Qdisc *sch)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_tbf_qopt_offload qopt;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_TBF_REPLACE;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.replace_params.rate = q->rate;
+ qopt.replace_params.max_size = q->max_size;
+ qopt.replace_params.qstats = &sch->qstats;
+
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
+}
+
+static void tbf_offload_destroy(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_tbf_qopt_offload qopt;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_TBF_DESTROY;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
+}
+
+static int tbf_offload_dump(struct Qdisc *sch)
+{
+ struct tc_tbf_qopt_offload qopt;
+
+ qopt.command = TC_TBF_STATS;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.stats.bstats = &sch->bstats;
+ qopt.stats.qstats = &sch->qstats;
+
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt);
+}
+
/* GSO packet is too big, segment it so that tbf can transmit
* each segment in time
*/
@@ -155,8 +202,7 @@
return qdisc_drop(skb, sch, to_free);
nb = 0;
- while (segs) {
- nskb = segs->next;
+ skb_list_walk_safe(segs, segs, nskb) {
skb_mark_not_on_list(segs);
qdisc_skb_cb(segs)->pkt_len = segs->len;
len += segs->len;
@@ -167,7 +213,6 @@
} else {
nb++;
}
- segs = nskb;
}
sch->q.qlen += nb;
if (nb > 1)
@@ -409,6 +454,8 @@
sch_tree_unlock(sch);
err = 0;
+
+ tbf_offload_change(sch);
done:
return err;
}
@@ -434,6 +481,7 @@
struct tbf_sched_data *q = qdisc_priv(sch);
qdisc_watchdog_cancel(&q->watchdog);
+ tbf_offload_destroy(sch);
qdisc_put(q->qdisc);
}
@@ -442,8 +490,12 @@
struct tbf_sched_data *q = qdisc_priv(sch);
struct nlattr *nest;
struct tc_tbf_qopt opt;
+ int err;
- sch->qstats.backlog = q->qdisc->qstats.backlog;
+ err = tbf_offload_dump(sch);
+ if (err)
+ return err;
+
nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;