Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 8aefd81..61ae899 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+ipsec
msg_zerocopy
socket
psock_fanout
@@ -22,3 +24,9 @@
so_txtime
tcp_fastopen_backup_key
nettest
+fin_ack_lat
+reuseaddr_ports_exhausted
+hwtstamp_config
+rxtimestamp
+timestamping
+txtimestamp
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 0bd6b23..ef35247 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -10,7 +10,17 @@
TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh
TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh reuseport_addr_any.sh
TEST_PROGS += test_vxlan_fdb_changelink.sh so_txtime.sh ipv6_flowlabel.sh
-TEST_PROGS += tcp_fastopen_backup_key.sh fcnal-test.sh l2tp.sh
+TEST_PROGS += tcp_fastopen_backup_key.sh fcnal-test.sh l2tp.sh traceroute.sh
+TEST_PROGS += fin_ack_lat.sh fib_nexthop_multiprefix.sh fib_nexthops.sh
+TEST_PROGS += altnames.sh icmp_redirect.sh ip6_gre_headroom.sh
+TEST_PROGS += route_localnet.sh
+TEST_PROGS += reuseaddr_ports_exhausted.sh
+TEST_PROGS += txtimestamp.sh
+TEST_PROGS += vrf-xfrm-tests.sh
+TEST_PROGS += rxtimestamp.sh
+TEST_PROGS += devlink_port_split.py
+TEST_PROGS += drop_monitor_tests.sh
+TEST_PROGS += vrf_route_leaking.sh
TEST_PROGS_EXTENDED := in_netns.sh
TEST_GEN_FILES = socket nettest
TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any
@@ -18,6 +28,10 @@
TEST_GEN_FILES += udpgso udpgso_bench_tx udpgso_bench_rx ip_defrag
TEST_GEN_FILES += so_txtime ipv6_flowlabel ipv6_flowlabel_mgr
TEST_GEN_FILES += tcp_fastopen_backup_key
+TEST_GEN_FILES += fin_ack_lat
+TEST_GEN_FILES += reuseaddr_ports_exhausted
+TEST_GEN_FILES += hwtstamp_config rxtimestamp timestamping txtimestamp
+TEST_GEN_FILES += ipsec
TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls
@@ -25,5 +39,5 @@
include ../lib.mk
$(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
-$(OUTPUT)/tcp_mmap: LDFLAGS += -lpthread
-$(OUTPUT)/tcp_inq: LDFLAGS += -lpthread
+$(OUTPUT)/tcp_mmap: LDLIBS += -lpthread
+$(OUTPUT)/tcp_inq: LDLIBS += -lpthread
diff --git a/tools/testing/selftests/net/altnames.sh b/tools/testing/selftests/net/altnames.sh
new file mode 100755
index 0000000..1ef9e41
--- /dev/null
+++ b/tools/testing/selftests/net/altnames.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+lib_dir=$(dirname $0)/forwarding
+
+ALL_TESTS="altnames_test"
+NUM_NETIFS=0
+source $lib_dir/lib.sh
+
+DUMMY_DEV=dummytest
+SHORT_NAME=shortname
+LONG_NAME=someveryveryveryveryveryverylongname
+
+altnames_test()
+{
+ RET=0
+ local output
+ local name
+
+ ip link property add $DUMMY_DEV altname $SHORT_NAME
+ check_err $? "Failed to add short alternative name"
+
+ output=$(ip -j -p link show $SHORT_NAME)
+ check_err $? "Failed to do link show with short alternative name"
+
+ name=$(echo $output | jq -e -r ".[0].altnames[0]")
+ check_err $? "Failed to get short alternative name from link show JSON"
+
+ [ "$name" == "$SHORT_NAME" ]
+ check_err $? "Got unexpected short alternative name from link show JSON"
+
+ ip -j -p link show $DUMMY_DEV &>/dev/null
+ check_err $? "Failed to do link show with original name"
+
+ ip link property add $DUMMY_DEV altname $LONG_NAME
+ check_err $? "Failed to add long alternative name"
+
+ output=$(ip -j -p link show $LONG_NAME)
+ check_err $? "Failed to do link show with long alternative name"
+
+ name=$(echo $output | jq -e -r ".[0].altnames[1]")
+ check_err $? "Failed to get long alternative name from link show JSON"
+
+ [ "$name" == "$LONG_NAME" ]
+ check_err $? "Got unexpected long alternative name from link show JSON"
+
+ ip link property del $DUMMY_DEV altname $SHORT_NAME
+ check_err $? "Failed to delete short alternative name"
+
+ ip -j -p link show $SHORT_NAME &>/dev/null
+ check_fail $? "Unexpected success while trying to do link show with deleted short alternative name"
+
+ # long name is left there on purpose to be removed alongside the device
+
+ log_test "altnames test"
+}
+
+setup_prepare()
+{
+ ip link add name $DUMMY_DEV type dummy
+}
+
+cleanup()
+{
+ pre_cleanup
+ ip link del name $DUMMY_DEV
+}
+
+trap cleanup EXIT
+
+setup_prepare
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
index 81fcc25..4d5df8e 100644
--- a/tools/testing/selftests/net/config
+++ b/tools/testing/selftests/net/config
@@ -12,6 +12,7 @@
CONFIG_DUMMY=y
CONFIG_BRIDGE=y
CONFIG_VLAN_8021Q=y
+CONFIG_IFB=y
CONFIG_NETFILTER=y
CONFIG_NETFILTER_ADVANCED=y
CONFIG_NF_CONNTRACK=m
@@ -23,10 +24,13 @@
CONFIG_NF_TABLES=m
CONFIG_NF_TABLES_IPV6=y
CONFIG_NF_TABLES_IPV4=y
-CONFIG_NFT_CHAIN_NAT_IPV6=m
-CONFIG_NFT_CHAIN_NAT_IPV4=m
+CONFIG_NFT_NAT=m
CONFIG_NET_SCH_FQ=m
CONFIG_NET_SCH_ETF=m
+CONFIG_NET_SCH_NETEM=y
CONFIG_TEST_BLACKHOLE_DEV=m
CONFIG_KALLSYMS=y
+CONFIG_TRACEPOINTS=y
+CONFIG_NET_DROP_MONITOR=m
+CONFIG_NETDEVSIM=m
CONFIG_NET_FOU=m
diff --git a/tools/testing/selftests/net/devlink_port_split.py b/tools/testing/selftests/net/devlink_port_split.py
new file mode 100755
index 0000000..834066d
--- /dev/null
+++ b/tools/testing/selftests/net/devlink_port_split.py
@@ -0,0 +1,277 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+from subprocess import PIPE, Popen
+import json
+import time
+import argparse
+import collections
+import sys
+
+#
+# Test port split configuration using devlink-port lanes attribute.
+# The test is skipped in case the attribute is not available.
+#
+# First, check that all the ports with 1 lane fail to split.
+# Second, check that all the ports with more than 1 lane can be split
+# to all valid configurations (e.g., split to 2, split to 4 etc.)
+#
+
+
+Port = collections.namedtuple('Port', 'bus_info name')
+
+
+def run_command(cmd, should_fail=False):
+ """
+ Run a command in subprocess.
+ Return: Tuple of (stdout, stderr).
+ """
+
+ p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
+ stdout, stderr = p.communicate()
+ stdout, stderr = stdout.decode(), stderr.decode()
+
+ if stderr != "" and not should_fail:
+ print("Error sending command: %s" % cmd)
+ print(stdout)
+ print(stderr)
+ return stdout, stderr
+
+
+class devlink_ports(object):
+ """
+ Class that holds information on the devlink ports, required to the tests;
+ if_names: A list of interfaces in the devlink ports.
+ """
+
+ def get_if_names(dev):
+ """
+ Get a list of physical devlink ports.
+ Return: Array of tuples (bus_info/port, if_name).
+ """
+
+ arr = []
+
+ cmd = "devlink -j port show"
+ stdout, stderr = run_command(cmd)
+ assert stderr == ""
+ ports = json.loads(stdout)['port']
+
+ for port in ports:
+ if dev in port:
+ if ports[port]['flavour'] == 'physical':
+ arr.append(Port(bus_info=port, name=ports[port]['netdev']))
+
+ return arr
+
+ def __init__(self, dev):
+ self.if_names = devlink_ports.get_if_names(dev)
+
+
+def get_max_lanes(port):
+ """
+ Get the $port's maximum number of lanes.
+ Return: number of lanes, e.g. 1, 2, 4 and 8.
+ """
+
+ cmd = "devlink -j port show %s" % port
+ stdout, stderr = run_command(cmd)
+ assert stderr == ""
+ values = list(json.loads(stdout)['port'].values())[0]
+
+ if 'lanes' in values:
+ lanes = values['lanes']
+ else:
+ lanes = 0
+ return lanes
+
+
+def get_split_ability(port):
+ """
+ Get the $port split ability.
+ Return: split ability, true or false.
+ """
+
+ cmd = "devlink -j port show %s" % port.name
+ stdout, stderr = run_command(cmd)
+ assert stderr == ""
+ values = list(json.loads(stdout)['port'].values())[0]
+
+ return values['splittable']
+
+
+def split(k, port, should_fail=False):
+ """
+ Split $port into $k ports.
+ If should_fail == True, the split should fail. Otherwise, should pass.
+ Return: Array of sub ports after splitting.
+ If the $port wasn't split, the array will be empty.
+ """
+
+ cmd = "devlink port split %s count %s" % (port.bus_info, k)
+ stdout, stderr = run_command(cmd, should_fail=should_fail)
+
+ if should_fail:
+ if not test(stderr != "", "%s is unsplittable" % port.name):
+ print("split an unsplittable port %s" % port.name)
+ return create_split_group(port, k)
+ else:
+ if stderr == "":
+ return create_split_group(port, k)
+ print("didn't split a splittable port %s" % port.name)
+
+ return []
+
+
+def unsplit(port):
+ """
+ Unsplit $port.
+ """
+
+ cmd = "devlink port unsplit %s" % port
+ stdout, stderr = run_command(cmd)
+ test(stderr == "", "Unsplit port %s" % port)
+
+
+def exists(port, dev):
+ """
+ Check if $port exists in the devlink ports.
+ Return: True is so, False otherwise.
+ """
+
+ return any(dev_port.name == port
+ for dev_port in devlink_ports.get_if_names(dev))
+
+
+def exists_and_lanes(ports, lanes, dev):
+ """
+ Check if every port in the list $ports exists in the devlink ports and has
+ $lanes number of lanes after splitting.
+ Return: True if both are True, False otherwise.
+ """
+
+ for port in ports:
+ max_lanes = get_max_lanes(port)
+ if not exists(port, dev):
+ print("port %s doesn't exist in devlink ports" % port)
+ return False
+ if max_lanes != lanes:
+ print("port %s has %d lanes, but %s were expected"
+ % (port, lanes, max_lanes))
+ return False
+ return True
+
+
+def test(cond, msg):
+ """
+ Check $cond and print a message accordingly.
+ Return: True is pass, False otherwise.
+ """
+
+ if cond:
+ print("TEST: %-60s [ OK ]" % msg)
+ else:
+ print("TEST: %-60s [FAIL]" % msg)
+
+ return cond
+
+
+def create_split_group(port, k):
+ """
+ Create the split group for $port.
+ Return: Array with $k elements, which are the split port group.
+ """
+
+ return list(port.name + "s" + str(i) for i in range(k))
+
+
+def split_unsplittable_port(port, k):
+ """
+ Test that splitting of unsplittable port fails.
+ """
+
+ # split to max
+ new_split_group = split(k, port, should_fail=True)
+
+ if new_split_group != []:
+ unsplit(port.bus_info)
+
+
+def split_splittable_port(port, k, lanes, dev):
+ """
+ Test that splitting of splittable port passes correctly.
+ """
+
+ new_split_group = split(k, port)
+
+ # Once the split command ends, it takes some time to the sub ifaces'
+ # to get their names. Use udevadm to continue only when all current udev
+ # events are handled.
+ cmd = "udevadm settle"
+ stdout, stderr = run_command(cmd)
+ assert stderr == ""
+
+ if new_split_group != []:
+ test(exists_and_lanes(new_split_group, lanes/k, dev),
+ "split port %s into %s" % (port.name, k))
+
+ unsplit(port.bus_info)
+
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='A test for port splitting.')
+ parser.add_argument('--dev',
+ help='The devlink handle of the device under test. ' +
+ 'The default is the first registered devlink ' +
+ 'handle.')
+
+ return parser
+
+
+def main(cmdline=None):
+ parser = make_parser()
+ args = parser.parse_args(cmdline)
+
+ dev = args.dev
+ if not dev:
+ cmd = "devlink -j dev show"
+ stdout, stderr = run_command(cmd)
+ assert stderr == ""
+
+ devs = json.loads(stdout)['dev']
+ dev = list(devs.keys())[0]
+
+ cmd = "devlink dev show %s" % dev
+ stdout, stderr = run_command(cmd)
+ if stderr != "":
+ print("devlink device %s can not be found" % dev)
+ sys.exit(1)
+
+ ports = devlink_ports(dev)
+
+ for port in ports.if_names:
+ max_lanes = get_max_lanes(port.name)
+
+ # If max lanes is 0, do not test port splitting at all
+ if max_lanes == 0:
+ continue
+
+ # If 1 lane, shouldn't be able to split
+ elif max_lanes == 1:
+ test(not get_split_ability(port),
+ "%s should not be able to split" % port.name)
+ split_unsplittable_port(port, max_lanes)
+
+ # Else, splitting should pass and all the split ports should exist.
+ else:
+ lane = max_lanes
+ test(get_split_ability(port),
+ "%s should be able to split" % port.name)
+ while lane > 1:
+ split_splittable_port(port, lane, max_lanes, dev)
+
+ lane //= 2
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/testing/selftests/net/drop_monitor_tests.sh b/tools/testing/selftests/net/drop_monitor_tests.sh
new file mode 100755
index 0000000..b7650e3
--- /dev/null
+++ b/tools/testing/selftests/net/drop_monitor_tests.sh
@@ -0,0 +1,215 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test is for checking drop monitor functionality.
+
+ret=0
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+# all tests in this script. Can be overridden with -t option
+TESTS="
+ sw_drops
+ hw_drops
+"
+
+IP="ip -netns ns1"
+TC="tc -netns ns1"
+DEVLINK="devlink -N ns1"
+NS_EXEC="ip netns exec ns1"
+NETDEVSIM_PATH=/sys/bus/netdevsim/
+DEV_ADDR=1337
+DEV=netdevsim${DEV_ADDR}
+DEVLINK_DEV=netdevsim/${DEV}
+
+log_test()
+{
+ local rc=$1
+ local expected=$2
+ local msg="$3"
+
+ if [ ${rc} -eq ${expected} ]; then
+ printf " TEST: %-60s [ OK ]\n" "${msg}"
+ nsuccess=$((nsuccess+1))
+ else
+ ret=1
+ nfail=$((nfail+1))
+ printf " TEST: %-60s [FAIL]\n" "${msg}"
+ fi
+}
+
+setup()
+{
+ modprobe netdevsim &> /dev/null
+
+ set -e
+ ip netns add ns1
+ $IP link add dummy10 up type dummy
+
+ $NS_EXEC echo "$DEV_ADDR 1" > ${NETDEVSIM_PATH}/new_device
+ udevadm settle
+ local netdev=$($NS_EXEC ls ${NETDEVSIM_PATH}/devices/${DEV}/net/)
+ $IP link set dev $netdev up
+
+ set +e
+}
+
+cleanup()
+{
+ $NS_EXEC echo "$DEV_ADDR" > ${NETDEVSIM_PATH}/del_device
+ ip netns del ns1
+}
+
+sw_drops_test()
+{
+ echo
+ echo "Software drops test"
+
+ setup
+
+ local dir=$(mktemp -d)
+
+ $TC qdisc add dev dummy10 clsact
+ $TC filter add dev dummy10 egress pref 1 handle 101 proto ip \
+ flower dst_ip 192.0.2.10 action drop
+
+ $NS_EXEC mausezahn dummy10 -a 00:11:22:33:44:55 -b 00:aa:bb:cc:dd:ee \
+ -A 192.0.2.1 -B 192.0.2.10 -t udp sp=12345,dp=54321 -c 0 -q \
+ -d 100msec &
+ timeout 5 dwdump -o sw -w ${dir}/packets.pcap
+ (( $(tshark -r ${dir}/packets.pcap \
+ -Y 'ip.dst == 192.0.2.10' 2> /dev/null | wc -l) != 0))
+ log_test $? 0 "Capturing active software drops"
+
+ rm ${dir}/packets.pcap
+
+ { kill %% && wait %%; } 2>/dev/null
+ timeout 5 dwdump -o sw -w ${dir}/packets.pcap
+ (( $(tshark -r ${dir}/packets.pcap \
+ -Y 'ip.dst == 192.0.2.10' 2> /dev/null | wc -l) == 0))
+ log_test $? 0 "Capturing inactive software drops"
+
+ rm -r $dir
+
+ cleanup
+}
+
+hw_drops_test()
+{
+ echo
+ echo "Hardware drops test"
+
+ setup
+
+ local dir=$(mktemp -d)
+
+ $DEVLINK trap set $DEVLINK_DEV trap blackhole_route action trap
+ timeout 5 dwdump -o hw -w ${dir}/packets.pcap
+ (( $(tshark -r ${dir}/packets.pcap \
+ -Y 'net_dm.hw_trap_name== blackhole_route' 2> /dev/null \
+ | wc -l) != 0))
+ log_test $? 0 "Capturing active hardware drops"
+
+ rm ${dir}/packets.pcap
+
+ $DEVLINK trap set $DEVLINK_DEV trap blackhole_route action drop
+ timeout 5 dwdump -o hw -w ${dir}/packets.pcap
+ (( $(tshark -r ${dir}/packets.pcap \
+ -Y 'net_dm.hw_trap_name== blackhole_route' 2> /dev/null \
+ | wc -l) == 0))
+ log_test $? 0 "Capturing inactive hardware drops"
+
+ rm -r $dir
+
+ cleanup
+}
+
+################################################################################
+# usage
+
+usage()
+{
+ cat <<EOF
+usage: ${0##*/} OPTS
+
+ -t <test> Test(s) to run (default: all)
+ (options: $TESTS)
+EOF
+}
+
+################################################################################
+# main
+
+while getopts ":t:h" opt; do
+ case $opt in
+ t) TESTS=$OPTARG;;
+ h) usage; exit 0;;
+ *) usage; exit 1;;
+ esac
+done
+
+if [ "$(id -u)" -ne 0 ];then
+ echo "SKIP: Need root privileges"
+ exit $ksft_skip;
+fi
+
+if [ ! -x "$(command -v ip)" ]; then
+ echo "SKIP: Could not run test without ip tool"
+ exit $ksft_skip
+fi
+
+if [ ! -x "$(command -v devlink)" ]; then
+ echo "SKIP: Could not run test without devlink tool"
+ exit $ksft_skip
+fi
+
+if [ ! -x "$(command -v tshark)" ]; then
+ echo "SKIP: Could not run test without tshark tool"
+ exit $ksft_skip
+fi
+
+if [ ! -x "$(command -v dwdump)" ]; then
+ echo "SKIP: Could not run test without dwdump tool"
+ exit $ksft_skip
+fi
+
+if [ ! -x "$(command -v udevadm)" ]; then
+ echo "SKIP: Could not run test without udevadm tool"
+ exit $ksft_skip
+fi
+
+if [ ! -x "$(command -v timeout)" ]; then
+ echo "SKIP: Could not run test without timeout tool"
+ exit $ksft_skip
+fi
+
+if [ ! -x "$(command -v mausezahn)" ]; then
+ echo "SKIP: Could not run test without mausezahn tool"
+ exit $ksft_skip
+fi
+
+tshark -G fields 2> /dev/null | grep -q net_dm
+if [ $? -ne 0 ]; then
+ echo "SKIP: tshark too old, missing net_dm dissector"
+ exit $ksft_skip
+fi
+
+# start clean
+cleanup &> /dev/null
+
+for t in $TESTS
+do
+ case $t in
+ sw_drops|sw) sw_drops_test;;
+ hw_drops|hw) hw_drops_test;;
+
+ help) echo "Test names: $TESTS"; exit 0;;
+ esac
+done
+
+if [ "$TESTS" != "none" ]; then
+ printf "\nTests passed: %3d\n" ${nsuccess}
+ printf "Tests failed: %3d\n" ${nfail}
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh
index 38133da..ace976d 100755
--- a/tools/testing/selftests/net/fcnal-test.sh
+++ b/tools/testing/selftests/net/fcnal-test.sh
@@ -32,12 +32,17 @@
# lo2: 127.0.0.1/8, ::1/128
# 172.16.2.2/32, 2001:db8:2::2/128
#
+# ns-A to ns-C connection - only for VRF and same config
+# as ns-A to ns-B
+#
# server / client nomenclature relative to ns-A
VERBOSE=0
NSA_DEV=eth1
+NSA_DEV2=eth2
NSB_DEV=eth1
+NSC_DEV=eth2
VRF=red
VRF_TABLE=1101
@@ -45,17 +50,22 @@
NSA_IP=172.16.1.1
NSB_IP=172.16.1.2
VRF_IP=172.16.3.1
+NS_NET=172.16.1.0/24
# IPv6 config
NSA_IP6=2001:db8:1::1
NSB_IP6=2001:db8:1::2
VRF_IP6=2001:db8:3::1
+NS_NET6=2001:db8:1::/120
NSA_LO_IP=172.16.2.1
NSB_LO_IP=172.16.2.2
NSA_LO_IP6=2001:db8:2::1
NSB_LO_IP6=2001:db8:2::2
+MD5_PW=abc123
+MD5_WRONG_PW=abc1234
+
MCAST=ff02::1
# set after namespace create
NSA_LINKIP6=
@@ -63,9 +73,11 @@
NSA=ns-A
NSB=ns-B
+NSC=ns-C
NSA_CMD="ip netns exec ${NSA}"
NSB_CMD="ip netns exec ${NSB}"
+NSC_CMD="ip netns exec ${NSC}"
which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping)
@@ -195,6 +207,11 @@
do_run_cmd ${NSB_CMD} $*
}
+run_cmd_nsc()
+{
+ do_run_cmd ${NSC_CMD} $*
+}
+
setup_cmd()
{
local cmd="$*"
@@ -419,10 +436,30 @@
ip -netns ${NSA} link set dev ${NSA_DEV} down
ip -netns ${NSA} link del dev ${NSA_DEV}
+ ip netns pids ${NSA} | xargs kill 2>/dev/null
ip netns del ${NSA}
fi
+ ip netns pids ${NSB} | xargs kill 2>/dev/null
ip netns del ${NSB}
+ ip netns pids ${NSC} | xargs kill 2>/dev/null
+ ip netns del ${NSC} >/dev/null 2>&1
+}
+
+cleanup_vrf_dup()
+{
+ ip link del ${NSA_DEV2} >/dev/null 2>&1
+ ip netns pids ${NSC} | xargs kill 2>/dev/null
+ ip netns del ${NSC} >/dev/null 2>&1
+}
+
+setup_vrf_dup()
+{
+ # some VRF tests use ns-C which has the same config as
+ # ns-B but for a device NOT in the VRF
+ create_ns ${NSC} "-" "-"
+ connect_ns ${NSA} ${NSA_DEV2} ${NSA_IP}/24 ${NSA_IP6}/64 \
+ ${NSC} ${NSC_DEV} ${NSB_IP}/24 ${NSB_IP6}/64
}
setup()
@@ -766,6 +803,218 @@
################################################################################
# IPv4 TCP
+#
+# MD5 tests without VRF
+#
+ipv4_tcp_md5_novrf()
+{
+ #
+ # single address
+ #
+
+ # basic use case
+ log_start
+ run_cmd nettest -s -M ${MD5_PW} -r ${NSB_IP} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 0 "MD5: Single address config"
+
+ # client sends MD5, server not configured
+ log_start
+ show_hint "Should timeout due to MD5 mismatch"
+ run_cmd nettest -s &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 2 "MD5: Server no config, client uses password"
+
+ # wrong password
+ log_start
+ show_hint "Should timeout since client uses wrong password"
+ run_cmd nettest -s -M ${MD5_PW} -r ${NSB_IP} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: Client uses wrong password"
+
+ # client from different address
+ log_start
+ show_hint "Should timeout due to MD5 mismatch"
+ run_cmd nettest -s -M ${MD5_PW} -r ${NSB_LO_IP} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 2 "MD5: Client address does not match address configured with password"
+
+ #
+ # MD5 extension - prefix length
+ #
+
+ # client in prefix
+ log_start
+ run_cmd nettest -s -M ${MD5_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 0 "MD5: Prefix config"
+
+ # client in prefix, wrong password
+ log_start
+ show_hint "Should timeout since client uses wrong password"
+ run_cmd nettest -s -M ${MD5_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: Prefix config, client uses wrong password"
+
+ # client outside of prefix
+ log_start
+ show_hint "Should timeout due to MD5 mismatch"
+ run_cmd nettest -s -M ${MD5_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsb nettest -l ${NSB_LO_IP} -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 2 "MD5: Prefix config, client address not in configured prefix"
+}
+
+#
+# MD5 tests with VRF
+#
+ipv4_tcp_md5()
+{
+ #
+ # single address
+ #
+
+ # basic use case
+ log_start
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 0 "MD5: VRF: Single address config"
+
+ # client sends MD5, server not configured
+ log_start
+ show_hint "Should timeout since server does not have MD5 auth"
+ run_cmd nettest -s -d ${VRF} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Server no config, client uses password"
+
+ # wrong password
+ log_start
+ show_hint "Should timeout since client uses wrong password"
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: VRF: Client uses wrong password"
+
+ # client from different address
+ log_start
+ show_hint "Should timeout since server config differs from client"
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -r ${NSB_LO_IP} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Client address does not match address configured with password"
+
+ #
+ # MD5 extension - prefix length
+ #
+
+ # client in prefix
+ log_start
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 0 "MD5: VRF: Prefix config"
+
+ # client in prefix, wrong password
+ log_start
+ show_hint "Should timeout since client uses wrong password"
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: VRF: Prefix config, client uses wrong password"
+
+ # client outside of prefix
+ log_start
+ show_hint "Should timeout since client address is outside of prefix"
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsb nettest -l ${NSB_LO_IP} -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Prefix config, client address not in configured prefix"
+
+ #
+ # duplicate config between default VRF and a VRF
+ #
+
+ log_start
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP} &
+ run_cmd nettest -s -M ${MD5_WRONG_PW} -r ${NSB_IP} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 0 "MD5: VRF: Single address config in default VRF and VRF, conn in VRF"
+
+ log_start
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP} &
+ run_cmd nettest -s -M ${MD5_WRONG_PW} -r ${NSB_IP} &
+ sleep 1
+ run_cmd_nsc nettest -r ${NSA_IP} -M ${MD5_WRONG_PW}
+ log_test $? 0 "MD5: VRF: Single address config in default VRF and VRF, conn in default VRF"
+
+ log_start
+ show_hint "Should timeout since client in default VRF uses VRF password"
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP} &
+ run_cmd nettest -s -M ${MD5_WRONG_PW} -r ${NSB_IP} &
+ sleep 1
+ run_cmd_nsc nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Single address config in default VRF and VRF, conn in default VRF with VRF pw"
+
+ log_start
+ show_hint "Should timeout since client in VRF uses default VRF password"
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP} &
+ run_cmd nettest -s -M ${MD5_WRONG_PW} -r ${NSB_IP} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: VRF: Single address config in default VRF and VRF, conn in VRF with default VRF pw"
+
+ log_start
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET} &
+ run_cmd nettest -s -M ${MD5_WRONG_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 0 "MD5: VRF: Prefix config in default VRF and VRF, conn in VRF"
+
+ log_start
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET} &
+ run_cmd nettest -s -M ${MD5_WRONG_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsc nettest -r ${NSA_IP} -M ${MD5_WRONG_PW}
+ log_test $? 0 "MD5: VRF: Prefix config in default VRF and VRF, conn in default VRF"
+
+ log_start
+ show_hint "Should timeout since client in default VRF uses VRF password"
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET} &
+ run_cmd nettest -s -M ${MD5_WRONG_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsc nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Prefix config in default VRF and VRF, conn in default VRF with VRF pw"
+
+ log_start
+ show_hint "Should timeout since client in VRF uses default VRF password"
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET} &
+ run_cmd nettest -s -M ${MD5_WRONG_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: VRF: Prefix config in default VRF and VRF, conn in VRF with default VRF pw"
+
+ #
+ # negative tests
+ #
+ log_start
+ run_cmd nettest -s -d ${NSA_DEV} -M ${MD5_PW} -r ${NSB_IP}
+ log_test $? 1 "MD5: VRF: Device must be a VRF - single address"
+
+ log_start
+ run_cmd nettest -s -d ${NSA_DEV} -M ${MD5_PW} -m ${NS_NET}
+ log_test $? 1 "MD5: VRF: Device must be a VRF - prefix"
+
+}
+
ipv4_tcp_novrf()
{
local a
@@ -883,6 +1132,8 @@
show_hint "Should fail 'Connection refused'"
run_cmd nettest -d ${NSA_DEV} -r ${a}
log_test_addr ${a} $? 1 "No server, device client, local conn"
+
+ ipv4_tcp_md5_novrf
}
ipv4_tcp_vrf()
@@ -935,6 +1186,11 @@
run_cmd nettest -r ${a} -d ${NSA_DEV}
log_test_addr ${a} $? 1 "Global server, local connection"
+ # run MD5 tests
+ setup_vrf_dup
+ ipv4_tcp_md5
+ cleanup_vrf_dup
+
#
# enable VRF global server
#
@@ -976,8 +1232,8 @@
for a in ${NSA_IP} ${VRF_IP}
do
log_start
- show_hint "Should fail 'No route to host' since client is not bound to VRF"
- run_cmd nettest -s -2 ${VRF} &
+ show_hint "Should fail 'Connection refused' since client is not bound to VRF"
+ run_cmd nettest -s -d ${VRF} &
sleep 1
run_cmd nettest -r ${a}
log_test_addr ${a} $? 1 "Global server, local connection"
@@ -1491,8 +1747,9 @@
for a in ${NSA_IP} ${VRF_IP}
do
log_start
+ show_hint "Socket not bound to VRF, but address is in VRF"
run_cmd nettest -s -R -P icmp -l ${a} -b
- log_test_addr ${a} $? 0 "Raw socket bind to local address"
+ log_test_addr ${a} $? 1 "Raw socket bind to local address"
log_start
run_cmd nettest -s -R -P icmp -l ${a} -d ${NSA_DEV} -b
@@ -1884,7 +2141,7 @@
log_start
show_hint "Fails since VRF device does not support linklocal or multicast"
run_cmd ${ping6} -c1 -w1 ${a}
- log_test_addr ${a} $? 2 "ping out, VRF bind"
+ log_test_addr ${a} $? 1 "ping out, VRF bind"
done
for a in ${NSB_IP6} ${NSB_LO_IP6} ${NSB_LINKIP6}%${NSA_DEV} ${MCAST}%${NSA_DEV}
@@ -2013,6 +2270,218 @@
################################################################################
# IPv6 TCP
+#
+# MD5 tests without VRF
+#
+ipv6_tcp_md5_novrf()
+{
+ #
+ # single address
+ #
+
+ # basic use case
+ log_start
+ run_cmd nettest -6 -s -M ${MD5_PW} -r ${NSB_IP6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 0 "MD5: Single address config"
+
+ # client sends MD5, server not configured
+ log_start
+ show_hint "Should timeout due to MD5 mismatch"
+ run_cmd nettest -6 -s &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 2 "MD5: Server no config, client uses password"
+
+ # wrong password
+ log_start
+ show_hint "Should timeout since client uses wrong password"
+ run_cmd nettest -6 -s -M ${MD5_PW} -r ${NSB_IP6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: Client uses wrong password"
+
+ # client from different address
+ log_start
+ show_hint "Should timeout due to MD5 mismatch"
+ run_cmd nettest -6 -s -M ${MD5_PW} -r ${NSB_LO_IP6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 2 "MD5: Client address does not match address configured with password"
+
+ #
+ # MD5 extension - prefix length
+ #
+
+ # client in prefix
+ log_start
+ run_cmd nettest -6 -s -M ${MD5_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 0 "MD5: Prefix config"
+
+ # client in prefix, wrong password
+ log_start
+ show_hint "Should timeout since client uses wrong password"
+ run_cmd nettest -6 -s -M ${MD5_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: Prefix config, client uses wrong password"
+
+ # client outside of prefix
+ log_start
+ show_hint "Should timeout due to MD5 mismatch"
+ run_cmd nettest -6 -s -M ${MD5_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -l ${NSB_LO_IP6} -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 2 "MD5: Prefix config, client address not in configured prefix"
+}
+
+#
+# MD5 tests with VRF
+#
+ipv6_tcp_md5()
+{
+ #
+ # single address
+ #
+
+ # basic use case
+ log_start
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 0 "MD5: VRF: Single address config"
+
+ # client sends MD5, server not configured
+ log_start
+ show_hint "Should timeout since server does not have MD5 auth"
+ run_cmd nettest -6 -s -d ${VRF} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Server no config, client uses password"
+
+ # wrong password
+ log_start
+ show_hint "Should timeout since client uses wrong password"
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: VRF: Client uses wrong password"
+
+ # client from different address
+ log_start
+ show_hint "Should timeout since server config differs from client"
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -r ${NSB_LO_IP6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Client address does not match address configured with password"
+
+ #
+ # MD5 extension - prefix length
+ #
+
+ # client in prefix
+ log_start
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 0 "MD5: VRF: Prefix config"
+
+ # client in prefix, wrong password
+ log_start
+ show_hint "Should timeout since client uses wrong password"
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: VRF: Prefix config, client uses wrong password"
+
+ # client outside of prefix
+ log_start
+ show_hint "Should timeout since client address is outside of prefix"
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -l ${NSB_LO_IP6} -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Prefix config, client address not in configured prefix"
+
+ #
+ # duplicate config between default VRF and a VRF
+ #
+
+ log_start
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP6} &
+ run_cmd nettest -6 -s -M ${MD5_WRONG_PW} -r ${NSB_IP6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 0 "MD5: VRF: Single address config in default VRF and VRF, conn in VRF"
+
+ log_start
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP6} &
+ run_cmd nettest -6 -s -M ${MD5_WRONG_PW} -r ${NSB_IP6} &
+ sleep 1
+ run_cmd_nsc nettest -6 -r ${NSA_IP6} -M ${MD5_WRONG_PW}
+ log_test $? 0 "MD5: VRF: Single address config in default VRF and VRF, conn in default VRF"
+
+ log_start
+ show_hint "Should timeout since client in default VRF uses VRF password"
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP6} &
+ run_cmd nettest -6 -s -M ${MD5_WRONG_PW} -r ${NSB_IP6} &
+ sleep 1
+ run_cmd_nsc nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Single address config in default VRF and VRF, conn in default VRF with VRF pw"
+
+ log_start
+ show_hint "Should timeout since client in VRF uses default VRF password"
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP6} &
+ run_cmd nettest -6 -s -M ${MD5_WRONG_PW} -r ${NSB_IP6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: VRF: Single address config in default VRF and VRF, conn in VRF with default VRF pw"
+
+ log_start
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET6} &
+ run_cmd nettest -6 -s -M ${MD5_WRONG_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 0 "MD5: VRF: Prefix config in default VRF and VRF, conn in VRF"
+
+ log_start
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET6} &
+ run_cmd nettest -6 -s -M ${MD5_WRONG_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsc nettest -6 -r ${NSA_IP6} -M ${MD5_WRONG_PW}
+ log_test $? 0 "MD5: VRF: Prefix config in default VRF and VRF, conn in default VRF"
+
+ log_start
+ show_hint "Should timeout since client in default VRF uses VRF password"
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET6} &
+ run_cmd nettest -6 -s -M ${MD5_WRONG_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsc nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Prefix config in default VRF and VRF, conn in default VRF with VRF pw"
+
+ log_start
+ show_hint "Should timeout since client in VRF uses default VRF password"
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET6} &
+ run_cmd nettest -6 -s -M ${MD5_WRONG_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: VRF: Prefix config in default VRF and VRF, conn in VRF with default VRF pw"
+
+ #
+ # negative tests
+ #
+ log_start
+ run_cmd nettest -6 -s -d ${NSA_DEV} -M ${MD5_PW} -r ${NSB_IP6}
+ log_test $? 1 "MD5: VRF: Device must be a VRF - single address"
+
+ log_start
+ run_cmd nettest -6 -s -d ${NSA_DEV} -M ${MD5_PW} -m ${NS_NET6}
+ log_test $? 1 "MD5: VRF: Device must be a VRF - prefix"
+
+}
+
ipv6_tcp_novrf()
{
local a
@@ -2129,6 +2598,8 @@
run_cmd nettest -6 -d ${NSA_DEV} -r ${a}
log_test_addr ${a} $? 1 "No server, device client, local conn"
done
+
+ ipv6_tcp_md5_novrf
}
ipv6_tcp_vrf()
@@ -2197,6 +2668,11 @@
run_cmd nettest -6 -r ${a} -d ${NSA_DEV}
log_test_addr ${a} $? 1 "Global server, local connection"
+ # run MD5 tests
+ setup_vrf_dup
+ ipv6_tcp_md5
+ cleanup_vrf_dup
+
#
# enable VRF global server
#
@@ -2257,8 +2733,8 @@
for a in ${NSA_IP6} ${VRF_IP6}
do
log_start
- show_hint "Fails 'No route to host' since client is not in VRF"
- run_cmd nettest -6 -s -2 ${VRF} &
+ show_hint "Fails 'Connection refused' since client is not in VRF"
+ run_cmd nettest -6 -s -d ${VRF} &
sleep 1
run_cmd nettest -6 -r ${a}
log_test_addr ${a} $? 1 "Global server, local connection"
@@ -2890,11 +3366,14 @@
run_cmd nettest -6 -s -l ${a} -d ${NSA_DEV} -t1 -b
log_test_addr ${a} $? 0 "TCP socket bind to local address after device bind"
+ # Sadly, the kernel allows binding a socket to a device and then
+ # binding to an address not on the device. So this test passes
+ # when it really should not
a=${NSA_LO_IP6}
log_start
- show_hint "Should fail with 'Cannot assign requested address'"
- run_cmd nettest -6 -s -l ${a} -d ${NSA_DEV} -t1 -b
- log_test_addr ${a} $? 1 "TCP socket bind to out of scope local address"
+ show_hint "Tecnically should fail since address is not on device but kernel allows"
+ run_cmd nettest -6 -s -l ${a} -I ${NSA_DEV} -t1 -b
+ log_test_addr ${a} $? 0 "TCP socket bind to out of scope local address"
}
ipv6_addr_bind_vrf()
@@ -2935,10 +3414,15 @@
run_cmd nettest -6 -s -l ${a} -d ${NSA_DEV} -t1 -b
log_test_addr ${a} $? 0 "TCP socket bind to local address with device bind"
+ # Sadly, the kernel allows binding a socket to a device and then
+ # binding to an address not on the device. The only restriction
+ # is that the address is valid in the L3 domain. So this test
+ # passes when it really should not
a=${VRF_IP6}
log_start
- run_cmd nettest -6 -s -l ${a} -d ${NSA_DEV} -t1 -b
- log_test_addr ${a} $? 1 "TCP socket bind to VRF address with device bind"
+ show_hint "Tecnically should fail since address is not on device but kernel allows"
+ run_cmd nettest -6 -s -l ${a} -I ${NSA_DEV} -t1 -b
+ log_test_addr ${a} $? 0 "TCP socket bind to VRF address with device bind"
a=${NSA_LO_IP6}
log_start
@@ -3450,8 +3934,8 @@
################################################################################
# main
-TESTS_IPV4="ipv4_ping ipv4_tcp ipv4_udp ipv4_addr_bind ipv4_runtime ipv4_netfilter"
-TESTS_IPV6="ipv6_ping ipv6_tcp ipv6_udp ipv6_addr_bind ipv6_runtime ipv6_netfilter"
+TESTS_IPV4="ipv4_ping ipv4_tcp ipv4_udp ipv4_bind ipv4_runtime ipv4_netfilter"
+TESTS_IPV6="ipv6_ping ipv6_tcp ipv6_udp ipv6_bind ipv6_runtime ipv6_netfilter"
TESTS_OTHER="use_cases"
PAUSE_ON_FAIL=no
diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh
index 09830b8..4c7d336 100755
--- a/tools/testing/selftests/net/fib_nexthops.sh
+++ b/tools/testing/selftests/net/fib_nexthops.sh
@@ -19,8 +19,8 @@
ksft_skip=4
# all tests in this script. Can be overridden with -t option
-IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime"
-IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime"
+IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_large_grp ipv4_compat_mode ipv4_fdb_grp_fcnal ipv4_torture"
+IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_large_grp ipv6_compat_mode ipv6_fdb_grp_fcnal ipv6_torture"
ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}"
TESTS="${ALL_TESTS}"
@@ -146,35 +146,36 @@
create_ns remote
IP="ip -netns me"
+ BRIDGE="bridge -netns me"
set -e
$IP li add veth1 type veth peer name veth2
$IP li set veth1 up
$IP addr add 172.16.1.1/24 dev veth1
- $IP -6 addr add 2001:db8:91::1/64 dev veth1
+ $IP -6 addr add 2001:db8:91::1/64 dev veth1 nodad
$IP li add veth3 type veth peer name veth4
$IP li set veth3 up
$IP addr add 172.16.2.1/24 dev veth3
- $IP -6 addr add 2001:db8:92::1/64 dev veth3
+ $IP -6 addr add 2001:db8:92::1/64 dev veth3 nodad
$IP li set veth2 netns peer up
ip -netns peer addr add 172.16.1.2/24 dev veth2
- ip -netns peer -6 addr add 2001:db8:91::2/64 dev veth2
+ ip -netns peer -6 addr add 2001:db8:91::2/64 dev veth2 nodad
$IP li set veth4 netns peer up
ip -netns peer addr add 172.16.2.2/24 dev veth4
- ip -netns peer -6 addr add 2001:db8:92::2/64 dev veth4
+ ip -netns peer -6 addr add 2001:db8:92::2/64 dev veth4 nodad
ip -netns remote li add veth5 type veth peer name veth6
ip -netns remote li set veth5 up
ip -netns remote addr add dev veth5 172.16.101.1/24
- ip -netns remote addr add dev veth5 2001:db8:101::1/64
+ ip -netns remote -6 addr add dev veth5 2001:db8:101::1/64 nodad
ip -netns remote ro add 172.16.0.0/22 via 172.16.101.2
ip -netns remote -6 ro add 2001:db8:90::/40 via 2001:db8:101::2
ip -netns remote li set veth6 netns peer up
ip -netns peer addr add dev veth6 172.16.101.2/24
- ip -netns peer addr add dev veth6 2001:db8:101::2/64
+ ip -netns peer -6 addr add dev veth6 2001:db8:101::2/64 nodad
set +e
}
@@ -248,11 +249,261 @@
local expected="$2"
local out
- out=$($IP -6 route ls match ${pfx} 2>/dev/null)
+ out=$($IP -6 route ls match ${pfx} 2>/dev/null | sed -e 's/pref medium//')
check_output "${out}" "${expected}"
}
+check_large_grp()
+{
+ local ipv=$1
+ local ecmp=$2
+ local grpnum=100
+ local nhidstart=100
+ local grpidstart=1000
+ local iter=0
+ local nhidstr=""
+ local grpidstr=""
+ local grpstr=""
+ local ipstr=""
+
+ if [ $ipv -eq 4 ]; then
+ ipstr="172.16.1."
+ else
+ ipstr="2001:db8:91::"
+ fi
+
+ #
+ # Create $grpnum groups with specified $ecmp and dump them
+ #
+
+ # create nexthops with different gateways
+ iter=2
+ while [ $iter -le $(($ecmp + 1)) ]
+ do
+ nhidstr="$(($nhidstart + $iter))"
+ run_cmd "$IP nexthop add id $nhidstr via $ipstr$iter dev veth1"
+ check_nexthop "id $nhidstr" "id $nhidstr via $ipstr$iter dev veth1 scope link"
+
+ if [ $iter -le $ecmp ]; then
+ grpstr+="$nhidstr/"
+ else
+ grpstr+="$nhidstr"
+ fi
+ ((iter++))
+ done
+
+ # create duplicate large ecmp groups
+ iter=0
+ while [ $iter -le $grpnum ]
+ do
+ grpidstr="$(($grpidstart + $iter))"
+ run_cmd "$IP nexthop add id $grpidstr group $grpstr"
+ check_nexthop "id $grpidstr" "id $grpidstr group $grpstr"
+ ((iter++))
+ done
+
+ # dump large groups
+ run_cmd "$IP nexthop list"
+ log_test $? 0 "Dump large (x$ecmp) ecmp groups"
+}
+
+start_ip_monitor()
+{
+ local mtype=$1
+
+ # start the monitor in the background
+ tmpfile=`mktemp /var/run/nexthoptestXXX`
+ mpid=`($IP monitor $mtype > $tmpfile & echo $!) 2>/dev/null`
+ sleep 0.2
+ echo "$mpid $tmpfile"
+}
+
+stop_ip_monitor()
+{
+ local mpid=$1
+ local tmpfile=$2
+ local el=$3
+
+ # check the monitor results
+ kill $mpid
+ lines=`wc -l $tmpfile | cut "-d " -f1`
+ test $lines -eq $el
+ rc=$?
+ rm -rf $tmpfile
+
+ return $rc
+}
+
+check_nexthop_fdb_support()
+{
+ $IP nexthop help 2>&1 | grep -q fdb
+ if [ $? -ne 0 ]; then
+ echo "SKIP: iproute2 too old, missing fdb nexthop support"
+ return $ksft_skip
+ fi
+}
+
+ipv6_fdb_grp_fcnal()
+{
+ local rc
+
+ echo
+ echo "IPv6 fdb groups functional"
+ echo "--------------------------"
+
+ check_nexthop_fdb_support
+ if [ $? -eq $ksft_skip ]; then
+ return $ksft_skip
+ fi
+
+ # create group with multiple nexthops
+ run_cmd "$IP nexthop add id 61 via 2001:db8:91::2 fdb"
+ run_cmd "$IP nexthop add id 62 via 2001:db8:91::3 fdb"
+ run_cmd "$IP nexthop add id 102 group 61/62 fdb"
+ check_nexthop "id 102" "id 102 group 61/62 fdb"
+ log_test $? 0 "Fdb Nexthop group with multiple nexthops"
+
+ ## get nexthop group
+ run_cmd "$IP nexthop get id 102"
+ check_nexthop "id 102" "id 102 group 61/62 fdb"
+ log_test $? 0 "Get Fdb nexthop group by id"
+
+ # fdb nexthop group can only contain fdb nexthops
+ run_cmd "$IP nexthop add id 63 via 2001:db8:91::4"
+ run_cmd "$IP nexthop add id 64 via 2001:db8:91::5"
+ run_cmd "$IP nexthop add id 103 group 63/64 fdb"
+ log_test $? 2 "Fdb Nexthop group with non-fdb nexthops"
+
+ # Non fdb nexthop group can not contain fdb nexthops
+ run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 fdb"
+ run_cmd "$IP nexthop add id 66 via 2001:db8:91::6 fdb"
+ run_cmd "$IP nexthop add id 104 group 65/66"
+ log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops"
+
+ # fdb nexthop cannot have blackhole
+ run_cmd "$IP nexthop add id 67 blackhole fdb"
+ log_test $? 2 "Fdb Nexthop with blackhole"
+
+ # fdb nexthop with oif
+ run_cmd "$IP nexthop add id 68 via 2001:db8:91::7 dev veth1 fdb"
+ log_test $? 2 "Fdb Nexthop with oif"
+
+ # fdb nexthop with onlink
+ run_cmd "$IP nexthop add id 68 via 2001:db8:91::7 onlink fdb"
+ log_test $? 2 "Fdb Nexthop with onlink"
+
+ # fdb nexthop with encap
+ run_cmd "$IP nexthop add id 69 encap mpls 101 via 2001:db8:91::8 dev veth1 fdb"
+ log_test $? 2 "Fdb Nexthop with encap"
+
+ run_cmd "$IP link add name vx10 type vxlan id 1010 local 2001:db8:91::9 remote 2001:db8:91::10 dstport 4789 nolearning noudpcsum tos inherit ttl 100"
+ run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self"
+ log_test $? 0 "Fdb mac add with nexthop group"
+
+ ## fdb nexthops can only reference nexthop groups and not nexthops
+ run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 61 self"
+ log_test $? 255 "Fdb mac add with nexthop"
+
+ run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 66"
+ log_test $? 2 "Route add with fdb nexthop"
+
+ run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 103"
+ log_test $? 2 "Route add with fdb nexthop group"
+
+ run_cmd "$IP nexthop del id 61"
+ run_cmd "$BRIDGE fdb get to 02:02:00:00:00:13 dev vx10 self"
+ log_test $? 0 "Fdb entry after deleting a single nexthop"
+
+ run_cmd "$IP nexthop del id 102"
+ log_test $? 0 "Fdb nexthop delete"
+
+ run_cmd "$BRIDGE fdb get to 02:02:00:00:00:13 dev vx10 self"
+ log_test $? 254 "Fdb entry after deleting a nexthop group"
+
+ $IP link del dev vx10
+}
+
+ipv4_fdb_grp_fcnal()
+{
+ local rc
+
+ echo
+ echo "IPv4 fdb groups functional"
+ echo "--------------------------"
+
+ check_nexthop_fdb_support
+ if [ $? -eq $ksft_skip ]; then
+ return $ksft_skip
+ fi
+
+ # create group with multiple nexthops
+ run_cmd "$IP nexthop add id 12 via 172.16.1.2 fdb"
+ run_cmd "$IP nexthop add id 13 via 172.16.1.3 fdb"
+ run_cmd "$IP nexthop add id 102 group 12/13 fdb"
+ check_nexthop "id 102" "id 102 group 12/13 fdb"
+ log_test $? 0 "Fdb Nexthop group with multiple nexthops"
+
+ # get nexthop group
+ run_cmd "$IP nexthop get id 102"
+ check_nexthop "id 102" "id 102 group 12/13 fdb"
+ log_test $? 0 "Get Fdb nexthop group by id"
+
+ # fdb nexthop group can only contain fdb nexthops
+ run_cmd "$IP nexthop add id 14 via 172.16.1.2"
+ run_cmd "$IP nexthop add id 15 via 172.16.1.3"
+ run_cmd "$IP nexthop add id 103 group 14/15 fdb"
+ log_test $? 2 "Fdb Nexthop group with non-fdb nexthops"
+
+ # Non fdb nexthop group can not contain fdb nexthops
+ run_cmd "$IP nexthop add id 16 via 172.16.1.2 fdb"
+ run_cmd "$IP nexthop add id 17 via 172.16.1.3 fdb"
+ run_cmd "$IP nexthop add id 104 group 14/15"
+ log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops"
+
+ # fdb nexthop cannot have blackhole
+ run_cmd "$IP nexthop add id 18 blackhole fdb"
+ log_test $? 2 "Fdb Nexthop with blackhole"
+
+ # fdb nexthop with oif
+ run_cmd "$IP nexthop add id 16 via 172.16.1.2 dev veth1 fdb"
+ log_test $? 2 "Fdb Nexthop with oif"
+
+ # fdb nexthop with onlink
+ run_cmd "$IP nexthop add id 16 via 172.16.1.2 onlink fdb"
+ log_test $? 2 "Fdb Nexthop with onlink"
+
+ # fdb nexthop with encap
+ run_cmd "$IP nexthop add id 17 encap mpls 101 via 172.16.1.2 dev veth1 fdb"
+ log_test $? 2 "Fdb Nexthop with encap"
+
+ run_cmd "$IP link add name vx10 type vxlan id 1010 local 10.0.0.1 remote 10.0.0.2 dstport 4789 nolearning noudpcsum tos inherit ttl 100"
+ run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self"
+ log_test $? 0 "Fdb mac add with nexthop group"
+
+ # fdb nexthops can only reference nexthop groups and not nexthops
+ run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 12 self"
+ log_test $? 255 "Fdb mac add with nexthop"
+
+ run_cmd "$IP ro add 172.16.0.0/22 nhid 15"
+ log_test $? 2 "Route add with fdb nexthop"
+
+ run_cmd "$IP ro add 172.16.0.0/22 nhid 103"
+ log_test $? 2 "Route add with fdb nexthop group"
+
+ run_cmd "$IP nexthop del id 12"
+ run_cmd "$BRIDGE fdb get to 02:02:00:00:00:13 dev vx10 self"
+ log_test $? 0 "Fdb entry after deleting a single nexthop"
+
+ run_cmd "$IP nexthop del id 102"
+ log_test $? 0 "Fdb nexthop delete"
+
+ run_cmd "$BRIDGE fdb get to 02:02:00:00:00:13 dev vx10 self"
+ log_test $? 254 "Fdb entry after deleting a nexthop group"
+
+ $IP link del dev vx10
+}
+
################################################################################
# basic operations (add, delete, replace) on nexthops and nexthop groups
#
@@ -423,8 +674,6 @@
echo "IPv6 functional runtime"
echo "-----------------------"
- sleep 5
-
#
# IPv6 - the basics
#
@@ -481,12 +730,12 @@
run_cmd "$IP -6 nexthop add id 85 dev veth1"
run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 85"
log_test $? 0 "IPv6 route with device only nexthop"
- check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 85 dev veth1 metric 1024 pref medium"
+ check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 85 dev veth1 metric 1024"
run_cmd "$IP nexthop add id 123 group 81/85"
run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 123"
log_test $? 0 "IPv6 multipath route with nexthop mix - dev only + gw"
- check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 123 metric 1024 nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop dev veth1 weight 1 pref medium"
+ check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 123 metric 1024 nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop dev veth1 weight 1"
#
# IPv6 route with v4 nexthop - not allowed
@@ -504,6 +753,36 @@
run_cmd "$IP nexthop replace id 81 via 172.16.1.1 dev veth1"
log_test $? 2 "Nexthop replace of group entry - v6 route, v4 nexthop"
+ run_cmd "$IP nexthop add id 86 via 2001:db8:92::2 dev veth3"
+ run_cmd "$IP nexthop add id 87 via 172.16.1.1 dev veth1"
+ run_cmd "$IP nexthop add id 88 via 172.16.1.1 dev veth1"
+ run_cmd "$IP nexthop add id 124 group 86/87/88"
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 124"
+ log_test $? 2 "IPv6 route can not have a group with v4 and v6 gateways"
+
+ run_cmd "$IP nexthop del id 88"
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 124"
+ log_test $? 2 "IPv6 route can not have a group with v4 and v6 gateways"
+
+ run_cmd "$IP nexthop del id 87"
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 124"
+ log_test $? 0 "IPv6 route using a group after removing v4 gateways"
+
+ run_cmd "$IP ro delete 2001:db8:101::1/128"
+ run_cmd "$IP nexthop add id 87 via 172.16.1.1 dev veth1"
+ run_cmd "$IP nexthop add id 88 via 172.16.1.1 dev veth1"
+ run_cmd "$IP nexthop replace id 124 group 86/87/88"
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 124"
+ log_test $? 2 "IPv6 route can not have a group with v4 and v6 gateways"
+
+ run_cmd "$IP nexthop replace id 88 via 2001:db8:92::2 dev veth3"
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 124"
+ log_test $? 2 "IPv6 route can not have a group with v4 and v6 gateways"
+
+ run_cmd "$IP nexthop replace id 87 via 2001:db8:92::2 dev veth3"
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 124"
+ log_test $? 0 "IPv6 route using a group after replacing v4 gateways"
+
$IP nexthop flush >/dev/null 2>&1
#
@@ -532,6 +811,75 @@
# route with src address and using nexthop - not allowed
}
+ipv6_large_grp()
+{
+ local ecmp=32
+
+ echo
+ echo "IPv6 large groups (x$ecmp)"
+ echo "---------------------"
+
+ check_large_grp 6 $ecmp
+
+ $IP nexthop flush >/dev/null 2>&1
+}
+
+ipv6_del_add_loop1()
+{
+ while :; do
+ $IP nexthop del id 100
+ $IP nexthop add id 100 via 2001:db8:91::2 dev veth1
+ done >/dev/null 2>&1
+}
+
+ipv6_grp_replace_loop()
+{
+ while :; do
+ $IP nexthop replace id 102 group 100/101
+ done >/dev/null 2>&1
+}
+
+ipv6_torture()
+{
+ local pid1
+ local pid2
+ local pid3
+ local pid4
+ local pid5
+
+ echo
+ echo "IPv6 runtime torture"
+ echo "--------------------"
+ if [ ! -x "$(command -v mausezahn)" ]; then
+ echo "SKIP: Could not run test; need mausezahn tool"
+ return
+ fi
+
+ run_cmd "$IP nexthop add id 100 via 2001:db8:91::2 dev veth1"
+ run_cmd "$IP nexthop add id 101 via 2001:db8:92::2 dev veth3"
+ run_cmd "$IP nexthop add id 102 group 100/101"
+ run_cmd "$IP route add 2001:db8:101::1 nhid 102"
+ run_cmd "$IP route add 2001:db8:101::2 nhid 102"
+
+ ipv6_del_add_loop1 &
+ pid1=$!
+ ipv6_grp_replace_loop &
+ pid2=$!
+ ip netns exec me ping -f 2001:db8:101::1 >/dev/null 2>&1 &
+ pid3=$!
+ ip netns exec me ping -f 2001:db8:101::2 >/dev/null 2>&1 &
+ pid4=$!
+ ip netns exec me mausezahn -6 veth1 -B 2001:db8:101::2 -A 2001:db8:91::1 -c 0 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 &
+ pid5=$!
+
+ sleep 300
+ kill -9 $pid1 $pid2 $pid3 $pid4 $pid5
+
+ # if we did not crash, success
+ log_test 0 0 "IPv6 torture test"
+}
+
+
ipv4_fcnal()
{
local rc
@@ -879,6 +1227,11 @@
$IP neigh sh | grep 'dev veth1'
fi
+ run_cmd "$IP ro del 172.16.101.1/32 via inet6 ${lladdr} dev veth1"
+ run_cmd "$IP -4 ro add default via inet6 ${lladdr} dev veth1"
+ run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1"
+ log_test $? 0 "IPv4 default route with IPv6 gateway"
+
#
# MPLS as an example of LWT encap
#
@@ -893,6 +1246,241 @@
log_test $? 0 "IPv4 route with MPLS encap, v6 gw - check"
}
+ipv4_large_grp()
+{
+ local ecmp=32
+
+ echo
+ echo "IPv4 large groups (x$ecmp)"
+ echo "---------------------"
+
+ check_large_grp 4 $ecmp
+
+ $IP nexthop flush >/dev/null 2>&1
+}
+
+sysctl_nexthop_compat_mode_check()
+{
+ local sysctlname="net.ipv4.nexthop_compat_mode"
+ local lprefix=$1
+
+ IPE="ip netns exec me"
+
+ $IPE sysctl -q $sysctlname 2>&1 >/dev/null
+ if [ $? -ne 0 ]; then
+ echo "SKIP: kernel lacks nexthop compat mode sysctl control"
+ return $ksft_skip
+ fi
+
+ out=$($IPE sysctl $sysctlname 2>/dev/null)
+ log_test $? 0 "$lprefix default nexthop compat mode check"
+ check_output "${out}" "$sysctlname = 1"
+}
+
+sysctl_nexthop_compat_mode_set()
+{
+ local sysctlname="net.ipv4.nexthop_compat_mode"
+ local mode=$1
+ local lprefix=$2
+
+ IPE="ip netns exec me"
+
+ out=$($IPE sysctl -w $sysctlname=$mode)
+ log_test $? 0 "$lprefix set compat mode - $mode"
+ check_output "${out}" "net.ipv4.nexthop_compat_mode = $mode"
+}
+
+ipv6_compat_mode()
+{
+ local rc
+
+ echo
+ echo "IPv6 nexthop api compat mode test"
+ echo "--------------------------------"
+
+ sysctl_nexthop_compat_mode_check "IPv6"
+ if [ $? -eq $ksft_skip ]; then
+ return $ksft_skip
+ fi
+
+ run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1"
+ run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
+ run_cmd "$IP nexthop add id 122 group 62/63"
+ ipmout=$(start_ip_monitor route)
+
+ run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 122"
+ # route add notification should contain expanded nexthops
+ stop_ip_monitor $ipmout 3
+ log_test $? 0 "IPv6 compat mode on - route add notification"
+
+ # route dump should contain expanded nexthops
+ check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024 nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop via 2001:db8:91::3 dev veth1 weight 1"
+ log_test $? 0 "IPv6 compat mode on - route dump"
+
+ # change in nexthop group should generate route notification
+ run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1"
+ ipmout=$(start_ip_monitor route)
+ run_cmd "$IP nexthop replace id 122 group 62/64"
+ stop_ip_monitor $ipmout 3
+
+ log_test $? 0 "IPv6 compat mode on - nexthop change"
+
+ # set compat mode off
+ sysctl_nexthop_compat_mode_set 0 "IPv6"
+
+ run_cmd "$IP -6 ro del 2001:db8:101::1/128 nhid 122"
+
+ run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1"
+ run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
+ run_cmd "$IP nexthop add id 122 group 62/63"
+ ipmout=$(start_ip_monitor route)
+
+ run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 122"
+ # route add notification should not contain expanded nexthops
+ stop_ip_monitor $ipmout 1
+ log_test $? 0 "IPv6 compat mode off - route add notification"
+
+ # route dump should not contain expanded nexthops
+ check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024"
+ log_test $? 0 "IPv6 compat mode off - route dump"
+
+ # change in nexthop group should not generate route notification
+ run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1"
+ ipmout=$(start_ip_monitor route)
+ run_cmd "$IP nexthop replace id 122 group 62/64"
+ stop_ip_monitor $ipmout 0
+ log_test $? 0 "IPv6 compat mode off - nexthop change"
+
+ # nexthop delete should not generate route notification
+ ipmout=$(start_ip_monitor route)
+ run_cmd "$IP nexthop del id 122"
+ stop_ip_monitor $ipmout 0
+ log_test $? 0 "IPv6 compat mode off - nexthop delete"
+
+ # set compat mode back on
+ sysctl_nexthop_compat_mode_set 1 "IPv6"
+}
+
+ipv4_compat_mode()
+{
+ local rc
+
+ echo
+ echo "IPv4 nexthop api compat mode"
+ echo "----------------------------"
+
+ sysctl_nexthop_compat_mode_check "IPv4"
+ if [ $? -eq $ksft_skip ]; then
+ return $ksft_skip
+ fi
+
+ run_cmd "$IP nexthop add id 21 via 172.16.1.2 dev veth1"
+ run_cmd "$IP nexthop add id 22 via 172.16.1.2 dev veth1"
+ run_cmd "$IP nexthop add id 122 group 21/22"
+ ipmout=$(start_ip_monitor route)
+
+ run_cmd "$IP ro add 172.16.101.1/32 nhid 122"
+ stop_ip_monitor $ipmout 3
+
+ # route add notification should contain expanded nexthops
+ log_test $? 0 "IPv4 compat mode on - route add notification"
+
+ # route dump should contain expanded nexthops
+ check_route "172.16.101.1" "172.16.101.1 nhid 122 nexthop via 172.16.1.2 dev veth1 weight 1 nexthop via 172.16.1.2 dev veth1 weight 1"
+ log_test $? 0 "IPv4 compat mode on - route dump"
+
+ # change in nexthop group should generate route notification
+ run_cmd "$IP nexthop add id 23 via 172.16.1.3 dev veth1"
+ ipmout=$(start_ip_monitor route)
+ run_cmd "$IP nexthop replace id 122 group 21/23"
+ stop_ip_monitor $ipmout 3
+ log_test $? 0 "IPv4 compat mode on - nexthop change"
+
+ sysctl_nexthop_compat_mode_set 0 "IPv4"
+
+ # cleanup
+ run_cmd "$IP ro del 172.16.101.1/32 nhid 122"
+
+ ipmout=$(start_ip_monitor route)
+ run_cmd "$IP ro add 172.16.101.1/32 nhid 122"
+ stop_ip_monitor $ipmout 1
+ # route add notification should not contain expanded nexthops
+ log_test $? 0 "IPv4 compat mode off - route add notification"
+
+ # route dump should not contain expanded nexthops
+ check_route "172.16.101.1" "172.16.101.1 nhid 122"
+ log_test $? 0 "IPv4 compat mode off - route dump"
+
+ # change in nexthop group should not generate route notification
+ ipmout=$(start_ip_monitor route)
+ run_cmd "$IP nexthop replace id 122 group 21/22"
+ stop_ip_monitor $ipmout 0
+ log_test $? 0 "IPv4 compat mode off - nexthop change"
+
+ # nexthop delete should not generate route notification
+ ipmout=$(start_ip_monitor route)
+ run_cmd "$IP nexthop del id 122"
+ stop_ip_monitor $ipmout 0
+ log_test $? 0 "IPv4 compat mode off - nexthop delete"
+
+ sysctl_nexthop_compat_mode_set 1 "IPv4"
+}
+
+ipv4_del_add_loop1()
+{
+ while :; do
+ $IP nexthop del id 100
+ $IP nexthop add id 100 via 172.16.1.2 dev veth1
+ done >/dev/null 2>&1
+}
+
+ipv4_grp_replace_loop()
+{
+ while :; do
+ $IP nexthop replace id 102 group 100/101
+ done >/dev/null 2>&1
+}
+
+ipv4_torture()
+{
+ local pid1
+ local pid2
+ local pid3
+ local pid4
+ local pid5
+
+ echo
+ echo "IPv4 runtime torture"
+ echo "--------------------"
+ if [ ! -x "$(command -v mausezahn)" ]; then
+ echo "SKIP: Could not run test; need mausezahn tool"
+ return
+ fi
+
+ run_cmd "$IP nexthop add id 100 via 172.16.1.2 dev veth1"
+ run_cmd "$IP nexthop add id 101 via 172.16.2.2 dev veth3"
+ run_cmd "$IP nexthop add id 102 group 100/101"
+ run_cmd "$IP route add 172.16.101.1 nhid 102"
+ run_cmd "$IP route add 172.16.101.2 nhid 102"
+
+ ipv4_del_add_loop1 &
+ pid1=$!
+ ipv4_grp_replace_loop &
+ pid2=$!
+ ip netns exec me ping -f 172.16.101.1 >/dev/null 2>&1 &
+ pid3=$!
+ ip netns exec me ping -f 172.16.101.2 >/dev/null 2>&1 &
+ pid4=$!
+ ip netns exec me mausezahn veth1 -B 172.16.101.2 -A 172.16.1.1 -c 0 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 &
+ pid5=$!
+
+ sleep 300
+ kill -9 $pid1 $pid2 $pid3 $pid4 $pid5
+
+ # if we did not crash, success
+ log_test 0 0 "IPv4 torture test"
+}
+
basic()
{
echo
diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 4e19a1c..a7f53c2 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -9,7 +9,7 @@
ksft_skip=4
# all tests in this script. Can be overridden with -t option
-TESTS="unregister down carrier nexthop suppress ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter"
+TESTS="unregister down carrier nexthop suppress ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr"
VERBOSE=0
PAUSE_ON_FAIL=no
@@ -444,24 +444,63 @@
setup
set -e
+ ip netns add ns2
+ ip netns set ns2 auto
+
+ ip -netns ns2 link set dev lo up
+
+ $IP link add name veth1 type veth peer name veth2
+ $IP link set dev veth2 netns ns2
+ $IP address add 192.0.2.1/24 dev veth1
+ ip -netns ns2 address add 192.0.2.1/24 dev veth2
+ $IP link set dev veth1 up
+ ip -netns ns2 link set dev veth2 up
+
$IP link set dev lo address 52:54:00:6a:c7:5e
- $IP link set dummy0 address 52:54:00:6a:c7:5e
- $IP link add dummy1 type dummy
- $IP link set dummy1 address 52:54:00:6a:c7:5e
- $IP link set dev dummy1 up
+ $IP link set dev veth1 address 52:54:00:6a:c7:5e
+ ip -netns ns2 link set dev lo address 52:54:00:6a:c7:5e
+ ip -netns ns2 link set dev veth2 address 52:54:00:6a:c7:5e
+
+ # 1. (ns2) redirect lo's egress to veth2's egress
+ ip netns exec ns2 tc qdisc add dev lo parent root handle 1: fq_codel
+ ip netns exec ns2 tc filter add dev lo parent 1: protocol arp basic \
+ action mirred egress redirect dev veth2
+ ip netns exec ns2 tc filter add dev lo parent 1: protocol ip basic \
+ action mirred egress redirect dev veth2
+
+ # 2. (ns1) redirect veth1's ingress to lo's ingress
+ $NS_EXEC tc qdisc add dev veth1 ingress
+ $NS_EXEC tc filter add dev veth1 ingress protocol arp basic \
+ action mirred ingress redirect dev lo
+ $NS_EXEC tc filter add dev veth1 ingress protocol ip basic \
+ action mirred ingress redirect dev lo
+
+ # 3. (ns1) redirect lo's egress to veth1's egress
+ $NS_EXEC tc qdisc add dev lo parent root handle 1: fq_codel
+ $NS_EXEC tc filter add dev lo parent 1: protocol arp basic \
+ action mirred egress redirect dev veth1
+ $NS_EXEC tc filter add dev lo parent 1: protocol ip basic \
+ action mirred egress redirect dev veth1
+
+ # 4. (ns2) redirect veth2's ingress to lo's ingress
+ ip netns exec ns2 tc qdisc add dev veth2 ingress
+ ip netns exec ns2 tc filter add dev veth2 ingress protocol arp basic \
+ action mirred ingress redirect dev lo
+ ip netns exec ns2 tc filter add dev veth2 ingress protocol ip basic \
+ action mirred ingress redirect dev lo
+
$NS_EXEC sysctl -qw net.ipv4.conf.all.rp_filter=1
$NS_EXEC sysctl -qw net.ipv4.conf.all.accept_local=1
$NS_EXEC sysctl -qw net.ipv4.conf.all.route_localnet=1
-
- $NS_EXEC tc qd add dev dummy1 parent root handle 1: fq_codel
- $NS_EXEC tc filter add dev dummy1 parent 1: protocol arp basic action mirred egress redirect dev lo
- $NS_EXEC tc filter add dev dummy1 parent 1: protocol ip basic action mirred egress redirect dev lo
+ ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=1
+ ip netns exec ns2 sysctl -qw net.ipv4.conf.all.accept_local=1
+ ip netns exec ns2 sysctl -qw net.ipv4.conf.all.route_localnet=1
set +e
- run_cmd "ip netns exec ns1 ping -I dummy1 -w1 -c1 198.51.100.1"
+ run_cmd "ip netns exec ns2 ping -w1 -c1 192.0.2.1"
log_test $? 0 "rp_filter passes local packets"
- run_cmd "ip netns exec ns1 ping -I dummy1 -w1 -c1 127.0.0.1"
+ run_cmd "ip netns exec ns2 ping -w1 -c1 127.0.0.1"
log_test $? 0 "rp_filter passes loopback packets"
cleanup
@@ -1384,12 +1423,37 @@
ipv4_rt_replace_mpath
}
+# checks that cached input route on VRF port is deleted
+# when VRF is deleted
+ipv4_local_rt_cache()
+{
+ run_cmd "ip addr add 10.0.0.1/32 dev lo"
+ run_cmd "ip netns add test-ns"
+ run_cmd "ip link add veth-outside type veth peer name veth-inside"
+ run_cmd "ip link add vrf-100 type vrf table 1100"
+ run_cmd "ip link set veth-outside master vrf-100"
+ run_cmd "ip link set veth-inside netns test-ns"
+ run_cmd "ip link set veth-outside up"
+ run_cmd "ip link set vrf-100 up"
+ run_cmd "ip route add 10.1.1.1/32 dev veth-outside table 1100"
+ run_cmd "ip netns exec test-ns ip link set veth-inside up"
+ run_cmd "ip netns exec test-ns ip addr add 10.1.1.1/32 dev veth-inside"
+ run_cmd "ip netns exec test-ns ip route add 10.0.0.1/32 dev veth-inside"
+ run_cmd "ip netns exec test-ns ip route add default via 10.0.0.1"
+ run_cmd "ip netns exec test-ns ping 10.0.0.1 -c 1 -i 1"
+ run_cmd "ip link delete vrf-100"
+
+ # if we do not hang test is a success
+ log_test $? 0 "Cached route removed from VRF port device"
+}
+
ipv4_route_test()
{
route_setup
ipv4_rt_add
ipv4_rt_replace
+ ipv4_local_rt_cache
route_cleanup
}
@@ -1539,6 +1603,55 @@
route_cleanup
}
+ipv4_del_addr_test()
+{
+ echo
+ echo "IPv4 delete address route tests"
+
+ setup
+
+ set -e
+ $IP li add dummy1 type dummy
+ $IP li set dummy1 up
+ $IP li add dummy2 type dummy
+ $IP li set dummy2 up
+ $IP li add red type vrf table 1111
+ $IP li set red up
+ $IP ro add vrf red unreachable default
+ $IP li set dummy2 vrf red
+
+ $IP addr add dev dummy1 172.16.104.1/24
+ $IP addr add dev dummy1 172.16.104.11/24
+ $IP addr add dev dummy2 172.16.104.1/24
+ $IP addr add dev dummy2 172.16.104.11/24
+ $IP route add 172.16.105.0/24 via 172.16.104.2 src 172.16.104.11
+ $IP route add vrf red 172.16.105.0/24 via 172.16.104.2 src 172.16.104.11
+ set +e
+
+ # removing address from device in vrf should only remove route from vrf table
+ $IP addr del dev dummy2 172.16.104.11/24
+ $IP ro ls vrf red | grep -q 172.16.105.0/24
+ log_test $? 1 "Route removed from VRF when source address deleted"
+
+ $IP ro ls | grep -q 172.16.105.0/24
+ log_test $? 0 "Route in default VRF not removed"
+
+ $IP addr add dev dummy2 172.16.104.11/24
+ $IP route add vrf red 172.16.105.0/24 via 172.16.104.2 src 172.16.104.11
+
+ $IP addr del dev dummy1 172.16.104.11/24
+ $IP ro ls | grep -q 172.16.105.0/24
+ log_test $? 1 "Route removed in default VRF when source address deleted"
+
+ $IP ro ls vrf red | grep -q 172.16.105.0/24
+ log_test $? 0 "Route in VRF is not removed by address delete"
+
+ $IP li del dummy1
+ $IP li del dummy2
+ cleanup
+}
+
+
ipv4_route_v6_gw_test()
{
local rc
@@ -1672,6 +1785,7 @@
ipv4_route_test|ipv4_rt) ipv4_route_test;;
ipv6_addr_metric) ipv6_addr_metric_test;;
ipv4_addr_metric) ipv4_addr_metric_test;;
+ ipv4_del_addr) ipv4_del_addr_test;;
ipv6_route_metrics) ipv6_route_metrics_test;;
ipv4_route_metrics) ipv4_route_metrics_test;;
ipv4_route_v6_gw) ipv4_route_v6_gw_test;;
diff --git a/tools/testing/selftests/net/fin_ack_lat.c b/tools/testing/selftests/net/fin_ack_lat.c
new file mode 100644
index 0000000..7018749
--- /dev/null
+++ b/tools/testing/selftests/net/fin_ack_lat.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+static int child_pid;
+
+static unsigned long timediff(struct timeval s, struct timeval e)
+{
+ unsigned long s_us, e_us;
+
+ s_us = s.tv_sec * 1000000 + s.tv_usec;
+ e_us = e.tv_sec * 1000000 + e.tv_usec;
+ if (s_us > e_us)
+ return 0;
+ return e_us - s_us;
+}
+
+static void client(int port)
+{
+ int sock = 0;
+ struct sockaddr_in addr, laddr;
+ socklen_t len = sizeof(laddr);
+ struct linger sl;
+ int flag = 1;
+ int buffer;
+ struct timeval start, end;
+ unsigned long lat, sum_lat = 0, nr_lat = 0;
+
+ while (1) {
+ gettimeofday(&start, NULL);
+
+ sock = socket(AF_INET, SOCK_STREAM, 0);
+ if (sock < 0)
+ error(-1, errno, "socket creation");
+
+ sl.l_onoff = 1;
+ sl.l_linger = 0;
+ if (setsockopt(sock, SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)))
+ error(-1, errno, "setsockopt(linger)");
+
+ if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY,
+ &flag, sizeof(flag)))
+ error(-1, errno, "setsockopt(nodelay)");
+
+ addr.sin_family = AF_INET;
+ addr.sin_port = htons(port);
+
+ if (inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr) <= 0)
+ error(-1, errno, "inet_pton");
+
+ if (connect(sock, (struct sockaddr *)&addr, sizeof(addr)) < 0)
+ error(-1, errno, "connect");
+
+ send(sock, &buffer, sizeof(buffer), 0);
+ if (read(sock, &buffer, sizeof(buffer)) == -1)
+ error(-1, errno, "waiting read");
+
+ gettimeofday(&end, NULL);
+ lat = timediff(start, end);
+ sum_lat += lat;
+ nr_lat++;
+ if (lat < 100000)
+ goto close;
+
+ if (getsockname(sock, (struct sockaddr *)&laddr, &len) == -1)
+ error(-1, errno, "getsockname");
+ printf("port: %d, lat: %lu, avg: %lu, nr: %lu\n",
+ ntohs(laddr.sin_port), lat,
+ sum_lat / nr_lat, nr_lat);
+close:
+ fflush(stdout);
+ close(sock);
+ }
+}
+
+static void server(int sock, struct sockaddr_in address)
+{
+ int accepted;
+ int addrlen = sizeof(address);
+ int buffer;
+
+ while (1) {
+ accepted = accept(sock, (struct sockaddr *)&address,
+ (socklen_t *)&addrlen);
+ if (accepted < 0)
+ error(-1, errno, "accept");
+
+ if (read(accepted, &buffer, sizeof(buffer)) == -1)
+ error(-1, errno, "read");
+ close(accepted);
+ }
+}
+
+static void sig_handler(int signum)
+{
+ kill(SIGTERM, child_pid);
+ exit(0);
+}
+
+int main(int argc, char const *argv[])
+{
+ int sock;
+ int opt = 1;
+ struct sockaddr_in address;
+ struct sockaddr_in laddr;
+ socklen_t len = sizeof(laddr);
+
+ if (signal(SIGTERM, sig_handler) == SIG_ERR)
+ error(-1, errno, "signal");
+
+ sock = socket(AF_INET, SOCK_STREAM, 0);
+ if (sock < 0)
+ error(-1, errno, "socket");
+
+ if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT,
+ &opt, sizeof(opt)) == -1)
+ error(-1, errno, "setsockopt");
+
+ address.sin_family = AF_INET;
+ address.sin_addr.s_addr = INADDR_ANY;
+ /* dynamically allocate unused port */
+ address.sin_port = 0;
+
+ if (bind(sock, (struct sockaddr *)&address, sizeof(address)) < 0)
+ error(-1, errno, "bind");
+
+ if (listen(sock, 3) < 0)
+ error(-1, errno, "listen");
+
+ if (getsockname(sock, (struct sockaddr *)&laddr, &len) == -1)
+ error(-1, errno, "getsockname");
+
+ fprintf(stderr, "server port: %d\n", ntohs(laddr.sin_port));
+ child_pid = fork();
+ if (!child_pid)
+ client(ntohs(laddr.sin_port));
+ else
+ server(sock, laddr);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/net/fin_ack_lat.sh b/tools/testing/selftests/net/fin_ack_lat.sh
new file mode 100755
index 0000000..a3ff6e0
--- /dev/null
+++ b/tools/testing/selftests/net/fin_ack_lat.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test latency spikes caused by FIN/ACK handling race.
+
+set +x
+set -e
+
+tmpfile=$(mktemp /tmp/fin_ack_latency.XXXX.log)
+
+cleanup() {
+ kill $(pidof fin_ack_lat)
+ rm -f $tmpfile
+}
+
+trap cleanup EXIT
+
+do_test() {
+ RUNTIME=$1
+
+ ./fin_ack_lat | tee $tmpfile &
+ PID=$!
+
+ sleep $RUNTIME
+ NR_SPIKES=$(wc -l $tmpfile | awk '{print $1}')
+ if [ $NR_SPIKES -gt 0 ]
+ then
+ echo "FAIL: $NR_SPIKES spikes detected"
+ return 1
+ fi
+ return 0
+}
+
+do_test "30"
+echo "test done"
diff --git a/tools/testing/selftests/net/forwarding/.gitignore b/tools/testing/selftests/net/forwarding/.gitignore
index a793eef..2dea317 100644
--- a/tools/testing/selftests/net/forwarding/.gitignore
+++ b/tools/testing/selftests/net/forwarding/.gitignore
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
forwarding.config
diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile
new file mode 100644
index 0000000..881e680
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/Makefile
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: GPL-2.0+ OR MIT
+
+TEST_PROGS = bridge_igmp.sh \
+ bridge_port_isolation.sh \
+ bridge_sticky_fdb.sh \
+ bridge_vlan_aware.sh \
+ bridge_vlan_unaware.sh \
+ ethtool.sh \
+ gre_inner_v4_multipath.sh \
+ gre_inner_v6_multipath.sh \
+ gre_multipath.sh \
+ ip6_forward_instats_vrf.sh \
+ ip6gre_inner_v4_multipath.sh \
+ ip6gre_inner_v6_multipath.sh \
+ ipip_flat_gre_key.sh \
+ ipip_flat_gre_keys.sh \
+ ipip_flat_gre.sh \
+ ipip_hier_gre_key.sh \
+ ipip_hier_gre_keys.sh \
+ ipip_hier_gre.sh \
+ loopback.sh \
+ mirror_gre_bound.sh \
+ mirror_gre_bridge_1d.sh \
+ mirror_gre_bridge_1d_vlan.sh \
+ mirror_gre_bridge_1q_lag.sh \
+ mirror_gre_bridge_1q.sh \
+ mirror_gre_changes.sh \
+ mirror_gre_flower.sh \
+ mirror_gre_lag_lacp.sh \
+ mirror_gre_neigh.sh \
+ mirror_gre_nh.sh \
+ mirror_gre.sh \
+ mirror_gre_vlan_bridge_1q.sh \
+ mirror_gre_vlan.sh \
+ mirror_vlan.sh \
+ router_bridge.sh \
+ router_bridge_vlan.sh \
+ router_broadcast.sh \
+ router_mpath_nh.sh \
+ router_multicast.sh \
+ router_multipath.sh \
+ router.sh \
+ router_vid_1.sh \
+ sch_ets.sh \
+ sch_tbf_ets.sh \
+ sch_tbf_prio.sh \
+ sch_tbf_root.sh \
+ tc_actions.sh \
+ tc_chains.sh \
+ tc_flower_router.sh \
+ tc_flower.sh \
+ tc_shblocks.sh \
+ tc_vlan_modify.sh \
+ vxlan_asymmetric.sh \
+ vxlan_bridge_1d_port_8472.sh \
+ vxlan_bridge_1d.sh \
+ vxlan_bridge_1q_port_8472.sh \
+ vxlan_bridge_1q.sh \
+ vxlan_symmetric.sh
+
+TEST_PROGS_EXTENDED := devlink_lib.sh \
+ ethtool_lib.sh \
+ fib_offload_lib.sh \
+ forwarding.config.sample \
+ ipip_lib.sh \
+ lib.sh \
+ mirror_gre_lib.sh \
+ mirror_gre_topo_lib.sh \
+ mirror_lib.sh \
+ mirror_topo_lib.sh \
+ sch_ets_core.sh \
+ sch_ets_tests.sh \
+ sch_tbf_core.sh \
+ sch_tbf_etsprio.sh \
+ tc_common.sh
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/net/forwarding/devlink_lib.sh b/tools/testing/selftests/net/forwarding/devlink_lib.sh
index 13d03a6..9c12c4f 100644
--- a/tools/testing/selftests/net/forwarding/devlink_lib.sh
+++ b/tools/testing/selftests/net/forwarding/devlink_lib.sh
@@ -5,7 +5,7 @@
# Defines
if [[ ! -v DEVLINK_DEV ]]; then
- DEVLINK_DEV=$(devlink port show "${NETIFS[p1]}" -j \
+ DEVLINK_DEV=$(devlink port show "${NETIFS[p1]:-$NETIF_NO_CABLE}" -j \
| jq -r '.port | keys[]' | cut -d/ -f-2)
if [ -z "$DEVLINK_DEV" ]; then
echo "SKIP: ${NETIFS[p1]} has no devlink device registered for it"
@@ -35,6 +35,12 @@
exit 1
fi
+devlink dev help 2>&1 | grep info &> /dev/null
+if [ $? -ne 0 ]; then
+ echo "SKIP: iproute2 too old, missing devlink dev info support"
+ exit 1
+fi
+
##############################################################################
# Devlink helpers
@@ -92,6 +98,11 @@
check_err $? "Failed setting path $path to size $size"
}
+devlink_resource_occ_get()
+{
+ devlink_resource_get "$@" | jq '.["occ"]'
+}
+
devlink_reload()
{
local still_pending
@@ -106,6 +117,12 @@
declare -A DEVLINK_ORIG
+# Changing pool type from static to dynamic causes reinterpretation of threshold
+# values. They therefore need to be saved before pool type is changed, then the
+# pool type can be changed, and then the new values need to be set up. Therefore
+# instead of saving the current state implicitly in the _set call, provide
+# functions for all three primitives: save, set, and restore.
+
devlink_port_pool_threshold()
{
local port=$1; shift
@@ -115,14 +132,21 @@
| jq '.port_pool."'"$port"'"[].threshold'
}
+devlink_port_pool_th_save()
+{
+ local port=$1; shift
+ local pool=$1; shift
+ local key="port_pool($port,$pool).threshold"
+
+ DEVLINK_ORIG[$key]=$(devlink_port_pool_threshold $port $pool)
+}
+
devlink_port_pool_th_set()
{
local port=$1; shift
local pool=$1; shift
local th=$1; shift
- local key="port_pool($port,$pool).threshold"
- DEVLINK_ORIG[$key]=$(devlink_port_pool_threshold $port $pool)
devlink sb port pool set $port pool $pool th $th
}
@@ -131,8 +155,13 @@
local port=$1; shift
local pool=$1; shift
local key="port_pool($port,$pool).threshold"
+ local -a orig=(${DEVLINK_ORIG[$key]})
- devlink sb port pool set $port pool $pool th ${DEVLINK_ORIG[$key]}
+ if [[ -z $orig ]]; then
+ echo "WARNING: Mismatched devlink_port_pool_th_restore"
+ else
+ devlink sb port pool set $port pool $pool th $orig
+ fi
}
devlink_pool_size_thtype()
@@ -143,14 +172,20 @@
| jq -r '.pool[][] | (.size, .thtype)'
}
+devlink_pool_size_thtype_save()
+{
+ local pool=$1; shift
+ local key="pool($pool).size_thtype"
+
+ DEVLINK_ORIG[$key]=$(devlink_pool_size_thtype $pool)
+}
+
devlink_pool_size_thtype_set()
{
local pool=$1; shift
local thtype=$1; shift
local size=$1; shift
- local key="pool($pool).size_thtype"
- DEVLINK_ORIG[$key]=$(devlink_pool_size_thtype $pool)
devlink sb pool set "$DEVLINK_DEV" pool $pool size $size thtype $thtype
}
@@ -160,8 +195,12 @@
local key="pool($pool).size_thtype"
local -a orig=(${DEVLINK_ORIG[$key]})
- devlink sb pool set "$DEVLINK_DEV" pool $pool \
- size ${orig[0]} thtype ${orig[1]}
+ if [[ -z ${orig[0]} ]]; then
+ echo "WARNING: Mismatched devlink_pool_size_thtype_restore"
+ else
+ devlink sb pool set "$DEVLINK_DEV" pool $pool \
+ size ${orig[0]} thtype ${orig[1]}
+ fi
}
devlink_tc_bind_pool_th()
@@ -174,6 +213,16 @@
| jq -r '.tc_bind[][] | (.pool, .threshold)'
}
+devlink_tc_bind_pool_th_save()
+{
+ local port=$1; shift
+ local tc=$1; shift
+ local dir=$1; shift
+ local key="tc_bind($port,$dir,$tc).pool_th"
+
+ DEVLINK_ORIG[$key]=$(devlink_tc_bind_pool_th $port $tc $dir)
+}
+
devlink_tc_bind_pool_th_set()
{
local port=$1; shift
@@ -181,9 +230,7 @@
local dir=$1; shift
local pool=$1; shift
local th=$1; shift
- local key="tc_bind($port,$dir,$tc).pool_th"
- DEVLINK_ORIG[$key]=$(devlink_tc_bind_pool_th $port $tc $dir)
devlink sb tc bind set $port tc $tc type $dir pool $pool th $th
}
@@ -195,8 +242,12 @@
local key="tc_bind($port,$dir,$tc).pool_th"
local -a orig=(${DEVLINK_ORIG[$key]})
- devlink sb tc bind set $port tc $tc type $dir \
- pool ${orig[0]} th ${orig[1]}
+ if [[ -z ${orig[0]} ]]; then
+ echo "WARNING: Mismatched devlink_tc_bind_pool_th_restore"
+ else
+ devlink sb tc bind set $port tc $tc type $dir \
+ pool ${orig[0]} th ${orig[1]}
+ fi
}
devlink_traps_num_get()
@@ -355,3 +406,152 @@
return 1
fi
}
+
+devlink_trap_exception_test()
+{
+ local trap_name=$1; shift
+ local group_name
+
+ group_name=$(devlink_trap_group_get $trap_name)
+
+ devlink_trap_stats_idle_test $trap_name
+ check_fail $? "Trap stats idle when packets should have been trapped"
+
+ devlink_trap_group_stats_idle_test $group_name
+ check_fail $? "Trap group idle when packets should have been trapped"
+}
+
+devlink_trap_drop_test()
+{
+ local trap_name=$1; shift
+ local dev=$1; shift
+ local handle=$1; shift
+ local group_name
+
+ group_name=$(devlink_trap_group_get $trap_name)
+
+ # This is the common part of all the tests. It checks that stats are
+ # initially idle, then non-idle after changing the trap action and
+ # finally idle again. It also makes sure the packets are dropped and
+ # never forwarded.
+ devlink_trap_stats_idle_test $trap_name
+ check_err $? "Trap stats not idle with initial drop action"
+ devlink_trap_group_stats_idle_test $group_name
+ check_err $? "Trap group stats not idle with initial drop action"
+
+ devlink_trap_action_set $trap_name "trap"
+ devlink_trap_stats_idle_test $trap_name
+ check_fail $? "Trap stats idle after setting action to trap"
+ devlink_trap_group_stats_idle_test $group_name
+ check_fail $? "Trap group stats idle after setting action to trap"
+
+ devlink_trap_action_set $trap_name "drop"
+
+ devlink_trap_stats_idle_test $trap_name
+ check_err $? "Trap stats not idle after setting action to drop"
+ devlink_trap_group_stats_idle_test $group_name
+ check_err $? "Trap group stats not idle after setting action to drop"
+
+ tc_check_packets "dev $dev egress" $handle 0
+ check_err $? "Packets were not dropped"
+}
+
+devlink_trap_drop_cleanup()
+{
+ local mz_pid=$1; shift
+ local dev=$1; shift
+ local proto=$1; shift
+ local pref=$1; shift
+ local handle=$1; shift
+
+ kill $mz_pid && wait $mz_pid &> /dev/null
+ tc filter del dev $dev egress protocol $proto pref $pref handle $handle flower
+}
+
+devlink_trap_stats_test()
+{
+ local test_name=$1; shift
+ local trap_name=$1; shift
+ local send_one="$@"
+ local t0_packets
+ local t1_packets
+
+ RET=0
+
+ t0_packets=$(devlink_trap_rx_packets_get $trap_name)
+
+ $send_one && sleep 1
+
+ t1_packets=$(devlink_trap_rx_packets_get $trap_name)
+
+ if [[ $t1_packets -eq $t0_packets ]]; then
+ check_err 1 "Trap stats did not increase"
+ fi
+
+ log_test "$test_name"
+}
+
+devlink_trap_policers_num_get()
+{
+ devlink -j -p trap policer show | jq '.[]["'$DEVLINK_DEV'"] | length'
+}
+
+devlink_trap_policer_rate_get()
+{
+ local policer_id=$1; shift
+
+ devlink -j -p trap policer show $DEVLINK_DEV policer $policer_id \
+ | jq '.[][][]["rate"]'
+}
+
+devlink_trap_policer_burst_get()
+{
+ local policer_id=$1; shift
+
+ devlink -j -p trap policer show $DEVLINK_DEV policer $policer_id \
+ | jq '.[][][]["burst"]'
+}
+
+devlink_trap_policer_rx_dropped_get()
+{
+ local policer_id=$1; shift
+
+ devlink -j -p -s trap policer show $DEVLINK_DEV policer $policer_id \
+ | jq '.[][][]["stats"]["rx"]["dropped"]'
+}
+
+devlink_trap_group_policer_get()
+{
+ local group_name=$1; shift
+
+ devlink -j -p trap group show $DEVLINK_DEV group $group_name \
+ | jq '.[][][]["policer"]'
+}
+
+devlink_trap_policer_ids_get()
+{
+ devlink -j -p trap policer show \
+ | jq '.[]["'$DEVLINK_DEV'"][]["policer"]'
+}
+
+devlink_port_by_netdev()
+{
+ local if_name=$1
+
+ devlink -j port show $if_name | jq -e '.[] | keys' | jq -r '.[]'
+}
+
+devlink_cpu_port_get()
+{
+ local cpu_dl_port_num=$(devlink port list | grep "$DEVLINK_DEV" |
+ grep cpu | cut -d/ -f3 | cut -d: -f1 |
+ sed -n '1p')
+
+ echo "$DEVLINK_DEV/$cpu_dl_port_num"
+}
+
+devlink_cell_size_get()
+{
+ devlink sb pool show "$DEVLINK_DEV" pool 0 -j \
+ | jq '.pool[][].cell_size'
+}
diff --git a/tools/testing/selftests/net/forwarding/ethtool.sh b/tools/testing/selftests/net/forwarding/ethtool.sh
new file mode 100755
index 0000000..dbb9fcf
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/ethtool.sh
@@ -0,0 +1,299 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="
+ same_speeds_autoneg_off
+ different_speeds_autoneg_off
+ combination_of_neg_on_and_off
+ advertise_subset_of_speeds
+ check_highest_speed_is_chosen
+ different_speeds_autoneg_on
+"
+NUM_NETIFS=2
+source lib.sh
+source ethtool_lib.sh
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/24
+}
+
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/24
+}
+
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/24
+}
+
+h2_destroy()
+{
+ simple_if_fini $h2 192.0.2.2/24
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ h2=${NETIFS[p2]}
+
+ h1_create
+ h2_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ h2_destroy
+ h1_destroy
+}
+
+same_speeds_autoneg_off()
+{
+ # Check that when each of the reported speeds is forced, the links come
+ # up and are operational.
+ local -a speeds_arr=($(common_speeds_get $h1 $h2 0 0))
+
+ for speed in "${speeds_arr[@]}"; do
+ RET=0
+ ethtool_set $h1 speed $speed autoneg off
+ ethtool_set $h2 speed $speed autoneg off
+
+ setup_wait_dev_with_timeout $h1
+ setup_wait_dev_with_timeout $h2
+ ping_do $h1 192.0.2.2
+ check_err $? "speed $speed autoneg off"
+ log_test "force of same speed autoneg off"
+ log_info "speed = $speed"
+ done
+
+ ethtool -s $h2 autoneg on
+ ethtool -s $h1 autoneg on
+}
+
+different_speeds_autoneg_off()
+{
+ # Test that when we force different speeds, links are not up and ping
+ # fails.
+ RET=0
+
+ local -a speeds_arr=($(different_speeds_get $h1 $h2 0 0))
+ local speed1=${speeds_arr[0]}
+ local speed2=${speeds_arr[1]}
+
+ ethtool_set $h1 speed $speed1 autoneg off
+ ethtool_set $h2 speed $speed2 autoneg off
+
+ setup_wait_dev_with_timeout $h1
+ setup_wait_dev_with_timeout $h2
+ ping_do $h1 192.0.2.2
+ check_fail $? "ping with different speeds"
+
+ log_test "force of different speeds autoneg off"
+
+ ethtool -s $h2 autoneg on
+ ethtool -s $h1 autoneg on
+}
+
+combination_of_neg_on_and_off()
+{
+ # Test that when one device is forced to a speed supported by both
+ # endpoints and the other device is configured to autoneg on, the links
+ # are up and ping passes.
+ local -a speeds_arr=($(common_speeds_get $h1 $h2 0 1))
+
+ for speed in "${speeds_arr[@]}"; do
+ RET=0
+ ethtool_set $h1 speed $speed autoneg off
+
+ setup_wait_dev_with_timeout $h1
+ setup_wait_dev_with_timeout $h2
+ ping_do $h1 192.0.2.2
+ check_err $? "h1-speed=$speed autoneg off, h2 autoneg on"
+ log_test "one side with autoneg off and another with autoneg on"
+ log_info "force speed = $speed"
+ done
+
+ ethtool -s $h1 autoneg on
+}
+
+hex_speed_value_get()
+{
+ local speed=$1; shift
+
+ local shift_size=${speed_values[$speed]}
+ speed=$((0x1 << $"shift_size"))
+ printf "%#x" "$speed"
+}
+
+subset_of_common_speeds_get()
+{
+ local dev1=$1; shift
+ local dev2=$1; shift
+ local adver=$1; shift
+
+ local -a speeds_arr=($(common_speeds_get $dev1 $dev2 0 $adver))
+ local speed_to_advertise=0
+ local speed_to_remove=${speeds_arr[0]}
+ speed_to_remove+='base'
+
+ local -a speeds_mode_arr=($(common_speeds_get $dev1 $dev2 1 $adver))
+
+ for speed in ${speeds_mode_arr[@]}; do
+ if [[ $speed != $speed_to_remove* ]]; then
+ speed=$(hex_speed_value_get $speed)
+ speed_to_advertise=$(($speed_to_advertise | \
+ $speed))
+ fi
+
+ done
+
+ # Convert to hex.
+ printf "%#x" "$speed_to_advertise"
+}
+
+speed_to_advertise_get()
+{
+ # The function returns the hex number that is composed by OR-ing all
+ # the modes corresponding to the provided speed.
+ local speed_without_mode=$1; shift
+ local supported_speeds=("$@"); shift
+ local speed_to_advertise=0
+
+ speed_without_mode+='base'
+
+ for speed in ${supported_speeds[@]}; do
+ if [[ $speed == $speed_without_mode* ]]; then
+ speed=$(hex_speed_value_get $speed)
+ speed_to_advertise=$(($speed_to_advertise | \
+ $speed))
+ fi
+
+ done
+
+ # Convert to hex.
+ printf "%#x" "$speed_to_advertise"
+}
+
+advertise_subset_of_speeds()
+{
+ # Test that when one device advertises a subset of speeds and another
+ # advertises a specific speed (but all modes of this speed), the links
+ # are up and ping passes.
+ RET=0
+
+ local speed_1_to_advertise=$(subset_of_common_speeds_get $h1 $h2 1)
+ ethtool_set $h1 advertise $speed_1_to_advertise
+
+ if [ $RET != 0 ]; then
+ log_test "advertise subset of speeds"
+ return
+ fi
+
+ local -a speeds_arr_without_mode=($(common_speeds_get $h1 $h2 0 1))
+ # Check only speeds that h1 advertised. Remove the first speed.
+ unset speeds_arr_without_mode[0]
+ local -a speeds_arr_with_mode=($(common_speeds_get $h1 $h2 1 1))
+
+ for speed_value in ${speeds_arr_without_mode[@]}; do
+ RET=0
+ local speed_2_to_advertise=$(speed_to_advertise_get $speed_value \
+ "${speeds_arr_with_mode[@]}")
+ ethtool_set $h2 advertise $speed_2_to_advertise
+
+ setup_wait_dev_with_timeout $h1
+ setup_wait_dev_with_timeout $h2
+ ping_do $h1 192.0.2.2
+ check_err $? "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise ($speed_value)"
+
+ log_test "advertise subset of speeds"
+ log_info "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise"
+ done
+
+ ethtool -s $h2 autoneg on
+ ethtool -s $h1 autoneg on
+}
+
+check_highest_speed_is_chosen()
+{
+ # Test that when one device advertises a subset of speeds, the other
+ # chooses the highest speed. This test checks configuration without
+ # traffic.
+ RET=0
+
+ local max_speed
+ local chosen_speed
+ local speed_to_advertise=$(subset_of_common_speeds_get $h1 $h2 1)
+
+ ethtool_set $h1 advertise $speed_to_advertise
+
+ if [ $RET != 0 ]; then
+ log_test "check highest speed"
+ return
+ fi
+
+ local -a speeds_arr=($(common_speeds_get $h1 $h2 0 1))
+
+ max_speed=${speeds_arr[0]}
+ for current in ${speeds_arr[@]}; do
+ if [[ $current -gt $max_speed ]]; then
+ max_speed=$current
+ fi
+ done
+
+ setup_wait_dev_with_timeout $h1
+ setup_wait_dev_with_timeout $h2
+ chosen_speed=$(ethtool $h1 | grep 'Speed:')
+ chosen_speed=${chosen_speed%"Mb/s"*}
+ chosen_speed=${chosen_speed#*"Speed: "}
+ ((chosen_speed == max_speed))
+ check_err $? "h1 advertise $speed_to_advertise, h2 sync to speed $chosen_speed"
+
+ log_test "check highest speed"
+
+ ethtool -s $h2 autoneg on
+ ethtool -s $h1 autoneg on
+}
+
+different_speeds_autoneg_on()
+{
+ # Test that when we configure links to advertise different speeds,
+ # links are not up and ping fails.
+ RET=0
+
+ local -a speeds=($(different_speeds_get $h1 $h2 1 1))
+ local speed1=${speeds[0]}
+ local speed2=${speeds[1]}
+
+ speed1=$(hex_speed_value_get $speed1)
+ speed2=$(hex_speed_value_get $speed2)
+
+ ethtool_set $h1 advertise $speed1
+ ethtool_set $h2 advertise $speed2
+
+ if (($RET)); then
+ setup_wait_dev_with_timeout $h1
+ setup_wait_dev_with_timeout $h2
+ ping_do $h1 192.0.2.2
+ check_fail $? "ping with different speeds autoneg on"
+ fi
+
+ log_test "advertise different speeds autoneg on"
+
+ ethtool -s $h2 autoneg on
+ ethtool -s $h1 autoneg on
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+declare -gA speed_values
+eval "speed_values=($(speeds_arr_get))"
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh b/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh
new file mode 100755
index 0000000..4b42dfd
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="
+ autoneg
+ autoneg_force_mode
+ no_cable
+"
+
+NUM_NETIFS=2
+source lib.sh
+source ethtool_lib.sh
+
+setup_prepare()
+{
+ swp1=${NETIFS[p1]}
+ swp2=${NETIFS[p2]}
+ swp3=$NETIF_NO_CABLE
+}
+
+ethtool_extended_state_check()
+{
+ local dev=$1; shift
+ local expected_ext_state=$1; shift
+ local expected_ext_substate=${1:-""}; shift
+
+ local ext_state=$(ethtool $dev | grep "Link detected" \
+ | cut -d "(" -f2 | cut -d ")" -f1)
+ local ext_substate=$(echo $ext_state | cut -sd "," -f2 \
+ | sed -e 's/^[[:space:]]*//')
+ ext_state=$(echo $ext_state | cut -d "," -f1)
+
+ [[ $ext_state == $expected_ext_state ]]
+ check_err $? "Expected \"$expected_ext_state\", got \"$ext_state\""
+
+ [[ $ext_substate == $expected_ext_substate ]]
+ check_err $? "Expected \"$expected_ext_substate\", got \"$ext_substate\""
+}
+
+autoneg()
+{
+ RET=0
+
+ ip link set dev $swp1 up
+
+ sleep 4
+ ethtool_extended_state_check $swp1 "Autoneg" "No partner detected"
+
+ log_test "Autoneg, No partner detected"
+
+ ip link set dev $swp1 down
+}
+
+autoneg_force_mode()
+{
+ RET=0
+
+ ip link set dev $swp1 up
+ ip link set dev $swp2 up
+
+ local -a speeds_arr=($(different_speeds_get $swp1 $swp2 0 0))
+ local speed1=${speeds_arr[0]}
+ local speed2=${speeds_arr[1]}
+
+ ethtool_set $swp1 speed $speed1 autoneg off
+ ethtool_set $swp2 speed $speed2 autoneg off
+
+ sleep 4
+ ethtool_extended_state_check $swp1 "Autoneg" \
+ "No partner detected during force mode"
+
+ ethtool_extended_state_check $swp2 "Autoneg" \
+ "No partner detected during force mode"
+
+ log_test "Autoneg, No partner detected during force mode"
+
+ ethtool -s $swp2 autoneg on
+ ethtool -s $swp1 autoneg on
+
+ ip link set dev $swp2 down
+ ip link set dev $swp1 down
+}
+
+no_cable()
+{
+ RET=0
+
+ ip link set dev $swp3 up
+
+ sleep 1
+ ethtool_extended_state_check $swp3 "No cable"
+
+ log_test "No cable"
+
+ ip link set dev $swp3 down
+}
+
+setup_prepare
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/ethtool_lib.sh b/tools/testing/selftests/net/forwarding/ethtool_lib.sh
new file mode 100644
index 0000000..9188e62
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/ethtool_lib.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+speeds_arr_get()
+{
+ cmd='/ETHTOOL_LINK_MODE_[^[:space:]]*_BIT[[:space:]]+=[[:space:]]+/ \
+ {sub(/,$/, "") \
+ sub(/ETHTOOL_LINK_MODE_/,"") \
+ sub(/_BIT/,"") \
+ sub(/_Full/,"/Full") \
+ sub(/_Half/,"/Half");\
+ print "["$1"]="$3}'
+
+ awk "${cmd}" /usr/include/linux/ethtool.h
+}
+
+ethtool_set()
+{
+ local cmd="$@"
+ local out=$(ethtool -s $cmd 2>&1 | wc -l)
+
+ check_err $out "error in configuration. $cmd"
+}
+
+dev_speeds_get()
+{
+ local dev=$1; shift
+ local with_mode=$1; shift
+ local adver=$1; shift
+ local speeds_str
+
+ if (($adver)); then
+ mode="Advertised link modes"
+ else
+ mode="Supported link modes"
+ fi
+
+ speeds_str=$(ethtool "$dev" | \
+ # Snip everything before the link modes section.
+ sed -n '/'"$mode"':/,$p' | \
+ # Quit processing the rest at the start of the next section.
+ # When checking, skip the header of this section (hence the 2,).
+ sed -n '2,${/^[\t][^ \t]/q};p' | \
+ # Drop the section header of the current section.
+ cut -d':' -f2)
+
+ local -a speeds_arr=($speeds_str)
+ if [[ $with_mode -eq 0 ]]; then
+ for ((i=0; i<${#speeds_arr[@]}; i++)); do
+ speeds_arr[$i]=${speeds_arr[$i]%base*}
+ done
+ fi
+ echo ${speeds_arr[@]}
+}
+
+common_speeds_get()
+{
+ dev1=$1; shift
+ dev2=$1; shift
+ with_mode=$1; shift
+ adver=$1; shift
+
+ local -a dev1_speeds=($(dev_speeds_get $dev1 $with_mode $adver))
+ local -a dev2_speeds=($(dev_speeds_get $dev2 $with_mode $adver))
+
+ comm -12 \
+ <(printf '%s\n' "${dev1_speeds[@]}" | sort -u) \
+ <(printf '%s\n' "${dev2_speeds[@]}" | sort -u)
+}
+
+different_speeds_get()
+{
+ local dev1=$1; shift
+ local dev2=$1; shift
+ local with_mode=$1; shift
+ local adver=$1; shift
+
+ local -a speeds_arr
+
+ speeds_arr=($(common_speeds_get $dev1 $dev2 $with_mode $adver))
+ if [[ ${#speeds_arr[@]} < 2 ]]; then
+ check_err 1 "cannot check different speeds. There are not enough speeds"
+ fi
+
+ echo ${speeds_arr[0]} ${speeds_arr[1]}
+}
diff --git a/tools/testing/selftests/net/forwarding/fib_offload_lib.sh b/tools/testing/selftests/net/forwarding/fib_offload_lib.sh
new file mode 100644
index 0000000..6649665
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/fib_offload_lib.sh
@@ -0,0 +1,873 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Various helpers and tests to verify FIB offload.
+
+__fib_trap_check()
+{
+ local ns=$1; shift
+ local family=$1; shift
+ local route=$1; shift
+ local should_fail=$1; shift
+ local ret
+
+ ip -n $ns -j -p -$family route show $route \
+ | jq -e '.[]["flags"] | contains(["trap"])' &> /dev/null
+ ret=$?
+ if [[ $should_fail == "true" ]]; then
+ if [[ $ret -ne 0 ]]; then
+ return 0
+ else
+ return 1
+ fi
+ fi
+
+ return $ret
+}
+
+fib_trap_check()
+{
+ local ns=$1; shift
+ local family=$1; shift
+ local route=$1; shift
+ local should_fail=$1; shift
+
+ busywait 5000 __fib_trap_check $ns $family "$route" $should_fail
+}
+
+fib4_trap_check()
+{
+ local ns=$1; shift
+ local route=$1; shift
+ local should_fail=$1; shift
+
+ fib_trap_check $ns 4 "$route" $should_fail
+}
+
+fib6_trap_check()
+{
+ local ns=$1; shift
+ local route=$1; shift
+ local should_fail=$1; shift
+
+ fib_trap_check $ns 6 "$route" $should_fail
+}
+
+fib_ipv4_identical_routes_test()
+{
+ local ns=$1; shift
+ local i
+
+ RET=0
+
+ for i in $(seq 1 3); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ done
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 0 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 0 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route append 192.0.2.0/24 dev dummy2 tos 0 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy2 tos 0 metric 1024" true
+ check_err $? "Appended route in hardware when should not"
+
+ ip -n $ns route prepend 192.0.2.0/24 dev dummy3 tos 0 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy3 tos 0 metric 1024" false
+ check_err $? "Prepended route not in hardware when should"
+
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 0 metric 1024" true
+ check_err $? "Route was not replaced in hardware by prepended one"
+
+ log_test "IPv4 identical routes"
+
+ for i in $(seq 1 3); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+fib_ipv4_tos_test()
+{
+ local ns=$1; shift
+
+ RET=0
+
+ ip -n $ns link add name dummy1 type dummy
+ ip -n $ns link set dev dummy1 up
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 0 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 0 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 2 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 2 metric 1024" false
+ check_err $? "Highest TOS route not in hardware when should"
+
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 0 metric 1024" true
+ check_err $? "Lowest TOS route still in hardware when should not"
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 1 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 1 metric 1024" true
+ check_err $? "Middle TOS route in hardware when should not"
+
+ log_test "IPv4 routes with TOS"
+
+ ip -n $ns link del dev dummy1
+}
+
+fib_ipv4_metric_test()
+{
+ local ns=$1; shift
+
+ RET=0
+
+ ip -n $ns link add name dummy1 type dummy
+ ip -n $ns link set dev dummy1 up
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1022
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1022" false
+ check_err $? "Lowest metric route not in hardware when should"
+
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1024" true
+ check_err $? "Highest metric route still in hardware when should not"
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1023
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1023" true
+ check_err $? "Middle metric route in hardware when should not"
+
+ log_test "IPv4 routes with metric"
+
+ ip -n $ns link del dev dummy1
+}
+
+fib_ipv4_replace_test()
+{
+ local ns=$1; shift
+ local i
+
+ RET=0
+
+ for i in $(seq 1 2); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ done
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route replace 192.0.2.0/24 dev dummy2 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy2 metric 1024" false
+ check_err $? "Replacement route not in hardware when should"
+
+ # Add a route with an higher metric and make sure that replacing it
+ # does not affect the lower metric one.
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1025
+ ip -n $ns route replace 192.0.2.0/24 dev dummy2 metric 1025
+
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy2 metric 1024" false
+ check_err $? "Lowest metric route not in hardware when should"
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy2 metric 1025" true
+ check_err $? "Highest metric route in hardware when should not"
+
+ log_test "IPv4 route replace"
+
+ for i in $(seq 1 2); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+fib_ipv4_delete_test()
+{
+ local ns=$1; shift
+ local metric
+
+ RET=0
+
+ ip -n $ns link add name dummy1 type dummy
+ ip -n $ns link set dev dummy1 up
+
+ # Insert multiple routes with the same prefix and length and varying
+ # metrics. Make sure that throughout delete operations the lowest
+ # metric route is the one in hardware.
+ for metric in $(seq 1024 1026); do
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 metric $metric
+ done
+
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route del 192.0.2.0/24 dev dummy1 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1025" false
+ check_err $? "Lowest metric route not in hardware when should"
+
+ ip -n $ns route del 192.0.2.0/24 dev dummy1 metric 1026
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1025" false
+ check_err $? "Sole route not in hardware when should"
+
+ log_test "IPv4 route delete"
+
+ ip -n $ns link del dev dummy1
+}
+
+fib_ipv4_plen_test()
+{
+ local ns=$1; shift
+
+ RET=0
+
+ ip -n $ns link add name dummy1 type dummy
+ ip -n $ns link set dev dummy1 up
+
+ # Add two routes with the same key and different prefix length and
+ # make sure both are in hardware. It can be verfied that both are
+ # sharing the same leaf by checking the /proc/net/fib_trie
+ ip -n $ns route add 192.0.2.0/24 dev dummy1
+ ip -n $ns route add 192.0.2.0/25 dev dummy1
+
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1" false
+ check_err $? "/24 not in hardware when should"
+
+ fib4_trap_check $ns "192.0.2.0/25 dev dummy1" false
+ check_err $? "/25 not in hardware when should"
+
+ log_test "IPv4 routes with different prefix length"
+
+ ip -n $ns link del dev dummy1
+}
+
+fib_ipv4_replay_metric_test()
+{
+ local ns=$1; shift
+ local devlink_dev=$1; shift
+
+ RET=0
+
+ ip -n $ns link add name dummy1 type dummy
+ ip -n $ns link set dev dummy1 up
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1024
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1025
+
+ devlink -N $ns dev reload $devlink_dev
+
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1024" false
+ check_err $? "Lowest metric route not in hardware when should"
+
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1025" true
+ check_err $? "Highest metric route in hardware when should not"
+
+ log_test "IPv4 routes replay - metric"
+
+ ip -n $ns link del dev dummy1
+}
+
+fib_ipv4_replay_tos_test()
+{
+ local ns=$1; shift
+ local devlink_dev=$1; shift
+
+ RET=0
+
+ ip -n $ns link add name dummy1 type dummy
+ ip -n $ns link set dev dummy1 up
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 0
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 1
+
+ devlink -N $ns dev reload $devlink_dev
+
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 1" false
+ check_err $? "Highest TOS route not in hardware when should"
+
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 0" true
+ check_err $? "Lowest TOS route in hardware when should not"
+
+ log_test "IPv4 routes replay - TOS"
+
+ ip -n $ns link del dev dummy1
+}
+
+fib_ipv4_replay_plen_test()
+{
+ local ns=$1; shift
+ local devlink_dev=$1; shift
+
+ RET=0
+
+ ip -n $ns link add name dummy1 type dummy
+ ip -n $ns link set dev dummy1 up
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1
+ ip -n $ns route add 192.0.2.0/25 dev dummy1
+
+ devlink -N $ns dev reload $devlink_dev
+
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1" false
+ check_err $? "/24 not in hardware when should"
+
+ fib4_trap_check $ns "192.0.2.0/25 dev dummy1" false
+ check_err $? "/25 not in hardware when should"
+
+ log_test "IPv4 routes replay - prefix length"
+
+ ip -n $ns link del dev dummy1
+}
+
+fib_ipv4_flush_test()
+{
+ local ns=$1; shift
+ local metric
+
+ RET=0
+
+ ip -n $ns link add name dummy1 type dummy
+ ip -n $ns link set dev dummy1 up
+
+ # Exercise the routes flushing code paths by inserting various
+ # prefix routes on a netdev and then deleting it.
+ for metric in $(seq 1 20); do
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 metric $metric
+ done
+
+ ip -n $ns link del dev dummy1
+
+ log_test "IPv4 routes flushing"
+}
+
+fib_ipv6_add_test()
+{
+ local ns=$1; shift
+
+ RET=0
+
+ for i in $(seq 1 2); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ done
+
+ ip -n $ns route add 2001:db8:1::/64 dev dummy1 metric 1024
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route append 2001:db8:1::/64 dev dummy2 metric 1024
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy2 metric 1024" true
+ check_err $? "Route in hardware when should not"
+
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1024" false
+ check_err $? "Route not in hardware after appending route"
+
+ log_test "IPv6 single route add"
+
+ for i in $(seq 1 2); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+fib_ipv6_metric_test()
+{
+ local ns=$1; shift
+
+ RET=0
+
+ ip -n $ns link add name dummy1 type dummy
+ ip -n $ns link set dev dummy1 up
+
+ ip -n $ns route add 2001:db8:1::/64 dev dummy1 metric 1024
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route add 2001:db8:1::/64 dev dummy1 metric 1022
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1022" false
+ check_err $? "Lowest metric route not in hardware when should"
+
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1024" true
+ check_err $? "Highest metric route still in hardware when should not"
+
+ ip -n $ns route add 2001:db8:1::/64 dev dummy1 metric 1023
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1023" true
+ check_err $? "Middle metric route in hardware when should not"
+
+ log_test "IPv6 routes with metric"
+
+ ip -n $ns link del dev dummy1
+}
+
+fib_ipv6_append_single_test()
+{
+ local ns=$1; shift
+
+ # When an IPv6 multipath route is added without the 'nexthop' keyword,
+ # different code paths are taken compared to when the keyword is used.
+ # This test tries to verify the former.
+ RET=0
+
+ for i in $(seq 1 2); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i
+ done
+
+ ip -n $ns route add 2001:db8:10::/64 via 2001:db8:1::2 metric 1024
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route append 2001:db8:10::/64 via 2001:db8:2::2 metric 1024
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Route not in hardware after appending"
+
+ ip -n $ns route add 2001:db8:10::/64 via 2001:db8:1::2 metric 1025
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1025" true
+ check_err $? "Route in hardware when should not"
+
+ ip -n $ns route append 2001:db8:10::/64 via 2001:db8:2::2 metric 1025
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1025" true
+ check_err $? "Route in hardware when should not after appending"
+
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Lowest metric route not in hardware when should"
+
+ log_test "IPv6 append single route without 'nexthop' keyword"
+
+ for i in $(seq 1 2); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+fib_ipv6_replace_single_test()
+{
+ local ns=$1; shift
+ local i
+
+ RET=0
+
+ for i in $(seq 1 2); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ done
+
+ ip -n $ns route add 2001:db8:1::/64 dev dummy1 metric 1024
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route replace 2001:db8:1::/64 dev dummy2 metric 1024
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy2 metric 1024" false
+ check_err $? "Replacement route not in hardware when should"
+
+ # Add a route with an higher metric and make sure that replacing it
+ # does not affect the lower metric one.
+ ip -n $ns route add 2001:db8:1::/64 dev dummy1 metric 1025
+ ip -n $ns route replace 2001:db8:1::/64 dev dummy2 metric 1025
+
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy2 metric 1024" false
+ check_err $? "Lowest metric route not in hardware when should"
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy2 metric 1025" true
+ check_err $? "Highest metric route in hardware when should not"
+
+ log_test "IPv6 single route replace"
+
+ for i in $(seq 1 2); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+fib_ipv6_metric_multipath_test()
+{
+ local ns=$1; shift
+
+ RET=0
+
+ for i in $(seq 1 2); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i
+ done
+
+ ip -n $ns route add 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route add 2001:db8:10::/64 metric 1022 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1022" false
+ check_err $? "Lowest metric route not in hardware when should"
+
+ ip -n $ns route add 2001:db8:10::/64 metric 1023 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" true
+ check_err $? "Highest metric route still in hardware when should not"
+
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1023" true
+ check_err $? "Middle metric route in hardware when should not"
+
+ log_test "IPv6 multipath routes with metric"
+
+ for i in $(seq 1 2); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+fib_ipv6_append_multipath_test()
+{
+ local ns=$1; shift
+
+ RET=0
+
+ for i in $(seq 1 3); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i
+ done
+
+ ip -n $ns route add 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route append 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:2::2 dev dummy2 \
+ nexthop via 2001:db8:3::2 dev dummy3
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Route not in hardware after appending"
+
+ ip -n $ns route add 2001:db8:10::/64 metric 1025 \
+ nexthop via 2001:db8:1::2 dev dummy1
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1025" true
+ check_err $? "Route in hardware when should not"
+
+ ip -n $ns route append 2001:db8:10::/64 metric 1025 \
+ nexthop via 2001:db8:2::2 dev dummy2 \
+ nexthop via 2001:db8:3::2 dev dummy3
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1025" true
+ check_err $? "Route in hardware when should not after appending"
+
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Lowest metric route not in hardware when should"
+
+ log_test "IPv6 append multipath route with 'nexthop' keyword"
+
+ for i in $(seq 1 3); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+fib_ipv6_replace_multipath_test()
+{
+ local ns=$1; shift
+ local i
+
+ RET=0
+
+ for i in $(seq 1 3); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i
+ done
+
+ ip -n $ns route add 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route replace 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:3::2 dev dummy3
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Replacement route not in hardware when should"
+
+ # Add a route with an higher metric and make sure that replacing it
+ # does not affect the lower metric one.
+ ip -n $ns route add 2001:db8:10::/64 metric 1025 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ ip -n $ns route replace 2001:db8:10::/64 metric 1025 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:3::2 dev dummy3
+
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Lowest metric route not in hardware when should"
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1025" true
+ check_err $? "Highest metric route in hardware when should not"
+
+ log_test "IPv6 multipath route replace"
+
+ for i in $(seq 1 3); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+fib_ipv6_append_multipath_to_single_test()
+{
+ local ns=$1; shift
+
+ # Test that when the first route in the leaf is not a multipath route
+ # and we try to append a multipath route with the same metric to it, it
+ # is not notified.
+ RET=0
+
+ for i in $(seq 1 2); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i
+ done
+
+ ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1024
+ fib6_trap_check $ns "2001:db8:10::/64 dev dummy1 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route append 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ fib6_trap_check $ns "2001:db8:10::/64 dev dummy2 metric 1024" true
+ check_err $? "Route in hardware when should not"
+
+ fib6_trap_check $ns "2001:db8:10::/64 dev dummy1 metric 1024" false
+ check_err $? "Route not in hardware after append"
+
+ log_test "IPv6 append multipath route to non-multipath route"
+
+ for i in $(seq 1 2); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+fib_ipv6_delete_single_test()
+{
+ local ns=$1; shift
+
+ # Test various deletion scenarios, where only a single route is
+ # deleted from the FIB node.
+ for i in $(seq 1 2); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i
+ done
+
+ # Test deletion of a single route when it is the only route in the FIB
+ # node.
+ RET=0
+
+ ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1024
+ ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1024
+
+ log_test "IPv6 delete sole single route"
+
+ # Test that deletion of last route does not affect the first one.
+ RET=0
+
+ ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1024
+ ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1025
+ ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1025
+
+ fib6_trap_check $ns "2001:db8:10::/64 dev dummy1 metric 1024" false
+ check_err $? "Route not in hardware after deleting higher metric route"
+
+ log_test "IPv6 delete single route not in hardware"
+
+ ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1024
+
+ # Test that first route is replaced by next single route in the FIB
+ # node.
+ RET=0
+
+ ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1024
+ ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1025
+ ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1024
+
+ fib6_trap_check $ns "2001:db8:10::/64 dev dummy1 metric 1025" false
+ check_err $? "Route not in hardware after deleting lowest metric route"
+
+ log_test "IPv6 delete single route - replaced by single"
+
+ ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1025
+
+ # Test that first route is replaced by next multipath route in the FIB
+ # node.
+ RET=0
+
+ ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1024
+ ip -n $ns route add 2001:db8:10::/64 metric 1025 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1024
+
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1025" false
+ check_err $? "Route not in hardware after deleting lowest metric route"
+
+ log_test "IPv6 delete single route - replaced by multipath"
+
+ ip -n $ns route del 2001:db8:10::/64 metric 1025
+
+ # Test deletion of a single nexthop from a multipath route.
+ ip -n $ns route add 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ ip -n $ns route del 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1
+
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Route not in hardware after deleting a single nexthop"
+
+ log_test "IPv6 delete single nexthop"
+
+ ip -n $ns route del 2001:db8:10::/64 metric 1024
+
+ for i in $(seq 1 2); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+fib_ipv6_delete_multipath_test()
+{
+ local ns=$1; shift
+
+ # Test various deletion scenarios, where an entire multipath route is
+ # deleted from the FIB node.
+ for i in $(seq 1 2); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i
+ done
+
+ # Test deletion of a multipath route when it is the only route in the
+ # FIB node.
+ RET=0
+
+ ip -n $ns route add 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ ip -n $ns route del 2001:db8:10::/64 metric 1024
+
+ log_test "IPv6 delete sole multipath route"
+
+ # Test that deletion of last route does not affect the first one.
+ RET=0
+
+ ip -n $ns route add 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ ip -n $ns route add 2001:db8:10::/64 metric 1025 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ ip -n $ns route del 2001:db8:10::/64 metric 1025
+
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Route not in hardware after deleting higher metric route"
+
+ log_test "IPv6 delete multipath route not in hardware"
+
+ ip -n $ns route del 2001:db8:10::/64 metric 1024
+
+ # Test that first route is replaced by next single route in the FIB
+ # node.
+ RET=0
+
+ ip -n $ns route add 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1025
+ ip -n $ns route del 2001:db8:10::/64 metric 1024
+
+ fib6_trap_check $ns "2001:db8:10::/64 dev dummy1 metric 1025" false
+ check_err $? "Route not in hardware after deleting lowest metric route"
+
+ log_test "IPv6 delete multipath route - replaced by single"
+
+ ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1025
+
+ # Test that first route is replaced by next multipath route in the FIB
+ # node.
+ RET=0
+
+ ip -n $ns route add 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ ip -n $ns route add 2001:db8:10::/64 metric 1025 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ ip -n $ns route del 2001:db8:10::/64 metric 1024
+
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1025" false
+ check_err $? "Route not in hardware after deleting lowest metric route"
+
+ log_test "IPv6 delete multipath route - replaced by multipath"
+
+ ip -n $ns route del 2001:db8:10::/64 metric 1025
+
+ for i in $(seq 1 2); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+fib_ipv6_replay_single_test()
+{
+ local ns=$1; shift
+ local devlink_dev=$1; shift
+
+ RET=0
+
+ for i in $(seq 1 2); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ done
+
+ ip -n $ns route add 2001:db8:1::/64 dev dummy1
+ ip -n $ns route append 2001:db8:1::/64 dev dummy2
+
+ devlink -N $ns dev reload $devlink_dev
+
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy1" false
+ check_err $? "First route not in hardware when should"
+
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy2" true
+ check_err $? "Second route in hardware when should not"
+
+ log_test "IPv6 routes replay - single route"
+
+ for i in $(seq 1 2); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+fib_ipv6_replay_multipath_test()
+{
+ local ns=$1; shift
+ local devlink_dev=$1; shift
+
+ RET=0
+
+ for i in $(seq 1 2); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i
+ done
+
+ ip -n $ns route add 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ ip -n $ns route add 2001:db8:10::/64 metric 1025 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+
+ devlink -N $ns dev reload $devlink_dev
+
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "First route not in hardware when should"
+
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1025" true
+ check_err $? "Second route in hardware when should not"
+
+ log_test "IPv6 routes replay - multipath route"
+
+ for i in $(seq 1 2); do
+ ip -n $ns link del dev dummy$i
+ done
+}
diff --git a/tools/testing/selftests/net/forwarding/forwarding.config.sample b/tools/testing/selftests/net/forwarding/forwarding.config.sample
index e2adb53..e51def3 100644
--- a/tools/testing/selftests/net/forwarding/forwarding.config.sample
+++ b/tools/testing/selftests/net/forwarding/forwarding.config.sample
@@ -13,6 +13,11 @@
NETIFS[p6]=veth5
NETIFS[p7]=veth6
NETIFS[p8]=veth7
+NETIFS[p9]=veth8
+NETIFS[p10]=veth9
+
+# Port that does not have a cable connected.
+NETIF_NO_CABLE=eth8
##############################################################################
# Defines
@@ -36,3 +41,5 @@
# Timeout (in seconds) before ping exits regardless of how many packets have
# been sent or received
PING_TIMEOUT=5
+# IPv6 traceroute utility name.
+TROUTE6=traceroute6
diff --git a/tools/testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh b/tools/testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh
new file mode 100755
index 0000000..9f5b3e2
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh
@@ -0,0 +1,172 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test ipv6 stats on the incoming if when forwarding with VRF
+
+ALL_TESTS="
+ ipv6_ping
+ ipv6_in_too_big_err
+ ipv6_in_hdr_err
+ ipv6_in_addr_err
+ ipv6_in_discard
+"
+
+NUM_NETIFS=4
+source lib.sh
+
+h1_create()
+{
+ simple_if_init $h1 2001:1:1::2/64
+ ip -6 route add vrf v$h1 2001:1:2::/64 via 2001:1:1::1
+}
+
+h1_destroy()
+{
+ ip -6 route del vrf v$h1 2001:1:2::/64 via 2001:1:1::1
+ simple_if_fini $h1 2001:1:1::2/64
+}
+
+router_create()
+{
+ vrf_create router
+ __simple_if_init $rtr1 router 2001:1:1::1/64
+ __simple_if_init $rtr2 router 2001:1:2::1/64
+ mtu_set $rtr2 1280
+}
+
+router_destroy()
+{
+ mtu_restore $rtr2
+ __simple_if_fini $rtr2 2001:1:2::1/64
+ __simple_if_fini $rtr1 2001:1:1::1/64
+ vrf_destroy router
+}
+
+h2_create()
+{
+ simple_if_init $h2 2001:1:2::2/64
+ ip -6 route add vrf v$h2 2001:1:1::/64 via 2001:1:2::1
+ mtu_set $h2 1280
+}
+
+h2_destroy()
+{
+ mtu_restore $h2
+ ip -6 route del vrf v$h2 2001:1:1::/64 via 2001:1:2::1
+ simple_if_fini $h2 2001:1:2::2/64
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ rtr1=${NETIFS[p2]}
+
+ rtr2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ vrf_prepare
+ h1_create
+ router_create
+ h2_create
+
+ forwarding_enable
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ h2_destroy
+ router_destroy
+ h1_destroy
+ vrf_cleanup
+}
+
+ipv6_in_too_big_err()
+{
+ RET=0
+
+ local t0=$(ipv6_stats_get $rtr1 Ip6InTooBigErrors)
+ local vrf_name=$(master_name_get $h1)
+
+ # Send too big packets
+ ip vrf exec $vrf_name \
+ $PING6 -s 1300 2001:1:2::2 -c 1 -w $PING_TIMEOUT &> /dev/null
+
+ local t1=$(ipv6_stats_get $rtr1 Ip6InTooBigErrors)
+ test "$((t1 - t0))" -ne 0
+ check_err $?
+ log_test "Ip6InTooBigErrors"
+}
+
+ipv6_in_hdr_err()
+{
+ RET=0
+
+ local t0=$(ipv6_stats_get $rtr1 Ip6InHdrErrors)
+ local vrf_name=$(master_name_get $h1)
+
+ # Send packets with hop limit 1, easiest with traceroute6 as some ping6
+ # doesn't allow hop limit to be specified
+ ip vrf exec $vrf_name \
+ $TROUTE6 2001:1:2::2 &> /dev/null
+
+ local t1=$(ipv6_stats_get $rtr1 Ip6InHdrErrors)
+ test "$((t1 - t0))" -ne 0
+ check_err $?
+ log_test "Ip6InHdrErrors"
+}
+
+ipv6_in_addr_err()
+{
+ RET=0
+
+ local t0=$(ipv6_stats_get $rtr1 Ip6InAddrErrors)
+ local vrf_name=$(master_name_get $h1)
+
+ # Disable forwarding temporary while sending the packet
+ sysctl -qw net.ipv6.conf.all.forwarding=0
+ ip vrf exec $vrf_name \
+ $PING6 2001:1:2::2 -c 1 -w $PING_TIMEOUT &> /dev/null
+ sysctl -qw net.ipv6.conf.all.forwarding=1
+
+ local t1=$(ipv6_stats_get $rtr1 Ip6InAddrErrors)
+ test "$((t1 - t0))" -ne 0
+ check_err $?
+ log_test "Ip6InAddrErrors"
+}
+
+ipv6_in_discard()
+{
+ RET=0
+
+ local t0=$(ipv6_stats_get $rtr1 Ip6InDiscards)
+ local vrf_name=$(master_name_get $h1)
+
+ # Add a policy to discard
+ ip xfrm policy add dst 2001:1:2::2/128 dir fwd action block
+ ip vrf exec $vrf_name \
+ $PING6 2001:1:2::2 -c 1 -w $PING_TIMEOUT &> /dev/null
+ ip xfrm policy del dst 2001:1:2::2/128 dir fwd
+
+ local t1=$(ipv6_stats_get $rtr1 Ip6InDiscards)
+ test "$((t1 - t0))" -ne 0
+ check_err $?
+ log_test "Ip6InDiscards"
+}
+ipv6_ping()
+{
+ RET=0
+
+ ping6_test $h1 2001:1:2::2
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 85c587a..be6fa80 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -18,6 +18,8 @@
MCD=${MCD:=smcrouted}
MC_CLI=${MC_CLI:=smcroutectl}
PING_TIMEOUT=${PING_TIMEOUT:=5}
+WAIT_TIMEOUT=${WAIT_TIMEOUT:=20}
+INTERFACE_TIMEOUT=${INTERFACE_TIMEOUT:=600}
relative_path="${BASH_SOURCE%/*}"
if [[ "$relative_path" == "${BASH_SOURCE}" ]]; then
@@ -58,6 +60,15 @@
fi
}
+check_tc_action_hw_stats_support()
+{
+ tc actions help 2>&1 | grep -q hw_stats
+ if [[ $? -ne 0 ]]; then
+ echo "SKIP: iproute2 too old; tc is missing action hw_stats support"
+ exit 1
+ fi
+}
+
if [[ "$(id -u)" -ne 0 ]]; then
echo "SKIP: need root privileges"
exit 0
@@ -223,27 +234,116 @@
echo "INFO: $msg"
}
+busywait()
+{
+ local timeout=$1; shift
+
+ local start_time="$(date -u +%s%3N)"
+ while true
+ do
+ local out
+ out=$("$@")
+ local ret=$?
+ if ((!ret)); then
+ echo -n "$out"
+ return 0
+ fi
+
+ local current_time="$(date -u +%s%3N)"
+ if ((current_time - start_time > timeout)); then
+ echo -n "$out"
+ return 1
+ fi
+ done
+}
+
+not()
+{
+ "$@"
+ [[ $? != 0 ]]
+}
+
+grep_bridge_fdb()
+{
+ local addr=$1; shift
+ local word
+ local flag
+
+ if [ "$1" == "self" ] || [ "$1" == "master" ]; then
+ word=$1; shift
+ if [ "$1" == "-v" ]; then
+ flag=$1; shift
+ fi
+ fi
+
+ $@ | grep $addr | grep $flag "$word"
+}
+
+wait_for_offload()
+{
+ "$@" | grep -q offload
+}
+
+until_counter_is()
+{
+ local expr=$1; shift
+ local current=$("$@")
+
+ echo $((current))
+ ((current $expr))
+}
+
+busywait_for_counter()
+{
+ local timeout=$1; shift
+ local delta=$1; shift
+
+ local base=$("$@")
+ busywait "$timeout" until_counter_is ">= $((base + delta))" "$@"
+}
+
setup_wait_dev()
{
local dev=$1; shift
+ local wait_time=${1:-$WAIT_TIME}; shift
- while true; do
+ setup_wait_dev_with_timeout "$dev" $INTERFACE_TIMEOUT $wait_time
+
+ if (($?)); then
+ check_err 1
+ log_test setup_wait_dev ": Interface $dev does not come up."
+ exit 1
+ fi
+}
+
+setup_wait_dev_with_timeout()
+{
+ local dev=$1; shift
+ local max_iterations=${1:-$WAIT_TIMEOUT}; shift
+ local wait_time=${1:-$WAIT_TIME}; shift
+ local i
+
+ for ((i = 1; i <= $max_iterations; ++i)); do
ip link show dev $dev up \
| grep 'state UP' &> /dev/null
if [[ $? -ne 0 ]]; then
sleep 1
else
- break
+ sleep $wait_time
+ return 0
fi
done
+
+ return 1
}
setup_wait()
{
local num_netifs=${1:-$NUM_NETIFS}
+ local i
for ((i = 1; i <= num_netifs; ++i)); do
- setup_wait_dev ${NETIFS[p$i]}
+ setup_wait_dev ${NETIFS[p$i]} 0
done
# Make sure links are ready.
@@ -254,6 +354,7 @@
{
local cmd=$1
local jq_exp=$2
+ local jq_opts=$3
local ret
local output
@@ -263,7 +364,11 @@
if [[ $ret -ne 0 ]]; then
return $ret
fi
- output=$(echo $output | jq -r "$jq_exp")
+ output=$(echo $output | jq -r $jq_opts "$jq_exp")
+ ret=$?
+ if [[ $ret -ne 0 ]]; then
+ return $ret
+ fi
echo $output
# return success only in case of non-empty output
[ ! -z "$output" ]
@@ -524,9 +629,21 @@
local dev=$1; shift
local pref=$1; shift
local dir=$1; shift
+ local selector=${1:-.packets}; shift
tc -j -s filter show dev $dev ${dir:-ingress} pref $pref \
- | jq '.[1].options.actions[].stats.packets'
+ | jq ".[1].options.actions[].stats$selector"
+}
+
+tc_rule_handle_stats_get()
+{
+ local id=$1; shift
+ local handle=$1; shift
+ local selector=${1:-.packets}; shift
+
+ tc -j -s filter show $id \
+ | jq ".[] | select(.options.handle == $handle) | \
+ .options.actions[0].stats$selector"
}
ethtool_stats_get()
@@ -537,6 +654,58 @@
ethtool -S $dev | grep "^ *$stat:" | head -n 1 | cut -d: -f2
}
+qdisc_stats_get()
+{
+ local dev=$1; shift
+ local handle=$1; shift
+ local selector=$1; shift
+
+ tc -j -s qdisc show dev "$dev" \
+ | jq '.[] | select(.handle == "'"$handle"'") | '"$selector"
+}
+
+qdisc_parent_stats_get()
+{
+ local dev=$1; shift
+ local parent=$1; shift
+ local selector=$1; shift
+
+ tc -j -s qdisc show dev "$dev" invisible \
+ | jq '.[] | select(.parent == "'"$parent"'") | '"$selector"
+}
+
+ipv6_stats_get()
+{
+ local dev=$1; shift
+ local stat=$1; shift
+
+ cat /proc/net/dev_snmp6/$dev | grep "^$stat" | cut -f2
+}
+
+humanize()
+{
+ local speed=$1; shift
+
+ for unit in bps Kbps Mbps Gbps; do
+ if (($(echo "$speed < 1024" | bc))); then
+ break
+ fi
+
+ speed=$(echo "scale=1; $speed / 1024" | bc)
+ done
+
+ echo "$speed${unit}"
+}
+
+rate()
+{
+ local t0=$1; shift
+ local t1=$1; shift
+ local interval=$1; shift
+
+ echo $((8 * (t1 - t0) / interval))
+}
+
mac_get()
{
local if_name=$1
@@ -1037,3 +1206,75 @@
flood_unicast_test $br_port $host1_if $host2_if
flood_multicast_test $br_port $host1_if $host2_if
}
+
+__start_traffic()
+{
+ local proto=$1; shift
+ local h_in=$1; shift # Where the traffic egresses the host
+ local sip=$1; shift
+ local dip=$1; shift
+ local dmac=$1; shift
+
+ $MZ $h_in -p 8000 -A $sip -B $dip -c 0 \
+ -a own -b $dmac -t "$proto" -q "$@" &
+ sleep 1
+}
+
+start_traffic()
+{
+ __start_traffic udp "$@"
+}
+
+start_tcp_traffic()
+{
+ __start_traffic tcp "$@"
+}
+
+stop_traffic()
+{
+ # Suppress noise from killing mausezahn.
+ { kill %% && wait %%; } 2>/dev/null
+}
+
+tcpdump_start()
+{
+ local if_name=$1; shift
+ local ns=$1; shift
+
+ capfile=$(mktemp)
+ capout=$(mktemp)
+
+ if [ -z $ns ]; then
+ ns_cmd=""
+ else
+ ns_cmd="ip netns exec ${ns}"
+ fi
+
+ if [ -z $SUDO_USER ] ; then
+ capuser=""
+ else
+ capuser="-Z $SUDO_USER"
+ fi
+
+ $ns_cmd tcpdump -e -n -Q in -i $if_name \
+ -s 65535 -B 32768 $capuser -w $capfile > "$capout" 2>&1 &
+ cappid=$!
+
+ sleep 1
+}
+
+tcpdump_stop()
+{
+ $ns_cmd kill $cappid
+ sleep 1
+}
+
+tcpdump_cleanup()
+{
+ rm $capfile $capout
+}
+
+tcpdump_show()
+{
+ tcpdump -e -n -r $capfile 2>&1
+}
diff --git a/tools/testing/selftests/net/forwarding/mirror_lib.sh b/tools/testing/selftests/net/forwarding/mirror_lib.sh
index 0079759..6406cd7 100644
--- a/tools/testing/selftests/net/forwarding/mirror_lib.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_lib.sh
@@ -20,6 +20,13 @@
tc filter del dev $swp1 $direction pref 1000
}
+is_ipv6()
+{
+ local addr=$1; shift
+
+ [[ -z ${addr//[0-9a-fA-F:]/} ]]
+}
+
mirror_test()
{
local vrf_name=$1; shift
@@ -29,11 +36,17 @@
local pref=$1; shift
local expect=$1; shift
- local ping_timeout=$((PING_TIMEOUT * 5))
+ if is_ipv6 $dip; then
+ local proto=-6
+ local type="icmp6 type=128" # Echo request.
+ else
+ local proto=
+ local type="icmp echoreq"
+ fi
+
local t0=$(tc_rule_stats_get $dev $pref)
- ip vrf exec $vrf_name \
- ${PING} ${sip:+-I $sip} $dip -c 10 -i 0.5 -w $ping_timeout \
- &> /dev/null
+ $MZ $proto $vrf_name ${sip:+-A $sip} -B $dip -a own -b bc -q \
+ -c 10 -d 100msec -t $type
sleep 0.5
local t1=$(tc_rule_stats_get $dev $pref)
local delta=$((t1 - t0))
diff --git a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh
new file mode 100755
index 0000000..64fbd21
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh
@@ -0,0 +1,311 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test sends traffic from H1 to H2. Either on ingress of $swp1, or on
+# egress of $swp2, the traffic is acted upon by a pedit action. An ingress
+# filter installed on $h2 verifies that the packet looks like expected.
+#
+# +----------------------+ +----------------------+
+# | H1 | | H2 |
+# | + $h1 | | $h2 + |
+# | | 192.0.2.1/28 | | 192.0.2.2/28 | |
+# +----|-----------------+ +----------------|-----+
+# | |
+# +----|----------------------------------------------------------------|-----+
+# | SW | | |
+# | +-|----------------------------------------------------------------|-+ |
+# | | + $swp1 BR $swp2 + | |
+# | +--------------------------------------------------------------------+ |
+# +---------------------------------------------------------------------------+
+
+ALL_TESTS="
+ ping_ipv4
+ ping_ipv6
+ test_ip_dsfield
+ test_ip_dscp
+ test_ip_ecn
+ test_ip_dscp_ecn
+ test_ip6_dsfield
+ test_ip6_dscp
+ test_ip6_ecn
+"
+
+NUM_NETIFS=4
+source lib.sh
+source tc_common.sh
+
+: ${HIT_TIMEOUT:=2000} # ms
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28 2001:db8:1::1/64
+}
+
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/28 2001:db8:1::1/64
+}
+
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/28 2001:db8:1::2/64
+ tc qdisc add dev $h2 clsact
+}
+
+h2_destroy()
+{
+ tc qdisc del dev $h2 clsact
+ simple_if_fini $h2 192.0.2.2/28 2001:db8:1::2/64
+}
+
+switch_create()
+{
+ ip link add name br1 up type bridge vlan_filtering 1
+ ip link set dev $swp1 master br1
+ ip link set dev $swp1 up
+ ip link set dev $swp2 master br1
+ ip link set dev $swp2 up
+
+ tc qdisc add dev $swp1 clsact
+ tc qdisc add dev $swp2 clsact
+}
+
+switch_destroy()
+{
+ tc qdisc del dev $swp2 clsact
+ tc qdisc del dev $swp1 clsact
+
+ ip link set dev $swp2 down
+ ip link set dev $swp2 nomaster
+ ip link set dev $swp1 down
+ ip link set dev $swp1 nomaster
+ ip link del dev br1
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ h2mac=$(mac_get $h2)
+
+ vrf_prepare
+ h1_create
+ h2_create
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h2_destroy
+ h1_destroy
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.2
+}
+
+ping_ipv6()
+{
+ ping6_test $h1 2001:db8:1::2
+}
+
+do_test_pedit_dsfield_common()
+{
+ local pedit_locus=$1; shift
+ local pedit_action=$1; shift
+ local mz_flags=$1; shift
+
+ RET=0
+
+ # TOS 125: DSCP 31, ECN 1. Used for testing that the relevant part is
+ # overwritten when zero is selected.
+ $MZ $mz_flags $h1 -c 10 -d 20msec -p 100 \
+ -a own -b $h2mac -q -t tcp tos=0x7d,sp=54321,dp=12345
+
+ local pkts
+ pkts=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= 10" \
+ tc_rule_handle_stats_get "dev $h2 ingress" 101)
+ check_err $? "Expected to get 10 packets on test probe, but got $pkts."
+
+ pkts=$(tc_rule_handle_stats_get "$pedit_locus" 101)
+ ((pkts >= 10))
+ check_err $? "Expected to get 10 packets on pedit rule, but got $pkts."
+
+ log_test "$pedit_locus pedit $pedit_action"
+}
+
+do_test_pedit_dsfield()
+{
+ local pedit_locus=$1; shift
+ local pedit_action=$1; shift
+ local match_prot=$1; shift
+ local match_flower=$1; shift
+ local mz_flags=$1; shift
+ local saddr=$1; shift
+ local daddr=$1; shift
+
+ tc filter add $pedit_locus handle 101 pref 1 \
+ flower action pedit ex munge $pedit_action
+ tc filter add dev $h2 ingress handle 101 pref 1 prot $match_prot \
+ flower skip_hw $match_flower action pass
+
+ do_test_pedit_dsfield_common "$pedit_locus" "$pedit_action" "$mz_flags"
+
+ tc filter del dev $h2 ingress pref 1
+ tc filter del $pedit_locus pref 1
+}
+
+do_test_ip_dsfield()
+{
+ local locus=$1; shift
+ local dsfield
+
+ for dsfield in 0 1 2 3 128 252 253 254 255; do
+ do_test_pedit_dsfield "$locus" \
+ "ip dsfield set $dsfield" \
+ ip "ip_tos $dsfield" \
+ "-A 192.0.2.1 -B 192.0.2.2"
+ done
+}
+
+test_ip_dsfield()
+{
+ do_test_ip_dsfield "dev $swp1 ingress"
+ do_test_ip_dsfield "dev $swp2 egress"
+}
+
+do_test_ip_dscp()
+{
+ local locus=$1; shift
+ local dscp
+
+ for dscp in 0 1 2 3 32 61 62 63; do
+ do_test_pedit_dsfield "$locus" \
+ "ip dsfield set $((dscp << 2)) retain 0xfc" \
+ ip "ip_tos $(((dscp << 2) | 1))" \
+ "-A 192.0.2.1 -B 192.0.2.2"
+ done
+}
+
+test_ip_dscp()
+{
+ do_test_ip_dscp "dev $swp1 ingress"
+ do_test_ip_dscp "dev $swp2 egress"
+}
+
+do_test_ip_ecn()
+{
+ local locus=$1; shift
+ local ecn
+
+ for ecn in 0 1 2 3; do
+ do_test_pedit_dsfield "$locus" \
+ "ip dsfield set $ecn retain 0x03" \
+ ip "ip_tos $((124 | $ecn))" \
+ "-A 192.0.2.1 -B 192.0.2.2"
+ done
+}
+
+test_ip_ecn()
+{
+ do_test_ip_ecn "dev $swp1 ingress"
+ do_test_ip_ecn "dev $swp2 egress"
+}
+
+do_test_ip_dscp_ecn()
+{
+ local locus=$1; shift
+
+ tc filter add $locus handle 101 pref 1 \
+ flower action pedit ex munge ip dsfield set 124 retain 0xfc \
+ action pedit ex munge ip dsfield set 1 retain 0x03
+ tc filter add dev $h2 ingress handle 101 pref 1 prot ip \
+ flower skip_hw ip_tos 125 action pass
+
+ do_test_pedit_dsfield_common "$locus" "set DSCP + set ECN" \
+ "-A 192.0.2.1 -B 192.0.2.2"
+
+ tc filter del dev $h2 ingress pref 1
+ tc filter del $locus pref 1
+}
+
+test_ip_dscp_ecn()
+{
+ do_test_ip_dscp_ecn "dev $swp1 ingress"
+ do_test_ip_dscp_ecn "dev $swp2 egress"
+}
+
+do_test_ip6_dsfield()
+{
+ local locus=$1; shift
+ local dsfield
+
+ for dsfield in 0 1 2 3 128 252 253 254 255; do
+ do_test_pedit_dsfield "$locus" \
+ "ip6 traffic_class set $dsfield" \
+ ipv6 "ip_tos $dsfield" \
+ "-6 -A 2001:db8:1::1 -B 2001:db8:1::2"
+ done
+}
+
+test_ip6_dsfield()
+{
+ do_test_ip6_dsfield "dev $swp1 ingress"
+ do_test_ip6_dsfield "dev $swp2 egress"
+}
+
+do_test_ip6_dscp()
+{
+ local locus=$1; shift
+ local dscp
+
+ for dscp in 0 1 2 3 32 61 62 63; do
+ do_test_pedit_dsfield "$locus" \
+ "ip6 traffic_class set $((dscp << 2)) retain 0xfc" \
+ ipv6 "ip_tos $(((dscp << 2) | 1))" \
+ "-6 -A 2001:db8:1::1 -B 2001:db8:1::2"
+ done
+}
+
+test_ip6_dscp()
+{
+ do_test_ip6_dscp "dev $swp1 ingress"
+ do_test_ip6_dscp "dev $swp2 egress"
+}
+
+do_test_ip6_ecn()
+{
+ local locus=$1; shift
+ local ecn
+
+ for ecn in 0 1 2 3; do
+ do_test_pedit_dsfield "$locus" \
+ "ip6 traffic_class set $ecn retain 0x3" \
+ ipv6 "ip_tos $((124 | $ecn))" \
+ "-6 -A 2001:db8:1::1 -B 2001:db8:1::2"
+ done
+}
+
+test_ip6_ecn()
+{
+ do_test_ip6_ecn "dev $swp1 ingress"
+ do_test_ip6_ecn "dev $swp2 egress"
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/pedit_l4port.sh b/tools/testing/selftests/net/forwarding/pedit_l4port.sh
new file mode 100755
index 0000000..10e594c
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/pedit_l4port.sh
@@ -0,0 +1,200 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test sends traffic from H1 to H2. Either on ingress of $swp1, or on egress of $swp2, the
+# traffic is acted upon by a pedit action. An ingress filter installed on $h2 verifies that the
+# packet looks like expected.
+#
+# +----------------------+ +----------------------+
+# | H1 | | H2 |
+# | + $h1 | | $h2 + |
+# | | 192.0.2.1/28 | | 192.0.2.2/28 | |
+# +----|-----------------+ +----------------|-----+
+# | |
+# +----|----------------------------------------------------------------|-----+
+# | SW | | |
+# | +-|----------------------------------------------------------------|-+ |
+# | | + $swp1 BR $swp2 + | |
+# | +--------------------------------------------------------------------+ |
+# +---------------------------------------------------------------------------+
+
+ALL_TESTS="
+ ping_ipv4
+ test_udp_sport
+ test_udp_dport
+ test_tcp_sport
+ test_tcp_dport
+"
+
+NUM_NETIFS=4
+source lib.sh
+source tc_common.sh
+
+: ${HIT_TIMEOUT:=2000} # ms
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28 2001:db8:1::1/64
+}
+
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/28 2001:db8:1::1/64
+}
+
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/28 2001:db8:1::2/64
+ tc qdisc add dev $h2 clsact
+}
+
+h2_destroy()
+{
+ tc qdisc del dev $h2 clsact
+ simple_if_fini $h2 192.0.2.2/28 2001:db8:1::2/64
+}
+
+switch_create()
+{
+ ip link add name br1 up type bridge vlan_filtering 1
+ ip link set dev $swp1 master br1
+ ip link set dev $swp1 up
+ ip link set dev $swp2 master br1
+ ip link set dev $swp2 up
+
+ tc qdisc add dev $swp1 clsact
+ tc qdisc add dev $swp2 clsact
+}
+
+switch_destroy()
+{
+ tc qdisc del dev $swp2 clsact
+ tc qdisc del dev $swp1 clsact
+
+ ip link set dev $swp2 down
+ ip link set dev $swp2 nomaster
+ ip link set dev $swp1 down
+ ip link set dev $swp1 nomaster
+ ip link del dev br1
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ h2mac=$(mac_get $h2)
+
+ vrf_prepare
+ h1_create
+ h2_create
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h2_destroy
+ h1_destroy
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.2
+}
+
+ping_ipv6()
+{
+ ping6_test $h1 2001:db8:1::2
+}
+
+do_test_pedit_l4port_one()
+{
+ local pedit_locus=$1; shift
+ local pedit_prot=$1; shift
+ local pedit_action=$1; shift
+ local match_prot=$1; shift
+ local match_flower=$1; shift
+ local mz_flags=$1; shift
+ local saddr=$1; shift
+ local daddr=$1; shift
+
+ tc filter add $pedit_locus handle 101 pref 1 \
+ flower action pedit ex munge $pedit_action
+ tc filter add dev $h2 ingress handle 101 pref 1 prot $match_prot \
+ flower skip_hw $match_flower action pass
+
+ RET=0
+
+ $MZ $mz_flags $h1 -c 10 -d 20msec -p 100 \
+ -a own -b $h2mac -q -t $pedit_prot sp=54321,dp=12345
+
+ local pkts
+ pkts=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= 10" \
+ tc_rule_handle_stats_get "dev $h2 ingress" 101)
+ check_err $? "Expected to get 10 packets, but got $pkts."
+
+ pkts=$(tc_rule_handle_stats_get "$pedit_locus" 101)
+ ((pkts >= 10))
+ check_err $? "Expected to get 10 packets on pedit rule, but got $pkts."
+
+ log_test "$pedit_locus pedit $pedit_action"
+
+ tc filter del dev $h2 ingress pref 1
+ tc filter del $pedit_locus pref 1
+}
+
+do_test_pedit_l4port()
+{
+ local locus=$1; shift
+ local prot=$1; shift
+ local pedit_port=$1; shift
+ local flower_port=$1; shift
+ local port
+
+ for port in 1 11111 65535; do
+ do_test_pedit_l4port_one "$locus" "$prot" \
+ "$prot $pedit_port set $port" \
+ ip "ip_proto $prot $flower_port $port" \
+ "-A 192.0.2.1 -B 192.0.2.2"
+ done
+}
+
+test_udp_sport()
+{
+ do_test_pedit_l4port "dev $swp1 ingress" udp sport src_port
+ do_test_pedit_l4port "dev $swp2 egress" udp sport src_port
+}
+
+test_udp_dport()
+{
+ do_test_pedit_l4port "dev $swp1 ingress" udp dport dst_port
+ do_test_pedit_l4port "dev $swp2 egress" udp dport dst_port
+}
+
+test_tcp_sport()
+{
+ do_test_pedit_l4port "dev $swp1 ingress" tcp sport src_port
+ do_test_pedit_l4port "dev $swp2 egress" tcp sport src_port
+}
+
+test_tcp_dport()
+{
+ do_test_pedit_l4port "dev $swp1 ingress" tcp dport dst_port
+ do_test_pedit_l4port "dev $swp2 egress" tcp dport dst_port
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/router.sh b/tools/testing/selftests/net/forwarding/router.sh
index a75cb51..057f91b 100755
--- a/tools/testing/selftests/net/forwarding/router.sh
+++ b/tools/testing/selftests/net/forwarding/router.sh
@@ -1,9 +1,23 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
-ALL_TESTS="ping_ipv4 ping_ipv6"
+ALL_TESTS="
+ ping_ipv4
+ ping_ipv6
+ sip_in_class_e
+ mc_mac_mismatch
+ ipv4_sip_equal_dip
+ ipv6_sip_equal_dip
+ ipv4_dip_link_local
+"
+
NUM_NETIFS=4
source lib.sh
+source tc_common.sh
+
+require_command $MCD
+require_command $MC_CLI
+table_name=selftests
h1_create()
{
@@ -64,6 +78,8 @@
ip link set dev $rp1 up
ip link set dev $rp2 up
+ tc qdisc add dev $rp2 clsact
+
ip address add 192.0.2.1/24 dev $rp1
ip address add 2001:db8:1::1/64 dev $rp1
@@ -79,10 +95,31 @@
ip address del 2001:db8:1::1/64 dev $rp1
ip address del 192.0.2.1/24 dev $rp1
+ tc qdisc del dev $rp2 clsact
+
ip link set dev $rp2 down
ip link set dev $rp1 down
}
+start_mcd()
+{
+ SMCROUTEDIR="$(mktemp -d)"
+
+ for ((i = 1; i <= $NUM_NETIFS; ++i)); do
+ echo "phyint ${NETIFS[p$i]} enable" >> \
+ $SMCROUTEDIR/$table_name.conf
+ done
+
+ $MCD -N -I $table_name -f $SMCROUTEDIR/$table_name.conf \
+ -P $SMCROUTEDIR/$table_name.pid
+}
+
+kill_mcd()
+{
+ pkill $MCD
+ rm -rf $SMCROUTEDIR
+}
+
setup_prepare()
{
h1=${NETIFS[p1]}
@@ -91,6 +128,10 @@
rp2=${NETIFS[p3]}
h2=${NETIFS[p4]}
+ rp1mac=$(mac_get $rp1)
+
+ start_mcd
+
vrf_prepare
h1_create
@@ -113,6 +154,8 @@
h1_destroy
vrf_cleanup
+
+ kill_mcd
}
ping_ipv4()
@@ -125,6 +168,150 @@
ping6_test $h1 2001:db8:2::2
}
+sip_in_class_e()
+{
+ RET=0
+
+ # Disable rpfilter to prevent packets to be dropped because of it.
+ sysctl_set net.ipv4.conf.all.rp_filter 0
+ sysctl_set net.ipv4.conf.$rp1.rp_filter 0
+
+ tc filter add dev $rp2 egress protocol ip pref 1 handle 101 \
+ flower src_ip 240.0.0.1 ip_proto udp action pass
+
+ $MZ $h1 -t udp "sp=54321,dp=12345" -c 5 -d 1msec \
+ -A 240.0.0.1 -b $rp1mac -B 198.51.100.2 -q
+
+ tc_check_packets "dev $rp2 egress" 101 5
+ check_err $? "Packets were dropped"
+
+ log_test "Source IP in class E"
+
+ tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower
+ sysctl_restore net.ipv4.conf.$rp1.rp_filter
+ sysctl_restore net.ipv4.conf.all.rp_filter
+}
+
+create_mcast_sg()
+{
+ local if_name=$1; shift
+ local s_addr=$1; shift
+ local mcast=$1; shift
+ local dest_ifs=${@}
+
+ $MC_CLI -I $table_name add $if_name $s_addr $mcast $dest_ifs
+}
+
+delete_mcast_sg()
+{
+ local if_name=$1; shift
+ local s_addr=$1; shift
+ local mcast=$1; shift
+ local dest_ifs=${@}
+
+ $MC_CLI -I $table_name remove $if_name $s_addr $mcast $dest_ifs
+}
+
+__mc_mac_mismatch()
+{
+ local desc=$1; shift
+ local proto=$1; shift
+ local sip=$1; shift
+ local dip=$1; shift
+ local flags=${1:-""}; shift
+ local dmac=01:02:03:04:05:06
+
+ RET=0
+
+ tc filter add dev $rp2 egress protocol $proto pref 1 handle 101 \
+ flower dst_ip $dip action pass
+
+ create_mcast_sg $rp1 $sip $dip $rp2
+
+ $MZ $flags $h1 -t udp "sp=54321,dp=12345" -c 5 -d 1msec -b $dmac \
+ -B $dip -q
+
+ tc_check_packets "dev $rp2 egress" 101 5
+ check_err $? "Packets were dropped"
+
+ log_test "Multicast MAC mismatch: $desc"
+
+ delete_mcast_sg $rp1 $sip $dip $rp2
+ tc filter del dev $rp2 egress protocol $proto pref 1 handle 101 flower
+}
+
+mc_mac_mismatch()
+{
+ __mc_mac_mismatch "IPv4" "ip" 192.0.2.2 225.1.2.3
+ __mc_mac_mismatch "IPv6" "ipv6" 2001:db8:1::2 ff0e::3 "-6"
+}
+
+ipv4_sip_equal_dip()
+{
+ RET=0
+
+ # Disable rpfilter to prevent packets to be dropped because of it.
+ sysctl_set net.ipv4.conf.all.rp_filter 0
+ sysctl_set net.ipv4.conf.$rp1.rp_filter 0
+
+ tc filter add dev $rp2 egress protocol ip pref 1 handle 101 \
+ flower src_ip 198.51.100.2 action pass
+
+ $MZ $h1 -t udp "sp=54321,dp=12345" -c 5 -d 1msec \
+ -A 198.51.100.2 -b $rp1mac -B 198.51.100.2 -q
+
+ tc_check_packets "dev $rp2 egress" 101 5
+ check_err $? "Packets were dropped"
+
+ log_test "Source IP is equal to destination IP: IPv4"
+
+ tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower
+ sysctl_restore net.ipv4.conf.$rp1.rp_filter
+ sysctl_restore net.ipv4.conf.all.rp_filter
+}
+
+ipv6_sip_equal_dip()
+{
+ RET=0
+
+ tc filter add dev $rp2 egress protocol ipv6 pref 1 handle 101 \
+ flower src_ip 2001:db8:2::2 action pass
+
+ $MZ -6 $h1 -t udp "sp=54321,dp=12345" -c 5 -d 1msec \
+ -A 2001:db8:2::2 -b $rp1mac -B 2001:db8:2::2 -q
+
+ tc_check_packets "dev $rp2 egress" 101 5
+ check_err $? "Packets were dropped"
+
+ log_test "Source IP is equal to destination IP: IPv6"
+
+ tc filter del dev $rp2 egress protocol ipv6 pref 1 handle 101 flower
+}
+
+ipv4_dip_link_local()
+{
+ local dip=169.254.1.1
+
+ RET=0
+
+ tc filter add dev $rp2 egress protocol ip pref 1 handle 101 \
+ flower dst_ip $dip action pass
+
+ ip neigh add 169.254.1.1 lladdr 00:11:22:33:44:55 dev $rp2
+ ip route add 169.254.1.0/24 dev $rp2
+
+ $MZ $h1 -t udp "sp=54321,dp=12345" -c 5 -d 1msec -b $rp1mac -B $dip -q
+
+ tc_check_packets "dev $rp2 egress" 101 5
+ check_err $? "Packets were dropped"
+
+ log_test "IPv4 destination IP is link-local"
+
+ ip route del 169.254.1.0/24 dev $rp2
+ ip neigh del 169.254.1.1 lladdr 00:11:22:33:44:55 dev $rp2
+ tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower
+}
+
trap cleanup EXIT
setup_prepare
diff --git a/tools/testing/selftests/net/forwarding/sch_ets.sh b/tools/testing/selftests/net/forwarding/sch_ets.sh
new file mode 100755
index 0000000..e60c8b4
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_ets.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# A driver for the ETS selftest that implements testing in slowpath.
+lib_dir=.
+source sch_ets_core.sh
+
+ALL_TESTS="
+ ping_ipv4
+ priomap_mode
+ ets_test_strict
+ ets_test_mixed
+ ets_test_dwrr
+ classifier_mode
+ ets_test_strict
+ ets_test_mixed
+ ets_test_dwrr
+"
+
+switch_create()
+{
+ ets_switch_create
+
+ # Create a bottleneck so that the DWRR process can kick in.
+ tc qdisc add dev $swp2 root handle 1: tbf \
+ rate 1Gbit burst 1Mbit latency 100ms
+ PARENT="parent 1:"
+}
+
+switch_destroy()
+{
+ ets_switch_destroy
+ tc qdisc del dev $swp2 root
+}
+
+# Callback from sch_ets_tests.sh
+collect_stats()
+{
+ local -a streams=("$@")
+ local stream
+
+ for stream in ${streams[@]}; do
+ qdisc_parent_stats_get $swp2 10:$((stream + 1)) .bytes
+ done
+}
+
+ets_run
diff --git a/tools/testing/selftests/net/forwarding/sch_ets_core.sh b/tools/testing/selftests/net/forwarding/sch_ets_core.sh
new file mode 100644
index 0000000..f906fcc
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_ets_core.sh
@@ -0,0 +1,300 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# This is a template for ETS Qdisc test.
+#
+# This test sends from H1 several traffic streams with 802.1p-tagged packets.
+# The tags are used at $swp1 to prioritize the traffic. Each stream is then
+# queued at a different ETS band according to the assigned priority. After
+# runnig for a while, counters at H2 are consulted to determine whether the
+# traffic scheduling was according to the ETS configuration.
+#
+# This template is supposed to be embedded by a test driver, which implements
+# statistics collection, any HW-specific stuff, and prominently configures the
+# system to assure that there is overcommitment at $swp2. That is necessary so
+# that the ETS traffic selection algorithm kicks in and has to schedule some
+# traffic at the expense of other.
+#
+# A driver for veth-based testing is in sch_ets.sh, an example of a driver for
+# an offloaded data path is in selftests/drivers/net/mlxsw/sch_ets.sh.
+#
+# +---------------------------------------------------------------------+
+# | H1 |
+# | + $h1.10 + $h1.11 + $h1.12 |
+# | | 192.0.2.1/28 | 192.0.2.17/28 | 192.0.2.33/28 |
+# | | egress-qos-map | egress-qos-map | egress-qos-map |
+# | | 0:0 | 0:1 | 0:2 |
+# | \____________________ | ____________________/ |
+# | \|/ |
+# | + $h1 |
+# +---------------------------|-----------------------------------------+
+# |
+# +---------------------------|-----------------------------------------+
+# | SW + $swp1 |
+# | | >1Gbps |
+# | ____________________/|\____________________ |
+# | / | \ |
+# | +--|----------------+ +--|----------------+ +--|----------------+ |
+# | | + $swp1.10 | | + $swp1.11 | | + $swp1.12 | |
+# | | ingress-qos-map| | ingress-qos-map| | ingress-qos-map| |
+# | | 0:0 1:1 2:2 | | 0:0 1:1 2:2 | | 0:0 1:1 2:2 | |
+# | | | | | | | |
+# | | BR10 | | BR11 | | BR12 | |
+# | | | | | | | |
+# | | + $swp2.10 | | + $swp2.11 | | + $swp2.12 | |
+# | +--|----------------+ +--|----------------+ +--|----------------+ |
+# | \____________________ | ____________________/ |
+# | \|/ |
+# | + $swp2 |
+# | | 1Gbps (ethtool or HTB qdisc) |
+# | | qdisc ets quanta $W0 $W1 $W2 |
+# | | priomap 0 1 2 |
+# +---------------------------|-----------------------------------------+
+# |
+# +---------------------------|-----------------------------------------+
+# | H2 + $h2 |
+# | ____________________/|\____________________ |
+# | / | \ |
+# | + $h2.10 + $h2.11 + $h2.12 |
+# | 192.0.2.2/28 192.0.2.18/28 192.0.2.34/28 |
+# +---------------------------------------------------------------------+
+
+NUM_NETIFS=4
+CHECK_TC=yes
+source $lib_dir/lib.sh
+source $lib_dir/sch_ets_tests.sh
+
+PARENT=root
+QDISC_DEV=
+
+sip()
+{
+ echo 192.0.2.$((16 * $1 + 1))
+}
+
+dip()
+{
+ echo 192.0.2.$((16 * $1 + 2))
+}
+
+# Callback from sch_ets_tests.sh
+ets_start_traffic()
+{
+ local dst_mac=$(mac_get $h2)
+ local i=$1; shift
+
+ start_traffic $h1.1$i $(sip $i) $(dip $i) $dst_mac
+}
+
+ETS_CHANGE_QDISC=
+
+priomap_mode()
+{
+ echo "Running in priomap mode"
+ ets_delete_qdisc
+ ETS_CHANGE_QDISC=ets_change_qdisc_priomap
+}
+
+classifier_mode()
+{
+ echo "Running in classifier mode"
+ ets_delete_qdisc
+ ETS_CHANGE_QDISC=ets_change_qdisc_classifier
+}
+
+ets_change_qdisc_priomap()
+{
+ local dev=$1; shift
+ local nstrict=$1; shift
+ local priomap=$1; shift
+ local quanta=("${@}")
+
+ local op=$(if [[ -n $QDISC_DEV ]]; then echo change; else echo add; fi)
+
+ tc qdisc $op dev $dev $PARENT handle 10: ets \
+ $(if ((nstrict)); then echo strict $nstrict; fi) \
+ $(if ((${#quanta[@]})); then echo quanta ${quanta[@]}; fi) \
+ priomap $priomap
+ QDISC_DEV=$dev
+}
+
+ets_change_qdisc_classifier()
+{
+ local dev=$1; shift
+ local nstrict=$1; shift
+ local priomap=$1; shift
+ local quanta=("${@}")
+
+ local op=$(if [[ -n $QDISC_DEV ]]; then echo change; else echo add; fi)
+
+ tc qdisc $op dev $dev $PARENT handle 10: ets \
+ $(if ((nstrict)); then echo strict $nstrict; fi) \
+ $(if ((${#quanta[@]})); then echo quanta ${quanta[@]}; fi)
+
+ if [[ $op == add ]]; then
+ local prio=0
+ local band
+
+ for band in $priomap; do
+ tc filter add dev $dev parent 10: basic \
+ match "meta(priority eq $prio)" \
+ flowid 10:$((band + 1))
+ ((prio++))
+ done
+ fi
+ QDISC_DEV=$dev
+}
+
+# Callback from sch_ets_tests.sh
+ets_change_qdisc()
+{
+ if [[ -z "$ETS_CHANGE_QDISC" ]]; then
+ exit 1
+ fi
+ $ETS_CHANGE_QDISC "$@"
+}
+
+ets_delete_qdisc()
+{
+ if [[ -n $QDISC_DEV ]]; then
+ tc qdisc del dev $QDISC_DEV $PARENT
+ QDISC_DEV=
+ fi
+}
+
+h1_create()
+{
+ local i;
+
+ simple_if_init $h1
+ mtu_set $h1 9900
+ for i in {0..2}; do
+ vlan_create $h1 1$i v$h1 $(sip $i)/28
+ ip link set dev $h1.1$i type vlan egress 0:$i
+ done
+}
+
+h1_destroy()
+{
+ local i
+
+ for i in {0..2}; do
+ vlan_destroy $h1 1$i
+ done
+ mtu_restore $h1
+ simple_if_fini $h1
+}
+
+h2_create()
+{
+ local i
+
+ simple_if_init $h2
+ mtu_set $h2 9900
+ for i in {0..2}; do
+ vlan_create $h2 1$i v$h2 $(dip $i)/28
+ done
+}
+
+h2_destroy()
+{
+ local i
+
+ for i in {0..2}; do
+ vlan_destroy $h2 1$i
+ done
+ mtu_restore $h2
+ simple_if_fini $h2
+}
+
+ets_switch_create()
+{
+ local i
+
+ ip link set dev $swp1 up
+ mtu_set $swp1 9900
+
+ ip link set dev $swp2 up
+ mtu_set $swp2 9900
+
+ for i in {0..2}; do
+ vlan_create $swp1 1$i
+ ip link set dev $swp1.1$i type vlan ingress 0:0 1:1 2:2
+
+ vlan_create $swp2 1$i
+
+ ip link add dev br1$i type bridge
+ ip link set dev $swp1.1$i master br1$i
+ ip link set dev $swp2.1$i master br1$i
+
+ ip link set dev br1$i up
+ ip link set dev $swp1.1$i up
+ ip link set dev $swp2.1$i up
+ done
+}
+
+ets_switch_destroy()
+{
+ local i
+
+ ets_delete_qdisc
+
+ for i in {0..2}; do
+ ip link del dev br1$i
+ vlan_destroy $swp2 1$i
+ vlan_destroy $swp1 1$i
+ done
+
+ mtu_restore $swp2
+ ip link set dev $swp2 down
+
+ mtu_restore $swp1
+ ip link set dev $swp1 down
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ put=$swp2
+ hut=$h2
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1.10 $(dip 0) " vlan 10"
+ ping_test $h1.11 $(dip 1) " vlan 11"
+ ping_test $h1.12 $(dip 2) " vlan 12"
+}
+
+ets_run()
+{
+ trap cleanup EXIT
+
+ setup_prepare
+ setup_wait
+
+ tests_run
+
+ exit $EXIT_STATUS
+}
diff --git a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh
new file mode 100644
index 0000000..cdf689e
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh
@@ -0,0 +1,223 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Global interface:
+# $put -- port under test (e.g. $swp2)
+# collect_stats($streams...) -- A function to get stats for individual streams
+# ets_start_traffic($band) -- Start traffic for this band
+# ets_change_qdisc($op, $dev, $nstrict, $quanta...) -- Add or change qdisc
+
+# WS describes the Qdisc configuration. It has one value per band (so the
+# number of array elements indicates the number of bands). If the value is
+# 0, it is a strict band, otherwise the it's a DRR band and the value is
+# that band's quantum.
+declare -a WS
+
+qdisc_describe()
+{
+ local nbands=${#WS[@]}
+ local nstrict=0
+ local i
+
+ for ((i = 0; i < nbands; i++)); do
+ if ((!${WS[$i]})); then
+ : $((nstrict++))
+ fi
+ done
+
+ echo -n "ets bands $nbands"
+ if ((nstrict)); then
+ echo -n " strict $nstrict"
+ fi
+ if ((nstrict < nbands)); then
+ echo -n " quanta"
+ for ((i = nstrict; i < nbands; i++)); do
+ echo -n " ${WS[$i]}"
+ done
+ fi
+}
+
+__strict_eval()
+{
+ local desc=$1; shift
+ local d=$1; shift
+ local total=$1; shift
+ local above=$1; shift
+
+ RET=0
+
+ if ((! total)); then
+ check_err 1 "No traffic observed"
+ log_test "$desc"
+ return
+ fi
+
+ local ratio=$(echo "scale=2; 100 * $d / $total" | bc -l)
+ if ((above)); then
+ test $(echo "$ratio > 95.0" | bc -l) -eq 1
+ check_err $? "Not enough traffic"
+ log_test "$desc"
+ log_info "Expected ratio >95% Measured ratio $ratio"
+ else
+ test $(echo "$ratio < 5" | bc -l) -eq 1
+ check_err $? "Too much traffic"
+ log_test "$desc"
+ log_info "Expected ratio <5% Measured ratio $ratio"
+ fi
+}
+
+strict_eval()
+{
+ __strict_eval "$@" 1
+}
+
+notraf_eval()
+{
+ __strict_eval "$@" 0
+}
+
+__ets_dwrr_test()
+{
+ local -a streams=("$@")
+
+ local low_stream=${streams[0]}
+ local seen_strict=0
+ local -a t0 t1 d
+ local stream
+ local total
+ local i
+
+ echo "Testing $(qdisc_describe), streams ${streams[@]}"
+
+ for stream in ${streams[@]}; do
+ ets_start_traffic $stream
+ done
+
+ sleep 10
+
+ t0=($(collect_stats "${streams[@]}"))
+
+ sleep 10
+
+ t1=($(collect_stats "${streams[@]}"))
+ d=($(for ((i = 0; i < ${#streams[@]}; i++)); do
+ echo $((${t1[$i]} - ${t0[$i]}))
+ done))
+ total=$(echo ${d[@]} | sed 's/ /+/g' | bc)
+
+ for ((i = 0; i < ${#streams[@]}; i++)); do
+ local stream=${streams[$i]}
+ if ((seen_strict)); then
+ notraf_eval "band $stream" ${d[$i]} $total
+ elif ((${WS[$stream]} == 0)); then
+ strict_eval "band $stream" ${d[$i]} $total
+ seen_strict=1
+ elif ((stream == low_stream)); then
+ # Low stream is used as DWRR evaluation reference.
+ continue
+ else
+ multipath_eval "bands $low_stream:$stream" \
+ ${WS[$low_stream]} ${WS[$stream]} \
+ ${d[0]} ${d[$i]}
+ fi
+ done
+
+ for stream in ${streams[@]}; do
+ stop_traffic
+ done
+}
+
+ets_dwrr_test_012()
+{
+ __ets_dwrr_test 0 1 2
+}
+
+ets_dwrr_test_01()
+{
+ __ets_dwrr_test 0 1
+}
+
+ets_dwrr_test_12()
+{
+ __ets_dwrr_test 1 2
+}
+
+ets_qdisc_setup()
+{
+ local dev=$1; shift
+ local nstrict=$1; shift
+ local -a quanta=("$@")
+
+ local ndwrr=${#quanta[@]}
+ local nbands=$((nstrict + ndwrr))
+ local nstreams=$(if ((nbands > 3)); then echo 3; else echo $nbands; fi)
+ local priomap=$(seq 0 $((nstreams - 1)))
+ local i
+
+ WS=($(
+ for ((i = 0; i < nstrict; i++)); do
+ echo 0
+ done
+ for ((i = 0; i < ndwrr; i++)); do
+ echo ${quanta[$i]}
+ done
+ ))
+
+ ets_change_qdisc $dev $nstrict "$priomap" ${quanta[@]}
+}
+
+ets_set_dwrr_uniform()
+{
+ ets_qdisc_setup $put 0 3300 3300 3300
+}
+
+ets_set_dwrr_varying()
+{
+ ets_qdisc_setup $put 0 5000 3500 1500
+}
+
+ets_set_strict()
+{
+ ets_qdisc_setup $put 3
+}
+
+ets_set_mixed()
+{
+ ets_qdisc_setup $put 1 5000 2500 1500
+}
+
+ets_change_quantum()
+{
+ tc class change dev $put classid 10:2 ets quantum 8000
+ WS[1]=8000
+}
+
+ets_set_dwrr_two_bands()
+{
+ ets_qdisc_setup $put 0 5000 2500
+}
+
+ets_test_strict()
+{
+ ets_set_strict
+ ets_dwrr_test_01
+ ets_dwrr_test_12
+}
+
+ets_test_mixed()
+{
+ ets_set_mixed
+ ets_dwrr_test_01
+ ets_dwrr_test_12
+}
+
+ets_test_dwrr()
+{
+ ets_set_dwrr_uniform
+ ets_dwrr_test_012
+ ets_set_dwrr_varying
+ ets_dwrr_test_012
+ ets_change_quantum
+ ets_dwrr_test_012
+ ets_set_dwrr_two_bands
+ ets_dwrr_test_01
+}
diff --git a/tools/testing/selftests/net/forwarding/sch_red.sh b/tools/testing/selftests/net/forwarding/sch_red.sh
new file mode 100755
index 0000000..e714bae
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_red.sh
@@ -0,0 +1,492 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# This test sends one stream of traffic from H1 through a TBF shaper, to a RED
+# within TBF shaper on $swp3. The two shapers have the same configuration, and
+# thus the resulting stream should fill all available bandwidth on the latter
+# shaper. A second stream is sent from H2 also via $swp3, and used to inject
+# additional traffic. Since all available bandwidth is taken, this traffic has
+# to go to backlog.
+#
+# +--------------------------+ +--------------------------+
+# | H1 | | H2 |
+# | + $h1 | | + $h2 |
+# | | 192.0.2.1/28 | | | 192.0.2.2/28 |
+# | | TBF 10Mbps | | | |
+# +-----|--------------------+ +-----|--------------------+
+# | |
+# +-----|------------------------------------------------|--------------------+
+# | SW | | |
+# | +--|------------------------------------------------|----------------+ |
+# | | + $swp1 + $swp2 | |
+# | | BR | |
+# | | | |
+# | | + $swp3 | |
+# | | | TBF 10Mbps / RED | |
+# | +--------------------------------|-----------------------------------+ |
+# | | |
+# +-----------------------------------|---------------------------------------+
+# |
+# +-----|--------------------+
+# | H3 | |
+# | + $h1 |
+# | 192.0.2.3/28 |
+# | |
+# +--------------------------+
+
+ALL_TESTS="
+ ping_ipv4
+ ecn_test
+ ecn_nodrop_test
+ red_test
+ red_qevent_test
+ ecn_qevent_test
+"
+
+NUM_NETIFS=6
+CHECK_TC="yes"
+source lib.sh
+
+BACKLOG=30000
+PKTSZ=1400
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28
+ mtu_set $h1 10000
+ tc qdisc replace dev $h1 root handle 1: tbf \
+ rate 10Mbit burst 10K limit 1M
+}
+
+h1_destroy()
+{
+ tc qdisc del dev $h1 root
+ mtu_restore $h1
+ simple_if_fini $h1 192.0.2.1/28
+}
+
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/28
+ mtu_set $h2 10000
+}
+
+h2_destroy()
+{
+ mtu_restore $h2
+ simple_if_fini $h2 192.0.2.2/28
+}
+
+h3_create()
+{
+ simple_if_init $h3 192.0.2.3/28
+ mtu_set $h3 10000
+}
+
+h3_destroy()
+{
+ mtu_restore $h3
+ simple_if_fini $h3 192.0.2.3/28
+}
+
+switch_create()
+{
+ ip link add dev br up type bridge
+ ip link set dev $swp1 up master br
+ ip link set dev $swp2 up master br
+ ip link set dev $swp3 up master br
+
+ mtu_set $swp1 10000
+ mtu_set $swp2 10000
+ mtu_set $swp3 10000
+
+ tc qdisc replace dev $swp3 root handle 1: tbf \
+ rate 10Mbit burst 10K limit 1M
+ ip link add name _drop_test up type dummy
+}
+
+switch_destroy()
+{
+ ip link del dev _drop_test
+ tc qdisc del dev $swp3 root
+
+ mtu_restore $h3
+ mtu_restore $h2
+ mtu_restore $h1
+
+ ip link set dev $swp3 down nomaster
+ ip link set dev $swp2 down nomaster
+ ip link set dev $swp1 down nomaster
+ ip link del dev br
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ h2=${NETIFS[p3]}
+ swp2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ h3_mac=$(mac_get $h3)
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ h3_create
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h3_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.3 " from host 1"
+ ping_test $h2 192.0.2.3 " from host 2"
+}
+
+get_qdisc_backlog()
+{
+ qdisc_stats_get $swp3 11: .backlog
+}
+
+get_nmarked()
+{
+ qdisc_stats_get $swp3 11: .marked
+}
+
+get_qdisc_npackets()
+{
+ qdisc_stats_get $swp3 11: .packets
+}
+
+get_nmirrored()
+{
+ link_stats_get _drop_test tx packets
+}
+
+send_packets()
+{
+ local proto=$1; shift
+ local pkts=$1; shift
+
+ $MZ $h2 -p $PKTSZ -a own -b $h3_mac -A 192.0.2.2 -B 192.0.2.3 -t $proto -q -c $pkts "$@"
+}
+
+# This sends traffic in an attempt to build a backlog of $size. Returns 0 on
+# success. After 10 failed attempts it bails out and returns 1. It dumps the
+# backlog size to stdout.
+build_backlog()
+{
+ local size=$1; shift
+ local proto=$1; shift
+
+ local i=0
+
+ while :; do
+ local cur=$(get_qdisc_backlog)
+ local diff=$((size - cur))
+ local pkts=$(((diff + PKTSZ - 1) / PKTSZ))
+
+ if ((cur >= size)); then
+ echo $cur
+ return 0
+ elif ((i++ > 10)); then
+ echo $cur
+ return 1
+ fi
+
+ send_packets $proto $pkts "$@"
+ sleep 1
+ done
+}
+
+check_marking()
+{
+ local cond=$1; shift
+
+ local npackets_0=$(get_qdisc_npackets)
+ local nmarked_0=$(get_nmarked)
+ sleep 5
+ local npackets_1=$(get_qdisc_npackets)
+ local nmarked_1=$(get_nmarked)
+
+ local nmarked_d=$((nmarked_1 - nmarked_0))
+ local npackets_d=$((npackets_1 - npackets_0))
+ local pct=$((100 * nmarked_d / npackets_d))
+
+ echo $pct
+ ((pct $cond))
+}
+
+check_mirroring()
+{
+ local cond=$1; shift
+
+ local npackets_0=$(get_qdisc_npackets)
+ local nmirrored_0=$(get_nmirrored)
+ sleep 5
+ local npackets_1=$(get_qdisc_npackets)
+ local nmirrored_1=$(get_nmirrored)
+
+ local nmirrored_d=$((nmirrored_1 - nmirrored_0))
+ local npackets_d=$((npackets_1 - npackets_0))
+ local pct=$((100 * nmirrored_d / npackets_d))
+
+ echo $pct
+ ((pct $cond))
+}
+
+ecn_test_common()
+{
+ local name=$1; shift
+ local limit=$1; shift
+ local backlog
+ local pct
+
+ # Build the below-the-limit backlog using UDP. We could use TCP just
+ # fine, but this way we get a proof that UDP is accepted when queue
+ # length is below the limit. The main stream is using TCP, and if the
+ # limit is misconfigured, we would see this traffic being ECN marked.
+ RET=0
+ backlog=$(build_backlog $((2 * limit / 3)) udp)
+ check_err $? "Could not build the requested backlog"
+ pct=$(check_marking "== 0")
+ check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0."
+ log_test "$name backlog < limit"
+
+ # Now push TCP, because non-TCP traffic would be early-dropped after the
+ # backlog crosses the limit, and we want to make sure that the backlog
+ # is above the limit.
+ RET=0
+ backlog=$(build_backlog $((3 * limit / 2)) tcp tos=0x01)
+ check_err $? "Could not build the requested backlog"
+ pct=$(check_marking ">= 95")
+ check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected >= 95."
+ log_test "$name backlog > limit"
+}
+
+do_ecn_test()
+{
+ local limit=$1; shift
+ local name=ECN
+
+ $MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
+ -a own -b $h3_mac -t tcp -q tos=0x01 &
+ sleep 1
+
+ ecn_test_common "$name" $limit
+
+ # Up there we saw that UDP gets accepted when backlog is below the
+ # limit. Now that it is above, it should all get dropped, and backlog
+ # building should fail.
+ RET=0
+ build_backlog $((2 * limit)) udp >/dev/null
+ check_fail $? "UDP traffic went into backlog instead of being early-dropped"
+ log_test "$name backlog > limit: UDP early-dropped"
+
+ stop_traffic
+ sleep 1
+}
+
+do_ecn_nodrop_test()
+{
+ local limit=$1; shift
+ local name="ECN nodrop"
+
+ $MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
+ -a own -b $h3_mac -t tcp -q tos=0x01 &
+ sleep 1
+
+ ecn_test_common "$name" $limit
+
+ # Up there we saw that UDP gets accepted when backlog is below the
+ # limit. Now that it is above, in nodrop mode, make sure it goes to
+ # backlog as well.
+ RET=0
+ build_backlog $((2 * limit)) udp >/dev/null
+ check_err $? "UDP traffic was early-dropped instead of getting into backlog"
+ log_test "$name backlog > limit: UDP not dropped"
+
+ stop_traffic
+ sleep 1
+}
+
+do_red_test()
+{
+ local limit=$1; shift
+ local backlog
+ local pct
+
+ # Use ECN-capable TCP to verify there's no marking even though the queue
+ # is above limit.
+ $MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
+ -a own -b $h3_mac -t tcp -q tos=0x01 &
+
+ # Pushing below the queue limit should work.
+ RET=0
+ backlog=$(build_backlog $((2 * limit / 3)) tcp tos=0x01)
+ check_err $? "Could not build the requested backlog"
+ pct=$(check_marking "== 0")
+ check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0."
+ log_test "RED backlog < limit"
+
+ # Pushing above should not.
+ RET=0
+ backlog=$(build_backlog $((3 * limit / 2)) tcp tos=0x01)
+ check_fail $? "Traffic went into backlog instead of being early-dropped"
+ pct=$(check_marking "== 0")
+ check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0."
+ log_test "RED backlog > limit"
+
+ stop_traffic
+ sleep 1
+}
+
+do_red_qevent_test()
+{
+ local limit=$1; shift
+ local backlog
+ local base
+ local now
+ local pct
+
+ RET=0
+
+ $MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
+ -a own -b $h3_mac -t udp -q &
+ sleep 1
+
+ tc filter add block 10 pref 1234 handle 102 matchall skip_hw \
+ action mirred egress mirror dev _drop_test
+
+ # Push to the queue until it's at the limit. The configured limit is
+ # rounded by the qdisc, so this is the best we can do to get to the real
+ # limit.
+ build_backlog $((3 * limit / 2)) udp >/dev/null
+
+ base=$(get_nmirrored)
+ send_packets udp 100
+ sleep 1
+ now=$(get_nmirrored)
+ ((now >= base + 100))
+ check_err $? "Dropped packets not observed: 100 expected, $((now - base)) seen"
+
+ tc filter del block 10 pref 1234 handle 102 matchall
+
+ base=$(get_nmirrored)
+ send_packets udp 100
+ sleep 1
+ now=$(get_nmirrored)
+ ((now == base))
+ check_err $? "Dropped packets still observed: 0 expected, $((now - base)) seen"
+
+ log_test "RED early_dropped packets mirrored"
+
+ stop_traffic
+ sleep 1
+}
+
+do_ecn_qevent_test()
+{
+ local limit=$1; shift
+ local name=ECN
+
+ RET=0
+
+ $MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
+ -a own -b $h3_mac -t tcp -q tos=0x01 &
+ sleep 1
+
+ tc filter add block 10 pref 1234 handle 102 matchall skip_hw \
+ action mirred egress mirror dev _drop_test
+
+ backlog=$(build_backlog $((2 * limit / 3)) tcp tos=0x01)
+ check_err $? "Could not build the requested backlog"
+ pct=$(check_mirroring "== 0")
+ check_err $? "backlog $backlog / $limit Got $pct% mirrored packets, expected == 0."
+
+ backlog=$(build_backlog $((3 * limit / 2)) tcp tos=0x01)
+ check_err $? "Could not build the requested backlog"
+ pct=$(check_mirroring ">= 95")
+ check_err $? "backlog $backlog / $limit Got $pct% mirrored packets, expected >= 95."
+
+ tc filter del block 10 pref 1234 handle 102 matchall
+
+ log_test "ECN marked packets mirrored"
+
+ stop_traffic
+ sleep 1
+}
+
+install_qdisc()
+{
+ local -a args=("$@")
+
+ tc qdisc replace dev $swp3 parent 1:1 handle 11: red \
+ limit 1M avpkt $PKTSZ probability 1 \
+ min $BACKLOG max $((BACKLOG + 1)) burst 38 "${args[@]}"
+ sleep 1
+}
+
+uninstall_qdisc()
+{
+ tc qdisc del dev $swp3 parent 1:1
+}
+
+ecn_test()
+{
+ install_qdisc ecn
+ do_ecn_test $BACKLOG
+ uninstall_qdisc
+}
+
+ecn_nodrop_test()
+{
+ install_qdisc ecn nodrop
+ do_ecn_nodrop_test $BACKLOG
+ uninstall_qdisc
+}
+
+red_test()
+{
+ install_qdisc
+ do_red_test $BACKLOG
+ uninstall_qdisc
+}
+
+red_qevent_test()
+{
+ install_qdisc qevent early_drop block 10
+ do_red_qevent_test $BACKLOG
+ uninstall_qdisc
+}
+
+ecn_qevent_test()
+{
+ install_qdisc ecn qevent mark block 10
+ do_ecn_qevent_test $BACKLOG
+ uninstall_qdisc
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh
new file mode 100644
index 0000000..d1f26cb
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh
@@ -0,0 +1,233 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# This test sends a stream of traffic from H1 through a switch, to H2. On the
+# egress port from the switch ($swp2), a shaper is installed. The test verifies
+# that the rates on the port match the configured shaper.
+#
+# In order to test per-class shaping, $swp2 actually contains TBF under PRIO or
+# ETS, with two different configurations. Traffic is prioritized using 802.1p.
+#
+# +-------------------------------------------+
+# | H1 |
+# | + $h1.10 $h1.11 + |
+# | | 192.0.2.1/28 192.0.2.17/28 | |
+# | | | |
+# | \______________ _____________/ |
+# | \ / |
+# | + $h1 |
+# +---------------------|---------------------+
+# |
+# +---------------------|---------------------+
+# | SW + $swp1 |
+# | _______________/ \_______________ |
+# | / \ |
+# | +-|--------------+ +--------------|-+ |
+# | | + $swp1.10 | | $swp1.11 + | |
+# | | | | | |
+# | | BR10 | | BR11 | |
+# | | | | | |
+# | | + $swp2.10 | | $swp2.11 + | |
+# | +-|--------------+ +--------------|-+ |
+# | \_______________ ______________/ |
+# | \ / |
+# | + $swp2 |
+# +---------------------|---------------------+
+# |
+# +---------------------|---------------------+
+# | H2 + $h2 |
+# | ______________/ \______________ |
+# | / \ |
+# | | | |
+# | + $h2.10 $h2.11 + |
+# | 192.0.2.2/28 192.0.2.18/28 |
+# +-------------------------------------------+
+
+NUM_NETIFS=4
+CHECK_TC="yes"
+source $lib_dir/lib.sh
+
+ipaddr()
+{
+ local host=$1; shift
+ local vlan=$1; shift
+
+ echo 192.0.2.$((16 * (vlan - 10) + host))
+}
+
+host_create()
+{
+ local dev=$1; shift
+ local host=$1; shift
+
+ simple_if_init $dev
+ mtu_set $dev 10000
+
+ vlan_create $dev 10 v$dev $(ipaddr $host 10)/28
+ ip link set dev $dev.10 type vlan egress 0:0
+
+ vlan_create $dev 11 v$dev $(ipaddr $host 11)/28
+ ip link set dev $dev.11 type vlan egress 0:1
+}
+
+host_destroy()
+{
+ local dev=$1; shift
+
+ vlan_destroy $dev 11
+ vlan_destroy $dev 10
+ mtu_restore $dev
+ simple_if_fini $dev
+}
+
+h1_create()
+{
+ host_create $h1 1
+}
+
+h1_destroy()
+{
+ host_destroy $h1
+}
+
+h2_create()
+{
+ host_create $h2 2
+
+ tc qdisc add dev $h2 clsact
+ tc filter add dev $h2 ingress pref 1010 prot 802.1q \
+ flower $TCFLAGS vlan_id 10 action pass
+ tc filter add dev $h2 ingress pref 1011 prot 802.1q \
+ flower $TCFLAGS vlan_id 11 action pass
+}
+
+h2_destroy()
+{
+ tc qdisc del dev $h2 clsact
+ host_destroy $h2
+}
+
+switch_create()
+{
+ local intf
+ local vlan
+
+ ip link add dev br10 type bridge
+ ip link add dev br11 type bridge
+
+ for intf in $swp1 $swp2; do
+ ip link set dev $intf up
+ mtu_set $intf 10000
+
+ for vlan in 10 11; do
+ vlan_create $intf $vlan
+ ip link set dev $intf.$vlan master br$vlan
+ ip link set dev $intf.$vlan up
+ done
+ done
+
+ for vlan in 10 11; do
+ ip link set dev $swp1.$vlan type vlan ingress 0:0 1:1
+ done
+
+ ip link set dev br10 up
+ ip link set dev br11 up
+}
+
+switch_destroy()
+{
+ local intf
+ local vlan
+
+ # A test may have been interrupted mid-run, with Qdisc installed. Delete
+ # it here.
+ tc qdisc del dev $swp2 root 2>/dev/null
+
+ ip link set dev br11 down
+ ip link set dev br10 down
+
+ for intf in $swp2 $swp1; do
+ for vlan in 11 10; do
+ ip link set dev $intf.$vlan down
+ ip link set dev $intf.$vlan nomaster
+ vlan_destroy $intf $vlan
+ done
+
+ mtu_restore $intf
+ ip link set dev $intf down
+ done
+
+ ip link del dev br11
+ ip link del dev br10
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ swp4=${NETIFS[p7]}
+ swp5=${NETIFS[p8]}
+
+ h2_mac=$(mac_get $h2)
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1.10 $(ipaddr 2 10) " vlan 10"
+ ping_test $h1.11 $(ipaddr 2 11) " vlan 11"
+}
+
+tbf_get_counter()
+{
+ local vlan=$1; shift
+
+ tc_rule_stats_get $h2 10$vlan ingress .bytes
+}
+
+do_tbf_test()
+{
+ local vlan=$1; shift
+ local mbit=$1; shift
+
+ start_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 2 $vlan) $h2_mac
+ sleep 5 # Wait for the burst to dwindle
+
+ local t2=$(busywait_for_counter 1000 +1 tbf_get_counter $vlan)
+ sleep 10
+ local t3=$(tbf_get_counter $vlan)
+ stop_traffic
+
+ RET=0
+
+ # Note: TBF uses 10^6 Mbits, not 2^20 ones.
+ local er=$((mbit * 1000 * 1000))
+ local nr=$(rate $t2 $t3 10)
+ local nr_pct=$((100 * (nr - er) / er))
+ ((-5 <= nr_pct && nr_pct <= 5))
+ check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-5%."
+
+ log_test "TC $((vlan - 10)): TBF rate ${mbit}Mbit"
+}
diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_ets.sh b/tools/testing/selftests/net/forwarding/sch_tbf_ets.sh
new file mode 100755
index 0000000..84fb6ca
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_tbf_ets.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+QDISC="ets strict"
+: ${lib_dir:=.}
+source $lib_dir/sch_tbf_etsprio.sh
diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh b/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh
new file mode 100644
index 0000000..8bd85da
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="
+ ping_ipv4
+ tbf_test
+"
+source $lib_dir/sch_tbf_core.sh
+
+tbf_test_one()
+{
+ local bs=$1; shift
+
+ tc qdisc replace dev $swp2 parent 10:3 handle 103: tbf \
+ rate 400Mbit burst $bs limit 1M
+ tc qdisc replace dev $swp2 parent 10:2 handle 102: tbf \
+ rate 800Mbit burst $bs limit 1M
+
+ do_tbf_test 10 400 $bs
+ do_tbf_test 11 800 $bs
+}
+
+tbf_test()
+{
+ # This test is used for both ETS and PRIO. Even though we only need two
+ # bands, PRIO demands a minimum of three.
+ tc qdisc add dev $swp2 root handle 10: $QDISC 3 priomap 2 1 0
+ tbf_test_one 128K
+ tc qdisc del dev $swp2 root
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_prio.sh b/tools/testing/selftests/net/forwarding/sch_tbf_prio.sh
new file mode 100755
index 0000000..9c8cb1c
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_tbf_prio.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+QDISC="prio bands"
+: ${lib_dir:=.}
+source $lib_dir/sch_tbf_etsprio.sh
diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_root.sh b/tools/testing/selftests/net/forwarding/sch_tbf_root.sh
new file mode 100755
index 0000000..72aa21b
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_tbf_root.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="
+ ping_ipv4
+ tbf_test
+"
+: ${lib_dir:=.}
+source $lib_dir/sch_tbf_core.sh
+
+tbf_test_one()
+{
+ local bs=$1; shift
+
+ tc qdisc replace dev $swp2 root handle 108: tbf \
+ rate 400Mbit burst $bs limit 1M
+ do_tbf_test 10 400 $bs
+}
+
+tbf_test()
+{
+ tbf_test_one 128K
+ tc qdisc del dev $swp2 root
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/skbedit_priority.sh b/tools/testing/selftests/net/forwarding/skbedit_priority.sh
new file mode 100755
index 0000000..bde11dc
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/skbedit_priority.sh
@@ -0,0 +1,170 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test sends traffic from H1 to H2. Either on ingress of $swp1, or on
+# egress of $swp2, the traffic is acted upon by an action skbedit priority. The
+# new priority should be taken into account when classifying traffic on the PRIO
+# qdisc at $swp2. The test verifies that for different priority values, the
+# traffic ends up in expected PRIO band.
+#
+# +----------------------+ +----------------------+
+# | H1 | | H2 |
+# | + $h1 | | $h2 + |
+# | | 192.0.2.1/28 | | 192.0.2.2/28 | |
+# +----|-----------------+ +----------------|-----+
+# | |
+# +----|----------------------------------------------------------------|-----+
+# | SW | | |
+# | +-|----------------------------------------------------------------|-+ |
+# | | + $swp1 BR $swp2 + | |
+# | | PRIO | |
+# | +--------------------------------------------------------------------+ |
+# +---------------------------------------------------------------------------+
+
+ALL_TESTS="
+ ping_ipv4
+ test_ingress
+ test_egress
+"
+
+NUM_NETIFS=4
+source lib.sh
+
+: ${HIT_TIMEOUT:=2000} # ms
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28
+}
+
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/28
+}
+
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/28
+}
+
+h2_destroy()
+{
+ simple_if_fini $h2 192.0.2.2/28
+}
+
+switch_create()
+{
+ ip link add name br1 up type bridge vlan_filtering 1
+ ip link set dev $swp1 master br1
+ ip link set dev $swp1 up
+ ip link set dev $swp2 master br1
+ ip link set dev $swp2 up
+
+ tc qdisc add dev $swp1 clsact
+ tc qdisc add dev $swp2 clsact
+ tc qdisc add dev $swp2 root handle 10: \
+ prio bands 8 priomap 7 6 5 4 3 2 1 0
+}
+
+switch_destroy()
+{
+ tc qdisc del dev $swp2 root
+ tc qdisc del dev $swp2 clsact
+ tc qdisc del dev $swp1 clsact
+
+ ip link set dev $swp2 down
+ ip link set dev $swp2 nomaster
+ ip link set dev $swp1 down
+ ip link set dev $swp1 nomaster
+ ip link del dev br1
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ h2mac=$(mac_get $h2)
+
+ vrf_prepare
+ h1_create
+ h2_create
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h2_destroy
+ h1_destroy
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.2
+}
+
+test_skbedit_priority_one()
+{
+ local locus=$1; shift
+ local prio=$1; shift
+ local classid=$1; shift
+
+ RET=0
+
+ tc filter add $locus handle 101 pref 1 \
+ flower action skbedit priority $prio
+
+ local pkt0=$(qdisc_parent_stats_get $swp2 $classid .packets)
+ local pkt2=$(tc_rule_handle_stats_get "$locus" 101)
+ $MZ $h1 -t udp "sp=54321,dp=12345" -c 10 -d 20msec -p 100 \
+ -a own -b $h2mac -A 192.0.2.1 -B 192.0.2.2 -q
+
+ local pkt1
+ pkt1=$(busywait "$HIT_TIMEOUT" until_counter_is ">= $((pkt0 + 10))" \
+ qdisc_parent_stats_get $swp2 $classid .packets)
+ check_err $? "Expected to get 10 packets on class $classid, but got $((pkt1 - pkt0))."
+
+ local pkt3=$(tc_rule_handle_stats_get "$locus" 101)
+ ((pkt3 >= pkt2 + 10))
+ check_err $? "Expected to get 10 packets on skbedit rule but got $((pkt3 - pkt2))."
+
+ log_test "$locus skbedit priority $prio -> classid $classid"
+
+ tc filter del $locus pref 1
+}
+
+test_ingress()
+{
+ local prio
+
+ for prio in {0..7}; do
+ test_skbedit_priority_one "dev $swp1 ingress" \
+ $prio 10:$((8 - prio))
+ done
+}
+
+test_egress()
+{
+ local prio
+
+ for prio in {0..7}; do
+ test_skbedit_priority_one "dev $swp2 egress" \
+ $prio 10:$((8 - prio))
+ done
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh
index 813d02d..d9eca22 100755
--- a/tools/testing/selftests/net/forwarding/tc_actions.sh
+++ b/tools/testing/selftests/net/forwarding/tc_actions.sh
@@ -2,7 +2,8 @@
# SPDX-License-Identifier: GPL-2.0
ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \
- mirred_egress_mirror_test gact_trap_test"
+ mirred_egress_mirror_test matchall_mirred_egress_mirror_test \
+ gact_trap_test"
NUM_NETIFS=4
source tc_common.sh
source lib.sh
@@ -50,6 +51,9 @@
mirred_egress_test()
{
local action=$1
+ local protocol=$2
+ local classifier=$3
+ local classifier_args=$4
RET=0
@@ -62,9 +66,9 @@
tc_check_packets "dev $h2 ingress" 101 1
check_fail $? "Matched without redirect rule inserted"
- tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \
- $tcflags dst_ip 192.0.2.2 action mirred egress $action \
- dev $swp2
+ tc filter add dev $swp1 ingress protocol $protocol pref 1 handle 101 \
+ $classifier $tcflags $classifier_args \
+ action mirred egress $action dev $swp2
$MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
-t ip -q
@@ -72,10 +76,11 @@
tc_check_packets "dev $h2 ingress" 101 1
check_err $? "Did not match incoming $action packet"
- tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower
+ tc filter del dev $swp1 ingress protocol $protocol pref 1 handle 101 \
+ $classifier
tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
- log_test "mirred egress $action ($tcflags)"
+ log_test "mirred egress $classifier $action ($tcflags)"
}
gact_drop_and_ok_test()
@@ -187,12 +192,17 @@
mirred_egress_redirect_test()
{
- mirred_egress_test "redirect"
+ mirred_egress_test "redirect" "ip" "flower" "dst_ip 192.0.2.2"
}
mirred_egress_mirror_test()
{
- mirred_egress_test "mirror"
+ mirred_egress_test "mirror" "ip" "flower" "dst_ip 192.0.2.2"
+}
+
+matchall_mirred_egress_mirror_test()
+{
+ mirred_egress_test "mirror" "all" "matchall" ""
}
trap cleanup EXIT
diff --git a/tools/testing/selftests/net/forwarding/tc_common.sh b/tools/testing/selftests/net/forwarding/tc_common.sh
index 315e934..0e18e8b 100644
--- a/tools/testing/selftests/net/forwarding/tc_common.sh
+++ b/tools/testing/selftests/net/forwarding/tc_common.sh
@@ -3,14 +3,24 @@
CHECK_TC="yes"
+# Can be overridden by the configuration file. See lib.sh
+TC_HIT_TIMEOUT=${TC_HIT_TIMEOUT:=1000} # ms
+
tc_check_packets()
{
local id=$1
local handle=$2
local count=$3
- cmd_jq "tc -j -s filter show $id" \
- ".[] | select(.options.handle == $handle) | \
- select(.options.actions[0].stats.packets == $count)" \
- &> /dev/null
+ busywait "$TC_HIT_TIMEOUT" until_counter_is "== $count" \
+ tc_rule_handle_stats_get "$id" "$handle" > /dev/null
+}
+
+tc_check_packets_hitting()
+{
+ local id=$1
+ local handle=$2
+
+ busywait "$TC_HIT_TIMEOUT" until_counter_is "> 0" \
+ tc_rule_handle_stats_get "$id" "$handle" > /dev/null
}
diff --git a/tools/testing/selftests/net/forwarding/tc_police.sh b/tools/testing/selftests/net/forwarding/tc_police.sh
new file mode 100755
index 0000000..160f9cc
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/tc_police.sh
@@ -0,0 +1,333 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test tc-police action.
+#
+# +---------------------------------+
+# | H1 (vrf) |
+# | + $h1 |
+# | | 192.0.2.1/24 |
+# | | |
+# | | default via 192.0.2.2 |
+# +----|----------------------------+
+# |
+# +----|----------------------------------------------------------------------+
+# | SW | |
+# | + $rp1 |
+# | 192.0.2.2/24 |
+# | |
+# | 198.51.100.2/24 203.0.113.2/24 |
+# | + $rp2 + $rp3 |
+# | | | |
+# +----|-----------------------------------------|----------------------------+
+# | |
+# +----|----------------------------+ +----|----------------------------+
+# | | default via 198.51.100.2 | | | default via 203.0.113.2 |
+# | | | | | |
+# | | 198.51.100.1/24 | | | 203.0.113.1/24 |
+# | + $h2 | | + $h3 |
+# | H2 (vrf) | | H3 (vrf) |
+# +---------------------------------+ +---------------------------------+
+
+ALL_TESTS="
+ police_rx_test
+ police_tx_test
+ police_shared_test
+ police_rx_mirror_test
+ police_tx_mirror_test
+"
+NUM_NETIFS=6
+source tc_common.sh
+source lib.sh
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/24
+
+ ip -4 route add default vrf v$h1 nexthop via 192.0.2.2
+}
+
+h1_destroy()
+{
+ ip -4 route del default vrf v$h1 nexthop via 192.0.2.2
+
+ simple_if_fini $h1 192.0.2.1/24
+}
+
+h2_create()
+{
+ simple_if_init $h2 198.51.100.1/24
+
+ ip -4 route add default vrf v$h2 nexthop via 198.51.100.2
+
+ tc qdisc add dev $h2 clsact
+}
+
+h2_destroy()
+{
+ tc qdisc del dev $h2 clsact
+
+ ip -4 route del default vrf v$h2 nexthop via 198.51.100.2
+
+ simple_if_fini $h2 198.51.100.1/24
+}
+
+h3_create()
+{
+ simple_if_init $h3 203.0.113.1/24
+
+ ip -4 route add default vrf v$h3 nexthop via 203.0.113.2
+
+ tc qdisc add dev $h3 clsact
+}
+
+h3_destroy()
+{
+ tc qdisc del dev $h3 clsact
+
+ ip -4 route del default vrf v$h3 nexthop via 203.0.113.2
+
+ simple_if_fini $h3 203.0.113.1/24
+}
+
+router_create()
+{
+ ip link set dev $rp1 up
+ ip link set dev $rp2 up
+ ip link set dev $rp3 up
+
+ __addr_add_del $rp1 add 192.0.2.2/24
+ __addr_add_del $rp2 add 198.51.100.2/24
+ __addr_add_del $rp3 add 203.0.113.2/24
+
+ tc qdisc add dev $rp1 clsact
+ tc qdisc add dev $rp2 clsact
+}
+
+router_destroy()
+{
+ tc qdisc del dev $rp2 clsact
+ tc qdisc del dev $rp1 clsact
+
+ __addr_add_del $rp3 del 203.0.113.2/24
+ __addr_add_del $rp2 del 198.51.100.2/24
+ __addr_add_del $rp1 del 192.0.2.2/24
+
+ ip link set dev $rp3 down
+ ip link set dev $rp2 down
+ ip link set dev $rp1 down
+}
+
+police_common_test()
+{
+ local test_name=$1; shift
+
+ RET=0
+
+ # Rule to measure bandwidth on ingress of $h2
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \
+ action drop
+
+ mausezahn $h1 -a own -b $(mac_get $rp1) -A 192.0.2.1 -B 198.51.100.1 \
+ -t udp sp=12345,dp=54321 -p 1000 -c 0 -q &
+
+ local t0=$(tc_rule_stats_get $h2 1 ingress .bytes)
+ sleep 10
+ local t1=$(tc_rule_stats_get $h2 1 ingress .bytes)
+
+ local er=$((80 * 1000 * 1000))
+ local nr=$(rate $t0 $t1 10)
+ local nr_pct=$((100 * (nr - er) / er))
+ ((-10 <= nr_pct && nr_pct <= 10))
+ check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-10%."
+
+ log_test "$test_name"
+
+ { kill %% && wait %%; } 2>/dev/null
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+}
+
+police_rx_test()
+{
+ # Rule to police traffic destined to $h2 on ingress of $rp1
+ tc filter add dev $rp1 ingress protocol ip pref 1 handle 101 flower \
+ dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \
+ action police rate 80mbit burst 16k conform-exceed drop/ok
+
+ police_common_test "police on rx"
+
+ tc filter del dev $rp1 ingress protocol ip pref 1 handle 101 flower
+}
+
+police_tx_test()
+{
+ # Rule to police traffic destined to $h2 on egress of $rp2
+ tc filter add dev $rp2 egress protocol ip pref 1 handle 101 flower \
+ dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \
+ action police rate 80mbit burst 16k conform-exceed drop/ok
+
+ police_common_test "police on tx"
+
+ tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower
+}
+
+police_shared_common_test()
+{
+ local dport=$1; shift
+ local test_name=$1; shift
+
+ RET=0
+
+ mausezahn $h1 -a own -b $(mac_get $rp1) -A 192.0.2.1 -B 198.51.100.1 \
+ -t udp sp=12345,dp=$dport -p 1000 -c 0 -q &
+
+ local t0=$(tc_rule_stats_get $h2 1 ingress .bytes)
+ sleep 10
+ local t1=$(tc_rule_stats_get $h2 1 ingress .bytes)
+
+ local er=$((80 * 1000 * 1000))
+ local nr=$(rate $t0 $t1 10)
+ local nr_pct=$((100 * (nr - er) / er))
+ ((-10 <= nr_pct && nr_pct <= 10))
+ check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-10%."
+
+ log_test "$test_name"
+
+ { kill %% && wait %%; } 2>/dev/null
+}
+
+police_shared_test()
+{
+ # Rule to measure bandwidth on ingress of $h2
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ dst_ip 198.51.100.1 ip_proto udp src_port 12345 \
+ action drop
+
+ # Rule to police traffic destined to $h2 on ingress of $rp1
+ tc filter add dev $rp1 ingress protocol ip pref 1 handle 101 flower \
+ dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \
+ action police rate 80mbit burst 16k conform-exceed drop/ok \
+ index 10
+
+ # Rule to police a different flow destined to $h2 on egress of $rp2
+ # using same policer
+ tc filter add dev $rp2 egress protocol ip pref 1 handle 101 flower \
+ dst_ip 198.51.100.1 ip_proto udp dst_port 22222 \
+ action police index 10
+
+ police_shared_common_test 54321 "police with shared policer - rx"
+
+ police_shared_common_test 22222 "police with shared policer - tx"
+
+ tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower
+ tc filter del dev $rp1 ingress protocol ip pref 1 handle 101 flower
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+}
+
+police_mirror_common_test()
+{
+ local pol_if=$1; shift
+ local dir=$1; shift
+ local test_name=$1; shift
+
+ RET=0
+
+ # Rule to measure bandwidth on ingress of $h2
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \
+ action drop
+
+ # Rule to measure bandwidth of mirrored traffic on ingress of $h3
+ tc filter add dev $h3 ingress protocol ip pref 1 handle 101 flower \
+ dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \
+ action drop
+
+ # Rule to police traffic destined to $h2 and mirror to $h3
+ tc filter add dev $pol_if $dir protocol ip pref 1 handle 101 flower \
+ dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \
+ action police rate 80mbit burst 16k conform-exceed drop/pipe \
+ action mirred egress mirror dev $rp3
+
+ mausezahn $h1 -a own -b $(mac_get $rp1) -A 192.0.2.1 -B 198.51.100.1 \
+ -t udp sp=12345,dp=54321 -p 1000 -c 0 -q &
+
+ local t0=$(tc_rule_stats_get $h2 1 ingress .bytes)
+ sleep 10
+ local t1=$(tc_rule_stats_get $h2 1 ingress .bytes)
+
+ local er=$((80 * 1000 * 1000))
+ local nr=$(rate $t0 $t1 10)
+ local nr_pct=$((100 * (nr - er) / er))
+ ((-10 <= nr_pct && nr_pct <= 10))
+ check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-10%."
+
+ local t0=$(tc_rule_stats_get $h3 1 ingress .bytes)
+ sleep 10
+ local t1=$(tc_rule_stats_get $h3 1 ingress .bytes)
+
+ local er=$((80 * 1000 * 1000))
+ local nr=$(rate $t0 $t1 10)
+ local nr_pct=$((100 * (nr - er) / er))
+ ((-10 <= nr_pct && nr_pct <= 10))
+ check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-10%."
+
+ log_test "$test_name"
+
+ { kill %% && wait %%; } 2>/dev/null
+ tc filter del dev $pol_if $dir protocol ip pref 1 handle 101 flower
+ tc filter del dev $h3 ingress protocol ip pref 1 handle 101 flower
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+}
+
+police_rx_mirror_test()
+{
+ police_mirror_common_test $rp1 ingress "police rx and mirror"
+}
+
+police_tx_mirror_test()
+{
+ police_mirror_common_test $rp2 egress "police tx and mirror"
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ rp1=${NETIFS[p2]}
+
+ rp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ rp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ vrf_prepare
+ forwarding_enable
+
+ h1_create
+ h2_create
+ h3_create
+ router_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ router_destroy
+ h3_destroy
+ h2_destroy
+ h1_destroy
+
+ forwarding_restore
+ vrf_cleanup
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/hwtstamp_config.c b/tools/testing/selftests/net/hwtstamp_config.c
new file mode 100644
index 0000000..e1fdee8
--- /dev/null
+++ b/tools/testing/selftests/net/hwtstamp_config.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Test program for SIOC{G,S}HWTSTAMP
+ * Copyright 2013 Solarflare Communications
+ * Author: Ben Hutchings
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+
+#include <linux/if.h>
+#include <linux/net_tstamp.h>
+#include <linux/sockios.h>
+
+static int
+lookup_value(const char **names, int size, const char *name)
+{
+ int value;
+
+ for (value = 0; value < size; value++)
+ if (names[value] && strcasecmp(names[value], name) == 0)
+ return value;
+
+ return -1;
+}
+
+static const char *
+lookup_name(const char **names, int size, int value)
+{
+ return (value >= 0 && value < size) ? names[value] : NULL;
+}
+
+static void list_names(FILE *f, const char **names, int size)
+{
+ int value;
+
+ for (value = 0; value < size; value++)
+ if (names[value])
+ fprintf(f, " %s\n", names[value]);
+}
+
+static const char *tx_types[] = {
+#define TX_TYPE(name) [HWTSTAMP_TX_ ## name] = #name
+ TX_TYPE(OFF),
+ TX_TYPE(ON),
+ TX_TYPE(ONESTEP_SYNC)
+#undef TX_TYPE
+};
+#define N_TX_TYPES ((int)(sizeof(tx_types) / sizeof(tx_types[0])))
+
+static const char *rx_filters[] = {
+#define RX_FILTER(name) [HWTSTAMP_FILTER_ ## name] = #name
+ RX_FILTER(NONE),
+ RX_FILTER(ALL),
+ RX_FILTER(SOME),
+ RX_FILTER(PTP_V1_L4_EVENT),
+ RX_FILTER(PTP_V1_L4_SYNC),
+ RX_FILTER(PTP_V1_L4_DELAY_REQ),
+ RX_FILTER(PTP_V2_L4_EVENT),
+ RX_FILTER(PTP_V2_L4_SYNC),
+ RX_FILTER(PTP_V2_L4_DELAY_REQ),
+ RX_FILTER(PTP_V2_L2_EVENT),
+ RX_FILTER(PTP_V2_L2_SYNC),
+ RX_FILTER(PTP_V2_L2_DELAY_REQ),
+ RX_FILTER(PTP_V2_EVENT),
+ RX_FILTER(PTP_V2_SYNC),
+ RX_FILTER(PTP_V2_DELAY_REQ),
+#undef RX_FILTER
+};
+#define N_RX_FILTERS ((int)(sizeof(rx_filters) / sizeof(rx_filters[0])))
+
+static void usage(void)
+{
+ fputs("Usage: hwtstamp_config if_name [tx_type rx_filter]\n"
+ "tx_type is any of (case-insensitive):\n",
+ stderr);
+ list_names(stderr, tx_types, N_TX_TYPES);
+ fputs("rx_filter is any of (case-insensitive):\n", stderr);
+ list_names(stderr, rx_filters, N_RX_FILTERS);
+}
+
+int main(int argc, char **argv)
+{
+ struct ifreq ifr;
+ struct hwtstamp_config config;
+ const char *name;
+ int sock;
+
+ if ((argc != 2 && argc != 4) || (strlen(argv[1]) >= IFNAMSIZ)) {
+ usage();
+ return 2;
+ }
+
+ if (argc == 4) {
+ config.flags = 0;
+ config.tx_type = lookup_value(tx_types, N_TX_TYPES, argv[2]);
+ config.rx_filter = lookup_value(rx_filters, N_RX_FILTERS, argv[3]);
+ if (config.tx_type < 0 || config.rx_filter < 0) {
+ usage();
+ return 2;
+ }
+ }
+
+ sock = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock < 0) {
+ perror("socket");
+ return 1;
+ }
+
+ strcpy(ifr.ifr_name, argv[1]);
+ ifr.ifr_data = (caddr_t)&config;
+
+ if (ioctl(sock, (argc == 2) ? SIOCGHWTSTAMP : SIOCSHWTSTAMP, &ifr)) {
+ perror("ioctl");
+ return 1;
+ }
+
+ printf("flags = %#x\n", config.flags);
+ name = lookup_name(tx_types, N_TX_TYPES, config.tx_type);
+ if (name)
+ printf("tx_type = %s\n", name);
+ else
+ printf("tx_type = %d\n", config.tx_type);
+ name = lookup_name(rx_filters, N_RX_FILTERS, config.rx_filter);
+ if (name)
+ printf("rx_filter = %s\n", name);
+ else
+ printf("rx_filter = %d\n", config.rx_filter);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/net/ip_defrag.c b/tools/testing/selftests/net/ip_defrag.c
index c0c9ecb..f9ed749 100644
--- a/tools/testing/selftests/net/ip_defrag.c
+++ b/tools/testing/selftests/net/ip_defrag.c
@@ -192,9 +192,9 @@
}
res = sendto(fd_raw, ip_frame, frag_len, 0, addr, alen);
- if (res < 0)
+ if (res < 0 && errno != EPERM)
error(1, errno, "send_fragment");
- if (res != frag_len)
+ if (res >= 0 && res != frag_len)
error(1, 0, "send_fragment: %d vs %d", res, frag_len);
frag_counter++;
@@ -313,9 +313,9 @@
iphdr->ip_len = htons(frag_len);
}
res = sendto(fd_raw, ip_frame, frag_len, 0, addr, alen);
- if (res < 0)
+ if (res < 0 && errno != EPERM)
error(1, errno, "sendto overlap: %d", frag_len);
- if (res != frag_len)
+ if (res >= 0 && res != frag_len)
error(1, 0, "sendto overlap: %d vs %d", (int)res, frag_len);
frag_counter++;
}
diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c
new file mode 100644
index 0000000..17ced7d
--- /dev/null
+++ b/tools/testing/selftests/net/ipsec.c
@@ -0,0 +1,2195 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ipsec.c - Check xfrm on veth inside a net-ns.
+ * Copyright (c) 2018 Dmitry Safonov
+ */
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <asm/types.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/limits.h>
+#include <linux/netlink.h>
+#include <linux/random.h>
+#include <linux/rtnetlink.h>
+#include <linux/veth.h>
+#include <linux/xfrm.h>
+#include <netinet/in.h>
+#include <net/if.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+#define printk(fmt, ...) \
+ ksft_print_msg("%d[%u] " fmt "\n", getpid(), __LINE__, ##__VA_ARGS__)
+
+#define pr_err(fmt, ...) printk(fmt ": %m", ##__VA_ARGS__)
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
+
+#define IPV4_STR_SZ 16 /* xxx.xxx.xxx.xxx is longest + \0 */
+#define MAX_PAYLOAD 2048
+#define XFRM_ALGO_KEY_BUF_SIZE 512
+#define MAX_PROCESSES (1 << 14) /* /16 mask divided by /30 subnets */
+#define INADDR_A ((in_addr_t) 0x0a000000) /* 10.0.0.0 */
+#define INADDR_B ((in_addr_t) 0xc0a80000) /* 192.168.0.0 */
+
+/* /30 mask for one veth connection */
+#define PREFIX_LEN 30
+#define child_ip(nr) (4*nr + 1)
+#define grchild_ip(nr) (4*nr + 2)
+
+#define VETH_FMT "ktst-%d"
+#define VETH_LEN 12
+
+static int nsfd_parent = -1;
+static int nsfd_childa = -1;
+static int nsfd_childb = -1;
+static long page_size;
+
+/*
+ * ksft_cnt is static in kselftest, so isn't shared with children.
+ * We have to send a test result back to parent and count there.
+ * results_fd is a pipe with test feedback from children.
+ */
+static int results_fd[2];
+
+const unsigned int ping_delay_nsec = 50 * 1000 * 1000;
+const unsigned int ping_timeout = 300;
+const unsigned int ping_count = 100;
+const unsigned int ping_success = 80;
+
+static void randomize_buffer(void *buf, size_t buflen)
+{
+ int *p = (int *)buf;
+ size_t words = buflen / sizeof(int);
+ size_t leftover = buflen % sizeof(int);
+
+ if (!buflen)
+ return;
+
+ while (words--)
+ *p++ = rand();
+
+ if (leftover) {
+ int tmp = rand();
+
+ memcpy(buf + buflen - leftover, &tmp, leftover);
+ }
+
+ return;
+}
+
+static int unshare_open(void)
+{
+ const char *netns_path = "/proc/self/ns/net";
+ int fd;
+
+ if (unshare(CLONE_NEWNET) != 0) {
+ pr_err("unshare()");
+ return -1;
+ }
+
+ fd = open(netns_path, O_RDONLY);
+ if (fd <= 0) {
+ pr_err("open(%s)", netns_path);
+ return -1;
+ }
+
+ return fd;
+}
+
+static int switch_ns(int fd)
+{
+ if (setns(fd, CLONE_NEWNET)) {
+ pr_err("setns()");
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * Running the test inside a new parent net namespace to bother less
+ * about cleanup on error-path.
+ */
+static int init_namespaces(void)
+{
+ nsfd_parent = unshare_open();
+ if (nsfd_parent <= 0)
+ return -1;
+
+ nsfd_childa = unshare_open();
+ if (nsfd_childa <= 0)
+ return -1;
+
+ if (switch_ns(nsfd_parent))
+ return -1;
+
+ nsfd_childb = unshare_open();
+ if (nsfd_childb <= 0)
+ return -1;
+
+ if (switch_ns(nsfd_parent))
+ return -1;
+ return 0;
+}
+
+static int netlink_sock(int *sock, uint32_t *seq_nr, int proto)
+{
+ if (*sock > 0) {
+ seq_nr++;
+ return 0;
+ }
+
+ *sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, proto);
+ if (*sock <= 0) {
+ pr_err("socket(AF_NETLINK)");
+ return -1;
+ }
+
+ randomize_buffer(seq_nr, sizeof(*seq_nr));
+
+ return 0;
+}
+
+static inline struct rtattr *rtattr_hdr(struct nlmsghdr *nh)
+{
+ return (struct rtattr *)((char *)(nh) + RTA_ALIGN((nh)->nlmsg_len));
+}
+
+static int rtattr_pack(struct nlmsghdr *nh, size_t req_sz,
+ unsigned short rta_type, const void *payload, size_t size)
+{
+ /* NLMSG_ALIGNTO == RTA_ALIGNTO, nlmsg_len already aligned */
+ struct rtattr *attr = rtattr_hdr(nh);
+ size_t nl_size = RTA_ALIGN(nh->nlmsg_len) + RTA_LENGTH(size);
+
+ if (req_sz < nl_size) {
+ printk("req buf is too small: %zu < %zu", req_sz, nl_size);
+ return -1;
+ }
+ nh->nlmsg_len = nl_size;
+
+ attr->rta_len = RTA_LENGTH(size);
+ attr->rta_type = rta_type;
+ memcpy(RTA_DATA(attr), payload, size);
+
+ return 0;
+}
+
+static struct rtattr *_rtattr_begin(struct nlmsghdr *nh, size_t req_sz,
+ unsigned short rta_type, const void *payload, size_t size)
+{
+ struct rtattr *ret = rtattr_hdr(nh);
+
+ if (rtattr_pack(nh, req_sz, rta_type, payload, size))
+ return 0;
+
+ return ret;
+}
+
+static inline struct rtattr *rtattr_begin(struct nlmsghdr *nh, size_t req_sz,
+ unsigned short rta_type)
+{
+ return _rtattr_begin(nh, req_sz, rta_type, 0, 0);
+}
+
+static inline void rtattr_end(struct nlmsghdr *nh, struct rtattr *attr)
+{
+ char *nlmsg_end = (char *)nh + nh->nlmsg_len;
+
+ attr->rta_len = nlmsg_end - (char *)attr;
+}
+
+static int veth_pack_peerb(struct nlmsghdr *nh, size_t req_sz,
+ const char *peer, int ns)
+{
+ struct ifinfomsg pi;
+ struct rtattr *peer_attr;
+
+ memset(&pi, 0, sizeof(pi));
+ pi.ifi_family = AF_UNSPEC;
+ pi.ifi_change = 0xFFFFFFFF;
+
+ peer_attr = _rtattr_begin(nh, req_sz, VETH_INFO_PEER, &pi, sizeof(pi));
+ if (!peer_attr)
+ return -1;
+
+ if (rtattr_pack(nh, req_sz, IFLA_IFNAME, peer, strlen(peer)))
+ return -1;
+
+ if (rtattr_pack(nh, req_sz, IFLA_NET_NS_FD, &ns, sizeof(ns)))
+ return -1;
+
+ rtattr_end(nh, peer_attr);
+
+ return 0;
+}
+
+static int netlink_check_answer(int sock)
+{
+ struct nlmsgerror {
+ struct nlmsghdr hdr;
+ int error;
+ struct nlmsghdr orig_msg;
+ } answer;
+
+ if (recv(sock, &answer, sizeof(answer), 0) < 0) {
+ pr_err("recv()");
+ return -1;
+ } else if (answer.hdr.nlmsg_type != NLMSG_ERROR) {
+ printk("expected NLMSG_ERROR, got %d", (int)answer.hdr.nlmsg_type);
+ return -1;
+ } else if (answer.error) {
+ printk("NLMSG_ERROR: %d: %s",
+ answer.error, strerror(-answer.error));
+ return answer.error;
+ }
+
+ return 0;
+}
+
+static int veth_add(int sock, uint32_t seq, const char *peera, int ns_a,
+ const char *peerb, int ns_b)
+{
+ uint16_t flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
+ struct {
+ struct nlmsghdr nh;
+ struct ifinfomsg info;
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+ const char veth_type[] = "veth";
+ struct rtattr *link_info, *info_data;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.info));
+ req.nh.nlmsg_type = RTM_NEWLINK;
+ req.nh.nlmsg_flags = flags;
+ req.nh.nlmsg_seq = seq;
+ req.info.ifi_family = AF_UNSPEC;
+ req.info.ifi_change = 0xFFFFFFFF;
+
+ if (rtattr_pack(&req.nh, sizeof(req), IFLA_IFNAME, peera, strlen(peera)))
+ return -1;
+
+ if (rtattr_pack(&req.nh, sizeof(req), IFLA_NET_NS_FD, &ns_a, sizeof(ns_a)))
+ return -1;
+
+ link_info = rtattr_begin(&req.nh, sizeof(req), IFLA_LINKINFO);
+ if (!link_info)
+ return -1;
+
+ if (rtattr_pack(&req.nh, sizeof(req), IFLA_INFO_KIND, veth_type, sizeof(veth_type)))
+ return -1;
+
+ info_data = rtattr_begin(&req.nh, sizeof(req), IFLA_INFO_DATA);
+ if (!info_data)
+ return -1;
+
+ if (veth_pack_peerb(&req.nh, sizeof(req), peerb, ns_b))
+ return -1;
+
+ rtattr_end(&req.nh, info_data);
+ rtattr_end(&req.nh, link_info);
+
+ if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return -1;
+ }
+ return netlink_check_answer(sock);
+}
+
+static int ip4_addr_set(int sock, uint32_t seq, const char *intf,
+ struct in_addr addr, uint8_t prefix)
+{
+ uint16_t flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
+ struct {
+ struct nlmsghdr nh;
+ struct ifaddrmsg info;
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.info));
+ req.nh.nlmsg_type = RTM_NEWADDR;
+ req.nh.nlmsg_flags = flags;
+ req.nh.nlmsg_seq = seq;
+ req.info.ifa_family = AF_INET;
+ req.info.ifa_prefixlen = prefix;
+ req.info.ifa_index = if_nametoindex(intf);
+
+#ifdef DEBUG
+ {
+ char addr_str[IPV4_STR_SZ] = {};
+
+ strncpy(addr_str, inet_ntoa(addr), IPV4_STR_SZ - 1);
+
+ printk("ip addr set %s", addr_str);
+ }
+#endif
+
+ if (rtattr_pack(&req.nh, sizeof(req), IFA_LOCAL, &addr, sizeof(addr)))
+ return -1;
+
+ if (rtattr_pack(&req.nh, sizeof(req), IFA_ADDRESS, &addr, sizeof(addr)))
+ return -1;
+
+ if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return -1;
+ }
+ return netlink_check_answer(sock);
+}
+
+static int link_set_up(int sock, uint32_t seq, const char *intf)
+{
+ struct {
+ struct nlmsghdr nh;
+ struct ifinfomsg info;
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.info));
+ req.nh.nlmsg_type = RTM_NEWLINK;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_seq = seq;
+ req.info.ifi_family = AF_UNSPEC;
+ req.info.ifi_change = 0xFFFFFFFF;
+ req.info.ifi_index = if_nametoindex(intf);
+ req.info.ifi_flags = IFF_UP;
+ req.info.ifi_change = IFF_UP;
+
+ if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return -1;
+ }
+ return netlink_check_answer(sock);
+}
+
+static int ip4_route_set(int sock, uint32_t seq, const char *intf,
+ struct in_addr src, struct in_addr dst)
+{
+ struct {
+ struct nlmsghdr nh;
+ struct rtmsg rt;
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+ unsigned int index = if_nametoindex(intf);
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.rt));
+ req.nh.nlmsg_type = RTM_NEWROUTE;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE;
+ req.nh.nlmsg_seq = seq;
+ req.rt.rtm_family = AF_INET;
+ req.rt.rtm_dst_len = 32;
+ req.rt.rtm_table = RT_TABLE_MAIN;
+ req.rt.rtm_protocol = RTPROT_BOOT;
+ req.rt.rtm_scope = RT_SCOPE_LINK;
+ req.rt.rtm_type = RTN_UNICAST;
+
+ if (rtattr_pack(&req.nh, sizeof(req), RTA_DST, &dst, sizeof(dst)))
+ return -1;
+
+ if (rtattr_pack(&req.nh, sizeof(req), RTA_PREFSRC, &src, sizeof(src)))
+ return -1;
+
+ if (rtattr_pack(&req.nh, sizeof(req), RTA_OIF, &index, sizeof(index)))
+ return -1;
+
+ if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return -1;
+ }
+
+ return netlink_check_answer(sock);
+}
+
+static int tunnel_set_route(int route_sock, uint32_t *route_seq, char *veth,
+ struct in_addr tunsrc, struct in_addr tundst)
+{
+ if (ip4_addr_set(route_sock, (*route_seq)++, "lo",
+ tunsrc, PREFIX_LEN)) {
+ printk("Failed to set ipv4 addr");
+ return -1;
+ }
+
+ if (ip4_route_set(route_sock, (*route_seq)++, veth, tunsrc, tundst)) {
+ printk("Failed to set ipv4 route");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int init_child(int nsfd, char *veth, unsigned int src, unsigned int dst)
+{
+ struct in_addr intsrc = inet_makeaddr(INADDR_B, src);
+ struct in_addr tunsrc = inet_makeaddr(INADDR_A, src);
+ struct in_addr tundst = inet_makeaddr(INADDR_A, dst);
+ int route_sock = -1, ret = -1;
+ uint32_t route_seq;
+
+ if (switch_ns(nsfd))
+ return -1;
+
+ if (netlink_sock(&route_sock, &route_seq, NETLINK_ROUTE)) {
+ printk("Failed to open netlink route socket in child");
+ return -1;
+ }
+
+ if (ip4_addr_set(route_sock, route_seq++, veth, intsrc, PREFIX_LEN)) {
+ printk("Failed to set ipv4 addr");
+ goto err;
+ }
+
+ if (link_set_up(route_sock, route_seq++, veth)) {
+ printk("Failed to bring up %s", veth);
+ goto err;
+ }
+
+ if (tunnel_set_route(route_sock, &route_seq, veth, tunsrc, tundst)) {
+ printk("Failed to add tunnel route on %s", veth);
+ goto err;
+ }
+ ret = 0;
+
+err:
+ close(route_sock);
+ return ret;
+}
+
+#define ALGO_LEN 64
+enum desc_type {
+ CREATE_TUNNEL = 0,
+ ALLOCATE_SPI,
+ MONITOR_ACQUIRE,
+ EXPIRE_STATE,
+ EXPIRE_POLICY,
+};
+const char *desc_name[] = {
+ "create tunnel",
+ "alloc spi",
+ "monitor acquire",
+ "expire state",
+ "expire policy"
+};
+struct xfrm_desc {
+ enum desc_type type;
+ uint8_t proto;
+ char a_algo[ALGO_LEN];
+ char e_algo[ALGO_LEN];
+ char c_algo[ALGO_LEN];
+ char ae_algo[ALGO_LEN];
+ unsigned int icv_len;
+ /* unsigned key_len; */
+};
+
+enum msg_type {
+ MSG_ACK = 0,
+ MSG_EXIT,
+ MSG_PING,
+ MSG_XFRM_PREPARE,
+ MSG_XFRM_ADD,
+ MSG_XFRM_DEL,
+ MSG_XFRM_CLEANUP,
+};
+
+struct test_desc {
+ enum msg_type type;
+ union {
+ struct {
+ in_addr_t reply_ip;
+ unsigned int port;
+ } ping;
+ struct xfrm_desc xfrm_desc;
+ } body;
+};
+
+struct test_result {
+ struct xfrm_desc desc;
+ unsigned int res;
+};
+
+static void write_test_result(unsigned int res, struct xfrm_desc *d)
+{
+ struct test_result tr = {};
+ ssize_t ret;
+
+ tr.desc = *d;
+ tr.res = res;
+
+ ret = write(results_fd[1], &tr, sizeof(tr));
+ if (ret != sizeof(tr))
+ pr_err("Failed to write the result in pipe %zd", ret);
+}
+
+static void write_msg(int fd, struct test_desc *msg, bool exit_of_fail)
+{
+ ssize_t bytes = write(fd, msg, sizeof(*msg));
+
+ /* Make sure that write/read is atomic to a pipe */
+ BUILD_BUG_ON(sizeof(struct test_desc) > PIPE_BUF);
+
+ if (bytes < 0) {
+ pr_err("write()");
+ if (exit_of_fail)
+ exit(KSFT_FAIL);
+ }
+ if (bytes != sizeof(*msg)) {
+ pr_err("sent part of the message %zd/%zu", bytes, sizeof(*msg));
+ if (exit_of_fail)
+ exit(KSFT_FAIL);
+ }
+}
+
+static void read_msg(int fd, struct test_desc *msg, bool exit_of_fail)
+{
+ ssize_t bytes = read(fd, msg, sizeof(*msg));
+
+ if (bytes < 0) {
+ pr_err("read()");
+ if (exit_of_fail)
+ exit(KSFT_FAIL);
+ }
+ if (bytes != sizeof(*msg)) {
+ pr_err("got incomplete message %zd/%zu", bytes, sizeof(*msg));
+ if (exit_of_fail)
+ exit(KSFT_FAIL);
+ }
+}
+
+static int udp_ping_init(struct in_addr listen_ip, unsigned int u_timeout,
+ unsigned int *server_port, int sock[2])
+{
+ struct sockaddr_in server;
+ struct timeval t = { .tv_sec = 0, .tv_usec = u_timeout };
+ socklen_t s_len = sizeof(server);
+
+ sock[0] = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock[0] < 0) {
+ pr_err("socket()");
+ return -1;
+ }
+
+ server.sin_family = AF_INET;
+ server.sin_port = 0;
+ memcpy(&server.sin_addr.s_addr, &listen_ip, sizeof(struct in_addr));
+
+ if (bind(sock[0], (struct sockaddr *)&server, s_len)) {
+ pr_err("bind()");
+ goto err_close_server;
+ }
+
+ if (getsockname(sock[0], (struct sockaddr *)&server, &s_len)) {
+ pr_err("getsockname()");
+ goto err_close_server;
+ }
+
+ *server_port = ntohs(server.sin_port);
+
+ if (setsockopt(sock[0], SOL_SOCKET, SO_RCVTIMEO, (const char *)&t, sizeof t)) {
+ pr_err("setsockopt()");
+ goto err_close_server;
+ }
+
+ sock[1] = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock[1] < 0) {
+ pr_err("socket()");
+ goto err_close_server;
+ }
+
+ return 0;
+
+err_close_server:
+ close(sock[0]);
+ return -1;
+}
+
+static int udp_ping_send(int sock[2], in_addr_t dest_ip, unsigned int port,
+ char *buf, size_t buf_len)
+{
+ struct sockaddr_in server;
+ const struct sockaddr *dest_addr = (struct sockaddr *)&server;
+ char *sock_buf[buf_len];
+ ssize_t r_bytes, s_bytes;
+
+ server.sin_family = AF_INET;
+ server.sin_port = htons(port);
+ server.sin_addr.s_addr = dest_ip;
+
+ s_bytes = sendto(sock[1], buf, buf_len, 0, dest_addr, sizeof(server));
+ if (s_bytes < 0) {
+ pr_err("sendto()");
+ return -1;
+ } else if (s_bytes != buf_len) {
+ printk("send part of the message: %zd/%zu", s_bytes, sizeof(server));
+ return -1;
+ }
+
+ r_bytes = recv(sock[0], sock_buf, buf_len, 0);
+ if (r_bytes < 0) {
+ if (errno != EAGAIN)
+ pr_err("recv()");
+ return -1;
+ } else if (r_bytes == 0) { /* EOF */
+ printk("EOF on reply to ping");
+ return -1;
+ } else if (r_bytes != buf_len || memcmp(buf, sock_buf, buf_len)) {
+ printk("ping reply packet is corrupted %zd/%zu", r_bytes, buf_len);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int udp_ping_reply(int sock[2], in_addr_t dest_ip, unsigned int port,
+ char *buf, size_t buf_len)
+{
+ struct sockaddr_in server;
+ const struct sockaddr *dest_addr = (struct sockaddr *)&server;
+ char *sock_buf[buf_len];
+ ssize_t r_bytes, s_bytes;
+
+ server.sin_family = AF_INET;
+ server.sin_port = htons(port);
+ server.sin_addr.s_addr = dest_ip;
+
+ r_bytes = recv(sock[0], sock_buf, buf_len, 0);
+ if (r_bytes < 0) {
+ if (errno != EAGAIN)
+ pr_err("recv()");
+ return -1;
+ }
+ if (r_bytes == 0) { /* EOF */
+ printk("EOF on reply to ping");
+ return -1;
+ }
+ if (r_bytes != buf_len || memcmp(buf, sock_buf, buf_len)) {
+ printk("ping reply packet is corrupted %zd/%zu", r_bytes, buf_len);
+ return -1;
+ }
+
+ s_bytes = sendto(sock[1], buf, buf_len, 0, dest_addr, sizeof(server));
+ if (s_bytes < 0) {
+ pr_err("sendto()");
+ return -1;
+ } else if (s_bytes != buf_len) {
+ printk("send part of the message: %zd/%zu", s_bytes, sizeof(server));
+ return -1;
+ }
+
+ return 0;
+}
+
+typedef int (*ping_f)(int sock[2], in_addr_t dest_ip, unsigned int port,
+ char *buf, size_t buf_len);
+static int do_ping(int cmd_fd, char *buf, size_t buf_len, struct in_addr from,
+ bool init_side, int d_port, in_addr_t to, ping_f func)
+{
+ struct test_desc msg;
+ unsigned int s_port, i, ping_succeeded = 0;
+ int ping_sock[2];
+ char to_str[IPV4_STR_SZ] = {}, from_str[IPV4_STR_SZ] = {};
+
+ if (udp_ping_init(from, ping_timeout, &s_port, ping_sock)) {
+ printk("Failed to init ping");
+ return -1;
+ }
+
+ memset(&msg, 0, sizeof(msg));
+ msg.type = MSG_PING;
+ msg.body.ping.port = s_port;
+ memcpy(&msg.body.ping.reply_ip, &from, sizeof(from));
+
+ write_msg(cmd_fd, &msg, 0);
+ if (init_side) {
+ /* The other end sends ip to ping */
+ read_msg(cmd_fd, &msg, 0);
+ if (msg.type != MSG_PING)
+ return -1;
+ to = msg.body.ping.reply_ip;
+ d_port = msg.body.ping.port;
+ }
+
+ for (i = 0; i < ping_count ; i++) {
+ struct timespec sleep_time = {
+ .tv_sec = 0,
+ .tv_nsec = ping_delay_nsec,
+ };
+
+ ping_succeeded += !func(ping_sock, to, d_port, buf, page_size);
+ nanosleep(&sleep_time, 0);
+ }
+
+ close(ping_sock[0]);
+ close(ping_sock[1]);
+
+ strncpy(to_str, inet_ntoa(*(struct in_addr *)&to), IPV4_STR_SZ - 1);
+ strncpy(from_str, inet_ntoa(from), IPV4_STR_SZ - 1);
+
+ if (ping_succeeded < ping_success) {
+ printk("ping (%s) %s->%s failed %u/%u times",
+ init_side ? "send" : "reply", from_str, to_str,
+ ping_count - ping_succeeded, ping_count);
+ return -1;
+ }
+
+#ifdef DEBUG
+ printk("ping (%s) %s->%s succeeded %u/%u times",
+ init_side ? "send" : "reply", from_str, to_str,
+ ping_succeeded, ping_count);
+#endif
+
+ return 0;
+}
+
+static int xfrm_fill_key(char *name, char *buf,
+ size_t buf_len, unsigned int *key_len)
+{
+ /* TODO: use set/map instead */
+ if (strncmp(name, "digest_null", ALGO_LEN) == 0)
+ *key_len = 0;
+ else if (strncmp(name, "ecb(cipher_null)", ALGO_LEN) == 0)
+ *key_len = 0;
+ else if (strncmp(name, "cbc(des)", ALGO_LEN) == 0)
+ *key_len = 64;
+ else if (strncmp(name, "hmac(md5)", ALGO_LEN) == 0)
+ *key_len = 128;
+ else if (strncmp(name, "cmac(aes)", ALGO_LEN) == 0)
+ *key_len = 128;
+ else if (strncmp(name, "xcbc(aes)", ALGO_LEN) == 0)
+ *key_len = 128;
+ else if (strncmp(name, "cbc(cast5)", ALGO_LEN) == 0)
+ *key_len = 128;
+ else if (strncmp(name, "cbc(serpent)", ALGO_LEN) == 0)
+ *key_len = 128;
+ else if (strncmp(name, "hmac(sha1)", ALGO_LEN) == 0)
+ *key_len = 160;
+ else if (strncmp(name, "hmac(rmd160)", ALGO_LEN) == 0)
+ *key_len = 160;
+ else if (strncmp(name, "cbc(des3_ede)", ALGO_LEN) == 0)
+ *key_len = 192;
+ else if (strncmp(name, "hmac(sha256)", ALGO_LEN) == 0)
+ *key_len = 256;
+ else if (strncmp(name, "cbc(aes)", ALGO_LEN) == 0)
+ *key_len = 256;
+ else if (strncmp(name, "cbc(camellia)", ALGO_LEN) == 0)
+ *key_len = 256;
+ else if (strncmp(name, "cbc(twofish)", ALGO_LEN) == 0)
+ *key_len = 256;
+ else if (strncmp(name, "rfc3686(ctr(aes))", ALGO_LEN) == 0)
+ *key_len = 288;
+ else if (strncmp(name, "hmac(sha384)", ALGO_LEN) == 0)
+ *key_len = 384;
+ else if (strncmp(name, "cbc(blowfish)", ALGO_LEN) == 0)
+ *key_len = 448;
+ else if (strncmp(name, "hmac(sha512)", ALGO_LEN) == 0)
+ *key_len = 512;
+ else if (strncmp(name, "rfc4106(gcm(aes))-128", ALGO_LEN) == 0)
+ *key_len = 160;
+ else if (strncmp(name, "rfc4543(gcm(aes))-128", ALGO_LEN) == 0)
+ *key_len = 160;
+ else if (strncmp(name, "rfc4309(ccm(aes))-128", ALGO_LEN) == 0)
+ *key_len = 152;
+ else if (strncmp(name, "rfc4106(gcm(aes))-192", ALGO_LEN) == 0)
+ *key_len = 224;
+ else if (strncmp(name, "rfc4543(gcm(aes))-192", ALGO_LEN) == 0)
+ *key_len = 224;
+ else if (strncmp(name, "rfc4309(ccm(aes))-192", ALGO_LEN) == 0)
+ *key_len = 216;
+ else if (strncmp(name, "rfc4106(gcm(aes))-256", ALGO_LEN) == 0)
+ *key_len = 288;
+ else if (strncmp(name, "rfc4543(gcm(aes))-256", ALGO_LEN) == 0)
+ *key_len = 288;
+ else if (strncmp(name, "rfc4309(ccm(aes))-256", ALGO_LEN) == 0)
+ *key_len = 280;
+ else if (strncmp(name, "rfc7539(chacha20,poly1305)-128", ALGO_LEN) == 0)
+ *key_len = 0;
+
+ if (*key_len > buf_len) {
+ printk("Can't pack a key - too big for buffer");
+ return -1;
+ }
+
+ randomize_buffer(buf, *key_len);
+
+ return 0;
+}
+
+static int xfrm_state_pack_algo(struct nlmsghdr *nh, size_t req_sz,
+ struct xfrm_desc *desc)
+{
+ struct {
+ union {
+ struct xfrm_algo alg;
+ struct xfrm_algo_aead aead;
+ struct xfrm_algo_auth auth;
+ } u;
+ char buf[XFRM_ALGO_KEY_BUF_SIZE];
+ } alg = {};
+ size_t alen, elen, clen, aelen;
+ unsigned short type;
+
+ alen = strlen(desc->a_algo);
+ elen = strlen(desc->e_algo);
+ clen = strlen(desc->c_algo);
+ aelen = strlen(desc->ae_algo);
+
+ /* Verify desc */
+ switch (desc->proto) {
+ case IPPROTO_AH:
+ if (!alen || elen || clen || aelen) {
+ printk("BUG: buggy ah desc");
+ return -1;
+ }
+ strncpy(alg.u.alg.alg_name, desc->a_algo, ALGO_LEN - 1);
+ if (xfrm_fill_key(desc->a_algo, alg.u.alg.alg_key,
+ sizeof(alg.buf), &alg.u.alg.alg_key_len))
+ return -1;
+ type = XFRMA_ALG_AUTH;
+ break;
+ case IPPROTO_COMP:
+ if (!clen || elen || alen || aelen) {
+ printk("BUG: buggy comp desc");
+ return -1;
+ }
+ strncpy(alg.u.alg.alg_name, desc->c_algo, ALGO_LEN - 1);
+ if (xfrm_fill_key(desc->c_algo, alg.u.alg.alg_key,
+ sizeof(alg.buf), &alg.u.alg.alg_key_len))
+ return -1;
+ type = XFRMA_ALG_COMP;
+ break;
+ case IPPROTO_ESP:
+ if (!((alen && elen) ^ aelen) || clen) {
+ printk("BUG: buggy esp desc");
+ return -1;
+ }
+ if (aelen) {
+ alg.u.aead.alg_icv_len = desc->icv_len;
+ strncpy(alg.u.aead.alg_name, desc->ae_algo, ALGO_LEN - 1);
+ if (xfrm_fill_key(desc->ae_algo, alg.u.aead.alg_key,
+ sizeof(alg.buf), &alg.u.aead.alg_key_len))
+ return -1;
+ type = XFRMA_ALG_AEAD;
+ } else {
+
+ strncpy(alg.u.alg.alg_name, desc->e_algo, ALGO_LEN - 1);
+ type = XFRMA_ALG_CRYPT;
+ if (xfrm_fill_key(desc->e_algo, alg.u.alg.alg_key,
+ sizeof(alg.buf), &alg.u.alg.alg_key_len))
+ return -1;
+ if (rtattr_pack(nh, req_sz, type, &alg, sizeof(alg)))
+ return -1;
+
+ strncpy(alg.u.alg.alg_name, desc->a_algo, ALGO_LEN);
+ type = XFRMA_ALG_AUTH;
+ if (xfrm_fill_key(desc->a_algo, alg.u.alg.alg_key,
+ sizeof(alg.buf), &alg.u.alg.alg_key_len))
+ return -1;
+ }
+ break;
+ default:
+ printk("BUG: unknown proto in desc");
+ return -1;
+ }
+
+ if (rtattr_pack(nh, req_sz, type, &alg, sizeof(alg)))
+ return -1;
+
+ return 0;
+}
+
+static inline uint32_t gen_spi(struct in_addr src)
+{
+ return htonl(inet_lnaof(src));
+}
+
+static int xfrm_state_add(int xfrm_sock, uint32_t seq, uint32_t spi,
+ struct in_addr src, struct in_addr dst,
+ struct xfrm_desc *desc)
+{
+ struct {
+ struct nlmsghdr nh;
+ struct xfrm_usersa_info info;
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.info));
+ req.nh.nlmsg_type = XFRM_MSG_NEWSA;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_seq = seq;
+
+ /* Fill selector. */
+ memcpy(&req.info.sel.daddr, &dst, sizeof(dst));
+ memcpy(&req.info.sel.saddr, &src, sizeof(src));
+ req.info.sel.family = AF_INET;
+ req.info.sel.prefixlen_d = PREFIX_LEN;
+ req.info.sel.prefixlen_s = PREFIX_LEN;
+
+ /* Fill id */
+ memcpy(&req.info.id.daddr, &dst, sizeof(dst));
+ /* Note: zero-spi cannot be deleted */
+ req.info.id.spi = spi;
+ req.info.id.proto = desc->proto;
+
+ memcpy(&req.info.saddr, &src, sizeof(src));
+
+ /* Fill lifteme_cfg */
+ req.info.lft.soft_byte_limit = XFRM_INF;
+ req.info.lft.hard_byte_limit = XFRM_INF;
+ req.info.lft.soft_packet_limit = XFRM_INF;
+ req.info.lft.hard_packet_limit = XFRM_INF;
+
+ req.info.family = AF_INET;
+ req.info.mode = XFRM_MODE_TUNNEL;
+
+ if (xfrm_state_pack_algo(&req.nh, sizeof(req), desc))
+ return -1;
+
+ if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return -1;
+ }
+
+ return netlink_check_answer(xfrm_sock);
+}
+
+static bool xfrm_usersa_found(struct xfrm_usersa_info *info, uint32_t spi,
+ struct in_addr src, struct in_addr dst,
+ struct xfrm_desc *desc)
+{
+ if (memcmp(&info->sel.daddr, &dst, sizeof(dst)))
+ return false;
+
+ if (memcmp(&info->sel.saddr, &src, sizeof(src)))
+ return false;
+
+ if (info->sel.family != AF_INET ||
+ info->sel.prefixlen_d != PREFIX_LEN ||
+ info->sel.prefixlen_s != PREFIX_LEN)
+ return false;
+
+ if (info->id.spi != spi || info->id.proto != desc->proto)
+ return false;
+
+ if (memcmp(&info->id.daddr, &dst, sizeof(dst)))
+ return false;
+
+ if (memcmp(&info->saddr, &src, sizeof(src)))
+ return false;
+
+ if (info->lft.soft_byte_limit != XFRM_INF ||
+ info->lft.hard_byte_limit != XFRM_INF ||
+ info->lft.soft_packet_limit != XFRM_INF ||
+ info->lft.hard_packet_limit != XFRM_INF)
+ return false;
+
+ if (info->family != AF_INET || info->mode != XFRM_MODE_TUNNEL)
+ return false;
+
+ /* XXX: check xfrm algo, see xfrm_state_pack_algo(). */
+
+ return true;
+}
+
+static int xfrm_state_check(int xfrm_sock, uint32_t seq, uint32_t spi,
+ struct in_addr src, struct in_addr dst,
+ struct xfrm_desc *desc)
+{
+ struct {
+ struct nlmsghdr nh;
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+ struct {
+ struct nlmsghdr nh;
+ union {
+ struct xfrm_usersa_info info;
+ int error;
+ };
+ char attrbuf[MAX_PAYLOAD];
+ } answer;
+ struct xfrm_address_filter filter = {};
+ bool found = false;
+
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(0);
+ req.nh.nlmsg_type = XFRM_MSG_GETSA;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.nh.nlmsg_seq = seq;
+
+ /*
+ * Add dump filter by source address as there may be other tunnels
+ * in this netns (if tests run in parallel).
+ */
+ filter.family = AF_INET;
+ filter.splen = 0x1f; /* 0xffffffff mask see addr_match() */
+ memcpy(&filter.saddr, &src, sizeof(src));
+ if (rtattr_pack(&req.nh, sizeof(req), XFRMA_ADDRESS_FILTER,
+ &filter, sizeof(filter)))
+ return -1;
+
+ if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return -1;
+ }
+
+ while (1) {
+ if (recv(xfrm_sock, &answer, sizeof(answer), 0) < 0) {
+ pr_err("recv()");
+ return -1;
+ }
+ if (answer.nh.nlmsg_type == NLMSG_ERROR) {
+ printk("NLMSG_ERROR: %d: %s",
+ answer.error, strerror(-answer.error));
+ return -1;
+ } else if (answer.nh.nlmsg_type == NLMSG_DONE) {
+ if (found)
+ return 0;
+ printk("didn't find allocated xfrm state in dump");
+ return -1;
+ } else if (answer.nh.nlmsg_type == XFRM_MSG_NEWSA) {
+ if (xfrm_usersa_found(&answer.info, spi, src, dst, desc))
+ found = true;
+ }
+ }
+}
+
+static int xfrm_set(int xfrm_sock, uint32_t *seq,
+ struct in_addr src, struct in_addr dst,
+ struct in_addr tunsrc, struct in_addr tundst,
+ struct xfrm_desc *desc)
+{
+ int err;
+
+ err = xfrm_state_add(xfrm_sock, (*seq)++, gen_spi(src), src, dst, desc);
+ if (err) {
+ printk("Failed to add xfrm state");
+ return -1;
+ }
+
+ err = xfrm_state_add(xfrm_sock, (*seq)++, gen_spi(src), dst, src, desc);
+ if (err) {
+ printk("Failed to add xfrm state");
+ return -1;
+ }
+
+ /* Check dumps for XFRM_MSG_GETSA */
+ err = xfrm_state_check(xfrm_sock, (*seq)++, gen_spi(src), src, dst, desc);
+ err |= xfrm_state_check(xfrm_sock, (*seq)++, gen_spi(src), dst, src, desc);
+ if (err) {
+ printk("Failed to check xfrm state");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int xfrm_policy_add(int xfrm_sock, uint32_t seq, uint32_t spi,
+ struct in_addr src, struct in_addr dst, uint8_t dir,
+ struct in_addr tunsrc, struct in_addr tundst, uint8_t proto)
+{
+ struct {
+ struct nlmsghdr nh;
+ struct xfrm_userpolicy_info info;
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+ struct xfrm_user_tmpl tmpl;
+
+ memset(&req, 0, sizeof(req));
+ memset(&tmpl, 0, sizeof(tmpl));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.info));
+ req.nh.nlmsg_type = XFRM_MSG_NEWPOLICY;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_seq = seq;
+
+ /* Fill selector. */
+ memcpy(&req.info.sel.daddr, &dst, sizeof(tundst));
+ memcpy(&req.info.sel.saddr, &src, sizeof(tunsrc));
+ req.info.sel.family = AF_INET;
+ req.info.sel.prefixlen_d = PREFIX_LEN;
+ req.info.sel.prefixlen_s = PREFIX_LEN;
+
+ /* Fill lifteme_cfg */
+ req.info.lft.soft_byte_limit = XFRM_INF;
+ req.info.lft.hard_byte_limit = XFRM_INF;
+ req.info.lft.soft_packet_limit = XFRM_INF;
+ req.info.lft.hard_packet_limit = XFRM_INF;
+
+ req.info.dir = dir;
+
+ /* Fill tmpl */
+ memcpy(&tmpl.id.daddr, &dst, sizeof(dst));
+ /* Note: zero-spi cannot be deleted */
+ tmpl.id.spi = spi;
+ tmpl.id.proto = proto;
+ tmpl.family = AF_INET;
+ memcpy(&tmpl.saddr, &src, sizeof(src));
+ tmpl.mode = XFRM_MODE_TUNNEL;
+ tmpl.aalgos = (~(uint32_t)0);
+ tmpl.ealgos = (~(uint32_t)0);
+ tmpl.calgos = (~(uint32_t)0);
+
+ if (rtattr_pack(&req.nh, sizeof(req), XFRMA_TMPL, &tmpl, sizeof(tmpl)))
+ return -1;
+
+ if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return -1;
+ }
+
+ return netlink_check_answer(xfrm_sock);
+}
+
+static int xfrm_prepare(int xfrm_sock, uint32_t *seq,
+ struct in_addr src, struct in_addr dst,
+ struct in_addr tunsrc, struct in_addr tundst, uint8_t proto)
+{
+ if (xfrm_policy_add(xfrm_sock, (*seq)++, gen_spi(src), src, dst,
+ XFRM_POLICY_OUT, tunsrc, tundst, proto)) {
+ printk("Failed to add xfrm policy");
+ return -1;
+ }
+
+ if (xfrm_policy_add(xfrm_sock, (*seq)++, gen_spi(src), dst, src,
+ XFRM_POLICY_IN, tunsrc, tundst, proto)) {
+ printk("Failed to add xfrm policy");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int xfrm_policy_del(int xfrm_sock, uint32_t seq,
+ struct in_addr src, struct in_addr dst, uint8_t dir,
+ struct in_addr tunsrc, struct in_addr tundst)
+{
+ struct {
+ struct nlmsghdr nh;
+ struct xfrm_userpolicy_id id;
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.id));
+ req.nh.nlmsg_type = XFRM_MSG_DELPOLICY;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_seq = seq;
+
+ /* Fill id */
+ memcpy(&req.id.sel.daddr, &dst, sizeof(tundst));
+ memcpy(&req.id.sel.saddr, &src, sizeof(tunsrc));
+ req.id.sel.family = AF_INET;
+ req.id.sel.prefixlen_d = PREFIX_LEN;
+ req.id.sel.prefixlen_s = PREFIX_LEN;
+ req.id.dir = dir;
+
+ if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return -1;
+ }
+
+ return netlink_check_answer(xfrm_sock);
+}
+
+static int xfrm_cleanup(int xfrm_sock, uint32_t *seq,
+ struct in_addr src, struct in_addr dst,
+ struct in_addr tunsrc, struct in_addr tundst)
+{
+ if (xfrm_policy_del(xfrm_sock, (*seq)++, src, dst,
+ XFRM_POLICY_OUT, tunsrc, tundst)) {
+ printk("Failed to add xfrm policy");
+ return -1;
+ }
+
+ if (xfrm_policy_del(xfrm_sock, (*seq)++, dst, src,
+ XFRM_POLICY_IN, tunsrc, tundst)) {
+ printk("Failed to add xfrm policy");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int xfrm_state_del(int xfrm_sock, uint32_t seq, uint32_t spi,
+ struct in_addr src, struct in_addr dst, uint8_t proto)
+{
+ struct {
+ struct nlmsghdr nh;
+ struct xfrm_usersa_id id;
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+ xfrm_address_t saddr = {};
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.id));
+ req.nh.nlmsg_type = XFRM_MSG_DELSA;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_seq = seq;
+
+ memcpy(&req.id.daddr, &dst, sizeof(dst));
+ req.id.family = AF_INET;
+ req.id.proto = proto;
+ /* Note: zero-spi cannot be deleted */
+ req.id.spi = spi;
+
+ memcpy(&saddr, &src, sizeof(src));
+ if (rtattr_pack(&req.nh, sizeof(req), XFRMA_SRCADDR, &saddr, sizeof(saddr)))
+ return -1;
+
+ if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return -1;
+ }
+
+ return netlink_check_answer(xfrm_sock);
+}
+
+static int xfrm_delete(int xfrm_sock, uint32_t *seq,
+ struct in_addr src, struct in_addr dst,
+ struct in_addr tunsrc, struct in_addr tundst, uint8_t proto)
+{
+ if (xfrm_state_del(xfrm_sock, (*seq)++, gen_spi(src), src, dst, proto)) {
+ printk("Failed to remove xfrm state");
+ return -1;
+ }
+
+ if (xfrm_state_del(xfrm_sock, (*seq)++, gen_spi(src), dst, src, proto)) {
+ printk("Failed to remove xfrm state");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int xfrm_state_allocspi(int xfrm_sock, uint32_t *seq,
+ uint32_t spi, uint8_t proto)
+{
+ struct {
+ struct nlmsghdr nh;
+ struct xfrm_userspi_info spi;
+ } req;
+ struct {
+ struct nlmsghdr nh;
+ union {
+ struct xfrm_usersa_info info;
+ int error;
+ };
+ } answer;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.spi));
+ req.nh.nlmsg_type = XFRM_MSG_ALLOCSPI;
+ req.nh.nlmsg_flags = NLM_F_REQUEST;
+ req.nh.nlmsg_seq = (*seq)++;
+
+ req.spi.info.family = AF_INET;
+ req.spi.min = spi;
+ req.spi.max = spi;
+ req.spi.info.id.proto = proto;
+
+ if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return KSFT_FAIL;
+ }
+
+ if (recv(xfrm_sock, &answer, sizeof(answer), 0) < 0) {
+ pr_err("recv()");
+ return KSFT_FAIL;
+ } else if (answer.nh.nlmsg_type == XFRM_MSG_NEWSA) {
+ uint32_t new_spi = htonl(answer.info.id.spi);
+
+ if (new_spi != spi) {
+ printk("allocated spi is different from requested: %#x != %#x",
+ new_spi, spi);
+ return KSFT_FAIL;
+ }
+ return KSFT_PASS;
+ } else if (answer.nh.nlmsg_type != NLMSG_ERROR) {
+ printk("expected NLMSG_ERROR, got %d", (int)answer.nh.nlmsg_type);
+ return KSFT_FAIL;
+ }
+
+ printk("NLMSG_ERROR: %d: %s", answer.error, strerror(-answer.error));
+ return (answer.error) ? KSFT_FAIL : KSFT_PASS;
+}
+
+static int netlink_sock_bind(int *sock, uint32_t *seq, int proto, uint32_t groups)
+{
+ struct sockaddr_nl snl = {};
+ socklen_t addr_len;
+ int ret = -1;
+
+ snl.nl_family = AF_NETLINK;
+ snl.nl_groups = groups;
+
+ if (netlink_sock(sock, seq, proto)) {
+ printk("Failed to open xfrm netlink socket");
+ return -1;
+ }
+
+ if (bind(*sock, (struct sockaddr *)&snl, sizeof(snl)) < 0) {
+ pr_err("bind()");
+ goto out_close;
+ }
+
+ addr_len = sizeof(snl);
+ if (getsockname(*sock, (struct sockaddr *)&snl, &addr_len) < 0) {
+ pr_err("getsockname()");
+ goto out_close;
+ }
+ if (addr_len != sizeof(snl)) {
+ printk("Wrong address length %d", addr_len);
+ goto out_close;
+ }
+ if (snl.nl_family != AF_NETLINK) {
+ printk("Wrong address family %d", snl.nl_family);
+ goto out_close;
+ }
+ return 0;
+
+out_close:
+ close(*sock);
+ return ret;
+}
+
+static int xfrm_monitor_acquire(int xfrm_sock, uint32_t *seq, unsigned int nr)
+{
+ struct {
+ struct nlmsghdr nh;
+ union {
+ struct xfrm_user_acquire acq;
+ int error;
+ };
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+ struct xfrm_user_tmpl xfrm_tmpl = {};
+ int xfrm_listen = -1, ret = KSFT_FAIL;
+ uint32_t seq_listen;
+
+ if (netlink_sock_bind(&xfrm_listen, &seq_listen, NETLINK_XFRM, XFRMNLGRP_ACQUIRE))
+ return KSFT_FAIL;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.acq));
+ req.nh.nlmsg_type = XFRM_MSG_ACQUIRE;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_seq = (*seq)++;
+
+ req.acq.policy.sel.family = AF_INET;
+ req.acq.aalgos = 0xfeed;
+ req.acq.ealgos = 0xbaad;
+ req.acq.calgos = 0xbabe;
+
+ xfrm_tmpl.family = AF_INET;
+ xfrm_tmpl.id.proto = IPPROTO_ESP;
+ if (rtattr_pack(&req.nh, sizeof(req), XFRMA_TMPL, &xfrm_tmpl, sizeof(xfrm_tmpl)))
+ goto out_close;
+
+ if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ goto out_close;
+ }
+
+ if (recv(xfrm_sock, &req, sizeof(req), 0) < 0) {
+ pr_err("recv()");
+ goto out_close;
+ } else if (req.nh.nlmsg_type != NLMSG_ERROR) {
+ printk("expected NLMSG_ERROR, got %d", (int)req.nh.nlmsg_type);
+ goto out_close;
+ }
+
+ if (req.error) {
+ printk("NLMSG_ERROR: %d: %s", req.error, strerror(-req.error));
+ ret = req.error;
+ goto out_close;
+ }
+
+ if (recv(xfrm_listen, &req, sizeof(req), 0) < 0) {
+ pr_err("recv()");
+ goto out_close;
+ }
+
+ if (req.acq.aalgos != 0xfeed || req.acq.ealgos != 0xbaad
+ || req.acq.calgos != 0xbabe) {
+ printk("xfrm_user_acquire has changed %x %x %x",
+ req.acq.aalgos, req.acq.ealgos, req.acq.calgos);
+ goto out_close;
+ }
+
+ ret = KSFT_PASS;
+out_close:
+ close(xfrm_listen);
+ return ret;
+}
+
+static int xfrm_expire_state(int xfrm_sock, uint32_t *seq,
+ unsigned int nr, struct xfrm_desc *desc)
+{
+ struct {
+ struct nlmsghdr nh;
+ union {
+ struct xfrm_user_expire expire;
+ int error;
+ };
+ } req;
+ struct in_addr src, dst;
+ int xfrm_listen = -1, ret = KSFT_FAIL;
+ uint32_t seq_listen;
+
+ src = inet_makeaddr(INADDR_B, child_ip(nr));
+ dst = inet_makeaddr(INADDR_B, grchild_ip(nr));
+
+ if (xfrm_state_add(xfrm_sock, (*seq)++, gen_spi(src), src, dst, desc)) {
+ printk("Failed to add xfrm state");
+ return KSFT_FAIL;
+ }
+
+ if (netlink_sock_bind(&xfrm_listen, &seq_listen, NETLINK_XFRM, XFRMNLGRP_EXPIRE))
+ return KSFT_FAIL;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.expire));
+ req.nh.nlmsg_type = XFRM_MSG_EXPIRE;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_seq = (*seq)++;
+
+ memcpy(&req.expire.state.id.daddr, &dst, sizeof(dst));
+ req.expire.state.id.spi = gen_spi(src);
+ req.expire.state.id.proto = desc->proto;
+ req.expire.state.family = AF_INET;
+ req.expire.hard = 0xff;
+
+ if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ goto out_close;
+ }
+
+ if (recv(xfrm_sock, &req, sizeof(req), 0) < 0) {
+ pr_err("recv()");
+ goto out_close;
+ } else if (req.nh.nlmsg_type != NLMSG_ERROR) {
+ printk("expected NLMSG_ERROR, got %d", (int)req.nh.nlmsg_type);
+ goto out_close;
+ }
+
+ if (req.error) {
+ printk("NLMSG_ERROR: %d: %s", req.error, strerror(-req.error));
+ ret = req.error;
+ goto out_close;
+ }
+
+ if (recv(xfrm_listen, &req, sizeof(req), 0) < 0) {
+ pr_err("recv()");
+ goto out_close;
+ }
+
+ if (req.expire.hard != 0x1) {
+ printk("expire.hard is not set: %x", req.expire.hard);
+ goto out_close;
+ }
+
+ ret = KSFT_PASS;
+out_close:
+ close(xfrm_listen);
+ return ret;
+}
+
+static int xfrm_expire_policy(int xfrm_sock, uint32_t *seq,
+ unsigned int nr, struct xfrm_desc *desc)
+{
+ struct {
+ struct nlmsghdr nh;
+ union {
+ struct xfrm_user_polexpire expire;
+ int error;
+ };
+ } req;
+ struct in_addr src, dst, tunsrc, tundst;
+ int xfrm_listen = -1, ret = KSFT_FAIL;
+ uint32_t seq_listen;
+
+ src = inet_makeaddr(INADDR_B, child_ip(nr));
+ dst = inet_makeaddr(INADDR_B, grchild_ip(nr));
+ tunsrc = inet_makeaddr(INADDR_A, child_ip(nr));
+ tundst = inet_makeaddr(INADDR_A, grchild_ip(nr));
+
+ if (xfrm_policy_add(xfrm_sock, (*seq)++, gen_spi(src), src, dst,
+ XFRM_POLICY_OUT, tunsrc, tundst, desc->proto)) {
+ printk("Failed to add xfrm policy");
+ return KSFT_FAIL;
+ }
+
+ if (netlink_sock_bind(&xfrm_listen, &seq_listen, NETLINK_XFRM, XFRMNLGRP_EXPIRE))
+ return KSFT_FAIL;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.expire));
+ req.nh.nlmsg_type = XFRM_MSG_POLEXPIRE;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_seq = (*seq)++;
+
+ /* Fill selector. */
+ memcpy(&req.expire.pol.sel.daddr, &dst, sizeof(tundst));
+ memcpy(&req.expire.pol.sel.saddr, &src, sizeof(tunsrc));
+ req.expire.pol.sel.family = AF_INET;
+ req.expire.pol.sel.prefixlen_d = PREFIX_LEN;
+ req.expire.pol.sel.prefixlen_s = PREFIX_LEN;
+ req.expire.pol.dir = XFRM_POLICY_OUT;
+ req.expire.hard = 0xff;
+
+ if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ goto out_close;
+ }
+
+ if (recv(xfrm_sock, &req, sizeof(req), 0) < 0) {
+ pr_err("recv()");
+ goto out_close;
+ } else if (req.nh.nlmsg_type != NLMSG_ERROR) {
+ printk("expected NLMSG_ERROR, got %d", (int)req.nh.nlmsg_type);
+ goto out_close;
+ }
+
+ if (req.error) {
+ printk("NLMSG_ERROR: %d: %s", req.error, strerror(-req.error));
+ ret = req.error;
+ goto out_close;
+ }
+
+ if (recv(xfrm_listen, &req, sizeof(req), 0) < 0) {
+ pr_err("recv()");
+ goto out_close;
+ }
+
+ if (req.expire.hard != 0x1) {
+ printk("expire.hard is not set: %x", req.expire.hard);
+ goto out_close;
+ }
+
+ ret = KSFT_PASS;
+out_close:
+ close(xfrm_listen);
+ return ret;
+}
+
+static int child_serv(int xfrm_sock, uint32_t *seq,
+ unsigned int nr, int cmd_fd, void *buf, struct xfrm_desc *desc)
+{
+ struct in_addr src, dst, tunsrc, tundst;
+ struct test_desc msg;
+ int ret = KSFT_FAIL;
+
+ src = inet_makeaddr(INADDR_B, child_ip(nr));
+ dst = inet_makeaddr(INADDR_B, grchild_ip(nr));
+ tunsrc = inet_makeaddr(INADDR_A, child_ip(nr));
+ tundst = inet_makeaddr(INADDR_A, grchild_ip(nr));
+
+ /* UDP pinging without xfrm */
+ if (do_ping(cmd_fd, buf, page_size, src, true, 0, 0, udp_ping_send)) {
+ printk("ping failed before setting xfrm");
+ return KSFT_FAIL;
+ }
+
+ memset(&msg, 0, sizeof(msg));
+ msg.type = MSG_XFRM_PREPARE;
+ memcpy(&msg.body.xfrm_desc, desc, sizeof(*desc));
+ write_msg(cmd_fd, &msg, 1);
+
+ if (xfrm_prepare(xfrm_sock, seq, src, dst, tunsrc, tundst, desc->proto)) {
+ printk("failed to prepare xfrm");
+ goto cleanup;
+ }
+
+ memset(&msg, 0, sizeof(msg));
+ msg.type = MSG_XFRM_ADD;
+ memcpy(&msg.body.xfrm_desc, desc, sizeof(*desc));
+ write_msg(cmd_fd, &msg, 1);
+ if (xfrm_set(xfrm_sock, seq, src, dst, tunsrc, tundst, desc)) {
+ printk("failed to set xfrm");
+ goto delete;
+ }
+
+ /* UDP pinging with xfrm tunnel */
+ if (do_ping(cmd_fd, buf, page_size, tunsrc,
+ true, 0, 0, udp_ping_send)) {
+ printk("ping failed for xfrm");
+ goto delete;
+ }
+
+ ret = KSFT_PASS;
+delete:
+ /* xfrm delete */
+ memset(&msg, 0, sizeof(msg));
+ msg.type = MSG_XFRM_DEL;
+ memcpy(&msg.body.xfrm_desc, desc, sizeof(*desc));
+ write_msg(cmd_fd, &msg, 1);
+
+ if (xfrm_delete(xfrm_sock, seq, src, dst, tunsrc, tundst, desc->proto)) {
+ printk("failed ping to remove xfrm");
+ ret = KSFT_FAIL;
+ }
+
+cleanup:
+ memset(&msg, 0, sizeof(msg));
+ msg.type = MSG_XFRM_CLEANUP;
+ memcpy(&msg.body.xfrm_desc, desc, sizeof(*desc));
+ write_msg(cmd_fd, &msg, 1);
+ if (xfrm_cleanup(xfrm_sock, seq, src, dst, tunsrc, tundst)) {
+ printk("failed ping to cleanup xfrm");
+ ret = KSFT_FAIL;
+ }
+ return ret;
+}
+
+static int child_f(unsigned int nr, int test_desc_fd, int cmd_fd, void *buf)
+{
+ struct xfrm_desc desc;
+ struct test_desc msg;
+ int xfrm_sock = -1;
+ uint32_t seq;
+
+ if (switch_ns(nsfd_childa))
+ exit(KSFT_FAIL);
+
+ if (netlink_sock(&xfrm_sock, &seq, NETLINK_XFRM)) {
+ printk("Failed to open xfrm netlink socket");
+ exit(KSFT_FAIL);
+ }
+
+ /* Check that seq sock is ready, just for sure. */
+ memset(&msg, 0, sizeof(msg));
+ msg.type = MSG_ACK;
+ write_msg(cmd_fd, &msg, 1);
+ read_msg(cmd_fd, &msg, 1);
+ if (msg.type != MSG_ACK) {
+ printk("Ack failed");
+ exit(KSFT_FAIL);
+ }
+
+ for (;;) {
+ ssize_t received = read(test_desc_fd, &desc, sizeof(desc));
+ int ret;
+
+ if (received == 0) /* EOF */
+ break;
+
+ if (received != sizeof(desc)) {
+ pr_err("read() returned %zd", received);
+ exit(KSFT_FAIL);
+ }
+
+ switch (desc.type) {
+ case CREATE_TUNNEL:
+ ret = child_serv(xfrm_sock, &seq, nr,
+ cmd_fd, buf, &desc);
+ break;
+ case ALLOCATE_SPI:
+ ret = xfrm_state_allocspi(xfrm_sock, &seq,
+ -1, desc.proto);
+ break;
+ case MONITOR_ACQUIRE:
+ ret = xfrm_monitor_acquire(xfrm_sock, &seq, nr);
+ break;
+ case EXPIRE_STATE:
+ ret = xfrm_expire_state(xfrm_sock, &seq, nr, &desc);
+ break;
+ case EXPIRE_POLICY:
+ ret = xfrm_expire_policy(xfrm_sock, &seq, nr, &desc);
+ break;
+ default:
+ printk("Unknown desc type %d", desc.type);
+ exit(KSFT_FAIL);
+ }
+ write_test_result(ret, &desc);
+ }
+
+ close(xfrm_sock);
+
+ msg.type = MSG_EXIT;
+ write_msg(cmd_fd, &msg, 1);
+ exit(KSFT_PASS);
+}
+
+static void grand_child_serv(unsigned int nr, int cmd_fd, void *buf,
+ struct test_desc *msg, int xfrm_sock, uint32_t *seq)
+{
+ struct in_addr src, dst, tunsrc, tundst;
+ bool tun_reply;
+ struct xfrm_desc *desc = &msg->body.xfrm_desc;
+
+ src = inet_makeaddr(INADDR_B, grchild_ip(nr));
+ dst = inet_makeaddr(INADDR_B, child_ip(nr));
+ tunsrc = inet_makeaddr(INADDR_A, grchild_ip(nr));
+ tundst = inet_makeaddr(INADDR_A, child_ip(nr));
+
+ switch (msg->type) {
+ case MSG_EXIT:
+ exit(KSFT_PASS);
+ case MSG_ACK:
+ write_msg(cmd_fd, msg, 1);
+ break;
+ case MSG_PING:
+ tun_reply = memcmp(&dst, &msg->body.ping.reply_ip, sizeof(in_addr_t));
+ /* UDP pinging without xfrm */
+ if (do_ping(cmd_fd, buf, page_size, tun_reply ? tunsrc : src,
+ false, msg->body.ping.port,
+ msg->body.ping.reply_ip, udp_ping_reply)) {
+ printk("ping failed before setting xfrm");
+ }
+ break;
+ case MSG_XFRM_PREPARE:
+ if (xfrm_prepare(xfrm_sock, seq, src, dst, tunsrc, tundst,
+ desc->proto)) {
+ xfrm_cleanup(xfrm_sock, seq, src, dst, tunsrc, tundst);
+ printk("failed to prepare xfrm");
+ }
+ break;
+ case MSG_XFRM_ADD:
+ if (xfrm_set(xfrm_sock, seq, src, dst, tunsrc, tundst, desc)) {
+ xfrm_cleanup(xfrm_sock, seq, src, dst, tunsrc, tundst);
+ printk("failed to set xfrm");
+ }
+ break;
+ case MSG_XFRM_DEL:
+ if (xfrm_delete(xfrm_sock, seq, src, dst, tunsrc, tundst,
+ desc->proto)) {
+ xfrm_cleanup(xfrm_sock, seq, src, dst, tunsrc, tundst);
+ printk("failed to remove xfrm");
+ }
+ break;
+ case MSG_XFRM_CLEANUP:
+ if (xfrm_cleanup(xfrm_sock, seq, src, dst, tunsrc, tundst)) {
+ printk("failed to cleanup xfrm");
+ }
+ break;
+ default:
+ printk("got unknown msg type %d", msg->type);
+ };
+}
+
+static int grand_child_f(unsigned int nr, int cmd_fd, void *buf)
+{
+ struct test_desc msg;
+ int xfrm_sock = -1;
+ uint32_t seq;
+
+ if (switch_ns(nsfd_childb))
+ exit(KSFT_FAIL);
+
+ if (netlink_sock(&xfrm_sock, &seq, NETLINK_XFRM)) {
+ printk("Failed to open xfrm netlink socket");
+ exit(KSFT_FAIL);
+ }
+
+ do {
+ read_msg(cmd_fd, &msg, 1);
+ grand_child_serv(nr, cmd_fd, buf, &msg, xfrm_sock, &seq);
+ } while (1);
+
+ close(xfrm_sock);
+ exit(KSFT_FAIL);
+}
+
+static int start_child(unsigned int nr, char *veth, int test_desc_fd[2])
+{
+ int cmd_sock[2];
+ void *data_map;
+ pid_t child;
+
+ if (init_child(nsfd_childa, veth, child_ip(nr), grchild_ip(nr)))
+ return -1;
+
+ if (init_child(nsfd_childb, veth, grchild_ip(nr), child_ip(nr)))
+ return -1;
+
+ child = fork();
+ if (child < 0) {
+ pr_err("fork()");
+ return -1;
+ } else if (child) {
+ /* in parent - selftest */
+ return switch_ns(nsfd_parent);
+ }
+
+ if (close(test_desc_fd[1])) {
+ pr_err("close()");
+ return -1;
+ }
+
+ /* child */
+ data_map = mmap(0, page_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+ if (data_map == MAP_FAILED) {
+ pr_err("mmap()");
+ return -1;
+ }
+
+ randomize_buffer(data_map, page_size);
+
+ if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, cmd_sock)) {
+ pr_err("socketpair()");
+ return -1;
+ }
+
+ child = fork();
+ if (child < 0) {
+ pr_err("fork()");
+ return -1;
+ } else if (child) {
+ if (close(cmd_sock[0])) {
+ pr_err("close()");
+ return -1;
+ }
+ return child_f(nr, test_desc_fd[0], cmd_sock[1], data_map);
+ }
+ if (close(cmd_sock[1])) {
+ pr_err("close()");
+ return -1;
+ }
+ return grand_child_f(nr, cmd_sock[0], data_map);
+}
+
+static void exit_usage(char **argv)
+{
+ printk("Usage: %s [nr_process]", argv[0]);
+ exit(KSFT_FAIL);
+}
+
+static int __write_desc(int test_desc_fd, struct xfrm_desc *desc)
+{
+ ssize_t ret;
+
+ ret = write(test_desc_fd, desc, sizeof(*desc));
+
+ if (ret == sizeof(*desc))
+ return 0;
+
+ pr_err("Writing test's desc failed %ld", ret);
+
+ return -1;
+}
+
+static int write_desc(int proto, int test_desc_fd,
+ char *a, char *e, char *c, char *ae)
+{
+ struct xfrm_desc desc = {};
+
+ desc.type = CREATE_TUNNEL;
+ desc.proto = proto;
+
+ if (a)
+ strncpy(desc.a_algo, a, ALGO_LEN - 1);
+ if (e)
+ strncpy(desc.e_algo, e, ALGO_LEN - 1);
+ if (c)
+ strncpy(desc.c_algo, c, ALGO_LEN - 1);
+ if (ae)
+ strncpy(desc.ae_algo, ae, ALGO_LEN - 1);
+
+ return __write_desc(test_desc_fd, &desc);
+}
+
+int proto_list[] = { IPPROTO_AH, IPPROTO_COMP, IPPROTO_ESP };
+char *ah_list[] = {
+ "digest_null", "hmac(md5)", "hmac(sha1)", "hmac(sha256)",
+ "hmac(sha384)", "hmac(sha512)", "hmac(rmd160)",
+ "xcbc(aes)", "cmac(aes)"
+};
+char *comp_list[] = {
+ "deflate",
+#if 0
+ /* No compression backend realization */
+ "lzs", "lzjh"
+#endif
+};
+char *e_list[] = {
+ "ecb(cipher_null)", "cbc(des)", "cbc(des3_ede)", "cbc(cast5)",
+ "cbc(blowfish)", "cbc(aes)", "cbc(serpent)", "cbc(camellia)",
+ "cbc(twofish)", "rfc3686(ctr(aes))"
+};
+char *ae_list[] = {
+#if 0
+ /* not implemented */
+ "rfc4106(gcm(aes))", "rfc4309(ccm(aes))", "rfc4543(gcm(aes))",
+ "rfc7539esp(chacha20,poly1305)"
+#endif
+};
+
+const unsigned int proto_plan = ARRAY_SIZE(ah_list) + ARRAY_SIZE(comp_list) \
+ + (ARRAY_SIZE(ah_list) * ARRAY_SIZE(e_list)) \
+ + ARRAY_SIZE(ae_list);
+
+static int write_proto_plan(int fd, int proto)
+{
+ unsigned int i;
+
+ switch (proto) {
+ case IPPROTO_AH:
+ for (i = 0; i < ARRAY_SIZE(ah_list); i++) {
+ if (write_desc(proto, fd, ah_list[i], 0, 0, 0))
+ return -1;
+ }
+ break;
+ case IPPROTO_COMP:
+ for (i = 0; i < ARRAY_SIZE(comp_list); i++) {
+ if (write_desc(proto, fd, 0, 0, comp_list[i], 0))
+ return -1;
+ }
+ break;
+ case IPPROTO_ESP:
+ for (i = 0; i < ARRAY_SIZE(ah_list); i++) {
+ int j;
+
+ for (j = 0; j < ARRAY_SIZE(e_list); j++) {
+ if (write_desc(proto, fd, ah_list[i],
+ e_list[j], 0, 0))
+ return -1;
+ }
+ }
+ for (i = 0; i < ARRAY_SIZE(ae_list); i++) {
+ if (write_desc(proto, fd, 0, 0, 0, ae_list[i]))
+ return -1;
+ }
+ break;
+ default:
+ printk("BUG: Specified unknown proto %d", proto);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Some structures in xfrm uapi header differ in size between
+ * 64-bit and 32-bit ABI:
+ *
+ * 32-bit UABI | 64-bit UABI
+ * -------------------------------------|-------------------------------------
+ * sizeof(xfrm_usersa_info) = 220 | sizeof(xfrm_usersa_info) = 224
+ * sizeof(xfrm_userpolicy_info) = 164 | sizeof(xfrm_userpolicy_info) = 168
+ * sizeof(xfrm_userspi_info) = 228 | sizeof(xfrm_userspi_info) = 232
+ * sizeof(xfrm_user_acquire) = 276 | sizeof(xfrm_user_acquire) = 280
+ * sizeof(xfrm_user_expire) = 224 | sizeof(xfrm_user_expire) = 232
+ * sizeof(xfrm_user_polexpire) = 168 | sizeof(xfrm_user_polexpire) = 176
+ *
+ * Check the affected by the UABI difference structures.
+ */
+const unsigned int compat_plan = 4;
+static int write_compat_struct_tests(int test_desc_fd)
+{
+ struct xfrm_desc desc = {};
+
+ desc.type = ALLOCATE_SPI;
+ desc.proto = IPPROTO_AH;
+ strncpy(desc.a_algo, ah_list[0], ALGO_LEN - 1);
+
+ if (__write_desc(test_desc_fd, &desc))
+ return -1;
+
+ desc.type = MONITOR_ACQUIRE;
+ if (__write_desc(test_desc_fd, &desc))
+ return -1;
+
+ desc.type = EXPIRE_STATE;
+ if (__write_desc(test_desc_fd, &desc))
+ return -1;
+
+ desc.type = EXPIRE_POLICY;
+ if (__write_desc(test_desc_fd, &desc))
+ return -1;
+
+ return 0;
+}
+
+static int write_test_plan(int test_desc_fd)
+{
+ unsigned int i;
+ pid_t child;
+
+ child = fork();
+ if (child < 0) {
+ pr_err("fork()");
+ return -1;
+ }
+ if (child) {
+ if (close(test_desc_fd))
+ printk("close(): %m");
+ return 0;
+ }
+
+ if (write_compat_struct_tests(test_desc_fd))
+ exit(KSFT_FAIL);
+
+ for (i = 0; i < ARRAY_SIZE(proto_list); i++) {
+ if (write_proto_plan(test_desc_fd, proto_list[i]))
+ exit(KSFT_FAIL);
+ }
+
+ exit(KSFT_PASS);
+}
+
+static int children_cleanup(void)
+{
+ unsigned ret = KSFT_PASS;
+
+ while (1) {
+ int status;
+ pid_t p = wait(&status);
+
+ if ((p < 0) && errno == ECHILD)
+ break;
+
+ if (p < 0) {
+ pr_err("wait()");
+ return KSFT_FAIL;
+ }
+
+ if (!WIFEXITED(status)) {
+ ret = KSFT_FAIL;
+ continue;
+ }
+
+ if (WEXITSTATUS(status) == KSFT_FAIL)
+ ret = KSFT_FAIL;
+ }
+
+ return ret;
+}
+
+typedef void (*print_res)(const char *, ...);
+
+static int check_results(void)
+{
+ struct test_result tr = {};
+ struct xfrm_desc *d = &tr.desc;
+ int ret = KSFT_PASS;
+
+ while (1) {
+ ssize_t received = read(results_fd[0], &tr, sizeof(tr));
+ print_res result;
+
+ if (received == 0) /* EOF */
+ break;
+
+ if (received != sizeof(tr)) {
+ pr_err("read() returned %zd", received);
+ return KSFT_FAIL;
+ }
+
+ switch (tr.res) {
+ case KSFT_PASS:
+ result = ksft_test_result_pass;
+ break;
+ case KSFT_FAIL:
+ default:
+ result = ksft_test_result_fail;
+ ret = KSFT_FAIL;
+ }
+
+ result(" %s: [%u, '%s', '%s', '%s', '%s', %u]\n",
+ desc_name[d->type], (unsigned int)d->proto, d->a_algo,
+ d->e_algo, d->c_algo, d->ae_algo, d->icv_len);
+ }
+
+ return ret;
+}
+
+int main(int argc, char **argv)
+{
+ unsigned int nr_process = 1;
+ int route_sock = -1, ret = KSFT_SKIP;
+ int test_desc_fd[2];
+ uint32_t route_seq;
+ unsigned int i;
+
+ if (argc > 2)
+ exit_usage(argv);
+
+ if (argc > 1) {
+ char *endptr;
+
+ errno = 0;
+ nr_process = strtol(argv[1], &endptr, 10);
+ if ((errno == ERANGE && (nr_process == LONG_MAX || nr_process == LONG_MIN))
+ || (errno != 0 && nr_process == 0)
+ || (endptr == argv[1]) || (*endptr != '\0')) {
+ printk("Failed to parse [nr_process]");
+ exit_usage(argv);
+ }
+
+ if (nr_process > MAX_PROCESSES || !nr_process) {
+ printk("nr_process should be between [1; %u]",
+ MAX_PROCESSES);
+ exit_usage(argv);
+ }
+ }
+
+ srand(time(NULL));
+ page_size = sysconf(_SC_PAGESIZE);
+ if (page_size < 1)
+ ksft_exit_skip("sysconf(): %m\n");
+
+ if (pipe2(test_desc_fd, O_DIRECT) < 0)
+ ksft_exit_skip("pipe(): %m\n");
+
+ if (pipe2(results_fd, O_DIRECT) < 0)
+ ksft_exit_skip("pipe(): %m\n");
+
+ if (init_namespaces())
+ ksft_exit_skip("Failed to create namespaces\n");
+
+ if (netlink_sock(&route_sock, &route_seq, NETLINK_ROUTE))
+ ksft_exit_skip("Failed to open netlink route socket\n");
+
+ for (i = 0; i < nr_process; i++) {
+ char veth[VETH_LEN];
+
+ snprintf(veth, VETH_LEN, VETH_FMT, i);
+
+ if (veth_add(route_sock, route_seq++, veth, nsfd_childa, veth, nsfd_childb)) {
+ close(route_sock);
+ ksft_exit_fail_msg("Failed to create veth device");
+ }
+
+ if (start_child(i, veth, test_desc_fd)) {
+ close(route_sock);
+ ksft_exit_fail_msg("Child %u failed to start", i);
+ }
+ }
+
+ if (close(route_sock) || close(test_desc_fd[0]) || close(results_fd[1]))
+ ksft_exit_fail_msg("close(): %m");
+
+ ksft_set_plan(proto_plan + compat_plan);
+
+ if (write_test_plan(test_desc_fd[1]))
+ ksft_exit_fail_msg("Failed to write test plan to pipe");
+
+ ret = check_results();
+
+ if (children_cleanup() == KSFT_FAIL)
+ exit(KSFT_FAIL);
+
+ exit(ret);
+}
diff --git a/tools/testing/selftests/net/mptcp/.gitignore b/tools/testing/selftests/net/mptcp/.gitignore
new file mode 100644
index 0000000..260336d
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+mptcp_connect
+pm_nl_ctl
+*.pcap
diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile
new file mode 100644
index 0000000..00bb158
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/Makefile
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
+
+top_srcdir = ../../../../..
+KSFT_KHDR_INSTALL := 1
+
+CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include
+
+TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \
+ simult_flows.sh
+
+TEST_GEN_FILES = mptcp_connect pm_nl_ctl
+
+TEST_FILES := settings
+
+EXTRA_CLEAN := *.pcap
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/net/mptcp/config b/tools/testing/selftests/net/mptcp/config
new file mode 100644
index 0000000..741a1c4
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/config
@@ -0,0 +1,7 @@
+CONFIG_MPTCP=y
+CONFIG_IPV6=y
+CONFIG_MPTCP_IPV6=y
+CONFIG_INET_DIAG=m
+CONFIG_INET_MPTCP_DIAG=m
+CONFIG_VETH=y
+CONFIG_NET_SCH_NETEM=m
diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh
new file mode 100755
index 0000000..39edce4
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/diag.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+rndh=$(printf %x $sec)-$(mktemp -u XXXXXX)
+ns="ns1-$rndh"
+ksft_skip=4
+test_cnt=1
+ret=0
+pids=()
+
+flush_pids()
+{
+ # mptcp_connect in join mode will sleep a bit before completing,
+ # give it some time
+ sleep 1.1
+
+ for pid in ${pids[@]}; do
+ [ -d /proc/$pid ] && kill -SIGUSR1 $pid >/dev/null 2>&1
+ done
+ pids=()
+}
+
+cleanup()
+{
+ ip netns del $ns
+ for pid in ${pids[@]}; do
+ [ -d /proc/$pid ] && kill -9 $pid >/dev/null 2>&1
+ done
+}
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without ip tool"
+ exit $ksft_skip
+fi
+ss -h | grep -q MPTCP
+if [ $? -ne 0 ];then
+ echo "SKIP: ss tool does not support MPTCP"
+ exit $ksft_skip
+fi
+
+__chk_nr()
+{
+ local condition="$1"
+ local expected=$2
+ local msg nr
+
+ shift 2
+ msg=$*
+ nr=$(ss -inmHMN $ns | $condition)
+
+ printf "%-50s" "$msg"
+ if [ $nr != $expected ]; then
+ echo "[ fail ] expected $expected found $nr"
+ ret=$test_cnt
+ else
+ echo "[ ok ]"
+ fi
+ test_cnt=$((test_cnt+1))
+}
+
+chk_msk_nr()
+{
+ __chk_nr "grep -c token:" $*
+}
+
+chk_msk_fallback_nr()
+{
+ __chk_nr "grep -c fallback" $*
+}
+
+chk_msk_remote_key_nr()
+{
+ __chk_nr "grep -c remote_key" $*
+}
+
+
+trap cleanup EXIT
+ip netns add $ns
+ip -n $ns link set dev lo up
+
+echo "a" | ip netns exec $ns ./mptcp_connect -p 10000 -l 0.0.0.0 -t 100 >/dev/null &
+sleep 0.1
+pids[0]=$!
+chk_msk_nr 0 "no msk on netns creation"
+
+echo "b" | ip netns exec $ns ./mptcp_connect -p 10000 127.0.0.1 -j -t 100 >/dev/null &
+sleep 0.1
+pids[1]=$!
+chk_msk_nr 2 "after MPC handshake "
+chk_msk_remote_key_nr 2 "....chk remote_key"
+chk_msk_fallback_nr 0 "....chk no fallback"
+flush_pids
+
+
+echo "a" | ip netns exec $ns ./mptcp_connect -p 10001 -s TCP -l 0.0.0.0 -t 100 >/dev/null &
+pids[0]=$!
+sleep 0.1
+echo "b" | ip netns exec $ns ./mptcp_connect -p 10001 127.0.0.1 -j -t 100 >/dev/null &
+pids[1]=$!
+sleep 0.1
+chk_msk_fallback_nr 1 "check fallback"
+flush_pids
+
+NR_CLIENTS=100
+for I in `seq 1 $NR_CLIENTS`; do
+ echo "a" | ip netns exec $ns ./mptcp_connect -p $((I+10001)) -l 0.0.0.0 -t 100 -w 10 >/dev/null &
+ pids[$((I*2))]=$!
+done
+sleep 0.1
+
+for I in `seq 1 $NR_CLIENTS`; do
+ echo "b" | ip netns exec $ns ./mptcp_connect -p $((I+10001)) 127.0.0.1 -t 100 -w 10 >/dev/null &
+ pids[$((I*2 + 1))]=$!
+done
+sleep 1.5
+
+chk_msk_nr $((NR_CLIENTS*2)) "many msk socket present"
+flush_pids
+
+exit $ret
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
new file mode 100644
index 0000000..77bb62f
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -0,0 +1,915 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include <sys/poll.h>
+#include <sys/sendfile.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+
+#include <netdb.h>
+#include <netinet/in.h>
+
+#include <linux/tcp.h>
+
+extern int optind;
+
+#ifndef IPPROTO_MPTCP
+#define IPPROTO_MPTCP 262
+#endif
+#ifndef TCP_ULP
+#define TCP_ULP 31
+#endif
+
+static int poll_timeout = 10 * 1000;
+static bool listen_mode;
+static bool quit;
+
+enum cfg_mode {
+ CFG_MODE_POLL,
+ CFG_MODE_MMAP,
+ CFG_MODE_SENDFILE,
+};
+
+static enum cfg_mode cfg_mode = CFG_MODE_POLL;
+static const char *cfg_host;
+static const char *cfg_port = "12000";
+static int cfg_sock_proto = IPPROTO_MPTCP;
+static bool tcpulp_audit;
+static int pf = AF_INET;
+static int cfg_sndbuf;
+static int cfg_rcvbuf;
+static bool cfg_join;
+static bool cfg_remove;
+static int cfg_wait;
+
+static void die_usage(void)
+{
+ fprintf(stderr, "Usage: mptcp_connect [-6] [-u] [-s MPTCP|TCP] [-p port] [-m mode]"
+ "[-l] [-w sec] connect_address\n");
+ fprintf(stderr, "\t-6 use ipv6\n");
+ fprintf(stderr, "\t-t num -- set poll timeout to num\n");
+ fprintf(stderr, "\t-S num -- set SO_SNDBUF to num\n");
+ fprintf(stderr, "\t-R num -- set SO_RCVBUF to num\n");
+ fprintf(stderr, "\t-p num -- use port num\n");
+ fprintf(stderr, "\t-s [MPTCP|TCP] -- use mptcp(default) or tcp sockets\n");
+ fprintf(stderr, "\t-m [poll|mmap|sendfile] -- use poll(default)/mmap+write/sendfile\n");
+ fprintf(stderr, "\t-u -- check mptcp ulp\n");
+ fprintf(stderr, "\t-w num -- wait num sec before closing the socket\n");
+ exit(1);
+}
+
+static void handle_signal(int nr)
+{
+ quit = true;
+}
+
+static const char *getxinfo_strerr(int err)
+{
+ if (err == EAI_SYSTEM)
+ return strerror(errno);
+
+ return gai_strerror(err);
+}
+
+static void xgetnameinfo(const struct sockaddr *addr, socklen_t addrlen,
+ char *host, socklen_t hostlen,
+ char *serv, socklen_t servlen)
+{
+ int flags = NI_NUMERICHOST | NI_NUMERICSERV;
+ int err = getnameinfo(addr, addrlen, host, hostlen, serv, servlen,
+ flags);
+
+ if (err) {
+ const char *errstr = getxinfo_strerr(err);
+
+ fprintf(stderr, "Fatal: getnameinfo: %s\n", errstr);
+ exit(1);
+ }
+}
+
+static void xgetaddrinfo(const char *node, const char *service,
+ const struct addrinfo *hints,
+ struct addrinfo **res)
+{
+ int err = getaddrinfo(node, service, hints, res);
+
+ if (err) {
+ const char *errstr = getxinfo_strerr(err);
+
+ fprintf(stderr, "Fatal: getaddrinfo(%s:%s): %s\n",
+ node ? node : "", service ? service : "", errstr);
+ exit(1);
+ }
+}
+
+static void set_rcvbuf(int fd, unsigned int size)
+{
+ int err;
+
+ err = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &size, sizeof(size));
+ if (err) {
+ perror("set SO_RCVBUF");
+ exit(1);
+ }
+}
+
+static void set_sndbuf(int fd, unsigned int size)
+{
+ int err;
+
+ err = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &size, sizeof(size));
+ if (err) {
+ perror("set SO_SNDBUF");
+ exit(1);
+ }
+}
+
+static int sock_listen_mptcp(const char * const listenaddr,
+ const char * const port)
+{
+ int sock;
+ struct addrinfo hints = {
+ .ai_protocol = IPPROTO_TCP,
+ .ai_socktype = SOCK_STREAM,
+ .ai_flags = AI_PASSIVE | AI_NUMERICHOST
+ };
+
+ hints.ai_family = pf;
+
+ struct addrinfo *a, *addr;
+ int one = 1;
+
+ xgetaddrinfo(listenaddr, port, &hints, &addr);
+ hints.ai_family = pf;
+
+ for (a = addr; a; a = a->ai_next) {
+ sock = socket(a->ai_family, a->ai_socktype, cfg_sock_proto);
+ if (sock < 0)
+ continue;
+
+ if (-1 == setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &one,
+ sizeof(one)))
+ perror("setsockopt");
+
+ if (bind(sock, a->ai_addr, a->ai_addrlen) == 0)
+ break; /* success */
+
+ perror("bind");
+ close(sock);
+ sock = -1;
+ }
+
+ freeaddrinfo(addr);
+
+ if (sock < 0) {
+ fprintf(stderr, "Could not create listen socket\n");
+ return sock;
+ }
+
+ if (listen(sock, 20)) {
+ perror("listen");
+ close(sock);
+ return -1;
+ }
+
+ return sock;
+}
+
+static bool sock_test_tcpulp(const char * const remoteaddr,
+ const char * const port)
+{
+ struct addrinfo hints = {
+ .ai_protocol = IPPROTO_TCP,
+ .ai_socktype = SOCK_STREAM,
+ };
+ struct addrinfo *a, *addr;
+ int sock = -1, ret = 0;
+ bool test_pass = false;
+
+ hints.ai_family = AF_INET;
+
+ xgetaddrinfo(remoteaddr, port, &hints, &addr);
+ for (a = addr; a; a = a->ai_next) {
+ sock = socket(a->ai_family, a->ai_socktype, IPPROTO_TCP);
+ if (sock < 0) {
+ perror("socket");
+ continue;
+ }
+ ret = setsockopt(sock, IPPROTO_TCP, TCP_ULP, "mptcp",
+ sizeof("mptcp"));
+ if (ret == -1 && errno == EOPNOTSUPP)
+ test_pass = true;
+ close(sock);
+
+ if (test_pass)
+ break;
+ if (!ret)
+ fprintf(stderr,
+ "setsockopt(TCP_ULP) returned 0\n");
+ else
+ perror("setsockopt(TCP_ULP)");
+ }
+ return test_pass;
+}
+
+static int sock_connect_mptcp(const char * const remoteaddr,
+ const char * const port, int proto)
+{
+ struct addrinfo hints = {
+ .ai_protocol = IPPROTO_TCP,
+ .ai_socktype = SOCK_STREAM,
+ };
+ struct addrinfo *a, *addr;
+ int sock = -1;
+
+ hints.ai_family = pf;
+
+ xgetaddrinfo(remoteaddr, port, &hints, &addr);
+ for (a = addr; a; a = a->ai_next) {
+ sock = socket(a->ai_family, a->ai_socktype, proto);
+ if (sock < 0) {
+ perror("socket");
+ continue;
+ }
+
+ if (connect(sock, a->ai_addr, a->ai_addrlen) == 0)
+ break; /* success */
+
+ perror("connect()");
+ close(sock);
+ sock = -1;
+ }
+
+ freeaddrinfo(addr);
+ return sock;
+}
+
+static size_t do_rnd_write(const int fd, char *buf, const size_t len)
+{
+ static bool first = true;
+ unsigned int do_w;
+ ssize_t bw;
+
+ do_w = rand() & 0xffff;
+ if (do_w == 0 || do_w > len)
+ do_w = len;
+
+ if (cfg_join && first && do_w > 100)
+ do_w = 100;
+
+ if (cfg_remove && do_w > 50)
+ do_w = 50;
+
+ bw = write(fd, buf, do_w);
+ if (bw < 0)
+ perror("write");
+
+ /* let the join handshake complete, before going on */
+ if (cfg_join && first) {
+ usleep(200000);
+ first = false;
+ }
+
+ if (cfg_remove)
+ usleep(200000);
+
+ return bw;
+}
+
+static size_t do_write(const int fd, char *buf, const size_t len)
+{
+ size_t offset = 0;
+
+ while (offset < len) {
+ size_t written;
+ ssize_t bw;
+
+ bw = write(fd, buf + offset, len - offset);
+ if (bw < 0) {
+ perror("write");
+ return 0;
+ }
+
+ written = (size_t)bw;
+ offset += written;
+ }
+
+ return offset;
+}
+
+static ssize_t do_rnd_read(const int fd, char *buf, const size_t len)
+{
+ size_t cap = rand();
+
+ cap &= 0xffff;
+
+ if (cap == 0)
+ cap = 1;
+ else if (cap > len)
+ cap = len;
+
+ return read(fd, buf, cap);
+}
+
+static void set_nonblock(int fd)
+{
+ int flags = fcntl(fd, F_GETFL);
+
+ if (flags == -1)
+ return;
+
+ fcntl(fd, F_SETFL, flags | O_NONBLOCK);
+}
+
+static int copyfd_io_poll(int infd, int peerfd, int outfd)
+{
+ struct pollfd fds = {
+ .fd = peerfd,
+ .events = POLLIN | POLLOUT,
+ };
+ unsigned int woff = 0, wlen = 0;
+ char wbuf[8192];
+
+ set_nonblock(peerfd);
+
+ for (;;) {
+ char rbuf[8192];
+ ssize_t len;
+
+ if (fds.events == 0)
+ break;
+
+ switch (poll(&fds, 1, poll_timeout)) {
+ case -1:
+ if (errno == EINTR)
+ continue;
+ perror("poll");
+ return 1;
+ case 0:
+ fprintf(stderr, "%s: poll timed out (events: "
+ "POLLIN %u, POLLOUT %u)\n", __func__,
+ fds.events & POLLIN, fds.events & POLLOUT);
+ return 2;
+ }
+
+ if (fds.revents & POLLIN) {
+ len = do_rnd_read(peerfd, rbuf, sizeof(rbuf));
+ if (len == 0) {
+ /* no more data to receive:
+ * peer has closed its write side
+ */
+ fds.events &= ~POLLIN;
+
+ if ((fds.events & POLLOUT) == 0)
+ /* and nothing more to send */
+ break;
+
+ /* Else, still have data to transmit */
+ } else if (len < 0) {
+ perror("read");
+ return 3;
+ }
+
+ do_write(outfd, rbuf, len);
+ }
+
+ if (fds.revents & POLLOUT) {
+ if (wlen == 0) {
+ woff = 0;
+ wlen = read(infd, wbuf, sizeof(wbuf));
+ }
+
+ if (wlen > 0) {
+ ssize_t bw;
+
+ bw = do_rnd_write(peerfd, wbuf + woff, wlen);
+ if (bw < 0)
+ return 111;
+
+ woff += bw;
+ wlen -= bw;
+ } else if (wlen == 0) {
+ /* We have no more data to send. */
+ fds.events &= ~POLLOUT;
+
+ if ((fds.events & POLLIN) == 0)
+ /* ... and peer also closed already */
+ break;
+
+ /* ... but we still receive.
+ * Close our write side, ev. give some time
+ * for address notification and/or checking
+ * the current status
+ */
+ if (cfg_wait)
+ usleep(cfg_wait);
+ shutdown(peerfd, SHUT_WR);
+ } else {
+ if (errno == EINTR)
+ continue;
+ perror("read");
+ return 4;
+ }
+ }
+
+ if (fds.revents & (POLLERR | POLLNVAL)) {
+ fprintf(stderr, "Unexpected revents: "
+ "POLLERR/POLLNVAL(%x)\n", fds.revents);
+ return 5;
+ }
+ }
+
+ /* leave some time for late join/announce */
+ if (cfg_join || cfg_remove)
+ usleep(cfg_wait);
+
+ close(peerfd);
+ return 0;
+}
+
+static int do_recvfile(int infd, int outfd)
+{
+ ssize_t r;
+
+ do {
+ char buf[16384];
+
+ r = do_rnd_read(infd, buf, sizeof(buf));
+ if (r > 0) {
+ if (write(outfd, buf, r) != r)
+ break;
+ } else if (r < 0) {
+ perror("read");
+ }
+ } while (r > 0);
+
+ return (int)r;
+}
+
+static int do_mmap(int infd, int outfd, unsigned int size)
+{
+ char *inbuf = mmap(NULL, size, PROT_READ, MAP_SHARED, infd, 0);
+ ssize_t ret = 0, off = 0;
+ size_t rem;
+
+ if (inbuf == MAP_FAILED) {
+ perror("mmap");
+ return 1;
+ }
+
+ rem = size;
+
+ while (rem > 0) {
+ ret = write(outfd, inbuf + off, rem);
+
+ if (ret < 0) {
+ perror("write");
+ break;
+ }
+
+ off += ret;
+ rem -= ret;
+ }
+
+ munmap(inbuf, size);
+ return rem;
+}
+
+static int get_infd_size(int fd)
+{
+ struct stat sb;
+ ssize_t count;
+ int err;
+
+ err = fstat(fd, &sb);
+ if (err < 0) {
+ perror("fstat");
+ return -1;
+ }
+
+ if ((sb.st_mode & S_IFMT) != S_IFREG) {
+ fprintf(stderr, "%s: stdin is not a regular file\n", __func__);
+ return -2;
+ }
+
+ count = sb.st_size;
+ if (count > INT_MAX) {
+ fprintf(stderr, "File too large: %zu\n", count);
+ return -3;
+ }
+
+ return (int)count;
+}
+
+static int do_sendfile(int infd, int outfd, unsigned int count)
+{
+ while (count > 0) {
+ ssize_t r;
+
+ r = sendfile(outfd, infd, NULL, count);
+ if (r < 0) {
+ perror("sendfile");
+ return 3;
+ }
+
+ count -= r;
+ }
+
+ return 0;
+}
+
+static int copyfd_io_mmap(int infd, int peerfd, int outfd,
+ unsigned int size)
+{
+ int err;
+
+ if (listen_mode) {
+ err = do_recvfile(peerfd, outfd);
+ if (err)
+ return err;
+
+ err = do_mmap(infd, peerfd, size);
+ } else {
+ err = do_mmap(infd, peerfd, size);
+ if (err)
+ return err;
+
+ shutdown(peerfd, SHUT_WR);
+
+ err = do_recvfile(peerfd, outfd);
+ }
+
+ return err;
+}
+
+static int copyfd_io_sendfile(int infd, int peerfd, int outfd,
+ unsigned int size)
+{
+ int err;
+
+ if (listen_mode) {
+ err = do_recvfile(peerfd, outfd);
+ if (err)
+ return err;
+
+ err = do_sendfile(infd, peerfd, size);
+ } else {
+ err = do_sendfile(infd, peerfd, size);
+ if (err)
+ return err;
+ err = do_recvfile(peerfd, outfd);
+ }
+
+ return err;
+}
+
+static int copyfd_io(int infd, int peerfd, int outfd)
+{
+ int file_size;
+
+ switch (cfg_mode) {
+ case CFG_MODE_POLL:
+ return copyfd_io_poll(infd, peerfd, outfd);
+ case CFG_MODE_MMAP:
+ file_size = get_infd_size(infd);
+ if (file_size < 0)
+ return file_size;
+ return copyfd_io_mmap(infd, peerfd, outfd, file_size);
+ case CFG_MODE_SENDFILE:
+ file_size = get_infd_size(infd);
+ if (file_size < 0)
+ return file_size;
+ return copyfd_io_sendfile(infd, peerfd, outfd, file_size);
+ }
+
+ fprintf(stderr, "Invalid mode %d\n", cfg_mode);
+
+ die_usage();
+ return 1;
+}
+
+static void check_sockaddr(int pf, struct sockaddr_storage *ss,
+ socklen_t salen)
+{
+ struct sockaddr_in6 *sin6;
+ struct sockaddr_in *sin;
+ socklen_t wanted_size = 0;
+
+ switch (pf) {
+ case AF_INET:
+ wanted_size = sizeof(*sin);
+ sin = (void *)ss;
+ if (!sin->sin_port)
+ fprintf(stderr, "accept: something wrong: ip connection from port 0");
+ break;
+ case AF_INET6:
+ wanted_size = sizeof(*sin6);
+ sin6 = (void *)ss;
+ if (!sin6->sin6_port)
+ fprintf(stderr, "accept: something wrong: ipv6 connection from port 0");
+ break;
+ default:
+ fprintf(stderr, "accept: Unknown pf %d, salen %u\n", pf, salen);
+ return;
+ }
+
+ if (salen != wanted_size)
+ fprintf(stderr, "accept: size mismatch, got %d expected %d\n",
+ (int)salen, wanted_size);
+
+ if (ss->ss_family != pf)
+ fprintf(stderr, "accept: pf mismatch, expect %d, ss_family is %d\n",
+ (int)ss->ss_family, pf);
+}
+
+static void check_getpeername(int fd, struct sockaddr_storage *ss, socklen_t salen)
+{
+ struct sockaddr_storage peerss;
+ socklen_t peersalen = sizeof(peerss);
+
+ if (getpeername(fd, (struct sockaddr *)&peerss, &peersalen) < 0) {
+ perror("getpeername");
+ return;
+ }
+
+ if (peersalen != salen) {
+ fprintf(stderr, "%s: %d vs %d\n", __func__, peersalen, salen);
+ return;
+ }
+
+ if (memcmp(ss, &peerss, peersalen)) {
+ char a[INET6_ADDRSTRLEN];
+ char b[INET6_ADDRSTRLEN];
+ char c[INET6_ADDRSTRLEN];
+ char d[INET6_ADDRSTRLEN];
+
+ xgetnameinfo((struct sockaddr *)ss, salen,
+ a, sizeof(a), b, sizeof(b));
+
+ xgetnameinfo((struct sockaddr *)&peerss, peersalen,
+ c, sizeof(c), d, sizeof(d));
+
+ fprintf(stderr, "%s: memcmp failure: accept %s vs peername %s, %s vs %s salen %d vs %d\n",
+ __func__, a, c, b, d, peersalen, salen);
+ }
+}
+
+static void check_getpeername_connect(int fd)
+{
+ struct sockaddr_storage ss;
+ socklen_t salen = sizeof(ss);
+ char a[INET6_ADDRSTRLEN];
+ char b[INET6_ADDRSTRLEN];
+
+ if (getpeername(fd, (struct sockaddr *)&ss, &salen) < 0) {
+ perror("getpeername");
+ return;
+ }
+
+ xgetnameinfo((struct sockaddr *)&ss, salen,
+ a, sizeof(a), b, sizeof(b));
+
+ if (strcmp(cfg_host, a) || strcmp(cfg_port, b))
+ fprintf(stderr, "%s: %s vs %s, %s vs %s\n", __func__,
+ cfg_host, a, cfg_port, b);
+}
+
+static void maybe_close(int fd)
+{
+ unsigned int r = rand();
+
+ if (!(cfg_join || cfg_remove) && (r & 1))
+ close(fd);
+}
+
+int main_loop_s(int listensock)
+{
+ struct sockaddr_storage ss;
+ struct pollfd polls;
+ socklen_t salen;
+ int remotesock;
+
+ polls.fd = listensock;
+ polls.events = POLLIN;
+
+ switch (poll(&polls, 1, poll_timeout)) {
+ case -1:
+ perror("poll");
+ return 1;
+ case 0:
+ fprintf(stderr, "%s: timed out\n", __func__);
+ close(listensock);
+ return 2;
+ }
+
+ salen = sizeof(ss);
+ remotesock = accept(listensock, (struct sockaddr *)&ss, &salen);
+ if (remotesock >= 0) {
+ maybe_close(listensock);
+ check_sockaddr(pf, &ss, salen);
+ check_getpeername(remotesock, &ss, salen);
+
+ return copyfd_io(0, remotesock, 1);
+ }
+
+ perror("accept");
+
+ return 1;
+}
+
+static void init_rng(void)
+{
+ int fd = open("/dev/urandom", O_RDONLY);
+ unsigned int foo;
+
+ if (fd > 0) {
+ int ret = read(fd, &foo, sizeof(foo));
+
+ if (ret < 0)
+ srand(fd + foo);
+ close(fd);
+ }
+
+ srand(foo);
+}
+
+int main_loop(void)
+{
+ int fd;
+
+ /* listener is ready. */
+ fd = sock_connect_mptcp(cfg_host, cfg_port, cfg_sock_proto);
+ if (fd < 0)
+ return 2;
+
+ check_getpeername_connect(fd);
+
+ if (cfg_rcvbuf)
+ set_rcvbuf(fd, cfg_rcvbuf);
+ if (cfg_sndbuf)
+ set_sndbuf(fd, cfg_sndbuf);
+
+ return copyfd_io(0, fd, 1);
+}
+
+int parse_proto(const char *proto)
+{
+ if (!strcasecmp(proto, "MPTCP"))
+ return IPPROTO_MPTCP;
+ if (!strcasecmp(proto, "TCP"))
+ return IPPROTO_TCP;
+
+ fprintf(stderr, "Unknown protocol: %s\n.", proto);
+ die_usage();
+
+ /* silence compiler warning */
+ return 0;
+}
+
+int parse_mode(const char *mode)
+{
+ if (!strcasecmp(mode, "poll"))
+ return CFG_MODE_POLL;
+ if (!strcasecmp(mode, "mmap"))
+ return CFG_MODE_MMAP;
+ if (!strcasecmp(mode, "sendfile"))
+ return CFG_MODE_SENDFILE;
+
+ fprintf(stderr, "Unknown test mode: %s\n", mode);
+ fprintf(stderr, "Supported modes are:\n");
+ fprintf(stderr, "\t\t\"poll\" - interleaved read/write using poll()\n");
+ fprintf(stderr, "\t\t\"mmap\" - send entire input file (mmap+write), then read response (-l will read input first)\n");
+ fprintf(stderr, "\t\t\"sendfile\" - send entire input file (sendfile), then read response (-l will read input first)\n");
+
+ die_usage();
+
+ /* silence compiler warning */
+ return 0;
+}
+
+static int parse_int(const char *size)
+{
+ unsigned long s;
+
+ errno = 0;
+
+ s = strtoul(size, NULL, 0);
+
+ if (errno) {
+ fprintf(stderr, "Invalid sndbuf size %s (%s)\n",
+ size, strerror(errno));
+ die_usage();
+ }
+
+ if (s > INT_MAX) {
+ fprintf(stderr, "Invalid sndbuf size %s (%s)\n",
+ size, strerror(ERANGE));
+ die_usage();
+ }
+
+ return (int)s;
+}
+
+static void parse_opts(int argc, char **argv)
+{
+ int c;
+
+ while ((c = getopt(argc, argv, "6jrlp:s:hut:m:S:R:w:")) != -1) {
+ switch (c) {
+ case 'j':
+ cfg_join = true;
+ cfg_mode = CFG_MODE_POLL;
+ cfg_wait = 400000;
+ break;
+ case 'r':
+ cfg_remove = true;
+ cfg_mode = CFG_MODE_POLL;
+ cfg_wait = 400000;
+ break;
+ case 'l':
+ listen_mode = true;
+ break;
+ case 'p':
+ cfg_port = optarg;
+ break;
+ case 's':
+ cfg_sock_proto = parse_proto(optarg);
+ break;
+ case 'h':
+ die_usage();
+ break;
+ case 'u':
+ tcpulp_audit = true;
+ break;
+ case '6':
+ pf = AF_INET6;
+ break;
+ case 't':
+ poll_timeout = atoi(optarg) * 1000;
+ if (poll_timeout <= 0)
+ poll_timeout = -1;
+ break;
+ case 'm':
+ cfg_mode = parse_mode(optarg);
+ break;
+ case 'S':
+ cfg_sndbuf = parse_int(optarg);
+ break;
+ case 'R':
+ cfg_rcvbuf = parse_int(optarg);
+ break;
+ case 'w':
+ cfg_wait = atoi(optarg)*1000000;
+ break;
+ }
+ }
+
+ if (optind + 1 != argc)
+ die_usage();
+ cfg_host = argv[optind];
+
+ if (strchr(cfg_host, ':'))
+ pf = AF_INET6;
+}
+
+int main(int argc, char *argv[])
+{
+ init_rng();
+
+ signal(SIGUSR1, handle_signal);
+ parse_opts(argc, argv);
+
+ if (tcpulp_audit)
+ return sock_test_tcpulp(cfg_host, cfg_port) ? 0 : 1;
+
+ if (listen_mode) {
+ int fd = sock_listen_mptcp(cfg_host, cfg_port);
+
+ if (fd < 0)
+ return 1;
+
+ if (cfg_rcvbuf)
+ set_rcvbuf(fd, cfg_rcvbuf);
+ if (cfg_sndbuf)
+ set_sndbuf(fd, cfg_sndbuf);
+
+ return main_loop_s(fd);
+ }
+
+ return main_loop();
+}
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
new file mode 100755
index 0000000..987a914
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
@@ -0,0 +1,693 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+time_start=$(date +%s)
+
+optstring="S:R:d:e:l:r:h4cm:f:t"
+ret=0
+sin=""
+sout=""
+cin=""
+cout=""
+ksft_skip=4
+capture=false
+timeout=30
+ipv6=true
+ethtool_random_on=true
+tc_delay="$((RANDOM%50))"
+tc_loss=$((RANDOM%101))
+testmode=""
+sndbuf=0
+rcvbuf=0
+options_log=true
+do_tcp=0
+filesize=0
+
+if [ $tc_loss -eq 100 ];then
+ tc_loss=1%
+elif [ $tc_loss -ge 10 ]; then
+ tc_loss=0.$tc_loss%
+elif [ $tc_loss -ge 1 ]; then
+ tc_loss=0.0$tc_loss%
+else
+ tc_loss=""
+fi
+
+usage() {
+ echo "Usage: $0 [ -a ]"
+ echo -e "\t-d: tc/netem delay in milliseconds, e.g. \"-d 10\" (default random)"
+ echo -e "\t-l: tc/netem loss percentage, e.g. \"-l 0.02\" (default random)"
+ echo -e "\t-r: tc/netem reorder mode, e.g. \"-r 25% 50% gap 5\", use "-r 0" to disable reordering (default random)"
+ echo -e "\t-e: ethtool features to disable, e.g.: \"-e tso -e gso\" (default: randomly disable any of tso/gso/gro)"
+ echo -e "\t-4: IPv4 only: disable IPv6 tests (default: test both IPv4 and IPv6)"
+ echo -e "\t-c: capture packets for each test using tcpdump (default: no capture)"
+ echo -e "\t-f: size of file to transfer in bytes (default random)"
+ echo -e "\t-S: set sndbuf value (default: use kernel default)"
+ echo -e "\t-R: set rcvbuf value (default: use kernel default)"
+ echo -e "\t-m: test mode (poll, sendfile; default: poll)"
+ echo -e "\t-t: also run tests with TCP (use twice to non-fallback tcp)"
+}
+
+while getopts "$optstring" option;do
+ case "$option" in
+ "h")
+ usage $0
+ exit 0
+ ;;
+ "d")
+ if [ $OPTARG -ge 0 ];then
+ tc_delay="$OPTARG"
+ else
+ echo "-d requires numeric argument, got \"$OPTARG\"" 1>&2
+ exit 1
+ fi
+ ;;
+ "e")
+ ethtool_args="$ethtool_args $OPTARG off"
+ ethtool_random_on=false
+ ;;
+ "l")
+ tc_loss="$OPTARG"
+ ;;
+ "r")
+ tc_reorder="$OPTARG"
+ ;;
+ "4")
+ ipv6=false
+ ;;
+ "c")
+ capture=true
+ ;;
+ "S")
+ if [ $OPTARG -ge 0 ];then
+ sndbuf="$OPTARG"
+ else
+ echo "-S requires numeric argument, got \"$OPTARG\"" 1>&2
+ exit 1
+ fi
+ ;;
+ "R")
+ if [ $OPTARG -ge 0 ];then
+ rcvbuf="$OPTARG"
+ else
+ echo "-R requires numeric argument, got \"$OPTARG\"" 1>&2
+ exit 1
+ fi
+ ;;
+ "m")
+ testmode="$OPTARG"
+ ;;
+ "f")
+ filesize="$OPTARG"
+ ;;
+ "t")
+ do_tcp=$((do_tcp+1))
+ ;;
+ "?")
+ usage $0
+ exit 1
+ ;;
+ esac
+done
+
+sec=$(date +%s)
+rndh=$(printf %x $sec)-$(mktemp -u XXXXXX)
+ns1="ns1-$rndh"
+ns2="ns2-$rndh"
+ns3="ns3-$rndh"
+ns4="ns4-$rndh"
+
+TEST_COUNT=0
+
+cleanup()
+{
+ rm -f "$cin" "$cout"
+ rm -f "$sin" "$sout"
+ rm -f "$capout"
+
+ local netns
+ for netns in "$ns1" "$ns2" "$ns3" "$ns4";do
+ ip netns del $netns
+ done
+}
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without ip tool"
+ exit $ksft_skip
+fi
+
+sin=$(mktemp)
+sout=$(mktemp)
+cin=$(mktemp)
+cout=$(mktemp)
+capout=$(mktemp)
+trap cleanup EXIT
+
+for i in "$ns1" "$ns2" "$ns3" "$ns4";do
+ ip netns add $i || exit $ksft_skip
+ ip -net $i link set lo up
+done
+
+# "$ns1" ns2 ns3 ns4
+# ns1eth2 ns2eth1 ns2eth3 ns3eth2 ns3eth4 ns4eth3
+# - drop 1% -> reorder 25%
+# <- TSO off -
+
+ip link add ns1eth2 netns "$ns1" type veth peer name ns2eth1 netns "$ns2"
+ip link add ns2eth3 netns "$ns2" type veth peer name ns3eth2 netns "$ns3"
+ip link add ns3eth4 netns "$ns3" type veth peer name ns4eth3 netns "$ns4"
+
+ip -net "$ns1" addr add 10.0.1.1/24 dev ns1eth2
+ip -net "$ns1" addr add dead:beef:1::1/64 dev ns1eth2 nodad
+
+ip -net "$ns1" link set ns1eth2 up
+ip -net "$ns1" route add default via 10.0.1.2
+ip -net "$ns1" route add default via dead:beef:1::2
+
+ip -net "$ns2" addr add 10.0.1.2/24 dev ns2eth1
+ip -net "$ns2" addr add dead:beef:1::2/64 dev ns2eth1 nodad
+ip -net "$ns2" link set ns2eth1 up
+
+ip -net "$ns2" addr add 10.0.2.1/24 dev ns2eth3
+ip -net "$ns2" addr add dead:beef:2::1/64 dev ns2eth3 nodad
+ip -net "$ns2" link set ns2eth3 up
+ip -net "$ns2" route add default via 10.0.2.2
+ip -net "$ns2" route add default via dead:beef:2::2
+ip netns exec "$ns2" sysctl -q net.ipv4.ip_forward=1
+ip netns exec "$ns2" sysctl -q net.ipv6.conf.all.forwarding=1
+
+ip -net "$ns3" addr add 10.0.2.2/24 dev ns3eth2
+ip -net "$ns3" addr add dead:beef:2::2/64 dev ns3eth2 nodad
+ip -net "$ns3" link set ns3eth2 up
+
+ip -net "$ns3" addr add 10.0.3.2/24 dev ns3eth4
+ip -net "$ns3" addr add dead:beef:3::2/64 dev ns3eth4 nodad
+ip -net "$ns3" link set ns3eth4 up
+ip -net "$ns3" route add default via 10.0.2.1
+ip -net "$ns3" route add default via dead:beef:2::1
+ip netns exec "$ns3" sysctl -q net.ipv4.ip_forward=1
+ip netns exec "$ns3" sysctl -q net.ipv6.conf.all.forwarding=1
+
+ip -net "$ns4" addr add 10.0.3.1/24 dev ns4eth3
+ip -net "$ns4" addr add dead:beef:3::1/64 dev ns4eth3 nodad
+ip -net "$ns4" link set ns4eth3 up
+ip -net "$ns4" route add default via 10.0.3.2
+ip -net "$ns4" route add default via dead:beef:3::2
+
+set_ethtool_flags() {
+ local ns="$1"
+ local dev="$2"
+ local flags="$3"
+
+ ip netns exec $ns ethtool -K $dev $flags 2>/dev/null
+ [ $? -eq 0 ] && echo "INFO: set $ns dev $dev: ethtool -K $flags"
+}
+
+set_random_ethtool_flags() {
+ local flags=""
+ local r=$RANDOM
+
+ local pick1=$((r & 1))
+ local pick2=$((r & 2))
+ local pick3=$((r & 4))
+
+ [ $pick1 -ne 0 ] && flags="tso off"
+ [ $pick2 -ne 0 ] && flags="$flags gso off"
+ [ $pick3 -ne 0 ] && flags="$flags gro off"
+
+ [ -z "$flags" ] && return
+
+ set_ethtool_flags "$1" "$2" "$flags"
+}
+
+if $ethtool_random_on;then
+ set_random_ethtool_flags "$ns3" ns3eth2
+ set_random_ethtool_flags "$ns4" ns4eth3
+else
+ set_ethtool_flags "$ns3" ns3eth2 "$ethtool_args"
+ set_ethtool_flags "$ns4" ns4eth3 "$ethtool_args"
+fi
+
+print_file_err()
+{
+ ls -l "$1" 1>&2
+ echo "Trailing bytes are: "
+ tail -c 27 "$1"
+}
+
+check_transfer()
+{
+ local in=$1
+ local out=$2
+ local what=$3
+
+ cmp "$in" "$out" > /dev/null 2>&1
+ if [ $? -ne 0 ] ;then
+ echo "[ FAIL ] $what does not match (in, out):"
+ print_file_err "$in"
+ print_file_err "$out"
+
+ return 1
+ fi
+
+ return 0
+}
+
+check_mptcp_disabled()
+{
+ local disabled_ns
+ disabled_ns="ns_disabled-$sech-$(mktemp -u XXXXXX)"
+ ip netns add ${disabled_ns} || exit $ksft_skip
+
+ # net.mptcp.enabled should be enabled by default
+ if [ "$(ip netns exec ${disabled_ns} sysctl net.mptcp.enabled | awk '{ print $3 }')" -ne 1 ]; then
+ echo -e "net.mptcp.enabled sysctl is not 1 by default\t\t[ FAIL ]"
+ ret=1
+ return 1
+ fi
+ ip netns exec ${disabled_ns} sysctl -q net.mptcp.enabled=0
+
+ local err=0
+ LANG=C ip netns exec ${disabled_ns} ./mptcp_connect -t $timeout -p 10000 -s MPTCP 127.0.0.1 < "$cin" 2>&1 | \
+ grep -q "^socket: Protocol not available$" && err=1
+ ip netns delete ${disabled_ns}
+
+ if [ ${err} -eq 0 ]; then
+ echo -e "New MPTCP socket cannot be blocked via sysctl\t\t[ FAIL ]"
+ ret=1
+ return 1
+ fi
+
+ echo -e "New MPTCP socket can be blocked via sysctl\t\t[ OK ]"
+ return 0
+}
+
+check_mptcp_ulp_setsockopt()
+{
+ local t retval
+ t="ns_ulp-$sech-$(mktemp -u XXXXXX)"
+
+ ip netns add ${t} || exit $ksft_skip
+ if ! ip netns exec ${t} ./mptcp_connect -u -p 10000 -s TCP 127.0.0.1 2>&1; then
+ printf "setsockopt(..., TCP_ULP, \"mptcp\", ...) allowed\t[ FAIL ]\n"
+ retval=1
+ ret=$retval
+ else
+ printf "setsockopt(..., TCP_ULP, \"mptcp\", ...) blocked\t[ OK ]\n"
+ retval=0
+ fi
+ ip netns del ${t}
+ return $retval
+}
+
+# $1: IP address
+is_v6()
+{
+ [ -z "${1##*:*}" ]
+}
+
+do_ping()
+{
+ local listener_ns="$1"
+ local connector_ns="$2"
+ local connect_addr="$3"
+ local ping_args="-q -c 1"
+
+ if is_v6 "${connect_addr}"; then
+ $ipv6 || return 0
+ ping_args="${ping_args} -6"
+ fi
+
+ ip netns exec ${connector_ns} ping ${ping_args} $connect_addr >/dev/null
+ if [ $? -ne 0 ] ; then
+ echo "$listener_ns -> $connect_addr connectivity [ FAIL ]" 1>&2
+ ret=1
+
+ return 1
+ fi
+
+ return 0
+}
+
+# $1: ns, $2: port
+wait_local_port_listen()
+{
+ local listener_ns="${1}"
+ local port="${2}"
+
+ local port_hex i
+
+ port_hex="$(printf "%04X" "${port}")"
+ for i in $(seq 10); do
+ ip netns exec "${listener_ns}" cat /proc/net/tcp* | \
+ awk "BEGIN {rc=1} {if (\$2 ~ /:${port_hex}\$/ && \$4 ~ /0A/) {rc=0; exit}} END {exit rc}" &&
+ break
+ sleep 0.1
+ done
+}
+
+do_transfer()
+{
+ local listener_ns="$1"
+ local connector_ns="$2"
+ local cl_proto="$3"
+ local srv_proto="$4"
+ local connect_addr="$5"
+ local local_addr="$6"
+ local extra_args=""
+
+ local port
+ port=$((10000+$TEST_COUNT))
+ TEST_COUNT=$((TEST_COUNT+1))
+
+ if [ "$rcvbuf" -gt 0 ]; then
+ extra_args="$extra_args -R $rcvbuf"
+ fi
+
+ if [ "$sndbuf" -gt 0 ]; then
+ extra_args="$extra_args -S $sndbuf"
+ fi
+
+ if [ -n "$testmode" ]; then
+ extra_args="$extra_args -m $testmode"
+ fi
+
+ if [ -n "$extra_args" ] && $options_log; then
+ options_log=false
+ echo "INFO: extra options: $extra_args"
+ fi
+
+ :> "$cout"
+ :> "$sout"
+ :> "$capout"
+
+ local addr_port
+ addr_port=$(printf "%s:%d" ${connect_addr} ${port})
+ printf "%.3s %-5s -> %.3s (%-20s) %-5s\t" ${connector_ns} ${cl_proto} ${listener_ns} ${addr_port} ${srv_proto}
+
+ if $capture; then
+ local capuser
+ if [ -z $SUDO_USER ] ; then
+ capuser=""
+ else
+ capuser="-Z $SUDO_USER"
+ fi
+
+ local capfile="${rndh}-${connector_ns:0:3}-${listener_ns:0:3}-${cl_proto}-${srv_proto}-${connect_addr}-${port}"
+ local capopt="-i any -s 65535 -B 32768 ${capuser}"
+
+ ip netns exec ${listener_ns} tcpdump ${capopt} -w "${capfile}-listener.pcap" >> "${capout}" 2>&1 &
+ local cappid_listener=$!
+
+ ip netns exec ${connector_ns} tcpdump ${capopt} -w "${capfile}-connector.pcap" >> "${capout}" 2>&1 &
+ local cappid_connector=$!
+
+ sleep 1
+ fi
+
+ local stat_synrx_last_l=$(ip netns exec ${listener_ns} nstat -z -a MPTcpExtMPCapableSYNRX | while read a count c rest ;do echo $count;done)
+ local stat_ackrx_last_l=$(ip netns exec ${listener_ns} nstat -z -a MPTcpExtMPCapableACKRX | while read a count c rest ;do echo $count;done)
+ local stat_cookietx_last=$(ip netns exec ${listener_ns} nstat -z -a TcpExtSyncookiesSent | while read a count c rest ;do echo $count;done)
+ local stat_cookierx_last=$(ip netns exec ${listener_ns} nstat -z -a TcpExtSyncookiesRecv | while read a count c rest ;do echo $count;done)
+
+ ip netns exec ${listener_ns} ./mptcp_connect -t $timeout -l -p $port -s ${srv_proto} $extra_args $local_addr < "$sin" > "$sout" &
+ local spid=$!
+
+ wait_local_port_listen "${listener_ns}" "${port}"
+
+ local start
+ start=$(date +%s%3N)
+ ip netns exec ${connector_ns} ./mptcp_connect -t $timeout -p $port -s ${cl_proto} $extra_args $connect_addr < "$cin" > "$cout" &
+ local cpid=$!
+
+ wait $cpid
+ local retc=$?
+ wait $spid
+ local rets=$?
+
+ local stop
+ stop=$(date +%s%3N)
+
+ if $capture; then
+ sleep 1
+ kill ${cappid_listener}
+ kill ${cappid_connector}
+ fi
+
+ local duration
+ duration=$((stop-start))
+ duration=$(printf "(duration %05sms)" $duration)
+ if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then
+ echo "$duration [ FAIL ] client exit code $retc, server $rets" 1>&2
+ echo -e "\nnetns ${listener_ns} socket stat for ${port}:" 1>&2
+ ip netns exec ${listener_ns} ss -nita 1>&2 -o "sport = :$port"
+ echo -e "\nnetns ${connector_ns} socket stat for ${port}:" 1>&2
+ ip netns exec ${connector_ns} ss -nita 1>&2 -o "dport = :$port"
+
+ cat "$capout"
+ return 1
+ fi
+
+ check_transfer $sin $cout "file received by client"
+ retc=$?
+ check_transfer $cin $sout "file received by server"
+ rets=$?
+
+ local stat_synrx_now_l=$(ip netns exec ${listener_ns} nstat -z -a MPTcpExtMPCapableSYNRX | while read a count c rest ;do echo $count;done)
+ local stat_ackrx_now_l=$(ip netns exec ${listener_ns} nstat -z -a MPTcpExtMPCapableACKRX | while read a count c rest ;do echo $count;done)
+
+ local stat_cookietx_now=$(ip netns exec ${listener_ns} nstat -z -a TcpExtSyncookiesSent | while read a count c rest ;do echo $count;done)
+ local stat_cookierx_now=$(ip netns exec ${listener_ns} nstat -z -a TcpExtSyncookiesRecv | while read a count c rest ;do echo $count;done)
+
+ expect_synrx=$((stat_synrx_last_l))
+ expect_ackrx=$((stat_ackrx_last_l))
+
+ cookies=$(ip netns exec ${listener_ns} sysctl net.ipv4.tcp_syncookies)
+ cookies=${cookies##*=}
+
+ if [ ${cl_proto} = "MPTCP" ] && [ ${srv_proto} = "MPTCP" ]; then
+ expect_synrx=$((stat_synrx_last_l+1))
+ expect_ackrx=$((stat_ackrx_last_l+1))
+ fi
+ if [ $cookies -eq 2 ];then
+ if [ $stat_cookietx_last -ge $stat_cookietx_now ] ;then
+ echo "${listener_ns} CookieSent: ${cl_proto} -> ${srv_proto}: did not advance"
+ fi
+ if [ $stat_cookierx_last -ge $stat_cookierx_now ] ;then
+ echo "${listener_ns} CookieRecv: ${cl_proto} -> ${srv_proto}: did not advance"
+ fi
+ else
+ if [ $stat_cookietx_last -ne $stat_cookietx_now ] ;then
+ echo "${listener_ns} CookieSent: ${cl_proto} -> ${srv_proto}: changed"
+ fi
+ if [ $stat_cookierx_last -ne $stat_cookierx_now ] ;then
+ echo "${listener_ns} CookieRecv: ${cl_proto} -> ${srv_proto}: changed"
+ fi
+ fi
+
+ if [ $expect_synrx -ne $stat_synrx_now_l ] ;then
+ echo "${listener_ns} SYNRX: ${cl_proto} -> ${srv_proto}: expect ${expect_synrx}, got ${stat_synrx_now_l}"
+ fi
+ if [ $expect_ackrx -ne $stat_ackrx_now_l ] ;then
+ echo "${listener_ns} ACKRX: ${cl_proto} -> ${srv_proto}: expect ${expect_ackrx}, got ${stat_ackrx_now_l} "
+ fi
+
+ if [ $retc -eq 0 ] && [ $rets -eq 0 ];then
+ echo "$duration [ OK ]"
+ cat "$capout"
+ return 0
+ fi
+
+ cat "$capout"
+ return 1
+}
+
+make_file()
+{
+ local name=$1
+ local who=$2
+ local SIZE=$filesize
+ local ksize
+ local rem
+
+ if [ $SIZE -eq 0 ]; then
+ local MAXSIZE=$((1024 * 1024 * 8))
+ local MINSIZE=$((1024 * 256))
+
+ SIZE=$(((RANDOM * RANDOM + MINSIZE) % MAXSIZE))
+ fi
+
+ ksize=$((SIZE / 1024))
+ rem=$((SIZE - (ksize * 1024)))
+
+ dd if=/dev/urandom of="$name" bs=1024 count=$ksize 2> /dev/null
+ dd if=/dev/urandom conv=notrunc of="$name" bs=1 count=$rem 2> /dev/null
+ echo -e "\nMPTCP_TEST_FILE_END_MARKER" >> "$name"
+
+ echo "Created $name (size $(du -b "$name")) containing data sent by $who"
+}
+
+run_tests_lo()
+{
+ local listener_ns="$1"
+ local connector_ns="$2"
+ local connect_addr="$3"
+ local loopback="$4"
+ local lret=0
+
+ # skip if test programs are running inside same netns for subsequent runs.
+ if [ $loopback -eq 0 ] && [ ${listener_ns} = ${connector_ns} ]; then
+ return 0
+ fi
+
+ # skip if we don't want v6
+ if ! $ipv6 && is_v6 "${connect_addr}"; then
+ return 0
+ fi
+
+ local local_addr
+ if is_v6 "${connect_addr}"; then
+ local_addr="::"
+ else
+ local_addr="0.0.0.0"
+ fi
+
+ do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} ${local_addr}
+ lret=$?
+ if [ $lret -ne 0 ]; then
+ ret=$lret
+ return 1
+ fi
+
+ if [ $do_tcp -eq 0 ]; then
+ # don't bother testing fallback tcp except for loopback case.
+ if [ ${listener_ns} != ${connector_ns} ]; then
+ return 0
+ fi
+ fi
+
+ do_transfer ${listener_ns} ${connector_ns} MPTCP TCP ${connect_addr} ${local_addr}
+ lret=$?
+ if [ $lret -ne 0 ]; then
+ ret=$lret
+ return 1
+ fi
+
+ do_transfer ${listener_ns} ${connector_ns} TCP MPTCP ${connect_addr} ${local_addr}
+ lret=$?
+ if [ $lret -ne 0 ]; then
+ ret=$lret
+ return 1
+ fi
+
+ if [ $do_tcp -gt 1 ] ;then
+ do_transfer ${listener_ns} ${connector_ns} TCP TCP ${connect_addr} ${local_addr}
+ lret=$?
+ if [ $lret -ne 0 ]; then
+ ret=$lret
+ return 1
+ fi
+ fi
+
+ return 0
+}
+
+run_tests()
+{
+ run_tests_lo $1 $2 $3 0
+}
+
+make_file "$cin" "client"
+make_file "$sin" "server"
+
+check_mptcp_disabled
+
+check_mptcp_ulp_setsockopt
+
+echo "INFO: validating network environment with pings"
+for sender in "$ns1" "$ns2" "$ns3" "$ns4";do
+ do_ping "$ns1" $sender 10.0.1.1
+ do_ping "$ns1" $sender dead:beef:1::1
+
+ do_ping "$ns2" $sender 10.0.1.2
+ do_ping "$ns2" $sender dead:beef:1::2
+ do_ping "$ns2" $sender 10.0.2.1
+ do_ping "$ns2" $sender dead:beef:2::1
+
+ do_ping "$ns3" $sender 10.0.2.2
+ do_ping "$ns3" $sender dead:beef:2::2
+ do_ping "$ns3" $sender 10.0.3.2
+ do_ping "$ns3" $sender dead:beef:3::2
+
+ do_ping "$ns4" $sender 10.0.3.1
+ do_ping "$ns4" $sender dead:beef:3::1
+done
+
+[ -n "$tc_loss" ] && tc -net "$ns2" qdisc add dev ns2eth3 root netem loss random $tc_loss delay ${tc_delay}ms
+echo -n "INFO: Using loss of $tc_loss "
+test "$tc_delay" -gt 0 && echo -n "delay $tc_delay ms "
+
+reorder_delay=$(($tc_delay / 4))
+
+if [ -z "${tc_reorder}" ]; then
+ reorder1=$((RANDOM%10))
+ reorder1=$((100 - reorder1))
+ reorder2=$((RANDOM%100))
+
+ if [ $reorder_delay -gt 0 ] && [ $reorder1 -lt 100 ] && [ $reorder2 -gt 0 ]; then
+ tc_reorder="reorder ${reorder1}% ${reorder2}%"
+ echo -n "$tc_reorder with delay ${reorder_delay}ms "
+ fi
+elif [ "$tc_reorder" = "0" ];then
+ tc_reorder=""
+elif [ "$reorder_delay" -gt 0 ];then
+ # reordering requires some delay
+ tc_reorder="reorder $tc_reorder"
+ echo -n "$tc_reorder with delay ${reorder_delay}ms "
+fi
+
+echo "on ns3eth4"
+
+tc -net "$ns3" qdisc add dev ns3eth4 root netem delay ${reorder_delay}ms $tc_reorder
+
+for sender in $ns1 $ns2 $ns3 $ns4;do
+ run_tests_lo "$ns1" "$sender" 10.0.1.1 1
+ if [ $ret -ne 0 ] ;then
+ echo "FAIL: Could not even run loopback test" 1>&2
+ exit $ret
+ fi
+ run_tests_lo "$ns1" $sender dead:beef:1::1 1
+ if [ $ret -ne 0 ] ;then
+ echo "FAIL: Could not even run loopback v6 test" 2>&1
+ exit $ret
+ fi
+
+ # ns1<->ns2 is not subject to reordering/tc delays. Use it to test
+ # mptcp syncookie support.
+ if [ $sender = $ns1 ]; then
+ ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=2
+ else
+ ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=1
+ fi
+
+ run_tests "$ns2" $sender 10.0.1.2
+ run_tests "$ns2" $sender dead:beef:1::2
+ run_tests "$ns2" $sender 10.0.2.1
+ run_tests "$ns2" $sender dead:beef:2::1
+
+ run_tests "$ns3" $sender 10.0.2.2
+ run_tests "$ns3" $sender dead:beef:2::2
+ run_tests "$ns3" $sender 10.0.3.2
+ run_tests "$ns3" $sender dead:beef:3::2
+
+ run_tests "$ns4" $sender 10.0.3.1
+ run_tests "$ns4" $sender dead:beef:3::1
+done
+
+time_end=$(date +%s)
+time_run=$((time_end-time_start))
+
+echo "Time: ${time_run} seconds"
+
+exit $ret
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
new file mode 100755
index 0000000..08f53d8
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -0,0 +1,602 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ret=0
+sin=""
+sout=""
+cin=""
+cout=""
+ksft_skip=4
+timeout=30
+mptcp_connect=""
+capture=0
+
+TEST_COUNT=0
+
+init()
+{
+ capout=$(mktemp)
+
+ rndh=$(printf %x $sec)-$(mktemp -u XXXXXX)
+
+ ns1="ns1-$rndh"
+ ns2="ns2-$rndh"
+
+ for netns in "$ns1" "$ns2";do
+ ip netns add $netns || exit $ksft_skip
+ ip -net $netns link set lo up
+ ip netns exec $netns sysctl -q net.mptcp.enabled=1
+ ip netns exec $netns sysctl -q net.ipv4.conf.all.rp_filter=0
+ ip netns exec $netns sysctl -q net.ipv4.conf.default.rp_filter=0
+ done
+
+ # ns1 ns2
+ # ns1eth1 ns2eth1
+ # ns1eth2 ns2eth2
+ # ns1eth3 ns2eth3
+ # ns1eth4 ns2eth4
+
+ for i in `seq 1 4`; do
+ ip link add ns1eth$i netns "$ns1" type veth peer name ns2eth$i netns "$ns2"
+ ip -net "$ns1" addr add 10.0.$i.1/24 dev ns1eth$i
+ ip -net "$ns1" addr add dead:beef:$i::1/64 dev ns1eth$i nodad
+ ip -net "$ns1" link set ns1eth$i up
+
+ ip -net "$ns2" addr add 10.0.$i.2/24 dev ns2eth$i
+ ip -net "$ns2" addr add dead:beef:$i::2/64 dev ns2eth$i nodad
+ ip -net "$ns2" link set ns2eth$i up
+
+ # let $ns2 reach any $ns1 address from any interface
+ ip -net "$ns2" route add default via 10.0.$i.1 dev ns2eth$i metric 10$i
+ done
+}
+
+cleanup_partial()
+{
+ rm -f "$capout"
+
+ for netns in "$ns1" "$ns2"; do
+ ip netns del $netns
+ done
+}
+
+cleanup()
+{
+ rm -f "$cin" "$cout"
+ rm -f "$sin" "$sout"
+ cleanup_partial
+}
+
+reset()
+{
+ cleanup_partial
+ init
+}
+
+reset_with_cookies()
+{
+ reset
+
+ for netns in "$ns1" "$ns2";do
+ ip netns exec $netns sysctl -q net.ipv4.tcp_syncookies=2
+ done
+}
+
+for arg in "$@"; do
+ if [ "$arg" = "-c" ]; then
+ capture=1
+ fi
+done
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without ip tool"
+ exit $ksft_skip
+fi
+
+
+check_transfer()
+{
+ in=$1
+ out=$2
+ what=$3
+
+ cmp "$in" "$out" > /dev/null 2>&1
+ if [ $? -ne 0 ] ;then
+ echo "[ FAIL ] $what does not match (in, out):"
+ print_file_err "$in"
+ print_file_err "$out"
+
+ return 1
+ fi
+
+ return 0
+}
+
+do_ping()
+{
+ listener_ns="$1"
+ connector_ns="$2"
+ connect_addr="$3"
+
+ ip netns exec ${connector_ns} ping -q -c 1 $connect_addr >/dev/null
+ if [ $? -ne 0 ] ; then
+ echo "$listener_ns -> $connect_addr connectivity [ FAIL ]" 1>&2
+ ret=1
+ fi
+}
+
+do_transfer()
+{
+ listener_ns="$1"
+ connector_ns="$2"
+ cl_proto="$3"
+ srv_proto="$4"
+ connect_addr="$5"
+ rm_nr_ns1="$6"
+ rm_nr_ns2="$7"
+
+ port=$((10000+$TEST_COUNT))
+ TEST_COUNT=$((TEST_COUNT+1))
+
+ :> "$cout"
+ :> "$sout"
+ :> "$capout"
+
+ if [ $capture -eq 1 ]; then
+ if [ -z $SUDO_USER ] ; then
+ capuser=""
+ else
+ capuser="-Z $SUDO_USER"
+ fi
+
+ capfile=$(printf "mp_join-%02u-%s.pcap" "$TEST_COUNT" "${listener_ns}")
+
+ echo "Capturing traffic for test $TEST_COUNT into $capfile"
+ ip netns exec ${listener_ns} tcpdump -i any -s 65535 -B 32768 $capuser -w $capfile > "$capout" 2>&1 &
+ cappid=$!
+
+ sleep 1
+ fi
+
+ if [[ $rm_nr_ns1 -eq 0 && $rm_nr_ns2 -eq 0 ]]; then
+ mptcp_connect="./mptcp_connect -j"
+ else
+ mptcp_connect="./mptcp_connect -r"
+ fi
+
+ ip netns exec ${listener_ns} $mptcp_connect -t $timeout -l -p $port -s ${srv_proto} 0.0.0.0 < "$sin" > "$sout" &
+ spid=$!
+
+ sleep 1
+
+ ip netns exec ${connector_ns} $mptcp_connect -t $timeout -p $port -s ${cl_proto} $connect_addr < "$cin" > "$cout" &
+ cpid=$!
+
+ if [ $rm_nr_ns1 -gt 0 ]; then
+ counter=1
+ sleep 1
+
+ while [ $counter -le $rm_nr_ns1 ]
+ do
+ ip netns exec ${listener_ns} ./pm_nl_ctl del $counter
+ sleep 1
+ let counter+=1
+ done
+ fi
+
+ if [ $rm_nr_ns2 -gt 0 ]; then
+ counter=1
+ sleep 1
+
+ while [ $counter -le $rm_nr_ns2 ]
+ do
+ ip netns exec ${connector_ns} ./pm_nl_ctl del $counter
+ sleep 1
+ let counter+=1
+ done
+ fi
+
+ wait $cpid
+ retc=$?
+ wait $spid
+ rets=$?
+
+ if [ $capture -eq 1 ]; then
+ sleep 1
+ kill $cappid
+ fi
+
+ if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then
+ echo " client exit code $retc, server $rets" 1>&2
+ echo -e "\nnetns ${listener_ns} socket stat for ${port}:" 1>&2
+ ip netns exec ${listener_ns} ss -nita 1>&2 -o "sport = :$port"
+ echo -e "\nnetns ${connector_ns} socket stat for ${port}:" 1>&2
+ ip netns exec ${connector_ns} ss -nita 1>&2 -o "dport = :$port"
+
+ cat "$capout"
+ return 1
+ fi
+
+ check_transfer $sin $cout "file received by client"
+ retc=$?
+ check_transfer $cin $sout "file received by server"
+ rets=$?
+
+ if [ $retc -eq 0 ] && [ $rets -eq 0 ];then
+ cat "$capout"
+ return 0
+ fi
+
+ cat "$capout"
+ return 1
+}
+
+make_file()
+{
+ name=$1
+ who=$2
+
+ SIZE=1
+
+ dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null
+ echo -e "\nMPTCP_TEST_FILE_END_MARKER" >> "$name"
+
+ echo "Created $name (size $SIZE KB) containing data sent by $who"
+}
+
+run_tests()
+{
+ listener_ns="$1"
+ connector_ns="$2"
+ connect_addr="$3"
+ lret=0
+
+ do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} 0 0
+ lret=$?
+ if [ $lret -ne 0 ]; then
+ ret=$lret
+ return
+ fi
+}
+
+run_remove_tests()
+{
+ listener_ns="$1"
+ connector_ns="$2"
+ connect_addr="$3"
+ rm_nr_ns1="$4"
+ rm_nr_ns2="$5"
+ lret=0
+
+ do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} ${rm_nr_ns1} ${rm_nr_ns2}
+ lret=$?
+ if [ $lret -ne 0 ]; then
+ ret=$lret
+ return
+ fi
+}
+
+chk_join_nr()
+{
+ local msg="$1"
+ local syn_nr=$2
+ local syn_ack_nr=$3
+ local ack_nr=$4
+ local count
+ local dump_stats
+
+ printf "%02u %-36s %s" "$TEST_COUNT" "$msg" "syn"
+ count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMPJoinSynRx | awk '{print $2}'`
+ [ -z "$count" ] && count=0
+ if [ "$count" != "$syn_nr" ]; then
+ echo "[fail] got $count JOIN[s] syn expected $syn_nr"
+ ret=1
+ dump_stats=1
+ else
+ echo -n "[ ok ]"
+ fi
+
+ echo -n " - synack"
+ count=`ip netns exec $ns2 nstat -as | grep MPTcpExtMPJoinSynAckRx | awk '{print $2}'`
+ [ -z "$count" ] && count=0
+ if [ "$count" != "$syn_ack_nr" ]; then
+ echo "[fail] got $count JOIN[s] synack expected $syn_ack_nr"
+ ret=1
+ dump_stats=1
+ else
+ echo -n "[ ok ]"
+ fi
+
+ echo -n " - ack"
+ count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMPJoinAckRx | awk '{print $2}'`
+ [ -z "$count" ] && count=0
+ if [ "$count" != "$ack_nr" ]; then
+ echo "[fail] got $count JOIN[s] ack expected $ack_nr"
+ ret=1
+ dump_stats=1
+ else
+ echo "[ ok ]"
+ fi
+ if [ "${dump_stats}" = 1 ]; then
+ echo Server ns stats
+ ip netns exec $ns1 nstat -as | grep MPTcp
+ echo Client ns stats
+ ip netns exec $ns2 nstat -as | grep MPTcp
+ fi
+}
+
+chk_add_nr()
+{
+ local add_nr=$1
+ local echo_nr=$2
+ local count
+ local dump_stats
+
+ printf "%-39s %s" " " "add"
+ count=`ip netns exec $ns2 nstat -as | grep MPTcpExtAddAddr | awk '{print $2}'`
+ [ -z "$count" ] && count=0
+ if [ "$count" != "$add_nr" ]; then
+ echo "[fail] got $count ADD_ADDR[s] expected $add_nr"
+ ret=1
+ dump_stats=1
+ else
+ echo -n "[ ok ]"
+ fi
+
+ echo -n " - echo "
+ count=`ip netns exec $ns1 nstat -as | grep MPTcpExtEchoAdd | awk '{print $2}'`
+ [ -z "$count" ] && count=0
+ if [ "$count" != "$echo_nr" ]; then
+ echo "[fail] got $count ADD_ADDR echo[s] expected $echo_nr"
+ ret=1
+ dump_stats=1
+ else
+ echo "[ ok ]"
+ fi
+
+ if [ "${dump_stats}" = 1 ]; then
+ echo Server ns stats
+ ip netns exec $ns1 nstat -as | grep MPTcp
+ echo Client ns stats
+ ip netns exec $ns2 nstat -as | grep MPTcp
+ fi
+}
+
+chk_rm_nr()
+{
+ local rm_addr_nr=$1
+ local rm_subflow_nr=$2
+ local count
+ local dump_stats
+
+ printf "%-39s %s" " " "rm "
+ count=`ip netns exec $ns1 nstat -as | grep MPTcpExtRmAddr | awk '{print $2}'`
+ [ -z "$count" ] && count=0
+ if [ "$count" != "$rm_addr_nr" ]; then
+ echo "[fail] got $count RM_ADDR[s] expected $rm_addr_nr"
+ ret=1
+ dump_stats=1
+ else
+ echo -n "[ ok ]"
+ fi
+
+ echo -n " - sf "
+ count=`ip netns exec $ns2 nstat -as | grep MPTcpExtRmSubflow | awk '{print $2}'`
+ [ -z "$count" ] && count=0
+ if [ "$count" != "$rm_subflow_nr" ]; then
+ echo "[fail] got $count RM_SUBFLOW[s] expected $rm_subflow_nr"
+ ret=1
+ dump_stats=1
+ else
+ echo "[ ok ]"
+ fi
+
+ if [ "${dump_stats}" = 1 ]; then
+ echo Server ns stats
+ ip netns exec $ns1 nstat -as | grep MPTcp
+ echo Client ns stats
+ ip netns exec $ns2 nstat -as | grep MPTcp
+ fi
+}
+
+sin=$(mktemp)
+sout=$(mktemp)
+cin=$(mktemp)
+cout=$(mktemp)
+init
+make_file "$cin" "client"
+make_file "$sin" "server"
+trap cleanup EXIT
+
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "no JOIN" "0" "0" "0"
+
+# subflow limted by client
+reset
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "single subflow, limited by client" 0 0 0
+
+# subflow limted by server
+reset
+ip netns exec $ns2 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "single subflow, limited by server" 1 1 0
+
+# subflow
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "single subflow" 1 1 1
+
+# multiple subflows
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "multiple subflows" 2 2 2
+
+# multiple subflows limited by serverf
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "multiple subflows, limited by server" 2 2 1
+
+# add_address, unused
+reset
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "unused signal address" 0 0 0
+chk_add_nr 1 1
+
+# accept and use add_addr
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl limits 1 1
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "signal address" 1 1 1
+chk_add_nr 1 1
+
+# accept and use add_addr with an additional subflow
+# note: signal address in server ns and local addresses in client ns must
+# belong to different subnets or one of the listed local address could be
+# used for 'add_addr' subflow
+reset
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+ip netns exec $ns1 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl limits 1 2
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "subflow and signal" 2 2 2
+chk_add_nr 1 1
+
+# accept and use add_addr with additional subflows
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 3
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+ip netns exec $ns2 ./pm_nl_ctl limits 1 3
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "multiple subflows and signal" 3 3 3
+chk_add_nr 1 1
+
+# single subflow, remove
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+run_remove_tests $ns1 $ns2 10.0.1.1 0 1
+chk_join_nr "remove single subflow" 1 1 1
+chk_rm_nr 1 1
+
+# multiple subflows, remove
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+run_remove_tests $ns1 $ns2 10.0.1.1 0 2
+chk_join_nr "remove multiple subflows" 2 2 2
+chk_rm_nr 2 2
+
+# single address, remove
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 1
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+ip netns exec $ns2 ./pm_nl_ctl limits 1 1
+run_remove_tests $ns1 $ns2 10.0.1.1 1 0
+chk_join_nr "remove single address" 1 1 1
+chk_add_nr 1 1
+chk_rm_nr 0 0
+
+# subflow and signal, remove
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 2
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+ip netns exec $ns2 ./pm_nl_ctl limits 1 2
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+run_remove_tests $ns1 $ns2 10.0.1.1 1 1
+chk_join_nr "remove subflow and signal" 2 2 2
+chk_add_nr 1 1
+chk_rm_nr 1 1
+
+# subflows and signal, remove
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 3
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+ip netns exec $ns2 ./pm_nl_ctl limits 1 3
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
+run_remove_tests $ns1 $ns2 10.0.1.1 1 2
+chk_join_nr "remove subflows and signal" 3 3 3
+chk_add_nr 1 1
+chk_rm_nr 2 2
+
+# single subflow, syncookies
+reset_with_cookies
+ip netns exec $ns1 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "single subflow with syn cookies" 1 1 1
+
+# multiple subflows with syn cookies
+reset_with_cookies
+ip netns exec $ns1 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "multiple subflows with syn cookies" 2 2 2
+
+# multiple subflows limited by server
+reset_with_cookies
+ip netns exec $ns1 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "subflows limited by server w cookies" 2 2 1
+
+# test signal address with cookies
+reset_with_cookies
+ip netns exec $ns1 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl limits 1 1
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "signal address with syn cookies" 1 1 1
+chk_add_nr 1 1
+
+# test cookie with subflow and signal
+reset_with_cookies
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+ip netns exec $ns1 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl limits 1 2
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "subflow and signal w cookies" 2 2 2
+chk_add_nr 1 1
+
+# accept and use add_addr with additional subflows
+reset_with_cookies
+ip netns exec $ns1 ./pm_nl_ctl limits 0 3
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+ip netns exec $ns2 ./pm_nl_ctl limits 1 3
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "subflows and signal w. cookies" 3 3 3
+chk_add_nr 1 1
+
+exit $ret
diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh
new file mode 100755
index 0000000..15f4f46
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ksft_skip=4
+ret=0
+
+usage() {
+ echo "Usage: $0 [ -h ]"
+}
+
+
+while getopts "$optstring" option;do
+ case "$option" in
+ "h")
+ usage $0
+ exit 0
+ ;;
+ "?")
+ usage $0
+ exit 1
+ ;;
+ esac
+done
+
+sec=$(date +%s)
+rndh=$(printf %x $sec)-$(mktemp -u XXXXXX)
+ns1="ns1-$rndh"
+err=$(mktemp)
+ret=0
+
+cleanup()
+{
+ rm -f $err
+ ip netns del $ns1
+}
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without ip tool"
+ exit $ksft_skip
+fi
+
+trap cleanup EXIT
+
+ip netns add $ns1 || exit $ksft_skip
+ip -net $ns1 link set lo up
+ip netns exec $ns1 sysctl -q net.mptcp.enabled=1
+
+check()
+{
+ local cmd="$1"
+ local expected="$2"
+ local msg="$3"
+ local out=`$cmd 2>$err`
+ local cmd_ret=$?
+
+ printf "%-50s %s" "$msg"
+ if [ $cmd_ret -ne 0 ]; then
+ echo "[FAIL] command execution '$cmd' stderr "
+ cat $err
+ ret=1
+ elif [ "$out" = "$expected" ]; then
+ echo "[ OK ]"
+ else
+ echo -n "[FAIL] "
+ echo "expected '$expected' got '$out'"
+ ret=1
+ fi
+}
+
+check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "defaults addr list"
+check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0
+subflows 0" "defaults limits"
+
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 flags subflow dev lo
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 flags signal,backup
+check "ip netns exec $ns1 ./pm_nl_ctl get 1" "id 1 flags 10.0.1.1" "simple add/get addr"
+
+check "ip netns exec $ns1 ./pm_nl_ctl dump" \
+"id 1 flags 10.0.1.1
+id 2 flags subflow dev lo 10.0.1.2
+id 3 flags signal,backup 10.0.1.3" "dump addrs"
+
+ip netns exec $ns1 ./pm_nl_ctl del 2
+check "ip netns exec $ns1 ./pm_nl_ctl get 2" "" "simple del addr"
+check "ip netns exec $ns1 ./pm_nl_ctl dump" \
+"id 1 flags 10.0.1.1
+id 3 flags signal,backup 10.0.1.3" "dump addrs after del"
+
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3
+check "ip netns exec $ns1 ./pm_nl_ctl get 4" "" "duplicate addr"
+
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4 id 10 flags signal
+check "ip netns exec $ns1 ./pm_nl_ctl get 4" "id 4 flags signal 10.0.1.4" "id addr increment"
+
+for i in `seq 5 9`; do
+ ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.$i flags signal >/dev/null 2>&1
+done
+check "ip netns exec $ns1 ./pm_nl_ctl get 9" "id 9 flags signal 10.0.1.9" "hard addr limit"
+check "ip netns exec $ns1 ./pm_nl_ctl get 10" "" "above hard addr limit"
+
+for i in `seq 9 256`; do
+ ip netns exec $ns1 ./pm_nl_ctl del $i
+ ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9
+done
+check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.1.1
+id 3 flags signal,backup 10.0.1.3
+id 4 flags signal 10.0.1.4
+id 5 flags signal 10.0.1.5
+id 6 flags signal 10.0.1.6
+id 7 flags signal 10.0.1.7
+id 8 flags signal 10.0.1.8" "id limit"
+
+ip netns exec $ns1 ./pm_nl_ctl flush
+check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "flush addrs"
+
+ip netns exec $ns1 ./pm_nl_ctl limits 9 1
+check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0
+subflows 0" "rcv addrs above hard limit"
+
+ip netns exec $ns1 ./pm_nl_ctl limits 1 9
+check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0
+subflows 0" "subflows above hard limit"
+
+ip netns exec $ns1 ./pm_nl_ctl limits 8 8
+check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 8
+subflows 8" "set limits"
+
+exit $ret
diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c
new file mode 100644
index 0000000..b24a2f1
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c
@@ -0,0 +1,616 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <errno.h>
+#include <error.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include <arpa/inet.h>
+#include <net/if.h>
+
+#include <linux/rtnetlink.h>
+#include <linux/genetlink.h>
+
+#include "linux/mptcp.h"
+
+#ifndef MPTCP_PM_NAME
+#define MPTCP_PM_NAME "mptcp_pm"
+#endif
+
+static void syntax(char *argv[])
+{
+ fprintf(stderr, "%s add|get|del|flush|dump|accept [<args>]\n", argv[0]);
+ fprintf(stderr, "\tadd [flags signal|subflow|backup] [id <nr>] [dev <name>] <ip>\n");
+ fprintf(stderr, "\tdel <id>\n");
+ fprintf(stderr, "\tget <id>\n");
+ fprintf(stderr, "\tflush\n");
+ fprintf(stderr, "\tdump\n");
+ fprintf(stderr, "\tlimits [<rcv addr max> <subflow max>]\n");
+ exit(0);
+}
+
+static int init_genl_req(char *data, int family, int cmd, int version)
+{
+ struct nlmsghdr *nh = (void *)data;
+ struct genlmsghdr *gh;
+ int off = 0;
+
+ nh->nlmsg_type = family;
+ nh->nlmsg_flags = NLM_F_REQUEST;
+ nh->nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
+ off += NLMSG_ALIGN(sizeof(*nh));
+
+ gh = (void *)(data + off);
+ gh->cmd = cmd;
+ gh->version = version;
+ off += NLMSG_ALIGN(sizeof(*gh));
+ return off;
+}
+
+static void nl_error(struct nlmsghdr *nh)
+{
+ struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(nh);
+ int len = nh->nlmsg_len - sizeof(*nh);
+ uint32_t off;
+
+ if (len < sizeof(struct nlmsgerr))
+ error(1, 0, "netlink error message truncated %d min %ld", len,
+ sizeof(struct nlmsgerr));
+
+ if (!err->error) {
+ /* check messages from kernel */
+ struct rtattr *attrs = (struct rtattr *)NLMSG_DATA(nh);
+
+ while (RTA_OK(attrs, len)) {
+ if (attrs->rta_type == NLMSGERR_ATTR_MSG)
+ fprintf(stderr, "netlink ext ack msg: %s\n",
+ (char *)RTA_DATA(attrs));
+ if (attrs->rta_type == NLMSGERR_ATTR_OFFS) {
+ memcpy(&off, RTA_DATA(attrs), 4);
+ fprintf(stderr, "netlink err off %d\n",
+ (int)off);
+ }
+ attrs = RTA_NEXT(attrs, len);
+ }
+ } else {
+ fprintf(stderr, "netlink error %d", err->error);
+ }
+}
+
+/* do a netlink command and, if max > 0, fetch the reply */
+static int do_nl_req(int fd, struct nlmsghdr *nh, int len, int max)
+{
+ struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+ socklen_t addr_len;
+ void *data = nh;
+ int rem, ret;
+ int err = 0;
+
+ nh->nlmsg_len = len;
+ ret = sendto(fd, data, len, 0, (void *)&nladdr, sizeof(nladdr));
+ if (ret != len)
+ error(1, errno, "send netlink: %uB != %uB\n", ret, len);
+ if (max == 0)
+ return 0;
+
+ addr_len = sizeof(nladdr);
+ rem = ret = recvfrom(fd, data, max, 0, (void *)&nladdr, &addr_len);
+ if (ret < 0)
+ error(1, errno, "recv netlink: %uB\n", ret);
+
+ /* Beware: the NLMSG_NEXT macro updates the 'rem' argument */
+ for (; NLMSG_OK(nh, rem); nh = NLMSG_NEXT(nh, rem)) {
+ if (nh->nlmsg_type == NLMSG_ERROR) {
+ nl_error(nh);
+ err = 1;
+ }
+ }
+ if (err)
+ error(1, 0, "bailing out due to netlink error[s]");
+ return ret;
+}
+
+static int genl_parse_getfamily(struct nlmsghdr *nlh)
+{
+ struct genlmsghdr *ghdr = NLMSG_DATA(nlh);
+ int len = nlh->nlmsg_len;
+ struct rtattr *attrs;
+
+ if (nlh->nlmsg_type != GENL_ID_CTRL)
+ error(1, errno, "Not a controller message, len=%d type=0x%x\n",
+ nlh->nlmsg_len, nlh->nlmsg_type);
+
+ len -= NLMSG_LENGTH(GENL_HDRLEN);
+
+ if (len < 0)
+ error(1, errno, "wrong controller message len %d\n", len);
+
+ if (ghdr->cmd != CTRL_CMD_NEWFAMILY)
+ error(1, errno, "Unknown controller command %d\n", ghdr->cmd);
+
+ attrs = (struct rtattr *) ((char *) ghdr + GENL_HDRLEN);
+ while (RTA_OK(attrs, len)) {
+ if (attrs->rta_type == CTRL_ATTR_FAMILY_ID)
+ return *(__u16 *)RTA_DATA(attrs);
+ attrs = RTA_NEXT(attrs, len);
+ }
+
+ error(1, errno, "can't find CTRL_ATTR_FAMILY_ID attr");
+ return -1;
+}
+
+static int resolve_mptcp_pm_netlink(int fd)
+{
+ char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+ NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+ 1024];
+ struct nlmsghdr *nh;
+ struct rtattr *rta;
+ int namelen;
+ int off = 0;
+
+ memset(data, 0, sizeof(data));
+ nh = (void *)data;
+ off = init_genl_req(data, GENL_ID_CTRL, CTRL_CMD_GETFAMILY, 0);
+
+ rta = (void *)(data + off);
+ namelen = strlen(MPTCP_PM_NAME) + 1;
+ rta->rta_type = CTRL_ATTR_FAMILY_NAME;
+ rta->rta_len = RTA_LENGTH(namelen);
+ memcpy(RTA_DATA(rta), MPTCP_PM_NAME, namelen);
+ off += NLMSG_ALIGN(rta->rta_len);
+
+ do_nl_req(fd, nh, off, sizeof(data));
+ return genl_parse_getfamily((void *)data);
+}
+
+int add_addr(int fd, int pm_family, int argc, char *argv[])
+{
+ char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+ NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+ 1024];
+ struct rtattr *rta, *nest;
+ struct nlmsghdr *nh;
+ u_int16_t family;
+ u_int32_t flags;
+ int nest_start;
+ u_int8_t id;
+ int off = 0;
+ int arg;
+
+ memset(data, 0, sizeof(data));
+ nh = (void *)data;
+ off = init_genl_req(data, pm_family, MPTCP_PM_CMD_ADD_ADDR,
+ MPTCP_PM_VER);
+
+ if (argc < 3)
+ syntax(argv);
+
+ nest_start = off;
+ nest = (void *)(data + off);
+ nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR;
+ nest->rta_len = RTA_LENGTH(0);
+ off += NLMSG_ALIGN(nest->rta_len);
+
+ /* addr data */
+ rta = (void *)(data + off);
+ if (inet_pton(AF_INET, argv[2], RTA_DATA(rta))) {
+ family = AF_INET;
+ rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR4;
+ rta->rta_len = RTA_LENGTH(4);
+ } else if (inet_pton(AF_INET6, argv[2], RTA_DATA(rta))) {
+ family = AF_INET6;
+ rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR6;
+ rta->rta_len = RTA_LENGTH(16);
+ } else
+ error(1, errno, "can't parse ip %s", argv[2]);
+ off += NLMSG_ALIGN(rta->rta_len);
+
+ /* family */
+ rta = (void *)(data + off);
+ rta->rta_type = MPTCP_PM_ADDR_ATTR_FAMILY;
+ rta->rta_len = RTA_LENGTH(2);
+ memcpy(RTA_DATA(rta), &family, 2);
+ off += NLMSG_ALIGN(rta->rta_len);
+
+ for (arg = 3; arg < argc; arg++) {
+ if (!strcmp(argv[arg], "flags")) {
+ char *tok, *str;
+
+ /* flags */
+ flags = 0;
+ if (++arg >= argc)
+ error(1, 0, " missing flags value");
+
+ /* do not support flag list yet */
+ for (str = argv[arg]; (tok = strtok(str, ","));
+ str = NULL) {
+ if (!strcmp(tok, "subflow"))
+ flags |= MPTCP_PM_ADDR_FLAG_SUBFLOW;
+ else if (!strcmp(tok, "signal"))
+ flags |= MPTCP_PM_ADDR_FLAG_SIGNAL;
+ else if (!strcmp(tok, "backup"))
+ flags |= MPTCP_PM_ADDR_FLAG_BACKUP;
+ else
+ error(1, errno,
+ "unknown flag %s", argv[arg]);
+ }
+
+ rta = (void *)(data + off);
+ rta->rta_type = MPTCP_PM_ADDR_ATTR_FLAGS;
+ rta->rta_len = RTA_LENGTH(4);
+ memcpy(RTA_DATA(rta), &flags, 4);
+ off += NLMSG_ALIGN(rta->rta_len);
+ } else if (!strcmp(argv[arg], "id")) {
+ if (++arg >= argc)
+ error(1, 0, " missing id value");
+
+ id = atoi(argv[arg]);
+ rta = (void *)(data + off);
+ rta->rta_type = MPTCP_PM_ADDR_ATTR_ID;
+ rta->rta_len = RTA_LENGTH(1);
+ memcpy(RTA_DATA(rta), &id, 1);
+ off += NLMSG_ALIGN(rta->rta_len);
+ } else if (!strcmp(argv[arg], "dev")) {
+ int32_t ifindex;
+
+ if (++arg >= argc)
+ error(1, 0, " missing dev name");
+
+ ifindex = if_nametoindex(argv[arg]);
+ if (!ifindex)
+ error(1, errno, "unknown device %s", argv[arg]);
+
+ rta = (void *)(data + off);
+ rta->rta_type = MPTCP_PM_ADDR_ATTR_IF_IDX;
+ rta->rta_len = RTA_LENGTH(4);
+ memcpy(RTA_DATA(rta), &ifindex, 4);
+ off += NLMSG_ALIGN(rta->rta_len);
+ } else
+ error(1, 0, "unknown keyword %s", argv[arg]);
+ }
+ nest->rta_len = off - nest_start;
+
+ do_nl_req(fd, nh, off, 0);
+ return 0;
+}
+
+int del_addr(int fd, int pm_family, int argc, char *argv[])
+{
+ char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+ NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+ 1024];
+ struct rtattr *rta, *nest;
+ struct nlmsghdr *nh;
+ int nest_start;
+ u_int8_t id;
+ int off = 0;
+
+ memset(data, 0, sizeof(data));
+ nh = (void *)data;
+ off = init_genl_req(data, pm_family, MPTCP_PM_CMD_DEL_ADDR,
+ MPTCP_PM_VER);
+
+ /* the only argument is the address id */
+ if (argc != 3)
+ syntax(argv);
+
+ id = atoi(argv[2]);
+
+ nest_start = off;
+ nest = (void *)(data + off);
+ nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR;
+ nest->rta_len = RTA_LENGTH(0);
+ off += NLMSG_ALIGN(nest->rta_len);
+
+ /* build a dummy addr with only the ID set */
+ rta = (void *)(data + off);
+ rta->rta_type = MPTCP_PM_ADDR_ATTR_ID;
+ rta->rta_len = RTA_LENGTH(1);
+ memcpy(RTA_DATA(rta), &id, 1);
+ off += NLMSG_ALIGN(rta->rta_len);
+ nest->rta_len = off - nest_start;
+
+ do_nl_req(fd, nh, off, 0);
+ return 0;
+}
+
+static void print_addr(struct rtattr *attrs, int len)
+{
+ uint16_t family = 0;
+ char str[1024];
+ uint32_t flags;
+ uint8_t id;
+
+ while (RTA_OK(attrs, len)) {
+ if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_FAMILY)
+ memcpy(&family, RTA_DATA(attrs), 2);
+ if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_ADDR4) {
+ if (family != AF_INET)
+ error(1, errno, "wrong IP (v4) for family %d",
+ family);
+ inet_ntop(AF_INET, RTA_DATA(attrs), str, sizeof(str));
+ printf("%s", str);
+ }
+ if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_ADDR6) {
+ if (family != AF_INET6)
+ error(1, errno, "wrong IP (v6) for family %d",
+ family);
+ inet_ntop(AF_INET6, RTA_DATA(attrs), str, sizeof(str));
+ printf("%s", str);
+ }
+ if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_ID) {
+ memcpy(&id, RTA_DATA(attrs), 1);
+ printf("id %d ", id);
+ }
+ if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_FLAGS) {
+ memcpy(&flags, RTA_DATA(attrs), 4);
+
+ printf("flags ");
+ if (flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
+ printf("signal");
+ flags &= ~MPTCP_PM_ADDR_FLAG_SIGNAL;
+ if (flags)
+ printf(",");
+ }
+
+ if (flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
+ printf("subflow");
+ flags &= ~MPTCP_PM_ADDR_FLAG_SUBFLOW;
+ if (flags)
+ printf(",");
+ }
+
+ if (flags & MPTCP_PM_ADDR_FLAG_BACKUP) {
+ printf("backup");
+ flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP;
+ if (flags)
+ printf(",");
+ }
+
+ /* bump unknown flags, if any */
+ if (flags)
+ printf("0x%x", flags);
+ printf(" ");
+ }
+ if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_IF_IDX) {
+ char name[IF_NAMESIZE], *ret;
+ int32_t ifindex;
+
+ memcpy(&ifindex, RTA_DATA(attrs), 4);
+ ret = if_indextoname(ifindex, name);
+ if (ret)
+ printf("dev %s ", ret);
+ else
+ printf("dev unknown/%d", ifindex);
+ }
+
+ attrs = RTA_NEXT(attrs, len);
+ }
+ printf("\n");
+}
+
+static void print_addrs(struct nlmsghdr *nh, int pm_family, int total_len)
+{
+ struct rtattr *attrs;
+
+ for (; NLMSG_OK(nh, total_len); nh = NLMSG_NEXT(nh, total_len)) {
+ int len = nh->nlmsg_len;
+
+ if (nh->nlmsg_type == NLMSG_DONE)
+ break;
+ if (nh->nlmsg_type == NLMSG_ERROR)
+ nl_error(nh);
+ if (nh->nlmsg_type != pm_family)
+ continue;
+
+ len -= NLMSG_LENGTH(GENL_HDRLEN);
+ attrs = (struct rtattr *) ((char *) NLMSG_DATA(nh) +
+ GENL_HDRLEN);
+ while (RTA_OK(attrs, len)) {
+ if (attrs->rta_type ==
+ (MPTCP_PM_ATTR_ADDR | NLA_F_NESTED))
+ print_addr((void *)RTA_DATA(attrs),
+ attrs->rta_len);
+ attrs = RTA_NEXT(attrs, len);
+ }
+ }
+}
+
+int get_addr(int fd, int pm_family, int argc, char *argv[])
+{
+ char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+ NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+ 1024];
+ struct rtattr *rta, *nest;
+ struct nlmsghdr *nh;
+ int nest_start;
+ u_int8_t id;
+ int off = 0;
+
+ memset(data, 0, sizeof(data));
+ nh = (void *)data;
+ off = init_genl_req(data, pm_family, MPTCP_PM_CMD_GET_ADDR,
+ MPTCP_PM_VER);
+
+ /* the only argument is the address id */
+ if (argc != 3)
+ syntax(argv);
+
+ id = atoi(argv[2]);
+
+ nest_start = off;
+ nest = (void *)(data + off);
+ nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR;
+ nest->rta_len = RTA_LENGTH(0);
+ off += NLMSG_ALIGN(nest->rta_len);
+
+ /* build a dummy addr with only the ID set */
+ rta = (void *)(data + off);
+ rta->rta_type = MPTCP_PM_ADDR_ATTR_ID;
+ rta->rta_len = RTA_LENGTH(1);
+ memcpy(RTA_DATA(rta), &id, 1);
+ off += NLMSG_ALIGN(rta->rta_len);
+ nest->rta_len = off - nest_start;
+
+ print_addrs(nh, pm_family, do_nl_req(fd, nh, off, sizeof(data)));
+ return 0;
+}
+
+int dump_addrs(int fd, int pm_family, int argc, char *argv[])
+{
+ char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+ NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+ 1024];
+ pid_t pid = getpid();
+ struct nlmsghdr *nh;
+ int off = 0;
+
+ memset(data, 0, sizeof(data));
+ nh = (void *)data;
+ off = init_genl_req(data, pm_family, MPTCP_PM_CMD_GET_ADDR,
+ MPTCP_PM_VER);
+ nh->nlmsg_flags |= NLM_F_DUMP;
+ nh->nlmsg_seq = 1;
+ nh->nlmsg_pid = pid;
+ nh->nlmsg_len = off;
+
+ print_addrs(nh, pm_family, do_nl_req(fd, nh, off, sizeof(data)));
+ return 0;
+}
+
+int flush_addrs(int fd, int pm_family, int argc, char *argv[])
+{
+ char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+ NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+ 1024];
+ struct nlmsghdr *nh;
+ int off = 0;
+
+ memset(data, 0, sizeof(data));
+ nh = (void *)data;
+ off = init_genl_req(data, pm_family, MPTCP_PM_CMD_FLUSH_ADDRS,
+ MPTCP_PM_VER);
+
+ do_nl_req(fd, nh, off, 0);
+ return 0;
+}
+
+static void print_limits(struct nlmsghdr *nh, int pm_family, int total_len)
+{
+ struct rtattr *attrs;
+ uint32_t max;
+
+ for (; NLMSG_OK(nh, total_len); nh = NLMSG_NEXT(nh, total_len)) {
+ int len = nh->nlmsg_len;
+
+ if (nh->nlmsg_type == NLMSG_DONE)
+ break;
+ if (nh->nlmsg_type == NLMSG_ERROR)
+ nl_error(nh);
+ if (nh->nlmsg_type != pm_family)
+ continue;
+
+ len -= NLMSG_LENGTH(GENL_HDRLEN);
+ attrs = (struct rtattr *) ((char *) NLMSG_DATA(nh) +
+ GENL_HDRLEN);
+ while (RTA_OK(attrs, len)) {
+ int type = attrs->rta_type;
+
+ if (type != MPTCP_PM_ATTR_RCV_ADD_ADDRS &&
+ type != MPTCP_PM_ATTR_SUBFLOWS)
+ goto next;
+
+ memcpy(&max, RTA_DATA(attrs), 4);
+ printf("%s %u\n", type == MPTCP_PM_ATTR_SUBFLOWS ?
+ "subflows" : "accept", max);
+
+next:
+ attrs = RTA_NEXT(attrs, len);
+ }
+ }
+}
+
+int get_set_limits(int fd, int pm_family, int argc, char *argv[])
+{
+ char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+ NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+ 1024];
+ uint32_t rcv_addr = 0, subflows = 0;
+ int cmd, len = sizeof(data);
+ struct nlmsghdr *nh;
+ int off = 0;
+
+ /* limit */
+ if (argc == 4) {
+ rcv_addr = atoi(argv[2]);
+ subflows = atoi(argv[3]);
+ cmd = MPTCP_PM_CMD_SET_LIMITS;
+ } else {
+ cmd = MPTCP_PM_CMD_GET_LIMITS;
+ }
+
+ memset(data, 0, sizeof(data));
+ nh = (void *)data;
+ off = init_genl_req(data, pm_family, cmd, MPTCP_PM_VER);
+
+ /* limit */
+ if (cmd == MPTCP_PM_CMD_SET_LIMITS) {
+ struct rtattr *rta = (void *)(data + off);
+
+ rta->rta_type = MPTCP_PM_ATTR_RCV_ADD_ADDRS;
+ rta->rta_len = RTA_LENGTH(4);
+ memcpy(RTA_DATA(rta), &rcv_addr, 4);
+ off += NLMSG_ALIGN(rta->rta_len);
+
+ rta = (void *)(data + off);
+ rta->rta_type = MPTCP_PM_ATTR_SUBFLOWS;
+ rta->rta_len = RTA_LENGTH(4);
+ memcpy(RTA_DATA(rta), &subflows, 4);
+ off += NLMSG_ALIGN(rta->rta_len);
+
+ /* do not expect a reply */
+ len = 0;
+ }
+
+ len = do_nl_req(fd, nh, off, len);
+ if (cmd == MPTCP_PM_CMD_GET_LIMITS)
+ print_limits(nh, pm_family, len);
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ int fd, pm_family;
+
+ if (argc < 2)
+ syntax(argv);
+
+ fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+ if (fd == -1)
+ error(1, errno, "socket netlink");
+
+ pm_family = resolve_mptcp_pm_netlink(fd);
+
+ if (!strcmp(argv[1], "add"))
+ return add_addr(fd, pm_family, argc, argv);
+ else if (!strcmp(argv[1], "del"))
+ return del_addr(fd, pm_family, argc, argv);
+ else if (!strcmp(argv[1], "flush"))
+ return flush_addrs(fd, pm_family, argc, argv);
+ else if (!strcmp(argv[1], "get"))
+ return get_addr(fd, pm_family, argc, argv);
+ else if (!strcmp(argv[1], "dump"))
+ return dump_addrs(fd, pm_family, argc, argv);
+ else if (!strcmp(argv[1], "limits"))
+ return get_set_limits(fd, pm_family, argc, argv);
+
+ fprintf(stderr, "unknown sub-command: %s", argv[1]);
+ syntax(argv);
+ return 0;
+}
diff --git a/tools/testing/selftests/net/mptcp/settings b/tools/testing/selftests/net/mptcp/settings
new file mode 100644
index 0000000..026384c
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/settings
@@ -0,0 +1 @@
+timeout=450
diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh
new file mode 100755
index 0000000..8fcb289
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/simult_flows.sh
@@ -0,0 +1,293 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+rndh=$(printf %x $sec)-$(mktemp -u XXXXXX)
+ns1="ns1-$rndh"
+ns2="ns2-$rndh"
+ns3="ns3-$rndh"
+capture=false
+ksft_skip=4
+timeout=30
+test_cnt=1
+ret=0
+bail=0
+
+usage() {
+ echo "Usage: $0 [ -b ] [ -c ] [ -d ]"
+ echo -e "\t-b: bail out after first error, otherwise runs al testcases"
+ echo -e "\t-c: capture packets for each test using tcpdump (default: no capture)"
+ echo -e "\t-d: debug this script"
+}
+
+cleanup()
+{
+ rm -f "$cout" "$sout"
+ rm -f "$large" "$small"
+ rm -f "$capout"
+
+ local netns
+ for netns in "$ns1" "$ns2" "$ns3";do
+ ip netns del $netns
+ done
+}
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without ip tool"
+ exit $ksft_skip
+fi
+
+# "$ns1" ns2 ns3
+# ns1eth1 ns2eth1 ns2eth3 ns3eth1
+# netem
+# ns1eth2 ns2eth2
+# netem
+
+setup()
+{
+ large=$(mktemp)
+ small=$(mktemp)
+ sout=$(mktemp)
+ cout=$(mktemp)
+ capout=$(mktemp)
+ size=$((2048 * 4096))
+ dd if=/dev/zero of=$small bs=4096 count=20 >/dev/null 2>&1
+ dd if=/dev/zero of=$large bs=4096 count=$((size / 4096)) >/dev/null 2>&1
+
+ trap cleanup EXIT
+
+ for i in "$ns1" "$ns2" "$ns3";do
+ ip netns add $i || exit $ksft_skip
+ ip -net $i link set lo up
+ done
+
+ ip link add ns1eth1 netns "$ns1" type veth peer name ns2eth1 netns "$ns2"
+ ip link add ns1eth2 netns "$ns1" type veth peer name ns2eth2 netns "$ns2"
+ ip link add ns2eth3 netns "$ns2" type veth peer name ns3eth1 netns "$ns3"
+
+ ip -net "$ns1" addr add 10.0.1.1/24 dev ns1eth1
+ ip -net "$ns1" addr add dead:beef:1::1/64 dev ns1eth1 nodad
+ ip -net "$ns1" link set ns1eth1 up mtu 1500
+ ip -net "$ns1" route add default via 10.0.1.2
+ ip -net "$ns1" route add default via dead:beef:1::2
+
+ ip -net "$ns1" addr add 10.0.2.1/24 dev ns1eth2
+ ip -net "$ns1" addr add dead:beef:2::1/64 dev ns1eth2 nodad
+ ip -net "$ns1" link set ns1eth2 up mtu 1500
+ ip -net "$ns1" route add default via 10.0.2.2 metric 101
+ ip -net "$ns1" route add default via dead:beef:2::2 metric 101
+
+ ip netns exec "$ns1" ./pm_nl_ctl limits 1 1
+ ip netns exec "$ns1" ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags subflow
+ ip netns exec "$ns1" sysctl -q net.ipv4.conf.all.rp_filter=0
+
+ ip -net "$ns2" addr add 10.0.1.2/24 dev ns2eth1
+ ip -net "$ns2" addr add dead:beef:1::2/64 dev ns2eth1 nodad
+ ip -net "$ns2" link set ns2eth1 up mtu 1500
+
+ ip -net "$ns2" addr add 10.0.2.2/24 dev ns2eth2
+ ip -net "$ns2" addr add dead:beef:2::2/64 dev ns2eth2 nodad
+ ip -net "$ns2" link set ns2eth2 up mtu 1500
+
+ ip -net "$ns2" addr add 10.0.3.2/24 dev ns2eth3
+ ip -net "$ns2" addr add dead:beef:3::2/64 dev ns2eth3 nodad
+ ip -net "$ns2" link set ns2eth3 up mtu 1500
+ ip netns exec "$ns2" sysctl -q net.ipv4.ip_forward=1
+ ip netns exec "$ns2" sysctl -q net.ipv6.conf.all.forwarding=1
+
+ ip -net "$ns3" addr add 10.0.3.3/24 dev ns3eth1
+ ip -net "$ns3" addr add dead:beef:3::3/64 dev ns3eth1 nodad
+ ip -net "$ns3" link set ns3eth1 up mtu 1500
+ ip -net "$ns3" route add default via 10.0.3.2
+ ip -net "$ns3" route add default via dead:beef:3::2
+
+ ip netns exec "$ns3" ./pm_nl_ctl limits 1 1
+}
+
+# $1: ns, $2: port
+wait_local_port_listen()
+{
+ local listener_ns="${1}"
+ local port="${2}"
+
+ local port_hex i
+
+ port_hex="$(printf "%04X" "${port}")"
+ for i in $(seq 10); do
+ ip netns exec "${listener_ns}" cat /proc/net/tcp* | \
+ awk "BEGIN {rc=1} {if (\$2 ~ /:${port_hex}\$/ && \$4 ~ /0A/) {rc=0; exit}} END {exit rc}" &&
+ break
+ sleep 0.1
+ done
+}
+
+do_transfer()
+{
+ local cin=$1
+ local sin=$2
+ local max_time=$3
+ local port
+ port=$((10000+$test_cnt))
+ test_cnt=$((test_cnt+1))
+
+ :> "$cout"
+ :> "$sout"
+ :> "$capout"
+
+ local addr_port
+ addr_port=$(printf "%s:%d" ${connect_addr} ${port})
+
+ if $capture; then
+ local capuser
+ if [ -z $SUDO_USER ] ; then
+ capuser=""
+ else
+ capuser="-Z $SUDO_USER"
+ fi
+
+ local capfile="${rndh}-${port}"
+ local capopt="-i any -s 65535 -B 32768 ${capuser}"
+
+ ip netns exec ${ns3} tcpdump ${capopt} -w "${capfile}-listener.pcap" >> "${capout}" 2>&1 &
+ local cappid_listener=$!
+
+ ip netns exec ${ns1} tcpdump ${capopt} -w "${capfile}-connector.pcap" >> "${capout}" 2>&1 &
+ local cappid_connector=$!
+
+ sleep 1
+ fi
+
+ ip netns exec ${ns3} ./mptcp_connect -jt $timeout -l -p $port 0.0.0.0 < "$sin" > "$sout" &
+ local spid=$!
+
+ wait_local_port_listen "${ns3}" "${port}"
+
+ local start
+ start=$(date +%s%3N)
+ ip netns exec ${ns1} ./mptcp_connect -jt $timeout -p $port 10.0.3.3 < "$cin" > "$cout" &
+ local cpid=$!
+
+ wait $cpid
+ local retc=$?
+ wait $spid
+ local rets=$?
+
+ local stop
+ stop=$(date +%s%3N)
+
+ if $capture; then
+ sleep 1
+ kill ${cappid_listener}
+ kill ${cappid_connector}
+ fi
+
+ local duration
+ duration=$((stop-start))
+
+ cmp $sin $cout > /dev/null 2>&1
+ local cmps=$?
+ cmp $cin $sout > /dev/null 2>&1
+ local cmpc=$?
+
+ printf "%16s" "$duration max $max_time "
+ if [ $retc -eq 0 ] && [ $rets -eq 0 ] && \
+ [ $cmpc -eq 0 ] && [ $cmps -eq 0 ] && \
+ [ $duration -lt $max_time ]; then
+ echo "[ OK ]"
+ cat "$capout"
+ return 0
+ fi
+
+ echo " [ fail ]"
+ echo "client exit code $retc, server $rets" 1>&2
+ echo -e "\nnetns ${ns3} socket stat for $port:" 1>&2
+ ip netns exec ${ns3} ss -nita 1>&2 -o "sport = :$port"
+ echo -e "\nnetns ${ns1} socket stat for $port:" 1>&2
+ ip netns exec ${ns1} ss -nita 1>&2 -o "dport = :$port"
+ ls -l $sin $cout
+ ls -l $cin $sout
+
+ cat "$capout"
+ return 1
+}
+
+run_test()
+{
+ local rate1=$1
+ local rate2=$2
+ local delay1=$3
+ local delay2=$4
+ local lret
+ local dev
+ shift 4
+ local msg=$*
+
+ [ $delay1 -gt 0 ] && delay1="delay $delay1" || delay1=""
+ [ $delay2 -gt 0 ] && delay2="delay $delay2" || delay2=""
+
+ for dev in ns1eth1 ns1eth2; do
+ tc -n $ns1 qdisc del dev $dev root >/dev/null 2>&1
+ done
+ for dev in ns2eth1 ns2eth2; do
+ tc -n $ns2 qdisc del dev $dev root >/dev/null 2>&1
+ done
+ tc -n $ns1 qdisc add dev ns1eth1 root netem rate ${rate1}mbit $delay1
+ tc -n $ns1 qdisc add dev ns1eth2 root netem rate ${rate2}mbit $delay2
+ tc -n $ns2 qdisc add dev ns2eth1 root netem rate ${rate1}mbit $delay1
+ tc -n $ns2 qdisc add dev ns2eth2 root netem rate ${rate2}mbit $delay2
+
+ # time is measure in ms
+ local time=$((size * 8 * 1000 / (( $rate1 + $rate2) * 1024 *1024) ))
+
+ # mptcp_connect will do some sleeps to allow the mp_join handshake
+ # completion
+ time=$((time + 1350))
+
+ printf "%-50s" "$msg"
+ do_transfer $small $large $((time * 11 / 10))
+ lret=$?
+ if [ $lret -ne 0 ]; then
+ ret=$lret
+ [ $bail -eq 0 ] || exit $ret
+ fi
+
+ printf "%-50s" "$msg - reverse direction"
+ do_transfer $large $small $((time * 11 / 10))
+ lret=$?
+ if [ $lret -ne 0 ]; then
+ ret=$lret
+ [ $bail -eq 0 ] || exit $ret
+ fi
+}
+
+while getopts "bcdh" option;do
+ case "$option" in
+ "h")
+ usage $0
+ exit 0
+ ;;
+ "b")
+ bail=1
+ ;;
+ "c")
+ capture=true
+ ;;
+ "d")
+ set -x
+ ;;
+ "?")
+ usage $0
+ exit 1
+ ;;
+ esac
+done
+
+setup
+run_test 10 10 0 0 "balanced bwidth"
+run_test 10 10 1 50 "balanced bwidth with unbalanced delay"
+
+# we still need some additional infrastructure to pass the following test-cases
+# run_test 30 10 0 0 "unbalanced bwidth"
+# run_test 30 10 1 50 "unbalanced bwidth with unbalanced delay"
+# run_test 30 10 50 1 "unbalanced bwidth with opposed, unbalanced delay"
+exit $ret
diff --git a/tools/testing/selftests/net/nettest.c b/tools/testing/selftests/net/nettest.c
index c08f4db..f75c53c 100644
--- a/tools/testing/selftests/net/nettest.c
+++ b/tools/testing/selftests/net/nettest.c
@@ -74,7 +74,14 @@
int use_cmsg;
const char *dev;
int ifindex;
+
const char *password;
+ /* prefix for MD5 password */
+ union {
+ struct sockaddr_in v4;
+ struct sockaddr_in6 v6;
+ } md5_prefix;
+ unsigned int prefix_len;
/* expected addresses and device index for connection */
int expected_ifindex;
@@ -200,20 +207,33 @@
fflush(stdout);
}
-static int tcp_md5sig(int sd, void *addr, socklen_t alen, const char *password)
+static int tcp_md5sig(int sd, void *addr, socklen_t alen, struct sock_args *args)
{
- struct tcp_md5sig md5sig;
- int keylen = password ? strlen(password) : 0;
+ int keylen = strlen(args->password);
+ struct tcp_md5sig md5sig = {};
+ int opt = TCP_MD5SIG;
int rc;
- memset(&md5sig, 0, sizeof(md5sig));
- memcpy(&md5sig.tcpm_addr, addr, alen);
md5sig.tcpm_keylen = keylen;
+ memcpy(md5sig.tcpm_key, args->password, keylen);
- if (keylen)
- memcpy(md5sig.tcpm_key, password, keylen);
+ if (args->prefix_len) {
+ opt = TCP_MD5SIG_EXT;
+ md5sig.tcpm_flags |= TCP_MD5SIG_FLAG_PREFIX;
- rc = setsockopt(sd, IPPROTO_TCP, TCP_MD5SIG, &md5sig, sizeof(md5sig));
+ md5sig.tcpm_prefixlen = args->prefix_len;
+ addr = &args->md5_prefix;
+ }
+ memcpy(&md5sig.tcpm_addr, addr, alen);
+
+ if (args->ifindex) {
+ opt = TCP_MD5SIG_EXT;
+ md5sig.tcpm_flags |= TCP_MD5SIG_FLAG_IFINDEX;
+
+ md5sig.tcpm_ifindex = args->ifindex;
+ }
+
+ rc = setsockopt(sd, IPPROTO_TCP, opt, &md5sig, sizeof(md5sig));
if (rc < 0) {
/* ENOENT is harmless. Returned when a password is cleared */
if (errno == ENOENT)
@@ -254,7 +274,7 @@
exit(1);
}
- if (tcp_md5sig(sd, addr, alen, args->password))
+ if (tcp_md5sig(sd, addr, alen, args))
return -1;
return 0;
@@ -1194,7 +1214,7 @@
if (args->password && tcp_md5_remote(lsd, args)) {
close(lsd);
- return -1;
+ return 1;
}
while (1) {
@@ -1313,7 +1333,7 @@
if (args->type != SOCK_STREAM)
goto out;
- if (args->password && tcp_md5sig(sd, addr, alen, args->password))
+ if (args->password && tcp_md5sig(sd, addr, alen, args))
goto err;
if (args->bind_test_only)
@@ -1405,16 +1425,18 @@
ADDR_TYPE_MCAST,
ADDR_TYPE_EXPECTED_LOCAL,
ADDR_TYPE_EXPECTED_REMOTE,
+ ADDR_TYPE_MD5_PREFIX,
};
static int convert_addr(struct sock_args *args, const char *_str,
enum addr_type atype)
{
+ int pfx_len_max = args->version == AF_INET6 ? 128 : 32;
int family = args->version;
+ char *str, *dev, *sep;
struct in6_addr *in6;
struct in_addr *in;
const char *desc;
- char *str, *dev;
void *addr;
int rc = 0;
@@ -1443,6 +1465,30 @@
desc = "expected remote";
addr = &args->expected_raddr;
break;
+ case ADDR_TYPE_MD5_PREFIX:
+ desc = "md5 prefix";
+ if (family == AF_INET) {
+ args->md5_prefix.v4.sin_family = AF_INET;
+ addr = &args->md5_prefix.v4.sin_addr;
+ } else if (family == AF_INET6) {
+ args->md5_prefix.v6.sin6_family = AF_INET6;
+ addr = &args->md5_prefix.v6.sin6_addr;
+ } else
+ return 1;
+
+ sep = strchr(str, '/');
+ if (sep) {
+ *sep = '\0';
+ sep++;
+ if (str_to_uint(sep, 1, pfx_len_max,
+ &args->prefix_len) != 0) {
+ fprintf(stderr, "Invalid port\n");
+ return 1;
+ }
+ } else {
+ args->prefix_len = pfx_len_max;
+ }
+ break;
default:
log_error("unknown address type");
exit(1);
@@ -1522,7 +1568,7 @@
return m;
}
-#define GETOPT_STR "sr:l:p:t:g:P:DRn:M:d:SCi6L:0:1:2:Fbq"
+#define GETOPT_STR "sr:l:p:t:g:P:DRn:M:m:d:SCi6L:0:1:2:Fbq"
static void print_usage(char *prog)
{
@@ -1551,6 +1597,7 @@
" -n num number of times to send message\n"
"\n"
" -M password use MD5 sum protection\n"
+ " -m prefix/len prefix and length to use for MD5 key\n"
" -g grp multicast group (e.g., 239.1.1.1)\n"
" -i interactive mode (default is echo and terminate)\n"
"\n"
@@ -1620,6 +1667,8 @@
case 'R':
args.type = SOCK_RAW;
args.port = 0;
+ if (!args.protocol)
+ args.protocol = IPPROTO_RAW;
break;
case 'P':
pe = getprotobyname(optarg);
@@ -1642,6 +1691,10 @@
case 'M':
args.password = optarg;
break;
+ case 'm':
+ if (convert_addr(&args, optarg, ADDR_TYPE_MD5_PREFIX) < 0)
+ return 1;
+ break;
case 'S':
args.use_setsockopt = 1;
break;
@@ -1706,11 +1759,16 @@
}
if (args.password &&
- (!args.has_remote_ip || args.type != SOCK_STREAM)) {
+ ((!args.has_remote_ip && !args.prefix_len) || args.type != SOCK_STREAM)) {
log_error("MD5 passwords apply to TCP only and require a remote ip for the password\n");
return 1;
}
+ if (args.prefix_len && !args.password) {
+ log_error("Prefix range for MD5 protection specified without a password\n");
+ return 1;
+ }
+
if ((args.use_setsockopt || args.use_cmsg) && !args.ifindex) {
fprintf(stderr, "Device binding not specified\n");
return 1;
diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index 3429767..3253fdc 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -59,6 +59,45 @@
# Same as pmtu_ipv6_vxlan6_exception, but using a GENEVE tunnel instead of
# VXLAN
#
+# - pmtu_ipv{4,6}_br_vxlan{4,6}_exception
+# Set up three namespaces, A, B, and C, with routing between A and B over
+# R1. R2 is unused in these tests. A has a veth connection to C, and is
+# connected to B via a VXLAN endpoint, which is directly bridged to C.
+# MTU on the B-R1 link is lower than other MTUs.
+#
+# Check that both C and A are able to communicate with B over the VXLAN
+# tunnel, and that PMTU exceptions with the correct values are created.
+#
+# segment a_r1 segment b_r1 b_r1: 4000
+# .--------------R1--------------. everything
+# C---veth A B else: 5000
+# ' bridge |
+# '---- - - - - - VXLAN - - - - - - - '
+#
+# - pmtu_ipv{4,6}_br_geneve{4,6}_exception
+# Same as pmtu_ipv{4,6}_br_vxlan{4,6}_exception, with a GENEVE tunnel
+# instead.
+#
+# - pmtu_ipv{4,6}_ovs_vxlan{4,6}_exception
+# Set up two namespaces, B, and C, with routing between the init namespace
+# and B over R1. A and R2 are unused in these tests. The init namespace
+# has a veth connection to C, and is connected to B via a VXLAN endpoint,
+# which is handled by Open vSwitch and bridged to C. MTU on the B-R1 link
+# is lower than other MTUs.
+#
+# Check that C is able to communicate with B over the VXLAN tunnel, and
+# that PMTU exceptions with the correct values are created.
+#
+# segment a_r1 segment b_r1 b_r1: 4000
+# .--------------R1--------------. everything
+# C---veth init B else: 5000
+# '- ovs |
+# '---- - - - - - VXLAN - - - - - - - '
+#
+# - pmtu_ipv{4,6}_ovs_geneve{4,6}_exception
+# Same as pmtu_ipv{4,6}_ovs_vxlan{4,6}_exception, with a GENEVE tunnel
+# instead.
+#
# - pmtu_ipv{4,6}_fou{4,6}_exception
# Same as pmtu_ipv4_vxlan4, but using a direct IPv4/IPv6 encapsulation
# (FoU) over IPv4/IPv6, instead of VXLAN
@@ -67,6 +106,10 @@
# Same as pmtu_ipv4_vxlan4, but using a generic UDP IPv4/IPv6
# encapsulation (GUE) over IPv4/IPv6, instead of VXLAN
#
+# - pmtu_ipv{4,6}_ipv{4,6}_exception
+# Same as pmtu_ipv4_vxlan4, but using a IPv4/IPv6 tunnel over IPv4/IPv6,
+# instead of VXLAN
+#
# - pmtu_vti4_exception
# Set up vti tunnel on top of veth, with xfrm states and policies, in two
# namespaces with matching endpoints. Check that route exception is not
@@ -151,6 +194,22 @@
pmtu_ipv6_geneve4_exception IPv6 over geneve4: PMTU exceptions 1
pmtu_ipv4_geneve6_exception IPv4 over geneve6: PMTU exceptions 1
pmtu_ipv6_geneve6_exception IPv6 over geneve6: PMTU exceptions 1
+ pmtu_ipv4_br_vxlan4_exception IPv4, bridged vxlan4: PMTU exceptions 1
+ pmtu_ipv6_br_vxlan4_exception IPv6, bridged vxlan4: PMTU exceptions 1
+ pmtu_ipv4_br_vxlan6_exception IPv4, bridged vxlan6: PMTU exceptions 1
+ pmtu_ipv6_br_vxlan6_exception IPv6, bridged vxlan6: PMTU exceptions 1
+ pmtu_ipv4_br_geneve4_exception IPv4, bridged geneve4: PMTU exceptions 1
+ pmtu_ipv6_br_geneve4_exception IPv6, bridged geneve4: PMTU exceptions 1
+ pmtu_ipv4_br_geneve6_exception IPv4, bridged geneve6: PMTU exceptions 1
+ pmtu_ipv6_br_geneve6_exception IPv6, bridged geneve6: PMTU exceptions 1
+ pmtu_ipv4_ovs_vxlan4_exception IPv4, OVS vxlan4: PMTU exceptions 1
+ pmtu_ipv6_ovs_vxlan4_exception IPv6, OVS vxlan4: PMTU exceptions 1
+ pmtu_ipv4_ovs_vxlan6_exception IPv4, OVS vxlan6: PMTU exceptions 1
+ pmtu_ipv6_ovs_vxlan6_exception IPv6, OVS vxlan6: PMTU exceptions 1
+ pmtu_ipv4_ovs_geneve4_exception IPv4, OVS geneve4: PMTU exceptions 1
+ pmtu_ipv6_ovs_geneve4_exception IPv6, OVS geneve4: PMTU exceptions 1
+ pmtu_ipv4_ovs_geneve6_exception IPv4, OVS geneve6: PMTU exceptions 1
+ pmtu_ipv6_ovs_geneve6_exception IPv6, OVS geneve6: PMTU exceptions 1
pmtu_ipv4_fou4_exception IPv4 over fou4: PMTU exceptions 1
pmtu_ipv6_fou4_exception IPv6 over fou4: PMTU exceptions 1
pmtu_ipv4_fou6_exception IPv4 over fou6: PMTU exceptions 1
@@ -159,6 +218,10 @@
pmtu_ipv6_gue4_exception IPv6 over gue4: PMTU exceptions 1
pmtu_ipv4_gue6_exception IPv4 over gue6: PMTU exceptions 1
pmtu_ipv6_gue6_exception IPv6 over gue6: PMTU exceptions 1
+ pmtu_ipv4_ipv4_exception IPv4 over IPv4: PMTU exceptions 1
+ pmtu_ipv6_ipv4_exception IPv6 over IPv4: PMTU exceptions 1
+ pmtu_ipv4_ipv6_exception IPv4 over IPv6: PMTU exceptions 1
+ pmtu_ipv6_ipv6_exception IPv6 over IPv6: PMTU exceptions 1
pmtu_vti6_exception vti6: PMTU exceptions 0
pmtu_vti4_exception vti4: PMTU exceptions 0
pmtu_vti4_default_mtu vti4: default MTU assignment 0
@@ -175,10 +238,12 @@
NS_A="ns-A"
NS_B="ns-B"
+NS_C="ns-C"
NS_R1="ns-R1"
NS_R2="ns-R2"
ns_a="ip netns exec ${NS_A}"
ns_b="ip netns exec ${NS_B}"
+ns_c="ip netns exec ${NS_C}"
ns_r1="ip netns exec ${NS_R1}"
ns_r2="ip netns exec ${NS_R2}"
@@ -241,9 +306,11 @@
veth4_a_addr="192.168.1.1"
veth4_b_addr="192.168.1.2"
+veth4_c_addr="192.168.2.10"
veth4_mask="24"
veth6_a_addr="fd00:1::a"
veth6_b_addr="fd00:1::b"
+veth6_c_addr="fd00:2::c"
veth6_mask="64"
tunnel4_a_addr="192.168.2.1"
@@ -373,8 +440,64 @@
setup_fou_or_gue 6 6 gue
}
+setup_ipvX_over_ipvY() {
+ inner=${1}
+ outer=${2}
+
+ if [ "${outer}" -eq 4 ]; then
+ a_addr="${prefix4}.${a_r1}.1"
+ b_addr="${prefix4}.${b_r1}.1"
+ if [ "${inner}" -eq 4 ]; then
+ type="ipip"
+ mode="ipip"
+ else
+ type="sit"
+ mode="ip6ip"
+ fi
+ else
+ a_addr="${prefix6}:${a_r1}::1"
+ b_addr="${prefix6}:${b_r1}::1"
+ type="ip6tnl"
+ if [ "${inner}" -eq 4 ]; then
+ mode="ipip6"
+ else
+ mode="ip6ip6"
+ fi
+ fi
+
+ run_cmd ${ns_a} ip link add ip_a type ${type} local ${a_addr} remote ${b_addr} mode ${mode} || return 2
+ run_cmd ${ns_b} ip link add ip_b type ${type} local ${b_addr} remote ${a_addr} mode ${mode}
+
+ run_cmd ${ns_a} ip link set ip_a up
+ run_cmd ${ns_b} ip link set ip_b up
+
+ if [ "${inner}" = "4" ]; then
+ run_cmd ${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ip_a
+ run_cmd ${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ip_b
+ else
+ run_cmd ${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ip_a
+ run_cmd ${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ip_b
+ fi
+}
+
+setup_ip4ip4() {
+ setup_ipvX_over_ipvY 4 4
+}
+
+setup_ip6ip4() {
+ setup_ipvX_over_ipvY 6 4
+}
+
+setup_ip4ip6() {
+ setup_ipvX_over_ipvY 4 6
+}
+
+setup_ip6ip6() {
+ setup_ipvX_over_ipvY 6 6
+}
+
setup_namespaces() {
- for n in ${NS_A} ${NS_B} ${NS_R1} ${NS_R2}; do
+ for n in ${NS_A} ${NS_B} ${NS_C} ${NS_R1} ${NS_R2}; do
ip netns add ${n} || return 1
# Disable DAD, so that we don't have to wait to use the
@@ -430,6 +553,7 @@
a_addr="${2}"
b_addr="${3}"
opts="${4}"
+ br_if_a="${5}"
if [ "${type}" = "vxlan" ]; then
opts="${opts} ttl 64 dstport 4789"
@@ -443,10 +567,16 @@
run_cmd ${ns_a} ip link add ${type}_a type ${type} id 1 ${opts_a} remote ${b_addr} ${opts} || return 1
run_cmd ${ns_b} ip link add ${type}_b type ${type} id 1 ${opts_b} remote ${a_addr} ${opts}
- run_cmd ${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ${type}_a
- run_cmd ${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ${type}_b
+ if [ -n "${br_if_a}" ]; then
+ run_cmd ${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ${br_if_a}
+ run_cmd ${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ${br_if_a}
+ run_cmd ${ns_a} ip link set ${type}_a master ${br_if_a}
+ else
+ run_cmd ${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ${type}_a
+ run_cmd ${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ${type}_a
+ fi
- run_cmd ${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ${type}_a
+ run_cmd ${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ${type}_b
run_cmd ${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ${type}_b
run_cmd ${ns_a} ip link set ${type}_a up
@@ -462,11 +592,27 @@
}
setup_geneve6() {
- setup_vxlan_or_geneve geneve ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
+ setup_vxlan_or_geneve geneve ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1 ""
}
setup_vxlan6() {
- setup_vxlan_or_geneve vxlan ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
+ setup_vxlan_or_geneve vxlan ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1 ""
+}
+
+setup_bridged_geneve4() {
+ setup_vxlan_or_geneve geneve ${prefix4}.${a_r1}.1 ${prefix4}.${b_r1}.1 "df set" "br0"
+}
+
+setup_bridged_vxlan4() {
+ setup_vxlan_or_geneve vxlan ${prefix4}.${a_r1}.1 ${prefix4}.${b_r1}.1 "df set" "br0"
+}
+
+setup_bridged_geneve6() {
+ setup_vxlan_or_geneve geneve ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1 "" "br0"
+}
+
+setup_bridged_vxlan6() {
+ setup_vxlan_or_geneve vxlan ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1 "" "br0"
}
setup_xfrm() {
@@ -576,10 +722,83 @@
return 0
}
+setup_bridge() {
+ run_cmd ${ns_a} ip link add br0 type bridge || return 2
+ run_cmd ${ns_a} ip link set br0 up
+
+ run_cmd ${ns_c} ip link add veth_C-A type veth peer name veth_A-C
+ run_cmd ${ns_c} ip link set veth_A-C netns ns-A
+
+ run_cmd ${ns_a} ip link set veth_A-C up
+ run_cmd ${ns_c} ip link set veth_C-A up
+ run_cmd ${ns_c} ip addr add ${veth4_c_addr}/${veth4_mask} dev veth_C-A
+ run_cmd ${ns_c} ip addr add ${veth6_c_addr}/${veth6_mask} dev veth_C-A
+ run_cmd ${ns_a} ip link set veth_A-C master br0
+}
+
+setup_ovs_vxlan_or_geneve() {
+ type="${1}"
+ a_addr="${2}"
+ b_addr="${3}"
+
+ if [ "${type}" = "vxlan" ]; then
+ opts="${opts} ttl 64 dstport 4789"
+ opts_b="local ${b_addr}"
+ fi
+
+ run_cmd ovs-vsctl add-port ovs_br0 ${type}_a -- \
+ set interface ${type}_a type=${type} \
+ options:remote_ip=${b_addr} options:key=1 options:csum=true || return 1
+
+ run_cmd ${ns_b} ip link add ${type}_b type ${type} id 1 ${opts_b} remote ${a_addr} ${opts} || return 1
+
+ run_cmd ${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ${type}_b
+ run_cmd ${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ${type}_b
+
+ run_cmd ${ns_b} ip link set ${type}_b up
+}
+
+setup_ovs_geneve4() {
+ setup_ovs_vxlan_or_geneve geneve ${prefix4}.${a_r1}.1 ${prefix4}.${b_r1}.1
+}
+
+setup_ovs_vxlan4() {
+ setup_ovs_vxlan_or_geneve vxlan ${prefix4}.${a_r1}.1 ${prefix4}.${b_r1}.1
+}
+
+setup_ovs_geneve6() {
+ setup_ovs_vxlan_or_geneve geneve ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
+}
+
+setup_ovs_vxlan6() {
+ setup_ovs_vxlan_or_geneve vxlan ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
+}
+
+setup_ovs_bridge() {
+ run_cmd ovs-vsctl add-br ovs_br0 || return 2
+ run_cmd ip link set ovs_br0 up
+
+ run_cmd ${ns_c} ip link add veth_C-A type veth peer name veth_A-C
+ run_cmd ${ns_c} ip link set veth_A-C netns 1
+
+ run_cmd ip link set veth_A-C up
+ run_cmd ${ns_c} ip link set veth_C-A up
+ run_cmd ${ns_c} ip addr add ${veth4_c_addr}/${veth4_mask} dev veth_C-A
+ run_cmd ${ns_c} ip addr add ${veth6_c_addr}/${veth6_mask} dev veth_C-A
+ run_cmd ovs-vsctl add-port ovs_br0 veth_A-C
+
+ # Move veth_A-R1 to init
+ run_cmd ${ns_a} ip link set veth_A-R1 netns 1
+ run_cmd ip addr add ${prefix4}.${a_r1}.1/${veth4_mask} dev veth_A-R1
+ run_cmd ip addr add ${prefix6}:${a_r1}::1/${veth6_mask} dev veth_A-R1
+ run_cmd ip link set veth_A-R1 up
+ run_cmd ip route add ${prefix4}.${b_r1}.1 via ${prefix4}.${a_r1}.2
+ run_cmd ip route add ${prefix6}:${b_r1}::1 via ${prefix6}:${a_r1}::2
+}
+
setup() {
[ "$(id -u)" -ne 0 ] && echo " need to run as root" && return $ksft_skip
- cleanup
for arg do
eval setup_${arg} || { echo " ${arg} not supported"; return 1; }
done
@@ -590,7 +809,7 @@
for arg do
[ "${ns_cmd}" = "" ] && ns_cmd="${arg}" && continue
- ${ns_cmd} tcpdump -s 0 -i "${arg}" -w "${name}_${arg}.pcap" 2> /dev/null &
+ ${ns_cmd} tcpdump --immediate-mode -s 0 -i "${arg}" -w "${name}_${arg}.pcap" 2> /dev/null &
tcpdump_pids="${tcpdump_pids} $!"
ns_cmd=
done
@@ -603,9 +822,14 @@
done
tcpdump_pids=
- for n in ${NS_A} ${NS_B} ${NS_R1} ${NS_R2}; do
+ for n in ${NS_A} ${NS_B} ${NS_C} ${NS_R1} ${NS_R2}; do
ip netns del ${n} 2> /dev/null
done
+
+ ip link del veth_A-C 2>/dev/null
+ ip link del veth_A-R1 2>/dev/null
+ ovs-vsctl --if-exists del-port vxlan_a 2>/dev/null
+ ovs-vsctl --if-exists del-br ovs_br0 2>/dev/null
}
mtu() {
@@ -838,6 +1062,177 @@
test_pmtu_ipvX_over_vxlanY_or_geneveY_exception geneve 6 6
}
+test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception() {
+ type=${1}
+ family=${2}
+ outer_family=${3}
+ ll_mtu=4000
+
+ if [ ${outer_family} -eq 4 ]; then
+ setup namespaces routing bridge bridged_${type}4 || return 2
+ # IPv4 header UDP header VXLAN/GENEVE header Ethernet header
+ exp_mtu=$((${ll_mtu} - 20 - 8 - 8 - 14))
+ else
+ setup namespaces routing bridge bridged_${type}6 || return 2
+ # IPv6 header UDP header VXLAN/GENEVE header Ethernet header
+ exp_mtu=$((${ll_mtu} - 40 - 8 - 8 - 14))
+ fi
+
+ trace "${ns_a}" ${type}_a "${ns_b}" ${type}_b \
+ "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \
+ "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B \
+ "${ns_a}" br0 "${ns_a}" veth-A-C \
+ "${ns_c}" veth_C-A
+
+ if [ ${family} -eq 4 ]; then
+ ping=ping
+ dst=${tunnel4_b_addr}
+ else
+ ping=${ping6}
+ dst=${tunnel6_b_addr}
+ fi
+
+ # Create route exception by exceeding link layer MTU
+ mtu "${ns_a}" veth_A-R1 $((${ll_mtu} + 1000))
+ mtu "${ns_a}" br0 $((${ll_mtu} + 1000))
+ mtu "${ns_a}" veth_A-C $((${ll_mtu} + 1000))
+ mtu "${ns_c}" veth_C-A $((${ll_mtu} + 1000))
+ mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000))
+ mtu "${ns_b}" veth_B-R1 ${ll_mtu}
+ mtu "${ns_r1}" veth_R1-B ${ll_mtu}
+
+ mtu "${ns_a}" ${type}_a $((${ll_mtu} + 1000))
+ mtu "${ns_b}" ${type}_b $((${ll_mtu} + 1000))
+
+ run_cmd ${ns_c} ${ping} -q -M want -i 0.1 -c 10 -s $((${ll_mtu} + 500)) ${dst} || return 1
+ run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${dst} || return 1
+
+ # Check that exceptions were created
+ pmtu="$(route_get_dst_pmtu_from_exception "${ns_c}" ${dst})"
+ check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on bridged ${type} interface"
+ pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
+ check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on locally bridged ${type} interface"
+}
+
+test_pmtu_ipv4_br_vxlan4_exception() {
+ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception vxlan 4 4
+}
+
+test_pmtu_ipv6_br_vxlan4_exception() {
+ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception vxlan 6 4
+}
+
+test_pmtu_ipv4_br_geneve4_exception() {
+ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception geneve 4 4
+}
+
+test_pmtu_ipv6_br_geneve4_exception() {
+ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception geneve 6 4
+}
+
+test_pmtu_ipv4_br_vxlan6_exception() {
+ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception vxlan 4 6
+}
+
+test_pmtu_ipv6_br_vxlan6_exception() {
+ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception vxlan 6 6
+}
+
+test_pmtu_ipv4_br_geneve6_exception() {
+ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception geneve 4 6
+}
+
+test_pmtu_ipv6_br_geneve6_exception() {
+ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception geneve 6 6
+}
+
+test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception() {
+ type=${1}
+ family=${2}
+ outer_family=${3}
+ ll_mtu=4000
+
+ if [ ${outer_family} -eq 4 ]; then
+ setup namespaces routing ovs_bridge ovs_${type}4 || return 2
+ # IPv4 header UDP header VXLAN/GENEVE header Ethernet header
+ exp_mtu=$((${ll_mtu} - 20 - 8 - 8 - 14))
+ else
+ setup namespaces routing ovs_bridge ovs_${type}6 || return 2
+ # IPv6 header UDP header VXLAN/GENEVE header Ethernet header
+ exp_mtu=$((${ll_mtu} - 40 - 8 - 8 - 14))
+ fi
+
+ if [ "${type}" = "vxlan" ]; then
+ tun_a="vxlan_sys_4789"
+ elif [ "${type}" = "geneve" ]; then
+ tun_a="genev_sys_6081"
+ fi
+
+ trace "" "${tun_a}" "${ns_b}" ${type}_b \
+ "" veth_A-R1 "${ns_r1}" veth_R1-A \
+ "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B \
+ "" ovs_br0 "" veth-A-C \
+ "${ns_c}" veth_C-A
+
+ if [ ${family} -eq 4 ]; then
+ ping=ping
+ dst=${tunnel4_b_addr}
+ else
+ ping=${ping6}
+ dst=${tunnel6_b_addr}
+ fi
+
+ # Create route exception by exceeding link layer MTU
+ mtu "" veth_A-R1 $((${ll_mtu} + 1000))
+ mtu "" ovs_br0 $((${ll_mtu} + 1000))
+ mtu "" veth_A-C $((${ll_mtu} + 1000))
+ mtu "${ns_c}" veth_C-A $((${ll_mtu} + 1000))
+ mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000))
+ mtu "${ns_b}" veth_B-R1 ${ll_mtu}
+ mtu "${ns_r1}" veth_R1-B ${ll_mtu}
+
+ mtu "" ${tun_a} $((${ll_mtu} + 1000))
+ mtu "${ns_b}" ${type}_b $((${ll_mtu} + 1000))
+
+ run_cmd ${ns_c} ${ping} -q -M want -i 0.1 -c 20 -s $((${ll_mtu} + 500)) ${dst} || return 1
+
+ # Check that exceptions were created
+ pmtu="$(route_get_dst_pmtu_from_exception "${ns_c}" ${dst})"
+ check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on Open vSwitch ${type} interface"
+}
+
+test_pmtu_ipv4_ovs_vxlan4_exception() {
+ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception vxlan 4 4
+}
+
+test_pmtu_ipv6_ovs_vxlan4_exception() {
+ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception vxlan 6 4
+}
+
+test_pmtu_ipv4_ovs_geneve4_exception() {
+ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception geneve 4 4
+}
+
+test_pmtu_ipv6_ovs_geneve4_exception() {
+ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception geneve 6 4
+}
+
+test_pmtu_ipv4_ovs_vxlan6_exception() {
+ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception vxlan 4 6
+}
+
+test_pmtu_ipv6_ovs_vxlan6_exception() {
+ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception vxlan 6 6
+}
+
+test_pmtu_ipv4_ovs_geneve6_exception() {
+ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception geneve 4 6
+}
+
+test_pmtu_ipv6_ovs_geneve6_exception() {
+ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception geneve 6 6
+}
+
test_pmtu_ipvX_over_fouY_or_gueY() {
inner_family=${1}
outer_family=${2}
@@ -918,6 +1313,64 @@
test_pmtu_ipvX_over_fouY_or_gueY 6 6 gue
}
+test_pmtu_ipvX_over_ipvY_exception() {
+ inner=${1}
+ outer=${2}
+ ll_mtu=4000
+
+ setup namespaces routing ip${inner}ip${outer} || return 2
+
+ trace "${ns_a}" ip_a "${ns_b}" ip_b \
+ "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \
+ "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B
+
+ if [ ${inner} -eq 4 ]; then
+ ping=ping
+ dst=${tunnel4_b_addr}
+ else
+ ping=${ping6}
+ dst=${tunnel6_b_addr}
+ fi
+
+ if [ ${outer} -eq 4 ]; then
+ # IPv4 header
+ exp_mtu=$((${ll_mtu} - 20))
+ else
+ # IPv6 header Option 4
+ exp_mtu=$((${ll_mtu} - 40 - 8))
+ fi
+
+ # Create route exception by exceeding link layer MTU
+ mtu "${ns_a}" veth_A-R1 $((${ll_mtu} + 1000))
+ mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000))
+ mtu "${ns_b}" veth_B-R1 ${ll_mtu}
+ mtu "${ns_r1}" veth_R1-B ${ll_mtu}
+
+ mtu "${ns_a}" ip_a $((${ll_mtu} + 1000)) || return
+ mtu "${ns_b}" ip_b $((${ll_mtu} + 1000)) || return
+ run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${dst}
+
+ # Check that exception was created
+ pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
+ check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ip${inner}ip${outer} interface"
+}
+
+test_pmtu_ipv4_ipv4_exception() {
+ test_pmtu_ipvX_over_ipvY_exception 4 4
+}
+
+test_pmtu_ipv6_ipv4_exception() {
+ test_pmtu_ipvX_over_ipvY_exception 6 4
+}
+
+test_pmtu_ipv4_ipv6_exception() {
+ test_pmtu_ipvX_over_ipvY_exception 4 6
+}
+
+test_pmtu_ipv6_ipv6_exception() {
+ test_pmtu_ipvX_over_ipvY_exception 6 6
+}
+
test_pmtu_vti4_exception() {
setup namespaces veth vti4 xfrm4 || return 2
trace "${ns_a}" veth_a "${ns_b}" veth_b \
@@ -1182,6 +1635,10 @@
unset IFS
+ # Since cleanup() relies on variables modified by this subshell, it
+ # has to run in this context.
+ trap cleanup EXIT
+
if [ "$VERBOSE" = "1" ]; then
printf "\n##########################################################################\n\n"
fi
diff --git a/tools/testing/selftests/net/psock_snd.sh b/tools/testing/selftests/net/psock_snd.sh
index 6331d91..170be65 100755
--- a/tools/testing/selftests/net/psock_snd.sh
+++ b/tools/testing/selftests/net/psock_snd.sh
@@ -45,7 +45,7 @@
echo "raw csum_off"
./in_netns.sh ./psock_snd -v -c
-echo "raw csum_off with bad offset (fails)"
+echo "raw csum_off with bad offset (expected to fail)"
(! ./in_netns.sh ./psock_snd -v -c -C)
@@ -57,7 +57,7 @@
echo "raw mtu size"
./in_netns.sh ./psock_snd -l "${mss}"
-echo "raw mtu size + 1 (fails)"
+echo "raw mtu size + 1 (expected to fail)"
(! ./in_netns.sh ./psock_snd -l "${mss_exceeds}")
# fails due to ARPHRD_ETHER check in packet_extra_vlan_len_allowed
@@ -65,19 +65,19 @@
# echo "raw vlan mtu size"
# ./in_netns.sh ./psock_snd -V -l "${mss}"
-echo "raw vlan mtu size + 1 (fails)"
+echo "raw vlan mtu size + 1 (expected to fail)"
(! ./in_netns.sh ./psock_snd -V -l "${mss_exceeds}")
echo "dgram mtu size"
./in_netns.sh ./psock_snd -d -l "${mss}"
-echo "dgram mtu size + 1 (fails)"
+echo "dgram mtu size + 1 (expected to fail)"
(! ./in_netns.sh ./psock_snd -d -l "${mss_exceeds}")
-echo "raw truncate hlen (fails: does not arrive)"
+echo "raw truncate hlen (expected to fail: does not arrive)"
(! ./in_netns.sh ./psock_snd -t "$((${vnet_hlen} + ${eth_hlen}))")
-echo "raw truncate hlen - 1 (fails: EINVAL)"
+echo "raw truncate hlen - 1 (expected to fail: EINVAL)"
(! ./in_netns.sh ./psock_snd -t "$((${vnet_hlen} + ${eth_hlen} - 1))")
@@ -86,13 +86,13 @@
echo "raw gso min size"
./in_netns.sh ./psock_snd -v -c -g -l "${mss_exceeds}"
-echo "raw gso min size - 1 (fails)"
+echo "raw gso min size - 1 (expected to fail)"
(! ./in_netns.sh ./psock_snd -v -c -g -l "${mss}")
echo "raw gso max size"
./in_netns.sh ./psock_snd -v -c -g -l "${max_mss}"
-echo "raw gso max size + 1 (fails)"
+echo "raw gso max size + 1 (expected to fail)"
(! ./in_netns.sh ./psock_snd -v -c -g -l "${max_mss_exceeds}")
echo "OK. All tests passed"
diff --git a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c
new file mode 100644
index 0000000..066efd3
--- /dev/null
+++ b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Check if we can fully utilize 4-tuples for connect().
+ *
+ * Rules to bind sockets to the same port when all ephemeral ports are
+ * exhausted.
+ *
+ * 1. if there are TCP_LISTEN sockets on the port, fail to bind.
+ * 2. if there are sockets without SO_REUSEADDR, fail to bind.
+ * 3. if SO_REUSEADDR is disabled, fail to bind.
+ * 4. if SO_REUSEADDR is enabled and SO_REUSEPORT is disabled,
+ * succeed to bind.
+ * 5. if SO_REUSEADDR and SO_REUSEPORT are enabled and
+ * there is no socket having the both options and the same EUID,
+ * succeed to bind.
+ * 6. fail to bind.
+ *
+ * Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
+ */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+
+struct reuse_opts {
+ int reuseaddr[2];
+ int reuseport[2];
+};
+
+struct reuse_opts unreusable_opts[12] = {
+ {{0, 0}, {0, 0}},
+ {{0, 0}, {0, 1}},
+ {{0, 0}, {1, 0}},
+ {{0, 0}, {1, 1}},
+ {{0, 1}, {0, 0}},
+ {{0, 1}, {0, 1}},
+ {{0, 1}, {1, 0}},
+ {{0, 1}, {1, 1}},
+ {{1, 0}, {0, 0}},
+ {{1, 0}, {0, 1}},
+ {{1, 0}, {1, 0}},
+ {{1, 0}, {1, 1}},
+};
+
+struct reuse_opts reusable_opts[4] = {
+ {{1, 1}, {0, 0}},
+ {{1, 1}, {0, 1}},
+ {{1, 1}, {1, 0}},
+ {{1, 1}, {1, 1}},
+};
+
+int bind_port(struct __test_metadata *_metadata, int reuseaddr, int reuseport)
+{
+ struct sockaddr_in local_addr;
+ int len = sizeof(local_addr);
+ int fd, ret;
+
+ fd = socket(AF_INET, SOCK_STREAM, 0);
+ ASSERT_NE(-1, fd) TH_LOG("failed to open socket.");
+
+ ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &reuseaddr, sizeof(int));
+ ASSERT_EQ(0, ret) TH_LOG("failed to setsockopt: SO_REUSEADDR.");
+
+ ret = setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &reuseport, sizeof(int));
+ ASSERT_EQ(0, ret) TH_LOG("failed to setsockopt: SO_REUSEPORT.");
+
+ local_addr.sin_family = AF_INET;
+ local_addr.sin_addr.s_addr = inet_addr("127.0.0.1");
+ local_addr.sin_port = 0;
+
+ if (bind(fd, (struct sockaddr *)&local_addr, len) == -1) {
+ close(fd);
+ return -1;
+ }
+
+ return fd;
+}
+
+TEST(reuseaddr_ports_exhausted_unreusable)
+{
+ struct reuse_opts *opts;
+ int i, j, fd[2];
+
+ for (i = 0; i < 12; i++) {
+ opts = &unreusable_opts[i];
+
+ for (j = 0; j < 2; j++)
+ fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]);
+
+ ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind.");
+ EXPECT_EQ(-1, fd[1]) TH_LOG("should fail to bind.");
+
+ for (j = 0; j < 2; j++)
+ if (fd[j] != -1)
+ close(fd[j]);
+ }
+}
+
+TEST(reuseaddr_ports_exhausted_reusable_same_euid)
+{
+ struct reuse_opts *opts;
+ int i, j, fd[2];
+
+ for (i = 0; i < 4; i++) {
+ opts = &reusable_opts[i];
+
+ for (j = 0; j < 2; j++)
+ fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]);
+
+ ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind.");
+
+ if (opts->reuseport[0] && opts->reuseport[1]) {
+ EXPECT_EQ(-1, fd[1]) TH_LOG("should fail to bind because both sockets succeed to be listened.");
+ } else {
+ EXPECT_NE(-1, fd[1]) TH_LOG("should succeed to bind to connect to different destinations.");
+ }
+
+ for (j = 0; j < 2; j++)
+ if (fd[j] != -1)
+ close(fd[j]);
+ }
+}
+
+TEST(reuseaddr_ports_exhausted_reusable_different_euid)
+{
+ struct reuse_opts *opts;
+ int i, j, ret, fd[2];
+ uid_t euid[2] = {10, 20};
+
+ for (i = 0; i < 4; i++) {
+ opts = &reusable_opts[i];
+
+ for (j = 0; j < 2; j++) {
+ ret = seteuid(euid[j]);
+ ASSERT_EQ(0, ret) TH_LOG("failed to seteuid: %d.", euid[j]);
+
+ fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]);
+
+ ret = seteuid(0);
+ ASSERT_EQ(0, ret) TH_LOG("failed to seteuid: 0.");
+ }
+
+ ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind.");
+ EXPECT_NE(-1, fd[1]) TH_LOG("should succeed to bind because one socket can be bound in each euid.");
+
+ if (fd[1] != -1) {
+ ret = listen(fd[0], 5);
+ ASSERT_EQ(0, ret) TH_LOG("failed to listen.");
+
+ ret = listen(fd[1], 5);
+ EXPECT_EQ(-1, ret) TH_LOG("should fail to listen because only one uid reserves the port in TCP_LISTEN.");
+ }
+
+ for (j = 0; j < 2; j++)
+ if (fd[j] != -1)
+ close(fd[j]);
+ }
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/net/reuseaddr_ports_exhausted.sh b/tools/testing/selftests/net/reuseaddr_ports_exhausted.sh
new file mode 100755
index 0000000..20e3a29
--- /dev/null
+++ b/tools/testing/selftests/net/reuseaddr_ports_exhausted.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run tests when all ephemeral ports are exhausted.
+#
+# Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
+
+set +x
+set -e
+
+readonly NETNS="ns-$(mktemp -u XXXXXX)"
+
+setup() {
+ ip netns add "${NETNS}"
+ ip -netns "${NETNS}" link set lo up
+ ip netns exec "${NETNS}" \
+ sysctl -w net.ipv4.ip_local_port_range="32768 32768" \
+ > /dev/null 2>&1
+ ip netns exec "${NETNS}" \
+ sysctl -w net.ipv4.ip_autobind_reuse=1 > /dev/null 2>&1
+}
+
+cleanup() {
+ ip netns del "${NETNS}"
+}
+
+trap cleanup EXIT
+setup
+
+do_test() {
+ ip netns exec "${NETNS}" ./reuseaddr_ports_exhausted
+}
+
+do_test
+echo "tests done"
diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
index 28ea375..c9ce3df 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -5,7 +5,6 @@
# set -e
devdummy="test-dummy0"
-ret=0
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
@@ -66,7 +65,7 @@
devbr="test-br0"
vlandev="testbr-vlan1"
- ret=0
+ local ret=0
ip link add name "$devbr" type bridge
check_err $?
@@ -113,7 +112,7 @@
rem=10.42.42.1
loc=10.0.0.1
- ret=0
+ local ret=0
ip tunnel add $gredev mode gre remote $rem local $loc ttl 1
check_err $?
ip link set $gredev up
@@ -149,7 +148,7 @@
kci_test_tc()
{
dev=lo
- ret=0
+ local ret=0
tc qdisc add dev "$dev" root handle 1: htb
check_err $?
@@ -184,7 +183,7 @@
kci_test_polrouting()
{
- ret=0
+ local ret=0
ip rule add fwmark 1 lookup 100
check_err $?
ip route add local 0.0.0.0/0 dev lo table 100
@@ -207,7 +206,7 @@
{
local hash_policy=$(sysctl -n net.ipv4.fib_multipath_hash_policy)
- ret=0
+ local ret=0
ip route get 127.0.0.1 > /dev/null
check_err $?
@@ -290,7 +289,7 @@
kci_test_addrlabel()
{
- ret=0
+ local ret=0
ip addrlabel add prefix dead::/64 dev lo label 1
check_err $?
@@ -330,7 +329,7 @@
kci_test_ifalias()
{
- ret=0
+ local ret=0
namewant=$(uuidgen)
syspathname="/sys/class/net/$devdummy/ifalias"
@@ -385,7 +384,7 @@
kci_test_vrf()
{
vrfname="test-vrf"
- ret=0
+ local ret=0
ip link show type vrf 2>/dev/null
if [ $? -ne 0 ]; then
@@ -425,7 +424,7 @@
kci_test_encap_vxlan()
{
- ret=0
+ local ret=0
vxlan="test-vxlan0"
vlan="test-vlan0"
testns="$1"
@@ -511,7 +510,7 @@
kci_test_encap_fou()
{
- ret=0
+ local ret=0
name="test-fou"
testns="$1"
@@ -553,7 +552,7 @@
kci_test_encap()
{
testns="testns"
- ret=0
+ local ret=0
ip netns add "$testns"
if [ $? -ne 0 ]; then
@@ -570,15 +569,18 @@
check_err $?
kci_test_encap_vxlan "$testns"
+ check_err $?
kci_test_encap_fou "$testns"
+ check_err $?
ip netns del "$testns"
+ return $ret
}
kci_test_macsec()
{
msname="test_macsec0"
- ret=0
+ local ret=0
ip macsec help 2>&1 | grep -q "^Usage: ip macsec"
if [ $? -ne 0 ]; then
@@ -636,7 +638,7 @@
#-------------------------------------------------------------------
kci_test_ipsec()
{
- ret=0
+ local ret=0
algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 128"
srcip=192.168.123.1
dstip=192.168.123.2
@@ -736,7 +738,7 @@
#-------------------------------------------------------------------
kci_test_ipsec_offload()
{
- ret=0
+ local ret=0
algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 128"
srcip=192.168.123.3
dstip=192.168.123.4
@@ -846,7 +848,7 @@
{
testns="testns"
DEV_NS=gretap00
- ret=0
+ local ret=0
ip netns add "$testns"
if [ $? -ne 0 ]; then
@@ -896,7 +898,7 @@
{
testns="testns"
DEV_NS=ip6gretap00
- ret=0
+ local ret=0
ip netns add "$testns"
if [ $? -ne 0 ]; then
@@ -946,7 +948,7 @@
{
testns="testns"
DEV_NS=erspan00
- ret=0
+ local ret=0
ip link help erspan 2>&1 | grep -q "^Usage:"
if [ $? -ne 0 ];then
@@ -1011,7 +1013,7 @@
{
testns="testns"
DEV_NS=ip6erspan00
- ret=0
+ local ret=0
ip link help ip6erspan 2>&1 | grep -q "^Usage:"
if [ $? -ne 0 ];then
@@ -1082,7 +1084,7 @@
test_mac=de:ad:be:ef:13:37
localip="10.0.2.2"
dstip="10.0.2.3"
- ret=0
+ local ret=0
bridge fdb help 2>&1 |grep -q 'bridge fdb get'
if [ $? -ne 0 ];then
@@ -1130,7 +1132,7 @@
dstmac=de:ad:be:ef:13:37
dstip=10.0.2.4
dstip6=dead::2
- ret=0
+ local ret=0
ip neigh help 2>&1 |grep -q 'ip neigh get'
if [ $? -ne 0 ];then
@@ -1178,8 +1180,54 @@
echo "PASS: neigh get"
}
+kci_test_bridge_parent_id()
+{
+ local ret=0
+ sysfsnet=/sys/bus/netdevsim/devices/netdevsim
+ probed=false
+
+ if [ ! -w /sys/bus/netdevsim/new_device ] ; then
+ modprobe -q netdevsim
+ check_err $?
+ if [ $ret -ne 0 ]; then
+ echo "SKIP: bridge_parent_id can't load netdevsim"
+ return $ksft_skip
+ fi
+ probed=true
+ fi
+
+ echo "10 1" > /sys/bus/netdevsim/new_device
+ while [ ! -d ${sysfsnet}10 ] ; do :; done
+ echo "20 1" > /sys/bus/netdevsim/new_device
+ while [ ! -d ${sysfsnet}20 ] ; do :; done
+ udevadm settle
+ dev10=`ls ${sysfsnet}10/net/`
+ dev20=`ls ${sysfsnet}20/net/`
+
+ ip link add name test-bond0 type bond mode 802.3ad
+ ip link set dev $dev10 master test-bond0
+ ip link set dev $dev20 master test-bond0
+ ip link add name test-br0 type bridge
+ ip link set dev test-bond0 master test-br0
+ check_err $?
+
+ # clean up any leftovers
+ ip link del dev test-br0
+ ip link del dev test-bond0
+ echo 20 > /sys/bus/netdevsim/del_device
+ echo 10 > /sys/bus/netdevsim/del_device
+ $probed && rmmod netdevsim
+
+ if [ $ret -ne 0 ]; then
+ echo "FAIL: bridge_parent_id"
+ return 1
+ fi
+ echo "PASS: bridge_parent_id"
+}
+
kci_test_rtnl()
{
+ local ret=0
kci_add_dummy
if [ $ret -ne 0 ];then
echo "FAIL: cannot add dummy interface"
@@ -1187,27 +1235,50 @@
fi
kci_test_polrouting
+ check_err $?
kci_test_route_get
+ check_err $?
kci_test_addrlft
+ check_err $?
kci_test_promote_secondaries
+ check_err $?
kci_test_tc
+ check_err $?
kci_test_gre
+ check_err $?
kci_test_gretap
+ check_err $?
kci_test_ip6gretap
+ check_err $?
kci_test_erspan
+ check_err $?
kci_test_ip6erspan
+ check_err $?
kci_test_bridge
+ check_err $?
kci_test_addrlabel
+ check_err $?
kci_test_ifalias
+ check_err $?
kci_test_vrf
+ check_err $?
kci_test_encap
+ check_err $?
kci_test_macsec
+ check_err $?
kci_test_ipsec
+ check_err $?
kci_test_ipsec_offload
+ check_err $?
kci_test_fdb_get
+ check_err $?
kci_test_neigh_get
+ check_err $?
+ kci_test_bridge_parent_id
+ check_err $?
kci_del_dummy
+ return $ret
}
#check for needed privileges
@@ -1226,4 +1297,4 @@
kci_test_rtnl
-exit $ret
+exit $?
diff --git a/tools/testing/selftests/net/rxtimestamp.c b/tools/testing/selftests/net/rxtimestamp.c
new file mode 100644
index 0000000..e4613ce
--- /dev/null
+++ b/tools/testing/selftests/net/rxtimestamp.c
@@ -0,0 +1,430 @@
+#include <errno.h>
+#include <error.h>
+#include <getopt.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/time.h>
+#include <sys/socket.h>
+#include <sys/select.h>
+#include <sys/ioctl.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+
+#include <asm/types.h>
+#include <linux/net_tstamp.h>
+#include <linux/errqueue.h>
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+struct options {
+ int so_timestamp;
+ int so_timestampns;
+ int so_timestamping;
+};
+
+struct tstamps {
+ bool tstamp;
+ bool tstampns;
+ bool swtstamp;
+ bool hwtstamp;
+};
+
+struct socket_type {
+ char *friendly_name;
+ int type;
+ int protocol;
+ bool enabled;
+};
+
+struct test_case {
+ struct options sockopt;
+ struct tstamps expected;
+ bool enabled;
+ bool warn_on_fail;
+};
+
+struct sof_flag {
+ int mask;
+ char *name;
+};
+
+static struct sof_flag sof_flags[] = {
+#define SOF_FLAG(f) { f, #f }
+ SOF_FLAG(SOF_TIMESTAMPING_SOFTWARE),
+ SOF_FLAG(SOF_TIMESTAMPING_RX_SOFTWARE),
+ SOF_FLAG(SOF_TIMESTAMPING_RX_HARDWARE),
+};
+
+static struct socket_type socket_types[] = {
+ { "ip", SOCK_RAW, IPPROTO_EGP },
+ { "udp", SOCK_DGRAM, IPPROTO_UDP },
+ { "tcp", SOCK_STREAM, IPPROTO_TCP },
+};
+
+static struct test_case test_cases[] = {
+ { {}, {} },
+ {
+ { .so_timestamp = 1 },
+ { .tstamp = true }
+ },
+ {
+ { .so_timestampns = 1 },
+ { .tstampns = true }
+ },
+ {
+ { .so_timestamp = 1, .so_timestampns = 1 },
+ { .tstampns = true }
+ },
+ {
+ { .so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE },
+ {}
+ },
+ {
+ /* Loopback device does not support hw timestamps. */
+ { .so_timestamping = SOF_TIMESTAMPING_RX_HARDWARE },
+ {}
+ },
+ {
+ { .so_timestamping = SOF_TIMESTAMPING_SOFTWARE },
+ .warn_on_fail = true
+ },
+ {
+ { .so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE
+ | SOF_TIMESTAMPING_RX_HARDWARE },
+ {}
+ },
+ {
+ { .so_timestamping = SOF_TIMESTAMPING_SOFTWARE
+ | SOF_TIMESTAMPING_RX_SOFTWARE },
+ { .swtstamp = true }
+ },
+ {
+ { .so_timestamp = 1, .so_timestamping = SOF_TIMESTAMPING_SOFTWARE
+ | SOF_TIMESTAMPING_RX_SOFTWARE },
+ { .tstamp = true, .swtstamp = true }
+ },
+};
+
+static struct option long_options[] = {
+ { "list_tests", no_argument, 0, 'l' },
+ { "test_num", required_argument, 0, 'n' },
+ { "op_size", required_argument, 0, 's' },
+ { "tcp", no_argument, 0, 't' },
+ { "udp", no_argument, 0, 'u' },
+ { "ip", no_argument, 0, 'i' },
+ { "strict", no_argument, 0, 'S' },
+ { "ipv4", no_argument, 0, '4' },
+ { "ipv6", no_argument, 0, '6' },
+ { NULL, 0, NULL, 0 },
+};
+
+static int next_port = 19999;
+static int op_size = 10 * 1024;
+
+void print_test_case(struct test_case *t)
+{
+ int f = 0;
+
+ printf("sockopts {");
+ if (t->sockopt.so_timestamp)
+ printf(" SO_TIMESTAMP ");
+ if (t->sockopt.so_timestampns)
+ printf(" SO_TIMESTAMPNS ");
+ if (t->sockopt.so_timestamping) {
+ printf(" SO_TIMESTAMPING: {");
+ for (f = 0; f < ARRAY_SIZE(sof_flags); f++)
+ if (t->sockopt.so_timestamping & sof_flags[f].mask)
+ printf(" %s |", sof_flags[f].name);
+ printf("}");
+ }
+ printf("} expected cmsgs: {");
+ if (t->expected.tstamp)
+ printf(" SCM_TIMESTAMP ");
+ if (t->expected.tstampns)
+ printf(" SCM_TIMESTAMPNS ");
+ if (t->expected.swtstamp || t->expected.hwtstamp) {
+ printf(" SCM_TIMESTAMPING {");
+ if (t->expected.swtstamp)
+ printf("0");
+ if (t->expected.swtstamp && t->expected.hwtstamp)
+ printf(",");
+ if (t->expected.hwtstamp)
+ printf("2");
+ printf("}");
+ }
+ printf("}\n");
+}
+
+void do_send(int src)
+{
+ int r;
+ char *buf = malloc(op_size);
+
+ memset(buf, 'z', op_size);
+ r = write(src, buf, op_size);
+ if (r < 0)
+ error(1, errno, "Failed to sendmsg");
+
+ free(buf);
+}
+
+bool do_recv(int rcv, int read_size, struct tstamps expected)
+{
+ const int CMSG_SIZE = 1024;
+
+ struct scm_timestamping *ts;
+ struct tstamps actual = {};
+ char cmsg_buf[CMSG_SIZE];
+ struct iovec recv_iov;
+ struct cmsghdr *cmsg;
+ bool failed = false;
+ struct msghdr hdr;
+ int flags = 0;
+ int r;
+
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.msg_iov = &recv_iov;
+ hdr.msg_iovlen = 1;
+ recv_iov.iov_base = malloc(read_size);
+ recv_iov.iov_len = read_size;
+
+ hdr.msg_control = cmsg_buf;
+ hdr.msg_controllen = sizeof(cmsg_buf);
+
+ r = recvmsg(rcv, &hdr, flags);
+ if (r < 0)
+ error(1, errno, "Failed to recvmsg");
+ if (r != read_size)
+ error(1, 0, "Only received %d bytes of payload.", r);
+
+ if (hdr.msg_flags & (MSG_TRUNC | MSG_CTRUNC))
+ error(1, 0, "Message was truncated.");
+
+ for (cmsg = CMSG_FIRSTHDR(&hdr); cmsg != NULL;
+ cmsg = CMSG_NXTHDR(&hdr, cmsg)) {
+ if (cmsg->cmsg_level != SOL_SOCKET)
+ error(1, 0, "Unexpected cmsg_level %d",
+ cmsg->cmsg_level);
+ switch (cmsg->cmsg_type) {
+ case SCM_TIMESTAMP:
+ actual.tstamp = true;
+ break;
+ case SCM_TIMESTAMPNS:
+ actual.tstampns = true;
+ break;
+ case SCM_TIMESTAMPING:
+ ts = (struct scm_timestamping *)CMSG_DATA(cmsg);
+ actual.swtstamp = !!ts->ts[0].tv_sec;
+ if (ts->ts[1].tv_sec != 0)
+ error(0, 0, "ts[1] should not be set.");
+ actual.hwtstamp = !!ts->ts[2].tv_sec;
+ break;
+ default:
+ error(1, 0, "Unexpected cmsg_type %d", cmsg->cmsg_type);
+ }
+ }
+
+#define VALIDATE(field) \
+ do { \
+ if (expected.field != actual.field) { \
+ if (expected.field) \
+ error(0, 0, "Expected " #field " to be set."); \
+ else \
+ error(0, 0, \
+ "Expected " #field " to not be set."); \
+ failed = true; \
+ } \
+ } while (0)
+
+ VALIDATE(tstamp);
+ VALIDATE(tstampns);
+ VALIDATE(swtstamp);
+ VALIDATE(hwtstamp);
+#undef VALIDATE
+
+ free(recv_iov.iov_base);
+
+ return failed;
+}
+
+void config_so_flags(int rcv, struct options o)
+{
+ int on = 1;
+
+ if (setsockopt(rcv, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0)
+ error(1, errno, "Failed to enable SO_REUSEADDR");
+
+ if (o.so_timestamp &&
+ setsockopt(rcv, SOL_SOCKET, SO_TIMESTAMP,
+ &o.so_timestamp, sizeof(o.so_timestamp)) < 0)
+ error(1, errno, "Failed to enable SO_TIMESTAMP");
+
+ if (o.so_timestampns &&
+ setsockopt(rcv, SOL_SOCKET, SO_TIMESTAMPNS,
+ &o.so_timestampns, sizeof(o.so_timestampns)) < 0)
+ error(1, errno, "Failed to enable SO_TIMESTAMPNS");
+
+ if (o.so_timestamping &&
+ setsockopt(rcv, SOL_SOCKET, SO_TIMESTAMPING,
+ &o.so_timestamping, sizeof(o.so_timestamping)) < 0)
+ error(1, errno, "Failed to set SO_TIMESTAMPING");
+}
+
+bool run_test_case(struct socket_type *s, int test_num, char ip_version,
+ bool strict)
+{
+ union {
+ struct sockaddr_in6 addr6;
+ struct sockaddr_in addr4;
+ struct sockaddr addr_un;
+ } addr;
+ int read_size = op_size;
+ int src, dst, rcv, port;
+ socklen_t addr_size;
+ bool failed = false;
+
+ port = (s->type == SOCK_RAW) ? 0 : next_port++;
+ memset(&addr, 0, sizeof(addr));
+ if (ip_version == '4') {
+ addr.addr4.sin_family = AF_INET;
+ addr.addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ addr.addr4.sin_port = htons(port);
+ addr_size = sizeof(addr.addr4);
+ if (s->type == SOCK_RAW)
+ read_size += 20; /* for IPv4 header */
+ } else {
+ addr.addr6.sin6_family = AF_INET6;
+ addr.addr6.sin6_addr = in6addr_loopback;
+ addr.addr6.sin6_port = htons(port);
+ addr_size = sizeof(addr.addr6);
+ }
+ printf("Starting testcase %d over ipv%c...\n", test_num, ip_version);
+ src = socket(addr.addr_un.sa_family, s->type,
+ s->protocol);
+ if (src < 0)
+ error(1, errno, "Failed to open src socket");
+
+ dst = socket(addr.addr_un.sa_family, s->type,
+ s->protocol);
+ if (dst < 0)
+ error(1, errno, "Failed to open dst socket");
+
+ if (bind(dst, &addr.addr_un, addr_size) < 0)
+ error(1, errno, "Failed to bind to port %d", port);
+
+ if (s->type == SOCK_STREAM && (listen(dst, 1) < 0))
+ error(1, errno, "Failed to listen");
+
+ if (connect(src, &addr.addr_un, addr_size) < 0)
+ error(1, errno, "Failed to connect");
+
+ if (s->type == SOCK_STREAM) {
+ rcv = accept(dst, NULL, NULL);
+ if (rcv < 0)
+ error(1, errno, "Failed to accept");
+ close(dst);
+ } else {
+ rcv = dst;
+ }
+
+ config_so_flags(rcv, test_cases[test_num].sockopt);
+ usleep(20000); /* setsockopt for SO_TIMESTAMPING is asynchronous */
+ do_send(src);
+
+ failed = do_recv(rcv, read_size, test_cases[test_num].expected);
+
+ close(rcv);
+ close(src);
+
+ if (failed) {
+ printf("FAILURE in testcase %d over ipv%c ", test_num,
+ ip_version);
+ print_test_case(&test_cases[test_num]);
+ if (!strict && test_cases[test_num].warn_on_fail)
+ failed = false;
+ }
+ return failed;
+}
+
+int main(int argc, char **argv)
+{
+ bool all_protocols = true;
+ bool all_tests = true;
+ bool cfg_ipv4 = false;
+ bool cfg_ipv6 = false;
+ bool strict = false;
+ int arg_index = 0;
+ int failures = 0;
+ int s, t, opt;
+
+ while ((opt = getopt_long(argc, argv, "", long_options,
+ &arg_index)) != -1) {
+ switch (opt) {
+ case 'l':
+ for (t = 0; t < ARRAY_SIZE(test_cases); t++) {
+ printf("%d\t", t);
+ print_test_case(&test_cases[t]);
+ }
+ return 0;
+ case 'n':
+ t = atoi(optarg);
+ if (t >= ARRAY_SIZE(test_cases))
+ error(1, 0, "Invalid test case: %d", t);
+ all_tests = false;
+ test_cases[t].enabled = true;
+ break;
+ case 's':
+ op_size = atoi(optarg);
+ break;
+ case 't':
+ all_protocols = false;
+ socket_types[2].enabled = true;
+ break;
+ case 'u':
+ all_protocols = false;
+ socket_types[1].enabled = true;
+ break;
+ case 'i':
+ all_protocols = false;
+ socket_types[0].enabled = true;
+ break;
+ case 'S':
+ strict = true;
+ break;
+ case '4':
+ cfg_ipv4 = true;
+ break;
+ case '6':
+ cfg_ipv6 = true;
+ break;
+ default:
+ error(1, 0, "Failed to parse parameters.");
+ }
+ }
+
+ for (s = 0; s < ARRAY_SIZE(socket_types); s++) {
+ if (!all_protocols && !socket_types[s].enabled)
+ continue;
+
+ printf("Testing %s...\n", socket_types[s].friendly_name);
+ for (t = 0; t < ARRAY_SIZE(test_cases); t++) {
+ if (!all_tests && !test_cases[t].enabled)
+ continue;
+ if (cfg_ipv4 || !cfg_ipv6)
+ if (run_test_case(&socket_types[s], t, '4',
+ strict))
+ failures++;
+ if (cfg_ipv6 || !cfg_ipv4)
+ if (run_test_case(&socket_types[s], t, '6',
+ strict))
+ failures++;
+ }
+ }
+ if (!failures)
+ printf("PASSED.\n");
+ return failures;
+}
diff --git a/tools/testing/selftests/net/rxtimestamp.sh b/tools/testing/selftests/net/rxtimestamp.sh
new file mode 100755
index 0000000..91631e8
--- /dev/null
+++ b/tools/testing/selftests/net/rxtimestamp.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+./in_netns.sh ./rxtimestamp $@
diff --git a/tools/testing/selftests/net/tcp_mmap.c b/tools/testing/selftests/net/tcp_mmap.c
index 31ced79..00f837c 100644
--- a/tools/testing/selftests/net/tcp_mmap.c
+++ b/tools/testing/selftests/net/tcp_mmap.c
@@ -71,7 +71,7 @@
#define MSG_ZEROCOPY 0x4000000
#endif
-#define FILE_SZ (1UL << 35)
+#define FILE_SZ (1ULL << 35)
static int cfg_family = AF_INET6;
static socklen_t cfg_alen = sizeof(struct sockaddr_in6);
static int cfg_port = 8787;
@@ -82,7 +82,9 @@
static int xflg; /* hash received data (simple xor) (-h option) */
static int keepflag; /* -k option: receiver shall keep all received file in memory (no munmap() calls) */
-static int chunk_size = 512*1024;
+static size_t chunk_size = 512*1024;
+
+static size_t map_align;
unsigned long htotal;
@@ -118,6 +120,31 @@
htotal = temp;
}
+#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1))
+#define ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
+
+
+static void *mmap_large_buffer(size_t need, size_t *allocated)
+{
+ void *buffer;
+ size_t sz;
+
+ /* Attempt to use huge pages if possible. */
+ sz = ALIGN_UP(need, map_align);
+ buffer = mmap(NULL, sz, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
+
+ if (buffer == (void *)-1) {
+ sz = need;
+ buffer = mmap(NULL, sz, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (buffer != (void *)-1)
+ fprintf(stderr, "MAP_HUGETLB attempt failed, look at /sys/kernel/mm/hugepages for optimal performance\n");
+ }
+ *allocated = sz;
+ return buffer;
+}
+
void *child_thread(void *arg)
{
unsigned long total_mmap = 0, total = 0;
@@ -126,9 +153,11 @@
int flags = MAP_SHARED;
struct timeval t0, t1;
char *buffer = NULL;
+ void *raddr = NULL;
void *addr = NULL;
double throughput;
struct rusage ru;
+ size_t buffer_sz;
int lu, fd;
fd = (int)(unsigned long)arg;
@@ -136,15 +165,19 @@
gettimeofday(&t0, NULL);
fcntl(fd, F_SETFL, O_NDELAY);
- buffer = malloc(chunk_size);
- if (!buffer) {
- perror("malloc");
+ buffer = mmap_large_buffer(chunk_size, &buffer_sz);
+ if (buffer == (void *)-1) {
+ perror("mmap");
goto error;
}
if (zflg) {
- addr = mmap(NULL, chunk_size, PROT_READ, flags, fd, 0);
- if (addr == (void *)-1)
+ raddr = mmap(NULL, chunk_size + map_align, PROT_READ, flags, fd, 0);
+ if (raddr == (void *)-1) {
+ perror("mmap");
zflg = 0;
+ } else {
+ addr = ALIGN_PTR_UP(raddr, map_align);
+ }
}
while (1) {
struct pollfd pfd = { .fd = fd, .events = POLLIN, };
@@ -155,9 +188,10 @@
socklen_t zc_len = sizeof(zc);
int res;
- zc.address = (__u64)addr;
+ memset(&zc, 0, sizeof(zc));
+ zc.address = (__u64)((unsigned long)addr);
zc.length = chunk_size;
- zc.recv_skip_hint = 0;
+
res = getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
&zc, &zc_len);
if (res == -1)
@@ -168,6 +202,10 @@
total_mmap += zc.length;
if (xflg)
hash_zone(addr, zc.length);
+ /* It is more efficient to unmap the pages right now,
+ * instead of doing this in next TCP_ZEROCOPY_RECEIVE.
+ */
+ madvise(addr, zc.length, MADV_DONTNEED);
total += zc.length;
}
if (zc.recv_skip_hint) {
@@ -219,10 +257,10 @@
ru.ru_nvcsw);
}
error:
- free(buffer);
+ munmap(buffer, buffer_sz);
close(fd);
if (zflg)
- munmap(addr, chunk_size);
+ munmap(raddr, chunk_size + map_align);
pthread_exit(0);
}
@@ -270,8 +308,15 @@
static void do_accept(int fdlisten)
{
+ pthread_attr_t attr;
+ int rcvlowat;
+
+ pthread_attr_init(&attr);
+ pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+
+ rcvlowat = chunk_size;
if (setsockopt(fdlisten, SOL_SOCKET, SO_RCVLOWAT,
- &chunk_size, sizeof(chunk_size)) == -1) {
+ &rcvlowat, sizeof(rcvlowat)) == -1) {
perror("setsockopt SO_RCVLOWAT");
}
@@ -288,7 +333,7 @@
perror("accept");
continue;
}
- res = pthread_create(&th, NULL, child_thread,
+ res = pthread_create(&th, &attr, child_thread,
(void *)(unsigned long)fd);
if (res) {
errno = res;
@@ -298,18 +343,43 @@
}
}
+/* Each thread should reserve a big enough vma to avoid
+ * spinlock collisions in ptl locks.
+ * This size is 2MB on x86_64, and is exported in /proc/meminfo.
+ */
+static unsigned long default_huge_page_size(void)
+{
+ FILE *f = fopen("/proc/meminfo", "r");
+ unsigned long hps = 0;
+ size_t linelen = 0;
+ char *line = NULL;
+
+ if (!f)
+ return 0;
+ while (getline(&line, &linelen, f) > 0) {
+ if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
+ hps <<= 10;
+ break;
+ }
+ }
+ free(line);
+ fclose(f);
+ return hps;
+}
+
int main(int argc, char *argv[])
{
struct sockaddr_storage listenaddr, addr;
unsigned int max_pacing_rate = 0;
- unsigned long total = 0;
+ uint64_t total = 0;
char *host = NULL;
int fd, c, on = 1;
+ size_t buffer_sz;
char *buffer;
int sflg = 0;
int mss = 0;
- while ((c = getopt(argc, argv, "46p:svr:w:H:zxkP:M:")) != -1) {
+ while ((c = getopt(argc, argv, "46p:svr:w:H:zxkP:M:C:a:")) != -1) {
switch (c) {
case '4':
cfg_family = PF_INET;
@@ -349,10 +419,24 @@
case 'P':
max_pacing_rate = atoi(optarg) ;
break;
+ case 'C':
+ chunk_size = atol(optarg);
+ break;
+ case 'a':
+ map_align = atol(optarg);
+ break;
default:
exit(1);
}
}
+ if (!map_align) {
+ map_align = default_huge_page_size();
+ /* if really /proc/meminfo is not helping,
+ * we use the default x86_64 hugepagesize.
+ */
+ if (!map_align)
+ map_align = 2*1024*1024;
+ }
if (sflg) {
int fdlisten = socket(cfg_family, SOCK_STREAM, 0);
@@ -381,8 +465,8 @@
}
do_accept(fdlisten);
}
- buffer = mmap(NULL, chunk_size, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ buffer = mmap_large_buffer(chunk_size, &buffer_sz);
if (buffer == (char *)-1) {
perror("mmap");
exit(1);
@@ -417,17 +501,17 @@
zflg = 0;
}
while (total < FILE_SZ) {
- long wr = FILE_SZ - total;
+ int64_t wr = FILE_SZ - total;
if (wr > chunk_size)
wr = chunk_size;
/* Note : we just want to fill the pipe with 0 bytes */
- wr = send(fd, buffer, wr, zflg ? MSG_ZEROCOPY : 0);
+ wr = send(fd, buffer, (size_t)wr, zflg ? MSG_ZEROCOPY : 0);
if (wr <= 0)
break;
total += wr;
}
close(fd);
- munmap(buffer, chunk_size);
+ munmap(buffer, buffer_sz);
return 0;
}
diff --git a/tools/testing/selftests/net/timestamping.c b/tools/testing/selftests/net/timestamping.c
new file mode 100644
index 0000000..f4bb4fe
--- /dev/null
+++ b/tools/testing/selftests/net/timestamping.c
@@ -0,0 +1,515 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This program demonstrates how the various time stamping features in
+ * the Linux kernel work. It emulates the behavior of a PTP
+ * implementation in stand-alone master mode by sending PTPv1 Sync
+ * multicasts once every second. It looks for similar packets, but
+ * beyond that doesn't actually implement PTP.
+ *
+ * Outgoing packets are time stamped with SO_TIMESTAMPING with or
+ * without hardware support.
+ *
+ * Incoming packets are time stamped with SO_TIMESTAMPING with or
+ * without hardware support, SIOCGSTAMP[NS] (per-socket time stamp) and
+ * SO_TIMESTAMP[NS].
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Author: Patrick Ohly <patrick.ohly@intel.com>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+
+#include <sys/time.h>
+#include <sys/socket.h>
+#include <sys/select.h>
+#include <sys/ioctl.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+
+#include <asm/types.h>
+#include <linux/net_tstamp.h>
+#include <linux/errqueue.h>
+#include <linux/sockios.h>
+
+#ifndef SO_TIMESTAMPING
+# define SO_TIMESTAMPING 37
+# define SCM_TIMESTAMPING SO_TIMESTAMPING
+#endif
+
+#ifndef SO_TIMESTAMPNS
+# define SO_TIMESTAMPNS 35
+#endif
+
+static void usage(const char *error)
+{
+ if (error)
+ printf("invalid option: %s\n", error);
+ printf("timestamping interface option*\n\n"
+ "Options:\n"
+ " IP_MULTICAST_LOOP - looping outgoing multicasts\n"
+ " SO_TIMESTAMP - normal software time stamping, ms resolution\n"
+ " SO_TIMESTAMPNS - more accurate software time stamping\n"
+ " SOF_TIMESTAMPING_TX_HARDWARE - hardware time stamping of outgoing packets\n"
+ " SOF_TIMESTAMPING_TX_SOFTWARE - software fallback for outgoing packets\n"
+ " SOF_TIMESTAMPING_RX_HARDWARE - hardware time stamping of incoming packets\n"
+ " SOF_TIMESTAMPING_RX_SOFTWARE - software fallback for incoming packets\n"
+ " SOF_TIMESTAMPING_SOFTWARE - request reporting of software time stamps\n"
+ " SOF_TIMESTAMPING_RAW_HARDWARE - request reporting of raw HW time stamps\n"
+ " SIOCGSTAMP - check last socket time stamp\n"
+ " SIOCGSTAMPNS - more accurate socket time stamp\n");
+ exit(1);
+}
+
+static void bail(const char *error)
+{
+ printf("%s: %s\n", error, strerror(errno));
+ exit(1);
+}
+
+static const unsigned char sync[] = {
+ 0x00, 0x01, 0x00, 0x01,
+ 0x5f, 0x44, 0x46, 0x4c,
+ 0x54, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x01, 0x01,
+
+ /* fake uuid */
+ 0x00, 0x01,
+ 0x02, 0x03, 0x04, 0x05,
+
+ 0x00, 0x01, 0x00, 0x37,
+ 0x00, 0x00, 0x00, 0x08,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x49, 0x05, 0xcd, 0x01,
+ 0x29, 0xb1, 0x8d, 0xb0,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x01,
+
+ /* fake uuid */
+ 0x00, 0x01,
+ 0x02, 0x03, 0x04, 0x05,
+
+ 0x00, 0x00, 0x00, 0x37,
+ 0x00, 0x00, 0x00, 0x04,
+ 0x44, 0x46, 0x4c, 0x54,
+ 0x00, 0x00, 0xf0, 0x60,
+ 0x00, 0x01, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0xf0, 0x60,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x04,
+ 0x44, 0x46, 0x4c, 0x54,
+ 0x00, 0x01,
+
+ /* fake uuid */
+ 0x00, 0x01,
+ 0x02, 0x03, 0x04, 0x05,
+
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00
+};
+
+static void sendpacket(int sock, struct sockaddr *addr, socklen_t addr_len)
+{
+ struct timeval now;
+ int res;
+
+ res = sendto(sock, sync, sizeof(sync), 0,
+ addr, addr_len);
+ gettimeofday(&now, 0);
+ if (res < 0)
+ printf("%s: %s\n", "send", strerror(errno));
+ else
+ printf("%ld.%06ld: sent %d bytes\n",
+ (long)now.tv_sec, (long)now.tv_usec,
+ res);
+}
+
+static void printpacket(struct msghdr *msg, int res,
+ char *data,
+ int sock, int recvmsg_flags,
+ int siocgstamp, int siocgstampns)
+{
+ struct sockaddr_in *from_addr = (struct sockaddr_in *)msg->msg_name;
+ struct cmsghdr *cmsg;
+ struct timeval tv;
+ struct timespec ts;
+ struct timeval now;
+
+ gettimeofday(&now, 0);
+
+ printf("%ld.%06ld: received %s data, %d bytes from %s, %zu bytes control messages\n",
+ (long)now.tv_sec, (long)now.tv_usec,
+ (recvmsg_flags & MSG_ERRQUEUE) ? "error" : "regular",
+ res,
+ inet_ntoa(from_addr->sin_addr),
+ msg->msg_controllen);
+ for (cmsg = CMSG_FIRSTHDR(msg);
+ cmsg;
+ cmsg = CMSG_NXTHDR(msg, cmsg)) {
+ printf(" cmsg len %zu: ", cmsg->cmsg_len);
+ switch (cmsg->cmsg_level) {
+ case SOL_SOCKET:
+ printf("SOL_SOCKET ");
+ switch (cmsg->cmsg_type) {
+ case SO_TIMESTAMP: {
+ struct timeval *stamp =
+ (struct timeval *)CMSG_DATA(cmsg);
+ printf("SO_TIMESTAMP %ld.%06ld",
+ (long)stamp->tv_sec,
+ (long)stamp->tv_usec);
+ break;
+ }
+ case SO_TIMESTAMPNS: {
+ struct timespec *stamp =
+ (struct timespec *)CMSG_DATA(cmsg);
+ printf("SO_TIMESTAMPNS %ld.%09ld",
+ (long)stamp->tv_sec,
+ (long)stamp->tv_nsec);
+ break;
+ }
+ case SO_TIMESTAMPING: {
+ struct timespec *stamp =
+ (struct timespec *)CMSG_DATA(cmsg);
+ printf("SO_TIMESTAMPING ");
+ printf("SW %ld.%09ld ",
+ (long)stamp->tv_sec,
+ (long)stamp->tv_nsec);
+ stamp++;
+ /* skip deprecated HW transformed */
+ stamp++;
+ printf("HW raw %ld.%09ld",
+ (long)stamp->tv_sec,
+ (long)stamp->tv_nsec);
+ break;
+ }
+ default:
+ printf("type %d", cmsg->cmsg_type);
+ break;
+ }
+ break;
+ case IPPROTO_IP:
+ printf("IPPROTO_IP ");
+ switch (cmsg->cmsg_type) {
+ case IP_RECVERR: {
+ struct sock_extended_err *err =
+ (struct sock_extended_err *)CMSG_DATA(cmsg);
+ printf("IP_RECVERR ee_errno '%s' ee_origin %d => %s",
+ strerror(err->ee_errno),
+ err->ee_origin,
+#ifdef SO_EE_ORIGIN_TIMESTAMPING
+ err->ee_origin == SO_EE_ORIGIN_TIMESTAMPING ?
+ "bounced packet" : "unexpected origin"
+#else
+ "probably SO_EE_ORIGIN_TIMESTAMPING"
+#endif
+ );
+ if (res < sizeof(sync))
+ printf(" => truncated data?!");
+ else if (!memcmp(sync, data + res - sizeof(sync),
+ sizeof(sync)))
+ printf(" => GOT OUR DATA BACK (HURRAY!)");
+ break;
+ }
+ case IP_PKTINFO: {
+ struct in_pktinfo *pktinfo =
+ (struct in_pktinfo *)CMSG_DATA(cmsg);
+ printf("IP_PKTINFO interface index %u",
+ pktinfo->ipi_ifindex);
+ break;
+ }
+ default:
+ printf("type %d", cmsg->cmsg_type);
+ break;
+ }
+ break;
+ default:
+ printf("level %d type %d",
+ cmsg->cmsg_level,
+ cmsg->cmsg_type);
+ break;
+ }
+ printf("\n");
+ }
+
+ if (siocgstamp) {
+ if (ioctl(sock, SIOCGSTAMP, &tv))
+ printf(" %s: %s\n", "SIOCGSTAMP", strerror(errno));
+ else
+ printf("SIOCGSTAMP %ld.%06ld\n",
+ (long)tv.tv_sec,
+ (long)tv.tv_usec);
+ }
+ if (siocgstampns) {
+ if (ioctl(sock, SIOCGSTAMPNS, &ts))
+ printf(" %s: %s\n", "SIOCGSTAMPNS", strerror(errno));
+ else
+ printf("SIOCGSTAMPNS %ld.%09ld\n",
+ (long)ts.tv_sec,
+ (long)ts.tv_nsec);
+ }
+}
+
+static void recvpacket(int sock, int recvmsg_flags,
+ int siocgstamp, int siocgstampns)
+{
+ char data[256];
+ struct msghdr msg;
+ struct iovec entry;
+ struct sockaddr_in from_addr;
+ struct {
+ struct cmsghdr cm;
+ char control[512];
+ } control;
+ int res;
+
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_iov = &entry;
+ msg.msg_iovlen = 1;
+ entry.iov_base = data;
+ entry.iov_len = sizeof(data);
+ msg.msg_name = (caddr_t)&from_addr;
+ msg.msg_namelen = sizeof(from_addr);
+ msg.msg_control = &control;
+ msg.msg_controllen = sizeof(control);
+
+ res = recvmsg(sock, &msg, recvmsg_flags|MSG_DONTWAIT);
+ if (res < 0) {
+ printf("%s %s: %s\n",
+ "recvmsg",
+ (recvmsg_flags & MSG_ERRQUEUE) ? "error" : "regular",
+ strerror(errno));
+ } else {
+ printpacket(&msg, res, data,
+ sock, recvmsg_flags,
+ siocgstamp, siocgstampns);
+ }
+}
+
+int main(int argc, char **argv)
+{
+ int so_timestamping_flags = 0;
+ int so_timestamp = 0;
+ int so_timestampns = 0;
+ int siocgstamp = 0;
+ int siocgstampns = 0;
+ int ip_multicast_loop = 0;
+ char *interface;
+ int i;
+ int enabled = 1;
+ int sock;
+ struct ifreq device;
+ struct ifreq hwtstamp;
+ struct hwtstamp_config hwconfig, hwconfig_requested;
+ struct sockaddr_in addr;
+ struct ip_mreq imr;
+ struct in_addr iaddr;
+ int val;
+ socklen_t len;
+ struct timeval next;
+ size_t if_len;
+
+ if (argc < 2)
+ usage(0);
+ interface = argv[1];
+ if_len = strlen(interface);
+ if (if_len >= IFNAMSIZ) {
+ printf("interface name exceeds IFNAMSIZ\n");
+ exit(1);
+ }
+
+ for (i = 2; i < argc; i++) {
+ if (!strcasecmp(argv[i], "SO_TIMESTAMP"))
+ so_timestamp = 1;
+ else if (!strcasecmp(argv[i], "SO_TIMESTAMPNS"))
+ so_timestampns = 1;
+ else if (!strcasecmp(argv[i], "SIOCGSTAMP"))
+ siocgstamp = 1;
+ else if (!strcasecmp(argv[i], "SIOCGSTAMPNS"))
+ siocgstampns = 1;
+ else if (!strcasecmp(argv[i], "IP_MULTICAST_LOOP"))
+ ip_multicast_loop = 1;
+ else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_TX_HARDWARE"))
+ so_timestamping_flags |= SOF_TIMESTAMPING_TX_HARDWARE;
+ else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_TX_SOFTWARE"))
+ so_timestamping_flags |= SOF_TIMESTAMPING_TX_SOFTWARE;
+ else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RX_HARDWARE"))
+ so_timestamping_flags |= SOF_TIMESTAMPING_RX_HARDWARE;
+ else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RX_SOFTWARE"))
+ so_timestamping_flags |= SOF_TIMESTAMPING_RX_SOFTWARE;
+ else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_SOFTWARE"))
+ so_timestamping_flags |= SOF_TIMESTAMPING_SOFTWARE;
+ else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RAW_HARDWARE"))
+ so_timestamping_flags |= SOF_TIMESTAMPING_RAW_HARDWARE;
+ else
+ usage(argv[i]);
+ }
+
+ sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);
+ if (sock < 0)
+ bail("socket");
+
+ memset(&device, 0, sizeof(device));
+ memcpy(device.ifr_name, interface, if_len + 1);
+ if (ioctl(sock, SIOCGIFADDR, &device) < 0)
+ bail("getting interface IP address");
+
+ memset(&hwtstamp, 0, sizeof(hwtstamp));
+ memcpy(hwtstamp.ifr_name, interface, if_len + 1);
+ hwtstamp.ifr_data = (void *)&hwconfig;
+ memset(&hwconfig, 0, sizeof(hwconfig));
+ hwconfig.tx_type =
+ (so_timestamping_flags & SOF_TIMESTAMPING_TX_HARDWARE) ?
+ HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF;
+ hwconfig.rx_filter =
+ (so_timestamping_flags & SOF_TIMESTAMPING_RX_HARDWARE) ?
+ HWTSTAMP_FILTER_PTP_V1_L4_SYNC : HWTSTAMP_FILTER_NONE;
+ hwconfig_requested = hwconfig;
+ if (ioctl(sock, SIOCSHWTSTAMP, &hwtstamp) < 0) {
+ if ((errno == EINVAL || errno == ENOTSUP) &&
+ hwconfig_requested.tx_type == HWTSTAMP_TX_OFF &&
+ hwconfig_requested.rx_filter == HWTSTAMP_FILTER_NONE)
+ printf("SIOCSHWTSTAMP: disabling hardware time stamping not possible\n");
+ else
+ bail("SIOCSHWTSTAMP");
+ }
+ printf("SIOCSHWTSTAMP: tx_type %d requested, got %d; rx_filter %d requested, got %d\n",
+ hwconfig_requested.tx_type, hwconfig.tx_type,
+ hwconfig_requested.rx_filter, hwconfig.rx_filter);
+
+ /* bind to PTP port */
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = htonl(INADDR_ANY);
+ addr.sin_port = htons(319 /* PTP event port */);
+ if (bind(sock,
+ (struct sockaddr *)&addr,
+ sizeof(struct sockaddr_in)) < 0)
+ bail("bind");
+
+ /* set multicast group for outgoing packets */
+ inet_aton("224.0.1.130", &iaddr); /* alternate PTP domain 1 */
+ addr.sin_addr = iaddr;
+ imr.imr_multiaddr.s_addr = iaddr.s_addr;
+ imr.imr_interface.s_addr =
+ ((struct sockaddr_in *)&device.ifr_addr)->sin_addr.s_addr;
+ if (setsockopt(sock, IPPROTO_IP, IP_MULTICAST_IF,
+ &imr.imr_interface.s_addr, sizeof(struct in_addr)) < 0)
+ bail("set multicast");
+
+ /* join multicast group, loop our own packet */
+ if (setsockopt(sock, IPPROTO_IP, IP_ADD_MEMBERSHIP,
+ &imr, sizeof(struct ip_mreq)) < 0)
+ bail("join multicast group");
+
+ if (setsockopt(sock, IPPROTO_IP, IP_MULTICAST_LOOP,
+ &ip_multicast_loop, sizeof(enabled)) < 0) {
+ bail("loop multicast");
+ }
+
+ /* set socket options for time stamping */
+ if (so_timestamp &&
+ setsockopt(sock, SOL_SOCKET, SO_TIMESTAMP,
+ &enabled, sizeof(enabled)) < 0)
+ bail("setsockopt SO_TIMESTAMP");
+
+ if (so_timestampns &&
+ setsockopt(sock, SOL_SOCKET, SO_TIMESTAMPNS,
+ &enabled, sizeof(enabled)) < 0)
+ bail("setsockopt SO_TIMESTAMPNS");
+
+ if (so_timestamping_flags &&
+ setsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING,
+ &so_timestamping_flags,
+ sizeof(so_timestamping_flags)) < 0)
+ bail("setsockopt SO_TIMESTAMPING");
+
+ /* request IP_PKTINFO for debugging purposes */
+ if (setsockopt(sock, SOL_IP, IP_PKTINFO,
+ &enabled, sizeof(enabled)) < 0)
+ printf("%s: %s\n", "setsockopt IP_PKTINFO", strerror(errno));
+
+ /* verify socket options */
+ len = sizeof(val);
+ if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMP, &val, &len) < 0)
+ printf("%s: %s\n", "getsockopt SO_TIMESTAMP", strerror(errno));
+ else
+ printf("SO_TIMESTAMP %d\n", val);
+
+ if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMPNS, &val, &len) < 0)
+ printf("%s: %s\n", "getsockopt SO_TIMESTAMPNS",
+ strerror(errno));
+ else
+ printf("SO_TIMESTAMPNS %d\n", val);
+
+ if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING, &val, &len) < 0) {
+ printf("%s: %s\n", "getsockopt SO_TIMESTAMPING",
+ strerror(errno));
+ } else {
+ printf("SO_TIMESTAMPING %d\n", val);
+ if (val != so_timestamping_flags)
+ printf(" not the expected value %d\n",
+ so_timestamping_flags);
+ }
+
+ /* send packets forever every five seconds */
+ gettimeofday(&next, 0);
+ next.tv_sec = (next.tv_sec + 1) / 5 * 5;
+ next.tv_usec = 0;
+ while (1) {
+ struct timeval now;
+ struct timeval delta;
+ long delta_us;
+ int res;
+ fd_set readfs, errorfs;
+
+ gettimeofday(&now, 0);
+ delta_us = (long)(next.tv_sec - now.tv_sec) * 1000000 +
+ (long)(next.tv_usec - now.tv_usec);
+ if (delta_us > 0) {
+ /* continue waiting for timeout or data */
+ delta.tv_sec = delta_us / 1000000;
+ delta.tv_usec = delta_us % 1000000;
+
+ FD_ZERO(&readfs);
+ FD_ZERO(&errorfs);
+ FD_SET(sock, &readfs);
+ FD_SET(sock, &errorfs);
+ printf("%ld.%06ld: select %ldus\n",
+ (long)now.tv_sec, (long)now.tv_usec,
+ delta_us);
+ res = select(sock + 1, &readfs, 0, &errorfs, &delta);
+ gettimeofday(&now, 0);
+ printf("%ld.%06ld: select returned: %d, %s\n",
+ (long)now.tv_sec, (long)now.tv_usec,
+ res,
+ res < 0 ? strerror(errno) : "success");
+ if (res > 0) {
+ if (FD_ISSET(sock, &readfs))
+ printf("ready for reading\n");
+ if (FD_ISSET(sock, &errorfs))
+ printf("has error\n");
+ recvpacket(sock, 0,
+ siocgstamp,
+ siocgstampns);
+ recvpacket(sock, MSG_ERRQUEUE,
+ siocgstamp,
+ siocgstampns);
+ }
+ } else {
+ /* write one packet */
+ sendpacket(sock,
+ (struct sockaddr *)&addr,
+ sizeof(addr));
+ next.tv_sec += 5;
+ continue;
+ }
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index 0ea44d9..b599f1f 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -101,6 +101,21 @@
bool notls;
};
+FIXTURE_VARIANT(tls)
+{
+ unsigned int tls_version;
+};
+
+FIXTURE_VARIANT_ADD(tls, 12)
+{
+ .tls_version = TLS_1_2_VERSION,
+};
+
+FIXTURE_VARIANT_ADD(tls, 13)
+{
+ .tls_version = TLS_1_3_VERSION,
+};
+
FIXTURE_SETUP(tls)
{
struct tls12_crypto_info_aes_gcm_128 tls12;
@@ -112,7 +127,7 @@
len = sizeof(addr);
memset(&tls12, 0, sizeof(tls12));
- tls12.info.version = TLS_1_3_VERSION;
+ tls12.info.version = variant->tls_version;
tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128;
addr.sin_family = AF_INET;
@@ -198,6 +213,64 @@
EXPECT_EQ(recv(self->cfd, buf, st.st_size, MSG_WAITALL), st.st_size);
}
+static void chunked_sendfile(struct __test_metadata *_metadata,
+ struct _test_data_tls *self,
+ uint16_t chunk_size,
+ uint16_t extra_payload_size)
+{
+ char buf[TLS_PAYLOAD_MAX_LEN];
+ uint16_t test_payload_size;
+ int size = 0;
+ int ret;
+ char filename[] = "/tmp/mytemp.XXXXXX";
+ int fd = mkstemp(filename);
+ off_t offset = 0;
+
+ unlink(filename);
+ ASSERT_GE(fd, 0);
+ EXPECT_GE(chunk_size, 1);
+ test_payload_size = chunk_size + extra_payload_size;
+ ASSERT_GE(TLS_PAYLOAD_MAX_LEN, test_payload_size);
+ memset(buf, 1, test_payload_size);
+ size = write(fd, buf, test_payload_size);
+ EXPECT_EQ(size, test_payload_size);
+ fsync(fd);
+
+ while (size > 0) {
+ ret = sendfile(self->fd, fd, &offset, chunk_size);
+ EXPECT_GE(ret, 0);
+ size -= ret;
+ }
+
+ EXPECT_EQ(recv(self->cfd, buf, test_payload_size, MSG_WAITALL),
+ test_payload_size);
+
+ close(fd);
+}
+
+TEST_F(tls, multi_chunk_sendfile)
+{
+ chunked_sendfile(_metadata, self, 4096, 4096);
+ chunked_sendfile(_metadata, self, 4096, 0);
+ chunked_sendfile(_metadata, self, 4096, 1);
+ chunked_sendfile(_metadata, self, 4096, 2048);
+ chunked_sendfile(_metadata, self, 8192, 2048);
+ chunked_sendfile(_metadata, self, 4096, 8192);
+ chunked_sendfile(_metadata, self, 8192, 4096);
+ chunked_sendfile(_metadata, self, 12288, 1024);
+ chunked_sendfile(_metadata, self, 12288, 2000);
+ chunked_sendfile(_metadata, self, 15360, 100);
+ chunked_sendfile(_metadata, self, 15360, 300);
+ chunked_sendfile(_metadata, self, 1, 4096);
+ chunked_sendfile(_metadata, self, 2048, 4096);
+ chunked_sendfile(_metadata, self, 2048, 8192);
+ chunked_sendfile(_metadata, self, 4096, 8192);
+ chunked_sendfile(_metadata, self, 1024, 12288);
+ chunked_sendfile(_metadata, self, 2000, 12288);
+ chunked_sendfile(_metadata, self, 100, 15360);
+ chunked_sendfile(_metadata, self, 300, 15360);
+}
+
TEST_F(tls, recv_max)
{
unsigned int send_len = TLS_PAYLOAD_MAX_LEN;
@@ -733,7 +806,7 @@
struct tls12_crypto_info_aes_gcm_128 tls12;
memset(&tls12, 0, sizeof(tls12));
- tls12.info.version = TLS_1_3_VERSION;
+ tls12.info.version = variant->tls_version;
tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128;
ret = setsockopt(self->fd, SOL_TLS, TLS_RX, &tls12,
@@ -1258,78 +1331,4 @@
close(cfd);
}
-TEST(tls12) {
- int fd, cfd;
- bool notls;
-
- struct tls12_crypto_info_aes_gcm_128 tls12;
- struct sockaddr_in addr;
- socklen_t len;
- int sfd, ret;
-
- notls = false;
- len = sizeof(addr);
-
- memset(&tls12, 0, sizeof(tls12));
- tls12.info.version = TLS_1_2_VERSION;
- tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128;
-
- addr.sin_family = AF_INET;
- addr.sin_addr.s_addr = htonl(INADDR_ANY);
- addr.sin_port = 0;
-
- fd = socket(AF_INET, SOCK_STREAM, 0);
- sfd = socket(AF_INET, SOCK_STREAM, 0);
-
- ret = bind(sfd, &addr, sizeof(addr));
- ASSERT_EQ(ret, 0);
- ret = listen(sfd, 10);
- ASSERT_EQ(ret, 0);
-
- ret = getsockname(sfd, &addr, &len);
- ASSERT_EQ(ret, 0);
-
- ret = connect(fd, &addr, sizeof(addr));
- ASSERT_EQ(ret, 0);
-
- ret = setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
- if (ret != 0) {
- notls = true;
- printf("Failure setting TCP_ULP, testing without tls\n");
- }
-
- if (!notls) {
- ret = setsockopt(fd, SOL_TLS, TLS_TX, &tls12,
- sizeof(tls12));
- ASSERT_EQ(ret, 0);
- }
-
- cfd = accept(sfd, &addr, &len);
- ASSERT_GE(cfd, 0);
-
- if (!notls) {
- ret = setsockopt(cfd, IPPROTO_TCP, TCP_ULP, "tls",
- sizeof("tls"));
- ASSERT_EQ(ret, 0);
-
- ret = setsockopt(cfd, SOL_TLS, TLS_RX, &tls12,
- sizeof(tls12));
- ASSERT_EQ(ret, 0);
- }
-
- close(sfd);
-
- char const *test_str = "test_read";
- int send_len = 10;
- char buf[10];
-
- send_len = strlen(test_str) + 1;
- EXPECT_EQ(send(fd, test_str, send_len, 0), send_len);
- EXPECT_NE(recv(cfd, buf, send_len, 0), -1);
- EXPECT_EQ(memcmp(buf, test_str, send_len), 0);
-
- close(fd);
- close(cfd);
-}
-
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/net/traceroute.sh b/tools/testing/selftests/net/traceroute.sh
new file mode 100755
index 0000000..de9ca97
--- /dev/null
+++ b/tools/testing/selftests/net/traceroute.sh
@@ -0,0 +1,322 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run traceroute/traceroute6 tests
+#
+
+VERBOSE=0
+PAUSE_ON_FAIL=no
+
+################################################################################
+#
+log_test()
+{
+ local rc=$1
+ local expected=$2
+ local msg="$3"
+
+ if [ ${rc} -eq ${expected} ]; then
+ printf "TEST: %-60s [ OK ]\n" "${msg}"
+ nsuccess=$((nsuccess+1))
+ else
+ ret=1
+ nfail=$((nfail+1))
+ printf "TEST: %-60s [FAIL]\n" "${msg}"
+ if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+ echo
+ echo "hit enter to continue, 'q' to quit"
+ read a
+ [ "$a" = "q" ] && exit 1
+ fi
+ fi
+}
+
+run_cmd()
+{
+ local ns
+ local cmd
+ local out
+ local rc
+
+ ns="$1"
+ shift
+ cmd="$*"
+
+ if [ "$VERBOSE" = "1" ]; then
+ printf " COMMAND: $cmd\n"
+ fi
+
+ out=$(eval ip netns exec ${ns} ${cmd} 2>&1)
+ rc=$?
+ if [ "$VERBOSE" = "1" -a -n "$out" ]; then
+ echo " $out"
+ fi
+
+ [ "$VERBOSE" = "1" ] && echo
+
+ return $rc
+}
+
+################################################################################
+# create namespaces and interconnects
+
+create_ns()
+{
+ local ns=$1
+ local addr=$2
+ local addr6=$3
+
+ [ -z "${addr}" ] && addr="-"
+ [ -z "${addr6}" ] && addr6="-"
+
+ ip netns add ${ns}
+
+ ip netns exec ${ns} ip link set lo up
+ if [ "${addr}" != "-" ]; then
+ ip netns exec ${ns} ip addr add dev lo ${addr}
+ fi
+ if [ "${addr6}" != "-" ]; then
+ ip netns exec ${ns} ip -6 addr add dev lo ${addr6}
+ fi
+
+ ip netns exec ${ns} ip ro add unreachable default metric 8192
+ ip netns exec ${ns} ip -6 ro add unreachable default metric 8192
+
+ ip netns exec ${ns} sysctl -qw net.ipv4.ip_forward=1
+ ip netns exec ${ns} sysctl -qw net.ipv6.conf.all.keep_addr_on_down=1
+ ip netns exec ${ns} sysctl -qw net.ipv6.conf.all.forwarding=1
+ ip netns exec ${ns} sysctl -qw net.ipv6.conf.default.forwarding=1
+ ip netns exec ${ns} sysctl -qw net.ipv6.conf.default.accept_dad=0
+}
+
+# create veth pair to connect namespaces and apply addresses.
+connect_ns()
+{
+ local ns1=$1
+ local ns1_dev=$2
+ local ns1_addr=$3
+ local ns1_addr6=$4
+ local ns2=$5
+ local ns2_dev=$6
+ local ns2_addr=$7
+ local ns2_addr6=$8
+
+ ip netns exec ${ns1} ip li add ${ns1_dev} type veth peer name tmp
+ ip netns exec ${ns1} ip li set ${ns1_dev} up
+ ip netns exec ${ns1} ip li set tmp netns ${ns2} name ${ns2_dev}
+ ip netns exec ${ns2} ip li set ${ns2_dev} up
+
+ if [ "${ns1_addr}" != "-" ]; then
+ ip netns exec ${ns1} ip addr add dev ${ns1_dev} ${ns1_addr}
+ fi
+
+ if [ "${ns2_addr}" != "-" ]; then
+ ip netns exec ${ns2} ip addr add dev ${ns2_dev} ${ns2_addr}
+ fi
+
+ if [ "${ns1_addr6}" != "-" ]; then
+ ip netns exec ${ns1} ip addr add dev ${ns1_dev} ${ns1_addr6}
+ fi
+
+ if [ "${ns2_addr6}" != "-" ]; then
+ ip netns exec ${ns2} ip addr add dev ${ns2_dev} ${ns2_addr6}
+ fi
+}
+
+################################################################################
+# traceroute6 test
+#
+# Verify that in this scenario
+#
+# ------------------------ N2
+# | |
+# ------ ------ N3 ----
+# | R1 | | R2 |------|H2|
+# ------ ------ ----
+# | |
+# ------------------------ N1
+# |
+# ----
+# |H1|
+# ----
+#
+# where H1's default route goes through R1 and R1's default route goes
+# through R2 over N2, traceroute6 from H1 to H2 reports R2's address
+# on N2 and not N1.
+#
+# Addresses are assigned as follows:
+#
+# N1: 2000:101::/64
+# N2: 2000:102::/64
+# N3: 2000:103::/64
+#
+# R1's host part of address: 1
+# R2's host part of address: 2
+# H1's host part of address: 3
+# H2's host part of address: 4
+#
+# For example:
+# the IPv6 address of R1's interface on N2 is 2000:102::1/64
+
+cleanup_traceroute6()
+{
+ local ns
+
+ for ns in host-1 host-2 router-1 router-2
+ do
+ ip netns del ${ns} 2>/dev/null
+ done
+}
+
+setup_traceroute6()
+{
+ brdev=br0
+
+ # start clean
+ cleanup_traceroute6
+
+ set -e
+ create_ns host-1
+ create_ns host-2
+ create_ns router-1
+ create_ns router-2
+
+ # Setup N3
+ connect_ns router-2 eth3 - 2000:103::2/64 host-2 eth3 - 2000:103::4/64
+ ip netns exec host-2 ip route add default via 2000:103::2
+
+ # Setup N2
+ connect_ns router-1 eth2 - 2000:102::1/64 router-2 eth2 - 2000:102::2/64
+ ip netns exec router-1 ip route add default via 2000:102::2
+
+ # Setup N1. host-1 and router-2 connect to a bridge in router-1.
+ ip netns exec router-1 ip link add name ${brdev} type bridge
+ ip netns exec router-1 ip link set ${brdev} up
+ ip netns exec router-1 ip addr add 2000:101::1/64 dev ${brdev}
+
+ connect_ns host-1 eth0 - 2000:101::3/64 router-1 eth0 - -
+ ip netns exec router-1 ip link set dev eth0 master ${brdev}
+ ip netns exec host-1 ip route add default via 2000:101::1
+
+ connect_ns router-2 eth1 - 2000:101::2/64 router-1 eth1 - -
+ ip netns exec router-1 ip link set dev eth1 master ${brdev}
+
+ # Prime the network
+ ip netns exec host-1 ping6 -c5 2000:103::4 >/dev/null 2>&1
+
+ set +e
+}
+
+run_traceroute6()
+{
+ if [ ! -x "$(command -v traceroute6)" ]; then
+ echo "SKIP: Could not run IPV6 test without traceroute6"
+ return
+ fi
+
+ setup_traceroute6
+
+ # traceroute6 host-2 from host-1 (expects 2000:102::2)
+ run_cmd host-1 "traceroute6 2000:103::4 | grep -q 2000:102::2"
+ log_test $? 0 "IPV6 traceroute"
+
+ cleanup_traceroute6
+}
+
+################################################################################
+# traceroute test
+#
+# Verify that traceroute from H1 to H2 shows 1.0.1.1 in this scenario
+#
+# 1.0.3.1/24
+# ---- 1.0.1.3/24 1.0.1.1/24 ---- 1.0.2.1/24 1.0.2.4/24 ----
+# |H1|--------------------------|R1|--------------------------|H2|
+# ---- N1 ---- N2 ----
+#
+# where net.ipv4.icmp_errors_use_inbound_ifaddr is set on R1 and
+# 1.0.3.1/24 and 1.0.1.1/24 are respectively R1's primary and secondary
+# address on N1.
+#
+
+cleanup_traceroute()
+{
+ local ns
+
+ for ns in host-1 host-2 router
+ do
+ ip netns del ${ns} 2>/dev/null
+ done
+}
+
+setup_traceroute()
+{
+ # start clean
+ cleanup_traceroute
+
+ set -e
+ create_ns host-1
+ create_ns host-2
+ create_ns router
+
+ connect_ns host-1 eth0 1.0.1.3/24 - \
+ router eth1 1.0.3.1/24 -
+ ip netns exec host-1 ip route add default via 1.0.1.1
+
+ ip netns exec router ip addr add 1.0.1.1/24 dev eth1
+ ip netns exec router sysctl -qw \
+ net.ipv4.icmp_errors_use_inbound_ifaddr=1
+
+ connect_ns host-2 eth0 1.0.2.4/24 - \
+ router eth2 1.0.2.1/24 -
+ ip netns exec host-2 ip route add default via 1.0.2.1
+
+ # Prime the network
+ ip netns exec host-1 ping -c5 1.0.2.4 >/dev/null 2>&1
+
+ set +e
+}
+
+run_traceroute()
+{
+ if [ ! -x "$(command -v traceroute)" ]; then
+ echo "SKIP: Could not run IPV4 test without traceroute"
+ return
+ fi
+
+ setup_traceroute
+
+ # traceroute host-2 from host-1 (expects 1.0.1.1). Takes a while.
+ run_cmd host-1 "traceroute 1.0.2.4 | grep -q 1.0.1.1"
+ log_test $? 0 "IPV4 traceroute"
+
+ cleanup_traceroute
+}
+
+################################################################################
+# Run tests
+
+run_tests()
+{
+ run_traceroute6
+ run_traceroute
+}
+
+################################################################################
+# main
+
+declare -i nfail=0
+declare -i nsuccess=0
+
+while getopts :pv o
+do
+ case $o in
+ p) PAUSE_ON_FAIL=yes;;
+ v) VERBOSE=$(($VERBOSE + 1));;
+ *) exit 1;;
+ esac
+done
+
+run_tests
+
+printf "\nTests passed: %3d\n" ${nsuccess}
+printf "Tests failed: %3d\n" ${nfail}
diff --git a/tools/testing/selftests/net/txtimestamp.c b/tools/testing/selftests/net/txtimestamp.c
new file mode 100644
index 0000000..fabb1d5
--- /dev/null
+++ b/tools/testing/selftests/net/txtimestamp.c
@@ -0,0 +1,922 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014 Google Inc.
+ * Author: willemb@google.com (Willem de Bruijn)
+ *
+ * Test software tx timestamping, including
+ *
+ * - SCHED, SND and ACK timestamps
+ * - RAW, UDP and TCP
+ * - IPv4 and IPv6
+ * - various packet sizes (to test GSO and TSO)
+ *
+ * Consult the command line arguments for help on running
+ * the various testcases.
+ *
+ * This test requires a dummy TCP server.
+ * A simple `nc6 [-u] -l -p $DESTPORT` will do
+ */
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <asm/types.h>
+#include <error.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/errqueue.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ipv6.h>
+#include <linux/net_tstamp.h>
+#include <netdb.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/udp.h>
+#include <netinet/tcp.h>
+#include <poll.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <sys/select.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#define NSEC_PER_USEC 1000L
+#define USEC_PER_SEC 1000000L
+#define NSEC_PER_SEC 1000000000LL
+
+/* command line parameters */
+static int cfg_proto = SOCK_STREAM;
+static int cfg_ipproto = IPPROTO_TCP;
+static int cfg_num_pkts = 4;
+static int do_ipv4 = 1;
+static int do_ipv6 = 1;
+static int cfg_payload_len = 10;
+static int cfg_poll_timeout = 100;
+static int cfg_delay_snd;
+static int cfg_delay_ack;
+static int cfg_delay_tolerance_usec = 500;
+static bool cfg_show_payload;
+static bool cfg_do_pktinfo;
+static bool cfg_busy_poll;
+static int cfg_sleep_usec = 50 * 1000;
+static bool cfg_loop_nodata;
+static bool cfg_use_cmsg;
+static bool cfg_use_pf_packet;
+static bool cfg_use_epoll;
+static bool cfg_epollet;
+static bool cfg_do_listen;
+static uint16_t dest_port = 9000;
+static bool cfg_print_nsec;
+
+static struct sockaddr_in daddr;
+static struct sockaddr_in6 daddr6;
+static struct timespec ts_usr;
+
+static int saved_tskey = -1;
+static int saved_tskey_type = -1;
+
+struct timing_event {
+ int64_t min;
+ int64_t max;
+ int64_t total;
+ int count;
+};
+
+static struct timing_event usr_enq;
+static struct timing_event usr_snd;
+static struct timing_event usr_ack;
+
+static bool test_failed;
+
+static int64_t timespec_to_ns64(struct timespec *ts)
+{
+ return ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec;
+}
+
+static int64_t timespec_to_us64(struct timespec *ts)
+{
+ return ts->tv_sec * USEC_PER_SEC + ts->tv_nsec / NSEC_PER_USEC;
+}
+
+static void init_timing_event(struct timing_event *te)
+{
+ te->min = INT64_MAX;
+ te->max = 0;
+ te->total = 0;
+ te->count = 0;
+}
+
+static void add_timing_event(struct timing_event *te,
+ struct timespec *t_start, struct timespec *t_end)
+{
+ int64_t ts_delta = timespec_to_ns64(t_end) - timespec_to_ns64(t_start);
+
+ te->count++;
+ if (ts_delta < te->min)
+ te->min = ts_delta;
+ if (ts_delta > te->max)
+ te->max = ts_delta;
+ te->total += ts_delta;
+}
+
+static void validate_key(int tskey, int tstype)
+{
+ int stepsize;
+
+ /* compare key for each subsequent request
+ * must only test for one type, the first one requested
+ */
+ if (saved_tskey == -1)
+ saved_tskey_type = tstype;
+ else if (saved_tskey_type != tstype)
+ return;
+
+ stepsize = cfg_proto == SOCK_STREAM ? cfg_payload_len : 1;
+ if (tskey != saved_tskey + stepsize) {
+ fprintf(stderr, "ERROR: key %d, expected %d\n",
+ tskey, saved_tskey + stepsize);
+ test_failed = true;
+ }
+
+ saved_tskey = tskey;
+}
+
+static void validate_timestamp(struct timespec *cur, int min_delay)
+{
+ int64_t cur64, start64;
+ int max_delay;
+
+ cur64 = timespec_to_us64(cur);
+ start64 = timespec_to_us64(&ts_usr);
+ max_delay = min_delay + cfg_delay_tolerance_usec;
+
+ if (cur64 < start64 + min_delay || cur64 > start64 + max_delay) {
+ fprintf(stderr, "ERROR: %lu us expected between %d and %d\n",
+ cur64 - start64, min_delay, max_delay);
+ test_failed = true;
+ }
+}
+
+static void __print_ts_delta_formatted(int64_t ts_delta)
+{
+ if (cfg_print_nsec)
+ fprintf(stderr, "%lu ns", ts_delta);
+ else
+ fprintf(stderr, "%lu us", ts_delta / NSEC_PER_USEC);
+}
+
+static void __print_timestamp(const char *name, struct timespec *cur,
+ uint32_t key, int payload_len)
+{
+ int64_t ts_delta;
+
+ if (!(cur->tv_sec | cur->tv_nsec))
+ return;
+
+ if (cfg_print_nsec)
+ fprintf(stderr, " %s: %lu s %lu ns (seq=%u, len=%u)",
+ name, cur->tv_sec, cur->tv_nsec,
+ key, payload_len);
+ else
+ fprintf(stderr, " %s: %lu s %lu us (seq=%u, len=%u)",
+ name, cur->tv_sec, cur->tv_nsec / NSEC_PER_USEC,
+ key, payload_len);
+
+ if (cur != &ts_usr) {
+ ts_delta = timespec_to_ns64(cur) - timespec_to_ns64(&ts_usr);
+ fprintf(stderr, " (USR +");
+ __print_ts_delta_formatted(ts_delta);
+ fprintf(stderr, ")");
+ }
+
+ fprintf(stderr, "\n");
+}
+
+static void print_timestamp_usr(void)
+{
+ if (clock_gettime(CLOCK_REALTIME, &ts_usr))
+ error(1, errno, "clock_gettime");
+
+ __print_timestamp(" USR", &ts_usr, 0, 0);
+}
+
+static void print_timestamp(struct scm_timestamping *tss, int tstype,
+ int tskey, int payload_len)
+{
+ const char *tsname;
+
+ validate_key(tskey, tstype);
+
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ tsname = " ENQ";
+ validate_timestamp(&tss->ts[0], 0);
+ add_timing_event(&usr_enq, &ts_usr, &tss->ts[0]);
+ break;
+ case SCM_TSTAMP_SND:
+ tsname = " SND";
+ validate_timestamp(&tss->ts[0], cfg_delay_snd);
+ add_timing_event(&usr_snd, &ts_usr, &tss->ts[0]);
+ break;
+ case SCM_TSTAMP_ACK:
+ tsname = " ACK";
+ validate_timestamp(&tss->ts[0], cfg_delay_ack);
+ add_timing_event(&usr_ack, &ts_usr, &tss->ts[0]);
+ break;
+ default:
+ error(1, 0, "unknown timestamp type: %u",
+ tstype);
+ }
+ __print_timestamp(tsname, &tss->ts[0], tskey, payload_len);
+}
+
+static void print_timing_event(char *name, struct timing_event *te)
+{
+ if (!te->count)
+ return;
+
+ fprintf(stderr, " %s: count=%d", name, te->count);
+ fprintf(stderr, ", avg=");
+ __print_ts_delta_formatted((int64_t)(te->total / te->count));
+ fprintf(stderr, ", min=");
+ __print_ts_delta_formatted(te->min);
+ fprintf(stderr, ", max=");
+ __print_ts_delta_formatted(te->max);
+ fprintf(stderr, "\n");
+}
+
+/* TODO: convert to check_and_print payload once API is stable */
+static void print_payload(char *data, int len)
+{
+ int i;
+
+ if (!len)
+ return;
+
+ if (len > 70)
+ len = 70;
+
+ fprintf(stderr, "payload: ");
+ for (i = 0; i < len; i++)
+ fprintf(stderr, "%02hhx ", data[i]);
+ fprintf(stderr, "\n");
+}
+
+static void print_pktinfo(int family, int ifindex, void *saddr, void *daddr)
+{
+ char sa[INET6_ADDRSTRLEN], da[INET6_ADDRSTRLEN];
+
+ fprintf(stderr, " pktinfo: ifindex=%u src=%s dst=%s\n",
+ ifindex,
+ saddr ? inet_ntop(family, saddr, sa, sizeof(sa)) : "unknown",
+ daddr ? inet_ntop(family, daddr, da, sizeof(da)) : "unknown");
+}
+
+static void __epoll(int epfd)
+{
+ struct epoll_event events;
+ int ret;
+
+ memset(&events, 0, sizeof(events));
+ ret = epoll_wait(epfd, &events, 1, cfg_poll_timeout);
+ if (ret != 1)
+ error(1, errno, "epoll_wait");
+}
+
+static void __poll(int fd)
+{
+ struct pollfd pollfd;
+ int ret;
+
+ memset(&pollfd, 0, sizeof(pollfd));
+ pollfd.fd = fd;
+ ret = poll(&pollfd, 1, cfg_poll_timeout);
+ if (ret != 1)
+ error(1, errno, "poll");
+}
+
+static void __recv_errmsg_cmsg(struct msghdr *msg, int payload_len)
+{
+ struct sock_extended_err *serr = NULL;
+ struct scm_timestamping *tss = NULL;
+ struct cmsghdr *cm;
+ int batch = 0;
+
+ for (cm = CMSG_FIRSTHDR(msg);
+ cm && cm->cmsg_len;
+ cm = CMSG_NXTHDR(msg, cm)) {
+ if (cm->cmsg_level == SOL_SOCKET &&
+ cm->cmsg_type == SCM_TIMESTAMPING) {
+ tss = (void *) CMSG_DATA(cm);
+ } else if ((cm->cmsg_level == SOL_IP &&
+ cm->cmsg_type == IP_RECVERR) ||
+ (cm->cmsg_level == SOL_IPV6 &&
+ cm->cmsg_type == IPV6_RECVERR) ||
+ (cm->cmsg_level == SOL_PACKET &&
+ cm->cmsg_type == PACKET_TX_TIMESTAMP)) {
+ serr = (void *) CMSG_DATA(cm);
+ if (serr->ee_errno != ENOMSG ||
+ serr->ee_origin != SO_EE_ORIGIN_TIMESTAMPING) {
+ fprintf(stderr, "unknown ip error %d %d\n",
+ serr->ee_errno,
+ serr->ee_origin);
+ serr = NULL;
+ }
+ } else if (cm->cmsg_level == SOL_IP &&
+ cm->cmsg_type == IP_PKTINFO) {
+ struct in_pktinfo *info = (void *) CMSG_DATA(cm);
+ print_pktinfo(AF_INET, info->ipi_ifindex,
+ &info->ipi_spec_dst, &info->ipi_addr);
+ } else if (cm->cmsg_level == SOL_IPV6 &&
+ cm->cmsg_type == IPV6_PKTINFO) {
+ struct in6_pktinfo *info6 = (void *) CMSG_DATA(cm);
+ print_pktinfo(AF_INET6, info6->ipi6_ifindex,
+ NULL, &info6->ipi6_addr);
+ } else
+ fprintf(stderr, "unknown cmsg %d,%d\n",
+ cm->cmsg_level, cm->cmsg_type);
+
+ if (serr && tss) {
+ print_timestamp(tss, serr->ee_info, serr->ee_data,
+ payload_len);
+ serr = NULL;
+ tss = NULL;
+ batch++;
+ }
+ }
+
+ if (batch > 1)
+ fprintf(stderr, "batched %d timestamps\n", batch);
+}
+
+static int recv_errmsg(int fd)
+{
+ static char ctrl[1024 /* overprovision*/];
+ static struct msghdr msg;
+ struct iovec entry;
+ static char *data;
+ int ret = 0;
+
+ data = malloc(cfg_payload_len);
+ if (!data)
+ error(1, 0, "malloc");
+
+ memset(&msg, 0, sizeof(msg));
+ memset(&entry, 0, sizeof(entry));
+ memset(ctrl, 0, sizeof(ctrl));
+
+ entry.iov_base = data;
+ entry.iov_len = cfg_payload_len;
+ msg.msg_iov = &entry;
+ msg.msg_iovlen = 1;
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_control = ctrl;
+ msg.msg_controllen = sizeof(ctrl);
+
+ ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
+ if (ret == -1 && errno != EAGAIN)
+ error(1, errno, "recvmsg");
+
+ if (ret >= 0) {
+ __recv_errmsg_cmsg(&msg, ret);
+ if (cfg_show_payload)
+ print_payload(data, cfg_payload_len);
+ }
+
+ free(data);
+ return ret == -1;
+}
+
+static uint16_t get_ip_csum(const uint16_t *start, int num_words,
+ unsigned long sum)
+{
+ int i;
+
+ for (i = 0; i < num_words; i++)
+ sum += start[i];
+
+ while (sum >> 16)
+ sum = (sum & 0xFFFF) + (sum >> 16);
+
+ return ~sum;
+}
+
+static uint16_t get_udp_csum(const struct udphdr *udph, int alen)
+{
+ unsigned long pseudo_sum, csum_len;
+ const void *csum_start = udph;
+
+ pseudo_sum = htons(IPPROTO_UDP);
+ pseudo_sum += udph->len;
+
+ /* checksum ip(v6) addresses + udp header + payload */
+ csum_start -= alen * 2;
+ csum_len = ntohs(udph->len) + alen * 2;
+
+ return get_ip_csum(csum_start, csum_len >> 1, pseudo_sum);
+}
+
+static int fill_header_ipv4(void *p)
+{
+ struct iphdr *iph = p;
+
+ memset(iph, 0, sizeof(*iph));
+
+ iph->ihl = 5;
+ iph->version = 4;
+ iph->ttl = 2;
+ iph->saddr = daddr.sin_addr.s_addr; /* set for udp csum calc */
+ iph->daddr = daddr.sin_addr.s_addr;
+ iph->protocol = IPPROTO_UDP;
+
+ /* kernel writes saddr, csum, len */
+
+ return sizeof(*iph);
+}
+
+static int fill_header_ipv6(void *p)
+{
+ struct ipv6hdr *ip6h = p;
+
+ memset(ip6h, 0, sizeof(*ip6h));
+
+ ip6h->version = 6;
+ ip6h->payload_len = htons(sizeof(struct udphdr) + cfg_payload_len);
+ ip6h->nexthdr = IPPROTO_UDP;
+ ip6h->hop_limit = 64;
+
+ ip6h->saddr = daddr6.sin6_addr;
+ ip6h->daddr = daddr6.sin6_addr;
+
+ /* kernel does not write saddr in case of ipv6 */
+
+ return sizeof(*ip6h);
+}
+
+static void fill_header_udp(void *p, bool is_ipv4)
+{
+ struct udphdr *udph = p;
+
+ udph->source = ntohs(dest_port + 1); /* spoof */
+ udph->dest = ntohs(dest_port);
+ udph->len = ntohs(sizeof(*udph) + cfg_payload_len);
+ udph->check = 0;
+
+ udph->check = get_udp_csum(udph, is_ipv4 ? sizeof(struct in_addr) :
+ sizeof(struct in6_addr));
+}
+
+static void do_test(int family, unsigned int report_opt)
+{
+ char control[CMSG_SPACE(sizeof(uint32_t))];
+ struct sockaddr_ll laddr;
+ unsigned int sock_opt;
+ struct cmsghdr *cmsg;
+ struct msghdr msg;
+ struct iovec iov;
+ char *buf;
+ int fd, i, val = 1, total_len, epfd = 0;
+
+ init_timing_event(&usr_enq);
+ init_timing_event(&usr_snd);
+ init_timing_event(&usr_ack);
+
+ total_len = cfg_payload_len;
+ if (cfg_use_pf_packet || cfg_proto == SOCK_RAW) {
+ total_len += sizeof(struct udphdr);
+ if (cfg_use_pf_packet || cfg_ipproto == IPPROTO_RAW) {
+ if (family == PF_INET)
+ total_len += sizeof(struct iphdr);
+ else
+ total_len += sizeof(struct ipv6hdr);
+ }
+ /* special case, only rawv6_sendmsg:
+ * pass proto in sin6_port if not connected
+ * also see ANK comment in net/ipv4/raw.c
+ */
+ daddr6.sin6_port = htons(cfg_ipproto);
+ }
+
+ buf = malloc(total_len);
+ if (!buf)
+ error(1, 0, "malloc");
+
+ fd = socket(cfg_use_pf_packet ? PF_PACKET : family,
+ cfg_proto, cfg_ipproto);
+ if (fd < 0)
+ error(1, errno, "socket");
+
+ if (cfg_use_epoll) {
+ struct epoll_event ev;
+
+ memset(&ev, 0, sizeof(ev));
+ ev.data.fd = fd;
+ if (cfg_epollet)
+ ev.events |= EPOLLET;
+ epfd = epoll_create(1);
+ if (epfd <= 0)
+ error(1, errno, "epoll_create");
+ if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev))
+ error(1, errno, "epoll_ctl");
+ }
+
+ /* reset expected key on each new socket */
+ saved_tskey = -1;
+
+ if (cfg_proto == SOCK_STREAM) {
+ if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
+ (char*) &val, sizeof(val)))
+ error(1, 0, "setsockopt no nagle");
+
+ if (family == PF_INET) {
+ if (connect(fd, (void *) &daddr, sizeof(daddr)))
+ error(1, errno, "connect ipv4");
+ } else {
+ if (connect(fd, (void *) &daddr6, sizeof(daddr6)))
+ error(1, errno, "connect ipv6");
+ }
+ }
+
+ if (cfg_do_pktinfo) {
+ if (family == AF_INET6) {
+ if (setsockopt(fd, SOL_IPV6, IPV6_RECVPKTINFO,
+ &val, sizeof(val)))
+ error(1, errno, "setsockopt pktinfo ipv6");
+ } else {
+ if (setsockopt(fd, SOL_IP, IP_PKTINFO,
+ &val, sizeof(val)))
+ error(1, errno, "setsockopt pktinfo ipv4");
+ }
+ }
+
+ sock_opt = SOF_TIMESTAMPING_SOFTWARE |
+ SOF_TIMESTAMPING_OPT_CMSG |
+ SOF_TIMESTAMPING_OPT_ID;
+
+ if (!cfg_use_cmsg)
+ sock_opt |= report_opt;
+
+ if (cfg_loop_nodata)
+ sock_opt |= SOF_TIMESTAMPING_OPT_TSONLY;
+
+ if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
+ (char *) &sock_opt, sizeof(sock_opt)))
+ error(1, 0, "setsockopt timestamping");
+
+ for (i = 0; i < cfg_num_pkts; i++) {
+ memset(&msg, 0, sizeof(msg));
+ memset(buf, 'a' + i, total_len);
+
+ if (cfg_use_pf_packet || cfg_proto == SOCK_RAW) {
+ int off = 0;
+
+ if (cfg_use_pf_packet || cfg_ipproto == IPPROTO_RAW) {
+ if (family == PF_INET)
+ off = fill_header_ipv4(buf);
+ else
+ off = fill_header_ipv6(buf);
+ }
+
+ fill_header_udp(buf + off, family == PF_INET);
+ }
+
+ print_timestamp_usr();
+
+ iov.iov_base = buf;
+ iov.iov_len = total_len;
+
+ if (cfg_proto != SOCK_STREAM) {
+ if (cfg_use_pf_packet) {
+ memset(&laddr, 0, sizeof(laddr));
+
+ laddr.sll_family = AF_PACKET;
+ laddr.sll_ifindex = 1;
+ laddr.sll_protocol = htons(family == AF_INET ? ETH_P_IP : ETH_P_IPV6);
+ laddr.sll_halen = ETH_ALEN;
+
+ msg.msg_name = (void *)&laddr;
+ msg.msg_namelen = sizeof(laddr);
+ } else if (family == PF_INET) {
+ msg.msg_name = (void *)&daddr;
+ msg.msg_namelen = sizeof(daddr);
+ } else {
+ msg.msg_name = (void *)&daddr6;
+ msg.msg_namelen = sizeof(daddr6);
+ }
+ }
+
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ if (cfg_use_cmsg) {
+ memset(control, 0, sizeof(control));
+
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SO_TIMESTAMPING;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t));
+
+ *((uint32_t *) CMSG_DATA(cmsg)) = report_opt;
+ }
+
+ val = sendmsg(fd, &msg, 0);
+ if (val != total_len)
+ error(1, errno, "send");
+
+ /* wait for all errors to be queued, else ACKs arrive OOO */
+ if (cfg_sleep_usec)
+ usleep(cfg_sleep_usec);
+
+ if (!cfg_busy_poll) {
+ if (cfg_use_epoll)
+ __epoll(epfd);
+ else
+ __poll(fd);
+ }
+
+ while (!recv_errmsg(fd)) {}
+ }
+
+ print_timing_event("USR-ENQ", &usr_enq);
+ print_timing_event("USR-SND", &usr_snd);
+ print_timing_event("USR-ACK", &usr_ack);
+
+ if (close(fd))
+ error(1, errno, "close");
+
+ free(buf);
+ usleep(100 * NSEC_PER_USEC);
+}
+
+static void __attribute__((noreturn)) usage(const char *filepath)
+{
+ fprintf(stderr, "\nUsage: %s [options] hostname\n"
+ "\nwhere options are:\n"
+ " -4: only IPv4\n"
+ " -6: only IPv6\n"
+ " -h: show this message\n"
+ " -b: busy poll to read from error queue\n"
+ " -c N: number of packets for each test\n"
+ " -C: use cmsg to set tstamp recording options\n"
+ " -e: use level-triggered epoll() instead of poll()\n"
+ " -E: use event-triggered epoll() instead of poll()\n"
+ " -F: poll()/epoll() waits forever for an event\n"
+ " -I: request PKTINFO\n"
+ " -l N: send N bytes at a time\n"
+ " -L listen on hostname and port\n"
+ " -n: set no-payload option\n"
+ " -N: print timestamps and durations in nsec (instead of usec)\n"
+ " -p N: connect to port N\n"
+ " -P: use PF_PACKET\n"
+ " -r: use raw\n"
+ " -R: use raw (IP_HDRINCL)\n"
+ " -S N: usec to sleep before reading error queue\n"
+ " -t N: tolerance (usec) for timestamp validation\n"
+ " -u: use udp\n"
+ " -v: validate SND delay (usec)\n"
+ " -V: validate ACK delay (usec)\n"
+ " -x: show payload (up to 70 bytes)\n",
+ filepath);
+ exit(1);
+}
+
+static void parse_opt(int argc, char **argv)
+{
+ int proto_count = 0;
+ int c;
+
+ while ((c = getopt(argc, argv,
+ "46bc:CeEFhIl:LnNp:PrRS:t:uv:V:x")) != -1) {
+ switch (c) {
+ case '4':
+ do_ipv6 = 0;
+ break;
+ case '6':
+ do_ipv4 = 0;
+ break;
+ case 'b':
+ cfg_busy_poll = true;
+ break;
+ case 'c':
+ cfg_num_pkts = strtoul(optarg, NULL, 10);
+ break;
+ case 'C':
+ cfg_use_cmsg = true;
+ break;
+ case 'e':
+ cfg_use_epoll = true;
+ break;
+ case 'E':
+ cfg_use_epoll = true;
+ cfg_epollet = true;
+ case 'F':
+ cfg_poll_timeout = -1;
+ break;
+ case 'I':
+ cfg_do_pktinfo = true;
+ break;
+ case 'l':
+ cfg_payload_len = strtoul(optarg, NULL, 10);
+ break;
+ case 'L':
+ cfg_do_listen = true;
+ break;
+ case 'n':
+ cfg_loop_nodata = true;
+ break;
+ case 'N':
+ cfg_print_nsec = true;
+ break;
+ case 'p':
+ dest_port = strtoul(optarg, NULL, 10);
+ break;
+ case 'P':
+ proto_count++;
+ cfg_use_pf_packet = true;
+ cfg_proto = SOCK_DGRAM;
+ cfg_ipproto = 0;
+ break;
+ case 'r':
+ proto_count++;
+ cfg_proto = SOCK_RAW;
+ cfg_ipproto = IPPROTO_UDP;
+ break;
+ case 'R':
+ proto_count++;
+ cfg_proto = SOCK_RAW;
+ cfg_ipproto = IPPROTO_RAW;
+ break;
+ case 'S':
+ cfg_sleep_usec = strtoul(optarg, NULL, 10);
+ break;
+ case 't':
+ cfg_delay_tolerance_usec = strtoul(optarg, NULL, 10);
+ break;
+ case 'u':
+ proto_count++;
+ cfg_proto = SOCK_DGRAM;
+ cfg_ipproto = IPPROTO_UDP;
+ break;
+ case 'v':
+ cfg_delay_snd = strtoul(optarg, NULL, 10);
+ break;
+ case 'V':
+ cfg_delay_ack = strtoul(optarg, NULL, 10);
+ break;
+ case 'x':
+ cfg_show_payload = true;
+ break;
+ case 'h':
+ default:
+ usage(argv[0]);
+ }
+ }
+
+ if (!cfg_payload_len)
+ error(1, 0, "payload may not be nonzero");
+ if (cfg_proto != SOCK_STREAM && cfg_payload_len > 1472)
+ error(1, 0, "udp packet might exceed expected MTU");
+ if (!do_ipv4 && !do_ipv6)
+ error(1, 0, "pass -4 or -6, not both");
+ if (proto_count > 1)
+ error(1, 0, "pass -P, -r, -R or -u, not multiple");
+ if (cfg_do_pktinfo && cfg_use_pf_packet)
+ error(1, 0, "cannot ask for pktinfo over pf_packet");
+ if (cfg_busy_poll && cfg_use_epoll)
+ error(1, 0, "pass epoll or busy_poll, not both");
+
+ if (optind != argc - 1)
+ error(1, 0, "missing required hostname argument");
+}
+
+static void resolve_hostname(const char *hostname)
+{
+ struct addrinfo hints = { .ai_family = do_ipv4 ? AF_INET : AF_INET6 };
+ struct addrinfo *addrs, *cur;
+ int have_ipv4 = 0, have_ipv6 = 0;
+
+retry:
+ if (getaddrinfo(hostname, NULL, &hints, &addrs))
+ error(1, errno, "getaddrinfo");
+
+ cur = addrs;
+ while (cur && !have_ipv4 && !have_ipv6) {
+ if (!have_ipv4 && cur->ai_family == AF_INET) {
+ memcpy(&daddr, cur->ai_addr, sizeof(daddr));
+ daddr.sin_port = htons(dest_port);
+ have_ipv4 = 1;
+ }
+ else if (!have_ipv6 && cur->ai_family == AF_INET6) {
+ memcpy(&daddr6, cur->ai_addr, sizeof(daddr6));
+ daddr6.sin6_port = htons(dest_port);
+ have_ipv6 = 1;
+ }
+ cur = cur->ai_next;
+ }
+ if (addrs)
+ freeaddrinfo(addrs);
+
+ if (do_ipv6 && hints.ai_family != AF_INET6) {
+ hints.ai_family = AF_INET6;
+ goto retry;
+ }
+
+ do_ipv4 &= have_ipv4;
+ do_ipv6 &= have_ipv6;
+}
+
+static void do_listen(int family, void *addr, int alen)
+{
+ int fd, type;
+
+ type = cfg_proto == SOCK_RAW ? SOCK_DGRAM : cfg_proto;
+
+ fd = socket(family, type, 0);
+ if (fd == -1)
+ error(1, errno, "socket rx");
+
+ if (bind(fd, addr, alen))
+ error(1, errno, "bind rx");
+
+ if (type == SOCK_STREAM && listen(fd, 10))
+ error(1, errno, "listen rx");
+
+ /* leave fd open, will be closed on process exit.
+ * this enables connect() to succeed and avoids icmp replies
+ */
+}
+
+static void do_main(int family)
+{
+ fprintf(stderr, "family: %s %s\n",
+ family == PF_INET ? "INET" : "INET6",
+ cfg_use_pf_packet ? "(PF_PACKET)" : "");
+
+ fprintf(stderr, "test SND\n");
+ do_test(family, SOF_TIMESTAMPING_TX_SOFTWARE);
+
+ fprintf(stderr, "test ENQ\n");
+ do_test(family, SOF_TIMESTAMPING_TX_SCHED);
+
+ fprintf(stderr, "test ENQ + SND\n");
+ do_test(family, SOF_TIMESTAMPING_TX_SCHED |
+ SOF_TIMESTAMPING_TX_SOFTWARE);
+
+ if (cfg_proto == SOCK_STREAM) {
+ fprintf(stderr, "\ntest ACK\n");
+ do_test(family, SOF_TIMESTAMPING_TX_ACK);
+
+ fprintf(stderr, "\ntest SND + ACK\n");
+ do_test(family, SOF_TIMESTAMPING_TX_SOFTWARE |
+ SOF_TIMESTAMPING_TX_ACK);
+
+ fprintf(stderr, "\ntest ENQ + SND + ACK\n");
+ do_test(family, SOF_TIMESTAMPING_TX_SCHED |
+ SOF_TIMESTAMPING_TX_SOFTWARE |
+ SOF_TIMESTAMPING_TX_ACK);
+ }
+}
+
+const char *sock_names[] = { NULL, "TCP", "UDP", "RAW" };
+
+int main(int argc, char **argv)
+{
+ if (argc == 1)
+ usage(argv[0]);
+
+ parse_opt(argc, argv);
+ resolve_hostname(argv[argc - 1]);
+
+ fprintf(stderr, "protocol: %s\n", sock_names[cfg_proto]);
+ fprintf(stderr, "payload: %u\n", cfg_payload_len);
+ fprintf(stderr, "server port: %u\n", dest_port);
+ fprintf(stderr, "\n");
+
+ if (do_ipv4) {
+ if (cfg_do_listen)
+ do_listen(PF_INET, &daddr, sizeof(daddr));
+ do_main(PF_INET);
+ }
+
+ if (do_ipv6) {
+ if (cfg_do_listen)
+ do_listen(PF_INET6, &daddr6, sizeof(daddr6));
+ do_main(PF_INET6);
+ }
+
+ return test_failed;
+}
diff --git a/tools/testing/selftests/net/txtimestamp.sh b/tools/testing/selftests/net/txtimestamp.sh
new file mode 100755
index 0000000..3163776
--- /dev/null
+++ b/tools/testing/selftests/net/txtimestamp.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Send packets with transmit timestamps over loopback with netem
+# Verify that timestamps correspond to netem delay
+
+set -e
+
+setup() {
+ # set 1ms delay on lo egress
+ tc qdisc add dev lo root netem delay 1ms
+
+ # set 2ms delay on ifb0 egress
+ modprobe ifb
+ ip link add ifb_netem0 type ifb
+ ip link set dev ifb_netem0 up
+ tc qdisc add dev ifb_netem0 root netem delay 2ms
+
+ # redirect lo ingress through ifb0 egress
+ tc qdisc add dev lo handle ffff: ingress
+ tc filter add dev lo parent ffff: \
+ u32 match mark 0 0xffff \
+ action mirred egress redirect dev ifb_netem0
+}
+
+run_test_v4v6() {
+ # SND will be delayed 1000us
+ # ACK will be delayed 6000us: 1 + 2 ms round-trip
+ local -r args="$@ -v 1000 -V 6000"
+
+ ./txtimestamp ${args} -4 -L 127.0.0.1
+ ./txtimestamp ${args} -6 -L ::1
+}
+
+run_test_tcpudpraw() {
+ local -r args=$@
+
+ run_test_v4v6 ${args} # tcp
+ run_test_v4v6 ${args} -u # udp
+ run_test_v4v6 ${args} -r # raw
+ run_test_v4v6 ${args} -R # raw (IPPROTO_RAW)
+ run_test_v4v6 ${args} -P # pf_packet
+}
+
+run_test_all() {
+ setup
+ run_test_tcpudpraw # setsockopt
+ run_test_tcpudpraw -C # cmsg
+ run_test_tcpudpraw -n # timestamp w/o data
+ echo "OK. All tests passed"
+}
+
+run_test_one() {
+ setup
+ ./txtimestamp $@
+}
+
+usage() {
+ echo "Usage: $0 [ -r | --run ] <txtimestamp args> | [ -h | --help ]"
+ echo " (no args) Run all tests"
+ echo " -r|--run Run an individual test with arguments"
+ echo " -h|--help Help"
+}
+
+main() {
+ if [[ $# -eq 0 ]]; then
+ run_test_all
+ else
+ if [[ "$1" = "-r" || "$1" == "--run" ]]; then
+ shift
+ run_test_one $@
+ else
+ usage
+ fi
+ fi
+}
+
+if [[ -z "$(ip netns identify)" ]]; then
+ ./in_netns.sh $0 $@
+else
+ main $@
+fi
diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c
index c66da6f..7badaf2 100644
--- a/tools/testing/selftests/net/udpgso.c
+++ b/tools/testing/selftests/net/udpgso.c
@@ -156,13 +156,13 @@
},
{
/* send max number of min sized segments */
- .tlen = UDP_MAX_SEGMENTS - CONST_HDRLEN_V4,
+ .tlen = UDP_MAX_SEGMENTS,
.gso_len = 1,
- .r_num_mss = UDP_MAX_SEGMENTS - CONST_HDRLEN_V4,
+ .r_num_mss = UDP_MAX_SEGMENTS,
},
{
/* send max number + 1 of min sized segments: fail */
- .tlen = UDP_MAX_SEGMENTS - CONST_HDRLEN_V4 + 1,
+ .tlen = UDP_MAX_SEGMENTS + 1,
.gso_len = 1,
.tfail = true,
},
@@ -259,13 +259,13 @@
},
{
/* send max number of min sized segments */
- .tlen = UDP_MAX_SEGMENTS - CONST_HDRLEN_V6,
+ .tlen = UDP_MAX_SEGMENTS,
.gso_len = 1,
- .r_num_mss = UDP_MAX_SEGMENTS - CONST_HDRLEN_V6,
+ .r_num_mss = UDP_MAX_SEGMENTS,
},
{
/* send max number + 1 of min sized segments: fail */
- .tlen = UDP_MAX_SEGMENTS - CONST_HDRLEN_V6 + 1,
+ .tlen = UDP_MAX_SEGMENTS + 1,
.gso_len = 1,
.tfail = true,
},
diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c
index 76a2405..6a19342 100644
--- a/tools/testing/selftests/net/udpgso_bench_rx.c
+++ b/tools/testing/selftests/net/udpgso_bench_rx.c
@@ -293,19 +293,17 @@
static void parse_opts(int argc, char **argv)
{
+ const char *bind_addr = NULL;
int c;
- /* bind to any by default */
- setup_sockaddr(PF_INET6, "::", &cfg_bind_addr);
while ((c = getopt(argc, argv, "4b:C:Gl:n:p:rR:S:tv")) != -1) {
switch (c) {
case '4':
cfg_family = PF_INET;
cfg_alen = sizeof(struct sockaddr_in);
- setup_sockaddr(PF_INET, "0.0.0.0", &cfg_bind_addr);
break;
case 'b':
- setup_sockaddr(cfg_family, optarg, &cfg_bind_addr);
+ bind_addr = optarg;
break;
case 'C':
cfg_connect_timeout_ms = strtoul(optarg, NULL, 0);
@@ -341,6 +339,11 @@
}
}
+ if (!bind_addr)
+ bind_addr = cfg_family == PF_INET6 ? "::" : "0.0.0.0";
+
+ setup_sockaddr(cfg_family, bind_addr, &cfg_bind_addr);
+
if (optind != argc)
usage(argv[0]);
diff --git a/tools/testing/selftests/net/udpgso_bench_tx.c b/tools/testing/selftests/net/udpgso_bench_tx.c
index 17512a4..f1fdaa2 100644
--- a/tools/testing/selftests/net/udpgso_bench_tx.c
+++ b/tools/testing/selftests/net/udpgso_bench_tx.c
@@ -419,6 +419,7 @@
static void parse_opts(int argc, char **argv)
{
+ const char *bind_addr = NULL;
int max_len, hdrlen;
int c;
@@ -446,7 +447,7 @@
cfg_cpu = strtol(optarg, NULL, 0);
break;
case 'D':
- setup_sockaddr(cfg_family, optarg, &cfg_dst_addr);
+ bind_addr = optarg;
break;
case 'l':
cfg_runtime_ms = strtoul(optarg, NULL, 10) * 1000;
@@ -492,6 +493,11 @@
}
}
+ if (!bind_addr)
+ bind_addr = cfg_family == PF_INET6 ? "::" : "0.0.0.0";
+
+ setup_sockaddr(cfg_family, bind_addr, &cfg_dst_addr);
+
if (optind != argc)
usage(argv[0]);
diff --git a/tools/testing/selftests/net/vrf-xfrm-tests.sh b/tools/testing/selftests/net/vrf-xfrm-tests.sh
new file mode 100755
index 0000000..184da81
--- /dev/null
+++ b/tools/testing/selftests/net/vrf-xfrm-tests.sh
@@ -0,0 +1,436 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Various combinations of VRF with xfrms and qdisc.
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+PAUSE_ON_FAIL=no
+VERBOSE=0
+ret=0
+
+HOST1_4=192.168.1.1
+HOST2_4=192.168.1.2
+HOST1_6=2001:db8:1::1
+HOST2_6=2001:db8:1::2
+
+XFRM1_4=10.0.1.1
+XFRM2_4=10.0.1.2
+XFRM1_6=fc00:1000::1
+XFRM2_6=fc00:1000::2
+IF_ID=123
+
+VRF=red
+TABLE=300
+
+AUTH_1=0xd94fcfea65fddf21dc6e0d24a0253508
+AUTH_2=0xdc6e0d24a0253508d94fcfea65fddf21
+ENC_1=0xfc46c20f8048be9725930ff3fb07ac2a91f0347dffeacf62
+ENC_2=0x3fb07ac2a91f0347dffeacf62fc46c20f8048be9725930ff
+SPI_1=0x02122b77
+SPI_2=0x2b770212
+
+which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping)
+
+################################################################################
+#
+log_test()
+{
+ local rc=$1
+ local expected=$2
+ local msg="$3"
+
+ if [ ${rc} -eq ${expected} ]; then
+ printf "TEST: %-60s [ OK ]\n" "${msg}"
+ nsuccess=$((nsuccess+1))
+ else
+ ret=1
+ nfail=$((nfail+1))
+ printf "TEST: %-60s [FAIL]\n" "${msg}"
+ if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+ echo
+ echo "hit enter to continue, 'q' to quit"
+ read a
+ [ "$a" = "q" ] && exit 1
+ fi
+ fi
+}
+
+run_cmd_host1()
+{
+ local cmd="$*"
+ local out
+ local rc
+
+ if [ "$VERBOSE" = "1" ]; then
+ printf " COMMAND: $cmd\n"
+ fi
+
+ out=$(eval ip netns exec host1 $cmd 2>&1)
+ rc=$?
+ if [ "$VERBOSE" = "1" ]; then
+ if [ -n "$out" ]; then
+ echo
+ echo " $out"
+ fi
+ echo
+ fi
+
+ return $rc
+}
+
+################################################################################
+# create namespaces for hosts and sws
+
+create_vrf()
+{
+ local ns=$1
+ local vrf=$2
+ local table=$3
+
+ if [ -n "${ns}" ]; then
+ ns="-netns ${ns}"
+ fi
+
+ ip ${ns} link add ${vrf} type vrf table ${table}
+ ip ${ns} link set ${vrf} up
+ ip ${ns} route add vrf ${vrf} unreachable default metric 8192
+ ip ${ns} -6 route add vrf ${vrf} unreachable default metric 8192
+
+ ip ${ns} addr add 127.0.0.1/8 dev ${vrf}
+ ip ${ns} -6 addr add ::1 dev ${vrf} nodad
+
+ ip ${ns} ru del pref 0
+ ip ${ns} ru add pref 32765 from all lookup local
+ ip ${ns} -6 ru del pref 0
+ ip ${ns} -6 ru add pref 32765 from all lookup local
+}
+
+create_ns()
+{
+ local ns=$1
+ local addr=$2
+ local addr6=$3
+
+ [ -z "${addr}" ] && addr="-"
+ [ -z "${addr6}" ] && addr6="-"
+
+ ip netns add ${ns}
+
+ ip -netns ${ns} link set lo up
+ if [ "${addr}" != "-" ]; then
+ ip -netns ${ns} addr add dev lo ${addr}
+ fi
+ if [ "${addr6}" != "-" ]; then
+ ip -netns ${ns} -6 addr add dev lo ${addr6}
+ fi
+
+ ip -netns ${ns} ro add unreachable default metric 8192
+ ip -netns ${ns} -6 ro add unreachable default metric 8192
+
+ ip netns exec ${ns} sysctl -qw net.ipv4.ip_forward=1
+ ip netns exec ${ns} sysctl -qw net.ipv6.conf.all.keep_addr_on_down=1
+ ip netns exec ${ns} sysctl -qw net.ipv6.conf.all.forwarding=1
+ ip netns exec ${ns} sysctl -qw net.ipv6.conf.default.forwarding=1
+ ip netns exec ${ns} sysctl -qw net.ipv6.conf.default.accept_dad=0
+}
+
+# create veth pair to connect namespaces and apply addresses.
+connect_ns()
+{
+ local ns1=$1
+ local ns1_dev=$2
+ local ns1_addr=$3
+ local ns1_addr6=$4
+ local ns2=$5
+ local ns2_dev=$6
+ local ns2_addr=$7
+ local ns2_addr6=$8
+ local ns1arg
+ local ns2arg
+
+ if [ -n "${ns1}" ]; then
+ ns1arg="-netns ${ns1}"
+ fi
+ if [ -n "${ns2}" ]; then
+ ns2arg="-netns ${ns2}"
+ fi
+
+ ip ${ns1arg} li add ${ns1_dev} type veth peer name tmp
+ ip ${ns1arg} li set ${ns1_dev} up
+ ip ${ns1arg} li set tmp netns ${ns2} name ${ns2_dev}
+ ip ${ns2arg} li set ${ns2_dev} up
+
+ if [ "${ns1_addr}" != "-" ]; then
+ ip ${ns1arg} addr add dev ${ns1_dev} ${ns1_addr}
+ ip ${ns2arg} addr add dev ${ns2_dev} ${ns2_addr}
+ fi
+
+ if [ "${ns1_addr6}" != "-" ]; then
+ ip ${ns1arg} addr add dev ${ns1_dev} ${ns1_addr6} nodad
+ ip ${ns2arg} addr add dev ${ns2_dev} ${ns2_addr6} nodad
+ fi
+}
+
+################################################################################
+
+cleanup()
+{
+ ip netns del host1
+ ip netns del host2
+}
+
+setup()
+{
+ create_ns "host1"
+ create_ns "host2"
+
+ connect_ns "host1" eth0 ${HOST1_4}/24 ${HOST1_6}/64 \
+ "host2" eth0 ${HOST2_4}/24 ${HOST2_6}/64
+
+ create_vrf "host1" ${VRF} ${TABLE}
+ ip -netns host1 link set dev eth0 master ${VRF}
+}
+
+cleanup_xfrm()
+{
+ for ns in host1 host2
+ do
+ for x in state policy
+ do
+ ip -netns ${ns} xfrm ${x} flush
+ ip -6 -netns ${ns} xfrm ${x} flush
+ done
+ done
+}
+
+setup_xfrm()
+{
+ local h1_4=$1
+ local h2_4=$2
+ local h1_6=$3
+ local h2_6=$4
+ local devarg="$5"
+
+ #
+ # policy
+ #
+
+ # host1 - IPv4 out
+ ip -netns host1 xfrm policy add \
+ src ${h1_4} dst ${h2_4} ${devarg} dir out \
+ tmpl src ${HOST1_4} dst ${HOST2_4} proto esp mode tunnel
+
+ # host2 - IPv4 in
+ ip -netns host2 xfrm policy add \
+ src ${h1_4} dst ${h2_4} dir in \
+ tmpl src ${HOST1_4} dst ${HOST2_4} proto esp mode tunnel
+
+ # host1 - IPv4 in
+ ip -netns host1 xfrm policy add \
+ src ${h2_4} dst ${h1_4} ${devarg} dir in \
+ tmpl src ${HOST2_4} dst ${HOST1_4} proto esp mode tunnel
+
+ # host2 - IPv4 out
+ ip -netns host2 xfrm policy add \
+ src ${h2_4} dst ${h1_4} dir out \
+ tmpl src ${HOST2_4} dst ${HOST1_4} proto esp mode tunnel
+
+
+ # host1 - IPv6 out
+ ip -6 -netns host1 xfrm policy add \
+ src ${h1_6} dst ${h2_6} ${devarg} dir out \
+ tmpl src ${HOST1_6} dst ${HOST2_6} proto esp mode tunnel
+
+ # host2 - IPv6 in
+ ip -6 -netns host2 xfrm policy add \
+ src ${h1_6} dst ${h2_6} dir in \
+ tmpl src ${HOST1_6} dst ${HOST2_6} proto esp mode tunnel
+
+ # host1 - IPv6 in
+ ip -6 -netns host1 xfrm policy add \
+ src ${h2_6} dst ${h1_6} ${devarg} dir in \
+ tmpl src ${HOST2_6} dst ${HOST1_6} proto esp mode tunnel
+
+ # host2 - IPv6 out
+ ip -6 -netns host2 xfrm policy add \
+ src ${h2_6} dst ${h1_6} dir out \
+ tmpl src ${HOST2_6} dst ${HOST1_6} proto esp mode tunnel
+
+ #
+ # state
+ #
+ ip -netns host1 xfrm state add src ${HOST1_4} dst ${HOST2_4} \
+ proto esp spi ${SPI_1} reqid 0 mode tunnel \
+ replay-window 4 replay-oseq 0x4 \
+ auth-trunc 'hmac(md5)' ${AUTH_1} 96 \
+ enc 'cbc(des3_ede)' ${ENC_1} \
+ sel src ${h1_4} dst ${h2_4} ${devarg}
+
+ ip -netns host2 xfrm state add src ${HOST1_4} dst ${HOST2_4} \
+ proto esp spi ${SPI_1} reqid 0 mode tunnel \
+ replay-window 4 replay-oseq 0x4 \
+ auth-trunc 'hmac(md5)' ${AUTH_1} 96 \
+ enc 'cbc(des3_ede)' ${ENC_1} \
+ sel src ${h1_4} dst ${h2_4}
+
+
+ ip -netns host1 xfrm state add src ${HOST2_4} dst ${HOST1_4} \
+ proto esp spi ${SPI_2} reqid 0 mode tunnel \
+ replay-window 4 replay-oseq 0x4 \
+ auth-trunc 'hmac(md5)' ${AUTH_2} 96 \
+ enc 'cbc(des3_ede)' ${ENC_2} \
+ sel src ${h2_4} dst ${h1_4} ${devarg}
+
+ ip -netns host2 xfrm state add src ${HOST2_4} dst ${HOST1_4} \
+ proto esp spi ${SPI_2} reqid 0 mode tunnel \
+ replay-window 4 replay-oseq 0x4 \
+ auth-trunc 'hmac(md5)' ${AUTH_2} 96 \
+ enc 'cbc(des3_ede)' ${ENC_2} \
+ sel src ${h2_4} dst ${h1_4}
+
+
+ ip -6 -netns host1 xfrm state add src ${HOST1_6} dst ${HOST2_6} \
+ proto esp spi ${SPI_1} reqid 0 mode tunnel \
+ replay-window 4 replay-oseq 0x4 \
+ auth-trunc 'hmac(md5)' ${AUTH_1} 96 \
+ enc 'cbc(des3_ede)' ${ENC_1} \
+ sel src ${h1_6} dst ${h2_6} ${devarg}
+
+ ip -6 -netns host2 xfrm state add src ${HOST1_6} dst ${HOST2_6} \
+ proto esp spi ${SPI_1} reqid 0 mode tunnel \
+ replay-window 4 replay-oseq 0x4 \
+ auth-trunc 'hmac(md5)' ${AUTH_1} 96 \
+ enc 'cbc(des3_ede)' ${ENC_1} \
+ sel src ${h1_6} dst ${h2_6}
+
+
+ ip -6 -netns host1 xfrm state add src ${HOST2_6} dst ${HOST1_6} \
+ proto esp spi ${SPI_2} reqid 0 mode tunnel \
+ replay-window 4 replay-oseq 0x4 \
+ auth-trunc 'hmac(md5)' ${AUTH_2} 96 \
+ enc 'cbc(des3_ede)' ${ENC_2} \
+ sel src ${h2_6} dst ${h1_6} ${devarg}
+
+ ip -6 -netns host2 xfrm state add src ${HOST2_6} dst ${HOST1_6} \
+ proto esp spi ${SPI_2} reqid 0 mode tunnel \
+ replay-window 4 replay-oseq 0x4 \
+ auth-trunc 'hmac(md5)' ${AUTH_2} 96 \
+ enc 'cbc(des3_ede)' ${ENC_2} \
+ sel src ${h2_6} dst ${h1_6}
+}
+
+cleanup_xfrm_dev()
+{
+ ip -netns host1 li del xfrm0
+ ip -netns host2 addr del ${XFRM2_4}/24 dev eth0
+ ip -netns host2 addr del ${XFRM2_6}/64 dev eth0
+}
+
+setup_xfrm_dev()
+{
+ local vrfarg="vrf ${VRF}"
+
+ ip -netns host1 li add type xfrm dev eth0 if_id ${IF_ID}
+ ip -netns host1 li set xfrm0 ${vrfarg} up
+ ip -netns host1 addr add ${XFRM1_4}/24 dev xfrm0
+ ip -netns host1 addr add ${XFRM1_6}/64 dev xfrm0
+
+ ip -netns host2 addr add ${XFRM2_4}/24 dev eth0
+ ip -netns host2 addr add ${XFRM2_6}/64 dev eth0
+
+ setup_xfrm ${XFRM1_4} ${XFRM2_4} ${XFRM1_6} ${XFRM2_6} "if_id ${IF_ID}"
+}
+
+run_tests()
+{
+ cleanup_xfrm
+
+ # no IPsec
+ run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4}
+ log_test $? 0 "IPv4 no xfrm policy"
+ run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6}
+ log_test $? 0 "IPv6 no xfrm policy"
+
+ # xfrm without VRF in sel
+ setup_xfrm ${HOST1_4} ${HOST2_4} ${HOST1_6} ${HOST2_6}
+ run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4}
+ log_test $? 0 "IPv4 xfrm policy based on address"
+ run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6}
+ log_test $? 0 "IPv6 xfrm policy based on address"
+ cleanup_xfrm
+
+ # xfrm with VRF in sel
+ # Known failure: ipv4 resets the flow oif after the lookup. Fix is
+ # not straightforward.
+ # setup_xfrm ${HOST1_4} ${HOST2_4} ${HOST1_6} ${HOST2_6} "dev ${VRF}"
+ # run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4}
+ # log_test $? 0 "IPv4 xfrm policy with VRF in selector"
+ run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6}
+ log_test $? 0 "IPv6 xfrm policy with VRF in selector"
+ cleanup_xfrm
+
+ # xfrm with enslaved device in sel
+ # Known failures: combined with the above, __xfrm{4,6}_selector_match
+ # needs to consider both l3mdev and enslaved device index.
+ # setup_xfrm ${HOST1_4} ${HOST2_4} ${HOST1_6} ${HOST2_6} "dev eth0"
+ # run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4}
+ # log_test $? 0 "IPv4 xfrm policy with enslaved device in selector"
+ # run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6}
+ # log_test $? 0 "IPv6 xfrm policy with enslaved device in selector"
+ # cleanup_xfrm
+
+ # xfrm device
+ setup_xfrm_dev
+ run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${XFRM2_4}
+ log_test $? 0 "IPv4 xfrm policy with xfrm device"
+ run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${XFRM2_6}
+ log_test $? 0 "IPv6 xfrm policy with xfrm device"
+ cleanup_xfrm_dev
+}
+
+################################################################################
+# usage
+
+usage()
+{
+ cat <<EOF
+usage: ${0##*/} OPTS
+
+ -p Pause on fail
+ -v verbose mode (show commands and output)
+
+done
+EOF
+}
+
+################################################################################
+# main
+
+while getopts :pv o
+do
+ case $o in
+ p) PAUSE_ON_FAIL=yes;;
+ v) VERBOSE=$(($VERBOSE + 1));;
+ h) usage; exit 0;;
+ *) usage; exit 1;;
+ esac
+done
+
+cleanup 2>/dev/null
+setup
+
+echo
+echo "No qdisc on VRF device"
+run_tests
+
+run_cmd_host1 tc qdisc add dev ${VRF} root netem delay 100ms
+echo
+echo "netem qdisc on VRF device"
+run_tests
+
+printf "\nTests passed: %3d\n" ${nsuccess}
+printf "Tests failed: %3d\n" ${nfail}
+
+exit $ret
diff --git a/tools/testing/selftests/net/vrf_route_leaking.sh b/tools/testing/selftests/net/vrf_route_leaking.sh
new file mode 100755
index 0000000..23cf924
--- /dev/null
+++ b/tools/testing/selftests/net/vrf_route_leaking.sh
@@ -0,0 +1,626 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (c) 2019 David Ahern <dsahern@gmail.com>. All rights reserved.
+# Copyright (c) 2020 Michael Jeanson <mjeanson@efficios.com>. All rights reserved.
+#
+# Requires CONFIG_NET_VRF, CONFIG_VETH, CONFIG_BRIDGE and CONFIG_NET_NS.
+#
+#
+# Symmetric routing topology
+#
+# blue red
+# +----+ .253 +----+ .253 +----+
+# | h1 |-------------------| r1 |-------------------| h2 |
+# +----+ .1 +----+ .2 +----+
+# 172.16.1/24 172.16.2/24
+# 2001:db8:16:1/64 2001:db8:16:2/64
+#
+#
+# Route from h1 to h2 and back goes through r1, incoming vrf blue has a route
+# to the outgoing vrf red for the n2 network and red has a route back to n1.
+# The red VRF interface has a MTU of 1400.
+#
+# The first test sends a ping with a ttl of 1 from h1 to h2 and parses the
+# output of the command to check that a ttl expired error is received.
+#
+# The second test runs traceroute from h1 to h2 and parses the output to check
+# for a hop on r1.
+#
+# The third test sends a ping with a packet size of 1450 from h1 to h2 and
+# parses the output of the command to check that a fragmentation error is
+# received.
+#
+#
+# Asymmetric routing topology
+#
+# This topology represents a customer setup where the issue with icmp errors
+# and VRF route leaking was initialy reported. The MTU test isn't done here
+# because of the lack of a return route in the red VRF.
+#
+# blue red
+# .253 +----+ .253
+# +----| r1 |----+
+# | +----+ |
+# +----+ | | +----+
+# | h1 |--------------+ +--------------| h2 |
+# +----+ .1 | | .2 +----+
+# 172.16.1/24 | +----+ | 172.16.2/24
+# 2001:db8:16:1/64 +----| r2 |----+ 2001:db8:16:2/64
+# .254 +----+ .254
+#
+#
+# Route from h1 to h2 goes through r1, incoming vrf blue has a route to the
+# outgoing vrf red for the n2 network but red doesn't have a route back to n1.
+# Route from h2 to h1 goes through r2.
+#
+# The objective is to check that the incoming vrf routing table is selected
+# to send an ICMP error back to the source when the ttl of a packet reaches 1
+# while it is forwarded between different vrfs.
+
+VERBOSE=0
+PAUSE_ON_FAIL=no
+DEFAULT_TTYPE=sym
+
+H1_N1=172.16.1.0/24
+H1_N1_6=2001:db8:16:1::/64
+
+H1_N1_IP=172.16.1.1
+R1_N1_IP=172.16.1.253
+R2_N1_IP=172.16.1.254
+
+H1_N1_IP6=2001:db8:16:1::1
+R1_N1_IP6=2001:db8:16:1::253
+R2_N1_IP6=2001:db8:16:1::254
+
+H2_N2=172.16.2.0/24
+H2_N2_6=2001:db8:16:2::/64
+
+H2_N2_IP=172.16.2.2
+R1_N2_IP=172.16.2.253
+R2_N2_IP=172.16.2.254
+
+H2_N2_IP6=2001:db8:16:2::2
+R1_N2_IP6=2001:db8:16:2::253
+R2_N2_IP6=2001:db8:16:2::254
+
+################################################################################
+# helpers
+
+log_section()
+{
+ echo
+ echo "###########################################################################"
+ echo "$*"
+ echo "###########################################################################"
+ echo
+}
+
+log_test()
+{
+ local rc=$1
+ local expected=$2
+ local msg="$3"
+
+ if [ "${rc}" -eq "${expected}" ]; then
+ printf "TEST: %-60s [ OK ]\n" "${msg}"
+ nsuccess=$((nsuccess+1))
+ else
+ ret=1
+ nfail=$((nfail+1))
+ printf "TEST: %-60s [FAIL]\n" "${msg}"
+ if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+ echo
+ echo "hit enter to continue, 'q' to quit"
+ read -r a
+ [ "$a" = "q" ] && exit 1
+ fi
+ fi
+}
+
+run_cmd()
+{
+ local cmd="$*"
+ local out
+ local rc
+
+ if [ "$VERBOSE" = "1" ]; then
+ echo "COMMAND: $cmd"
+ fi
+
+ # shellcheck disable=SC2086
+ out=$(eval $cmd 2>&1)
+ rc=$?
+ if [ "$VERBOSE" = "1" ] && [ -n "$out" ]; then
+ echo "$out"
+ fi
+
+ [ "$VERBOSE" = "1" ] && echo
+
+ return $rc
+}
+
+run_cmd_grep()
+{
+ local grep_pattern="$1"
+ shift
+ local cmd="$*"
+ local out
+ local rc
+
+ if [ "$VERBOSE" = "1" ]; then
+ echo "COMMAND: $cmd"
+ fi
+
+ # shellcheck disable=SC2086
+ out=$(eval $cmd 2>&1)
+ if [ "$VERBOSE" = "1" ] && [ -n "$out" ]; then
+ echo "$out"
+ fi
+
+ echo "$out" | grep -q "$grep_pattern"
+ rc=$?
+
+ [ "$VERBOSE" = "1" ] && echo
+
+ return $rc
+}
+
+################################################################################
+# setup and teardown
+
+cleanup()
+{
+ local ns
+
+ for ns in h1 h2 r1 r2; do
+ ip netns del $ns 2>/dev/null
+ done
+}
+
+setup_vrf()
+{
+ local ns=$1
+
+ ip -netns "${ns}" rule del pref 0
+ ip -netns "${ns}" rule add pref 32765 from all lookup local
+ ip -netns "${ns}" -6 rule del pref 0
+ ip -netns "${ns}" -6 rule add pref 32765 from all lookup local
+}
+
+create_vrf()
+{
+ local ns=$1
+ local vrf=$2
+ local table=$3
+
+ ip -netns "${ns}" link add "${vrf}" type vrf table "${table}"
+ ip -netns "${ns}" link set "${vrf}" up
+ ip -netns "${ns}" route add vrf "${vrf}" unreachable default metric 8192
+ ip -netns "${ns}" -6 route add vrf "${vrf}" unreachable default metric 8192
+
+ ip -netns "${ns}" addr add 127.0.0.1/8 dev "${vrf}"
+ ip -netns "${ns}" -6 addr add ::1 dev "${vrf}" nodad
+}
+
+setup_sym()
+{
+ local ns
+
+ # make sure we are starting with a clean slate
+ cleanup
+
+ #
+ # create nodes as namespaces
+ #
+ for ns in h1 h2 r1; do
+ ip netns add $ns
+ ip -netns $ns link set lo up
+
+ case "${ns}" in
+ h[12]) ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=0
+ ip netns exec $ns sysctl -q -w net.ipv6.conf.all.keep_addr_on_down=1
+ ;;
+ r1) ip netns exec $ns sysctl -q -w net.ipv4.ip_forward=1
+ ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=1
+ esac
+ done
+
+ #
+ # create interconnects
+ #
+ ip -netns h1 link add eth0 type veth peer name r1h1
+ ip -netns h1 link set r1h1 netns r1 name eth0 up
+
+ ip -netns h2 link add eth0 type veth peer name r1h2
+ ip -netns h2 link set r1h2 netns r1 name eth1 up
+
+ #
+ # h1
+ #
+ ip -netns h1 addr add dev eth0 ${H1_N1_IP}/24
+ ip -netns h1 -6 addr add dev eth0 ${H1_N1_IP6}/64 nodad
+ ip -netns h1 link set eth0 up
+
+ # h1 to h2 via r1
+ ip -netns h1 route add ${H2_N2} via ${R1_N1_IP} dev eth0
+ ip -netns h1 -6 route add ${H2_N2_6} via "${R1_N1_IP6}" dev eth0
+
+ #
+ # h2
+ #
+ ip -netns h2 addr add dev eth0 ${H2_N2_IP}/24
+ ip -netns h2 -6 addr add dev eth0 ${H2_N2_IP6}/64 nodad
+ ip -netns h2 link set eth0 up
+
+ # h2 to h1 via r1
+ ip -netns h2 route add default via ${R1_N2_IP} dev eth0
+ ip -netns h2 -6 route add default via ${R1_N2_IP6} dev eth0
+
+ #
+ # r1
+ #
+ setup_vrf r1
+ create_vrf r1 blue 1101
+ create_vrf r1 red 1102
+ ip -netns r1 link set mtu 1400 dev eth1
+ ip -netns r1 link set eth0 vrf blue up
+ ip -netns r1 link set eth1 vrf red up
+ ip -netns r1 addr add dev eth0 ${R1_N1_IP}/24
+ ip -netns r1 -6 addr add dev eth0 ${R1_N1_IP6}/64 nodad
+ ip -netns r1 addr add dev eth1 ${R1_N2_IP}/24
+ ip -netns r1 -6 addr add dev eth1 ${R1_N2_IP6}/64 nodad
+
+ # Route leak from blue to red
+ ip -netns r1 route add vrf blue ${H2_N2} dev red
+ ip -netns r1 -6 route add vrf blue ${H2_N2_6} dev red
+
+ # Route leak from red to blue
+ ip -netns r1 route add vrf red ${H1_N1} dev blue
+ ip -netns r1 -6 route add vrf red ${H1_N1_6} dev blue
+
+
+ # Wait for ip config to settle
+ sleep 2
+}
+
+setup_asym()
+{
+ local ns
+
+ # make sure we are starting with a clean slate
+ cleanup
+
+ #
+ # create nodes as namespaces
+ #
+ for ns in h1 h2 r1 r2; do
+ ip netns add $ns
+ ip -netns $ns link set lo up
+
+ case "${ns}" in
+ h[12]) ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=0
+ ip netns exec $ns sysctl -q -w net.ipv6.conf.all.keep_addr_on_down=1
+ ;;
+ r[12]) ip netns exec $ns sysctl -q -w net.ipv4.ip_forward=1
+ ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=1
+ esac
+ done
+
+ #
+ # create interconnects
+ #
+ ip -netns h1 link add eth0 type veth peer name r1h1
+ ip -netns h1 link set r1h1 netns r1 name eth0 up
+
+ ip -netns h1 link add eth1 type veth peer name r2h1
+ ip -netns h1 link set r2h1 netns r2 name eth0 up
+
+ ip -netns h2 link add eth0 type veth peer name r1h2
+ ip -netns h2 link set r1h2 netns r1 name eth1 up
+
+ ip -netns h2 link add eth1 type veth peer name r2h2
+ ip -netns h2 link set r2h2 netns r2 name eth1 up
+
+ #
+ # h1
+ #
+ ip -netns h1 link add br0 type bridge
+ ip -netns h1 link set br0 up
+ ip -netns h1 addr add dev br0 ${H1_N1_IP}/24
+ ip -netns h1 -6 addr add dev br0 ${H1_N1_IP6}/64 nodad
+ ip -netns h1 link set eth0 master br0 up
+ ip -netns h1 link set eth1 master br0 up
+
+ # h1 to h2 via r1
+ ip -netns h1 route add ${H2_N2} via ${R1_N1_IP} dev br0
+ ip -netns h1 -6 route add ${H2_N2_6} via "${R1_N1_IP6}" dev br0
+
+ #
+ # h2
+ #
+ ip -netns h2 link add br0 type bridge
+ ip -netns h2 link set br0 up
+ ip -netns h2 addr add dev br0 ${H2_N2_IP}/24
+ ip -netns h2 -6 addr add dev br0 ${H2_N2_IP6}/64 nodad
+ ip -netns h2 link set eth0 master br0 up
+ ip -netns h2 link set eth1 master br0 up
+
+ # h2 to h1 via r2
+ ip -netns h2 route add default via ${R2_N2_IP} dev br0
+ ip -netns h2 -6 route add default via ${R2_N2_IP6} dev br0
+
+ #
+ # r1
+ #
+ setup_vrf r1
+ create_vrf r1 blue 1101
+ create_vrf r1 red 1102
+ ip -netns r1 link set mtu 1400 dev eth1
+ ip -netns r1 link set eth0 vrf blue up
+ ip -netns r1 link set eth1 vrf red up
+ ip -netns r1 addr add dev eth0 ${R1_N1_IP}/24
+ ip -netns r1 -6 addr add dev eth0 ${R1_N1_IP6}/64 nodad
+ ip -netns r1 addr add dev eth1 ${R1_N2_IP}/24
+ ip -netns r1 -6 addr add dev eth1 ${R1_N2_IP6}/64 nodad
+
+ # Route leak from blue to red
+ ip -netns r1 route add vrf blue ${H2_N2} dev red
+ ip -netns r1 -6 route add vrf blue ${H2_N2_6} dev red
+
+ # No route leak from red to blue
+
+ #
+ # r2
+ #
+ ip -netns r2 addr add dev eth0 ${R2_N1_IP}/24
+ ip -netns r2 -6 addr add dev eth0 ${R2_N1_IP6}/64 nodad
+ ip -netns r2 addr add dev eth1 ${R2_N2_IP}/24
+ ip -netns r2 -6 addr add dev eth1 ${R2_N2_IP6}/64 nodad
+
+ # Wait for ip config to settle
+ sleep 2
+}
+
+check_connectivity()
+{
+ ip netns exec h1 ping -c1 -w1 ${H2_N2_IP} >/dev/null 2>&1
+ log_test $? 0 "Basic IPv4 connectivity"
+ return $?
+}
+
+check_connectivity6()
+{
+ ip netns exec h1 "${ping6}" -c1 -w1 ${H2_N2_IP6} >/dev/null 2>&1
+ log_test $? 0 "Basic IPv6 connectivity"
+ return $?
+}
+
+check_traceroute()
+{
+ if [ ! -x "$(command -v traceroute)" ]; then
+ echo "SKIP: Could not run IPV4 test without traceroute"
+ return 1
+ fi
+}
+
+check_traceroute6()
+{
+ if [ ! -x "$(command -v traceroute6)" ]; then
+ echo "SKIP: Could not run IPV6 test without traceroute6"
+ return 1
+ fi
+}
+
+ipv4_traceroute()
+{
+ local ttype="$1"
+
+ [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE"
+
+ log_section "IPv4 ($ttype route): VRF ICMP error route lookup traceroute"
+
+ check_traceroute || return
+
+ setup_"$ttype"
+
+ check_connectivity || return
+
+ run_cmd_grep "${R1_N1_IP}" ip netns exec h1 traceroute ${H2_N2_IP}
+ log_test $? 0 "Traceroute reports a hop on r1"
+}
+
+ipv4_traceroute_asym()
+{
+ ipv4_traceroute asym
+}
+
+ipv6_traceroute()
+{
+ local ttype="$1"
+
+ [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE"
+
+ log_section "IPv6 ($ttype route): VRF ICMP error route lookup traceroute"
+
+ check_traceroute6 || return
+
+ setup_"$ttype"
+
+ check_connectivity6 || return
+
+ run_cmd_grep "${R1_N1_IP6}" ip netns exec h1 traceroute6 ${H2_N2_IP6}
+ log_test $? 0 "Traceroute6 reports a hop on r1"
+}
+
+ipv6_traceroute_asym()
+{
+ ipv6_traceroute asym
+}
+
+ipv4_ping_ttl()
+{
+ local ttype="$1"
+
+ [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE"
+
+ log_section "IPv4 ($ttype route): VRF ICMP ttl error route lookup ping"
+
+ setup_"$ttype"
+
+ check_connectivity || return
+
+ run_cmd_grep "Time to live exceeded" ip netns exec h1 ping -t1 -c1 -W2 ${H2_N2_IP}
+ log_test $? 0 "Ping received ICMP ttl exceeded"
+}
+
+ipv4_ping_ttl_asym()
+{
+ ipv4_ping_ttl asym
+}
+
+ipv4_ping_frag()
+{
+ local ttype="$1"
+
+ [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE"
+
+ log_section "IPv4 ($ttype route): VRF ICMP fragmentation error route lookup ping"
+
+ setup_"$ttype"
+
+ check_connectivity || return
+
+ run_cmd_grep "Frag needed" ip netns exec h1 ping -s 1450 -Mdo -c1 -W2 ${H2_N2_IP}
+ log_test $? 0 "Ping received ICMP Frag needed"
+}
+
+ipv4_ping_frag_asym()
+{
+ ipv4_ping_frag asym
+}
+
+ipv6_ping_ttl()
+{
+ local ttype="$1"
+
+ [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE"
+
+ log_section "IPv6 ($ttype route): VRF ICMP ttl error route lookup ping"
+
+ setup_"$ttype"
+
+ check_connectivity6 || return
+
+ run_cmd_grep "Time exceeded: Hop limit" ip netns exec h1 "${ping6}" -t1 -c1 -W2 ${H2_N2_IP6}
+ log_test $? 0 "Ping received ICMP Hop limit"
+}
+
+ipv6_ping_ttl_asym()
+{
+ ipv6_ping_ttl asym
+}
+
+ipv6_ping_frag()
+{
+ local ttype="$1"
+
+ [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE"
+
+ log_section "IPv6 ($ttype route): VRF ICMP fragmentation error route lookup ping"
+
+ setup_"$ttype"
+
+ check_connectivity6 || return
+
+ run_cmd_grep "Packet too big" ip netns exec h1 "${ping6}" -s 1450 -Mdo -c1 -W2 ${H2_N2_IP6}
+ log_test $? 0 "Ping received ICMP Packet too big"
+}
+
+ipv6_ping_frag_asym()
+{
+ ipv6_ping_frag asym
+}
+
+################################################################################
+# usage
+
+usage()
+{
+ cat <<EOF
+usage: ${0##*/} OPTS
+
+ -4 Run IPv4 tests only
+ -6 Run IPv6 tests only
+ -t TEST Run only TEST
+ -p Pause on fail
+ -v verbose mode (show commands and output)
+EOF
+}
+
+################################################################################
+# main
+
+# Some systems don't have a ping6 binary anymore
+command -v ping6 > /dev/null 2>&1 && ping6=$(command -v ping6) || ping6=$(command -v ping)
+
+TESTS_IPV4="ipv4_ping_ttl ipv4_traceroute ipv4_ping_frag ipv4_ping_ttl_asym ipv4_traceroute_asym"
+TESTS_IPV6="ipv6_ping_ttl ipv6_traceroute ipv6_ping_frag ipv6_ping_ttl_asym ipv6_traceroute_asym"
+
+ret=0
+nsuccess=0
+nfail=0
+
+while getopts :46t:pvh o
+do
+ case $o in
+ 4) TESTS=ipv4;;
+ 6) TESTS=ipv6;;
+ t) TESTS=$OPTARG;;
+ p) PAUSE_ON_FAIL=yes;;
+ v) VERBOSE=1;;
+ h) usage; exit 0;;
+ *) usage; exit 1;;
+ esac
+done
+
+#
+# show user test config
+#
+if [ -z "$TESTS" ]; then
+ TESTS="$TESTS_IPV4 $TESTS_IPV6"
+elif [ "$TESTS" = "ipv4" ]; then
+ TESTS="$TESTS_IPV4"
+elif [ "$TESTS" = "ipv6" ]; then
+ TESTS="$TESTS_IPV6"
+fi
+
+for t in $TESTS
+do
+ case $t in
+ ipv4_ping_ttl|ping) ipv4_ping_ttl;;&
+ ipv4_ping_ttl_asym|ping) ipv4_ping_ttl_asym;;&
+ ipv4_traceroute|traceroute) ipv4_traceroute;;&
+ ipv4_traceroute_asym|traceroute) ipv4_traceroute_asym;;&
+ ipv4_ping_frag|ping) ipv4_ping_frag;;&
+
+ ipv6_ping_ttl|ping) ipv6_ping_ttl;;&
+ ipv6_ping_ttl_asym|ping) ipv6_ping_ttl_asym;;&
+ ipv6_traceroute|traceroute) ipv6_traceroute;;&
+ ipv6_traceroute_asym|traceroute) ipv6_traceroute_asym;;&
+ ipv6_ping_frag|ping) ipv6_ping_frag;;&
+
+ # setup namespaces and config, but do not run any tests
+ setup_sym|setup) setup_sym; exit 0;;
+ setup_asym) setup_asym; exit 0;;
+
+ help) echo "Test names: $TESTS"; exit 0;;
+ esac
+done
+
+cleanup
+
+printf "\nTests passed: %3d\n" ${nsuccess}
+printf "Tests failed: %3d\n" ${nfail}
+
+exit $ret
diff --git a/tools/testing/selftests/net/vrf_strict_mode_test.sh b/tools/testing/selftests/net/vrf_strict_mode_test.sh
new file mode 100755
index 0000000..18b982d
--- /dev/null
+++ b/tools/testing/selftests/net/vrf_strict_mode_test.sh
@@ -0,0 +1,396 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test is designed for testing the new VRF strict_mode functionality.
+
+ret=0
+
+# identifies the "init" network namespace which is often called root network
+# namespace.
+INIT_NETNS_NAME="init"
+
+PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no}
+
+log_test()
+{
+ local rc=$1
+ local expected=$2
+ local msg="$3"
+
+ if [ ${rc} -eq ${expected} ]; then
+ nsuccess=$((nsuccess+1))
+ printf "\n TEST: %-60s [ OK ]\n" "${msg}"
+ else
+ ret=1
+ nfail=$((nfail+1))
+ printf "\n TEST: %-60s [FAIL]\n" "${msg}"
+ if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+ echo
+ echo "hit enter to continue, 'q' to quit"
+ read a
+ [ "$a" = "q" ] && exit 1
+ fi
+ fi
+}
+
+print_log_test_results()
+{
+ if [ "$TESTS" != "none" ]; then
+ printf "\nTests passed: %3d\n" ${nsuccess}
+ printf "Tests failed: %3d\n" ${nfail}
+ fi
+}
+
+log_section()
+{
+ echo
+ echo "################################################################################"
+ echo "TEST SECTION: $*"
+ echo "################################################################################"
+}
+
+ip_expand_args()
+{
+ local nsname=$1
+ local nsarg=""
+
+ if [ "${nsname}" != "${INIT_NETNS_NAME}" ]; then
+ nsarg="-netns ${nsname}"
+ fi
+
+ echo "${nsarg}"
+}
+
+vrf_count()
+{
+ local nsname=$1
+ local nsarg="$(ip_expand_args ${nsname})"
+
+ ip ${nsarg} -o link show type vrf | wc -l
+}
+
+count_vrf_by_table_id()
+{
+ local nsname=$1
+ local tableid=$2
+ local nsarg="$(ip_expand_args ${nsname})"
+
+ ip ${nsarg} -d -o link show type vrf | grep "table ${tableid}" | wc -l
+}
+
+add_vrf()
+{
+ local nsname=$1
+ local vrfname=$2
+ local vrftable=$3
+ local nsarg="$(ip_expand_args ${nsname})"
+
+ ip ${nsarg} link add ${vrfname} type vrf table ${vrftable} &>/dev/null
+}
+
+add_vrf_and_check()
+{
+ local nsname=$1
+ local vrfname=$2
+ local vrftable=$3
+ local cnt
+ local rc
+
+ add_vrf ${nsname} ${vrfname} ${vrftable}; rc=$?
+
+ cnt=$(count_vrf_by_table_id ${nsname} ${vrftable})
+
+ log_test ${rc} 0 "${nsname}: add vrf ${vrfname}, ${cnt} vrfs for table ${vrftable}"
+}
+
+add_vrf_and_check_fail()
+{
+ local nsname=$1
+ local vrfname=$2
+ local vrftable=$3
+ local cnt
+ local rc
+
+ add_vrf ${nsname} ${vrfname} ${vrftable}; rc=$?
+
+ cnt=$(count_vrf_by_table_id ${nsname} ${vrftable})
+
+ log_test ${rc} 2 "${nsname}: CANNOT add vrf ${vrfname}, ${cnt} vrfs for table ${vrftable}"
+}
+
+del_vrf_and_check()
+{
+ local nsname=$1
+ local vrfname=$2
+ local nsarg="$(ip_expand_args ${nsname})"
+
+ ip ${nsarg} link del ${vrfname}
+ log_test $? 0 "${nsname}: remove vrf ${vrfname}"
+}
+
+config_vrf_and_check()
+{
+ local nsname=$1
+ local addr=$2
+ local vrfname=$3
+ local nsarg="$(ip_expand_args ${nsname})"
+
+ ip ${nsarg} link set dev ${vrfname} up && \
+ ip ${nsarg} addr add ${addr} dev ${vrfname}
+ log_test $? 0 "${nsname}: vrf ${vrfname} up, addr ${addr}"
+}
+
+read_strict_mode()
+{
+ local nsname=$1
+ local rval
+ local rc=0
+ local nsexec=""
+
+ if [ "${nsname}" != "${INIT_NETNS_NAME}" ]; then
+ # a custom network namespace is provided
+ nsexec="ip netns exec ${nsname}"
+ fi
+
+ rval="$(${nsexec} bash -c "cat /proc/sys/net/vrf/strict_mode" | \
+ grep -E "^[0-1]$")" &> /dev/null
+ if [ $? -ne 0 ]; then
+ # set errors
+ rval=255
+ rc=1
+ fi
+
+ # on success, rval can be only 0 or 1; on error, rval is equal to 255
+ echo ${rval}
+ return ${rc}
+}
+
+read_strict_mode_compare_and_check()
+{
+ local nsname=$1
+ local expected=$2
+ local res
+
+ res="$(read_strict_mode ${nsname})"
+ log_test ${res} ${expected} "${nsname}: check strict_mode=${res}"
+}
+
+set_strict_mode()
+{
+ local nsname=$1
+ local val=$2
+ local nsexec=""
+
+ if [ "${nsname}" != "${INIT_NETNS_NAME}" ]; then
+ # a custom network namespace is provided
+ nsexec="ip netns exec ${nsname}"
+ fi
+
+ ${nsexec} bash -c "echo ${val} >/proc/sys/net/vrf/strict_mode" &>/dev/null
+}
+
+enable_strict_mode()
+{
+ local nsname=$1
+
+ set_strict_mode ${nsname} 1
+}
+
+disable_strict_mode()
+{
+ local nsname=$1
+
+ set_strict_mode ${nsname} 0
+}
+
+disable_strict_mode_and_check()
+{
+ local nsname=$1
+
+ disable_strict_mode ${nsname}
+ log_test $? 0 "${nsname}: disable strict_mode (=0)"
+}
+
+enable_strict_mode_and_check()
+{
+ local nsname=$1
+
+ enable_strict_mode ${nsname}
+ log_test $? 0 "${nsname}: enable strict_mode (=1)"
+}
+
+enable_strict_mode_and_check_fail()
+{
+ local nsname=$1
+
+ enable_strict_mode ${nsname}
+ log_test $? 1 "${nsname}: CANNOT enable strict_mode"
+}
+
+strict_mode_check_default()
+{
+ local nsname=$1
+ local strictmode
+ local vrfcnt
+
+ vrfcnt=$(vrf_count ${nsname})
+ strictmode=$(read_strict_mode ${nsname})
+ log_test ${strictmode} 0 "${nsname}: strict_mode=0 by default, ${vrfcnt} vrfs"
+}
+
+setup()
+{
+ modprobe vrf
+
+ ip netns add testns
+ ip netns exec testns ip link set lo up
+}
+
+cleanup()
+{
+ ip netns del testns 2>/dev/null
+
+ ip link del vrf100 2>/dev/null
+ ip link del vrf101 2>/dev/null
+ ip link del vrf102 2>/dev/null
+
+ echo 0 >/proc/sys/net/vrf/strict_mode 2>/dev/null
+}
+
+vrf_strict_mode_tests_init()
+{
+ vrf_strict_mode_check_support init
+
+ strict_mode_check_default init
+
+ add_vrf_and_check init vrf100 100
+ config_vrf_and_check init 172.16.100.1/24 vrf100
+
+ enable_strict_mode_and_check init
+
+ add_vrf_and_check_fail init vrf101 100
+
+ disable_strict_mode_and_check init
+
+ add_vrf_and_check init vrf101 100
+ config_vrf_and_check init 172.16.101.1/24 vrf101
+
+ enable_strict_mode_and_check_fail init
+
+ del_vrf_and_check init vrf101
+
+ enable_strict_mode_and_check init
+
+ add_vrf_and_check init vrf102 102
+ config_vrf_and_check init 172.16.102.1/24 vrf102
+
+ # the strict_modle is enabled in the init
+}
+
+vrf_strict_mode_tests_testns()
+{
+ vrf_strict_mode_check_support testns
+
+ strict_mode_check_default testns
+
+ enable_strict_mode_and_check testns
+
+ add_vrf_and_check testns vrf100 100
+ config_vrf_and_check testns 10.0.100.1/24 vrf100
+
+ add_vrf_and_check_fail testns vrf101 100
+
+ add_vrf_and_check_fail testns vrf102 100
+
+ add_vrf_and_check testns vrf200 200
+
+ disable_strict_mode_and_check testns
+
+ add_vrf_and_check testns vrf101 100
+
+ add_vrf_and_check testns vrf102 100
+
+ #the strict_mode is disabled in the testns
+}
+
+vrf_strict_mode_tests_mix()
+{
+ read_strict_mode_compare_and_check init 1
+
+ read_strict_mode_compare_and_check testns 0
+
+ del_vrf_and_check testns vrf101
+
+ del_vrf_and_check testns vrf102
+
+ disable_strict_mode_and_check init
+
+ enable_strict_mode_and_check testns
+
+ enable_strict_mode_and_check init
+ enable_strict_mode_and_check init
+
+ disable_strict_mode_and_check testns
+ disable_strict_mode_and_check testns
+
+ read_strict_mode_compare_and_check init 1
+
+ read_strict_mode_compare_and_check testns 0
+}
+
+vrf_strict_mode_tests()
+{
+ log_section "VRF strict_mode test on init network namespace"
+ vrf_strict_mode_tests_init
+
+ log_section "VRF strict_mode test on testns network namespace"
+ vrf_strict_mode_tests_testns
+
+ log_section "VRF strict_mode test mixing init and testns network namespaces"
+ vrf_strict_mode_tests_mix
+}
+
+vrf_strict_mode_check_support()
+{
+ local nsname=$1
+ local output
+ local rc
+
+ output="$(lsmod | grep '^vrf' | awk '{print $1}')"
+ if [ -z "${output}" ]; then
+ modinfo vrf || return $?
+ fi
+
+ # we do not care about the value of the strict_mode; we only check if
+ # the strict_mode parameter is available or not.
+ read_strict_mode ${nsname} &>/dev/null; rc=$?
+ log_test ${rc} 0 "${nsname}: net.vrf.strict_mode is available"
+
+ return ${rc}
+}
+
+if [ "$(id -u)" -ne 0 ];then
+ echo "SKIP: Need root privileges"
+ exit 0
+fi
+
+if [ ! -x "$(command -v ip)" ]; then
+ echo "SKIP: Could not run test without ip tool"
+ exit 0
+fi
+
+modprobe vrf &>/dev/null
+if [ ! -e /proc/sys/net/vrf/strict_mode ]; then
+ echo "SKIP: vrf sysctl does not exist"
+ exit 0
+fi
+
+cleanup &> /dev/null
+
+setup
+vrf_strict_mode_tests
+cleanup
+
+print_log_test_results
+
+exit $ret