// SPDX-License-Identifier: GPL-2.0
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:
 *	Pedro Roque	:	Fast Retransmit/Recovery.
 *				Two receive queues.
 *				Retransmit queue handled by TCP.
 *				Better retransmit timer handling.
 *				New congestion avoidance.
 *				Header prediction.
 *				Variable renaming.
 *
 *	Eric		:	Fast Retransmit.
 *	Randy Scott	:	MSS option defines.
 *	Eric Schenk	:	Fixes to slow start algorithm.
 *	Eric Schenk	:	Yet another double ACK bug.
 *	Eric Schenk	:	Delayed ACK bug fixes.
 *	Eric Schenk	:	Floyd style fast retrans war avoidance.
 *	David S. Miller	:	Don't allow zero congestion window.
 *	Eric Schenk	:	Fix retransmitter so that it sends
 *				next packet on ack of previous packet.
 *	Andi Kleen	:	Moved open_request checking here
 *				and process RSTs for open_requests.
 *	Andi Kleen	:	Better prune_queue, and other fixes.
 *	Andrey Savochkin:	Fix RTT measurements in the presence of
 *				timestamps.
 *	Andrey Savochkin:	Check sequence numbers correctly when
 *				removing SACKs due to in sequence incoming
 *				data segments.
 *	Andi Kleen	:	Make sure we never ACK data for which there
 *				is not enough room. Also make this condition
 *				a fatal error if it might still happen.
 *	Andi Kleen	:	Add tcp_measure_rcv_mss to make
 *				connections with MSS<min(MTU,ann. MSS)
 *				work without delayed ACKs.
 *	Andi Kleen	:	Process packets with PSH set in the
 *				fast path.
 *	J Hadi Salim	:	ECN support
 *	Andrei Gurtov,
 *	Pasi Sarolahti,
 *	Panu Kuhlberg	:	Experimental audit of TCP (re)transmission
 *				engine. Lots of bugs were found.
 *	Pasi Sarolahti	:	F-RTO for dealing with spurious RTOs
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/kernel.h>
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <linux/errqueue.h>
#include <trace/events/tcp.h>
#include <linux/jump_label_ratelimit.h>
#include <net/busy_poll.h>
#include <net/mptcp.h>

int sysctl_tcp_max_orphans __read_mostly = NR_FILE;

#define FLAG_DATA		0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update. */
#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data. */
#define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted. */
#define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN. */
#define FLAG_DATA_SACKED	0x20 /* New SACK. */
#define FLAG_ECE		0x40 /* ECE in this ACK */
#define FLAG_LOST_RETRANS	0x80 /* This ACK marks some retransmission lost */
#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update. */
#define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
#define FLAG_SET_XMIT_TIMER	0x1000 /* Set TLP or RTO timer */
#define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */
#define FLAG_NO_CHALLENGE_ACK	0x8000 /* do not call tcp_send_challenge_ack() */
#define FLAG_ACK_MAYBE_DELAYED	0x10000 /* Likely a delayed ACK */

#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)

#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))

#define REXMIT_NONE	0 /* no loss recovery to do */
#define REXMIT_LOST	1 /* retransmit packets marked lost */
#define REXMIT_NEW	2 /* FRTO-style transmit of unsent/new packets */

#if IS_ENABLED(CONFIG_TLS_DEVICE)
static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);

void clean_acked_data_enable(struct inet_connection_sock *icsk,
			     void (*cad)(struct sock *sk, u32 ack_seq))
{
	icsk->icsk_clean_acked = cad;
	static_branch_deferred_inc(&clean_acked_data_enabled);
}
EXPORT_SYMBOL_GPL(clean_acked_data_enable);

void clean_acked_data_disable(struct inet_connection_sock *icsk)
{
	static_branch_slow_dec_deferred(&clean_acked_data_enabled);
	icsk->icsk_clean_acked = NULL;
}
EXPORT_SYMBOL_GPL(clean_acked_data_disable);

void clean_acked_data_flush(void)
{
	static_key_deferred_flush(&clean_acked_data_enabled);
}
EXPORT_SYMBOL_GPL(clean_acked_data_flush);
#endif

#ifdef CONFIG_CGROUP_BPF
static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
{
	bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown &&
			   BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
						  BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
	bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
						    BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
	struct bpf_sock_ops_kern sock_ops;

	if (likely(!unknown_opt && !parse_all_opt))
		return;

	/* The skb will be handled in the
	 * bpf_skops_established() or
	 * bpf_skops_write_hdr_opt().
	 */
	switch (sk->sk_state) {
	case TCP_SYN_RECV:
	case TCP_SYN_SENT:
	case TCP_LISTEN:
		return;
	}

	sock_owned_by_me(sk);

	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
	sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
	sock_ops.is_fullsock = 1;
	sock_ops.sk = sk;
	bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));

	BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
}

static void bpf_skops_established(struct sock *sk, int bpf_op,
				  struct sk_buff *skb)
{
	struct bpf_sock_ops_kern sock_ops;

	sock_owned_by_me(sk);

	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
	sock_ops.op = bpf_op;
	sock_ops.is_fullsock = 1;
	sock_ops.sk = sk;
	/* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
	if (skb)
		bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));

	BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
}
#else
static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
{
}

static void bpf_skops_established(struct sock *sk, int bpf_op,
				  struct sk_buff *skb)
{
}
#endif

static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
			     unsigned int len)
{
	static bool __once __read_mostly;

	if (!__once) {
		struct net_device *dev;

		__once = true;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
		if (!dev || len >= dev->mtu)
			pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
				dev ? dev->name : "Unknown driver");
		rcu_read_unlock();
	}
}

/* Adapt the MSS value used to make delayed ACK decisions to the
 * real world.
 */
static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const unsigned int lss = icsk->icsk_ack.last_seg_size;
	unsigned int len;

	icsk->icsk_ack.last_seg_size = 0;

	/* skb->len may jitter because of SACKs, even if peer
	 * sends good full-sized frames.
	 */
	len = skb_shinfo(skb)->gso_size ? : skb->len;
	if (len >= icsk->icsk_ack.rcv_mss) {
		icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
					       tcp_sk(sk)->advmss);
		/* Account for possibly-removed options */
		if (unlikely(len > icsk->icsk_ack.rcv_mss +
				   MAX_TCP_OPTION_SPACE))
			tcp_gro_dev_warn(sk, skb, len);
	} else {
		/* Otherwise, we make a more careful check, taking into
		 * account that the SACK block is variable.
		 *
		 * "len" is the invariant segment length, including the
		 * TCP header.
		 */
		len += skb->data - skb_transport_header(skb);
		if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
		    /* If PSH is not set, packet should be
		     * full sized, provided peer TCP is not badly broken.
		     * This observation (if it is correct 8)) allows us
		     * to handle super-low MTU links fairly.
		     */
		    (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
		     !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
			/* Also subtract the invariant portion (if the peer is
			 * RFC compliant): the TCP header plus the fixed
			 * timestamp option length.
			 * The resulting "len" is the MSS, free of SACK jitter.
			 */
			len -= tcp_sk(sk)->tcp_header_len;
			icsk->icsk_ack.last_seg_size = len;
			if (len == lss) {
				icsk->icsk_ack.rcv_mss = len;
				return;
			}
		}
		if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
			icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
		icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
	}
}

static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);

	if (quickacks == 0)
		quickacks = 2;
	quickacks = min(quickacks, max_quickacks);
	if (quickacks > icsk->icsk_ack.quick)
		icsk->icsk_ack.quick = quickacks;
}
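
/* Worked example (illustrative numbers, not taken from the source): with
 * rcv_wnd = 65535 and rcv_mss = 1448, quickacks = 65535 / (2 * 1448) = 22,
 * i.e. enough quick ACKs to cover about half a receive window, before the
 * clamping to max_quickacks above.
 */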

void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_incr_quickack(sk, max_quickacks);
	inet_csk_exit_pingpong_mode(sk);
	icsk->icsk_ack.ato = TCP_ATO_MIN;
}
EXPORT_SYMBOL(tcp_enter_quickack_mode);

/* Send ACKs quickly, if "quick" count is not exhausted
 * and the session is not interactive.
 */

static bool tcp_in_quickack_mode(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct dst_entry *dst = __sk_dst_get(sk);

	return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
		(icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
}

static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
{
	if (tp->ecn_flags & TCP_ECN_OK)
		tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}

static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
{
	if (tcp_hdr(skb)->cwr) {
		tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;

		/* If the sender is telling us it has entered CWR, then its
		 * cwnd may be very low (even just 1 packet), so we should ACK
		 * immediately.
		 */
		if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
			inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
	}
}

static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
{
	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
}

static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
	case INET_ECN_NOT_ECT:
		/* Funny extension: if ECT is not set on a segment,
		 * and we have already seen ECT on a previous segment,
		 * it is probably a retransmit.
		 */
		if (tp->ecn_flags & TCP_ECN_SEEN)
			tcp_enter_quickack_mode(sk, 2);
		break;
	case INET_ECN_CE:
		if (tcp_ca_needs_ecn(sk))
			tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);

		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
			/* Better not delay acks, sender can have a very low cwnd */
			tcp_enter_quickack_mode(sk, 2);
			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
		}
		tp->ecn_flags |= TCP_ECN_SEEN;
		break;
	default:
		if (tcp_ca_needs_ecn(sk))
			tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
		tp->ecn_flags |= TCP_ECN_SEEN;
		break;
	}
}

static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
{
	if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
		__tcp_ecn_check_ce(sk, skb);
}

static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
{
	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
		tp->ecn_flags &= ~TCP_ECN_OK;
}

static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
{
	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
		tp->ecn_flags &= ~TCP_ECN_OK;
}

static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
{
	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
		return true;
	return false;
}

/* Buffer size and advertised window tuning.
 *
 * 1. Tuning sk->sk_sndbuf, when the connection enters the established state.
 */

static void tcp_sndbuf_expand(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
	int sndmem, per_mss;
	u32 nr_segs;

	/* Worst case is non GSO/TSO : each frame consumes one skb
	 * and skb->head is kmalloced using a power-of-two area of memory
	 */
	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
		  MAX_TCP_HEADER +
		  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	per_mss = roundup_pow_of_two(per_mss) +
		  SKB_DATA_ALIGN(sizeof(struct sk_buff));

	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
	nr_segs = max_t(u32, nr_segs, tp->reordering + 1);

	/* Fast Recovery (RFC 5681 3.2) :
	 * Cubic needs 1.7 factor, rounded to 2 to include
	 * extra cushion (application might react slowly to EPOLLOUT)
	 */
	sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
	sndmem *= nr_segs * per_mss;

	if (sk->sk_sndbuf < sndmem)
		WRITE_ONCE(sk->sk_sndbuf,
			   min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]));
}
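
/* Worked example (illustrative, and the exact constants are config
 * dependent): with mss_cache = 1448, per_mss is 1448 + MAX_TCP_HEADER +
 * the aligned skb_shared_info, rounded up to the next power of two
 * (e.g. 4096 on a typical build) plus the aligned struct sk_buff.
 * With snd_cwnd = 10 and the default factor of 2, sndmem then covers
 * roughly 20 such worst-case buffers.
 */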

/* 2. Tuning the advertised window (window_clamp, rcv_ssthresh)
 *
 * All of tcp_full_space() is split into two parts: the "network" buffer,
 * allocated forward and advertised in the receiver window (tp->rcv_wnd),
 * and the "application buffer", required to isolate scheduling/application
 * latencies from the network.
 * window_clamp is the maximal advertised window. It can be less than
 * tcp_full_space(); in this case tcp_full_space() - window_clamp
 * is reserved for the "application" buffer. The smaller window_clamp is,
 * the smoother our behaviour from the viewpoint of the network, but the
 * lower the throughput and the higher the sensitivity of the connection
 * to losses. 8)
 *
 * rcv_ssthresh is a stricter window_clamp used in the "slow start"
 * phase to predict the further behaviour of this connection.
 * It is used for two goals:
 * - to enforce header prediction at the sender, even when the application
 *   requires some significant "application buffer". This is check #1.
 * - to prevent pruning of the receive queue because of misprediction
 *   of the receiver window. Check #2.
 *
 * The scheme does not work when the sender sends good segments opening
 * the window and then starts to feed us spaghetti. But it should work
 * in common situations. Otherwise, we have to rely on queue collapsing.
 */

/* Slow part of check#2. */
static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* Optimize this! */
	int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
	int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;

	while (tp->rcv_ssthresh <= window) {
		if (truesize <= skb->len)
			return 2 * inet_csk(sk)->icsk_ack.rcv_mss;

		truesize >>= 1;
		window >>= 1;
	}
	return 0;
}

static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int room;

	room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;

	/* Check #1 */
	if (room > 0 && !tcp_under_memory_pressure(sk)) {
		int incr;

		/* Check #2. Increase window, if skb with such overhead
		 * will fit to rcvbuf in future.
		 */
		if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
			incr = 2 * tp->advmss;
		else
			incr = __tcp_grow_window(sk, skb);

		if (incr) {
			incr = max_t(int, incr, 2 * skb->len);
			tp->rcv_ssthresh += min(room, incr);
			inet_csk(sk)->icsk_ack.quick |= 1;
		}
	}
}

/* 3. Try to fix it all up. This is done immediately after the connection
 * enters the established state.
 */
static void tcp_init_buffer_space(struct sock *sk)
{
	int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
	struct tcp_sock *tp = tcp_sk(sk);
	int maxwin;

	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
		tcp_sndbuf_expand(sk);

	tcp_mstamp_refresh(tp);
	tp->rcvq_space.time = tp->tcp_mstamp;
	tp->rcvq_space.seq = tp->copied_seq;

	maxwin = tcp_full_space(sk);

	if (tp->window_clamp >= maxwin) {
		tp->window_clamp = maxwin;

		if (tcp_app_win && maxwin > 4 * tp->advmss)
			tp->window_clamp = max(maxwin -
					       (maxwin >> tcp_app_win),
					       4 * tp->advmss);
	}

	/* Force reservation of one segment. */
	if (tcp_app_win &&
	    tp->window_clamp > 2 * tp->advmss &&
	    tp->window_clamp + tp->advmss > maxwin)
		tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);

	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
	tp->snd_cwnd_stamp = tcp_jiffies32;
	tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd,
				    (u32)TCP_INIT_CWND * tp->advmss);
}

/* 4. Recalculate window clamp after the socket hits its memory bounds. */
static void tcp_clamp_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct net *net = sock_net(sk);

	icsk->icsk_ack.quick = 0;

	if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
	    !tcp_under_memory_pressure(sk) &&
	    sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
		WRITE_ONCE(sk->sk_rcvbuf,
			   min(atomic_read(&sk->sk_rmem_alloc),
			       net->ipv4.sysctl_tcp_rmem[2]));
	}
	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
		tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
}

/* Initialize RCV_MSS value.
 * RCV_MSS is our guess about the MSS used by the peer.
 * We don't have any direct information about the MSS.
 * It's better to underestimate the RCV_MSS rather than overestimate it.
 * Overestimations make us ACK less frequently than needed.
 * Underestimations are easier to detect and fix by tcp_measure_rcv_mss().
 */
void tcp_initialize_rcv_mss(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);

	hint = min(hint, tp->rcv_wnd / 2);
	hint = min(hint, TCP_MSS_DEFAULT);
	hint = max(hint, TCP_MIN_MSS);

	inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
EXPORT_SYMBOL(tcp_initialize_rcv_mss);

/* Receiver "autotuning" code.
 *
 * The algorithm for RTT estimation w/o timestamps is based on
 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
 * <https://public.lanl.gov/radiant/pubs.html#DRS>
 *
 * More detail on this code can be found at
 * <http://staff.psc.edu/jheffner/>,
 * though this reference is out of date. A new paper
 * is pending.
 */
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
	u32 new_sample = tp->rcv_rtt_est.rtt_us;
	long m = sample;

	if (new_sample != 0) {
		/* If we sample in larger samples in the non-timestamp
		 * case, we could grossly overestimate the RTT especially
		 * with chatty applications or bulk transfer apps which
		 * are stalled on filesystem I/O.
		 *
		 * Also, since we are only going for a minimum in the
		 * non-timestamp case, we do not smooth things out
		 * else with timestamps disabled convergence takes too
		 * long.
		 */
		if (!win_dep) {
			m -= (new_sample >> 3);
			new_sample += m;
		} else {
			m <<= 3;
			if (m < new_sample)
				new_sample = m;
		}
	} else {
		/* No previous measure. */
		new_sample = m << 3;
	}

	tp->rcv_rtt_est.rtt_us = new_sample;
}
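
/* Fixed-point note (derived from the code above): rcv_rtt_est.rtt_us holds
 * the estimate left-shifted by 3. The !win_dep branch
 * (m -= new_sample >> 3; new_sample += m) is the usual EWMA
 * new = 7/8 * old + 1/8 * sample expressed on the scaled value, while the
 * win_dep branch only tracks the minimum of the scaled samples.
 */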

static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
{
	u32 delta_us;

	if (tp->rcv_rtt_est.time == 0)
		goto new_measure;
	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
		return;
	delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
	if (!delta_us)
		delta_us = 1;
	tcp_rcv_rtt_update(tp, delta_us, 1);

new_measure:
	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
	tp->rcv_rtt_est.time = tp->tcp_mstamp;
}

static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
					  const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)
		return;
	tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;

	if (TCP_SKB_CB(skb)->end_seq -
	    TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
		u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
		u32 delta_us;

		if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
			if (!delta)
				delta = 1;
			delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
			tcp_rcv_rtt_update(tp, delta_us, 0);
		}
	}
}

/*
 * This function should be called every time data is copied to user space.
 * It calculates the appropriate TCP receive buffer space.
 */
void tcp_rcv_space_adjust(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 copied;
	int time;

	trace_tcp_rcv_space_adjust(sk);

	tcp_mstamp_refresh(tp);
	time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
	if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
		return;

	/* Number of bytes copied to user in last RTT */
	copied = tp->copied_seq - tp->rcvq_space.seq;
	if (copied <= tp->rcvq_space.space)
		goto new_measure;

	/* A bit of theory :
	 * copied = bytes received in previous RTT, our base window
	 * To cope with packet losses, we need a 2x factor
	 * To cope with slow start, and sender growing its cwin by 100 %
	 * every RTT, we need a 4x factor, because the ACK we are sending
	 * now is for the next RTT, not the current one :
	 * <prev RTT . ><current RTT .. ><next RTT .... >
	 */
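
	/* Worked example (illustrative numbers, not taken from the source):
	 * if 1,000,000 bytes were copied in the last RTT and advmss = 1448,
	 * the base rcvwin below is 2 * 1000000 + 16 * 1448 ~= 2.02 MB; the
	 * "grow" term then adds room proportional to how much faster the
	 * sender has become since the previous measurement.
	 */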

	if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
		int rcvmem, rcvbuf;
		u64 rcvwin, grow;

		/* minimal window to cope with packet losses, assuming
		 * steady state. Add some cushion because of small variations.
		 */
		rcvwin = ((u64)copied << 1) + 16 * tp->advmss;

		/* Accommodate for sender rate increase (eg. slow start) */
		grow = rcvwin * (copied - tp->rcvq_space.space);
		do_div(grow, tp->rcvq_space.space);
		rcvwin += (grow << 1);

		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
		while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
			rcvmem += 128;

		do_div(rcvwin, tp->advmss);
		rcvbuf = min_t(u64, rcvwin * rcvmem,
			       sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
		if (rcvbuf > sk->sk_rcvbuf) {
			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);

			/* Make the window clamp follow along. */
			tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
		}
	}
	tp->rcvq_space.space = copied;

new_measure:
	tp->rcvq_space.seq = tp->copied_seq;
	tp->rcvq_space.time = tp->tcp_mstamp;
}

/* There is something which you must keep in mind when you analyze the
 * behavior of the tp->ato delayed ack timeout interval.  When a
 * connection starts up, we want to ack as quickly as possible.  The
 * problem is that "good" TCP's do slow start at the beginning of data
 * transmission.  This means that until we send the first few ACK's the
 * sender will sit on his end and only queue most of his data, because
 * he can only send snd_cwnd unacked packets at any given time.  For
 * each ACK we send, he increments snd_cwnd and transmits more of his
 * queue.  -DaveM
 */
static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	u32 now;

	inet_csk_schedule_ack(sk);

	tcp_measure_rcv_mss(sk, skb);

	tcp_rcv_rtt_measure(tp);

	now = tcp_jiffies32;

	if (!icsk->icsk_ack.ato) {
		/* The _first_ data packet received, initialize
		 * delayed ACK engine.
		 */
		tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
		icsk->icsk_ack.ato = TCP_ATO_MIN;
	} else {
		int m = now - icsk->icsk_ack.lrcvtime;

		if (m <= TCP_ATO_MIN / 2) {
			/* The fastest case is the first. */
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
		} else if (m < icsk->icsk_ack.ato) {
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
			if (icsk->icsk_ack.ato > icsk->icsk_rto)
				icsk->icsk_ack.ato = icsk->icsk_rto;
		} else if (m > icsk->icsk_rto) {
			/* Too long gap. Apparently sender failed to
			 * restart window, so that we send ACKs quickly.
			 */
			tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
			sk_mem_reclaim(sk);
		}
	}
	icsk->icsk_ack.lrcvtime = now;

	tcp_ecn_check_ce(sk, skb);

	if (skb->len >= 128)
		tcp_grow_window(sk, skb);
}
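
/* ATO update example (derived from the code above): when the gap m since
 * the last receive is at most TCP_ATO_MIN / 2, the new ato is
 * ato / 2 + TCP_ATO_MIN / 2, i.e. an exponential decay toward TCP_ATO_MIN;
 * a gap larger than the RTO instead re-enters quickack mode.
 */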

/* Called to compute a smoothed rtt estimate. The data fed to this
 * routine either comes from timestamps, or from segments that were
 * known _not_ to have been retransmitted [see Karn/Partridge
 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
 * piece by Van Jacobson.
 * NOTE: the next three routines used to be one big routine.
 * To save cycles in the RFC 1323 implementation it was better to break
 * it up into three procedures. -- erics
 */
static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
{
	struct tcp_sock *tp = tcp_sk(sk);
	long m = mrtt_us; /* RTT */
	u32 srtt = tp->srtt_us;

	/* The following amusing code comes from Jacobson's
	 * article in SIGCOMM '88.  Note that rtt and mdev
	 * are scaled versions of rtt and mean deviation.
	 * This is designed to be as fast as possible;
	 * m stands for "measurement".
	 *
	 * In a 1990 paper the rto value was changed to:
	 * RTO = rtt + 4 * mdev
	 *
	 * Funny. This algorithm seems to be very broken.
	 * These formulae increase RTO when it should be decreased, increase
	 * it too slowly when it should be increased quickly, decrease it too
	 * quickly, etc. I guess in BSD the RTO takes ONE value, so that it
	 * absolutely does not matter how to _calculate_ it. It seems it was
	 * a trap that VJ failed to avoid. 8)
	 */
	if (srtt != 0) {
		m -= (srtt >> 3);	/* m is now error in rtt est */
		srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
		if (m < 0) {
			m = -m;		/* m is now abs(error) */
			m -= (tp->mdev_us >> 2);	/* similar update on mdev */
			/* This is similar to one of Eifel findings.
			 * Eifel blocks mdev updates when rtt decreases.
			 * This solution is a bit different: we use finer gain
			 * for mdev in this case (alpha*beta).
			 * Like Eifel it also prevents growth of rto,
			 * but also it limits too fast rto decreases,
			 * happening in pure Eifel.
			 */
			if (m > 0)
				m >>= 3;
		} else {
			m -= (tp->mdev_us >> 2);	/* similar update on mdev */
		}
		tp->mdev_us += m;		/* mdev = 3/4 mdev + 1/4 new */
		if (tp->mdev_us > tp->mdev_max_us) {
			tp->mdev_max_us = tp->mdev_us;
			if (tp->mdev_max_us > tp->rttvar_us)
				tp->rttvar_us = tp->mdev_max_us;
		}
		if (after(tp->snd_una, tp->rtt_seq)) {
			if (tp->mdev_max_us < tp->rttvar_us)
				tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
			tp->rtt_seq = tp->snd_nxt;
			tp->mdev_max_us = tcp_rto_min_us(sk);

			tcp_bpf_rtt(sk);
		}
	} else {
		/* no previous measure. */
		srtt = m << 3;		/* take the measured time to be rtt */
		tp->mdev_us = m << 1;	/* make sure rto = 3*rtt */
		tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
		tp->mdev_max_us = tp->rttvar_us;
		tp->rtt_seq = tp->snd_nxt;

		tcp_bpf_rtt(sk);
	}
	tp->srtt_us = max(1U, srtt);
}
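
/* Worked example (illustrative numbers, not taken from the source): srtt_us
 * stores 8 * srtt. With srtt_us = 800 (a 100 us smoothed RTT) and a new
 * sample m = 120 us: m -= srtt >> 3 leaves m = 20, so srtt becomes 820,
 * i.e. 102.5 us, which is exactly 7/8 * 100 + 1/8 * 120. mdev_us is kept
 * scaled by 4 in the same fashion.
 */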

static void tcp_update_pacing_rate(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u64 rate;

	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);

	/* current rate is (cwnd * mss) / srtt
	 * In Slow Start [1], set sk_pacing_rate to 200 % of the current rate.
	 * In Congestion Avoidance phase, set it to 120 % of the current rate.
	 *
	 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
	 *	 If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
	 *	 end of slow start and should slow down.
	 */
	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
		rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio;
	else
		rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio;

	rate *= max(tp->snd_cwnd, tp->packets_out);

	if (likely(tp->srtt_us))
		do_div(rate, tp->srtt_us);

	/* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate
	 * without any lock. We want to make sure the compiler won't store
	 * intermediate values in this location.
	 */
	WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate,
					     sk->sk_max_pacing_rate));
}
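
/* Worked example (illustrative numbers, not taken from the source): with
 * mss_cache = 1448, snd_cwnd = 10 and a 10 ms smoothed RTT
 * (srtt_us = 80000, scaled by 8), slow start gives
 * rate = 1448 * 80000 * 200 * 10 / 80000 = 2896000 bytes/sec, i.e. exactly
 * 200 % of the current cwnd * mss / srtt delivery rate.
 */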

/* Calculate rto without backoff.  This is the second half of Van Jacobson's
 * routine referred to above.
 */
static void tcp_set_rto(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	/* Old crap is replaced with new one. 8)
	 *
	 * More seriously:
	 * 1. If rtt variance happened to be less than 50 msec, it is
	 *    hallucination. It cannot be less due to utterly erratic
	 *    ACK generation made at least by solaris and freebsd.
	 *    "Erratic ACKs" have _nothing_ to do with delayed acks, because
	 *    at cwnd>2 true delack timeout is invisible. Actually, Linux-2.4
	 *    also generates erratic ACKs in some circumstances.
	 */
	inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);

	/* 2. Fixups made earlier cannot be right.
	 *    If we do not estimate RTO correctly without them,
	 *    all the algo is pure shit and should be replaced
	 *    with a correct one. That is exactly what we pretend to do.
	 */

	/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
	 * guarantees that rto is higher.
	 */
	tcp_bound_rto(sk);
}

__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
{
	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);

	if (!cwnd)
		cwnd = TCP_INIT_CWND;
	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}

struct tcp_sacktag_state {
	/* Timestamps for earliest and latest never-retransmitted segment
	 * that was SACKed. RTO needs the earliest RTT to stay conservative,
	 * but congestion control should still get an accurate delay signal.
	 */
	u64	first_sackt;
	u64	last_sackt;
	u32	reord;
	u32	sack_delivered;
	int	flag;
	unsigned int mss_now;
	struct rate_sample *rate;
};

/* Take note that the peer is sending D-SACKs. Skip updating the data delivery
 * and spurious retransmission information if this DSACK is unlikely to have
 * been caused by the sender's action:
 * - DSACKed sequence range is larger than maximum receiver's window.
 * - Total no. of DSACKed segments exceed the total no. of retransmitted segs.
 */
static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
			  u32 end_seq, struct tcp_sacktag_state *state)
{
	u32 seq_len, dup_segs = 1;

	if (!before(start_seq, end_seq))
		return 0;

	seq_len = end_seq - start_seq;
	/* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */
	if (seq_len > tp->max_window)
		return 0;
	if (seq_len > tp->mss_cache)
		dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);

	tp->dsack_dups += dup_segs;
	/* Skip the DSACK if dup segs weren't retransmitted by sender */
	if (tp->dsack_dups > tp->total_retrans)
		return 0;

	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
	tp->rack.dsack_seen = 1;

	state->flag |= FLAG_DSACKING_ACK;
	/* A spurious retransmission is delivered */
	state->sack_delivered += dup_segs;

	return dup_segs;
}
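
/* Worked example (illustrative numbers, not taken from the source): a DSACK
 * covering seq_len = 4344 bytes with mss_cache = 1448 counts as
 * DIV_ROUND_UP(4344, 1448) = 3 duplicate segments, all credited to
 * dsack_dups and sack_delivered above.
 */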

/* It's reordering when a higher sequence was delivered (i.e. sacked) before
 * some lower, never-retransmitted sequence ("low_seq"). The maximum reordering
 * distance is approximated in full-MSS packet distance ("reordering").
 */
static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
				      const int ts)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const u32 mss = tp->mss_cache;
	u32 fack, metric;

	fack = tcp_highest_sack_seq(tp);
	if (!before(low_seq, fack))
		return;

	metric = fack - low_seq;
	if ((metric > tp->reordering * mss) && mss) {
#if FASTRETRANS_DEBUG > 1
		pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
			 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
			 tp->reordering,
			 0,
			 tp->sacked_out,
			 tp->undo_marker ? tp->undo_retrans : 0);
#endif
		tp->reordering = min_t(u32, (metric + mss - 1) / mss,
				       sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
	}

	/* This exciting event is worth remembering. 8) */
	tp->reord_seen++;
	NET_INC_STATS(sock_net(sk),
		      ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
}

/* This must be called before lost_out or retrans_out are updated
 * on a new loss, because we want to know if all skbs previously
 * known to be lost have already been retransmitted, indicating
 * that this newly lost skb is our next skb to retransmit.
 */
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
	if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
	    (tp->retransmit_skb_hint &&
	     before(TCP_SKB_CB(skb)->seq,
		    TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
		tp->retransmit_skb_hint = skb;
}

/* Sum the number of packets on the wire we have marked as lost, and
 * notify the congestion control module that the given skb was marked lost.
 */
static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
{
	tp->lost += tcp_skb_pcount(skb);
}

void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
	__u8 sacked = TCP_SKB_CB(skb)->sacked;
	struct tcp_sock *tp = tcp_sk(sk);

	if (sacked & TCPCB_SACKED_ACKED)
		return;

	tcp_verify_retransmit_hint(tp, skb);
	if (sacked & TCPCB_LOST) {
		if (sacked & TCPCB_SACKED_RETRANS) {
			/* Account for retransmits that are lost again */
			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
			tp->retrans_out -= tcp_skb_pcount(skb);
			NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
				      tcp_skb_pcount(skb));
			tcp_notify_skb_loss_event(tp, skb);
		}
	} else {
		tp->lost_out += tcp_skb_pcount(skb);
		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
		tcp_notify_skb_loss_event(tp, skb);
	}
}

/* Updates the delivered and delivered_ce counts */
static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
				bool ece_ack)
{
	tp->delivered += delivered;
	if (ece_ack)
		tp->delivered_ce += delivered;
}

/* This procedure tags the retransmission queue when SACKs arrive.
 *
 * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
 * Packets in queue with these bits set are counted in variables
 * sacked_out, retrans_out and lost_out, correspondingly.
 *
 * Valid combinations are:
 * Tag  InFlight	Description
 * 0	1		- orig segment is in flight.
 * S	0		- nothing flies, orig reached receiver.
 * L	0		- nothing flies, orig lost by net.
 * R	2		- both orig and retransmit are in flight.
 * L|R	1		- orig is lost, retransmit is in flight.
 * S|R	1		- orig reached receiver, retrans is still in flight.
 * (L|S|R is logically valid, it could occur when L|R is sacked,
 *  but it is equivalent to plain S and code short-circuits it to S.
 *  L|S is logically invalid, it would mean -1 packet in flight 8))
 *
 * These 6 states form a finite state machine, controlled by the following
 * events:
 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
 * 3. Loss detection event of two flavors:
 *	A. Scoreboard estimator decided the packet is lost.
 *	   A'. Reno "three dupacks" marks head of queue lost.
 *	B. SACK arrives sacking SND.NXT at the moment, when the
 *	   segment was retransmitted.
 * 4. D-SACK added new rule: D-SACK changes any tag to S.
 *
 * It is pleasant to note that the state diagram turns out to be commutative,
 * so that we are allowed not to be bothered by the order of our actions,
 * when multiple events arrive simultaneously. (see the function below).
 *
 * Reordering detection.
 * --------------------
 * The reordering metric is the maximal distance by which a packet can be
 * displaced in the packet stream. With SACKs we can estimate it:
 *
 * 1. SACK fills old hole and the corresponding segment was not
 *    ever retransmitted -> reordering. Alas, we cannot use it
 *    when the segment was retransmitted.
 * 2. The last flaw is solved with D-SACK. D-SACK arrives
 *    for retransmitted and already SACKed segment -> reordering.
 * Both of these heuristics are not used in Loss state, when we cannot
 * account for retransmits accurately.
 *
 * SACK block validation.
 * ----------------------
 *
 * SACK block range validation checks that the received SACK block fits to
 * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
 * Note that SND.UNA is not included in the range, though it would be valid,
 * because it means that the receiver is rather inconsistent with itself,
 * reporting SACK reneging when it should advance SND.UNA. Such a SACK block
 * is, however, perfectly valid in light of RFC 2018, which explicitly states
 * that "SACK block MUST reflect the newest segment.  Even if the newest
 * segment is going to be discarded ...", not that it looks very clever
 * in the case of the head skb. Due to potential receiver-driven attacks, we
 * choose to avoid immediate execution of a walk in the write queue due to
 * reneging and defer the head skb's loss recovery to the standard loss
 * recovery procedure that will eventually trigger (nothing forbids us doing
 * this).
 *
 * This also implements blockage of start_seq wrap-around. The problem lies
 * in the fact that though start_seq (s) is before end_seq (i.e., not
 * reversed), there's no guarantee that it will be before snd_nxt (n). The
 * problem happens when start_seq resides between the end_seq wrap (e_w) and
 * the snd_nxt wrap (s_w):
 *
 *         <- outs wnd ->                          <- wrapzone ->
 *         u     e      n                         u_w   e_w  s n_w
 *         |     |      |                          |     |   |  |
 * |<------------+------+----- TCP seqno space --------------+---------->|
 * ...-- <2^31 ->|                                           |<--------...
 * ...---- >2^31 ------>|                                    |<--------...
 *
 * The current code wouldn't be vulnerable, but it's still better to discard
 * such crazy SACK blocks. Doing this check for start_seq alone closes the
 * somewhat similar case (end_seq after snd_nxt wrap) as the earlier reversed
 * check, since the snd_nxt wrap -> snd_una region will then become "well
 * defined", i.e., equal to the ideal case (infinite seqno space without
 * wrap-caused issues).
 *
 * With D-SACK the lower bound is extended to cover the sequence space below
 * SND.UNA down to undo_marker, which is the last point of interest. Yet
 * again, a D-SACK block must not go across snd_una (for the same reason as
 * for the normal SACK blocks, explained above). But there all simplicity
 * ends: TCP might receive valid D-SACKs below that. As long as they reside
 * fully below undo_marker they do not affect behavior in any way and can
 * therefore be safely ignored. In rare cases (which are more or less
 * theoretical), the D-SACK will nicely cross that boundary due to skb
 * fragmentation and packet reordering past the skb's retransmission. To
 * consider them correctly, the acceptable range must be extended even more,
 * though the exact amount is rather hard to quantify. However, tp->max_window
 * can be used as an exaggerated estimate.
 */
static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
				   u32 start_seq, u32 end_seq)
{
	/* Too far in future, or reversed (interpretation is ambiguous) */
	if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
		return false;

	/* Nasty start_seq wrap-around check (see comments above) */
	if (!before(start_seq, tp->snd_nxt))
		return false;

	/* In outstanding window? ...This is valid exit for D-SACKs too.
	 * start_seq == snd_una is non-sensical (see comments above)
	 */
	if (after(start_seq, tp->snd_una))
		return true;

	if (!is_dsack || !tp->undo_marker)
		return false;

	/* ...Then it's D-SACK, and must reside below snd_una completely */
	if (after(end_seq, tp->snd_una))
		return false;

	if (!before(start_seq, tp->undo_marker))
		return true;

	/* Too old */
	if (!after(end_seq, tp->undo_marker))
		return false;

	/* Undo_marker boundary crossing (overestimates a lot). Known already:
	 * start_seq < undo_marker and end_seq >= undo_marker.
	 */
	return !before(start_seq, end_seq - tp->max_window);
}
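
/* Wrap-around reading aid (not part of the original source): before(a, b)
 * and after(a, b) compare 32-bit sequence numbers modulo 2^32, i.e.
 * before(a, b) is (s32)(a - b) < 0. So with snd_nxt = 0x10 just after a
 * wrap, a start_seq of 0xfffffff0 is still "before" snd_nxt and can pass
 * the checks above.
 */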

static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
			    struct tcp_sack_block_wire *sp, int num_sacks,
			    u32 prior_snd_una, struct tcp_sacktag_state *state)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
	u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
	u32 dup_segs;

	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
	} else if (num_sacks > 1) {
		u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
		u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);

		if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1))
			return false;
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
	} else {
		return false;
	}

	dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state);
	if (!dup_segs) {	/* Skip dubious DSACK */
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS);
		return false;
	}

	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs);

	/* D-SACK for already forgotten data... Do dumb counting. */
	if (tp->undo_marker && tp->undo_retrans > 0 &&
	    !after(end_seq_0, prior_snd_una) &&
	    after(end_seq_0, tp->undo_marker))
		tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs);

	return true;
}
| 1247 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1248 | /* Check if skb is fully within the SACK block. In the presence of GSO skbs, |
| 1249 | * the incoming SACK may not match exactly, but we can find a smaller |
| 1250 | * MSS-aligned portion of it that does. Therefore we might need to fragment, |
| 1251 | * which may fail and create some hassle (the caller must handle error |
| 1252 | * returns). |
| 1253 | * |
| 1254 | * FIXME: this could be merged to shift decision code |
| 1255 | */ |
| 1256 | static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, |
| 1257 | u32 start_seq, u32 end_seq) |
| 1258 | { |
| 1259 | int err; |
| 1260 | bool in_sack; |
| 1261 | unsigned int pkt_len; |
| 1262 | unsigned int mss; |
| 1263 | |
| 1264 | in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && |
| 1265 | !before(end_seq, TCP_SKB_CB(skb)->end_seq); |
| 1266 | |
| 1267 | if (tcp_skb_pcount(skb) > 1 && !in_sack && |
| 1268 | after(TCP_SKB_CB(skb)->end_seq, start_seq)) { |
| 1269 | mss = tcp_skb_mss(skb); |
| 1270 | in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); |
| 1271 | |
| 1272 | if (!in_sack) { |
| 1273 | pkt_len = start_seq - TCP_SKB_CB(skb)->seq; |
| 1274 | if (pkt_len < mss) |
| 1275 | pkt_len = mss; |
| 1276 | } else { |
| 1277 | pkt_len = end_seq - TCP_SKB_CB(skb)->seq; |
| 1278 | if (pkt_len < mss) |
| 1279 | return -EINVAL; |
| 1280 | } |
| 1281 | |
| 1282 | /* Round if necessary so that SACKs cover only full MSSes |
| 1283 | * and/or the remaining small portion (if present) |
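| | * (e.g. with mss == 1000 and pkt_len == 2500: round down to 2000 |
| | * when in_sack, round up to 3000 otherwise) |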
| 1284 | */ |
| 1285 | if (pkt_len > mss) { |
| 1286 | unsigned int new_len = (pkt_len / mss) * mss; |
| 1287 | if (!in_sack && new_len < pkt_len) |
| 1288 | new_len += mss; |
| 1289 | pkt_len = new_len; |
| 1290 | } |
| 1291 | |
| 1292 | if (pkt_len >= skb->len && !in_sack) |
| 1293 | return 0; |
| 1294 | |
| 1295 | err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, |
| 1296 | pkt_len, mss, GFP_ATOMIC); |
| 1297 | if (err < 0) |
| 1298 | return err; |
| 1299 | } |
| 1300 | |
| 1301 | return in_sack; |
| 1302 | } |
| 1303 | |
| 1304 | /* Mark the given newly-SACKed range as such, adjusting counters and hints. */ |
| 1305 | static u8 tcp_sacktag_one(struct sock *sk, |
| 1306 | struct tcp_sacktag_state *state, u8 sacked, |
| 1307 | u32 start_seq, u32 end_seq, |
| 1308 | int dup_sack, int pcount, |
| 1309 | u64 xmit_time) |
| 1310 | { |
| 1311 | struct tcp_sock *tp = tcp_sk(sk); |
| 1312 | |
| 1313 | /* Account D-SACK for retransmitted packet. */ |
| 1314 | if (dup_sack && (sacked & TCPCB_RETRANS)) { |
| 1315 | if (tp->undo_marker && tp->undo_retrans > 0 && |
| 1316 | after(end_seq, tp->undo_marker)) |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 1317 | tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1318 | if ((sacked & TCPCB_SACKED_ACKED) && |
| 1319 | before(start_seq, state->reord)) |
| 1320 | state->reord = start_seq; |
| 1321 | } |
| 1322 | |
| 1323 | /* Nothing to do; acked frame is about to be dropped (was ACKed). */ |
| 1324 | if (!after(end_seq, tp->snd_una)) |
| 1325 | return sacked; |
| 1326 | |
| 1327 | if (!(sacked & TCPCB_SACKED_ACKED)) { |
| 1328 | tcp_rack_advance(tp, sacked, end_seq, xmit_time); |
| 1329 | |
| 1330 | if (sacked & TCPCB_SACKED_RETRANS) { |
| 1331 | /* If the segment is not tagged as lost, |
| 1332 | * we do not clear RETRANS, believing |
| 1333 | * that retransmission is still in flight. |
| 1334 | */ |
| 1335 | if (sacked & TCPCB_LOST) { |
| 1336 | sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); |
| 1337 | tp->lost_out -= pcount; |
| 1338 | tp->retrans_out -= pcount; |
| 1339 | } |
| 1340 | } else { |
| 1341 | if (!(sacked & TCPCB_RETRANS)) { |
| 1342 | /* New SACK for a frame that was never retransmitted |
| 1343 | * and sat in a hole: this is reordering. |
| 1344 | */ |
| 1345 | if (before(start_seq, |
| 1346 | tcp_highest_sack_seq(tp)) && |
| 1347 | before(start_seq, state->reord)) |
| 1348 | state->reord = start_seq; |
| 1349 | |
| 1350 | if (!after(end_seq, tp->high_seq)) |
| 1351 | state->flag |= FLAG_ORIG_SACK_ACKED; |
| 1352 | if (state->first_sackt == 0) |
| 1353 | state->first_sackt = xmit_time; |
| 1354 | state->last_sackt = xmit_time; |
| 1355 | } |
| 1356 | |
| 1357 | if (sacked & TCPCB_LOST) { |
| 1358 | sacked &= ~TCPCB_LOST; |
| 1359 | tp->lost_out -= pcount; |
| 1360 | } |
| 1361 | } |
| 1362 | |
| 1363 | sacked |= TCPCB_SACKED_ACKED; |
| 1364 | state->flag |= FLAG_DATA_SACKED; |
| 1365 | tp->sacked_out += pcount; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 1366 | /* Out-of-order packets delivered */ |
| 1367 | state->sack_delivered += pcount; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1368 | |
| 1369 | /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ |
| 1370 | if (tp->lost_skb_hint && |
| 1371 | before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) |
| 1372 | tp->lost_cnt_hint += pcount; |
| 1373 | } |
| 1374 | |
| 1375 | /* D-SACK. We can detect redundant retransmissions in S|R and plain R |
| 1376 | * frames and clear the flag. undo_retrans is decreased above; L|R frames |
| 1377 | * are accounted for above as well. |
| 1378 | */ |
| 1379 | if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) { |
| 1380 | sacked &= ~TCPCB_SACKED_RETRANS; |
| 1381 | tp->retrans_out -= pcount; |
| 1382 | } |
| 1383 | |
| 1384 | return sacked; |
| 1385 | } |
| 1386 | |
| 1387 | /* Shift newly-SACKed bytes from this skb to the immediately previous |
| 1388 | * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. |
| 1389 | */ |
| 1390 | static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, |
| 1391 | struct sk_buff *skb, |
| 1392 | struct tcp_sacktag_state *state, |
| 1393 | unsigned int pcount, int shifted, int mss, |
| 1394 | bool dup_sack) |
| 1395 | { |
| 1396 | struct tcp_sock *tp = tcp_sk(sk); |
| 1397 | u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ |
| 1398 | u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ |
| 1399 | |
| 1400 | BUG_ON(!pcount); |
| 1401 | |
| 1402 | /* Adjust counters and hints for the newly sacked sequence |
| 1403 | * range but discard the return value since prev is already |
| 1404 | * marked. We must tag the range first because the seq |
| 1405 | * advancement below implicitly advances |
| 1406 | * tcp_highest_sack_seq() when skb is highest_sack. |
| 1407 | */ |
| 1408 | tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, |
| 1409 | start_seq, end_seq, dup_sack, pcount, |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 1410 | tcp_skb_timestamp_us(skb)); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1411 | tcp_rate_skb_delivered(sk, skb, state->rate); |
| 1412 | |
| 1413 | if (skb == tp->lost_skb_hint) |
| 1414 | tp->lost_cnt_hint += pcount; |
| 1415 | |
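| | /* Move the newly-SACKed range: grow prev forward over the shifted bytes |
| | * and shrink skb from the front by the same amount. |
| | */ |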
| 1416 | TCP_SKB_CB(prev)->end_seq += shifted; |
| 1417 | TCP_SKB_CB(skb)->seq += shifted; |
| 1418 | |
| 1419 | tcp_skb_pcount_add(prev, pcount); |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 1420 | WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1421 | tcp_skb_pcount_add(skb, -pcount); |
| 1422 | |
| 1423 | /* When we're adding to gso_segs == 1, gso_size will be zero. |
| 1424 | * In theory this shouldn't be necessary, but as long as the DSACK |
| 1425 | * code can get at this skb later on, it's safer to keep |
| 1426 | * gso_size set to something. |
| 1427 | */ |
| 1428 | if (!TCP_SKB_CB(prev)->tcp_gso_size) |
| 1429 | TCP_SKB_CB(prev)->tcp_gso_size = mss; |
| 1430 | |
| 1431 | /* CHECKME: To clear or not to clear? Mimics normal skb currently */ |
| 1432 | if (tcp_skb_pcount(skb) <= 1) |
| 1433 | TCP_SKB_CB(skb)->tcp_gso_size = 0; |
| 1434 | |
| 1435 | /* Difference in this won't matter, both ACKed by the same cumul. ACK */ |
| 1436 | TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); |
| 1437 | |
| 1438 | if (skb->len > 0) { |
| 1439 | BUG_ON(!tcp_skb_pcount(skb)); |
| 1440 | NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED); |
| 1441 | return false; |
| 1442 | } |
| 1443 | |
| 1444 | /* Whole SKB was eaten :-) */ |
| 1445 | |
| 1446 | if (skb == tp->retransmit_skb_hint) |
| 1447 | tp->retransmit_skb_hint = prev; |
| 1448 | if (skb == tp->lost_skb_hint) { |
| 1449 | tp->lost_skb_hint = prev; |
| 1450 | tp->lost_cnt_hint -= tcp_skb_pcount(prev); |
| 1451 | } |
| 1452 | |
| 1453 | TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; |
| 1454 | TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor; |
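| | /* A FIN consumes one sequence number, so carry it over into prev's range */ |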
| 1455 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) |
| 1456 | TCP_SKB_CB(prev)->end_seq++; |
| 1457 | |
| 1458 | if (skb == tcp_highest_sack(sk)) |
| 1459 | tcp_advance_highest_sack(sk, skb); |
| 1460 | |
| 1461 | tcp_skb_collapse_tstamp(prev, skb); |
| 1462 | if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) |
| 1463 | TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; |
| 1464 | |
| 1465 | tcp_rtx_queue_unlink_and_free(skb, sk); |
| 1466 | |
| 1467 | NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED); |
| 1468 | |
| 1469 | return true; |
| 1470 | } |
| 1471 | |
| 1472 | /* I wish gso_size had a saner initialization than |
| 1473 | * something-or-zero, which complicates things. |
| 1474 | */ |
| 1475 | static int tcp_skb_seglen(const struct sk_buff *skb) |
| 1476 | { |
| 1477 | return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb); |
| 1478 | } |
| 1479 | |
| 1480 | /* Shifting pages past head area doesn't work */ |
| 1481 | static int skb_can_shift(const struct sk_buff *skb) |
| 1482 | { |
| 1483 | return !skb_headlen(skb) && skb_is_nonlinear(skb); |
| 1484 | } |
| 1485 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 1486 | int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, |
| 1487 | int pcount, int shiftlen) |
| 1488 | { |
| 1489 | /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE) |
| 1490 | * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need |
| 1491 | * to make sure not storing more than 65535 * 8 bytes per skb, |
| 1492 | * even if current MSS is bigger. |
| 1493 | */ |
| 1494 | if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE)) |
| 1495 | return 0; |
| 1496 | if (unlikely(tcp_skb_pcount(to) + pcount > 65535)) |
| 1497 | return 0; |
| 1498 | return skb_shift(to, from, shiftlen); |
| 1499 | } |
| 1500 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1501 | /* Try collapsing SACK blocks spanning across multiple skbs to a single |
| 1502 | * skb. |
| 1503 | */ |
| 1504 | static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, |
| 1505 | struct tcp_sacktag_state *state, |
| 1506 | u32 start_seq, u32 end_seq, |
| 1507 | bool dup_sack) |
| 1508 | { |
| 1509 | struct tcp_sock *tp = tcp_sk(sk); |
| 1510 | struct sk_buff *prev; |
| 1511 | int mss; |
| 1512 | int pcount = 0; |
| 1513 | int len; |
| 1514 | int in_sack; |
| 1515 | |
| 1516 | /* Normally R (retrans) without L (lost) won't result in plain S (sacked) */ |
| 1517 | if (!dup_sack && |
| 1518 | (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS) |
| 1519 | goto fallback; |
| 1520 | if (!skb_can_shift(skb)) |
| 1521 | goto fallback; |
| 1522 | /* This frame is about to be dropped (was ACKed). */ |
| 1523 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) |
| 1524 | goto fallback; |
| 1525 | |
| 1526 | /* Can only happen with delayed DSACK + discard craziness */ |
| 1527 | prev = skb_rb_prev(skb); |
| 1528 | if (!prev) |
| 1529 | goto fallback; |
| 1530 | |
| 1531 | if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) |
| 1532 | goto fallback; |
| 1533 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 1534 | if (!tcp_skb_can_collapse(prev, skb)) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1535 | goto fallback; |
| 1536 | |
| 1537 | in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && |
| 1538 | !before(end_seq, TCP_SKB_CB(skb)->end_seq); |
| 1539 | |
| 1540 | if (in_sack) { |
| 1541 | len = skb->len; |
| 1542 | pcount = tcp_skb_pcount(skb); |
| 1543 | mss = tcp_skb_seglen(skb); |
| 1544 | |
| 1545 | /* TODO: Fix DSACKs to not fragment already SACKed and we can |
| 1546 | * drop this restriction as unnecessary |
| 1547 | */ |
| 1548 | if (mss != tcp_skb_seglen(prev)) |
| 1549 | goto fallback; |
| 1550 | } else { |
| 1551 | if (!after(TCP_SKB_CB(skb)->end_seq, start_seq)) |
| 1552 | goto noop; |
| 1553 | /* CHECKME: Is this the non-MSS split case only? This will |
| 1554 | * cause skipped skbs due to the advancing loop, btw; the |
| 1555 | * original has that feature too. |
| 1556 | */ |
| 1557 | if (tcp_skb_pcount(skb) <= 1) |
| 1558 | goto noop; |
| 1559 | |
| 1560 | in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); |
| 1561 | if (!in_sack) { |
| 1562 | /* TODO: head merge to next could be attempted here |
| 1563 | * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)), |
| 1564 | * though it might not be worth the additional hassle. |
| 1565 | * |
| 1566 | * ...we can probably just fall back to what was done |
| 1567 | * previously. We could try merging non-SACKed ones |
| 1568 | * as well, but it probably isn't going to pay off |
| 1569 | * because later SACKs might again split them, and |
| 1570 | * it would make skb timestamp tracking a considerably |
| 1571 | * harder problem. |
| 1572 | */ |
| 1573 | goto fallback; |
| 1574 | } |
| 1575 | |
| 1576 | len = end_seq - TCP_SKB_CB(skb)->seq; |
| 1577 | BUG_ON(len < 0); |
| 1578 | BUG_ON(len > skb->len); |
| 1579 | |
| 1580 | /* MSS boundaries should be honoured, or else pcount will |
| 1581 | * severely break, even though honouring them makes things a bit |
| 1582 | * trickier. Optimize the common case to avoid most of the divides. |
| 1583 | */ |
| 1584 | mss = tcp_skb_mss(skb); |
| 1585 | |
| 1586 | /* TODO: Fix DSACKs to not fragment already SACKed and we can |
| 1587 | * drop this restriction as unnecessary |
| 1588 | */ |
| 1589 | if (mss != tcp_skb_seglen(prev)) |
| 1590 | goto fallback; |
| 1591 | |
| 1592 | if (len == mss) { |
| 1593 | pcount = 1; |
| 1594 | } else if (len < mss) { |
| 1595 | goto noop; |
| 1596 | } else { |
| 1597 | pcount = len / mss; |
| 1598 | len = pcount * mss; |
| 1599 | } |
| 1600 | } |
| 1601 | |
| 1602 | /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */ |
| 1603 | if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una)) |
| 1604 | goto fallback; |
| 1605 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 1606 | if (!tcp_skb_shift(prev, skb, pcount, len)) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1607 | goto fallback; |
| 1608 | if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack)) |
| 1609 | goto out; |
| 1610 | |
| 1611 | /* Filling the hole allows collapsing with the next skb as well; this is |
| 1612 | * very useful when a hole-on-every-nth-skb pattern happens. |
| 1613 | */ |
| 1614 | skb = skb_rb_next(prev); |
| 1615 | if (!skb) |
| 1616 | goto out; |
| 1617 | |
| 1618 | if (!skb_can_shift(skb) || |
| 1619 | ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || |
| 1620 | (mss != tcp_skb_seglen(skb))) |
| 1621 | goto out; |
| 1622 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 1623 | if (!tcp_skb_can_collapse(prev, skb)) |
| 1624 | goto out; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1625 | len = skb->len; |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 1626 | pcount = tcp_skb_pcount(skb); |
| 1627 | if (tcp_skb_shift(prev, skb, pcount, len)) |
| 1628 | tcp_shifted_skb(sk, prev, skb, state, pcount, |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1629 | len, mss, 0); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1630 | |
| 1631 | out: |
| 1632 | return prev; |
| 1633 | |
| 1634 | noop: |
| 1635 | return skb; |
| 1636 | |
| 1637 | fallback: |
| 1638 | NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK); |
| 1639 | return NULL; |
| 1640 | } |
| 1641 | |
| 1642 | static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, |
| 1643 | struct tcp_sack_block *next_dup, |
| 1644 | struct tcp_sacktag_state *state, |
| 1645 | u32 start_seq, u32 end_seq, |
| 1646 | bool dup_sack_in) |
| 1647 | { |
| 1648 | struct tcp_sock *tp = tcp_sk(sk); |
| 1649 | struct sk_buff *tmp; |
| 1650 | |
| 1651 | skb_rbtree_walk_from(skb) { |
| 1652 | int in_sack = 0; |
| 1653 | bool dup_sack = dup_sack_in; |
| 1654 | |
| 1655 | /* queue is in-order => we can short-circuit the walk early */ |
| 1656 | if (!before(TCP_SKB_CB(skb)->seq, end_seq)) |
| 1657 | break; |
| 1658 | |
| 1659 | if (next_dup && |
| 1660 | before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) { |
| 1661 | in_sack = tcp_match_skb_to_sack(sk, skb, |
| 1662 | next_dup->start_seq, |
| 1663 | next_dup->end_seq); |
| 1664 | if (in_sack > 0) |
| 1665 | dup_sack = true; |
| 1666 | } |
| 1667 | |
| 1668 | /* skb reference here is a bit tricky to get right, since |
| 1669 | * shifting can eat and free both this skb and the next, |
| 1670 | * so not even the _safe variant of the loop is enough. |
| 1671 | */ |
| 1672 | if (in_sack <= 0) { |
| 1673 | tmp = tcp_shift_skb_data(sk, skb, state, |
| 1674 | start_seq, end_seq, dup_sack); |
| 1675 | if (tmp) { |
| 1676 | if (tmp != skb) { |
| 1677 | skb = tmp; |
| 1678 | continue; |
| 1679 | } |
| 1680 | |
| 1681 | in_sack = 0; |
| 1682 | } else { |
| 1683 | in_sack = tcp_match_skb_to_sack(sk, skb, |
| 1684 | start_seq, |
| 1685 | end_seq); |
| 1686 | } |
| 1687 | } |
| 1688 | |
| 1689 | if (unlikely(in_sack < 0)) |
| 1690 | break; |
| 1691 | |
| 1692 | if (in_sack) { |
| 1693 | TCP_SKB_CB(skb)->sacked = |
| 1694 | tcp_sacktag_one(sk, |
| 1695 | state, |
| 1696 | TCP_SKB_CB(skb)->sacked, |
| 1697 | TCP_SKB_CB(skb)->seq, |
| 1698 | TCP_SKB_CB(skb)->end_seq, |
| 1699 | dup_sack, |
| 1700 | tcp_skb_pcount(skb), |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 1701 | tcp_skb_timestamp_us(skb)); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1702 | tcp_rate_skb_delivered(sk, skb, state->rate); |
| 1703 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
| 1704 | list_del_init(&skb->tcp_tsorted_anchor); |
| 1705 | |
| 1706 | if (!before(TCP_SKB_CB(skb)->seq, |
| 1707 | tcp_highest_sack_seq(tp))) |
| 1708 | tcp_advance_highest_sack(sk, skb); |
| 1709 | } |
| 1710 | } |
| 1711 | return skb; |
| 1712 | } |
| 1713 | |
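| | /* Binary-search the rtx rbtree for the skb whose sequence range covers |
| | * @seq; returns NULL if no such skb exists. |
| | */ |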
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 1714 | static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1715 | { |
| 1716 | struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; |
| 1717 | struct sk_buff *skb; |
| 1718 | |
| 1719 | while (*p) { |
| 1720 | parent = *p; |
| 1721 | skb = rb_to_skb(parent); |
| 1722 | if (before(seq, TCP_SKB_CB(skb)->seq)) { |
| 1723 | p = &parent->rb_left; |
| 1724 | continue; |
| 1725 | } |
| 1726 | if (!before(seq, TCP_SKB_CB(skb)->end_seq)) { |
| 1727 | p = &parent->rb_right; |
| 1728 | continue; |
| 1729 | } |
| 1730 | return skb; |
| 1731 | } |
| 1732 | return NULL; |
| 1733 | } |
| 1734 | |
| 1735 | static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1736 | u32 skip_to_seq) |
| 1737 | { |
| 1738 | if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq)) |
| 1739 | return skb; |
| 1740 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 1741 | return tcp_sacktag_bsearch(sk, skip_to_seq); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1742 | } |
| 1743 | |
| 1744 | static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, |
| 1745 | struct sock *sk, |
| 1746 | struct tcp_sack_block *next_dup, |
| 1747 | struct tcp_sacktag_state *state, |
| 1748 | u32 skip_to_seq) |
| 1749 | { |
| 1750 | if (!next_dup) |
| 1751 | return skb; |
| 1752 | |
| 1753 | if (before(next_dup->start_seq, skip_to_seq)) { |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 1754 | skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1755 | skb = tcp_sacktag_walk(skb, sk, NULL, state, |
| 1756 | next_dup->start_seq, next_dup->end_seq, |
| 1757 | 1); |
| 1758 | } |
| 1759 | |
| 1760 | return skb; |
| 1761 | } |
| 1762 | |
| 1763 | static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache) |
| 1764 | { |
| 1765 | return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); |
| 1766 | } |
| 1767 | |
| 1768 | static int |
| 1769 | tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, |
| 1770 | u32 prior_snd_una, struct tcp_sacktag_state *state) |
| 1771 | { |
| 1772 | struct tcp_sock *tp = tcp_sk(sk); |
| 1773 | const unsigned char *ptr = (skb_transport_header(ack_skb) + |
| 1774 | TCP_SKB_CB(ack_skb)->sacked); |
| 1775 | struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); |
| 1776 | struct tcp_sack_block sp[TCP_NUM_SACKS]; |
| 1777 | struct tcp_sack_block *cache; |
| 1778 | struct sk_buff *skb; |
| 1779 | int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); |
| 1780 | int used_sacks; |
| 1781 | bool found_dup_sack = false; |
| 1782 | int i, j; |
| 1783 | int first_sack_index; |
| 1784 | |
| 1785 | state->flag = 0; |
| 1786 | state->reord = tp->snd_nxt; |
| 1787 | |
| 1788 | if (!tp->sacked_out) |
| 1789 | tcp_highest_sack_reset(sk); |
| 1790 | |
| 1791 | found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 1792 | num_sacks, prior_snd_una, state); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1793 | |
| 1794 | /* Eliminate too-old ACKs, but take into |
| 1795 | * account reasonably fresh ones; they can |
| 1796 | * contain valid SACK info. |
| 1797 | */ |
| 1798 | if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window)) |
| 1799 | return 0; |
| 1800 | |
| 1801 | if (!tp->packets_out) |
| 1802 | goto out; |
| 1803 | |
| 1804 | used_sacks = 0; |
| 1805 | first_sack_index = 0; |
| 1806 | for (i = 0; i < num_sacks; i++) { |
| 1807 | bool dup_sack = !i && found_dup_sack; |
| 1808 | |
| 1809 | sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq); |
| 1810 | sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq); |
| 1811 | |
| 1812 | if (!tcp_is_sackblock_valid(tp, dup_sack, |
| 1813 | sp[used_sacks].start_seq, |
| 1814 | sp[used_sacks].end_seq)) { |
| 1815 | int mib_idx; |
| 1816 | |
| 1817 | if (dup_sack) { |
| 1818 | if (!tp->undo_marker) |
| 1819 | mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO; |
| 1820 | else |
| 1821 | mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD; |
| 1822 | } else { |
| 1823 | /* Don't count olds caused by ACK reordering */ |
| 1824 | if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) && |
| 1825 | !after(sp[used_sacks].end_seq, tp->snd_una)) |
| 1826 | continue; |
| 1827 | mib_idx = LINUX_MIB_TCPSACKDISCARD; |
| 1828 | } |
| 1829 | |
| 1830 | NET_INC_STATS(sock_net(sk), mib_idx); |
| 1831 | if (i == 0) |
| 1832 | first_sack_index = -1; |
| 1833 | continue; |
| 1834 | } |
| 1835 | |
| 1836 | /* Ignore very old stuff early */ |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 1837 | if (!after(sp[used_sacks].end_seq, prior_snd_una)) { |
| 1838 | if (i == 0) |
| 1839 | first_sack_index = -1; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1840 | continue; |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 1841 | } |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1842 | |
| 1843 | used_sacks++; |
| 1844 | } |
| 1845 | |
| 1846 | /* order SACK blocks to allow in order walk of the retrans queue */ |
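| | /* (a simple in-place bubble sort; there are at most TCP_NUM_SACKS == 4 |
| | * blocks, so the quadratic cost is negligible) |
| | */ |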
| 1847 | for (i = used_sacks - 1; i > 0; i--) { |
| 1848 | for (j = 0; j < i; j++) { |
| 1849 | if (after(sp[j].start_seq, sp[j + 1].start_seq)) { |
| 1850 | swap(sp[j], sp[j + 1]); |
| 1851 | |
| 1852 | /* Track where the first SACK block goes to */ |
| 1853 | if (j == first_sack_index) |
| 1854 | first_sack_index = j + 1; |
| 1855 | } |
| 1856 | } |
| 1857 | } |
| 1858 | |
| 1859 | state->mss_now = tcp_current_mss(sk); |
| 1860 | skb = NULL; |
| 1861 | i = 0; |
| 1862 | |
| 1863 | if (!tp->sacked_out) { |
| 1864 | /* It's already past, so skip checking against it */ |
| 1865 | cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); |
| 1866 | } else { |
| 1867 | cache = tp->recv_sack_cache; |
| 1868 | /* Skip empty blocks at the head of the cache */ |
| 1869 | while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq && |
| 1870 | !cache->end_seq) |
| 1871 | cache++; |
| 1872 | } |
| 1873 | |
| 1874 | while (i < used_sacks) { |
| 1875 | u32 start_seq = sp[i].start_seq; |
| 1876 | u32 end_seq = sp[i].end_seq; |
| 1877 | bool dup_sack = (found_dup_sack && (i == first_sack_index)); |
| 1878 | struct tcp_sack_block *next_dup = NULL; |
| 1879 | |
| 1880 | if (found_dup_sack && ((i + 1) == first_sack_index)) |
| 1881 | next_dup = &sp[i + 1]; |
| 1882 | |
| 1883 | /* Skip too early cached blocks */ |
| 1884 | while (tcp_sack_cache_ok(tp, cache) && |
| 1885 | !before(start_seq, cache->end_seq)) |
| 1886 | cache++; |
| 1887 | |
| 1888 | /* Can we skip some work by looking at recv_sack_cache? */ |
| 1889 | if (tcp_sack_cache_ok(tp, cache) && !dup_sack && |
| 1890 | after(end_seq, cache->start_seq)) { |
| 1891 | |
| 1892 | /* Head todo? */ |
| 1893 | if (before(start_seq, cache->start_seq)) { |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 1894 | skb = tcp_sacktag_skip(skb, sk, start_seq); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1895 | skb = tcp_sacktag_walk(skb, sk, next_dup, |
| 1896 | state, |
| 1897 | start_seq, |
| 1898 | cache->start_seq, |
| 1899 | dup_sack); |
| 1900 | } |
| 1901 | |
| 1902 | /* Rest of the block already fully processed? */ |
| 1903 | if (!after(end_seq, cache->end_seq)) |
| 1904 | goto advance_sp; |
| 1905 | |
| 1906 | skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, |
| 1907 | state, |
| 1908 | cache->end_seq); |
| 1909 | |
| 1910 | /* ...tail remains todo... */ |
| 1911 | if (tcp_highest_sack_seq(tp) == cache->end_seq) { |
| 1912 | /* ...but better entrypoint exists! */ |
| 1913 | skb = tcp_highest_sack(sk); |
| 1914 | if (!skb) |
| 1915 | break; |
| 1916 | cache++; |
| 1917 | goto walk; |
| 1918 | } |
| 1919 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 1920 | skb = tcp_sacktag_skip(skb, sk, cache->end_seq); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1921 | /* Check overlap against next cached too (past this one already) */ |
| 1922 | cache++; |
| 1923 | continue; |
| 1924 | } |
| 1925 | |
| 1926 | if (!before(start_seq, tcp_highest_sack_seq(tp))) { |
| 1927 | skb = tcp_highest_sack(sk); |
| 1928 | if (!skb) |
| 1929 | break; |
| 1930 | } |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 1931 | skb = tcp_sacktag_skip(skb, sk, start_seq); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1932 | |
| 1933 | walk: |
| 1934 | skb = tcp_sacktag_walk(skb, sk, next_dup, state, |
| 1935 | start_seq, end_seq, dup_sack); |
| 1936 | |
| 1937 | advance_sp: |
| 1938 | i++; |
| 1939 | } |
| 1940 | |
| 1941 | /* Clear the head of the cache sack blocks so we can skip it next time */ |
| 1942 | for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) { |
| 1943 | tp->recv_sack_cache[i].start_seq = 0; |
| 1944 | tp->recv_sack_cache[i].end_seq = 0; |
| 1945 | } |
| 1946 | for (j = 0; j < used_sacks; j++) |
| 1947 | tp->recv_sack_cache[i++] = sp[j]; |
| 1948 | |
| 1949 | if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker) |
| 1950 | tcp_check_sack_reordering(sk, state->reord, 0); |
| 1951 | |
| 1952 | tcp_verify_left_out(tp); |
| 1953 | out: |
| 1954 | |
| 1955 | #if FASTRETRANS_DEBUG > 0 |
| 1956 | WARN_ON((int)tp->sacked_out < 0); |
| 1957 | WARN_ON((int)tp->lost_out < 0); |
| 1958 | WARN_ON((int)tp->retrans_out < 0); |
| 1959 | WARN_ON((int)tcp_packets_in_flight(tp) < 0); |
| 1960 | #endif |
| 1961 | return state->flag; |
| 1962 | } |
| 1963 | |
| 1964 | /* Limits sacked_out so that its sum with lost_out isn't ever larger than |
| 1965 | * packets_out. Returns false if sacked_out adjustment wasn't necessary. |
| 1966 | */ |
| 1967 | static bool tcp_limit_reno_sacked(struct tcp_sock *tp) |
| 1968 | { |
| 1969 | u32 holes; |
| 1970 | |
| 1971 | holes = max(tp->lost_out, 1U); |
| 1972 | holes = min(holes, tp->packets_out); |
| 1973 | |
| 1974 | if ((tp->sacked_out + holes) > tp->packets_out) { |
| 1975 | tp->sacked_out = tp->packets_out - holes; |
| 1976 | return true; |
| 1977 | } |
| 1978 | return false; |
| 1979 | } |
| 1980 | |
| 1981 | /* If we receive more dupacks than expected when counting segments |
| 1982 | * under the assumption of no reordering, interpret this as reordering. |
| 1983 | * The only other possible cause is a bug in the receiver's TCP. |
| 1984 | */ |
| 1985 | static void tcp_check_reno_reordering(struct sock *sk, const int addend) |
| 1986 | { |
| 1987 | struct tcp_sock *tp = tcp_sk(sk); |
| 1988 | |
| 1989 | if (!tcp_limit_reno_sacked(tp)) |
| 1990 | return; |
| 1991 | |
| 1992 | tp->reordering = min_t(u32, tp->packets_out + addend, |
| 1993 | sock_net(sk)->ipv4.sysctl_tcp_max_reordering); |
| 1994 | tp->reord_seen++; |
| 1995 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); |
| 1996 | } |
| 1997 | |
| 1998 | /* Emulate SACKs for SACKless connection: account for a new dupack. */ |
| 1999 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2000 | static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2001 | { |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2002 | if (num_dupack) { |
| 2003 | struct tcp_sock *tp = tcp_sk(sk); |
| 2004 | u32 prior_sacked = tp->sacked_out; |
| 2005 | s32 delivered; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2006 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2007 | tp->sacked_out += num_dupack; |
| 2008 | tcp_check_reno_reordering(sk, 0); |
| 2009 | delivered = tp->sacked_out - prior_sacked; |
| 2010 | if (delivered > 0) |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2011 | tcp_count_delivered(tp, delivered, ece_ack); |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2012 | tcp_verify_left_out(tp); |
| 2013 | } |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2014 | } |
| 2015 | |
| 2016 | /* Account for ACK, ACKing some data in Reno Recovery phase. */ |
| 2017 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2018 | static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2019 | { |
| 2020 | struct tcp_sock *tp = tcp_sk(sk); |
| 2021 | |
| 2022 | if (acked > 0) { |
| 2023 | /* One ACK acked hole. The rest eat duplicate ACKs. */ |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2024 | tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1), |
| 2025 | ece_ack); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2026 | if (acked - 1 >= tp->sacked_out) |
| 2027 | tp->sacked_out = 0; |
| 2028 | else |
| 2029 | tp->sacked_out -= acked - 1; |
| 2030 | } |
| 2031 | tcp_check_reno_reordering(sk, acked); |
| 2032 | tcp_verify_left_out(tp); |
| 2033 | } |
| 2034 | |
| 2035 | static inline void tcp_reset_reno_sack(struct tcp_sock *tp) |
| 2036 | { |
| 2037 | tp->sacked_out = 0; |
| 2038 | } |
| 2039 | |
| 2040 | void tcp_clear_retrans(struct tcp_sock *tp) |
| 2041 | { |
| 2042 | tp->retrans_out = 0; |
| 2043 | tp->lost_out = 0; |
| 2044 | tp->undo_marker = 0; |
| 2045 | tp->undo_retrans = -1; |
| 2046 | tp->sacked_out = 0; |
| 2047 | } |
| 2048 | |
| 2049 | static inline void tcp_init_undo(struct tcp_sock *tp) |
| 2050 | { |
| 2051 | tp->undo_marker = tp->snd_una; |
| 2052 | /* Retransmission still in flight may cause DSACKs later. */ |
| 2053 | tp->undo_retrans = tp->retrans_out ? : -1; |
| 2054 | } |
| 2055 | |
| 2056 | static bool tcp_is_rack(const struct sock *sk) |
| 2057 | { |
| 2058 | return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION; |
| 2059 | } |
| 2060 | |
| 2061 | /* If we detect SACK reneging, forget all SACK information |
| 2062 | * and reset tags completely; otherwise preserve SACKs. If the receiver |
| 2063 | * dropped its ofo queue, we will learn this via reneging detection. |
| 2064 | */ |
| 2065 | static void tcp_timeout_mark_lost(struct sock *sk) |
| 2066 | { |
| 2067 | struct tcp_sock *tp = tcp_sk(sk); |
| 2068 | struct sk_buff *skb, *head; |
| 2069 | bool is_reneg; /* is receiver reneging on SACKs? */ |
| 2070 | |
| 2071 | head = tcp_rtx_queue_head(sk); |
| 2072 | is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED); |
| 2073 | if (is_reneg) { |
| 2074 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); |
| 2075 | tp->sacked_out = 0; |
| 2076 | /* Mark SACK reneging until we recover from this loss event. */ |
| 2077 | tp->is_sack_reneg = 1; |
| 2078 | } else if (tcp_is_reno(tp)) { |
| 2079 | tcp_reset_reno_sack(tp); |
| 2080 | } |
| 2081 | |
| 2082 | skb = head; |
| 2083 | skb_rbtree_walk_from(skb) { |
| 2084 | if (is_reneg) |
| 2085 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; |
| 2086 | else if (tcp_is_rack(sk) && skb != head && |
| 2087 | tcp_rack_skb_timeout(tp, skb, 0) > 0) |
| 2088 | continue; /* Don't mark recently sent ones lost yet */ |
| 2089 | tcp_mark_skb_lost(sk, skb); |
| 2090 | } |
| 2091 | tcp_verify_left_out(tp); |
| 2092 | tcp_clear_all_retrans_hints(tp); |
| 2093 | } |
| 2094 | |
| 2095 | /* Enter Loss state. */ |
| 2096 | void tcp_enter_loss(struct sock *sk) |
| 2097 | { |
| 2098 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 2099 | struct tcp_sock *tp = tcp_sk(sk); |
| 2100 | struct net *net = sock_net(sk); |
| 2101 | bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; |
| 2102 | |
| 2103 | tcp_timeout_mark_lost(sk); |
| 2104 | |
| 2105 | /* Reduce ssthresh if it has not yet been made inside this window. */ |
| 2106 | if (icsk->icsk_ca_state <= TCP_CA_Disorder || |
| 2107 | !after(tp->high_seq, tp->snd_una) || |
| 2108 | (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { |
| 2109 | tp->prior_ssthresh = tcp_current_ssthresh(sk); |
| 2110 | tp->prior_cwnd = tp->snd_cwnd; |
| 2111 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); |
| 2112 | tcp_ca_event(sk, CA_EVENT_LOSS); |
| 2113 | tcp_init_undo(tp); |
| 2114 | } |
| 2115 | tp->snd_cwnd = tcp_packets_in_flight(tp) + 1; |
| 2116 | tp->snd_cwnd_cnt = 0; |
| 2117 | tp->snd_cwnd_stamp = tcp_jiffies32; |
| 2118 | |
| 2119 | /* Timeout in disordered state after receiving substantial DUPACKs |
| 2120 | * suggests that the degree of reordering is over-estimated. |
| 2121 | */ |
| 2122 | if (icsk->icsk_ca_state <= TCP_CA_Disorder && |
| 2123 | tp->sacked_out >= net->ipv4.sysctl_tcp_reordering) |
| 2124 | tp->reordering = min_t(unsigned int, tp->reordering, |
| 2125 | net->ipv4.sysctl_tcp_reordering); |
| 2126 | tcp_set_ca_state(sk, TCP_CA_Loss); |
| 2127 | tp->high_seq = tp->snd_nxt; |
| 2128 | tcp_ecn_queue_cwr(tp); |
| 2129 | |
| 2130 | /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous |
| 2131 | * loss recovery is underway except recurring timeout(s) on |
| 2132 | * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing |
| 2133 | */ |
| 2134 | tp->frto = net->ipv4.sysctl_tcp_frto && |
| 2135 | (new_recovery || icsk->icsk_retransmits) && |
| 2136 | !inet_csk(sk)->icsk_mtup.probe_size; |
| 2137 | } |
| 2138 | |
| 2139 | /* If ACK arrived pointing to a remembered SACK, it means that our |
| 2140 | * remembered SACKs do not reflect the real state of the receiver, i.e. |
| 2141 | * the receiver _host_ is heavily congested (or buggy). |
| 2142 | * |
| 2143 | * To avoid big spurious retransmission bursts due to transient SACK |
| 2144 | * scoreboard oddities that look like reneging, we give the receiver a |
| 2145 | * little time (max(RTT/2, 10ms)) to send us some more ACKs that will |
| 2146 | * restore sanity to the SACK scoreboard. If the apparent reneging |
| 2147 | * persists until this RTO then we'll clear the SACK scoreboard. |
| 2148 | */ |
| 2149 | static bool tcp_check_sack_reneging(struct sock *sk, int flag) |
| 2150 | { |
| 2151 | if (flag & FLAG_SACK_RENEGING) { |
| 2152 | struct tcp_sock *tp = tcp_sk(sk); |
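| | /* tp->srtt_us stores the smoothed RTT << 3, so >> 4 yields RTT/2 */ |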
| 2153 | unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), |
| 2154 | msecs_to_jiffies(10)); |
| 2155 | |
| 2156 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
| 2157 | delay, TCP_RTO_MAX); |
| 2158 | return true; |
| 2159 | } |
| 2160 | return false; |
| 2161 | } |
| 2162 | |
| 2163 | /* Heuristics to calculate the number of duplicate ACKs. There's no dupACK |
| 2164 | * counter when SACK is enabled (without SACK, sacked_out is used for |
| 2165 | * that purpose). |
| 2166 | * |
| 2167 | * With reordering, holes may still be in flight, so RFC3517 recovery |
| 2168 | * uses pure sacked_out (total number of SACKed segments) even though |
| 2169 | * it violates the RFC, which uses duplicate ACKs. Often these are equal, |
| 2170 | * but when e.g. out-of-window ACKs or packet duplication occur, |
| 2171 | * they differ. Since neither occurs due to loss, TCP should really |
| 2172 | * ignore them. |
| 2173 | */ |
| 2174 | static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) |
| 2175 | { |
| 2176 | return tp->sacked_out + 1; |
| 2177 | } |
| 2178 | |
| 2179 | /* Linux NewReno/SACK/ECN state machine. |
| 2180 | * -------------------------------------- |
| 2181 | * |
| 2182 | * "Open" Normal state, no dubious events, fast path. |
| 2183 | * "Disorder" In all the respects it is "Open", |
| 2184 | * but requires a bit more attention. It is entered when |
| 2185 | * we see some SACKs or dupacks. It is split off from "Open" |
| 2186 | * mainly to move some processing from the fast path to the slow one. |
| 2187 | * "CWR" CWND was reduced due to some Congestion Notification event. |
| 2188 | * It can be ECN, ICMP source quench, local device congestion. |
| 2189 | * "Recovery" CWND was reduced, we are fast-retransmitting. |
| 2190 | * "Loss" CWND was reduced due to RTO timeout or SACK reneging. |
| 2191 | * |
| 2192 | * tcp_fastretrans_alert() is entered: |
| 2193 | * - each incoming ACK, if state is not "Open" |
| 2194 | * - when arrived ACK is unusual, namely: |
| 2195 | * * SACK |
| 2196 | * * Duplicate ACK. |
| 2197 | * * ECN ECE. |
| 2198 | * |
| 2199 | * Counting packets in flight is pretty simple. |
| 2200 | * |
| 2201 | * in_flight = packets_out - left_out + retrans_out |
| 2202 | * |
| 2203 | * packets_out is SND.NXT-SND.UNA counted in packets. |
| 2204 | * |
| 2205 | * retrans_out is number of retransmitted segments. |
| 2206 | * |
| 2207 | * left_out is the number of segments that left the network but are not ACKed yet. |
| 2208 | * |
| 2209 | * left_out = sacked_out + lost_out |
| 2210 | * |
| 2211 | * sacked_out: Packets which arrived at the receiver out of order |
| 2212 | * and hence were not cumulatively ACKed. With SACKs this number is simply |
| 2213 | * the amount of SACKed data. Even without SACKs |
| 2214 | * it is easy to give a pretty reliable estimate of this number |
| 2215 | * by counting duplicate ACKs. |
| 2216 | * |
| 2217 | * lost_out: Packets lost by the network. TCP has no explicit |
| 2218 | * "loss notification" feedback from the network (for now). |
| 2219 | * It means that this number can be only _guessed_. |
| 2220 | * Actually, it is the heuristic used to predict loss that |
| 2221 | * distinguishes the different algorithms. |
| 2222 | * |
| 2223 | * F.e. after an RTO, when the whole queue is considered lost, |
| 2224 | * lost_out = packets_out and in_flight = retrans_out. |
| 2225 | * |
| 2226 | * Essentially, we now have a few algorithms for detecting |
| 2227 | * lost packets. |
| 2228 | * |
| 2229 | * If the receiver supports SACK: |
| 2230 | * |
| 2231 | * RFC6675/3517: It is the conventional algorithm. A packet is |
| 2232 | * considered lost if the number of higher sequence packets |
| 2233 | * SACKed is greater than or equal to the DUPACK threshold |
| 2234 | * (reordering). This is implemented in tcp_mark_head_lost and |
| 2235 | * tcp_update_scoreboard. |
| 2236 | * |
| 2237 | * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm |
| 2238 | * (2017-) that checks timing instead of counting DUPACKs. |
| 2239 | * Essentially a packet is considered lost if it's not S/ACKed |
| 2240 | * after RTT + reordering_window, where both metrics are |
| 2241 | * dynamically measured and adjusted. This is implemented in |
| 2242 | * tcp_rack_mark_lost. |
| 2243 | * |
| 2244 | * If the receiver does not support SACK: |
| 2245 | * |
| 2246 | * NewReno (RFC6582): in Recovery we assume that one segment |
| 2247 | * is lost (classic Reno). While we are in Recovery and |
| 2248 | * a partial ACK arrives, we assume that one more packet |
| 2249 | * is lost (NewReno). These heuristics are the same in NewReno |
| 2250 | * and SACK. |
| 2251 | * |
| 2252 | * The really tricky part of the algorithm (and the one requiring careful |
| 2253 | * tuning) is hidden in tcp_time_to_recover() and tcp_xmit_retransmit_queue(). |
| 2254 | * The first determines the moment _when_ we should reduce CWND and, |
| 2255 | * hence, slow down forward transmission. In fact, it determines the moment |
| 2256 | * when we decide that a hole is caused by loss rather than by reordering. |
| 2257 | * |
| 2258 | * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill |
| 2259 | * holes, caused by lost packets. |
| 2260 | * |
| 2261 | * And the most logically complicated part of the algorithm is the undo |
| 2262 | * heuristics. We detect false retransmits due to both too-early |
| 2263 | * fast retransmit (reordering) and underestimated RTO, analyzing |
| 2264 | * timestamps and D-SACKs. When we detect that some segments were |
| 2265 | * retransmitted by mistake and CWND reduction was wrong, we undo |
| 2266 | * window reduction and abort recovery phase. This logic is hidden |
| 2267 | * inside several functions named tcp_try_undo_<something>. |
| 2268 | */ |
| 2269 | |
| 2270 | /* This function decides when we should leave the Disorder state |
| 2271 | * and enter Recovery phase, reducing congestion window. |
| 2272 | * |
| 2273 | * Main question: may we further continue forward transmission |
| 2274 | * with the same cwnd? |
| 2275 | */ |
| 2276 | static bool tcp_time_to_recover(struct sock *sk, int flag) |
| 2277 | { |
| 2278 | struct tcp_sock *tp = tcp_sk(sk); |
| 2279 | |
| 2280 | /* Trick#1: The loss is proven. */ |
| 2281 | if (tp->lost_out) |
| 2282 | return true; |
| 2283 | |
| 2284 | /* Not-A-Trick#2 : Classic rule... */ |
| 2285 | if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering) |
| 2286 | return true; |
| 2287 | |
| 2288 | return false; |
| 2289 | } |
| 2290 | |
| 2291 | /* Detect loss in event "A" above by marking head of queue up as lost. |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2292 | * For RFC3517 SACK, a segment is considered lost if it |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2293 | * has at least tp->reordering SACKed segments above it; "packets" refers to |
| 2294 | * the maximum SACKed segments to pass before reaching this limit. |
| 2295 | */ |
| 2296 | static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) |
| 2297 | { |
| 2298 | struct tcp_sock *tp = tcp_sk(sk); |
| 2299 | struct sk_buff *skb; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2300 | int cnt; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2301 | /* Use SACK to deduce losses of new sequences sent during recovery */ |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2302 | const u32 loss_high = tp->snd_nxt; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2303 | |
| 2304 | WARN_ON(packets > tp->packets_out); |
| 2305 | skb = tp->lost_skb_hint; |
| 2306 | if (skb) { |
| 2307 | /* Head already handled? */ |
| 2308 | if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una)) |
| 2309 | return; |
| 2310 | cnt = tp->lost_cnt_hint; |
| 2311 | } else { |
| 2312 | skb = tcp_rtx_queue_head(sk); |
| 2313 | cnt = 0; |
| 2314 | } |
| 2315 | |
| 2316 | skb_rbtree_walk_from(skb) { |
| 2317 | /* TODO: do this better; updating the hints on every |
| 2318 | * iteration is not the most efficient way to do this... */ |
| 2319 | tp->lost_skb_hint = skb; |
| 2320 | tp->lost_cnt_hint = cnt; |
| 2321 | |
| 2322 | if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) |
| 2323 | break; |
| 2324 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2325 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2326 | cnt += tcp_skb_pcount(skb); |
| 2327 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2328 | if (cnt > packets) |
| 2329 | break; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2330 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2331 | if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) |
| 2332 | tcp_mark_skb_lost(sk, skb); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2333 | |
| 2334 | if (mark_head) |
| 2335 | break; |
| 2336 | } |
| 2337 | tcp_verify_left_out(tp); |
| 2338 | } |
| 2339 | |
| 2340 | /* Account newly detected lost packet(s) */ |
| 2341 | |
| 2342 | static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) |
| 2343 | { |
| 2344 | struct tcp_sock *tp = tcp_sk(sk); |
| 2345 | |
| 2346 | if (tcp_is_sack(tp)) { |
| 2347 | int sacked_upto = tp->sacked_out - tp->reordering; |
| 2348 | if (sacked_upto >= 0) |
| 2349 | tcp_mark_head_lost(sk, sacked_upto, 0); |
| 2350 | else if (fast_rexmit) |
| 2351 | tcp_mark_head_lost(sk, 1, 1); |
| 2352 | } |
| 2353 | } |
| 2354 | |
| 2355 | static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when) |
| 2356 | { |
| 2357 | return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
| 2358 | before(tp->rx_opt.rcv_tsecr, when); |
| 2359 | } |
| 2360 | |
| 2361 | /* An skb was spuriously retransmitted if the returned timestamp echo |
| 2362 | * reply is prior to the skb's transmission time |
| 2363 | */ |
| 2364 | static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, |
| 2365 | const struct sk_buff *skb) |
| 2366 | { |
| 2367 | return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) && |
| 2368 | tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb)); |
| 2369 | } |
| 2370 | |
| 2371 | /* Nothing was retransmitted, or the returned timestamp is less |
| 2372 | * than the timestamp of the first retransmission. |
| 2373 | */ |
| 2374 | static inline bool tcp_packet_delayed(const struct tcp_sock *tp) |
| 2375 | { |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2376 | return tp->retrans_stamp && |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2377 | tcp_tsopt_ecr_before(tp, tp->retrans_stamp); |
| 2378 | } |
| 2379 | |
| 2380 | /* Undo procedures. */ |
| 2381 | |
| 2382 | /* We can clear retrans_stamp when there are no retransmissions in the |
| 2383 | * window. It would seem that it is trivially available for us in |
| 2384 | * tp->retrans_out; however, that kind of assumption doesn't consider |
| 2385 | * what will happen if errors occur when sending the retransmission for the |
| 2386 | * second time. ...It could be that such a segment has only |
| 2387 | * TCPCB_EVER_RETRANS set at the present time. It seems that checking |
| 2388 | * the head skb is enough except for some reneging corner cases that |
| 2389 | * are not worth the effort. |
| 2390 | * |
| 2391 | * The main reason for all this complexity is the fact that connection dying |
| 2392 | * time now depends on the validity of the retrans_stamp, in particular, |
| 2393 | * that successive retransmissions of a segment must not advance |
| 2394 | * retrans_stamp under any conditions. |
| 2395 | */ |
| 2396 | static bool tcp_any_retrans_done(const struct sock *sk) |
| 2397 | { |
| 2398 | const struct tcp_sock *tp = tcp_sk(sk); |
| 2399 | struct sk_buff *skb; |
| 2400 | |
| 2401 | if (tp->retrans_out) |
| 2402 | return true; |
| 2403 | |
| 2404 | skb = tcp_rtx_queue_head(sk); |
| 2405 | if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) |
| 2406 | return true; |
| 2407 | |
| 2408 | return false; |
| 2409 | } |
| 2410 | |
| 2411 | static void DBGUNDO(struct sock *sk, const char *msg) |
| 2412 | { |
| 2413 | #if FASTRETRANS_DEBUG > 1 |
| 2414 | struct tcp_sock *tp = tcp_sk(sk); |
| 2415 | struct inet_sock *inet = inet_sk(sk); |
| 2416 | |
| 2417 | if (sk->sk_family == AF_INET) { |
| 2418 | pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", |
| 2419 | msg, |
| 2420 | &inet->inet_daddr, ntohs(inet->inet_dport), |
| 2421 | tp->snd_cwnd, tcp_left_out(tp), |
| 2422 | tp->snd_ssthresh, tp->prior_ssthresh, |
| 2423 | tp->packets_out); |
| 2424 | } |
| 2425 | #if IS_ENABLED(CONFIG_IPV6) |
| 2426 | else if (sk->sk_family == AF_INET6) { |
| 2427 | pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", |
| 2428 | msg, |
| 2429 | &sk->sk_v6_daddr, ntohs(inet->inet_dport), |
| 2430 | tp->snd_cwnd, tcp_left_out(tp), |
| 2431 | tp->snd_ssthresh, tp->prior_ssthresh, |
| 2432 | tp->packets_out); |
| 2433 | } |
| 2434 | #endif |
| 2435 | #endif |
| 2436 | } |
| 2437 | |
| 2438 | static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) |
| 2439 | { |
| 2440 | struct tcp_sock *tp = tcp_sk(sk); |
| 2441 | |
| 2442 | if (unmark_loss) { |
| 2443 | struct sk_buff *skb; |
| 2444 | |
| 2445 | skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { |
| 2446 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; |
| 2447 | } |
| 2448 | tp->lost_out = 0; |
| 2449 | tcp_clear_all_retrans_hints(tp); |
| 2450 | } |
| 2451 | |
| 2452 | if (tp->prior_ssthresh) { |
| 2453 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 2454 | |
| 2455 | tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); |
| 2456 | |
| 2457 | if (tp->prior_ssthresh > tp->snd_ssthresh) { |
| 2458 | tp->snd_ssthresh = tp->prior_ssthresh; |
| 2459 | tcp_ecn_withdraw_cwr(tp); |
| 2460 | } |
| 2461 | } |
| 2462 | tp->snd_cwnd_stamp = tcp_jiffies32; |
| 2463 | tp->undo_marker = 0; |
| 2464 | tp->rack.advanced = 1; /* Force RACK to re-exam losses */ |
| 2465 | } |
| 2466 | |
| 2467 | static inline bool tcp_may_undo(const struct tcp_sock *tp) |
| 2468 | { |
| 2469 | return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); |
| 2470 | } |
| 2471 | |
| 2472 | /* People celebrate: "We love our President!" */ |
| 2473 | static bool tcp_try_undo_recovery(struct sock *sk) |
| 2474 | { |
| 2475 | struct tcp_sock *tp = tcp_sk(sk); |
| 2476 | |
| 2477 | if (tcp_may_undo(tp)) { |
| 2478 | int mib_idx; |
| 2479 | |
| 2480 | /* Happy end! We did not retransmit anything |
| 2481 | * or our original transmission succeeded. |
| 2482 | */ |
| 2483 | DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); |
| 2484 | tcp_undo_cwnd_reduction(sk, false); |
| 2485 | if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) |
| 2486 | mib_idx = LINUX_MIB_TCPLOSSUNDO; |
| 2487 | else |
| 2488 | mib_idx = LINUX_MIB_TCPFULLUNDO; |
| 2489 | |
| 2490 | NET_INC_STATS(sock_net(sk), mib_idx); |
| 2491 | } else if (tp->rack.reo_wnd_persist) { |
| 2492 | tp->rack.reo_wnd_persist--; |
| 2493 | } |
| 2494 | if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { |
| 2495 | /* Hold old state until something *above* high_seq |
| 2496 | * is ACKed. For Reno it is a MUST to prevent false |
| 2497 | * fast retransmits (RFC2582). SACK TCP is safe. */ |
| 2498 | if (!tcp_any_retrans_done(sk)) |
| 2499 | tp->retrans_stamp = 0; |
| 2500 | return true; |
| 2501 | } |
| 2502 | tcp_set_ca_state(sk, TCP_CA_Open); |
| 2503 | tp->is_sack_reneg = 0; |
| 2504 | return false; |
| 2505 | } |
| 2506 | |
| 2507 | /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ |
| 2508 | static bool tcp_try_undo_dsack(struct sock *sk) |
| 2509 | { |
| 2510 | struct tcp_sock *tp = tcp_sk(sk); |
| 2511 | |
| 2512 | if (tp->undo_marker && !tp->undo_retrans) { |
| 2513 | tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH, |
| 2514 | tp->rack.reo_wnd_persist + 1); |
| 2515 | DBGUNDO(sk, "D-SACK"); |
| 2516 | tcp_undo_cwnd_reduction(sk, false); |
| 2517 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); |
| 2518 | return true; |
| 2519 | } |
| 2520 | return false; |
| 2521 | } |
| 2522 | |
| 2523 | /* Undo during loss recovery after partial ACK or using F-RTO. */ |
| 2524 | static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) |
| 2525 | { |
| 2526 | struct tcp_sock *tp = tcp_sk(sk); |
| 2527 | |
| 2528 | if (frto_undo || tcp_may_undo(tp)) { |
| 2529 | tcp_undo_cwnd_reduction(sk, true); |
| 2530 | |
| 2531 | DBGUNDO(sk, "partial loss"); |
| 2532 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); |
| 2533 | if (frto_undo) |
| 2534 | NET_INC_STATS(sock_net(sk), |
| 2535 | LINUX_MIB_TCPSPURIOUSRTOS); |
| 2536 | inet_csk(sk)->icsk_retransmits = 0; |
| 2537 | if (frto_undo || tcp_is_sack(tp)) { |
| 2538 | tcp_set_ca_state(sk, TCP_CA_Open); |
| 2539 | tp->is_sack_reneg = 0; |
| 2540 | } |
| 2541 | return true; |
| 2542 | } |
| 2543 | return false; |
| 2544 | } |
| 2545 | |
| 2546 | /* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937. |
| 2547 | * It computes the number of packets to send (sndcnt) based on packets newly |
| 2548 | * delivered: |
| 2549 | * 1) If the packets in flight is larger than ssthresh, PRR spreads the |
| 2550 | * cwnd reductions across a full RTT. |
| 2551 | * 2) Otherwise PRR uses packet conservation to send as much as delivered. |
| 2552 | * But when the retransmits are acked without further losses, PRR |
| 2553 | * slow starts cwnd up to ssthresh to speed up the recovery. |
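| | * |
| | * A worked example for case 1 (illustrative numbers): with ssthresh == 10, |
| | * prior_cwnd == 20, prr_delivered == 6 and prr_out == 2, the code computes |
| | * sndcnt = ceil(10 * 6 / 20) - 2 = 1, i.e. roughly one packet sent per two |
| | * delivered, walking cwnd down towards ssthresh over about one RTT. |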
| 2554 | */ |
| 2555 | static void tcp_init_cwnd_reduction(struct sock *sk) |
| 2556 | { |
| 2557 | struct tcp_sock *tp = tcp_sk(sk); |
| 2558 | |
| 2559 | tp->high_seq = tp->snd_nxt; |
| 2560 | tp->tlp_high_seq = 0; |
| 2561 | tp->snd_cwnd_cnt = 0; |
| 2562 | tp->prior_cwnd = tp->snd_cwnd; |
| 2563 | tp->prr_delivered = 0; |
| 2564 | tp->prr_out = 0; |
| 2565 | tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); |
| 2566 | tcp_ecn_queue_cwr(tp); |
| 2567 | } |
| 2568 | |
| 2569 | void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) |
| 2570 | { |
| 2571 | struct tcp_sock *tp = tcp_sk(sk); |
| 2572 | int sndcnt = 0; |
| 2573 | int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); |
| 2574 | |
| 2575 | if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd)) |
| 2576 | return; |
| 2577 | |
| 2578 | tp->prr_delivered += newly_acked_sacked; |
| 2579 | if (delta < 0) { |
| 2580 | u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + |
| 2581 | tp->prior_cwnd - 1; |
| 2582 | sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2583 | } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) == |
| 2584 | FLAG_RETRANS_DATA_ACKED) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2585 | sndcnt = min_t(int, delta, |
| 2586 | max_t(int, tp->prr_delivered - tp->prr_out, |
| 2587 | newly_acked_sacked) + 1); |
| 2588 | } else { |
| 2589 | sndcnt = min(delta, newly_acked_sacked); |
| 2590 | } |
| 2591 | /* Force a fast retransmit upon entering fast recovery */ |
| 2592 | sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1)); |
| 2593 | tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; |
| 2594 | } |
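| | |
| | /* Illustrative walk-through of tcp_cwnd_reduction() above, with |
| |  * hypothetical numbers: prior_cwnd = 20, snd_ssthresh = 10, prr_out = 0, |
| |  * 15 packets still in flight, and an ACK delivering 2 segments. |
| |  * delta = 10 - 15 < 0, so the proportional branch runs: |
| |  * sndcnt = (10 * 2 + 20 - 1) / 20 - 0 = 1, i.e. roughly one segment is |
| |  * sent per two delivered, spreading the cut from 20 toward 10 over an RTT. |
| |  */ |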
| 2595 | |
| 2596 | static inline void tcp_end_cwnd_reduction(struct sock *sk) |
| 2597 | { |
| 2598 | struct tcp_sock *tp = tcp_sk(sk); |
| 2599 | |
| 2600 | if (inet_csk(sk)->icsk_ca_ops->cong_control) |
| 2601 | return; |
| 2602 | |
| 2603 | /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ |
| 2604 | if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && |
| 2605 | (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) { |
| 2606 | tp->snd_cwnd = tp->snd_ssthresh; |
| 2607 | tp->snd_cwnd_stamp = tcp_jiffies32; |
| 2608 | } |
| 2609 | tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); |
| 2610 | } |
| 2611 | |
| 2612 | /* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */ |
| 2613 | void tcp_enter_cwr(struct sock *sk) |
| 2614 | { |
| 2615 | struct tcp_sock *tp = tcp_sk(sk); |
| 2616 | |
| 2617 | tp->prior_ssthresh = 0; |
| 2618 | if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { |
| 2619 | tp->undo_marker = 0; |
| 2620 | tcp_init_cwnd_reduction(sk); |
| 2621 | tcp_set_ca_state(sk, TCP_CA_CWR); |
| 2622 | } |
| 2623 | } |
| 2624 | EXPORT_SYMBOL(tcp_enter_cwr); |
| 2625 | |
| 2626 | static void tcp_try_keep_open(struct sock *sk) |
| 2627 | { |
| 2628 | struct tcp_sock *tp = tcp_sk(sk); |
| 2629 | int state = TCP_CA_Open; |
| 2630 | |
| 2631 | if (tcp_left_out(tp) || tcp_any_retrans_done(sk)) |
| 2632 | state = TCP_CA_Disorder; |
| 2633 | |
| 2634 | if (inet_csk(sk)->icsk_ca_state != state) { |
| 2635 | tcp_set_ca_state(sk, state); |
| 2636 | tp->high_seq = tp->snd_nxt; |
| 2637 | } |
| 2638 | } |
| 2639 | |
| 2640 | static void tcp_try_to_open(struct sock *sk, int flag) |
| 2641 | { |
| 2642 | struct tcp_sock *tp = tcp_sk(sk); |
| 2643 | |
| 2644 | tcp_verify_left_out(tp); |
| 2645 | |
| 2646 | if (!tcp_any_retrans_done(sk)) |
| 2647 | tp->retrans_stamp = 0; |
| 2648 | |
| 2649 | if (flag & FLAG_ECE) |
| 2650 | tcp_enter_cwr(sk); |
| 2651 | |
| 2652 | if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { |
| 2653 | tcp_try_keep_open(sk); |
| 2654 | } |
| 2655 | } |
| 2656 | |
| 2657 | static void tcp_mtup_probe_failed(struct sock *sk) |
| 2658 | { |
| 2659 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 2660 | |
| 2661 | icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1; |
| 2662 | icsk->icsk_mtup.probe_size = 0; |
| 2663 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL); |
| 2664 | } |
| 2665 | |
| 2666 | static void tcp_mtup_probe_success(struct sock *sk) |
| 2667 | { |
| 2668 | struct tcp_sock *tp = tcp_sk(sk); |
| 2669 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 2670 | |
| 2671 | /* FIXME: breaks with very large cwnd */ |
| 2672 | tp->prior_ssthresh = tcp_current_ssthresh(sk); |
| 2673 | tp->snd_cwnd = tp->snd_cwnd * |
| 2674 | tcp_mss_to_mtu(sk, tp->mss_cache) / |
| 2675 | icsk->icsk_mtup.probe_size; |
| 2676 | tp->snd_cwnd_cnt = 0; |
| 2677 | tp->snd_cwnd_stamp = tcp_jiffies32; |
| 2678 | tp->snd_ssthresh = tcp_current_ssthresh(sk); |
| 2679 | |
| 2680 | icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; |
| 2681 | icsk->icsk_mtup.probe_size = 0; |
| 2682 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
| 2683 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS); |
| 2684 | } |
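| | |
| | /* Example with made-up numbers: if snd_cwnd was 10 segments at |
| |  * mss_cache = 1400 (roughly a 1440-byte MTU once headers are added back) |
| |  * and a 2880-byte probe succeeded, snd_cwnd becomes 10 * 1440 / 2880 = 5: |
| |  * the byte volume in flight stays about the same while each segment |
| |  * doubles in size. |
| |  */ |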
| 2685 | |
| 2686 | /* Do a simple retransmit without using the backoff mechanisms in |
| 2687 | * tcp_timer. This is used for path mtu discovery. |
| 2688 | * The socket is already locked here. |
| 2689 | */ |
| 2690 | void tcp_simple_retransmit(struct sock *sk) |
| 2691 | { |
| 2692 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 2693 | struct tcp_sock *tp = tcp_sk(sk); |
| 2694 | struct sk_buff *skb; |
| 2695 | unsigned int mss = tcp_current_mss(sk); |
| 2696 | |
| 2697 | skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2698 | if (tcp_skb_seglen(skb) > mss) |
| 2699 | tcp_mark_skb_lost(sk, skb); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2700 | } |
| 2701 | |
| 2702 | tcp_clear_retrans_hints_partial(tp); |
| 2703 | |
| 2704 | if (!tp->lost_out) |
| 2705 | return; |
| 2706 | |
| 2707 | if (tcp_is_reno(tp)) |
| 2708 | tcp_limit_reno_sacked(tp); |
| 2709 | |
| 2710 | tcp_verify_left_out(tp); |
| 2711 | |
| 2712 | /* Don't muck with the congestion window here. |
| 2713 | 	 * The reason is that we do not increase the amount of _data_ |
| 2714 | 	 * in the network; only the units changed, and the effective |
| 2715 | 	 * cwnd/ssthresh really are reduced now. |
| 2716 | */ |
| 2717 | if (icsk->icsk_ca_state != TCP_CA_Loss) { |
| 2718 | tp->high_seq = tp->snd_nxt; |
| 2719 | tp->snd_ssthresh = tcp_current_ssthresh(sk); |
| 2720 | tp->prior_ssthresh = 0; |
| 2721 | tp->undo_marker = 0; |
| 2722 | tcp_set_ca_state(sk, TCP_CA_Loss); |
| 2723 | } |
| 2724 | tcp_xmit_retransmit_queue(sk); |
| 2725 | } |
| 2726 | EXPORT_SYMBOL(tcp_simple_retransmit); |
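| | |
| | /* This path is typically reached when the path MTU shrinks, e.g. after |
| |  * an ICMP "fragmentation needed" message: rtx-queue segments larger |
| |  * than the new MSS cannot be delivered as-is, so they are marked lost |
| |  * and re-sent in smaller units without charging the congestion window. |
| |  */ |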
| 2727 | |
| 2728 | void tcp_enter_recovery(struct sock *sk, bool ece_ack) |
| 2729 | { |
| 2730 | struct tcp_sock *tp = tcp_sk(sk); |
| 2731 | int mib_idx; |
| 2732 | |
| 2733 | if (tcp_is_reno(tp)) |
| 2734 | mib_idx = LINUX_MIB_TCPRENORECOVERY; |
| 2735 | else |
| 2736 | mib_idx = LINUX_MIB_TCPSACKRECOVERY; |
| 2737 | |
| 2738 | NET_INC_STATS(sock_net(sk), mib_idx); |
| 2739 | |
| 2740 | tp->prior_ssthresh = 0; |
| 2741 | tcp_init_undo(tp); |
| 2742 | |
| 2743 | if (!tcp_in_cwnd_reduction(sk)) { |
| 2744 | if (!ece_ack) |
| 2745 | tp->prior_ssthresh = tcp_current_ssthresh(sk); |
| 2746 | tcp_init_cwnd_reduction(sk); |
| 2747 | } |
| 2748 | tcp_set_ca_state(sk, TCP_CA_Recovery); |
| 2749 | } |
| 2750 | |
| 2751 | /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are |
| 2752 | * recovered or spurious. Otherwise retransmits more on partial ACKs. |
| 2753 | */ |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2754 | static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2755 | int *rexmit) |
| 2756 | { |
| 2757 | struct tcp_sock *tp = tcp_sk(sk); |
| 2758 | bool recovered = !before(tp->snd_una, tp->high_seq); |
| 2759 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2760 | if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) && |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2761 | tcp_try_undo_loss(sk, false)) |
| 2762 | return; |
| 2763 | |
| 2764 | if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ |
| 2765 | /* Step 3.b. A timeout is spurious if not all data are |
| 2766 | * lost, i.e., never-retransmitted data are (s)acked. |
| 2767 | */ |
| 2768 | if ((flag & FLAG_ORIG_SACK_ACKED) && |
| 2769 | tcp_try_undo_loss(sk, true)) |
| 2770 | return; |
| 2771 | |
| 2772 | if (after(tp->snd_nxt, tp->high_seq)) { |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2773 | if (flag & FLAG_DATA_SACKED || num_dupack) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2774 | tp->frto = 0; /* Step 3.a. loss was real */ |
| 2775 | } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { |
| 2776 | tp->high_seq = tp->snd_nxt; |
| 2777 | 			/* Step 2.b. Try to send new data (but deferred until cwnd |
| 2778 | * is updated in tcp_ack()). Otherwise fall back to |
| 2779 | * the conventional recovery. |
| 2780 | */ |
| 2781 | if (!tcp_write_queue_empty(sk) && |
| 2782 | after(tcp_wnd_end(tp), tp->snd_nxt)) { |
| 2783 | *rexmit = REXMIT_NEW; |
| 2784 | return; |
| 2785 | } |
| 2786 | tp->frto = 0; |
| 2787 | } |
| 2788 | } |
| 2789 | |
| 2790 | if (recovered) { |
| 2791 | /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */ |
| 2792 | tcp_try_undo_recovery(sk); |
| 2793 | return; |
| 2794 | } |
| 2795 | if (tcp_is_reno(tp)) { |
| 2796 | /* A Reno DUPACK means new data in F-RTO step 2.b above are |
| 2797 | 		 * delivered. Lower inflight to clock out (re)transmissions. |
| 2798 | */ |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2799 | if (after(tp->snd_nxt, tp->high_seq) && num_dupack) |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2800 | tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2801 | else if (flag & FLAG_SND_UNA_ADVANCED) |
| 2802 | tcp_reset_reno_sack(tp); |
| 2803 | } |
| 2804 | *rexmit = REXMIT_LOST; |
| 2805 | } |
| 2806 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2807 | static bool tcp_force_fast_retransmit(struct sock *sk) |
| 2808 | { |
| 2809 | struct tcp_sock *tp = tcp_sk(sk); |
| 2810 | |
| 2811 | return after(tcp_highest_sack_seq(tp), |
| 2812 | tp->snd_una + tp->reordering * tp->mss_cache); |
| 2813 | } |
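| | |
| | /* Illustration (made-up values): with snd_una = 1000, reordering = 3 |
| |  * and mss_cache = 1000, a SACK covering data above sequence 4000 means |
| |  * more than a reordering window's worth of data is missing, so fast |
| |  * retransmit is forced. |
| |  */ |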
| 2814 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2815 | /* Undo during fast recovery after partial ACK. */ |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2816 | static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una, |
| 2817 | bool *do_lost) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2818 | { |
| 2819 | struct tcp_sock *tp = tcp_sk(sk); |
| 2820 | |
| 2821 | if (tp->undo_marker && tcp_packet_delayed(tp)) { |
| 2822 | 		/* Plain luck! The hole was filled with a delayed |
| 2823 | * packet, rather than with a retransmit. Check reordering. |
| 2824 | */ |
| 2825 | tcp_check_sack_reordering(sk, prior_snd_una, 1); |
| 2826 | |
| 2827 | /* We are getting evidence that the reordering degree is higher |
| 2828 | * than we realized. If there are no retransmits out then we |
| 2829 | * can undo. Otherwise we clock out new packets but do not |
| 2830 | * mark more packets lost or retransmit more. |
| 2831 | */ |
| 2832 | if (tp->retrans_out) |
| 2833 | return true; |
| 2834 | |
| 2835 | if (!tcp_any_retrans_done(sk)) |
| 2836 | tp->retrans_stamp = 0; |
| 2837 | |
| 2838 | DBGUNDO(sk, "partial recovery"); |
| 2839 | tcp_undo_cwnd_reduction(sk, true); |
| 2840 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); |
| 2841 | tcp_try_keep_open(sk); |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2842 | } else { |
| 2843 | /* Partial ACK arrived. Force fast retransmit. */ |
| 2844 | *do_lost = tcp_force_fast_retransmit(sk); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2845 | } |
| 2846 | return false; |
| 2847 | } |
| 2848 | |
| 2849 | static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag) |
| 2850 | { |
| 2851 | struct tcp_sock *tp = tcp_sk(sk); |
| 2852 | |
| 2853 | if (tcp_rtx_queue_empty(sk)) |
| 2854 | return; |
| 2855 | |
| 2856 | if (unlikely(tcp_is_reno(tp))) { |
| 2857 | tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED); |
| 2858 | } else if (tcp_is_rack(sk)) { |
| 2859 | u32 prior_retrans = tp->retrans_out; |
| 2860 | |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 2861 | if (tcp_rack_mark_lost(sk)) |
| 2862 | *ack_flag &= ~FLAG_SET_XMIT_TIMER; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2863 | if (prior_retrans > tp->retrans_out) |
| 2864 | *ack_flag |= FLAG_LOST_RETRANS; |
| 2865 | } |
| 2866 | } |
| 2867 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2868 | /* Process an event which can update packets-in-flight non-trivially. |
| 2869 |  * The main goal of this function is to calculate a new estimate for |
| 2870 |  * left_out, taking into account both packets sitting in the receiver's |
| 2871 |  * buffer and packets lost by the network. |
| 2872 |  * |
| 2873 |  * Besides that, it updates the congestion state when packet loss or ECN |
| 2874 |  * is detected. But it does not reduce the cwnd; that is done by the |
| 2875 |  * congestion control later. |
| 2876 |  * |
| 2877 |  * It does _not_ decide what to send; that is done in |
| 2878 |  * tcp_xmit_retransmit_queue(). |
| 2879 | */ |
| 2880 | static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2881 | int num_dupack, int *ack_flag, int *rexmit) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2882 | { |
| 2883 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 2884 | struct tcp_sock *tp = tcp_sk(sk); |
| 2885 | int fast_rexmit = 0, flag = *ack_flag; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2886 | bool ece_ack = flag & FLAG_ECE; |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2887 | bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && |
| 2888 | tcp_force_fast_retransmit(sk)); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2889 | |
| 2890 | if (!tp->packets_out && tp->sacked_out) |
| 2891 | tp->sacked_out = 0; |
| 2892 | |
| 2893 | /* Now state machine starts. |
| 2894 | 	 * A. ECE, hence prohibit cwnd undoing; the reduction is required. */ |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2895 | if (ece_ack) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2896 | tp->prior_ssthresh = 0; |
| 2897 | |
| 2898 | /* B. In all the states check for reneging SACKs. */ |
| 2899 | if (tcp_check_sack_reneging(sk, flag)) |
| 2900 | return; |
| 2901 | |
| 2902 | /* C. Check consistency of the current state. */ |
| 2903 | tcp_verify_left_out(tp); |
| 2904 | |
| 2905 | /* D. Check state exit conditions. State can be terminated |
| 2906 | * when high_seq is ACKed. */ |
| 2907 | if (icsk->icsk_ca_state == TCP_CA_Open) { |
| 2908 | WARN_ON(tp->retrans_out != 0); |
| 2909 | tp->retrans_stamp = 0; |
| 2910 | } else if (!before(tp->snd_una, tp->high_seq)) { |
| 2911 | switch (icsk->icsk_ca_state) { |
| 2912 | case TCP_CA_CWR: |
| 2913 | 			/* CWR is to be held until something *above* high_seq |
| 2914 | 			 * is ACKed, so that the CWR bit reaches the receiver. */ |
| 2915 | if (tp->snd_una != tp->high_seq) { |
| 2916 | tcp_end_cwnd_reduction(sk); |
| 2917 | tcp_set_ca_state(sk, TCP_CA_Open); |
| 2918 | } |
| 2919 | break; |
| 2920 | |
| 2921 | case TCP_CA_Recovery: |
| 2922 | if (tcp_is_reno(tp)) |
| 2923 | tcp_reset_reno_sack(tp); |
| 2924 | if (tcp_try_undo_recovery(sk)) |
| 2925 | return; |
| 2926 | tcp_end_cwnd_reduction(sk); |
| 2927 | break; |
| 2928 | } |
| 2929 | } |
| 2930 | |
| 2931 | /* E. Process state. */ |
| 2932 | switch (icsk->icsk_ca_state) { |
| 2933 | case TCP_CA_Recovery: |
| 2934 | if (!(flag & FLAG_SND_UNA_ADVANCED)) { |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2935 | if (tcp_is_reno(tp)) |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2936 | tcp_add_reno_sack(sk, num_dupack, ece_ack); |
| 2937 | } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost)) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2938 | return; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2939 | |
| 2940 | if (tcp_try_undo_dsack(sk)) |
| 2941 | tcp_try_keep_open(sk); |
| 2942 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2943 | tcp_identify_packet_loss(sk, ack_flag); |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2944 | if (icsk->icsk_ca_state != TCP_CA_Recovery) { |
| 2945 | if (!tcp_time_to_recover(sk, flag)) |
| 2946 | return; |
| 2947 | 			/* An undo reverts the recovery state. If loss is still |
| 2948 | 			 * evident, start a new recovery (e.g. reordering then loss). |
| 2949 | */ |
| 2950 | tcp_enter_recovery(sk, ece_ack); |
| 2951 | } |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2952 | break; |
| 2953 | case TCP_CA_Loss: |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2954 | tcp_process_loss(sk, flag, num_dupack, rexmit); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2955 | tcp_identify_packet_loss(sk, ack_flag); |
| 2956 | if (!(icsk->icsk_ca_state == TCP_CA_Open || |
| 2957 | (*ack_flag & FLAG_LOST_RETRANS))) |
| 2958 | return; |
| 2959 | /* Change state if cwnd is undone or retransmits are lost */ |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2960 | fallthrough; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2961 | default: |
| 2962 | if (tcp_is_reno(tp)) { |
| 2963 | if (flag & FLAG_SND_UNA_ADVANCED) |
| 2964 | tcp_reset_reno_sack(tp); |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2965 | tcp_add_reno_sack(sk, num_dupack, ece_ack); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2966 | } |
| 2967 | |
| 2968 | if (icsk->icsk_ca_state <= TCP_CA_Disorder) |
| 2969 | tcp_try_undo_dsack(sk); |
| 2970 | |
| 2971 | tcp_identify_packet_loss(sk, ack_flag); |
| 2972 | if (!tcp_time_to_recover(sk, flag)) { |
| 2973 | tcp_try_to_open(sk, flag); |
| 2974 | return; |
| 2975 | } |
| 2976 | |
| 2977 | /* MTU probe failure: don't reduce cwnd */ |
| 2978 | if (icsk->icsk_ca_state < TCP_CA_CWR && |
| 2979 | icsk->icsk_mtup.probe_size && |
| 2980 | tp->snd_una == tp->mtu_probe.probe_seq_start) { |
| 2981 | tcp_mtup_probe_failed(sk); |
| 2982 | /* Restores the reduction we did in tcp_mtup_probe() */ |
| 2983 | tp->snd_cwnd++; |
| 2984 | tcp_simple_retransmit(sk); |
| 2985 | return; |
| 2986 | } |
| 2987 | |
| 2988 | /* Otherwise enter Recovery state */ |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2989 | tcp_enter_recovery(sk, ece_ack); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2990 | fast_rexmit = 1; |
| 2991 | } |
| 2992 | |
| 2993 | if (!tcp_is_rack(sk) && do_lost) |
| 2994 | tcp_update_scoreboard(sk, fast_rexmit); |
| 2995 | *rexmit = REXMIT_LOST; |
| 2996 | } |
| 2997 | |
| 2998 | static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag) |
| 2999 | { |
| 3000 | u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; |
| 3001 | struct tcp_sock *tp = tcp_sk(sk); |
| 3002 | |
| 3003 | if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) { |
| 3004 | /* If the remote keeps returning delayed ACKs, eventually |
| 3005 | * the min filter would pick it up and overestimate the |
| 3006 | * prop. delay when it expires. Skip suspected delayed ACKs. |
| 3007 | */ |
| 3008 | return; |
| 3009 | } |
| 3010 | minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, |
| 3011 | rtt_us ? : jiffies_to_usecs(1)); |
| 3012 | } |
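| | |
| | /* The windowed min filter keeps the lowest RTT seen during the last |
| |  * sysctl_tcp_min_rtt_wlen seconds (300 by default), so a higher RTT |
| |  * floor after e.g. a route change is adopted once the stale minimum |
| |  * ages out of the window. |
| |  */ |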
| 3013 | |
| 3014 | static bool tcp_ack_update_rtt(struct sock *sk, const int flag, |
| 3015 | long seq_rtt_us, long sack_rtt_us, |
| 3016 | long ca_rtt_us, struct rate_sample *rs) |
| 3017 | { |
| 3018 | const struct tcp_sock *tp = tcp_sk(sk); |
| 3019 | |
| 3020 | /* Prefer RTT measured from ACK's timing to TS-ECR. This is because |
| 3021 | * broken middle-boxes or peers may corrupt TS-ECR fields. But |
| 3022 | * Karn's algorithm forbids taking RTT if some retransmitted data |
| 3023 | * is acked (RFC6298). |
| 3024 | */ |
| 3025 | if (seq_rtt_us < 0) |
| 3026 | seq_rtt_us = sack_rtt_us; |
| 3027 | |
| 3028 | /* RTTM Rule: A TSecr value received in a segment is used to |
| 3029 | * update the averaged RTT measurement only if the segment |
| 3030 | * acknowledges some new data, i.e., only if it advances the |
| 3031 | * left edge of the send window. |
| 3032 | * See draft-ietf-tcplw-high-performance-00, section 3.3. |
| 3033 | */ |
| 3034 | if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
| 3035 | flag & FLAG_ACKED) { |
| 3036 | u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3037 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3038 | if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 3039 | if (!delta) |
| 3040 | delta = 1; |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3041 | seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ); |
| 3042 | ca_rtt_us = seq_rtt_us; |
| 3043 | } |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3044 | } |
| 3045 | rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */ |
| 3046 | if (seq_rtt_us < 0) |
| 3047 | return false; |
| 3048 | |
| 3049 | /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is |
| 3050 | * always taken together with ACK, SACK, or TS-opts. Any negative |
| 3051 | * values will be skipped with the seq_rtt_us < 0 check above. |
| 3052 | */ |
| 3053 | tcp_update_rtt_min(sk, ca_rtt_us, flag); |
| 3054 | tcp_rtt_estimator(sk, seq_rtt_us); |
| 3055 | tcp_set_rto(sk); |
| 3056 | |
| 3057 | /* RFC6298: only reset backoff on valid RTT measurement. */ |
| 3058 | inet_csk(sk)->icsk_backoff = 0; |
| 3059 | return true; |
| 3060 | } |
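| | |
| | /* Sketch of the TS-ECR fallback above with illustrative numbers: at |
| |  * TCP_TS_HZ = 1000, a TSecr lagging the current timestamp by 25 ticks |
| |  * gives seq_rtt_us = 25 * (USEC_PER_SEC / 1000) = 25000us (25 ms); a |
| |  * zero delta is rounded up to one tick so the RTT never becomes zero. |
| |  */ |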
| 3061 | |
| 3062 | /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ |
| 3063 | void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) |
| 3064 | { |
| 3065 | struct rate_sample rs; |
| 3066 | long rtt_us = -1L; |
| 3067 | |
| 3068 | if (req && !req->num_retrans && tcp_rsk(req)->snt_synack) |
| 3069 | rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack); |
| 3070 | |
| 3071 | tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs); |
| 3072 | } |
| 3073 | |
| 3074 | |
| 3075 | static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) |
| 3076 | { |
| 3077 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 3078 | |
| 3079 | icsk->icsk_ca_ops->cong_avoid(sk, ack, acked); |
| 3080 | tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32; |
| 3081 | } |
| 3082 | |
| 3083 | /* Restart timer after forward progress on connection. |
| 3084 | * RFC2988 recommends to restart timer to now+rto. |
| 3085 | */ |
| 3086 | void tcp_rearm_rto(struct sock *sk) |
| 3087 | { |
| 3088 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 3089 | struct tcp_sock *tp = tcp_sk(sk); |
| 3090 | |
| 3091 | /* If the retrans timer is currently being used by Fast Open |
| 3092 | * for SYN-ACK retrans purpose, stay put. |
| 3093 | */ |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3094 | if (rcu_access_pointer(tp->fastopen_rsk)) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3095 | return; |
| 3096 | |
| 3097 | if (!tp->packets_out) { |
| 3098 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); |
| 3099 | } else { |
| 3100 | u32 rto = inet_csk(sk)->icsk_rto; |
| 3101 | /* Offset the time elapsed after installing regular RTO */ |
| 3102 | if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || |
| 3103 | icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { |
| 3104 | s64 delta_us = tcp_rto_delta_us(sk); |
| 3105 | /* delta_us may not be positive if the socket is locked |
| 3106 | * when the retrans timer fires and is rescheduled. |
| 3107 | */ |
| 3108 | rto = usecs_to_jiffies(max_t(int, delta_us, 1)); |
| 3109 | } |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3110 | tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3111 | TCP_RTO_MAX); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3112 | } |
| 3113 | } |
| 3114 | |
| 3115 | /* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */ |
| 3116 | static void tcp_set_xmit_timer(struct sock *sk) |
| 3117 | { |
| 3118 | if (!tcp_schedule_loss_probe(sk, true)) |
| 3119 | tcp_rearm_rto(sk); |
| 3120 | } |
| 3121 | |
| 3122 | /* If we get here, the whole TSO packet has not been acked. */ |
| 3123 | static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) |
| 3124 | { |
| 3125 | struct tcp_sock *tp = tcp_sk(sk); |
| 3126 | u32 packets_acked; |
| 3127 | |
| 3128 | BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)); |
| 3129 | |
| 3130 | packets_acked = tcp_skb_pcount(skb); |
| 3131 | if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) |
| 3132 | return 0; |
| 3133 | packets_acked -= tcp_skb_pcount(skb); |
| 3134 | |
| 3135 | if (packets_acked) { |
| 3136 | BUG_ON(tcp_skb_pcount(skb) == 0); |
| 3137 | BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)); |
| 3138 | } |
| 3139 | |
| 3140 | return packets_acked; |
| 3141 | } |
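| | |
| | /* Example: for a TSO skb carrying 4 segments of which the ACK covers |
| |  * the first 2, tcp_trim_head() drops the acked part in place and the |
| |  * function returns 4 - 2 = 2 as the number of packets acked. |
| |  */ |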
| 3142 | |
| 3143 | static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, |
| 3144 | u32 prior_snd_una) |
| 3145 | { |
| 3146 | const struct skb_shared_info *shinfo; |
| 3147 | |
| 3148 | /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */ |
| 3149 | if (likely(!TCP_SKB_CB(skb)->txstamp_ack)) |
| 3150 | return; |
| 3151 | |
| 3152 | shinfo = skb_shinfo(skb); |
| 3153 | if (!before(shinfo->tskey, prior_snd_una) && |
| 3154 | before(shinfo->tskey, tcp_sk(sk)->snd_una)) { |
| 3155 | tcp_skb_tsorted_save(skb) { |
| 3156 | __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); |
| 3157 | } tcp_skb_tsorted_restore(skb); |
| 3158 | } |
| 3159 | } |
| 3160 | |
| 3161 | /* Remove acknowledged frames from the retransmission queue. If our packet |
| 3162 | * is before the ack sequence we can discard it as it's confirmed to have |
| 3163 | * arrived at the other end. |
| 3164 | */ |
| 3165 | static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, |
| 3166 | u32 prior_snd_una, |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3167 | struct tcp_sacktag_state *sack, bool ece_ack) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3168 | { |
| 3169 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 3170 | u64 first_ackt, last_ackt; |
| 3171 | struct tcp_sock *tp = tcp_sk(sk); |
| 3172 | u32 prior_sacked = tp->sacked_out; |
| 3173 | u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */ |
| 3174 | struct sk_buff *skb, *next; |
| 3175 | bool fully_acked = true; |
| 3176 | long sack_rtt_us = -1L; |
| 3177 | long seq_rtt_us = -1L; |
| 3178 | long ca_rtt_us = -1L; |
| 3179 | u32 pkts_acked = 0; |
| 3180 | u32 last_in_flight = 0; |
| 3181 | bool rtt_update; |
| 3182 | int flag = 0; |
| 3183 | |
| 3184 | first_ackt = 0; |
| 3185 | |
| 3186 | for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) { |
| 3187 | struct tcp_skb_cb *scb = TCP_SKB_CB(skb); |
| 3188 | const u32 start_seq = scb->seq; |
| 3189 | u8 sacked = scb->sacked; |
| 3190 | u32 acked_pcount; |
| 3191 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3192 | 		/* Determine how many packets and what bytes were acked, TSO or otherwise */ |
| 3193 | if (after(scb->end_seq, tp->snd_una)) { |
| 3194 | if (tcp_skb_pcount(skb) == 1 || |
| 3195 | !after(tp->snd_una, scb->seq)) |
| 3196 | break; |
| 3197 | |
| 3198 | acked_pcount = tcp_tso_acked(sk, skb); |
| 3199 | if (!acked_pcount) |
| 3200 | break; |
| 3201 | fully_acked = false; |
| 3202 | } else { |
| 3203 | acked_pcount = tcp_skb_pcount(skb); |
| 3204 | } |
| 3205 | |
| 3206 | if (unlikely(sacked & TCPCB_RETRANS)) { |
| 3207 | if (sacked & TCPCB_SACKED_RETRANS) |
| 3208 | tp->retrans_out -= acked_pcount; |
| 3209 | flag |= FLAG_RETRANS_DATA_ACKED; |
| 3210 | } else if (!(sacked & TCPCB_SACKED_ACKED)) { |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3211 | last_ackt = tcp_skb_timestamp_us(skb); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3212 | WARN_ON_ONCE(last_ackt == 0); |
| 3213 | if (!first_ackt) |
| 3214 | first_ackt = last_ackt; |
| 3215 | |
| 3216 | last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; |
| 3217 | if (before(start_seq, reord)) |
| 3218 | reord = start_seq; |
| 3219 | if (!after(scb->end_seq, tp->high_seq)) |
| 3220 | flag |= FLAG_ORIG_SACK_ACKED; |
| 3221 | } |
| 3222 | |
| 3223 | if (sacked & TCPCB_SACKED_ACKED) { |
| 3224 | tp->sacked_out -= acked_pcount; |
| 3225 | } else if (tcp_is_sack(tp)) { |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3226 | tcp_count_delivered(tp, acked_pcount, ece_ack); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3227 | if (!tcp_skb_spurious_retrans(tp, skb)) |
| 3228 | tcp_rack_advance(tp, sacked, scb->end_seq, |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3229 | tcp_skb_timestamp_us(skb)); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3230 | } |
| 3231 | if (sacked & TCPCB_LOST) |
| 3232 | tp->lost_out -= acked_pcount; |
| 3233 | |
| 3234 | tp->packets_out -= acked_pcount; |
| 3235 | pkts_acked += acked_pcount; |
| 3236 | tcp_rate_skb_delivered(sk, skb, sack->rate); |
| 3237 | |
| 3238 | /* Initial outgoing SYN's get put onto the write_queue |
| 3239 | * just like anything else we transmit. It is not |
| 3240 | * true data, and if we misinform our callers that |
| 3241 | * this ACK acks real data, we will erroneously exit |
| 3242 | * connection startup slow start one packet too |
| 3243 | * quickly. This is severely frowned upon behavior. |
| 3244 | */ |
| 3245 | if (likely(!(scb->tcp_flags & TCPHDR_SYN))) { |
| 3246 | flag |= FLAG_DATA_ACKED; |
| 3247 | } else { |
| 3248 | flag |= FLAG_SYN_ACKED; |
| 3249 | tp->retrans_stamp = 0; |
| 3250 | } |
| 3251 | |
| 3252 | if (!fully_acked) |
| 3253 | break; |
| 3254 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3255 | tcp_ack_tstamp(sk, skb, prior_snd_una); |
| 3256 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3257 | next = skb_rb_next(skb); |
| 3258 | if (unlikely(skb == tp->retransmit_skb_hint)) |
| 3259 | tp->retransmit_skb_hint = NULL; |
| 3260 | if (unlikely(skb == tp->lost_skb_hint)) |
| 3261 | tp->lost_skb_hint = NULL; |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 3262 | tcp_highest_sack_replace(sk, skb, next); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3263 | tcp_rtx_queue_unlink_and_free(skb, sk); |
| 3264 | } |
| 3265 | |
| 3266 | if (!skb) |
| 3267 | tcp_chrono_stop(sk, TCP_CHRONO_BUSY); |
| 3268 | |
| 3269 | if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) |
| 3270 | tp->snd_up = tp->snd_una; |
| 3271 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3272 | if (skb) { |
| 3273 | tcp_ack_tstamp(sk, skb, prior_snd_una); |
| 3274 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
| 3275 | flag |= FLAG_SACK_RENEGING; |
| 3276 | } |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3277 | |
| 3278 | if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { |
| 3279 | seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); |
| 3280 | ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); |
| 3281 | |
| 3282 | if (pkts_acked == 1 && last_in_flight < tp->mss_cache && |
| 3283 | last_in_flight && !prior_sacked && fully_acked && |
| 3284 | sack->rate->prior_delivered + 1 == tp->delivered && |
| 3285 | !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) { |
| 3286 | /* Conservatively mark a delayed ACK. It's typically |
| 3287 | * from a lone runt packet over the round trip to |
| 3288 | * a receiver w/o out-of-order or CE events. |
| 3289 | */ |
| 3290 | flag |= FLAG_ACK_MAYBE_DELAYED; |
| 3291 | } |
| 3292 | } |
| 3293 | if (sack->first_sackt) { |
| 3294 | sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt); |
| 3295 | ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt); |
| 3296 | } |
| 3297 | rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, |
| 3298 | ca_rtt_us, sack->rate); |
| 3299 | |
| 3300 | if (flag & FLAG_ACKED) { |
| 3301 | flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ |
| 3302 | if (unlikely(icsk->icsk_mtup.probe_size && |
| 3303 | !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { |
| 3304 | tcp_mtup_probe_success(sk); |
| 3305 | } |
| 3306 | |
| 3307 | if (tcp_is_reno(tp)) { |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3308 | tcp_remove_reno_sacks(sk, pkts_acked, ece_ack); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3309 | |
| 3310 | /* If any of the cumulatively ACKed segments was |
| 3311 | * retransmitted, non-SACK case cannot confirm that |
| 3312 | * progress was due to original transmission due to |
| 3313 | * lack of TCPCB_SACKED_ACKED bits even if some of |
| 3314 | * the packets may have been never retransmitted. |
| 3315 | */ |
| 3316 | if (flag & FLAG_RETRANS_DATA_ACKED) |
| 3317 | flag &= ~FLAG_ORIG_SACK_ACKED; |
| 3318 | } else { |
| 3319 | int delta; |
| 3320 | |
| 3321 | /* Non-retransmitted hole got filled? That's reordering */ |
| 3322 | if (before(reord, prior_fack)) |
| 3323 | tcp_check_sack_reordering(sk, reord, 0); |
| 3324 | |
| 3325 | delta = prior_sacked - tp->sacked_out; |
| 3326 | tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); |
| 3327 | } |
| 3328 | } else if (skb && rtt_update && sack_rtt_us >= 0 && |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3329 | sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, |
| 3330 | tcp_skb_timestamp_us(skb))) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3331 | /* Do not re-arm RTO if the sack RTT is measured from data sent |
| 3332 | * after when the head was last (re)transmitted. Otherwise the |
| 3333 | * timeout may continue to extend in loss recovery. |
| 3334 | */ |
| 3335 | flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ |
| 3336 | } |
| 3337 | |
| 3338 | if (icsk->icsk_ca_ops->pkts_acked) { |
| 3339 | struct ack_sample sample = { .pkts_acked = pkts_acked, |
| 3340 | .rtt_us = sack->rate->rtt_us, |
| 3341 | .in_flight = last_in_flight }; |
| 3342 | |
| 3343 | icsk->icsk_ca_ops->pkts_acked(sk, &sample); |
| 3344 | } |
| 3345 | |
| 3346 | #if FASTRETRANS_DEBUG > 0 |
| 3347 | WARN_ON((int)tp->sacked_out < 0); |
| 3348 | WARN_ON((int)tp->lost_out < 0); |
| 3349 | WARN_ON((int)tp->retrans_out < 0); |
| 3350 | if (!tp->packets_out && tcp_is_sack(tp)) { |
| 3351 | icsk = inet_csk(sk); |
| 3352 | if (tp->lost_out) { |
| 3353 | pr_debug("Leak l=%u %d\n", |
| 3354 | tp->lost_out, icsk->icsk_ca_state); |
| 3355 | tp->lost_out = 0; |
| 3356 | } |
| 3357 | if (tp->sacked_out) { |
| 3358 | pr_debug("Leak s=%u %d\n", |
| 3359 | tp->sacked_out, icsk->icsk_ca_state); |
| 3360 | tp->sacked_out = 0; |
| 3361 | } |
| 3362 | if (tp->retrans_out) { |
| 3363 | pr_debug("Leak r=%u %d\n", |
| 3364 | tp->retrans_out, icsk->icsk_ca_state); |
| 3365 | tp->retrans_out = 0; |
| 3366 | } |
| 3367 | } |
| 3368 | #endif |
| 3369 | return flag; |
| 3370 | } |
| 3371 | |
| 3372 | static void tcp_ack_probe(struct sock *sk) |
| 3373 | { |
| 3374 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 3375 | struct sk_buff *head = tcp_send_head(sk); |
| 3376 | const struct tcp_sock *tp = tcp_sk(sk); |
| 3377 | |
| 3378 | /* Was it a usable window open? */ |
| 3379 | 	/* Has a usable window opened? */ |
| 3380 | return; |
| 3381 | if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { |
| 3382 | icsk->icsk_backoff = 0; |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 3383 | icsk->icsk_probes_tstamp = 0; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3384 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); |
| 3385 | 		/* Socket must be woken up by a subsequent tcp_data_snd_check(). |
| 3386 | 		 * This function is not for casual use! |
| 3387 | */ |
| 3388 | } else { |
| 3389 | unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); |
| 3390 | |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 3391 | when = tcp_clamp_probe0_to_user_timeout(sk, when); |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3392 | tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3393 | } |
| 3394 | } |
| 3395 | |
| 3396 | static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) |
| 3397 | { |
| 3398 | return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || |
| 3399 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open; |
| 3400 | } |
| 3401 | |
| 3402 | /* Decide whether to run the increase function of congestion control. */ |
| 3403 | static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) |
| 3404 | { |
| 3405 | /* If reordering is high then always grow cwnd whenever data is |
| 3406 | * delivered regardless of its ordering. Otherwise stay conservative |
| 3407 | * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/ |
| 3408 | * new SACK or ECE mark may first advance cwnd here and later reduce |
| 3409 | * cwnd in tcp_fastretrans_alert() based on more states. |
| 3410 | */ |
| 3411 | if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering) |
| 3412 | return flag & FLAG_FORWARD_PROGRESS; |
| 3413 | |
| 3414 | return flag & FLAG_DATA_ACKED; |
| 3415 | } |
| 3416 | |
| 3417 | /* The "ultimate" congestion control function that aims to replace the rigid |
| 3418 |  * cwnd increase and decrease control (tcp_cong_avoid, tcp_*cwnd_reduction). |
| 3419 |  * It's called toward the end of processing an ACK with precise rate |
| 3420 |  * information. Any transmissions or retransmissions are deferred until afterwards. |
| 3421 | */ |
| 3422 | static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, |
| 3423 | int flag, const struct rate_sample *rs) |
| 3424 | { |
| 3425 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 3426 | |
| 3427 | if (icsk->icsk_ca_ops->cong_control) { |
| 3428 | icsk->icsk_ca_ops->cong_control(sk, rs); |
| 3429 | return; |
| 3430 | } |
| 3431 | |
| 3432 | if (tcp_in_cwnd_reduction(sk)) { |
| 3433 | /* Reduce cwnd if state mandates */ |
| 3434 | tcp_cwnd_reduction(sk, acked_sacked, flag); |
| 3435 | } else if (tcp_may_raise_cwnd(sk, flag)) { |
| 3436 | /* Advance cwnd if state allows */ |
| 3437 | tcp_cong_avoid(sk, ack, acked_sacked); |
| 3438 | } |
| 3439 | tcp_update_pacing_rate(sk); |
| 3440 | } |
| 3441 | |
| 3442 | /* Check that window update is acceptable. |
| 3443 | * The function assumes that snd_una<=ack<=snd_next. |
| 3444 | */ |
| 3445 | static inline bool tcp_may_update_window(const struct tcp_sock *tp, |
| 3446 | const u32 ack, const u32 ack_seq, |
| 3447 | const u32 nwin) |
| 3448 | { |
| 3449 | return after(ack, tp->snd_una) || |
| 3450 | after(ack_seq, tp->snd_wl1) || |
| 3451 | (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); |
| 3452 | } |
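| | |
| | /* In other words: accept the advertisement if the ACK advances snd_una, |
| |  * if the segment is newer in sequence space (snd_wl1) than the one that |
| |  * last updated the window, or if it is the same segment but offers a |
| |  * larger window. This roughly mirrors the RFC 793 SND.WL1/SND.WL2 test. |
| |  */ |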
| 3453 | |
| 3454 | /* If we update tp->snd_una, also update tp->bytes_acked */ |
| 3455 | static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) |
| 3456 | { |
| 3457 | u32 delta = ack - tp->snd_una; |
| 3458 | |
| 3459 | sock_owned_by_me((struct sock *)tp); |
| 3460 | tp->bytes_acked += delta; |
| 3461 | tp->snd_una = ack; |
| 3462 | } |
| 3463 | |
| 3464 | /* If we update tp->rcv_nxt, also update tp->bytes_received */ |
| 3465 | static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) |
| 3466 | { |
| 3467 | u32 delta = seq - tp->rcv_nxt; |
| 3468 | |
| 3469 | sock_owned_by_me((struct sock *)tp); |
| 3470 | tp->bytes_received += delta; |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3471 | WRITE_ONCE(tp->rcv_nxt, seq); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3472 | } |
| 3473 | |
| 3474 | /* Update our send window. |
| 3475 | * |
| 3476 |  * The window update algorithm described in RFC793/RFC1122 (used in |
| 3477 |  * linux-2.2 and in FreeBSD; NetBSD's is even worse) is wrong. |
| 3478 | */ |
| 3479 | static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack, |
| 3480 | u32 ack_seq) |
| 3481 | { |
| 3482 | struct tcp_sock *tp = tcp_sk(sk); |
| 3483 | int flag = 0; |
| 3484 | u32 nwin = ntohs(tcp_hdr(skb)->window); |
| 3485 | |
| 3486 | if (likely(!tcp_hdr(skb)->syn)) |
| 3487 | nwin <<= tp->rx_opt.snd_wscale; |
| 3488 | |
| 3489 | if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { |
| 3490 | flag |= FLAG_WIN_UPDATE; |
| 3491 | tcp_update_wl(tp, ack_seq); |
| 3492 | |
| 3493 | if (tp->snd_wnd != nwin) { |
| 3494 | tp->snd_wnd = nwin; |
| 3495 | |
| 3496 | 			/* Note: this is the only place where the |
| 3497 | 			 * fast path is recovered for the sending side. |
| 3498 | */ |
| 3499 | tp->pred_flags = 0; |
| 3500 | tcp_fast_path_check(sk); |
| 3501 | |
| 3502 | if (!tcp_write_queue_empty(sk)) |
| 3503 | tcp_slow_start_after_idle_check(sk); |
| 3504 | |
| 3505 | if (nwin > tp->max_window) { |
| 3506 | tp->max_window = nwin; |
| 3507 | tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie); |
| 3508 | } |
| 3509 | } |
| 3510 | } |
| 3511 | |
| 3512 | tcp_snd_una_update(tp, ack); |
| 3513 | |
| 3514 | return flag; |
| 3515 | } |
| 3516 | |
| 3517 | static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, |
| 3518 | u32 *last_oow_ack_time) |
| 3519 | { |
| 3520 | if (*last_oow_ack_time) { |
| 3521 | s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); |
| 3522 | |
| 3523 | if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) { |
| 3524 | NET_INC_STATS(net, mib_idx); |
| 3525 | return true; /* rate-limited: don't send yet! */ |
| 3526 | } |
| 3527 | } |
| 3528 | |
| 3529 | *last_oow_ack_time = tcp_jiffies32; |
| 3530 | |
| 3531 | return false; /* not rate-limited: go ahead, send dupack now! */ |
| 3532 | } |
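| | |
| | /* Example: with sysctl_tcp_invalid_ratelimit at its default of 500 ms, |
| |  * a second out-of-window reply triggered within half a second of the |
| |  * previous one only bumps the MIB counter and is suppressed. |
| |  */ |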
| 3533 | |
| 3534 | /* Return true if we're currently rate-limiting out-of-window ACKs and |
| 3535 | * thus shouldn't send a dupack right now. We rate-limit dupacks in |
| 3536 | * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS |
| 3537 | * attacks that send repeated SYNs or ACKs for the same connection. To |
| 3538 | * do this, we do not send a duplicate SYNACK or ACK if the remote |
| 3539 | * endpoint is sending out-of-window SYNs or pure ACKs at a high rate. |
| 3540 | */ |
| 3541 | bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, |
| 3542 | int mib_idx, u32 *last_oow_ack_time) |
| 3543 | { |
| 3544 | /* Data packets without SYNs are not likely part of an ACK loop. */ |
| 3545 | if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && |
| 3546 | !tcp_hdr(skb)->syn) |
| 3547 | return false; |
| 3548 | |
| 3549 | return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); |
| 3550 | } |
| 3551 | |
| 3552 | /* RFC 5961 7 [ACK Throttling] */ |
| 3553 | static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) |
| 3554 | { |
| 3555 | 	/* unprotected vars; we don't care about overwrites */ |
| 3556 | static u32 challenge_timestamp; |
| 3557 | static unsigned int challenge_count; |
| 3558 | struct tcp_sock *tp = tcp_sk(sk); |
| 3559 | struct net *net = sock_net(sk); |
| 3560 | u32 count, now; |
| 3561 | |
| 3562 | /* First check our per-socket dupack rate limit. */ |
| 3563 | if (__tcp_oow_rate_limited(net, |
| 3564 | LINUX_MIB_TCPACKSKIPPEDCHALLENGE, |
| 3565 | &tp->last_oow_ack_time)) |
| 3566 | return; |
| 3567 | |
| 3568 | /* Then check host-wide RFC 5961 rate limit. */ |
| 3569 | now = jiffies / HZ; |
| 3570 | if (now != challenge_timestamp) { |
| 3571 | u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit; |
| 3572 | u32 half = (ack_limit + 1) >> 1; |
| 3573 | |
| 3574 | challenge_timestamp = now; |
| 3575 | WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit)); |
| 3576 | } |
| 3577 | count = READ_ONCE(challenge_count); |
| 3578 | if (count > 0) { |
| 3579 | WRITE_ONCE(challenge_count, count - 1); |
| 3580 | NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); |
| 3581 | tcp_send_ack(sk); |
| 3582 | } |
| 3583 | } |
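| | |
| | /* The host-wide budget refreshes once per second and is randomized |
| |  * between half the limit and about 1.5x the limit so the exact counter |
| |  * cannot be probed; e.g. an ack_limit of 1000 yields roughly 500-1499 |
| |  * challenge ACKs per second across all sockets. |
| |  */ |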
| 3584 | |
| 3585 | static void tcp_store_ts_recent(struct tcp_sock *tp) |
| 3586 | { |
| 3587 | tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; |
| 3588 | tp->rx_opt.ts_recent_stamp = ktime_get_seconds(); |
| 3589 | } |
| 3590 | |
| 3591 | static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) |
| 3592 | { |
| 3593 | if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { |
| 3594 | 		/* PAWS bug workaround wrt. ACK frames: the extra PAWS |
| 3595 | 		 * discard check below makes sure this can only happen |
| 3596 | 		 * for pure ACK frames. -DaveM |
| 3597 | 		 * |
| 3598 | 		 * Not only that; it also occurs for expired timestamps. |
| 3599 | */ |
| 3600 | |
| 3601 | if (tcp_paws_check(&tp->rx_opt, 0)) |
| 3602 | tcp_store_ts_recent(tp); |
| 3603 | } |
| 3604 | } |
| 3605 | |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 3606 | /* This routine deals with acks during a TLP episode and ends an episode by |
| 3607 | * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3608 | */ |
| 3609 | static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) |
| 3610 | { |
| 3611 | struct tcp_sock *tp = tcp_sk(sk); |
| 3612 | |
| 3613 | if (before(ack, tp->tlp_high_seq)) |
| 3614 | return; |
| 3615 | |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 3616 | if (!tp->tlp_retrans) { |
| 3617 | /* TLP of new data has been acknowledged */ |
| 3618 | tp->tlp_high_seq = 0; |
| 3619 | } else if (flag & FLAG_DSACKING_ACK) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3620 | /* This DSACK means original and TLP probe arrived; no loss */ |
| 3621 | tp->tlp_high_seq = 0; |
| 3622 | } else if (after(ack, tp->tlp_high_seq)) { |
| 3623 | /* ACK advances: there was a loss, so reduce cwnd. Reset |
| 3624 | * tlp_high_seq in tcp_init_cwnd_reduction() |
| 3625 | */ |
| 3626 | tcp_init_cwnd_reduction(sk); |
| 3627 | tcp_set_ca_state(sk, TCP_CA_CWR); |
| 3628 | tcp_end_cwnd_reduction(sk); |
| 3629 | tcp_try_keep_open(sk); |
| 3630 | NET_INC_STATS(sock_net(sk), |
| 3631 | LINUX_MIB_TCPLOSSPROBERECOVERY); |
| 3632 | } else if (!(flag & (FLAG_SND_UNA_ADVANCED | |
| 3633 | FLAG_NOT_DUP | FLAG_DATA_SACKED))) { |
| 3634 | /* Pure dupack: original and TLP probe arrived; no loss */ |
| 3635 | tp->tlp_high_seq = 0; |
| 3636 | } |
| 3637 | } |
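| | |
| | /* Outcome summary: an acked new-data probe, a DSACK for the probe, or |
| |  * a pure dupack all mean the TLP episode found no loss and it simply |
| |  * ends; an ACK advancing beyond tlp_high_seq means the probe repaired |
| |  * a real loss, so cwnd is reduced once via the CWR path above. |
| |  */ |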
| 3638 | |
| 3639 | static inline void tcp_in_ack_event(struct sock *sk, u32 flags) |
| 3640 | { |
| 3641 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 3642 | |
| 3643 | if (icsk->icsk_ca_ops->in_ack_event) |
| 3644 | icsk->icsk_ca_ops->in_ack_event(sk, flags); |
| 3645 | } |
| 3646 | |
| 3647 | /* Congestion control has updated the cwnd already. So if we're in |
| 3648 | * loss recovery then now we do any new sends (for FRTO) or |
| 3649 |  * retransmits (for CA_Loss or CA_Recovery) that make sense. |
| 3650 | */ |
| 3651 | static void tcp_xmit_recovery(struct sock *sk, int rexmit) |
| 3652 | { |
| 3653 | struct tcp_sock *tp = tcp_sk(sk); |
| 3654 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3655 | if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3656 | return; |
| 3657 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3658 | if (unlikely(rexmit == REXMIT_NEW)) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3659 | __tcp_push_pending_frames(sk, tcp_current_mss(sk), |
| 3660 | TCP_NAGLE_OFF); |
| 3661 | if (after(tp->snd_nxt, tp->high_seq)) |
| 3662 | return; |
| 3663 | tp->frto = 0; |
| 3664 | } |
| 3665 | tcp_xmit_retransmit_queue(sk); |
| 3666 | } |
| 3667 | |
| 3668 | /* Returns the number of packets newly acked or sacked by the current ACK */ |
| 3669 | static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) |
| 3670 | { |
| 3671 | const struct net *net = sock_net(sk); |
| 3672 | struct tcp_sock *tp = tcp_sk(sk); |
| 3673 | u32 delivered; |
| 3674 | |
| 3675 | delivered = tp->delivered - prior_delivered; |
| 3676 | NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3677 | if (flag & FLAG_ECE) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3678 | NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3679 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3680 | return delivered; |
| 3681 | } |
| 3682 | |
| 3683 | /* This routine deals with incoming acks, but not outgoing ones. */ |
| 3684 | static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) |
| 3685 | { |
| 3686 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 3687 | struct tcp_sock *tp = tcp_sk(sk); |
| 3688 | struct tcp_sacktag_state sack_state; |
| 3689 | struct rate_sample rs = { .prior_delivered = 0 }; |
| 3690 | u32 prior_snd_una = tp->snd_una; |
| 3691 | bool is_sack_reneg = tp->is_sack_reneg; |
| 3692 | u32 ack_seq = TCP_SKB_CB(skb)->seq; |
| 3693 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3694 | int num_dupack = 0; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3695 | int prior_packets = tp->packets_out; |
| 3696 | u32 delivered = tp->delivered; |
| 3697 | u32 lost = tp->lost; |
| 3698 | int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ |
| 3699 | u32 prior_fack; |
| 3700 | |
| 3701 | sack_state.first_sackt = 0; |
| 3702 | sack_state.rate = &rs; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3703 | sack_state.sack_delivered = 0; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3704 | |
| 3705 | /* We very likely will need to access rtx queue. */ |
| 3706 | prefetch(sk->tcp_rtx_queue.rb_node); |
| 3707 | |
| 3708 | /* If the ack is older than previous acks |
| 3709 | * then we can probably ignore it. |
| 3710 | */ |
| 3711 | if (before(ack, prior_snd_una)) { |
| 3712 | /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ |
| 3713 | if (before(ack, prior_snd_una - tp->max_window)) { |
| 3714 | if (!(flag & FLAG_NO_CHALLENGE_ACK)) |
| 3715 | tcp_send_challenge_ack(sk, skb); |
| 3716 | return -1; |
| 3717 | } |
| 3718 | goto old_ack; |
| 3719 | } |
| 3720 | |
| 3721 | /* If the ack includes data we haven't sent yet, discard |
| 3722 | * this segment (RFC793 Section 3.9). |
| 3723 | */ |
| 3724 | if (after(ack, tp->snd_nxt)) |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3725 | return -1; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3726 | |
| 3727 | if (after(ack, prior_snd_una)) { |
| 3728 | flag |= FLAG_SND_UNA_ADVANCED; |
| 3729 | icsk->icsk_retransmits = 0; |
| 3730 | |
| 3731 | #if IS_ENABLED(CONFIG_TLS_DEVICE) |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3732 | if (static_branch_unlikely(&clean_acked_data_enabled.key)) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3733 | if (icsk->icsk_clean_acked) |
| 3734 | icsk->icsk_clean_acked(sk, ack); |
| 3735 | #endif |
| 3736 | } |
| 3737 | |
| 3738 | prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; |
| 3739 | rs.prior_in_flight = tcp_packets_in_flight(tp); |
| 3740 | |
| 3741 | /* ts_recent update must be made after we are sure that the packet |
| 3742 | * is in window. |
| 3743 | */ |
| 3744 | if (flag & FLAG_UPDATE_TS_RECENT) |
| 3745 | tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); |
| 3746 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3747 | if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) == |
| 3748 | FLAG_SND_UNA_ADVANCED) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3749 | /* Window is constant, pure forward advance. |
| 3750 | * No more checks are required. |
| 3751 | * Note, we use the fact that SND.UNA>=SND.WL2. |
| 3752 | */ |
| 3753 | tcp_update_wl(tp, ack_seq); |
| 3754 | tcp_snd_una_update(tp, ack); |
| 3755 | flag |= FLAG_WIN_UPDATE; |
| 3756 | |
| 3757 | tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); |
| 3758 | |
| 3759 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS); |
| 3760 | } else { |
| 3761 | u32 ack_ev_flags = CA_ACK_SLOWPATH; |
| 3762 | |
| 3763 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) |
| 3764 | flag |= FLAG_DATA; |
| 3765 | else |
| 3766 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS); |
| 3767 | |
| 3768 | flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); |
| 3769 | |
| 3770 | if (TCP_SKB_CB(skb)->sacked) |
| 3771 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
| 3772 | &sack_state); |
| 3773 | |
| 3774 | if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { |
| 3775 | flag |= FLAG_ECE; |
| 3776 | ack_ev_flags |= CA_ACK_ECE; |
| 3777 | } |
| 3778 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3779 | if (sack_state.sack_delivered) |
| 3780 | tcp_count_delivered(tp, sack_state.sack_delivered, |
| 3781 | flag & FLAG_ECE); |
| 3782 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3783 | if (flag & FLAG_WIN_UPDATE) |
| 3784 | ack_ev_flags |= CA_ACK_WIN_UPDATE; |
| 3785 | |
| 3786 | tcp_in_ack_event(sk, ack_ev_flags); |
| 3787 | } |
| 3788 | |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 3789 | /* This is a deviation from RFC3168 since it states that: |
| 3790 | * "When the TCP data sender is ready to set the CWR bit after reducing |
| 3791 | * the congestion window, it SHOULD set the CWR bit only on the first |
| 3792 | * new data packet that it transmits." |
| 3793 | * We accept CWR on pure ACKs to be more robust |
| 3794 | * with widely-deployed TCP implementations that do this. |
| 3795 | */ |
| 3796 | tcp_ecn_accept_cwr(sk, skb); |
| 3797 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3798 | /* We passed data and got it acked, remove any soft error |
| 3799 | * log. Something worked... |
| 3800 | */ |
| 3801 | sk->sk_err_soft = 0; |
| 3802 | icsk->icsk_probes_out = 0; |
| 3803 | tp->rcv_tstamp = tcp_jiffies32; |
| 3804 | if (!prior_packets) |
| 3805 | goto no_queue; |
| 3806 | |
| 3807 | /* See if we can take anything off of the retransmit queue. */ |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3808 | flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state, |
| 3809 | flag & FLAG_ECE); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3810 | |
| 3811 | tcp_rack_update_reo_wnd(sk, &rs); |
| 3812 | |
| 3813 | if (tp->tlp_high_seq) |
| 3814 | tcp_process_tlp_ack(sk, ack, flag); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3815 | |
| 3816 | if (tcp_ack_is_dubious(sk, flag)) { |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3817 | if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) { |
| 3818 | num_dupack = 1; |
| 3819 | /* Consider if pure acks were aggregated in tcp_add_backlog() */ |
| 3820 | if (!(flag & FLAG_DATA)) |
| 3821 | num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs); |
| 3822 | } |
| 3823 | tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3824 | &rexmit); |
| 3825 | } |
| 3826 | |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 3827 | /* If needed, reset TLP/RTO timer when RACK doesn't set. */ |
| 3828 | if (flag & FLAG_SET_XMIT_TIMER) |
| 3829 | tcp_set_xmit_timer(sk); |
| 3830 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3831 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) |
| 3832 | sk_dst_confirm(sk); |
| 3833 | |
| 3834 | delivered = tcp_newly_delivered(sk, delivered, flag); |
| 3835 | lost = tp->lost - lost; /* freshly marked lost */ |
| 3836 | rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); |
| 3837 | tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); |
| 3838 | tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); |
| 3839 | tcp_xmit_recovery(sk, rexmit); |
| 3840 | return 1; |
| 3841 | |
| 3842 | no_queue: |
| 3843 | /* If data was DSACKed, see if we can undo a cwnd reduction. */ |
| 3844 | if (flag & FLAG_DSACKING_ACK) { |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3845 | tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3846 | &rexmit); |
| 3847 | tcp_newly_delivered(sk, delivered, flag); |
| 3848 | } |
| 3849 | /* If this ack opens up a zero window, clear backoff. It was |
| 3850 | * being used to time the probes, and is probably far higher than |
| 3851 | * it needs to be for normal retransmission. |
| 3852 | */ |
| 3853 | tcp_ack_probe(sk); |
| 3854 | |
| 3855 | if (tp->tlp_high_seq) |
| 3856 | tcp_process_tlp_ack(sk, ack, flag); |
| 3857 | return 1; |
| 3858 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3859 | old_ack: |
| 3860 | /* If data was SACKed, tag it and see if we should send more data. |
| 3861 | * If data was DSACKed, see if we can undo a cwnd reduction. |
| 3862 | */ |
| 3863 | if (TCP_SKB_CB(skb)->sacked) { |
| 3864 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
| 3865 | &sack_state); |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3866 | tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3867 | &rexmit); |
| 3868 | tcp_newly_delivered(sk, delivered, flag); |
| 3869 | tcp_xmit_recovery(sk, rexmit); |
| 3870 | } |
| 3871 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3872 | return 0; |
| 3873 | } |
| 3874 | |
| 3875 | static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, |
| 3876 | bool syn, struct tcp_fastopen_cookie *foc, |
| 3877 | bool exp_opt) |
| 3878 | { |
| 3879 | /* Valid only in SYN or SYN-ACK with an even length. */ |
| 3880 | if (!foc || !syn || len < 0 || (len & 1)) |
| 3881 | return; |
| 3882 | |
| 3883 | if (len >= TCP_FASTOPEN_COOKIE_MIN && |
| 3884 | len <= TCP_FASTOPEN_COOKIE_MAX) |
| 3885 | memcpy(foc->val, cookie, len); |
| 3886 | else if (len != 0) |
| 3887 | len = -1; |
| 3888 | foc->len = len; |
| 3889 | foc->exp = exp_opt; |
| 3890 | } |
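|  |  |
|  | /* Worked example for tcp_parse_fastopen_option() above (illustrative, |
|  |  * not from the original source; TCP_FASTOPEN_COOKIE_MIN/MAX are 4 and |
|  |  * 16 bytes elsewhere in the tree): an 8-byte cookie is copied as-is, a |
|  |  * 2-byte one yields foc->len == -1, and len == 0 records an empty cookie. |
|  |  */ |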
| 3891 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3892 | static bool smc_parse_options(const struct tcphdr *th, |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3893 | struct tcp_options_received *opt_rx, |
| 3894 | const unsigned char *ptr, |
| 3895 | int opsize) |
| 3896 | { |
| 3897 | #if IS_ENABLED(CONFIG_SMC) |
| 3898 | if (static_branch_unlikely(&tcp_have_smc)) { |
| 3899 | if (th->syn && !(opsize & 1) && |
| 3900 | opsize >= TCPOLEN_EXP_SMC_BASE && |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3901 | get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3902 | opt_rx->smc_ok = 1; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3903 | return true; |
| 3904 | } |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3905 | } |
| 3906 | #endif |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3907 | return false; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3908 | } |
| 3909 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3910 | /* Try to parse the MSS option from the TCP header. Return 0 on failure, |
|  | 3911 |  * or the clamped MSS value on success. |
| 3912 | */ |
| 3913 | static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) |
| 3914 | { |
| 3915 | const unsigned char *ptr = (const unsigned char *)(th + 1); |
| 3916 | int length = (th->doff * 4) - sizeof(struct tcphdr); |
| 3917 | u16 mss = 0; |
| 3918 | |
| 3919 | while (length > 0) { |
| 3920 | int opcode = *ptr++; |
| 3921 | int opsize; |
| 3922 | |
| 3923 | switch (opcode) { |
| 3924 | case TCPOPT_EOL: |
| 3925 | return mss; |
| 3926 | case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ |
| 3927 | length--; |
| 3928 | continue; |
| 3929 | default: |
| 3930 | if (length < 2) |
| 3931 | return mss; |
| 3932 | opsize = *ptr++; |
| 3933 | if (opsize < 2) /* "silly options" */ |
| 3934 | return mss; |
| 3935 | if (opsize > length) |
| 3936 | return mss; /* fail on partial options */ |
| 3937 | if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) { |
| 3938 | u16 in_mss = get_unaligned_be16(ptr); |
| 3939 | |
| 3940 | if (in_mss) { |
| 3941 | if (user_mss && user_mss < in_mss) |
| 3942 | in_mss = user_mss; |
| 3943 | mss = in_mss; |
| 3944 | } |
| 3945 | } |
| 3946 | ptr += opsize - 2; |
| 3947 | length -= opsize; |
| 3948 | } |
| 3949 | } |
| 3950 | return mss; |
| 3951 | } |
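|  |  |
|  | /* Worked example for tcp_parse_mss_option() above (illustrative, not |
|  |  * from the original source): option bytes 02 04 05 b4 encode kind 2 |
|  |  * (MSS), length 4 and value 0x05b4 == 1460. With user_mss == 1400 the |
|  |  * result is clamped to 1400; with user_mss == 0 it stays 1460. |
|  |  */ |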
| 3952 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3953 | /* Look for tcp options. Normally only called on SYN and SYNACK packets. |
| 3954 |  * But this can also be called on packets in the established flow when |
| 3955 | * the fast version below fails. |
| 3956 | */ |
| 3957 | void tcp_parse_options(const struct net *net, |
| 3958 | const struct sk_buff *skb, |
| 3959 | struct tcp_options_received *opt_rx, int estab, |
| 3960 | struct tcp_fastopen_cookie *foc) |
| 3961 | { |
| 3962 | const unsigned char *ptr; |
| 3963 | const struct tcphdr *th = tcp_hdr(skb); |
| 3964 | int length = (th->doff * 4) - sizeof(struct tcphdr); |
| 3965 | |
| 3966 | ptr = (const unsigned char *)(th + 1); |
| 3967 | opt_rx->saw_tstamp = 0; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3968 | opt_rx->saw_unknown = 0; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3969 | |
| 3970 | while (length > 0) { |
| 3971 | int opcode = *ptr++; |
| 3972 | int opsize; |
| 3973 | |
| 3974 | switch (opcode) { |
| 3975 | case TCPOPT_EOL: |
| 3976 | return; |
| 3977 | case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ |
| 3978 | length--; |
| 3979 | continue; |
| 3980 | default: |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3981 | if (length < 2) |
| 3982 | return; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3983 | opsize = *ptr++; |
| 3984 | if (opsize < 2) /* "silly options" */ |
| 3985 | return; |
| 3986 | if (opsize > length) |
| 3987 | return; /* don't parse partial options */ |
| 3988 | switch (opcode) { |
| 3989 | case TCPOPT_MSS: |
| 3990 | if (opsize == TCPOLEN_MSS && th->syn && !estab) { |
| 3991 | u16 in_mss = get_unaligned_be16(ptr); |
| 3992 | if (in_mss) { |
| 3993 | if (opt_rx->user_mss && |
| 3994 | opt_rx->user_mss < in_mss) |
| 3995 | in_mss = opt_rx->user_mss; |
| 3996 | opt_rx->mss_clamp = in_mss; |
| 3997 | } |
| 3998 | } |
| 3999 | break; |
| 4000 | case TCPOPT_WINDOW: |
| 4001 | if (opsize == TCPOLEN_WINDOW && th->syn && |
| 4002 | !estab && net->ipv4.sysctl_tcp_window_scaling) { |
| 4003 | __u8 snd_wscale = *(__u8 *)ptr; |
| 4004 | opt_rx->wscale_ok = 1; |
| 4005 | if (snd_wscale > TCP_MAX_WSCALE) { |
| 4006 | net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n", |
| 4007 | __func__, |
| 4008 | snd_wscale, |
| 4009 | TCP_MAX_WSCALE); |
| 4010 | snd_wscale = TCP_MAX_WSCALE; |
| 4011 | } |
| 4012 | opt_rx->snd_wscale = snd_wscale; |
| 4013 | } |
| 4014 | break; |
| 4015 | case TCPOPT_TIMESTAMP: |
| 4016 | if ((opsize == TCPOLEN_TIMESTAMP) && |
| 4017 | ((estab && opt_rx->tstamp_ok) || |
| 4018 | (!estab && net->ipv4.sysctl_tcp_timestamps))) { |
| 4019 | opt_rx->saw_tstamp = 1; |
| 4020 | opt_rx->rcv_tsval = get_unaligned_be32(ptr); |
| 4021 | opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4); |
| 4022 | } |
| 4023 | break; |
| 4024 | case TCPOPT_SACK_PERM: |
| 4025 | if (opsize == TCPOLEN_SACK_PERM && th->syn && |
| 4026 | !estab && net->ipv4.sysctl_tcp_sack) { |
| 4027 | opt_rx->sack_ok = TCP_SACK_SEEN; |
| 4028 | tcp_sack_reset(opt_rx); |
| 4029 | } |
| 4030 | break; |
| 4031 | |
| 4032 | case TCPOPT_SACK: |
| 4033 | if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && |
| 4034 | !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && |
| 4035 | opt_rx->sack_ok) { |
| 4036 | TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; |
| 4037 | } |
| 4038 | break; |
| 4039 | #ifdef CONFIG_TCP_MD5SIG |
| 4040 | case TCPOPT_MD5SIG: |
| 4041 | /* |
| 4042 | * The MD5 Hash has already been |
| 4043 | * checked (see tcp_v{4,6}_do_rcv()). |
| 4044 | */ |
| 4045 | break; |
| 4046 | #endif |
| 4047 | case TCPOPT_FASTOPEN: |
| 4048 | tcp_parse_fastopen_option( |
| 4049 | opsize - TCPOLEN_FASTOPEN_BASE, |
| 4050 | ptr, th->syn, foc, false); |
| 4051 | break; |
| 4052 | |
| 4053 | case TCPOPT_EXP: |
| 4054 | 			/* Fast Open option shares code 254 using a |
| 4055 | 			 * 16-bit magic number. |
| 4056 | */ |
| 4057 | if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE && |
| 4058 | get_unaligned_be16(ptr) == |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 4059 | TCPOPT_FASTOPEN_MAGIC) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4060 | tcp_parse_fastopen_option(opsize - |
| 4061 | TCPOLEN_EXP_FASTOPEN_BASE, |
| 4062 | ptr + 2, th->syn, foc, true); |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 4063 | break; |
| 4064 | } |
| 4065 | |
| 4066 | if (smc_parse_options(th, opt_rx, ptr, opsize)) |
| 4067 | break; |
| 4068 | |
| 4069 | opt_rx->saw_unknown = 1; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4070 | break; |
| 4071 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 4072 | default: |
| 4073 | opt_rx->saw_unknown = 1; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4074 | } |
| 4075 | ptr += opsize-2; |
| 4076 | length -= opsize; |
| 4077 | } |
| 4078 | } |
| 4079 | } |
| 4080 | EXPORT_SYMBOL(tcp_parse_options); |
| 4081 | |
| 4082 | static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th) |
| 4083 | { |
| 4084 | const __be32 *ptr = (const __be32 *)(th + 1); |
| 4085 | |
| 4086 | if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
| 4087 | | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { |
| 4088 | tp->rx_opt.saw_tstamp = 1; |
| 4089 | ++ptr; |
| 4090 | tp->rx_opt.rcv_tsval = ntohl(*ptr); |
| 4091 | ++ptr; |
| 4092 | if (*ptr) |
| 4093 | tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; |
| 4094 | else |
| 4095 | tp->rx_opt.rcv_tsecr = 0; |
| 4096 | return true; |
| 4097 | } |
| 4098 | return false; |
| 4099 | } |
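|  |  |
|  | /* Note on the magic word above (illustrative, not from the original |
|  |  * source): with TCPOPT_NOP == 1, TCPOPT_TIMESTAMP == 8 and |
|  |  * TCPOLEN_TIMESTAMP == 10, the constant is 0x0101080a, i.e. the |
|  |  * canonical on-the-wire pattern 01 01 08 0a of an aligned timestamp |
|  |  * option. |
|  |  */ |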
| 4100 | |
| 4101 | /* Fast parse options. This hopes to only see timestamps. |
| 4102 | * If it is wrong it falls back on tcp_parse_options(). |
| 4103 | */ |
| 4104 | static bool tcp_fast_parse_options(const struct net *net, |
| 4105 | const struct sk_buff *skb, |
| 4106 | const struct tcphdr *th, struct tcp_sock *tp) |
| 4107 | { |
| 4108 | /* In the spirit of fast parsing, compare doff directly to constant |
| 4109 | * values. Because equality is used, short doff can be ignored here. |
| 4110 | */ |
| 4111 | if (th->doff == (sizeof(*th) / 4)) { |
| 4112 | tp->rx_opt.saw_tstamp = 0; |
| 4113 | return false; |
| 4114 | } else if (tp->rx_opt.tstamp_ok && |
| 4115 | th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { |
| 4116 | if (tcp_parse_aligned_timestamp(tp, th)) |
| 4117 | return true; |
| 4118 | } |
| 4119 | |
| 4120 | tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); |
| 4121 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) |
| 4122 | tp->rx_opt.rcv_tsecr -= tp->tsoffset; |
| 4123 | |
| 4124 | return true; |
| 4125 | } |
| 4126 | |
| 4127 | #ifdef CONFIG_TCP_MD5SIG |
| 4128 | /* |
| 4129 | * Parse MD5 Signature option |
| 4130 | */ |
| 4131 | const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) |
| 4132 | { |
| 4133 | int length = (th->doff << 2) - sizeof(*th); |
| 4134 | const u8 *ptr = (const u8 *)(th + 1); |
| 4135 | |
| 4136 | 	/* If not enough data remains, we can stop early */ |
| 4137 | while (length >= TCPOLEN_MD5SIG) { |
| 4138 | int opcode = *ptr++; |
| 4139 | int opsize; |
| 4140 | |
| 4141 | switch (opcode) { |
| 4142 | case TCPOPT_EOL: |
| 4143 | return NULL; |
| 4144 | case TCPOPT_NOP: |
| 4145 | length--; |
| 4146 | continue; |
| 4147 | default: |
| 4148 | opsize = *ptr++; |
| 4149 | if (opsize < 2 || opsize > length) |
| 4150 | return NULL; |
| 4151 | if (opcode == TCPOPT_MD5SIG) |
| 4152 | return opsize == TCPOLEN_MD5SIG ? ptr : NULL; |
| 4153 | } |
| 4154 | ptr += opsize - 2; |
| 4155 | length -= opsize; |
| 4156 | } |
| 4157 | return NULL; |
| 4158 | } |
| 4159 | EXPORT_SYMBOL(tcp_parse_md5sig_option); |
| 4160 | #endif |
| 4161 | |
| 4162 | /* Sorry, PAWS as specified is broken wrt. pure ACKs -DaveM |
| 4163 |  * |
| 4164 |  * It is not fatal. If this ACK does _not_ change critical state (seqs, window) |
| 4165 |  * it can pass through the stack. So, the following predicate verifies that |
| 4166 |  * this segment is not used for anything but congestion avoidance or |
| 4167 |  * fast retransmit. Moreover, we are even able to eliminate most such |
| 4168 |  * second-order effects, if we apply a small "replay" window (~RTO) |
| 4169 |  * to the timestamp space. |
| 4170 |  * |
| 4171 |  * All these measures still do not guarantee that we reject wrapped ACKs |
| 4172 |  * on high-bandwidth networks, where sequence space is recycled quickly, |
| 4173 |  * but they do guarantee that such events will be very rare and will not |
| 4174 |  * seriously affect the connection. This doesn't look nice, but alas, PAWS |
| 4175 |  * is a really buggy extension. |
| 4176 |  * |
| 4177 |  * [ Later note. Even worse! It is buggy for segments _with_ data. The RFC |
| 4178 |  * states that events where a retransmit arrives after the original data are |
| 4179 |  * rare. That is a blatant lie. VJ forgot about fast retransmit! 8)8) It is |
| 4180 |  * the biggest problem on large high-speed networks even with minor |
| 4181 |  * reordering. OK, let's give it a small replay window. If the peer clock |
| 4182 |  * ticks at even 1 Hz, that is safe up to a bandwidth of 18 Gbit/sec. 8) ] |
| 4183 |  */ |
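|  |  |
|  | /* Back-of-envelope for the 18 Gbit/sec figure above (illustrative, not |
|  |  * in the original source): sequence-number comparisons stay unambiguous |
|  |  * while fewer than 2^31 bytes are sent per timestamp tick. At a 1 Hz |
|  |  * peer clock that is 2^31 bytes/sec, i.e. roughly 17-18 Gbit/sec. |
|  |  */ |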
| 4184 | |
| 4185 | static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) |
| 4186 | { |
| 4187 | const struct tcp_sock *tp = tcp_sk(sk); |
| 4188 | const struct tcphdr *th = tcp_hdr(skb); |
| 4189 | u32 seq = TCP_SKB_CB(skb)->seq; |
| 4190 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
| 4191 | |
| 4192 | return (/* 1. Pure ACK with correct sequence number. */ |
| 4193 | (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) && |
| 4194 | |
| 4195 | /* 2. ... and duplicate ACK. */ |
| 4196 | ack == tp->snd_una && |
| 4197 | |
| 4198 | /* 3. ... and does not update window. */ |
| 4199 | !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && |
| 4200 | |
| 4201 | /* 4. ... and sits in replay window. */ |
| 4202 | (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); |
| 4203 | } |
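|  |  |
|  | /* Note on the replay-window test above (an assumption, not from the |
|  |  * original source): icsk_rto is in jiffies while timestamps tick about |
|  |  * once per millisecond, so (icsk_rto * 1024) / HZ approximates the RTO |
|  |  * in timestamp units, 1024 standing in for 1000 to keep the math cheap. |
|  |  */ |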
| 4204 | |
| 4205 | static inline bool tcp_paws_discard(const struct sock *sk, |
| 4206 | const struct sk_buff *skb) |
| 4207 | { |
| 4208 | const struct tcp_sock *tp = tcp_sk(sk); |
| 4209 | |
| 4210 | return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) && |
| 4211 | !tcp_disordered_ack(sk, skb); |
| 4212 | } |
| 4213 | |
| 4214 | /* Check segment sequence number for validity. |
| 4215 | * |
| 4216 |  * Segment controls are considered valid if the segment, |
| 4217 |  * after truncation to the window, fits the window. Acceptability |
| 4218 |  * of data (and SYN, FIN, of course) is checked separately. |
| 4219 |  * See tcp_data_queue(), for example. |
| 4220 |  * |
| 4221 |  * Also, controls (RST being the main one) are accepted using RCV.WUP |
| 4222 |  * instead of RCV.NXT. The peer may not have advanced its SND.UNA while |
| 4223 |  * we delayed the ACK, so its SND.UNA <= our RCV.WUP. |
| 4224 | * (borrowed from freebsd) |
| 4225 | */ |
| 4226 | |
| 4227 | static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) |
| 4228 | { |
| 4229 | return !before(end_seq, tp->rcv_wup) && |
| 4230 | !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); |
| 4231 | } |
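|  |  |
|  | /* Worked example for tcp_sequence() above (illustrative numbers, not |
|  |  * from the original source): with rcv_wup == 1000, rcv_nxt == 1500 and |
|  |  * a 500-byte receive window, any segment with end_seq >= 1000 and |
|  |  * seq <= 2000 is acceptable; a fully old segment 900..950 fails the |
|  |  * first test and is rejected. |
|  |  */ |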
| 4232 | |
| 4233 | /* When we get a reset we do this. */ |
| 4234 | void tcp_reset(struct sock *sk) |
| 4235 | { |
| 4236 | trace_tcp_receive_reset(sk); |
| 4237 | |
| 4238 | /* We want the right error as BSD sees it (and indeed as we do). */ |
| 4239 | switch (sk->sk_state) { |
| 4240 | case TCP_SYN_SENT: |
| 4241 | sk->sk_err = ECONNREFUSED; |
| 4242 | break; |
| 4243 | case TCP_CLOSE_WAIT: |
| 4244 | sk->sk_err = EPIPE; |
| 4245 | break; |
| 4246 | case TCP_CLOSE: |
| 4247 | return; |
| 4248 | default: |
| 4249 | sk->sk_err = ECONNRESET; |
| 4250 | } |
| 4251 | /* This barrier is coupled with smp_rmb() in tcp_poll() */ |
| 4252 | smp_wmb(); |
| 4253 | |
| 4254 | tcp_write_queue_purge(sk); |
| 4255 | tcp_done(sk); |
| 4256 | |
| 4257 | if (!sock_flag(sk, SOCK_DEAD)) |
| 4258 | sk->sk_error_report(sk); |
| 4259 | } |
| 4260 | |
| 4261 | /* |
| 4262 |  * Process the FIN bit. This now behaves as it is supposed to work |
| 4263 |  * and the FIN takes effect only when it is validly part of the |
| 4264 |  * sequence space; not earlier, while we still have holes. |
| 4265 |  * |
| 4266 |  * If we are ESTABLISHED, a received FIN moves us to CLOSE-WAIT |
| 4267 |  * (and thence onto LAST-ACK and finally CLOSE; we never enter |
| 4268 |  * TIME-WAIT). |
| 4269 | * |
| 4270 | * If we are in FINWAIT-1, a received FIN indicates simultaneous |
| 4271 | * close and we go into CLOSING (and later onto TIME-WAIT) |
| 4272 | * |
| 4273 | * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. |
| 4274 | */ |
| 4275 | void tcp_fin(struct sock *sk) |
| 4276 | { |
| 4277 | struct tcp_sock *tp = tcp_sk(sk); |
| 4278 | |
| 4279 | inet_csk_schedule_ack(sk); |
| 4280 | |
| 4281 | sk->sk_shutdown |= RCV_SHUTDOWN; |
| 4282 | sock_set_flag(sk, SOCK_DONE); |
| 4283 | |
| 4284 | switch (sk->sk_state) { |
| 4285 | case TCP_SYN_RECV: |
| 4286 | case TCP_ESTABLISHED: |
| 4287 | /* Move to CLOSE_WAIT */ |
| 4288 | tcp_set_state(sk, TCP_CLOSE_WAIT); |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 4289 | inet_csk_enter_pingpong_mode(sk); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4290 | break; |
| 4291 | |
| 4292 | case TCP_CLOSE_WAIT: |
| 4293 | case TCP_CLOSING: |
| 4294 | /* Received a retransmission of the FIN, do |
| 4295 | * nothing. |
| 4296 | */ |
| 4297 | break; |
| 4298 | case TCP_LAST_ACK: |
| 4299 | /* RFC793: Remain in the LAST-ACK state. */ |
| 4300 | break; |
| 4301 | |
| 4302 | case TCP_FIN_WAIT1: |
| 4303 | 		/* This case occurs when a simultaneous close |
| 4304 | 		 * happens; we must ACK the received FIN and |
| 4305 | 		 * enter the CLOSING state. |
| 4306 | */ |
| 4307 | tcp_send_ack(sk); |
| 4308 | tcp_set_state(sk, TCP_CLOSING); |
| 4309 | break; |
| 4310 | case TCP_FIN_WAIT2: |
| 4311 | /* Received a FIN -- send ACK and enter TIME_WAIT. */ |
| 4312 | tcp_send_ack(sk); |
| 4313 | tcp_time_wait(sk, TCP_TIME_WAIT, 0); |
| 4314 | break; |
| 4315 | default: |
| 4316 | /* Only TCP_LISTEN and TCP_CLOSE are left, in these |
| 4317 | * cases we should never reach this piece of code. |
| 4318 | */ |
| 4319 | pr_err("%s: Impossible, sk->sk_state=%d\n", |
| 4320 | __func__, sk->sk_state); |
| 4321 | break; |
| 4322 | } |
| 4323 | |
| 4324 | 	/* It _is_ possible that we have something out-of-order _after_ the FIN. |
| 4325 | 	 * Probably we should reset in this case; for now, drop it. |
| 4326 | */ |
| 4327 | skb_rbtree_purge(&tp->out_of_order_queue); |
| 4328 | if (tcp_is_sack(tp)) |
| 4329 | tcp_sack_reset(&tp->rx_opt); |
| 4330 | sk_mem_reclaim(sk); |
| 4331 | |
| 4332 | if (!sock_flag(sk, SOCK_DEAD)) { |
| 4333 | sk->sk_state_change(sk); |
| 4334 | |
| 4335 | 		/* Do not send POLL_HUP for half-duplex close. */ |
| 4336 | if (sk->sk_shutdown == SHUTDOWN_MASK || |
| 4337 | sk->sk_state == TCP_CLOSE) |
| 4338 | sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); |
| 4339 | else |
| 4340 | sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); |
| 4341 | } |
| 4342 | } |
| 4343 | |
| 4344 | static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, |
| 4345 | u32 end_seq) |
| 4346 | { |
| 4347 | if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { |
| 4348 | if (before(seq, sp->start_seq)) |
| 4349 | sp->start_seq = seq; |
| 4350 | if (after(end_seq, sp->end_seq)) |
| 4351 | sp->end_seq = end_seq; |
| 4352 | return true; |
| 4353 | } |
| 4354 | return false; |
| 4355 | } |
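|  |  |
|  | /* Worked example for tcp_sack_extend() above (illustrative numbers, not |
|  |  * from the original source): extending an existing block [100,200) with |
|  |  * [150,250) overlaps, so the block grows to [100,250) and true is |
|  |  * returned; a disjoint [300,400) leaves it untouched and returns false. |
|  |  */ |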
| 4356 | |
| 4357 | static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) |
| 4358 | { |
| 4359 | struct tcp_sock *tp = tcp_sk(sk); |
| 4360 | |
| 4361 | if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { |
| 4362 | int mib_idx; |
| 4363 | |
| 4364 | if (before(seq, tp->rcv_nxt)) |
| 4365 | mib_idx = LINUX_MIB_TCPDSACKOLDSENT; |
| 4366 | else |
| 4367 | mib_idx = LINUX_MIB_TCPDSACKOFOSENT; |
| 4368 | |
| 4369 | NET_INC_STATS(sock_net(sk), mib_idx); |
| 4370 | |
| 4371 | tp->rx_opt.dsack = 1; |
| 4372 | tp->duplicate_sack[0].start_seq = seq; |
| 4373 | tp->duplicate_sack[0].end_seq = end_seq; |
| 4374 | } |
| 4375 | } |
| 4376 | |
| 4377 | static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq) |
| 4378 | { |
| 4379 | struct tcp_sock *tp = tcp_sk(sk); |
| 4380 | |
| 4381 | if (!tp->rx_opt.dsack) |
| 4382 | tcp_dsack_set(sk, seq, end_seq); |
| 4383 | else |
| 4384 | tcp_sack_extend(tp->duplicate_sack, seq, end_seq); |
| 4385 | } |
| 4386 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 4387 | static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) |
| 4388 | { |
| 4389 | 	/* When the ACK path fails or drops most ACKs, the sender will |
| 4390 | 	 * time out and spuriously retransmit the same segment repeatedly. |
| 4391 | * The receiver remembers and reflects via DSACKs. Leverage the |
| 4392 | * DSACK state and change the txhash to re-route speculatively. |
| 4393 | */ |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 4394 | if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq && |
| 4395 | sk_rethink_txhash(sk)) |
| 4396 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 4397 | } |
| 4398 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4399 | static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) |
| 4400 | { |
| 4401 | struct tcp_sock *tp = tcp_sk(sk); |
| 4402 | |
| 4403 | if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && |
| 4404 | before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { |
| 4405 | NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); |
| 4406 | tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); |
| 4407 | |
| 4408 | if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { |
| 4409 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; |
| 4410 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 4411 | tcp_rcv_spurious_retrans(sk, skb); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4412 | if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) |
| 4413 | end_seq = tp->rcv_nxt; |
| 4414 | tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq); |
| 4415 | } |
| 4416 | } |
| 4417 | |
| 4418 | tcp_send_ack(sk); |
| 4419 | } |
| 4420 | |
| 4421 | /* These routines update the SACK block as out-of-order packets arrive or |
| 4422 | * in-order packets close up the sequence space. |
| 4423 | */ |
| 4424 | static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) |
| 4425 | { |
| 4426 | int this_sack; |
| 4427 | struct tcp_sack_block *sp = &tp->selective_acks[0]; |
| 4428 | struct tcp_sack_block *swalk = sp + 1; |
| 4429 | |
| 4430 | /* See if the recent change to the first SACK eats into |
| 4431 | 	 * or hits the sequence space of other SACK blocks; if so, coalesce. |
| 4432 | */ |
| 4433 | for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) { |
| 4434 | if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) { |
| 4435 | int i; |
| 4436 | |
| 4437 | 			/* Zap SWALK by moving every later SACK up by one slot. |
| 4438 | * Decrease num_sacks. |
| 4439 | */ |
| 4440 | tp->rx_opt.num_sacks--; |
| 4441 | for (i = this_sack; i < tp->rx_opt.num_sacks; i++) |
| 4442 | sp[i] = sp[i + 1]; |
| 4443 | continue; |
| 4444 | } |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 4445 | this_sack++; |
| 4446 | swalk++; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4447 | } |
| 4448 | } |
| 4449 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 4450 | static void tcp_sack_compress_send_ack(struct sock *sk) |
| 4451 | { |
| 4452 | struct tcp_sock *tp = tcp_sk(sk); |
| 4453 | |
| 4454 | if (!tp->compressed_ack) |
| 4455 | return; |
| 4456 | |
| 4457 | if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) |
| 4458 | __sock_put(sk); |
| 4459 | |
| 4460 | 	/* Since we finally have to send one ACK, |
| 4461 | 	 * subtract one from tp->compressed_ack to keep |
| 4462 | * LINUX_MIB_TCPACKCOMPRESSED accurate. |
| 4463 | */ |
| 4464 | NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, |
| 4465 | tp->compressed_ack - 1); |
| 4466 | |
| 4467 | tp->compressed_ack = 0; |
| 4468 | tcp_send_ack(sk); |
| 4469 | } |
| 4470 | |
| 4471 | /* Reasonable number of SACK blocks to include in the TCP SACK option. |
| 4472 | * The max is 4, but this becomes 3 if TCP timestamps are there. |
| 4473 | * Given that SACK packets might be lost, be conservative and use 2. |
| 4474 | */ |
| 4475 | #define TCP_SACK_BLOCKS_EXPECTED 2 |
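|  |  |
|  | /* Arithmetic behind the 4-vs-3 figure above (illustrative, not in the |
|  |  * original source): TCP option space is 40 bytes; a SACK option costs |
|  |  * 2 bytes of kind/length plus 8 per block, so up to 4 blocks fit. A |
|  |  * timestamp option (10 bytes plus 2 of padding) leaves 28 bytes, enough |
|  |  * for at most 3 blocks. |
|  |  */ |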
| 4476 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4477 | static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) |
| 4478 | { |
| 4479 | struct tcp_sock *tp = tcp_sk(sk); |
| 4480 | struct tcp_sack_block *sp = &tp->selective_acks[0]; |
| 4481 | int cur_sacks = tp->rx_opt.num_sacks; |
| 4482 | int this_sack; |
| 4483 | |
| 4484 | if (!cur_sacks) |
| 4485 | goto new_sack; |
| 4486 | |
| 4487 | for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) { |
| 4488 | if (tcp_sack_extend(sp, seq, end_seq)) { |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 4489 | if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) |
| 4490 | tcp_sack_compress_send_ack(sk); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4491 | /* Rotate this_sack to the first one. */ |
| 4492 | for (; this_sack > 0; this_sack--, sp--) |
| 4493 | swap(*sp, *(sp - 1)); |
| 4494 | if (cur_sacks > 1) |
| 4495 | tcp_sack_maybe_coalesce(tp); |
| 4496 | return; |
| 4497 | } |
| 4498 | } |
| 4499 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 4500 | if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) |
| 4501 | tcp_sack_compress_send_ack(sk); |
| 4502 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4503 | 	/* Could not find an adjacent existing SACK; build a new one, |
| 4504 | * put it at the front, and shift everyone else down. We |
| 4505 | * always know there is at least one SACK present already here. |
| 4506 | * |
| 4507 | * If the sack array is full, forget about the last one. |
| 4508 | */ |
| 4509 | if (this_sack >= TCP_NUM_SACKS) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4510 | this_sack--; |
| 4511 | tp->rx_opt.num_sacks--; |
| 4512 | sp--; |
| 4513 | } |
| 4514 | for (; this_sack > 0; this_sack--, sp--) |
| 4515 | *sp = *(sp - 1); |
| 4516 | |
| 4517 | new_sack: |
| 4518 | /* Build the new head SACK, and we're done. */ |
| 4519 | sp->start_seq = seq; |
| 4520 | sp->end_seq = end_seq; |
| 4521 | tp->rx_opt.num_sacks++; |
| 4522 | } |
| 4523 | |
| 4524 | /* RCV.NXT advances, some SACKs should be eaten. */ |
| 4525 | |
| 4526 | static void tcp_sack_remove(struct tcp_sock *tp) |
| 4527 | { |
| 4528 | struct tcp_sack_block *sp = &tp->selective_acks[0]; |
| 4529 | int num_sacks = tp->rx_opt.num_sacks; |
| 4530 | int this_sack; |
| 4531 | |
| 4532 | /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ |
| 4533 | if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { |
| 4534 | tp->rx_opt.num_sacks = 0; |
| 4535 | return; |
| 4536 | } |
| 4537 | |
| 4538 | for (this_sack = 0; this_sack < num_sacks;) { |
| 4539 | /* Check if the start of the sack is covered by RCV.NXT. */ |
| 4540 | if (!before(tp->rcv_nxt, sp->start_seq)) { |
| 4541 | int i; |
| 4542 | |
| 4543 | 			/* RCV.NXT must cover the whole block! */ |
| 4544 | WARN_ON(before(tp->rcv_nxt, sp->end_seq)); |
| 4545 | |
| 4546 | /* Zap this SACK, by moving forward any other SACKS. */ |
| 4547 | for (i = this_sack+1; i < num_sacks; i++) |
| 4548 | tp->selective_acks[i-1] = tp->selective_acks[i]; |
| 4549 | num_sacks--; |
| 4550 | continue; |
| 4551 | } |
| 4552 | this_sack++; |
| 4553 | sp++; |
| 4554 | } |
| 4555 | tp->rx_opt.num_sacks = num_sacks; |
| 4556 | } |
| 4557 | |
| 4558 | /** |
| 4559 | * tcp_try_coalesce - try to merge skb to prior one |
| 4560 | * @sk: socket |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4561 | * @to: prior buffer |
| 4562 | * @from: buffer to add in queue |
| 4563 | * @fragstolen: pointer to boolean |
| 4564 | * |
| 4565 | * Before queueing skb @from after @to, try to merge them |
| 4566 | * to reduce overall memory use and queue lengths, if cost is small. |
| 4567 | * Packets in ofo or receive queues can stay a long time. |
| 4568 | * Better try to coalesce them right now to avoid future collapses. |
| 4569 |  * Returns true if the caller should free @from instead of queueing it. |
| 4570 | */ |
| 4571 | static bool tcp_try_coalesce(struct sock *sk, |
| 4572 | struct sk_buff *to, |
| 4573 | struct sk_buff *from, |
| 4574 | bool *fragstolen) |
| 4575 | { |
| 4576 | int delta; |
| 4577 | |
| 4578 | *fragstolen = false; |
| 4579 | |
| 4580 | 	/* It's possible this segment overlaps with the prior segment in the queue */ |
| 4581 | if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) |
| 4582 | return false; |
| 4583 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 4584 | if (!mptcp_skb_can_collapse(to, from)) |
| 4585 | return false; |
| 4586 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4587 | #ifdef CONFIG_TLS_DEVICE |
| 4588 | if (from->decrypted != to->decrypted) |
| 4589 | return false; |
| 4590 | #endif |
| 4591 | |
| 4592 | if (!skb_try_coalesce(to, from, fragstolen, &delta)) |
| 4593 | return false; |
| 4594 | |
| 4595 | atomic_add(delta, &sk->sk_rmem_alloc); |
| 4596 | sk_mem_charge(sk, delta); |
| 4597 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); |
| 4598 | TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; |
| 4599 | TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; |
| 4600 | TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; |
| 4601 | |
| 4602 | if (TCP_SKB_CB(from)->has_rxtstamp) { |
| 4603 | TCP_SKB_CB(to)->has_rxtstamp = true; |
| 4604 | to->tstamp = from->tstamp; |
| 4605 | skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp; |
| 4606 | } |
| 4607 | |
| 4608 | return true; |
| 4609 | } |
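|  |  |
|  | /* Note on the accounting above (illustrative, not from the original |
|  |  * source): skb_try_coalesce() reports via delta how much @to's truesize |
|  |  * grew, so charging delta to sk_rmem_alloc keeps receive-memory |
|  |  * accounting exact even though @from's own overhead is released. |
|  |  */ |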
| 4610 | |
| 4611 | static bool tcp_ooo_try_coalesce(struct sock *sk, |
| 4612 | struct sk_buff *to, |
| 4613 | struct sk_buff *from, |
| 4614 | bool *fragstolen) |
| 4615 | { |
| 4616 | bool res = tcp_try_coalesce(sk, to, from, fragstolen); |
| 4617 | |
| 4618 | /* In case tcp_drop() is called later, update to->gso_segs */ |
| 4619 | if (res) { |
| 4620 | u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) + |
| 4621 | max_t(u16, 1, skb_shinfo(from)->gso_segs); |
| 4622 | |
| 4623 | skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF); |
| 4624 | } |
| 4625 | return res; |
| 4626 | } |
| 4627 | |
| 4628 | static void tcp_drop(struct sock *sk, struct sk_buff *skb) |
| 4629 | { |
| 4630 | sk_drops_add(sk, skb); |
| 4631 | __kfree_skb(skb); |
| 4632 | } |
| 4633 | |
| 4634 | /* This one checks to see if we can put data from the |
| 4635 | * out_of_order queue into the receive_queue. |
| 4636 | */ |
| 4637 | static void tcp_ofo_queue(struct sock *sk) |
| 4638 | { |
| 4639 | struct tcp_sock *tp = tcp_sk(sk); |
| 4640 | __u32 dsack_high = tp->rcv_nxt; |
| 4641 | bool fin, fragstolen, eaten; |
| 4642 | struct sk_buff *skb, *tail; |
| 4643 | struct rb_node *p; |
| 4644 | |
| 4645 | p = rb_first(&tp->out_of_order_queue); |
| 4646 | while (p) { |
| 4647 | skb = rb_to_skb(p); |
| 4648 | if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) |
| 4649 | break; |
| 4650 | |
| 4651 | if (before(TCP_SKB_CB(skb)->seq, dsack_high)) { |
| 4652 | __u32 dsack = dsack_high; |
| 4653 | if (before(TCP_SKB_CB(skb)->end_seq, dsack_high)) |
| 4654 | dsack_high = TCP_SKB_CB(skb)->end_seq; |
| 4655 | tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); |
| 4656 | } |
| 4657 | p = rb_next(p); |
| 4658 | rb_erase(&skb->rbnode, &tp->out_of_order_queue); |
| 4659 | |
| 4660 | if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4661 | tcp_drop(sk, skb); |
| 4662 | continue; |
| 4663 | } |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4664 | |
| 4665 | tail = skb_peek_tail(&sk->sk_receive_queue); |
| 4666 | eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); |
| 4667 | tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); |
| 4668 | fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; |
| 4669 | if (!eaten) |
| 4670 | __skb_queue_tail(&sk->sk_receive_queue, skb); |
| 4671 | else |
| 4672 | kfree_skb_partial(skb, fragstolen); |
| 4673 | |
| 4674 | if (unlikely(fin)) { |
| 4675 | tcp_fin(sk); |
| 4676 | /* tcp_fin() purges tp->out_of_order_queue, |
| 4677 | * so we must end this loop right now. |
| 4678 | */ |
| 4679 | break; |
| 4680 | } |
| 4681 | } |
| 4682 | } |
| 4683 | |
| 4684 | static bool tcp_prune_ofo_queue(struct sock *sk); |
| 4685 | static int tcp_prune_queue(struct sock *sk); |
| 4686 | |
| 4687 | static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, |
| 4688 | unsigned int size) |
| 4689 | { |
| 4690 | if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || |
| 4691 | !sk_rmem_schedule(sk, skb, size)) { |
| 4692 | |
| 4693 | if (tcp_prune_queue(sk) < 0) |
| 4694 | return -1; |
| 4695 | |
| 4696 | while (!sk_rmem_schedule(sk, skb, size)) { |
| 4697 | if (!tcp_prune_ofo_queue(sk)) |
| 4698 | return -1; |
| 4699 | } |
| 4700 | } |
| 4701 | return 0; |
| 4702 | } |
| 4703 | |
| 4704 | static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) |
| 4705 | { |
| 4706 | struct tcp_sock *tp = tcp_sk(sk); |
| 4707 | struct rb_node **p, *parent; |
| 4708 | struct sk_buff *skb1; |
| 4709 | u32 seq, end_seq; |
| 4710 | bool fragstolen; |
| 4711 | |
| 4712 | tcp_ecn_check_ce(sk, skb); |
| 4713 | |
| 4714 | if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { |
| 4715 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 4716 | sk->sk_data_ready(sk); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4717 | tcp_drop(sk, skb); |
| 4718 | return; |
| 4719 | } |
| 4720 | |
| 4721 | /* Disable header prediction. */ |
| 4722 | tp->pred_flags = 0; |
| 4723 | inet_csk_schedule_ack(sk); |
| 4724 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 4725 | tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4726 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); |
| 4727 | seq = TCP_SKB_CB(skb)->seq; |
| 4728 | end_seq = TCP_SKB_CB(skb)->end_seq; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4729 | |
| 4730 | p = &tp->out_of_order_queue.rb_node; |
| 4731 | if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { |
| 4732 | /* Initial out of order segment, build 1 SACK. */ |
| 4733 | if (tcp_is_sack(tp)) { |
| 4734 | tp->rx_opt.num_sacks = 1; |
| 4735 | tp->selective_acks[0].start_seq = seq; |
| 4736 | tp->selective_acks[0].end_seq = end_seq; |
| 4737 | } |
| 4738 | rb_link_node(&skb->rbnode, NULL, p); |
| 4739 | rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); |
| 4740 | tp->ooo_last_skb = skb; |
| 4741 | goto end; |
| 4742 | } |
| 4743 | |
| 4744 | /* In the typical case, we are adding an skb to the end of the list. |
| 4745 | * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. |
| 4746 | */ |
| 4747 | if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb, |
| 4748 | skb, &fragstolen)) { |
| 4749 | coalesce_done: |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 4750 | 		/* For non-SACK flows, do not grow the window, to force a DUPACK |
|  | 4751 | 		 * and trigger fast retransmit. |
| 4752 | */ |
| 4753 | if (tcp_is_sack(tp)) |
| 4754 | tcp_grow_window(sk, skb); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4755 | kfree_skb_partial(skb, fragstolen); |
| 4756 | skb = NULL; |
| 4757 | goto add_sack; |
| 4758 | } |
| 4759 | /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */ |
| 4760 | if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) { |
| 4761 | parent = &tp->ooo_last_skb->rbnode; |
| 4762 | p = &parent->rb_right; |
| 4763 | goto insert; |
| 4764 | } |
| 4765 | |
| 4766 | /* Find place to insert this segment. Handle overlaps on the way. */ |
| 4767 | parent = NULL; |
| 4768 | while (*p) { |
| 4769 | parent = *p; |
| 4770 | skb1 = rb_to_skb(parent); |
| 4771 | if (before(seq, TCP_SKB_CB(skb1)->seq)) { |
| 4772 | p = &parent->rb_left; |
| 4773 | continue; |
| 4774 | } |
| 4775 | if (before(seq, TCP_SKB_CB(skb1)->end_seq)) { |
| 4776 | if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { |
| 4777 | /* All the bits are present. Drop. */ |
| 4778 | NET_INC_STATS(sock_net(sk), |
| 4779 | LINUX_MIB_TCPOFOMERGE); |
| 4780 | tcp_drop(sk, skb); |
| 4781 | skb = NULL; |
| 4782 | tcp_dsack_set(sk, seq, end_seq); |
| 4783 | goto add_sack; |
| 4784 | } |
| 4785 | if (after(seq, TCP_SKB_CB(skb1)->seq)) { |
| 4786 | /* Partial overlap. */ |
| 4787 | tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); |
| 4788 | } else { |
| 4789 | /* skb's seq == skb1's seq and skb covers skb1. |
| 4790 | * Replace skb1 with skb. |
| 4791 | */ |
| 4792 | rb_replace_node(&skb1->rbnode, &skb->rbnode, |
| 4793 | &tp->out_of_order_queue); |
| 4794 | tcp_dsack_extend(sk, |
| 4795 | TCP_SKB_CB(skb1)->seq, |
| 4796 | TCP_SKB_CB(skb1)->end_seq); |
| 4797 | NET_INC_STATS(sock_net(sk), |
| 4798 | LINUX_MIB_TCPOFOMERGE); |
| 4799 | tcp_drop(sk, skb1); |
| 4800 | goto merge_right; |
| 4801 | } |
| 4802 | } else if (tcp_ooo_try_coalesce(sk, skb1, |
| 4803 | skb, &fragstolen)) { |
| 4804 | goto coalesce_done; |
| 4805 | } |
| 4806 | p = &parent->rb_right; |
| 4807 | } |
| 4808 | insert: |
| 4809 | /* Insert segment into RB tree. */ |
| 4810 | rb_link_node(&skb->rbnode, parent, p); |
| 4811 | rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); |
| 4812 | |
| 4813 | merge_right: |
| 4814 | /* Remove other segments covered by skb. */ |
| 4815 | while ((skb1 = skb_rb_next(skb)) != NULL) { |
| 4816 | if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) |
| 4817 | break; |
| 4818 | if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { |
| 4819 | tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, |
| 4820 | end_seq); |
| 4821 | break; |
| 4822 | } |
| 4823 | rb_erase(&skb1->rbnode, &tp->out_of_order_queue); |
| 4824 | tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, |
| 4825 | TCP_SKB_CB(skb1)->end_seq); |
| 4826 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); |
| 4827 | tcp_drop(sk, skb1); |
| 4828 | } |
| 4829 | 	/* If there is no skb after us, we are the last_skb! */ |
| 4830 | if (!skb1) |
| 4831 | tp->ooo_last_skb = skb; |
| 4832 | |
| 4833 | add_sack: |
| 4834 | if (tcp_is_sack(tp)) |
| 4835 | tcp_sack_new_ofo_skb(sk, seq, end_seq); |
| 4836 | end: |
| 4837 | if (skb) { |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 4838 | 		/* For non-SACK flows, do not grow the window, to force a DUPACK |
|  | 4839 | 		 * and trigger fast retransmit. |
| 4840 | */ |
| 4841 | if (tcp_is_sack(tp)) |
| 4842 | tcp_grow_window(sk, skb); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4843 | skb_condense(skb); |
| 4844 | skb_set_owner_r(skb, sk); |
| 4845 | } |
| 4846 | } |
| 4847 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 4848 | static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, |
| 4849 | bool *fragstolen) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4850 | { |
| 4851 | int eaten; |
| 4852 | struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); |
| 4853 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4854 | eaten = (tail && |
| 4855 | tcp_try_coalesce(sk, tail, |
| 4856 | skb, fragstolen)) ? 1 : 0; |
| 4857 | tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); |
| 4858 | if (!eaten) { |
| 4859 | __skb_queue_tail(&sk->sk_receive_queue, skb); |
| 4860 | skb_set_owner_r(skb, sk); |
| 4861 | } |
| 4862 | return eaten; |
| 4863 | } |
| 4864 | |
| 4865 | int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) |
| 4866 | { |
| 4867 | struct sk_buff *skb; |
| 4868 | int err = -ENOMEM; |
| 4869 | int data_len = 0; |
| 4870 | bool fragstolen; |
| 4871 | |
| 4872 | if (size == 0) |
| 4873 | return 0; |
| 4874 | |
| 4875 | if (size > PAGE_SIZE) { |
| 4876 | int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS); |
| 4877 | |
| 4878 | data_len = npages << PAGE_SHIFT; |
| 4879 | size = data_len + (size & ~PAGE_MASK); |
| 4880 | } |
| 4881 | skb = alloc_skb_with_frags(size - data_len, data_len, |
| 4882 | PAGE_ALLOC_COSTLY_ORDER, |
| 4883 | &err, sk->sk_allocation); |
| 4884 | if (!skb) |
| 4885 | goto err; |
| 4886 | |
| 4887 | skb_put(skb, size - data_len); |
| 4888 | skb->data_len = data_len; |
| 4889 | skb->len = size; |
| 4890 | |
| 4891 | if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { |
| 4892 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); |
| 4893 | goto err_free; |
| 4894 | } |
| 4895 | |
| 4896 | err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); |
| 4897 | if (err) |
| 4898 | goto err_free; |
| 4899 | |
| 4900 | TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; |
| 4901 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; |
| 4902 | TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; |
| 4903 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 4904 | if (tcp_queue_rcv(sk, skb, &fragstolen)) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4905 | WARN_ON_ONCE(fragstolen); /* should not happen */ |
| 4906 | __kfree_skb(skb); |
| 4907 | } |
| 4908 | return size; |
| 4909 | |
| 4910 | err_free: |
| 4911 | kfree_skb(skb); |
| 4912 | err: |
| 4913 | return err; |
| 4914 | |
| 4915 | } |
| 4916 | |
| 4917 | void tcp_data_ready(struct sock *sk) |
| 4918 | { |
| 4919 | const struct tcp_sock *tp = tcp_sk(sk); |
| 4920 | int avail = tp->rcv_nxt - tp->copied_seq; |
| 4921 | |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 4922 | if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) && |
| 4923 | !sock_flag(sk, SOCK_DONE) && |
| 4924 | tcp_receive_window(tp) > inet_csk(sk)->icsk_ack.rcv_mss) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4925 | return; |
| 4926 | |
| 4927 | sk->sk_data_ready(sk); |
| 4928 | } |
| 4929 | |
| 4930 | static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) |
| 4931 | { |
| 4932 | struct tcp_sock *tp = tcp_sk(sk); |
| 4933 | bool fragstolen; |
| 4934 | int eaten; |
| 4935 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 4936 | if (sk_is_mptcp(sk)) |
| 4937 | mptcp_incoming_options(sk, skb); |
| 4938 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4939 | if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { |
| 4940 | __kfree_skb(skb); |
| 4941 | return; |
| 4942 | } |
| 4943 | skb_dst_drop(skb); |
| 4944 | __skb_pull(skb, tcp_hdr(skb)->doff * 4); |
| 4945 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4946 | tp->rx_opt.dsack = 0; |
| 4947 | |
| 4948 | /* Queue data for delivery to the user. |
| 4949 | * Packets in sequence go to the receive queue. |
| 4950 | 	 * Out-of-sequence packets go to the out_of_order_queue. |
| 4951 | */ |
| 4952 | if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { |
| 4953 | if (tcp_receive_window(tp) == 0) { |
| 4954 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); |
| 4955 | goto out_of_window; |
| 4956 | } |
| 4957 | |
| 4958 | /* Ok. In sequence. In window. */ |
| 4959 | queue_and_out: |
| 4960 | if (skb_queue_len(&sk->sk_receive_queue) == 0) |
| 4961 | sk_forced_mem_schedule(sk, skb->truesize); |
| 4962 | else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { |
| 4963 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 4964 | sk->sk_data_ready(sk); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4965 | goto drop; |
| 4966 | } |
| 4967 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 4968 | eaten = tcp_queue_rcv(sk, skb, &fragstolen); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4969 | if (skb->len) |
| 4970 | tcp_event_data_recv(sk, skb); |
| 4971 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) |
| 4972 | tcp_fin(sk); |
| 4973 | |
| 4974 | if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) { |
| 4975 | tcp_ofo_queue(sk); |
| 4976 | |
| 4977 | 			/* Per RFC 5681 sec. 4.2, SHOULD send an immediate ACK |
| 4978 | 			 * when a gap in the queue is filled. |
| 4979 | */ |
| 4980 | if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) |
| 4981 | inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; |
| 4982 | } |
| 4983 | |
| 4984 | if (tp->rx_opt.num_sacks) |
| 4985 | tcp_sack_remove(tp); |
| 4986 | |
| 4987 | tcp_fast_path_check(sk); |
| 4988 | |
| 4989 | if (eaten > 0) |
| 4990 | kfree_skb_partial(skb, fragstolen); |
| 4991 | if (!sock_flag(sk, SOCK_DEAD)) |
| 4992 | tcp_data_ready(sk); |
| 4993 | return; |
| 4994 | } |
| 4995 | |
| 4996 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 4997 | tcp_rcv_spurious_retrans(sk, skb); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 4998 | /* A retransmit, 2nd most common case. Force an immediate ack. */ |
| 4999 | NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); |
| 5000 | tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); |
| 5001 | |
| 5002 | out_of_window: |
| 5003 | tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); |
| 5004 | inet_csk_schedule_ack(sk); |
| 5005 | drop: |
| 5006 | tcp_drop(sk, skb); |
| 5007 | return; |
| 5008 | } |
| 5009 | |
| 5010 | 	/* Out of window. E.g. a zero window probe. */ |
| 5011 | if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) |
| 5012 | goto out_of_window; |
| 5013 | |
| 5014 | if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { |
| 5015 | 		/* Partial packet, seq < rcv_nxt < end_seq */ |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5016 | tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); |
| 5017 | |
| 5018 | 		/* If the window is closed, drop the tail of the packet, but only |
| 5019 | 		 * after remembering the D-SACK for its head, set up on the previous line. |
| 5020 | */ |
| 5021 | if (!tcp_receive_window(tp)) { |
| 5022 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); |
| 5023 | goto out_of_window; |
| 5024 | } |
| 5025 | goto queue_and_out; |
| 5026 | } |
| 5027 | |
| 5028 | tcp_data_queue_ofo(sk, skb); |
| 5029 | } |
| 5030 | |
| 5031 | static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list) |
| 5032 | { |
| 5033 | if (list) |
| 5034 | return !skb_queue_is_last(list, skb) ? skb->next : NULL; |
| 5035 | |
| 5036 | return skb_rb_next(skb); |
| 5037 | } |
| 5038 | |
| 5039 | static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, |
| 5040 | struct sk_buff_head *list, |
| 5041 | struct rb_root *root) |
| 5042 | { |
| 5043 | struct sk_buff *next = tcp_skb_next(skb, list); |
| 5044 | |
| 5045 | if (list) |
| 5046 | __skb_unlink(skb, list); |
| 5047 | else |
| 5048 | rb_erase(&skb->rbnode, root); |
| 5049 | |
| 5050 | __kfree_skb(skb); |
| 5051 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); |
| 5052 | |
| 5053 | return next; |
| 5054 | } |
| 5055 | |
| 5056 | /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ |
| 5057 | void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) |
| 5058 | { |
| 5059 | struct rb_node **p = &root->rb_node; |
| 5060 | struct rb_node *parent = NULL; |
| 5061 | struct sk_buff *skb1; |
| 5062 | |
| 5063 | while (*p) { |
| 5064 | parent = *p; |
| 5065 | skb1 = rb_to_skb(parent); |
| 5066 | if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) |
| 5067 | p = &parent->rb_left; |
| 5068 | else |
| 5069 | p = &parent->rb_right; |
| 5070 | } |
| 5071 | rb_link_node(&skb->rbnode, parent, p); |
| 5072 | rb_insert_color(&skb->rbnode, root); |
| 5073 | } |
| 5074 | |
| 5075 | /* Collapse contiguous sequence of skbs head..tail with |
| 5076 | * sequence numbers start..end. |
| 5077 | * |
| 5078 | * If tail is NULL, this means until the end of the queue. |
| 5079 | * |
| 5080 |  * Segments with FIN/SYN are not collapsed (only because this |
| 5081 |  * simplifies the code). |
| 5082 | */ |
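|  |  |
|  | /* Illustrative sketch (not from the original source): three bloated |
|  |  * in-order skbs covering 0..100, 100..200 and 200..300, each pinning a |
|  |  * large truesize, can be copied into one compact skb covering 0..300, |
|  |  * cutting receive-memory pressure without changing the byte stream. |
|  |  */ |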
| 5083 | static void |
| 5084 | tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, |
| 5085 | struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end) |
| 5086 | { |
| 5087 | struct sk_buff *skb = head, *n; |
| 5088 | struct sk_buff_head tmp; |
| 5089 | bool end_of_skbs; |
| 5090 | |
| 5091 | /* First, check that queue is collapsible and find |
| 5092 | * the point where collapsing can be useful. |
| 5093 | */ |
| 5094 | restart: |
| 5095 | for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) { |
| 5096 | n = tcp_skb_next(skb, list); |
| 5097 | |
| 5098 | 		/* No new bits? This is possible on the ofo queue. */ |
| 5099 | if (!before(start, TCP_SKB_CB(skb)->end_seq)) { |
| 5100 | skb = tcp_collapse_one(sk, skb, list, root); |
| 5101 | if (!skb) |
| 5102 | break; |
| 5103 | goto restart; |
| 5104 | } |
| 5105 | |
| 5106 | /* The first skb to collapse is: |
| 5107 | * - not SYN/FIN and |
| 5108 | 		 * - bloated, or contains data before "start", or |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 5109 | 		 *   overlaps the next one and MPTCP allows collapsing. |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5110 | */ |
| 5111 | if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && |
| 5112 | (tcp_win_from_space(sk, skb->truesize) > skb->len || |
| 5113 | before(TCP_SKB_CB(skb)->seq, start))) { |
| 5114 | end_of_skbs = false; |
| 5115 | break; |
| 5116 | } |
| 5117 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 5118 | if (n && n != tail && mptcp_skb_can_collapse(skb, n) && |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5119 | TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { |
| 5120 | end_of_skbs = false; |
| 5121 | break; |
| 5122 | } |
| 5123 | |
| 5124 | 		/* Decided to skip this; advance the start seq. */ |
| 5125 | start = TCP_SKB_CB(skb)->end_seq; |
| 5126 | } |
| 5127 | if (end_of_skbs || |
| 5128 | (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) |
| 5129 | return; |
| 5130 | |
| 5131 | __skb_queue_head_init(&tmp); |
| 5132 | |
| 5133 | while (before(start, end)) { |
| 5134 | int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); |
| 5135 | struct sk_buff *nskb; |
| 5136 | |
| 5137 | nskb = alloc_skb(copy, GFP_ATOMIC); |
| 5138 | if (!nskb) |
| 5139 | break; |
| 5140 | |
| 5141 | memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); |
| 5142 | #ifdef CONFIG_TLS_DEVICE |
| 5143 | nskb->decrypted = skb->decrypted; |
| 5144 | #endif |
| 5145 | TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; |
| 5146 | if (list) |
| 5147 | __skb_queue_before(list, skb, nskb); |
| 5148 | else |
| 5149 | __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ |
| 5150 | skb_set_owner_r(nskb, sk); |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 5151 | mptcp_skb_ext_move(nskb, skb); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5152 | |
| 5153 | /* Copy data, releasing collapsed skbs. */ |
| 5154 | while (copy > 0) { |
| 5155 | int offset = start - TCP_SKB_CB(skb)->seq; |
| 5156 | int size = TCP_SKB_CB(skb)->end_seq - start; |
| 5157 | |
| 5158 | BUG_ON(offset < 0); |
| 5159 | if (size > 0) { |
| 5160 | size = min(copy, size); |
| 5161 | if (skb_copy_bits(skb, offset, skb_put(nskb, size), size)) |
| 5162 | BUG(); |
| 5163 | TCP_SKB_CB(nskb)->end_seq += size; |
| 5164 | copy -= size; |
| 5165 | start += size; |
| 5166 | } |
| 5167 | if (!before(start, TCP_SKB_CB(skb)->end_seq)) { |
| 5168 | skb = tcp_collapse_one(sk, skb, list, root); |
| 5169 | if (!skb || |
| 5170 | skb == tail || |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 5171 | !mptcp_skb_can_collapse(nskb, skb) || |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5172 | (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) |
| 5173 | goto end; |
| 5174 | #ifdef CONFIG_TLS_DEVICE |
| 5175 | if (skb->decrypted != nskb->decrypted) |
| 5176 | goto end; |
| 5177 | #endif |
| 5178 | } |
| 5179 | } |
| 5180 | } |
| 5181 | end: |
| 5182 | skb_queue_walk_safe(&tmp, skb, n) |
| 5183 | tcp_rbtree_insert(root, skb); |
| 5184 | } |
| 5185 | |
| 5186 | /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs |
| 5187 | * and tcp_collapse() them until all the queue is collapsed. |
| 5188 | */ |
| 5189 | static void tcp_collapse_ofo_queue(struct sock *sk) |
| 5190 | { |
| 5191 | struct tcp_sock *tp = tcp_sk(sk); |
| 5192 | u32 range_truesize, sum_tiny = 0; |
| 5193 | struct sk_buff *skb, *head; |
| 5194 | u32 start, end; |
| 5195 | |
| 5196 | skb = skb_rb_first(&tp->out_of_order_queue); |
| 5197 | new_range: |
| 5198 | if (!skb) { |
| 5199 | tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue); |
| 5200 | return; |
| 5201 | } |
| 5202 | start = TCP_SKB_CB(skb)->seq; |
| 5203 | end = TCP_SKB_CB(skb)->end_seq; |
| 5204 | range_truesize = skb->truesize; |
| 5205 | |
| 5206 | for (head = skb;;) { |
| 5207 | skb = skb_rb_next(skb); |
| 5208 | |
| 5209 | /* Range is terminated when we see a gap or when |
| 5210 | * we are at the queue end. |
| 5211 | */ |
| 5212 | if (!skb || |
| 5213 | after(TCP_SKB_CB(skb)->seq, end) || |
| 5214 | before(TCP_SKB_CB(skb)->end_seq, start)) { |
| 5215 | /* Do not attempt collapsing tiny skbs */ |
| 5216 | if (range_truesize != head->truesize || |
| 5217 | end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) { |
| 5218 | tcp_collapse(sk, NULL, &tp->out_of_order_queue, |
| 5219 | head, skb, start, end); |
| 5220 | } else { |
| 5221 | sum_tiny += range_truesize; |
| 5222 | if (sum_tiny > sk->sk_rcvbuf >> 3) |
| 5223 | return; |
| 5224 | } |
| 5225 | goto new_range; |
| 5226 | } |
| 5227 | |
| 5228 | range_truesize += skb->truesize; |
| 5229 | if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) |
| 5230 | start = TCP_SKB_CB(skb)->seq; |
| 5231 | if (after(TCP_SKB_CB(skb)->end_seq, end)) |
| 5232 | end = TCP_SKB_CB(skb)->end_seq; |
| 5233 | } |
| 5234 | } |
| 5235 | |
| 5236 | /* |
| 5237 | * Clean the out-of-order queue to make room. |
| 5238 |  * We drop high-sequence packets to: |
| 5239 |  * 1) leave a chance for holes to be filled; |
| 5240 |  * 2) avoid adding too much latency if thousands of packets sit there |
| 5241 |  *    (but if the application shrinks SO_RCVBUF, we could still end up |
| 5242 |  *    freeing the whole queue here); |
| 5243 |  * 3) drop at least 12.5 % of sk_rcvbuf to resist malicious attacks. |
| 5244 | * |
| 5245 | * Return true if queue has shrunk. |
| 5246 | */ |
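|  |  |
|  | /* Note (illustrative, not from the original source): the 12.5 % figure |
|  |  * is the "goal = sk->sk_rcvbuf >> 3" below; a right shift by 3 divides |
|  |  * by 8. E.g. with a 256 KB sk_rcvbuf each pruning pass tries to free at |
|  |  * least 32 KB of truesize from the tail of the ofo tree. |
|  |  */ |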
| 5247 | static bool tcp_prune_ofo_queue(struct sock *sk) |
| 5248 | { |
| 5249 | struct tcp_sock *tp = tcp_sk(sk); |
| 5250 | struct rb_node *node, *prev; |
| 5251 | int goal; |
| 5252 | |
| 5253 | if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) |
| 5254 | return false; |
| 5255 | |
| 5256 | NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); |
| 5257 | goal = sk->sk_rcvbuf >> 3; |
| 5258 | node = &tp->ooo_last_skb->rbnode; |
| 5259 | do { |
| 5260 | prev = rb_prev(node); |
| 5261 | rb_erase(node, &tp->out_of_order_queue); |
| 5262 | goal -= rb_to_skb(node)->truesize; |
| 5263 | tcp_drop(sk, rb_to_skb(node)); |
| 5264 | if (!prev || goal <= 0) { |
| 5265 | sk_mem_reclaim(sk); |
| 5266 | if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && |
| 5267 | !tcp_under_memory_pressure(sk)) |
| 5268 | break; |
| 5269 | goal = sk->sk_rcvbuf >> 3; |
| 5270 | } |
| 5271 | node = prev; |
| 5272 | } while (node); |
| 5273 | tp->ooo_last_skb = rb_to_skb(prev); |
| 5274 | |
| 5275 | /* Reset SACK state. A conforming SACK implementation will |
| 5276 | 	 * do the same at a timeout-based retransmit. When a connection |
| 5277 | 	 * is in a sad state like this, we care only about the integrity |
| 5278 | 	 * of the connection, not performance. |
| 5279 | */ |
| 5280 | if (tp->rx_opt.sack_ok) |
| 5281 | tcp_sack_reset(&tp->rx_opt); |
| 5282 | return true; |
| 5283 | } |
| 5284 | |
| 5285 | /* Reduce allocated memory if we can, trying to get |
| 5286 | * the socket within its memory limits again. |
| 5287 | * |
| 5288 | * Return less than zero if we should start dropping frames |
| 5289 | * until the socket owning process reads some of the data |
| 5290 | * to stabilize the situation. |
| 5291 | */ |
| 5292 | static int tcp_prune_queue(struct sock *sk) |
| 5293 | { |
| 5294 | struct tcp_sock *tp = tcp_sk(sk); |
| 5295 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5296 | NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); |
| 5297 | |
| 5298 | if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) |
| 5299 | tcp_clamp_window(sk); |
| 5300 | else if (tcp_under_memory_pressure(sk)) |
| 5301 | tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); |
| 5302 | |
| 5303 | if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) |
| 5304 | return 0; |
| 5305 | |
| 5306 | tcp_collapse_ofo_queue(sk); |
| 5307 | if (!skb_queue_empty(&sk->sk_receive_queue)) |
| 5308 | tcp_collapse(sk, &sk->sk_receive_queue, NULL, |
| 5309 | skb_peek(&sk->sk_receive_queue), |
| 5310 | NULL, |
| 5311 | tp->copied_seq, tp->rcv_nxt); |
| 5312 | sk_mem_reclaim(sk); |
| 5313 | |
| 5314 | if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) |
| 5315 | return 0; |
| 5316 | |
| 5317 | /* Collapsing did not help, destructive actions follow. |
| 5318 |  * This should never happen. */ |
| 5319 | |
| 5320 | tcp_prune_ofo_queue(sk); |
| 5321 | |
| 5322 | if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) |
| 5323 | return 0; |
| 5324 | |
| 5325 | /* If we are really being abused, tell the caller to silently |
| 5326 | * drop receive data on the floor. It will get retransmitted |
| 5327 | * and hopefully then we'll have sufficient space. |
| 5328 | */ |
| 5329 | NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED); |
| 5330 | |
| 5331 | /* Massive buffer overcommit. */ |
| 5332 | tp->pred_flags = 0; |
| 5333 | return -1; |
| 5334 | } |
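| | |
| | /* The steps above escalate: first clamp the advertised window, then |
| |  * collapse the receive and out-of-order queues, then destructively |
| |  * prune the out-of-order queue; only when all of that fails do we |
| |  * return -1 and let the caller drop incoming data until the |
| |  * application reads some of it. |
| |  */ |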
| 5335 | |
| 5336 | static bool tcp_should_expand_sndbuf(const struct sock *sk) |
| 5337 | { |
| 5338 | const struct tcp_sock *tp = tcp_sk(sk); |
| 5339 | |
| 5340 | /* If the user specified a specific send buffer setting, do |
| 5341 | * not modify it. |
| 5342 | */ |
| 5343 | if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) |
| 5344 | return false; |
| 5345 | |
| 5346 | /* If we are under global TCP memory pressure, do not expand. */ |
| 5347 | if (tcp_under_memory_pressure(sk)) |
| 5348 | return false; |
| 5349 | |
| 5350 | /* If we are under soft global TCP memory pressure, do not expand. */ |
| 5351 | if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) |
| 5352 | return false; |
| 5353 | |
| 5354 | /* If we filled the congestion window, do not expand. */ |
| 5355 | if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) |
| 5356 | return false; |
| 5357 | |
| 5358 | return true; |
| 5359 | } |
| 5360 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5361 | static void tcp_new_space(struct sock *sk) |
| 5362 | { |
| 5363 | struct tcp_sock *tp = tcp_sk(sk); |
| 5364 | |
| 5365 | if (tcp_should_expand_sndbuf(sk)) { |
| 5366 | tcp_sndbuf_expand(sk); |
| 5367 | tp->snd_cwnd_stamp = tcp_jiffies32; |
| 5368 | } |
| 5369 | |
| 5370 | sk->sk_write_space(sk); |
| 5371 | } |
| 5372 | |
| 5373 | static void tcp_check_space(struct sock *sk) |
| 5374 | { |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 5375 | /* pairs with tcp_poll() */ |
| 5376 | smp_mb(); |
| 5377 | if (sk->sk_socket && |
| 5378 | test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { |
| 5379 | tcp_new_space(sk); |
| 5380 | if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) |
| 5381 | tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5382 | } |
| 5383 | } |
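| | |
| | /* The smp_mb() above pairs with tcp_poll(): the poller sets |
| |  * SOCK_NOSPACE and then re-checks for free space, while we free space |
| |  * and then check SOCK_NOSPACE, so at least one side observes the |
| |  * other's update and a wakeup cannot be lost. |
| |  */ |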
| 5384 | |
| 5385 | static inline void tcp_data_snd_check(struct sock *sk) |
| 5386 | { |
| 5387 | tcp_push_pending_frames(sk); |
| 5388 | tcp_check_space(sk); |
| 5389 | } |
| 5390 | |
| 5391 | /* |
| 5392 | * Check if sending an ack is needed. |
| 5393 | */ |
| 5394 | static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) |
| 5395 | { |
| 5396 | struct tcp_sock *tp = tcp_sk(sk); |
| 5397 | unsigned long rtt, delay; |
| 5398 | |
| 5399 | /* More than one full frame received... */ |
| 5400 | if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && |
| 5401 | /* ... and right edge of window advances far enough. |
| 5402 | * (tcp_recvmsg() will send ACK otherwise). |
| 5403 |  * If the application uses SO_RCVLOWAT, we want to send an ACK now |
| 5404 |  * if we have not received enough bytes to satisfy the condition. |
| 5405 | */ |
| 5406 | (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || |
| 5407 | __tcp_select_window(sk) >= tp->rcv_wnd)) || |
| 5408 | /* We ACK each frame or... */ |
| 5409 | tcp_in_quickack_mode(sk) || |
| 5410 | /* Protocol state mandates a one-time immediate ACK */ |
| 5411 | inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) { |
| 5412 | send_now: |
| 5413 | tcp_send_ack(sk); |
| 5414 | return; |
| 5415 | } |
| 5416 | |
| 5417 | if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) { |
| 5418 | tcp_send_delayed_ack(sk); |
| 5419 | return; |
| 5420 | } |
| 5421 | |
| 5422 | if (!tcp_is_sack(tp) || |
| 5423 | tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr) |
| 5424 | goto send_now; |
| 5425 | |
| 5426 | if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { |
| 5427 | tp->compressed_ack_rcv_nxt = tp->rcv_nxt; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 5428 | tp->dup_ack_counter = 0; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5429 | } |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 5430 | if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) { |
| 5431 | tp->dup_ack_counter++; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5432 | goto send_now; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 5433 | } |
| 5434 | tp->compressed_ack++; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5435 | if (hrtimer_is_queued(&tp->compressed_ack_timer)) |
| 5436 | return; |
| 5437 | |
| 5438 | /* Compressed ACK timer: 5% of RTT, but no more than tcp_comp_sack_delay_ns */ |
| 5439 | |
| 5440 | rtt = tp->rcv_rtt_est.rtt_us; |
| 5441 | if (tp->srtt_us && tp->srtt_us < rtt) |
| 5442 | rtt = tp->srtt_us; |
| 5443 | |
| 5444 | delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, |
| 5445 | rtt * (NSEC_PER_USEC >> 3)/20); |
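| | |
| | /* Worked example (illustrative): rcv_rtt_est.rtt_us and srtt_us are |
| |  * kept in usec << 3 fixed point, so a true RTT of 40ms is stored as |
| |  * 320000; 320000 * (NSEC_PER_USEC >> 3) / 20 == 320000 * 125 / 20 == |
| |  * 2000000ns == 2ms, i.e. 5% of 40ms, before the sysctl cap applies. |
| |  */ |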
| 5446 | sock_hold(sk); |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 5447 | hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay), |
| 5448 | sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns, |
| 5449 | HRTIMER_MODE_REL_PINNED_SOFT); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5450 | } |
| 5451 | |
| 5452 | static inline void tcp_ack_snd_check(struct sock *sk) |
| 5453 | { |
| 5454 | if (!inet_csk_ack_scheduled(sk)) { |
| 5455 | /* We sent a data segment already. */ |
| 5456 | return; |
| 5457 | } |
| 5458 | __tcp_ack_snd_check(sk, 1); |
| 5459 | } |
| 5460 | |
| 5461 | /* |
| 5462 | * This routine is only called when we have urgent data |
| 5463 |  * signaled. It is the 'slow' part of tcp_urg. It could be |
| 5464 |  * moved inline now, as tcp_urg is only called from one |
| 5465 |  * place. We handle urgent data incorrectly; we have to, as |
| 5466 |  * BSD still doesn't use the correction from RFC 961. |
| 5467 | * For 1003.1g we should support a new option TCP_STDURG to permit |
| 5468 | * either form (or just set the sysctl tcp_stdurg). |
| 5469 | */ |
| 5470 | |
| 5471 | static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) |
| 5472 | { |
| 5473 | struct tcp_sock *tp = tcp_sk(sk); |
| 5474 | u32 ptr = ntohs(th->urg_ptr); |
| 5475 | |
| 5476 | if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg) |
| 5477 | ptr--; |
| 5478 | ptr += ntohl(th->seq); |
| 5479 | |
| 5480 | /* Ignore urgent data that we've already seen and read. */ |
| 5481 | if (after(tp->copied_seq, ptr)) |
| 5482 | return; |
| 5483 | |
| 5484 | /* Do not replay urg ptr. |
| 5485 | * |
| 5486 |  * NOTE: this is an interesting situation not covered by the specs. |
| 5487 |  * A misbehaving sender may send an urgent pointer pointing to a |
| 5488 |  * segment that we already have in the ofo queue. We cannot fetch |
| 5489 |  * such data and will stay in TCP_URG_NOTYET until it is eaten |
| 5490 |  * by recvmsg(). It seems we are not obliged to handle such wicked |
| 5491 |  * situations, but it is worth thinking about the possibility of a |
| 5492 |  * DoS using some hypothetical application-level deadlock. |
| 5493 | */ |
| 5494 | if (before(ptr, tp->rcv_nxt)) |
| 5495 | return; |
| 5496 | |
| 5497 | /* Do we already have a newer (or duplicate) urgent pointer? */ |
| 5498 | if (tp->urg_data && !after(ptr, tp->urg_seq)) |
| 5499 | return; |
| 5500 | |
| 5501 | /* Tell the world about our new urgent pointer. */ |
| 5502 | sk_send_sigurg(sk); |
| 5503 | |
| 5504 | /* We may be adding urgent data when the last byte read was |
| 5505 |  * urgent. Doing this requires some care. We cannot just ignore |
| 5506 |  * tp->copied_seq since we would read the last urgent byte again |
| 5507 |  * as data, nor can we alter copied_seq until this data arrives, |
| 5508 |  * or we would break the semantics of SIOCATMARK (and thus sockatmark()). |
| 5509 |  * |
| 5510 |  * NOTE: rendering the above into plain English: the author of the |
| 5511 |  * comment above did something like send("A", MSG_OOB); send("B", MSG_OOB); |
| 5512 |  * and expected both A and B to disappear from the stream. This is _wrong_. |
| 5513 |  * Though this happens in BSD with high probability, it is occasional. |
| 5514 |  * Any application relying on it is buggy. Note also that the fix "works" |
| 5515 |  * only in this artificial test; insert some normal data between A and B |
| 5516 |  * and we will diverge from BSD again. Verdict: it is better to remove it, |
| 5517 |  * to trap buggy users. |
| 5518 | */ |
| 5519 | if (tp->urg_seq == tp->copied_seq && tp->urg_data && |
| 5520 | !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) { |
| 5521 | struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); |
| 5522 | tp->copied_seq++; |
| 5523 | if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { |
| 5524 | __skb_unlink(skb, &sk->sk_receive_queue); |
| 5525 | __kfree_skb(skb); |
| 5526 | } |
| 5527 | } |
| 5528 | |
| 5529 | tp->urg_data = TCP_URG_NOTYET; |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 5530 | WRITE_ONCE(tp->urg_seq, ptr); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5531 | |
| 5532 | /* Disable header prediction. */ |
| 5533 | tp->pred_flags = 0; |
| 5534 | } |
| 5535 | |
| 5536 | /* This is the 'fast' part of urgent handling. */ |
| 5537 | static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) |
| 5538 | { |
| 5539 | struct tcp_sock *tp = tcp_sk(sk); |
| 5540 | |
| 5541 | /* Check if we get a new urgent pointer - normally not. */ |
| 5542 | if (th->urg) |
| 5543 | tcp_check_urg(sk, th); |
| 5544 | |
| 5545 | /* Do we wait for any urgent data? - normally not... */ |
| 5546 | if (tp->urg_data == TCP_URG_NOTYET) { |
| 5547 | u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) - |
| 5548 | th->syn; |
| 5549 | |
| 5550 | /* Is the urgent pointer pointing into this packet? */ |
| 5551 | if (ptr < skb->len) { |
| 5552 | u8 tmp; |
| 5553 | if (skb_copy_bits(skb, ptr, &tmp, 1)) |
| 5554 | BUG(); |
| 5555 | tp->urg_data = TCP_URG_VALID | tmp; |
| 5556 | if (!sock_flag(sk, SOCK_DEAD)) |
| 5557 | sk->sk_data_ready(sk); |
| 5558 | } |
| 5559 | } |
| 5560 | } |
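| | |
| | /* Example (illustrative): if tp->urg_seq == 1010 and this segment has |
| |  * seq 1000, a 20-byte header and no SYN, then ptr == 1010 - 1000 + 20 |
| |  * == 30: the urgent byte sits 30 bytes into skb->data (which still |
| |  * includes the TCP header here), so it is fetched iff skb->len > 30. |
| |  */ |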
| 5561 | |
| 5562 | /* Accept RST for rcv_nxt - 1 after a FIN. |
| 5563 |  * When TCP connections are abruptly terminated on Mac OSX (via ^C), a |
| 5564 |  * FIN is sent followed by an RST packet. The RST is sent with the same |
| 5565 |  * sequence number as the FIN, and thus according to RFC 5961 a challenge |
| 5566 |  * ACK should be sent. However, Mac OSX rate-limits replies to challenge |
| 5567 |  * ACKs on the closed socket. In addition, middleboxes can drop either the |
| 5568 | * challenge ACK or a subsequent RST. |
| 5569 | */ |
| 5570 | static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb) |
| 5571 | { |
| 5572 | struct tcp_sock *tp = tcp_sk(sk); |
| 5573 | |
| 5574 | return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) && |
| 5575 | (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK | |
| 5576 | TCPF_CLOSING)); |
| 5577 | } |
| 5578 | |
| 5579 | /* Perform PAWS and sequence-number based validation of an incoming |
| 5580 |  * segment; flags play a significant role here. |
| 5581 | */ |
| 5582 | static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, |
| 5583 | const struct tcphdr *th, int syn_inerr) |
| 5584 | { |
| 5585 | struct tcp_sock *tp = tcp_sk(sk); |
| 5586 | bool rst_seq_match = false; |
| 5587 | |
| 5588 | /* RFC1323: H1. Apply PAWS check first. */ |
| 5589 | if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) && |
| 5590 | tp->rx_opt.saw_tstamp && |
| 5591 | tcp_paws_discard(sk, skb)) { |
| 5592 | if (!th->rst) { |
| 5593 | NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); |
| 5594 | if (!tcp_oow_rate_limited(sock_net(sk), skb, |
| 5595 | LINUX_MIB_TCPACKSKIPPEDPAWS, |
| 5596 | &tp->last_oow_ack_time)) |
| 5597 | tcp_send_dupack(sk, skb); |
| 5598 | goto discard; |
| 5599 | } |
| 5600 | /* Reset is accepted even if it did not pass PAWS. */ |
| 5601 | } |
| 5602 | |
| 5603 | /* Step 1: check sequence number */ |
| 5604 | if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { |
| 5605 | /* RFC793, page 37: "In all states except SYN-SENT, all reset |
| 5606 | * (RST) segments are validated by checking their SEQ-fields." |
| 5607 | * And page 69: "If an incoming segment is not acceptable, |
| 5608 | * an acknowledgment should be sent in reply (unless the RST |
| 5609 | * bit is set, if so drop the segment and return)". |
| 5610 | */ |
| 5611 | if (!th->rst) { |
| 5612 | if (th->syn) |
| 5613 | goto syn_challenge; |
| 5614 | if (!tcp_oow_rate_limited(sock_net(sk), skb, |
| 5615 | LINUX_MIB_TCPACKSKIPPEDSEQ, |
| 5616 | &tp->last_oow_ack_time)) |
| 5617 | tcp_send_dupack(sk, skb); |
| 5618 | } else if (tcp_reset_check(sk, skb)) { |
| 5619 | tcp_reset(sk); |
| 5620 | } |
| 5621 | goto discard; |
| 5622 | } |
| 5623 | |
| 5624 | /* Step 2: check RST bit */ |
| 5625 | if (th->rst) { |
| 5626 | /* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a |
| 5627 | * FIN and SACK too if available): |
| 5628 | * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or |
| 5629 | * the right-most SACK block, |
| 5630 | * then |
| 5631 | * RESET the connection |
| 5632 | * else |
| 5633 | * Send a challenge ACK |
| 5634 | */ |
| 5635 | if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt || |
| 5636 | tcp_reset_check(sk, skb)) { |
| 5637 | rst_seq_match = true; |
| 5638 | } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) { |
| 5639 | struct tcp_sack_block *sp = &tp->selective_acks[0]; |
| 5640 | int max_sack = sp[0].end_seq; |
| 5641 | int this_sack; |
| 5642 | |
| 5643 | for (this_sack = 1; this_sack < tp->rx_opt.num_sacks; |
| 5644 | ++this_sack) { |
| 5645 | max_sack = after(sp[this_sack].end_seq, |
| 5646 | max_sack) ? |
| 5647 | sp[this_sack].end_seq : max_sack; |
| 5648 | } |
| 5649 | |
| 5650 | if (TCP_SKB_CB(skb)->seq == max_sack) |
| 5651 | rst_seq_match = true; |
| 5652 | } |
| 5653 | |
| 5654 | if (rst_seq_match) |
| 5655 | tcp_reset(sk); |
| 5656 | else { |
| 5657 | /* Disable TFO if RST is out-of-order |
| 5658 | * and no data has been received |
| 5659 | * for current active TFO socket |
| 5660 | */ |
| 5661 | if (tp->syn_fastopen && !tp->data_segs_in && |
| 5662 | sk->sk_state == TCP_ESTABLISHED) |
| 5663 | tcp_fastopen_active_disable(sk); |
| 5664 | tcp_send_challenge_ack(sk, skb); |
| 5665 | } |
| 5666 | goto discard; |
| 5667 | } |
| 5668 | |
| 5669 | /* step 3: check security and precedence [ignored] */ |
| 5670 | |
| 5671 | /* step 4: Check for a SYN |
| 5672 | * RFC 5961 4.2 : Send a challenge ack |
| 5673 | */ |
| 5674 | if (th->syn) { |
| 5675 | syn_challenge: |
| 5676 | if (syn_inerr) |
| 5677 | TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); |
| 5678 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); |
| 5679 | tcp_send_challenge_ack(sk, skb); |
| 5680 | goto discard; |
| 5681 | } |
| 5682 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 5683 | bpf_skops_parse_hdr(sk, skb); |
| 5684 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5685 | return true; |
| 5686 | |
| 5687 | discard: |
| 5688 | tcp_drop(sk, skb); |
| 5689 | return false; |
| 5690 | } |
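| | |
| | /* Example (illustrative): with rcv_nxt == 1000, an RST carrying seq |
| |  * 1000 (or matching the right-most SACK edge) resets the connection, |
| |  * while an RST with seq 1500 that is merely in-window only elicits a |
| |  * challenge ACK, so a blind attacker must hit the exact expected |
| |  * sequence number rather than any in-window value (RFC 5961 3.2). |
| |  */ |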
| 5691 | |
| 5692 | /* |
| 5693 | * TCP receive function for the ESTABLISHED state. |
| 5694 | * |
| 5695 | * It is split into a fast path and a slow path. The fast path is |
| 5696 | * disabled when: |
| 5697 | * - A zero window was announced from us - zero window probing |
| 5698 | * is only handled properly in the slow path. |
| 5699 | * - Out of order segments arrived. |
| 5700 | * - Urgent data is expected. |
| 5701 | * - There is no buffer space left |
| 5702 | * - Unexpected TCP flags/window values/header lengths are received |
| 5703 | * (detected by checking the TCP header against pred_flags) |
| 5704 | * - Data is sent in both directions. Fast path only supports pure senders |
| 5705 | * or pure receivers (this means either the sequence number or the ack |
| 5706 | * value must stay constant) |
| 5707 | * - Unexpected TCP option. |
| 5708 | * |
| 5709 | * When these conditions are not satisfied it drops into a standard |
| 5710 | * receive procedure patterned after RFC793 to handle all cases. |
| 5711 | * The first three cases are guaranteed by proper pred_flags setting, |
| 5712 | * the rest is checked inline. Fast processing is turned on in |
| 5713 | * tcp_data_queue when everything is OK. |
| 5714 | */ |
| 5715 | void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) |
| 5716 | { |
| 5717 | const struct tcphdr *th = (const struct tcphdr *)skb->data; |
| 5718 | struct tcp_sock *tp = tcp_sk(sk); |
| 5719 | unsigned int len = skb->len; |
| 5720 | |
| 5721 | /* TCP congestion window tracking */ |
| 5722 | trace_tcp_probe(sk, skb); |
| 5723 | |
| 5724 | tcp_mstamp_refresh(tp); |
| 5725 | if (unlikely(!sk->sk_rx_dst)) |
| 5726 | inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); |
| 5727 | /* |
| 5728 | * Header prediction. |
| 5729 | * The code loosely follows the one in the famous |
| 5730 | * "30 instruction TCP receive" Van Jacobson mail. |
| 5731 | * |
| 5732 | * Van's trick is to deposit buffers into socket queue |
| 5733 | * on a device interrupt, to call tcp_recv function |
| 5734 | * on the receive process context and checksum and copy |
| 5735 | * the buffer to user space. smart... |
| 5736 | * |
| 5737 |  * Our current scheme is not silly either, but we take the |
| 5738 |  * extra cost of the net_bh soft interrupt processing... |
| 5739 |  * We also do checksum and copy, but from device to kernel. |
| 5740 | */ |
| 5741 | |
| 5742 | tp->rx_opt.saw_tstamp = 0; |
| 5743 | |
| 5744 | /* pred_flags is 0xS?10 << 16 + snd_wnd |
| 5745 | * if header_prediction is to be made |
| 5746 | * 'S' will always be tp->tcp_header_len >> 2 |
| 5747 | * '?' will be 0 for the fast path, otherwise pred_flags is 0 to |
| 5748 | * turn it off (when there are holes in the receive |
| 5749 | * space for instance) |
| 5750 | * PSH flag is ignored. |
| 5751 | */ |
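| | |
| | /* Worked example (illustrative): with timestamps negotiated, |
| |  * tcp_header_len == 20 + 12 == 32, so S == 8 and pred_flags encodes |
| |  * doff == 8 plus the ACK bit, i.e. 0x8010 << 16, ORed with the |
| |  * expected snd_wnd; only headers matching this word (and the expected |
| |  * seq/ack values) take the fast path below. |
| |  */ |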
| 5752 | |
| 5753 | if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && |
| 5754 | TCP_SKB_CB(skb)->seq == tp->rcv_nxt && |
| 5755 | !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { |
| 5756 | int tcp_header_len = tp->tcp_header_len; |
| 5757 | |
| 5758 | /* Timestamp header prediction: tcp_header_len |
| 5759 | * is automatically equal to th->doff*4 due to pred_flags |
| 5760 | * match. |
| 5761 | */ |
| 5762 | |
| 5763 | /* Check timestamp */ |
| 5764 | if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { |
| 5765 | /* No? Slow path! */ |
| 5766 | if (!tcp_parse_aligned_timestamp(tp, th)) |
| 5767 | goto slow_path; |
| 5768 | |
| 5769 | /* If PAWS failed, check it more carefully in slow path */ |
| 5770 | if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) |
| 5771 | goto slow_path; |
| 5772 | |
| 5773 | /* DO NOT update ts_recent here, if checksum fails |
| 5774 | * and timestamp was corrupted part, it will result |
| 5775 | * in a hung connection since we will drop all |
| 5776 | * future packets due to the PAWS test. |
| 5777 | */ |
| 5778 | } |
| 5779 | |
| 5780 | if (len <= tcp_header_len) { |
| 5781 | /* Bulk data transfer: sender */ |
| 5782 | if (len == tcp_header_len) { |
| 5783 | /* Predicted packet is in window by definition. |
| 5784 | * seq == rcv_nxt and rcv_wup <= rcv_nxt. |
| 5785 | * Hence, check seq<=rcv_wup reduces to: |
| 5786 | */ |
| 5787 | if (tcp_header_len == |
| 5788 | (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && |
| 5789 | tp->rcv_nxt == tp->rcv_wup) |
| 5790 | tcp_store_ts_recent(tp); |
| 5791 | |
| 5792 | /* We know that such packets are checksummed |
| 5793 | * on entry. |
| 5794 | */ |
| 5795 | tcp_ack(sk, skb, 0); |
| 5796 | __kfree_skb(skb); |
| 5797 | tcp_data_snd_check(sk); |
| 5798 | /* When receiving pure ack in fast path, update |
| 5799 | * last ts ecr directly instead of calling |
| 5800 | * tcp_rcv_rtt_measure_ts() |
| 5801 | */ |
| 5802 | tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; |
| 5803 | return; |
| 5804 | } else { /* Header too small */ |
| 5805 | TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); |
| 5806 | goto discard; |
| 5807 | } |
| 5808 | } else { |
| 5809 | int eaten = 0; |
| 5810 | bool fragstolen = false; |
| 5811 | |
| 5812 | if (tcp_checksum_complete(skb)) |
| 5813 | goto csum_error; |
| 5814 | |
| 5815 | if ((int)skb->truesize > sk->sk_forward_alloc) |
| 5816 | goto step5; |
| 5817 | |
| 5818 | /* Predicted packet is in window by definition. |
| 5819 | * seq == rcv_nxt and rcv_wup <= rcv_nxt. |
| 5820 | * Hence, check seq<=rcv_wup reduces to: |
| 5821 | */ |
| 5822 | if (tcp_header_len == |
| 5823 | (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && |
| 5824 | tp->rcv_nxt == tp->rcv_wup) |
| 5825 | tcp_store_ts_recent(tp); |
| 5826 | |
| 5827 | tcp_rcv_rtt_measure_ts(sk, skb); |
| 5828 | |
| 5829 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); |
| 5830 | |
| 5831 | /* Bulk data transfer: receiver */ |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 5832 | __skb_pull(skb, tcp_header_len); |
| 5833 | eaten = tcp_queue_rcv(sk, skb, &fragstolen); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5834 | |
| 5835 | tcp_event_data_recv(sk, skb); |
| 5836 | |
| 5837 | if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { |
| 5838 | /* Well, only one small jumplet in fast path... */ |
| 5839 | tcp_ack(sk, skb, FLAG_DATA); |
| 5840 | tcp_data_snd_check(sk); |
| 5841 | if (!inet_csk_ack_scheduled(sk)) |
| 5842 | goto no_ack; |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 5843 | } else { |
| 5844 | tcp_update_wl(tp, TCP_SKB_CB(skb)->seq); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5845 | } |
| 5846 | |
| 5847 | __tcp_ack_snd_check(sk, 0); |
| 5848 | no_ack: |
| 5849 | if (eaten) |
| 5850 | kfree_skb_partial(skb, fragstolen); |
| 5851 | tcp_data_ready(sk); |
| 5852 | return; |
| 5853 | } |
| 5854 | } |
| 5855 | |
| 5856 | slow_path: |
| 5857 | if (len < (th->doff << 2) || tcp_checksum_complete(skb)) |
| 5858 | goto csum_error; |
| 5859 | |
| 5860 | if (!th->ack && !th->rst && !th->syn) |
| 5861 | goto discard; |
| 5862 | |
| 5863 | /* |
| 5864 | * Standard slow path. |
| 5865 | */ |
| 5866 | |
| 5867 | if (!tcp_validate_incoming(sk, skb, th, 1)) |
| 5868 | return; |
| 5869 | |
| 5870 | step5: |
| 5871 | if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) |
| 5872 | goto discard; |
| 5873 | |
| 5874 | tcp_rcv_rtt_measure_ts(sk, skb); |
| 5875 | |
| 5876 | /* Process urgent data. */ |
| 5877 | tcp_urg(sk, skb, th); |
| 5878 | |
| 5879 | /* step 7: process the segment text */ |
| 5880 | tcp_data_queue(sk, skb); |
| 5881 | |
| 5882 | tcp_data_snd_check(sk); |
| 5883 | tcp_ack_snd_check(sk); |
| 5884 | return; |
| 5885 | |
| 5886 | csum_error: |
| 5887 | TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); |
| 5888 | TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); |
| 5889 | |
| 5890 | discard: |
| 5891 | tcp_drop(sk, skb); |
| 5892 | } |
| 5893 | EXPORT_SYMBOL(tcp_rcv_established); |
| 5894 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 5895 | void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 5896 | { |
| 5897 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 5898 | struct tcp_sock *tp = tcp_sk(sk); |
| 5899 | |
| 5900 | tcp_mtup_init(sk); |
| 5901 | icsk->icsk_af_ops->rebuild_header(sk); |
| 5902 | tcp_init_metrics(sk); |
| 5903 | |
| 5904 | /* Initialize the congestion window to start the transfer. |
| 5905 | * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been |
| 5906 |  * retransmitted. In light of RFC6298's more aggressive 1 sec |
| 5907 |  * initRTO, we only reset cwnd when more than one SYN/SYN-ACK |
| 5908 |  * retransmission has occurred. |
| 5909 | */ |
| 5910 | if (tp->total_retrans > 1 && tp->undo_marker) |
| 5911 | tp->snd_cwnd = 1; |
| 5912 | else |
| 5913 | tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); |
| 5914 | tp->snd_cwnd_stamp = tcp_jiffies32; |
| 5915 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 5916 | bpf_skops_established(sk, bpf_op, skb); |
| 5917 | /* Initialize congestion control unless BPF initialized it already: */ |
| 5918 | if (!icsk->icsk_ca_initialized) |
| 5919 | tcp_init_congestion_control(sk); |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 5920 | tcp_init_buffer_space(sk); |
| 5921 | } |
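| | |
| | /* Example (illustrative): on a clean handshake tcp_init_cwnd() |
| |  * typically yields the RFC 6928 initial window of 10 segments (a |
| |  * cached route metric may override this), whereas a connection whose |
| |  * SYN/SYN-ACK was retransmitted more than once starts from cwnd == 1 |
| |  * as above. |
| |  */ |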
| 5922 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5923 | void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) |
| 5924 | { |
| 5925 | struct tcp_sock *tp = tcp_sk(sk); |
| 5926 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 5927 | |
| 5928 | tcp_set_state(sk, TCP_ESTABLISHED); |
| 5929 | icsk->icsk_ack.lrcvtime = tcp_jiffies32; |
| 5930 | |
| 5931 | if (skb) { |
| 5932 | icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); |
| 5933 | security_inet_conn_established(sk, skb); |
| 5934 | sk_mark_napi_id(sk, skb); |
| 5935 | } |
| 5936 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 5937 | tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5938 | |
| 5939 | /* Prevent spurious tcp_cwnd_restart() on first data |
| 5940 | * packet. |
| 5941 | */ |
| 5942 | tp->lsndtime = tcp_jiffies32; |
| 5943 | |
| 5944 | if (sock_flag(sk, SOCK_KEEPOPEN)) |
| 5945 | inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); |
| 5946 | |
| 5947 | if (!tp->rx_opt.snd_wscale) |
| 5948 | __tcp_fast_path_on(tp, tp->snd_wnd); |
| 5949 | else |
| 5950 | tp->pred_flags = 0; |
| 5951 | } |
| 5952 | |
| 5953 | static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, |
| 5954 | struct tcp_fastopen_cookie *cookie) |
| 5955 | { |
| 5956 | struct tcp_sock *tp = tcp_sk(sk); |
| 5957 | struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL; |
| 5958 | u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; |
| 5959 | bool syn_drop = false; |
| 5960 | |
| 5961 | if (mss == tp->rx_opt.user_mss) { |
| 5962 | struct tcp_options_received opt; |
| 5963 | |
| 5964 | /* Get original SYNACK MSS value if user MSS sets mss_clamp */ |
| 5965 | tcp_clear_options(&opt); |
| 5966 | opt.user_mss = opt.mss_clamp = 0; |
| 5967 | tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL); |
| 5968 | mss = opt.mss_clamp; |
| 5969 | } |
| 5970 | |
| 5971 | if (!tp->syn_fastopen) { |
| 5972 | /* Ignore an unsolicited cookie */ |
| 5973 | cookie->len = -1; |
| 5974 | } else if (tp->total_retrans) { |
| 5975 | /* SYN timed out and the SYN-ACK neither has a cookie nor |
| 5976 | * acknowledges data. Presumably the remote received only |
| 5977 | * the retransmitted (regular) SYNs: either the original |
| 5978 | * SYN-data or the corresponding SYN-ACK was dropped. |
| 5979 | */ |
| 5980 | syn_drop = (cookie->len < 0 && data); |
| 5981 | } else if (cookie->len < 0 && !tp->syn_data) { |
| 5982 | /* We requested a cookie but didn't get it. If we did not use |
| 5983 | * the (old) exp opt format then try so next time (try_exp=1). |
| 5984 | * Otherwise we go back to use the RFC7413 opt (try_exp=2). |
| 5985 | */ |
| 5986 | try_exp = tp->syn_fastopen_exp ? 2 : 1; |
| 5987 | } |
| 5988 | |
| 5989 | tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); |
| 5990 | |
| 5991 | if (data) { /* Retransmit unacked data in SYN */ |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 5992 | if (tp->total_retrans) |
| 5993 | tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED; |
| 5994 | else |
| 5995 | tp->fastopen_client_fail = TFO_DATA_NOT_ACKED; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 5996 | skb_rbtree_walk_from(data) { |
| 5997 | if (__tcp_retransmit_skb(sk, data, 1)) |
| 5998 | break; |
| 5999 | } |
| 6000 | tcp_rearm_rto(sk); |
| 6001 | NET_INC_STATS(sock_net(sk), |
| 6002 | LINUX_MIB_TCPFASTOPENACTIVEFAIL); |
| 6003 | return true; |
| 6004 | } |
| 6005 | tp->syn_data_acked = tp->syn_data; |
| 6006 | if (tp->syn_data_acked) { |
| 6007 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); |
| 6008 | /* SYN-data is counted as two separate packets in tcp_ack() */ |
| 6009 | if (tp->delivered > 1) |
| 6010 | --tp->delivered; |
| 6011 | } |
| 6012 | |
| 6013 | tcp_fastopen_add_skb(sk, synack); |
| 6014 | |
| 6015 | return false; |
| 6016 | } |
| 6017 | |
| 6018 | static void smc_check_reset_syn(struct tcp_sock *tp) |
| 6019 | { |
| 6020 | #if IS_ENABLED(CONFIG_SMC) |
| 6021 | if (static_branch_unlikely(&tcp_have_smc)) { |
| 6022 | if (tp->syn_smc && !tp->rx_opt.smc_ok) |
| 6023 | tp->syn_smc = 0; |
| 6024 | } |
| 6025 | #endif |
| 6026 | } |
| 6027 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 6028 | static void tcp_try_undo_spurious_syn(struct sock *sk) |
| 6029 | { |
| 6030 | struct tcp_sock *tp = tcp_sk(sk); |
| 6031 | u32 syn_stamp; |
| 6032 | |
| 6033 | /* undo_marker is set when SYN or SYNACK times out. The timeout is |
| 6034 | * spurious if the ACK's timestamp option echo value matches the |
| 6035 | * original SYN timestamp. |
| 6036 | */ |
| 6037 | syn_stamp = tp->retrans_stamp; |
| 6038 | if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp && |
| 6039 | syn_stamp == tp->rx_opt.rcv_tsecr) |
| 6040 | tp->undo_marker = 0; |
| 6041 | } |
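| | |
| | /* Example (illustrative): if the original SYN carried TSval 100, timed |
| |  * out and was retransmitted, an incoming ACK echoing TSecr == 100 |
| |  * proves the original SYN did reach the peer, so the timeout was |
| |  * spurious and undo_marker is cleared to permit undoing the |
| |  * congestion-state changes. |
| |  */ |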
| 6042 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6043 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, |
| 6044 | const struct tcphdr *th) |
| 6045 | { |
| 6046 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 6047 | struct tcp_sock *tp = tcp_sk(sk); |
| 6048 | struct tcp_fastopen_cookie foc = { .len = -1 }; |
| 6049 | int saved_clamp = tp->rx_opt.mss_clamp; |
| 6050 | bool fastopen_fail; |
| 6051 | |
| 6052 | tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc); |
| 6053 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) |
| 6054 | tp->rx_opt.rcv_tsecr -= tp->tsoffset; |
| 6055 | |
| 6056 | if (th->ack) { |
| 6057 | /* rfc793: |
| 6058 | * "If the state is SYN-SENT then |
| 6059 | * first check the ACK bit |
| 6060 | * If the ACK bit is set |
| 6061 | * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send |
| 6062 | * a reset (unless the RST bit is set, if so drop |
| 6063 | * the segment and return)" |
| 6064 | */ |
| 6065 | if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 6066 | after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { |
| 6067 | /* Previous FIN/ACK or RST/ACK might be ignored. */ |
| 6068 | if (icsk->icsk_retransmits == 0) |
| 6069 | inet_csk_reset_xmit_timer(sk, |
| 6070 | ICSK_TIME_RETRANS, |
| 6071 | TCP_TIMEOUT_MIN, TCP_RTO_MAX); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6072 | goto reset_and_undo; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 6073 | } |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6074 | |
| 6075 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
| 6076 | !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, |
| 6077 | tcp_time_stamp(tp))) { |
| 6078 | NET_INC_STATS(sock_net(sk), |
| 6079 | LINUX_MIB_PAWSACTIVEREJECTED); |
| 6080 | goto reset_and_undo; |
| 6081 | } |
| 6082 | |
| 6083 | /* Now ACK is acceptable. |
| 6084 | * |
| 6085 | * "If the RST bit is set |
| 6086 | * If the ACK was acceptable then signal the user "error: |
| 6087 | * connection reset", drop the segment, enter CLOSED state, |
| 6088 | * delete TCB, and return." |
| 6089 | */ |
| 6090 | |
| 6091 | if (th->rst) { |
| 6092 | tcp_reset(sk); |
| 6093 | goto discard; |
| 6094 | } |
| 6095 | |
| 6096 | /* rfc793: |
| 6097 | * "fifth, if neither of the SYN or RST bits is set then |
| 6098 | * drop the segment and return." |
| 6099 | * |
| 6100 | * See note below! |
| 6101 | * --ANK(990513) |
| 6102 | */ |
| 6103 | if (!th->syn) |
| 6104 | goto discard_and_undo; |
| 6105 | |
| 6106 | /* rfc793: |
| 6107 | * "If the SYN bit is on ... |
| 6108 | * are acceptable then ... |
| 6109 | * (our SYN has been ACKed), change the connection |
| 6110 | * state to ESTABLISHED..." |
| 6111 | */ |
| 6112 | |
| 6113 | tcp_ecn_rcv_synack(tp, th); |
| 6114 | |
| 6115 | tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 6116 | tcp_try_undo_spurious_syn(sk); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6117 | tcp_ack(sk, skb, FLAG_SLOWPATH); |
| 6118 | |
| 6119 | /* Ok.. it's good. Set up sequence numbers and |
| 6120 | * move to established. |
| 6121 | */ |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 6122 | WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6123 | tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; |
| 6124 | |
| 6125 | /* RFC1323: The window in SYN & SYN/ACK segments is |
| 6126 | * never scaled. |
| 6127 | */ |
| 6128 | tp->snd_wnd = ntohs(th->window); |
| 6129 | |
| 6130 | if (!tp->rx_opt.wscale_ok) { |
| 6131 | tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; |
| 6132 | tp->window_clamp = min(tp->window_clamp, 65535U); |
| 6133 | } |
| 6134 | |
| 6135 | if (tp->rx_opt.saw_tstamp) { |
| 6136 | tp->rx_opt.tstamp_ok = 1; |
| 6137 | tp->tcp_header_len = |
| 6138 | sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; |
| 6139 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
| 6140 | tcp_store_ts_recent(tp); |
| 6141 | } else { |
| 6142 | tp->tcp_header_len = sizeof(struct tcphdr); |
| 6143 | } |
| 6144 | |
| 6145 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
| 6146 | tcp_initialize_rcv_mss(sk); |
| 6147 | |
| 6148 | /* Remember, tcp_poll() does not lock the socket! |
| 6149 | * Change state from SYN-SENT only after copied_seq |
| 6150 | * is initialized. */ |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 6151 | WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6152 | |
| 6153 | smc_check_reset_syn(tp); |
| 6154 | |
| 6155 | smp_mb(); |
| 6156 | |
| 6157 | tcp_finish_connect(sk, skb); |
| 6158 | |
| 6159 | fastopen_fail = (tp->syn_fastopen || tp->syn_data) && |
| 6160 | tcp_rcv_fastopen_synack(sk, skb, &foc); |
| 6161 | |
| 6162 | if (!sock_flag(sk, SOCK_DEAD)) { |
| 6163 | sk->sk_state_change(sk); |
| 6164 | sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); |
| 6165 | } |
| 6166 | if (fastopen_fail) |
| 6167 | return -1; |
| 6168 | if (sk->sk_write_pending || |
| 6169 | icsk->icsk_accept_queue.rskq_defer_accept || |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 6170 | inet_csk_in_pingpong_mode(sk)) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6171 | /* Save one ACK. Data will be ready after |
| 6172 | * several ticks, if write_pending is set. |
| 6173 | * |
| 6174 | * It may be deleted, but with this feature tcpdumps |
| 6175 | * look so _wonderfully_ clever, that I was not able |
| 6176 | * to stand against the temptation 8) --ANK |
| 6177 | */ |
| 6178 | inet_csk_schedule_ack(sk); |
| 6179 | tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); |
| 6180 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, |
| 6181 | TCP_DELACK_MAX, TCP_RTO_MAX); |
| 6182 | |
| 6183 | discard: |
| 6184 | tcp_drop(sk, skb); |
| 6185 | return 0; |
| 6186 | } else { |
| 6187 | tcp_send_ack(sk); |
| 6188 | } |
| 6189 | return -1; |
| 6190 | } |
| 6191 | |
| 6192 | /* No ACK in the segment */ |
| 6193 | |
| 6194 | if (th->rst) { |
| 6195 | /* rfc793: |
| 6196 | * "If the RST bit is set |
| 6197 | * |
| 6198 | * Otherwise (no ACK) drop the segment and return." |
| 6199 | */ |
| 6200 | |
| 6201 | goto discard_and_undo; |
| 6202 | } |
| 6203 | |
| 6204 | /* PAWS check. */ |
| 6205 | if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && |
| 6206 | tcp_paws_reject(&tp->rx_opt, 0)) |
| 6207 | goto discard_and_undo; |
| 6208 | |
| 6209 | if (th->syn) { |
| 6210 | /* We see a SYN without an ACK. It is an attempt at a |
| 6211 |  * simultaneous connect with crossed SYNs. |
| 6212 |  * In particular, it can be a connect to self. |
| 6213 | */ |
| 6214 | tcp_set_state(sk, TCP_SYN_RECV); |
| 6215 | |
| 6216 | if (tp->rx_opt.saw_tstamp) { |
| 6217 | tp->rx_opt.tstamp_ok = 1; |
| 6218 | tcp_store_ts_recent(tp); |
| 6219 | tp->tcp_header_len = |
| 6220 | sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; |
| 6221 | } else { |
| 6222 | tp->tcp_header_len = sizeof(struct tcphdr); |
| 6223 | } |
| 6224 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 6225 | WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); |
| 6226 | WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6227 | tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; |
| 6228 | |
| 6229 | /* RFC1323: The window in SYN & SYN/ACK segments is |
| 6230 | * never scaled. |
| 6231 | */ |
| 6232 | tp->snd_wnd = ntohs(th->window); |
| 6233 | tp->snd_wl1 = TCP_SKB_CB(skb)->seq; |
| 6234 | tp->max_window = tp->snd_wnd; |
| 6235 | |
| 6236 | tcp_ecn_rcv_syn(tp, th); |
| 6237 | |
| 6238 | tcp_mtup_init(sk); |
| 6239 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
| 6240 | tcp_initialize_rcv_mss(sk); |
| 6241 | |
| 6242 | tcp_send_synack(sk); |
| 6243 | #if 0 |
| 6244 | /* Note, we could accept data and URG from this segment. |
| 6245 |  * There are no obstacles to doing this (except that we must |
| 6246 |  * either change tcp_recvmsg() to prevent it from returning data |
| 6247 |  * before the 3WHS completes per RFC793, or employ TCP Fast Open). |
| 6248 |  * |
| 6249 |  * However, if we sometimes ignore data in ACKless segments, |
| 6250 |  * we have no reason to sometimes accept it. |
| 6251 |  * Also, it seems the code doing this in step6 of tcp_rcv_state_process |
| 6252 |  * is not flawless. So, discard the packet for sanity. |
| 6253 | * Uncomment this return to process the data. |
| 6254 | */ |
| 6255 | return -1; |
| 6256 | #else |
| 6257 | goto discard; |
| 6258 | #endif |
| 6259 | } |
| 6260 | /* "fifth, if neither of the SYN or RST bits is set then |
| 6261 | * drop the segment and return." |
| 6262 | */ |
| 6263 | |
| 6264 | discard_and_undo: |
| 6265 | tcp_clear_options(&tp->rx_opt); |
| 6266 | tp->rx_opt.mss_clamp = saved_clamp; |
| 6267 | goto discard; |
| 6268 | |
| 6269 | reset_and_undo: |
| 6270 | tcp_clear_options(&tp->rx_opt); |
| 6271 | tp->rx_opt.mss_clamp = saved_clamp; |
| 6272 | return 1; |
| 6273 | } |
| 6274 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 6275 | static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) |
| 6276 | { |
| 6277 | struct request_sock *req; |
| 6278 | |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 6279 | /* If we are still handling the SYNACK RTO, see if timestamp ECR allows |
| 6280 | * undo. If peer SACKs triggered fast recovery, we can't undo here. |
| 6281 | */ |
| 6282 | if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) |
| 6283 | tcp_try_undo_loss(sk, false); |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 6284 | |
| 6285 | /* Reset rtx states to prevent spurious retransmits_timed_out() */ |
| 6286 | tcp_sk(sk)->retrans_stamp = 0; |
| 6287 | inet_csk(sk)->icsk_retransmits = 0; |
| 6288 | |
| 6289 | /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, |
| 6290 | * we no longer need req so release it. |
| 6291 | */ |
| 6292 | req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, |
| 6293 | lockdep_sock_is_held(sk)); |
| 6294 | reqsk_fastopen_remove(sk, req, false); |
| 6295 | |
| 6296 | /* Re-arm the timer because data may have been sent out. |
| 6297 | * This is similar to the regular data transmission case |
| 6298 | * when new data has just been ack'ed. |
| 6299 | * |
| 6300 | * (TFO) - we could try to be more aggressive and |
| 6301 |  * retransmit any data sooner based on when it was |
| 6302 |  * sent out. |
| 6303 | */ |
| 6304 | tcp_rearm_rto(sk); |
| 6305 | } |
| 6306 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6307 | /* |
| 6308 | * This function implements the receiving procedure of RFC 793 for |
| 6309 | * all states except ESTABLISHED and TIME_WAIT. |
| 6310 | * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be |
| 6311 | * address independent. |
| 6312 | */ |
| 6313 | |
| 6314 | int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) |
| 6315 | { |
| 6316 | struct tcp_sock *tp = tcp_sk(sk); |
| 6317 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 6318 | const struct tcphdr *th = tcp_hdr(skb); |
| 6319 | struct request_sock *req; |
| 6320 | int queued = 0; |
| 6321 | bool acceptable; |
| 6322 | |
| 6323 | switch (sk->sk_state) { |
| 6324 | case TCP_CLOSE: |
| 6325 | goto discard; |
| 6326 | |
| 6327 | case TCP_LISTEN: |
| 6328 | if (th->ack) |
| 6329 | return 1; |
| 6330 | |
| 6331 | if (th->rst) |
| 6332 | goto discard; |
| 6333 | |
| 6334 | if (th->syn) { |
| 6335 | if (th->fin) |
| 6336 | goto discard; |
| 6337 | /* It is possible that we process SYN packets from backlog, |
| 6338 |  * so we need to make sure BH is disabled and the RCU read lock is held. |
| 6339 | */ |
| 6340 | rcu_read_lock(); |
| 6341 | local_bh_disable(); |
| 6342 | acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0; |
| 6343 | local_bh_enable(); |
| 6344 | rcu_read_unlock(); |
| 6345 | |
| 6346 | if (!acceptable) |
| 6347 | return 1; |
| 6348 | consume_skb(skb); |
| 6349 | return 0; |
| 6350 | } |
| 6351 | goto discard; |
| 6352 | |
| 6353 | case TCP_SYN_SENT: |
| 6354 | tp->rx_opt.saw_tstamp = 0; |
| 6355 | tcp_mstamp_refresh(tp); |
| 6356 | queued = tcp_rcv_synsent_state_process(sk, skb, th); |
| 6357 | if (queued >= 0) |
| 6358 | return queued; |
| 6359 | |
| 6360 | /* Do step6 onward by hand. */ |
| 6361 | tcp_urg(sk, skb, th); |
| 6362 | __kfree_skb(skb); |
| 6363 | tcp_data_snd_check(sk); |
| 6364 | return 0; |
| 6365 | } |
| 6366 | |
| 6367 | tcp_mstamp_refresh(tp); |
| 6368 | tp->rx_opt.saw_tstamp = 0; |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 6369 | req = rcu_dereference_protected(tp->fastopen_rsk, |
| 6370 | lockdep_sock_is_held(sk)); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6371 | if (req) { |
| 6372 | bool req_stolen; |
| 6373 | |
| 6374 | WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && |
| 6375 | sk->sk_state != TCP_FIN_WAIT1); |
| 6376 | |
| 6377 | if (!tcp_check_req(sk, skb, req, true, &req_stolen)) |
| 6378 | goto discard; |
| 6379 | } |
| 6380 | |
| 6381 | if (!th->ack && !th->rst && !th->syn) |
| 6382 | goto discard; |
| 6383 | |
| 6384 | if (!tcp_validate_incoming(sk, skb, th, 0)) |
| 6385 | return 0; |
| 6386 | |
| 6387 | /* step 5: check the ACK field */ |
| 6388 | acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | |
| 6389 | FLAG_UPDATE_TS_RECENT | |
| 6390 | FLAG_NO_CHALLENGE_ACK) > 0; |
| 6391 | |
| 6392 | if (!acceptable) { |
| 6393 | if (sk->sk_state == TCP_SYN_RECV) |
| 6394 | return 1; /* send one RST */ |
| 6395 | tcp_send_challenge_ack(sk, skb); |
| 6396 | goto discard; |
| 6397 | } |
| 6398 | switch (sk->sk_state) { |
| 6399 | case TCP_SYN_RECV: |
| 6400 | tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */ |
| 6401 | if (!tp->srtt_us) |
| 6402 | tcp_synack_rtt_meas(sk, req); |
| 6403 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6404 | if (req) { |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 6405 | tcp_rcv_synrecv_state_fastopen(sk); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6406 | } else { |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 6407 | tcp_try_undo_spurious_syn(sk); |
| 6408 | tp->retrans_stamp = 0; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 6409 | tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, |
| 6410 | skb); |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 6411 | WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6412 | } |
| 6413 | smp_mb(); |
| 6414 | tcp_set_state(sk, TCP_ESTABLISHED); |
| 6415 | sk->sk_state_change(sk); |
| 6416 | |
| 6417 | /* Note that this wakeup is only for the marginal crossed-SYN case. |
| 6418 |  * Passively opened sockets are not woken up, because |
| 6419 | * sk->sk_sleep == NULL and sk->sk_socket == NULL. |
| 6420 | */ |
| 6421 | if (sk->sk_socket) |
| 6422 | sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); |
| 6423 | |
| 6424 | tp->snd_una = TCP_SKB_CB(skb)->ack_seq; |
| 6425 | tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; |
| 6426 | tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); |
| 6427 | |
| 6428 | if (tp->rx_opt.tstamp_ok) |
| 6429 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
| 6430 | |
| 6431 | if (!inet_csk(sk)->icsk_ca_ops->cong_control) |
| 6432 | tcp_update_pacing_rate(sk); |
| 6433 | |
| 6434 | /* Prevent spurious tcp_cwnd_restart() on first data packet */ |
| 6435 | tp->lsndtime = tcp_jiffies32; |
| 6436 | |
| 6437 | tcp_initialize_rcv_mss(sk); |
| 6438 | tcp_fast_path_on(tp); |
| 6439 | break; |
| 6440 | |
| 6441 | case TCP_FIN_WAIT1: { |
| 6442 | int tmo; |
| 6443 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 6444 | if (req) |
| 6445 | tcp_rcv_synrecv_state_fastopen(sk); |
| 6446 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6447 | if (tp->snd_una != tp->write_seq) |
| 6448 | break; |
| 6449 | |
| 6450 | tcp_set_state(sk, TCP_FIN_WAIT2); |
| 6451 | sk->sk_shutdown |= SEND_SHUTDOWN; |
| 6452 | |
| 6453 | sk_dst_confirm(sk); |
| 6454 | |
| 6455 | if (!sock_flag(sk, SOCK_DEAD)) { |
| 6456 | /* Wake up lingering close() */ |
| 6457 | sk->sk_state_change(sk); |
| 6458 | break; |
| 6459 | } |
| 6460 | |
| 6461 | if (tp->linger2 < 0) { |
| 6462 | tcp_done(sk); |
| 6463 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); |
| 6464 | return 1; |
| 6465 | } |
| 6466 | if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && |
| 6467 | after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { |
| 6468 | /* Receive out of order FIN after close() */ |
| 6469 | if (tp->syn_fastopen && th->fin) |
| 6470 | tcp_fastopen_active_disable(sk); |
| 6471 | tcp_done(sk); |
| 6472 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); |
| 6473 | return 1; |
| 6474 | } |
| 6475 | |
| 6476 | tmo = tcp_fin_time(sk); |
| 6477 | if (tmo > TCP_TIMEWAIT_LEN) { |
| 6478 | inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); |
| 6479 | } else if (th->fin || sock_owned_by_user(sk)) { |
| 6480 | /* Bad case. We could lose such a FIN otherwise. |
| 6481 |  * It is not a big problem, but it looks confusing, |
| 6482 |  * and it is not such a rare event. We can still lose it now, |
| 6483 |  * if it spins in bh_lock_sock(), but that is a really |
| 6484 |  * marginal case. |
| 6485 | */ |
| 6486 | inet_csk_reset_keepalive_timer(sk, tmo); |
| 6487 | } else { |
| 6488 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); |
| 6489 | goto discard; |
| 6490 | } |
| 6491 | break; |
| 6492 | } |
| 6493 | |
| 6494 | case TCP_CLOSING: |
| 6495 | if (tp->snd_una == tp->write_seq) { |
| 6496 | tcp_time_wait(sk, TCP_TIME_WAIT, 0); |
| 6497 | goto discard; |
| 6498 | } |
| 6499 | break; |
| 6500 | |
| 6501 | case TCP_LAST_ACK: |
| 6502 | if (tp->snd_una == tp->write_seq) { |
| 6503 | tcp_update_metrics(sk); |
| 6504 | tcp_done(sk); |
| 6505 | goto discard; |
| 6506 | } |
| 6507 | break; |
| 6508 | } |
| 6509 | |
| 6510 | /* step 6: check the URG bit */ |
| 6511 | tcp_urg(sk, skb, th); |
| 6512 | |
| 6513 | /* step 7: process the segment text */ |
| 6514 | switch (sk->sk_state) { |
| 6515 | case TCP_CLOSE_WAIT: |
| 6516 | case TCP_CLOSING: |
| 6517 | case TCP_LAST_ACK: |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 6518 | if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { |
| 6519 | if (sk_is_mptcp(sk)) |
| 6520 | mptcp_incoming_options(sk, skb); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6521 | break; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 6522 | } |
| 6523 | fallthrough; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6524 | case TCP_FIN_WAIT1: |
| 6525 | case TCP_FIN_WAIT2: |
| 6526 | /* RFC 793 says to queue data in these states, |
| 6527 | * RFC 1122 says we MUST send a reset. |
| 6528 |  * BSD 4.4 also sends a reset. |
| 6529 | */ |
| 6530 | if (sk->sk_shutdown & RCV_SHUTDOWN) { |
| 6531 | if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && |
| 6532 | after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { |
| 6533 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); |
| 6534 | tcp_reset(sk); |
| 6535 | return 1; |
| 6536 | } |
| 6537 | } |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 6538 | fallthrough; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6539 | case TCP_ESTABLISHED: |
| 6540 | tcp_data_queue(sk, skb); |
| 6541 | queued = 1; |
| 6542 | break; |
| 6543 | } |
| 6544 | |
| 6545 | /* tcp_data could move socket to TIME-WAIT */ |
| 6546 | if (sk->sk_state != TCP_CLOSE) { |
| 6547 | tcp_data_snd_check(sk); |
| 6548 | tcp_ack_snd_check(sk); |
| 6549 | } |
| 6550 | |
| 6551 | if (!queued) { |
| 6552 | discard: |
| 6553 | tcp_drop(sk, skb); |
| 6554 | } |
| 6555 | return 0; |
| 6556 | } |
| 6557 | EXPORT_SYMBOL(tcp_rcv_state_process); |
| 6558 | |
| 6559 | static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) |
| 6560 | { |
| 6561 | struct inet_request_sock *ireq = inet_rsk(req); |
| 6562 | |
| 6563 | if (family == AF_INET) |
| 6564 | net_dbg_ratelimited("drop open request from %pI4/%u\n", |
| 6565 | &ireq->ir_rmt_addr, port); |
| 6566 | #if IS_ENABLED(CONFIG_IPV6) |
| 6567 | else if (family == AF_INET6) |
| 6568 | net_dbg_ratelimited("drop open request from %pI6/%u\n", |
| 6569 | &ireq->ir_v6_rmt_addr, port); |
| 6570 | #endif |
| 6571 | } |
| 6572 | |
| 6573 | /* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set |
| 6574 | * |
| 6575 | * If we receive a SYN packet with these bits set, it means a |
| 6576 | * network is playing bad games with TOS bits. In order to |
| 6577 | * avoid possible false congestion notifications, we disable |
| 6578 | * TCP ECN negotiation. |
| 6579 | * |
| 6580 | * Exception: tcp_ca wants ECN. This is required for DCTCP |
| 6581 | * congestion control: Linux DCTCP asserts ECT on all packets, |
| 6582 |  * including SYN, which is the most optimal solution; however, |
| 6583 |  * others, such as FreeBSD, do not. |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 6584 | * |
| 6585 | * Exception: At least one of the reserved bits of the TCP header (th->res1) is |
| 6586 | * set, indicating the use of a future TCP extension (such as AccECN). See |
| 6587 | * RFC8311 §4.3 which updates RFC3168 to allow the development of such |
| 6588 | * extensions. |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6589 | */ |
| 6590 | static void tcp_ecn_create_request(struct request_sock *req, |
| 6591 | const struct sk_buff *skb, |
| 6592 | const struct sock *listen_sk, |
| 6593 | const struct dst_entry *dst) |
| 6594 | { |
| 6595 | const struct tcphdr *th = tcp_hdr(skb); |
| 6596 | const struct net *net = sock_net(listen_sk); |
| 6597 | bool th_ecn = th->ece && th->cwr; |
| 6598 | bool ect, ecn_ok; |
| 6599 | u32 ecn_ok_dst; |
| 6600 | |
| 6601 | if (!th_ecn) |
| 6602 | return; |
| 6603 | |
| 6604 | ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); |
| 6605 | ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); |
| 6606 | ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst; |
| 6607 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 6608 | if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6609 | (ecn_ok_dst & DST_FEATURE_ECN_CA) || |
| 6610 | tcp_bpf_ca_needs_ecn((struct sock *)req)) |
| 6611 | inet_rsk(req)->ecn_ok = 1; |
| 6612 | } |
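| | |
| | /* Example (illustrative): a SYN arriving with ECE|CWR set and a |
| |  * non-ECT IP field negotiates ECN when the sysctl or route allows it; |
| |  * the same SYN with ECT already set is treated as suspect and ECN is |
| |  * refused, unless a reserved bit (th->res1) signals an AccECN-style |
| |  * extension or the congestion control itself requires ECN. |
| |  */ |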
| 6613 | |
| 6614 | static void tcp_openreq_init(struct request_sock *req, |
| 6615 | const struct tcp_options_received *rx_opt, |
| 6616 | struct sk_buff *skb, const struct sock *sk) |
| 6617 | { |
| 6618 | struct inet_request_sock *ireq = inet_rsk(req); |
| 6619 | |
| 6620 | req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */ |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6621 | tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; |
| 6622 | tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 6623 | tcp_rsk(req)->snt_synack = 0; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 6624 | tcp_rsk(req)->last_oow_ack_time = 0; |
| 6625 | req->mss = rx_opt->mss_clamp; |
| 6626 | req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; |
| 6627 | ireq->tstamp_ok = rx_opt->tstamp_ok; |
| 6628 | ireq->sack_ok = rx_opt->sack_ok; |
| 6629 | ireq->snd_wscale = rx_opt->snd_wscale; |
| 6630 | ireq->wscale_ok = rx_opt->wscale_ok; |
| 6631 | ireq->acked = 0; |
| 6632 | ireq->ecn_ok = 0; |
| 6633 | ireq->ir_rmt_port = tcp_hdr(skb)->source; |
| 6634 | ireq->ir_num = ntohs(tcp_hdr(skb)->dest); |
| 6635 | ireq->ir_mark = inet_request_mark(sk, skb); |
| 6636 | #if IS_ENABLED(CONFIG_SMC) |
| 6637 | ireq->smc_ok = rx_opt->smc_ok; |
| 6638 | #endif |
| 6639 | } |
| 6640 | |
| 6641 | struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, |
| 6642 | struct sock *sk_listener, |
| 6643 | bool attach_listener) |
| 6644 | { |
| 6645 | struct request_sock *req = reqsk_alloc(ops, sk_listener, |
| 6646 | attach_listener); |
| 6647 | |
| 6648 | if (req) { |
| 6649 | struct inet_request_sock *ireq = inet_rsk(req); |
| 6650 | |
| 6651 | ireq->ireq_opt = NULL; |
| 6652 | #if IS_ENABLED(CONFIG_IPV6) |
| 6653 | ireq->pktopts = NULL; |
| 6654 | #endif |
| 6655 | atomic64_set(&ireq->ir_cookie, 0); |
| 6656 | ireq->ireq_state = TCP_NEW_SYN_RECV; |
| 6657 | write_pnet(&ireq->ireq_net, sock_net(sk_listener)); |
| 6658 | ireq->ireq_family = sk_listener->sk_family; |
| 6659 | } |
| 6660 | |
| 6661 | return req; |
| 6662 | } |
| 6663 | EXPORT_SYMBOL(inet_reqsk_alloc); |
| 6664 | |
/*
 * Return true if a syncookie should be sent
 */
static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
{
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct net *net = sock_net(sk);

#ifdef CONFIG_SYN_COOKIES
	if (net->ipv4.sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	if (!queue->synflood_warned &&
	    net->ipv4.sysctl_tcp_syncookies != 2 &&
	    xchg(&queue->synflood_warned, 1) == 0)
		net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
				     proto, sk->sk_num, msg);

	return want_cookie;
}

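/* When the listener enabled TCP_SAVE_SYN, keep a copy of the SYN's
 * headers so userspace can fetch them later via getsockopt(TCP_SAVED_SYN);
 * a save_syn value of 2 additionally saves the MAC header.
 */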
static void tcp_reqsk_record_syn(const struct sock *sk,
				 struct request_sock *req,
				 const struct sk_buff *skb)
{
	if (tcp_sk(sk)->save_syn) {
		u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
		struct saved_syn *saved_syn;
		u32 mac_hdrlen;
		void *base;

		if (tcp_sk(sk)->save_syn == 2) {  /* Save full header. */
			base = skb_mac_header(skb);
			mac_hdrlen = skb_mac_header_len(skb);
			len += mac_hdrlen;
		} else {
			base = skb_network_header(skb);
			mac_hdrlen = 0;
		}

		saved_syn = kmalloc(struct_size(saved_syn, data, len),
				    GFP_ATOMIC);
		if (saved_syn) {
			saved_syn->mac_hdrlen = mac_hdrlen;
			saved_syn->network_hdrlen = skb_network_header_len(skb);
			saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
			memcpy(saved_syn->data, base, len);
			req->saved_syn = saved_syn;
		}
	}
}

/* If a SYN cookie is required and supported, return a clamped MSS value
 * to be used for SYN cookie generation; otherwise return 0.
 */
u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
			  const struct tcp_request_sock_ops *af_ops,
			  struct sock *sk, struct tcphdr *th)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u16 mss;

	if (sock_net(sk)->ipv4.sysctl_tcp_syncookies != 2 &&
	    !inet_csk_reqsk_queue_is_full(sk))
		return 0;

	if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
		return 0;

	if (sk_acceptq_is_full(sk)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
		return 0;
	}

	mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
	if (!mss)
		mss = af_ops->mss_clamp;

	return mss;
}
EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);

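/* Handle a SYN arriving on a listening socket: allocate and initialize
 * a request_sock, route the reply, then send a SYN-ACK on the normal,
 * Fast Open or syncookie path, or drop the request.
 */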
int tcp_conn_request(struct request_sock_ops *rsk_ops,
		     const struct tcp_request_sock_ops *af_ops,
		     struct sock *sk, struct sk_buff *skb)
{
	struct tcp_fastopen_cookie foc = { .len = -1 };
	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
	struct tcp_options_received tmp_opt;
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	struct sock *fastopen_sk = NULL;
	struct request_sock *req;
	bool want_cookie = false;
	struct dst_entry *dst;
	struct flowi fl;

	/* TW buckets are converted to open requests without
	 * limitation: they conserve resources and the peer is
	 * evidently a real one.
	 */
	if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
		want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
		if (!want_cookie)
			goto drop;
	}

	if (sk_acceptq_is_full(sk)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
		goto drop;
	}

	req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
	if (!req)
		goto drop;

	req->syncookie = want_cookie;
	tcp_rsk(req)->af_specific = af_ops;
	tcp_rsk(req)->ts_off = 0;
#if IS_ENABLED(CONFIG_MPTCP)
	tcp_rsk(req)->is_mptcp = 0;
#endif

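	/* Parse the options carried in the SYN. In syncookie mode only
	 * the timestamp option can carry negotiated state back to us,
	 * so discard the options if no timestamp was present.
	 */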
	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = af_ops->mss_clamp;
	tmp_opt.user_mss = tp->rx_opt.user_mss;
	tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
			  want_cookie ? NULL : &foc);

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	if (IS_ENABLED(CONFIG_SMC) && want_cookie)
		tmp_opt.smc_ok = 0;

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb, sk);
	inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;

	/* Note: tcp_v6_init_req() might override ir_iif for link locals */
	inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);

	af_ops->init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (tmp_opt.tstamp_ok)
		tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);

	dst = af_ops->route_req(sk, &fl, req);
	if (!dst)
		goto drop_and_free;

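	/* Pick the initial sequence number, unless the TIME-WAIT
	 * recycling path already supplied one through tcp_tw_isn.
	 */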
	if (!want_cookie && !isn) {
		/* Remove the following clause if you dislike this
		 * heuristic.
		 */
		if (!net->ipv4.sysctl_tcp_syncookies &&
		    (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
		     (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
		    !tcp_peer_is_proven(req, dst)) {
			/* Without syncookies, the last quarter of the
			 * backlog is reserved for destinations proven
			 * to be alive, so during a SYN flood we keep
			 * talking to peers we already knew about
			 * before the flood started.
			 */
			pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
				    rsk_ops->family);
			goto drop_and_release;
		}

		isn = af_ops->init_seq(skb);
	}

	tcp_ecn_create_request(req, skb, sk, dst);

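	/* For syncookies, encode the connection state into the ISN we
	 * send back; without timestamps there is no room to remember
	 * ECN support, so it must be turned off.
	 */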
	if (want_cookie) {
		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
		if (!tmp_opt.tstamp_ok)
			inet_rsk(req)->ecn_ok = 0;
	}

	tcp_rsk(req)->snt_isn = isn;
	tcp_rsk(req)->txhash = net_tx_rndhash();
	tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
	tcp_openreq_init_rwin(req, sk, dst);
	sk_rx_queue_set(req_to_sk(req), skb);
	if (!want_cookie) {
		tcp_reqsk_record_syn(sk, req, skb);
		fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
	}
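	/* A Fast Open child, if created, answers the SYN itself and is
	 * added straight to the accept queue. Otherwise the listener
	 * sends the SYN-ACK, hashing the request first unless we are
	 * in stateless syncookie mode.
	 */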
	if (fastopen_sk) {
		af_ops->send_synack(fastopen_sk, dst, &fl, req,
				    &foc, TCP_SYNACK_FASTOPEN, skb);
		/* Add the child socket directly into the accept queue */
		if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
			reqsk_fastopen_remove(fastopen_sk, req, false);
			bh_unlock_sock(fastopen_sk);
			sock_put(fastopen_sk);
			goto drop_and_free;
		}
		sk->sk_data_ready(sk);
		bh_unlock_sock(fastopen_sk);
		sock_put(fastopen_sk);
	} else {
		tcp_rsk(req)->tfo_listener = false;
		if (!want_cookie)
			inet_csk_reqsk_queue_hash_add(sk, req,
				tcp_timeout_init((struct sock *)req));
		af_ops->send_synack(sk, dst, &fl, req, &foc,
				    !want_cookie ? TCP_SYNACK_NORMAL :
						   TCP_SYNACK_COOKIE,
				    skb);
		if (want_cookie) {
			reqsk_free(req);
			return 0;
		}
	}
	reqsk_put(req);
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	__reqsk_free(req);
drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_conn_request);