Blame - net/ipv4/tcp_input.c - hafnium/third_party/linux

blob: c0fcfa2964686ef2c9dc0d64d2ef778f9102aa0f [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* INET An implementation of the TCP/IP protocol suite for the LINUX
				4	* operating system. INET is implemented using the BSD Socket
				5	* interface as the means of communication with the user level.
				6	*
				7	* Implementation of the Transmission Control Protocol(TCP).
				8	*
				9	* Authors: Ross Biro
				10	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				11	* Mark Evans, <evansmp@uhura.aston.ac.uk>
				12	* Corey Minyard <wf-rch!minyard@relay.EU.net>
				13	* Florian La Roche, <flla@stud.uni-sb.de>
				14	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
				15	* Linus Torvalds, <torvalds@cs.helsinki.fi>
				16	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				17	* Matthew Dillon, <dillon@apollo.west.oic.com>
				18	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				19	* Jorge Cwik, <jorge@laser.satlink.net>
				20	*/
				21
				22	/*
				23	* Changes:
				24	* Pedro Roque : Fast Retransmit/Recovery.
				25	* Two receive queues.
				26	* Retransmit queue handled by TCP.
				27	* Better retransmit timer handling.
				28	* New congestion avoidance.
				29	* Header prediction.
				30	* Variable renaming.
				31	*
				32	* Eric : Fast Retransmit.
				33	* Randy Scott : MSS option defines.
				34	* Eric Schenk : Fixes to slow start algorithm.
				35	* Eric Schenk : Yet another double ACK bug.
				36	* Eric Schenk : Delayed ACK bug fixes.
				37	* Eric Schenk : Floyd style fast retrans war avoidance.
				38	* David S. Miller : Don't allow zero congestion window.
				39	* Eric Schenk : Fix retransmitter so that it sends
				40	* next packet on ack of previous packet.
				41	* Andi Kleen : Moved open_request checking here
				42	* and process RSTs for open_requests.
				43	* Andi Kleen : Better prune_queue, and other fixes.
				44	* Andrey Savochkin: Fix RTT measurements in the presence of
				45	* timestamps.
				46	* Andrey Savochkin: Check sequence numbers correctly when
				47	* removing SACKs due to in sequence incoming
				48	* data segments.
				49	* Andi Kleen: Make sure we never ack data there is not
				50	* enough room for. Also make this condition
				51	* a fatal error if it might still happen.
				52	* Andi Kleen: Add tcp_measure_rcv_mss to make
				53	* connections with MSS<min(MTU,ann. MSS)
				54	* work without delayed acks.
				55	* Andi Kleen: Process packets with PSH set in the
				56	* fast path.
				57	* J Hadi Salim: ECN support
				58	* Andrei Gurtov,
				59	* Pasi Sarolahti,
				60	* Panu Kuhlberg: Experimental audit of TCP (re)transmission
				61	* engine. Lots of bugs are found.
				62	* Pasi Sarolahti: F-RTO for dealing with spurious RTOs
				63	*/
				64
				65	#define pr_fmt(fmt) "TCP: " fmt
				66
				67	#include <linux/mm.h>
				68	#include <linux/slab.h>
				69	#include <linux/module.h>
				70	#include <linux/sysctl.h>
				71	#include <linux/kernel.h>
				72	#include <linux/prefetch.h>
				73	#include <net/dst.h>
				74	#include <net/tcp.h>
				75	#include <net/inet_common.h>
				76	#include <linux/ipsec.h>
				77	#include <asm/unaligned.h>
				78	#include <linux/errqueue.h>
				79	#include <trace/events/tcp.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	80	#include <linux/jump_label_ratelimit.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	81	#include <net/busy_poll.h>
				82
				83	int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
				84
				85	#define FLAG_DATA 0x01 /* Incoming frame contained data. */
				86	#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
				87	#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
				88	#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
				89	#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
				90	#define FLAG_DATA_SACKED 0x20 /* New SACK. */
				91	#define FLAG_ECE 0x40 /* ECE in this ACK */
				92	#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
				93	#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
				94	#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
				95	#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
				96	#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
				97	#define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */
				98	#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
				99	#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
				100	#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */
				101	#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */
				102
				103	#define FLAG_ACKED (FLAG_DATA_ACKED\|FLAG_SYN_ACKED)
				104	#define FLAG_NOT_DUP (FLAG_DATA\|FLAG_WIN_UPDATE\|FLAG_ACKED)
				105	#define FLAG_CA_ALERT (FLAG_DATA_SACKED\|FLAG_ECE\|FLAG_DSACKING_ACK)
				106	#define FLAG_FORWARD_PROGRESS (FLAG_ACKED\|FLAG_DATA_SACKED)
				107
				108	#define TCP_REMNANT (TCP_FLAG_FIN\|TCP_FLAG_URG\|TCP_FLAG_SYN\|TCP_FLAG_PSH)
				109	#define TCP_HP_BITS (~(TCP_RESERVED_BITS\|TCP_FLAG_PSH))
				110
				111	#define REXMIT_NONE 0 /* no loss recovery to do */
				112	#define REXMIT_LOST 1 /* retransmit packets marked lost */
				113	#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
				114
				115	#if IS_ENABLED(CONFIG_TLS_DEVICE)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	116	static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	117
				118	void clean_acked_data_enable(struct inet_connection_sock *icsk,
				119	void (cad)(struct sock sk, u32 ack_seq))
				120	{
				121	icsk->icsk_clean_acked = cad;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	122	static_branch_deferred_inc(&clean_acked_data_enabled);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	123	}
				124	EXPORT_SYMBOL_GPL(clean_acked_data_enable);
				125
				126	void clean_acked_data_disable(struct inet_connection_sock *icsk)
				127	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	128	static_branch_slow_dec_deferred(&clean_acked_data_enabled);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	129	icsk->icsk_clean_acked = NULL;
				130	}
				131	EXPORT_SYMBOL_GPL(clean_acked_data_disable);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	132
				133	void clean_acked_data_flush(void)
				134	{
				135	static_key_deferred_flush(&clean_acked_data_enabled);
				136	}
				137	EXPORT_SYMBOL_GPL(clean_acked_data_flush);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	138	#endif
				139
				140	static void tcp_gro_dev_warn(struct sock sk, const struct sk_buff skb,
				141	unsigned int len)
				142	{
				143	static bool __once __read_mostly;
				144
				145	if (!__once) {
				146	struct net_device *dev;
				147
				148	__once = true;
				149
				150	rcu_read_lock();
				151	dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
				152	if (!dev \|\| len >= dev->mtu)
				153	pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
				154	dev ? dev->name : "Unknown driver");
				155	rcu_read_unlock();
				156	}
				157	}
				158
				159	/* Adapt the MSS value used to make delayed ack decision to the
				160	* real world.
				161	*/
				162	static void tcp_measure_rcv_mss(struct sock sk, const struct sk_buff skb)
				163	{
				164	struct inet_connection_sock *icsk = inet_csk(sk);
				165	const unsigned int lss = icsk->icsk_ack.last_seg_size;
				166	unsigned int len;
				167
				168	icsk->icsk_ack.last_seg_size = 0;
				169
				170	/* skb->len may jitter because of SACKs, even if peer
				171	* sends good full-sized frames.
				172	*/
				173	len = skb_shinfo(skb)->gso_size ? : skb->len;
				174	if (len >= icsk->icsk_ack.rcv_mss) {
				175	icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
				176	tcp_sk(sk)->advmss);
				177	/* Account for possibly-removed options */
				178	if (unlikely(len > icsk->icsk_ack.rcv_mss +
				179	MAX_TCP_OPTION_SPACE))
				180	tcp_gro_dev_warn(sk, skb, len);
				181	} else {
				182	/* Otherwise, we make more careful check taking into account,
				183	* that SACKs block is variable.
				184	*
				185	* "len" is invariant segment length, including TCP header.
				186	*/
				187	len += skb->data - skb_transport_header(skb);
				188	if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) \|\|
				189	/* If PSH is not set, packet should be
				190	* full sized, provided peer TCP is not badly broken.
				191	* This observation (if it is correct 8)) allows
				192	* to handle super-low mtu links fairly.
				193	*/
				194	(len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
				195	!(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
				196	/* Subtract also invariant (if peer is RFC compliant),
				197	* tcp header plus fixed timestamp option length.
				198	* Resulting "len" is MSS free of SACK jitter.
				199	*/
				200	len -= tcp_sk(sk)->tcp_header_len;
				201	icsk->icsk_ack.last_seg_size = len;
				202	if (len == lss) {
				203	icsk->icsk_ack.rcv_mss = len;
				204	return;
				205	}
				206	}
				207	if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
				208	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED2;
				209	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED;
				210	}
				211	}
				212
				213	static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
				214	{
				215	struct inet_connection_sock *icsk = inet_csk(sk);
				216	unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
				217
				218	if (quickacks == 0)
				219	quickacks = 2;
				220	quickacks = min(quickacks, max_quickacks);
				221	if (quickacks > icsk->icsk_ack.quick)
				222	icsk->icsk_ack.quick = quickacks;
				223	}
				224
				225	void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
				226	{
				227	struct inet_connection_sock *icsk = inet_csk(sk);
				228
				229	tcp_incr_quickack(sk, max_quickacks);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	230	inet_csk_exit_pingpong_mode(sk);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	231	icsk->icsk_ack.ato = TCP_ATO_MIN;
				232	}
				233	EXPORT_SYMBOL(tcp_enter_quickack_mode);
				234
				235	/* Send ACKs quickly, if "quick" count is not exhausted
				236	* and the session is not interactive.
				237	*/
				238
				239	static bool tcp_in_quickack_mode(struct sock *sk)
				240	{
				241	const struct inet_connection_sock *icsk = inet_csk(sk);
				242	const struct dst_entry *dst = __sk_dst_get(sk);
				243
				244	return (dst && dst_metric(dst, RTAX_QUICKACK)) \|\|
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	245	(icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	246	}
				247
				248	static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
				249	{
				250	if (tp->ecn_flags & TCP_ECN_OK)
				251	tp->ecn_flags \|= TCP_ECN_QUEUE_CWR;
				252	}
				253
				254	static void tcp_ecn_accept_cwr(struct sock sk, const struct sk_buff skb)
				255	{
				256	if (tcp_hdr(skb)->cwr) {
				257	tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
				258
				259	/* If the sender is telling us it has entered CWR, then its
				260	* cwnd may be very low (even just 1 packet), so we should ACK
				261	* immediately.
				262	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	263	if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
				264	inet_csk(sk)->icsk_ack.pending \|= ICSK_ACK_NOW;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	265	}
				266	}
				267
				268	static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
				269	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	270	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	271	}
				272
				273	static void __tcp_ecn_check_ce(struct sock sk, const struct sk_buff skb)
				274	{
				275	struct tcp_sock *tp = tcp_sk(sk);
				276
				277	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
				278	case INET_ECN_NOT_ECT:
				279	/* Funny extension: if ECT is not set on a segment,
				280	* and we already seen ECT on a previous segment,
				281	* it is probably a retransmit.
				282	*/
				283	if (tp->ecn_flags & TCP_ECN_SEEN)
				284	tcp_enter_quickack_mode(sk, 2);
				285	break;
				286	case INET_ECN_CE:
				287	if (tcp_ca_needs_ecn(sk))
				288	tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
				289
				290	if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
				291	/* Better not delay acks, sender can have a very low cwnd */
				292	tcp_enter_quickack_mode(sk, 2);
				293	tp->ecn_flags \|= TCP_ECN_DEMAND_CWR;
				294	}
				295	tp->ecn_flags \|= TCP_ECN_SEEN;
				296	break;
				297	default:
				298	if (tcp_ca_needs_ecn(sk))
				299	tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
				300	tp->ecn_flags \|= TCP_ECN_SEEN;
				301	break;
				302	}
				303	}
				304
				305	static void tcp_ecn_check_ce(struct sock sk, const struct sk_buff skb)
				306	{
				307	if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
				308	__tcp_ecn_check_ce(sk, skb);
				309	}
				310
				311	static void tcp_ecn_rcv_synack(struct tcp_sock tp, const struct tcphdr th)
				312	{
				313	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece \|\| th->cwr))
				314	tp->ecn_flags &= ~TCP_ECN_OK;
				315	}
				316
				317	static void tcp_ecn_rcv_syn(struct tcp_sock tp, const struct tcphdr th)
				318	{
				319	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece \|\| !th->cwr))
				320	tp->ecn_flags &= ~TCP_ECN_OK;
				321	}
				322
				323	static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock tp, const struct tcphdr th)
				324	{
				325	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
				326	return true;
				327	return false;
				328	}
				329
				330	/* Buffer size and advertised window tuning.
				331	*
				332	* 1. Tuning sk->sk_sndbuf, when connection enters established state.
				333	*/
				334
				335	static void tcp_sndbuf_expand(struct sock *sk)
				336	{
				337	const struct tcp_sock *tp = tcp_sk(sk);
				338	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
				339	int sndmem, per_mss;
				340	u32 nr_segs;
				341
				342	/* Worst case is non GSO/TSO : each frame consumes one skb
				343	* and skb->head is kmalloced using power of two area of memory
				344	*/
				345	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
				346	MAX_TCP_HEADER +
				347	SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
				348
				349	per_mss = roundup_pow_of_two(per_mss) +
				350	SKB_DATA_ALIGN(sizeof(struct sk_buff));
				351
				352	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
				353	nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
				354
				355	/* Fast Recovery (RFC 5681 3.2) :
				356	* Cubic needs 1.7 factor, rounded to 2 to include
				357	* extra cushion (application might react slowly to EPOLLOUT)
				358	*/
				359	sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
				360	sndmem = nr_segs per_mss;
				361
				362	if (sk->sk_sndbuf < sndmem)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	363	WRITE_ONCE(sk->sk_sndbuf,
				364	min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	365	}
				366
				367	/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
				368	*
				369	* All tcp_full_space() is split to two parts: "network" buffer, allocated
				370	* forward and advertised in receiver window (tp->rcv_wnd) and
				371	* "application buffer", required to isolate scheduling/application
				372	* latencies from network.
				373	* window_clamp is maximal advertised window. It can be less than
				374	* tcp_full_space(), in this case tcp_full_space() - window_clamp
				375	* is reserved for "application" buffer. The less window_clamp is
				376	* the smoother our behaviour from viewpoint of network, but the lower
				377	* throughput and the higher sensitivity of the connection to losses. 8)
				378	*
				379	* rcv_ssthresh is more strict window_clamp used at "slow start"
				380	* phase to predict further behaviour of this connection.
				381	* It is used for two goals:
				382	* - to enforce header prediction at sender, even when application
				383	* requires some significant "application buffer". It is check #1.
				384	* - to prevent pruning of receive queue because of misprediction
				385	* of receiver window. Check #2.
				386	*
				387	* The scheme does not work when sender sends good segments opening
				388	* window and then starts to feed us spaghetti. But it should work
				389	* in common situations. Otherwise, we have to rely on queue collapsing.
				390	*/
				391
				392	/* Slow part of check#2. */
				393	static int __tcp_grow_window(const struct sock sk, const struct sk_buff skb)
				394	{
				395	struct tcp_sock *tp = tcp_sk(sk);
				396	/* Optimize this! */
				397	int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
				398	int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
				399
				400	while (tp->rcv_ssthresh <= window) {
				401	if (truesize <= skb->len)
				402	return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
				403
				404	truesize >>= 1;
				405	window >>= 1;
				406	}
				407	return 0;
				408	}
				409
				410	static void tcp_grow_window(struct sock sk, const struct sk_buff skb)
				411	{
				412	struct tcp_sock *tp = tcp_sk(sk);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	413	int room;
				414
				415	room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	416
				417	/* Check #1 */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	418	if (room > 0 && !tcp_under_memory_pressure(sk)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	419	int incr;
				420
				421	/* Check #2. Increase window, if skb with such overhead
				422	* will fit to rcvbuf in future.
				423	*/
				424	if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
				425	incr = 2 * tp->advmss;
				426	else
				427	incr = __tcp_grow_window(sk, skb);
				428
				429	if (incr) {
				430	incr = max_t(int, incr, 2 * skb->len);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	431	tp->rcv_ssthresh += min(room, incr);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	432	inet_csk(sk)->icsk_ack.quick \|= 1;
				433	}
				434	}
				435	}
				436
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	437	/* 3. Try to fixup all. It is made immediately after connection enters
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	438	* established state.
				439	*/
				440	void tcp_init_buffer_space(struct sock *sk)
				441	{
				442	int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
				443	struct tcp_sock *tp = tcp_sk(sk);
				444	int maxwin;
				445
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	446	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
				447	tcp_sndbuf_expand(sk);
				448
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	449	tcp_mstamp_refresh(tp);
				450	tp->rcvq_space.time = tp->tcp_mstamp;
				451	tp->rcvq_space.seq = tp->copied_seq;
				452
				453	maxwin = tcp_full_space(sk);
				454
				455	if (tp->window_clamp >= maxwin) {
				456	tp->window_clamp = maxwin;
				457
				458	if (tcp_app_win && maxwin > 4 * tp->advmss)
				459	tp->window_clamp = max(maxwin -
				460	(maxwin >> tcp_app_win),
				461	4 * tp->advmss);
				462	}
				463
				464	/* Force reservation of one segment. */
				465	if (tcp_app_win &&
				466	tp->window_clamp > 2 * tp->advmss &&
				467	tp->window_clamp + tp->advmss > maxwin)
				468	tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
				469
				470	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
				471	tp->snd_cwnd_stamp = tcp_jiffies32;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	472	tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd,
				473	(u32)TCP_INIT_CWND * tp->advmss);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	474	}
				475
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	476	/* 4. Recalculate window clamp after socket hit its memory bounds. */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	477	static void tcp_clamp_window(struct sock *sk)
				478	{
				479	struct tcp_sock *tp = tcp_sk(sk);
				480	struct inet_connection_sock *icsk = inet_csk(sk);
				481	struct net *net = sock_net(sk);
				482
				483	icsk->icsk_ack.quick = 0;
				484
				485	if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
				486	!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
				487	!tcp_under_memory_pressure(sk) &&
				488	sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	489	WRITE_ONCE(sk->sk_rcvbuf,
				490	min(atomic_read(&sk->sk_rmem_alloc),
				491	net->ipv4.sysctl_tcp_rmem[2]));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	492	}
				493	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
				494	tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
				495	}
				496
				497	/* Initialize RCV_MSS value.
				498	* RCV_MSS is an our guess about MSS used by the peer.
				499	* We haven't any direct information about the MSS.
				500	* It's better to underestimate the RCV_MSS rather than overestimate.
				501	* Overestimations make us ACKing less frequently than needed.
				502	* Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
				503	*/
				504	void tcp_initialize_rcv_mss(struct sock *sk)
				505	{
				506	const struct tcp_sock *tp = tcp_sk(sk);
				507	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
				508
				509	hint = min(hint, tp->rcv_wnd / 2);
				510	hint = min(hint, TCP_MSS_DEFAULT);
				511	hint = max(hint, TCP_MIN_MSS);
				512
				513	inet_csk(sk)->icsk_ack.rcv_mss = hint;
				514	}
				515	EXPORT_SYMBOL(tcp_initialize_rcv_mss);
				516
				517	/* Receiver "autotuning" code.
				518	*
				519	* The algorithm for RTT estimation w/o timestamps is based on
				520	* Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
				521	* <http://public.lanl.gov/radiant/pubs.html#DRS>
				522	*
				523	* More detail on this code can be found at
				524	* <http://staff.psc.edu/jheffner/>,
				525	* though this reference is out of date. A new paper
				526	* is pending.
				527	*/
				528	static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
				529	{
				530	u32 new_sample = tp->rcv_rtt_est.rtt_us;
				531	long m = sample;
				532
				533	if (new_sample != 0) {
				534	/* If we sample in larger samples in the non-timestamp
				535	* case, we could grossly overestimate the RTT especially
				536	* with chatty applications or bulk transfer apps which
				537	* are stalled on filesystem I/O.
				538	*
				539	* Also, since we are only going for a minimum in the
				540	* non-timestamp case, we do not smooth things out
				541	* else with timestamps disabled convergence takes too
				542	* long.
				543	*/
				544	if (!win_dep) {
				545	m -= (new_sample >> 3);
				546	new_sample += m;
				547	} else {
				548	m <<= 3;
				549	if (m < new_sample)
				550	new_sample = m;
				551	}
				552	} else {
				553	/* No previous measure. */
				554	new_sample = m << 3;
				555	}
				556
				557	tp->rcv_rtt_est.rtt_us = new_sample;
				558	}
				559
				560	static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
				561	{
				562	u32 delta_us;
				563
				564	if (tp->rcv_rtt_est.time == 0)
				565	goto new_measure;
				566	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
				567	return;
				568	delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
				569	if (!delta_us)
				570	delta_us = 1;
				571	tcp_rcv_rtt_update(tp, delta_us, 1);
				572
				573	new_measure:
				574	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
				575	tp->rcv_rtt_est.time = tp->tcp_mstamp;
				576	}
				577
				578	static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
				579	const struct sk_buff *skb)
				580	{
				581	struct tcp_sock *tp = tcp_sk(sk);
				582
				583	if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)
				584	return;
				585	tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
				586
				587	if (TCP_SKB_CB(skb)->end_seq -
				588	TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
				589	u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
				590	u32 delta_us;
				591
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	592	if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
				593	if (!delta)
				594	delta = 1;
				595	delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
				596	tcp_rcv_rtt_update(tp, delta_us, 0);
				597	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	598	}
				599	}
				600
				601	/*
				602	* This function should be called every time data is copied to user space.
				603	* It calculates the appropriate TCP receive buffer space.
				604	*/
				605	void tcp_rcv_space_adjust(struct sock *sk)
				606	{
				607	struct tcp_sock *tp = tcp_sk(sk);
				608	u32 copied;
				609	int time;
				610
				611	trace_tcp_rcv_space_adjust(sk);
				612
				613	tcp_mstamp_refresh(tp);
				614	time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
				615	if (time < (tp->rcv_rtt_est.rtt_us >> 3) \|\| tp->rcv_rtt_est.rtt_us == 0)
				616	return;
				617
				618	/* Number of bytes copied to user in last RTT */
				619	copied = tp->copied_seq - tp->rcvq_space.seq;
				620	if (copied <= tp->rcvq_space.space)
				621	goto new_measure;
				622
				623	/* A bit of theory :
				624	* copied = bytes received in previous RTT, our base window
				625	* To cope with packet losses, we need a 2x factor
				626	* To cope with slow start, and sender growing its cwin by 100 %
				627	* every RTT, we need a 4x factor, because the ACK we are sending
				628	* now is for the next RTT, not the current one :
				629	* <prev RTT . ><current RTT .. ><next RTT .... >
				630	*/
				631
				632	if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
				633	!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
				634	int rcvmem, rcvbuf;
				635	u64 rcvwin, grow;
				636
				637	/* minimal window to cope with packet losses, assuming
				638	* steady state. Add some cushion because of small variations.
				639	*/
				640	rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
				641
				642	/* Accommodate for sender rate increase (eg. slow start) */
				643	grow = rcvwin * (copied - tp->rcvq_space.space);
				644	do_div(grow, tp->rcvq_space.space);
				645	rcvwin += (grow << 1);
				646
				647	rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
				648	while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
				649	rcvmem += 128;
				650
				651	do_div(rcvwin, tp->advmss);
				652	rcvbuf = min_t(u64, rcvwin * rcvmem,
				653	sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
				654	if (rcvbuf > sk->sk_rcvbuf) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	655	WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	656
				657	/* Make the window clamp follow along. */
				658	tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
				659	}
				660	}
				661	tp->rcvq_space.space = copied;
				662
				663	new_measure:
				664	tp->rcvq_space.seq = tp->copied_seq;
				665	tp->rcvq_space.time = tp->tcp_mstamp;
				666	}
				667
				668	/* There is something which you must keep in mind when you analyze the
				669	* behavior of the tp->ato delayed ack timeout interval. When a
				670	* connection starts up, we want to ack as quickly as possible. The
				671	* problem is that "good" TCP's do slow start at the beginning of data
				672	* transmission. The means that until we send the first few ACK's the
				673	* sender will sit on his end and only queue most of his data, because
				674	* he can only send snd_cwnd unacked packets at any given time. For
				675	* each ACK we send, he increments snd_cwnd and transmits more of his
				676	* queue. -DaveM
				677	*/
				678	static void tcp_event_data_recv(struct sock sk, struct sk_buff skb)
				679	{
				680	struct tcp_sock *tp = tcp_sk(sk);
				681	struct inet_connection_sock *icsk = inet_csk(sk);
				682	u32 now;
				683
				684	inet_csk_schedule_ack(sk);
				685
				686	tcp_measure_rcv_mss(sk, skb);
				687
				688	tcp_rcv_rtt_measure(tp);
				689
				690	now = tcp_jiffies32;
				691
				692	if (!icsk->icsk_ack.ato) {
				693	/* The _first_ data packet received, initialize
				694	* delayed ACK engine.
				695	*/
				696	tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
				697	icsk->icsk_ack.ato = TCP_ATO_MIN;
				698	} else {
				699	int m = now - icsk->icsk_ack.lrcvtime;
				700
				701	if (m <= TCP_ATO_MIN / 2) {
				702	/* The fastest case is the first. */
				703	icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
				704	} else if (m < icsk->icsk_ack.ato) {
				705	icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
				706	if (icsk->icsk_ack.ato > icsk->icsk_rto)
				707	icsk->icsk_ack.ato = icsk->icsk_rto;
				708	} else if (m > icsk->icsk_rto) {
				709	/* Too long gap. Apparently sender failed to
				710	* restart window, so that we send ACKs quickly.
				711	*/
				712	tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
				713	sk_mem_reclaim(sk);
				714	}
				715	}
				716	icsk->icsk_ack.lrcvtime = now;
				717
				718	tcp_ecn_check_ce(sk, skb);
				719
				720	if (skb->len >= 128)
				721	tcp_grow_window(sk, skb);
				722	}
				723
				724	/* Called to compute a smoothed rtt estimate. The data fed to this
				725	* routine either comes from timestamps, or from segments that were
				726	* known _not_ to have been retransmitted [see Karn/Partridge
				727	* Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
				728	* piece by Van Jacobson.
				729	* NOTE: the next three routines used to be one big routine.
				730	* To save cycles in the RFC 1323 implementation it was better to break
				731	* it up into three procedures. -- erics
				732	*/
				733	static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
				734	{
				735	struct tcp_sock *tp = tcp_sk(sk);
				736	long m = mrtt_us; /* RTT */
				737	u32 srtt = tp->srtt_us;
				738
				739	/* The following amusing code comes from Jacobson's
				740	* article in SIGCOMM '88. Note that rtt and mdev
				741	* are scaled versions of rtt and mean deviation.
				742	* This is designed to be as fast as possible
				743	* m stands for "measurement".
				744	*
				745	* On a 1990 paper the rto value is changed to:
				746	* RTO = rtt + 4 * mdev
				747	*
				748	* Funny. This algorithm seems to be very broken.
				749	* These formulae increase RTO, when it should be decreased, increase
				750	* too slowly, when it should be increased quickly, decrease too quickly
				751	* etc. I guess in BSD RTO takes ONE value, so that it is absolutely
				752	* does not matter how to _calculate_ it. Seems, it was trap
				753	* that VJ failed to avoid. 8)
				754	*/
				755	if (srtt != 0) {
				756	m -= (srtt >> 3); /* m is now error in rtt est */
				757	srtt += m; /* rtt = 7/8 rtt + 1/8 new */
				758	if (m < 0) {
				759	m = -m; /* m is now abs(error) */
				760	m -= (tp->mdev_us >> 2); /* similar update on mdev */
				761	/* This is similar to one of Eifel findings.
				762	* Eifel blocks mdev updates when rtt decreases.
				763	* This solution is a bit different: we use finer gain
				764	* for mdev in this case (alpha*beta).
				765	* Like Eifel it also prevents growth of rto,
				766	* but also it limits too fast rto decreases,
				767	* happening in pure Eifel.
				768	*/
				769	if (m > 0)
				770	m >>= 3;
				771	} else {
				772	m -= (tp->mdev_us >> 2); /* similar update on mdev */
				773	}
				774	tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
				775	if (tp->mdev_us > tp->mdev_max_us) {
				776	tp->mdev_max_us = tp->mdev_us;
				777	if (tp->mdev_max_us > tp->rttvar_us)
				778	tp->rttvar_us = tp->mdev_max_us;
				779	}
				780	if (after(tp->snd_una, tp->rtt_seq)) {
				781	if (tp->mdev_max_us < tp->rttvar_us)
				782	tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
				783	tp->rtt_seq = tp->snd_nxt;
				784	tp->mdev_max_us = tcp_rto_min_us(sk);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	785
				786	tcp_bpf_rtt(sk);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	787	}
				788	} else {
				789	/* no previous measure. */
				790	srtt = m << 3; /* take the measured time to be rtt */
				791	tp->mdev_us = m << 1; /* make sure rto = 3rtt /
				792	tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
				793	tp->mdev_max_us = tp->rttvar_us;
				794	tp->rtt_seq = tp->snd_nxt;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	795
				796	tcp_bpf_rtt(sk);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	797	}
				798	tp->srtt_us = max(1U, srtt);
				799	}
				800
				801	static void tcp_update_pacing_rate(struct sock *sk)
				802	{
				803	const struct tcp_sock *tp = tcp_sk(sk);
				804	u64 rate;
				805
				806	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
				807	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
				808
				809	/* current rate is (cwnd * mss) / srtt
				810	* In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
				811	* In Congestion Avoidance phase, set it to 120 % the current rate.
				812	*
				813	* [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
				814	* If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
				815	* end of slow start and should slow down.
				816	*/
				817	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
				818	rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio;
				819	else
				820	rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio;
				821
				822	rate *= max(tp->snd_cwnd, tp->packets_out);
				823
				824	if (likely(tp->srtt_us))
				825	do_div(rate, tp->srtt_us);
				826
				827	/* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate
				828	* without any lock. We want to make sure compiler wont store
				829	* intermediate values in this location.
				830	*/
				831	WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate,
				832	sk->sk_max_pacing_rate));
				833	}
				834
				835	/* Calculate rto without backoff. This is the second half of Van Jacobson's
				836	* routine referred to above.
				837	*/
				838	static void tcp_set_rto(struct sock *sk)
				839	{
				840	const struct tcp_sock *tp = tcp_sk(sk);
				841	/* Old crap is replaced with new one. 8)
				842	*
				843	* More seriously:
				844	* 1. If rtt variance happened to be less 50msec, it is hallucination.
				845	* It cannot be less due to utterly erratic ACK generation made
				846	* at least by solaris and freebsd. "Erratic ACKs" has _nothing_
				847	* to do with delayed acks, because at cwnd>2 true delack timeout
				848	* is invisible. Actually, Linux-2.4 also generates erratic
				849	* ACKs in some circumstances.
				850	*/
				851	inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
				852
				853	/* 2. Fixups made earlier cannot be right.
				854	* If we do not estimate RTO correctly without them,
				855	* all the algo is pure shit and should be replaced
				856	* with correct one. It is exactly, which we pretend to do.
				857	*/
				858
				859	/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
				860	* guarantees that rto is higher.
				861	*/
				862	tcp_bound_rto(sk);
				863	}
				864
				865	__u32 tcp_init_cwnd(const struct tcp_sock tp, const struct dst_entry dst)
				866	{
				867	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
				868
				869	if (!cwnd)
				870	cwnd = TCP_INIT_CWND;
				871	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
				872	}
				873
				874	/* Take a notice that peer is sending D-SACKs */
				875	static void tcp_dsack_seen(struct tcp_sock *tp)
				876	{
				877	tp->rx_opt.sack_ok \|= TCP_DSACK_SEEN;
				878	tp->rack.dsack_seen = 1;
				879	tp->dsack_dups++;
				880	}
				881
				882	/* It's reordering when higher sequence was delivered (i.e. sacked) before
				883	* some lower never-retransmitted sequence ("low_seq"). The maximum reordering
				884	* distance is approximated in full-mss packet distance ("reordering").
				885	*/
				886	static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
				887	const int ts)
				888	{
				889	struct tcp_sock *tp = tcp_sk(sk);
				890	const u32 mss = tp->mss_cache;
				891	u32 fack, metric;
				892
				893	fack = tcp_highest_sack_seq(tp);
				894	if (!before(low_seq, fack))
				895	return;
				896
				897	metric = fack - low_seq;
				898	if ((metric > tp->reordering * mss) && mss) {
				899	#if FASTRETRANS_DEBUG > 1
				900	pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
				901	tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
				902	tp->reordering,
				903	0,
				904	tp->sacked_out,
				905	tp->undo_marker ? tp->undo_retrans : 0);
				906	#endif
				907	tp->reordering = min_t(u32, (metric + mss - 1) / mss,
				908	sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
				909	}
				910
				911	/* This exciting event is worth to be remembered. 8) */
				912	tp->reord_seen++;
				913	NET_INC_STATS(sock_net(sk),
				914	ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
				915	}
				916
				917	/* This must be called before lost_out is incremented */
				918	static void tcp_verify_retransmit_hint(struct tcp_sock tp, struct sk_buff skb)
				919	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	920	if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) \|\|
				921	(tp->retransmit_skb_hint &&
				922	before(TCP_SKB_CB(skb)->seq,
				923	TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	924	tp->retransmit_skb_hint = skb;
				925	}
				926
				927	/* Sum the number of packets on the wire we have marked as lost.
				928	* There are two cases we care about here:
				929	* a) Packet hasn't been marked lost (nor retransmitted),
				930	* and this is the first loss.
				931	* b) Packet has been marked both lost and retransmitted,
				932	* and this means we think it was lost again.
				933	*/
				934	static void tcp_sum_lost(struct tcp_sock tp, struct sk_buff skb)
				935	{
				936	__u8 sacked = TCP_SKB_CB(skb)->sacked;
				937
				938	if (!(sacked & TCPCB_LOST) \|\|
				939	((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
				940	tp->lost += tcp_skb_pcount(skb);
				941	}
				942
				943	static void tcp_skb_mark_lost(struct tcp_sock tp, struct sk_buff skb)
				944	{
				945	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST\|TCPCB_SACKED_ACKED))) {
				946	tcp_verify_retransmit_hint(tp, skb);
				947
				948	tp->lost_out += tcp_skb_pcount(skb);
				949	tcp_sum_lost(tp, skb);
				950	TCP_SKB_CB(skb)->sacked \|= TCPCB_LOST;
				951	}
				952	}
				953
				954	void tcp_skb_mark_lost_uncond_verify(struct tcp_sock tp, struct sk_buff skb)
				955	{
				956	tcp_verify_retransmit_hint(tp, skb);
				957
				958	tcp_sum_lost(tp, skb);
				959	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST\|TCPCB_SACKED_ACKED))) {
				960	tp->lost_out += tcp_skb_pcount(skb);
				961	TCP_SKB_CB(skb)->sacked \|= TCPCB_LOST;
				962	}
				963	}
				964
				965	/* This procedure tags the retransmission queue when SACKs arrive.
				966	*
				967	* We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
				968	* Packets in queue with these bits set are counted in variables
				969	* sacked_out, retrans_out and lost_out, correspondingly.
				970	*
				971	* Valid combinations are:
				972	* Tag   InFlight  Description
				973	* 0     1         - orig segment is in flight.
				974	* S     0         - nothing flies, orig reached receiver.
				975	* L     0         - nothing flies, orig lost by net.
				976	* R     2         - both orig and retransmit are in flight.
				977	* L|R   1         - orig is lost, retransmit is in flight.
				978	* S|R   1         - orig reached receiver, retrans is still in flight.
				979	* (L|S|R is logically valid, it could occur when L|R is sacked,
				980	* but it is equivalent to plain S and code short-circuits it to S.
				981	* L|S is logically invalid, it would mean -1 packet in flight 8))
				982	*
				983	* These 6 states form finite state machine, controlled by the following events:
				984	* 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
				985	* 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
				986	* 3. Loss detection event of two flavors:
				987	* A. Scoreboard estimator decided the packet is lost.
				988	* A'. Reno "three dupacks" marks head of queue lost.
				989	* B. SACK arrives sacking SND.NXT at the moment, when the
				990	* segment was retransmitted.
				991	* 4. D-SACK added new rule: D-SACK changes any tag to S.
				992	*
				993	* It is pleasant to note, that state diagram turns out to be commutative,
				994	* so that we are allowed not to be bothered by order of our actions,
				995	* when multiple events arrive simultaneously. (see the function below).
				996	*
				997	* Reordering detection.
				998	* --------------------
				999	* Reordering metric is maximal distance, which a packet can be displaced
				1000	* in packet stream. With SACKs we can estimate it:
				1001	*
				1002	* 1. SACK fills old hole and the corresponding segment was not
				1003	* ever retransmitted -> reordering. Alas, we cannot use it
				1004	* when segment was retransmitted.
				1005	* 2. The last flaw is solved with D-SACK. D-SACK arrives
				1006	* for retransmitted and already SACKed segment -> reordering..
				1007	* Both of these heuristics are not used in Loss state, when we cannot
				1008	* account for retransmits accurately.
				1009	*
				1010	* SACK block validation.
				1011	* ----------------------
				1012	*
				1013	* SACK block range validation checks that the received SACK block fits to
				1014	* the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
				1015	* Note that SND.UNA is not included to the range though being valid because
				1016	* it means that the receiver is rather inconsistent with itself reporting
				1017	* SACK reneging when it should advance SND.UNA. Such a SACK block is
				1018	* perfectly valid, however, in light of RFC2018 which explicitly states
				1019	* that "SACK block MUST reflect the newest segment. Even if the newest
				1020	* segment is going to be discarded ...", not that it looks very clever
				1021	* in case of head skb. Due to potential receiver driven attacks, we
				1022	* choose to avoid immediate execution of a walk in write queue due to
				1023	* reneging and defer head skb's loss recovery to standard loss recovery
				1024	* procedure that will eventually trigger (nothing forbids us doing this).
				1025	*
				1026	* Implements also blockage to start_seq wrap-around. Problem lies in the
				1027	* fact that though start_seq (s) is before end_seq (i.e., not reversed),
				1028	* there's no guarantee that it will be before snd_nxt (n). The problem
				1029	* happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
				1030	* wrap (s_w):
				1031	*
				1032	*         <- outs wnd ->                          <- wrapzone ->
				1033	*         u     e      n                         u_w   e_w  s n_w
				1034	*         |     |      |                          |     |   |  |
				1035	* |<------------+------+----- TCP seqno space --------------+---------->|
				1036	* ...-- <2^31 ->|                                           |<--------...
				1037	* ...---- >2^31 ------>|                                    |<--------...
				1038	*
				1039	* Current code wouldn't be vulnerable but it's better still to discard such
				1040	* crazy SACK blocks. Doing this check for start_seq alone closes somewhat
				1041	* similar case (end_seq after snd_nxt wrap) as earlier reversed check in
				1042	* snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
				1043	* equal to the ideal case (infinite seqno space without wrap caused issues).
				1044	*
				1045	* With D-SACK the lower bound is extended to cover sequence space below
				1046	* SND.UNA down to undo_marker, which is the last point of interest. Yet
				1047	* again, D-SACK block must not go across snd_una (for the same reason as
				1048	* for the normal SACK blocks, explained above). But there all simplicity
				1049	* ends, TCP might receive valid D-SACKs below that. As long as they reside
				1050	* fully below undo_marker they do not affect behavior in any way and can
				1051	* therefore be safely ignored. In rare cases (which are more or less
				1052	* theoretical ones), the D-SACK will nicely cross that boundary due to skb
				1053	* fragmentation and packet reordering past skb's retransmission. To consider
				1054	* them correctly, the acceptable range must be extended even more though
				1055	* the exact amount is rather hard to quantify. However, tp->max_window can
				1056	* be used as an exaggerated estimate.
				1057	*/
				1058	static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
				1059	u32 start_seq, u32 end_seq)
				1060	{
				1061	/* Too far in future, or reversed (interpretation is ambiguous) */
				1062	if (after(end_seq, tp->snd_nxt) \|\| !before(start_seq, end_seq))
				1063	return false;
				1064
				1065	/* Nasty start_seq wrap-around check (see comments above) */
				1066	if (!before(start_seq, tp->snd_nxt))
				1067	return false;
				1068
				1069	/* In outstanding window? ...This is valid exit for D-SACKs too.
				1070	* start_seq == snd_una is non-sensical (see comments above)
				1071	*/
				1072	if (after(start_seq, tp->snd_una))
				1073	return true;
				1074
				1075	if (!is_dsack \|\| !tp->undo_marker)
				1076	return false;
				1077
				1078	/* ...Then it's D-SACK, and must reside below snd_una completely */
				1079	if (after(end_seq, tp->snd_una))
				1080	return false;
				1081
				1082	if (!before(start_seq, tp->undo_marker))
				1083	return true;
				1084
				1085	/* Too old */
				1086	if (!after(end_seq, tp->undo_marker))
				1087	return false;
				1088
				1089	/* Undo_marker boundary crossing (overestimates a lot). Known already:
				1090	* start_seq < undo_marker and end_seq >= undo_marker.
				1091	*/
				1092	return !before(start_seq, end_seq - tp->max_window);
				1093	}
				1094
				1095	static bool tcp_check_dsack(struct sock sk, const struct sk_buff ack_skb,
				1096	struct tcp_sack_block_wire *sp, int num_sacks,
				1097	u32 prior_snd_una)
				1098	{
				1099	struct tcp_sock *tp = tcp_sk(sk);
				1100	u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
				1101	u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
				1102	bool dup_sack = false;
				1103
				1104	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
				1105	dup_sack = true;
				1106	tcp_dsack_seen(tp);
				1107	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
				1108	} else if (num_sacks > 1) {
				1109	u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
				1110	u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
				1111
				1112	if (!after(end_seq_0, end_seq_1) &&
				1113	!before(start_seq_0, start_seq_1)) {
				1114	dup_sack = true;
				1115	tcp_dsack_seen(tp);
				1116	NET_INC_STATS(sock_net(sk),
				1117	LINUX_MIB_TCPDSACKOFORECV);
				1118	}
				1119	}
				1120
				1121	/* D-SACK for already forgotten data... Do dumb counting. */
				1122	if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
				1123	!after(end_seq_0, prior_snd_una) &&
				1124	after(end_seq_0, tp->undo_marker))
				1125	tp->undo_retrans--;
				1126
				1127	return dup_sack;
				1128	}
				1129
				1130	struct tcp_sacktag_state {
				1131	u32 reord;
				1132	/* Timestamps for earliest and latest never-retransmitted segment
				1133	* that was SACKed. RTO needs the earliest RTT to stay conservative,
				1134	* but congestion control should still get an accurate delay signal.
				1135	*/
				1136	u64 first_sackt;
				1137	u64 last_sackt;
				1138	struct rate_sample *rate;
				1139	int flag;
				1140	unsigned int mss_now;
				1141	};
				1142
				1143	/* Check if skb is fully within the SACK block. In presence of GSO skbs,
				1144	* the incoming SACK may not exactly match but we can find smaller MSS
				1145	* aligned portion of it that matches. Therefore we might need to fragment
				1146	* which may fail and creates some hassle (caller must handle error case
				1147	* returns).
				1148	*
				1149	* FIXME: this could be merged to shift decision code
				1150	*/
				1151	static int tcp_match_skb_to_sack(struct sock sk, struct sk_buff skb,
				1152	u32 start_seq, u32 end_seq)
				1153	{
				1154	int err;
				1155	bool in_sack;
				1156	unsigned int pkt_len;
				1157	unsigned int mss;
				1158
				1159	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
				1160	!before(end_seq, TCP_SKB_CB(skb)->end_seq);
				1161
				1162	if (tcp_skb_pcount(skb) > 1 && !in_sack &&
				1163	after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
				1164	mss = tcp_skb_mss(skb);
				1165	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
				1166
				1167	if (!in_sack) {
				1168	pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
				1169	if (pkt_len < mss)
				1170	pkt_len = mss;
				1171	} else {
				1172	pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
				1173	if (pkt_len < mss)
				1174	return -EINVAL;
				1175	}
				1176
				1177	/* Round if necessary so that SACKs cover only full MSSes
				1178	* and/or the remaining small portion (if present)
				1179	*/
				1180	if (pkt_len > mss) {
				1181	unsigned int new_len = (pkt_len / mss) * mss;
				1182	if (!in_sack && new_len < pkt_len)
				1183	new_len += mss;
				1184	pkt_len = new_len;
				1185	}
				1186
				1187	if (pkt_len >= skb->len && !in_sack)
				1188	return 0;
				1189
				1190	err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
				1191	pkt_len, mss, GFP_ATOMIC);
				1192	if (err < 0)
				1193	return err;
				1194	}
				1195
				1196	return in_sack;
				1197	}
				1198
				1199	/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
				1200	static u8 tcp_sacktag_one(struct sock *sk,
				1201	struct tcp_sacktag_state *state, u8 sacked,
				1202	u32 start_seq, u32 end_seq,
				1203	int dup_sack, int pcount,
				1204	u64 xmit_time)
				1205	{
				1206	struct tcp_sock *tp = tcp_sk(sk);
				1207
				1208	/* Account D-SACK for retransmitted packet. */
				1209	if (dup_sack && (sacked & TCPCB_RETRANS)) {
				1210	if (tp->undo_marker && tp->undo_retrans > 0 &&
				1211	after(end_seq, tp->undo_marker))
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1212	tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1213	if ((sacked & TCPCB_SACKED_ACKED) &&
				1214	before(start_seq, state->reord))
				1215	state->reord = start_seq;
				1216	}
				1217
				1218	/* Nothing to do; acked frame is about to be dropped (was ACKed). */
				1219	if (!after(end_seq, tp->snd_una))
				1220	return sacked;
				1221
				1222	if (!(sacked & TCPCB_SACKED_ACKED)) {
				1223	tcp_rack_advance(tp, sacked, end_seq, xmit_time);
				1224
				1225	if (sacked & TCPCB_SACKED_RETRANS) {
				1226	/* If the segment is not tagged as lost,
				1227	* we do not clear RETRANS, believing
				1228	* that retransmission is still in flight.
				1229	*/
				1230	if (sacked & TCPCB_LOST) {
				1231	sacked &= ~(TCPCB_LOST\|TCPCB_SACKED_RETRANS);
				1232	tp->lost_out -= pcount;
				1233	tp->retrans_out -= pcount;
				1234	}
				1235	} else {
				1236	if (!(sacked & TCPCB_RETRANS)) {
				1237	/* New sack for not retransmitted frame,
				1238	* which was in hole. It is reordering.
				1239	*/
				1240	if (before(start_seq,
				1241	tcp_highest_sack_seq(tp)) &&
				1242	before(start_seq, state->reord))
				1243	state->reord = start_seq;
				1244
				1245	if (!after(end_seq, tp->high_seq))
				1246	state->flag \|= FLAG_ORIG_SACK_ACKED;
				1247	if (state->first_sackt == 0)
				1248	state->first_sackt = xmit_time;
				1249	state->last_sackt = xmit_time;
				1250	}
				1251
				1252	if (sacked & TCPCB_LOST) {
				1253	sacked &= ~TCPCB_LOST;
				1254	tp->lost_out -= pcount;
				1255	}
				1256	}
				1257
				1258	sacked \|= TCPCB_SACKED_ACKED;
				1259	state->flag \|= FLAG_DATA_SACKED;
				1260	tp->sacked_out += pcount;
				1261	tp->delivered += pcount; /* Out-of-order packets delivered */
				1262
				1263	/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
				1264	if (tp->lost_skb_hint &&
				1265	before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
				1266	tp->lost_cnt_hint += pcount;
				1267	}
				1268
				1269	/* D-SACK. We can detect redundant retransmission in S\|R and plain R
				1270	* frames and clear it. undo_retrans is decreased above, L\|R frames
				1271	* are accounted above as well.
				1272	*/
				1273	if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
				1274	sacked &= ~TCPCB_SACKED_RETRANS;
				1275	tp->retrans_out -= pcount;
				1276	}
				1277
				1278	return sacked;
				1279	}
				1280
				1281	/* Shift newly-SACKed bytes from this skb to the immediately previous
				1282	* already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
				1283	*/
				1284	static bool tcp_shifted_skb(struct sock sk, struct sk_buff prev,
				1285	struct sk_buff *skb,
				1286	struct tcp_sacktag_state *state,
				1287	unsigned int pcount, int shifted, int mss,
				1288	bool dup_sack)
				1289	{
				1290	struct tcp_sock *tp = tcp_sk(sk);
				1291	u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
				1292	u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
				1293
				1294	BUG_ON(!pcount);
				1295
				1296	/* Adjust counters and hints for the newly sacked sequence
				1297	* range but discard the return value since prev is already
				1298	* marked. We must tag the range first because the seq
				1299	* advancement below implicitly advances
				1300	* tcp_highest_sack_seq() when skb is highest_sack.
				1301	*/
				1302	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
				1303	start_seq, end_seq, dup_sack, pcount,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1304	tcp_skb_timestamp_us(skb));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1305	tcp_rate_skb_delivered(sk, skb, state->rate);
				1306
				1307	if (skb == tp->lost_skb_hint)
				1308	tp->lost_cnt_hint += pcount;
				1309
				1310	TCP_SKB_CB(prev)->end_seq += shifted;
				1311	TCP_SKB_CB(skb)->seq += shifted;
				1312
				1313	tcp_skb_pcount_add(prev, pcount);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1314	WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1315	tcp_skb_pcount_add(skb, -pcount);
				1316
				1317	/* When we're adding to gso_segs == 1, gso_size will be zero,
				1318	* in theory this shouldn't be necessary but as long as DSACK
				1319	* code can come after this skb later on it's better to keep
				1320	* setting gso_size to something.
				1321	*/
				1322	if (!TCP_SKB_CB(prev)->tcp_gso_size)
				1323	TCP_SKB_CB(prev)->tcp_gso_size = mss;
				1324
				1325	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
				1326	if (tcp_skb_pcount(skb) <= 1)
				1327	TCP_SKB_CB(skb)->tcp_gso_size = 0;
				1328
				1329	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
				1330	TCP_SKB_CB(prev)->sacked \|= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
				1331
				1332	if (skb->len > 0) {
				1333	BUG_ON(!tcp_skb_pcount(skb));
				1334	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
				1335	return false;
				1336	}
				1337
				1338	/* Whole SKB was eaten :-) */
				1339
				1340	if (skb == tp->retransmit_skb_hint)
				1341	tp->retransmit_skb_hint = prev;
				1342	if (skb == tp->lost_skb_hint) {
				1343	tp->lost_skb_hint = prev;
				1344	tp->lost_cnt_hint -= tcp_skb_pcount(prev);
				1345	}
				1346
				1347	TCP_SKB_CB(prev)->tcp_flags \|= TCP_SKB_CB(skb)->tcp_flags;
				1348	TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
				1349	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				1350	TCP_SKB_CB(prev)->end_seq++;
				1351
				1352	if (skb == tcp_highest_sack(sk))
				1353	tcp_advance_highest_sack(sk, skb);
				1354
				1355	tcp_skb_collapse_tstamp(prev, skb);
				1356	if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
				1357	TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
				1358
				1359	tcp_rtx_queue_unlink_and_free(skb, sk);
				1360
				1361	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
				1362
				1363	return true;
				1364	}
				1365
				1366	/* I wish gso_size would have a bit more sane initialization than
				1367	* something-or-zero which complicates things
				1368	*/
				1369	static int tcp_skb_seglen(const struct sk_buff *skb)
				1370	{
				1371	return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
				1372	}
				1373
				/* Shifting pages past the head area doesn't work, so an skb qualifies
				 * only when it has no linear head data and is non-linear.
				 */
				static int skb_can_shift(const struct sk_buff *skb)
				{
					if (skb_headlen(skb))
						return 0;

					return skb_is_nonlinear(skb) ? 1 : 0;
				}
				1379
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1380	int tcp_skb_shift(struct sk_buff to, struct sk_buff from,
				1381	int pcount, int shiftlen)
				1382	{
				1383	/* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)
				1384	* Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
				1385	* to make sure not storing more than 65535 * 8 bytes per skb,
				1386	* even if current MSS is bigger.
				1387	*/
				1388	if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE))
				1389	return 0;
				1390	if (unlikely(tcp_skb_pcount(to) + pcount > 65535))
				1391	return 0;
				1392	return skb_shift(to, from, shiftlen);
				1393	}
				1394
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1395	/* Try collapsing SACK blocks spanning across multiple skbs to a single
				1396	* skb.
				1397	*/
				1398	static struct sk_buff tcp_shift_skb_data(struct sock sk, struct sk_buff *skb,
				1399	struct tcp_sacktag_state *state,
				1400	u32 start_seq, u32 end_seq,
				1401	bool dup_sack)
				1402	{
				1403	struct tcp_sock *tp = tcp_sk(sk);
				1404	struct sk_buff *prev;
				1405	int mss;
				1406	int pcount = 0;
				1407	int len;
				1408	int in_sack;
				1409
				1410	/* Normally R but no L won't result in plain S */
				1411	if (!dup_sack &&
				1412	(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST\|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
				1413	goto fallback;
				1414	if (!skb_can_shift(skb))
				1415	goto fallback;
				1416	/* This frame is about to be dropped (was ACKed). */
				1417	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
				1418	goto fallback;
				1419
				1420	/* Can only happen with delayed DSACK + discard craziness */
				1421	prev = skb_rb_prev(skb);
				1422	if (!prev)
				1423	goto fallback;
				1424
				1425	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
				1426	goto fallback;
				1427
				1428	if (!tcp_skb_can_collapse_to(prev))
				1429	goto fallback;
				1430
				1431	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
				1432	!before(end_seq, TCP_SKB_CB(skb)->end_seq);
				1433
				1434	if (in_sack) {
				1435	len = skb->len;
				1436	pcount = tcp_skb_pcount(skb);
				1437	mss = tcp_skb_seglen(skb);
				1438
				1439	/* TODO: Fix DSACKs to not fragment already SACKed and we can
				1440	* drop this restriction as unnecessary
				1441	*/
				1442	if (mss != tcp_skb_seglen(prev))
				1443	goto fallback;
				1444	} else {
				1445	if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
				1446	goto noop;
				1447	/* CHECKME: This is non-MSS split case only?, this will
				1448	* cause skipped skbs due to advancing loop btw, original
				1449	* has that feature too
				1450	*/
				1451	if (tcp_skb_pcount(skb) <= 1)
				1452	goto noop;
				1453
				1454	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
				1455	if (!in_sack) {
				1456	/* TODO: head merge to next could be attempted here
				1457	* if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
				1458	* though it might not be worth of the additional hassle
				1459	*
				1460	* ...we can probably just fallback to what was done
				1461	* previously. We could try merging non-SACKed ones
				1462	* as well but it probably isn't going to buy off
				1463	* because later SACKs might again split them, and
				1464	* it would make skb timestamp tracking considerably
				1465	* harder problem.
				1466	*/
				1467	goto fallback;
				1468	}
				1469
				1470	len = end_seq - TCP_SKB_CB(skb)->seq;
				1471	BUG_ON(len < 0);
				1472	BUG_ON(len > skb->len);
				1473
				1474	/* MSS boundaries should be honoured or else pcount will
				1475	* severely break even though it makes things bit trickier.
				1476	* Optimize common case to avoid most of the divides
				1477	*/
				1478	mss = tcp_skb_mss(skb);
				1479
				1480	/* TODO: Fix DSACKs to not fragment already SACKed and we can
				1481	* drop this restriction as unnecessary
				1482	*/
				1483	if (mss != tcp_skb_seglen(prev))
				1484	goto fallback;
				1485
				1486	if (len == mss) {
				1487	pcount = 1;
				1488	} else if (len < mss) {
				1489	goto noop;
				1490	} else {
				1491	pcount = len / mss;
				1492	len = pcount * mss;
				1493	}
				1494	}
				1495
				1496	/* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
				1497	if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
				1498	goto fallback;
				1499
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1500	if (!tcp_skb_shift(prev, skb, pcount, len))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1501	goto fallback;
				1502	if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
				1503	goto out;
				1504
				1505	/* Hole filled allows collapsing with the next as well, this is very
				1506	* useful when hole on every nth skb pattern happens
				1507	*/
				1508	skb = skb_rb_next(prev);
				1509	if (!skb)
				1510	goto out;
				1511
				1512	if (!skb_can_shift(skb) \|\|
				1513	((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) \|\|
				1514	(mss != tcp_skb_seglen(skb)))
				1515	goto out;
				1516
				1517	len = skb->len;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1518	pcount = tcp_skb_pcount(skb);
				1519	if (tcp_skb_shift(prev, skb, pcount, len))
				1520	tcp_shifted_skb(sk, prev, skb, state, pcount,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1521	len, mss, 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1522
				1523	out:
				1524	return prev;
				1525
				1526	noop:
				1527	return skb;
				1528
				1529	fallback:
				1530	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
				1531	return NULL;
				1532	}
				1533
				1534	static struct sk_buff tcp_sacktag_walk(struct sk_buff skb, struct sock *sk,
				1535	struct tcp_sack_block *next_dup,
				1536	struct tcp_sacktag_state *state,
				1537	u32 start_seq, u32 end_seq,
				1538	bool dup_sack_in)
				1539	{
				1540	struct tcp_sock *tp = tcp_sk(sk);
				1541	struct sk_buff *tmp;
				1542
				1543	skb_rbtree_walk_from(skb) {
				1544	int in_sack = 0;
				1545	bool dup_sack = dup_sack_in;
				1546
				1547	/* queue is in-order => we can short-circuit the walk early */
				1548	if (!before(TCP_SKB_CB(skb)->seq, end_seq))
				1549	break;
				1550
				1551	if (next_dup &&
				1552	before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
				1553	in_sack = tcp_match_skb_to_sack(sk, skb,
				1554	next_dup->start_seq,
				1555	next_dup->end_seq);
				1556	if (in_sack > 0)
				1557	dup_sack = true;
				1558	}
				1559
				1560	/* skb reference here is a bit tricky to get right, since
				1561	* shifting can eat and free both this skb and the next,
				1562	* so not even _safe variant of the loop is enough.
				1563	*/
				1564	if (in_sack <= 0) {
				1565	tmp = tcp_shift_skb_data(sk, skb, state,
				1566	start_seq, end_seq, dup_sack);
				1567	if (tmp) {
				1568	if (tmp != skb) {
				1569	skb = tmp;
				1570	continue;
				1571	}
				1572
				1573	in_sack = 0;
				1574	} else {
				1575	in_sack = tcp_match_skb_to_sack(sk, skb,
				1576	start_seq,
				1577	end_seq);
				1578	}
				1579	}
				1580
				1581	if (unlikely(in_sack < 0))
				1582	break;
				1583
				1584	if (in_sack) {
				1585	TCP_SKB_CB(skb)->sacked =
				1586	tcp_sacktag_one(sk,
				1587	state,
				1588	TCP_SKB_CB(skb)->sacked,
				1589	TCP_SKB_CB(skb)->seq,
				1590	TCP_SKB_CB(skb)->end_seq,
				1591	dup_sack,
				1592	tcp_skb_pcount(skb),
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1593	tcp_skb_timestamp_us(skb));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1594	tcp_rate_skb_delivered(sk, skb, state->rate);
				1595	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
				1596	list_del_init(&skb->tcp_tsorted_anchor);
				1597
				1598	if (!before(TCP_SKB_CB(skb)->seq,
				1599	tcp_highest_sack_seq(tp)))
				1600	tcp_advance_highest_sack(sk, skb);
				1601	}
				1602	}
				1603	return skb;
				1604	}
				1605
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1606	static struct sk_buff tcp_sacktag_bsearch(struct sock sk, u32 seq)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1607	{
				1608	struct rb_node parent, *p = &sk->tcp_rtx_queue.rb_node;
				1609	struct sk_buff *skb;
				1610
				1611	while (*p) {
				1612	parent = *p;
				1613	skb = rb_to_skb(parent);
				1614	if (before(seq, TCP_SKB_CB(skb)->seq)) {
				1615	p = &parent->rb_left;
				1616	continue;
				1617	}
				1618	if (!before(seq, TCP_SKB_CB(skb)->end_seq)) {
				1619	p = &parent->rb_right;
				1620	continue;
				1621	}
				1622	return skb;
				1623	}
				1624	return NULL;
				1625	}
				1626
				1627	static struct sk_buff tcp_sacktag_skip(struct sk_buff skb, struct sock *sk,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1628	u32 skip_to_seq)
				1629	{
				1630	if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
				1631	return skb;
				1632
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1633	return tcp_sacktag_bsearch(sk, skip_to_seq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1634	}
				1635
				1636	static struct sk_buff tcp_maybe_skipping_dsack(struct sk_buff skb,
				1637	struct sock *sk,
				1638	struct tcp_sack_block *next_dup,
				1639	struct tcp_sacktag_state *state,
				1640	u32 skip_to_seq)
				1641	{
				1642	if (!next_dup)
				1643	return skb;
				1644
				1645	if (before(next_dup->start_seq, skip_to_seq)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1646	skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1647	skb = tcp_sacktag_walk(skb, sk, NULL, state,
				1648	next_dup->start_seq, next_dup->end_seq,
				1649	1);
				1650	}
				1651
				1652	return skb;
				1653	}
				1654
				1655	static int tcp_sack_cache_ok(const struct tcp_sock tp, const struct tcp_sack_block cache)
				1656	{
				1657	return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
				1658	}
				1659
				1660	static int
				1661	tcp_sacktag_write_queue(struct sock sk, const struct sk_buff ack_skb,
				1662	u32 prior_snd_una, struct tcp_sacktag_state *state)
				1663	{
				1664	struct tcp_sock *tp = tcp_sk(sk);
				1665	const unsigned char *ptr = (skb_transport_header(ack_skb) +
				1666	TCP_SKB_CB(ack_skb)->sacked);
				1667	struct tcp_sack_block_wire sp_wire = (struct tcp_sack_block_wire )(ptr+2);
				1668	struct tcp_sack_block sp[TCP_NUM_SACKS];
				1669	struct tcp_sack_block *cache;
				1670	struct sk_buff *skb;
				1671	int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
				1672	int used_sacks;
				1673	bool found_dup_sack = false;
				1674	int i, j;
				1675	int first_sack_index;
				1676
				1677	state->flag = 0;
				1678	state->reord = tp->snd_nxt;
				1679
				1680	if (!tp->sacked_out)
				1681	tcp_highest_sack_reset(sk);
				1682
				1683	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
				1684	num_sacks, prior_snd_una);
				1685	if (found_dup_sack) {
				1686	state->flag \|= FLAG_DSACKING_ACK;
				1687	tp->delivered++; /* A spurious retransmission is delivered */
				1688	}
				1689
				1690	/* Eliminate too old ACKs, but take into
				1691	* account more or less fresh ones, they can
				1692	* contain valid SACK info.
				1693	*/
				1694	if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
				1695	return 0;
				1696
				1697	if (!tp->packets_out)
				1698	goto out;
				1699
				1700	used_sacks = 0;
				1701	first_sack_index = 0;
				1702	for (i = 0; i < num_sacks; i++) {
				1703	bool dup_sack = !i && found_dup_sack;
				1704
				1705	sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
				1706	sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
				1707
				1708	if (!tcp_is_sackblock_valid(tp, dup_sack,
				1709	sp[used_sacks].start_seq,
				1710	sp[used_sacks].end_seq)) {
				1711	int mib_idx;
				1712
				1713	if (dup_sack) {
				1714	if (!tp->undo_marker)
				1715	mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
				1716	else
				1717	mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
				1718	} else {
				1719	/* Don't count olds caused by ACK reordering */
				1720	if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
				1721	!after(sp[used_sacks].end_seq, tp->snd_una))
				1722	continue;
				1723	mib_idx = LINUX_MIB_TCPSACKDISCARD;
				1724	}
				1725
				1726	NET_INC_STATS(sock_net(sk), mib_idx);
				1727	if (i == 0)
				1728	first_sack_index = -1;
				1729	continue;
				1730	}
				1731
				1732	/* Ignore very old stuff early */
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1733	if (!after(sp[used_sacks].end_seq, prior_snd_una)) {
				1734	if (i == 0)
				1735	first_sack_index = -1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1736	continue;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1737	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1738
				1739	used_sacks++;
				1740	}
				1741
				1742	/* order SACK blocks to allow in order walk of the retrans queue */
				1743	for (i = used_sacks - 1; i > 0; i--) {
				1744	for (j = 0; j < i; j++) {
				1745	if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
				1746	swap(sp[j], sp[j + 1]);
				1747
				1748	/* Track where the first SACK block goes to */
				1749	if (j == first_sack_index)
				1750	first_sack_index = j + 1;
				1751	}
				1752	}
				1753	}
				1754
				1755	state->mss_now = tcp_current_mss(sk);
				1756	skb = NULL;
				1757	i = 0;
				1758
				1759	if (!tp->sacked_out) {
				1760	/* It's already past, so skip checking against it */
				1761	cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
				1762	} else {
				1763	cache = tp->recv_sack_cache;
				1764	/* Skip empty blocks in at head of the cache */
				1765	while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
				1766	!cache->end_seq)
				1767	cache++;
				1768	}
				1769
				1770	while (i < used_sacks) {
				1771	u32 start_seq = sp[i].start_seq;
				1772	u32 end_seq = sp[i].end_seq;
				1773	bool dup_sack = (found_dup_sack && (i == first_sack_index));
				1774	struct tcp_sack_block *next_dup = NULL;
				1775
				1776	if (found_dup_sack && ((i + 1) == first_sack_index))
				1777	next_dup = &sp[i + 1];
				1778
				1779	/* Skip too early cached blocks */
				1780	while (tcp_sack_cache_ok(tp, cache) &&
				1781	!before(start_seq, cache->end_seq))
				1782	cache++;
				1783
				1784	/* Can skip some work by looking recv_sack_cache? */
				1785	if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
				1786	after(end_seq, cache->start_seq)) {
				1787
				1788	/* Head todo? */
				1789	if (before(start_seq, cache->start_seq)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1790	skb = tcp_sacktag_skip(skb, sk, start_seq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1791	skb = tcp_sacktag_walk(skb, sk, next_dup,
				1792	state,
				1793	start_seq,
				1794	cache->start_seq,
				1795	dup_sack);
				1796	}
				1797
				1798	/* Rest of the block already fully processed? */
				1799	if (!after(end_seq, cache->end_seq))
				1800	goto advance_sp;
				1801
				1802	skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
				1803	state,
				1804	cache->end_seq);
				1805
				1806	/* ...tail remains todo... */
				1807	if (tcp_highest_sack_seq(tp) == cache->end_seq) {
				1808	/* ...but better entrypoint exists! */
				1809	skb = tcp_highest_sack(sk);
				1810	if (!skb)
				1811	break;
				1812	cache++;
				1813	goto walk;
				1814	}
				1815
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1816	skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1817	/* Check overlap against next cached too (past this one already) */
				1818	cache++;
				1819	continue;
				1820	}
				1821
				1822	if (!before(start_seq, tcp_highest_sack_seq(tp))) {
				1823	skb = tcp_highest_sack(sk);
				1824	if (!skb)
				1825	break;
				1826	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1827	skb = tcp_sacktag_skip(skb, sk, start_seq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1828
				1829	walk:
				1830	skb = tcp_sacktag_walk(skb, sk, next_dup, state,
				1831	start_seq, end_seq, dup_sack);
				1832
				1833	advance_sp:
				1834	i++;
				1835	}
				1836
				1837	/* Clear the head of the cache sack blocks so we can skip it next time */
				1838	for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
				1839	tp->recv_sack_cache[i].start_seq = 0;
				1840	tp->recv_sack_cache[i].end_seq = 0;
				1841	}
				1842	for (j = 0; j < used_sacks; j++)
				1843	tp->recv_sack_cache[i++] = sp[j];
				1844
				1845	if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss \|\| tp->undo_marker)
				1846	tcp_check_sack_reordering(sk, state->reord, 0);
				1847
				1848	tcp_verify_left_out(tp);
				1849	out:
				1850
				1851	#if FASTRETRANS_DEBUG > 0
				1852	WARN_ON((int)tp->sacked_out < 0);
				1853	WARN_ON((int)tp->lost_out < 0);
				1854	WARN_ON((int)tp->retrans_out < 0);
				1855	WARN_ON((int)tcp_packets_in_flight(tp) < 0);
				1856	#endif
				1857	return state->flag;
				1858	}
				1859
				1860	/* Limits sacked_out so that sum with lost_out isn't ever larger than
				1861	* packets_out. Returns false if sacked_out adjustement wasn't necessary.
				1862	*/
				1863	static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
				1864	{
				1865	u32 holes;
				1866
				1867	holes = max(tp->lost_out, 1U);
				1868	holes = min(holes, tp->packets_out);
				1869
				1870	if ((tp->sacked_out + holes) > tp->packets_out) {
				1871	tp->sacked_out = tp->packets_out - holes;
				1872	return true;
				1873	}
				1874	return false;
				1875	}
				1876
				1877	/* If we receive more dupacks than we expected counting segments
				1878	* in assumption of absent reordering, interpret this as reordering.
				1879	* The only another reason could be bug in receiver TCP.
				1880	*/
				1881	static void tcp_check_reno_reordering(struct sock *sk, const int addend)
				1882	{
				1883	struct tcp_sock *tp = tcp_sk(sk);
				1884
				1885	if (!tcp_limit_reno_sacked(tp))
				1886	return;
				1887
				1888	tp->reordering = min_t(u32, tp->packets_out + addend,
				1889	sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
				1890	tp->reord_seen++;
				1891	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
				1892	}
				1893
				1894	/* Emulate SACKs for SACKless connection: account for a new dupack. */
				1895
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1896	static void tcp_add_reno_sack(struct sock *sk, int num_dupack)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1897	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1898	if (num_dupack) {
				1899	struct tcp_sock *tp = tcp_sk(sk);
				1900	u32 prior_sacked = tp->sacked_out;
				1901	s32 delivered;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1902
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1903	tp->sacked_out += num_dupack;
				1904	tcp_check_reno_reordering(sk, 0);
				1905	delivered = tp->sacked_out - prior_sacked;
				1906	if (delivered > 0)
				1907	tp->delivered += delivered;
				1908	tcp_verify_left_out(tp);
				1909	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1910	}
				1911
				1912	/* Account for ACK, ACKing some data in Reno Recovery phase. */
				1913
				1914	static void tcp_remove_reno_sacks(struct sock *sk, int acked)
				1915	{
				1916	struct tcp_sock *tp = tcp_sk(sk);
				1917
				1918	if (acked > 0) {
				1919	/* One ACK acked hole. The rest eat duplicate ACKs. */
				1920	tp->delivered += max_t(int, acked - tp->sacked_out, 1);
				1921	if (acked - 1 >= tp->sacked_out)
				1922	tp->sacked_out = 0;
				1923	else
				1924	tp->sacked_out -= acked - 1;
				1925	}
				1926	tcp_check_reno_reordering(sk, acked);
				1927	tcp_verify_left_out(tp);
				1928	}
				1929
				1930	static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
				1931	{
				1932	tp->sacked_out = 0;
				1933	}
				1934
				1935	void tcp_clear_retrans(struct tcp_sock *tp)
				1936	{
				1937	tp->retrans_out = 0;
				1938	tp->lost_out = 0;
				1939	tp->undo_marker = 0;
				1940	tp->undo_retrans = -1;
				1941	tp->sacked_out = 0;
				1942	}
				1943
				1944	static inline void tcp_init_undo(struct tcp_sock *tp)
				1945	{
				1946	tp->undo_marker = tp->snd_una;
				1947	/* Retransmission still in flight may cause DSACKs later. */
				1948	tp->undo_retrans = tp->retrans_out ? : -1;
				1949	}
				1950
				1951	static bool tcp_is_rack(const struct sock *sk)
				1952	{
				1953	return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
				1954	}
				1955
				1956	/* If we detect SACK reneging, forget all SACK information
				1957	* and reset tags completely, otherwise preserve SACKs. If receiver
				1958	* dropped its ofo queue, we will know this due to reneging detection.
				1959	*/
				1960	static void tcp_timeout_mark_lost(struct sock *sk)
				1961	{
				1962	struct tcp_sock *tp = tcp_sk(sk);
				1963	struct sk_buff skb, head;
				1964	bool is_reneg; /* is receiver reneging on SACKs? */
				1965
				1966	head = tcp_rtx_queue_head(sk);
				1967	is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
				1968	if (is_reneg) {
				1969	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
				1970	tp->sacked_out = 0;
				1971	/* Mark SACK reneging until we recover from this loss event. */
				1972	tp->is_sack_reneg = 1;
				1973	} else if (tcp_is_reno(tp)) {
				1974	tcp_reset_reno_sack(tp);
				1975	}
				1976
				1977	skb = head;
				1978	skb_rbtree_walk_from(skb) {
				1979	if (is_reneg)
				1980	TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
				1981	else if (tcp_is_rack(sk) && skb != head &&
				1982	tcp_rack_skb_timeout(tp, skb, 0) > 0)
				1983	continue; /* Don't mark recently sent ones lost yet */
				1984	tcp_mark_skb_lost(sk, skb);
				1985	}
				1986	tcp_verify_left_out(tp);
				1987	tcp_clear_all_retrans_hints(tp);
				1988	}
				1989
				1990	/* Enter Loss state. */
				1991	void tcp_enter_loss(struct sock *sk)
				1992	{
				1993	const struct inet_connection_sock *icsk = inet_csk(sk);
				1994	struct tcp_sock *tp = tcp_sk(sk);
				1995	struct net *net = sock_net(sk);
				1996	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
				1997
				1998	tcp_timeout_mark_lost(sk);
				1999
				2000	/* Reduce ssthresh if it has not yet been made inside this window. */
				2001	if (icsk->icsk_ca_state <= TCP_CA_Disorder \|\|
				2002	!after(tp->high_seq, tp->snd_una) \|\|
				2003	(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
				2004	tp->prior_ssthresh = tcp_current_ssthresh(sk);
				2005	tp->prior_cwnd = tp->snd_cwnd;
				2006	tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
				2007	tcp_ca_event(sk, CA_EVENT_LOSS);
				2008	tcp_init_undo(tp);
				2009	}
				2010	tp->snd_cwnd = tcp_packets_in_flight(tp) + 1;
				2011	tp->snd_cwnd_cnt = 0;
				2012	tp->snd_cwnd_stamp = tcp_jiffies32;
				2013
				2014	/* Timeout in disordered state after receiving substantial DUPACKs
				2015	* suggests that the degree of reordering is over-estimated.
				2016	*/
				2017	if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
				2018	tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
				2019	tp->reordering = min_t(unsigned int, tp->reordering,
				2020	net->ipv4.sysctl_tcp_reordering);
				2021	tcp_set_ca_state(sk, TCP_CA_Loss);
				2022	tp->high_seq = tp->snd_nxt;
				2023	tcp_ecn_queue_cwr(tp);
				2024
				2025	/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
				2026	* loss recovery is underway except recurring timeout(s) on
				2027	* the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
				2028	*/
				2029	tp->frto = net->ipv4.sysctl_tcp_frto &&
				2030	(new_recovery \|\| icsk->icsk_retransmits) &&
				2031	!inet_csk(sk)->icsk_mtup.probe_size;
				2032	}
				2033
				2034	/* If ACK arrived pointing to a remembered SACK, it means that our
				2035	* remembered SACKs do not reflect real state of receiver i.e.
				2036	* receiver _host_ is heavily congested (or buggy).
				2037	*
				2038	* To avoid big spurious retransmission bursts due to transient SACK
				2039	* scoreboard oddities that look like reneging, we give the receiver a
				2040	* little time (max(RTT/2, 10ms)) to send us some more ACKs that will
				2041	* restore sanity to the SACK scoreboard. If the apparent reneging
				2042	* persists until this RTO then we'll clear the SACK scoreboard.
				2043	*/
				2044	static bool tcp_check_sack_reneging(struct sock *sk, int flag)
				2045	{
				2046	if (flag & FLAG_SACK_RENEGING) {
				2047	struct tcp_sock *tp = tcp_sk(sk);
				2048	unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
				2049	msecs_to_jiffies(10));
				2050
				2051	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				2052	delay, TCP_RTO_MAX);
				2053	return true;
				2054	}
				2055	return false;
				2056	}
				2057
				2058	/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
				2059	* counter when SACK is enabled (without SACK, sacked_out is used for
				2060	* that purpose).
				2061	*
				2062	* With reordering, holes may still be in flight, so RFC3517 recovery
				2063	* uses pure sacked_out (total number of SACKed segments) even though
				2064	* it violates the RFC that uses duplicate ACKs, often these are equal
				2065	* but when e.g. out-of-window ACKs or packet duplication occurs,
				2066	* they differ. Since neither occurs due to loss, TCP should really
				2067	* ignore them.
				2068	*/
				2069	static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
				2070	{
				2071	return tp->sacked_out + 1;
				2072	}
				2073
				2074	/* Linux NewReno/SACK/ECN state machine.
				2075	* --------------------------------------
				2076	*
				2077	* "Open" Normal state, no dubious events, fast path.
				2078	* "Disorder" In all the respects it is "Open",
				2079	* but requires a bit more attention. It is entered when
				2080	* we see some SACKs or dupacks. It is split of "Open"
				2081	* mainly to move some processing from fast path to slow one.
				2082	* "CWR" CWND was reduced due to some Congestion Notification event.
				2083	* It can be ECN, ICMP source quench, local device congestion.
				2084	* "Recovery" CWND was reduced, we are fast-retransmitting.
				2085	* "Loss" CWND was reduced due to RTO timeout or SACK reneging.
				2086	*
				2087	* tcp_fastretrans_alert() is entered:
				2088	* - each incoming ACK, if state is not "Open"
				2089	* - when arrived ACK is unusual, namely:
				2090	* * SACK
				2091	* * Duplicate ACK.
				2092	* * ECN ECE.
				2093	*
				2094	* Counting packets in flight is pretty simple.
				2095	*
				2096	* in_flight = packets_out - left_out + retrans_out
				2097	*
				2098	* packets_out is SND.NXT-SND.UNA counted in packets.
				2099	*
				2100	* retrans_out is number of retransmitted segments.
				2101	*
				2102	* left_out is number of segments left network, but not ACKed yet.
				2103	*
				2104	* left_out = sacked_out + lost_out
				2105	*
				2106	* sacked_out: Packets, which arrived to receiver out of order
				2107	* and hence not ACKed. With SACKs this number is simply
				2108	* amount of SACKed data. Even without SACKs
				2109	* it is easy to give pretty reliable estimate of this number,
				2110	* counting duplicate ACKs.
				2111	*
				2112	* lost_out: Packets lost by network. TCP has no explicit
				2113	* "loss notification" feedback from network (for now).
				2114	* It means that this number can be only _guessed_.
				2115	* Actually, it is the heuristics to predict lossage that
				2116	* distinguishes different algorithms.
				2117	*
				2118	* F.e. after RTO, when all the queue is considered as lost,
				2119	* lost_out = packets_out and in_flight = retrans_out.
				2120	*
				2121	* Essentially, we have now a few algorithms detecting
				2122	* lost packets.
				2123	*
				2124	* If the receiver supports SACK:
				2125	*
				2126	* RFC6675/3517: It is the conventional algorithm. A packet is
				2127	* considered lost if the number of higher sequence packets
				2128	* SACKed is greater than or equal to the DUPACK threshold
				2129	* (reordering). This is implemented in tcp_mark_head_lost and
				2130	* tcp_update_scoreboard.
				2131	*
				2132	* RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
				2133	* (2017-) that checks timing instead of counting DUPACKs.
				2134	* Essentially a packet is considered lost if it's not S/ACKed
				2135	* after RTT + reordering_window, where both metrics are
				2136	* dynamically measured and adjusted. This is implemented in
				2137	* tcp_rack_mark_lost.
				2138	*
				2139	* If the receiver does not support SACK:
				2140	*
				2141	* NewReno (RFC6582): in Recovery we assume that one segment
				2142	* is lost (classic Reno). While we are in Recovery and
				2143	* a partial ACK arrives, we assume that one more packet
				2144	* is lost (NewReno). This heuristics are the same in NewReno
				2145	* and SACK.
				2146	*
				2147	* Really tricky (and requiring careful tuning) part of algorithm
				2148	* is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
				2149	* The first determines the moment _when_ we should reduce CWND and,
				2150	* hence, slow down forward transmission. In fact, it determines the moment
				2151	* when we decide that hole is caused by loss, rather than by a reorder.
				2152	*
				2153	* tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
				2154	* holes, caused by lost packets.
				2155	*
				2156	* And the most logically complicated part of algorithm is undo
				2157	* heuristics. We detect false retransmits due to both too early
				2158	* fast retransmit (reordering) and underestimated RTO, analyzing
				2159	* timestamps and D-SACKs. When we detect that some segments were
				2160	* retransmitted by mistake and CWND reduction was wrong, we undo
				2161	* window reduction and abort recovery phase. This logic is hidden
				2162	* inside several functions named tcp_try_undo_<something>.
				2163	*/
				2164
				2165	/* This function decides, when we should leave Disordered state
				2166	* and enter Recovery phase, reducing congestion window.
				2167	*
				2168	* Main question: may we further continue forward transmission
				2169	* with the same cwnd?
				2170	*/
				2171	static bool tcp_time_to_recover(struct sock *sk, int flag)
				2172	{
				2173	struct tcp_sock *tp = tcp_sk(sk);
				2174
				2175	/* Trick#1: The loss is proven. */
				2176	if (tp->lost_out)
				2177	return true;
				2178
				2179	/* Not-A-Trick#2 : Classic rule... */
				2180	if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
				2181	return true;
				2182
				2183	return false;
				2184	}
				2185
				2186	/* Detect loss in event "A" above by marking head of queue up as lost.
				2187	* For non-SACK(Reno) senders, the first "packets" number of segments
				2188	* are considered lost. For RFC3517 SACK, a segment is considered lost if it
				2189	* has at least tp->reordering SACKed seqments above it; "packets" refers to
				2190	* the maximum SACKed segments to pass before reaching this limit.
				2191	*/
				2192	static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
				2193	{
				2194	struct tcp_sock *tp = tcp_sk(sk);
				2195	struct sk_buff *skb;
				2196	int cnt, oldcnt, lost;
				2197	unsigned int mss;
				2198	/* Use SACK to deduce losses of new sequences sent during recovery */
				2199	const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
				2200
				2201	WARN_ON(packets > tp->packets_out);
				2202	skb = tp->lost_skb_hint;
				2203	if (skb) {
				2204	/* Head already handled? */
				2205	if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
				2206	return;
				2207	cnt = tp->lost_cnt_hint;
				2208	} else {
				2209	skb = tcp_rtx_queue_head(sk);
				2210	cnt = 0;
				2211	}
				2212
				2213	skb_rbtree_walk_from(skb) {
				2214	/* TODO: do this better */
				2215	/* this is not the most efficient way to do this... */
				2216	tp->lost_skb_hint = skb;
				2217	tp->lost_cnt_hint = cnt;
				2218
				2219	if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
				2220	break;
				2221
				2222	oldcnt = cnt;
				2223	if (tcp_is_reno(tp) \|\|
				2224	(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
				2225	cnt += tcp_skb_pcount(skb);
				2226
				2227	if (cnt > packets) {
				2228	if (tcp_is_sack(tp) \|\|
				2229	(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) \|\|
				2230	(oldcnt >= packets))
				2231	break;
				2232
				2233	mss = tcp_skb_mss(skb);
				2234	/* If needed, chop off the prefix to mark as lost. */
				2235	lost = (packets - oldcnt) * mss;
				2236	if (lost < skb->len &&
				2237	tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
				2238	lost, mss, GFP_ATOMIC) < 0)
				2239	break;
				2240	cnt = packets;
				2241	}
				2242
				2243	tcp_skb_mark_lost(tp, skb);
				2244
				2245	if (mark_head)
				2246	break;
				2247	}
				2248	tcp_verify_left_out(tp);
				2249	}
				2250
				2251	/* Account newly detected lost packet(s) */
				2252
				2253	static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
				2254	{
				2255	struct tcp_sock *tp = tcp_sk(sk);
				2256
				2257	if (tcp_is_sack(tp)) {
				2258	int sacked_upto = tp->sacked_out - tp->reordering;
				2259	if (sacked_upto >= 0)
				2260	tcp_mark_head_lost(sk, sacked_upto, 0);
				2261	else if (fast_rexmit)
				2262	tcp_mark_head_lost(sk, 1, 1);
				2263	}
				2264	}
				2265
				2266	static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
				2267	{
				2268	return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
				2269	before(tp->rx_opt.rcv_tsecr, when);
				2270	}
				2271
				2272	/* skb is spurious retransmitted if the returned timestamp echo
				2273	* reply is prior to the skb transmission time
				2274	*/
				2275	static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
				2276	const struct sk_buff *skb)
				2277	{
				2278	return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
				2279	tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
				2280	}
				2281
				2282	/* Nothing was retransmitted or returned timestamp is less
				2283	* than timestamp of the first retransmission.
				2284	*/
				2285	static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
				2286	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2287	return tp->retrans_stamp &&
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2288	tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
				2289	}
				2290
				2291	/* Undo procedures. */
				2292
				2293	/* We can clear retrans_stamp when there are no retransmissions in the
				2294	* window. It would seem that it is trivially available for us in
				2295	* tp->retrans_out, however, that kind of assumptions doesn't consider
				2296	* what will happen if errors occur when sending retransmission for the
				2297	* second time. ...It could the that such segment has only
				2298	* TCPCB_EVER_RETRANS set at the present time. It seems that checking
				2299	* the head skb is enough except for some reneging corner cases that
				2300	* are not worth the effort.
				2301	*
				2302	* Main reason for all this complexity is the fact that connection dying
				2303	* time now depends on the validity of the retrans_stamp, in particular,
				2304	* that successive retransmissions of a segment must not advance
				2305	* retrans_stamp under any conditions.
				2306	*/
				2307	static bool tcp_any_retrans_done(const struct sock *sk)
				2308	{
				2309	const struct tcp_sock *tp = tcp_sk(sk);
				2310	struct sk_buff *skb;
				2311
				2312	if (tp->retrans_out)
				2313	return true;
				2314
				2315	skb = tcp_rtx_queue_head(sk);
				2316	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
				2317	return true;
				2318
				2319	return false;
				2320	}
				2321
				2322	static void DBGUNDO(struct sock sk, const char msg)
				2323	{
				2324	#if FASTRETRANS_DEBUG > 1
				2325	struct tcp_sock *tp = tcp_sk(sk);
				2326	struct inet_sock *inet = inet_sk(sk);
				2327
				2328	if (sk->sk_family == AF_INET) {
				2329	pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
				2330	msg,
				2331	&inet->inet_daddr, ntohs(inet->inet_dport),
				2332	tp->snd_cwnd, tcp_left_out(tp),
				2333	tp->snd_ssthresh, tp->prior_ssthresh,
				2334	tp->packets_out);
				2335	}
				2336	#if IS_ENABLED(CONFIG_IPV6)
				2337	else if (sk->sk_family == AF_INET6) {
				2338	pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
				2339	msg,
				2340	&sk->sk_v6_daddr, ntohs(inet->inet_dport),
				2341	tp->snd_cwnd, tcp_left_out(tp),
				2342	tp->snd_ssthresh, tp->prior_ssthresh,
				2343	tp->packets_out);
				2344	}
				2345	#endif
				2346	#endif
				2347	}
				2348
				2349	static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
				2350	{
				2351	struct tcp_sock *tp = tcp_sk(sk);
				2352
				2353	if (unmark_loss) {
				2354	struct sk_buff *skb;
				2355
				2356	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
				2357	TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
				2358	}
				2359	tp->lost_out = 0;
				2360	tcp_clear_all_retrans_hints(tp);
				2361	}
				2362
				2363	if (tp->prior_ssthresh) {
				2364	const struct inet_connection_sock *icsk = inet_csk(sk);
				2365
				2366	tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
				2367
				2368	if (tp->prior_ssthresh > tp->snd_ssthresh) {
				2369	tp->snd_ssthresh = tp->prior_ssthresh;
				2370	tcp_ecn_withdraw_cwr(tp);
				2371	}
				2372	}
				2373	tp->snd_cwnd_stamp = tcp_jiffies32;
				2374	tp->undo_marker = 0;
				2375	tp->rack.advanced = 1; /* Force RACK to re-exam losses */
				2376	}
				2377
				2378	static inline bool tcp_may_undo(const struct tcp_sock *tp)
				2379	{
				2380	return tp->undo_marker && (!tp->undo_retrans \|\| tcp_packet_delayed(tp));
				2381	}
				2382
				2383	/* People celebrate: "We love our President!" */
				2384	static bool tcp_try_undo_recovery(struct sock *sk)
				2385	{
				2386	struct tcp_sock *tp = tcp_sk(sk);
				2387
				2388	if (tcp_may_undo(tp)) {
				2389	int mib_idx;
				2390
				2391	/* Happy end! We did not retransmit anything
				2392	* or our original transmission succeeded.
				2393	*/
				2394	DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
				2395	tcp_undo_cwnd_reduction(sk, false);
				2396	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
				2397	mib_idx = LINUX_MIB_TCPLOSSUNDO;
				2398	else
				2399	mib_idx = LINUX_MIB_TCPFULLUNDO;
				2400
				2401	NET_INC_STATS(sock_net(sk), mib_idx);
				2402	} else if (tp->rack.reo_wnd_persist) {
				2403	tp->rack.reo_wnd_persist--;
				2404	}
				2405	if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
				2406	/* Hold old state until something above high_seq
				2407	* is ACKed. For Reno it is MUST to prevent false
				2408	* fast retransmits (RFC2582). SACK TCP is safe. */
				2409	if (!tcp_any_retrans_done(sk))
				2410	tp->retrans_stamp = 0;
				2411	return true;
				2412	}
				2413	tcp_set_ca_state(sk, TCP_CA_Open);
				2414	tp->is_sack_reneg = 0;
				2415	return false;
				2416	}
				2417
				2418	/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
				2419	static bool tcp_try_undo_dsack(struct sock *sk)
				2420	{
				2421	struct tcp_sock *tp = tcp_sk(sk);
				2422
				2423	if (tp->undo_marker && !tp->undo_retrans) {
				2424	tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH,
				2425	tp->rack.reo_wnd_persist + 1);
				2426	DBGUNDO(sk, "D-SACK");
				2427	tcp_undo_cwnd_reduction(sk, false);
				2428	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
				2429	return true;
				2430	}
				2431	return false;
				2432	}
				2433
				2434	/* Undo during loss recovery after partial ACK or using F-RTO. */
				2435	static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
				2436	{
				2437	struct tcp_sock *tp = tcp_sk(sk);
				2438
				2439	if (frto_undo \|\| tcp_may_undo(tp)) {
				2440	tcp_undo_cwnd_reduction(sk, true);
				2441
				2442	DBGUNDO(sk, "partial loss");
				2443	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
				2444	if (frto_undo)
				2445	NET_INC_STATS(sock_net(sk),
				2446	LINUX_MIB_TCPSPURIOUSRTOS);
				2447	inet_csk(sk)->icsk_retransmits = 0;
				2448	if (frto_undo \|\| tcp_is_sack(tp)) {
				2449	tcp_set_ca_state(sk, TCP_CA_Open);
				2450	tp->is_sack_reneg = 0;
				2451	}
				2452	return true;
				2453	}
				2454	return false;
				2455	}
				2456
				2457	/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
				2458	* It computes the number of packets to send (sndcnt) based on packets newly
				2459	* delivered:
				2460	* 1) If the packets in flight is larger than ssthresh, PRR spreads the
				2461	* cwnd reductions across a full RTT.
				2462	* 2) Otherwise PRR uses packet conservation to send as much as delivered.
				2463	* But when the retransmits are acked without further losses, PRR
				2464	* slow starts cwnd up to ssthresh to speed up the recovery.
				2465	*/
				2466	static void tcp_init_cwnd_reduction(struct sock *sk)
				2467	{
				2468	struct tcp_sock *tp = tcp_sk(sk);
				2469
				2470	tp->high_seq = tp->snd_nxt;
				2471	tp->tlp_high_seq = 0;
				2472	tp->snd_cwnd_cnt = 0;
				2473	tp->prior_cwnd = tp->snd_cwnd;
				2474	tp->prr_delivered = 0;
				2475	tp->prr_out = 0;
				2476	tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
				2477	tcp_ecn_queue_cwr(tp);
				2478	}
				2479
				2480	void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
				2481	{
				2482	struct tcp_sock *tp = tcp_sk(sk);
				2483	int sndcnt = 0;
				2484	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
				2485
				2486	if (newly_acked_sacked <= 0 \|\| WARN_ON_ONCE(!tp->prior_cwnd))
				2487	return;
				2488
				2489	tp->prr_delivered += newly_acked_sacked;
				2490	if (delta < 0) {
				2491	u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
				2492	tp->prior_cwnd - 1;
				2493	sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2494	} else if ((flag & (FLAG_RETRANS_DATA_ACKED \| FLAG_LOST_RETRANS)) ==
				2495	FLAG_RETRANS_DATA_ACKED) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2496	sndcnt = min_t(int, delta,
				2497	max_t(int, tp->prr_delivered - tp->prr_out,
				2498	newly_acked_sacked) + 1);
				2499	} else {
				2500	sndcnt = min(delta, newly_acked_sacked);
				2501	}
				2502	/* Force a fast retransmit upon entering fast recovery */
				2503	sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
				2504	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
				2505	}
				2506
				2507	static inline void tcp_end_cwnd_reduction(struct sock *sk)
				2508	{
				2509	struct tcp_sock *tp = tcp_sk(sk);
				2510
				2511	if (inet_csk(sk)->icsk_ca_ops->cong_control)
				2512	return;
				2513
				2514	/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
				2515	if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
				2516	(inet_csk(sk)->icsk_ca_state == TCP_CA_CWR \|\| tp->undo_marker)) {
				2517	tp->snd_cwnd = tp->snd_ssthresh;
				2518	tp->snd_cwnd_stamp = tcp_jiffies32;
				2519	}
				2520	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
				2521	}
				2522
				2523	/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
				2524	void tcp_enter_cwr(struct sock *sk)
				2525	{
				2526	struct tcp_sock *tp = tcp_sk(sk);
				2527
				2528	tp->prior_ssthresh = 0;
				2529	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
				2530	tp->undo_marker = 0;
				2531	tcp_init_cwnd_reduction(sk);
				2532	tcp_set_ca_state(sk, TCP_CA_CWR);
				2533	}
				2534	}
				2535	EXPORT_SYMBOL(tcp_enter_cwr);
				2536
				2537	static void tcp_try_keep_open(struct sock *sk)
				2538	{
				2539	struct tcp_sock *tp = tcp_sk(sk);
				2540	int state = TCP_CA_Open;
				2541
				2542	if (tcp_left_out(tp) \|\| tcp_any_retrans_done(sk))
				2543	state = TCP_CA_Disorder;
				2544
				2545	if (inet_csk(sk)->icsk_ca_state != state) {
				2546	tcp_set_ca_state(sk, state);
				2547	tp->high_seq = tp->snd_nxt;
				2548	}
				2549	}
				2550
				2551	static void tcp_try_to_open(struct sock *sk, int flag)
				2552	{
				2553	struct tcp_sock *tp = tcp_sk(sk);
				2554
				2555	tcp_verify_left_out(tp);
				2556
				2557	if (!tcp_any_retrans_done(sk))
				2558	tp->retrans_stamp = 0;
				2559
				2560	if (flag & FLAG_ECE)
				2561	tcp_enter_cwr(sk);
				2562
				2563	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
				2564	tcp_try_keep_open(sk);
				2565	}
				2566	}
				2567
				2568	static void tcp_mtup_probe_failed(struct sock *sk)
				2569	{
				2570	struct inet_connection_sock *icsk = inet_csk(sk);
				2571
				2572	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
				2573	icsk->icsk_mtup.probe_size = 0;
				2574	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
				2575	}
				2576
				2577	static void tcp_mtup_probe_success(struct sock *sk)
				2578	{
				2579	struct tcp_sock *tp = tcp_sk(sk);
				2580	struct inet_connection_sock *icsk = inet_csk(sk);
				2581
				2582	/* FIXME: breaks with very large cwnd */
				2583	tp->prior_ssthresh = tcp_current_ssthresh(sk);
				2584	tp->snd_cwnd = tp->snd_cwnd *
				2585	tcp_mss_to_mtu(sk, tp->mss_cache) /
				2586	icsk->icsk_mtup.probe_size;
				2587	tp->snd_cwnd_cnt = 0;
				2588	tp->snd_cwnd_stamp = tcp_jiffies32;
				2589	tp->snd_ssthresh = tcp_current_ssthresh(sk);
				2590
				2591	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
				2592	icsk->icsk_mtup.probe_size = 0;
				2593	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
				2594	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
				2595	}
				2596
				2597	/* Do a simple retransmit without using the backoff mechanisms in
				2598	* tcp_timer. This is used for path mtu discovery.
				2599	* The socket is already locked here.
				2600	*/
				2601	void tcp_simple_retransmit(struct sock *sk)
				2602	{
				2603	const struct inet_connection_sock *icsk = inet_csk(sk);
				2604	struct tcp_sock *tp = tcp_sk(sk);
				2605	struct sk_buff *skb;
				2606	unsigned int mss = tcp_current_mss(sk);
				2607
				2608	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
				2609	if (tcp_skb_seglen(skb) > mss &&
				2610	!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
				2611	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
				2612	TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				2613	tp->retrans_out -= tcp_skb_pcount(skb);
				2614	}
				2615	tcp_skb_mark_lost_uncond_verify(tp, skb);
				2616	}
				2617	}
				2618
				2619	tcp_clear_retrans_hints_partial(tp);
				2620
				2621	if (!tp->lost_out)
				2622	return;
				2623
				2624	if (tcp_is_reno(tp))
				2625	tcp_limit_reno_sacked(tp);
				2626
				2627	tcp_verify_left_out(tp);
				2628
				2629	/* Don't muck with the congestion window here.
				2630	* Reason is that we do not increase amount of _data_
				2631	* in network, but units changed and effective
				2632	* cwnd/ssthresh really reduced now.
				2633	*/
				2634	if (icsk->icsk_ca_state != TCP_CA_Loss) {
				2635	tp->high_seq = tp->snd_nxt;
				2636	tp->snd_ssthresh = tcp_current_ssthresh(sk);
				2637	tp->prior_ssthresh = 0;
				2638	tp->undo_marker = 0;
				2639	tcp_set_ca_state(sk, TCP_CA_Loss);
				2640	}
				2641	tcp_xmit_retransmit_queue(sk);
				2642	}
				2643	EXPORT_SYMBOL(tcp_simple_retransmit);
				2644
				2645	void tcp_enter_recovery(struct sock *sk, bool ece_ack)
				2646	{
				2647	struct tcp_sock *tp = tcp_sk(sk);
				2648	int mib_idx;
				2649
				2650	if (tcp_is_reno(tp))
				2651	mib_idx = LINUX_MIB_TCPRENORECOVERY;
				2652	else
				2653	mib_idx = LINUX_MIB_TCPSACKRECOVERY;
				2654
				2655	NET_INC_STATS(sock_net(sk), mib_idx);
				2656
				2657	tp->prior_ssthresh = 0;
				2658	tcp_init_undo(tp);
				2659
				2660	if (!tcp_in_cwnd_reduction(sk)) {
				2661	if (!ece_ack)
				2662	tp->prior_ssthresh = tcp_current_ssthresh(sk);
				2663	tcp_init_cwnd_reduction(sk);
				2664	}
				2665	tcp_set_ca_state(sk, TCP_CA_Recovery);
				2666	}
				2667
				2668	/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
				2669	* recovered or spurious. Otherwise retransmits more on partial ACKs.
				2670	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2671	static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2672	int *rexmit)
				2673	{
				2674	struct tcp_sock *tp = tcp_sk(sk);
				2675	bool recovered = !before(tp->snd_una, tp->high_seq);
				2676
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2677	if ((flag & FLAG_SND_UNA_ADVANCED \|\| rcu_access_pointer(tp->fastopen_rsk)) &&
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2678	tcp_try_undo_loss(sk, false))
				2679	return;
				2680
				2681	if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
				2682	/* Step 3.b. A timeout is spurious if not all data are
				2683	* lost, i.e., never-retransmitted data are (s)acked.
				2684	*/
				2685	if ((flag & FLAG_ORIG_SACK_ACKED) &&
				2686	tcp_try_undo_loss(sk, true))
				2687	return;
				2688
				2689	if (after(tp->snd_nxt, tp->high_seq)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2690	if (flag & FLAG_DATA_SACKED \|\| num_dupack)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2691	tp->frto = 0; /* Step 3.a. loss was real */
				2692	} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
				2693	tp->high_seq = tp->snd_nxt;
				2694	/* Step 2.b. Try send new data (but deferred until cwnd
				2695	* is updated in tcp_ack()). Otherwise fall back to
				2696	* the conventional recovery.
				2697	*/
				2698	if (!tcp_write_queue_empty(sk) &&
				2699	after(tcp_wnd_end(tp), tp->snd_nxt)) {
				2700	*rexmit = REXMIT_NEW;
				2701	return;
				2702	}
				2703	tp->frto = 0;
				2704	}
				2705	}
				2706
				2707	if (recovered) {
				2708	/* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
				2709	tcp_try_undo_recovery(sk);
				2710	return;
				2711	}
				2712	if (tcp_is_reno(tp)) {
				2713	/* A Reno DUPACK means new data in F-RTO step 2.b above are
				2714	* delivered. Lower inflight to clock out (re)tranmissions.
				2715	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2716	if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
				2717	tcp_add_reno_sack(sk, num_dupack);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2718	else if (flag & FLAG_SND_UNA_ADVANCED)
				2719	tcp_reset_reno_sack(tp);
				2720	}
				2721	*rexmit = REXMIT_LOST;
				2722	}
				2723
				2724	/* Undo during fast recovery after partial ACK. */
				2725	static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
				2726	{
				2727	struct tcp_sock *tp = tcp_sk(sk);
				2728
				2729	if (tp->undo_marker && tcp_packet_delayed(tp)) {
				2730	/* Plain luck! Hole if filled with delayed
				2731	* packet, rather than with a retransmit. Check reordering.
				2732	*/
				2733	tcp_check_sack_reordering(sk, prior_snd_una, 1);
				2734
				2735	/* We are getting evidence that the reordering degree is higher
				2736	* than we realized. If there are no retransmits out then we
				2737	* can undo. Otherwise we clock out new packets but do not
				2738	* mark more packets lost or retransmit more.
				2739	*/
				2740	if (tp->retrans_out)
				2741	return true;
				2742
				2743	if (!tcp_any_retrans_done(sk))
				2744	tp->retrans_stamp = 0;
				2745
				2746	DBGUNDO(sk, "partial recovery");
				2747	tcp_undo_cwnd_reduction(sk, true);
				2748	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
				2749	tcp_try_keep_open(sk);
				2750	return true;
				2751	}
				2752	return false;
				2753	}
				2754
				2755	static void tcp_identify_packet_loss(struct sock sk, int ack_flag)
				2756	{
				2757	struct tcp_sock *tp = tcp_sk(sk);
				2758
				2759	if (tcp_rtx_queue_empty(sk))
				2760	return;
				2761
				2762	if (unlikely(tcp_is_reno(tp))) {
				2763	tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
				2764	} else if (tcp_is_rack(sk)) {
				2765	u32 prior_retrans = tp->retrans_out;
				2766
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2767	if (tcp_rack_mark_lost(sk))
				2768	*ack_flag &= ~FLAG_SET_XMIT_TIMER;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2769	if (prior_retrans > tp->retrans_out)
				2770	*ack_flag \|= FLAG_LOST_RETRANS;
				2771	}
				2772	}
				2773
				2774	static bool tcp_force_fast_retransmit(struct sock *sk)
				2775	{
				2776	struct tcp_sock *tp = tcp_sk(sk);
				2777
				2778	return after(tcp_highest_sack_seq(tp),
				2779	tp->snd_una + tp->reordering * tp->mss_cache);
				2780	}
				2781
				2782	/* Process an event, which can update packets-in-flight not trivially.
				2783	* Main goal of this function is to calculate new estimate for left_out,
				2784	* taking into account both packets sitting in receiver's buffer and
				2785	* packets lost by network.
				2786	*
				2787	* Besides that it updates the congestion state when packet loss or ECN
				2788	* is detected. But it does not reduce the cwnd, it is done by the
				2789	* congestion control later.
				2790	*
				2791	* It does _not_ decide what to send, it is made in function
				2792	* tcp_xmit_retransmit_queue().
				2793	*/
				2794	static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2795	int num_dupack, int ack_flag, int rexmit)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2796	{
				2797	struct inet_connection_sock *icsk = inet_csk(sk);
				2798	struct tcp_sock *tp = tcp_sk(sk);
				2799	int fast_rexmit = 0, flag = *ack_flag;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2800	bool do_lost = num_dupack \|\| ((flag & FLAG_DATA_SACKED) &&
				2801	tcp_force_fast_retransmit(sk));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2802
				2803	if (!tp->packets_out && tp->sacked_out)
				2804	tp->sacked_out = 0;
				2805
				2806	/* Now state machine starts.
				2807	* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
				2808	if (flag & FLAG_ECE)
				2809	tp->prior_ssthresh = 0;
				2810
				2811	/* B. In all the states check for reneging SACKs. */
				2812	if (tcp_check_sack_reneging(sk, flag))
				2813	return;
				2814
				2815	/* C. Check consistency of the current state. */
				2816	tcp_verify_left_out(tp);
				2817
				2818	/* D. Check state exit conditions. State can be terminated
				2819	* when high_seq is ACKed. */
				2820	if (icsk->icsk_ca_state == TCP_CA_Open) {
				2821	WARN_ON(tp->retrans_out != 0);
				2822	tp->retrans_stamp = 0;
				2823	} else if (!before(tp->snd_una, tp->high_seq)) {
				2824	switch (icsk->icsk_ca_state) {
				2825	case TCP_CA_CWR:
				2826	/* CWR is to be held something above high_seq
				2827	* is ACKed for CWR bit to reach receiver. */
				2828	if (tp->snd_una != tp->high_seq) {
				2829	tcp_end_cwnd_reduction(sk);
				2830	tcp_set_ca_state(sk, TCP_CA_Open);
				2831	}
				2832	break;
				2833
				2834	case TCP_CA_Recovery:
				2835	if (tcp_is_reno(tp))
				2836	tcp_reset_reno_sack(tp);
				2837	if (tcp_try_undo_recovery(sk))
				2838	return;
				2839	tcp_end_cwnd_reduction(sk);
				2840	break;
				2841	}
				2842	}
				2843
				2844	/* E. Process state. */
				2845	switch (icsk->icsk_ca_state) {
				2846	case TCP_CA_Recovery:
				2847	if (!(flag & FLAG_SND_UNA_ADVANCED)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2848	if (tcp_is_reno(tp))
				2849	tcp_add_reno_sack(sk, num_dupack);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2850	} else {
				2851	if (tcp_try_undo_partial(sk, prior_snd_una))
				2852	return;
				2853	/* Partial ACK arrived. Force fast retransmit. */
				2854	do_lost = tcp_is_reno(tp) \|\|
				2855	tcp_force_fast_retransmit(sk);
				2856	}
				2857	if (tcp_try_undo_dsack(sk)) {
				2858	tcp_try_keep_open(sk);
				2859	return;
				2860	}
				2861	tcp_identify_packet_loss(sk, ack_flag);
				2862	break;
				2863	case TCP_CA_Loss:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2864	tcp_process_loss(sk, flag, num_dupack, rexmit);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2865	tcp_identify_packet_loss(sk, ack_flag);
				2866	if (!(icsk->icsk_ca_state == TCP_CA_Open \|\|
				2867	(*ack_flag & FLAG_LOST_RETRANS)))
				2868	return;
				2869	/* Change state if cwnd is undone or retransmits are lost */
				2870	/* fall through */
				2871	default:
				2872	if (tcp_is_reno(tp)) {
				2873	if (flag & FLAG_SND_UNA_ADVANCED)
				2874	tcp_reset_reno_sack(tp);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2875	tcp_add_reno_sack(sk, num_dupack);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2876	}
				2877
				2878	if (icsk->icsk_ca_state <= TCP_CA_Disorder)
				2879	tcp_try_undo_dsack(sk);
				2880
				2881	tcp_identify_packet_loss(sk, ack_flag);
				2882	if (!tcp_time_to_recover(sk, flag)) {
				2883	tcp_try_to_open(sk, flag);
				2884	return;
				2885	}
				2886
				2887	/* MTU probe failure: don't reduce cwnd */
				2888	if (icsk->icsk_ca_state < TCP_CA_CWR &&
				2889	icsk->icsk_mtup.probe_size &&
				2890	tp->snd_una == tp->mtu_probe.probe_seq_start) {
				2891	tcp_mtup_probe_failed(sk);
				2892	/* Restores the reduction we did in tcp_mtup_probe() */
				2893	tp->snd_cwnd++;
				2894	tcp_simple_retransmit(sk);
				2895	return;
				2896	}
				2897
				2898	/* Otherwise enter Recovery state */
				2899	tcp_enter_recovery(sk, (flag & FLAG_ECE));
				2900	fast_rexmit = 1;
				2901	}
				2902
				2903	if (!tcp_is_rack(sk) && do_lost)
				2904	tcp_update_scoreboard(sk, fast_rexmit);
				2905	*rexmit = REXMIT_LOST;
				2906	}
				2907
				2908	static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
				2909	{
				2910	u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
				2911	struct tcp_sock *tp = tcp_sk(sk);
				2912
				2913	if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
				2914	/* If the remote keeps returning delayed ACKs, eventually
				2915	* the min filter would pick it up and overestimate the
				2916	* prop. delay when it expires. Skip suspected delayed ACKs.
				2917	*/
				2918	return;
				2919	}
				2920	minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32,
				2921	rtt_us ? : jiffies_to_usecs(1));
				2922	}
				2923
				2924	static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
				2925	long seq_rtt_us, long sack_rtt_us,
				2926	long ca_rtt_us, struct rate_sample *rs)
				2927	{
				2928	const struct tcp_sock *tp = tcp_sk(sk);
				2929
				2930	/* Prefer RTT measured from ACK's timing to TS-ECR. This is because
				2931	* broken middle-boxes or peers may corrupt TS-ECR fields. But
				2932	* Karn's algorithm forbids taking RTT if some retransmitted data
				2933	* is acked (RFC6298).
				2934	*/
				2935	if (seq_rtt_us < 0)
				2936	seq_rtt_us = sack_rtt_us;
				2937
				2938	/* RTTM Rule: A TSecr value received in a segment is used to
				2939	* update the averaged RTT measurement only if the segment
				2940	* acknowledges some new data, i.e., only if it advances the
				2941	* left edge of the send window.
				2942	* See draft-ietf-tcplw-high-performance-00, section 3.3.
				2943	*/
				2944	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
				2945	flag & FLAG_ACKED) {
				2946	u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2947
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2948	if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2949	if (!delta)
				2950	delta = 1;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2951	seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
				2952	ca_rtt_us = seq_rtt_us;
				2953	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2954	}
				2955	rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
				2956	if (seq_rtt_us < 0)
				2957	return false;
				2958
				2959	/* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
				2960	* always taken together with ACK, SACK, or TS-opts. Any negative
				2961	* values will be skipped with the seq_rtt_us < 0 check above.
				2962	*/
				2963	tcp_update_rtt_min(sk, ca_rtt_us, flag);
				2964	tcp_rtt_estimator(sk, seq_rtt_us);
				2965	tcp_set_rto(sk);
				2966
				2967	/* RFC6298: only reset backoff on valid RTT measurement. */
				2968	inet_csk(sk)->icsk_backoff = 0;
				2969	return true;
				2970	}
				2971
				2972	/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
				2973	void tcp_synack_rtt_meas(struct sock sk, struct request_sock req)
				2974	{
				2975	struct rate_sample rs;
				2976	long rtt_us = -1L;
				2977
				2978	if (req && !req->num_retrans && tcp_rsk(req)->snt_synack)
				2979	rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack);
				2980
				2981	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs);
				2982	}
				2983
				2984
				2985	static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
				2986	{
				2987	const struct inet_connection_sock *icsk = inet_csk(sk);
				2988
				2989	icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
				2990	tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32;
				2991	}
				2992
				2993	/* Restart timer after forward progress on connection.
				2994	* RFC2988 recommends to restart timer to now+rto.
				2995	*/
				2996	void tcp_rearm_rto(struct sock *sk)
				2997	{
				2998	const struct inet_connection_sock *icsk = inet_csk(sk);
				2999	struct tcp_sock *tp = tcp_sk(sk);
				3000
				3001	/* If the retrans timer is currently being used by Fast Open
				3002	* for SYN-ACK retrans purpose, stay put.
				3003	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3004	if (rcu_access_pointer(tp->fastopen_rsk))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3005	return;
				3006
				3007	if (!tp->packets_out) {
				3008	inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
				3009	} else {
				3010	u32 rto = inet_csk(sk)->icsk_rto;
				3011	/* Offset the time elapsed after installing regular RTO */
				3012	if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT \|\|
				3013	icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
				3014	s64 delta_us = tcp_rto_delta_us(sk);
				3015	/* delta_us may not be positive if the socket is locked
				3016	* when the retrans timer fires and is rescheduled.
				3017	*/
				3018	rto = usecs_to_jiffies(max_t(int, delta_us, 1));
				3019	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3020	tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
				3021	TCP_RTO_MAX, tcp_rtx_queue_head(sk));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3022	}
				3023	}
				3024
				3025	/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
				3026	static void tcp_set_xmit_timer(struct sock *sk)
				3027	{
				3028	if (!tcp_schedule_loss_probe(sk, true))
				3029	tcp_rearm_rto(sk);
				3030	}
				3031
				3032	/* If we get here, the whole TSO packet has not been acked. */
				3033	static u32 tcp_tso_acked(struct sock sk, struct sk_buff skb)
				3034	{
				3035	struct tcp_sock *tp = tcp_sk(sk);
				3036	u32 packets_acked;
				3037
				3038	BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
				3039
				3040	packets_acked = tcp_skb_pcount(skb);
				3041	if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
				3042	return 0;
				3043	packets_acked -= tcp_skb_pcount(skb);
				3044
				3045	if (packets_acked) {
				3046	BUG_ON(tcp_skb_pcount(skb) == 0);
				3047	BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
				3048	}
				3049
				3050	return packets_acked;
				3051	}
				3052
				3053	static void tcp_ack_tstamp(struct sock sk, struct sk_buff skb,
				3054	u32 prior_snd_una)
				3055	{
				3056	const struct skb_shared_info *shinfo;
				3057
				3058	/* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
				3059	if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
				3060	return;
				3061
				3062	shinfo = skb_shinfo(skb);
				3063	if (!before(shinfo->tskey, prior_snd_una) &&
				3064	before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
				3065	tcp_skb_tsorted_save(skb) {
				3066	__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
				3067	} tcp_skb_tsorted_restore(skb);
				3068	}
				3069	}
				3070
				3071	/* Remove acknowledged frames from the retransmission queue. If our packet
				3072	* is before the ack sequence we can discard it as it's confirmed to have
				3073	* arrived at the other end.
				3074	*/
				3075	static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
				3076	u32 prior_snd_una,
				3077	struct tcp_sacktag_state *sack)
				3078	{
				3079	const struct inet_connection_sock *icsk = inet_csk(sk);
				3080	u64 first_ackt, last_ackt;
				3081	struct tcp_sock *tp = tcp_sk(sk);
				3082	u32 prior_sacked = tp->sacked_out;
				3083	u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */
				3084	struct sk_buff skb, next;
				3085	bool fully_acked = true;
				3086	long sack_rtt_us = -1L;
				3087	long seq_rtt_us = -1L;
				3088	long ca_rtt_us = -1L;
				3089	u32 pkts_acked = 0;
				3090	u32 last_in_flight = 0;
				3091	bool rtt_update;
				3092	int flag = 0;
				3093
				3094	first_ackt = 0;
				3095
				3096	for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
				3097	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
				3098	const u32 start_seq = scb->seq;
				3099	u8 sacked = scb->sacked;
				3100	u32 acked_pcount;
				3101
				3102	tcp_ack_tstamp(sk, skb, prior_snd_una);
				3103
				3104	/* Determine how many packets and what bytes were acked, tso and else */
				3105	if (after(scb->end_seq, tp->snd_una)) {
				3106	if (tcp_skb_pcount(skb) == 1 \|\|
				3107	!after(tp->snd_una, scb->seq))
				3108	break;
				3109
				3110	acked_pcount = tcp_tso_acked(sk, skb);
				3111	if (!acked_pcount)
				3112	break;
				3113	fully_acked = false;
				3114	} else {
				3115	acked_pcount = tcp_skb_pcount(skb);
				3116	}
				3117
				3118	if (unlikely(sacked & TCPCB_RETRANS)) {
				3119	if (sacked & TCPCB_SACKED_RETRANS)
				3120	tp->retrans_out -= acked_pcount;
				3121	flag \|= FLAG_RETRANS_DATA_ACKED;
				3122	} else if (!(sacked & TCPCB_SACKED_ACKED)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3123	last_ackt = tcp_skb_timestamp_us(skb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3124	WARN_ON_ONCE(last_ackt == 0);
				3125	if (!first_ackt)
				3126	first_ackt = last_ackt;
				3127
				3128	last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
				3129	if (before(start_seq, reord))
				3130	reord = start_seq;
				3131	if (!after(scb->end_seq, tp->high_seq))
				3132	flag \|= FLAG_ORIG_SACK_ACKED;
				3133	}
				3134
				3135	if (sacked & TCPCB_SACKED_ACKED) {
				3136	tp->sacked_out -= acked_pcount;
				3137	} else if (tcp_is_sack(tp)) {
				3138	tp->delivered += acked_pcount;
				3139	if (!tcp_skb_spurious_retrans(tp, skb))
				3140	tcp_rack_advance(tp, sacked, scb->end_seq,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3141	tcp_skb_timestamp_us(skb));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3142	}
				3143	if (sacked & TCPCB_LOST)
				3144	tp->lost_out -= acked_pcount;
				3145
				3146	tp->packets_out -= acked_pcount;
				3147	pkts_acked += acked_pcount;
				3148	tcp_rate_skb_delivered(sk, skb, sack->rate);
				3149
				3150	/* Initial outgoing SYN's get put onto the write_queue
				3151	* just like anything else we transmit. It is not
				3152	* true data, and if we misinform our callers that
				3153	* this ACK acks real data, we will erroneously exit
				3154	* connection startup slow start one packet too
				3155	* quickly. This is severely frowned upon behavior.
				3156	*/
				3157	if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
				3158	flag \|= FLAG_DATA_ACKED;
				3159	} else {
				3160	flag \|= FLAG_SYN_ACKED;
				3161	tp->retrans_stamp = 0;
				3162	}
				3163
				3164	if (!fully_acked)
				3165	break;
				3166
				3167	next = skb_rb_next(skb);
				3168	if (unlikely(skb == tp->retransmit_skb_hint))
				3169	tp->retransmit_skb_hint = NULL;
				3170	if (unlikely(skb == tp->lost_skb_hint))
				3171	tp->lost_skb_hint = NULL;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3172	tcp_highest_sack_replace(sk, skb, next);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3173	tcp_rtx_queue_unlink_and_free(skb, sk);
				3174	}
				3175
				3176	if (!skb)
				3177	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
				3178
				3179	if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
				3180	tp->snd_up = tp->snd_una;
				3181
				3182	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
				3183	flag \|= FLAG_SACK_RENEGING;
				3184
				3185	if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
				3186	seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
				3187	ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);
				3188
				3189	if (pkts_acked == 1 && last_in_flight < tp->mss_cache &&
				3190	last_in_flight && !prior_sacked && fully_acked &&
				3191	sack->rate->prior_delivered + 1 == tp->delivered &&
				3192	!(flag & (FLAG_CA_ALERT \| FLAG_SYN_ACKED))) {
				3193	/* Conservatively mark a delayed ACK. It's typically
				3194	* from a lone runt packet over the round trip to
				3195	* a receiver w/o out-of-order or CE events.
				3196	*/
				3197	flag \|= FLAG_ACK_MAYBE_DELAYED;
				3198	}
				3199	}
				3200	if (sack->first_sackt) {
				3201	sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt);
				3202	ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt);
				3203	}
				3204	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
				3205	ca_rtt_us, sack->rate);
				3206
				3207	if (flag & FLAG_ACKED) {
				3208	flag \|= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */
				3209	if (unlikely(icsk->icsk_mtup.probe_size &&
				3210	!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
				3211	tcp_mtup_probe_success(sk);
				3212	}
				3213
				3214	if (tcp_is_reno(tp)) {
				3215	tcp_remove_reno_sacks(sk, pkts_acked);
				3216
				3217	/* If any of the cumulatively ACKed segments was
				3218	* retransmitted, non-SACK case cannot confirm that
				3219	* progress was due to original transmission due to
				3220	* lack of TCPCB_SACKED_ACKED bits even if some of
				3221	* the packets may have been never retransmitted.
				3222	*/
				3223	if (flag & FLAG_RETRANS_DATA_ACKED)
				3224	flag &= ~FLAG_ORIG_SACK_ACKED;
				3225	} else {
				3226	int delta;
				3227
				3228	/* Non-retransmitted hole got filled? That's reordering */
				3229	if (before(reord, prior_fack))
				3230	tcp_check_sack_reordering(sk, reord, 0);
				3231
				3232	delta = prior_sacked - tp->sacked_out;
				3233	tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
				3234	}
				3235	} else if (skb && rtt_update && sack_rtt_us >= 0 &&
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3236	sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
				3237	tcp_skb_timestamp_us(skb))) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3238	/* Do not re-arm RTO if the sack RTT is measured from data sent
				3239	* after when the head was last (re)transmitted. Otherwise the
				3240	* timeout may continue to extend in loss recovery.
				3241	*/
				3242	flag \|= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */
				3243	}
				3244
				3245	if (icsk->icsk_ca_ops->pkts_acked) {
				3246	struct ack_sample sample = { .pkts_acked = pkts_acked,
				3247	.rtt_us = sack->rate->rtt_us,
				3248	.in_flight = last_in_flight };
				3249
				3250	icsk->icsk_ca_ops->pkts_acked(sk, &sample);
				3251	}
				3252
				3253	#if FASTRETRANS_DEBUG > 0
				3254	WARN_ON((int)tp->sacked_out < 0);
				3255	WARN_ON((int)tp->lost_out < 0);
				3256	WARN_ON((int)tp->retrans_out < 0);
				3257	if (!tp->packets_out && tcp_is_sack(tp)) {
				3258	icsk = inet_csk(sk);
				3259	if (tp->lost_out) {
				3260	pr_debug("Leak l=%u %d\n",
				3261	tp->lost_out, icsk->icsk_ca_state);
				3262	tp->lost_out = 0;
				3263	}
				3264	if (tp->sacked_out) {
				3265	pr_debug("Leak s=%u %d\n",
				3266	tp->sacked_out, icsk->icsk_ca_state);
				3267	tp->sacked_out = 0;
				3268	}
				3269	if (tp->retrans_out) {
				3270	pr_debug("Leak r=%u %d\n",
				3271	tp->retrans_out, icsk->icsk_ca_state);
				3272	tp->retrans_out = 0;
				3273	}
				3274	}
				3275	#endif
				3276	return flag;
				3277	}
				3278
				3279	static void tcp_ack_probe(struct sock *sk)
				3280	{
				3281	struct inet_connection_sock *icsk = inet_csk(sk);
				3282	struct sk_buff *head = tcp_send_head(sk);
				3283	const struct tcp_sock *tp = tcp_sk(sk);
				3284
				3285	/* Was it a usable window open? */
				3286	if (!head)
				3287	return;
				3288	if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
				3289	icsk->icsk_backoff = 0;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3290	icsk->icsk_probes_tstamp = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3291	inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
				3292	/* Socket must be waked up by subsequent tcp_data_snd_check().
				3293	* This function is not for random using!
				3294	*/
				3295	} else {
				3296	unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
				3297
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3298	when = tcp_clamp_probe0_to_user_timeout(sk, when);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3299	tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
				3300	when, TCP_RTO_MAX, NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3301	}
				3302	}
				3303
				3304	static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
				3305	{
				3306	return !(flag & FLAG_NOT_DUP) \|\| (flag & FLAG_CA_ALERT) \|\|
				3307	inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
				3308	}
				3309
				3310	/* Decide wheather to run the increase function of congestion control. */
				3311	static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
				3312	{
				3313	/* If reordering is high then always grow cwnd whenever data is
				3314	* delivered regardless of its ordering. Otherwise stay conservative
				3315	* and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
				3316	* new SACK or ECE mark may first advance cwnd here and later reduce
				3317	* cwnd in tcp_fastretrans_alert() based on more states.
				3318	*/
				3319	if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
				3320	return flag & FLAG_FORWARD_PROGRESS;
				3321
				3322	return flag & FLAG_DATA_ACKED;
				3323	}
				3324
				3325	/* The "ultimate" congestion control function that aims to replace the rigid
				3326	* cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction).
				3327	* It's called toward the end of processing an ACK with precise rate
				3328	* information. All transmission or retransmission are delayed afterwards.
				3329	*/
				3330	static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
				3331	int flag, const struct rate_sample *rs)
				3332	{
				3333	const struct inet_connection_sock *icsk = inet_csk(sk);
				3334
				3335	if (icsk->icsk_ca_ops->cong_control) {
				3336	icsk->icsk_ca_ops->cong_control(sk, rs);
				3337	return;
				3338	}
				3339
				3340	if (tcp_in_cwnd_reduction(sk)) {
				3341	/* Reduce cwnd if state mandates */
				3342	tcp_cwnd_reduction(sk, acked_sacked, flag);
				3343	} else if (tcp_may_raise_cwnd(sk, flag)) {
				3344	/* Advance cwnd if state allows */
				3345	tcp_cong_avoid(sk, ack, acked_sacked);
				3346	}
				3347	tcp_update_pacing_rate(sk);
				3348	}
				3349
				3350	/* Check that window update is acceptable.
				3351	* The function assumes that snd_una<=ack<=snd_next.
				3352	*/
				3353	static inline bool tcp_may_update_window(const struct tcp_sock *tp,
				3354	const u32 ack, const u32 ack_seq,
				3355	const u32 nwin)
				3356	{
				3357	return after(ack, tp->snd_una) \|\|
				3358	after(ack_seq, tp->snd_wl1) \|\|
				3359	(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
				3360	}
				3361
				3362	/* If we update tp->snd_una, also update tp->bytes_acked */
				3363	static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
				3364	{
				3365	u32 delta = ack - tp->snd_una;
				3366
				3367	sock_owned_by_me((struct sock *)tp);
				3368	tp->bytes_acked += delta;
				3369	tp->snd_una = ack;
				3370	}
				3371
				3372	/* If we update tp->rcv_nxt, also update tp->bytes_received */
				3373	static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
				3374	{
				3375	u32 delta = seq - tp->rcv_nxt;
				3376
				3377	sock_owned_by_me((struct sock *)tp);
				3378	tp->bytes_received += delta;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3379	WRITE_ONCE(tp->rcv_nxt, seq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3380	}
				3381
				3382	/* Update our send window.
				3383	*
				3384	* Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
				3385	* and in FreeBSD. NetBSD's one is even worse.) is wrong.
				3386	*/
				3387	static int tcp_ack_update_window(struct sock sk, const struct sk_buff skb, u32 ack,
				3388	u32 ack_seq)
				3389	{
				3390	struct tcp_sock *tp = tcp_sk(sk);
				3391	int flag = 0;
				3392	u32 nwin = ntohs(tcp_hdr(skb)->window);
				3393
				3394	if (likely(!tcp_hdr(skb)->syn))
				3395	nwin <<= tp->rx_opt.snd_wscale;
				3396
				3397	if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
				3398	flag \|= FLAG_WIN_UPDATE;
				3399	tcp_update_wl(tp, ack_seq);
				3400
				3401	if (tp->snd_wnd != nwin) {
				3402	tp->snd_wnd = nwin;
				3403
				3404	/* Note, it is the only place, where
				3405	* fast path is recovered for sending TCP.
				3406	*/
				3407	tp->pred_flags = 0;
				3408	tcp_fast_path_check(sk);
				3409
				3410	if (!tcp_write_queue_empty(sk))
				3411	tcp_slow_start_after_idle_check(sk);
				3412
				3413	if (nwin > tp->max_window) {
				3414	tp->max_window = nwin;
				3415	tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
				3416	}
				3417	}
				3418	}
				3419
				3420	tcp_snd_una_update(tp, ack);
				3421
				3422	return flag;
				3423	}
				3424
				3425	static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
				3426	u32 *last_oow_ack_time)
				3427	{
				3428	if (*last_oow_ack_time) {
				3429	s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time);
				3430
				3431	if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) {
				3432	NET_INC_STATS(net, mib_idx);
				3433	return true; /* rate-limited: don't send yet! */
				3434	}
				3435	}
				3436
				3437	*last_oow_ack_time = tcp_jiffies32;
				3438
				3439	return false; /* not rate-limited: go ahead, send dupack now! */
				3440	}
				3441
				3442	/* Return true if we're currently rate-limiting out-of-window ACKs and
				3443	* thus shouldn't send a dupack right now. We rate-limit dupacks in
				3444	* response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
				3445	* attacks that send repeated SYNs or ACKs for the same connection. To
				3446	* do this, we do not send a duplicate SYNACK or ACK if the remote
				3447	* endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
				3448	*/
				3449	bool tcp_oow_rate_limited(struct net net, const struct sk_buff skb,
				3450	int mib_idx, u32 *last_oow_ack_time)
				3451	{
				3452	/* Data packets without SYNs are not likely part of an ACK loop. */
				3453	if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
				3454	!tcp_hdr(skb)->syn)
				3455	return false;
				3456
				3457	return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
				3458	}
				3459
				3460	/* RFC 5961 7 [ACK Throttling] */
				3461	static void tcp_send_challenge_ack(struct sock sk, const struct sk_buff skb)
				3462	{
				3463	/* unprotected vars, we dont care of overwrites */
				3464	static u32 challenge_timestamp;
				3465	static unsigned int challenge_count;
				3466	struct tcp_sock *tp = tcp_sk(sk);
				3467	struct net *net = sock_net(sk);
				3468	u32 count, now;
				3469
				3470	/* First check our per-socket dupack rate limit. */
				3471	if (__tcp_oow_rate_limited(net,
				3472	LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
				3473	&tp->last_oow_ack_time))
				3474	return;
				3475
				3476	/* Then check host-wide RFC 5961 rate limit. */
				3477	now = jiffies / HZ;
				3478	if (now != challenge_timestamp) {
				3479	u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit;
				3480	u32 half = (ack_limit + 1) >> 1;
				3481
				3482	challenge_timestamp = now;
				3483	WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
				3484	}
				3485	count = READ_ONCE(challenge_count);
				3486	if (count > 0) {
				3487	WRITE_ONCE(challenge_count, count - 1);
				3488	NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
				3489	tcp_send_ack(sk);
				3490	}
				3491	}
				3492
				3493	static void tcp_store_ts_recent(struct tcp_sock *tp)
				3494	{
				3495	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
				3496	tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
				3497	}
				3498
				3499	static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
				3500	{
				3501	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
				3502	/* PAWS bug workaround wrt. ACK frames, the PAWS discard
				3503	* extra check below makes sure this can only happen
				3504	* for pure ACK frames. -DaveM
				3505	*
				3506	* Not only, also it occurs for expired timestamps.
				3507	*/
				3508
				3509	if (tcp_paws_check(&tp->rx_opt, 0))
				3510	tcp_store_ts_recent(tp);
				3511	}
				3512	}
				3513
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3514	/* This routine deals with acks during a TLP episode and ends an episode by
				3515	* resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3516	*/
				3517	static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
				3518	{
				3519	struct tcp_sock *tp = tcp_sk(sk);
				3520
				3521	if (before(ack, tp->tlp_high_seq))
				3522	return;
				3523
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3524	if (!tp->tlp_retrans) {
				3525	/* TLP of new data has been acknowledged */
				3526	tp->tlp_high_seq = 0;
				3527	} else if (flag & FLAG_DSACKING_ACK) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3528	/* This DSACK means original and TLP probe arrived; no loss */
				3529	tp->tlp_high_seq = 0;
				3530	} else if (after(ack, tp->tlp_high_seq)) {
				3531	/* ACK advances: there was a loss, so reduce cwnd. Reset
				3532	* tlp_high_seq in tcp_init_cwnd_reduction()
				3533	*/
				3534	tcp_init_cwnd_reduction(sk);
				3535	tcp_set_ca_state(sk, TCP_CA_CWR);
				3536	tcp_end_cwnd_reduction(sk);
				3537	tcp_try_keep_open(sk);
				3538	NET_INC_STATS(sock_net(sk),
				3539	LINUX_MIB_TCPLOSSPROBERECOVERY);
				3540	} else if (!(flag & (FLAG_SND_UNA_ADVANCED \|
				3541	FLAG_NOT_DUP \| FLAG_DATA_SACKED))) {
				3542	/* Pure dupack: original and TLP probe arrived; no loss */
				3543	tp->tlp_high_seq = 0;
				3544	}
				3545	}
				3546
				3547	static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
				3548	{
				3549	const struct inet_connection_sock *icsk = inet_csk(sk);
				3550
				3551	if (icsk->icsk_ca_ops->in_ack_event)
				3552	icsk->icsk_ca_ops->in_ack_event(sk, flags);
				3553	}
				3554
				3555	/* Congestion control has updated the cwnd already. So if we're in
				3556	* loss recovery then now we do any new sends (for FRTO) or
				3557	* retransmits (for CA_Loss or CA_recovery) that make sense.
				3558	*/
				3559	static void tcp_xmit_recovery(struct sock *sk, int rexmit)
				3560	{
				3561	struct tcp_sock *tp = tcp_sk(sk);
				3562
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3563	if (rexmit == REXMIT_NONE \|\| sk->sk_state == TCP_SYN_SENT)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3564	return;
				3565
				3566	if (unlikely(rexmit == 2)) {
				3567	__tcp_push_pending_frames(sk, tcp_current_mss(sk),
				3568	TCP_NAGLE_OFF);
				3569	if (after(tp->snd_nxt, tp->high_seq))
				3570	return;
				3571	tp->frto = 0;
				3572	}
				3573	tcp_xmit_retransmit_queue(sk);
				3574	}
				3575
				3576	/* Returns the number of packets newly acked or sacked by the current ACK */
				3577	static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
				3578	{
				3579	const struct net *net = sock_net(sk);
				3580	struct tcp_sock *tp = tcp_sk(sk);
				3581	u32 delivered;
				3582
				3583	delivered = tp->delivered - prior_delivered;
				3584	NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
				3585	if (flag & FLAG_ECE) {
				3586	tp->delivered_ce += delivered;
				3587	NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
				3588	}
				3589	return delivered;
				3590	}
				3591
				3592	/* This routine deals with incoming acks, but not outgoing ones. */
				3593	static int tcp_ack(struct sock sk, const struct sk_buff skb, int flag)
				3594	{
				3595	struct inet_connection_sock *icsk = inet_csk(sk);
				3596	struct tcp_sock *tp = tcp_sk(sk);
				3597	struct tcp_sacktag_state sack_state;
				3598	struct rate_sample rs = { .prior_delivered = 0 };
				3599	u32 prior_snd_una = tp->snd_una;
				3600	bool is_sack_reneg = tp->is_sack_reneg;
				3601	u32 ack_seq = TCP_SKB_CB(skb)->seq;
				3602	u32 ack = TCP_SKB_CB(skb)->ack_seq;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3603	int num_dupack = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3604	int prior_packets = tp->packets_out;
				3605	u32 delivered = tp->delivered;
				3606	u32 lost = tp->lost;
				3607	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
				3608	u32 prior_fack;
				3609
				3610	sack_state.first_sackt = 0;
				3611	sack_state.rate = &rs;
				3612
				3613	/* We very likely will need to access rtx queue. */
				3614	prefetch(sk->tcp_rtx_queue.rb_node);
				3615
				3616	/* If the ack is older than previous acks
				3617	* then we can probably ignore it.
				3618	*/
				3619	if (before(ack, prior_snd_una)) {
				3620	/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
				3621	if (before(ack, prior_snd_una - tp->max_window)) {
				3622	if (!(flag & FLAG_NO_CHALLENGE_ACK))
				3623	tcp_send_challenge_ack(sk, skb);
				3624	return -1;
				3625	}
				3626	goto old_ack;
				3627	}
				3628
				3629	/* If the ack includes data we haven't sent yet, discard
				3630	* this segment (RFC793 Section 3.9).
				3631	*/
				3632	if (after(ack, tp->snd_nxt))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3633	return -1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3634
				3635	if (after(ack, prior_snd_una)) {
				3636	flag \|= FLAG_SND_UNA_ADVANCED;
				3637	icsk->icsk_retransmits = 0;
				3638
				3639	#if IS_ENABLED(CONFIG_TLS_DEVICE)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3640	if (static_branch_unlikely(&clean_acked_data_enabled.key))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3641	if (icsk->icsk_clean_acked)
				3642	icsk->icsk_clean_acked(sk, ack);
				3643	#endif
				3644	}
				3645
				3646	prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
				3647	rs.prior_in_flight = tcp_packets_in_flight(tp);
				3648
				3649	/* ts_recent update must be made after we are sure that the packet
				3650	* is in window.
				3651	*/
				3652	if (flag & FLAG_UPDATE_TS_RECENT)
				3653	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
				3654
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3655	if ((flag & (FLAG_SLOWPATH \| FLAG_SND_UNA_ADVANCED)) ==
				3656	FLAG_SND_UNA_ADVANCED) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3657	/* Window is constant, pure forward advance.
				3658	* No more checks are required.
				3659	* Note, we use the fact that SND.UNA>=SND.WL2.
				3660	*/
				3661	tcp_update_wl(tp, ack_seq);
				3662	tcp_snd_una_update(tp, ack);
				3663	flag \|= FLAG_WIN_UPDATE;
				3664
				3665	tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
				3666
				3667	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
				3668	} else {
				3669	u32 ack_ev_flags = CA_ACK_SLOWPATH;
				3670
				3671	if (ack_seq != TCP_SKB_CB(skb)->end_seq)
				3672	flag \|= FLAG_DATA;
				3673	else
				3674	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
				3675
				3676	flag \|= tcp_ack_update_window(sk, skb, ack, ack_seq);
				3677
				3678	if (TCP_SKB_CB(skb)->sacked)
				3679	flag \|= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
				3680	&sack_state);
				3681
				3682	if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
				3683	flag \|= FLAG_ECE;
				3684	ack_ev_flags \|= CA_ACK_ECE;
				3685	}
				3686
				3687	if (flag & FLAG_WIN_UPDATE)
				3688	ack_ev_flags \|= CA_ACK_WIN_UPDATE;
				3689
				3690	tcp_in_ack_event(sk, ack_ev_flags);
				3691	}
				3692
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3693	/* This is a deviation from RFC3168 since it states that:
				3694	* "When the TCP data sender is ready to set the CWR bit after reducing
				3695	* the congestion window, it SHOULD set the CWR bit only on the first
				3696	* new data packet that it transmits."
				3697	* We accept CWR on pure ACKs to be more robust
				3698	* with widely-deployed TCP implementations that do this.
				3699	*/
				3700	tcp_ecn_accept_cwr(sk, skb);
				3701
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3702	/* We passed data and got it acked, remove any soft error
				3703	* log. Something worked...
				3704	*/
				3705	sk->sk_err_soft = 0;
				3706	icsk->icsk_probes_out = 0;
				3707	tp->rcv_tstamp = tcp_jiffies32;
				3708	if (!prior_packets)
				3709	goto no_queue;
				3710
				3711	/* See if we can take anything off of the retransmit queue. */
				3712	flag \|= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state);
				3713
				3714	tcp_rack_update_reo_wnd(sk, &rs);
				3715
				3716	if (tp->tlp_high_seq)
				3717	tcp_process_tlp_ack(sk, ack, flag);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3718
				3719	if (tcp_ack_is_dubious(sk, flag)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3720	if (!(flag & (FLAG_SND_UNA_ADVANCED \| FLAG_NOT_DUP))) {
				3721	num_dupack = 1;
				3722	/* Consider if pure acks were aggregated in tcp_add_backlog() */
				3723	if (!(flag & FLAG_DATA))
				3724	num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
				3725	}
				3726	tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3727	&rexmit);
				3728	}
				3729
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3730	/* If needed, reset TLP/RTO timer when RACK doesn't set. */
				3731	if (flag & FLAG_SET_XMIT_TIMER)
				3732	tcp_set_xmit_timer(sk);
				3733
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3734	if ((flag & FLAG_FORWARD_PROGRESS) \|\| !(flag & FLAG_NOT_DUP))
				3735	sk_dst_confirm(sk);
				3736
				3737	delivered = tcp_newly_delivered(sk, delivered, flag);
				3738	lost = tp->lost - lost; /* freshly marked lost */
				3739	rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
				3740	tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
				3741	tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
				3742	tcp_xmit_recovery(sk, rexmit);
				3743	return 1;
				3744
				3745	no_queue:
				3746	/* If data was DSACKed, see if we can undo a cwnd reduction. */
				3747	if (flag & FLAG_DSACKING_ACK) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3748	tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3749	&rexmit);
				3750	tcp_newly_delivered(sk, delivered, flag);
				3751	}
				3752	/* If this ack opens up a zero window, clear backoff. It was
				3753	* being used to time the probes, and is probably far higher than
				3754	* it needs to be for normal retransmission.
				3755	*/
				3756	tcp_ack_probe(sk);
				3757
				3758	if (tp->tlp_high_seq)
				3759	tcp_process_tlp_ack(sk, ack, flag);
				3760	return 1;
				3761
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3762	old_ack:
				3763	/* If data was SACKed, tag it and see if we should send more data.
				3764	* If data was DSACKed, see if we can undo a cwnd reduction.
				3765	*/
				3766	if (TCP_SKB_CB(skb)->sacked) {
				3767	flag \|= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
				3768	&sack_state);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3769	tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3770	&rexmit);
				3771	tcp_newly_delivered(sk, delivered, flag);
				3772	tcp_xmit_recovery(sk, rexmit);
				3773	}
				3774
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3775	return 0;
				3776	}
				3777
				3778	static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
				3779	bool syn, struct tcp_fastopen_cookie *foc,
				3780	bool exp_opt)
				3781	{
				3782	/* Valid only in SYN or SYN-ACK with an even length. */
				3783	if (!foc \|\| !syn \|\| len < 0 \|\| (len & 1))
				3784	return;
				3785
				3786	if (len >= TCP_FASTOPEN_COOKIE_MIN &&
				3787	len <= TCP_FASTOPEN_COOKIE_MAX)
				3788	memcpy(foc->val, cookie, len);
				3789	else if (len != 0)
				3790	len = -1;
				3791	foc->len = len;
				3792	foc->exp = exp_opt;
				3793	}
				3794
				3795	static void smc_parse_options(const struct tcphdr *th,
				3796	struct tcp_options_received *opt_rx,
				3797	const unsigned char *ptr,
				3798	int opsize)
				3799	{
				3800	#if IS_ENABLED(CONFIG_SMC)
				3801	if (static_branch_unlikely(&tcp_have_smc)) {
				3802	if (th->syn && !(opsize & 1) &&
				3803	opsize >= TCPOLEN_EXP_SMC_BASE &&
				3804	get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
				3805	opt_rx->smc_ok = 1;
				3806	}
				3807	#endif
				3808	}
				3809
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3810	/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
				3811	* value on success.
				3812	*/
				3813	static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
				3814	{
				3815	const unsigned char ptr = (const unsigned char )(th + 1);
				3816	int length = (th->doff * 4) - sizeof(struct tcphdr);
				3817	u16 mss = 0;
				3818
				3819	while (length > 0) {
				3820	int opcode = *ptr++;
				3821	int opsize;
				3822
				3823	switch (opcode) {
				3824	case TCPOPT_EOL:
				3825	return mss;
				3826	case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
				3827	length--;
				3828	continue;
				3829	default:
				3830	if (length < 2)
				3831	return mss;
				3832	opsize = *ptr++;
				3833	if (opsize < 2) /* "silly options" */
				3834	return mss;
				3835	if (opsize > length)
				3836	return mss; /* fail on partial options */
				3837	if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
				3838	u16 in_mss = get_unaligned_be16(ptr);
				3839
				3840	if (in_mss) {
				3841	if (user_mss && user_mss < in_mss)
				3842	in_mss = user_mss;
				3843	mss = in_mss;
				3844	}
				3845	}
				3846	ptr += opsize - 2;
				3847	length -= opsize;
				3848	}
				3849	}
				3850	return mss;
				3851	}
				3852
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3853	/* Look for tcp options. Normally only called on SYN and SYNACK packets.
				3854	* But, this can also be called on packets in the established flow when
				3855	* the fast version below fails.
				3856	*/
				3857	void tcp_parse_options(const struct net *net,
				3858	const struct sk_buff *skb,
				3859	struct tcp_options_received *opt_rx, int estab,
				3860	struct tcp_fastopen_cookie *foc)
				3861	{
				3862	const unsigned char *ptr;
				3863	const struct tcphdr *th = tcp_hdr(skb);
				3864	int length = (th->doff * 4) - sizeof(struct tcphdr);
				3865
				3866	ptr = (const unsigned char *)(th + 1);
				3867	opt_rx->saw_tstamp = 0;
				3868
				3869	while (length > 0) {
				3870	int opcode = *ptr++;
				3871	int opsize;
				3872
				3873	switch (opcode) {
				3874	case TCPOPT_EOL:
				3875	return;
				3876	case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
				3877	length--;
				3878	continue;
				3879	default:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3880	if (length < 2)
				3881	return;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3882	opsize = *ptr++;
				3883	if (opsize < 2) /* "silly options" */
				3884	return;
				3885	if (opsize > length)
				3886	return; /* don't parse partial options */
				3887	switch (opcode) {
				3888	case TCPOPT_MSS:
				3889	if (opsize == TCPOLEN_MSS && th->syn && !estab) {
				3890	u16 in_mss = get_unaligned_be16(ptr);
				3891	if (in_mss) {
				3892	if (opt_rx->user_mss &&
				3893	opt_rx->user_mss < in_mss)
				3894	in_mss = opt_rx->user_mss;
				3895	opt_rx->mss_clamp = in_mss;
				3896	}
				3897	}
				3898	break;
				3899	case TCPOPT_WINDOW:
				3900	if (opsize == TCPOLEN_WINDOW && th->syn &&
				3901	!estab && net->ipv4.sysctl_tcp_window_scaling) {
				3902	__u8 snd_wscale = (__u8 )ptr;
				3903	opt_rx->wscale_ok = 1;
				3904	if (snd_wscale > TCP_MAX_WSCALE) {
				3905	net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n",
				3906	__func__,
				3907	snd_wscale,
				3908	TCP_MAX_WSCALE);
				3909	snd_wscale = TCP_MAX_WSCALE;
				3910	}
				3911	opt_rx->snd_wscale = snd_wscale;
				3912	}
				3913	break;
				3914	case TCPOPT_TIMESTAMP:
				3915	if ((opsize == TCPOLEN_TIMESTAMP) &&
				3916	((estab && opt_rx->tstamp_ok) \|\|
				3917	(!estab && net->ipv4.sysctl_tcp_timestamps))) {
				3918	opt_rx->saw_tstamp = 1;
				3919	opt_rx->rcv_tsval = get_unaligned_be32(ptr);
				3920	opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
				3921	}
				3922	break;
				3923	case TCPOPT_SACK_PERM:
				3924	if (opsize == TCPOLEN_SACK_PERM && th->syn &&
				3925	!estab && net->ipv4.sysctl_tcp_sack) {
				3926	opt_rx->sack_ok = TCP_SACK_SEEN;
				3927	tcp_sack_reset(opt_rx);
				3928	}
				3929	break;
				3930
				3931	case TCPOPT_SACK:
				3932	if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
				3933	!((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
				3934	opt_rx->sack_ok) {
				3935	TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
				3936	}
				3937	break;
				3938	#ifdef CONFIG_TCP_MD5SIG
				3939	case TCPOPT_MD5SIG:
				3940	/*
				3941	* The MD5 Hash has already been
				3942	* checked (see tcp_v{4,6}_do_rcv()).
				3943	*/
				3944	break;
				3945	#endif
				3946	case TCPOPT_FASTOPEN:
				3947	tcp_parse_fastopen_option(
				3948	opsize - TCPOLEN_FASTOPEN_BASE,
				3949	ptr, th->syn, foc, false);
				3950	break;
				3951
				3952	case TCPOPT_EXP:
				3953	/* Fast Open option shares code 254 using a
				3954	* 16 bits magic number.
				3955	*/
				3956	if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
				3957	get_unaligned_be16(ptr) ==
				3958	TCPOPT_FASTOPEN_MAGIC)
				3959	tcp_parse_fastopen_option(opsize -
				3960	TCPOLEN_EXP_FASTOPEN_BASE,
				3961	ptr + 2, th->syn, foc, true);
				3962	else
				3963	smc_parse_options(th, opt_rx, ptr,
				3964	opsize);
				3965	break;
				3966
				3967	}
				3968	ptr += opsize-2;
				3969	length -= opsize;
				3970	}
				3971	}
				3972	}
				3973	EXPORT_SYMBOL(tcp_parse_options);
				3974
				3975	static bool tcp_parse_aligned_timestamp(struct tcp_sock tp, const struct tcphdr th)
				3976	{
				3977	const __be32 ptr = (const __be32 )(th + 1);
				3978
				3979	if (*ptr == htonl((TCPOPT_NOP << 24) \| (TCPOPT_NOP << 16)
				3980	\| (TCPOPT_TIMESTAMP << 8) \| TCPOLEN_TIMESTAMP)) {
				3981	tp->rx_opt.saw_tstamp = 1;
				3982	++ptr;
				3983	tp->rx_opt.rcv_tsval = ntohl(*ptr);
				3984	++ptr;
				3985	if (*ptr)
				3986	tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
				3987	else
				3988	tp->rx_opt.rcv_tsecr = 0;
				3989	return true;
				3990	}
				3991	return false;
				3992	}
				3993
				3994	/* Fast parse options. This hopes to only see timestamps.
				3995	* If it is wrong it falls back on tcp_parse_options().
				3996	*/
				3997	static bool tcp_fast_parse_options(const struct net *net,
				3998	const struct sk_buff *skb,
				3999	const struct tcphdr th, struct tcp_sock tp)
				4000	{
				4001	/* In the spirit of fast parsing, compare doff directly to constant
				4002	* values. Because equality is used, short doff can be ignored here.
				4003	*/
				4004	if (th->doff == (sizeof(*th) / 4)) {
				4005	tp->rx_opt.saw_tstamp = 0;
				4006	return false;
				4007	} else if (tp->rx_opt.tstamp_ok &&
				4008	th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
				4009	if (tcp_parse_aligned_timestamp(tp, th))
				4010	return true;
				4011	}
				4012
				4013	tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
				4014	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
				4015	tp->rx_opt.rcv_tsecr -= tp->tsoffset;
				4016
				4017	return true;
				4018	}
				4019
				4020	#ifdef CONFIG_TCP_MD5SIG
				4021	/*
				4022	* Parse MD5 Signature option
				4023	*/
				4024	const u8 tcp_parse_md5sig_option(const struct tcphdr th)
				4025	{
				4026	int length = (th->doff << 2) - sizeof(*th);
				4027	const u8 ptr = (const u8 )(th + 1);
				4028
				4029	/* If not enough data remaining, we can short cut */
				4030	while (length >= TCPOLEN_MD5SIG) {
				4031	int opcode = *ptr++;
				4032	int opsize;
				4033
				4034	switch (opcode) {
				4035	case TCPOPT_EOL:
				4036	return NULL;
				4037	case TCPOPT_NOP:
				4038	length--;
				4039	continue;
				4040	default:
				4041	opsize = *ptr++;
				4042	if (opsize < 2 \|\| opsize > length)
				4043	return NULL;
				4044	if (opcode == TCPOPT_MD5SIG)
				4045	return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
				4046	}
				4047	ptr += opsize - 2;
				4048	length -= opsize;
				4049	}
				4050	return NULL;
				4051	}
				4052	EXPORT_SYMBOL(tcp_parse_md5sig_option);
				4053	#endif
				4054
				4055	/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
				4056	*
				4057	* It is not fatal. If this ACK does _not_ change critical state (seqs, window)
				4058	* it can pass through stack. So, the following predicate verifies that
				4059	* this segment is not used for anything but congestion avoidance or
				4060	* fast retransmit. Moreover, we even are able to eliminate most of such
				4061	* second order effects, if we apply some small "replay" window (~RTO)
				4062	* to timestamp space.
				4063	*
				4064	* All these measures still do not guarantee that we reject wrapped ACKs
				4065	* on networks with high bandwidth, when sequence space is recycled quickly,
				4066	* but it guarantees that such events will be very rare and do not affect
				4067	* connection seriously. This doesn't look nice, but alas, PAWS is really
				4068	* buggy extension.
				4069	*
				4070	* [ Later note. Even worse! It is buggy for segments _with_ data. RFC
				4071	* states that events when retransmit arrives after original data are rare.
				4072	* It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
				4073	* the biggest problem on large power networks even with minor reordering.
				4074	* OK, let's give it small replay window. If peer clock is even 1hz, it is safe
				4075	* up to bandwidth of 18Gigabit/sec. 8) ]
				4076	*/
				4077
				4078	static int tcp_disordered_ack(const struct sock sk, const struct sk_buff skb)
				4079	{
				4080	const struct tcp_sock *tp = tcp_sk(sk);
				4081	const struct tcphdr *th = tcp_hdr(skb);
				4082	u32 seq = TCP_SKB_CB(skb)->seq;
				4083	u32 ack = TCP_SKB_CB(skb)->ack_seq;
				4084
				4085	return (/* 1. Pure ACK with correct sequence number. */
				4086	(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
				4087
				4088	/* 2. ... and duplicate ACK. */
				4089	ack == tp->snd_una &&
				4090
				4091	/* 3. ... and does not update window. */
				4092	!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
				4093
				4094	/* 4. ... and sits in replay window. */
				4095	(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
				4096	}
				4097
				4098	static inline bool tcp_paws_discard(const struct sock *sk,
				4099	const struct sk_buff *skb)
				4100	{
				4101	const struct tcp_sock *tp = tcp_sk(sk);
				4102
				4103	return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
				4104	!tcp_disordered_ack(sk, skb);
				4105	}
				4106
				4107	/* Check segment sequence number for validity.
				4108	*
				4109	* Segment controls are considered valid, if the segment
				4110	* fits to the window after truncation to the window. Acceptability
				4111	* of data (and SYN, FIN, of course) is checked separately.
				4112	* See tcp_data_queue(), for example.
				4113	*
				4114	* Also, controls (RST is main one) are accepted using RCV.WUP instead
				4115	* of RCV.NXT. Peer still did not advance his SND.UNA when we
				4116	* delayed ACK, so that his SND.UNA <= our RCV.WUP.
				4117	* (borrowed from freebsd)
				4118	*/
				4119
				4120	static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
				4121	{
				4122	return !before(end_seq, tp->rcv_wup) &&
				4123	!after(seq, tp->rcv_nxt + tcp_receive_window(tp));
				4124	}
				4125
				4126	/* When we get a reset we do this. */
				4127	void tcp_reset(struct sock *sk)
				4128	{
				4129	trace_tcp_receive_reset(sk);
				4130
				4131	/* We want the right error as BSD sees it (and indeed as we do). */
				4132	switch (sk->sk_state) {
				4133	case TCP_SYN_SENT:
				4134	sk->sk_err = ECONNREFUSED;
				4135	break;
				4136	case TCP_CLOSE_WAIT:
				4137	sk->sk_err = EPIPE;
				4138	break;
				4139	case TCP_CLOSE:
				4140	return;
				4141	default:
				4142	sk->sk_err = ECONNRESET;
				4143	}
				4144	/* This barrier is coupled with smp_rmb() in tcp_poll() */
				4145	smp_wmb();
				4146
				4147	tcp_write_queue_purge(sk);
				4148	tcp_done(sk);
				4149
				4150	if (!sock_flag(sk, SOCK_DEAD))
				4151	sk->sk_error_report(sk);
				4152	}
				4153
				4154	/*
				4155	* Process the FIN bit. This now behaves as it is supposed to work
				4156	* and the FIN takes effect when it is validly part of sequence
				4157	* space. Not before when we get holes.
				4158	*
				4159	* If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
				4160	* (and thence onto LAST-ACK and finally, CLOSE, we never enter
				4161	* TIME-WAIT)
				4162	*
				4163	* If we are in FINWAIT-1, a received FIN indicates simultaneous
				4164	* close and we go into CLOSING (and later onto TIME-WAIT)
				4165	*
				4166	* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
				4167	*/
				4168	void tcp_fin(struct sock *sk)
				4169	{
				4170	struct tcp_sock *tp = tcp_sk(sk);
				4171
				4172	inet_csk_schedule_ack(sk);
				4173
				4174	sk->sk_shutdown \|= RCV_SHUTDOWN;
				4175	sock_set_flag(sk, SOCK_DONE);
				4176
				4177	switch (sk->sk_state) {
				4178	case TCP_SYN_RECV:
				4179	case TCP_ESTABLISHED:
				4180	/* Move to CLOSE_WAIT */
				4181	tcp_set_state(sk, TCP_CLOSE_WAIT);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4182	inet_csk_enter_pingpong_mode(sk);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4183	break;
				4184
				4185	case TCP_CLOSE_WAIT:
				4186	case TCP_CLOSING:
				4187	/* Received a retransmission of the FIN, do
				4188	* nothing.
				4189	*/
				4190	break;
				4191	case TCP_LAST_ACK:
				4192	/* RFC793: Remain in the LAST-ACK state. */
				4193	break;
				4194
				4195	case TCP_FIN_WAIT1:
				4196	/* This case occurs when a simultaneous close
				4197	* happens, we must ack the received FIN and
				4198	* enter the CLOSING state.
				4199	*/
				4200	tcp_send_ack(sk);
				4201	tcp_set_state(sk, TCP_CLOSING);
				4202	break;
				4203	case TCP_FIN_WAIT2:
				4204	/* Received a FIN -- send ACK and enter TIME_WAIT. */
				4205	tcp_send_ack(sk);
				4206	tcp_time_wait(sk, TCP_TIME_WAIT, 0);
				4207	break;
				4208	default:
				4209	/* Only TCP_LISTEN and TCP_CLOSE are left, in these
				4210	* cases we should never reach this piece of code.
				4211	*/
				4212	pr_err("%s: Impossible, sk->sk_state=%d\n",
				4213	__func__, sk->sk_state);
				4214	break;
				4215	}
				4216
				4217	/* It _is_ possible, that we have something out-of-order _after_ FIN.
				4218	* Probably, we should reset in this case. For now drop them.
				4219	*/
				4220	skb_rbtree_purge(&tp->out_of_order_queue);
				4221	if (tcp_is_sack(tp))
				4222	tcp_sack_reset(&tp->rx_opt);
				4223	sk_mem_reclaim(sk);
				4224
				4225	if (!sock_flag(sk, SOCK_DEAD)) {
				4226	sk->sk_state_change(sk);
				4227
				4228	/* Do not send POLL_HUP for half duplex close. */
				4229	if (sk->sk_shutdown == SHUTDOWN_MASK \|\|
				4230	sk->sk_state == TCP_CLOSE)
				4231	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
				4232	else
				4233	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
				4234	}
				4235	}
				4236
				4237	static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
				4238	u32 end_seq)
				4239	{
				4240	if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
				4241	if (before(seq, sp->start_seq))
				4242	sp->start_seq = seq;
				4243	if (after(end_seq, sp->end_seq))
				4244	sp->end_seq = end_seq;
				4245	return true;
				4246	}
				4247	return false;
				4248	}
				4249
				4250	static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
				4251	{
				4252	struct tcp_sock *tp = tcp_sk(sk);
				4253
				4254	if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
				4255	int mib_idx;
				4256
				4257	if (before(seq, tp->rcv_nxt))
				4258	mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
				4259	else
				4260	mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
				4261
				4262	NET_INC_STATS(sock_net(sk), mib_idx);
				4263
				4264	tp->rx_opt.dsack = 1;
				4265	tp->duplicate_sack[0].start_seq = seq;
				4266	tp->duplicate_sack[0].end_seq = end_seq;
				4267	}
				4268	}
				4269
				4270	static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
				4271	{
				4272	struct tcp_sock *tp = tcp_sk(sk);
				4273
				4274	if (!tp->rx_opt.dsack)
				4275	tcp_dsack_set(sk, seq, end_seq);
				4276	else
				4277	tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
				4278	}
				4279
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4280	static void tcp_rcv_spurious_retrans(struct sock sk, const struct sk_buff skb)
				4281	{
				4282	/* When the ACK path fails or drops most ACKs, the sender would
				4283	* timeout and spuriously retransmit the same segment repeatedly.
				4284	* The receiver remembers and reflects via DSACKs. Leverage the
				4285	* DSACK state and change the txhash to re-route speculatively.
				4286	*/
				4287	if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq)
				4288	sk_rethink_txhash(sk);
				4289	}
				4290
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4291	static void tcp_send_dupack(struct sock sk, const struct sk_buff skb)
				4292	{
				4293	struct tcp_sock *tp = tcp_sk(sk);
				4294
				4295	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
				4296	before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
				4297	NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
				4298	tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
				4299
				4300	if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
				4301	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
				4302
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4303	tcp_rcv_spurious_retrans(sk, skb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4304	if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
				4305	end_seq = tp->rcv_nxt;
				4306	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
				4307	}
				4308	}
				4309
				4310	tcp_send_ack(sk);
				4311	}
				4312
				4313	/* These routines update the SACK block as out-of-order packets arrive or
				4314	* in-order packets close up the sequence space.
				4315	*/
				4316	static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
				4317	{
				4318	int this_sack;
				4319	struct tcp_sack_block *sp = &tp->selective_acks[0];
				4320	struct tcp_sack_block *swalk = sp + 1;
				4321
				4322	/* See if the recent change to the first SACK eats into
				4323	* or hits the sequence space of other SACK blocks, if so coalesce.
				4324	*/
				4325	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
				4326	if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
				4327	int i;
				4328
				4329	/* Zap SWALK, by moving every further SACK up by one slot.
				4330	* Decrease num_sacks.
				4331	*/
				4332	tp->rx_opt.num_sacks--;
				4333	for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
				4334	sp[i] = sp[i + 1];
				4335	continue;
				4336	}
				4337	this_sack++, swalk++;
				4338	}
				4339	}
				4340
				4341	static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
				4342	{
				4343	struct tcp_sock *tp = tcp_sk(sk);
				4344	struct tcp_sack_block *sp = &tp->selective_acks[0];
				4345	int cur_sacks = tp->rx_opt.num_sacks;
				4346	int this_sack;
				4347
				4348	if (!cur_sacks)
				4349	goto new_sack;
				4350
				4351	for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
				4352	if (tcp_sack_extend(sp, seq, end_seq)) {
				4353	/* Rotate this_sack to the first one. */
				4354	for (; this_sack > 0; this_sack--, sp--)
				4355	swap(sp, (sp - 1));
				4356	if (cur_sacks > 1)
				4357	tcp_sack_maybe_coalesce(tp);
				4358	return;
				4359	}
				4360	}
				4361
				4362	/* Could not find an adjacent existing SACK, build a new one,
				4363	* put it at the front, and shift everyone else down. We
				4364	* always know there is at least one SACK present already here.
				4365	*
				4366	* If the sack array is full, forget about the last one.
				4367	*/
				4368	if (this_sack >= TCP_NUM_SACKS) {
				4369	if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
				4370	tcp_send_ack(sk);
				4371	this_sack--;
				4372	tp->rx_opt.num_sacks--;
				4373	sp--;
				4374	}
				4375	for (; this_sack > 0; this_sack--, sp--)
				4376	sp = (sp - 1);
				4377
				4378	new_sack:
				4379	/* Build the new head SACK, and we're done. */
				4380	sp->start_seq = seq;
				4381	sp->end_seq = end_seq;
				4382	tp->rx_opt.num_sacks++;
				4383	}
				4384
				4385	/* RCV.NXT advances, some SACKs should be eaten. */
				4386
				4387	static void tcp_sack_remove(struct tcp_sock *tp)
				4388	{
				4389	struct tcp_sack_block *sp = &tp->selective_acks[0];
				4390	int num_sacks = tp->rx_opt.num_sacks;
				4391	int this_sack;
				4392
				4393	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
				4394	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
				4395	tp->rx_opt.num_sacks = 0;
				4396	return;
				4397	}
				4398
				4399	for (this_sack = 0; this_sack < num_sacks;) {
				4400	/* Check if the start of the sack is covered by RCV.NXT. */
				4401	if (!before(tp->rcv_nxt, sp->start_seq)) {
				4402	int i;
				4403
				4404	/* RCV.NXT must cover all the block! */
				4405	WARN_ON(before(tp->rcv_nxt, sp->end_seq));
				4406
				4407	/* Zap this SACK, by moving forward any other SACKS. */
				4408	for (i = this_sack+1; i < num_sacks; i++)
				4409	tp->selective_acks[i-1] = tp->selective_acks[i];
				4410	num_sacks--;
				4411	continue;
				4412	}
				4413	this_sack++;
				4414	sp++;
				4415	}
				4416	tp->rx_opt.num_sacks = num_sacks;
				4417	}
				4418
				4419	/**
				4420	* tcp_try_coalesce - try to merge skb to prior one
				4421	* @sk: socket
				4422	* @dest: destination queue
				4423	* @to: prior buffer
				4424	* @from: buffer to add in queue
				4425	* @fragstolen: pointer to boolean
				4426	*
				4427	* Before queueing skb @from after @to, try to merge them
				4428	* to reduce overall memory use and queue lengths, if cost is small.
				4429	* Packets in ofo or receive queues can stay a long time.
				4430	* Better try to coalesce them right now to avoid future collapses.
				4431	* Returns true if caller should free @from instead of queueing it
				4432	*/
				4433	static bool tcp_try_coalesce(struct sock *sk,
				4434	struct sk_buff *to,
				4435	struct sk_buff *from,
				4436	bool *fragstolen)
				4437	{
				4438	int delta;
				4439
				4440	*fragstolen = false;
				4441
				4442	/* Its possible this segment overlaps with prior segment in queue */
				4443	if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
				4444	return false;
				4445
				4446	#ifdef CONFIG_TLS_DEVICE
				4447	if (from->decrypted != to->decrypted)
				4448	return false;
				4449	#endif
				4450
				4451	if (!skb_try_coalesce(to, from, fragstolen, &delta))
				4452	return false;
				4453
				4454	atomic_add(delta, &sk->sk_rmem_alloc);
				4455	sk_mem_charge(sk, delta);
				4456	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
				4457	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
				4458	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
				4459	TCP_SKB_CB(to)->tcp_flags \|= TCP_SKB_CB(from)->tcp_flags;
				4460
				4461	if (TCP_SKB_CB(from)->has_rxtstamp) {
				4462	TCP_SKB_CB(to)->has_rxtstamp = true;
				4463	to->tstamp = from->tstamp;
				4464	skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp;
				4465	}
				4466
				4467	return true;
				4468	}
				4469
				4470	static bool tcp_ooo_try_coalesce(struct sock *sk,
				4471	struct sk_buff *to,
				4472	struct sk_buff *from,
				4473	bool *fragstolen)
				4474	{
				4475	bool res = tcp_try_coalesce(sk, to, from, fragstolen);
				4476
				4477	/* In case tcp_drop() is called later, update to->gso_segs */
				4478	if (res) {
				4479	u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
				4480	max_t(u16, 1, skb_shinfo(from)->gso_segs);
				4481
				4482	skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
				4483	}
				4484	return res;
				4485	}
				4486
				4487	static void tcp_drop(struct sock sk, struct sk_buff skb)
				4488	{
				4489	sk_drops_add(sk, skb);
				4490	__kfree_skb(skb);
				4491	}
				4492
				4493	/* This one checks to see if we can put data from the
				4494	* out_of_order queue into the receive_queue.
				4495	*/
				4496	static void tcp_ofo_queue(struct sock *sk)
				4497	{
				4498	struct tcp_sock *tp = tcp_sk(sk);
				4499	__u32 dsack_high = tp->rcv_nxt;
				4500	bool fin, fragstolen, eaten;
				4501	struct sk_buff skb, tail;
				4502	struct rb_node *p;
				4503
				4504	p = rb_first(&tp->out_of_order_queue);
				4505	while (p) {
				4506	skb = rb_to_skb(p);
				4507	if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
				4508	break;
				4509
				4510	if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
				4511	__u32 dsack = dsack_high;
				4512	if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
				4513	dsack_high = TCP_SKB_CB(skb)->end_seq;
				4514	tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
				4515	}
				4516	p = rb_next(p);
				4517	rb_erase(&skb->rbnode, &tp->out_of_order_queue);
				4518
				4519	if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4520	tcp_drop(sk, skb);
				4521	continue;
				4522	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4523
				4524	tail = skb_peek_tail(&sk->sk_receive_queue);
				4525	eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
				4526	tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
				4527	fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
				4528	if (!eaten)
				4529	__skb_queue_tail(&sk->sk_receive_queue, skb);
				4530	else
				4531	kfree_skb_partial(skb, fragstolen);
				4532
				4533	if (unlikely(fin)) {
				4534	tcp_fin(sk);
				4535	/* tcp_fin() purges tp->out_of_order_queue,
				4536	* so we must end this loop right now.
				4537	*/
				4538	break;
				4539	}
				4540	}
				4541	}
				4542
				4543	static bool tcp_prune_ofo_queue(struct sock *sk);
				4544	static int tcp_prune_queue(struct sock *sk);
				4545
				4546	static int tcp_try_rmem_schedule(struct sock sk, struct sk_buff skb,
				4547	unsigned int size)
				4548	{
				4549	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf \|\|
				4550	!sk_rmem_schedule(sk, skb, size)) {
				4551
				4552	if (tcp_prune_queue(sk) < 0)
				4553	return -1;
				4554
				4555	while (!sk_rmem_schedule(sk, skb, size)) {
				4556	if (!tcp_prune_ofo_queue(sk))
				4557	return -1;
				4558	}
				4559	}
				4560	return 0;
				4561	}
				4562
				4563	static void tcp_data_queue_ofo(struct sock sk, struct sk_buff skb)
				4564	{
				4565	struct tcp_sock *tp = tcp_sk(sk);
				4566	struct rb_node *p, parent;
				4567	struct sk_buff *skb1;
				4568	u32 seq, end_seq;
				4569	bool fragstolen;
				4570
				4571	tcp_ecn_check_ce(sk, skb);
				4572
				4573	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
				4574	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4575	sk->sk_data_ready(sk);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4576	tcp_drop(sk, skb);
				4577	return;
				4578	}
				4579
				4580	/* Disable header prediction. */
				4581	tp->pred_flags = 0;
				4582	inet_csk_schedule_ack(sk);
				4583
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4584	tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4585	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
				4586	seq = TCP_SKB_CB(skb)->seq;
				4587	end_seq = TCP_SKB_CB(skb)->end_seq;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4588
				4589	p = &tp->out_of_order_queue.rb_node;
				4590	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
				4591	/* Initial out of order segment, build 1 SACK. */
				4592	if (tcp_is_sack(tp)) {
				4593	tp->rx_opt.num_sacks = 1;
				4594	tp->selective_acks[0].start_seq = seq;
				4595	tp->selective_acks[0].end_seq = end_seq;
				4596	}
				4597	rb_link_node(&skb->rbnode, NULL, p);
				4598	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
				4599	tp->ooo_last_skb = skb;
				4600	goto end;
				4601	}
				4602
				4603	/* In the typical case, we are adding an skb to the end of the list.
				4604	* Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
				4605	*/
				4606	if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
				4607	skb, &fragstolen)) {
				4608	coalesce_done:
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4609	/* For non sack flows, do not grow window to force DUPACK
				4610	* and trigger fast retransmit.
				4611	*/
				4612	if (tcp_is_sack(tp))
				4613	tcp_grow_window(sk, skb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4614	kfree_skb_partial(skb, fragstolen);
				4615	skb = NULL;
				4616	goto add_sack;
				4617	}
				4618	/* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
				4619	if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
				4620	parent = &tp->ooo_last_skb->rbnode;
				4621	p = &parent->rb_right;
				4622	goto insert;
				4623	}
				4624
				4625	/* Find place to insert this segment. Handle overlaps on the way. */
				4626	parent = NULL;
				4627	while (*p) {
				4628	parent = *p;
				4629	skb1 = rb_to_skb(parent);
				4630	if (before(seq, TCP_SKB_CB(skb1)->seq)) {
				4631	p = &parent->rb_left;
				4632	continue;
				4633	}
				4634	if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
				4635	if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
				4636	/* All the bits are present. Drop. */
				4637	NET_INC_STATS(sock_net(sk),
				4638	LINUX_MIB_TCPOFOMERGE);
				4639	tcp_drop(sk, skb);
				4640	skb = NULL;
				4641	tcp_dsack_set(sk, seq, end_seq);
				4642	goto add_sack;
				4643	}
				4644	if (after(seq, TCP_SKB_CB(skb1)->seq)) {
				4645	/* Partial overlap. */
				4646	tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
				4647	} else {
				4648	/* skb's seq == skb1's seq and skb covers skb1.
				4649	* Replace skb1 with skb.
				4650	*/
				4651	rb_replace_node(&skb1->rbnode, &skb->rbnode,
				4652	&tp->out_of_order_queue);
				4653	tcp_dsack_extend(sk,
				4654	TCP_SKB_CB(skb1)->seq,
				4655	TCP_SKB_CB(skb1)->end_seq);
				4656	NET_INC_STATS(sock_net(sk),
				4657	LINUX_MIB_TCPOFOMERGE);
				4658	tcp_drop(sk, skb1);
				4659	goto merge_right;
				4660	}
				4661	} else if (tcp_ooo_try_coalesce(sk, skb1,
				4662	skb, &fragstolen)) {
				4663	goto coalesce_done;
				4664	}
				4665	p = &parent->rb_right;
				4666	}
				4667	insert:
				4668	/* Insert segment into RB tree. */
				4669	rb_link_node(&skb->rbnode, parent, p);
				4670	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
				4671
				4672	merge_right:
				4673	/* Remove other segments covered by skb. */
				4674	while ((skb1 = skb_rb_next(skb)) != NULL) {
				4675	if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
				4676	break;
				4677	if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
				4678	tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
				4679	end_seq);
				4680	break;
				4681	}
				4682	rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
				4683	tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
				4684	TCP_SKB_CB(skb1)->end_seq);
				4685	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
				4686	tcp_drop(sk, skb1);
				4687	}
				4688	/* If there is no skb after us, we are the last_skb ! */
				4689	if (!skb1)
				4690	tp->ooo_last_skb = skb;
				4691
				4692	add_sack:
				4693	if (tcp_is_sack(tp))
				4694	tcp_sack_new_ofo_skb(sk, seq, end_seq);
				4695	end:
				4696	if (skb) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4697	/* For non sack flows, do not grow window to force DUPACK
				4698	* and trigger fast retransmit.
				4699	*/
				4700	if (tcp_is_sack(tp))
				4701	tcp_grow_window(sk, skb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4702	skb_condense(skb);
				4703	skb_set_owner_r(skb, sk);
				4704	}
				4705	}
				4706
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4707	static int __must_check tcp_queue_rcv(struct sock sk, struct sk_buff skb,
				4708	bool *fragstolen)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4709	{
				4710	int eaten;
				4711	struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
				4712
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4713	eaten = (tail &&
				4714	tcp_try_coalesce(sk, tail,
				4715	skb, fragstolen)) ? 1 : 0;
				4716	tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
				4717	if (!eaten) {
				4718	__skb_queue_tail(&sk->sk_receive_queue, skb);
				4719	skb_set_owner_r(skb, sk);
				4720	}
				4721	return eaten;
				4722	}
				4723
				4724	int tcp_send_rcvq(struct sock sk, struct msghdr msg, size_t size)
				4725	{
				4726	struct sk_buff *skb;
				4727	int err = -ENOMEM;
				4728	int data_len = 0;
				4729	bool fragstolen;
				4730
				4731	if (size == 0)
				4732	return 0;
				4733
				4734	if (size > PAGE_SIZE) {
				4735	int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
				4736
				4737	data_len = npages << PAGE_SHIFT;
				4738	size = data_len + (size & ~PAGE_MASK);
				4739	}
				4740	skb = alloc_skb_with_frags(size - data_len, data_len,
				4741	PAGE_ALLOC_COSTLY_ORDER,
				4742	&err, sk->sk_allocation);
				4743	if (!skb)
				4744	goto err;
				4745
				4746	skb_put(skb, size - data_len);
				4747	skb->data_len = data_len;
				4748	skb->len = size;
				4749
				4750	if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
				4751	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
				4752	goto err_free;
				4753	}
				4754
				4755	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
				4756	if (err)
				4757	goto err_free;
				4758
				4759	TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
				4760	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
				4761	TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
				4762
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4763	if (tcp_queue_rcv(sk, skb, &fragstolen)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4764	WARN_ON_ONCE(fragstolen); /* should not happen */
				4765	__kfree_skb(skb);
				4766	}
				4767	return size;
				4768
				4769	err_free:
				4770	kfree_skb(skb);
				4771	err:
				4772	return err;
				4773
				4774	}
				4775
				4776	void tcp_data_ready(struct sock *sk)
				4777	{
				4778	const struct tcp_sock *tp = tcp_sk(sk);
				4779	int avail = tp->rcv_nxt - tp->copied_seq;
				4780
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4781	if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) &&
				4782	!sock_flag(sk, SOCK_DONE) &&
				4783	tcp_receive_window(tp) > inet_csk(sk)->icsk_ack.rcv_mss)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4784	return;
				4785
				4786	sk->sk_data_ready(sk);
				4787	}
				4788
				4789	static void tcp_data_queue(struct sock sk, struct sk_buff skb)
				4790	{
				4791	struct tcp_sock *tp = tcp_sk(sk);
				4792	bool fragstolen;
				4793	int eaten;
				4794
				4795	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
				4796	__kfree_skb(skb);
				4797	return;
				4798	}
				4799	skb_dst_drop(skb);
				4800	__skb_pull(skb, tcp_hdr(skb)->doff * 4);
				4801
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4802	tp->rx_opt.dsack = 0;
				4803
				4804	/* Queue data for delivery to the user.
				4805	* Packets in sequence go to the receive queue.
				4806	* Out of sequence packets to the out_of_order_queue.
				4807	*/
				4808	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
				4809	if (tcp_receive_window(tp) == 0) {
				4810	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
				4811	goto out_of_window;
				4812	}
				4813
				4814	/* Ok. In sequence. In window. */
				4815	queue_and_out:
				4816	if (skb_queue_len(&sk->sk_receive_queue) == 0)
				4817	sk_forced_mem_schedule(sk, skb->truesize);
				4818	else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
				4819	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4820	sk->sk_data_ready(sk);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4821	goto drop;
				4822	}
				4823
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4824	eaten = tcp_queue_rcv(sk, skb, &fragstolen);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4825	if (skb->len)
				4826	tcp_event_data_recv(sk, skb);
				4827	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				4828	tcp_fin(sk);
				4829
				4830	if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
				4831	tcp_ofo_queue(sk);
				4832
				4833	/* RFC5681. 4.2. SHOULD send immediate ACK, when
				4834	* gap in queue is filled.
				4835	*/
				4836	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
				4837	inet_csk(sk)->icsk_ack.pending \|= ICSK_ACK_NOW;
				4838	}
				4839
				4840	if (tp->rx_opt.num_sacks)
				4841	tcp_sack_remove(tp);
				4842
				4843	tcp_fast_path_check(sk);
				4844
				4845	if (eaten > 0)
				4846	kfree_skb_partial(skb, fragstolen);
				4847	if (!sock_flag(sk, SOCK_DEAD))
				4848	tcp_data_ready(sk);
				4849	return;
				4850	}
				4851
				4852	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4853	tcp_rcv_spurious_retrans(sk, skb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4854	/* A retransmit, 2nd most common case. Force an immediate ack. */
				4855	NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
				4856	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
				4857
				4858	out_of_window:
				4859	tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
				4860	inet_csk_schedule_ack(sk);
				4861	drop:
				4862	tcp_drop(sk, skb);
				4863	return;
				4864	}
				4865
				4866	/* Out of window. F.e. zero window probe. */
				4867	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
				4868	goto out_of_window;
				4869
				4870	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
				4871	/* Partial packet, seq < rcv_next < end_seq */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4872	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
				4873
				4874	/* If window is closed, drop tail of packet. But after
				4875	* remembering D-SACK for its head made in previous line.
				4876	*/
				4877	if (!tcp_receive_window(tp)) {
				4878	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
				4879	goto out_of_window;
				4880	}
				4881	goto queue_and_out;
				4882	}
				4883
				4884	tcp_data_queue_ofo(sk, skb);
				4885	}
				4886
				4887	static struct sk_buff tcp_skb_next(struct sk_buff skb, struct sk_buff_head *list)
				4888	{
				4889	if (list)
				4890	return !skb_queue_is_last(list, skb) ? skb->next : NULL;
				4891
				4892	return skb_rb_next(skb);
				4893	}
				4894
				4895	static struct sk_buff tcp_collapse_one(struct sock sk, struct sk_buff *skb,
				4896	struct sk_buff_head *list,
				4897	struct rb_root *root)
				4898	{
				4899	struct sk_buff *next = tcp_skb_next(skb, list);
				4900
				4901	if (list)
				4902	__skb_unlink(skb, list);
				4903	else
				4904	rb_erase(&skb->rbnode, root);
				4905
				4906	__kfree_skb(skb);
				4907	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
				4908
				4909	return next;
				4910	}
				4911
				4912	/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
				4913	void tcp_rbtree_insert(struct rb_root root, struct sk_buff skb)
				4914	{
				4915	struct rb_node **p = &root->rb_node;
				4916	struct rb_node *parent = NULL;
				4917	struct sk_buff *skb1;
				4918
				4919	while (*p) {
				4920	parent = *p;
				4921	skb1 = rb_to_skb(parent);
				4922	if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
				4923	p = &parent->rb_left;
				4924	else
				4925	p = &parent->rb_right;
				4926	}
				4927	rb_link_node(&skb->rbnode, parent, p);
				4928	rb_insert_color(&skb->rbnode, root);
				4929	}
				4930
				4931	/* Collapse contiguous sequence of skbs head..tail with
				4932	* sequence numbers start..end.
				4933	*
				4934	* If tail is NULL, this means until the end of the queue.
				4935	*
				4936	* Segments with FIN/SYN are not collapsed (only because this
				4937	* simplifies code)
				4938	*/
				4939	static void
				4940	tcp_collapse(struct sock sk, struct sk_buff_head list, struct rb_root *root,
				4941	struct sk_buff head, struct sk_buff tail, u32 start, u32 end)
				4942	{
				4943	struct sk_buff skb = head, n;
				4944	struct sk_buff_head tmp;
				4945	bool end_of_skbs;
				4946
				4947	/* First, check that queue is collapsible and find
				4948	* the point where collapsing can be useful.
				4949	*/
				4950	restart:
				4951	for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
				4952	n = tcp_skb_next(skb, list);
				4953
				4954	/* No new bits? It is possible on ofo queue. */
				4955	if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
				4956	skb = tcp_collapse_one(sk, skb, list, root);
				4957	if (!skb)
				4958	break;
				4959	goto restart;
				4960	}
				4961
				4962	/* The first skb to collapse is:
				4963	* - not SYN/FIN and
				4964	* - bloated or contains data before "start" or
				4965	* overlaps to the next one.
				4966	*/
				4967	if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN \| TCPHDR_FIN)) &&
				4968	(tcp_win_from_space(sk, skb->truesize) > skb->len \|\|
				4969	before(TCP_SKB_CB(skb)->seq, start))) {
				4970	end_of_skbs = false;
				4971	break;
				4972	}
				4973
				4974	if (n && n != tail &&
				4975	TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
				4976	end_of_skbs = false;
				4977	break;
				4978	}
				4979
				4980	/* Decided to skip this, advance start seq. */
				4981	start = TCP_SKB_CB(skb)->end_seq;
				4982	}
				4983	if (end_of_skbs \|\|
				4984	(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN \| TCPHDR_FIN)))
				4985	return;
				4986
				4987	__skb_queue_head_init(&tmp);
				4988
				4989	while (before(start, end)) {
				4990	int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
				4991	struct sk_buff *nskb;
				4992
				4993	nskb = alloc_skb(copy, GFP_ATOMIC);
				4994	if (!nskb)
				4995	break;
				4996
				4997	memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
				4998	#ifdef CONFIG_TLS_DEVICE
				4999	nskb->decrypted = skb->decrypted;
				5000	#endif
				5001	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
				5002	if (list)
				5003	__skb_queue_before(list, skb, nskb);
				5004	else
				5005	__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
				5006	skb_set_owner_r(nskb, sk);
				5007
				5008	/* Copy data, releasing collapsed skbs. */
				5009	while (copy > 0) {
				5010	int offset = start - TCP_SKB_CB(skb)->seq;
				5011	int size = TCP_SKB_CB(skb)->end_seq - start;
				5012
				5013	BUG_ON(offset < 0);
				5014	if (size > 0) {
				5015	size = min(copy, size);
				5016	if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
				5017	BUG();
				5018	TCP_SKB_CB(nskb)->end_seq += size;
				5019	copy -= size;
				5020	start += size;
				5021	}
				5022	if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
				5023	skb = tcp_collapse_one(sk, skb, list, root);
				5024	if (!skb \|\|
				5025	skb == tail \|\|
				5026	(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN \| TCPHDR_FIN)))
				5027	goto end;
				5028	#ifdef CONFIG_TLS_DEVICE
				5029	if (skb->decrypted != nskb->decrypted)
				5030	goto end;
				5031	#endif
				5032	}
				5033	}
				5034	}
				5035	end:
				5036	skb_queue_walk_safe(&tmp, skb, n)
				5037	tcp_rbtree_insert(root, skb);
				5038	}
				5039
				5040	/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
				5041	* and tcp_collapse() them until all the queue is collapsed.
				5042	*/
				5043	static void tcp_collapse_ofo_queue(struct sock *sk)
				5044	{
				5045	struct tcp_sock *tp = tcp_sk(sk);
				5046	u32 range_truesize, sum_tiny = 0;
				5047	struct sk_buff skb, head;
				5048	u32 start, end;
				5049
				5050	skb = skb_rb_first(&tp->out_of_order_queue);
				5051	new_range:
				5052	if (!skb) {
				5053	tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
				5054	return;
				5055	}
				5056	start = TCP_SKB_CB(skb)->seq;
				5057	end = TCP_SKB_CB(skb)->end_seq;
				5058	range_truesize = skb->truesize;
				5059
				5060	for (head = skb;;) {
				5061	skb = skb_rb_next(skb);
				5062
				5063	/* Range is terminated when we see a gap or when
				5064	* we are at the queue end.
				5065	*/
				5066	if (!skb \|\|
				5067	after(TCP_SKB_CB(skb)->seq, end) \|\|
				5068	before(TCP_SKB_CB(skb)->end_seq, start)) {
				5069	/* Do not attempt collapsing tiny skbs */
				5070	if (range_truesize != head->truesize \|\|
				5071	end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
				5072	tcp_collapse(sk, NULL, &tp->out_of_order_queue,
				5073	head, skb, start, end);
				5074	} else {
				5075	sum_tiny += range_truesize;
				5076	if (sum_tiny > sk->sk_rcvbuf >> 3)
				5077	return;
				5078	}
				5079	goto new_range;
				5080	}
				5081
				5082	range_truesize += skb->truesize;
				5083	if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
				5084	start = TCP_SKB_CB(skb)->seq;
				5085	if (after(TCP_SKB_CB(skb)->end_seq, end))
				5086	end = TCP_SKB_CB(skb)->end_seq;
				5087	}
				5088	}
				5089
				5090	/*
				5091	* Clean the out-of-order queue to make room.
				5092	* We drop high sequences packets to :
				5093	* 1) Let a chance for holes to be filled.
				5094	* 2) not add too big latencies if thousands of packets sit there.
				5095	* (But if application shrinks SO_RCVBUF, we could still end up
				5096	* freeing whole queue here)
				5097	* 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
				5098	*
				5099	* Return true if queue has shrunk.
				5100	*/
				5101	static bool tcp_prune_ofo_queue(struct sock *sk)
				5102	{
				5103	struct tcp_sock *tp = tcp_sk(sk);
				5104	struct rb_node node, prev;
				5105	int goal;
				5106
				5107	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
				5108	return false;
				5109
				5110	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
				5111	goal = sk->sk_rcvbuf >> 3;
				5112	node = &tp->ooo_last_skb->rbnode;
				5113	do {
				5114	prev = rb_prev(node);
				5115	rb_erase(node, &tp->out_of_order_queue);
				5116	goal -= rb_to_skb(node)->truesize;
				5117	tcp_drop(sk, rb_to_skb(node));
				5118	if (!prev \|\| goal <= 0) {
				5119	sk_mem_reclaim(sk);
				5120	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
				5121	!tcp_under_memory_pressure(sk))
				5122	break;
				5123	goal = sk->sk_rcvbuf >> 3;
				5124	}
				5125	node = prev;
				5126	} while (node);
				5127	tp->ooo_last_skb = rb_to_skb(prev);
				5128
				5129	/* Reset SACK state. A conforming SACK implementation will
				5130	* do the same at a timeout based retransmit. When a connection
				5131	* is in a sad state like this, we care only about integrity
				5132	* of the connection not performance.
				5133	*/
				5134	if (tp->rx_opt.sack_ok)
				5135	tcp_sack_reset(&tp->rx_opt);
				5136	return true;
				5137	}
				5138
				5139	/* Reduce allocated memory if we can, trying to get
				5140	* the socket within its memory limits again.
				5141	*
				5142	* Return less than zero if we should start dropping frames
				5143	* until the socket owning process reads some of the data
				5144	* to stabilize the situation.
				5145	*/
				5146	static int tcp_prune_queue(struct sock *sk)
				5147	{
				5148	struct tcp_sock *tp = tcp_sk(sk);
				5149
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5150	NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
				5151
				5152	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
				5153	tcp_clamp_window(sk);
				5154	else if (tcp_under_memory_pressure(sk))
				5155	tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
				5156
				5157	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
				5158	return 0;
				5159
				5160	tcp_collapse_ofo_queue(sk);
				5161	if (!skb_queue_empty(&sk->sk_receive_queue))
				5162	tcp_collapse(sk, &sk->sk_receive_queue, NULL,
				5163	skb_peek(&sk->sk_receive_queue),
				5164	NULL,
				5165	tp->copied_seq, tp->rcv_nxt);
				5166	sk_mem_reclaim(sk);
				5167
				5168	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
				5169	return 0;
				5170
				5171	/* Collapsing did not help, destructive actions follow.
				5172	* This must not ever occur. */
				5173
				5174	tcp_prune_ofo_queue(sk);
				5175
				5176	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
				5177	return 0;
				5178
				5179	/* If we are really being abused, tell the caller to silently
				5180	* drop receive data on the floor. It will get retransmitted
				5181	* and hopefully then we'll have sufficient space.
				5182	*/
				5183	NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
				5184
				5185	/* Massive buffer overcommit. */
				5186	tp->pred_flags = 0;
				5187	return -1;
				5188	}
				5189
				5190	static bool tcp_should_expand_sndbuf(const struct sock *sk)
				5191	{
				5192	const struct tcp_sock *tp = tcp_sk(sk);
				5193
				5194	/* If the user specified a specific send buffer setting, do
				5195	* not modify it.
				5196	*/
				5197	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
				5198	return false;
				5199
				5200	/* If we are under global TCP memory pressure, do not expand. */
				5201	if (tcp_under_memory_pressure(sk))
				5202	return false;
				5203
				5204	/* If we are under soft global TCP memory pressure, do not expand. */
				5205	if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
				5206	return false;
				5207
				5208	/* If we filled the congestion window, do not expand. */
				5209	if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
				5210	return false;
				5211
				5212	return true;
				5213	}
				5214
				5215	/* When incoming ACK allowed to free some skb from write_queue,
				5216	* we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
				5217	* on the exit from tcp input handler.
				5218	*
				5219	* PROBLEM: sndbuf expansion does not work well with largesend.
				5220	*/
				5221	static void tcp_new_space(struct sock *sk)
				5222	{
				5223	struct tcp_sock *tp = tcp_sk(sk);
				5224
				5225	if (tcp_should_expand_sndbuf(sk)) {
				5226	tcp_sndbuf_expand(sk);
				5227	tp->snd_cwnd_stamp = tcp_jiffies32;
				5228	}
				5229
				5230	sk->sk_write_space(sk);
				5231	}
				5232
				5233	static void tcp_check_space(struct sock *sk)
				5234	{
				5235	if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
				5236	sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
				5237	/* pairs with tcp_poll() */
				5238	smp_mb();
				5239	if (sk->sk_socket &&
				5240	test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
				5241	tcp_new_space(sk);
				5242	if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
				5243	tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
				5244	}
				5245	}
				5246	}
				5247
				5248	static inline void tcp_data_snd_check(struct sock *sk)
				5249	{
				5250	tcp_push_pending_frames(sk);
				5251	tcp_check_space(sk);
				5252	}
				5253
				5254	/*
				5255	* Check if sending an ack is needed.
				5256	*/
				5257	static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
				5258	{
				5259	struct tcp_sock *tp = tcp_sk(sk);
				5260	unsigned long rtt, delay;
				5261
				5262	/* More than one full frame received... */
				5263	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
				5264	/* ... and right edge of window advances far enough.
				5265	* (tcp_recvmsg() will send ACK otherwise).
				5266	* If application uses SO_RCVLOWAT, we want send ack now if
				5267	* we have not received enough bytes to satisfy the condition.
				5268	*/
				5269	(tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat \|\|
				5270	__tcp_select_window(sk) >= tp->rcv_wnd)) \|\|
				5271	/* We ACK each frame or... */
				5272	tcp_in_quickack_mode(sk) \|\|
				5273	/* Protocol state mandates a one-time immediate ACK */
				5274	inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
				5275	send_now:
				5276	tcp_send_ack(sk);
				5277	return;
				5278	}
				5279
				5280	if (!ofo_possible \|\| RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
				5281	tcp_send_delayed_ack(sk);
				5282	return;
				5283	}
				5284
				5285	if (!tcp_is_sack(tp) \|\|
				5286	tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
				5287	goto send_now;
				5288
				5289	if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
				5290	tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
				5291	if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
				5292	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
				5293	tp->compressed_ack - TCP_FASTRETRANS_THRESH);
				5294	tp->compressed_ack = 0;
				5295	}
				5296
				5297	if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH)
				5298	goto send_now;
				5299
				5300	if (hrtimer_is_queued(&tp->compressed_ack_timer))
				5301	return;
				5302
				5303	/* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */
				5304
				5305	rtt = tp->rcv_rtt_est.rtt_us;
				5306	if (tp->srtt_us && tp->srtt_us < rtt)
				5307	rtt = tp->srtt_us;
				5308
				5309	delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
				5310	rtt * (NSEC_PER_USEC >> 3)/20);
				5311	sock_hold(sk);
				5312	hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
				5313	HRTIMER_MODE_REL_PINNED_SOFT);
				5314	}
				5315
				5316	static inline void tcp_ack_snd_check(struct sock *sk)
				5317	{
				5318	if (!inet_csk_ack_scheduled(sk)) {
				5319	/* We sent a data segment already. */
				5320	return;
				5321	}
				5322	__tcp_ack_snd_check(sk, 1);
				5323	}
				5324
				5325	/*
				5326	* This routine is only called when we have urgent data
				5327	* signaled. Its the 'slow' part of tcp_urg. It could be
				5328	* moved inline now as tcp_urg is only called from one
				5329	* place. We handle URGent data wrong. We have to - as
				5330	* BSD still doesn't use the correction from RFC961.
				5331	* For 1003.1g we should support a new option TCP_STDURG to permit
				5332	* either form (or just set the sysctl tcp_stdurg).
				5333	*/
				5334
				5335	static void tcp_check_urg(struct sock sk, const struct tcphdr th)
				5336	{
				5337	struct tcp_sock *tp = tcp_sk(sk);
				5338	u32 ptr = ntohs(th->urg_ptr);
				5339
				5340	if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg)
				5341	ptr--;
				5342	ptr += ntohl(th->seq);
				5343
				5344	/* Ignore urgent data that we've already seen and read. */
				5345	if (after(tp->copied_seq, ptr))
				5346	return;
				5347
				5348	/* Do not replay urg ptr.
				5349	*
				5350	* NOTE: interesting situation not covered by specs.
				5351	* Misbehaving sender may send urg ptr, pointing to segment,
				5352	* which we already have in ofo queue. We are not able to fetch
				5353	* such data and will stay in TCP_URG_NOTYET until will be eaten
				5354	* by recvmsg(). Seems, we are not obliged to handle such wicked
				5355	* situations. But it is worth to think about possibility of some
				5356	* DoSes using some hypothetical application level deadlock.
				5357	*/
				5358	if (before(ptr, tp->rcv_nxt))
				5359	return;
				5360
				5361	/* Do we already have a newer (or duplicate) urgent pointer? */
				5362	if (tp->urg_data && !after(ptr, tp->urg_seq))
				5363	return;
				5364
				5365	/* Tell the world about our new urgent pointer. */
				5366	sk_send_sigurg(sk);
				5367
				5368	/* We may be adding urgent data when the last byte read was
				5369	* urgent. To do this requires some care. We cannot just ignore
				5370	* tp->copied_seq since we would read the last urgent byte again
				5371	* as data, nor can we alter copied_seq until this data arrives
				5372	* or we break the semantics of SIOCATMARK (and thus sockatmark())
				5373	*
				5374	* NOTE. Double Dutch. Rendering to plain English: author of comment
				5375	* above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
				5376	* and expect that both A and B disappear from stream. This is _wrong_.
				5377	* Though this happens in BSD with high probability, this is occasional.
				5378	* Any application relying on this is buggy. Note also, that fix "works"
				5379	* only in this artificial test. Insert some normal data between A and B and we will
				5380	* decline of BSD again. Verdict: it is better to remove to trap
				5381	* buggy users.
				5382	*/
				5383	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
				5384	!sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
				5385	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
				5386	tp->copied_seq++;
				5387	if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
				5388	__skb_unlink(skb, &sk->sk_receive_queue);
				5389	__kfree_skb(skb);
				5390	}
				5391	}
				5392
				5393	tp->urg_data = TCP_URG_NOTYET;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5394	WRITE_ONCE(tp->urg_seq, ptr);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5395
				5396	/* Disable header prediction. */
				5397	tp->pred_flags = 0;
				5398	}
				5399
				5400	/* This is the 'fast' part of urgent handling. */
				5401	static void tcp_urg(struct sock sk, struct sk_buff skb, const struct tcphdr *th)
				5402	{
				5403	struct tcp_sock *tp = tcp_sk(sk);
				5404
				5405	/* Check if we get a new urgent pointer - normally not. */
				5406	if (th->urg)
				5407	tcp_check_urg(sk, th);
				5408
				5409	/* Do we wait for any urgent data? - normally not... */
				5410	if (tp->urg_data == TCP_URG_NOTYET) {
				5411	u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
				5412	th->syn;
				5413
				5414	/* Is the urgent pointer pointing into this packet? */
				5415	if (ptr < skb->len) {
				5416	u8 tmp;
				5417	if (skb_copy_bits(skb, ptr, &tmp, 1))
				5418	BUG();
				5419	tp->urg_data = TCP_URG_VALID \| tmp;
				5420	if (!sock_flag(sk, SOCK_DEAD))
				5421	sk->sk_data_ready(sk);
				5422	}
				5423	}
				5424	}
				5425
				5426	/* Accept RST for rcv_nxt - 1 after a FIN.
				5427	* When tcp connections are abruptly terminated from Mac OSX (via ^C), a
				5428	* FIN is sent followed by a RST packet. The RST is sent with the same
				5429	* sequence number as the FIN, and thus according to RFC 5961 a challenge
				5430	* ACK should be sent. However, Mac OSX rate limits replies to challenge
				5431	* ACKs on the closed socket. In addition middleboxes can drop either the
				5432	* challenge ACK or a subsequent RST.
				5433	*/
				5434	static bool tcp_reset_check(const struct sock sk, const struct sk_buff skb)
				5435	{
				5436	struct tcp_sock *tp = tcp_sk(sk);
				5437
				5438	return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
				5439	(1 << sk->sk_state) & (TCPF_CLOSE_WAIT \| TCPF_LAST_ACK \|
				5440	TCPF_CLOSING));
				5441	}
				5442
				5443	/* Does PAWS and seqno based validation of an incoming segment, flags will
				5444	* play significant role here.
				5445	*/
				5446	static bool tcp_validate_incoming(struct sock sk, struct sk_buff skb,
				5447	const struct tcphdr *th, int syn_inerr)
				5448	{
				5449	struct tcp_sock *tp = tcp_sk(sk);
				5450	bool rst_seq_match = false;
				5451
				5452	/* RFC1323: H1. Apply PAWS check first. */
				5453	if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
				5454	tp->rx_opt.saw_tstamp &&
				5455	tcp_paws_discard(sk, skb)) {
				5456	if (!th->rst) {
				5457	NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
				5458	if (!tcp_oow_rate_limited(sock_net(sk), skb,
				5459	LINUX_MIB_TCPACKSKIPPEDPAWS,
				5460	&tp->last_oow_ack_time))
				5461	tcp_send_dupack(sk, skb);
				5462	goto discard;
				5463	}
				5464	/* Reset is accepted even if it did not pass PAWS. */
				5465	}
				5466
				5467	/* Step 1: check sequence number */
				5468	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
				5469	/* RFC793, page 37: "In all states except SYN-SENT, all reset
				5470	* (RST) segments are validated by checking their SEQ-fields."
				5471	* And page 69: "If an incoming segment is not acceptable,
				5472	* an acknowledgment should be sent in reply (unless the RST
				5473	* bit is set, if so drop the segment and return)".
				5474	*/
				5475	if (!th->rst) {
				5476	if (th->syn)
				5477	goto syn_challenge;
				5478	if (!tcp_oow_rate_limited(sock_net(sk), skb,
				5479	LINUX_MIB_TCPACKSKIPPEDSEQ,
				5480	&tp->last_oow_ack_time))
				5481	tcp_send_dupack(sk, skb);
				5482	} else if (tcp_reset_check(sk, skb)) {
				5483	tcp_reset(sk);
				5484	}
				5485	goto discard;
				5486	}
				5487
				5488	/* Step 2: check RST bit */
				5489	if (th->rst) {
				5490	/* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
				5491	* FIN and SACK too if available):
				5492	* If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
				5493	* the right-most SACK block,
				5494	* then
				5495	* RESET the connection
				5496	* else
				5497	* Send a challenge ACK
				5498	*/
				5499	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt \|\|
				5500	tcp_reset_check(sk, skb)) {
				5501	rst_seq_match = true;
				5502	} else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
				5503	struct tcp_sack_block *sp = &tp->selective_acks[0];
				5504	int max_sack = sp[0].end_seq;
				5505	int this_sack;
				5506
				5507	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;
				5508	++this_sack) {
				5509	max_sack = after(sp[this_sack].end_seq,
				5510	max_sack) ?
				5511	sp[this_sack].end_seq : max_sack;
				5512	}
				5513
				5514	if (TCP_SKB_CB(skb)->seq == max_sack)
				5515	rst_seq_match = true;
				5516	}
				5517
				5518	if (rst_seq_match)
				5519	tcp_reset(sk);
				5520	else {
				5521	/* Disable TFO if RST is out-of-order
				5522	* and no data has been received
				5523	* for current active TFO socket
				5524	*/
				5525	if (tp->syn_fastopen && !tp->data_segs_in &&
				5526	sk->sk_state == TCP_ESTABLISHED)
				5527	tcp_fastopen_active_disable(sk);
				5528	tcp_send_challenge_ack(sk, skb);
				5529	}
				5530	goto discard;
				5531	}
				5532
				5533	/* step 3: check security and precedence [ignored] */
				5534
				5535	/* step 4: Check for a SYN
				5536	* RFC 5961 4.2 : Send a challenge ack
				5537	*/
				5538	if (th->syn) {
				5539	syn_challenge:
				5540	if (syn_inerr)
				5541	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
				5542	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
				5543	tcp_send_challenge_ack(sk, skb);
				5544	goto discard;
				5545	}
				5546
				5547	return true;
				5548
				5549	discard:
				5550	tcp_drop(sk, skb);
				5551	return false;
				5552	}
				5553
				5554	/*
				5555	* TCP receive function for the ESTABLISHED state.
				5556	*
				5557	* It is split into a fast path and a slow path. The fast path is
				5558	* disabled when:
				5559	* - A zero window was announced from us - zero window probing
				5560	* is only handled properly in the slow path.
				5561	* - Out of order segments arrived.
				5562	* - Urgent data is expected.
				5563	* - There is no buffer space left
				5564	* - Unexpected TCP flags/window values/header lengths are received
				5565	* (detected by checking the TCP header against pred_flags)
				5566	* - Data is sent in both directions. Fast path only supports pure senders
				5567	* or pure receivers (this means either the sequence number or the ack
				5568	* value must stay constant)
				5569	* - Unexpected TCP option.
				5570	*
				5571	* When these conditions are not satisfied it drops into a standard
				5572	* receive procedure patterned after RFC793 to handle all cases.
				5573	* The first three cases are guaranteed by proper pred_flags setting,
				5574	* the rest is checked inline. Fast processing is turned on in
				5575	* tcp_data_queue when everything is OK.
				5576	*/
				5577	void tcp_rcv_established(struct sock sk, struct sk_buff skb)
				5578	{
				5579	const struct tcphdr th = (const struct tcphdr )skb->data;
				5580	struct tcp_sock *tp = tcp_sk(sk);
				5581	unsigned int len = skb->len;
				5582
				5583	/* TCP congestion window tracking */
				5584	trace_tcp_probe(sk, skb);
				5585
				5586	tcp_mstamp_refresh(tp);
				5587	if (unlikely(!sk->sk_rx_dst))
				5588	inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
				5589	/*
				5590	* Header prediction.
				5591	* The code loosely follows the one in the famous
				5592	* "30 instruction TCP receive" Van Jacobson mail.
				5593	*
				5594	* Van's trick is to deposit buffers into socket queue
				5595	* on a device interrupt, to call tcp_recv function
				5596	* on the receive process context and checksum and copy
				5597	* the buffer to user space. smart...
				5598	*
				5599	* Our current scheme is not silly either but we take the
				5600	* extra cost of the net_bh soft interrupt processing...
				5601	* We do checksum and copy also but from device to kernel.
				5602	*/
				5603
				5604	tp->rx_opt.saw_tstamp = 0;
				5605
				5606	/* pred_flags is 0xS?10 << 16 + snd_wnd
				5607	* if header_prediction is to be made
				5608	* 'S' will always be tp->tcp_header_len >> 2
				5609	* '?' will be 0 for the fast path, otherwise pred_flags is 0 to
				5610	* turn it off (when there are holes in the receive
				5611	* space for instance)
				5612	* PSH flag is ignored.
				5613	*/
				5614
				5615	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
				5616	TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
				5617	!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
				5618	int tcp_header_len = tp->tcp_header_len;
				5619
				5620	/* Timestamp header prediction: tcp_header_len
				5621	* is automatically equal to th->doff*4 due to pred_flags
				5622	* match.
				5623	*/
				5624
				5625	/* Check timestamp */
				5626	if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
				5627	/* No? Slow path! */
				5628	if (!tcp_parse_aligned_timestamp(tp, th))
				5629	goto slow_path;
				5630
				5631	/* If PAWS failed, check it more carefully in slow path */
				5632	if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
				5633	goto slow_path;
				5634
				5635	/* DO NOT update ts_recent here, if checksum fails
				5636	* and timestamp was corrupted part, it will result
				5637	* in a hung connection since we will drop all
				5638	* future packets due to the PAWS test.
				5639	*/
				5640	}
				5641
				5642	if (len <= tcp_header_len) {
				5643	/* Bulk data transfer: sender */
				5644	if (len == tcp_header_len) {
				5645	/* Predicted packet is in window by definition.
				5646	* seq == rcv_nxt and rcv_wup <= rcv_nxt.
				5647	* Hence, check seq<=rcv_wup reduces to:
				5648	*/
				5649	if (tcp_header_len ==
				5650	(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
				5651	tp->rcv_nxt == tp->rcv_wup)
				5652	tcp_store_ts_recent(tp);
				5653
				5654	/* We know that such packets are checksummed
				5655	* on entry.
				5656	*/
				5657	tcp_ack(sk, skb, 0);
				5658	__kfree_skb(skb);
				5659	tcp_data_snd_check(sk);
				5660	/* When receiving pure ack in fast path, update
				5661	* last ts ecr directly instead of calling
				5662	* tcp_rcv_rtt_measure_ts()
				5663	*/
				5664	tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
				5665	return;
				5666	} else { /* Header too small */
				5667	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
				5668	goto discard;
				5669	}
				5670	} else {
				5671	int eaten = 0;
				5672	bool fragstolen = false;
				5673
				5674	if (tcp_checksum_complete(skb))
				5675	goto csum_error;
				5676
				5677	if ((int)skb->truesize > sk->sk_forward_alloc)
				5678	goto step5;
				5679
				5680	/* Predicted packet is in window by definition.
				5681	* seq == rcv_nxt and rcv_wup <= rcv_nxt.
				5682	* Hence, check seq<=rcv_wup reduces to:
				5683	*/
				5684	if (tcp_header_len ==
				5685	(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
				5686	tp->rcv_nxt == tp->rcv_wup)
				5687	tcp_store_ts_recent(tp);
				5688
				5689	tcp_rcv_rtt_measure_ts(sk, skb);
				5690
				5691	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
				5692
				5693	/* Bulk data transfer: receiver */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5694	__skb_pull(skb, tcp_header_len);
				5695	eaten = tcp_queue_rcv(sk, skb, &fragstolen);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5696
				5697	tcp_event_data_recv(sk, skb);
				5698
				5699	if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
				5700	/* Well, only one small jumplet in fast path... */
				5701	tcp_ack(sk, skb, FLAG_DATA);
				5702	tcp_data_snd_check(sk);
				5703	if (!inet_csk_ack_scheduled(sk))
				5704	goto no_ack;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5705	} else {
				5706	tcp_update_wl(tp, TCP_SKB_CB(skb)->seq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5707	}
				5708
				5709	__tcp_ack_snd_check(sk, 0);
				5710	no_ack:
				5711	if (eaten)
				5712	kfree_skb_partial(skb, fragstolen);
				5713	tcp_data_ready(sk);
				5714	return;
				5715	}
				5716	}
				5717
				5718	slow_path:
				5719	if (len < (th->doff << 2) \|\| tcp_checksum_complete(skb))
				5720	goto csum_error;
				5721
				5722	if (!th->ack && !th->rst && !th->syn)
				5723	goto discard;
				5724
				5725	/*
				5726	* Standard slow path.
				5727	*/
				5728
				5729	if (!tcp_validate_incoming(sk, skb, th, 1))
				5730	return;
				5731
				5732	step5:
				5733	if (tcp_ack(sk, skb, FLAG_SLOWPATH \| FLAG_UPDATE_TS_RECENT) < 0)
				5734	goto discard;
				5735
				5736	tcp_rcv_rtt_measure_ts(sk, skb);
				5737
				5738	/* Process urgent data. */
				5739	tcp_urg(sk, skb, th);
				5740
				5741	/* step 7: process the segment text */
				5742	tcp_data_queue(sk, skb);
				5743
				5744	tcp_data_snd_check(sk);
				5745	tcp_ack_snd_check(sk);
				5746	return;
				5747
				5748	csum_error:
				5749	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
				5750	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
				5751
				5752	discard:
				5753	tcp_drop(sk, skb);
				5754	}
				5755	EXPORT_SYMBOL(tcp_rcv_established);
				5756
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5757	void tcp_init_transfer(struct sock *sk, int bpf_op)
				5758	{
				5759	struct inet_connection_sock *icsk = inet_csk(sk);
				5760	struct tcp_sock *tp = tcp_sk(sk);
				5761
				5762	tcp_mtup_init(sk);
				5763	icsk->icsk_af_ops->rebuild_header(sk);
				5764	tcp_init_metrics(sk);
				5765
				5766	/* Initialize the congestion window to start the transfer.
				5767	* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
				5768	* retransmitted. In light of RFC6298 more aggressive 1sec
				5769	* initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
				5770	* retransmission has occurred.
				5771	*/
				5772	if (tp->total_retrans > 1 && tp->undo_marker)
				5773	tp->snd_cwnd = 1;
				5774	else
				5775	tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
				5776	tp->snd_cwnd_stamp = tcp_jiffies32;
				5777
				5778	tcp_call_bpf(sk, bpf_op, 0, NULL);
				5779	tcp_init_congestion_control(sk);
				5780	tcp_init_buffer_space(sk);
				5781	}
				5782
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5783	void tcp_finish_connect(struct sock sk, struct sk_buff skb)
				5784	{
				5785	struct tcp_sock *tp = tcp_sk(sk);
				5786	struct inet_connection_sock *icsk = inet_csk(sk);
				5787
				5788	tcp_set_state(sk, TCP_ESTABLISHED);
				5789	icsk->icsk_ack.lrcvtime = tcp_jiffies32;
				5790
				5791	if (skb) {
				5792	icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
				5793	security_inet_conn_established(sk, skb);
				5794	sk_mark_napi_id(sk, skb);
				5795	}
				5796
				5797	tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
				5798
				5799	/* Prevent spurious tcp_cwnd_restart() on first data
				5800	* packet.
				5801	*/
				5802	tp->lsndtime = tcp_jiffies32;
				5803
				5804	if (sock_flag(sk, SOCK_KEEPOPEN))
				5805	inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
				5806
				5807	if (!tp->rx_opt.snd_wscale)
				5808	__tcp_fast_path_on(tp, tp->snd_wnd);
				5809	else
				5810	tp->pred_flags = 0;
				5811	}
				5812
				5813	static bool tcp_rcv_fastopen_synack(struct sock sk, struct sk_buff synack,
				5814	struct tcp_fastopen_cookie *cookie)
				5815	{
				5816	struct tcp_sock *tp = tcp_sk(sk);
				5817	struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
				5818	u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
				5819	bool syn_drop = false;
				5820
				5821	if (mss == tp->rx_opt.user_mss) {
				5822	struct tcp_options_received opt;
				5823
				5824	/* Get original SYNACK MSS value if user MSS sets mss_clamp */
				5825	tcp_clear_options(&opt);
				5826	opt.user_mss = opt.mss_clamp = 0;
				5827	tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
				5828	mss = opt.mss_clamp;
				5829	}
				5830
				5831	if (!tp->syn_fastopen) {
				5832	/* Ignore an unsolicited cookie */
				5833	cookie->len = -1;
				5834	} else if (tp->total_retrans) {
				5835	/* SYN timed out and the SYN-ACK neither has a cookie nor
				5836	* acknowledges data. Presumably the remote received only
				5837	* the retransmitted (regular) SYNs: either the original
				5838	* SYN-data or the corresponding SYN-ACK was dropped.
				5839	*/
				5840	syn_drop = (cookie->len < 0 && data);
				5841	} else if (cookie->len < 0 && !tp->syn_data) {
				5842	/* We requested a cookie but didn't get it. If we did not use
				5843	* the (old) exp opt format then try so next time (try_exp=1).
				5844	* Otherwise we go back to use the RFC7413 opt (try_exp=2).
				5845	*/
				5846	try_exp = tp->syn_fastopen_exp ? 2 : 1;
				5847	}
				5848
				5849	tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
				5850
				5851	if (data) { /* Retransmit unacked data in SYN */
				5852	skb_rbtree_walk_from(data) {
				5853	if (__tcp_retransmit_skb(sk, data, 1))
				5854	break;
				5855	}
				5856	tcp_rearm_rto(sk);
				5857	NET_INC_STATS(sock_net(sk),
				5858	LINUX_MIB_TCPFASTOPENACTIVEFAIL);
				5859	return true;
				5860	}
				5861	tp->syn_data_acked = tp->syn_data;
				5862	if (tp->syn_data_acked) {
				5863	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
				5864	/* SYN-data is counted as two separate packets in tcp_ack() */
				5865	if (tp->delivered > 1)
				5866	--tp->delivered;
				5867	}
				5868
				5869	tcp_fastopen_add_skb(sk, synack);
				5870
				5871	return false;
				5872	}
				5873
				5874	static void smc_check_reset_syn(struct tcp_sock *tp)
				5875	{
				5876	#if IS_ENABLED(CONFIG_SMC)
				5877	if (static_branch_unlikely(&tcp_have_smc)) {
				5878	if (tp->syn_smc && !tp->rx_opt.smc_ok)
				5879	tp->syn_smc = 0;
				5880	}
				5881	#endif
				5882	}
				5883
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5884	static void tcp_try_undo_spurious_syn(struct sock *sk)
				5885	{
				5886	struct tcp_sock *tp = tcp_sk(sk);
				5887	u32 syn_stamp;
				5888
				5889	/* undo_marker is set when SYN or SYNACK times out. The timeout is
				5890	* spurious if the ACK's timestamp option echo value matches the
				5891	* original SYN timestamp.
				5892	*/
				5893	syn_stamp = tp->retrans_stamp;
				5894	if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp &&
				5895	syn_stamp == tp->rx_opt.rcv_tsecr)
				5896	tp->undo_marker = 0;
				5897	}
				5898
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5899	static int tcp_rcv_synsent_state_process(struct sock sk, struct sk_buff skb,
				5900	const struct tcphdr *th)
				5901	{
				5902	struct inet_connection_sock *icsk = inet_csk(sk);
				5903	struct tcp_sock *tp = tcp_sk(sk);
				5904	struct tcp_fastopen_cookie foc = { .len = -1 };
				5905	int saved_clamp = tp->rx_opt.mss_clamp;
				5906	bool fastopen_fail;
				5907
				5908	tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
				5909	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
				5910	tp->rx_opt.rcv_tsecr -= tp->tsoffset;
				5911
				5912	if (th->ack) {
				5913	/* rfc793:
				5914	* "If the state is SYN-SENT then
				5915	* first check the ACK bit
				5916	* If the ACK bit is set
				5917	* If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
				5918	* a reset (unless the RST bit is set, if so drop
				5919	* the segment and return)"
				5920	*/
				5921	if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) \|\|
				5922	after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
				5923	goto reset_and_undo;
				5924
				5925	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
				5926	!between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
				5927	tcp_time_stamp(tp))) {
				5928	NET_INC_STATS(sock_net(sk),
				5929	LINUX_MIB_PAWSACTIVEREJECTED);
				5930	goto reset_and_undo;
				5931	}
				5932
				5933	/* Now ACK is acceptable.
				5934	*
				5935	* "If the RST bit is set
				5936	* If the ACK was acceptable then signal the user "error:
				5937	* connection reset", drop the segment, enter CLOSED state,
				5938	* delete TCB, and return."
				5939	*/
				5940
				5941	if (th->rst) {
				5942	tcp_reset(sk);
				5943	goto discard;
				5944	}
				5945
				5946	/* rfc793:
				5947	* "fifth, if neither of the SYN or RST bits is set then
				5948	* drop the segment and return."
				5949	*
				5950	* See note below!
				5951	* --ANK(990513)
				5952	*/
				5953	if (!th->syn)
				5954	goto discard_and_undo;
				5955
				5956	/* rfc793:
				5957	* "If the SYN bit is on ...
				5958	* are acceptable then ...
				5959	* (our SYN has been ACKed), change the connection
				5960	* state to ESTABLISHED..."
				5961	*/
				5962
				5963	tcp_ecn_rcv_synack(tp, th);
				5964
				5965	tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5966	tcp_try_undo_spurious_syn(sk);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5967	tcp_ack(sk, skb, FLAG_SLOWPATH);
				5968
				5969	/* Ok.. it's good. Set up sequence numbers and
				5970	* move to established.
				5971	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5972	WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5973	tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
				5974
				5975	/* RFC1323: The window in SYN & SYN/ACK segments is
				5976	* never scaled.
				5977	*/
				5978	tp->snd_wnd = ntohs(th->window);
				5979
				5980	if (!tp->rx_opt.wscale_ok) {
				5981	tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
				5982	tp->window_clamp = min(tp->window_clamp, 65535U);
				5983	}
				5984
				5985	if (tp->rx_opt.saw_tstamp) {
				5986	tp->rx_opt.tstamp_ok = 1;
				5987	tp->tcp_header_len =
				5988	sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
				5989	tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
				5990	tcp_store_ts_recent(tp);
				5991	} else {
				5992	tp->tcp_header_len = sizeof(struct tcphdr);
				5993	}
				5994
				5995	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
				5996	tcp_initialize_rcv_mss(sk);
				5997
				5998	/* Remember, tcp_poll() does not lock socket!
				5999	* Change state from SYN-SENT only after copied_seq
				6000	* is initialized. */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6001	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6002
				6003	smc_check_reset_syn(tp);
				6004
				6005	smp_mb();
				6006
				6007	tcp_finish_connect(sk, skb);
				6008
				6009	fastopen_fail = (tp->syn_fastopen \|\| tp->syn_data) &&
				6010	tcp_rcv_fastopen_synack(sk, skb, &foc);
				6011
				6012	if (!sock_flag(sk, SOCK_DEAD)) {
				6013	sk->sk_state_change(sk);
				6014	sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
				6015	}
				6016	if (fastopen_fail)
				6017	return -1;
				6018	if (sk->sk_write_pending \|\|
				6019	icsk->icsk_accept_queue.rskq_defer_accept \|\|
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6020	inet_csk_in_pingpong_mode(sk)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6021	/* Save one ACK. Data will be ready after
				6022	* several ticks, if write_pending is set.
				6023	*
				6024	* It may be deleted, but with this feature tcpdumps
				6025	* look so _wonderfully_ clever, that I was not able
				6026	* to stand against the temptation 8) --ANK
				6027	*/
				6028	inet_csk_schedule_ack(sk);
				6029	tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
				6030	inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
				6031	TCP_DELACK_MAX, TCP_RTO_MAX);
				6032
				6033	discard:
				6034	tcp_drop(sk, skb);
				6035	return 0;
				6036	} else {
				6037	tcp_send_ack(sk);
				6038	}
				6039	return -1;
				6040	}
				6041
				6042	/* No ACK in the segment */
				6043
				6044	if (th->rst) {
				6045	/* rfc793:
				6046	* "If the RST bit is set
				6047	*
				6048	* Otherwise (no ACK) drop the segment and return."
				6049	*/
				6050
				6051	goto discard_and_undo;
				6052	}
				6053
				6054	/* PAWS check. */
				6055	if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
				6056	tcp_paws_reject(&tp->rx_opt, 0))
				6057	goto discard_and_undo;
				6058
				6059	if (th->syn) {
				6060	/* We see SYN without ACK. It is attempt of
				6061	* simultaneous connect with crossed SYNs.
				6062	* Particularly, it can be connect to self.
				6063	*/
				6064	tcp_set_state(sk, TCP_SYN_RECV);
				6065
				6066	if (tp->rx_opt.saw_tstamp) {
				6067	tp->rx_opt.tstamp_ok = 1;
				6068	tcp_store_ts_recent(tp);
				6069	tp->tcp_header_len =
				6070	sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
				6071	} else {
				6072	tp->tcp_header_len = sizeof(struct tcphdr);
				6073	}
				6074
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6075	WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
				6076	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6077	tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
				6078
				6079	/* RFC1323: The window in SYN & SYN/ACK segments is
				6080	* never scaled.
				6081	*/
				6082	tp->snd_wnd = ntohs(th->window);
				6083	tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
				6084	tp->max_window = tp->snd_wnd;
				6085
				6086	tcp_ecn_rcv_syn(tp, th);
				6087
				6088	tcp_mtup_init(sk);
				6089	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
				6090	tcp_initialize_rcv_mss(sk);
				6091
				6092	tcp_send_synack(sk);
				6093	#if 0
				6094	/* Note, we could accept data and URG from this segment.
				6095	* There are no obstacles to make this (except that we must
				6096	* either change tcp_recvmsg() to prevent it from returning data
				6097	* before 3WHS completes per RFC793, or employ TCP Fast Open).
				6098	*
				6099	* However, if we ignore data in ACKless segments sometimes,
				6100	* we have no reasons to accept it sometimes.
				6101	* Also, seems the code doing it in step6 of tcp_rcv_state_process
				6102	* is not flawless. So, discard packet for sanity.
				6103	* Uncomment this return to process the data.
				6104	*/
				6105	return -1;
				6106	#else
				6107	goto discard;
				6108	#endif
				6109	}
				6110	/* "fifth, if neither of the SYN or RST bits is set then
				6111	* drop the segment and return."
				6112	*/
				6113
				6114	discard_and_undo:
				6115	tcp_clear_options(&tp->rx_opt);
				6116	tp->rx_opt.mss_clamp = saved_clamp;
				6117	goto discard;
				6118
				6119	reset_and_undo:
				6120	tcp_clear_options(&tp->rx_opt);
				6121	tp->rx_opt.mss_clamp = saved_clamp;
				6122	return 1;
				6123	}
				6124
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6125	static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
				6126	{
				6127	struct request_sock *req;
				6128
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6129	/* If we are still handling the SYNACK RTO, see if timestamp ECR allows
				6130	* undo. If peer SACKs triggered fast recovery, we can't undo here.
				6131	*/
				6132	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
				6133	tcp_try_undo_loss(sk, false);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6134
				6135	/* Reset rtx states to prevent spurious retransmits_timed_out() */
				6136	tcp_sk(sk)->retrans_stamp = 0;
				6137	inet_csk(sk)->icsk_retransmits = 0;
				6138
				6139	/* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
				6140	* we no longer need req so release it.
				6141	*/
				6142	req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
				6143	lockdep_sock_is_held(sk));
				6144	reqsk_fastopen_remove(sk, req, false);
				6145
				6146	/* Re-arm the timer because data may have been sent out.
				6147	* This is similar to the regular data transmission case
				6148	* when new data has just been ack'ed.
				6149	*
				6150	* (TFO) - we could try to be more aggressive and
				6151	* retransmitting any data sooner based on when they
				6152	* are sent out.
				6153	*/
				6154	tcp_rearm_rto(sk);
				6155	}
				6156
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6157	/*
				6158	* This function implements the receiving procedure of RFC 793 for
				6159	* all states except ESTABLISHED and TIME_WAIT.
				6160	* It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
				6161	* address independent.
				6162	*/
				6163
				6164	int tcp_rcv_state_process(struct sock sk, struct sk_buff skb)
				6165	{
				6166	struct tcp_sock *tp = tcp_sk(sk);
				6167	struct inet_connection_sock *icsk = inet_csk(sk);
				6168	const struct tcphdr *th = tcp_hdr(skb);
				6169	struct request_sock *req;
				6170	int queued = 0;
				6171	bool acceptable;
				6172
				6173	switch (sk->sk_state) {
				6174	case TCP_CLOSE:
				6175	goto discard;
				6176
				6177	case TCP_LISTEN:
				6178	if (th->ack)
				6179	return 1;
				6180
				6181	if (th->rst)
				6182	goto discard;
				6183
				6184	if (th->syn) {
				6185	if (th->fin)
				6186	goto discard;
				6187	/* It is possible that we process SYN packets from backlog,
				6188	* so we need to make sure to disable BH and RCU right there.
				6189	*/
				6190	rcu_read_lock();
				6191	local_bh_disable();
				6192	acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
				6193	local_bh_enable();
				6194	rcu_read_unlock();
				6195
				6196	if (!acceptable)
				6197	return 1;
				6198	consume_skb(skb);
				6199	return 0;
				6200	}
				6201	goto discard;
				6202
				6203	case TCP_SYN_SENT:
				6204	tp->rx_opt.saw_tstamp = 0;
				6205	tcp_mstamp_refresh(tp);
				6206	queued = tcp_rcv_synsent_state_process(sk, skb, th);
				6207	if (queued >= 0)
				6208	return queued;
				6209
				6210	/* Do step6 onward by hand. */
				6211	tcp_urg(sk, skb, th);
				6212	__kfree_skb(skb);
				6213	tcp_data_snd_check(sk);
				6214	return 0;
				6215	}
				6216
				6217	tcp_mstamp_refresh(tp);
				6218	tp->rx_opt.saw_tstamp = 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6219	req = rcu_dereference_protected(tp->fastopen_rsk,
				6220	lockdep_sock_is_held(sk));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6221	if (req) {
				6222	bool req_stolen;
				6223
				6224	WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
				6225	sk->sk_state != TCP_FIN_WAIT1);
				6226
				6227	if (!tcp_check_req(sk, skb, req, true, &req_stolen))
				6228	goto discard;
				6229	}
				6230
				6231	if (!th->ack && !th->rst && !th->syn)
				6232	goto discard;
				6233
				6234	if (!tcp_validate_incoming(sk, skb, th, 0))
				6235	return 0;
				6236
				6237	/* step 5: check the ACK field */
				6238	acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH \|
				6239	FLAG_UPDATE_TS_RECENT \|
				6240	FLAG_NO_CHALLENGE_ACK) > 0;
				6241
				6242	if (!acceptable) {
				6243	if (sk->sk_state == TCP_SYN_RECV)
				6244	return 1; /* send one RST */
				6245	tcp_send_challenge_ack(sk, skb);
				6246	goto discard;
				6247	}
				6248	switch (sk->sk_state) {
				6249	case TCP_SYN_RECV:
				6250	tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
				6251	if (!tp->srtt_us)
				6252	tcp_synack_rtt_meas(sk, req);
				6253
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6254	if (req) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6255	tcp_rcv_synrecv_state_fastopen(sk);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6256	} else {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6257	tcp_try_undo_spurious_syn(sk);
				6258	tp->retrans_stamp = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6259	tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6260	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6261	}
				6262	smp_mb();
				6263	tcp_set_state(sk, TCP_ESTABLISHED);
				6264	sk->sk_state_change(sk);
				6265
				6266	/* Note, that this wakeup is only for marginal crossed SYN case.
				6267	* Passively open sockets are not waked up, because
				6268	* sk->sk_sleep == NULL and sk->sk_socket == NULL.
				6269	*/
				6270	if (sk->sk_socket)
				6271	sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
				6272
				6273	tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
				6274	tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
				6275	tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
				6276
				6277	if (tp->rx_opt.tstamp_ok)
				6278	tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
				6279
				6280	if (!inet_csk(sk)->icsk_ca_ops->cong_control)
				6281	tcp_update_pacing_rate(sk);
				6282
				6283	/* Prevent spurious tcp_cwnd_restart() on first data packet */
				6284	tp->lsndtime = tcp_jiffies32;
				6285
				6286	tcp_initialize_rcv_mss(sk);
				6287	tcp_fast_path_on(tp);
				6288	break;
				6289
				6290	case TCP_FIN_WAIT1: {
				6291	int tmo;
				6292
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6293	if (req)
				6294	tcp_rcv_synrecv_state_fastopen(sk);
				6295
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6296	if (tp->snd_una != tp->write_seq)
				6297	break;
				6298
				6299	tcp_set_state(sk, TCP_FIN_WAIT2);
				6300	sk->sk_shutdown \|= SEND_SHUTDOWN;
				6301
				6302	sk_dst_confirm(sk);
				6303
				6304	if (!sock_flag(sk, SOCK_DEAD)) {
				6305	/* Wake up lingering close() */
				6306	sk->sk_state_change(sk);
				6307	break;
				6308	}
				6309
				6310	if (tp->linger2 < 0) {
				6311	tcp_done(sk);
				6312	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				6313	return 1;
				6314	}
				6315	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
				6316	after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
				6317	/* Receive out of order FIN after close() */
				6318	if (tp->syn_fastopen && th->fin)
				6319	tcp_fastopen_active_disable(sk);
				6320	tcp_done(sk);
				6321	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				6322	return 1;
				6323	}
				6324
				6325	tmo = tcp_fin_time(sk);
				6326	if (tmo > TCP_TIMEWAIT_LEN) {
				6327	inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
				6328	} else if (th->fin \|\| sock_owned_by_user(sk)) {
				6329	/* Bad case. We could lose such FIN otherwise.
				6330	* It is not a big problem, but it looks confusing
				6331	* and not so rare event. We still can lose it now,
				6332	* if it spins in bh_lock_sock(), but it is really
				6333	* marginal case.
				6334	*/
				6335	inet_csk_reset_keepalive_timer(sk, tmo);
				6336	} else {
				6337	tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				6338	goto discard;
				6339	}
				6340	break;
				6341	}
				6342
				6343	case TCP_CLOSING:
				6344	if (tp->snd_una == tp->write_seq) {
				6345	tcp_time_wait(sk, TCP_TIME_WAIT, 0);
				6346	goto discard;
				6347	}
				6348	break;
				6349
				6350	case TCP_LAST_ACK:
				6351	if (tp->snd_una == tp->write_seq) {
				6352	tcp_update_metrics(sk);
				6353	tcp_done(sk);
				6354	goto discard;
				6355	}
				6356	break;
				6357	}
				6358
				6359	/* step 6: check the URG bit */
				6360	tcp_urg(sk, skb, th);
				6361
				6362	/* step 7: process the segment text */
				6363	switch (sk->sk_state) {
				6364	case TCP_CLOSE_WAIT:
				6365	case TCP_CLOSING:
				6366	case TCP_LAST_ACK:
				6367	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
				6368	break;
				6369	/* fall through */
				6370	case TCP_FIN_WAIT1:
				6371	case TCP_FIN_WAIT2:
				6372	/* RFC 793 says to queue data in these states,
				6373	* RFC 1122 says we MUST send a reset.
				6374	* BSD 4.4 also does reset.
				6375	*/
				6376	if (sk->sk_shutdown & RCV_SHUTDOWN) {
				6377	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
				6378	after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
				6379	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				6380	tcp_reset(sk);
				6381	return 1;
				6382	}
				6383	}
				6384	/* Fall through */
				6385	case TCP_ESTABLISHED:
				6386	tcp_data_queue(sk, skb);
				6387	queued = 1;
				6388	break;
				6389	}
				6390
				6391	/* tcp_data could move socket to TIME-WAIT */
				6392	if (sk->sk_state != TCP_CLOSE) {
				6393	tcp_data_snd_check(sk);
				6394	tcp_ack_snd_check(sk);
				6395	}
				6396
				6397	if (!queued) {
				6398	discard:
				6399	tcp_drop(sk, skb);
				6400	}
				6401	return 0;
				6402	}
				6403	EXPORT_SYMBOL(tcp_rcv_state_process);
				6404
				6405	static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
				6406	{
				6407	struct inet_request_sock *ireq = inet_rsk(req);
				6408
				6409	if (family == AF_INET)
				6410	net_dbg_ratelimited("drop open request from %pI4/%u\n",
				6411	&ireq->ir_rmt_addr, port);
				6412	#if IS_ENABLED(CONFIG_IPV6)
				6413	else if (family == AF_INET6)
				6414	net_dbg_ratelimited("drop open request from %pI6/%u\n",
				6415	&ireq->ir_v6_rmt_addr, port);
				6416	#endif
				6417	}
				6418
				6419	/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
				6420	*
				6421	* If we receive a SYN packet with these bits set, it means a
				6422	* network is playing bad games with TOS bits. In order to
				6423	* avoid possible false congestion notifications, we disable
				6424	* TCP ECN negotiation.
				6425	*
				6426	* Exception: tcp_ca wants ECN. This is required for DCTCP
				6427	* congestion control: Linux DCTCP asserts ECT on all packets,
				6428	* including SYN, which is most optimal solution; however,
				6429	* others, such as FreeBSD do not.
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6430	*
				6431	* Exception: At least one of the reserved bits of the TCP header (th->res1) is
				6432	* set, indicating the use of a future TCP extension (such as AccECN). See
				6433	* RFC8311 §4.3 which updates RFC3168 to allow the development of such
				6434	* extensions.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6435	*/
				6436	static void tcp_ecn_create_request(struct request_sock *req,
				6437	const struct sk_buff *skb,
				6438	const struct sock *listen_sk,
				6439	const struct dst_entry *dst)
				6440	{
				6441	const struct tcphdr *th = tcp_hdr(skb);
				6442	const struct net *net = sock_net(listen_sk);
				6443	bool th_ecn = th->ece && th->cwr;
				6444	bool ect, ecn_ok;
				6445	u32 ecn_ok_dst;
				6446
				6447	if (!th_ecn)
				6448	return;
				6449
				6450	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
				6451	ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
				6452	ecn_ok = net->ipv4.sysctl_tcp_ecn \|\| ecn_ok_dst;
				6453
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6454	if (((!ect \|\| th->res1) && ecn_ok) \|\| tcp_ca_needs_ecn(listen_sk) \|\|
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6455	(ecn_ok_dst & DST_FEATURE_ECN_CA) \|\|
				6456	tcp_bpf_ca_needs_ecn((struct sock *)req))
				6457	inet_rsk(req)->ecn_ok = 1;
				6458	}
				6459
				6460	static void tcp_openreq_init(struct request_sock *req,
				6461	const struct tcp_options_received *rx_opt,
				6462	struct sk_buff skb, const struct sock sk)
				6463	{
				6464	struct inet_request_sock *ireq = inet_rsk(req);
				6465
				6466	req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */
				6467	req->cookie_ts = 0;
				6468	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
				6469	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6470	tcp_rsk(req)->snt_synack = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6471	tcp_rsk(req)->last_oow_ack_time = 0;
				6472	req->mss = rx_opt->mss_clamp;
				6473	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
				6474	ireq->tstamp_ok = rx_opt->tstamp_ok;
				6475	ireq->sack_ok = rx_opt->sack_ok;
				6476	ireq->snd_wscale = rx_opt->snd_wscale;
				6477	ireq->wscale_ok = rx_opt->wscale_ok;
				6478	ireq->acked = 0;
				6479	ireq->ecn_ok = 0;
				6480	ireq->ir_rmt_port = tcp_hdr(skb)->source;
				6481	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
				6482	ireq->ir_mark = inet_request_mark(sk, skb);
				6483	#if IS_ENABLED(CONFIG_SMC)
				6484	ireq->smc_ok = rx_opt->smc_ok;
				6485	#endif
				6486	}
				6487
				6488	struct request_sock inet_reqsk_alloc(const struct request_sock_ops ops,
				6489	struct sock *sk_listener,
				6490	bool attach_listener)
				6491	{
				6492	struct request_sock *req = reqsk_alloc(ops, sk_listener,
				6493	attach_listener);
				6494
				6495	if (req) {
				6496	struct inet_request_sock *ireq = inet_rsk(req);
				6497
				6498	ireq->ireq_opt = NULL;
				6499	#if IS_ENABLED(CONFIG_IPV6)
				6500	ireq->pktopts = NULL;
				6501	#endif
				6502	atomic64_set(&ireq->ir_cookie, 0);
				6503	ireq->ireq_state = TCP_NEW_SYN_RECV;
				6504	write_pnet(&ireq->ireq_net, sock_net(sk_listener));
				6505	ireq->ireq_family = sk_listener->sk_family;
				6506	}
				6507
				6508	return req;
				6509	}
				6510	EXPORT_SYMBOL(inet_reqsk_alloc);
				6511
				6512	/*
				6513	* Return true if a syncookie should be sent
				6514	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6515	static bool tcp_syn_flood_action(const struct sock sk, const char proto)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6516	{
				6517	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
				6518	const char *msg = "Dropping request";
				6519	bool want_cookie = false;
				6520	struct net *net = sock_net(sk);
				6521
				6522	#ifdef CONFIG_SYN_COOKIES
				6523	if (net->ipv4.sysctl_tcp_syncookies) {
				6524	msg = "Sending cookies";
				6525	want_cookie = true;
				6526	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
				6527	} else
				6528	#endif
				6529	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
				6530
				6531	if (!queue->synflood_warned &&
				6532	net->ipv4.sysctl_tcp_syncookies != 2 &&
				6533	xchg(&queue->synflood_warned, 1) == 0)
				6534	net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6535	proto, sk->sk_num, msg);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6536
				6537	return want_cookie;
				6538	}
				6539
				6540	static void tcp_reqsk_record_syn(const struct sock *sk,
				6541	struct request_sock *req,
				6542	const struct sk_buff *skb)
				6543	{
				6544	if (tcp_sk(sk)->save_syn) {
				6545	u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
				6546	u32 *copy;
				6547
				6548	copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
				6549	if (copy) {
				6550	copy[0] = len;
				6551	memcpy(&copy[1], skb_network_header(skb), len);
				6552	req->saved_syn = copy;
				6553	}
				6554	}
				6555	}
				6556
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6557	/* If a SYN cookie is required and supported, returns a clamped MSS value to be
				6558	* used for SYN cookie generation.
				6559	*/
				6560	u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
				6561	const struct tcp_request_sock_ops *af_ops,
				6562	struct sock sk, struct tcphdr th)
				6563	{
				6564	struct tcp_sock *tp = tcp_sk(sk);
				6565	u16 mss;
				6566
				6567	if (sock_net(sk)->ipv4.sysctl_tcp_syncookies != 2 &&
				6568	!inet_csk_reqsk_queue_is_full(sk))
				6569	return 0;
				6570
				6571	if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
				6572	return 0;
				6573
				6574	if (sk_acceptq_is_full(sk)) {
				6575	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
				6576	return 0;
				6577	}
				6578
				6579	mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
				6580	if (!mss)
				6581	mss = af_ops->mss_clamp;
				6582
				6583	return mss;
				6584	}
				6585	EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);
				6586
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6587	int tcp_conn_request(struct request_sock_ops *rsk_ops,
				6588	const struct tcp_request_sock_ops *af_ops,
				6589	struct sock sk, struct sk_buff skb)
				6590	{
				6591	struct tcp_fastopen_cookie foc = { .len = -1 };
				6592	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
				6593	struct tcp_options_received tmp_opt;
				6594	struct tcp_sock *tp = tcp_sk(sk);
				6595	struct net *net = sock_net(sk);
				6596	struct sock *fastopen_sk = NULL;
				6597	struct request_sock *req;
				6598	bool want_cookie = false;
				6599	struct dst_entry *dst;
				6600	struct flowi fl;
				6601
				6602	/* TW buckets are converted to open requests without
				6603	* limitations, they conserve resources and peer is
				6604	* evidently real one.
				6605	*/
				6606	if ((net->ipv4.sysctl_tcp_syncookies == 2 \|\|
				6607	inet_csk_reqsk_queue_is_full(sk)) && !isn) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6608	want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6609	if (!want_cookie)
				6610	goto drop;
				6611	}
				6612
				6613	if (sk_acceptq_is_full(sk)) {
				6614	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
				6615	goto drop;
				6616	}
				6617
				6618	req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
				6619	if (!req)
				6620	goto drop;
				6621
				6622	tcp_rsk(req)->af_specific = af_ops;
				6623	tcp_rsk(req)->ts_off = 0;
				6624
				6625	tcp_clear_options(&tmp_opt);
				6626	tmp_opt.mss_clamp = af_ops->mss_clamp;
				6627	tmp_opt.user_mss = tp->rx_opt.user_mss;
				6628	tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
				6629	want_cookie ? NULL : &foc);
				6630
				6631	if (want_cookie && !tmp_opt.saw_tstamp)
				6632	tcp_clear_options(&tmp_opt);
				6633
				6634	if (IS_ENABLED(CONFIG_SMC) && want_cookie)
				6635	tmp_opt.smc_ok = 0;
				6636
				6637	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
				6638	tcp_openreq_init(req, &tmp_opt, skb, sk);
				6639	inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;
				6640
				6641	/* Note: tcp_v6_init_req() might override ir_iif for link locals */
				6642	inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
				6643
				6644	af_ops->init_req(req, sk, skb);
				6645
				6646	if (security_inet_conn_request(sk, skb, req))
				6647	goto drop_and_free;
				6648
				6649	if (tmp_opt.tstamp_ok)
				6650	tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
				6651
				6652	dst = af_ops->route_req(sk, &fl, req);
				6653	if (!dst)
				6654	goto drop_and_free;
				6655
				6656	if (!want_cookie && !isn) {
				6657	/* Kill the following clause, if you dislike this way. */
				6658	if (!net->ipv4.sysctl_tcp_syncookies &&
				6659	(net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
				6660	(net->ipv4.sysctl_max_syn_backlog >> 2)) &&
				6661	!tcp_peer_is_proven(req, dst)) {
				6662	/* Without syncookies last quarter of
				6663	* backlog is filled with destinations,
				6664	* proven to be alive.
				6665	* It means that we continue to communicate
				6666	* to destinations, already remembered
				6667	* to the moment of synflood.
				6668	*/
				6669	pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
				6670	rsk_ops->family);
				6671	goto drop_and_release;
				6672	}
				6673
				6674	isn = af_ops->init_seq(skb);
				6675	}
				6676
				6677	tcp_ecn_create_request(req, skb, sk, dst);
				6678
				6679	if (want_cookie) {
				6680	isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
				6681	req->cookie_ts = tmp_opt.tstamp_ok;
				6682	if (!tmp_opt.tstamp_ok)
				6683	inet_rsk(req)->ecn_ok = 0;
				6684	}
				6685
				6686	tcp_rsk(req)->snt_isn = isn;
				6687	tcp_rsk(req)->txhash = net_tx_rndhash();
				6688	tcp_openreq_init_rwin(req, sk, dst);
				6689	sk_rx_queue_set(req_to_sk(req), skb);
				6690	if (!want_cookie) {
				6691	tcp_reqsk_record_syn(sk, req, skb);
				6692	fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
				6693	}
				6694	if (fastopen_sk) {
				6695	af_ops->send_synack(fastopen_sk, dst, &fl, req,
				6696	&foc, TCP_SYNACK_FASTOPEN);
				6697	/* Add the child socket directly into the accept queue */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6698	if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
				6699	reqsk_fastopen_remove(fastopen_sk, req, false);
				6700	bh_unlock_sock(fastopen_sk);
				6701	sock_put(fastopen_sk);
				6702	goto drop_and_free;
				6703	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6704	sk->sk_data_ready(sk);
				6705	bh_unlock_sock(fastopen_sk);
				6706	sock_put(fastopen_sk);
				6707	} else {
				6708	tcp_rsk(req)->tfo_listener = false;
				6709	if (!want_cookie)
				6710	inet_csk_reqsk_queue_hash_add(sk, req,
				6711	tcp_timeout_init((struct sock *)req));
				6712	af_ops->send_synack(sk, dst, &fl, req, &foc,
				6713	!want_cookie ? TCP_SYNACK_NORMAL :
				6714	TCP_SYNACK_COOKIE);
				6715	if (want_cookie) {
				6716	reqsk_free(req);
				6717	return 0;
				6718	}
				6719	}
				6720	reqsk_put(req);
				6721	return 0;
				6722
				6723	drop_and_release:
				6724	dst_release(dst);
				6725	drop_and_free:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6726	__reqsk_free(req);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6727	drop:
				6728	tcp_listendrop(sk);
				6729	return 0;
				6730	}
				6731	EXPORT_SYMBOL(tcp_conn_request);