Blame - net/ipv4/tcp_output.c - hafnium/third_party/linux

blob: 638d7b49ad7165772f7a8b2db1251b7fd5f57bcb [file] [log] [blame]

David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2	/*
				3	* INET An implementation of the TCP/IP protocol suite for the LINUX
				4	* operating system. INET is implemented using the BSD Socket
				5	* interface as the means of communication with the user level.
				6	*
				7	* Implementation of the Transmission Control Protocol(TCP).
				8	*
				9	* Authors: Ross Biro
				10	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				11	* Mark Evans, <evansmp@uhura.aston.ac.uk>
				12	* Corey Minyard <wf-rch!minyard@relay.EU.net>
				13	* Florian La Roche, <flla@stud.uni-sb.de>
				14	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
				15	* Linus Torvalds, <torvalds@cs.helsinki.fi>
				16	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				17	* Matthew Dillon, <dillon@apollo.west.oic.com>
				18	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				19	* Jorge Cwik, <jorge@laser.satlink.net>
				20	*/
				21
				22	/*
				23	* Changes: Pedro Roque : Retransmit queue handled by TCP.
				24	* : Fragmentation on mtu decrease
				25	* : Segment collapse on retransmit
				26	* : AF independence
				27	*
				28	* Linus Torvalds : send_delayed_ack
				29	* David S. Miller : Charge memory using the right skb
				30	* during syn/ack processing.
				31	* David S. Miller : Output engine completely rewritten.
				32	* Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
				33	* Cacophonix Gaul : draft-minshall-nagle-01
				34	* J Hadi Salim : ECN support
				35	*
				36	*/
				37
				38	#define pr_fmt(fmt) "TCP: " fmt
				39
				40	#include <net/tcp.h>
				41
				42	#include <linux/compiler.h>
				43	#include <linux/gfp.h>
				44	#include <linux/module.h>
				45	#include <linux/static_key.h>
				46
				47	#include <trace/events/tcp.h>
				48
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	49	/* Refresh clocks of a TCP socket,
				50	* ensuring monotically increasing values.
				51	*/
				52	void tcp_mstamp_refresh(struct tcp_sock *tp)
				53	{
				54	u64 val = tcp_clock_ns();
				55
				56	tp->tcp_clock_cache = val;
				57	tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC);
				58	}
				59
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	60	static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
				61	int push_one, gfp_t gfp);
				62
				63	/* Account for new data that has been sent to the network. */
				64	static void tcp_event_new_data_sent(struct sock sk, struct sk_buff skb)
				65	{
				66	struct inet_connection_sock *icsk = inet_csk(sk);
				67	struct tcp_sock *tp = tcp_sk(sk);
				68	unsigned int prior_packets = tp->packets_out;
				69
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	70	WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	71
				72	__skb_unlink(skb, &sk->sk_write_queue);
				73	tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
				74
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	75	if (tp->highest_sack == NULL)
				76	tp->highest_sack = skb;
				77
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	78	tp->packets_out += tcp_skb_pcount(skb);
				79	if (!prior_packets \|\| icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
				80	tcp_rearm_rto(sk);
				81
				82	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
				83	tcp_skb_pcount(skb));
				84	}
				85
				86	/* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
				87	* window scaling factor due to loss of precision.
				88	* If window has been shrunk, what should we make? It is not clear at all.
				89	* Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
				90	* Anything in between SND.UNA...SND.UNA+SND.WND also can be already
				91	* invalid. OK, let's make this for now:
				92	*/
				93	static inline __u32 tcp_acceptable_seq(const struct sock *sk)
				94	{
				95	const struct tcp_sock *tp = tcp_sk(sk);
				96
				97	if (!before(tcp_wnd_end(tp), tp->snd_nxt) \|\|
				98	(tp->rx_opt.wscale_ok &&
				99	((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
				100	return tp->snd_nxt;
				101	else
				102	return tcp_wnd_end(tp);
				103	}
				104
				105	/* Calculate mss to advertise in SYN segment.
				106	* RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
				107	*
				108	* 1. It is independent of path mtu.
				109	* 2. Ideally, it is maximal possible segment size i.e. 65535-40.
				110	* 3. For IPv4 it is reasonable to calculate it from maximal MTU of
				111	* attached devices, because some buggy hosts are confused by
				112	* large MSS.
				113	* 4. We do not make 3, we advertise MSS, calculated from first
				114	* hop device mtu, but allow to raise it to ip_rt_min_advmss.
				115	* This may be overridden via information stored in routing table.
				116	* 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
				117	* probably even Jumbo".
				118	*/
				119	static __u16 tcp_advertise_mss(struct sock *sk)
				120	{
				121	struct tcp_sock *tp = tcp_sk(sk);
				122	const struct dst_entry *dst = __sk_dst_get(sk);
				123	int mss = tp->advmss;
				124
				125	if (dst) {
				126	unsigned int metric = dst_metric_advmss(dst);
				127
				128	if (metric < mss) {
				129	mss = metric;
				130	tp->advmss = mss;
				131	}
				132	}
				133
				134	return (__u16)mss;
				135	}
				136
				137	/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
				138	* This is the first part of cwnd validation mechanism.
				139	*/
				140	void tcp_cwnd_restart(struct sock *sk, s32 delta)
				141	{
				142	struct tcp_sock *tp = tcp_sk(sk);
				143	u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
				144	u32 cwnd = tp->snd_cwnd;
				145
				146	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
				147
				148	tp->snd_ssthresh = tcp_current_ssthresh(sk);
				149	restart_cwnd = min(restart_cwnd, cwnd);
				150
				151	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
				152	cwnd >>= 1;
				153	tp->snd_cwnd = max(cwnd, restart_cwnd);
				154	tp->snd_cwnd_stamp = tcp_jiffies32;
				155	tp->snd_cwnd_used = 0;
				156	}
				157
				158	/* Congestion state accounting after a packet has been sent. */
				159	static void tcp_event_data_sent(struct tcp_sock *tp,
				160	struct sock *sk)
				161	{
				162	struct inet_connection_sock *icsk = inet_csk(sk);
				163	const u32 now = tcp_jiffies32;
				164
				165	if (tcp_packets_in_flight(tp) == 0)
				166	tcp_ca_event(sk, CA_EVENT_TX_START);
				167
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	168	/* If this is the first data packet sent in response to the
				169	* previous received data,
				170	* and it is a reply for ato after last received packet,
				171	* increase pingpong count.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	172	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	173	if (before(tp->lsndtime, icsk->icsk_ack.lrcvtime) &&
				174	(u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
				175	inet_csk_inc_pingpong_cnt(sk);
				176
				177	tp->lsndtime = now;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	178	}
				179
				180	/* Account for an ACK we sent. */
				181	static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
				182	u32 rcv_nxt)
				183	{
				184	struct tcp_sock *tp = tcp_sk(sk);
				185
				186	if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) {
				187	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
				188	tp->compressed_ack - TCP_FASTRETRANS_THRESH);
				189	tp->compressed_ack = TCP_FASTRETRANS_THRESH;
				190	if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
				191	__sock_put(sk);
				192	}
				193
				194	if (unlikely(rcv_nxt != tp->rcv_nxt))
				195	return; /* Special ACK sent by DCTCP to reflect ECN */
				196	tcp_dec_quickack_mode(sk, pkts);
				197	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
				198	}
				199
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	200	/* Determine a window scaling and initial window to offer.
				201	* Based on the assumption that the given amount of space
				202	* will be offered. Store the results in the tp structure.
				203	* NOTE: for smooth operation initial space offering should
				204	* be a multiple of mss if possible. We assume here that mss >= 1.
				205	* This MUST be enforced by all callers.
				206	*/
				207	void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
				208	__u32 rcv_wnd, __u32 window_clamp,
				209	int wscale_ok, __u8 *rcv_wscale,
				210	__u32 init_rcv_wnd)
				211	{
				212	unsigned int space = (__space < 0 ? 0 : __space);
				213
				214	/* If no clamp set the clamp to the max possible scaled window */
				215	if (*window_clamp == 0)
				216	(*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
				217	space = min(*window_clamp, space);
				218
				219	/* Quantize space offering to a multiple of mss if possible. */
				220	if (space > mss)
				221	space = rounddown(space, mss);
				222
				223	/* NOTE: offering an initial window larger than 32767
				224	* will break some buggy TCP stacks. If the admin tells us
				225	* it is likely we could be speaking with such a buggy stack
				226	* we will truncate our initial window offering to 32K-1
				227	* unless the remote has sent us a window scaling option,
				228	* which we interpret as a sign the remote TCP is not
				229	* misinterpreting the window field as a signed quantity.
				230	*/
				231	if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
				232	(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
				233	else
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	234	(*rcv_wnd) = min_t(u32, space, U16_MAX);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	235
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	236	if (init_rcv_wnd)
				237	rcv_wnd = min(rcv_wnd, init_rcv_wnd * mss);
				238
				239	*rcv_wscale = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	240	if (wscale_ok) {
				241	/* Set window scaling on max possible window */
				242	space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
				243	space = max_t(u32, space, sysctl_rmem_max);
				244	space = min_t(u32, space, *window_clamp);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	245	*rcv_wscale = clamp_t(int, ilog2(space) - 15,
				246	0, TCP_MAX_WSCALE);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	247	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	248	/* Set the clamp no higher than max representable value */
				249	(window_clamp) = min_t(__u32, U16_MAX << (rcv_wscale), *window_clamp);
				250	}
				251	EXPORT_SYMBOL(tcp_select_initial_window);
				252
				253	/* Chose a new window to advertise, update state in tcp_sock for the
				254	* socket, and return result with RFC1323 scaling applied. The return
				255	* value can be stuffed directly into th->window for an outgoing
				256	* frame.
				257	*/
				258	static u16 tcp_select_window(struct sock *sk)
				259	{
				260	struct tcp_sock *tp = tcp_sk(sk);
				261	u32 old_win = tp->rcv_wnd;
				262	u32 cur_win = tcp_receive_window(tp);
				263	u32 new_win = __tcp_select_window(sk);
				264
				265	/* Never shrink the offered window */
				266	if (new_win < cur_win) {
				267	/* Danger Will Robinson!
				268	* Don't update rcv_wup/rcv_wnd here or else
				269	* we will not be able to advertise a zero
				270	* window in time. --DaveM
				271	*
				272	* Relax Will Robinson.
				273	*/
				274	if (new_win == 0)
				275	NET_INC_STATS(sock_net(sk),
				276	LINUX_MIB_TCPWANTZEROWINDOWADV);
				277	new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
				278	}
				279	tp->rcv_wnd = new_win;
				280	tp->rcv_wup = tp->rcv_nxt;
				281
				282	/* Make sure we do not exceed the maximum possible
				283	* scaled window.
				284	*/
				285	if (!tp->rx_opt.rcv_wscale &&
				286	sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
				287	new_win = min(new_win, MAX_TCP_WINDOW);
				288	else
				289	new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
				290
				291	/* RFC1323 scaling applied */
				292	new_win >>= tp->rx_opt.rcv_wscale;
				293
				294	/* If we advertise zero window, disable fast path. */
				295	if (new_win == 0) {
				296	tp->pred_flags = 0;
				297	if (old_win)
				298	NET_INC_STATS(sock_net(sk),
				299	LINUX_MIB_TCPTOZEROWINDOWADV);
				300	} else if (old_win == 0) {
				301	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
				302	}
				303
				304	return new_win;
				305	}
				306
				307	/* Packet ECN state for a SYN-ACK */
				308	static void tcp_ecn_send_synack(struct sock sk, struct sk_buff skb)
				309	{
				310	const struct tcp_sock *tp = tcp_sk(sk);
				311
				312	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
				313	if (!(tp->ecn_flags & TCP_ECN_OK))
				314	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
				315	else if (tcp_ca_needs_ecn(sk) \|\|
				316	tcp_bpf_ca_needs_ecn(sk))
				317	INET_ECN_xmit(sk);
				318	}
				319
				320	/* Packet ECN state for a SYN. */
				321	static void tcp_ecn_send_syn(struct sock sk, struct sk_buff skb)
				322	{
				323	struct tcp_sock *tp = tcp_sk(sk);
				324	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
				325	bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 \|\|
				326	tcp_ca_needs_ecn(sk) \|\| bpf_needs_ecn;
				327
				328	if (!use_ecn) {
				329	const struct dst_entry *dst = __sk_dst_get(sk);
				330
				331	if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
				332	use_ecn = true;
				333	}
				334
				335	tp->ecn_flags = 0;
				336
				337	if (use_ecn) {
				338	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_ECE \| TCPHDR_CWR;
				339	tp->ecn_flags = TCP_ECN_OK;
				340	if (tcp_ca_needs_ecn(sk) \|\| bpf_needs_ecn)
				341	INET_ECN_xmit(sk);
				342	}
				343	}
				344
				345	static void tcp_ecn_clear_syn(struct sock sk, struct sk_buff skb)
				346	{
				347	if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
				348	/* tp->ecn_flags are cleared at a later point in time when
				349	* SYN ACK is ultimatively being received.
				350	*/
				351	TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE \| TCPHDR_CWR);
				352	}
				353
				354	static void
				355	tcp_ecn_make_synack(const struct request_sock req, struct tcphdr th)
				356	{
				357	if (inet_rsk(req)->ecn_ok)
				358	th->ece = 1;
				359	}
				360
				361	/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
				362	* be sent.
				363	*/
				364	static void tcp_ecn_send(struct sock sk, struct sk_buff skb,
				365	struct tcphdr *th, int tcp_header_len)
				366	{
				367	struct tcp_sock *tp = tcp_sk(sk);
				368
				369	if (tp->ecn_flags & TCP_ECN_OK) {
				370	/* Not-retransmitted data segment: set ECT and inject CWR. */
				371	if (skb->len != tcp_header_len &&
				372	!before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
				373	INET_ECN_xmit(sk);
				374	if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
				375	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
				376	th->cwr = 1;
				377	skb_shinfo(skb)->gso_type \|= SKB_GSO_TCP_ECN;
				378	}
				379	} else if (!tcp_ca_needs_ecn(sk)) {
				380	/* ACK or retransmitted segment: clear ECT\|CE */
				381	INET_ECN_dontxmit(sk);
				382	}
				383	if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
				384	th->ece = 1;
				385	}
				386	}
				387
				388	/* Constructs common control bits of non-data skb. If SYN/FIN is present,
				389	* auto increment end seqno.
				390	*/
				391	static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
				392	{
				393	skb->ip_summed = CHECKSUM_PARTIAL;
				394
				395	TCP_SKB_CB(skb)->tcp_flags = flags;
				396	TCP_SKB_CB(skb)->sacked = 0;
				397
				398	tcp_skb_pcount_set(skb, 1);
				399
				400	TCP_SKB_CB(skb)->seq = seq;
				401	if (flags & (TCPHDR_SYN \| TCPHDR_FIN))
				402	seq++;
				403	TCP_SKB_CB(skb)->end_seq = seq;
				404	}
				405
				406	static inline bool tcp_urg_mode(const struct tcp_sock *tp)
				407	{
				408	return tp->snd_una != tp->snd_up;
				409	}
				410
/* Bits of tcp_out_options.options; each selects one TCP option that
 * tcp_options_write() / smc_options_write() emit into the header.
 */
#define OPTION_SACK_ADVERTISE (1 << 0)	/* SACK-permitted */
#define OPTION_TS (1 << 1)		/* timestamps (tsval/tsecr) */
#define OPTION_MD5 (1 << 2)		/* MD5 signature */
#define OPTION_WSCALE (1 << 3)		/* window scale */
#define OPTION_FAST_OPEN_COOKIE (1 << 8)	/* Fast Open cookie */
#define OPTION_SMC (1 << 9)		/* SMC experimental option */
				417
				418	static void smc_options_write(__be32 ptr, u16 options)
				419	{
				420	#if IS_ENABLED(CONFIG_SMC)
				421	if (static_branch_unlikely(&tcp_have_smc)) {
				422	if (unlikely(OPTION_SMC & *options)) {
				423	*ptr++ = htonl((TCPOPT_NOP << 24) \|
				424	(TCPOPT_NOP << 16) \|
				425	(TCPOPT_EXP << 8) \|
				426	(TCPOLEN_EXP_SMC_BASE));
				427	*ptr++ = htonl(TCPOPT_SMC_MAGIC);
				428	}
				429	}
				430	#endif
				431	}
				432
				433	struct tcp_out_options {
				434	u16 options; /* bit field of OPTION_* */
				435	u16 mss; /* 0 to disable */
				436	u8 ws; /* window scale, 0 to disable */
				437	u8 num_sack_blocks; /* number of SACK blocks to include */
				438	u8 hash_size; /* bytes in hash_location */
				439	__u8 hash_location; / temporary pointer, overloaded */
				440	__u32 tsval, tsecr; /* need to include OPTION_TS */
				441	struct tcp_fastopen_cookie fastopen_cookie; / Fast open cookie */
				442	};
				443
				444	/* Write previously computed TCP options to the packet.
				445	*
				446	* Beware: Something in the Internet is very sensitive to the ordering of
				447	* TCP options, we learned this through the hard way, so be careful here.
				448	* Luckily we can at least blame others for their non-compliance but from
				449	* inter-operability perspective it seems that we're somewhat stuck with
				450	* the ordering which we have been using if we want to keep working with
				451	* those broken things (not that it currently hurts anybody as there isn't
				452	* particular reason why the ordering would need to be changed).
				453	*
				454	* At least SACK_PERM as the first option is known to lead to a disaster
				455	* (but it may well be that other scenarios fail similarly).
				456	*/
				457	static void tcp_options_write(__be32 ptr, struct tcp_sock tp,
				458	struct tcp_out_options *opts)
				459	{
				460	u16 options = opts->options; /* mungable copy */
				461
				462	if (unlikely(OPTION_MD5 & options)) {
				463	*ptr++ = htonl((TCPOPT_NOP << 24) \| (TCPOPT_NOP << 16) \|
				464	(TCPOPT_MD5SIG << 8) \| TCPOLEN_MD5SIG);
				465	/* overload cookie hash location */
				466	opts->hash_location = (__u8 *)ptr;
				467	ptr += 4;
				468	}
				469
				470	if (unlikely(opts->mss)) {
				471	*ptr++ = htonl((TCPOPT_MSS << 24) \|
				472	(TCPOLEN_MSS << 16) \|
				473	opts->mss);
				474	}
				475
				476	if (likely(OPTION_TS & options)) {
				477	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
				478	*ptr++ = htonl((TCPOPT_SACK_PERM << 24) \|
				479	(TCPOLEN_SACK_PERM << 16) \|
				480	(TCPOPT_TIMESTAMP << 8) \|
				481	TCPOLEN_TIMESTAMP);
				482	options &= ~OPTION_SACK_ADVERTISE;
				483	} else {
				484	*ptr++ = htonl((TCPOPT_NOP << 24) \|
				485	(TCPOPT_NOP << 16) \|
				486	(TCPOPT_TIMESTAMP << 8) \|
				487	TCPOLEN_TIMESTAMP);
				488	}
				489	*ptr++ = htonl(opts->tsval);
				490	*ptr++ = htonl(opts->tsecr);
				491	}
				492
				493	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
				494	*ptr++ = htonl((TCPOPT_NOP << 24) \|
				495	(TCPOPT_NOP << 16) \|
				496	(TCPOPT_SACK_PERM << 8) \|
				497	TCPOLEN_SACK_PERM);
				498	}
				499
				500	if (unlikely(OPTION_WSCALE & options)) {
				501	*ptr++ = htonl((TCPOPT_NOP << 24) \|
				502	(TCPOPT_WINDOW << 16) \|
				503	(TCPOLEN_WINDOW << 8) \|
				504	opts->ws);
				505	}
				506
				507	if (unlikely(opts->num_sack_blocks)) {
				508	struct tcp_sack_block *sp = tp->rx_opt.dsack ?
				509	tp->duplicate_sack : tp->selective_acks;
				510	int this_sack;
				511
				512	*ptr++ = htonl((TCPOPT_NOP << 24) \|
				513	(TCPOPT_NOP << 16) \|
				514	(TCPOPT_SACK << 8) \|
				515	(TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
				516	TCPOLEN_SACK_PERBLOCK)));
				517
				518	for (this_sack = 0; this_sack < opts->num_sack_blocks;
				519	++this_sack) {
				520	*ptr++ = htonl(sp[this_sack].start_seq);
				521	*ptr++ = htonl(sp[this_sack].end_seq);
				522	}
				523
				524	tp->rx_opt.dsack = 0;
				525	}
				526
				527	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
				528	struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
				529	u8 p = (u8 )ptr;
				530	u32 len; /* Fast Open option length */
				531
				532	if (foc->exp) {
				533	len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
				534	*ptr = htonl((TCPOPT_EXP << 24) \| (len << 16) \|
				535	TCPOPT_FASTOPEN_MAGIC);
				536	p += TCPOLEN_EXP_FASTOPEN_BASE;
				537	} else {
				538	len = TCPOLEN_FASTOPEN_BASE + foc->len;
				539	*p++ = TCPOPT_FASTOPEN;
				540	*p++ = len;
				541	}
				542
				543	memcpy(p, foc->val, foc->len);
				544	if ((len & 3) == 2) {
				545	p[foc->len] = TCPOPT_NOP;
				546	p[foc->len + 1] = TCPOPT_NOP;
				547	}
				548	ptr += (len + 3) >> 2;
				549	}
				550
				551	smc_options_write(ptr, &options);
				552	}
				553
				554	static void smc_set_option(const struct tcp_sock *tp,
				555	struct tcp_out_options *opts,
				556	unsigned int *remaining)
				557	{
				558	#if IS_ENABLED(CONFIG_SMC)
				559	if (static_branch_unlikely(&tcp_have_smc)) {
				560	if (tp->syn_smc) {
				561	if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
				562	opts->options \|= OPTION_SMC;
				563	*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
				564	}
				565	}
				566	}
				567	#endif
				568	}
				569
				570	static void smc_set_option_cond(const struct tcp_sock *tp,
				571	const struct inet_request_sock *ireq,
				572	struct tcp_out_options *opts,
				573	unsigned int *remaining)
				574	{
				575	#if IS_ENABLED(CONFIG_SMC)
				576	if (static_branch_unlikely(&tcp_have_smc)) {
				577	if (tp->syn_smc && ireq->smc_ok) {
				578	if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
				579	opts->options \|= OPTION_SMC;
				580	*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
				581	}
				582	}
				583	}
				584	#endif
				585	}
				586
				587	/* Compute TCP options for SYN packets. This is not the final
				588	* network wire format yet.
				589	*/
				590	static unsigned int tcp_syn_options(struct sock sk, struct sk_buff skb,
				591	struct tcp_out_options *opts,
				592	struct tcp_md5sig_key **md5)
				593	{
				594	struct tcp_sock *tp = tcp_sk(sk);
				595	unsigned int remaining = MAX_TCP_OPTION_SPACE;
				596	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
				597
				598	*md5 = NULL;
				599	#ifdef CONFIG_TCP_MD5SIG
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	600	if (static_branch_unlikely(&tcp_md5_needed) &&
				601	rcu_access_pointer(tp->md5sig_info)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	602	*md5 = tp->af_specific->md5_lookup(sk, sk);
				603	if (*md5) {
				604	opts->options \|= OPTION_MD5;
				605	remaining -= TCPOLEN_MD5SIG_ALIGNED;
				606	}
				607	}
				608	#endif
				609
				610	/* We always get an MSS option. The option bytes which will be seen in
				611	* normal data packets should timestamps be used, must be in the MSS
				612	* advertised. But we subtract them from tp->mss_cache so that
				613	* calculations in tcp_sendmsg are simpler etc. So account for this
				614	* fact here if necessary. If we don't do this correctly, as a
				615	* receiver we won't recognize data packets as being full sized when we
				616	* should, and thus we won't abide by the delayed ACK rules correctly.
				617	* SACKs don't matter, we never delay an ACK when we have any of those
				618	* going out. */
				619	opts->mss = tcp_advertise_mss(sk);
				620	remaining -= TCPOLEN_MSS_ALIGNED;
				621
				622	if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
				623	opts->options \|= OPTION_TS;
				624	opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
				625	opts->tsecr = tp->rx_opt.ts_recent;
				626	remaining -= TCPOLEN_TSTAMP_ALIGNED;
				627	}
				628	if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
				629	opts->ws = tp->rx_opt.rcv_wscale;
				630	opts->options \|= OPTION_WSCALE;
				631	remaining -= TCPOLEN_WSCALE_ALIGNED;
				632	}
				633	if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
				634	opts->options \|= OPTION_SACK_ADVERTISE;
				635	if (unlikely(!(OPTION_TS & opts->options)))
				636	remaining -= TCPOLEN_SACKPERM_ALIGNED;
				637	}
				638
				639	if (fastopen && fastopen->cookie.len >= 0) {
				640	u32 need = fastopen->cookie.len;
				641
				642	need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
				643	TCPOLEN_FASTOPEN_BASE;
				644	need = (need + 3) & ~3U; /* Align to 32 bits */
				645	if (remaining >= need) {
				646	opts->options \|= OPTION_FAST_OPEN_COOKIE;
				647	opts->fastopen_cookie = &fastopen->cookie;
				648	remaining -= need;
				649	tp->syn_fastopen = 1;
				650	tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
				651	}
				652	}
				653
				654	smc_set_option(tp, opts, &remaining);
				655
				656	return MAX_TCP_OPTION_SPACE - remaining;
				657	}
				658
				659	/* Set up TCP options for SYN-ACKs. */
				660	static unsigned int tcp_synack_options(const struct sock *sk,
				661	struct request_sock *req,
				662	unsigned int mss, struct sk_buff *skb,
				663	struct tcp_out_options *opts,
				664	const struct tcp_md5sig_key *md5,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	665	struct tcp_fastopen_cookie *foc,
				666	enum tcp_synack_type synack_type)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	667	{
				668	struct inet_request_sock *ireq = inet_rsk(req);
				669	unsigned int remaining = MAX_TCP_OPTION_SPACE;
				670
				671	#ifdef CONFIG_TCP_MD5SIG
				672	if (md5) {
				673	opts->options \|= OPTION_MD5;
				674	remaining -= TCPOLEN_MD5SIG_ALIGNED;
				675
				676	/* We can't fit any SACK blocks in a packet with MD5 + TS
				677	* options. There was discussion about disabling SACK
				678	* rather than TS in order to fit in better with old,
				679	* buggy kernels, but that was deemed to be unnecessary.
				680	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	681	if (synack_type != TCP_SYNACK_COOKIE)
				682	ireq->tstamp_ok &= !ireq->sack_ok;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	683	}
				684	#endif
				685
				686	/* We always send an MSS option. */
				687	opts->mss = mss;
				688	remaining -= TCPOLEN_MSS_ALIGNED;
				689
				690	if (likely(ireq->wscale_ok)) {
				691	opts->ws = ireq->rcv_wscale;
				692	opts->options \|= OPTION_WSCALE;
				693	remaining -= TCPOLEN_WSCALE_ALIGNED;
				694	}
				695	if (likely(ireq->tstamp_ok)) {
				696	opts->options \|= OPTION_TS;
				697	opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
				698	opts->tsecr = req->ts_recent;
				699	remaining -= TCPOLEN_TSTAMP_ALIGNED;
				700	}
				701	if (likely(ireq->sack_ok)) {
				702	opts->options \|= OPTION_SACK_ADVERTISE;
				703	if (unlikely(!ireq->tstamp_ok))
				704	remaining -= TCPOLEN_SACKPERM_ALIGNED;
				705	}
				706	if (foc != NULL && foc->len >= 0) {
				707	u32 need = foc->len;
				708
				709	need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
				710	TCPOLEN_FASTOPEN_BASE;
				711	need = (need + 3) & ~3U; /* Align to 32 bits */
				712	if (remaining >= need) {
				713	opts->options \|= OPTION_FAST_OPEN_COOKIE;
				714	opts->fastopen_cookie = foc;
				715	remaining -= need;
				716	}
				717	}
				718
				719	smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
				720
				721	return MAX_TCP_OPTION_SPACE - remaining;
				722	}
				723
				724	/* Compute TCP options for ESTABLISHED sockets. This is not the
				725	* final wire format yet.
				726	*/
				727	static unsigned int tcp_established_options(struct sock sk, struct sk_buff skb,
				728	struct tcp_out_options *opts,
				729	struct tcp_md5sig_key **md5)
				730	{
				731	struct tcp_sock *tp = tcp_sk(sk);
				732	unsigned int size = 0;
				733	unsigned int eff_sacks;
				734
				735	opts->options = 0;
				736
				737	*md5 = NULL;
				738	#ifdef CONFIG_TCP_MD5SIG
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	739	if (static_branch_unlikely(&tcp_md5_needed) &&
				740	rcu_access_pointer(tp->md5sig_info)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	741	*md5 = tp->af_specific->md5_lookup(sk, sk);
				742	if (*md5) {
				743	opts->options \|= OPTION_MD5;
				744	size += TCPOLEN_MD5SIG_ALIGNED;
				745	}
				746	}
				747	#endif
				748
				749	if (likely(tp->rx_opt.tstamp_ok)) {
				750	opts->options \|= OPTION_TS;
				751	opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
				752	opts->tsecr = tp->rx_opt.ts_recent;
				753	size += TCPOLEN_TSTAMP_ALIGNED;
				754	}
				755
				756	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
				757	if (unlikely(eff_sacks)) {
				758	const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
				759	opts->num_sack_blocks =
				760	min_t(unsigned int, eff_sacks,
				761	(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
				762	TCPOLEN_SACK_PERBLOCK);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	763	if (likely(opts->num_sack_blocks))
				764	size += TCPOLEN_SACK_BASE_ALIGNED +
				765	opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	766	}
				767
				768	return size;
				769	}
				770
				771
/* TCP SMALL QUEUES (TSQ)
 *
 * The goal of TSQ is to keep a small amount of skbs per tcp flow in the
 * tx queues (qdisc+dev) to reduce RTT and bufferbloat.
 * We do this using a special skb destructor (tcp_wfree).
 *
 * It is important that tcp_wfree() can be replaced by sock_wfree() in the
 * event the skb needs to be reallocated in a driver.
 * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
 *
 * Since transmit from skb destructor is forbidden, we use a tasklet
 * to process all sockets that eventually need to send more skbs.
 * We use one tasklet per cpu, with its own queue of sockets.
 */
struct tsq_tasklet {
	struct tasklet_struct tasklet;
	struct list_head head; /* queue of tcp sockets */
};
static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
				791
				792	static void tcp_tsq_write(struct sock *sk)
				793	{
				794	if ((1 << sk->sk_state) &
				795	(TCPF_ESTABLISHED \| TCPF_FIN_WAIT1 \| TCPF_CLOSING \|
				796	TCPF_CLOSE_WAIT \| TCPF_LAST_ACK)) {
				797	struct tcp_sock *tp = tcp_sk(sk);
				798
				799	if (tp->lost_out > tp->retrans_out &&
				800	tp->snd_cwnd > tcp_packets_in_flight(tp)) {
				801	tcp_mstamp_refresh(tp);
				802	tcp_xmit_retransmit_queue(sk);
				803	}
				804
				805	tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
				806	0, GFP_ATOMIC);
				807	}
				808	}
				809
				810	static void tcp_tsq_handler(struct sock *sk)
				811	{
				812	bh_lock_sock(sk);
				813	if (!sock_owned_by_user(sk))
				814	tcp_tsq_write(sk);
				815	else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
				816	sock_hold(sk);
				817	bh_unlock_sock(sk);
				818	}
				819	/*
				820	* One tasklet per cpu tries to send more skbs.
				821	* We run in tasklet context but need to disable irqs when
				822	* transferring tsq->head because tcp_wfree() might
				823	* interrupt us (non NAPI drivers)
				824	*/
				825	static void tcp_tasklet_func(unsigned long data)
				826	{
				827	struct tsq_tasklet tsq = (struct tsq_tasklet )data;
				828	LIST_HEAD(list);
				829	unsigned long flags;
				830	struct list_head q, n;
				831	struct tcp_sock *tp;
				832	struct sock *sk;
				833
				834	local_irq_save(flags);
				835	list_splice_init(&tsq->head, &list);
				836	local_irq_restore(flags);
				837
				838	list_for_each_safe(q, n, &list) {
				839	tp = list_entry(q, struct tcp_sock, tsq_node);
				840	list_del(&tp->tsq_node);
				841
				842	sk = (struct sock *)tp;
				843	smp_mb__before_atomic();
				844	clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
				845
				846	tcp_tsq_handler(sk);
				847	sk_free(sk);
				848	}
				849	}
				850
				851	#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED \| \
				852	TCPF_WRITE_TIMER_DEFERRED \| \
				853	TCPF_DELACK_TIMER_DEFERRED \| \
				854	TCPF_MTU_REDUCED_DEFERRED)
				855	/**
				856	* tcp_release_cb - tcp release_sock() callback
				857	* @sk: socket
				858	*
				859	* called from release_sock() to perform protocol dependent
				860	* actions before socket release.
				861	*/
				862	void tcp_release_cb(struct sock *sk)
				863	{
				864	unsigned long flags, nflags;
				865
				866	/* perform an atomic operation only if at least one flag is set */
				867	do {
				868	flags = sk->sk_tsq_flags;
				869	if (!(flags & TCP_DEFERRED_ALL))
				870	return;
				871	nflags = flags & ~TCP_DEFERRED_ALL;
				872	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
				873
				874	if (flags & TCPF_TSQ_DEFERRED) {
				875	tcp_tsq_write(sk);
				876	__sock_put(sk);
				877	}
				878	/* Here begins the tricky part :
				879	* We are called from release_sock() with :
				880	* 1) BH disabled
				881	* 2) sk_lock.slock spinlock held
				882	* 3) socket owned by us (sk->sk_lock.owned == 1)
				883	*
				884	* But following code is meant to be called from BH handlers,
				885	* so we should keep BH disabled, but early release socket ownership
				886	*/
				887	sock_release_ownership(sk);
				888
				889	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
				890	tcp_write_timer_handler(sk);
				891	__sock_put(sk);
				892	}
				893	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
				894	tcp_delack_timer_handler(sk);
				895	__sock_put(sk);
				896	}
				897	if (flags & TCPF_MTU_REDUCED_DEFERRED) {
				898	inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
				899	__sock_put(sk);
				900	}
				901	}
				902	EXPORT_SYMBOL(tcp_release_cb);
				903
				904	void __init tcp_tasklet_init(void)
				905	{
				906	int i;
				907
				908	for_each_possible_cpu(i) {
				909	struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
				910
				911	INIT_LIST_HEAD(&tsq->head);
				912	tasklet_init(&tsq->tasklet,
				913	tcp_tasklet_func,
				914	(unsigned long)tsq);
				915	}
				916	}
				917
				918	/*
				919	* Write buffer destructor automatically called from kfree_skb.
				920	* We can't xmit new skbs from this context, as we might already
				921	* hold qdisc lock.
				922	*/
				923	void tcp_wfree(struct sk_buff *skb)
				924	{
				925	struct sock *sk = skb->sk;
				926	struct tcp_sock *tp = tcp_sk(sk);
				927	unsigned long flags, nval, oval;
				928
				929	/* Keep one reference on sk_wmem_alloc.
				930	* Will be released by sk_free() from here or tcp_tasklet_func()
				931	*/
				932	WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
				933
				934	/* If this softirq is serviced by ksoftirqd, we are likely under stress.
				935	* Wait until our queues (qdisc + devices) are drained.
				936	* This gives :
				937	* - less callbacks to tcp_write_xmit(), reducing stress (batches)
				938	* - chance for incoming ACK (processed by another cpu maybe)
				939	* to migrate this flow (skb->ooo_okay will be eventually set)
				940	*/
				941	if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
				942	goto out;
				943
				944	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
				945	struct tsq_tasklet *tsq;
				946	bool empty;
				947
				948	if (!(oval & TSQF_THROTTLED) \|\| (oval & TSQF_QUEUED))
				949	goto out;
				950
				951	nval = (oval & ~TSQF_THROTTLED) \| TSQF_QUEUED;
				952	nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
				953	if (nval != oval)
				954	continue;
				955
				956	/* queue this socket to tasklet queue */
				957	local_irq_save(flags);
				958	tsq = this_cpu_ptr(&tsq_tasklet);
				959	empty = list_empty(&tsq->head);
				960	list_add(&tp->tsq_node, &tsq->head);
				961	if (empty)
				962	tasklet_schedule(&tsq->tasklet);
				963	local_irq_restore(flags);
				964	return;
				965	}
				966	out:
				967	sk_free(sk);
				968	}
				969
				970	/* Note: Called under soft irq.
				971	* We can call TCP stack right away, unless socket is owned by user.
				972	*/
				973	enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
				974	{
				975	struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
				976	struct sock sk = (struct sock )tp;
				977
				978	tcp_tsq_handler(sk);
				979	sock_put(sk);
				980
				981	return HRTIMER_NORESTART;
				982	}
				983
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	984	static void tcp_update_skb_after_send(struct sock sk, struct sk_buff skb,
				985	u64 prior_wstamp)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	986	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	987	struct tcp_sock *tp = tcp_sk(sk);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	988
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	989	if (sk->sk_pacing_status != SK_PACING_NONE) {
				990	unsigned long rate = sk->sk_pacing_rate;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	991
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	992	/* Original sch_fq does not pace first 10 MSS
				993	* Note that tp->data_segs_out overflows after 2^32 packets,
				994	* this is a minor annoyance.
				995	*/
				996	if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
				997	u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
				998	u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	999
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1000	/* take into account OS jitter */
				1001	len_ns -= min_t(u64, len_ns / 2, credit);
				1002	tp->tcp_wstamp_ns += len_ns;
				1003	}
				1004	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1005	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
				1006	}
				1007
				1008	/* This routine actually transmits TCP packets queued in by
				1009	* tcp_do_sendmsg(). This is used by both the initial
				1010	* transmission and possible later retransmissions.
				1011	* All SKB's seen here are completely headerless. It is our
				1012	* job to build the TCP header, and pass the packet down to
				1013	* IP so it can do the same plus pass the packet off to the
				1014	* device.
				1015	*
				1016	* We are working here with either a clone of the original
				1017	* SKB, or a fresh unique copy made by the retransmit engine.
				1018	*/
				1019	static int __tcp_transmit_skb(struct sock sk, struct sk_buff skb,
				1020	int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
				1021	{
				1022	const struct inet_connection_sock *icsk = inet_csk(sk);
				1023	struct inet_sock *inet;
				1024	struct tcp_sock *tp;
				1025	struct tcp_skb_cb *tcb;
				1026	struct tcp_out_options opts;
				1027	unsigned int tcp_options_size, tcp_header_size;
				1028	struct sk_buff *oskb = NULL;
				1029	struct tcp_md5sig_key *md5;
				1030	struct tcphdr *th;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1031	u64 prior_wstamp;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1032	int err;
				1033
				1034	BUG_ON(!skb \|\| !tcp_skb_pcount(skb));
				1035	tp = tcp_sk(sk);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1036	prior_wstamp = tp->tcp_wstamp_ns;
				1037	tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
				1038	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1039	if (clone_it) {
				1040	TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
				1041	- tp->snd_una;
				1042	oskb = skb;
				1043
				1044	tcp_skb_tsorted_save(oskb) {
				1045	if (unlikely(skb_cloned(oskb)))
				1046	skb = pskb_copy(oskb, gfp_mask);
				1047	else
				1048	skb = skb_clone(oskb, gfp_mask);
				1049	} tcp_skb_tsorted_restore(oskb);
				1050
				1051	if (unlikely(!skb))
				1052	return -ENOBUFS;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1053	/* retransmit skbs might have a non zero value in skb->dev
				1054	* because skb->dev is aliased with skb->rbnode.rb_left
				1055	*/
				1056	skb->dev = NULL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1057	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1058
				1059	inet = inet_sk(sk);
				1060	tcb = TCP_SKB_CB(skb);
				1061	memset(&opts, 0, sizeof(opts));
				1062
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1063	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1064	tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1065	} else {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1066	tcp_options_size = tcp_established_options(sk, skb, &opts,
				1067	&md5);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1068	/* Force a PSH flag on all (GSO) packets to expedite GRO flush
				1069	* at receiver : This slightly improve GRO performance.
				1070	* Note that we do not force the PSH flag for non GSO packets,
				1071	* because they might be sent under high congestion events,
				1072	* and in this case it is better to delay the delivery of 1-MSS
				1073	* packets and thus the corresponding ACK packet that would
				1074	* release the following packet.
				1075	*/
				1076	if (tcp_skb_pcount(skb) > 1)
				1077	tcb->tcp_flags \|= TCPHDR_PSH;
				1078	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1079	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
				1080
				1081	/* if no packet is in qdisc/device queue, then allow XPS to select
				1082	* another queue. We can be called from tcp_tsq_handler()
				1083	* which holds one reference to sk.
				1084	*
				1085	* TODO: Ideally, in-flight pure ACK packets should not matter here.
				1086	* One way to get this would be to set skb->truesize = 2 on them.
				1087	*/
				1088	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
				1089
				1090	/* If we had to use memory reserve to allocate this skb,
				1091	* this might cause drops if packet is looped back :
				1092	* Other socket might not have SOCK_MEMALLOC.
				1093	* Packets not looped back do not care about pfmemalloc.
				1094	*/
				1095	skb->pfmemalloc = 0;
				1096
				1097	skb_push(skb, tcp_header_size);
				1098	skb_reset_transport_header(skb);
				1099
				1100	skb_orphan(skb);
				1101	skb->sk = sk;
				1102	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
				1103	skb_set_hash_from_sk(skb, sk);
				1104	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
				1105
				1106	skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
				1107
				1108	/* Build TCP header and checksum it. */
				1109	th = (struct tcphdr *)skb->data;
				1110	th->source = inet->inet_sport;
				1111	th->dest = inet->inet_dport;
				1112	th->seq = htonl(tcb->seq);
				1113	th->ack_seq = htonl(rcv_nxt);
				1114	(((__be16 )th) + 6) = htons(((tcp_header_size >> 2) << 12) \|
				1115	tcb->tcp_flags);
				1116
				1117	th->check = 0;
				1118	th->urg_ptr = 0;
				1119
				1120	/* The urg_mode check is necessary during a below snd_una win probe */
				1121	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
				1122	if (before(tp->snd_up, tcb->seq + 0x10000)) {
				1123	th->urg_ptr = htons(tp->snd_up - tcb->seq);
				1124	th->urg = 1;
				1125	} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
				1126	th->urg_ptr = htons(0xFFFF);
				1127	th->urg = 1;
				1128	}
				1129	}
				1130
				1131	tcp_options_write((__be32 *)(th + 1), tp, &opts);
				1132	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
				1133	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
				1134	th->window = htons(tcp_select_window(sk));
				1135	tcp_ecn_send(sk, skb, th, tcp_header_size);
				1136	} else {
				1137	/* RFC1323: The window in SYN & SYN/ACK segments
				1138	* is never scaled.
				1139	*/
				1140	th->window = htons(min(tp->rcv_wnd, 65535U));
				1141	}
				1142	#ifdef CONFIG_TCP_MD5SIG
				1143	/* Calculate the MD5 hash, as we have all we need now */
				1144	if (md5) {
				1145	sk_nocaps_add(sk, NETIF_F_GSO_MASK);
				1146	tp->af_specific->calc_md5_hash(opts.hash_location,
				1147	md5, sk, skb);
				1148	}
				1149	#endif
				1150
				1151	icsk->icsk_af_ops->send_check(sk, skb);
				1152
				1153	if (likely(tcb->tcp_flags & TCPHDR_ACK))
				1154	tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
				1155
				1156	if (skb->len != tcp_header_size) {
				1157	tcp_event_data_sent(tp, sk);
				1158	tp->data_segs_out += tcp_skb_pcount(skb);
				1159	tp->bytes_sent += skb->len - tcp_header_size;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1160	}
				1161
				1162	if (after(tcb->end_seq, tp->snd_nxt) \|\| tcb->seq == tcb->end_seq)
				1163	TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
				1164	tcp_skb_pcount(skb));
				1165
				1166	tp->segs_out += tcp_skb_pcount(skb);
				1167	/* OK, its time to fill skb_shinfo(skb)->gso_{segs\|size} */
				1168	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
				1169	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
				1170
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1171	/* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1172
				1173	/* Cleanup our debris for IP stacks */
				1174	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
				1175	sizeof(struct inet6_skb_parm)));
				1176
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1177	tcp_add_tx_delay(skb, tp);
				1178
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1179	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
				1180
				1181	if (unlikely(err > 0)) {
				1182	tcp_enter_cwr(sk);
				1183	err = net_xmit_eval(err);
				1184	}
				1185	if (!err && oskb) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1186	tcp_update_skb_after_send(sk, oskb, prior_wstamp);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1187	tcp_rate_skb_sent(sk, oskb);
				1188	}
				1189	return err;
				1190	}
				1191
				1192	static int tcp_transmit_skb(struct sock sk, struct sk_buff skb, int clone_it,
				1193	gfp_t gfp_mask)
				1194	{
				1195	return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
				1196	tcp_sk(sk)->rcv_nxt);
				1197	}
				1198
				1199	/* This routine just queues the buffer for sending.
				1200	*
				1201	* NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
				1202	* otherwise socket can stall.
				1203	*/
				1204	static void tcp_queue_skb(struct sock sk, struct sk_buff skb)
				1205	{
				1206	struct tcp_sock *tp = tcp_sk(sk);
				1207
				1208	/* Advance write_seq and place onto the write_queue. */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1209	WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1210	__skb_header_release(skb);
				1211	tcp_add_write_queue_tail(sk, skb);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1212	sk_wmem_queued_add(sk, skb->truesize);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1213	sk_mem_charge(sk, skb->truesize);
				1214	}
				1215
				1216	/* Initialize TSO segments for a packet. */
				1217	static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
				1218	{
				1219	if (skb->len <= mss_now) {
				1220	/* Avoid the costly divide in the normal
				1221	* non-TSO case.
				1222	*/
				1223	tcp_skb_pcount_set(skb, 1);
				1224	TCP_SKB_CB(skb)->tcp_gso_size = 0;
				1225	} else {
				1226	tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
				1227	TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
				1228	}
				1229	}
				1230
				1231	/* Pcount in the middle of the write queue got changed, we need to do various
				1232	* tweaks to fix counters
				1233	*/
				1234	static void tcp_adjust_pcount(struct sock sk, const struct sk_buff skb, int decr)
				1235	{
				1236	struct tcp_sock *tp = tcp_sk(sk);
				1237
				1238	tp->packets_out -= decr;
				1239
				1240	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
				1241	tp->sacked_out -= decr;
				1242	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
				1243	tp->retrans_out -= decr;
				1244	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
				1245	tp->lost_out -= decr;
				1246
				1247	/* Reno case is special. Sigh... */
				1248	if (tcp_is_reno(tp) && decr > 0)
				1249	tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
				1250
				1251	if (tp->lost_skb_hint &&
				1252	before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
				1253	(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
				1254	tp->lost_cnt_hint -= decr;
				1255
				1256	tcp_verify_left_out(tp);
				1257	}
				1258
				1259	static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
				1260	{
				1261	return TCP_SKB_CB(skb)->txstamp_ack \|\|
				1262	(skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
				1263	}
				1264
				1265	static void tcp_fragment_tstamp(struct sk_buff skb, struct sk_buff skb2)
				1266	{
				1267	struct skb_shared_info *shinfo = skb_shinfo(skb);
				1268
				1269	if (unlikely(tcp_has_tx_tstamp(skb)) &&
				1270	!before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
				1271	struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
				1272	u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
				1273
				1274	shinfo->tx_flags &= ~tsflags;
				1275	shinfo2->tx_flags \|= tsflags;
				1276	swap(shinfo->tskey, shinfo2->tskey);
				1277	TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
				1278	TCP_SKB_CB(skb)->txstamp_ack = 0;
				1279	}
				1280	}
				1281
				1282	static void tcp_skb_fragment_eor(struct sk_buff skb, struct sk_buff skb2)
				1283	{
				1284	TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
				1285	TCP_SKB_CB(skb)->eor = 0;
				1286	}
				1287
				1288	/* Insert buff after skb on the write or rtx queue of sk. */
				1289	static void tcp_insert_write_queue_after(struct sk_buff *skb,
				1290	struct sk_buff *buff,
				1291	struct sock *sk,
				1292	enum tcp_queue tcp_queue)
				1293	{
				1294	if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
				1295	__skb_queue_after(&sk->sk_write_queue, skb, buff);
				1296	else
				1297	tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
				1298	}
				1299
				1300	/* Function to create two new TCP segments. Shrinks the given segment
				1301	* to the specified size and appends a new segment with the rest of the
				1302	* packet to the list. This won't be called frequently, I hope.
				1303	* Remember, these are still headerless SKBs at this point.
				1304	*/
				1305	int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
				1306	struct sk_buff *skb, u32 len,
				1307	unsigned int mss_now, gfp_t gfp)
				1308	{
				1309	struct tcp_sock *tp = tcp_sk(sk);
				1310	struct sk_buff *buff;
				1311	int nsize, old_factor;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1312	long limit;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1313	int nlen;
				1314	u8 flags;
				1315
				1316	if (WARN_ON(len > skb->len))
				1317	return -EINVAL;
				1318
				1319	nsize = skb_headlen(skb) - len;
				1320	if (nsize < 0)
				1321	nsize = 0;
				1322
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1323	/* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.
				1324	* We need some allowance to not penalize applications setting small
				1325	* SO_SNDBUF values.
				1326	* Also allow first and last skb in retransmit queue to be split.
				1327	*/
				1328	limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE);
				1329	if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
				1330	tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
				1331	skb != tcp_rtx_queue_head(sk) &&
				1332	skb != tcp_rtx_queue_tail(sk))) {
				1333	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
				1334	return -ENOMEM;
				1335	}
				1336
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1337	if (skb_unclone(skb, gfp))
				1338	return -ENOMEM;
				1339
				1340	/* Get a new skb... force flag on. */
				1341	buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
				1342	if (!buff)
				1343	return -ENOMEM; /* We'll just try again later. */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1344	skb_copy_decrypted(buff, skb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1345
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1346	sk_wmem_queued_add(sk, buff->truesize);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1347	sk_mem_charge(sk, buff->truesize);
				1348	nlen = skb->len - len - nsize;
				1349	buff->truesize += nlen;
				1350	skb->truesize -= nlen;
				1351
				1352	/* Correct the sequence numbers. */
				1353	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
				1354	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
				1355	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
				1356
				1357	/* PSH and FIN should only be set in the second packet. */
				1358	flags = TCP_SKB_CB(skb)->tcp_flags;
				1359	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN \| TCPHDR_PSH);
				1360	TCP_SKB_CB(buff)->tcp_flags = flags;
				1361	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
				1362	tcp_skb_fragment_eor(skb, buff);
				1363
				1364	skb_split(skb, buff, len);
				1365
				1366	buff->ip_summed = CHECKSUM_PARTIAL;
				1367
				1368	buff->tstamp = skb->tstamp;
				1369	tcp_fragment_tstamp(skb, buff);
				1370
				1371	old_factor = tcp_skb_pcount(skb);
				1372
				1373	/* Fix up tso_factor for both original and new SKB. */
				1374	tcp_set_skb_tso_segs(skb, mss_now);
				1375	tcp_set_skb_tso_segs(buff, mss_now);
				1376
				1377	/* Update delivered info for the new segment */
				1378	TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
				1379
				1380	/* If this packet has been sent out already, we must
				1381	* adjust the various packet counters.
				1382	*/
				1383	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
				1384	int diff = old_factor - tcp_skb_pcount(skb) -
				1385	tcp_skb_pcount(buff);
				1386
				1387	if (diff)
				1388	tcp_adjust_pcount(sk, skb, diff);
				1389	}
				1390
				1391	/* Link BUFF into the send queue. */
				1392	__skb_header_release(buff);
				1393	tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
				1394	if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
				1395	list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
				1396
				1397	return 0;
				1398	}
				1399
				1400	/* This is similar to __pskb_pull_tail(). The difference is that pulled
				1401	* data is not copied, but immediately discarded.
				1402	*/
				1403	static int __pskb_trim_head(struct sk_buff *skb, int len)
				1404	{
				1405	struct skb_shared_info *shinfo;
				1406	int i, k, eat;
				1407
				1408	eat = min_t(int, len, skb_headlen(skb));
				1409	if (eat) {
				1410	__skb_pull(skb, eat);
				1411	len -= eat;
				1412	if (!len)
				1413	return 0;
				1414	}
				1415	eat = len;
				1416	k = 0;
				1417	shinfo = skb_shinfo(skb);
				1418	for (i = 0; i < shinfo->nr_frags; i++) {
				1419	int size = skb_frag_size(&shinfo->frags[i]);
				1420
				1421	if (size <= eat) {
				1422	skb_frag_unref(skb, i);
				1423	eat -= size;
				1424	} else {
				1425	shinfo->frags[k] = shinfo->frags[i];
				1426	if (eat) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1427	skb_frag_off_add(&shinfo->frags[k], eat);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1428	skb_frag_size_sub(&shinfo->frags[k], eat);
				1429	eat = 0;
				1430	}
				1431	k++;
				1432	}
				1433	}
				1434	shinfo->nr_frags = k;
				1435
				1436	skb->data_len -= len;
				1437	skb->len = skb->data_len;
				1438	return len;
				1439	}
				1440
				1441	/* Remove acked data from a packet in the transmit queue. */
				1442	int tcp_trim_head(struct sock sk, struct sk_buff skb, u32 len)
				1443	{
				1444	u32 delta_truesize;
				1445
				1446	if (skb_unclone(skb, GFP_ATOMIC))
				1447	return -ENOMEM;
				1448
				1449	delta_truesize = __pskb_trim_head(skb, len);
				1450
				1451	TCP_SKB_CB(skb)->seq += len;
				1452	skb->ip_summed = CHECKSUM_PARTIAL;
				1453
				1454	if (delta_truesize) {
				1455	skb->truesize -= delta_truesize;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1456	sk_wmem_queued_add(sk, -delta_truesize);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1457	sk_mem_uncharge(sk, delta_truesize);
				1458	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
				1459	}
				1460
				1461	/* Any change of skb->len requires recalculation of tso factor. */
				1462	if (tcp_skb_pcount(skb) > 1)
				1463	tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
				1464
				1465	return 0;
				1466	}
				1467
				1468	/* Calculate MSS not accounting any TCP options. */
				1469	static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
				1470	{
				1471	const struct tcp_sock *tp = tcp_sk(sk);
				1472	const struct inet_connection_sock *icsk = inet_csk(sk);
				1473	int mss_now;
				1474
				1475	/* Calculate base mss without TCP options:
				1476	It is MMS_S - sizeof(tcphdr) of rfc1122
				1477	*/
				1478	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
				1479
				1480	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
				1481	if (icsk->icsk_af_ops->net_frag_header_len) {
				1482	const struct dst_entry *dst = __sk_dst_get(sk);
				1483
				1484	if (dst && dst_allfrag(dst))
				1485	mss_now -= icsk->icsk_af_ops->net_frag_header_len;
				1486	}
				1487
				1488	/* Clamp it (mss_clamp does not include tcp options) */
				1489	if (mss_now > tp->rx_opt.mss_clamp)
				1490	mss_now = tp->rx_opt.mss_clamp;
				1491
				1492	/* Now subtract optional transport overhead */
				1493	mss_now -= icsk->icsk_ext_hdr_len;
				1494
				1495	/* Then reserve room for full set of TCP options and 8 bytes of data */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1496	mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1497	return mss_now;
				1498	}
				1499
				1500	/* Calculate MSS. Not accounting for SACKs here. */
				1501	int tcp_mtu_to_mss(struct sock *sk, int pmtu)
				1502	{
				1503	/* Subtract TCP options size, not including SACKs */
				1504	return __tcp_mtu_to_mss(sk, pmtu) -
				1505	(tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
				1506	}
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1507	EXPORT_SYMBOL(tcp_mtu_to_mss);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1508
				1509	/* Inverse of above */
				1510	int tcp_mss_to_mtu(struct sock *sk, int mss)
				1511	{
				1512	const struct tcp_sock *tp = tcp_sk(sk);
				1513	const struct inet_connection_sock *icsk = inet_csk(sk);
				1514	int mtu;
				1515
				1516	mtu = mss +
				1517	tp->tcp_header_len +
				1518	icsk->icsk_ext_hdr_len +
				1519	icsk->icsk_af_ops->net_header_len;
				1520
				1521	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
				1522	if (icsk->icsk_af_ops->net_frag_header_len) {
				1523	const struct dst_entry *dst = __sk_dst_get(sk);
				1524
				1525	if (dst && dst_allfrag(dst))
				1526	mtu += icsk->icsk_af_ops->net_frag_header_len;
				1527	}
				1528	return mtu;
				1529	}
				1530	EXPORT_SYMBOL(tcp_mss_to_mtu);
				1531
				1532	/* MTU probing init per socket */
				1533	void tcp_mtup_init(struct sock *sk)
				1534	{
				1535	struct tcp_sock *tp = tcp_sk(sk);
				1536	struct inet_connection_sock *icsk = inet_csk(sk);
				1537	struct net *net = sock_net(sk);
				1538
				1539	icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
				1540	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
				1541	icsk->icsk_af_ops->net_header_len;
				1542	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
				1543	icsk->icsk_mtup.probe_size = 0;
				1544	if (icsk->icsk_mtup.enabled)
				1545	icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
				1546	}
				1547	EXPORT_SYMBOL(tcp_mtup_init);
				1548
				1549	/* This function synchronize snd mss to current pmtu/exthdr set.
				1550
				1551	tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
				1552	for TCP options, but includes only bare TCP header.
				1553
				1554	tp->rx_opt.mss_clamp is mss negotiated at connection setup.
				1555	It is minimum of user_mss and mss received with SYN.
				1556	It also does not include TCP options.
				1557
				1558	inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
				1559
				1560	tp->mss_cache is current effective sending mss, including
				1561	all tcp options except for SACKs. It is evaluated,
				1562	taking into account current pmtu, but never exceeds
				1563	tp->rx_opt.mss_clamp.
				1564
				1565	NOTE1. rfc1122 clearly states that advertised MSS
				1566	DOES NOT include either tcp or ip options.
				1567
				1568	NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
				1569	are READ ONLY outside this function. --ANK (980731)
				1570	*/
				1571	unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
				1572	{
				1573	struct tcp_sock *tp = tcp_sk(sk);
				1574	struct inet_connection_sock *icsk = inet_csk(sk);
				1575	int mss_now;
				1576
				1577	if (icsk->icsk_mtup.search_high > pmtu)
				1578	icsk->icsk_mtup.search_high = pmtu;
				1579
				1580	mss_now = tcp_mtu_to_mss(sk, pmtu);
				1581	mss_now = tcp_bound_to_half_wnd(tp, mss_now);
				1582
				1583	/* And store cached results */
				1584	icsk->icsk_pmtu_cookie = pmtu;
				1585	if (icsk->icsk_mtup.enabled)
				1586	mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
				1587	tp->mss_cache = mss_now;
				1588
				1589	return mss_now;
				1590	}
				1591	EXPORT_SYMBOL(tcp_sync_mss);
				1592
				1593	/* Compute the current effective MSS, taking SACKs and IP options,
				1594	* and even PMTU discovery events into account.
				1595	*/
				1596	unsigned int tcp_current_mss(struct sock *sk)
				1597	{
				1598	const struct tcp_sock *tp = tcp_sk(sk);
				1599	const struct dst_entry *dst = __sk_dst_get(sk);
				1600	u32 mss_now;
				1601	unsigned int header_len;
				1602	struct tcp_out_options opts;
				1603	struct tcp_md5sig_key *md5;
				1604
				1605	mss_now = tp->mss_cache;
				1606
				1607	if (dst) {
				1608	u32 mtu = dst_mtu(dst);
				1609	if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
				1610	mss_now = tcp_sync_mss(sk, mtu);
				1611	}
				1612
				1613	header_len = tcp_established_options(sk, NULL, &opts, &md5) +
				1614	sizeof(struct tcphdr);
				1615	/* The mss_cache is sized based on tp->tcp_header_len, which assumes
				1616	* some common options. If this is an odd packet (because we have SACK
				1617	* blocks etc) then our calculated header_len will be different, and
				1618	* we have to adjust mss_now correspondingly */
				1619	if (header_len != tp->tcp_header_len) {
				1620	int delta = (int) header_len - tp->tcp_header_len;
				1621	mss_now -= delta;
				1622	}
				1623
				1624	return mss_now;
				1625	}
				1626
				1627	/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
				1628	* As additional protections, we do not touch cwnd in retransmission phases,
				1629	* and if application hit its sndbuf limit recently.
				1630	*/
				1631	static void tcp_cwnd_application_limited(struct sock *sk)
				1632	{
				1633	struct tcp_sock *tp = tcp_sk(sk);
				1634
				1635	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
				1636	sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
				1637	/* Limited by application or receiver window. */
				1638	u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
				1639	u32 win_used = max(tp->snd_cwnd_used, init_win);
				1640	if (win_used < tp->snd_cwnd) {
				1641	tp->snd_ssthresh = tcp_current_ssthresh(sk);
				1642	tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
				1643	}
				1644	tp->snd_cwnd_used = 0;
				1645	}
				1646	tp->snd_cwnd_stamp = tcp_jiffies32;
				1647	}
				1648
				1649	static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
				1650	{
				1651	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
				1652	struct tcp_sock *tp = tcp_sk(sk);
				1653
				1654	/* Track the maximum number of outstanding packets in each
				1655	* window, and remember whether we were cwnd-limited then.
				1656	*/
				1657	if (!before(tp->snd_una, tp->max_packets_seq) \|\|
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1658	tp->packets_out > tp->max_packets_out \|\|
				1659	is_cwnd_limited) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1660	tp->max_packets_out = tp->packets_out;
				1661	tp->max_packets_seq = tp->snd_nxt;
				1662	tp->is_cwnd_limited = is_cwnd_limited;
				1663	}
				1664
				1665	if (tcp_is_cwnd_limited(sk)) {
				1666	/* Network is feed fully. */
				1667	tp->snd_cwnd_used = 0;
				1668	tp->snd_cwnd_stamp = tcp_jiffies32;
				1669	} else {
				1670	/* Network starves. */
				1671	if (tp->packets_out > tp->snd_cwnd_used)
				1672	tp->snd_cwnd_used = tp->packets_out;
				1673
				1674	if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
				1675	(s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
				1676	!ca_ops->cong_control)
				1677	tcp_cwnd_application_limited(sk);
				1678
				1679	/* The following conditions together indicate the starvation
				1680	* is caused by insufficient sender buffer:
				1681	* 1) just sent some data (see tcp_write_xmit)
				1682	* 2) not cwnd limited (this else condition)
				1683	* 3) no more data to send (tcp_write_queue_empty())
				1684	* 4) application is hitting buffer limit (SOCK_NOSPACE)
				1685	*/
				1686	if (tcp_write_queue_empty(sk) && sk->sk_socket &&
				1687	test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
				1688	(1 << sk->sk_state) & (TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
				1689	tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
				1690	}
				1691	}
				1692
				1693	/* Minshall's variant of the Nagle send check. */
				1694	static bool tcp_minshall_check(const struct tcp_sock *tp)
				1695	{
				1696	return after(tp->snd_sml, tp->snd_una) &&
				1697	!after(tp->snd_sml, tp->snd_nxt);
				1698	}
				1699
				1700	/* Update snd_sml if this skb is under mss
				1701	* Note that a TSO packet might end with a sub-mss segment
				1702	* The test is really :
				1703	* if ((skb->len % mss) != 0)
				1704	* tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
				1705	* But we can avoid doing the divide again given we already have
				1706	* skb_pcount = skb->len / mss_now
				1707	*/
				1708	static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
				1709	const struct sk_buff *skb)
				1710	{
				1711	if (skb->len < tcp_skb_pcount(skb) * mss_now)
				1712	tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
				1713	}
				1714
				1715	/* Return false, if packet can be sent now without violation Nagle's rules:
				1716	* 1. It is full sized. (provided by caller in %partial bool)
				1717	* 2. Or it contains FIN. (already checked by caller)
				1718	* 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
				1719	* 4. Or TCP_CORK is not set, and all sent packets are ACKed.
				1720	* With Minshall's modification: all sent small packets are ACKed.
				1721	*/
				1722	static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
				1723	int nonagle)
				1724	{
				1725	return partial &&
				1726	((nonagle & TCP_NAGLE_CORK) \|\|
				1727	(!nonagle && tp->packets_out && tcp_minshall_check(tp)));
				1728	}
				1729
				1730	/* Return how many segs we'd like on a TSO packet,
				1731	* to send one TSO packet per ms
				1732	*/
				1733	static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
				1734	int min_tso_segs)
				1735	{
				1736	u32 bytes, segs;
				1737
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1738	bytes = min_t(unsigned long,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1739	sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1740	sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1741
				1742	/* Goal is to send at least one packet per ms,
				1743	* not one big TSO packet every 100 ms.
				1744	* This preserves ACK clocking and is consistent
				1745	* with tcp_tso_should_defer() heuristic.
				1746	*/
				1747	segs = max_t(u32, bytes / mss_now, min_tso_segs);
				1748
				1749	return segs;
				1750	}
				1751
				1752	/* Return the number of segments we want in the skb we are transmitting.
				1753	* See if congestion control module wants to decide; otherwise, autosize.
				1754	*/
				1755	static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
				1756	{
				1757	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
				1758	u32 min_tso, tso_segs;
				1759
				1760	min_tso = ca_ops->min_tso_segs ?
				1761	ca_ops->min_tso_segs(sk) :
				1762	sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;
				1763
				1764	tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
				1765	return min_t(u32, tso_segs, sk->sk_gso_max_segs);
				1766	}
				1767
				1768	/* Returns the portion of skb which can be sent right away */
				1769	static unsigned int tcp_mss_split_point(const struct sock *sk,
				1770	const struct sk_buff *skb,
				1771	unsigned int mss_now,
				1772	unsigned int max_segs,
				1773	int nonagle)
				1774	{
				1775	const struct tcp_sock *tp = tcp_sk(sk);
				1776	u32 partial, needed, window, max_len;
				1777
				1778	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
				1779	max_len = mss_now * max_segs;
				1780
				1781	if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
				1782	return max_len;
				1783
				1784	needed = min(skb->len, window);
				1785
				1786	if (max_len <= needed)
				1787	return max_len;
				1788
				1789	partial = needed % mss_now;
				1790	/* If last segment is not a full MSS, check if Nagle rules allow us
				1791	* to include this last segment in this skb.
				1792	* Otherwise, we'll split the skb at last MSS boundary
				1793	*/
				1794	if (tcp_nagle_check(partial != 0, tp, nonagle))
				1795	return needed - partial;
				1796
				1797	return needed;
				1798	}
				1799
				1800	/* Can at least one segment of SKB be sent right now, according to the
				1801	* congestion window rules? If so, return how many segments are allowed.
				1802	*/
				1803	static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
				1804	const struct sk_buff *skb)
				1805	{
				1806	u32 in_flight, cwnd, halfcwnd;
				1807
				1808	/* Don't be strict about the congestion window for the final FIN. */
				1809	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
				1810	tcp_skb_pcount(skb) == 1)
				1811	return 1;
				1812
				1813	in_flight = tcp_packets_in_flight(tp);
				1814	cwnd = tp->snd_cwnd;
				1815	if (in_flight >= cwnd)
				1816	return 0;
				1817
				1818	/* For better scheduling, ensure we have at least
				1819	* 2 GSO packets in flight.
				1820	*/
				1821	halfcwnd = max(cwnd >> 1, 1U);
				1822	return min(halfcwnd, cwnd - in_flight);
				1823	}
				1824
				1825	/* Initialize TSO state of a skb.
				1826	* This must be invoked the first time we consider transmitting
				1827	* SKB onto the wire.
				1828	*/
				1829	static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
				1830	{
				1831	int tso_segs = tcp_skb_pcount(skb);
				1832
				1833	if (!tso_segs \|\| (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
				1834	tcp_set_skb_tso_segs(skb, mss_now);
				1835	tso_segs = tcp_skb_pcount(skb);
				1836	}
				1837	return tso_segs;
				1838	}
				1839
				1840
				1841	/* Return true if the Nagle test allows this packet to be
				1842	* sent now.
				1843	*/
				1844	static inline bool tcp_nagle_test(const struct tcp_sock tp, const struct sk_buff skb,
				1845	unsigned int cur_mss, int nonagle)
				1846	{
				1847	/* Nagle rule does not apply to frames, which sit in the middle of the
				1848	* write_queue (they have no chances to get new data).
				1849	*
				1850	* This is implemented in the callers, where they modify the 'nonagle'
				1851	* argument based upon the location of SKB in the send queue.
				1852	*/
				1853	if (nonagle & TCP_NAGLE_PUSH)
				1854	return true;
				1855
				1856	/* Don't use the nagle rule for urgent data (or for the final FIN). */
				1857	if (tcp_urg_mode(tp) \|\| (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				1858	return true;
				1859
				1860	if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
				1861	return true;
				1862
				1863	return false;
				1864	}
				1865
				1866	/* Does at least the first segment of SKB fit into the send window? */
				1867	static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
				1868	const struct sk_buff *skb,
				1869	unsigned int cur_mss)
				1870	{
				1871	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
				1872
				1873	if (skb->len > cur_mss)
				1874	end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
				1875
				1876	return !after(end_seq, tcp_wnd_end(tp));
				1877	}
				1878
				1879	/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
				1880	* which is put after SKB on the list. It is very much like
				1881	* tcp_fragment() except that it may make several kinds of assumptions
				1882	* in order to speed up the splitting operation. In particular, we
				1883	* know that all the data is in scatter-gather pages, and that the
				1884	* packet has never been sent out before (and thus is not cloned).
				1885	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1886	static int tso_fragment(struct sock sk, struct sk_buff skb, unsigned int len,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1887	unsigned int mss_now, gfp_t gfp)
				1888	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1889	int nlen = skb->len - len;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1890	struct sk_buff *buff;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1891	u8 flags;
				1892
				1893	/* All of a TSO frame must be composed of paged data. */
				1894	if (skb->len != skb->data_len)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1895	return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
				1896	skb, len, mss_now, gfp);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1897
				1898	buff = sk_stream_alloc_skb(sk, 0, gfp, true);
				1899	if (unlikely(!buff))
				1900	return -ENOMEM;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1901	skb_copy_decrypted(buff, skb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1902
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1903	sk_wmem_queued_add(sk, buff->truesize);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1904	sk_mem_charge(sk, buff->truesize);
				1905	buff->truesize += nlen;
				1906	skb->truesize -= nlen;
				1907
				1908	/* Correct the sequence numbers. */
				1909	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
				1910	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
				1911	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
				1912
				1913	/* PSH and FIN should only be set in the second packet. */
				1914	flags = TCP_SKB_CB(skb)->tcp_flags;
				1915	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN \| TCPHDR_PSH);
				1916	TCP_SKB_CB(buff)->tcp_flags = flags;
				1917
				1918	/* This packet was never sent out yet, so no SACK bits. */
				1919	TCP_SKB_CB(buff)->sacked = 0;
				1920
				1921	tcp_skb_fragment_eor(skb, buff);
				1922
				1923	buff->ip_summed = CHECKSUM_PARTIAL;
				1924	skb_split(skb, buff, len);
				1925	tcp_fragment_tstamp(skb, buff);
				1926
				1927	/* Fix up tso_factor for both original and new SKB. */
				1928	tcp_set_skb_tso_segs(skb, mss_now);
				1929	tcp_set_skb_tso_segs(buff, mss_now);
				1930
				1931	/* Link BUFF into the send queue. */
				1932	__skb_header_release(buff);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1933	tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1934
				1935	return 0;
				1936	}
				1937
				1938	/* Try to defer sending, if possible, in order to minimize the amount
				1939	* of TSO splitting we do. View it as a kind of TSO Nagle test.
				1940	*
				1941	* This algorithm is from John Heffner.
				1942	*/
				1943	static bool tcp_tso_should_defer(struct sock sk, struct sk_buff skb,
				1944	bool *is_cwnd_limited,
				1945	bool *is_rwnd_limited,
				1946	u32 max_segs)
				1947	{
				1948	const struct inet_connection_sock *icsk = inet_csk(sk);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1949	u32 send_win, cong_win, limit, in_flight;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1950	struct tcp_sock *tp = tcp_sk(sk);
				1951	struct sk_buff *head;
				1952	int win_divisor;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1953	s64 delta;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1954
				1955	if (icsk->icsk_ca_state >= TCP_CA_Recovery)
				1956	goto send_now;
				1957
				1958	/* Avoid bursty behavior by allowing defer
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1959	* only if the last write was recent (1 ms).
				1960	* Note that tp->tcp_wstamp_ns can be in the future if we have
				1961	* packets waiting in a qdisc or device for EDT delivery.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1962	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1963	delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
				1964	if (delta > 0)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1965	goto send_now;
				1966
				1967	in_flight = tcp_packets_in_flight(tp);
				1968
				1969	BUG_ON(tcp_skb_pcount(skb) <= 1);
				1970	BUG_ON(tp->snd_cwnd <= in_flight);
				1971
				1972	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
				1973
				1974	/* From in_flight test above, we know that cwnd > in_flight. */
				1975	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
				1976
				1977	limit = min(send_win, cong_win);
				1978
				1979	/* If a full-sized TSO skb can be sent, do it. */
				1980	if (limit >= max_segs * tp->mss_cache)
				1981	goto send_now;
				1982
				1983	/* Middle in queue won't get any more data, full sendable already? */
				1984	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
				1985	goto send_now;
				1986
				1987	win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
				1988	if (win_divisor) {
				1989	u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
				1990
				1991	/* If at least some fraction of a window is available,
				1992	* just use it.
				1993	*/
				1994	chunk /= win_divisor;
				1995	if (limit >= chunk)
				1996	goto send_now;
				1997	} else {
				1998	/* Different approach, try not to defer past a single
				1999	* ACK. Receiver should ACK every other full sized
				2000	* frame, so if we have space for more than 3 frames
				2001	* then send now.
				2002	*/
				2003	if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
				2004	goto send_now;
				2005	}
				2006
				2007	/* TODO : use tsorted_sent_queue ? */
				2008	head = tcp_rtx_queue_head(sk);
				2009	if (!head)
				2010	goto send_now;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2011	delta = tp->tcp_clock_cache - head->tstamp;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2012	/* If next ACK is likely to come too late (half srtt), do not defer */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2013	if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2014	goto send_now;
				2015
				2016	/* Ok, it looks like it is advisable to defer.
				2017	* Three cases are tracked :
				2018	* 1) We are cwnd-limited
				2019	* 2) We are rwnd-limited
				2020	* 3) We are application limited.
				2021	*/
				2022	if (cong_win < send_win) {
				2023	if (cong_win <= skb->len) {
				2024	*is_cwnd_limited = true;
				2025	return true;
				2026	}
				2027	} else {
				2028	if (send_win <= skb->len) {
				2029	*is_rwnd_limited = true;
				2030	return true;
				2031	}
				2032	}
				2033
				2034	/* If this packet won't get more data, do not wait. */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2035	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) \|\|
				2036	TCP_SKB_CB(skb)->eor)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2037	goto send_now;
				2038
				2039	return true;
				2040
				2041	send_now:
				2042	return false;
				2043	}
				2044
				2045	static inline void tcp_mtu_check_reprobe(struct sock *sk)
				2046	{
				2047	struct inet_connection_sock *icsk = inet_csk(sk);
				2048	struct tcp_sock *tp = tcp_sk(sk);
				2049	struct net *net = sock_net(sk);
				2050	u32 interval;
				2051	s32 delta;
				2052
				2053	interval = net->ipv4.sysctl_tcp_probe_interval;
				2054	delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
				2055	if (unlikely(delta >= interval * HZ)) {
				2056	int mss = tcp_current_mss(sk);
				2057
				2058	/* Update current search range */
				2059	icsk->icsk_mtup.probe_size = 0;
				2060	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
				2061	sizeof(struct tcphdr) +
				2062	icsk->icsk_af_ops->net_header_len;
				2063	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
				2064
				2065	/* Update probe time stamp */
				2066	icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
				2067	}
				2068	}
				2069
				2070	static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
				2071	{
				2072	struct sk_buff skb, next;
				2073
				2074	skb = tcp_send_head(sk);
				2075	tcp_for_write_queue_from_safe(skb, next, sk) {
				2076	if (len <= skb->len)
				2077	break;
				2078
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2079	if (unlikely(TCP_SKB_CB(skb)->eor) \|\| tcp_has_tx_tstamp(skb))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2080	return false;
				2081
				2082	len -= skb->len;
				2083	}
				2084
				2085	return true;
				2086	}
				2087
				2088	/* Create a new MTU probe if we are ready.
				2089	* MTU probe is regularly attempting to increase the path MTU by
				2090	* deliberately sending larger packets. This discovers routing
				2091	* changes resulting in larger path MTUs.
				2092	*
				2093	* Returns 0 if we should wait to probe (no cwnd available),
				2094	* 1 if a probe was sent,
				2095	* -1 otherwise
				2096	*/
				2097	static int tcp_mtu_probe(struct sock *sk)
				2098	{
				2099	struct inet_connection_sock *icsk = inet_csk(sk);
				2100	struct tcp_sock *tp = tcp_sk(sk);
				2101	struct sk_buff skb, nskb, *next;
				2102	struct net *net = sock_net(sk);
				2103	int probe_size;
				2104	int size_needed;
				2105	int copy, len;
				2106	int mss_now;
				2107	int interval;
				2108
				2109	/* Not currently probing/verifying,
				2110	* not in recovery,
				2111	* have enough cwnd, and
				2112	* not SACKing (the variable headers throw things off)
				2113	*/
				2114	if (likely(!icsk->icsk_mtup.enabled \|\|
				2115	icsk->icsk_mtup.probe_size \|\|
				2116	inet_csk(sk)->icsk_ca_state != TCP_CA_Open \|\|
				2117	tp->snd_cwnd < 11 \|\|
				2118	tp->rx_opt.num_sacks \|\| tp->rx_opt.dsack))
				2119	return -1;
				2120
				2121	/* Use binary search for probe_size between tcp_mss_base,
				2122	* and current mss_clamp. if (search_high - search_low)
				2123	* smaller than a threshold, backoff from probing.
				2124	*/
				2125	mss_now = tcp_current_mss(sk);
				2126	probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
				2127	icsk->icsk_mtup.search_low) >> 1);
				2128	size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
				2129	interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
				2130	/* When misfortune happens, we are reprobing actively,
				2131	* and then reprobe timer has expired. We stick with current
				2132	* probing process by not resetting search range to its orignal.
				2133	*/
				2134	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) \|\|
				2135	interval < net->ipv4.sysctl_tcp_probe_threshold) {
				2136	/* Check whether enough time has elaplased for
				2137	* another round of probing.
				2138	*/
				2139	tcp_mtu_check_reprobe(sk);
				2140	return -1;
				2141	}
				2142
				2143	/* Have enough data in the send queue to probe? */
				2144	if (tp->write_seq - tp->snd_nxt < size_needed)
				2145	return -1;
				2146
				2147	if (tp->snd_wnd < size_needed)
				2148	return -1;
				2149	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
				2150	return 0;
				2151
				2152	/* Do we need to wait to drain cwnd? With none in flight, don't stall */
				2153	if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
				2154	if (!tcp_packets_in_flight(tp))
				2155	return -1;
				2156	else
				2157	return 0;
				2158	}
				2159
				2160	if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
				2161	return -1;
				2162
				2163	/* We're allowed to probe. Build it now. */
				2164	nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
				2165	if (!nskb)
				2166	return -1;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2167	sk_wmem_queued_add(sk, nskb->truesize);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2168	sk_mem_charge(sk, nskb->truesize);
				2169
				2170	skb = tcp_send_head(sk);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2171	skb_copy_decrypted(nskb, skb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2172
				2173	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
				2174	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
				2175	TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
				2176	TCP_SKB_CB(nskb)->sacked = 0;
				2177	nskb->csum = 0;
				2178	nskb->ip_summed = CHECKSUM_PARTIAL;
				2179
				2180	tcp_insert_write_queue_before(nskb, skb, sk);
				2181	tcp_highest_sack_replace(sk, skb, nskb);
				2182
				2183	len = 0;
				2184	tcp_for_write_queue_from_safe(skb, next, sk) {
				2185	copy = min_t(int, skb->len, probe_size - len);
				2186	skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
				2187
				2188	if (skb->len <= copy) {
				2189	/* We've eaten all the data from this skb.
				2190	* Throw it away. */
				2191	TCP_SKB_CB(nskb)->tcp_flags \|= TCP_SKB_CB(skb)->tcp_flags;
				2192	/* If this is the last SKB we copy and eor is set
				2193	* we need to propagate it to the new skb.
				2194	*/
				2195	TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2196	tcp_skb_collapse_tstamp(nskb, skb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2197	tcp_unlink_write_queue(skb, sk);
				2198	sk_wmem_free_skb(sk, skb);
				2199	} else {
				2200	TCP_SKB_CB(nskb)->tcp_flags \|= TCP_SKB_CB(skb)->tcp_flags &
				2201	~(TCPHDR_FIN\|TCPHDR_PSH);
				2202	if (!skb_shinfo(skb)->nr_frags) {
				2203	skb_pull(skb, copy);
				2204	} else {
				2205	__pskb_trim_head(skb, copy);
				2206	tcp_set_skb_tso_segs(skb, mss_now);
				2207	}
				2208	TCP_SKB_CB(skb)->seq += copy;
				2209	}
				2210
				2211	len += copy;
				2212
				2213	if (len >= probe_size)
				2214	break;
				2215	}
				2216	tcp_init_tso_segs(nskb, nskb->len);
				2217
				2218	/* We're ready to send. If this fails, the probe will
				2219	* be resegmented into mss-sized pieces by tcp_write_xmit().
				2220	*/
				2221	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
				2222	/* Decrement cwnd here because we are sending
				2223	* effectively two packets. */
				2224	tp->snd_cwnd--;
				2225	tcp_event_new_data_sent(sk, nskb);
				2226
				2227	icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
				2228	tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
				2229	tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
				2230
				2231	return 1;
				2232	}
				2233
				2234	return -1;
				2235	}
				2236
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2237	static bool tcp_pacing_check(struct sock *sk)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2238	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2239	struct tcp_sock *tp = tcp_sk(sk);
				2240
				2241	if (!tcp_needs_internal_pacing(sk))
				2242	return false;
				2243
				2244	if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
				2245	return false;
				2246
				2247	if (!hrtimer_is_queued(&tp->pacing_timer)) {
				2248	hrtimer_start(&tp->pacing_timer,
				2249	ns_to_ktime(tp->tcp_wstamp_ns),
				2250	HRTIMER_MODE_ABS_PINNED_SOFT);
				2251	sock_hold(sk);
				2252	}
				2253	return true;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2254	}
				2255
				2256	/* TCP Small Queues :
				2257	* Control number of packets in qdisc/devices to two packets / or ~1 ms.
				2258	* (These limits are doubled for retransmits)
				2259	* This allows for :
				2260	* - better RTT estimation and ACK scheduling
				2261	* - faster recovery
				2262	* - high rates
				2263	* Alas, some drivers / subsystems require a fair amount
				2264	* of queued bytes to ensure line rate.
				2265	* One example is wifi aggregation (802.11 AMPDU)
				2266	*/
				2267	static bool tcp_small_queue_check(struct sock sk, const struct sk_buff skb,
				2268	unsigned int factor)
				2269	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2270	unsigned long limit;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2271
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2272	limit = max_t(unsigned long,
				2273	2 * skb->truesize,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2274	sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2275	if (sk->sk_pacing_status == SK_PACING_NONE)
				2276	limit = min_t(unsigned long, limit,
				2277	sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2278	limit <<= factor;
				2279
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2280	if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
				2281	tcp_sk(sk)->tcp_tx_delay) {
				2282	u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
				2283
				2284	/* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
				2285	* approximate our needs assuming an ~100% skb->truesize overhead.
				2286	* USEC_PER_SEC is approximated by 2^20.
				2287	* do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
				2288	*/
				2289	extra_bytes >>= (20 - 1);
				2290	limit += extra_bytes;
				2291	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2292	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
				2293	/* Always send skb if rtx queue is empty.
				2294	* No need to wait for TX completion to call us back,
				2295	* after softirq/tasklet schedule.
				2296	* This helps when TX completions are delayed too much.
				2297	*/
				2298	if (tcp_rtx_queue_empty(sk))
				2299	return false;
				2300
				2301	set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
				2302	/* It is possible TX completion already happened
				2303	* before we set TSQ_THROTTLED, so we must
				2304	* test again the condition.
				2305	*/
				2306	smp_mb__after_atomic();
				2307	if (refcount_read(&sk->sk_wmem_alloc) > limit)
				2308	return true;
				2309	}
				2310	return false;
				2311	}
				2312
				2313	static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
				2314	{
				2315	const u32 now = tcp_jiffies32;
				2316	enum tcp_chrono old = tp->chrono_type;
				2317
				2318	if (old > TCP_CHRONO_UNSPEC)
				2319	tp->chrono_stat[old - 1] += now - tp->chrono_start;
				2320	tp->chrono_start = now;
				2321	tp->chrono_type = new;
				2322	}
				2323
				2324	void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
				2325	{
				2326	struct tcp_sock *tp = tcp_sk(sk);
				2327
				2328	/* If there are multiple conditions worthy of tracking in a
				2329	* chronograph then the highest priority enum takes precedence
				2330	* over the other conditions. So that if something "more interesting"
				2331	* starts happening, stop the previous chrono and start a new one.
				2332	*/
				2333	if (type > tp->chrono_type)
				2334	tcp_chrono_set(tp, type);
				2335	}
				2336
				2337	void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
				2338	{
				2339	struct tcp_sock *tp = tcp_sk(sk);
				2340
				2341
				2342	/* There are multiple conditions worthy of tracking in a
				2343	* chronograph, so that the highest priority enum takes
				2344	* precedence over the other conditions (see tcp_chrono_start).
				2345	* If a condition stops, we only stop chrono tracking if
				2346	* it's the "most interesting" or current chrono we are
				2347	* tracking and starts busy chrono if we have pending data.
				2348	*/
				2349	if (tcp_rtx_and_write_queues_empty(sk))
				2350	tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
				2351	else if (type == tp->chrono_type)
				2352	tcp_chrono_set(tp, TCP_CHRONO_BUSY);
				2353	}
				2354
				2355	/* This routine writes packets to the network. It advances the
				2356	* send_head. This happens as incoming acks open up the remote
				2357	* window for us.
				2358	*
				2359	* LARGESEND note: !tcp_urg_mode is overkill, only frames between
				2360	* snd_up-64k-mss .. snd_up cannot be large. However, taking into
				2361	* account rare use of URG, this is not a big flaw.
				2362	*
				2363	* Send at most one packet when push_one > 0. Temporarily ignore
				2364	* cwnd limit to force at most one packet out when push_one == 2.
				2365
				2366	* Returns true, if no segments are in flight and we have queued segments,
				2367	* but cannot send anything now because of SWS or another problem.
				2368	*/
				2369	static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
				2370	int push_one, gfp_t gfp)
				2371	{
				2372	struct tcp_sock *tp = tcp_sk(sk);
				2373	struct sk_buff *skb;
				2374	unsigned int tso_segs, sent_pkts;
				2375	int cwnd_quota;
				2376	int result;
				2377	bool is_cwnd_limited = false, is_rwnd_limited = false;
				2378	u32 max_segs;
				2379
				2380	sent_pkts = 0;
				2381
				2382	tcp_mstamp_refresh(tp);
				2383	if (!push_one) {
				2384	/* Do MTU probing. */
				2385	result = tcp_mtu_probe(sk);
				2386	if (!result) {
				2387	return false;
				2388	} else if (result > 0) {
				2389	sent_pkts = 1;
				2390	}
				2391	}
				2392
				2393	max_segs = tcp_tso_segs(sk, mss_now);
				2394	while ((skb = tcp_send_head(sk))) {
				2395	unsigned int limit;
				2396
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2397	if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
				2398	/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
				2399	skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
				2400	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
				2401	tcp_init_tso_segs(skb, mss_now);
				2402	goto repair; /* Skip network transmission */
				2403	}
				2404
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2405	if (tcp_pacing_check(sk))
				2406	break;
				2407
				2408	tso_segs = tcp_init_tso_segs(skb, mss_now);
				2409	BUG_ON(!tso_segs);
				2410
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2411	cwnd_quota = tcp_cwnd_test(tp, skb);
				2412	if (!cwnd_quota) {
				2413	if (push_one == 2)
				2414	/* Force out a loss probe pkt. */
				2415	cwnd_quota = 1;
				2416	else
				2417	break;
				2418	}
				2419
				2420	if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
				2421	is_rwnd_limited = true;
				2422	break;
				2423	}
				2424
				2425	if (tso_segs == 1) {
				2426	if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
				2427	(tcp_skb_is_last(sk, skb) ?
				2428	nonagle : TCP_NAGLE_PUSH))))
				2429	break;
				2430	} else {
				2431	if (!push_one &&
				2432	tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
				2433	&is_rwnd_limited, max_segs))
				2434	break;
				2435	}
				2436
				2437	limit = mss_now;
				2438	if (tso_segs > 1 && !tcp_urg_mode(tp))
				2439	limit = tcp_mss_split_point(sk, skb, mss_now,
				2440	min_t(unsigned int,
				2441	cwnd_quota,
				2442	max_segs),
				2443	nonagle);
				2444
				2445	if (skb->len > limit &&
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2446	unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2447	break;
				2448
				2449	if (tcp_small_queue_check(sk, skb, 0))
				2450	break;
				2451
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2452	/* Argh, we hit an empty skb(), presumably a thread
				2453	* is sleeping in sendmsg()/sk_stream_wait_memory().
				2454	* We do not want to send a pure-ack packet and have
				2455	* a strange looking rtx queue with empty packet(s).
				2456	*/
				2457	if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
				2458	break;
				2459
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2460	if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
				2461	break;
				2462
				2463	repair:
				2464	/* Advance the send_head. This one is sent out.
				2465	* This call will increment packets_out.
				2466	*/
				2467	tcp_event_new_data_sent(sk, skb);
				2468
				2469	tcp_minshall_update(tp, mss_now, skb);
				2470	sent_pkts += tcp_skb_pcount(skb);
				2471
				2472	if (push_one)
				2473	break;
				2474	}
				2475
				2476	if (is_rwnd_limited)
				2477	tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
				2478	else
				2479	tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
				2480
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2481	is_cwnd_limited \|= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
				2482	if (likely(sent_pkts \|\| is_cwnd_limited))
				2483	tcp_cwnd_validate(sk, is_cwnd_limited);
				2484
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2485	if (likely(sent_pkts)) {
				2486	if (tcp_in_cwnd_reduction(sk))
				2487	tp->prr_out += sent_pkts;
				2488
				2489	/* Send one loss probe per tail loss episode. */
				2490	if (push_one != 2)
				2491	tcp_schedule_loss_probe(sk, false);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2492	return false;
				2493	}
				2494	return !tp->packets_out && !tcp_write_queue_empty(sk);
				2495	}
				2496
				2497	bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
				2498	{
				2499	struct inet_connection_sock *icsk = inet_csk(sk);
				2500	struct tcp_sock *tp = tcp_sk(sk);
				2501	u32 timeout, rto_delta_us;
				2502	int early_retrans;
				2503
				2504	/* Don't do any loss probe on a Fast Open connection before 3WHS
				2505	* finishes.
				2506	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2507	if (rcu_access_pointer(tp->fastopen_rsk))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2508	return false;
				2509
				2510	early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
				2511	/* Schedule a loss probe in 2*RTT for SACK capable connections
				2512	* not in loss recovery, that are either limited by cwnd or application.
				2513	*/
				2514	if ((early_retrans != 3 && early_retrans != 4) \|\|
				2515	!tp->packets_out \|\| !tcp_is_sack(tp) \|\|
				2516	(icsk->icsk_ca_state != TCP_CA_Open &&
				2517	icsk->icsk_ca_state != TCP_CA_CWR))
				2518	return false;
				2519
				2520	/* Probe timeout is 2*rtt. Add minimum RTO to account
				2521	* for delayed ack when there's one outstanding packet. If no RTT
				2522	* sample is available then probe after TCP_TIMEOUT_INIT.
				2523	*/
				2524	if (tp->srtt_us) {
				2525	timeout = usecs_to_jiffies(tp->srtt_us >> 2);
				2526	if (tp->packets_out == 1)
				2527	timeout += TCP_RTO_MIN;
				2528	else
				2529	timeout += TCP_TIMEOUT_MIN;
				2530	} else {
				2531	timeout = TCP_TIMEOUT_INIT;
				2532	}
				2533
				2534	/* If the RTO formula yields an earlier time, then use that time. */
				2535	rto_delta_us = advancing_rto ?
				2536	jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
				2537	tcp_rto_delta_us(sk); /* How far in future is RTO? */
				2538	if (rto_delta_us > 0)
				2539	timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
				2540
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2541	tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
				2542	TCP_RTO_MAX, NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2543	return true;
				2544	}
				2545
				2546	/* Thanks to skb fast clones, we can detect if a prior transmit of
				2547	* a packet is still in a qdisc or driver queue.
				2548	* In this case, there is very little point doing a retransmit !
				2549	*/
				2550	static bool skb_still_in_host_queue(const struct sock *sk,
				2551	const struct sk_buff *skb)
				2552	{
				2553	if (unlikely(skb_fclone_busy(sk, skb))) {
				2554	NET_INC_STATS(sock_net(sk),
				2555	LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
				2556	return true;
				2557	}
				2558	return false;
				2559	}
				2560
				2561	/* When probe timeout (PTO) fires, try send a new segment if possible, else
				2562	* retransmit the last segment.
				2563	*/
				2564	void tcp_send_loss_probe(struct sock *sk)
				2565	{
				2566	struct tcp_sock *tp = tcp_sk(sk);
				2567	struct sk_buff *skb;
				2568	int pcount;
				2569	int mss = tcp_current_mss(sk);
				2570
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2571	/* At most one outstanding TLP */
				2572	if (tp->tlp_high_seq)
				2573	goto rearm_timer;
				2574
				2575	tp->tlp_retrans = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2576	skb = tcp_send_head(sk);
				2577	if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
				2578	pcount = tp->packets_out;
				2579	tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
				2580	if (tp->packets_out > pcount)
				2581	goto probe_sent;
				2582	goto rearm_timer;
				2583	}
				2584	skb = skb_rb_last(&sk->tcp_rtx_queue);
				2585	if (unlikely(!skb)) {
				2586	WARN_ONCE(tp->packets_out,
				2587	"invalid inflight: %u state %u cwnd %u mss %d\n",
				2588	tp->packets_out, sk->sk_state, tp->snd_cwnd, mss);
				2589	inet_csk(sk)->icsk_pending = 0;
				2590	return;
				2591	}
				2592
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2593	if (skb_still_in_host_queue(sk, skb))
				2594	goto rearm_timer;
				2595
				2596	pcount = tcp_skb_pcount(skb);
				2597	if (WARN_ON(!pcount))
				2598	goto rearm_timer;
				2599
				2600	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
				2601	if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
				2602	(pcount - 1) * mss, mss,
				2603	GFP_ATOMIC)))
				2604	goto rearm_timer;
				2605	skb = skb_rb_next(skb);
				2606	}
				2607
				2608	if (WARN_ON(!skb \|\| !tcp_skb_pcount(skb)))
				2609	goto rearm_timer;
				2610
				2611	if (__tcp_retransmit_skb(sk, skb, 1))
				2612	goto rearm_timer;
				2613
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2614	tp->tlp_retrans = 1;
				2615
				2616	probe_sent:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2617	/* Record snd_nxt for loss detection. */
				2618	tp->tlp_high_seq = tp->snd_nxt;
				2619
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2620	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
				2621	/* Reset s.t. tcp_rearm_rto will restart timer from now */
				2622	inet_csk(sk)->icsk_pending = 0;
				2623	rearm_timer:
				2624	tcp_rearm_rto(sk);
				2625	}
				2626
				2627	/* Push out any pending frames which were held back due to
				2628	* TCP_CORK or attempt at coalescing tiny packets.
				2629	* The socket must be locked by the caller.
				2630	*/
				2631	void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
				2632	int nonagle)
				2633	{
				2634	/* If we are closed, the bytes will have to remain here.
				2635	* In time closedown will finish, we empty the write queue and
				2636	* all will be happy.
				2637	*/
				2638	if (unlikely(sk->sk_state == TCP_CLOSE))
				2639	return;
				2640
				2641	if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
				2642	sk_gfp_mask(sk, GFP_ATOMIC)))
				2643	tcp_check_probe_timer(sk);
				2644	}
				2645
				2646	/* Send _single_ skb sitting at the send head. This function requires
				2647	* true push pending frames to setup probe timer etc.
				2648	*/
				2649	void tcp_push_one(struct sock *sk, unsigned int mss_now)
				2650	{
				2651	struct sk_buff *skb = tcp_send_head(sk);
				2652
				2653	BUG_ON(!skb \|\| skb->len < mss_now);
				2654
				2655	tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
				2656	}
				2657
				2658	/* This function returns the amount that we can raise the
				2659	* usable window based on the following constraints
				2660	*
				2661	* 1. The window can never be shrunk once it is offered (RFC 793)
				2662	* 2. We limit memory per socket
				2663	*
				2664	* RFC 1122:
				2665	* "the suggested [SWS] avoidance algorithm for the receiver is to keep
				2666	* RECV.NEXT + RCV.WIN fixed until:
				2667	* RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
				2668	*
				2669	* i.e. don't raise the right edge of the window until you can raise
				2670	* it at least MSS bytes.
				2671	*
				2672	* Unfortunately, the recommended algorithm breaks header prediction,
				2673	* since header prediction assumes th->window stays fixed.
				2674	*
				2675	* Strictly speaking, keeping th->window fixed violates the receiver
				2676	* side SWS prevention criteria. The problem is that under this rule
				2677	* a stream of single byte packets will cause the right side of the
				2678	* window to always advance by a single byte.
				2679	*
				2680	* Of course, if the sender implements sender side SWS prevention
				2681	* then this will not be a problem.
				2682	*
				2683	* BSD seems to make the following compromise:
				2684	*
				2685	* If the free space is less than the 1/4 of the maximum
				2686	* space available and the free space is less than 1/2 mss,
				2687	* then set the window to 0.
				2688	* [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
				2689	* Otherwise, just prevent the window from shrinking
				2690	* and from being larger than the largest representable value.
				2691	*
				2692	* This prevents incremental opening of the window in the regime
				2693	* where TCP is limited by the speed of the reader side taking
				2694	* data out of the TCP receive queue. It does nothing about
				2695	* those cases where the window is constrained on the sender side
				2696	* because the pipeline is full.
				2697	*
				2698	* BSD also seems to "accidentally" limit itself to windows that are a
				2699	* multiple of MSS, at least until the free space gets quite small.
				2700	* This would appear to be a side effect of the mbuf implementation.
				2701	* Combining these two algorithms results in the observed behavior
				2702	* of having a fixed window size at almost all times.
				2703	*
				2704	* Below we obtain similar behavior by forcing the offered window to
				2705	* a multiple of the mss when it is feasible to do so.
				2706	*
				2707	* Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
				2708	* Regular options like TIMESTAMP are taken into account.
				2709	*/
				2710	u32 __tcp_select_window(struct sock *sk)
				2711	{
				2712	struct inet_connection_sock *icsk = inet_csk(sk);
				2713	struct tcp_sock *tp = tcp_sk(sk);
				2714	/* MSS for the peer's data. Previous versions used mss_clamp
				2715	* here. I don't know if the value based on our guesses
				2716	* of peer's MSS is better for the performance. It's more correct
				2717	* but may be worse for the performance because of rcv_mss
				2718	* fluctuations. --SAW 1998/11/1
				2719	*/
				2720	int mss = icsk->icsk_ack.rcv_mss;
				2721	int free_space = tcp_space(sk);
				2722	int allowed_space = tcp_full_space(sk);
				2723	int full_space = min_t(int, tp->window_clamp, allowed_space);
				2724	int window;
				2725
				2726	if (unlikely(mss > full_space)) {
				2727	mss = full_space;
				2728	if (mss <= 0)
				2729	return 0;
				2730	}
				2731	if (free_space < (full_space >> 1)) {
				2732	icsk->icsk_ack.quick = 0;
				2733
				2734	if (tcp_under_memory_pressure(sk))
				2735	tp->rcv_ssthresh = min(tp->rcv_ssthresh,
				2736	4U * tp->advmss);
				2737
				2738	/* free_space might become our new window, make sure we don't
				2739	* increase it due to wscale.
				2740	*/
				2741	free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
				2742
				2743	/* if free space is less than mss estimate, or is below 1/16th
				2744	* of the maximum allowed, try to move to zero-window, else
				2745	* tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
				2746	* new incoming data is dropped due to memory limits.
				2747	* With large window, mss test triggers way too late in order
				2748	* to announce zero window in time before rmem limit kicks in.
				2749	*/
				2750	if (free_space < (allowed_space >> 4) \|\| free_space < mss)
				2751	return 0;
				2752	}
				2753
				2754	if (free_space > tp->rcv_ssthresh)
				2755	free_space = tp->rcv_ssthresh;
				2756
				2757	/* Don't do rounding if we are using window scaling, since the
				2758	* scaled window will not line up with the MSS boundary anyway.
				2759	*/
				2760	if (tp->rx_opt.rcv_wscale) {
				2761	window = free_space;
				2762
				2763	/* Advertise enough space so that it won't get scaled away.
				2764	* Import case: prevent zero window announcement if
				2765	* 1<<rcv_wscale > mss.
				2766	*/
				2767	window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
				2768	} else {
				2769	window = tp->rcv_wnd;
				2770	/* Get the largest window that is a nice multiple of mss.
				2771	* Window clamp already applied above.
				2772	* If our current window offering is within 1 mss of the
				2773	* free space we just keep it. This prevents the divide
				2774	* and multiply from happening most of the time.
				2775	* We also don't do any window rounding when the free space
				2776	* is too small.
				2777	*/
				2778	if (window <= free_space - mss \|\| window > free_space)
				2779	window = rounddown(free_space, mss);
				2780	else if (mss == full_space &&
				2781	free_space > window + (full_space >> 1))
				2782	window = free_space;
				2783	}
				2784
				2785	return window;
				2786	}
				2787
				2788	void tcp_skb_collapse_tstamp(struct sk_buff *skb,
				2789	const struct sk_buff *next_skb)
				2790	{
				2791	if (unlikely(tcp_has_tx_tstamp(next_skb))) {
				2792	const struct skb_shared_info *next_shinfo =
				2793	skb_shinfo(next_skb);
				2794	struct skb_shared_info *shinfo = skb_shinfo(skb);
				2795
				2796	shinfo->tx_flags \|= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
				2797	shinfo->tskey = next_shinfo->tskey;
				2798	TCP_SKB_CB(skb)->txstamp_ack \|=
				2799	TCP_SKB_CB(next_skb)->txstamp_ack;
				2800	}
				2801	}
				2802
				2803	/* Collapses two adjacent SKB's during retransmission. */
				2804	static bool tcp_collapse_retrans(struct sock sk, struct sk_buff skb)
				2805	{
				2806	struct tcp_sock *tp = tcp_sk(sk);
				2807	struct sk_buff *next_skb = skb_rb_next(skb);
				2808	int next_skb_size;
				2809
				2810	next_skb_size = next_skb->len;
				2811
				2812	BUG_ON(tcp_skb_pcount(skb) != 1 \|\| tcp_skb_pcount(next_skb) != 1);
				2813
				2814	if (next_skb_size) {
				2815	if (next_skb_size <= skb_availroom(skb))
				2816	skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
				2817	next_skb_size);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2818	else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2819	return false;
				2820	}
				2821	tcp_highest_sack_replace(sk, next_skb, skb);
				2822
				2823	/* Update sequence range on original skb. */
				2824	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
				2825
				2826	/* Merge over control information. This moves PSH/FIN etc. over */
				2827	TCP_SKB_CB(skb)->tcp_flags \|= TCP_SKB_CB(next_skb)->tcp_flags;
				2828
				2829	/* All done, get rid of second SKB and account for it so
				2830	* packet counting does not break.
				2831	*/
				2832	TCP_SKB_CB(skb)->sacked \|= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
				2833	TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
				2834
				2835	/* changed transmit queue under us so clear hints */
				2836	tcp_clear_retrans_hints_partial(tp);
				2837	if (next_skb == tp->retransmit_skb_hint)
				2838	tp->retransmit_skb_hint = skb;
				2839
				2840	tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
				2841
				2842	tcp_skb_collapse_tstamp(skb, next_skb);
				2843
				2844	tcp_rtx_queue_unlink_and_free(next_skb, sk);
				2845	return true;
				2846	}
				2847
				2848	/* Check if coalescing SKBs is legal. */
				2849	static bool tcp_can_collapse(const struct sock sk, const struct sk_buff skb)
				2850	{
				2851	if (tcp_skb_pcount(skb) > 1)
				2852	return false;
				2853	if (skb_cloned(skb))
				2854	return false;
				2855	/* Some heuristics for collapsing over SACK'd could be invented */
				2856	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
				2857	return false;
				2858
				2859	return true;
				2860	}
				2861
				2862	/* Collapse packets in the retransmit queue to make to create
				2863	* less packets on the wire. This is only done on retransmission.
				2864	*/
				2865	static void tcp_retrans_try_collapse(struct sock sk, struct sk_buff to,
				2866	int space)
				2867	{
				2868	struct tcp_sock *tp = tcp_sk(sk);
				2869	struct sk_buff skb = to, tmp;
				2870	bool first = true;
				2871
				2872	if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
				2873	return;
				2874	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
				2875	return;
				2876
				2877	skb_rbtree_walk_from_safe(skb, tmp) {
				2878	if (!tcp_can_collapse(sk, skb))
				2879	break;
				2880
				2881	if (!tcp_skb_can_collapse_to(to))
				2882	break;
				2883
				2884	space -= skb->len;
				2885
				2886	if (first) {
				2887	first = false;
				2888	continue;
				2889	}
				2890
				2891	if (space < 0)
				2892	break;
				2893
				2894	if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
				2895	break;
				2896
				2897	if (!tcp_collapse_retrans(sk, to))
				2898	break;
				2899	}
				2900	}
				2901
				2902	/* This retransmits one SKB. Policy decisions and retransmit queue
				2903	* state updates are done by the caller. Returns non-zero if an
				2904	* error occurred which prevented the send.
				2905	*/
				2906	int __tcp_retransmit_skb(struct sock sk, struct sk_buff skb, int segs)
				2907	{
				2908	struct inet_connection_sock *icsk = inet_csk(sk);
				2909	struct tcp_sock *tp = tcp_sk(sk);
				2910	unsigned int cur_mss;
				2911	int diff, len, err;
				2912
				2913
				2914	/* Inconclusive MTU probe */
				2915	if (icsk->icsk_mtup.probe_size)
				2916	icsk->icsk_mtup.probe_size = 0;
				2917
				2918	/* Do not sent more than we queued. 1/4 is reserved for possible
				2919	* copying overhead: fragmentation, tunneling, mangling etc.
				2920	*/
				2921	if (refcount_read(&sk->sk_wmem_alloc) >
				2922	min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
				2923	sk->sk_sndbuf))
				2924	return -EAGAIN;
				2925
				2926	if (skb_still_in_host_queue(sk, skb))
				2927	return -EBUSY;
				2928
				2929	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
				2930	if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
				2931	WARN_ON_ONCE(1);
				2932	return -EINVAL;
				2933	}
				2934	if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
				2935	return -ENOMEM;
				2936	}
				2937
				2938	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
				2939	return -EHOSTUNREACH; /* Routing failure or similar. */
				2940
				2941	cur_mss = tcp_current_mss(sk);
				2942
				2943	/* If receiver has shrunk his window, and skb is out of
				2944	* new window, do not retransmit it. The exception is the
				2945	* case, when window is shrunk to zero. In this case
				2946	* our retransmit serves as a zero window probe.
				2947	*/
				2948	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
				2949	TCP_SKB_CB(skb)->seq != tp->snd_una)
				2950	return -EAGAIN;
				2951
				2952	len = cur_mss * segs;
				2953	if (skb->len > len) {
				2954	if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
				2955	cur_mss, GFP_ATOMIC))
				2956	return -ENOMEM; /* We'll try again later. */
				2957	} else {
				2958	if (skb_unclone(skb, GFP_ATOMIC))
				2959	return -ENOMEM;
				2960
				2961	diff = tcp_skb_pcount(skb);
				2962	tcp_set_skb_tso_segs(skb, cur_mss);
				2963	diff -= tcp_skb_pcount(skb);
				2964	if (diff)
				2965	tcp_adjust_pcount(sk, skb, diff);
				2966	if (skb->len < cur_mss)
				2967	tcp_retrans_try_collapse(sk, skb, cur_mss);
				2968	}
				2969
				2970	/* RFC3168, section 6.1.1.1. ECN fallback */
				2971	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
				2972	tcp_ecn_clear_syn(sk, skb);
				2973
				2974	/* Update global and local TCP statistics. */
				2975	segs = tcp_skb_pcount(skb);
				2976	TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
				2977	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
				2978	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
				2979	tp->total_retrans += segs;
				2980	tp->bytes_retrans += skb->len;
				2981
				2982	/* make sure skb->data is aligned on arches that require it
				2983	* and check if ack-trimming & collapsing extended the headroom
				2984	* beyond what csum_start can cover.
				2985	*/
				2986	if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) \|\|
				2987	skb_headroom(skb) >= 0xFFFF)) {
				2988	struct sk_buff *nskb;
				2989
				2990	tcp_skb_tsorted_save(skb) {
				2991	nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2992	if (nskb) {
				2993	nskb->dev = NULL;
				2994	err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
				2995	} else {
				2996	err = -ENOBUFS;
				2997	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2998	} tcp_skb_tsorted_restore(skb);
				2999
				3000	if (!err) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3001	tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3002	tcp_rate_skb_sent(sk, skb);
				3003	}
				3004	} else {
				3005	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
				3006	}
				3007
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3008	/* To avoid taking spuriously low RTT samples based on a timestamp
				3009	* for a transmit that never happened, always mark EVER_RETRANS
				3010	*/
				3011	TCP_SKB_CB(skb)->sacked \|= TCPCB_EVER_RETRANS;
				3012
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3013	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
				3014	tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
				3015	TCP_SKB_CB(skb)->seq, segs, err);
				3016
				3017	if (likely(!err)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3018	trace_tcp_retransmit_skb(sk, skb);
				3019	} else if (err != -EBUSY) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3020	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3021	}
				3022	return err;
				3023	}
				3024
				3025	int tcp_retransmit_skb(struct sock sk, struct sk_buff skb, int segs)
				3026	{
				3027	struct tcp_sock *tp = tcp_sk(sk);
				3028	int err = __tcp_retransmit_skb(sk, skb, segs);
				3029
				3030	if (err == 0) {
				3031	#if FASTRETRANS_DEBUG > 0
				3032	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
				3033	net_dbg_ratelimited("retrans_out leaked\n");
				3034	}
				3035	#endif
				3036	TCP_SKB_CB(skb)->sacked \|= TCPCB_RETRANS;
				3037	tp->retrans_out += tcp_skb_pcount(skb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3038	}
				3039
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3040	/* Save stamp of the first (attempted) retransmit. */
				3041	if (!tp->retrans_stamp)
				3042	tp->retrans_stamp = tcp_skb_timestamp(skb);
				3043
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3044	if (tp->undo_retrans < 0)
				3045	tp->undo_retrans = 0;
				3046	tp->undo_retrans += tcp_skb_pcount(skb);
				3047	return err;
				3048	}
				3049
				3050	/* This gets called after a retransmit timeout, and the initially
				3051	* retransmitted data is acknowledged. It tries to continue
				3052	* resending the rest of the retransmit queue, until either
				3053	* we've sent it all or the congestion window limit is reached.
				3054	*/
				3055	void tcp_xmit_retransmit_queue(struct sock *sk)
				3056	{
				3057	const struct inet_connection_sock *icsk = inet_csk(sk);
				3058	struct sk_buff skb, rtx_head, *hole = NULL;
				3059	struct tcp_sock *tp = tcp_sk(sk);
				3060	u32 max_segs;
				3061	int mib_idx;
				3062
				3063	if (!tp->packets_out)
				3064	return;
				3065
				3066	rtx_head = tcp_rtx_queue_head(sk);
				3067	skb = tp->retransmit_skb_hint ?: rtx_head;
				3068	max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
				3069	skb_rbtree_walk_from(skb) {
				3070	__u8 sacked;
				3071	int segs;
				3072
				3073	if (tcp_pacing_check(sk))
				3074	break;
				3075
				3076	/* we could do better than to assign each time */
				3077	if (!hole)
				3078	tp->retransmit_skb_hint = skb;
				3079
				3080	segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
				3081	if (segs <= 0)
				3082	return;
				3083	sacked = TCP_SKB_CB(skb)->sacked;
				3084	/* In case tcp_shift_skb_data() have aggregated large skbs,
				3085	* we need to make sure not sending too bigs TSO packets
				3086	*/
				3087	segs = min_t(int, segs, max_segs);
				3088
				3089	if (tp->retrans_out >= tp->lost_out) {
				3090	break;
				3091	} else if (!(sacked & TCPCB_LOST)) {
				3092	if (!hole && !(sacked & (TCPCB_SACKED_RETRANS\|TCPCB_SACKED_ACKED)))
				3093	hole = skb;
				3094	continue;
				3095
				3096	} else {
				3097	if (icsk->icsk_ca_state != TCP_CA_Loss)
				3098	mib_idx = LINUX_MIB_TCPFASTRETRANS;
				3099	else
				3100	mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
				3101	}
				3102
				3103	if (sacked & (TCPCB_SACKED_ACKED\|TCPCB_SACKED_RETRANS))
				3104	continue;
				3105
				3106	if (tcp_small_queue_check(sk, skb, 1))
				3107	return;
				3108
				3109	if (tcp_retransmit_skb(sk, skb, segs))
				3110	return;
				3111
				3112	NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
				3113
				3114	if (tcp_in_cwnd_reduction(sk))
				3115	tp->prr_out += tcp_skb_pcount(skb);
				3116
				3117	if (skb == rtx_head &&
				3118	icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3119	tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				3120	inet_csk(sk)->icsk_rto,
				3121	TCP_RTO_MAX,
				3122	skb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3123	}
				3124	}
				3125
				3126	/* We allow to exceed memory limits for FIN packets to expedite
				3127	* connection tear down and (memory) recovery.
				3128	* Otherwise tcp_send_fin() could be tempted to either delay FIN
				3129	* or even be forced to close flow without any FIN.
				3130	* In general, we want to allow one skb per socket to avoid hangs
				3131	* with edge trigger epoll()
				3132	*/
				3133	void sk_forced_mem_schedule(struct sock *sk, int size)
				3134	{
				3135	int amt;
				3136
				3137	if (size <= sk->sk_forward_alloc)
				3138	return;
				3139	amt = sk_mem_pages(size);
				3140	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
				3141	sk_memory_allocated_add(sk, amt);
				3142
				3143	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
				3144	mem_cgroup_charge_skmem(sk->sk_memcg, amt);
				3145	}
				3146
				3147	/* Send a FIN. The caller locks the socket for us.
				3148	* We should try to send a FIN packet really hard, but eventually give up.
				3149	*/
				3150	void tcp_send_fin(struct sock *sk)
				3151	{
				3152	struct sk_buff skb, tskb = tcp_write_queue_tail(sk);
				3153	struct tcp_sock *tp = tcp_sk(sk);
				3154
				3155	/* Optimization, tack on the FIN if we have one skb in write queue and
				3156	* this skb was not yet sent, or we are under memory pressure.
				3157	* Note: in the latter case, FIN packet will be sent after a timeout,
				3158	* as TCP stack thinks it has already been transmitted.
				3159	*/
				3160	if (!tskb && tcp_under_memory_pressure(sk))
				3161	tskb = skb_rb_last(&sk->tcp_rtx_queue);
				3162
				3163	if (tskb) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3164	TCP_SKB_CB(tskb)->tcp_flags \|= TCPHDR_FIN;
				3165	TCP_SKB_CB(tskb)->end_seq++;
				3166	tp->write_seq++;
				3167	if (tcp_write_queue_empty(sk)) {
				3168	/* This means tskb was already sent.
				3169	* Pretend we included the FIN on previous transmit.
				3170	* We need to set tp->snd_nxt to the value it would have
				3171	* if FIN had been sent. This is because retransmit path
				3172	* does not change tp->snd_nxt.
				3173	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3174	WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3175	return;
				3176	}
				3177	} else {
				3178	skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3179	if (unlikely(!skb))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3180	return;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3181
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3182	INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
				3183	skb_reserve(skb, MAX_TCP_HEADER);
				3184	sk_forced_mem_schedule(sk, skb->truesize);
				3185	/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
				3186	tcp_init_nondata_skb(skb, tp->write_seq,
				3187	TCPHDR_ACK \| TCPHDR_FIN);
				3188	tcp_queue_skb(sk, skb);
				3189	}
				3190	__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
				3191	}
				3192
				3193	/* We get here when a process closes a file descriptor (either due to
				3194	* an explicit close() or as a byproduct of exit()'ing) and there
				3195	* was unread data in the receive queue. This behavior is recommended
				3196	* by RFC 2525, section 2.17. -DaveM
				3197	*/
				3198	void tcp_send_active_reset(struct sock *sk, gfp_t priority)
				3199	{
				3200	struct sk_buff *skb;
				3201
				3202	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
				3203
				3204	/* NOTE: No TCP options attached and we never retransmit this. */
				3205	skb = alloc_skb(MAX_TCP_HEADER, priority);
				3206	if (!skb) {
				3207	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
				3208	return;
				3209	}
				3210
				3211	/* Reserve space for headers and prepare control bits. */
				3212	skb_reserve(skb, MAX_TCP_HEADER);
				3213	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
				3214	TCPHDR_ACK \| TCPHDR_RST);
				3215	tcp_mstamp_refresh(tcp_sk(sk));
				3216	/* Send it off. */
				3217	if (tcp_transmit_skb(sk, skb, 0, priority))
				3218	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
				3219
				3220	/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
				3221	* skb here is different to the troublesome skb, so use NULL
				3222	*/
				3223	trace_tcp_send_reset(sk, NULL);
				3224	}
				3225
				3226	/* Send a crossed SYN-ACK during socket establishment.
				3227	* WARNING: This routine must only be called when we have already sent
				3228	* a SYN packet that crossed the incoming SYN that caused this routine
				3229	* to get called. If this assumption fails then the initial rcv_wnd
				3230	* and rcv_wscale values will not be correct.
				3231	*/
				3232	int tcp_send_synack(struct sock *sk)
				3233	{
				3234	struct sk_buff *skb;
				3235
				3236	skb = tcp_rtx_queue_head(sk);
				3237	if (!skb \|\| !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
				3238	pr_err("%s: wrong queue state\n", __func__);
				3239	return -EFAULT;
				3240	}
				3241	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
				3242	if (skb_cloned(skb)) {
				3243	struct sk_buff *nskb;
				3244
				3245	tcp_skb_tsorted_save(skb) {
				3246	nskb = skb_copy(skb, GFP_ATOMIC);
				3247	} tcp_skb_tsorted_restore(skb);
				3248	if (!nskb)
				3249	return -ENOMEM;
				3250	INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3251	tcp_highest_sack_replace(sk, skb, nskb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3252	tcp_rtx_queue_unlink_and_free(skb, sk);
				3253	__skb_header_release(nskb);
				3254	tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3255	sk_wmem_queued_add(sk, nskb->truesize);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3256	sk_mem_charge(sk, nskb->truesize);
				3257	skb = nskb;
				3258	}
				3259
				3260	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_ACK;
				3261	tcp_ecn_send_synack(sk, skb);
				3262	}
				3263	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
				3264	}
				3265
				3266	/**
				3267	* tcp_make_synack - Prepare a SYN-ACK.
				3268	* sk: listener socket
				3269	* dst: dst entry attached to the SYNACK
				3270	* req: request_sock pointer
				3271	*
				3272	* Allocate one skb and build a SYNACK packet.
				3273	* @dst is consumed : Caller should not use it again.
				3274	*/
				3275	struct sk_buff tcp_make_synack(const struct sock sk, struct dst_entry *dst,
				3276	struct request_sock *req,
				3277	struct tcp_fastopen_cookie *foc,
				3278	enum tcp_synack_type synack_type)
				3279	{
				3280	struct inet_request_sock *ireq = inet_rsk(req);
				3281	const struct tcp_sock *tp = tcp_sk(sk);
				3282	struct tcp_md5sig_key *md5 = NULL;
				3283	struct tcp_out_options opts;
				3284	struct sk_buff *skb;
				3285	int tcp_header_size;
				3286	struct tcphdr *th;
				3287	int mss;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3288	u64 now;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3289
				3290	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
				3291	if (unlikely(!skb)) {
				3292	dst_release(dst);
				3293	return NULL;
				3294	}
				3295	/* Reserve space for headers. */
				3296	skb_reserve(skb, MAX_TCP_HEADER);
				3297
				3298	switch (synack_type) {
				3299	case TCP_SYNACK_NORMAL:
				3300	skb_set_owner_w(skb, req_to_sk(req));
				3301	break;
				3302	case TCP_SYNACK_COOKIE:
				3303	/* Under synflood, we do not attach skb to a socket,
				3304	* to avoid false sharing.
				3305	*/
				3306	break;
				3307	case TCP_SYNACK_FASTOPEN:
				3308	/* sk is a const pointer, because we want to express multiple
				3309	* cpu might call us concurrently.
				3310	* sk->sk_wmem_alloc in an atomic, we can promote to rw.
				3311	*/
				3312	skb_set_owner_w(skb, (struct sock *)sk);
				3313	break;
				3314	}
				3315	skb_dst_set(skb, dst);
				3316
				3317	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
				3318
				3319	memset(&opts, 0, sizeof(opts));
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3320	now = tcp_clock_ns();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3321	#ifdef CONFIG_SYN_COOKIES
				3322	if (unlikely(req->cookie_ts))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3323	skb->skb_mstamp_ns = cookie_init_timestamp(req);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3324	else
				3325	#endif
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3326	{
				3327	skb->skb_mstamp_ns = now;
				3328	if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
				3329	tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
				3330	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3331
				3332	#ifdef CONFIG_TCP_MD5SIG
				3333	rcu_read_lock();
				3334	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
				3335	#endif
				3336	skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
				3337	tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3338	foc, synack_type) + sizeof(*th);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3339
				3340	skb_push(skb, tcp_header_size);
				3341	skb_reset_transport_header(skb);
				3342
				3343	th = (struct tcphdr *)skb->data;
				3344	memset(th, 0, sizeof(struct tcphdr));
				3345	th->syn = 1;
				3346	th->ack = 1;
				3347	tcp_ecn_make_synack(req, th);
				3348	th->source = htons(ireq->ir_num);
				3349	th->dest = ireq->ir_rmt_port;
				3350	skb->mark = ireq->ir_mark;
				3351	skb->ip_summed = CHECKSUM_PARTIAL;
				3352	th->seq = htonl(tcp_rsk(req)->snt_isn);
				3353	/* XXX data is queued and acked as is. No buffer/window check */
				3354	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
				3355
				3356	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
				3357	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
				3358	tcp_options_write((__be32 *)(th + 1), NULL, &opts);
				3359	th->doff = (tcp_header_size >> 2);
				3360	__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
				3361
				3362	#ifdef CONFIG_TCP_MD5SIG
				3363	/* Okay, we have all we need - do the md5 hash if needed */
				3364	if (md5)
				3365	tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
				3366	md5, req_to_sk(req), skb);
				3367	rcu_read_unlock();
				3368	#endif
				3369
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3370	skb->skb_mstamp_ns = now;
				3371	tcp_add_tx_delay(skb, tp);
				3372
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3373	return skb;
				3374	}
				3375	EXPORT_SYMBOL(tcp_make_synack);
				3376
				3377	static void tcp_ca_dst_init(struct sock sk, const struct dst_entry dst)
				3378	{
				3379	struct inet_connection_sock *icsk = inet_csk(sk);
				3380	const struct tcp_congestion_ops *ca;
				3381	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
				3382
				3383	if (ca_key == TCP_CA_UNSPEC)
				3384	return;
				3385
				3386	rcu_read_lock();
				3387	ca = tcp_ca_find_key(ca_key);
				3388	if (likely(ca && try_module_get(ca->owner))) {
				3389	module_put(icsk->icsk_ca_ops->owner);
				3390	icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
				3391	icsk->icsk_ca_ops = ca;
				3392	}
				3393	rcu_read_unlock();
				3394	}
				3395
				3396	/* Do all connect socket setups that can be done AF independent. */
				3397	static void tcp_connect_init(struct sock *sk)
				3398	{
				3399	const struct dst_entry *dst = __sk_dst_get(sk);
				3400	struct tcp_sock *tp = tcp_sk(sk);
				3401	__u8 rcv_wscale;
				3402	u32 rcv_wnd;
				3403
				3404	/* We'll fix this up when we get a response from the other end.
				3405	* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
				3406	*/
				3407	tp->tcp_header_len = sizeof(struct tcphdr);
				3408	if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
				3409	tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
				3410
				3411	#ifdef CONFIG_TCP_MD5SIG
				3412	if (tp->af_specific->md5_lookup(sk, sk))
				3413	tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
				3414	#endif
				3415
				3416	/* If user gave his TCP_MAXSEG, record it to clamp */
				3417	if (tp->rx_opt.user_mss)
				3418	tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
				3419	tp->max_window = 0;
				3420	tcp_mtup_init(sk);
				3421	tcp_sync_mss(sk, dst_mtu(dst));
				3422
				3423	tcp_ca_dst_init(sk, dst);
				3424
				3425	if (!tp->window_clamp)
				3426	tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
				3427	tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
				3428
				3429	tcp_initialize_rcv_mss(sk);
				3430
				3431	/* limit the window selection if the user enforce a smaller rx buffer */
				3432	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
				3433	(tp->window_clamp > tcp_full_space(sk) \|\| tp->window_clamp == 0))
				3434	tp->window_clamp = tcp_full_space(sk);
				3435
				3436	rcv_wnd = tcp_rwnd_init_bpf(sk);
				3437	if (rcv_wnd == 0)
				3438	rcv_wnd = dst_metric(dst, RTAX_INITRWND);
				3439
				3440	tcp_select_initial_window(sk, tcp_full_space(sk),
				3441	tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
				3442	&tp->rcv_wnd,
				3443	&tp->window_clamp,
				3444	sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
				3445	&rcv_wscale,
				3446	rcv_wnd);
				3447
				3448	tp->rx_opt.rcv_wscale = rcv_wscale;
				3449	tp->rcv_ssthresh = tp->rcv_wnd;
				3450
				3451	sk->sk_err = 0;
				3452	sock_reset_flag(sk, SOCK_DONE);
				3453	tp->snd_wnd = 0;
				3454	tcp_init_wl(tp, 0);
				3455	tcp_write_queue_purge(sk);
				3456	tp->snd_una = tp->write_seq;
				3457	tp->snd_sml = tp->write_seq;
				3458	tp->snd_up = tp->write_seq;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3459	WRITE_ONCE(tp->snd_nxt, tp->write_seq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3460
				3461	if (likely(!tp->repair))
				3462	tp->rcv_nxt = 0;
				3463	else
				3464	tp->rcv_tstamp = tcp_jiffies32;
				3465	tp->rcv_wup = tp->rcv_nxt;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3466	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3467
				3468	inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
				3469	inet_csk(sk)->icsk_retransmits = 0;
				3470	tcp_clear_retrans(tp);
				3471	}
				3472
				3473	static void tcp_connect_queue_skb(struct sock sk, struct sk_buff skb)
				3474	{
				3475	struct tcp_sock *tp = tcp_sk(sk);
				3476	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
				3477
				3478	tcb->end_seq += skb->len;
				3479	__skb_header_release(skb);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3480	sk_wmem_queued_add(sk, skb->truesize);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3481	sk_mem_charge(sk, skb->truesize);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3482	WRITE_ONCE(tp->write_seq, tcb->end_seq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3483	tp->packets_out += tcp_skb_pcount(skb);
				3484	}
				3485
				3486	/* Build and send a SYN with data and (cached) Fast Open cookie. However,
				3487	* queue a data-only packet after the regular SYN, such that regular SYNs
				3488	* are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
				3489	* only the SYN sequence, the data are retransmitted in the first ACK.
				3490	* If cookie is not cached or other error occurs, falls back to send a
				3491	* regular SYN with Fast Open cookie request option.
				3492	*/
				3493	static int tcp_send_syn_data(struct sock sk, struct sk_buff syn)
				3494	{
				3495	struct tcp_sock *tp = tcp_sk(sk);
				3496	struct tcp_fastopen_request *fo = tp->fastopen_req;
				3497	int space, err = 0;
				3498	struct sk_buff *syn_data;
				3499
				3500	tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
				3501	if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
				3502	goto fallback;
				3503
				3504	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
				3505	* user-MSS. Reserve maximum option space for middleboxes that add
				3506	* private TCP options. The cost is reduced data space in SYN :(
				3507	*/
				3508	tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
				3509
				3510	space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
				3511	MAX_TCP_OPTION_SPACE;
				3512
				3513	space = min_t(size_t, space, fo->size);
				3514
				3515	/* limit to order-0 allocations */
				3516	space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
				3517
				3518	syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false);
				3519	if (!syn_data)
				3520	goto fallback;
				3521	syn_data->ip_summed = CHECKSUM_PARTIAL;
				3522	memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
				3523	if (space) {
				3524	int copied = copy_from_iter(skb_put(syn_data, space), space,
				3525	&fo->data->msg_iter);
				3526	if (unlikely(!copied)) {
				3527	tcp_skb_tsorted_anchor_cleanup(syn_data);
				3528	kfree_skb(syn_data);
				3529	goto fallback;
				3530	}
				3531	if (copied != space) {
				3532	skb_trim(syn_data, copied);
				3533	space = copied;
				3534	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3535	skb_zcopy_set(syn_data, fo->uarg, NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3536	}
				3537	/* No more data pending in inet_wait_for_connect() */
				3538	if (space == fo->size)
				3539	fo->data = NULL;
				3540	fo->copied = space;
				3541
				3542	tcp_connect_queue_skb(sk, syn_data);
				3543	if (syn_data->len)
				3544	tcp_chrono_start(sk, TCP_CHRONO_BUSY);
				3545
				3546	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
				3547
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3548	syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3549
				3550	/* Now full SYN+DATA was cloned and sent (or not),
				3551	* remove the SYN from the original skb (syn_data)
				3552	* we keep in write queue in case of a retransmit, as we
				3553	* also have the SYN packet (with no data) in the same queue.
				3554	*/
				3555	TCP_SKB_CB(syn_data)->seq++;
				3556	TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK \| TCPHDR_PSH;
				3557	if (!err) {
				3558	tp->syn_data = (fo->copied > 0);
				3559	tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
				3560	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
				3561	goto done;
				3562	}
				3563
				3564	/* data was not sent, put it in write_queue */
				3565	__skb_queue_tail(&sk->sk_write_queue, syn_data);
				3566	tp->packets_out -= tcp_skb_pcount(syn_data);
				3567
				3568	fallback:
				3569	/* Send a regular SYN with Fast Open cookie request option */
				3570	if (fo->cookie.len > 0)
				3571	fo->cookie.len = 0;
				3572	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
				3573	if (err)
				3574	tp->syn_fastopen = 0;
				3575	done:
				3576	fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
				3577	return err;
				3578	}
				3579
				3580	/* Build a SYN and send it off. */
				3581	int tcp_connect(struct sock *sk)
				3582	{
				3583	struct tcp_sock *tp = tcp_sk(sk);
				3584	struct sk_buff *buff;
				3585	int err;
				3586
				3587	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
				3588
				3589	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
				3590	return -EHOSTUNREACH; /* Routing failure or similar. */
				3591
				3592	tcp_connect_init(sk);
				3593
				3594	if (unlikely(tp->repair)) {
				3595	tcp_finish_connect(sk, NULL);
				3596	return 0;
				3597	}
				3598
				3599	buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
				3600	if (unlikely(!buff))
				3601	return -ENOBUFS;
				3602
				3603	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
				3604	tcp_mstamp_refresh(tp);
				3605	tp->retrans_stamp = tcp_time_stamp(tp);
				3606	tcp_connect_queue_skb(sk, buff);
				3607	tcp_ecn_send_syn(sk, buff);
				3608	tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
				3609
				3610	/* Send off SYN; include data in Fast Open. */
				3611	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
				3612	tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
				3613	if (err == -ECONNREFUSED)
				3614	return err;
				3615
				3616	/* We change tp->snd_nxt after the tcp_transmit_skb() call
				3617	* in order to make this packet get counted in tcpOutSegs.
				3618	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3619	WRITE_ONCE(tp->snd_nxt, tp->write_seq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3620	tp->pushed_seq = tp->write_seq;
				3621	buff = tcp_send_head(sk);
				3622	if (unlikely(buff)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3623	WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3624	tp->pushed_seq = TCP_SKB_CB(buff)->seq;
				3625	}
				3626	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
				3627
				3628	/* Timer for repeating the SYN until an answer. */
				3629	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				3630	inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
				3631	return 0;
				3632	}
				3633	EXPORT_SYMBOL(tcp_connect);
				3634
				3635	/* Send out a delayed ack, the caller does the policy checking
				3636	* to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
				3637	* for details.
				3638	*/
				3639	void tcp_send_delayed_ack(struct sock *sk)
				3640	{
				3641	struct inet_connection_sock *icsk = inet_csk(sk);
				3642	int ato = icsk->icsk_ack.ato;
				3643	unsigned long timeout;
				3644
				3645	if (ato > TCP_DELACK_MIN) {
				3646	const struct tcp_sock *tp = tcp_sk(sk);
				3647	int max_ato = HZ / 2;
				3648
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3649	if (inet_csk_in_pingpong_mode(sk) \|\|
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3650	(icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
				3651	max_ato = TCP_DELACK_MAX;
				3652
				3653	/* Slow path, intersegment interval is "high". */
				3654
				3655	/* If some rtt estimate is known, use it to bound delayed ack.
				3656	* Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
				3657	* directly.
				3658	*/
				3659	if (tp->srtt_us) {
				3660	int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
				3661	TCP_DELACK_MIN);
				3662
				3663	if (rtt < max_ato)
				3664	max_ato = rtt;
				3665	}
				3666
				3667	ato = min(ato, max_ato);
				3668	}
				3669
				3670	/* Stay within the limit we were given */
				3671	timeout = jiffies + ato;
				3672
				3673	/* Use new timeout only if there wasn't a older one earlier. */
				3674	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
				3675	/* If delack timer was blocked or is about to expire,
				3676	* send ACK now.
				3677	*/
				3678	if (icsk->icsk_ack.blocked \|\|
				3679	time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
				3680	tcp_send_ack(sk);
				3681	return;
				3682	}
				3683
				3684	if (!time_before(timeout, icsk->icsk_ack.timeout))
				3685	timeout = icsk->icsk_ack.timeout;
				3686	}
				3687	icsk->icsk_ack.pending \|= ICSK_ACK_SCHED \| ICSK_ACK_TIMER;
				3688	icsk->icsk_ack.timeout = timeout;
				3689	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
				3690	}
				3691
				3692	/* This routine sends an ack and also updates the window. */
				3693	void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
				3694	{
				3695	struct sk_buff *buff;
				3696
				3697	/* If we have been reset, we may not send again. */
				3698	if (sk->sk_state == TCP_CLOSE)
				3699	return;
				3700
				3701	/* We are not putting this on the write queue, so
				3702	* tcp_transmit_skb() will set the ownership to this
				3703	* sock.
				3704	*/
				3705	buff = alloc_skb(MAX_TCP_HEADER,
				3706	sk_gfp_mask(sk, GFP_ATOMIC \| __GFP_NOWARN));
				3707	if (unlikely(!buff)) {
				3708	inet_csk_schedule_ack(sk);
				3709	inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
				3710	inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
				3711	TCP_DELACK_MAX, TCP_RTO_MAX);
				3712	return;
				3713	}
				3714
				3715	/* Reserve space for headers and prepare control bits. */
				3716	skb_reserve(buff, MAX_TCP_HEADER);
				3717	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
				3718
				3719	/* We do not want pure acks influencing TCP Small Queues or fq/pacing
				3720	* too much.
				3721	* SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
				3722	*/
				3723	skb_set_tcp_pure_ack(buff);
				3724
				3725	/* Send it off, this clears delayed acks for us. */
				3726	__tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
				3727	}
				3728	EXPORT_SYMBOL_GPL(__tcp_send_ack);
				3729
				3730	void tcp_send_ack(struct sock *sk)
				3731	{
				3732	__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
				3733	}
				3734
				3735	/* This routine sends a packet with an out of date sequence
				3736	* number. It assumes the other end will try to ack it.
				3737	*
				3738	* Question: what should we make while urgent mode?
				3739	* 4.4BSD forces sending single byte of data. We cannot send
				3740	* out of window data, because we have SND.NXT==SND.MAX...
				3741	*
				3742	* Current solution: to send TWO zero-length segments in urgent mode:
				3743	* one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
				3744	* out-of-date with SND.UNA-1 to probe window.
				3745	*/
				3746	static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
				3747	{
				3748	struct tcp_sock *tp = tcp_sk(sk);
				3749	struct sk_buff *skb;
				3750
				3751	/* We don't queue it, tcp_transmit_skb() sets ownership. */
				3752	skb = alloc_skb(MAX_TCP_HEADER,
				3753	sk_gfp_mask(sk, GFP_ATOMIC \| __GFP_NOWARN));
				3754	if (!skb)
				3755	return -1;
				3756
				3757	/* Reserve space for headers and set control bits. */
				3758	skb_reserve(skb, MAX_TCP_HEADER);
				3759	/* Use a previous sequence. This should cause the other
				3760	* end to send an ack. Don't queue or clone SKB, just
				3761	* send it.
				3762	*/
				3763	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
				3764	NET_INC_STATS(sock_net(sk), mib);
				3765	return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
				3766	}
				3767
				3768	/* Called from setsockopt( ... TCP_REPAIR ) */
				3769	void tcp_send_window_probe(struct sock *sk)
				3770	{
				3771	if (sk->sk_state == TCP_ESTABLISHED) {
				3772	tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
				3773	tcp_mstamp_refresh(tcp_sk(sk));
				3774	tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
				3775	}
				3776	}
				3777
				3778	/* Initiate keepalive or window probe from timer. */
				3779	int tcp_write_wakeup(struct sock *sk, int mib)
				3780	{
				3781	struct tcp_sock *tp = tcp_sk(sk);
				3782	struct sk_buff *skb;
				3783
				3784	if (sk->sk_state == TCP_CLOSE)
				3785	return -1;
				3786
				3787	skb = tcp_send_head(sk);
				3788	if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
				3789	int err;
				3790	unsigned int mss = tcp_current_mss(sk);
				3791	unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
				3792
				3793	if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
				3794	tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
				3795
				3796	/* We are probing the opening of a window
				3797	* but the window size is != 0
				3798	* must have been a result SWS avoidance ( sender )
				3799	*/
				3800	if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq \|\|
				3801	skb->len > mss) {
				3802	seg_size = min(seg_size, mss);
				3803	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_PSH;
				3804	if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
				3805	skb, seg_size, mss, GFP_ATOMIC))
				3806	return -1;
				3807	} else if (!tcp_skb_pcount(skb))
				3808	tcp_set_skb_tso_segs(skb, mss);
				3809
				3810	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_PSH;
				3811	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
				3812	if (!err)
				3813	tcp_event_new_data_sent(sk, skb);
				3814	return err;
				3815	} else {
				3816	if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
				3817	tcp_xmit_probe_skb(sk, 1, mib);
				3818	return tcp_xmit_probe_skb(sk, 0, mib);
				3819	}
				3820	}
				3821
				3822	/* A window probe timeout has occurred. If window is not closed send
				3823	* a partial packet else a zero probe.
				3824	*/
				3825	void tcp_send_probe0(struct sock *sk)
				3826	{
				3827	struct inet_connection_sock *icsk = inet_csk(sk);
				3828	struct tcp_sock *tp = tcp_sk(sk);
				3829	struct net *net = sock_net(sk);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3830	unsigned long timeout;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3831	int err;
				3832
				3833	err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
				3834
				3835	if (tp->packets_out \|\| tcp_write_queue_empty(sk)) {
				3836	/* Cancel probe timer, if it is not required. */
				3837	icsk->icsk_probes_out = 0;
				3838	icsk->icsk_backoff = 0;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3839	icsk->icsk_probes_tstamp = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3840	return;
				3841	}
				3842
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3843	icsk->icsk_probes_out++;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3844	if (err <= 0) {
				3845	if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
				3846	icsk->icsk_backoff++;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3847	timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3848	} else {
				3849	/* If packet was not sent due to local congestion,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3850	* Let senders fight for local resources conservatively.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3851	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3852	timeout = TCP_RESOURCE_PROBE_INTERVAL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3853	}
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3854
				3855	timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3856	tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX, NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3857	}
				3858
				3859	int tcp_rtx_synack(const struct sock sk, struct request_sock req)
				3860	{
				3861	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
				3862	struct flowi fl;
				3863	int res;
				3864
				3865	tcp_rsk(req)->txhash = net_tx_rndhash();
				3866	res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
				3867	if (!res) {
				3868	__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
				3869	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
				3870	if (unlikely(tcp_passive_fastopen(sk)))
				3871	tcp_sk(sk)->total_retrans++;
				3872	trace_tcp_retransmit_synack(sk, req);
				3873	}
				3874	return res;
				3875	}
				3876	EXPORT_SYMBOL(tcp_rtx_synack);