blob: 5c8d0fb498256ca485b576d86310401aa8964573 [file] [log] [blame]
David Brazdil0f672f62019-12-10 10:32:29 +00001// SPDX-License-Identifier: GPL-2.0-or-later
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * Authors: Ross Biro
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Mark Evans, <evansmp@uhura.aston.ac.uk>
12 * Corey Minyard <wf-rch!minyard@relay.EU.net>
13 * Florian La Roche, <flla@stud.uni-sb.de>
14 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
15 * Linus Torvalds, <torvalds@cs.helsinki.fi>
16 * Alan Cox, <gw4pts@gw4pts.ampr.org>
17 * Matthew Dillon, <dillon@apollo.west.oic.com>
18 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
19 * Jorge Cwik, <jorge@laser.satlink.net>
20 *
21 * Fixes:
22 * Alan Cox : Numerous verify_area() calls
23 * Alan Cox : Set the ACK bit on a reset
24 * Alan Cox : Stopped it crashing if it closed while
25 * sk->inuse=1 and was trying to connect
26 * (tcp_err()).
27 * Alan Cox : All icmp error handling was broken
28 * pointers passed where wrong and the
29 * socket was looked up backwards. Nobody
30 * tested any icmp error code obviously.
31 * Alan Cox : tcp_err() now handled properly. It
32 * wakes people on errors. poll
33 * behaves and the icmp error race
34 * has gone by moving it into sock.c
35 * Alan Cox : tcp_send_reset() fixed to work for
36 * everything not just packets for
37 * unknown sockets.
38 * Alan Cox : tcp option processing.
39 * Alan Cox : Reset tweaked (still not 100%) [Had
40 * syn rule wrong]
41 * Herp Rosmanith : More reset fixes
42 * Alan Cox : No longer acks invalid rst frames.
43 * Acking any kind of RST is right out.
44 * Alan Cox : Sets an ignore me flag on an rst
45 * receive otherwise odd bits of prattle
46 * escape still
47 * Alan Cox : Fixed another acking RST frame bug.
48 * Should stop LAN workplace lockups.
49 * Alan Cox : Some tidyups using the new skb list
50 * facilities
51 * Alan Cox : sk->keepopen now seems to work
52 * Alan Cox : Pulls options out correctly on accepts
53 * Alan Cox : Fixed assorted sk->rqueue->next errors
54 * Alan Cox : PSH doesn't end a TCP read. Switched a
55 * bit to skb ops.
56 * Alan Cox : Tidied tcp_data to avoid a potential
57 * nasty.
58 * Alan Cox : Added some better commenting, as the
59 * tcp is hard to follow
60 * Alan Cox : Removed incorrect check for 20 * psh
61 * Michael O'Reilly : ack < copied bug fix.
62 * Johannes Stille : Misc tcp fixes (not all in yet).
63 * Alan Cox : FIN with no memory -> CRASH
64 * Alan Cox : Added socket option proto entries.
65 * Also added awareness of them to accept.
66 * Alan Cox : Added TCP options (SOL_TCP)
67 * Alan Cox : Switched wakeup calls to callbacks,
68 * so the kernel can layer network
69 * sockets.
70 * Alan Cox : Use ip_tos/ip_ttl settings.
71 * Alan Cox : Handle FIN (more) properly (we hope).
72 * Alan Cox : RST frames sent on unsynchronised
73 * state ack error.
74 * Alan Cox : Put in missing check for SYN bit.
75 * Alan Cox : Added tcp_select_window() aka NET2E
76 * window non shrink trick.
77 * Alan Cox : Added a couple of small NET2E timer
78 * fixes
79 * Charles Hedrick : TCP fixes
80 * Toomas Tamm : TCP window fixes
81 * Alan Cox : Small URG fix to rlogin ^C ack fight
82 * Charles Hedrick : Rewrote most of it to actually work
83 * Linus : Rewrote tcp_read() and URG handling
84 * completely
85 * Gerhard Koerting: Fixed some missing timer handling
86 * Matthew Dillon : Reworked TCP machine states as per RFC
87 * Gerhard Koerting: PC/TCP workarounds
88 * Adam Caldwell : Assorted timer/timing errors
89 * Matthew Dillon : Fixed another RST bug
90 * Alan Cox : Move to kernel side addressing changes.
91 * Alan Cox : Beginning work on TCP fastpathing
92 * (not yet usable)
93 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
94 * Alan Cox : TCP fast path debugging
95 * Alan Cox : Window clamping
96 * Michael Riepe : Bug in tcp_check()
97 * Matt Dillon : More TCP improvements and RST bug fixes
98 * Matt Dillon : Yet more small nasties remove from the
99 * TCP code (Be very nice to this man if
100 * tcp finally works 100%) 8)
101 * Alan Cox : BSD accept semantics.
102 * Alan Cox : Reset on closedown bug.
103 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
104 * Michael Pall : Handle poll() after URG properly in
105 * all cases.
106 * Michael Pall : Undo the last fix in tcp_read_urg()
107 * (multi URG PUSH broke rlogin).
108 * Michael Pall : Fix the multi URG PUSH problem in
109 * tcp_readable(), poll() after URG
110 * works now.
111 * Michael Pall : recv(...,MSG_OOB) never blocks in the
112 * BSD api.
113 * Alan Cox : Changed the semantics of sk->socket to
114 * fix a race and a signal problem with
115 * accept() and async I/O.
116 * Alan Cox : Relaxed the rules on tcp_sendto().
117 * Yury Shevchuk : Really fixed accept() blocking problem.
118 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
119 * clients/servers which listen in on
120 * fixed ports.
121 * Alan Cox : Cleaned the above up and shrank it to
122 * a sensible code size.
123 * Alan Cox : Self connect lockup fix.
124 * Alan Cox : No connect to multicast.
125 * Ross Biro : Close unaccepted children on master
126 * socket close.
127 * Alan Cox : Reset tracing code.
128 * Alan Cox : Spurious resets on shutdown.
129 * Alan Cox : Giant 15 minute/60 second timer error
130 * Alan Cox : Small whoops in polling before an
131 * accept.
132 * Alan Cox : Kept the state trace facility since
133 * it's handy for debugging.
134 * Alan Cox : More reset handler fixes.
135 * Alan Cox : Started rewriting the code based on
136 * the RFC's for other useful protocol
137 * references see: Comer, KA9Q NOS, and
138 * for a reference on the difference
139 * between specifications and how BSD
140 * works see the 4.4lite source.
141 * A.N.Kuznetsov : Don't time wait on completion of tidy
142 * close.
143 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
144 * Linus Torvalds : Fixed BSD port reuse to work first syn
145 * Alan Cox : Reimplemented timers as per the RFC
146 * and using multiple timers for sanity.
147 * Alan Cox : Small bug fixes, and a lot of new
148 * comments.
149 * Alan Cox : Fixed dual reader crash by locking
150 * the buffers (much like datagram.c)
151 * Alan Cox : Fixed stuck sockets in probe. A probe
152 * now gets fed up of retrying without
153 * (even a no space) answer.
154 * Alan Cox : Extracted closing code better
155 * Alan Cox : Fixed the closing state machine to
156 * resemble the RFC.
157 * Alan Cox : More 'per spec' fixes.
158 * Jorge Cwik : Even faster checksumming.
159 * Alan Cox : tcp_data() doesn't ack illegal PSH
160 * only frames. At least one pc tcp stack
161 * generates them.
162 * Alan Cox : Cache last socket.
163 * Alan Cox : Per route irtt.
164 * Matt Day : poll()->select() match BSD precisely on error
165 * Alan Cox : New buffers
166 * Marc Tamsky : Various sk->prot->retransmits and
167 * sk->retransmits misupdating fixed.
168 * Fixed tcp_write_timeout: stuck close,
169 * and TCP syn retries gets used now.
170 * Mark Yarvis : In tcp_read_wakeup(), don't send an
171 * ack if state is TCP_CLOSED.
172 * Alan Cox : Look up device on a retransmit - routes may
173 * change. Doesn't yet cope with MSS shrink right
174 * but it's a start!
175 * Marc Tamsky : Closing in closing fixes.
176 * Mike Shaver : RFC1122 verifications.
177 * Alan Cox : rcv_saddr errors.
178 * Alan Cox : Block double connect().
179 * Alan Cox : Small hooks for enSKIP.
180 * Alexey Kuznetsov: Path MTU discovery.
181 * Alan Cox : Support soft errors.
182 * Alan Cox : Fix MTU discovery pathological case
183 * when the remote claims no mtu!
184 * Marc Tamsky : TCP_CLOSE fix.
185 * Colin (G3TNE) : Send a reset on syn ack replies in
186 * window but wrong (fixes NT lpd problems)
187 * Pedro Roque : Better TCP window handling, delayed ack.
188 * Joerg Reuter : No modification of locked buffers in
189 * tcp_do_retransmit()
190 * Eric Schenk : Changed receiver side silly window
191 * avoidance algorithm to BSD style
192 * algorithm. This doubles throughput
193 * against machines running Solaris,
194 * and seems to result in general
195 * improvement.
196 * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
197 * Willy Konynenberg : Transparent proxying support.
198 * Mike McLagan : Routing by source
199 * Keith Owens : Do proper merging with partial SKB's in
200 * tcp_do_sendmsg to avoid burstiness.
201 * Eric Schenk : Fix fast close down bug with
202 * shutdown() followed by close().
203 * Andi Kleen : Make poll agree with SIGIO
204 * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
205 * lingertime == 0 (RFC 793 ABORT Call)
206 * Hirokazu Takahashi : Use copy_from_user() instead of
207 * csum_and_copy_from_user() if possible.
208 *
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000209 * Description of States:
210 *
211 * TCP_SYN_SENT sent a connection request, waiting for ack
212 *
213 * TCP_SYN_RECV received a connection request, sent ack,
214 * waiting for final ack in three-way handshake.
215 *
216 * TCP_ESTABLISHED connection established
217 *
218 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
219 * transmission of remaining buffered data
220 *
221 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
222 * to shutdown
223 *
224 * TCP_CLOSING both sides have shutdown but we still have
225 * data we have to finish sending
226 *
227 * TCP_TIME_WAIT timeout to catch resent junk before entering
228 * closed, can only be entered from FIN_WAIT2
229 * or CLOSING. Required because the other end
230 * may not have gotten our last ACK causing it
231 * to retransmit the data packet (which we ignore)
232 *
233 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
234 * us to finish writing our data and to shutdown
235 * (we have to close() to move on to LAST_ACK)
236 *
237 * TCP_LAST_ACK out side has shutdown after remote has
238 * shutdown. There may still be data in our
239 * buffer that we have to finish sending
240 *
241 * TCP_CLOSE socket is finished
242 */
243
244#define pr_fmt(fmt) "TCP: " fmt
245
246#include <crypto/hash.h>
247#include <linux/kernel.h>
248#include <linux/module.h>
249#include <linux/types.h>
250#include <linux/fcntl.h>
251#include <linux/poll.h>
252#include <linux/inet_diag.h>
253#include <linux/init.h>
254#include <linux/fs.h>
255#include <linux/skbuff.h>
256#include <linux/scatterlist.h>
257#include <linux/splice.h>
258#include <linux/net.h>
259#include <linux/socket.h>
260#include <linux/random.h>
David Brazdil0f672f62019-12-10 10:32:29 +0000261#include <linux/memblock.h>
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000262#include <linux/highmem.h>
263#include <linux/swap.h>
264#include <linux/cache.h>
265#include <linux/err.h>
266#include <linux/time.h>
267#include <linux/slab.h>
268#include <linux/errqueue.h>
269#include <linux/static_key.h>
270
271#include <net/icmp.h>
272#include <net/inet_common.h>
273#include <net/tcp.h>
274#include <net/xfrm.h>
275#include <net/ip.h>
276#include <net/sock.h>
277
278#include <linux/uaccess.h>
279#include <asm/ioctls.h>
280#include <net/busy_poll.h>
281
282struct percpu_counter tcp_orphan_count;
283EXPORT_SYMBOL_GPL(tcp_orphan_count);
284
285long sysctl_tcp_mem[3] __read_mostly;
286EXPORT_SYMBOL(sysctl_tcp_mem);
287
288atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
289EXPORT_SYMBOL(tcp_memory_allocated);
290
291#if IS_ENABLED(CONFIG_SMC)
292DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
293EXPORT_SYMBOL(tcp_have_smc);
294#endif
295
296/*
297 * Current number of TCP sockets.
298 */
299struct percpu_counter tcp_sockets_allocated;
300EXPORT_SYMBOL(tcp_sockets_allocated);
301
302/*
303 * TCP splice context
304 */
305struct tcp_splice_state {
306 struct pipe_inode_info *pipe;
307 size_t len;
308 unsigned int flags;
309};
310
311/*
312 * Pressure flag: try to collapse.
313 * Technical note: it is used by multiple contexts non atomically.
314 * All the __sk_mem_schedule() is of this nature: accounting
315 * is strict, actions are advisory and have some latency.
316 */
317unsigned long tcp_memory_pressure __read_mostly;
318EXPORT_SYMBOL_GPL(tcp_memory_pressure);
319
David Brazdil0f672f62019-12-10 10:32:29 +0000320DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
321EXPORT_SYMBOL(tcp_rx_skb_cache_key);
322
323DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
324
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000325void tcp_enter_memory_pressure(struct sock *sk)
326{
327 unsigned long val;
328
David Brazdil0f672f62019-12-10 10:32:29 +0000329 if (READ_ONCE(tcp_memory_pressure))
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000330 return;
331 val = jiffies;
332
333 if (!val)
334 val--;
335 if (!cmpxchg(&tcp_memory_pressure, 0, val))
336 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
337}
338EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);
339
340void tcp_leave_memory_pressure(struct sock *sk)
341{
342 unsigned long val;
343
David Brazdil0f672f62019-12-10 10:32:29 +0000344 if (!READ_ONCE(tcp_memory_pressure))
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000345 return;
346 val = xchg(&tcp_memory_pressure, 0);
347 if (val)
348 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
349 jiffies_to_msecs(jiffies - val));
350}
351EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);
352
353/* Convert seconds to retransmits based on initial and max timeout */
354static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
355{
356 u8 res = 0;
357
358 if (seconds > 0) {
359 int period = timeout;
360
361 res = 1;
362 while (seconds > period && res < 255) {
363 res++;
364 timeout <<= 1;
365 if (timeout > rto_max)
366 timeout = rto_max;
367 period += timeout;
368 }
369 }
370 return res;
371}
372
373/* Convert retransmits to seconds based on initial and max timeout */
374static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
375{
376 int period = 0;
377
378 if (retrans > 0) {
379 period = timeout;
380 while (--retrans) {
381 timeout <<= 1;
382 if (timeout > rto_max)
383 timeout = rto_max;
384 period += timeout;
385 }
386 }
387 return period;
388}
389
390static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
391{
392 u32 rate = READ_ONCE(tp->rate_delivered);
393 u32 intv = READ_ONCE(tp->rate_interval_us);
394 u64 rate64 = 0;
395
396 if (rate && intv) {
397 rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
398 do_div(rate64, intv);
399 }
400 return rate64;
401}
402
403/* Address-family independent initialization for a tcp_sock.
404 *
405 * NOTE: A lot of things set to zero explicitly by call to
406 * sk_alloc() so need not be done here.
407 */
408void tcp_init_sock(struct sock *sk)
409{
410 struct inet_connection_sock *icsk = inet_csk(sk);
411 struct tcp_sock *tp = tcp_sk(sk);
412
413 tp->out_of_order_queue = RB_ROOT;
414 sk->tcp_rtx_queue = RB_ROOT;
415 tcp_init_xmit_timers(sk);
416 INIT_LIST_HEAD(&tp->tsq_node);
417 INIT_LIST_HEAD(&tp->tsorted_sent_queue);
418
419 icsk->icsk_rto = TCP_TIMEOUT_INIT;
420 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
421 minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
422
423 /* So many TCP implementations out there (incorrectly) count the
424 * initial SYN frame in their delayed-ACK and congestion control
425 * algorithms that we must have the following bandaid to talk
426 * efficiently to them. -DaveM
427 */
428 tp->snd_cwnd = TCP_INIT_CWND;
429
430 /* There's a bubble in the pipe until at least the first ACK. */
431 tp->app_limited = ~0U;
432
433 /* See draft-stevens-tcpca-spec-01 for discussion of the
434 * initialization of these values.
435 */
436 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
437 tp->snd_cwnd_clamp = ~0;
438 tp->mss_cache = TCP_MSS_DEFAULT;
439
440 tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
441 tcp_assign_congestion_control(sk);
442
443 tp->tsoffset = 0;
444 tp->rack.reo_wnd_steps = 1;
445
446 sk->sk_state = TCP_CLOSE;
447
448 sk->sk_write_space = sk_stream_write_space;
449 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
450
451 icsk->icsk_sync_mss = tcp_sync_mss;
452
David Brazdil0f672f62019-12-10 10:32:29 +0000453 WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
454 WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000455
456 sk_sockets_allocated_inc(sk);
457 sk->sk_route_forced_caps = NETIF_F_GSO;
458}
459EXPORT_SYMBOL(tcp_init_sock);
460
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000461static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
462{
463 struct sk_buff *skb = tcp_write_queue_tail(sk);
464
465 if (tsflags && skb) {
466 struct skb_shared_info *shinfo = skb_shinfo(skb);
467 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
468
469 sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
470 if (tsflags & SOF_TIMESTAMPING_TX_ACK)
471 tcb->txstamp_ack = 1;
472 if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
473 shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
474 }
475}
476
477static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
478 int target, struct sock *sk)
479{
Olivier Deprez0e641232021-09-23 10:07:05 +0200480 int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq);
481
482 if (avail > 0) {
483 if (avail >= target)
484 return true;
485 if (tcp_rmem_pressure(sk))
486 return true;
487 if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss)
488 return true;
489 }
490 if (sk->sk_prot->stream_memory_read)
491 return sk->sk_prot->stream_memory_read(sk);
492 return false;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000493}
494
495/*
496 * Wait for a TCP event.
497 *
498 * Note that we don't need to lock the socket, as the upper poll layers
499 * take care of normal races (between the test and the event) and we don't
500 * go look at any of the socket buffers directly.
501 */
502__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
503{
504 __poll_t mask;
505 struct sock *sk = sock->sk;
506 const struct tcp_sock *tp = tcp_sk(sk);
507 int state;
508
509 sock_poll_wait(file, sock, wait);
510
511 state = inet_sk_state_load(sk);
512 if (state == TCP_LISTEN)
513 return inet_csk_listen_poll(sk);
514
515 /* Socket is not locked. We are protected from async events
516 * by poll logic and correct handling of state changes
517 * made by other threads is impossible in any case.
518 */
519
520 mask = 0;
521
522 /*
523 * EPOLLHUP is certainly not done right. But poll() doesn't
524 * have a notion of HUP in just one direction, and for a
525 * socket the read side is more interesting.
526 *
527 * Some poll() documentation says that EPOLLHUP is incompatible
528 * with the EPOLLOUT/POLLWR flags, so somebody should check this
529 * all. But careful, it tends to be safer to return too many
530 * bits than too few, and you can easily break real applications
531 * if you don't tell them that something has hung up!
532 *
533 * Check-me.
534 *
535 * Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and
536 * our fs/select.c). It means that after we received EOF,
537 * poll always returns immediately, making impossible poll() on write()
538 * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP
539 * if and only if shutdown has been made in both directions.
540 * Actually, it is interesting to look how Solaris and DUX
541 * solve this dilemma. I would prefer, if EPOLLHUP were maskable,
542 * then we could set it on SND_SHUTDOWN. BTW examples given
543 * in Stevens' books assume exactly this behaviour, it explains
544 * why EPOLLHUP is incompatible with EPOLLOUT. --ANK
545 *
546 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
547 * blocking on fresh not-connected or disconnected socket. --ANK
548 */
549 if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
550 mask |= EPOLLHUP;
551 if (sk->sk_shutdown & RCV_SHUTDOWN)
552 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
553
554 /* Connected or passive Fast Open socket? */
555 if (state != TCP_SYN_SENT &&
David Brazdil0f672f62019-12-10 10:32:29 +0000556 (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000557 int target = sock_rcvlowat(sk, 0, INT_MAX);
558
David Brazdil0f672f62019-12-10 10:32:29 +0000559 if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000560 !sock_flag(sk, SOCK_URGINLINE) &&
561 tp->urg_data)
562 target++;
563
564 if (tcp_stream_is_readable(tp, target, sk))
565 mask |= EPOLLIN | EPOLLRDNORM;
566
567 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
568 if (sk_stream_is_writeable(sk)) {
569 mask |= EPOLLOUT | EPOLLWRNORM;
570 } else { /* send SIGIO later */
571 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
572 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
573
574 /* Race breaker. If space is freed after
575 * wspace test but before the flags are set,
576 * IO signal will be lost. Memory barrier
577 * pairs with the input side.
578 */
579 smp_mb__after_atomic();
580 if (sk_stream_is_writeable(sk))
581 mask |= EPOLLOUT | EPOLLWRNORM;
582 }
583 } else
584 mask |= EPOLLOUT | EPOLLWRNORM;
585
586 if (tp->urg_data & TCP_URG_VALID)
587 mask |= EPOLLPRI;
588 } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
589 /* Active TCP fastopen socket with defer_connect
590 * Return EPOLLOUT so application can call write()
591 * in order for kernel to generate SYN+data
592 */
593 mask |= EPOLLOUT | EPOLLWRNORM;
594 }
595 /* This barrier is coupled with smp_wmb() in tcp_reset() */
596 smp_rmb();
David Brazdil0f672f62019-12-10 10:32:29 +0000597 if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000598 mask |= EPOLLERR;
599
600 return mask;
601}
602EXPORT_SYMBOL(tcp_poll);
603
604int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
605{
606 struct tcp_sock *tp = tcp_sk(sk);
607 int answ;
608 bool slow;
609
610 switch (cmd) {
611 case SIOCINQ:
612 if (sk->sk_state == TCP_LISTEN)
613 return -EINVAL;
614
615 slow = lock_sock_fast(sk);
616 answ = tcp_inq(sk);
617 unlock_sock_fast(sk, slow);
618 break;
619 case SIOCATMARK:
David Brazdil0f672f62019-12-10 10:32:29 +0000620 answ = tp->urg_data &&
621 READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000622 break;
623 case SIOCOUTQ:
624 if (sk->sk_state == TCP_LISTEN)
625 return -EINVAL;
626
627 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
628 answ = 0;
629 else
David Brazdil0f672f62019-12-10 10:32:29 +0000630 answ = READ_ONCE(tp->write_seq) - tp->snd_una;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000631 break;
632 case SIOCOUTQNSD:
633 if (sk->sk_state == TCP_LISTEN)
634 return -EINVAL;
635
636 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
637 answ = 0;
638 else
David Brazdil0f672f62019-12-10 10:32:29 +0000639 answ = READ_ONCE(tp->write_seq) -
640 READ_ONCE(tp->snd_nxt);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000641 break;
642 default:
643 return -ENOIOCTLCMD;
644 }
645
646 return put_user(answ, (int __user *)arg);
647}
648EXPORT_SYMBOL(tcp_ioctl);
649
650static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
651{
652 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
653 tp->pushed_seq = tp->write_seq;
654}
655
656static inline bool forced_push(const struct tcp_sock *tp)
657{
658 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
659}
660
661static void skb_entail(struct sock *sk, struct sk_buff *skb)
662{
663 struct tcp_sock *tp = tcp_sk(sk);
664 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
665
666 skb->csum = 0;
667 tcb->seq = tcb->end_seq = tp->write_seq;
668 tcb->tcp_flags = TCPHDR_ACK;
669 tcb->sacked = 0;
670 __skb_header_release(skb);
671 tcp_add_write_queue_tail(sk, skb);
David Brazdil0f672f62019-12-10 10:32:29 +0000672 sk_wmem_queued_add(sk, skb->truesize);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000673 sk_mem_charge(sk, skb->truesize);
674 if (tp->nonagle & TCP_NAGLE_PUSH)
675 tp->nonagle &= ~TCP_NAGLE_PUSH;
676
677 tcp_slow_start_after_idle_check(sk);
678}
679
680static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
681{
682 if (flags & MSG_OOB)
683 tp->snd_up = tp->write_seq;
684}
685
686/* If a not yet filled skb is pushed, do not send it if
687 * we have data packets in Qdisc or NIC queues :
688 * Because TX completion will happen shortly, it gives a chance
689 * to coalesce future sendmsg() payload into this skb, without
690 * need for a timer, and with no latency trade off.
691 * As packets containing data payload have a bigger truesize
692 * than pure acks (dataless) packets, the last checks prevent
693 * autocorking if we only have an ACK in Qdisc/NIC queues,
694 * or if TX completion was delayed after we processed ACK packet.
695 */
696static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
697 int size_goal)
698{
699 return skb->len < size_goal &&
700 sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
701 !tcp_rtx_queue_empty(sk) &&
702 refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
703}
704
705static void tcp_push(struct sock *sk, int flags, int mss_now,
706 int nonagle, int size_goal)
707{
708 struct tcp_sock *tp = tcp_sk(sk);
709 struct sk_buff *skb;
710
711 skb = tcp_write_queue_tail(sk);
712 if (!skb)
713 return;
714 if (!(flags & MSG_MORE) || forced_push(tp))
715 tcp_mark_push(tp, skb);
716
717 tcp_mark_urg(tp, flags);
718
719 if (tcp_should_autocork(sk, skb, size_goal)) {
720
721 /* avoid atomic op if TSQ_THROTTLED bit is already set */
722 if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
723 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
724 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
725 }
726 /* It is possible TX completion already happened
727 * before we set TSQ_THROTTLED.
728 */
729 if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
730 return;
731 }
732
733 if (flags & MSG_MORE)
734 nonagle = TCP_NAGLE_CORK;
735
736 __tcp_push_pending_frames(sk, mss_now, nonagle);
737}
738
739static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
740 unsigned int offset, size_t len)
741{
742 struct tcp_splice_state *tss = rd_desc->arg.data;
743 int ret;
744
745 ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
746 min(rd_desc->count, len), tss->flags);
747 if (ret > 0)
748 rd_desc->count -= ret;
749 return ret;
750}
751
752static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
753{
754 /* Store TCP splice context information in read_descriptor_t. */
755 read_descriptor_t rd_desc = {
756 .arg.data = tss,
757 .count = tss->len,
758 };
759
760 return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
761}
762
763/**
764 * tcp_splice_read - splice data from TCP socket to a pipe
765 * @sock: socket to splice from
766 * @ppos: position (not valid)
767 * @pipe: pipe to splice to
768 * @len: number of bytes to splice
769 * @flags: splice modifier flags
770 *
771 * Description:
772 * Will read pages from given socket and fill them into a pipe.
773 *
774 **/
775ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
776 struct pipe_inode_info *pipe, size_t len,
777 unsigned int flags)
778{
779 struct sock *sk = sock->sk;
780 struct tcp_splice_state tss = {
781 .pipe = pipe,
782 .len = len,
783 .flags = flags,
784 };
785 long timeo;
786 ssize_t spliced;
787 int ret;
788
789 sock_rps_record_flow(sk);
790 /*
791 * We can't seek on a socket input
792 */
793 if (unlikely(*ppos))
794 return -ESPIPE;
795
796 ret = spliced = 0;
797
798 lock_sock(sk);
799
800 timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
801 while (tss.len) {
802 ret = __tcp_splice_read(sk, &tss);
803 if (ret < 0)
804 break;
805 else if (!ret) {
806 if (spliced)
807 break;
808 if (sock_flag(sk, SOCK_DONE))
809 break;
810 if (sk->sk_err) {
811 ret = sock_error(sk);
812 break;
813 }
814 if (sk->sk_shutdown & RCV_SHUTDOWN)
815 break;
816 if (sk->sk_state == TCP_CLOSE) {
817 /*
818 * This occurs when user tries to read
819 * from never connected socket.
820 */
821 ret = -ENOTCONN;
822 break;
823 }
824 if (!timeo) {
825 ret = -EAGAIN;
826 break;
827 }
828 /* if __tcp_splice_read() got nothing while we have
829 * an skb in receive queue, we do not want to loop.
830 * This might happen with URG data.
831 */
832 if (!skb_queue_empty(&sk->sk_receive_queue))
833 break;
834 sk_wait_data(sk, &timeo, NULL);
835 if (signal_pending(current)) {
836 ret = sock_intr_errno(timeo);
837 break;
838 }
839 continue;
840 }
841 tss.len -= ret;
842 spliced += ret;
843
844 if (!timeo)
845 break;
846 release_sock(sk);
847 lock_sock(sk);
848
849 if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
850 (sk->sk_shutdown & RCV_SHUTDOWN) ||
851 signal_pending(current))
852 break;
853 }
854
855 release_sock(sk);
856
857 if (spliced)
858 return spliced;
859
860 return ret;
861}
862EXPORT_SYMBOL(tcp_splice_read);
863
864struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
865 bool force_schedule)
866{
867 struct sk_buff *skb;
868
David Brazdil0f672f62019-12-10 10:32:29 +0000869 if (likely(!size)) {
870 skb = sk->sk_tx_skb_cache;
871 if (skb) {
872 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
873 sk->sk_tx_skb_cache = NULL;
874 pskb_trim(skb, 0);
875 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
876 skb_shinfo(skb)->tx_flags = 0;
877 memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb));
878 return skb;
879 }
880 }
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000881 /* The TCP header must be at least 32-bit aligned. */
882 size = ALIGN(size, 4);
883
884 if (unlikely(tcp_under_memory_pressure(sk)))
885 sk_mem_reclaim_partial(sk);
886
887 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
888 if (likely(skb)) {
889 bool mem_scheduled;
890
891 if (force_schedule) {
892 mem_scheduled = true;
893 sk_forced_mem_schedule(sk, skb->truesize);
894 } else {
895 mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
896 }
897 if (likely(mem_scheduled)) {
898 skb_reserve(skb, sk->sk_prot->max_header);
899 /*
900 * Make sure that we have exactly size bytes
901 * available to the caller, no more, no less.
902 */
903 skb->reserved_tailroom = skb->end - skb->tail - size;
904 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
905 return skb;
906 }
907 __kfree_skb(skb);
908 } else {
909 sk->sk_prot->enter_memory_pressure(sk);
910 sk_stream_moderate_sndbuf(sk);
911 }
912 return NULL;
913}
914
915static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
916 int large_allowed)
917{
918 struct tcp_sock *tp = tcp_sk(sk);
919 u32 new_size_goal, size_goal;
920
921 if (!large_allowed)
922 return mss_now;
923
924 /* Note : tcp_tso_autosize() will eventually split this later */
925 new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
926 new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
927
928 /* We try hard to avoid divides here */
929 size_goal = tp->gso_segs * mss_now;
930 if (unlikely(new_size_goal < size_goal ||
931 new_size_goal >= size_goal + mss_now)) {
932 tp->gso_segs = min_t(u16, new_size_goal / mss_now,
933 sk->sk_gso_max_segs);
934 size_goal = tp->gso_segs * mss_now;
935 }
936
937 return max(size_goal, mss_now);
938}
939
940static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
941{
942 int mss_now;
943
944 mss_now = tcp_current_mss(sk);
945 *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
946
947 return mss_now;
948}
949
David Brazdil0f672f62019-12-10 10:32:29 +0000950/* In some cases, both sendpage() and sendmsg() could have added
951 * an skb to the write queue, but failed adding payload on it.
952 * We need to remove it to consume less memory, but more
953 * importantly be able to generate EPOLLOUT for Edge Trigger epoll()
954 * users.
955 */
956static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
957{
958 if (skb && !skb->len) {
959 tcp_unlink_write_queue(skb, sk);
960 if (tcp_write_queue_empty(sk))
961 tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
962 sk_wmem_free_skb(sk, skb);
963 }
964}
965
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000966ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
967 size_t size, int flags)
968{
969 struct tcp_sock *tp = tcp_sk(sk);
970 int mss_now, size_goal;
971 int err;
972 ssize_t copied;
973 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
974
David Brazdil0f672f62019-12-10 10:32:29 +0000975 if (IS_ENABLED(CONFIG_DEBUG_VM) &&
Olivier Deprez0e641232021-09-23 10:07:05 +0200976 WARN_ONCE(!sendpage_ok(page),
977 "page must not be a Slab one and have page_count > 0"))
David Brazdil0f672f62019-12-10 10:32:29 +0000978 return -EINVAL;
979
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000980 /* Wait for a connection to finish. One exception is TCP Fast Open
981 * (passive side) where data is allowed to be sent before a connection
982 * is fully established.
983 */
984 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
985 !tcp_passive_fastopen(sk)) {
986 err = sk_stream_wait_connect(sk, &timeo);
987 if (err != 0)
988 goto out_err;
989 }
990
991 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
992
993 mss_now = tcp_send_mss(sk, &size_goal, flags);
994 copied = 0;
995
996 err = -EPIPE;
997 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
998 goto out_err;
999
1000 while (size > 0) {
1001 struct sk_buff *skb = tcp_write_queue_tail(sk);
1002 int copy, i;
1003 bool can_coalesce;
1004
1005 if (!skb || (copy = size_goal - skb->len) <= 0 ||
1006 !tcp_skb_can_collapse_to(skb)) {
1007new_segment:
1008 if (!sk_stream_memory_free(sk))
1009 goto wait_for_sndbuf;
1010
1011 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
1012 tcp_rtx_and_write_queues_empty(sk));
1013 if (!skb)
1014 goto wait_for_memory;
1015
David Brazdil0f672f62019-12-10 10:32:29 +00001016#ifdef CONFIG_TLS_DEVICE
1017 skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
1018#endif
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001019 skb_entail(sk, skb);
1020 copy = size_goal;
1021 }
1022
1023 if (copy > size)
1024 copy = size;
1025
1026 i = skb_shinfo(skb)->nr_frags;
1027 can_coalesce = skb_can_coalesce(skb, i, page, offset);
1028 if (!can_coalesce && i >= sysctl_max_skb_frags) {
1029 tcp_mark_push(tp, skb);
1030 goto new_segment;
1031 }
1032 if (!sk_wmem_schedule(sk, copy))
1033 goto wait_for_memory;
1034
1035 if (can_coalesce) {
1036 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1037 } else {
1038 get_page(page);
1039 skb_fill_page_desc(skb, i, page, offset, copy);
1040 }
1041
1042 if (!(flags & MSG_NO_SHARED_FRAGS))
1043 skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
1044
1045 skb->len += copy;
1046 skb->data_len += copy;
1047 skb->truesize += copy;
David Brazdil0f672f62019-12-10 10:32:29 +00001048 sk_wmem_queued_add(sk, copy);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001049 sk_mem_charge(sk, copy);
1050 skb->ip_summed = CHECKSUM_PARTIAL;
David Brazdil0f672f62019-12-10 10:32:29 +00001051 WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001052 TCP_SKB_CB(skb)->end_seq += copy;
1053 tcp_skb_pcount_set(skb, 0);
1054
1055 if (!copied)
1056 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1057
1058 copied += copy;
1059 offset += copy;
1060 size -= copy;
1061 if (!size)
1062 goto out;
1063
1064 if (skb->len < size_goal || (flags & MSG_OOB))
1065 continue;
1066
1067 if (forced_push(tp)) {
1068 tcp_mark_push(tp, skb);
1069 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1070 } else if (skb == tcp_send_head(sk))
1071 tcp_push_one(sk, mss_now);
1072 continue;
1073
1074wait_for_sndbuf:
1075 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1076wait_for_memory:
1077 tcp_push(sk, flags & ~MSG_MORE, mss_now,
1078 TCP_NAGLE_PUSH, size_goal);
1079
1080 err = sk_stream_wait_memory(sk, &timeo);
1081 if (err != 0)
1082 goto do_error;
1083
1084 mss_now = tcp_send_mss(sk, &size_goal, flags);
1085 }
1086
1087out:
1088 if (copied) {
1089 tcp_tx_timestamp(sk, sk->sk_tsflags);
1090 if (!(flags & MSG_SENDPAGE_NOTLAST))
1091 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1092 }
1093 return copied;
1094
1095do_error:
David Brazdil0f672f62019-12-10 10:32:29 +00001096 tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001097 if (copied)
1098 goto out;
1099out_err:
1100 /* make sure we wake any epoll edge trigger waiter */
Olivier Deprez0e641232021-09-23 10:07:05 +02001101 if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001102 sk->sk_write_space(sk);
1103 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1104 }
1105 return sk_stream_error(sk, flags, err);
1106}
1107EXPORT_SYMBOL_GPL(do_tcp_sendpages);
1108
1109int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
1110 size_t size, int flags)
1111{
1112 if (!(sk->sk_route_caps & NETIF_F_SG))
1113 return sock_no_sendpage_locked(sk, page, offset, size, flags);
1114
1115 tcp_rate_check_app_limited(sk); /* is sending application-limited? */
1116
1117 return do_tcp_sendpages(sk, page, offset, size, flags);
1118}
1119EXPORT_SYMBOL_GPL(tcp_sendpage_locked);
1120
1121int tcp_sendpage(struct sock *sk, struct page *page, int offset,
1122 size_t size, int flags)
1123{
1124 int ret;
1125
1126 lock_sock(sk);
1127 ret = tcp_sendpage_locked(sk, page, offset, size, flags);
1128 release_sock(sk);
1129
1130 return ret;
1131}
1132EXPORT_SYMBOL(tcp_sendpage);
1133
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001134void tcp_free_fastopen_req(struct tcp_sock *tp)
1135{
1136 if (tp->fastopen_req) {
1137 kfree(tp->fastopen_req);
1138 tp->fastopen_req = NULL;
1139 }
1140}
1141
1142static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
David Brazdil0f672f62019-12-10 10:32:29 +00001143 int *copied, size_t size,
1144 struct ubuf_info *uarg)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001145{
1146 struct tcp_sock *tp = tcp_sk(sk);
1147 struct inet_sock *inet = inet_sk(sk);
1148 struct sockaddr *uaddr = msg->msg_name;
1149 int err, flags;
1150
1151 if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
1152 (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
1153 uaddr->sa_family == AF_UNSPEC))
1154 return -EOPNOTSUPP;
1155 if (tp->fastopen_req)
1156 return -EALREADY; /* Another Fast Open is in progress */
1157
1158 tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1159 sk->sk_allocation);
1160 if (unlikely(!tp->fastopen_req))
1161 return -ENOBUFS;
1162 tp->fastopen_req->data = msg;
1163 tp->fastopen_req->size = size;
David Brazdil0f672f62019-12-10 10:32:29 +00001164 tp->fastopen_req->uarg = uarg;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001165
1166 if (inet->defer_connect) {
1167 err = tcp_connect(sk);
1168 /* Same failure procedure as in tcp_v4/6_connect */
1169 if (err) {
1170 tcp_set_state(sk, TCP_CLOSE);
1171 inet->inet_dport = 0;
1172 sk->sk_route_caps = 0;
1173 }
1174 }
1175 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1176 err = __inet_stream_connect(sk->sk_socket, uaddr,
1177 msg->msg_namelen, flags, 1);
1178 /* fastopen_req could already be freed in __inet_stream_connect
1179 * if the connection times out or gets rst
1180 */
1181 if (tp->fastopen_req) {
1182 *copied = tp->fastopen_req->copied;
1183 tcp_free_fastopen_req(tp);
1184 inet->defer_connect = 0;
1185 }
1186 return err;
1187}
1188
1189int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
1190{
1191 struct tcp_sock *tp = tcp_sk(sk);
1192 struct ubuf_info *uarg = NULL;
1193 struct sk_buff *skb;
1194 struct sockcm_cookie sockc;
1195 int flags, err, copied = 0;
1196 int mss_now = 0, size_goal, copied_syn = 0;
David Brazdil0f672f62019-12-10 10:32:29 +00001197 int process_backlog = 0;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001198 bool zc = false;
1199 long timeo;
1200
1201 flags = msg->msg_flags;
1202
1203 if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001204 skb = tcp_write_queue_tail(sk);
1205 uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
1206 if (!uarg) {
1207 err = -ENOBUFS;
1208 goto out_err;
1209 }
1210
1211 zc = sk->sk_route_caps & NETIF_F_SG;
1212 if (!zc)
1213 uarg->zerocopy = 0;
1214 }
1215
1216 if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
1217 !tp->repair) {
David Brazdil0f672f62019-12-10 10:32:29 +00001218 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001219 if (err == -EINPROGRESS && copied_syn > 0)
1220 goto out;
1221 else if (err)
1222 goto out_err;
1223 }
1224
1225 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1226
1227 tcp_rate_check_app_limited(sk); /* is sending application-limited? */
1228
1229 /* Wait for a connection to finish. One exception is TCP Fast Open
1230 * (passive side) where data is allowed to be sent before a connection
1231 * is fully established.
1232 */
1233 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1234 !tcp_passive_fastopen(sk)) {
1235 err = sk_stream_wait_connect(sk, &timeo);
1236 if (err != 0)
1237 goto do_error;
1238 }
1239
1240 if (unlikely(tp->repair)) {
1241 if (tp->repair_queue == TCP_RECV_QUEUE) {
1242 copied = tcp_send_rcvq(sk, msg, size);
1243 goto out_nopush;
1244 }
1245
1246 err = -EINVAL;
1247 if (tp->repair_queue == TCP_NO_QUEUE)
1248 goto out_err;
1249
1250 /* 'common' sending to sendq */
1251 }
1252
1253 sockcm_init(&sockc, sk);
1254 if (msg->msg_controllen) {
1255 err = sock_cmsg_send(sk, msg, &sockc);
1256 if (unlikely(err)) {
1257 err = -EINVAL;
1258 goto out_err;
1259 }
1260 }
1261
1262 /* This should be in poll */
1263 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1264
1265 /* Ok commence sending. */
1266 copied = 0;
1267
1268restart:
1269 mss_now = tcp_send_mss(sk, &size_goal, flags);
1270
1271 err = -EPIPE;
1272 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1273 goto do_error;
1274
1275 while (msg_data_left(msg)) {
1276 int copy = 0;
1277
1278 skb = tcp_write_queue_tail(sk);
1279 if (skb)
1280 copy = size_goal - skb->len;
1281
1282 if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
1283 bool first_skb;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001284
1285new_segment:
1286 if (!sk_stream_memory_free(sk))
1287 goto wait_for_sndbuf;
1288
David Brazdil0f672f62019-12-10 10:32:29 +00001289 if (unlikely(process_backlog >= 16)) {
1290 process_backlog = 0;
1291 if (sk_flush_backlog(sk))
1292 goto restart;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001293 }
1294 first_skb = tcp_rtx_and_write_queues_empty(sk);
David Brazdil0f672f62019-12-10 10:32:29 +00001295 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001296 first_skb);
1297 if (!skb)
1298 goto wait_for_memory;
1299
David Brazdil0f672f62019-12-10 10:32:29 +00001300 process_backlog++;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001301 skb->ip_summed = CHECKSUM_PARTIAL;
1302
1303 skb_entail(sk, skb);
1304 copy = size_goal;
1305
1306 /* All packets are restored as if they have
David Brazdil0f672f62019-12-10 10:32:29 +00001307 * already been sent. skb_mstamp_ns isn't set to
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001308 * avoid wrong rtt estimation.
1309 */
1310 if (tp->repair)
1311 TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
1312 }
1313
1314 /* Try to append data to the end of skb. */
1315 if (copy > msg_data_left(msg))
1316 copy = msg_data_left(msg);
1317
1318 /* Where to copy to? */
1319 if (skb_availroom(skb) > 0 && !zc) {
1320 /* We have some space in skb head. Superb! */
1321 copy = min_t(int, copy, skb_availroom(skb));
1322 err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
1323 if (err)
1324 goto do_fault;
1325 } else if (!zc) {
1326 bool merge = true;
1327 int i = skb_shinfo(skb)->nr_frags;
1328 struct page_frag *pfrag = sk_page_frag(sk);
1329
1330 if (!sk_page_frag_refill(sk, pfrag))
1331 goto wait_for_memory;
1332
1333 if (!skb_can_coalesce(skb, i, pfrag->page,
1334 pfrag->offset)) {
1335 if (i >= sysctl_max_skb_frags) {
1336 tcp_mark_push(tp, skb);
1337 goto new_segment;
1338 }
1339 merge = false;
1340 }
1341
1342 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1343
1344 if (!sk_wmem_schedule(sk, copy))
1345 goto wait_for_memory;
1346
1347 err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
1348 pfrag->page,
1349 pfrag->offset,
1350 copy);
1351 if (err)
1352 goto do_error;
1353
1354 /* Update the skb. */
1355 if (merge) {
1356 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1357 } else {
1358 skb_fill_page_desc(skb, i, pfrag->page,
1359 pfrag->offset, copy);
1360 page_ref_inc(pfrag->page);
1361 }
1362 pfrag->offset += copy;
1363 } else {
1364 err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
1365 if (err == -EMSGSIZE || err == -EEXIST) {
1366 tcp_mark_push(tp, skb);
1367 goto new_segment;
1368 }
1369 if (err < 0)
1370 goto do_error;
1371 copy = err;
1372 }
1373
1374 if (!copied)
1375 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1376
David Brazdil0f672f62019-12-10 10:32:29 +00001377 WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001378 TCP_SKB_CB(skb)->end_seq += copy;
1379 tcp_skb_pcount_set(skb, 0);
1380
1381 copied += copy;
1382 if (!msg_data_left(msg)) {
1383 if (unlikely(flags & MSG_EOR))
1384 TCP_SKB_CB(skb)->eor = 1;
1385 goto out;
1386 }
1387
1388 if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
1389 continue;
1390
1391 if (forced_push(tp)) {
1392 tcp_mark_push(tp, skb);
1393 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1394 } else if (skb == tcp_send_head(sk))
1395 tcp_push_one(sk, mss_now);
1396 continue;
1397
1398wait_for_sndbuf:
1399 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1400wait_for_memory:
1401 if (copied)
1402 tcp_push(sk, flags & ~MSG_MORE, mss_now,
1403 TCP_NAGLE_PUSH, size_goal);
1404
1405 err = sk_stream_wait_memory(sk, &timeo);
1406 if (err != 0)
1407 goto do_error;
1408
1409 mss_now = tcp_send_mss(sk, &size_goal, flags);
1410 }
1411
1412out:
1413 if (copied) {
1414 tcp_tx_timestamp(sk, sockc.tsflags);
1415 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1416 }
1417out_nopush:
1418 sock_zerocopy_put(uarg);
1419 return copied + copied_syn;
1420
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001421do_error:
David Brazdil0f672f62019-12-10 10:32:29 +00001422 skb = tcp_write_queue_tail(sk);
1423do_fault:
1424 tcp_remove_empty_skb(sk, skb);
1425
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001426 if (copied + copied_syn)
1427 goto out;
1428out_err:
David Brazdil0f672f62019-12-10 10:32:29 +00001429 sock_zerocopy_put_abort(uarg, true);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001430 err = sk_stream_error(sk, flags, err);
1431 /* make sure we wake any epoll edge trigger waiter */
Olivier Deprez0e641232021-09-23 10:07:05 +02001432 if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001433 sk->sk_write_space(sk);
1434 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1435 }
1436 return err;
1437}
1438EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
1439
1440int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
1441{
1442 int ret;
1443
1444 lock_sock(sk);
1445 ret = tcp_sendmsg_locked(sk, msg, size);
1446 release_sock(sk);
1447
1448 return ret;
1449}
1450EXPORT_SYMBOL(tcp_sendmsg);
1451
1452/*
1453 * Handle reading urgent data. BSD has very simple semantics for
1454 * this, no blocking and very strange errors 8)
1455 */
1456
1457static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1458{
1459 struct tcp_sock *tp = tcp_sk(sk);
1460
1461 /* No URG data to read. */
1462 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1463 tp->urg_data == TCP_URG_READ)
1464 return -EINVAL; /* Yes this is right ! */
1465
1466 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1467 return -ENOTCONN;
1468
1469 if (tp->urg_data & TCP_URG_VALID) {
1470 int err = 0;
1471 char c = tp->urg_data;
1472
1473 if (!(flags & MSG_PEEK))
1474 tp->urg_data = TCP_URG_READ;
1475
1476 /* Read urgent data. */
1477 msg->msg_flags |= MSG_OOB;
1478
1479 if (len > 0) {
1480 if (!(flags & MSG_TRUNC))
1481 err = memcpy_to_msg(msg, &c, 1);
1482 len = 1;
1483 } else
1484 msg->msg_flags |= MSG_TRUNC;
1485
1486 return err ? -EFAULT : len;
1487 }
1488
1489 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1490 return 0;
1491
1492 /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1493 * the available implementations agree in this case:
1494 * this call should never block, independent of the
1495 * blocking state of the socket.
1496 * Mike <pall@rz.uni-karlsruhe.de>
1497 */
1498 return -EAGAIN;
1499}
1500
1501static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1502{
1503 struct sk_buff *skb;
1504 int copied = 0, err = 0;
1505
1506 /* XXX -- need to support SO_PEEK_OFF */
1507
1508 skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
1509 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1510 if (err)
1511 return err;
1512 copied += skb->len;
1513 }
1514
1515 skb_queue_walk(&sk->sk_write_queue, skb) {
1516 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1517 if (err)
1518 break;
1519
1520 copied += skb->len;
1521 }
1522
1523 return err ?: copied;
1524}
1525
1526/* Clean up the receive buffer for full frames taken by the user,
1527 * then send an ACK if necessary. COPIED is the number of bytes
1528 * tcp_recvmsg has given to the user so far, it speeds up the
1529 * calculation of whether or not we must ACK for the sake of
1530 * a window update.
1531 */
1532static void tcp_cleanup_rbuf(struct sock *sk, int copied)
1533{
1534 struct tcp_sock *tp = tcp_sk(sk);
1535 bool time_to_ack = false;
1536
1537 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1538
1539 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1540 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1541 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1542
1543 if (inet_csk_ack_scheduled(sk)) {
1544 const struct inet_connection_sock *icsk = inet_csk(sk);
1545 /* Delayed ACKs frequently hit locked sockets during bulk
1546 * receive. */
1547 if (icsk->icsk_ack.blocked ||
1548 /* Once-per-two-segments ACK was not sent by tcp_input.c */
1549 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1550 /*
1551 * If this read emptied read buffer, we send ACK, if
1552 * connection is not bidirectional, user drained
1553 * receive buffer and there was a small segment
1554 * in queue.
1555 */
1556 (copied > 0 &&
1557 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1558 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
David Brazdil0f672f62019-12-10 10:32:29 +00001559 !inet_csk_in_pingpong_mode(sk))) &&
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001560 !atomic_read(&sk->sk_rmem_alloc)))
1561 time_to_ack = true;
1562 }
1563
1564 /* We send an ACK if we can now advertise a non-zero window
1565 * which has been raised "significantly".
1566 *
1567 * Even if window raised up to infinity, do not send window open ACK
1568 * in states, where we will not receive more. It is useless.
1569 */
1570 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1571 __u32 rcv_window_now = tcp_receive_window(tp);
1572
1573 /* Optimize, __tcp_select_window() is not cheap. */
1574 if (2*rcv_window_now <= tp->window_clamp) {
1575 __u32 new_window = __tcp_select_window(sk);
1576
1577 /* Send ACK now, if this read freed lots of space
1578 * in our buffer. Certainly, new_window is new window.
1579 * We can advertise it now, if it is not less than current one.
1580 * "Lots" means "at least twice" here.
1581 */
1582 if (new_window && new_window >= 2 * rcv_window_now)
1583 time_to_ack = true;
1584 }
1585 }
1586 if (time_to_ack)
1587 tcp_send_ack(sk);
1588}
1589
1590static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1591{
1592 struct sk_buff *skb;
1593 u32 offset;
1594
1595 while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
1596 offset = seq - TCP_SKB_CB(skb)->seq;
1597 if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
1598 pr_err_once("%s: found a SYN, please report !\n", __func__);
1599 offset--;
1600 }
1601 if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
1602 *off = offset;
1603 return skb;
1604 }
1605 /* This looks weird, but this can happen if TCP collapsing
1606 * splitted a fat GRO packet, while we released socket lock
1607 * in skb_splice_bits()
1608 */
1609 sk_eat_skb(sk, skb);
1610 }
1611 return NULL;
1612}
1613
1614/*
1615 * This routine provides an alternative to tcp_recvmsg() for routines
1616 * that would like to handle copying from skbuffs directly in 'sendfile'
1617 * fashion.
1618 * Note:
1619 * - It is assumed that the socket was locked by the caller.
1620 * - The routine does not block.
1621 * - At present, there is no support for reading OOB data
1622 * or for 'peeking' the socket using this routine
1623 * (although both would be easy to implement).
1624 */
1625int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1626 sk_read_actor_t recv_actor)
1627{
1628 struct sk_buff *skb;
1629 struct tcp_sock *tp = tcp_sk(sk);
1630 u32 seq = tp->copied_seq;
1631 u32 offset;
1632 int copied = 0;
1633
1634 if (sk->sk_state == TCP_LISTEN)
1635 return -ENOTCONN;
1636 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1637 if (offset < skb->len) {
1638 int used;
1639 size_t len;
1640
1641 len = skb->len - offset;
1642 /* Stop reading if we hit a patch of urgent data */
1643 if (tp->urg_data) {
1644 u32 urg_offset = tp->urg_seq - seq;
1645 if (urg_offset < len)
1646 len = urg_offset;
1647 if (!len)
1648 break;
1649 }
1650 used = recv_actor(desc, skb, offset, len);
1651 if (used <= 0) {
1652 if (!copied)
1653 copied = used;
1654 break;
1655 } else if (used <= len) {
1656 seq += used;
1657 copied += used;
1658 offset += used;
1659 }
1660 /* If recv_actor drops the lock (e.g. TCP splice
1661 * receive) the skb pointer might be invalid when
1662 * getting here: tcp_collapse might have deleted it
1663 * while aggregating skbs from the socket queue.
1664 */
1665 skb = tcp_recv_skb(sk, seq - 1, &offset);
1666 if (!skb)
1667 break;
1668 /* TCP coalescing might have appended data to the skb.
1669 * Try to splice more frags
1670 */
1671 if (offset + 1 != skb->len)
1672 continue;
1673 }
1674 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1675 sk_eat_skb(sk, skb);
1676 ++seq;
1677 break;
1678 }
1679 sk_eat_skb(sk, skb);
1680 if (!desc->count)
1681 break;
David Brazdil0f672f62019-12-10 10:32:29 +00001682 WRITE_ONCE(tp->copied_seq, seq);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001683 }
David Brazdil0f672f62019-12-10 10:32:29 +00001684 WRITE_ONCE(tp->copied_seq, seq);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001685
1686 tcp_rcv_space_adjust(sk);
1687
1688 /* Clean up data we have read: This will do ACK frames. */
1689 if (copied > 0) {
1690 tcp_recv_skb(sk, seq, &offset);
1691 tcp_cleanup_rbuf(sk, copied);
1692 }
1693 return copied;
1694}
1695EXPORT_SYMBOL(tcp_read_sock);
1696
1697int tcp_peek_len(struct socket *sock)
1698{
1699 return tcp_inq(sock->sk);
1700}
1701EXPORT_SYMBOL(tcp_peek_len);
1702
1703/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
1704int tcp_set_rcvlowat(struct sock *sk, int val)
1705{
1706 int cap;
1707
1708 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1709 cap = sk->sk_rcvbuf >> 1;
1710 else
1711 cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
1712 val = min(val, cap);
David Brazdil0f672f62019-12-10 10:32:29 +00001713 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001714
1715 /* Check if we need to signal EPOLLIN right now */
1716 tcp_data_ready(sk);
1717
1718 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1719 return 0;
1720
1721 val <<= 1;
1722 if (val > sk->sk_rcvbuf) {
David Brazdil0f672f62019-12-10 10:32:29 +00001723 WRITE_ONCE(sk->sk_rcvbuf, val);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001724 tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
1725 }
1726 return 0;
1727}
1728EXPORT_SYMBOL(tcp_set_rcvlowat);
1729
1730#ifdef CONFIG_MMU
1731static const struct vm_operations_struct tcp_vm_ops = {
1732};
1733
1734int tcp_mmap(struct file *file, struct socket *sock,
1735 struct vm_area_struct *vma)
1736{
1737 if (vma->vm_flags & (VM_WRITE | VM_EXEC))
1738 return -EPERM;
1739 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
1740
1741 /* Instruct vm_insert_page() to not down_read(mmap_sem) */
1742 vma->vm_flags |= VM_MIXEDMAP;
1743
1744 vma->vm_ops = &tcp_vm_ops;
1745 return 0;
1746}
1747EXPORT_SYMBOL(tcp_mmap);
1748
1749static int tcp_zerocopy_receive(struct sock *sk,
1750 struct tcp_zerocopy_receive *zc)
1751{
1752 unsigned long address = (unsigned long)zc->address;
1753 const skb_frag_t *frags = NULL;
1754 u32 length = 0, seq, offset;
1755 struct vm_area_struct *vma;
1756 struct sk_buff *skb = NULL;
1757 struct tcp_sock *tp;
David Brazdil0f672f62019-12-10 10:32:29 +00001758 int inq;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001759 int ret;
1760
1761 if (address & (PAGE_SIZE - 1) || address != zc->address)
1762 return -EINVAL;
1763
1764 if (sk->sk_state == TCP_LISTEN)
1765 return -ENOTCONN;
1766
1767 sock_rps_record_flow(sk);
1768
1769 down_read(&current->mm->mmap_sem);
1770
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001771 vma = find_vma(current->mm, address);
Olivier Deprez0e641232021-09-23 10:07:05 +02001772 if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) {
1773 up_read(&current->mm->mmap_sem);
1774 return -EINVAL;
1775 }
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001776 zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
1777
1778 tp = tcp_sk(sk);
1779 seq = tp->copied_seq;
David Brazdil0f672f62019-12-10 10:32:29 +00001780 inq = tcp_inq(sk);
1781 zc->length = min_t(u32, zc->length, inq);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001782 zc->length &= ~(PAGE_SIZE - 1);
David Brazdil0f672f62019-12-10 10:32:29 +00001783 if (zc->length) {
1784 zap_page_range(vma, address, zc->length);
1785 zc->recv_skip_hint = 0;
1786 } else {
1787 zc->recv_skip_hint = inq;
1788 }
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001789 ret = 0;
1790 while (length + PAGE_SIZE <= zc->length) {
1791 if (zc->recv_skip_hint < PAGE_SIZE) {
1792 if (skb) {
1793 skb = skb->next;
1794 offset = seq - TCP_SKB_CB(skb)->seq;
1795 } else {
1796 skb = tcp_recv_skb(sk, seq, &offset);
1797 }
1798
1799 zc->recv_skip_hint = skb->len - offset;
1800 offset -= skb_headlen(skb);
1801 if ((int)offset < 0 || skb_has_frag_list(skb))
1802 break;
1803 frags = skb_shinfo(skb)->frags;
1804 while (offset) {
David Brazdil0f672f62019-12-10 10:32:29 +00001805 if (skb_frag_size(frags) > offset)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001806 goto out;
David Brazdil0f672f62019-12-10 10:32:29 +00001807 offset -= skb_frag_size(frags);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001808 frags++;
1809 }
1810 }
David Brazdil0f672f62019-12-10 10:32:29 +00001811 if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) {
1812 int remaining = zc->recv_skip_hint;
1813
1814 while (remaining && (skb_frag_size(frags) != PAGE_SIZE ||
1815 skb_frag_off(frags))) {
1816 remaining -= skb_frag_size(frags);
1817 frags++;
1818 }
1819 zc->recv_skip_hint -= remaining;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001820 break;
David Brazdil0f672f62019-12-10 10:32:29 +00001821 }
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001822 ret = vm_insert_page(vma, address + length,
1823 skb_frag_page(frags));
1824 if (ret)
1825 break;
1826 length += PAGE_SIZE;
1827 seq += PAGE_SIZE;
1828 zc->recv_skip_hint -= PAGE_SIZE;
1829 frags++;
1830 }
1831out:
1832 up_read(&current->mm->mmap_sem);
1833 if (length) {
David Brazdil0f672f62019-12-10 10:32:29 +00001834 WRITE_ONCE(tp->copied_seq, seq);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001835 tcp_rcv_space_adjust(sk);
1836
1837 /* Clean up data we have read: This will do ACK frames. */
1838 tcp_recv_skb(sk, seq, &offset);
1839 tcp_cleanup_rbuf(sk, length);
1840 ret = 0;
1841 if (length == zc->length)
1842 zc->recv_skip_hint = 0;
1843 } else {
1844 if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
1845 ret = -EIO;
1846 }
1847 zc->length = length;
1848 return ret;
1849}
1850#endif
1851
1852static void tcp_update_recv_tstamps(struct sk_buff *skb,
David Brazdil0f672f62019-12-10 10:32:29 +00001853 struct scm_timestamping_internal *tss)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001854{
1855 if (skb->tstamp)
David Brazdil0f672f62019-12-10 10:32:29 +00001856 tss->ts[0] = ktime_to_timespec64(skb->tstamp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001857 else
David Brazdil0f672f62019-12-10 10:32:29 +00001858 tss->ts[0] = (struct timespec64) {0};
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001859
1860 if (skb_hwtstamps(skb)->hwtstamp)
David Brazdil0f672f62019-12-10 10:32:29 +00001861 tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001862 else
David Brazdil0f672f62019-12-10 10:32:29 +00001863 tss->ts[2] = (struct timespec64) {0};
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001864}
1865
1866/* Similar to __sock_recv_timestamp, but does not require an skb */
1867static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
David Brazdil0f672f62019-12-10 10:32:29 +00001868 struct scm_timestamping_internal *tss)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001869{
David Brazdil0f672f62019-12-10 10:32:29 +00001870 int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001871 bool has_timestamping = false;
1872
1873 if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
1874 if (sock_flag(sk, SOCK_RCVTSTAMP)) {
1875 if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
David Brazdil0f672f62019-12-10 10:32:29 +00001876 if (new_tstamp) {
1877 struct __kernel_timespec kts = {tss->ts[0].tv_sec, tss->ts[0].tv_nsec};
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001878
David Brazdil0f672f62019-12-10 10:32:29 +00001879 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
1880 sizeof(kts), &kts);
1881 } else {
1882 struct timespec ts_old = timespec64_to_timespec(tss->ts[0]);
1883
1884 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
1885 sizeof(ts_old), &ts_old);
1886 }
1887 } else {
1888 if (new_tstamp) {
1889 struct __kernel_sock_timeval stv;
1890
1891 stv.tv_sec = tss->ts[0].tv_sec;
1892 stv.tv_usec = tss->ts[0].tv_nsec / 1000;
1893 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
1894 sizeof(stv), &stv);
1895 } else {
1896 struct __kernel_old_timeval tv;
1897
1898 tv.tv_sec = tss->ts[0].tv_sec;
1899 tv.tv_usec = tss->ts[0].tv_nsec / 1000;
1900 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
1901 sizeof(tv), &tv);
1902 }
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001903 }
1904 }
1905
1906 if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
1907 has_timestamping = true;
1908 else
David Brazdil0f672f62019-12-10 10:32:29 +00001909 tss->ts[0] = (struct timespec64) {0};
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001910 }
1911
1912 if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
1913 if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
1914 has_timestamping = true;
1915 else
David Brazdil0f672f62019-12-10 10:32:29 +00001916 tss->ts[2] = (struct timespec64) {0};
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001917 }
1918
1919 if (has_timestamping) {
David Brazdil0f672f62019-12-10 10:32:29 +00001920 tss->ts[1] = (struct timespec64) {0};
1921 if (sock_flag(sk, SOCK_TSTAMP_NEW))
1922 put_cmsg_scm_timestamping64(msg, tss);
1923 else
1924 put_cmsg_scm_timestamping(msg, tss);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001925 }
1926}
1927
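/* Lockless estimate of the amount of readable data, used to fill the
 * TCP_CM_INQ control message when the TCP_INQ socket option is set.
 * If the unlocked reads raced with the protocol, redo the calculation
 * under the socket lock.
 */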
1928static int tcp_inq_hint(struct sock *sk)
1929{
1930 const struct tcp_sock *tp = tcp_sk(sk);
1931 u32 copied_seq = READ_ONCE(tp->copied_seq);
1932 u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
1933 int inq;
1934
1935 inq = rcv_nxt - copied_seq;
1936 if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
1937 lock_sock(sk);
1938 inq = tp->rcv_nxt - tp->copied_seq;
1939 release_sock(sk);
1940 }
David Brazdil0f672f62019-12-10 10:32:29 +00001941 /* After receiving a FIN, tell the user-space to continue reading
1942 * by returning a non-zero inq.
1943 */
1944 if (inq == 0 && sock_flag(sk, SOCK_DONE))
1945 inq = 1;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001946 return inq;
1947}
1948
1949/*
1950 * This routine copies from a sock struct into the user buffer.
1951 *
1952 * Technical note: in 2.3 we work on _locked_ socket, so that
1953 * tricks with *seq access order and skb->users are not required.
1954 * Probably, code can be easily improved even more.
1955 */
1956
1957int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
1958 int flags, int *addr_len)
1959{
1960 struct tcp_sock *tp = tcp_sk(sk);
1961 int copied = 0;
1962 u32 peek_seq;
1963 u32 *seq;
1964 unsigned long used;
1965 int err, inq;
1966 int target; /* Read at least this many bytes */
1967 long timeo;
1968 struct sk_buff *skb, *last;
1969 u32 urg_hole = 0;
David Brazdil0f672f62019-12-10 10:32:29 +00001970 struct scm_timestamping_internal tss;
Olivier Deprez0e641232021-09-23 10:07:05 +02001971 int cmsg_flags;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001972
1973 if (unlikely(flags & MSG_ERRQUEUE))
1974 return inet_recv_error(sk, msg, len, addr_len);
1975
David Brazdil0f672f62019-12-10 10:32:29 +00001976 if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) &&
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001977 (sk->sk_state == TCP_ESTABLISHED))
1978 sk_busy_loop(sk, nonblock);
1979
1980 lock_sock(sk);
1981
1982 err = -ENOTCONN;
1983 if (sk->sk_state == TCP_LISTEN)
1984 goto out;
1985
Olivier Deprez0e641232021-09-23 10:07:05 +02001986 cmsg_flags = tp->recvmsg_inq ? 1 : 0;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001987 timeo = sock_rcvtimeo(sk, nonblock);
1988
1989 /* Urgent data needs to be handled specially. */
1990 if (flags & MSG_OOB)
1991 goto recv_urg;
1992
1993 if (unlikely(tp->repair)) {
1994 err = -EPERM;
1995 if (!(flags & MSG_PEEK))
1996 goto out;
1997
1998 if (tp->repair_queue == TCP_SEND_QUEUE)
1999 goto recv_sndq;
2000
2001 err = -EINVAL;
2002 if (tp->repair_queue == TCP_NO_QUEUE)
2003 goto out;
2004
2005 /* 'common' recv queue MSG_PEEK-ing */
2006 }
2007
2008 seq = &tp->copied_seq;
2009 if (flags & MSG_PEEK) {
2010 peek_seq = tp->copied_seq;
2011 seq = &peek_seq;
2012 }
2013
2014 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
2015
2016 do {
2017 u32 offset;
2018
2019 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
2020 if (tp->urg_data && tp->urg_seq == *seq) {
2021 if (copied)
2022 break;
2023 if (signal_pending(current)) {
2024 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
2025 break;
2026 }
2027 }
2028
2029 /* Next get a buffer. */
2030
2031 last = skb_peek_tail(&sk->sk_receive_queue);
2032 skb_queue_walk(&sk->sk_receive_queue, skb) {
2033 last = skb;
2034 /* Now that we have two receive queues this
2035 * shouldn't happen.
2036 */
2037 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
2038 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
2039 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
2040 flags))
2041 break;
2042
2043 offset = *seq - TCP_SKB_CB(skb)->seq;
2044 if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
2045 pr_err_once("%s: found a SYN, please report !\n", __func__);
2046 offset--;
2047 }
2048 if (offset < skb->len)
2049 goto found_ok_skb;
2050 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2051 goto found_fin_ok;
2052 WARN(!(flags & MSG_PEEK),
2053 "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
2054 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
2055 }
2056
2057		/* Well, if we have backlog, try to process it now. */
2058
Olivier Deprez0e641232021-09-23 10:07:05 +02002059 if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002060 break;
2061
2062 if (copied) {
2063 if (sk->sk_err ||
2064 sk->sk_state == TCP_CLOSE ||
2065 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2066 !timeo ||
2067 signal_pending(current))
2068 break;
2069 } else {
2070 if (sock_flag(sk, SOCK_DONE))
2071 break;
2072
2073 if (sk->sk_err) {
2074 copied = sock_error(sk);
2075 break;
2076 }
2077
2078 if (sk->sk_shutdown & RCV_SHUTDOWN)
2079 break;
2080
2081 if (sk->sk_state == TCP_CLOSE) {
2082				/* This occurs when the user tries to read
2083				 * from a never connected socket.
2084 */
2085 copied = -ENOTCONN;
2086 break;
2087 }
2088
2089 if (!timeo) {
2090 copied = -EAGAIN;
2091 break;
2092 }
2093
2094 if (signal_pending(current)) {
2095 copied = sock_intr_errno(timeo);
2096 break;
2097 }
2098 }
2099
2100 tcp_cleanup_rbuf(sk, copied);
2101
2102 if (copied >= target) {
2103 /* Do not sleep, just process backlog. */
2104 release_sock(sk);
2105 lock_sock(sk);
2106 } else {
2107 sk_wait_data(sk, &timeo, last);
2108 }
2109
2110 if ((flags & MSG_PEEK) &&
2111 (peek_seq - copied - urg_hole != tp->copied_seq)) {
2112 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
2113 current->comm,
2114 task_pid_nr(current));
2115 peek_seq = tp->copied_seq;
2116 }
2117 continue;
2118
David Brazdil0f672f62019-12-10 10:32:29 +00002119found_ok_skb:
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002120 /* Ok so how much can we use? */
2121 used = skb->len - offset;
2122 if (len < used)
2123 used = len;
2124
2125 /* Do we have urgent data here? */
2126 if (tp->urg_data) {
2127 u32 urg_offset = tp->urg_seq - *seq;
2128 if (urg_offset < used) {
2129 if (!urg_offset) {
2130 if (!sock_flag(sk, SOCK_URGINLINE)) {
David Brazdil0f672f62019-12-10 10:32:29 +00002131 WRITE_ONCE(*seq, *seq + 1);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002132 urg_hole++;
2133 offset++;
2134 used--;
2135 if (!used)
2136 goto skip_copy;
2137 }
2138 } else
2139 used = urg_offset;
2140 }
2141 }
2142
2143 if (!(flags & MSG_TRUNC)) {
2144 err = skb_copy_datagram_msg(skb, offset, msg, used);
2145 if (err) {
2146 /* Exception. Bailout! */
2147 if (!copied)
2148 copied = -EFAULT;
2149 break;
2150 }
2151 }
2152
David Brazdil0f672f62019-12-10 10:32:29 +00002153 WRITE_ONCE(*seq, *seq + used);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002154 copied += used;
2155 len -= used;
2156
2157 tcp_rcv_space_adjust(sk);
2158
2159skip_copy:
2160 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
2161 tp->urg_data = 0;
2162 tcp_fast_path_check(sk);
2163 }
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002164
2165 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2166 tcp_update_recv_tstamps(skb, &tss);
Olivier Deprez0e641232021-09-23 10:07:05 +02002167 cmsg_flags |= 2;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002168 }
Olivier Deprez0e641232021-09-23 10:07:05 +02002169
2170 if (used + offset < skb->len)
2171 continue;
2172
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002173 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2174 goto found_fin_ok;
2175 if (!(flags & MSG_PEEK))
2176 sk_eat_skb(sk, skb);
2177 continue;
2178
David Brazdil0f672f62019-12-10 10:32:29 +00002179found_fin_ok:
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002180 /* Process the FIN. */
David Brazdil0f672f62019-12-10 10:32:29 +00002181 WRITE_ONCE(*seq, *seq + 1);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002182 if (!(flags & MSG_PEEK))
2183 sk_eat_skb(sk, skb);
2184 break;
2185 } while (len > 0);
2186
2187 /* According to UNIX98, msg_name/msg_namelen are ignored
2188	 * on a connected socket. I was just happy when I found this 8) --ANK
2189 */
2190
2191 /* Clean up data we have read: This will do ACK frames. */
2192 tcp_cleanup_rbuf(sk, copied);
2193
2194 release_sock(sk);
2195
Olivier Deprez0e641232021-09-23 10:07:05 +02002196 if (cmsg_flags) {
2197 if (cmsg_flags & 2)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002198 tcp_recv_timestamp(msg, sk, &tss);
Olivier Deprez0e641232021-09-23 10:07:05 +02002199 if (cmsg_flags & 1) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002200 inq = tcp_inq_hint(sk);
2201 put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
2202 }
2203 }
2204
2205 return copied;
2206
2207out:
2208 release_sock(sk);
2209 return err;
2210
2211recv_urg:
2212 err = tcp_recv_urg(sk, msg, len, flags);
2213 goto out;
2214
2215recv_sndq:
2216 err = tcp_peek_sndq(sk, msg, len);
2217 goto out;
2218}
2219EXPORT_SYMBOL(tcp_recvmsg);
2220
2221void tcp_set_state(struct sock *sk, int state)
2222{
2223 int oldstate = sk->sk_state;
2224
2225 /* We defined a new enum for TCP states that are exported in BPF
2226	 * so as not to force the internal TCP states to be frozen. The
2227 * following checks will detect if an internal state value ever
2228 * differs from the BPF value. If this ever happens, then we will
2229 * need to remap the internal value to the BPF value before calling
2230 * tcp_call_bpf_2arg.
2231 */
2232 BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
2233 BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
2234 BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
2235 BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
2236 BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
2237 BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
2238 BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
2239 BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
2240 BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
2241 BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
2242 BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
2243 BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
2244 BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
2245
2246 if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
2247 tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
2248
2249 switch (state) {
2250 case TCP_ESTABLISHED:
2251 if (oldstate != TCP_ESTABLISHED)
2252 TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2253 break;
2254
2255 case TCP_CLOSE:
2256 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
2257 TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
2258
2259 sk->sk_prot->unhash(sk);
2260 if (inet_csk(sk)->icsk_bind_hash &&
2261 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
2262 inet_put_port(sk);
2263 /* fall through */
2264 default:
2265 if (oldstate == TCP_ESTABLISHED)
2266 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2267 }
2268
2269 /* Change state AFTER socket is unhashed to avoid closed
2270 * socket sitting in hash tables.
2271 */
2272 inet_sk_state_store(sk, state);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002273}
2274EXPORT_SYMBOL_GPL(tcp_set_state);
2275
2276/*
2277 * State processing on a close. This implements the state shift for
2278 * sending our FIN frame. Note that we only send a FIN for some
2279 * states. A shutdown() may have already sent the FIN, or we may be
2280 * closed.
2281 */
2282
2283static const unsigned char new_state[16] = {
2284 /* current state: new state: action: */
2285 [0 /* (Invalid) */] = TCP_CLOSE,
2286 [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2287 [TCP_SYN_SENT] = TCP_CLOSE,
2288 [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2289 [TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
2290 [TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
2291 [TCP_TIME_WAIT] = TCP_CLOSE,
2292 [TCP_CLOSE] = TCP_CLOSE,
2293 [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
2294 [TCP_LAST_ACK] = TCP_LAST_ACK,
2295 [TCP_LISTEN] = TCP_CLOSE,
2296 [TCP_CLOSING] = TCP_CLOSING,
2297 [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */
2298};
2299
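/* Apply the close-time transition from new_state[] above and tell the
 * caller whether a FIN still needs to be sent (TCP_ACTION_FIN).
 */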
2300static int tcp_close_state(struct sock *sk)
2301{
2302 int next = (int)new_state[sk->sk_state];
2303 int ns = next & TCP_STATE_MASK;
2304
2305 tcp_set_state(sk, ns);
2306
2307 return next & TCP_ACTION_FIN;
2308}
2309
2310/*
2311 * Shutdown the sending side of a connection. Much like close except
2312 * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
2313 */
2314
2315void tcp_shutdown(struct sock *sk, int how)
2316{
2317 /* We need to grab some memory, and put together a FIN,
2318 * and then put it into the queue to be sent.
2319 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2320 */
2321 if (!(how & SEND_SHUTDOWN))
2322 return;
2323
2324 /* If we've already sent a FIN, or it's a closed state, skip this. */
2325 if ((1 << sk->sk_state) &
2326 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2327 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2328 /* Clear out any half completed packets. FIN if needed. */
2329 if (tcp_close_state(sk))
2330 tcp_send_fin(sk);
2331 }
2332}
2333EXPORT_SYMBOL(tcp_shutdown);
2334
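/* Decide whether an orphaned socket must be aborted instead of being
 * kept around: either there are already too many orphans or TCP is out
 * of socket memory. Used on the close path below.
 */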
2335bool tcp_check_oom(struct sock *sk, int shift)
2336{
2337 bool too_many_orphans, out_of_socket_memory;
2338
2339 too_many_orphans = tcp_too_many_orphans(sk, shift);
2340 out_of_socket_memory = tcp_out_of_memory(sk);
2341
2342 if (too_many_orphans)
2343 net_info_ratelimited("too many orphaned sockets\n");
2344 if (out_of_socket_memory)
2345 net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2346 return too_many_orphans || out_of_socket_memory;
2347}
2348
2349void tcp_close(struct sock *sk, long timeout)
2350{
2351 struct sk_buff *skb;
2352 int data_was_unread = 0;
2353 int state;
2354
2355 lock_sock(sk);
2356 sk->sk_shutdown = SHUTDOWN_MASK;
2357
2358 if (sk->sk_state == TCP_LISTEN) {
2359 tcp_set_state(sk, TCP_CLOSE);
2360
2361 /* Special case. */
2362 inet_csk_listen_stop(sk);
2363
2364 goto adjudge_to_death;
2365 }
2366
2367 /* We need to flush the recv. buffs. We do this only on the
2368 * descriptor close, not protocol-sourced closes, because the
2369 * reader process may not have drained the data yet!
2370 */
2371 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2372 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
2373
2374 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2375 len--;
2376 data_was_unread += len;
2377 __kfree_skb(skb);
2378 }
2379
2380 sk_mem_reclaim(sk);
2381
2382 /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
2383 if (sk->sk_state == TCP_CLOSE)
2384 goto adjudge_to_death;
2385
2386 /* As outlined in RFC 2525, section 2.17, we send a RST here because
2387 * data was lost. To witness the awful effects of the old behavior of
2388 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
2389 * GET in an FTP client, suspend the process, wait for the client to
2390 * advertise a zero window, then kill -9 the FTP client, wheee...
2391 * Note: timeout is always zero in such a case.
2392 */
2393 if (unlikely(tcp_sk(sk)->repair)) {
2394 sk->sk_prot->disconnect(sk, 0);
2395 } else if (data_was_unread) {
2396 /* Unread data was tossed, zap the connection. */
2397 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2398 tcp_set_state(sk, TCP_CLOSE);
2399 tcp_send_active_reset(sk, sk->sk_allocation);
2400 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2401 /* Check zero linger _after_ checking for unread data. */
2402 sk->sk_prot->disconnect(sk, 0);
2403 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2404 } else if (tcp_close_state(sk)) {
2405 /* We FIN if the application ate all the data before
2406 * zapping the connection.
2407 */
2408
2409 /* RED-PEN. Formally speaking, we have broken TCP state
2410 * machine. State transitions:
2411 *
2412 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
2413 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
2414 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
2415 *
2416 * are legal only when FIN has been sent (i.e. in window),
2417 * rather than queued out of window. Purists blame.
2418 *
2419 * F.e. "RFC state" is ESTABLISHED,
2420 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
2421 *
2422		 * The visible deviations are that sometimes
2423 * we enter time-wait state, when it is not required really
2424 * (harmless), do not send active resets, when they are
2425 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
2426 * they look as CLOSING or LAST_ACK for Linux)
2427 * Probably, I missed some more holelets.
2428 * --ANK
2429 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
2430 * in a single packet! (May consider it later but will
2431 * probably need API support or TCP_CORK SYN-ACK until
2432 * data is written and socket is closed.)
2433 */
2434 tcp_send_fin(sk);
2435 }
2436
2437 sk_stream_wait_close(sk, timeout);
2438
2439adjudge_to_death:
2440 state = sk->sk_state;
2441 sock_hold(sk);
2442 sock_orphan(sk);
2443
2444 local_bh_disable();
2445 bh_lock_sock(sk);
2446 /* remove backlog if any, without releasing ownership. */
2447 __release_sock(sk);
2448
2449 percpu_counter_inc(sk->sk_prot->orphan_count);
2450
2451 /* Have we already been destroyed by a softirq or backlog? */
2452 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2453 goto out;
2454
2455	/* This is a (useful) BSD violation of the RFC. There is a
2456 * problem with TCP as specified in that the other end could
2457	 * keep a socket open forever with no application left at this end.
2458 * We use a 1 minute timeout (about the same as BSD) then kill
2459 * our end. If they send after that then tough - BUT: long enough
2460 * that we won't make the old 4*rto = almost no time - whoops
2461 * reset mistake.
2462 *
2463 * Nope, it was not mistake. It is really desired behaviour
2464 * f.e. on http servers, when such sockets are useless, but
2465 * consume significant resources. Let's do it with special
2466 * linger2 option. --ANK
2467 */
2468
2469 if (sk->sk_state == TCP_FIN_WAIT2) {
2470 struct tcp_sock *tp = tcp_sk(sk);
2471 if (tp->linger2 < 0) {
2472 tcp_set_state(sk, TCP_CLOSE);
2473 tcp_send_active_reset(sk, GFP_ATOMIC);
2474 __NET_INC_STATS(sock_net(sk),
2475 LINUX_MIB_TCPABORTONLINGER);
2476 } else {
2477 const int tmo = tcp_fin_time(sk);
2478
2479 if (tmo > TCP_TIMEWAIT_LEN) {
2480 inet_csk_reset_keepalive_timer(sk,
2481 tmo - TCP_TIMEWAIT_LEN);
2482 } else {
2483 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2484 goto out;
2485 }
2486 }
2487 }
2488 if (sk->sk_state != TCP_CLOSE) {
2489 sk_mem_reclaim(sk);
2490 if (tcp_check_oom(sk, 0)) {
2491 tcp_set_state(sk, TCP_CLOSE);
2492 tcp_send_active_reset(sk, GFP_ATOMIC);
2493 __NET_INC_STATS(sock_net(sk),
2494 LINUX_MIB_TCPABORTONMEMORY);
2495 } else if (!check_net(sock_net(sk))) {
2496 /* Not possible to send reset; just close */
2497 tcp_set_state(sk, TCP_CLOSE);
2498 }
2499 }
2500
2501 if (sk->sk_state == TCP_CLOSE) {
David Brazdil0f672f62019-12-10 10:32:29 +00002502 struct request_sock *req;
2503
2504 req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
2505 lockdep_sock_is_held(sk));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002506 /* We could get here with a non-NULL req if the socket is
2507 * aborted (e.g., closed with unread data) before 3WHS
2508 * finishes.
2509 */
2510 if (req)
2511 reqsk_fastopen_remove(sk, req, false);
2512 inet_csk_destroy_sock(sk);
2513 }
2514 /* Otherwise, socket is reprieved until protocol close. */
2515
2516out:
2517 bh_unlock_sock(sk);
2518 local_bh_enable();
2519 release_sock(sk);
2520 sock_put(sk);
2521}
2522EXPORT_SYMBOL(tcp_close);
2523
2524/* These states need RST on ABORT according to RFC793 */
2525
2526static inline bool tcp_need_reset(int state)
2527{
2528 return (1 << state) &
2529 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2530 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2531}
2532
2533static void tcp_rtx_queue_purge(struct sock *sk)
2534{
2535 struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
2536
Olivier Deprez0e641232021-09-23 10:07:05 +02002537 tcp_sk(sk)->highest_sack = NULL;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002538 while (p) {
2539 struct sk_buff *skb = rb_to_skb(p);
2540
2541 p = rb_next(p);
2542 /* Since we are deleting whole queue, no need to
2543 * list_del(&skb->tcp_tsorted_anchor)
2544 */
2545 tcp_rtx_queue_unlink(skb, sk);
2546 sk_wmem_free_skb(sk, skb);
2547 }
2548}
2549
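/* Drop everything still queued for transmission: the write queue, the
 * retransmit rb-tree and the cached tx skb, then clear the related
 * retransmission hints and counters.
 */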
2550void tcp_write_queue_purge(struct sock *sk)
2551{
2552 struct sk_buff *skb;
2553
2554 tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
2555 while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
2556 tcp_skb_tsorted_anchor_cleanup(skb);
2557 sk_wmem_free_skb(sk, skb);
2558 }
2559 tcp_rtx_queue_purge(sk);
David Brazdil0f672f62019-12-10 10:32:29 +00002560 skb = sk->sk_tx_skb_cache;
2561 if (skb) {
2562 __kfree_skb(skb);
2563 sk->sk_tx_skb_cache = NULL;
2564 }
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002565 INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
2566 sk_mem_reclaim(sk);
2567 tcp_clear_all_retrans_hints(tcp_sk(sk));
2568 tcp_sk(sk)->packets_out = 0;
David Brazdil0f672f62019-12-10 10:32:29 +00002569 inet_csk(sk)->icsk_backoff = 0;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002570}
2571
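/* Abort whatever connection the socket currently has (sending a RST
 * when the old state calls for one) and reset essentially all
 * per-connection state so the socket can be reused, e.g. after a
 * connect() with AF_UNSPEC or by the TCP repair machinery.
 */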
2572int tcp_disconnect(struct sock *sk, int flags)
2573{
2574 struct inet_sock *inet = inet_sk(sk);
2575 struct inet_connection_sock *icsk = inet_csk(sk);
2576 struct tcp_sock *tp = tcp_sk(sk);
2577 int old_state = sk->sk_state;
David Brazdil0f672f62019-12-10 10:32:29 +00002578 u32 seq;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002579
2580 if (old_state != TCP_CLOSE)
2581 tcp_set_state(sk, TCP_CLOSE);
2582
2583 /* ABORT function of RFC793 */
2584 if (old_state == TCP_LISTEN) {
2585 inet_csk_listen_stop(sk);
2586 } else if (unlikely(tp->repair)) {
2587 sk->sk_err = ECONNABORTED;
2588 } else if (tcp_need_reset(old_state) ||
2589 (tp->snd_nxt != tp->write_seq &&
2590 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2591 /* The last check adjusts for discrepancy of Linux wrt. RFC
2592 * states
2593 */
2594 tcp_send_active_reset(sk, gfp_any());
2595 sk->sk_err = ECONNRESET;
2596 } else if (old_state == TCP_SYN_SENT)
2597 sk->sk_err = ECONNRESET;
2598
2599 tcp_clear_xmit_timers(sk);
2600 __skb_queue_purge(&sk->sk_receive_queue);
David Brazdil0f672f62019-12-10 10:32:29 +00002601 if (sk->sk_rx_skb_cache) {
2602 __kfree_skb(sk->sk_rx_skb_cache);
2603 sk->sk_rx_skb_cache = NULL;
2604 }
2605 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002606 tp->urg_data = 0;
2607 tcp_write_queue_purge(sk);
2608 tcp_fastopen_active_disable_ofo_check(sk);
2609 skb_rbtree_purge(&tp->out_of_order_queue);
2610
2611 inet->inet_dport = 0;
2612
2613 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2614 inet_reset_saddr(sk);
2615
2616 sk->sk_shutdown = 0;
2617 sock_reset_flag(sk, SOCK_DONE);
2618 tp->srtt_us = 0;
David Brazdil0f672f62019-12-10 10:32:29 +00002619 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002620 tp->rcv_rtt_last_tsecr = 0;
David Brazdil0f672f62019-12-10 10:32:29 +00002621
2622 seq = tp->write_seq + tp->max_window + 2;
2623 if (!seq)
2624 seq = 1;
2625 WRITE_ONCE(tp->write_seq, seq);
2626
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002627 icsk->icsk_backoff = 0;
2628 tp->snd_cwnd = 2;
2629 icsk->icsk_probes_out = 0;
Olivier Deprez0e641232021-09-23 10:07:05 +02002630 icsk->icsk_probes_tstamp = 0;
David Brazdil0f672f62019-12-10 10:32:29 +00002631 icsk->icsk_rto = TCP_TIMEOUT_INIT;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002632 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
David Brazdil0f672f62019-12-10 10:32:29 +00002633 tp->snd_cwnd = TCP_INIT_CWND;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002634 tp->snd_cwnd_cnt = 0;
2635 tp->window_clamp = 0;
Olivier Deprez0e641232021-09-23 10:07:05 +02002636 tp->delivered = 0;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002637 tp->delivered_ce = 0;
Olivier Deprez0e641232021-09-23 10:07:05 +02002638 if (icsk->icsk_ca_ops->release)
2639 icsk->icsk_ca_ops->release(sk);
2640 memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002641 tcp_set_ca_state(sk, TCP_CA_Open);
2642 tp->is_sack_reneg = 0;
2643 tcp_clear_retrans(tp);
Olivier Deprez0e641232021-09-23 10:07:05 +02002644 tp->total_retrans = 0;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002645 inet_csk_delack_init(sk);
2646 /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
2647 * issue in __tcp_select_window()
2648 */
2649 icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
2650 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2651 __sk_dst_reset(sk);
2652 dst_release(sk->sk_rx_dst);
2653 sk->sk_rx_dst = NULL;
2654 tcp_saved_syn_free(tp);
2655 tp->compressed_ack = 0;
Olivier Deprez0e641232021-09-23 10:07:05 +02002656 tp->segs_in = 0;
2657 tp->segs_out = 0;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002658 tp->bytes_sent = 0;
David Brazdil0f672f62019-12-10 10:32:29 +00002659 tp->bytes_acked = 0;
2660 tp->bytes_received = 0;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002661 tp->bytes_retrans = 0;
Olivier Deprez0e641232021-09-23 10:07:05 +02002662 tp->data_segs_in = 0;
2663 tp->data_segs_out = 0;
David Brazdil0f672f62019-12-10 10:32:29 +00002664 tp->duplicate_sack[0].start_seq = 0;
2665 tp->duplicate_sack[0].end_seq = 0;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002666 tp->dsack_dups = 0;
2667 tp->reord_seen = 0;
David Brazdil0f672f62019-12-10 10:32:29 +00002668 tp->retrans_out = 0;
2669 tp->sacked_out = 0;
2670 tp->tlp_high_seq = 0;
2671 tp->last_oow_ack_time = 0;
2672 /* There's a bubble in the pipe until at least the first ACK. */
2673 tp->app_limited = ~0U;
2674 tp->rack.mstamp = 0;
2675 tp->rack.advanced = 0;
2676 tp->rack.reo_wnd_steps = 1;
2677 tp->rack.last_delivered = 0;
2678 tp->rack.reo_wnd_persist = 0;
2679 tp->rack.dsack_seen = 0;
2680 tp->syn_data_acked = 0;
2681 tp->rx_opt.saw_tstamp = 0;
2682 tp->rx_opt.dsack = 0;
2683 tp->rx_opt.num_sacks = 0;
2684 tp->rcv_ooopack = 0;
2685
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002686
2687 /* Clean up fastopen related fields */
2688 tcp_free_fastopen_req(tp);
2689 inet->defer_connect = 0;
2690
2691 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
2692
2693 if (sk->sk_frag.page) {
2694 put_page(sk->sk_frag.page);
2695 sk->sk_frag.page = NULL;
2696 sk->sk_frag.offset = 0;
2697 }
2698
2699 sk->sk_error_report(sk);
2700 return 0;
2701}
2702EXPORT_SYMBOL(tcp_disconnect);
2703
2704static inline bool tcp_can_repair_sock(const struct sock *sk)
2705{
2706 return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
2707 (sk->sk_state != TCP_LISTEN);
2708}
2709
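/* TCP_REPAIR_WINDOW helper: validate the window state supplied by a
 * checkpoint/restore tool and install it on the socket being repaired.
 */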
2710static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
2711{
2712 struct tcp_repair_window opt;
2713
2714 if (!tp->repair)
2715 return -EPERM;
2716
2717 if (len != sizeof(opt))
2718 return -EINVAL;
2719
2720 if (copy_from_user(&opt, optbuf, sizeof(opt)))
2721 return -EFAULT;
2722
2723 if (opt.max_window < opt.snd_wnd)
2724 return -EINVAL;
2725
2726 if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
2727 return -EINVAL;
2728
2729 if (after(opt.rcv_wup, tp->rcv_nxt))
2730 return -EINVAL;
2731
2732 tp->snd_wl1 = opt.snd_wl1;
2733 tp->snd_wnd = opt.snd_wnd;
2734 tp->max_window = opt.max_window;
2735
2736 tp->rcv_wnd = opt.rcv_wnd;
2737 tp->rcv_wup = opt.rcv_wup;
2738
2739 return 0;
2740}
2741
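/* Replay TCP options (MSS clamp, window scaling, SACK permitted,
 * timestamps) provided by userspace onto a socket that is being
 * restored in the ESTABLISHED state under TCP_REPAIR.
 */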
2742static int tcp_repair_options_est(struct sock *sk,
2743 struct tcp_repair_opt __user *optbuf, unsigned int len)
2744{
2745 struct tcp_sock *tp = tcp_sk(sk);
2746 struct tcp_repair_opt opt;
2747
2748 while (len >= sizeof(opt)) {
2749 if (copy_from_user(&opt, optbuf, sizeof(opt)))
2750 return -EFAULT;
2751
2752 optbuf++;
2753 len -= sizeof(opt);
2754
2755 switch (opt.opt_code) {
2756 case TCPOPT_MSS:
2757 tp->rx_opt.mss_clamp = opt.opt_val;
2758 tcp_mtup_init(sk);
2759 break;
2760 case TCPOPT_WINDOW:
2761 {
2762 u16 snd_wscale = opt.opt_val & 0xFFFF;
2763 u16 rcv_wscale = opt.opt_val >> 16;
2764
2765 if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
2766 return -EFBIG;
2767
2768 tp->rx_opt.snd_wscale = snd_wscale;
2769 tp->rx_opt.rcv_wscale = rcv_wscale;
2770 tp->rx_opt.wscale_ok = 1;
2771 }
2772 break;
2773 case TCPOPT_SACK_PERM:
2774 if (opt.opt_val != 0)
2775 return -EINVAL;
2776
2777 tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
2778 break;
2779 case TCPOPT_TIMESTAMP:
2780 if (opt.opt_val != 0)
2781 return -EINVAL;
2782
2783 tp->rx_opt.tstamp_ok = 1;
2784 break;
2785 }
2786 }
2787
2788 return 0;
2789}
2790
David Brazdil0f672f62019-12-10 10:32:29 +00002791DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
2792EXPORT_SYMBOL(tcp_tx_delay_enabled);
2793
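/* Enable the tcp_tx_delay_enabled static key the first time any socket
 * sets TCP_TX_DELAY; the cmpxchg() guarantees the branch is flipped and
 * the message printed exactly once.
 */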
2794static void tcp_enable_tx_delay(void)
2795{
2796 if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
2797 static int __tcp_tx_delay_enabled = 0;
2798
2799 if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
2800 static_branch_enable(&tcp_tx_delay_enabled);
2801 pr_info("TCP_TX_DELAY enabled\n");
2802 }
2803 }
2804}
2805
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002806/*
2807 * Socket option code for TCP.
2808 */
2809static int do_tcp_setsockopt(struct sock *sk, int level,
2810 int optname, char __user *optval, unsigned int optlen)
2811{
2812 struct tcp_sock *tp = tcp_sk(sk);
2813 struct inet_connection_sock *icsk = inet_csk(sk);
2814 struct net *net = sock_net(sk);
2815 int val;
2816 int err = 0;
2817
2818 /* These are data/string values, all the others are ints */
2819 switch (optname) {
2820 case TCP_CONGESTION: {
2821 char name[TCP_CA_NAME_MAX];
2822
2823 if (optlen < 1)
2824 return -EINVAL;
2825
2826 val = strncpy_from_user(name, optval,
2827 min_t(long, TCP_CA_NAME_MAX-1, optlen));
2828 if (val < 0)
2829 return -EFAULT;
2830 name[val] = 0;
2831
2832 lock_sock(sk);
David Brazdil0f672f62019-12-10 10:32:29 +00002833 err = tcp_set_congestion_control(sk, name, true, true,
2834 ns_capable(sock_net(sk)->user_ns,
2835 CAP_NET_ADMIN));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002836 release_sock(sk);
2837 return err;
2838 }
2839 case TCP_ULP: {
2840 char name[TCP_ULP_NAME_MAX];
2841
2842 if (optlen < 1)
2843 return -EINVAL;
2844
2845 val = strncpy_from_user(name, optval,
2846 min_t(long, TCP_ULP_NAME_MAX - 1,
2847 optlen));
2848 if (val < 0)
2849 return -EFAULT;
2850 name[val] = 0;
2851
2852 lock_sock(sk);
2853 err = tcp_set_ulp(sk, name);
2854 release_sock(sk);
2855 return err;
2856 }
2857 case TCP_FASTOPEN_KEY: {
David Brazdil0f672f62019-12-10 10:32:29 +00002858 __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
2859 __u8 *backup_key = NULL;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002860
David Brazdil0f672f62019-12-10 10:32:29 +00002861 /* Allow a backup key as well to facilitate key rotation
2862 * First key is the active one.
2863 */
2864 if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
2865 optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002866 return -EINVAL;
2867
2868 if (copy_from_user(key, optval, optlen))
2869 return -EFAULT;
2870
David Brazdil0f672f62019-12-10 10:32:29 +00002871 if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
2872 backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
2873
2874 return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002875 }
2876 default:
2877 /* fallthru */
2878 break;
2879 }
2880
2881 if (optlen < sizeof(int))
2882 return -EINVAL;
2883
2884 if (get_user(val, (int __user *)optval))
2885 return -EFAULT;
2886
2887 lock_sock(sk);
2888
2889 switch (optname) {
2890 case TCP_MAXSEG:
2891 /* Values greater than interface MTU won't take effect. However
2892 * at the point when this call is done we typically don't yet
2893 * know which interface is going to be used
2894 */
2895 if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
2896 err = -EINVAL;
2897 break;
2898 }
2899 tp->rx_opt.user_mss = val;
2900 break;
2901
2902 case TCP_NODELAY:
2903 if (val) {
2904 /* TCP_NODELAY is weaker than TCP_CORK, so that
2905 * this option on corked socket is remembered, but
2906 * it is not activated until cork is cleared.
2907 *
2908 * However, when TCP_NODELAY is set we make
2909 * an explicit push, which overrides even TCP_CORK
2910 * for currently queued segments.
2911 */
2912 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2913 tcp_push_pending_frames(sk);
2914 } else {
2915 tp->nonagle &= ~TCP_NAGLE_OFF;
2916 }
2917 break;
2918
2919 case TCP_THIN_LINEAR_TIMEOUTS:
2920 if (val < 0 || val > 1)
2921 err = -EINVAL;
2922 else
2923 tp->thin_lto = val;
2924 break;
2925
2926 case TCP_THIN_DUPACK:
2927 if (val < 0 || val > 1)
2928 err = -EINVAL;
2929 break;
2930
2931 case TCP_REPAIR:
2932 if (!tcp_can_repair_sock(sk))
2933 err = -EPERM;
2934 else if (val == TCP_REPAIR_ON) {
2935 tp->repair = 1;
2936 sk->sk_reuse = SK_FORCE_REUSE;
2937 tp->repair_queue = TCP_NO_QUEUE;
2938 } else if (val == TCP_REPAIR_OFF) {
2939 tp->repair = 0;
2940 sk->sk_reuse = SK_NO_REUSE;
2941 tcp_send_window_probe(sk);
2942 } else if (val == TCP_REPAIR_OFF_NO_WP) {
2943 tp->repair = 0;
2944 sk->sk_reuse = SK_NO_REUSE;
2945 } else
2946 err = -EINVAL;
2947
2948 break;
2949
2950 case TCP_REPAIR_QUEUE:
2951 if (!tp->repair)
2952 err = -EPERM;
2953 else if ((unsigned int)val < TCP_QUEUES_NR)
2954 tp->repair_queue = val;
2955 else
2956 err = -EINVAL;
2957 break;
2958
2959 case TCP_QUEUE_SEQ:
Olivier Deprez0e641232021-09-23 10:07:05 +02002960 if (sk->sk_state != TCP_CLOSE) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002961 err = -EPERM;
Olivier Deprez0e641232021-09-23 10:07:05 +02002962 } else if (tp->repair_queue == TCP_SEND_QUEUE) {
2963 if (!tcp_rtx_queue_empty(sk))
2964 err = -EPERM;
2965 else
2966 WRITE_ONCE(tp->write_seq, val);
2967 } else if (tp->repair_queue == TCP_RECV_QUEUE) {
2968 if (tp->rcv_nxt != tp->copied_seq) {
2969 err = -EPERM;
2970 } else {
2971 WRITE_ONCE(tp->rcv_nxt, val);
2972 WRITE_ONCE(tp->copied_seq, val);
2973 }
2974 } else {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002975 err = -EINVAL;
Olivier Deprez0e641232021-09-23 10:07:05 +02002976 }
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002977 break;
2978
2979 case TCP_REPAIR_OPTIONS:
2980 if (!tp->repair)
2981 err = -EINVAL;
2982 else if (sk->sk_state == TCP_ESTABLISHED)
2983 err = tcp_repair_options_est(sk,
2984 (struct tcp_repair_opt __user *)optval,
2985 optlen);
2986 else
2987 err = -EPERM;
2988 break;
2989
2990 case TCP_CORK:
2991 /* When set indicates to always queue non-full frames.
2992 * Later the user clears this option and we transmit
2993 * any pending partial frames in the queue. This is
2994 * meant to be used alongside sendfile() to get properly
2995 * filled frames when the user (for example) must write
2996 * out headers with a write() call first and then use
2997 * sendfile to send out the data parts.
2998 *
2999 * TCP_CORK can be set together with TCP_NODELAY and it is
3000 * stronger than TCP_NODELAY.
3001 */
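		/* Illustrative userspace sequence this is designed for
		 * (fd, hdr and file_fd are placeholder names):
		 *
		 *	int on = 1, off = 0;
		 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
		 *	write(fd, hdr, hdr_len);
		 *	sendfile(fd, file_fd, NULL, body_len);
		 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
		 */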
3002 if (val) {
3003 tp->nonagle |= TCP_NAGLE_CORK;
3004 } else {
3005 tp->nonagle &= ~TCP_NAGLE_CORK;
3006 if (tp->nonagle&TCP_NAGLE_OFF)
3007 tp->nonagle |= TCP_NAGLE_PUSH;
3008 tcp_push_pending_frames(sk);
3009 }
3010 break;
3011
3012 case TCP_KEEPIDLE:
3013 if (val < 1 || val > MAX_TCP_KEEPIDLE)
3014 err = -EINVAL;
3015 else {
3016 tp->keepalive_time = val * HZ;
3017 if (sock_flag(sk, SOCK_KEEPOPEN) &&
3018 !((1 << sk->sk_state) &
3019 (TCPF_CLOSE | TCPF_LISTEN))) {
3020 u32 elapsed = keepalive_time_elapsed(tp);
3021 if (tp->keepalive_time > elapsed)
3022 elapsed = tp->keepalive_time - elapsed;
3023 else
3024 elapsed = 0;
3025 inet_csk_reset_keepalive_timer(sk, elapsed);
3026 }
3027 }
3028 break;
3029 case TCP_KEEPINTVL:
3030 if (val < 1 || val > MAX_TCP_KEEPINTVL)
3031 err = -EINVAL;
3032 else
3033 tp->keepalive_intvl = val * HZ;
3034 break;
3035 case TCP_KEEPCNT:
3036 if (val < 1 || val > MAX_TCP_KEEPCNT)
3037 err = -EINVAL;
3038 else
3039 tp->keepalive_probes = val;
3040 break;
3041 case TCP_SYNCNT:
3042 if (val < 1 || val > MAX_TCP_SYNCNT)
3043 err = -EINVAL;
3044 else
3045 icsk->icsk_syn_retries = val;
3046 break;
3047
3048 case TCP_SAVE_SYN:
3049 if (val < 0 || val > 1)
3050 err = -EINVAL;
3051 else
3052 tp->save_syn = val;
3053 break;
3054
3055 case TCP_LINGER2:
3056 if (val < 0)
3057 tp->linger2 = -1;
3058 else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)
3059 tp->linger2 = 0;
3060 else
3061 tp->linger2 = val * HZ;
3062 break;
3063
3064 case TCP_DEFER_ACCEPT:
3065 /* Translate value in seconds to number of retransmits */
3066 icsk->icsk_accept_queue.rskq_defer_accept =
3067 secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
3068 TCP_RTO_MAX / HZ);
3069 break;
3070
3071 case TCP_WINDOW_CLAMP:
3072 if (!val) {
3073 if (sk->sk_state != TCP_CLOSE) {
3074 err = -EINVAL;
3075 break;
3076 }
3077 tp->window_clamp = 0;
3078 } else
3079 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
3080 SOCK_MIN_RCVBUF / 2 : val;
3081 break;
3082
3083 case TCP_QUICKACK:
3084 if (!val) {
David Brazdil0f672f62019-12-10 10:32:29 +00003085 inet_csk_enter_pingpong_mode(sk);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003086 } else {
David Brazdil0f672f62019-12-10 10:32:29 +00003087 inet_csk_exit_pingpong_mode(sk);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003088 if ((1 << sk->sk_state) &
3089 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
3090 inet_csk_ack_scheduled(sk)) {
3091 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
3092 tcp_cleanup_rbuf(sk, 1);
3093 if (!(val & 1))
David Brazdil0f672f62019-12-10 10:32:29 +00003094 inet_csk_enter_pingpong_mode(sk);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003095 }
3096 }
3097 break;
3098
3099#ifdef CONFIG_TCP_MD5SIG
3100 case TCP_MD5SIG:
3101 case TCP_MD5SIG_EXT:
Olivier Deprez0e641232021-09-23 10:07:05 +02003102 err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003103 break;
3104#endif
3105 case TCP_USER_TIMEOUT:
3106 /* Cap the max time in ms TCP will retry or probe the window
3107 * before giving up and aborting (ETIMEDOUT) a connection.
3108 */
3109 if (val < 0)
3110 err = -EINVAL;
3111 else
3112 icsk->icsk_user_timeout = val;
3113 break;
3114
3115 case TCP_FASTOPEN:
3116 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
3117 TCPF_LISTEN))) {
3118 tcp_fastopen_init_key_once(net);
3119
3120 fastopen_queue_tune(sk, val);
3121 } else {
3122 err = -EINVAL;
3123 }
3124 break;
3125 case TCP_FASTOPEN_CONNECT:
3126 if (val > 1 || val < 0) {
3127 err = -EINVAL;
3128 } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
3129 if (sk->sk_state == TCP_CLOSE)
3130 tp->fastopen_connect = val;
3131 else
3132 err = -EINVAL;
3133 } else {
3134 err = -EOPNOTSUPP;
3135 }
3136 break;
3137 case TCP_FASTOPEN_NO_COOKIE:
3138 if (val > 1 || val < 0)
3139 err = -EINVAL;
3140 else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3141 err = -EINVAL;
3142 else
3143 tp->fastopen_no_cookie = val;
3144 break;
3145 case TCP_TIMESTAMP:
3146 if (!tp->repair)
3147 err = -EPERM;
3148 else
3149 tp->tsoffset = val - tcp_time_stamp_raw();
3150 break;
3151 case TCP_REPAIR_WINDOW:
3152 err = tcp_repair_set_window(tp, optval, optlen);
3153 break;
3154 case TCP_NOTSENT_LOWAT:
3155 tp->notsent_lowat = val;
3156 sk->sk_write_space(sk);
3157 break;
3158 case TCP_INQ:
3159 if (val > 1 || val < 0)
3160 err = -EINVAL;
3161 else
3162 tp->recvmsg_inq = val;
3163 break;
David Brazdil0f672f62019-12-10 10:32:29 +00003164 case TCP_TX_DELAY:
3165 if (val)
3166 tcp_enable_tx_delay();
3167 tp->tcp_tx_delay = val;
3168 break;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003169 default:
3170 err = -ENOPROTOOPT;
3171 break;
3172 }
3173
3174 release_sock(sk);
3175 return err;
3176}
3177
3178int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
3179 unsigned int optlen)
3180{
3181 const struct inet_connection_sock *icsk = inet_csk(sk);
3182
3183 if (level != SOL_TCP)
3184 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
3185 optval, optlen);
3186 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3187}
3188EXPORT_SYMBOL(tcp_setsockopt);
3189
3190#ifdef CONFIG_COMPAT
3191int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
3192 char __user *optval, unsigned int optlen)
3193{
3194 if (level != SOL_TCP)
3195 return inet_csk_compat_setsockopt(sk, level, optname,
3196 optval, optlen);
3197 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3198}
3199EXPORT_SYMBOL(compat_tcp_setsockopt);
3200#endif
3201
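/* Convert the jiffies-based chronograph counters into microseconds and
 * fill tcpi_busy_time, tcpi_rwnd_limited and tcpi_sndbuf_limited.
 */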
3202static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
3203 struct tcp_info *info)
3204{
3205 u64 stats[__TCP_CHRONO_MAX], total = 0;
3206 enum tcp_chrono i;
3207
3208 for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
3209 stats[i] = tp->chrono_stat[i - 1];
3210 if (i == tp->chrono_type)
3211 stats[i] += tcp_jiffies32 - tp->chrono_start;
3212 stats[i] *= USEC_PER_SEC / HZ;
3213 total += stats[i];
3214 }
3215
3216 info->tcpi_busy_time = total;
3217 info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
3218 info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
3219}
3220
3221/* Return information about state of tcp endpoint in API format. */
3222void tcp_get_info(struct sock *sk, struct tcp_info *info)
3223{
3224 const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
3225 const struct inet_connection_sock *icsk = inet_csk(sk);
David Brazdil0f672f62019-12-10 10:32:29 +00003226 unsigned long rate;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003227 u32 now;
3228 u64 rate64;
3229 bool slow;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003230
3231 memset(info, 0, sizeof(*info));
3232 if (sk->sk_type != SOCK_STREAM)
3233 return;
3234
3235 info->tcpi_state = inet_sk_state_load(sk);
3236
3237 /* Report meaningful fields for all TCP states, including listeners */
3238 rate = READ_ONCE(sk->sk_pacing_rate);
David Brazdil0f672f62019-12-10 10:32:29 +00003239 rate64 = (rate != ~0UL) ? rate : ~0ULL;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003240 info->tcpi_pacing_rate = rate64;
3241
3242 rate = READ_ONCE(sk->sk_max_pacing_rate);
David Brazdil0f672f62019-12-10 10:32:29 +00003243 rate64 = (rate != ~0UL) ? rate : ~0ULL;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003244 info->tcpi_max_pacing_rate = rate64;
3245
3246 info->tcpi_reordering = tp->reordering;
3247 info->tcpi_snd_cwnd = tp->snd_cwnd;
3248
3249 if (info->tcpi_state == TCP_LISTEN) {
3250		/* listeners aliased fields:
3251 * tcpi_unacked -> Number of children ready for accept()
3252 * tcpi_sacked -> max backlog
3253 */
3254 info->tcpi_unacked = sk->sk_ack_backlog;
3255 info->tcpi_sacked = sk->sk_max_ack_backlog;
3256 return;
3257 }
3258
3259 slow = lock_sock_fast(sk);
3260
3261 info->tcpi_ca_state = icsk->icsk_ca_state;
3262 info->tcpi_retransmits = icsk->icsk_retransmits;
3263 info->tcpi_probes = icsk->icsk_probes_out;
3264 info->tcpi_backoff = icsk->icsk_backoff;
3265
3266 if (tp->rx_opt.tstamp_ok)
3267 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
3268 if (tcp_is_sack(tp))
3269 info->tcpi_options |= TCPI_OPT_SACK;
3270 if (tp->rx_opt.wscale_ok) {
3271 info->tcpi_options |= TCPI_OPT_WSCALE;
3272 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
3273 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
3274 }
3275
3276 if (tp->ecn_flags & TCP_ECN_OK)
3277 info->tcpi_options |= TCPI_OPT_ECN;
3278 if (tp->ecn_flags & TCP_ECN_SEEN)
3279 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
3280 if (tp->syn_data_acked)
3281 info->tcpi_options |= TCPI_OPT_SYN_DATA;
3282
3283 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
3284 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
3285 info->tcpi_snd_mss = tp->mss_cache;
3286 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
3287
3288 info->tcpi_unacked = tp->packets_out;
3289 info->tcpi_sacked = tp->sacked_out;
3290
3291 info->tcpi_lost = tp->lost_out;
3292 info->tcpi_retrans = tp->retrans_out;
3293
3294 now = tcp_jiffies32;
3295 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
3296 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
3297 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
3298
3299 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
3300 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
3301 info->tcpi_rtt = tp->srtt_us >> 3;
3302 info->tcpi_rttvar = tp->mdev_us >> 2;
3303 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
3304 info->tcpi_advmss = tp->advmss;
3305
3306 info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
3307 info->tcpi_rcv_space = tp->rcvq_space.space;
3308
3309 info->tcpi_total_retrans = tp->total_retrans;
3310
3311 info->tcpi_bytes_acked = tp->bytes_acked;
3312 info->tcpi_bytes_received = tp->bytes_received;
3313 info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
3314 tcp_get_info_chrono_stats(tp, info);
3315
3316 info->tcpi_segs_out = tp->segs_out;
3317 info->tcpi_segs_in = tp->segs_in;
3318
3319 info->tcpi_min_rtt = tcp_min_rtt(tp);
3320 info->tcpi_data_segs_in = tp->data_segs_in;
3321 info->tcpi_data_segs_out = tp->data_segs_out;
3322
3323 info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
3324 rate64 = tcp_compute_delivery_rate(tp);
3325 if (rate64)
3326 info->tcpi_delivery_rate = rate64;
3327 info->tcpi_delivered = tp->delivered;
3328 info->tcpi_delivered_ce = tp->delivered_ce;
3329 info->tcpi_bytes_sent = tp->bytes_sent;
3330 info->tcpi_bytes_retrans = tp->bytes_retrans;
3331 info->tcpi_dsack_dups = tp->dsack_dups;
3332 info->tcpi_reord_seen = tp->reord_seen;
David Brazdil0f672f62019-12-10 10:32:29 +00003333 info->tcpi_rcv_ooopack = tp->rcv_ooopack;
3334 info->tcpi_snd_wnd = tp->snd_wnd;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003335 unlock_sock_fast(sk, slow);
3336}
3337EXPORT_SYMBOL_GPL(tcp_get_info);
3338
3339static size_t tcp_opt_stats_get_size(void)
3340{
3341 return
3342 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */
3343 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */
3344 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */
3345 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */
3346 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */
3347 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */
3348 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */
3349 nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */
3350 nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */
3351 nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */
3352 nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */
3353 nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */
3354 nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */
3355 nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */
3356 nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */
3357 nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
3358 nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
3359 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
3360 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
3361 nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
3362 nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
David Brazdil0f672f62019-12-10 10:32:29 +00003363 nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003364 0;
3365}
3366
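/* Build the TCP_NLA_* attribute blob that gets attached to transmit
 * timestamps when SOF_TIMESTAMPING_OPT_STATS is requested; the buffer
 * is sized by tcp_opt_stats_get_size() above.
 */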
3367struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
3368{
3369 const struct tcp_sock *tp = tcp_sk(sk);
3370 struct sk_buff *stats;
3371 struct tcp_info info;
David Brazdil0f672f62019-12-10 10:32:29 +00003372 unsigned long rate;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003373 u64 rate64;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003374
3375 stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
3376 if (!stats)
3377 return NULL;
3378
3379 tcp_get_info_chrono_stats(tp, &info);
3380 nla_put_u64_64bit(stats, TCP_NLA_BUSY,
3381 info.tcpi_busy_time, TCP_NLA_PAD);
3382 nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
3383 info.tcpi_rwnd_limited, TCP_NLA_PAD);
3384 nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
3385 info.tcpi_sndbuf_limited, TCP_NLA_PAD);
3386 nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
3387 tp->data_segs_out, TCP_NLA_PAD);
3388 nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
3389 tp->total_retrans, TCP_NLA_PAD);
3390
3391 rate = READ_ONCE(sk->sk_pacing_rate);
David Brazdil0f672f62019-12-10 10:32:29 +00003392 rate64 = (rate != ~0UL) ? rate : ~0ULL;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003393 nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
3394
3395 rate64 = tcp_compute_delivery_rate(tp);
3396 nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
3397
3398 nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
3399 nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
3400 nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
3401
3402 nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
3403 nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
3404 nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
3405 nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
3406 nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
3407
3408 nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
3409 nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
3410
3411 nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
3412 TCP_NLA_PAD);
3413 nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
3414 TCP_NLA_PAD);
3415 nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
3416 nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
David Brazdil0f672f62019-12-10 10:32:29 +00003417 nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003418
3419 return stats;
3420}
3421
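/* Read side of the TCP-level socket options. Most options report a
 * plain int through the common path at the end; the larger ones
 * (TCP_INFO, TCP_CC_INFO, congestion control / ULP names, fastopen
 * keys, the repair window, ...) copy a string or structure to
 * userspace and return directly.
 */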
3422static int do_tcp_getsockopt(struct sock *sk, int level,
3423 int optname, char __user *optval, int __user *optlen)
3424{
3425 struct inet_connection_sock *icsk = inet_csk(sk);
3426 struct tcp_sock *tp = tcp_sk(sk);
3427 struct net *net = sock_net(sk);
3428 int val, len;
3429
3430 if (get_user(len, optlen))
3431 return -EFAULT;
3432
3433 len = min_t(unsigned int, len, sizeof(int));
3434
3435 if (len < 0)
3436 return -EINVAL;
3437
3438 switch (optname) {
3439 case TCP_MAXSEG:
3440 val = tp->mss_cache;
3441 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3442 val = tp->rx_opt.user_mss;
3443 if (tp->repair)
3444 val = tp->rx_opt.mss_clamp;
3445 break;
3446 case TCP_NODELAY:
3447 val = !!(tp->nonagle&TCP_NAGLE_OFF);
3448 break;
3449 case TCP_CORK:
3450 val = !!(tp->nonagle&TCP_NAGLE_CORK);
3451 break;
3452 case TCP_KEEPIDLE:
3453 val = keepalive_time_when(tp) / HZ;
3454 break;
3455 case TCP_KEEPINTVL:
3456 val = keepalive_intvl_when(tp) / HZ;
3457 break;
3458 case TCP_KEEPCNT:
3459 val = keepalive_probes(tp);
3460 break;
3461 case TCP_SYNCNT:
3462 val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
3463 break;
3464 case TCP_LINGER2:
3465 val = tp->linger2;
3466 if (val >= 0)
3467 val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
3468 break;
3469 case TCP_DEFER_ACCEPT:
3470 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
3471 TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
3472 break;
3473 case TCP_WINDOW_CLAMP:
3474 val = tp->window_clamp;
3475 break;
3476 case TCP_INFO: {
3477 struct tcp_info info;
3478
3479 if (get_user(len, optlen))
3480 return -EFAULT;
3481
3482 tcp_get_info(sk, &info);
3483
3484 len = min_t(unsigned int, len, sizeof(info));
3485 if (put_user(len, optlen))
3486 return -EFAULT;
3487 if (copy_to_user(optval, &info, len))
3488 return -EFAULT;
3489 return 0;
3490 }
3491 case TCP_CC_INFO: {
3492 const struct tcp_congestion_ops *ca_ops;
3493 union tcp_cc_info info;
3494 size_t sz = 0;
3495 int attr;
3496
3497 if (get_user(len, optlen))
3498 return -EFAULT;
3499
3500 ca_ops = icsk->icsk_ca_ops;
3501 if (ca_ops && ca_ops->get_info)
3502 sz = ca_ops->get_info(sk, ~0U, &attr, &info);
3503
3504 len = min_t(unsigned int, len, sz);
3505 if (put_user(len, optlen))
3506 return -EFAULT;
3507 if (copy_to_user(optval, &info, len))
3508 return -EFAULT;
3509 return 0;
3510 }
3511 case TCP_QUICKACK:
3512 val = !inet_csk_in_pingpong_mode(sk);
3513 break;
3514
3515 case TCP_CONGESTION:
3516 if (get_user(len, optlen))
3517 return -EFAULT;
3518 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
3519 if (put_user(len, optlen))
3520 return -EFAULT;
3521 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
3522 return -EFAULT;
3523 return 0;
3524
3525 case TCP_ULP:
3526 if (get_user(len, optlen))
3527 return -EFAULT;
3528 len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
3529 if (!icsk->icsk_ulp_ops) {
3530 if (put_user(0, optlen))
3531 return -EFAULT;
3532 return 0;
3533 }
3534 if (put_user(len, optlen))
3535 return -EFAULT;
3536 if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
3537 return -EFAULT;
3538 return 0;
3539
3540 case TCP_FASTOPEN_KEY: {
3541 u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
3542 unsigned int key_len;
3543
3544 if (get_user(len, optlen))
3545 return -EFAULT;
3546
3547 key_len = tcp_fastopen_get_cipher(net, icsk, key) *
3548 TCP_FASTOPEN_KEY_LENGTH;
3549 len = min_t(unsigned int, len, key_len);
3550 if (put_user(len, optlen))
3551 return -EFAULT;
3552 if (copy_to_user(optval, key, len))
3553 return -EFAULT;
3554 return 0;
3555 }
3556 case TCP_THIN_LINEAR_TIMEOUTS:
3557 val = tp->thin_lto;
3558 break;
3559
3560 case TCP_THIN_DUPACK:
3561 val = 0;
3562 break;
3563
3564 case TCP_REPAIR:
3565 val = tp->repair;
3566 break;
3567
3568 case TCP_REPAIR_QUEUE:
3569 if (tp->repair)
3570 val = tp->repair_queue;
3571 else
3572 return -EINVAL;
3573 break;
3574
3575 case TCP_REPAIR_WINDOW: {
3576 struct tcp_repair_window opt;
3577
3578 if (get_user(len, optlen))
3579 return -EFAULT;
3580
3581 if (len != sizeof(opt))
3582 return -EINVAL;
3583
3584 if (!tp->repair)
3585 return -EPERM;
3586
3587 opt.snd_wl1 = tp->snd_wl1;
3588 opt.snd_wnd = tp->snd_wnd;
3589 opt.max_window = tp->max_window;
3590 opt.rcv_wnd = tp->rcv_wnd;
3591 opt.rcv_wup = tp->rcv_wup;
3592
3593 if (copy_to_user(optval, &opt, len))
3594 return -EFAULT;
3595 return 0;
3596 }
3597 case TCP_QUEUE_SEQ:
3598 if (tp->repair_queue == TCP_SEND_QUEUE)
3599 val = tp->write_seq;
3600 else if (tp->repair_queue == TCP_RECV_QUEUE)
3601 val = tp->rcv_nxt;
3602 else
3603 return -EINVAL;
3604 break;
3605
3606 case TCP_USER_TIMEOUT:
3607 val = icsk->icsk_user_timeout;
3608 break;
3609
3610 case TCP_FASTOPEN:
3611 val = icsk->icsk_accept_queue.fastopenq.max_qlen;
3612 break;
3613
3614 case TCP_FASTOPEN_CONNECT:
3615 val = tp->fastopen_connect;
3616 break;
3617
3618 case TCP_FASTOPEN_NO_COOKIE:
3619 val = tp->fastopen_no_cookie;
3620 break;
3621
3622 case TCP_TX_DELAY:
3623 val = tp->tcp_tx_delay;
3624 break;
3625
3626 case TCP_TIMESTAMP:
3627 val = tcp_time_stamp_raw() + tp->tsoffset;
3628 break;
3629 case TCP_NOTSENT_LOWAT:
3630 val = tp->notsent_lowat;
3631 break;
3632 case TCP_INQ:
3633 val = tp->recvmsg_inq;
3634 break;
3635 case TCP_SAVE_SYN:
3636 val = tp->save_syn;
3637 break;
3638 case TCP_SAVED_SYN: {
3639 if (get_user(len, optlen))
3640 return -EFAULT;
3641
3642 lock_sock(sk);
3643 if (tp->saved_syn) {
3644 if (len < tp->saved_syn[0]) {
3645 if (put_user(tp->saved_syn[0], optlen)) {
3646 release_sock(sk);
3647 return -EFAULT;
3648 }
3649 release_sock(sk);
3650 return -EINVAL;
3651 }
3652 len = tp->saved_syn[0];
3653 if (put_user(len, optlen)) {
3654 release_sock(sk);
3655 return -EFAULT;
3656 }
3657 if (copy_to_user(optval, tp->saved_syn + 1, len)) {
3658 release_sock(sk);
3659 return -EFAULT;
3660 }
3661 tcp_saved_syn_free(tp);
3662 release_sock(sk);
3663 } else {
3664 release_sock(sk);
3665 len = 0;
3666 if (put_user(len, optlen))
3667 return -EFAULT;
3668 }
3669 return 0;
3670 }
3671#ifdef CONFIG_MMU
3672 case TCP_ZEROCOPY_RECEIVE: {
3673 struct tcp_zerocopy_receive zc;
3674 int err;
3675
3676 if (get_user(len, optlen))
3677 return -EFAULT;
3678 if (len != sizeof(zc))
3679 return -EINVAL;
3680 if (copy_from_user(&zc, optval, len))
3681 return -EFAULT;
3682 lock_sock(sk);
3683 err = tcp_zerocopy_receive(sk, &zc);
3684 release_sock(sk);
3685 if (!err && copy_to_user(optval, &zc, len))
3686 err = -EFAULT;
3687 return err;
3688 }
3689#endif
3690 default:
3691 return -ENOPROTOOPT;
3692 }
3693
3694 if (put_user(len, optlen))
3695 return -EFAULT;
3696 if (copy_to_user(optval, &val, len))
3697 return -EFAULT;
3698 return 0;
3699}
3700
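/* Example (user space, not part of this file): the fixed-size options
 * handled above are read with a plain getsockopt() call.  A minimal
 * sketch using TCP_INFO; the kernel reports the number of bytes it
 * actually copied back through optlen, which may be less than the
 * caller's struct tcp_info on older kernels:
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("rtt %u us, total retrans %u\n",
 *		       ti.tcpi_rtt, ti.tcpi_total_retrans);
 */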
3701int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
3702 int __user *optlen)
3703{
3704 struct inet_connection_sock *icsk = inet_csk(sk);
3705
3706 if (level != SOL_TCP)
3707 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
3708 optval, optlen);
3709 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
3710}
3711EXPORT_SYMBOL(tcp_getsockopt);
3712
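/* A rough user-space sketch of the TCP_ZEROCOPY_RECEIVE branch handled
 * in do_tcp_getsockopt() above (an illustration under assumptions, not a
 * reference usage): the caller mmap()s a region on the TCP socket itself
 * and passes that address in, and received pages are mapped there
 * instead of being copied.  Alignment constraints and error handling are
 * omitted and details vary by kernel version:
 *
 *	struct tcp_zerocopy_receive zc = { 0 };
 *	socklen_t zc_len = sizeof(zc);
 *	void *addr = mmap(NULL, chunk, PROT_READ, MAP_SHARED, fd, 0);
 *
 *	zc.address = (__u64)(unsigned long)addr;
 *	zc.length  = chunk;
 *	getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len);
 *	// zc.length bytes are now mapped at addr; zc.recv_skip_hint bytes,
 *	// if any, must still be read with a normal recv().
 */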
3713#ifdef CONFIG_COMPAT
3714int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
3715 char __user *optval, int __user *optlen)
3716{
3717 if (level != SOL_TCP)
3718 return inet_csk_compat_getsockopt(sk, level, optname,
3719 optval, optlen);
3720 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
3721}
3722EXPORT_SYMBOL(compat_tcp_getsockopt);
3723#endif
3724
3725#ifdef CONFIG_TCP_MD5SIG
3726static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
3727static DEFINE_MUTEX(tcp_md5sig_mutex);
3728static bool tcp_md5sig_pool_populated = false;
3729
3730static void __tcp_alloc_md5sig_pool(void)
3731{
3732 struct crypto_ahash *hash;
3733 int cpu;
3734
3735 hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
3736 if (IS_ERR(hash))
3737 return;
3738
3739 for_each_possible_cpu(cpu) {
3740 void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
3741 struct ahash_request *req;
3742
3743 if (!scratch) {
3744 scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
3745 sizeof(struct tcphdr),
3746 GFP_KERNEL,
3747 cpu_to_node(cpu));
3748 if (!scratch)
3749 return;
3750 per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
3751 }
3752 if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
3753 continue;
3754
3755 req = ahash_request_alloc(hash, GFP_KERNEL);
3756 if (!req)
3757 return;
3758
3759 ahash_request_set_callback(req, 0, NULL, NULL);
3760
3761 per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
3762 }
3763 /* before setting tcp_md5sig_pool_populated, we must commit all writes
3764 * to memory. See smp_rmb() in tcp_get_md5sig_pool()
3765 */
3766 smp_wmb();
3767 tcp_md5sig_pool_populated = true;
3768}
3769
3770bool tcp_alloc_md5sig_pool(void)
3771{
3772 if (unlikely(!tcp_md5sig_pool_populated)) {
3773 mutex_lock(&tcp_md5sig_mutex);
3774
3775 if (!tcp_md5sig_pool_populated) {
3776 __tcp_alloc_md5sig_pool();
3777 if (tcp_md5sig_pool_populated)
3778 static_branch_inc(&tcp_md5_needed);
3779 }
3780
3781 mutex_unlock(&tcp_md5sig_mutex);
3782 }
3783 return tcp_md5sig_pool_populated;
3784}
3785EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
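/* The pool above backs the RFC 2385 TCP MD5 signature option.  A peer
 * key is normally installed from user space before connect()/listen()
 * with the TCP_MD5SIG socket option; a minimal sketch for an IPv4 peer
 * (user space, error handling omitted):
 *
 *	struct tcp_md5sig md5 = { 0 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	peer->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	md5.tcpm_keylen = strlen("secret");
 *	memcpy(md5.tcpm_key, "secret", md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */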
3786
3787
3788/**
3789 * tcp_get_md5sig_pool - get md5sig_pool for this user
3790 *
3791 * We use a per-CPU structure, so if we succeed we return with preemption
3792 * and BH disabled, to make sure that another thread or a softirq handler
3793 * won't try to grab the same context.
3794 */
3795struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3796{
3797 local_bh_disable();
3798
3799 if (tcp_md5sig_pool_populated) {
3800 /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
3801 smp_rmb();
3802 return this_cpu_ptr(&tcp_md5sig_pool);
3803 }
3804 local_bh_enable();
3805 return NULL;
3806}
3807EXPORT_SYMBOL(tcp_get_md5sig_pool);
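/* A successful tcp_get_md5sig_pool() must be paired with
 * tcp_put_md5sig_pool(), which re-enables BH.  The usual in-kernel
 * pattern is roughly:
 *
 *	struct tcp_md5sig_pool *hp = tcp_get_md5sig_pool();
 *
 *	if (hp) {
 *		// crypto_ahash_init(), tcp_md5_hash_key(),
 *		// tcp_md5_hash_skb_data(), crypto_ahash_final() on hp->md5_req
 *		tcp_put_md5sig_pool();
 *	}
 */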
3808
3809int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3810 const struct sk_buff *skb, unsigned int header_len)
3811{
3812 struct scatterlist sg;
3813 const struct tcphdr *tp = tcp_hdr(skb);
3814 struct ahash_request *req = hp->md5_req;
3815 unsigned int i;
3816 const unsigned int head_data_len = skb_headlen(skb) > header_len ?
3817 skb_headlen(skb) - header_len : 0;
3818 const struct skb_shared_info *shi = skb_shinfo(skb);
3819 struct sk_buff *frag_iter;
3820
3821 sg_init_table(&sg, 1);
3822
3823 sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
3824 ahash_request_set_crypt(req, &sg, NULL, head_data_len);
3825 if (crypto_ahash_update(req))
3826 return 1;
3827
3828 for (i = 0; i < shi->nr_frags; ++i) {
3829 const skb_frag_t *f = &shi->frags[i];
3830 unsigned int offset = skb_frag_off(f);
3831 struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
3832
3833 sg_set_page(&sg, page, skb_frag_size(f),
3834 offset_in_page(offset));
3835 ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
3836 if (crypto_ahash_update(req))
3837 return 1;
3838 }
3839
3840 skb_walk_frags(skb, frag_iter)
3841 if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
3842 return 1;
3843
3844 return 0;
3845}
3846EXPORT_SYMBOL(tcp_md5_hash_skb_data);
3847
3848int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
3849{
3850 u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */
3851 struct scatterlist sg;
3852
3853 sg_init_one(&sg, key->key, keylen);
3854 ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen);
3855
3856 /* tcp_md5_do_add() might change key->key under us */
3857 return crypto_ahash_update(hp->md5_req);
3858}
3859EXPORT_SYMBOL(tcp_md5_hash_key);
3860
3861#endif
3862
3863void tcp_done(struct sock *sk)
3864{
3865 struct request_sock *req;
3866
3867 /* We might be called with a new socket, after
3868 * inet_csk_prepare_forced_close() has been called,
3869 * so we cannot use lockdep_sock_is_held(sk) here.
3870 */
3871 req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
3872
3873 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3874 TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3875
3876 tcp_set_state(sk, TCP_CLOSE);
3877 tcp_clear_xmit_timers(sk);
3878 if (req)
3879 reqsk_fastopen_remove(sk, req, false);
3880
3881 sk->sk_shutdown = SHUTDOWN_MASK;
3882
3883 if (!sock_flag(sk, SOCK_DEAD))
3884 sk->sk_state_change(sk);
3885 else
3886 inet_csk_destroy_sock(sk);
3887}
3888EXPORT_SYMBOL_GPL(tcp_done);
3889
3890int tcp_abort(struct sock *sk, int err)
3891{
3892 if (!sk_fullsock(sk)) {
3893 if (sk->sk_state == TCP_NEW_SYN_RECV) {
3894 struct request_sock *req = inet_reqsk(sk);
3895
3896 local_bh_disable();
3897 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
3898 local_bh_enable();
3899 return 0;
3900 }
3901 return -EOPNOTSUPP;
3902 }
3903
3904 /* Don't race with userspace socket closes such as tcp_close. */
3905 lock_sock(sk);
3906
3907 if (sk->sk_state == TCP_LISTEN) {
3908 tcp_set_state(sk, TCP_CLOSE);
3909 inet_csk_listen_stop(sk);
3910 }
3911
3912 /* Don't race with BH socket closes such as inet_csk_listen_stop. */
3913 local_bh_disable();
3914 bh_lock_sock(sk);
3915
3916 if (!sock_flag(sk, SOCK_DEAD)) {
3917 sk->sk_err = err;
3918 /* This barrier is coupled with smp_rmb() in tcp_poll() */
3919 smp_wmb();
3920 sk->sk_error_report(sk);
3921 if (tcp_need_reset(sk->sk_state))
3922 tcp_send_active_reset(sk, GFP_ATOMIC);
3923 tcp_done(sk);
3924 }
3925
3926 bh_unlock_sock(sk);
3927 local_bh_enable();
3928 tcp_write_queue_purge(sk);
3929 release_sock(sk);
3930 return 0;
3931}
3932EXPORT_SYMBOL_GPL(tcp_abort);
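/* tcp_abort() is also what the inet_diag SOCK_DESTROY command ends up
 * calling when CONFIG_INET_DIAG_DESTROY is enabled; from user space that
 * path is typically driven with iproute2, e.g. something like
 * "ss -K dst 192.0.2.1", which requires CAP_NET_ADMIN.
 */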
3933
3934extern struct tcp_congestion_ops tcp_reno;
3935
3936static __initdata unsigned long thash_entries;
3937static int __init set_thash_entries(char *str)
3938{
3939 ssize_t ret;
3940
3941 if (!str)
3942 return 0;
3943
3944 ret = kstrtoul(str, 0, &thash_entries);
3945 if (ret)
3946 return 0;
3947
3948 return 1;
3949}
3950__setup("thash_entries=", set_thash_entries);
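/* The established-hash sizing below can be forced from the kernel
 * command line, e.g.:
 *
 *	thash_entries=131072
 *
 * Leaving it at 0 (the default) lets tcp_init() size the table from
 * available memory.
 */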
3951
3952static void __init tcp_init_mem(void)
3953{
3954 unsigned long limit = nr_free_buffer_pages() / 16;
3955
3956 limit = max(limit, 128UL);
3957 sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */
3958 sysctl_tcp_mem[1] = limit; /* 6.25 % */
3959 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */
3960}
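/* Purely illustrative arithmetic: with 1,048,576 free buffer pages
 * (4 GiB of 4 KiB pages) the above yields
 *
 *	limit             = 1048576 / 16   = 65536 pages
 *	sysctl_tcp_mem[0] = 65536 / 4 * 3  = 49152 pages (~4.7 %)
 *	sysctl_tcp_mem[1] = 65536          = 65536 pages (6.25 %)
 *	sysctl_tcp_mem[2] = 49152 * 2      = 98304 pages (~9.4 %)
 *
 * All three values are in pages, matching the tcp_mem sysctl.
 */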
3961
3962void __init tcp_init(void)
3963{
3964 int max_rshare, max_wshare, cnt;
3965 unsigned long limit;
3966 unsigned int i;
3967
3968 BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
3969 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
3970 FIELD_SIZEOF(struct sk_buff, cb));
3971
3972 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
3973 percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
3974 inet_hashinfo_init(&tcp_hashinfo);
3975 inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
3976 thash_entries, 21, /* one slot per 2 MB */
3977 0, 64 * 1024);
3978 tcp_hashinfo.bind_bucket_cachep =
3979 kmem_cache_create("tcp_bind_bucket",
3980 sizeof(struct inet_bind_bucket), 0,
3981 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3982
3983 /* Size and allocate the main established and bind bucket
3984 * hash tables.
3985 *
3986 * The methodology is similar to that of the buffer cache.
3987 */
3988 tcp_hashinfo.ehash =
3989 alloc_large_system_hash("TCP established",
3990 sizeof(struct inet_ehash_bucket),
3991 thash_entries,
3992 17, /* one slot per 128 KB of memory */
3993 0,
3994 NULL,
3995 &tcp_hashinfo.ehash_mask,
3996 0,
3997 thash_entries ? 0 : 512 * 1024);
3998 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
3999 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
4000
4001 if (inet_ehash_locks_alloc(&tcp_hashinfo))
4002 panic("TCP: failed to alloc ehash_locks");
4003 tcp_hashinfo.bhash =
4004 alloc_large_system_hash("TCP bind",
4005 sizeof(struct inet_bind_hashbucket),
4006 tcp_hashinfo.ehash_mask + 1,
4007 17, /* one slot per 128 KB of memory */
4008 0,
4009 &tcp_hashinfo.bhash_size,
4010 NULL,
4011 0,
4012 64 * 1024);
4013 tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
4014 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
4015 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
4016 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
4017 }
4018
4019
4020 cnt = tcp_hashinfo.ehash_mask + 1;
4021 sysctl_tcp_max_orphans = cnt / 2;
4022
4023 tcp_init_mem();
4024 /* Set per-socket limits to no more than 1/128 the pressure threshold */
4025 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
4026 max_wshare = min(4UL*1024*1024, limit);
4027 max_rshare = min(6UL*1024*1024, limit);
4028
4029 init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
4030 init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
4031 init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
4032
4033 init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
4034 init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
4035 init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
4036
4037 pr_info("Hash tables configured (established %u bind %u)\n",
4038 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
4039
4040 tcp_v4_init();
4041 tcp_metrics_init();
4042 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
4043 tcp_tasklet_init();
4044}