Blame - net/ipv4/tcp_ipv4.c - hafnium/third_party/linux

blob: cd426313a29819b34648086b551fe9390d8a0b0a [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame^]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Implementation of the Transmission Control Protocol(TCP).
				7	*
				8	* IPv4 specific functions
				9	*
				10	*
				11	* code split from:
				12	* linux/ipv4/tcp.c
				13	* linux/ipv4/tcp_input.c
				14	* linux/ipv4/tcp_output.c
				15	*
				16	* See tcp.c for author information
				17	*
				18	* This program is free software; you can redistribute it and/or
				19	* modify it under the terms of the GNU General Public License
				20	* as published by the Free Software Foundation; either version
				21	* 2 of the License, or (at your option) any later version.
				22	*/
				23
				24	/*
				25	* Changes:
				26	* David S. Miller : New socket lookup architecture.
				27	* This code is dedicated to John Dyson.
				28	* David S. Miller : Change semantics of established hash,
				29	* half is devoted to TIME_WAIT sockets
				30	* and the rest go in the other half.
				31	* Andi Kleen : Add support for syncookies and fixed
				32	* some bugs: ip options weren't passed to
				33	* the TCP layer, missed a check for an
				34	* ACK bit.
				35	* Andi Kleen : Implemented fast path mtu discovery.
				36	* Fixed many serious bugs in the
				37	* request_sock handling and moved
				38	* most of it into the af independent code.
				39	* Added tail drop and some other bugfixes.
				40	* Added new listen semantics.
				41	* Mike McLagan : Routing by source
				42	* Juan Jose Ciarlante: ip_dynaddr bits
				43	* Andi Kleen: various fixes.
				44	* Vitaly E. Lavrov : Transparent proxy revived after year
				45	* coma.
				46	* Andi Kleen : Fix new listen.
				47	* Andi Kleen : Fix accept error reporting.
				48	* YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
				49	* Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
				50	* a single port at the same time.
				51	*/
				52
				53	#define pr_fmt(fmt) "TCP: " fmt
				54
				55	#include <linux/bottom_half.h>
				56	#include <linux/types.h>
				57	#include <linux/fcntl.h>
				58	#include <linux/module.h>
				59	#include <linux/random.h>
				60	#include <linux/cache.h>
				61	#include <linux/jhash.h>
				62	#include <linux/init.h>
				63	#include <linux/times.h>
				64	#include <linux/slab.h>
				65
				66	#include <net/net_namespace.h>
				67	#include <net/icmp.h>
				68	#include <net/inet_hashtables.h>
				69	#include <net/tcp.h>
				70	#include <net/transp_v6.h>
				71	#include <net/ipv6.h>
				72	#include <net/inet_common.h>
				73	#include <net/timewait_sock.h>
				74	#include <net/xfrm.h>
				75	#include <net/secure_seq.h>
				76	#include <net/busy_poll.h>
				77
				78	#include <linux/inet.h>
				79	#include <linux/ipv6.h>
				80	#include <linux/stddef.h>
				81	#include <linux/proc_fs.h>
				82	#include <linux/seq_file.h>
				83	#include <linux/inetdevice.h>
				84
				85	#include <crypto/hash.h>
				86	#include <linux/scatterlist.h>
				87
				88	#include <trace/events/tcp.h>
				89
				90	#ifdef CONFIG_TCP_MD5SIG
				91	static int tcp_v4_md5_hash_hdr(char md5_hash, const struct tcp_md5sig_key key,
				92	__be32 daddr, __be32 saddr, const struct tcphdr *th);
				93	#endif
				94
				95	struct inet_hashinfo tcp_hashinfo; /* socket hash tables handed to the __inet_lookup_*() calls in this file */
				96	EXPORT_SYMBOL(tcp_hashinfo);
				97
				98	static u32 tcp_v4_init_seq(const struct sk_buff *skb)
				99	{
				100	return secure_tcp_seq(ip_hdr(skb)->daddr,
				101	ip_hdr(skb)->saddr,
				102	tcp_hdr(skb)->dest,
				103	tcp_hdr(skb)->source);
				104	}
				105
				106	static u32 tcp_v4_init_ts_off(const struct net net, const struct sk_buff skb)
				107	{
				108	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
				109	}
				110
				111	int tcp_twsk_unique(struct sock sk, struct sock sktw, void *twp)
				112	{
				113	const struct inet_timewait_sock *tw = inet_twsk(sktw);
				114	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
				115	struct tcp_sock *tp = tcp_sk(sk);
				116	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
				117
				118	if (reuse == 2) {
				119	/* Still does not detect everything that goes through
				120	* lo, since we require a loopback src or dst address
				121	* or direct binding to 'lo' interface.
				122	*/
				123	bool loopback = false;
				124	if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
				125	loopback = true;
				126	#if IS_ENABLED(CONFIG_IPV6)
				127	if (tw->tw_family == AF_INET6) {
				128	if (ipv6_addr_loopback(&tw->tw_v6_daddr) \|\|
				129	(ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
				130	(tw->tw_v6_daddr.s6_addr[12] == 127)) \|\|
				131	ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) \|\|
				132	(ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
				133	(tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
				134	loopback = true;
				135	} else
				136	#endif
				137	{
				138	if (ipv4_is_loopback(tw->tw_daddr) \|\|
				139	ipv4_is_loopback(tw->tw_rcv_saddr))
				140	loopback = true;
				141	}
				142	if (!loopback)
				143	reuse = 0;
				144	}
				145
				146	/* With PAWS, it is safe from the viewpoint
				147	of data integrity. Even without PAWS it is safe provided sequence
				148	spaces do not overlap i.e. at data rates <= 80Mbit/sec.
				149
				150	Actually, the idea is close to VJ's one, only timestamp cache is
				151	held not per host, but per port pair and TW bucket is used as state
				152	holder.
				153
				154	If TW bucket has been already destroyed we fall back to VJ's scheme
				155	and use initial timestamp retrieved from peer table.
				156	*/
				157	if (tcptw->tw_ts_recent_stamp &&
				158	(!twp \|\| (reuse && time_after32(ktime_get_seconds(),
				159	tcptw->tw_ts_recent_stamp)))) {
				160	/* In case of repair and re-using TIME-WAIT sockets we still
				161	* want to be sure that it is safe as above but honor the
				162	* sequence numbers and time stamps set as part of the repair
				163	* process.
				164	*
				165	* Without this check re-using a TIME-WAIT socket with TCP
				166	* repair would accumulate a -1 on the repair assigned
				167	* sequence number. The first time it is reused the sequence
				168	* is -1, the second time -2, etc. This fixes that issue
				169	* without appearing to create any others.
				170	*/
				171	if (likely(!tp->repair)) {
				172	tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
				173	if (tp->write_seq == 0)
				174	tp->write_seq = 1;
				175	tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
				176	tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
				177	}
				178	sock_hold(sktw);
				179	return 1;
				180	}
				181
				182	return 0;
				183	}
				184	EXPORT_SYMBOL_GPL(tcp_twsk_unique);
				185
				186	static int tcp_v4_pre_connect(struct sock sk, struct sockaddr uaddr,
				187	int addr_len)
				188	{
				189	/* This check is replicated from tcp_v4_connect() and intended to
				190	* prevent BPF program called below from accessing bytes that are out
				191	* of the bound specified by user in addr_len.
				192	*/
				193	if (addr_len < sizeof(struct sockaddr_in))
				194	return -EINVAL;
				195
				196	sock_owned_by_me(sk);
				197
				198	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
				199	}
				200
				201	/* This will initiate an outgoing connection. */
				202	int tcp_v4_connect(struct sock sk, struct sockaddr uaddr, int addr_len)
				203	{
				204	struct sockaddr_in usin = (struct sockaddr_in )uaddr;
				205	struct inet_sock *inet = inet_sk(sk);
				206	struct tcp_sock *tp = tcp_sk(sk);
				207	__be16 orig_sport, orig_dport;
				208	__be32 daddr, nexthop;
				209	struct flowi4 *fl4;
				210	struct rtable *rt;
				211	int err;
				212	struct ip_options_rcu *inet_opt;
				213	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
				214
				215	if (addr_len < sizeof(struct sockaddr_in))
				216	return -EINVAL;
				217
				218	if (usin->sin_family != AF_INET)
				219	return -EAFNOSUPPORT;
				220
				221	nexthop = daddr = usin->sin_addr.s_addr;
				222	inet_opt = rcu_dereference_protected(inet->inet_opt,
				223	lockdep_sock_is_held(sk));
				224	if (inet_opt && inet_opt->opt.srr) {
				225	if (!daddr)
				226	return -EINVAL;
				227	nexthop = inet_opt->opt.faddr;
				228	}
				229
				230	orig_sport = inet->inet_sport;
				231	orig_dport = usin->sin_port;
				232	fl4 = &inet->cork.fl.u.ip4;
				233	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
				234	RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
				235	IPPROTO_TCP,
				236	orig_sport, orig_dport, sk);
				237	if (IS_ERR(rt)) {
				238	err = PTR_ERR(rt);
				239	if (err == -ENETUNREACH)
				240	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
				241	return err;
				242	}
				243
				244	if (rt->rt_flags & (RTCF_MULTICAST \| RTCF_BROADCAST)) {
				245	ip_rt_put(rt);
				246	return -ENETUNREACH;
				247	}
				248
				249	if (!inet_opt \|\| !inet_opt->opt.srr)
				250	daddr = fl4->daddr;
				251
				252	if (!inet->inet_saddr)
				253	inet->inet_saddr = fl4->saddr;
				254	sk_rcv_saddr_set(sk, inet->inet_saddr);
				255
				256	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
				257	/* Reset inherited state */
				258	tp->rx_opt.ts_recent = 0;
				259	tp->rx_opt.ts_recent_stamp = 0;
				260	if (likely(!tp->repair))
				261	tp->write_seq = 0;
				262	}
				263
				264	inet->inet_dport = usin->sin_port;
				265	sk_daddr_set(sk, daddr);
				266
				267	inet_csk(sk)->icsk_ext_hdr_len = 0;
				268	if (inet_opt)
				269	inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
				270
				271	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
				272
				273	/* Socket identity is still unknown (sport may be zero).
				274	* However we set state to SYN-SENT and not releasing socket
				275	* lock select source port, enter ourselves into the hash tables and
				276	* complete initialization after this.
				277	*/
				278	tcp_set_state(sk, TCP_SYN_SENT);
				279	err = inet_hash_connect(tcp_death_row, sk);
				280	if (err)
				281	goto failure;
				282
				283	sk_set_txhash(sk);
				284
				285	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
				286	inet->inet_sport, inet->inet_dport, sk);
				287	if (IS_ERR(rt)) {
				288	err = PTR_ERR(rt);
				289	rt = NULL;
				290	goto failure;
				291	}
				292	/* OK, now commit destination to socket. */
				293	sk->sk_gso_type = SKB_GSO_TCPV4;
				294	sk_setup_caps(sk, &rt->dst);
				295	rt = NULL;
				296
				297	if (likely(!tp->repair)) {
				298	if (!tp->write_seq)
				299	tp->write_seq = secure_tcp_seq(inet->inet_saddr,
				300	inet->inet_daddr,
				301	inet->inet_sport,
				302	usin->sin_port);
				303	tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
				304	inet->inet_saddr,
				305	inet->inet_daddr);
				306	}
				307
				308	inet->inet_id = tp->write_seq ^ jiffies;
				309
				310	if (tcp_fastopen_defer_connect(sk, &err))
				311	return err;
				312	if (err)
				313	goto failure;
				314
				315	err = tcp_connect(sk);
				316
				317	if (err)
				318	goto failure;
				319
				320	return 0;
				321
				322	failure:
				323	/*
				324	* This unhashes the socket and releases the local port,
				325	* if necessary.
				326	*/
				327	tcp_set_state(sk, TCP_CLOSE);
				328	ip_rt_put(rt);
				329	sk->sk_route_caps = 0;
				330	inet->inet_dport = 0;
				331	return err;
				332	}
				333	EXPORT_SYMBOL(tcp_v4_connect);
				334
				335	/*
				336	* This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
				337	* It can be called through tcp_release_cb() if socket was owned by user
				338	* at the time tcp_v4_err() was called to handle ICMP message.
				339	*/
				340	void tcp_v4_mtu_reduced(struct sock *sk)
				341	{
				342	struct inet_sock *inet = inet_sk(sk);
				343	struct dst_entry *dst;
				344	u32 mtu;
				345
				346	if ((1 << sk->sk_state) & (TCPF_LISTEN \| TCPF_CLOSE))
				347	return;
				348	mtu = tcp_sk(sk)->mtu_info;
				349	dst = inet_csk_update_pmtu(sk, mtu);
				350	if (!dst)
				351	return;
				352
				353	/* Something is about to be wrong... Remember soft error
				354	* for the case, if this connection will not able to recover.
				355	*/
				356	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
				357	sk->sk_err_soft = EMSGSIZE;
				358
				359	mtu = dst_mtu(dst);
				360
				361	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
				362	ip_sk_accept_pmtu(sk) &&
				363	inet_csk(sk)->icsk_pmtu_cookie > mtu) {
				364	tcp_sync_mss(sk, mtu);
				365
				366	/* Resend the TCP packet because it's
				367	* clear that the old packet has been
				368	* dropped. This is the new "fast" path mtu
				369	* discovery.
				370	*/
				371	tcp_simple_retransmit(sk);
				372	} /* else let the usual retransmit timer handle it */
				373	}
				374	EXPORT_SYMBOL(tcp_v4_mtu_reduced);
				375
				376	static void do_redirect(struct sk_buff skb, struct sock sk)
				377	{
				378	struct dst_entry *dst = __sk_dst_check(sk, 0);
				379
				380	if (dst)
				381	dst->ops->redirect(dst, sk, skb);
				382	}
				383
				384
				385	/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
				386	void tcp_req_err(struct sock *sk, u32 seq, bool abort)
				387	{
				388	struct request_sock *req = inet_reqsk(sk);
				389	struct net *net = sock_net(sk);
				390
				391	/* ICMPs are not backlogged, hence we cannot get
				392	* an established socket here.
				393	*/
				394	if (seq != tcp_rsk(req)->snt_isn) {
				395	__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
				396	} else if (abort) {
				397	/*
				398	* Still in SYN_RECV, just remove it silently.
				399	* There is no good way to pass the error to the newly
				400	* created socket, and POSIX does not want network
				401	* errors returned from accept().
				402	*/
				403	inet_csk_reqsk_queue_drop(req->rsk_listener, req);
				404	tcp_listendrop(req->rsk_listener);
				405	}
				406	reqsk_put(req);
				407	}
				408	EXPORT_SYMBOL(tcp_req_err);
				409
				410	/*
				411	* This routine is called by the ICMP module when it gets some
				412	* sort of error condition. If err < 0 then the socket should
				413	* be closed and the error returned to the user. If err > 0
				414	* it's just the icmp type << 8 \| icmp code. After adjustment
				415	* header points to the first 8 bytes of the tcp header. We need
				416	* to find the appropriate port.
				417	*
				418	* The locking strategy used here is very "optimistic". When
				419	* someone else accesses the socket the ICMP is just dropped
				420	* and for some paths there is no check at all.
				421	* A more general error queue to queue errors for later handling
				422	* is probably better.
				423	*
				424	*/
				425
				426	void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
				427	{
				428	const struct iphdr iph = (const struct iphdr )icmp_skb->data;
				429	struct tcphdr th = (struct tcphdr )(icmp_skb->data + (iph->ihl << 2));
				430	struct inet_connection_sock *icsk;
				431	struct tcp_sock *tp;
				432	struct inet_sock *inet;
				433	const int type = icmp_hdr(icmp_skb)->type;
				434	const int code = icmp_hdr(icmp_skb)->code;
				435	struct sock *sk;
				436	struct sk_buff *skb;
				437	struct request_sock *fastopen;
				438	u32 seq, snd_una;
				439	s32 remaining;
				440	u32 delta_us;
				441	int err;
				442	struct net *net = dev_net(icmp_skb->dev);
				443
				444	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				445	th->dest, iph->saddr, ntohs(th->source),
				446	inet_iif(icmp_skb), 0);
				447	if (!sk) {
				448	__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
				449	return;
				450	}
				451	if (sk->sk_state == TCP_TIME_WAIT) {
				452	inet_twsk_put(inet_twsk(sk));
				453	return;
				454	}
				455	seq = ntohl(th->seq);
				456	if (sk->sk_state == TCP_NEW_SYN_RECV)
				457	return tcp_req_err(sk, seq,
				458	type == ICMP_PARAMETERPROB \|\|
				459	type == ICMP_TIME_EXCEEDED \|\|
				460	(type == ICMP_DEST_UNREACH &&
				461	(code == ICMP_NET_UNREACH \|\|
				462	code == ICMP_HOST_UNREACH)));
				463
				464	bh_lock_sock(sk);
				465	/* If too many ICMPs get dropped on busy
				466	* servers this needs to be solved differently.
				467	* We do take care of PMTU discovery (RFC1191) special case :
				468	* we can receive locally generated ICMP messages while socket is held.
				469	*/
				470	if (sock_owned_by_user(sk)) {
				471	if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
				472	__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
				473	}
				474	if (sk->sk_state == TCP_CLOSE)
				475	goto out;
				476
				477	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
				478	__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
				479	goto out;
				480	}
				481
				482	icsk = inet_csk(sk);
				483	tp = tcp_sk(sk);
				484	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
				485	fastopen = tp->fastopen_rsk;
				486	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
				487	if (sk->sk_state != TCP_LISTEN &&
				488	!between(seq, snd_una, tp->snd_nxt)) {
				489	__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
				490	goto out;
				491	}
				492
				493	switch (type) {
				494	case ICMP_REDIRECT:
				495	if (!sock_owned_by_user(sk))
				496	do_redirect(icmp_skb, sk);
				497	goto out;
				498	case ICMP_SOURCE_QUENCH:
				499	/* Just silently ignore these. */
				500	goto out;
				501	case ICMP_PARAMETERPROB:
				502	err = EPROTO;
				503	break;
				504	case ICMP_DEST_UNREACH:
				505	if (code > NR_ICMP_UNREACH)
				506	goto out;
				507
				508	if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
				509	/* We are not interested in TCP_LISTEN and open_requests
				510	* (SYN-ACKs send out by Linux are always <576bytes so
				511	* they should go through unfragmented).
				512	*/
				513	if (sk->sk_state == TCP_LISTEN)
				514	goto out;
				515
				516	tp->mtu_info = info;
				517	if (!sock_owned_by_user(sk)) {
				518	tcp_v4_mtu_reduced(sk);
				519	} else {
				520	if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
				521	sock_hold(sk);
				522	}
				523	goto out;
				524	}
				525
				526	err = icmp_err_convert[code].errno;
				527	/* check if icmp_skb allows revert of backoff
				528	* (see draft-zimmermann-tcp-lcd) */
				529	if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
				530	break;
				531	if (seq != tp->snd_una \|\| !icsk->icsk_retransmits \|\|
				532	!icsk->icsk_backoff \|\| fastopen)
				533	break;
				534
				535	if (sock_owned_by_user(sk))
				536	break;
				537
				538	icsk->icsk_backoff--;
				539	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
				540	TCP_TIMEOUT_INIT;
				541	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
				542
				543	skb = tcp_rtx_queue_head(sk);
				544	BUG_ON(!skb);
				545
				546	tcp_mstamp_refresh(tp);
				547	delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
				548	remaining = icsk->icsk_rto -
				549	usecs_to_jiffies(delta_us);
				550
				551	if (remaining > 0) {
				552	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				553	remaining, TCP_RTO_MAX);
				554	} else {
				555	/* RTO revert clocked out retransmission.
				556	* Will retransmit now */
				557	tcp_retransmit_timer(sk);
				558	}
				559
				560	break;
				561	case ICMP_TIME_EXCEEDED:
				562	err = EHOSTUNREACH;
				563	break;
				564	default:
				565	goto out;
				566	}
				567
				568	switch (sk->sk_state) {
				569	case TCP_SYN_SENT:
				570	case TCP_SYN_RECV:
				571	/* Only in fast or simultaneous open. If a fast open socket is
				572	* is already accepted it is treated as a connected one below.
				573	*/
				574	if (fastopen && !fastopen->sk)
				575	break;
				576
				577	if (!sock_owned_by_user(sk)) {
				578	sk->sk_err = err;
				579
				580	sk->sk_error_report(sk);
				581
				582	tcp_done(sk);
				583	} else {
				584	sk->sk_err_soft = err;
				585	}
				586	goto out;
				587	}
				588
				589	/* If we've already connected we will keep trying
				590	* until we time out, or the user gives up.
				591	*
				592	* rfc1122 4.2.3.9 allows to consider as hard errors
				593	* only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
				594	* but it is obsoleted by pmtu discovery).
				595	*
				596	* Note, that in modern internet, where routing is unreliable
				597	* and in each dark corner broken firewalls sit, sending random
				598	* errors ordered by their masters even this two messages finally lose
				599	* their original sense (even Linux sends invalid PORT_UNREACHs)
				600	*
				601	* Now we are in compliance with RFCs.
				602	* --ANK (980905)
				603	*/
				604
				605	inet = inet_sk(sk);
				606	if (!sock_owned_by_user(sk) && inet->recverr) {
				607	sk->sk_err = err;
				608	sk->sk_error_report(sk);
				609	} else { /* Only an error on timeout */
				610	sk->sk_err_soft = err;
				611	}
				612
				613	out:
				614	bh_unlock_sock(sk);
				615	sock_put(sk);
				616	}
				617
				618	void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
				619	{
				620	struct tcphdr *th = tcp_hdr(skb);
				621
				622	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
				623	skb->csum_start = skb_transport_header(skb) - skb->head;
				624	skb->csum_offset = offsetof(struct tcphdr, check);
				625	}
				626
				627	/* This routine computes an IPv4 TCP checksum. */
				628	void tcp_v4_send_check(struct sock sk, struct sk_buff skb)
				629	{
				630	const struct inet_sock *inet = inet_sk(sk);
				631
				632	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
				633	}
				634	EXPORT_SYMBOL(tcp_v4_send_check);
				635
				636	/*
				637	* This routine will send an RST to the other tcp.
				638	*
				639	* Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
				640	* for reset.
				641	* Answer: if a packet caused RST, it is not for a socket
				642	* existing in our system, if it is matched to a socket,
				643	* it is just duplicate segment or bug in other side's TCP.
				644	* So that we build reply only basing on parameters
				645	* arrived with segment.
				646	* Exception: precedence violation. We do not implement it in any case.
				647	*/
				648
				649	static void tcp_v4_send_reset(const struct sock sk, struct sk_buff skb)
				650	{
				651	const struct tcphdr *th = tcp_hdr(skb);
				652	struct {
				653	struct tcphdr th;
				654	#ifdef CONFIG_TCP_MD5SIG
				655	__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
				656	#endif
				657	} rep;
				658	struct ip_reply_arg arg;
				659	#ifdef CONFIG_TCP_MD5SIG
				660	struct tcp_md5sig_key *key = NULL;
				661	const __u8 *hash_location = NULL;
				662	unsigned char newhash[16];
				663	int genhash;
				664	struct sock *sk1 = NULL;
				665	#endif
				666	struct net *net;
				667	struct sock *ctl_sk;
				668
				669	/* Never send a reset in response to a reset. */
				670	if (th->rst)
				671	return;
				672
				673	/* If sk not NULL, it means we did a successful lookup and incoming
				674	* route had to be correct. prequeue might have dropped our dst.
				675	*/
				676	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
				677	return;
				678
				679	/* Swap the send and the receive. */
				680	memset(&rep, 0, sizeof(rep));
				681	rep.th.dest = th->source;
				682	rep.th.source = th->dest;
				683	rep.th.doff = sizeof(struct tcphdr) / 4;
				684	rep.th.rst = 1;
				685
				686	if (th->ack) {
				687	rep.th.seq = th->ack_seq;
				688	} else {
				689	rep.th.ack = 1;
				690	rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				691	skb->len - (th->doff << 2));
				692	}
				693
				694	memset(&arg, 0, sizeof(arg));
				695	arg.iov[0].iov_base = (unsigned char *)&rep;
				696	arg.iov[0].iov_len = sizeof(rep.th);
				697
				698	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
				699	#ifdef CONFIG_TCP_MD5SIG
				700	rcu_read_lock();
				701	hash_location = tcp_parse_md5sig_option(th);
				702	if (sk && sk_fullsock(sk)) {
				703	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
				704	&ip_hdr(skb)->saddr, AF_INET);
				705	} else if (hash_location) {
				706	/*
				707	* active side is lost. Try to find listening socket through
				708	* source port, and then find md5 key through listening socket.
				709	* we are not loose security here:
				710	* Incoming packet is checked with md5 hash with finding key,
				711	* no RST generated if md5 hash doesn't match.
				712	*/
				713	sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
				714	ip_hdr(skb)->saddr,
				715	th->source, ip_hdr(skb)->daddr,
				716	ntohs(th->source), inet_iif(skb),
				717	tcp_v4_sdif(skb));
				718	/* don't send rst if it can't find key */
				719	if (!sk1)
				720	goto out;
				721
				722	key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
				723	&ip_hdr(skb)->saddr, AF_INET);
				724	if (!key)
				725	goto out;
				726
				727
				728	genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
				729	if (genhash \|\| memcmp(hash_location, newhash, 16) != 0)
				730	goto out;
				731
				732	}
				733
				734	if (key) {
				735	rep.opt[0] = htonl((TCPOPT_NOP << 24) \|
				736	(TCPOPT_NOP << 16) \|
				737	(TCPOPT_MD5SIG << 8) \|
				738	TCPOLEN_MD5SIG);
				739	/* Update length and the length the header thinks exists */
				740	arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
				741	rep.th.doff = arg.iov[0].iov_len / 4;
				742
				743	tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				744	key, ip_hdr(skb)->saddr,
				745	ip_hdr(skb)->daddr, &rep.th);
				746	}
				747	#endif
				748	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				749	ip_hdr(skb)->saddr, /* XXX */
				750	arg.iov[0].iov_len, IPPROTO_TCP, 0);
				751	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
				752	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
				753
				754	/* When socket is gone, all binding information is lost.
				755	* routing might fail in this case. No choice here, if we choose to force
				756	* input interface, we will misroute in case of asymmetric route.
				757	*/
				758	if (sk) {
				759	arg.bound_dev_if = sk->sk_bound_dev_if;
				760	if (sk_fullsock(sk))
				761	trace_tcp_send_reset(sk, skb);
				762	}
				763
				764	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
				765	offsetof(struct inet_timewait_sock, tw_bound_dev_if));
				766
				767	arg.tos = ip_hdr(skb)->tos;
				768	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
				769	local_bh_disable();
				770	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
				771	if (sk)
				772	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				773	inet_twsk(sk)->tw_mark : sk->sk_mark;
				774	ip_send_unicast_reply(ctl_sk,
				775	skb, &TCP_SKB_CB(skb)->header.h4.opt,
				776	ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
				777	&arg, arg.iov[0].iov_len);
				778
				779	ctl_sk->sk_mark = 0;
				780	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
				781	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
				782	local_bh_enable();
				783
				784	#ifdef CONFIG_TCP_MD5SIG
				785	out:
				786	rcu_read_unlock();
				787	#endif
				788	}
				789
				790	/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
				791	outside socket context is ugly, certainly. What can I do?
				792	*/
				793
				794	static void tcp_v4_send_ack(const struct sock *sk,
				795	struct sk_buff *skb, u32 seq, u32 ack,
				796	u32 win, u32 tsval, u32 tsecr, int oif,
				797	struct tcp_md5sig_key *key,
				798	int reply_flags, u8 tos)
				799	{
				800	const struct tcphdr *th = tcp_hdr(skb);
				801	struct {
				802	struct tcphdr th;
				803	__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
				804	#ifdef CONFIG_TCP_MD5SIG
				805	+ (TCPOLEN_MD5SIG_ALIGNED >> 2)
				806	#endif
				807	];
				808	} rep;
				809	struct net *net = sock_net(sk);
				810	struct ip_reply_arg arg;
				811	struct sock *ctl_sk;
				812
				813	memset(&rep.th, 0, sizeof(struct tcphdr));
				814	memset(&arg, 0, sizeof(arg));
				815
				816	arg.iov[0].iov_base = (unsigned char *)&rep;
				817	arg.iov[0].iov_len = sizeof(rep.th);
				818	if (tsecr) {
				819	rep.opt[0] = htonl((TCPOPT_NOP << 24) \| (TCPOPT_NOP << 16) \|
				820	(TCPOPT_TIMESTAMP << 8) \|
				821	TCPOLEN_TIMESTAMP);
				822	rep.opt[1] = htonl(tsval);
				823	rep.opt[2] = htonl(tsecr);
				824	arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
				825	}
				826
				827	/* Swap the send and the receive. */
				828	rep.th.dest = th->source;
				829	rep.th.source = th->dest;
				830	rep.th.doff = arg.iov[0].iov_len / 4;
				831	rep.th.seq = htonl(seq);
				832	rep.th.ack_seq = htonl(ack);
				833	rep.th.ack = 1;
				834	rep.th.window = htons(win);
				835
				836	#ifdef CONFIG_TCP_MD5SIG
				837	if (key) {
				838	int offset = (tsecr) ? 3 : 0;
				839
				840	rep.opt[offset++] = htonl((TCPOPT_NOP << 24) \|
				841	(TCPOPT_NOP << 16) \|
				842	(TCPOPT_MD5SIG << 8) \|
				843	TCPOLEN_MD5SIG);
				844	arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
				845	rep.th.doff = arg.iov[0].iov_len/4;
				846
				847	tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				848	key, ip_hdr(skb)->saddr,
				849	ip_hdr(skb)->daddr, &rep.th);
				850	}
				851	#endif
				852	arg.flags = reply_flags;
				853	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				854	ip_hdr(skb)->saddr, /* XXX */
				855	arg.iov[0].iov_len, IPPROTO_TCP, 0);
				856	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
				857	if (oif)
				858	arg.bound_dev_if = oif;
				859	arg.tos = tos;
				860	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
				861	local_bh_disable();
				862	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
				863	if (sk)
				864	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				865	inet_twsk(sk)->tw_mark : sk->sk_mark;
				866	ip_send_unicast_reply(ctl_sk,
				867	skb, &TCP_SKB_CB(skb)->header.h4.opt,
				868	ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
				869	&arg, arg.iov[0].iov_len);
				870
				871	ctl_sk->sk_mark = 0;
				872	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
				873	local_bh_enable();
				874	}
				875
				876	static void tcp_v4_timewait_ack(struct sock sk, struct sk_buff skb)
				877	{
				878	struct inet_timewait_sock *tw = inet_twsk(sk);
				879	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
				880
				881	tcp_v4_send_ack(sk, skb,
				882	tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
				883	tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
				884	tcp_time_stamp_raw() + tcptw->tw_ts_offset,
				885	tcptw->tw_ts_recent,
				886	tw->tw_bound_dev_if,
				887	tcp_twsk_md5_key(tcptw),
				888	tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
				889	tw->tw_tos
				890	);
				891
				892	inet_twsk_put(tw);
				893	}
				894
				895	static void tcp_v4_reqsk_send_ack(const struct sock sk, struct sk_buff skb,
				896	struct request_sock *req)
				897	{
				898	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
				899	* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
				900	*/
				901	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
				902	tcp_sk(sk)->snd_nxt;
				903
				904	/* RFC 7323 2.3
				905	* The window field (SEG.WND) of every outgoing segment, with the
				906	* exception of <SYN> segments, MUST be right-shifted by
				907	* Rcv.Wind.Shift bits:
				908	*/
				909	tcp_v4_send_ack(sk, skb, seq,
				910	tcp_rsk(req)->rcv_nxt,
				911	req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
				912	tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
				913	req->ts_recent,
				914	0,
				915	tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
				916	AF_INET),
				917	inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
				918	ip_hdr(skb)->tos);
				919	}
				920
				921	/*
				922	* Send a SYN-ACK after having received a SYN.
				923	* This still operates on a request_sock only, not on a big
				924	* socket.
				925	*/
				926	static int tcp_v4_send_synack(const struct sock sk, struct dst_entry dst,
				927	struct flowi *fl,
				928	struct request_sock *req,
				929	struct tcp_fastopen_cookie *foc,
				930	enum tcp_synack_type synack_type)
				931	{
				932	const struct inet_request_sock *ireq = inet_rsk(req);
				933	struct flowi4 fl4;
				934	int err = -1;
				935	struct sk_buff *skb;
				936
				937	/* First, grab a route. */
				938	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
				939	return -1;
				940
				941	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
				942
				943	if (skb) {
				944	__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
				945
				946	rcu_read_lock();
				947	err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
				948	ireq->ir_rmt_addr,
				949	rcu_dereference(ireq->ireq_opt));
				950	rcu_read_unlock();
				951	err = net_xmit_eval(err);
				952	}
				953
				954	return err;
				955	}
				956
				957	/*
				958	* IPv4 request_sock destructor.
				959	*/
				960	static void tcp_v4_reqsk_destructor(struct request_sock *req)
				961	{
				962	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
				963	}
				964
				965	#ifdef CONFIG_TCP_MD5SIG
				966	/*
				967	* RFC2385 MD5 checksumming requires a mapping of
				968	* IP address->MD5 Key.
				969	* We need to maintain these in the sk structure.
				970	*/
				971
				972	/* Find the Key structure for an address. */
				973	struct tcp_md5sig_key tcp_md5_do_lookup(const struct sock sk,
				974	const union tcp_md5_addr *addr,
				975	int family)
				976	{
				977	const struct tcp_sock *tp = tcp_sk(sk);
				978	struct tcp_md5sig_key *key;
				979	const struct tcp_md5sig_info *md5sig;
				980	__be32 mask;
				981	struct tcp_md5sig_key *best_match = NULL;
				982	bool match;
				983
				984	/* caller either holds rcu_read_lock() or socket lock */
				985	md5sig = rcu_dereference_check(tp->md5sig_info,
				986	lockdep_sock_is_held(sk));
				987	if (!md5sig)
				988	return NULL;
				989
				990	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
				991	if (key->family != family)
				992	continue;
				993
				994	if (family == AF_INET) {
				995	mask = inet_make_mask(key->prefixlen);
				996	match = (key->addr.a4.s_addr & mask) ==
				997	(addr->a4.s_addr & mask);
				998	#if IS_ENABLED(CONFIG_IPV6)
				999	} else if (family == AF_INET6) {
				1000	match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
				1001	key->prefixlen);
				1002	#endif
				1003	} else {
				1004	match = false;
				1005	}
				1006
				1007	if (match && (!best_match \|\|
				1008	key->prefixlen > best_match->prefixlen))
				1009	best_match = key;
				1010	}
				1011	return best_match;
				1012	}
				1013	EXPORT_SYMBOL(tcp_md5_do_lookup);
				1014
				1015	static struct tcp_md5sig_key tcp_md5_do_lookup_exact(const struct sock sk,
				1016	const union tcp_md5_addr *addr,
				1017	int family, u8 prefixlen)
				1018	{
				1019	const struct tcp_sock *tp = tcp_sk(sk);
				1020	struct tcp_md5sig_key *key;
				1021	unsigned int size = sizeof(struct in_addr);
				1022	const struct tcp_md5sig_info *md5sig;
				1023
				1024	/* caller either holds rcu_read_lock() or socket lock */
				1025	md5sig = rcu_dereference_check(tp->md5sig_info,
				1026	lockdep_sock_is_held(sk));
				1027	if (!md5sig)
				1028	return NULL;
				1029	#if IS_ENABLED(CONFIG_IPV6)
				1030	if (family == AF_INET6)
				1031	size = sizeof(struct in6_addr);
				1032	#endif
				1033	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
				1034	if (key->family != family)
				1035	continue;
				1036	if (!memcmp(&key->addr, addr, size) &&
				1037	key->prefixlen == prefixlen)
				1038	return key;
				1039	}
				1040	return NULL;
				1041	}
				1042
				1043	struct tcp_md5sig_key tcp_v4_md5_lookup(const struct sock sk,
				1044	const struct sock *addr_sk)
				1045	{
				1046	const union tcp_md5_addr *addr;
				1047
				1048	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
				1049	return tcp_md5_do_lookup(sk, addr, AF_INET);
				1050	}
				1051	EXPORT_SYMBOL(tcp_v4_md5_lookup);
				1052
				1053	/* This can be called on a newly created socket, from other files */
				1054	int tcp_md5_do_add(struct sock sk, const union tcp_md5_addr addr,
				1055	int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
				1056	gfp_t gfp)
				1057	{
				1058	/* Add Key to the list */
				1059	struct tcp_md5sig_key *key;
				1060	struct tcp_sock *tp = tcp_sk(sk);
				1061	struct tcp_md5sig_info *md5sig;
				1062
				1063	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
				1064	if (key) {
				1065	/* Pre-existing entry - just update that one. */
				1066	memcpy(key->key, newkey, newkeylen);
				1067	key->keylen = newkeylen;
				1068	return 0;
				1069	}
				1070
				1071	md5sig = rcu_dereference_protected(tp->md5sig_info,
				1072	lockdep_sock_is_held(sk));
				1073	if (!md5sig) {
				1074	md5sig = kmalloc(sizeof(*md5sig), gfp);
				1075	if (!md5sig)
				1076	return -ENOMEM;
				1077
				1078	sk_nocaps_add(sk, NETIF_F_GSO_MASK);
				1079	INIT_HLIST_HEAD(&md5sig->head);
				1080	rcu_assign_pointer(tp->md5sig_info, md5sig);
				1081	}
				1082
				1083	key = sock_kmalloc(sk, sizeof(*key), gfp);
				1084	if (!key)
				1085	return -ENOMEM;
				1086	if (!tcp_alloc_md5sig_pool()) {
				1087	sock_kfree_s(sk, key, sizeof(*key));
				1088	return -ENOMEM;
				1089	}
				1090
				1091	memcpy(key->key, newkey, newkeylen);
				1092	key->keylen = newkeylen;
				1093	key->family = family;
				1094	key->prefixlen = prefixlen;
				1095	memcpy(&key->addr, addr,
				1096	(family == AF_INET6) ? sizeof(struct in6_addr) :
				1097	sizeof(struct in_addr));
				1098	hlist_add_head_rcu(&key->node, &md5sig->head);
				1099	return 0;
				1100	}
				1101	EXPORT_SYMBOL(tcp_md5_do_add);
				1102
				1103	int tcp_md5_do_del(struct sock sk, const union tcp_md5_addr addr, int family,
				1104	u8 prefixlen)
				1105	{
				1106	struct tcp_md5sig_key *key;
				1107
				1108	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
				1109	if (!key)
				1110	return -ENOENT;
				1111	hlist_del_rcu(&key->node);
				1112	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
				1113	kfree_rcu(key, rcu);
				1114	return 0;
				1115	}
				1116	EXPORT_SYMBOL(tcp_md5_do_del);
				1117
				1118	static void tcp_clear_md5_list(struct sock *sk)
				1119	{
				1120	struct tcp_sock *tp = tcp_sk(sk);
				1121	struct tcp_md5sig_key *key;
				1122	struct hlist_node *n;
				1123	struct tcp_md5sig_info *md5sig;
				1124
				1125	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
				1126
				1127	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
				1128	hlist_del_rcu(&key->node);
				1129	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
				1130	kfree_rcu(key, rcu);
				1131	}
				1132	}
				1133
				1134	static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				1135	char __user *optval, int optlen)
				1136	{
				1137	struct tcp_md5sig cmd;
				1138	struct sockaddr_in sin = (struct sockaddr_in )&cmd.tcpm_addr;
				1139	u8 prefixlen = 32;
				1140
				1141	if (optlen < sizeof(cmd))
				1142	return -EINVAL;
				1143
				1144	if (copy_from_user(&cmd, optval, sizeof(cmd)))
				1145	return -EFAULT;
				1146
				1147	if (sin->sin_family != AF_INET)
				1148	return -EINVAL;
				1149
				1150	if (optname == TCP_MD5SIG_EXT &&
				1151	cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
				1152	prefixlen = cmd.tcpm_prefixlen;
				1153	if (prefixlen > 32)
				1154	return -EINVAL;
				1155	}
				1156
				1157	if (!cmd.tcpm_keylen)
				1158	return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				1159	AF_INET, prefixlen);
				1160
				1161	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
				1162	return -EINVAL;
				1163
				1164	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				1165	AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
				1166	GFP_KERNEL);
				1167	}
				1168
				1169	static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				1170	__be32 daddr, __be32 saddr,
				1171	const struct tcphdr *th, int nbytes)
				1172	{
				1173	struct tcp4_pseudohdr *bp;
				1174	struct scatterlist sg;
				1175	struct tcphdr *_th;
				1176
				1177	bp = hp->scratch;
				1178	bp->saddr = saddr;
				1179	bp->daddr = daddr;
				1180	bp->pad = 0;
				1181	bp->protocol = IPPROTO_TCP;
				1182	bp->len = cpu_to_be16(nbytes);
				1183
				1184	_th = (struct tcphdr *)(bp + 1);
				1185	memcpy(_th, th, sizeof(*th));
				1186	_th->check = 0;
				1187
				1188	sg_init_one(&sg, bp, sizeof(bp) + sizeof(th));
				1189	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				1190	sizeof(bp) + sizeof(th));
				1191	return crypto_ahash_update(hp->md5_req);
				1192	}
				1193
				1194	static int tcp_v4_md5_hash_hdr(char md5_hash, const struct tcp_md5sig_key key,
				1195	__be32 daddr, __be32 saddr, const struct tcphdr *th)
				1196	{
				1197	struct tcp_md5sig_pool *hp;
				1198	struct ahash_request *req;
				1199
				1200	hp = tcp_get_md5sig_pool();
				1201	if (!hp)
				1202	goto clear_hash_noput;
				1203	req = hp->md5_req;
				1204
				1205	if (crypto_ahash_init(req))
				1206	goto clear_hash;
				1207	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
				1208	goto clear_hash;
				1209	if (tcp_md5_hash_key(hp, key))
				1210	goto clear_hash;
				1211	ahash_request_set_crypt(req, NULL, md5_hash, 0);
				1212	if (crypto_ahash_final(req))
				1213	goto clear_hash;
				1214
				1215	tcp_put_md5sig_pool();
				1216	return 0;
				1217
				1218	clear_hash:
				1219	tcp_put_md5sig_pool();
				1220	clear_hash_noput:
				1221	memset(md5_hash, 0, 16);
				1222	return 1;
				1223	}
				1224
				1225	int tcp_v4_md5_hash_skb(char md5_hash, const struct tcp_md5sig_key key,
				1226	const struct sock *sk,
				1227	const struct sk_buff *skb)
				1228	{
				1229	struct tcp_md5sig_pool *hp;
				1230	struct ahash_request *req;
				1231	const struct tcphdr *th = tcp_hdr(skb);
				1232	__be32 saddr, daddr;
				1233
				1234	if (sk) { /* valid for establish/request sockets */
				1235	saddr = sk->sk_rcv_saddr;
				1236	daddr = sk->sk_daddr;
				1237	} else {
				1238	const struct iphdr *iph = ip_hdr(skb);
				1239	saddr = iph->saddr;
				1240	daddr = iph->daddr;
				1241	}
				1242
				1243	hp = tcp_get_md5sig_pool();
				1244	if (!hp)
				1245	goto clear_hash_noput;
				1246	req = hp->md5_req;
				1247
				1248	if (crypto_ahash_init(req))
				1249	goto clear_hash;
				1250
				1251	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
				1252	goto clear_hash;
				1253	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
				1254	goto clear_hash;
				1255	if (tcp_md5_hash_key(hp, key))
				1256	goto clear_hash;
				1257	ahash_request_set_crypt(req, NULL, md5_hash, 0);
				1258	if (crypto_ahash_final(req))
				1259	goto clear_hash;
				1260
				1261	tcp_put_md5sig_pool();
				1262	return 0;
				1263
				1264	clear_hash:
				1265	tcp_put_md5sig_pool();
				1266	clear_hash_noput:
				1267	memset(md5_hash, 0, 16);
				1268	return 1;
				1269	}
				1270	EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
				1271
				1272	#endif
				1273
				1274	/* Called with rcu_read_lock() */
				1275	static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				1276	const struct sk_buff *skb)
				1277	{
				1278	#ifdef CONFIG_TCP_MD5SIG
				1279	/*
				1280	* This gets called for each TCP segment that arrives
				1281	* so we want to be efficient.
				1282	* We have 3 drop cases:
				1283	* o No MD5 hash and one expected.
				1284	* o MD5 hash and we're not expecting one.
				1285	* o MD5 hash and its wrong.
				1286	*/
				1287	const __u8 *hash_location = NULL;
				1288	struct tcp_md5sig_key *hash_expected;
				1289	const struct iphdr *iph = ip_hdr(skb);
				1290	const struct tcphdr *th = tcp_hdr(skb);
				1291	int genhash;
				1292	unsigned char newhash[16];
				1293
				1294	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
				1295	AF_INET);
				1296	hash_location = tcp_parse_md5sig_option(th);
				1297
				1298	/* We've parsed the options - do we have a hash? */
				1299	if (!hash_expected && !hash_location)
				1300	return false;
				1301
				1302	if (hash_expected && !hash_location) {
				1303	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
				1304	return true;
				1305	}
				1306
				1307	if (!hash_expected && hash_location) {
				1308	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
				1309	return true;
				1310	}
				1311
				1312	/* Okay, so this is hash_expected and hash_location -
				1313	* so we need to calculate the checksum.
				1314	*/
				1315	genhash = tcp_v4_md5_hash_skb(newhash,
				1316	hash_expected,
				1317	NULL, skb);
				1318
				1319	if (genhash \|\| memcmp(hash_location, newhash, 16) != 0) {
				1320	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
				1321	net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				1322	&iph->saddr, ntohs(th->source),
				1323	&iph->daddr, ntohs(th->dest),
				1324	genhash ? " tcp_v4_calc_md5_hash failed"
				1325	: "");
				1326	return true;
				1327	}
				1328	return false;
				1329	#endif
				1330	return false;
				1331	}
				1332
				1333	static void tcp_v4_init_req(struct request_sock *req,
				1334	const struct sock *sk_listener,
				1335	struct sk_buff *skb)
				1336	{
				1337	struct inet_request_sock *ireq = inet_rsk(req);
				1338	struct net *net = sock_net(sk_listener);
				1339
				1340	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
				1341	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
				1342	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
				1343	}
				1344
				1345	static struct dst_entry tcp_v4_route_req(const struct sock sk,
				1346	struct flowi *fl,
				1347	const struct request_sock *req)
				1348	{
				1349	return inet_csk_route_req(sk, &fl->u.ip4, req);
				1350	}
				1351
				1352	struct request_sock_ops tcp_request_sock_ops __read_mostly = {
				1353	.family = PF_INET,
				1354	.obj_size = sizeof(struct tcp_request_sock),
				1355	.rtx_syn_ack = tcp_rtx_synack,
				1356	.send_ack = tcp_v4_reqsk_send_ack,
				1357	.destructor = tcp_v4_reqsk_destructor,
				1358	.send_reset = tcp_v4_send_reset,
				1359	.syn_ack_timeout = tcp_syn_ack_timeout,
				1360	};
				1361
				1362	static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
				1363	.mss_clamp = TCP_MSS_DEFAULT,
				1364	#ifdef CONFIG_TCP_MD5SIG
				1365	.req_md5_lookup = tcp_v4_md5_lookup,
				1366	.calc_md5_hash = tcp_v4_md5_hash_skb,
				1367	#endif
				1368	.init_req = tcp_v4_init_req,
				1369	#ifdef CONFIG_SYN_COOKIES
				1370	.cookie_init_seq = cookie_v4_init_sequence,
				1371	#endif
				1372	.route_req = tcp_v4_route_req,
				1373	.init_seq = tcp_v4_init_seq,
				1374	.init_ts_off = tcp_v4_init_ts_off,
				1375	.send_synack = tcp_v4_send_synack,
				1376	};
				1377
				1378	int tcp_v4_conn_request(struct sock sk, struct sk_buff skb)
				1379	{
				1380	/* Never answer to SYNs send to broadcast or multicast */
				1381	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST \| RTCF_MULTICAST))
				1382	goto drop;
				1383
				1384	return tcp_conn_request(&tcp_request_sock_ops,
				1385	&tcp_request_sock_ipv4_ops, sk, skb);
				1386
				1387	drop:
				1388	tcp_listendrop(sk);
				1389	return 0;
				1390	}
				1391	EXPORT_SYMBOL(tcp_v4_conn_request);
				1392
				1393
				1394	/*
				1395	* The three way handshake has completed - we got a valid synack -
				1396	* now create the new socket.
				1397	*/
				1398	struct sock tcp_v4_syn_recv_sock(const struct sock sk, struct sk_buff *skb,
				1399	struct request_sock *req,
				1400	struct dst_entry *dst,
				1401	struct request_sock *req_unhash,
				1402	bool *own_req)
				1403	{
				1404	struct inet_request_sock *ireq;
				1405	struct inet_sock *newinet;
				1406	struct tcp_sock *newtp;
				1407	struct sock *newsk;
				1408	#ifdef CONFIG_TCP_MD5SIG
				1409	struct tcp_md5sig_key *key;
				1410	#endif
				1411	struct ip_options_rcu *inet_opt;
				1412
				1413	if (sk_acceptq_is_full(sk))
				1414	goto exit_overflow;
				1415
				1416	newsk = tcp_create_openreq_child(sk, req, skb);
				1417	if (!newsk)
				1418	goto exit_nonewsk;
				1419
				1420	newsk->sk_gso_type = SKB_GSO_TCPV4;
				1421	inet_sk_rx_dst_set(newsk, skb);
				1422
				1423	newtp = tcp_sk(newsk);
				1424	newinet = inet_sk(newsk);
				1425	ireq = inet_rsk(req);
				1426	sk_daddr_set(newsk, ireq->ir_rmt_addr);
				1427	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
				1428	newsk->sk_bound_dev_if = ireq->ir_iif;
				1429	newinet->inet_saddr = ireq->ir_loc_addr;
				1430	inet_opt = rcu_dereference(ireq->ireq_opt);
				1431	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
				1432	newinet->mc_index = inet_iif(skb);
				1433	newinet->mc_ttl = ip_hdr(skb)->ttl;
				1434	newinet->rcv_tos = ip_hdr(skb)->tos;
				1435	inet_csk(newsk)->icsk_ext_hdr_len = 0;
				1436	if (inet_opt)
				1437	inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
				1438	newinet->inet_id = newtp->write_seq ^ jiffies;
				1439
				1440	if (!dst) {
				1441	dst = inet_csk_route_child_sock(sk, newsk, req);
				1442	if (!dst)
				1443	goto put_and_exit;
				1444	} else {
				1445	/* syncookie case : see end of cookie_v4_check() */
				1446	}
				1447	sk_setup_caps(newsk, dst);
				1448
				1449	tcp_ca_openreq_child(newsk, dst);
				1450
				1451	tcp_sync_mss(newsk, dst_mtu(dst));
				1452	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
				1453
				1454	tcp_initialize_rcv_mss(newsk);
				1455
				1456	#ifdef CONFIG_TCP_MD5SIG
				1457	/* Copy over the MD5 key from the original socket */
				1458	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				1459	AF_INET);
				1460	if (key) {
				1461	/*
				1462	* We're using one, so create a matching key
				1463	* on the newsk structure. If we fail to get
				1464	* memory, then we end up not copying the key
				1465	* across. Shucks.
				1466	*/
				1467	tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
				1468	AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
				1469	sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
				1470	}
				1471	#endif
				1472
				1473	if (__inet_inherit_port(sk, newsk) < 0)
				1474	goto put_and_exit;
				1475	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
				1476	if (likely(*own_req)) {
				1477	tcp_move_syn(newtp, req);
				1478	ireq->ireq_opt = NULL;
				1479	} else {
				1480	newinet->inet_opt = NULL;
				1481	}
				1482	return newsk;
				1483
				1484	exit_overflow:
				1485	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
				1486	exit_nonewsk:
				1487	dst_release(dst);
				1488	exit:
				1489	tcp_listendrop(sk);
				1490	return NULL;
				1491	put_and_exit:
				1492	newinet->inet_opt = NULL;
				1493	inet_csk_prepare_forced_close(newsk);
				1494	tcp_done(newsk);
				1495	goto exit;
				1496	}
				1497	EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
				1498
				1499	static struct sock tcp_v4_cookie_check(struct sock sk, struct sk_buff *skb)
				1500	{
				1501	#ifdef CONFIG_SYN_COOKIES
				1502	const struct tcphdr *th = tcp_hdr(skb);
				1503
				1504	if (!th->syn)
				1505	sk = cookie_v4_check(sk, skb);
				1506	#endif
				1507	return sk;
				1508	}
				1509
				1510	/* The socket must have it's spinlock held when we get
				1511	* here, unless it is a TCP_LISTEN socket.
				1512	*
				1513	* We have a potential double-lock case here, so even when
				1514	* doing backlog processing we use the BH locking scheme.
				1515	* This is because we cannot sleep with the original spinlock
				1516	* held.
				1517	*/
				1518	int tcp_v4_do_rcv(struct sock sk, struct sk_buff skb)
				1519	{
				1520	struct sock *rsk;
				1521
				1522	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
				1523	struct dst_entry *dst = sk->sk_rx_dst;
				1524
				1525	sock_rps_save_rxhash(sk, skb);
				1526	sk_mark_napi_id(sk, skb);
				1527	if (dst) {
				1528	if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif \|\|
				1529	!dst->ops->check(dst, 0)) {
				1530	dst_release(dst);
				1531	sk->sk_rx_dst = NULL;
				1532	}
				1533	}
				1534	tcp_rcv_established(sk, skb);
				1535	return 0;
				1536	}
				1537
				1538	if (tcp_checksum_complete(skb))
				1539	goto csum_err;
				1540
				1541	if (sk->sk_state == TCP_LISTEN) {
				1542	struct sock *nsk = tcp_v4_cookie_check(sk, skb);
				1543
				1544	if (!nsk)
				1545	goto discard;
				1546	if (nsk != sk) {
				1547	if (tcp_child_process(sk, nsk, skb)) {
				1548	rsk = nsk;
				1549	goto reset;
				1550	}
				1551	return 0;
				1552	}
				1553	} else
				1554	sock_rps_save_rxhash(sk, skb);
				1555
				1556	if (tcp_rcv_state_process(sk, skb)) {
				1557	rsk = sk;
				1558	goto reset;
				1559	}
				1560	return 0;
				1561
				1562	reset:
				1563	tcp_v4_send_reset(rsk, skb);
				1564	discard:
				1565	kfree_skb(skb);
				1566	/* Be careful here. If this function gets more complicated and
				1567	* gcc suffers from register pressure on the x86, sk (in %ebx)
				1568	* might be destroyed here. This current version compiles correctly,
				1569	* but you have been warned.
				1570	*/
				1571	return 0;
				1572
				1573	csum_err:
				1574	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
				1575	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
				1576	goto discard;
				1577	}
				1578	EXPORT_SYMBOL(tcp_v4_do_rcv);
				1579
				1580	int tcp_v4_early_demux(struct sk_buff *skb)
				1581	{
				1582	const struct iphdr *iph;
				1583	const struct tcphdr *th;
				1584	struct sock *sk;
				1585
				1586	if (skb->pkt_type != PACKET_HOST)
				1587	return 0;
				1588
				1589	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
				1590	return 0;
				1591
				1592	iph = ip_hdr(skb);
				1593	th = tcp_hdr(skb);
				1594
				1595	if (th->doff < sizeof(struct tcphdr) / 4)
				1596	return 0;
				1597
				1598	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				1599	iph->saddr, th->source,
				1600	iph->daddr, ntohs(th->dest),
				1601	skb->skb_iif, inet_sdif(skb));
				1602	if (sk) {
				1603	skb->sk = sk;
				1604	skb->destructor = sock_edemux;
				1605	if (sk_fullsock(sk)) {
				1606	struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
				1607
				1608	if (dst)
				1609	dst = dst_check(dst, 0);
				1610	if (dst &&
				1611	inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				1612	skb_dst_set_noref(skb, dst);
				1613	}
				1614	}
				1615	return 0;
				1616	}
				1617
				1618	bool tcp_add_backlog(struct sock sk, struct sk_buff skb)
				1619	{
				1620	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
				1621
				1622	/* Only socket owner can try to collapse/prune rx queues
				1623	* to reduce memory overhead, so add a little headroom here.
				1624	* Few sockets backlog are possibly concurrently non empty.
				1625	*/
				1626	limit += 64*1024;
				1627
				1628	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
				1629	* we can fix skb->truesize to its real value to avoid future drops.
				1630	* This is valid because skb is not yet charged to the socket.
				1631	* It has been noticed pure SACK packets were sometimes dropped
				1632	* (if cooked by drivers without copybreak feature).
				1633	*/
				1634	skb_condense(skb);
				1635
				1636	if (unlikely(sk_add_backlog(sk, skb, limit))) {
				1637	bh_unlock_sock(sk);
				1638	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
				1639	return true;
				1640	}
				1641	return false;
				1642	}
				1643	EXPORT_SYMBOL(tcp_add_backlog);
				1644
				1645	int tcp_filter(struct sock sk, struct sk_buff skb)
				1646	{
				1647	struct tcphdr th = (struct tcphdr )skb->data;
				1648	unsigned int eaten = skb->len;
				1649	int err;
				1650
				1651	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
				1652	if (!err) {
				1653	eaten -= skb->len;
				1654	TCP_SKB_CB(skb)->end_seq -= eaten;
				1655	}
				1656	return err;
				1657	}
				1658	EXPORT_SYMBOL(tcp_filter);
				1659
				1660	static void tcp_v4_restore_cb(struct sk_buff *skb)
				1661	{
				1662	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
				1663	sizeof(struct inet_skb_parm));
				1664	}
				1665
				1666	static void tcp_v4_fill_cb(struct sk_buff skb, const struct iphdr iph,
				1667	const struct tcphdr *th)
				1668	{
				1669	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
				1670	* barrier() makes sure compiler wont play fool^Waliasing games.
				1671	*/
				1672	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
				1673	sizeof(struct inet_skb_parm));
				1674	barrier();
				1675
				1676	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
				1677	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				1678	skb->len - th->doff * 4);
				1679	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
				1680	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
				1681	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
				1682	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
				1683	TCP_SKB_CB(skb)->sacked = 0;
				1684	TCP_SKB_CB(skb)->has_rxtstamp =
				1685	skb->tstamp \|\| skb_hwtstamps(skb)->hwtstamp;
				1686	}
				1687
				1688	/*
				1689	* From tcp_input.c
				1690	*/
				1691
				1692	int tcp_v4_rcv(struct sk_buff *skb)
				1693	{
				1694	struct net *net = dev_net(skb->dev);
				1695	int sdif = inet_sdif(skb);
				1696	const struct iphdr *iph;
				1697	const struct tcphdr *th;
				1698	bool refcounted;
				1699	struct sock *sk;
				1700	int ret;
				1701
				1702	if (skb->pkt_type != PACKET_HOST)
				1703	goto discard_it;
				1704
				1705	/* Count it even if it's bad */
				1706	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
				1707
				1708	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
				1709	goto discard_it;
				1710
				1711	th = (const struct tcphdr *)skb->data;
				1712
				1713	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
				1714	goto bad_packet;
				1715	if (!pskb_may_pull(skb, th->doff * 4))
				1716	goto discard_it;
				1717
				1718	/* An explanation is required here, I think.
				1719	* Packet length and doff are validated by header prediction,
				1720	* provided case of th->doff==0 is eliminated.
				1721	* So, we defer the checks. */
				1722
				1723	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
				1724	goto csum_error;
				1725
				1726	th = (const struct tcphdr *)skb->data;
				1727	iph = ip_hdr(skb);
				1728	lookup:
				1729	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
				1730	th->dest, sdif, &refcounted);
				1731	if (!sk)
				1732	goto no_tcp_socket;
				1733
				1734	process:
				1735	if (sk->sk_state == TCP_TIME_WAIT)
				1736	goto do_time_wait;
				1737
				1738	if (sk->sk_state == TCP_NEW_SYN_RECV) {
				1739	struct request_sock *req = inet_reqsk(sk);
				1740	bool req_stolen = false;
				1741	struct sock *nsk;
				1742
				1743	sk = req->rsk_listener;
				1744	if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
				1745	sk_drops_add(sk, skb);
				1746	reqsk_put(req);
				1747	goto discard_it;
				1748	}
				1749	if (tcp_checksum_complete(skb)) {
				1750	reqsk_put(req);
				1751	goto csum_error;
				1752	}
				1753	if (unlikely(sk->sk_state != TCP_LISTEN)) {
				1754	inet_csk_reqsk_queue_drop_and_put(sk, req);
				1755	goto lookup;
				1756	}
				1757	/* We own a reference on the listener, increase it again
				1758	* as we might lose it too soon.
				1759	*/
				1760	sock_hold(sk);
				1761	refcounted = true;
				1762	nsk = NULL;
				1763	if (!tcp_filter(sk, skb)) {
				1764	th = (const struct tcphdr *)skb->data;
				1765	iph = ip_hdr(skb);
				1766	tcp_v4_fill_cb(skb, iph, th);
				1767	nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
				1768	}
				1769	if (!nsk) {
				1770	reqsk_put(req);
				1771	if (req_stolen) {
				1772	/* Another cpu got exclusive access to req
				1773	* and created a full blown socket.
				1774	* Try to feed this packet to this socket
				1775	* instead of discarding it.
				1776	*/
				1777	tcp_v4_restore_cb(skb);
				1778	sock_put(sk);
				1779	goto lookup;
				1780	}
				1781	goto discard_and_relse;
				1782	}
				1783	if (nsk == sk) {
				1784	reqsk_put(req);
				1785	tcp_v4_restore_cb(skb);
				1786	} else if (tcp_child_process(sk, nsk, skb)) {
				1787	tcp_v4_send_reset(nsk, skb);
				1788	goto discard_and_relse;
				1789	} else {
				1790	sock_put(sk);
				1791	return 0;
				1792	}
				1793	}
				1794	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
				1795	__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
				1796	goto discard_and_relse;
				1797	}
				1798
				1799	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
				1800	goto discard_and_relse;
				1801
				1802	if (tcp_v4_inbound_md5_hash(sk, skb))
				1803	goto discard_and_relse;
				1804
				1805	nf_reset(skb);
				1806
				1807	if (tcp_filter(sk, skb))
				1808	goto discard_and_relse;
				1809	th = (const struct tcphdr *)skb->data;
				1810	iph = ip_hdr(skb);
				1811	tcp_v4_fill_cb(skb, iph, th);
				1812
				1813	skb->dev = NULL;
				1814
				1815	if (sk->sk_state == TCP_LISTEN) {
				1816	ret = tcp_v4_do_rcv(sk, skb);
				1817	goto put_and_return;
				1818	}
				1819
				1820	sk_incoming_cpu_update(sk);
				1821
				1822	bh_lock_sock_nested(sk);
				1823	tcp_segs_in(tcp_sk(sk), skb);
				1824	ret = 0;
				1825	if (!sock_owned_by_user(sk)) {
				1826	ret = tcp_v4_do_rcv(sk, skb);
				1827	} else if (tcp_add_backlog(sk, skb)) {
				1828	goto discard_and_relse;
				1829	}
				1830	bh_unlock_sock(sk);
				1831
				1832	put_and_return:
				1833	if (refcounted)
				1834	sock_put(sk);
				1835
				1836	return ret;
				1837
				1838	no_tcp_socket:
				1839	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
				1840	goto discard_it;
				1841
				1842	tcp_v4_fill_cb(skb, iph, th);
				1843
				1844	if (tcp_checksum_complete(skb)) {
				1845	csum_error:
				1846	__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
				1847	bad_packet:
				1848	__TCP_INC_STATS(net, TCP_MIB_INERRS);
				1849	} else {
				1850	tcp_v4_send_reset(NULL, skb);
				1851	}
				1852
				1853	discard_it:
				1854	/* Discard frame. */
				1855	kfree_skb(skb);
				1856	return 0;
				1857
				1858	discard_and_relse:
				1859	sk_drops_add(sk, skb);
				1860	if (refcounted)
				1861	sock_put(sk);
				1862	goto discard_it;
				1863
				1864	do_time_wait:
				1865	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
				1866	inet_twsk_put(inet_twsk(sk));
				1867	goto discard_it;
				1868	}
				1869
				1870	tcp_v4_fill_cb(skb, iph, th);
				1871
				1872	if (tcp_checksum_complete(skb)) {
				1873	inet_twsk_put(inet_twsk(sk));
				1874	goto csum_error;
				1875	}
				1876	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
				1877	case TCP_TW_SYN: {
				1878	struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
				1879	&tcp_hashinfo, skb,
				1880	__tcp_hdrlen(th),
				1881	iph->saddr, th->source,
				1882	iph->daddr, th->dest,
				1883	inet_iif(skb),
				1884	sdif);
				1885	if (sk2) {
				1886	inet_twsk_deschedule_put(inet_twsk(sk));
				1887	sk = sk2;
				1888	tcp_v4_restore_cb(skb);
				1889	refcounted = false;
				1890	goto process;
				1891	}
				1892	}
				1893	/* to ACK */
				1894	/* fall through */
				1895	case TCP_TW_ACK:
				1896	tcp_v4_timewait_ack(sk, skb);
				1897	break;
				1898	case TCP_TW_RST:
				1899	tcp_v4_send_reset(sk, skb);
				1900	inet_twsk_deschedule_put(inet_twsk(sk));
				1901	goto discard_it;
				1902	case TCP_TW_SUCCESS:;
				1903	}
				1904	goto discard_it;
				1905	}
				1906
				1907	static struct timewait_sock_ops tcp_timewait_sock_ops = {
				1908	.twsk_obj_size = sizeof(struct tcp_timewait_sock),
				1909	.twsk_unique = tcp_twsk_unique,
				1910	.twsk_destructor= tcp_twsk_destructor,
				1911	};
				1912
				1913	void inet_sk_rx_dst_set(struct sock sk, const struct sk_buff skb)
				1914	{
				1915	struct dst_entry *dst = skb_dst(skb);
				1916
				1917	if (dst && dst_hold_safe(dst)) {
				1918	sk->sk_rx_dst = dst;
				1919	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
				1920	}
				1921	}
				1922	EXPORT_SYMBOL(inet_sk_rx_dst_set);
				1923
				1924	const struct inet_connection_sock_af_ops ipv4_specific = {
				1925	.queue_xmit = ip_queue_xmit,
				1926	.send_check = tcp_v4_send_check,
				1927	.rebuild_header = inet_sk_rebuild_header,
				1928	.sk_rx_dst_set = inet_sk_rx_dst_set,
				1929	.conn_request = tcp_v4_conn_request,
				1930	.syn_recv_sock = tcp_v4_syn_recv_sock,
				1931	.net_header_len = sizeof(struct iphdr),
				1932	.setsockopt = ip_setsockopt,
				1933	.getsockopt = ip_getsockopt,
				1934	.addr2sockaddr = inet_csk_addr2sockaddr,
				1935	.sockaddr_len = sizeof(struct sockaddr_in),
				1936	#ifdef CONFIG_COMPAT
				1937	.compat_setsockopt = compat_ip_setsockopt,
				1938	.compat_getsockopt = compat_ip_getsockopt,
				1939	#endif
				1940	.mtu_reduced = tcp_v4_mtu_reduced,
				1941	};
				1942	EXPORT_SYMBOL(ipv4_specific);
				1943
				1944	#ifdef CONFIG_TCP_MD5SIG
				1945	static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
				1946	.md5_lookup = tcp_v4_md5_lookup,
				1947	.calc_md5_hash = tcp_v4_md5_hash_skb,
				1948	.md5_parse = tcp_v4_parse_md5_keys,
				1949	};
				1950	#endif
				1951
				1952	/* NOTE: A lot of things set to zero explicitly by call to
				1953	* sk_alloc() so need not be done here.
				1954	*/
				1955	static int tcp_v4_init_sock(struct sock *sk)
				1956	{
				1957	struct inet_connection_sock *icsk = inet_csk(sk);
				1958
				1959	tcp_init_sock(sk);
				1960
				1961	icsk->icsk_af_ops = &ipv4_specific;
				1962
				1963	#ifdef CONFIG_TCP_MD5SIG
				1964	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
				1965	#endif
				1966
				1967	return 0;
				1968	}
				1969
				1970	void tcp_v4_destroy_sock(struct sock *sk)
				1971	{
				1972	struct tcp_sock *tp = tcp_sk(sk);
				1973
				1974	trace_tcp_destroy_sock(sk);
				1975
				1976	tcp_clear_xmit_timers(sk);
				1977
				1978	tcp_cleanup_congestion_control(sk);
				1979
				1980	tcp_cleanup_ulp(sk);
				1981
				1982	/* Cleanup up the write buffer. */
				1983	tcp_write_queue_purge(sk);
				1984
				1985	/* Check if we want to disable active TFO */
				1986	tcp_fastopen_active_disable_ofo_check(sk);
				1987
				1988	/* Cleans up our, hopefully empty, out_of_order_queue. */
				1989	skb_rbtree_purge(&tp->out_of_order_queue);
				1990
				1991	#ifdef CONFIG_TCP_MD5SIG
				1992	/* Clean up the MD5 key list, if any */
				1993	if (tp->md5sig_info) {
				1994	tcp_clear_md5_list(sk);
				1995	kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
				1996	tp->md5sig_info = NULL;
				1997	}
				1998	#endif
				1999
				2000	/* Clean up a referenced TCP bind bucket. */
				2001	if (inet_csk(sk)->icsk_bind_hash)
				2002	inet_put_port(sk);
				2003
				2004	BUG_ON(tp->fastopen_rsk);
				2005
				2006	/* If socket is aborted during connect operation */
				2007	tcp_free_fastopen_req(tp);
				2008	tcp_fastopen_destroy_cipher(sk);
				2009	tcp_saved_syn_free(tp);
				2010
				2011	sk_sockets_allocated_dec(sk);
				2012	}
				2013	EXPORT_SYMBOL(tcp_v4_destroy_sock);
				2014
				2015	#ifdef CONFIG_PROC_FS
				2016	/* Proc filesystem TCP sock list dumping. */
				2017
				2018	/*
				2019	* Get next listener socket follow cur. If cur is NULL, get first socket
				2020	* starting from bucket given in st->bucket; when st->bucket is zero the
				2021	* very first socket in the hash table is returned.
				2022	*/
				2023	static void listening_get_next(struct seq_file seq, void *cur)
				2024	{
				2025	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
				2026	struct tcp_iter_state *st = seq->private;
				2027	struct net *net = seq_file_net(seq);
				2028	struct inet_listen_hashbucket *ilb;
				2029	struct sock *sk = cur;
				2030
				2031	if (!sk) {
				2032	get_head:
				2033	ilb = &tcp_hashinfo.listening_hash[st->bucket];
				2034	spin_lock(&ilb->lock);
				2035	sk = sk_head(&ilb->head);
				2036	st->offset = 0;
				2037	goto get_sk;
				2038	}
				2039	ilb = &tcp_hashinfo.listening_hash[st->bucket];
				2040	++st->num;
				2041	++st->offset;
				2042
				2043	sk = sk_next(sk);
				2044	get_sk:
				2045	sk_for_each_from(sk) {
				2046	if (!net_eq(sock_net(sk), net))
				2047	continue;
				2048	if (sk->sk_family == afinfo->family)
				2049	return sk;
				2050	}
				2051	spin_unlock(&ilb->lock);
				2052	st->offset = 0;
				2053	if (++st->bucket < INET_LHTABLE_SIZE)
				2054	goto get_head;
				2055	return NULL;
				2056	}
				2057
				2058	static void listening_get_idx(struct seq_file seq, loff_t *pos)
				2059	{
				2060	struct tcp_iter_state *st = seq->private;
				2061	void *rc;
				2062
				2063	st->bucket = 0;
				2064	st->offset = 0;
				2065	rc = listening_get_next(seq, NULL);
				2066
				2067	while (rc && *pos) {
				2068	rc = listening_get_next(seq, rc);
				2069	--*pos;
				2070	}
				2071	return rc;
				2072	}
				2073
				2074	static inline bool empty_bucket(const struct tcp_iter_state *st)
				2075	{
				2076	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
				2077	}
				2078
				2079	/*
				2080	* Get first established socket starting from bucket given in st->bucket.
				2081	* If st->bucket is zero, the very first socket in the hash is returned.
				2082	*/
				2083	static void established_get_first(struct seq_file seq)
				2084	{
				2085	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
				2086	struct tcp_iter_state *st = seq->private;
				2087	struct net *net = seq_file_net(seq);
				2088	void *rc = NULL;
				2089
				2090	st->offset = 0;
				2091	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
				2092	struct sock *sk;
				2093	struct hlist_nulls_node *node;
				2094	spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
				2095
				2096	/* Lockless fast path for the common case of empty buckets */
				2097	if (empty_bucket(st))
				2098	continue;
				2099
				2100	spin_lock_bh(lock);
				2101	sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
				2102	if (sk->sk_family != afinfo->family \|\|
				2103	!net_eq(sock_net(sk), net)) {
				2104	continue;
				2105	}
				2106	rc = sk;
				2107	goto out;
				2108	}
				2109	spin_unlock_bh(lock);
				2110	}
				2111	out:
				2112	return rc;
				2113	}
				2114
				2115	static void established_get_next(struct seq_file seq, void *cur)
				2116	{
				2117	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
				2118	struct sock *sk = cur;
				2119	struct hlist_nulls_node *node;
				2120	struct tcp_iter_state *st = seq->private;
				2121	struct net *net = seq_file_net(seq);
				2122
				2123	++st->num;
				2124	++st->offset;
				2125
				2126	sk = sk_nulls_next(sk);
				2127
				2128	sk_nulls_for_each_from(sk, node) {
				2129	if (sk->sk_family == afinfo->family &&
				2130	net_eq(sock_net(sk), net))
				2131	return sk;
				2132	}
				2133
				2134	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
				2135	++st->bucket;
				2136	return established_get_first(seq);
				2137	}
				2138
				2139	static void established_get_idx(struct seq_file seq, loff_t pos)
				2140	{
				2141	struct tcp_iter_state *st = seq->private;
				2142	void *rc;
				2143
				2144	st->bucket = 0;
				2145	rc = established_get_first(seq);
				2146
				2147	while (rc && pos) {
				2148	rc = established_get_next(seq, rc);
				2149	--pos;
				2150	}
				2151	return rc;
				2152	}
				2153
				2154	static void tcp_get_idx(struct seq_file seq, loff_t pos)
				2155	{
				2156	void *rc;
				2157	struct tcp_iter_state *st = seq->private;
				2158
				2159	st->state = TCP_SEQ_STATE_LISTENING;
				2160	rc = listening_get_idx(seq, &pos);
				2161
				2162	if (!rc) {
				2163	st->state = TCP_SEQ_STATE_ESTABLISHED;
				2164	rc = established_get_idx(seq, pos);
				2165	}
				2166
				2167	return rc;
				2168	}
				2169
				2170	static void tcp_seek_last_pos(struct seq_file seq)
				2171	{
				2172	struct tcp_iter_state *st = seq->private;
				2173	int offset = st->offset;
				2174	int orig_num = st->num;
				2175	void *rc = NULL;
				2176
				2177	switch (st->state) {
				2178	case TCP_SEQ_STATE_LISTENING:
				2179	if (st->bucket >= INET_LHTABLE_SIZE)
				2180	break;
				2181	st->state = TCP_SEQ_STATE_LISTENING;
				2182	rc = listening_get_next(seq, NULL);
				2183	while (offset-- && rc)
				2184	rc = listening_get_next(seq, rc);
				2185	if (rc)
				2186	break;
				2187	st->bucket = 0;
				2188	st->state = TCP_SEQ_STATE_ESTABLISHED;
				2189	/* Fallthrough */
				2190	case TCP_SEQ_STATE_ESTABLISHED:
				2191	if (st->bucket > tcp_hashinfo.ehash_mask)
				2192	break;
				2193	rc = established_get_first(seq);
				2194	while (offset-- && rc)
				2195	rc = established_get_next(seq, rc);
				2196	}
				2197
				2198	st->num = orig_num;
				2199
				2200	return rc;
				2201	}
				2202
				2203	void tcp_seq_start(struct seq_file seq, loff_t *pos)
				2204	{
				2205	struct tcp_iter_state *st = seq->private;
				2206	void *rc;
				2207
				2208	if (pos && pos == st->last_pos) {
				2209	rc = tcp_seek_last_pos(seq);
				2210	if (rc)
				2211	goto out;
				2212	}
				2213
				2214	st->state = TCP_SEQ_STATE_LISTENING;
				2215	st->num = 0;
				2216	st->bucket = 0;
				2217	st->offset = 0;
				2218	rc = pos ? tcp_get_idx(seq, pos - 1) : SEQ_START_TOKEN;
				2219
				2220	out:
				2221	st->last_pos = *pos;
				2222	return rc;
				2223	}
				2224	EXPORT_SYMBOL(tcp_seq_start);
				2225
				2226	void tcp_seq_next(struct seq_file seq, void v, loff_t pos)
				2227	{
				2228	struct tcp_iter_state *st = seq->private;
				2229	void *rc = NULL;
				2230
				2231	if (v == SEQ_START_TOKEN) {
				2232	rc = tcp_get_idx(seq, 0);
				2233	goto out;
				2234	}
				2235
				2236	switch (st->state) {
				2237	case TCP_SEQ_STATE_LISTENING:
				2238	rc = listening_get_next(seq, v);
				2239	if (!rc) {
				2240	st->state = TCP_SEQ_STATE_ESTABLISHED;
				2241	st->bucket = 0;
				2242	st->offset = 0;
				2243	rc = established_get_first(seq);
				2244	}
				2245	break;
				2246	case TCP_SEQ_STATE_ESTABLISHED:
				2247	rc = established_get_next(seq, v);
				2248	break;
				2249	}
				2250	out:
				2251	++*pos;
				2252	st->last_pos = *pos;
				2253	return rc;
				2254	}
				2255	EXPORT_SYMBOL(tcp_seq_next);
				2256
				2257	void tcp_seq_stop(struct seq_file seq, void v)
				2258	{
				2259	struct tcp_iter_state *st = seq->private;
				2260
				2261	switch (st->state) {
				2262	case TCP_SEQ_STATE_LISTENING:
				2263	if (v != SEQ_START_TOKEN)
				2264	spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
				2265	break;
				2266	case TCP_SEQ_STATE_ESTABLISHED:
				2267	if (v)
				2268	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
				2269	break;
				2270	}
				2271	}
				2272	EXPORT_SYMBOL(tcp_seq_stop);
				2273
				2274	static void get_openreq4(const struct request_sock *req,
				2275	struct seq_file *f, int i)
				2276	{
				2277	const struct inet_request_sock *ireq = inet_rsk(req);
				2278	long delta = req->rsk_timer.expires - jiffies;
				2279
				2280	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
				2281	" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
				2282	i,
				2283	ireq->ir_loc_addr,
				2284	ireq->ir_num,
				2285	ireq->ir_rmt_addr,
				2286	ntohs(ireq->ir_rmt_port),
				2287	TCP_SYN_RECV,
				2288	0, 0, /* could print option size, but that is af dependent. */
				2289	1, /* timers active (only the expire timer) */
				2290	jiffies_delta_to_clock_t(delta),
				2291	req->num_timeout,
				2292	from_kuid_munged(seq_user_ns(f),
				2293	sock_i_uid(req->rsk_listener)),
				2294	0, /* non standard timer */
				2295	0, /* open_requests have no inode */
				2296	0,
				2297	req);
				2298	}
				2299
				2300	static void get_tcp4_sock(struct sock sk, struct seq_file f, int i)
				2301	{
				2302	int timer_active;
				2303	unsigned long timer_expires;
				2304	const struct tcp_sock *tp = tcp_sk(sk);
				2305	const struct inet_connection_sock *icsk = inet_csk(sk);
				2306	const struct inet_sock *inet = inet_sk(sk);
				2307	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
				2308	__be32 dest = inet->inet_daddr;
				2309	__be32 src = inet->inet_rcv_saddr;
				2310	__u16 destp = ntohs(inet->inet_dport);
				2311	__u16 srcp = ntohs(inet->inet_sport);
				2312	int rx_queue;
				2313	int state;
				2314
				2315	if (icsk->icsk_pending == ICSK_TIME_RETRANS \|\|
				2316	icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT \|\|
				2317	icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
				2318	timer_active = 1;
				2319	timer_expires = icsk->icsk_timeout;
				2320	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
				2321	timer_active = 4;
				2322	timer_expires = icsk->icsk_timeout;
				2323	} else if (timer_pending(&sk->sk_timer)) {
				2324	timer_active = 2;
				2325	timer_expires = sk->sk_timer.expires;
				2326	} else {
				2327	timer_active = 0;
				2328	timer_expires = jiffies;
				2329	}
				2330
				2331	state = inet_sk_state_load(sk);
				2332	if (state == TCP_LISTEN)
				2333	rx_queue = sk->sk_ack_backlog;
				2334	else
				2335	/* Because we don't lock the socket,
				2336	* we might find a transient negative value.
				2337	*/
				2338	rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
				2339
				2340	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
				2341	"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
				2342	i, src, srcp, dest, destp, state,
				2343	tp->write_seq - tp->snd_una,
				2344	rx_queue,
				2345	timer_active,
				2346	jiffies_delta_to_clock_t(timer_expires - jiffies),
				2347	icsk->icsk_retransmits,
				2348	from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
				2349	icsk->icsk_probes_out,
				2350	sock_i_ino(sk),
				2351	refcount_read(&sk->sk_refcnt), sk,
				2352	jiffies_to_clock_t(icsk->icsk_rto),
				2353	jiffies_to_clock_t(icsk->icsk_ack.ato),
				2354	(icsk->icsk_ack.quick << 1) \| icsk->icsk_ack.pingpong,
				2355	tp->snd_cwnd,
				2356	state == TCP_LISTEN ?
				2357	fastopenq->max_qlen :
				2358	(tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
				2359	}
				2360
				2361	static void get_timewait4_sock(const struct inet_timewait_sock *tw,
				2362	struct seq_file *f, int i)
				2363	{
				2364	long delta = tw->tw_timer.expires - jiffies;
				2365	__be32 dest, src;
				2366	__u16 destp, srcp;
				2367
				2368	dest = tw->tw_daddr;
				2369	src = tw->tw_rcv_saddr;
				2370	destp = ntohs(tw->tw_dport);
				2371	srcp = ntohs(tw->tw_sport);
				2372
				2373	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
				2374	" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
				2375	i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
				2376	3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
				2377	refcount_read(&tw->tw_refcnt), tw);
				2378	}
				2379
				2380	#define TMPSZ 150
				2381
				2382	static int tcp4_seq_show(struct seq_file seq, void v)
				2383	{
				2384	struct tcp_iter_state *st;
				2385	struct sock *sk = v;
				2386
				2387	seq_setwidth(seq, TMPSZ - 1);
				2388	if (v == SEQ_START_TOKEN) {
				2389	seq_puts(seq, " sl local_address rem_address st tx_queue "
				2390	"rx_queue tr tm->when retrnsmt uid timeout "
				2391	"inode");
				2392	goto out;
				2393	}
				2394	st = seq->private;
				2395
				2396	if (sk->sk_state == TCP_TIME_WAIT)
				2397	get_timewait4_sock(v, seq, st->num);
				2398	else if (sk->sk_state == TCP_NEW_SYN_RECV)
				2399	get_openreq4(v, seq, st->num);
				2400	else
				2401	get_tcp4_sock(v, seq, st->num);
				2402	out:
				2403	seq_pad(seq, '\n');
				2404	return 0;
				2405	}
				2406
				2407	static const struct seq_operations tcp4_seq_ops = {
				2408	.show = tcp4_seq_show,
				2409	.start = tcp_seq_start,
				2410	.next = tcp_seq_next,
				2411	.stop = tcp_seq_stop,
				2412	};
				2413
				2414	static struct tcp_seq_afinfo tcp4_seq_afinfo = {
				2415	.family = AF_INET,
				2416	};
				2417
				2418	static int __net_init tcp4_proc_init_net(struct net *net)
				2419	{
				2420	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
				2421	sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
				2422	return -ENOMEM;
				2423	return 0;
				2424	}
				2425
				2426	static void __net_exit tcp4_proc_exit_net(struct net *net)
				2427	{
				2428	remove_proc_entry("tcp", net->proc_net);
				2429	}
				2430
				2431	static struct pernet_operations tcp4_net_ops = {
				2432	.init = tcp4_proc_init_net,
				2433	.exit = tcp4_proc_exit_net,
				2434	};
				2435
				2436	int __init tcp4_proc_init(void)
				2437	{
				2438	return register_pernet_subsys(&tcp4_net_ops);
				2439	}
				2440
				2441	void tcp4_proc_exit(void)
				2442	{
				2443	unregister_pernet_subsys(&tcp4_net_ops);
				2444	}
				2445	#endif /* CONFIG_PROC_FS */
				2446
				2447	struct proto tcp_prot = {
				2448	.name = "TCP",
				2449	.owner = THIS_MODULE,
				2450	.close = tcp_close,
				2451	.pre_connect = tcp_v4_pre_connect,
				2452	.connect = tcp_v4_connect,
				2453	.disconnect = tcp_disconnect,
				2454	.accept = inet_csk_accept,
				2455	.ioctl = tcp_ioctl,
				2456	.init = tcp_v4_init_sock,
				2457	.destroy = tcp_v4_destroy_sock,
				2458	.shutdown = tcp_shutdown,
				2459	.setsockopt = tcp_setsockopt,
				2460	.getsockopt = tcp_getsockopt,
				2461	.keepalive = tcp_set_keepalive,
				2462	.recvmsg = tcp_recvmsg,
				2463	.sendmsg = tcp_sendmsg,
				2464	.sendpage = tcp_sendpage,
				2465	.backlog_rcv = tcp_v4_do_rcv,
				2466	.release_cb = tcp_release_cb,
				2467	.hash = inet_hash,
				2468	.unhash = inet_unhash,
				2469	.get_port = inet_csk_get_port,
				2470	.enter_memory_pressure = tcp_enter_memory_pressure,
				2471	.leave_memory_pressure = tcp_leave_memory_pressure,
				2472	.stream_memory_free = tcp_stream_memory_free,
				2473	.sockets_allocated = &tcp_sockets_allocated,
				2474	.orphan_count = &tcp_orphan_count,
				2475	.memory_allocated = &tcp_memory_allocated,
				2476	.memory_pressure = &tcp_memory_pressure,
				2477	.sysctl_mem = sysctl_tcp_mem,
				2478	.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
				2479	.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
				2480	.max_header = MAX_TCP_HEADER,
				2481	.obj_size = sizeof(struct tcp_sock),
				2482	.slab_flags = SLAB_TYPESAFE_BY_RCU,
				2483	.twsk_prot = &tcp_timewait_sock_ops,
				2484	.rsk_prot = &tcp_request_sock_ops,
				2485	.h.hashinfo = &tcp_hashinfo,
				2486	.no_autobind = true,
				2487	#ifdef CONFIG_COMPAT
				2488	.compat_setsockopt = compat_tcp_setsockopt,
				2489	.compat_getsockopt = compat_tcp_getsockopt,
				2490	#endif
				2491	.diag_destroy = tcp_abort,
				2492	};
				2493	EXPORT_SYMBOL(tcp_prot);
				2494
				2495	static void __net_exit tcp_sk_exit(struct net *net)
				2496	{
				2497	int cpu;
				2498
				2499	module_put(net->ipv4.tcp_congestion_control->owner);
				2500
				2501	for_each_possible_cpu(cpu)
				2502	inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
				2503	free_percpu(net->ipv4.tcp_sk);
				2504	}
				2505
				2506	static int __net_init tcp_sk_init(struct net *net)
				2507	{
				2508	int res, cpu, cnt;
				2509
				2510	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
				2511	if (!net->ipv4.tcp_sk)
				2512	return -ENOMEM;
				2513
				2514	for_each_possible_cpu(cpu) {
				2515	struct sock *sk;
				2516
				2517	res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
				2518	IPPROTO_TCP, net);
				2519	if (res)
				2520	goto fail;
				2521	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
				2522
				2523	/* Please enforce IP_DF and IPID==0 for RST and
				2524	* ACK sent in SYN-RECV and TIME-WAIT state.
				2525	*/
				2526	inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
				2527
				2528	*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
				2529	}
				2530
				2531	net->ipv4.sysctl_tcp_ecn = 2;
				2532	net->ipv4.sysctl_tcp_ecn_fallback = 1;
				2533
				2534	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
				2535	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
				2536	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
				2537
				2538	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
				2539	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
				2540	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
				2541
				2542	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
				2543	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
				2544	net->ipv4.sysctl_tcp_syncookies = 1;
				2545	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
				2546	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
				2547	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
				2548	net->ipv4.sysctl_tcp_orphan_retries = 0;
				2549	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
				2550	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
				2551	net->ipv4.sysctl_tcp_tw_reuse = 2;
				2552
				2553	cnt = tcp_hashinfo.ehash_mask + 1;
				2554	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
				2555	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
				2556
				2557	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
				2558	net->ipv4.sysctl_tcp_sack = 1;
				2559	net->ipv4.sysctl_tcp_window_scaling = 1;
				2560	net->ipv4.sysctl_tcp_timestamps = 1;
				2561	net->ipv4.sysctl_tcp_early_retrans = 3;
				2562	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
				2563	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
				2564	net->ipv4.sysctl_tcp_retrans_collapse = 1;
				2565	net->ipv4.sysctl_tcp_max_reordering = 300;
				2566	net->ipv4.sysctl_tcp_dsack = 1;
				2567	net->ipv4.sysctl_tcp_app_win = 31;
				2568	net->ipv4.sysctl_tcp_adv_win_scale = 1;
				2569	net->ipv4.sysctl_tcp_frto = 2;
				2570	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
				2571	/* This limits the percentage of the congestion window which we
				2572	* will allow a single TSO frame to consume. Building TSO frames
				2573	* which are too large can cause TCP streams to be bursty.
				2574	*/
				2575	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
				2576	/* Default TSQ limit of four TSO segments */
				2577	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
				2578	/* rfc5961 challenge ack rate limiting */
				2579	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
				2580	net->ipv4.sysctl_tcp_min_tso_segs = 2;
				2581	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
				2582	net->ipv4.sysctl_tcp_autocorking = 1;
				2583	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
				2584	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
				2585	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
				2586	if (net != &init_net) {
				2587	memcpy(net->ipv4.sysctl_tcp_rmem,
				2588	init_net.ipv4.sysctl_tcp_rmem,
				2589	sizeof(init_net.ipv4.sysctl_tcp_rmem));
				2590	memcpy(net->ipv4.sysctl_tcp_wmem,
				2591	init_net.ipv4.sysctl_tcp_wmem,
				2592	sizeof(init_net.ipv4.sysctl_tcp_wmem));
				2593	}
				2594	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
				2595	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
				2596	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
				2597	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
				2598	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
				2599	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
				2600
				2601	/* Reno is always built in */
				2602	if (!net_eq(net, &init_net) &&
				2603	try_module_get(init_net.ipv4.tcp_congestion_control->owner))
				2604	net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
				2605	else
				2606	net->ipv4.tcp_congestion_control = &tcp_reno;
				2607
				2608	return 0;
				2609	fail:
				2610	tcp_sk_exit(net);
				2611
				2612	return res;
				2613	}
				2614
				2615	static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
				2616	{
				2617	struct net *net;
				2618
				2619	inet_twsk_purge(&tcp_hashinfo, AF_INET);
				2620
				2621	list_for_each_entry(net, net_exit_list, exit_list)
				2622	tcp_fastopen_ctx_destroy(net);
				2623	}
				2624
				2625	static struct pernet_operations __net_initdata tcp_sk_ops = {
				2626	.init = tcp_sk_init,
				2627	.exit = tcp_sk_exit,
				2628	.exit_batch = tcp_sk_exit_batch,
				2629	};
				2630
				2631	void __init tcp_v4_init(void)
				2632	{
				2633	if (register_pernet_subsys(&tcp_sk_ops))
				2634	panic("Failed to create the TCP control socket.\n");
				2635	}