// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic INET transport hashtables
 *
 * Authors:	Lotsa people, from code originally in tcp
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>
#include <linux/memblock.h>

#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/secure_seq.h>
#include <net/ip.h>
#include <net/tcp.h>
#include <net/sock_reuseport.h>

static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
			const __u16 lport, const __be32 faddr,
			const __be16 fport)
{
	static u32 inet_ehash_secret __read_mostly;

	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));

	return __inet_ehashfn(laddr, lport, faddr, fport,
			      inet_ehash_secret + net_hash_mix(net));
}

/* This function handles inet_sock, but also timewait and request sockets
 * for IPv4/IPv6.
 */
static u32 sk_ehashfn(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6 &&
	    !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		return inet6_ehashfn(sock_net(sk),
				     &sk->sk_v6_rcv_saddr, sk->sk_num,
				     &sk->sk_v6_daddr, sk->sk_dport);
#endif
	return inet_ehashfn(sock_net(sk),
			    sk->sk_rcv_saddr, sk->sk_num,
			    sk->sk_daddr, sk->sk_dport);
}
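
/*
 * Note: the established hash covers the full 4-tuple plus a once-per-boot
 * random secret and the per-netns mix from net_hash_mix(), so a given
 * connection maps to a single ehash chain and the chain layout is not
 * predictable from outside the host.
 */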

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum,
						 int l3mdev)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb) {
		write_pnet(&tb->ib_net, net);
		tb->l3mdev    = l3mdev;
		tb->port      = snum;
		tb->fastreuse = 0;
		tb->fastreuseport = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(cachep, tb);
	}
}

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	inet_sk(sk)->inet_num = snum;
	sk_add_bind_node(sk, &tb->owners);
	inet_csk(sk)->icsk_bind_hash = tb;
}
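
/*
 * A bind bucket represents one (netns, l3mdev, port) tuple; every socket
 * bound to that port hangs off tb->owners, and the bucket itself is only
 * freed by inet_bind_bucket_destroy() once the owners list has drained.
 */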

/*
 * Get rid of any references to a local port held by the given sock.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
			hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);

int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
	unsigned short port = inet_sk(child)->inet_num;
	const int bhash = inet_bhashfn(sock_net(sk), port,
			table->bhash_size);
	struct inet_bind_hashbucket *head = &table->bhash[bhash];
	struct inet_bind_bucket *tb;
	int l3mdev;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	if (unlikely(!tb)) {
		spin_unlock(&head->lock);
		return -ENOENT;
	}
	if (tb->port != port) {
		l3mdev = inet_sk_bound_l3mdev(sk);

		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here. */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), sock_net(sk)) &&
			    tb->l3mdev == l3mdev && tb->port == port)
				break;
		}
		if (!tb) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     sock_net(sk), head, port,
						     l3mdev);
			if (!tb) {
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
		}
		inet_csk_update_fastreuse(tb, child);
	}
	inet_bind_hash(child, tb, port);
	spin_unlock(&head->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

static struct inet_listen_hashbucket *
inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
{
	u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		hash = ipv6_portaddr_hash(sock_net(sk),
					  &sk->sk_v6_rcv_saddr,
					  inet_sk(sk)->inet_num);
	else
#endif
		hash = ipv4_portaddr_hash(sock_net(sk),
					  inet_sk(sk)->inet_rcv_saddr,
					  inet_sk(sk)->inet_num);
	return inet_lhash2_bucket(h, hash);
}
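
/*
 * lhash2 is the secondary listener table: buckets are keyed on
 * (bound address, port) rather than port alone, so a lookup only has to
 * walk listeners that could actually match the destination address.
 */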

static void inet_hash2(struct inet_hashinfo *h, struct sock *sk)
{
	struct inet_listen_hashbucket *ilb2;

	if (!h->lhash2)
		return;

	ilb2 = inet_lhash2_bucket_sk(h, sk);

	spin_lock(&ilb2->lock);
	if (sk->sk_reuseport && sk->sk_family == AF_INET6)
		hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
				   &ilb2->head);
	else
		hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
				   &ilb2->head);
	ilb2->count++;
	spin_unlock(&ilb2->lock);
}

static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk)
{
	struct inet_listen_hashbucket *ilb2;

	if (!h->lhash2 ||
	    WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node)))
		return;

	ilb2 = inet_lhash2_bucket_sk(h, sk);

	spin_lock(&ilb2->lock);
	hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node);
	ilb2->count--;
	spin_unlock(&ilb2->lock);
}
228
229static inline int compute_score(struct sock *sk, struct net *net,
230 const unsigned short hnum, const __be32 daddr,
231 const int dif, const int sdif, bool exact_dif)
232{
233 int score = -1;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000234
David Brazdil0f672f62019-12-10 10:32:29 +0000235 if (net_eq(sock_net(sk), net) && sk->sk_num == hnum &&
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000236 !ipv6_only_sock(sk)) {
David Brazdil0f672f62019-12-10 10:32:29 +0000237 if (sk->sk_rcv_saddr != daddr)
238 return -1;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000239
David Brazdil0f672f62019-12-10 10:32:29 +0000240 if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
241 return -1;
242
243 score = sk->sk_family == PF_INET ? 2 : 1;
244 if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000245 score++;
246 }
247 return score;
248}
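
/*
 * Scoring: a non-matching socket yields -1, a v4-mapped AF_INET6 listener
 * scores 1, a native AF_INET listener scores 2, and either gets one extra
 * point when sk_incoming_cpu matches the current CPU, so exact-family,
 * CPU-local listeners win ties.
 */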

/*
 * There are some nice properties to exploit here. The BSD API
 * does not allow a listening sock to specify the remote port nor the
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */

/* called with rcu_read_lock() : No refcount taken on the socket */
static struct sock *inet_lhash2_lookup(struct net *net,
				struct inet_listen_hashbucket *ilb2,
				struct sk_buff *skb, int doff,
				const __be32 saddr, __be16 sport,
				const __be32 daddr, const unsigned short hnum,
				const int dif, const int sdif)
{
	bool exact_dif = inet_exact_dif_match(net, skb);
	struct inet_connection_sock *icsk;
	struct sock *sk, *result = NULL;
	int score, hiscore = 0;
	u32 phash = 0;

	inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
		sk = (struct sock *)icsk;
		score = compute_score(sk, net, hnum, daddr,
				      dif, sdif, exact_dif);
		if (score > hiscore) {
			if (sk->sk_reuseport) {
				phash = inet_ehashfn(net, daddr, hnum,
						     saddr, sport);
				result = reuseport_select_sock(sk, phash,
							       skb, doff);
				if (result)
					return result;
			}
			result = sk;
			hiscore = score;
		}
	}

	return result;
}

struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    struct sk_buff *skb, int doff,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif, const int sdif)
{
	struct inet_listen_hashbucket *ilb2;
	struct sock *result = NULL;
	unsigned int hash2;

	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, daddr, hnum,
				    dif, sdif);
	if (result)
		goto done;

	/* Lookup lhash2 with INADDR_ANY */
	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
	ilb2 = inet_lhash2_bucket(hashinfo, hash2);

	result = inet_lhash2_lookup(net, ilb2, skb, doff,
				    saddr, sport, htonl(INADDR_ANY), hnum,
				    dif, sdif);
done:
	if (IS_ERR(result))
		return NULL;
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
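
/*
 * On the receive path the demux normally consults the established table
 * (__inet_lookup_established()) first and only falls back to this listener
 * lookup for packets that do not match a known flow, e.g. incoming SYNs.
 */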

/* All sockets share common refcount, but have different destructors */
void sock_gen_put(struct sock *sk)
{
	if (!refcount_dec_and_test(&sk->sk_refcnt))
		return;

	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_free(inet_twsk(sk));
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		reqsk_free(inet_reqsk(sk));
	else
		sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);

void sock_edemux(struct sk_buff *skb)
{
	sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_edemux);
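
/*
 * The lockless walk below relies on RCU plus type-stable socket memory:
 * an entry can be freed and recycled while we look at it, which is why a
 * match is re-validated after taking a reference, and why reaching a nulls
 * value that does not encode our slot forces a restart (the entry we
 * followed was moved to another chain).
 */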

struct sock *__inet_lookup_established(struct net *net,
				  struct inet_hashinfo *hashinfo,
				  const __be32 saddr, const __be16 sport,
				  const __be32 daddr, const u16 hnum,
				  const int dif, const int sdif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (sk->sk_hash != hash)
			continue;
		if (likely(INET_MATCH(sk, net, acookie,
				      saddr, daddr, ports, dif, sdif))) {
			if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
				goto out;
			if (unlikely(!INET_MATCH(sk, net, acookie,
						 saddr, daddr, ports,
						 dif, sdif))) {
				sock_gen_put(sk);
				goto begin;
			}
			goto found;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
out:
	sk = NULL;
found:
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);

/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	struct net *net = sock_net(sk);
	int sdif = l3mdev_master_ifindex_by_index(net, dif);
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw = NULL;

	spin_lock(lock);

	sk_nulls_for_each(sk2, node, &head->chain) {
		if (sk2->sk_hash != hash)
			continue;

		if (likely(INET_MATCH(sk2, net, acookie,
				      saddr, daddr, ports, dif, sdif))) {
			if (sk2->sk_state == TCP_TIME_WAIT) {
				tw = inet_twsk(sk2);
				if (twsk_unique(sk, sk2, twp))
					break;
			}
			goto not_unique;
		}
	}

	/* Must record num and sport now. Otherwise we will see
	 * in the hash table a socket with a funny identity.
	 */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		sk_nulls_del_node_init_rcu((struct sock *)tw);
		__NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}

static u32 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
}
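
/*
 * The offset above is derived from the (source address, destination
 * address, destination port) triple, so the ephemeral port search in
 * __inet_hash_connect() starts at a per-destination, unpredictable
 * position instead of reusing the same sequence for every peer.
 */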

/* insert a socket into ehash, and eventually remove another one
 * (the other one can be a SYN_RECV or TIMEWAIT socket)
 */
bool inet_ehash_insert(struct sock *sk, struct sock *osk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct hlist_nulls_head *list;
	struct inet_ehash_bucket *head;
	spinlock_t *lock;
	bool ret = true;

	WARN_ON_ONCE(!sk_unhashed(sk));

	sk->sk_hash = sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	if (osk) {
		WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
		ret = sk_nulls_del_node_init_rcu(osk);
	}
	if (ret)
		__sk_nulls_add_node_rcu(sk, list);
	spin_unlock(lock);
	return ret;
}

bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
{
	bool ok = inet_ehash_insert(sk, osk);

	if (ok) {
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	} else {
		percpu_counter_inc(sk->sk_prot->orphan_count);
		inet_sk_set_state(sk, TCP_CLOSE);
		sock_set_flag(sk, SOCK_DEAD);
		inet_csk_destroy_sock(sk);
	}
	return ok;
}
EXPORT_SYMBOL_GPL(inet_ehash_nolisten);

static int inet_reuseport_add_sock(struct sock *sk,
				   struct inet_listen_hashbucket *ilb)
{
	struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
	const struct hlist_nulls_node *node;
	struct sock *sk2;
	kuid_t uid = sock_i_uid(sk);

	sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
		if (sk2 != sk &&
		    sk2->sk_family == sk->sk_family &&
		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
		    inet_csk(sk2)->icsk_bind_hash == tb &&
		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
		    inet_rcv_saddr_equal(sk, sk2, false))
			return reuseport_add_sock(sk, sk2,
						  inet_rcv_saddr_any(sk));
	}

	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}

int __inet_hash(struct sock *sk, struct sock *osk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb;
	int err = 0;

	if (sk->sk_state != TCP_LISTEN) {
		inet_ehash_nolisten(sk, osk);
		return 0;
	}
	WARN_ON(!sk_unhashed(sk));
	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];

	spin_lock(&ilb->lock);
	if (sk->sk_reuseport) {
		err = inet_reuseport_add_sock(sk, ilb);
		if (err)
			goto unlock;
	}
	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
		sk->sk_family == AF_INET6)
		__sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head);
	else
		__sk_nulls_add_node_rcu(sk, &ilb->nulls_head);
	inet_hash2(hashinfo, sk);
	ilb->count++;
	sock_set_flag(sk, SOCK_RCU_FREE);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
	spin_unlock(&ilb->lock);

	return err;
}
EXPORT_SYMBOL(__inet_hash);

int inet_hash(struct sock *sk)
{
	int err = 0;

	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		err = __inet_hash(sk, NULL);
		local_bh_enable();
	}

	return err;
}
EXPORT_SYMBOL_GPL(inet_hash);

void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb = NULL;
	spinlock_t *lock;

	if (sk_unhashed(sk))
		return;

	if (sk->sk_state == TCP_LISTEN) {
		ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
		lock = &ilb->lock;
	} else {
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
	}
	spin_lock_bh(lock);
	if (sk_unhashed(sk))
		goto unlock;

	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);
	if (ilb) {
		inet_unhash2(hashinfo, sk);
		ilb->count--;
	}
	__sk_nulls_del_node_init_rcu(sk);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
unlock:
	spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);
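
/*
 * The port scan below walks only every second port: connect() starts from
 * one parity while inet_csk_get_port() prefers the other, so autobound
 * connects and explicit binds mostly draw from disjoint halves of the
 * ephemeral range and collide less often.
 */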

int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_timewait_sock *tw = NULL;
	struct inet_bind_hashbucket *head;
	int port = inet_sk(sk)->inet_num;
	struct net *net = sock_net(sk);
	struct inet_bind_bucket *tb;
	u32 remaining, offset;
	int ret, i, low, high;
	static u32 hint;
	int l3mdev;

	if (port) {
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		tb = inet_csk(sk)->icsk_bind_hash;
		spin_lock_bh(&head->lock);
		if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
			inet_ehash_nolisten(sk, NULL);
			spin_unlock_bh(&head->lock);
			return 0;
		}
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, port, NULL);
		local_bh_enable();
		return ret;
	}

	l3mdev = inet_sk_bound_l3mdev(sk);

	inet_get_local_port_range(net, &low, &high);
	high++; /* [32768, 60999] -> [32768, 61000[ */
	remaining = high - low;
	if (likely(remaining > 1))
		remaining &= ~1U;

	offset = (hint + port_offset) % remaining;
	/* In first pass we try ports of @low parity.
	 * inet_csk_get_port() does the opposite choice.
	 */
	offset &= ~1U;
other_parity_scan:
	port = low + offset;
	for (i = 0; i < remaining; i += 2, port += 2) {
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);

		/* Does not bother with rcv_saddr checks, because
		 * the established check is already unique enough.
		 */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
			    tb->port == port) {
				if (tb->fastreuse >= 0 ||
				    tb->fastreuseport >= 0)
					goto next_port;
				WARN_ON(hlist_empty(&tb->owners));
				if (!check_established(death_row, sk,
						       port, &tw))
					goto ok;
				goto next_port;
			}
		}

		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					     net, head, port, l3mdev);
		if (!tb) {
			spin_unlock_bh(&head->lock);
			return -ENOMEM;
		}
		tb->fastreuse = -1;
		tb->fastreuseport = -1;
		goto ok;
next_port:
		spin_unlock_bh(&head->lock);
		cond_resched();
	}

	offset++;
	if ((offset & 1) && remaining > 1)
		goto other_parity_scan;

	return -EADDRNOTAVAIL;

ok:
	hint += i + 2;

	/* Head lock still held and bh's disabled */
	inet_bind_hash(sk, tb, port);
	if (sk_unhashed(sk)) {
		inet_sk(sk)->inet_sport = htons(port);
		inet_ehash_nolisten(sk, (struct sock *)tw);
	}
	if (tw)
		inet_twsk_bind_unhash(tw, hinfo);
	spin_unlock(&head->lock);
	if (tw)
		inet_twsk_deschedule_put(tw);
	local_bh_enable();
	return 0;
}

/*
 * Bind a port for a connect operation and hash it.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	u32 port_offset = 0;

	if (!inet_sk(sk)->inet_num)
		port_offset = inet_sk_port_offset(sk);
	return __inet_hash_connect(death_row, sk, port_offset,
				   __inet_check_established);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);

void inet_hashinfo_init(struct inet_hashinfo *h)
{
	int i;

	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
		spin_lock_init(&h->listening_hash[i].lock);
		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head,
				      i + LISTENING_NULLS_BASE);
		h->listening_hash[i].count = 0;
	}

	h->lhash2 = NULL;
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);

static void init_hashinfo_lhash2(struct inet_hashinfo *h)
{
	int i;

	for (i = 0; i <= h->lhash2_mask; i++) {
		spin_lock_init(&h->lhash2[i].lock);
		INIT_HLIST_HEAD(&h->lhash2[i].head);
		h->lhash2[i].count = 0;
	}
}
772
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000773void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
774 unsigned long numentries, int scale,
775 unsigned long low_limit,
776 unsigned long high_limit)
777{
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000778 h->lhash2 = alloc_large_system_hash(name,
779 sizeof(*h->lhash2),
780 numentries,
781 scale,
782 0,
783 NULL,
784 &h->lhash2_mask,
785 low_limit,
786 high_limit);
David Brazdil0f672f62019-12-10 10:32:29 +0000787 init_hashinfo_lhash2(h);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000788}
789
David Brazdil0f672f62019-12-10 10:32:29 +0000790int inet_hashinfo2_init_mod(struct inet_hashinfo *h)
791{
792 h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL);
793 if (!h->lhash2)
794 return -ENOMEM;
795
796 h->lhash2_mask = INET_LHTABLE_SIZE - 1;
797 /* INET_LHTABLE_SIZE must be a power of 2 */
798 BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask);
799
800 init_hashinfo_lhash2(h);
801 return 0;
802}
803EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod);
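
/*
 * Size the ehash lock array: aim for roughly two cache lines worth of
 * spinlocks per possible CPU, round up to a power of two so
 * ehash_locks_mask can be used as a simple mask, and never allocate more
 * locks than there are hash buckets.
 */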
804
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000805int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
806{
807 unsigned int locksz = sizeof(spinlock_t);
808 unsigned int i, nblocks = 1;
809
810 if (locksz != 0) {
811 /* allocate 2 cache lines or at least one spinlock per cpu */
812 nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
813 nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());
814
815 /* no more locks than number of hash buckets */
816 nblocks = min(nblocks, hashinfo->ehash_mask + 1);
817
818 hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
819 if (!hashinfo->ehash_locks)
820 return -ENOMEM;
821
822 for (i = 0; i < nblocks; i++)
823 spin_lock_init(&hashinfo->ehash_locks[i]);
824 }
825 hashinfo->ehash_locks_mask = nblocks - 1;
826 return 0;
827}
828EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);