    1// SPDX-License-Identifier: GPL-2.0-or-later
    2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * ROUTE - implementation of the IP router.
8 *
9 * Authors: Ross Biro
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox, <gw4pts@gw4pts.ampr.org>
12 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 *
15 * Fixes:
16 * Alan Cox : Verify area fixes.
17 * Alan Cox : cli() protects routing changes
18 * Rui Oliveira : ICMP routing table updates
19 * (rco@di.uminho.pt) Routing table insertion and update
20 * Linus Torvalds : Rewrote bits to be sensible
21 * Alan Cox : Added BSD route gw semantics
22 * Alan Cox : Super /proc >4K
23 * Alan Cox : MTU in route table
24 * Alan Cox : MSS actually. Also added the window
25 * clamper.
26 * Sam Lantinga : Fixed route matching in rt_del()
27 * Alan Cox : Routing cache support.
28 * Alan Cox : Removed compatibility cruft.
29 * Alan Cox : RTF_REJECT support.
30 * Alan Cox : TCP irtt support.
31 * Jonathan Naylor : Added Metric support.
32 * Miquel van Smoorenburg : BSD API fixes.
33 * Miquel van Smoorenburg : Metrics.
34 * Alan Cox : Use __u32 properly
35 * Alan Cox : Aligned routing errors more closely with BSD
36 * our system is still very different.
37 * Alan Cox : Faster /proc handling
38 * Alexey Kuznetsov : Massive rework to support tree based routing,
39 * routing caches and better behaviour.
40 *
41 * Olaf Erb : irtt wasn't being copied right.
42 * Bjorn Ekwall : Kerneld route support.
43 * Alan Cox : Multicast fixed (I hope)
44 * Pavel Krauz : Limited broadcast fixed
45 * Mike McLagan : Routing by source
46 * Alexey Kuznetsov : End of old history. Split to fib.c and
47 * route.c and rewritten from scratch.
48 * Andi Kleen : Load-limit warning messages.
49 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
50 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
51 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
52 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
53 * Marc Boucher : routing by fwmark
54 * Robert Olsson : Added rt_cache statistics
55 * Arnaldo C. Melo : Convert proc stuff to seq_file
56 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
58 * Ilia Sotnikov : Removed TOS from hash calculations
   59 */
60
61#define pr_fmt(fmt) "IPv4: " fmt
62
63#include <linux/module.h>
64#include <linux/uaccess.h>
65#include <linux/bitops.h>
66#include <linux/types.h>
67#include <linux/kernel.h>
68#include <linux/mm.h>
   69#include <linux/memblock.h>
   70#include <linux/string.h>
71#include <linux/socket.h>
72#include <linux/sockios.h>
73#include <linux/errno.h>
74#include <linux/in.h>
75#include <linux/inet.h>
76#include <linux/netdevice.h>
77#include <linux/proc_fs.h>
78#include <linux/init.h>
79#include <linux/skbuff.h>
80#include <linux/inetdevice.h>
81#include <linux/igmp.h>
82#include <linux/pkt_sched.h>
83#include <linux/mroute.h>
84#include <linux/netfilter_ipv4.h>
85#include <linux/random.h>
86#include <linux/rcupdate.h>
87#include <linux/times.h>
88#include <linux/slab.h>
89#include <linux/jhash.h>
90#include <net/dst.h>
91#include <net/dst_metadata.h>
92#include <net/net_namespace.h>
93#include <net/protocol.h>
94#include <net/ip.h>
95#include <net/route.h>
96#include <net/inetpeer.h>
97#include <net/sock.h>
98#include <net/ip_fib.h>
   99#include <net/nexthop.h>
  100#include <net/arp.h>
101#include <net/tcp.h>
102#include <net/icmp.h>
103#include <net/xfrm.h>
104#include <net/lwtunnel.h>
105#include <net/netevent.h>
106#include <net/rtnetlink.h>
107#ifdef CONFIG_SYSCTL
108#include <linux/sysctl.h>
109#endif
110#include <net/secure_seq.h>
111#include <net/ip_tunnels.h>
112#include <net/l3mdev.h>
113
114#include "fib_lookup.h"
115
116#define RT_FL_TOS(oldflp4) \
117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119#define RT_GC_TIMEOUT (300*HZ)
120
121static int ip_rt_max_size;
122static int ip_rt_redirect_number __read_mostly = 9;
123static int ip_rt_redirect_load __read_mostly = HZ / 50;
124static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125static int ip_rt_error_cost __read_mostly = HZ;
126static int ip_rt_error_burst __read_mostly = 5 * HZ;
127static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
128static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
129static int ip_rt_min_advmss __read_mostly = 256;
130
131static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
132
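/* The tunables above are exposed through a sysctl table later in this file
 * and are normally adjusted under /proc/sys/net/ipv4/route/ - for example
 * min_pmtu, mtu_expires, error_cost and error_burst; the ctl_table further
 * down is the authoritative list of knob names.
 */
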
133/*
134 * Interface to generic destination cache.
135 */
136
137static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139static unsigned int ipv4_mtu(const struct dst_entry *dst);
140static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141static void ipv4_link_failure(struct sk_buff *skb);
142static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
  143 struct sk_buff *skb, u32 mtu,
  144 bool confirm_neigh);
  145static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
146 struct sk_buff *skb);
147static void ipv4_dst_destroy(struct dst_entry *dst);
148
149static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
150{
151 WARN_ON(1);
152 return NULL;
153}
154
155static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
156 struct sk_buff *skb,
157 const void *daddr);
158static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
159
160static struct dst_ops ipv4_dst_ops = {
161 .family = AF_INET,
162 .check = ipv4_dst_check,
163 .default_advmss = ipv4_default_advmss,
164 .mtu = ipv4_mtu,
165 .cow_metrics = ipv4_cow_metrics,
166 .destroy = ipv4_dst_destroy,
167 .negative_advice = ipv4_negative_advice,
168 .link_failure = ipv4_link_failure,
169 .update_pmtu = ip_rt_update_pmtu,
170 .redirect = ip_do_redirect,
171 .local_out = __ip_local_out,
172 .neigh_lookup = ipv4_neigh_lookup,
173 .confirm_neigh = ipv4_confirm_neigh,
174};
175
176#define ECN_OR_COST(class) TC_PRIO_##class
177
178const __u8 ip_tos2prio[16] = {
179 TC_PRIO_BESTEFFORT,
180 ECN_OR_COST(BESTEFFORT),
181 TC_PRIO_BESTEFFORT,
182 ECN_OR_COST(BESTEFFORT),
183 TC_PRIO_BULK,
184 ECN_OR_COST(BULK),
185 TC_PRIO_BULK,
186 ECN_OR_COST(BULK),
187 TC_PRIO_INTERACTIVE,
188 ECN_OR_COST(INTERACTIVE),
189 TC_PRIO_INTERACTIVE,
190 ECN_OR_COST(INTERACTIVE),
191 TC_PRIO_INTERACTIVE_BULK,
192 ECN_OR_COST(INTERACTIVE_BULK),
193 TC_PRIO_INTERACTIVE_BULK,
194 ECN_OR_COST(INTERACTIVE_BULK)
195};
196EXPORT_SYMBOL(ip_tos2prio);
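/* The table is indexed with the four TOS bits shifted down by one; a helper
 * along the lines of rt_tos2priority() in include/net/route.h does the
 * mapping for callers, roughly:
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 */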
197
198static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
199#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
200
201#ifdef CONFIG_PROC_FS
202static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
203{
204 if (*pos)
205 return NULL;
206 return SEQ_START_TOKEN;
207}
208
209static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
210{
211 ++*pos;
212 return NULL;
213}
214
215static void rt_cache_seq_stop(struct seq_file *seq, void *v)
216{
217}
218
219static int rt_cache_seq_show(struct seq_file *seq, void *v)
220{
221 if (v == SEQ_START_TOKEN)
222 seq_printf(seq, "%-127s\n",
223 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225 "HHUptod\tSpecDst");
226 return 0;
227}
228
229static const struct seq_operations rt_cache_seq_ops = {
230 .start = rt_cache_seq_start,
231 .next = rt_cache_seq_next,
232 .stop = rt_cache_seq_stop,
233 .show = rt_cache_seq_show,
234};
235
236static int rt_cache_seq_open(struct inode *inode, struct file *file)
237{
238 return seq_open(file, &rt_cache_seq_ops);
239}
240
241static const struct file_operations rt_cache_seq_fops = {
242 .open = rt_cache_seq_open,
243 .read = seq_read,
244 .llseek = seq_lseek,
245 .release = seq_release,
246};
247
248
249static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
250{
251 int cpu;
252
253 if (*pos == 0)
254 return SEQ_START_TOKEN;
255
256 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
257 if (!cpu_possible(cpu))
258 continue;
259 *pos = cpu+1;
260 return &per_cpu(rt_cache_stat, cpu);
261 }
262 return NULL;
263}
264
265static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
266{
267 int cpu;
268
269 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
270 if (!cpu_possible(cpu))
271 continue;
272 *pos = cpu+1;
273 return &per_cpu(rt_cache_stat, cpu);
274 }
  275 (*pos)++;
  276 return NULL;
277
278}
279
280static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281{
282
283}
284
285static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286{
287 struct rt_cache_stat *st = v;
288
289 if (v == SEQ_START_TOKEN) {
290 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 return 0;
292 }
293
294 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
295 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 dst_entries_get_slow(&ipv4_dst_ops),
297 0, /* st->in_hit */
298 st->in_slow_tot,
299 st->in_slow_mc,
300 st->in_no_route,
301 st->in_brd,
302 st->in_martian_dst,
303 st->in_martian_src,
304
305 0, /* st->out_hit */
306 st->out_slow_tot,
307 st->out_slow_mc,
308
309 0, /* st->gc_total */
310 0, /* st->gc_ignored */
311 0, /* st->gc_goal_miss */
312 0, /* st->gc_dst_overflow */
313 0, /* st->in_hlist_search */
314 0 /* st->out_hlist_search */
315 );
316 return 0;
317}
318
319static const struct seq_operations rt_cpu_seq_ops = {
320 .start = rt_cpu_seq_start,
321 .next = rt_cpu_seq_next,
322 .stop = rt_cpu_seq_stop,
323 .show = rt_cpu_seq_show,
324};
325
326
327static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328{
329 return seq_open(file, &rt_cpu_seq_ops);
330}
331
332static const struct file_operations rt_cpu_seq_fops = {
333 .open = rt_cpu_seq_open,
334 .read = seq_read,
335 .llseek = seq_lseek,
336 .release = seq_release,
337};
338
339#ifdef CONFIG_IP_ROUTE_CLASSID
340static int rt_acct_proc_show(struct seq_file *m, void *v)
341{
342 struct ip_rt_acct *dst, *src;
343 unsigned int i, j;
344
345 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346 if (!dst)
347 return -ENOMEM;
348
349 for_each_possible_cpu(i) {
350 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351 for (j = 0; j < 256; j++) {
352 dst[j].o_bytes += src[j].o_bytes;
353 dst[j].o_packets += src[j].o_packets;
354 dst[j].i_bytes += src[j].i_bytes;
355 dst[j].i_packets += src[j].i_packets;
356 }
357 }
358
359 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360 kfree(dst);
361 return 0;
362}
363#endif
364
365static int __net_init ip_rt_do_proc_init(struct net *net)
366{
367 struct proc_dir_entry *pde;
368
369 pde = proc_create("rt_cache", 0444, net->proc_net,
370 &rt_cache_seq_fops);
371 if (!pde)
372 goto err1;
373
374 pde = proc_create("rt_cache", 0444,
375 net->proc_net_stat, &rt_cpu_seq_fops);
376 if (!pde)
377 goto err2;
378
379#ifdef CONFIG_IP_ROUTE_CLASSID
380 pde = proc_create_single("rt_acct", 0, net->proc_net,
381 rt_acct_proc_show);
382 if (!pde)
383 goto err3;
384#endif
385 return 0;
386
387#ifdef CONFIG_IP_ROUTE_CLASSID
388err3:
389 remove_proc_entry("rt_cache", net->proc_net_stat);
390#endif
391err2:
392 remove_proc_entry("rt_cache", net->proc_net);
393err1:
394 return -ENOMEM;
395}
396
397static void __net_exit ip_rt_do_proc_exit(struct net *net)
398{
399 remove_proc_entry("rt_cache", net->proc_net_stat);
400 remove_proc_entry("rt_cache", net->proc_net);
401#ifdef CONFIG_IP_ROUTE_CLASSID
402 remove_proc_entry("rt_acct", net->proc_net);
403#endif
404}
405
406static struct pernet_operations ip_rt_proc_ops __net_initdata = {
407 .init = ip_rt_do_proc_init,
408 .exit = ip_rt_do_proc_exit,
409};
410
411static int __init ip_rt_proc_init(void)
412{
413 return register_pernet_subsys(&ip_rt_proc_ops);
414}
415
416#else
417static inline int ip_rt_proc_init(void)
418{
419 return 0;
420}
421#endif /* CONFIG_PROC_FS */
422
423static inline bool rt_is_expired(const struct rtable *rth)
424{
425 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
426}
427
428void rt_cache_flush(struct net *net)
429{
430 rt_genid_bump_ipv4(net);
431}
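/* Flushing is lazy: bumping the per-netns generation id does not walk any
 * cache. Each cached rtable records the genid it was created under, and
 * rt_is_expired() above reports a mismatch the next time the entry is
 * validated, so stale entries are dropped on their next use, e.g.:
 *
 *	rt_genid_bump_ipv4(net);
 *	...
 *	if (rt_is_expired(rth))
 *		... entry is discarded and a fresh lookup is done ...
 */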
432
433static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
434 struct sk_buff *skb,
435 const void *daddr)
436{
  437 const struct rtable *rt = container_of(dst, struct rtable, dst);
  438 struct net_device *dev = dst->dev;
  439 struct neighbour *n;
  440
  441 rcu_read_lock_bh();
  442
  443 if (likely(rt->rt_gw_family == AF_INET)) {
444 n = ip_neigh_gw4(dev, rt->rt_gw4);
445 } else if (rt->rt_gw_family == AF_INET6) {
446 n = ip_neigh_gw6(dev, &rt->rt_gw6);
447 } else {
448 __be32 pkey;
449
450 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
451 n = ip_neigh_gw4(dev, pkey);
452 }
453
454 if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
455 n = NULL;
456
457 rcu_read_unlock_bh();
458
459 return n;
  460}
461
462static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
463{
  464 const struct rtable *rt = container_of(dst, struct rtable, dst);
  465 struct net_device *dev = dst->dev;
  466 const __be32 *pkey = daddr;
  467
  468 if (rt->rt_gw_family == AF_INET) {
469 pkey = (const __be32 *)&rt->rt_gw4;
470 } else if (rt->rt_gw_family == AF_INET6) {
471 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
472 } else if (!daddr ||
  473 (rt->rt_flags &
  474 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
  475 return;
  476 }
  477 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
478}
479
  480/* Hash tables of size 2048..262144 depending on RAM size.
  481 * Each bucket uses 8 bytes.
  482 */
  483static u32 ip_idents_mask __read_mostly;
  484static atomic_t *ip_idents __read_mostly;
485static u32 *ip_tstamps __read_mostly;
486
487/* In order to protect privacy, we add a perturbation to identifiers
  488 * if one generator is seldom used. This makes it hard for an attacker
489 * to infer how many packets were sent between two points in time.
490 */
491u32 ip_idents_reserve(u32 hash, int segs)
492{
  493 u32 bucket, old, now = (u32)jiffies;
494 atomic_t *p_id;
495 u32 *p_tstamp;
496 u32 delta = 0;
497
498 bucket = hash & ip_idents_mask;
499 p_tstamp = ip_tstamps + bucket;
500 p_id = ip_idents + bucket;
501 old = READ_ONCE(*p_tstamp);
  502
503 if (old != now && cmpxchg(p_tstamp, old, now) == old)
504 delta = prandom_u32_max(now - old);
505
  506 /* If UBSAN reports an error here, please make sure your compiler
  507 * supports -fno-strict-overflow before reporting it; that was a bug
  508 * in UBSAN, and it has been fixed in GCC-8.
  509 */
510 return atomic_add_return(segs + delta, p_id) - segs;
  511}
512EXPORT_SYMBOL(ip_idents_reserve);
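/* Worked example: suppose the hash selects a bucket whose generator was last
 * touched 250 jiffies ago. cmpxchg() updates the timestamp and delta becomes
 * prandom_u32_max(250), i.e. up to 250 IDs are skipped at random before the
 * atomic_add_return(). A busy bucket (old == now) gets delta = 0 and keeps
 * handing out strictly sequential IDs, so the perturbation only spends ID
 * space on otherwise idle generators.
 */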
513
514void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
515{
  516 u32 hash, id;
  517
  518 /* Note the following code is not safe, but this is okay. */
  519 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
  520 get_random_bytes(&net->ipv4.ip_id_key,
  521 sizeof(net->ipv4.ip_id_key));
  522
  523 hash = siphash_3u32((__force u32)iph->daddr,
  524 (__force u32)iph->saddr,
  525 iph->protocol,
  526 &net->ipv4.ip_id_key);
  527 id = ip_idents_reserve(hash, segs);
528 iph->id = htons(id);
529}
530EXPORT_SYMBOL(__ip_select_ident);
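/* Typical caller pattern (see ip_select_ident_segs() in include/net/ip.h):
 * atomic (DF) datagrams can use a per-socket counter or simply id = 0, and
 * only the non-DF case falls back to the hashed generator, roughly:
 *
 *	if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df)
 *		iph->id = 0;	(or the socket's inet_id counter)
 *	else
 *		__ip_select_ident(net, iph, segs);
 */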
531
532static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
533 const struct sock *sk,
534 const struct iphdr *iph,
535 int oif, u8 tos,
536 u8 prot, u32 mark, int flow_flags)
537{
538 if (sk) {
539 const struct inet_sock *inet = inet_sk(sk);
540
541 oif = sk->sk_bound_dev_if;
542 mark = sk->sk_mark;
543 tos = RT_CONN_FLAGS(sk);
544 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
545 }
546 flowi4_init_output(fl4, oif, mark, tos,
547 RT_SCOPE_UNIVERSE, prot,
548 flow_flags,
549 iph->daddr, iph->saddr, 0, 0,
550 sock_net_uid(net, sk));
551}
552
553static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
554 const struct sock *sk)
555{
556 const struct net *net = dev_net(skb->dev);
557 const struct iphdr *iph = ip_hdr(skb);
558 int oif = skb->dev->ifindex;
559 u8 tos = RT_TOS(iph->tos);
560 u8 prot = iph->protocol;
561 u32 mark = skb->mark;
562
563 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
564}
565
566static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
567{
568 const struct inet_sock *inet = inet_sk(sk);
569 const struct ip_options_rcu *inet_opt;
570 __be32 daddr = inet->inet_daddr;
571
572 rcu_read_lock();
573 inet_opt = rcu_dereference(inet->inet_opt);
574 if (inet_opt && inet_opt->opt.srr)
575 daddr = inet_opt->opt.faddr;
576 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
577 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
578 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
579 inet_sk_flowi_flags(sk),
580 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
581 rcu_read_unlock();
582}
583
584static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
585 const struct sk_buff *skb)
586{
587 if (skb)
588 build_skb_flow_key(fl4, skb, sk);
589 else
590 build_sk_flow_key(fl4, sk);
591}
592
593static DEFINE_SPINLOCK(fnhe_lock);
594
595static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
596{
597 struct rtable *rt;
598
599 rt = rcu_dereference(fnhe->fnhe_rth_input);
600 if (rt) {
601 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
602 dst_dev_put(&rt->dst);
603 dst_release(&rt->dst);
604 }
605 rt = rcu_dereference(fnhe->fnhe_rth_output);
606 if (rt) {
607 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
608 dst_dev_put(&rt->dst);
609 dst_release(&rt->dst);
610 }
611}
612
  613static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
  614{
  615 struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
  616 struct fib_nh_exception *fnhe, *oldest = NULL;
  617
  618 for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
619 fnhe = rcu_dereference_protected(*fnhe_p,
620 lockdep_is_held(&fnhe_lock));
621 if (!fnhe)
622 break;
623 if (!oldest ||
624 time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
  625 oldest = fnhe;
  626 oldest_p = fnhe_p;
  627 }
  628 }
629 fnhe_flush_routes(oldest);
  630 *oldest_p = oldest->fnhe_next;
  631 kfree_rcu(oldest, rcu);
  632}
633
634static inline u32 fnhe_hashfun(__be32 daddr)
635{
636 static u32 fnhe_hashrnd __read_mostly;
637 u32 hval;
638
639 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
640 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
641 return hash_32(hval, FNHE_HASH_SHIFT);
642}
643
644static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
645{
646 rt->rt_pmtu = fnhe->fnhe_pmtu;
647 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
648 rt->dst.expires = fnhe->fnhe_expires;
649
650 if (fnhe->fnhe_gw) {
651 rt->rt_flags |= RTCF_REDIRECTED;
  652 rt->rt_uses_gateway = 1;
  653 rt->rt_gw_family = AF_INET;
  654 rt->rt_gw4 = fnhe->fnhe_gw;
  655 }
656}
657
  658static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
659 __be32 gw, u32 pmtu, bool lock,
660 unsigned long expires)
  661{
662 struct fnhe_hash_bucket *hash;
663 struct fib_nh_exception *fnhe;
664 struct rtable *rt;
665 u32 genid, hval;
666 unsigned int i;
667 int depth;
668
  669 genid = fnhe_genid(dev_net(nhc->nhc_dev));
  670 hval = fnhe_hashfun(daddr);
671
672 spin_lock_bh(&fnhe_lock);
673
  674 hash = rcu_dereference(nhc->nhc_exceptions);
  675 if (!hash) {
676 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
677 if (!hash)
678 goto out_unlock;
  679 rcu_assign_pointer(nhc->nhc_exceptions, hash);
  680 }
681
682 hash += hval;
683
684 depth = 0;
685 for (fnhe = rcu_dereference(hash->chain); fnhe;
686 fnhe = rcu_dereference(fnhe->fnhe_next)) {
687 if (fnhe->fnhe_daddr == daddr)
688 break;
689 depth++;
690 }
691
692 if (fnhe) {
693 if (fnhe->fnhe_genid != genid)
694 fnhe->fnhe_genid = genid;
695 if (gw)
696 fnhe->fnhe_gw = gw;
697 if (pmtu) {
698 fnhe->fnhe_pmtu = pmtu;
699 fnhe->fnhe_mtu_locked = lock;
700 }
701 fnhe->fnhe_expires = max(1UL, expires);
702 /* Update all cached dsts too */
703 rt = rcu_dereference(fnhe->fnhe_rth_input);
704 if (rt)
705 fill_route_from_fnhe(rt, fnhe);
706 rt = rcu_dereference(fnhe->fnhe_rth_output);
707 if (rt)
708 fill_route_from_fnhe(rt, fnhe);
709 } else {
  710 /* Randomize max depth to avoid some side channel attacks. */
711 int max_depth = FNHE_RECLAIM_DEPTH +
712 prandom_u32_max(FNHE_RECLAIM_DEPTH);
  713
  714 while (depth > max_depth) {
715 fnhe_remove_oldest(hash);
716 depth--;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000717 }
Olivier Deprez0e641232021-09-23 10:07:05 +0200718
719 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
720 if (!fnhe)
721 goto out_unlock;
722
723 fnhe->fnhe_next = hash->chain;
724
  725 fnhe->fnhe_genid = genid;
726 fnhe->fnhe_daddr = daddr;
727 fnhe->fnhe_gw = gw;
728 fnhe->fnhe_pmtu = pmtu;
729 fnhe->fnhe_mtu_locked = lock;
730 fnhe->fnhe_expires = max(1UL, expires);
731
  732 rcu_assign_pointer(hash->chain, fnhe);
733
  734 /* Exception created; mark the cached routes for the nexthop
735 * stale, so anyone caching it rechecks if this exception
736 * applies to them.
737 */
  738 rt = rcu_dereference(nhc->nhc_rth_input);
  739 if (rt)
740 rt->dst.obsolete = DST_OBSOLETE_KILL;
741
742 for_each_possible_cpu(i) {
743 struct rtable __rcu **prt;
  744 prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
  745 rt = rcu_dereference(*prt);
746 if (rt)
747 rt->dst.obsolete = DST_OBSOLETE_KILL;
748 }
749 }
750
751 fnhe->fnhe_stamp = jiffies;
752
753out_unlock:
754 spin_unlock_bh(&fnhe_lock);
755}
756
757static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
758 bool kill_route)
759{
760 __be32 new_gw = icmp_hdr(skb)->un.gateway;
761 __be32 old_gw = ip_hdr(skb)->saddr;
762 struct net_device *dev = skb->dev;
763 struct in_device *in_dev;
764 struct fib_result res;
765 struct neighbour *n;
766 struct net *net;
767
768 switch (icmp_hdr(skb)->code & 7) {
769 case ICMP_REDIR_NET:
770 case ICMP_REDIR_NETTOS:
771 case ICMP_REDIR_HOST:
772 case ICMP_REDIR_HOSTTOS:
773 break;
774
775 default:
776 return;
777 }
778
  779 if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
  780 return;
781
782 in_dev = __in_dev_get_rcu(dev);
783 if (!in_dev)
784 return;
785
786 net = dev_net(dev);
787 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
788 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
789 ipv4_is_zeronet(new_gw))
790 goto reject_redirect;
791
792 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
793 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
794 goto reject_redirect;
795 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
796 goto reject_redirect;
797 } else {
798 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
799 goto reject_redirect;
800 }
801
802 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
803 if (!n)
804 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
805 if (!IS_ERR(n)) {
806 if (!(n->nud_state & NUD_VALID)) {
807 neigh_event_send(n, NULL);
808 } else {
809 if (fib_lookup(net, fl4, &res, 0) == 0) {
  810 struct fib_nh_common *nhc;
  811
  812 fib_select_path(net, &res, fl4, skb);
  813 nhc = FIB_RES_NHC(res);
  814 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
  815 0, false,
816 jiffies + ip_rt_gc_timeout);
817 }
818 if (kill_route)
819 rt->dst.obsolete = DST_OBSOLETE_KILL;
820 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
821 }
822 neigh_release(n);
823 }
824 return;
825
826reject_redirect:
827#ifdef CONFIG_IP_ROUTE_VERBOSE
828 if (IN_DEV_LOG_MARTIANS(in_dev)) {
829 const struct iphdr *iph = (const struct iphdr *) skb->data;
830 __be32 daddr = iph->daddr;
831 __be32 saddr = iph->saddr;
832
833 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
834 " Advised path = %pI4 -> %pI4\n",
835 &old_gw, dev->name, &new_gw,
836 &saddr, &daddr);
837 }
838#endif
839 ;
840}
841
842static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
843{
844 struct rtable *rt;
845 struct flowi4 fl4;
846 const struct iphdr *iph = (const struct iphdr *) skb->data;
847 struct net *net = dev_net(skb->dev);
848 int oif = skb->dev->ifindex;
849 u8 tos = RT_TOS(iph->tos);
850 u8 prot = iph->protocol;
851 u32 mark = skb->mark;
852
853 rt = (struct rtable *) dst;
854
855 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
856 __ip_do_redirect(rt, skb, &fl4, true);
857}
858
859static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
860{
861 struct rtable *rt = (struct rtable *)dst;
862 struct dst_entry *ret = dst;
863
864 if (rt) {
865 if (dst->obsolete > 0) {
866 ip_rt_put(rt);
867 ret = NULL;
868 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
869 rt->dst.expires) {
870 ip_rt_put(rt);
871 ret = NULL;
872 }
873 }
874 return ret;
875}
876
877/*
878 * Algorithm:
879 * 1. The first ip_rt_redirect_number redirects are sent
880 * with exponential backoff, then we stop sending them at all,
881 * assuming that the host ignores our redirects.
882 * 2. If we did not see packets requiring redirects
883 * during ip_rt_redirect_silence, we assume that the host
  884 * forgot the redirected route, and we start sending redirects again.
885 *
886 * This algorithm is much cheaper and more intelligent than dumb load limiting
887 * in icmp.c.
888 *
889 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
890 * and "frag. need" (breaks PMTU discovery) in icmp.c.
891 */
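/* With the default tunables above this works out as follows: the first
 * redirect is sent immediately; each later one is only sent once jiffies
 * pass rate_last + (ip_rt_redirect_load << n_redirects), so the gap doubles
 * each time (about 40 ms after the first, up to roughly 5 s before the
 * ninth). After ip_rt_redirect_number (9) redirects we stop entirely, and
 * only a quiet period of ip_rt_redirect_silence (roughly 20 s) resets the
 * counters.
 */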
892
893void ip_rt_send_redirect(struct sk_buff *skb)
894{
895 struct rtable *rt = skb_rtable(skb);
896 struct in_device *in_dev;
897 struct inet_peer *peer;
898 struct net *net;
899 int log_martians;
900 int vif;
901
902 rcu_read_lock();
903 in_dev = __in_dev_get_rcu(rt->dst.dev);
904 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
905 rcu_read_unlock();
906 return;
907 }
908 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
909 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
910 rcu_read_unlock();
911
912 net = dev_net(rt->dst.dev);
913 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
914 if (!peer) {
915 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
916 rt_nexthop(rt, ip_hdr(skb)->daddr));
917 return;
918 }
919
920 /* No redirected packets during ip_rt_redirect_silence;
921 * reset the algorithm.
922 */
  923 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
  924 peer->rate_tokens = 0;
  925 peer->n_redirects = 0;
  926 }
  927
  928 /* Too many ignored redirects; do not send anything and
  929 * set dst.rate_last to the last seen redirected packet.
  930 */
  931 if (peer->n_redirects >= ip_rt_redirect_number) {
  932 peer->rate_last = jiffies;
933 goto out_put_peer;
934 }
935
936 /* Check for load limit; set rate_last to the latest sent
937 * redirect.
938 */
  939 if (peer->n_redirects == 0 ||
  940 time_after(jiffies,
  941 (peer->rate_last +
  942 (ip_rt_redirect_load << peer->n_redirects)))) {
  943 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
944
945 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
946 peer->rate_last = jiffies;
  947 ++peer->n_redirects;
  948#ifdef CONFIG_IP_ROUTE_VERBOSE
  949 if (log_martians &&
  950 peer->n_redirects == ip_rt_redirect_number)
  951 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
952 &ip_hdr(skb)->saddr, inet_iif(skb),
953 &ip_hdr(skb)->daddr, &gw);
954#endif
955 }
956out_put_peer:
957 inet_putpeer(peer);
958}
959
960static int ip_error(struct sk_buff *skb)
961{
962 struct rtable *rt = skb_rtable(skb);
963 struct net_device *dev = skb->dev;
964 struct in_device *in_dev;
965 struct inet_peer *peer;
966 unsigned long now;
967 struct net *net;
968 bool send;
969 int code;
970
971 if (netif_is_l3_master(skb->dev)) {
972 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
973 if (!dev)
974 goto out;
975 }
976
977 in_dev = __in_dev_get_rcu(dev);
978
979 /* IP on this device is disabled. */
980 if (!in_dev)
981 goto out;
982
983 net = dev_net(rt->dst.dev);
984 if (!IN_DEV_FORWARD(in_dev)) {
985 switch (rt->dst.error) {
986 case EHOSTUNREACH:
987 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
988 break;
989
990 case ENETUNREACH:
991 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
992 break;
993 }
994 goto out;
995 }
996
997 switch (rt->dst.error) {
998 case EINVAL:
999 default:
1000 goto out;
1001 case EHOSTUNREACH:
1002 code = ICMP_HOST_UNREACH;
1003 break;
1004 case ENETUNREACH:
1005 code = ICMP_NET_UNREACH;
1006 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1007 break;
1008 case EACCES:
1009 code = ICMP_PKT_FILTERED;
1010 break;
1011 }
1012
1013 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1014 l3mdev_master_ifindex(skb->dev), 1);
1015
1016 send = true;
1017 if (peer) {
1018 now = jiffies;
1019 peer->rate_tokens += now - peer->rate_last;
1020 if (peer->rate_tokens > ip_rt_error_burst)
1021 peer->rate_tokens = ip_rt_error_burst;
1022 peer->rate_last = now;
1023 if (peer->rate_tokens >= ip_rt_error_cost)
1024 peer->rate_tokens -= ip_rt_error_cost;
1025 else
1026 send = false;
1027 inet_putpeer(peer);
1028 }
1029 if (send)
1030 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1031
1032out: kfree_skb(skb);
1033 return 0;
1034}
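/* The peer entry above implements a token bucket counted in jiffies: tokens
 * accrue one per jiffy since rate_last, capped at ip_rt_error_burst (5 * HZ),
 * and each ICMP error sent costs ip_rt_error_cost (HZ). With the defaults
 * that is a sustained rate of about one ICMP error per second per source,
 * with bursts of up to five.
 */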
1035
1036static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1037{
1038 struct dst_entry *dst = &rt->dst;
 1039 struct net *net = dev_net(dst->dev);
 1040 u32 old_mtu = ipv4_mtu(dst);
1041 struct fib_result res;
1042 bool lock = false;
1043
1044 if (ip_mtu_locked(dst))
1045 return;
1046
1047 if (old_mtu < mtu)
1048 return;
1049
1050 if (mtu < ip_rt_min_pmtu) {
1051 lock = true;
1052 mtu = min(old_mtu, ip_rt_min_pmtu);
1053 }
1054
1055 if (rt->rt_pmtu == mtu && !lock &&
1056 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1057 return;
1058
1059 rcu_read_lock();
 1060 if (fib_lookup(net, fl4, &res, 0) == 0) {
 1061 struct fib_nh_common *nhc;
 1062
 1063 fib_select_path(net, &res, fl4, NULL);
 1064 nhc = FIB_RES_NHC(res);
 1065 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
 1066 jiffies + ip_rt_mtu_expires);
1067 }
1068 rcu_read_unlock();
1069}
1070
1071static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 1072 struct sk_buff *skb, u32 mtu,
 1073 bool confirm_neigh)
 1074{
1075 struct rtable *rt = (struct rtable *) dst;
1076 struct flowi4 fl4;
1077
1078 ip_rt_build_flow_key(&fl4, sk, skb);
1079 __ip_rt_update_pmtu(rt, &fl4, mtu);
1080}
1081
1082void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 1083 int oif, u8 protocol)
 1084{
1085 const struct iphdr *iph = (const struct iphdr *) skb->data;
1086 struct flowi4 fl4;
1087 struct rtable *rt;
 1088 u32 mark = IP4_REPLY_MARK(net, skb->mark);
 1089
1090 __build_flow_key(net, &fl4, NULL, iph, oif,
 1091 RT_TOS(iph->tos), protocol, mark, 0);
 1092 rt = __ip_route_output_key(net, &fl4);
1093 if (!IS_ERR(rt)) {
1094 __ip_rt_update_pmtu(rt, &fl4, mtu);
1095 ip_rt_put(rt);
1096 }
1097}
1098EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
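/* Illustrative caller (a sketch, not a specific in-tree user): a tunnel or
 * transport error handler that receives ICMP_DEST_UNREACH/ICMP_FRAG_NEEDED
 * for a packet it sent can feed the advertised next-hop MTU back into the
 * routing layer along these lines:
 *
 *	if (icmp_hdr(icmp_skb)->type == ICMP_DEST_UNREACH &&
 *	    icmp_hdr(icmp_skb)->code == ICMP_FRAG_NEEDED)
 *		ipv4_update_pmtu(icmp_skb, dev_net(icmp_skb->dev),
 *				 ntohs(icmp_hdr(icmp_skb)->un.frag.mtu),
 *				 0, IPPROTO_GRE);
 *
 * where IPPROTO_GRE is only an example protocol.
 */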
1099
1100static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1101{
1102 const struct iphdr *iph = (const struct iphdr *) skb->data;
1103 struct flowi4 fl4;
1104 struct rtable *rt;
1105
1106 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1107
1108 if (!fl4.flowi4_mark)
1109 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1110
1111 rt = __ip_route_output_key(sock_net(sk), &fl4);
1112 if (!IS_ERR(rt)) {
1113 __ip_rt_update_pmtu(rt, &fl4, mtu);
1114 ip_rt_put(rt);
1115 }
1116}
1117
1118void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1119{
1120 const struct iphdr *iph = (const struct iphdr *) skb->data;
1121 struct flowi4 fl4;
1122 struct rtable *rt;
1123 struct dst_entry *odst = NULL;
1124 bool new = false;
1125 struct net *net = sock_net(sk);
1126
1127 bh_lock_sock(sk);
1128
1129 if (!ip_sk_accept_pmtu(sk))
1130 goto out;
1131
1132 odst = sk_dst_get(sk);
1133
1134 if (sock_owned_by_user(sk) || !odst) {
1135 __ipv4_sk_update_pmtu(skb, sk, mtu);
1136 goto out;
1137 }
1138
1139 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1140
1141 rt = (struct rtable *)odst;
1142 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1143 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1144 if (IS_ERR(rt))
1145 goto out;
1146
1147 new = true;
1148 }
1149
1150 __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1151
1152 if (!dst_check(&rt->dst, 0)) {
1153 if (new)
1154 dst_release(&rt->dst);
1155
1156 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1157 if (IS_ERR(rt))
1158 goto out;
1159
1160 new = true;
1161 }
1162
1163 if (new)
1164 sk_dst_set(sk, &rt->dst);
1165
1166out:
1167 bh_unlock_sock(sk);
1168 dst_release(odst);
1169}
1170EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1171
1172void ipv4_redirect(struct sk_buff *skb, struct net *net,
 1173 int oif, u8 protocol)
 1174{
1175 const struct iphdr *iph = (const struct iphdr *) skb->data;
1176 struct flowi4 fl4;
1177 struct rtable *rt;
1178
1179 __build_flow_key(net, &fl4, NULL, iph, oif,
 1180 RT_TOS(iph->tos), protocol, 0, 0);
 1181 rt = __ip_route_output_key(net, &fl4);
1182 if (!IS_ERR(rt)) {
1183 __ip_do_redirect(rt, skb, &fl4, false);
1184 ip_rt_put(rt);
1185 }
1186}
1187EXPORT_SYMBOL_GPL(ipv4_redirect);
1188
1189void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1190{
1191 const struct iphdr *iph = (const struct iphdr *) skb->data;
1192 struct flowi4 fl4;
1193 struct rtable *rt;
1194 struct net *net = sock_net(sk);
1195
1196 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1197 rt = __ip_route_output_key(net, &fl4);
1198 if (!IS_ERR(rt)) {
1199 __ip_do_redirect(rt, skb, &fl4, false);
1200 ip_rt_put(rt);
1201 }
1202}
1203EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1204
1205static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1206{
1207 struct rtable *rt = (struct rtable *) dst;
1208
1209 /* All IPV4 dsts are created with ->obsolete set to the value
1210 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1211 * into this function always.
1212 *
1213 * When a PMTU/redirect information update invalidates a route,
1214 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
 1215 * DST_OBSOLETE_DEAD.
 1216 */
1217 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1218 return NULL;
1219 return dst;
1220}
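/* Callers reach this via dst_check()/sk_dst_check(); a NULL return forces a
 * fresh route lookup, e.g. the pattern used by ipv4_sk_update_pmtu() above:
 *
 *	if (!dst_check(&rt->dst, 0))
 *		rt = ip_route_output_flow(net, &fl4, sk);
 */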
1221
 1222static void ipv4_send_dest_unreach(struct sk_buff *skb)
1223{
1224 struct ip_options opt;
1225 int res;
1226
1227 /* Recompile ip options since IPCB may not be valid anymore.
1228 * Also check we have a reasonable ipv4 header.
1229 */
1230 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1231 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1232 return;
1233
1234 memset(&opt, 0, sizeof(opt));
1235 if (ip_hdr(skb)->ihl > 5) {
1236 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1237 return;
1238 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1239
1240 rcu_read_lock();
1241 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1242 rcu_read_unlock();
1243
1244 if (res)
1245 return;
1246 }
1247 __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1248}
1249
 1250static void ipv4_link_failure(struct sk_buff *skb)
1251{
1252 struct rtable *rt;
1253
 1254 ipv4_send_dest_unreach(skb);
 1255
1256 rt = skb_rtable(skb);
1257 if (rt)
1258 dst_set_expires(&rt->dst, 0);
1259}
1260
1261static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1262{
1263 pr_debug("%s: %pI4 -> %pI4, %s\n",
1264 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1265 skb->dev ? skb->dev->name : "?");
1266 kfree_skb(skb);
1267 WARN_ON(1);
1268 return 0;
1269}
1270
1271/*
 1272 We do not cache the source address of the outgoing interface,
 1273 because it is used only by the IP RR, TS and SRR options,
 1274 so it stays out of the fast path.
 1275
 1276 BTW remember: "addr" is allowed to be unaligned
 1277 in IP options!
1278 */
1279
1280void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1281{
1282 __be32 src;
1283
1284 if (rt_is_output_route(rt))
1285 src = ip_hdr(skb)->saddr;
1286 else {
1287 struct fib_result res;
 1288 struct iphdr *iph = ip_hdr(skb);
1289 struct flowi4 fl4 = {
1290 .daddr = iph->daddr,
1291 .saddr = iph->saddr,
1292 .flowi4_tos = RT_TOS(iph->tos),
1293 .flowi4_oif = rt->dst.dev->ifindex,
1294 .flowi4_iif = skb->dev->ifindex,
1295 .flowi4_mark = skb->mark,
1296 };
 1297
1298 rcu_read_lock();
1299 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
 1300 src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
 1301 else
1302 src = inet_select_addr(rt->dst.dev,
1303 rt_nexthop(rt, iph->daddr),
1304 RT_SCOPE_UNIVERSE);
1305 rcu_read_unlock();
1306 }
1307 memcpy(addr, &src, 4);
1308}
1309
1310#ifdef CONFIG_IP_ROUTE_CLASSID
1311static void set_class_tag(struct rtable *rt, u32 tag)
1312{
1313 if (!(rt->dst.tclassid & 0xFFFF))
1314 rt->dst.tclassid |= tag & 0xFFFF;
1315 if (!(rt->dst.tclassid & 0xFFFF0000))
1316 rt->dst.tclassid |= tag & 0xFFFF0000;
1317}
1318#endif
1319
1320static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1321{
1322 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1323 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1324 ip_rt_min_advmss);
1325
1326 return min(advmss, IPV4_MAX_PMTU - header_size);
1327}
1328
1329static unsigned int ipv4_mtu(const struct dst_entry *dst)
1330{
1331 const struct rtable *rt = (const struct rtable *) dst;
1332 unsigned int mtu = rt->rt_pmtu;
1333
1334 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1335 mtu = dst_metric_raw(dst, RTAX_MTU);
1336
1337 if (mtu)
 1338 goto out;
 1339
1340 mtu = READ_ONCE(dst->dev->mtu);
1341
1342 if (unlikely(ip_mtu_locked(dst))) {
1343 if (rt->rt_uses_gateway && mtu > 576)
1344 mtu = 576;
1345 }
1346
 1347out:
 1348 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1349
1350 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1351}
1352
 1353static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
 1354{
1355 struct fnhe_hash_bucket *hash;
1356 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1357 u32 hval = fnhe_hashfun(daddr);
1358
1359 spin_lock_bh(&fnhe_lock);
1360
 1361 hash = rcu_dereference_protected(nhc->nhc_exceptions,
 1362 lockdep_is_held(&fnhe_lock));
1363 hash += hval;
1364
1365 fnhe_p = &hash->chain;
1366 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1367 while (fnhe) {
1368 if (fnhe->fnhe_daddr == daddr) {
1369 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1370 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
 1371 /* set fnhe_daddr to 0 to ensure it won't bind with
1372 * new dsts in rt_bind_exception().
1373 */
1374 fnhe->fnhe_daddr = 0;
 1375 fnhe_flush_routes(fnhe);
1376 kfree_rcu(fnhe, rcu);
1377 break;
1378 }
1379 fnhe_p = &fnhe->fnhe_next;
1380 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1381 lockdep_is_held(&fnhe_lock));
1382 }
1383
1384 spin_unlock_bh(&fnhe_lock);
1385}
1386
 1387static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
 1388 __be32 daddr)
 1389{
 1390 struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
 1391 struct fib_nh_exception *fnhe;
1392 u32 hval;
1393
1394 if (!hash)
1395 return NULL;
1396
1397 hval = fnhe_hashfun(daddr);
1398
1399 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1400 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1401 if (fnhe->fnhe_daddr == daddr) {
1402 if (fnhe->fnhe_expires &&
1403 time_after(jiffies, fnhe->fnhe_expires)) {
 1404 ip_del_fnhe(nhc, daddr);
 1405 break;
1406 }
1407 return fnhe;
1408 }
1409 }
1410 return NULL;
1411}
1412
1413/* MTU selection:
1414 * 1. mtu on route is locked - use it
1415 * 2. mtu from nexthop exception
1416 * 3. mtu from egress device
1417 */
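/* Example: a route whose fib_info carries a locked mtu of 1400 uses 1400
 * even if a nexthop exception has learned a PMTU of 1280; without the lock
 * the unexpired exception value (1280) wins, and only when neither is set do
 * we fall back to the egress device mtu, clamped to IP_MAX_MTU.
 */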
1418
1419u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1420{
 1421 struct fib_nh_common *nhc = res->nhc;
 1422 struct net_device *dev = nhc->nhc_dev;
 1423 struct fib_info *fi = res->fi;
 1424 u32 mtu = 0;
1425
1426 if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1427 fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1428 mtu = fi->fib_mtu;
1429
1430 if (likely(!mtu)) {
1431 struct fib_nh_exception *fnhe;
1432
 1433 fnhe = find_exception(nhc, daddr);
 1434 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1435 mtu = fnhe->fnhe_pmtu;
1436 }
1437
1438 if (likely(!mtu))
1439 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1440
 1441 return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
 1442}
1443
1444static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1445 __be32 daddr, const bool do_cache)
1446{
1447 bool ret = false;
1448
1449 spin_lock_bh(&fnhe_lock);
1450
1451 if (daddr == fnhe->fnhe_daddr) {
1452 struct rtable __rcu **porig;
1453 struct rtable *orig;
1454 int genid = fnhe_genid(dev_net(rt->dst.dev));
1455
1456 if (rt_is_input_route(rt))
1457 porig = &fnhe->fnhe_rth_input;
1458 else
1459 porig = &fnhe->fnhe_rth_output;
1460 orig = rcu_dereference(*porig);
1461
1462 if (fnhe->fnhe_genid != genid) {
1463 fnhe->fnhe_genid = genid;
1464 fnhe->fnhe_gw = 0;
1465 fnhe->fnhe_pmtu = 0;
1466 fnhe->fnhe_expires = 0;
1467 fnhe->fnhe_mtu_locked = false;
1468 fnhe_flush_routes(fnhe);
1469 orig = NULL;
1470 }
1471 fill_route_from_fnhe(rt, fnhe);
 1472 if (!rt->rt_gw4) {
1473 rt->rt_gw4 = daddr;
1474 rt->rt_gw_family = AF_INET;
1475 }
 1476
1477 if (do_cache) {
1478 dst_hold(&rt->dst);
1479 rcu_assign_pointer(*porig, rt);
1480 if (orig) {
1481 dst_dev_put(&orig->dst);
1482 dst_release(&orig->dst);
1483 }
1484 ret = true;
1485 }
1486
1487 fnhe->fnhe_stamp = jiffies;
1488 }
1489 spin_unlock_bh(&fnhe_lock);
1490
1491 return ret;
1492}
1493
 1494static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
 1495{
1496 struct rtable *orig, *prev, **p;
1497 bool ret = true;
1498
1499 if (rt_is_input_route(rt)) {
 1500 p = (struct rtable **)&nhc->nhc_rth_input;
 1501 } else {
 1502 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
 1503 }
1504 orig = *p;
1505
1506 /* hold dst before doing cmpxchg() to avoid race condition
1507 * on this dst
1508 */
1509 dst_hold(&rt->dst);
1510 prev = cmpxchg(p, orig, rt);
1511 if (prev == orig) {
1512 if (orig) {
 1513 rt_add_uncached_list(orig);
 1514 dst_release(&orig->dst);
1515 }
1516 } else {
1517 dst_release(&rt->dst);
1518 ret = false;
1519 }
1520
1521 return ret;
1522}
1523
1524struct uncached_list {
1525 spinlock_t lock;
1526 struct list_head head;
1527};
1528
1529static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1530
1531void rt_add_uncached_list(struct rtable *rt)
1532{
1533 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1534
1535 rt->rt_uncached_list = ul;
1536
1537 spin_lock_bh(&ul->lock);
1538 list_add_tail(&rt->rt_uncached, &ul->head);
1539 spin_unlock_bh(&ul->lock);
1540}
1541
1542void rt_del_uncached_list(struct rtable *rt)
1543{
1544 if (!list_empty(&rt->rt_uncached)) {
1545 struct uncached_list *ul = rt->rt_uncached_list;
1546
1547 spin_lock_bh(&ul->lock);
1548 list_del(&rt->rt_uncached);
1549 spin_unlock_bh(&ul->lock);
1550 }
1551}
1552
1553static void ipv4_dst_destroy(struct dst_entry *dst)
1554{
 1555 struct rtable *rt = (struct rtable *)dst;
 1556
 1557 ip_dst_metrics_put(dst);
 1558 rt_del_uncached_list(rt);
1559}
1560
1561void rt_flush_dev(struct net_device *dev)
1562{
 1563 struct rtable *rt;
1564 int cpu;
1565
1566 for_each_possible_cpu(cpu) {
1567 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1568
1569 spin_lock_bh(&ul->lock);
1570 list_for_each_entry(rt, &ul->head, rt_uncached) {
1571 if (rt->dst.dev != dev)
1572 continue;
 1573 rt->dst.dev = blackhole_netdev;
 1574 dev_hold(rt->dst.dev);
1575 dev_put(dev);
1576 }
1577 spin_unlock_bh(&ul->lock);
1578 }
1579}
1580
1581static bool rt_cache_valid(const struct rtable *rt)
1582{
1583 return rt &&
1584 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1585 !rt_is_expired(rt);
1586}
1587
1588static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1589 const struct fib_result *res,
1590 struct fib_nh_exception *fnhe,
1591 struct fib_info *fi, u16 type, u32 itag,
1592 const bool do_cache)
1593{
1594 bool cached = false;
1595
1596 if (fi) {
 1597 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
 1598
 1599 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
 1600 rt->rt_uses_gateway = 1;
 1601 rt->rt_gw_family = nhc->nhc_gw_family;
1602 /* only INET and INET6 are supported */
1603 if (likely(nhc->nhc_gw_family == AF_INET))
1604 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1605 else
1606 rt->rt_gw6 = nhc->nhc_gw.ipv6;
 1607 }
 1608
1609 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1610
 1611#ifdef CONFIG_IP_ROUTE_CLASSID
 1612 if (nhc->nhc_family == AF_INET) {
1613 struct fib_nh *nh;
1614
1615 nh = container_of(nhc, struct fib_nh, nh_common);
1616 rt->dst.tclassid = nh->nh_tclassid;
1617 }
 1618#endif
 1619 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
 1620 if (unlikely(fnhe))
1621 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1622 else if (do_cache)
 1623 cached = rt_cache_route(nhc, rt);
 1624 if (unlikely(!cached)) {
1625 /* Routes we intend to cache in nexthop exception or
1626 * FIB nexthop have the DST_NOCACHE bit clear.
1627 * However, if we are unsuccessful at storing this
1628 * route into the cache we really need to set it.
1629 */
 1630 if (!rt->rt_gw4) {
1631 rt->rt_gw_family = AF_INET;
1632 rt->rt_gw4 = daddr;
1633 }
 1634 rt_add_uncached_list(rt);
1635 }
1636 } else
1637 rt_add_uncached_list(rt);
1638
1639#ifdef CONFIG_IP_ROUTE_CLASSID
1640#ifdef CONFIG_IP_MULTIPLE_TABLES
1641 set_class_tag(rt, res->tclassid);
1642#endif
1643 set_class_tag(rt, itag);
1644#endif
1645}
1646
1647struct rtable *rt_dst_alloc(struct net_device *dev,
1648 unsigned int flags, u16 type,
1649 bool nopolicy, bool noxfrm, bool will_cache)
1650{
1651 struct rtable *rt;
1652
1653 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1654 (will_cache ? 0 : DST_HOST) |
1655 (nopolicy ? DST_NOPOLICY : 0) |
1656 (noxfrm ? DST_NOXFRM : 0));
1657
1658 if (rt) {
1659 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1660 rt->rt_flags = flags;
1661 rt->rt_type = type;
1662 rt->rt_is_input = 0;
1663 rt->rt_iif = 0;
1664 rt->rt_pmtu = 0;
1665 rt->rt_mtu_locked = 0;
 1666 rt->rt_uses_gateway = 0;
 1667 rt->rt_gw_family = 0;
 1668 rt->rt_gw4 = 0;
 1669 INIT_LIST_HEAD(&rt->rt_uncached);
1670
1671 rt->dst.output = ip_output;
1672 if (flags & RTCF_LOCAL)
1673 rt->dst.input = ip_local_deliver;
1674 }
1675
1676 return rt;
1677}
1678EXPORT_SYMBOL(rt_dst_alloc);
1679
 1680struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1681{
1682 struct rtable *new_rt;
1683
1684 new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1685 rt->dst.flags);
1686
1687 if (new_rt) {
1688 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1689 new_rt->rt_flags = rt->rt_flags;
1690 new_rt->rt_type = rt->rt_type;
1691 new_rt->rt_is_input = rt->rt_is_input;
1692 new_rt->rt_iif = rt->rt_iif;
1693 new_rt->rt_pmtu = rt->rt_pmtu;
1694 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1695 new_rt->rt_gw_family = rt->rt_gw_family;
1696 if (rt->rt_gw_family == AF_INET)
1697 new_rt->rt_gw4 = rt->rt_gw4;
1698 else if (rt->rt_gw_family == AF_INET6)
1699 new_rt->rt_gw6 = rt->rt_gw6;
1700 INIT_LIST_HEAD(&new_rt->rt_uncached);
1701
1702 new_rt->dst.flags |= DST_HOST;
1703 new_rt->dst.input = rt->dst.input;
1704 new_rt->dst.output = rt->dst.output;
1705 new_rt->dst.error = rt->dst.error;
1706 new_rt->dst.lastuse = jiffies;
1707 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1708 }
1709 return new_rt;
1710}
1711EXPORT_SYMBOL(rt_dst_clone);
1712
 1713/* called in rcu_read_lock() section */
1714int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1715 u8 tos, struct net_device *dev,
1716 struct in_device *in_dev, u32 *itag)
1717{
1718 int err;
1719
1720 /* Primary sanity checks. */
1721 if (!in_dev)
1722 return -EINVAL;
1723
1724 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1725 skb->protocol != htons(ETH_P_IP))
1726 return -EINVAL;
1727
1728 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1729 return -EINVAL;
1730
1731 if (ipv4_is_zeronet(saddr)) {
 1732 if (!ipv4_is_local_multicast(daddr) &&
 1733 ip_hdr(skb)->protocol != IPPROTO_IGMP)
 1734 return -EINVAL;
1735 } else {
1736 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1737 in_dev, itag);
1738 if (err < 0)
1739 return err;
1740 }
1741 return 0;
1742}
1743
1744/* called in rcu_read_lock() section */
1745static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1746 u8 tos, struct net_device *dev, int our)
1747{
1748 struct in_device *in_dev = __in_dev_get_rcu(dev);
1749 unsigned int flags = RTCF_MULTICAST;
1750 struct rtable *rth;
1751 u32 itag = 0;
1752 int err;
1753
1754 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1755 if (err)
1756 return err;
1757
1758 if (our)
1759 flags |= RTCF_LOCAL;
1760
1761 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1762 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1763 if (!rth)
1764 return -ENOBUFS;
1765
1766#ifdef CONFIG_IP_ROUTE_CLASSID
1767 rth->dst.tclassid = itag;
1768#endif
1769 rth->dst.output = ip_rt_bug;
1770 rth->rt_is_input= 1;
1771
1772#ifdef CONFIG_IP_MROUTE
1773 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1774 rth->dst.input = ip_mr_input;
1775#endif
1776 RT_CACHE_STAT_INC(in_slow_mc);
1777
1778 skb_dst_set(skb, &rth->dst);
1779 return 0;
1780}
1781
1782
1783static void ip_handle_martian_source(struct net_device *dev,
1784 struct in_device *in_dev,
1785 struct sk_buff *skb,
1786 __be32 daddr,
1787 __be32 saddr)
1788{
1789 RT_CACHE_STAT_INC(in_martian_src);
1790#ifdef CONFIG_IP_ROUTE_VERBOSE
1791 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1792 /*
1793		 * RFC1812 recommendation: if the source is martian,
1794		 * the only hint is the MAC header.
1795 */
1796 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1797 &daddr, &saddr, dev->name);
1798 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1799 print_hex_dump(KERN_WARNING, "ll header: ",
1800 DUMP_PREFIX_OFFSET, 16, 1,
1801 skb_mac_header(skb),
1802				       dev->hard_header_len, false);
1803		}
1804 }
1805#endif
1806}
1807
1808/* called in rcu_read_lock() section */
1809static int __mkroute_input(struct sk_buff *skb,
1810 const struct fib_result *res,
1811 struct in_device *in_dev,
1812 __be32 daddr, __be32 saddr, u32 tos)
1813{
1814	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1815	struct net_device *dev = nhc->nhc_dev;
1816	struct fib_nh_exception *fnhe;
1817 struct rtable *rth;
1818 int err;
1819 struct in_device *out_dev;
1820 bool do_cache;
1821 u32 itag = 0;
1822
1823 /* get a working reference to the output device */
1824	out_dev = __in_dev_get_rcu(dev);
1825	if (!out_dev) {
1826 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1827 return -EINVAL;
1828 }
1829
1830 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1831 in_dev->dev, in_dev, &itag);
1832 if (err < 0) {
1833 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1834 saddr);
1835
1836 goto cleanup;
1837 }
1838
1839 do_cache = res->fi && !itag;
1840 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1841	    skb->protocol == htons(ETH_P_IP)) {
1842 __be32 gw;
1843
1844 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1845 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1846 inet_addr_onlink(out_dev, saddr, gw))
1847 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1848 }
1849
1850 if (skb->protocol != htons(ETH_P_IP)) {
1851		/* Not IP (i.e. ARP). Do not create a route if it is
1852		 * invalid for proxy arp. DNAT routes are always valid.
1853		 *
1854		 * The proxy arp feature has been extended to allow ARP
1855		 * replies back out the same interface, to support
1856		 * Private VLAN switch technologies. See arp.c.
1857 */
1858 if (out_dev == in_dev &&
1859 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1860 err = -EINVAL;
1861 goto cleanup;
1862 }
1863 }
1864
1865	fnhe = find_exception(nhc, daddr);
1866	if (do_cache) {
1867 if (fnhe)
1868 rth = rcu_dereference(fnhe->fnhe_rth_input);
1869 else
1870			rth = rcu_dereference(nhc->nhc_rth_input);
1871		if (rt_cache_valid(rth)) {
1872 skb_dst_set_noref(skb, &rth->dst);
1873 goto out;
1874 }
1875 }
1876
1877 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1878 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1879 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1880 if (!rth) {
1881 err = -ENOBUFS;
1882 goto cleanup;
1883 }
1884
1885 rth->rt_is_input = 1;
1886 RT_CACHE_STAT_INC(in_slow_tot);
1887
1888 rth->dst.input = ip_forward;
1889
1890 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1891 do_cache);
1892 lwtunnel_set_redirect(&rth->dst);
1893 skb_dst_set(skb, &rth->dst);
1894out:
1895 err = 0;
1896 cleanup:
1897 return err;
1898}
1899
1900#ifdef CONFIG_IP_ROUTE_MULTIPATH
1901/* To make ICMP packets follow the right flow, the multipath hash is
1902 * calculated from the inner IP addresses.
1903 */
1904static void ip_multipath_l3_keys(const struct sk_buff *skb,
1905 struct flow_keys *hash_keys)
1906{
1907 const struct iphdr *outer_iph = ip_hdr(skb);
1908 const struct iphdr *key_iph = outer_iph;
1909 const struct iphdr *inner_iph;
1910 const struct icmphdr *icmph;
1911 struct iphdr _inner_iph;
1912 struct icmphdr _icmph;
1913
1914 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1915 goto out;
1916
1917 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1918 goto out;
1919
1920 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1921 &_icmph);
1922 if (!icmph)
1923 goto out;
1924
1925 if (icmph->type != ICMP_DEST_UNREACH &&
1926 icmph->type != ICMP_REDIRECT &&
1927 icmph->type != ICMP_TIME_EXCEEDED &&
1928 icmph->type != ICMP_PARAMETERPROB)
1929 goto out;
1930
1931 inner_iph = skb_header_pointer(skb,
1932 outer_iph->ihl * 4 + sizeof(_icmph),
1933 sizeof(_inner_iph), &_inner_iph);
1934 if (!inner_iph)
1935 goto out;
1936
1937 key_iph = inner_iph;
1938out:
1939 hash_keys->addrs.v4addrs.src = key_iph->saddr;
1940 hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1941}
1942
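/* fib_multipath_hash() computes the flow hash used to pick one next hop of a
 * multipath route. net.ipv4.fib_multipath_hash_policy selects the fields:
 * 0 - layer 3 addresses (for ICMP errors the inner header is used, see
 *     ip_multipath_l3_keys() above),
 * 1 - layer 4 five-tuple,
 * 2 - layer 3 addresses of the inner (encapsulated) packet when present.
 */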
1943/* if skb is set it will be used and fl4 can be NULL */
1944int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1945 const struct sk_buff *skb, struct flow_keys *flkeys)
1946{
1947	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1948	struct flow_keys hash_keys;
1949 u32 mhash;
1950
1951 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1952 case 0:
1953 memset(&hash_keys, 0, sizeof(hash_keys));
1954 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1955 if (skb) {
1956 ip_multipath_l3_keys(skb, &hash_keys);
1957 } else {
1958 hash_keys.addrs.v4addrs.src = fl4->saddr;
1959 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1960 }
1961 break;
1962 case 1:
1963 /* skb is currently provided only when forwarding */
1964 if (skb) {
1965 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1966 struct flow_keys keys;
1967
1968 /* short-circuit if we already have L4 hash present */
1969 if (skb->l4_hash)
1970 return skb_get_hash_raw(skb) >> 1;
1971
1972 memset(&hash_keys, 0, sizeof(hash_keys));
1973
1974 if (!flkeys) {
1975 skb_flow_dissect_flow_keys(skb, &keys, flag);
1976 flkeys = &keys;
1977 }
1978
1979 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1980 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1981 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1982 hash_keys.ports.src = flkeys->ports.src;
1983 hash_keys.ports.dst = flkeys->ports.dst;
1984 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1985 } else {
1986 memset(&hash_keys, 0, sizeof(hash_keys));
1987 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1988 hash_keys.addrs.v4addrs.src = fl4->saddr;
1989 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1990 hash_keys.ports.src = fl4->fl4_sport;
1991 hash_keys.ports.dst = fl4->fl4_dport;
1992 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1993 }
1994 break;
1995	case 2:
1996 memset(&hash_keys, 0, sizeof(hash_keys));
1997 /* skb is currently provided only when forwarding */
1998 if (skb) {
1999 struct flow_keys keys;
2000
2001 skb_flow_dissect_flow_keys(skb, &keys, 0);
2002 /* Inner can be v4 or v6 */
2003 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2004 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2005 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2006 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2007 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2008 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2009 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2010 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2011 hash_keys.tags.flow_label = keys.tags.flow_label;
2012 hash_keys.basic.ip_proto = keys.basic.ip_proto;
2013 } else {
2014 /* Same as case 0 */
2015 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2016 ip_multipath_l3_keys(skb, &hash_keys);
2017 }
2018 } else {
2019 /* Same as case 0 */
2020 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2021 hash_keys.addrs.v4addrs.src = fl4->saddr;
2022 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2023 }
2024 break;
2025	}
2026 mhash = flow_hash_from_keys(&hash_keys);
2027
2028	if (multipath_hash)
2029 mhash = jhash_2words(mhash, multipath_hash, 0);
2030
2031	return mhash >> 1;
2032}
2033#endif /* CONFIG_IP_ROUTE_MULTIPATH */
2034
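/* For multipath routes, hash the flow (fib_multipath_hash() above) and let
 * fib_select_multipath() pick a next hop before the cache entry is built by
 * __mkroute_input().
 */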
2035static int ip_mkroute_input(struct sk_buff *skb,
2036 struct fib_result *res,
2037 struct in_device *in_dev,
2038 __be32 daddr, __be32 saddr, u32 tos,
2039 struct flow_keys *hkeys)
2040{
2041#ifdef CONFIG_IP_ROUTE_MULTIPATH
2042	if (res->fi && fib_info_num_path(res->fi) > 1) {
2043		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2044
2045 fib_select_multipath(res, h);
2046 }
2047#endif
2048
2049 /* create a routing cache entry */
2050 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2051}
2052
2053/*
2054 *	NOTE. We drop all packets that have a local source
2055 *	address, because every properly looped-back packet
2056 *	must already have the correct destination attached by the output routine.
2057 *
2058 *	Such an approach solves two big problems:
2059 *	1. Non-simplex devices are handled properly.
2060 *	2. IP spoofing attempts are filtered with a 100% guarantee.
2061 * called with rcu_read_lock()
2062 */
2063
2064static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2065 u8 tos, struct net_device *dev,
2066 struct fib_result *res)
2067{
2068 struct in_device *in_dev = __in_dev_get_rcu(dev);
2069 struct flow_keys *flkeys = NULL, _flkeys;
2070 struct net *net = dev_net(dev);
2071 struct ip_tunnel_info *tun_info;
2072 int err = -EINVAL;
2073 unsigned int flags = 0;
2074 u32 itag = 0;
2075 struct rtable *rth;
2076 struct flowi4 fl4;
2077	bool do_cache = true;
2078
2079 /* IP on this device is disabled. */
2080
2081 if (!in_dev)
2082 goto out;
2083
2084	/* Check for the most weird martians, which cannot be detected
2085 by fib_lookup.
2086 */
2087
2088 tun_info = skb_tunnel_info(skb);
2089 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2090 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2091 else
2092 fl4.flowi4_tun_key.tun_id = 0;
2093 skb_dst_drop(skb);
2094
2095 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2096 goto martian_source;
2097
2098 res->fi = NULL;
2099 res->table = NULL;
2100 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2101 goto brd_input;
2102
2103	/* Accept zero addresses only for the limited broadcast;
2104	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2105 */
2106 if (ipv4_is_zeronet(saddr))
2107 goto martian_source;
2108
2109 if (ipv4_is_zeronet(daddr))
2110 goto martian_destination;
2111
2112	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2113	 * and calls it at most once when daddr and/or saddr are loopback addresses
2114 */
2115 if (ipv4_is_loopback(daddr)) {
2116 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2117 goto martian_destination;
2118 } else if (ipv4_is_loopback(saddr)) {
2119 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2120 goto martian_source;
2121 }
2122
2123 /*
2124 * Now we are ready to route packet.
2125 */
2126 fl4.flowi4_oif = 0;
2127 fl4.flowi4_iif = dev->ifindex;
2128 fl4.flowi4_mark = skb->mark;
2129 fl4.flowi4_tos = tos;
2130 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2131 fl4.flowi4_flags = 0;
2132 fl4.daddr = daddr;
2133 fl4.saddr = saddr;
2134 fl4.flowi4_uid = sock_net_uid(net, NULL);
2135	fl4.flowi4_multipath_hash = 0;
2136
2137 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2138 flkeys = &_flkeys;
2139 } else {
2140 fl4.flowi4_proto = 0;
2141 fl4.fl4_sport = 0;
2142 fl4.fl4_dport = 0;
2143 }
2144
2145 err = fib_lookup(net, &fl4, res, 0);
2146 if (err != 0) {
2147 if (!IN_DEV_FORWARD(in_dev))
2148 err = -EHOSTUNREACH;
2149 goto no_route;
2150 }
2151
2152 if (res->type == RTN_BROADCAST) {
2153 if (IN_DEV_BFORWARD(in_dev))
2154 goto make_route;
2155		/* do not cache if bc_forwarding is enabled */
2156 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2157 do_cache = false;
2158		goto brd_input;
2159 }
2160
2161 if (res->type == RTN_LOCAL) {
2162 err = fib_validate_source(skb, saddr, daddr, tos,
2163 0, dev, in_dev, &itag);
2164 if (err < 0)
2165 goto martian_source;
2166 goto local_input;
2167 }
2168
2169 if (!IN_DEV_FORWARD(in_dev)) {
2170 err = -EHOSTUNREACH;
2171 goto no_route;
2172 }
2173 if (res->type != RTN_UNICAST)
2174 goto martian_destination;
2175
2176make_route:
2177 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2178out: return err;
2179
2180brd_input:
2181 if (skb->protocol != htons(ETH_P_IP))
2182 goto e_inval;
2183
2184 if (!ipv4_is_zeronet(saddr)) {
2185 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2186 in_dev, &itag);
2187 if (err < 0)
2188 goto martian_source;
2189 }
2190 flags |= RTCF_BROADCAST;
2191 res->type = RTN_BROADCAST;
2192 RT_CACHE_STAT_INC(in_brd);
2193
2194local_input:
2195	do_cache &= res->fi && !itag;
2196 if (do_cache) {
2197 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2198
2199 rth = rcu_dereference(nhc->nhc_rth_input);
2200 if (rt_cache_valid(rth)) {
2201 skb_dst_set_noref(skb, &rth->dst);
2202 err = 0;
2203 goto out;
2204		}
2205 }
2206
2207 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2208 flags | RTCF_LOCAL, res->type,
2209 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2210 if (!rth)
2211 goto e_nobufs;
2212
2213 rth->dst.output= ip_rt_bug;
2214#ifdef CONFIG_IP_ROUTE_CLASSID
2215 rth->dst.tclassid = itag;
2216#endif
2217 rth->rt_is_input = 1;
2218
2219 RT_CACHE_STAT_INC(in_slow_tot);
2220 if (res->type == RTN_UNREACHABLE) {
2221 rth->dst.input= ip_error;
2222 rth->dst.error= -err;
2223 rth->rt_flags &= ~RTCF_LOCAL;
2224 }
2225
2226 if (do_cache) {
2227		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2228
2229		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2230		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2231 WARN_ON(rth->dst.input == lwtunnel_input);
2232 rth->dst.lwtstate->orig_input = rth->dst.input;
2233 rth->dst.input = lwtunnel_input;
2234 }
2235
2236		if (unlikely(!rt_cache_route(nhc, rth)))
2237			rt_add_uncached_list(rth);
2238 }
2239 skb_dst_set(skb, &rth->dst);
2240 err = 0;
2241 goto out;
2242
2243no_route:
2244 RT_CACHE_STAT_INC(in_no_route);
2245 res->type = RTN_UNREACHABLE;
2246 res->fi = NULL;
2247 res->table = NULL;
2248 goto local_input;
2249
2250 /*
2251 * Do not cache martian addresses: they should be logged (RFC1812)
2252 */
2253martian_destination:
2254 RT_CACHE_STAT_INC(in_martian_dst);
2255#ifdef CONFIG_IP_ROUTE_VERBOSE
2256 if (IN_DEV_LOG_MARTIANS(in_dev))
2257 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2258 &daddr, &saddr, dev->name);
2259#endif
2260
2261e_inval:
2262 err = -EINVAL;
2263 goto out;
2264
2265e_nobufs:
2266 err = -ENOBUFS;
2267 goto out;
2268
2269martian_source:
2270 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2271 goto out;
2272}
2273
2274int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2275 u8 tos, struct net_device *dev)
2276{
2277 struct fib_result res;
2278 int err;
2279
2280 tos &= IPTOS_RT_MASK;
2281 rcu_read_lock();
2282 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2283 rcu_read_unlock();
2284
2285 return err;
2286}
2287EXPORT_SYMBOL(ip_route_input_noref);
2288
2289/* called with rcu_read_lock held */
2290int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2291 u8 tos, struct net_device *dev, struct fib_result *res)
2292{
2293 /* Multicast recognition logic is moved from route cache to here.
2294 The problem was that too many Ethernet cards have broken/missing
2295	   hardware multicast filters :-( As a result, a host on a multicast
2296	   network acquires a lot of useless route cache entries, e.g. for
2297	   SDR messages from all over the world. Now we try to get rid of them.
2298	   Really, provided the software IP multicast filter is organized
2299	   reasonably (at least, hashed), it does not result in a slowdown
2300	   compared with route cache reject entries.
2301	   Note that multicast routers are not affected, because
2302	   a route cache entry is created eventually.
2303 */
2304 if (ipv4_is_multicast(daddr)) {
2305 struct in_device *in_dev = __in_dev_get_rcu(dev);
2306 int our = 0;
2307 int err = -EINVAL;
2308
2309		if (!in_dev)
2310 return err;
2311 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2312 ip_hdr(skb)->protocol);
2313
2314 /* check l3 master if no match yet */
2315		if (!our && netif_is_l3_slave(dev)) {
2316			struct in_device *l3_in_dev;
2317
2318 l3_in_dev = __in_dev_get_rcu(skb->dev);
2319 if (l3_in_dev)
2320 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2321 ip_hdr(skb)->protocol);
2322 }
2323
2324 if (our
2325#ifdef CONFIG_IP_MROUTE
2326 ||
2327 (!ipv4_is_local_multicast(daddr) &&
2328 IN_DEV_MFORWARD(in_dev))
2329#endif
2330 ) {
2331 err = ip_route_input_mc(skb, daddr, saddr,
2332 tos, dev, our);
2333 }
2334 return err;
2335 }
2336
2337 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2338}
2339
2340/* called with rcu_read_lock() */
2341static struct rtable *__mkroute_output(const struct fib_result *res,
2342 const struct flowi4 *fl4, int orig_oif,
2343 struct net_device *dev_out,
2344 unsigned int flags)
2345{
2346 struct fib_info *fi = res->fi;
2347 struct fib_nh_exception *fnhe;
2348 struct in_device *in_dev;
2349 u16 type = res->type;
2350 struct rtable *rth;
2351 bool do_cache;
2352
2353 in_dev = __in_dev_get_rcu(dev_out);
2354 if (!in_dev)
2355 return ERR_PTR(-EINVAL);
2356
2357 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2358 if (ipv4_is_loopback(fl4->saddr) &&
2359 !(dev_out->flags & IFF_LOOPBACK) &&
2360 !netif_is_l3_master(dev_out))
2361 return ERR_PTR(-EINVAL);
2362
2363 if (ipv4_is_lbcast(fl4->daddr))
2364 type = RTN_BROADCAST;
2365 else if (ipv4_is_multicast(fl4->daddr))
2366 type = RTN_MULTICAST;
2367 else if (ipv4_is_zeronet(fl4->daddr))
2368 return ERR_PTR(-EINVAL);
2369
2370 if (dev_out->flags & IFF_LOOPBACK)
2371 flags |= RTCF_LOCAL;
2372
2373 do_cache = true;
2374 if (type == RTN_BROADCAST) {
2375 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2376 fi = NULL;
2377 } else if (type == RTN_MULTICAST) {
2378 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2379 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2380 fl4->flowi4_proto))
2381 flags &= ~RTCF_LOCAL;
2382 else
2383 do_cache = false;
2384		/* If a multicast route does not exist, use
2385		 * the default one, but do not use a gateway in this case.
2386		 * Yes, it is a hack.
2387 */
2388 if (fi && res->prefixlen < 4)
2389 fi = NULL;
2390 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2391 (orig_oif != dev_out->ifindex)) {
2392 /* For local routes that require a particular output interface
2393 * we do not want to cache the result. Caching the result
2394 * causes incorrect behaviour when there are multiple source
2395 * addresses on the interface, the end result being that if the
2396 * intended recipient is waiting on that interface for the
2397 * packet he won't receive it because it will be delivered on
2398 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2399 * be set to the loopback interface as well.
2400 */
2401 do_cache = false;
2402 }
2403
2404 fnhe = NULL;
2405 do_cache &= fi != NULL;
2406 if (fi) {
2407		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2408		struct rtable __rcu **prth;
2409
2410		fnhe = find_exception(nhc, fl4->daddr);
2411		if (!do_cache)
2412 goto add;
2413 if (fnhe) {
2414 prth = &fnhe->fnhe_rth_output;
2415 } else {
2416 if (unlikely(fl4->flowi4_flags &
2417 FLOWI_FLAG_KNOWN_NH &&
2418				     !(nhc->nhc_gw_family &&
2419				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2420				do_cache = false;
2421 goto add;
2422 }
2423			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2424		}
2425 rth = rcu_dereference(*prth);
2426 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2427 return rth;
2428 }
2429
2430add:
2431 rth = rt_dst_alloc(dev_out, flags, type,
2432 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2433 IN_DEV_CONF_GET(in_dev, NOXFRM),
2434 do_cache);
2435 if (!rth)
2436 return ERR_PTR(-ENOBUFS);
2437
2438 rth->rt_iif = orig_oif;
2439
2440 RT_CACHE_STAT_INC(out_slow_tot);
2441
2442 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2443 if (flags & RTCF_LOCAL &&
2444 !(dev_out->flags & IFF_LOOPBACK)) {
2445 rth->dst.output = ip_mc_output;
2446 RT_CACHE_STAT_INC(out_slow_mc);
2447 }
2448#ifdef CONFIG_IP_MROUTE
2449 if (type == RTN_MULTICAST) {
2450 if (IN_DEV_MFORWARD(in_dev) &&
2451 !ipv4_is_local_multicast(fl4->daddr)) {
2452 rth->dst.input = ip_mr_input;
2453 rth->dst.output = ip_mc_output;
2454 }
2455 }
2456#endif
2457 }
2458
2459 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2460 lwtunnel_set_redirect(&rth->dst);
2461
2462 return rth;
2463}
2464
2465/*
2466 * Major route resolver routine.
2467 */
2468
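/* ip_route_output_key_hash() normalizes the TOS and scope in the flow and
 * then resolves the output route under RCU via ip_route_output_key_hash_rcu(),
 * which fills in any missing source address and output device before
 * __mkroute_output() builds (or reuses) the rtable.
 */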
2469struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2470 const struct sk_buff *skb)
2471{
2472 __u8 tos = RT_FL_TOS(fl4);
2473 struct fib_result res = {
2474 .type = RTN_UNSPEC,
2475 .fi = NULL,
2476 .table = NULL,
2477 .tclassid = 0,
2478 };
2479 struct rtable *rth;
2480
2481 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2482 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2483 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2484 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2485
2486 rcu_read_lock();
2487 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2488 rcu_read_unlock();
2489
2490 return rth;
2491}
2492EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2493
2494struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2495 struct fib_result *res,
2496 const struct sk_buff *skb)
2497{
2498 struct net_device *dev_out = NULL;
2499 int orig_oif = fl4->flowi4_oif;
2500 unsigned int flags = 0;
2501 struct rtable *rth;
2502	int err;
2503
2504 if (fl4->saddr) {
2505		if (ipv4_is_multicast(fl4->saddr) ||
2506		    ipv4_is_lbcast(fl4->saddr) ||
2507		    ipv4_is_zeronet(fl4->saddr)) {
2508			rth = ERR_PTR(-EINVAL);
2509			goto out;
2510		}
2511
2512		rth = ERR_PTR(-ENETUNREACH);
2513
2514 /* I removed check for oif == dev_out->oif here.
2515 It was wrong for two reasons:
2516 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2517 is assigned to multiple interfaces.
2518 2. Moreover, we are allowed to send packets with saddr
2519 of another iface. --ANK
2520 */
2521
2522 if (fl4->flowi4_oif == 0 &&
2523 (ipv4_is_multicast(fl4->daddr) ||
2524 ipv4_is_lbcast(fl4->daddr))) {
2525 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2526 dev_out = __ip_dev_find(net, fl4->saddr, false);
2527 if (!dev_out)
2528 goto out;
2529
2530			/* Special hack: the user can direct multicasts
2531			   and limited broadcast via the necessary interface
2532			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2533			   This hack is not just for fun, it allows
2534			   vic, vat and friends to work.
2535			   They bind a socket to loopback, set the ttl to zero
2536			   and expect that it will work.
2537			   From the viewpoint of the routing cache they are broken,
2538			   because we are not allowed to build a multicast path
2539			   with a loopback source addr (look, the routing cache
2540			   cannot know that the ttl is zero, so the packet
2541			   will not leave this host and the route is valid).
2542			   Luckily, this hack is a good workaround.
2543 */
2544
2545 fl4->flowi4_oif = dev_out->ifindex;
2546 goto make_route;
2547 }
2548
2549 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2550 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2551 if (!__ip_dev_find(net, fl4->saddr, false))
2552 goto out;
2553 }
2554 }
2555
2556
2557 if (fl4->flowi4_oif) {
2558 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2559 rth = ERR_PTR(-ENODEV);
2560 if (!dev_out)
2561 goto out;
2562
2563 /* RACE: Check return value of inet_select_addr instead. */
2564 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2565 rth = ERR_PTR(-ENETUNREACH);
2566 goto out;
2567 }
2568 if (ipv4_is_local_multicast(fl4->daddr) ||
2569 ipv4_is_lbcast(fl4->daddr) ||
2570 fl4->flowi4_proto == IPPROTO_IGMP) {
2571 if (!fl4->saddr)
2572 fl4->saddr = inet_select_addr(dev_out, 0,
2573 RT_SCOPE_LINK);
2574 goto make_route;
2575 }
2576 if (!fl4->saddr) {
2577 if (ipv4_is_multicast(fl4->daddr))
2578 fl4->saddr = inet_select_addr(dev_out, 0,
2579 fl4->flowi4_scope);
2580 else if (!fl4->daddr)
2581 fl4->saddr = inet_select_addr(dev_out, 0,
2582 RT_SCOPE_HOST);
2583 }
2584 }
2585
2586 if (!fl4->daddr) {
2587 fl4->daddr = fl4->saddr;
2588 if (!fl4->daddr)
2589 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2590 dev_out = net->loopback_dev;
2591 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2592 res->type = RTN_LOCAL;
2593 flags |= RTCF_LOCAL;
2594 goto make_route;
2595 }
2596
2597 err = fib_lookup(net, fl4, res, 0);
2598 if (err) {
2599 res->fi = NULL;
2600 res->table = NULL;
2601 if (fl4->flowi4_oif &&
2602 (ipv4_is_multicast(fl4->daddr) ||
2603 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2604			/* Apparently, the routing tables are wrong. Assume
2605			   that the destination is on-link.
2606
2607 WHY? DW.
2608 Because we are allowed to send to iface
2609 even if it has NO routes and NO assigned
2610 addresses. When oif is specified, routing
2611 tables are looked up with only one purpose:
2612 to catch if destination is gatewayed, rather than
2613 direct. Moreover, if MSG_DONTROUTE is set,
2614 we send packet, ignoring both routing tables
2615 and ifaddr state. --ANK
2616
2617
2618 We could make it even if oif is unknown,
2619 likely IPv6, but we do not.
2620 */
2621
2622 if (fl4->saddr == 0)
2623 fl4->saddr = inet_select_addr(dev_out, 0,
2624 RT_SCOPE_LINK);
2625 res->type = RTN_UNICAST;
2626 goto make_route;
2627 }
2628 rth = ERR_PTR(err);
2629 goto out;
2630 }
2631
2632 if (res->type == RTN_LOCAL) {
2633 if (!fl4->saddr) {
2634 if (res->fi->fib_prefsrc)
2635 fl4->saddr = res->fi->fib_prefsrc;
2636 else
2637 fl4->saddr = fl4->daddr;
2638 }
2639
2640 /* L3 master device is the loopback for that domain */
2641 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2642 net->loopback_dev;
2643
2644 /* make sure orig_oif points to fib result device even
2645 * though packet rx/tx happens over loopback or l3mdev
2646 */
2647 orig_oif = FIB_RES_OIF(*res);
2648
2649 fl4->flowi4_oif = dev_out->ifindex;
2650 flags |= RTCF_LOCAL;
2651 goto make_route;
2652 }
2653
2654 fib_select_path(net, res, fl4, skb);
2655
2656 dev_out = FIB_RES_DEV(*res);
2657
2658make_route:
2659 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2660
2661out:
2662 return rth;
2663}
2664
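/* dst_ops for the "blackhole" routes created by ipv4_blackhole_route():
 * they ignore PMTU updates and redirects, never hand out writable metrics,
 * and discard every packet (dst_discard/dst_discard_out).
 */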
2665static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2666{
2667 return NULL;
2668}
2669
2670static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2671{
2672 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2673
2674 return mtu ? : dst->dev->mtu;
2675}
2676
2677static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2678					  struct sk_buff *skb, u32 mtu,
2679					  bool confirm_neigh)
2680{
2681}
2682
2683static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2684 struct sk_buff *skb)
2685{
2686}
2687
2688static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2689 unsigned long old)
2690{
2691 return NULL;
2692}
2693
2694static struct dst_ops ipv4_dst_blackhole_ops = {
2695 .family = AF_INET,
2696 .check = ipv4_blackhole_dst_check,
2697 .mtu = ipv4_blackhole_mtu,
2698 .default_advmss = ipv4_default_advmss,
2699 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2700 .redirect = ipv4_rt_blackhole_redirect,
2701 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2702 .neigh_lookup = ipv4_neigh_lookup,
2703};
2704
2705struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2706{
2707 struct rtable *ort = (struct rtable *) dst_orig;
2708 struct rtable *rt;
2709
2710 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2711 if (rt) {
2712 struct dst_entry *new = &rt->dst;
2713
2714 new->__use = 1;
2715 new->input = dst_discard;
2716 new->output = dst_discard_out;
2717
2718 new->dev = net->loopback_dev;
2719 if (new->dev)
2720 dev_hold(new->dev);
2721
2722 rt->rt_is_input = ort->rt_is_input;
2723 rt->rt_iif = ort->rt_iif;
2724 rt->rt_pmtu = ort->rt_pmtu;
2725 rt->rt_mtu_locked = ort->rt_mtu_locked;
2726
2727 rt->rt_genid = rt_genid_ipv4(net);
2728 rt->rt_flags = ort->rt_flags;
2729 rt->rt_type = ort->rt_type;
2730		rt->rt_uses_gateway = ort->rt_uses_gateway;
2731		rt->rt_gw_family = ort->rt_gw_family;
2732 if (rt->rt_gw_family == AF_INET)
2733 rt->rt_gw4 = ort->rt_gw4;
2734 else if (rt->rt_gw_family == AF_INET6)
2735 rt->rt_gw6 = ort->rt_gw6;
2736
2737 INIT_LIST_HEAD(&rt->rt_uncached);
2738 }
2739
2740 dst_release(dst_orig);
2741
2742 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2743}
2744
2745struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2746 const struct sock *sk)
2747{
2748 struct rtable *rt = __ip_route_output_key(net, flp4);
2749
2750 if (IS_ERR(rt))
2751 return rt;
2752
2753	if (flp4->flowi4_proto) {
2754 flp4->flowi4_oif = rt->dst.dev->ifindex;
2755		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2756 flowi4_to_flowi(flp4),
2757 sk, 0);
2758	}
2759
2760 return rt;
2761}
2762EXPORT_SYMBOL_GPL(ip_route_output_flow);
2763
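/* rt_fill_info() builds the RTM_NEWROUTE netlink message describing @rt for
 * a route get/dump reply; @fl4 may be NULL when dumping cached exception
 * routes (see fnhe_dump_bucket() below).
 */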
2764/* called with rcu_read_lock held */
2765static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2766 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2767			struct sk_buff *skb, u32 portid, u32 seq,
2768			unsigned int flags)
2769{
2770 struct rtmsg *r;
2771 struct nlmsghdr *nlh;
2772 unsigned long expires = 0;
2773 u32 error;
2774 u32 metrics[RTAX_MAX];
2775
2776	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2777	if (!nlh)
2778 return -EMSGSIZE;
2779
2780 r = nlmsg_data(nlh);
2781 r->rtm_family = AF_INET;
2782 r->rtm_dst_len = 32;
2783 r->rtm_src_len = 0;
2784	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2785	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2786 if (nla_put_u32(skb, RTA_TABLE, table_id))
2787 goto nla_put_failure;
2788 r->rtm_type = rt->rt_type;
2789 r->rtm_scope = RT_SCOPE_UNIVERSE;
2790 r->rtm_protocol = RTPROT_UNSPEC;
2791 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2792 if (rt->rt_flags & RTCF_NOTIFY)
2793 r->rtm_flags |= RTM_F_NOTIFY;
2794 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2795 r->rtm_flags |= RTCF_DOREDIRECT;
2796
2797 if (nla_put_in_addr(skb, RTA_DST, dst))
2798 goto nla_put_failure;
2799 if (src) {
2800 r->rtm_src_len = 32;
2801 if (nla_put_in_addr(skb, RTA_SRC, src))
2802 goto nla_put_failure;
2803 }
2804 if (rt->dst.dev &&
2805 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2806 goto nla_put_failure;
2807#ifdef CONFIG_IP_ROUTE_CLASSID
2808 if (rt->dst.tclassid &&
2809 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2810 goto nla_put_failure;
2811#endif
2812	if (fl4 && !rt_is_input_route(rt) &&
2813	    fl4->saddr != src) {
2814 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2815 goto nla_put_failure;
2816 }
2817	if (rt->rt_uses_gateway) {
2818 if (rt->rt_gw_family == AF_INET &&
2819 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2820 goto nla_put_failure;
2821 } else if (rt->rt_gw_family == AF_INET6) {
2822 int alen = sizeof(struct in6_addr);
2823 struct nlattr *nla;
2824 struct rtvia *via;
2825
2826 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2827 if (!nla)
2828 goto nla_put_failure;
2829
2830 via = nla_data(nla);
2831 via->rtvia_family = AF_INET6;
2832 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2833 }
2834 }
2835
2836 expires = rt->dst.expires;
2837 if (expires) {
2838 unsigned long now = jiffies;
2839
2840 if (time_before(now, expires))
2841 expires -= now;
2842 else
2843 expires = 0;
2844 }
2845
2846 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2847 if (rt->rt_pmtu && expires)
2848 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2849 if (rt->rt_mtu_locked && expires)
2850 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2851 if (rtnetlink_put_metrics(skb, metrics) < 0)
2852 goto nla_put_failure;
2853
2854	if (fl4) {
2855 if (fl4->flowi4_mark &&
2856 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2857 goto nla_put_failure;
2858
2859		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2860 nla_put_u32(skb, RTA_UID,
2861 from_kuid_munged(current_user_ns(),
2862 fl4->flowi4_uid)))
2863 goto nla_put_failure;
2864
2865 if (rt_is_input_route(rt)) {
2866#ifdef CONFIG_IP_MROUTE
2867 if (ipv4_is_multicast(dst) &&
2868 !ipv4_is_local_multicast(dst) &&
2869 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2870 int err = ipmr_get_route(net, skb,
2871 fl4->saddr, fl4->daddr,
2872 r, portid);
2873
2874 if (err <= 0) {
2875 if (err == 0)
2876 return 0;
2877 goto nla_put_failure;
2878 }
2879 } else
2880#endif
2881 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2882 goto nla_put_failure;
2883 }
2884 }
2885
2886 error = rt->dst.error;
2887
2888	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2889 goto nla_put_failure;
2890
2891 nlmsg_end(skb, nlh);
2892 return 0;
2893
2894nla_put_failure:
2895 nlmsg_cancel(skb, nlh);
2896 return -EMSGSIZE;
2897}
2898
2899static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2900 struct netlink_callback *cb, u32 table_id,
2901 struct fnhe_hash_bucket *bucket, int genid,
2902 int *fa_index, int fa_start, unsigned int flags)
2903{
2904 int i;
2905
2906 for (i = 0; i < FNHE_HASH_SIZE; i++) {
2907 struct fib_nh_exception *fnhe;
2908
2909 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2910 fnhe = rcu_dereference(fnhe->fnhe_next)) {
2911 struct rtable *rt;
2912 int err;
2913
2914 if (*fa_index < fa_start)
2915 goto next;
2916
2917 if (fnhe->fnhe_genid != genid)
2918 goto next;
2919
2920 if (fnhe->fnhe_expires &&
2921 time_after(jiffies, fnhe->fnhe_expires))
2922 goto next;
2923
2924 rt = rcu_dereference(fnhe->fnhe_rth_input);
2925 if (!rt)
2926 rt = rcu_dereference(fnhe->fnhe_rth_output);
2927 if (!rt)
2928 goto next;
2929
2930 err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2931 table_id, NULL, skb,
2932 NETLINK_CB(cb->skb).portid,
2933 cb->nlh->nlmsg_seq, flags);
2934 if (err)
2935 return err;
2936next:
2937 (*fa_index)++;
2938 }
2939 }
2940
2941 return 0;
2942}
2943
2944int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
2945 u32 table_id, struct fib_info *fi,
2946 int *fa_index, int fa_start, unsigned int flags)
2947{
2948 struct net *net = sock_net(cb->skb->sk);
2949 int nhsel, genid = fnhe_genid(net);
2950
2951 for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
2952 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
2953 struct fnhe_hash_bucket *bucket;
2954 int err;
2955
2956 if (nhc->nhc_flags & RTNH_F_DEAD)
2957 continue;
2958
2959 rcu_read_lock();
2960 bucket = rcu_dereference(nhc->nhc_exceptions);
2961 err = 0;
2962 if (bucket)
2963 err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
2964 genid, fa_index, fa_start,
2965 flags);
2966 rcu_read_unlock();
2967 if (err)
2968 return err;
2969 }
2970
2971 return 0;
2972}
2973
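/* RTM_GETROUTE carries no real packet, so build a dummy skb with minimal IP
 * and transport headers; this lets the input path (and e.g. layer-4 multipath
 * hashing) see the requested protocol and ports.
 */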
2974static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2975 u8 ip_proto, __be16 sport,
2976 __be16 dport)
2977{
2978 struct sk_buff *skb;
2979 struct iphdr *iph;
2980
2981 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2982 if (!skb)
2983 return NULL;
2984
2985	/* Reserve room for dummy headers; this skb can pass
2986	 * through a good chunk of the routing engine.
2987 */
2988 skb_reset_mac_header(skb);
2989 skb_reset_network_header(skb);
2990 skb->protocol = htons(ETH_P_IP);
2991 iph = skb_put(skb, sizeof(struct iphdr));
2992 iph->protocol = ip_proto;
2993 iph->saddr = src;
2994 iph->daddr = dst;
2995 iph->version = 0x4;
2996 iph->frag_off = 0;
2997 iph->ihl = 0x5;
2998 skb_set_transport_header(skb, skb->len);
2999
3000 switch (iph->protocol) {
3001 case IPPROTO_UDP: {
3002 struct udphdr *udph;
3003
3004 udph = skb_put_zero(skb, sizeof(struct udphdr));
3005 udph->source = sport;
3006 udph->dest = dport;
3007		udph->len = htons(sizeof(struct udphdr));
3008		udph->check = 0;
3009 break;
3010 }
3011 case IPPROTO_TCP: {
3012 struct tcphdr *tcph;
3013
3014 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3015 tcph->source = sport;
3016 tcph->dest = dport;
3017 tcph->doff = sizeof(struct tcphdr) / 4;
3018 tcph->rst = 1;
3019 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3020 src, dst, 0);
3021 break;
3022 }
3023 case IPPROTO_ICMP: {
3024 struct icmphdr *icmph;
3025
3026 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3027 icmph->type = ICMP_ECHO;
3028 icmph->code = 0;
3029 }
3030 }
3031
3032 return skb;
3033}
3034
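/* Strict validation of RTM_GETROUTE requests: when strict checking is
 * enabled on the netlink socket, only the header fields and attributes
 * handled below are accepted.
 */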
3035static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3036 const struct nlmsghdr *nlh,
3037 struct nlattr **tb,
3038 struct netlink_ext_ack *extack)
3039{
3040 struct rtmsg *rtm;
3041 int i, err;
3042
3043 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3044 NL_SET_ERR_MSG(extack,
3045 "ipv4: Invalid header for route get request");
3046 return -EINVAL;
3047 }
3048
3049 if (!netlink_strict_get_check(skb))
3050 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3051 rtm_ipv4_policy, extack);
3052
3053 rtm = nlmsg_data(nlh);
3054 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3055 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3056 rtm->rtm_table || rtm->rtm_protocol ||
3057 rtm->rtm_scope || rtm->rtm_type) {
3058 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3059 return -EINVAL;
3060 }
3061
3062 if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3063 RTM_F_LOOKUP_TABLE |
3064 RTM_F_FIB_MATCH)) {
3065 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3066 return -EINVAL;
3067 }
3068
3069 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3070 rtm_ipv4_policy, extack);
3071 if (err)
3072 return err;
3073
3074 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3075 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3076 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3077 return -EINVAL;
3078 }
3079
3080 for (i = 0; i <= RTA_MAX; i++) {
3081 if (!tb[i])
3082 continue;
3083
3084 switch (i) {
3085 case RTA_IIF:
3086 case RTA_OIF:
3087 case RTA_SRC:
3088 case RTA_DST:
3089 case RTA_IP_PROTO:
3090 case RTA_SPORT:
3091 case RTA_DPORT:
3092 case RTA_MARK:
3093 case RTA_UID:
3094 break;
3095 default:
3096 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3097 return -EINVAL;
3098 }
3099 }
3100
3101 return 0;
3102}
3103
3104static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3105 struct netlink_ext_ack *extack)
3106{
3107 struct net *net = sock_net(in_skb->sk);
3108 struct nlattr *tb[RTA_MAX+1];
3109 u32 table_id = RT_TABLE_MAIN;
3110 __be16 sport = 0, dport = 0;
3111 struct fib_result res = {};
3112 u8 ip_proto = IPPROTO_UDP;
3113 struct rtable *rt = NULL;
3114 struct sk_buff *skb;
3115 struct rtmsg *rtm;
3116	struct flowi4 fl4 = {};
3117	__be32 dst = 0;
3118 __be32 src = 0;
3119 kuid_t uid;
3120 u32 iif;
3121 int err;
3122 int mark;
3123
3124	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3125	if (err < 0)
3126 return err;
3127
3128 rtm = nlmsg_data(nlh);
3129 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3130 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3131 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3132 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3133 if (tb[RTA_UID])
3134 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3135 else
3136 uid = (iif ? INVALID_UID : current_uid());
3137
3138 if (tb[RTA_IP_PROTO]) {
3139 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3140						  &ip_proto, AF_INET, extack);
3141		if (err)
3142 return err;
3143 }
3144
3145 if (tb[RTA_SPORT])
3146 sport = nla_get_be16(tb[RTA_SPORT]);
3147
3148 if (tb[RTA_DPORT])
3149 dport = nla_get_be16(tb[RTA_DPORT]);
3150
3151 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3152 if (!skb)
3153 return -ENOBUFS;
3154
3155	fl4.daddr = dst;
3156 fl4.saddr = src;
3157	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3158	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3159 fl4.flowi4_mark = mark;
3160 fl4.flowi4_uid = uid;
3161 if (sport)
3162 fl4.fl4_sport = sport;
3163 if (dport)
3164 fl4.fl4_dport = dport;
3165 fl4.flowi4_proto = ip_proto;
3166
3167 rcu_read_lock();
3168
3169 if (iif) {
3170 struct net_device *dev;
3171
3172 dev = dev_get_by_index_rcu(net, iif);
3173 if (!dev) {
3174 err = -ENODEV;
3175 goto errout_rcu;
3176 }
3177
3178 fl4.flowi4_iif = iif; /* for rt_fill_info */
3179 skb->dev = dev;
3180 skb->mark = mark;
3181		err = ip_route_input_rcu(skb, dst, src,
3182					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3183					 &res);
3184
3185 rt = skb_rtable(skb);
3186 if (err == 0 && rt->dst.error)
3187 err = -rt->dst.error;
3188 } else {
3189 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3190		skb->dev = net->loopback_dev;
3191		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3192 err = 0;
3193 if (IS_ERR(rt))
3194 err = PTR_ERR(rt);
3195 else
3196 skb_dst_set(skb, &rt->dst);
3197 }
3198
3199 if (err)
3200 goto errout_rcu;
3201
3202 if (rtm->rtm_flags & RTM_F_NOTIFY)
3203 rt->rt_flags |= RTCF_NOTIFY;
3204
3205 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3206 table_id = res.table ? res.table->tb_id : 0;
3207
3208 /* reset skb for netlink reply msg */
3209 skb_trim(skb, 0);
3210 skb_reset_network_header(skb);
3211 skb_reset_transport_header(skb);
3212 skb_reset_mac_header(skb);
3213
3214 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3215 if (!res.fi) {
3216 err = fib_props[res.type].error;
3217 if (!err)
3218 err = -EHOSTUNREACH;
3219 goto errout_rcu;
3220 }
3221 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3222 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
3223 rt->rt_type, res.prefix, res.prefixlen,
3224 fl4.flowi4_tos, res.fi, 0);
3225 } else {
3226 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3227				   NETLINK_CB(in_skb).portid,
3228				   nlh->nlmsg_seq, 0);
3229	}
3230 if (err < 0)
3231 goto errout_rcu;
3232
3233 rcu_read_unlock();
3234
3235 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3236
3237errout_free:
3238 return err;
3239errout_rcu:
3240 rcu_read_unlock();
3241 kfree_skb(skb);
3242 goto errout_free;
3243}
3244
3245void ip_rt_multicast_event(struct in_device *in_dev)
3246{
3247 rt_cache_flush(dev_net(in_dev->dev));
3248}
3249
3250#ifdef CONFIG_SYSCTL
3251static int ip_rt_gc_interval __read_mostly = 60 * HZ;
3252static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
3253static int ip_rt_gc_elasticity __read_mostly = 8;
3254static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
3255
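/* Write-only "flush" sysctl: any write to /proc/sys/net/ipv4/route/flush
 * (for example "echo 1 > /proc/sys/net/ipv4/route/flush") flushes the routing
 * cache of this netns and bumps the fnhe genid so cached exceptions are
 * invalidated as well.
 */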
3256static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3257 void __user *buffer,
3258 size_t *lenp, loff_t *ppos)
3259{
3260 struct net *net = (struct net *)__ctl->extra1;
3261
3262 if (write) {
3263 rt_cache_flush(net);
3264 fnhe_genid_bump(net);
3265 return 0;
3266 }
3267
3268 return -EINVAL;
3269}
3270
3271static struct ctl_table ipv4_route_table[] = {
3272 {
3273 .procname = "gc_thresh",
3274 .data = &ipv4_dst_ops.gc_thresh,
3275 .maxlen = sizeof(int),
3276 .mode = 0644,
3277 .proc_handler = proc_dointvec,
3278 },
3279 {
3280 .procname = "max_size",
3281 .data = &ip_rt_max_size,
3282 .maxlen = sizeof(int),
3283 .mode = 0644,
3284 .proc_handler = proc_dointvec,
3285 },
3286 {
3287 /* Deprecated. Use gc_min_interval_ms */
3288
3289 .procname = "gc_min_interval",
3290 .data = &ip_rt_gc_min_interval,
3291 .maxlen = sizeof(int),
3292 .mode = 0644,
3293 .proc_handler = proc_dointvec_jiffies,
3294 },
3295 {
3296 .procname = "gc_min_interval_ms",
3297 .data = &ip_rt_gc_min_interval,
3298 .maxlen = sizeof(int),
3299 .mode = 0644,
3300 .proc_handler = proc_dointvec_ms_jiffies,
3301 },
3302 {
3303 .procname = "gc_timeout",
3304 .data = &ip_rt_gc_timeout,
3305 .maxlen = sizeof(int),
3306 .mode = 0644,
3307 .proc_handler = proc_dointvec_jiffies,
3308 },
3309 {
3310 .procname = "gc_interval",
3311 .data = &ip_rt_gc_interval,
3312 .maxlen = sizeof(int),
3313 .mode = 0644,
3314 .proc_handler = proc_dointvec_jiffies,
3315 },
3316 {
3317 .procname = "redirect_load",
3318 .data = &ip_rt_redirect_load,
3319 .maxlen = sizeof(int),
3320 .mode = 0644,
3321 .proc_handler = proc_dointvec,
3322 },
3323 {
3324 .procname = "redirect_number",
3325 .data = &ip_rt_redirect_number,
3326 .maxlen = sizeof(int),
3327 .mode = 0644,
3328 .proc_handler = proc_dointvec,
3329 },
3330 {
3331 .procname = "redirect_silence",
3332 .data = &ip_rt_redirect_silence,
3333 .maxlen = sizeof(int),
3334 .mode = 0644,
3335 .proc_handler = proc_dointvec,
3336 },
3337 {
3338 .procname = "error_cost",
3339 .data = &ip_rt_error_cost,
3340 .maxlen = sizeof(int),
3341 .mode = 0644,
3342 .proc_handler = proc_dointvec,
3343 },
3344 {
3345 .procname = "error_burst",
3346 .data = &ip_rt_error_burst,
3347 .maxlen = sizeof(int),
3348 .mode = 0644,
3349 .proc_handler = proc_dointvec,
3350 },
3351 {
3352 .procname = "gc_elasticity",
3353 .data = &ip_rt_gc_elasticity,
3354 .maxlen = sizeof(int),
3355 .mode = 0644,
3356 .proc_handler = proc_dointvec,
3357 },
3358 {
3359 .procname = "mtu_expires",
3360 .data = &ip_rt_mtu_expires,
3361 .maxlen = sizeof(int),
3362 .mode = 0644,
3363 .proc_handler = proc_dointvec_jiffies,
3364 },
3365 {
3366 .procname = "min_pmtu",
3367 .data = &ip_rt_min_pmtu,
3368 .maxlen = sizeof(int),
3369 .mode = 0644,
3370 .proc_handler = proc_dointvec_minmax,
3371 .extra1 = &ip_min_valid_pmtu,
3372 },
3373 {
3374 .procname = "min_adv_mss",
3375 .data = &ip_rt_min_advmss,
3376 .maxlen = sizeof(int),
3377 .mode = 0644,
3378 .proc_handler = proc_dointvec,
3379 },
3380 { }
3381};
3382
3383static const char ipv4_route_flush_procname[] = "flush";
3384
3385static struct ctl_table ipv4_route_flush_table[] = {
3386	{
3387		.procname	= ipv4_route_flush_procname,
3388		.maxlen		= sizeof(int),
3389 .mode = 0200,
3390 .proc_handler = ipv4_sysctl_rtcache_flush,
3391 },
3392 { },
3393};
3394
3395static __net_init int sysctl_route_net_init(struct net *net)
3396{
3397 struct ctl_table *tbl;
3398
3399 tbl = ipv4_route_flush_table;
3400 if (!net_eq(net, &init_net)) {
3401 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3402 if (!tbl)
3403 goto err_dup;
3404
3405		/* Don't export non-whitelisted sysctls to unprivileged users */
3406 if (net->user_ns != &init_user_ns) {
3407 if (tbl[0].procname != ipv4_route_flush_procname)
3408 tbl[0].procname = NULL;
3409 }
3410	}
3411 tbl[0].extra1 = net;
3412
3413 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3414 if (!net->ipv4.route_hdr)
3415 goto err_reg;
3416 return 0;
3417
3418err_reg:
3419 if (tbl != ipv4_route_flush_table)
3420 kfree(tbl);
3421err_dup:
3422 return -ENOMEM;
3423}
3424
3425static __net_exit void sysctl_route_net_exit(struct net *net)
3426{
3427 struct ctl_table *tbl;
3428
3429 tbl = net->ipv4.route_hdr->ctl_table_arg;
3430 unregister_net_sysctl_table(net->ipv4.route_hdr);
3431 BUG_ON(tbl == ipv4_route_flush_table);
3432 kfree(tbl);
3433}
3434
3435static __net_initdata struct pernet_operations sysctl_route_ops = {
3436 .init = sysctl_route_net_init,
3437 .exit = sysctl_route_net_exit,
3438};
3439#endif
3440
3441static __net_init int rt_genid_init(struct net *net)
3442{
3443 atomic_set(&net->ipv4.rt_genid, 0);
3444 atomic_set(&net->fnhe_genid, 0);
3445 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3446 return 0;
3447}
3448
3449static __net_initdata struct pernet_operations rt_genid_ops = {
3450 .init = rt_genid_init,
3451};
3452
3453static int __net_init ipv4_inetpeer_init(struct net *net)
3454{
3455 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3456
3457 if (!bp)
3458 return -ENOMEM;
3459 inet_peer_base_init(bp);
3460 net->ipv4.peers = bp;
3461 return 0;
3462}
3463
3464static void __net_exit ipv4_inetpeer_exit(struct net *net)
3465{
3466 struct inet_peer_base *bp = net->ipv4.peers;
3467
3468 net->ipv4.peers = NULL;
3469 inetpeer_invalidate_tree(bp);
3470 kfree(bp);
3471}
3472
3473static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3474 .init = ipv4_inetpeer_init,
3475 .exit = ipv4_inetpeer_exit,
3476};
3477
3478#ifdef CONFIG_IP_ROUTE_CLASSID
3479struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3480#endif /* CONFIG_IP_ROUTE_CLASSID */
3481
3482int __init ip_rt_init(void)
3483{
3484	void *idents_hash;
3485	int cpu;
3486
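/* The hash allocated below backs the IP ID generator (__ip_select_ident()
 * earlier in this file): ip_idents holds per-bucket ID counters and
 * ip_tstamps the last-use timestamps; ip_idents_mask sizes both tables.
 */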
3487	/* For modern hosts, this will use 2 MB of memory */
3488	idents_hash = alloc_large_system_hash("IP idents",
3489 sizeof(*ip_idents) + sizeof(*ip_tstamps),
3490 0,
3491 16, /* one bucket per 64 KB */
3492 HASH_ZERO,
3493 NULL,
3494 &ip_idents_mask,
3495 2048,
3496 256*1024);
3497
3498	ip_idents = idents_hash;
3499
3500	prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3501
3502	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3503
3504 for_each_possible_cpu(cpu) {
3505 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3506
3507 INIT_LIST_HEAD(&ul->head);
3508 spin_lock_init(&ul->lock);
3509 }
3510#ifdef CONFIG_IP_ROUTE_CLASSID
3511 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3512 if (!ip_rt_acct)
3513 panic("IP: failed to allocate ip_rt_acct\n");
3514#endif
3515
3516 ipv4_dst_ops.kmem_cachep =
3517 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3518 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3519
3520 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3521
3522 if (dst_entries_init(&ipv4_dst_ops) < 0)
3523 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3524
3525 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3526 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3527
3528 ipv4_dst_ops.gc_thresh = ~0;
3529 ip_rt_max_size = INT_MAX;
3530
3531 devinet_init();
3532 ip_fib_init();
3533
3534 if (ip_rt_proc_init())
3535 pr_err("Unable to create route proc files\n");
3536#ifdef CONFIG_XFRM
3537 xfrm_init();
3538 xfrm4_init();
3539#endif
3540 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3541 RTNL_FLAG_DOIT_UNLOCKED);
3542
3543#ifdef CONFIG_SYSCTL
3544 register_pernet_subsys(&sysctl_route_ops);
3545#endif
3546 register_pernet_subsys(&rt_genid_ops);
3547 register_pernet_subsys(&ipv4_inetpeer_ops);
3548 return 0;
3549}
3550
3551#ifdef CONFIG_SYSCTL
3552/*
3553 * We really need to sanitize the damn ipv4 init order, then all
3554 * this nonsense will go away.
3555 */
3556void __init ip_static_sysctl_init(void)
3557{
3558 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3559}
3560#endif