/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/* Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

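/* Uncached routes (e.g. RTF_CACHE clones that are not owned by the FIB
 * tree) are kept on per-cpu lists so that they can still be found, and
 * re-pointed at the loopback device, when the device they reference is
 * unregistered.
 */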
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

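/* Walk every per-cpu uncached list and move routes that reference @dev
 * over to the loopback device, dropping their references to @dev so
 * that unregistration of the device can complete.
 */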
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

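/* Pick the address used for the neighbour lookup: the gateway if one is
 * set, otherwise the destination taken from the packet (or the caller).
 */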
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dev);
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
		kfree(p);

	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

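/* Pick one route out of a set of multipath siblings based on the flow
 * hash: each nexthop owns a slice of the hash space bounded by its
 * nh_upper_bound, and the first sibling whose bound covers the hash and
 * that still scores as usable wins.
 */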
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * a case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
			      idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}

/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

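/* Scan the routes sharing one metric in a leaf list, starting at
 * rr_head and wrapping around to the head of the leaf, and return the
 * best-scoring one; routes with a different metric (remembered in
 * 'cont') are only considered if nothing matched at the preferred
 * metric.
 */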
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_read_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device itself if it is a master device, the master
		 * device if the device is enslaved, and the loopback
		 * as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* the remaining case is netif_is_l3_master(dev) being
		 * true, in which case we want the returned dev to be
		 * dev itself
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}

/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
}

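/* Walk back up the FIB trie, diving into source-routed subtrees where
 * present, until a node carrying route info (RTN_RTINFO) is found;
 * returns NULL once the tree root has been passed.
 */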
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_read_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);
	else
		fib6_info_release(rt);

	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

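/* Allocate a per-cpu dst copy of @rt; the copy is marked RTF_PCPU and
 * retains a reference on @rt through its ->from pointer.
 */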
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}

/* exception hash table implementation */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

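/* Hash (daddr, saddr) into a bucket index for the exception table; the
 * source address only contributes when subtrees are configured.
 */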
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

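/* Effective MTU of a fib entry: the stored path MTU if one is set,
 * otherwise the device MTU, capped at IP6_MAX_MTU and reduced by any
 * lwtunnel encapsulation headroom.
 */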
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}

static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find the cached rt in the hash table inside the passed-in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed-in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find the rt6_ex which contains the passed-in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* We are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still have references to them, so that on the next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently of their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu_read_lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

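/* Extract the L3 keys used for multipath hashing. For ICMPv6 errors the
 * keys are taken from the offending (inner) packet header so that the
 * error follows the same path as the flow it reports on.
 */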
1957static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1958 struct flow_keys *keys,
1959 struct flow_keys *flkeys)
1960{
1961 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1962 const struct ipv6hdr *key_iph = outer_iph;
1963 struct flow_keys *_flkeys = flkeys;
1964 const struct ipv6hdr *inner_iph;
1965 const struct icmp6hdr *icmph;
1966 struct ipv6hdr _inner_iph;
1967 struct icmp6hdr _icmph;
1968
1969 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1970 goto out;
1971
1972 icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1973 sizeof(_icmph), &_icmph);
1974 if (!icmph)
1975 goto out;
1976
1977 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1978 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1979 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1980 icmph->icmp6_type != ICMPV6_PARAMPROB)
1981 goto out;
1982
1983 inner_iph = skb_header_pointer(skb,
1984 skb_transport_offset(skb) + sizeof(*icmph),
1985 sizeof(_inner_iph), &_inner_iph);
1986 if (!inner_iph)
1987 goto out;
1988
1989 key_iph = inner_iph;
1990 _flkeys = NULL;
1991out:
1992 if (_flkeys) {
1993 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1994 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1995 keys->tags.flow_label = _flkeys->tags.flow_label;
1996 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1997 } else {
1998 keys->addrs.v6addrs.src = key_iph->saddr;
1999 keys->addrs.v6addrs.dst = key_iph->daddr;
2000 keys->tags.flow_label = ip6_flowlabel(key_iph);
2001 keys->basic.ip_proto = key_iph->nexthdr;
2002 }
2003}
2004
2005/* if skb is set it will be used and fl6 can be NULL */
2006u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2007 const struct sk_buff *skb, struct flow_keys *flkeys)
2008{
2009 struct flow_keys hash_keys;
2010 u32 mhash;
2011
2012 switch (ip6_multipath_hash_policy(net)) {
2013 case 0:
2014 memset(&hash_keys, 0, sizeof(hash_keys));
2015 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2016 if (skb) {
2017 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2018 } else {
2019 hash_keys.addrs.v6addrs.src = fl6->saddr;
2020 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2021 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2022 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2023 }
2024 break;
2025 case 1:
2026 if (skb) {
2027 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2028 struct flow_keys keys;
2029
2030 /* short-circuit if we already have L4 hash present */
2031 if (skb->l4_hash)
2032 return skb_get_hash_raw(skb) >> 1;
2033
2034 memset(&hash_keys, 0, sizeof(hash_keys));
2035
2036 if (!flkeys) {
2037 skb_flow_dissect_flow_keys(skb, &keys, flag);
2038 flkeys = &keys;
2039 }
2040 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2041 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2042 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2043 hash_keys.ports.src = flkeys->ports.src;
2044 hash_keys.ports.dst = flkeys->ports.dst;
2045 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2046 } else {
2047 memset(&hash_keys, 0, sizeof(hash_keys));
2048 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2049 hash_keys.addrs.v6addrs.src = fl6->saddr;
2050 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2051 hash_keys.ports.src = fl6->fl6_sport;
2052 hash_keys.ports.dst = fl6->fl6_dport;
2053 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2054 }
2055 break;
2056 }
2057 mhash = flow_hash_from_keys(&hash_keys);
2058
2059 return mhash >> 1;
2060}
2061
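/* Route an incoming packet: build a flow from the IPv6 header,
 * compute a multipath hash for ICMPv6 packets, look up the route
 * and attach the resulting dst to the skb.
 */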
2062void ip6_route_input(struct sk_buff *skb)
2063{
2064 const struct ipv6hdr *iph = ipv6_hdr(skb);
2065 struct net *net = dev_net(skb->dev);
2066 int flags = RT6_LOOKUP_F_HAS_SADDR;
2067 struct ip_tunnel_info *tun_info;
2068 struct flowi6 fl6 = {
2069 .flowi6_iif = skb->dev->ifindex,
2070 .daddr = iph->daddr,
2071 .saddr = iph->saddr,
2072 .flowlabel = ip6_flowinfo(iph),
2073 .flowi6_mark = skb->mark,
2074 .flowi6_proto = iph->nexthdr,
2075 };
2076 struct flow_keys *flkeys = NULL, _flkeys;
2077
2078 tun_info = skb_tunnel_info(skb);
2079 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2080 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2081
2082 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2083 flkeys = &_flkeys;
2084
2085 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2086 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2087 skb_dst_drop(skb);
2088 skb_dst_set(skb,
2089 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2090}
2091
2092static struct rt6_info *ip6_pol_route_output(struct net *net,
2093 struct fib6_table *table,
2094 struct flowi6 *fl6,
2095 const struct sk_buff *skb,
2096 int flags)
2097{
2098 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2099}
2100
2101struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2102 struct flowi6 *fl6, int flags)
2103{
2104 bool any_src;
2105
2106 if (rt6_need_strict(&fl6->daddr)) {
2107 struct dst_entry *dst;
2108
2109 dst = l3mdev_link_scope_lookup(net, fl6);
2110 if (dst)
2111 return dst;
2112 }
2113
2114 fl6->flowi6_iif = LOOPBACK_IFINDEX;
2115
2116 any_src = ipv6_addr_any(&fl6->saddr);
2117 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2118 (fl6->flowi6_oif && any_src))
2119 flags |= RT6_LOOKUP_F_IFACE;
2120
2121 if (!any_src)
2122 flags |= RT6_LOOKUP_F_HAS_SADDR;
2123 else if (sk)
2124 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2125
2126 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2127}
2128EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2129
2130struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2131{
2132 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2133 struct net_device *loopback_dev = net->loopback_dev;
2134 struct dst_entry *new = NULL;
2135
2136 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2137 DST_OBSOLETE_DEAD, 0);
2138 if (rt) {
2139 rt6_info_init(rt);
2140 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2141
2142 new = &rt->dst;
2143 new->__use = 1;
2144 new->input = dst_discard;
2145 new->output = dst_discard_out;
2146
2147 dst_copy_metrics(new, &ort->dst);
2148
2149 rt->rt6i_idev = in6_dev_get(loopback_dev);
2150 rt->rt6i_gateway = ort->rt6i_gateway;
2151 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2152
2153 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2154#ifdef CONFIG_IPV6_SUBTREES
2155 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2156#endif
2157 }
2158
2159 dst_release(dst_orig);
2160 return new ? new : ERR_PTR(-ENOMEM);
2161}
2162
2163/*
2164 * Destination cache support functions
2165 */
2166
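/* A fib6_info is still usable only if its sernum-based cookie
 * matches the one the dst was created with and it has not expired.
 */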
2167static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2168{
2169 u32 rt_cookie = 0;
2170
2171 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2172 return false;
2173
2174 if (fib6_check_expired(f6i))
2175 return false;
2176
2177 return true;
2178}
2179
2180static struct dst_entry *rt6_check(struct rt6_info *rt,
2181 struct fib6_info *from,
2182 u32 cookie)
2183{
2184 u32 rt_cookie = 0;
2185
2186 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2187 rt_cookie != cookie)
2188 return NULL;
2189
2190 if (rt6_check_expired(rt))
2191 return NULL;
2192
2193 return &rt->dst;
2194}
2195
2196static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2197 struct fib6_info *from,
2198 u32 cookie)
2199{
2200 if (!__rt6_check_expired(rt) &&
2201 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2202 fib6_check(from, cookie))
2203 return &rt->dst;
2204 else
2205 return NULL;
2206}
2207
2208static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2209{
2210 struct dst_entry *dst_ret;
2211 struct fib6_info *from;
2212 struct rt6_info *rt;
2213
2214 rt = container_of(dst, struct rt6_info, dst);
2215
2216 rcu_read_lock();
2217
2218	/* All IPv6 dsts are created with ->obsolete set to the value
2219	 * DST_OBSOLETE_FORCE_CHK, which forces validation calls down
2220 * into this function always.
2221 */
2222
2223 from = rcu_dereference(rt->from);
2224
2225 if (from && (rt->rt6i_flags & RTF_PCPU ||
2226 unlikely(!list_empty(&rt->rt6i_uncached))))
2227 dst_ret = rt6_dst_from_check(rt, from, cookie);
2228 else
2229 dst_ret = rt6_check(rt, from, cookie);
2230
2231 rcu_read_unlock();
2232
2233 return dst_ret;
2234}
2235
2236static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2237{
2238 struct rt6_info *rt = (struct rt6_info *) dst;
2239
2240 if (rt) {
2241 if (rt->rt6i_flags & RTF_CACHE) {
2242 rcu_read_lock();
2243 if (rt6_check_expired(rt)) {
2244 rt6_remove_exception_rt(rt);
2245 dst = NULL;
2246 }
2247 rcu_read_unlock();
2248 } else {
2249 dst_release(dst);
2250 dst = NULL;
2251 }
2252 }
2253 return dst;
2254}
2255
2256static void ip6_link_failure(struct sk_buff *skb)
2257{
2258 struct rt6_info *rt;
2259
2260 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2261
2262 rt = (struct rt6_info *) skb_dst(skb);
2263 if (rt) {
2264 rcu_read_lock();
2265 if (rt->rt6i_flags & RTF_CACHE) {
2266 rt6_remove_exception_rt(rt);
2267 } else {
2268 struct fib6_info *from;
2269 struct fib6_node *fn;
2270
2271 from = rcu_dereference(rt->from);
2272 if (from) {
2273 fn = rcu_dereference(from->fib6_node);
2274 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2275 fn->fn_sernum = -1;
2276 }
2277 }
2278 rcu_read_unlock();
2279 }
2280}
2281
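/* Arm RTF_EXPIRES on a cached route; if it has no expiry of its own,
 * inherit the parent fib6_info's expiry first, then apply the new
 * timeout via dst_set_expires().
 */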
2282static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2283{
2284 if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2285 struct fib6_info *from;
2286
2287 rcu_read_lock();
2288 from = rcu_dereference(rt0->from);
2289 if (from)
2290 rt0->dst.expires = from->expires;
2291 rcu_read_unlock();
2292 }
2293
2294 dst_set_expires(&rt0->dst, timeout);
2295 rt0->rt6i_flags |= RTF_EXPIRES;
2296}
2297
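/* Record a newly learned path MTU on the route and (re)arm its
 * expiry using the ip6_rt_mtu_expires sysctl.
 */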
2298static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2299{
2300 struct net *net = dev_net(rt->dst.dev);
2301
2302 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2303 rt->rt6i_flags |= RTF_MODIFIED;
2304 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2305}
2306
2307static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2308{
2309 bool from_set;
2310
2311 rcu_read_lock();
2312 from_set = !!rcu_dereference(rt->from);
2313 rcu_read_unlock();
2314
2315 return !(rt->rt6i_flags & RTF_CACHE) &&
2316 (rt->rt6i_flags & RTF_PCPU || from_set);
2317}
2318
2319static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2320 const struct ipv6hdr *iph, u32 mtu)
2321{
2322 const struct in6_addr *daddr, *saddr;
2323 struct rt6_info *rt6 = (struct rt6_info *)dst;
2324
2325 if (dst_metric_locked(dst, RTAX_MTU))
2326 return;
2327
2328 if (iph) {
2329 daddr = &iph->daddr;
2330 saddr = &iph->saddr;
2331 } else if (sk) {
2332 daddr = &sk->sk_v6_daddr;
2333 saddr = &inet6_sk(sk)->saddr;
2334 } else {
2335 daddr = NULL;
2336 saddr = NULL;
2337 }
2338 dst_confirm_neigh(dst, daddr);
2339 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2340 if (mtu >= dst_mtu(dst))
2341 return;
2342
2343 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2344 rt6_do_update_pmtu(rt6, mtu);
2345 /* update rt6_ex->stamp for cache */
2346 if (rt6->rt6i_flags & RTF_CACHE)
2347 rt6_update_exception_stamp_rt(rt6);
2348 } else if (daddr) {
2349 struct fib6_info *from;
2350 struct rt6_info *nrt6;
2351
2352 rcu_read_lock();
2353 from = rcu_dereference(rt6->from);
2354 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2355 if (nrt6) {
2356 rt6_do_update_pmtu(nrt6, mtu);
2357 if (rt6_insert_exception(nrt6, from))
2358 dst_release_immediate(&nrt6->dst);
2359 }
2360 rcu_read_unlock();
2361 }
2362}
2363
2364static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2365 struct sk_buff *skb, u32 mtu)
2366{
2367 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2368}
2369
2370void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2371 int oif, u32 mark, kuid_t uid)
2372{
2373 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2374 struct dst_entry *dst;
2375 struct flowi6 fl6;
2376
2377 memset(&fl6, 0, sizeof(fl6));
2378 fl6.flowi6_oif = oif;
2379 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2380 fl6.daddr = iph->daddr;
2381 fl6.saddr = iph->saddr;
2382 fl6.flowlabel = ip6_flowinfo(iph);
2383 fl6.flowi6_uid = uid;
2384
2385 dst = ip6_route_output(net, NULL, &fl6);
2386 if (!dst->error)
2387 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2388 dst_release(dst);
2389}
2390EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2391
2392void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2393{
2394 int oif = sk->sk_bound_dev_if;
2395 struct dst_entry *dst;
2396
2397 if (!oif && skb->dev)
2398 oif = l3mdev_master_ifindex(skb->dev);
2399
2400 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2401
2402 dst = __sk_dst_get(sk);
2403 if (!dst || !dst->obsolete ||
2404 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2405 return;
2406
2407 bh_lock_sock(sk);
2408 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2409 ip6_datagram_dst_update(sk, false);
2410 bh_unlock_sock(sk);
2411}
2412EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2413
2414void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2415 const struct flowi6 *fl6)
2416{
2417#ifdef CONFIG_IPV6_SUBTREES
2418 struct ipv6_pinfo *np = inet6_sk(sk);
2419#endif
2420
2421 ip6_dst_store(sk, dst,
2422 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2423 &sk->sk_v6_daddr : NULL,
2424#ifdef CONFIG_IPV6_SUBTREES
2425 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2426 &np->saddr :
2427#endif
2428 NULL);
2429}
2430
2431/* Handle redirects */
2432struct ip6rd_flowi {
2433 struct flowi6 fl6;
2434 struct in6_addr gateway;
2435};
2436
2437static struct rt6_info *__ip6_route_redirect(struct net *net,
2438 struct fib6_table *table,
2439 struct flowi6 *fl6,
2440 const struct sk_buff *skb,
2441 int flags)
2442{
2443 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2444 struct rt6_info *ret = NULL, *rt_cache;
2445 struct fib6_info *rt;
2446 struct fib6_node *fn;
2447
2448 /* Get the "current" route for this destination and
2449	 * check if the redirect has come from the appropriate router.
2450 *
2451 * RFC 4861 specifies that redirects should only be
2452 * accepted if they come from the nexthop to the target.
2453 * Due to the way the routes are chosen, this notion
2454 * is a bit fuzzy and one might need to check all possible
2455 * routes.
2456 */
2457
2458 rcu_read_lock();
2459 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2460restart:
2461 for_each_fib6_node_rt_rcu(fn) {
2462 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2463 continue;
2464 if (fib6_check_expired(rt))
2465 continue;
2466 if (rt->fib6_flags & RTF_REJECT)
2467 break;
2468 if (!(rt->fib6_flags & RTF_GATEWAY))
2469 continue;
2470 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2471 continue;
2472 /* rt_cache's gateway might be different from its 'parent'
2473		 * in the case of an IP redirect.
2474 * So we keep searching in the exception table if the gateway
2475 * is different.
2476 */
2477 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2478 rt_cache = rt6_find_cached_rt(rt,
2479 &fl6->daddr,
2480 &fl6->saddr);
2481 if (rt_cache &&
2482 ipv6_addr_equal(&rdfl->gateway,
2483 &rt_cache->rt6i_gateway)) {
2484 ret = rt_cache;
2485 break;
2486 }
2487 continue;
2488 }
2489 break;
2490 }
2491
2492 if (!rt)
2493 rt = net->ipv6.fib6_null_entry;
2494 else if (rt->fib6_flags & RTF_REJECT) {
2495 ret = net->ipv6.ip6_null_entry;
2496 goto out;
2497 }
2498
2499 if (rt == net->ipv6.fib6_null_entry) {
2500 fn = fib6_backtrack(fn, &fl6->saddr);
2501 if (fn)
2502 goto restart;
2503 }
2504
2505out:
2506 if (ret)
2507 ip6_hold_safe(net, &ret, true);
2508 else
2509 ret = ip6_create_rt_rcu(rt);
2510
2511 rcu_read_unlock();
2512
2513 trace_fib6_table_lookup(net, rt, table, fl6);
2514 return ret;
2515};
2516
2517static struct dst_entry *ip6_route_redirect(struct net *net,
2518 const struct flowi6 *fl6,
2519 const struct sk_buff *skb,
2520 const struct in6_addr *gateway)
2521{
2522 int flags = RT6_LOOKUP_F_HAS_SADDR;
2523 struct ip6rd_flowi rdfl;
2524
2525 rdfl.fl6 = *fl6;
2526 rdfl.gateway = *gateway;
2527
2528 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2529 flags, __ip6_route_redirect);
2530}
2531
2532void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2533 kuid_t uid)
2534{
2535 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2536 struct dst_entry *dst;
2537 struct flowi6 fl6;
2538
2539 memset(&fl6, 0, sizeof(fl6));
2540 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2541 fl6.flowi6_oif = oif;
2542 fl6.flowi6_mark = mark;
2543 fl6.daddr = iph->daddr;
2544 fl6.saddr = iph->saddr;
2545 fl6.flowlabel = ip6_flowinfo(iph);
2546 fl6.flowi6_uid = uid;
2547
2548 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2549 rt6_do_redirect(dst, NULL, skb);
2550 dst_release(dst);
2551}
2552EXPORT_SYMBOL_GPL(ip6_redirect);
2553
2554void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2555 u32 mark)
2556{
2557 const struct ipv6hdr *iph = ipv6_hdr(skb);
2558 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2559 struct dst_entry *dst;
2560 struct flowi6 fl6;
2561
2562 memset(&fl6, 0, sizeof(fl6));
2563 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2564 fl6.flowi6_oif = oif;
2565 fl6.flowi6_mark = mark;
2566 fl6.daddr = msg->dest;
2567 fl6.saddr = iph->daddr;
2568 fl6.flowi6_uid = sock_net_uid(net, NULL);
2569
2570 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2571 rt6_do_redirect(dst, NULL, skb);
2572 dst_release(dst);
2573}
2574
2575void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2576{
2577 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2578 sk->sk_uid);
2579}
2580EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2581
2582static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2583{
2584 struct net_device *dev = dst->dev;
2585 unsigned int mtu = dst_mtu(dst);
2586 struct net *net = dev_net(dev);
2587
2588 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2589
2590 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2591 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2592
2593 /*
2594	 * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN, and the
2595	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2596 * IPV6_MAXPLEN is also valid and means: "any MSS,
2597 * rely only on pmtu discovery"
2598 */
2599 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2600 mtu = IPV6_MAXPLEN;
2601 return mtu;
2602}
2603
2604static unsigned int ip6_mtu(const struct dst_entry *dst)
2605{
2606 struct inet6_dev *idev;
2607 unsigned int mtu;
2608
2609 mtu = dst_metric_raw(dst, RTAX_MTU);
2610 if (mtu)
2611 goto out;
2612
2613 mtu = IPV6_MIN_MTU;
2614
2615 rcu_read_lock();
2616 idev = __in6_dev_get(dst->dev);
2617 if (idev)
2618 mtu = idev->cnf.mtu6;
2619 rcu_read_unlock();
2620
2621out:
2622 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2623
2624 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2625}
2626
2627/* MTU selection:
2628 * 1. mtu on route is locked - use it
2629 * 2. mtu from nexthop exception
2630 * 3. mtu from egress device
2631 *
2632 * based on ip6_dst_mtu_forward and exception logic of
2633 * rt6_find_cached_rt; called with rcu_read_lock
2634 */
2635u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2636 struct in6_addr *saddr)
2637{
2638 struct rt6_exception_bucket *bucket;
2639 struct rt6_exception *rt6_ex;
2640 struct in6_addr *src_key;
2641 struct inet6_dev *idev;
2642 u32 mtu = 0;
2643
2644 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2645 mtu = f6i->fib6_pmtu;
2646 if (mtu)
2647 goto out;
2648 }
2649
2650 src_key = NULL;
2651#ifdef CONFIG_IPV6_SUBTREES
2652 if (f6i->fib6_src.plen)
2653 src_key = saddr;
2654#endif
2655
2656 bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2657 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2658 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2659 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2660
2661 if (likely(!mtu)) {
2662 struct net_device *dev = fib6_info_nh_dev(f6i);
2663
2664 mtu = IPV6_MIN_MTU;
2665 idev = __in6_dev_get(dev);
2666 if (idev && idev->cnf.mtu6 > mtu)
2667 mtu = idev->cnf.mtu6;
2668 }
2669
2670 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2671out:
2672 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2673}
2674
2675struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2676 struct flowi6 *fl6)
2677{
2678 struct dst_entry *dst;
2679 struct rt6_info *rt;
2680 struct inet6_dev *idev = in6_dev_get(dev);
2681 struct net *net = dev_net(dev);
2682
2683 if (unlikely(!idev))
2684 return ERR_PTR(-ENODEV);
2685
2686 rt = ip6_dst_alloc(net, dev, 0);
2687 if (unlikely(!rt)) {
2688 in6_dev_put(idev);
2689 dst = ERR_PTR(-ENOMEM);
2690 goto out;
2691 }
2692
2693 rt->dst.flags |= DST_HOST;
2694 rt->dst.input = ip6_input;
2695 rt->dst.output = ip6_output;
2696 rt->rt6i_gateway = fl6->daddr;
2697 rt->rt6i_dst.addr = fl6->daddr;
2698 rt->rt6i_dst.plen = 128;
2699 rt->rt6i_idev = idev;
2700 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2701
2702 /* Add this dst into uncached_list so that rt6_disable_ip() can
2703	 * properly release the net_device
2704 */
2705 rt6_uncached_list_add(rt);
2706 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2707
2708 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2709
2710out:
2711 return dst;
2712}
2713
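/* dst garbage collector: do nothing if the last run was recent and
 * we are still under ip6_rt_max_size; otherwise run fib6 GC with an
 * adaptively aged expire. Returns nonzero while the number of
 * entries stays above the limit.
 */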
2714static int ip6_dst_gc(struct dst_ops *ops)
2715{
2716 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2717 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2718 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2719 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2720 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2721 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2722 int entries;
2723
2724 entries = dst_entries_get_fast(ops);
2725 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2726 entries <= rt_max_size)
2727 goto out;
2728
2729 net->ipv6.ip6_rt_gc_expire++;
2730 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2731 entries = dst_entries_get_slow(ops);
2732 if (entries < ops->gc_thresh)
2733 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2734out:
2735 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2736 return entries > rt_max_size;
2737}
2738
2739static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2740 struct fib6_config *cfg)
2741{
2742 struct dst_metrics *p;
2743
2744 if (!cfg->fc_mx)
2745 return 0;
2746
2747 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2748 if (unlikely(!p))
2749 return -ENOMEM;
2750
2751 refcount_set(&p->refcnt, 1);
2752 rt->fib6_metrics = p;
2753
2754 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2755}
2756
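/* Look up a nexthop gateway in a specific table. Returns NULL
 * instead of the null entry on failure, so callers can fall back
 * to a full lookup.
 */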
2757static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2758 struct fib6_config *cfg,
2759 const struct in6_addr *gw_addr,
2760 u32 tbid, int flags)
2761{
2762 struct flowi6 fl6 = {
2763 .flowi6_oif = cfg->fc_ifindex,
2764 .daddr = *gw_addr,
2765 .saddr = cfg->fc_prefsrc,
2766 };
2767 struct fib6_table *table;
2768 struct rt6_info *rt;
2769
2770 table = fib6_get_table(net, tbid);
2771 if (!table)
2772 return NULL;
2773
2774 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2775 flags |= RT6_LOOKUP_F_HAS_SADDR;
2776
2777 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2778 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2779
2780 /* if table lookup failed, fall back to full lookup */
2781 if (rt == net->ipv6.ip6_null_entry) {
2782 ip6_rt_put(rt);
2783 rt = NULL;
2784 }
2785
2786 return rt;
2787}
2788
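/* Validate an RTNH_F_ONLINK nexthop: the gateway must not resolve
 * (other than via a default route) to a local/anycast/reject route
 * or to a different egress device.
 */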
2789static int ip6_route_check_nh_onlink(struct net *net,
2790 struct fib6_config *cfg,
2791 const struct net_device *dev,
2792 struct netlink_ext_ack *extack)
2793{
2794 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2795 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2796 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2797 struct rt6_info *grt;
2798 int err;
2799
2800 err = 0;
2801 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2802 if (grt) {
2803 if (!grt->dst.error &&
2804 /* ignore match if it is the default route */
2805 grt->from && !ipv6_addr_any(&grt->from->fib6_dst.addr) &&
2806 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2807 NL_SET_ERR_MSG(extack,
2808 "Nexthop has invalid gateway or device mismatch");
2809 err = -EINVAL;
2810 }
2811
2812 ip6_rt_put(grt);
2813 }
2814
2815 return err;
2816}
2817
2818static int ip6_route_check_nh(struct net *net,
2819 struct fib6_config *cfg,
2820 struct net_device **_dev,
2821 struct inet6_dev **idev)
2822{
2823 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2824 struct net_device *dev = _dev ? *_dev : NULL;
2825 struct rt6_info *grt = NULL;
2826 int err = -EHOSTUNREACH;
2827
2828 if (cfg->fc_table) {
2829 int flags = RT6_LOOKUP_F_IFACE;
2830
2831 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2832 cfg->fc_table, flags);
2833 if (grt) {
2834 if (grt->rt6i_flags & RTF_GATEWAY ||
2835 (dev && dev != grt->dst.dev)) {
2836 ip6_rt_put(grt);
2837 grt = NULL;
2838 }
2839 }
2840 }
2841
2842 if (!grt)
2843 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2844
2845 if (!grt)
2846 goto out;
2847
2848 if (dev) {
2849 if (dev != grt->dst.dev) {
2850 ip6_rt_put(grt);
2851 goto out;
2852 }
2853 } else {
2854 *_dev = dev = grt->dst.dev;
2855 *idev = grt->rt6i_idev;
2856 dev_hold(dev);
2857 in6_dev_hold(grt->rt6i_idev);
2858 }
2859
2860 if (!(grt->rt6i_flags & RTF_GATEWAY))
2861 err = 0;
2862
2863 ip6_rt_put(grt);
2864
2865out:
2866 return err;
2867}
2868
2869static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2870 struct net_device **_dev, struct inet6_dev **idev,
2871 struct netlink_ext_ack *extack)
2872{
2873 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2874 int gwa_type = ipv6_addr_type(gw_addr);
2875 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2876 const struct net_device *dev = *_dev;
2877 bool need_addr_check = !dev;
2878 int err = -EINVAL;
2879
2880	/* If gw_addr is local we will fail to detect this in case the
2881	 * address is still TENTATIVE (DAD in progress): rt6_lookup()
2882	 * will return the already-added prefix route via the interface
2883	 * that the prefix route was assigned to, which might be non-loopback.
2884 */
2885 if (dev &&
2886 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2887 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2888 goto out;
2889 }
2890
2891 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2892		/* IPv6 strictly inhibits using non-link-local
2893		 * addresses as nexthop addresses.
2894		 * Otherwise, the router will not be able to send redirects.
2895		 * It is very good, but in some (rare!) circumstances
2896		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2897		 * some exceptions. --ANK
2898		 * We allow IPv4-mapped nexthops to support RFC 4798-type
2899		 * addressing.
2900 */
2901 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2902 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2903 goto out;
2904 }
2905
2906 if (cfg->fc_flags & RTNH_F_ONLINK)
2907 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2908 else
2909 err = ip6_route_check_nh(net, cfg, _dev, idev);
2910
2911 if (err)
2912 goto out;
2913 }
2914
2915 /* reload in case device was changed */
2916 dev = *_dev;
2917
2918 err = -EINVAL;
2919 if (!dev) {
2920 NL_SET_ERR_MSG(extack, "Egress device not specified");
2921 goto out;
2922 } else if (dev->flags & IFF_LOOPBACK) {
2923 NL_SET_ERR_MSG(extack,
2924 "Egress device can not be loopback device for this route");
2925 goto out;
2926 }
2927
2928 /* if we did not check gw_addr above, do so now that the
2929 * egress device has been resolved.
2930 */
2931 if (need_addr_check &&
2932 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2933 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2934 goto out;
2935 }
2936
2937 err = 0;
2938out:
2939 return err;
2940}
2941
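/* Build a fully initialised fib6_info from a fib6_config, validating
 * flags, device, gateway and preferred source address. The caller is
 * expected to insert it with __ip6_ins_rt() and drop its reference.
 */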
2942static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2943 gfp_t gfp_flags,
2944 struct netlink_ext_ack *extack)
2945{
2946 struct net *net = cfg->fc_nlinfo.nl_net;
2947 struct fib6_info *rt = NULL;
2948 struct net_device *dev = NULL;
2949 struct inet6_dev *idev = NULL;
2950 struct fib6_table *table;
2951 int addr_type;
2952 int err = -EINVAL;
2953
2954 /* RTF_PCPU is an internal flag; can not be set by userspace */
2955 if (cfg->fc_flags & RTF_PCPU) {
2956 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2957 goto out;
2958 }
2959
2960 /* RTF_CACHE is an internal flag; can not be set by userspace */
2961 if (cfg->fc_flags & RTF_CACHE) {
2962 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2963 goto out;
2964 }
2965
2966 if (cfg->fc_type > RTN_MAX) {
2967 NL_SET_ERR_MSG(extack, "Invalid route type");
2968 goto out;
2969 }
2970
2971 if (cfg->fc_dst_len > 128) {
2972 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2973 goto out;
2974 }
2975 if (cfg->fc_src_len > 128) {
2976 NL_SET_ERR_MSG(extack, "Invalid source address length");
2977 goto out;
2978 }
2979#ifndef CONFIG_IPV6_SUBTREES
2980 if (cfg->fc_src_len) {
2981 NL_SET_ERR_MSG(extack,
2982 "Specifying source address requires IPV6_SUBTREES to be enabled");
2983 goto out;
2984 }
2985#endif
2986 if (cfg->fc_ifindex) {
2987 err = -ENODEV;
2988 dev = dev_get_by_index(net, cfg->fc_ifindex);
2989 if (!dev)
2990 goto out;
2991 idev = in6_dev_get(dev);
2992 if (!idev)
2993 goto out;
2994 }
2995
2996 if (cfg->fc_metric == 0)
2997 cfg->fc_metric = IP6_RT_PRIO_USER;
2998
2999 if (cfg->fc_flags & RTNH_F_ONLINK) {
3000 if (!dev) {
3001 NL_SET_ERR_MSG(extack,
3002 "Nexthop device required for onlink");
3003 err = -ENODEV;
3004 goto out;
3005 }
3006
3007 if (!(dev->flags & IFF_UP)) {
3008 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3009 err = -ENETDOWN;
3010 goto out;
3011 }
3012 }
3013
3014 err = -ENOBUFS;
3015 if (cfg->fc_nlinfo.nlh &&
3016 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3017 table = fib6_get_table(net, cfg->fc_table);
3018 if (!table) {
3019 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3020 table = fib6_new_table(net, cfg->fc_table);
3021 }
3022 } else {
3023 table = fib6_new_table(net, cfg->fc_table);
3024 }
3025
3026 if (!table)
3027 goto out;
3028
3029 err = -ENOMEM;
3030 rt = fib6_info_alloc(gfp_flags);
3031 if (!rt)
3032 goto out;
3033
3034 if (cfg->fc_flags & RTF_ADDRCONF)
3035 rt->dst_nocount = true;
3036
3037 err = ip6_convert_metrics(net, rt, cfg);
3038 if (err < 0)
3039 goto out;
3040
3041 if (cfg->fc_flags & RTF_EXPIRES)
3042 fib6_set_expires(rt, jiffies +
3043 clock_t_to_jiffies(cfg->fc_expires));
3044 else
3045 fib6_clean_expires(rt);
3046
3047 if (cfg->fc_protocol == RTPROT_UNSPEC)
3048 cfg->fc_protocol = RTPROT_BOOT;
3049 rt->fib6_protocol = cfg->fc_protocol;
3050
3051 addr_type = ipv6_addr_type(&cfg->fc_dst);
3052
3053 if (cfg->fc_encap) {
3054 struct lwtunnel_state *lwtstate;
3055
3056 err = lwtunnel_build_state(cfg->fc_encap_type,
3057 cfg->fc_encap, AF_INET6, cfg,
3058 &lwtstate, extack);
3059 if (err)
3060 goto out;
3061 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3062 }
3063
3064 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3065 rt->fib6_dst.plen = cfg->fc_dst_len;
3066 if (rt->fib6_dst.plen == 128)
3067 rt->dst_host = true;
3068
3069#ifdef CONFIG_IPV6_SUBTREES
3070 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3071 rt->fib6_src.plen = cfg->fc_src_len;
3072#endif
3073
3074 rt->fib6_metric = cfg->fc_metric;
3075 rt->fib6_nh.nh_weight = 1;
3076
3077 rt->fib6_type = cfg->fc_type;
3078
3079	/* We cannot add true routes via loopback here, as they would
3080	 * result in kernel looping; promote them to reject routes.
3081	 */
3082 if ((cfg->fc_flags & RTF_REJECT) ||
3083 (dev && (dev->flags & IFF_LOOPBACK) &&
3084 !(addr_type & IPV6_ADDR_LOOPBACK) &&
3085 !(cfg->fc_flags & RTF_LOCAL))) {
3086 /* hold loopback dev/idev if we haven't done so. */
3087 if (dev != net->loopback_dev) {
3088 if (dev) {
3089 dev_put(dev);
3090 in6_dev_put(idev);
3091 }
3092 dev = net->loopback_dev;
3093 dev_hold(dev);
3094 idev = in6_dev_get(dev);
3095 if (!idev) {
3096 err = -ENODEV;
3097 goto out;
3098 }
3099 }
3100 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3101 goto install_route;
3102 }
3103
3104 if (cfg->fc_flags & RTF_GATEWAY) {
3105 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3106 if (err)
3107 goto out;
3108
3109 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3110 }
3111
3112 err = -ENODEV;
3113 if (!dev)
3114 goto out;
3115
3116 if (idev->cnf.disable_ipv6) {
3117 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3118 err = -EACCES;
3119 goto out;
3120 }
3121
3122 if (!(dev->flags & IFF_UP)) {
3123 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3124 err = -ENETDOWN;
3125 goto out;
3126 }
3127
3128 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3129 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3130 NL_SET_ERR_MSG(extack, "Invalid source address");
3131 err = -EINVAL;
3132 goto out;
3133 }
3134 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3135 rt->fib6_prefsrc.plen = 128;
3136 } else
3137 rt->fib6_prefsrc.plen = 0;
3138
3139 rt->fib6_flags = cfg->fc_flags;
3140
3141install_route:
3142 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3143 !netif_carrier_ok(dev))
3144 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3145 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3146 rt->fib6_nh.nh_dev = dev;
3147 rt->fib6_table = table;
3148
3149 cfg->fc_nlinfo.nl_net = dev_net(dev);
3150
3151 if (idev)
3152 in6_dev_put(idev);
3153
3154 return rt;
3155out:
3156 if (dev)
3157 dev_put(dev);
3158 if (idev)
3159 in6_dev_put(idev);
3160
3161 fib6_info_release(rt);
3162 return ERR_PTR(err);
3163}
3164
3165int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3166 struct netlink_ext_ack *extack)
3167{
3168 struct fib6_info *rt;
3169 int err;
3170
3171 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3172 if (IS_ERR(rt))
3173 return PTR_ERR(rt);
3174
3175 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3176 fib6_info_release(rt);
3177
3178 return err;
3179}
3180
3181static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3182{
3183 struct net *net = info->nl_net;
3184 struct fib6_table *table;
3185 int err;
3186
3187 if (rt == net->ipv6.fib6_null_entry) {
3188 err = -ENOENT;
3189 goto out;
3190 }
3191
3192 table = rt->fib6_table;
3193 spin_lock_bh(&table->tb6_lock);
3194 err = fib6_del(rt, info);
3195 spin_unlock_bh(&table->tb6_lock);
3196
3197out:
3198 fib6_info_release(rt);
3199 return err;
3200}
3201
3202int ip6_del_rt(struct net *net, struct fib6_info *rt)
3203{
3204 struct nl_info info = { .nl_net = net };
3205
3206 return __ip6_del_rt(rt, &info);
3207}
3208
3209static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3210{
3211 struct nl_info *info = &cfg->fc_nlinfo;
3212 struct net *net = info->nl_net;
3213 struct sk_buff *skb = NULL;
3214 struct fib6_table *table;
3215 int err = -ENOENT;
3216
3217 if (rt == net->ipv6.fib6_null_entry)
3218 goto out_put;
3219 table = rt->fib6_table;
3220 spin_lock_bh(&table->tb6_lock);
3221
3222 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3223 struct fib6_info *sibling, *next_sibling;
3224
3225 /* prefer to send a single notification with all hops */
3226 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3227 if (skb) {
3228 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3229
3230 if (rt6_fill_node(net, skb, rt, NULL,
3231 NULL, NULL, 0, RTM_DELROUTE,
3232 info->portid, seq, 0) < 0) {
3233 kfree_skb(skb);
3234 skb = NULL;
3235 } else
3236 info->skip_notify = 1;
3237 }
3238
3239 list_for_each_entry_safe(sibling, next_sibling,
3240 &rt->fib6_siblings,
3241 fib6_siblings) {
3242 err = fib6_del(sibling, info);
3243 if (err)
3244 goto out_unlock;
3245 }
3246 }
3247
3248 err = fib6_del(rt, info);
3249out_unlock:
3250 spin_unlock_bh(&table->tb6_lock);
3251out_put:
3252 fib6_info_release(rt);
3253
3254 if (skb) {
3255 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3256 info->nlh, gfp_any());
3257 }
3258 return err;
3259}
3260
3261static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3262{
3263 int rc = -ESRCH;
3264
3265 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3266 goto out;
3267
3268 if (cfg->fc_flags & RTF_GATEWAY &&
3269 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3270 goto out;
3271
3272 rc = rt6_remove_exception_rt(rt);
3273out:
3274 return rc;
3275}
3276
3277static int ip6_route_del(struct fib6_config *cfg,
3278 struct netlink_ext_ack *extack)
3279{
3280 struct rt6_info *rt_cache;
3281 struct fib6_table *table;
3282 struct fib6_info *rt;
3283 struct fib6_node *fn;
3284 int err = -ESRCH;
3285
3286 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3287 if (!table) {
3288 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3289 return err;
3290 }
3291
3292 rcu_read_lock();
3293
3294 fn = fib6_locate(&table->tb6_root,
3295 &cfg->fc_dst, cfg->fc_dst_len,
3296 &cfg->fc_src, cfg->fc_src_len,
3297 !(cfg->fc_flags & RTF_CACHE));
3298
3299 if (fn) {
3300 for_each_fib6_node_rt_rcu(fn) {
3301 if (cfg->fc_flags & RTF_CACHE) {
3302 int rc;
3303
3304 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3305 &cfg->fc_src);
3306 if (rt_cache) {
3307 rc = ip6_del_cached_rt(rt_cache, cfg);
3308 if (rc != -ESRCH) {
3309 rcu_read_unlock();
3310 return rc;
3311 }
3312 }
3313 continue;
3314 }
3315 if (cfg->fc_ifindex &&
3316 (!rt->fib6_nh.nh_dev ||
3317 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3318 continue;
3319 if (cfg->fc_flags & RTF_GATEWAY &&
3320 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3321 continue;
3322 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3323 continue;
3324 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3325 continue;
3326 if (!fib6_info_hold_safe(rt))
3327 continue;
3328 rcu_read_unlock();
3329
3330		/* if a gateway was specified, only delete the one hop */
3331 if (cfg->fc_flags & RTF_GATEWAY)
3332 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3333
3334 return __ip6_del_rt_siblings(rt, cfg);
3335 }
3336 }
3337 rcu_read_unlock();
3338
3339 return err;
3340}
3341
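/* Handle a received NDISC Redirect: validate it per RFC 4861,
 * update the neighbour cache entry for the target, and install an
 * RTF_CACHE exception route through the new gateway.
 */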
3342static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3343{
3344 struct netevent_redirect netevent;
3345 struct rt6_info *rt, *nrt = NULL;
3346 struct ndisc_options ndopts;
3347 struct inet6_dev *in6_dev;
3348 struct neighbour *neigh;
3349 struct fib6_info *from;
3350 struct rd_msg *msg;
3351 int optlen, on_link;
3352 u8 *lladdr;
3353
3354 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3355 optlen -= sizeof(*msg);
3356
3357 if (optlen < 0) {
3358 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3359 return;
3360 }
3361
3362 msg = (struct rd_msg *)icmp6_hdr(skb);
3363
3364 if (ipv6_addr_is_multicast(&msg->dest)) {
3365 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3366 return;
3367 }
3368
3369 on_link = 0;
3370 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3371 on_link = 1;
3372 } else if (ipv6_addr_type(&msg->target) !=
3373 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3374 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3375 return;
3376 }
3377
3378 in6_dev = __in6_dev_get(skb->dev);
3379 if (!in6_dev)
3380 return;
3381 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3382 return;
3383
3384 /* RFC2461 8.1:
3385 * The IP source address of the Redirect MUST be the same as the current
3386 * first-hop router for the specified ICMP Destination Address.
3387 */
3388
3389 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3390 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3391 return;
3392 }
3393
3394 lladdr = NULL;
3395 if (ndopts.nd_opts_tgt_lladdr) {
3396 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3397 skb->dev);
3398 if (!lladdr) {
3399 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3400 return;
3401 }
3402 }
3403
3404 rt = (struct rt6_info *) dst;
3405 if (rt->rt6i_flags & RTF_REJECT) {
3406 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3407 return;
3408 }
3409
3410 /* Redirect received -> path was valid.
3411 * Look, redirects are sent only in response to data packets,
3412	 * so this nexthop apparently is reachable. --ANK
3413 */
3414 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3415
3416 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3417 if (!neigh)
3418 return;
3419
3420 /*
3421 * We have finally decided to accept it.
3422 */
3423
3424 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3425 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3426 NEIGH_UPDATE_F_OVERRIDE|
3427 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3428 NEIGH_UPDATE_F_ISROUTER)),
3429 NDISC_REDIRECT, &ndopts);
3430
3431 rcu_read_lock();
3432 from = rcu_dereference(rt->from);
3433	/* This fib6_info_hold() is safe here because we hold a reference
3434	 * to rt, and rt already holds a reference to the fib6_info.
3435 */
3436 fib6_info_hold(from);
3437 rcu_read_unlock();
3438
3439 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3440 if (!nrt)
3441 goto out;
3442
3443 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3444 if (on_link)
3445 nrt->rt6i_flags &= ~RTF_GATEWAY;
3446
3447 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3448
3449 /* No need to remove rt from the exception table if rt is
3450	 * a cached route because rt6_insert_exception() will
3451	 * take care of it.
3452 */
3453 if (rt6_insert_exception(nrt, from)) {
3454 dst_release_immediate(&nrt->dst);
3455 goto out;
3456 }
3457
3458 netevent.old = &rt->dst;
3459 netevent.new = &nrt->dst;
3460 netevent.daddr = &msg->dest;
3461 netevent.neigh = neigh;
3462 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3463
3464out:
3465 fib6_info_release(from);
3466 neigh_release(neigh);
3467}
3468
3469#ifdef CONFIG_IPV6_ROUTE_INFO
3470static struct fib6_info *rt6_get_route_info(struct net *net,
3471 const struct in6_addr *prefix, int prefixlen,
3472 const struct in6_addr *gwaddr,
3473 struct net_device *dev)
3474{
3475 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3476 int ifindex = dev->ifindex;
3477 struct fib6_node *fn;
3478 struct fib6_info *rt = NULL;
3479 struct fib6_table *table;
3480
3481 table = fib6_get_table(net, tb_id);
3482 if (!table)
3483 return NULL;
3484
3485 rcu_read_lock();
3486 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3487 if (!fn)
3488 goto out;
3489
3490 for_each_fib6_node_rt_rcu(fn) {
3491 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3492 continue;
3493 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3494 continue;
3495 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3496 continue;
3497 if (!fib6_info_hold_safe(rt))
3498 continue;
3499 break;
3500 }
3501out:
3502 rcu_read_unlock();
3503 return rt;
3504}
3505
3506static struct fib6_info *rt6_add_route_info(struct net *net,
3507 const struct in6_addr *prefix, int prefixlen,
3508 const struct in6_addr *gwaddr,
3509 struct net_device *dev,
3510 unsigned int pref)
3511{
3512 struct fib6_config cfg = {
3513 .fc_metric = IP6_RT_PRIO_USER,
3514 .fc_ifindex = dev->ifindex,
3515 .fc_dst_len = prefixlen,
3516 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3517 RTF_UP | RTF_PREF(pref),
3518 .fc_protocol = RTPROT_RA,
3519 .fc_type = RTN_UNICAST,
3520 .fc_nlinfo.portid = 0,
3521 .fc_nlinfo.nlh = NULL,
3522 .fc_nlinfo.nl_net = net,
3523 };
3524
3525	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3526 cfg.fc_dst = *prefix;
3527 cfg.fc_gateway = *gwaddr;
3528
3529 /* We should treat it as a default route if prefix length is 0. */
3530 if (!prefixlen)
3531 cfg.fc_flags |= RTF_DEFAULT;
3532
3533 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3534
3535 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3536}
3537#endif
3538
3539struct fib6_info *rt6_get_dflt_router(struct net *net,
3540 const struct in6_addr *addr,
3541 struct net_device *dev)
3542{
3543 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3544 struct fib6_info *rt;
3545 struct fib6_table *table;
3546
3547 table = fib6_get_table(net, tb_id);
3548 if (!table)
3549 return NULL;
3550
3551 rcu_read_lock();
3552 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3553 if (dev == rt->fib6_nh.nh_dev &&
3554 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3555 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3556 break;
3557 }
3558 if (rt && !fib6_info_hold_safe(rt))
3559 rt = NULL;
3560 rcu_read_unlock();
3561 return rt;
3562}
3563
3564struct fib6_info *rt6_add_dflt_router(struct net *net,
3565 const struct in6_addr *gwaddr,
3566 struct net_device *dev,
3567 unsigned int pref)
3568{
3569 struct fib6_config cfg = {
3570 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3571 .fc_metric = IP6_RT_PRIO_USER,
3572 .fc_ifindex = dev->ifindex,
3573 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3574 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3575 .fc_protocol = RTPROT_RA,
3576 .fc_type = RTN_UNICAST,
3577 .fc_nlinfo.portid = 0,
3578 .fc_nlinfo.nlh = NULL,
3579 .fc_nlinfo.nl_net = net,
3580 };
3581
3582 cfg.fc_gateway = *gwaddr;
3583
3584 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3585 struct fib6_table *table;
3586
3587 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3588 if (table)
3589 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3590 }
3591
3592 return rt6_get_dflt_router(net, gwaddr, dev);
3593}
3594
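/* Delete RA-learned default routes from the table, restarting the
 * walk after each deletion; routes on interfaces with accept_ra == 2
 * are kept.
 */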
3595static void __rt6_purge_dflt_routers(struct net *net,
3596 struct fib6_table *table)
3597{
3598 struct fib6_info *rt;
3599
3600restart:
3601 rcu_read_lock();
3602 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3603 struct net_device *dev = fib6_info_nh_dev(rt);
3604 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3605
3606 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3607 (!idev || idev->cnf.accept_ra != 2) &&
3608 fib6_info_hold_safe(rt)) {
3609 rcu_read_unlock();
3610 ip6_del_rt(net, rt);
3611 goto restart;
3612 }
3613 }
3614 rcu_read_unlock();
3615
3616 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3617}
3618
3619void rt6_purge_dflt_routers(struct net *net)
3620{
3621 struct fib6_table *table;
3622 struct hlist_head *head;
3623 unsigned int h;
3624
3625 rcu_read_lock();
3626
3627 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3628 head = &net->ipv6.fib_table_hash[h];
3629 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3630 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3631 __rt6_purge_dflt_routers(net, table);
3632 }
3633 }
3634
3635 rcu_read_unlock();
3636}
3637
3638static void rtmsg_to_fib6_config(struct net *net,
3639 struct in6_rtmsg *rtmsg,
3640 struct fib6_config *cfg)
3641{
3642 memset(cfg, 0, sizeof(*cfg));
3643
3644 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3645 : RT6_TABLE_MAIN;
3646 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3647 cfg->fc_metric = rtmsg->rtmsg_metric;
3648 cfg->fc_expires = rtmsg->rtmsg_info;
3649 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3650 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3651 cfg->fc_flags = rtmsg->rtmsg_flags;
3652 cfg->fc_type = rtmsg->rtmsg_type;
3653
3654 cfg->fc_nlinfo.nl_net = net;
3655
3656 cfg->fc_dst = rtmsg->rtmsg_dst;
3657 cfg->fc_src = rtmsg->rtmsg_src;
3658 cfg->fc_gateway = rtmsg->rtmsg_gateway;
3659}
3660
3661int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3662{
3663 struct fib6_config cfg;
3664 struct in6_rtmsg rtmsg;
3665 int err;
3666
3667 switch (cmd) {
3668 case SIOCADDRT: /* Add a route */
3669 case SIOCDELRT: /* Delete a route */
3670 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3671 return -EPERM;
3672 err = copy_from_user(&rtmsg, arg,
3673 sizeof(struct in6_rtmsg));
3674 if (err)
3675 return -EFAULT;
3676
3677 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3678
3679 rtnl_lock();
3680 switch (cmd) {
3681 case SIOCADDRT:
3682 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3683 break;
3684 case SIOCDELRT:
3685 err = ip6_route_del(&cfg, NULL);
3686 break;
3687 default:
3688 err = -EINVAL;
3689 }
3690 rtnl_unlock();
3691
3692 return err;
3693 }
3694
3695 return -EINVAL;
3696}
3697
3698/*
3699 * Drop the packet on the floor
3700 */
3701
3702static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3703{
3704 int type;
3705 struct dst_entry *dst = skb_dst(skb);
3706 switch (ipstats_mib_noroutes) {
3707 case IPSTATS_MIB_INNOROUTES:
3708 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3709 if (type == IPV6_ADDR_ANY) {
3710 IP6_INC_STATS(dev_net(dst->dev),
3711 __in6_dev_get_safely(skb->dev),
3712 IPSTATS_MIB_INADDRERRORS);
3713 break;
3714 }
3715 /* FALLTHROUGH */
3716 case IPSTATS_MIB_OUTNOROUTES:
3717 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3718 ipstats_mib_noroutes);
3719 break;
3720 }
3721 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3722 kfree_skb(skb);
3723 return 0;
3724}
3725
3726static int ip6_pkt_discard(struct sk_buff *skb)
3727{
3728 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3729}
3730
3731static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3732{
3733 skb->dev = skb_dst(skb)->dev;
3734 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3735}
3736
3737static int ip6_pkt_prohibit(struct sk_buff *skb)
3738{
3739 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3740}
3741
3742static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3743{
3744 skb->dev = skb_dst(skb)->dev;
3745 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3746}
3747
3748/*
3749 * Allocate a dst for local (unicast / anycast) address.
3750 */
3751
3752struct fib6_info *addrconf_f6i_alloc(struct net *net,
3753 struct inet6_dev *idev,
3754 const struct in6_addr *addr,
3755 bool anycast, gfp_t gfp_flags)
3756{
3757 u32 tb_id;
3758 struct net_device *dev = idev->dev;
3759 struct fib6_info *f6i;
3760
3761 f6i = fib6_info_alloc(gfp_flags);
3762 if (!f6i)
3763 return ERR_PTR(-ENOMEM);
3764
3765 f6i->dst_nocount = true;
3766 f6i->dst_host = true;
3767 f6i->fib6_protocol = RTPROT_KERNEL;
3768 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3769 if (anycast) {
3770 f6i->fib6_type = RTN_ANYCAST;
3771 f6i->fib6_flags |= RTF_ANYCAST;
3772 } else {
3773 f6i->fib6_type = RTN_LOCAL;
3774 f6i->fib6_flags |= RTF_LOCAL;
3775 }
3776
3777 f6i->fib6_nh.nh_gw = *addr;
3778 dev_hold(dev);
3779 f6i->fib6_nh.nh_dev = dev;
3780 f6i->fib6_dst.addr = *addr;
3781 f6i->fib6_dst.plen = 128;
3782 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3783 f6i->fib6_table = fib6_get_table(net, tb_id);
3784
3785 return f6i;
3786}
3787
3788/* remove a deleted IP from prefsrc entries */
3789struct arg_dev_net_ip {
3790 struct net_device *dev;
3791 struct net *net;
3792 struct in6_addr *addr;
3793};
3794
3795static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3796{
3797 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3798 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3799 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3800
3801 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3802 rt != net->ipv6.fib6_null_entry &&
3803 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3804 spin_lock_bh(&rt6_exception_lock);
3805 /* remove prefsrc entry */
3806 rt->fib6_prefsrc.plen = 0;
3807 /* need to update cache as well */
3808 rt6_exceptions_remove_prefsrc(rt);
3809 spin_unlock_bh(&rt6_exception_lock);
3810 }
3811 return 0;
3812}
3813
3814void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3815{
3816 struct net *net = dev_net(ifp->idev->dev);
3817 struct arg_dev_net_ip adni = {
3818 .dev = ifp->idev->dev,
3819 .net = net,
3820 .addr = &ifp->addr,
3821 };
3822 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3823}
3824
3825#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3826
3827/* Remove routers and update dst entries when a gateway turns into a host. */
3828static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3829{
3830 struct in6_addr *gateway = (struct in6_addr *)arg;
3831
3832 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3833 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3834 return -1;
3835 }
3836
3837 /* Further clean up cached routes in exception table.
3838	 * This is needed because a cached route may have a different
3839	 * gateway than its 'parent' in the case of an IP redirect.
3840 */
3841 rt6_exceptions_clean_tohost(rt, gateway);
3842
3843 return 0;
3844}
3845
3846void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3847{
3848 fib6_clean_all(net, fib6_clean_tohost, gateway);
3849}
3850
3851struct arg_netdev_event {
3852 const struct net_device *dev;
3853 union {
3854 unsigned int nh_flags;
3855 unsigned long event;
3856 };
3857};
3858
3859static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3860{
3861 struct fib6_info *iter;
3862 struct fib6_node *fn;
3863
3864 fn = rcu_dereference_protected(rt->fib6_node,
3865 lockdep_is_held(&rt->fib6_table->tb6_lock));
3866 iter = rcu_dereference_protected(fn->leaf,
3867 lockdep_is_held(&rt->fib6_table->tb6_lock));
3868 while (iter) {
3869 if (iter->fib6_metric == rt->fib6_metric &&
3870 rt6_qualify_for_ecmp(iter))
3871 return iter;
3872 iter = rcu_dereference_protected(iter->fib6_next,
3873 lockdep_is_held(&rt->fib6_table->tb6_lock));
3874 }
3875
3876 return NULL;
3877}
3878
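/* A nexthop is dead if it is marked RTNH_F_DEAD, or if it is
 * link-down and the route does not tolerate link-down nexthops.
 */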
3879static bool rt6_is_dead(const struct fib6_info *rt)
3880{
3881 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3882 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3883 fib6_ignore_linkdown(rt)))
3884 return true;
3885
3886 return false;
3887}
3888
3889static int rt6_multipath_total_weight(const struct fib6_info *rt)
3890{
3891 struct fib6_info *iter;
3892 int total = 0;
3893
3894 if (!rt6_is_dead(rt))
3895 total += rt->fib6_nh.nh_weight;
3896
3897 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3898 if (!rt6_is_dead(iter))
3899 total += iter->fib6_nh.nh_weight;
3900 }
3901
3902 return total;
3903}
3904
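/* Assign this nexthop's upper bound for hash-threshold multipath
 * selection: the cumulative weight so far scaled into the 31-bit
 * hash space, or -1 for a dead nexthop so it is never selected.
 */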
3905static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3906{
3907 int upper_bound = -1;
3908
3909 if (!rt6_is_dead(rt)) {
3910 *weight += rt->fib6_nh.nh_weight;
3911 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3912 total) - 1;
3913 }
3914 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3915}
3916
3917static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3918{
3919 struct fib6_info *iter;
3920 int weight = 0;
3921
3922 rt6_upper_bound_set(rt, &weight, total);
3923
3924 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3925 rt6_upper_bound_set(iter, &weight, total);
3926}
3927
3928void rt6_multipath_rebalance(struct fib6_info *rt)
3929{
3930 struct fib6_info *first;
3931 int total;
3932
3933	/* If the entire multipath route was marked for flushing,
3934	 * there is no need to rebalance upon the removal of every
3935 * sibling route.
3936 */
3937 if (!rt->fib6_nsiblings || rt->should_flush)
3938 return;
3939
3940 /* During lookup routes are evaluated in order, so we need to
3941 * make sure upper bounds are assigned from the first sibling
3942 * onwards.
3943 */
3944 first = rt6_multipath_first_sibling(rt);
3945 if (WARN_ON_ONCE(!first))
3946 return;
3947
3948 total = rt6_multipath_total_weight(first);
3949 rt6_multipath_upper_bound_set(first, total);
3950}
3951
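/* fib6_clean_all() callback for rt6_sync_up(): clear the given
 * nexthop flags on routes egressing via the device that came up,
 * bump the sernum and rebalance multipath weights.
 */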
3952static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3953{
3954 const struct arg_netdev_event *arg = p_arg;
3955 struct net *net = dev_net(arg->dev);
3956
3957 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3958 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3959 fib6_update_sernum_upto_root(net, rt);
3960 rt6_multipath_rebalance(rt);
3961 }
3962
3963 return 0;
3964}
3965
3966void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3967{
3968 struct arg_netdev_event arg = {
3969 .dev = dev,
3970 {
3971 .nh_flags = nh_flags,
3972 },
3973 };
3974
3975 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3976 arg.nh_flags |= RTNH_F_LINKDOWN;
3977
3978 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3979}
3980
3981static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3982 const struct net_device *dev)
3983{
3984 struct fib6_info *iter;
3985
3986 if (rt->fib6_nh.nh_dev == dev)
3987 return true;
3988 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3989 if (iter->fib6_nh.nh_dev == dev)
3990 return true;
3991
3992 return false;
3993}
3994
3995static void rt6_multipath_flush(struct fib6_info *rt)
3996{
3997 struct fib6_info *iter;
3998
3999 rt->should_flush = 1;
4000 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4001 iter->should_flush = 1;
4002}
4003
4004static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4005 const struct net_device *down_dev)
4006{
4007 struct fib6_info *iter;
4008 unsigned int dead = 0;
4009
4010 if (rt->fib6_nh.nh_dev == down_dev ||
4011 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4012 dead++;
4013 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4014 if (iter->fib6_nh.nh_dev == down_dev ||
4015 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
4016 dead++;
4017
4018 return dead;
4019}
4020
4021static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4022 const struct net_device *dev,
4023 unsigned int nh_flags)
4024{
4025 struct fib6_info *iter;
4026
4027 if (rt->fib6_nh.nh_dev == dev)
4028 rt->fib6_nh.nh_flags |= nh_flags;
4029 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4030 if (iter->fib6_nh.nh_dev == dev)
4031 iter->fib6_nh.nh_flags |= nh_flags;
4032}
4033
4034/* called with write lock held for table with rt */
4035static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4036{
4037 const struct arg_netdev_event *arg = p_arg;
4038 const struct net_device *dev = arg->dev;
4039 struct net *net = dev_net(dev);
4040
4041 if (rt == net->ipv6.fib6_null_entry)
4042 return 0;
4043
4044 switch (arg->event) {
4045 case NETDEV_UNREGISTER:
4046 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4047 case NETDEV_DOWN:
4048 if (rt->should_flush)
4049 return -1;
4050 if (!rt->fib6_nsiblings)
4051 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4052 if (rt6_multipath_uses_dev(rt, dev)) {
4053 unsigned int count;
4054
4055 count = rt6_multipath_dead_count(rt, dev);
4056 if (rt->fib6_nsiblings + 1 == count) {
4057 rt6_multipath_flush(rt);
4058 return -1;
4059 }
4060 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4061 RTNH_F_LINKDOWN);
4062 fib6_update_sernum(net, rt);
4063 rt6_multipath_rebalance(rt);
4064 }
4065 return -2;
4066 case NETDEV_CHANGE:
4067 if (rt->fib6_nh.nh_dev != dev ||
4068 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4069 break;
4070 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4071 rt6_multipath_rebalance(rt);
4072 break;
4073 }
4074
4075 return 0;
4076}
4077
4078void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4079{
4080 struct arg_netdev_event arg = {
4081 .dev = dev,
4082 {
4083 .event = event,
4084 },
4085 };
4086
4087 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4088}
4089
4090void rt6_disable_ip(struct net_device *dev, unsigned long event)
4091{
4092 rt6_sync_down_dev(dev, event);
4093 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4094 neigh_ifdown(&nd_tbl, dev);
4095}
4096
4097struct rt6_mtu_change_arg {
4098 struct net_device *dev;
4099 unsigned int mtu;
4100};
4101
4102static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4103{
4104 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4105 struct inet6_dev *idev;
4106
4107	/* In IPv6, PMTU discovery is not optional,
4108	 * so the RTAX_MTU lock cannot disable it.
4109	 * We still use this lock to block changes
4110	 * caused by addrconf/ndisc.
4111	 */
4112
4113 idev = __in6_dev_get(arg->dev);
4114 if (!idev)
4115 return 0;
4116
4117	/* For an administrative MTU increase there is no way to discover
4118	 * an IPv6 PMTU increase, so the PMTU must be updated here.
4119	 * Since RFC 1981 doesn't cover administrative MTU increases,
4120	 * updating the PMTU on increase is a MUST (e.g. jumbo frames).
4121	 */
4122 if (rt->fib6_nh.nh_dev == arg->dev &&
4123 !fib6_metric_locked(rt, RTAX_MTU)) {
4124 u32 mtu = rt->fib6_pmtu;
4125
4126 if (mtu >= arg->mtu ||
4127 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4128 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4129
4130 spin_lock_bh(&rt6_exception_lock);
4131 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4132 spin_unlock_bh(&rt6_exception_lock);
4133 }
4134 return 0;
4135}
4136
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}

static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};

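/* Translate an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * fib6_config.  Attribute validation is driven by rtm_ipv6_policy
 * above; on success the caller decides between the single-nexthop and
 * RTA_MULTIPATH code paths based on cfg->fc_mp.
 */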
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}

struct rt6_nh {
	struct fib6_info *fib6_info;
	struct fib6_config r_cfg;
	struct list_head next;
};

static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
{
	struct rt6_nh *nh;

	list_for_each_entry(nh, rt6_nh_list, next) {
		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
			&nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
			nh->r_cfg.fc_ifindex);
	}
}

static int ip6_route_info_append(struct net *net,
				 struct list_head *rt6_nh_list,
				 struct fib6_info *rt,
				 struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if fib6_info already exists */
		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->fib6_info = rt;
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}

static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}

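/* Insert one fib6_info per rtnexthop found in RTA_MULTIPATH, linking
 * them as siblings.  A userspace request that ends up here would look
 * roughly like this (illustrative iproute2 invocation, addresses made
 * up):
 *
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 weight 1 \
 *		nexthop via fe80::2 dev eth1 weight 2
 *
 * Each "nexthop" block becomes one rtnexthop; note that nh_weight
 * below is stored as rtnh_hops + 1.
 */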
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL */
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each nexthop is added like a single route, clear
		 * these flags after the first one: on a collision we have
		 * already failed at the first nexthop (fib6_add_rt2node()
		 * rejected it), and when replacing, the old nexthops have
		 * been replaced by the first new one, so the rest should
		 * simply be appended to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}

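/* Mirror image of ip6_route_multipath_add() for deletion: walk the
 * rtnexthop list and delete each nexthop as an individual route,
 * returning the last error seen so that one bad nexthop does not stop
 * the remaining deletions.
 */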
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);
	else {
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
}

static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);
	else
		return ip6_route_add(&cfg, GFP_KERNEL, extack);
}

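/* Upper bound on the netlink message size needed by rt6_fill_node(),
 * used to size the skb allocated in inet6_rt_notify().  This is a
 * worst-case estimate (every optional attribute is counted), so a fill
 * failure with -EMSGSIZE indicates a bug in the accounting here.
 */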
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}

static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/* add multipath next hop */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

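/* Fill one rtmsg for a route.  The data can come from two places: a
 * FIB entry (rt) alone, or a cached/cloned dst entry together with its
 * rt6_info (dst != NULL), in which case the dst's keys, flags and
 * metrics take precedence over the FIB entry's.
 */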
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;

		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;

		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

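/* Dump callback invoked for each FIB entry when userspace requests a
 * route dump.  A positive return value tells the walker the route was
 * intentionally skipped (e.g. the RTM_F_PREFIX filter below), while
 * rt6_fill_node()'s return value is propagated otherwise.
 */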
int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
}

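/* RTM_GETROUTE doit handler: resolve a single route for the flow
 * described by the attributes and answer with one RTM_NEWROUTE.  An
 * illustrative trigger from userspace (address made up) would be:
 *
 *	ip -6 route get 2001:db8::1
 *	ip -6 route get fibmatch 2001:db8::1
 *
 * where "fibmatch" sets RTM_F_FIB_MATCH, asking for the matching FIB
 * entry rather than the resolved dst.
 */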
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}

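/* Notify listeners on RTNLGRP_IPV6_ROUTE about a route change.  The
 * skb is sized via rt6_nlmsg_size(), so a fill failure here is treated
 * as a sizing bug rather than a transient condition.
 */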
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

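/* Device notifier: the special null/prohibit/blackhole routes have no
 * real device, so point them at the namespace's loopback device when
 * it registers and drop the references again on unregister.
 */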
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS
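/* /proc/net/rt6_stats: seven hex fields, in order: fib_nodes,
 * fib_route_nodes, fib_rt_alloc, fib_rt_entries, fib_rt_cache,
 * allocated dst entries, fib_discarded_routes.
 */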
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL

static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

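/* Template for the per-namespace sysctls; assuming the standard
 * addrconf registration path, these appear under
 * /proc/sys/net/ipv6/route/.  The write-only "flush" entry triggers
 * garbage collection via fib6_run_gc() above, e.g.:
 *
 *	echo 1 > /proc/sys/net/ipv6/route/flush
 */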
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{ }
};

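/* Clone the template for a namespace and repoint .data at the
 * per-netns fields.  The table[N] indices below must match the entry
 * order of ipv6_route_table_template above.
 */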
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif

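/* Per-namespace setup of the special routes and sysctl defaults.  On
 * failure the out_* labels unwind in reverse order of allocation, so
 * new allocations must add a matching label at the right position.
 */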
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}

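/* Module init: ordering matters, and each failure label undoes
 * everything registered before it.  Note that all three
 * rtnl_register_module() failures share out_register_late_subsys,
 * which is safe because rtnl_unregister_all(PF_INET6) removes
 * whichever handlers did get registered.
 */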
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}