Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 8c1abcb..a8d3141 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1,17 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* TUN - Universal TUN/TAP device driver.
* Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
* $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $
*/
@@ -113,7 +104,6 @@
} while (0)
#endif
-#define TUN_HEADROOM 256
#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
/* TUN device flags */
@@ -170,7 +160,6 @@
struct tun_file {
struct sock sk;
struct socket socket;
- struct socket_wq wq;
struct tun_struct __rcu *tun;
struct fasync_struct *fasync;
/* only used for fasnyc */
@@ -189,6 +178,11 @@
struct xdp_rxq_info xdp_rxq;
};
+struct tun_page {
+ struct page *page;
+ int count;
+};
+
struct tun_flow_entry {
struct hlist_node hash_link;
struct rcu_head rcu;
@@ -197,7 +191,7 @@
u32 rxhash;
u32 rps_rxhash;
int queue_index;
- unsigned long updated;
+ unsigned long updated ____cacheline_aligned_in_smp;
};
#define TUN_NUM_FLOW_ENTRIES 1024
@@ -525,18 +519,17 @@
unsigned long delay = tun->ageing_time;
u16 queue_index = tfile->queue_index;
- if (!rxhash)
- return;
- else
- head = &tun->flows[tun_hashfn(rxhash)];
+ head = &tun->flows[tun_hashfn(rxhash)];
rcu_read_lock();
e = tun_flow_find(head, rxhash);
if (likely(e)) {
/* TODO: keep queueing to old queue until it's empty? */
- e->queue_index = queue_index;
- e->updated = jiffies;
+ if (READ_ONCE(e->queue_index) != queue_index)
+ WRITE_ONCE(e->queue_index, queue_index);
+ if (e->updated != jiffies)
+ e->updated = jiffies;
sock_rps_record_flow_hash(e->rps_rxhash);
} else {
spin_lock_bh(&tun->lock);
@@ -563,12 +556,11 @@
e->rps_rxhash = hash;
}
-/* We try to identify a flow through its rxhash first. The reason that
+/* We try to identify a flow through its rxhash. The reason that
* we do not check rxq no. is because some cards(e.g 82599), chooses
* the rxq based on the txq where the last packet of the flow comes. As
* the userspace application move between processors, we may get a
- * different rxq no. here. If we could not get rxhash, then we would
- * hope the rxq no. may help here.
+ * different rxq no. here.
*/
static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
@@ -579,18 +571,13 @@
numqueues = READ_ONCE(tun->numqueues);
txq = __skb_get_hash_symmetric(skb);
- if (txq) {
- e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
- if (e) {
- tun_flow_save_rps_rxhash(e, txq);
- txq = e->queue_index;
- } else
- /* use multiply and shift instead of expensive divide */
- txq = ((u64)txq * numqueues) >> 32;
- } else if (likely(skb_rx_queue_recorded(skb))) {
- txq = skb_get_rx_queue(skb);
- while (unlikely(txq >= numqueues))
- txq -= numqueues;
+ e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
+ if (e) {
+ tun_flow_save_rps_rxhash(e, txq);
+ txq = e->queue_index;
+ } else {
+ /* use multiply and shift instead of expensive divide */
+ txq = ((u64)txq * numqueues) >> 32;
}
return txq;
@@ -599,18 +586,22 @@
static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
struct tun_prog *prog;
+ u32 numqueues;
u16 ret = 0;
+ numqueues = READ_ONCE(tun->numqueues);
+ if (!numqueues)
+ return 0;
+
prog = rcu_dereference(tun->steering_prog);
if (prog)
ret = bpf_prog_run_clear_cb(prog->prog, skb);
- return ret % tun->numqueues;
+ return ret % numqueues;
}
static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
- struct net_device *sb_dev,
- select_queue_fallback_t fallback)
+ struct net_device *sb_dev)
{
struct tun_struct *tun = netdev_priv(dev);
u16 ret;
@@ -703,6 +694,8 @@
tun->tfiles[tun->numqueues - 1]);
ntfile = rtnl_dereference(tun->tfiles[index]);
ntfile->queue_index = index;
+ rcu_assign_pointer(tun->tfiles[tun->numqueues - 1],
+ NULL);
--tun->numqueues;
if (clean) {
@@ -794,7 +787,8 @@
}
static int tun_attach(struct tun_struct *tun, struct file *file,
- bool skip_filter, bool napi, bool napi_frags)
+ bool skip_filter, bool napi, bool napi_frags,
+ bool publish_tun)
{
struct tun_file *tfile = file->private_data;
struct net_device *dev = tun->dev;
@@ -859,10 +853,6 @@
err = 0;
}
- rcu_assign_pointer(tfile->tun, tun);
- rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
- tun->numqueues++;
-
if (tfile->detached) {
tun_enable_queue(tfile);
} else {
@@ -870,12 +860,22 @@
tun_napi_init(tun, tfile, napi, napi_frags);
}
- tun_set_real_num_queues(tun);
+ if (rtnl_dereference(tun->xdp_prog))
+ sock_set_flag(&tfile->sk, SOCK_XDP);
/* device is allowed to go away first, so no need to hold extra
* refcnt.
*/
+ /* Publish tfile->tun and tun->tfiles only after we've fully
+ * initialized tfile; otherwise we risk using half-initialized
+ * object.
+ */
+ if (publish_tun)
+ rcu_assign_pointer(tfile->tun, tun);
+ rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
+ tun->numqueues++;
+ tun_set_real_num_queues(tun);
out:
return err;
}
@@ -1015,18 +1015,8 @@
/* Net device open. */
static int tun_net_open(struct net_device *dev)
{
- struct tun_struct *tun = netdev_priv(dev);
- int i;
-
netif_tx_start_all_queues(dev);
- for (i = 0; i < tun->numqueues; i++) {
- struct tun_file *tfile;
-
- tfile = rtnl_dereference(tun->tfiles[i]);
- tfile->socket.sk->sk_write_space(tfile->socket.sk);
- }
-
return 0;
}
@@ -1041,20 +1031,17 @@
static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
{
#ifdef CONFIG_RPS
- if (tun->numqueues == 1 && static_key_false(&rps_needed)) {
+ if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) {
/* Select queue was not called for the skbuff, so we extract the
* RPS hash and save it into the flow_table here.
*/
+ struct tun_flow_entry *e;
__u32 rxhash;
rxhash = __skb_get_hash_symmetric(skb);
- if (rxhash) {
- struct tun_flow_entry *e;
- e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)],
- rxhash);
- if (e)
- tun_flow_save_rps_rxhash(e, rxhash);
- }
+ e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash);
+ if (e)
+ tun_flow_save_rps_rxhash(e, rxhash);
}
#endif
}
@@ -1083,7 +1070,7 @@
tfile = rcu_dereference(tun->tfiles[txq]);
/* Drop packet if interface is not attached */
- if (txq >= tun->numqueues)
+ if (!tfile)
goto drop;
if (!rcu_dereference(tun->steering_prog))
@@ -1117,7 +1104,7 @@
*/
skb_orphan(skb);
- nf_reset(skb);
+ nf_reset_ct(skb);
if (ptr_ring_produce(&tfile->tx_ring, skb))
goto drop;
@@ -1205,13 +1192,29 @@
struct netlink_ext_ack *extack)
{
struct tun_struct *tun = netdev_priv(dev);
+ struct tun_file *tfile;
struct bpf_prog *old_prog;
+ int i;
old_prog = rtnl_dereference(tun->xdp_prog);
rcu_assign_pointer(tun->xdp_prog, prog);
if (old_prog)
bpf_prog_put(old_prog);
+ for (i = 0; i < tun->numqueues; i++) {
+ tfile = rtnl_dereference(tun->tfiles[i]);
+ if (prog)
+ sock_set_flag(&tfile->sk, SOCK_XDP);
+ else
+ sock_reset_flag(&tfile->sk, SOCK_XDP);
+ }
+ list_for_each_entry(tfile, &tun->disabled, next) {
+ if (prog)
+ sock_set_flag(&tfile->sk, SOCK_XDP);
+ else
+ sock_reset_flag(&tfile->sk, SOCK_XDP);
+ }
+
return 0;
}
@@ -1240,6 +1243,21 @@
}
}
+static int tun_net_change_carrier(struct net_device *dev, bool new_carrier)
+{
+ if (new_carrier) {
+ struct tun_struct *tun = netdev_priv(dev);
+
+ if (!tun->numqueues)
+ return -EPERM;
+
+ netif_carrier_on(dev);
+ } else {
+ netif_carrier_off(dev);
+ }
+ return 0;
+}
+
static const struct net_device_ops tun_netdev_ops = {
.ndo_uninit = tun_net_uninit,
.ndo_open = tun_net_open,
@@ -1249,6 +1267,7 @@
.ndo_select_queue = tun_select_queue,
.ndo_set_rx_headroom = tun_set_headroom,
.ndo_get_stats64 = tun_net_get_stats64,
+ .ndo_change_carrier = tun_net_change_carrier,
};
static void __tun_xdp_flush_tfile(struct tun_file *tfile)
@@ -1274,6 +1293,7 @@
rcu_read_lock();
+resample:
numqueues = READ_ONCE(tun->numqueues);
if (!numqueues) {
rcu_read_unlock();
@@ -1282,6 +1302,8 @@
tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
numqueues]);
+ if (unlikely(!tfile))
+ goto resample;
spin_lock(&tfile->tx_ring.producer_lock);
for (i = 0; i < n; i++) {
@@ -1331,6 +1353,7 @@
.ndo_get_stats64 = tun_net_get_stats64,
.ndo_bpf = tun_xdp,
.ndo_xdp_xmit = tun_xdp_xmit,
+ .ndo_change_carrier = tun_net_change_carrier,
};
static void tun_flow_init(struct tun_struct *tun)
@@ -1464,23 +1487,22 @@
skb->truesize += skb->data_len;
for (i = 1; i < it->nr_segs; i++) {
- struct page_frag *pfrag = ¤t->task_frag;
size_t fragsz = it->iov[i].iov_len;
+ struct page *page;
+ void *frag;
if (fragsz == 0 || fragsz > PAGE_SIZE) {
err = -EINVAL;
goto free;
}
-
- if (!skb_page_frag_refill(fragsz, pfrag, GFP_KERNEL)) {
+ frag = netdev_alloc_frag(fragsz);
+ if (!frag) {
err = -ENOMEM;
goto free;
}
-
- skb_fill_page_desc(skb, i - 1, pfrag->page,
- pfrag->offset, fragsz);
- page_ref_inc(pfrag->page);
- pfrag->offset += fragsz;
+ page = virt_to_head_page(frag);
+ skb_fill_page_desc(skb, i - 1, page,
+ frag - page_address(page), fragsz);
}
return skb;
@@ -1579,6 +1601,57 @@
return true;
}
+static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
+ struct page_frag *alloc_frag, char *buf,
+ int buflen, int len, int pad)
+{
+ struct sk_buff *skb = build_skb(buf, buflen);
+
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ skb_reserve(skb, pad);
+ skb_put(skb, len);
+ skb_set_owner_w(skb, tfile->socket.sk);
+
+ get_page(alloc_frag->page);
+ alloc_frag->offset += buflen;
+
+ return skb;
+}
+
+static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
+ struct xdp_buff *xdp, u32 act)
+{
+ int err;
+
+ switch (act) {
+ case XDP_REDIRECT:
+ err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
+ if (err)
+ return err;
+ break;
+ case XDP_TX:
+ err = tun_xdp_tx(tun->dev, xdp);
+ if (err < 0)
+ return err;
+ break;
+ case XDP_PASS:
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ /* fall through */
+ case XDP_ABORTED:
+ trace_xdp_exception(tun->dev, xdp_prog, act);
+ /* fall through */
+ case XDP_DROP:
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
+ break;
+ }
+
+ return act;
+}
+
static struct sk_buff *tun_build_skb(struct tun_struct *tun,
struct tun_file *tfile,
struct iov_iter *from,
@@ -1586,18 +1659,17 @@
int len, int *skb_xdp)
{
struct page_frag *alloc_frag = ¤t->task_frag;
- struct sk_buff *skb;
struct bpf_prog *xdp_prog;
int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- unsigned int delta = 0;
char *buf;
size_t copied;
- int err, pad = TUN_RX_PAD;
+ int pad = TUN_RX_PAD;
+ int err = 0;
rcu_read_lock();
xdp_prog = rcu_dereference(tun->xdp_prog);
if (xdp_prog)
- pad += TUN_HEADROOM;
+ pad += XDP_PACKET_HEADROOM;
buflen += SKB_DATA_ALIGN(len + pad);
rcu_read_unlock();
@@ -1616,17 +1688,19 @@
* of xdp_prog above, this should be rare and for simplicity
* we do XDP on skb in case the headroom is not enough.
*/
- if (hdr->gso_type || !xdp_prog)
+ if (hdr->gso_type || !xdp_prog) {
*skb_xdp = 1;
- else
- *skb_xdp = 0;
+ return __tun_build_skb(tfile, alloc_frag, buf, buflen, len,
+ pad);
+ }
+
+ *skb_xdp = 0;
local_bh_disable();
rcu_read_lock();
xdp_prog = rcu_dereference(tun->xdp_prog);
- if (xdp_prog && !*skb_xdp) {
+ if (xdp_prog) {
struct xdp_buff xdp;
- void *orig_data;
u32 act;
xdp.data_hard_start = buf;
@@ -1634,66 +1708,33 @@
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + len;
xdp.rxq = &tfile->xdp_rxq;
- orig_data = xdp.data;
+
act = bpf_prog_run_xdp(xdp_prog, &xdp);
-
- switch (act) {
- case XDP_REDIRECT:
+ if (act == XDP_REDIRECT || act == XDP_TX) {
get_page(alloc_frag->page);
alloc_frag->offset += buflen;
- err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
- xdp_do_flush_map();
- if (err)
- goto err_redirect;
- rcu_read_unlock();
- local_bh_enable();
- return NULL;
- case XDP_TX:
- get_page(alloc_frag->page);
- alloc_frag->offset += buflen;
- if (tun_xdp_tx(tun->dev, &xdp) < 0)
- goto err_redirect;
- rcu_read_unlock();
- local_bh_enable();
- return NULL;
- case XDP_PASS:
- delta = orig_data - xdp.data;
- len = xdp.data_end - xdp.data;
- break;
- default:
- bpf_warn_invalid_xdp_action(act);
- /* fall through */
- case XDP_ABORTED:
- trace_xdp_exception(tun->dev, xdp_prog, act);
- /* fall through */
- case XDP_DROP:
- goto err_xdp;
}
+ err = tun_xdp_act(tun, xdp_prog, &xdp, act);
+ if (err < 0)
+ goto err_xdp;
+ if (err == XDP_REDIRECT)
+ xdp_do_flush_map();
+ if (err != XDP_PASS)
+ goto out;
+
+ pad = xdp.data - xdp.data_hard_start;
+ len = xdp.data_end - xdp.data;
}
-
- skb = build_skb(buf, buflen);
- if (!skb) {
- rcu_read_unlock();
- local_bh_enable();
- return ERR_PTR(-ENOMEM);
- }
-
- skb_reserve(skb, pad - delta);
- skb_put(skb, len);
- get_page(alloc_frag->page);
- alloc_frag->offset += buflen;
-
rcu_read_unlock();
local_bh_enable();
- return skb;
+ return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad);
-err_redirect:
- put_page(alloc_frag->page);
err_xdp:
+ put_page(alloc_frag->page);
+out:
rcu_read_unlock();
local_bh_enable();
- this_cpu_inc(tun->pcpu_stats->rx_dropped);
return NULL;
}
@@ -1716,9 +1757,6 @@
int skb_xdp = 1;
bool frags = tun_napi_frags_enabled(tfile);
- if (!(tun->dev->flags & IFF_UP))
- return -EIO;
-
if (!(tun->flags & IFF_NO_PI)) {
if (len < sizeof(pi))
return -EINVAL;
@@ -1820,6 +1858,8 @@
err = skb_copy_datagram_from_iter(skb, 0, from, len);
if (err) {
+ err = -EFAULT;
+drop:
this_cpu_inc(tun->pcpu_stats->rx_dropped);
kfree_skb(skb);
if (frags) {
@@ -1827,7 +1867,7 @@
mutex_unlock(&tfile->napi_mutex);
}
- return -EFAULT;
+ return err;
}
}
@@ -1882,7 +1922,7 @@
}
skb_reset_network_header(skb);
- skb_probe_transport_header(skb, 0);
+ skb_probe_transport_header(skb);
if (skb_xdp) {
struct bpf_prog *xdp_prog;
@@ -1911,13 +1951,22 @@
!tfile->detached)
rxhash = __skb_get_hash_symmetric(skb);
+ rcu_read_lock();
+ if (unlikely(!(tun->dev->flags & IFF_UP))) {
+ err = -EIO;
+ rcu_read_unlock();
+ goto drop;
+ }
+
if (frags) {
/* Exercise flow dissector code path. */
- u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
+ u32 headlen = eth_get_headlen(tun->dev, skb->data,
+ skb_headlen(skb));
if (unlikely(headlen > skb_headlen(skb))) {
this_cpu_inc(tun->pcpu_stats->rx_dropped);
napi_free_frags(&tfile->napi);
+ rcu_read_unlock();
mutex_unlock(&tfile->napi_mutex);
WARN_ON(1);
return -ENOMEM;
@@ -1945,6 +1994,7 @@
} else {
netif_rx_ni(skb);
}
+ rcu_read_unlock();
stats = get_cpu_ptr(tun->pcpu_stats);
u64_stats_update_begin(&stats->syncp);
@@ -2119,10 +2169,10 @@
goto out;
}
- add_wait_queue(&tfile->wq.wait, &wait);
- current->state = TASK_INTERRUPTIBLE;
+ add_wait_queue(&tfile->socket.wq.wait, &wait);
while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
ptr = ptr_ring_consume(&tfile->tx_ring);
if (ptr)
break;
@@ -2138,8 +2188,8 @@
schedule();
}
- current->state = TASK_RUNNING;
- remove_wait_queue(&tfile->wq.wait, &wait);
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&tfile->socket.wq.wait, &wait);
out:
*err = error;
@@ -2356,18 +2406,158 @@
kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
}
+static void tun_put_page(struct tun_page *tpage)
+{
+ if (tpage->page)
+ __page_frag_cache_drain(tpage->page, tpage->count);
+}
+
+static int tun_xdp_one(struct tun_struct *tun,
+ struct tun_file *tfile,
+ struct xdp_buff *xdp, int *flush,
+ struct tun_page *tpage)
+{
+ unsigned int datasize = xdp->data_end - xdp->data;
+ struct tun_xdp_hdr *hdr = xdp->data_hard_start;
+ struct virtio_net_hdr *gso = &hdr->gso;
+ struct tun_pcpu_stats *stats;
+ struct bpf_prog *xdp_prog;
+ struct sk_buff *skb = NULL;
+ u32 rxhash = 0, act;
+ int buflen = hdr->buflen;
+ int err = 0;
+ bool skb_xdp = false;
+ struct page *page;
+
+ xdp_prog = rcu_dereference(tun->xdp_prog);
+ if (xdp_prog) {
+ if (gso->gso_type) {
+ skb_xdp = true;
+ goto build;
+ }
+ xdp_set_data_meta_invalid(xdp);
+ xdp->rxq = &tfile->xdp_rxq;
+
+ act = bpf_prog_run_xdp(xdp_prog, xdp);
+ err = tun_xdp_act(tun, xdp_prog, xdp, act);
+ if (err < 0) {
+ put_page(virt_to_head_page(xdp->data));
+ return err;
+ }
+
+ switch (err) {
+ case XDP_REDIRECT:
+ *flush = true;
+ /* fall through */
+ case XDP_TX:
+ return 0;
+ case XDP_PASS:
+ break;
+ default:
+ page = virt_to_head_page(xdp->data);
+ if (tpage->page == page) {
+ ++tpage->count;
+ } else {
+ tun_put_page(tpage);
+ tpage->page = page;
+ tpage->count = 1;
+ }
+ return 0;
+ }
+ }
+
+build:
+ skb = build_skb(xdp->data_hard_start, buflen);
+ if (!skb) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ skb_reserve(skb, xdp->data - xdp->data_hard_start);
+ skb_put(skb, xdp->data_end - xdp->data);
+
+ if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
+ this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
+ kfree_skb(skb);
+ err = -EINVAL;
+ goto out;
+ }
+
+ skb->protocol = eth_type_trans(skb, tun->dev);
+ skb_reset_network_header(skb);
+ skb_probe_transport_header(skb);
+
+ if (skb_xdp) {
+ err = do_xdp_generic(xdp_prog, skb);
+ if (err != XDP_PASS)
+ goto out;
+ }
+
+ if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 &&
+ !tfile->detached)
+ rxhash = __skb_get_hash_symmetric(skb);
+
+ skb_record_rx_queue(skb, tfile->queue_index);
+ netif_receive_skb(skb);
+
+ /* No need for get_cpu_ptr() here since this function is
+ * always called with bh disabled
+ */
+ stats = this_cpu_ptr(tun->pcpu_stats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->rx_packets++;
+ stats->rx_bytes += datasize;
+ u64_stats_update_end(&stats->syncp);
+
+ if (rxhash)
+ tun_flow_update(tun, rxhash, tfile);
+
+out:
+ return err;
+}
+
static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
- int ret;
+ int ret, i;
struct tun_file *tfile = container_of(sock, struct tun_file, socket);
struct tun_struct *tun = tun_get(tfile);
+ struct tun_msg_ctl *ctl = m->msg_control;
+ struct xdp_buff *xdp;
if (!tun)
return -EBADFD;
- ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
+ if (ctl && (ctl->type == TUN_MSG_PTR)) {
+ struct tun_page tpage;
+ int n = ctl->num;
+ int flush = 0;
+
+ memset(&tpage, 0, sizeof(tpage));
+
+ local_bh_disable();
+ rcu_read_lock();
+
+ for (i = 0; i < n; i++) {
+ xdp = &((struct xdp_buff *)ctl->ptr)[i];
+ tun_xdp_one(tun, tfile, xdp, &flush, &tpage);
+ }
+
+ if (flush)
+ xdp_do_flush_map();
+
+ rcu_read_unlock();
+ local_bh_enable();
+
+ tun_put_page(&tpage);
+
+ ret = total_len;
+ goto out;
+ }
+
+ ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
m->msg_flags & MSG_DONTWAIT,
m->msg_flags & MSG_MORE);
+out:
tun_put(tun);
return ret;
}
@@ -2542,7 +2732,7 @@
err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
ifr->ifr_flags & IFF_NAPI,
- ifr->ifr_flags & IFF_NAPI_FRAGS);
+ ifr->ifr_flags & IFF_NAPI_FRAGS, true);
if (err < 0)
return err;
@@ -2641,13 +2831,17 @@
INIT_LIST_HEAD(&tun->disabled);
err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
- ifr->ifr_flags & IFF_NAPI_FRAGS);
+ ifr->ifr_flags & IFF_NAPI_FRAGS, false);
if (err < 0)
goto err_free_flow;
err = register_netdevice(tun->dev);
if (err < 0)
goto err_detach;
+ /* free_netdev() won't check refcnt, to aovid race
+ * with dev_put() we need publish tun after registration.
+ */
+ rcu_assign_pointer(tfile->tun, tun);
}
netif_carrier_on(tun->dev);
@@ -2678,8 +2872,7 @@
return err;
}
-static void tun_get_iff(struct net *net, struct tun_struct *tun,
- struct ifreq *ifr)
+static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr)
{
tun_debug(KERN_INFO, tun, "tun_get_iff\n");
@@ -2791,7 +2984,7 @@
if (ret < 0)
goto unlock;
ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
- tun->flags & IFF_NAPI_FRAGS);
+ tun->flags & IFF_NAPI_FRAGS, true);
} else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
tun = rtnl_dereference(tfile->tun);
if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)
@@ -2836,12 +3029,12 @@
struct net *net = sock_net(&tfile->sk);
struct tun_struct *tun;
void __user* argp = (void __user*)arg;
+ unsigned int ifindex, carrier;
struct ifreq ifr;
kuid_t owner;
kgid_t group;
int sndbuf;
int vnet_hdr_sz;
- unsigned int ifindex;
int le;
int ret;
bool do_notify = false;
@@ -2908,10 +3101,11 @@
tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %u\n", cmd);
+ net = dev_net(tun->dev);
ret = 0;
switch (cmd) {
case TUNGETIFF:
- tun_get_iff(current->nsproxy->net_ns, tun, &ifr);
+ tun_get_iff(tun, &ifr);
if (tfile->detached)
ifr.ifr_flags |= IFF_DETACH_QUEUE;
@@ -3019,7 +3213,7 @@
tun_debug(KERN_DEBUG, tun, "set hw address: %pM\n",
ifr.ifr_hwaddr.sa_data);
- ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr);
+ ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr, NULL);
break;
case TUNGETSNDBUF:
@@ -3125,6 +3319,21 @@
ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
break;
+ case TUNSETCARRIER:
+ ret = -EFAULT;
+ if (copy_from_user(&carrier, argp, sizeof(carrier)))
+ goto unlock;
+
+ ret = tun_net_change_carrier(tun->dev, (bool)carrier);
+ break;
+
+ case TUNGETDEVNETNS:
+ ret = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto unlock;
+ ret = open_related_ns(&net->ns, get_net_ns);
+ break;
+
default:
ret = -EINVAL;
break;
@@ -3214,8 +3423,7 @@
tfile->flags = 0;
tfile->ifindex = 0;
- init_waitqueue_head(&tfile->wq.wait);
- RCU_INIT_POINTER(tfile->socket.wq, &tfile->wq);
+ init_waitqueue_head(&tfile->socket.wq.wait);
tfile->socket.file = file;
tfile->socket.ops = &tun_socket_ops;
@@ -3254,7 +3462,7 @@
rtnl_lock();
tun = tun_get(tfile);
if (tun)
- tun_get_iff(current->nsproxy->net_ns, tun, &ifr);
+ tun_get_iff(tun, &ifr);
rtnl_unlock();
if (tun)
@@ -3423,6 +3631,7 @@
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct tun_struct *tun = netdev_priv(dev);
+ int i;
if (dev->rtnl_link_ops != &tun_link_ops)
return NOTIFY_DONE;
@@ -3432,6 +3641,14 @@
if (tun_queue_resize(tun))
return NOTIFY_BAD;
break;
+ case NETDEV_UP:
+ for (i = 0; i < tun->numqueues; i++) {
+ struct tun_file *tfile;
+
+ tfile = rtnl_dereference(tun->tfiles[i]);
+ tfile->socket.sk->sk_write_space(tfile->socket.sk);
+ }
+ break;
default:
break;
}