// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2009 Red Hat, Inc.
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * virtio-net server in host kernel.
 */

#include <linux/compat.h>
#include <linux/eventfd.h>
#include <linux/vhost.h>
#include <linux/virtio_net.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/sched/clock.h>
#include <linux/sched/signal.h>
#include <linux/vmalloc.h>

#include <linux/net.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/if_tun.h>
#include <linux/if_macvlan.h>
#include <linux/if_tap.h>
#include <linux/if_vlan.h>
#include <linux/skb_array.h>
#include <linux/skbuff.h>

#include <net/sock.h>
#include <net/xdp.h>

#include "vhost.h"

static int experimental_zcopytx = 0;
module_param(experimental_zcopytx, int, 0444);
MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
		 " 1 - Enable; 0 - Disable");

/* Max number of bytes transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others. */
#define VHOST_NET_WEIGHT 0x80000

/* Max number of packets transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others with small
 * pkts.
 */
#define VHOST_NET_PKT_WEIGHT 256

/* MAX number of TX used buffers for outstanding zerocopy */
#define VHOST_MAX_PEND 128
#define VHOST_GOODCOPY_LEN 256

/*
 * For transmit, used buffer len is unused; we override it to track buffer
 * status internally; used for zerocopy tx only.
 */
/* Lower device DMA failed */
#define VHOST_DMA_FAILED_LEN	((__force __virtio32)3)
/* Lower device DMA done */
#define VHOST_DMA_DONE_LEN	((__force __virtio32)2)
/* Lower device DMA in progress */
#define VHOST_DMA_IN_PROGRESS	((__force __virtio32)1)
/* Buffer unused */
#define VHOST_DMA_CLEAR_LEN	((__force __virtio32)0)

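/* A zerocopy buffer is considered completed once its len has been set to
 * either DONE or FAILED; the unsigned comparison below covers both, since
 * FAILED (3) and DONE (2) are the only values >= DONE.
 */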
#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN)

enum {
	VHOST_NET_FEATURES = VHOST_FEATURES |
			     (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
			     (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
			     (1ULL << VIRTIO_F_IOMMU_PLATFORM)
};

enum {
	VHOST_NET_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)
};

enum {
	VHOST_NET_VQ_RX = 0,
	VHOST_NET_VQ_TX = 1,
	VHOST_NET_VQ_MAX = 2,
};

struct vhost_net_ubuf_ref {
	/* refcount follows semantics similar to kref:
	 *  0: object is released
	 *  1: no outstanding ubufs
	 * >1: outstanding ubufs
	 */
	atomic_t refcount;
	wait_queue_head_t wait;
	struct vhost_virtqueue *vq;
};

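/* Per-virtqueue batch size: RX consumes up to this many pointers from the
 * tap/tun ptr_ring at a time, and TX defers signalling used descriptors
 * until this many heads have accumulated.
 */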
#define VHOST_NET_BATCH 64
struct vhost_net_buf {
	void **queue;
	int tail;
	int head;
};

struct vhost_net_virtqueue {
	struct vhost_virtqueue vq;
	size_t vhost_hlen;
	size_t sock_hlen;
	/* vhost zerocopy support fields below: */
	/* last used idx for outstanding DMA zerocopy buffers */
	int upend_idx;
	/* For TX, first used idx for DMA done zerocopy buffers
	 * For RX, number of batched heads
	 */
	int done_idx;
	/* Number of XDP frames batched */
	int batched_xdp;
	/* an array of userspace buffers info */
	struct ubuf_info *ubuf_info;
	/* Reference counting for outstanding ubufs.
	 * Protected by vq mutex. Writers must also take device mutex. */
	struct vhost_net_ubuf_ref *ubufs;
	struct ptr_ring *rx_ring;
	struct vhost_net_buf rxq;
	/* Batched XDP buffs */
	struct xdp_buff *xdp;
};

struct vhost_net {
	struct vhost_dev dev;
	struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];
	struct vhost_poll poll[VHOST_NET_VQ_MAX];
	/* Number of TX recently submitted.
	 * Protected by tx vq lock. */
	unsigned tx_packets;
	/* Number of times zerocopy TX recently failed.
	 * Protected by tx vq lock. */
	unsigned tx_zcopy_err;
	/* Flush in progress. Protected by tx vq lock. */
	bool tx_flush;
	/* Private page frag */
	struct page_frag page_frag;
	/* Refcount bias of page frag */
	int refcnt_bias;
};

static unsigned vhost_net_zcopy_mask __read_mostly;

static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq)
{
	if (rxq->tail != rxq->head)
		return rxq->queue[rxq->head];
	else
		return NULL;
}

static int vhost_net_buf_get_size(struct vhost_net_buf *rxq)
{
	return rxq->tail - rxq->head;
}

static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
{
	return rxq->tail == rxq->head;
}

static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
{
	void *ret = vhost_net_buf_get_ptr(rxq);
	++rxq->head;
	return ret;
}

static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
{
	struct vhost_net_buf *rxq = &nvq->rxq;

	rxq->head = 0;
	rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
					     VHOST_NET_BATCH);
	return rxq->tail;
}

static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
{
	struct vhost_net_buf *rxq = &nvq->rxq;

	if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) {
		ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head,
				   vhost_net_buf_get_size(rxq),
				   tun_ptr_free);
		rxq->head = rxq->tail = 0;
	}
}

static int vhost_net_buf_peek_len(void *ptr)
{
	if (tun_is_xdp_frame(ptr)) {
		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);

		return xdpf->len;
	}

	return __skb_array_len_with_tag(ptr);
}

static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
{
	struct vhost_net_buf *rxq = &nvq->rxq;

	if (!vhost_net_buf_is_empty(rxq))
		goto out;

	if (!vhost_net_buf_produce(nvq))
		return 0;

out:
	return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq));
}

static void vhost_net_buf_init(struct vhost_net_buf *rxq)
{
	rxq->head = rxq->tail = 0;
}

static void vhost_net_enable_zcopy(int vq)
{
	vhost_net_zcopy_mask |= 0x1 << vq;
}

static struct vhost_net_ubuf_ref *
vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
{
	struct vhost_net_ubuf_ref *ubufs;
	/* No zero copy backend? Nothing to count. */
	if (!zcopy)
		return NULL;
	ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
	if (!ubufs)
		return ERR_PTR(-ENOMEM);
	atomic_set(&ubufs->refcount, 1);
	init_waitqueue_head(&ubufs->wait);
	ubufs->vq = vq;
	return ubufs;
}

static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
{
	int r = atomic_sub_return(1, &ubufs->refcount);
	if (unlikely(!r))
		wake_up(&ubufs->wait);
	return r;
}

static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
{
	vhost_net_ubuf_put(ubufs);
	wait_event(ubufs->wait, !atomic_read(&ubufs->refcount));
}

static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
{
	vhost_net_ubuf_put_and_wait(ubufs);
	kfree(ubufs);
}

static void vhost_net_clear_ubuf_info(struct vhost_net *n)
{
	int i;

	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		kfree(n->vqs[i].ubuf_info);
		n->vqs[i].ubuf_info = NULL;
	}
}

static int vhost_net_set_ubuf_info(struct vhost_net *n)
{
	bool zcopy;
	int i;

	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		zcopy = vhost_net_zcopy_mask & (0x1 << i);
		if (!zcopy)
			continue;
		n->vqs[i].ubuf_info =
			kmalloc_array(UIO_MAXIOV,
				      sizeof(*n->vqs[i].ubuf_info),
				      GFP_KERNEL);
		if (!n->vqs[i].ubuf_info)
			goto err;
	}
	return 0;

err:
	vhost_net_clear_ubuf_info(n);
	return -ENOMEM;
}

static void vhost_net_vq_reset(struct vhost_net *n)
{
	int i;

	vhost_net_clear_ubuf_info(n);

	for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
		n->vqs[i].done_idx = 0;
		n->vqs[i].upend_idx = 0;
		n->vqs[i].ubufs = NULL;
		n->vqs[i].vhost_hlen = 0;
		n->vqs[i].sock_hlen = 0;
		vhost_net_buf_init(&n->vqs[i].rxq);
	}

}

static void vhost_net_tx_packet(struct vhost_net *net)
{
	++net->tx_packets;
	if (net->tx_packets < 1024)
		return;
	net->tx_packets = 0;
	net->tx_zcopy_err = 0;
}

static void vhost_net_tx_err(struct vhost_net *net)
{
	++net->tx_zcopy_err;
}

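/* Heuristic: only keep using zerocopy while it is actually working, i.e.
 * while fewer than roughly 1 in 64 of the recently submitted packets hit a
 * zerocopy completion error (both counters are reset every 1024 packets
 * in vhost_net_tx_packet() above).
 */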
static bool vhost_net_tx_select_zcopy(struct vhost_net *net)
{
	/* TX flush waits for outstanding DMAs to be done.
	 * Don't start new DMAs.
	 */
	return !net->tx_flush &&
		net->tx_packets / 64 >= net->tx_zcopy_err;
}

static bool vhost_sock_zcopy(struct socket *sock)
{
	return unlikely(experimental_zcopytx) &&
		sock_flag(sock->sk, SOCK_ZEROCOPY);
}

static bool vhost_sock_xdp(struct socket *sock)
{
	return sock_flag(sock->sk, SOCK_XDP);
}

/* In case of DMA done not in order in lower device driver for some reason.
 * upend_idx is used to track end of used idx, done_idx is used to track head
 * of used idx. Once lower device DMA done contiguously, we will signal KVM
 * guest used idx.
 */
static void vhost_zerocopy_signal_used(struct vhost_net *net,
				       struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	int i, add;
	int j = 0;

	for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
		if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
			vhost_net_tx_err(net);
		if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
			++j;
		} else
			break;
	}
	while (j) {
		add = min(UIO_MAXIOV - nvq->done_idx, j);
		vhost_add_used_and_signal_n(vq->dev, vq,
					    &vq->heads[nvq->done_idx], add);
		nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
		j -= add;
	}
}

static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
{
	struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
	struct vhost_virtqueue *vq = ubufs->vq;
	int cnt;

	rcu_read_lock_bh();

	/* set len to mark this desc buffers done DMA */
	vq->heads[ubuf->desc].len = success ?
		VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
	cnt = vhost_net_ubuf_put(ubufs);

	/*
	 * Trigger polling thread if guest stopped submitting new buffers:
	 * in this case, the refcount after decrement will eventually reach 1.
	 * We also trigger polling periodically after each 16 packets
	 * (the value 16 here is more or less arbitrary, it's tuned to trigger
	 * less than 10% of times).
	 */
	if (cnt <= 1 || !(cnt % 16))
		vhost_poll_queue(&vq->poll);

	rcu_read_unlock_bh();
}

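/* local_clock() returns nanoseconds; shifting right by 10 gives a cheap
 * roughly-microsecond tick used for the busy-poll timeout comparisons.
 */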
static inline unsigned long busy_clock(void)
{
	return local_clock() >> 10;
}

static bool vhost_can_busy_poll(unsigned long endtime)
{
	return likely(!need_resched() && !time_after(busy_clock(), endtime) &&
		      !signal_pending(current));
}

static void vhost_net_disable_vq(struct vhost_net *n,
				 struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
	if (!vq->private_data)
		return;
	vhost_poll_stop(poll);
}

static int vhost_net_enable_vq(struct vhost_net *n,
			       struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
	struct socket *sock;

	sock = vq->private_data;
	if (!sock)
		return 0;

	return vhost_poll_start(poll, sock->file);
}

static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
{
	struct vhost_virtqueue *vq = &nvq->vq;
	struct vhost_dev *dev = vq->dev;

	if (!nvq->done_idx)
		return;

	vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
	nvq->done_idx = 0;
}

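/* Submit all XDP frames batched so far with a single sendmsg() carrying a
 * TUN_MSG_PTR control block, then signal the used ring for the whole batch.
 */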
static void vhost_tx_batch(struct vhost_net *net,
			   struct vhost_net_virtqueue *nvq,
			   struct socket *sock,
			   struct msghdr *msghdr)
{
	struct tun_msg_ctl ctl = {
		.type = TUN_MSG_PTR,
		.num = nvq->batched_xdp,
		.ptr = nvq->xdp,
	};
	int i, err;

	if (nvq->batched_xdp == 0)
		goto signal_used;

	msghdr->msg_control = &ctl;
	err = sock->ops->sendmsg(sock, msghdr, 0);
	if (unlikely(err < 0)) {
		vq_err(&nvq->vq, "Fail to batch sending packets\n");

		/* free pages owned by XDP; since this is an unlikely error path,
		 * keep it simple and avoid more complex bulk update for the
		 * used pages
		 */
		for (i = 0; i < nvq->batched_xdp; ++i)
			put_page(virt_to_head_page(nvq->xdp[i].data));
		nvq->batched_xdp = 0;
		nvq->done_idx = 0;
		return;
	}

signal_used:
	vhost_net_signal_used(nvq);
	nvq->batched_xdp = 0;
}

static int sock_has_rx_data(struct socket *sock)
{
	if (unlikely(!sock))
		return 0;

	if (sock->ops->peek_len)
		return sock->ops->peek_len(sock);

	return skb_queue_empty(&sock->sk->sk_receive_queue);
}

static void vhost_net_busy_poll_try_queue(struct vhost_net *net,
					  struct vhost_virtqueue *vq)
{
	if (!vhost_vq_avail_empty(&net->dev, vq)) {
		vhost_poll_queue(&vq->poll);
	} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
		vhost_disable_notify(&net->dev, vq);
		vhost_poll_queue(&vq->poll);
	}
}

static void vhost_net_busy_poll(struct vhost_net *net,
				struct vhost_virtqueue *rvq,
				struct vhost_virtqueue *tvq,
				bool *busyloop_intr,
				bool poll_rx)
{
	unsigned long busyloop_timeout;
	unsigned long endtime;
	struct socket *sock;
	struct vhost_virtqueue *vq = poll_rx ? tvq : rvq;

	/* Try to hold the vq mutex of the paired virtqueue. We can't
	 * use mutex_lock() here since we could not guarantee a
	 * consistent lock ordering.
	 */
	if (!mutex_trylock(&vq->mutex))
		return;

	vhost_disable_notify(&net->dev, vq);
	sock = rvq->private_data;

	busyloop_timeout = poll_rx ? rvq->busyloop_timeout :
				     tvq->busyloop_timeout;

	preempt_disable();
	endtime = busy_clock() + busyloop_timeout;

	while (vhost_can_busy_poll(endtime)) {
		if (vhost_has_work(&net->dev)) {
			*busyloop_intr = true;
			break;
		}

		if ((sock_has_rx_data(sock) &&
		     !vhost_vq_avail_empty(&net->dev, rvq)) ||
		    !vhost_vq_avail_empty(&net->dev, tvq))
			break;

		cpu_relax();
	}

	preempt_enable();

	if (poll_rx || sock_has_rx_data(sock))
		vhost_net_busy_poll_try_queue(net, vq);
	else if (!poll_rx) /* On tx here, sock has no rx data. */
		vhost_enable_notify(&net->dev, rvq);

	mutex_unlock(&vq->mutex);
}

static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
				    struct vhost_net_virtqueue *tnvq,
				    unsigned int *out_num, unsigned int *in_num,
				    struct msghdr *msghdr, bool *busyloop_intr)
{
	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
	struct vhost_virtqueue *rvq = &rnvq->vq;
	struct vhost_virtqueue *tvq = &tnvq->vq;

	int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
				  out_num, in_num, NULL, NULL);

	if (r == tvq->num && tvq->busyloop_timeout) {
		/* Flush batched packets first */
		if (!vhost_sock_zcopy(tvq->private_data))
			vhost_tx_batch(net, tnvq, tvq->private_data, msghdr);

		vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);

		r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
				      out_num, in_num, NULL, NULL);
	}

	return r;
}

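/* True when the number of zerocopy buffers still in flight (the distance
 * from done_idx to upend_idx, modulo UIO_MAXIOV) exceeds the pending limit.
 */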
static bool vhost_exceeds_maxpend(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;

	return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV >
	       min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
}

static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
			    size_t hdr_size, int out)
{
	/* Skip header. TODO: support TSO. */
	size_t len = iov_length(vq->iov, out);

	iov_iter_init(iter, WRITE, vq->iov, out, len);
	iov_iter_advance(iter, hdr_size);

	return iov_iter_count(iter);
}

static int get_tx_bufs(struct vhost_net *net,
		       struct vhost_net_virtqueue *nvq,
		       struct msghdr *msg,
		       unsigned int *out, unsigned int *in,
		       size_t *len, bool *busyloop_intr)
{
	struct vhost_virtqueue *vq = &nvq->vq;
	int ret;

	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);

	if (ret < 0 || ret == vq->num)
		return ret;

	if (*in) {
		vq_err(vq, "Unexpected descriptor format for TX: out %d, in %d\n",
		       *out, *in);
		return -EFAULT;
	}

	/* Sanity check */
	*len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out);
	if (*len == 0) {
		vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n",
		       *len, nvq->vhost_hlen);
		return -EFAULT;
	}

	return ret;
}

static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len)
{
	return total_len < VHOST_NET_WEIGHT &&
	       !vhost_vq_avail_empty(vq->dev, vq);
}

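/* Allocate page-frag pages in 32KB chunks when possible (order 3 with 4KB
 * pages), falling back to single pages if the higher-order allocation fails.
 */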
#define SKB_FRAG_PAGE_ORDER	get_order(32768)

static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
				       struct page_frag *pfrag, gfp_t gfp)
{
	if (pfrag->page) {
		if (pfrag->offset + sz <= pfrag->size)
			return true;
		__page_frag_cache_drain(pfrag->page, net->refcnt_bias);
	}

	pfrag->offset = 0;
	net->refcnt_bias = 0;
	if (SKB_FRAG_PAGE_ORDER) {
		/* Avoid direct reclaim but allow kswapd to wake */
		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
					  __GFP_COMP | __GFP_NOWARN |
					  __GFP_NORETRY,
					  SKB_FRAG_PAGE_ORDER);
		if (likely(pfrag->page)) {
			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
			goto done;
		}
	}
	pfrag->page = alloc_page(gfp);
	if (likely(pfrag->page)) {
		pfrag->size = PAGE_SIZE;
		goto done;
	}
	return false;

done:
	net->refcnt_bias = USHRT_MAX;
	page_ref_add(pfrag->page, USHRT_MAX - 1);
	return true;
}

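/* Headroom reserved in front of each XDP buffer so that the lower device can
 * later build an skb around it without copying (NET_SKB_PAD + NET_IP_ALIGN,
 * plus XDP_PACKET_HEADROOM when the socket has an XDP program attached).
 */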
#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
			       struct iov_iter *from)
{
	struct vhost_virtqueue *vq = &nvq->vq;
	struct vhost_net *net = container_of(vq->dev, struct vhost_net,
					     dev);
	struct socket *sock = vq->private_data;
	struct page_frag *alloc_frag = &net->page_frag;
	struct virtio_net_hdr *gso;
	struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
	struct tun_xdp_hdr *hdr;
	size_t len = iov_iter_count(from);
	int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0;
	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen);
	int sock_hlen = nvq->sock_hlen;
	void *buf;
	int copied;

	if (unlikely(len < nvq->sock_hlen))
		return -EFAULT;

	if (SKB_DATA_ALIGN(len + pad) +
	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
		return -ENOSPC;

	buflen += SKB_DATA_ALIGN(len + pad);
	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
	if (unlikely(!vhost_net_page_frag_refill(net, buflen,
						 alloc_frag, GFP_KERNEL)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	copied = copy_page_from_iter(alloc_frag->page,
				     alloc_frag->offset +
				     offsetof(struct tun_xdp_hdr, gso),
				     sock_hlen, from);
	if (copied != sock_hlen)
		return -EFAULT;

	hdr = buf;
	gso = &hdr->gso;

	if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
	    vhost16_to_cpu(vq, gso->csum_start) +
	    vhost16_to_cpu(vq, gso->csum_offset) + 2 >
	    vhost16_to_cpu(vq, gso->hdr_len)) {
		gso->hdr_len = cpu_to_vhost16(vq,
			       vhost16_to_cpu(vq, gso->csum_start) +
			       vhost16_to_cpu(vq, gso->csum_offset) + 2);

		if (vhost16_to_cpu(vq, gso->hdr_len) > len)
			return -EINVAL;
	}

	len -= sock_hlen;
	copied = copy_page_from_iter(alloc_frag->page,
				     alloc_frag->offset + pad,
				     len, from);
	if (copied != len)
		return -EFAULT;

	xdp->data_hard_start = buf;
	xdp->data = buf + pad;
	xdp->data_end = xdp->data + len;
	hdr->buflen = buflen;

	--net->refcnt_bias;
	alloc_frag->offset += buflen;

	++nvq->batched_xdp;

	return 0;
}

static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned out, in;
	int head;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	size_t len, total_len = 0;
	int err;
	int sent_pkts = 0;
	bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);

	do {
		bool busyloop_intr = false;

		if (nvq->done_idx == VHOST_NET_BATCH)
			vhost_tx_batch(net, nvq, sock, &msg);

		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
				   &busyloop_intr);
		/* On error, stop handling until the next kick. */
		if (unlikely(head < 0))
			break;
		/* Nothing new? Wait for eventfd to tell us they refilled. */
		if (head == vq->num) {
			if (unlikely(busyloop_intr)) {
				vhost_poll_queue(&vq->poll);
			} else if (unlikely(vhost_enable_notify(&net->dev,
								vq))) {
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			break;
		}

		total_len += len;

		/* For simplicity, TX batching is only enabled if
		 * sndbuf is unlimited.
		 */
		if (sock_can_batch) {
			err = vhost_net_build_xdp(nvq, &msg.msg_iter);
			if (!err) {
				goto done;
			} else if (unlikely(err != -ENOSPC)) {
				vhost_tx_batch(net, nvq, sock, &msg);
				vhost_discard_vq_desc(vq, 1);
				vhost_net_enable_vq(net, vq);
				break;
			}

			/* We can't build XDP buff, go for single
			 * packet path but let's flush batched
			 * packets.
			 */
			vhost_tx_batch(net, nvq, sock, &msg);
			msg.msg_control = NULL;
		} else {
			if (tx_can_batch(vq, total_len))
				msg.msg_flags |= MSG_MORE;
			else
				msg.msg_flags &= ~MSG_MORE;
		}

		/* TODO: Check specific error and bomb out unless ENOBUFS? */
		err = sock->ops->sendmsg(sock, &msg, len);
		if (unlikely(err < 0)) {
			vhost_discard_vq_desc(vq, 1);
			vhost_net_enable_vq(net, vq);
			break;
		}
		if (err != len)
			pr_debug("Truncated TX packet: len %d != %zd\n",
				 err, len);
done:
		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
		vq->heads[nvq->done_idx].len = 0;
		++nvq->done_idx;
	} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));

	vhost_tx_batch(net, nvq, sock, &msg);
}

static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned out, in;
	int head;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	struct tun_msg_ctl ctl;
	size_t len, total_len = 0;
	int err;
	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
	struct ubuf_info *ubuf;
	bool zcopy_used;
	int sent_pkts = 0;

	do {
		bool busyloop_intr;

		/* Release DMAs done buffers first */
		vhost_zerocopy_signal_used(net, vq);

		busyloop_intr = false;
		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
				   &busyloop_intr);
		/* On error, stop handling until the next kick. */
		if (unlikely(head < 0))
			break;
		/* Nothing new? Wait for eventfd to tell us they refilled. */
		if (head == vq->num) {
			if (unlikely(busyloop_intr)) {
				vhost_poll_queue(&vq->poll);
			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			break;
		}

		zcopy_used = len >= VHOST_GOODCOPY_LEN
			     && !vhost_exceeds_maxpend(net)
			     && vhost_net_tx_select_zcopy(net);

		/* use msg_control to pass vhost zerocopy ubuf info to skb */
		if (zcopy_used) {
			ubuf = nvq->ubuf_info + nvq->upend_idx;
			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
			ubuf->callback = vhost_zerocopy_callback;
			ubuf->ctx = nvq->ubufs;
			ubuf->desc = nvq->upend_idx;
			refcount_set(&ubuf->refcnt, 1);
			msg.msg_control = &ctl;
			ctl.type = TUN_MSG_UBUF;
			ctl.ptr = ubuf;
			msg.msg_controllen = sizeof(ctl);
			ubufs = nvq->ubufs;
			atomic_inc(&ubufs->refcount);
			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
		} else {
			msg.msg_control = NULL;
			ubufs = NULL;
		}
		total_len += len;
		if (tx_can_batch(vq, total_len) &&
		    likely(!vhost_exceeds_maxpend(net))) {
			msg.msg_flags |= MSG_MORE;
		} else {
			msg.msg_flags &= ~MSG_MORE;
		}

		/* TODO: Check specific error and bomb out unless ENOBUFS? */
		err = sock->ops->sendmsg(sock, &msg, len);
		if (unlikely(err < 0)) {
			if (zcopy_used) {
				if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS)
					vhost_net_ubuf_put(ubufs);
				nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
					% UIO_MAXIOV;
			}
			vhost_discard_vq_desc(vq, 1);
			vhost_net_enable_vq(net, vq);
			break;
		}
		if (err != len)
			pr_debug("Truncated TX packet: "
				 " len %d != %zd\n", err, len);
		if (!zcopy_used)
			vhost_add_used_and_signal(&net->dev, vq, head, 0);
		else
			vhost_zerocopy_signal_used(net, vq);
		vhost_net_tx_packet(net);
	} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
}

/* Expects to be always run from workqueue - which acts as
 * read-side critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	struct socket *sock;

	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
	sock = vq->private_data;
	if (!sock)
		goto out;

	if (!vq_meta_prefetch(vq))
		goto out;

	vhost_disable_notify(&net->dev, vq);
	vhost_net_disable_vq(net, vq);

	if (vhost_sock_zcopy(sock))
		handle_tx_zerocopy(net, sock);
	else
		handle_tx_copy(net, sock);

out:
	mutex_unlock(&vq->mutex);
}

static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
{
	struct sk_buff *head;
	int len = 0;
	unsigned long flags;

	if (rvq->rx_ring)
		return vhost_net_buf_peek(rvq);

	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
	head = skb_peek(&sk->sk_receive_queue);
	if (likely(head)) {
		len = head->len;
		if (skb_vlan_tag_present(head))
			len += VLAN_HLEN;
	}

	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
	return len;
}

static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
				      bool *busyloop_intr)
{
	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
	struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *rvq = &rnvq->vq;
	struct vhost_virtqueue *tvq = &tnvq->vq;
	int len = peek_head_len(rnvq, sk);

	if (!len && rvq->busyloop_timeout) {
		/* Flush batched heads first */
		vhost_net_signal_used(rnvq);
		/* Both tx vq and rx socket were polled here */
		vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);

		len = peek_head_len(rnvq, sk);
	}

	return len;
}

/* This is a multi-buffer version of vhost_get_desc, that works if
 * vq has read descriptors only.
 * @vq - the relevant virtqueue
 * @datalen - data length we'll be reading
 * @iovcount - returned count of io vectors we fill
 * @log - vhost log
 * @log_num - log offset
 * @quota - headcount quota, 1 for big buffer
 * returns number of buffer heads allocated, negative on error
 */
static int get_rx_bufs(struct vhost_virtqueue *vq,
		       struct vring_used_elem *heads,
		       int datalen,
		       unsigned *iovcount,
		       struct vhost_log *log,
		       unsigned *log_num,
		       unsigned int quota)
{
	unsigned int out, in;
	int seg = 0;
	int headcount = 0;
	unsigned d;
	int r, nlogs = 0;
	/* len is always initialized before use since we are always called with
	 * datalen > 0.
	 */
	u32 uninitialized_var(len);

	while (datalen > 0 && headcount < quota) {
		if (unlikely(seg >= UIO_MAXIOV)) {
			r = -ENOBUFS;
			goto err;
		}
		r = vhost_get_vq_desc(vq, vq->iov + seg,
				      ARRAY_SIZE(vq->iov) - seg, &out,
				      &in, log, log_num);
		if (unlikely(r < 0))
			goto err;

		d = r;
		if (d == vq->num) {
			r = 0;
			goto err;
		}
		if (unlikely(out || in <= 0)) {
			vq_err(vq, "unexpected descriptor format for RX: "
				"out %d, in %d\n", out, in);
			r = -EINVAL;
			goto err;
		}
		if (unlikely(log)) {
			nlogs += *log_num;
			log += *log_num;
		}
		heads[headcount].id = cpu_to_vhost32(vq, d);
		len = iov_length(vq->iov + seg, in);
		heads[headcount].len = cpu_to_vhost32(vq, len);
		datalen -= len;
		++headcount;
		seg += in;
	}
	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
	*iovcount = seg;
	if (unlikely(log))
		*log_num = nlogs;

	/* Detect overrun */
	if (unlikely(datalen > 0)) {
		r = UIO_MAXIOV + 1;
		goto err;
	}
	return headcount;
err:
	vhost_discard_vq_desc(vq, headcount);
	return r;
}

/* Expects to be always run from workqueue - which acts as
 * read-side critical section for our kind of RCU. */
static void handle_rx(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned uninitialized_var(in), log;
	struct vhost_log *vq_log;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	struct virtio_net_hdr hdr = {
		.flags = 0,
		.gso_type = VIRTIO_NET_HDR_GSO_NONE
	};
	size_t total_len = 0;
	int err, mergeable;
	s16 headcount;
	size_t vhost_hlen, sock_hlen;
	size_t vhost_len, sock_len;
	bool busyloop_intr = false;
	struct socket *sock;
	struct iov_iter fixup;
	__virtio16 num_buffers;
	int recv_pkts = 0;

	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
	sock = vq->private_data;
	if (!sock)
		goto out;

	if (!vq_meta_prefetch(vq))
		goto out;

	vhost_disable_notify(&net->dev, vq);
	vhost_net_disable_vq(net, vq);

	vhost_hlen = nvq->vhost_hlen;
	sock_hlen = nvq->sock_hlen;

	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
		vq->log : NULL;
	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);

	do {
		sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
						      &busyloop_intr);
		if (!sock_len)
			break;
		sock_len += sock_hlen;
		vhost_len = sock_len + vhost_hlen;
		headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
					vhost_len, &in, vq_log, &log,
					likely(mergeable) ? UIO_MAXIOV : 1);
		/* On error, stop handling until the next kick. */
		if (unlikely(headcount < 0))
			goto out;
		/* OK, now we need to know about added descriptors. */
		if (!headcount) {
			if (unlikely(busyloop_intr)) {
				vhost_poll_queue(&vq->poll);
			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
				/* They have slipped one in as we were
				 * doing that: check again. */
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			/* Nothing new? Wait for eventfd to tell us
			 * they refilled. */
			goto out;
		}
		busyloop_intr = false;
		if (nvq->rx_ring)
			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
		/* On overrun, truncate and discard */
		if (unlikely(headcount > UIO_MAXIOV)) {
			iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
			err = sock->ops->recvmsg(sock, &msg,
						 1, MSG_DONTWAIT | MSG_TRUNC);
			pr_debug("Discarded rx packet: len %zd\n", sock_len);
			continue;
		}
		/* We don't need to be notified again. */
		iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len);
		fixup = msg.msg_iter;
		if (unlikely((vhost_hlen))) {
			/* We will supply the header ourselves
			 * TODO: support TSO.
			 */
			iov_iter_advance(&msg.msg_iter, vhost_hlen);
		}
		err = sock->ops->recvmsg(sock, &msg,
					 sock_len, MSG_DONTWAIT | MSG_TRUNC);
		/* Userspace might have consumed the packet meanwhile:
		 * it's not supposed to do this usually, but might be hard
		 * to prevent. Discard data we got (if any) and keep going. */
		if (unlikely(err != sock_len)) {
			pr_debug("Discarded rx packet: "
				 " len %d, expected %zd\n", err, sock_len);
			vhost_discard_vq_desc(vq, headcount);
			continue;
		}
		/* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
		if (unlikely(vhost_hlen)) {
			if (copy_to_iter(&hdr, sizeof(hdr),
					 &fixup) != sizeof(hdr)) {
				vq_err(vq, "Unable to write vnet_hdr "
				       "at addr %p\n", vq->iov->iov_base);
				goto out;
			}
		} else {
			/* Header came from socket; we'll need to patch
			 * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF
			 */
			iov_iter_advance(&fixup, sizeof(hdr));
		}
		/* TODO: Should check and handle checksum. */

		num_buffers = cpu_to_vhost16(vq, headcount);
		if (likely(mergeable) &&
		    copy_to_iter(&num_buffers, sizeof num_buffers,
				 &fixup) != sizeof num_buffers) {
			vq_err(vq, "Failed num_buffers write");
			vhost_discard_vq_desc(vq, headcount);
			goto out;
		}
		nvq->done_idx += headcount;
		if (nvq->done_idx > VHOST_NET_BATCH)
			vhost_net_signal_used(nvq);
		if (unlikely(vq_log))
			vhost_log_write(vq, vq_log, log, vhost_len,
					vq->iov, in);
		total_len += vhost_len;
	} while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len)));

	if (unlikely(busyloop_intr))
		vhost_poll_queue(&vq->poll);
	else if (!sock_len)
		vhost_net_enable_vq(net, vq);
out:
	vhost_net_signal_used(nvq);
	mutex_unlock(&vq->mutex);
}

static void handle_tx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

	handle_tx(net);
}

static void handle_rx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

	handle_rx(net);
}

static void handle_tx_net(struct vhost_work *work)
{
	struct vhost_net *net = container_of(work, struct vhost_net,
					     poll[VHOST_NET_VQ_TX].work);
	handle_tx(net);
}

static void handle_rx_net(struct vhost_work *work)
{
	struct vhost_net *net = container_of(work, struct vhost_net,
					     poll[VHOST_NET_VQ_RX].work);
	handle_rx(net);
}

static int vhost_net_open(struct inode *inode, struct file *f)
{
	struct vhost_net *n;
	struct vhost_dev *dev;
	struct vhost_virtqueue **vqs;
	void **queue;
	struct xdp_buff *xdp;
	int i;

	n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!n)
		return -ENOMEM;
	vqs = kmalloc_array(VHOST_NET_VQ_MAX, sizeof(*vqs), GFP_KERNEL);
	if (!vqs) {
		kvfree(n);
		return -ENOMEM;
	}

	queue = kmalloc_array(VHOST_NET_BATCH, sizeof(void *),
			      GFP_KERNEL);
	if (!queue) {
		kfree(vqs);
		kvfree(n);
		return -ENOMEM;
	}
	n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue;

	xdp = kmalloc_array(VHOST_NET_BATCH, sizeof(*xdp), GFP_KERNEL);
	if (!xdp) {
		kfree(vqs);
		kvfree(n);
		kfree(queue);
		return -ENOMEM;
	}
	n->vqs[VHOST_NET_VQ_TX].xdp = xdp;

	dev = &n->dev;
	vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
	vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
	n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
	n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
	for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
		n->vqs[i].ubufs = NULL;
		n->vqs[i].ubuf_info = NULL;
		n->vqs[i].upend_idx = 0;
		n->vqs[i].done_idx = 0;
		n->vqs[i].batched_xdp = 0;
		n->vqs[i].vhost_hlen = 0;
		n->vqs[i].sock_hlen = 0;
		n->vqs[i].rx_ring = NULL;
		vhost_net_buf_init(&n->vqs[i].rxq);
	}
	vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
		       UIO_MAXIOV + VHOST_NET_BATCH,
		       VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT);

	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev);
	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);

	f->private_data = n;
	n->page_frag.page = NULL;
	n->refcnt_bias = 0;

	return 0;
}

static struct socket *vhost_net_stop_vq(struct vhost_net *n,
					struct vhost_virtqueue *vq)
{
	struct socket *sock;
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);

	mutex_lock(&vq->mutex);
	sock = vq->private_data;
	vhost_net_disable_vq(n, vq);
	vq->private_data = NULL;
	vhost_net_buf_unproduce(nvq);
	nvq->rx_ring = NULL;
	mutex_unlock(&vq->mutex);
	return sock;
}

static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
			   struct socket **rx_sock)
{
	*tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq);
	*rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq);
}

static void vhost_net_flush_vq(struct vhost_net *n, int index)
{
	vhost_poll_flush(n->poll + index);
	vhost_poll_flush(&n->vqs[index].vq.poll);
}

static void vhost_net_flush(struct vhost_net *n)
{
	vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
	if (n->vqs[VHOST_NET_VQ_TX].ubufs) {
		mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
		n->tx_flush = true;
		mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
		/* Wait for all lower device DMAs done. */
		vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
		mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
		n->tx_flush = false;
		atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1);
		mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
	}
}

static int vhost_net_release(struct inode *inode, struct file *f)
{
	struct vhost_net *n = f->private_data;
	struct socket *tx_sock;
	struct socket *rx_sock;

	vhost_net_stop(n, &tx_sock, &rx_sock);
	vhost_net_flush(n);
	vhost_dev_stop(&n->dev);
	vhost_dev_cleanup(&n->dev);
	vhost_net_vq_reset(n);
	if (tx_sock)
		sockfd_put(tx_sock);
	if (rx_sock)
		sockfd_put(rx_sock);
	/* Make sure no callbacks are outstanding */
	synchronize_rcu();
	/* We do an extra flush before freeing memory,
	 * since jobs can re-queue themselves. */
	vhost_net_flush(n);
	kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
	kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
	kfree(n->dev.vqs);
	if (n->page_frag.page)
		__page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
	kvfree(n);
	return 0;
}

static struct socket *get_raw_socket(int fd)
{
	int r;
| 1427 | struct socket *sock = sockfd_lookup(fd, &r); |
| 1428 | |
| 1429 | if (!sock) |
| 1430 | return ERR_PTR(-ENOTSOCK); |
| 1431 | |
| 1432 | /* Parameter checking */ |
| 1433 | if (sock->sk->sk_type != SOCK_RAW) { |
| 1434 | r = -ESOCKTNOSUPPORT; |
| 1435 | goto err; |
| 1436 | } |
| 1437 | |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame^] | 1438 | if (sock->sk->sk_family != AF_PACKET) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1439 | r = -EPFNOSUPPORT; |
| 1440 | goto err; |
| 1441 | } |
| 1442 | return sock; |
| 1443 | err: |
| 1444 | sockfd_put(sock); |
| 1445 | return ERR_PTR(r); |
| 1446 | } |
| 1447 | |
| 1448 | static struct ptr_ring *get_tap_ptr_ring(int fd) |
| 1449 | { |
| 1450 | struct ptr_ring *ring; |
| 1451 | struct file *file = fget(fd); |
| 1452 | |
| 1453 | if (!file) |
| 1454 | return NULL; |
| 1455 | ring = tun_get_tx_ring(file); |
| 1456 | if (!IS_ERR(ring)) |
| 1457 | goto out; |
| 1458 | ring = tap_get_ptr_ring(file); |
| 1459 | if (!IS_ERR(ring)) |
| 1460 | goto out; |
| 1461 | ring = NULL; |
| 1462 | out: |
| 1463 | fput(file); |
| 1464 | return ring; |
| 1465 | } |
| 1466 | |
| 1467 | static struct socket *get_tap_socket(int fd) |
| 1468 | { |
| 1469 | struct file *file = fget(fd); |
| 1470 | struct socket *sock; |
| 1471 | |
| 1472 | if (!file) |
| 1473 | return ERR_PTR(-EBADF); |
| 1474 | sock = tun_get_socket(file); |
| 1475 | if (!IS_ERR(sock)) |
| 1476 | return sock; |
| 1477 | sock = tap_get_socket(file); |
| 1478 | if (IS_ERR(sock)) |
| 1479 | fput(file); |
| 1480 | return sock; |
| 1481 | } |
| 1482 | |
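|  | /* Map a backend fd to a socket: -1 disables the backend, otherwise accept |
|  |  * either a raw packet socket or a tun/tap socket. |
|  |  */ |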
| 1483 | static struct socket *get_socket(int fd) |
| 1484 | { |
| 1485 | struct socket *sock; |
| 1486 | |
| 1487 | /* special case to disable backend */ |
| 1488 | if (fd == -1) |
| 1489 | return NULL; |
| 1490 | sock = get_raw_socket(fd); |
| 1491 | if (!IS_ERR(sock)) |
| 1492 | return sock; |
| 1493 | sock = get_tap_socket(fd); |
| 1494 | if (!IS_ERR(sock)) |
| 1495 | return sock; |
| 1496 | return ERR_PTR(-ENOTSOCK); |
| 1497 | } |
| 1498 | |
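|  | /* VHOST_NET_SET_BACKEND: attach a new backend socket to one virtqueue, |
|  |  * waiting for zerocopy completions on the old socket and flushing any |
|  |  * work that still references it before dropping it. |
|  |  */ |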
| 1499 | static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) |
| 1500 | { |
| 1501 | struct socket *sock, *oldsock; |
| 1502 | struct vhost_virtqueue *vq; |
| 1503 | struct vhost_net_virtqueue *nvq; |
| 1504 | struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL; |
| 1505 | int r; |
| 1506 | |
| 1507 | mutex_lock(&n->dev.mutex); |
| 1508 | r = vhost_dev_check_owner(&n->dev); |
| 1509 | if (r) |
| 1510 | goto err; |
| 1511 | |
| 1512 | if (index >= VHOST_NET_VQ_MAX) { |
| 1513 | r = -ENOBUFS; |
| 1514 | goto err; |
| 1515 | } |
| 1516 | vq = &n->vqs[index].vq; |
| 1517 | nvq = &n->vqs[index]; |
| 1518 | mutex_lock(&vq->mutex); |
| 1519 | |
| 1520 | 	/* Verify that the ring has been set up correctly. */ |
| 1521 | if (!vhost_vq_access_ok(vq)) { |
| 1522 | r = -EFAULT; |
| 1523 | goto err_vq; |
| 1524 | } |
| 1525 | sock = get_socket(fd); |
| 1526 | if (IS_ERR(sock)) { |
| 1527 | r = PTR_ERR(sock); |
| 1528 | goto err_vq; |
| 1529 | } |
| 1530 | |
| 1531 | /* start polling new socket */ |
| 1532 | oldsock = vq->private_data; |
| 1533 | if (sock != oldsock) { |
| 1534 | ubufs = vhost_net_ubuf_alloc(vq, |
| 1535 | sock && vhost_sock_zcopy(sock)); |
| 1536 | if (IS_ERR(ubufs)) { |
| 1537 | r = PTR_ERR(ubufs); |
| 1538 | goto err_ubufs; |
| 1539 | } |
| 1540 | |
| 1541 | vhost_net_disable_vq(n, vq); |
| 1542 | vq->private_data = sock; |
| 1543 | vhost_net_buf_unproduce(nvq); |
| 1544 | r = vhost_vq_init_access(vq); |
| 1545 | if (r) |
| 1546 | goto err_used; |
| 1547 | r = vhost_net_enable_vq(n, vq); |
| 1548 | if (r) |
| 1549 | goto err_used; |
| 1550 | if (index == VHOST_NET_VQ_RX) |
| 1551 | nvq->rx_ring = get_tap_ptr_ring(fd); |
| 1552 | |
| 1553 | oldubufs = nvq->ubufs; |
| 1554 | nvq->ubufs = ubufs; |
| 1555 | |
| 1556 | n->tx_packets = 0; |
| 1557 | n->tx_zcopy_err = 0; |
| 1558 | n->tx_flush = false; |
| 1559 | } |
| 1560 | |
| 1561 | mutex_unlock(&vq->mutex); |
| 1562 | |
| 1563 | if (oldubufs) { |
| 1564 | vhost_net_ubuf_put_wait_and_free(oldubufs); |
| 1565 | mutex_lock(&vq->mutex); |
| 1566 | vhost_zerocopy_signal_used(n, vq); |
| 1567 | mutex_unlock(&vq->mutex); |
| 1568 | } |
| 1569 | |
| 1570 | if (oldsock) { |
| 1571 | vhost_net_flush_vq(n, index); |
| 1572 | sockfd_put(oldsock); |
| 1573 | } |
| 1574 | |
| 1575 | mutex_unlock(&n->dev.mutex); |
| 1576 | return 0; |
| 1577 | |
| 1578 | err_used: |
| 1579 | vq->private_data = oldsock; |
| 1580 | vhost_net_enable_vq(n, vq); |
| 1581 | if (ubufs) |
| 1582 | vhost_net_ubuf_put_wait_and_free(ubufs); |
| 1583 | err_ubufs: |
| 1584 | if (sock) |
| 1585 | sockfd_put(sock); |
| 1586 | err_vq: |
| 1587 | mutex_unlock(&vq->mutex); |
| 1588 | err: |
| 1589 | mutex_unlock(&n->dev.mutex); |
| 1590 | return r; |
| 1591 | } |
| 1592 | |
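|  | /* VHOST_RESET_OWNER: detach both backend sockets, flush outstanding work |
|  |  * and return the device to its pre-VHOST_SET_OWNER state. |
|  |  */ |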
| 1593 | static long vhost_net_reset_owner(struct vhost_net *n) |
| 1594 | { |
| 1595 | struct socket *tx_sock = NULL; |
| 1596 | struct socket *rx_sock = NULL; |
| 1597 | long err; |
| 1598 | struct vhost_umem *umem; |
| 1599 | |
| 1600 | mutex_lock(&n->dev.mutex); |
| 1601 | err = vhost_dev_check_owner(&n->dev); |
| 1602 | if (err) |
| 1603 | goto done; |
| 1604 | umem = vhost_dev_reset_owner_prepare(); |
| 1605 | if (!umem) { |
| 1606 | err = -ENOMEM; |
| 1607 | goto done; |
| 1608 | } |
| 1609 | vhost_net_stop(n, &tx_sock, &rx_sock); |
| 1610 | vhost_net_flush(n); |
| 1611 | vhost_dev_stop(&n->dev); |
| 1612 | vhost_dev_reset_owner(&n->dev, umem); |
| 1613 | vhost_net_vq_reset(n); |
| 1614 | done: |
| 1615 | mutex_unlock(&n->dev.mutex); |
| 1616 | if (tx_sock) |
| 1617 | sockfd_put(tx_sock); |
| 1618 | if (rx_sock) |
| 1619 | sockfd_put(rx_sock); |
| 1620 | return err; |
| 1621 | } |
| 1622 | |
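|  | /* VHOST_SET_BACKEND_FEATURES: record the acked backend features on every |
|  |  * virtqueue under its mutex. |
|  |  */ |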
| 1623 | static int vhost_net_set_backend_features(struct vhost_net *n, u64 features) |
| 1624 | { |
| 1625 | int i; |
| 1626 | |
| 1627 | mutex_lock(&n->dev.mutex); |
| 1628 | for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { |
| 1629 | mutex_lock(&n->vqs[i].vq.mutex); |
| 1630 | n->vqs[i].vq.acked_backend_features = features; |
| 1631 | mutex_unlock(&n->vqs[i].vq.mutex); |
| 1632 | } |
| 1633 | mutex_unlock(&n->dev.mutex); |
| 1634 | |
| 1635 | return 0; |
| 1636 | } |
| 1637 | |
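|  | /* VHOST_SET_FEATURES: derive the vnet header layout implied by the feature |
|  |  * bits and apply it, together with the acked features, to both virtqueues. |
|  |  */ |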
| 1638 | static int vhost_net_set_features(struct vhost_net *n, u64 features) |
| 1639 | { |
| 1640 | size_t vhost_hlen, sock_hlen, hdr_len; |
| 1641 | int i; |
| 1642 | |
| 1643 | hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | |
| 1644 | (1ULL << VIRTIO_F_VERSION_1))) ? |
| 1645 | sizeof(struct virtio_net_hdr_mrg_rxbuf) : |
| 1646 | sizeof(struct virtio_net_hdr); |
| 1647 | if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) { |
| 1648 | /* vhost provides vnet_hdr */ |
| 1649 | vhost_hlen = hdr_len; |
| 1650 | sock_hlen = 0; |
| 1651 | } else { |
| 1652 | /* socket provides vnet_hdr */ |
| 1653 | vhost_hlen = 0; |
| 1654 | sock_hlen = hdr_len; |
| 1655 | } |
| 1656 | mutex_lock(&n->dev.mutex); |
| 1657 | if ((features & (1 << VHOST_F_LOG_ALL)) && |
| 1658 | !vhost_log_access_ok(&n->dev)) |
| 1659 | goto out_unlock; |
| 1660 | |
| 1661 | if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) { |
| 1662 | if (vhost_init_device_iotlb(&n->dev, true)) |
| 1663 | goto out_unlock; |
| 1664 | } |
| 1665 | |
| 1666 | for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { |
| 1667 | mutex_lock(&n->vqs[i].vq.mutex); |
| 1668 | n->vqs[i].vq.acked_features = features; |
| 1669 | n->vqs[i].vhost_hlen = vhost_hlen; |
| 1670 | n->vqs[i].sock_hlen = sock_hlen; |
| 1671 | mutex_unlock(&n->vqs[i].vq.mutex); |
| 1672 | } |
| 1673 | mutex_unlock(&n->dev.mutex); |
| 1674 | return 0; |
| 1675 | |
| 1676 | out_unlock: |
| 1677 | mutex_unlock(&n->dev.mutex); |
| 1678 | return -EFAULT; |
| 1679 | } |
| 1680 | |
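|  | /* VHOST_SET_OWNER: allocate the zerocopy ubuf bookkeeping and claim |
|  |  * ownership of the device for the calling process. |
|  |  */ |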
| 1681 | static long vhost_net_set_owner(struct vhost_net *n) |
| 1682 | { |
| 1683 | int r; |
| 1684 | |
| 1685 | mutex_lock(&n->dev.mutex); |
| 1686 | if (vhost_dev_has_owner(&n->dev)) { |
| 1687 | r = -EBUSY; |
| 1688 | goto out; |
| 1689 | } |
| 1690 | r = vhost_net_set_ubuf_info(n); |
| 1691 | if (r) |
| 1692 | goto out; |
| 1693 | r = vhost_dev_set_owner(&n->dev); |
| 1694 | if (r) |
| 1695 | vhost_net_clear_ubuf_info(n); |
| 1696 | vhost_net_flush(n); |
| 1697 | out: |
| 1698 | mutex_unlock(&n->dev.mutex); |
| 1699 | return r; |
| 1700 | } |
| 1701 | |
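|  | /* ioctl entry point: handle the net-specific requests here and fall back |
|  |  * to the generic vhost device/vring ioctls for everything else. |
|  |  */ |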
| 1702 | static long vhost_net_ioctl(struct file *f, unsigned int ioctl, |
| 1703 | unsigned long arg) |
| 1704 | { |
| 1705 | struct vhost_net *n = f->private_data; |
| 1706 | void __user *argp = (void __user *)arg; |
| 1707 | u64 __user *featurep = argp; |
| 1708 | struct vhost_vring_file backend; |
| 1709 | u64 features; |
| 1710 | int r; |
| 1711 | |
| 1712 | switch (ioctl) { |
| 1713 | case VHOST_NET_SET_BACKEND: |
| 1714 | if (copy_from_user(&backend, argp, sizeof backend)) |
| 1715 | return -EFAULT; |
| 1716 | return vhost_net_set_backend(n, backend.index, backend.fd); |
| 1717 | case VHOST_GET_FEATURES: |
| 1718 | features = VHOST_NET_FEATURES; |
| 1719 | if (copy_to_user(featurep, &features, sizeof features)) |
| 1720 | return -EFAULT; |
| 1721 | return 0; |
| 1722 | case VHOST_SET_FEATURES: |
| 1723 | if (copy_from_user(&features, featurep, sizeof features)) |
| 1724 | return -EFAULT; |
| 1725 | if (features & ~VHOST_NET_FEATURES) |
| 1726 | return -EOPNOTSUPP; |
| 1727 | return vhost_net_set_features(n, features); |
| 1728 | case VHOST_GET_BACKEND_FEATURES: |
| 1729 | features = VHOST_NET_BACKEND_FEATURES; |
| 1730 | if (copy_to_user(featurep, &features, sizeof(features))) |
| 1731 | return -EFAULT; |
| 1732 | return 0; |
| 1733 | case VHOST_SET_BACKEND_FEATURES: |
| 1734 | if (copy_from_user(&features, featurep, sizeof(features))) |
| 1735 | return -EFAULT; |
| 1736 | if (features & ~VHOST_NET_BACKEND_FEATURES) |
| 1737 | return -EOPNOTSUPP; |
| 1738 | return vhost_net_set_backend_features(n, features); |
| 1739 | case VHOST_RESET_OWNER: |
| 1740 | return vhost_net_reset_owner(n); |
| 1741 | case VHOST_SET_OWNER: |
| 1742 | return vhost_net_set_owner(n); |
| 1743 | default: |
| 1744 | mutex_lock(&n->dev.mutex); |
| 1745 | r = vhost_dev_ioctl(&n->dev, ioctl, argp); |
| 1746 | if (r == -ENOIOCTLCMD) |
| 1747 | r = vhost_vring_ioctl(&n->dev, ioctl, argp); |
| 1748 | else |
| 1749 | vhost_net_flush(n); |
| 1750 | mutex_unlock(&n->dev.mutex); |
| 1751 | return r; |
| 1752 | } |
| 1753 | } |
| 1754 | |
| 1755 | #ifdef CONFIG_COMPAT |
| 1756 | static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl, |
| 1757 | unsigned long arg) |
| 1758 | { |
| 1759 | return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); |
| 1760 | } |
| 1761 | #endif |
| 1762 | |
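|  | /* The read/write/poll file operations forward to the generic vhost |
|  |  * character-device helpers, the channel used for IOTLB messages when the |
|  |  * device IOTLB is enabled via VIRTIO_F_IOMMU_PLATFORM. |
|  |  */ |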
| 1763 | static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) |
| 1764 | { |
| 1765 | struct file *file = iocb->ki_filp; |
| 1766 | struct vhost_net *n = file->private_data; |
| 1767 | struct vhost_dev *dev = &n->dev; |
| 1768 | int noblock = file->f_flags & O_NONBLOCK; |
| 1769 | |
| 1770 | return vhost_chr_read_iter(dev, to, noblock); |
| 1771 | } |
| 1772 | |
| 1773 | static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb, |
| 1774 | struct iov_iter *from) |
| 1775 | { |
| 1776 | struct file *file = iocb->ki_filp; |
| 1777 | struct vhost_net *n = file->private_data; |
| 1778 | struct vhost_dev *dev = &n->dev; |
| 1779 | |
| 1780 | return vhost_chr_write_iter(dev, from); |
| 1781 | } |
| 1782 | |
| 1783 | static __poll_t vhost_net_chr_poll(struct file *file, poll_table *wait) |
| 1784 | { |
| 1785 | struct vhost_net *n = file->private_data; |
| 1786 | struct vhost_dev *dev = &n->dev; |
| 1787 | |
| 1788 | return vhost_chr_poll(file, dev, wait); |
| 1789 | } |
| 1790 | |
| 1791 | static const struct file_operations vhost_net_fops = { |
| 1792 | .owner = THIS_MODULE, |
| 1793 | .release = vhost_net_release, |
| 1794 | .read_iter = vhost_net_chr_read_iter, |
| 1795 | .write_iter = vhost_net_chr_write_iter, |
| 1796 | .poll = vhost_net_chr_poll, |
| 1797 | .unlocked_ioctl = vhost_net_ioctl, |
| 1798 | #ifdef CONFIG_COMPAT |
| 1799 | .compat_ioctl = vhost_net_compat_ioctl, |
| 1800 | #endif |
| 1801 | .open = vhost_net_open, |
| 1802 | .llseek = noop_llseek, |
| 1803 | }; |
| 1804 | |
| 1805 | static struct miscdevice vhost_net_misc = { |
| 1806 | .minor = VHOST_NET_MINOR, |
| 1807 | .name = "vhost-net", |
| 1808 | .fops = &vhost_net_fops, |
| 1809 | }; |
| 1810 | |
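|  | /* Module init: optionally enable experimental zerocopy TX, then register |
|  |  * the vhost-net misc device. |
|  |  */ |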
| 1811 | static int vhost_net_init(void) |
| 1812 | { |
| 1813 | if (experimental_zcopytx) |
| 1814 | vhost_net_enable_zcopy(VHOST_NET_VQ_TX); |
| 1815 | return misc_register(&vhost_net_misc); |
| 1816 | } |
| 1817 | module_init(vhost_net_init); |
| 1818 | |
| 1819 | static void vhost_net_exit(void) |
| 1820 | { |
| 1821 | misc_deregister(&vhost_net_misc); |
| 1822 | } |
| 1823 | module_exit(vhost_net_exit); |
| 1824 | |
| 1825 | MODULE_VERSION("0.0.1"); |
| 1826 | MODULE_LICENSE("GPL v2"); |
| 1827 | MODULE_AUTHOR("Michael S. Tsirkin"); |
| 1828 | MODULE_DESCRIPTION("Host kernel accelerator for virtio net"); |
| 1829 | MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR); |
| 1830 | MODULE_ALIAS("devname:vhost-net"); |