// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2009 Red Hat, Inc.
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * virtio-net server in host kernel.
 */

#include <linux/compat.h>
#include <linux/eventfd.h>
#include <linux/vhost.h>
#include <linux/virtio_net.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/sched/clock.h>
#include <linux/sched/signal.h>
#include <linux/vmalloc.h>

#include <linux/net.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/if_tun.h>
#include <linux/if_macvlan.h>
#include <linux/if_tap.h>
#include <linux/if_vlan.h>
#include <linux/skb_array.h>
#include <linux/skbuff.h>

#include <net/sock.h>
#include <net/xdp.h>

#include "vhost.h"

static int experimental_zcopytx = 0;
module_param(experimental_zcopytx, int, 0444);
MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
		                       " 1 -Enable; 0 - Disable");

/* Max number of bytes transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others. */
#define VHOST_NET_WEIGHT 0x80000

/* Max number of packets transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others with small
 * pkts.
 */
#define VHOST_NET_PKT_WEIGHT 256

/* MAX number of TX used buffers for outstanding zerocopy */
#define VHOST_MAX_PEND 128
#define VHOST_GOODCOPY_LEN 256

/*
 * For transmit, used buffer len is unused; we override it to track buffer
 * status internally; used for zerocopy tx only.
 */
/* Lower device DMA failed */
#define VHOST_DMA_FAILED_LEN	((__force __virtio32)3)
/* Lower device DMA done */
#define VHOST_DMA_DONE_LEN	((__force __virtio32)2)
/* Lower device DMA in progress */
#define VHOST_DMA_IN_PROGRESS	((__force __virtio32)1)
/* Buffer unused */
#define VHOST_DMA_CLEAR_LEN	((__force __virtio32)0)

#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN)

enum {
	VHOST_NET_FEATURES = VHOST_FEATURES |
			 (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
			 (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
			 (1ULL << VIRTIO_F_ACCESS_PLATFORM)
};

enum {
	VHOST_NET_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)
};

enum {
	VHOST_NET_VQ_RX = 0,
	VHOST_NET_VQ_TX = 1,
	VHOST_NET_VQ_MAX = 2,
};

struct vhost_net_ubuf_ref {
	/* refcount follows semantics similar to kref:
	 *  0: object is released
	 *  1: no outstanding ubufs
	 * >1: outstanding ubufs
	 */
	atomic_t refcount;
	wait_queue_head_t wait;
	struct vhost_virtqueue *vq;
};

#define VHOST_NET_BATCH 64
struct vhost_net_buf {
	void **queue;
	int tail;
	int head;
};

struct vhost_net_virtqueue {
	struct vhost_virtqueue vq;
	size_t vhost_hlen;
	size_t sock_hlen;
	/* vhost zerocopy support fields below: */
	/* last used idx for outstanding DMA zerocopy buffers */
	int upend_idx;
	/* For TX, first used idx for DMA done zerocopy buffers
	 * For RX, number of batched heads
	 */
	int done_idx;
	/* Number of XDP frames batched */
	int batched_xdp;
	/* an array of userspace buffers info */
	struct ubuf_info *ubuf_info;
	/* Reference counting for outstanding ubufs.
	 * Protected by vq mutex. Writers must also take device mutex. */
	struct vhost_net_ubuf_ref *ubufs;
	struct ptr_ring *rx_ring;
	struct vhost_net_buf rxq;
	/* Batched XDP buffs */
	struct xdp_buff *xdp;
};

struct vhost_net {
	struct vhost_dev dev;
	struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];
	struct vhost_poll poll[VHOST_NET_VQ_MAX];
	/* Number of TX recently submitted.
	 * Protected by tx vq lock. */
	unsigned tx_packets;
	/* Number of times zerocopy TX recently failed.
	 * Protected by tx vq lock. */
	unsigned tx_zcopy_err;
	/* Flush in progress. Protected by tx vq lock. */
	bool tx_flush;
	/* Private page frag */
	struct page_frag page_frag;
	/* Refcount bias of page frag */
	int refcnt_bias;
};

static unsigned vhost_net_zcopy_mask __read_mostly;

static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq)
{
	if (rxq->tail != rxq->head)
		return rxq->queue[rxq->head];
	else
		return NULL;
}

static int vhost_net_buf_get_size(struct vhost_net_buf *rxq)
{
	return rxq->tail - rxq->head;
}

static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
{
	return rxq->tail == rxq->head;
}

static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
{
	void *ret = vhost_net_buf_get_ptr(rxq);
	++rxq->head;
	return ret;
}

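/* Refill the RX batch queue from the backend's ptr_ring.  Returns the
 * number of pointers pulled in (0 if the ring was empty).
 */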
static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
{
	struct vhost_net_buf *rxq = &nvq->rxq;

	rxq->head = 0;
	rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
					      VHOST_NET_BATCH);
	return rxq->tail;
}

static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
{
	struct vhost_net_buf *rxq = &nvq->rxq;

	if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) {
		ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head,
				   vhost_net_buf_get_size(rxq),
				   tun_ptr_free);
		rxq->head = rxq->tail = 0;
	}
}

static int vhost_net_buf_peek_len(void *ptr)
{
	if (tun_is_xdp_frame(ptr)) {
		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);

		return xdpf->len;
	}

	return __skb_array_len_with_tag(ptr);
}

static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
{
	struct vhost_net_buf *rxq = &nvq->rxq;

	if (!vhost_net_buf_is_empty(rxq))
		goto out;

	if (!vhost_net_buf_produce(nvq))
		return 0;

out:
	return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq));
}

static void vhost_net_buf_init(struct vhost_net_buf *rxq)
{
	rxq->head = rxq->tail = 0;
}

static void vhost_net_enable_zcopy(int vq)
{
	vhost_net_zcopy_mask |= 0x1 << vq;
}

static struct vhost_net_ubuf_ref *
vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
{
	struct vhost_net_ubuf_ref *ubufs;
	/* No zero copy backend? Nothing to count. */
	if (!zcopy)
		return NULL;
	ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
	if (!ubufs)
		return ERR_PTR(-ENOMEM);
	atomic_set(&ubufs->refcount, 1);
	init_waitqueue_head(&ubufs->wait);
	ubufs->vq = vq;
	return ubufs;
}

static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
{
	int r = atomic_sub_return(1, &ubufs->refcount);
	if (unlikely(!r))
		wake_up(&ubufs->wait);
	return r;
}

static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
{
	vhost_net_ubuf_put(ubufs);
	wait_event(ubufs->wait, !atomic_read(&ubufs->refcount));
}

static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
{
	vhost_net_ubuf_put_and_wait(ubufs);
	kfree(ubufs);
}

static void vhost_net_clear_ubuf_info(struct vhost_net *n)
{
	int i;

	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		kfree(n->vqs[i].ubuf_info);
		n->vqs[i].ubuf_info = NULL;
	}
}

static int vhost_net_set_ubuf_info(struct vhost_net *n)
{
	bool zcopy;
	int i;

	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		zcopy = vhost_net_zcopy_mask & (0x1 << i);
		if (!zcopy)
			continue;
		n->vqs[i].ubuf_info =
			kmalloc_array(UIO_MAXIOV,
				      sizeof(*n->vqs[i].ubuf_info),
				      GFP_KERNEL);
		if (!n->vqs[i].ubuf_info)
			goto err;
	}
	return 0;

err:
	vhost_net_clear_ubuf_info(n);
	return -ENOMEM;
}

static void vhost_net_vq_reset(struct vhost_net *n)
{
	int i;

	vhost_net_clear_ubuf_info(n);

	for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
		n->vqs[i].done_idx = 0;
		n->vqs[i].upend_idx = 0;
		n->vqs[i].ubufs = NULL;
		n->vqs[i].vhost_hlen = 0;
		n->vqs[i].sock_hlen = 0;
		vhost_net_buf_init(&n->vqs[i].rxq);
	}

}

static void vhost_net_tx_packet(struct vhost_net *net)
{
	++net->tx_packets;
	if (net->tx_packets < 1024)
		return;
	net->tx_packets = 0;
	net->tx_zcopy_err = 0;
}

static void vhost_net_tx_err(struct vhost_net *net)
{
	++net->tx_zcopy_err;
}

static bool vhost_net_tx_select_zcopy(struct vhost_net *net)
{
	/* TX flush waits for outstanding DMAs to be done.
	 * Don't start new DMAs.
	 */
	return !net->tx_flush &&
		net->tx_packets / 64 >= net->tx_zcopy_err;
}

static bool vhost_sock_zcopy(struct socket *sock)
{
	return unlikely(experimental_zcopytx) &&
		sock_flag(sock->sk, SOCK_ZEROCOPY);
}

static bool vhost_sock_xdp(struct socket *sock)
{
	return sock_flag(sock->sk, SOCK_XDP);
}

/* In case of DMA done not in order in lower device driver for some reason.
 * upend_idx is used to track end of used idx, done_idx is used to track head
 * of used idx. Once lower device DMA done contiguously, we will signal KVM
 * guest used idx.
 */
static void vhost_zerocopy_signal_used(struct vhost_net *net,
				       struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	int i, add;
	int j = 0;

	for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
		if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
			vhost_net_tx_err(net);
		if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
			++j;
		} else
			break;
	}
	while (j) {
		add = min(UIO_MAXIOV - nvq->done_idx, j);
		vhost_add_used_and_signal_n(vq->dev, vq,
					    &vq->heads[nvq->done_idx], add);
		nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
		j -= add;
	}
}

static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
{
	struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
	struct vhost_virtqueue *vq = ubufs->vq;
	int cnt;

	rcu_read_lock_bh();

	/* set len to mark this desc buffers done DMA */
	vq->heads[ubuf->desc].len = success ?
		VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
	cnt = vhost_net_ubuf_put(ubufs);

	/*
	 * Trigger polling thread if guest stopped submitting new buffers:
	 * in this case, the refcount after decrement will eventually reach 1.
	 * We also trigger polling periodically after each 16 packets
	 * (the value 16 here is more or less arbitrary, it's tuned to trigger
	 * less than 10% of times).
	 */
	if (cnt <= 1 || !(cnt % 16))
		vhost_poll_queue(&vq->poll);

	rcu_read_unlock_bh();
}

static inline unsigned long busy_clock(void)
{
	return local_clock() >> 10;
}

static bool vhost_can_busy_poll(unsigned long endtime)
{
	return likely(!need_resched() && !time_after(busy_clock(), endtime) &&
		      !signal_pending(current));
}

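/* Stop or (re)start polling the backend socket of a virtqueue.  Both are
 * called with the vq mutex held and do nothing if no backend is attached.
 */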
static void vhost_net_disable_vq(struct vhost_net *n,
				 struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
	if (!vhost_vq_get_backend(vq))
		return;
	vhost_poll_stop(poll);
}

static int vhost_net_enable_vq(struct vhost_net *n,
				struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
	struct socket *sock;

	sock = vhost_vq_get_backend(vq);
	if (!sock)
		return 0;

	return vhost_poll_start(poll, sock->file);
}

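/* Flush the heads batched via nvq->done_idx to the used ring and signal
 * the guest if needed.
 */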
static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
{
	struct vhost_virtqueue *vq = &nvq->vq;
	struct vhost_dev *dev = vq->dev;

	if (!nvq->done_idx)
		return;

	vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
	nvq->done_idx = 0;
}

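/* Push the XDP buffers batched in nvq->xdp[] to the backend with a single
 * TUN_MSG_PTR sendmsg(), then flush the corresponding used heads.  On a
 * sendmsg() failure the pages backing the batch are released and the
 * batch is dropped.
 */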
static void vhost_tx_batch(struct vhost_net *net,
			   struct vhost_net_virtqueue *nvq,
			   struct socket *sock,
			   struct msghdr *msghdr)
{
	struct tun_msg_ctl ctl = {
		.type = TUN_MSG_PTR,
		.num = nvq->batched_xdp,
		.ptr = nvq->xdp,
	};
	int i, err;

	if (nvq->batched_xdp == 0)
		goto signal_used;

	msghdr->msg_control = &ctl;
	err = sock->ops->sendmsg(sock, msghdr, 0);
	if (unlikely(err < 0)) {
		vq_err(&nvq->vq, "Fail to batch sending packets\n");

		/* free pages owned by XDP; since this is an unlikely error path,
		 * keep it simple and avoid more complex bulk update for the
		 * used pages
		 */
		for (i = 0; i < nvq->batched_xdp; ++i)
			put_page(virt_to_head_page(nvq->xdp[i].data));
		nvq->batched_xdp = 0;
		nvq->done_idx = 0;
		return;
	}

signal_used:
	vhost_net_signal_used(nvq);
	nvq->batched_xdp = 0;
}

static int sock_has_rx_data(struct socket *sock)
{
	if (unlikely(!sock))
		return 0;

	if (sock->ops->peek_len)
		return sock->ops->peek_len(sock);

	return skb_queue_empty(&sock->sk->sk_receive_queue);
}

static void vhost_net_busy_poll_try_queue(struct vhost_net *net,
					  struct vhost_virtqueue *vq)
{
	if (!vhost_vq_avail_empty(&net->dev, vq)) {
		vhost_poll_queue(&vq->poll);
	} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
		vhost_disable_notify(&net->dev, vq);
		vhost_poll_queue(&vq->poll);
	}
}

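/* Busy poll the paired virtqueue (RX while handling TX, TX while handling
 * RX) for up to busyloop_timeout, so one kick can serve both directions.
 * The paired vq mutex is only trylock'ed to avoid lock order problems.
 */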
static void vhost_net_busy_poll(struct vhost_net *net,
				struct vhost_virtqueue *rvq,
				struct vhost_virtqueue *tvq,
				bool *busyloop_intr,
				bool poll_rx)
{
	unsigned long busyloop_timeout;
	unsigned long endtime;
	struct socket *sock;
	struct vhost_virtqueue *vq = poll_rx ? tvq : rvq;

	/* Try to hold the vq mutex of the paired virtqueue. We can't
	 * use mutex_lock() here since we could not guarantee a
	 * consistent lock ordering.
	 */
	if (!mutex_trylock(&vq->mutex))
		return;

	vhost_disable_notify(&net->dev, vq);
	sock = vhost_vq_get_backend(rvq);

	busyloop_timeout = poll_rx ? rvq->busyloop_timeout:
				     tvq->busyloop_timeout;

	preempt_disable();
	endtime = busy_clock() + busyloop_timeout;

	while (vhost_can_busy_poll(endtime)) {
		if (vhost_has_work(&net->dev)) {
			*busyloop_intr = true;
			break;
		}

		if ((sock_has_rx_data(sock) &&
		     !vhost_vq_avail_empty(&net->dev, rvq)) ||
		    !vhost_vq_avail_empty(&net->dev, tvq))
			break;

		cpu_relax();
	}

	preempt_enable();

	if (poll_rx || sock_has_rx_data(sock))
		vhost_net_busy_poll_try_queue(net, vq);
	else if (!poll_rx) /* On tx here, sock has no rx data. */
		vhost_enable_notify(&net->dev, rvq);

	mutex_unlock(&vq->mutex);
}

static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
				    struct vhost_net_virtqueue *tnvq,
				    unsigned int *out_num, unsigned int *in_num,
				    struct msghdr *msghdr, bool *busyloop_intr)
{
	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
	struct vhost_virtqueue *rvq = &rnvq->vq;
	struct vhost_virtqueue *tvq = &tnvq->vq;

	int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
				  out_num, in_num, NULL, NULL);

	if (r == tvq->num && tvq->busyloop_timeout) {
		/* Flush batched packets first */
		if (!vhost_sock_zcopy(vhost_vq_get_backend(tvq)))
			vhost_tx_batch(net, tnvq,
				       vhost_vq_get_backend(tvq),
				       msghdr);

		vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);

		r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
				      out_num, in_num, NULL, NULL);
	}

	return r;
}

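/* Too many zerocopy TX buffers still in flight?  Used to cap outstanding
 * DMAs at min(VHOST_MAX_PEND, vq->num / 4) before falling back to copy.
 */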
static bool vhost_exceeds_maxpend(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;

	return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV >
	       min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
}

static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
			    size_t hdr_size, int out)
{
	/* Skip header. TODO: support TSO. */
	size_t len = iov_length(vq->iov, out);

	iov_iter_init(iter, WRITE, vq->iov, out, len);
	iov_iter_advance(iter, hdr_size);

	return iov_iter_count(iter);
}

static int get_tx_bufs(struct vhost_net *net,
		       struct vhost_net_virtqueue *nvq,
		       struct msghdr *msg,
		       unsigned int *out, unsigned int *in,
		       size_t *len, bool *busyloop_intr)
{
	struct vhost_virtqueue *vq = &nvq->vq;
	int ret;

	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);

	if (ret < 0 || ret == vq->num)
		return ret;

	if (*in) {
		vq_err(vq, "Unexpected descriptor format for TX: out %d, in %d\n",
			*out, *in);
		return -EFAULT;
	}

	/* Sanity check */
	*len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out);
	if (*len == 0) {
		vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n",
			*len, nvq->vhost_hlen);
		return -EFAULT;
	}

	return ret;
}

static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len)
{
	return total_len < VHOST_NET_WEIGHT &&
	       !vhost_vq_avail_empty(vq->dev, vq);
}

#define SKB_FRAG_PAGE_ORDER     get_order(32768)

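/* Make sure the private page frag has at least @sz bytes left, allocating
 * a fresh higher-order page (or a single page as fallback) when needed.
 * The page refcount is biased up front so that each packet built from the
 * frag only decrements net->refcnt_bias instead of the page refcount.
 */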
static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
				       struct page_frag *pfrag, gfp_t gfp)
{
	if (pfrag->page) {
		if (pfrag->offset + sz <= pfrag->size)
			return true;
		__page_frag_cache_drain(pfrag->page, net->refcnt_bias);
	}

	pfrag->offset = 0;
	net->refcnt_bias = 0;
	if (SKB_FRAG_PAGE_ORDER) {
		/* Avoid direct reclaim but allow kswapd to wake */
		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
					  __GFP_COMP | __GFP_NOWARN |
					  __GFP_NORETRY,
					  SKB_FRAG_PAGE_ORDER);
		if (likely(pfrag->page)) {
			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
			goto done;
		}
	}
	pfrag->page = alloc_page(gfp);
	if (likely(pfrag->page)) {
		pfrag->size = PAGE_SIZE;
		goto done;
	}
	return false;

done:
	net->refcnt_bias = USHRT_MAX;
	page_ref_add(pfrag->page, USHRT_MAX - 1);
	return true;
}

#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

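/* Copy one guest TX packet into the page frag and queue it as an xdp_buff
 * in nvq->xdp[].  Returns 0 on success, -ENOSPC if the packet does not fit
 * the XDP path (the caller then falls back to a plain sendmsg()), or
 * another negative error.
 */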
static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
			       struct iov_iter *from)
{
	struct vhost_virtqueue *vq = &nvq->vq;
	struct vhost_net *net = container_of(vq->dev, struct vhost_net,
					     dev);
	struct socket *sock = vhost_vq_get_backend(vq);
	struct page_frag *alloc_frag = &net->page_frag;
	struct virtio_net_hdr *gso;
	struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
	struct tun_xdp_hdr *hdr;
	size_t len = iov_iter_count(from);
	int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0;
	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen);
	int sock_hlen = nvq->sock_hlen;
	void *buf;
	int copied;

	if (unlikely(len < nvq->sock_hlen))
		return -EFAULT;

	if (SKB_DATA_ALIGN(len + pad) +
	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
		return -ENOSPC;

	buflen += SKB_DATA_ALIGN(len + pad);
	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
	if (unlikely(!vhost_net_page_frag_refill(net, buflen,
						 alloc_frag, GFP_KERNEL)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	copied = copy_page_from_iter(alloc_frag->page,
				     alloc_frag->offset +
				     offsetof(struct tun_xdp_hdr, gso),
				     sock_hlen, from);
	if (copied != sock_hlen)
		return -EFAULT;

	hdr = buf;
	gso = &hdr->gso;

	if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
	    vhost16_to_cpu(vq, gso->csum_start) +
	    vhost16_to_cpu(vq, gso->csum_offset) + 2 >
	    vhost16_to_cpu(vq, gso->hdr_len)) {
		gso->hdr_len = cpu_to_vhost16(vq,
			       vhost16_to_cpu(vq, gso->csum_start) +
			       vhost16_to_cpu(vq, gso->csum_offset) + 2);

		if (vhost16_to_cpu(vq, gso->hdr_len) > len)
			return -EINVAL;
	}

	len -= sock_hlen;
	copied = copy_page_from_iter(alloc_frag->page,
				     alloc_frag->offset + pad,
				     len, from);
	if (copied != len)
		return -EFAULT;

	xdp->data_hard_start = buf;
	xdp->data = buf + pad;
	xdp->data_end = xdp->data + len;
	hdr->buflen = buflen;
	xdp->frame_sz = buflen;

	--net->refcnt_bias;
	alloc_frag->offset += buflen;

	++nvq->batched_xdp;

	return 0;
}

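/* Copy-based TX path: when sndbuf is unlimited, packets are built into
 * batched XDP buffers and flushed via vhost_tx_batch(); otherwise each
 * packet is sent individually with sendmsg().
 */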
static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned out, in;
	int head;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	size_t len, total_len = 0;
	int err;
	int sent_pkts = 0;
	bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);

	do {
		bool busyloop_intr = false;

		if (nvq->done_idx == VHOST_NET_BATCH)
			vhost_tx_batch(net, nvq, sock, &msg);

		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
				   &busyloop_intr);
		/* On error, stop handling until the next kick. */
		if (unlikely(head < 0))
			break;
		/* Nothing new? Wait for eventfd to tell us they refilled. */
		if (head == vq->num) {
			if (unlikely(busyloop_intr)) {
				vhost_poll_queue(&vq->poll);
			} else if (unlikely(vhost_enable_notify(&net->dev,
								vq))) {
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			break;
		}

		total_len += len;

		/* For simplicity, TX batching is only enabled if
		 * sndbuf is unlimited.
		 */
		if (sock_can_batch) {
			err = vhost_net_build_xdp(nvq, &msg.msg_iter);
			if (!err) {
				goto done;
			} else if (unlikely(err != -ENOSPC)) {
				vhost_tx_batch(net, nvq, sock, &msg);
				vhost_discard_vq_desc(vq, 1);
				vhost_net_enable_vq(net, vq);
				break;
			}

			/* We can't build XDP buff, go for single
			 * packet path but let's flush batched
			 * packets.
			 */
			vhost_tx_batch(net, nvq, sock, &msg);
			msg.msg_control = NULL;
		} else {
			if (tx_can_batch(vq, total_len))
				msg.msg_flags |= MSG_MORE;
			else
				msg.msg_flags &= ~MSG_MORE;
		}

		/* TODO: Check specific error and bomb out unless ENOBUFS? */
		err = sock->ops->sendmsg(sock, &msg, len);
		if (unlikely(err < 0)) {
			vhost_discard_vq_desc(vq, 1);
			vhost_net_enable_vq(net, vq);
			break;
		}
		if (err != len)
			pr_debug("Truncated TX packet: len %d != %zd\n",
				 err, len);
done:
		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
		vq->heads[nvq->done_idx].len = 0;
		++nvq->done_idx;
	} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));

	vhost_tx_batch(net, nvq, sock, &msg);
}

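/* Zerocopy TX path: sufficiently large packets (>= VHOST_GOODCOPY_LEN) are
 * handed to the backend as TUN_MSG_UBUF so the lower device can DMA from
 * guest memory; completions arrive later via vhost_zerocopy_callback() and
 * are reaped by vhost_zerocopy_signal_used().
 */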
static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned out, in;
	int head;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	struct tun_msg_ctl ctl;
	size_t len, total_len = 0;
	int err;
	struct vhost_net_ubuf_ref *ubufs;
	struct ubuf_info *ubuf;
	bool zcopy_used;
	int sent_pkts = 0;

	do {
		bool busyloop_intr;

		/* Release DMAs done buffers first */
		vhost_zerocopy_signal_used(net, vq);

		busyloop_intr = false;
		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
				   &busyloop_intr);
		/* On error, stop handling until the next kick. */
		if (unlikely(head < 0))
			break;
		/* Nothing new? Wait for eventfd to tell us they refilled. */
		if (head == vq->num) {
			if (unlikely(busyloop_intr)) {
				vhost_poll_queue(&vq->poll);
			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			break;
		}

		zcopy_used = len >= VHOST_GOODCOPY_LEN
			     && !vhost_exceeds_maxpend(net)
			     && vhost_net_tx_select_zcopy(net);

		/* use msg_control to pass vhost zerocopy ubuf info to skb */
		if (zcopy_used) {
			ubuf = nvq->ubuf_info + nvq->upend_idx;
			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
			ubuf->callback = vhost_zerocopy_callback;
			ubuf->ctx = nvq->ubufs;
			ubuf->desc = nvq->upend_idx;
			refcount_set(&ubuf->refcnt, 1);
			msg.msg_control = &ctl;
			ctl.type = TUN_MSG_UBUF;
			ctl.ptr = ubuf;
			msg.msg_controllen = sizeof(ctl);
			ubufs = nvq->ubufs;
			atomic_inc(&ubufs->refcount);
			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
		} else {
			msg.msg_control = NULL;
			ubufs = NULL;
		}
		total_len += len;
		if (tx_can_batch(vq, total_len) &&
		    likely(!vhost_exceeds_maxpend(net))) {
			msg.msg_flags |= MSG_MORE;
		} else {
			msg.msg_flags &= ~MSG_MORE;
		}

		/* TODO: Check specific error and bomb out unless ENOBUFS? */
		err = sock->ops->sendmsg(sock, &msg, len);
		if (unlikely(err < 0)) {
			if (zcopy_used) {
				if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS)
					vhost_net_ubuf_put(ubufs);
				nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
					% UIO_MAXIOV;
			}
			vhost_discard_vq_desc(vq, 1);
			vhost_net_enable_vq(net, vq);
			break;
		}
		if (err != len)
			pr_debug("Truncated TX packet: "
				 " len %d != %zd\n", err, len);
		if (!zcopy_used)
			vhost_add_used_and_signal(&net->dev, vq, head, 0);
		else
			vhost_zerocopy_signal_used(net, vq);
		vhost_net_tx_packet(net);
	} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
}

/* Expects to be always run from workqueue - which acts as
 * read-side critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	struct socket *sock;

	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
	sock = vhost_vq_get_backend(vq);
	if (!sock)
		goto out;

	if (!vq_meta_prefetch(vq))
		goto out;

	vhost_disable_notify(&net->dev, vq);
	vhost_net_disable_vq(net, vq);

	if (vhost_sock_zcopy(sock))
		handle_tx_zerocopy(net, sock);
	else
		handle_tx_copy(net, sock);

out:
	mutex_unlock(&vq->mutex);
}

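/* Return the length of the next pending RX packet (0 if none), using the
 * batched ptr_ring when available and the socket receive queue otherwise.
 */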
static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
{
	struct sk_buff *head;
	int len = 0;
	unsigned long flags;

	if (rvq->rx_ring)
		return vhost_net_buf_peek(rvq);

	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
	head = skb_peek(&sk->sk_receive_queue);
	if (likely(head)) {
		len = head->len;
		if (skb_vlan_tag_present(head))
			len += VLAN_HLEN;
	}

	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
	return len;
}

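/* Like peek_head_len(), but when nothing is pending it flushes batched
 * used heads and busy polls the TX virtqueue before checking again.
 */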
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
				      bool *busyloop_intr)
{
	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
	struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *rvq = &rnvq->vq;
	struct vhost_virtqueue *tvq = &tnvq->vq;
	int len = peek_head_len(rnvq, sk);

	if (!len && rvq->busyloop_timeout) {
		/* Flush batched heads first */
		vhost_net_signal_used(rnvq);
		/* Both tx vq and rx socket were polled here */
		vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);

		len = peek_head_len(rnvq, sk);
	}

	return len;
}

/* This is a multi-buffer version of vhost_get_desc, that works if
 * vq has read descriptors only.
 * @vq - the relevant virtqueue
 * @datalen - data length we'll be reading
 * @iovcount - returned count of io vectors we fill
 * @log - vhost log
 * @log_num - log offset
 * @quota - headcount quota, 1 for big buffer
 * returns number of buffer heads allocated, negative on error
 */
static int get_rx_bufs(struct vhost_virtqueue *vq,
		       struct vring_used_elem *heads,
		       int datalen,
		       unsigned *iovcount,
		       struct vhost_log *log,
		       unsigned *log_num,
		       unsigned int quota)
{
	unsigned int out, in;
	int seg = 0;
	int headcount = 0;
	unsigned d;
	int r, nlogs = 0;
	/* len is always initialized before use since we are always called with
	 * datalen > 0.
	 */
	u32 len;

	while (datalen > 0 && headcount < quota) {
		if (unlikely(seg >= UIO_MAXIOV)) {
			r = -ENOBUFS;
			goto err;
		}
		r = vhost_get_vq_desc(vq, vq->iov + seg,
				      ARRAY_SIZE(vq->iov) - seg, &out,
				      &in, log, log_num);
		if (unlikely(r < 0))
			goto err;

		d = r;
		if (d == vq->num) {
			r = 0;
			goto err;
		}
		if (unlikely(out || in <= 0)) {
			vq_err(vq, "unexpected descriptor format for RX: "
				"out %d, in %d\n", out, in);
			r = -EINVAL;
			goto err;
		}
		if (unlikely(log)) {
			nlogs += *log_num;
			log += *log_num;
		}
		heads[headcount].id = cpu_to_vhost32(vq, d);
		len = iov_length(vq->iov + seg, in);
		heads[headcount].len = cpu_to_vhost32(vq, len);
		datalen -= len;
		++headcount;
		seg += in;
	}
	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
	*iovcount = seg;
	if (unlikely(log))
		*log_num = nlogs;

	/* Detect overrun */
	if (unlikely(datalen > 0)) {
		r = UIO_MAXIOV + 1;
		goto err;
	}
	return headcount;
err:
	vhost_discard_vq_desc(vq, headcount);
	return r;
}

/* Expects to be always run from workqueue - which acts as
 * read-side critical section for our kind of RCU. */
static void handle_rx(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned in, log;
	struct vhost_log *vq_log;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	struct virtio_net_hdr hdr = {
		.flags = 0,
		.gso_type = VIRTIO_NET_HDR_GSO_NONE
	};
	size_t total_len = 0;
	int err, mergeable;
	s16 headcount;
	size_t vhost_hlen, sock_hlen;
	size_t vhost_len, sock_len;
	bool busyloop_intr = false;
	struct socket *sock;
	struct iov_iter fixup;
	__virtio16 num_buffers;
	int recv_pkts = 0;

	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
	sock = vhost_vq_get_backend(vq);
	if (!sock)
		goto out;

	if (!vq_meta_prefetch(vq))
		goto out;

	vhost_disable_notify(&net->dev, vq);
	vhost_net_disable_vq(net, vq);

	vhost_hlen = nvq->vhost_hlen;
	sock_hlen = nvq->sock_hlen;

	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
		vq->log : NULL;
	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);

	do {
		sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
						      &busyloop_intr);
		if (!sock_len)
			break;
		sock_len += sock_hlen;
		vhost_len = sock_len + vhost_hlen;
		headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
					vhost_len, &in, vq_log, &log,
					likely(mergeable) ? UIO_MAXIOV : 1);
		/* On error, stop handling until the next kick. */
		if (unlikely(headcount < 0))
			goto out;
		/* OK, now we need to know about added descriptors. */
		if (!headcount) {
			if (unlikely(busyloop_intr)) {
				vhost_poll_queue(&vq->poll);
			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
				/* They have slipped one in as we were
				 * doing that: check again. */
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			/* Nothing new?  Wait for eventfd to tell us
			 * they refilled. */
			goto out;
		}
		busyloop_intr = false;
		if (nvq->rx_ring)
			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
		/* On overrun, truncate and discard */
		if (unlikely(headcount > UIO_MAXIOV)) {
			iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
			err = sock->ops->recvmsg(sock, &msg,
						 1, MSG_DONTWAIT | MSG_TRUNC);
			pr_debug("Discarded rx packet: len %zd\n", sock_len);
			continue;
		}
		/* We don't need to be notified again. */
		iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len);
		fixup = msg.msg_iter;
		if (unlikely((vhost_hlen))) {
			/* We will supply the header ourselves
			 * TODO: support TSO.
			 */
			iov_iter_advance(&msg.msg_iter, vhost_hlen);
		}
		err = sock->ops->recvmsg(sock, &msg,
					 sock_len, MSG_DONTWAIT | MSG_TRUNC);
		/* Userspace might have consumed the packet meanwhile:
		 * it's not supposed to do this usually, but might be hard
		 * to prevent. Discard data we got (if any) and keep going. */
		if (unlikely(err != sock_len)) {
			pr_debug("Discarded rx packet: "
				 " len %d, expected %zd\n", err, sock_len);
			vhost_discard_vq_desc(vq, headcount);
			continue;
		}
		/* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
		if (unlikely(vhost_hlen)) {
			if (copy_to_iter(&hdr, sizeof(hdr),
					 &fixup) != sizeof(hdr)) {
				vq_err(vq, "Unable to write vnet_hdr "
				       "at addr %p\n", vq->iov->iov_base);
				goto out;
			}
		} else {
			/* Header came from socket; we'll need to patch
			 * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF
			 */
			iov_iter_advance(&fixup, sizeof(hdr));
		}
		/* TODO: Should check and handle checksum. */

		num_buffers = cpu_to_vhost16(vq, headcount);
		if (likely(mergeable) &&
		    copy_to_iter(&num_buffers, sizeof num_buffers,
				 &fixup) != sizeof num_buffers) {
			vq_err(vq, "Failed num_buffers write");
			vhost_discard_vq_desc(vq, headcount);
			goto out;
		}
		nvq->done_idx += headcount;
		if (nvq->done_idx > VHOST_NET_BATCH)
			vhost_net_signal_used(nvq);
		if (unlikely(vq_log))
			vhost_log_write(vq, vq_log, log, vhost_len,
					vq->iov, in);
		total_len += vhost_len;
	} while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len)));

	if (unlikely(busyloop_intr))
		vhost_poll_queue(&vq->poll);
	else if (!sock_len)
		vhost_net_enable_vq(net, vq);
out:
	vhost_net_signal_used(nvq);
	mutex_unlock(&vq->mutex);
}

static void handle_tx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

	handle_tx(net);
}

static void handle_rx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

	handle_rx(net);
}

static void handle_tx_net(struct vhost_work *work)
{
	struct vhost_net *net = container_of(work, struct vhost_net,
					     poll[VHOST_NET_VQ_TX].work);
	handle_tx(net);
}

static void handle_rx_net(struct vhost_work *work)
{
	struct vhost_net *net = container_of(work, struct vhost_net,
					     poll[VHOST_NET_VQ_RX].work);
	handle_rx(net);
}

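/* open() of /dev/vhost-net: allocate the device, both virtqueues, the RX
 * batch queue and the batched XDP buffer array, and start with no backend
 * attached.
 */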
static int vhost_net_open(struct inode *inode, struct file *f)
{
	struct vhost_net *n;
	struct vhost_dev *dev;
	struct vhost_virtqueue **vqs;
	void **queue;
	struct xdp_buff *xdp;
	int i;

	n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!n)
		return -ENOMEM;
	vqs = kmalloc_array(VHOST_NET_VQ_MAX, sizeof(*vqs), GFP_KERNEL);
	if (!vqs) {
		kvfree(n);
		return -ENOMEM;
	}

	queue = kmalloc_array(VHOST_NET_BATCH, sizeof(void *),
			      GFP_KERNEL);
	if (!queue) {
		kfree(vqs);
		kvfree(n);
		return -ENOMEM;
	}
	n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue;

	xdp = kmalloc_array(VHOST_NET_BATCH, sizeof(*xdp), GFP_KERNEL);
	if (!xdp) {
		kfree(vqs);
		kvfree(n);
		kfree(queue);
		return -ENOMEM;
	}
	n->vqs[VHOST_NET_VQ_TX].xdp = xdp;

	dev = &n->dev;
	vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
	vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
	n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
	n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
	for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
		n->vqs[i].ubufs = NULL;
		n->vqs[i].ubuf_info = NULL;
		n->vqs[i].upend_idx = 0;
		n->vqs[i].done_idx = 0;
		n->vqs[i].batched_xdp = 0;
		n->vqs[i].vhost_hlen = 0;
		n->vqs[i].sock_hlen = 0;
		n->vqs[i].rx_ring = NULL;
		vhost_net_buf_init(&n->vqs[i].rxq);
	}
	vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
		       UIO_MAXIOV + VHOST_NET_BATCH,
		       VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true,
		       NULL);

	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev);
	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);

	f->private_data = n;
	n->page_frag.page = NULL;
	n->refcnt_bias = 0;

	return 0;
}

static struct socket *vhost_net_stop_vq(struct vhost_net *n,
					struct vhost_virtqueue *vq)
{
	struct socket *sock;
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);

	mutex_lock(&vq->mutex);
	sock = vhost_vq_get_backend(vq);
	vhost_net_disable_vq(n, vq);
	vhost_vq_set_backend(vq, NULL);
	vhost_net_buf_unproduce(nvq);
	nvq->rx_ring = NULL;
	mutex_unlock(&vq->mutex);
	return sock;
}

static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
			   struct socket **rx_sock)
{
	*tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq);
	*rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq);
}

static void vhost_net_flush_vq(struct vhost_net *n, int index)
{
	vhost_poll_flush(n->poll + index);
	vhost_poll_flush(&n->vqs[index].vq.poll);
}

static void vhost_net_flush(struct vhost_net *n)
{
	vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
	if (n->vqs[VHOST_NET_VQ_TX].ubufs) {
		mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
		n->tx_flush = true;
		mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
		/* Wait for all lower device DMAs done. */
		vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
		mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
		n->tx_flush = false;
		atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1);
		mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
	}
}

static int vhost_net_release(struct inode *inode, struct file *f)
{
	struct vhost_net *n = f->private_data;
	struct socket *tx_sock;
	struct socket *rx_sock;

	vhost_net_stop(n, &tx_sock, &rx_sock);
	vhost_net_flush(n);
	vhost_dev_stop(&n->dev);
	vhost_dev_cleanup(&n->dev);
	vhost_net_vq_reset(n);
	if (tx_sock)
		sockfd_put(tx_sock);
	if (rx_sock)
		sockfd_put(rx_sock);
	/* Make sure no callbacks are outstanding */
	synchronize_rcu();
	/* We do an extra flush before freeing memory,
	 * since jobs can re-queue themselves. */
	vhost_net_flush(n);
	kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
	kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
	kfree(n->dev.vqs);
	if (n->page_frag.page)
		__page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
	kvfree(n);
	return 0;
}

static struct socket *get_raw_socket(int fd)
{
	int r;
	struct socket *sock = sockfd_lookup(fd, &r);

	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	/* Parameter checking */
	if (sock->sk->sk_type != SOCK_RAW) {
		r = -ESOCKTNOSUPPORT;
		goto err;
	}

	if (sock->sk->sk_family != AF_PACKET) {
		r = -EPFNOSUPPORT;
		goto err;
	}
	return sock;
err:
	sockfd_put(sock);
	return ERR_PTR(r);
}

static struct ptr_ring *get_tap_ptr_ring(int fd)
{
	struct ptr_ring *ring;
	struct file *file = fget(fd);

	if (!file)
		return NULL;
	ring = tun_get_tx_ring(file);
	if (!IS_ERR(ring))
		goto out;
	ring = tap_get_ptr_ring(file);
	if (!IS_ERR(ring))
		goto out;
	ring = NULL;
out:
	fput(file);
	return ring;
}

static struct socket *get_tap_socket(int fd)
{
	struct file *file = fget(fd);
	struct socket *sock;

	if (!file)
		return ERR_PTR(-EBADF);
	sock = tun_get_socket(file);
	if (!IS_ERR(sock))
		return sock;
	sock = tap_get_socket(file);
	if (IS_ERR(sock))
		fput(file);
	return sock;
}

static struct socket *get_socket(int fd)
{
	struct socket *sock;

	/* special case to disable backend */
	if (fd == -1)
		return NULL;
	sock = get_raw_socket(fd);
	if (!IS_ERR(sock))
		return sock;
	sock = get_tap_socket(fd);
	if (!IS_ERR(sock))
		return sock;
	return ERR_PTR(-ENOTSOCK);
}

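/* VHOST_NET_SET_BACKEND: attach the socket behind @fd (tap/macvtap or raw
 * AF_PACKET) to virtqueue @index, tearing down any previous backend and
 * starting to poll the new one.
 */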
static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
	struct socket *sock, *oldsock;
	struct vhost_virtqueue *vq;
	struct vhost_net_virtqueue *nvq;
	struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL;
	int r;

	mutex_lock(&n->dev.mutex);
	r = vhost_dev_check_owner(&n->dev);
	if (r)
		goto err;

	if (index >= VHOST_NET_VQ_MAX) {
		r = -ENOBUFS;
		goto err;
	}
	vq = &n->vqs[index].vq;
	nvq = &n->vqs[index];
	mutex_lock(&vq->mutex);

	/* Verify that ring has been setup correctly. */
	if (!vhost_vq_access_ok(vq)) {
		r = -EFAULT;
		goto err_vq;
	}
	sock = get_socket(fd);
	if (IS_ERR(sock)) {
		r = PTR_ERR(sock);
		goto err_vq;
	}

	/* start polling new socket */
	oldsock = vhost_vq_get_backend(vq);
	if (sock != oldsock) {
		ubufs = vhost_net_ubuf_alloc(vq,
					     sock && vhost_sock_zcopy(sock));
		if (IS_ERR(ubufs)) {
			r = PTR_ERR(ubufs);
			goto err_ubufs;
		}

		vhost_net_disable_vq(n, vq);
		vhost_vq_set_backend(vq, sock);
		vhost_net_buf_unproduce(nvq);
		r = vhost_vq_init_access(vq);
		if (r)
			goto err_used;
		r = vhost_net_enable_vq(n, vq);
		if (r)
			goto err_used;
		if (index == VHOST_NET_VQ_RX)
			nvq->rx_ring = get_tap_ptr_ring(fd);

		oldubufs = nvq->ubufs;
		nvq->ubufs = ubufs;

		n->tx_packets = 0;
		n->tx_zcopy_err = 0;
		n->tx_flush = false;
	}

	mutex_unlock(&vq->mutex);

	if (oldubufs) {
		vhost_net_ubuf_put_wait_and_free(oldubufs);
		mutex_lock(&vq->mutex);
		vhost_zerocopy_signal_used(n, vq);
		mutex_unlock(&vq->mutex);
	}

	if (oldsock) {
		vhost_net_flush_vq(n, index);
		sockfd_put(oldsock);
	}

	mutex_unlock(&n->dev.mutex);
	return 0;

err_used:
	vhost_vq_set_backend(vq, oldsock);
	vhost_net_enable_vq(n, vq);
	if (ubufs)
		vhost_net_ubuf_put_wait_and_free(ubufs);
err_ubufs:
	if (sock)
		sockfd_put(sock);
err_vq:
	mutex_unlock(&vq->mutex);
err:
	mutex_unlock(&n->dev.mutex);
	return r;
}

static long vhost_net_reset_owner(struct vhost_net *n)
{
	struct socket *tx_sock = NULL;
	struct socket *rx_sock = NULL;
	long err;
	struct vhost_iotlb *umem;

	mutex_lock(&n->dev.mutex);
	err = vhost_dev_check_owner(&n->dev);
	if (err)
		goto done;
	umem = vhost_dev_reset_owner_prepare();
	if (!umem) {
		err = -ENOMEM;
		goto done;
	}
	vhost_net_stop(n, &tx_sock, &rx_sock);
	vhost_net_flush(n);
	vhost_dev_stop(&n->dev);
	vhost_dev_reset_owner(&n->dev, umem);
	vhost_net_vq_reset(n);
done:
	mutex_unlock(&n->dev.mutex);
	if (tx_sock)
		sockfd_put(tx_sock);
	if (rx_sock)
		sockfd_put(rx_sock);
	return err;
}

static int vhost_net_set_features(struct vhost_net *n, u64 features)
{
	size_t vhost_hlen, sock_hlen, hdr_len;
	int i;

	hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
			       (1ULL << VIRTIO_F_VERSION_1))) ?
			sizeof(struct virtio_net_hdr_mrg_rxbuf) :
			sizeof(struct virtio_net_hdr);
	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
		/* vhost provides vnet_hdr */
		vhost_hlen = hdr_len;
		sock_hlen = 0;
	} else {
		/* socket provides vnet_hdr */
		vhost_hlen = 0;
		sock_hlen = hdr_len;
	}
	mutex_lock(&n->dev.mutex);
	if ((features & (1 << VHOST_F_LOG_ALL)) &&
	    !vhost_log_access_ok(&n->dev))
		goto out_unlock;

	if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) {
		if (vhost_init_device_iotlb(&n->dev, true))
			goto out_unlock;
	}

	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		mutex_lock(&n->vqs[i].vq.mutex);
		n->vqs[i].vq.acked_features = features;
		n->vqs[i].vhost_hlen = vhost_hlen;
		n->vqs[i].sock_hlen = sock_hlen;
		mutex_unlock(&n->vqs[i].vq.mutex);
	}
	mutex_unlock(&n->dev.mutex);
	return 0;

out_unlock:
	mutex_unlock(&n->dev.mutex);
	return -EFAULT;
}

static long vhost_net_set_owner(struct vhost_net *n)
{
	int r;

	mutex_lock(&n->dev.mutex);
	if (vhost_dev_has_owner(&n->dev)) {
		r = -EBUSY;
		goto out;
	}
	r = vhost_net_set_ubuf_info(n);
	if (r)
		goto out;
	r = vhost_dev_set_owner(&n->dev);
	if (r)
		vhost_net_clear_ubuf_info(n);
	vhost_net_flush(n);
out:
	mutex_unlock(&n->dev.mutex);
	return r;
}

static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
			    unsigned long arg)
{
	struct vhost_net *n = f->private_data;
	void __user *argp = (void __user *)arg;
	u64 __user *featurep = argp;
	struct vhost_vring_file backend;
	u64 features;
	int r;

	switch (ioctl) {
	case VHOST_NET_SET_BACKEND:
		if (copy_from_user(&backend, argp, sizeof backend))
			return -EFAULT;
		return vhost_net_set_backend(n, backend.index, backend.fd);
	case VHOST_GET_FEATURES:
		features = VHOST_NET_FEATURES;
		if (copy_to_user(featurep, &features, sizeof features))
			return -EFAULT;
		return 0;
	case VHOST_SET_FEATURES:
		if (copy_from_user(&features, featurep, sizeof features))
			return -EFAULT;
		if (features & ~VHOST_NET_FEATURES)
			return -EOPNOTSUPP;
		return vhost_net_set_features(n, features);
	case VHOST_GET_BACKEND_FEATURES:
		features = VHOST_NET_BACKEND_FEATURES;
		if (copy_to_user(featurep, &features, sizeof(features)))
			return -EFAULT;
		return 0;
	case VHOST_SET_BACKEND_FEATURES:
		if (copy_from_user(&features, featurep, sizeof(features)))
			return -EFAULT;
		if (features & ~VHOST_NET_BACKEND_FEATURES)
			return -EOPNOTSUPP;
		vhost_set_backend_features(&n->dev, features);
		return 0;
	case VHOST_RESET_OWNER:
		return vhost_net_reset_owner(n);
	case VHOST_SET_OWNER:
		return vhost_net_set_owner(n);
	default:
		mutex_lock(&n->dev.mutex);
		r = vhost_dev_ioctl(&n->dev, ioctl, argp);
		if (r == -ENOIOCTLCMD)
			r = vhost_vring_ioctl(&n->dev, ioctl, argp);
		else
			vhost_net_flush(n);
		mutex_unlock(&n->dev.mutex);
		return r;
	}
}

static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct vhost_net *n = file->private_data;
	struct vhost_dev *dev = &n->dev;
	int noblock = file->f_flags & O_NONBLOCK;

	return vhost_chr_read_iter(dev, to, noblock);
}

static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb,
					struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct vhost_net *n = file->private_data;
	struct vhost_dev *dev = &n->dev;

	return vhost_chr_write_iter(dev, from);
}

static __poll_t vhost_net_chr_poll(struct file *file, poll_table *wait)
{
	struct vhost_net *n = file->private_data;
	struct vhost_dev *dev = &n->dev;

	return vhost_chr_poll(file, dev, wait);
}

static const struct file_operations vhost_net_fops = {
	.owner          = THIS_MODULE,
	.release        = vhost_net_release,
	.read_iter      = vhost_net_chr_read_iter,
	.write_iter     = vhost_net_chr_write_iter,
	.poll           = vhost_net_chr_poll,
	.unlocked_ioctl = vhost_net_ioctl,
	.compat_ioctl   = compat_ptr_ioctl,
	.open           = vhost_net_open,
	.llseek		= noop_llseek,
};

static struct miscdevice vhost_net_misc = {
	.minor = VHOST_NET_MINOR,
	.name = "vhost-net",
	.fops = &vhost_net_fops,
};

static int vhost_net_init(void)
{
	if (experimental_zcopytx)
		vhost_net_enable_zcopy(VHOST_NET_VQ_TX);
	return misc_register(&vhost_net_misc);
}
module_init(vhost_net_init);

static void vhost_net_exit(void)
{
	misc_deregister(&vhost_net_misc);
}
module_exit(vhost_net_exit);

MODULE_VERSION("0.0.1");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Michael S. Tsirkin");
MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR);
MODULE_ALIAS("devname:vhost-net");