Blame - kernel/bpf/ringbuf.c - hafnium/third_party/linux

blob: 1e4bf23528a3de1d955e7f73284508e90d31332e [file] [log] [blame]

Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1	#include <linux/bpf.h>
				2	#include <linux/btf.h>
				3	#include <linux/err.h>
				4	#include <linux/irq_work.h>
				5	#include <linux/slab.h>
				6	#include <linux/filter.h>
				7	#include <linux/mm.h>
				8	#include <linux/vmalloc.h>
				9	#include <linux/wait.h>
				10	#include <linux/poll.h>
				11	#include <linux/kmemleak.h>
				12	#include <uapi/linux/btf.h>
				13
				/* Only the NUMA-node flag is accepted when creating a ringbuf map */
				#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)

				/* Number of pages at the start of struct bpf_ringbuf that cannot be
				 * mmap()'ed: everything that precedes the consumer page.
				 */
				#define RINGBUF_PGOFF \
					(offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
				/* One page for the consumer position, one for the producer position */
				#define RINGBUF_POS_PAGES 2

				#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)

				/* The record header stores a 32-bit page offset, which bounds the size
				 * of the ring buffer area when counted in pages. 8 of those bits are held
				 * back for extensibility, and a few pages are subtracted for the
				 * consumer/producer pages and the non-mmap()'able part. The result is a
				 * 64GB limit, which seems plenty for a single ring buffer.
				 */
				#define RINGBUF_MAX_DATA_SZ \
					(((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)
				32
				33	struct bpf_ringbuf {
				34	wait_queue_head_t waitq;
				35	struct irq_work work;
				36	u64 mask;
				37	struct page **pages;
				38	int nr_pages;
				39	spinlock_t spinlock ____cacheline_aligned_in_smp;
				40	/* Consumer and producer counters are put into separate pages to allow
				41	* mapping consumer page as r/w, but restrict producer page to r/o.
				42	* This protects producer position from being modified by user-space
				43	* application and ruining in-kernel position tracking.
				44	*/
				45	unsigned long consumer_pos __aligned(PAGE_SIZE);
				46	unsigned long producer_pos __aligned(PAGE_SIZE);
				47	char data[] __aligned(PAGE_SIZE);
				48	};
				49
				50	struct bpf_ringbuf_map {
				51	struct bpf_map map;
				52	struct bpf_map_memory memory;
				53	struct bpf_ringbuf *rb;
				54	};
				55
				56	/* 8-byte ring buffer record header structure */
				57	struct bpf_ringbuf_hdr {
				58	u32 len;
				59	u32 pg_off;
				60	};
				61
				62	static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
				63	{
				64	const gfp_t flags = GFP_KERNEL \| __GFP_RETRY_MAYFAIL \| __GFP_NOWARN \|
				65	__GFP_ZERO;
				66	int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES;
				67	int nr_data_pages = data_sz >> PAGE_SHIFT;
				68	int nr_pages = nr_meta_pages + nr_data_pages;
				69	struct page *pages, page;
				70	struct bpf_ringbuf *rb;
				71	size_t array_size;
				72	int i;
				73
				74	/* Each data page is mapped twice to allow "virtual"
				75	* continuous read of samples wrapping around the end of ring
				76	* buffer area:
				77	* ------------------------------------------------------
				78	* \| meta pages \| real data pages \| same data pages \|
				79	* ------------------------------------------------------
				80	* \| \| 1 2 3 4 5 6 7 8 9 \| 1 2 3 4 5 6 7 8 9 \|
				81	* ------------------------------------------------------
				82	* \| \| TA DA \| TA DA \|
				83	* ------------------------------------------------------
				84	* ^^^^^^^
				85	* \|
				86	* Here, no need to worry about special handling of wrapped-around
				87	* data due to double-mapped data pages. This works both in kernel and
				88	* when mmap()'ed in user-space, simplifying both kernel and
				89	* user-space implementations significantly.
				90	*/
				91	array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages);
				92	if (array_size > PAGE_SIZE)
				93	pages = vmalloc_node(array_size, numa_node);
				94	else
				95	pages = kmalloc_node(array_size, flags, numa_node);
				96	if (!pages)
				97	return NULL;
				98
				99	for (i = 0; i < nr_pages; i++) {
				100	page = alloc_pages_node(numa_node, flags, 0);
				101	if (!page) {
				102	nr_pages = i;
				103	goto err_free_pages;
				104	}
				105	pages[i] = page;
				106	if (i >= nr_meta_pages)
				107	pages[nr_data_pages + i] = page;
				108	}
				109
				110	rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
				111	VM_MAP \| VM_USERMAP, PAGE_KERNEL);
				112	if (rb) {
				113	kmemleak_not_leak(pages);
				114	rb->pages = pages;
				115	rb->nr_pages = nr_pages;
				116	return rb;
				117	}
				118
				119	err_free_pages:
				120	for (i = 0; i < nr_pages; i++)
				121	__free_page(pages[i]);
				122	kvfree(pages);
				123	return NULL;
				124	}
				125
				126	static void bpf_ringbuf_notify(struct irq_work *work)
				127	{
				128	struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work);
				129
				130	wake_up_all(&rb->waitq);
				131	}
				132
				133	static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
				134	{
				135	struct bpf_ringbuf *rb;
				136
				137	rb = bpf_ringbuf_area_alloc(data_sz, numa_node);
				138	if (!rb)
				139	return ERR_PTR(-ENOMEM);
				140
				141	spin_lock_init(&rb->spinlock);
				142	init_waitqueue_head(&rb->waitq);
				143	init_irq_work(&rb->work, bpf_ringbuf_notify);
				144
				145	rb->mask = data_sz - 1;
				146	rb->consumer_pos = 0;
				147	rb->producer_pos = 0;
				148
				149	return rb;
				150	}
				151
				152	static struct bpf_map ringbuf_map_alloc(union bpf_attr attr)
				153	{
				154	struct bpf_ringbuf_map *rb_map;
				155	u64 cost;
				156	int err;
				157
				158	if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
				159	return ERR_PTR(-EINVAL);
				160
				161	if (attr->key_size \|\| attr->value_size \|\|
				162	!is_power_of_2(attr->max_entries) \|\|
				163	!PAGE_ALIGNED(attr->max_entries))
				164	return ERR_PTR(-EINVAL);
				165
				166	#ifdef CONFIG_64BIT
				167	/* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */
				168	if (attr->max_entries > RINGBUF_MAX_DATA_SZ)
				169	return ERR_PTR(-E2BIG);
				170	#endif
				171
				172	rb_map = kzalloc(sizeof(*rb_map), GFP_USER);
				173	if (!rb_map)
				174	return ERR_PTR(-ENOMEM);
				175
				176	bpf_map_init_from_attr(&rb_map->map, attr);
				177
				178	cost = sizeof(struct bpf_ringbuf_map) +
				179	sizeof(struct bpf_ringbuf) +
				180	attr->max_entries;
				181	err = bpf_map_charge_init(&rb_map->map.memory, cost);
				182	if (err)
				183	goto err_free_map;
				184
				185	rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
				186	if (IS_ERR(rb_map->rb)) {
				187	err = PTR_ERR(rb_map->rb);
				188	goto err_uncharge;
				189	}
				190
				191	return &rb_map->map;
				192
				193	err_uncharge:
				194	bpf_map_charge_finish(&rb_map->map.memory);
				195	err_free_map:
				196	kfree(rb_map);
				197	return ERR_PTR(err);
				198	}
				199
				200	static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
				201	{
				202	/* copy pages pointer and nr_pages to local variable, as we are going
				203	* to unmap rb itself with vunmap() below
				204	*/
				205	struct page **pages = rb->pages;
				206	int i, nr_pages = rb->nr_pages;
				207
				208	vunmap(rb);
				209	for (i = 0; i < nr_pages; i++)
				210	__free_page(pages[i]);
				211	kvfree(pages);
				212	}
				213
				214	static void ringbuf_map_free(struct bpf_map *map)
				215	{
				216	struct bpf_ringbuf_map *rb_map;
				217
				218	rb_map = container_of(map, struct bpf_ringbuf_map, map);
				219	bpf_ringbuf_free(rb_map->rb);
				220	kfree(rb_map);
				221	}
				222
				223	static void ringbuf_map_lookup_elem(struct bpf_map map, void *key)
				224	{
				225	return ERR_PTR(-ENOTSUPP);
				226	}
				227
				228	static int ringbuf_map_update_elem(struct bpf_map map, void key, void *value,
				229	u64 flags)
				230	{
				231	return -ENOTSUPP;
				232	}
				233
				234	static int ringbuf_map_delete_elem(struct bpf_map map, void key)
				235	{
				236	return -ENOTSUPP;
				237	}
				238
				239	static int ringbuf_map_get_next_key(struct bpf_map map, void key,
				240	void *next_key)
				241	{
				242	return -ENOTSUPP;
				243	}
				244
				245	static int ringbuf_map_mmap(struct bpf_map map, struct vm_area_struct vma)
				246	{
				247	struct bpf_ringbuf_map *rb_map;
				248
				249	rb_map = container_of(map, struct bpf_ringbuf_map, map);
				250
				251	if (vma->vm_flags & VM_WRITE) {
				252	/* allow writable mapping for the consumer_pos only */
				253	if (vma->vm_pgoff != 0 \|\| vma->vm_end - vma->vm_start != PAGE_SIZE)
				254	return -EPERM;
				255	} else {
				256	vma->vm_flags &= ~VM_MAYWRITE;
				257	}
				258	/* remap_vmalloc_range() checks size and offset constraints */
				259	return remap_vmalloc_range(vma, rb_map->rb,
				260	vma->vm_pgoff + RINGBUF_PGOFF);
				261	}
				262
				263	static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
				264	{
				265	unsigned long cons_pos, prod_pos;
				266
				267	cons_pos = smp_load_acquire(&rb->consumer_pos);
				268	prod_pos = smp_load_acquire(&rb->producer_pos);
				269	return prod_pos - cons_pos;
				270	}
				271
				272	static __poll_t ringbuf_map_poll(struct bpf_map map, struct file filp,
				273	struct poll_table_struct *pts)
				274	{
				275	struct bpf_ringbuf_map *rb_map;
				276
				277	rb_map = container_of(map, struct bpf_ringbuf_map, map);
				278	poll_wait(filp, &rb_map->rb->waitq, pts);
				279
				280	if (ringbuf_avail_data_sz(rb_map->rb))
				281	return EPOLLIN \| EPOLLRDNORM;
				282	return 0;
				283	}
				284
				285	static int ringbuf_map_btf_id;
				286	const struct bpf_map_ops ringbuf_map_ops = {
				287	.map_meta_equal = bpf_map_meta_equal,
				288	.map_alloc = ringbuf_map_alloc,
				289	.map_free = ringbuf_map_free,
				290	.map_mmap = ringbuf_map_mmap,
				291	.map_poll = ringbuf_map_poll,
				292	.map_lookup_elem = ringbuf_map_lookup_elem,
				293	.map_update_elem = ringbuf_map_update_elem,
				294	.map_delete_elem = ringbuf_map_delete_elem,
				295	.map_get_next_key = ringbuf_map_get_next_key,
				296	.map_btf_name = "bpf_ringbuf_map",
				297	.map_btf_id = &ringbuf_map_btf_id,
				298	};
				299
				300	/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
				301	* calculate offset from record metadata to ring buffer in pages, rounded
				302	* down. This page offset is stored as part of record metadata and allows to
				303	* restore struct bpf_ringbuf * from record pointer. This page offset is
				304	* stored at offset 4 of record metadata header.
				305	*/
				306	static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb,
				307	struct bpf_ringbuf_hdr *hdr)
				308	{
				309	return ((void )hdr - (void )rb) >> PAGE_SHIFT;
				310	}
				311
				312	/* Given pointer to ring buffer record header, restore pointer to struct
				313	* bpf_ringbuf itself by using page offset stored at offset 4
				314	*/
				315	static struct bpf_ringbuf *
				316	bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
				317	{
				318	unsigned long addr = (unsigned long)(void *)hdr;
				319	unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT;
				320
				321	return (void*)((addr & PAGE_MASK) - off);
				322	}
				323
				324	static void __bpf_ringbuf_reserve(struct bpf_ringbuf rb, u64 size)
				325	{
				326	unsigned long cons_pos, prod_pos, new_prod_pos, flags;
				327	u32 len, pg_off;
				328	struct bpf_ringbuf_hdr *hdr;
				329
				330	if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
				331	return NULL;
				332
				333	len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
				334	if (len > rb->mask + 1)
				335	return NULL;
				336
				337	cons_pos = smp_load_acquire(&rb->consumer_pos);
				338
				339	if (in_nmi()) {
				340	if (!spin_trylock_irqsave(&rb->spinlock, flags))
				341	return NULL;
				342	} else {
				343	spin_lock_irqsave(&rb->spinlock, flags);
				344	}
				345
				346	prod_pos = rb->producer_pos;
				347	new_prod_pos = prod_pos + len;
				348
				349	/* check for out of ringbuf space by ensuring producer position
				350	* doesn't advance more than (ringbuf_size - 1) ahead
				351	*/
				352	if (new_prod_pos - cons_pos > rb->mask) {
				353	spin_unlock_irqrestore(&rb->spinlock, flags);
				354	return NULL;
				355	}
				356
				357	hdr = (void *)rb->data + (prod_pos & rb->mask);
				358	pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
				359	hdr->len = size \| BPF_RINGBUF_BUSY_BIT;
				360	hdr->pg_off = pg_off;
				361
				362	/* pairs with consumer's smp_load_acquire() */
				363	smp_store_release(&rb->producer_pos, new_prod_pos);
				364
				365	spin_unlock_irqrestore(&rb->spinlock, flags);
				366
				367	return (void *)hdr + BPF_RINGBUF_HDR_SZ;
				368	}
				369
				370	BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
				371	{
				372	struct bpf_ringbuf_map *rb_map;
				373
				374	if (unlikely(flags))
				375	return 0;
				376
				377	rb_map = container_of(map, struct bpf_ringbuf_map, map);
				378	return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size);
				379	}
				380
				381	const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
				382	.func = bpf_ringbuf_reserve,
				383	.ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL,
				384	.arg1_type = ARG_CONST_MAP_PTR,
				385	.arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
				386	.arg3_type = ARG_ANYTHING,
				387	};
				388
				389	static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
				390	{
				391	unsigned long rec_pos, cons_pos;
				392	struct bpf_ringbuf_hdr *hdr;
				393	struct bpf_ringbuf *rb;
				394	u32 new_len;
				395
				396	hdr = sample - BPF_RINGBUF_HDR_SZ;
				397	rb = bpf_ringbuf_restore_from_rec(hdr);
				398	new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT;
				399	if (discard)
				400	new_len \|= BPF_RINGBUF_DISCARD_BIT;
				401
				402	/* update record header with correct final size prefix */
				403	xchg(&hdr->len, new_len);
				404
				405	/* if consumer caught up and is waiting for our record, notify about
				406	* new data availability
				407	*/
				408	rec_pos = (void )hdr - (void )rb->data;
				409	cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;
				410
				411	if (flags & BPF_RB_FORCE_WAKEUP)
				412	irq_work_queue(&rb->work);
				413	else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))
				414	irq_work_queue(&rb->work);
				415	}
				416
				417	BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
				418	{
				419	bpf_ringbuf_commit(sample, flags, false /* discard */);
				420	return 0;
				421	}
				422
				423	const struct bpf_func_proto bpf_ringbuf_submit_proto = {
				424	.func = bpf_ringbuf_submit,
				425	.ret_type = RET_VOID,
				426	.arg1_type = ARG_PTR_TO_ALLOC_MEM,
				427	.arg2_type = ARG_ANYTHING,
				428	};
				429
				430	BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
				431	{
				432	bpf_ringbuf_commit(sample, flags, true /* discard */);
				433	return 0;
				434	}
				435
				436	const struct bpf_func_proto bpf_ringbuf_discard_proto = {
				437	.func = bpf_ringbuf_discard,
				438	.ret_type = RET_VOID,
				439	.arg1_type = ARG_PTR_TO_ALLOC_MEM,
				440	.arg2_type = ARG_ANYTHING,
				441	};
				442
				443	BPF_CALL_4(bpf_ringbuf_output, struct bpf_map , map, void , data, u64, size,
				444	u64, flags)
				445	{
				446	struct bpf_ringbuf_map *rb_map;
				447	void *rec;
				448
				449	if (unlikely(flags & ~(BPF_RB_NO_WAKEUP \| BPF_RB_FORCE_WAKEUP)))
				450	return -EINVAL;
				451
				452	rb_map = container_of(map, struct bpf_ringbuf_map, map);
				453	rec = __bpf_ringbuf_reserve(rb_map->rb, size);
				454	if (!rec)
				455	return -EAGAIN;
				456
				457	memcpy(rec, data, size);
				458	bpf_ringbuf_commit(rec, flags, false /* discard */);
				459	return 0;
				460	}
				461
				462	const struct bpf_func_proto bpf_ringbuf_output_proto = {
				463	.func = bpf_ringbuf_output,
				464	.ret_type = RET_INTEGER,
				465	.arg1_type = ARG_CONST_MAP_PTR,
				466	.arg2_type = ARG_PTR_TO_MEM,
				467	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
				468	.arg4_type = ARG_ANYTHING,
				469	};
				470
				471	BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
				472	{
				473	struct bpf_ringbuf *rb;
				474
				475	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
				476
				477	switch (flags) {
				478	case BPF_RB_AVAIL_DATA:
				479	return ringbuf_avail_data_sz(rb);
				480	case BPF_RB_RING_SIZE:
				481	return rb->mask + 1;
				482	case BPF_RB_CONS_POS:
				483	return smp_load_acquire(&rb->consumer_pos);
				484	case BPF_RB_PROD_POS:
				485	return smp_load_acquire(&rb->producer_pos);
				486	default:
				487	return 0;
				488	}
				489	}
				490
				491	const struct bpf_func_proto bpf_ringbuf_query_proto = {
				492	.func = bpf_ringbuf_query,
				493	.ret_type = RET_INTEGER,
				494	.arg1_type = ARG_CONST_MAP_PTR,
				495	.arg2_type = ARG_ANYTHING,
				496	};