Blame - kernel/events/ring_buffer.c - hafnium/third_party/linux

blob: ffb59a4ef4ff3d5e4774296c802967224af40789 [file] [log] [blame]

David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2	/*
				3	* Performance events ring-buffer code:
				4	*
				5	* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
				6	* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
				7	* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
				8	* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9	*/
				10
				11	#include <linux/perf_event.h>
				12	#include <linux/vmalloc.h>
				13	#include <linux/slab.h>
				14	#include <linux/circ_buf.h>
				15	#include <linux/poll.h>
				16	#include <linux/nospec.h>
				17
				18	#include "internal.h"
				19
				20	static void perf_output_wakeup(struct perf_output_handle *handle)
				21	{
				22	atomic_set(&handle->rb->poll, EPOLLIN);
				23
				24	handle->event->pending_wakeup = 1;
				25	irq_work_queue(&handle->event->pending);
				26	}
				27
				28	/*
				29	* We need to ensure a later event_id doesn't publish a head when a former
				30	* event isn't done writing. However since we need to deal with NMIs we
				31	* cannot fully serialize things.
				32	*
				33	* We only publish the head (and generate a wakeup) when the outer-most
				34	* event completes.
				35	*/
				36	static void perf_output_get_handle(struct perf_output_handle *handle)
				37	{
				38	struct ring_buffer *rb = handle->rb;
				39
				40	preempt_disable();
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	41
				42	/*
				43	* Avoid an explicit LOAD/STORE such that architectures with memops
				44	* can use them.
				45	*/
				46	(*(volatile unsigned int *)&rb->nest)++;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	47	handle->wakeup = local_read(&rb->wakeup);
				48	}
				49
				50	static void perf_output_put_handle(struct perf_output_handle *handle)
				51	{
				52	struct ring_buffer *rb = handle->rb;
				53	unsigned long head;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	54	unsigned int nest;
				55
				56	/*
				57	* If this isn't the outermost nesting, we don't have to update
				58	* @rb->user_page->data_head.
				59	*/
				60	nest = READ_ONCE(rb->nest);
				61	if (nest > 1) {
				62	WRITE_ONCE(rb->nest, nest - 1);
				63	goto out;
				64	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	65
				66	again:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	67	/*
				68	* In order to avoid publishing a head value that goes backwards,
				69	* we must ensure the load of @rb->head happens after we've
				70	* incremented @rb->nest.
				71	*
				72	* Otherwise we can observe a @rb->head value before one published
				73	* by an IRQ/NMI happening between the load and the increment.
				74	*/
				75	barrier();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	76	head = local_read(&rb->head);
				77
				78	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	79	* IRQ/NMI can happen here and advance @rb->head, causing our
				80	* load above to be stale.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	81	*/
				82
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	83	/*
				84	* Since the mmap() consumer (userspace) can run on a different CPU:
				85	*
				86	*   kernel                          user
				87	*
				88	*   if (LOAD ->data_tail) {         LOAD ->data_head
				89	*                   (A)             smp_rmb()       (C)
				90	*      STORE $data                  LOAD $data
				91	*      smp_wmb()    (B)             smp_mb()        (D)
				92	*      STORE ->data_head            STORE ->data_tail
				93	*   }
				94	*
				95	* Where A pairs with D, and B pairs with C.
				96	*
				97	* In our case (A) is a control dependency that separates the load of
				98	* the ->data_tail and the stores of $data. In case ->data_tail
				99	* indicates there is no room in the buffer to store $data we do not.
				100	*
				101	* D needs to be a full barrier since it separates the data READ
				102	* from the tail WRITE.
				103	*
				104	* For B a WMB is sufficient since it separates two WRITEs, and for C
				105	* an RMB is sufficient since it separates two READs.
				106	*
				107	* See perf_output_begin().
				108	*/
				109	smp_wmb(); /* B, matches C */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	110	WRITE_ONCE(rb->user_page->data_head, head);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	111
				112	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	113	* We must publish the head before decrementing the nest count,
				114	* otherwise an IRQ/NMI can publish a more recent head value and our
				115	* write will (temporarily) publish a stale value.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	116	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	117	barrier();
				118	WRITE_ONCE(rb->nest, 0);
				119
				120	/*
				121	* Ensure we decrement @rb->nest before we validate the @rb->head.
				122	* Otherwise we cannot be sure we caught the 'last' nested update.
				123	*/
				124	barrier();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	125	if (unlikely(head != local_read(&rb->head))) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	126	WRITE_ONCE(rb->nest, 1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	127	goto again;
				128	}
				129
				130	if (handle->wakeup != local_read(&rb->wakeup))
				131	perf_output_wakeup(handle);
				132
				133	out:
				134	preempt_enable();
				135	}
				136
				137	static __always_inline bool
				138	ring_buffer_has_space(unsigned long head, unsigned long tail,
				139	unsigned long data_size, unsigned int size,
				140	bool backward)
				141	{
				142	if (!backward)
				143	return CIRC_SPACE(head, tail, data_size) >= size;
				144	else
				145	return CIRC_SPACE(tail, head, data_size) >= size;
				146	}
				147
				148	static __always_inline int
				149	__perf_output_begin(struct perf_output_handle *handle,
				150	struct perf_event *event, unsigned int size,
				151	bool backward)
				152	{
				153	struct ring_buffer *rb;
				154	unsigned long tail, offset, head;
				155	int have_lost, page_shift;
				156	struct {
				157	struct perf_event_header header;
				158	u64 id;
				159	u64 lost;
				160	} lost_event;
				161
				162	rcu_read_lock();
				163	/*
				164	* For inherited events we send all the output towards the parent.
				165	*/
				166	if (event->parent)
				167	event = event->parent;
				168
				169	rb = rcu_dereference(event->rb);
				170	if (unlikely(!rb))
				171	goto out;
				172
				173	if (unlikely(rb->paused)) {
				174	if (rb->nr_pages)
				175	local_inc(&rb->lost);
				176	goto out;
				177	}
				178
				179	handle->rb = rb;
				180	handle->event = event;
				181
				182	have_lost = local_read(&rb->lost);
				183	if (unlikely(have_lost)) {
				184	size += sizeof(lost_event);
				185	if (event->attr.sample_id_all)
				186	size += event->id_header_size;
				187	}
				188
				189	perf_output_get_handle(handle);
				190
				191	do {
				192	tail = READ_ONCE(rb->user_page->data_tail);
				193	offset = head = local_read(&rb->head);
				194	if (!rb->overwrite) {
				195	if (unlikely(!ring_buffer_has_space(head, tail,
				196	perf_data_size(rb),
				197	size, backward)))
				198	goto fail;
				199	}
				200
				201	/*
				202	* The above forms a control dependency barrier separating the
				203	* @tail load above from the data stores below. Since the @tail
				204	* load is required to compute the branch to fail below.
				205	*
				206	* A, matches D; the full memory barrier userspace SHOULD issue
				207	* after reading the data and before storing the new tail
				208	* position.
				209	*
				210	* See perf_output_put_handle().
				211	*/
				212
				213	if (!backward)
				214	head += size;
				215	else
				216	head -= size;
				217	} while (local_cmpxchg(&rb->head, offset, head) != offset);
				218
				219	if (backward) {
				220	offset = head;
				221	head = (u64)(-head);
				222	}
				223
				224	/*
				225	* We rely on the implied barrier() by local_cmpxchg() to ensure
				226	* none of the data stores below can be lifted up by the compiler.
				227	*/
				228
				229	if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
				230	local_add(rb->watermark, &rb->wakeup);
				231
				232	page_shift = PAGE_SHIFT + page_order(rb);
				233
				234	handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
				235	offset &= (1UL << page_shift) - 1;
				236	handle->addr = rb->data_pages[handle->page] + offset;
				237	handle->size = (1UL << page_shift) - offset;
				238
				239	if (unlikely(have_lost)) {
				240	struct perf_sample_data sample_data;
				241
				242	lost_event.header.size = sizeof(lost_event);
				243	lost_event.header.type = PERF_RECORD_LOST;
				244	lost_event.header.misc = 0;
				245	lost_event.id = event->id;
				246	lost_event.lost = local_xchg(&rb->lost, 0);
				247
				248	perf_event_header__init_id(&lost_event.header,
				249	&sample_data, event);
				250	perf_output_put(handle, lost_event);
				251	perf_event__output_id_sample(event, handle, &sample_data);
				252	}
				253
				254	return 0;
				255
				256	fail:
				257	local_inc(&rb->lost);
				258	perf_output_put_handle(handle);
				259	out:
				260	rcu_read_unlock();
				261
				262	return -ENOSPC;
				263	}
				264
				265	int perf_output_begin_forward(struct perf_output_handle *handle,
				266	struct perf_event *event, unsigned int size)
				267	{
				268	return __perf_output_begin(handle, event, size, false);
				269	}
				270
				271	int perf_output_begin_backward(struct perf_output_handle *handle,
				272	struct perf_event *event, unsigned int size)
				273	{
				274	return __perf_output_begin(handle, event, size, true);
				275	}
				276
				277	int perf_output_begin(struct perf_output_handle *handle,
				278	struct perf_event *event, unsigned int size)
				279	{
				280
				281	return __perf_output_begin(handle, event, size,
				282	unlikely(is_write_backward(event)));
				283	}
				284
				285	unsigned int perf_output_copy(struct perf_output_handle *handle,
				286	const void *buf, unsigned int len)
				287	{
				288	return __output_copy(handle, buf, len);
				289	}
				290
				291	unsigned int perf_output_skip(struct perf_output_handle *handle,
				292	unsigned int len)
				293	{
				294	return __output_skip(handle, NULL, len);
				295	}
				296
				297	void perf_output_end(struct perf_output_handle *handle)
				298	{
				299	perf_output_put_handle(handle);
				300	rcu_read_unlock();
				301	}
				302
				303	static void
				304	ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
				305	{
				306	long max_size = perf_data_size(rb);
				307
				308	if (watermark)
				309	rb->watermark = min(max_size, watermark);
				310
				311	if (!rb->watermark)
				312	rb->watermark = max_size / 2;
				313
				314	if (flags & RING_BUFFER_WRITABLE)
				315	rb->overwrite = 0;
				316	else
				317	rb->overwrite = 1;
				318
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	319	refcount_set(&rb->refcount, 1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	320
				321	INIT_LIST_HEAD(&rb->event_list);
				322	spin_lock_init(&rb->event_lock);
				323
				324	/*
				325	* perf_output_begin() only checks rb->paused, therefore
				326	* rb->paused must be true if we have no pages for output.
				327	*/
				328	if (!rb->nr_pages)
				329	rb->paused = 1;
				330	}
				331
				332	void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
				333	{
				334	/*
				335	* OVERWRITE is determined by perf_aux_output_end() and can't
				336	* be passed in directly.
				337	*/
				338	if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE))
				339	return;
				340
				341	handle->aux_flags |= flags;
				342	}
				343	EXPORT_SYMBOL_GPL(perf_aux_output_flag);
				344
				345	/*
				346	* This is called before hardware starts writing to the AUX area to
				347	* obtain an output handle and make sure there's room in the buffer.
				348	* When the capture completes, call perf_aux_output_end() to commit
				349	* the recorded data to the buffer.
				350	*
				351	* The ordering is similar to that of perf_output_{begin,end}, with
				352	* the exception of (B), which should be taken care of by the pmu
				353	* driver, since ordering rules will differ depending on hardware.
				354	*
				355	* Call this from pmu::start(); see the comment in perf_aux_output_end()
				356	* about its use in pmu callbacks. Both can also be called from the PMI
				357	* handler if needed.
				358	*/
				359	void *perf_aux_output_begin(struct perf_output_handle *handle,
				360	struct perf_event *event)
				361	{
				362	struct perf_event *output_event = event;
				363	unsigned long aux_head, aux_tail;
				364	struct ring_buffer *rb;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	365	unsigned int nest;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	366
				367	if (output_event->parent)
				368	output_event = output_event->parent;
				369
				370	/*
				371	* Since this will typically be open across pmu::add/pmu::del, we
				372	* grab ring_buffer's refcount instead of holding rcu read lock
				373	* to make sure it doesn't disappear under us.
				374	*/
				375	rb = ring_buffer_get(output_event);
				376	if (!rb)
				377	return NULL;
				378
				379	if (!rb_has_aux(rb))
				380	goto err;
				381
				382	/*
				383	* If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(),
				384	* about to get freed, so we leave immediately.
				385	*
				386	* Checking rb::aux_mmap_count and rb::refcount has to be done in
				387	* the same order, see perf_mmap_close. Otherwise we end up freeing
				388	* aux pages in this path, which is a bug, because in_atomic().
				389	*/
				390	if (!atomic_read(&rb->aux_mmap_count))
				391	goto err;
				392
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	393	if (!refcount_inc_not_zero(&rb->aux_refcount))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	394	goto err;
				395
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	396	nest = READ_ONCE(rb->aux_nest);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	397	/*
				398	* Nesting is not supported for AUX area, make sure nested
				399	* writers are caught early
				400	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	401	if (WARN_ON_ONCE(nest))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	402	goto err_put;
				403
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	404	WRITE_ONCE(rb->aux_nest, nest + 1);
				405
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	406	aux_head = rb->aux_head;
				407
				408	handle->rb = rb;
				409	handle->event = event;
				410	handle->head = aux_head;
				411	handle->size = 0;
				412	handle->aux_flags = 0;
				413
				414	/*
				415	* In overwrite mode, AUX data stores do not depend on aux_tail,
				416	* therefore (A) control dependency barrier does not exist. The
				417	* (B) <-> (C) ordering is still observed by the pmu driver.
				418	*/
				419	if (!rb->aux_overwrite) {
				420	aux_tail = READ_ONCE(rb->user_page->aux_tail);
				421	handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
				422	if (aux_head - aux_tail < perf_aux_size(rb))
				423	handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
				424
				425	/*
				426	* handle->size computation depends on aux_tail load; this forms a
				427	* control dependency barrier separating aux_tail load from aux data
				428	* store that will be enabled on successful return
				429	*/
				430	if (!handle->size) { /* A, matches D */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	431	event->pending_disable = smp_processor_id();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	432	perf_output_wakeup(handle);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	433	WRITE_ONCE(rb->aux_nest, 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	434	goto err_put;
				435	}
				436	}
				437
				438	return handle->rb->aux_priv;
				439
				440	err_put:
				441	/* can't be last */
				442	rb_free_aux(rb);
				443
				444	err:
				445	ring_buffer_put(rb);
				446	handle->event = NULL;
				447
				448	return NULL;
				449	}
				450	EXPORT_SYMBOL_GPL(perf_aux_output_begin);
				451
				452	static __always_inline bool rb_need_aux_wakeup(struct ring_buffer *rb)
				453	{
				454	if (rb->aux_overwrite)
				455	return false;
				456
				457	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
				458	rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
				459	return true;
				460	}
				461
				462	return false;
				463	}
				464
				465	/*
				466	* Commit the data written by hardware into the ring buffer by adjusting
				467	* aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
				468	* pmu driver's responsibility to observe ordering rules of the hardware,
				469	* so that all the data is externally visible before this is called.
				470	*
				471	* Note: this has to be called from pmu::stop() callback, as the assumption
				472	* of the AUX buffer management code is that after pmu::stop(), the AUX
				473	* transaction must be stopped and therefore drop the AUX reference count.
				474	*/
				475	void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
				476	{
				477	bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
				478	struct ring_buffer *rb = handle->rb;
				479	unsigned long aux_head;
				480
				481	/* in overwrite mode, driver provides aux_head via handle */
				482	if (rb->aux_overwrite) {
				483	handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
				484
				485	aux_head = handle->head;
				486	rb->aux_head = aux_head;
				487	} else {
				488	handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
				489
				490	aux_head = rb->aux_head;
				491	rb->aux_head += size;
				492	}
				493
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	494	/*
				495	* Only send RECORD_AUX if we have something useful to communicate
				496	*
				497	* Note: the OVERWRITE records by themselves are not considered
				498	* useful, as they don't communicate any new information,
				499	* aside from the short-lived offset, that becomes history at
				500	* the next event sched-in and therefore isn't useful.
				501	* The userspace that needs to copy out AUX data in overwrite
				502	* mode should know to use user_page::aux_head for the actual
				503	* offset. So, from now on we don't output AUX records that
				504	* have only OVERWRITE flag set.
				505	*/
				506	if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	507	perf_event_aux_event(handle->event, aux_head, size,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	508	handle->aux_flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	509
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	510	WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	511	if (rb_need_aux_wakeup(rb))
				512	wakeup = true;
				513
				514	if (wakeup) {
				515	if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	516	handle->event->pending_disable = smp_processor_id();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	517	perf_output_wakeup(handle);
				518	}
				519
				520	handle->event = NULL;
				521
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	522	WRITE_ONCE(rb->aux_nest, 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	523	/* can't be last */
				524	rb_free_aux(rb);
				525	ring_buffer_put(rb);
				526	}
				527	EXPORT_SYMBOL_GPL(perf_aux_output_end);
				528
				529	/*
				530	* Skip over a given number of bytes in the AUX buffer, due to, for example,
				531	* hardware's alignment constraints.
				532	*/
				533	int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
				534	{
				535	struct ring_buffer *rb = handle->rb;
				536
				537	if (size > handle->size)
				538	return -ENOSPC;
				539
				540	rb->aux_head += size;
				541
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	542	WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	543	if (rb_need_aux_wakeup(rb)) {
				544	perf_output_wakeup(handle);
				545	handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
				546	}
				547
				548	handle->head = rb->aux_head;
				549	handle->size -= size;
				550
				551	return 0;
				552	}
				553	EXPORT_SYMBOL_GPL(perf_aux_output_skip);
				554
				555	void *perf_get_aux(struct perf_output_handle *handle)
				556	{
				557	/* this is only valid between perf_aux_output_begin and _end */
				558	if (!handle->event)
				559	return NULL;
				560
				561	return handle->rb->aux_priv;
				562	}
				563	EXPORT_SYMBOL_GPL(perf_get_aux);
				564
				565	#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
				566
				567	static struct page *rb_alloc_aux_page(int node, int order)
				568	{
				569	struct page *page;
				570
				571	if (order > MAX_ORDER)
				572	order = MAX_ORDER;
				573
				574	do {
				575	page = alloc_pages_node(node, PERF_AUX_GFP, order);
				576	} while (!page && order--);
				577
				578	if (page && order) {
				579	/*
				580	* Communicate the allocation size to the driver:
				581	* if we managed to secure a high-order allocation,
				582	* set its first page's private to this order;
				583	* !PagePrivate(page) means it's just a normal page.
				584	*/
				585	split_page(page, order);
				586	SetPagePrivate(page);
				587	set_page_private(page, order);
				588	}
				589
				590	return page;
				591	}
				592
				593	static void rb_free_aux_page(struct ring_buffer *rb, int idx)
				594	{
				595	struct page *page = virt_to_page(rb->aux_pages[idx]);
				596
				597	ClearPagePrivate(page);
				598	page->mapping = NULL;
				599	__free_page(page);
				600	}
				601
				602	static void __rb_free_aux(struct ring_buffer *rb)
				603	{
				604	int pg;
				605
				606	/*
				607	* Should never happen, the last reference should be dropped from
				608	* perf_mmap_close() path, which first stops aux transactions (which
				609	* in turn are the atomic holders of aux_refcount) and then does the
				610	* last rb_free_aux().
				611	*/
				612	WARN_ON_ONCE(in_atomic());
				613
				614	if (rb->aux_priv) {
				615	rb->free_aux(rb->aux_priv);
				616	rb->free_aux = NULL;
				617	rb->aux_priv = NULL;
				618	}
				619
				620	if (rb->aux_nr_pages) {
				621	for (pg = 0; pg < rb->aux_nr_pages; pg++)
				622	rb_free_aux_page(rb, pg);
				623
				624	kfree(rb->aux_pages);
				625	rb->aux_nr_pages = 0;
				626	}
				627	}
				628
				629	int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
				630	pgoff_t pgoff, int nr_pages, long watermark, int flags)
				631	{
				632	bool overwrite = !(flags & RING_BUFFER_WRITABLE);
				633	int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	634	int ret = -ENOMEM, max_order;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	635
				636	if (!has_aux(event))
				637	return -EOPNOTSUPP;
				638
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	639	/*
				640	* We need to start with the max_order that fits in nr_pages,
				641	* not the other way around, hence ilog2() and not get_order.
				642	*/
				643	max_order = ilog2(nr_pages);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	644
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	645	/*
				646	* PMU requests more than one contiguous chunks of memory
				647	* for SW double buffering
				648	*/
				649	if (!overwrite) {
				650	if (!max_order)
				651	return -EINVAL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	652
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	653	max_order--;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	654	}
				655
				656	rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
				657	node);
				658	if (!rb->aux_pages)
				659	return -ENOMEM;
				660
				661	rb->free_aux = event->pmu->free_aux;
				662	for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
				663	struct page *page;
				664	int last, order;
				665
				666	order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
				667	page = rb_alloc_aux_page(node, order);
				668	if (!page)
				669	goto out;
				670
				671	for (last = rb->aux_nr_pages + (1 << page_private(page));
				672	last > rb->aux_nr_pages; rb->aux_nr_pages++)
				673	rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
				674	}
				675
				676	/*
				677	* In overwrite mode, PMUs that don't support SG may not handle more
				678	* than one contiguous allocation, since they rely on PMI to do double
				679	* buffering. In this case, the entire buffer has to be one contiguous
				680	* chunk.
				681	*/
				682	if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) &&
				683	overwrite) {
				684	struct page *page = virt_to_page(rb->aux_pages[0]);
				685
				686	if (page_private(page) != max_order)
				687	goto out;
				688	}
				689
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	690	rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	691	overwrite);
				692	if (!rb->aux_priv)
				693	goto out;
				694
				695	ret = 0;
				696
				697	/*
				698	* aux_pages (and pmu driver's private data, aux_priv) will be
				699	* referenced in both producer's and consumer's contexts, thus
				700	* we keep a refcount here to make sure either of the two can
				701	* reference them safely.
				702	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	703	refcount_set(&rb->aux_refcount, 1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	704
				705	rb->aux_overwrite = overwrite;
				706	rb->aux_watermark = watermark;
				707
				708	if (!rb->aux_watermark && !rb->aux_overwrite)
				709	rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1);
				710
				711	out:
				712	if (!ret)
				713	rb->aux_pgoff = pgoff;
				714	else
				715	__rb_free_aux(rb);
				716
				717	return ret;
				718	}
				719
				720	void rb_free_aux(struct ring_buffer *rb)
				721	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	722	if (refcount_dec_and_test(&rb->aux_refcount))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	723	__rb_free_aux(rb);
				724	}
				725
				726	#ifndef CONFIG_PERF_USE_VMALLOC
				727
				728	/*
				729	* Back perf_mmap() with regular GFP_KERNEL-0 pages.
				730	*/
				731
				732	static struct page *
				733	__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
				734	{
				735	if (pgoff > rb->nr_pages)
				736	return NULL;
				737
				738	if (pgoff == 0)
				739	return virt_to_page(rb->user_page);
				740
				741	return virt_to_page(rb->data_pages[pgoff - 1]);
				742	}
				743
				744	static void *perf_mmap_alloc_page(int cpu)
				745	{
				746	struct page *page;
				747	int node;
				748
				749	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
				750	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
				751	if (!page)
				752	return NULL;
				753
				754	return page_address(page);
				755	}
				756
				757	struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
				758	{
				759	struct ring_buffer *rb;
				760	unsigned long size;
				761	int i;
				762
				763	size = sizeof(struct ring_buffer);
				764	size += nr_pages * sizeof(void *);
				765
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	766	if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
				767	goto fail;
				768
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	769	rb = kzalloc(size, GFP_KERNEL);
				770	if (!rb)
				771	goto fail;
				772
				773	rb->user_page = perf_mmap_alloc_page(cpu);
				774	if (!rb->user_page)
				775	goto fail_user_page;
				776
				777	for (i = 0; i < nr_pages; i++) {
				778	rb->data_pages[i] = perf_mmap_alloc_page(cpu);
				779	if (!rb->data_pages[i])
				780	goto fail_data_pages;
				781	}
				782
				783	rb->nr_pages = nr_pages;
				784
				785	ring_buffer_init(rb, watermark, flags);
				786
				787	return rb;
				788
				789	fail_data_pages:
				790	for (i--; i >= 0; i--)
				791	free_page((unsigned long)rb->data_pages[i]);
				792
				793	free_page((unsigned long)rb->user_page);
				794
				795	fail_user_page:
				796	kfree(rb);
				797
				798	fail:
				799	return NULL;
				800	}
				801
				802	static void perf_mmap_free_page(unsigned long addr)
				803	{
				804	struct page *page = virt_to_page((void *)addr);
				805
				806	page->mapping = NULL;
				807	__free_page(page);
				808	}
				809
				810	void rb_free(struct ring_buffer *rb)
				811	{
				812	int i;
				813
				814	perf_mmap_free_page((unsigned long)rb->user_page);
				815	for (i = 0; i < rb->nr_pages; i++)
				816	perf_mmap_free_page((unsigned long)rb->data_pages[i]);
				817	kfree(rb);
				818	}
				819
				820	#else
				821	static int data_page_nr(struct ring_buffer *rb)
				822	{
				823	return rb->nr_pages << page_order(rb);
				824	}
				825
				826	static struct page *
				827	__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
				828	{
				829	/* The '>' counts in the user page. */
				830	if (pgoff > data_page_nr(rb))
				831	return NULL;
				832
				833	return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
				834	}
				835
				836	static void perf_mmap_unmark_page(void *addr)
				837	{
				838	struct page *page = vmalloc_to_page(addr);
				839
				840	page->mapping = NULL;
				841	}
				842
				843	static void rb_free_work(struct work_struct *work)
				844	{
				845	struct ring_buffer *rb;
				846	void *base;
				847	int i, nr;
				848
				849	rb = container_of(work, struct ring_buffer, work);
				850	nr = data_page_nr(rb);
				851
				852	base = rb->user_page;
				853	/* The '<=' counts in the user page. */
				854	for (i = 0; i <= nr; i++)
				855	perf_mmap_unmark_page(base + (i * PAGE_SIZE));
				856
				857	vfree(base);
				858	kfree(rb);
				859	}
				860
				861	void rb_free(struct ring_buffer *rb)
				862	{
				863	schedule_work(&rb->work);
				864	}
				865
				866	struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
				867	{
				868	struct ring_buffer *rb;
				869	unsigned long size;
				870	void *all_buf;
				871
				872	size = sizeof(struct ring_buffer);
				873	size += sizeof(void *);
				874
				875	rb = kzalloc(size, GFP_KERNEL);
				876	if (!rb)
				877	goto fail;
				878
				879	INIT_WORK(&rb->work, rb_free_work);
				880
				881	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
				882	if (!all_buf)
				883	goto fail_all_buf;
				884
				885	rb->user_page = all_buf;
				886	rb->data_pages[0] = all_buf + PAGE_SIZE;
				887	if (nr_pages) {
				888	rb->nr_pages = 1;
				889	rb->page_order = ilog2(nr_pages);
				890	}
				891
				892	ring_buffer_init(rb, watermark, flags);
				893
				894	return rb;
				895
				896	fail_all_buf:
				897	kfree(rb);
				898
				899	fail:
				900	return NULL;
				901	}
				902
				903	#endif
				904
				905	struct page *
				906	perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
				907	{
				908	if (rb->aux_nr_pages) {
				909	/* above AUX space */
				910	if (pgoff > rb->aux_pgoff + rb->aux_nr_pages)
				911	return NULL;
				912
				913	/* AUX space */
				914	if (pgoff >= rb->aux_pgoff) {
				915	int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages);
				916	return virt_to_page(rb->aux_pages[aux_pgoff]);
				917	}
				918	}
				919
				920	return __perf_mmap_to_page(rb, pgoff);
				921	}