// SPDX-License-Identifier: GPL-2.0
/*
 * trace event based perf event profiling/tracing
 *
 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra
 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
 */

#include <linux/module.h>
#include <linux/kprobes.h>
#include "trace.h"
#include "trace_probe.h"

static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];

/*
 * Force it to be aligned to unsigned long to avoid misaligned access
 * surprises.
 */
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
	perf_trace_t;

/* Count the events in use (per event id, not per instance) */
static int total_ref_count;

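/*
 * Check whether the calling task is allowed to attach this perf event to
 * the given trace event. Returns 0 if permitted, a negative errno otherwise.
 */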
static int perf_trace_event_perm(struct trace_event_call *tp_event,
				 struct perf_event *p_event)
{
	if (tp_event->perf_perm) {
		int ret = tp_event->perf_perm(tp_event, p_event);
		if (ret)
			return ret;
	}

	/*
	 * We already checked and allowed the parent to be created,
	 * so allow children without checking again.
	 */
	if (p_event->parent)
		return 0;

	/*
	 * It's ok to check the current process (owner) permissions here,
	 * because the code below is reached only via the perf_event_open
	 * syscall.
	 */

	/* The ftrace function trace is allowed only for root. */
	if (ftrace_event_is_function(tp_event)) {
		if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
			return -EPERM;

		if (!is_sampling_event(p_event))
			return 0;

		/*
		 * We don't allow user space callchains for the function
		 * trace event, due to issues with page faults while
		 * tracing the page fault handler, and the overall
		 * trickiness involved.
		 */
		if (!p_event->attr.exclude_callchain_user)
			return -EINVAL;

		/*
		 * Same reason to disable user stack dump as for user space
		 * callchains above.
		 */
		if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
			return -EINVAL;
	}

	/* No tracing, just counting, so no obvious leak */
	if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
		return 0;

	/* Some events are ok to be traced by non-root users... */
	if (p_event->attach_state == PERF_ATTACH_TASK) {
		if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
			return 0;
	}

	/*
	 * ...otherwise raw tracepoint data can be a severe data leak;
	 * only allow root to have these.
	 */
	if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	return 0;
}

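/*
 * Register @tp_event for use by perf. The first perf event attached to a
 * given trace event allocates that event's per-cpu hlist; the first one in
 * the whole system also allocates the shared perf_trace_buf buffers. Both
 * are refcounted (perf_refcount per event, total_ref_count globally) and
 * torn down again by perf_trace_event_unreg().
 */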
static int perf_trace_event_reg(struct trace_event_call *tp_event,
				struct perf_event *p_event)
{
	struct hlist_head __percpu *list;
	int ret = -ENOMEM;
	int cpu;

	p_event->tp_event = tp_event;
	if (tp_event->perf_refcount++ > 0)
		return 0;

	list = alloc_percpu(struct hlist_head);
	if (!list)
		goto fail;

	for_each_possible_cpu(cpu)
		INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));

	tp_event->perf_events = list;

	if (!total_ref_count) {
		char __percpu *buf;
		int i;

		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
			buf = (char __percpu *)alloc_percpu(perf_trace_t);
			if (!buf)
				goto fail;

			perf_trace_buf[i] = buf;
		}
	}

	ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
	if (ret)
		goto fail;

	total_ref_count++;
	return 0;

fail:
	if (!total_ref_count) {
		int i;

		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
			free_percpu(perf_trace_buf[i]);
			perf_trace_buf[i] = NULL;
		}
	}

	if (!--tp_event->perf_refcount) {
		free_percpu(tp_event->perf_events);
		tp_event->perf_events = NULL;
	}

	return ret;
}

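/*
 * Drop the references taken by perf_trace_event_reg() and, once no perf
 * event uses this trace event anymore, free its per-cpu hlist (and, for the
 * last user in the system, the shared buffers).
 */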
static void perf_trace_event_unreg(struct perf_event *p_event)
{
	struct trace_event_call *tp_event = p_event->tp_event;
	int i;

	if (--tp_event->perf_refcount > 0)
		goto out;

	tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);

	/*
	 * Ensure our callback won't be called anymore. The buffers
	 * will be freed after that.
	 */
	tracepoint_synchronize_unregister();

	free_percpu(tp_event->perf_events);
	tp_event->perf_events = NULL;

	if (!--total_ref_count) {
		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
			free_percpu(perf_trace_buf[i]);
			perf_trace_buf[i] = NULL;
		}
	}
out:
	module_put(tp_event->mod);
}

static int perf_trace_event_open(struct perf_event *p_event)
{
	struct trace_event_call *tp_event = p_event->tp_event;
	return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
}

static void perf_trace_event_close(struct perf_event *p_event)
{
	struct trace_event_call *tp_event = p_event->tp_event;
	tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
}

static int perf_trace_event_init(struct trace_event_call *tp_event,
				 struct perf_event *p_event)
{
	int ret;

	ret = perf_trace_event_perm(tp_event, p_event);
	if (ret)
		return ret;

	ret = perf_trace_event_reg(tp_event, p_event);
	if (ret)
		return ret;

	ret = perf_trace_event_open(p_event);
	if (ret) {
		perf_trace_event_unreg(p_event);
		return ret;
	}

	return 0;
}

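/*
 * Entry point used by the perf core for tracepoint events: look up the
 * trace event whose id matches attr.config, pin its module and initialize
 * the perf side of it.
 */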
int perf_trace_init(struct perf_event *p_event)
{
	struct trace_event_call *tp_event;
	u64 event_id = p_event->attr.config;
	int ret = -EINVAL;

	mutex_lock(&event_mutex);
	list_for_each_entry(tp_event, &ftrace_events, list) {
		if (tp_event->event.type == event_id &&
		    tp_event->class && tp_event->class->reg &&
		    try_module_get(tp_event->mod)) {
			ret = perf_trace_event_init(tp_event, p_event);
			if (ret)
				module_put(tp_event->mod);
			break;
		}
	}
	mutex_unlock(&event_mutex);

	return ret;
}

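/* Undo perf_trace_init() when the perf event goes away. */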
void perf_trace_destroy(struct perf_event *p_event)
{
	mutex_lock(&event_mutex);
	perf_trace_event_close(p_event);
	perf_trace_event_unreg(p_event);
	mutex_unlock(&event_mutex);
}

#ifdef CONFIG_KPROBE_EVENTS
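/*
 * Set up a perf-only ("local") kprobe trace event from the attributes of a
 * perf_event_open() call: either a symbol name (attr.kprobe_func) plus
 * offset, or a raw address (attr.kprobe_addr).
 */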
int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
{
	int ret;
	char *func = NULL;
	struct trace_event_call *tp_event;

	if (p_event->attr.kprobe_func) {
		func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL);
		if (!func)
			return -ENOMEM;
		ret = strncpy_from_user(
			func, u64_to_user_ptr(p_event->attr.kprobe_func),
			KSYM_NAME_LEN);
		if (ret == KSYM_NAME_LEN)
			ret = -E2BIG;
		if (ret < 0)
			goto out;

		if (func[0] == '\0') {
			kfree(func);
			func = NULL;
		}
	}

	tp_event = create_local_trace_kprobe(
		func, (void *)(unsigned long)(p_event->attr.kprobe_addr),
		p_event->attr.probe_offset, is_retprobe);
	if (IS_ERR(tp_event)) {
		ret = PTR_ERR(tp_event);
		goto out;
	}

	mutex_lock(&event_mutex);
	ret = perf_trace_event_init(tp_event, p_event);
	if (ret)
		destroy_local_trace_kprobe(tp_event);
	mutex_unlock(&event_mutex);
out:
	kfree(func);
	return ret;
}

void perf_kprobe_destroy(struct perf_event *p_event)
{
	mutex_lock(&event_mutex);
	perf_trace_event_close(p_event);
	perf_trace_event_unreg(p_event);
	mutex_unlock(&event_mutex);

	destroy_local_trace_kprobe(p_event->tp_event);
}
#endif /* CONFIG_KPROBE_EVENTS */

#ifdef CONFIG_UPROBE_EVENTS
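/*
 * Set up a perf-only ("local") uprobe trace event from attr.uprobe_path and
 * attr.probe_offset, optionally with a reference counter offset (used for
 * SDT semaphores).
 */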
int perf_uprobe_init(struct perf_event *p_event,
		     unsigned long ref_ctr_offset, bool is_retprobe)
{
	int ret;
	char *path = NULL;
	struct trace_event_call *tp_event;

	if (!p_event->attr.uprobe_path)
		return -EINVAL;

	path = strndup_user(u64_to_user_ptr(p_event->attr.uprobe_path),
			    PATH_MAX);
	if (IS_ERR(path)) {
		ret = PTR_ERR(path);
		return (ret == -EINVAL) ? -E2BIG : ret;
	}
	if (path[0] == '\0') {
		ret = -EINVAL;
		goto out;
	}

	tp_event = create_local_trace_uprobe(path, p_event->attr.probe_offset,
					     ref_ctr_offset, is_retprobe);
	if (IS_ERR(tp_event)) {
		ret = PTR_ERR(tp_event);
		goto out;
	}

	/*
	 * A local trace_uprobe needs to hold event_mutex to call
	 * uprobe_buffer_enable() and uprobe_buffer_disable().
	 * event_mutex is not required for local trace_kprobes.
	 */
	mutex_lock(&event_mutex);
	ret = perf_trace_event_init(tp_event, p_event);
	if (ret)
		destroy_local_trace_uprobe(tp_event);
	mutex_unlock(&event_mutex);
out:
	kfree(path);
	return ret;
}

void perf_uprobe_destroy(struct perf_event *p_event)
{
	mutex_lock(&event_mutex);
	perf_trace_event_close(p_event);
	perf_trace_event_unreg(p_event);
	mutex_unlock(&event_mutex);
	destroy_local_trace_uprobe(p_event->tp_event);
}
#endif /* CONFIG_UPROBE_EVENTS */

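/*
 * pmu ->add() callback for trace events: unless the event class performs a
 * custom action, queue the event on this CPU's hlist so the tracepoint
 * callback will see it.
 */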
int perf_trace_add(struct perf_event *p_event, int flags)
{
	struct trace_event_call *tp_event = p_event->tp_event;

	if (!(flags & PERF_EF_START))
		p_event->hw.state = PERF_HES_STOPPED;

	/*
	 * If TRACE_REG_PERF_ADD returns false, no custom action was performed
	 * and we need to take the default action of enqueueing our event on
	 * the right per-cpu hlist.
	 */
	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
		struct hlist_head __percpu *pcpu_list;
		struct hlist_head *list;

		pcpu_list = tp_event->perf_events;
		if (WARN_ON_ONCE(!pcpu_list))
			return -EINVAL;

		list = this_cpu_ptr(pcpu_list);
		hlist_add_head_rcu(&p_event->hlist_entry, list);
	}

	return 0;
}

void perf_trace_del(struct perf_event *p_event, int flags)
{
	struct trace_event_call *tp_event = p_event->tp_event;

	/*
	 * If TRACE_REG_PERF_DEL returns false, no custom action was performed
	 * and we need to take the default action of dequeueing our event from
	 * the right per-cpu hlist.
	 */
	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
		hlist_del_rcu(&p_event->hlist_entry);
}

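/*
 * Grab a recursion context and hand out the matching per-cpu scratch buffer
 * for building a raw sample of up to PERF_MAX_TRACE_SIZE bytes. The rctx
 * returned through @rctxp must be passed on to perf_trace_buf_submit().
 */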
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
{
	char *raw_data;
	int rctx;

	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));

	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		      "perf buffer not large enough"))
		return NULL;

	*rctxp = rctx = perf_swevent_get_recursion_context();
	if (rctx < 0)
		return NULL;

	if (regs)
		*regs = this_cpu_ptr(&__perf_regs[rctx]);
	raw_data = this_cpu_ptr(perf_trace_buf[rctx]);

	/*
	 * Zero the trailing bytes added for alignment so we don't leak
	 * stack data to user space.
	 */
	memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
	return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_alloc);
NOKPROBE_SYMBOL(perf_trace_buf_alloc);

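/*
 * Fill in the common trace_entry header (type, irq flags, preempt count)
 * of a record obtained from perf_trace_buf_alloc().
 */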
void perf_trace_buf_update(void *record, u16 type)
{
	struct trace_entry *entry = record;
	int pc = preempt_count();
	unsigned long flags;

	local_save_flags(flags);
	tracing_generic_entry_update(entry, type, flags, pc);
}
NOKPROBE_SYMBOL(perf_trace_buf_update);

#ifdef CONFIG_FUNCTION_TRACER
static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
			  struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
	struct ftrace_entry *entry;
	struct perf_event *event;
	struct hlist_head head;
	struct pt_regs regs;
	int rctx;

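	/*
	 * ops->private holds the CPU this event is currently scheduled on
	 * (see TRACE_REG_PERF_ADD below); ignore calls on any other CPU.
	 */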
	if ((unsigned long)ops->private != smp_processor_id())
		return;

	event = container_of(ops, struct perf_event, ftrace_ops);

	/*
	 * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
	 * the perf code does is hlist_for_each_entry_rcu(), so we can
	 * get away with simply setting the @head.first pointer in order
	 * to create a singular list.
	 */
	head.first = &event->hlist_entry;

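	/*
	 * Presumably sized so that, together with the u32 size header perf
	 * prepends to PERF_SAMPLE_RAW data, the record stays u64 aligned.
	 */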
#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
		    sizeof(u64)) - sizeof(u32))

	BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);

	memset(&regs, 0, sizeof(regs));
	perf_fetch_caller_regs(&regs);

	entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
	if (!entry)
		return;

	entry->ip = ip;
	entry->parent_ip = parent_ip;
	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
			      1, &regs, &head, NULL);

#undef ENTRY_SIZE
}

static int perf_ftrace_function_register(struct perf_event *event)
{
	struct ftrace_ops *ops = &event->ftrace_ops;

	ops->flags   = FTRACE_OPS_FL_RCU;
	ops->func    = perf_ftrace_function_call;
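	/* Start out "unscheduled": nr_cpu_ids is never a valid CPU id. */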
	ops->private = (void *)(unsigned long)nr_cpu_ids;

	return register_ftrace_function(ops);
}

static int perf_ftrace_function_unregister(struct perf_event *event)
{
	struct ftrace_ops *ops = &event->ftrace_ops;
	int ret = unregister_ftrace_function(ops);
	ftrace_free_filter(ops);
	return ret;
}

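/*
 * ->reg() callback for the ftrace "function" trace event. Returning 1 for
 * PERF_ADD/PERF_DEL tells perf_trace_add()/perf_trace_del() that a custom
 * action was taken (the event is bound to a single CPU via ops->private),
 * so the default per-cpu hlist handling must be skipped.
 */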
int perf_ftrace_event_register(struct trace_event_call *call,
			       enum trace_reg type, void *data)
{
	struct perf_event *event = data;

	switch (type) {
	case TRACE_REG_REGISTER:
	case TRACE_REG_UNREGISTER:
		break;
	case TRACE_REG_PERF_REGISTER:
	case TRACE_REG_PERF_UNREGISTER:
		return 0;
	case TRACE_REG_PERF_OPEN:
		return perf_ftrace_function_register(data);
	case TRACE_REG_PERF_CLOSE:
		return perf_ftrace_function_unregister(data);
	case TRACE_REG_PERF_ADD:
		event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
		return 1;
	case TRACE_REG_PERF_DEL:
		event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
		return 1;
	}

	return -EINVAL;
}
#endif /* CONFIG_FUNCTION_TRACER */