Blame - fs/proc/base.c - hafnium/third_party/linux.git

blob: 5a187e9b72212b91993434e58d7ec8de44bb4dff [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* linux/fs/proc/base.c
				4	*
				5	* Copyright (C) 1991, 1992 Linus Torvalds
				6	*
				7	* proc base directory handling functions
				8	*
				9	* 1999, Al Viro. Rewritten. Now it covers the whole per-process part.
				10	* Instead of using magical inumbers to determine the kind of object
				11	* we allocate and fill in-core inodes upon lookup. They don't even
				12	* go into icache. We cache the reference to task_struct upon lookup too.
				13	* Eventually it should become a filesystem in its own. We don't use the
				14	* rest of procfs anymore.
				15	*
				16	*
				17	* Changelog:
				18	* 17-Jan-2005
				19	* Allan Bezerra
				20	* Bruna Moreira <bruna.moreira@indt.org.br>
				21	* Edjard Mota <edjard.mota@indt.org.br>
				22	* Ilias Biris <ilias.biris@indt.org.br>
				23	* Mauricio Lin <mauricio.lin@indt.org.br>
				24	*
				25	* Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
				26	*
				27	* A new process specific entry (smaps) included in /proc. It shows the
				28	* size of rss for each memory area. The maps entry lacks information
				29	* about physical memory size (rss) for each mapped file, i.e.,
				30	* rss information for executables and library files.
				31	* This additional information is useful for any tools that need to know
				32	* about physical memory consumption for a process specific library.
				33	*
				34	* Changelog:
				35	* 21-Feb-2005
				36	* Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
				37	* Pud inclusion in the page table walking.
				38	*
				39	* ChangeLog:
				40	* 10-Mar-2005
				41	* 10LE Instituto Nokia de Tecnologia - INdT:
				42	* A better way to walk through the page table as suggested by Hugh Dickins.
				43	*
				44	* Simo Piiroinen <simo.piiroinen@nokia.com>:
				45	* Smaps information related to shared, private, clean and dirty pages.
				46	*
				47	* Paul Mundt <paul.mundt@nokia.com>:
				48	* Overall revision about smaps.
				49	*/
				50
				51	#include <linux/uaccess.h>
				52
				53	#include <linux/errno.h>
				54	#include <linux/time.h>
				55	#include <linux/proc_fs.h>
				56	#include <linux/stat.h>
				57	#include <linux/task_io_accounting_ops.h>
				58	#include <linux/init.h>
				59	#include <linux/capability.h>
				60	#include <linux/file.h>
				61	#include <linux/fdtable.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	62	#include <linux/generic-radix-tree.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	63	#include <linux/string.h>
				64	#include <linux/seq_file.h>
				65	#include <linux/namei.h>
				66	#include <linux/mnt_namespace.h>
				67	#include <linux/mm.h>
				68	#include <linux/swap.h>
				69	#include <linux/rcupdate.h>
				70	#include <linux/kallsyms.h>
				71	#include <linux/stacktrace.h>
				72	#include <linux/resource.h>
				73	#include <linux/module.h>
				74	#include <linux/mount.h>
				75	#include <linux/security.h>
				76	#include <linux/ptrace.h>
				77	#include <linux/tracehook.h>
				78	#include <linux/printk.h>
				79	#include <linux/cache.h>
				80	#include <linux/cgroup.h>
				81	#include <linux/cpuset.h>
				82	#include <linux/audit.h>
				83	#include <linux/poll.h>
				84	#include <linux/nsproxy.h>
				85	#include <linux/oom.h>
				86	#include <linux/elf.h>
				87	#include <linux/pid_namespace.h>
				88	#include <linux/user_namespace.h>
				89	#include <linux/fs_struct.h>
				90	#include <linux/slab.h>
				91	#include <linux/sched/autogroup.h>
				92	#include <linux/sched/mm.h>
				93	#include <linux/sched/coredump.h>
				94	#include <linux/sched/debug.h>
				95	#include <linux/sched/stat.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	96	#include <linux/posix-timers.h>
				97	#include <trace/events/oom.h>
				98	#include "internal.h"
				99	#include "fd.h"
				100
				101	#include "../../lib/kstrtox.h"
				102
				103	/* NOTE:
				104	* Implementing inode permission operations in /proc is almost
				105	* certainly an error. Permission checks need to happen during
				106	* each system call not at open time. The reason is that most of
				107	* what we wish to check for permissions in /proc varies at runtime.
				108	*
				109	* The classic example of a problem is opening file descriptors
				110	* in /proc for a task before it execs a suid executable.
				111	*/
				112
				113	static u8 nlink_tid __ro_after_init;
				114	static u8 nlink_tgid __ro_after_init;
				115
				116	struct pid_entry {
				117	const char *name;
				118	unsigned int len;
				119	umode_t mode;
				120	const struct inode_operations *iop;
				121	const struct file_operations *fop;
				122	union proc_op op;
				123	};
				124
				125	#define NOD(NAME, MODE, IOP, FOP, OP) { \
				126	.name = (NAME), \
				127	.len = sizeof(NAME) - 1, \
				128	.mode = MODE, \
				129	.iop = IOP, \
				130	.fop = FOP, \
				131	.op = OP, \
				132	}
				133
				134	#define DIR(NAME, MODE, iops, fops) \
				135	NOD(NAME, (S_IFDIR\|(MODE)), &iops, &fops, {} )
				136	#define LNK(NAME, get_link) \
				137	NOD(NAME, (S_IFLNK\|S_IRWXUGO), \
				138	&proc_pid_link_inode_operations, NULL, \
				139	{ .proc_get_link = get_link } )
				140	#define REG(NAME, MODE, fops) \
				141	NOD(NAME, (S_IFREG\|(MODE)), NULL, &fops, {})
				142	#define ONE(NAME, MODE, show) \
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	143	NOD(NAME, (S_IFREG\|(MODE)), \
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	144	NULL, &proc_single_file_operations, \
				145	{ .proc_show = show } )
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	146	#define ATTR(LSM, NAME, MODE) \
				147	NOD(NAME, (S_IFREG\|(MODE)), \
				148	NULL, &proc_pid_attr_operations, \
				149	{ .lsm = LSM })
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	150
				151	/*
				152	* Count the number of hardlinks for the pid_entry table, excluding the .
				153	* and .. links.
				154	*/
				155	static unsigned int __init pid_entry_nlink(const struct pid_entry *entries,
				156	unsigned int n)
				157	{
				158	unsigned int i;
				159	unsigned int count;
				160
				161	count = 2;
				162	for (i = 0; i < n; ++i) {
				163	if (S_ISDIR(entries[i].mode))
				164	++count;
				165	}
				166
				167	return count;
				168	}
				169
				170	static int get_task_root(struct task_struct task, struct path root)
				171	{
				172	int result = -ENOENT;
				173
				174	task_lock(task);
				175	if (task->fs) {
				176	get_fs_root(task->fs, root);
				177	result = 0;
				178	}
				179	task_unlock(task);
				180	return result;
				181	}
				182
				183	static int proc_cwd_link(struct dentry dentry, struct path path)
				184	{
				185	struct task_struct *task = get_proc_task(d_inode(dentry));
				186	int result = -ENOENT;
				187
				188	if (task) {
				189	task_lock(task);
				190	if (task->fs) {
				191	get_fs_pwd(task->fs, path);
				192	result = 0;
				193	}
				194	task_unlock(task);
				195	put_task_struct(task);
				196	}
				197	return result;
				198	}
				199
				200	static int proc_root_link(struct dentry dentry, struct path path)
				201	{
				202	struct task_struct *task = get_proc_task(d_inode(dentry));
				203	int result = -ENOENT;
				204
				205	if (task) {
				206	result = get_task_root(task, path);
				207	put_task_struct(task);
				208	}
				209	return result;
				210	}
				211
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	212	/*
				213	* If the user used setproctitle(), we just get the string from
				214	* user space at arg_start, and limit it to a maximum of one page.
				215	*/
				216	static ssize_t get_mm_proctitle(struct mm_struct mm, char __user buf,
				217	size_t count, unsigned long pos,
				218	unsigned long arg_start)
				219	{
				220	char *page;
				221	int ret, got;
				222
				223	if (pos >= PAGE_SIZE)
				224	return 0;
				225
				226	page = (char *)__get_free_page(GFP_KERNEL);
				227	if (!page)
				228	return -ENOMEM;
				229
				230	ret = 0;
				231	got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON);
				232	if (got > 0) {
				233	int len = strnlen(page, got);
				234
				235	/* Include the NUL character if it was found */
				236	if (len < got)
				237	len++;
				238
				239	if (len > pos) {
				240	len -= pos;
				241	if (len > count)
				242	len = count;
				243	len -= copy_to_user(buf, page+pos, len);
				244	if (!len)
				245	len = -EFAULT;
				246	ret = len;
				247	}
				248	}
				249	free_page((unsigned long)page);
				250	return ret;
				251	}
				252
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	253	static ssize_t get_mm_cmdline(struct mm_struct mm, char __user buf,
				254	size_t count, loff_t *ppos)
				255	{
				256	unsigned long arg_start, arg_end, env_start, env_end;
				257	unsigned long pos, len;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	258	char *page, c;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	259
				260	/* Check if process spawned far enough to have cmdline. */
				261	if (!mm->env_end)
				262	return 0;
				263
				264	spin_lock(&mm->arg_lock);
				265	arg_start = mm->arg_start;
				266	arg_end = mm->arg_end;
				267	env_start = mm->env_start;
				268	env_end = mm->env_end;
				269	spin_unlock(&mm->arg_lock);
				270
				271	if (arg_start >= arg_end)
				272	return 0;
				273
				274	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	275	* We allow setproctitle() to overwrite the argument
				276	* strings, and overflow past the original end. But
				277	* only when it overflows into the environment area.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	278	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	279	if (env_start != arg_end \|\| env_end < env_start)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	280	env_start = env_end = arg_end;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	281	len = env_end - arg_start;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	282
				283	/* We're not going to care if "ppos" has high bits set /
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	284	pos = *ppos;
				285	if (pos >= len)
				286	return 0;
				287	if (count > len - pos)
				288	count = len - pos;
				289	if (!count)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	290	return 0;
				291
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	292	/*
				293	* Magical special case: if the argv[] end byte is not
				294	* zero, the user has overwritten it with setproctitle(3).
				295	*
				296	* Possible future enhancement: do this only once when
				297	* pos is 0, and set a flag in the 'struct file'.
				298	*/
				299	if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c)
				300	return get_mm_proctitle(mm, buf, count, pos, arg_start);
				301
				302	/*
				303	* For the non-setproctitle() case we limit things strictly
				304	* to the [arg_start, arg_end[ range.
				305	*/
				306	pos += arg_start;
				307	if (pos < arg_start \|\| pos >= arg_end)
				308	return 0;
				309	if (count > arg_end - pos)
				310	count = arg_end - pos;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	311
				312	page = (char *)__get_free_page(GFP_KERNEL);
				313	if (!page)
				314	return -ENOMEM;
				315
				316	len = 0;
				317	while (count) {
				318	int got;
				319	size_t size = min_t(size_t, PAGE_SIZE, count);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	320
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	321	got = access_remote_vm(mm, pos, page, size, FOLL_ANON);
				322	if (got <= 0)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	323	break;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	324	got -= copy_to_user(buf, page, got);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	325	if (unlikely(!got)) {
				326	if (!len)
				327	len = -EFAULT;
				328	break;
				329	}
				330	pos += got;
				331	buf += got;
				332	len += got;
				333	count -= got;
				334	}
				335
				336	free_page((unsigned long)page);
				337	return len;
				338	}
				339
				340	static ssize_t get_task_cmdline(struct task_struct tsk, char __user buf,
				341	size_t count, loff_t *pos)
				342	{
				343	struct mm_struct *mm;
				344	ssize_t ret;
				345
				346	mm = get_task_mm(tsk);
				347	if (!mm)
				348	return 0;
				349
				350	ret = get_mm_cmdline(mm, buf, count, pos);
				351	mmput(mm);
				352	return ret;
				353	}
				354
				355	static ssize_t proc_pid_cmdline_read(struct file file, char __user buf,
				356	size_t count, loff_t *pos)
				357	{
				358	struct task_struct *tsk;
				359	ssize_t ret;
				360
				361	BUG_ON(*pos < 0);
				362
				363	tsk = get_proc_task(file_inode(file));
				364	if (!tsk)
				365	return -ESRCH;
				366	ret = get_task_cmdline(tsk, buf, count, pos);
				367	put_task_struct(tsk);
				368	if (ret > 0)
				369	*pos += ret;
				370	return ret;
				371	}
				372
				373	static const struct file_operations proc_pid_cmdline_ops = {
				374	.read = proc_pid_cmdline_read,
				375	.llseek = generic_file_llseek,
				376	};
				377
				378	#ifdef CONFIG_KALLSYMS
				379	/*
				380	* Provides a wchan file via kallsyms in a proper one-value-per-file format.
				381	* Returns the resolved symbol. If that fails, simply return the address.
				382	*/
				383	static int proc_pid_wchan(struct seq_file m, struct pid_namespace ns,
				384	struct pid pid, struct task_struct task)
				385	{
				386	unsigned long wchan;
				387	char symname[KSYM_NAME_LEN];
				388
				389	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
				390	goto print0;
				391
				392	wchan = get_wchan(task);
				393	if (wchan && !lookup_symbol_name(wchan, symname)) {
				394	seq_puts(m, symname);
				395	return 0;
				396	}
				397
				398	print0:
				399	seq_putc(m, '0');
				400	return 0;
				401	}
				402	#endif /* CONFIG_KALLSYMS */
				403
				404	static int lock_trace(struct task_struct *task)
				405	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	406	int err = down_read_killable(&task->signal->exec_update_lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	407	if (err)
				408	return err;
				409	if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	410	up_read(&task->signal->exec_update_lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	411	return -EPERM;
				412	}
				413	return 0;
				414	}
				415
				416	static void unlock_trace(struct task_struct *task)
				417	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	418	up_read(&task->signal->exec_update_lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	419	}
				420
				421	#ifdef CONFIG_STACKTRACE
				422
				423	#define MAX_STACK_TRACE_DEPTH 64
				424
				425	static int proc_pid_stack(struct seq_file m, struct pid_namespace ns,
				426	struct pid pid, struct task_struct task)
				427	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	428	unsigned long *entries;
				429	int err;
				430
				431	/*
				432	* The ability to racily run the kernel stack unwinder on a running task
				433	* and then observe the unwinder output is scary; while it is useful for
				434	* debugging kernel issues, it can also allow an attacker to leak kernel
				435	* stack contents.
				436	* Doing this in a manner that is at least safe from races would require
				437	* some work to ensure that the remote task can not be scheduled; and
				438	* even then, this would still expose the unwinder as local attack
				439	* surface.
				440	* Therefore, this interface is restricted to root.
				441	*/
				442	if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
				443	return -EACCES;
				444
				445	entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries),
				446	GFP_KERNEL);
				447	if (!entries)
				448	return -ENOMEM;
				449
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	450	err = lock_trace(task);
				451	if (!err) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	452	unsigned int i, nr_entries;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	453
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	454	nr_entries = stack_trace_save_tsk(task, entries,
				455	MAX_STACK_TRACE_DEPTH, 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	456
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	457	for (i = 0; i < nr_entries; i++) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	458	seq_printf(m, "[<0>] %pB\n", (void *)entries[i]);
				459	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	460
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	461	unlock_trace(task);
				462	}
				463	kfree(entries);
				464
				465	return err;
				466	}
				467	#endif
				468
				469	#ifdef CONFIG_SCHED_INFO
				470	/*
				471	* Provides /proc/PID/schedstat
				472	*/
				473	static int proc_pid_schedstat(struct seq_file m, struct pid_namespace ns,
				474	struct pid pid, struct task_struct task)
				475	{
				476	if (unlikely(!sched_info_on()))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	477	seq_puts(m, "0 0 0\n");
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	478	else
				479	seq_printf(m, "%llu %llu %lu\n",
				480	(unsigned long long)task->se.sum_exec_runtime,
				481	(unsigned long long)task->sched_info.run_delay,
				482	task->sched_info.pcount);
				483
				484	return 0;
				485	}
				486	#endif
				487
				488	#ifdef CONFIG_LATENCYTOP
				489	static int lstats_show_proc(struct seq_file m, void v)
				490	{
				491	int i;
				492	struct inode *inode = m->private;
				493	struct task_struct *task = get_proc_task(inode);
				494
				495	if (!task)
				496	return -ESRCH;
				497	seq_puts(m, "Latency Top version : v0.1\n");
				498	for (i = 0; i < LT_SAVECOUNT; i++) {
				499	struct latency_record *lr = &task->latency_record[i];
				500	if (lr->backtrace[0]) {
				501	int q;
				502	seq_printf(m, "%i %li %li",
				503	lr->count, lr->time, lr->max);
				504	for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
				505	unsigned long bt = lr->backtrace[q];
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	506
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	507	if (!bt)
				508	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	509	seq_printf(m, " %ps", (void *)bt);
				510	}
				511	seq_putc(m, '\n');
				512	}
				513
				514	}
				515	put_task_struct(task);
				516	return 0;
				517	}
				518
				519	static int lstats_open(struct inode inode, struct file file)
				520	{
				521	return single_open(file, lstats_show_proc, inode);
				522	}
				523
				524	static ssize_t lstats_write(struct file file, const char __user buf,
				525	size_t count, loff_t *offs)
				526	{
				527	struct task_struct *task = get_proc_task(file_inode(file));
				528
				529	if (!task)
				530	return -ESRCH;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	531	clear_tsk_latency_tracing(task);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	532	put_task_struct(task);
				533
				534	return count;
				535	}
				536
				537	static const struct file_operations proc_lstats_operations = {
				538	.open = lstats_open,
				539	.read = seq_read,
				540	.write = lstats_write,
				541	.llseek = seq_lseek,
				542	.release = single_release,
				543	};
				544
				545	#endif
				546
				547	static int proc_oom_score(struct seq_file m, struct pid_namespace ns,
				548	struct pid pid, struct task_struct task)
				549	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	550	unsigned long totalpages = totalram_pages() + total_swap_pages;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	551	unsigned long points = 0;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	552	long badness;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	553
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	554	badness = oom_badness(task, totalpages);
				555	/*
				556	* Special case OOM_SCORE_ADJ_MIN for all others scale the
				557	* badness value into [0, 2000] range which we have been
				558	* exporting for a long time so userspace might depend on it.
				559	*/
				560	if (badness != LONG_MIN)
				561	points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3;
				562
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	563	seq_printf(m, "%lu\n", points);
				564
				565	return 0;
				566	}
				567
				568	struct limit_names {
				569	const char *name;
				570	const char *unit;
				571	};
				572
				573	static const struct limit_names lnames[RLIM_NLIMITS] = {
				574	[RLIMIT_CPU] = {"Max cpu time", "seconds"},
				575	[RLIMIT_FSIZE] = {"Max file size", "bytes"},
				576	[RLIMIT_DATA] = {"Max data size", "bytes"},
				577	[RLIMIT_STACK] = {"Max stack size", "bytes"},
				578	[RLIMIT_CORE] = {"Max core file size", "bytes"},
				579	[RLIMIT_RSS] = {"Max resident set", "bytes"},
				580	[RLIMIT_NPROC] = {"Max processes", "processes"},
				581	[RLIMIT_NOFILE] = {"Max open files", "files"},
				582	[RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"},
				583	[RLIMIT_AS] = {"Max address space", "bytes"},
				584	[RLIMIT_LOCKS] = {"Max file locks", "locks"},
				585	[RLIMIT_SIGPENDING] = {"Max pending signals", "signals"},
				586	[RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"},
				587	[RLIMIT_NICE] = {"Max nice priority", NULL},
				588	[RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
				589	[RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
				590	};
				591
				592	/* Display limits for a process */
				593	static int proc_pid_limits(struct seq_file m, struct pid_namespace ns,
				594	struct pid pid, struct task_struct task)
				595	{
				596	unsigned int i;
				597	unsigned long flags;
				598
				599	struct rlimit rlim[RLIM_NLIMITS];
				600
				601	if (!lock_task_sighand(task, &flags))
				602	return 0;
				603	memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
				604	unlock_task_sighand(task, &flags);
				605
				606	/*
				607	* print the file header
				608	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	609	seq_puts(m, "Limit "
				610	"Soft Limit "
				611	"Hard Limit "
				612	"Units \n");
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	613
				614	for (i = 0; i < RLIM_NLIMITS; i++) {
				615	if (rlim[i].rlim_cur == RLIM_INFINITY)
				616	seq_printf(m, "%-25s %-20s ",
				617	lnames[i].name, "unlimited");
				618	else
				619	seq_printf(m, "%-25s %-20lu ",
				620	lnames[i].name, rlim[i].rlim_cur);
				621
				622	if (rlim[i].rlim_max == RLIM_INFINITY)
				623	seq_printf(m, "%-20s ", "unlimited");
				624	else
				625	seq_printf(m, "%-20lu ", rlim[i].rlim_max);
				626
				627	if (lnames[i].unit)
				628	seq_printf(m, "%-10s\n", lnames[i].unit);
				629	else
				630	seq_putc(m, '\n');
				631	}
				632
				633	return 0;
				634	}
				635
				636	#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
				637	static int proc_pid_syscall(struct seq_file m, struct pid_namespace ns,
				638	struct pid pid, struct task_struct task)
				639	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	640	struct syscall_info info;
				641	u64 *args = &info.data.args[0];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	642	int res;
				643
				644	res = lock_trace(task);
				645	if (res)
				646	return res;
				647
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	648	if (task_current_syscall(task, &info))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	649	seq_puts(m, "running\n");
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	650	else if (info.data.nr < 0)
				651	seq_printf(m, "%d 0x%llx 0x%llx\n",
				652	info.data.nr, info.sp, info.data.instruction_pointer);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	653	else
				654	seq_printf(m,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	655	"%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n",
				656	info.data.nr,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	657	args[0], args[1], args[2], args[3], args[4], args[5],
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	658	info.sp, info.data.instruction_pointer);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	659	unlock_trace(task);
				660
				661	return 0;
				662	}
				663	#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
				664
				665	/************************************************************************/
				666	/* Here the fs part begins */
				667	/************************************************************************/
				668
				669	/* permission checks */
				670	static int proc_fd_access_allowed(struct inode *inode)
				671	{
				672	struct task_struct *task;
				673	int allowed = 0;
				674	/* Allow access to a task's file descriptors if it is us or we
				675	* may use ptrace attach to the process and find out that
				676	* information.
				677	*/
				678	task = get_proc_task(inode);
				679	if (task) {
				680	allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
				681	put_task_struct(task);
				682	}
				683	return allowed;
				684	}
				685
				686	int proc_setattr(struct dentry dentry, struct iattr attr)
				687	{
				688	int error;
				689	struct inode *inode = d_inode(dentry);
				690
				691	if (attr->ia_valid & ATTR_MODE)
				692	return -EPERM;
				693
				694	error = setattr_prepare(dentry, attr);
				695	if (error)
				696	return error;
				697
				698	setattr_copy(inode, attr);
				699	mark_inode_dirty(inode);
				700	return 0;
				701	}
				702
				703	/*
				704	* May current process learn task's sched/cmdline info (for hide_pid_min=1)
				705	* or euid/egid (for hide_pid_min=2)?
				706	*/
				707	static bool has_pid_permissions(struct pid_namespace *pid,
				708	struct task_struct *task,
				709	int hide_pid_min)
				710	{
				711	if (pid->hide_pid < hide_pid_min)
				712	return true;
				713	if (in_group_p(pid->pid_gid))
				714	return true;
				715	return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
				716	}
				717
				718
				719	static int proc_pid_permission(struct inode *inode, int mask)
				720	{
				721	struct pid_namespace *pid = proc_pid_ns(inode);
				722	struct task_struct *task;
				723	bool has_perms;
				724
				725	task = get_proc_task(inode);
				726	if (!task)
				727	return -ESRCH;
				728	has_perms = has_pid_permissions(pid, task, HIDEPID_NO_ACCESS);
				729	put_task_struct(task);
				730
				731	if (!has_perms) {
				732	if (pid->hide_pid == HIDEPID_INVISIBLE) {
				733	/*
				734	* Let's make getdents(), stat(), and open()
				735	* consistent with each other. If a process
				736	* may not stat() a file, it shouldn't be seen
				737	* in procfs at all.
				738	*/
				739	return -ENOENT;
				740	}
				741
				742	return -EPERM;
				743	}
				744	return generic_permission(inode, mask);
				745	}
				746
				747
				748
				749	static const struct inode_operations proc_def_inode_operations = {
				750	.setattr = proc_setattr,
				751	};
				752
				753	static int proc_single_show(struct seq_file m, void v)
				754	{
				755	struct inode *inode = m->private;
				756	struct pid_namespace *ns = proc_pid_ns(inode);
				757	struct pid *pid = proc_pid(inode);
				758	struct task_struct *task;
				759	int ret;
				760
				761	task = get_pid_task(pid, PIDTYPE_PID);
				762	if (!task)
				763	return -ESRCH;
				764
				765	ret = PROC_I(inode)->op.proc_show(m, ns, pid, task);
				766
				767	put_task_struct(task);
				768	return ret;
				769	}
				770
				771	static int proc_single_open(struct inode inode, struct file filp)
				772	{
				773	return single_open(filp, proc_single_show, inode);
				774	}
				775
				776	static const struct file_operations proc_single_file_operations = {
				777	.open = proc_single_open,
				778	.read = seq_read,
				779	.llseek = seq_lseek,
				780	.release = single_release,
				781	};
				782
				783
				784	struct mm_struct proc_mem_open(struct inode inode, unsigned int mode)
				785	{
				786	struct task_struct *task = get_proc_task(inode);
				787	struct mm_struct *mm = ERR_PTR(-ESRCH);
				788
				789	if (task) {
				790	mm = mm_access(task, mode \| PTRACE_MODE_FSCREDS);
				791	put_task_struct(task);
				792
				793	if (!IS_ERR_OR_NULL(mm)) {
				794	/* ensure this mm_struct can't be freed */
				795	mmgrab(mm);
				796	/* but do not pin its memory */
				797	mmput(mm);
				798	}
				799	}
				800
				801	return mm;
				802	}
				803
				804	static int __mem_open(struct inode inode, struct file file, unsigned int mode)
				805	{
				806	struct mm_struct *mm = proc_mem_open(inode, mode);
				807
				808	if (IS_ERR(mm))
				809	return PTR_ERR(mm);
				810
				811	file->private_data = mm;
				812	return 0;
				813	}
				814
				815	static int mem_open(struct inode inode, struct file file)
				816	{
				817	int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
				818
				819	/* OK to pass negative loff_t, we can catch out-of-range */
				820	file->f_mode \|= FMODE_UNSIGNED_OFFSET;
				821
				822	return ret;
				823	}
				824
				825	static ssize_t mem_rw(struct file file, char __user buf,
				826	size_t count, loff_t *ppos, int write)
				827	{
				828	struct mm_struct *mm = file->private_data;
				829	unsigned long addr = *ppos;
				830	ssize_t copied;
				831	char *page;
				832	unsigned int flags;
				833
				834	if (!mm)
				835	return 0;
				836
				837	page = (char *)__get_free_page(GFP_KERNEL);
				838	if (!page)
				839	return -ENOMEM;
				840
				841	copied = 0;
				842	if (!mmget_not_zero(mm))
				843	goto free;
				844
				845	flags = FOLL_FORCE \| (write ? FOLL_WRITE : 0);
				846
				847	while (count > 0) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	848	size_t this_len = min_t(size_t, count, PAGE_SIZE);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	849
				850	if (write && copy_from_user(page, buf, this_len)) {
				851	copied = -EFAULT;
				852	break;
				853	}
				854
				855	this_len = access_remote_vm(mm, addr, page, this_len, flags);
				856	if (!this_len) {
				857	if (!copied)
				858	copied = -EIO;
				859	break;
				860	}
				861
				862	if (!write && copy_to_user(buf, page, this_len)) {
				863	copied = -EFAULT;
				864	break;
				865	}
				866
				867	buf += this_len;
				868	addr += this_len;
				869	copied += this_len;
				870	count -= this_len;
				871	}
				872	*ppos = addr;
				873
				874	mmput(mm);
				875	free:
				876	free_page((unsigned long) page);
				877	return copied;
				878	}
				879
				880	static ssize_t mem_read(struct file file, char __user buf,
				881	size_t count, loff_t *ppos)
				882	{
				883	return mem_rw(file, buf, count, ppos, 0);
				884	}
				885
				886	static ssize_t mem_write(struct file file, const char __user buf,
				887	size_t count, loff_t *ppos)
				888	{
				889	return mem_rw(file, (char __user*)buf, count, ppos, 1);
				890	}
				891
				892	loff_t mem_lseek(struct file *file, loff_t offset, int orig)
				893	{
				894	switch (orig) {
				895	case 0:
				896	file->f_pos = offset;
				897	break;
				898	case 1:
				899	file->f_pos += offset;
				900	break;
				901	default:
				902	return -EINVAL;
				903	}
				904	force_successful_syscall_return();
				905	return file->f_pos;
				906	}
				907
				908	static int mem_release(struct inode inode, struct file file)
				909	{
				910	struct mm_struct *mm = file->private_data;
				911	if (mm)
				912	mmdrop(mm);
				913	return 0;
				914	}
				915
				916	static const struct file_operations proc_mem_operations = {
				917	.llseek = mem_lseek,
				918	.read = mem_read,
				919	.write = mem_write,
				920	.open = mem_open,
				921	.release = mem_release,
				922	};
				923
				924	static int environ_open(struct inode inode, struct file file)
				925	{
				926	return __mem_open(inode, file, PTRACE_MODE_READ);
				927	}
				928
				929	static ssize_t environ_read(struct file file, char __user buf,
				930	size_t count, loff_t *ppos)
				931	{
				932	char *page;
				933	unsigned long src = *ppos;
				934	int ret = 0;
				935	struct mm_struct *mm = file->private_data;
				936	unsigned long env_start, env_end;
				937
				938	/* Ensure the process spawned far enough to have an environment. */
				939	if (!mm \|\| !mm->env_end)
				940	return 0;
				941
				942	page = (char *)__get_free_page(GFP_KERNEL);
				943	if (!page)
				944	return -ENOMEM;
				945
				946	ret = 0;
				947	if (!mmget_not_zero(mm))
				948	goto free;
				949
				950	spin_lock(&mm->arg_lock);
				951	env_start = mm->env_start;
				952	env_end = mm->env_end;
				953	spin_unlock(&mm->arg_lock);
				954
				955	while (count > 0) {
				956	size_t this_len, max_len;
				957	int retval;
				958
				959	if (src >= (env_end - env_start))
				960	break;
				961
				962	this_len = env_end - (env_start + src);
				963
				964	max_len = min_t(size_t, PAGE_SIZE, count);
				965	this_len = min(max_len, this_len);
				966
				967	retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON);
				968
				969	if (retval <= 0) {
				970	ret = retval;
				971	break;
				972	}
				973
				974	if (copy_to_user(buf, page, retval)) {
				975	ret = -EFAULT;
				976	break;
				977	}
				978
				979	ret += retval;
				980	src += retval;
				981	buf += retval;
				982	count -= retval;
				983	}
				984	*ppos = src;
				985	mmput(mm);
				986
				987	free:
				988	free_page((unsigned long) page);
				989	return ret;
				990	}
				991
				992	static const struct file_operations proc_environ_operations = {
				993	.open = environ_open,
				994	.read = environ_read,
				995	.llseek = generic_file_llseek,
				996	.release = mem_release,
				997	};
				998
				999	static int auxv_open(struct inode inode, struct file file)
				1000	{
				1001	return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
				1002	}
				1003
				1004	static ssize_t auxv_read(struct file file, char __user buf,
				1005	size_t count, loff_t *ppos)
				1006	{
				1007	struct mm_struct *mm = file->private_data;
				1008	unsigned int nwords = 0;
				1009
				1010	if (!mm)
				1011	return 0;
				1012	do {
				1013	nwords += 2;
				1014	} while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
				1015	return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv,
				1016	nwords * sizeof(mm->saved_auxv[0]));
				1017	}
				1018
				1019	static const struct file_operations proc_auxv_operations = {
				1020	.open = auxv_open,
				1021	.read = auxv_read,
				1022	.llseek = generic_file_llseek,
				1023	.release = mem_release,
				1024	};
				1025
				1026	static ssize_t oom_adj_read(struct file file, char __user buf, size_t count,
				1027	loff_t *ppos)
				1028	{
				1029	struct task_struct *task = get_proc_task(file_inode(file));
				1030	char buffer[PROC_NUMBUF];
				1031	int oom_adj = OOM_ADJUST_MIN;
				1032	size_t len;
				1033
				1034	if (!task)
				1035	return -ESRCH;
				1036	if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
				1037	oom_adj = OOM_ADJUST_MAX;
				1038	else
				1039	oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
				1040	OOM_SCORE_ADJ_MAX;
				1041	put_task_struct(task);
				1042	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
				1043	return simple_read_from_buffer(buf, count, ppos, buffer, len);
				1044	}
				1045
				1046	static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
				1047	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1048	struct mm_struct *mm = NULL;
				1049	struct task_struct *task;
				1050	int err = 0;
				1051
				1052	task = get_proc_task(file_inode(file));
				1053	if (!task)
				1054	return -ESRCH;
				1055
				1056	mutex_lock(&oom_adj_mutex);
				1057	if (legacy) {
				1058	if (oom_adj < task->signal->oom_score_adj &&
				1059	!capable(CAP_SYS_RESOURCE)) {
				1060	err = -EACCES;
				1061	goto err_unlock;
				1062	}
				1063	/*
				1064	* /proc/pid/oom_adj is provided for legacy purposes, ask users to use
				1065	* /proc/pid/oom_score_adj instead.
				1066	*/
				1067	pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
				1068	current->comm, task_pid_nr(current), task_pid_nr(task),
				1069	task_pid_nr(task));
				1070	} else {
				1071	if ((short)oom_adj < task->signal->oom_score_adj_min &&
				1072	!capable(CAP_SYS_RESOURCE)) {
				1073	err = -EACCES;
				1074	goto err_unlock;
				1075	}
				1076	}
				1077
				1078	/*
				1079	* Make sure we will check other processes sharing the mm if this is
				1080	* not vfrok which wants its own oom_score_adj.
				1081	* pin the mm so it doesn't go away and get reused after task_unlock
				1082	*/
				1083	if (!task->vfork_done) {
				1084	struct task_struct *p = find_lock_task_mm(task);
				1085
				1086	if (p) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1087	if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1088	mm = p->mm;
				1089	mmgrab(mm);
				1090	}
				1091	task_unlock(p);
				1092	}
				1093	}
				1094
				1095	task->signal->oom_score_adj = oom_adj;
				1096	if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
				1097	task->signal->oom_score_adj_min = (short)oom_adj;
				1098	trace_oom_score_adj_update(task);
				1099
				1100	if (mm) {
				1101	struct task_struct *p;
				1102
				1103	rcu_read_lock();
				1104	for_each_process(p) {
				1105	if (same_thread_group(task, p))
				1106	continue;
				1107
				1108	/* do not touch kernel threads or the global init */
				1109	if (p->flags & PF_KTHREAD \|\| is_global_init(p))
				1110	continue;
				1111
				1112	task_lock(p);
				1113	if (!p->vfork_done && process_shares_mm(p, mm)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1114	p->signal->oom_score_adj = oom_adj;
				1115	if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
				1116	p->signal->oom_score_adj_min = (short)oom_adj;
				1117	}
				1118	task_unlock(p);
				1119	}
				1120	rcu_read_unlock();
				1121	mmdrop(mm);
				1122	}
				1123	err_unlock:
				1124	mutex_unlock(&oom_adj_mutex);
				1125	put_task_struct(task);
				1126	return err;
				1127	}
				1128
				1129	/*
				1130	* /proc/pid/oom_adj exists solely for backwards compatibility with previous
				1131	* kernels. The effective policy is defined by oom_score_adj, which has a
				1132	* different scale: oom_adj grew exponentially and oom_score_adj grows linearly.
				1133	* Values written to oom_adj are simply mapped linearly to oom_score_adj.
				1134	* Processes that become oom disabled via oom_adj will still be oom disabled
				1135	* with this implementation.
				1136	*
				1137	* oom_adj cannot be removed since existing userspace binaries use it.
				1138	*/
				1139	static ssize_t oom_adj_write(struct file file, const char __user buf,
				1140	size_t count, loff_t *ppos)
				1141	{
				1142	char buffer[PROC_NUMBUF];
				1143	int oom_adj;
				1144	int err;
				1145
				1146	memset(buffer, 0, sizeof(buffer));
				1147	if (count > sizeof(buffer) - 1)
				1148	count = sizeof(buffer) - 1;
				1149	if (copy_from_user(buffer, buf, count)) {
				1150	err = -EFAULT;
				1151	goto out;
				1152	}
				1153
				1154	err = kstrtoint(strstrip(buffer), 0, &oom_adj);
				1155	if (err)
				1156	goto out;
				1157	if ((oom_adj < OOM_ADJUST_MIN \|\| oom_adj > OOM_ADJUST_MAX) &&
				1158	oom_adj != OOM_DISABLE) {
				1159	err = -EINVAL;
				1160	goto out;
				1161	}
				1162
				1163	/*
				1164	* Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
				1165	* value is always attainable.
				1166	*/
				1167	if (oom_adj == OOM_ADJUST_MAX)
				1168	oom_adj = OOM_SCORE_ADJ_MAX;
				1169	else
				1170	oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
				1171
				1172	err = __set_oom_adj(file, oom_adj, true);
				1173	out:
				1174	return err < 0 ? err : count;
				1175	}
				1176
				1177	static const struct file_operations proc_oom_adj_operations = {
				1178	.read = oom_adj_read,
				1179	.write = oom_adj_write,
				1180	.llseek = generic_file_llseek,
				1181	};
				1182
				1183	static ssize_t oom_score_adj_read(struct file file, char __user buf,
				1184	size_t count, loff_t *ppos)
				1185	{
				1186	struct task_struct *task = get_proc_task(file_inode(file));
				1187	char buffer[PROC_NUMBUF];
				1188	short oom_score_adj = OOM_SCORE_ADJ_MIN;
				1189	size_t len;
				1190
				1191	if (!task)
				1192	return -ESRCH;
				1193	oom_score_adj = task->signal->oom_score_adj;
				1194	put_task_struct(task);
				1195	len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
				1196	return simple_read_from_buffer(buf, count, ppos, buffer, len);
				1197	}
				1198
				1199	static ssize_t oom_score_adj_write(struct file file, const char __user buf,
				1200	size_t count, loff_t *ppos)
				1201	{
				1202	char buffer[PROC_NUMBUF];
				1203	int oom_score_adj;
				1204	int err;
				1205
				1206	memset(buffer, 0, sizeof(buffer));
				1207	if (count > sizeof(buffer) - 1)
				1208	count = sizeof(buffer) - 1;
				1209	if (copy_from_user(buffer, buf, count)) {
				1210	err = -EFAULT;
				1211	goto out;
				1212	}
				1213
				1214	err = kstrtoint(strstrip(buffer), 0, &oom_score_adj);
				1215	if (err)
				1216	goto out;
				1217	if (oom_score_adj < OOM_SCORE_ADJ_MIN \|\|
				1218	oom_score_adj > OOM_SCORE_ADJ_MAX) {
				1219	err = -EINVAL;
				1220	goto out;
				1221	}
				1222
				1223	err = __set_oom_adj(file, oom_score_adj, false);
				1224	out:
				1225	return err < 0 ? err : count;
				1226	}
				1227
				1228	static const struct file_operations proc_oom_score_adj_operations = {
				1229	.read = oom_score_adj_read,
				1230	.write = oom_score_adj_write,
				1231	.llseek = default_llseek,
				1232	};
				1233
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1234	#ifdef CONFIG_AUDIT
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1235	#define TMPBUFLEN 11
				1236	static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
				1237	size_t count, loff_t *ppos)
				1238	{
				1239	struct inode * inode = file_inode(file);
				1240	struct task_struct *task = get_proc_task(inode);
				1241	ssize_t length;
				1242	char tmpbuf[TMPBUFLEN];
				1243
				1244	if (!task)
				1245	return -ESRCH;
				1246	length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
				1247	from_kuid(file->f_cred->user_ns,
				1248	audit_get_loginuid(task)));
				1249	put_task_struct(task);
				1250	return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
				1251	}
				1252
				1253	static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
				1254	size_t count, loff_t *ppos)
				1255	{
				1256	struct inode * inode = file_inode(file);
				1257	uid_t loginuid;
				1258	kuid_t kloginuid;
				1259	int rv;
				1260
				1261	rcu_read_lock();
				1262	if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
				1263	rcu_read_unlock();
				1264	return -EPERM;
				1265	}
				1266	rcu_read_unlock();
				1267
				1268	if (*ppos != 0) {
				1269	/* No partial writes. */
				1270	return -EINVAL;
				1271	}
				1272
				1273	rv = kstrtou32_from_user(buf, count, 10, &loginuid);
				1274	if (rv < 0)
				1275	return rv;
				1276
				1277	/* is userspace tring to explicitly UNSET the loginuid? */
				1278	if (loginuid == AUDIT_UID_UNSET) {
				1279	kloginuid = INVALID_UID;
				1280	} else {
				1281	kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
				1282	if (!uid_valid(kloginuid))
				1283	return -EINVAL;
				1284	}
				1285
				1286	rv = audit_set_loginuid(kloginuid);
				1287	if (rv < 0)
				1288	return rv;
				1289	return count;
				1290	}
				1291
				1292	static const struct file_operations proc_loginuid_operations = {
				1293	.read = proc_loginuid_read,
				1294	.write = proc_loginuid_write,
				1295	.llseek = generic_file_llseek,
				1296	};
				1297
				1298	static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
				1299	size_t count, loff_t *ppos)
				1300	{
				1301	struct inode * inode = file_inode(file);
				1302	struct task_struct *task = get_proc_task(inode);
				1303	ssize_t length;
				1304	char tmpbuf[TMPBUFLEN];
				1305
				1306	if (!task)
				1307	return -ESRCH;
				1308	length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
				1309	audit_get_sessionid(task));
				1310	put_task_struct(task);
				1311	return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
				1312	}
				1313
				1314	static const struct file_operations proc_sessionid_operations = {
				1315	.read = proc_sessionid_read,
				1316	.llseek = generic_file_llseek,
				1317	};
				1318	#endif
				1319
				1320	#ifdef CONFIG_FAULT_INJECTION
				1321	static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
				1322	size_t count, loff_t *ppos)
				1323	{
				1324	struct task_struct *task = get_proc_task(file_inode(file));
				1325	char buffer[PROC_NUMBUF];
				1326	size_t len;
				1327	int make_it_fail;
				1328
				1329	if (!task)
				1330	return -ESRCH;
				1331	make_it_fail = task->make_it_fail;
				1332	put_task_struct(task);
				1333
				1334	len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail);
				1335
				1336	return simple_read_from_buffer(buf, count, ppos, buffer, len);
				1337	}
				1338
				1339	static ssize_t proc_fault_inject_write(struct file * file,
				1340	const char __user * buf, size_t count, loff_t *ppos)
				1341	{
				1342	struct task_struct *task;
				1343	char buffer[PROC_NUMBUF];
				1344	int make_it_fail;
				1345	int rv;
				1346
				1347	if (!capable(CAP_SYS_RESOURCE))
				1348	return -EPERM;
				1349	memset(buffer, 0, sizeof(buffer));
				1350	if (count > sizeof(buffer) - 1)
				1351	count = sizeof(buffer) - 1;
				1352	if (copy_from_user(buffer, buf, count))
				1353	return -EFAULT;
				1354	rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
				1355	if (rv < 0)
				1356	return rv;
				1357	if (make_it_fail < 0 \|\| make_it_fail > 1)
				1358	return -EINVAL;
				1359
				1360	task = get_proc_task(file_inode(file));
				1361	if (!task)
				1362	return -ESRCH;
				1363	task->make_it_fail = make_it_fail;
				1364	put_task_struct(task);
				1365
				1366	return count;
				1367	}
				1368
				1369	static const struct file_operations proc_fault_inject_operations = {
				1370	.read = proc_fault_inject_read,
				1371	.write = proc_fault_inject_write,
				1372	.llseek = generic_file_llseek,
				1373	};
				1374
				1375	static ssize_t proc_fail_nth_write(struct file file, const char __user buf,
				1376	size_t count, loff_t *ppos)
				1377	{
				1378	struct task_struct *task;
				1379	int err;
				1380	unsigned int n;
				1381
				1382	err = kstrtouint_from_user(buf, count, 0, &n);
				1383	if (err)
				1384	return err;
				1385
				1386	task = get_proc_task(file_inode(file));
				1387	if (!task)
				1388	return -ESRCH;
				1389	task->fail_nth = n;
				1390	put_task_struct(task);
				1391
				1392	return count;
				1393	}
				1394
				1395	static ssize_t proc_fail_nth_read(struct file file, char __user buf,
				1396	size_t count, loff_t *ppos)
				1397	{
				1398	struct task_struct *task;
				1399	char numbuf[PROC_NUMBUF];
				1400	ssize_t len;
				1401
				1402	task = get_proc_task(file_inode(file));
				1403	if (!task)
				1404	return -ESRCH;
				1405	len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth);
				1406	put_task_struct(task);
				1407	return simple_read_from_buffer(buf, count, ppos, numbuf, len);
				1408	}
				1409
				1410	static const struct file_operations proc_fail_nth_operations = {
				1411	.read = proc_fail_nth_read,
				1412	.write = proc_fail_nth_write,
				1413	};
				1414	#endif
				1415
				1416
				1417	#ifdef CONFIG_SCHED_DEBUG
				1418	/*
				1419	* Print out various scheduling related per-task fields:
				1420	*/
				1421	static int sched_show(struct seq_file m, void v)
				1422	{
				1423	struct inode *inode = m->private;
				1424	struct pid_namespace *ns = proc_pid_ns(inode);
				1425	struct task_struct *p;
				1426
				1427	p = get_proc_task(inode);
				1428	if (!p)
				1429	return -ESRCH;
				1430	proc_sched_show_task(p, ns, m);
				1431
				1432	put_task_struct(p);
				1433
				1434	return 0;
				1435	}
				1436
				1437	static ssize_t
				1438	sched_write(struct file file, const char __user buf,
				1439	size_t count, loff_t *offset)
				1440	{
				1441	struct inode *inode = file_inode(file);
				1442	struct task_struct *p;
				1443
				1444	p = get_proc_task(inode);
				1445	if (!p)
				1446	return -ESRCH;
				1447	proc_sched_set_task(p);
				1448
				1449	put_task_struct(p);
				1450
				1451	return count;
				1452	}
				1453
				1454	static int sched_open(struct inode inode, struct file filp)
				1455	{
				1456	return single_open(filp, sched_show, inode);
				1457	}
				1458
				1459	static const struct file_operations proc_pid_sched_operations = {
				1460	.open = sched_open,
				1461	.read = seq_read,
				1462	.write = sched_write,
				1463	.llseek = seq_lseek,
				1464	.release = single_release,
				1465	};
				1466
				1467	#endif
				1468
				1469	#ifdef CONFIG_SCHED_AUTOGROUP
				1470	/*
				1471	* Print out autogroup related information:
				1472	*/
				1473	static int sched_autogroup_show(struct seq_file m, void v)
				1474	{
				1475	struct inode *inode = m->private;
				1476	struct task_struct *p;
				1477
				1478	p = get_proc_task(inode);
				1479	if (!p)
				1480	return -ESRCH;
				1481	proc_sched_autogroup_show_task(p, m);
				1482
				1483	put_task_struct(p);
				1484
				1485	return 0;
				1486	}
				1487
				1488	static ssize_t
				1489	sched_autogroup_write(struct file file, const char __user buf,
				1490	size_t count, loff_t *offset)
				1491	{
				1492	struct inode *inode = file_inode(file);
				1493	struct task_struct *p;
				1494	char buffer[PROC_NUMBUF];
				1495	int nice;
				1496	int err;
				1497
				1498	memset(buffer, 0, sizeof(buffer));
				1499	if (count > sizeof(buffer) - 1)
				1500	count = sizeof(buffer) - 1;
				1501	if (copy_from_user(buffer, buf, count))
				1502	return -EFAULT;
				1503
				1504	err = kstrtoint(strstrip(buffer), 0, &nice);
				1505	if (err < 0)
				1506	return err;
				1507
				1508	p = get_proc_task(inode);
				1509	if (!p)
				1510	return -ESRCH;
				1511
				1512	err = proc_sched_autogroup_set_nice(p, nice);
				1513	if (err)
				1514	count = err;
				1515
				1516	put_task_struct(p);
				1517
				1518	return count;
				1519	}
				1520
				1521	static int sched_autogroup_open(struct inode inode, struct file filp)
				1522	{
				1523	int ret;
				1524
				1525	ret = single_open(filp, sched_autogroup_show, NULL);
				1526	if (!ret) {
				1527	struct seq_file *m = filp->private_data;
				1528
				1529	m->private = inode;
				1530	}
				1531	return ret;
				1532	}
				1533
				1534	static const struct file_operations proc_pid_sched_autogroup_operations = {
				1535	.open = sched_autogroup_open,
				1536	.read = seq_read,
				1537	.write = sched_autogroup_write,
				1538	.llseek = seq_lseek,
				1539	.release = single_release,
				1540	};
				1541
				1542	#endif /* CONFIG_SCHED_AUTOGROUP */
				1543
				1544	static ssize_t comm_write(struct file file, const char __user buf,
				1545	size_t count, loff_t *offset)
				1546	{
				1547	struct inode *inode = file_inode(file);
				1548	struct task_struct *p;
				1549	char buffer[TASK_COMM_LEN];
				1550	const size_t maxlen = sizeof(buffer) - 1;
				1551
				1552	memset(buffer, 0, sizeof(buffer));
				1553	if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
				1554	return -EFAULT;
				1555
				1556	p = get_proc_task(inode);
				1557	if (!p)
				1558	return -ESRCH;
				1559
				1560	if (same_thread_group(current, p))
				1561	set_task_comm(p, buffer);
				1562	else
				1563	count = -EINVAL;
				1564
				1565	put_task_struct(p);
				1566
				1567	return count;
				1568	}
				1569
				1570	static int comm_show(struct seq_file m, void v)
				1571	{
				1572	struct inode *inode = m->private;
				1573	struct task_struct *p;
				1574
				1575	p = get_proc_task(inode);
				1576	if (!p)
				1577	return -ESRCH;
				1578
				1579	proc_task_name(m, p, false);
				1580	seq_putc(m, '\n');
				1581
				1582	put_task_struct(p);
				1583
				1584	return 0;
				1585	}
				1586
				1587	static int comm_open(struct inode inode, struct file filp)
				1588	{
				1589	return single_open(filp, comm_show, inode);
				1590	}
				1591
				1592	static const struct file_operations proc_pid_set_comm_operations = {
				1593	.open = comm_open,
				1594	.read = seq_read,
				1595	.write = comm_write,
				1596	.llseek = seq_lseek,
				1597	.release = single_release,
				1598	};
				1599
				1600	static int proc_exe_link(struct dentry dentry, struct path exe_path)
				1601	{
				1602	struct task_struct *task;
				1603	struct file *exe_file;
				1604
				1605	task = get_proc_task(d_inode(dentry));
				1606	if (!task)
				1607	return -ENOENT;
				1608	exe_file = get_task_exe_file(task);
				1609	put_task_struct(task);
				1610	if (exe_file) {
				1611	*exe_path = exe_file->f_path;
				1612	path_get(&exe_file->f_path);
				1613	fput(exe_file);
				1614	return 0;
				1615	} else
				1616	return -ENOENT;
				1617	}
				1618
				1619	static const char proc_pid_get_link(struct dentry dentry,
				1620	struct inode *inode,
				1621	struct delayed_call *done)
				1622	{
				1623	struct path path;
				1624	int error = -EACCES;
				1625
				1626	if (!dentry)
				1627	return ERR_PTR(-ECHILD);
				1628
				1629	/* Are we allowed to snoop on the tasks file descriptors? */
				1630	if (!proc_fd_access_allowed(inode))
				1631	goto out;
				1632
				1633	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
				1634	if (error)
				1635	goto out;
				1636
				1637	nd_jump_link(&path);
				1638	return NULL;
				1639	out:
				1640	return ERR_PTR(error);
				1641	}
				1642
				1643	static int do_proc_readlink(struct path path, char __user buffer, int buflen)
				1644	{
				1645	char tmp = (char )__get_free_page(GFP_KERNEL);
				1646	char *pathname;
				1647	int len;
				1648
				1649	if (!tmp)
				1650	return -ENOMEM;
				1651
				1652	pathname = d_path(path, tmp, PAGE_SIZE);
				1653	len = PTR_ERR(pathname);
				1654	if (IS_ERR(pathname))
				1655	goto out;
				1656	len = tmp + PAGE_SIZE - 1 - pathname;
				1657
				1658	if (len > buflen)
				1659	len = buflen;
				1660	if (copy_to_user(buffer, pathname, len))
				1661	len = -EFAULT;
				1662	out:
				1663	free_page((unsigned long)tmp);
				1664	return len;
				1665	}
				1666
				1667	static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
				1668	{
				1669	int error = -EACCES;
				1670	struct inode *inode = d_inode(dentry);
				1671	struct path path;
				1672
				1673	/* Are we allowed to snoop on the tasks file descriptors? */
				1674	if (!proc_fd_access_allowed(inode))
				1675	goto out;
				1676
				1677	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
				1678	if (error)
				1679	goto out;
				1680
				1681	error = do_proc_readlink(&path, buffer, buflen);
				1682	path_put(&path);
				1683	out:
				1684	return error;
				1685	}
				1686
				1687	const struct inode_operations proc_pid_link_inode_operations = {
				1688	.readlink = proc_pid_readlink,
				1689	.get_link = proc_pid_get_link,
				1690	.setattr = proc_setattr,
				1691	};
				1692
				1693
				1694	/* building an inode */
				1695
				1696	void task_dump_owner(struct task_struct *task, umode_t mode,
				1697	kuid_t ruid, kgid_t rgid)
				1698	{
				1699	/* Depending on the state of dumpable compute who should own a
				1700	* proc file for a task.
				1701	*/
				1702	const struct cred *cred;
				1703	kuid_t uid;
				1704	kgid_t gid;
				1705
				1706	if (unlikely(task->flags & PF_KTHREAD)) {
				1707	*ruid = GLOBAL_ROOT_UID;
				1708	*rgid = GLOBAL_ROOT_GID;
				1709	return;
				1710	}
				1711
				1712	/* Default to the tasks effective ownership */
				1713	rcu_read_lock();
				1714	cred = __task_cred(task);
				1715	uid = cred->euid;
				1716	gid = cred->egid;
				1717	rcu_read_unlock();
				1718
				1719	/*
				1720	* Before the /proc/pid/status file was created the only way to read
				1721	* the effective uid of a /process was to stat /proc/pid. Reading
				1722	* /proc/pid/status is slow enough that procps and other packages
				1723	* kept stating /proc/pid. To keep the rules in /proc simple I have
				1724	* made this apply to all per process world readable and executable
				1725	* directories.
				1726	*/
				1727	if (mode != (S_IFDIR\|S_IRUGO\|S_IXUGO)) {
				1728	struct mm_struct *mm;
				1729	task_lock(task);
				1730	mm = task->mm;
				1731	/* Make non-dumpable tasks owned by some root */
				1732	if (mm) {
				1733	if (get_dumpable(mm) != SUID_DUMP_USER) {
				1734	struct user_namespace *user_ns = mm->user_ns;
				1735
				1736	uid = make_kuid(user_ns, 0);
				1737	if (!uid_valid(uid))
				1738	uid = GLOBAL_ROOT_UID;
				1739
				1740	gid = make_kgid(user_ns, 0);
				1741	if (!gid_valid(gid))
				1742	gid = GLOBAL_ROOT_GID;
				1743	}
				1744	} else {
				1745	uid = GLOBAL_ROOT_UID;
				1746	gid = GLOBAL_ROOT_GID;
				1747	}
				1748	task_unlock(task);
				1749	}
				1750	*ruid = uid;
				1751	*rgid = gid;
				1752	}
				1753
				1754	struct inode proc_pid_make_inode(struct super_block sb,
				1755	struct task_struct *task, umode_t mode)
				1756	{
				1757	struct inode * inode;
				1758	struct proc_inode *ei;
				1759
				1760	/* We need a new inode */
				1761
				1762	inode = new_inode(sb);
				1763	if (!inode)
				1764	goto out;
				1765
				1766	/* Common stuff */
				1767	ei = PROC_I(inode);
				1768	inode->i_mode = mode;
				1769	inode->i_ino = get_next_ino();
				1770	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
				1771	inode->i_op = &proc_def_inode_operations;
				1772
				1773	/*
				1774	* grab the reference to task.
				1775	*/
				1776	ei->pid = get_task_pid(task, PIDTYPE_PID);
				1777	if (!ei->pid)
				1778	goto out_unlock;
				1779
				1780	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
				1781	security_task_to_inode(task, inode);
				1782
				1783	out:
				1784	return inode;
				1785
				1786	out_unlock:
				1787	iput(inode);
				1788	return NULL;
				1789	}
				1790
				1791	int pid_getattr(const struct path path, struct kstat stat,
				1792	u32 request_mask, unsigned int query_flags)
				1793	{
				1794	struct inode *inode = d_inode(path->dentry);
				1795	struct pid_namespace *pid = proc_pid_ns(inode);
				1796	struct task_struct *task;
				1797
				1798	generic_fillattr(inode, stat);
				1799
				1800	stat->uid = GLOBAL_ROOT_UID;
				1801	stat->gid = GLOBAL_ROOT_GID;
				1802	rcu_read_lock();
				1803	task = pid_task(proc_pid(inode), PIDTYPE_PID);
				1804	if (task) {
				1805	if (!has_pid_permissions(pid, task, HIDEPID_INVISIBLE)) {
				1806	rcu_read_unlock();
				1807	/*
				1808	* This doesn't prevent learning whether PID exists,
				1809	* it only makes getattr() consistent with readdir().
				1810	*/
				1811	return -ENOENT;
				1812	}
				1813	task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid);
				1814	}
				1815	rcu_read_unlock();
				1816	return 0;
				1817	}
				1818
				1819	/* dentry stuff */
				1820
				1821	/*
				1822	* Set <pid>/... inode ownership (can change due to setuid(), etc.)
				1823	*/
				1824	void pid_update_inode(struct task_struct task, struct inode inode)
				1825	{
				1826	task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
				1827
				1828	inode->i_mode &= ~(S_ISUID \| S_ISGID);
				1829	security_task_to_inode(task, inode);
				1830	}
				1831
				1832	/*
				1833	* Rewrite the inode's ownerships here because the owning task may have
				1834	* performed a setuid(), etc.
				1835	*
				1836	*/
				1837	static int pid_revalidate(struct dentry *dentry, unsigned int flags)
				1838	{
				1839	struct inode *inode;
				1840	struct task_struct *task;
				1841
				1842	if (flags & LOOKUP_RCU)
				1843	return -ECHILD;
				1844
				1845	inode = d_inode(dentry);
				1846	task = get_proc_task(inode);
				1847
				1848	if (task) {
				1849	pid_update_inode(task, inode);
				1850	put_task_struct(task);
				1851	return 1;
				1852	}
				1853	return 0;
				1854	}
				1855
				1856	static inline bool proc_inode_is_dead(struct inode *inode)
				1857	{
				1858	return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
				1859	}
				1860
				1861	int pid_delete_dentry(const struct dentry *dentry)
				1862	{
				1863	/* Is the task we represent dead?
				1864	* If so, then don't put the dentry on the lru list,
				1865	* kill it immediately.
				1866	*/
				1867	return proc_inode_is_dead(d_inode(dentry));
				1868	}
				1869
				1870	const struct dentry_operations pid_dentry_operations =
				1871	{
				1872	.d_revalidate = pid_revalidate,
				1873	.d_delete = pid_delete_dentry,
				1874	};
				1875
				1876	/* Lookups */
				1877
				1878	/*
				1879	* Fill a directory entry.
				1880	*
				1881	* If possible create the dcache entry and derive our inode number and
				1882	* file type from dcache entry.
				1883	*
				1884	* Since all of the proc inode numbers are dynamically generated, the inode
				1885	* numbers do not exist until the inode is cache. This means creating the
				1886	* the dcache entry in readdir is necessary to keep the inode numbers
				1887	* reported by readdir in sync with the inode numbers reported
				1888	* by stat.
				1889	*/
				1890	bool proc_fill_cache(struct file file, struct dir_context ctx,
				1891	const char *name, unsigned int len,
				1892	instantiate_t instantiate, struct task_struct task, const void ptr)
				1893	{
				1894	struct dentry child, dir = file->f_path.dentry;
				1895	struct qstr qname = QSTR_INIT(name, len);
				1896	struct inode *inode;
				1897	unsigned type = DT_UNKNOWN;
				1898	ino_t ino = 1;
				1899
				1900	child = d_hash_and_lookup(dir, &qname);
				1901	if (!child) {
				1902	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
				1903	child = d_alloc_parallel(dir, &qname, &wq);
				1904	if (IS_ERR(child))
				1905	goto end_instantiate;
				1906	if (d_in_lookup(child)) {
				1907	struct dentry *res;
				1908	res = instantiate(child, task, ptr);
				1909	d_lookup_done(child);
				1910	if (unlikely(res)) {
				1911	dput(child);
				1912	child = res;
				1913	if (IS_ERR(child))
				1914	goto end_instantiate;
				1915	}
				1916	}
				1917	}
				1918	inode = d_inode(child);
				1919	ino = inode->i_ino;
				1920	type = inode->i_mode >> 12;
				1921	dput(child);
				1922	end_instantiate:
				1923	return dir_emit(ctx, name, len, ino, type);
				1924	}
				1925
				1926	/*
				1927	* dname_to_vma_addr - maps a dentry name into two unsigned longs
				1928	* which represent vma start and end addresses.
				1929	*/
				1930	static int dname_to_vma_addr(struct dentry *dentry,
				1931	unsigned long start, unsigned long end)
				1932	{
				1933	const char *str = dentry->d_name.name;
				1934	unsigned long long sval, eval;
				1935	unsigned int len;
				1936
				1937	if (str[0] == '0' && str[1] != '-')
				1938	return -EINVAL;
				1939	len = _parse_integer(str, 16, &sval);
				1940	if (len & KSTRTOX_OVERFLOW)
				1941	return -EINVAL;
				1942	if (sval != (unsigned long)sval)
				1943	return -EINVAL;
				1944	str += len;
				1945
				1946	if (*str != '-')
				1947	return -EINVAL;
				1948	str++;
				1949
				1950	if (str[0] == '0' && str[1])
				1951	return -EINVAL;
				1952	len = _parse_integer(str, 16, &eval);
				1953	if (len & KSTRTOX_OVERFLOW)
				1954	return -EINVAL;
				1955	if (eval != (unsigned long)eval)
				1956	return -EINVAL;
				1957	str += len;
				1958
				1959	if (*str != '\0')
				1960	return -EINVAL;
				1961
				1962	*start = sval;
				1963	*end = eval;
				1964
				1965	return 0;
				1966	}
				1967
				1968	static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
				1969	{
				1970	unsigned long vm_start, vm_end;
				1971	bool exact_vma_exists = false;
				1972	struct mm_struct *mm = NULL;
				1973	struct task_struct *task;
				1974	struct inode *inode;
				1975	int status = 0;
				1976
				1977	if (flags & LOOKUP_RCU)
				1978	return -ECHILD;
				1979
				1980	inode = d_inode(dentry);
				1981	task = get_proc_task(inode);
				1982	if (!task)
				1983	goto out_notask;
				1984
				1985	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
				1986	if (IS_ERR_OR_NULL(mm))
				1987	goto out;
				1988
				1989	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1990	status = down_read_killable(&mm->mmap_sem);
				1991	if (!status) {
				1992	exact_vma_exists = !!find_exact_vma(mm, vm_start,
				1993	vm_end);
				1994	up_read(&mm->mmap_sem);
				1995	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1996	}
				1997
				1998	mmput(mm);
				1999
				2000	if (exact_vma_exists) {
				2001	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
				2002
				2003	security_task_to_inode(task, inode);
				2004	status = 1;
				2005	}
				2006
				2007	out:
				2008	put_task_struct(task);
				2009
				2010	out_notask:
				2011	return status;
				2012	}
				2013
				2014	static const struct dentry_operations tid_map_files_dentry_operations = {
				2015	.d_revalidate = map_files_d_revalidate,
				2016	.d_delete = pid_delete_dentry,
				2017	};
				2018
				2019	static int map_files_get_link(struct dentry dentry, struct path path)
				2020	{
				2021	unsigned long vm_start, vm_end;
				2022	struct vm_area_struct *vma;
				2023	struct task_struct *task;
				2024	struct mm_struct *mm;
				2025	int rc;
				2026
				2027	rc = -ENOENT;
				2028	task = get_proc_task(d_inode(dentry));
				2029	if (!task)
				2030	goto out;
				2031
				2032	mm = get_task_mm(task);
				2033	put_task_struct(task);
				2034	if (!mm)
				2035	goto out;
				2036
				2037	rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
				2038	if (rc)
				2039	goto out_mmput;
				2040
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2041	rc = down_read_killable(&mm->mmap_sem);
				2042	if (rc)
				2043	goto out_mmput;
				2044
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2045	rc = -ENOENT;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2046	vma = find_exact_vma(mm, vm_start, vm_end);
				2047	if (vma && vma->vm_file) {
				2048	*path = vma->vm_file->f_path;
				2049	path_get(path);
				2050	rc = 0;
				2051	}
				2052	up_read(&mm->mmap_sem);
				2053
				2054	out_mmput:
				2055	mmput(mm);
				2056	out:
				2057	return rc;
				2058	}
				2059
				2060	struct map_files_info {
				2061	unsigned long start;
				2062	unsigned long end;
				2063	fmode_t mode;
				2064	};
				2065
				2066	/*
				2067	* Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
				2068	* symlinks may be used to bypass permissions on ancestor directories in the
				2069	* path to the file in question.
				2070	*/
				2071	static const char *
				2072	proc_map_files_get_link(struct dentry *dentry,
				2073	struct inode *inode,
				2074	struct delayed_call *done)
				2075	{
				2076	if (!capable(CAP_SYS_ADMIN))
				2077	return ERR_PTR(-EPERM);
				2078
				2079	return proc_pid_get_link(dentry, inode, done);
				2080	}
				2081
				2082	/*
				2083	* Identical to proc_pid_link_inode_operations except for get_link()
				2084	*/
				2085	static const struct inode_operations proc_map_files_link_inode_operations = {
				2086	.readlink = proc_pid_readlink,
				2087	.get_link = proc_map_files_get_link,
				2088	.setattr = proc_setattr,
				2089	};
				2090
				2091	static struct dentry *
				2092	proc_map_files_instantiate(struct dentry *dentry,
				2093	struct task_struct task, const void ptr)
				2094	{
				2095	fmode_t mode = (fmode_t)(unsigned long)ptr;
				2096	struct proc_inode *ei;
				2097	struct inode *inode;
				2098
				2099	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK \|
				2100	((mode & FMODE_READ ) ? S_IRUSR : 0) \|
				2101	((mode & FMODE_WRITE) ? S_IWUSR : 0));
				2102	if (!inode)
				2103	return ERR_PTR(-ENOENT);
				2104
				2105	ei = PROC_I(inode);
				2106	ei->op.proc_get_link = map_files_get_link;
				2107
				2108	inode->i_op = &proc_map_files_link_inode_operations;
				2109	inode->i_size = 64;
				2110
				2111	d_set_d_op(dentry, &tid_map_files_dentry_operations);
				2112	return d_splice_alias(inode, dentry);
				2113	}
				2114
				2115	static struct dentry proc_map_files_lookup(struct inode dir,
				2116	struct dentry *dentry, unsigned int flags)
				2117	{
				2118	unsigned long vm_start, vm_end;
				2119	struct vm_area_struct *vma;
				2120	struct task_struct *task;
				2121	struct dentry *result;
				2122	struct mm_struct *mm;
				2123
				2124	result = ERR_PTR(-ENOENT);
				2125	task = get_proc_task(dir);
				2126	if (!task)
				2127	goto out;
				2128
				2129	result = ERR_PTR(-EACCES);
				2130	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
				2131	goto out_put_task;
				2132
				2133	result = ERR_PTR(-ENOENT);
				2134	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
				2135	goto out_put_task;
				2136
				2137	mm = get_task_mm(task);
				2138	if (!mm)
				2139	goto out_put_task;
				2140
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2141	result = ERR_PTR(-EINTR);
				2142	if (down_read_killable(&mm->mmap_sem))
				2143	goto out_put_mm;
				2144
				2145	result = ERR_PTR(-ENOENT);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2146	vma = find_exact_vma(mm, vm_start, vm_end);
				2147	if (!vma)
				2148	goto out_no_vma;
				2149
				2150	if (vma->vm_file)
				2151	result = proc_map_files_instantiate(dentry, task,
				2152	(void *)(unsigned long)vma->vm_file->f_mode);
				2153
				2154	out_no_vma:
				2155	up_read(&mm->mmap_sem);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2156	out_put_mm:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2157	mmput(mm);
				2158	out_put_task:
				2159	put_task_struct(task);
				2160	out:
				2161	return result;
				2162	}
				2163
				2164	static const struct inode_operations proc_map_files_inode_operations = {
				2165	.lookup = proc_map_files_lookup,
				2166	.permission = proc_fd_permission,
				2167	.setattr = proc_setattr,
				2168	};
				2169
				2170	static int
				2171	proc_map_files_readdir(struct file file, struct dir_context ctx)
				2172	{
				2173	struct vm_area_struct *vma;
				2174	struct task_struct *task;
				2175	struct mm_struct *mm;
				2176	unsigned long nr_files, pos, i;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2177	GENRADIX(struct map_files_info) fa;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2178	struct map_files_info *p;
				2179	int ret;
				2180
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2181	genradix_init(&fa);
				2182
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2183	ret = -ENOENT;
				2184	task = get_proc_task(file_inode(file));
				2185	if (!task)
				2186	goto out;
				2187
				2188	ret = -EACCES;
				2189	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
				2190	goto out_put_task;
				2191
				2192	ret = 0;
				2193	if (!dir_emit_dots(file, ctx))
				2194	goto out_put_task;
				2195
				2196	mm = get_task_mm(task);
				2197	if (!mm)
				2198	goto out_put_task;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2199
				2200	ret = down_read_killable(&mm->mmap_sem);
				2201	if (ret) {
				2202	mmput(mm);
				2203	goto out_put_task;
				2204	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2205
				2206	nr_files = 0;
				2207
				2208	/*
				2209	* We need two passes here:
				2210	*
				2211	* 1) Collect vmas of mapped files with mmap_sem taken
				2212	* 2) Release mmap_sem and instantiate entries
				2213	*
				2214	* otherwise we get lockdep complained, since filldir()
				2215	* routine might require mmap_sem taken in might_fault().
				2216	*/
				2217
				2218	for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2219	if (!vma->vm_file)
				2220	continue;
				2221	if (++pos <= ctx->pos)
				2222	continue;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2223
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2224	p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
				2225	if (!p) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2226	ret = -ENOMEM;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2227	up_read(&mm->mmap_sem);
				2228	mmput(mm);
				2229	goto out_put_task;
				2230	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2231
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2232	p->start = vma->vm_start;
				2233	p->end = vma->vm_end;
				2234	p->mode = vma->vm_file->f_mode;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2235	}
				2236	up_read(&mm->mmap_sem);
				2237	mmput(mm);
				2238
				2239	for (i = 0; i < nr_files; i++) {
				2240	char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */
				2241	unsigned int len;
				2242
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2243	p = genradix_ptr(&fa, i);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2244	len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
				2245	if (!proc_fill_cache(file, ctx,
				2246	buf, len,
				2247	proc_map_files_instantiate,
				2248	task,
				2249	(void *)(unsigned long)p->mode))
				2250	break;
				2251	ctx->pos++;
				2252	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2253
				2254	out_put_task:
				2255	put_task_struct(task);
				2256	out:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2257	genradix_free(&fa);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2258	return ret;
				2259	}
				2260
				2261	static const struct file_operations proc_map_files_operations = {
				2262	.read = generic_read_dir,
				2263	.iterate_shared = proc_map_files_readdir,
				2264	.llseek = generic_file_llseek,
				2265	};
				2266
				2267	#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
				2268	struct timers_private {
				2269	struct pid *pid;
				2270	struct task_struct *task;
				2271	struct sighand_struct *sighand;
				2272	struct pid_namespace *ns;
				2273	unsigned long flags;
				2274	};
				2275
				2276	static void timers_start(struct seq_file m, loff_t *pos)
				2277	{
				2278	struct timers_private *tp = m->private;
				2279
				2280	tp->task = get_pid_task(tp->pid, PIDTYPE_PID);
				2281	if (!tp->task)
				2282	return ERR_PTR(-ESRCH);
				2283
				2284	tp->sighand = lock_task_sighand(tp->task, &tp->flags);
				2285	if (!tp->sighand)
				2286	return ERR_PTR(-ESRCH);
				2287
				2288	return seq_list_start(&tp->task->signal->posix_timers, *pos);
				2289	}
				2290
				2291	static void timers_next(struct seq_file m, void v, loff_t pos)
				2292	{
				2293	struct timers_private *tp = m->private;
				2294	return seq_list_next(v, &tp->task->signal->posix_timers, pos);
				2295	}
				2296
				2297	static void timers_stop(struct seq_file m, void v)
				2298	{
				2299	struct timers_private *tp = m->private;
				2300
				2301	if (tp->sighand) {
				2302	unlock_task_sighand(tp->task, &tp->flags);
				2303	tp->sighand = NULL;
				2304	}
				2305
				2306	if (tp->task) {
				2307	put_task_struct(tp->task);
				2308	tp->task = NULL;
				2309	}
				2310	}
				2311
				2312	static int show_timer(struct seq_file m, void v)
				2313	{
				2314	struct k_itimer *timer;
				2315	struct timers_private *tp = m->private;
				2316	int notify;
				2317	static const char * const nstr[] = {
				2318	[SIGEV_SIGNAL] = "signal",
				2319	[SIGEV_NONE] = "none",
				2320	[SIGEV_THREAD] = "thread",
				2321	};
				2322
				2323	timer = list_entry((struct list_head *)v, struct k_itimer, list);
				2324	notify = timer->it_sigev_notify;
				2325
				2326	seq_printf(m, "ID: %d\n", timer->it_id);
				2327	seq_printf(m, "signal: %d/%px\n",
				2328	timer->sigq->info.si_signo,
				2329	timer->sigq->info.si_value.sival_ptr);
				2330	seq_printf(m, "notify: %s/%s.%d\n",
				2331	nstr[notify & ~SIGEV_THREAD_ID],
				2332	(notify & SIGEV_THREAD_ID) ? "tid" : "pid",
				2333	pid_nr_ns(timer->it_pid, tp->ns));
				2334	seq_printf(m, "ClockID: %d\n", timer->it_clock);
				2335
				2336	return 0;
				2337	}
				2338
				2339	static const struct seq_operations proc_timers_seq_ops = {
				2340	.start = timers_start,
				2341	.next = timers_next,
				2342	.stop = timers_stop,
				2343	.show = show_timer,
				2344	};
				2345
				2346	static int proc_timers_open(struct inode inode, struct file file)
				2347	{
				2348	struct timers_private *tp;
				2349
				2350	tp = __seq_open_private(file, &proc_timers_seq_ops,
				2351	sizeof(struct timers_private));
				2352	if (!tp)
				2353	return -ENOMEM;
				2354
				2355	tp->pid = proc_pid(inode);
				2356	tp->ns = proc_pid_ns(inode);
				2357	return 0;
				2358	}
				2359
				2360	static const struct file_operations proc_timers_operations = {
				2361	.open = proc_timers_open,
				2362	.read = seq_read,
				2363	.llseek = seq_lseek,
				2364	.release = seq_release_private,
				2365	};
				2366	#endif
				2367
				2368	static ssize_t timerslack_ns_write(struct file file, const char __user buf,
				2369	size_t count, loff_t *offset)
				2370	{
				2371	struct inode *inode = file_inode(file);
				2372	struct task_struct *p;
				2373	u64 slack_ns;
				2374	int err;
				2375
				2376	err = kstrtoull_from_user(buf, count, 10, &slack_ns);
				2377	if (err < 0)
				2378	return err;
				2379
				2380	p = get_proc_task(inode);
				2381	if (!p)
				2382	return -ESRCH;
				2383
				2384	if (p != current) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2385	rcu_read_lock();
				2386	if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
				2387	rcu_read_unlock();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2388	count = -EPERM;
				2389	goto out;
				2390	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2391	rcu_read_unlock();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2392
				2393	err = security_task_setscheduler(p);
				2394	if (err) {
				2395	count = err;
				2396	goto out;
				2397	}
				2398	}
				2399
				2400	task_lock(p);
				2401	if (slack_ns == 0)
				2402	p->timer_slack_ns = p->default_timer_slack_ns;
				2403	else
				2404	p->timer_slack_ns = slack_ns;
				2405	task_unlock(p);
				2406
				2407	out:
				2408	put_task_struct(p);
				2409
				2410	return count;
				2411	}
				2412
				2413	static int timerslack_ns_show(struct seq_file m, void v)
				2414	{
				2415	struct inode *inode = m->private;
				2416	struct task_struct *p;
				2417	int err = 0;
				2418
				2419	p = get_proc_task(inode);
				2420	if (!p)
				2421	return -ESRCH;
				2422
				2423	if (p != current) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2424	rcu_read_lock();
				2425	if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
				2426	rcu_read_unlock();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2427	err = -EPERM;
				2428	goto out;
				2429	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2430	rcu_read_unlock();
				2431
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2432	err = security_task_getscheduler(p);
				2433	if (err)
				2434	goto out;
				2435	}
				2436
				2437	task_lock(p);
				2438	seq_printf(m, "%llu\n", p->timer_slack_ns);
				2439	task_unlock(p);
				2440
				2441	out:
				2442	put_task_struct(p);
				2443
				2444	return err;
				2445	}
				2446
				2447	static int timerslack_ns_open(struct inode inode, struct file filp)
				2448	{
				2449	return single_open(filp, timerslack_ns_show, inode);
				2450	}
				2451
				2452	static const struct file_operations proc_pid_set_timerslack_ns_operations = {
				2453	.open = timerslack_ns_open,
				2454	.read = seq_read,
				2455	.write = timerslack_ns_write,
				2456	.llseek = seq_lseek,
				2457	.release = single_release,
				2458	};
				2459
				2460	static struct dentry proc_pident_instantiate(struct dentry dentry,
				2461	struct task_struct task, const void ptr)
				2462	{
				2463	const struct pid_entry *p = ptr;
				2464	struct inode *inode;
				2465	struct proc_inode *ei;
				2466
				2467	inode = proc_pid_make_inode(dentry->d_sb, task, p->mode);
				2468	if (!inode)
				2469	return ERR_PTR(-ENOENT);
				2470
				2471	ei = PROC_I(inode);
				2472	if (S_ISDIR(inode->i_mode))
				2473	set_nlink(inode, 2); /* Use getattr to fix if necessary */
				2474	if (p->iop)
				2475	inode->i_op = p->iop;
				2476	if (p->fop)
				2477	inode->i_fop = p->fop;
				2478	ei->op = p->op;
				2479	pid_update_inode(task, inode);
				2480	d_set_d_op(dentry, &pid_dentry_operations);
				2481	return d_splice_alias(inode, dentry);
				2482	}
				2483
				2484	static struct dentry proc_pident_lookup(struct inode dir,
				2485	struct dentry *dentry,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2486	const struct pid_entry *p,
				2487	const struct pid_entry *end)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2488	{
				2489	struct task_struct *task = get_proc_task(dir);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2490	struct dentry *res = ERR_PTR(-ENOENT);
				2491
				2492	if (!task)
				2493	goto out_no_task;
				2494
				2495	/*
				2496	* Yes, it does not scale. And it should not. Don't add
				2497	* new entries into /proc/<tgid>/ without very good reasons.
				2498	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2499	for (; p < end; p++) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2500	if (p->len != dentry->d_name.len)
				2501	continue;
				2502	if (!memcmp(dentry->d_name.name, p->name, p->len)) {
				2503	res = proc_pident_instantiate(dentry, task, p);
				2504	break;
				2505	}
				2506	}
				2507	put_task_struct(task);
				2508	out_no_task:
				2509	return res;
				2510	}
				2511
				2512	static int proc_pident_readdir(struct file file, struct dir_context ctx,
				2513	const struct pid_entry *ents, unsigned int nents)
				2514	{
				2515	struct task_struct *task = get_proc_task(file_inode(file));
				2516	const struct pid_entry *p;
				2517
				2518	if (!task)
				2519	return -ENOENT;
				2520
				2521	if (!dir_emit_dots(file, ctx))
				2522	goto out;
				2523
				2524	if (ctx->pos >= nents + 2)
				2525	goto out;
				2526
				2527	for (p = ents + (ctx->pos - 2); p < ents + nents; p++) {
				2528	if (!proc_fill_cache(file, ctx, p->name, p->len,
				2529	proc_pident_instantiate, task, p))
				2530	break;
				2531	ctx->pos++;
				2532	}
				2533	out:
				2534	put_task_struct(task);
				2535	return 0;
				2536	}
				2537
				2538	#ifdef CONFIG_SECURITY
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2539	static int proc_pid_attr_open(struct inode inode, struct file file)
				2540	{
				2541	file->private_data = NULL;
				2542	__mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
				2543	return 0;
				2544	}
				2545
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2546	static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
				2547	size_t count, loff_t *ppos)
				2548	{
				2549	struct inode * inode = file_inode(file);
				2550	char *p = NULL;
				2551	ssize_t length;
				2552	struct task_struct *task = get_proc_task(inode);
				2553
				2554	if (!task)
				2555	return -ESRCH;
				2556
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2557	length = security_getprocattr(task, PROC_I(inode)->op.lsm,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2558	(char*)file->f_path.dentry->d_name.name,
				2559	&p);
				2560	put_task_struct(task);
				2561	if (length > 0)
				2562	length = simple_read_from_buffer(buf, count, ppos, p, length);
				2563	kfree(p);
				2564	return length;
				2565	}
				2566
				2567	static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
				2568	size_t count, loff_t *ppos)
				2569	{
				2570	struct inode * inode = file_inode(file);
				2571	struct task_struct *task;
				2572	void *page;
				2573	int rv;
				2574
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2575	/* A task may only write when it was the opener. */
				2576	if (file->private_data != current->mm)
				2577	return -EPERM;
				2578
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2579	rcu_read_lock();
				2580	task = pid_task(proc_pid(inode), PIDTYPE_PID);
				2581	if (!task) {
				2582	rcu_read_unlock();
				2583	return -ESRCH;
				2584	}
				2585	/* A task may only write its own attributes. */
				2586	if (current != task) {
				2587	rcu_read_unlock();
				2588	return -EACCES;
				2589	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2590	/* Prevent changes to overridden credentials. */
				2591	if (current_cred() != current_real_cred()) {
				2592	rcu_read_unlock();
				2593	return -EBUSY;
				2594	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2595	rcu_read_unlock();
				2596
				2597	if (count > PAGE_SIZE)
				2598	count = PAGE_SIZE;
				2599
				2600	/* No partial writes. */
				2601	if (*ppos != 0)
				2602	return -EINVAL;
				2603
				2604	page = memdup_user(buf, count);
				2605	if (IS_ERR(page)) {
				2606	rv = PTR_ERR(page);
				2607	goto out;
				2608	}
				2609
				2610	/* Guard against adverse ptrace interaction */
				2611	rv = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
				2612	if (rv < 0)
				2613	goto out_free;
				2614
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2615	rv = security_setprocattr(PROC_I(inode)->op.lsm,
				2616	file->f_path.dentry->d_name.name, page,
				2617	count);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2618	mutex_unlock(&current->signal->cred_guard_mutex);
				2619	out_free:
				2620	kfree(page);
				2621	out:
				2622	return rv;
				2623	}
				2624
				2625	static const struct file_operations proc_pid_attr_operations = {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2626	.open = proc_pid_attr_open,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2627	.read = proc_pid_attr_read,
				2628	.write = proc_pid_attr_write,
				2629	.llseek = generic_file_llseek,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2630	.release = mem_release,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2631	};
				2632
/*
 * LSM_DIR_OPS(LSM) - generate the directory plumbing for one LSM's
 * attribute subdirectory.
 *
 * Expects a pid_entry table named LSM##_attr_dir_stuff to be in scope and
 * defines, for that table:
 *   proc_##LSM##_attr_dir_iterate()    - readdir via proc_pident_readdir()
 *   proc_##LSM##_attr_dir_ops          - file_operations using it
 *   proc_##LSM##_attr_dir_lookup()     - lookup via proc_pident_lookup()
 *   proc_##LSM##_attr_dir_inode_ops    - inode_operations using it
 *
 * The expansion ends without a semicolon; the invocation supplies it.
 */
#define LSM_DIR_OPS(LSM) \
static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
			     struct dir_context *ctx) \
{ \
	return proc_pident_readdir(filp, ctx, \
				   LSM##_attr_dir_stuff, \
				   ARRAY_SIZE(LSM##_attr_dir_stuff)); \
} \
\
static const struct file_operations proc_##LSM##_attr_dir_ops = { \
	.read		= generic_read_dir, \
	.iterate	= proc_##LSM##_attr_dir_iterate, \
	.llseek		= default_llseek, \
}; \
\
static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
				struct dentry *dentry, unsigned int flags) \
{ \
	return proc_pident_lookup(dir, dentry, \
				  LSM##_attr_dir_stuff, \
				  LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \
} \
\
static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
	.lookup		= proc_##LSM##_attr_dir_lookup, \
	.getattr	= pid_getattr, \
	.setattr	= proc_setattr, \
}
				2661
				2662	#ifdef CONFIG_SECURITY_SMACK
				2663	static const struct pid_entry smack_attr_dir_stuff[] = {
				2664	ATTR("smack", "current", 0666),
				2665	};
				2666	LSM_DIR_OPS(smack);
				2667	#endif
				2668
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2669	static const struct pid_entry attr_dir_stuff[] = {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2670	ATTR(NULL, "current", 0666),
				2671	ATTR(NULL, "prev", 0444),
				2672	ATTR(NULL, "exec", 0666),
				2673	ATTR(NULL, "fscreate", 0666),
				2674	ATTR(NULL, "keycreate", 0666),
				2675	ATTR(NULL, "sockcreate", 0666),
				2676	#ifdef CONFIG_SECURITY_SMACK
				2677	DIR("smack", 0555,
				2678	proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
				2679	#endif
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2680	};
				2681
				2682	static int proc_attr_dir_readdir(struct file file, struct dir_context ctx)
				2683	{
				2684	return proc_pident_readdir(file, ctx,
				2685	attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
				2686	}
				2687
				2688	static const struct file_operations proc_attr_dir_operations = {
				2689	.read = generic_read_dir,
				2690	.iterate_shared = proc_attr_dir_readdir,
				2691	.llseek = generic_file_llseek,
				2692	};
				2693
				2694	static struct dentry proc_attr_dir_lookup(struct inode dir,
				2695	struct dentry *dentry, unsigned int flags)
				2696	{
				2697	return proc_pident_lookup(dir, dentry,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2698	attr_dir_stuff,
				2699	attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2700	}
				2701
				2702	static const struct inode_operations proc_attr_dir_inode_operations = {
				2703	.lookup = proc_attr_dir_lookup,
				2704	.getattr = pid_getattr,
				2705	.setattr = proc_setattr,
				2706	};
				2707
				2708	#endif
				2709
				2710	#ifdef CONFIG_ELF_CORE
				2711	static ssize_t proc_coredump_filter_read(struct file file, char __user buf,
				2712	size_t count, loff_t *ppos)
				2713	{
				2714	struct task_struct *task = get_proc_task(file_inode(file));
				2715	struct mm_struct *mm;
				2716	char buffer[PROC_NUMBUF];
				2717	size_t len;
				2718	int ret;
				2719
				2720	if (!task)
				2721	return -ESRCH;
				2722
				2723	ret = 0;
				2724	mm = get_task_mm(task);
				2725	if (mm) {
				2726	len = snprintf(buffer, sizeof(buffer), "%08lx\n",
				2727	((mm->flags & MMF_DUMP_FILTER_MASK) >>
				2728	MMF_DUMP_FILTER_SHIFT));
				2729	mmput(mm);
				2730	ret = simple_read_from_buffer(buf, count, ppos, buffer, len);
				2731	}
				2732
				2733	put_task_struct(task);
				2734
				2735	return ret;
				2736	}
				2737
				2738	static ssize_t proc_coredump_filter_write(struct file *file,
				2739	const char __user *buf,
				2740	size_t count,
				2741	loff_t *ppos)
				2742	{
				2743	struct task_struct *task;
				2744	struct mm_struct *mm;
				2745	unsigned int val;
				2746	int ret;
				2747	int i;
				2748	unsigned long mask;
				2749
				2750	ret = kstrtouint_from_user(buf, count, 0, &val);
				2751	if (ret < 0)
				2752	return ret;
				2753
				2754	ret = -ESRCH;
				2755	task = get_proc_task(file_inode(file));
				2756	if (!task)
				2757	goto out_no_task;
				2758
				2759	mm = get_task_mm(task);
				2760	if (!mm)
				2761	goto out_no_mm;
				2762	ret = 0;
				2763
				2764	for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
				2765	if (val & mask)
				2766	set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
				2767	else
				2768	clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
				2769	}
				2770
				2771	mmput(mm);
				2772	out_no_mm:
				2773	put_task_struct(task);
				2774	out_no_task:
				2775	if (ret < 0)
				2776	return ret;
				2777	return count;
				2778	}
				2779
				2780	static const struct file_operations proc_coredump_filter_operations = {
				2781	.read = proc_coredump_filter_read,
				2782	.write = proc_coredump_filter_write,
				2783	.llseek = generic_file_llseek,
				2784	};
				2785	#endif
				2786
				2787	#ifdef CONFIG_TASK_IO_ACCOUNTING
				2788	static int do_io_accounting(struct task_struct task, struct seq_file m, int whole)
				2789	{
				2790	struct task_io_accounting acct = task->ioac;
				2791	unsigned long flags;
				2792	int result;
				2793
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2794	result = down_read_killable(&task->signal->exec_update_lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2795	if (result)
				2796	return result;
				2797
				2798	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
				2799	result = -EACCES;
				2800	goto out_unlock;
				2801	}
				2802
				2803	if (whole && lock_task_sighand(task, &flags)) {
				2804	struct task_struct *t = task;
				2805
				2806	task_io_accounting_add(&acct, &task->signal->ioac);
				2807	while_each_thread(task, t)
				2808	task_io_accounting_add(&acct, &t->ioac);
				2809
				2810	unlock_task_sighand(task, &flags);
				2811	}
				2812	seq_printf(m,
				2813	"rchar: %llu\n"
				2814	"wchar: %llu\n"
				2815	"syscr: %llu\n"
				2816	"syscw: %llu\n"
				2817	"read_bytes: %llu\n"
				2818	"write_bytes: %llu\n"
				2819	"cancelled_write_bytes: %llu\n",
				2820	(unsigned long long)acct.rchar,
				2821	(unsigned long long)acct.wchar,
				2822	(unsigned long long)acct.syscr,
				2823	(unsigned long long)acct.syscw,
				2824	(unsigned long long)acct.read_bytes,
				2825	(unsigned long long)acct.write_bytes,
				2826	(unsigned long long)acct.cancelled_write_bytes);
				2827	result = 0;
				2828
				2829	out_unlock:
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2830	up_read(&task->signal->exec_update_lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2831	return result;
				2832	}
				2833
				2834	static int proc_tid_io_accounting(struct seq_file m, struct pid_namespace ns,
				2835	struct pid pid, struct task_struct task)
				2836	{
				2837	return do_io_accounting(task, m, 0);
				2838	}
				2839
				2840	static int proc_tgid_io_accounting(struct seq_file m, struct pid_namespace ns,
				2841	struct pid pid, struct task_struct task)
				2842	{
				2843	return do_io_accounting(task, m, 1);
				2844	}
				2845	#endif /* CONFIG_TASK_IO_ACCOUNTING */
				2846
				2847	#ifdef CONFIG_USER_NS
				2848	static int proc_id_map_open(struct inode inode, struct file file,
				2849	const struct seq_operations *seq_ops)
				2850	{
				2851	struct user_namespace *ns = NULL;
				2852	struct task_struct *task;
				2853	struct seq_file *seq;
				2854	int ret = -EINVAL;
				2855
				2856	task = get_proc_task(inode);
				2857	if (task) {
				2858	rcu_read_lock();
				2859	ns = get_user_ns(task_cred_xxx(task, user_ns));
				2860	rcu_read_unlock();
				2861	put_task_struct(task);
				2862	}
				2863	if (!ns)
				2864	goto err;
				2865
				2866	ret = seq_open(file, seq_ops);
				2867	if (ret)
				2868	goto err_put_ns;
				2869
				2870	seq = file->private_data;
				2871	seq->private = ns;
				2872
				2873	return 0;
				2874	err_put_ns:
				2875	put_user_ns(ns);
				2876	err:
				2877	return ret;
				2878	}
				2879
				2880	static int proc_id_map_release(struct inode inode, struct file file)
				2881	{
				2882	struct seq_file *seq = file->private_data;
				2883	struct user_namespace *ns = seq->private;
				2884	put_user_ns(ns);
				2885	return seq_release(inode, file);
				2886	}
				2887
				2888	static int proc_uid_map_open(struct inode inode, struct file file)
				2889	{
				2890	return proc_id_map_open(inode, file, &proc_uid_seq_operations);
				2891	}
				2892
				2893	static int proc_gid_map_open(struct inode inode, struct file file)
				2894	{
				2895	return proc_id_map_open(inode, file, &proc_gid_seq_operations);
				2896	}
				2897
				2898	static int proc_projid_map_open(struct inode inode, struct file file)
				2899	{
				2900	return proc_id_map_open(inode, file, &proc_projid_seq_operations);
				2901	}
				2902
				2903	static const struct file_operations proc_uid_map_operations = {
				2904	.open = proc_uid_map_open,
				2905	.write = proc_uid_map_write,
				2906	.read = seq_read,
				2907	.llseek = seq_lseek,
				2908	.release = proc_id_map_release,
				2909	};
				2910
				2911	static const struct file_operations proc_gid_map_operations = {
				2912	.open = proc_gid_map_open,
				2913	.write = proc_gid_map_write,
				2914	.read = seq_read,
				2915	.llseek = seq_lseek,
				2916	.release = proc_id_map_release,
				2917	};
				2918
				2919	static const struct file_operations proc_projid_map_operations = {
				2920	.open = proc_projid_map_open,
				2921	.write = proc_projid_map_write,
				2922	.read = seq_read,
				2923	.llseek = seq_lseek,
				2924	.release = proc_id_map_release,
				2925	};
				2926
				2927	static int proc_setgroups_open(struct inode inode, struct file file)
				2928	{
				2929	struct user_namespace *ns = NULL;
				2930	struct task_struct *task;
				2931	int ret;
				2932
				2933	ret = -ESRCH;
				2934	task = get_proc_task(inode);
				2935	if (task) {
				2936	rcu_read_lock();
				2937	ns = get_user_ns(task_cred_xxx(task, user_ns));
				2938	rcu_read_unlock();
				2939	put_task_struct(task);
				2940	}
				2941	if (!ns)
				2942	goto err;
				2943
				2944	if (file->f_mode & FMODE_WRITE) {
				2945	ret = -EACCES;
				2946	if (!ns_capable(ns, CAP_SYS_ADMIN))
				2947	goto err_put_ns;
				2948	}
				2949
				2950	ret = single_open(file, &proc_setgroups_show, ns);
				2951	if (ret)
				2952	goto err_put_ns;
				2953
				2954	return 0;
				2955	err_put_ns:
				2956	put_user_ns(ns);
				2957	err:
				2958	return ret;
				2959	}
				2960
				2961	static int proc_setgroups_release(struct inode inode, struct file file)
				2962	{
				2963	struct seq_file *seq = file->private_data;
				2964	struct user_namespace *ns = seq->private;
				2965	int ret = single_release(inode, file);
				2966	put_user_ns(ns);
				2967	return ret;
				2968	}
				2969
				2970	static const struct file_operations proc_setgroups_operations = {
				2971	.open = proc_setgroups_open,
				2972	.write = proc_setgroups_write,
				2973	.read = seq_read,
				2974	.llseek = seq_lseek,
				2975	.release = proc_setgroups_release,
				2976	};
				2977	#endif /* CONFIG_USER_NS */
				2978
				2979	static int proc_pid_personality(struct seq_file m, struct pid_namespace ns,
				2980	struct pid pid, struct task_struct task)
				2981	{
				2982	int err = lock_trace(task);
				2983	if (!err) {
				2984	seq_printf(m, "%08x\n", task->personality);
				2985	unlock_trace(task);
				2986	}
				2987	return err;
				2988	}
				2989
				2990	#ifdef CONFIG_LIVEPATCH
				2991	static int proc_pid_patch_state(struct seq_file m, struct pid_namespace ns,
				2992	struct pid pid, struct task_struct task)
				2993	{
				2994	seq_printf(m, "%d\n", task->patch_state);
				2995	return 0;
				2996	}
				2997	#endif /* CONFIG_LIVEPATCH */
				2998
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2999	#ifdef CONFIG_STACKLEAK_METRICS
				3000	static int proc_stack_depth(struct seq_file m, struct pid_namespace ns,
				3001	struct pid pid, struct task_struct task)
				3002	{
				3003	unsigned long prev_depth = THREAD_SIZE -
				3004	(task->prev_lowest_stack & (THREAD_SIZE - 1));
				3005	unsigned long depth = THREAD_SIZE -
				3006	(task->lowest_stack & (THREAD_SIZE - 1));
				3007
				3008	seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n",
				3009	prev_depth, depth);
				3010	return 0;
				3011	}
				3012	#endif /* CONFIG_STACKLEAK_METRICS */
				3013
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3014	/*
				3015	* Thread groups
				3016	*/
				3017	static const struct file_operations proc_task_operations;
				3018	static const struct inode_operations proc_task_inode_operations;
				3019
				3020	static const struct pid_entry tgid_base_stuff[] = {
				3021	DIR("task", S_IRUGO\|S_IXUGO, proc_task_inode_operations, proc_task_operations),
				3022	DIR("fd", S_IRUSR\|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
				3023	DIR("map_files", S_IRUSR\|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
				3024	DIR("fdinfo", S_IRUSR\|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
				3025	DIR("ns", S_IRUSR\|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
				3026	#ifdef CONFIG_NET
				3027	DIR("net", S_IRUGO\|S_IXUGO, proc_net_inode_operations, proc_net_operations),
				3028	#endif
				3029	REG("environ", S_IRUSR, proc_environ_operations),
				3030	REG("auxv", S_IRUSR, proc_auxv_operations),
				3031	ONE("status", S_IRUGO, proc_pid_status),
				3032	ONE("personality", S_IRUSR, proc_pid_personality),
				3033	ONE("limits", S_IRUGO, proc_pid_limits),
				3034	#ifdef CONFIG_SCHED_DEBUG
				3035	REG("sched", S_IRUGO\|S_IWUSR, proc_pid_sched_operations),
				3036	#endif
				3037	#ifdef CONFIG_SCHED_AUTOGROUP
				3038	REG("autogroup", S_IRUGO\|S_IWUSR, proc_pid_sched_autogroup_operations),
				3039	#endif
				3040	REG("comm", S_IRUGO\|S_IWUSR, proc_pid_set_comm_operations),
				3041	#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
				3042	ONE("syscall", S_IRUSR, proc_pid_syscall),
				3043	#endif
				3044	REG("cmdline", S_IRUGO, proc_pid_cmdline_ops),
				3045	ONE("stat", S_IRUGO, proc_tgid_stat),
				3046	ONE("statm", S_IRUGO, proc_pid_statm),
				3047	REG("maps", S_IRUGO, proc_pid_maps_operations),
				3048	#ifdef CONFIG_NUMA
				3049	REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
				3050	#endif
				3051	REG("mem", S_IRUSR\|S_IWUSR, proc_mem_operations),
				3052	LNK("cwd", proc_cwd_link),
				3053	LNK("root", proc_root_link),
				3054	LNK("exe", proc_exe_link),
				3055	REG("mounts", S_IRUGO, proc_mounts_operations),
				3056	REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
				3057	REG("mountstats", S_IRUSR, proc_mountstats_operations),
				3058	#ifdef CONFIG_PROC_PAGE_MONITOR
				3059	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
				3060	REG("smaps", S_IRUGO, proc_pid_smaps_operations),
				3061	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
				3062	REG("pagemap", S_IRUSR, proc_pagemap_operations),
				3063	#endif
				3064	#ifdef CONFIG_SECURITY
				3065	DIR("attr", S_IRUGO\|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
				3066	#endif
				3067	#ifdef CONFIG_KALLSYMS
				3068	ONE("wchan", S_IRUGO, proc_pid_wchan),
				3069	#endif
				3070	#ifdef CONFIG_STACKTRACE
				3071	ONE("stack", S_IRUSR, proc_pid_stack),
				3072	#endif
				3073	#ifdef CONFIG_SCHED_INFO
				3074	ONE("schedstat", S_IRUGO, proc_pid_schedstat),
				3075	#endif
				3076	#ifdef CONFIG_LATENCYTOP
				3077	REG("latency", S_IRUGO, proc_lstats_operations),
				3078	#endif
				3079	#ifdef CONFIG_PROC_PID_CPUSET
				3080	ONE("cpuset", S_IRUGO, proc_cpuset_show),
				3081	#endif
				3082	#ifdef CONFIG_CGROUPS
				3083	ONE("cgroup", S_IRUGO, proc_cgroup_show),
				3084	#endif
				3085	ONE("oom_score", S_IRUGO, proc_oom_score),
				3086	REG("oom_adj", S_IRUGO\|S_IWUSR, proc_oom_adj_operations),
				3087	REG("oom_score_adj", S_IRUGO\|S_IWUSR, proc_oom_score_adj_operations),
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3088	#ifdef CONFIG_AUDIT
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3089	REG("loginuid", S_IWUSR\|S_IRUGO, proc_loginuid_operations),
				3090	REG("sessionid", S_IRUGO, proc_sessionid_operations),
				3091	#endif
				3092	#ifdef CONFIG_FAULT_INJECTION
				3093	REG("make-it-fail", S_IRUGO\|S_IWUSR, proc_fault_inject_operations),
				3094	REG("fail-nth", 0644, proc_fail_nth_operations),
				3095	#endif
				3096	#ifdef CONFIG_ELF_CORE
				3097	REG("coredump_filter", S_IRUGO\|S_IWUSR, proc_coredump_filter_operations),
				3098	#endif
				3099	#ifdef CONFIG_TASK_IO_ACCOUNTING
				3100	ONE("io", S_IRUSR, proc_tgid_io_accounting),
				3101	#endif
				3102	#ifdef CONFIG_USER_NS
				3103	REG("uid_map", S_IRUGO\|S_IWUSR, proc_uid_map_operations),
				3104	REG("gid_map", S_IRUGO\|S_IWUSR, proc_gid_map_operations),
				3105	REG("projid_map", S_IRUGO\|S_IWUSR, proc_projid_map_operations),
				3106	REG("setgroups", S_IRUGO\|S_IWUSR, proc_setgroups_operations),
				3107	#endif
				3108	#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
				3109	REG("timers", S_IRUGO, proc_timers_operations),
				3110	#endif
				3111	REG("timerslack_ns", S_IRUGO\|S_IWUGO, proc_pid_set_timerslack_ns_operations),
				3112	#ifdef CONFIG_LIVEPATCH
				3113	ONE("patch_state", S_IRUSR, proc_pid_patch_state),
				3114	#endif
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3115	#ifdef CONFIG_STACKLEAK_METRICS
				3116	ONE("stack_depth", S_IRUGO, proc_stack_depth),
				3117	#endif
				3118	#ifdef CONFIG_PROC_PID_ARCH_STATUS
				3119	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
				3120	#endif
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3121	};
				3122
				3123	static int proc_tgid_base_readdir(struct file file, struct dir_context ctx)
				3124	{
				3125	return proc_pident_readdir(file, ctx,
				3126	tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
				3127	}
				3128
				3129	static const struct file_operations proc_tgid_base_operations = {
				3130	.read = generic_read_dir,
				3131	.iterate_shared = proc_tgid_base_readdir,
				3132	.llseek = generic_file_llseek,
				3133	};
				3134
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3135	struct pid tgid_pidfd_to_pid(const struct file file)
				3136	{
				3137	if (file->f_op != &proc_tgid_base_operations)
				3138	return ERR_PTR(-EBADF);
				3139
				3140	return proc_pid(file_inode(file));
				3141	}
				3142
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3143	static struct dentry proc_tgid_base_lookup(struct inode dir, struct dentry *dentry, unsigned int flags)
				3144	{
				3145	return proc_pident_lookup(dir, dentry,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3146	tgid_base_stuff,
				3147	tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3148	}
				3149
				3150	static const struct inode_operations proc_tgid_base_inode_operations = {
				3151	.lookup = proc_tgid_base_lookup,
				3152	.getattr = pid_getattr,
				3153	.setattr = proc_setattr,
				3154	.permission = proc_pid_permission,
				3155	};
				3156
				3157	static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
				3158	{
				3159	struct dentry dentry, leader, *dir;
				3160	char buf[10 + 1];
				3161	struct qstr name;
				3162
				3163	name.name = buf;
				3164	name.len = snprintf(buf, sizeof(buf), "%u", pid);
				3165	/* no ->d_hash() rejects on procfs */
				3166	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
				3167	if (dentry) {
				3168	d_invalidate(dentry);
				3169	dput(dentry);
				3170	}
				3171
				3172	if (pid == tgid)
				3173	return;
				3174
				3175	name.name = buf;
				3176	name.len = snprintf(buf, sizeof(buf), "%u", tgid);
				3177	leader = d_hash_and_lookup(mnt->mnt_root, &name);
				3178	if (!leader)
				3179	goto out;
				3180
				3181	name.name = "task";
				3182	name.len = strlen(name.name);
				3183	dir = d_hash_and_lookup(leader, &name);
				3184	if (!dir)
				3185	goto out_put_leader;
				3186
				3187	name.name = buf;
				3188	name.len = snprintf(buf, sizeof(buf), "%u", pid);
				3189	dentry = d_hash_and_lookup(dir, &name);
				3190	if (dentry) {
				3191	d_invalidate(dentry);
				3192	dput(dentry);
				3193	}
				3194
				3195	dput(dir);
				3196	out_put_leader:
				3197	dput(leader);
				3198	out:
				3199	return;
				3200	}
				3201
				3202	/**
				3203	* proc_flush_task - Remove dcache entries for @task from the /proc dcache.
				3204	* @task: task that should be flushed.
				3205	*
				3206	* When flushing dentries from proc, one needs to flush them from global
				3207	* proc (proc_mnt) and from all the namespaces' procs this task was seen
				3208	* in. This call is supposed to do all of this job.
				3209	*
				3210	* Looks in the dcache for
				3211	* /proc/@pid
				3212	* /proc/@tgid/task/@pid
				3213	* if either directory is present flushes it and all of it'ts children
				3214	* from the dcache.
				3215	*
				3216	* It is safe and reasonable to cache /proc entries for a task until
				3217	* that task exits. After that they just clog up the dcache with
				3218	* useless entries, possibly causing useful dcache entries to be
				3219	* flushed instead. This routine is proved to flush those useless
				3220	* dcache entries at process exit time.
				3221	*
				3222	* NOTE: This routine is just an optimization so it does not guarantee
				3223	* that no dcache entries will exist at process exit time it
				3224	* just makes it very unlikely that any will persist.
				3225	*/
				3226
				3227	void proc_flush_task(struct task_struct *task)
				3228	{
				3229	int i;
				3230	struct pid pid, tgid;
				3231	struct upid *upid;
				3232
				3233	pid = task_pid(task);
				3234	tgid = task_tgid(task);
				3235
				3236	for (i = 0; i <= pid->level; i++) {
				3237	upid = &pid->numbers[i];
				3238	proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
				3239	tgid->numbers[i].nr);
				3240	}
				3241	}
				3242
				3243	static struct dentry proc_pid_instantiate(struct dentry dentry,
				3244	struct task_struct task, const void ptr)
				3245	{
				3246	struct inode *inode;
				3247
				3248	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR \| S_IRUGO \| S_IXUGO);
				3249	if (!inode)
				3250	return ERR_PTR(-ENOENT);
				3251
				3252	inode->i_op = &proc_tgid_base_inode_operations;
				3253	inode->i_fop = &proc_tgid_base_operations;
				3254	inode->i_flags\|=S_IMMUTABLE;
				3255
				3256	set_nlink(inode, nlink_tgid);
				3257	pid_update_inode(task, inode);
				3258
				3259	d_set_d_op(dentry, &pid_dentry_operations);
				3260	return d_splice_alias(inode, dentry);
				3261	}
				3262
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3263	struct dentry proc_pid_lookup(struct dentry dentry, unsigned int flags)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3264	{
				3265	struct task_struct *task;
				3266	unsigned tgid;
				3267	struct pid_namespace *ns;
				3268	struct dentry *result = ERR_PTR(-ENOENT);
				3269
				3270	tgid = name_to_int(&dentry->d_name);
				3271	if (tgid == ~0U)
				3272	goto out;
				3273
				3274	ns = dentry->d_sb->s_fs_info;
				3275	rcu_read_lock();
				3276	task = find_task_by_pid_ns(tgid, ns);
				3277	if (task)
				3278	get_task_struct(task);
				3279	rcu_read_unlock();
				3280	if (!task)
				3281	goto out;
				3282
				3283	result = proc_pid_instantiate(dentry, task, NULL);
				3284	put_task_struct(task);
				3285	out:
				3286	return result;
				3287	}
				3288
				3289	/*
				3290	* Find the first task with tgid >= tgid
				3291	*
				3292	*/
				3293	struct tgid_iter {
				3294	unsigned int tgid;
				3295	struct task_struct *task;
				3296	};
				3297	static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
				3298	{
				3299	struct pid *pid;
				3300
				3301	if (iter.task)
				3302	put_task_struct(iter.task);
				3303	rcu_read_lock();
				3304	retry:
				3305	iter.task = NULL;
				3306	pid = find_ge_pid(iter.tgid, ns);
				3307	if (pid) {
				3308	iter.tgid = pid_nr_ns(pid, ns);
				3309	iter.task = pid_task(pid, PIDTYPE_PID);
				3310	/* What we to know is if the pid we have find is the
				3311	* pid of a thread_group_leader. Testing for task
				3312	* being a thread_group_leader is the obvious thing
				3313	* todo but there is a window when it fails, due to
				3314	* the pid transfer logic in de_thread.
				3315	*
				3316	* So we perform the straight forward test of seeing
				3317	* if the pid we have found is the pid of a thread
				3318	* group leader, and don't worry if the task we have
				3319	* found doesn't happen to be a thread group leader.
				3320	* As we don't care in the case of readdir.
				3321	*/
				3322	if (!iter.task \|\| !has_group_leader_pid(iter.task)) {
				3323	iter.tgid += 1;
				3324	goto retry;
				3325	}
				3326	get_task_struct(iter.task);
				3327	}
				3328	rcu_read_unlock();
				3329	return iter;
				3330	}
				3331
				3332	#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)
				3333
				3334	/* for the /proc/ directory itself, after non-process stuff has been done */
				3335	int proc_pid_readdir(struct file file, struct dir_context ctx)
				3336	{
				3337	struct tgid_iter iter;
				3338	struct pid_namespace *ns = proc_pid_ns(file_inode(file));
				3339	loff_t pos = ctx->pos;
				3340
				3341	if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
				3342	return 0;
				3343
				3344	if (pos == TGID_OFFSET - 2) {
				3345	struct inode *inode = d_inode(ns->proc_self);
				3346	if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
				3347	return 0;
				3348	ctx->pos = pos = pos + 1;
				3349	}
				3350	if (pos == TGID_OFFSET - 1) {
				3351	struct inode *inode = d_inode(ns->proc_thread_self);
				3352	if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
				3353	return 0;
				3354	ctx->pos = pos = pos + 1;
				3355	}
				3356	iter.tgid = pos - TGID_OFFSET;
				3357	iter.task = NULL;
				3358	for (iter = next_tgid(ns, iter);
				3359	iter.task;
				3360	iter.tgid += 1, iter = next_tgid(ns, iter)) {
				3361	char name[10 + 1];
				3362	unsigned int len;
				3363
				3364	cond_resched();
				3365	if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE))
				3366	continue;
				3367
				3368	len = snprintf(name, sizeof(name), "%u", iter.tgid);
				3369	ctx->pos = iter.tgid + TGID_OFFSET;
				3370	if (!proc_fill_cache(file, ctx, name, len,
				3371	proc_pid_instantiate, iter.task, NULL)) {
				3372	put_task_struct(iter.task);
				3373	return 0;
				3374	}
				3375	}
				3376	ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
				3377	return 0;
				3378	}
				3379
				3380	/*
				3381	* proc_tid_comm_permission is a special permission function exclusively
				3382	* used for the node /proc/<pid>/task/<tid>/comm.
				3383	* It bypasses generic permission checks in the case where a task of the same
				3384	* task group attempts to access the node.
				3385	* The rationale behind this is that glibc and bionic access this node for
				3386	* cross thread naming (pthread_set/getname_np(!self)). However, if
				3387	* PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0,
				3388	* which locks out the cross thread naming implementation.
				3389	* This function makes sure that the node is always accessible for members of
				3390	* same thread group.
				3391	*/
				3392	static int proc_tid_comm_permission(struct inode *inode, int mask)
				3393	{
				3394	bool is_same_tgroup;
				3395	struct task_struct *task;
				3396
				3397	task = get_proc_task(inode);
				3398	if (!task)
				3399	return -ESRCH;
				3400	is_same_tgroup = same_thread_group(current, task);
				3401	put_task_struct(task);
				3402
				3403	if (likely(is_same_tgroup && !(mask & MAY_EXEC))) {
				3404	/* This file (/proc/<pid>/task/<tid>/comm) can always be
				3405	* read or written by the members of the corresponding
				3406	* thread group.
				3407	*/
				3408	return 0;
				3409	}
				3410
				3411	return generic_permission(inode, mask);
				3412	}
				3413
				3414	static const struct inode_operations proc_tid_comm_inode_operations = {
				3415	.permission = proc_tid_comm_permission,
				3416	};
				3417
				3418	/*
				3419	* Tasks
				3420	*/
				3421	static const struct pid_entry tid_base_stuff[] = {
				3422	DIR("fd", S_IRUSR\|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
				3423	DIR("fdinfo", S_IRUSR\|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
				3424	DIR("ns", S_IRUSR\|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
				3425	#ifdef CONFIG_NET
				3426	DIR("net", S_IRUGO\|S_IXUGO, proc_net_inode_operations, proc_net_operations),
				3427	#endif
				3428	REG("environ", S_IRUSR, proc_environ_operations),
				3429	REG("auxv", S_IRUSR, proc_auxv_operations),
				3430	ONE("status", S_IRUGO, proc_pid_status),
				3431	ONE("personality", S_IRUSR, proc_pid_personality),
				3432	ONE("limits", S_IRUGO, proc_pid_limits),
				3433	#ifdef CONFIG_SCHED_DEBUG
				3434	REG("sched", S_IRUGO\|S_IWUSR, proc_pid_sched_operations),
				3435	#endif
				3436	NOD("comm", S_IFREG\|S_IRUGO\|S_IWUSR,
				3437	&proc_tid_comm_inode_operations,
				3438	&proc_pid_set_comm_operations, {}),
				3439	#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
				3440	ONE("syscall", S_IRUSR, proc_pid_syscall),
				3441	#endif
				3442	REG("cmdline", S_IRUGO, proc_pid_cmdline_ops),
				3443	ONE("stat", S_IRUGO, proc_tid_stat),
				3444	ONE("statm", S_IRUGO, proc_pid_statm),
				3445	REG("maps", S_IRUGO, proc_pid_maps_operations),
				3446	#ifdef CONFIG_PROC_CHILDREN
				3447	REG("children", S_IRUGO, proc_tid_children_operations),
				3448	#endif
				3449	#ifdef CONFIG_NUMA
				3450	REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
				3451	#endif
				3452	REG("mem", S_IRUSR\|S_IWUSR, proc_mem_operations),
				3453	LNK("cwd", proc_cwd_link),
				3454	LNK("root", proc_root_link),
				3455	LNK("exe", proc_exe_link),
				3456	REG("mounts", S_IRUGO, proc_mounts_operations),
				3457	REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
				3458	#ifdef CONFIG_PROC_PAGE_MONITOR
				3459	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
				3460	REG("smaps", S_IRUGO, proc_pid_smaps_operations),
				3461	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
				3462	REG("pagemap", S_IRUSR, proc_pagemap_operations),
				3463	#endif
				3464	#ifdef CONFIG_SECURITY
				3465	DIR("attr", S_IRUGO\|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
				3466	#endif
				3467	#ifdef CONFIG_KALLSYMS
				3468	ONE("wchan", S_IRUGO, proc_pid_wchan),
				3469	#endif
				3470	#ifdef CONFIG_STACKTRACE
				3471	ONE("stack", S_IRUSR, proc_pid_stack),
				3472	#endif
				3473	#ifdef CONFIG_SCHED_INFO
				3474	ONE("schedstat", S_IRUGO, proc_pid_schedstat),
				3475	#endif
				3476	#ifdef CONFIG_LATENCYTOP
				3477	REG("latency", S_IRUGO, proc_lstats_operations),
				3478	#endif
				3479	#ifdef CONFIG_PROC_PID_CPUSET
				3480	ONE("cpuset", S_IRUGO, proc_cpuset_show),
				3481	#endif
				3482	#ifdef CONFIG_CGROUPS
				3483	ONE("cgroup", S_IRUGO, proc_cgroup_show),
				3484	#endif
				3485	ONE("oom_score", S_IRUGO, proc_oom_score),
				3486	REG("oom_adj", S_IRUGO\|S_IWUSR, proc_oom_adj_operations),
				3487	REG("oom_score_adj", S_IRUGO\|S_IWUSR, proc_oom_score_adj_operations),
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3488	#ifdef CONFIG_AUDIT
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3489	REG("loginuid", S_IWUSR\|S_IRUGO, proc_loginuid_operations),
				3490	REG("sessionid", S_IRUGO, proc_sessionid_operations),
				3491	#endif
				3492	#ifdef CONFIG_FAULT_INJECTION
				3493	REG("make-it-fail", S_IRUGO\|S_IWUSR, proc_fault_inject_operations),
				3494	REG("fail-nth", 0644, proc_fail_nth_operations),
				3495	#endif
				3496	#ifdef CONFIG_TASK_IO_ACCOUNTING
				3497	ONE("io", S_IRUSR, proc_tid_io_accounting),
				3498	#endif
				3499	#ifdef CONFIG_USER_NS
				3500	REG("uid_map", S_IRUGO\|S_IWUSR, proc_uid_map_operations),
				3501	REG("gid_map", S_IRUGO\|S_IWUSR, proc_gid_map_operations),
				3502	REG("projid_map", S_IRUGO\|S_IWUSR, proc_projid_map_operations),
				3503	REG("setgroups", S_IRUGO\|S_IWUSR, proc_setgroups_operations),
				3504	#endif
				3505	#ifdef CONFIG_LIVEPATCH
				3506	ONE("patch_state", S_IRUSR, proc_pid_patch_state),
				3507	#endif
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3508	#ifdef CONFIG_PROC_PID_ARCH_STATUS
				3509	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
				3510	#endif
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3511	};
				3512
				3513	static int proc_tid_base_readdir(struct file file, struct dir_context ctx)
				3514	{
				3515	return proc_pident_readdir(file, ctx,
				3516	tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
				3517	}
				3518
				3519	static struct dentry proc_tid_base_lookup(struct inode dir, struct dentry *dentry, unsigned int flags)
				3520	{
				3521	return proc_pident_lookup(dir, dentry,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3522	tid_base_stuff,
				3523	tid_base_stuff + ARRAY_SIZE(tid_base_stuff));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3524	}
				3525
				3526	static const struct file_operations proc_tid_base_operations = {
				3527	.read = generic_read_dir,
				3528	.iterate_shared = proc_tid_base_readdir,
				3529	.llseek = generic_file_llseek,
				3530	};
				3531
				3532	static const struct inode_operations proc_tid_base_inode_operations = {
				3533	.lookup = proc_tid_base_lookup,
				3534	.getattr = pid_getattr,
				3535	.setattr = proc_setattr,
				3536	};
				3537
				3538	static struct dentry proc_task_instantiate(struct dentry dentry,
				3539	struct task_struct task, const void ptr)
				3540	{
				3541	struct inode *inode;
				3542	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR \| S_IRUGO \| S_IXUGO);
				3543	if (!inode)
				3544	return ERR_PTR(-ENOENT);
				3545
				3546	inode->i_op = &proc_tid_base_inode_operations;
				3547	inode->i_fop = &proc_tid_base_operations;
				3548	inode->i_flags \|= S_IMMUTABLE;
				3549
				3550	set_nlink(inode, nlink_tid);
				3551	pid_update_inode(task, inode);
				3552
				3553	d_set_d_op(dentry, &pid_dentry_operations);
				3554	return d_splice_alias(inode, dentry);
				3555	}
				3556
				3557	static struct dentry proc_task_lookup(struct inode dir, struct dentry * dentry, unsigned int flags)
				3558	{
				3559	struct task_struct *task;
				3560	struct task_struct *leader = get_proc_task(dir);
				3561	unsigned tid;
				3562	struct pid_namespace *ns;
				3563	struct dentry *result = ERR_PTR(-ENOENT);
				3564
				3565	if (!leader)
				3566	goto out_no_task;
				3567
				3568	tid = name_to_int(&dentry->d_name);
				3569	if (tid == ~0U)
				3570	goto out;
				3571
				3572	ns = dentry->d_sb->s_fs_info;
				3573	rcu_read_lock();
				3574	task = find_task_by_pid_ns(tid, ns);
				3575	if (task)
				3576	get_task_struct(task);
				3577	rcu_read_unlock();
				3578	if (!task)
				3579	goto out;
				3580	if (!same_thread_group(leader, task))
				3581	goto out_drop_task;
				3582
				3583	result = proc_task_instantiate(dentry, task, NULL);
				3584	out_drop_task:
				3585	put_task_struct(task);
				3586	out:
				3587	put_task_struct(leader);
				3588	out_no_task:
				3589	return result;
				3590	}
				3591
				3592	/*
				3593	* Find the first tid of a thread group to return to user space.
				3594	*
				3595	* Usually this is just the thread group leader, but if the users
				3596	* buffer was too small or there was a seek into the middle of the
				3597	* directory we have more work todo.
				3598	*
				3599	* In the case of a short read we start with find_task_by_pid.
				3600	*
				3601	* In the case of a seek we start with the leader and walk nr
				3602	* threads past it.
				3603	*/
				3604	static struct task_struct first_tid(struct pid pid, int tid, loff_t f_pos,
				3605	struct pid_namespace *ns)
				3606	{
				3607	struct task_struct pos, task;
				3608	unsigned long nr = f_pos;
				3609
				3610	if (nr != f_pos) /* 32bit overflow? */
				3611	return NULL;
				3612
				3613	rcu_read_lock();
				3614	task = pid_task(pid, PIDTYPE_PID);
				3615	if (!task)
				3616	goto fail;
				3617
				3618	/* Attempt to start with the tid of a thread */
				3619	if (tid && nr) {
				3620	pos = find_task_by_pid_ns(tid, ns);
				3621	if (pos && same_thread_group(pos, task))
				3622	goto found;
				3623	}
				3624
				3625	/* If nr exceeds the number of threads there is nothing todo */
				3626	if (nr >= get_nr_threads(task))
				3627	goto fail;
				3628
				3629	/* If we haven't found our starting place yet start
				3630	* with the leader and walk nr threads forward.
				3631	*/
				3632	pos = task = task->group_leader;
				3633	do {
				3634	if (!nr--)
				3635	goto found;
				3636	} while_each_thread(task, pos);
				3637	fail:
				3638	pos = NULL;
				3639	goto out;
				3640	found:
				3641	get_task_struct(pos);
				3642	out:
				3643	rcu_read_unlock();
				3644	return pos;
				3645	}
				3646
				3647	/*
				3648	* Find the next thread in the thread list.
				3649	* Return NULL if there is an error or no next thread.
				3650	*
				3651	* The reference to the input task_struct is released.
				3652	*/
				3653	static struct task_struct next_tid(struct task_struct start)
				3654	{
				3655	struct task_struct *pos = NULL;
				3656	rcu_read_lock();
				3657	if (pid_alive(start)) {
				3658	pos = next_thread(start);
				3659	if (thread_group_leader(pos))
				3660	pos = NULL;
				3661	else
				3662	get_task_struct(pos);
				3663	}
				3664	rcu_read_unlock();
				3665	put_task_struct(start);
				3666	return pos;
				3667	}
				3668
				3669	/* for the /proc/TGID/task/ directories */
				3670	static int proc_task_readdir(struct file file, struct dir_context ctx)
				3671	{
				3672	struct inode *inode = file_inode(file);
				3673	struct task_struct *task;
				3674	struct pid_namespace *ns;
				3675	int tid;
				3676
				3677	if (proc_inode_is_dead(inode))
				3678	return -ENOENT;
				3679
				3680	if (!dir_emit_dots(file, ctx))
				3681	return 0;
				3682
				3683	/* f_version caches the tgid value that the last readdir call couldn't
				3684	* return. lseek aka telldir automagically resets f_version to 0.
				3685	*/
				3686	ns = proc_pid_ns(inode);
				3687	tid = (int)file->f_version;
				3688	file->f_version = 0;
				3689	for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
				3690	task;
				3691	task = next_tid(task), ctx->pos++) {
				3692	char name[10 + 1];
				3693	unsigned int len;
				3694	tid = task_pid_nr_ns(task, ns);
				3695	len = snprintf(name, sizeof(name), "%u", tid);
				3696	if (!proc_fill_cache(file, ctx, name, len,
				3697	proc_task_instantiate, task, NULL)) {
				3698	/* returning this tgid failed, save it as the first
				3699	* pid for the next readir call */
				3700	file->f_version = (u64)tid;
				3701	put_task_struct(task);
				3702	break;
				3703	}
				3704	}
				3705
				3706	return 0;
				3707	}
				3708
				3709	static int proc_task_getattr(const struct path path, struct kstat stat,
				3710	u32 request_mask, unsigned int query_flags)
				3711	{
				3712	struct inode *inode = d_inode(path->dentry);
				3713	struct task_struct *p = get_proc_task(inode);
				3714	generic_fillattr(inode, stat);
				3715
				3716	if (p) {
				3717	stat->nlink += get_nr_threads(p);
				3718	put_task_struct(p);
				3719	}
				3720
				3721	return 0;
				3722	}
				3723
				3724	static const struct inode_operations proc_task_inode_operations = {
				3725	.lookup = proc_task_lookup,
				3726	.getattr = proc_task_getattr,
				3727	.setattr = proc_setattr,
				3728	.permission = proc_pid_permission,
				3729	};
				3730
				3731	static const struct file_operations proc_task_operations = {
				3732	.read = generic_read_dir,
				3733	.iterate_shared = proc_task_readdir,
				3734	.llseek = generic_file_llseek,
				3735	};
				3736
				3737	void __init set_proc_pid_nlink(void)
				3738	{
				3739	nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
				3740	nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
				3741	}