Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index cb5629b..c930001 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -23,7 +23,7 @@
/proc" or the equivalent line in /etc/fstab does the job.
The /proc file system is explained in the file
- <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage
+ <file:Documentation/filesystems/proc.rst> and on the proc(5) manpage
("man 5 proc").
This option will enlarge your kernel by about 67 KB. Several
@@ -42,8 +42,8 @@
bool "/proc/vmcore support"
depends on PROC_FS && CRASH_DUMP
default y
- help
- Exports the dump image of crashed kernel in ELF format.
+ help
+ Exports the dump image of crashed kernel in ELF format.
config PROC_VMCORE_DEVICE_DUMP
bool "Device Hardware/Firmware Log Collection"
@@ -66,13 +66,13 @@
depends on PROC_FS
select SYSCTL
default y
- ---help---
+ help
The sysctl interface provides a means of dynamically changing
certain kernel parameters and variables on the fly without requiring
a recompile of the kernel or reboot of the system. The primary
interface is through /proc/sys. If you say Y here a tree of
modifiable sysctl entries will be generated beneath the
- /proc/sys directory. They are explained in the files
+ /proc/sys directory. They are explained in the files
in <file:Documentation/admin-guide/sysctl/>. Note that enabling this
option will enlarge the kernel by at least 8 KB.
@@ -88,14 +88,14 @@
Various /proc files exist to monitor process memory utilization:
/proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
/proc/kpagecount, and /proc/kpageflags. Disabling these
- interfaces will reduce the size of the kernel by approximately 4kb.
+ interfaces will reduce the size of the kernel by approximately 4kb.
config PROC_CHILDREN
bool "Include /proc/<pid>/task/<tid>/children file"
default n
help
Provides a fast way to retrieve first level children pids of a task. See
- <file:Documentation/filesystems/proc.txt> for more information.
+ <file:Documentation/filesystems/proc.rst> for more information.
Say Y if you are running any user-space software which takes benefit from
this interface. For example, rkt is such a piece of software.
@@ -103,3 +103,7 @@
config PROC_PID_ARCH_STATUS
def_bool n
depends on PROC_FS
+
+config PROC_CPU_RESCTRL
+ def_bool n
+ depends on PROC_FS
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index ead487e..bd08616 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -33,3 +33,4 @@
proc-$(CONFIG_PROC_VMCORE) += vmcore.o
proc-$(CONFIG_PRINTK) += kmsg.o
proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
+proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 46dcb6f..18a4588 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -92,7 +92,6 @@
#include <linux/user_namespace.h>
#include <linux/fs_struct.h>
-#include <asm/pgtable.h>
#include <asm/processor.h>
#include "internal.h"
@@ -248,8 +247,8 @@
seq_putc(m, '\n');
}
-static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
- sigset_t *catch)
+static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *sigign,
+ sigset_t *sigcatch)
{
struct k_sigaction *k;
int i;
@@ -257,9 +256,9 @@
k = p->sighand->action;
for (i = 1; i <= _NSIG; ++i, ++k) {
if (k->sa.sa_handler == SIG_IGN)
- sigaddset(ign, i);
+ sigaddset(sigign, i);
else if (k->sa.sa_handler != SIG_DFL)
- sigaddset(catch, i);
+ sigaddset(sigcatch, i);
}
}
@@ -342,6 +341,10 @@
seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p));
#ifdef CONFIG_SECCOMP
seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
+#ifdef CONFIG_SECCOMP_FILTER
+ seq_put_decimal_ull(m, "\nSeccomp_filters:\t",
+ atomic_read(&p->seccomp.filter_count));
+#endif
#endif
seq_puts(m, "\nSpeculation_Store_Bypass:\t");
switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
@@ -533,7 +536,7 @@
nice = task_nice(task);
/* convert nsec -> ticks */
- start_time = nsec_to_clock_t(task->real_start_time);
+ start_time = nsec_to_clock_t(task->start_boottime);
seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
seq_puts(m, " (");
@@ -635,28 +638,35 @@
int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
struct mm_struct *mm = get_task_mm(task);
if (mm) {
+ unsigned long size;
+ unsigned long resident = 0;
+ unsigned long shared = 0;
+ unsigned long text = 0;
+ unsigned long data = 0;
+
size = task_statm(mm, &shared, &text, &data, &resident);
mmput(mm);
- }
- /*
- * For quick read, open code by putting numbers directly
- * expected format is
- * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
- * size, resident, shared, text, data);
- */
- seq_put_decimal_ull(m, "", size);
- seq_put_decimal_ull(m, " ", resident);
- seq_put_decimal_ull(m, " ", shared);
- seq_put_decimal_ull(m, " ", text);
- seq_put_decimal_ull(m, " ", 0);
- seq_put_decimal_ull(m, " ", data);
- seq_put_decimal_ull(m, " ", 0);
- seq_putc(m, '\n');
+ /*
+ * For quick read, open code by putting numbers directly
+ * expected format is
+ * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
+ * size, resident, shared, text, data);
+ */
+ seq_put_decimal_ull(m, "", size);
+ seq_put_decimal_ull(m, " ", resident);
+ seq_put_decimal_ull(m, " ", shared);
+ seq_put_decimal_ull(m, " ", text);
+ seq_put_decimal_ull(m, " ", 0);
+ seq_put_decimal_ull(m, " ", data);
+ seq_put_decimal_ull(m, " ", 0);
+ seq_putc(m, '\n');
+ } else {
+ seq_write(m, "0 0 0 0 0 0 0\n", 14);
+ }
return 0;
}
@@ -721,7 +731,7 @@
{
struct inode *inode = file_inode(seq->file);
- seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode)));
+ seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode->i_sb)));
return 0;
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5a187e9..5d52aea 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -94,6 +94,8 @@
#include <linux/sched/debug.h>
#include <linux/sched/stat.h>
#include <linux/posix-timers.h>
+#include <linux/time_namespace.h>
+#include <linux/resctrl.h>
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"
@@ -704,13 +706,21 @@
* May current process learn task's sched/cmdline info (for hide_pid_min=1)
* or euid/egid (for hide_pid_min=2)?
*/
-static bool has_pid_permissions(struct pid_namespace *pid,
+static bool has_pid_permissions(struct proc_fs_info *fs_info,
struct task_struct *task,
- int hide_pid_min)
+ enum proc_hidepid hide_pid_min)
{
- if (pid->hide_pid < hide_pid_min)
+ /*
+ * If 'hidpid' mount option is set force a ptrace check,
+ * we indicate that we are using a filesystem syscall
+ * by passing PTRACE_MODE_READ_FSCREDS
+ */
+ if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE)
+ return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
+
+ if (fs_info->hide_pid < hide_pid_min)
return true;
- if (in_group_p(pid->pid_gid))
+ if (in_group_p(fs_info->pid_gid))
return true;
return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
}
@@ -718,18 +728,18 @@
static int proc_pid_permission(struct inode *inode, int mask)
{
- struct pid_namespace *pid = proc_pid_ns(inode);
+ struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
struct task_struct *task;
bool has_perms;
task = get_proc_task(inode);
if (!task)
return -ESRCH;
- has_perms = has_pid_permissions(pid, task, HIDEPID_NO_ACCESS);
+ has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS);
put_task_struct(task);
if (!has_perms) {
- if (pid->hide_pid == HIDEPID_INVISIBLE) {
+ if (fs_info->hide_pid == HIDEPID_INVISIBLE) {
/*
* Let's make getdents(), stat(), and open()
* consistent with each other. If a process
@@ -753,7 +763,7 @@
static int proc_single_show(struct seq_file *m, void *v)
{
struct inode *inode = m->private;
- struct pid_namespace *ns = proc_pid_ns(inode);
+ struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
struct pid *pid = proc_pid(inode);
struct task_struct *task;
int ret;
@@ -1039,6 +1049,8 @@
oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
OOM_SCORE_ADJ_MAX;
put_task_struct(task);
+ if (oom_adj > OOM_ADJUST_MAX)
+ oom_adj = OOM_ADJUST_MAX;
len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
return simple_read_from_buffer(buf, count, ppos, buffer, len);
}
@@ -1258,6 +1270,10 @@
kuid_t kloginuid;
int rv;
+ /* Don't let kthreads write their own loginuid */
+ if (current->flags & PF_KTHREAD)
+ return -EPERM;
+
rcu_read_lock();
if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
rcu_read_unlock();
@@ -1421,7 +1437,7 @@
static int sched_show(struct seq_file *m, void *v)
{
struct inode *inode = m->private;
- struct pid_namespace *ns = proc_pid_ns(inode);
+ struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
struct task_struct *p;
p = get_proc_task(inode);
@@ -1541,6 +1557,108 @@
#endif /* CONFIG_SCHED_AUTOGROUP */
+#ifdef CONFIG_TIME_NS
+static int timens_offsets_show(struct seq_file *m, void *v)
+{
+ struct task_struct *p;
+
+ p = get_proc_task(file_inode(m->file));
+ if (!p)
+ return -ESRCH;
+ proc_timens_show_offsets(p, m);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static ssize_t timens_offsets_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct inode *inode = file_inode(file);
+ struct proc_timens_offset offsets[2];
+ char *kbuf = NULL, *pos, *next_line;
+ struct task_struct *p;
+ int ret, noffsets;
+
+ /* Only allow < page size writes at the beginning of the file */
+ if ((*ppos != 0) || (count >= PAGE_SIZE))
+ return -EINVAL;
+
+ /* Slurp in the user data */
+ kbuf = memdup_user_nul(buf, count);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+
+ /* Parse the user data */
+ ret = -EINVAL;
+ noffsets = 0;
+ for (pos = kbuf; pos; pos = next_line) {
+ struct proc_timens_offset *off = &offsets[noffsets];
+ char clock[10];
+ int err;
+
+ /* Find the end of line and ensure we don't look past it */
+ next_line = strchr(pos, '\n');
+ if (next_line) {
+ *next_line = '\0';
+ next_line++;
+ if (*next_line == '\0')
+ next_line = NULL;
+ }
+
+ err = sscanf(pos, "%9s %lld %lu", clock,
+ &off->val.tv_sec, &off->val.tv_nsec);
+ if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC)
+ goto out;
+
+ clock[sizeof(clock) - 1] = 0;
+ if (strcmp(clock, "monotonic") == 0 ||
+ strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0)
+ off->clockid = CLOCK_MONOTONIC;
+ else if (strcmp(clock, "boottime") == 0 ||
+ strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0)
+ off->clockid = CLOCK_BOOTTIME;
+ else
+ goto out;
+
+ noffsets++;
+ if (noffsets == ARRAY_SIZE(offsets)) {
+ if (next_line)
+ count = next_line - kbuf;
+ break;
+ }
+ }
+
+ ret = -ESRCH;
+ p = get_proc_task(inode);
+ if (!p)
+ goto out;
+ ret = proc_timens_set_offset(file, p, offsets, noffsets);
+ put_task_struct(p);
+ if (ret)
+ goto out;
+
+ ret = count;
+out:
+ kfree(kbuf);
+ return ret;
+}
+
+static int timens_offsets_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, timens_offsets_show, inode);
+}
+
+static const struct file_operations proc_timens_offsets_operations = {
+ .open = timens_offsets_open,
+ .read = seq_read,
+ .write = timens_offsets_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* CONFIG_TIME_NS */
+
static ssize_t comm_write(struct file *file, const char __user *buf,
size_t count, loff_t *offset)
{
@@ -1634,8 +1752,7 @@
if (error)
goto out;
- nd_jump_link(&path);
- return NULL;
+ error = nd_jump_link(&path);
out:
return ERR_PTR(error);
}
@@ -1751,11 +1868,25 @@
*rgid = gid;
}
+void proc_pid_evict_inode(struct proc_inode *ei)
+{
+ struct pid *pid = ei->pid;
+
+ if (S_ISDIR(ei->vfs_inode.i_mode)) {
+ spin_lock(&pid->lock);
+ hlist_del_init_rcu(&ei->sibling_inodes);
+ spin_unlock(&pid->lock);
+ }
+
+ put_pid(pid);
+}
+
struct inode *proc_pid_make_inode(struct super_block * sb,
struct task_struct *task, umode_t mode)
{
struct inode * inode;
struct proc_inode *ei;
+ struct pid *pid;
/* We need a new inode */
@@ -1773,10 +1904,18 @@
/*
* grab the reference to task.
*/
- ei->pid = get_task_pid(task, PIDTYPE_PID);
- if (!ei->pid)
+ pid = get_task_pid(task, PIDTYPE_PID);
+ if (!pid)
goto out_unlock;
+ /* Let the pid remember us for quick removal */
+ ei->pid = pid;
+ if (S_ISDIR(mode)) {
+ spin_lock(&pid->lock);
+ hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
+ spin_unlock(&pid->lock);
+ }
+
task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
security_task_to_inode(task, inode);
@@ -1792,7 +1931,7 @@
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
- struct pid_namespace *pid = proc_pid_ns(inode);
+ struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
struct task_struct *task;
generic_fillattr(inode, stat);
@@ -1802,7 +1941,7 @@
rcu_read_lock();
task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task) {
- if (!has_pid_permissions(pid, task, HIDEPID_INVISIBLE)) {
+ if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) {
rcu_read_unlock();
/*
* This doesn't prevent learning whether PID exists,
@@ -1987,11 +2126,11 @@
goto out;
if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
- status = down_read_killable(&mm->mmap_sem);
+ status = mmap_read_lock_killable(mm);
if (!status) {
exact_vma_exists = !!find_exact_vma(mm, vm_start,
vm_end);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
}
}
@@ -2038,7 +2177,7 @@
if (rc)
goto out_mmput;
- rc = down_read_killable(&mm->mmap_sem);
+ rc = mmap_read_lock_killable(mm);
if (rc)
goto out_mmput;
@@ -2049,7 +2188,7 @@
path_get(path);
rc = 0;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
out_mmput:
mmput(mm);
@@ -2064,16 +2203,16 @@
};
/*
- * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
- * symlinks may be used to bypass permissions on ancestor directories in the
- * path to the file in question.
+ * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links, due
+ * to concerns about how the symlinks may be used to bypass permissions on
+ * ancestor directories in the path to the file in question.
*/
static const char *
proc_map_files_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- if (!capable(CAP_SYS_ADMIN))
+ if (!checkpoint_restore_ns_capable(&init_user_ns))
return ERR_PTR(-EPERM);
return proc_pid_get_link(dentry, inode, done);
@@ -2139,7 +2278,7 @@
goto out_put_task;
result = ERR_PTR(-EINTR);
- if (down_read_killable(&mm->mmap_sem))
+ if (mmap_read_lock_killable(mm))
goto out_put_mm;
result = ERR_PTR(-ENOENT);
@@ -2152,7 +2291,7 @@
(void *)(unsigned long)vma->vm_file->f_mode);
out_no_vma:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
out_put_mm:
mmput(mm);
out_put_task:
@@ -2197,7 +2336,7 @@
if (!mm)
goto out_put_task;
- ret = down_read_killable(&mm->mmap_sem);
+ ret = mmap_read_lock_killable(mm);
if (ret) {
mmput(mm);
goto out_put_task;
@@ -2208,11 +2347,11 @@
/*
* We need two passes here:
*
- * 1) Collect vmas of mapped files with mmap_sem taken
- * 2) Release mmap_sem and instantiate entries
+ * 1) Collect vmas of mapped files with mmap_lock taken
+ * 2) Release mmap_lock and instantiate entries
*
* otherwise we get lockdep complained, since filldir()
- * routine might require mmap_sem taken in might_fault().
+ * routine might require mmap_lock taken in might_fault().
*/
for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
@@ -2224,7 +2363,7 @@
p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
if (!p) {
ret = -ENOMEM;
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
mmput(mm);
goto out_put_task;
}
@@ -2233,7 +2372,7 @@
p->end = vma->vm_end;
p->mode = vma->vm_file->f_mode;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
mmput(mm);
for (i = 0; i < nr_files; i++) {
@@ -2353,7 +2492,7 @@
return -ENOMEM;
tp->pid = proc_pid(inode);
- tp->ns = proc_pid_ns(inode);
+ tp->ns = proc_pid_ns(inode->i_sb);
return 0;
}
@@ -2666,6 +2805,15 @@
LSM_DIR_OPS(smack);
#endif
+#ifdef CONFIG_SECURITY_APPARMOR
+static const struct pid_entry apparmor_attr_dir_stuff[] = {
+ ATTR("apparmor", "current", 0666),
+ ATTR("apparmor", "prev", 0444),
+ ATTR("apparmor", "exec", 0666),
+};
+LSM_DIR_OPS(apparmor);
+#endif
+
static const struct pid_entry attr_dir_stuff[] = {
ATTR(NULL, "current", 0666),
ATTR(NULL, "prev", 0444),
@@ -2677,6 +2825,10 @@
DIR("smack", 0555,
proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
#endif
+#ifdef CONFIG_SECURITY_APPARMOR
+ DIR("apparmor", 0555,
+ proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops),
+#endif
};
static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
@@ -3037,6 +3189,9 @@
#ifdef CONFIG_SCHED_AUTOGROUP
REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
#endif
+#ifdef CONFIG_TIME_NS
+ REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations),
+#endif
REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
ONE("syscall", S_IRUSR, proc_pid_syscall),
@@ -3082,6 +3237,9 @@
#ifdef CONFIG_CGROUPS
ONE("cgroup", S_IRUGO, proc_cgroup_show),
#endif
+#ifdef CONFIG_PROC_CPU_RESCTRL
+ ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
+#endif
ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
@@ -3154,90 +3312,28 @@
.permission = proc_pid_permission,
};
-static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
-{
- struct dentry *dentry, *leader, *dir;
- char buf[10 + 1];
- struct qstr name;
-
- name.name = buf;
- name.len = snprintf(buf, sizeof(buf), "%u", pid);
- /* no ->d_hash() rejects on procfs */
- dentry = d_hash_and_lookup(mnt->mnt_root, &name);
- if (dentry) {
- d_invalidate(dentry);
- dput(dentry);
- }
-
- if (pid == tgid)
- return;
-
- name.name = buf;
- name.len = snprintf(buf, sizeof(buf), "%u", tgid);
- leader = d_hash_and_lookup(mnt->mnt_root, &name);
- if (!leader)
- goto out;
-
- name.name = "task";
- name.len = strlen(name.name);
- dir = d_hash_and_lookup(leader, &name);
- if (!dir)
- goto out_put_leader;
-
- name.name = buf;
- name.len = snprintf(buf, sizeof(buf), "%u", pid);
- dentry = d_hash_and_lookup(dir, &name);
- if (dentry) {
- d_invalidate(dentry);
- dput(dentry);
- }
-
- dput(dir);
-out_put_leader:
- dput(leader);
-out:
- return;
-}
-
/**
- * proc_flush_task - Remove dcache entries for @task from the /proc dcache.
- * @task: task that should be flushed.
+ * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache.
+ * @pid: pid that should be flushed.
*
- * When flushing dentries from proc, one needs to flush them from global
- * proc (proc_mnt) and from all the namespaces' procs this task was seen
- * in. This call is supposed to do all of this job.
- *
- * Looks in the dcache for
- * /proc/@pid
- * /proc/@tgid/task/@pid
- * if either directory is present flushes it and all of it'ts children
- * from the dcache.
+ * This function walks a list of inodes (that belong to any proc
+ * filesystem) that are attached to the pid and flushes them from
+ * the dentry cache.
*
* It is safe and reasonable to cache /proc entries for a task until
* that task exits. After that they just clog up the dcache with
* useless entries, possibly causing useful dcache entries to be
- * flushed instead. This routine is proved to flush those useless
- * dcache entries at process exit time.
+ * flushed instead. This routine is provided to flush those useless
+ * dcache entries when a process is reaped.
*
* NOTE: This routine is just an optimization so it does not guarantee
- * that no dcache entries will exist at process exit time it
- * just makes it very unlikely that any will persist.
+ * that no dcache entries will exist after a process is reaped
+ * it just makes it very unlikely that any will persist.
*/
-void proc_flush_task(struct task_struct *task)
+void proc_flush_pid(struct pid *pid)
{
- int i;
- struct pid *pid, *tgid;
- struct upid *upid;
-
- pid = task_pid(task);
- tgid = task_tgid(task);
-
- for (i = 0; i <= pid->level; i++) {
- upid = &pid->numbers[i];
- proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
- tgid->numbers[i].nr);
- }
+ proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock);
}
static struct dentry *proc_pid_instantiate(struct dentry * dentry,
@@ -3264,6 +3360,7 @@
{
struct task_struct *task;
unsigned tgid;
+ struct proc_fs_info *fs_info;
struct pid_namespace *ns;
struct dentry *result = ERR_PTR(-ENOENT);
@@ -3271,7 +3368,8 @@
if (tgid == ~0U)
goto out;
- ns = dentry->d_sb->s_fs_info;
+ fs_info = proc_sb_info(dentry->d_sb);
+ ns = fs_info->pid_ns;
rcu_read_lock();
task = find_task_by_pid_ns(tgid, ns);
if (task)
@@ -3280,7 +3378,14 @@
if (!task)
goto out;
+ /* Limit procfs to only ptraceable tasks */
+ if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) {
+ if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS))
+ goto out_put_task;
+ }
+
result = proc_pid_instantiate(dentry, task, NULL);
+out_put_task:
put_task_struct(task);
out:
return result;
@@ -3306,20 +3411,8 @@
pid = find_ge_pid(iter.tgid, ns);
if (pid) {
iter.tgid = pid_nr_ns(pid, ns);
- iter.task = pid_task(pid, PIDTYPE_PID);
- /* What we to know is if the pid we have find is the
- * pid of a thread_group_leader. Testing for task
- * being a thread_group_leader is the obvious thing
- * todo but there is a window when it fails, due to
- * the pid transfer logic in de_thread.
- *
- * So we perform the straight forward test of seeing
- * if the pid we have found is the pid of a thread
- * group leader, and don't worry if the task we have
- * found doesn't happen to be a thread group leader.
- * As we don't care in the case of readdir.
- */
- if (!iter.task || !has_group_leader_pid(iter.task)) {
+ iter.task = pid_task(pid, PIDTYPE_TGID);
+ if (!iter.task) {
iter.tgid += 1;
goto retry;
}
@@ -3335,20 +3428,21 @@
int proc_pid_readdir(struct file *file, struct dir_context *ctx)
{
struct tgid_iter iter;
- struct pid_namespace *ns = proc_pid_ns(file_inode(file));
+ struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb);
+ struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb);
loff_t pos = ctx->pos;
if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
return 0;
if (pos == TGID_OFFSET - 2) {
- struct inode *inode = d_inode(ns->proc_self);
+ struct inode *inode = d_inode(fs_info->proc_self);
if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
return 0;
ctx->pos = pos = pos + 1;
}
if (pos == TGID_OFFSET - 1) {
- struct inode *inode = d_inode(ns->proc_thread_self);
+ struct inode *inode = d_inode(fs_info->proc_thread_self);
if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
return 0;
ctx->pos = pos = pos + 1;
@@ -3362,7 +3456,7 @@
unsigned int len;
cond_resched();
- if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE))
+ if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE))
continue;
len = snprintf(name, sizeof(name), "%u", iter.tgid);
@@ -3482,6 +3576,9 @@
#ifdef CONFIG_CGROUPS
ONE("cgroup", S_IRUGO, proc_cgroup_show),
#endif
+#ifdef CONFIG_PROC_CPU_RESCTRL
+ ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
+#endif
ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
@@ -3559,6 +3656,7 @@
struct task_struct *task;
struct task_struct *leader = get_proc_task(dir);
unsigned tid;
+ struct proc_fs_info *fs_info;
struct pid_namespace *ns;
struct dentry *result = ERR_PTR(-ENOENT);
@@ -3569,7 +3667,8 @@
if (tid == ~0U)
goto out;
- ns = dentry->d_sb->s_fs_info;
+ fs_info = proc_sb_info(dentry->d_sb);
+ ns = fs_info->pid_ns;
rcu_read_lock();
task = find_task_by_pid_ns(tid, ns);
if (task)
@@ -3683,7 +3782,7 @@
/* f_version caches the tgid value that the last readdir call couldn't
* return. lseek aka telldir automagically resets f_version to 0.
*/
- ns = proc_pid_ns(inode);
+ ns = proc_pid_ns(inode->i_sb);
tid = (int)file->f_version;
file->f_version = 0;
for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
new file mode 100644
index 0000000..ad31ec4
--- /dev/null
+++ b/fs/proc/bootconfig.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * /proc/bootconfig - Extra boot configuration
+ */
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/printk.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/bootconfig.h>
+#include <linux/slab.h>
+
+static char *saved_boot_config;
+
+static int boot_config_proc_show(struct seq_file *m, void *v)
+{
+ if (saved_boot_config)
+ seq_puts(m, saved_boot_config);
+ return 0;
+}
+
+/* Rest size of buffer */
+#define rest(dst, end) ((end) > (dst) ? (end) - (dst) : 0)
+
+/* Return the needed total length if @size is 0 */
+static int __init copy_xbc_key_value_list(char *dst, size_t size)
+{
+ struct xbc_node *leaf, *vnode;
+ char *key, *end = dst + size;
+ const char *val;
+ char q;
+ int ret = 0;
+
+ key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL);
+
+ xbc_for_each_key_value(leaf, val) {
+ ret = xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX);
+ if (ret < 0)
+ break;
+ ret = snprintf(dst, rest(dst, end), "%s = ", key);
+ if (ret < 0)
+ break;
+ dst += ret;
+ vnode = xbc_node_get_child(leaf);
+ if (vnode) {
+ xbc_array_for_each_value(vnode, val) {
+ if (strchr(val, '"'))
+ q = '\'';
+ else
+ q = '"';
+ ret = snprintf(dst, rest(dst, end), "%c%s%c%s",
+ q, val, q, vnode->next ? ", " : "\n");
+ if (ret < 0)
+ goto out;
+ dst += ret;
+ }
+ } else {
+ ret = snprintf(dst, rest(dst, end), "\"\"\n");
+ if (ret < 0)
+ break;
+ dst += ret;
+ }
+ }
+out:
+ kfree(key);
+
+ return ret < 0 ? ret : dst - (end - size);
+}
+
+static int __init proc_boot_config_init(void)
+{
+ int len;
+
+ len = copy_xbc_key_value_list(NULL, 0);
+ if (len < 0)
+ return len;
+
+ if (len > 0) {
+ saved_boot_config = kzalloc(len + 1, GFP_KERNEL);
+ if (!saved_boot_config)
+ return -ENOMEM;
+
+ len = copy_xbc_key_value_list(saved_boot_config, len + 1);
+ if (len < 0) {
+ kfree(saved_boot_config);
+ return len;
+ }
+ }
+
+ proc_create_single("bootconfig", 0, NULL, boot_config_proc_show);
+
+ return 0;
+}
+fs_initcall(proc_boot_config_init);
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index 96f1087..419760f 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -16,16 +16,17 @@
return seq_open(file, &cpuinfo_op);
}
-static const struct file_operations proc_cpuinfo_operations = {
- .open = cpuinfo_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
+static const struct proc_ops cpuinfo_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_open = cpuinfo_open,
+ .proc_read_iter = seq_read_iter,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
};
static int __init proc_cpuinfo_init(void)
{
- proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
+ proc_create("cpuinfo", 0, NULL, &cpuinfo_proc_ops);
return 0;
}
fs_initcall(proc_cpuinfo_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 37d38697..837971e 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -3,6 +3,7 @@
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/blkdev.h>
static int devinfo_show(struct seq_file *f, void *v)
{
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8c3dbe1..09e4d8a 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -163,7 +163,6 @@
{
const char *cp = name, *next;
struct proc_dir_entry *de;
- unsigned int len;
de = *ret;
if (!de)
@@ -174,13 +173,12 @@
if (!next)
break;
- len = next - cp;
- de = pde_subdir_find(de, cp, len);
+ de = pde_subdir_find(de, cp, next - cp);
if (!de) {
WARN(1, "name '%s'\n", name);
return -ENOENT;
}
- cp += len + 1;
+ cp = next + 1;
}
*residual = cp;
*ret = de;
@@ -271,6 +269,11 @@
struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
+ struct proc_fs_info *fs_info = proc_sb_info(dir->i_sb);
+
+ if (fs_info->pidonly == PROC_PIDONLY_ON)
+ return ERR_PTR(-ENOENT);
+
return proc_lookup_de(dir, dentry, PDE(dir));
}
@@ -327,6 +330,10 @@
int proc_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
+ struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
+
+ if (fs_info->pidonly == PROC_PIDONLY_ON)
+ return 1;
return proc_readdir_de(file, ctx, PDE(inode));
}
@@ -485,7 +492,7 @@
ent = __proc_create(&parent, name, S_IFDIR | mode, 2);
if (ent) {
ent->data = data;
- ent->proc_fops = &proc_dir_operations;
+ ent->proc_dir_ops = &proc_dir_operations;
ent->proc_iops = &proc_dir_inode_operations;
if (force_lookup) {
pde_force_lookup(ent);
@@ -525,7 +532,7 @@
ent = __proc_create(&parent, name, mode, 2);
if (ent) {
ent->data = NULL;
- ent->proc_fops = NULL;
+ ent->proc_dir_ops = NULL;
ent->proc_iops = NULL;
ent = proc_register(parent, ent);
}
@@ -553,27 +560,32 @@
return p;
}
+static inline void pde_set_flags(struct proc_dir_entry *pde)
+{
+ if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
+ pde->flags |= PROC_ENTRY_PERMANENT;
+}
+
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
- const struct file_operations *proc_fops, void *data)
+ const struct proc_ops *proc_ops, void *data)
{
struct proc_dir_entry *p;
- BUG_ON(proc_fops == NULL);
-
p = proc_create_reg(name, mode, &parent, data);
if (!p)
return NULL;
- p->proc_fops = proc_fops;
+ p->proc_ops = proc_ops;
+ pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_data);
struct proc_dir_entry *proc_create(const char *name, umode_t mode,
struct proc_dir_entry *parent,
- const struct file_operations *proc_fops)
+ const struct proc_ops *proc_ops)
{
- return proc_create_data(name, mode, parent, proc_fops, NULL);
+ return proc_create_data(name, mode, parent, proc_ops, NULL);
}
EXPORT_SYMBOL(proc_create);
@@ -595,11 +607,12 @@
return seq_release(inode, file);
}
-static const struct file_operations proc_seq_fops = {
- .open = proc_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = proc_seq_release,
+static const struct proc_ops proc_seq_ops = {
+ /* not permanent -- can call into arbitrary seq_operations */
+ .proc_open = proc_seq_open,
+ .proc_read_iter = seq_read_iter,
+ .proc_lseek = seq_lseek,
+ .proc_release = proc_seq_release,
};
struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
@@ -611,7 +624,7 @@
p = proc_create_reg(name, mode, &parent, data);
if (!p)
return NULL;
- p->proc_fops = &proc_seq_fops;
+ p->proc_ops = &proc_seq_ops;
p->seq_ops = ops;
p->state_size = state_size;
return proc_register(parent, p);
@@ -625,11 +638,12 @@
return single_open(file, de->single_show, de->data);
}
-static const struct file_operations proc_single_fops = {
- .open = proc_single_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
+static const struct proc_ops proc_single_ops = {
+ /* not permanent -- can call into arbitrary ->single_show */
+ .proc_open = proc_single_open,
+ .proc_read_iter = seq_read_iter,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
};
struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
@@ -641,7 +655,7 @@
p = proc_create_reg(name, mode, &parent, data);
if (!p)
return NULL;
- p->proc_fops = &proc_single_fops;
+ p->proc_ops = &proc_single_ops;
p->single_show = show;
return proc_register(parent, p);
}
@@ -686,9 +700,13 @@
de = pde_subdir_find(parent, fn, len);
if (de) {
- rb_erase(&de->subdir_node, &parent->subdir);
- if (S_ISDIR(de->mode)) {
- parent->nlink--;
+ if (unlikely(pde_is_permanent(de))) {
+ WARN(1, "removing permanent /proc entry '%s'", de->name);
+ de = NULL;
+ } else {
+ rb_erase(&de->subdir_node, &parent->subdir);
+ if (S_ISDIR(de->mode))
+ parent->nlink--;
}
}
write_unlock(&proc_subdir_lock);
@@ -724,12 +742,24 @@
write_unlock(&proc_subdir_lock);
return -ENOENT;
}
+ if (unlikely(pde_is_permanent(root))) {
+ write_unlock(&proc_subdir_lock);
+ WARN(1, "removing permanent /proc entry '%s/%s'",
+ root->parent->name, root->name);
+ return -EINVAL;
+ }
rb_erase(&root->subdir_node, &parent->subdir);
de = root;
while (1) {
next = pde_subdir_first(de);
if (next) {
+ if (unlikely(pde_is_permanent(next))) {
+ write_unlock(&proc_subdir_lock);
+ WARN(1, "removing permanent /proc entry '%s/%s'",
+ next->parent->name, next->name);
+ return -EINVAL;
+ }
rb_erase(&next->subdir_node, &de->subdir);
de = next;
continue;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 3f0c890..bde6b6f 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -24,6 +24,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/mount.h>
+#include <linux/bug.h>
#include <linux/uaccess.h>
@@ -33,21 +34,27 @@
{
struct proc_dir_entry *de;
struct ctl_table_header *head;
+ struct proc_inode *ei = PROC_I(inode);
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
/* Stop tracking associated processes */
- put_pid(PROC_I(inode)->pid);
+ if (ei->pid) {
+ proc_pid_evict_inode(ei);
+ ei->pid = NULL;
+ }
/* Let go of any associated proc directory entry */
- de = PDE(inode);
- if (de)
+ de = ei->pde;
+ if (de) {
pde_put(de);
+ ei->pde = NULL;
+ }
- head = PROC_I(inode)->sysctl;
+ head = ei->sysctl;
if (head) {
- RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
+ RCU_INIT_POINTER(ei->sysctl, NULL);
proc_sys_evict_inode(inode, head);
}
}
@@ -68,6 +75,7 @@
ei->pde = NULL;
ei->sysctl = NULL;
ei->sysctl_entry = NULL;
+ INIT_HLIST_NODE(&ei->sibling_inodes);
ei->ns_ops = NULL;
return &ei->vfs_inode;
}
@@ -102,15 +110,84 @@
BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE);
}
+void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
+{
+ struct inode *inode;
+ struct proc_inode *ei;
+ struct hlist_node *node;
+ struct super_block *old_sb = NULL;
+
+ rcu_read_lock();
+ for (;;) {
+ struct super_block *sb;
+ node = hlist_first_rcu(inodes);
+ if (!node)
+ break;
+ ei = hlist_entry(node, struct proc_inode, sibling_inodes);
+ spin_lock(lock);
+ hlist_del_init_rcu(&ei->sibling_inodes);
+ spin_unlock(lock);
+
+ inode = &ei->vfs_inode;
+ sb = inode->i_sb;
+ if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active))
+ continue;
+ inode = igrab(inode);
+ rcu_read_unlock();
+ if (sb != old_sb) {
+ if (old_sb)
+ deactivate_super(old_sb);
+ old_sb = sb;
+ }
+ if (unlikely(!inode)) {
+ rcu_read_lock();
+ continue;
+ }
+
+ if (S_ISDIR(inode->i_mode)) {
+ struct dentry *dir = d_find_any_alias(inode);
+ if (dir) {
+ d_invalidate(dir);
+ dput(dir);
+ }
+ } else {
+ struct dentry *dentry;
+ while ((dentry = d_find_alias(inode))) {
+ d_invalidate(dentry);
+ dput(dentry);
+ }
+ }
+ iput(inode);
+
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+ if (old_sb)
+ deactivate_super(old_sb);
+}
+
+static inline const char *hidepid2str(enum proc_hidepid v)
+{
+ switch (v) {
+ case HIDEPID_OFF: return "off";
+ case HIDEPID_NO_ACCESS: return "noaccess";
+ case HIDEPID_INVISIBLE: return "invisible";
+ case HIDEPID_NOT_PTRACEABLE: return "ptraceable";
+ }
+ WARN_ONCE(1, "bad hide_pid value: %d\n", v);
+ return "unknown";
+}
+
static int proc_show_options(struct seq_file *seq, struct dentry *root)
{
- struct super_block *sb = root->d_sb;
- struct pid_namespace *pid = sb->s_fs_info;
+ struct proc_fs_info *fs_info = proc_sb_info(root->d_sb);
- if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID))
- seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid));
- if (pid->hide_pid != HIDEPID_OFF)
- seq_printf(seq, ",hidepid=%u", pid->hide_pid);
+ if (!gid_eq(fs_info->pid_gid, GLOBAL_ROOT_GID))
+ seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, fs_info->pid_gid));
+ if (fs_info->hide_pid != HIDEPID_OFF)
+ seq_printf(seq, ",hidepid=%s", hidepid2str(fs_info->hide_pid));
+ if (fs_info->pidonly != PROC_PIDONLY_OFF)
+ seq_printf(seq, ",subset=pid");
return 0;
}
@@ -139,6 +216,7 @@
/* pde is locked on entry, unlocked on exit */
static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
+ __releases(&pde->pde_unload_lock)
{
/*
* close() (proc_reg_release()) can't delete an entry and proceed:
@@ -163,7 +241,7 @@
pdeo->closing = true;
spin_unlock(&pde->pde_unload_lock);
file = pdeo->file;
- pde->proc_fops->release(file_inode(file), file);
+ pde->proc_ops->proc_release(file_inode(file), file);
spin_lock(&pde->pde_unload_lock);
/* After ->release. */
list_del(&pdeo->lh);
@@ -195,115 +273,208 @@
spin_unlock(&de->pde_unload_lock);
}
+static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence)
+{
+ typeof_member(struct proc_ops, proc_lseek) lseek;
+
+ lseek = pde->proc_ops->proc_lseek;
+ if (!lseek)
+ lseek = default_llseek;
+ return lseek(file, offset, whence);
+}
+
static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
loff_t rv = -EINVAL;
- if (use_pde(pde)) {
- typeof_member(struct file_operations, llseek) llseek;
- llseek = pde->proc_fops->llseek;
- if (!llseek)
- llseek = default_llseek;
- rv = llseek(file, offset, whence);
+ if (pde_is_permanent(pde)) {
+ return pde_lseek(pde, file, offset, whence);
+ } else if (use_pde(pde)) {
+ rv = pde_lseek(pde, file, offset, whence);
unuse_pde(pde);
}
return rv;
}
+static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(iocb->ki_filp));
+ ssize_t ret;
+
+ if (pde_is_permanent(pde))
+ return pde->proc_ops->proc_read_iter(iocb, iter);
+
+ if (!use_pde(pde))
+ return -EIO;
+ ret = pde->proc_ops->proc_read_iter(iocb, iter);
+ unuse_pde(pde);
+ return ret;
+}
+
+static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+ typeof_member(struct proc_ops, proc_read) read;
+
+ read = pde->proc_ops->proc_read;
+ if (read)
+ return read(file, buf, count, ppos);
+ return -EIO;
+}
+
static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv = -EIO;
- if (use_pde(pde)) {
- typeof_member(struct file_operations, read) read;
- read = pde->proc_fops->read;
- if (read)
- rv = read(file, buf, count, ppos);
+ if (pde_is_permanent(pde)) {
+ return pde_read(pde, file, buf, count, ppos);
+ } else if (use_pde(pde)) {
+ rv = pde_read(pde, file, buf, count, ppos);
unuse_pde(pde);
}
return rv;
}
+static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+ typeof_member(struct proc_ops, proc_write) write;
+
+ write = pde->proc_ops->proc_write;
+ if (write)
+ return write(file, buf, count, ppos);
+ return -EIO;
+}
+
static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
ssize_t rv = -EIO;
- if (use_pde(pde)) {
- typeof_member(struct file_operations, write) write;
- write = pde->proc_fops->write;
- if (write)
- rv = write(file, buf, count, ppos);
+ if (pde_is_permanent(pde)) {
+ return pde_write(pde, file, buf, count, ppos);
+ } else if (use_pde(pde)) {
+ rv = pde_write(pde, file, buf, count, ppos);
unuse_pde(pde);
}
return rv;
}
+static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts)
+{
+ typeof_member(struct proc_ops, proc_poll) poll;
+
+ poll = pde->proc_ops->proc_poll;
+ if (poll)
+ return poll(file, pts);
+ return DEFAULT_POLLMASK;
+}
+
static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
__poll_t rv = DEFAULT_POLLMASK;
- if (use_pde(pde)) {
- typeof_member(struct file_operations, poll) poll;
- poll = pde->proc_fops->poll;
- if (poll)
- rv = poll(file, pts);
+ if (pde_is_permanent(pde)) {
+ return pde_poll(pde, file, pts);
+ } else if (use_pde(pde)) {
+ rv = pde_poll(pde, file, pts);
unuse_pde(pde);
}
return rv;
}
+static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
+{
+ typeof_member(struct proc_ops, proc_ioctl) ioctl;
+
+ ioctl = pde->proc_ops->proc_ioctl;
+ if (ioctl)
+ return ioctl(file, cmd, arg);
+ return -ENOTTY;
+}
+
static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
long rv = -ENOTTY;
- if (use_pde(pde)) {
- typeof_member(struct file_operations, unlocked_ioctl) ioctl;
- ioctl = pde->proc_fops->unlocked_ioctl;
- if (ioctl)
- rv = ioctl(file, cmd, arg);
+ if (pde_is_permanent(pde)) {
+ return pde_ioctl(pde, file, cmd, arg);
+ } else if (use_pde(pde)) {
+ rv = pde_ioctl(pde, file, cmd, arg);
unuse_pde(pde);
}
return rv;
}
#ifdef CONFIG_COMPAT
+static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
+{
+ typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;
+
+ compat_ioctl = pde->proc_ops->proc_compat_ioctl;
+ if (compat_ioctl)
+ return compat_ioctl(file, cmd, arg);
+ return -ENOTTY;
+}
+
static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
long rv = -ENOTTY;
- if (use_pde(pde)) {
- typeof_member(struct file_operations, compat_ioctl) compat_ioctl;
-
- compat_ioctl = pde->proc_fops->compat_ioctl;
- if (compat_ioctl)
- rv = compat_ioctl(file, cmd, arg);
+ if (pde_is_permanent(pde)) {
+ return pde_compat_ioctl(pde, file, cmd, arg);
+ } else if (use_pde(pde)) {
+ rv = pde_compat_ioctl(pde, file, cmd, arg);
unuse_pde(pde);
}
return rv;
}
#endif
+static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma)
+{
+ typeof_member(struct proc_ops, proc_mmap) mmap;
+
+ mmap = pde->proc_ops->proc_mmap;
+ if (mmap)
+ return mmap(file, vma);
+ return -EIO;
+}
+
static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
int rv = -EIO;
- if (use_pde(pde)) {
- typeof_member(struct file_operations, mmap) mmap;
- mmap = pde->proc_fops->mmap;
- if (mmap)
- rv = mmap(file, vma);
+ if (pde_is_permanent(pde)) {
+ return pde_mmap(pde, file, vma);
+ } else if (use_pde(pde)) {
+ rv = pde_mmap(pde, file, vma);
unuse_pde(pde);
}
return rv;
}
static unsigned long
+pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;
+
+ get_area = pde->proc_ops->proc_get_unmapped_area;
+#ifdef CONFIG_MMU
+ if (!get_area)
+ get_area = current->mm->get_unmapped_area;
+#endif
+ if (get_area)
+ return get_area(file, orig_addr, len, pgoff, flags);
+ return orig_addr;
+}
+
+static unsigned long
proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
unsigned long len, unsigned long pgoff,
unsigned long flags)
@@ -311,19 +482,10 @@
struct proc_dir_entry *pde = PDE(file_inode(file));
unsigned long rv = -EIO;
- if (use_pde(pde)) {
- typeof_member(struct file_operations, get_unmapped_area) get_area;
-
- get_area = pde->proc_fops->get_unmapped_area;
-#ifdef CONFIG_MMU
- if (!get_area)
- get_area = current->mm->get_unmapped_area;
-#endif
-
- if (get_area)
- rv = get_area(file, orig_addr, len, pgoff, flags);
- else
- rv = orig_addr;
+ if (pde_is_permanent(pde)) {
+ return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
+ } else if (use_pde(pde)) {
+ rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
unuse_pde(pde);
}
return rv;
@@ -331,12 +493,23 @@
static int proc_reg_open(struct inode *inode, struct file *file)
{
+ struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
struct proc_dir_entry *pde = PDE(inode);
int rv = 0;
- typeof_member(struct file_operations, open) open;
- typeof_member(struct file_operations, release) release;
+ typeof_member(struct proc_ops, proc_open) open;
+ typeof_member(struct proc_ops, proc_release) release;
struct pde_opener *pdeo;
+ if (pde_is_permanent(pde)) {
+ open = pde->proc_ops->proc_open;
+ if (open)
+ rv = open(inode, file);
+ return rv;
+ }
+
+ if (fs_info->pidonly == PROC_PIDONLY_ON)
+ return -ENOENT;
+
/*
* Ensure that
* 1) PDE's ->release hook will be called no matter what
@@ -351,7 +524,7 @@
if (!use_pde(pde))
return -ENOENT;
- release = pde->proc_fops->release;
+ release = pde->proc_ops->proc_release;
if (release) {
pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL);
if (!pdeo) {
@@ -360,7 +533,7 @@
}
}
- open = pde->proc_fops->open;
+ open = pde->proc_ops->proc_open;
if (open)
rv = open(inode, file);
@@ -386,6 +559,17 @@
{
struct proc_dir_entry *pde = PDE(inode);
struct pde_opener *pdeo;
+
+ if (pde_is_permanent(pde)) {
+ typeof_member(struct proc_ops, proc_release) release;
+
+ release = pde->proc_ops->proc_release;
+ if (release) {
+ return release(inode, file);
+ }
+ return 0;
+ }
+
spin_lock(&pde->pde_unload_lock);
list_for_each_entry(pdeo, &pde->pde_openers, lh) {
if (pdeo->file == file) {
@@ -403,9 +587,19 @@
.write = proc_reg_write,
.poll = proc_reg_poll,
.unlocked_ioctl = proc_reg_unlocked_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = proc_reg_compat_ioctl,
-#endif
+ .mmap = proc_reg_mmap,
+ .get_unmapped_area = proc_reg_get_unmapped_area,
+ .open = proc_reg_open,
+ .release = proc_reg_release,
+};
+
+static const struct file_operations proc_iter_file_ops = {
+ .llseek = proc_reg_llseek,
+ .read_iter = proc_reg_read_iter,
+ .write = proc_reg_write,
+ .splice_read = generic_file_splice_read,
+ .poll = proc_reg_poll,
+ .unlocked_ioctl = proc_reg_unlocked_ioctl,
.mmap = proc_reg_mmap,
.get_unmapped_area = proc_reg_get_unmapped_area,
.open = proc_reg_open,
@@ -413,12 +607,27 @@
};
#ifdef CONFIG_COMPAT
-static const struct file_operations proc_reg_file_ops_no_compat = {
+static const struct file_operations proc_reg_file_ops_compat = {
.llseek = proc_reg_llseek,
.read = proc_reg_read,
.write = proc_reg_write,
.poll = proc_reg_poll,
.unlocked_ioctl = proc_reg_unlocked_ioctl,
+ .compat_ioctl = proc_reg_compat_ioctl,
+ .mmap = proc_reg_mmap,
+ .get_unmapped_area = proc_reg_get_unmapped_area,
+ .open = proc_reg_open,
+ .release = proc_reg_release,
+};
+
+static const struct file_operations proc_iter_file_ops_compat = {
+ .llseek = proc_reg_llseek,
+ .read_iter = proc_reg_read_iter,
+ .splice_read = generic_file_splice_read,
+ .write = proc_reg_write,
+ .poll = proc_reg_poll,
+ .unlocked_ioctl = proc_reg_unlocked_ioctl,
+ .compat_ioctl = proc_reg_compat_ioctl,
.mmap = proc_reg_mmap,
.get_unmapped_area = proc_reg_get_unmapped_area,
.open = proc_reg_open,
@@ -450,40 +659,51 @@
{
struct inode *inode = new_inode(sb);
- if (inode) {
- inode->i_ino = de->low_ino;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
- PROC_I(inode)->pde = de;
+ if (!inode) {
+ pde_put(de);
+ return NULL;
+ }
- if (is_empty_pde(de)) {
- make_empty_dir_inode(inode);
- return inode;
- }
- if (de->mode) {
- inode->i_mode = de->mode;
- inode->i_uid = de->uid;
- inode->i_gid = de->gid;
- }
- if (de->size)
- inode->i_size = de->size;
- if (de->nlink)
- set_nlink(inode, de->nlink);
- WARN_ON(!de->proc_iops);
+ inode->i_ino = de->low_ino;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ PROC_I(inode)->pde = de;
+ if (is_empty_pde(de)) {
+ make_empty_dir_inode(inode);
+ return inode;
+ }
+
+ if (de->mode) {
+ inode->i_mode = de->mode;
+ inode->i_uid = de->uid;
+ inode->i_gid = de->gid;
+ }
+ if (de->size)
+ inode->i_size = de->size;
+ if (de->nlink)
+ set_nlink(inode, de->nlink);
+
+ if (S_ISREG(inode->i_mode)) {
inode->i_op = de->proc_iops;
- if (de->proc_fops) {
- if (S_ISREG(inode->i_mode)) {
+ if (de->proc_ops->proc_read_iter)
+ inode->i_fop = &proc_iter_file_ops;
+ else
+ inode->i_fop = &proc_reg_file_ops;
#ifdef CONFIG_COMPAT
- if (!de->proc_fops->compat_ioctl)
- inode->i_fop =
- &proc_reg_file_ops_no_compat;
- else
-#endif
- inode->i_fop = &proc_reg_file_ops;
- } else {
- inode->i_fop = de->proc_fops;
- }
+ if (de->proc_ops->proc_compat_ioctl) {
+ if (de->proc_ops->proc_read_iter)
+ inode->i_fop = &proc_iter_file_ops_compat;
+ else
+ inode->i_fop = &proc_reg_file_ops_compat;
}
- } else
- pde_put(de);
+#endif
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = de->proc_iops;
+ inode->i_fop = de->proc_dir_ops;
+ } else if (S_ISLNK(inode->i_mode)) {
+ inode->i_op = de->proc_iops;
+ inode->i_fop = NULL;
+ } else {
+ BUG();
+ }
return inode;
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 269acc1..afbe96b 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -39,7 +39,10 @@
spinlock_t pde_unload_lock;
struct completion *pde_unload_completion;
const struct inode_operations *proc_iops;
- const struct file_operations *proc_fops;
+ union {
+ const struct proc_ops *proc_ops;
+ const struct file_operations *proc_dir_ops;
+ };
const struct dentry_operations *proc_dops;
union {
const struct seq_operations *seq_ops;
@@ -58,6 +61,7 @@
struct rb_node subdir_node;
char *name;
umode_t mode;
+ u8 flags;
u8 namelen;
char inline_name[];
} __randomize_layout;
@@ -70,6 +74,11 @@
0)
#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry))
+static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
+{
+ return pde->flags & PROC_ENTRY_PERMANENT;
+}
+
extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);
@@ -88,7 +97,7 @@
struct proc_dir_entry *pde;
struct ctl_table_header *sysctl;
struct ctl_table *sysctl_entry;
- struct hlist_node sysctl_inodes;
+ struct hlist_node sibling_inodes;
const struct proc_ns_operations *ns_ops;
struct inode vfs_inode;
} __randomize_layout;
@@ -155,6 +164,7 @@
extern const struct dentry_operations pid_dentry_operations;
extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
extern int proc_setattr(struct dentry *, struct iattr *);
+extern void proc_pid_evict_inode(struct proc_inode *);
extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
@@ -197,8 +207,8 @@
* inode.c
*/
struct pde_opener {
- struct file *file;
struct list_head lh;
+ struct file *file;
bool closing;
struct completion *c;
} __randomize_layout;
@@ -207,6 +217,7 @@
extern const struct super_operations proc_sops;
void proc_init_kmemcache(void);
+void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock);
void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
extern void proc_entry_rundown(struct proc_dir_entry *);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e2ed8e0..e502414 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -512,7 +512,8 @@
* Using bounce buffer to bypass the
* hardened user copy kernel text checks.
*/
- if (probe_kernel_read(buf, (void *) start, tsz)) {
+ if (copy_from_kernel_nofault(buf, (void *)start,
+ tsz)) {
if (clear_user(buffer, tsz)) {
ret = -EFAULT;
goto out;
@@ -574,11 +575,11 @@
return 0;
}
-static const struct file_operations proc_kcore_operations = {
- .read = read_kcore,
- .open = open_kcore,
- .release = release_kcore,
- .llseek = default_llseek,
+static const struct proc_ops kcore_proc_ops = {
+ .proc_read = read_kcore,
+ .proc_open = open_kcore,
+ .proc_release = release_kcore,
+ .proc_lseek = default_llseek,
};
/* just remember that we have to update kcore */
@@ -637,8 +638,7 @@
static int __init proc_kcore_init(void)
{
- proc_root_kcore = proc_create("kcore", S_IRUSR, NULL,
- &proc_kcore_operations);
+ proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &kcore_proc_ops);
if (!proc_root_kcore) {
pr_err("couldn't create /proc/kcore\n");
return 0; /* Always returns 0. */
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 4f4a2ab..b38ad55 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -49,17 +49,18 @@
}
-static const struct file_operations proc_kmsg_operations = {
- .read = kmsg_read,
- .poll = kmsg_poll,
- .open = kmsg_open,
- .release = kmsg_release,
- .llseek = generic_file_llseek,
+static const struct proc_ops kmsg_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_read = kmsg_read,
+ .proc_poll = kmsg_poll,
+ .proc_open = kmsg_open,
+ .proc_release = kmsg_release,
+ .proc_lseek = generic_file_llseek,
};
static int __init proc_kmsg_init(void)
{
- proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
+ proc_create("kmsg", S_IRUSR, NULL, &kmsg_proc_ops);
return 0;
}
fs_initcall(proc_kmsg_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 8c1f1bb..887a553 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -17,7 +17,6 @@
#include <linux/cma.h>
#endif
#include <asm/page.h>
-#include <asm/pgtable.h>
#include "internal.h"
void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
@@ -42,7 +41,7 @@
si_meminfo(&i);
si_swapinfo(&i);
- committed = percpu_counter_read_positive(&vm_committed_as);
+ committed = vm_memory_committed();
cached = global_node_page_state(NR_FILE_PAGES) -
total_swapcache_pages() - i.bufferram;
@@ -53,8 +52,8 @@
pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
available = si_mem_available();
- sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE);
- sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE);
+ sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B);
+ sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);
show_val_kb(m, "MemTotal: ", i.totalram);
show_val_kb(m, "MemFree: ", i.freeram);
@@ -102,12 +101,15 @@
show_val_kb(m, "SReclaimable: ", sreclaimable);
show_val_kb(m, "SUnreclaim: ", sunreclaim);
seq_printf(m, "KernelStack: %8lu kB\n",
- global_zone_page_state(NR_KERNEL_STACK_KB));
+ global_node_page_state(NR_KERNEL_STACK_KB));
+#ifdef CONFIG_SHADOW_CALL_STACK
+ seq_printf(m, "ShadowCallStack:%8lu kB\n",
+ global_node_page_state(NR_KERNEL_SCS_KB));
+#endif
show_val_kb(m, "PageTables: ",
global_zone_page_state(NR_PAGETABLE));
- show_val_kb(m, "NFS_Unstable: ",
- global_node_page_state(NR_UNSTABLE_NFS));
+ show_val_kb(m, "NFS_Unstable: ", 0);
show_val_kb(m, "Bounce: ",
global_zone_page_state(NR_BOUNCE));
show_val_kb(m, "WritebackTmp: ",
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index dd2b35f..8e159fc 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -33,6 +33,10 @@
#ifdef CONFIG_CGROUPS
&cgroupns_operations,
#endif
+#ifdef CONFIG_TIME_NS
+ &timens_operations,
+ &timens_for_children_operations,
+#endif
};
static const char *proc_ns_get_link(struct dentry *dentry,
@@ -42,22 +46,26 @@
const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
struct task_struct *task;
struct path ns_path;
- void *error = ERR_PTR(-EACCES);
+ int error = -EACCES;
if (!dentry)
return ERR_PTR(-ECHILD);
task = get_proc_task(inode);
if (!task)
- return error;
+ return ERR_PTR(-EACCES);
- if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
- error = ns_get_path(&ns_path, task, ns_ops);
- if (!error)
- nd_jump_link(&ns_path);
- }
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+ goto out;
+
+ error = ns_get_path(&ns_path, task, ns_ops);
+ if (error)
+ goto out;
+
+ error = nd_jump_link(&ns_path);
+out:
put_task_struct(task);
- return error;
+ return ERR_PTR(error);
}
static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 14c2bad..13452b3 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -22,7 +22,6 @@
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
-#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/div64.h>
#include "internal.h"
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 7c952ee..9f1077d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -21,6 +21,21 @@
#define KPMMASK (KPMSIZE - 1)
#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
+static inline unsigned long get_max_dump_pfn(void)
+{
+#ifdef CONFIG_SPARSEMEM
+ /*
+ * The memmap of early sections is completely populated and marked
+ * online even if max_pfn does not fall on a section boundary -
+ * pfn_to_online_page() will succeed on all pages. Allow inspecting
+ * these memmaps.
+ */
+ return round_up(max_pfn, PAGES_PER_SECTION);
+#else
+ return max_pfn;
+#endif
+}
+
/* /proc/kpagecount - an array exposing page counts
*
* Each entry is a u64 representing the corresponding
@@ -29,6 +44,7 @@
static ssize_t kpagecount_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
+ const unsigned long max_dump_pfn = get_max_dump_pfn();
u64 __user *out = (u64 __user *)buf;
struct page *ppage;
unsigned long src = *ppos;
@@ -37,9 +53,11 @@
u64 pcount;
pfn = src / KPMSIZE;
- count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
if (src & KPMMASK || count & KPMMASK)
return -EINVAL;
+ if (src >= max_dump_pfn * KPMSIZE)
+ return 0;
+ count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
while (count > 0) {
/*
@@ -71,9 +89,9 @@
return ret;
}
-static const struct file_operations proc_kpagecount_operations = {
- .llseek = mem_lseek,
- .read = kpagecount_read,
+static const struct proc_ops kpagecount_proc_ops = {
+ .proc_lseek = mem_lseek,
+ .proc_read = kpagecount_read,
};
/* /proc/kpageflags - an array exposing page flags
@@ -199,6 +217,9 @@
u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2);
u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1);
+#ifdef CONFIG_64BIT
+ u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2);
+#endif
return u;
};
@@ -206,6 +227,7 @@
static ssize_t kpageflags_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
+ const unsigned long max_dump_pfn = get_max_dump_pfn();
u64 __user *out = (u64 __user *)buf;
struct page *ppage;
unsigned long src = *ppos;
@@ -213,9 +235,11 @@
ssize_t ret = 0;
pfn = src / KPMSIZE;
- count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
if (src & KPMMASK || count & KPMMASK)
return -EINVAL;
+ if (src >= max_dump_pfn * KPMSIZE)
+ return 0;
+ count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
while (count > 0) {
/*
@@ -242,15 +266,16 @@
return ret;
}
-static const struct file_operations proc_kpageflags_operations = {
- .llseek = mem_lseek,
- .read = kpageflags_read,
+static const struct proc_ops kpageflags_proc_ops = {
+ .proc_lseek = mem_lseek,
+ .proc_read = kpageflags_read,
};
#ifdef CONFIG_MEMCG
static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
+ const unsigned long max_dump_pfn = get_max_dump_pfn();
u64 __user *out = (u64 __user *)buf;
struct page *ppage;
unsigned long src = *ppos;
@@ -259,9 +284,11 @@
u64 ino;
pfn = src / KPMSIZE;
- count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
if (src & KPMMASK || count & KPMMASK)
return -EINVAL;
+ if (src >= max_dump_pfn * KPMSIZE)
+ return 0;
+ count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
while (count > 0) {
/*
@@ -293,18 +320,18 @@
return ret;
}
-static const struct file_operations proc_kpagecgroup_operations = {
- .llseek = mem_lseek,
- .read = kpagecgroup_read,
+static const struct proc_ops kpagecgroup_proc_ops = {
+ .proc_lseek = mem_lseek,
+ .proc_read = kpagecgroup_read,
};
#endif /* CONFIG_MEMCG */
static int __init proc_page_init(void)
{
- proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
- proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
+ proc_create("kpagecount", S_IRUSR, NULL, &kpagecount_proc_ops);
+ proc_create("kpageflags", S_IRUSR, NULL, &kpageflags_proc_ops);
#ifdef CONFIG_MEMCG
- proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations);
+ proc_create("kpagecgroup", S_IRUSR, NULL, &kpagecgroup_proc_ops);
#endif
return 0;
}
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 313b7c7..1aa9236 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -74,14 +74,33 @@
return 0;
}
-static const struct file_operations proc_net_seq_fops = {
- .open = seq_open_net,
- .read = seq_read,
- .write = proc_simple_write,
- .llseek = seq_lseek,
- .release = seq_release_net,
+static const struct proc_ops proc_net_seq_ops = {
+ .proc_open = seq_open_net,
+ .proc_read = seq_read,
+ .proc_write = proc_simple_write,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release_net,
};
+int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+#ifdef CONFIG_NET_NS
+ struct seq_net_private *p = priv_data;
+
+ p->net = get_net(current->nsproxy->net_ns);
+#endif
+ return 0;
+}
+
+void bpf_iter_fini_seq_net(void *priv_data)
+{
+#ifdef CONFIG_NET_NS
+ struct seq_net_private *p = priv_data;
+
+ put_net(p->net);
+#endif
+}
+
struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
struct proc_dir_entry *parent, const struct seq_operations *ops,
unsigned int state_size, void *data)
@@ -92,7 +111,7 @@
if (!p)
return NULL;
pde_force_lookup(p);
- p->proc_fops = &proc_net_seq_fops;
+ p->proc_ops = &proc_net_seq_ops;
p->seq_ops = ops;
p->state_size = state_size;
return proc_register(parent, p);
@@ -136,7 +155,7 @@
if (!p)
return NULL;
pde_force_lookup(p);
- p->proc_fops = &proc_net_seq_fops;
+ p->proc_ops = &proc_net_seq_ops;
p->seq_ops = ops;
p->state_size = state_size;
p->write = write;
@@ -167,12 +186,12 @@
return single_release(ino, f);
}
-static const struct file_operations proc_net_single_fops = {
- .open = single_open_net,
- .read = seq_read,
- .write = proc_simple_write,
- .llseek = seq_lseek,
- .release = single_release_net,
+static const struct proc_ops proc_net_single_ops = {
+ .proc_open = single_open_net,
+ .proc_read = seq_read,
+ .proc_write = proc_simple_write,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release_net,
};
struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode,
@@ -185,7 +204,7 @@
if (!p)
return NULL;
pde_force_lookup(p);
- p->proc_fops = &proc_net_single_fops;
+ p->proc_ops = &proc_net_single_ops;
p->single_show = show;
return proc_register(parent, p);
}
@@ -228,7 +247,7 @@
if (!p)
return NULL;
pde_force_lookup(p);
- p->proc_fops = &proc_net_single_fops;
+ p->proc_ops = &proc_net_single_ops;
p->single_show = show;
p->write = write;
return proc_register(parent, p);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d80989b..070d2df 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -12,8 +12,10 @@
#include <linux/cred.h>
#include <linux/namei.h>
#include <linux/mm.h>
+#include <linux/uio.h>
#include <linux/module.h>
#include <linux/bpf-cgroup.h>
+#include <linux/mount.h>
#include "internal.h"
static const struct dentry_operations proc_sys_dentry_operations;
@@ -267,42 +269,9 @@
complete(p->unregistering);
}
-static void proc_sys_prune_dcache(struct ctl_table_header *head)
+static void proc_sys_invalidate_dcache(struct ctl_table_header *head)
{
- struct inode *inode;
- struct proc_inode *ei;
- struct hlist_node *node;
- struct super_block *sb;
-
- rcu_read_lock();
- for (;;) {
- node = hlist_first_rcu(&head->inodes);
- if (!node)
- break;
- ei = hlist_entry(node, struct proc_inode, sysctl_inodes);
- spin_lock(&sysctl_lock);
- hlist_del_init_rcu(&ei->sysctl_inodes);
- spin_unlock(&sysctl_lock);
-
- inode = &ei->vfs_inode;
- sb = inode->i_sb;
- if (!atomic_inc_not_zero(&sb->s_active))
- continue;
- inode = igrab(inode);
- rcu_read_unlock();
- if (unlikely(!inode)) {
- deactivate_super(sb);
- rcu_read_lock();
- continue;
- }
-
- d_prune_aliases(inode);
- iput(inode);
- deactivate_super(sb);
-
- rcu_read_lock();
- }
- rcu_read_unlock();
+ proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock);
}
/* called under sysctl_lock, will reacquire if has to wait */
@@ -324,10 +293,10 @@
spin_unlock(&sysctl_lock);
}
/*
- * Prune dentries for unregistered sysctls: namespaced sysctls
+ * Invalidate dentries for unregistered sysctls: namespaced sysctls
* can have duplicate names and contaminate dcache very badly.
*/
- proc_sys_prune_dcache(p);
+ proc_sys_invalidate_dcache(p);
/*
* do not remove from the list until nobody holds it; walking the
* list in do_sysctl() relies on that.
@@ -483,7 +452,7 @@
}
ei->sysctl = head;
ei->sysctl_entry = table;
- hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes);
+ hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes);
head->count++;
spin_unlock(&sysctl_lock);
@@ -514,7 +483,7 @@
void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
{
spin_lock(&sysctl_lock);
- hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes);
+ hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes);
if (!--head->count)
kfree_rcu(head, rcu);
spin_unlock(&sysctl_lock);
@@ -572,13 +541,14 @@
return err;
}
-static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
- size_t count, loff_t *ppos, int write)
+static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
+ int write)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(iocb->ki_filp);
struct ctl_table_header *head = grab_header(inode);
struct ctl_table *table = PROC_I(inode)->sysctl_entry;
- void *new_buf = NULL;
+ size_t count = iov_iter_count(iter);
+ char *kbuf;
ssize_t error;
if (IS_ERR(head))
@@ -597,43 +567,54 @@
if (!table->proc_handler)
goto out;
- error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, &count,
- ppos, &new_buf);
- if (error)
+ /* don't even try if the size is too large */
+ error = -ENOMEM;
+ if (count >= KMALLOC_MAX_SIZE)
+ goto out;
+ kbuf = kvzalloc(count + 1, GFP_KERNEL);
+ if (!kbuf)
goto out;
- /* careful: calling conventions are nasty here */
- if (new_buf) {
- mm_segment_t old_fs;
-
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- error = table->proc_handler(table, write, (void __user *)new_buf,
- &count, ppos);
- set_fs(old_fs);
- kfree(new_buf);
- } else {
- error = table->proc_handler(table, write, buf, &count, ppos);
+ if (write) {
+ error = -EFAULT;
+ if (!copy_from_iter_full(kbuf, count, iter))
+ goto out_free_buf;
+ kbuf[count] = '\0';
}
- if (!error)
- error = count;
+ error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count,
+ &iocb->ki_pos);
+ if (error)
+ goto out_free_buf;
+
+ /* careful: calling conventions are nasty here */
+ error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos);
+ if (error)
+ goto out_free_buf;
+
+ if (!write) {
+ error = -EFAULT;
+ if (copy_to_iter(kbuf, count, iter) < count)
+ goto out_free_buf;
+ }
+
+ error = count;
+out_free_buf:
+ kvfree(kbuf);
out:
sysctl_head_finish(head);
return error;
}
-static ssize_t proc_sys_read(struct file *filp, char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter)
{
- return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0);
+ return proc_sys_call_handler(iocb, iter, 0);
}
-static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter)
{
- return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
+ return proc_sys_call_handler(iocb, iter, 1);
}
static int proc_sys_open(struct inode *inode, struct file *filp)
@@ -870,8 +851,10 @@
static const struct file_operations proc_sys_file_operations = {
.open = proc_sys_open,
.poll = proc_sys_poll,
- .read = proc_sys_read,
- .write = proc_sys_write,
+ .read_iter = proc_sys_read,
+ .write_iter = proc_sys_write,
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.llseek = default_llseek,
};
@@ -1720,8 +1703,157 @@
proc_sys_root = proc_mkdir("sys", NULL);
proc_sys_root->proc_iops = &proc_sys_dir_operations;
- proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
+ proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations;
proc_sys_root->nlink = 0;
return sysctl_init();
}
+
+struct sysctl_alias {
+ const char *kernel_param;
+ const char *sysctl_param;
+};
+
+/*
+ * Historically some settings had both sysctl and a command line parameter.
+ * With the generic sysctl. parameter support, we can handle them at a single
+ * place and only keep the historical name for compatibility. This is not meant
+ * to add brand new aliases. When adding existing aliases, consider whether
+ * the possibly different moment of changing the value (e.g. from early_param
+ * to the moment do_sysctl_args() is called) is an issue for the specific
+ * parameter.
+ */
+static const struct sysctl_alias sysctl_aliases[] = {
+ {"hardlockup_all_cpu_backtrace", "kernel.hardlockup_all_cpu_backtrace" },
+ {"hung_task_panic", "kernel.hung_task_panic" },
+ {"numa_zonelist_order", "vm.numa_zonelist_order" },
+ {"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" },
+ {"softlockup_panic", "kernel.softlockup_panic" },
+ { }
+};
+
+static const char *sysctl_find_alias(char *param)
+{
+ const struct sysctl_alias *alias;
+
+ for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) {
+ if (strcmp(alias->kernel_param, param) == 0)
+ return alias->sysctl_param;
+ }
+
+ return NULL;
+}
+
+/* Set sysctl value passed on kernel command line. */
+static int process_sysctl_arg(char *param, char *val,
+ const char *unused, void *arg)
+{
+ char *path;
+ struct vfsmount **proc_mnt = arg;
+ struct file_system_type *proc_fs_type;
+ struct file *file;
+ int len;
+ int err;
+ loff_t pos = 0;
+ ssize_t wret;
+
+ if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) {
+ param += sizeof("sysctl") - 1;
+
+ if (param[0] != '/' && param[0] != '.')
+ return 0;
+
+ param++;
+ } else {
+ param = (char *) sysctl_find_alias(param);
+ if (!param)
+ return 0;
+ }
+
+ if (!val)
+ return -EINVAL;
+ len = strlen(val);
+ if (len == 0)
+ return -EINVAL;
+
+ /*
+ * To set sysctl options, we use a temporary mount of proc, look up the
+ * respective sys/ file and write to it. To avoid mounting it when no
+ * options were given, we mount it only when the first sysctl option is
+ * found. Why not a persistent mount? There are problems with a
+ * persistent mount of proc in that it forces userspace not to use any
+ * proc mount options.
+ */
+ if (!*proc_mnt) {
+ proc_fs_type = get_fs_type("proc");
+ if (!proc_fs_type) {
+ pr_err("Failed to find procfs to set sysctl from command line\n");
+ return 0;
+ }
+ *proc_mnt = kern_mount(proc_fs_type);
+ put_filesystem(proc_fs_type);
+ if (IS_ERR(*proc_mnt)) {
+ pr_err("Failed to mount procfs to set sysctl from command line\n");
+ return 0;
+ }
+ }
+
+ path = kasprintf(GFP_KERNEL, "sys/%s", param);
+ if (!path)
+ panic("%s: Failed to allocate path for %s\n", __func__, param);
+ strreplace(path, '.', '/');
+
+ file = file_open_root((*proc_mnt)->mnt_root, *proc_mnt, path, O_WRONLY, 0);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ if (err == -ENOENT)
+ pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n",
+ param, val);
+ else if (err == -EACCES)
+ pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n",
+ param, val);
+ else
+ pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n",
+ file, param, val);
+ goto out;
+ }
+ wret = kernel_write(file, val, len, &pos);
+ if (wret < 0) {
+ err = wret;
+ if (err == -EINVAL)
+ pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n",
+ param, val);
+ else
+ pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n",
+ ERR_PTR(err), param, val);
+ } else if (wret != len) {
+ pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s\n",
+ wret, len, path, param, val);
+ }
+
+ err = filp_close(file, NULL);
+ if (err)
+ pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s\n",
+ ERR_PTR(err), param, val);
+out:
+ kfree(path);
+ return 0;
+}
+
+void do_sysctl_args(void)
+{
+ char *command_line;
+ struct vfsmount *proc_mnt = NULL;
+
+ command_line = kstrdup(saved_command_line, GFP_KERNEL);
+ if (!command_line)
+ panic("%s: Failed to allocate copy of command line\n", __func__);
+
+ parse_args("Setting sysctl args", command_line,
+ NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg);
+
+ if (proc_mnt)
+ kern_unmount(proc_mnt);
+
+ kfree(command_line);
+}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 0b7c8df..5e444d4 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -32,25 +32,85 @@
struct proc_fs_context {
struct pid_namespace *pid_ns;
unsigned int mask;
- int hidepid;
+ enum proc_hidepid hidepid;
int gid;
+ enum proc_pidonly pidonly;
};
enum proc_param {
Opt_gid,
Opt_hidepid,
+ Opt_subset,
};
-static const struct fs_parameter_spec proc_param_specs[] = {
+static const struct fs_parameter_spec proc_fs_parameters[] = {
fsparam_u32("gid", Opt_gid),
- fsparam_u32("hidepid", Opt_hidepid),
+ fsparam_string("hidepid", Opt_hidepid),
+ fsparam_string("subset", Opt_subset),
{}
};
-static const struct fs_parameter_description proc_fs_parameters = {
- .name = "proc",
- .specs = proc_param_specs,
-};
+static inline int valid_hidepid(unsigned int value)
+{
+ return (value == HIDEPID_OFF ||
+ value == HIDEPID_NO_ACCESS ||
+ value == HIDEPID_INVISIBLE ||
+ value == HIDEPID_NOT_PTRACEABLE);
+}
+
+static int proc_parse_hidepid_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+ struct fs_parameter_spec hidepid_u32_spec = fsparam_u32("hidepid", Opt_hidepid);
+ struct fs_parse_result result;
+ int base = (unsigned long)hidepid_u32_spec.data;
+
+ if (param->type != fs_value_is_string)
+ return invalf(fc, "proc: unexpected type of hidepid value\n");
+
+ if (!kstrtouint(param->string, base, &result.uint_32)) {
+ if (!valid_hidepid(result.uint_32))
+ return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string);
+ ctx->hidepid = result.uint_32;
+ return 0;
+ }
+
+ if (!strcmp(param->string, "off"))
+ ctx->hidepid = HIDEPID_OFF;
+ else if (!strcmp(param->string, "noaccess"))
+ ctx->hidepid = HIDEPID_NO_ACCESS;
+ else if (!strcmp(param->string, "invisible"))
+ ctx->hidepid = HIDEPID_INVISIBLE;
+ else if (!strcmp(param->string, "ptraceable"))
+ ctx->hidepid = HIDEPID_NOT_PTRACEABLE;
+ else
+ return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string);
+
+ return 0;
+}
+
+static int proc_parse_subset_param(struct fs_context *fc, char *value)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+
+ while (value) {
+ char *ptr = strchr(value, ',');
+
+ if (ptr != NULL)
+ *ptr++ = '\0';
+
+ if (*value != '\0') {
+ if (!strcmp(value, "pid")) {
+ ctx->pidonly = PROC_PIDONLY_ON;
+ } else {
+ return invalf(fc, "proc: unsupported subset option - %s\n", value);
+ }
+ }
+ value = ptr;
+ }
+
+ return 0;
+}
static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
@@ -58,7 +118,7 @@
struct fs_parse_result result;
int opt;
- opt = fs_parse(fc, &proc_fs_parameters, param, &result);
+ opt = fs_parse(fc, proc_fs_parameters, param, &result);
if (opt < 0)
return opt;
@@ -68,10 +128,13 @@
break;
case Opt_hidepid:
- ctx->hidepid = result.uint_32;
- if (ctx->hidepid < HIDEPID_OFF ||
- ctx->hidepid > HIDEPID_INVISIBLE)
- return invalf(fc, "proc: hidepid value must be between 0 and 2.\n");
+ if (proc_parse_hidepid_param(fc, param))
+ return -EINVAL;
+ break;
+
+ case Opt_subset:
+ if (proc_parse_subset_param(fc, param->string) < 0)
+ return -EINVAL;
break;
default:
@@ -82,26 +145,33 @@
return 0;
}
-static void proc_apply_options(struct super_block *s,
+static void proc_apply_options(struct proc_fs_info *fs_info,
struct fs_context *fc,
- struct pid_namespace *pid_ns,
struct user_namespace *user_ns)
{
struct proc_fs_context *ctx = fc->fs_private;
if (ctx->mask & (1 << Opt_gid))
- pid_ns->pid_gid = make_kgid(user_ns, ctx->gid);
+ fs_info->pid_gid = make_kgid(user_ns, ctx->gid);
if (ctx->mask & (1 << Opt_hidepid))
- pid_ns->hide_pid = ctx->hidepid;
+ fs_info->hide_pid = ctx->hidepid;
+ if (ctx->mask & (1 << Opt_subset))
+ fs_info->pidonly = ctx->pidonly;
}
static int proc_fill_super(struct super_block *s, struct fs_context *fc)
{
- struct pid_namespace *pid_ns = get_pid_ns(s->s_fs_info);
+ struct proc_fs_context *ctx = fc->fs_private;
struct inode *root_inode;
+ struct proc_fs_info *fs_info;
int ret;
- proc_apply_options(s, fc, pid_ns, current_user_ns());
+ fs_info = kzalloc(sizeof(*fs_info), GFP_KERNEL);
+ if (!fs_info)
+ return -ENOMEM;
+
+ fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
+ proc_apply_options(fs_info, fc, current_user_ns());
/* User space would break if executables or devices appear on proc */
s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
@@ -111,6 +181,7 @@
s->s_magic = PROC_SUPER_MAGIC;
s->s_op = &proc_sops;
s->s_time_gran = 1;
+ s->s_fs_info = fs_info;
/*
* procfs isn't actually a stacking filesystem; however, there is
@@ -118,7 +189,7 @@
* top of it
*/
s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
-
+
/* procfs dentries and inodes don't require IO to create */
s->s_shrink.seeks = 0;
@@ -145,19 +216,17 @@
static int proc_reconfigure(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
- struct pid_namespace *pid = sb->s_fs_info;
+ struct proc_fs_info *fs_info = proc_sb_info(sb);
sync_filesystem(sb);
- proc_apply_options(sb, fc, pid, current_user_ns());
+ proc_apply_options(fs_info, fc, current_user_ns());
return 0;
}
static int proc_get_tree(struct fs_context *fc)
{
- struct proc_fs_context *ctx = fc->fs_private;
-
- return get_tree_keyed(fc, proc_fill_super, ctx->pid_ns);
+ return get_tree_nodev(fc, proc_fill_super);
}
static void proc_fs_context_free(struct fs_context *fc)
@@ -193,21 +262,25 @@
static void proc_kill_sb(struct super_block *sb)
{
- struct pid_namespace *ns;
+ struct proc_fs_info *fs_info = proc_sb_info(sb);
- ns = (struct pid_namespace *)sb->s_fs_info;
- if (ns->proc_self)
- dput(ns->proc_self);
- if (ns->proc_thread_self)
- dput(ns->proc_thread_self);
+ if (!fs_info) {
+ kill_anon_super(sb);
+ return;
+ }
+
+ dput(fs_info->proc_self);
+ dput(fs_info->proc_thread_self);
+
kill_anon_super(sb);
- put_pid_ns(ns);
+ put_pid_ns(fs_info->pid_ns);
+ kfree(fs_info);
}
static struct file_system_type proc_fs_type = {
.name = "proc",
.init_fs_context = proc_init_fs_context,
- .parameters = &proc_fs_parameters,
+ .parameters = proc_fs_parameters,
.kill_sb = proc_kill_sb,
.fs_flags = FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM,
};
@@ -292,44 +365,8 @@
.nlink = 2,
.refcnt = REFCOUNT_INIT(1),
.proc_iops = &proc_root_inode_operations,
- .proc_fops = &proc_root_operations,
+ .proc_dir_ops = &proc_root_operations,
.parent = &proc_root,
.subdir = RB_ROOT,
.name = "/proc",
};
-
-int pid_ns_prepare_proc(struct pid_namespace *ns)
-{
- struct proc_fs_context *ctx;
- struct fs_context *fc;
- struct vfsmount *mnt;
-
- fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT);
- if (IS_ERR(fc))
- return PTR_ERR(fc);
-
- if (fc->user_ns != ns->user_ns) {
- put_user_ns(fc->user_ns);
- fc->user_ns = get_user_ns(ns->user_ns);
- }
-
- ctx = fc->fs_private;
- if (ctx->pid_ns != ns) {
- put_pid_ns(ctx->pid_ns);
- get_pid_ns(ns);
- ctx->pid_ns = ns;
- }
-
- mnt = fc_mount(fc);
- put_fs_context(fc);
- if (IS_ERR(mnt))
- return PTR_ERR(mnt);
-
- ns->proc_mnt = mnt;
- return 0;
-}
-
-void pid_ns_release_proc(struct pid_namespace *ns)
-{
- kern_unmount(ns->proc_mnt);
-}
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 5823368..a401215 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -12,7 +12,7 @@
struct inode *inode,
struct delayed_call *done)
{
- struct pid_namespace *ns = proc_pid_ns(inode);
+ struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
pid_t tgid = task_tgid_nr_ns(current, ns);
char *name;
@@ -20,7 +20,7 @@
* Not currently supported. Once we can inherit all of struct pid,
* we can allow this.
*/
- if (current->flags & PF_KTHREAD)
+ if (current->flags & PF_IO_WORKER)
return ERR_PTR(-EOPNOTSUPP);
if (!tgid)
@@ -43,10 +43,10 @@
int proc_setup_self(struct super_block *s)
{
struct inode *root_inode = d_inode(s->s_root);
- struct pid_namespace *ns = proc_pid_ns(root_inode);
+ struct proc_fs_info *fs_info = proc_sb_info(s);
struct dentry *self;
int ret = -ENOMEM;
-
+
inode_lock(root_inode);
self = d_alloc_name(s->s_root, "self");
if (self) {
@@ -69,7 +69,7 @@
if (ret)
pr_err("proc_fill_super: can't allocate /proc/self\n");
else
- ns->proc_self = self;
+ fs_info->proc_self = self;
return ret;
}
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 80c305f..3bed48d 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -23,7 +23,7 @@
#ifdef arch_idle_time
-static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
+u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle;
@@ -45,7 +45,7 @@
#else
-static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
+u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle, idle_usecs = -1ULL;
@@ -120,20 +120,23 @@
getboottime64(&boottime);
for_each_possible_cpu(i) {
- struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
- user += kcs->cpustat[CPUTIME_USER];
- nice += kcs->cpustat[CPUTIME_NICE];
- system += kcs->cpustat[CPUTIME_SYSTEM];
- idle += get_idle_time(kcs, i);
- iowait += get_iowait_time(kcs, i);
- irq += kcs->cpustat[CPUTIME_IRQ];
- softirq += kcs->cpustat[CPUTIME_SOFTIRQ];
- steal += kcs->cpustat[CPUTIME_STEAL];
- guest += kcs->cpustat[CPUTIME_GUEST];
- guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE];
- sum += kstat_cpu_irqs_sum(i);
- sum += arch_irq_stat_cpu(i);
+ kcpustat_cpu_fetch(&kcpustat, i);
+
+ user += cpustat[CPUTIME_USER];
+ nice += cpustat[CPUTIME_NICE];
+ system += cpustat[CPUTIME_SYSTEM];
+ idle += get_idle_time(&kcpustat, i);
+ iowait += get_iowait_time(&kcpustat, i);
+ irq += cpustat[CPUTIME_IRQ];
+ softirq += cpustat[CPUTIME_SOFTIRQ];
+ steal += cpustat[CPUTIME_STEAL];
+ guest += cpustat[CPUTIME_GUEST];
+ guest_nice += cpustat[CPUTIME_GUEST_NICE];
+ sum += kstat_cpu_irqs_sum(i);
+ sum += arch_irq_stat_cpu(i);
for (j = 0; j < NR_SOFTIRQS; j++) {
unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
@@ -157,19 +160,22 @@
seq_putc(p, '\n');
for_each_online_cpu(i) {
- struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
+
+ kcpustat_cpu_fetch(&kcpustat, i);
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
- user = kcs->cpustat[CPUTIME_USER];
- nice = kcs->cpustat[CPUTIME_NICE];
- system = kcs->cpustat[CPUTIME_SYSTEM];
- idle = get_idle_time(kcs, i);
- iowait = get_iowait_time(kcs, i);
- irq = kcs->cpustat[CPUTIME_IRQ];
- softirq = kcs->cpustat[CPUTIME_SOFTIRQ];
- steal = kcs->cpustat[CPUTIME_STEAL];
- guest = kcs->cpustat[CPUTIME_GUEST];
- guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE];
+ user = cpustat[CPUTIME_USER];
+ nice = cpustat[CPUTIME_NICE];
+ system = cpustat[CPUTIME_SYSTEM];
+ idle = get_idle_time(&kcpustat, i);
+ iowait = get_iowait_time(&kcpustat, i);
+ irq = cpustat[CPUTIME_IRQ];
+ softirq = cpustat[CPUTIME_SOFTIRQ];
+ steal = cpustat[CPUTIME_STEAL];
+ guest = cpustat[CPUTIME_GUEST];
+ guest_nice = cpustat[CPUTIME_GUEST_NICE];
seq_printf(p, "cpu%d", i);
seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
@@ -217,16 +223,17 @@
return single_open_size(file, show_stat, NULL, size);
}
-static const struct file_operations proc_stat_operations = {
- .open = stat_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
+static const struct proc_ops stat_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_open = stat_open,
+ .proc_read_iter = seq_read_iter,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
};
static int __init proc_stat_init(void)
{
- proc_create("stat", 0, NULL, &proc_stat_operations);
+ proc_create("stat", 0, NULL, &stat_proc_ops);
return 0;
}
fs_initcall(proc_stat_init);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f51dadd..ba98371 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -123,38 +123,14 @@
}
#endif
-static void vma_stop(struct proc_maps_private *priv)
-{
- struct mm_struct *mm = priv->mm;
-
- release_task_mempolicy(priv);
- up_read(&mm->mmap_sem);
- mmput(mm);
-}
-
-static struct vm_area_struct *
-m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
-{
- if (vma == priv->tail_vma)
- return NULL;
- return vma->vm_next ?: priv->tail_vma;
-}
-
-static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
-{
- if (m->count < m->size) /* vma is copied successfully */
- m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL;
-}
-
static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
- unsigned long last_addr = m->version;
+ unsigned long last_addr = *ppos;
struct mm_struct *mm;
struct vm_area_struct *vma;
- unsigned int pos = *ppos;
- /* See m_cache_vma(). Zero at the start or after lseek. */
+ /* See m_next(). Zero at the start or after lseek. */
if (last_addr == -1UL)
return NULL;
@@ -163,64 +139,59 @@
return ERR_PTR(-ESRCH);
mm = priv->mm;
- if (!mm || !mmget_not_zero(mm))
+ if (!mm || !mmget_not_zero(mm)) {
+ put_task_struct(priv->task);
+ priv->task = NULL;
return NULL;
+ }
- if (down_read_killable(&mm->mmap_sem)) {
+ if (mmap_read_lock_killable(mm)) {
mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
return ERR_PTR(-EINTR);
}
hold_task_mempolicy(priv);
priv->tail_vma = get_gate_vma(mm);
- if (last_addr) {
- vma = find_vma(mm, last_addr - 1);
- if (vma && vma->vm_start <= last_addr)
- vma = m_next_vma(priv, vma);
- if (vma)
- return vma;
- }
-
- m->version = 0;
- if (pos < mm->map_count) {
- for (vma = mm->mmap; pos; pos--) {
- m->version = vma->vm_start;
- vma = vma->vm_next;
- }
+ vma = find_vma(mm, last_addr);
+ if (vma)
return vma;
- }
- /* we do not bother to update m->version in this case */
- if (pos == mm->map_count && priv->tail_vma)
- return priv->tail_vma;
-
- vma_stop(priv);
- return NULL;
+ return priv->tail_vma;
}
-static void *m_next(struct seq_file *m, void *v, loff_t *pos)
+static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
- struct vm_area_struct *next;
+ struct vm_area_struct *next, *vma = v;
- (*pos)++;
- next = m_next_vma(priv, v);
- if (!next)
- vma_stop(priv);
+ if (vma == priv->tail_vma)
+ next = NULL;
+ else if (vma->vm_next)
+ next = vma->vm_next;
+ else
+ next = priv->tail_vma;
+
+ *ppos = next ? next->vm_start : -1UL;
+
return next;
}
static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
+ struct mm_struct *mm = priv->mm;
- if (!IS_ERR_OR_NULL(v))
- vma_stop(priv);
- if (priv->task) {
- put_task_struct(priv->task);
- priv->task = NULL;
- }
+ if (!priv->task)
+ return;
+
+ release_task_mempolicy(priv);
+ mmap_read_unlock(mm);
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
}
static int proc_maps_open(struct inode *inode, struct file *file,
@@ -363,7 +334,6 @@
static int show_map(struct seq_file *m, void *v)
{
show_map_vma(m, v);
- m_cache_vma(m, v);
return 0;
}
@@ -460,7 +430,8 @@
}
static void smaps_account(struct mem_size_stats *mss, struct page *page,
- bool compound, bool young, bool dirty, bool locked)
+ bool compound, bool young, bool dirty, bool locked,
+ bool migration)
{
int i, nr = compound ? compound_nr(page) : 1;
unsigned long size = nr * PAGE_SIZE;
@@ -487,8 +458,15 @@
* page_count(page) == 1 guarantees the page is mapped exactly once.
* If any subpage of the compound page mapped with PTE it would elevate
* page_count().
+ *
+ * The page_mapcount() is called to get a snapshot of the mapcount.
+ * Without holding the page lock this snapshot can be slightly wrong as
+ * we cannot always read the mapcount atomically. It is not safe to
+ * call page_mapcount() even with PTL held if the page is not mapped,
+ * especially for migration entries. Treat regular migration entries
+ * as mapcount == 1.
*/
- if (page_count(page) == 1) {
+ if ((page_count(page) == 1) || migration) {
smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
locked, true);
return;
@@ -505,7 +483,7 @@
#ifdef CONFIG_SHMEM
static int smaps_pte_hole(unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+ __always_unused int depth, struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
@@ -525,6 +503,7 @@
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
+ bool migration = false;
if (pte_present(*pte)) {
page = vm_normal_page(vma, addr, *pte);
@@ -544,29 +523,25 @@
} else {
mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
}
- } else if (is_migration_entry(swpent))
+ } else if (is_migration_entry(swpent)) {
+ migration = true;
page = migration_entry_to_page(swpent);
- else if (is_device_private_entry(swpent))
+ } else if (is_device_private_entry(swpent))
page = device_private_entry_to_page(swpent);
} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
&& pte_none(*pte))) {
- page = find_get_entry(vma->vm_file->f_mapping,
+ page = xa_load(&vma->vm_file->f_mapping->i_pages,
linear_page_index(vma, addr));
- if (!page)
- return;
-
if (xa_is_value(page))
mss->swap += PAGE_SIZE;
- else
- put_page(page);
-
return;
}
if (!page)
return;
- smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
+ smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte),
+ locked, migration);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -576,10 +551,20 @@
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
- struct page *page;
+ struct page *page = NULL;
+ bool migration = false;
- /* FOLL_DUMP will return -EFAULT on huge zero page */
- page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
+ if (pmd_present(*pmd)) {
+ /* FOLL_DUMP will return -EFAULT on huge zero page */
+ page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
+ } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
+ swp_entry_t entry = pmd_to_swp_entry(*pmd);
+
+ if (is_migration_entry(entry)) {
+ migration = true;
+ page = migration_entry_to_page(entry);
+ }
+ }
if (IS_ERR_OR_NULL(page))
return;
if (PageAnon(page))
@@ -590,7 +575,9 @@
/* pass */;
else
mss->file_thp += HPAGE_PMD_SIZE;
- smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
+
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
+ locked, migration);
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -608,8 +595,7 @@
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
- if (pmd_present(*pmd))
- smaps_pmd_entry(pmd, addr, walk);
+ smaps_pmd_entry(pmd, addr, walk);
spin_unlock(ptl);
goto out;
}
@@ -617,7 +603,7 @@
if (pmd_trans_unstable(pmd))
goto out;
/*
- * The mmap_sem held all the way back in m_start() is what
+ * The mmap_lock held all the way back in m_start() is what
* keeps khugepaged out of here and from collapsing things
* in here.
*/
@@ -652,9 +638,6 @@
[ilog2(VM_GROWSDOWN)] = "gd",
[ilog2(VM_PFNMAP)] = "pf",
[ilog2(VM_DENYWRITE)] = "dw",
-#ifdef CONFIG_X86_INTEL_MPX
- [ilog2(VM_MPX)] = "mp",
-#endif
[ilog2(VM_LOCKED)] = "lo",
[ilog2(VM_IO)] = "io",
[ilog2(VM_SEQ_READ)] = "sr",
@@ -668,6 +651,9 @@
[ilog2(VM_ARCH_1)] = "ar",
[ilog2(VM_WIPEONFORK)] = "wf",
[ilog2(VM_DONTDUMP)] = "dd",
+#ifdef CONFIG_ARM64_BTI
+ [ilog2(VM_ARM64_BTI)] = "bt",
+#endif
#ifdef CONFIG_MEM_SOFT_DIRTY
[ilog2(VM_SOFTDIRTY)] = "sd",
#endif
@@ -677,6 +663,10 @@
[ilog2(VM_MERGEABLE)] = "mg",
[ilog2(VM_UFFD_MISSING)]= "um",
[ilog2(VM_UFFD_WP)] = "uw",
+#ifdef CONFIG_ARM64_MTE
+ [ilog2(VM_MTE)] = "mt",
+ [ilog2(VM_MTE_ALLOWED)] = "",
+#endif
#ifdef CONFIG_ARCH_HAS_PKEYS
/* These come out via ProtectionKey: */
[ilog2(VM_PKEY_BIT0)] = "",
@@ -747,9 +737,21 @@
.pte_hole = smaps_pte_hole,
};
+/*
+ * Gather mem stats from @vma with the indicated beginning
+ * address @start, and keep them in @mss.
+ *
+ * Use vm_start of @vma as the beginning address if @start is 0.
+ */
static void smap_gather_stats(struct vm_area_struct *vma,
- struct mem_size_stats *mss)
+ struct mem_size_stats *mss, unsigned long start)
{
+ const struct mm_walk_ops *ops = &smaps_walk_ops;
+
+ /* Invalid start */
+ if (start >= vma->vm_end)
+ return;
+
#ifdef CONFIG_SHMEM
/* In case of smaps_rollup, reset the value from previous vma */
mss->check_shmem_swap = false;
@@ -766,18 +768,20 @@
*/
unsigned long shmem_swapped = shmem_swap_usage(vma);
- if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
- !(vma->vm_flags & VM_WRITE)) {
+ if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
+ !(vma->vm_flags & VM_WRITE))) {
mss->swap += shmem_swapped;
} else {
mss->check_shmem_swap = true;
- walk_page_vma(vma, &smaps_shmem_walk_ops, mss);
- return;
+ ops = &smaps_shmem_walk_ops;
}
}
#endif
- /* mmap_sem is held in m_start */
- walk_page_vma(vma, &smaps_walk_ops, mss);
+ /* mmap_lock is held in m_start */
+ if (!start)
+ walk_page_vma(vma, ops, mss);
+ else
+ walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
}
#define SEQ_PUT_DEC(str, val) \
@@ -810,7 +814,7 @@
SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
- SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
+ SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
mss->private_hugetlb >> 10, 7);
@@ -829,7 +833,7 @@
memset(&mss, 0, sizeof(mss));
- smap_gather_stats(vma, &mss);
+ smap_gather_stats(vma, &mss, 0);
show_map_vma(m, vma);
@@ -840,15 +844,13 @@
__show_smap(m, &mss, false);
- seq_printf(m, "THPeligible: %d\n",
- transparent_hugepage_enabled(vma));
+ seq_printf(m, "THPeligible: %d\n",
+ transparent_hugepage_active(vma));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
show_smap_vma_flags(m, vma);
- m_cache_vma(m, vma);
-
return 0;
}
@@ -873,15 +875,79 @@
memset(&mss, 0, sizeof(mss));
- ret = down_read_killable(&mm->mmap_sem);
+ ret = mmap_read_lock_killable(mm);
if (ret)
goto out_put_mm;
hold_task_mempolicy(priv);
- for (vma = priv->mm->mmap; vma; vma = vma->vm_next) {
- smap_gather_stats(vma, &mss);
+ for (vma = priv->mm->mmap; vma;) {
+ smap_gather_stats(vma, &mss, 0);
last_vma_end = vma->vm_end;
+
+ /*
+ * Release mmap_lock temporarily if someone wants to
+ * access it for write request.
+ */
+ if (mmap_lock_is_contended(mm)) {
+ mmap_read_unlock(mm);
+ ret = mmap_read_lock_killable(mm);
+ if (ret) {
+ release_task_mempolicy(priv);
+ goto out_put_mm;
+ }
+
+ /*
+ * After dropping the lock, there are four cases to
+ * consider. See the following example for explanation.
+ *
+ * +------+------+-----------+
+ * | VMA1 | VMA2 | VMA3 |
+ * +------+------+-----------+
+ * | | | |
+ * 4k 8k 16k 400k
+ *
+ * Suppose we drop the lock after reading VMA2 due to
+ * contention, then we get:
+ *
+ * last_vma_end = 16k
+ *
+ * 1) VMA2 is freed, but VMA3 exists:
+ *
+ * find_vma(mm, 16k - 1) will return VMA3.
+ * In this case, just continue from VMA3.
+ *
+ * 2) VMA2 still exists:
+ *
+ * find_vma(mm, 16k - 1) will return VMA2.
+ * Iterate the loop like the original one.
+ *
+ * 3) No more VMAs can be found:
+ *
+ * find_vma(mm, 16k - 1) will return NULL.
+ * No more things to do, just break.
+ *
+ * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
+ *
+ * find_vma(mm, 16k - 1) will return VMA' whose range
+ * contains last_vma_end.
+ * Iterate VMA' from last_vma_end.
+ */
+ vma = find_vma(mm, last_vma_end - 1);
+ /* Case 3 above */
+ if (!vma)
+ break;
+
+ /* Case 1 above */
+ if (vma->vm_start >= last_vma_end)
+ continue;
+
+ /* Case 4 above */
+ if (vma->vm_end > last_vma_end)
+ smap_gather_stats(vma, &mss, last_vma_end);
+ }
+ /* Case 2 above */
+ vma = vma->vm_next;
}
show_vma_header_prefix(m, priv->mm->mmap->vm_start,
@@ -892,7 +958,7 @@
__show_smap(m, &mss, true);
release_task_mempolicy(priv);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
out_put_mm:
mmput(mm);
@@ -985,6 +1051,25 @@
};
#ifdef CONFIG_MEM_SOFT_DIRTY
+
+#define is_cow_mapping(flags) (((flags) & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE)
+
+static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+{
+ struct page *page;
+
+ if (!pte_write(pte))
+ return false;
+ if (!is_cow_mapping(vma->vm_flags))
+ return false;
+ if (likely(!atomic_read(&vma->vm_mm->has_pinned)))
+ return false;
+ page = vm_normal_page(vma, addr, pte);
+ if (!page)
+ return false;
+ return page_maybe_dma_pinned(page);
+}
+
static inline void clear_soft_dirty(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte)
{
@@ -999,6 +1084,8 @@
if (pte_present(ptent)) {
pte_t old_pte;
+ if (pte_is_pinned(vma, addr, ptent))
+ return;
old_pte = ptep_modify_prot_start(vma, addr, pte);
ptent = pte_wrprotect(old_pte);
ptent = pte_clear_soft_dirty(ptent);
@@ -1139,7 +1226,6 @@
struct mm_struct *mm;
struct vm_area_struct *vma;
enum clear_refs_types type;
- struct mmu_gather tlb;
int itype;
int rv;
@@ -1165,71 +1251,41 @@
.type = type,
};
+ if (mmap_write_lock_killable(mm)) {
+ count = -EINTR;
+ goto out_mm;
+ }
if (type == CLEAR_REFS_MM_HIWATER_RSS) {
- if (down_write_killable(&mm->mmap_sem)) {
- count = -EINTR;
- goto out_mm;
- }
-
/*
* Writing 5 to /proc/pid/clear_refs resets the peak
* resident set size to this mm's current rss value.
*/
reset_mm_hiwater_rss(mm);
- up_write(&mm->mmap_sem);
- goto out_mm;
+ goto out_unlock;
}
- if (down_read_killable(&mm->mmap_sem)) {
- count = -EINTR;
- goto out_mm;
- }
- tlb_gather_mmu(&tlb, mm, 0, -1);
if (type == CLEAR_REFS_SOFT_DIRTY) {
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (!(vma->vm_flags & VM_SOFTDIRTY))
continue;
- up_read(&mm->mmap_sem);
- if (down_write_killable(&mm->mmap_sem)) {
- count = -EINTR;
- goto out_mm;
- }
- /*
- * Avoid to modify vma->vm_flags
- * without locked ops while the
- * coredump reads the vm_flags.
- */
- if (!mmget_still_valid(mm)) {
- /*
- * Silently return "count"
- * like if get_task_mm()
- * failed. FIXME: should this
- * function have returned
- * -ESRCH if get_task_mm()
- * failed like if
- * get_proc_task() fails?
- */
- up_write(&mm->mmap_sem);
- goto out_mm;
- }
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- vma->vm_flags &= ~VM_SOFTDIRTY;
- vma_set_page_prot(vma);
- }
- downgrade_write(&mm->mmap_sem);
- break;
+ vma->vm_flags &= ~VM_SOFTDIRTY;
+ vma_set_page_prot(vma);
}
+ inc_tlb_flush_pending(mm);
mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
0, NULL, mm, 0, -1UL);
mmu_notifier_invalidate_range_start(&range);
}
walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
&cp);
- if (type == CLEAR_REFS_SOFT_DIRTY)
+ if (type == CLEAR_REFS_SOFT_DIRTY) {
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb, 0, -1);
- up_read(&mm->mmap_sem);
+ flush_tlb_mm(mm);
+ dec_tlb_flush_pending(mm);
+ }
+out_unlock:
+ mmap_write_unlock(mm);
out_mm:
mmput(mm);
}
@@ -1282,7 +1338,7 @@
}
static int pagemap_pte_hole(unsigned long start, unsigned long end,
- struct mm_walk *walk)
+ __always_unused int depth, struct mm_walk *walk)
{
struct pagemapread *pm = walk->private;
unsigned long addr = start;
@@ -1326,6 +1382,7 @@
{
u64 frame = 0, flags = 0;
struct page *page = NULL;
+ bool migration = false;
if (pte_present(pte)) {
if (pm->show_pfn)
@@ -1343,8 +1400,10 @@
frame = swp_type(entry) |
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
flags |= PM_SWAP;
- if (is_migration_entry(entry))
+ if (is_migration_entry(entry)) {
+ migration = true;
page = migration_entry_to_page(entry);
+ }
if (is_device_private_entry(entry))
page = device_private_entry_to_page(entry);
@@ -1352,7 +1411,7 @@
if (page && !PageAnon(page))
flags |= PM_FILE;
- if (page && page_mapcount(page) == 1)
+ if (page && !migration && page_mapcount(page) == 1)
flags |= PM_MMAP_EXCLUSIVE;
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
@@ -1368,8 +1427,9 @@
spinlock_t *ptl;
pte_t *pte, *orig_pte;
int err = 0;
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ bool migration = false;
+
ptl = pmd_trans_huge_lock(pmdp, vma);
if (ptl) {
u64 flags = 0, frame = 0;
@@ -1404,11 +1464,12 @@
if (pmd_swp_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
VM_BUG_ON(!is_pmd_migration_entry(pmd));
+ migration = is_migration_entry(entry);
page = migration_entry_to_page(entry);
}
#endif
- if (page && page_mapcount(page) == 1)
+ if (page && !migration && page_mapcount(page) == 1)
flags |= PM_MMAP_EXCLUSIVE;
for (; addr != end; addr += PAGE_SIZE) {
@@ -1594,11 +1655,11 @@
/* overflow ? */
if (end < start_vaddr || end > end_vaddr)
end = end_vaddr;
- ret = down_read_killable(&mm->mmap_sem);
+ ret = mmap_read_lock_killable(mm);
if (ret)
goto out_free;
ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
start_vaddr = end;
len = min(count, PM_ENTRY_BYTES * pm.pos);
@@ -1857,7 +1918,7 @@
if (is_vm_hugetlb_page(vma))
seq_puts(m, " huge");
- /* mmap_sem is held by m_start */
+ /* mmap_lock is held by m_start */
walk_page_vma(vma, &show_numa_ops, md);
if (!md->pages)
@@ -1891,7 +1952,6 @@
seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
out:
seq_putc(m, '\n');
- m_cache_vma(m, vma);
return 0;
}
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 7907e64..a6d21fc 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -25,7 +25,7 @@
struct rb_node *p;
unsigned long bytes = 0, sbytes = 0, slack = 0, size;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
vma = rb_entry(p, struct vm_area_struct, vm_rb);
@@ -77,7 +77,7 @@
"Shared:\t%8lu bytes\n",
bytes, slack, sbytes);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
}
unsigned long task_vsize(struct mm_struct *mm)
@@ -86,12 +86,12 @@
struct rb_node *p;
unsigned long vsize = 0;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
vma = rb_entry(p, struct vm_area_struct, vm_rb);
vsize += vma->vm_end - vma->vm_start;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return vsize;
}
@@ -104,7 +104,7 @@
struct rb_node *p;
unsigned long size = kobjsize(mm);
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
vma = rb_entry(p, struct vm_area_struct, vm_rb);
size += kobjsize(vma);
@@ -119,7 +119,7 @@
>> PAGE_SHIFT;
*data = (PAGE_ALIGN(mm->start_stack) - (mm->start_data & PAGE_MASK))
>> PAGE_SHIFT;
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
size >>= PAGE_SHIFT;
size += *text + *data;
*resident = size;
@@ -211,7 +211,7 @@
if (!mm || !mmget_not_zero(mm))
return NULL;
- if (down_read_killable(&mm->mmap_sem)) {
+ if (mmap_read_lock_killable(mm)) {
mmput(mm);
return ERR_PTR(-EINTR);
}
@@ -221,7 +221,7 @@
if (n-- == 0)
return p;
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
mmput(mm);
return NULL;
}
@@ -231,7 +231,7 @@
struct proc_maps_private *priv = m->private;
if (!IS_ERR_OR_NULL(_vml)) {
- up_read(&priv->mm->mmap_sem);
+ mmap_read_unlock(priv->mm);
mmput(priv->mm);
}
if (priv->task) {
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index fac9e50..d56681d 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -12,11 +12,18 @@
struct inode *inode,
struct delayed_call *done)
{
- struct pid_namespace *ns = proc_pid_ns(inode);
+ struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
pid_t tgid = task_tgid_nr_ns(current, ns);
pid_t pid = task_pid_nr_ns(current, ns);
char *name;
+ /*
+ * Not currently supported. Once we can inherit all of struct pid,
+ * we can allow this.
+ */
+ if (current->flags & PF_IO_WORKER)
+ return ERR_PTR(-EOPNOTSUPP);
+
if (!pid)
return ERR_PTR(-ENOENT);
name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
@@ -36,7 +43,7 @@
int proc_setup_thread_self(struct super_block *s)
{
struct inode *root_inode = d_inode(s->s_root);
- struct pid_namespace *ns = proc_pid_ns(root_inode);
+ struct proc_fs_info *fs_info = proc_sb_info(s);
struct dentry *thread_self;
int ret = -ENOMEM;
@@ -60,9 +67,9 @@
inode_unlock(root_inode);
if (ret)
- pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
+ pr_err("proc_fill_super: can't allocate /proc/thread-self\n");
else
- ns->proc_thread_self = thread_self;
+ fs_info->proc_thread_self = thread_self;
return ret;
}
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index a4c2791..deb99bc 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -5,22 +5,29 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/time.h>
+#include <linux/time_namespace.h>
#include <linux/kernel_stat.h>
static int uptime_proc_show(struct seq_file *m, void *v)
{
struct timespec64 uptime;
struct timespec64 idle;
- u64 nsec;
+ u64 idle_nsec;
u32 rem;
int i;
- nsec = 0;
- for_each_possible_cpu(i)
- nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
+ idle_nsec = 0;
+ for_each_possible_cpu(i) {
+ struct kernel_cpustat kcs;
+
+ kcpustat_cpu_fetch(&kcs, i);
+ idle_nsec += get_idle_time(&kcs, i);
+ }
ktime_get_boottime_ts64(&uptime);
- idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
+ timens_add_boottime(&uptime);
+
+ idle.tv_sec = div_u64_rem(idle_nsec, NSEC_PER_SEC, &rem);
idle.tv_nsec = rem;
seq_printf(m, "%lu.%02lu %lu.%02lu\n",
(unsigned long) uptime.tv_sec,
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 080ca9d..0e4278d 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -27,7 +27,6 @@
#include <linux/pagemap.h>
#include <linux/uaccess.h>
#include <linux/mem_encrypt.h>
-#include <asm/pgtable.h>
#include <asm/io.h>
#include "internal.h"
@@ -125,9 +124,13 @@
nr_bytes = count;
/* If pfn is not ram, return zeros for sparse dump files */
- if (pfn_is_ram(pfn) == 0)
- memset(buf, 0, nr_bytes);
- else {
+ if (pfn_is_ram(pfn) == 0) {
+ tmp = 0;
+ if (!userbuf)
+ memset(buf, 0, nr_bytes);
+ else if (clear_user(buf, nr_bytes))
+ tmp = -EFAULT;
+ } else {
if (encrypted)
tmp = copy_oldmem_page_encrypted(pfn, buf,
nr_bytes,
@@ -136,10 +139,10 @@
else
tmp = copy_oldmem_page(pfn, buf, nr_bytes,
offset, userbuf);
-
- if (tmp < 0)
- return tmp;
}
+ if (tmp < 0)
+ return tmp;
+
*ppos += nr_bytes;
count -= nr_bytes;
buf += nr_bytes;
@@ -668,10 +671,10 @@
}
#endif
-static const struct file_operations proc_vmcore_operations = {
- .read = read_vmcore,
- .llseek = default_llseek,
- .mmap = mmap_vmcore,
+static const struct proc_ops vmcore_proc_ops = {
+ .proc_read = read_vmcore,
+ .proc_lseek = default_llseek,
+ .proc_mmap = mmap_vmcore,
};
static struct vmcore* __init get_new_element(void)
@@ -1556,7 +1559,7 @@
elfcorehdr_free(elfcorehdr_addr);
elfcorehdr_addr = ELFCORE_ADDR_ERR;
- proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations);
+ proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &vmcore_proc_ops);
if (proc_vmcore)
proc_vmcore->size = vmcore_size;
return 0;