Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/kernel/pid.c b/kernel/pid.c
index 0a9f2e4..4856818 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -42,6 +42,8 @@
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
+#include <net/sock.h>
+#include <uapi/linux/pidfd.h>
struct pid init_struct_pid = {
.count = REFCOUNT_INIT(1),
@@ -144,9 +146,6 @@
/* Handle a fork failure of the first process */
WARN_ON(ns->child_reaper);
ns->pid_allocated = 0;
- /* fall through */
- case 0:
- schedule_work(&ns->proc_work);
break;
}
@@ -157,7 +156,8 @@
call_rcu(&pid->rcu, delayed_put_pid);
}
-struct pid *alloc_pid(struct pid_namespace *ns)
+struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
+ size_t set_tid_size)
{
struct pid *pid;
enum pid_type type;
@@ -166,6 +166,17 @@
struct upid *upid;
int retval = -ENOMEM;
+ /*
+ * set_tid_size contains the size of the set_tid array. Starting at
+ * the most nested currently active PID namespace it tells alloc_pid()
+ * which PID to set for a process in that most nested PID namespace
+ * up to set_tid_size PID namespaces. It does not have to set the PID
+ * for a process in all nested PID namespaces but set_tid_size must
+ * never be greater than the current ns->level + 1.
+ */
+ if (set_tid_size > ns->level + 1)
+ return ERR_PTR(-EINVAL);
+
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid)
return ERR_PTR(retval);
@@ -174,24 +185,54 @@
pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
- int pid_min = 1;
+ int tid = 0;
+
+ if (set_tid_size) {
+ tid = set_tid[ns->level - i];
+
+ retval = -EINVAL;
+ if (tid < 1 || tid >= pid_max)
+ goto out_free;
+ /*
+ * Also fail if a PID != 1 is requested and
+ * no PID 1 exists.
+ */
+ if (tid != 1 && !tmp->child_reaper)
+ goto out_free;
+ retval = -EPERM;
+ if (!checkpoint_restore_ns_capable(tmp->user_ns))
+ goto out_free;
+ set_tid_size--;
+ }
idr_preload(GFP_KERNEL);
spin_lock_irq(&pidmap_lock);
- /*
- * init really needs pid 1, but after reaching the maximum
- * wrap back to RESERVED_PIDS
- */
- if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
- pid_min = RESERVED_PIDS;
+ if (tid) {
+ nr = idr_alloc(&tmp->idr, NULL, tid,
+ tid + 1, GFP_ATOMIC);
+ /*
+ * If ENOSPC is returned it means that the PID is
+ * already in use. Return EEXIST in that case.
+ */
+ if (nr == -ENOSPC)
+ nr = -EEXIST;
+ } else {
+ int pid_min = 1;
+ /*
+ * init really needs pid 1, but after reaching the
+ * maximum wrap back to RESERVED_PIDS
+ */
+ if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
+ pid_min = RESERVED_PIDS;
- /*
- * Store a null pointer so find_pid_ns does not find
- * a partially initialized PID (see below).
- */
- nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
- pid_max, GFP_ATOMIC);
+ /*
+ * Store a null pointer so find_pid_ns does not find
+ * a partially initialized PID (see below).
+ */
+ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
+ pid_max, GFP_ATOMIC);
+ }
spin_unlock_irq(&pidmap_lock);
idr_preload_end();
@@ -205,17 +246,24 @@
tmp = tmp->parent;
}
- if (unlikely(is_child_reaper(pid))) {
- if (pid_ns_prepare_proc(ns))
- goto out_free;
- }
+ /*
+ * ENOMEM is not the most obvious choice especially for the case
+ * where the child subreaper has already exited and the pid
+ * namespace denies the creation of any new processes. But ENOMEM
+ * is what we have exposed to userspace for a long time and it is
+ * documented behavior for pid namespaces. So we can't easily
+ * change it even if there were an error code better suited.
+ */
+ retval = -ENOMEM;
get_pid_ns(ns);
refcount_set(&pid->count, 1);
+ spin_lock_init(&pid->lock);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
init_waitqueue_head(&pid->wait_pidfd);
+ INIT_HLIST_HEAD(&pid->inodes);
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
@@ -299,7 +347,7 @@
*pid_ptr = new;
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
- if (!hlist_empty(&pid->tasks[tmp]))
+ if (pid_has_task(pid, tmp))
return;
free_pid(pid);
@@ -317,6 +365,25 @@
attach_pid(task, type);
}
+void exchange_tids(struct task_struct *left, struct task_struct *right)
+{ /* Make @left and @right trade thread_pid, the PIDTYPE_PID task list head of that pid, and the cached ->pid number. */
+ struct pid *pid1 = left->thread_pid;
+ struct pid *pid2 = right->thread_pid;
+ struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
+ struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
+
+ /* Swap the single entry tid lists (the PIDTYPE_PID task lists of pid1 and pid2) */
+ hlists_swap_heads_rcu(head1, head2);
+
+ /* Swap the per task_struct pid; each store goes through rcu_assign_pointer() */
+ rcu_assign_pointer(left->thread_pid, pid2);
+ rcu_assign_pointer(right->thread_pid, pid1);
+
+ /* Swap the cached value so ->pid equals pid_nr() of the task's new thread_pid */
+ WRITE_ONCE(left->pid, pid_nr(pid2));
+ WRITE_ONCE(right->pid, pid_nr(pid1));
+}
+
/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type type)
@@ -430,8 +497,7 @@
rcu_read_lock();
if (!ns)
ns = task_active_pid_ns(current);
- if (likely(pid_alive(task)))
- nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
+ nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
rcu_read_unlock();
return nr;
@@ -454,10 +520,30 @@
return idr_get_next(&ns->idr, &nr);
}
+struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
+{ /* Map descriptor @fd to its struct pid via pidfd_pid(); on success take a pid reference and copy the file's f_flags to *flags. */
+ struct fd f;
+ struct pid *pid;
+
+ f = fdget(fd);
+ if (!f.file)
+ return ERR_PTR(-EBADF); /* fdget() yielded no file for @fd */
+
+ pid = pidfd_pid(f.file);
+ if (!IS_ERR(pid)) {
+ get_pid(pid); /* not dropped here: the returned pid carries this reference */
+ *flags = f.file->f_flags; /* *flags is written only on this success path */
+ }
+
+ fdput(f);
+ return pid; /* the pid, or the ERR_PTR from pidfd_pid() unchanged */
+}
+
/**
* pidfd_create() - Create a new pid file descriptor.
*
- * @pid: struct pid that the pidfd will reference
+ * @pid: struct pid that the pidfd will reference
+ * @flags: flags OR-ed with O_RDWR | O_CLOEXEC for the new pidfd file
*
* This creates a new pid file descriptor with the O_CLOEXEC flag set.
*
@@ -467,12 +553,12 @@
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
*/
-static int pidfd_create(struct pid *pid)
+static int pidfd_create(struct pid *pid, unsigned int flags)
{
int fd;
fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
- O_RDWR | O_CLOEXEC);
+ flags | O_RDWR | O_CLOEXEC);
if (fd < 0)
put_pid(pid);
@@ -497,10 +583,10 @@
*/
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
{
- int fd, ret;
+ int fd;
struct pid *p;
- if (flags)
+ if (flags & ~PIDFD_NONBLOCK)
return -EINVAL;
if (pid <= 0)
@@ -510,13 +596,11 @@
if (!p)
return -ESRCH;
- ret = 0;
- rcu_read_lock();
- if (!pid_task(p, PIDTYPE_TGID))
- ret = -EINVAL;
- rcu_read_unlock();
+ if (pid_has_task(p, PIDTYPE_TGID))
+ fd = pidfd_create(p, flags);
+ else
+ fd = -EINVAL;
- fd = ret ?: pidfd_create(p);
put_pid(p);
return fd;
}
@@ -538,3 +622,84 @@
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}
+
+static struct file *__pidfd_fget(struct task_struct *task, int fd)
+{ /* Fetch @task's file for descriptor @fd via fget_task(), gated by a ptrace access check; returns the file or an ERR_PTR, never NULL. */
+ struct file *file;
+ int ret;
+
+ ret = down_read_killable(&task->signal->exec_update_lock);
+ if (ret)
+ return ERR_PTR(ret); /* propagate the failure; no up_read() on this path */
+
+ if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) /* evaluated between down_read_killable() and up_read() */
+ file = fget_task(task, fd);
+ else
+ file = ERR_PTR(-EPERM);
+
+ up_read(&task->signal->exec_update_lock);
+
+ return file ?: ERR_PTR(-EBADF); /* a NULL result from fget_task() is reported as -EBADF */
+}
+
+static int pidfd_getfd(struct pid *pid, int fd)
+{ /* Fetch descriptor @fd of the PIDTYPE_PID task of @pid and pass the file to receive_fd() with O_CLOEXEC; returns its result or a negative errno. */
+ struct task_struct *task;
+ struct file *file;
+ int ret;
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ return -ESRCH; /* get_pid_task() found no task for @pid */
+
+ file = __pidfd_fget(task, fd);
+ put_task_struct(task); /* task is not used past this point */
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ ret = receive_fd(file, O_CLOEXEC);
+ fput(file); /* dropped whether receive_fd() succeeded or failed */
+
+ return ret;
+}
+
+/**
+ * sys_pidfd_getfd() - Get a file descriptor from another process
+ *
+ * @pidfd: the pidfd file descriptor of the process
+ * @fd: the file descriptor number to get
+ * @flags: flags on how to get the fd (reserved; must be 0, else -EINVAL)
+ *
+ * This syscall gets a copy of a file descriptor from another process
+ * based on the pidfd, and file descriptor number. It requires that
+ * the calling process has the ability to ptrace the process represented
+ * by the pidfd. The process which is having its file descriptor copied
+ * is otherwise unaffected.
+ *
+ * Return: On success, a cloexec file descriptor is returned.
+ * On error, a negative errno number will be returned.
+ */
+SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
+ unsigned int, flags)
+{
+ struct pid *pid;
+ struct fd f;
+ int ret;
+
+ /* flags is currently unused - make sure it's unset */
+ if (flags)
+ return -EINVAL;
+
+ f = fdget(pidfd);
+ if (!f.file)
+ return -EBADF; /* fdget() yielded no file for @pidfd */
+
+ pid = pidfd_pid(f.file);
+ if (IS_ERR(pid))
+ ret = PTR_ERR(pid); /* error from pidfd_pid() is returned as-is */
+ else
+ ret = pidfd_getfd(pid, fd);
+
+ fdput(f); /* released on both the error and the success path */
+ return ret;
+}