Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index affd665..3f312bf 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -24,7 +24,7 @@
#ifdef CONFIG_PROC_SYSCTL
static int proc_ipc_dointvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table ipc_table;
@@ -35,7 +35,7 @@
}
static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table ipc_table;
@@ -46,7 +46,7 @@
}
static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ipc_namespace *ns = current->nsproxy->ipc_ns;
int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos);
@@ -59,7 +59,7 @@
}
static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table ipc_table;
memcpy(&ipc_table, table, sizeof(ipc_table));
@@ -70,7 +70,7 @@
}
static int proc_ipc_auto_msgmni(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table ipc_table;
int dummy = 0;
@@ -85,7 +85,7 @@
}
static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret, semmni;
struct ipc_namespace *ns = current->nsproxy->ipc_ns;
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
index 7c00f28..72a92a0 100644
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -19,7 +19,7 @@
}
static int proc_mq_dointvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table mq_table;
memcpy(&mq_table, table, sizeof(mq_table));
@@ -29,7 +29,7 @@
}
static int proc_mq_dointvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table mq_table;
memcpy(&mq_table, table, sizeof(mq_table));
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 2ea0c08..05d2176 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -63,6 +63,66 @@
int priority;
};
+/*
+ * Locking:
+ *
+ * Accesses to a message queue are synchronized by acquiring info->lock.
+ *
+ * There are two notable exceptions:
+ * - The actual wakeup of a sleeping task is performed using the wake_q
+ * framework. info->lock is already released when wake_up_q is called.
+ * - The exit codepaths after sleeping check ext_wait_queue->state without
+ * any locks. If it is STATE_READY, then the syscall is completed without
+ * acquiring info->lock.
+ *
+ * MQ_BARRIER:
+ * To achieve proper release/acquire memory barrier pairing, the state is set to
+ * STATE_READY with smp_store_release(), and it is read with READ_ONCE followed
+ * by smp_acquire__after_ctrl_dep(). In addition, wake_q_add_safe() is used.
+ *
+ * This prevents the following races:
+ *
+ * 1) With the simple wake_q_add(), the task could be gone already before
+ * the increase of the reference happens
+ * Thread A
+ *                              Thread B
+ * WRITE_ONCE(wait.state, STATE_NONE);
+ * schedule_hrtimeout()
+ *                              wake_q_add(A)
+ *                              if (cmpxchg()) // success
+ *                                 ->state = STATE_READY (reordered)
+ * <timeout returns>
+ * if (wait.state == STATE_READY) return;
+ * sysret to user space
+ * sys_exit()
+ *                              get_task_struct() // UaF
+ *
+ * Solution: Use wake_q_add_safe() and perform the get_task_struct() before
+ * the smp_store_release() that does ->state = STATE_READY.
+ *
+ * 2) Without proper _release/_acquire barriers, the woken up task
+ * could read stale data
+ *
+ * Thread A
+ *                              Thread B
+ * do_mq_timedreceive
+ * WRITE_ONCE(wait.state, STATE_NONE);
+ * schedule_hrtimeout()
+ *                              state = STATE_READY;
+ * <timeout returns>
+ * if (wait.state == STATE_READY) return;
+ * msg_ptr = wait.msg;          // Access to stale data!
+ *                              receiver->msg = message; (reordered)
+ *
+ * Solution: use _release and _acquire barriers.
+ *
+ * 3) There is intentionally no barrier when setting current->state
+ * to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the
+ * release memory barrier, and the wakeup is triggered when holding
+ * info->lock, i.e. spin_lock(&info->lock) provided a pairing
+ * acquire memory barrier.
+ */
+
struct ext_wait_queue { /* queue of sleeping tasks */
struct task_struct *task;
struct list_head list;
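
The MQ_BARRIER rules above are easier to follow with the sleeper and waker sides next to each other. The sketch below is illustrative only and not part of the patch: the names wait_slot, do_wake and do_sleep are invented, and the real code paths are wq_sleep() and __pipelined_op() in the hunks that follow.

/* Illustrative sketch of the MQ_BARRIER pairing -- not part of the patch. */
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/wake_q.h>
#include <linux/spinlock.h>

enum wait_state { WSTATE_NONE, WSTATE_READY };

struct wait_slot {                      /* stands in for ext_wait_queue */
    struct task_struct *task;
    enum wait_state state;
};

/* Waker side, called with the queue lock held (cf. __pipelined_op()). */
static void do_wake(struct wake_q_head *wake_q, struct wait_slot *w)
{
    struct task_struct *task = get_task_struct(w->task);

    /* RELEASE: everything written for the sleeper (e.g. the message
     * pointer) is visible before state becomes READY. */
    smp_store_release(&w->state, WSTATE_READY);
    wake_q_add_safe(wake_q, task);      /* consumes the reference above */
}

/* Sleeper side, called with the queue lock held (cf. wq_sleep()). */
static int do_sleep(spinlock_t *lock, struct wait_slot *w)
{
    w->task = current;
    WRITE_ONCE(w->state, WSTATE_NONE);  /* lock held, no barrier needed */
    __set_current_state(TASK_INTERRUPTIBLE);
    spin_unlock(lock);

    schedule();

    if (READ_ONCE(w->state) == WSTATE_READY) {
        /* ACQUIRE: pairs with the smp_store_release() in do_wake() */
        smp_acquire__after_ctrl_dep();
        return 0;                       /* data written by the waker is visible */
    }

    spin_lock(lock);                    /* slow path: recheck under the lock */
    if (READ_ONCE(w->state) == WSTATE_READY) {
        spin_unlock(lock);
        return 0;
    }
    spin_unlock(lock);
    return -EINTR;
}
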
@@ -180,11 +240,10 @@
info->msg_tree_rightmost = rb_prev(node);
rb_erase(node, &info->msg_tree);
- if (info->node_cache) {
+ if (info->node_cache)
kfree(leaf);
- } else {
+ else
info->node_cache = leaf;
- }
}
static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
@@ -647,18 +706,23 @@
wq_add(info, sr, ewp);
for (;;) {
+ /* memory barrier not required, we hold info->lock */
__set_current_state(TASK_INTERRUPTIBLE);
spin_unlock(&info->lock);
time = schedule_hrtimeout_range_clock(timeout, 0,
HRTIMER_MODE_ABS, CLOCK_REALTIME);
- if (ewp->state == STATE_READY) {
+ if (READ_ONCE(ewp->state) == STATE_READY) {
+ /* see MQ_BARRIER for purpose/pairing */
+ smp_acquire__after_ctrl_dep();
retval = 0;
goto out;
}
spin_lock(&info->lock);
- if (ewp->state == STATE_READY) {
+
+ /* we hold info->lock, so no memory barrier required */
+ if (READ_ONCE(ewp->state) == STATE_READY) {
retval = 0;
goto out_unlock;
}
@@ -935,6 +999,20 @@
* The same algorithm is used for senders.
*/
+static inline void __pipelined_op(struct wake_q_head *wake_q,
+ struct mqueue_inode_info *info,
+ struct ext_wait_queue *this)
+{
+ struct task_struct *task;
+
+ list_del(&this->list);
+ task = get_task_struct(this->task);
+
+ /* see MQ_BARRIER for purpose/pairing */
+ smp_store_release(&this->state, STATE_READY);
+ wake_q_add_safe(wake_q, task);
+}
+
/* pipelined_send() - send a message directly to the task waiting in
* sys_mq_timedreceive() (without inserting message into a queue).
*/
@@ -944,17 +1022,7 @@
struct ext_wait_queue *receiver)
{
receiver->msg = message;
- list_del(&receiver->list);
- wake_q_add(wake_q, receiver->task);
- /*
- * Rely on the implicit cmpxchg barrier from wake_q_add such
- * that we can ensure that updating receiver->state is the last
- * write operation: As once set, the receiver can continue,
- * and if we don't have the reference count from the wake_q,
- * yet, at that point we can later have a use-after-free
- * condition and bogus wakeup.
- */
- receiver->state = STATE_READY;
+ __pipelined_op(wake_q, info, receiver);
}
/* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
@@ -972,9 +1040,7 @@
if (msg_insert(sender->msg, info))
return;
- list_del(&sender->list);
- wake_q_add(wake_q, sender->task);
- sender->state = STATE_READY;
+ __pipelined_op(wake_q, info, sender);
}
static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
@@ -1061,7 +1127,9 @@
} else {
wait.task = current;
wait.msg = (void *) msg_ptr;
- wait.state = STATE_NONE;
+
+ /* memory barrier not required, we hold info->lock */
+ WRITE_ONCE(wait.state, STATE_NONE);
ret = wq_sleep(info, SEND, timeout, &wait);
/*
* wq_sleep must be called with info->lock held, and
@@ -1164,7 +1232,9 @@
ret = -EAGAIN;
} else {
wait.task = current;
- wait.state = STATE_NONE;
+
+ /* memory barrier not required, we hold info->lock */
+ WRITE_ONCE(wait.state, STATE_NONE);
ret = wq_sleep(info, RECV, timeout, &wait);
msg_ptr = wait.msg;
}
diff --git a/ipc/msg.c b/ipc/msg.c
index 767587a..6e6c8e0 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -61,6 +61,16 @@
struct list_head q_senders;
} __randomize_layout;
+/*
+ * MSG_BARRIER Locking:
+ *
+ * Similar to the optimization used in ipc/mqueue.c, one syscall return path
+ * does not acquire any locks when it sees that a message exists in
+ * msg_receiver.r_msg. Therefore r_msg is set using smp_store_release()
+ * and accessed using READ_ONCE()+smp_acquire__after_ctrl_dep(). In addition,
+ * wake_q_add_safe() is used. See ipc/mqueue.c for more details
+ */
+
/* one msg_receiver structure for each sleeping receiver */
struct msg_receiver {
struct list_head r_list;
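
As a concrete illustration of the release/acquire pairing that MSG_BARRIER describes, here is a self-contained userspace C11 analogue (not kernel code, and not part of the patch): atomic_store_explicit(..., memory_order_release) plays the role of smp_store_release(), and the relaxed load plus atomic_thread_fence(memory_order_acquire) plays the role of READ_ONCE() + smp_acquire__after_ctrl_dep(). All names in it are invented.

/* Userspace analogue of the MSG_BARRIER pairing -- illustration only. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct receiver {
    long payload;                /* stands in for the message contents */
    _Atomic(long *) r_msg;       /* stands in for msr->r_msg */
};

static struct receiver r;

static void *waker(void *arg)
{
    (void)arg;
    r.payload = 42;              /* plain write of the data */
    /* release store: the payload write above is visible to anyone who
     * observes the non-NULL pointer with acquire semantics */
    atomic_store_explicit(&r.r_msg, &r.payload, memory_order_release);
    return NULL;
}

int main(void)
{
    pthread_t t;
    long *msg;

    pthread_create(&t, NULL, waker, NULL);

    /* relaxed polling read, like READ_ONCE() on the lockless path */
    while (!(msg = atomic_load_explicit(&r.r_msg, memory_order_relaxed)))
        ;
    /* acquire fence after the control dependency -- the userspace
     * counterpart of smp_acquire__after_ctrl_dep() */
    atomic_thread_fence(memory_order_acquire);

    printf("%ld\n", *msg);       /* guaranteed to print 42 */
    pthread_join(t, NULL);
    return 0;
}
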
@@ -184,6 +194,10 @@
{
mss->tsk = current;
mss->msgsz = msgsz;
+ /*
+ * No memory barrier required: we did ipc_lock_object(),
+ * and the waker obtains that lock before calling wake_q_add().
+ */
__set_current_state(TASK_INTERRUPTIBLE);
list_add_tail(&mss->list, &msq->q_senders);
}
@@ -237,8 +251,13 @@
struct msg_receiver *msr, *t;
list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {
- wake_q_add(wake_q, msr->r_tsk);
- WRITE_ONCE(msr->r_msg, ERR_PTR(res));
+ struct task_struct *r_tsk;
+
+ r_tsk = get_task_struct(msr->r_tsk);
+
+ /* see MSG_BARRIER for purpose/pairing */
+ smp_store_release(&msr->r_msg, ERR_PTR(res));
+ wake_q_add_safe(wake_q, r_tsk);
}
}
@@ -251,6 +270,8 @@
* before freeque() is called. msg_ids.rwsem remains locked on exit.
*/
static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
+ __releases(RCU)
+ __releases(&msq->q_perm)
{
struct msg_msg *msg, *t;
struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
@@ -799,13 +820,17 @@
list_del(&msr->r_list);
if (msr->r_maxsize < msg->m_ts) {
wake_q_add(wake_q, msr->r_tsk);
- WRITE_ONCE(msr->r_msg, ERR_PTR(-E2BIG));
+
+ /* See expunge_all regarding memory barrier */
+ smp_store_release(&msr->r_msg, ERR_PTR(-E2BIG));
} else {
ipc_update_pid(&msq->q_lrpid, task_pid(msr->r_tsk));
msq->q_rtime = ktime_get_real_seconds();
wake_q_add(wake_q, msr->r_tsk);
- WRITE_ONCE(msr->r_msg, msg);
+
+ /* See expunge_all regarding memory barrier */
+ smp_store_release(&msr->r_msg, msg);
return 1;
}
}
@@ -1155,7 +1180,11 @@
msr_d.r_maxsize = INT_MAX;
else
msr_d.r_maxsize = bufsz;
- msr_d.r_msg = ERR_PTR(-EAGAIN);
+
+ /* memory barrier not required due to ipc_lock_object() */
+ WRITE_ONCE(msr_d.r_msg, ERR_PTR(-EAGAIN));
+
+ /* memory barrier not required, we own ipc_lock_object() */
__set_current_state(TASK_INTERRUPTIBLE);
ipc_unlock_object(&msq->q_perm);
@@ -1184,8 +1213,12 @@
* signal) it will either see the message and continue ...
*/
msg = READ_ONCE(msr_d.r_msg);
- if (msg != ERR_PTR(-EAGAIN))
+ if (msg != ERR_PTR(-EAGAIN)) {
+ /* see MSG_BARRIER for purpose/pairing */
+ smp_acquire__after_ctrl_dep();
+
goto out_unlock1;
+ }
/*
* ... or see -EAGAIN, acquire the lock to check the message
@@ -1193,7 +1226,7 @@
*/
ipc_lock_object(&msq->q_perm);
- msg = msr_d.r_msg;
+ msg = READ_ONCE(msr_d.r_msg);
if (msg != ERR_PTR(-EAGAIN))
goto out_unlock0;
diff --git a/ipc/namespace.c b/ipc/namespace.c
index b3ca147..24e7b45 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -117,6 +117,10 @@
static void free_ipc_ns(struct ipc_namespace *ns)
{
+ /* mq_put_mnt() waits for a grace period as kern_unmount()
+ * uses synchronize_rcu().
+ */
+ mq_put_mnt(ns);
sem_exit_ns(ns);
msg_exit_ns(ns);
shm_exit_ns(ns);
@@ -127,6 +131,21 @@
kfree(ns);
}
+static LLIST_HEAD(free_ipc_list);
+static void free_ipc(struct work_struct *unused)
+{
+ struct llist_node *node = llist_del_all(&free_ipc_list);
+ struct ipc_namespace *n, *t;
+
+ llist_for_each_entry_safe(n, t, node, mnt_llist)
+ free_ipc_ns(n);
+}
+
+/*
+ * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount.
+ */
+static DECLARE_WORK(free_ipc_work, free_ipc);
+
/*
* put_ipc_ns - drop a reference to an ipc namespace.
* @ns: the namespace to put
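
The hunk above moves the actual freeing into a workqueue: llist_add() returns true only for the caller that turns the empty list non-empty, so the work item is scheduled once per batch, and the synchronize_rcu() hidden inside kern_unmount() now runs from the worker rather than from the task dropping the last reference (see the put_ipc_ns() hunk below). A generic sketch of the same deferral pattern follows, with invented names (deferred_obj, defer_free, drain_deferred).

/* Generic deferred-free sketch -- illustration only, not part of the patch. */
#include <linux/llist.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct deferred_obj {
    struct llist_node node;
    /* ... payload ... */
};

static LLIST_HEAD(deferred_list);

static void drain_deferred(struct work_struct *unused)
{
    struct llist_node *first = llist_del_all(&deferred_list);
    struct deferred_obj *obj, *tmp;

    /* the _safe variant allows freeing each entry while iterating */
    llist_for_each_entry_safe(obj, tmp, first, node)
        kfree(obj);              /* expensive teardown goes here */
}

static DECLARE_WORK(deferred_work, drain_deferred);

static void defer_free(struct deferred_obj *obj)
{
    /* llist_add() returns true only if the list was empty before,
     * so exactly one caller per batch schedules the work item */
    if (llist_add(&obj->node, &deferred_list))
        schedule_work(&deferred_work);
}
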
@@ -148,8 +167,9 @@
if (refcount_dec_and_lock(&ns->count, &mq_lock)) {
mq_clear_sbinfo(ns);
spin_unlock(&mq_lock);
- mq_put_mnt(ns);
- free_ipc_ns(ns);
+
+ if (llist_add(&ns->mnt_llist, &free_ipc_list))
+ schedule_work(&free_ipc_work);
}
}
@@ -177,15 +197,14 @@
return put_ipc_ns(to_ipc_ns(ns));
}
-static int ipcns_install(struct nsproxy *nsproxy, struct ns_common *new)
+static int ipcns_install(struct nsset *nsset, struct ns_common *new)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct ipc_namespace *ns = to_ipc_ns(new);
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
- /* Ditch state from the old ipc namespace */
- exit_sem(current);
put_ipc_ns(nsproxy->ipc_ns);
nsproxy->ipc_ns = get_ipc_ns(ns);
return 0;
diff --git a/ipc/sem.c b/ipc/sem.c
index fe12ea8..7d9c06b 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -205,15 +205,38 @@
*
* Memory ordering:
* Most ordering is enforced by using spin_lock() and spin_unlock().
- * The special case is use_global_lock:
+ *
+ * Exceptions:
+ * 1) use_global_lock: (SEM_BARRIER_1)
* Setting it from non-zero to 0 is a RELEASE, this is ensured by
- * using smp_store_release().
+ * using smp_store_release(): Immediately after setting it to 0,
+ * a simple op can start.
* Testing if it is non-zero is an ACQUIRE, this is ensured by using
* smp_load_acquire().
* Setting it from 0 to non-zero must be ordered with regards to
* this smp_load_acquire(), this is guaranteed because the smp_load_acquire()
* is inside a spin_lock() and after a write from 0 to non-zero a
* spin_lock()+spin_unlock() is done.
+ *
+ * 2) queue.status: (SEM_BARRIER_2)
+ * Initialization is done while holding sem_lock(), so no further barrier is
+ * required.
+ * Setting it to a result code is a RELEASE, this is ensured by both a
+ * smp_store_release() (for case a) and while holding sem_lock()
+ * (for case b).
+ * The ACQUIRE when reading the result code without holding sem_lock() is
+ * achieved by using READ_ONCE() + smp_acquire__after_ctrl_dep().
+ * (case a above).
+ * Reading the result code while holding sem_lock() needs no further barriers,
+ * the locks inside sem_lock() enforce ordering (case b above)
+ *
+ * 3) current->state:
+ * current->state is set to TASK_INTERRUPTIBLE while holding sem_lock().
+ * The wakeup is handled using the wake_q infrastructure. wake_q wakeups may
+ * happen immediately after calling wake_q_add. As wake_q_add_safe() is called
+ * when holding sem_lock(), no further barriers are required.
+ *
+ * See also ipc/mqueue.c for more details on the covered races.
*/
#define sc_semmsl sem_ctls[0]
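
The use_global_lock handshake documented above (SEM_BARRIER_1) and touched by the next two hunks boils down to the pattern sketched here. This is a condensed illustration with invented names (demo_sma, leave_global_mode, simple_op_fastpath); the real code lives in sem_lock() and its complex-mode helper in this file.

/* Condensed SEM_BARRIER_1 pattern -- illustration only, invented names. */
#include <linux/spinlock.h>

struct demo_sma {
    int use_global_lock;          /* 0: per-semaphore locks suffice */
    spinlock_t obj_lock;          /* one per semaphore in the real code */
};

/* Complex-op side: leaving global-lock mode. Every write done while the
 * global lock was required must be visible before a simple op starts. */
static void leave_global_mode(struct demo_sma *s)
{
    /* RELEASE: pairs with the smp_load_acquire() in the fast path */
    smp_store_release(&s->use_global_lock, 0);
}

/* Simple-op side: the sem_lock() fast path. */
static bool simple_op_fastpath(struct demo_sma *s)
{
    spin_lock(&s->obj_lock);

    /* ACQUIRE: observing 0 here guarantees we also observe every
     * write made before global-lock mode was left */
    if (!smp_load_acquire(&s->use_global_lock))
        return true;              /* fast path: keep obj_lock held */

    spin_unlock(&s->obj_lock);    /* slow path: fall back to the global lock */
    return false;
}
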
@@ -344,12 +367,8 @@
return;
}
if (sma->use_global_lock == 1) {
- /*
- * Immediately after setting use_global_lock to 0,
- * a simple op can start. Thus: all memory writes
- * performed by the current operation must be visible
- * before we set use_global_lock to 0.
- */
+
+ /* See SEM_BARRIER_1 for purpose/pairing */
smp_store_release(&sma->use_global_lock, 0);
} else {
sma->use_global_lock--;
@@ -400,7 +419,7 @@
*/
spin_lock(&sem->lock);
- /* pairs with smp_store_release() */
+ /* see SEM_BARRIER_1 for purpose/pairing */
if (!smp_load_acquire(&sma->use_global_lock)) {
/* fast path successful! */
return sops->sem_num;
@@ -566,8 +585,7 @@
/*
* Called with sem_ids.rwsem and ipcp locked.
*/
-static inline int sem_more_checks(struct kern_ipc_perm *ipcp,
- struct ipc_params *params)
+static int sem_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params)
{
struct sem_array *sma;
@@ -766,15 +784,14 @@
static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error,
struct wake_q_head *wake_q)
{
- wake_q_add(wake_q, q->sleeper);
- /*
- * Rely on the above implicit barrier, such that we can
- * ensure that we hold reference to the task before setting
- * q->status. Otherwise we could race with do_exit if the
- * task is awoken by an external event before calling
- * wake_up_process().
- */
- WRITE_ONCE(q->status, error);
+ struct task_struct *sleeper;
+
+ sleeper = get_task_struct(q->sleeper);
+
+ /* see SEM_BARRIER_2 for purpose/pairing */
+ smp_store_release(&q->status, error);
+
+ wake_q_add_safe(wake_q, sleeper);
}
static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
@@ -1676,7 +1693,7 @@
case IPC_SET:
if (copy_semid_from_user(&semid64, p, version))
return -EFAULT;
- /* fall through */
+ fallthrough;
case IPC_RMID:
return semctl_down(ns, semid, cmd, &semid64);
default:
@@ -1790,7 +1807,7 @@
case IPC_SET:
if (copy_compat_semid_from_user(&semid64, p, version))
return -EFAULT;
- /* fallthru */
+ fallthrough;
case IPC_RMID:
return semctl_down(ns, semid, cmd, &semid64);
default:
@@ -2148,9 +2165,11 @@
}
do {
+ /* memory ordering ensured by the lock in sem_lock() */
WRITE_ONCE(queue.status, -EINTR);
queue.sleeper = current;
+ /* memory ordering is ensured by the lock in sem_lock() */
__set_current_state(TASK_INTERRUPTIBLE);
sem_unlock(sma, locknum);
rcu_read_unlock();
@@ -2173,13 +2192,8 @@
*/
error = READ_ONCE(queue.status);
if (error != -EINTR) {
- /*
- * User space could assume that semop() is a memory
- * barrier: Without the mb(), the cpu could
- * speculatively read in userspace stale data that was
- * overwritten by the previous owner of the semaphore.
- */
- smp_mb();
+ /* see SEM_BARRIER_2 for purpose/pairing */
+ smp_acquire__after_ctrl_dep();
goto out_free;
}
@@ -2189,6 +2203,9 @@
if (!ipc_valid_object(&sma->sem_perm))
goto out_unlock_free;
+ /*
+ * No necessity for any barrier: We are protected by sem_lock()
+ */
error = READ_ONCE(queue.status);
/*
diff --git a/ipc/shm.c b/ipc/shm.c
index ce1ca9f..471ac3e 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -62,9 +62,18 @@
struct pid *shm_lprid;
struct user_struct *mlock_user;
- /* The task created the shm object. NULL if the task is dead. */
+ /*
+ * The task that created the shm object; used for
+ * task_lock(shp->shm_creator)
+ */
struct task_struct *shm_creator;
- struct list_head shm_clist; /* list by creator */
+
+ /*
+ * List by creator. task_lock(->shm_creator) required for read/write.
+ * If list_empty(), then the creator is dead already.
+ */
+ struct list_head shm_clist;
+ struct ipc_namespace *ns;
} __randomize_layout;
/* shm_mode upper byte flags */
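
The lifetime rule introduced for shm_creator and shm_clist is spread over several hunks below, so it is condensed here into a sketch. Both helper names (detach_from_creator, creator_is_gone) are invented for this note; the real implementations are shm_clist_rm() and the list_empty() checks in shm_try_destroy_orphaned() and the reworked exit_shm().

/* Condensed shm_clist protocol -- illustration only, invented helper names. */

/* IPC_RMID / shm_destroy() side: detach shp from its creator's list.
 * shp->shm_creator may only be dereferenced while the entry is still on
 * the list; task_lock() serializes against the creator exiting. */
static void detach_from_creator(struct shmid_kernel *shp)
{
    rcu_read_lock();                        /* keeps the task_struct alive */
    if (!list_empty(&shp->shm_clist)) {
        struct task_struct *creator = shp->shm_creator;

        task_lock(creator);
        list_del_init(&shp->shm_clist);     /* idempotent vs. exit_shm() */
        task_unlock(creator);
    }
    rcu_read_unlock();
}

/* Reader side: an empty list entry is the only "creator is dead" marker;
 * shp->shm_creator must not be touched once this returns true. */
static bool creator_is_gone(struct shmid_kernel *shp)
{
    return list_empty(&shp->shm_clist);
}
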
@@ -115,6 +124,7 @@
struct shmid_kernel *shp;
shp = container_of(ipcp, struct shmid_kernel, shm_perm);
+ WARN_ON(ns != shp->ns);
if (shp->shm_nattch) {
shp->shm_perm.mode |= SHM_DEST;
@@ -225,10 +235,43 @@
kvfree(shp);
}
-static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
+/*
+ * It has to be called with shp locked.
+ * It must be called before ipc_rmid()
+ */
+static inline void shm_clist_rm(struct shmid_kernel *shp)
{
- list_del(&s->shm_clist);
- ipc_rmid(&shm_ids(ns), &s->shm_perm);
+ struct task_struct *creator;
+
+ /* ensure that shm_creator does not disappear */
+ rcu_read_lock();
+
+ /*
+ * A concurrent exit_shm may do a list_del_init() as well.
+ * Just do nothing if exit_shm already did the work
+ */
+ if (!list_empty(&shp->shm_clist)) {
+ /*
+ * shp->shm_creator is guaranteed to be valid *only*
+ * if shp->shm_clist is not empty.
+ */
+ creator = shp->shm_creator;
+
+ task_lock(creator);
+ /*
+ * list_del_init() is a nop if the entry was already removed
+ * from the list.
+ */
+ list_del_init(&shp->shm_clist);
+ task_unlock(creator);
+ }
+ rcu_read_unlock();
+}
+
+static inline void shm_rmid(struct shmid_kernel *s)
+{
+ shm_clist_rm(s);
+ ipc_rmid(&shm_ids(s->ns), &s->shm_perm);
}
@@ -283,7 +326,7 @@
shm_file = shp->shm_file;
shp->shm_file = NULL;
ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
- shm_rmid(ns, shp);
+ shm_rmid(shp);
shm_unlock(shp);
if (!is_file_hugepages(shm_file))
shmem_lock(shm_file, 0, shp->mlock_user);
@@ -306,10 +349,10 @@
*
* 2) sysctl kernel.shm_rmid_forced is set to 1.
*/
-static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
+static bool shm_may_destroy(struct shmid_kernel *shp)
{
return (shp->shm_nattch == 0) &&
- (ns->shm_rmid_forced ||
+ (shp->ns->shm_rmid_forced ||
(shp->shm_perm.mode & SHM_DEST));
}
@@ -340,7 +383,7 @@
ipc_update_pid(&shp->shm_lprid, task_tgid(current));
shp->shm_dtim = ktime_get_real_seconds();
shp->shm_nattch--;
- if (shm_may_destroy(ns, shp))
+ if (shm_may_destroy(shp))
shm_destroy(ns, shp);
else
shm_unlock(shp);
@@ -361,10 +404,10 @@
*
* As shp->* are changed under rwsem, it's safe to skip shp locking.
*/
- if (shp->shm_creator != NULL)
+ if (!list_empty(&shp->shm_clist))
return 0;
- if (shm_may_destroy(ns, shp)) {
+ if (shm_may_destroy(shp)) {
shm_lock_by_ptr(shp);
shm_destroy(ns, shp);
}
@@ -382,48 +425,97 @@
/* Locking assumes this will only be called with task == current */
void exit_shm(struct task_struct *task)
{
- struct ipc_namespace *ns = task->nsproxy->ipc_ns;
- struct shmid_kernel *shp, *n;
+ for (;;) {
+ struct shmid_kernel *shp;
+ struct ipc_namespace *ns;
- if (list_empty(&task->sysvshm.shm_clist))
- return;
+ task_lock(task);
- /*
- * If kernel.shm_rmid_forced is not set then only keep track of
- * which shmids are orphaned, so that a later set of the sysctl
- * can clean them up.
- */
- if (!ns->shm_rmid_forced) {
- down_read(&shm_ids(ns).rwsem);
- list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
- shp->shm_creator = NULL;
- /*
- * Only under read lock but we are only called on current
- * so no entry on the list will be shared.
- */
- list_del(&task->sysvshm.shm_clist);
- up_read(&shm_ids(ns).rwsem);
- return;
- }
-
- /*
- * Destroy all already created segments, that were not yet mapped,
- * and mark any mapped as orphan to cover the sysctl toggling.
- * Destroy is skipped if shm_may_destroy() returns false.
- */
- down_write(&shm_ids(ns).rwsem);
- list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
- shp->shm_creator = NULL;
-
- if (shm_may_destroy(ns, shp)) {
- shm_lock_by_ptr(shp);
- shm_destroy(ns, shp);
+ if (list_empty(&task->sysvshm.shm_clist)) {
+ task_unlock(task);
+ break;
}
- }
- /* Remove the list head from any segments still attached. */
- list_del(&task->sysvshm.shm_clist);
- up_write(&shm_ids(ns).rwsem);
+ shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel,
+ shm_clist);
+
+ /*
+ * 1) Get a pointer to the ipc namespace. It is worth noting
+ * that this pointer is guaranteed to be valid because the
+ * lifetime of shp is always shorter than that of the
+ * namespace in which shp lives.
+ * Since we hold task_lock(), shp won't be freed.
+ */
+ ns = shp->ns;
+
+ /*
+ * 2) If kernel.shm_rmid_forced is not set then only keep track of
+ * which shmids are orphaned, so that a later set of the sysctl
+ * can clean them up.
+ */
+ if (!ns->shm_rmid_forced)
+ goto unlink_continue;
+
+ /*
+ * 3) get a reference to the namespace.
+ * The refcount could already be 0; if it is, then
+ * the shm objects will be freed by free_ipc_work().
+ */
+ ns = get_ipc_ns_not_zero(ns);
+ if (!ns) {
+unlink_continue:
+ list_del_init(&shp->shm_clist);
+ task_unlock(task);
+ continue;
+ }
+
+ /*
+ * 4) get a reference to shp.
+ * This cannot fail: shm_clist_rm() is called before
+ * ipc_rmid(), thus the refcount cannot be 0.
+ */
+ WARN_ON(!ipc_rcu_getref(&shp->shm_perm));
+
+ /*
+ * 5) unlink the shm segment from the list of segments
+ * created by current.
+ * This must be done last. After unlinking,
+ * only the refcounts obtained above prevent IPC_RMID
+ * from destroying the segment or the namespace.
+ */
+ list_del_init(&shp->shm_clist);
+
+ task_unlock(task);
+
+ /*
+ * 6) we have all references
+ * Thus lock & if needed destroy shp.
+ */
+ down_write(&shm_ids(ns).rwsem);
+ shm_lock_by_ptr(shp);
+ /*
+ * rcu_read_lock was implicitly taken in shm_lock_by_ptr, it's
+ * safe to call ipc_rcu_putref here
+ */
+ ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
+
+ if (ipc_valid_object(&shp->shm_perm)) {
+ if (shm_may_destroy(shp))
+ shm_destroy(ns, shp);
+ else
+ shm_unlock(shp);
+ } else {
+ /*
+ * Someone else deleted the shp from namespace
+ * idr/kht while we have waited.
+ * Just unlock and continue.
+ */
+ shm_unlock(shp);
+ }
+
+ up_write(&shm_ids(ns).rwsem);
+ put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */
+ }
}
static vm_fault_t shm_fault(struct vm_fault *vmf)
@@ -680,7 +772,11 @@
if (error < 0)
goto no_id;
+ shp->ns = ns;
+
+ task_lock(current);
list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist);
+ task_unlock(current);
/*
* shmid gets reported as "inode#" in /proc/pid/maps.
@@ -711,8 +807,7 @@
/*
* Called with shm_ids.rwsem and ipcp locked.
*/
-static inline int shm_more_checks(struct kern_ipc_perm *ipcp,
- struct ipc_params *params)
+static int shm_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params)
{
struct shmid_kernel *shp;
@@ -1180,7 +1275,7 @@
case IPC_SET:
if (copy_shmid_from_user(&sem64, buf, version))
return -EFAULT;
- /* fallthru */
+ fallthrough;
case IPC_RMID:
return shmctl_down(ns, shmid, cmd, &sem64);
case SHM_LOCK:
@@ -1332,7 +1427,7 @@
}
}
-long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr, int version)
+static long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr, int version)
{
struct ipc_namespace *ns;
struct shmid64_ds sem64;
@@ -1375,13 +1470,12 @@
case IPC_SET:
if (copy_compat_shmid_from_user(&sem64, uptr, version))
return -EFAULT;
- /* fallthru */
+ fallthrough;
case IPC_RMID:
return shmctl_down(ns, shmid, cmd, &sem64);
case SHM_LOCK:
case SHM_UNLOCK:
return shmctl_do_lock(ns, shmid, cmd);
- break;
default:
return -EINVAL;
}
@@ -1544,7 +1638,7 @@
if (err)
goto out_fput;
- if (down_write_killable(¤t->mm->mmap_sem)) {
+ if (mmap_write_lock_killable(current->mm)) {
err = -EINTR;
goto out_fput;
}
@@ -1558,13 +1652,13 @@
goto invalid;
}
- addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate, NULL);
+ addr = do_mmap(file, addr, size, prot, flags, 0, &populate, NULL);
*raddr = addr;
err = 0;
if (IS_ERR_VALUE(addr))
err = (long)addr;
invalid:
- up_write(¤t->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
if (populate)
mm_populate(addr, populate);
@@ -1575,7 +1669,8 @@
down_write(&shm_ids(ns).rwsem);
shp = shm_lock(ns, shmid);
shp->shm_nattch--;
- if (shm_may_destroy(ns, shp))
+
+ if (shm_may_destroy(shp))
shm_destroy(ns, shp);
else
shm_unlock(shp);
@@ -1638,7 +1733,7 @@
if (addr & ~PAGE_MASK)
return retval;
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
/*
@@ -1726,7 +1821,7 @@
#endif
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return retval;
}
diff --git a/ipc/syscall.c b/ipc/syscall.c
index 581bdff..dfb0e98 100644
--- a/ipc/syscall.c
+++ b/ipc/syscall.c
@@ -30,7 +30,7 @@
return ksys_semtimedop(first, (struct sembuf __user *)ptr,
second, NULL);
case SEMTIMEDOP:
- if (IS_ENABLED(CONFIG_64BIT) || !IS_ENABLED(CONFIG_64BIT_TIME))
+ if (IS_ENABLED(CONFIG_64BIT))
return ksys_semtimedop(first, ptr, second,
(const struct __kernel_timespec __user *)fifth);
else if (IS_ENABLED(CONFIG_COMPAT_32BIT_TIME))
diff --git a/ipc/util.c b/ipc/util.c
index 1821b63..bbb5190 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -100,7 +100,7 @@
static const struct rhashtable_params ipc_kht_params = {
.head_offset = offsetof(struct kern_ipc_perm, khtnode),
.key_offset = offsetof(struct kern_ipc_perm, key),
- .key_len = FIELD_SIZEOF(struct kern_ipc_perm, key),
+ .key_len = sizeof_field(struct kern_ipc_perm, key),
.automatic_shrinking = true,
};
@@ -126,7 +126,7 @@
}
#ifdef CONFIG_PROC_FS
-static const struct file_operations sysvipc_proc_fops;
+static const struct proc_ops sysvipc_proc_ops;
/**
* ipc_init_proc_interface - create a proc interface for sysipc types using a seq_file interface.
* @path: Path in procfs
@@ -151,7 +151,7 @@
pde = proc_create_data(path,
S_IRUGO, /* world readable */
NULL, /* parent dir */
- &sysvipc_proc_fops,
+ &sysvipc_proc_ops,
iface);
if (!pde)
kfree(iface);
@@ -446,8 +446,8 @@
static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
{
if (ipcp->key != IPC_PRIVATE)
- rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode,
- ipc_kht_params);
+ WARN_ON_ONCE(rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode,
+ ipc_kht_params));
}
/**
@@ -462,7 +462,7 @@
{
int idx = ipcid_to_idx(ipcp->id);
- idr_remove(&ids->ipcs_idr, idx);
+ WARN_ON_ONCE(idr_remove(&ids->ipcs_idr, idx) != ipcp);
ipc_kht_remove(ids, ipcp);
ids->in_use--;
ipcp->deleted = true;
@@ -884,10 +884,11 @@
return seq_release_private(inode, file);
}
-static const struct file_operations sysvipc_proc_fops = {
- .open = sysvipc_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = sysvipc_proc_release,
+static const struct proc_ops sysvipc_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_open = sysvipc_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = sysvipc_proc_release,
};
#endif /* CONFIG_PROC_FS */