Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 42bbe68..c4159bc 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,14 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* fs/eventpoll.c (Efficient event retrieval implementation)
* Copyright (C) 2001,...,2009 Davide Libenzi
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
* Davide Libenzi <davidel@xmailserver.org>
- *
*/
#include <linux/init.h>
@@ -50,10 +45,10 @@
*
* 1) epmutex (mutex)
* 2) ep->mtx (mutex)
- * 3) ep->wq.lock (spinlock)
+ * 3) ep->lock (rwlock)
*
* The acquire order is the one listed above, from 1 to 3.
- * We need a spinlock (ep->wq.lock) because we manipulate objects
+ * We need a rwlock (ep->lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
@@ -85,7 +80,7 @@
* of epoll file descriptors, we use the current recursion depth as
* the lockdep subkey.
* It is possible to drop the "ep->mtx" and to use the global
- * mutex "epmutex" (together with "ep->wq.lock") to have it working,
+ * mutex "epmutex" (together with "ep->lock") to have it working,
* but having "ep->mtx" will make the interface more scalable.
* Events that require holding "epmutex" are very rare, while for
* normal operations the epoll private "ep->mtx" will guarantee
@@ -182,8 +177,6 @@
* This structure is stored inside the "private_data" member of the file
* structure and represents the main data structure for the eventpoll
* interface.
- *
- * Access to it is protected by the lock inside wq.
*/
struct eventpoll {
/*
@@ -203,13 +196,16 @@
/* List of ready file descriptors */
struct list_head rdllist;
+ /* Lock which protects rdllist and ovflist */
+ rwlock_t lock;
+
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
/*
* This is a single linked list that chains all the "struct epitem" that
* happened while transferring ready events to userspace w/out
- * holding ->wq.lock.
+ * holding ->lock.
*/
struct epitem *ovflist;
@@ -295,7 +291,7 @@
#include <linux/sysctl.h>
-static long zero;
+static long long_zero;
static long long_max = LONG_MAX;
struct ctl_table epoll_table[] = {
@@ -305,7 +301,7 @@
.maxlen = sizeof(max_user_watches),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra1 = &zero,
+ .extra1 = &long_zero,
.extra2 = &long_max,
},
{ }
@@ -381,7 +377,8 @@
*/
static inline int ep_events_available(struct eventpoll *ep)
{
- return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
+ return !list_empty_careful(&ep->rdllist) ||
+ READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
}
#ifdef CONFIG_NET_RX_BUSY_POLL
@@ -471,7 +468,6 @@
* no re-entered.
*
* @ncalls: Pointer to the nested_calls structure to be used for this call.
- * @max_nests: Maximum number of allowed nesting calls.
* @nproc: Nested call core function pointer.
* @priv: Opaque data to be passed to the @nproc callback.
* @cookie: Cookie to be used to identify this nested call.
@@ -480,7 +476,7 @@
* Returns: Returns the code returned by the @nproc callback, or -1 if
* the maximum recursion limit has been exceeded.
*/
-static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
+static int ep_call_nested(struct nested_calls *ncalls,
int (*nproc)(void *, void *, int), void *priv,
void *cookie, void *ctx)
{
@@ -499,7 +495,7 @@
*/
list_for_each_entry(tncur, lsthead, llink) {
if (tncur->ctx == ctx &&
- (tncur->cookie == cookie || ++call_nests > max_nests)) {
+ (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) {
/*
* Ops ... loop detected or maximum nest level reached.
* We abort this wake by breaking the cycle itself.
@@ -573,7 +569,7 @@
{
int this_cpu = get_cpu();
- ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
+ ep_call_nested(&poll_safewake_ncalls,
ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
put_cpu();
@@ -697,23 +693,23 @@
* because we want the "sproc" callback to be able to do it
* in a lockless way.
*/
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
list_splice_init(&ep->rdllist, &txlist);
- ep->ovflist = NULL;
- spin_unlock_irq(&ep->wq.lock);
+ WRITE_ONCE(ep->ovflist, NULL);
+ write_unlock_irq(&ep->lock);
/*
* Now call the callback function.
*/
res = (*sproc)(ep, &txlist, priv);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
* We re-insert them inside the main ready-list here.
*/
- for (nepi = ep->ovflist; (epi = nepi) != NULL;
+ for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
/*
* We need to check if the item is already in the list.
@@ -722,7 +718,11 @@
* contain them, and the list_splice() below takes care of them.
*/
if (!ep_is_linked(epi)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
+ /*
+ * ->ovflist is LIFO, so we have to reverse it in order
+ * to keep in FIFO.
+ */
+ list_add(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
}
@@ -731,7 +731,7 @@
* releasing the lock, events will be queued in the normal way inside
* ep->rdllist.
*/
- ep->ovflist = EP_UNACTIVE_PTR;
+ WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
/*
* Quickly re-inject items left on "txlist".
@@ -745,11 +745,11 @@
* the ->poll() wait list (delayed after we release the lock).
*/
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
if (!ep_locked)
mutex_unlock(&ep->mtx);
@@ -789,10 +789,10 @@
rb_erase_cached(&epi->rbn, &ep->rbr);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
@@ -842,7 +842,7 @@
* Walks through the whole tree by freeing each "struct epitem". At this
* point we are sure no poll callbacks will be lingering around, and also by
* holding "epmutex" we can be sure that no file cleanup code will hit
- * us during this operation. So we can avoid the lock on "ep->wq.lock".
+ * us during this operation. So we can avoid the lock on "ep->lock".
* We do not need to lock ep->mtx, either, we only do it to prevent
* a lockdep warning.
*/
@@ -1023,6 +1023,7 @@
goto free_uid;
mutex_init(&ep->mtx);
+ rwlock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
@@ -1112,21 +1113,107 @@
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
+/**
+ * Adds a new entry to the tail of the list in a lockless way, i.e.
+ * multiple CPUs are allowed to call this function concurrently.
+ *
+ * Beware: it is necessary to prevent any other modifications of the
+ * existing list until all changes are completed, in other words
+ * concurrent list_add_tail_lockless() calls should be protected
+ * with a read lock, where write lock acts as a barrier which
+ * makes sure all list_add_tail_lockless() calls are fully
+ * completed.
+ *
+ * Also an element can be locklessly added to the list only in one
+ * direction i.e. either to the tail either to the head, otherwise
+ * concurrent access will corrupt the list.
+ *
+ * Returns %false if element has been already added to the list, %true
+ * otherwise.
+ */
+static inline bool list_add_tail_lockless(struct list_head *new,
+ struct list_head *head)
+{
+ struct list_head *prev;
+
+ /*
+ * This is simple 'new->next = head' operation, but cmpxchg()
+ * is used in order to detect that same element has been just
+ * added to the list from another CPU: the winner observes
+ * new->next == new.
+ */
+ if (cmpxchg(&new->next, new, head) != new)
+ return false;
+
+ /*
+ * Initially ->next of a new element must be updated with the head
+ * (we are inserting to the tail) and only then pointers are atomically
+ * exchanged. XCHG guarantees memory ordering, thus ->next should be
+ * updated before pointers are actually swapped and pointers are
+ * swapped before prev->next is updated.
+ */
+
+ prev = xchg(&head->prev, new);
+
+ /*
+ * It is safe to modify prev->next and new->prev, because a new element
+ * is added only to the tail and new->next is updated before XCHG.
+ */
+
+ prev->next = new;
+ new->prev = prev;
+
+ return true;
+}
+
+/**
+ * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
+ * i.e. multiple CPUs are allowed to call this function concurrently.
+ *
+ * Returns %false if epi element has been already chained, %true otherwise.
+ */
+static inline bool chain_epi_lockless(struct epitem *epi)
+{
+ struct eventpoll *ep = epi->ep;
+
+ /* Check that the same epi has not been just chained from another CPU */
+ if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
+ return false;
+
+ /* Atomically exchange tail */
+ epi->next = xchg(&ep->ovflist, epi);
+
+ return true;
+}
+
/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
+ *
+ * This callback takes a read lock in order not to content with concurrent
+ * events from another file descriptors, thus all modifications to ->rdllist
+ * or ->ovflist are lockless. Read lock is paired with the write lock from
+ * ep_scan_ready_list(), which stops all list modifications and guarantees
+ * that lists state is seen correctly.
+ *
+ * Another thing worth to mention is that ep_poll_callback() can be called
+ * concurrently for the same @epi from different CPUs if poll table was inited
+ * with several wait queues entries. Plural wakeup from different CPUs of a
+ * single wait queue is serialized by wq.lock, but the case when multiple wait
+ * queues are used should be detected accordingly. This is detected using
+ * cmpxchg() operation.
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
- unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
__poll_t pollflags = key_to_poll(key);
+ unsigned long flags;
int ewake = 0;
- spin_lock_irqsave(&ep->wq.lock, flags);
+ read_lock_irqsave(&ep->lock, flags);
ep_set_busy_poll_napi_id(epi);
@@ -1154,25 +1241,16 @@
* semantics). All the events that happen during that period of time are
* chained in ep->ovflist and requeued later on.
*/
- if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
- if (epi->next == EP_UNACTIVE_PTR) {
- epi->next = ep->ovflist;
- ep->ovflist = epi;
- if (epi->ws) {
- /*
- * Activate ep->ws since epi->ws may get
- * deactivated at any time.
- */
- __pm_stay_awake(ep->ws);
- }
-
- }
+ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
+ if (epi->next == EP_UNACTIVE_PTR &&
+ chain_epi_lockless(epi))
+ ep_pm_stay_awake_rcu(epi);
goto out_unlock;
}
/* If this file is already in the ready list we exit soon */
- if (!ep_is_linked(epi)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
+ if (!ep_is_linked(epi) &&
+ list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
ep_pm_stay_awake_rcu(epi);
}
@@ -1197,13 +1275,13 @@
break;
}
}
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
pwake++;
out_unlock:
- spin_unlock_irqrestore(&ep->wq.lock, flags);
+ read_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
@@ -1333,7 +1411,6 @@
}
} else {
error = ep_call_nested(&poll_loop_ncalls,
- EP_MAX_NESTS,
reverse_path_check_proc,
child_file, child_file,
current);
@@ -1367,7 +1444,7 @@
/* let's call this for all tfiles */
list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
path_count_init();
- error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ error = ep_call_nested(&poll_loop_ncalls,
reverse_path_check_proc, current_file,
current_file, current);
if (error)
@@ -1382,13 +1459,13 @@
struct wakeup_source *ws;
if (!epi->ep->ws) {
- epi->ep->ws = wakeup_source_register("eventpoll");
+ epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
if (!epi->ep->ws)
return -ENOMEM;
}
name = epi->ffd.file->f_path.dentry->d_name.name;
- ws = wakeup_source_register(name);
+ ws = wakeup_source_register(NULL, name);
if (!ws)
return -ENOMEM;
@@ -1489,7 +1566,7 @@
goto error_remove_epi;
/* We have to drop the new item inside our item list to keep track of it */
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
@@ -1501,12 +1578,12 @@
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
atomic_long_inc(&ep->user->epoll_watches);
@@ -1532,10 +1609,10 @@
* list, since that is used/cleaned only inside a section bound by "mtx".
* And ep_insert() is called with "mtx" held.
*/
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
@@ -1579,9 +1656,9 @@
* 1) Flush epi changes above to other CPUs. This ensures
* we do not miss events from ep_poll_callback if an
* event occurs immediately after we call f_op->poll().
- * We need this because we did not take ep->wq.lock while
+ * We need this because we did not take ep->lock while
* changing epi above (but ep_poll_callback does take
- * ep->wq.lock).
+ * ep->lock).
*
* 2) We also need to ensure we do not miss _past_ events
* when calling f_op->poll(). This barrier also
@@ -1600,18 +1677,18 @@
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
}
/* We have to call this outside the lock */
@@ -1626,21 +1703,24 @@
{
struct ep_send_events_data *esed = priv;
__poll_t revents;
- struct epitem *epi;
- struct epoll_event __user *uevent;
+ struct epitem *epi, *tmp;
+ struct epoll_event __user *uevent = esed->events;
struct wakeup_source *ws;
poll_table pt;
init_poll_funcptr(&pt, NULL);
+ esed->res = 0;
/*
* We can loop without lock because we are passed a task private list.
* Items cannot vanish during the loop because ep_scan_ready_list() is
* holding "mtx" during this call.
*/
- for (esed->res = 0, uevent = esed->events;
- !list_empty(head) && esed->res < esed->maxevents;) {
- epi = list_first_entry(head, struct epitem, rdllink);
+ lockdep_assert_held(&ep->mtx);
+
+ list_for_each_entry_safe(epi, tmp, head, rdllink) {
+ if (esed->res >= esed->maxevents)
+ break;
/*
* Activate ep->ws before deactivating epi->ws to prevent
@@ -1660,42 +1740,42 @@
list_del_init(&epi->rdllink);
- revents = ep_item_poll(epi, &pt, 1);
-
/*
* If the event mask intersect the caller-requested one,
* deliver the event to userspace. Again, ep_scan_ready_list()
- * is holding "mtx", so no operations coming from userspace
+ * is holding ep->mtx, so no operations coming from userspace
* can change the item.
*/
- if (revents) {
- if (__put_user(revents, &uevent->events) ||
- __put_user(epi->event.data, &uevent->data)) {
- list_add(&epi->rdllink, head);
- ep_pm_stay_awake(epi);
- if (!esed->res)
- esed->res = -EFAULT;
- return 0;
- }
- esed->res++;
- uevent++;
- if (epi->event.events & EPOLLONESHOT)
- epi->event.events &= EP_PRIVATE_BITS;
- else if (!(epi->event.events & EPOLLET)) {
- /*
- * If this file has been added with Level
- * Trigger mode, we need to insert back inside
- * the ready list, so that the next call to
- * epoll_wait() will check again the events
- * availability. At this point, no one can insert
- * into ep->rdllist besides us. The epoll_ctl()
- * callers are locked out by
- * ep_scan_ready_list() holding "mtx" and the
- * poll callback will queue them in ep->ovflist.
- */
- list_add_tail(&epi->rdllink, &ep->rdllist);
- ep_pm_stay_awake(epi);
- }
+ revents = ep_item_poll(epi, &pt, 1);
+ if (!revents)
+ continue;
+
+ if (__put_user(revents, &uevent->events) ||
+ __put_user(epi->event.data, &uevent->data)) {
+ list_add(&epi->rdllink, head);
+ ep_pm_stay_awake(epi);
+ if (!esed->res)
+ esed->res = -EFAULT;
+ return 0;
+ }
+ esed->res++;
+ uevent++;
+ if (epi->event.events & EPOLLONESHOT)
+ epi->event.events &= EP_PRIVATE_BITS;
+ else if (!(epi->event.events & EPOLLET)) {
+ /*
+ * If this file has been added with Level
+ * Trigger mode, we need to insert back inside
+ * the ready list, so that the next call to
+ * epoll_wait() will check again the events
+ * availability. At this point, no one can insert
+ * into ep->rdllist besides us. The epoll_ctl()
+ * callers are locked out by
+ * ep_scan_ready_list() holding "mtx" and the
+ * poll callback will queue them in ep->ovflist.
+ */
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+ ep_pm_stay_awake(epi);
}
}
@@ -1747,6 +1827,7 @@
{
int res = 0, eavail, timed_out = 0;
u64 slack = 0;
+ bool waiter = false;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
@@ -1761,11 +1842,18 @@
} else if (timeout == 0) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
- * caller specified a non blocking operation.
+ * caller specified a non blocking operation. We still need
+ * lock because we could race and not see an epi being added
+ * to the ready list while in irq callback. Thus incorrectly
+ * returning 0 back to userspace.
*/
timed_out = 1;
- spin_lock_irq(&ep->wq.lock);
- goto check_events;
+
+ write_lock_irq(&ep->lock);
+ eavail = ep_events_available(ep);
+ write_unlock_irq(&ep->lock);
+
+ goto send_events;
}
fetch_events:
@@ -1773,64 +1861,66 @@
if (!ep_events_available(ep))
ep_busy_loop(ep, timed_out);
- spin_lock_irq(&ep->wq.lock);
+ eavail = ep_events_available(ep);
+ if (eavail)
+ goto send_events;
- if (!ep_events_available(ep)) {
- /*
- * Busy poll timed out. Drop NAPI ID for now, we can add
- * it back in when we have moved a socket with a valid NAPI
- * ID onto the ready list.
- */
- ep_reset_busy_poll_napi_id(ep);
+ /*
+ * Busy poll timed out. Drop NAPI ID for now, we can add
+ * it back in when we have moved a socket with a valid NAPI
+ * ID onto the ready list.
+ */
+ ep_reset_busy_poll_napi_id(ep);
- /*
- * We don't have any available event to return to the caller.
- * We need to sleep here, and we will be wake up by
- * ep_poll_callback() when events will become available.
- */
+ /*
+ * We don't have any available event to return to the caller. We need
+ * to sleep here, and we will be woken by ep_poll_callback() when events
+ * become available.
+ */
+ if (!waiter) {
+ waiter = true;
init_waitqueue_entry(&wait, current);
+
+ spin_lock_irq(&ep->wq.lock);
__add_wait_queue_exclusive(&ep->wq, &wait);
+ spin_unlock_irq(&ep->wq.lock);
+ }
- for (;;) {
- /*
- * We don't want to sleep if the ep_poll_callback() sends us
- * a wakeup in between. That's why we set the task state
- * to TASK_INTERRUPTIBLE before doing the checks.
- */
- set_current_state(TASK_INTERRUPTIBLE);
- /*
- * Always short-circuit for fatal signals to allow
- * threads to make a timely exit without the chance of
- * finding more events available and fetching
- * repeatedly.
- */
- if (fatal_signal_pending(current)) {
- res = -EINTR;
- break;
- }
- if (ep_events_available(ep) || timed_out)
- break;
- if (signal_pending(current)) {
- res = -EINTR;
- break;
- }
-
- spin_unlock_irq(&ep->wq.lock);
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
- timed_out = 1;
-
- spin_lock_irq(&ep->wq.lock);
+ for (;;) {
+ /*
+ * We don't want to sleep if the ep_poll_callback() sends us
+ * a wakeup in between. That's why we set the task state
+ * to TASK_INTERRUPTIBLE before doing the checks.
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ /*
+ * Always short-circuit for fatal signals to allow
+ * threads to make a timely exit without the chance of
+ * finding more events available and fetching
+ * repeatedly.
+ */
+ if (fatal_signal_pending(current)) {
+ res = -EINTR;
+ break;
}
- __remove_wait_queue(&ep->wq, &wait);
- __set_current_state(TASK_RUNNING);
+ eavail = ep_events_available(ep);
+ if (eavail)
+ break;
+ if (signal_pending(current)) {
+ res = -EINTR;
+ break;
+ }
+
+ if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
+ timed_out = 1;
+ break;
+ }
}
-check_events:
- /* Is it worth to try to dig for events ? */
- eavail = ep_events_available(ep);
- spin_unlock_irq(&ep->wq.lock);
+ __set_current_state(TASK_RUNNING);
+send_events:
/*
* Try to transfer events to user space. In case we get 0 events and
* there's still timeout left over, we go trying again in search of
@@ -1840,6 +1930,12 @@
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;
+ if (waiter) {
+ spin_lock_irq(&ep->wq.lock);
+ __remove_wait_queue(&ep->wq, &wait);
+ spin_unlock_irq(&ep->wq.lock);
+ }
+
return res;
}
@@ -1876,7 +1972,7 @@
ep_tovisit = epi->ffd.file->private_data;
if (ep_tovisit->visited)
continue;
- error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ error = ep_call_nested(&poll_loop_ncalls,
ep_loop_check_proc, epi->ffd.file,
ep_tovisit, current);
if (error != 0)
@@ -1916,7 +2012,7 @@
int ret;
struct eventpoll *ep_cur, *ep_next;
- ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ ret = ep_call_nested(&poll_loop_ncalls,
ep_loop_check_proc, file, ep, current);
/* clear visited list */
list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
@@ -2172,7 +2268,7 @@
return -EINVAL;
/* Verify that the area passed by the user is writeable */
- if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
+ if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
return -EFAULT;
/* Get the "struct file *" for the eventpoll file */
@@ -2217,37 +2313,17 @@
size_t, sigsetsize)
{
int error;
- sigset_t ksigmask, sigsaved;
/*
* If the caller wants a certain signal mask to be set during the wait,
* we apply it here.
*/
- if (sigmask) {
- if (sigsetsize != sizeof(sigset_t))
- return -EINVAL;
- if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
- return -EFAULT;
- sigsaved = current->blocked;
- set_current_blocked(&ksigmask);
- }
+ error = set_user_sigmask(sigmask, sigsetsize);
+ if (error)
+ return error;
error = do_epoll_wait(epfd, events, maxevents, timeout);
-
- /*
- * If we changed the signal mask, we need to restore the original one.
- * In case we've got a signal while waiting, we do not restore the
- * signal mask yet, and we allow do_signal() to deliver the signal on
- * the way back to userspace, before the signal mask is restored.
- */
- if (sigmask) {
- if (error == -EINTR) {
- memcpy(¤t->saved_sigmask, &sigsaved,
- sizeof(sigsaved));
- set_restore_sigmask();
- } else
- set_current_blocked(&sigsaved);
- }
+ restore_saved_sigmask_unless(error == -EINTR);
return error;
}
@@ -2260,37 +2336,17 @@
compat_size_t, sigsetsize)
{
long err;
- sigset_t ksigmask, sigsaved;
/*
* If the caller wants a certain signal mask to be set during the wait,
* we apply it here.
*/
- if (sigmask) {
- if (sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
- if (get_compat_sigset(&ksigmask, sigmask))
- return -EFAULT;
- sigsaved = current->blocked;
- set_current_blocked(&ksigmask);
- }
+ err = set_compat_user_sigmask(sigmask, sigsetsize);
+ if (err)
+ return err;
err = do_epoll_wait(epfd, events, maxevents, timeout);
-
- /*
- * If we changed the signal mask, we need to restore the original one.
- * In case we've got a signal while waiting, we do not restore the
- * signal mask yet, and we allow do_signal() to deliver the signal on
- * the way back to userspace, before the signal mask is restored.
- */
- if (sigmask) {
- if (err == -EINTR) {
- memcpy(¤t->saved_sigmask, &sigsaved,
- sizeof(sigsaved));
- set_restore_sigmask();
- } else
- set_current_blocked(&sigsaved);
- }
+ restore_saved_sigmask_unless(err == -EINTR);
return err;
}