Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 21fb5a5..5fc9c9b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -7,6 +7,12 @@
# that is not a function of syscall inputs. E.g. involuntary context switches.
KCOV_INSTRUMENT := n
+# There are numerous data races here, however, most of them are due to plain accesses.
+# This would make it even harder for syzbot to find reproducers, because these
+# bugs trigger without specific input. Disable by default, but should re-enable
+# eventually.
+KCSAN_SANITIZE := n
+
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
# needed for x86 only. Why this used to be enabled for all architectures is beyond
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 1152259..12bca64 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -370,7 +370,7 @@
if (sched_clock_stable())
return sched_clock() + __sched_clock_offset;
- if (!static_branch_unlikely(&sched_clock_running))
+ if (!static_branch_likely(&sched_clock_running))
return sched_clock();
preempt_disable_notrace();
@@ -393,7 +393,7 @@
if (sched_clock_stable())
return;
- if (!static_branch_unlikely(&sched_clock_running))
+ if (!static_branch_likely(&sched_clock_running))
return;
lockdep_assert_irqs_disabled();
@@ -460,7 +460,7 @@
u64 sched_clock_cpu(int cpu)
{
- if (!static_branch_unlikely(&sched_clock_running))
+ if (!static_branch_likely(&sched_clock_running))
return 0;
return sched_clock();
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a1ad5b7..a778554 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -29,12 +29,12 @@
{
unsigned long flags;
- spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
if (x->done != UINT_MAX)
x->done++;
- __wake_up_locked(&x->wait, TASK_NORMAL, 1);
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ swake_up_locked(&x->wait);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete);
@@ -58,10 +58,12 @@
{
unsigned long flags;
- spin_lock_irqsave(&x->wait.lock, flags);
+ lockdep_assert_RT_in_threaded_ctx();
+
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
x->done = UINT_MAX;
- __wake_up_locked(&x->wait, TASK_NORMAL, 0);
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ swake_up_all_locked(&x->wait);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete_all);
@@ -70,20 +72,20 @@
long (*action)(long), long timeout, int state)
{
if (!x->done) {
- DECLARE_WAITQUEUE(wait, current);
+ DECLARE_SWAITQUEUE(wait);
- __add_wait_queue_entry_tail_exclusive(&x->wait, &wait);
do {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
+ __prepare_to_swait(&x->wait, &wait);
__set_current_state(state);
- spin_unlock_irq(&x->wait.lock);
+ raw_spin_unlock_irq(&x->wait.lock);
timeout = action(timeout);
- spin_lock_irq(&x->wait.lock);
+ raw_spin_lock_irq(&x->wait.lock);
} while (!x->done && timeout);
- __remove_wait_queue(&x->wait, &wait);
+ __finish_swait(&x->wait, &wait);
if (!x->done)
return timeout;
}
@@ -100,9 +102,9 @@
complete_acquire(x);
- spin_lock_irq(&x->wait.lock);
+ raw_spin_lock_irq(&x->wait.lock);
timeout = do_wait_for_common(x, action, timeout, state);
- spin_unlock_irq(&x->wait.lock);
+ raw_spin_unlock_irq(&x->wait.lock);
complete_release(x);
@@ -291,12 +293,12 @@
if (!READ_ONCE(x->done))
return false;
- spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = false;
else if (x->done != UINT_MAX)
x->done--;
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
return ret;
}
EXPORT_SYMBOL(try_wait_for_completion);
@@ -322,8 +324,8 @@
* otherwise we can end up freeing the completion before complete()
* is done referencing it.
*/
- spin_lock_irqsave(&x->wait.lock, flags);
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
return true;
}
EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5dc43d3..0a5f9fa 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6,22 +6,26 @@
*
* Copyright (C) 1991-2002 Linus Torvalds
*/
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+#undef CREATE_TRACE_POINTS
+
#include "sched.h"
#include <linux/nospec.h>
#include <linux/kcov.h>
+#include <linux/scs.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
#include "../workqueue_internal.h"
+#include "../../fs/io-wq.h"
#include "../smpboot.h"
#include "pelt.h"
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
+#include "smp.h"
/*
* Export tracepoints that act as a bare tracehook (ie: have no trace event
@@ -32,7 +36,11 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -72,6 +80,100 @@
*/
int sysctl_sched_rt_runtime = 950000;
+
+/*
+ * Serialization rules:
+ *
+ * Lock order:
+ *
+ * p->pi_lock
+ * rq->lock
+ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
+ *
+ * rq1->lock
+ * rq2->lock where: rq1 < rq2
+ *
+ * Regular state:
+ *
+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
+ * local CPU's rq->lock, it optionally removes the task from the runqueue and
+ * always looks at the local rq data structures to find the most elegible task
+ * to run next.
+ *
+ * Task enqueue is also under rq->lock, possibly taken from another CPU.
+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
+ * the local CPU to avoid bouncing the runqueue state around [ see
+ * ttwu_queue_wakelist() ]
+ *
+ * Task wakeup, specifically wakeups that involve migration, are horribly
+ * complicated to avoid having to take two rq->locks.
+ *
+ * Special state:
+ *
+ * System-calls and anything external will use task_rq_lock() which acquires
+ * both p->pi_lock and rq->lock. As a consequence the state they change is
+ * stable while holding either lock:
+ *
+ * - sched_setaffinity()/
+ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
+ * - set_user_nice(): p->se.load, p->*prio
+ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
+ * p->se.load, p->rt_priority,
+ * p->dl.dl_{runtime, deadline, period, flags, bw, density}
+ * - sched_setnuma(): p->numa_preferred_nid
+ * - sched_move_task()/
+ * cpu_cgroup_fork(): p->sched_task_group
+ * - uclamp_update_active() p->uclamp*
+ *
+ * p->state <- TASK_*:
+ *
+ * is changed locklessly using set_current_state(), __set_current_state() or
+ * set_special_state(), see their respective comments, or by
+ * try_to_wake_up(). This latter uses p->pi_lock to serialize against
+ * concurrent self.
+ *
+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
+ *
+ * is set by activate_task() and cleared by deactivate_task(), under
+ * rq->lock. Non-zero indicates the task is runnable, the special
+ * ON_RQ_MIGRATING state is used for migration without holding both
+ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
+ *
+ * p->on_cpu <- { 0, 1 }:
+ *
+ * is set by prepare_task() and cleared by finish_task() such that it will be
+ * set before p is scheduled-in and cleared after p is scheduled-out, both
+ * under rq->lock. Non-zero indicates the task is running on its CPU.
+ *
+ * [ The astute reader will observe that it is possible for two tasks on one
+ * CPU to have ->on_cpu = 1 at the same time. ]
+ *
+ * task_cpu(p): is changed by set_task_cpu(), the rules are:
+ *
+ * - Don't call set_task_cpu() on a blocked task:
+ *
+ * We don't care what CPU we're not running on, this simplifies hotplug,
+ * the CPU assignment of blocked tasks isn't required to be valid.
+ *
+ * - for try_to_wake_up(), called under p->pi_lock:
+ *
+ * This allows try_to_wake_up() to only take one rq->lock, see its comment.
+ *
+ * - for migration called under rq->lock:
+ * [ see task_on_rq_migrating() in task_rq_lock() ]
+ *
+ * o move_queued_task()
+ * o detach_task()
+ *
+ * - for migration called under double_rq_lock():
+ *
+ * o __migrate_swap_task()
+ * o push_rt_task() / pull_rt_task()
+ * o push_dl_task() / pull_dl_task()
+ * o dl_task_offline_migration()
+ *
+ */
+
/*
* __task_rq_lock - lock the rq @p resides on.
*/
@@ -218,6 +320,13 @@
update_rq_clock_task(rq, delta);
}
+static inline void
+rq_csd_init(struct rq *rq, struct __call_single_data *csd, smp_call_func_t func)
+{
+ csd->flags = 0;
+ csd->func = func;
+ csd->info = rq;
+}
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -269,7 +378,6 @@
rq_lock(rq, &rf);
__hrtick_restart(rq);
- rq->hrtick_csd_pending = 0;
rq_unlock(rq, &rf);
}
@@ -290,12 +398,10 @@
delta = max_t(s64, delay, 10000LL);
rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
- if (rq == this_rq()) {
+ if (rq == this_rq())
__hrtick_restart(rq);
- } else if (!rq->hrtick_csd_pending) {
+ else
smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
- rq->hrtick_csd_pending = 1;
- }
}
#else
@@ -314,18 +420,14 @@
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
HRTIMER_MODE_REL_PINNED_HARD);
}
+
#endif /* CONFIG_SMP */
static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
- rq->hrtick_csd_pending = 0;
-
- rq->hrtick_csd.flags = 0;
- rq->hrtick_csd.func = __hrtick_start;
- rq->hrtick_csd.info = rq;
+ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
#endif
-
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
rq->hrtick_timer.function = hrtick;
}
@@ -549,27 +651,32 @@
*/
int get_nohz_timer_target(void)
{
- int i, cpu = smp_processor_id();
+ int i, cpu = smp_processor_id(), default_cpu = -1;
struct sched_domain *sd;
- if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
- return cpu;
+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
+ if (!idle_cpu(cpu))
+ return cpu;
+ default_cpu = cpu;
+ }
rcu_read_lock();
for_each_domain(cpu, sd) {
- for_each_cpu(i, sched_domain_span(sd)) {
+ for_each_cpu_and(i, sched_domain_span(sd),
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
if (cpu == i)
continue;
- if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
+ if (!idle_cpu(i)) {
cpu = i;
goto unlock;
}
}
}
- if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
- cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+ if (default_cpu == -1)
+ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+ cpu = default_cpu;
unlock:
rcu_read_unlock();
return cpu;
@@ -629,29 +736,23 @@
wake_up_idle_cpu(cpu);
}
-static inline bool got_nohz_idle_kick(void)
+static void nohz_csd_func(void *info)
{
- int cpu = smp_processor_id();
-
- if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
- return false;
-
- if (idle_cpu(cpu) && !need_resched())
- return true;
+ struct rq *rq = info;
+ int cpu = cpu_of(rq);
+ unsigned int flags;
/*
- * We can't run Idle Load Balance on this CPU for this time so we
- * cancel it and clear NOHZ_BALANCE_KICK
+ * Release the rq::nohz_csd.
*/
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
- return false;
-}
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+ WARN_ON(!(flags & NOHZ_KICK_MASK));
-#else /* CONFIG_NO_HZ_COMMON */
-
-static inline bool got_nohz_idle_kick(void)
-{
- return false;
+ rq->idle_balance = idle_cpu(cpu);
+ if (rq->idle_balance && !need_resched()) {
+ rq->nohz_idle_balance = flags;
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
+ }
}
#endif /* CONFIG_NO_HZ_COMMON */
@@ -753,7 +854,6 @@
if (task_has_idle_policy(p)) {
load->weight = scale_load(WEIGHT_IDLEPRIO);
load->inv_weight = WMULT_IDLEPRIO;
- p->se.runnable_weight = load->weight;
return;
}
@@ -766,7 +866,6 @@
} else {
load->weight = scale_load(sched_prio_to_weight[prio]);
load->inv_weight = sched_prio_to_wmult[prio];
- p->se.runnable_weight = load->weight;
}
}
@@ -789,6 +888,23 @@
/* Max allowed maximum utilization */
unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
+/*
+ * By default RT tasks run at the maximum performance point/capacity of the
+ * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
+ * SCHED_CAPACITY_SCALE.
+ *
+ * This knob allows admins to change the default behavior when uclamp is being
+ * used. In battery powered devices, particularly, running at the maximum
+ * capacity and frequency will increase energy consumption and shorten the
+ * battery life.
+ *
+ * This knob only affects RT tasks that their uclamp_se->user_defined == false.
+ *
+ * This knob will not override the system default sched_util_clamp_min defined
+ * above.
+ */
+unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
+
/* All clamps are required to be less or equal than these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];
@@ -823,11 +939,6 @@
return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
}
-static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
-{
- return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
-}
-
static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
@@ -891,6 +1002,64 @@
return uclamp_idle_value(rq, clamp_id, clamp_value);
}
+static void __uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+ unsigned int default_util_min;
+ struct uclamp_se *uc_se;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ uc_se = &p->uclamp_req[UCLAMP_MIN];
+
+ /* Only sync if user didn't override the default */
+ if (uc_se->user_defined)
+ return;
+
+ default_util_min = sysctl_sched_uclamp_util_min_rt_default;
+ uclamp_se_set(uc_se, default_util_min, false);
+}
+
+static void uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (!rt_task(p))
+ return;
+
+ /* Protect updates to p->uclamp_* */
+ rq = task_rq_lock(p, &rf);
+ __uclamp_update_util_min_rt_default(p);
+ task_rq_unlock(rq, p, &rf);
+}
+
+static void uclamp_sync_util_min_rt_default(void)
+{
+ struct task_struct *g, *p;
+
+ /*
+ * copy_process() sysctl_uclamp
+ * uclamp_min_rt = X;
+ * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
+ * // link thread smp_mb__after_spinlock()
+ * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
+ * sched_post_fork() for_each_process_thread()
+ * __uclamp_sync_rt() __uclamp_sync_rt()
+ *
+ * Ensures that either sched_post_fork() will observe the new
+ * uclamp_min_rt or for_each_process_thread() will observe the new
+ * task.
+ */
+ read_lock(&tasklist_lock);
+ smp_mb__after_spinlock();
+ read_unlock(&tasklist_lock);
+
+ rcu_read_lock();
+ for_each_process_thread(g, p)
+ uclamp_update_util_min_rt_default(p);
+ rcu_read_unlock();
+}
+
static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
@@ -939,17 +1108,17 @@
return uc_req;
}
-unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
+unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_eff;
/* Task currently refcounted: use back-annotated (effective) value */
if (p->uclamp[clamp_id].active)
- return p->uclamp[clamp_id].value;
+ return (unsigned long)p->uclamp[clamp_id].value;
uc_eff = uclamp_eff_get(p, clamp_id);
- return uc_eff.value;
+ return (unsigned long)uc_eff.value;
}
/*
@@ -1188,16 +1357,16 @@
#endif
int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
bool update_root_tg = false;
- int old_min, old_max;
+ int old_min, old_max, old_min_rt;
int result;
mutex_lock(&uclamp_mutex);
old_min = sysctl_sched_uclamp_util_min;
old_max = sysctl_sched_uclamp_util_max;
+ old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
result = proc_dointvec(table, write, buffer, lenp, ppos);
if (result)
@@ -1206,7 +1375,9 @@
goto done;
if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
- sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
+ sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
+ sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
+
result = -EINVAL;
goto undo;
}
@@ -1227,6 +1398,11 @@
uclamp_update_root_tg();
}
+ if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
+ static_branch_enable(&sched_uclamp_used);
+ uclamp_sync_util_min_rt_default();
+ }
+
/*
* We update all RUNNABLE tasks only when task groups are in use.
* Otherwise, keep it simple and do just a lazy update at each next
@@ -1238,6 +1414,7 @@
undo:
sysctl_sched_uclamp_util_min = old_min;
sysctl_sched_uclamp_util_max = old_max;
+ sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
done:
mutex_unlock(&uclamp_mutex);
@@ -1283,17 +1460,20 @@
*/
for_each_clamp_id(clamp_id) {
struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
- unsigned int clamp_value = uclamp_none(clamp_id);
/* Keep using defined clamps across class changes */
if (uc_se->user_defined)
continue;
- /* By default, RT tasks always get 100% boost */
+ /*
+ * RT by default have a 100% boost value that could be modified
+ * at runtime.
+ */
if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
- clamp_value = uclamp_none(UCLAMP_MAX);
+ __uclamp_update_util_min_rt_default(p);
+ else
+ uclamp_se_set(uc_se, uclamp_none(clamp_id), false);
- uclamp_se_set(uc_se, clamp_value, false);
}
if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
@@ -1314,6 +1494,10 @@
{
enum uclamp_id clamp_id;
+ /*
+ * We don't need to hold task_rq_lock() when updating p->uclamp_* here
+ * as the task is still at its early fork stages.
+ */
for_each_clamp_id(clamp_id)
p->uclamp[clamp_id].active = false;
@@ -1326,6 +1510,11 @@
}
}
+static void uclamp_post_fork(struct task_struct *p)
+{
+ uclamp_update_util_min_rt_default(p);
+}
+
static void __init init_uclamp_rq(struct rq *rq)
{
enum uclamp_id clamp_id;
@@ -1337,7 +1526,7 @@
};
}
- rq->uclamp_flags = 0;
+ rq->uclamp_flags = UCLAMP_FLAG_IDLE;
}
static void __init init_uclamp(void)
@@ -1346,8 +1535,6 @@
enum uclamp_id clamp_id;
int cpu;
- mutex_init(&uclamp_mutex);
-
for_each_possible_cpu(cpu)
init_uclamp_rq(cpu_rq(cpu));
@@ -1378,6 +1565,7 @@
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
+static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */
@@ -1411,9 +1599,6 @@
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
- if (task_contributes_to_load(p))
- rq->nr_uninterruptible--;
-
enqueue_task(rq, p, flags);
p->on_rq = TASK_ON_RQ_QUEUED;
@@ -1423,18 +1608,21 @@
{
p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
- if (task_contributes_to_load(p))
- rq->nr_uninterruptible++;
-
dequeue_task(rq, p, flags);
}
-/*
- * __normal_prio - return the priority that is based on the static prio
- */
-static inline int __normal_prio(struct task_struct *p)
+static inline int __normal_prio(int policy, int rt_prio, int nice)
{
- return p->static_prio;
+ int prio;
+
+ if (dl_policy(policy))
+ prio = MAX_DL_PRIO - 1;
+ else if (rt_policy(policy))
+ prio = MAX_RT_PRIO - 1 - rt_prio;
+ else
+ prio = NICE_TO_PRIO(nice);
+
+ return prio;
}
/*
@@ -1446,15 +1634,7 @@
*/
static inline int normal_prio(struct task_struct *p)
{
- int prio;
-
- if (task_has_dl_policy(p))
- prio = MAX_DL_PRIO-1;
- else if (task_has_rt_policy(p))
- prio = MAX_RT_PRIO-1 - p->rt_priority;
- else
- prio = __normal_prio(p);
- return prio;
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
}
/*
@@ -1510,20 +1690,10 @@
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
- const struct sched_class *class;
-
- if (p->sched_class == rq->curr->sched_class) {
+ if (p->sched_class == rq->curr->sched_class)
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
- } else {
- for_each_class(class) {
- if (class == rq->curr->sched_class)
- break;
- if (class == p->sched_class) {
- resched_curr(rq);
- break;
- }
- }
- }
+ else if (p->sched_class > rq->curr->sched_class)
+ resched_curr(rq);
/*
* A queue event has occurred, and we're going to schedule. In
@@ -1535,17 +1705,6 @@
#ifdef CONFIG_SMP
-static inline bool is_per_cpu_kthread(struct task_struct *p)
-{
- if (!(p->flags & PF_KTHREAD))
- return false;
-
- if (p->nr_cpus_allowed != 1)
- return false;
-
- return true;
-}
-
/*
* Per-CPU kthreads are allowed to run on !active && online CPUs, see
* __set_cpus_allowed_ptr() and select_fallback_rq().
@@ -1585,8 +1744,7 @@
{
lockdep_assert_held(&rq->lock);
- WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
- dequeue_task(rq, p, DEQUEUE_NOCLOCK);
+ deactivate_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
rq_unlock(rq, rf);
@@ -1594,8 +1752,7 @@
rq_lock(rq, rf);
BUG_ON(task_cpu(p) != new_cpu);
- enqueue_task(rq, p, 0);
- p->on_rq = TASK_ON_RQ_QUEUED;
+ activate_task(rq, p, 0);
check_preempt_curr(rq, p, 0);
return rq;
@@ -1650,7 +1807,7 @@
* __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
- sched_ttwu_pending();
+ flush_smp_call_function_from_idle();
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
@@ -1751,7 +1908,12 @@
if (cpumask_equal(&p->cpus_mask, new_mask))
goto out;
- dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+ /*
+ * Picking a ~random cpu helps in cases where we are changing affinity
+ * for groups of tasks (ie. cpuset), so that load balancing is not
+ * immediately required to distribute the tasks within their new mask.
+ */
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
if (dest_cpu >= nr_cpu_ids) {
ret = -EINVAL;
goto out;
@@ -2163,7 +2325,7 @@
state = possible;
break;
}
- /* Fall-through */
+ fallthrough;
case possible:
do_set_cpus_allowed(p, cpu_possible_mask);
state = fail;
@@ -2220,12 +2382,6 @@
return cpu;
}
-static void update_avg(u64 *avg, u64 sample)
-{
- s64 diff = sample - *avg;
- *avg += diff >> 3;
-}
-
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -2348,25 +2504,49 @@
lockdep_assert_held(&rq->lock);
-#ifdef CONFIG_SMP
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
+#ifdef CONFIG_SMP
if (wake_flags & WF_MIGRATED)
en_flags |= ENQUEUE_MIGRATED;
+ else
#endif
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
activate_task(rq, p, en_flags);
ttwu_do_wakeup(rq, p, wake_flags, rf);
}
/*
- * Called in case the task @p isn't fully descheduled from its runqueue,
- * in this case we must do a remote wakeup. Its a 'light' wakeup though,
- * since all we need to do is flip p->state to TASK_RUNNING, since
- * the task is still ->on_rq.
+ * Consider @p being inside a wait loop:
+ *
+ * for (;;) {
+ * set_current_state(TASK_UNINTERRUPTIBLE);
+ *
+ * if (CONDITION)
+ * break;
+ *
+ * schedule();
+ * }
+ * __set_current_state(TASK_RUNNING);
+ *
+ * between set_current_state() and schedule(). In this case @p is still
+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
+ * an atomic manner.
+ *
+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
+ * then schedule() must still happen and p->state can be changed to
+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
+ * need to do a full wakeup with enqueue.
+ *
+ * Returns: %true when the wakeup is done,
+ * %false otherwise.
*/
-static int ttwu_remote(struct task_struct *p, int wake_flags)
+static int ttwu_runnable(struct task_struct *p, int wake_flags)
{
struct rq_flags rf;
struct rq *rq;
@@ -2385,75 +2565,63 @@
}
#ifdef CONFIG_SMP
-void sched_ttwu_pending(void)
+void sched_ttwu_pending(void *arg)
{
+ struct llist_node *llist = arg;
struct rq *rq = this_rq();
- struct llist_node *llist = llist_del_all(&rq->wake_list);
struct task_struct *p, *t;
struct rq_flags rf;
if (!llist)
return;
+ /*
+ * rq::ttwu_pending racy indication of out-standing wakeups.
+ * Races such that false-negatives are possible, since they
+ * are shorter lived that false-positives would be.
+ */
+ WRITE_ONCE(rq->ttwu_pending, 0);
+
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- llist_for_each_entry_safe(p, t, llist, wake_entry)
+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
+ if (WARN_ON_ONCE(p->on_cpu))
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
+
+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
+ set_task_cpu(p, cpu_of(rq));
+
ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
+ }
rq_unlock_irqrestore(rq, &rf);
}
-void scheduler_ipi(void)
+void send_call_function_single_ipi(int cpu)
{
- /*
- * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
- * TIF_NEED_RESCHED remotely (for the first time) will also send
- * this IPI.
- */
- preempt_fold_need_resched();
+ struct rq *rq = cpu_rq(cpu);
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
- return;
-
- /*
- * Not all reschedule IPI handlers call irq_enter/irq_exit, since
- * traditionally all their work was done from the interrupt return
- * path. Now that we actually do some work, we need to make sure
- * we do call them.
- *
- * Some archs already do call them, luckily irq_enter/exit nest
- * properly.
- *
- * Arguably we should visit all archs and update all handlers,
- * however a fair share of IPIs are still resched only so this would
- * somewhat pessimize the simple resched case.
- */
- irq_enter();
- sched_ttwu_pending();
-
- /*
- * Check if someone kicked us for doing the nohz idle load balance.
- */
- if (unlikely(got_nohz_idle_kick())) {
- this_rq()->idle_balance = 1;
- raise_softirq_irqoff(SCHED_SOFTIRQ);
- }
- irq_exit();
+ if (!set_nr_if_polling(rq->idle))
+ arch_send_call_function_single_ipi(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
}
-static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+/*
+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
+ * necessary. The wakee CPU on receipt of the IPI will queue the task
+ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost
+ * of the wakeup instead of the waker.
+ */
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
- if (!set_nr_if_polling(rq->idle))
- smp_send_reschedule(cpu);
- else
- trace_sched_wake_idle_without_ipi(cpu);
- }
+ WRITE_ONCE(rq->ttwu_pending, 1);
+ __smp_call_single_queue(cpu, &p->wake_entry.llist);
}
void wake_up_if_idle(int cpu)
@@ -2482,8 +2650,54 @@
bool cpus_share_cache(int this_cpu, int that_cpu)
{
+ if (this_cpu == that_cpu)
+ return true;
+
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
+
+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+{
+ /*
+ * If the CPU does not share cache, then queue the task on the
+ * remote rqs wakelist to avoid accessing remote data.
+ */
+ if (!cpus_share_cache(smp_processor_id(), cpu))
+ return true;
+
+ /*
+ * If the task is descheduling and the only running task on the
+ * CPU then use the wakelist to offload the task activation to
+ * the soon-to-be-idle CPU as the current CPU is likely busy.
+ * nr_running is checked to avoid unnecessary task stacking.
+ */
+ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
+ return true;
+
+ return false;
+}
+
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
+ if (WARN_ON_ONCE(cpu == smp_processor_id()))
+ return false;
+
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
+ __ttwu_queue_wakelist(p, cpu, wake_flags);
+ return true;
+ }
+
+ return false;
+}
+
+#else /* !CONFIG_SMP */
+
+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+ return false;
+}
+
#endif /* CONFIG_SMP */
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
@@ -2491,13 +2705,8 @@
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
-#if defined(CONFIG_SMP)
- if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
- sched_clock_cpu(cpu); /* Sync clocks across CPUs */
- ttwu_queue_remote(p, cpu, wake_flags);
+ if (ttwu_queue_wakelist(p, cpu, wake_flags))
return;
- }
-#endif
rq_lock(rq, &rf);
update_rq_clock(rq);
@@ -2553,8 +2762,8 @@
* migration. However the means are completely different as there is no lock
* chain to provide order. Instead we do:
*
- * 1) smp_store_release(X->on_cpu, 0)
- * 2) smp_cond_load_acquire(!X->on_cpu)
+ * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
+ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
*
* Example:
*
@@ -2594,15 +2803,33 @@
* @state: the mask of task states that can be woken
* @wake_flags: wake modifier flags (WF_*)
*
- * If (@state & @p->state) @p->state = TASK_RUNNING.
+ * Conceptually does:
+ *
+ * If (@state & @p->state) @p->state = TASK_RUNNING.
*
* If the task was not queued/runnable, also place it back on a runqueue.
*
- * Atomic against schedule() which would dequeue a task, also see
- * set_current_state().
+ * This function is atomic against schedule() which would dequeue the task.
*
- * This function executes a full memory barrier before accessing the task
- * state; see set_current_state().
+ * It issues a full memory barrier before accessing @p->state, see the comment
+ * with set_current_state().
+ *
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
+ *
+ * Relies on p->pi_lock stabilizing:
+ * - p->sched_class
+ * - p->cpus_ptr
+ * - p->sched_task_group
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
+ *
+ * Tries really hard to only take one task_rq(p)->lock for performance.
+ * Takes rq->lock in:
+ * - ttwu_runnable() -- old rq, unavoidable, see comment there;
+ * - ttwu_queue() -- new rq, for enqueue of the task;
+ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
+ *
+ * As a consequence we race really badly with just about everything. See the
+ * many memory barriers and their comments for details.
*
* Return: %true if @p->state changes (an actual wakeup was done),
* %false otherwise.
@@ -2618,7 +2845,7 @@
/*
* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
* == smp_processor_id()'. Together this means we can special
- * case the whole 'p->on_rq && ttwu_remote()' case below
+ * case the whole 'p->on_rq && ttwu_runnable()' case below
* without taking any locks.
*
* In particular:
@@ -2630,7 +2857,6 @@
goto out;
success = 1;
- cpu = task_cpu(p);
trace_sched_waking(p);
p->state = TASK_RUNNING;
trace_sched_wakeup(p);
@@ -2640,8 +2866,8 @@
/*
* If we are going to wake up a thread waiting for CONDITION we
* need to ensure that CONDITION=1 done by the caller can not be
- * reordered with p->state check below. This pairs with mb() in
- * set_current_state() the waiting thread does.
+ * reordered with p->state check below. This pairs with smp_store_mb()
+ * in set_current_state() that the waiting thread does.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
smp_mb__after_spinlock();
@@ -2652,7 +2878,6 @@
/* We're going to change ->state: */
success = 1;
- cpu = task_cpu(p);
/*
* Ensure we load p->on_rq _after_ p->state, otherwise it would
@@ -2673,9 +2898,11 @@
*
* Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
* __schedule(). See the comment for smp_mb__after_spinlock().
+ *
+ * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
*/
smp_rmb();
- if (p->on_rq && ttwu_remote(p, wake_flags))
+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
goto unlock;
#ifdef CONFIG_SMP
@@ -2697,8 +2924,43 @@
*
* Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
* __schedule(). See the comment for smp_mb__after_spinlock().
+ *
+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
+ * schedule()'s deactivate_task() has 'happened' and p will no longer
+ * care about it's own p->state. See the comment in __schedule().
*/
- smp_rmb();
+ smp_acquire__after_ctrl_dep();
+
+ /*
+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
+ * == 0), which means we need to do an enqueue, change p->state to
+ * TASK_WAKING such that we can unlock p->pi_lock before doing the
+ * enqueue, such as ttwu_queue_wakelist().
+ */
+ p->state = TASK_WAKING;
+
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, considering queueing p on the remote CPUs wake_list
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
+ * let the waker make forward progress. This is safe because IRQs are
+ * disabled and the IPI will deliver after on_cpu is cleared.
+ *
+ * Ensure we load task_cpu(p) after p->on_cpu:
+ *
+ * set_task_cpu(p, cpu);
+ * STORE p->cpu = @cpu
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock
+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
+ * STORE p->on_cpu = 1 LOAD p->cpu
+ *
+ * to ensure we observe the correct CPU on which the task is currently
+ * scheduling.
+ */
+ if (smp_load_acquire(&p->on_cpu) &&
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
+ goto unlock;
/*
* If the owning (remote) CPU is still in the middle of schedule() with
@@ -2711,28 +2973,19 @@
*/
smp_cond_load_acquire(&p->on_cpu, !VAL);
- p->sched_contributes_to_load = !!task_contributes_to_load(p);
- p->state = TASK_WAKING;
-
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
-
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
wake_flags |= WF_MIGRATED;
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
-
-#else /* CONFIG_SMP */
-
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
-
+#else
+ cpu = task_cpu(p);
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
@@ -2740,13 +2993,58 @@
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out:
if (success)
- ttwu_stat(p, cpu, wake_flags);
+ ttwu_stat(p, task_cpu(p), wake_flags);
preempt_enable();
return success;
}
/**
+ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
+ * @p: Process for which the function is to be invoked, can be @current.
+ * @func: Function to invoke.
+ * @arg: Argument to function.
+ *
+ * If the specified task can be quickly locked into a definite state
+ * (either sleeping or on a given runqueue), arrange to keep it in that
+ * state while invoking @func(@arg). This function can use ->on_rq and
+ * task_curr() to work out what the state is, if required. Given that
+ * @func can be invoked with a runqueue lock held, it had better be quite
+ * lightweight.
+ *
+ * Returns:
+ * @false if the task slipped out from under the locks.
+ * @true if the task was locked onto a runqueue or is sleeping.
+ * However, @func can override this by returning @false.
+ */
+bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
+{
+ struct rq_flags rf;
+ bool ret = false;
+ struct rq *rq;
+
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+ if (p->on_rq) {
+ rq = __task_rq_lock(p, &rf);
+ if (task_rq(p) == rq)
+ ret = func(p, arg);
+ rq_unlock(rq, &rf);
+ } else {
+ switch (p->state) {
+ case TASK_RUNNING:
+ case TASK_WAKING:
+ break;
+ default:
+ smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
+ if (!p->on_rq)
+ ret = func(p, arg);
+ }
+ }
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
+ return ret;
+}
+
+/**
* wake_up_process - Wake up a specific process
* @p: The process to be woken up.
*
@@ -2814,6 +3112,9 @@
p->capture_control = NULL;
#endif
init_numa_balancing(clone_flags, p);
+#ifdef CONFIG_SMP
+ p->wake_entry.u_flags = CSD_TYPE_TTWU;
+#endif
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -2830,7 +3131,7 @@
#ifdef CONFIG_PROC_SYSCTL
int sysctl_numa_balancing(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
@@ -2904,8 +3205,8 @@
}
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_schedstats(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
@@ -2933,8 +3234,6 @@
*/
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
- unsigned long flags;
-
__sched_fork(clone_flags, p);
/*
* We mark the process as NEW here. This guarantees that
@@ -2961,7 +3260,7 @@
} else if (PRIO_TO_NICE(p->static_prio) < 0)
p->static_prio = NICE_TO_PRIO(0);
- p->prio = p->normal_prio = __normal_prio(p);
+ p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
/*
@@ -2980,24 +3279,6 @@
init_entity_runnable_average(&p->se);
- /*
- * The child is not yet in the pid-hash so no cgroup attach races,
- * and the cgroup is pinned to this child due to cgroup_fork()
- * is ran before sched_fork().
- *
- * Silence PROVE_RCU.
- */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- rseq_migrate(p);
- /*
- * We're setting the CPU for the first time, we don't migrate,
- * so use __set_task_cpu().
- */
- __set_task_cpu(p, smp_processor_id());
- if (p->sched_class->task_fork)
- p->sched_class->task_fork(p);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-
#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -3013,6 +3294,32 @@
return 0;
}
+void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+{
+ unsigned long flags;
+#ifdef CONFIG_CGROUP_SCHED
+ struct task_group *tg;
+#endif
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+#ifdef CONFIG_CGROUP_SCHED
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
+ struct task_group, css);
+ p->sched_task_group = autogroup_task_group(p, tg);
+#endif
+ rseq_migrate(p);
+ /*
+ * We're setting the CPU for the first time, we don't migrate,
+ * so use __set_task_cpu().
+ */
+ __set_task_cpu(p, smp_processor_id());
+ if (p->sched_class->task_fork)
+ p->sched_class->task_fork(p);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ uclamp_post_fork(p);
+}
+
unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
@@ -3170,8 +3477,10 @@
/*
* Claim the task as running, we do this before switching to it
* such that any running task will have this set.
+ *
+ * See the ttwu() WF_ON_CPU case and its ordering comment.
*/
- next->on_cpu = 1;
+ WRITE_ONCE(next->on_cpu, 1);
#endif
}
@@ -3179,8 +3488,9 @@
{
#ifdef CONFIG_SMP
/*
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
- * We must ensure this doesn't happen until the switch is completely
+ * This must be the very last reference to @prev from this CPU. After
+ * p->on_cpu is cleared, the task can be moved to a different CPU. We
+ * must ensure this doesn't happen until the switch is completely
* finished.
*
* In particular, the load of prev->state in finish_task_switch() must
@@ -3202,7 +3512,7 @@
* do an early lockdep release here:
*/
rq_unpin_lock(rq, rf);
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+ spin_release(&rq->lock.dep_map, _THIS_IP_);
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
rq->lock.owner = next;
@@ -3689,12 +3999,16 @@
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
struct rq_flags rf;
+ unsigned long thermal_pressure;
+ arch_scale_freq_tick();
sched_clock_tick();
rq_lock(rq, &rf);
update_rq_clock(rq);
+ thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
+ update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
curr->sched_class->task_tick(rq, curr, 0);
calc_global_load_tick(rq);
psi_task_tick(rq);
@@ -3958,8 +4272,7 @@
if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
&& in_atomic_preempt_off()) {
pr_err("Preemption disabled at:");
- print_ip_sym(preempt_disable_ip);
- pr_cont("\n");
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
}
if (panic_on_warn)
panic("scheduling while atomic\n");
@@ -3976,6 +4289,9 @@
#ifdef CONFIG_SCHED_STACK_END_CHECK
if (task_stack_end_corrupted(prev))
panic("corrupted stack end detected inside scheduler\n");
+
+ if (task_scs_end_corrupted(prev))
+ panic("corrupted shadow stack detected inside scheduler\n");
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
@@ -3998,6 +4314,28 @@
schedstat_inc(this_rq()->sched_count);
}
+static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf)
+{
+#ifdef CONFIG_SMP
+ const struct sched_class *class;
+ /*
+ * We must do the balancing pass before put_prev_task(), such
+ * that when we release the rq->lock the task is in the same
+ * state as before we took rq->lock.
+ *
+ * We can terminate the balance pass as soon as we know there is
+ * a runnable task of @class priority or higher.
+ */
+ for_class_range(class, prev->sched_class, &idle_sched_class) {
+ if (class->balance(rq, prev, rf))
+ break;
+ }
+#endif
+
+ put_prev_task(rq, prev);
+}
+
/*
* Pick up the highest-prio task:
*/
@@ -4013,41 +4351,27 @@
* higher scheduling class, because otherwise those loose the
* opportunity to pull in more work from other CPUs.
*/
- if (likely((prev->sched_class == &idle_sched_class ||
- prev->sched_class == &fair_sched_class) &&
+ if (likely(prev->sched_class <= &fair_sched_class &&
rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq, prev, rf);
+ p = pick_next_task_fair(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
goto restart;
/* Assumes fair_sched_class->next == idle_sched_class */
- if (unlikely(!p))
- p = idle_sched_class.pick_next_task(rq, prev, rf);
+ if (!p) {
+ put_prev_task(rq, prev);
+ p = pick_next_task_idle(rq);
+ }
return p;
}
restart:
-#ifdef CONFIG_SMP
- /*
- * We must do the balancing pass before put_next_task(), such
- * that when we release the rq->lock the task is in the same
- * state as before we took rq->lock.
- *
- * We can terminate the balance pass as soon as we know there is
- * a runnable task of @class priority or higher.
- */
- for_class_range(class, prev->sched_class, &idle_sched_class) {
- if (class->balance(rq, prev, rf))
- break;
- }
-#endif
-
- put_prev_task(rq, prev);
+ put_prev_task_balance(rq, prev, rf);
for_each_class(class) {
- p = class->pick_next_task(rq, NULL, NULL);
+ p = class->pick_next_task(rq);
if (p)
return p;
}
@@ -4099,6 +4423,7 @@
{
struct task_struct *prev, *next;
unsigned long *switch_count;
+ unsigned long prev_state;
struct rq_flags rf;
struct rq *rq;
int cpu;
@@ -4118,9 +4443,16 @@
/*
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
- * done by the caller to avoid the race with signal_wake_up().
+ * done by the caller to avoid the race with signal_wake_up():
*
- * The membarrier system call requires a full memory barrier
+ * __set_current_state(@state) signal_wake_up()
+ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
+ * wake_up_state(p, state)
+ * LOCK rq->lock LOCK p->pi_state
+ * smp_mb__after_spinlock() smp_mb__after_spinlock()
+ * if (signal_pending_state()) if (p->state & @state)
+ *
+ * Also, the membarrier system call requires a full memory barrier
* after coming from user-space, before storing to rq->curr.
*/
rq_lock(rq, &rf);
@@ -4131,10 +4463,38 @@
update_rq_clock(rq);
switch_count = &prev->nivcsw;
- if (!preempt && prev->state) {
- if (signal_pending_state(prev->state, prev)) {
+
+ /*
+ * We must load prev->state once (task_struct::state is volatile), such
+ * that:
+ *
+ * - we form a control dependency vs deactivate_task() below.
+ * - ptrace_{,un}freeze_traced() can change ->state underneath us.
+ */
+ prev_state = prev->state;
+ if (!preempt && prev_state) {
+ if (signal_pending_state(prev_state, prev)) {
prev->state = TASK_RUNNING;
} else {
+ prev->sched_contributes_to_load =
+ (prev_state & TASK_UNINTERRUPTIBLE) &&
+ !(prev_state & TASK_NOLOAD) &&
+ !(prev->flags & PF_FROZEN);
+
+ if (prev->sched_contributes_to_load)
+ rq->nr_uninterruptible++;
+
+ /*
+ * __schedule() ttwu()
+ * prev_state = prev->state; if (p->on_rq && ...)
+ * if (prev_state) goto out;
+ * p->on_rq = 0; smp_acquire__after_ctrl_dep();
+ * p->state = TASK_WAKING
+ *
+ * Where __schedule() and ttwu() have matching control dependencies.
+ *
+ * After this, schedule() must not care about p->state any more.
+ */
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
if (prev->in_iowait) {
@@ -4172,6 +4532,8 @@
*/
++*switch_count;
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev));
+
trace_sched_switch(preempt, prev, next);
/* Also unlocks the rq: */
@@ -4202,9 +4564,12 @@
static inline void sched_submit_work(struct task_struct *tsk)
{
+ unsigned int task_flags;
+
if (!tsk->state)
return;
+ task_flags = tsk->flags;
/*
* If a worker went to sleep, notify and ask workqueue whether
* it wants to wake up a task to maintain concurrency.
@@ -4213,9 +4578,12 @@
* in the possible wakeup of a kworker and because wq_worker_sleeping()
* requires it.
*/
- if (tsk->flags & PF_WQ_WORKER) {
+ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
preempt_disable();
- wq_worker_sleeping(tsk);
+ if (task_flags & PF_WQ_WORKER)
+ wq_worker_sleeping(tsk);
+ else
+ io_wq_worker_sleeping(tsk);
preempt_enable_no_resched();
}
@@ -4232,8 +4600,12 @@
static void sched_update_worker(struct task_struct *tsk)
{
- if (tsk->flags & PF_WQ_WORKER)
- wq_worker_running(tsk);
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+ if (tsk->flags & PF_WQ_WORKER)
+ wq_worker_running(tsk);
+ else
+ io_wq_worker_running(tsk);
+ }
}
asmlinkage __visible void __sched schedule(void)
@@ -4437,10 +4809,23 @@
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
+static void __setscheduler_prio(struct task_struct *p, int prio)
+{
+ if (dl_prio(prio))
+ p->sched_class = &dl_sched_class;
+ else if (rt_prio(prio))
+ p->sched_class = &rt_sched_class;
+ else
+ p->sched_class = &fair_sched_class;
+
+ p->prio = prio;
+}
+
#ifdef CONFIG_RT_MUTEXES
static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
@@ -4551,26 +4936,24 @@
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_prio(pi_task->prio) &&
dl_entity_preempt(&pi_task->dl, &p->dl))) {
- p->dl.dl_boosted = 1;
+ p->dl.pi_se = pi_task->dl.pi_se;
queue_flag |= ENQUEUE_REPLENISH;
- } else
- p->dl.dl_boosted = 0;
- p->sched_class = &dl_sched_class;
+ } else {
+ p->dl.pi_se = &p->dl;
+ }
} else if (rt_prio(prio)) {
if (dl_prio(oldprio))
- p->dl.dl_boosted = 0;
+ p->dl.pi_se = &p->dl;
if (oldprio < prio)
queue_flag |= ENQUEUE_HEAD;
- p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
- p->dl.dl_boosted = 0;
+ p->dl.pi_se = &p->dl;
if (rt_prio(oldprio))
p->rt.timeout = 0;
- p->sched_class = &fair_sched_class;
}
- p->prio = prio;
+ __setscheduler_prio(p, prio);
if (queued)
enqueue_task(rq, p, queue_flag);
@@ -4596,7 +4979,7 @@
void set_user_nice(struct task_struct *p, long nice)
{
bool queued, running;
- int old_prio, delta;
+ int old_prio;
struct rq_flags rf;
struct rq *rq;
@@ -4630,19 +5013,18 @@
set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);
- delta = p->prio - old_prio;
- if (queued) {
+ if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- /*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
- */
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
- resched_curr(rq);
- }
if (running)
set_next_task(rq, p);
+
+ /*
+ * If the task increased its priority or is running and
+ * lowered its priority, then reschedule its CPU:
+ */
+ p->sched_class->prio_changed(rq, p, old_prio);
+
out_unlock:
task_rq_unlock(rq, p, &rf);
}
@@ -4727,7 +5109,7 @@
return 0;
#ifdef CONFIG_SMP
- if (!llist_empty(&rq->wake_list))
+ if (rq->ttwu_pending)
return 0;
#endif
@@ -4804,35 +5186,6 @@
set_load_weight(p, true);
}
-/* Actually do priority change: must hold pi & rq lock. */
-static void __setscheduler(struct rq *rq, struct task_struct *p,
- const struct sched_attr *attr, bool keep_boost)
-{
- /*
- * If params can't change scheduling class changes aren't allowed
- * either.
- */
- if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
- return;
-
- __setscheduler_params(p, attr);
-
- /*
- * Keep a potential priority boosting if called from
- * sched_setscheduler().
- */
- p->prio = normal_prio(p);
- if (keep_boost)
- p->prio = rt_effective_prio(p, p->prio);
-
- if (dl_prio(p->prio))
- p->sched_class = &dl_sched_class;
- else if (rt_prio(p->prio))
- p->sched_class = &rt_sched_class;
- else
- p->sched_class = &fair_sched_class;
-}
-
/*
* Check the target process has a UID that matches the current process's:
*/
@@ -4853,10 +5206,8 @@
const struct sched_attr *attr,
bool user, bool pi)
{
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
- MAX_RT_PRIO - 1 - attr->sched_priority;
- int retval, oldprio, oldpolicy = -1, queued, running;
- int new_effective_prio, policy = attr->sched_policy;
+ int oldpolicy = -1, policy = attr->sched_policy;
+ int retval, oldprio, newprio, queued, running;
const struct sched_class *prev_class;
struct rq_flags rf;
int reset_on_fork;
@@ -5054,6 +5405,7 @@
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio;
+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
if (pi) {
/*
* Take priority boosted tasks into account. If the new
@@ -5062,8 +5414,8 @@
* the runqueue. This will be done when the task deboost
* itself.
*/
- new_effective_prio = rt_effective_prio(p, newprio);
- if (new_effective_prio == oldprio)
+ newprio = rt_effective_prio(p, newprio);
+ if (newprio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
@@ -5076,7 +5428,10 @@
prev_class = p->sched_class;
- __setscheduler(rq, p, attr, pi);
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+ __setscheduler_params(p, attr);
+ __setscheduler_prio(p, newprio);
+ }
__setscheduler_uclamp(p, attr);
if (queued) {
@@ -5140,6 +5495,8 @@
* @policy: new policy.
* @param: structure containing the new RT priority.
*
+ * Use sched_set_fifo(), read its comment.
+ *
* Return: 0 on success. An error code otherwise.
*
* NOTE that the task may be already dead.
@@ -5149,13 +5506,11 @@
{
return _sched_setscheduler(p, policy, param, true);
}
-EXPORT_SYMBOL_GPL(sched_setscheduler);
int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
{
return __sched_setscheduler(p, attr, true, true);
}
-EXPORT_SYMBOL_GPL(sched_setattr);
int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
@@ -5180,7 +5535,51 @@
{
return _sched_setscheduler(p, policy, param, false);
}
-EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
+
+/*
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
+ * incapable of resource management, which is the one thing an OS really should
+ * be doing.
+ *
+ * This is of course the reason it is limited to privileged users only.
+ *
+ * Worse still; it is fundamentally impossible to compose static priority
+ * workloads. You cannot take two correctly working static prio workloads
+ * and smash them together and still expect them to work.
+ *
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
+ *
+ * MAX_RT_PRIO / 2
+ *
+ * The administrator _MUST_ configure the system, the kernel simply doesn't
+ * know enough information to make a sensible choice.
+ */
+void sched_set_fifo(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo);
+
+/*
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
+ */
+void sched_set_fifo_low(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = 1 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
+
+void sched_set_normal(struct task_struct *p, int nice)
+{
+ struct sched_attr attr = {
+ .sched_policy = SCHED_NORMAL,
+ .sched_nice = nice,
+ };
+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_normal);
static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
@@ -5471,6 +5870,11 @@
kattr.sched_nice = task_nice(p);
#ifdef CONFIG_UCLAMP_TASK
+ /*
+ * This could race with another potential updater, but this is fine
+ * because it'll correctly read the old or the new value. We don't need
+ * to guarantee who wins the race as long as it doesn't return garbage.
+ */
kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
@@ -5824,7 +6228,7 @@
if (task_running(p_rq, p) || p->state)
goto out_unlock;
- yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+ yielded = curr->sched_class->yield_to_task(rq, p);
if (yielded) {
schedstat_inc(rq->yld_count);
/*
@@ -6022,10 +6426,10 @@
if (!try_get_task_stack(p))
return;
- printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
if (p->state == TASK_RUNNING)
- printk(KERN_CONT " running task ");
+ pr_cont(" running task ");
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
@@ -6034,12 +6438,12 @@
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
- task_pid_nr(p), ppid,
+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
+ free, task_pid_nr(p), ppid,
(unsigned long)task_thread_info(p)->flags);
print_worker_info(KERN_INFO, p);
- show_stack(p, NULL);
+ show_stack(p, NULL, KERN_INFO);
put_task_stack(p);
}
EXPORT_SYMBOL_GPL(sched_show_task);
@@ -6070,13 +6474,6 @@
{
struct task_struct *g, *p;
-#if BITS_PER_LONG == 32
- printk(KERN_INFO
- " task PC stack pid father\n");
-#else
- printk(KERN_INFO
- " task PC stack pid father\n");
-#endif
rcu_read_lock();
for_each_process_thread(g, p) {
/*
@@ -6112,7 +6509,7 @@
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
*/
-void init_idle(struct task_struct *idle, int cpu)
+void __init init_idle(struct task_struct *idle, int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
@@ -6126,8 +6523,6 @@
idle->se.exec_start = sched_clock();
idle->flags |= PF_IDLE;
- kasan_unpoison_task_stack(idle);
-
#ifdef CONFIG_SMP
/*
* Its possible that init_idle() gets called multiple times on a task,
@@ -6308,7 +6703,7 @@
struct task_struct *next;
for_each_class(class) {
- next = class->pick_next_task(rq, NULL, NULL);
+ next = class->pick_next_task(rq);
if (next) {
next->sched_class->put_prev_task(rq, next);
return next;
@@ -6573,7 +6968,6 @@
struct rq_flags rf;
/* Handle pending wakeups and then migrate everything off */
- sched_ttwu_pending();
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
@@ -6658,6 +7052,14 @@
unsigned long ptr = 0;
int i;
+ /* Make sure the linker didn't screw up */
+ BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
+ &fair_sched_class + 1 != &rt_sched_class ||
+ &rt_sched_class + 1 != &dl_sched_class);
+#ifdef CONFIG_SMP
+ BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
+#endif
+
wait_bit_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6676,6 +7078,8 @@
root_task_group.cfs_rq = (struct cfs_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -6728,7 +7132,6 @@
init_rt_rq(&rq->rt);
init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
/*
@@ -6750,7 +7153,6 @@
* We achieve this by letting root_task_group's tasks sit
* directly in rq->cfs (i.e root_task_group->se[] = NULL).
*/
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -6776,9 +7178,10 @@
rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
- rq->last_load_update_tick = jiffies;
rq->last_blocked_load_update_tick = jiffies;
atomic_set(&rq->nohz_flags, 0);
+
+ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
#endif
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
@@ -6883,8 +7286,7 @@
if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
&& !preempt_count_equals(preempt_offset)) {
pr_err("Preemption disabled at:");
- print_ip_sym(preempt_disable_ip);
- pr_cont("\n");
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
}
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
@@ -8075,4 +8477,7 @@
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
-#undef CREATE_TRACE_POINTS
+void call_trace_sched_update_nr_running(struct rq *rq, int count)
+{
+ trace_sched_update_nr_running_tp(rq, count);
+}
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9fbb103..941c28c 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -5,6 +5,7 @@
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com).
*/
+#include <asm/irq_regs.h>
#include "sched.h"
/* Time spent by the tasks of the CPU accounting group executing in ... */
@@ -339,7 +340,7 @@
{
struct cpuacct *ca;
int index = CPUACCT_STAT_SYSTEM;
- struct pt_regs *regs = task_pt_regs(tsk);
+ struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);
if (regs && user_mode(regs))
index = CPUACCT_STAT_USER;
@@ -347,7 +348,7 @@
rcu_read_lock();
for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
- this_cpu_ptr(ca->cpuusage)->usages[index] += cputime;
+ __this_cpu_add(ca->cpuusage->usages[index], cputime);
rcu_read_unlock();
}
@@ -363,7 +364,7 @@
rcu_read_lock();
for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
- this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
+ __this_cpu_add(ca->cpustat->cpustat[index], val);
rcu_read_unlock();
}
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5cc4012..8cb06c8 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -121,6 +121,30 @@
if (later_mask &&
cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
+ unsigned long cap, max_cap = 0;
+ int cpu, max_cpu = -1;
+
+ if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ return 1;
+
+ /* Ensure the capacity of the CPUs fits the task. */
+ for_each_cpu(cpu, later_mask) {
+ if (!dl_task_fits_capacity(p, cpu)) {
+ cpumask_clear_cpu(cpu, later_mask);
+
+ cap = capacity_orig_of(cpu);
+
+ if (cap > max_cap ||
+ (cpu == task_cpu(p) && cap == max_cap)) {
+ max_cap = cap;
+ max_cpu = cpu;
+ }
+ }
+ }
+
+ if (cpumask_empty(later_mask))
+ cpumask_set_cpu(max_cpu, later_mask);
+
return 1;
} else {
int best_cpu = cpudl_maximum(cp);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 4cb80e6..5e39da0 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -102,8 +102,12 @@
static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq)
{
- if (sg_policy->next_freq == next_freq)
- return false;
+ if (!sg_policy->need_freq_update) {
+ if (sg_policy->next_freq == next_freq)
+ return false;
+ } else {
+ sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
+ }
sg_policy->next_freq = next_freq;
sg_policy->last_freq_update_time = time;
@@ -114,22 +118,8 @@
static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq)
{
- struct cpufreq_policy *policy = sg_policy->policy;
- int cpu;
-
- if (!sugov_update_next_freq(sg_policy, time, next_freq))
- return;
-
- next_freq = cpufreq_driver_fast_switch(policy, next_freq);
- if (!next_freq)
- return;
-
- policy->cur = next_freq;
-
- if (trace_cpu_frequency_enabled()) {
- for_each_cpu(cpu, policy->cpus)
- trace_cpu_frequency(next_freq, cpu);
- }
+ if (sugov_update_next_freq(sg_policy, time, next_freq))
+ cpufreq_driver_fast_switch(sg_policy->policy, next_freq);
}
static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
@@ -178,7 +168,6 @@
if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
return sg_policy->next_freq;
- sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = freq;
return cpufreq_driver_resolve_freq(policy, freq);
}
@@ -238,7 +227,7 @@
*/
util = util_cfs + cpu_util_rt(rq);
if (type == FREQUENCY_UTIL)
- util = uclamp_util_with(rq, util, p);
+ util = uclamp_rq_util_with(rq, util, p);
dl_util = cpu_util_dl(rq);
@@ -454,7 +443,7 @@
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned long util, max;
unsigned int next_f;
- bool busy;
+ unsigned int cached_freq = sg_policy->cached_raw_freq;
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
@@ -464,9 +453,6 @@
if (!sugov_should_update_freq(sg_policy, time))
return;
- /* Limits may have changed, don't skip frequency update */
- busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu);
-
util = sugov_get_util(sg_cpu);
max = sg_cpu->max;
util = sugov_iowait_apply(sg_cpu, time, util, max);
@@ -475,11 +461,11 @@
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
*/
- if (busy && next_f < sg_policy->next_freq) {
+ if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {
next_f = sg_policy->next_freq;
- /* Reset cached freq as next_freq has changed */
- sg_policy->cached_raw_freq = 0;
+ /* Restore cached freq as next_freq has changed */
+ sg_policy->cached_raw_freq = cached_freq;
}
/*
@@ -624,9 +610,17 @@
};
ATTRIBUTE_GROUPS(sugov);
+static void sugov_tunables_free(struct kobject *kobj)
+{
+ struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj);
+
+ kfree(to_sugov_tunables(attr_set));
+}
+
static struct kobj_type sugov_tunables_ktype = {
.default_groups = sugov_groups,
.sysfs_ops = &governor_sysfs_ops,
+ .release = &sugov_tunables_free,
};
/********************** cpufreq governor interface *********************/
@@ -726,12 +720,10 @@
return tunables;
}
-static void sugov_tunables_free(struct sugov_tunables *tunables)
+static void sugov_clear_global_tunables(void)
{
if (!have_governor_per_policy())
global_tunables = NULL;
-
- kfree(tunables);
}
static int sugov_init(struct cpufreq_policy *policy)
@@ -794,7 +786,7 @@
fail:
kobject_put(&tunables->attr_set.kobj);
policy->governor_data = NULL;
- sugov_tunables_free(tunables);
+ sugov_clear_global_tunables();
stop_kthread:
sugov_kthread_stop(sg_policy);
@@ -821,7 +813,7 @@
count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
policy->governor_data = NULL;
if (!count)
- sugov_tunables_free(tunables);
+ sugov_clear_global_tunables();
mutex_unlock(&global_tunables_lock);
@@ -840,9 +832,10 @@
sg_policy->next_freq = 0;
sg_policy->work_in_progress = false;
sg_policy->limits_changed = false;
- sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = 0;
+ sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
+
for_each_cpu(cpu, policy->cpus) {
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
@@ -894,7 +887,7 @@
struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
.owner = THIS_MODULE,
- .dynamic_switching = true,
+ .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING,
.init = sugov_init,
.exit = sugov_exit,
.start = sugov_start,
@@ -909,11 +902,7 @@
}
#endif
-static int __init sugov_register(void)
-{
- return cpufreq_register_governor(&schedutil_gov);
-}
-fs_initcall(sugov_register);
+cpufreq_governor_init(schedutil_gov);
#ifdef CONFIG_ENERGY_MODEL
extern bool sched_energy_update;
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index b7abca9..0033731 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -41,11 +41,72 @@
return cpupri;
}
+static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask, int idx)
+{
+ struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
+ int skip = 0;
+
+ if (!atomic_read(&(vec)->count))
+ skip = 1;
+ /*
+ * When looking at the vector, we need to read the counter,
+ * do a memory barrier, then read the mask.
+ *
+ * Note: This is still all racey, but we can deal with it.
+ * Ideally, we only want to look at masks that are set.
+ *
+ * If a mask is not set, then the only thing wrong is that we
+ * did a little more work than necessary.
+ *
+ * If we read a zero count but the mask is set, because of the
+ * memory barriers, that can only happen when the highest prio
+ * task for a run queue has left the run queue, in which case,
+ * it will be followed by a pull. If the task we are processing
+ * fails to find a proper place to go, that pull request will
+ * pull this task if the run queue is running at a lower
+ * priority.
+ */
+ smp_rmb();
+
+ /* Need to do the rmb for every iteration */
+ if (skip)
+ return 0;
+
+ if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
+ return 0;
+
+ if (lowest_mask) {
+ cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
+
+ /*
+ * We have to ensure that we have at least one bit
+ * still set in the array, since the map could have
+ * been concurrently emptied between the first and
+ * second reads of vec->mask. If we hit this
+ * condition, simply act as though we never hit this
+ * priority level and continue on.
+ */
+ if (cpumask_empty(lowest_mask))
+ return 0;
+ }
+
+ return 1;
+}
+
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask)
+{
+ return cpupri_find_fitness(cp, p, lowest_mask, NULL);
+}
+
/**
- * cpupri_find - find the best (lowest-pri) CPU in the system
+ * cpupri_find_fitness - find the best (lowest-pri) CPU in the system
* @cp: The cpupri context
* @p: The task
* @lowest_mask: A mask to fill in with selected CPUs (or NULL)
+ * @fitness_fn: A pointer to a function to do custom checks whether the CPU
+ * fits a specific criteria so that we only return those CPUs.
*
* Note: This function returns the recommended CPUs as calculated during the
* current invocation. By the time the call returns, the CPUs may have in
@@ -56,65 +117,59 @@
*
* Return: (int)bool - CPUs were found
*/
-int cpupri_find(struct cpupri *cp, struct task_struct *p,
- struct cpumask *lowest_mask)
+int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask,
+ bool (*fitness_fn)(struct task_struct *p, int cpu))
{
- int idx = 0;
int task_pri = convert_prio(p->prio);
+ int idx, cpu;
BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
for (idx = 0; idx < task_pri; idx++) {
- struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
- int skip = 0;
- if (!atomic_read(&(vec)->count))
- skip = 1;
- /*
- * When looking at the vector, we need to read the counter,
- * do a memory barrier, then read the mask.
- *
- * Note: This is still all racey, but we can deal with it.
- * Ideally, we only want to look at masks that are set.
- *
- * If a mask is not set, then the only thing wrong is that we
- * did a little more work than necessary.
- *
- * If we read a zero count but the mask is set, because of the
- * memory barriers, that can only happen when the highest prio
- * task for a run queue has left the run queue, in which case,
- * it will be followed by a pull. If the task we are processing
- * fails to find a proper place to go, that pull request will
- * pull this task if the run queue is running at a lower
- * priority.
- */
- smp_rmb();
-
- /* Need to do the rmb for every iteration */
- if (skip)
+ if (!__cpupri_find(cp, p, lowest_mask, idx))
continue;
- if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
- continue;
+ if (!lowest_mask || !fitness_fn)
+ return 1;
- if (lowest_mask) {
- cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
-
- /*
- * We have to ensure that we have at least one bit
- * still set in the array, since the map could have
- * been concurrently emptied between the first and
- * second reads of vec->mask. If we hit this
- * condition, simply act as though we never hit this
- * priority level and continue on.
- */
- if (cpumask_any(lowest_mask) >= nr_cpu_ids)
- continue;
+ /* Ensure the capacity of the CPUs fit the task */
+ for_each_cpu(cpu, lowest_mask) {
+ if (!fitness_fn(p, cpu))
+ cpumask_clear_cpu(cpu, lowest_mask);
}
+ /*
+ * If no CPU at the current priority can fit the task
+ * continue looking
+ */
+ if (cpumask_empty(lowest_mask))
+ continue;
+
return 1;
}
+ /*
+ * If we failed to find a fitting lowest_mask, kick off a new search
+ * but without taking into account any fitness criteria this time.
+ *
+ * This rule favours honouring priority over fitting the task in the
+ * correct CPU (Capacity Awareness being the only user now).
+ * The idea is that if a higher priority task can run, then it should
+ * run even if this ends up being on unfitting CPU.
+ *
+ * The cost of this trade-off is not entirely clear and will probably
+ * be good for some workloads and bad for others.
+ *
+ * The main idea here is that if some CPUs were overcommitted, we try
+ * to spread which is what the scheduler traditionally did. Sys admins
+ * must do proper RT planning to avoid overloading the system if they
+ * really care.
+ */
+ if (fitness_fn)
+ return cpupri_find(cp, p, lowest_mask);
+
return 0;
}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 7dc20a3..efbb492 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -18,7 +18,11 @@
};
#ifdef CONFIG_SMP
-int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask);
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask);
+int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask,
+ bool (*fitness_fn)(struct task_struct *p, int cpu));
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 46ed4e1..ca0eef7 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -147,10 +147,10 @@
/* Add guest time to cpustat. */
if (task_nice(p) > 0) {
- cpustat[CPUTIME_NICE] += cputime;
+ task_group_account_field(p, CPUTIME_NICE, cputime);
cpustat[CPUTIME_GUEST_NICE] += cputime;
} else {
- cpustat[CPUTIME_USER] += cputime;
+ task_group_account_field(p, CPUTIME_USER, cputime);
cpustat[CPUTIME_GUEST] += cputime;
}
}
@@ -355,7 +355,7 @@
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq, int ticks)
+ int ticks)
{
u64 other, cputime = TICK_NSEC * ticks;
@@ -381,7 +381,7 @@
account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
} else if (user_tick) {
account_user_time(p, cputime);
- } else if (p == rq->idle) {
+ } else if (p == this_rq()->idle) {
account_idle_time(cputime);
} else if (p->flags & PF_VCPU) { /* System time or guest time */
account_guest_time(p, cputime);
@@ -392,40 +392,36 @@
static void irqtime_account_idle_ticks(int ticks)
{
- struct rq *rq = this_rq();
-
- irqtime_account_process_tick(current, 0, rq, ticks);
+ irqtime_account_process_tick(current, 0, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq, int nr_ticks) { }
+ int nr_ticks) { }
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
* Use precise platform statistics if available:
*/
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+
# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_common_task_switch(struct task_struct *prev)
+void vtime_task_switch(struct task_struct *prev)
{
if (is_idle_task(prev))
vtime_account_idle(prev);
else
- vtime_account_system(prev);
+ vtime_account_kernel(prev);
vtime_flush(prev);
arch_vtime_task_switch(prev);
}
# endif
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
* Archs that account the whole time spent in the idle task
* (outside irq) as idle time can rely on this and just implement
- * vtime_account_system() and vtime_account_idle(). Archs that
+ * vtime_account_kernel() and vtime_account_idle(). Archs that
* have other meaning of the idle time (s390 only includes the
* time spent by the CPU when it's in low power mode) must override
* vtime_account().
@@ -436,7 +432,7 @@
if (!in_interrupt() && is_idle_task(tsk))
vtime_account_idle(tsk);
else
- vtime_account_system(tsk);
+ vtime_account_kernel(tsk);
}
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
@@ -475,13 +471,12 @@
void account_process_tick(struct task_struct *p, int user_tick)
{
u64 cputime, steal;
- struct rq *rq = this_rq();
- if (vtime_accounting_cpu_enabled())
+ if (vtime_accounting_enabled_this_cpu())
return;
if (sched_clock_irqtime) {
- irqtime_account_process_tick(p, user_tick, rq, 1);
+ irqtime_account_process_tick(p, user_tick, 1);
return;
}
@@ -495,7 +490,7 @@
if (user_tick)
account_user_time(p, cputime);
- else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
+ else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
account_system_time(p, HARDIRQ_OFFSET, cputime);
else
account_idle_time(cputime);
@@ -525,50 +520,6 @@
}
/*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
- */
-static u64 scale_stime(u64 stime, u64 rtime, u64 total)
-{
- u64 scaled;
-
- for (;;) {
- /* Make sure "rtime" is the bigger of stime/rtime */
- if (stime > rtime)
- swap(rtime, stime);
-
- /* Make sure 'total' fits in 32 bits */
- if (total >> 32)
- goto drop_precision;
-
- /* Does rtime (and thus stime) fit in 32 bits? */
- if (!(rtime >> 32))
- break;
-
- /* Can we just balance rtime/stime rather than dropping bits? */
- if (stime >> 31)
- goto drop_precision;
-
- /* We can grow stime and shrink rtime and try to make them both fit */
- stime <<= 1;
- rtime >>= 1;
- continue;
-
-drop_precision:
- /* We drop from rtime, it has more bits than stime */
- rtime >>= 1;
- total >>= 1;
- }
-
- /*
- * Make sure gcc understands that this is a 32x32->64 multiply,
- * followed by a 64/32->64 divide.
- */
- scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
- return scaled;
-}
-
-/*
* Adjust tick based cputime random precision against scheduler runtime
* accounting.
*
@@ -627,7 +578,7 @@
goto update;
}
- stime = scale_stime(stime, rtime, stime + utime);
+ stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
update:
/*
@@ -711,8 +662,8 @@
return delta - other;
}
-static void __vtime_account_system(struct task_struct *tsk,
- struct vtime *vtime)
+static void vtime_account_system(struct task_struct *tsk,
+ struct vtime *vtime)
{
vtime->stime += get_vtime_delta(vtime);
if (vtime->stime >= TICK_NSEC) {
@@ -731,7 +682,17 @@
}
}
-void vtime_account_system(struct task_struct *tsk)
+static void __vtime_account_kernel(struct task_struct *tsk,
+ struct vtime *vtime)
+{
+ /* We might have scheduled out from guest path */
+ if (vtime->state == VTIME_GUEST)
+ vtime_account_guest(tsk, vtime);
+ else
+ vtime_account_system(tsk, vtime);
+}
+
+void vtime_account_kernel(struct task_struct *tsk)
{
struct vtime *vtime = &tsk->vtime;
@@ -739,11 +700,7 @@
return;
write_seqcount_begin(&vtime->seqcount);
- /* We might have scheduled out from guest path */
- if (tsk->flags & PF_VCPU)
- vtime_account_guest(tsk, vtime);
- else
- __vtime_account_system(tsk, vtime);
+ __vtime_account_kernel(tsk, vtime);
write_seqcount_end(&vtime->seqcount);
}
@@ -752,7 +709,7 @@
struct vtime *vtime = &tsk->vtime;
write_seqcount_begin(&vtime->seqcount);
- __vtime_account_system(tsk, vtime);
+ vtime_account_system(tsk, vtime);
vtime->state = VTIME_USER;
write_seqcount_end(&vtime->seqcount);
}
@@ -782,8 +739,9 @@
* that can thus safely catch up with a tickless delta.
*/
write_seqcount_begin(&vtime->seqcount);
- __vtime_account_system(tsk, vtime);
+ vtime_account_system(tsk, vtime);
tsk->flags |= PF_VCPU;
+ vtime->state = VTIME_GUEST;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
@@ -795,6 +753,7 @@
write_seqcount_begin(&vtime->seqcount);
vtime_account_guest(tsk, vtime);
tsk->flags &= ~PF_VCPU;
+ vtime->state = VTIME_SYS;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
@@ -804,19 +763,30 @@
account_idle_time(get_vtime_delta(&tsk->vtime));
}
-void arch_vtime_task_switch(struct task_struct *prev)
+void vtime_task_switch_generic(struct task_struct *prev)
{
struct vtime *vtime = &prev->vtime;
write_seqcount_begin(&vtime->seqcount);
+ if (vtime->state == VTIME_IDLE)
+ vtime_account_idle(prev);
+ else
+ __vtime_account_kernel(prev, vtime);
vtime->state = VTIME_INACTIVE;
+ vtime->cpu = -1;
write_seqcount_end(&vtime->seqcount);
vtime = ¤t->vtime;
write_seqcount_begin(&vtime->seqcount);
- vtime->state = VTIME_SYS;
+ if (is_idle_task(current))
+ vtime->state = VTIME_IDLE;
+ else if (current->flags & PF_VCPU)
+ vtime->state = VTIME_GUEST;
+ else
+ vtime->state = VTIME_SYS;
vtime->starttime = sched_clock();
+ vtime->cpu = smp_processor_id();
write_seqcount_end(&vtime->seqcount);
}
@@ -827,8 +797,9 @@
local_irq_save(flags);
write_seqcount_begin(&vtime->seqcount);
- vtime->state = VTIME_SYS;
+ vtime->state = VTIME_IDLE;
vtime->starttime = sched_clock();
+ vtime->cpu = cpu;
write_seqcount_end(&vtime->seqcount);
local_irq_restore(flags);
}
@@ -846,7 +817,7 @@
seq = read_seqcount_begin(&vtime->seqcount);
gtime = t->gtime;
- if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+ if (vtime->state == VTIME_GUEST)
gtime += vtime->gtime + vtime_delta(vtime);
} while (read_seqcount_retry(&vtime->seqcount, seq));
@@ -877,20 +848,233 @@
*utime = t->utime;
*stime = t->stime;
- /* Task is sleeping, nothing to add */
- if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
+ /* Task is sleeping or idle, nothing to add */
+ if (vtime->state < VTIME_SYS)
continue;
delta = vtime_delta(vtime);
/*
- * Task runs either in user or kernel space, add pending nohz time to
- * the right place.
+ * Task runs either in user (including guest) or kernel space,
+ * add pending nohz time to the right place.
*/
- if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
- *utime += vtime->utime + delta;
- else if (vtime->state == VTIME_SYS)
+ if (vtime->state == VTIME_SYS)
*stime += vtime->stime + delta;
+ else
+ *utime += vtime->utime + delta;
} while (read_seqcount_retry(&vtime->seqcount, seq));
}
+
+static int vtime_state_fetch(struct vtime *vtime, int cpu)
+{
+ int state = READ_ONCE(vtime->state);
+
+ /*
+ * We raced against a context switch, fetch the
+ * kcpustat task again.
+ */
+ if (vtime->cpu != cpu && vtime->cpu != -1)
+ return -EAGAIN;
+
+ /*
+ * Two possible things here:
+ * 1) We are seeing the scheduling out task (prev) or any past one.
+ * 2) We are seeing the scheduling in task (next) but it hasn't
+ * passed though vtime_task_switch() yet so the pending
+ * cputime of the prev task may not be flushed yet.
+ *
+ * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
+ */
+ if (state == VTIME_INACTIVE)
+ return -EAGAIN;
+
+ return state;
+}
+
+static u64 kcpustat_user_vtime(struct vtime *vtime)
+{
+ if (vtime->state == VTIME_USER)
+ return vtime->utime + vtime_delta(vtime);
+ else if (vtime->state == VTIME_GUEST)
+ return vtime->gtime + vtime_delta(vtime);
+ return 0;
+}
+
+static int kcpustat_field_vtime(u64 *cpustat,
+ struct task_struct *tsk,
+ enum cpu_usage_stat usage,
+ int cpu, u64 *val)
+{
+ struct vtime *vtime = &tsk->vtime;
+ unsigned int seq;
+
+ do {
+ int state;
+
+ seq = read_seqcount_begin(&vtime->seqcount);
+
+ state = vtime_state_fetch(vtime, cpu);
+ if (state < 0)
+ return state;
+
+ *val = cpustat[usage];
+
+ /*
+ * Nice VS unnice cputime accounting may be inaccurate if
+ * the nice value has changed since the last vtime update.
+ * But proper fix would involve interrupting target on nice
+ * updates which is a no go on nohz_full (although the scheduler
+ * may still interrupt the target if rescheduling is needed...)
+ */
+ switch (usage) {
+ case CPUTIME_SYSTEM:
+ if (state == VTIME_SYS)
+ *val += vtime->stime + vtime_delta(vtime);
+ break;
+ case CPUTIME_USER:
+ if (task_nice(tsk) <= 0)
+ *val += kcpustat_user_vtime(vtime);
+ break;
+ case CPUTIME_NICE:
+ if (task_nice(tsk) > 0)
+ *val += kcpustat_user_vtime(vtime);
+ break;
+ case CPUTIME_GUEST:
+ if (state == VTIME_GUEST && task_nice(tsk) <= 0)
+ *val += vtime->gtime + vtime_delta(vtime);
+ break;
+ case CPUTIME_GUEST_NICE:
+ if (state == VTIME_GUEST && task_nice(tsk) > 0)
+ *val += vtime->gtime + vtime_delta(vtime);
+ break;
+ default:
+ break;
+ }
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return 0;
+}
+
+u64 kcpustat_field(struct kernel_cpustat *kcpustat,
+ enum cpu_usage_stat usage, int cpu)
+{
+ u64 *cpustat = kcpustat->cpustat;
+ u64 val = cpustat[usage];
+ struct rq *rq;
+ int err;
+
+ if (!vtime_accounting_enabled_cpu(cpu))
+ return val;
+
+ rq = cpu_rq(cpu);
+
+ for (;;) {
+ struct task_struct *curr;
+
+ rcu_read_lock();
+ curr = rcu_dereference(rq->curr);
+ if (WARN_ON_ONCE(!curr)) {
+ rcu_read_unlock();
+ return cpustat[usage];
+ }
+
+ err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
+ rcu_read_unlock();
+
+ if (!err)
+ return val;
+
+ cpu_relax();
+ }
+}
+EXPORT_SYMBOL_GPL(kcpustat_field);
+
+static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
+ const struct kernel_cpustat *src,
+ struct task_struct *tsk, int cpu)
+{
+ struct vtime *vtime = &tsk->vtime;
+ unsigned int seq;
+
+ do {
+ u64 *cpustat;
+ u64 delta;
+ int state;
+
+ seq = read_seqcount_begin(&vtime->seqcount);
+
+ state = vtime_state_fetch(vtime, cpu);
+ if (state < 0)
+ return state;
+
+ *dst = *src;
+ cpustat = dst->cpustat;
+
+ /* Task is sleeping, dead or idle, nothing to add */
+ if (state < VTIME_SYS)
+ continue;
+
+ delta = vtime_delta(vtime);
+
+ /*
+ * Task runs either in user (including guest) or kernel space,
+ * add pending nohz time to the right place.
+ */
+ if (state == VTIME_SYS) {
+ cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
+ } else if (state == VTIME_USER) {
+ if (task_nice(tsk) > 0)
+ cpustat[CPUTIME_NICE] += vtime->utime + delta;
+ else
+ cpustat[CPUTIME_USER] += vtime->utime + delta;
+ } else {
+ WARN_ON_ONCE(state != VTIME_GUEST);
+ if (task_nice(tsk) > 0) {
+ cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
+ cpustat[CPUTIME_NICE] += vtime->gtime + delta;
+ } else {
+ cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
+ cpustat[CPUTIME_USER] += vtime->gtime + delta;
+ }
+ }
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return 0;
+}
+
+void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
+{
+ const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
+ struct rq *rq;
+ int err;
+
+ if (!vtime_accounting_enabled_cpu(cpu)) {
+ *dst = *src;
+ return;
+ }
+
+ rq = cpu_rq(cpu);
+
+ for (;;) {
+ struct task_struct *curr;
+
+ rcu_read_lock();
+ curr = rcu_dereference(rq->curr);
+ if (WARN_ON_ONCE(!curr)) {
+ rcu_read_unlock();
+ *dst = *src;
+ return;
+ }
+
+ err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
+ rcu_read_unlock();
+
+ if (!err)
+ return;
+
+ cpu_relax();
+ }
+}
+EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
+
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 2bda9fd..a3ae00c 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -43,6 +43,28 @@
return !RB_EMPTY_NODE(&dl_se->rb_node);
}
+#ifdef CONFIG_RT_MUTEXES
+static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
+{
+ return dl_se->pi_se;
+}
+
+static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
+{
+ return pi_of(dl_se) != dl_se;
+}
+#else
+static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
+{
+ return dl_se;
+}
+
+static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
+{
+ return false;
+}
+#endif
+
#ifdef CONFIG_SMP
static inline struct dl_bw *dl_bw_of(int i)
{
@@ -54,15 +76,49 @@
static inline int dl_bw_cpus(int i)
{
struct root_domain *rd = cpu_rq(i)->rd;
- int cpus = 0;
+ int cpus;
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
"sched RCU must be held");
+
+ if (cpumask_subset(rd->span, cpu_active_mask))
+ return cpumask_weight(rd->span);
+
+ cpus = 0;
+
for_each_cpu_and(i, rd->span, cpu_active_mask)
cpus++;
return cpus;
}
+
+static inline unsigned long __dl_bw_capacity(int i)
+{
+ struct root_domain *rd = cpu_rq(i)->rd;
+ unsigned long cap = 0;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+
+ for_each_cpu_and(i, rd->span, cpu_active_mask)
+ cap += capacity_orig_of(i);
+
+ return cap;
+}
+
+/*
+ * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity
+ * of the CPU the task is running on rather rd's \Sum CPU capacity.
+ */
+static inline unsigned long dl_bw_capacity(int i)
+{
+ if (!static_branch_unlikely(&sched_asym_cpucapacity) &&
+ capacity_orig_of(i) == SCHED_CAPACITY_SCALE) {
+ return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
+ } else {
+ return __dl_bw_capacity(i);
+ }
+}
#else
static inline struct dl_bw *dl_bw_of(int i)
{
@@ -73,6 +129,11 @@
{
return 1;
}
+
+static inline unsigned long dl_bw_capacity(int i)
+{
+ return SCHED_CAPACITY_SCALE;
+}
#endif
static inline
@@ -153,7 +214,7 @@
__sub_running_bw(dl_se->dl_bw, dl_rq);
}
-void dl_change_utilization(struct task_struct *p, u64 new_bw)
+static void dl_change_utilization(struct task_struct *p, u64 new_bw)
{
struct rq *rq;
@@ -334,6 +395,8 @@
return dl_rq->root.rb_leftmost == &dl_se->rb_node;
}
+static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
+
void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
{
raw_spin_lock_init(&dl_b->dl_runtime_lock);
@@ -657,7 +720,7 @@
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- WARN_ON(dl_se->dl_boosted);
+ WARN_ON(is_dl_boosted(dl_se));
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
/*
@@ -695,21 +758,20 @@
* could happen are, typically, a entity voluntarily trying to overcome its
* runtime, or it just underestimated it during sched_setattr().
*/
-static void replenish_dl_entity(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se)
+static void replenish_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- BUG_ON(pi_se->dl_runtime <= 0);
+ BUG_ON(pi_of(dl_se)->dl_runtime <= 0);
/*
* This could be the case for a !-dl task that is boosted.
* Just go with full inherited parameters.
*/
if (dl_se->dl_deadline == 0) {
- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
- dl_se->runtime = pi_se->dl_runtime;
+ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+ dl_se->runtime = pi_of(dl_se)->dl_runtime;
}
if (dl_se->dl_yielded && dl_se->runtime > 0)
@@ -722,8 +784,8 @@
* arbitrary large.
*/
while (dl_se->runtime <= 0) {
- dl_se->deadline += pi_se->dl_period;
- dl_se->runtime += pi_se->dl_runtime;
+ dl_se->deadline += pi_of(dl_se)->dl_period;
+ dl_se->runtime += pi_of(dl_se)->dl_runtime;
}
/*
@@ -737,8 +799,8 @@
*/
if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
printk_deferred_once("sched: DL replenish lagged too much\n");
- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
- dl_se->runtime = pi_se->dl_runtime;
+ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+ dl_se->runtime = pi_of(dl_se)->dl_runtime;
}
if (dl_se->dl_yielded)
@@ -771,8 +833,7 @@
* task with deadline equal to period this is the same of using
* dl_period instead of dl_deadline in the equation above.
*/
-static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se, u64 t)
+static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t)
{
u64 left, right;
@@ -794,9 +855,9 @@
* of anything below microseconds resolution is actually fiction
* (but still we want to give the user that illusion >;).
*/
- left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+ left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
right = ((dl_se->deadline - t) >> DL_SCALE) *
- (pi_se->dl_runtime >> DL_SCALE);
+ (pi_of(dl_se)->dl_runtime >> DL_SCALE);
return dl_time_before(right, left);
}
@@ -881,24 +942,23 @@
* Please refer to the comments update_dl_revised_wakeup() function to find
* more about the Revised CBS rule.
*/
-static void update_dl_entity(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se)
+static void update_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
- dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
+ dl_entity_overflow(dl_se, rq_clock(rq))) {
if (unlikely(!dl_is_implicit(dl_se) &&
!dl_time_before(dl_se->deadline, rq_clock(rq)) &&
- !dl_se->dl_boosted)){
+ !is_dl_boosted(dl_se))) {
update_dl_revised_wakeup(dl_se, rq);
return;
}
- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
- dl_se->runtime = pi_se->dl_runtime;
+ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+ dl_se->runtime = pi_of(dl_se)->dl_runtime;
}
}
@@ -997,7 +1057,7 @@
* The task might have been boosted by someone else and might be in the
* boosting/deboosting path, its not throttled.
*/
- if (dl_se->dl_boosted)
+ if (is_dl_boosted(dl_se))
goto unlock;
/*
@@ -1025,7 +1085,7 @@
* but do not enqueue -- wait for our wakeup to do that.
*/
if (!task_on_rq_queued(p)) {
- replenish_dl_entity(dl_se, dl_se);
+ replenish_dl_entity(dl_se);
goto unlock;
}
@@ -1096,7 +1156,7 @@
* cannot use the runtime, and so it replenishes the task. This rule
* works fine for implicit deadline tasks (deadline == period), and the
* CBS was designed for implicit deadline tasks. However, a task with
- * constrained deadline (deadine < period) might be awakened after the
+ * constrained deadline (deadline < period) might be awakened after the
* deadline, but before the next period. In this case, replenishing the
* task would allow it to run for runtime / deadline. As in this case
* deadline < period, CBS enables a task to run for more than the
@@ -1115,7 +1175,7 @@
if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
- if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
+ if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p)))
return;
dl_se->dl_throttled = 1;
if (dl_se->runtime > 0)
@@ -1246,7 +1306,7 @@
dl_se->dl_overrun = 1;
__dequeue_task_dl(rq, curr, 0);
- if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
+ if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr)))
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
if (!is_leftmost(curr, &rq->dl))
@@ -1440,8 +1500,7 @@
}
static void
-enqueue_dl_entity(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se, int flags)
+enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
{
BUG_ON(on_dl_rq(dl_se));
@@ -1452,9 +1511,9 @@
*/
if (flags & ENQUEUE_WAKEUP) {
task_contending(dl_se, flags);
- update_dl_entity(dl_se, pi_se);
+ update_dl_entity(dl_se);
} else if (flags & ENQUEUE_REPLENISH) {
- replenish_dl_entity(dl_se, pi_se);
+ replenish_dl_entity(dl_se);
} else if ((flags & ENQUEUE_RESTORE) &&
dl_time_before(dl_se->deadline,
rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
@@ -1471,28 +1530,40 @@
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
- struct sched_dl_entity *pi_se = &p->dl;
-
- /*
- * Use the scheduling parameters of the top pi-waiter task if:
- * - we have a top pi-waiter which is a SCHED_DEADLINE task AND
- * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is
- * smaller than our deadline OR we are a !SCHED_DEADLINE task getting
- * boosted due to a SCHED_DEADLINE pi-waiter).
- * Otherwise we keep our runtime and deadline.
- */
- if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
- pi_se = &pi_task->dl;
+ if (is_dl_boosted(&p->dl)) {
+ /*
+ * Because of delays in the detection of the overrun of a
+ * thread's runtime, it might be the case that a thread
+ * goes to sleep in a rt mutex with negative runtime. As
+ * a consequence, the thread will be throttled.
+ *
+ * While waiting for the mutex, this thread can also be
+ * boosted via PI, resulting in a thread that is throttled
+ * and boosted at the same time.
+ *
+ * In this case, the boost overrides the throttle.
+ */
+ if (p->dl.dl_throttled) {
+ /*
+ * The replenish timer needs to be canceled. No
+ * problem if it fires concurrently: boosted threads
+ * are ignored in dl_task_timer().
+ */
+ hrtimer_try_to_cancel(&p->dl.dl_timer);
+ p->dl.dl_throttled = 0;
+ }
} else if (!dl_prio(p->normal_prio)) {
/*
- * Special case in which we have a !SCHED_DEADLINE task
- * that is going to be deboosted, but exceeds its
- * runtime while doing so. No point in replenishing
- * it, as it's going to return back to its original
- * scheduling class after this.
+ * Special case in which we have a !SCHED_DEADLINE task that is going
+ * to be deboosted, but exceeds its runtime while doing so. No point in
+ * replenishing it, as it's going to return back to its original
+ * scheduling class after this. If it has been throttled, we need to
+ * clear the flag, otherwise the task may wake up as throttled after
+ * being boosted again with no means to replenish the runtime and clear
+ * the throttle.
*/
- BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
+ p->dl.dl_throttled = 0;
+ BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH);
return;
}
@@ -1529,7 +1600,7 @@
return;
}
- enqueue_dl_entity(&p->dl, pi_se, flags);
+ enqueue_dl_entity(&p->dl, flags);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
@@ -1602,6 +1673,7 @@
select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
{
struct task_struct *curr;
+ bool select_rq;
struct rq *rq;
if (sd_flag != SD_BALANCE_WAKE)
@@ -1621,10 +1693,19 @@
* other hand, if it has a shorter deadline, we
* try to make it stay here, it might be important.
*/
- if (unlikely(dl_task(curr)) &&
- (curr->nr_cpus_allowed < 2 ||
- !dl_entity_preempt(&p->dl, &curr->dl)) &&
- (p->nr_cpus_allowed > 1)) {
+ select_rq = unlikely(dl_task(curr)) &&
+ (curr->nr_cpus_allowed < 2 ||
+ !dl_entity_preempt(&p->dl, &curr->dl)) &&
+ p->nr_cpus_allowed > 1;
+
+ /*
+ * Take the capacity of the CPU into account to
+ * ensure it fits the requirement of the task.
+ */
+ if (static_branch_unlikely(&sched_asym_cpucapacity))
+ select_rq |= !dl_task_fits_capacity(p, cpu);
+
+ if (select_rq) {
int target = find_later_rq(p);
if (target != -1 &&
@@ -1774,15 +1855,12 @@
return rb_entry(left, struct sched_dl_entity, rb_node);
}
-static struct task_struct *
-pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static struct task_struct *pick_next_task_dl(struct rq *rq)
{
struct sched_dl_entity *dl_se;
struct dl_rq *dl_rq = &rq->dl;
struct task_struct *p;
- WARN_ON_ONCE(prev || rf);
-
if (!sched_dl_runnable(rq))
return NULL;
@@ -2434,8 +2512,8 @@
}
}
-const struct sched_class dl_sched_class = {
- .next = &rt_sched_class,
+const struct sched_class dl_sched_class
+ __section("__dl_sched_class") = {
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
@@ -2503,7 +2581,7 @@
return ret;
}
-void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
+static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
{
if (global_rt_runtime() == RUNTIME_INF) {
dl_rq->bw_ratio = 1 << RATIO_SHIFT;
@@ -2556,11 +2634,12 @@
int sched_dl_overflow(struct task_struct *p, int policy,
const struct sched_attr *attr)
{
- struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
u64 period = attr->sched_period ?: attr->sched_deadline;
u64 runtime = attr->sched_runtime;
u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
- int cpus, err = -1;
+ int cpus, err = -1, cpu = task_cpu(p);
+ struct dl_bw *dl_b = dl_bw_of(cpu);
+ unsigned long cap;
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return 0;
@@ -2575,15 +2654,17 @@
* allocated bandwidth of the container.
*/
raw_spin_lock(&dl_b->lock);
- cpus = dl_bw_cpus(task_cpu(p));
+ cpus = dl_bw_cpus(cpu);
+ cap = dl_bw_capacity(cpu);
+
if (dl_policy(policy) && !task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+ !__dl_overflow(dl_b, cap, 0, new_bw)) {
if (hrtimer_active(&p->dl.inactive_timer))
__dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
err = 0;
} else if (dl_policy(policy) && task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+ !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) {
/*
* XXX this is slightly incorrect: when the task
* utilization decreases, we should delay the total
@@ -2641,6 +2722,14 @@
}
/*
+ * Default limits for DL period; on the top end we guard against small util
+ * tasks still getting rediculous long effective runtimes, on the bottom end we
+ * guard against timer DoS.
+ */
+unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
+unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */
+
+/*
* This function validates the new parameters of a -deadline task.
* We ask for the deadline not being zero, and greater or equal
* than the runtime, as well as the period of being zero or
@@ -2652,6 +2741,8 @@
*/
bool __checkparam_dl(const struct sched_attr *attr)
{
+ u64 period, max, min;
+
/* special dl tasks don't actually use any parameter */
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return true;
@@ -2675,12 +2766,21 @@
attr->sched_period & (1ULL << 63))
return false;
+ period = attr->sched_period;
+ if (!period)
+ period = attr->sched_deadline;
+
/* runtime <= deadline <= period (if period != 0) */
- if ((attr->sched_period != 0 &&
- attr->sched_period < attr->sched_deadline) ||
+ if (period < attr->sched_deadline ||
attr->sched_deadline < attr->sched_runtime)
return false;
+ max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC;
+ min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC;
+
+ if (period < min || period > max)
+ return false;
+
return true;
}
@@ -2698,11 +2798,14 @@
dl_se->dl_bw = 0;
dl_se->dl_density = 0;
- dl_se->dl_boosted = 0;
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0;
dl_se->dl_overrun = 0;
+
+#ifdef CONFIG_RT_MUTEXES
+ dl_se->pi_se = dl_se;
+#endif
}
bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
@@ -2721,19 +2824,19 @@
#ifdef CONFIG_SMP
int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
{
+ unsigned long flags, cap;
unsigned int dest_cpu;
struct dl_bw *dl_b;
bool overflow;
- int cpus, ret;
- unsigned long flags;
+ int ret;
dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
rcu_read_lock_sched();
dl_b = dl_bw_of(dest_cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(dest_cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+ cap = dl_bw_capacity(dest_cpu);
+ overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw);
if (overflow) {
ret = -EBUSY;
} else {
@@ -2743,6 +2846,8 @@
* We will free resources in the source root_domain
* later on (see set_cpus_allowed_dl()).
*/
+ int cpus = dl_bw_cpus(dest_cpu);
+
__dl_add(dl_b, p->dl.dl_bw, cpus);
ret = 0;
}
@@ -2775,16 +2880,15 @@
bool dl_cpu_busy(unsigned int cpu)
{
- unsigned long flags;
+ unsigned long flags, cap;
struct dl_bw *dl_b;
bool overflow;
- int cpus;
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, 0);
+ cap = dl_bw_capacity(cpu);
+ overflow = __dl_overflow(dl_b, cap, 0, 0);
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index faada71..70a5782 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -243,6 +243,60 @@
entry->proc_handler = proc_handler;
}
+static int sd_ctl_doflags(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ unsigned long flags = *(unsigned long *)table->data;
+ size_t data_size = 0;
+ size_t len = 0;
+ char *tmp, *buf;
+ int idx;
+
+ if (write)
+ return 0;
+
+ for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+ char *name = sd_flag_debug[idx].name;
+
+ /* Name plus whitespace */
+ data_size += strlen(name) + 1;
+ }
+
+ if (*ppos > data_size) {
+ *lenp = 0;
+ return 0;
+ }
+
+ buf = kcalloc(data_size + 1, sizeof(*buf), GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+ char *name = sd_flag_debug[idx].name;
+
+ len += snprintf(buf + len, strlen(name) + 2, "%s ", name);
+ }
+
+ tmp = buf + *ppos;
+ len -= *ppos;
+
+ if (len > *lenp)
+ len = *lenp;
+ if (len)
+ memcpy(buffer, tmp, len);
+ if (len < *lenp) {
+ ((char *)buffer)[len] = '\n';
+ len++;
+ }
+
+ *lenp = len;
+ *ppos += len;
+
+ kfree(buf);
+
+ return 0;
+}
+
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
@@ -256,7 +310,7 @@
set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax);
+ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, sd_ctl_doflags);
set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
/* &table[8] is terminator */
@@ -400,11 +454,10 @@
}
P(se->load.weight);
- P(se->runnable_weight);
#ifdef CONFIG_SMP
P(se->avg.load_avg);
P(se->avg.util_avg);
- P(se->avg.runnable_load_avg);
+ P(se->avg.runnable_avg);
#endif
#undef PN_SCHEDSTAT
@@ -457,7 +510,7 @@
else
SEQ_printf(m, " %c", task_state_to_char(p));
- SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
+ SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
(long long)(p->nvcsw + p->nivcsw),
@@ -484,10 +537,10 @@
SEQ_printf(m, "\n");
SEQ_printf(m, "runnable tasks:\n");
- SEQ_printf(m, " S task PID tree-key switches prio"
+ SEQ_printf(m, " S task PID tree-key switches prio"
" wait-time sum-exec sum-sleep\n");
SEQ_printf(m, "-------------------------------------------------------"
- "----------------------------------------------------\n");
+ "------------------------------------------------------\n");
rcu_read_lock();
for_each_process_thread(g, p) {
@@ -543,11 +596,10 @@
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight);
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
cfs_rq->avg.load_avg);
- SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
- cfs_rq->avg.runnable_load_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg",
+ cfs_rq->avg.runnable_avg);
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg);
SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
@@ -556,8 +608,8 @@
cfs_rq->removed.load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
cfs_rq->removed.util_avg);
- SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum",
- cfs_rq->removed.runnable_sum);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_avg",
+ cfs_rq->removed.runnable_avg);
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
cfs_rq->tg_load_avg_contrib);
@@ -658,7 +710,6 @@
P(nr_running);
P(nr_switches);
- P(nr_load_updates);
P(nr_uninterruptible);
PN(next_balance);
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
@@ -767,9 +818,16 @@
int cpu;
sched_debug_header(NULL);
- for_each_online_cpu(cpu)
+ for_each_online_cpu(cpu) {
+ /*
+ * Need to reset softlockup watchdogs on all CPUs, because
+ * another CPU might be blocked waiting for us to process
+ * an IPI or stop_machine.
+ */
+ touch_nmi_watchdog();
+ touch_all_softlockup_watchdogs();
print_cpu(NULL, cpu);
-
+ }
}
/*
@@ -827,10 +885,13 @@
__initcall(init_sched_debug_procfs);
-#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
-#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
-#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
-#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+#define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
+#define __P(F) __PS(#F, F)
+#define P(F) __PS(#F, p->F)
+#define PM(F, M) __PS(#F, p->F & (M))
+#define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
+#define __PN(F) __PSN(#F, F)
+#define PN(F) __PSN(#F, p->F)
#ifdef CONFIG_NUMA_BALANCING
@@ -879,18 +940,9 @@
SEQ_printf(m,
"---------------------------------------------------------"
"----------\n");
-#define __P(F) \
- SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
-#define P(F) \
- SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
-#define P_SCHEDSTAT(F) \
- SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
-#define __PN(F) \
- SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
-#define PN(F) \
- SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
-#define PN_SCHEDSTAT(F) \
- SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
+
+#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->F))
+#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->F))
PN(se.exec_start);
PN(se.vruntime);
@@ -950,23 +1002,26 @@
}
__P(nr_switches);
- SEQ_printf(m, "%-45s:%21Ld\n",
- "nr_voluntary_switches", (long long)p->nvcsw);
- SEQ_printf(m, "%-45s:%21Ld\n",
- "nr_involuntary_switches", (long long)p->nivcsw);
+ __PS("nr_voluntary_switches", p->nvcsw);
+ __PS("nr_involuntary_switches", p->nivcsw);
P(se.load.weight);
- P(se.runnable_weight);
#ifdef CONFIG_SMP
P(se.avg.load_sum);
- P(se.avg.runnable_load_sum);
+ P(se.avg.runnable_sum);
P(se.avg.util_sum);
P(se.avg.load_avg);
- P(se.avg.runnable_load_avg);
+ P(se.avg.runnable_avg);
P(se.avg.util_avg);
P(se.avg.last_update_time);
P(se.avg.util_est.ewma);
- P(se.avg.util_est.enqueued);
+ PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
+#endif
+#ifdef CONFIG_UCLAMP_TASK
+ __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
+ __PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value);
+ __PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN));
+ __PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX));
#endif
P(policy);
P(prio);
@@ -975,11 +1030,7 @@
P(dl.deadline);
}
#undef PN_SCHEDSTAT
-#undef PN
-#undef __PN
#undef P_SCHEDSTAT
-#undef P
-#undef __P
{
unsigned int this_cpu = raw_smp_processor_id();
@@ -987,8 +1038,7 @@
t0 = cpu_clock(this_cpu);
t1 = cpu_clock(this_cpu);
- SEQ_printf(m, "%-45s:%21Ld\n",
- "clock-delta", (long long)(t1-t0));
+ __PS("clock-delta", t1-t0);
}
sched_show_numa(p, m);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 87d9fad..acd9833 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -22,8 +22,6 @@
*/
#include "sched.h"
-#include <trace/events/sched.h>
-
/*
* Targeted preemption latency for CPU-bound tasks:
*
@@ -86,6 +84,19 @@
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+int sched_thermal_decay_shift;
+static int __init setup_sched_thermal_decay_shift(char *str)
+{
+ int _shift = 0;
+
+ if (kstrtoint(str, 0, &_shift))
+ pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
+
+ sched_thermal_decay_shift = clamp(_shift, 0, 10);
+ return 1;
+}
+__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
+
#ifdef CONFIG_SMP
/*
* For asym packing, by default the lower numbered CPU has higher priority.
@@ -178,7 +189,7 @@
#undef SET_SYSCTL
}
-void sched_init_granularity(void)
+void __init sched_init_granularity(void)
{
update_sysctl();
}
@@ -229,8 +240,7 @@
}
}
- /* hint to use a 32x32->64 mul */
- fact = (u64)(u32)fact * lw->inv_weight;
+ fact = mul_u32_u32(fact, lw->inv_weight);
while (fact >> 32) {
fact >>= 1;
@@ -633,8 +643,7 @@
*/
int sched_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
unsigned int factor = get_update_sysctl_factor();
@@ -691,7 +700,13 @@
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
+ unsigned int nr_running = cfs_rq->nr_running;
+ u64 slice;
+
+ if (sched_feat(ALT_PERIOD))
+ nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
+
+ slice = __sched_period(nr_running + !se->on_rq);
for_each_sched_entity(se) {
struct load_weight *load;
@@ -708,6 +723,10 @@
}
slice = __calc_delta(slice, se->load.weight, load);
}
+
+ if (sched_feat(BASE_SLICE))
+ slice = max(slice, (u64)sysctl_sched_min_granularity);
+
return slice;
}
@@ -742,9 +761,7 @@
* nothing has been attached to the task group yet.
*/
if (entity_is_task(se))
- sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
-
- se->runnable_weight = se->load.weight;
+ sa->load_avg = scale_load_down(se->load.weight);
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
@@ -797,12 +814,14 @@
}
}
+ sa->runnable_avg = sa->util_avg;
+
if (p->sched_class != &fair_sched_class) {
/*
* For !fair tasks do:
*
update_cfs_rq_load_avg(now, cfs_rq);
- attach_entity_load_avg(cfs_rq, se, 0);
+ attach_entity_load_avg(cfs_rq, se);
switched_from_fair(rq, p);
*
* such that the next switched_to_fair() has the
@@ -822,7 +841,7 @@
void post_init_entity_util_avg(struct task_struct *p)
{
}
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_SMP */
@@ -1082,7 +1101,7 @@
* more by CPU use than by memory faults.
*/
unsigned long *faults_cpu;
- unsigned long faults[0];
+ unsigned long faults[];
};
/*
@@ -1474,31 +1493,52 @@
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static unsigned long cpu_runnable_load(struct rq *rq);
+/*
+ * 'numa_type' describes the node at the moment of load balancing.
+ */
+enum numa_type {
+ /* The node has spare capacity that can be used to run more tasks. */
+ node_has_spare = 0,
+ /*
+ * The node is fully used and the tasks don't compete for more CPU
+ * cycles. Nevertheless, some tasks might wait before running.
+ */
+ node_fully_busy,
+ /*
+ * The node is overloaded and can't provide expected CPU cycles to all
+ * tasks.
+ */
+ node_overloaded
+};
/* Cached statistics for all CPUs within a node */
struct numa_stats {
unsigned long load;
-
+ unsigned long runnable;
+ unsigned long util;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
+ unsigned int nr_running;
+ unsigned int weight;
+ enum numa_type node_type;
+ int idle_cpu;
};
-/*
- * XXX borrowed from update_sg_lb_stats
- */
-static void update_numa_stats(struct numa_stats *ns, int nid)
+static inline bool is_core_idle(int cpu)
{
- int cpu;
+#ifdef CONFIG_SCHED_SMT
+ int sibling;
- memset(ns, 0, sizeof(*ns));
- for_each_cpu(cpu, cpumask_of_node(nid)) {
- struct rq *rq = cpu_rq(cpu);
+ for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+ if (cpu == sibling)
+ continue;
- ns->load += cpu_runnable_load(rq);
- ns->compute_capacity += capacity_of(cpu);
+ if (!idle_cpu(sibling))
+ return false;
}
+#endif
+ return true;
}
struct task_numa_env {
@@ -1517,20 +1557,132 @@
int best_cpu;
};
+static unsigned long cpu_load(struct rq *rq);
+static unsigned long cpu_runnable(struct rq *rq);
+static unsigned long cpu_util(int cpu);
+static inline long adjust_numa_imbalance(int imbalance, int nr_running);
+
+static inline enum
+numa_type numa_classify(unsigned int imbalance_pct,
+ struct numa_stats *ns)
+{
+ if ((ns->nr_running > ns->weight) &&
+ (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
+ ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
+ return node_overloaded;
+
+ if ((ns->nr_running < ns->weight) ||
+ (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
+ ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
+ return node_has_spare;
+
+ return node_fully_busy;
+}
+
+#ifdef CONFIG_SCHED_SMT
+/* Forward declarations of select_idle_sibling helpers */
+static inline bool test_idle_cores(int cpu, bool def);
+static inline int numa_idle_core(int idle_core, int cpu)
+{
+ if (!static_branch_likely(&sched_smt_present) ||
+ idle_core >= 0 || !test_idle_cores(cpu, false))
+ return idle_core;
+
+ /*
+ * Prefer cores instead of packing HT siblings
+ * and triggering future load balancing.
+ */
+ if (is_core_idle(cpu))
+ idle_core = cpu;
+
+ return idle_core;
+}
+#else
+static inline int numa_idle_core(int idle_core, int cpu)
+{
+ return idle_core;
+}
+#endif
+
+/*
+ * Gather all necessary information to make NUMA balancing placement
+ * decisions that are compatible with standard load balancer. This
+ * borrows code and logic from update_sg_lb_stats but sharing a
+ * common implementation is impractical.
+ */
+static void update_numa_stats(struct task_numa_env *env,
+ struct numa_stats *ns, int nid,
+ bool find_idle)
+{
+ int cpu, idle_core = -1;
+
+ memset(ns, 0, sizeof(*ns));
+ ns->idle_cpu = -1;
+
+ rcu_read_lock();
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
+ struct rq *rq = cpu_rq(cpu);
+
+ ns->load += cpu_load(rq);
+ ns->runnable += cpu_runnable(rq);
+ ns->util += cpu_util(cpu);
+ ns->nr_running += rq->cfs.h_nr_running;
+ ns->compute_capacity += capacity_of(cpu);
+
+ if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
+ if (READ_ONCE(rq->numa_migrate_on) ||
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr))
+ continue;
+
+ if (ns->idle_cpu == -1)
+ ns->idle_cpu = cpu;
+
+ idle_core = numa_idle_core(idle_core, cpu);
+ }
+ }
+ rcu_read_unlock();
+
+ ns->weight = cpumask_weight(cpumask_of_node(nid));
+
+ ns->node_type = numa_classify(env->imbalance_pct, ns);
+
+ if (idle_core >= 0)
+ ns->idle_cpu = idle_core;
+}
+
static void task_numa_assign(struct task_numa_env *env,
struct task_struct *p, long imp)
{
struct rq *rq = cpu_rq(env->dst_cpu);
- /* Bail out if run-queue part of active NUMA balance. */
- if (xchg(&rq->numa_migrate_on, 1))
- return;
+ /* Check if run-queue part of active NUMA balance. */
+ if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
+ int cpu;
+ int start = env->dst_cpu;
+ /* Find alternative idle CPU. */
+ for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
+ if (cpu == env->best_cpu || !idle_cpu(cpu) ||
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
+ continue;
+ }
+
+ env->dst_cpu = cpu;
+ rq = cpu_rq(env->dst_cpu);
+ if (!xchg(&rq->numa_migrate_on, 1))
+ goto assign;
+ }
+
+ /* Failed to find an alternative idle CPU */
+ return;
+ }
+
+assign:
/*
* Clear previous best_cpu/rq numa-migrate flag, since task now
* found a better CPU to move/swap.
*/
- if (env->best_cpu != -1) {
+ if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
rq = cpu_rq(env->best_cpu);
WRITE_ONCE(rq->numa_migrate_on, 0);
}
@@ -1586,7 +1738,7 @@
* into account that it might be best if task running on the dst_cpu should
* be exchanged with the source task
*/
-static void task_numa_compare(struct task_numa_env *env,
+static bool task_numa_compare(struct task_numa_env *env,
long taskimp, long groupimp, bool maymove)
{
struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
@@ -1597,9 +1749,10 @@
int dist = env->dist;
long moveimp = imp;
long load;
+ bool stopsearch = false;
if (READ_ONCE(dst_rq->numa_migrate_on))
- return;
+ return false;
rcu_read_lock();
cur = rcu_dereference(dst_rq->curr);
@@ -1610,8 +1763,10 @@
* Because we have preemption enabled we can get migrated around and
* end try selecting ourselves (current == env->p) as a swap candidate.
*/
- if (cur == env->p)
+ if (cur == env->p) {
+ stopsearch = true;
goto unlock;
+ }
if (!cur) {
if (maymove && moveimp >= env->best_imp)
@@ -1620,18 +1775,27 @@
goto unlock;
}
+ /* Skip this swap candidate if cannot move to the source cpu. */
+ if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
+ goto unlock;
+
+ /*
+ * Skip this swap candidate if it is not moving to its preferred
+ * node and the best task is.
+ */
+ if (env->best_task &&
+ env->best_task->numa_preferred_nid == env->src_nid &&
+ cur->numa_preferred_nid != env->src_nid) {
+ goto unlock;
+ }
+
/*
* "imp" is the fault differential for the source task between the
* source and destination node. Calculate the total differential for
* the source task and potential destination task. The more negative
* the value is, the more remote accesses that would be expected to
* be incurred if the tasks were swapped.
- */
- /* Skip this swap candidate if cannot move to the source cpu */
- if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
- goto unlock;
-
- /*
+ *
* If dst and source tasks are in the same NUMA group, or not
* in any group then look only at task weights.
*/
@@ -1658,6 +1822,19 @@
task_weight(cur, env->dst_nid, dist);
}
+ /* Discourage picking a task already on its preferred node */
+ if (cur->numa_preferred_nid == env->dst_nid)
+ imp -= imp / 16;
+
+ /*
+ * Encourage picking a task that moves to its preferred node.
+ * This potentially makes imp larger than it's maximum of
+ * 1998 (see SMALLIMP and task_weight for why) but in this
+ * case, it does not matter.
+ */
+ if (cur->numa_preferred_nid == env->src_nid)
+ imp += imp / 8;
+
if (maymove && moveimp > imp && moveimp > env->best_imp) {
imp = moveimp;
cur = NULL;
@@ -1665,6 +1842,15 @@
}
/*
+ * Prefer swapping with a task moving to its preferred node over a
+ * task that is not.
+ */
+ if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
+ env->best_task->numa_preferred_nid != env->src_nid) {
+ goto assign;
+ }
+
+ /*
* If the NUMA importance is less than SMALLIMP,
* task migration might only result in ping pong
* of tasks and also hurt performance due to cache
@@ -1687,42 +1873,95 @@
goto unlock;
assign:
- /*
- * One idle CPU per node is evaluated for a task numa move.
- * Call select_idle_sibling to maybe find a better one.
- */
+ /* Evaluate an idle CPU for a task numa move. */
if (!cur) {
+ int cpu = env->dst_stats.idle_cpu;
+
+ /* Nothing cached so current CPU went idle since the search. */
+ if (cpu < 0)
+ cpu = env->dst_cpu;
+
/*
- * select_idle_siblings() uses an per-CPU cpumask that
- * can be used from IRQ context.
+ * If the CPU is no longer truly idle and the previous best CPU
+ * is, keep using it.
*/
- local_irq_disable();
- env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
- env->dst_cpu);
- local_irq_enable();
+ if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
+ idle_cpu(env->best_cpu)) {
+ cpu = env->best_cpu;
+ }
+
+ env->dst_cpu = cpu;
}
task_numa_assign(env, cur, imp);
+
+ /*
+ * If a move to idle is allowed because there is capacity or load
+ * balance improves then stop the search. While a better swap
+ * candidate may exist, a search is not free.
+ */
+ if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
+ stopsearch = true;
+
+ /*
+ * If a swap candidate must be identified and the current best task
+ * moves its preferred node then stop the search.
+ */
+ if (!maymove && env->best_task &&
+ env->best_task->numa_preferred_nid == env->src_nid) {
+ stopsearch = true;
+ }
unlock:
rcu_read_unlock();
+
+ return stopsearch;
}
static void task_numa_find_cpu(struct task_numa_env *env,
long taskimp, long groupimp)
{
- long src_load, dst_load, load;
bool maymove = false;
int cpu;
- load = task_h_load(env->p);
- dst_load = env->dst_stats.load + load;
- src_load = env->src_stats.load - load;
-
/*
- * If the improvement from just moving env->p direction is better
- * than swapping tasks around, check if a move is possible.
+ * If dst node has spare capacity, then check if there is an
+ * imbalance that would be overruled by the load balancer.
*/
- maymove = !load_too_imbalanced(src_load, dst_load, env);
+ if (env->dst_stats.node_type == node_has_spare) {
+ unsigned int imbalance;
+ int src_running, dst_running;
+
+ /*
+ * Would movement cause an imbalance? Note that if src has
+ * more running tasks that the imbalance is ignored as the
+ * move improves the imbalance from the perspective of the
+ * CPU load balancer.
+ * */
+ src_running = env->src_stats.nr_running - 1;
+ dst_running = env->dst_stats.nr_running + 1;
+ imbalance = max(0, dst_running - src_running);
+ imbalance = adjust_numa_imbalance(imbalance, dst_running);
+
+ /* Use idle CPU if there is no imbalance */
+ if (!imbalance) {
+ maymove = true;
+ if (env->dst_stats.idle_cpu >= 0) {
+ env->dst_cpu = env->dst_stats.idle_cpu;
+ task_numa_assign(env, NULL, 0);
+ return;
+ }
+ }
+ } else {
+ long src_load, dst_load, load;
+ /*
+ * If the improvement from just moving env->p direction is better
+ * than swapping tasks around, check if a move is possible.
+ */
+ load = task_h_load(env->p);
+ dst_load = env->dst_stats.load + load;
+ src_load = env->src_stats.load - load;
+ maymove = !load_too_imbalanced(src_load, dst_load, env);
+ }
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
/* Skip this CPU if the source task cannot migrate */
@@ -1730,7 +1969,8 @@
continue;
env->dst_cpu = cpu;
- task_numa_compare(env, taskimp, groupimp, maymove);
+ if (task_numa_compare(env, taskimp, groupimp, maymove))
+ break;
}
}
@@ -1784,10 +2024,10 @@
dist = env.dist = node_distance(env.src_nid, env.dst_nid);
taskweight = task_weight(p, env.src_nid, dist);
groupweight = group_weight(p, env.src_nid, dist);
- update_numa_stats(&env.src_stats, env.src_nid);
+ update_numa_stats(&env, &env.src_stats, env.src_nid, false);
taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
- update_numa_stats(&env.dst_stats, env.dst_nid);
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
/* Try to find a spot on the preferred nid. */
task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1820,7 +2060,7 @@
env.dist = dist;
env.dst_nid = nid;
- update_numa_stats(&env.dst_stats, env.dst_nid);
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
task_numa_find_cpu(&env, taskimp, groupimp);
}
}
@@ -1844,15 +2084,17 @@
}
/* No better CPU than the current one was found. */
- if (env.best_cpu == -1)
+ if (env.best_cpu == -1) {
+ trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
return -EAGAIN;
+ }
best_rq = cpu_rq(env.best_cpu);
if (env.best_task == NULL) {
ret = migrate_task_to(p, env.best_cpu);
WRITE_ONCE(best_rq->numa_migrate_on, 0);
if (ret != 0)
- trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
+ trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
return ret;
}
@@ -1860,7 +2102,7 @@
WRITE_ONCE(best_rq->numa_migrate_on, 0);
if (ret != 0)
- trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
+ trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
put_task_struct(env.best_task);
return ret;
}
@@ -2541,7 +2783,7 @@
return;
- if (!down_read_trylock(&mm->mmap_sem))
+ if (!mmap_read_trylock(mm))
return;
vma = find_vma(mm, start);
if (!vma) {
@@ -2569,7 +2811,7 @@
* Skip inaccessible VMAs to avoid any confusion between
* PROT_NONE and NUMA hinting ptes
*/
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+ if (!vma_is_accessible(vma))
continue;
do {
@@ -2609,7 +2851,7 @@
mm->numa_scan_offset = start;
else
reset_ptenuma_scan(p);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
/*
* Make sure tasks use at least 32x as much time to run other code
@@ -2696,7 +2938,7 @@
curr->node_stamp += period;
if (!time_before(jiffies, curr->mm->numa_next_scan))
- task_work_add(curr, work, true);
+ task_work_add(curr, work, TWA_RESUME);
}
}
@@ -2831,25 +3073,6 @@
#ifdef CONFIG_SMP
static inline void
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- cfs_rq->runnable_weight += se->runnable_weight;
-
- cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
- cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
-}
-
-static inline void
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- cfs_rq->runnable_weight -= se->runnable_weight;
-
- sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
- sub_positive(&cfs_rq->avg.runnable_load_sum,
- se_runnable(se) * se->avg.runnable_load_sum);
-}
-
-static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
cfs_rq->avg.load_avg += se->avg.load_avg;
@@ -2864,45 +3087,36 @@
}
#else
static inline void
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
-static inline void
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
-static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
#endif
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
- unsigned long weight, unsigned long runnable)
+ unsigned long weight)
{
if (se->on_rq) {
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
- account_entity_dequeue(cfs_rq, se);
- dequeue_runnable_load_avg(cfs_rq, se);
+ update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
- se->runnable_weight = runnable;
update_load_set(&se->load, weight);
#ifdef CONFIG_SMP
do {
- u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
+ u32 divider = get_pelt_divider(&se->avg);
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
- se->avg.runnable_load_avg =
- div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
} while (0);
#endif
enqueue_load_avg(cfs_rq, se);
- if (se->on_rq) {
- account_entity_enqueue(cfs_rq, se);
- enqueue_runnable_load_avg(cfs_rq, se);
- }
+ if (se->on_rq)
+ update_load_add(&cfs_rq->load, se->load.weight);
+
}
void reweight_task(struct task_struct *p, int prio)
@@ -2912,7 +3126,7 @@
struct load_weight *load = &se->load;
unsigned long weight = scale_load(sched_prio_to_weight[prio]);
- reweight_entity(cfs_rq, se, weight, weight);
+ reweight_entity(cfs_rq, se, weight);
load->inv_weight = sched_prio_to_wmult[prio];
}
@@ -3024,50 +3238,6 @@
*/
return clamp_t(long, shares, MIN_SHARES, tg_shares);
}
-
-/*
- * This calculates the effective runnable weight for a group entity based on
- * the group entity weight calculated above.
- *
- * Because of the above approximation (2), our group entity weight is
- * an load_avg based ratio (3). This means that it includes blocked load and
- * does not represent the runnable weight.
- *
- * Approximate the group entity's runnable weight per ratio from the group
- * runqueue:
- *
- * grq->avg.runnable_load_avg
- * ge->runnable_weight = ge->load.weight * -------------------------- (7)
- * grq->avg.load_avg
- *
- * However, analogous to above, since the avg numbers are slow, this leads to
- * transients in the from-idle case. Instead we use:
- *
- * ge->runnable_weight = ge->load.weight *
- *
- * max(grq->avg.runnable_load_avg, grq->runnable_weight)
- * ----------------------------------------------------- (8)
- * max(grq->avg.load_avg, grq->load.weight)
- *
- * Where these max() serve both to use the 'instant' values to fix the slow
- * from-idle and avoid the /0 on to-idle, similar to (6).
- */
-static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
-{
- long runnable, load_avg;
-
- load_avg = max(cfs_rq->avg.load_avg,
- scale_load_down(cfs_rq->load.weight));
-
- runnable = max(cfs_rq->avg.runnable_load_avg,
- scale_load_down(cfs_rq->runnable_weight));
-
- runnable *= shares;
- if (load_avg)
- runnable /= load_avg;
-
- return clamp_t(long, runnable, MIN_SHARES, shares);
-}
#endif /* CONFIG_SMP */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
@@ -3079,7 +3249,7 @@
static void update_cfs_group(struct sched_entity *se)
{
struct cfs_rq *gcfs_rq = group_cfs_rq(se);
- long shares, runnable;
+ long shares;
if (!gcfs_rq)
return;
@@ -3088,16 +3258,15 @@
return;
#ifndef CONFIG_SMP
- runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
+ shares = READ_ONCE(gcfs_rq->tg->shares);
if (likely(se->load.weight == shares))
return;
#else
shares = calc_group_shares(gcfs_rq);
- runnable = calc_group_runnable(gcfs_rq, shares);
#endif
- reweight_entity(cfs_rq_of(se), se, shares, runnable);
+ reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
@@ -3110,7 +3279,7 @@
{
struct rq *rq = rq_of(cfs_rq);
- if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
+ if (&rq->cfs == cfs_rq) {
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
@@ -3134,7 +3303,6 @@
/**
* update_tg_load_avg - update the tg's load avg
* @cfs_rq: the cfs_rq whose avg changed
- * @force: update regardless of how small the difference
*
* This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
* However, because tg->load_avg is a global value there are performance
@@ -3146,7 +3314,7 @@
*
* Updating tg's load_avg is necessary before update_cfs_share().
*/
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
@@ -3156,7 +3324,7 @@
if (cfs_rq->tg == &root_task_group)
return;
- if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+ if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
@@ -3211,7 +3379,6 @@
se->avg.last_update_time = n_last_update_time;
}
-
/*
* When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
* propagate its contribution. The key to this propagation is the invariant
@@ -3222,11 +3389,11 @@
* _IFF_ we look at the pure running and runnable sums. Because they
* represent the very same entity, just at different points in the hierarchy.
*
- * Per the above update_tg_cfs_util() is trivial and simply copies the running
- * sum over (but still wrong, because the group entity and group rq do not have
- * their PELT windows aligned).
+ * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
+ * and simply copies the running/runnable sum over (but still wrong, because
+ * the group entity and group rq do not have their PELT windows aligned).
*
- * However, update_tg_cfs_runnable() is more complex. So we have:
+ * However, update_tg_cfs_load() is more complex. So we have:
*
* ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
*
@@ -3279,53 +3446,82 @@
* XXX: only do this for the part of runnable > running ?
*
*/
-
static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+ u32 divider;
/* Nothing to update */
if (!delta)
return;
/*
- * The relation between sum and avg is:
- *
- * LOAD_AVG_MAX - 1024 + sa->period_contrib
- *
- * however, the PELT windows are not aligned between grq and gse.
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
*/
+ divider = get_pelt_divider(&cfs_rq->avg);
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
- se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+ se->avg.util_sum = se->avg.util_avg * divider;
/* Update parent cfs_rq utilization */
add_positive(&cfs_rq->avg.util_avg, delta);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
}
static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
- unsigned long runnable_load_avg, load_avg;
- u64 runnable_load_sum, load_sum = 0;
- s64 delta_sum;
+ long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+ u32 divider;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ divider = get_pelt_divider(&cfs_rq->avg);
+
+ /* Set new sched_entity's runnable */
+ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
+
+ /* Update parent cfs_rq runnable */
+ add_positive(&cfs_rq->avg.runnable_avg, delta);
+ cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
+}
+
+static inline void
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
+{
+ long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
+ unsigned long load_avg;
+ u64 load_sum = 0;
+ u32 divider;
if (!runnable_sum)
return;
gcfs_rq->prop_runnable_sum = 0;
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ divider = get_pelt_divider(&cfs_rq->avg);
+
if (runnable_sum >= 0) {
/*
* Add runnable; clip at LOAD_AVG_MAX. Reflects that until
* the CPU is saturated running == runnable.
*/
runnable_sum += se->avg.load_sum;
- runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
+ runnable_sum = min_t(long, runnable_sum, divider);
} else {
/*
* Estimate the new unweighted runnable_sum of the gcfs_rq by
@@ -3350,28 +3546,15 @@
runnable_sum = max(runnable_sum, running_sum);
load_sum = (s64)se_weight(se) * runnable_sum;
- load_avg = div_s64(load_sum, LOAD_AVG_MAX);
+ load_avg = div_s64(load_sum, divider);
- delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
- delta_avg = load_avg - se->avg.load_avg;
+ delta = load_avg - se->avg.load_avg;
se->avg.load_sum = runnable_sum;
se->avg.load_avg = load_avg;
- add_positive(&cfs_rq->avg.load_avg, delta_avg);
- add_positive(&cfs_rq->avg.load_sum, delta_sum);
- runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
- runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
- delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
- delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
-
- se->avg.runnable_load_sum = runnable_sum;
- se->avg.runnable_load_avg = runnable_load_avg;
-
- if (se->on_rq) {
- add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
- add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
- }
+ add_positive(&cfs_rq->avg.load_avg, delta);
+ cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3400,6 +3583,7 @@
update_tg_cfs_util(cfs_rq, se, gcfs_rq);
update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
+ update_tg_cfs_load(cfs_rq, se, gcfs_rq);
trace_pelt_cfs_tp(cfs_rq);
trace_pelt_se_tp(se);
@@ -3439,7 +3623,7 @@
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
@@ -3469,30 +3653,51 @@
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
- unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
+ unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
struct sched_avg *sa = &cfs_rq->avg;
int decayed = 0;
if (cfs_rq->removed.nr) {
unsigned long r;
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
raw_spin_lock(&cfs_rq->removed.lock);
swap(cfs_rq->removed.util_avg, removed_util);
swap(cfs_rq->removed.load_avg, removed_load);
- swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
+ swap(cfs_rq->removed.runnable_avg, removed_runnable);
cfs_rq->removed.nr = 0;
raw_spin_unlock(&cfs_rq->removed.lock);
r = removed_load;
sub_positive(&sa->load_avg, r);
- sub_positive(&sa->load_sum, r * divider);
+ sa->load_sum = sa->load_avg * divider;
r = removed_util;
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * divider);
+ /*
+ * Because of rounding, se->util_sum might ends up being +1 more than
+ * cfs->util_sum. Although this is not a problem by itself, detaching
+ * a lot of tasks with the rounding problem between 2 updates of
+ * util_avg (~1ms) can make cfs->util_sum becoming null whereas
+ * cfs_util_avg is not.
+ * Check that util_sum is still above its lower bound for the new
+ * util_avg. Given that period_contrib might have moved since the last
+ * sync, we are only sure that util_sum must be above or equal to
+ * util_avg * minimum possible divider
+ */
+ sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
- add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
+ r = removed_runnable;
+ sub_positive(&sa->runnable_avg, r);
+ sa->runnable_sum = sa->runnable_avg * divider;
+
+ /*
+ * removed_runnable is the unweighted version of removed_load so we
+ * can use it to estimate removed_load_sum.
+ */
+ add_tg_cfs_propagate(cfs_rq,
+ -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
decayed = 1;
}
@@ -3511,14 +3716,17 @@
* attach_entity_load_avg - attach this entity to its cfs_rq load avg
* @cfs_rq: cfs_rq to attach to
* @se: sched_entity to attach
- * @flags: migration hints
*
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
*/
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
/*
* When we attach the @se to the @cfs_rq, we must align the decay
@@ -3538,21 +3746,23 @@
*/
se->avg.util_sum = se->avg.util_avg * divider;
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
+
se->avg.load_sum = divider;
if (se_weight(se)) {
se->avg.load_sum =
div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
}
- se->avg.runnable_load_sum = se->avg.load_sum;
-
enqueue_load_avg(cfs_rq, se);
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
+ cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
+ cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
- cfs_rq_util_change(cfs_rq, flags);
+ cfs_rq_util_change(cfs_rq, 0);
trace_pelt_cfs_tp(cfs_rq);
}
@@ -3567,9 +3777,17 @@
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
+
dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+ sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
+ cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
@@ -3610,14 +3828,14 @@
*
* IOW we're enqueueing a task on a new CPU.
*/
- attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
- update_tg_load_avg(cfs_rq, 0);
+ attach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq);
} else if (decayed) {
cfs_rq_util_change(cfs_rq, 0);
if (flags & UPDATE_TG)
- update_tg_load_avg(cfs_rq, 0);
+ update_tg_load_avg(cfs_rq);
}
}
@@ -3676,13 +3894,13 @@
++cfs_rq->removed.nr;
cfs_rq->removed.util_avg += se->avg.util_avg;
cfs_rq->removed.load_avg += se->avg.load_avg;
- cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
+ cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
}
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
+static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
{
- return cfs_rq->avg.runnable_load_avg;
+ return cfs_rq->avg.runnable_avg;
}
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
@@ -3690,6 +3908,8 @@
return cfs_rq->avg.load_avg;
}
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+
static inline unsigned long task_util(struct task_struct *p)
{
return READ_ONCE(p->se.avg.util_avg);
@@ -3699,7 +3919,7 @@
{
struct util_est ue = READ_ONCE(p->se.avg.util_est);
- return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
+ return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
}
static inline unsigned long task_util_est(struct task_struct *p)
@@ -3707,6 +3927,20 @@
return max(task_util(p), _task_util_est(p));
}
+#ifdef CONFIG_UCLAMP_TASK
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+ return clamp(task_util_est(p),
+ uclamp_eff_value(p, UCLAMP_MIN),
+ uclamp_eff_value(p, UCLAMP_MAX));
+}
+#else
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+ return task_util_est(p);
+}
+#endif
+
static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
struct task_struct *p)
{
@@ -3719,8 +3953,28 @@
enqueued = cfs_rq->avg.util_est.enqueued;
enqueued += _task_util_est(p);
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+
+ trace_sched_util_est_cfs_tp(cfs_rq);
}
+static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ unsigned int enqueued;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /* Update root cfs_rq's estimated utilization */
+ enqueued = cfs_rq->avg.util_est.enqueued;
+ enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+
+ trace_sched_util_est_cfs_tp(cfs_rq);
+}
+
+#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
+
/*
* Check if a (signed) value is within a specified (unsigned) margin,
* based on the observation that:
@@ -3734,21 +3988,16 @@
return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
}
-static void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+static inline void util_est_update(struct cfs_rq *cfs_rq,
+ struct task_struct *p,
+ bool task_sleep)
{
- long last_ewma_diff;
+ long last_ewma_diff, last_enqueued_diff;
struct util_est ue;
- int cpu;
if (!sched_feat(UTIL_EST))
return;
- /* Update root cfs_rq's estimated utilization */
- ue.enqueued = cfs_rq->avg.util_est.enqueued;
- ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
-
/*
* Skip update of task's estimated utilization when the task has not
* yet completed an activation, e.g. being migrated.
@@ -3764,21 +4013,38 @@
if (ue.enqueued & UTIL_AVG_UNCHANGED)
return;
+ last_enqueued_diff = ue.enqueued;
+
/*
- * Skip update of task's estimated utilization when its EWMA is
+ * Reset EWMA on utilization increases, the moving average is used only
+ * to smooth utilization decreases.
+ */
+ ue.enqueued = task_util(p);
+ if (sched_feat(UTIL_EST_FASTUP)) {
+ if (ue.ewma < ue.enqueued) {
+ ue.ewma = ue.enqueued;
+ goto done;
+ }
+ }
+
+ /*
+ * Skip update of task's estimated utilization when its members are
* already ~1% close to its last activation value.
*/
- ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
last_ewma_diff = ue.enqueued - ue.ewma;
- if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
+ last_enqueued_diff -= ue.enqueued;
+ if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
+ if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
+ goto done;
+
return;
+ }
/*
* To avoid overestimation of actual task utilization, skip updates if
* we cannot grant there is idle time in this CPU.
*/
- cpu = cpu_of(rq_of(cfs_rq));
- if (task_util(p) > capacity_orig_of(cpu))
+ if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
return;
/*
@@ -3801,12 +4067,16 @@
ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
ue.ewma += last_ewma_diff;
ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+done:
+ ue.enqueued |= UTIL_AVG_UNCHANGED;
WRITE_ONCE(p->se.avg.util_est, ue);
+
+ trace_sched_util_est_se_tp(&p->se);
}
static inline int task_fits_capacity(struct task_struct *p, long capacity)
{
- return fits_capacity(task_util_est(p), capacity);
+ return fits_capacity(uclamp_task_util(p), capacity);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -3845,11 +4115,11 @@
static inline void remove_entity_load_avg(struct sched_entity *se) {}
static inline void
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
+static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
{
return 0;
}
@@ -3858,8 +4128,11 @@
util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
static inline void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
- bool task_sleep) {}
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
+ bool task_sleep) {}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
@@ -3996,8 +4269,8 @@
* - Add its new weight to cfs_rq->load.weight
*/
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
+ se_update_runnable(se);
update_cfs_group(se);
- enqueue_runnable_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
if (flags & ENQUEUE_WAKEUP)
@@ -4086,7 +4359,7 @@
* of its group cfs_rq.
*/
update_load_avg(cfs_rq, se, UPDATE_TG);
- dequeue_runnable_load_avg(cfs_rq, se);
+ se_update_runnable(se);
update_stats_dequeue(cfs_rq, se, flags);
@@ -4238,17 +4511,17 @@
se = second;
}
- /*
- * Prefer last buddy, try to return the CPU to a preempted task.
- */
- if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
- se = cfs_rq->last;
-
- /*
- * Someone really wants this to run. If it's not unfair, run it.
- */
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
+ /*
+ * Someone really wants this to run. If it's not unfair, run it.
+ */
se = cfs_rq->next;
+ } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
+ /*
+ * Prefer last buddy, try to return the CPU to a preempted task.
+ */
+ se = cfs_rq->last;
+ }
clear_buddies(cfs_rq, se);
@@ -4554,8 +4827,13 @@
if (!se->on_rq)
break;
- if (dequeue)
+ if (dequeue) {
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+ } else {
+ update_load_avg(qcfs_rq, se, 0);
+ se_update_runnable(se);
+ }
+
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
@@ -4618,6 +4896,9 @@
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ se_update_runnable(se);
+
cfs_rq->h_nr_running += task_delta;
cfs_rq->idle_h_nr_running += idle_task_delta;
@@ -4657,11 +4938,10 @@
resched_curr(rq);
}
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
+static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
{
struct cfs_rq *cfs_rq;
- u64 runtime;
- u64 starting_runtime = remaining;
+ u64 runtime, remaining = 1;
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -4676,10 +4956,13 @@
/* By the above check, this should never be true */
SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
+ raw_spin_lock(&cfs_b->lock);
runtime = -cfs_rq->runtime_remaining + 1;
- if (runtime > remaining)
- runtime = remaining;
- remaining -= runtime;
+ if (runtime > cfs_b->runtime)
+ runtime = cfs_b->runtime;
+ cfs_b->runtime -= runtime;
+ remaining = cfs_b->runtime;
+ raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += runtime;
@@ -4694,8 +4977,6 @@
break;
}
rcu_read_unlock();
-
- return starting_runtime - remaining;
}
/*
@@ -4706,7 +4987,6 @@
*/
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
- u64 runtime;
int throttled;
/* no need to continue the timer with no bandwidth constraint */
@@ -4735,24 +5015,15 @@
cfs_b->nr_throttled += overrun;
/*
- * This check is repeated as we are holding onto the new bandwidth while
- * we unthrottle. This can potentially race with an unthrottled group
- * trying to acquire new bandwidth from the global pool. This can result
- * in us over-using our runtime if it is all used during this loop, but
- * only by limited amounts in that extreme case.
+ * This check is repeated as we release cfs_b->lock while we unthrottle.
*/
- while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
- runtime = cfs_b->runtime;
- cfs_b->distribute_running = 1;
+ while (throttled && cfs_b->runtime > 0) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
- runtime = distribute_cfs_runtime(cfs_b, runtime);
+ distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
- cfs_b->distribute_running = 0;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
-
- lsub_positive(&cfs_b->runtime, runtime);
}
/*
@@ -4865,10 +5136,6 @@
/* confirm we're still not at a refresh boundary */
raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->slack_started = false;
- if (cfs_b->distribute_running) {
- raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
- return;
- }
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
@@ -4878,26 +5145,21 @@
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
runtime = cfs_b->runtime;
- if (runtime)
- cfs_b->distribute_running = 1;
-
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
if (!runtime)
return;
- runtime = distribute_cfs_runtime(cfs_b, runtime);
+ distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
- lsub_positive(&cfs_b->runtime, runtime);
- cfs_b->distribute_running = 0;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
/*
* When a group wakes up we want to make sure that its quota is not already
* expired/exceeded, otherwise it may be allowed to steal additional ticks of
- * runtime as update_curr() throttling can not not trigger until it's on-rq.
+ * runtime as update_curr() throttling can not trigger until it's on-rq.
*/
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
@@ -5032,7 +5294,6 @@
cfs_b->period_timer.function = sched_cfs_period_timer;
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
- cfs_b->distribute_running = 0;
cfs_b->slack_started = false;
}
@@ -5237,6 +5498,20 @@
static inline void update_overutilized_status(struct rq *rq) { }
#endif
+/* Runqueue only has SCHED_IDLE tasks enqueued */
+static int sched_idle_rq(struct rq *rq)
+{
+ return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+ rq->nr_running);
+}
+
+#ifdef CONFIG_SMP
+static int sched_idle_cpu(int cpu)
+{
+ return sched_idle_rq(cpu_rq(cpu));
+}
+#endif
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -5286,6 +5561,7 @@
cfs_rq = cfs_rq_of(se);
update_load_avg(cfs_rq, se, UPDATE_TG);
+ se_update_runnable(se);
update_cfs_group(se);
cfs_rq->h_nr_running++;
@@ -5303,28 +5579,27 @@
list_add_leaf_cfs_rq(cfs_rq);
}
+ /* At this point se is NULL and we are at root level*/
+ add_nr_running(rq, 1);
+
+ /*
+ * Since new tasks are assigned an initial util_avg equal to
+ * half of the spare capacity of their CPU, tiny tasks have the
+ * ability to cross the overutilized threshold, which will
+ * result in the load balancer ruining all the task placement
+ * done by EAS. As a way to mitigate that effect, do not account
+ * for the first enqueue operation of new tasks during the
+ * overutilized flag detection.
+ *
+ * A better way of solving this problem would be to wait for
+ * the PELT signals of tasks to converge before taking them
+ * into account, but that is not straightforward to implement,
+ * and the following generally works well enough in practice.
+ */
+ if (!task_new)
+ update_overutilized_status(rq);
+
enqueue_throttle:
- if (!se) {
- add_nr_running(rq, 1);
- /*
- * Since new tasks are assigned an initial util_avg equal to
- * half of the spare capacity of their CPU, tiny tasks have the
- * ability to cross the overutilized threshold, which will
- * result in the load balancer ruining all the task placement
- * done by EAS. As a way to mitigate that effect, do not account
- * for the first enqueue operation of new tasks during the
- * overutilized flag detection.
- *
- * A better way of solving this problem would be to wait for
- * the PELT signals of tasks to converge before taking them
- * into account, but that is not straightforward to implement,
- * and the following generally works well enough in practice.
- */
- if (!task_new)
- update_overutilized_status(rq);
-
- }
-
if (cfs_bandwidth_used()) {
/*
* When bandwidth control is enabled; the cfs_rq_throttled()
@@ -5358,6 +5633,9 @@
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
int idle_h_nr_running = task_has_idle_policy(p);
+ bool was_sched_idle = sched_idle_rq(rq);
+
+ util_est_dequeue(&rq->cfs, p);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -5389,6 +5667,7 @@
cfs_rq = cfs_rq_of(se);
update_load_avg(cfs_rq, se, UPDATE_TG);
+ se_update_runnable(se);
update_cfs_group(se);
cfs_rq->h_nr_running--;
@@ -5400,11 +5679,15 @@
}
-dequeue_throttle:
- if (!se)
- sub_nr_running(rq, 1);
+ /* At this point se is NULL and we are at root level*/
+ sub_nr_running(rq, 1);
- util_est_dequeue(&rq->cfs, p, task_sleep);
+ /* balance early to pull high priority tasks */
+ if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
+ rq->next_balance = jiffies;
+
+dequeue_throttle:
+ util_est_update(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
@@ -5426,18 +5709,63 @@
#endif /* CONFIG_NO_HZ_COMMON */
-/* CPU only has SCHED_IDLE tasks enqueued */
-static int sched_idle_cpu(int cpu)
+static unsigned long cpu_load(struct rq *rq)
{
- struct rq *rq = cpu_rq(cpu);
-
- return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
- rq->nr_running);
+ return cfs_rq_load_avg(&rq->cfs);
}
-static unsigned long cpu_runnable_load(struct rq *rq)
+/*
+ * cpu_load_without - compute CPU load without any contributions from *p
+ * @cpu: the CPU which load is requested
+ * @p: the task which load should be discounted
+ *
+ * The load of a CPU is defined by the load of tasks currently enqueued on that
+ * CPU as well as tasks which are currently sleeping after an execution on that
+ * CPU.
+ *
+ * This method returns the load of the specified CPU by discounting the load of
+ * the specified task, whenever the task is currently contributing to the CPU
+ * load.
+ */
+static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
{
- return cfs_rq_runnable_load_avg(&rq->cfs);
+ struct cfs_rq *cfs_rq;
+ unsigned int load;
+
+ /* Task has no contribution or is new */
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ return cpu_load(rq);
+
+ cfs_rq = &rq->cfs;
+ load = READ_ONCE(cfs_rq->avg.load_avg);
+
+ /* Discount task's util from CPU's util */
+ lsub_positive(&load, task_h_load(p));
+
+ return load;
+}
+
+static unsigned long cpu_runnable(struct rq *rq)
+{
+ return cfs_rq_runnable_avg(&rq->cfs);
+}
+
+static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq;
+ unsigned int runnable;
+
+ /* Task has no contribution or is new */
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ return cpu_runnable(rq);
+
+ cfs_rq = &rq->cfs;
+ runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
+
+ /* Discount task's runnable from CPU's runnable */
+ lsub_positive(&runnable, p->se.avg.runnable_avg);
+
+ return runnable;
}
static unsigned long capacity_of(int cpu)
@@ -5445,18 +5773,6 @@
return cpu_rq(cpu)->cpu_capacity;
}
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = cpu_runnable_load(rq);
-
- if (nr_running)
- return load_avg / nr_running;
-
- return 0;
-}
-
static void record_wakee(struct task_struct *p)
{
/*
@@ -5495,7 +5811,7 @@
{
unsigned int master = current->wakee_flips;
unsigned int slave = p->wakee_flips;
- int factor = this_cpu_read(sd_llc_size);
+ int factor = __this_cpu_read(sd_llc_size);
if (master < slave)
swap(master, slave);
@@ -5547,7 +5863,7 @@
s64 this_eff_load, prev_eff_load;
unsigned long task_load;
- this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
+ this_eff_load = cpu_load(cpu_rq(this_cpu));
if (sync) {
unsigned long current_load = task_h_load(current);
@@ -5565,7 +5881,7 @@
this_eff_load *= 100;
this_eff_load *= capacity_of(prev_cpu);
- prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
+ prev_eff_load = cpu_load(cpu_rq(prev_cpu));
prev_eff_load -= task_load;
if (sched_feat(WA_BIAS))
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
@@ -5603,149 +5919,8 @@
return target;
}
-static unsigned long cpu_util_without(int cpu, struct task_struct *p);
-
-static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
-{
- return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
-}
-
-/*
- * find_idlest_group finds and returns the least busy CPU group within the
- * domain.
- *
- * Assumes p is allowed on at least one CPU in sd.
- */
static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int sd_flag)
-{
- struct sched_group *idlest = NULL, *group = sd->groups;
- struct sched_group *most_spare_sg = NULL;
- unsigned long min_runnable_load = ULONG_MAX;
- unsigned long this_runnable_load = ULONG_MAX;
- unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
- unsigned long most_spare = 0, this_spare = 0;
- int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
- unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
- (sd->imbalance_pct-100) / 100;
-
- do {
- unsigned long load, avg_load, runnable_load;
- unsigned long spare_cap, max_spare_cap;
- int local_group;
- int i;
-
- /* Skip over this group if it has no CPUs allowed */
- if (!cpumask_intersects(sched_group_span(group),
- p->cpus_ptr))
- continue;
-
- local_group = cpumask_test_cpu(this_cpu,
- sched_group_span(group));
-
- /*
- * Tally up the load of all CPUs in the group and find
- * the group containing the CPU with most spare capacity.
- */
- avg_load = 0;
- runnable_load = 0;
- max_spare_cap = 0;
-
- for_each_cpu(i, sched_group_span(group)) {
- load = cpu_runnable_load(cpu_rq(i));
- runnable_load += load;
-
- avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
-
- spare_cap = capacity_spare_without(i, p);
-
- if (spare_cap > max_spare_cap)
- max_spare_cap = spare_cap;
- }
-
- /* Adjust by relative CPU capacity of the group */
- avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
- group->sgc->capacity;
- runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
- group->sgc->capacity;
-
- if (local_group) {
- this_runnable_load = runnable_load;
- this_avg_load = avg_load;
- this_spare = max_spare_cap;
- } else {
- if (min_runnable_load > (runnable_load + imbalance)) {
- /*
- * The runnable load is significantly smaller
- * so we can pick this new CPU:
- */
- min_runnable_load = runnable_load;
- min_avg_load = avg_load;
- idlest = group;
- } else if ((runnable_load < (min_runnable_load + imbalance)) &&
- (100*min_avg_load > imbalance_scale*avg_load)) {
- /*
- * The runnable loads are close so take the
- * blocked load into account through avg_load:
- */
- min_avg_load = avg_load;
- idlest = group;
- }
-
- if (most_spare < max_spare_cap) {
- most_spare = max_spare_cap;
- most_spare_sg = group;
- }
- }
- } while (group = group->next, group != sd->groups);
-
- /*
- * The cross-over point between using spare capacity or least load
- * is too conservative for high utilization tasks on partially
- * utilized systems if we require spare_capacity > task_util(p),
- * so we allow for some task stuffing by using
- * spare_capacity > task_util(p)/2.
- *
- * Spare capacity can't be used for fork because the utilization has
- * not been set yet, we must first select a rq to compute the initial
- * utilization.
- */
- if (sd_flag & SD_BALANCE_FORK)
- goto skip_spare;
-
- if (this_spare > task_util(p) / 2 &&
- imbalance_scale*this_spare > 100*most_spare)
- return NULL;
-
- if (most_spare > task_util(p) / 2)
- return most_spare_sg;
-
-skip_spare:
- if (!idlest)
- return NULL;
-
- /*
- * When comparing groups across NUMA domains, it's possible for the
- * local domain to be very lightly loaded relative to the remote
- * domains but "imbalance" skews the comparison making remote CPUs
- * look much more favourable. When considering cross-domain, add
- * imbalance to the runnable load on the remote node and consider
- * staying local.
- */
- if ((sd->flags & SD_NUMA) &&
- min_runnable_load + imbalance >= this_runnable_load)
- return NULL;
-
- if (min_runnable_load > (this_runnable_load + imbalance))
- return NULL;
-
- if ((this_runnable_load < (min_runnable_load + imbalance)) &&
- (100*this_avg_load < imbalance_scale*min_avg_load))
- return NULL;
-
- return idlest;
-}
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
/*
* find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
@@ -5757,7 +5932,7 @@
unsigned int min_exit_latency = UINT_MAX;
u64 latest_idle_timestamp = 0;
int least_loaded_cpu = this_cpu;
- int shallowest_idle_cpu = -1, si_cpu = -1;
+ int shallowest_idle_cpu = -1;
int i;
/* Check if we have any choice: */
@@ -5766,6 +5941,9 @@
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
+ if (sched_idle_cpu(i))
+ return i;
+
if (available_idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -5788,13 +5966,8 @@
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
}
- } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
- if (sched_idle_cpu(i)) {
- si_cpu = i;
- continue;
- }
-
- load = cpu_runnable_load(cpu_rq(i));
+ } else if (shallowest_idle_cpu == -1) {
+ load = cpu_load(cpu_rq(i));
if (load < min_load) {
min_load = load;
least_loaded_cpu = i;
@@ -5802,11 +5975,7 @@
}
}
- if (shallowest_idle_cpu != -1)
- return shallowest_idle_cpu;
- if (si_cpu != -1)
- return si_cpu;
- return least_loaded_cpu;
+ return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
@@ -5818,7 +5987,7 @@
return prev_cpu;
/*
- * We need task's util for capacity_spare_without, sync it up to
+ * We need task's util for cpu_util_without, sync it up to
* prev_cpu's last_update_time.
*/
if (!(sd_flag & SD_BALANCE_FORK))
@@ -5834,7 +6003,7 @@
continue;
}
- group = find_idlest_group(sd, p, cpu, sd_flag);
+ group = find_idlest_group(sd, p, cpu);
if (!group) {
sd = sd->child;
continue;
@@ -5937,10 +6106,12 @@
bool idle = true;
for_each_cpu(cpu, cpu_smt_mask(core)) {
- __cpumask_clear_cpu(cpu, cpus);
- if (!available_idle_cpu(cpu))
+ if (!available_idle_cpu(cpu)) {
idle = false;
+ break;
+ }
}
+ cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
if (idle)
return core;
@@ -5959,7 +6130,7 @@
*/
static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
- int cpu, si_cpu = -1;
+ int cpu;
if (!static_branch_likely(&sched_smt_present))
return -1;
@@ -5968,13 +6139,11 @@
if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
- if (available_idle_cpu(cpu))
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
- if (si_cpu == -1 && sched_idle_cpu(cpu))
- si_cpu = cpu;
}
- return si_cpu;
+ return -1;
}
#else /* CONFIG_SCHED_SMT */
@@ -6001,10 +6170,9 @@
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
struct sched_domain *this_sd;
u64 avg_cost, avg_idle;
- u64 time, cost;
- s64 delta;
+ u64 time;
int this = smp_processor_id();
- int cpu, nr = INT_MAX, si_cpu = -1;
+ int cpu, nr = INT_MAX;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -6034,46 +6202,113 @@
for_each_cpu_wrap(cpu, cpus, target) {
if (!--nr)
- return si_cpu;
- if (available_idle_cpu(cpu))
+ return -1;
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
break;
- if (si_cpu == -1 && sched_idle_cpu(cpu))
- si_cpu = cpu;
}
time = cpu_clock(this) - time;
- cost = this_sd->avg_scan_cost;
- delta = (s64)(time - cost) / 8;
- this_sd->avg_scan_cost += delta;
+ update_avg(&this_sd->avg_scan_cost, time);
return cpu;
}
/*
+ * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
+ * the task fits. If no CPU is big enough, but there are idle ones, try to
+ * maximize capacity.
+ */
+static int
+select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ unsigned long task_util, best_cap = 0;
+ int cpu, best_cpu = -1;
+ struct cpumask *cpus;
+
+ cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+ task_util = uclamp_task_util(p);
+
+ for_each_cpu_wrap(cpu, cpus, target) {
+ unsigned long cpu_cap = capacity_of(cpu);
+
+ if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
+ continue;
+ if (fits_capacity(task_util, cpu_cap))
+ return cpu;
+
+ if (cpu_cap > best_cap) {
+ best_cap = cpu_cap;
+ best_cpu = cpu;
+ }
+ }
+
+ return best_cpu;
+}
+
+static inline bool asym_fits_capacity(int task_util, int cpu)
+{
+ if (static_branch_unlikely(&sched_asym_cpucapacity))
+ return fits_capacity(task_util, capacity_of(cpu));
+
+ return true;
+}
+
+/*
* Try and locate an idle core/thread in the LLC cache domain.
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
+ unsigned long task_util;
int i, recent_used_cpu;
- if (available_idle_cpu(target) || sched_idle_cpu(target))
+ /*
+ * On asymmetric system, update task utilization because we will check
+ * that the task fits with cpu's capacity.
+ */
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ sync_entity_load_avg(&p->se);
+ task_util = uclamp_task_util(p);
+ }
+
+ if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+ asym_fits_capacity(task_util, target))
return target;
/*
* If the previous CPU is cache affine and idle, don't be stupid:
*/
if (prev != target && cpus_share_cache(prev, target) &&
- (available_idle_cpu(prev) || sched_idle_cpu(prev)))
+ (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+ asym_fits_capacity(task_util, prev))
return prev;
+ /*
+ * Allow a per-cpu kthread to stack with the wakee if the
+ * kworker thread and the tasks previous CPUs are the same.
+ * The assumption is that the wakee queued work for the
+ * per-cpu kthread that is now complete and the wakeup is
+ * essentially a sync wakeup. An obvious example of this
+ * pattern is IO completions.
+ */
+ if (is_per_cpu_kthread(current) &&
+ in_task() &&
+ prev == smp_processor_id() &&
+ this_rq()->nr_running <= 1 &&
+ asym_fits_capacity(task_util, prev)) {
+ return prev;
+ }
+
/* Check a recently used CPU as a potential idle candidate: */
recent_used_cpu = p->recent_used_cpu;
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
- cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
+ cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
+ asym_fits_capacity(task_util, recent_used_cpu)) {
/*
* Replace recent_used_cpu with prev as it is a potential
* candidate for the next wake:
@@ -6082,6 +6317,26 @@
return recent_used_cpu;
}
+ /*
+ * For asymmetric CPU capacity systems, our domain of interest is
+ * sd_asym_cpucapacity rather than sd_llc.
+ */
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
+ /*
+ * On an asymmetric CPU capacity system where an exclusive
+ * cpuset defines a symmetric island (i.e. one unique
+ * capacity_orig value through the cpuset), the key will be set
+ * but the CPUs within that cpuset will not have a domain with
+ * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
+ * capacity path.
+ */
+ if (sd) {
+ i = select_idle_capacity(p, sd, target);
+ return ((unsigned)i < nr_cpumask_bits) ? i : target;
+ }
+ }
+
sd = rcu_dereference(per_cpu(sd_llc, target));
if (!sd)
return target;
@@ -6243,33 +6498,6 @@
}
/*
- * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
- * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
- *
- * In that case WAKE_AFFINE doesn't make sense and we'll let
- * BALANCE_WAKE sort things out.
- */
-static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
-{
- long min_cap, max_cap;
-
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
- return 0;
-
- min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
- max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
-
- /* Minimum capacity is close to max, no need to abort wake_affine */
- if (max_cap - min_cap < max_cap >> 3)
- return 0;
-
- /* Bring task utilization in sync with prev_cpu */
- sync_entity_load_avg(&p->se);
-
- return !task_fits_capacity(p, min_cap);
-}
-
-/*
* Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
* to @dst_cpu.
*/
@@ -6356,7 +6584,7 @@
max_util = max(max_util, cpu_util);
}
- return em_pd_energy(pd->em_pd, max_util, sum_util);
+ return em_cpu_energy(pd->em_pd, max_util, sum_util);
}
/*
@@ -6439,9 +6667,19 @@
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
- /* Skip CPUs that will be overutilized. */
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
+ spare_cap = cpu_cap;
+ lsub_positive(&spare_cap, util);
+
+ /*
+ * Skip CPUs that cannot satisfy the capacity request.
+ * IOW, placing the task there would make the CPU
+ * overutilized. Take uclamp into account to see how
+ * much capacity we can get out of the CPU; this is
+ * aligned with schedutil_cpu_util().
+ */
+ util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
if (!fits_capacity(util, cpu_cap))
continue;
@@ -6456,7 +6694,6 @@
* Find the CPU with the maximum spare capacity in
* the performance domain
*/
- spare_cap = cpu_cap - util;
if (spare_cap > max_spare_cap) {
max_spare_cap = spare_cap;
max_spare_cap_cpu = cpu;
@@ -6525,15 +6762,11 @@
new_cpu = prev_cpu;
}
- want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
- cpumask_test_cpu(cpu, p->cpus_ptr);
+ want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
}
rcu_read_lock();
for_each_domain(cpu, tmp) {
- if (!(tmp->flags & SD_LOAD_BALANCE))
- break;
-
/*
* If both 'cpu' and 'prev_cpu' are part of this domain,
* cpu is a valid SD_WAKE_AFFINE target.
@@ -6813,7 +7046,7 @@
set_last_buddy(se);
}
-static struct task_struct *
+struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct cfs_rq *cfs_rq = &rq->cfs;
@@ -6957,6 +7190,11 @@
return NULL;
}
+static struct task_struct *__pick_next_task_fair(struct rq *rq)
+{
+ return pick_next_task_fair(rq, NULL, NULL);
+}
+
/*
* Account for a descheduled task:
*/
@@ -7007,7 +7245,7 @@
set_skip_buddy(se);
}
-static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
@@ -7146,11 +7384,49 @@
enum fbq_type { regular, remote, all };
+/*
+ * 'group_type' describes the group of CPUs at the moment of load balancing.
+ *
+ * The enum is ordered by pulling priority, with the group with lowest priority
+ * first so the group_type can simply be compared when selecting the busiest
+ * group. See update_sd_pick_busiest().
+ */
enum group_type {
- group_other = 0,
+ /* The group has spare capacity that can be used to run more tasks. */
+ group_has_spare = 0,
+ /*
+ * The group is fully used and the tasks don't compete for more CPU
+ * cycles. Nevertheless, some tasks might wait before running.
+ */
+ group_fully_busy,
+ /*
+ * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
+ * and must be migrated to a more powerful CPU.
+ */
group_misfit_task,
+ /*
+ * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
+ * and the task should be migrated to it instead of running on the
+ * current CPU.
+ */
+ group_asym_packing,
+ /*
+ * The tasks' affinity constraints previously prevented the scheduler
+ * from balancing the load across the system.
+ */
group_imbalanced,
- group_overloaded,
+ /*
+ * The CPU is overloaded and can't provide expected CPU cycles to all
+ * tasks.
+ */
+ group_overloaded
+};
+
+enum migration_type {
+ migrate_load = 0,
+ migrate_util,
+ migrate_task,
+ migrate_misfit
};
#define LBF_ALL_PINNED 0x01
@@ -7183,7 +7459,7 @@
unsigned int loop_max;
enum fbq_type fbq_type;
- enum group_type src_grp_type;
+ enum migration_type migration_type;
struct list_head tasks;
};
@@ -7202,6 +7478,10 @@
if (unlikely(task_has_idle_policy(p)))
return 0;
+ /* SMT siblings share cache */
+ if (env->sd->flags & SD_SHARE_CPUCAPACITY)
+ return 0;
+
/*
* Buddy candidates are cache hot:
*/
@@ -7410,7 +7690,7 @@
static const unsigned int sched_nr_migrate_break = 32;
/*
- * detach_tasks() -- tries to detach up to imbalance runnable load from
+ * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
* busiest_rq, as part of a balancing operation within domain "sd".
*
* Returns number of detached tasks if successful and 0 otherwise.
@@ -7418,8 +7698,8 @@
static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
+ unsigned long util, load;
struct task_struct *p;
- unsigned long load;
int detached = 0;
lockdep_assert_held(&env->src_rq->lock);
@@ -7452,27 +7732,59 @@
if (!can_migrate_task(p, env))
goto next;
- /*
- * Depending of the number of CPUs and tasks and the
- * cgroup hierarchy, task_h_load() can return a null
- * value. Make sure that env->imbalance decreases
- * otherwise detach_tasks() will stop only after
- * detaching up to loop_max tasks.
- */
- load = max_t(unsigned long, task_h_load(p), 1);
+ switch (env->migration_type) {
+ case migrate_load:
+ /*
+ * Depending of the number of CPUs and tasks and the
+ * cgroup hierarchy, task_h_load() can return a null
+ * value. Make sure that env->imbalance decreases
+ * otherwise detach_tasks() will stop only after
+ * detaching up to loop_max tasks.
+ */
+ load = max_t(unsigned long, task_h_load(p), 1);
+ if (sched_feat(LB_MIN) &&
+ load < 16 && !env->sd->nr_balance_failed)
+ goto next;
- if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
- goto next;
+ /*
+ * Make sure that we don't migrate too much load.
+ * Nevertheless, let relax the constraint if
+ * scheduler fails to find a good waiting task to
+ * migrate.
+ */
+ if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
+ goto next;
- if ((load / 2) > env->imbalance)
- goto next;
+ env->imbalance -= load;
+ break;
+
+ case migrate_util:
+ util = task_util_est(p);
+
+ if (util > env->imbalance)
+ goto next;
+
+ env->imbalance -= util;
+ break;
+
+ case migrate_task:
+ env->imbalance--;
+ break;
+
+ case migrate_misfit:
+ /* This is not a misfit task */
+ if (task_fits_capacity(p, capacity_of(env->src_cpu)))
+ goto next;
+
+ env->imbalance = 0;
+ break;
+ }
detach_task(p, env);
list_add(&p->se.group_node, &env->tasks);
detached++;
- env->imbalance -= load;
#ifdef CONFIG_PREEMPTION
/*
@@ -7486,7 +7798,7 @@
/*
* We only want to steal up to the prescribed amount of
- * runnable load.
+ * load/util/tasks.
*/
if (env->imbalance <= 0)
break;
@@ -7575,6 +7887,9 @@
if (READ_ONCE(rq->avg_dl.util_avg))
return true;
+ if (thermal_load_avg(rq))
+ return true;
+
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
if (READ_ONCE(rq->avg_irq.util_avg))
return true;
@@ -7600,6 +7915,7 @@
{
const struct sched_class *curr_class;
u64 now = rq_clock_pelt(rq);
+ unsigned long thermal_pressure;
bool decayed;
/*
@@ -7608,8 +7924,11 @@
*/
curr_class = rq->curr->sched_class;
+ thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
+
decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
+ update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
update_irq_load_avg(rq, 0);
if (others_have_blocked(rq))
@@ -7631,7 +7950,7 @@
if (cfs_rq->avg.util_sum)
return false;
- if (cfs_rq->avg.runnable_load_sum)
+ if (cfs_rq->avg.runnable_sum)
return false;
return true;
@@ -7651,7 +7970,7 @@
struct sched_entity *se;
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
- update_tg_load_avg(cfs_rq, 0);
+ update_tg_load_avg(cfs_rq);
if (cfs_rq == &rq->cfs)
decayed = true;
@@ -7768,14 +8087,15 @@
struct sg_lb_stats {
unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
- unsigned long load_per_task;
unsigned long group_capacity;
- unsigned long group_util; /* Total utilization of the group */
- unsigned int sum_nr_running; /* Nr tasks running in the group */
+ unsigned long group_util; /* Total utilization over the CPUs of the group */
+ unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
+ unsigned int sum_nr_running; /* Nr of tasks running in the group */
+ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
- int group_no_capacity;
+ unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -7790,10 +8110,10 @@
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
- unsigned long total_running;
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
+ unsigned int prefer_sibling; /* tasks should go to sibling first */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
struct sg_lb_stats local_stat; /* Statistics of the local group */
@@ -7804,24 +8124,23 @@
/*
* Skimp on the clearing to avoid duplicate work. We can avoid clearing
* local_stat because update_sg_lb_stats() does a full clear/assignment.
- * We must however clear busiest_stat::avg_load because
- * update_sd_pick_busiest() reads this before assignment.
+ * We must however set busiest_stat::group_type and
+ * busiest_stat::idle_cpus to the worst busiest group because
+ * update_sd_pick_busiest() reads these before assignment.
*/
*sds = (struct sd_lb_stats){
.busiest = NULL,
.local = NULL,
- .total_running = 0UL,
.total_load = 0UL,
.total_capacity = 0UL,
.busiest_stat = {
- .avg_load = 0UL,
- .sum_nr_running = 0,
- .group_type = group_other,
+ .idle_cpus = UINT_MAX,
+ .group_type = group_has_spare,
},
};
}
-static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
+static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long max = arch_scale_cpu_capacity(cpu);
@@ -7833,8 +8152,15 @@
if (unlikely(irq >= max))
return 1;
+ /*
+ * avg_rt.util_avg and avg_dl.util_avg track binary signals
+ * (running and not running) with weights 0 and 1024 respectively.
+ * avg_thermal.load_avg tracks thermal pressure and the weighted
+ * average uses the actual delta max capacity(load).
+ */
used = READ_ONCE(rq->avg_rt.util_avg);
used += READ_ONCE(rq->avg_dl.util_avg);
+ used += thermal_load_avg(rq);
if (unlikely(used >= max))
return 1;
@@ -7846,7 +8172,7 @@
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long capacity = scale_rt_capacity(sd, cpu);
+ unsigned long capacity = scale_rt_capacity(cpu);
struct sched_group *sdg = sd->groups;
cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
@@ -7855,6 +8181,8 @@
capacity = 1;
cpu_rq(cpu)->cpu_capacity = capacity;
+ trace_sched_cpu_capacity_tp(cpu_rq(cpu));
+
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = capacity;
sdg->sgc->max_capacity = capacity;
@@ -7887,29 +8215,11 @@
*/
for_each_cpu(cpu, sched_group_span(sdg)) {
- struct sched_group_capacity *sgc;
- struct rq *rq = cpu_rq(cpu);
+ unsigned long cpu_cap = capacity_of(cpu);
- /*
- * build_sched_domains() -> init_sched_groups_capacity()
- * gets here before we've attached the domains to the
- * runqueues.
- *
- * Use capacity_of(), which is set irrespective of domains
- * in update_cpu_capacity().
- *
- * This avoids capacity from being 0 and
- * causing divide-by-zero issues on boot.
- */
- if (unlikely(!rq->sd)) {
- capacity += capacity_of(cpu);
- } else {
- sgc = rq->sd->groups->sgc;
- capacity += sgc->capacity;
- }
-
- min_capacity = min(capacity, min_capacity);
- max_capacity = max(capacity, max_capacity);
+ capacity += cpu_cap;
+ min_capacity = min(cpu_cap, min_capacity);
+ max_capacity = max(cpu_cap, max_capacity);
}
} else {
/*
@@ -8004,13 +8314,17 @@
* any benefit for the load balance.
*/
static inline bool
-group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
+group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
if (sgs->sum_nr_running < sgs->group_weight)
return true;
+ if ((sgs->group_capacity * imbalance_pct) <
+ (sgs->group_runnable * 100))
+ return false;
+
if ((sgs->group_capacity * 100) >
- (sgs->group_util * env->sd->imbalance_pct))
+ (sgs->group_util * imbalance_pct))
return true;
return false;
@@ -8025,13 +8339,17 @@
* false.
*/
static inline bool
-group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
+group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
if (sgs->sum_nr_running <= sgs->group_weight)
return false;
if ((sgs->group_capacity * 100) <
- (sgs->group_util * env->sd->imbalance_pct))
+ (sgs->group_util * imbalance_pct))
+ return true;
+
+ if ((sgs->group_capacity * imbalance_pct) <
+ (sgs->group_runnable * 100))
return true;
return false;
@@ -8058,19 +8376,26 @@
}
static inline enum
-group_type group_classify(struct sched_group *group,
+group_type group_classify(unsigned int imbalance_pct,
+ struct sched_group *group,
struct sg_lb_stats *sgs)
{
- if (sgs->group_no_capacity)
+ if (group_is_overloaded(imbalance_pct, sgs))
return group_overloaded;
if (sg_imbalanced(group))
return group_imbalanced;
+ if (sgs->group_asym_packing)
+ return group_asym_packing;
+
if (sgs->group_misfit_task_load)
return group_misfit_task;
- return group_other;
+ if (!group_has_capacity(imbalance_pct, sgs))
+ return group_fully_busy;
+
+ return group_has_spare;
}
static bool update_nohz_stats(struct rq *rq, bool force)
@@ -8107,21 +8432,26 @@
struct sg_lb_stats *sgs,
int *sg_status)
{
- int i, nr_running;
+ int i, nr_running, local_group;
memset(sgs, 0, sizeof(*sgs));
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
+
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
env->flags |= LBF_NOHZ_AGAIN;
- sgs->group_load += cpu_runnable_load(rq);
+ sgs->group_load += cpu_load(rq);
sgs->group_util += cpu_util(i);
- sgs->sum_nr_running += rq->cfs.h_nr_running;
+ sgs->group_runnable += cpu_runnable(rq);
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running;
nr_running = rq->nr_running;
+ sgs->sum_nr_running += nr_running;
+
if (nr_running > 1)
*sg_status |= SG_OVERLOAD;
@@ -8135,9 +8465,16 @@
/*
* No need to call idle_cpu() if nr_running is not 0
*/
- if (!nr_running && idle_cpu(i))
+ if (!nr_running && idle_cpu(i)) {
sgs->idle_cpus++;
+ /* Idle cpu can't have misfit task */
+ continue;
+ }
+ if (local_group)
+ continue;
+
+ /* Check for a misfit task on the cpu */
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
@@ -8145,17 +8482,24 @@
}
}
- /* Adjust by relative CPU capacity of the group */
- sgs->group_capacity = group->sgc->capacity;
- sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
+ /* Check if dst CPU is idle and preferred to this group */
+ if (env->sd->flags & SD_ASYM_PACKING &&
+ env->idle != CPU_NOT_IDLE &&
+ sgs->sum_h_nr_running &&
+ sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
+ sgs->group_asym_packing = 1;
+ }
- if (sgs->sum_nr_running)
- sgs->load_per_task = sgs->group_load / sgs->sum_nr_running;
+ sgs->group_capacity = group->sgc->capacity;
sgs->group_weight = group->group_weight;
- sgs->group_no_capacity = group_is_overloaded(env, sgs);
- sgs->group_type = group_classify(group, sgs);
+ sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
+
+ /* Computing avg_load makes sense only when group is overloaded */
+ if (sgs->group_type == group_overloaded)
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
+ sgs->group_capacity;
}
/**
@@ -8178,6 +8522,10 @@
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
+ /* Make sure that there is at least one task to pull */
+ if (!sgs->sum_h_nr_running)
+ return false;
+
/*
* Don't try to pull misfit tasks we can't help.
* We can use max_capacity here as reduction in capacity on some
@@ -8186,7 +8534,7 @@
*/
if (sgs->group_type == group_misfit_task &&
(!group_smaller_max_cpu_capacity(sg, sds->local) ||
- !group_has_capacity(env, &sds->local_stat)))
+ sds->local_stat.group_type != group_has_spare))
return false;
if (sgs->group_type > busiest->group_type)
@@ -8195,62 +8543,92 @@
if (sgs->group_type < busiest->group_type)
return false;
- if (sgs->avg_load <= busiest->avg_load)
+ /*
+ * The candidate and the current busiest group are the same type of
+ * group. Let check which one is the busiest according to the type.
+ */
+
+ switch (sgs->group_type) {
+ case group_overloaded:
+ /* Select the overloaded group with highest avg_load. */
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
+ break;
+
+ case group_imbalanced:
+ /*
+ * Select the 1st imbalanced group as we don't have any way to
+ * choose one more than another.
+ */
return false;
- if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
- goto asym_packing;
-
- /*
- * Candidate sg has no more than one task per CPU and
- * has higher per-CPU capacity. Migrating tasks to less
- * capable CPUs may harm throughput. Maximize throughput,
- * power/energy consequences are not considered.
- */
- if (sgs->sum_nr_running <= sgs->group_weight &&
- group_smaller_min_cpu_capacity(sds->local, sg))
- return false;
-
- /*
- * If we have more than one misfit sg go with the biggest misfit.
- */
- if (sgs->group_type == group_misfit_task &&
- sgs->group_misfit_task_load < busiest->group_misfit_task_load)
- return false;
-
-asym_packing:
- /* This is the busiest node in its class. */
- if (!(env->sd->flags & SD_ASYM_PACKING))
- return true;
-
- /* No ASYM_PACKING if target CPU is already busy */
- if (env->idle == CPU_NOT_IDLE)
- return true;
- /*
- * ASYM_PACKING needs to move all the work to the highest
- * prority CPUs in the group, therefore mark all groups
- * of lower priority than ourself as busy.
- */
- if (sgs->sum_nr_running &&
- sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
- if (!sds->busiest)
- return true;
-
+ case group_asym_packing:
/* Prefer to move from lowest priority CPU's work */
- if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
- sg->asym_prefer_cpu))
- return true;
+ if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
+ return false;
+ break;
+
+ case group_misfit_task:
+ /*
+ * If we have more than one misfit sg go with the biggest
+ * misfit.
+ */
+ if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
+ return false;
+ break;
+
+ case group_fully_busy:
+ /*
+ * Select the fully busy group with highest avg_load. In
+ * theory, there is no need to pull task from such kind of
+ * group because tasks have all compute capacity that they need
+ * but we can still improve the overall throughput by reducing
+ * contention when accessing shared HW resources.
+ *
+ * XXX for now avg_load is not computed and always 0 so we
+ * select the 1st one.
+ */
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
+ break;
+
+ case group_has_spare:
+ /*
+ * Select not overloaded group with lowest number of idle cpus
+ * and highest number of running tasks. We could also compare
+ * the spare capacity which is more stable but it can end up
+ * that the group has less spare capacity but finally more idle
+ * CPUs which means less opportunity to pull tasks.
+ */
+ if (sgs->idle_cpus > busiest->idle_cpus)
+ return false;
+ else if ((sgs->idle_cpus == busiest->idle_cpus) &&
+ (sgs->sum_nr_running <= busiest->sum_nr_running))
+ return false;
+
+ break;
}
- return false;
+ /*
+ * Candidate sg has no more than one task per CPU and has higher
+ * per-CPU capacity. Migrating tasks to less capable CPUs may harm
+ * throughput. Maximize throughput, power/energy consequences are not
+ * considered.
+ */
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
+ (sgs->group_type <= group_fully_busy) &&
+ (group_smaller_min_cpu_capacity(sds->local, sg)))
+ return false;
+
+ return true;
}
#ifdef CONFIG_NUMA_BALANCING
static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
{
- if (sgs->sum_nr_running > sgs->nr_numa_running)
+ if (sgs->sum_h_nr_running > sgs->nr_numa_running)
return regular;
- if (sgs->sum_nr_running > sgs->nr_preferred_running)
+ if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
return remote;
return all;
}
@@ -8275,18 +8653,323 @@
}
#endif /* CONFIG_NUMA_BALANCING */
+
+struct sg_lb_stats;
+
+/*
+ * task_running_on_cpu - return 1 if @p is running on @cpu.
+ */
+
+static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
+{
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ return 0;
+
+ if (task_on_rq_queued(p))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * idle_cpu_without - would a given CPU be idle without p ?
+ * @cpu: the processor on which idleness is tested.
+ * @p: task which should be ignored.
+ *
+ * Return: 1 if the CPU would be idle. 0 otherwise.
+ */
+static int idle_cpu_without(int cpu, struct task_struct *p)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->curr != rq->idle && rq->curr != p)
+ return 0;
+
+ /*
+ * rq->nr_running can't be used but an updated version without the
+ * impact of p on cpu must be used instead. The updated nr_running
+ * be computed and tested before calling idle_cpu_without().
+ */
+
+#ifdef CONFIG_SMP
+ if (rq->ttwu_pending)
+ return 0;
+#endif
+
+ return 1;
+}
+
+/*
+ * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
+ * @sd: The sched_domain level to look for idlest group.
+ * @group: sched_group whose statistics are to be updated.
+ * @sgs: variable to hold the statistics for this group.
+ * @p: The task for which we look for the idlest group/CPU.
+ */
+static inline void update_sg_wakeup_stats(struct sched_domain *sd,
+ struct sched_group *group,
+ struct sg_lb_stats *sgs,
+ struct task_struct *p)
+{
+ int i, nr_running;
+
+ memset(sgs, 0, sizeof(*sgs));
+
+ for_each_cpu(i, sched_group_span(group)) {
+ struct rq *rq = cpu_rq(i);
+ unsigned int local;
+
+ sgs->group_load += cpu_load_without(rq, p);
+ sgs->group_util += cpu_util_without(i, p);
+ sgs->group_runnable += cpu_runnable_without(rq, p);
+ local = task_running_on_cpu(i, p);
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
+
+ nr_running = rq->nr_running - local;
+ sgs->sum_nr_running += nr_running;
+
+ /*
+ * No need to call idle_cpu_without() if nr_running is not 0
+ */
+ if (!nr_running && idle_cpu_without(i, p))
+ sgs->idle_cpus++;
+
+ }
+
+ /* Check if task fits in the group */
+ if (sd->flags & SD_ASYM_CPUCAPACITY &&
+ !task_fits_capacity(p, group->sgc->max_capacity)) {
+ sgs->group_misfit_task_load = 1;
+ }
+
+ sgs->group_capacity = group->sgc->capacity;
+
+ sgs->group_weight = group->group_weight;
+
+ sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
+
+ /*
+ * Computing avg_load makes sense only when group is fully busy or
+ * overloaded
+ */
+ if (sgs->group_type == group_fully_busy ||
+ sgs->group_type == group_overloaded)
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
+ sgs->group_capacity;
+}
+
+static bool update_pick_idlest(struct sched_group *idlest,
+ struct sg_lb_stats *idlest_sgs,
+ struct sched_group *group,
+ struct sg_lb_stats *sgs)
+{
+ if (sgs->group_type < idlest_sgs->group_type)
+ return true;
+
+ if (sgs->group_type > idlest_sgs->group_type)
+ return false;
+
+ /*
+ * The candidate and the current idlest group are the same type of
+ * group. Let check which one is the idlest according to the type.
+ */
+
+ switch (sgs->group_type) {
+ case group_overloaded:
+ case group_fully_busy:
+ /* Select the group with lowest avg_load. */
+ if (idlest_sgs->avg_load <= sgs->avg_load)
+ return false;
+ break;
+
+ case group_imbalanced:
+ case group_asym_packing:
+ /* Those types are not used in the slow wakeup path */
+ return false;
+
+ case group_misfit_task:
+ /* Select group with the highest max capacity */
+ if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
+ return false;
+ break;
+
+ case group_has_spare:
+ /* Select group with most idle CPUs */
+ if (idlest_sgs->idle_cpus > sgs->idle_cpus)
+ return false;
+
+ /* Select group with lowest group_util */
+ if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
+ idlest_sgs->group_util <= sgs->group_util)
+ return false;
+
+ break;
+ }
+
+ return true;
+}
+
+/*
+ * find_idlest_group() finds and returns the least busy CPU group within the
+ * domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+{
+ struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
+ struct sg_lb_stats local_sgs, tmp_sgs;
+ struct sg_lb_stats *sgs;
+ unsigned long imbalance;
+ struct sg_lb_stats idlest_sgs = {
+ .avg_load = UINT_MAX,
+ .group_type = group_overloaded,
+ };
+
+ imbalance = scale_load_down(NICE_0_LOAD) *
+ (sd->imbalance_pct-100) / 100;
+
+ do {
+ int local_group;
+
+ /* Skip over this group if it has no CPUs allowed */
+ if (!cpumask_intersects(sched_group_span(group),
+ p->cpus_ptr))
+ continue;
+
+ local_group = cpumask_test_cpu(this_cpu,
+ sched_group_span(group));
+
+ if (local_group) {
+ sgs = &local_sgs;
+ local = group;
+ } else {
+ sgs = &tmp_sgs;
+ }
+
+ update_sg_wakeup_stats(sd, group, sgs, p);
+
+ if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
+ idlest = group;
+ idlest_sgs = *sgs;
+ }
+
+ } while (group = group->next, group != sd->groups);
+
+
+ /* There is no idlest group to push tasks to */
+ if (!idlest)
+ return NULL;
+
+ /* The local group has been skipped because of CPU affinity */
+ if (!local)
+ return idlest;
+
+ /*
+ * If the local group is idler than the selected idlest group
+ * don't try and push the task.
+ */
+ if (local_sgs.group_type < idlest_sgs.group_type)
+ return NULL;
+
+ /*
+ * If the local group is busier than the selected idlest group
+ * try and push the task.
+ */
+ if (local_sgs.group_type > idlest_sgs.group_type)
+ return idlest;
+
+ switch (local_sgs.group_type) {
+ case group_overloaded:
+ case group_fully_busy:
+ /*
+ * When comparing groups across NUMA domains, it's possible for
+ * the local domain to be very lightly loaded relative to the
+ * remote domains but "imbalance" skews the comparison making
+ * remote CPUs look much more favourable. When considering
+ * cross-domain, add imbalance to the load on the remote node
+ * and consider staying local.
+ */
+
+ if ((sd->flags & SD_NUMA) &&
+ ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
+ return NULL;
+
+ /*
+ * If the local group is less loaded than the selected
+ * idlest group don't try and push any tasks.
+ */
+ if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
+ return NULL;
+
+ if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
+ return NULL;
+ break;
+
+ case group_imbalanced:
+ case group_asym_packing:
+ /* Those type are not used in the slow wakeup path */
+ return NULL;
+
+ case group_misfit_task:
+ /* Select group with the highest max capacity */
+ if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
+ return NULL;
+ break;
+
+ case group_has_spare:
+ if (sd->flags & SD_NUMA) {
+#ifdef CONFIG_NUMA_BALANCING
+ int idlest_cpu;
+ /*
+ * If there is spare capacity at NUMA, try to select
+ * the preferred node
+ */
+ if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
+ return NULL;
+
+ idlest_cpu = cpumask_first(sched_group_span(idlest));
+ if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
+ return idlest;
+#endif
+ /*
+ * Otherwise, keep the task on this node to stay close
+ * its wakeup source and improve locality. If there is
+ * a real need of migration, periodic load balance will
+ * take care of it.
+ */
+ if (local_sgs.idle_cpus)
+ return NULL;
+ }
+
+ /*
+ * Select group with highest number of idle CPUs. We could also
+ * compare the utilization which is more stable but it can end
+ * up that the group has less spare capacity but finally more
+ * idle CPUs which means more opportunity to run task.
+ */
+ if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
+ return NULL;
+ break;
+ }
+
+ return idlest;
+}
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
* @sds: variable to hold the statistics for this sched_domain.
*/
+
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
- bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
int sg_status = 0;
#ifdef CONFIG_NO_HZ_COMMON
@@ -8313,22 +8996,6 @@
if (local_group)
goto next_group;
- /*
- * In case the child domain prefers tasks go to siblings
- * first, lower the sg capacity so that we'll try
- * and move all the excess tasks away. We lower the capacity
- * of a group only if the local group has the capacity to fit
- * these excess tasks. The extra check prevents the case where
- * you always pull from the heaviest group when it is already
- * under-utilized (possible with a large weight task outweighs
- * the tasks on the system).
- */
- if (prefer_sibling && sds->local &&
- group_has_capacity(env, local) &&
- (sgs->sum_nr_running > local->sum_nr_running + 1)) {
- sgs->group_no_capacity = 1;
- sgs->group_type = group_classify(sg, sgs);
- }
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -8337,13 +9004,15 @@
next_group:
/* Now, start updating sd_lb_stats */
- sds->total_running += sgs->sum_nr_running;
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
sg = sg->next;
} while (sg != env->sd->groups);
+ /* Tag domain that child domain prefers tasks go to siblings first */
+ sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+
#ifdef CONFIG_NO_HZ_COMMON
if ((env->flags & LBF_NOHZ_AGAIN) &&
cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
@@ -8373,119 +9042,19 @@
}
}
-/**
- * check_asym_packing - Check to see if the group is packed into the
- * sched domain.
- *
- * This is primarily intended to used at the sibling level. Some
- * cores like POWER7 prefer to use lower numbered SMT threads. In the
- * case of POWER7, it can move to lower SMT modes only when higher
- * threads are idle. When in lower SMT modes, the threads will
- * perform better since they share less core resources. Hence when we
- * have idle threads, we want them to be the higher ones.
- *
- * This packing function is run on idle threads. It checks to see if
- * the busiest CPU in this domain (core in the P7 case) has a higher
- * CPU number than the packing function is being run on. Here we are
- * assuming lower CPU number will be equivalent to lower a SMT thread
- * number.
- *
- * Return: 1 when packing is required and a task should be moved to
- * this CPU. The amount of the imbalance is returned in env->imbalance.
- *
- * @env: The load balancing environment.
- * @sds: Statistics of the sched_domain which is to be packed
- */
-static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
+static inline long adjust_numa_imbalance(int imbalance, int nr_running)
{
- int busiest_cpu;
-
- if (!(env->sd->flags & SD_ASYM_PACKING))
- return 0;
-
- if (env->idle == CPU_NOT_IDLE)
- return 0;
-
- if (!sds->busiest)
- return 0;
-
- busiest_cpu = sds->busiest->asym_prefer_cpu;
- if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
- return 0;
-
- env->imbalance = sds->busiest_stat.group_load;
-
- return 1;
-}
-
-/**
- * fix_small_imbalance - Calculate the minor imbalance that exists
- * amongst the groups of a sched_domain, during
- * load balancing.
- * @env: The load balancing environment.
- * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
- */
-static inline
-void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
-{
- unsigned long tmp, capa_now = 0, capa_move = 0;
- unsigned int imbn = 2;
- unsigned long scaled_busy_load_per_task;
- struct sg_lb_stats *local, *busiest;
-
- local = &sds->local_stat;
- busiest = &sds->busiest_stat;
-
- if (!local->sum_nr_running)
- local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
- else if (busiest->load_per_task > local->load_per_task)
- imbn = 1;
-
- scaled_busy_load_per_task =
- (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
- busiest->group_capacity;
-
- if (busiest->avg_load + scaled_busy_load_per_task >=
- local->avg_load + (scaled_busy_load_per_task * imbn)) {
- env->imbalance = busiest->load_per_task;
- return;
- }
+ unsigned int imbalance_min;
/*
- * OK, we don't have enough imbalance to justify moving tasks,
- * however we may be able to increase total CPU capacity used by
- * moving them.
+ * Allow a small imbalance based on a simple pair of communicating
+ * tasks that remain local when the source domain is almost idle.
*/
+ imbalance_min = 2;
+ if (nr_running <= imbalance_min)
+ return 0;
- capa_now += busiest->group_capacity *
- min(busiest->load_per_task, busiest->avg_load);
- capa_now += local->group_capacity *
- min(local->load_per_task, local->avg_load);
- capa_now /= SCHED_CAPACITY_SCALE;
-
- /* Amount of load we'd subtract */
- if (busiest->avg_load > scaled_busy_load_per_task) {
- capa_move += busiest->group_capacity *
- min(busiest->load_per_task,
- busiest->avg_load - scaled_busy_load_per_task);
- }
-
- /* Amount of load we'd add */
- if (busiest->avg_load * busiest->group_capacity <
- busiest->load_per_task * SCHED_CAPACITY_SCALE) {
- tmp = (busiest->avg_load * busiest->group_capacity) /
- local->group_capacity;
- } else {
- tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
- local->group_capacity;
- }
- capa_move += local->group_capacity *
- min(local->load_per_task, local->avg_load + tmp);
- capa_move /= SCHED_CAPACITY_SCALE;
-
- /* Move if we gain throughput */
- if (capa_move > capa_now)
- env->imbalance = busiest->load_per_task;
+ return imbalance;
}
/**
@@ -8496,81 +9065,164 @@
*/
static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
- unsigned long max_pull, load_above_capacity = ~0UL;
struct sg_lb_stats *local, *busiest;
local = &sds->local_stat;
busiest = &sds->busiest_stat;
+ if (busiest->group_type == group_misfit_task) {
+ /* Set imbalance to allow misfit tasks to be balanced. */
+ env->migration_type = migrate_misfit;
+ env->imbalance = 1;
+ return;
+ }
+
+ if (busiest->group_type == group_asym_packing) {
+ /*
+ * In case of asym capacity, we will try to migrate all load to
+ * the preferred CPU.
+ */
+ env->migration_type = migrate_task;
+ env->imbalance = busiest->sum_h_nr_running;
+ return;
+ }
+
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
- * to ensure CPU-load equilibrium, look at wider averages. XXX
+ * to ensure CPU-load equilibrium, try to move any task to fix
+ * the imbalance. The next load balance will take care of
+ * balancing back the system.
*/
- busiest->load_per_task =
- min(busiest->load_per_task, sds->avg_load);
+ env->migration_type = migrate_task;
+ env->imbalance = 1;
+ return;
}
/*
- * Avg load of busiest sg can be less and avg load of local sg can
- * be greater than avg load across all sgs of sd because avg load
- * factors in sg capacity and sgs with smaller group_type are
- * skipped when updating the busiest sg:
+ * Try to use spare capacity of local group without overloading it or
+ * emptying busiest.
*/
- if (busiest->group_type != group_misfit_task &&
- (busiest->avg_load <= sds->avg_load ||
- local->avg_load >= sds->avg_load)) {
- env->imbalance = 0;
- return fix_small_imbalance(env, sds);
+ if (local->group_type == group_has_spare) {
+ if ((busiest->group_type > group_fully_busy) &&
+ !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
+ /*
+ * If busiest is overloaded, try to fill spare
+ * capacity. This might end up creating spare capacity
+ * in busiest or busiest still being overloaded but
+ * there is no simple way to directly compute the
+ * amount of load to migrate in order to balance the
+ * system.
+ */
+ env->migration_type = migrate_util;
+ env->imbalance = max(local->group_capacity, local->group_util) -
+ local->group_util;
+
+ /*
+ * In some cases, the group's utilization is max or even
+ * higher than capacity because of migrations but the
+ * local CPU is (newly) idle. There is at least one
+ * waiting task in this overloaded busiest group. Let's
+ * try to pull it.
+ */
+ if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
+ env->migration_type = migrate_task;
+ env->imbalance = 1;
+ }
+
+ return;
+ }
+
+ if (busiest->group_weight == 1 || sds->prefer_sibling) {
+ unsigned int nr_diff = busiest->sum_nr_running;
+ /*
+ * When prefer sibling, evenly spread running tasks on
+ * groups.
+ */
+ env->migration_type = migrate_task;
+ lsub_positive(&nr_diff, local->sum_nr_running);
+ env->imbalance = nr_diff >> 1;
+ } else {
+
+ /*
+ * If there is no overload, we just want to even the number of
+ * idle cpus.
+ */
+ env->migration_type = migrate_task;
+ env->imbalance = max_t(long, 0, (local->idle_cpus -
+ busiest->idle_cpus) >> 1);
+ }
+
+ /* Consider allowing a small imbalance between NUMA groups */
+ if (env->sd->flags & SD_NUMA)
+ env->imbalance = adjust_numa_imbalance(env->imbalance,
+ busiest->sum_nr_running);
+
+ return;
}
/*
- * If there aren't any idle CPUs, avoid creating some.
+ * Local is fully busy but has to take more load to relieve the
+ * busiest group
*/
- if (busiest->group_type == group_overloaded &&
- local->group_type == group_overloaded) {
- load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
- if (load_above_capacity > busiest->group_capacity) {
- load_above_capacity -= busiest->group_capacity;
- load_above_capacity *= scale_load_down(NICE_0_LOAD);
- load_above_capacity /= busiest->group_capacity;
- } else
- load_above_capacity = ~0UL;
+ if (local->group_type < group_overloaded) {
+ /*
+ * Local will become overloaded so the avg_load metrics are
+ * finally needed.
+ */
+
+ local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
+ local->group_capacity;
+
+ sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
+ sds->total_capacity;
+ /*
+ * If the local group is more loaded than the selected
+ * busiest group don't try to pull any tasks.
+ */
+ if (local->avg_load >= busiest->avg_load) {
+ env->imbalance = 0;
+ return;
+ }
}
/*
- * We're trying to get all the CPUs to the average_load, so we don't
- * want to push ourselves above the average load, nor do we wish to
- * reduce the max loaded CPU below the average load. At the same time,
- * we also don't want to reduce the group load below the group
- * capacity. Thus we look for the minimum possible imbalance.
+ * Both group are or will become overloaded and we're trying to get all
+ * the CPUs to the average_load, so we don't want to push ourselves
+ * above the average load, nor do we wish to reduce the max loaded CPU
+ * below the average load. At the same time, we also don't want to
+ * reduce the group load below the group capacity. Thus we look for
+ * the minimum possible imbalance.
*/
- max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
-
- /* How much load to actually move to equalise the imbalance */
+ env->migration_type = migrate_load;
env->imbalance = min(
- max_pull * busiest->group_capacity,
+ (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
-
- /* Boost imbalance to allow misfit task to be balanced. */
- if (busiest->group_type == group_misfit_task) {
- env->imbalance = max_t(long, env->imbalance,
- busiest->group_misfit_task_load);
- }
-
- /*
- * if *imbalance is less than the average load per runnable task
- * there is no guarantee that any tasks will be moved so we'll have
- * a think about bumping its value to force at least one task to be
- * moved
- */
- if (env->imbalance < busiest->load_per_task)
- return fix_small_imbalance(env, sds);
}
/******* find_busiest_group() helpers end here *********************/
+/*
+ * Decision matrix according to the local and busiest group type:
+ *
+ * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
+ * has_spare nr_idle balanced N/A N/A balanced balanced
+ * fully_busy nr_idle nr_idle N/A N/A balanced balanced
+ * misfit_task force N/A N/A N/A force force
+ * asym_packing force force N/A N/A force force
+ * imbalanced force force N/A N/A force force
+ * overloaded force force N/A N/A force avg_load
+ *
+ * N/A : Not Applicable because already filtered while updating
+ * statistics.
+ * balanced : The system is balanced for these 2 groups.
+ * force : Calculate the imbalance as load migration is probably needed.
+ * avg_load : Only if imbalance is significant enough.
+ * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
+ * different in groups.
+ */
+
/**
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
@@ -8590,7 +9242,7 @@
init_sd_lb_stats(&sds);
/*
- * Compute the various statistics relavent for load balancing at
+ * Compute the various statistics relevant for load balancing at
* this level.
*/
update_sd_lb_stats(env, &sds);
@@ -8605,17 +9257,17 @@
local = &sds.local_stat;
busiest = &sds.busiest_stat;
- /* ASYM feature bypasses nice load balance check */
- if (check_asym_packing(env, &sds))
- return sds.busiest;
-
/* There is no busy sibling group to pull tasks from */
- if (!sds.busiest || busiest->sum_nr_running == 0)
+ if (!sds.busiest)
goto out_balanced;
- /* XXX broken for overlapping NUMA groups */
- sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
- / sds.total_capacity;
+ /* Misfit tasks should be dealt with regardless of the avg load */
+ if (busiest->group_type == group_misfit_task)
+ goto force_balance;
+
+ /* ASYM feature bypasses nice load balance check */
+ if (busiest->group_type == group_asym_packing)
+ goto force_balance;
/*
* If the busiest group is imbalanced the below checks don't
@@ -8626,55 +9278,80 @@
goto force_balance;
/*
- * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
- * capacities from resulting in underutilization due to avg_load.
- */
- if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
- busiest->group_no_capacity)
- goto force_balance;
-
- /* Misfit tasks should be dealt with regardless of the avg load */
- if (busiest->group_type == group_misfit_task)
- goto force_balance;
-
- /*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
*/
- if (local->avg_load >= busiest->avg_load)
+ if (local->group_type > busiest->group_type)
goto out_balanced;
/*
- * Don't pull any tasks if this group is already above the domain
- * average load.
+ * When groups are overloaded, use the avg_load to ensure fairness
+ * between tasks.
*/
- if (local->avg_load >= sds.avg_load)
- goto out_balanced;
-
- if (env->idle == CPU_IDLE) {
+ if (local->group_type == group_overloaded) {
/*
- * This CPU is idle. If the busiest group is not overloaded
- * and there is no imbalance between this and busiest group
- * wrt idle CPUs, it is balanced. The imbalance becomes
- * significant if the diff is greater than 1 otherwise we
- * might end up to just move the imbalance on another group
+ * If the local group is more loaded than the selected
+ * busiest group don't try to pull any tasks.
*/
- if ((busiest->group_type != group_overloaded) &&
- (local->idle_cpus <= (busiest->idle_cpus + 1)))
+ if (local->avg_load >= busiest->avg_load)
goto out_balanced;
- } else {
+
+ /* XXX broken for overlapping NUMA groups */
+ sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
+ sds.total_capacity;
+
/*
- * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
- * imbalance_pct to be conservative.
+ * Don't pull any tasks if this group is already above the
+ * domain average load.
+ */
+ if (local->avg_load >= sds.avg_load)
+ goto out_balanced;
+
+ /*
+ * If the busiest group is more loaded, use imbalance_pct to be
+ * conservative.
*/
if (100 * busiest->avg_load <=
env->sd->imbalance_pct * local->avg_load)
goto out_balanced;
}
+ /* Try to move all excess tasks to child's sibling domain */
+ if (sds.prefer_sibling && local->group_type == group_has_spare &&
+ busiest->sum_nr_running > local->sum_nr_running + 1)
+ goto force_balance;
+
+ if (busiest->group_type != group_overloaded) {
+ if (env->idle == CPU_NOT_IDLE)
+ /*
+ * If the busiest group is not overloaded (and as a
+ * result the local one too) but this CPU is already
+ * busy, let another idle CPU try to pull task.
+ */
+ goto out_balanced;
+
+ if (busiest->group_weight > 1 &&
+ local->idle_cpus <= (busiest->idle_cpus + 1))
+ /*
+ * If the busiest group is not overloaded
+ * and there is no imbalance between this and busiest
+ * group wrt idle CPUs, it is balanced. The imbalance
+ * becomes significant if the diff is greater than 1
+ * otherwise we might end up to just move the imbalance
+ * on another group. Of course this applies only if
+ * there is more than 1 CPU per group.
+ */
+ goto out_balanced;
+
+ if (busiest->sum_h_nr_running == 1)
+ /*
+ * busiest doesn't have any tasks waiting to run
+ */
+ goto out_balanced;
+ }
+
force_balance:
/* Looks like there is an imbalance. Compute it */
- env->src_grp_type = busiest->group_type;
calculate_imbalance(env, &sds);
return env->imbalance ? sds.busiest : NULL;
@@ -8690,11 +9367,13 @@
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
- unsigned long busiest_load = 0, busiest_capacity = 1;
+ unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
+ unsigned int busiest_nr = 0;
int i;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
- unsigned long capacity, load;
+ unsigned long capacity, load, util;
+ unsigned int nr_running;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -8722,20 +9401,8 @@
if (rt > env->fbq_type)
continue;
- /*
- * For ASYM_CPUCAPACITY domains with misfit tasks we simply
- * seek the "biggest" misfit task.
- */
- if (env->src_grp_type == group_misfit_task) {
- if (rq->misfit_task_load > busiest_load) {
- busiest_load = rq->misfit_task_load;
- busiest = rq;
- }
-
- continue;
- }
-
capacity = capacity_of(i);
+ nr_running = rq->cfs.h_nr_running;
/*
* For ASYM_CPUCAPACITY domains, don't pick a CPU that could
@@ -8745,35 +9412,77 @@
*/
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
capacity_of(env->dst_cpu) < capacity &&
- rq->nr_running == 1)
+ nr_running == 1)
continue;
- load = cpu_runnable_load(rq);
+ switch (env->migration_type) {
+ case migrate_load:
+ /*
+ * When comparing with load imbalance, use cpu_load()
+ * which is not scaled with the CPU capacity.
+ */
+ load = cpu_load(rq);
- /*
- * When comparing with imbalance, use cpu_runnable_load()
- * which is not scaled with the CPU capacity.
- */
+ if (nr_running == 1 && load > env->imbalance &&
+ !check_cpu_capacity(rq, env->sd))
+ break;
- if (rq->nr_running == 1 && load > env->imbalance &&
- !check_cpu_capacity(rq, env->sd))
- continue;
+ /*
+ * For the load comparisons with the other CPUs,
+ * consider the cpu_load() scaled with the CPU
+ * capacity, so that the load can be moved away
+ * from the CPU that is potentially running at a
+ * lower capacity.
+ *
+ * Thus we're looking for max(load_i / capacity_i),
+ * crosswise multiplication to rid ourselves of the
+ * division works out to:
+ * load_i * capacity_j > load_j * capacity_i;
+ * where j is our previous maximum.
+ */
+ if (load * busiest_capacity > busiest_load * capacity) {
+ busiest_load = load;
+ busiest_capacity = capacity;
+ busiest = rq;
+ }
+ break;
- /*
- * For the load comparisons with the other CPU's, consider
- * the cpu_runnable_load() scaled with the CPU capacity, so
- * that the load can be moved away from the CPU that is
- * potentially running at a lower capacity.
- *
- * Thus we're looking for max(load_i / capacity_i), crosswise
- * multiplication to rid ourselves of the division works out
- * to: load_i * capacity_j > load_j * capacity_i; where j is
- * our previous maximum.
- */
- if (load * busiest_capacity > busiest_load * capacity) {
- busiest_load = load;
- busiest_capacity = capacity;
- busiest = rq;
+ case migrate_util:
+ util = cpu_util(cpu_of(rq));
+
+ /*
+ * Don't try to pull utilization from a CPU with one
+ * running task. Whatever its utilization, we will fail
+ * detach the task.
+ */
+ if (nr_running <= 1)
+ continue;
+
+ if (busiest_util < util) {
+ busiest_util = util;
+ busiest = rq;
+ }
+ break;
+
+ case migrate_task:
+ if (busiest_nr < nr_running) {
+ busiest_nr = nr_running;
+ busiest = rq;
+ }
+ break;
+
+ case migrate_misfit:
+ /*
+ * For ASYM_CPUCAPACITY domains with misfit tasks we
+ * simply seek the "biggest" misfit task.
+ */
+ if (rq->misfit_task_load > busiest_load) {
+ busiest_load = rq->misfit_task_load;
+ busiest = rq;
+ }
+
+ break;
+
}
}
@@ -8819,7 +9528,7 @@
return 1;
}
- if (env->src_grp_type == group_misfit_task)
+ if (env->migration_type == migrate_misfit)
return 1;
return 0;
@@ -8840,7 +9549,7 @@
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
- int cpu, balance_cpu = -1;
+ int cpu;
/*
* Ensure the balancing environment is consistent; can happen
@@ -8861,18 +9570,12 @@
if (!idle_cpu(cpu))
continue;
- balance_cpu = cpu;
- break;
+ /* Are we the first idle CPU? */
+ return cpu == env->dst_cpu;
}
- if (balance_cpu == -1)
- balance_cpu = group_balance_cpu(sg);
-
- /*
- * First idle CPU or the first CPU(busiest) in this sched group
- * is eligible for doing load balancing at this and above domains.
- */
- return balance_cpu == env->dst_cpu;
+ /* Are we the first CPU of this group ? */
+ return group_balance_cpu(sg) == env->dst_cpu;
}
/*
@@ -9163,6 +9866,15 @@
/* scale ms to jiffies */
interval = msecs_to_jiffies(interval);
+
+ /*
+ * Reduce likelihood of busy balancing at higher domains racing with
+ * balancing at lower domains by preventing their balancing periods
+ * from being multiples of each other.
+ */
+ if (cpu_busy)
+ interval -= 1;
+
interval = clamp(interval, 1UL, max_load_balance_interval);
return interval;
@@ -9225,9 +9937,8 @@
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
- if ((sd->flags & SD_LOAD_BALANCE) &&
- cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
- break;
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+ break;
}
if (likely(sd)) {
@@ -9293,6 +10004,7 @@
{
int continue_balancing = 1;
int cpu = rq->cpu;
+ int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
@@ -9315,9 +10027,6 @@
}
max_cost += sd->max_newidle_lb_cost;
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
-
/*
* Stop the load balance at this level. There is another
* CPU in our sched group which is doing load balancing more
@@ -9329,7 +10038,7 @@
break;
}
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+ interval = get_sd_balance_interval(sd, busy);
need_serialize = sd->flags & SD_SERIALIZE;
if (need_serialize) {
@@ -9345,9 +10054,10 @@
* state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
+ busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
}
sd->last_balance = jiffies;
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+ interval = get_sd_balance_interval(sd, busy);
}
if (need_serialize)
spin_unlock(&balancing);
@@ -9438,17 +10148,20 @@
if (ilb_cpu >= nr_cpu_ids)
return;
+ /*
+ * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
+ * the first flag owns it; cleared by nohz_csd_func().
+ */
flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
if (flags & NOHZ_KICK_MASK)
return;
/*
- * Use smp_send_reschedule() instead of resched_cpu().
- * This way we generate a sched IPI on the target CPU which
+ * This way we generate an IPI on the target CPU which
* is idle. And the softirq performing nohz idle load balance
* will be run before returning from the IPI.
*/
- smp_send_reschedule(ilb_cpu);
+ smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
}
/*
@@ -9786,20 +10499,14 @@
*/
static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
- int this_cpu = this_rq->cpu;
- unsigned int flags;
+ unsigned int flags = this_rq->nohz_idle_balance;
- if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+ if (!flags)
return false;
- if (idle != CPU_IDLE) {
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
- return false;
- }
+ this_rq->nohz_idle_balance = 0;
- /* could be _relaxed() */
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
- if (!(flags & NOHZ_KICK_MASK))
+ if (idle != CPU_IDLE)
return false;
_nohz_idle_balance(this_rq, flags, idle);
@@ -9853,8 +10560,13 @@
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
+ *
+ * Returns:
+ * < 0 - we released the lock and there are !fair tasks present
+ * 0 - failed, no new tasks
+ * > 0 - success, new (fair) tasks present
*/
-int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
@@ -9905,9 +10617,6 @@
int continue_balancing = 1;
u64 t0, domain_cost;
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
-
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
update_next_balance(sd, &next_balance);
break;
@@ -10097,6 +10806,9 @@
if (!task_on_rq_queued(p))
return;
+ if (rq->cfs.nr_running == 1)
+ return;
+
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
@@ -10175,7 +10887,7 @@
/* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se);
- update_tg_load_avg(cfs_rq, false);
+ update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
}
@@ -10193,8 +10905,8 @@
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
- attach_entity_load_avg(cfs_rq, se, 0);
- update_tg_load_avg(cfs_rq, false);
+ attach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
}
@@ -10526,8 +11238,8 @@
/*
* All the scheduling class methods:
*/
-const struct sched_class fair_sched_class = {
- .next = &idle_sched_class,
+const struct sched_class fair_sched_class
+ __section("__fair_sched_class") = {
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
@@ -10535,7 +11247,7 @@
.check_preempt_curr = check_preempt_wakeup,
- .pick_next_task = pick_next_task_fair,
+ .pick_next_task = __pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
@@ -10691,6 +11403,18 @@
}
EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
+int sched_trace_rq_cpu_capacity(struct rq *rq)
+{
+ return rq ?
+#ifdef CONFIG_SMP
+ rq->cpu_capacity
+#else
+ SCHED_CAPACITY_SCALE
+#endif
+ : -1;
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
+
const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
{
#ifdef CONFIG_SMP
@@ -10700,3 +11424,9 @@
#endif
}
EXPORT_SYMBOL_GPL(sched_trace_rd_span);
+
+int sched_trace_rq_nr_running(struct rq *rq)
+{
+ return rq ? rq->nr_running : -1;
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 2410db5..f1bf5e1 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -77,7 +77,7 @@
SCHED_FEAT(RT_PUSH_IPI, true)
#endif
-SCHED_FEAT(RT_RUNTIME_SHARE, true)
+SCHED_FEAT(RT_RUNTIME_SHARE, false)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
@@ -89,3 +89,7 @@
* UtilEstimation. Use estimated CPU utilization.
*/
SCHED_FEAT(UTIL_EST, true)
+SCHED_FEAT(UTIL_EST_FASTUP, true)
+
+SCHED_FEAT(ALT_PERIOD, true)
+SCHED_FEAT(BASE_SLICE, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 3f8c786..2593a73 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -54,17 +54,18 @@
static noinline int __cpuidle cpu_idle_poll(void)
{
- rcu_idle_enter();
- trace_cpu_idle_rcuidle(0, smp_processor_id());
- local_irq_enable();
+ trace_cpu_idle(0, smp_processor_id());
stop_critical_timings();
+ rcu_idle_enter();
+ local_irq_enable();
while (!tif_need_resched() &&
- (cpu_idle_force_poll || tick_check_broadcast_expired()))
+ (cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
- start_critical_timings();
- trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+
rcu_idle_exit();
+ start_critical_timings();
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
return 1;
}
@@ -77,7 +78,7 @@
void __weak arch_cpu_idle(void)
{
cpu_idle_force_poll = 1;
- local_irq_enable();
+ raw_local_irq_enable();
}
/**
@@ -90,12 +91,52 @@
if (current_clr_polling_and_test()) {
local_irq_enable();
} else {
+
+ trace_cpu_idle(1, smp_processor_id());
stop_critical_timings();
+
+ /*
+ * arch_cpu_idle() is supposed to enable IRQs, however
+ * we can't do that because of RCU and tracing.
+ *
+ * Trace IRQs enable here, then switch off RCU, and have
+ * arch_cpu_idle() use raw_local_irq_enable(). Note that
+ * rcu_idle_enter() relies on lockdep IRQ state, so switch that
+ * last -- this is very similar to the entry code.
+ */
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(_THIS_IP_);
+ rcu_idle_enter();
+ lockdep_hardirqs_on(_THIS_IP_);
+
arch_cpu_idle();
+
+ /*
+ * OK, so IRQs are enabled here, but RCU needs them disabled to
+ * turn itself back on.. funny thing is that disabling IRQs
+ * will cause tracing, which needs RCU. Jump through hoops to
+ * make it 'work'.
+ */
+ raw_local_irq_disable();
+ lockdep_hardirqs_off(_THIS_IP_);
+ rcu_idle_exit();
+ lockdep_hardirqs_on(_THIS_IP_);
+ raw_local_irq_enable();
+
start_critical_timings();
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}
}
+static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev)
+{
+ if (current_clr_polling_and_test())
+ return -EBUSY;
+
+ return cpuidle_enter_s2idle(drv, dev);
+}
+
static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
int next_state)
{
@@ -104,7 +145,7 @@
* update no idle residency and return.
*/
if (current_clr_polling_and_test()) {
- dev->last_residency = 0;
+ dev->last_residency_ns = 0;
local_irq_enable();
return -EBUSY;
}
@@ -149,7 +190,6 @@
if (cpuidle_not_available(drv, dev)) {
tick_nohz_idle_stop_tick();
- rcu_idle_enter();
default_idle_call();
goto exit_idle;
@@ -158,30 +198,30 @@
/*
* Suspend-to-idle ("s2idle") is a system state in which all user space
* has been frozen, all I/O devices have been suspended and the only
- * activity happens here and in iterrupts (if any). In that case bypass
+ * activity happens here and in interrupts (if any). In that case bypass
* the cpuidle governor and go stratight for the deepest idle state
* available. Possibly also suspend the local tick and the entire
* timekeeping to prevent timer interrupts from kicking us out of idle
* until a proper wakeup interrupt happens.
*/
- if (idle_should_enter_s2idle() || dev->use_deepest_state) {
+ if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) {
+ u64 max_latency_ns;
+
if (idle_should_enter_s2idle()) {
- rcu_idle_enter();
- entered_state = cpuidle_enter_s2idle(drv, dev);
- if (entered_state > 0) {
- local_irq_enable();
+ entered_state = call_cpuidle_s2idle(drv, dev);
+ if (entered_state > 0)
goto exit_idle;
- }
- rcu_idle_exit();
+ max_latency_ns = U64_MAX;
+ } else {
+ max_latency_ns = dev->forced_idle_latency_limit_ns;
}
tick_nohz_idle_stop_tick();
- rcu_idle_enter();
- next_state = cpuidle_find_deepest_state(drv, dev);
+ next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns);
call_cpuidle(drv, dev, next_state);
} else {
bool stop_tick = true;
@@ -196,8 +236,6 @@
else
tick_nohz_idle_retain_tick();
- rcu_idle_enter();
-
entered_state = call_cpuidle(drv, dev, next_state);
/*
* Give the governor an opportunity to reflect on the outcome
@@ -213,8 +251,6 @@
*/
if (WARN_ON_ONCE(irqs_disabled()))
local_irq_enable();
-
- rcu_idle_exit();
}
/*
@@ -284,7 +320,11 @@
*/
smp_mb__after_atomic();
- sched_ttwu_pending();
+ /*
+ * RCU relies on this call to be done outside of an RCU read-side
+ * critical section.
+ */
+ flush_smp_call_function_from_idle();
schedule_idle();
if (unlikely(klp_patch_pending(current)))
@@ -312,7 +352,7 @@
return HRTIMER_NORESTART;
}
-void play_idle(unsigned long duration_us)
+void play_idle_precise(u64 duration_ns, u64 latency_ns)
{
struct idle_timer it;
@@ -324,29 +364,29 @@
WARN_ON_ONCE(current->nr_cpus_allowed != 1);
WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
- WARN_ON_ONCE(!duration_us);
+ WARN_ON_ONCE(!duration_ns);
rcu_sleep_check();
preempt_disable();
current->flags |= PF_IDLE;
- cpuidle_use_deepest_state(true);
+ cpuidle_use_deepest_state(latency_ns);
it.done = 0;
- hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
it.timer.function = idle_inject_timer_fn;
- hrtimer_start(&it.timer, ns_to_ktime(duration_us * NSEC_PER_USEC),
- HRTIMER_MODE_REL_PINNED);
+ hrtimer_start(&it.timer, ns_to_ktime(duration_ns),
+ HRTIMER_MODE_REL_PINNED_HARD);
while (!READ_ONCE(it.done))
do_idle();
- cpuidle_use_deepest_state(false);
+ cpuidle_use_deepest_state(0);
current->flags &= ~PF_IDLE;
preempt_fold_need_resched();
preempt_enable();
}
-EXPORT_SYMBOL_GPL(play_idle);
+EXPORT_SYMBOL_GPL(play_idle_precise);
void cpu_startup_entry(enum cpuhp_state state)
{
@@ -392,14 +432,10 @@
schedstat_inc(rq->sched_goidle);
}
-static struct task_struct *
-pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+struct task_struct *pick_next_task_idle(struct rq *rq)
{
struct task_struct *next = rq->idle;
- if (prev)
- put_prev_task(rq, prev);
-
set_next_task_idle(rq, next, true);
return next;
@@ -441,11 +477,6 @@
BUG();
}
-static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
-{
- return 0;
-}
-
static void update_curr_idle(struct rq *rq)
{
}
@@ -453,8 +484,8 @@
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
-const struct sched_class idle_sched_class = {
- /* .next is NULL */
+const struct sched_class idle_sched_class
+ __section("__idle_sched_class") = {
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
@@ -474,8 +505,6 @@
.task_tick = task_tick_idle,
- .get_rr_interval = get_rr_interval_idle,
-
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 9fcb2a6..5a6ea03 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -140,7 +140,8 @@
{
unsigned int flags;
- flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+ flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU |
+ HK_FLAG_MISC | HK_FLAG_KTHREAD;
return housekeeping_setup(str, flags);
}
@@ -149,6 +150,9 @@
static int __init housekeeping_isolcpus_setup(char *str)
{
unsigned int flags = 0;
+ bool illegal = false;
+ char *par;
+ int len;
while (isalpha(*str)) {
if (!strncmp(str, "nohz,", 5)) {
@@ -163,8 +167,28 @@
continue;
}
- pr_warn("isolcpus: Error, unknown flag\n");
- return 0;
+ if (!strncmp(str, "managed_irq,", 12)) {
+ str += 12;
+ flags |= HK_FLAG_MANAGED_IRQ;
+ continue;
+ }
+
+ /*
+ * Skip unknown sub-parameter and validate that it is not
+ * containing an invalid character.
+ */
+ for (par = str, len = 0; *str && *str != ','; str++, len++) {
+ if (!isalpha(*str) && *str != '_')
+ illegal = true;
+ }
+
+ if (illegal) {
+ pr_warn("isolcpus: Invalid flag %.*s\n", len, par);
+ return 0;
+ }
+
+ pr_info("isolcpus: Skipped unknown flag %.*s\n", len, par);
+ str++;
}
/* Default behaviour for isolcpus without flags */
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index de22da6..d2a6556 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -347,7 +347,7 @@
*
* Called from the global timer code.
*/
-void calc_global_load(unsigned long ticks)
+void calc_global_load(void)
{
unsigned long sample_window;
long active, delta;
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 46c142b..cc7cd51 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -18,12 +18,21 @@
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
#endif
+#ifdef CONFIG_RSEQ
+#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \
+ (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
+#else
+#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK 0
+#endif
+
#define MEMBARRIER_CMD_BITMASK \
(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
| MEMBARRIER_CMD_PRIVATE_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
- | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
+ | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
+ | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK)
static void ipi_mb(void *info)
{
@@ -47,6 +56,19 @@
sync_core_before_usermode();
}
+static void ipi_rseq(void *info)
+{
+ /*
+ * Ensure that all stores done by the calling thread are visible
+ * to the current task before the current task resumes. We could
+ * probably optimize this away on most architectures, but by the
+ * time we've already sent an IPI, the cost of the extra smp_mb()
+ * is negligible.
+ */
+ smp_mb();
+ rseq_preempt(current);
+}
+
static void ipi_sync_rq_state(void *info)
{
struct mm_struct *mm = (struct mm_struct *) info;
@@ -146,27 +168,35 @@
return 0;
}
-static int membarrier_private_expedited(int flags)
+static int membarrier_private_expedited(int flags, int cpu_id)
{
- int cpu;
cpumask_var_t tmpmask;
struct mm_struct *mm = current->mm;
smp_call_func_t ipi_func = ipi_mb;
- if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL;
if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
return -EPERM;
ipi_func = ipi_sync_core;
+ } else if (flags == MEMBARRIER_FLAG_RSEQ) {
+ if (!IS_ENABLED(CONFIG_RSEQ))
+ return -EINVAL;
+ if (!(atomic_read(&mm->membarrier_state) &
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
+ return -EPERM;
+ ipi_func = ipi_rseq;
} else {
+ WARN_ON_ONCE(flags);
if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
return -EPERM;
}
- if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
+ if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
+ (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
return 0;
/*
@@ -175,35 +205,73 @@
*/
smp_mb(); /* system call entry is not a mb. */
- if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
return -ENOMEM;
cpus_read_lock();
- rcu_read_lock();
- for_each_online_cpu(cpu) {
+
+ if (cpu_id >= 0) {
struct task_struct *p;
- /*
- * Skipping the current CPU is OK even through we can be
- * migrated at any point. The current CPU, at the point
- * where we read raw_smp_processor_id(), is ensured to
- * be in program order with respect to the caller
- * thread. Therefore, we can skip this CPU from the
- * iteration.
- */
- if (cpu == raw_smp_processor_id())
- continue;
- p = rcu_dereference(cpu_rq(cpu)->curr);
- if (p && p->mm == mm)
- __cpumask_set_cpu(cpu, tmpmask);
+ if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
+ goto out;
+ rcu_read_lock();
+ p = rcu_dereference(cpu_rq(cpu_id)->curr);
+ if (!p || p->mm != mm) {
+ rcu_read_unlock();
+ goto out;
+ }
+ rcu_read_unlock();
+ } else {
+ int cpu;
+
+ rcu_read_lock();
+ for_each_online_cpu(cpu) {
+ struct task_struct *p;
+
+ p = rcu_dereference(cpu_rq(cpu)->curr);
+ if (p && p->mm == mm)
+ __cpumask_set_cpu(cpu, tmpmask);
+ }
+ rcu_read_unlock();
}
- rcu_read_unlock();
- preempt_disable();
- smp_call_function_many(tmpmask, ipi_func, NULL, 1);
- preempt_enable();
+ if (cpu_id >= 0) {
+ /*
+ * smp_call_function_single() will call ipi_func() if cpu_id
+ * is the calling CPU.
+ */
+ smp_call_function_single(cpu_id, ipi_func, NULL, 1);
+ } else {
+ /*
+ * For regular membarrier, we can save a few cycles by
+ * skipping the current cpu -- we're about to do smp_mb()
+ * below, and if we migrate to a different cpu, this cpu
+ * and the new cpu will execute a full barrier in the
+ * scheduler.
+ *
+ * For SYNC_CORE, we do need a barrier on the current cpu --
+ * otherwise, if we are migrated and replaced by a different
+ * task in the same mm just before, during, or after
+ * membarrier, we will end up with some thread in the mm
+ * running without a core sync.
+ *
+ * For RSEQ, don't rseq_preempt() the caller. User code
+ * is not supposed to issue syscalls at all from inside an
+ * rseq critical section.
+ */
+ if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
+ preempt_disable();
+ smp_call_function_many(tmpmask, ipi_func, NULL, true);
+ preempt_enable();
+ } else {
+ on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
+ }
+ }
- free_cpumask_var(tmpmask);
+out:
+ if (cpu_id < 0)
+ free_cpumask_var(tmpmask);
cpus_read_unlock();
/*
@@ -248,7 +316,7 @@
/*
* For each cpu runqueue, if the task's mm match @mm, ensure that all
- * @mm's membarrier state set bits are also set in in the runqueue's
+ * @mm's membarrier state set bits are also set in the runqueue's
* membarrier state. This ensures that a runqueue scheduling
* between threads which are users of @mm has its membarrier state
* updated.
@@ -300,11 +368,18 @@
set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
ret;
- if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL;
ready_state =
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+ } else if (flags == MEMBARRIER_FLAG_RSEQ) {
+ if (!IS_ENABLED(CONFIG_RSEQ))
+ return -EINVAL;
+ ready_state =
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
+ } else {
+ WARN_ON_ONCE(flags);
}
/*
@@ -316,6 +391,8 @@
return 0;
if (flags & MEMBARRIER_FLAG_SYNC_CORE)
set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+ if (flags & MEMBARRIER_FLAG_RSEQ)
+ set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
atomic_or(set_state, &mm->membarrier_state);
ret = sync_runqueues_membarrier_state(mm);
if (ret)
@@ -327,8 +404,15 @@
/**
* sys_membarrier - issue memory barriers on a set of threads
- * @cmd: Takes command values defined in enum membarrier_cmd.
- * @flags: Currently needs to be 0. For future extensions.
+ * @cmd: Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0 for all commands other than
+ * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
+ * case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
+ * contains the CPU on which to interrupt (= restart)
+ * the RSEQ critical section.
+ * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
+ * RSEQ CS should be interrupted (@cmd must be
+ * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
*
* If this system call is not implemented, -ENOSYS is returned. If the
* command specified does not exist, not available on the running
@@ -354,10 +438,21 @@
* smp_mb() X O O
* sys_membarrier() O O O
*/
-SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{
- if (unlikely(flags))
- return -EINVAL;
+ switch (cmd) {
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
+ return -EINVAL;
+ break;
+ default:
+ if (unlikely(flags))
+ return -EINVAL;
+ }
+
+ if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
+ cpu_id = -1;
+
switch (cmd) {
case MEMBARRIER_CMD_QUERY:
{
@@ -379,13 +474,17 @@
case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
return membarrier_register_global_expedited();
case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
- return membarrier_private_expedited(0);
+ return membarrier_private_expedited(0, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
return membarrier_register_private_expedited(0);
case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
- return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+ return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+ return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
default:
return -EINVAL;
}
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index a96db50..2c613e1 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -28,8 +28,6 @@
#include "sched.h"
#include "pelt.h"
-#include <trace/events/sched.h>
-
/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
@@ -83,8 +81,6 @@
return c1 + c2 + c3;
}
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
-
/*
* Accumulate the three separate parts of the sum; d1 the remainder
* of the last (incomplete) period, d2 the span of full periods and d3
@@ -121,23 +117,35 @@
*/
if (periods) {
sa->load_sum = decay_load(sa->load_sum, periods);
- sa->runnable_load_sum =
- decay_load(sa->runnable_load_sum, periods);
+ sa->runnable_sum =
+ decay_load(sa->runnable_sum, periods);
sa->util_sum = decay_load((u64)(sa->util_sum), periods);
/*
* Step 2
*/
delta %= 1024;
- contrib = __accumulate_pelt_segments(periods,
- 1024 - sa->period_contrib, delta);
+ if (load) {
+ /*
+ * This relies on the:
+ *
+ * if (!load)
+ * runnable = running = 0;
+ *
+ * clause from ___update_load_sum(); this results in
+ * the below usage of @contrib to dissapear entirely,
+ * so no point in calculating it.
+ */
+ contrib = __accumulate_pelt_segments(periods,
+ 1024 - sa->period_contrib, delta);
+ }
}
sa->period_contrib = delta;
if (load)
sa->load_sum += load * contrib;
if (runnable)
- sa->runnable_load_sum += runnable * contrib;
+ sa->runnable_sum += runnable * contrib << SCHED_CAPACITY_SHIFT;
if (running)
sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;
@@ -205,7 +213,9 @@
* This means that weight will be 0 but not running for a sched_entity
* but also for a cfs_rq if the latter becomes idle. As an example,
* this happens during idle_balance() which calls
- * update_blocked_averages()
+ * update_blocked_averages().
+ *
+ * Also see the comment in accumulate_sum().
*/
if (!load)
runnable = running = 0;
@@ -223,16 +233,40 @@
return 1;
}
+/*
+ * When syncing *_avg with *_sum, we must take into account the current
+ * position in the PELT segment otherwise the remaining part of the segment
+ * will be considered as idle time whereas it's not yet elapsed and this will
+ * generate unwanted oscillation in the range [1002..1024[.
+ *
+ * The max value of *_sum varies with the position in the time segment and is
+ * equals to :
+ *
+ * LOAD_AVG_MAX*y + sa->period_contrib
+ *
+ * which can be simplified into:
+ *
+ * LOAD_AVG_MAX - 1024 + sa->period_contrib
+ *
+ * because LOAD_AVG_MAX*y == LOAD_AVG_MAX-1024
+ *
+ * The same care must be taken when a sched entity is added, updated or
+ * removed from a cfs_rq and we need to update sched_avg. Scheduler entities
+ * and the cfs rq, to which they are attached, have the same position in the
+ * time segment because they use the same clock. This means that we can use
+ * the period_contrib of cfs_rq when updating the sched_avg of a sched_entity
+ * if it's more convenient.
+ */
static __always_inline void
-___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
+___update_load_avg(struct sched_avg *sa, unsigned long load)
{
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+ u32 divider = get_pelt_divider(sa);
/*
* Step 2: update *_avg.
*/
sa->load_avg = div_u64(load * sa->load_sum, divider);
- sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
+ sa->runnable_avg = div_u64(sa->runnable_sum, divider);
WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
}
@@ -240,33 +274,32 @@
* sched_entity:
*
* task:
- * se_runnable() == se_weight()
+ * se_weight() = se->load.weight
+ * se_runnable() = !!on_rq
*
* group: [ see update_cfs_group() ]
* se_weight() = tg->weight * grq->load_avg / tg->load_avg
- * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
+ * se_runnable() = grq->h_nr_running
*
- * load_sum := runnable_sum
- * load_avg = se_weight(se) * runnable_avg
+ * runnable_sum = se_runnable() * runnable = grq->runnable_sum
+ * runnable_avg = runnable_sum
*
- * runnable_load_sum := runnable_sum
- * runnable_load_avg = se_runnable(se) * runnable_avg
- *
- * XXX collapse load_sum and runnable_load_sum
+ * load_sum := runnable
+ * load_avg = se_weight(se) * load_sum
*
* cfq_rq:
*
+ * runnable_sum = \Sum se->avg.runnable_sum
+ * runnable_avg = \Sum se->avg.runnable_avg
+ *
* load_sum = \Sum se_weight(se) * se->avg.load_sum
* load_avg = \Sum se->avg.load_avg
- *
- * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
- * runnable_load_avg = \Sum se->avg.runable_load_avg
*/
int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
{
if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
- ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ ___update_load_avg(&se->avg, se_weight(se));
trace_pelt_se_tp(se);
return 1;
}
@@ -276,10 +309,10 @@
int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq,
+ if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
cfs_rq->curr == se)) {
- ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ ___update_load_avg(&se->avg, se_weight(se));
cfs_se_util_change(&se->avg);
trace_pelt_se_tp(se);
return 1;
@@ -292,10 +325,10 @@
{
if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
- scale_load_down(cfs_rq->runnable_weight),
+ cfs_rq->h_nr_running,
cfs_rq->curr != NULL)) {
- ___update_load_avg(&cfs_rq->avg, 1, 1);
+ ___update_load_avg(&cfs_rq->avg, 1);
trace_pelt_cfs_tp(cfs_rq);
return 1;
}
@@ -308,9 +341,9 @@
*
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
* util_sum = cpu_scale * load_sum
- * runnable_load_sum = load_sum
+ * runnable_sum = util_sum
*
- * load_avg and runnable_load_avg are not supported and meaningless.
+ * load_avg and runnable_avg are not supported and meaningless.
*
*/
@@ -321,7 +354,7 @@
running,
running)) {
- ___update_load_avg(&rq->avg_rt, 1, 1);
+ ___update_load_avg(&rq->avg_rt, 1);
trace_pelt_rt_tp(rq);
return 1;
}
@@ -334,7 +367,9 @@
*
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
* util_sum = cpu_scale * load_sum
- * runnable_load_sum = load_sum
+ * runnable_sum = util_sum
+ *
+ * load_avg and runnable_avg are not supported and meaningless.
*
*/
@@ -345,7 +380,7 @@
running,
running)) {
- ___update_load_avg(&rq->avg_dl, 1, 1);
+ ___update_load_avg(&rq->avg_dl, 1);
trace_pelt_dl_tp(rq);
return 1;
}
@@ -353,13 +388,46 @@
return 0;
}
+#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+/*
+ * thermal:
+ *
+ * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
+ *
+ * util_avg and runnable_load_avg are not supported and meaningless.
+ *
+ * Unlike rt/dl utilization tracking that track time spent by a cpu
+ * running a rt/dl task through util_avg, the average thermal pressure is
+ * tracked through load_avg. This is because thermal pressure signal is
+ * time weighted "delta" capacity unlike util_avg which is binary.
+ * "delta capacity" = actual capacity -
+ * capped capacity a cpu due to a thermal event.
+ */
+
+int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+{
+ if (___update_load_sum(now, &rq->avg_thermal,
+ capacity,
+ capacity,
+ capacity)) {
+ ___update_load_avg(&rq->avg_thermal, 1);
+ trace_pelt_thermal_tp(rq);
+ return 1;
+ }
+
+ return 0;
+}
+#endif
+
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
/*
* irq:
*
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
* util_sum = cpu_scale * load_sum
- * runnable_load_sum = load_sum
+ * runnable_sum = util_sum
+ *
+ * load_avg and runnable_avg are not supported and meaningless.
*
*/
@@ -396,7 +464,7 @@
1);
if (ret) {
- ___update_load_avg(&rq->avg_irq, 1, 1);
+ ___update_load_avg(&rq->avg_irq, 1);
trace_pelt_irq_tp(rq);
}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index afff644..45bf08e 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -7,6 +7,26 @@
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
+#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
+
+static inline u64 thermal_load_avg(struct rq *rq)
+{
+ return READ_ONCE(rq->avg_thermal.load_avg);
+}
+#else
+static inline int
+update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+{
+ return 0;
+}
+
+static inline u64 thermal_load_avg(struct rq *rq)
+{
+ return 0;
+}
+#endif
+
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
int update_irq_load_avg(struct rq *rq, u64 running);
#else
@@ -17,14 +37,12 @@
}
#endif
-/*
- * When a task is dequeued, its estimated utilization should not be update if
- * its util_avg has not been updated at least once.
- * This flag is used to synchronize util_avg updates with util_est updates.
- * We map this information into the LSB bit of the utilization saved at
- * dequeue time (i.e. util_est.dequeued).
- */
-#define UTIL_AVG_UNCHANGED 0x1
+#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024)
+
+static inline u32 get_pelt_divider(struct sched_avg *avg)
+{
+ return PELT_MIN_DIVIDER + avg->period_contrib;
+}
static inline void cfs_se_util_change(struct sched_avg *avg)
{
@@ -33,7 +51,7 @@
if (!sched_feat(UTIL_EST))
return;
- /* Avoid store if the flag has been already set */
+ /* Avoid store if the flag has been already reset */
enqueued = avg->util_est.enqueued;
if (!(enqueued & UTIL_AVG_UNCHANGED))
return;
@@ -159,6 +177,17 @@
}
static inline int
+update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+{
+ return 0;
+}
+
+static inline u64 thermal_load_avg(struct rq *rq)
+{
+ return 0;
+}
+
+static inline int
update_irq_load_avg(struct rq *rq, u64 running)
{
return 0;
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 9154e74..b7f38f3 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -179,6 +179,8 @@
static void psi_avgs_work(struct work_struct *work);
+static void poll_timer_fn(struct timer_list *t);
+
static void group_init(struct psi_group *group)
{
int cpu;
@@ -190,7 +192,6 @@
INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
/* Init trigger-related members */
- atomic_set(&group->poll_scheduled, 0);
mutex_init(&group->trigger_lock);
INIT_LIST_HEAD(&group->triggers);
memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
@@ -199,7 +200,9 @@
memset(group->polling_total, 0, sizeof(group->polling_total));
group->polling_next_update = ULLONG_MAX;
group->polling_until = 0;
- rcu_assign_pointer(group->poll_kworker, NULL);
+ init_waitqueue_head(&group->poll_wait);
+ timer_setup(&group->poll_timer, poll_timer_fn, 0);
+ rcu_assign_pointer(group->poll_task, NULL);
}
void __init psi_init(void)
@@ -225,7 +228,7 @@
case PSI_MEM_FULL:
return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
case PSI_CPU_SOME:
- return tasks[NR_RUNNING] > 1;
+ return tasks[NR_RUNNING] > tasks[NR_ONCPU];
case PSI_NONIDLE:
return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
tasks[NR_RUNNING];
@@ -547,47 +550,38 @@
return now + group->poll_min_period;
}
-/*
- * Schedule polling if it's not already scheduled. It's safe to call even from
- * hotpath because even though kthread_queue_delayed_work takes worker->lock
- * spinlock that spinlock is never contended due to poll_scheduled atomic
- * preventing such competition.
- */
+/* Schedule polling if it's not already scheduled. */
static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
{
- struct kthread_worker *kworker;
+ struct task_struct *task;
- /* Do not reschedule if already scheduled */
- if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
+ /*
+ * Do not reschedule if already scheduled.
+ * Possible race with a timer scheduled after this check but before
+ * mod_timer below can be tolerated because group->polling_next_update
+ * will keep updates on schedule.
+ */
+ if (timer_pending(&group->poll_timer))
return;
rcu_read_lock();
- kworker = rcu_dereference(group->poll_kworker);
+ task = rcu_dereference(group->poll_task);
/*
* kworker might be NULL in case psi_trigger_destroy races with
* psi_task_change (hotpath) which can't use locks
*/
- if (likely(kworker))
- kthread_queue_delayed_work(kworker, &group->poll_work, delay);
- else
- atomic_set(&group->poll_scheduled, 0);
+ if (likely(task))
+ mod_timer(&group->poll_timer, jiffies + delay);
rcu_read_unlock();
}
-static void psi_poll_work(struct kthread_work *work)
+static void psi_poll_work(struct psi_group *group)
{
- struct kthread_delayed_work *dwork;
- struct psi_group *group;
u32 changed_states;
u64 now;
- dwork = container_of(work, struct kthread_delayed_work, work);
- group = container_of(dwork, struct psi_group, poll_work);
-
- atomic_set(&group->poll_scheduled, 0);
-
mutex_lock(&group->trigger_lock);
now = sched_clock();
@@ -623,6 +617,32 @@
mutex_unlock(&group->trigger_lock);
}
+static int psi_poll_worker(void *data)
+{
+ struct psi_group *group = (struct psi_group *)data;
+
+ sched_set_fifo_low(current);
+
+ while (true) {
+ wait_event_interruptible(group->poll_wait,
+ atomic_cmpxchg(&group->poll_wakeup, 1, 0) ||
+ kthread_should_stop());
+ if (kthread_should_stop())
+ break;
+
+ psi_poll_work(group);
+ }
+ return 0;
+}
+
+static void poll_timer_fn(struct timer_list *t)
+{
+ struct psi_group *group = from_timer(group, t, poll_timer);
+
+ atomic_set(&group->poll_wakeup, 1);
+ wake_up_interruptible(&group->poll_wait);
+}
+
static void record_times(struct psi_group_cpu *groupc, int cpu,
bool memstall_tick)
{
@@ -669,13 +689,14 @@
groupc->times[PSI_NONIDLE] += delta;
}
-static u32 psi_group_change(struct psi_group *group, int cpu,
- unsigned int clear, unsigned int set)
+static void psi_group_change(struct psi_group *group, int cpu,
+ unsigned int clear, unsigned int set,
+ bool wake_clock)
{
struct psi_group_cpu *groupc;
+ u32 state_mask = 0;
unsigned int t, m;
enum psi_states s;
- u32 state_mask = 0;
groupc = per_cpu_ptr(group->pcpu, cpu);
@@ -694,14 +715,15 @@
for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
if (!(m & (1 << t)))
continue;
- if (groupc->tasks[t] == 0 && !psi_bug) {
- printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n",
+ if (groupc->tasks[t]) {
+ groupc->tasks[t]--;
+ } else if (!psi_bug) {
+ printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
- clear, set);
+ groupc->tasks[3], clear, set);
psi_bug = 1;
}
- groupc->tasks[t]--;
}
for (t = 0; set; set &= ~(1 << t), t++)
@@ -717,7 +739,11 @@
write_seqcount_end(&groupc->seq);
- return state_mask;
+ if (state_mask & group->poll_states)
+ psi_schedule_poll_work(group, 1);
+
+ if (wake_clock && !delayed_work_pending(&group->avgs_work))
+ schedule_delayed_work(&group->avgs_work, PSI_FREQ);
}
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
@@ -744,6 +770,21 @@
return &psi_system;
}
+static void psi_flags_change(struct task_struct *task, int clear, int set)
+{
+ if (((task->psi_flags & set) ||
+ (task->psi_flags & clear) != clear) &&
+ !psi_bug) {
+ printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
+ task->pid, task->comm, task_cpu(task),
+ task->psi_flags, clear, set);
+ psi_bug = 1;
+ }
+
+ task->psi_flags &= ~clear;
+ task->psi_flags |= set;
+}
+
void psi_task_change(struct task_struct *task, int clear, int set)
{
int cpu = task_cpu(task);
@@ -754,17 +795,7 @@
if (!task->pid)
return;
- if (((task->psi_flags & set) ||
- (task->psi_flags & clear) != clear) &&
- !psi_bug) {
- printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
- task->pid, task->comm, cpu,
- task->psi_flags, clear, set);
- psi_bug = 1;
- }
-
- task->psi_flags &= ~clear;
- task->psi_flags |= set;
+ psi_flags_change(task, clear, set);
/*
* Periodic aggregation shuts off if there is a period of no
@@ -777,14 +808,51 @@
wq_worker_last_func(task) == psi_avgs_work))
wake_clock = false;
- while ((group = iterate_groups(task, &iter))) {
- u32 state_mask = psi_group_change(group, cpu, clear, set);
+ while ((group = iterate_groups(task, &iter)))
+ psi_group_change(group, cpu, clear, set, wake_clock);
+}
- if (state_mask & group->poll_states)
- psi_schedule_poll_work(group, 1);
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+ bool sleep)
+{
+ struct psi_group *group, *common = NULL;
+ int cpu = task_cpu(prev);
+ void *iter;
- if (wake_clock && !delayed_work_pending(&group->avgs_work))
- schedule_delayed_work(&group->avgs_work, PSI_FREQ);
+ if (next->pid) {
+ psi_flags_change(next, 0, TSK_ONCPU);
+ /*
+ * When moving state between tasks, the group that
+ * contains them both does not change: we can stop
+ * updating the tree once we reach the first common
+ * ancestor. Iterate @next's ancestors until we
+ * encounter @prev's state.
+ */
+ iter = NULL;
+ while ((group = iterate_groups(next, &iter))) {
+ if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+ common = group;
+ break;
+ }
+
+ psi_group_change(group, cpu, 0, TSK_ONCPU, true);
+ }
+ }
+
+ /*
+ * If this is a voluntary sleep, dequeue will have taken care
+ * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
+ * only need to deal with it during preemption.
+ */
+ if (sleep)
+ return;
+
+ if (prev->pid) {
+ psi_flags_change(prev, TSK_ONCPU, 0);
+
+ iter = NULL;
+ while ((group = iterate_groups(prev, &iter)) && group != common)
+ psi_group_change(group, cpu, TSK_ONCPU, 0, true);
}
}
@@ -818,17 +886,17 @@
if (static_branch_likely(&psi_disabled))
return;
- *flags = current->flags & PF_MEMSTALL;
+ *flags = current->in_memstall;
if (*flags)
return;
/*
- * PF_MEMSTALL setting & accounting needs to be atomic wrt
+ * in_memstall setting & accounting needs to be atomic wrt
* changes to the task's scheduling state, otherwise we can
* race with CPU migration.
*/
rq = this_rq_lock_irq(&rf);
- current->flags |= PF_MEMSTALL;
+ current->in_memstall = 1;
psi_task_change(current, 0, TSK_MEMSTALL);
rq_unlock_irq(rq, &rf);
@@ -851,13 +919,13 @@
if (*flags)
return;
/*
- * PF_MEMSTALL clearing & accounting needs to be atomic wrt
+ * in_memstall clearing & accounting needs to be atomic wrt
* changes to the task's scheduling state, otherwise we could
* race with CPU migration.
*/
rq = this_rq_lock_irq(&rf);
- current->flags &= ~PF_MEMSTALL;
+ current->in_memstall = 0;
psi_task_change(current, TSK_MEMSTALL, 0);
rq_unlock_irq(rq, &rf);
@@ -916,12 +984,14 @@
rq = task_rq_lock(task, &rf);
- if (task_on_rq_queued(task))
+ if (task_on_rq_queued(task)) {
task_flags = TSK_RUNNING;
- else if (task->in_iowait)
+ if (task_current(rq, task))
+ task_flags |= TSK_ONCPU;
+ } else if (task->in_iowait)
task_flags = TSK_IOWAIT;
- if (task->flags & PF_MEMSTALL)
+ if (task->in_memstall)
task_flags |= TSK_MEMSTALL;
if (task_flags)
@@ -1046,26 +1116,21 @@
t->event = 0;
t->last_event_time = 0;
init_waitqueue_head(&t->event_wait);
- kref_init(&t->refcount);
mutex_lock(&group->trigger_lock);
- if (!rcu_access_pointer(group->poll_kworker)) {
- struct sched_param param = {
- .sched_priority = 1,
- };
- struct kthread_worker *kworker;
+ if (!rcu_access_pointer(group->poll_task)) {
+ struct task_struct *task;
- kworker = kthread_create_worker(0, "psimon");
- if (IS_ERR(kworker)) {
+ task = kthread_create(psi_poll_worker, group, "psimon");
+ if (IS_ERR(task)) {
kfree(t);
mutex_unlock(&group->trigger_lock);
- return ERR_CAST(kworker);
+ return ERR_CAST(task);
}
- sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m);
- kthread_init_delayed_work(&group->poll_work,
- psi_poll_work);
- rcu_assign_pointer(group->poll_kworker, kworker);
+ atomic_set(&group->poll_wakeup, 0);
+ wake_up_process(task);
+ rcu_assign_pointer(group->poll_task, task);
}
list_add(&t->node, &group->triggers);
@@ -1079,15 +1144,19 @@
return t;
}
-static void psi_trigger_destroy(struct kref *ref)
+void psi_trigger_destroy(struct psi_trigger *t)
{
- struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
- struct psi_group *group = t->group;
- struct kthread_worker *kworker_to_destroy = NULL;
+ struct psi_group *group;
+ struct task_struct *task_to_destroy = NULL;
- if (static_branch_likely(&psi_disabled))
+ /*
+ * We do not check psi_disabled since it might have been disabled after
+ * the trigger got created.
+ */
+ if (!t)
return;
+ group = t->group;
/*
* Wakeup waiters to stop polling. Can happen if cgroup is deleted
* from under a polling process.
@@ -1109,55 +1178,39 @@
period = min(period, div_u64(tmp->win.size,
UPDATES_PER_WINDOW));
group->poll_min_period = period;
- /* Destroy poll_kworker when the last trigger is destroyed */
+ /* Destroy poll_task when the last trigger is destroyed */
if (group->poll_states == 0) {
group->polling_until = 0;
- kworker_to_destroy = rcu_dereference_protected(
- group->poll_kworker,
+ task_to_destroy = rcu_dereference_protected(
+ group->poll_task,
lockdep_is_held(&group->trigger_lock));
- rcu_assign_pointer(group->poll_kworker, NULL);
+ rcu_assign_pointer(group->poll_task, NULL);
+ del_timer(&group->poll_timer);
}
}
mutex_unlock(&group->trigger_lock);
/*
- * Wait for both *trigger_ptr from psi_trigger_replace and
- * poll_kworker RCUs to complete their read-side critical sections
- * before destroying the trigger and optionally the poll_kworker
+ * Wait for psi_schedule_poll_work RCU to complete its read-side
+ * critical section before destroying the trigger and optionally the
+ * poll_task.
*/
synchronize_rcu();
/*
- * Destroy the kworker after releasing trigger_lock to prevent a
+ * Stop kthread 'psimon' after releasing trigger_lock to prevent a
* deadlock while waiting for psi_poll_work to acquire trigger_lock
*/
- if (kworker_to_destroy) {
+ if (task_to_destroy) {
/*
* After the RCU grace period has expired, the worker
- * can no longer be found through group->poll_kworker.
- * But it might have been already scheduled before
- * that - deschedule it cleanly before destroying it.
+ * can no longer be found through group->poll_task.
*/
- kthread_cancel_delayed_work_sync(&group->poll_work);
- atomic_set(&group->poll_scheduled, 0);
-
- kthread_destroy_worker(kworker_to_destroy);
+ kthread_stop(task_to_destroy);
}
kfree(t);
}
-void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
-{
- struct psi_trigger *old = *trigger_ptr;
-
- if (static_branch_likely(&psi_disabled))
- return;
-
- rcu_assign_pointer(*trigger_ptr, new);
- if (old)
- kref_put(&old->refcount, psi_trigger_destroy);
-}
-
__poll_t psi_trigger_poll(void **trigger_ptr,
struct file *file, poll_table *wait)
{
@@ -1167,24 +1220,15 @@
if (static_branch_likely(&psi_disabled))
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- rcu_read_lock();
-
- t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
- if (!t) {
- rcu_read_unlock();
+ t = smp_load_acquire(trigger_ptr);
+ if (!t)
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- }
- kref_get(&t->refcount);
-
- rcu_read_unlock();
poll_wait(file, &t->event_wait, wait);
if (cmpxchg(&t->event, 1, 0) == 1)
ret |= EPOLLPRI;
- kref_put(&t->refcount, psi_trigger_destroy);
-
return ret;
}
@@ -1208,14 +1252,24 @@
buf[buf_size - 1] = '\0';
- new = psi_trigger_create(&psi_system, buf, nbytes, res);
- if (IS_ERR(new))
- return PTR_ERR(new);
-
seq = file->private_data;
+
/* Take seq->lock to protect seq->private from concurrent writes */
mutex_lock(&seq->lock);
- psi_trigger_replace(&seq->private, new);
+
+ /* Allow only one trigger per file descriptor */
+ if (seq->private) {
+ mutex_unlock(&seq->lock);
+ return -EBUSY;
+ }
+
+ new = psi_trigger_create(&psi_system, buf, nbytes, res);
+ if (IS_ERR(new)) {
+ mutex_unlock(&seq->lock);
+ return PTR_ERR(new);
+ }
+
+ smp_store_release(&seq->private, new);
mutex_unlock(&seq->lock);
return nbytes;
@@ -1250,43 +1304,45 @@
{
struct seq_file *seq = file->private_data;
- psi_trigger_replace(&seq->private, NULL);
+ psi_trigger_destroy(seq->private);
return single_release(inode, file);
}
-static const struct file_operations psi_io_fops = {
- .open = psi_io_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = psi_io_write,
- .poll = psi_fop_poll,
- .release = psi_fop_release,
+static const struct proc_ops psi_io_proc_ops = {
+ .proc_open = psi_io_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_io_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
};
-static const struct file_operations psi_memory_fops = {
- .open = psi_memory_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = psi_memory_write,
- .poll = psi_fop_poll,
- .release = psi_fop_release,
+static const struct proc_ops psi_memory_proc_ops = {
+ .proc_open = psi_memory_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_memory_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
};
-static const struct file_operations psi_cpu_fops = {
- .open = psi_cpu_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = psi_cpu_write,
- .poll = psi_fop_poll,
- .release = psi_fop_release,
+static const struct proc_ops psi_cpu_proc_ops = {
+ .proc_open = psi_cpu_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_cpu_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
};
static int __init psi_proc_init(void)
{
- proc_mkdir("pressure", NULL);
- proc_create("pressure/io", 0, NULL, &psi_io_fops);
- proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
- proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
+ if (psi_enable) {
+ proc_mkdir("pressure", NULL);
+ proc_create("pressure/io", 0, NULL, &psi_io_proc_ops);
+ proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops);
+ proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops);
+ }
return 0;
}
module_init(psi_proc_init);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 2dffb87..41b14d9 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -52,11 +52,8 @@
rt_b->rt_period_timer.function = sched_rt_period_timer;
}
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
- if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
- return;
-
raw_spin_lock(&rt_b->rt_runtime_lock);
if (!rt_b->rt_period_active) {
rt_b->rt_period_active = 1;
@@ -75,6 +72,14 @@
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
+ return;
+
+ do_start_rt_bandwidth(rt_b);
+}
+
void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
@@ -439,6 +444,45 @@
return rt_se->on_rq;
}
+#ifdef CONFIG_UCLAMP_TASK
+/*
+ * Verify the fitness of task @p to run on @cpu taking into account the uclamp
+ * settings.
+ *
+ * This check is only important for heterogeneous systems where uclamp_min value
+ * is higher than the capacity of a @cpu. For non-heterogeneous system this
+ * function will always return true.
+ *
+ * The function will return true if the capacity of the @cpu is >= the
+ * uclamp_min and false otherwise.
+ *
+ * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
+ * > uclamp_max.
+ */
+static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
+{
+ unsigned int min_cap;
+ unsigned int max_cap;
+ unsigned int cpu_cap;
+
+ /* Only heterogeneous systems can benefit from this check */
+ if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ return true;
+
+ min_cap = uclamp_eff_value(p, UCLAMP_MIN);
+ max_cap = uclamp_eff_value(p, UCLAMP_MAX);
+
+ cpu_cap = capacity_orig_of(cpu);
+
+ return cpu_cap >= min(min_cap, max_cap);
+}
+#else
+static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
+{
+ return true;
+}
+#endif
+
#ifdef CONFIG_RT_GROUP_SCHED
static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
@@ -983,13 +1027,17 @@
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ int exceeded;
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
- if (sched_rt_runtime_exceeded(rt_rq))
+ exceeded = sched_rt_runtime_exceeded(rt_rq);
+ if (exceeded)
resched_curr(rq);
raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ if (exceeded)
+ do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
}
}
}
@@ -1393,6 +1441,7 @@
{
struct task_struct *curr;
struct rq *rq;
+ bool test;
/* For anything but wake ups, just return the task_cpu */
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
@@ -1424,13 +1473,26 @@
*
* This test is optimistic, if we get it wrong the load-balancer
* will have to sort it out.
+ *
+ * We take into account the capacity of the CPU to ensure it fits the
+ * requirement of the task - which is only important on heterogeneous
+ * systems like big.LITTLE.
*/
- if (curr && unlikely(rt_task(curr)) &&
- (curr->nr_cpus_allowed < 2 ||
- curr->prio <= p->prio)) {
+ test = curr &&
+ unlikely(rt_task(curr)) &&
+ (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
+
+ if (test || !rt_task_fits_capacity(p, cpu)) {
int target = find_lowest_rq(p);
/*
+ * Bail out if we were forcing a migration to find a better
+ * fitting CPU but our search failed.
+ */
+ if (!test && target != -1 && !rt_task_fits_capacity(p, target))
+ goto out_unlock;
+
+ /*
* Don't bother moving it if the destination CPU is
* not running a lower priority task.
*/
@@ -1438,6 +1500,8 @@
p->prio < cpu_rq(target)->rt.highest_prio.curr)
cpu = target;
}
+
+out_unlock:
rcu_read_unlock();
out:
@@ -1458,8 +1522,8 @@
* p is migratable, so let's not schedule it and
* see if it is pushed or pulled somewhere else.
*/
- if (p->nr_cpus_allowed != 1
- && cpupri_find(&rq->rd->cpupri, p, NULL))
+ if (p->nr_cpus_allowed != 1 &&
+ cpupri_find(&rq->rd->cpupri, p, NULL))
return;
/*
@@ -1569,13 +1633,10 @@
return rt_task_of(rt_se);
}
-static struct task_struct *
-pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static struct task_struct *pick_next_task_rt(struct rq *rq)
{
struct task_struct *p;
- WARN_ON_ONCE(prev || rf);
-
if (!sched_rt_runnable(rq))
return NULL;
@@ -1640,6 +1701,7 @@
struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
+ int ret;
/* Make sure the mask is initialized first */
if (unlikely(!lowest_mask))
@@ -1648,7 +1710,22 @@
if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */
- if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+ /*
+ * If we're on asym system ensure we consider the different capacities
+ * of the CPUs when searching for the lowest_mask.
+ */
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+
+ ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
+ task, lowest_mask,
+ rt_task_fits_capacity);
+ } else {
+
+ ret = cpupri_find(&task_rq(task)->rd->cpupri,
+ task, lowest_mask);
+ }
+
+ if (!ret)
return -1; /* No targets found */
/*
@@ -2152,12 +2229,14 @@
*/
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
- if (!task_running(rq, p) &&
- !test_tsk_need_resched(rq->curr) &&
- p->nr_cpus_allowed > 1 &&
- (dl_task(rq->curr) || rt_task(rq->curr)) &&
- (rq->curr->nr_cpus_allowed < 2 ||
- rq->curr->prio <= p->prio))
+ bool need_to_push = !task_running(rq, p) &&
+ !test_tsk_need_resched(rq->curr) &&
+ p->nr_cpus_allowed > 1 &&
+ (dl_task(rq->curr) || rt_task(rq->curr)) &&
+ (rq->curr->nr_cpus_allowed < 2 ||
+ rq->curr->prio <= p->prio);
+
+ if (need_to_push)
push_rt_tasks(rq);
}
@@ -2366,8 +2445,8 @@
return 0;
}
-const struct sched_class rt_sched_class = {
- .next = &fair_sched_class,
+const struct sched_class rt_sched_class
+ __section("__rt_sched_class") = {
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
@@ -2408,10 +2487,11 @@
*/
static DEFINE_MUTEX(rt_constraints_mutex);
-/* Must be called with tasklist_lock held */
static inline int tg_has_rt_tasks(struct task_group *tg)
{
- struct task_struct *g, *p;
+ struct task_struct *task;
+ struct css_task_iter it;
+ int ret = 0;
/*
* Autogroups do not have RT tasks; see autogroup_create().
@@ -2419,12 +2499,12 @@
if (task_group_is_autogroup(tg))
return 0;
- for_each_process_thread(g, p) {
- if (rt_task(p) && task_group(p) == tg)
- return 1;
- }
+ css_task_iter_start(&tg->css, 0, &it);
+ while (!ret && (task = css_task_iter_next(&it)))
+ ret |= rt_task(task);
+ css_task_iter_end(&it);
- return 0;
+ return ret;
}
struct rt_schedulable_data {
@@ -2455,9 +2535,10 @@
return -EINVAL;
/*
- * Ensure we don't starve existing RT tasks.
+ * Ensure we don't starve existing RT tasks if runtime turns zero.
*/
- if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+ if (rt_bandwidth_enabled() && !runtime &&
+ tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
return -EBUSY;
total = to_ratio(period, runtime);
@@ -2529,7 +2610,6 @@
return -EINVAL;
mutex_lock(&rt_constraints_mutex);
- read_lock(&tasklist_lock);
err = __rt_schedulable(tg, rt_period, rt_runtime);
if (err)
goto unlock;
@@ -2547,7 +2627,6 @@
}
raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
unlock:
- read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);
return err;
@@ -2606,9 +2685,7 @@
int ret = 0;
mutex_lock(&rt_constraints_mutex);
- read_lock(&tasklist_lock);
ret = __rt_schedulable(NULL, 0, 0);
- read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);
return ret;
@@ -2659,13 +2736,16 @@
static void sched_rt_do_global(void)
{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
def_rt_bandwidth.rt_runtime = global_rt_runtime();
def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+ raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
}
-int sched_rt_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int old_period, old_runtime;
static DEFINE_MUTEX(mutex);
@@ -2703,9 +2783,8 @@
return ret;
}
-int sched_rr_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int ret;
static DEFINE_MUTEX(mutex);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fe755c1..08db8e0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -67,6 +67,7 @@
#include <linux/tsacct_kern.h>
#include <asm/tlb.h>
+#include <asm-generic/vmlinux.lds.h>
#ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>
@@ -75,6 +76,8 @@
#include "cpupri.h"
#include "cpudeadline.h"
+#include <trace/events/sched.h>
+
#ifdef CONFIG_SCHED_DEBUG
# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
#else
@@ -96,6 +99,7 @@
extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq, long adjust);
+extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
@@ -195,6 +199,19 @@
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+static inline void update_avg(u64 *avg, u64 sample)
+{
+ s64 diff = sample - *avg;
+ *avg += diff / 8;
+}
+
+/*
+ * Shifting a value by an exponent greater *or equal* to the size of said value
+ * is UB; cap at size-1.
+ */
+#define shr_bound(val, shift) \
+ (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1))
+
/*
* !! For sched_setattr_nocheck() (kernel) only !!
*
@@ -300,14 +317,28 @@
__dl_update(dl_b, -((s32)tsk_bw / cpus));
}
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap,
+ u64 old_bw, u64 new_bw)
{
return dl_b->bw != -1 &&
- dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+ cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
}
-extern void dl_change_utilization(struct task_struct *p, u64 new_bw);
+/*
+ * Verify the fitness of task @p to run on @cpu taking into account the
+ * CPU original capacity and the runtime/deadline ratio of the task.
+ *
+ * The function will return true if the CPU original capacity of the
+ * @cpu scaled by SCHED_CAPACITY_SCALE >= runtime/deadline ratio of the
+ * task and false otherwise.
+ */
+static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu)
+{
+ unsigned long cap = arch_scale_cpu_capacity(cpu);
+
+ return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime;
+}
+
extern void init_dl_bw(struct dl_bw *dl_b);
extern int sched_dl_global_validate(void);
extern void sched_dl_do_global(void);
@@ -340,7 +371,6 @@
u8 idle;
u8 period_active;
- u8 distribute_running;
u8 slack_started;
struct hrtimer period_timer;
struct hrtimer slack_timer;
@@ -491,7 +521,6 @@
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
- unsigned long runnable_weight;
unsigned int nr_running;
unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
unsigned int idle_h_nr_running; /* SCHED_IDLE */
@@ -530,7 +559,7 @@
int nr;
unsigned long load_avg;
unsigned long util_avg;
- unsigned long runnable_sum;
+ unsigned long runnable_avg;
} removed;
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -690,8 +719,30 @@
#ifdef CONFIG_FAIR_GROUP_SCHED
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
+
+static inline void se_update_runnable(struct sched_entity *se)
+{
+ if (!entity_is_task(se))
+ se->runnable_weight = se->my_q->h_nr_running;
+}
+
+static inline long se_runnable(struct sched_entity *se)
+{
+ if (entity_is_task(se))
+ return !!se->on_rq;
+ else
+ return se->runnable_weight;
+}
+
#else
#define entity_is_task(se) 1
+
+static inline void se_update_runnable(struct sched_entity *se) {}
+
+static inline long se_runnable(struct sched_entity *se)
+{
+ return !!se->on_rq;
+}
#endif
#ifdef CONFIG_SMP
@@ -703,10 +754,6 @@
return scale_load_down(se->load.weight);
}
-static inline long se_runnable(struct sched_entity *se)
-{
- return scale_load_down(se->runnable_weight);
-}
static inline bool sched_asym_prefer(int a, int b)
{
@@ -864,15 +911,17 @@
#endif
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMP
- unsigned long last_load_update_tick;
unsigned long last_blocked_load_update_tick;
unsigned int has_blocked_load;
+ call_single_data_t nohz_csd;
#endif /* CONFIG_SMP */
unsigned int nohz_tick_stopped;
- atomic_t nohz_flags;
+ atomic_t nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */
- unsigned long nr_load_updates;
+#ifdef CONFIG_SMP
+ unsigned int ttwu_pending;
+#endif
u64 nr_switches;
#ifdef CONFIG_UCLAMP_TASK
@@ -900,7 +949,7 @@
*/
unsigned long nr_uninterruptible;
- struct task_struct *curr;
+ struct task_struct __rcu *curr;
struct task_struct *idle;
struct task_struct *stop;
unsigned long next_balance;
@@ -928,6 +977,7 @@
struct callback_head *balance_callback;
+ unsigned char nohz_idle_balance;
unsigned char idle_balance;
unsigned long misfit_task_load;
@@ -948,12 +998,15 @@
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
struct sched_avg avg_irq;
#endif
+#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+ struct sched_avg avg_thermal;
+#endif
u64 idle_stamp;
u64 avg_idle;
/* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost;
-#endif
+#endif /* CONFIG_SMP */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
@@ -971,7 +1024,6 @@
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
- int hrtick_csd_pending;
call_single_data_t hrtick_csd;
#endif
struct hrtimer hrtick_timer;
@@ -996,10 +1048,6 @@
unsigned int ttwu_local;
#endif
-#ifdef CONFIG_SMP
- struct llist_head wake_list;
-#endif
-
#ifdef CONFIG_CPU_IDLE
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
@@ -1112,6 +1160,24 @@
return rq->clock_task;
}
+/**
+ * By default the decay is the default pelt decay period.
+ * The decay shift can change the decay period in
+ * multiples of 32.
+ * Decay shift Decay period(ms)
+ * 0 32
+ * 1 64
+ * 2 128
+ * 3 256
+ * 4 512
+ */
+extern int sched_thermal_decay_shift;
+
+static inline u64 rq_clock_thermal(struct rq *rq)
+{
+ return rq_clock_task(rq) >> sched_thermal_decay_shift;
+}
+
static inline void rq_clock_skip_update(struct rq *rq)
{
lockdep_assert_held(&rq->lock);
@@ -1141,6 +1207,16 @@
#endif
};
+/*
+ * Lockdep annotation that avoids accidental unlocks; it's like a
+ * sticky/continuous lockdep_assert_held().
+ *
+ * This avoids code that has access to 'struct rq *rq' (basically everything in
+ * the scheduler) from accidentally unlocking the rq if they do not also have a
+ * copy of the (on-stack) 'struct rq_flags rf'.
+ *
+ * Also see Documentation/locking/lockdep-design.rst.
+ */
static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
{
rf->cookie = lockdep_pin_lock(&rq->lock);
@@ -1325,8 +1401,6 @@
rq->balance_callback = head;
}
-extern void sched_ttwu_pending(void);
-
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
lockdep_is_held(&sched_domains_mutex))
@@ -1342,8 +1416,6 @@
for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
__sd; __sd = __sd->parent)
-#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
-
/**
* highest_flag_domain - Return highest sched_domain containing flag.
* @cpu: The CPU whose highest level of sched domain is to
@@ -1403,7 +1475,7 @@
int id;
#endif
- unsigned long cpumask[0]; /* Balance mask */
+ unsigned long cpumask[]; /* Balance mask */
};
struct sched_group {
@@ -1421,7 +1493,7 @@
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
*/
- unsigned long cpumask[0];
+ unsigned long cpumask[];
};
static inline struct cpumask *sched_group_span(struct sched_group *sg)
@@ -1464,15 +1536,11 @@
}
#endif
-extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+extern void flush_smp_call_function_from_idle(void);
-#else
-
-static inline void sched_ttwu_pending(void) { }
-
-static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; }
-
-#endif /* CONFIG_SMP */
+#else /* !CONFIG_SMP: */
+static inline void flush_smp_call_function_from_idle(void) { }
+#endif
#include "stats.h"
#include "autogroup.h"
@@ -1655,7 +1723,8 @@
*/
#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
#define WF_FORK 0x02 /* Child wakeup after fork */
-#define WF_MIGRATED 0x4 /* Internal use, task got migrated */
+#define WF_MIGRATED 0x04 /* Internal use, task got migrated */
+#define WF_ON_CPU 0x08 /* Wakee is on_cpu */
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1712,7 +1781,6 @@
#define RETRY_TASK ((void *)-1UL)
struct sched_class {
- const struct sched_class *next;
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
@@ -1721,24 +1789,12 @@
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
- bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
+ bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
- /*
- * Both @prev and @rf are optional and may be NULL, in which case the
- * caller must already have invoked put_prev_task(rq, prev, rf).
- *
- * Otherwise it is the responsibility of the pick_next_task() to call
- * put_prev_task() on the @prev task or something equivalent, IFF it
- * returns a next task.
- *
- * In that case (@rf != NULL) it may return RETRY_TASK when it finds a
- * higher prio class has runnable tasks.
- */
- struct task_struct * (*pick_next_task)(struct rq *rq,
- struct task_struct *prev,
- struct rq_flags *rf);
+ struct task_struct *(*pick_next_task)(struct rq *rq);
+
void (*put_prev_task)(struct rq *rq, struct task_struct *p);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
@@ -1781,7 +1837,7 @@
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_change_group)(struct task_struct *p, int type);
#endif
-};
+} __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
@@ -1795,17 +1851,18 @@
next->sched_class->set_next_task(rq, next, false);
}
-#ifdef CONFIG_SMP
-#define sched_class_highest (&stop_sched_class)
-#else
-#define sched_class_highest (&dl_sched_class)
-#endif
+/* Defined in include/asm-generic/vmlinux.lds.h */
+extern struct sched_class __begin_sched_classes[];
+extern struct sched_class __end_sched_classes[];
+
+#define sched_class_highest (__end_sched_classes - 1)
+#define sched_class_lowest (__begin_sched_classes - 1)
#define for_class_range(class, _from, _to) \
- for (class = (_from); class != (_to); class = class->next)
+ for (class = (_from); class != (_to); class--)
#define for_each_class(class) \
- for_class_range(class, sched_class_highest, NULL)
+ for_class_range(class, sched_class_highest, sched_class_lowest)
extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
@@ -1833,6 +1890,9 @@
return rq->cfs.nr_running > 0;
}
+extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+extern struct task_struct *pick_next_task_idle(struct rq *rq);
+
#ifdef CONFIG_SMP
extern void update_group_capacity(struct sched_domain *sd, int cpu);
@@ -1890,7 +1950,6 @@
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
-extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
@@ -1913,12 +1972,7 @@
*/
static inline void sched_update_tick_dependency(struct rq *rq)
{
- int cpu;
-
- if (!tick_nohz_full_enabled())
- return;
-
- cpu = cpu_of(rq);
+ int cpu = cpu_of(rq);
if (!tick_nohz_full_cpu(cpu))
return;
@@ -1938,6 +1992,9 @@
unsigned prev_nr = rq->nr_running;
rq->nr_running = prev_nr + count;
+ if (trace_sched_update_nr_running_tp_enabled()) {
+ call_trace_sched_update_nr_running(rq, count);
+ }
#ifdef CONFIG_SMP
if (prev_nr < 2 && rq->nr_running >= 2) {
@@ -1952,6 +2009,10 @@
static inline void sub_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running -= count;
+ if (trace_sched_update_nr_running_tp_enabled()) {
+ call_trace_sched_update_nr_running(rq, -count);
+ }
+
/* Check if we still need preemption */
sched_update_tick_dependency(rq);
}
@@ -1991,7 +2052,24 @@
#endif /* CONFIG_SCHED_HRTICK */
+#ifndef arch_scale_freq_tick
+static __always_inline
+void arch_scale_freq_tick(void)
+{
+}
+#endif
+
#ifndef arch_scale_freq_capacity
+/**
+ * arch_scale_freq_capacity - get the frequency scale factor of a given CPU.
+ * @cpu: the CPU in question.
+ *
+ * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e.
+ *
+ * f_curr
+ * ------ * SCHED_CAPACITY_SCALE
+ * f_max
+ */
static __always_inline
unsigned long arch_scale_freq_capacity(int cpu)
{
@@ -2323,10 +2401,10 @@
#endif /* CONFIG_CPU_FREQ */
#ifdef CONFIG_UCLAMP_TASK
-unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
/**
- * uclamp_util_with - clamp @util with @rq and @p effective uclamp values.
+ * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values.
* @rq: The rq to clamp against. Must not be NULL.
* @util: The util value to clamp.
* @p: The task to clamp against. Can be NULL if you want to clamp
@@ -2343,23 +2421,30 @@
* static key is disabled.
*/
static __always_inline
-unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
- struct task_struct *p)
+unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
+ struct task_struct *p)
{
- unsigned int min_util;
- unsigned int max_util;
+ unsigned long min_util = 0;
+ unsigned long max_util = 0;
if (!static_branch_likely(&sched_uclamp_used))
return util;
- min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
- max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
-
if (p) {
- min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
- max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX));
+ min_util = uclamp_eff_value(p, UCLAMP_MIN);
+ max_util = uclamp_eff_value(p, UCLAMP_MAX);
+
+ /*
+ * Ignore last runnable task's max clamp, as this task will
+ * reset it. Similarly, no need to read the rq's min clamp.
+ */
+ if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
+ goto out;
}
+ min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value));
+ max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value));
+out:
/*
* Since CPU's {min,max}_util clamps are MAX aggregated considering
* RUNNABLE tasks with _different_ clamps, we can end up with an
@@ -2371,11 +2456,6 @@
return clamp(util, min_util, max_util);
}
-static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
-{
- return uclamp_util_with(rq, util, NULL);
-}
-
/*
* When uclamp is compiled in, the aggregation at rq level is 'turned off'
* by default in the fast path and only gets turned on once userspace performs
@@ -2389,12 +2469,9 @@
return static_branch_likely(&sched_uclamp_used);
}
#else /* CONFIG_UCLAMP_TASK */
-static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
- struct task_struct *p)
-{
- return util;
-}
-static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
+static inline
+unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
+ struct task_struct *p)
{
return util;
}
@@ -2551,3 +2628,19 @@
{
}
#endif
+
+#ifdef CONFIG_SMP
+static inline bool is_per_cpu_kthread(struct task_struct *p)
+{
+ if (!(p->flags & PF_KTHREAD))
+ return false;
+
+ if (p->nr_cpus_allowed != 1)
+ return false;
+
+ return true;
+}
+#endif
+
+void swake_up_all_locked(struct swait_queue_head *q);
+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
new file mode 100644
index 0000000..9620e32
--- /dev/null
+++ b/kernel/sched/smp.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Scheduler internal SMP callback types and methods between the scheduler
+ * and other internal parts of the core kernel:
+ */
+
+extern void sched_ttwu_pending(void *arg);
+
+extern void send_call_function_single_ipi(int cpu);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index ba683fe..33d0daf 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -70,7 +70,7 @@
return;
if (!wakeup || p->sched_psi_wake_requeue) {
- if (p->flags & PF_MEMSTALL)
+ if (p->in_memstall)
set |= TSK_MEMSTALL;
if (p->sched_psi_wake_requeue)
p->sched_psi_wake_requeue = 0;
@@ -90,9 +90,17 @@
return;
if (!sleep) {
- if (p->flags & PF_MEMSTALL)
+ if (p->in_memstall)
clear |= TSK_MEMSTALL;
} else {
+ /*
+ * When a task sleeps, schedule() dequeues it before
+ * switching to the next one. Merge the clearing of
+ * TSK_RUNNING and TSK_ONCPU to save an unnecessary
+ * psi_task_change() call in psi_sched_switch().
+ */
+ clear |= TSK_ONCPU;
+
if (p->in_iowait)
set |= TSK_IOWAIT;
}
@@ -109,14 +117,14 @@
* deregister its sleep-persistent psi states from the old
* queue, and let psi_enqueue() know it has to requeue.
*/
- if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) {
+ if (unlikely(p->in_iowait || p->in_memstall)) {
struct rq_flags rf;
struct rq *rq;
int clear = 0;
if (p->in_iowait)
clear |= TSK_IOWAIT;
- if (p->flags & PF_MEMSTALL)
+ if (p->in_memstall)
clear |= TSK_MEMSTALL;
rq = __task_rq_lock(p, &rf);
@@ -126,18 +134,31 @@
}
}
+static inline void psi_sched_switch(struct task_struct *prev,
+ struct task_struct *next,
+ bool sleep)
+{
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ psi_task_switch(prev, next, sleep);
+}
+
static inline void psi_task_tick(struct rq *rq)
{
if (static_branch_likely(&psi_disabled))
return;
- if (unlikely(rq->curr->flags & PF_MEMSTALL))
+ if (unlikely(rq->curr->in_memstall))
psi_memstall_tick(rq->curr, cpu_of(rq));
}
#else /* CONFIG_PSI */
static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
static inline void psi_ttwu_dequeue(struct task_struct *p) {}
+static inline void psi_sched_switch(struct task_struct *prev,
+ struct task_struct *next,
+ bool sleep) {}
static inline void psi_task_tick(struct rq *rq) {}
#endif /* CONFIG_PSI */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 02dc0a8..ceb5b6b 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -34,11 +34,8 @@
stop->se.exec_start = rq_clock_task(rq);
}
-static struct task_struct *
-pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static struct task_struct *pick_next_task_stop(struct rq *rq)
{
- WARN_ON_ONCE(prev || rf);
-
if (!sched_stop_runnable(rq))
return NULL;
@@ -105,12 +102,6 @@
BUG(); /* how!?, what priority? */
}
-static unsigned int
-get_rr_interval_stop(struct rq *rq, struct task_struct *task)
-{
- return 0;
-}
-
static void update_curr_stop(struct rq *rq)
{
}
@@ -118,8 +109,8 @@
/*
* Simple, special scheduling class for the per-CPU stop tasks:
*/
-const struct sched_class stop_sched_class = {
- .next = &dl_sched_class,
+const struct sched_class stop_sched_class
+ __section("__stop_sched_class") = {
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
@@ -139,8 +130,6 @@
.task_tick = task_tick_stop,
- .get_rr_interval = get_rr_interval_stop,
-
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
.update_curr = update_curr_stop,
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index e83a3f8..e1c655f 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -32,6 +32,19 @@
}
EXPORT_SYMBOL(swake_up_locked);
+/*
+ * Wake up all waiters. This is an interface which is solely exposed for
+ * completions and not for general usage.
+ *
+ * It is intentionally different from swake_up_all() to allow usage from
+ * hard interrupt context and interrupt disabled regions.
+ */
+void swake_up_all_locked(struct swait_queue_head *q)
+{
+ while (!list_empty(&q->task_list))
+ swake_up_locked(q);
+}
+
void swake_up_one(struct swait_queue_head *q)
{
unsigned long flags;
@@ -69,7 +82,7 @@
}
EXPORT_SYMBOL(swake_up_all);
-static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
{
wait->task = current;
if (list_empty(&wait->task_list))
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ffaa97a..ff2c6d3 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -25,22 +25,22 @@
return sched_debug_enabled;
}
+#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
+const struct sd_flag_debug sd_flag_debug[] = {
+#include <linux/sched/sd_flags.h>
+};
+#undef SD_FLAG
+
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
struct cpumask *groupmask)
{
struct sched_group *group = sd->groups;
+ unsigned long flags = sd->flags;
+ unsigned int idx;
cpumask_clear(groupmask);
printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
-
- if (!(sd->flags & SD_LOAD_BALANCE)) {
- printk("does not load-balance\n");
- if (sd->parent)
- printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
- return -1;
- }
-
printk(KERN_CONT "span=%*pbl level=%s\n",
cpumask_pr_args(sched_domain_span(sd)), sd->name);
@@ -51,6 +51,21 @@
printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
}
+ for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+ unsigned int flag = BIT(idx);
+ unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
+
+ if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
+ !(sd->child->flags & flag))
+ printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
+ sd_flag_debug[idx].name);
+
+ if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
+ !(sd->parent->flags & flag))
+ printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
+ sd_flag_debug[idx].name);
+ }
+
printk(KERN_DEBUG "%*s groups:", level + 1, "");
do {
if (!group) {
@@ -145,23 +160,22 @@
}
#endif /* CONFIG_SCHED_DEBUG */
+/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
+static const unsigned int SD_DEGENERATE_GROUPS_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
static int sd_degenerate(struct sched_domain *sd)
{
if (cpumask_weight(sched_domain_span(sd)) == 1)
return 1;
/* Following flags need at least 2 groups */
- if (sd->flags & (SD_LOAD_BALANCE |
- SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_SHARE_CPUCAPACITY |
- SD_ASYM_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_SHARE_POWERDOMAIN)) {
- if (sd->groups != sd->groups->next)
- return 0;
- }
+ if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
+ (sd->groups != sd->groups->next))
+ return 0;
/* Following flags don't use groups */
if (sd->flags & (SD_WAKE_AFFINE))
@@ -182,19 +196,9 @@
return 0;
/* Flags needing groups don't count if only 1 group in parent */
- if (parent->groups == parent->groups->next) {
- pflags &= ~(SD_LOAD_BALANCE |
- SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_ASYM_CPUCAPACITY |
- SD_SHARE_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_PREFER_SIBLING |
- SD_SHARE_POWERDOMAIN);
- if (nr_node_ids == 1)
- pflags &= ~SD_SERIALIZE;
- }
+ if (parent->groups == parent->groups->next)
+ pflags &= ~SD_DEGENERATE_GROUPS_MASK;
+
if (~cflags & pflags)
return 0;
@@ -209,7 +213,7 @@
#ifdef CONFIG_PROC_SYSCTL
int sched_energy_aware_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret, state;
@@ -282,10 +286,10 @@
printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
while (pd) {
- printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
+ printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
cpumask_first(perf_domain_span(pd)),
cpumask_pr_args(perf_domain_span(pd)),
- em_pd_nr_cap_states(pd->em_pd));
+ em_pd_nr_perf_states(pd->em_pd));
pd = pd->next;
}
@@ -317,31 +321,32 @@
* EAS can be used on a root domain if it meets all the following conditions:
* 1. an Energy Model (EM) is available;
* 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
- * 3. the EM complexity is low enough to keep scheduling overheads low;
- * 4. schedutil is driving the frequency of all CPUs of the rd;
+ * 3. no SMT is detected.
+ * 4. the EM complexity is low enough to keep scheduling overheads low;
+ * 5. schedutil is driving the frequency of all CPUs of the rd;
*
* The complexity of the Energy Model is defined as:
*
- * C = nr_pd * (nr_cpus + nr_cs)
+ * C = nr_pd * (nr_cpus + nr_ps)
*
* with parameters defined as:
* - nr_pd: the number of performance domains
* - nr_cpus: the number of CPUs
- * - nr_cs: the sum of the number of capacity states of all performance
+ * - nr_ps: the sum of the number of performance states of all performance
* domains (for example, on a system with 2 performance domains,
- * with 10 capacity states each, nr_cs = 2 * 10 = 20).
+ * with 10 performance states each, nr_ps = 2 * 10 = 20).
*
* It is generally not a good idea to use such a model in the wake-up path on
* very complex platforms because of the associated scheduling overheads. The
* arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
- * with per-CPU DVFS and less than 8 capacity states each, for example.
+ * with per-CPU DVFS and less than 8 performance states each, for example.
*/
#define EM_MAX_COMPLEXITY 2048
extern struct cpufreq_governor schedutil_gov;
static bool build_perf_domains(const struct cpumask *cpu_map)
{
- int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
+ int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
struct perf_domain *pd = NULL, *tmp;
int cpu = cpumask_first(cpu_map);
struct root_domain *rd = cpu_rq(cpu)->rd;
@@ -360,6 +365,13 @@
goto free;
}
+ /* EAS definitely does *not* handle SMT */
+ if (sched_smt_active()) {
+ pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
+ cpumask_pr_args(cpu_map));
+ goto free;
+ }
+
for_each_cpu(i, cpu_map) {
/* Skip already covered CPUs. */
if (find_pd(pd, i))
@@ -386,15 +398,15 @@
pd = tmp;
/*
- * Count performance domains and capacity states for the
+ * Count performance domains and performance states for the
* complexity check.
*/
nr_pd++;
- nr_cs += em_pd_nr_cap_states(pd->em_pd);
+ nr_ps += em_pd_nr_perf_states(pd->em_pd);
}
/* Bail out if the Energy Model complexity is too high. */
- if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
+ if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
cpumask_pr_args(cpu_map));
goto free;
@@ -1201,16 +1213,13 @@
if (!attr || attr->relax_domain_level < 0) {
if (default_relax_domain_level < 0)
return;
- else
- request = default_relax_domain_level;
+ request = default_relax_domain_level;
} else
request = attr->relax_domain_level;
- if (request < sd->level) {
+
+ if (sd->level > request) {
/* Turn off idle balance on this domain: */
sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
- } else {
- /* Turn on idle balance on this domain: */
- sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
}
}
@@ -1224,13 +1233,13 @@
case sa_rootdomain:
if (!atomic_read(&d->rd->refcount))
free_rootdomain(&d->rd->rcu);
- /* Fall through */
+ fallthrough;
case sa_sd:
free_percpu(d->sd);
- /* Fall through */
+ fallthrough;
case sa_sd_storage:
__sdt_free(cpu_map);
- /* Fall through */
+ fallthrough;
case sa_none:
break;
}
@@ -1297,7 +1306,6 @@
* SD_SHARE_CPUCAPACITY - describes SMT topologies
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN - describes shared power domain
*
* Odd one out, which beside describing the topology has a quirk also
* prescribes the desired behaviour that goes along with it:
@@ -1308,8 +1316,7 @@
(SD_SHARE_CPUCAPACITY | \
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
- SD_ASYM_PACKING | \
- SD_SHARE_POWERDOMAIN)
+ SD_ASYM_PACKING)
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
@@ -1341,13 +1348,12 @@
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
- .busy_factor = 32,
- .imbalance_pct = 125,
+ .busy_factor = 16,
+ .imbalance_pct = 117,
.cache_nice_tries = 0,
- .flags = 1*SD_LOAD_BALANCE
- | 1*SD_BALANCE_NEWIDLE
+ .flags = 1*SD_BALANCE_NEWIDLE
| 1*SD_BALANCE_EXEC
| 1*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE
@@ -1377,18 +1383,9 @@
* Convert topological properties into behaviour.
*/
- if (sd->flags & SD_ASYM_CPUCAPACITY) {
- struct sched_domain *t = sd;
-
- /*
- * Don't attempt to spread across CPUs of different capacities.
- */
- if (sd->child)
- sd->child->flags &= ~SD_PREFER_SIBLING;
-
- for_each_lower_domain(t)
- t->flags |= SD_BALANCE_WAKE;
- }
+ /* Don't attempt to spread across CPUs of different capacities. */
+ if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
+ sd->child->flags &= ~SD_PREFER_SIBLING;
if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->imbalance_pct = 110;
@@ -1552,66 +1549,58 @@
}
}
+
+#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
+
void sched_init_numa(void)
{
- int next_distance, curr_distance = node_distance(0, 0);
struct sched_domain_topology_level *tl;
- int level = 0;
- int i, j, k;
-
- sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
- if (!sched_domains_numa_distance)
- return;
-
- /* Includes NUMA identity node at level 0. */
- sched_domains_numa_distance[level++] = curr_distance;
- sched_domains_numa_levels = level;
+ unsigned long *distance_map;
+ int nr_levels = 0;
+ int i, j;
/*
* O(nr_nodes^2) deduplicating selection sort -- in order to find the
* unique distances in the node_distance() table.
- *
- * Assumes node_distance(0,j) includes all distances in
- * node_distance(i,j) in order to avoid cubic time.
*/
- next_distance = curr_distance;
+ distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
+ if (!distance_map)
+ return;
+
+ bitmap_zero(distance_map, NR_DISTANCE_VALUES);
for (i = 0; i < nr_node_ids; i++) {
for (j = 0; j < nr_node_ids; j++) {
- for (k = 0; k < nr_node_ids; k++) {
- int distance = node_distance(i, k);
+ int distance = node_distance(i, j);
- if (distance > curr_distance &&
- (distance < next_distance ||
- next_distance == curr_distance))
- next_distance = distance;
-
- /*
- * While not a strong assumption it would be nice to know
- * about cases where if node A is connected to B, B is not
- * equally connected to A.
- */
- if (sched_debug() && node_distance(k, i) != distance)
- sched_numa_warn("Node-distance not symmetric");
-
- if (sched_debug() && i && !find_numa_distance(distance))
- sched_numa_warn("Node-0 not representative");
+ if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
+ sched_numa_warn("Invalid distance value range");
+ return;
}
- if (next_distance != curr_distance) {
- sched_domains_numa_distance[level++] = next_distance;
- sched_domains_numa_levels = level;
- curr_distance = next_distance;
- } else break;
- }
- /*
- * In case of sched_debug() we verify the above assumption.
- */
- if (!sched_debug())
- break;
+ bitmap_set(distance_map, distance, 1);
+ }
+ }
+ /*
+ * We can now figure out how many unique distance values there are and
+ * allocate memory accordingly.
+ */
+ nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
+
+ sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+ if (!sched_domains_numa_distance) {
+ bitmap_free(distance_map);
+ return;
}
+ for (i = 0, j = 0; i < nr_levels; i++, j++) {
+ j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
+ sched_domains_numa_distance[i] = j;
+ }
+
+ bitmap_free(distance_map);
+
/*
- * 'level' contains the number of unique distances
+ * 'nr_levels' contains the number of unique distances
*
* The sched_domains_numa_distance[] array includes the actual distance
* numbers.
@@ -1620,15 +1609,15 @@
/*
* Here, we should temporarily reset sched_domains_numa_levels to 0.
* If it fails to allocate memory for array sched_domains_numa_masks[][],
- * the array will contain less then 'level' members. This could be
+ * the array will contain less then 'nr_levels' members. This could be
* dangerous when we use it to iterate array sched_domains_numa_masks[][]
* in other functions.
*
- * We reset it to 'level' at the end of this function.
+ * We reset it to 'nr_levels' at the end of this function.
*/
sched_domains_numa_levels = 0;
- sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+ sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
if (!sched_domains_numa_masks)
return;
@@ -1636,7 +1625,7 @@
* Now for each level, construct a mask per node which contains all
* CPUs of nodes that are that many hops away from us.
*/
- for (i = 0; i < level; i++) {
+ for (i = 0; i < nr_levels; i++) {
sched_domains_numa_masks[i] =
kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
if (!sched_domains_numa_masks[i])
@@ -1644,12 +1633,17 @@
for (j = 0; j < nr_node_ids; j++) {
struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ int k;
+
if (!mask)
return;
sched_domains_numa_masks[i][j] = mask;
for_each_node(k) {
+ if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+ sched_numa_warn("Node-distance not symmetric");
+
if (node_distance(j, k) > sched_domains_numa_distance[i])
continue;
@@ -1661,7 +1655,7 @@
/* Compute default topology size */
for (i = 0; sched_domain_topology[i].mask; i++);
- tl = kzalloc((i + level + 1) *
+ tl = kzalloc((i + nr_levels + 1) *
sizeof(struct sched_domain_topology_level), GFP_KERNEL);
if (!tl)
return;
@@ -1684,7 +1678,7 @@
/*
* .. and append 'j' levels of NUMA goodness.
*/
- for (j = 1; j < level; i++, j++) {
+ for (j = 1; j < nr_levels; i++, j++) {
tl[i] = (struct sched_domain_topology_level){
.mask = sd_numa_mask,
.sd_flags = cpu_numa_flags,
@@ -1696,8 +1690,8 @@
sched_domain_topology = tl;
- sched_domains_numa_levels = level;
- sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+ sched_domains_numa_levels = nr_levels;
+ sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
init_numa_topology_type();
}
@@ -2004,11 +1998,10 @@
/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) {
struct sched_domain_topology_level *tl;
+ int dflags = 0;
sd = NULL;
for_each_sd_topology(tl) {
- int dflags = 0;
-
if (tl == tl_asym) {
dflags |= SD_ASYM_CPUCAPACITY;
has_asym = true;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 84bd051..a55642a 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -169,7 +169,6 @@
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @wq_head: the waitqueue
* @mode: which threads
- * @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: opaque value to be passed to wakeup targets
*
* The sync wakeup differs that the waker knows that it will schedule
@@ -183,29 +182,54 @@
* accessing the task state.
*/
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
- int nr_exclusive, void *key)
+ void *key)
{
- int wake_flags = 1; /* XXX WF_SYNC */
-
if (unlikely(!wq_head))
return;
- if (unlikely(nr_exclusive != 1))
- wake_flags = 0;
-
- __wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key);
+ __wake_up_common_lock(wq_head, mode, 1, WF_SYNC, key);
}
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+/**
+ * __wake_up_locked_sync_key - wake up a thread blocked on a locked waitqueue.
+ * @wq_head: the waitqueue
+ * @mode: which threads
+ * @key: opaque value to be passed to wakeup targets
+ *
+ * The sync wakeup differs in that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronized'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ *
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
+ */
+void __wake_up_locked_sync_key(struct wait_queue_head *wq_head,
+ unsigned int mode, void *key)
+{
+ __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key);
+
/*
* __wake_up_sync - see __wake_up_sync_key()
*/
-void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive)
+void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode)
{
- __wake_up_sync_key(wq_head, mode, nr_exclusive, NULL);
+ __wake_up_sync_key(wq_head, mode, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+void __wake_up_pollfree(struct wait_queue_head *wq_head)
+{
+ __wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP | POLLFREE));
+ /* POLLFREE must have cleared the queue. */
+ WARN_ON_ONCE(waitqueue_active(wq_head));
+}
+
/*
* Note: we use "set_current_state()" _after_ the wait-queue add,
* because we need a memory barrier there on SMP, so that any
@@ -377,7 +401,7 @@
int ret = default_wake_function(wq_entry, mode, sync, key);
if (ret)
- list_del_init(&wq_entry->entry);
+ list_del_init_careful(&wq_entry->entry);
return ret;
}
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 45eba18..02ce292 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -179,6 +179,7 @@
.bit_nr = -1,
},
.wq_entry = {
+ .flags = flags,
.private = current,
.func = var_wake_function,
.entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry),