Update Linux to v5.4.148
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.4.148.tar.gz
Change-Id: Ib3d26c5ba9b022e2e03533005c4fed4d7c30b61b
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44123b4..5dc43d3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -36,7 +36,7 @@
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
+#ifdef CONFIG_SCHED_DEBUG
/*
* Debugging: various feature bits
*
@@ -254,8 +254,9 @@
static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
+ ktime_t time = rq->hrtick_time;
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
}
/*
@@ -280,7 +281,6 @@
void hrtick_start(struct rq *rq, u64 delay)
{
struct hrtimer *timer = &rq->hrtick_timer;
- ktime_t time;
s64 delta;
/*
@@ -288,9 +288,7 @@
* doesn't make sense and can cause timer DoS.
*/
delta = max_t(s64, delay, 10000LL);
- time = ktime_add_ns(timer->base->get_time(), delta);
-
- hrtimer_set_expires(timer, time);
+ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
if (rq == this_rq()) {
__hrtick_restart(rq);
@@ -794,6 +792,26 @@
/* All clamps are required to be less or equal than these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];
+/*
+ * This static key is used to reduce the uclamp overhead in the fast path. It
+ * primarily disables the call to uclamp_rq_{inc, dec}() in
+ * enqueue/dequeue_task().
+ *
+ * This allows users to continue to enable uclamp in their kernel config with
+ * minimum uclamp overhead in the fast path.
+ *
+ * As soon as userspace modifies any of the uclamp knobs, the static key is
+ * enabled, since we have an actual users that make use of uclamp
+ * functionality.
+ *
+ * The knobs that would enable this static key are:
+ *
+ * * A task modifying its uclamp value with sched_setattr().
+ * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
+ * * An admin modifying the cgroup cpu.uclamp.{min, max}
+ */
+DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
+
/* Integer rounded range for each bucket */
#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
@@ -802,7 +820,7 @@
static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
- return clamp_value / UCLAMP_BUCKET_DELTA;
+ return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
}
static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
@@ -810,7 +828,7 @@
return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
}
-static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
+static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
return 0;
@@ -853,7 +871,7 @@
}
static inline
-enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
@@ -876,9 +894,10 @@
static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
+ /* Copy by value as we could modify it */
struct uclamp_se uc_req = p->uclamp_req[clamp_id];
#ifdef CONFIG_UCLAMP_TASK_GROUP
- struct uclamp_se uc_max;
+ unsigned int tg_min, tg_max, value;
/*
* Tasks in autogroups or root task group will be
@@ -889,9 +908,11 @@
if (task_group(p) == &root_task_group)
return uc_req;
- uc_max = task_group(p)->uclamp[clamp_id];
- if (uc_req.value > uc_max.value || !uc_req.user_defined)
- return uc_max;
+ tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
+ tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
+ value = uc_req.value;
+ value = clamp(value, tg_min, tg_max);
+ uclamp_se_set(&uc_req, value, false);
#endif
return uc_req;
@@ -918,7 +939,7 @@
return uc_req;
}
-enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
+unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_eff;
@@ -990,10 +1011,38 @@
lockdep_assert_held(&rq->lock);
+ /*
+ * If sched_uclamp_used was enabled after task @p was enqueued,
+ * we could end up with unbalanced call to uclamp_rq_dec_id().
+ *
+ * In this case the uc_se->active flag should be false since no uclamp
+ * accounting was performed at enqueue time and we can just return
+ * here.
+ *
+ * Need to be careful of the following enqeueue/dequeue ordering
+ * problem too
+ *
+ * enqueue(taskA)
+ * // sched_uclamp_used gets enabled
+ * enqueue(taskB)
+ * dequeue(taskA)
+ * // Must not decrement bukcet->tasks here
+ * dequeue(taskB)
+ *
+ * where we could end up with stale data in uc_se and
+ * bucket[uc_se->bucket_id].
+ *
+ * The following check here eliminates the possibility of such race.
+ */
+ if (unlikely(!uc_se->active))
+ return;
+
bucket = &uc_rq->bucket[uc_se->bucket_id];
+
SCHED_WARN_ON(!bucket->tasks);
if (likely(bucket->tasks))
bucket->tasks--;
+
uc_se->active = false;
/*
@@ -1021,6 +1070,15 @@
{
enum uclamp_id clamp_id;
+ /*
+ * Avoid any overhead until uclamp is actually used by the userspace.
+ *
+ * The condition is constructed such that a NOP is generated when
+ * sched_uclamp_used is disabled.
+ */
+ if (!static_branch_unlikely(&sched_uclamp_used))
+ return;
+
if (unlikely(!p->sched_class->uclamp_enabled))
return;
@@ -1036,6 +1094,15 @@
{
enum uclamp_id clamp_id;
+ /*
+ * Avoid any overhead until uclamp is actually used by the userspace.
+ *
+ * The condition is constructed such that a NOP is generated when
+ * sched_uclamp_used is disabled.
+ */
+ if (!static_branch_unlikely(&sched_uclamp_used))
+ return;
+
if (unlikely(!p->sched_class->uclamp_enabled))
return;
@@ -1043,9 +1110,27 @@
uclamp_rq_dec_id(rq, p, clamp_id);
}
-static inline void
-uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
+ enum uclamp_id clamp_id)
{
+ if (!p->uclamp[clamp_id].active)
+ return;
+
+ uclamp_rq_dec_id(rq, p, clamp_id);
+ uclamp_rq_inc_id(rq, p, clamp_id);
+
+ /*
+ * Make sure to clear the idle flag if we've transiently reached 0
+ * active tasks on rq.
+ */
+ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
+}
+
+static inline void
+uclamp_update_active(struct task_struct *p)
+{
+ enum uclamp_id clamp_id;
struct rq_flags rf;
struct rq *rq;
@@ -1065,30 +1150,22 @@
* affecting a valid clamp bucket, the next time it's enqueued,
* it will already see the updated clamp bucket value.
*/
- if (p->uclamp[clamp_id].active) {
- uclamp_rq_dec_id(rq, p, clamp_id);
- uclamp_rq_inc_id(rq, p, clamp_id);
- }
+ for_each_clamp_id(clamp_id)
+ uclamp_rq_reinc_id(rq, p, clamp_id);
task_rq_unlock(rq, p, &rf);
}
#ifdef CONFIG_UCLAMP_TASK_GROUP
static inline void
-uclamp_update_active_tasks(struct cgroup_subsys_state *css,
- unsigned int clamps)
+uclamp_update_active_tasks(struct cgroup_subsys_state *css)
{
- enum uclamp_id clamp_id;
struct css_task_iter it;
struct task_struct *p;
css_task_iter_start(css, 0, &it);
- while ((p = css_task_iter_next(&it))) {
- for_each_clamp_id(clamp_id) {
- if ((0x1 << clamp_id) & clamps)
- uclamp_update_active(p, clamp_id);
- }
- }
+ while ((p = css_task_iter_next(&it)))
+ uclamp_update_active(p);
css_task_iter_end(&it);
}
@@ -1145,8 +1222,10 @@
update_root_tg = true;
}
- if (update_root_tg)
+ if (update_root_tg) {
+ static_branch_enable(&sched_uclamp_used);
uclamp_update_root_tg();
+ }
/*
* We update all RUNNABLE tasks only when task groups are in use.
@@ -1181,6 +1260,15 @@
if (upper_bound > SCHED_CAPACITY_SCALE)
return -EINVAL;
+ /*
+ * We have valid uclamp attributes; make sure uclamp is enabled.
+ *
+ * We need to do that here, because enabling static branches is a
+ * blocking operation which obviously cannot be done while holding
+ * scheduler locks.
+ */
+ static_branch_enable(&sched_uclamp_used);
+
return 0;
}
@@ -1233,16 +1321,25 @@
return;
for_each_clamp_id(clamp_id) {
- unsigned int clamp_value = uclamp_none(clamp_id);
-
- /* By default, RT tasks always get 100% boost */
- if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
- clamp_value = uclamp_none(UCLAMP_MAX);
-
- uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false);
+ uclamp_se_set(&p->uclamp_req[clamp_id],
+ uclamp_none(clamp_id), false);
}
}
+static void __init init_uclamp_rq(struct rq *rq)
+{
+ enum uclamp_id clamp_id;
+ struct uclamp_rq *uc_rq = rq->uclamp;
+
+ for_each_clamp_id(clamp_id) {
+ uc_rq[clamp_id] = (struct uclamp_rq) {
+ .value = uclamp_none(clamp_id)
+ };
+ }
+
+ rq->uclamp_flags = 0;
+}
+
static void __init init_uclamp(void)
{
struct uclamp_se uc_max = {};
@@ -1251,10 +1348,8 @@
mutex_init(&uclamp_mutex);
- for_each_possible_cpu(cpu) {
- memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
- cpu_rq(cpu)->uclamp_flags = 0;
- }
+ for_each_possible_cpu(cpu)
+ init_uclamp_rq(cpu_rq(cpu));
for_each_clamp_id(clamp_id) {
uclamp_se_set(&init_task.uclamp_req[clamp_id],
@@ -1653,7 +1748,7 @@
goto out;
}
- if (cpumask_equal(p->cpus_ptr, new_mask))
+ if (cpumask_equal(&p->cpus_mask, new_mask))
goto out;
dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
@@ -2893,6 +2988,7 @@
* Silence PROVE_RCU.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ rseq_migrate(p);
/*
* We're setting the CPU for the first time, we don't migrate,
* so use __set_task_cpu().
@@ -2957,6 +3053,7 @@
* as we're not fully set-up yet.
*/
p->recent_used_cpu = task_cpu(p);
+ rseq_migrate(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
@@ -3667,28 +3764,31 @@
* statistics and checks timeslices in a time-independent way, regardless
* of when exactly it is running.
*/
- if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
+ if (!tick_nohz_tick_stopped_cpu(cpu))
goto out_requeue;
rq_lock_irq(rq, &rf);
curr = rq->curr;
- if (is_idle_task(curr) || cpu_is_offline(cpu))
+ if (cpu_is_offline(cpu))
goto out_unlock;
update_rq_clock(rq);
- delta = rq_clock_task(rq) - curr->se.exec_start;
- /*
- * Make sure the next tick runs within a reasonable
- * amount of time.
- */
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ if (!is_idle_task(curr)) {
+ /*
+ * Make sure the next tick runs within a reasonable
+ * amount of time.
+ */
+ delta = rq_clock_task(rq) - curr->se.exec_start;
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ }
curr->sched_class->task_tick(rq, curr, 0);
+ calc_load_nohz_remote(rq);
out_unlock:
rq_unlock_irq(rq, &rf);
-
out_requeue:
+
/*
* Run the remote tick once per second (1Hz). This arbitrary
* frequency is large enough to avoid overload but short enough
@@ -4110,7 +4210,8 @@
* it wants to wake up a task to maintain concurrency.
* As this function is called inside the schedule() context,
* we disable preemption to avoid it calling schedule() again
- * in the possible wakeup of a kworker.
+ * in the possible wakeup of a kworker and because wq_worker_sleeping()
+ * requires it.
*/
if (tsk->flags & PF_WQ_WORKER) {
preempt_disable();
@@ -4448,7 +4549,8 @@
*/
if (dl_prio(prio)) {
if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ (pi_task && dl_prio(pi_task->prio) &&
+ dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
queue_flag |= ENQUEUE_REPLENISH;
} else
@@ -5588,12 +5690,8 @@
schedstat_inc(rq->yld_count);
current->sched_class->yield_task(rq);
- /*
- * Since we are going to call schedule() anyway, there's
- * no need to preempt or enable interrupts:
- */
preempt_disable();
- rq_unlock(rq, &rf);
+ rq_unlock_irq(rq, &rf);
sched_preempt_enable_no_resched();
schedule();
@@ -6178,13 +6276,14 @@
struct mm_struct *mm = current->active_mm;
BUG_ON(cpu_online(smp_processor_id()));
+ BUG_ON(current != this_rq()->idle);
if (mm != &init_mm) {
switch_mm(mm, &init_mm, current);
- current->active_mm = &init_mm;
finish_arch_post_lock_switch();
}
- mmdrop(mm);
+
+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
}
/*
@@ -7053,8 +7152,15 @@
if (queued)
enqueue_task(rq, tsk, queue_flags);
- if (running)
+ if (running) {
set_next_task(rq, tsk);
+ /*
+ * After changing group, the running task may have joined a
+ * throttled one but it's still the running task. Trigger a
+ * resched to make sure that task can still run.
+ */
+ resched_curr(rq);
+ }
task_rq_unlock(rq, tsk, &rf);
}
@@ -7090,6 +7196,16 @@
if (parent)
sched_online_group(tg, parent);
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ /* Propagate the effective uclamp value for the new group */
+ mutex_lock(&uclamp_mutex);
+ rcu_read_lock();
+ cpu_util_update_eff(css);
+ rcu_read_unlock();
+ mutex_unlock(&uclamp_mutex);
+#endif
+
return 0;
}
@@ -7177,6 +7293,9 @@
enum uclamp_id clamp_id;
unsigned int clamps;
+ lockdep_assert_held(&uclamp_mutex);
+ SCHED_WARN_ON(!rcu_read_lock_held());
+
css_for_each_descendant_pre(css, top_css) {
uc_parent = css_tg(css)->parent
? css_tg(css)->parent->uclamp : NULL;
@@ -7209,7 +7328,7 @@
}
/* Immediately update descendants RUNNABLE tasks */
- uclamp_update_active_tasks(css, clamps);
+ uclamp_update_active_tasks(css);
}
}
@@ -7244,7 +7363,7 @@
&req.percent);
if (req.ret)
return req;
- if (req.percent > UCLAMP_PERCENT_SCALE) {
+ if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
req.ret = -ERANGE;
return req;
}
@@ -7267,6 +7386,8 @@
if (req.ret)
return req.ret;
+ static_branch_enable(&sched_uclamp_used);
+
mutex_lock(&uclamp_mutex);
rcu_read_lock();
@@ -7361,6 +7482,8 @@
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
@@ -7389,6 +7512,12 @@
return -EINVAL;
/*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
+ return -EINVAL;
+
+ /*
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index b5dcd1d..7c2fe50 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -5,6 +5,8 @@
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
+#include <linux/cpufreq.h>
+
#include "sched.h"
DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
@@ -57,3 +59,19 @@
rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
}
EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
+
+/**
+ * cpufreq_this_cpu_can_update - Check if cpufreq policy can be updated.
+ * @policy: cpufreq policy to check.
+ *
+ * Return 'true' if:
+ * - the local and remote CPUs share @policy,
+ * - dvfs_possible_from_any_cpu is set in @policy and the local CPU is not going
+ * offline (in which case it is not expected to run cpufreq updates any more).
+ */
+bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy)
+{
+ return cpumask_test_cpu(smp_processor_id(), policy->cpus) ||
+ (policy->dvfs_possible_from_any_cpu &&
+ rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)));
+}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 86800b4..4cb80e6 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -82,12 +82,10 @@
* by the hardware, as calculating the frequency is pointless if
* we cannot in fact act on it.
*
- * For the slow switching platforms, the kthread is always scheduled on
- * the right set of CPUs and any CPU can find the next frequency and
- * schedule the kthread.
+ * This is needed on the slow switching platforms too to prevent CPUs
+ * going offline from leaving stale IRQ work items behind.
*/
- if (sg_policy->policy->fast_switch_enabled &&
- !cpufreq_this_cpu_can_update(sg_policy->policy))
+ if (!cpufreq_this_cpu_can_update(sg_policy->policy))
return false;
if (unlikely(sg_policy->limits_changed)) {
@@ -212,7 +210,7 @@
unsigned long dl_util, util, irq;
struct rq *rq = cpu_rq(cpu);
- if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
+ if (!uclamp_is_used() &&
type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
return max;
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a8a0803..2bda9fd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1654,6 +1654,7 @@
*/
raw_spin_lock(&rq->lock);
if (p->dl.dl_non_contending) {
+ update_rq_clock(rq);
sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0;
/*
@@ -1743,13 +1744,16 @@
}
#endif
-static void set_next_task_dl(struct rq *rq, struct task_struct *p)
+static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
{
p->se.exec_start = rq_clock_task(rq);
/* You can't push away the running task */
dequeue_pushable_dl_task(rq, p);
+ if (!first)
+ return;
+
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);
@@ -1785,7 +1789,7 @@
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
p = dl_task_of(dl_se);
- set_next_task_dl(rq, p);
+ set_next_task_dl(rq, p, true);
return p;
}
@@ -2389,6 +2393,8 @@
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
+ } else {
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
}
}
@@ -2466,7 +2472,7 @@
u64 period = global_rt_period();
u64 new_bw = to_ratio(period, runtime);
struct dl_bw *dl_b;
- int cpu, ret = 0;
+ int cpu, cpus, ret = 0;
unsigned long flags;
/*
@@ -2481,9 +2487,10 @@
for_each_possible_cpu(cpu) {
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
+ cpus = dl_bw_cpus(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- if (new_bw < dl_b->total_bw)
+ if (new_bw * cpus < dl_b->total_bw)
ret = -EBUSY;
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
@@ -2616,7 +2623,7 @@
dl_se->dl_runtime = attr->sched_runtime;
dl_se->dl_deadline = attr->sched_deadline;
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
- dl_se->flags = attr->sched_flags;
+ dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
}
@@ -2629,7 +2636,8 @@
attr->sched_runtime = dl_se->dl_runtime;
attr->sched_deadline = dl_se->dl_deadline;
attr->sched_period = dl_se->dl_period;
- attr->sched_flags = dl_se->flags;
+ attr->sched_flags &= ~SCHED_DL_FLAGS;
+ attr->sched_flags |= dl_se->flags;
}
/*
@@ -2690,6 +2698,7 @@
dl_se->dl_bw = 0;
dl_se->dl_density = 0;
+ dl_se->dl_boosted = 0;
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0;
@@ -2703,7 +2712,7 @@
if (dl_se->dl_runtime != attr->sched_runtime ||
dl_se->dl_deadline != attr->sched_deadline ||
dl_se->dl_period != attr->sched_period ||
- dl_se->flags != attr->sched_flags)
+ dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS))
return true;
return false;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f7e4579..faada71 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -8,8 +8,6 @@
*/
#include "sched.h"
-static DEFINE_SPINLOCK(sched_debug_lock);
-
/*
* This allows printing both to /proc/sched_debug and
* to the console
@@ -258,7 +256,7 @@
set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax);
set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
/* &table[8] is terminator */
@@ -417,16 +415,37 @@
#endif
#ifdef CONFIG_CGROUP_SCHED
+static DEFINE_SPINLOCK(sched_debug_lock);
static char group_path[PATH_MAX];
-static char *task_group_path(struct task_group *tg)
+static void task_group_path(struct task_group *tg, char *path, int plen)
{
- if (autogroup_path(tg, group_path, PATH_MAX))
- return group_path;
+ if (autogroup_path(tg, path, plen))
+ return;
- cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+ cgroup_path(tg->css.cgroup, path, plen);
+}
- return group_path;
+/*
+ * Only 1 SEQ_printf_task_group_path() caller can use the full length
+ * group_path[] for cgroup path. Other simultaneous callers will have
+ * to use a shorter stack buffer. A "..." suffix is appended at the end
+ * of the stack buffer so that it will show up in case the output length
+ * matches the given buffer size to indicate possible path name truncation.
+ */
+#define SEQ_printf_task_group_path(m, tg, fmt...) \
+{ \
+ if (spin_trylock(&sched_debug_lock)) { \
+ task_group_path(tg, group_path, sizeof(group_path)); \
+ SEQ_printf(m, fmt, group_path); \
+ spin_unlock(&sched_debug_lock); \
+ } else { \
+ char buf[128]; \
+ char *bufend = buf + sizeof(buf) - 3; \
+ task_group_path(tg, buf, bufend - buf); \
+ strcpy(bufend - 1, "..."); \
+ SEQ_printf(m, fmt, buf); \
+ } \
}
#endif
@@ -453,7 +472,7 @@
SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
#endif
#ifdef CONFIG_CGROUP_SCHED
- SEQ_printf(m, " %s", task_group_path(task_group(p)));
+ SEQ_printf_task_group_path(m, task_group(p), " %s")
#endif
SEQ_printf(m, "\n");
@@ -490,7 +509,7 @@
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, "\n");
- SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
+ SEQ_printf_task_group_path(m, cfs_rq->tg, "cfs_rq[%d]:%s\n", cpu);
#else
SEQ_printf(m, "\n");
SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
@@ -562,7 +581,7 @@
{
#ifdef CONFIG_RT_GROUP_SCHED
SEQ_printf(m, "\n");
- SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
+ SEQ_printf_task_group_path(m, rt_rq->tg, "rt_rq[%d]:%s\n", cpu);
#else
SEQ_printf(m, "\n");
SEQ_printf(m, "rt_rq[%d]:\n", cpu);
@@ -614,7 +633,6 @@
static void print_cpu(struct seq_file *m, int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
#ifdef CONFIG_X86
{
@@ -666,13 +684,11 @@
}
#undef P
- spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
print_dl_stats(m, cpu);
print_rq(m, rq, cpu);
- spin_unlock_irqrestore(&sched_debug_lock, flags);
SEQ_printf(m, "\n");
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 69a81a5..87d9fad 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2678,7 +2678,7 @@
/*
* We don't care about NUMA placement if we don't have memory.
*/
- if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+ if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
return;
/*
@@ -2927,7 +2927,7 @@
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- (1)
- * \Sum grq->load.weight
+ * \Sum grq->load.weight
*
* Now, because computing that sum is prohibitively expensive to compute (been
* there, done that) we approximate it with this average stuff. The average
@@ -2941,7 +2941,7 @@
*
* tg->weight * grq->avg.load_avg
* ge->load.weight = ------------------------------ (3)
- * tg->load_avg
+ * tg->load_avg
*
* Where: tg->load_avg ~= \Sum grq->avg.load_avg
*
@@ -2957,7 +2957,7 @@
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- = tg->weight (4)
- * grp->load.weight
+ * grp->load.weight
*
* That is, the sum collapses because all other CPUs are idle; the UP scenario.
*
@@ -2976,7 +2976,7 @@
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- (6)
- * tg_load_avg'
+ * tg_load_avg'
*
* Where:
*
@@ -3504,9 +3504,6 @@
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- if (decayed)
- cfs_rq_util_change(cfs_rq, 0);
-
return decayed;
}
@@ -3616,8 +3613,12 @@
attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
update_tg_load_avg(cfs_rq, 0);
- } else if (decayed && (flags & UPDATE_TG))
- update_tg_load_avg(cfs_rq, 0);
+ } else if (decayed) {
+ cfs_rq_util_change(cfs_rq, 0);
+
+ if (flags & UPDATE_TG)
+ update_tg_load_avg(cfs_rq, 0);
+ }
}
#ifndef CONFIG_64BIT
@@ -3813,7 +3814,7 @@
if (!static_branch_unlikely(&sched_asym_cpucapacity))
return;
- if (!p) {
+ if (!p || p->nr_cpus_allowed == 1) {
rq->misfit_task_load = 0;
return;
}
@@ -3823,7 +3824,11 @@
return;
}
- rq->misfit_task_load = task_h_load(p);
+ /*
+ * Make sure that misfit_task_load will not be null even if
+ * task_h_load() returns 0.
+ */
+ rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
}
#else /* CONFIG_SMP */
@@ -3926,6 +3931,7 @@
#endif
}
+static inline bool cfs_bandwidth_used(void);
/*
* MIGRATION
@@ -4004,10 +4010,16 @@
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
- if (cfs_rq->nr_running == 1) {
+ /*
+ * When bandwidth control is enabled, cfs might have been removed
+ * because of a parent been throttled but cfs->nr_running > 1. Try to
+ * add it unconditionnally.
+ */
+ if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
list_add_leaf_cfs_rq(cfs_rq);
+
+ if (cfs_rq->nr_running == 1)
check_enqueue_throttle(cfs_rq);
- }
}
static void __clear_buddies_last(struct sched_entity *se)
@@ -4371,16 +4383,16 @@
}
/* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
+ struct cfs_rq *cfs_rq, u64 target_runtime)
{
- struct task_group *tg = cfs_rq->tg;
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- u64 amount = 0, min_amount;
+ u64 min_amount, amount = 0;
+
+ lockdep_assert_held(&cfs_b->lock);
/* note: this is a positive sum as runtime_remaining <= 0 */
- min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+ min_amount = target_runtime - cfs_rq->runtime_remaining;
- raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota == RUNTIME_INF)
amount = min_amount;
else {
@@ -4392,13 +4404,25 @@
cfs_b->idle = 0;
}
}
- raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += amount;
return cfs_rq->runtime_remaining > 0;
}
+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ int ret;
+
+ raw_spin_lock(&cfs_b->lock);
+ ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
+ raw_spin_unlock(&cfs_b->lock);
+
+ return ret;
+}
+
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
@@ -4487,13 +4511,33 @@
return 0;
}
-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, idle_task_delta, dequeue = 1;
- bool empty;
+
+ raw_spin_lock(&cfs_b->lock);
+ /* This will start the period timer if necessary */
+ if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
+ /*
+ * We have raced with bandwidth becoming available, and if we
+ * actually throttled the timer might not unthrottle us for an
+ * entire period. We additionally needed to make sure that any
+ * subsequent check_cfs_rq_runtime calls agree not to throttle
+ * us, as we may commit to do cfs put_prev+pick_next, so we ask
+ * for 1ns of runtime rather than just check cfs_b.
+ */
+ dequeue = 0;
+ } else {
+ list_add_tail_rcu(&cfs_rq->throttled_list,
+ &cfs_b->throttled_cfs_rq);
+ }
+ raw_spin_unlock(&cfs_b->lock);
+
+ if (!dequeue)
+ return false; /* Throttle no longer required. */
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -4522,29 +4566,13 @@
if (!se)
sub_nr_running(rq, task_delta);
+ /*
+ * Note: distribution will already see us throttled via the
+ * throttled-list. rq->lock protects completion.
+ */
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
- raw_spin_lock(&cfs_b->lock);
- empty = list_empty(&cfs_b->throttled_cfs_rq);
-
- /*
- * Add to the _head_ of the list, so that an already-started
- * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
- * not running add to the tail so that later runqueues don't get starved.
- */
- if (cfs_b->distribute_running)
- list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
- else
- list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-
- /*
- * If we're the first throttled task, make sure the bandwidth
- * timer is running.
- */
- if (empty)
- start_cfs_bandwidth(cfs_b);
-
- raw_spin_unlock(&cfs_b->lock);
+ return true;
}
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -4552,7 +4580,6 @@
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- int enqueue = 1;
long task_delta, idle_task_delta;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -4576,23 +4603,55 @@
idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
if (se->on_rq)
- enqueue = 0;
-
+ break;
cfs_rq = cfs_rq_of(se);
- if (enqueue)
- enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+ enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+
cfs_rq->h_nr_running += task_delta;
cfs_rq->idle_h_nr_running += idle_task_delta;
+ /* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
+ goto unthrottle_throttle;
+ }
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ cfs_rq->h_nr_running += task_delta;
+ cfs_rq->idle_h_nr_running += idle_task_delta;
+
+
+ /* end evaluation on encountering a throttled cfs_rq */
+ if (cfs_rq_throttled(cfs_rq))
+ goto unthrottle_throttle;
+
+ /*
+ * One parent has been throttled and cfs_rq removed from the
+ * list. Add it back to not break the leaf list.
+ */
+ if (throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+ }
+
+ /* At this point se is NULL and we are at root level*/
+ add_nr_running(rq, task_delta);
+
+unthrottle_throttle:
+ /*
+ * The cfs_rq_throttled() breaks in the above iteration can result in
+ * incomplete leaf list maintenance, resulting in triggering the
+ * assertion below.
+ */
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (list_add_leaf_cfs_rq(cfs_rq))
break;
}
assert_list_leaf_cfs_rq(rq);
- if (!se)
- add_nr_running(rq, task_delta);
-
/* Determine whether we need to wake up potentially idle CPU: */
if (rq->curr == rq->idle && rq->cfs.nr_running)
resched_curr(rq);
@@ -4727,7 +4786,7 @@
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
struct hrtimer *refresh_timer = &cfs_b->period_timer;
- u64 remaining;
+ s64 remaining;
/* if the call-back is running a quota refresh is already occurring */
if (hrtimer_callback_running(refresh_timer))
@@ -4735,7 +4794,7 @@
/* is a quota refresh about to occur? */
remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
- if (remaining < min_expire)
+ if (remaining < (s64)min_expire)
return 1;
return 0;
@@ -4892,8 +4951,7 @@
if (cfs_rq_throttled(cfs_rq))
return true;
- throttle_cfs_rq(cfs_rq);
- return true;
+ return throttle_cfs_rq(cfs_rq);
}
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -4923,6 +4981,8 @@
if (!overrun)
break;
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
+
if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);
@@ -4952,8 +5012,6 @@
/* reset count so we don't come right back in here */
count = 0;
}
-
- idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
cfs_b->period_active = 0;
@@ -5190,6 +5248,7 @@
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int idle_h_nr_running = task_has_idle_policy(p);
+ int task_new = !(flags & ENQUEUE_WAKEUP);
/*
* The code below (indirectly) updates schedutil which looks at
@@ -5213,32 +5272,38 @@
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, flags);
- /*
- * end evaluation on encountering a throttled cfs_rq
- *
- * note: in the case of encountering a throttled cfs_rq we will
- * post the final h_nr_running increment below.
- */
- if (cfs_rq_throttled(cfs_rq))
- break;
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ /* end evaluation on encountering a throttled cfs_rq */
+ if (cfs_rq_throttled(cfs_rq))
+ goto enqueue_throttle;
+
flags = ENQUEUE_WAKEUP;
}
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- cfs_rq->h_nr_running++;
- cfs_rq->idle_h_nr_running += idle_h_nr_running;
-
- if (cfs_rq_throttled(cfs_rq))
- break;
update_load_avg(cfs_rq, se, UPDATE_TG);
update_cfs_group(se);
+
+ cfs_rq->h_nr_running++;
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
+
+ /* end evaluation on encountering a throttled cfs_rq */
+ if (cfs_rq_throttled(cfs_rq))
+ goto enqueue_throttle;
+
+ /*
+ * One parent has been throttled and cfs_rq removed from the
+ * list. Add it back to not break the leaf list.
+ */
+ if (throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
}
+enqueue_throttle:
if (!se) {
add_nr_running(rq, 1);
/*
@@ -5255,7 +5320,7 @@
* into account, but that is not straightforward to implement,
* and the following generally works well enough in practice.
*/
- if (flags & ENQUEUE_WAKEUP)
+ if (!task_new)
update_overutilized_status(rq);
}
@@ -5298,17 +5363,13 @@
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
- /*
- * end evaluation on encountering a throttled cfs_rq
- *
- * note: in the case of encountering a throttled cfs_rq we will
- * post the final h_nr_running decrement below.
- */
- if (cfs_rq_throttled(cfs_rq))
- break;
cfs_rq->h_nr_running--;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ /* end evaluation on encountering a throttled cfs_rq */
+ if (cfs_rq_throttled(cfs_rq))
+ goto dequeue_throttle;
+
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
/* Avoid re-evaluating load for this entity: */
@@ -5326,16 +5387,20 @@
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- cfs_rq->h_nr_running--;
- cfs_rq->idle_h_nr_running -= idle_h_nr_running;
-
- if (cfs_rq_throttled(cfs_rq))
- break;
update_load_avg(cfs_rq, se, UPDATE_TG);
update_cfs_group(se);
+
+ cfs_rq->h_nr_running--;
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+
+ /* end evaluation on encountering a throttled cfs_rq */
+ if (cfs_rq_throttled(cfs_rq))
+ goto dequeue_throttle;
+
}
+dequeue_throttle:
if (!se)
sub_nr_running(rq, 1);
@@ -5892,7 +5957,7 @@
/*
* Scan the local SMT mask for idle CPUs.
*/
-static int select_idle_smt(struct task_struct *p, int target)
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
int cpu, si_cpu = -1;
@@ -5900,7 +5965,8 @@
return -1;
for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
+ !cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
if (available_idle_cpu(cpu))
return cpu;
@@ -5918,7 +5984,7 @@
return -1;
}
-static inline int select_idle_smt(struct task_struct *p, int target)
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
return -1;
}
@@ -5932,6 +5998,7 @@
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
struct sched_domain *this_sd;
u64 avg_cost, avg_idle;
u64 time, cost;
@@ -5963,11 +6030,11 @@
time = cpu_clock(this);
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+ for_each_cpu_wrap(cpu, cpus, target) {
if (!--nr)
return si_cpu;
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
- continue;
if (available_idle_cpu(cpu))
break;
if (si_cpu == -1 && sched_idle_cpu(cpu))
@@ -6027,7 +6094,7 @@
if ((unsigned)i < nr_cpumask_bits)
return i;
- i = select_idle_smt(p, target);
+ i = select_idle_smt(p, sd, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
@@ -7233,6 +7300,10 @@
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
+ /* Disregard pcpu kthreads; they are where they need to be. */
+ if (kthread_is_per_cpu(p))
+ return 0;
+
if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
@@ -7381,7 +7452,15 @@
if (!can_migrate_task(p, env))
goto next;
- load = task_h_load(p);
+ /*
+ * Depending of the number of CPUs and tasks and the
+ * cgroup hierarchy, task_h_load() can return a null
+ * value. Make sure that env->imbalance decreases
+ * otherwise detach_tasks() will stop only after
+ * detaching up to loop_max tasks.
+ */
+ load = max_t(unsigned long, task_h_load(p), 1);
+
if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
goto next;
@@ -7517,6 +7596,28 @@
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
#endif
+static bool __update_blocked_others(struct rq *rq, bool *done)
+{
+ const struct sched_class *curr_class;
+ u64 now = rq_clock_pelt(rq);
+ bool decayed;
+
+ /*
+ * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
+ * DL and IRQ signals have been updated before updating CFS.
+ */
+ curr_class = rq->curr->sched_class;
+
+ decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
+ update_irq_load_avg(rq, 0);
+
+ if (others_have_blocked(rq))
+ *done = false;
+
+ return decayed;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7536,29 +7637,11 @@
return true;
}
-static void update_blocked_averages(int cpu)
+static bool __update_blocked_fair(struct rq *rq, bool *done)
{
- struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq, *pos;
- const struct sched_class *curr_class;
- struct rq_flags rf;
- bool done = true;
-
- rq_lock_irqsave(rq, &rf);
- update_rq_clock(rq);
-
- /*
- * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure
- * that RT, DL and IRQ signals have been updated before updating CFS.
- */
- curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
- update_irq_load_avg(rq, 0);
-
- /* Don't need periodic decay once load/util_avg are null */
- if (others_have_blocked(rq))
- done = false;
+ bool decayed = false;
+ int cpu = cpu_of(rq);
/*
* Iterates the task_group tree in a bottom up fashion, see
@@ -7567,13 +7650,17 @@
for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
struct sched_entity *se;
- if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq, 0);
+ if (cfs_rq == &rq->cfs)
+ decayed = true;
+ }
+
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
- update_load_avg(cfs_rq_of(se), se, 0);
+ update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
/*
* There can be a lot of idle CPU cgroups. Don't let fully
@@ -7584,11 +7671,10 @@
/* Don't need periodic decay once load/util_avg are null */
if (cfs_rq_has_blocked(cfs_rq))
- done = false;
+ *done = false;
}
- update_blocked_load_status(rq, !done);
- rq_unlock_irqrestore(rq, &rf);
+ return decayed;
}
/*
@@ -7638,29 +7724,16 @@
cfs_rq_load_avg(cfs_rq) + 1);
}
#else
-static inline void update_blocked_averages(int cpu)
+static bool __update_blocked_fair(struct rq *rq, bool *done)
{
- struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs;
- const struct sched_class *curr_class;
- struct rq_flags rf;
+ bool decayed;
- rq_lock_irqsave(rq, &rf);
- update_rq_clock(rq);
+ decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
+ if (cfs_rq_has_blocked(cfs_rq))
+ *done = false;
- /*
- * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure
- * that RT, DL and IRQ signals have been updated before updating CFS.
- */
- curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
- update_irq_load_avg(rq, 0);
-
- update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
-
- update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
- rq_unlock_irqrestore(rq, &rf);
+ return decayed;
}
static unsigned long task_h_load(struct task_struct *p)
@@ -7669,6 +7742,24 @@
}
#endif
+static void update_blocked_averages(int cpu)
+{
+ bool decayed = false, done = true;
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+
+ decayed |= __update_blocked_others(rq, &done);
+ decayed |= __update_blocked_fair(rq, &done);
+
+ update_blocked_load_status(rq, !done);
+ if (decayed)
+ cpufreq_update_util(rq, 0);
+ rq_unlock_irqrestore(rq, &rf);
+}
+
/********** Helpers for find_busiest_group ************************/
/*
@@ -9335,7 +9426,12 @@
{
int ilb_cpu;
- nohz.next_balance++;
+ /*
+ * Increase nohz.next_balance only when if full ilb is triggered but
+ * not if we only update stats.
+ */
+ if (flags & NOHZ_BALANCE_KICK)
+ nohz.next_balance = jiffies+1;
ilb_cpu = find_new_ilb();
@@ -9653,6 +9749,14 @@
}
}
+ /*
+ * next_balance will be updated only when there is a need.
+ * When the CPU is attached to null domain for ex, it will not be
+ * updated.
+ */
+ if (likely(update_next_balance))
+ nohz.next_balance = next_balance;
+
/* Newly idle CPU doesn't need an update */
if (idle != CPU_NEWLY_IDLE) {
update_blocked_averages(this_cpu);
@@ -9673,14 +9777,6 @@
if (has_blocked_load)
WRITE_ONCE(nohz.has_blocked, 1);
- /*
- * next_balance will be updated only when there is a need.
- * When the CPU is attached to null domain for ex, it will not be
- * updated.
- */
- if (likely(update_next_balance))
- nohz.next_balance = next_balance;
-
return ret;
}
@@ -10050,16 +10146,22 @@
{
struct cfs_rq *cfs_rq;
+ list_add_leaf_cfs_rq(cfs_rq_of(se));
+
/* Start to propagate at parent */
se = se->parent;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- if (cfs_rq_throttled(cfs_rq))
- break;
+ if (!cfs_rq_throttled(cfs_rq)){
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ list_add_leaf_cfs_rq(cfs_rq);
+ continue;
+ }
- update_load_avg(cfs_rq, se, UPDATE_TG);
+ if (list_add_leaf_cfs_rq(cfs_rq))
+ break;
}
}
#else
@@ -10151,7 +10253,7 @@
* This routine is mostly called to set cfs_rq->curr field when a task
* migrates between groups/classes.
*/
-static void set_next_task_fair(struct rq *rq, struct task_struct *p)
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
struct sched_entity *se = &p->se;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f65ef1e..3f8c786 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -249,6 +249,7 @@
}
arch_cpu_idle_enter();
+ rcu_nocb_flush_deferred_wakeup();
/*
* In poll mode we reenable interrupts and spin. Also if we
@@ -385,7 +386,7 @@
{
}
-static void set_next_task_idle(struct rq *rq, struct task_struct *next)
+static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
{
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
@@ -399,7 +400,7 @@
if (prev)
put_prev_task(rq, prev);
- set_next_task_idle(rq, next);
+ set_next_task_idle(rq, next, true);
return next;
}
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 28a5165..de22da6 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -231,16 +231,11 @@
return calc_load_idx & 1;
}
-void calc_load_nohz_start(void)
+static void calc_load_nohz_fold(struct rq *rq)
{
- struct rq *this_rq = this_rq();
long delta;
- /*
- * We're going into NO_HZ mode, if there's any pending delta, fold it
- * into the pending NO_HZ delta.
- */
- delta = calc_load_fold_active(this_rq, 0);
+ delta = calc_load_fold_active(rq, 0);
if (delta) {
int idx = calc_load_write_idx();
@@ -248,6 +243,24 @@
}
}
+void calc_load_nohz_start(void)
+{
+ /*
+ * We're going into NO_HZ mode, if there's any pending delta, fold it
+ * into the pending NO_HZ delta.
+ */
+ calc_load_nohz_fold(this_rq());
+}
+
+/*
+ * Keep track of the load for NOHZ_FULL, must be called between
+ * calc_load_nohz_{start,stop}().
+ */
+void calc_load_nohz_remote(struct rq *rq)
+{
+ calc_load_nohz_fold(rq);
+}
+
void calc_load_nohz_stop(void)
{
struct rq *this_rq = this_rq();
@@ -268,7 +281,7 @@
this_rq->calc_load_update += LOAD_FREQ;
}
-static long calc_load_nohz_fold(void)
+static long calc_load_nohz_read(void)
{
int idx = calc_load_read_idx();
long delta = 0;
@@ -323,7 +336,7 @@
}
#else /* !CONFIG_NO_HZ_COMMON */
-static inline long calc_load_nohz_fold(void) { return 0; }
+static inline long calc_load_nohz_read(void) { return 0; }
static inline void calc_global_nohz(void) { }
#endif /* CONFIG_NO_HZ_COMMON */
@@ -346,7 +359,7 @@
/*
* Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
*/
- delta = calc_load_nohz_fold();
+ delta = calc_load_nohz_read();
if (delta)
atomic_long_add(delta, &calc_load_tasks);
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 168479a..46c142b 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -30,6 +30,23 @@
smp_mb(); /* IPIs should be serializing but paranoid. */
}
+static void ipi_sync_core(void *info)
+{
+ /*
+ * The smp_mb() in membarrier after all the IPIs is supposed to
+ * ensure that memory on remote CPUs that occur before the IPI
+ * become visible to membarrier()'s caller -- see scenario B in
+ * the big comment at the top of this file.
+ *
+ * A sync_core() would provide this guarantee, but
+ * sync_core_before_usermode() might end up being deferred until
+ * after membarrier()'s smp_mb().
+ */
+ smp_mb(); /* IPIs should be serializing but paranoid. */
+
+ sync_core_before_usermode();
+}
+
static void ipi_sync_rq_state(void *info)
{
struct mm_struct *mm = (struct mm_struct *) info;
@@ -134,6 +151,7 @@
int cpu;
cpumask_var_t tmpmask;
struct mm_struct *mm = current->mm;
+ smp_call_func_t ipi_func = ipi_mb;
if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
@@ -141,6 +159,7 @@
if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
return -EPERM;
+ ipi_func = ipi_sync_core;
} else {
if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
@@ -181,7 +200,7 @@
rcu_read_unlock();
preempt_disable();
- smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ smp_call_function_many(tmpmask, ipi_func, NULL, 1);
preempt_enable();
free_cpumask_var(tmpmask);
@@ -246,9 +265,7 @@
}
rcu_read_unlock();
- preempt_disable();
- smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
- preempt_enable();
+ on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);
free_cpumask_var(tmpmask);
cpus_read_unlock();
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 517e371..9154e74 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -185,7 +185,8 @@
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
- group->avg_next_update = sched_clock() + psi_period;
+ group->avg_last_update = sched_clock();
+ group->avg_next_update = group->avg_last_update + psi_period;
INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
/* Init trigger-related members */
@@ -481,7 +482,7 @@
u32 remaining;
remaining = win->size - elapsed;
- growth += div_u64(win->prev_growth * remaining, win->size);
+ growth += div64_u64(win->prev_growth * remaining, win->size);
}
return growth;
@@ -1198,6 +1199,9 @@
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
+ if (!nbytes)
+ return -EINVAL;
+
buf_size = min(nbytes, sizeof(buf));
if (copy_from_user(buf, user_buf, buf_size))
return -EFAULT;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9b8adc0..2dffb87 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -9,6 +9,8 @@
int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+/* More than 4 hours if BW_SHIFT equals 20. */
+static const u64 max_rt_runtime = MAX_BW;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
@@ -1515,13 +1517,16 @@
#endif
}
-static inline void set_next_task_rt(struct rq *rq, struct task_struct *p)
+static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
{
p->se.exec_start = rq_clock_task(rq);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
+ if (!first)
+ return;
+
/*
* If prev task was rt, put_prev_task() has already updated the
* utilization. We only care of the case where we start to schedule a
@@ -1575,7 +1580,7 @@
return NULL;
p = _pick_next_task_rt(rq);
- set_next_task_rt(rq, p);
+ set_next_task_rt(rq, p, true);
return p;
}
@@ -2216,13 +2221,20 @@
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
/*
- * If we are already running, then there's nothing
- * that needs to be done. But if we are not running
- * we may need to preempt the current running task.
- * If that current running task is also an RT task
+ * If we are running, update the avg_rt tracking, as the running time
+ * will now on be accounted into the latter.
+ */
+ if (task_current(rq, p)) {
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+ return;
+ }
+
+ /*
+ * If we are not running we may need to preempt the current
+ * running task. If that current running task is also an RT task
* then see if we can move to another run queue.
*/
- if (task_on_rq_queued(p) && rq->curr != p) {
+ if (task_on_rq_queued(p)) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
rt_queue_push_tasks(rq);
@@ -2510,6 +2522,12 @@
if (rt_period == 0)
return -EINVAL;
+ /*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
+ return -EINVAL;
+
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
err = __rt_schedulable(tg, rt_period, rt_runtime);
@@ -2631,7 +2649,9 @@
return -EINVAL;
if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
- (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+ ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
+ ((u64)sysctl_sched_rt_runtime *
+ NSEC_PER_USEC > max_rt_runtime)))
return -EINVAL;
return 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c8870c5..fe755c1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -118,7 +118,13 @@
#ifdef CONFIG_64BIT
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
-# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT)
+# define scale_load_down(w) \
+({ \
+ unsigned long __w = (w); \
+ if (__w) \
+ __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \
+ __w; \
+})
#else
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w) (w)
@@ -203,6 +209,8 @@
*/
#define SCHED_FLAG_SUGOV 0x10000000
+#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
+
static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
{
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
@@ -241,30 +249,6 @@
void __dl_clear_params(struct task_struct *p);
-/*
- * To keep the bandwidth of -deadline tasks and groups under control
- * we need some place where:
- * - store the maximum -deadline bandwidth of the system (the group);
- * - cache the fraction of that bandwidth that is currently allocated.
- *
- * This is all done in the data structure below. It is similar to the
- * one used for RT-throttling (rt_bandwidth), with the main difference
- * that, since here we are only interested in admission control, we
- * do not decrease any runtime while the group "executes", neither we
- * need a timer to replenish it.
- *
- * With respect to SMP, the bandwidth is given on a per-CPU basis,
- * meaning that:
- * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
- * - dl_total_bw array contains, in the i-eth element, the currently
- * allocated bandwidth on the i-eth CPU.
- * Moreover, groups consume bandwidth on each CPU, while tasks only
- * consume bandwidth on the CPU they're running on.
- * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
- * that will be shown the next time the proc or cgroup controls will
- * be red. It on its turn can be changed by writing on its own
- * control.
- */
struct dl_bandwidth {
raw_spinlock_t dl_runtime_lock;
u64 dl_runtime;
@@ -276,6 +260,24 @@
return sysctl_sched_rt_runtime >= 0;
}
+/*
+ * To keep the bandwidth of -deadline tasks under control
+ * we need some place where:
+ * - store the maximum -deadline bandwidth of each cpu;
+ * - cache the fraction of bandwidth that is currently allocated in
+ * each root domain;
+ *
+ * This is all done in the data structure below. It is similar to the
+ * one used for RT-throttling (rt_bandwidth), with the main difference
+ * that, since here we are only interested in admission control, we
+ * do not decrease any runtime while the group "executes", neither we
+ * need a timer to replenish it.
+ *
+ * With respect to SMP, bandwidth is given on a per root domain basis,
+ * meaning that:
+ * - bw (< 100%) is the deadline bandwidth of each CPU;
+ * - total_bw is the currently allocated bandwidth in each root domain;
+ */
struct dl_bw {
raw_spinlock_t lock;
u64 bw;
@@ -835,6 +837,8 @@
unsigned int value;
struct uclamp_bucket bucket[UCLAMP_BUCKETS];
};
+
+DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
#endif /* CONFIG_UCLAMP_TASK */
/*
@@ -971,6 +975,7 @@
call_single_data_t hrtick_csd;
#endif
struct hrtimer hrtick_timer;
+ ktime_t hrtick_time;
#endif
#ifdef CONFIG_SCHEDSTATS
@@ -1560,7 +1565,7 @@
#undef SCHED_FEAT
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
+#ifdef CONFIG_SCHED_DEBUG
/*
* To support run-time toggling of sched features, all the translation units
@@ -1568,6 +1573,7 @@
*/
extern const_debug unsigned int sysctl_sched_features;
+#ifdef CONFIG_JUMP_LABEL
#define SCHED_FEAT(name, enabled) \
static __always_inline bool static_branch_##name(struct static_key *key) \
{ \
@@ -1580,7 +1586,13 @@
extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
-#else /* !(SCHED_DEBUG && CONFIG_JUMP_LABEL) */
+#else /* !CONFIG_JUMP_LABEL */
+
+#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+
+#endif /* CONFIG_JUMP_LABEL */
+
+#else /* !SCHED_DEBUG */
/*
* Each translation unit has its own copy of sysctl_sched_features to allow
@@ -1596,7 +1608,7 @@
#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
-#endif /* SCHED_DEBUG && CONFIG_JUMP_LABEL */
+#endif /* SCHED_DEBUG */
extern struct static_key_false sched_numa_balancing;
extern struct static_key_false sched_schedstats;
@@ -1728,7 +1740,7 @@
struct task_struct *prev,
struct rq_flags *rf);
void (*put_prev_task)(struct rq *rq, struct task_struct *p);
- void (*set_next_task)(struct rq *rq, struct task_struct *p);
+ void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
#ifdef CONFIG_SMP
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
@@ -1780,7 +1792,7 @@
static inline void set_next_task(struct rq *rq, struct task_struct *next)
{
WARN_ON_ONCE(rq->curr != next);
- next->sched_class->set_next_task(rq, next);
+ next->sched_class->set_next_task(rq, next, false);
}
#ifdef CONFIG_SMP
@@ -1883,6 +1895,8 @@
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
#define RATIO_SHIFT 8
+#define MAX_BW_BITS (64 - BW_SHIFT)
+#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
@@ -2309,14 +2323,37 @@
#endif /* CONFIG_CPU_FREQ */
#ifdef CONFIG_UCLAMP_TASK
-enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+/**
+ * uclamp_util_with - clamp @util with @rq and @p effective uclamp values.
+ * @rq: The rq to clamp against. Must not be NULL.
+ * @util: The util value to clamp.
+ * @p: The task to clamp against. Can be NULL if you want to clamp
+ * against @rq only.
+ *
+ * Clamps the passed @util to the max(@rq, @p) effective uclamp values.
+ *
+ * If sched_uclamp_used static key is disabled, then just return the util
+ * without any clamping since uclamp aggregation at the rq level in the fast
+ * path is disabled, rendering this operation a NOP.
+ *
+ * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It
+ * will return the correct effective uclamp value of the task even if the
+ * static key is disabled.
+ */
static __always_inline
unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
struct task_struct *p)
{
- unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
- unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+ unsigned int min_util;
+ unsigned int max_util;
+
+ if (!static_branch_likely(&sched_uclamp_used))
+ return util;
+
+ min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
+ max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
if (p) {
min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
@@ -2338,6 +2375,19 @@
{
return uclamp_util_with(rq, util, NULL);
}
+
+/*
+ * When uclamp is compiled in, the aggregation at rq level is 'turned off'
+ * by default in the fast path and only gets turned on once userspace performs
+ * an operation that requires it.
+ *
+ * Returns true if userspace opted-in to use uclamp and aggregation at rq level
+ * hence is active.
+ */
+static inline bool uclamp_is_used(void)
+{
+ return static_branch_likely(&sched_uclamp_used);
+}
#else /* CONFIG_UCLAMP_TASK */
static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
struct task_struct *p)
@@ -2348,6 +2398,11 @@
{
return util;
}
+
+static inline bool uclamp_is_used(void)
+{
+ return false;
+}
#endif /* CONFIG_UCLAMP_TASK */
#ifdef arch_scale_freq_capacity
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index c064073..02dc0a8 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -29,7 +29,7 @@
/* we're never preempted */
}
-static void set_next_task_stop(struct rq *rq, struct task_struct *stop)
+static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool first)
{
stop->se.exec_start = rq_clock_task(rq);
}
@@ -42,7 +42,7 @@
if (!sched_stop_runnable(rq))
return NULL;
- set_next_task_stop(rq, rq->stop);
+ set_next_task_stop(rq, rq->stop, true);
return rq->stop;
}
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 49b835f..ffaa97a 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1333,7 +1333,7 @@
sd_flags = (*tl->sd_flags)();
if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
"wrong sd_flags in topology description\n"))
- sd_flags &= ~TOPOLOGY_SD_FLAGS;
+ sd_flags &= TOPOLOGY_SD_FLAGS;
/* Apply detected topology flags */
sd_flags |= dflags;
@@ -1883,6 +1883,42 @@
}
/*
+ * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
+ * any two given CPUs at this (non-NUMA) topology level.
+ */
+static bool topology_span_sane(struct sched_domain_topology_level *tl,
+ const struct cpumask *cpu_map, int cpu)
+{
+ int i;
+
+ /* NUMA levels are allowed to overlap */
+ if (tl->flags & SDTL_OVERLAP)
+ return true;
+
+ /*
+ * Non-NUMA levels cannot partially overlap - they must be either
+ * completely equal or completely disjoint. Otherwise we can end up
+ * breaking the sched_group lists - i.e. a later get_group() pass
+ * breaks the linking done for an earlier span.
+ */
+ for_each_cpu(i, cpu_map) {
+ if (i == cpu)
+ continue;
+ /*
+ * We should 'and' all those masks with 'cpu_map' to exactly
+ * match the topology we're about to build, but that can only
+ * remove CPUs, which only lessens our ability to detect
+ * overlaps
+ */
+ if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
+ cpumask_intersects(tl->mask(cpu), tl->mask(i)))
+ return false;
+ }
+
+ return true;
+}
+
+/*
* Find the sched_domain_topology_level where all CPU capacities are visible
* for all CPUs.
*/
@@ -1978,6 +2014,9 @@
has_asym = true;
}
+ if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
+ goto error;
+
sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
if (tl == sched_domain_topology)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index c1e566a..84bd051 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -232,17 +232,22 @@
}
EXPORT_SYMBOL(prepare_to_wait);
-void
+/* Returns true if we are the first waiter in the queue, false otherwise. */
+bool
prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
{
unsigned long flags;
+ bool was_empty = false;
wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&wq_head->lock, flags);
- if (list_empty(&wq_entry->entry))
+ if (list_empty(&wq_entry->entry)) {
+ was_empty = list_empty(&wq_head->head);
__add_wait_queue_entry_tail(wq_head, wq_entry);
+ }
set_current_state(state);
spin_unlock_irqrestore(&wq_head->lock, flags);
+ return was_empty;
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);