1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
4 * policies)
5 */
6#include "sched.h"
7
8#include "pelt.h"
9
10int sched_rr_timeslice = RR_TIMESLICE;
11int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
12/* More than 4 hours if BW_SHIFT equals 20. */
13static const u64 max_rt_runtime = MAX_BW;
14
15static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
16
17struct rt_bandwidth def_rt_bandwidth;
18
19static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
20{
21 struct rt_bandwidth *rt_b =
22 container_of(timer, struct rt_bandwidth, rt_period_timer);
23 int idle = 0;
24 int overrun;
25
26 raw_spin_lock(&rt_b->rt_runtime_lock);
27 for (;;) {
28 overrun = hrtimer_forward_now(timer, rt_b->rt_period);
29 if (!overrun)
30 break;
31
32 raw_spin_unlock(&rt_b->rt_runtime_lock);
33 idle = do_sched_rt_period_timer(rt_b, overrun);
34 raw_spin_lock(&rt_b->rt_runtime_lock);
35 }
36 if (idle)
37 rt_b->rt_period_active = 0;
38 raw_spin_unlock(&rt_b->rt_runtime_lock);
39
40 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
41}
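/*
 * Example (illustrative numbers, not taken from this file): with a 1s
 * rt_period, hrtimer_forward_now() reports how many whole periods have
 * elapsed since the timer last ran.  If three periods slipped by,
 * overrun is 3 and do_sched_rt_period_timer() can replenish up to three
 * periods worth of runtime in one call before the loop re-checks.
 */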
42
43void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
44{
45 rt_b->rt_period = ns_to_ktime(period);
46 rt_b->rt_runtime = runtime;
47
48 raw_spin_lock_init(&rt_b->rt_runtime_lock);
49
50 hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
51 HRTIMER_MODE_REL_HARD);
52 rt_b->rt_period_timer.function = sched_rt_period_timer;
53}
54
55static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
56{
57 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
58 return;
59
60 raw_spin_lock(&rt_b->rt_runtime_lock);
61 if (!rt_b->rt_period_active) {
62 rt_b->rt_period_active = 1;
63 /*
64 * SCHED_DEADLINE updates the bandwidth, as a runaway
65 * RT task alongside a DL task could hog a CPU. But DL does
66 * not reset the period. If a deadline task was running
67 * without an RT task running, it can cause RT tasks to
68 * throttle when they start up. Kick the timer right away
69 * to update the period.
70 */
71 hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
72 hrtimer_start_expires(&rt_b->rt_period_timer,
73 HRTIMER_MODE_ABS_PINNED_HARD);
74 }
75 raw_spin_unlock(&rt_b->rt_runtime_lock);
76}
77
78void init_rt_rq(struct rt_rq *rt_rq)
79{
80 struct rt_prio_array *array;
81 int i;
82
83 array = &rt_rq->active;
84 for (i = 0; i < MAX_RT_PRIO; i++) {
85 INIT_LIST_HEAD(array->queue + i);
86 __clear_bit(i, array->bitmap);
87 }
88 /* delimiter for bitsearch: */
89 __set_bit(MAX_RT_PRIO, array->bitmap);
90
91#if defined CONFIG_SMP
92 rt_rq->highest_prio.curr = MAX_RT_PRIO;
93 rt_rq->highest_prio.next = MAX_RT_PRIO;
94 rt_rq->rt_nr_migratory = 0;
95 rt_rq->overloaded = 0;
96 plist_head_init(&rt_rq->pushable_tasks);
97#endif /* CONFIG_SMP */
98 /* We start in dequeued state, because no RT tasks are queued */
99 rt_rq->rt_queued = 0;
100
101 rt_rq->rt_time = 0;
102 rt_rq->rt_throttled = 0;
103 rt_rq->rt_runtime = 0;
104 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
105}
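/*
 * Note on the layout above: active.queue[i] holds the runnable entities
 * at priority i, and bit i of the bitmap is set iff that list is
 * non-empty.  Bit MAX_RT_PRIO stays set as a sentinel so that
 * sched_find_first_bit() always terminates; an empty array therefore
 * yields MAX_RT_PRIO, which callers treat as "no RT task queued".
 */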
106
107#ifdef CONFIG_RT_GROUP_SCHED
108static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
109{
110 hrtimer_cancel(&rt_b->rt_period_timer);
111}
112
113#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
114
115static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
116{
117#ifdef CONFIG_SCHED_DEBUG
118 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
119#endif
120 return container_of(rt_se, struct task_struct, rt);
121}
122
123static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
124{
125 return rt_rq->rq;
126}
127
128static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
129{
130 return rt_se->rt_rq;
131}
132
133static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
134{
135 struct rt_rq *rt_rq = rt_se->rt_rq;
136
137 return rt_rq->rq;
138}
139
140void free_rt_sched_group(struct task_group *tg)
141{
142 int i;
143
144 if (tg->rt_se)
145 destroy_rt_bandwidth(&tg->rt_bandwidth);
146
147 for_each_possible_cpu(i) {
148 if (tg->rt_rq)
149 kfree(tg->rt_rq[i]);
150 if (tg->rt_se)
151 kfree(tg->rt_se[i]);
152 }
153
154 kfree(tg->rt_rq);
155 kfree(tg->rt_se);
156}
157
158void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
159 struct sched_rt_entity *rt_se, int cpu,
160 struct sched_rt_entity *parent)
161{
162 struct rq *rq = cpu_rq(cpu);
163
164 rt_rq->highest_prio.curr = MAX_RT_PRIO;
165 rt_rq->rt_nr_boosted = 0;
166 rt_rq->rq = rq;
167 rt_rq->tg = tg;
168
169 tg->rt_rq[cpu] = rt_rq;
170 tg->rt_se[cpu] = rt_se;
171
172 if (!rt_se)
173 return;
174
175 if (!parent)
176 rt_se->rt_rq = &rq->rt;
177 else
178 rt_se->rt_rq = parent->my_q;
179
180 rt_se->my_q = rt_rq;
181 rt_se->parent = parent;
182 INIT_LIST_HEAD(&rt_se->run_list);
183}
184
185int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
186{
187 struct rt_rq *rt_rq;
188 struct sched_rt_entity *rt_se;
189 int i;
190
191 tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
192 if (!tg->rt_rq)
193 goto err;
194 tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
195 if (!tg->rt_se)
196 goto err;
197
198 init_rt_bandwidth(&tg->rt_bandwidth,
199 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
200
201 for_each_possible_cpu(i) {
202 rt_rq = kzalloc_node(sizeof(struct rt_rq),
203 GFP_KERNEL, cpu_to_node(i));
204 if (!rt_rq)
205 goto err;
206
207 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
208 GFP_KERNEL, cpu_to_node(i));
209 if (!rt_se)
210 goto err_free_rq;
211
212 init_rt_rq(rt_rq);
213 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
214 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
215 }
216
217 return 1;
218
219err_free_rq:
220 kfree(rt_rq);
221err:
222 return 0;
223}
224
225#else /* CONFIG_RT_GROUP_SCHED */
226
227#define rt_entity_is_task(rt_se) (1)
228
229static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
230{
231 return container_of(rt_se, struct task_struct, rt);
232}
233
234static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
235{
236 return container_of(rt_rq, struct rq, rt);
237}
238
239static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
240{
241 struct task_struct *p = rt_task_of(rt_se);
242
243 return task_rq(p);
244}
245
246static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
247{
248 struct rq *rq = rq_of_rt_se(rt_se);
249
250 return &rq->rt;
251}
252
253void free_rt_sched_group(struct task_group *tg) { }
254
255int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
256{
257 return 1;
258}
259#endif /* CONFIG_RT_GROUP_SCHED */
260
261#ifdef CONFIG_SMP
262
263static void pull_rt_task(struct rq *this_rq);
264
265static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
266{
267 /* Try to pull RT tasks here if we lower this rq's prio */
268 return rq->rt.highest_prio.curr > prev->prio;
269}
270
271static inline int rt_overloaded(struct rq *rq)
272{
273 return atomic_read(&rq->rd->rto_count);
274}
275
276static inline void rt_set_overload(struct rq *rq)
277{
278 if (!rq->online)
279 return;
280
281 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
282 /*
283 * Make sure the mask is visible before we set
284 * the overload count. That is checked to determine
285 * if we should look at the mask. It would be a shame
286 * if we looked at the mask, but the mask was not
287 * updated yet.
288 *
289 * Matched by the barrier in pull_rt_task().
290 */
291 smp_wmb();
292 atomic_inc(&rq->rd->rto_count);
293}
294
295static inline void rt_clear_overload(struct rq *rq)
296{
297 if (!rq->online)
298 return;
299
300 /* the order here really doesn't matter */
301 atomic_dec(&rq->rd->rto_count);
302 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
303}
304
305static void update_rt_migration(struct rt_rq *rt_rq)
306{
307 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
308 if (!rt_rq->overloaded) {
309 rt_set_overload(rq_of_rt_rq(rt_rq));
310 rt_rq->overloaded = 1;
311 }
312 } else if (rt_rq->overloaded) {
313 rt_clear_overload(rq_of_rt_rq(rt_rq));
314 rt_rq->overloaded = 0;
315 }
316}
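/*
 * In short: a runqueue is marked RT-overloaded only when it has more
 * than one RT task queued *and* at least one of them may run elsewhere
 * (rt_nr_migratory).  A single pinned RT task never sets the overload
 * bit, so other CPUs will not waste time trying to pull from this rq.
 */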
317
318static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
319{
320 struct task_struct *p;
321
322 if (!rt_entity_is_task(rt_se))
323 return;
324
325 p = rt_task_of(rt_se);
326 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
327
328 rt_rq->rt_nr_total++;
329 if (p->nr_cpus_allowed > 1)
330 rt_rq->rt_nr_migratory++;
331
332 update_rt_migration(rt_rq);
333}
334
335static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
336{
337 struct task_struct *p;
338
339 if (!rt_entity_is_task(rt_se))
340 return;
341
342 p = rt_task_of(rt_se);
343 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
344
345 rt_rq->rt_nr_total--;
346 if (p->nr_cpus_allowed > 1)
347 rt_rq->rt_nr_migratory--;
348
349 update_rt_migration(rt_rq);
350}
351
352static inline int has_pushable_tasks(struct rq *rq)
353{
354 return !plist_head_empty(&rq->rt.pushable_tasks);
355}
356
357static DEFINE_PER_CPU(struct callback_head, rt_push_head);
358static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
359
360static void push_rt_tasks(struct rq *);
361static void pull_rt_task(struct rq *);
362
363static inline void rt_queue_push_tasks(struct rq *rq)
364{
365 if (!has_pushable_tasks(rq))
366 return;
367
368 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
369}
370
371static inline void rt_queue_pull_task(struct rq *rq)
372{
373 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
374}
375
376static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
377{
378 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
379 plist_node_init(&p->pushable_tasks, p->prio);
380 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
381
382 /* Update the highest prio pushable task */
383 if (p->prio < rq->rt.highest_prio.next)
384 rq->rt.highest_prio.next = p->prio;
385}
386
387static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
388{
389 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
390
391 /* Update the new highest prio pushable task */
392 if (has_pushable_tasks(rq)) {
393 p = plist_first_entry(&rq->rt.pushable_tasks,
394 struct task_struct, pushable_tasks);
395 rq->rt.highest_prio.next = p->prio;
396 } else
397 rq->rt.highest_prio.next = MAX_RT_PRIO;
398}
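/*
 * The pushable_tasks plist is kept sorted by task prio (lower value
 * means higher priority), so the plist_first_entry() above is always
 * the best push candidate.  highest_prio.next mirrors that head
 * priority, letting pull_rt_task() on other CPUs skip this rq without
 * taking its lock.
 */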
399
400#else
401
402static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
403{
404}
405
406static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
407{
408}
409
410static inline
411void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
412{
413}
414
415static inline
416void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
417{
418}
419
420static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
421{
422 return false;
423}
424
425static inline void pull_rt_task(struct rq *this_rq)
426{
427}
428
429static inline void rt_queue_push_tasks(struct rq *rq)
430{
431}
432#endif /* CONFIG_SMP */
433
434static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
435static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
436
437static inline int on_rt_rq(struct sched_rt_entity *rt_se)
438{
439 return rt_se->on_rq;
440}
441
442#ifdef CONFIG_RT_GROUP_SCHED
443
444static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
445{
446 if (!rt_rq->tg)
447 return RUNTIME_INF;
448
449 return rt_rq->rt_runtime;
450}
451
452static inline u64 sched_rt_period(struct rt_rq *rt_rq)
453{
454 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
455}
456
457typedef struct task_group *rt_rq_iter_t;
458
459static inline struct task_group *next_task_group(struct task_group *tg)
460{
461 do {
462 tg = list_entry_rcu(tg->list.next,
463 typeof(struct task_group), list);
464 } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
465
466 if (&tg->list == &task_groups)
467 tg = NULL;
468
469 return tg;
470}
471
472#define for_each_rt_rq(rt_rq, iter, rq) \
473 for (iter = container_of(&task_groups, typeof(*iter), list); \
474 (iter = next_task_group(iter)) && \
475 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
476
477#define for_each_sched_rt_entity(rt_se) \
478 for (; rt_se; rt_se = rt_se->parent)
479
480static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
481{
482 return rt_se->my_q;
483}
484
485static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
486static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
487
488static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
489{
490 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
491 struct rq *rq = rq_of_rt_rq(rt_rq);
492 struct sched_rt_entity *rt_se;
493
494 int cpu = cpu_of(rq);
495
496 rt_se = rt_rq->tg->rt_se[cpu];
497
498 if (rt_rq->rt_nr_running) {
499 if (!rt_se)
500 enqueue_top_rt_rq(rt_rq);
501 else if (!on_rt_rq(rt_se))
502 enqueue_rt_entity(rt_se, 0);
503
504 if (rt_rq->highest_prio.curr < curr->prio)
505 resched_curr(rq);
506 }
507}
508
509static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
510{
511 struct sched_rt_entity *rt_se;
512 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
513
514 rt_se = rt_rq->tg->rt_se[cpu];
515
516 if (!rt_se) {
517 dequeue_top_rt_rq(rt_rq);
518 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
519 cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
520 }
521 else if (on_rt_rq(rt_se))
522 dequeue_rt_entity(rt_se, 0);
523}
524
525static inline int rt_rq_throttled(struct rt_rq *rt_rq)
526{
527 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
528}
529
530static int rt_se_boosted(struct sched_rt_entity *rt_se)
531{
532 struct rt_rq *rt_rq = group_rt_rq(rt_se);
533 struct task_struct *p;
534
535 if (rt_rq)
536 return !!rt_rq->rt_nr_boosted;
537
538 p = rt_task_of(rt_se);
539 return p->prio != p->normal_prio;
540}
541
542#ifdef CONFIG_SMP
543static inline const struct cpumask *sched_rt_period_mask(void)
544{
545 return this_rq()->rd->span;
546}
547#else
548static inline const struct cpumask *sched_rt_period_mask(void)
549{
550 return cpu_online_mask;
551}
552#endif
553
554static inline
555struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
556{
557 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
558}
559
560static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
561{
562 return &rt_rq->tg->rt_bandwidth;
563}
564
565#else /* !CONFIG_RT_GROUP_SCHED */
566
567static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
568{
569 return rt_rq->rt_runtime;
570}
571
572static inline u64 sched_rt_period(struct rt_rq *rt_rq)
573{
574 return ktime_to_ns(def_rt_bandwidth.rt_period);
575}
576
577typedef struct rt_rq *rt_rq_iter_t;
578
579#define for_each_rt_rq(rt_rq, iter, rq) \
580 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
581
582#define for_each_sched_rt_entity(rt_se) \
583 for (; rt_se; rt_se = NULL)
584
585static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
586{
587 return NULL;
588}
589
590static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
591{
592 struct rq *rq = rq_of_rt_rq(rt_rq);
593
594 if (!rt_rq->rt_nr_running)
595 return;
596
597 enqueue_top_rt_rq(rt_rq);
598 resched_curr(rq);
599}
600
601static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
602{
603 dequeue_top_rt_rq(rt_rq);
604}
605
606static inline int rt_rq_throttled(struct rt_rq *rt_rq)
607{
608 return rt_rq->rt_throttled;
609}
610
611static inline const struct cpumask *sched_rt_period_mask(void)
612{
613 return cpu_online_mask;
614}
615
616static inline
617struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
618{
619 return &cpu_rq(cpu)->rt;
620}
621
622static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
623{
624 return &def_rt_bandwidth;
625}
626
627#endif /* CONFIG_RT_GROUP_SCHED */
628
629bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
630{
631 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
632
633 return (hrtimer_active(&rt_b->rt_period_timer) ||
634 rt_rq->rt_time < rt_b->rt_runtime);
635}
636
637#ifdef CONFIG_SMP
638/*
639 * We ran out of runtime, see if we can borrow some from our neighbours.
640 */
641static void do_balance_runtime(struct rt_rq *rt_rq)
642{
643 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
644 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
645 int i, weight;
646 u64 rt_period;
647
648 weight = cpumask_weight(rd->span);
649
650 raw_spin_lock(&rt_b->rt_runtime_lock);
651 rt_period = ktime_to_ns(rt_b->rt_period);
652 for_each_cpu(i, rd->span) {
653 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
654 s64 diff;
655
656 if (iter == rt_rq)
657 continue;
658
659 raw_spin_lock(&iter->rt_runtime_lock);
660 /*
661 * Either all rqs have inf runtime and there's nothing to steal
662 * or __disable_runtime() below sets a specific rq to inf to
663 * indicate it's been disabled and disallow stealing.
664 */
665 if (iter->rt_runtime == RUNTIME_INF)
666 goto next;
667
668 /*
669 * From runqueues with spare time, take 1/n part of their
670 * spare time, but no more than our period.
671 */
672 diff = iter->rt_runtime - iter->rt_time;
673 if (diff > 0) {
674 diff = div_u64((u64)diff, weight);
675 if (rt_rq->rt_runtime + diff > rt_period)
676 diff = rt_period - rt_rq->rt_runtime;
677 iter->rt_runtime -= diff;
678 rt_rq->rt_runtime += diff;
679 if (rt_rq->rt_runtime == rt_period) {
680 raw_spin_unlock(&iter->rt_runtime_lock);
681 break;
682 }
683 }
684next:
685 raw_spin_unlock(&iter->rt_runtime_lock);
686 }
687 raw_spin_unlock(&rt_b->rt_runtime_lock);
688}
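/*
 * Worked example (illustrative numbers): on a 4-CPU root domain with a
 * 1s period, suppose this rq exhausted its 0.5s runtime while a
 * neighbour with 0.5s runtime has consumed only 0.1s.  The neighbour's
 * spare time is 0.4s, so we borrow 0.4s / 4 = 0.1s of it, raising our
 * local runtime to 0.6s; the transfer is always capped so the total
 * never exceeds the period.
 */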
689
690/*
691 * Ensure this RQ takes back all the runtime it lent to its neighbours.
692 */
693static void __disable_runtime(struct rq *rq)
694{
695 struct root_domain *rd = rq->rd;
696 rt_rq_iter_t iter;
697 struct rt_rq *rt_rq;
698
699 if (unlikely(!scheduler_running))
700 return;
701
702 for_each_rt_rq(rt_rq, iter, rq) {
703 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
704 s64 want;
705 int i;
706
707 raw_spin_lock(&rt_b->rt_runtime_lock);
708 raw_spin_lock(&rt_rq->rt_runtime_lock);
709 /*
710 * Either we're all inf and nobody needs to borrow, or we're
711 * already disabled and thus have nothing to do, or we have
712 * exactly the right amount of runtime to take out.
713 */
714 if (rt_rq->rt_runtime == RUNTIME_INF ||
715 rt_rq->rt_runtime == rt_b->rt_runtime)
716 goto balanced;
717 raw_spin_unlock(&rt_rq->rt_runtime_lock);
718
719 /*
720 * Calculate the difference between what we started out with
721 * and what we currently have; that's the amount of runtime
722 * we lent out and now have to reclaim.
723 */
724 want = rt_b->rt_runtime - rt_rq->rt_runtime;
725
726 /*
727 * Greedy reclaim, take back as much as we can.
728 */
729 for_each_cpu(i, rd->span) {
730 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
731 s64 diff;
732
733 /*
734 * Can't reclaim from ourselves or disabled runqueues.
735 */
736 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
737 continue;
738
739 raw_spin_lock(&iter->rt_runtime_lock);
740 if (want > 0) {
741 diff = min_t(s64, iter->rt_runtime, want);
742 iter->rt_runtime -= diff;
743 want -= diff;
744 } else {
745 iter->rt_runtime -= want;
746 want -= want;
747 }
748 raw_spin_unlock(&iter->rt_runtime_lock);
749
750 if (!want)
751 break;
752 }
753
754 raw_spin_lock(&rt_rq->rt_runtime_lock);
755 /*
756 * We cannot be left wanting - that would mean some runtime
757 * leaked out of the system.
758 */
759 BUG_ON(want);
760balanced:
761 /*
762 * Disable all the borrow logic by pretending we have inf
763 * runtime - in which case borrowing doesn't make sense.
764 */
765 rt_rq->rt_runtime = RUNTIME_INF;
766 rt_rq->rt_throttled = 0;
767 raw_spin_unlock(&rt_rq->rt_runtime_lock);
768 raw_spin_unlock(&rt_b->rt_runtime_lock);
769
770 /* Make rt_rq available for pick_next_task() */
771 sched_rt_rq_enqueue(rt_rq);
772 }
773}
774
775static void __enable_runtime(struct rq *rq)
776{
777 rt_rq_iter_t iter;
778 struct rt_rq *rt_rq;
779
780 if (unlikely(!scheduler_running))
781 return;
782
783 /*
784 * Reset each runqueue's bandwidth settings
785 */
786 for_each_rt_rq(rt_rq, iter, rq) {
787 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
788
789 raw_spin_lock(&rt_b->rt_runtime_lock);
790 raw_spin_lock(&rt_rq->rt_runtime_lock);
791 rt_rq->rt_runtime = rt_b->rt_runtime;
792 rt_rq->rt_time = 0;
793 rt_rq->rt_throttled = 0;
794 raw_spin_unlock(&rt_rq->rt_runtime_lock);
795 raw_spin_unlock(&rt_b->rt_runtime_lock);
796 }
797}
798
799static void balance_runtime(struct rt_rq *rt_rq)
800{
801 if (!sched_feat(RT_RUNTIME_SHARE))
802 return;
803
804 if (rt_rq->rt_time > rt_rq->rt_runtime) {
805 raw_spin_unlock(&rt_rq->rt_runtime_lock);
806 do_balance_runtime(rt_rq);
807 raw_spin_lock(&rt_rq->rt_runtime_lock);
808 }
809}
810#else /* !CONFIG_SMP */
811static inline void balance_runtime(struct rt_rq *rt_rq) {}
812#endif /* CONFIG_SMP */
813
814static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
815{
816 int i, idle = 1, throttled = 0;
817 const struct cpumask *span;
818
819 span = sched_rt_period_mask();
820#ifdef CONFIG_RT_GROUP_SCHED
821 /*
822 * FIXME: isolated CPUs should really leave the root task group,
823 * whether they are isolcpus or were isolated via cpusets, lest
824 * the timer run on a CPU which does not service all runqueues,
825 * potentially leaving other CPUs indefinitely throttled. If
826 * isolation is really required, the user will turn the throttle
827 * off to kill the perturbations it causes anyway. Meanwhile,
828 * this maintains functionality for boot and/or troubleshooting.
829 */
830 if (rt_b == &root_task_group.rt_bandwidth)
831 span = cpu_online_mask;
832#endif
833 for_each_cpu(i, span) {
834 int enqueue = 0;
835 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
836 struct rq *rq = rq_of_rt_rq(rt_rq);
837 int skip;
838
839 /*
840 * When span == cpu_online_mask, taking each rq->lock
841 * can be time-consuming. Try to avoid it when possible.
842 */
843 raw_spin_lock(&rt_rq->rt_runtime_lock);
844 if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
845 rt_rq->rt_runtime = rt_b->rt_runtime;
846 skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
847 raw_spin_unlock(&rt_rq->rt_runtime_lock);
848 if (skip)
849 continue;
850
851 raw_spin_lock(&rq->lock);
852 update_rq_clock(rq);
853
854 if (rt_rq->rt_time) {
855 u64 runtime;
856
857 raw_spin_lock(&rt_rq->rt_runtime_lock);
858 if (rt_rq->rt_throttled)
859 balance_runtime(rt_rq);
860 runtime = rt_rq->rt_runtime;
861 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
862 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
863 rt_rq->rt_throttled = 0;
864 enqueue = 1;
865
866 /*
867 * When we're idle and a woken (rt) task is
868 * throttled check_preempt_curr() will set
869 * skip_update and the time between the wakeup
870 * and this unthrottle will get accounted as
871 * 'runtime'.
872 */
873 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
874 rq_clock_cancel_skipupdate(rq);
875 }
876 if (rt_rq->rt_time || rt_rq->rt_nr_running)
877 idle = 0;
878 raw_spin_unlock(&rt_rq->rt_runtime_lock);
879 } else if (rt_rq->rt_nr_running) {
880 idle = 0;
881 if (!rt_rq_throttled(rt_rq))
882 enqueue = 1;
883 }
884 if (rt_rq->rt_throttled)
885 throttled = 1;
886
887 if (enqueue)
888 sched_rt_rq_enqueue(rt_rq);
889 raw_spin_unlock(&rq->lock);
890 }
891
892 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
893 return 1;
894
895 return idle;
896}
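/*
 * Replenishment math (illustrative numbers): rt_time is reduced by
 * overrun * runtime, i.e. one full budget per elapsed period.  With a
 * 0.95s budget and rt_time == 1.2s after two missed periods, the new
 * rt_time is max(0, 1.2 - 2 * 0.95) = 0, so the throttle is lifted and
 * the rt_rq is enqueued again.
 */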
897
898static inline int rt_se_prio(struct sched_rt_entity *rt_se)
899{
900#ifdef CONFIG_RT_GROUP_SCHED
901 struct rt_rq *rt_rq = group_rt_rq(rt_se);
902
903 if (rt_rq)
904 return rt_rq->highest_prio.curr;
905#endif
906
907 return rt_task_of(rt_se)->prio;
908}
909
910static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
911{
912 u64 runtime = sched_rt_runtime(rt_rq);
913
914 if (rt_rq->rt_throttled)
915 return rt_rq_throttled(rt_rq);
916
917 if (runtime >= sched_rt_period(rt_rq))
918 return 0;
919
920 balance_runtime(rt_rq);
921 runtime = sched_rt_runtime(rt_rq);
922 if (runtime == RUNTIME_INF)
923 return 0;
924
925 if (rt_rq->rt_time > runtime) {
926 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
927
928 /*
929 * Don't actually throttle groups that have no runtime assigned
930 * but accrue some time due to boosting.
931 */
932 if (likely(rt_b->rt_runtime)) {
933 rt_rq->rt_throttled = 1;
934 printk_deferred_once("sched: RT throttling activated\n");
935 } else {
936 /*
937 * In case we did anyway, make it go away,
938 * replenishment is a joke, since it will replenish us
939 * with exactly 0 ns.
940 */
941 rt_rq->rt_time = 0;
942 }
943
944 if (rt_rq_throttled(rt_rq)) {
945 sched_rt_rq_dequeue(rt_rq);
946 return 1;
947 }
948 }
949
950 return 0;
951}
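/*
 * With the default sysctls (sched_rt_runtime_us = 950000,
 * sched_rt_period_us = 1000000) an RT-hogged CPU is throttled after
 * 0.95s of every 1s period, leaving roughly 5% for non-RT tasks.  A
 * group with no runtime assigned is never marked throttled here; its
 * rt_time is simply cleared, since a 0 ns replenishment could never
 * unthrottle it anyway.
 */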
952
953/*
954 * Update the current task's runtime statistics. Skip current tasks that
955 * are not in our scheduling class.
956 */
957static void update_curr_rt(struct rq *rq)
958{
959 struct task_struct *curr = rq->curr;
960 struct sched_rt_entity *rt_se = &curr->rt;
961 u64 delta_exec;
962 u64 now;
963
964 if (curr->sched_class != &rt_sched_class)
965 return;
966
967 now = rq_clock_task(rq);
968 delta_exec = now - curr->se.exec_start;
969 if (unlikely((s64)delta_exec <= 0))
970 return;
971
972 schedstat_set(curr->se.statistics.exec_max,
973 max(curr->se.statistics.exec_max, delta_exec));
974
975 curr->se.sum_exec_runtime += delta_exec;
976 account_group_exec_runtime(curr, delta_exec);
977
978 curr->se.exec_start = now;
979 cgroup_account_cputime(curr, delta_exec);
980
981 if (!rt_bandwidth_enabled())
982 return;
983
984 for_each_sched_rt_entity(rt_se) {
985 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
986
987 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
988 raw_spin_lock(&rt_rq->rt_runtime_lock);
989 rt_rq->rt_time += delta_exec;
990 if (sched_rt_runtime_exceeded(rt_rq))
991 resched_curr(rq);
992 raw_spin_unlock(&rt_rq->rt_runtime_lock);
993 }
994 }
995}
996
997static void
998dequeue_top_rt_rq(struct rt_rq *rt_rq)
999{
1000 struct rq *rq = rq_of_rt_rq(rt_rq);
1001
1002 BUG_ON(&rq->rt != rt_rq);
1003
1004 if (!rt_rq->rt_queued)
1005 return;
1006
1007 BUG_ON(!rq->nr_running);
1008
1009 sub_nr_running(rq, rt_rq->rt_nr_running);
1010 rt_rq->rt_queued = 0;
1011
1012}
1013
1014static void
1015enqueue_top_rt_rq(struct rt_rq *rt_rq)
1016{
1017 struct rq *rq = rq_of_rt_rq(rt_rq);
1018
1019 BUG_ON(&rq->rt != rt_rq);
1020
1021 if (rt_rq->rt_queued)
1022 return;
1023
1024 if (rt_rq_throttled(rt_rq))
1025 return;
1026
1027 if (rt_rq->rt_nr_running) {
1028 add_nr_running(rq, rt_rq->rt_nr_running);
1029 rt_rq->rt_queued = 1;
1030 }
1031
1032 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1033 cpufreq_update_util(rq, 0);
1034}
1035
1036#if defined CONFIG_SMP
1037
1038static void
1039inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1040{
1041 struct rq *rq = rq_of_rt_rq(rt_rq);
1042
1043#ifdef CONFIG_RT_GROUP_SCHED
1044 /*
1045 * Change rq's cpupri only if rt_rq is the top queue.
1046 */
1047 if (&rq->rt != rt_rq)
1048 return;
1049#endif
1050 if (rq->online && prio < prev_prio)
1051 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1052}
1053
1054static void
1055dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1056{
1057 struct rq *rq = rq_of_rt_rq(rt_rq);
1058
1059#ifdef CONFIG_RT_GROUP_SCHED
1060 /*
1061 * Change rq's cpupri only if rt_rq is the top queue.
1062 */
1063 if (&rq->rt != rt_rq)
1064 return;
1065#endif
1066 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1067 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1068}
1069
1070#else /* CONFIG_SMP */
1071
1072static inline
1073void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1074static inline
1075void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1076
1077#endif /* CONFIG_SMP */
1078
1079#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1080static void
1081inc_rt_prio(struct rt_rq *rt_rq, int prio)
1082{
1083 int prev_prio = rt_rq->highest_prio.curr;
1084
1085 if (prio < prev_prio)
1086 rt_rq->highest_prio.curr = prio;
1087
1088 inc_rt_prio_smp(rt_rq, prio, prev_prio);
1089}
1090
1091static void
1092dec_rt_prio(struct rt_rq *rt_rq, int prio)
1093{
1094 int prev_prio = rt_rq->highest_prio.curr;
1095
1096 if (rt_rq->rt_nr_running) {
1097
1098 WARN_ON(prio < prev_prio);
1099
1100 /*
1101 * This may have been our highest task, and therefore
1102 * we may have some recomputation to do
1103 */
1104 if (prio == prev_prio) {
1105 struct rt_prio_array *array = &rt_rq->active;
1106
1107 rt_rq->highest_prio.curr =
1108 sched_find_first_bit(array->bitmap);
1109 }
1110
1111 } else
1112 rt_rq->highest_prio.curr = MAX_RT_PRIO;
1113
1114 dec_rt_prio_smp(rt_rq, prio, prev_prio);
1115}
1116
1117#else
1118
1119static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1120static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1121
1122#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1123
1124#ifdef CONFIG_RT_GROUP_SCHED
1125
1126static void
1127inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1128{
1129 if (rt_se_boosted(rt_se))
1130 rt_rq->rt_nr_boosted++;
1131
1132 if (rt_rq->tg)
1133 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1134}
1135
1136static void
1137dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1138{
1139 if (rt_se_boosted(rt_se))
1140 rt_rq->rt_nr_boosted--;
1141
1142 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1143}
1144
1145#else /* CONFIG_RT_GROUP_SCHED */
1146
1147static void
1148inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1149{
1150 start_rt_bandwidth(&def_rt_bandwidth);
1151}
1152
1153static inline
1154void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1155
1156#endif /* CONFIG_RT_GROUP_SCHED */
1157
1158static inline
1159unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1160{
1161 struct rt_rq *group_rq = group_rt_rq(rt_se);
1162
1163 if (group_rq)
1164 return group_rq->rt_nr_running;
1165 else
1166 return 1;
1167}
1168
1169static inline
1170unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1171{
1172 struct rt_rq *group_rq = group_rt_rq(rt_se);
1173 struct task_struct *tsk;
1174
1175 if (group_rq)
1176 return group_rq->rr_nr_running;
1177
1178 tsk = rt_task_of(rt_se);
1179
1180 return (tsk->policy == SCHED_RR) ? 1 : 0;
1181}
1182
1183static inline
1184void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1185{
1186 int prio = rt_se_prio(rt_se);
1187
1188 WARN_ON(!rt_prio(prio));
1189 rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1190 rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1191
1192 inc_rt_prio(rt_rq, prio);
1193 inc_rt_migration(rt_se, rt_rq);
1194 inc_rt_group(rt_se, rt_rq);
1195}
1196
1197static inline
1198void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1199{
1200 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1201 WARN_ON(!rt_rq->rt_nr_running);
1202 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1203 rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1204
1205 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1206 dec_rt_migration(rt_se, rt_rq);
1207 dec_rt_group(rt_se, rt_rq);
1208}
1209
1210/*
1211 * Change rt_se->run_list location unless SAVE && !MOVE
1212 *
1213 * assumes ENQUEUE/DEQUEUE flags match
1214 */
1215static inline bool move_entity(unsigned int flags)
1216{
1217 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1218 return false;
1219
1220 return true;
1221}
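/*
 * In other words: DEQUEUE_SAVE without DEQUEUE_MOVE means the entity is
 * only being taken off temporarily and should keep its spot, so the
 * run_list position is left untouched; every other flag combination is
 * a real move and the entity gets (re)linked at the head or tail.
 */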
1222
1223static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1224{
1225 list_del_init(&rt_se->run_list);
1226
1227 if (list_empty(array->queue + rt_se_prio(rt_se)))
1228 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1229
1230 rt_se->on_list = 0;
1231}
1232
1233static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1234{
1235 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1236 struct rt_prio_array *array = &rt_rq->active;
1237 struct rt_rq *group_rq = group_rt_rq(rt_se);
1238 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1239
1240 /*
1241 * Don't enqueue the group if it's throttled, or when empty.
1242 * The latter is a consequence of the former when a child group
1243 * gets throttled and the current group doesn't have any other
1244 * active members.
1245 */
1246 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1247 if (rt_se->on_list)
1248 __delist_rt_entity(rt_se, array);
1249 return;
1250 }
1251
1252 if (move_entity(flags)) {
1253 WARN_ON_ONCE(rt_se->on_list);
1254 if (flags & ENQUEUE_HEAD)
1255 list_add(&rt_se->run_list, queue);
1256 else
1257 list_add_tail(&rt_se->run_list, queue);
1258
1259 __set_bit(rt_se_prio(rt_se), array->bitmap);
1260 rt_se->on_list = 1;
1261 }
1262 rt_se->on_rq = 1;
1263
1264 inc_rt_tasks(rt_se, rt_rq);
1265}
1266
1267static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1268{
1269 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1270 struct rt_prio_array *array = &rt_rq->active;
1271
1272 if (move_entity(flags)) {
1273 WARN_ON_ONCE(!rt_se->on_list);
1274 __delist_rt_entity(rt_se, array);
1275 }
1276 rt_se->on_rq = 0;
1277
1278 dec_rt_tasks(rt_se, rt_rq);
1279}
1280
1281/*
1282 * Because the prio of an upper entry depends on the lower
1283 * entries, we must remove entries top - down.
1284 */
1285static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1286{
1287 struct sched_rt_entity *back = NULL;
1288
1289 for_each_sched_rt_entity(rt_se) {
1290 rt_se->back = back;
1291 back = rt_se;
1292 }
1293
1294 dequeue_top_rt_rq(rt_rq_of_se(back));
1295
1296 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1297 if (on_rt_rq(rt_se))
1298 __dequeue_rt_entity(rt_se, flags);
1299 }
1300}
1301
1302static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1303{
1304 struct rq *rq = rq_of_rt_se(rt_se);
1305
1306 dequeue_rt_stack(rt_se, flags);
1307 for_each_sched_rt_entity(rt_se)
1308 __enqueue_rt_entity(rt_se, flags);
1309 enqueue_top_rt_rq(&rq->rt);
1310}
1311
1312static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1313{
1314 struct rq *rq = rq_of_rt_se(rt_se);
1315
1316 dequeue_rt_stack(rt_se, flags);
1317
1318 for_each_sched_rt_entity(rt_se) {
1319 struct rt_rq *rt_rq = group_rt_rq(rt_se);
1320
1321 if (rt_rq && rt_rq->rt_nr_running)
1322 __enqueue_rt_entity(rt_se, flags);
1323 }
1324 enqueue_top_rt_rq(&rq->rt);
1325}
1326
1327/*
1328 * Adding/removing a task to/from a priority array:
1329 */
1330static void
1331enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1332{
1333 struct sched_rt_entity *rt_se = &p->rt;
1334
1335 if (flags & ENQUEUE_WAKEUP)
1336 rt_se->timeout = 0;
1337
1338 enqueue_rt_entity(rt_se, flags);
1339
1340 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1341 enqueue_pushable_task(rq, p);
1342}
1343
1344static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1345{
1346 struct sched_rt_entity *rt_se = &p->rt;
1347
1348 update_curr_rt(rq);
1349 dequeue_rt_entity(rt_se, flags);
1350
1351 dequeue_pushable_task(rq, p);
1352}
1353
1354/*
1355 * Put task to the head or the end of the run list without the overhead of
1356 * dequeue followed by enqueue.
1357 */
1358static void
1359requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1360{
1361 if (on_rt_rq(rt_se)) {
1362 struct rt_prio_array *array = &rt_rq->active;
1363 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1364
1365 if (head)
1366 list_move(&rt_se->run_list, queue);
1367 else
1368 list_move_tail(&rt_se->run_list, queue);
1369 }
1370}
1371
1372static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1373{
1374 struct sched_rt_entity *rt_se = &p->rt;
1375 struct rt_rq *rt_rq;
1376
1377 for_each_sched_rt_entity(rt_se) {
1378 rt_rq = rt_rq_of_se(rt_se);
1379 requeue_rt_entity(rt_rq, rt_se, head);
1380 }
1381}
1382
1383static void yield_task_rt(struct rq *rq)
1384{
1385 requeue_task_rt(rq, rq->curr, 0);
1386}
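/*
 * Note: sched_yield() for an RT task therefore just requeues it at the
 * tail of its own priority list.  A lone SCHED_FIFO task at a given
 * priority effectively yields to nobody; it is picked again right away.
 */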
1387
1388#ifdef CONFIG_SMP
1389static int find_lowest_rq(struct task_struct *task);
1390
1391static int
1392select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1393{
1394 struct task_struct *curr;
1395 struct rq *rq;
1396
1397 /* For anything but wake ups, just return the task_cpu */
1398 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1399 goto out;
1400
1401 rq = cpu_rq(cpu);
1402
1403 rcu_read_lock();
1404 curr = READ_ONCE(rq->curr); /* unlocked access */
1405
1406 /*
1407 * If the current task on @p's runqueue is an RT task, then
1408 * try to see if we can wake this RT task up on another
1409 * runqueue. Otherwise simply start this RT task
1410 * on its current runqueue.
1411 *
1412 * We want to avoid overloading runqueues. If the woken
1413 * task is a higher priority, then it will stay on this CPU
1414 * and the lower prio task should be moved to another CPU.
1415 * Even though this will probably make the lower prio task
1416 * lose its cache, we do not want to bounce a higher priority task
1417 * around just because it gave up its CPU, perhaps for a
1418 * lock?
1419 *
1420 * For equal prio tasks, we just let the scheduler sort it out.
1421 *
1422 * Otherwise, just let it ride on the affined RQ and the
1423 * post-schedule router will push the preempted task away
1424 *
1425 * This test is optimistic, if we get it wrong the load-balancer
1426 * will have to sort it out.
1427 */
1428 if (curr && unlikely(rt_task(curr)) &&
1429 (curr->nr_cpus_allowed < 2 ||
1430 curr->prio <= p->prio)) {
1431 int target = find_lowest_rq(p);
1432
1433 /*
1434 * Don't bother moving it if the destination CPU is
1435 * not running a lower priority task.
1436 */
1437 if (target != -1 &&
1438 p->prio < cpu_rq(target)->rt.highest_prio.curr)
1439 cpu = target;
1440 }
1441 rcu_read_unlock();
1442
1443out:
1444 return cpu;
1445}
1446
1447static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1448{
1449 /*
1450 * Current can't be migrated, useless to reschedule,
1451 * let's hope p can move out.
1452 */
1453 if (rq->curr->nr_cpus_allowed == 1 ||
1454 !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1455 return;
1456
1457 /*
1458 * p is migratable, so let's not schedule it and
1459 * see if it is pushed or pulled somewhere else.
1460 */
1461 if (p->nr_cpus_allowed != 1
1462 && cpupri_find(&rq->rd->cpupri, p, NULL))
1463 return;
1464
1465 /*
1466 * There appear to be other CPUs that can accept
1467 * the current task but none can run 'p', so let's reschedule
1468 * to try and push the current task away:
1469 */
1470 requeue_task_rt(rq, p, 1);
1471 resched_curr(rq);
1472}
1473
1474static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1475{
1476 if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
1477 /*
1478 * This is OK, because current is on_cpu, which avoids it being
1479 * picked for load-balance and preemption/IRQs are still
1480 * disabled avoiding further scheduler activity on it and we've
1481 * not yet started the picking loop.
1482 */
1483 rq_unpin_lock(rq, rf);
1484 pull_rt_task(rq);
1485 rq_repin_lock(rq, rf);
1486 }
1487
1488 return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
1489}
1490#endif /* CONFIG_SMP */
1491
1492/*
1493 * Preempt the current task with a newly woken task if needed:
1494 */
1495static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1496{
1497 if (p->prio < rq->curr->prio) {
1498 resched_curr(rq);
1499 return;
1500 }
1501
1502#ifdef CONFIG_SMP
1503 /*
1504 * If:
1505 *
1506 * - the newly woken task is of equal priority to the current task
1507 * - the newly woken task is non-migratable while current is migratable
1508 * - current will be preempted on the next reschedule
1509 *
1510 * we should check to see if current can readily move to a different
1511 * cpu. If so, we will reschedule to allow the push logic to try
1512 * to move current somewhere else, making room for our non-migratable
1513 * task.
1514 */
1515 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1516 check_preempt_equal_prio(rq, p);
1517#endif
1518}
1519
1520static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
1521{
1522 p->se.exec_start = rq_clock_task(rq);
1523
1524 /* The running task is never eligible for pushing */
1525 dequeue_pushable_task(rq, p);
1526
1527 if (!first)
1528 return;
1529
1530 /*
1531 * If prev task was rt, put_prev_task() has already updated the
1532 * utilization. We only care about the case where we start to schedule
1533 * an RT task.
1534 */
1535 if (rq->curr->sched_class != &rt_sched_class)
1536 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
1537
1538 rt_queue_push_tasks(rq);
1539}
1540
1541static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1542 struct rt_rq *rt_rq)
1543{
1544 struct rt_prio_array *array = &rt_rq->active;
1545 struct sched_rt_entity *next = NULL;
1546 struct list_head *queue;
1547 int idx;
1548
1549 idx = sched_find_first_bit(array->bitmap);
1550 BUG_ON(idx >= MAX_RT_PRIO);
1551
1552 queue = array->queue + idx;
1553 next = list_entry(queue->next, struct sched_rt_entity, run_list);
1554
1555 return next;
1556}
1557
1558static struct task_struct *_pick_next_task_rt(struct rq *rq)
1559{
1560 struct sched_rt_entity *rt_se;
1561 struct rt_rq *rt_rq = &rq->rt;
1562
1563 do {
1564 rt_se = pick_next_rt_entity(rq, rt_rq);
1565 BUG_ON(!rt_se);
1566 rt_rq = group_rt_rq(rt_se);
1567 } while (rt_rq);
1568
1569 return rt_task_of(rt_se);
1570}
1571
1572static struct task_struct *
1573pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1574{
1575 struct task_struct *p;
1576
1577 WARN_ON_ONCE(prev || rf);
1578
1579 if (!sched_rt_runnable(rq))
1580 return NULL;
1581
1582 p = _pick_next_task_rt(rq);
1583 set_next_task_rt(rq, p, true);
1584 return p;
1585}
1586
1587static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1588{
1589 update_curr_rt(rq);
1590
1591 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
1592
1593 /*
1594 * The previous task needs to be made eligible for pushing
1595 * if it is still active
1596 */
1597 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1598 enqueue_pushable_task(rq, p);
1599}
1600
1601#ifdef CONFIG_SMP
1602
1603/* Only try algorithms three times */
1604#define RT_MAX_TRIES 3
1605
1606static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1607{
1608 if (!task_running(rq, p) &&
1609 cpumask_test_cpu(cpu, p->cpus_ptr))
1610 return 1;
1611
1612 return 0;
1613}
1614
1615/*
1616 * Return the highest-priority pushable task on this rq that can run on
1617 * the given CPU, or NULL if there is none.
1618 */
1619static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1620{
1621 struct plist_head *head = &rq->rt.pushable_tasks;
1622 struct task_struct *p;
1623
1624 if (!has_pushable_tasks(rq))
1625 return NULL;
1626
1627 plist_for_each_entry(p, head, pushable_tasks) {
1628 if (pick_rt_task(rq, p, cpu))
1629 return p;
1630 }
1631
1632 return NULL;
1633}
1634
1635static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1636
1637static int find_lowest_rq(struct task_struct *task)
1638{
1639 struct sched_domain *sd;
1640 struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1641 int this_cpu = smp_processor_id();
1642 int cpu = task_cpu(task);
1643
1644 /* Make sure the mask is initialized first */
1645 if (unlikely(!lowest_mask))
1646 return -1;
1647
1648 if (task->nr_cpus_allowed == 1)
1649 return -1; /* No other targets possible */
1650
1651 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1652 return -1; /* No targets found */
1653
1654 /*
1655 * At this point we have built a mask of CPUs representing the
1656 * lowest priority tasks in the system. Now we want to elect
1657 * the best one based on our affinity and topology.
1658 *
1659 * We prioritize the last CPU that the task executed on since
1660 * it is most likely cache-hot in that location.
1661 */
1662 if (cpumask_test_cpu(cpu, lowest_mask))
1663 return cpu;
1664
1665 /*
1666 * Otherwise, we consult the sched_domains span maps to figure
1667 * out which CPU is logically closest to our hot cache data.
1668 */
1669 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1670 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1671
1672 rcu_read_lock();
1673 for_each_domain(cpu, sd) {
1674 if (sd->flags & SD_WAKE_AFFINE) {
1675 int best_cpu;
1676
1677 /*
1678 * "this_cpu" is cheaper to preempt than a
1679 * remote processor.
1680 */
1681 if (this_cpu != -1 &&
1682 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1683 rcu_read_unlock();
1684 return this_cpu;
1685 }
1686
1687 best_cpu = cpumask_first_and(lowest_mask,
1688 sched_domain_span(sd));
1689 if (best_cpu < nr_cpu_ids) {
1690 rcu_read_unlock();
1691 return best_cpu;
1692 }
1693 }
1694 }
1695 rcu_read_unlock();
1696
1697 /*
1698 * And finally, if there were no matches within the domains
1699 * just give the caller *something* to work with from the compatible
1700 * locations.
1701 */
1702 if (this_cpu != -1)
1703 return this_cpu;
1704
1705 cpu = cpumask_any(lowest_mask);
1706 if (cpu < nr_cpu_ids)
1707 return cpu;
1708
1709 return -1;
1710}
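/*
 * Selection order, summarised: 1) the task's current CPU if it is in
 * the lowest-priority mask (cache hot), 2) this_cpu if it shares a
 * SD_WAKE_AFFINE domain with that CPU, 3) the first mask CPU inside
 * such a domain, 4) this_cpu as a fallback, 5) any CPU from the mask,
 * otherwise -1.
 */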
1711
1712/* Will lock the rq it finds */
1713static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1714{
1715 struct rq *lowest_rq = NULL;
1716 int tries;
1717 int cpu;
1718
1719 for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1720 cpu = find_lowest_rq(task);
1721
1722 if ((cpu == -1) || (cpu == rq->cpu))
1723 break;
1724
1725 lowest_rq = cpu_rq(cpu);
1726
1727 if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1728 /*
1729 * Target rq has tasks of equal or higher priority,
1730 * retrying does not release any lock and is unlikely
1731 * to yield a different result.
1732 */
1733 lowest_rq = NULL;
1734 break;
1735 }
1736
1737 /* if the prio of this runqueue changed, try again */
1738 if (double_lock_balance(rq, lowest_rq)) {
1739 /*
1740 * We had to unlock the run queue. In
1741 * the mean time, task could have
1742 * migrated already or had its affinity changed.
1743 * Also make sure that it wasn't scheduled on its rq.
1744 */
1745 if (unlikely(task_rq(task) != rq ||
1746 !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
1747 task_running(rq, task) ||
1748 !rt_task(task) ||
1749 !task_on_rq_queued(task))) {
1750
1751 double_unlock_balance(rq, lowest_rq);
1752 lowest_rq = NULL;
1753 break;
1754 }
1755 }
1756
1757 /* If this rq is still suitable use it. */
1758 if (lowest_rq->rt.highest_prio.curr > task->prio)
1759 break;
1760
1761 /* try again */
1762 double_unlock_balance(rq, lowest_rq);
1763 lowest_rq = NULL;
1764 }
1765
1766 return lowest_rq;
1767}
1768
1769static struct task_struct *pick_next_pushable_task(struct rq *rq)
1770{
1771 struct task_struct *p;
1772
1773 if (!has_pushable_tasks(rq))
1774 return NULL;
1775
1776 p = plist_first_entry(&rq->rt.pushable_tasks,
1777 struct task_struct, pushable_tasks);
1778
1779 BUG_ON(rq->cpu != task_cpu(p));
1780 BUG_ON(task_current(rq, p));
1781 BUG_ON(p->nr_cpus_allowed <= 1);
1782
1783 BUG_ON(!task_on_rq_queued(p));
1784 BUG_ON(!rt_task(p));
1785
1786 return p;
1787}
1788
1789/*
1790 * If the current CPU has more than one RT task, see if the non
1791 * running task can migrate over to a CPU that is running a task
1792 * of lesser priority.
1793 */
1794static int push_rt_task(struct rq *rq)
1795{
1796 struct task_struct *next_task;
1797 struct rq *lowest_rq;
1798 int ret = 0;
1799
1800 if (!rq->rt.overloaded)
1801 return 0;
1802
1803 next_task = pick_next_pushable_task(rq);
1804 if (!next_task)
1805 return 0;
1806
1807retry:
1808 if (WARN_ON(next_task == rq->curr))
1809 return 0;
1810
1811 /*
1812 * It's possible that the next_task slipped in with a
1813 * higher priority than current. If that's the case
1814 * just reschedule current.
1815 */
1816 if (unlikely(next_task->prio < rq->curr->prio)) {
1817 resched_curr(rq);
1818 return 0;
1819 }
1820
1821 /* We might release rq lock */
1822 get_task_struct(next_task);
1823
1824 /* find_lock_lowest_rq locks the rq if found */
1825 lowest_rq = find_lock_lowest_rq(next_task, rq);
1826 if (!lowest_rq) {
1827 struct task_struct *task;
1828 /*
1829 * find_lock_lowest_rq releases rq->lock
1830 * so it is possible that next_task has migrated.
1831 *
1832 * We need to make sure that the task is still on the same
1833 * run-queue and is also still the next task eligible for
1834 * pushing.
1835 */
1836 task = pick_next_pushable_task(rq);
1837 if (task == next_task) {
1838 /*
1839 * The task hasn't migrated, and is still the next
1840 * eligible task, but we failed to find a run-queue
1841 * to push it to. Do not retry in this case, since
1842 * other CPUs will pull from us when ready.
1843 */
1844 goto out;
1845 }
1846
1847 if (!task)
1848 /* No more tasks, just exit */
1849 goto out;
1850
1851 /*
1852 * Something has shifted, try again.
1853 */
1854 put_task_struct(next_task);
1855 next_task = task;
1856 goto retry;
1857 }
1858
1859 deactivate_task(rq, next_task, 0);
1860 set_task_cpu(next_task, lowest_rq->cpu);
1861 activate_task(lowest_rq, next_task, 0);
1862 ret = 1;
1863
1864 resched_curr(lowest_rq);
1865
1866 double_unlock_balance(rq, lowest_rq);
1867
1868out:
1869 put_task_struct(next_task);
1870
1871 return ret;
1872}
1873
1874static void push_rt_tasks(struct rq *rq)
1875{
1876 /* push_rt_task will return true if it moved an RT */
1877 while (push_rt_task(rq))
1878 ;
1879}
1880
1881#ifdef HAVE_RT_PUSH_IPI
1882
1883/*
1884 * When a high priority task schedules out from a CPU and a lower priority
1885 * task is scheduled in, a check is made to see if there's any RT tasks
1886 * on other CPUs that are waiting to run because a higher priority RT task
1887 * is currently running on its CPU. In this case, the CPU with multiple RT
1888 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
1889 * up that may be able to run one of its non-running queued RT tasks.
1890 *
1891 * All CPUs with overloaded RT tasks need to be notified as there is currently
1892 * no way to know which of these CPUs have the highest priority task waiting
1893 * to run. Instead of trying to take a spinlock on each of these CPUs,
1894 * which has shown to cause large latency when done on machines with many
1895 * CPUs, sending an IPI to the CPUs to have them push off the overloaded
1896 * RT tasks waiting to run.
1897 *
1898 * Just sending an IPI to each of the CPUs is also an issue, as on large
1899 * count CPU machines, this can cause an IPI storm on a CPU, especially
1900 * if it's the only CPU with multiple RT tasks queued, and a large number
1901 * of CPUs scheduling a lower priority task at the same time.
1902 *
1903 * Each root domain has its own irq work function that can iterate over
1904 * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
1905 * tasks must be checked if there's one or many CPUs that are lowering
1906 * their priority, there's a single irq work iterator that will try to
1907 * push off RT tasks that are waiting to run.
1908 *
1909 * When a CPU schedules a lower priority task, it will kick off the
1910 * irq work iterator that will jump to each CPU with overloaded RT tasks.
1911 * As it only takes the first CPU that schedules a lower priority task
1912 * to start the process, the rto_start variable is incremented and if
1913 * the atomic result is one, then that CPU will try to take the rto_lock.
1914 * This prevents high contention on the lock as the process handles all
1915 * CPUs scheduling lower priority tasks.
1916 *
1917 * All CPUs that are scheduling a lower priority task will increment the
1918 * rt_loop_next variable. This will make sure that the irq work iterator
1919 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
1920 * priority task, even if the iterator is in the middle of a scan. Incrementing
1921 * the rt_loop_next will cause the iterator to perform another scan.
1922 *
1923 */
1924static int rto_next_cpu(struct root_domain *rd)
1925{
1926 int next;
1927 int cpu;
1928
1929 /*
1930 * When starting the IPI RT pushing, the rto_cpu is set to -1,
1931 * rt_next_cpu() will simply return the first CPU found in
1932 * the rto_mask.
1933 *
1934 * If rto_next_cpu() is called with rto_cpu is a valid CPU, it
1935 * will return the next CPU found in the rto_mask.
1936 *
1937 * If there are no more CPUs left in the rto_mask, then a check is made
1938 * against rto_loop and rto_loop_next. rto_loop is only updated with
1939 * the rto_lock held, but any CPU may increment the rto_loop_next
1940 * without any locking.
1941 */
1942 for (;;) {
1943
1944 /* When rto_cpu is -1 this acts like cpumask_first() */
1945 cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
1946
1947 rd->rto_cpu = cpu;
1948
1949 if (cpu < nr_cpu_ids)
1950 return cpu;
1951
1952 rd->rto_cpu = -1;
1953
1954 /*
1955 * ACQUIRE ensures we see the @rto_mask changes
1956 * made prior to the @next value observed.
1957 *
1958 * Matches WMB in rt_set_overload().
1959 */
1960 next = atomic_read_acquire(&rd->rto_loop_next);
1961
1962 if (rd->rto_loop == next)
1963 break;
1964
1965 rd->rto_loop = next;
1966 }
1967
1968 return -1;
1969}
1970
1971static inline bool rto_start_trylock(atomic_t *v)
1972{
1973 return !atomic_cmpxchg_acquire(v, 0, 1);
1974}
1975
1976static inline void rto_start_unlock(atomic_t *v)
1977{
1978 atomic_set_release(v, 0);
1979}
1980
1981static void tell_cpu_to_push(struct rq *rq)
1982{
1983 int cpu = -1;
1984
1985 /* Keep the loop going if the IPI is currently active */
1986 atomic_inc(&rq->rd->rto_loop_next);
1987
1988 /* Only one CPU can initiate a loop at a time */
1989 if (!rto_start_trylock(&rq->rd->rto_loop_start))
1990 return;
1991
1992 raw_spin_lock(&rq->rd->rto_lock);
1993
1994 /*
1995 * The rto_cpu is updated under the lock, if it has a valid CPU
1996 * then the IPI is still running and will continue due to the
1997 * update to loop_next, and nothing needs to be done here.
1998 * Otherwise it is finishing up and an ipi needs to be sent.
1999 */
2000 if (rq->rd->rto_cpu < 0)
2001 cpu = rto_next_cpu(rq->rd);
2002
2003 raw_spin_unlock(&rq->rd->rto_lock);
2004
2005 rto_start_unlock(&rq->rd->rto_loop_start);
2006
2007 if (cpu >= 0) {
2008 /* Make sure the rd does not get freed while pushing */
2009 sched_get_rd(rq->rd);
2010 irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2011 }
2012}
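/*
 * Rough sketch of the IPI chain: a CPU drops to a lower-priority task
 * -> tell_cpu_to_push() picks the first overloaded CPU from rto_mask
 * and queues rto_push_work there -> that CPU pushes what it can and
 * forwards the irq_work to the next overloaded CPU, until rto_next_cpu()
 * runs out of CPUs and sees no newer rto_loop_next generation.
 */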
2013
2014/* Called from hardirq context */
2015void rto_push_irq_work_func(struct irq_work *work)
2016{
2017 struct root_domain *rd =
2018 container_of(work, struct root_domain, rto_push_work);
2019 struct rq *rq;
2020 int cpu;
2021
2022 rq = this_rq();
2023
2024 /*
2025 * We do not need to grab the lock to check for has_pushable_tasks.
2026 * When it gets updated, a check is made if a push is possible.
2027 */
2028 if (has_pushable_tasks(rq)) {
2029 raw_spin_lock(&rq->lock);
2030 push_rt_tasks(rq);
2031 raw_spin_unlock(&rq->lock);
2032 }
2033
2034 raw_spin_lock(&rd->rto_lock);
2035
2036 /* Pass the IPI to the next rt overloaded queue */
2037 cpu = rto_next_cpu(rd);
2038
2039 raw_spin_unlock(&rd->rto_lock);
2040
2041 if (cpu < 0) {
2042 sched_put_rd(rd);
2043 return;
2044 }
2045
2046 /* Try the next RT overloaded CPU */
2047 irq_work_queue_on(&rd->rto_push_work, cpu);
2048}
2049#endif /* HAVE_RT_PUSH_IPI */
2050
2051static void pull_rt_task(struct rq *this_rq)
2052{
2053 int this_cpu = this_rq->cpu, cpu;
2054 bool resched = false;
2055 struct task_struct *p;
2056 struct rq *src_rq;
2057 int rt_overload_count = rt_overloaded(this_rq);
2058
2059 if (likely(!rt_overload_count))
2060 return;
2061
2062 /*
2063 * Match the barrier from rt_set_overload(); this guarantees that if we
2064 * see overloaded we must also see the rto_mask bit.
2065 */
2066 smp_rmb();
2067
2068 /* If we are the only overloaded CPU do nothing */
2069 if (rt_overload_count == 1 &&
2070 cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2071 return;
2072
2073#ifdef HAVE_RT_PUSH_IPI
2074 if (sched_feat(RT_PUSH_IPI)) {
2075 tell_cpu_to_push(this_rq);
2076 return;
2077 }
2078#endif
2079
2080 for_each_cpu(cpu, this_rq->rd->rto_mask) {
2081 if (this_cpu == cpu)
2082 continue;
2083
2084 src_rq = cpu_rq(cpu);
2085
2086 /*
2087 * Don't bother taking the src_rq->lock if the next highest
2088 * task is known to be lower-priority than our current task.
2089 * This may look racy, but if this value is about to go
2090 * logically higher, the src_rq will push this task away.
2091 * And if it is going logically lower, we do not care.
2092 */
2093 if (src_rq->rt.highest_prio.next >=
2094 this_rq->rt.highest_prio.curr)
2095 continue;
2096
2097 /*
2098 * We can potentially drop this_rq's lock in
2099 * double_lock_balance, and another CPU could
2100 * alter this_rq
2101 */
2102 double_lock_balance(this_rq, src_rq);
2103
2104 /*
2105 * We can pull only a task that is pushable
2106 * on its rq, and no others.
2107 */
2108 p = pick_highest_pushable_task(src_rq, this_cpu);
2109
2110 /*
2111 * Do we have an RT task that preempts
2112 * the to-be-scheduled task?
2113 */
2114 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2115 WARN_ON(p == src_rq->curr);
2116 WARN_ON(!task_on_rq_queued(p));
2117
2118 /*
2119 * There's a chance that p is higher in priority
2120 * than what's currently running on its CPU.
2121 * This is just that p is waking up and hasn't
2122 * had a chance to schedule. We only pull
2123 * p if it is lower in priority than the
2124 * current task on the run queue.
2125 */
2126 if (p->prio < src_rq->curr->prio)
2127 goto skip;
2128
2129 resched = true;
2130
2131 deactivate_task(src_rq, p, 0);
2132 set_task_cpu(p, this_cpu);
2133 activate_task(this_rq, p, 0);
2134 /*
2135 * We continue with the search, just in
2136 * case there's an even higher prio task
2137 * in another runqueue. (low likelihood
2138 * but possible)
2139 */
2140 }
2141skip:
2142 double_unlock_balance(this_rq, src_rq);
2143 }
2144
2145 if (resched)
2146 resched_curr(this_rq);
2147}
2148
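/*
 * A small worked example of the priority checks above (CPU numbers and
 * priorities are made up; lower value means higher priority): this_rq is
 * about to run a prio-50 RT task (rt.highest_prio.curr == 50) and CPU 3
 * is in rto_mask with rt.highest_prio.next == 40.  Since 40 < 50 we take
 * the locks and call pick_highest_pushable_task().  If the candidate p
 * has prio 40 while src_rq->curr has prio 10, p is genuinely stuck
 * behind a higher priority task and gets pulled.  If instead the
 * candidate had prio 60 it would not preempt us, and if it outranked
 * src_rq->curr (i.e. it is merely waking up and will win that CPU
 * anyway) we skip it.
 */
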
2149/*
2150 * If we are not running and we are not going to reschedule soon, we should
2151 * try to push tasks away now
2152 */
2153static void task_woken_rt(struct rq *rq, struct task_struct *p)
2154{
2155 if (!task_running(rq, p) &&
2156 !test_tsk_need_resched(rq->curr) &&
2157 p->nr_cpus_allowed > 1 &&
2158 (dl_task(rq->curr) || rt_task(rq->curr)) &&
2159 (rq->curr->nr_cpus_allowed < 2 ||
2160 rq->curr->prio <= p->prio))
2161 push_rt_tasks(rq);
2162}
2163
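/*
 * Example of when the condition above fires (values are illustrative):
 * rq->curr is a SCHED_FIFO prio-5 task pinned to this CPU
 * (nr_cpus_allowed == 1) and the freshly woken p is prio 20 with several
 * allowed CPUs.  p cannot preempt curr here and curr cannot move, so we
 * immediately try to push p to another run queue rather than leaving it
 * queued behind curr.
 */
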
2164/* Assumes rq->lock is held */
2165static void rq_online_rt(struct rq *rq)
2166{
2167 if (rq->rt.overloaded)
2168 rt_set_overload(rq);
2169
2170 __enable_runtime(rq);
2171
2172 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2173}
2174
2175/* Assumes rq->lock is held */
2176static void rq_offline_rt(struct rq *rq)
2177{
2178 if (rq->rt.overloaded)
2179 rt_clear_overload(rq);
2180
2181 __disable_runtime(rq);
2182
2183 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2184}
2185
2186/*
2187 * When switching away from the rt queue, we bring ourselves to a position
2188 * where we might want to pull RT tasks from other runqueues.
2189 */
2190static void switched_from_rt(struct rq *rq, struct task_struct *p)
2191{
2192 /*
2193 * If there are other RT tasks then we will reschedule
2194 * and the scheduling of the other RT tasks will handle
2195 * the balancing. But if we are the last RT task
2196 * we may need to handle the pulling of RT tasks
2197 * now.
2198 */
2199 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2200 return;
2201
2202 rt_queue_pull_task(rq);
2203}
2204
2205void __init init_sched_rt_class(void)
2206{
2207 unsigned int i;
2208
2209 for_each_possible_cpu(i) {
2210 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2211 GFP_KERNEL, cpu_to_node(i));
2212 }
2213}
2214#endif /* CONFIG_SMP */
2215
2216/*
2217 * When switching a task to RT, we may overload the runqueue
2218 * with RT tasks. In this case we try to push them off to
2219 * other runqueues.
2220 */
2221static void switched_to_rt(struct rq *rq, struct task_struct *p)
2222{
2223 /*
Olivier Deprez0e641232021-09-23 10:07:05 +02002224 * If we are running, update the avg_rt tracking, as the running time
2225 * will from now on be accounted into the latter.
2226 */
2227 if (task_current(rq, p)) {
2228 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
2229 return;
2230 }
2231
2232 /*
2233 * If we are not running we may need to preempt the current
2234 * running task. If that current running task is also an RT task
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002235 * then see if we can move to another run queue.
2236 */
Olivier Deprez0e641232021-09-23 10:07:05 +02002237 if (task_on_rq_queued(p)) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002238#ifdef CONFIG_SMP
2239 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2240 rt_queue_push_tasks(rq);
2241#endif /* CONFIG_SMP */
2242 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
2243 resched_curr(rq);
2244 }
2245}
2246
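/*
 * A typical path into this callback (a sketch, using the usual syscall
 * interface from userspace): a SCHED_OTHER task changes policy with
 *
 *	struct sched_param sp = { .sched_priority = 10 };
 *	sched_setscheduler(0, SCHED_FIFO, &sp);
 *
 * __sched_setscheduler() dequeues the task, switches it to
 * rt_sched_class, re-enqueues it, and check_class_changed() then calls
 * switched_to_rt().  If the task now outranks rq->curr it triggers
 * resched_curr(); if the run queue is already RT-overloaded and the task
 * may migrate, a deferred push is queued as well.
 */
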
2247/*
2248 * Priority of the task has changed. This may cause
2249 * us to initiate a push or pull.
2250 */
2251static void
2252prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2253{
2254 if (!task_on_rq_queued(p))
2255 return;
2256
2257 if (rq->curr == p) {
2258#ifdef CONFIG_SMP
2259 /*
2260 * If our priority decreases while running, we
2261 * may need to pull tasks to this runqueue.
2262 */
2263 if (oldprio < p->prio)
2264 rt_queue_pull_task(rq);
2265
2266 /*
2267 * If there's a higher priority task waiting to run
2268 * then reschedule.
2269 */
2270 if (p->prio > rq->rt.highest_prio.curr)
2271 resched_curr(rq);
2272#else
2273 /* For UP simply resched on drop of prio */
2274 if (oldprio < p->prio)
2275 resched_curr(rq);
2276#endif /* CONFIG_SMP */
2277 } else {
2278 /*
2279 * This task is not running, but if its
2280 * priority is higher than that of the current
2281 * running task, then reschedule.
2282 */
2283 if (p->prio < rq->curr->prio)
2284 resched_curr(rq);
2285 }
2286}
2287
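/*
 * Concrete example (priorities are kernel-internal, lower is better):
 * a prio-20 RT task holding an rt_mutex is boosted to prio 10 while it
 * runs and deboosted back to 20 when it releases the lock.  The deboost
 * has oldprio (10) < p->prio (20), so we may have lost the right to this
 * CPU and rt_queue_pull_task() looks for higher priority work elsewhere;
 * if a prio-15 task is meanwhile queued locally
 * (p->prio > rq->rt.highest_prio.curr), resched_curr() hands the CPU over.
 */
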
2288#ifdef CONFIG_POSIX_TIMERS
2289static void watchdog(struct rq *rq, struct task_struct *p)
2290{
2291 unsigned long soft, hard;
2292
2293 /* max may change after cur was read, this will be fixed next tick */
2294 soft = task_rlimit(p, RLIMIT_RTTIME);
2295 hard = task_rlimit_max(p, RLIMIT_RTTIME);
2296
2297 if (soft != RLIM_INFINITY) {
2298 unsigned long next;
2299
2300 if (p->rt.watchdog_stamp != jiffies) {
2301 p->rt.timeout++;
2302 p->rt.watchdog_stamp = jiffies;
2303 }
2304
2305 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
David Brazdil0f672f62019-12-10 10:32:29 +00002306 if (p->rt.timeout > next) {
2307 posix_cputimers_rt_watchdog(&p->posix_cputimers,
2308 p->se.sum_exec_runtime);
2309 }
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002310 }
2311}
2312#else
2313static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2314#endif
2315
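/*
 * Worked example of the RLIMIT_RTTIME arithmetic above (HZ and the limit
 * are assumptions for illustration): with a soft limit of 950000 us and
 * HZ == 250, one tick accounts for USEC_PER_SEC/HZ == 4000 us of RT run
 * time, so next = DIV_ROUND_UP(950000, 4000) == 238 ticks.  Once
 * p->rt.timeout exceeds that, the posix CPU-timer machinery is armed at
 * the task's current sum_exec_runtime and the usual RLIMIT_RTTIME
 * signalling (SIGXCPU at the soft limit, SIGKILL at the hard limit)
 * takes over.
 */
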
2316/*
2317 * scheduler tick hitting a task of our scheduling class.
2318 *
2319 * NOTE: This function can be called remotely by the tick offload that
2320 * goes along full dynticks. Therefore no local assumption can be made
2321 * and everything must be accessed through the @rq and @curr passed in
2322 * parameters.
2323 */
2324static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2325{
2326 struct sched_rt_entity *rt_se = &p->rt;
2327
2328 update_curr_rt(rq);
David Brazdil0f672f62019-12-10 10:32:29 +00002329 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002330
2331 watchdog(rq, p);
2332
2333 /*
2334 * RR tasks need a special form of timeslice management.
2335 * FIFO tasks have no timeslices.
2336 */
2337 if (p->policy != SCHED_RR)
2338 return;
2339
2340 if (--p->rt.time_slice)
2341 return;
2342
2343 p->rt.time_slice = sched_rr_timeslice;
2344
2345 /*
2346 * Requeue to the end of the queue if we (and all of our ancestors) are not
2347 * the only element on the queue.
2348 */
2349 for_each_sched_rt_entity(rt_se) {
2350 if (rt_se->run_list.prev != rt_se->run_list.next) {
2351 requeue_task_rt(rq, p, 0);
2352 resched_curr(rq);
2353 return;
2354 }
2355 }
2356}
2357
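/*
 * Round-robin bookkeeping in numbers (defaults assumed): RR_TIMESLICE is
 * 100 ms expressed in jiffies, so with HZ == 1000 a SCHED_RR task starts
 * with p->rt.time_slice == 100.  Every tick spent running decrements it;
 * on the tick that takes it to zero the slice is refilled from
 * sched_rr_timeslice and, if the task (or any ancestor entity) is not
 * alone on its priority list, the task is requeued to the tail and the
 * CPU is rescheduled.  SCHED_FIFO tasks never reach this code path.
 */
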
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002358static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2359{
2360 /*
2361 * Time slice is 0 for SCHED_FIFO tasks
2362 */
2363 if (task->policy == SCHED_RR)
2364 return sched_rr_timeslice;
2365 else
2366 return 0;
2367}
2368
2369const struct sched_class rt_sched_class = {
2370 .next = &fair_sched_class,
2371 .enqueue_task = enqueue_task_rt,
2372 .dequeue_task = dequeue_task_rt,
2373 .yield_task = yield_task_rt,
2374
2375 .check_preempt_curr = check_preempt_curr_rt,
2376
2377 .pick_next_task = pick_next_task_rt,
2378 .put_prev_task = put_prev_task_rt,
David Brazdil0f672f62019-12-10 10:32:29 +00002379 .set_next_task = set_next_task_rt,
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002380
2381#ifdef CONFIG_SMP
David Brazdil0f672f62019-12-10 10:32:29 +00002382 .balance = balance_rt,
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002383 .select_task_rq = select_task_rq_rt,
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002384 .set_cpus_allowed = set_cpus_allowed_common,
2385 .rq_online = rq_online_rt,
2386 .rq_offline = rq_offline_rt,
2387 .task_woken = task_woken_rt,
2388 .switched_from = switched_from_rt,
2389#endif
2390
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002391 .task_tick = task_tick_rt,
2392
2393 .get_rr_interval = get_rr_interval_rt,
2394
2395 .prio_changed = prio_changed_rt,
2396 .switched_to = switched_to_rt,
2397
2398 .update_curr = update_curr_rt,
David Brazdil0f672f62019-12-10 10:32:29 +00002399
2400#ifdef CONFIG_UCLAMP_TASK
2401 .uclamp_enabled = 1,
2402#endif
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002403};
2404
2405#ifdef CONFIG_RT_GROUP_SCHED
2406/*
2407 * Ensure that the real time constraints are schedulable.
2408 */
2409static DEFINE_MUTEX(rt_constraints_mutex);
2410
2411/* Must be called with tasklist_lock held */
2412static inline int tg_has_rt_tasks(struct task_group *tg)
2413{
2414 struct task_struct *g, *p;
2415
2416 /*
2417 * Autogroups do not have RT tasks; see autogroup_create().
2418 */
2419 if (task_group_is_autogroup(tg))
2420 return 0;
2421
2422 for_each_process_thread(g, p) {
2423 if (rt_task(p) && task_group(p) == tg)
2424 return 1;
2425 }
2426
2427 return 0;
2428}
2429
2430struct rt_schedulable_data {
2431 struct task_group *tg;
2432 u64 rt_period;
2433 u64 rt_runtime;
2434};
2435
2436static int tg_rt_schedulable(struct task_group *tg, void *data)
2437{
2438 struct rt_schedulable_data *d = data;
2439 struct task_group *child;
2440 unsigned long total, sum = 0;
2441 u64 period, runtime;
2442
2443 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2444 runtime = tg->rt_bandwidth.rt_runtime;
2445
2446 if (tg == d->tg) {
2447 period = d->rt_period;
2448 runtime = d->rt_runtime;
2449 }
2450
2451 /*
2452 * Cannot have more runtime than the period.
2453 */
2454 if (runtime > period && runtime != RUNTIME_INF)
2455 return -EINVAL;
2456
2457 /*
2458 * Ensure we don't starve existing RT tasks.
2459 */
2460 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
2461 return -EBUSY;
2462
2463 total = to_ratio(period, runtime);
2464
2465 /*
2466 * Nobody can have more than the global setting allows.
2467 */
2468 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2469 return -EINVAL;
2470
2471 /*
2472 * The sum of our children's runtime should not exceed our own.
2473 */
2474 list_for_each_entry_rcu(child, &tg->children, siblings) {
2475 period = ktime_to_ns(child->rt_bandwidth.rt_period);
2476 runtime = child->rt_bandwidth.rt_runtime;
2477
2478 if (child == d->tg) {
2479 period = d->rt_period;
2480 runtime = d->rt_runtime;
2481 }
2482
2483 sum += to_ratio(period, runtime);
2484 }
2485
2486 if (sum > total)
2487 return -EINVAL;
2488
2489 return 0;
2490}
2491
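/*
 * Numeric sketch of the checks above (group values are made up;
 * to_ratio() scales runtime/period by 2^BW_SHIFT): with the default
 * global settings of a 1 s period and 950 ms runtime the global cap is
 * to_ratio(1000000us, 950000us), i.e. 95% of a CPU.  A group asking for
 * a 100 ms runtime in a 500 ms period gets total == 20%; that fits under
 * the 95% cap, but if its two children each request 15% of a CPU their
 * sum (30%) exceeds the parent's 20% and the write fails with -EINVAL.
 */
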
2492static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2493{
2494 int ret;
2495
2496 struct rt_schedulable_data data = {
2497 .tg = tg,
2498 .rt_period = period,
2499 .rt_runtime = runtime,
2500 };
2501
2502 rcu_read_lock();
2503 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2504 rcu_read_unlock();
2505
2506 return ret;
2507}
2508
2509static int tg_set_rt_bandwidth(struct task_group *tg,
2510 u64 rt_period, u64 rt_runtime)
2511{
2512 int i, err = 0;
2513
2514 /*
2515 * Disallowing the root group RT runtime is BAD; it would prevent the
2516 * kernel from creating (and/or operating) RT threads.
2517 */
2518 if (tg == &root_task_group && rt_runtime == 0)
2519 return -EINVAL;
2520
2521 /* A zero period does not make any sense. */
2522 if (rt_period == 0)
2523 return -EINVAL;
2524
Olivier Deprez0e641232021-09-23 10:07:05 +02002525 /*
2526 * Bound the quota to defend against overflow during the bandwidth shift.
2527 */
2528 if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
2529 return -EINVAL;
2530
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002531 mutex_lock(&rt_constraints_mutex);
2532 read_lock(&tasklist_lock);
2533 err = __rt_schedulable(tg, rt_period, rt_runtime);
2534 if (err)
2535 goto unlock;
2536
2537 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2538 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2539 tg->rt_bandwidth.rt_runtime = rt_runtime;
2540
2541 for_each_possible_cpu(i) {
2542 struct rt_rq *rt_rq = tg->rt_rq[i];
2543
2544 raw_spin_lock(&rt_rq->rt_runtime_lock);
2545 rt_rq->rt_runtime = rt_runtime;
2546 raw_spin_unlock(&rt_rq->rt_runtime_lock);
2547 }
2548 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2549unlock:
2550 read_unlock(&tasklist_lock);
2551 mutex_unlock(&rt_constraints_mutex);
2552
2553 return err;
2554}
2555
2556int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2557{
2558 u64 rt_runtime, rt_period;
2559
2560 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2561 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2562 if (rt_runtime_us < 0)
2563 rt_runtime = RUNTIME_INF;
David Brazdil0f672f62019-12-10 10:32:29 +00002564 else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
2565 return -EINVAL;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002566
2567 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2568}
2569
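/*
 * How these setters are typically reached (paths assume the cgroup-v1
 * cpu controller with CONFIG_RT_GROUP_SCHED; "mygroup" is an example):
 *
 *	echo 1000000 > /sys/fs/cgroup/cpu/mygroup/cpu.rt_period_us
 *	echo  300000 > /sys/fs/cgroup/cpu/mygroup/cpu.rt_runtime_us
 *
 * ends up in sched_group_set_rt_period() below and
 * sched_group_set_rt_runtime() above, allowing the group's RT tasks at
 * most 300 ms of RT time per 1 s period on each CPU.  Writing -1 to
 * cpu.rt_runtime_us maps to RUNTIME_INF (no group-local limit), while a
 * value whose nanosecond conversion would overflow u64 is rejected with
 * -EINVAL.
 */
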
2570long sched_group_rt_runtime(struct task_group *tg)
2571{
2572 u64 rt_runtime_us;
2573
2574 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
2575 return -1;
2576
2577 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
2578 do_div(rt_runtime_us, NSEC_PER_USEC);
2579 return rt_runtime_us;
2580}
2581
2582int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2583{
2584 u64 rt_runtime, rt_period;
2585
David Brazdil0f672f62019-12-10 10:32:29 +00002586 if (rt_period_us > U64_MAX / NSEC_PER_USEC)
2587 return -EINVAL;
2588
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002589 rt_period = rt_period_us * NSEC_PER_USEC;
2590 rt_runtime = tg->rt_bandwidth.rt_runtime;
2591
2592 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2593}
2594
2595long sched_group_rt_period(struct task_group *tg)
2596{
2597 u64 rt_period_us;
2598
2599 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
2600 do_div(rt_period_us, NSEC_PER_USEC);
2601 return rt_period_us;
2602}
2603
2604static int sched_rt_global_constraints(void)
2605{
2606 int ret = 0;
2607
2608 mutex_lock(&rt_constraints_mutex);
2609 read_lock(&tasklist_lock);
2610 ret = __rt_schedulable(NULL, 0, 0);
2611 read_unlock(&tasklist_lock);
2612 mutex_unlock(&rt_constraints_mutex);
2613
2614 return ret;
2615}
2616
2617int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
2618{
2619 /* Don't accept realtime tasks when there is no way for them to run */
2620 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
2621 return 0;
2622
2623 return 1;
2624}
2625
2626#else /* !CONFIG_RT_GROUP_SCHED */
2627static int sched_rt_global_constraints(void)
2628{
2629 unsigned long flags;
2630 int i;
2631
2632 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
2633 for_each_possible_cpu(i) {
2634 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
2635
2636 raw_spin_lock(&rt_rq->rt_runtime_lock);
2637 rt_rq->rt_runtime = global_rt_runtime();
2638 raw_spin_unlock(&rt_rq->rt_runtime_lock);
2639 }
2640 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
2641
2642 return 0;
2643}
2644#endif /* CONFIG_RT_GROUP_SCHED */
2645
2646static int sched_rt_global_validate(void)
2647{
2648 if (sysctl_sched_rt_period <= 0)
2649 return -EINVAL;
2650
2651 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
Olivier Deprez0e641232021-09-23 10:07:05 +02002652 ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
2653 ((u64)sysctl_sched_rt_runtime *
2654 NSEC_PER_USEC > max_rt_runtime)))
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002655 return -EINVAL;
2656
2657 return 0;
2658}
2659
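/*
 * Examples of what the validation above accepts and rejects (values in
 * microseconds, as written through the sysctls): the defaults
 * sched_rt_period_us == 1000000 and sched_rt_runtime_us == 950000 pass;
 * a runtime of 2000000 with a period of 1000000 fails because runtime
 * exceeds the period; a runtime of -1 means RUNTIME_INF and skips the
 * check entirely (RT throttling disabled); and a runtime whose
 * nanosecond value would exceed max_rt_runtime is refused to keep the
 * bandwidth shift from overflowing.
 */
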
2660static void sched_rt_do_global(void)
2661{
2662 def_rt_bandwidth.rt_runtime = global_rt_runtime();
2663 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
2664}
2665
2666int sched_rt_handler(struct ctl_table *table, int write,
2667 void __user *buffer, size_t *lenp,
2668 loff_t *ppos)
2669{
2670 int old_period, old_runtime;
2671 static DEFINE_MUTEX(mutex);
2672 int ret;
2673
2674 mutex_lock(&mutex);
2675 old_period = sysctl_sched_rt_period;
2676 old_runtime = sysctl_sched_rt_runtime;
2677
2678 ret = proc_dointvec(table, write, buffer, lenp, ppos);
2679
2680 if (!ret && write) {
2681 ret = sched_rt_global_validate();
2682 if (ret)
2683 goto undo;
2684
2685 ret = sched_dl_global_validate();
2686 if (ret)
2687 goto undo;
2688
2689 ret = sched_rt_global_constraints();
2690 if (ret)
2691 goto undo;
2692
2693 sched_rt_do_global();
2694 sched_dl_do_global();
2695 }
2696 if (0) {
2697undo:
2698 sysctl_sched_rt_period = old_period;
2699 sysctl_sched_rt_runtime = old_runtime;
2700 }
2701 mutex_unlock(&mutex);
2702
2703 return ret;
2704}
2705
2706int sched_rr_handler(struct ctl_table *table, int write,
2707 void __user *buffer, size_t *lenp,
2708 loff_t *ppos)
2709{
2710 int ret;
2711 static DEFINE_MUTEX(mutex);
2712
2713 mutex_lock(&mutex);
2714 ret = proc_dointvec(table, write, buffer, lenp, ppos);
2715 /*
2716 * Make sure that internally we keep jiffies.
2717 * Also, writing zero resets the timeslice to the default:
2718 */
2719 if (!ret && write) {
2720 sched_rr_timeslice =
2721 sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
2722 msecs_to_jiffies(sysctl_sched_rr_timeslice);
2723 }
2724 mutex_unlock(&mutex);
2725
2726 return ret;
2727}
2728
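/*
 * Example of the conversion above (sysctl path as exposed under procfs):
 *
 *	echo 30 > /proc/sys/kernel/sched_rr_timeslice_ms
 *
 * stores 30 in sysctl_sched_rr_timeslice and sets sched_rr_timeslice to
 * msecs_to_jiffies(30) (30 ticks with HZ == 1000, 3 with HZ == 100),
 * while writing zero or a negative value falls back to RR_TIMESLICE,
 * the built-in 100 ms default.
 */
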
2729#ifdef CONFIG_SCHED_DEBUG
2730void print_rt_stats(struct seq_file *m, int cpu)
2731{
2732 rt_rq_iter_t iter;
2733 struct rt_rq *rt_rq;
2734
2735 rcu_read_lock();
2736 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
2737 print_rt_rq(m, cpu, rt_rq);
2738 rcu_read_unlock();
2739}
2740#endif /* CONFIG_SCHED_DEBUG */