1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
4 * policies)
5 */
6#include "sched.h"
7
8#include "pelt.h"
9
10int sched_rr_timeslice = RR_TIMESLICE;
11int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
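/*
 * Worked example, assuming RR_TIMESLICE is the usual 100ms expressed in
 * jiffies: the sysctl value above converts that jiffies-based default back
 * to milliseconds, independent of HZ.
 *
 *	HZ = 1000:  (1000 / 1000) * 100 jiffies = 100 ms
 *	HZ =  250:  (1000 /  250) *  25 jiffies = 100 ms
 */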
12/* More than 4 hours if BW_SHIFT equals 20. */
13static const u64 max_rt_runtime = MAX_BW;
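/*
 * Rough arithmetic behind the "4 hours" note above, assuming MAX_BW is
 * defined as ((1ULL << (64 - BW_SHIFT)) - 1) nanoseconds with BW_SHIFT == 20:
 *
 *	2^(64 - 20) ns = 2^44 ns ~= 17592 s ~= 4.9 hours
 */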
14
15static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
16
17struct rt_bandwidth def_rt_bandwidth;
18
19static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
20{
21 struct rt_bandwidth *rt_b =
22 container_of(timer, struct rt_bandwidth, rt_period_timer);
23 int idle = 0;
24 int overrun;
25
26 raw_spin_lock(&rt_b->rt_runtime_lock);
27 for (;;) {
28 overrun = hrtimer_forward_now(timer, rt_b->rt_period);
29 if (!overrun)
30 break;
31
32 raw_spin_unlock(&rt_b->rt_runtime_lock);
33 idle = do_sched_rt_period_timer(rt_b, overrun);
34 raw_spin_lock(&rt_b->rt_runtime_lock);
35 }
36 if (idle)
37 rt_b->rt_period_active = 0;
38 raw_spin_unlock(&rt_b->rt_runtime_lock);
39
40 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
41}
42
43void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
44{
45 rt_b->rt_period = ns_to_ktime(period);
46 rt_b->rt_runtime = runtime;
47
48 raw_spin_lock_init(&rt_b->rt_runtime_lock);
49
50 hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
51 HRTIMER_MODE_REL_HARD);
52 rt_b->rt_period_timer.function = sched_rt_period_timer;
53}
54
55static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
56{
57 raw_spin_lock(&rt_b->rt_runtime_lock);
58 if (!rt_b->rt_period_active) {
59 rt_b->rt_period_active = 1;
60 /*
61 * SCHED_DEADLINE updates the bandwidth, as a runaway
62 * RT task alongside a DL task could hog a CPU. But DL does
63 * not reset the period. If a deadline task was running
64 * without an RT task running, it can cause RT tasks to
65 * throttle when they start up. Kick the timer right away
66 * to update the period.
67 */
68 hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
69 hrtimer_start_expires(&rt_b->rt_period_timer,
70 HRTIMER_MODE_ABS_PINNED_HARD);
71 }
72 raw_spin_unlock(&rt_b->rt_runtime_lock);
73}
74
75static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
76{
77 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
78 return;
79
80 do_start_rt_bandwidth(rt_b);
81}
82
83void init_rt_rq(struct rt_rq *rt_rq)
84{
85 struct rt_prio_array *array;
86 int i;
87
88 array = &rt_rq->active;
89 for (i = 0; i < MAX_RT_PRIO; i++) {
90 INIT_LIST_HEAD(array->queue + i);
91 __clear_bit(i, array->bitmap);
92 }
93 /* delimiter for bitsearch: */
94 __set_bit(MAX_RT_PRIO, array->bitmap);
95
96#if defined CONFIG_SMP
97 rt_rq->highest_prio.curr = MAX_RT_PRIO;
98 rt_rq->highest_prio.next = MAX_RT_PRIO;
99 rt_rq->rt_nr_migratory = 0;
100 rt_rq->overloaded = 0;
101 plist_head_init(&rt_rq->pushable_tasks);
102#endif /* CONFIG_SMP */
103 /* We start in dequeued state, because no RT tasks are queued */
104 rt_rq->rt_queued = 0;
105
106 rt_rq->rt_time = 0;
107 rt_rq->rt_throttled = 0;
108 rt_rq->rt_runtime = 0;
109 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
110}
111
112#ifdef CONFIG_RT_GROUP_SCHED
113static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
114{
115 hrtimer_cancel(&rt_b->rt_period_timer);
116}
117
118#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
119
120static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
121{
122#ifdef CONFIG_SCHED_DEBUG
123 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
124#endif
125 return container_of(rt_se, struct task_struct, rt);
126}
127
128static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
129{
130 return rt_rq->rq;
131}
132
133static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
134{
135 return rt_se->rt_rq;
136}
137
138static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
139{
140 struct rt_rq *rt_rq = rt_se->rt_rq;
141
142 return rt_rq->rq;
143}
144
145void free_rt_sched_group(struct task_group *tg)
146{
147 int i;
148
149 if (tg->rt_se)
150 destroy_rt_bandwidth(&tg->rt_bandwidth);
151
152 for_each_possible_cpu(i) {
153 if (tg->rt_rq)
154 kfree(tg->rt_rq[i]);
155 if (tg->rt_se)
156 kfree(tg->rt_se[i]);
157 }
158
159 kfree(tg->rt_rq);
160 kfree(tg->rt_se);
161}
162
163void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
164 struct sched_rt_entity *rt_se, int cpu,
165 struct sched_rt_entity *parent)
166{
167 struct rq *rq = cpu_rq(cpu);
168
169 rt_rq->highest_prio.curr = MAX_RT_PRIO;
170 rt_rq->rt_nr_boosted = 0;
171 rt_rq->rq = rq;
172 rt_rq->tg = tg;
173
174 tg->rt_rq[cpu] = rt_rq;
175 tg->rt_se[cpu] = rt_se;
176
177 if (!rt_se)
178 return;
179
180 if (!parent)
181 rt_se->rt_rq = &rq->rt;
182 else
183 rt_se->rt_rq = parent->my_q;
184
185 rt_se->my_q = rt_rq;
186 rt_se->parent = parent;
187 INIT_LIST_HEAD(&rt_se->run_list);
188}
189
190int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
191{
192 struct rt_rq *rt_rq;
193 struct sched_rt_entity *rt_se;
194 int i;
195
196 tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
197 if (!tg->rt_rq)
198 goto err;
199 tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
200 if (!tg->rt_se)
201 goto err;
202
203 init_rt_bandwidth(&tg->rt_bandwidth,
204 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
205
206 for_each_possible_cpu(i) {
207 rt_rq = kzalloc_node(sizeof(struct rt_rq),
208 GFP_KERNEL, cpu_to_node(i));
209 if (!rt_rq)
210 goto err;
211
212 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
213 GFP_KERNEL, cpu_to_node(i));
214 if (!rt_se)
215 goto err_free_rq;
216
217 init_rt_rq(rt_rq);
218 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
219 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
220 }
221
222 return 1;
223
224err_free_rq:
225 kfree(rt_rq);
226err:
227 return 0;
228}
229
230#else /* CONFIG_RT_GROUP_SCHED */
231
232#define rt_entity_is_task(rt_se) (1)
233
234static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
235{
236 return container_of(rt_se, struct task_struct, rt);
237}
238
239static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
240{
241 return container_of(rt_rq, struct rq, rt);
242}
243
244static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
245{
246 struct task_struct *p = rt_task_of(rt_se);
247
248 return task_rq(p);
249}
250
251static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
252{
253 struct rq *rq = rq_of_rt_se(rt_se);
254
255 return &rq->rt;
256}
257
258void free_rt_sched_group(struct task_group *tg) { }
259
260int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
261{
262 return 1;
263}
264#endif /* CONFIG_RT_GROUP_SCHED */
265
266#ifdef CONFIG_SMP
267
268static void pull_rt_task(struct rq *this_rq);
269
270static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
271{
272 /* Try to pull RT tasks here if we lower this rq's prio */
273 return rq->rt.highest_prio.curr > prev->prio;
274}
275
276static inline int rt_overloaded(struct rq *rq)
277{
278 return atomic_read(&rq->rd->rto_count);
279}
280
281static inline void rt_set_overload(struct rq *rq)
282{
283 if (!rq->online)
284 return;
285
286 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
287 /*
288 * Make sure the mask is visible before we set
289 * the overload count. That is checked to determine
290 * if we should look at the mask. It would be a shame
291 * if we looked at the mask, but the mask was not
292 * updated yet.
293 *
294 * Matched by the barrier in pull_rt_task().
295 */
296 smp_wmb();
297 atomic_inc(&rq->rd->rto_count);
298}
299
300static inline void rt_clear_overload(struct rq *rq)
301{
302 if (!rq->online)
303 return;
304
305 /* the order here really doesn't matter */
306 atomic_dec(&rq->rd->rto_count);
307 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
308}
309
310static void update_rt_migration(struct rt_rq *rt_rq)
311{
312 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
313 if (!rt_rq->overloaded) {
314 rt_set_overload(rq_of_rt_rq(rt_rq));
315 rt_rq->overloaded = 1;
316 }
317 } else if (rt_rq->overloaded) {
318 rt_clear_overload(rq_of_rt_rq(rt_rq));
319 rt_rq->overloaded = 0;
320 }
321}
322
323static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
324{
325 struct task_struct *p;
326
327 if (!rt_entity_is_task(rt_se))
328 return;
329
330 p = rt_task_of(rt_se);
331 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
332
333 rt_rq->rt_nr_total++;
334 if (p->nr_cpus_allowed > 1)
335 rt_rq->rt_nr_migratory++;
336
337 update_rt_migration(rt_rq);
338}
339
340static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
341{
342 struct task_struct *p;
343
344 if (!rt_entity_is_task(rt_se))
345 return;
346
347 p = rt_task_of(rt_se);
348 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
349
350 rt_rq->rt_nr_total--;
351 if (p->nr_cpus_allowed > 1)
352 rt_rq->rt_nr_migratory--;
353
354 update_rt_migration(rt_rq);
355}
356
357static inline int has_pushable_tasks(struct rq *rq)
358{
359 return !plist_head_empty(&rq->rt.pushable_tasks);
360}
361
362static DEFINE_PER_CPU(struct callback_head, rt_push_head);
363static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
364
365static void push_rt_tasks(struct rq *);
366static void pull_rt_task(struct rq *);
367
368static inline void rt_queue_push_tasks(struct rq *rq)
369{
370 if (!has_pushable_tasks(rq))
371 return;
372
373 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
374}
375
376static inline void rt_queue_pull_task(struct rq *rq)
377{
378 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
379}
380
381static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
382{
383 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
384 plist_node_init(&p->pushable_tasks, p->prio);
385 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
386
387 /* Update the highest prio pushable task */
388 if (p->prio < rq->rt.highest_prio.next)
389 rq->rt.highest_prio.next = p->prio;
390}
391
392static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
393{
394 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
395
396 /* Update the new highest prio pushable task */
397 if (has_pushable_tasks(rq)) {
398 p = plist_first_entry(&rq->rt.pushable_tasks,
399 struct task_struct, pushable_tasks);
400 rq->rt.highest_prio.next = p->prio;
401 } else
402 rq->rt.highest_prio.next = MAX_RT_PRIO;
403}
404
405#else
406
407static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
408{
409}
410
411static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
412{
413}
414
415static inline
416void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
417{
418}
419
420static inline
421void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
422{
423}
424
425static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
426{
427 return false;
428}
429
430static inline void pull_rt_task(struct rq *this_rq)
431{
432}
433
434static inline void rt_queue_push_tasks(struct rq *rq)
435{
436}
437#endif /* CONFIG_SMP */
438
439static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
440static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
441
442static inline int on_rt_rq(struct sched_rt_entity *rt_se)
443{
444 return rt_se->on_rq;
445}
446
447#ifdef CONFIG_UCLAMP_TASK
448/*
449 * Verify the fitness of task @p to run on @cpu taking into account the uclamp
450 * settings.
451 *
452 * This check is only important for heterogeneous systems where uclamp_min value
453 * is higher than the capacity of a @cpu. For non-heterogeneous systems this
454 * function will always return true.
455 *
456 * The function will return true if the capacity of the @cpu is >= the
457 * uclamp_min and false otherwise.
458 *
459 * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
460 * > uclamp_max.
461 */
462static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
463{
464 unsigned int min_cap;
465 unsigned int max_cap;
466 unsigned int cpu_cap;
467
468 /* Only heterogeneous systems can benefit from this check */
469 if (!static_branch_unlikely(&sched_asym_cpucapacity))
470 return true;
471
472 min_cap = uclamp_eff_value(p, UCLAMP_MIN);
473 max_cap = uclamp_eff_value(p, UCLAMP_MAX);
474
475 cpu_cap = capacity_orig_of(cpu);
476
477 return cpu_cap >= min(min_cap, max_cap);
478}
479#else
480static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
481{
482 return true;
483}
484#endif
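/*
 * Illustrative numbers only (hypothetical big.LITTLE capacities): a little
 * CPU of capacity 512 cannot fit a task whose effective uclamp_min is 768
 * and uclamp_max is 1024, since min(768, 1024) = 768 > 512; a big CPU of
 * capacity 1024 fits it. If uclamp_max were clamped down to 256, then
 * min(768, 256) = 256 <= 512 and the little CPU fits again.
 */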
485
486#ifdef CONFIG_RT_GROUP_SCHED
487
488static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
489{
490 if (!rt_rq->tg)
491 return RUNTIME_INF;
492
493 return rt_rq->rt_runtime;
494}
495
496static inline u64 sched_rt_period(struct rt_rq *rt_rq)
497{
498 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
499}
500
501typedef struct task_group *rt_rq_iter_t;
502
503static inline struct task_group *next_task_group(struct task_group *tg)
504{
505 do {
506 tg = list_entry_rcu(tg->list.next,
507 typeof(struct task_group), list);
508 } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
509
510 if (&tg->list == &task_groups)
511 tg = NULL;
512
513 return tg;
514}
515
516#define for_each_rt_rq(rt_rq, iter, rq) \
517 for (iter = container_of(&task_groups, typeof(*iter), list); \
518 (iter = next_task_group(iter)) && \
519 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
520
521#define for_each_sched_rt_entity(rt_se) \
522 for (; rt_se; rt_se = rt_se->parent)
523
524static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
525{
526 return rt_se->my_q;
527}
528
529static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
530static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
531
532static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
533{
534 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
535 struct rq *rq = rq_of_rt_rq(rt_rq);
536 struct sched_rt_entity *rt_se;
537
538 int cpu = cpu_of(rq);
539
540 rt_se = rt_rq->tg->rt_se[cpu];
541
542 if (rt_rq->rt_nr_running) {
543 if (!rt_se)
544 enqueue_top_rt_rq(rt_rq);
545 else if (!on_rt_rq(rt_se))
546 enqueue_rt_entity(rt_se, 0);
547
548 if (rt_rq->highest_prio.curr < curr->prio)
549 resched_curr(rq);
550 }
551}
552
553static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
554{
555 struct sched_rt_entity *rt_se;
556 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
557
558 rt_se = rt_rq->tg->rt_se[cpu];
559
560 if (!rt_se) {
561 dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
562 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
563 cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
564 }
565 else if (on_rt_rq(rt_se))
566 dequeue_rt_entity(rt_se, 0);
567}
568
569static inline int rt_rq_throttled(struct rt_rq *rt_rq)
570{
571 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
572}
573
574static int rt_se_boosted(struct sched_rt_entity *rt_se)
575{
576 struct rt_rq *rt_rq = group_rt_rq(rt_se);
577 struct task_struct *p;
578
579 if (rt_rq)
580 return !!rt_rq->rt_nr_boosted;
581
582 p = rt_task_of(rt_se);
583 return p->prio != p->normal_prio;
584}
585
586#ifdef CONFIG_SMP
587static inline const struct cpumask *sched_rt_period_mask(void)
588{
589 return this_rq()->rd->span;
590}
591#else
592static inline const struct cpumask *sched_rt_period_mask(void)
593{
594 return cpu_online_mask;
595}
596#endif
597
598static inline
599struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
600{
601 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
602}
603
604static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
605{
606 return &rt_rq->tg->rt_bandwidth;
607}
608
609#else /* !CONFIG_RT_GROUP_SCHED */
610
611static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
612{
613 return rt_rq->rt_runtime;
614}
615
616static inline u64 sched_rt_period(struct rt_rq *rt_rq)
617{
618 return ktime_to_ns(def_rt_bandwidth.rt_period);
619}
620
621typedef struct rt_rq *rt_rq_iter_t;
622
623#define for_each_rt_rq(rt_rq, iter, rq) \
624 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
625
626#define for_each_sched_rt_entity(rt_se) \
627 for (; rt_se; rt_se = NULL)
628
629static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
630{
631 return NULL;
632}
633
634static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
635{
636 struct rq *rq = rq_of_rt_rq(rt_rq);
637
638 if (!rt_rq->rt_nr_running)
639 return;
640
641 enqueue_top_rt_rq(rt_rq);
642 resched_curr(rq);
643}
644
645static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
646{
647 dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
648}
649
650static inline int rt_rq_throttled(struct rt_rq *rt_rq)
651{
652 return rt_rq->rt_throttled;
653}
654
655static inline const struct cpumask *sched_rt_period_mask(void)
656{
657 return cpu_online_mask;
658}
659
660static inline
661struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
662{
663 return &cpu_rq(cpu)->rt;
664}
665
666static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
667{
668 return &def_rt_bandwidth;
669}
670
671#endif /* CONFIG_RT_GROUP_SCHED */
672
673bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
674{
675 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
676
677 return (hrtimer_active(&rt_b->rt_period_timer) ||
678 rt_rq->rt_time < rt_b->rt_runtime);
679}
680
681#ifdef CONFIG_SMP
682/*
683 * We ran out of runtime, see if we can borrow some from our neighbours.
684 */
685static void do_balance_runtime(struct rt_rq *rt_rq)
686{
687 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
688 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
689 int i, weight;
690 u64 rt_period;
691
692 weight = cpumask_weight(rd->span);
693
694 raw_spin_lock(&rt_b->rt_runtime_lock);
695 rt_period = ktime_to_ns(rt_b->rt_period);
696 for_each_cpu(i, rd->span) {
697 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
698 s64 diff;
699
700 if (iter == rt_rq)
701 continue;
702
703 raw_spin_lock(&iter->rt_runtime_lock);
704 /*
705 * Either all rqs have inf runtime and there's nothing to steal
706 * or __disable_runtime() below sets a specific rq to inf to
707 * indicate it's been disabled and disallow stealing.
708 */
709 if (iter->rt_runtime == RUNTIME_INF)
710 goto next;
711
712 /*
713 * From runqueues with spare time, take 1/n part of their
714 * spare time, but no more than our period.
715 */
716 diff = iter->rt_runtime - iter->rt_time;
717 if (diff > 0) {
718 diff = div_u64((u64)diff, weight);
719 if (rt_rq->rt_runtime + diff > rt_period)
720 diff = rt_period - rt_rq->rt_runtime;
721 iter->rt_runtime -= diff;
722 rt_rq->rt_runtime += diff;
723 if (rt_rq->rt_runtime == rt_period) {
724 raw_spin_unlock(&iter->rt_runtime_lock);
725 break;
726 }
727 }
728next:
729 raw_spin_unlock(&iter->rt_runtime_lock);
730 }
731 raw_spin_unlock(&rt_b->rt_runtime_lock);
732}
733
734/*
735 * Ensure this RQ takes back all the runtime it lent to its neighbours.
736 */
737static void __disable_runtime(struct rq *rq)
738{
739 struct root_domain *rd = rq->rd;
740 rt_rq_iter_t iter;
741 struct rt_rq *rt_rq;
742
743 if (unlikely(!scheduler_running))
744 return;
745
746 for_each_rt_rq(rt_rq, iter, rq) {
747 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
748 s64 want;
749 int i;
750
751 raw_spin_lock(&rt_b->rt_runtime_lock);
752 raw_spin_lock(&rt_rq->rt_runtime_lock);
753 /*
754 * Either we're all inf and nobody needs to borrow, or we're
755 * already disabled and thus have nothing to do, or we have
756 * exactly the right amount of runtime to take out.
757 */
758 if (rt_rq->rt_runtime == RUNTIME_INF ||
759 rt_rq->rt_runtime == rt_b->rt_runtime)
760 goto balanced;
761 raw_spin_unlock(&rt_rq->rt_runtime_lock);
762
763 /*
764 * Calculate the difference between what we started out with
765 * and what we currently have; that's the amount of runtime
766 * we lent out and now have to reclaim.
767 */
768 want = rt_b->rt_runtime - rt_rq->rt_runtime;
769
770 /*
771 * Greedy reclaim, take back as much as we can.
772 */
773 for_each_cpu(i, rd->span) {
774 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
775 s64 diff;
776
777 /*
778 * Can't reclaim from ourselves or disabled runqueues.
779 */
780 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
781 continue;
782
783 raw_spin_lock(&iter->rt_runtime_lock);
784 if (want > 0) {
785 diff = min_t(s64, iter->rt_runtime, want);
786 iter->rt_runtime -= diff;
787 want -= diff;
788 } else {
789 iter->rt_runtime -= want;
790 want -= want;
791 }
792 raw_spin_unlock(&iter->rt_runtime_lock);
793
794 if (!want)
795 break;
796 }
797
798 raw_spin_lock(&rt_rq->rt_runtime_lock);
799 /*
800 * We cannot be left wanting - that would mean some runtime
801 * leaked out of the system.
802 */
803 BUG_ON(want);
804balanced:
805 /*
806 * Disable all the borrow logic by pretending we have inf
807 * runtime - in which case borrowing doesn't make sense.
808 */
809 rt_rq->rt_runtime = RUNTIME_INF;
810 rt_rq->rt_throttled = 0;
811 raw_spin_unlock(&rt_rq->rt_runtime_lock);
812 raw_spin_unlock(&rt_b->rt_runtime_lock);
813
814 /* Make rt_rq available for pick_next_task() */
815 sched_rt_rq_enqueue(rt_rq);
816 }
817}
818
819static void __enable_runtime(struct rq *rq)
820{
821 rt_rq_iter_t iter;
822 struct rt_rq *rt_rq;
823
824 if (unlikely(!scheduler_running))
825 return;
826
827 /*
828 * Reset each runqueue's bandwidth settings
829 */
830 for_each_rt_rq(rt_rq, iter, rq) {
831 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
832
833 raw_spin_lock(&rt_b->rt_runtime_lock);
834 raw_spin_lock(&rt_rq->rt_runtime_lock);
835 rt_rq->rt_runtime = rt_b->rt_runtime;
836 rt_rq->rt_time = 0;
837 rt_rq->rt_throttled = 0;
838 raw_spin_unlock(&rt_rq->rt_runtime_lock);
839 raw_spin_unlock(&rt_b->rt_runtime_lock);
840 }
841}
842
843static void balance_runtime(struct rt_rq *rt_rq)
844{
845 if (!sched_feat(RT_RUNTIME_SHARE))
846 return;
847
848 if (rt_rq->rt_time > rt_rq->rt_runtime) {
849 raw_spin_unlock(&rt_rq->rt_runtime_lock);
850 do_balance_runtime(rt_rq);
851 raw_spin_lock(&rt_rq->rt_runtime_lock);
852 }
853}
854#else /* !CONFIG_SMP */
855static inline void balance_runtime(struct rt_rq *rt_rq) {}
856#endif /* CONFIG_SMP */
857
858static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
859{
860 int i, idle = 1, throttled = 0;
861 const struct cpumask *span;
862
863 span = sched_rt_period_mask();
864#ifdef CONFIG_RT_GROUP_SCHED
865 /*
866 * FIXME: isolated CPUs should really leave the root task group,
867 * whether they are isolcpus or were isolated via cpusets, lest
868 * the timer run on a CPU which does not service all runqueues,
869 * potentially leaving other CPUs indefinitely throttled. If
870 * isolation is really required, the user will turn the throttle
871 * off to kill the perturbations it causes anyway. Meanwhile,
872 * this maintains functionality for boot and/or troubleshooting.
873 */
874 if (rt_b == &root_task_group.rt_bandwidth)
875 span = cpu_online_mask;
876#endif
877 for_each_cpu(i, span) {
878 int enqueue = 0;
879 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
880 struct rq *rq = rq_of_rt_rq(rt_rq);
881 int skip;
882
883 /*
884 * When span == cpu_online_mask, taking each rq->lock
885 * can be time-consuming. Try to avoid it when possible.
886 */
887 raw_spin_lock(&rt_rq->rt_runtime_lock);
888 if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
889 rt_rq->rt_runtime = rt_b->rt_runtime;
890 skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
891 raw_spin_unlock(&rt_rq->rt_runtime_lock);
892 if (skip)
893 continue;
894
895 raw_spin_lock(&rq->lock);
896 update_rq_clock(rq);
897
898 if (rt_rq->rt_time) {
899 u64 runtime;
900
901 raw_spin_lock(&rt_rq->rt_runtime_lock);
902 if (rt_rq->rt_throttled)
903 balance_runtime(rt_rq);
904 runtime = rt_rq->rt_runtime;
905 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
906 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
907 rt_rq->rt_throttled = 0;
908 enqueue = 1;
909
910 /*
911 * When we're idle and a woken (rt) task is
912 * throttled check_preempt_curr() will set
913 * skip_update and the time between the wakeup
914 * and this unthrottle will get accounted as
915 * 'runtime'.
916 */
917 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
918 rq_clock_cancel_skipupdate(rq);
919 }
920 if (rt_rq->rt_time || rt_rq->rt_nr_running)
921 idle = 0;
922 raw_spin_unlock(&rt_rq->rt_runtime_lock);
923 } else if (rt_rq->rt_nr_running) {
924 idle = 0;
925 if (!rt_rq_throttled(rt_rq))
926 enqueue = 1;
927 }
928 if (rt_rq->rt_throttled)
929 throttled = 1;
930
931 if (enqueue)
932 sched_rt_rq_enqueue(rt_rq);
933 raw_spin_unlock(&rq->lock);
934 }
935
936 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
937 return 1;
938
939 return idle;
940}
941
942static inline int rt_se_prio(struct sched_rt_entity *rt_se)
943{
944#ifdef CONFIG_RT_GROUP_SCHED
945 struct rt_rq *rt_rq = group_rt_rq(rt_se);
946
947 if (rt_rq)
948 return rt_rq->highest_prio.curr;
949#endif
950
951 return rt_task_of(rt_se)->prio;
952}
953
954static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
955{
956 u64 runtime = sched_rt_runtime(rt_rq);
957
958 if (rt_rq->rt_throttled)
959 return rt_rq_throttled(rt_rq);
960
961 if (runtime >= sched_rt_period(rt_rq))
962 return 0;
963
964 balance_runtime(rt_rq);
965 runtime = sched_rt_runtime(rt_rq);
966 if (runtime == RUNTIME_INF)
967 return 0;
968
969 if (rt_rq->rt_time > runtime) {
970 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
971
972 /*
973 * Don't actually throttle groups that have no runtime assigned
974 * but accrue some time due to boosting.
975 */
976 if (likely(rt_b->rt_runtime)) {
977 rt_rq->rt_throttled = 1;
978 printk_deferred_once("sched: RT throttling activated\n");
979 } else {
980 /*
981 * In case we did anyway, make it go away,
982 * replenishment is a joke, since it will replenish us
983 * with exactly 0 ns.
984 */
985 rt_rq->rt_time = 0;
986 }
987
988 if (rt_rq_throttled(rt_rq)) {
989 sched_rt_rq_dequeue(rt_rq);
990 return 1;
991 }
992 }
993
994 return 0;
995}
996
997/*
998 * Update the current task's runtime statistics. Skip current tasks that
999 * are not in our scheduling class.
1000 */
1001static void update_curr_rt(struct rq *rq)
1002{
1003 struct task_struct *curr = rq->curr;
1004 struct sched_rt_entity *rt_se = &curr->rt;
1005 u64 delta_exec;
1006 u64 now;
1007
1008 if (curr->sched_class != &rt_sched_class)
1009 return;
1010
1011 now = rq_clock_task(rq);
1012 delta_exec = now - curr->se.exec_start;
1013 if (unlikely((s64)delta_exec <= 0))
1014 return;
1015
1016 schedstat_set(curr->se.statistics.exec_max,
1017 max(curr->se.statistics.exec_max, delta_exec));
1018
1019 curr->se.sum_exec_runtime += delta_exec;
1020 account_group_exec_runtime(curr, delta_exec);
1021
1022 curr->se.exec_start = now;
1023 cgroup_account_cputime(curr, delta_exec);
1024
1025 if (!rt_bandwidth_enabled())
1026 return;
1027
1028 for_each_sched_rt_entity(rt_se) {
1029 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1030 int exceeded;
1031
1032 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
1033 raw_spin_lock(&rt_rq->rt_runtime_lock);
1034 rt_rq->rt_time += delta_exec;
1035 exceeded = sched_rt_runtime_exceeded(rt_rq);
1036 if (exceeded)
1037 resched_curr(rq);
1038 raw_spin_unlock(&rt_rq->rt_runtime_lock);
1039 if (exceeded)
1040 do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
1041 }
1042 }
1043}
1044
1045static void
1046dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
1047{
1048 struct rq *rq = rq_of_rt_rq(rt_rq);
1049
1050 BUG_ON(&rq->rt != rt_rq);
1051
1052 if (!rt_rq->rt_queued)
1053 return;
1054
1055 BUG_ON(!rq->nr_running);
1056
1057 sub_nr_running(rq, count);
1058 rt_rq->rt_queued = 0;
1059
1060}
1061
1062static void
1063enqueue_top_rt_rq(struct rt_rq *rt_rq)
1064{
1065 struct rq *rq = rq_of_rt_rq(rt_rq);
1066
1067 BUG_ON(&rq->rt != rt_rq);
1068
1069 if (rt_rq->rt_queued)
1070 return;
1071
1072 if (rt_rq_throttled(rt_rq))
1073 return;
1074
1075 if (rt_rq->rt_nr_running) {
1076 add_nr_running(rq, rt_rq->rt_nr_running);
1077 rt_rq->rt_queued = 1;
1078 }
1079
1080 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1081 cpufreq_update_util(rq, 0);
1082}
1083
1084#if defined CONFIG_SMP
1085
1086static void
1087inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1088{
1089 struct rq *rq = rq_of_rt_rq(rt_rq);
1090
1091#ifdef CONFIG_RT_GROUP_SCHED
1092 /*
1093 * Change rq's cpupri only if rt_rq is the top queue.
1094 */
1095 if (&rq->rt != rt_rq)
1096 return;
1097#endif
1098 if (rq->online && prio < prev_prio)
1099 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1100}
1101
1102static void
1103dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1104{
1105 struct rq *rq = rq_of_rt_rq(rt_rq);
1106
1107#ifdef CONFIG_RT_GROUP_SCHED
1108 /*
1109 * Change rq's cpupri only if rt_rq is the top queue.
1110 */
1111 if (&rq->rt != rt_rq)
1112 return;
1113#endif
1114 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1115 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1116}
1117
1118#else /* CONFIG_SMP */
1119
1120static inline
1121void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1122static inline
1123void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1124
1125#endif /* CONFIG_SMP */
1126
1127#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1128static void
1129inc_rt_prio(struct rt_rq *rt_rq, int prio)
1130{
1131 int prev_prio = rt_rq->highest_prio.curr;
1132
1133 if (prio < prev_prio)
1134 rt_rq->highest_prio.curr = prio;
1135
1136 inc_rt_prio_smp(rt_rq, prio, prev_prio);
1137}
1138
1139static void
1140dec_rt_prio(struct rt_rq *rt_rq, int prio)
1141{
1142 int prev_prio = rt_rq->highest_prio.curr;
1143
1144 if (rt_rq->rt_nr_running) {
1145
1146 WARN_ON(prio < prev_prio);
1147
1148 /*
1149 * This may have been our highest task, and therefore
1150 * we may have some recomputation to do
1151 */
1152 if (prio == prev_prio) {
1153 struct rt_prio_array *array = &rt_rq->active;
1154
1155 rt_rq->highest_prio.curr =
1156 sched_find_first_bit(array->bitmap);
1157 }
1158
1159 } else
1160 rt_rq->highest_prio.curr = MAX_RT_PRIO;
1161
1162 dec_rt_prio_smp(rt_rq, prio, prev_prio);
1163}
1164
1165#else
1166
1167static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1168static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1169
1170#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1171
1172#ifdef CONFIG_RT_GROUP_SCHED
1173
1174static void
1175inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1176{
1177 if (rt_se_boosted(rt_se))
1178 rt_rq->rt_nr_boosted++;
1179
1180 if (rt_rq->tg)
1181 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1182}
1183
1184static void
1185dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1186{
1187 if (rt_se_boosted(rt_se))
1188 rt_rq->rt_nr_boosted--;
1189
1190 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1191}
1192
1193#else /* CONFIG_RT_GROUP_SCHED */
1194
1195static void
1196inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1197{
1198 start_rt_bandwidth(&def_rt_bandwidth);
1199}
1200
1201static inline
1202void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1203
1204#endif /* CONFIG_RT_GROUP_SCHED */
1205
1206static inline
1207unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1208{
1209 struct rt_rq *group_rq = group_rt_rq(rt_se);
1210
1211 if (group_rq)
1212 return group_rq->rt_nr_running;
1213 else
1214 return 1;
1215}
1216
1217static inline
1218unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1219{
1220 struct rt_rq *group_rq = group_rt_rq(rt_se);
1221 struct task_struct *tsk;
1222
1223 if (group_rq)
1224 return group_rq->rr_nr_running;
1225
1226 tsk = rt_task_of(rt_se);
1227
1228 return (tsk->policy == SCHED_RR) ? 1 : 0;
1229}
1230
1231static inline
1232void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1233{
1234 int prio = rt_se_prio(rt_se);
1235
1236 WARN_ON(!rt_prio(prio));
1237 rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1238 rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1239
1240 inc_rt_prio(rt_rq, prio);
1241 inc_rt_migration(rt_se, rt_rq);
1242 inc_rt_group(rt_se, rt_rq);
1243}
1244
1245static inline
1246void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1247{
1248 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1249 WARN_ON(!rt_rq->rt_nr_running);
1250 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1251 rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1252
1253 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1254 dec_rt_migration(rt_se, rt_rq);
1255 dec_rt_group(rt_se, rt_rq);
1256}
1257
1258/*
1259 * Change rt_se->run_list location unless SAVE && !MOVE
1260 *
1261 * assumes ENQUEUE/DEQUEUE flags match
1262 */
1263static inline bool move_entity(unsigned int flags)
1264{
1265 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1266 return false;
1267
1268 return true;
1269}
1270
1271static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1272{
1273 list_del_init(&rt_se->run_list);
1274
1275 if (list_empty(array->queue + rt_se_prio(rt_se)))
1276 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1277
1278 rt_se->on_list = 0;
1279}
1280
1281static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1282{
1283 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1284 struct rt_prio_array *array = &rt_rq->active;
1285 struct rt_rq *group_rq = group_rt_rq(rt_se);
1286 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1287
1288 /*
1289 * Don't enqueue the group if it's throttled, or when empty.
1290 * The latter is a consequence of the former when a child group
1291 * gets throttled and the current group doesn't have any other
1292 * active members.
1293 */
1294 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1295 if (rt_se->on_list)
1296 __delist_rt_entity(rt_se, array);
1297 return;
1298 }
1299
1300 if (move_entity(flags)) {
1301 WARN_ON_ONCE(rt_se->on_list);
1302 if (flags & ENQUEUE_HEAD)
1303 list_add(&rt_se->run_list, queue);
1304 else
1305 list_add_tail(&rt_se->run_list, queue);
1306
1307 __set_bit(rt_se_prio(rt_se), array->bitmap);
1308 rt_se->on_list = 1;
1309 }
1310 rt_se->on_rq = 1;
1311
1312 inc_rt_tasks(rt_se, rt_rq);
1313}
1314
1315static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1316{
1317 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1318 struct rt_prio_array *array = &rt_rq->active;
1319
1320 if (move_entity(flags)) {
1321 WARN_ON_ONCE(!rt_se->on_list);
1322 __delist_rt_entity(rt_se, array);
1323 }
1324 rt_se->on_rq = 0;
1325
1326 dec_rt_tasks(rt_se, rt_rq);
1327}
1328
1329/*
1330 * Because the prio of an upper entry depends on the lower
1331 * entries, we must remove entries top - down.
1332 */
1333static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1334{
1335 struct sched_rt_entity *back = NULL;
1336 unsigned int rt_nr_running;
1337
1338 for_each_sched_rt_entity(rt_se) {
1339 rt_se->back = back;
1340 back = rt_se;
1341 }
1342
1343 rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
1344
1345 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1346 if (on_rt_rq(rt_se))
1347 __dequeue_rt_entity(rt_se, flags);
1348 }
1349
1350 dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
1351}
1352
1353static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1354{
1355 struct rq *rq = rq_of_rt_se(rt_se);
1356
1357 dequeue_rt_stack(rt_se, flags);
1358 for_each_sched_rt_entity(rt_se)
1359 __enqueue_rt_entity(rt_se, flags);
1360 enqueue_top_rt_rq(&rq->rt);
1361}
1362
1363static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1364{
1365 struct rq *rq = rq_of_rt_se(rt_se);
1366
1367 dequeue_rt_stack(rt_se, flags);
1368
1369 for_each_sched_rt_entity(rt_se) {
1370 struct rt_rq *rt_rq = group_rt_rq(rt_se);
1371
1372 if (rt_rq && rt_rq->rt_nr_running)
1373 __enqueue_rt_entity(rt_se, flags);
1374 }
1375 enqueue_top_rt_rq(&rq->rt);
1376}
1377
1378/*
1379 * Adding/removing a task to/from a priority array:
1380 */
1381static void
1382enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1383{
1384 struct sched_rt_entity *rt_se = &p->rt;
1385
1386 if (flags & ENQUEUE_WAKEUP)
1387 rt_se->timeout = 0;
1388
1389 enqueue_rt_entity(rt_se, flags);
1390
1391 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1392 enqueue_pushable_task(rq, p);
1393}
1394
1395static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1396{
1397 struct sched_rt_entity *rt_se = &p->rt;
1398
1399 update_curr_rt(rq);
1400 dequeue_rt_entity(rt_se, flags);
1401
1402 dequeue_pushable_task(rq, p);
1403}
1404
1405/*
1406 * Put task to the head or the end of the run list without the overhead of
1407 * dequeue followed by enqueue.
1408 */
1409static void
1410requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1411{
1412 if (on_rt_rq(rt_se)) {
1413 struct rt_prio_array *array = &rt_rq->active;
1414 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1415
1416 if (head)
1417 list_move(&rt_se->run_list, queue);
1418 else
1419 list_move_tail(&rt_se->run_list, queue);
1420 }
1421}
1422
1423static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1424{
1425 struct sched_rt_entity *rt_se = &p->rt;
1426 struct rt_rq *rt_rq;
1427
1428 for_each_sched_rt_entity(rt_se) {
1429 rt_rq = rt_rq_of_se(rt_se);
1430 requeue_rt_entity(rt_rq, rt_se, head);
1431 }
1432}
1433
1434static void yield_task_rt(struct rq *rq)
1435{
1436 requeue_task_rt(rq, rq->curr, 0);
1437}
1438
1439#ifdef CONFIG_SMP
1440static int find_lowest_rq(struct task_struct *task);
1441
1442static int
1443select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1444{
1445 struct task_struct *curr;
1446 struct rq *rq;
1447 bool test;
1448
1449 /* For anything but wake ups, just return the task_cpu */
1450 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1451 goto out;
1452
1453 rq = cpu_rq(cpu);
1454
1455 rcu_read_lock();
1456 curr = READ_ONCE(rq->curr); /* unlocked access */
1457
1458 /*
1459 * If the current task on @p's runqueue is an RT task, then
1460 * try to see if we can wake this RT task up on another
1461 * runqueue. Otherwise simply start this RT task
1462 * on its current runqueue.
1463 *
1464 * We want to avoid overloading runqueues. If the woken
1465 * task is a higher priority, then it will stay on this CPU
1466 * and the lower prio task should be moved to another CPU.
1467 * Even though this will probably make the lower prio task
1468 * lose its cache, we do not want to bounce a higher task
1469 * around just because it gave up its CPU, perhaps for a
1470 * lock?
1471 *
1472 * For equal prio tasks, we just let the scheduler sort it out.
1473 *
1474 * Otherwise, just let it ride on the affined RQ and the
1475 * post-schedule router will push the preempted task away
1476 *
1477 * This test is optimistic, if we get it wrong the load-balancer
1478 * will have to sort it out.
1479 *
1480 * We take into account the capacity of the CPU to ensure it fits the
1481 * requirement of the task - which is only important on heterogeneous
1482 * systems like big.LITTLE.
1483 */
1484 test = curr &&
1485 unlikely(rt_task(curr)) &&
1486 (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
1487
1488 if (test || !rt_task_fits_capacity(p, cpu)) {
1489 int target = find_lowest_rq(p);
1490
1491 /*
1492 * Bail out if we were forcing a migration to find a better
1493 * fitting CPU but our search failed.
1494 */
1495 if (!test && target != -1 && !rt_task_fits_capacity(p, target))
1496 goto out_unlock;
1497
1498 /*
1499 * Don't bother moving it if the destination CPU is
1500 * not running a lower priority task.
1501 */
1502 if (target != -1 &&
1503 p->prio < cpu_rq(target)->rt.highest_prio.curr)
1504 cpu = target;
1505 }
1506
1507out_unlock:
1508 rcu_read_unlock();
1509
1510out:
1511 return cpu;
1512}
1513
1514static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1515{
1516 /*
1517 * Current can't be migrated, useless to reschedule,
1518 * let's hope p can move out.
1519 */
1520 if (rq->curr->nr_cpus_allowed == 1 ||
1521 !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1522 return;
1523
1524 /*
1525 * p is migratable, so let's not schedule it and
1526 * see if it is pushed or pulled somewhere else.
1527 */
1528 if (p->nr_cpus_allowed != 1 &&
1529 cpupri_find(&rq->rd->cpupri, p, NULL))
1530 return;
1531
1532 /*
1533 * There appear to be other CPUs that can accept
1534 * the current task but none can run 'p', so let's reschedule
1535 * to try and push the current task away:
1536 */
1537 requeue_task_rt(rq, p, 1);
1538 resched_curr(rq);
1539}
1540
1541static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1542{
1543 if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
1544 /*
1545 * This is OK, because current is on_cpu, which avoids it being
1546 * picked for load-balance and preemption/IRQs are still
1547 * disabled avoiding further scheduler activity on it and we've
1548 * not yet started the picking loop.
1549 */
1550 rq_unpin_lock(rq, rf);
1551 pull_rt_task(rq);
1552 rq_repin_lock(rq, rf);
1553 }
1554
1555 return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
1556}
1557#endif /* CONFIG_SMP */
1558
1559/*
1560 * Preempt the current task with a newly woken task if needed:
1561 */
1562static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1563{
1564 if (p->prio < rq->curr->prio) {
1565 resched_curr(rq);
1566 return;
1567 }
1568
1569#ifdef CONFIG_SMP
1570 /*
1571 * If:
1572 *
1573 * - the newly woken task is of equal priority to the current task
1574 * - the newly woken task is non-migratable while current is migratable
1575 * - current will be preempted on the next reschedule
1576 *
1577 * we should check to see if current can readily move to a different
1578 * cpu. If so, we will reschedule to allow the push logic to try
1579 * to move current somewhere else, making room for our non-migratable
1580 * task.
1581 */
1582 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1583 check_preempt_equal_prio(rq, p);
1584#endif
1585}
1586
1587static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
1588{
1589 p->se.exec_start = rq_clock_task(rq);
1590
1591 /* The running task is never eligible for pushing */
1592 dequeue_pushable_task(rq, p);
1593
1594 if (!first)
1595 return;
1596
1597 /*
1598 * If prev task was rt, put_prev_task() has already updated the
1599 * utilization. We only care of the case where we start to schedule a
1600 * rt task
1601 */
1602 if (rq->curr->sched_class != &rt_sched_class)
1603 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
1604
1605 rt_queue_push_tasks(rq);
1606}
1607
1608static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1609 struct rt_rq *rt_rq)
1610{
1611 struct rt_prio_array *array = &rt_rq->active;
1612 struct sched_rt_entity *next = NULL;
1613 struct list_head *queue;
1614 int idx;
1615
1616 idx = sched_find_first_bit(array->bitmap);
1617 BUG_ON(idx >= MAX_RT_PRIO);
1618
1619 queue = array->queue + idx;
1620 next = list_entry(queue->next, struct sched_rt_entity, run_list);
1621
1622 return next;
1623}
1624
1625static struct task_struct *_pick_next_task_rt(struct rq *rq)
1626{
1627 struct sched_rt_entity *rt_se;
1628 struct rt_rq *rt_rq = &rq->rt;
1629
1630 do {
1631 rt_se = pick_next_rt_entity(rq, rt_rq);
1632 BUG_ON(!rt_se);
1633 rt_rq = group_rt_rq(rt_se);
1634 } while (rt_rq);
1635
1636 return rt_task_of(rt_se);
1637}
1638
1639static struct task_struct *pick_next_task_rt(struct rq *rq)
1640{
1641 struct task_struct *p;
1642
1643 if (!sched_rt_runnable(rq))
1644 return NULL;
1645
1646 p = _pick_next_task_rt(rq);
1647 set_next_task_rt(rq, p, true);
1648 return p;
1649}
1650
1651static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1652{
1653 update_curr_rt(rq);
1654
1655 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
1656
1657 /*
1658 * The previous task needs to be made eligible for pushing
1659 * if it is still active
1660 */
1661 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1662 enqueue_pushable_task(rq, p);
1663}
1664
1665#ifdef CONFIG_SMP
1666
1667/* Only try algorithms three times */
1668#define RT_MAX_TRIES 3
1669
1670static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1671{
1672 if (!task_running(rq, p) &&
1673 cpumask_test_cpu(cpu, p->cpus_ptr))
1674 return 1;
1675
1676 return 0;
1677}
1678
1679/*
1680 * Return the highest pushable rq's task, which is suitable to be executed
1681 * on the CPU, NULL otherwise
1682 */
1683static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1684{
1685 struct plist_head *head = &rq->rt.pushable_tasks;
1686 struct task_struct *p;
1687
1688 if (!has_pushable_tasks(rq))
1689 return NULL;
1690
1691 plist_for_each_entry(p, head, pushable_tasks) {
1692 if (pick_rt_task(rq, p, cpu))
1693 return p;
1694 }
1695
1696 return NULL;
1697}
1698
1699static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1700
1701static int find_lowest_rq(struct task_struct *task)
1702{
1703 struct sched_domain *sd;
1704 struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1705 int this_cpu = smp_processor_id();
1706 int cpu = task_cpu(task);
1707 int ret;
1708
1709 /* Make sure the mask is initialized first */
1710 if (unlikely(!lowest_mask))
1711 return -1;
1712
1713 if (task->nr_cpus_allowed == 1)
1714 return -1; /* No other targets possible */
1715
1716 /*
1717 * If we're on asym system ensure we consider the different capacities
1718 * of the CPUs when searching for the lowest_mask.
1719 */
1720 if (static_branch_unlikely(&sched_asym_cpucapacity)) {
1721
1722 ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
1723 task, lowest_mask,
1724 rt_task_fits_capacity);
1725 } else {
1726
1727 ret = cpupri_find(&task_rq(task)->rd->cpupri,
1728 task, lowest_mask);
1729 }
1730
1731 if (!ret)
1732 return -1; /* No targets found */
1733
1734 /*
1735 * At this point we have built a mask of CPUs representing the
1736 * lowest priority tasks in the system. Now we want to elect
1737 * the best one based on our affinity and topology.
1738 *
1739 * We prioritize the last CPU that the task executed on since
1740 * it is most likely cache-hot in that location.
1741 */
1742 if (cpumask_test_cpu(cpu, lowest_mask))
1743 return cpu;
1744
1745 /*
1746 * Otherwise, we consult the sched_domains span maps to figure
1747 * out which CPU is logically closest to our hot cache data.
1748 */
1749 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1750 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1751
1752 rcu_read_lock();
1753 for_each_domain(cpu, sd) {
1754 if (sd->flags & SD_WAKE_AFFINE) {
1755 int best_cpu;
1756
1757 /*
1758 * "this_cpu" is cheaper to preempt than a
1759 * remote processor.
1760 */
1761 if (this_cpu != -1 &&
1762 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1763 rcu_read_unlock();
1764 return this_cpu;
1765 }
1766
1767 best_cpu = cpumask_first_and(lowest_mask,
1768 sched_domain_span(sd));
1769 if (best_cpu < nr_cpu_ids) {
1770 rcu_read_unlock();
1771 return best_cpu;
1772 }
1773 }
1774 }
1775 rcu_read_unlock();
1776
1777 /*
1778 * And finally, if there were no matches within the domains
1779 * just give the caller *something* to work with from the compatible
1780 * locations.
1781 */
1782 if (this_cpu != -1)
1783 return this_cpu;
1784
1785 cpu = cpumask_any(lowest_mask);
1786 if (cpu < nr_cpu_ids)
1787 return cpu;
1788
1789 return -1;
1790}
1791
1792/* Will lock the rq it finds */
1793static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1794{
1795 struct rq *lowest_rq = NULL;
1796 int tries;
1797 int cpu;
1798
1799 for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1800 cpu = find_lowest_rq(task);
1801
1802 if ((cpu == -1) || (cpu == rq->cpu))
1803 break;
1804
1805 lowest_rq = cpu_rq(cpu);
1806
1807 if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1808 /*
1809 * Target rq has tasks of equal or higher priority,
1810 * retrying does not release any lock and is unlikely
1811 * to yield a different result.
1812 */
1813 lowest_rq = NULL;
1814 break;
1815 }
1816
1817 /* if the prio of this runqueue changed, try again */
1818 if (double_lock_balance(rq, lowest_rq)) {
1819 /*
1820 * We had to unlock the run queue. In
1821 * the meantime, the task could have
1822 * migrated already or had its affinity changed.
1823 * Also make sure that it wasn't scheduled on its rq.
1824 */
1825 if (unlikely(task_rq(task) != rq ||
1826 !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
1827 task_running(rq, task) ||
1828 !rt_task(task) ||
1829 !task_on_rq_queued(task))) {
1830
1831 double_unlock_balance(rq, lowest_rq);
1832 lowest_rq = NULL;
1833 break;
1834 }
1835 }
1836
1837 /* If this rq is still suitable use it. */
1838 if (lowest_rq->rt.highest_prio.curr > task->prio)
1839 break;
1840
1841 /* try again */
1842 double_unlock_balance(rq, lowest_rq);
1843 lowest_rq = NULL;
1844 }
1845
1846 return lowest_rq;
1847}
1848
1849static struct task_struct *pick_next_pushable_task(struct rq *rq)
1850{
1851 struct task_struct *p;
1852
1853 if (!has_pushable_tasks(rq))
1854 return NULL;
1855
1856 p = plist_first_entry(&rq->rt.pushable_tasks,
1857 struct task_struct, pushable_tasks);
1858
1859 BUG_ON(rq->cpu != task_cpu(p));
1860 BUG_ON(task_current(rq, p));
1861 BUG_ON(p->nr_cpus_allowed <= 1);
1862
1863 BUG_ON(!task_on_rq_queued(p));
1864 BUG_ON(!rt_task(p));
1865
1866 return p;
1867}
1868
1869/*
1870 * If the current CPU has more than one RT task, see if the non
1871 * running task can migrate over to a CPU that is running a task
1872 * of lesser priority.
1873 */
1874static int push_rt_task(struct rq *rq)
1875{
1876 struct task_struct *next_task;
1877 struct rq *lowest_rq;
1878 int ret = 0;
1879
1880 if (!rq->rt.overloaded)
1881 return 0;
1882
1883 next_task = pick_next_pushable_task(rq);
1884 if (!next_task)
1885 return 0;
1886
1887retry:
1888 if (WARN_ON(next_task == rq->curr))
1889 return 0;
1890
1891 /*
1892 * It's possible that the next_task slipped in with a
1893 * higher priority than current. If that's the case
1894 * just reschedule current.
1895 */
1896 if (unlikely(next_task->prio < rq->curr->prio)) {
1897 resched_curr(rq);
1898 return 0;
1899 }
1900
1901 /* We might release rq lock */
1902 get_task_struct(next_task);
1903
1904 /* find_lock_lowest_rq locks the rq if found */
1905 lowest_rq = find_lock_lowest_rq(next_task, rq);
1906 if (!lowest_rq) {
1907 struct task_struct *task;
1908 /*
1909 * find_lock_lowest_rq releases rq->lock
1910 * so it is possible that next_task has migrated.
1911 *
1912 * We need to make sure that the task is still on the same
1913 * run-queue and is also still the next task eligible for
1914 * pushing.
1915 */
1916 task = pick_next_pushable_task(rq);
1917 if (task == next_task) {
1918 /*
1919 * The task hasn't migrated, and is still the next
1920 * eligible task, but we failed to find a run-queue
1921 * to push it to. Do not retry in this case, since
1922 * other CPUs will pull from us when ready.
1923 */
1924 goto out;
1925 }
1926
1927 if (!task)
1928 /* No more tasks, just exit */
1929 goto out;
1930
1931 /*
1932 * Something has shifted, try again.
1933 */
1934 put_task_struct(next_task);
1935 next_task = task;
1936 goto retry;
1937 }
1938
1939 deactivate_task(rq, next_task, 0);
1940 set_task_cpu(next_task, lowest_rq->cpu);
1941 activate_task(lowest_rq, next_task, 0);
1942 ret = 1;
1943
1944 resched_curr(lowest_rq);
1945
1946 double_unlock_balance(rq, lowest_rq);
1947
1948out:
1949 put_task_struct(next_task);
1950
1951 return ret;
1952}
1953
1954static void push_rt_tasks(struct rq *rq)
1955{
1956 /* push_rt_task will return true if it moved an RT */
1957 while (push_rt_task(rq))
1958 ;
1959}
1960
1961#ifdef HAVE_RT_PUSH_IPI
1962
1963/*
1964 * When a high priority task schedules out from a CPU and a lower priority
1965 * task is scheduled in, a check is made to see if there are any RT tasks
1966 * on other CPUs that are waiting to run because a higher priority RT task
1967 * is currently running on its CPU. In this case, the CPU with multiple RT
1968 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
1969 * up that may be able to run one of its non-running queued RT tasks.
1970 *
1971 * All CPUs with overloaded RT tasks need to be notified as there is currently
1972 * no way to know which of these CPUs have the highest priority task waiting
1973 * to run. Instead of trying to take a spinlock on each of these CPUs,
1974 * which has been shown to cause large latency when done on machines with many
1975 * CPUs, an IPI is sent to the CPUs to have them push off the overloaded
1976 * RT tasks waiting to run.
1977 *
1978 * Just sending an IPI to each of the CPUs is also an issue, as on large
1979 * count CPU machines, this can cause an IPI storm on a CPU, especially
1980 * if it's the only CPU with multiple RT tasks queued, and a large number
1981 * of CPUs scheduling a lower priority task at the same time.
1982 *
1983 * Each root domain has its own irq work function that can iterate over
1984 * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
1985 * tasks must be checked whether it is one or many CPUs that are lowering
1986 * their priority, there is a single irq work iterator that will try to
1987 * push off RT tasks that are waiting to run.
1988 *
1989 * When a CPU schedules a lower priority task, it will kick off the
1990 * irq work iterator that will jump to each CPU with overloaded RT tasks.
1991 * As it only takes the first CPU that schedules a lower priority task
1992 * to start the process, the rto_start variable is incremented and if
1993 * to start the process, the rto_loop_start variable is atomically set and
1994 * only the CPU that wins that cmpxchg goes on to take the rto_lock.
1995 * CPUs scheduling lower priority tasks.
1996 *
1997 * All CPUs that are scheduling a lower priority task will increment the
1998 * rto_loop_next variable. This will make sure that the irq work iterator
1999 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
2000 * priority task, even if the iterator is in the middle of a scan. Incrementing
2001 * rto_loop_next will cause the iterator to perform another scan.
2002 *
2003 */
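/*
 * A simplified illustration of the chain (assuming CPUs 2 and 5 are the
 * RT overloaded CPUs): CPU 0 schedules a lower priority task, bumps
 * rto_loop_next, wins the rto_loop_start trylock and queues rto_push_work
 * on CPU 2. CPU 2 pushes what it can and then forwards the irq work to
 * CPU 5 via rto_next_cpu(). If CPU 1 lowers its priority mid-scan, its
 * rto_loop_next increment makes rto_next_cpu() start another pass over
 * the rto_mask instead of ending the chain.
 */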
2004static int rto_next_cpu(struct root_domain *rd)
2005{
2006 int next;
2007 int cpu;
2008
2009 /*
2010 * When starting the IPI RT pushing, the rto_cpu is set to -1,
2011	 * rto_next_cpu() will simply return the first CPU found in
2012	 * the rto_mask.
2013	 *
2014	 * If rto_next_cpu() is called with rto_cpu set to a valid CPU, it
2015 * will return the next CPU found in the rto_mask.
2016 *
2017 * If there are no more CPUs left in the rto_mask, then a check is made
2018 * against rto_loop and rto_loop_next. rto_loop is only updated with
2019 * the rto_lock held, but any CPU may increment the rto_loop_next
2020 * without any locking.
2021 */
2022 for (;;) {
2023
2024 /* When rto_cpu is -1 this acts like cpumask_first() */
2025 cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
2026
2027 rd->rto_cpu = cpu;
2028
2029 if (cpu < nr_cpu_ids)
2030 return cpu;
2031
2032 rd->rto_cpu = -1;
2033
2034 /*
2035 * ACQUIRE ensures we see the @rto_mask changes
2036 * made prior to the @next value observed.
2037 *
2038 * Matches WMB in rt_set_overload().
2039 */
2040 next = atomic_read_acquire(&rd->rto_loop_next);
2041
2042 if (rd->rto_loop == next)
2043 break;
2044
2045 rd->rto_loop = next;
2046 }
2047
2048 return -1;
2049}
2050
2051static inline bool rto_start_trylock(atomic_t *v)
2052{
2053 return !atomic_cmpxchg_acquire(v, 0, 1);
2054}
2055
2056static inline void rto_start_unlock(atomic_t *v)
2057{
2058 atomic_set_release(v, 0);
2059}
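/*
 * rto_loop_start acts as a simple test-and-set lock: the acquire cmpxchg
 * in rto_start_trylock() pairs with the release store in
 * rto_start_unlock(), so only one CPU at a time can initiate the push
 * IPI chain from tell_cpu_to_push().
 */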
2060
2061static void tell_cpu_to_push(struct rq *rq)
2062{
2063 int cpu = -1;
2064
2065 /* Keep the loop going if the IPI is currently active */
2066 atomic_inc(&rq->rd->rto_loop_next);
2067
2068 /* Only one CPU can initiate a loop at a time */
2069 if (!rto_start_trylock(&rq->rd->rto_loop_start))
2070 return;
2071
2072 raw_spin_lock(&rq->rd->rto_lock);
2073
2074 /*
2075	 * rto_cpu is updated under the lock. If it holds a valid CPU,
2076	 * then the IPI chain is still running and will continue due to the
2077	 * update to rto_loop_next, and nothing needs to be done here.
2078	 * Otherwise it is finishing up and a new IPI needs to be sent.
2079 */
2080 if (rq->rd->rto_cpu < 0)
2081 cpu = rto_next_cpu(rq->rd);
2082
2083 raw_spin_unlock(&rq->rd->rto_lock);
2084
2085 rto_start_unlock(&rq->rd->rto_loop_start);
2086
2087 if (cpu >= 0) {
2088 /* Make sure the rd does not get freed while pushing */
2089 sched_get_rd(rq->rd);
2090 irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2091 }
2092}
2093
2094/* Called from hardirq context */
2095void rto_push_irq_work_func(struct irq_work *work)
2096{
2097 struct root_domain *rd =
2098 container_of(work, struct root_domain, rto_push_work);
2099 struct rq *rq;
2100 int cpu;
2101
2102 rq = this_rq();
2103
2104 /*
2105 * We do not need to grab the lock to check for has_pushable_tasks.
2106	 * When it gets updated, a check is made to see whether a push is possible.
2107 */
2108 if (has_pushable_tasks(rq)) {
2109 raw_spin_lock(&rq->lock);
2110 push_rt_tasks(rq);
2111 raw_spin_unlock(&rq->lock);
2112 }
2113
2114 raw_spin_lock(&rd->rto_lock);
2115
2116 /* Pass the IPI to the next rt overloaded queue */
2117 cpu = rto_next_cpu(rd);
2118
2119 raw_spin_unlock(&rd->rto_lock);
2120
2121 if (cpu < 0) {
2122 sched_put_rd(rd);
2123 return;
2124 }
2125
2126 /* Try the next RT overloaded CPU */
2127 irq_work_queue_on(&rd->rto_push_work, cpu);
2128}
2129#endif /* HAVE_RT_PUSH_IPI */
2130
2131static void pull_rt_task(struct rq *this_rq)
2132{
2133 int this_cpu = this_rq->cpu, cpu;
2134 bool resched = false;
2135 struct task_struct *p;
2136 struct rq *src_rq;
2137 int rt_overload_count = rt_overloaded(this_rq);
2138
2139 if (likely(!rt_overload_count))
2140 return;
2141
2142 /*
2143	 * Match the barrier from rt_set_overload(); this guarantees that if we
2144 * see overloaded we must also see the rto_mask bit.
2145 */
2146 smp_rmb();
2147
2148	/* If we are the only overloaded CPU, do nothing */
2149 if (rt_overload_count == 1 &&
2150 cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2151 return;
2152
2153#ifdef HAVE_RT_PUSH_IPI
2154 if (sched_feat(RT_PUSH_IPI)) {
2155 tell_cpu_to_push(this_rq);
2156 return;
2157 }
2158#endif
2159
2160 for_each_cpu(cpu, this_rq->rd->rto_mask) {
2161 if (this_cpu == cpu)
2162 continue;
2163
2164 src_rq = cpu_rq(cpu);
2165
2166 /*
2167 * Don't bother taking the src_rq->lock if the next highest
2168 * task is known to be lower-priority than our current task.
2169 * This may look racy, but if this value is about to go
2170 * logically higher, the src_rq will push this task away.
2171		 * And if it's going logically lower, we do not care.
2172 */
2173 if (src_rq->rt.highest_prio.next >=
2174 this_rq->rt.highest_prio.curr)
2175 continue;
2176
2177 /*
2178 * We can potentially drop this_rq's lock in
2179 * double_lock_balance, and another CPU could
2180 * alter this_rq
2181 */
2182 double_lock_balance(this_rq, src_rq);
2183
2184 /*
2185		 * We can only pull a task that is pushable
2186		 * on its runqueue, and no others.
2187 */
2188 p = pick_highest_pushable_task(src_rq, this_cpu);
2189
2190 /*
2191 * Do we have an RT task that preempts
2192 * the to-be-scheduled task?
2193 */
2194 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2195 WARN_ON(p == src_rq->curr);
2196 WARN_ON(!task_on_rq_queued(p));
2197
2198 /*
2199 * There's a chance that p is higher in priority
2200 * than what's currently running on its CPU.
2201			 * This is just that p is waking up and hasn't
2202			 * had a chance to schedule yet. We only pull
2203			 * p if it is lower in priority than the
2204			 * current task on that run queue.
2205 */
2206 if (p->prio < src_rq->curr->prio)
2207 goto skip;
2208
2209 resched = true;
2210
2211 deactivate_task(src_rq, p, 0);
2212 set_task_cpu(p, this_cpu);
2213 activate_task(this_rq, p, 0);
2214 /*
2215 * We continue with the search, just in
2216 * case there's an even higher prio task
2217 * in another runqueue. (low likelihood
2218 * but possible)
2219 */
2220 }
2221skip:
2222 double_unlock_balance(this_rq, src_rq);
2223 }
2224
2225 if (resched)
2226 resched_curr(this_rq);
2227}
2228
2229/*
2230 * If we are not running and we are not going to reschedule soon, we should
2231 * try to push tasks away now
2232 */
2233static void task_woken_rt(struct rq *rq, struct task_struct *p)
2234{
Olivier Deprez157378f2022-04-04 15:47:50 +02002235 bool need_to_push = !task_running(rq, p) &&
2236 !test_tsk_need_resched(rq->curr) &&
2237 p->nr_cpus_allowed > 1 &&
2238 (dl_task(rq->curr) || rt_task(rq->curr)) &&
2239 (rq->curr->nr_cpus_allowed < 2 ||
2240 rq->curr->prio <= p->prio);
2241
2242 if (need_to_push)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002243 push_rt_tasks(rq);
2244}
2245
2246/* Assumes rq->lock is held */
2247static void rq_online_rt(struct rq *rq)
2248{
2249 if (rq->rt.overloaded)
2250 rt_set_overload(rq);
2251
2252 __enable_runtime(rq);
2253
2254 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2255}
2256
2257/* Assumes rq->lock is held */
2258static void rq_offline_rt(struct rq *rq)
2259{
2260 if (rq->rt.overloaded)
2261 rt_clear_overload(rq);
2262
2263 __disable_runtime(rq);
2264
2265 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2266}
2267
2268/*
2269 * When switching from the RT queue, we bring ourselves to a position
2270 * where we might want to pull RT tasks from other runqueues.
2271 */
2272static void switched_from_rt(struct rq *rq, struct task_struct *p)
2273{
2274 /*
2275 * If there are other RT tasks then we will reschedule
2276 * and the scheduling of the other RT tasks will handle
2277 * the balancing. But if we are the last RT task
2278 * we may need to handle the pulling of RT tasks
2279 * now.
2280 */
2281 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2282 return;
2283
2284 rt_queue_pull_task(rq);
2285}
2286
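/*
 * local_cpu_mask is the per-CPU scratch cpumask used when looking for a
 * lower priority CPU to push an RT task to (see find_lowest_rq() earlier
 * in this file); it is allocated on each CPU's home node to keep that
 * lookup cheap.
 */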
2287void __init init_sched_rt_class(void)
2288{
2289 unsigned int i;
2290
2291 for_each_possible_cpu(i) {
2292 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2293 GFP_KERNEL, cpu_to_node(i));
2294 }
2295}
2296#endif /* CONFIG_SMP */
2297
2298/*
2299 * When switching a task to RT, we may overload the runqueue
2300 * with RT tasks. In this case we try to push them off to
2301 * other runqueues.
2302 */
2303static void switched_to_rt(struct rq *rq, struct task_struct *p)
2304{
2305 /*
Olivier Deprez0e641232021-09-23 10:07:05 +02002306 * If we are running, update the avg_rt tracking, as the running time
2307	 * will from now on be accounted into the latter.
2308 */
2309 if (task_current(rq, p)) {
2310 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
2311 return;
2312 }
2313
2314 /*
2315 * If we are not running we may need to preempt the current
2316 * running task. If that current running task is also an RT task
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002317 * then see if we can move to another run queue.
2318 */
Olivier Deprez0e641232021-09-23 10:07:05 +02002319 if (task_on_rq_queued(p)) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002320#ifdef CONFIG_SMP
2321 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2322 rt_queue_push_tasks(rq);
2323#endif /* CONFIG_SMP */
2324 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
2325 resched_curr(rq);
2326 }
2327}
2328
2329/*
2330 * Priority of the task has changed. This may cause
2331 * us to initiate a push or pull.
2332 */
2333static void
2334prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2335{
2336 if (!task_on_rq_queued(p))
2337 return;
2338
2339 if (rq->curr == p) {
2340#ifdef CONFIG_SMP
2341 /*
2342 * If our priority decreases while running, we
2343 * may need to pull tasks to this runqueue.
2344 */
2345 if (oldprio < p->prio)
2346 rt_queue_pull_task(rq);
2347
2348 /*
2349 * If there's a higher priority task waiting to run
2350 * then reschedule.
2351 */
2352 if (p->prio > rq->rt.highest_prio.curr)
2353 resched_curr(rq);
2354#else
2355 /* For UP simply resched on drop of prio */
2356 if (oldprio < p->prio)
2357 resched_curr(rq);
2358#endif /* CONFIG_SMP */
2359 } else {
2360 /*
2361		 * This task is not running, but if its priority
2362		 * is higher than that of the currently running task,
2363		 * then reschedule.
2364 */
2365 if (p->prio < rq->curr->prio)
2366 resched_curr(rq);
2367 }
2368}
2369
2370#ifdef CONFIG_POSIX_TIMERS
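/*
 * Per-tick RLIMIT_RTTIME policing: p->rt.timeout counts the scheduler
 * ticks this task has received as an RT task, and once that count passes
 * the soft limit (converted from microseconds to ticks below) the
 * posix-cputimers code is asked to fire the RT watchdog. The actual
 * signal delivery (SIGXCPU, and eventually SIGKILL at the hard limit) is
 * assumed to happen in the posix-timers code, not here.
 */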
2371static void watchdog(struct rq *rq, struct task_struct *p)
2372{
2373 unsigned long soft, hard;
2374
2375 /* max may change after cur was read, this will be fixed next tick */
2376 soft = task_rlimit(p, RLIMIT_RTTIME);
2377 hard = task_rlimit_max(p, RLIMIT_RTTIME);
2378
2379 if (soft != RLIM_INFINITY) {
2380 unsigned long next;
2381
2382 if (p->rt.watchdog_stamp != jiffies) {
2383 p->rt.timeout++;
2384 p->rt.watchdog_stamp = jiffies;
2385 }
2386
2387 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
David Brazdil0f672f62019-12-10 10:32:29 +00002388 if (p->rt.timeout > next) {
2389 posix_cputimers_rt_watchdog(&p->posix_cputimers,
2390 p->se.sum_exec_runtime);
2391 }
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002392 }
2393}
2394#else
2395static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2396#endif
2397
2398/*
2399 * scheduler tick hitting a task of our scheduling class.
2400 *
2401 * NOTE: This function can be called remotely by the tick offload that
2402 * goes along full dynticks. Therefore no local assumption can be made
2403 * and everything must be accessed through the @rq and @curr passed in
2404 * parameters.
2405 */
2406static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2407{
2408 struct sched_rt_entity *rt_se = &p->rt;
2409
2410 update_curr_rt(rq);
David Brazdil0f672f62019-12-10 10:32:29 +00002411 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002412
2413 watchdog(rq, p);
2414
2415 /*
2416 * RR tasks need a special form of timeslice management.
2417 * FIFO tasks have no timeslices.
2418 */
2419 if (p->policy != SCHED_RR)
2420 return;
2421
2422 if (--p->rt.time_slice)
2423 return;
2424
2425 p->rt.time_slice = sched_rr_timeslice;
2426
2427 /*
2428	 * Requeue to the end of the queue if we (and all of our ancestors) are not
2429 * the only element on the queue
2430 */
2431 for_each_sched_rt_entity(rt_se) {
2432 if (rt_se->run_list.prev != rt_se->run_list.next) {
2433 requeue_task_rt(rq, p, 0);
2434 resched_curr(rq);
2435 return;
2436 }
2437 }
2438}
2439
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002440static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2441{
2442 /*
2443 * Time slice is 0 for SCHED_FIFO tasks
2444 */
2445 if (task->policy == SCHED_RR)
2446 return sched_rr_timeslice;
2447 else
2448 return 0;
2449}
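/*
 * Illustrative note: this hook backs the sched_rr_get_interval() system
 * call, so user space sees something like the sketch below (the ~100ms
 * figure assumes the default RR_TIMESLICE and an untouched
 * sched_rr_timeslice_ms sysctl):
 *
 *	struct timespec ts;
 *	sched_rr_get_interval(0, &ts);	/* ~100ms for SCHED_RR, 0 for SCHED_FIFO */
 */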
2450
Olivier Deprez157378f2022-04-04 15:47:50 +02002451const struct sched_class rt_sched_class
2452 __section("__rt_sched_class") = {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002453 .enqueue_task = enqueue_task_rt,
2454 .dequeue_task = dequeue_task_rt,
2455 .yield_task = yield_task_rt,
2456
2457 .check_preempt_curr = check_preempt_curr_rt,
2458
2459 .pick_next_task = pick_next_task_rt,
2460 .put_prev_task = put_prev_task_rt,
David Brazdil0f672f62019-12-10 10:32:29 +00002461 .set_next_task = set_next_task_rt,
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002462
2463#ifdef CONFIG_SMP
David Brazdil0f672f62019-12-10 10:32:29 +00002464 .balance = balance_rt,
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002465 .select_task_rq = select_task_rq_rt,
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002466 .set_cpus_allowed = set_cpus_allowed_common,
2467 .rq_online = rq_online_rt,
2468 .rq_offline = rq_offline_rt,
2469 .task_woken = task_woken_rt,
2470 .switched_from = switched_from_rt,
2471#endif
2472
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002473 .task_tick = task_tick_rt,
2474
2475 .get_rr_interval = get_rr_interval_rt,
2476
2477 .prio_changed = prio_changed_rt,
2478 .switched_to = switched_to_rt,
2479
2480 .update_curr = update_curr_rt,
David Brazdil0f672f62019-12-10 10:32:29 +00002481
2482#ifdef CONFIG_UCLAMP_TASK
2483 .uclamp_enabled = 1,
2484#endif
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002485};
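/*
 * Placing the class in its own __rt_sched_class section (next to the
 * other classes' sections in the linker script) is what lets the core
 * scheduler walk the classes in priority order; that ordering is a
 * property of the linker script rather than of this file.
 */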
2486
2487#ifdef CONFIG_RT_GROUP_SCHED
2488/*
2489 * Ensure that the real time constraints are schedulable.
2490 */
2491static DEFINE_MUTEX(rt_constraints_mutex);
2492
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002493static inline int tg_has_rt_tasks(struct task_group *tg)
2494{
Olivier Deprez157378f2022-04-04 15:47:50 +02002495 struct task_struct *task;
2496 struct css_task_iter it;
2497 int ret = 0;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002498
2499 /*
2500 * Autogroups do not have RT tasks; see autogroup_create().
2501 */
2502 if (task_group_is_autogroup(tg))
2503 return 0;
2504
Olivier Deprez157378f2022-04-04 15:47:50 +02002505 css_task_iter_start(&tg->css, 0, &it);
2506 while (!ret && (task = css_task_iter_next(&it)))
2507 ret |= rt_task(task);
2508 css_task_iter_end(&it);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002509
Olivier Deprez157378f2022-04-04 15:47:50 +02002510 return ret;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002511}
2512
2513struct rt_schedulable_data {
2514 struct task_group *tg;
2515 u64 rt_period;
2516 u64 rt_runtime;
2517};
2518
2519static int tg_rt_schedulable(struct task_group *tg, void *data)
2520{
2521 struct rt_schedulable_data *d = data;
2522 struct task_group *child;
2523 unsigned long total, sum = 0;
2524 u64 period, runtime;
2525
2526 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2527 runtime = tg->rt_bandwidth.rt_runtime;
2528
2529 if (tg == d->tg) {
2530 period = d->rt_period;
2531 runtime = d->rt_runtime;
2532 }
2533
2534 /*
2535 * Cannot have more runtime than the period.
2536 */
2537 if (runtime > period && runtime != RUNTIME_INF)
2538 return -EINVAL;
2539
2540 /*
Olivier Deprez157378f2022-04-04 15:47:50 +02002541 * Ensure we don't starve existing RT tasks if runtime turns zero.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002542 */
Olivier Deprez157378f2022-04-04 15:47:50 +02002543 if (rt_bandwidth_enabled() && !runtime &&
2544 tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002545 return -EBUSY;
2546
2547 total = to_ratio(period, runtime);
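	/*
	 * Worked example (assuming the usual BW_SHIFT of 20, i.e.
	 * BW_UNIT == 1 << 20): with the default global limits of
	 * period = 1s and runtime = 0.95s, to_ratio() returns roughly
	 * 0.95 * 2^20 ~= 996147, so ratios are plain fixed-point
	 * fractions that can be summed and compared directly below.
	 */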
2548
2549 /*
2550 * Nobody can have more than the global setting allows.
2551 */
2552 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2553 return -EINVAL;
2554
2555 /*
2556 * The sum of our children's runtime should not exceed our own.
2557 */
2558 list_for_each_entry_rcu(child, &tg->children, siblings) {
2559 period = ktime_to_ns(child->rt_bandwidth.rt_period);
2560 runtime = child->rt_bandwidth.rt_runtime;
2561
2562 if (child == d->tg) {
2563 period = d->rt_period;
2564 runtime = d->rt_runtime;
2565 }
2566
2567 sum += to_ratio(period, runtime);
2568 }
2569
2570 if (sum > total)
2571 return -EINVAL;
2572
2573 return 0;
2574}
2575
2576static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2577{
2578 int ret;
2579
2580 struct rt_schedulable_data data = {
2581 .tg = tg,
2582 .rt_period = period,
2583 .rt_runtime = runtime,
2584 };
2585
2586 rcu_read_lock();
2587 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2588 rcu_read_unlock();
2589
2590 return ret;
2591}
2592
2593static int tg_set_rt_bandwidth(struct task_group *tg,
2594 u64 rt_period, u64 rt_runtime)
2595{
2596 int i, err = 0;
2597
2598 /*
2599	 * Disallowing the root group's RT runtime is BAD; it would prevent the
2600	 * kernel from creating (and/or operating) RT threads.
2601 */
2602 if (tg == &root_task_group && rt_runtime == 0)
2603 return -EINVAL;
2604
2605	/* A zero period doesn't make any sense. */
2606 if (rt_period == 0)
2607 return -EINVAL;
2608
Olivier Deprez0e641232021-09-23 10:07:05 +02002609 /*
2610	 * Bound the quota to defend against overflow during the bandwidth shift.
2611 */
2612 if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
2613 return -EINVAL;
2614
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002615 mutex_lock(&rt_constraints_mutex);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002616 err = __rt_schedulable(tg, rt_period, rt_runtime);
2617 if (err)
2618 goto unlock;
2619
2620 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2621 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2622 tg->rt_bandwidth.rt_runtime = rt_runtime;
2623
2624 for_each_possible_cpu(i) {
2625 struct rt_rq *rt_rq = tg->rt_rq[i];
2626
2627 raw_spin_lock(&rt_rq->rt_runtime_lock);
2628 rt_rq->rt_runtime = rt_runtime;
2629 raw_spin_unlock(&rt_rq->rt_runtime_lock);
2630 }
2631 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2632unlock:
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002633 mutex_unlock(&rt_constraints_mutex);
2634
2635 return err;
2636}
2637
2638int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2639{
2640 u64 rt_runtime, rt_period;
2641
2642 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2643 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2644 if (rt_runtime_us < 0)
2645 rt_runtime = RUNTIME_INF;
David Brazdil0f672f62019-12-10 10:32:29 +00002646 else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
2647 return -EINVAL;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002648
2649 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2650}
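/*
 * Sketch of the cgroup v1 interface that ends up here (assuming the cpu
 * controller is mounted at /sys/fs/cgroup/cpu):
 *
 *	# give group "rtgrp" 300ms of RT runtime per (default 1s) period
 *	echo 300000 > /sys/fs/cgroup/cpu/rtgrp/cpu.rt_runtime_us
 *
 * A negative value maps to RUNTIME_INF (no group limit), and values that
 * would overflow the bandwidth shift are rejected above.
 */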
2651
2652long sched_group_rt_runtime(struct task_group *tg)
2653{
2654 u64 rt_runtime_us;
2655
2656 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
2657 return -1;
2658
2659 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
2660 do_div(rt_runtime_us, NSEC_PER_USEC);
2661 return rt_runtime_us;
2662}
2663
2664int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2665{
2666 u64 rt_runtime, rt_period;
2667
David Brazdil0f672f62019-12-10 10:32:29 +00002668 if (rt_period_us > U64_MAX / NSEC_PER_USEC)
2669 return -EINVAL;
2670
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002671 rt_period = rt_period_us * NSEC_PER_USEC;
2672 rt_runtime = tg->rt_bandwidth.rt_runtime;
2673
2674 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2675}
2676
2677long sched_group_rt_period(struct task_group *tg)
2678{
2679 u64 rt_period_us;
2680
2681 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
2682 do_div(rt_period_us, NSEC_PER_USEC);
2683 return rt_period_us;
2684}
2685
2686static int sched_rt_global_constraints(void)
2687{
2688 int ret = 0;
2689
2690 mutex_lock(&rt_constraints_mutex);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002691 ret = __rt_schedulable(NULL, 0, 0);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002692 mutex_unlock(&rt_constraints_mutex);
2693
2694 return ret;
2695}
2696
2697int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
2698{
2699 /* Don't accept realtime tasks when there is no way for them to run */
2700 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
2701 return 0;
2702
2703 return 1;
2704}
2705
2706#else /* !CONFIG_RT_GROUP_SCHED */
2707static int sched_rt_global_constraints(void)
2708{
2709 unsigned long flags;
2710 int i;
2711
2712 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
2713 for_each_possible_cpu(i) {
2714 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
2715
2716 raw_spin_lock(&rt_rq->rt_runtime_lock);
2717 rt_rq->rt_runtime = global_rt_runtime();
2718 raw_spin_unlock(&rt_rq->rt_runtime_lock);
2719 }
2720 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
2721
2722 return 0;
2723}
2724#endif /* CONFIG_RT_GROUP_SCHED */
2725
2726static int sched_rt_global_validate(void)
2727{
2728 if (sysctl_sched_rt_period <= 0)
2729 return -EINVAL;
2730
2731 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
Olivier Deprez0e641232021-09-23 10:07:05 +02002732 ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
2733 ((u64)sysctl_sched_rt_runtime *
2734 NSEC_PER_USEC > max_rt_runtime)))
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002735 return -EINVAL;
2736
2737 return 0;
2738}
2739
2740static void sched_rt_do_global(void)
2741{
Olivier Deprez157378f2022-04-04 15:47:50 +02002742 unsigned long flags;
2743
2744 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002745 def_rt_bandwidth.rt_runtime = global_rt_runtime();
2746 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
Olivier Deprez157378f2022-04-04 15:47:50 +02002747 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002748}
2749
Olivier Deprez157378f2022-04-04 15:47:50 +02002750int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
2751 size_t *lenp, loff_t *ppos)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002752{
2753 int old_period, old_runtime;
2754 static DEFINE_MUTEX(mutex);
2755 int ret;
2756
2757 mutex_lock(&mutex);
2758 old_period = sysctl_sched_rt_period;
2759 old_runtime = sysctl_sched_rt_runtime;
2760
2761 ret = proc_dointvec(table, write, buffer, lenp, ppos);
2762
2763 if (!ret && write) {
2764 ret = sched_rt_global_validate();
2765 if (ret)
2766 goto undo;
2767
2768 ret = sched_dl_global_validate();
2769 if (ret)
2770 goto undo;
2771
2772 ret = sched_rt_global_constraints();
2773 if (ret)
2774 goto undo;
2775
2776 sched_rt_do_global();
2777 sched_dl_do_global();
2778 }
2779 if (0) {
2780undo:
2781 sysctl_sched_rt_period = old_period;
2782 sysctl_sched_rt_runtime = old_runtime;
2783 }
2784 mutex_unlock(&mutex);
2785
2786 return ret;
2787}
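/*
 * Sketch of how this handler is reached (sysctl paths assumed from the
 * usual kernel sysctl table): writes to /proc/sys/kernel/sched_rt_period_us
 * and /proc/sys/kernel/sched_rt_runtime_us land here, e.g.
 *
 *	# the usual defaults: RT tasks may use 950ms out of every 1s
 *	echo 1000000 > /proc/sys/kernel/sched_rt_period_us
 *	echo 950000  > /proc/sys/kernel/sched_rt_runtime_us
 *
 * Both the RT and deadline bandwidth are re-validated before the new
 * values take effect; on failure the old values are restored.
 */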
2788
Olivier Deprez157378f2022-04-04 15:47:50 +02002789int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
2790 size_t *lenp, loff_t *ppos)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002791{
2792 int ret;
2793 static DEFINE_MUTEX(mutex);
2794
2795 mutex_lock(&mutex);
2796 ret = proc_dointvec(table, write, buffer, lenp, ppos);
2797 /*
2798 * Make sure that internally we keep jiffies.
2799	 * Also, writing zero resets the timeslice to the default:
2800 */
2801 if (!ret && write) {
2802 sched_rr_timeslice =
2803 sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
2804 msecs_to_jiffies(sysctl_sched_rr_timeslice);
2805 }
2806 mutex_unlock(&mutex);
2807
2808 return ret;
2809}
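/*
 * Similarly, /proc/sys/kernel/sched_rr_timeslice_ms feeds this handler
 * (path assumed from the usual sysctl table), for example:
 *
 *	echo 50 > /proc/sys/kernel/sched_rr_timeslice_ms	# 50ms RR slices
 *	echo 0  > /proc/sys/kernel/sched_rr_timeslice_ms	# back to RR_TIMESLICE
 *
 * Internally the value is kept in jiffies, so the effective slice is
 * rounded to the timer tick.
 */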
2810
2811#ifdef CONFIG_SCHED_DEBUG
2812void print_rt_stats(struct seq_file *m, int cpu)
2813{
2814 rt_rq_iter_t iter;
2815 struct rt_rq *rt_rq;
2816
2817 rcu_read_lock();
2818 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
2819 print_rt_rq(m, cpu, rt_rq);
2820 rcu_read_unlock();
2821}
2822#endif /* CONFIG_SCHED_DEBUG */