// SPDX-License-Identifier: GPL-2.0
/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */
#include "sched.h"

#include "pelt.h"

int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

struct rt_bandwidth def_rt_bandwidth;

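/*
 * Bandwidth period timer: each activation replenishes the serviced
 * runqueues' runtime budget via do_sched_rt_period_timer() and keeps
 * re-arming itself until every runqueue it serves reports idle.
 */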
static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b =
		container_of(timer, struct rt_bandwidth, rt_period_timer);
	int idle = 0;
	int overrun;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	for (;;) {
		overrun = hrtimer_forward_now(timer, rt_b->rt_period);
		if (!overrun)
			break;

		raw_spin_unlock(&rt_b->rt_runtime_lock);
		idle = do_sched_rt_period_timer(rt_b, overrun);
		raw_spin_lock(&rt_b->rt_runtime_lock);
	}
	if (idle)
		rt_b->rt_period_active = 0;
	raw_spin_unlock(&rt_b->rt_runtime_lock);

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	raw_spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_REL_HARD);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
}

static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	raw_spin_lock(&rt_b->rt_runtime_lock);
	if (!rt_b->rt_period_active) {
		rt_b->rt_period_active = 1;
		/*
		 * SCHED_DEADLINE updates the bandwidth, as a runaway
		 * RT task with a DL task could hog a CPU. But DL does
		 * not reset the period. If a deadline task was running
		 * without an RT task running, it can cause RT tasks to
		 * throttle when they start up. Kick the timer right away
		 * to update the period.
		 */
		hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
		hrtimer_start_expires(&rt_b->rt_period_timer,
				      HRTIMER_MODE_ABS_PINNED_HARD);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

	do_start_rt_bandwidth(rt_b);
}

void init_rt_rq(struct rt_rq *rt_rq)
{
	struct rt_prio_array *array;
	int i;

	array = &rt_rq->active;
	for (i = 0; i < MAX_RT_PRIO; i++) {
		INIT_LIST_HEAD(array->queue + i);
		__clear_bit(i, array->bitmap);
	}
	/* delimiter for bitsearch: */
	__set_bit(MAX_RT_PRIO, array->bitmap);

#if defined CONFIG_SMP
	rt_rq->highest_prio.curr = MAX_RT_PRIO;
	rt_rq->highest_prio.next = MAX_RT_PRIO;
	rt_rq->rt_nr_migratory = 0;
	rt_rq->overloaded = 0;
	plist_head_init(&rt_rq->pushable_tasks);
#endif /* CONFIG_SMP */
	/* We start in dequeued state, because no RT tasks are queued */
	rt_rq->rt_queued = 0;

	rt_rq->rt_time = 0;
	rt_rq->rt_throttled = 0;
	rt_rq->rt_runtime = 0;
	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}

#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
#endif
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return rt_rq->rq;
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	return rt_se->rt_rq;
}

static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = rt_se->rt_rq;

	return rt_rq->rq;
}

void free_rt_sched_group(struct task_group *tg)
{
	int i;

	if (tg->rt_se)
		destroy_rt_bandwidth(&tg->rt_bandwidth);

	for_each_possible_cpu(i) {
		if (tg->rt_rq)
			kfree(tg->rt_rq[i]);
		if (tg->rt_se)
			kfree(tg->rt_se[i]);
	}

	kfree(tg->rt_rq);
	kfree(tg->rt_se);
}

void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
		struct sched_rt_entity *rt_se, int cpu,
		struct sched_rt_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	rt_rq->highest_prio.curr = MAX_RT_PRIO;
	rt_rq->rt_nr_boosted = 0;
	rt_rq->rq = rq;
	rt_rq->tg = tg;

	tg->rt_rq[cpu] = rt_rq;
	tg->rt_se[cpu] = rt_se;

	if (!rt_se)
		return;

	if (!parent)
		rt_se->rt_rq = &rq->rt;
	else
		rt_se->rt_rq = parent->my_q;

	rt_se->my_q = rt_rq;
	rt_se->parent = parent;
	INIT_LIST_HEAD(&rt_se->run_list);
}

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct rt_rq *rt_rq;
	struct sched_rt_entity *rt_se;
	int i;

	tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
	if (!tg->rt_rq)
		goto err;
	tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
	if (!tg->rt_se)
		goto err;

	init_rt_bandwidth(&tg->rt_bandwidth,
			ktime_to_ns(def_rt_bandwidth.rt_period), 0);

	for_each_possible_cpu(i) {
		rt_rq = kzalloc_node(sizeof(struct rt_rq),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_rq)
			goto err;

		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_se)
			goto err_free_rq;

		init_rt_rq(rt_rq);
		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
	}

	return 1;

err_free_rq:
	kfree(rt_rq);
err:
	return 0;
}

#else /* CONFIG_RT_GROUP_SCHED */

#define rt_entity_is_task(rt_se) (1)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return container_of(rt_rq, struct rq, rt);
}

static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
	struct task_struct *p = rt_task_of(rt_se);

	return task_rq(p);
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	return &rq->rt;
}

void free_rt_sched_group(struct task_group *tg) { }

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SMP

static void pull_rt_task(struct rq *this_rq);

static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
	/* Try to pull RT tasks here if we lower this rq's prio */
	return rq->rt.highest_prio.curr > prev->prio;
}

static inline int rt_overloaded(struct rq *rq)
{
	return atomic_read(&rq->rd->rto_count);
}

static inline void rt_set_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
	/*
	 * Make sure the mask is visible before we set
	 * the overload count. That is checked to determine
	 * if we should look at the mask. It would be a shame
	 * if we looked at the mask, but the mask was not
	 * updated yet.
	 *
	 * Matched by the barrier in pull_rt_task().
	 */
	smp_wmb();
	atomic_inc(&rq->rd->rto_count);
}

static inline void rt_clear_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	/* the order here really doesn't matter */
	atomic_dec(&rq->rd->rto_count);
	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}

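/*
 * An rt_rq counts as "overloaded" only when it has more than one task
 * queued and at least one of them can migrate; only then is it worth
 * advertising this runqueue in the root domain's rto_mask so the
 * push/pull logic considers it.
 */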
static void update_rt_migration(struct rt_rq *rt_rq)
{
	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
		if (!rt_rq->overloaded) {
			rt_set_overload(rq_of_rt_rq(rt_rq));
			rt_rq->overloaded = 1;
		}
	} else if (rt_rq->overloaded) {
		rt_clear_overload(rq_of_rt_rq(rt_rq));
		rt_rq->overloaded = 0;
	}
}

static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	struct task_struct *p;

	if (!rt_entity_is_task(rt_se))
		return;

	p = rt_task_of(rt_se);
	rt_rq = &rq_of_rt_rq(rt_rq)->rt;

	rt_rq->rt_nr_total++;
	if (p->nr_cpus_allowed > 1)
		rt_rq->rt_nr_migratory++;

	update_rt_migration(rt_rq);
}

static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	struct task_struct *p;

	if (!rt_entity_is_task(rt_se))
		return;

	p = rt_task_of(rt_se);
	rt_rq = &rq_of_rt_rq(rt_rq)->rt;

	rt_rq->rt_nr_total--;
	if (p->nr_cpus_allowed > 1)
		rt_rq->rt_nr_migratory--;

	update_rt_migration(rt_rq);
}

static inline int has_pushable_tasks(struct rq *rq)
{
	return !plist_head_empty(&rq->rt.pushable_tasks);
}

static DEFINE_PER_CPU(struct callback_head, rt_push_head);
static DEFINE_PER_CPU(struct callback_head, rt_pull_head);

static void push_rt_tasks(struct rq *);
static void pull_rt_task(struct rq *);

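/*
 * Push/pull balancing is deferred to balance-callback context, where it
 * is safe to drop and re-take the rq lock.
 */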
static inline void rt_queue_push_tasks(struct rq *rq)
{
	if (!has_pushable_tasks(rq))
		return;

	queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
}

static inline void rt_queue_pull_task(struct rq *rq)
{
	queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
}

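/*
 * Pushable tasks live on a priority-sorted plist so the push logic can
 * always take the highest-priority migratable task first.
 */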
static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
	plist_node_init(&p->pushable_tasks, p->prio);
	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the highest prio pushable task */
	if (p->prio < rq->rt.highest_prio.next)
		rq->rt.highest_prio.next = p->prio;
}

static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the new highest prio pushable task */
	if (has_pushable_tasks(rq)) {
		p = plist_first_entry(&rq->rt.pushable_tasks,
				      struct task_struct, pushable_tasks);
		rq->rt.highest_prio.next = p->prio;
	} else
		rq->rt.highest_prio.next = MAX_RT_PRIO;
}

#else

static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline
void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

static inline
void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
	return false;
}

static inline void pull_rt_task(struct rq *this_rq)
{
}

static inline void rt_queue_push_tasks(struct rq *rq)
{
}
#endif /* CONFIG_SMP */

static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);

static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
	return rt_se->on_rq;
}

#ifdef CONFIG_UCLAMP_TASK
/*
 * Verify the fitness of task @p to run on @cpu taking into account the uclamp
 * settings.
 *
 * This check is only important for heterogeneous systems where the uclamp_min
 * value is higher than the capacity of a @cpu. For non-heterogeneous systems
 * this function will always return true.
 *
 * The function will return true if the capacity of the @cpu is >= the
 * uclamp_min and false otherwise.
 *
 * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
 * > uclamp_max.
 */
static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
	unsigned int min_cap;
	unsigned int max_cap;
	unsigned int cpu_cap;

	/* Only heterogeneous systems can benefit from this check */
	if (!static_branch_unlikely(&sched_asym_cpucapacity))
		return true;

	min_cap = uclamp_eff_value(p, UCLAMP_MIN);
	max_cap = uclamp_eff_value(p, UCLAMP_MAX);

	cpu_cap = capacity_orig_of(cpu);

	return cpu_cap >= min(min_cap, max_cap);
}
#else
static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
	return true;
}
#endif

#ifdef CONFIG_RT_GROUP_SCHED

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	if (!rt_rq->tg)
		return RUNTIME_INF;

	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
}

typedef struct task_group *rt_rq_iter_t;

static inline struct task_group *next_task_group(struct task_group *tg)
{
	do {
		tg = list_entry_rcu(tg->list.next,
			typeof(struct task_group), list);
	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));

	if (&tg->list == &task_groups)
		tg = NULL;

	return tg;
}

#define for_each_rt_rq(rt_rq, iter, rq)					\
	for (iter = container_of(&task_groups, typeof(*iter), list);	\
		(iter = next_task_group(iter)) &&			\
		(rt_rq = iter->rt_rq[cpu_of(rq)]);)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = rt_se->parent)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return rt_se->my_q;
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);

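/*
 * Enqueue this group's rt_rq through its per-CPU entity (or directly at
 * the top level when there is none) and preempt current if the group now
 * holds a higher-priority task.
 */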
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
	struct rq *rq = rq_of_rt_rq(rt_rq);
	struct sched_rt_entity *rt_se;

	int cpu = cpu_of(rq);

	rt_se = rt_rq->tg->rt_se[cpu];

	if (rt_rq->rt_nr_running) {
		if (!rt_se)
			enqueue_top_rt_rq(rt_rq);
		else if (!on_rt_rq(rt_se))
			enqueue_rt_entity(rt_se, 0);

		if (rt_rq->highest_prio.curr < curr->prio)
			resched_curr(rq);
	}
}

static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	struct sched_rt_entity *rt_se;
	int cpu = cpu_of(rq_of_rt_rq(rt_rq));

	rt_se = rt_rq->tg->rt_se[cpu];

	if (!rt_se) {
		dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
		/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
		cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
	} else if (on_rt_rq(rt_se))
		dequeue_rt_entity(rt_se, 0);
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
}

static int rt_se_boosted(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = group_rt_rq(rt_se);
	struct task_struct *p;

	if (rt_rq)
		return !!rt_rq->rt_nr_boosted;

	p = rt_task_of(rt_se);
	return p->prio != p->normal_prio;
}

#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return this_rq()->rd->span;
}
#else
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}
#endif

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &rt_rq->tg->rt_bandwidth;
}

#else /* !CONFIG_RT_GROUP_SCHED */

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(def_rt_bandwidth.rt_period);
}

typedef struct rt_rq *rt_rq_iter_t;

#define for_each_rt_rq(rt_rq, iter, rq) \
	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = NULL)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return NULL;
}

static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	if (!rt_rq->rt_nr_running)
		return;

	enqueue_top_rt_rq(rt_rq);
	resched_curr(rq);
}

static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled;
}

static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return &cpu_rq(cpu)->rt;
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &def_rt_bandwidth;
}

#endif /* CONFIG_RT_GROUP_SCHED */

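/*
 * Runtime keeps being accounted while the bandwidth period timer is
 * active or while there is still budget left in the current period.
 */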
bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

	return (hrtimer_active(&rt_b->rt_period_timer) ||
		rt_rq->rt_time < rt_b->rt_runtime);
}

#ifdef CONFIG_SMP
/*
 * We ran out of runtime, see if we can borrow some from our neighbours.
 */
static void do_balance_runtime(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
	int i, weight;
	u64 rt_period;

	weight = cpumask_weight(rd->span);

	raw_spin_lock(&rt_b->rt_runtime_lock);
	rt_period = ktime_to_ns(rt_b->rt_period);
	for_each_cpu(i, rd->span) {
		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
		s64 diff;

		if (iter == rt_rq)
			continue;

		raw_spin_lock(&iter->rt_runtime_lock);
		/*
		 * Either all rqs have inf runtime and there's nothing to steal
		 * or __disable_runtime() below sets a specific rq to inf to
		 * indicate it's been disabled and disallow stealing.
		 */
		if (iter->rt_runtime == RUNTIME_INF)
			goto next;

		/*
		 * From runqueues with spare time, take 1/n part of their
		 * spare time, but no more than our period.
		 */
		diff = iter->rt_runtime - iter->rt_time;
		if (diff > 0) {
			diff = div_u64((u64)diff, weight);
			if (rt_rq->rt_runtime + diff > rt_period)
				diff = rt_period - rt_rq->rt_runtime;
			iter->rt_runtime -= diff;
			rt_rq->rt_runtime += diff;
			if (rt_rq->rt_runtime == rt_period) {
				raw_spin_unlock(&iter->rt_runtime_lock);
				break;
			}
		}
next:
		raw_spin_unlock(&iter->rt_runtime_lock);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

/*
 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 */
static void __disable_runtime(struct rq *rq)
{
	struct root_domain *rd = rq->rd;
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
		s64 want;
		int i;

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * Either we're all inf and nobody needs to borrow, or we're
		 * already disabled and thus have nothing to do, or we have
		 * exactly the right amount of runtime to take out.
		 */
		if (rt_rq->rt_runtime == RUNTIME_INF ||
				rt_rq->rt_runtime == rt_b->rt_runtime)
			goto balanced;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);

		/*
		 * Calculate the difference between what we started out with
		 * and what we currently have; that's the amount of runtime
		 * we lent and now have to reclaim.
		 */
		want = rt_b->rt_runtime - rt_rq->rt_runtime;

		/*
		 * Greedy reclaim, take back as much as we can.
		 */
		for_each_cpu(i, rd->span) {
			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
			s64 diff;

			/*
			 * Can't reclaim from ourselves or disabled runqueues.
			 */
			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
				continue;

			raw_spin_lock(&iter->rt_runtime_lock);
			if (want > 0) {
				diff = min_t(s64, iter->rt_runtime, want);
				iter->rt_runtime -= diff;
				want -= diff;
			} else {
				iter->rt_runtime -= want;
				want -= want;
			}
			raw_spin_unlock(&iter->rt_runtime_lock);

			if (!want)
				break;
		}

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * We cannot be left wanting - that would mean some runtime
		 * leaked out of the system.
		 */
		BUG_ON(want);
balanced:
		/*
		 * Disable all the borrow logic by pretending we have inf
		 * runtime - in which case borrowing doesn't make sense.
		 */
		rt_rq->rt_runtime = RUNTIME_INF;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);

		/* Make rt_rq available for pick_next_task() */
		sched_rt_rq_enqueue(rt_rq);
	}
}

static void __enable_runtime(struct rq *rq)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	/*
	 * Reset each runqueue's bandwidth settings
	 */
	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_b->rt_runtime;
		rt_rq->rt_time = 0;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);
	}
}

static void balance_runtime(struct rt_rq *rt_rq)
{
	if (!sched_feat(RT_RUNTIME_SHARE))
		return;

	if (rt_rq->rt_time > rt_rq->rt_runtime) {
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		do_balance_runtime(rt_rq);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
	}
}
#else /* !CONFIG_SMP */
static inline void balance_runtime(struct rt_rq *rt_rq) {}
#endif /* CONFIG_SMP */

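/*
 * Runs from the bandwidth period timer: refill rt_time budgets on every
 * runqueue in the period mask and re-enqueue the ones that are no longer
 * throttled. Returns 1 when all serviced runqueues are idle, so the
 * timer can stop re-arming.
 */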
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
	int i, idle = 1, throttled = 0;
	const struct cpumask *span;

	span = sched_rt_period_mask();
#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * FIXME: isolated CPUs should really leave the root task group,
	 * whether they are isolcpus or were isolated via cpusets, lest
	 * the timer run on a CPU which does not service all runqueues,
	 * potentially leaving other CPUs indefinitely throttled. If
	 * isolation is really required, the user will turn the throttle
	 * off to kill the perturbations it causes anyway. Meanwhile,
	 * this maintains functionality for boot and/or troubleshooting.
	 */
	if (rt_b == &root_task_group.rt_bandwidth)
		span = cpu_online_mask;
#endif
	for_each_cpu(i, span) {
		int enqueue = 0;
		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
		struct rq *rq = rq_of_rt_rq(rt_rq);
		int skip;

		/*
		 * When span == cpu_online_mask, taking each rq->lock
		 * can be time-consuming. Try to avoid it when possible.
		 */
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
			rt_rq->rt_runtime = rt_b->rt_runtime;
		skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		if (skip)
			continue;

		raw_spin_lock(&rq->lock);
		update_rq_clock(rq);

		if (rt_rq->rt_time) {
			u64 runtime;

			raw_spin_lock(&rt_rq->rt_runtime_lock);
			if (rt_rq->rt_throttled)
				balance_runtime(rt_rq);
			runtime = rt_rq->rt_runtime;
			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
				rt_rq->rt_throttled = 0;
				enqueue = 1;

				/*
				 * When we're idle and a woken (rt) task is
				 * throttled, check_preempt_curr() will set
				 * skip_update and the time between the wakeup
				 * and this unthrottle will get accounted as
				 * 'runtime'.
				 */
				if (rt_rq->rt_nr_running && rq->curr == rq->idle)
					rq_clock_cancel_skipupdate(rq);
			}
			if (rt_rq->rt_time || rt_rq->rt_nr_running)
				idle = 0;
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		} else if (rt_rq->rt_nr_running) {
			idle = 0;
			if (!rt_rq_throttled(rt_rq))
				enqueue = 1;
		}
		if (rt_rq->rt_throttled)
			throttled = 1;

		if (enqueue)
			sched_rt_rq_enqueue(rt_rq);
		raw_spin_unlock(&rq->lock);
	}

	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
		return 1;

	return idle;
}

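/*
 * Effective priority of an entity: a group entity inherits the highest
 * priority among the entities queued below it.
 */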
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
	struct rt_rq *rt_rq = group_rt_rq(rt_se);

	if (rt_rq)
		return rt_rq->highest_prio.curr;
#endif

	return rt_task_of(rt_se)->prio;
}

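/*
 * Charge accumulated execution time against the budget; returns 1 when
 * this rt_rq has exhausted its runtime and must be throttled (dequeued)
 * until the period timer replenishes it.
 */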
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
	u64 runtime = sched_rt_runtime(rt_rq);

	if (rt_rq->rt_throttled)
		return rt_rq_throttled(rt_rq);

	if (runtime >= sched_rt_period(rt_rq))
		return 0;

	balance_runtime(rt_rq);
	runtime = sched_rt_runtime(rt_rq);
	if (runtime == RUNTIME_INF)
		return 0;

	if (rt_rq->rt_time > runtime) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		/*
		 * Don't actually throttle groups that have no runtime assigned
		 * but accrue some time due to boosting.
		 */
		if (likely(rt_b->rt_runtime)) {
			rt_rq->rt_throttled = 1;
			printk_deferred_once("sched: RT throttling activated\n");
		} else {
			/*
			 * In case we did anyway, make it go away;
			 * replenishment is a joke, since it would replenish us
			 * with exactly 0 ns.
			 */
			rt_rq->rt_time = 0;
		}

		if (rt_rq_throttled(rt_rq)) {
			sched_rt_rq_dequeue(rt_rq);
			return 1;
		}
	}

	return 0;
}

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct sched_rt_entity *rt_se = &curr->rt;
	u64 delta_exec;
	u64 now;

	if (curr->sched_class != &rt_sched_class)
		return;

	now = rq_clock_task(rq);
	delta_exec = now - curr->se.exec_start;
	if (unlikely((s64)delta_exec <= 0))
		return;

	schedstat_set(curr->se.statistics.exec_max,
		      max(curr->se.statistics.exec_max, delta_exec));

	curr->se.sum_exec_runtime += delta_exec;
	account_group_exec_runtime(curr, delta_exec);

	curr->se.exec_start = now;
	cgroup_account_cputime(curr, delta_exec);

	if (!rt_bandwidth_enabled())
		return;

	for_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
		int exceeded;

		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
			raw_spin_lock(&rt_rq->rt_runtime_lock);
			rt_rq->rt_time += delta_exec;
			exceeded = sched_rt_runtime_exceeded(rt_rq);
			if (exceeded)
				resched_curr(rq);
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
			if (exceeded)
				do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
		}
	}
}

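/*
 * Only the top-level rt_rq contributes to rq->nr_running; group rt_rqs
 * are accounted through it. These helpers (de)account those tasks when
 * the top-level rt_rq itself is (de)queued.
 */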
static void
dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	BUG_ON(&rq->rt != rt_rq);

	if (!rt_rq->rt_queued)
		return;

	BUG_ON(!rq->nr_running);

	sub_nr_running(rq, count);
	rt_rq->rt_queued = 0;
}

static void
enqueue_top_rt_rq(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	BUG_ON(&rq->rt != rt_rq);

	if (rt_rq->rt_queued)
		return;

	if (rt_rq_throttled(rt_rq))
		return;

	if (rt_rq->rt_nr_running) {
		add_nr_running(rq, rt_rq->rt_nr_running);
		rt_rq->rt_queued = 1;
	}

	/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
	cpufreq_update_util(rq, 0);
}

#if defined CONFIG_SMP

static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * Change rq's cpupri only if rt_rq is the top queue.
	 */
	if (&rq->rt != rt_rq)
		return;
#endif
	if (rq->online && prio < prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}

static void
dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * Change rq's cpupri only if rt_rq is the top queue.
	 */
	if (&rq->rt != rt_rq)
		return;
#endif
	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}

#else /* CONFIG_SMP */

static inline
void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
static inline
void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}

#endif /* CONFIG_SMP */

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (prio < prev_prio)
		rt_rq->highest_prio.curr = prio;

	inc_rt_prio_smp(rt_rq, prio, prev_prio);
}

static void
dec_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (rt_rq->rt_nr_running) {

		WARN_ON(prio < prev_prio);

		/*
		 * This may have been our highest task, and therefore
		 * we may have some recomputation to do
		 */
		if (prio == prev_prio) {
			struct rt_prio_array *array = &rt_rq->active;

			rt_rq->highest_prio.curr =
				sched_find_first_bit(array->bitmap);
		}

	} else
		rt_rq->highest_prio.curr = MAX_RT_PRIO;

	dec_rt_prio_smp(rt_rq, prio, prev_prio);
}

#else

static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}

#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted++;

	if (rt_rq->tg)
		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
}

static void
dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted--;

	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
}

#else /* CONFIG_RT_GROUP_SCHED */

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	start_rt_bandwidth(&def_rt_bandwidth);
}

static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}

#endif /* CONFIG_RT_GROUP_SCHED */

static inline
unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
{
	struct rt_rq *group_rq = group_rt_rq(rt_se);

	if (group_rq)
		return group_rq->rt_nr_running;
	else
		return 1;
}

static inline
unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
{
	struct rt_rq *group_rq = group_rt_rq(rt_se);
	struct task_struct *tsk;

	if (group_rq)
		return group_rq->rr_nr_running;

	tsk = rt_task_of(rt_se);

	return (tsk->policy == SCHED_RR) ? 1 : 0;
}

static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	int prio = rt_se_prio(rt_se);

	WARN_ON(!rt_prio(prio));
	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);

	inc_rt_prio(rt_rq, prio);
	inc_rt_migration(rt_se, rt_rq);
	inc_rt_group(rt_se, rt_rq);
}

static inline
void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
	WARN_ON(!rt_rq->rt_nr_running);
	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);

	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
	dec_rt_migration(rt_se, rt_rq);
	dec_rt_group(rt_se, rt_rq);
}

/*
 * Change rt_se->run_list location unless SAVE && !MOVE
 *
 * assumes ENQUEUE/DEQUEUE flags match
 */
static inline bool move_entity(unsigned int flags)
{
	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
		return false;

	return true;
}

static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
{
	list_del_init(&rt_se->run_list);

	if (list_empty(array->queue + rt_se_prio(rt_se)))
		__clear_bit(rt_se_prio(rt_se), array->bitmap);

	rt_se->on_list = 0;
}

static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;
	struct rt_rq *group_rq = group_rt_rq(rt_se);
	struct list_head *queue = array->queue + rt_se_prio(rt_se);

	/*
	 * Don't enqueue the group if it's throttled, or when empty.
	 * The latter is a consequence of the former when a child group
	 * gets throttled and the current group doesn't have any other
	 * active members.
	 */
	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
		if (rt_se->on_list)
			__delist_rt_entity(rt_se, array);
		return;
	}

	if (move_entity(flags)) {
		WARN_ON_ONCE(rt_se->on_list);
		if (flags & ENQUEUE_HEAD)
			list_add(&rt_se->run_list, queue);
		else
			list_add_tail(&rt_se->run_list, queue);

		__set_bit(rt_se_prio(rt_se), array->bitmap);
		rt_se->on_list = 1;
	}
	rt_se->on_rq = 1;

	inc_rt_tasks(rt_se, rt_rq);
}

static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;

	if (move_entity(flags)) {
		WARN_ON_ONCE(!rt_se->on_list);
		__delist_rt_entity(rt_se, array);
	}
	rt_se->on_rq = 0;

	dec_rt_tasks(rt_se, rt_rq);
}

/*
 * Because the prio of an upper entry depends on the lower
 * entries, we must remove entries top-down.
 */
static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct sched_rt_entity *back = NULL;
	unsigned int rt_nr_running;

	for_each_sched_rt_entity(rt_se) {
		rt_se->back = back;
		back = rt_se;
	}

	rt_nr_running = rt_rq_of_se(back)->rt_nr_running;

	for (rt_se = back; rt_se; rt_se = rt_se->back) {
		if (on_rt_rq(rt_se))
			__dequeue_rt_entity(rt_se, flags);
	}

	dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	dequeue_rt_stack(rt_se, flags);
	for_each_sched_rt_entity(rt_se)
		__enqueue_rt_entity(rt_se, flags);
	enqueue_top_rt_rq(&rq->rt);
}

static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	dequeue_rt_stack(rt_se, flags);

	for_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = group_rt_rq(rt_se);

		if (rt_rq && rt_rq->rt_nr_running)
			__enqueue_rt_entity(rt_se, flags);
	}
	enqueue_top_rt_rq(&rq->rt);
}

/*
 * Adding/removing a task to/from a priority array:
 */
static void
enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	if (flags & ENQUEUE_WAKEUP)
		rt_se->timeout = 0;

	enqueue_rt_entity(rt_se, flags);

	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);
}

static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);
	dequeue_rt_entity(rt_se, flags);

	dequeue_pushable_task(rq, p);
}

/*
 * Put task to the head or the end of the run list without the overhead of
 * dequeue followed by enqueue.
 */
static void
requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
{
	if (on_rt_rq(rt_se)) {
		struct rt_prio_array *array = &rt_rq->active;
		struct list_head *queue = array->queue + rt_se_prio(rt_se);

		if (head)
			list_move(&rt_se->run_list, queue);
		else
			list_move_tail(&rt_se->run_list, queue);
	}
}

static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
{
	struct sched_rt_entity *rt_se = &p->rt;
	struct rt_rq *rt_rq;

	for_each_sched_rt_entity(rt_se) {
		rt_rq = rt_rq_of_se(rt_se);
		requeue_rt_entity(rt_rq, rt_se, head);
	}
}

static void yield_task_rt(struct rq *rq)
{
	requeue_task_rt(rq, rq->curr, 0);
}

#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);

static int
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
{
	struct task_struct *curr;
	struct rq *rq;
	bool test;

	/* For anything but wake ups, just return the task_cpu */
	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
		goto out;

	rq = cpu_rq(cpu);

	rcu_read_lock();
	curr = READ_ONCE(rq->curr); /* unlocked access */

	/*
	 * If the current task on @p's runqueue is an RT task, then
	 * try to see if we can wake this RT task up on another
	 * runqueue. Otherwise simply start this RT task
	 * on its current runqueue.
	 *
	 * We want to avoid overloading runqueues. If the woken
	 * task is a higher priority, then it will stay on this CPU
	 * and the lower prio task should be moved to another CPU.
	 * Even though this will probably make the lower prio task
	 * lose its cache, we do not want to bounce a higher task
	 * around just because it gave up its CPU, perhaps for a
	 * lock?
	 *
	 * For equal prio tasks, we just let the scheduler sort it out.
	 *
	 * Otherwise, just let it ride on the affined RQ and the
	 * post-schedule router will push the preempted task away
	 *
	 * This test is optimistic, if we get it wrong the load-balancer
	 * will have to sort it out.
	 *
	 * We take into account the capacity of the CPU to ensure it fits the
	 * requirement of the task - which is only important on heterogeneous
	 * systems like big.LITTLE.
	 */
	test = curr &&
	       unlikely(rt_task(curr)) &&
	       (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);

	if (test || !rt_task_fits_capacity(p, cpu)) {
		int target = find_lowest_rq(p);

		/*
		 * Bail out if we were forcing a migration to find a better
		 * fitting CPU but our search failed.
		 */
		if (!test && target != -1 && !rt_task_fits_capacity(p, target))
			goto out_unlock;

		/*
		 * Don't bother moving it if the destination CPU is
		 * not running a lower priority task.
		 */
		if (target != -1 &&
		    p->prio < cpu_rq(target)->rt.highest_prio.curr)
			cpu = target;
	}

out_unlock:
	rcu_read_unlock();

out:
	return cpu;
}

static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
	/*
	 * Current can't be migrated, useless to reschedule,
	 * let's hope p can move out.
	 */
	if (rq->curr->nr_cpus_allowed == 1 ||
	    !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
		return;

	/*
	 * p is migratable, so let's not schedule it and
	 * see if it is pushed or pulled somewhere else.
	 */
	if (p->nr_cpus_allowed != 1 &&
	    cpupri_find(&rq->rd->cpupri, p, NULL))
		return;

	/*
	 * There appear to be other CPUs that can accept
	 * the current task but none can run 'p', so let's reschedule
	 * to try and push the current task away:
	 */
	requeue_task_rt(rq, p, 1);
	resched_curr(rq);
}

static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
{
	if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
		/*
		 * This is OK, because current is on_cpu, which avoids it being
		 * picked for load-balance and preemption/IRQs are still
		 * disabled avoiding further scheduler activity on it and we've
		 * not yet started the picking loop.
		 */
		rq_unpin_lock(rq, rf);
		pull_rt_task(rq);
		rq_repin_lock(rq, rf);
	}

	return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
}
#endif /* CONFIG_SMP */

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
{
	if (p->prio < rq->curr->prio) {
		resched_curr(rq);
		return;
	}

#ifdef CONFIG_SMP
	/*
	 * If:
	 *
	 * - the newly woken task is of equal priority to the current task
	 * - the newly woken task is non-migratable while current is migratable
	 * - current will be preempted on the next reschedule
	 *
	 * we should check to see if current can readily move to a different
	 * cpu. If so, we will reschedule to allow the push logic to try
	 * to move current somewhere else, making room for our non-migratable
	 * task.
	 */
	if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
		check_preempt_equal_prio(rq, p);
#endif
}

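/*
 * Make @p the running task: stamp its exec_start and, when it was newly
 * picked, refresh the RT PELT signal and arm the deferred push callback.
 */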
static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
{
	p->se.exec_start = rq_clock_task(rq);

	/* The running task is never eligible for pushing */
	dequeue_pushable_task(rq, p);

	if (!first)
		return;

	/*
	 * If prev task was rt, put_prev_task() has already updated the
	 * utilization. We only care of the case where we start to schedule a
	 * rt task
	 */
	if (rq->curr->sched_class != &rt_sched_class)
		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);

	rt_queue_push_tasks(rq);
}

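/*
 * Pick the first entity on the highest-priority non-empty queue of
 * @rt_rq; _pick_next_task_rt() below walks this down the group hierarchy
 * until it reaches a task.
 */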
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1608 | static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, |
| 1609 | struct rt_rq *rt_rq) |
| 1610 | { |
| 1611 | struct rt_prio_array *array = &rt_rq->active; |
| 1612 | struct sched_rt_entity *next = NULL; |
| 1613 | struct list_head *queue; |
| 1614 | int idx; |
| 1615 | |
| 1616 | idx = sched_find_first_bit(array->bitmap); |
| 1617 | BUG_ON(idx >= MAX_RT_PRIO); |
| 1618 | |
| 1619 | queue = array->queue + idx; |
| 1620 | next = list_entry(queue->next, struct sched_rt_entity, run_list); |
| 1621 | |
| 1622 | return next; |
| 1623 | } |
| 1624 | |
| 1625 | static struct task_struct *_pick_next_task_rt(struct rq *rq) |
| 1626 | { |
| 1627 | struct sched_rt_entity *rt_se; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1628 | struct rt_rq *rt_rq = &rq->rt; |
| 1629 | |
| 1630 | do { |
| 1631 | rt_se = pick_next_rt_entity(rq, rt_rq); |
| 1632 | BUG_ON(!rt_se); |
| 1633 | rt_rq = group_rt_rq(rt_se); |
| 1634 | } while (rt_rq); |
| 1635 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 1636 | return rt_task_of(rt_se); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1637 | } |
| 1638 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame] | 1639 | static struct task_struct *pick_next_task_rt(struct rq *rq) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1640 | { |
| 1641 | struct task_struct *p; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1642 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 1643 | if (!sched_rt_runnable(rq)) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1644 | return NULL; |
| 1645 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1646 | p = _pick_next_task_rt(rq); |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 1647 | set_next_task_rt(rq, p, true); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1648 | return p; |
| 1649 | } |
| 1650 | |
| 1651 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) |
| 1652 | { |
| 1653 | update_curr_rt(rq); |
| 1654 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 1655 | update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1656 | |
| 1657 | /* |
| 1658 | * The previous task needs to be made eligible for pushing |
| 1659 | * if it is still active |
| 1660 | */ |
| 1661 | if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) |
| 1662 | enqueue_pushable_task(rq, p); |
| 1663 | } |
| 1664 | |
| 1665 | #ifdef CONFIG_SMP |
| 1666 | |
| 1667 | /* Retry the search for a lowest rq at most this many times */ |
| 1668 | #define RT_MAX_TRIES 3 |
| 1669 | |
| 1670 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) |
| 1671 | { |
| 1672 | if (!task_running(rq, p) && |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 1673 | cpumask_test_cpu(cpu, p->cpus_ptr)) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1674 | return 1; |
| 1675 | |
| 1676 | return 0; |
| 1677 | } |
| 1678 | |
| 1679 | /* |
| 1680 | * Return the highest-priority pushable task on this rq that is suitable |
| 1681 | * to be executed on the given CPU, or NULL if there is none. |
| 1682 | */ |
| 1683 | static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) |
| 1684 | { |
| 1685 | struct plist_head *head = &rq->rt.pushable_tasks; |
| 1686 | struct task_struct *p; |
| 1687 | |
| 1688 | if (!has_pushable_tasks(rq)) |
| 1689 | return NULL; |
| 1690 | |
| 1691 | plist_for_each_entry(p, head, pushable_tasks) { |
| 1692 | if (pick_rt_task(rq, p, cpu)) |
| 1693 | return p; |
| 1694 | } |
| 1695 | |
| 1696 | return NULL; |
| 1697 | } |
| 1698 | |
| 1699 | static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); |
| 1700 | |
| 1701 | static int find_lowest_rq(struct task_struct *task) |
| 1702 | { |
| 1703 | struct sched_domain *sd; |
| 1704 | struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask); |
| 1705 | int this_cpu = smp_processor_id(); |
| 1706 | int cpu = task_cpu(task); |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame] | 1707 | int ret; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1708 | |
| 1709 | /* Make sure the mask is initialized first */ |
| 1710 | if (unlikely(!lowest_mask)) |
| 1711 | return -1; |
| 1712 | |
| 1713 | if (task->nr_cpus_allowed == 1) |
| 1714 | return -1; /* No other targets possible */ |
| 1715 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame] | 1716 | /* |
| 1717 | * If we're on an asymmetric system, ensure we consider the different |
| 1718 | * capacities of the CPUs when searching for the lowest_mask. |
| 1719 | */ |
| 1720 | if (static_branch_unlikely(&sched_asym_cpucapacity)) { |
| 1721 | |
| 1722 | ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, |
| 1723 | task, lowest_mask, |
| 1724 | rt_task_fits_capacity); |
| 1725 | } else { |
| 1726 | |
| 1727 | ret = cpupri_find(&task_rq(task)->rd->cpupri, |
| 1728 | task, lowest_mask); |
| 1729 | } |
| 1730 | |
| 1731 | if (!ret) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1732 | return -1; /* No targets found */ |
| 1733 | |
| 1734 | /* |
| 1735 | * At this point we have built a mask of CPUs representing the |
| 1736 | * lowest priority tasks in the system. Now we want to elect |
| 1737 | * the best one based on our affinity and topology. |
| 1738 | * |
| 1739 | * We prioritize the last CPU that the task executed on since |
| 1740 | * it is most likely cache-hot in that location. |
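| |  * |
| |  * The overall selection order below is: (1) the task's previous CPU |
| |  * if it is in lowest_mask; (2) this_cpu, if it is in lowest_mask and |
| |  * shares a SD_WAKE_AFFINE domain with the previous CPU; (3) the first |
| |  * lowest_mask CPU in such a domain's span; (4) this_cpu as a |
| |  * fallback; and (5) any CPU in lowest_mask. |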
| 1741 | */ |
| 1742 | if (cpumask_test_cpu(cpu, lowest_mask)) |
| 1743 | return cpu; |
| 1744 | |
| 1745 | /* |
| 1746 | * Otherwise, we consult the sched_domains span maps to figure |
| 1747 | * out which CPU is logically closest to our hot cache data. |
| 1748 | */ |
| 1749 | if (!cpumask_test_cpu(this_cpu, lowest_mask)) |
| 1750 | this_cpu = -1; /* Skip this_cpu opt if not among lowest */ |
| 1751 | |
| 1752 | rcu_read_lock(); |
| 1753 | for_each_domain(cpu, sd) { |
| 1754 | if (sd->flags & SD_WAKE_AFFINE) { |
| 1755 | int best_cpu; |
| 1756 | |
| 1757 | /* |
| 1758 | * "this_cpu" is cheaper to preempt than a |
| 1759 | * remote processor. |
| 1760 | */ |
| 1761 | if (this_cpu != -1 && |
| 1762 | cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { |
| 1763 | rcu_read_unlock(); |
| 1764 | return this_cpu; |
| 1765 | } |
| 1766 | |
| 1767 | best_cpu = cpumask_first_and(lowest_mask, |
| 1768 | sched_domain_span(sd)); |
| 1769 | if (best_cpu < nr_cpu_ids) { |
| 1770 | rcu_read_unlock(); |
| 1771 | return best_cpu; |
| 1772 | } |
| 1773 | } |
| 1774 | } |
| 1775 | rcu_read_unlock(); |
| 1776 | |
| 1777 | /* |
| 1778 | * And finally, if there were no matches within the domains |
| 1779 | * just give the caller *something* to work with from the compatible |
| 1780 | * locations. |
| 1781 | */ |
| 1782 | if (this_cpu != -1) |
| 1783 | return this_cpu; |
| 1784 | |
| 1785 | cpu = cpumask_any(lowest_mask); |
| 1786 | if (cpu < nr_cpu_ids) |
| 1787 | return cpu; |
| 1788 | |
| 1789 | return -1; |
| 1790 | } |
| 1791 | |
| 1792 | /* Will lock the rq it finds */ |
| 1793 | static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) |
| 1794 | { |
| 1795 | struct rq *lowest_rq = NULL; |
| 1796 | int tries; |
| 1797 | int cpu; |
| 1798 | |
| 1799 | for (tries = 0; tries < RT_MAX_TRIES; tries++) { |
| 1800 | cpu = find_lowest_rq(task); |
| 1801 | |
| 1802 | if ((cpu == -1) || (cpu == rq->cpu)) |
| 1803 | break; |
| 1804 | |
| 1805 | lowest_rq = cpu_rq(cpu); |
| 1806 | |
| 1807 | if (lowest_rq->rt.highest_prio.curr <= task->prio) { |
| 1808 | /* |
| 1809 | * Target rq has tasks of equal or higher priority; |
| 1810 | * retrying does not release any lock and is unlikely |
| 1811 | * to yield a different result. |
| 1812 | */ |
| 1813 | lowest_rq = NULL; |
| 1814 | break; |
| 1815 | } |
| 1816 | |
| 1817 | /* if the prio of this runqueue changed, try again */ |
| 1818 | if (double_lock_balance(rq, lowest_rq)) { |
| 1819 | /* |
| 1820 | * We had to unlock the run queue. In |
| 1821 | * the meantime, the task could have |
| 1822 | * migrated already or had its affinity changed. |
| 1823 | * Also make sure that it wasn't scheduled on its rq. |
| 1824 | */ |
| 1825 | if (unlikely(task_rq(task) != rq || |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 1826 | !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) || |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1827 | task_running(rq, task) || |
| 1828 | !rt_task(task) || |
| 1829 | !task_on_rq_queued(task))) { |
| 1830 | |
| 1831 | double_unlock_balance(rq, lowest_rq); |
| 1832 | lowest_rq = NULL; |
| 1833 | break; |
| 1834 | } |
| 1835 | } |
| 1836 | |
| 1837 | /* If this rq is still suitable use it. */ |
| 1838 | if (lowest_rq->rt.highest_prio.curr > task->prio) |
| 1839 | break; |
| 1840 | |
| 1841 | /* try again */ |
| 1842 | double_unlock_balance(rq, lowest_rq); |
| 1843 | lowest_rq = NULL; |
| 1844 | } |
| 1845 | |
| 1846 | return lowest_rq; |
| 1847 | } |
| 1848 | |
| 1849 | static struct task_struct *pick_next_pushable_task(struct rq *rq) |
| 1850 | { |
| 1851 | struct task_struct *p; |
| 1852 | |
| 1853 | if (!has_pushable_tasks(rq)) |
| 1854 | return NULL; |
| 1855 | |
| 1856 | p = plist_first_entry(&rq->rt.pushable_tasks, |
| 1857 | struct task_struct, pushable_tasks); |
| 1858 | |
| 1859 | BUG_ON(rq->cpu != task_cpu(p)); |
| 1860 | BUG_ON(task_current(rq, p)); |
| 1861 | BUG_ON(p->nr_cpus_allowed <= 1); |
| 1862 | |
| 1863 | BUG_ON(!task_on_rq_queued(p)); |
| 1864 | BUG_ON(!rt_task(p)); |
| 1865 | |
| 1866 | return p; |
| 1867 | } |
| 1868 | |
| 1869 | /* |
| 1870 | * If the current CPU has more than one RT task, see if the |
| 1871 | * non-running task can migrate over to a CPU that is running a task |
| 1872 | * of lesser priority. |
| 1873 | */ |
| 1874 | static int push_rt_task(struct rq *rq) |
| 1875 | { |
| 1876 | struct task_struct *next_task; |
| 1877 | struct rq *lowest_rq; |
| 1878 | int ret = 0; |
| 1879 | |
| 1880 | if (!rq->rt.overloaded) |
| 1881 | return 0; |
| 1882 | |
| 1883 | next_task = pick_next_pushable_task(rq); |
| 1884 | if (!next_task) |
| 1885 | return 0; |
| 1886 | |
| 1887 | retry: |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 1888 | if (WARN_ON(next_task == rq->curr)) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1889 | return 0; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1890 | |
| 1891 | /* |
| 1892 | * It's possible that the next_task slipped in with a |
| 1893 | * higher priority than current. If that's the case |
| 1894 | * just reschedule current. |
| 1895 | */ |
| 1896 | if (unlikely(next_task->prio < rq->curr->prio)) { |
| 1897 | resched_curr(rq); |
| 1898 | return 0; |
| 1899 | } |
| 1900 | |
| 1901 | /* We might release rq lock */ |
| 1902 | get_task_struct(next_task); |
| 1903 | |
| 1904 | /* find_lock_lowest_rq locks the rq if found */ |
| 1905 | lowest_rq = find_lock_lowest_rq(next_task, rq); |
| 1906 | if (!lowest_rq) { |
| 1907 | struct task_struct *task; |
| 1908 | /* |
| 1909 | * find_lock_lowest_rq releases rq->lock |
| 1910 | * so it is possible that next_task has migrated. |
| 1911 | * |
| 1912 | * We need to make sure that the task is still on the same |
| 1913 | * run-queue and is also still the next task eligible for |
| 1914 | * pushing. |
| 1915 | */ |
| 1916 | task = pick_next_pushable_task(rq); |
| 1917 | if (task == next_task) { |
| 1918 | /* |
| 1919 | * The task hasn't migrated, and is still the next |
| 1920 | * eligible task, but we failed to find a run-queue |
| 1921 | * to push it to. Do not retry in this case, since |
| 1922 | * other CPUs will pull from us when ready. |
| 1923 | */ |
| 1924 | goto out; |
| 1925 | } |
| 1926 | |
| 1927 | if (!task) |
| 1928 | /* No more tasks, just exit */ |
| 1929 | goto out; |
| 1930 | |
| 1931 | /* |
| 1932 | * Something has shifted, try again. |
| 1933 | */ |
| 1934 | put_task_struct(next_task); |
| 1935 | next_task = task; |
| 1936 | goto retry; |
| 1937 | } |
| 1938 | |
| 1939 | deactivate_task(rq, next_task, 0); |
| 1940 | set_task_cpu(next_task, lowest_rq->cpu); |
| 1941 | activate_task(lowest_rq, next_task, 0); |
| 1942 | ret = 1; |
| 1943 | |
| 1944 | resched_curr(lowest_rq); |
| 1945 | |
| 1946 | double_unlock_balance(rq, lowest_rq); |
| 1947 | |
| 1948 | out: |
| 1949 | put_task_struct(next_task); |
| 1950 | |
| 1951 | return ret; |
| 1952 | } |
| 1953 | |
| 1954 | static void push_rt_tasks(struct rq *rq) |
| 1955 | { |
| 1956 | /* push_rt_task() will return true if it moved an RT task */ |
| 1957 | while (push_rt_task(rq)) |
| 1958 | ; |
| 1959 | } |
| 1960 | |
| 1961 | #ifdef HAVE_RT_PUSH_IPI |
| 1962 | |
| 1963 | /* |
| 1964 | * When a high priority task schedules out from a CPU and a lower priority |
| 1965 | * task is scheduled in, a check is made to see if there are any RT tasks |
| 1966 | * on other CPUs that are waiting to run because a higher priority RT task |
| 1967 | * is currently running on its CPU. In this case, the CPU with multiple RT |
| 1968 | * tasks queued on it (overloaded) needs to be notified that a CPU has opened |
| 1969 | * up that may be able to run one of its non-running queued RT tasks. |
| 1970 | * |
| 1971 | * All CPUs with overloaded RT tasks need to be notified as there is currently |
| 1972 | * no way to know which of these CPUs have the highest priority task waiting |
| 1973 | * to run. Instead of trying to take a spinlock on each of these CPUs, |
| 1974 | * which has been shown to cause large latency when done on machines |
| 1975 | * with many CPUs, an IPI is sent to the CPUs to have them push off the |
| 1976 | * overloaded RT tasks waiting to run. |
| 1977 | * |
| 1978 | * Just sending an IPI to each of the CPUs is also an issue, as on large |
| 1979 | * CPU-count machines, this can cause an IPI storm on a CPU, especially |
| 1980 | * if it's the only CPU with multiple RT tasks queued, and a large number |
| 1981 | * of CPUs scheduling a lower priority task at the same time. |
| 1982 | * |
| 1983 | * Each root domain has its own irq work function that can iterate over |
| 1984 | * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT |
| 1985 | * tasks must be checked when one or many CPUs are lowering |
| 1986 | * their priority, there's a single irq work iterator that will try to |
| 1987 | * push off RT tasks that are waiting to run. |
| 1988 | * |
| 1989 | * When a CPU schedules a lower priority task, it will kick off the |
| 1990 | * irq work iterator that will jump to each CPU with overloaded RT tasks. |
| 1991 | * As it only takes the first CPU that schedules a lower priority task |
| 1992 | * to start the process, the rto_loop_start variable is incremented and if |
| 1993 | * the atomic result is one, then that CPU will try to take the rto_lock. |
| 1994 | * This prevents high contention on the lock as the process handles all |
| 1995 | * CPUs scheduling lower priority tasks. |
| 1996 | * |
| 1997 | * All CPUs that are scheduling a lower priority task will increment the |
| 1998 | * rto_loop_next variable. This will make sure that the irq work iterator |
| 1999 | * checks all RT overloaded CPUs whenever a CPU schedules a new lower |
| 2000 | * priority task, even if the iterator is in the middle of a scan. Incrementing |
| 2001 | * the rto_loop_next will cause the iterator to perform another scan. |
| 2002 | * |
| 2003 | */ |
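| | |
| | /* |
| |  * Illustrative sequence, assuming only CPUs 1 and 3 are set in |
| |  * rd->rto_mask: |
| |  * |
| |  *   CPU 0 schedules a lower priority task: tell_cpu_to_push() |
| |  *   increments rd->rto_loop_next, wins rto_loop_start and queues |
| |  *   rto_push_work on CPU 1. |
| |  * |
| |  *   CPU 1, in rto_push_irq_work_func(): pushes its waiting RT tasks, |
| |  *   then rto_next_cpu() returns 3 and the irq work is re-queued there. |
| |  * |
| |  *   CPU 3: pushes its tasks; rto_next_cpu() runs out of CPUs, sees |
| |  *   rto_loop == rto_loop_next and returns -1, so the chain stops and |
| |  *   the rd reference is dropped. |
| |  */ |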
| 2004 | static int rto_next_cpu(struct root_domain *rd) |
| 2005 | { |
| 2006 | int next; |
| 2007 | int cpu; |
| 2008 | |
| 2009 | /* |
| 2010 | * When starting the IPI RT pushing, the rto_cpu is set to -1, |
| 2011 | * rto_next_cpu() will simply return the first CPU found in |
| 2012 | * the rto_mask. |
| 2013 | * |
| 2014 | * If rto_next_cpu() is called while rto_cpu is a valid CPU, it |
| 2015 | * will return the next CPU found in the rto_mask. |
| 2016 | * |
| 2017 | * If there are no more CPUs left in the rto_mask, then a check is made |
| 2018 | * against rto_loop and rto_loop_next. rto_loop is only updated with |
| 2019 | * the rto_lock held, but any CPU may increment the rto_loop_next |
| 2020 | * without any locking. |
| 2021 | */ |
| 2022 | for (;;) { |
| 2023 | |
| 2024 | /* When rto_cpu is -1 this acts like cpumask_first() */ |
| 2025 | cpu = cpumask_next(rd->rto_cpu, rd->rto_mask); |
| 2026 | |
| 2027 | rd->rto_cpu = cpu; |
| 2028 | |
| 2029 | if (cpu < nr_cpu_ids) |
| 2030 | return cpu; |
| 2031 | |
| 2032 | rd->rto_cpu = -1; |
| 2033 | |
| 2034 | /* |
| 2035 | * ACQUIRE ensures we see the @rto_mask changes |
| 2036 | * made prior to the @next value observed. |
| 2037 | * |
| 2038 | * Matches WMB in rt_set_overload(). |
| 2039 | */ |
| 2040 | next = atomic_read_acquire(&rd->rto_loop_next); |
| 2041 | |
| 2042 | if (rd->rto_loop == next) |
| 2043 | break; |
| 2044 | |
| 2045 | rd->rto_loop = next; |
| 2046 | } |
| 2047 | |
| 2048 | return -1; |
| 2049 | } |
| 2050 | |
| 2051 | static inline bool rto_start_trylock(atomic_t *v) |
| 2052 | { |
| 2053 | return !atomic_cmpxchg_acquire(v, 0, 1); |
| 2054 | } |
| 2055 | |
| 2056 | static inline void rto_start_unlock(atomic_t *v) |
| 2057 | { |
| 2058 | atomic_set_release(v, 0); |
| 2059 | } |
| 2060 | |
| 2061 | static void tell_cpu_to_push(struct rq *rq) |
| 2062 | { |
| 2063 | int cpu = -1; |
| 2064 | |
| 2065 | /* Keep the loop going if the IPI is currently active */ |
| 2066 | atomic_inc(&rq->rd->rto_loop_next); |
| 2067 | |
| 2068 | /* Only one CPU can initiate a loop at a time */ |
| 2069 | if (!rto_start_trylock(&rq->rd->rto_loop_start)) |
| 2070 | return; |
| 2071 | |
| 2072 | raw_spin_lock(&rq->rd->rto_lock); |
| 2073 | |
| 2074 | /* |
| 2075 | * The rto_cpu is updated under the lock. If it holds a valid CPU, |
| 2076 | * then the IPI is still running and will continue due to the |
| 2077 | * update to loop_next, and nothing needs to be done here. |
| 2078 | * Otherwise it is finishing up and an IPI needs to be sent. |
| 2079 | */ |
| 2080 | if (rq->rd->rto_cpu < 0) |
| 2081 | cpu = rto_next_cpu(rq->rd); |
| 2082 | |
| 2083 | raw_spin_unlock(&rq->rd->rto_lock); |
| 2084 | |
| 2085 | rto_start_unlock(&rq->rd->rto_loop_start); |
| 2086 | |
| 2087 | if (cpu >= 0) { |
| 2088 | /* Make sure the rd does not get freed while pushing */ |
| 2089 | sched_get_rd(rq->rd); |
| 2090 | irq_work_queue_on(&rq->rd->rto_push_work, cpu); |
| 2091 | } |
| 2092 | } |
| 2093 | |
| 2094 | /* Called from hardirq context */ |
| 2095 | void rto_push_irq_work_func(struct irq_work *work) |
| 2096 | { |
| 2097 | struct root_domain *rd = |
| 2098 | container_of(work, struct root_domain, rto_push_work); |
| 2099 | struct rq *rq; |
| 2100 | int cpu; |
| 2101 | |
| 2102 | rq = this_rq(); |
| 2103 | |
| 2104 | /* |
| 2105 | * We do not need to grab the lock to check has_pushable_tasks(). |
| 2106 | * When it gets updated, a check is made to see if a push is possible. |
| 2107 | */ |
| 2108 | if (has_pushable_tasks(rq)) { |
| 2109 | raw_spin_lock(&rq->lock); |
| 2110 | push_rt_tasks(rq); |
| 2111 | raw_spin_unlock(&rq->lock); |
| 2112 | } |
| 2113 | |
| 2114 | raw_spin_lock(&rd->rto_lock); |
| 2115 | |
| 2116 | /* Pass the IPI to the next rt overloaded queue */ |
| 2117 | cpu = rto_next_cpu(rd); |
| 2118 | |
| 2119 | raw_spin_unlock(&rd->rto_lock); |
| 2120 | |
| 2121 | if (cpu < 0) { |
| 2122 | sched_put_rd(rd); |
| 2123 | return; |
| 2124 | } |
| 2125 | |
| 2126 | /* Try the next RT overloaded CPU */ |
| 2127 | irq_work_queue_on(&rd->rto_push_work, cpu); |
| 2128 | } |
| 2129 | #endif /* HAVE_RT_PUSH_IPI */ |
| 2130 | |
| 2131 | static void pull_rt_task(struct rq *this_rq) |
| 2132 | { |
| 2133 | int this_cpu = this_rq->cpu, cpu; |
| 2134 | bool resched = false; |
| 2135 | struct task_struct *p; |
| 2136 | struct rq *src_rq; |
| 2137 | int rt_overload_count = rt_overloaded(this_rq); |
| 2138 | |
| 2139 | if (likely(!rt_overload_count)) |
| 2140 | return; |
| 2141 | |
| 2142 | /* |
| 2143 | * Match the barrier from rt_set_overload(); this guarantees that if we |
| 2144 | * see overloaded we must also see the rto_mask bit. |
| 2145 | */ |
| 2146 | smp_rmb(); |
| 2147 | |
| 2148 | /* If we are the only overloaded CPU, do nothing */ |
| 2149 | if (rt_overload_count == 1 && |
| 2150 | cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask)) |
| 2151 | return; |
| 2152 | |
| 2153 | #ifdef HAVE_RT_PUSH_IPI |
| 2154 | if (sched_feat(RT_PUSH_IPI)) { |
| 2155 | tell_cpu_to_push(this_rq); |
| 2156 | return; |
| 2157 | } |
| 2158 | #endif |
| 2159 | |
| 2160 | for_each_cpu(cpu, this_rq->rd->rto_mask) { |
| 2161 | if (this_cpu == cpu) |
| 2162 | continue; |
| 2163 | |
| 2164 | src_rq = cpu_rq(cpu); |
| 2165 | |
| 2166 | /* |
| 2167 | * Don't bother taking the src_rq->lock if the next highest |
| 2168 | * task is known to be lower-priority than our current task. |
| 2169 | * This may look racy, but if this value is about to go |
| 2170 | * logically higher, the src_rq will push this task away. |
| 2171 | * And if it's going logically lower, we do not care. |
| 2172 | */ |
| 2173 | if (src_rq->rt.highest_prio.next >= |
| 2174 | this_rq->rt.highest_prio.curr) |
| 2175 | continue; |
| 2176 | |
| 2177 | /* |
| 2178 | * We can potentially drop this_rq's lock in |
| 2179 | * double_lock_balance, and another CPU could |
| 2180 | * alter this_rq |
| 2181 | */ |
| 2182 | double_lock_balance(this_rq, src_rq); |
| 2183 | |
| 2184 | /* |
| 2185 | * We can only pull a task that is pushable |
| 2186 | * on its rq, and no others. |
| 2187 | */ |
| 2188 | p = pick_highest_pushable_task(src_rq, this_cpu); |
| 2189 | |
| 2190 | /* |
| 2191 | * Do we have an RT task that preempts |
| 2192 | * the to-be-scheduled task? |
| 2193 | */ |
| 2194 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { |
| 2195 | WARN_ON(p == src_rq->curr); |
| 2196 | WARN_ON(!task_on_rq_queued(p)); |
| 2197 | |
| 2198 | /* |
| 2199 | * There's a chance that p is higher in priority |
| 2200 | * than what's currently running on its CPU. |
| 2201 | * This is because p is just waking up and hasn't |
| 2202 | * had a chance to schedule. We only pull |
| 2203 | * p if it is lower in priority than the |
| 2204 | * current task on the run queue |
| 2205 | */ |
| 2206 | if (p->prio < src_rq->curr->prio) |
| 2207 | goto skip; |
| 2208 | |
| 2209 | resched = true; |
| 2210 | |
| 2211 | deactivate_task(src_rq, p, 0); |
| 2212 | set_task_cpu(p, this_cpu); |
| 2213 | activate_task(this_rq, p, 0); |
| 2214 | /* |
| 2215 | * We continue with the search, just in |
| 2216 | * case there's an even higher prio task |
| 2217 | * in another runqueue. (low likelihood |
| 2218 | * but possible) |
| 2219 | */ |
| 2220 | } |
| 2221 | skip: |
| 2222 | double_unlock_balance(this_rq, src_rq); |
| 2223 | } |
| 2224 | |
| 2225 | if (resched) |
| 2226 | resched_curr(this_rq); |
| 2227 | } |
| 2228 | |
| 2229 | /* |
| 2230 | * If we are not running and we are not going to reschedule soon, we should |
| 2231 | * try to push tasks away now |
| 2232 | */ |
| 2233 | static void task_woken_rt(struct rq *rq, struct task_struct *p) |
| 2234 | { |
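| | /* |
| |  * We try a push when: p isn't running yet, current isn't about to |
| |  * reschedule anyway, p is allowed on more than one CPU, current is |
| |  * a DL or RT task, and current is either pinned to this CPU or of |
| |  * equal or higher priority than p (so p won't preempt it here). |
| |  */ |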
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame] | 2235 | bool need_to_push = !task_running(rq, p) && |
| 2236 | !test_tsk_need_resched(rq->curr) && |
| 2237 | p->nr_cpus_allowed > 1 && |
| 2238 | (dl_task(rq->curr) || rt_task(rq->curr)) && |
| 2239 | (rq->curr->nr_cpus_allowed < 2 || |
| 2240 | rq->curr->prio <= p->prio); |
| 2241 | |
| 2242 | if (need_to_push) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2243 | push_rt_tasks(rq); |
| 2244 | } |
| 2245 | |
| 2246 | /* Assumes rq->lock is held */ |
| 2247 | static void rq_online_rt(struct rq *rq) |
| 2248 | { |
| 2249 | if (rq->rt.overloaded) |
| 2250 | rt_set_overload(rq); |
| 2251 | |
| 2252 | __enable_runtime(rq); |
| 2253 | |
| 2254 | cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); |
| 2255 | } |
| 2256 | |
| 2257 | /* Assumes rq->lock is held */ |
| 2258 | static void rq_offline_rt(struct rq *rq) |
| 2259 | { |
| 2260 | if (rq->rt.overloaded) |
| 2261 | rt_clear_overload(rq); |
| 2262 | |
| 2263 | __disable_runtime(rq); |
| 2264 | |
| 2265 | cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); |
| 2266 | } |
| 2267 | |
| 2268 | /* |
| 2269 | * When switching from the RT queue, we bring ourselves to a position |
| 2270 | * where we might want to pull RT tasks from other runqueues. |
| 2271 | */ |
| 2272 | static void switched_from_rt(struct rq *rq, struct task_struct *p) |
| 2273 | { |
| 2274 | /* |
| 2275 | * If there are other RT tasks then we will reschedule |
| 2276 | * and the scheduling of the other RT tasks will handle |
| 2277 | * the balancing. But if we are the last RT task |
| 2278 | * we may need to handle the pulling of RT tasks |
| 2279 | * now. |
| 2280 | */ |
| 2281 | if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) |
| 2282 | return; |
| 2283 | |
| 2284 | rt_queue_pull_task(rq); |
| 2285 | } |
| 2286 | |
| 2287 | void __init init_sched_rt_class(void) |
| 2288 | { |
| 2289 | unsigned int i; |
| 2290 | |
| 2291 | for_each_possible_cpu(i) { |
| 2292 | zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), |
| 2293 | GFP_KERNEL, cpu_to_node(i)); |
| 2294 | } |
| 2295 | } |
| 2296 | #endif /* CONFIG_SMP */ |
| 2297 | |
| 2298 | /* |
| 2299 | * When switching a task to RT, we may overload the runqueue |
| 2300 | * with RT tasks. In this case we try to push them off to |
| 2301 | * other runqueues. |
| 2302 | */ |
| 2303 | static void switched_to_rt(struct rq *rq, struct task_struct *p) |
| 2304 | { |
| 2305 | /* |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 2306 | * If we are running, update the avg_rt tracking, as the running time |
| 2307 | * will from now on be accounted to it. |
| 2308 | */ |
| 2309 | if (task_current(rq, p)) { |
| 2310 | update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); |
| 2311 | return; |
| 2312 | } |
| 2313 | |
| 2314 | /* |
| 2315 | * If we are not running we may need to preempt the current |
| 2316 | * running task. If that current running task is also an RT task |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2317 | * then see if we can move to another run queue. |
| 2318 | */ |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 2319 | if (task_on_rq_queued(p)) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2320 | #ifdef CONFIG_SMP |
| 2321 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) |
| 2322 | rt_queue_push_tasks(rq); |
| 2323 | #endif /* CONFIG_SMP */ |
| 2324 | if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) |
| 2325 | resched_curr(rq); |
| 2326 | } |
| 2327 | } |
| 2328 | |
| 2329 | /* |
| 2330 | * Priority of the task has changed. This may cause |
| 2331 | * us to initiate a push or pull. |
| 2332 | */ |
| 2333 | static void |
| 2334 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) |
| 2335 | { |
| 2336 | if (!task_on_rq_queued(p)) |
| 2337 | return; |
| 2338 | |
| 2339 | if (rq->curr == p) { |
| 2340 | #ifdef CONFIG_SMP |
| 2341 | /* |
| 2342 | * If our priority decreases while running, we |
| 2343 | * may need to pull tasks to this runqueue. |
| 2344 | */ |
| 2345 | if (oldprio < p->prio) |
| 2346 | rt_queue_pull_task(rq); |
| 2347 | |
| 2348 | /* |
| 2349 | * If there's a higher priority task waiting to run |
| 2350 | * then reschedule. |
| 2351 | */ |
| 2352 | if (p->prio > rq->rt.highest_prio.curr) |
| 2353 | resched_curr(rq); |
| 2354 | #else |
| 2355 | /* For UP simply resched on drop of prio */ |
| 2356 | if (oldprio < p->prio) |
| 2357 | resched_curr(rq); |
| 2358 | #endif /* CONFIG_SMP */ |
| 2359 | } else { |
| 2360 | /* |
| 2361 | * This task is not running, but if its |
| 2362 | * priority is higher than that of the |
| 2363 | * current running task, then reschedule. |
| 2364 | */ |
| 2365 | if (p->prio < rq->curr->prio) |
| 2366 | resched_curr(rq); |
| 2367 | } |
| 2368 | } |
| 2369 | |
| 2370 | #ifdef CONFIG_POSIX_TIMERS |
| 2371 | static void watchdog(struct rq *rq, struct task_struct *p) |
| 2372 | { |
| 2373 | unsigned long soft, hard; |
| 2374 | |
| 2375 | /* rlim_max may change after rlim_cur was read; this is fixed up next tick */ |
| 2376 | soft = task_rlimit(p, RLIMIT_RTTIME); |
| 2377 | hard = task_rlimit_max(p, RLIMIT_RTTIME); |
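| | |
| | /* |
| |  * Both rlimits are in microseconds while p->rt.timeout counts ticks; |
| |  * e.g. with HZ=250 a tick is USEC_PER_SEC/HZ = 4000us, so a soft |
| |  * limit of 1000000 (one second) maps to next = 250 ticks below. |
| |  */ |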
| 2378 | |
| 2379 | if (soft != RLIM_INFINITY) { |
| 2380 | unsigned long next; |
| 2381 | |
| 2382 | if (p->rt.watchdog_stamp != jiffies) { |
| 2383 | p->rt.timeout++; |
| 2384 | p->rt.watchdog_stamp = jiffies; |
| 2385 | } |
| 2386 | |
| 2387 | next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2388 | if (p->rt.timeout > next) { |
| 2389 | posix_cputimers_rt_watchdog(&p->posix_cputimers, |
| 2390 | p->se.sum_exec_runtime); |
| 2391 | } |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2392 | } |
| 2393 | } |
| 2394 | #else |
| 2395 | static inline void watchdog(struct rq *rq, struct task_struct *p) { } |
| 2396 | #endif |
| 2397 | |
| 2398 | /* |
| 2399 | * scheduler tick hitting a task of our scheduling class. |
| 2400 | * |
| 2401 | * NOTE: This function can be called remotely by the tick offload that |
| 2402 | * goes along with full dynticks. Therefore no local assumption can be made |
| 2403 | * and everything must be accessed through the @rq and @curr passed in |
| 2404 | * parameters. |
| 2405 | */ |
| 2406 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) |
| 2407 | { |
| 2408 | struct sched_rt_entity *rt_se = &p->rt; |
| 2409 | |
| 2410 | update_curr_rt(rq); |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2411 | update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2412 | |
| 2413 | watchdog(rq, p); |
| 2414 | |
| 2415 | /* |
| 2416 | * RR tasks need a special form of timeslice management. |
| 2417 | * FIFO tasks have no timeslices. |
| 2418 | */ |
| 2419 | if (p->policy != SCHED_RR) |
| 2420 | return; |
| 2421 | |
| 2422 | if (--p->rt.time_slice) |
| 2423 | return; |
| 2424 | |
| 2425 | p->rt.time_slice = sched_rr_timeslice; |
| 2426 | |
| 2427 | /* |
| 2428 | * Requeue to the end of the queue if we (and all of our ancestors) are |
| 2429 | * not the only element on the queue. |
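| |  * (e.g. a SCHED_RR task that is alone at its priority level just |
| |  * gets a fresh time slice and keeps running, while one with a peer |
| |  * queued at the same level is moved to the tail of the queue). |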
| 2430 | */ |
| 2431 | for_each_sched_rt_entity(rt_se) { |
| 2432 | if (rt_se->run_list.prev != rt_se->run_list.next) { |
| 2433 | requeue_task_rt(rq, p, 0); |
| 2434 | resched_curr(rq); |
| 2435 | return; |
| 2436 | } |
| 2437 | } |
| 2438 | } |
| 2439 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2440 | static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) |
| 2441 | { |
| 2442 | /* |
| 2443 | * Time slice is 0 for SCHED_FIFO tasks |
| 2444 | */ |
| 2445 | if (task->policy == SCHED_RR) |
| 2446 | return sched_rr_timeslice; |
| 2447 | else |
| 2448 | return 0; |
| 2449 | } |
| 2450 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame] | 2451 | const struct sched_class rt_sched_class |
| 2452 | __section("__rt_sched_class") = { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2453 | .enqueue_task = enqueue_task_rt, |
| 2454 | .dequeue_task = dequeue_task_rt, |
| 2455 | .yield_task = yield_task_rt, |
| 2456 | |
| 2457 | .check_preempt_curr = check_preempt_curr_rt, |
| 2458 | |
| 2459 | .pick_next_task = pick_next_task_rt, |
| 2460 | .put_prev_task = put_prev_task_rt, |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2461 | .set_next_task = set_next_task_rt, |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2462 | |
| 2463 | #ifdef CONFIG_SMP |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2464 | .balance = balance_rt, |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2465 | .select_task_rq = select_task_rq_rt, |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2466 | .set_cpus_allowed = set_cpus_allowed_common, |
| 2467 | .rq_online = rq_online_rt, |
| 2468 | .rq_offline = rq_offline_rt, |
| 2469 | .task_woken = task_woken_rt, |
| 2470 | .switched_from = switched_from_rt, |
| 2471 | #endif |
| 2472 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2473 | .task_tick = task_tick_rt, |
| 2474 | |
| 2475 | .get_rr_interval = get_rr_interval_rt, |
| 2476 | |
| 2477 | .prio_changed = prio_changed_rt, |
| 2478 | .switched_to = switched_to_rt, |
| 2479 | |
| 2480 | .update_curr = update_curr_rt, |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2481 | |
| 2482 | #ifdef CONFIG_UCLAMP_TASK |
| 2483 | .uclamp_enabled = 1, |
| 2484 | #endif |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2485 | }; |
| 2486 | |
| 2487 | #ifdef CONFIG_RT_GROUP_SCHED |
| 2488 | /* |
| 2489 | * Ensure that the real-time constraints are schedulable. |
| 2490 | */ |
| 2491 | static DEFINE_MUTEX(rt_constraints_mutex); |
| 2492 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2493 | static inline int tg_has_rt_tasks(struct task_group *tg) |
| 2494 | { |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame] | 2495 | struct task_struct *task; |
| 2496 | struct css_task_iter it; |
| 2497 | int ret = 0; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2498 | |
| 2499 | /* |
| 2500 | * Autogroups do not have RT tasks; see autogroup_create(). |
| 2501 | */ |
| 2502 | if (task_group_is_autogroup(tg)) |
| 2503 | return 0; |
| 2504 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame] | 2505 | css_task_iter_start(&tg->css, 0, &it); |
| 2506 | while (!ret && (task = css_task_iter_next(&it))) |
| 2507 | ret |= rt_task(task); |
| 2508 | css_task_iter_end(&it); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2509 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame] | 2510 | return ret; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2511 | } |
| 2512 | |
| 2513 | struct rt_schedulable_data { |
| 2514 | struct task_group *tg; |
| 2515 | u64 rt_period; |
| 2516 | u64 rt_runtime; |
| 2517 | }; |
| 2518 | |
| 2519 | static int tg_rt_schedulable(struct task_group *tg, void *data) |
| 2520 | { |
| 2521 | struct rt_schedulable_data *d = data; |
| 2522 | struct task_group *child; |
| 2523 | unsigned long total, sum = 0; |
| 2524 | u64 period, runtime; |
| 2525 | |
| 2526 | period = ktime_to_ns(tg->rt_bandwidth.rt_period); |
| 2527 | runtime = tg->rt_bandwidth.rt_runtime; |
| 2528 | |
| 2529 | if (tg == d->tg) { |
| 2530 | period = d->rt_period; |
| 2531 | runtime = d->rt_runtime; |
| 2532 | } |
| 2533 | |
| 2534 | /* |
| 2535 | * Cannot have more runtime than the period. |
| 2536 | */ |
| 2537 | if (runtime > period && runtime != RUNTIME_INF) |
| 2538 | return -EINVAL; |
| 2539 | |
| 2540 | /* |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame] | 2541 | * Ensure we don't starve existing RT tasks if runtime turns zero. |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2542 | */ |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame] | 2543 | if (rt_bandwidth_enabled() && !runtime && |
| 2544 | tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2545 | return -EBUSY; |
| 2546 | |
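| | /* |
| |  * to_ratio() scales runtime/period into fixed point; e.g. assuming |
| |  * BW_SHIFT is 20, the default 950000us runtime per 1000000us period |
| |  * gives total ~= 0.95 * 2^20 = 996147. |
| |  */ |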
| 2547 | total = to_ratio(period, runtime); |
| 2548 | |
| 2549 | /* |
| 2550 | * Nobody can have more than the global setting allows. |
| 2551 | */ |
| 2552 | if (total > to_ratio(global_rt_period(), global_rt_runtime())) |
| 2553 | return -EINVAL; |
| 2554 | |
| 2555 | /* |
| 2556 | * The sum of our children's runtime should not exceed our own. |
| 2557 | */ |
| 2558 | list_for_each_entry_rcu(child, &tg->children, siblings) { |
| 2559 | period = ktime_to_ns(child->rt_bandwidth.rt_period); |
| 2560 | runtime = child->rt_bandwidth.rt_runtime; |
| 2561 | |
| 2562 | if (child == d->tg) { |
| 2563 | period = d->rt_period; |
| 2564 | runtime = d->rt_runtime; |
| 2565 | } |
| 2566 | |
| 2567 | sum += to_ratio(period, runtime); |
| 2568 | } |
| 2569 | |
| 2570 | if (sum > total) |
| 2571 | return -EINVAL; |
| 2572 | |
| 2573 | return 0; |
| 2574 | } |
| 2575 | |
| 2576 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
| 2577 | { |
| 2578 | int ret; |
| 2579 | |
| 2580 | struct rt_schedulable_data data = { |
| 2581 | .tg = tg, |
| 2582 | .rt_period = period, |
| 2583 | .rt_runtime = runtime, |
| 2584 | }; |
| 2585 | |
| 2586 | rcu_read_lock(); |
| 2587 | ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); |
| 2588 | rcu_read_unlock(); |
| 2589 | |
| 2590 | return ret; |
| 2591 | } |
| 2592 | |
| 2593 | static int tg_set_rt_bandwidth(struct task_group *tg, |
| 2594 | u64 rt_period, u64 rt_runtime) |
| 2595 | { |
| 2596 | int i, err = 0; |
| 2597 | |
| 2598 | /* |
| 2599 | * Disallowing the root group RT runtime is BAD; it would prevent the |
| 2600 | * kernel from creating (and/or operating) RT threads. |
| 2601 | */ |
| 2602 | if (tg == &root_task_group && rt_runtime == 0) |
| 2603 | return -EINVAL; |
| 2604 | |
| 2605 | /* A zero period doesn't make any sense. */ |
| 2606 | if (rt_period == 0) |
| 2607 | return -EINVAL; |
| 2608 | |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 2609 | /* |
| 2610 | * Bound the quota to defend against overflow during the bandwidth shift. |
| 2611 | */ |
| 2612 | if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime) |
| 2613 | return -EINVAL; |
| 2614 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2615 | mutex_lock(&rt_constraints_mutex); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2616 | err = __rt_schedulable(tg, rt_period, rt_runtime); |
| 2617 | if (err) |
| 2618 | goto unlock; |
| 2619 | |
| 2620 | raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
| 2621 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); |
| 2622 | tg->rt_bandwidth.rt_runtime = rt_runtime; |
| 2623 | |
| 2624 | for_each_possible_cpu(i) { |
| 2625 | struct rt_rq *rt_rq = tg->rt_rq[i]; |
| 2626 | |
| 2627 | raw_spin_lock(&rt_rq->rt_runtime_lock); |
| 2628 | rt_rq->rt_runtime = rt_runtime; |
| 2629 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
| 2630 | } |
| 2631 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
| 2632 | unlock: |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2633 | mutex_unlock(&rt_constraints_mutex); |
| 2634 | |
| 2635 | return err; |
| 2636 | } |
| 2637 | |
| 2638 | int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) |
| 2639 | { |
| 2640 | u64 rt_runtime, rt_period; |
| 2641 | |
| 2642 | rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); |
| 2643 | rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; |
| 2644 | if (rt_runtime_us < 0) |
| 2645 | rt_runtime = RUNTIME_INF; |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2646 | else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC) |
| 2647 | return -EINVAL; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2648 | |
| 2649 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
| 2650 | } |
| 2651 | |
| 2652 | long sched_group_rt_runtime(struct task_group *tg) |
| 2653 | { |
| 2654 | u64 rt_runtime_us; |
| 2655 | |
| 2656 | if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) |
| 2657 | return -1; |
| 2658 | |
| 2659 | rt_runtime_us = tg->rt_bandwidth.rt_runtime; |
| 2660 | do_div(rt_runtime_us, NSEC_PER_USEC); |
| 2661 | return rt_runtime_us; |
| 2662 | } |
| 2663 | |
| 2664 | int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) |
| 2665 | { |
| 2666 | u64 rt_runtime, rt_period; |
| 2667 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2668 | if (rt_period_us > U64_MAX / NSEC_PER_USEC) |
| 2669 | return -EINVAL; |
| 2670 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2671 | rt_period = rt_period_us * NSEC_PER_USEC; |
| 2672 | rt_runtime = tg->rt_bandwidth.rt_runtime; |
| 2673 | |
| 2674 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
| 2675 | } |
| 2676 | |
| 2677 | long sched_group_rt_period(struct task_group *tg) |
| 2678 | { |
| 2679 | u64 rt_period_us; |
| 2680 | |
| 2681 | rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); |
| 2682 | do_div(rt_period_us, NSEC_PER_USEC); |
| 2683 | return rt_period_us; |
| 2684 | } |
| 2685 | |
| 2686 | static int sched_rt_global_constraints(void) |
| 2687 | { |
| 2688 | int ret = 0; |
| 2689 | |
| 2690 | mutex_lock(&rt_constraints_mutex); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2691 | ret = __rt_schedulable(NULL, 0, 0); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2692 | mutex_unlock(&rt_constraints_mutex); |
| 2693 | |
| 2694 | return ret; |
| 2695 | } |
| 2696 | |
| 2697 | int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) |
| 2698 | { |
| 2699 | /* Don't accept realtime tasks when there is no way for them to run */ |
| 2700 | if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) |
| 2701 | return 0; |
| 2702 | |
| 2703 | return 1; |
| 2704 | } |
| 2705 | |
| 2706 | #else /* !CONFIG_RT_GROUP_SCHED */ |
| 2707 | static int sched_rt_global_constraints(void) |
| 2708 | { |
| 2709 | unsigned long flags; |
| 2710 | int i; |
| 2711 | |
| 2712 | raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); |
| 2713 | for_each_possible_cpu(i) { |
| 2714 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; |
| 2715 | |
| 2716 | raw_spin_lock(&rt_rq->rt_runtime_lock); |
| 2717 | rt_rq->rt_runtime = global_rt_runtime(); |
| 2718 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
| 2719 | } |
| 2720 | raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); |
| 2721 | |
| 2722 | return 0; |
| 2723 | } |
| 2724 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 2725 | |
| 2726 | static int sched_rt_global_validate(void) |
| 2727 | { |
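| | /* |
| |  * Both sysctls are in microseconds; e.g. the defaults of |
| |  * sysctl_sched_rt_period = 1000000 and sysctl_sched_rt_runtime = |
| |  * 950000 leave 5% of each second for non-RT tasks and pass both |
| |  * checks below. |
| |  */ |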
| 2728 | if (sysctl_sched_rt_period <= 0) |
| 2729 | return -EINVAL; |
| 2730 | |
| 2731 | if ((sysctl_sched_rt_runtime != RUNTIME_INF) && |
Olivier Deprez | 0e64123 | 2021-09-23 10:07:05 +0200 | [diff] [blame] | 2732 | ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) || |
| 2733 | ((u64)sysctl_sched_rt_runtime * |
| 2734 | NSEC_PER_USEC > max_rt_runtime))) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2735 | return -EINVAL; |
| 2736 | |
| 2737 | return 0; |
| 2738 | } |
| 2739 | |
| 2740 | static void sched_rt_do_global(void) |
| 2741 | { |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame] | 2742 | unsigned long flags; |
| 2743 | |
| 2744 | raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2745 | def_rt_bandwidth.rt_runtime = global_rt_runtime(); |
| 2746 | def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame] | 2747 | raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2748 | } |
| 2749 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame] | 2750 | int sched_rt_handler(struct ctl_table *table, int write, void *buffer, |
| 2751 | size_t *lenp, loff_t *ppos) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2752 | { |
| 2753 | int old_period, old_runtime; |
| 2754 | static DEFINE_MUTEX(mutex); |
| 2755 | int ret; |
| 2756 | |
| 2757 | mutex_lock(&mutex); |
| 2758 | old_period = sysctl_sched_rt_period; |
| 2759 | old_runtime = sysctl_sched_rt_runtime; |
| 2760 | |
| 2761 | ret = proc_dointvec(table, write, buffer, lenp, ppos); |
| 2762 | |
| 2763 | if (!ret && write) { |
| 2764 | ret = sched_rt_global_validate(); |
| 2765 | if (ret) |
| 2766 | goto undo; |
| 2767 | |
| 2768 | ret = sched_dl_global_validate(); |
| 2769 | if (ret) |
| 2770 | goto undo; |
| 2771 | |
| 2772 | ret = sched_rt_global_constraints(); |
| 2773 | if (ret) |
| 2774 | goto undo; |
| 2775 | |
| 2776 | sched_rt_do_global(); |
| 2777 | sched_dl_do_global(); |
| 2778 | } |
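| | /* |
| |  * The if (0) block below is only reachable via the undo: label, |
| |  * i.e. when one of the validation steps above failed. |
| |  */ |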
| 2779 | if (0) { |
| 2780 | undo: |
| 2781 | sysctl_sched_rt_period = old_period; |
| 2782 | sysctl_sched_rt_runtime = old_runtime; |
| 2783 | } |
| 2784 | mutex_unlock(&mutex); |
| 2785 | |
| 2786 | return ret; |
| 2787 | } |
| 2788 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame] | 2789 | int sched_rr_handler(struct ctl_table *table, int write, void *buffer, |
| 2790 | size_t *lenp, loff_t *ppos) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2791 | { |
| 2792 | int ret; |
| 2793 | static DEFINE_MUTEX(mutex); |
| 2794 | |
| 2795 | mutex_lock(&mutex); |
| 2796 | ret = proc_dointvec(table, write, buffer, lenp, ppos); |
| 2797 | /* |
| 2798 | * Make sure that internally we keep jiffies. |
| 2799 | * Also, writing zero resets the timeslice to default: |
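| |  * For example, writing 100 (ms) with HZ=250 yields |
| |  * sched_rr_timeslice = msecs_to_jiffies(100) = 25 jiffies. |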
| 2800 | */ |
| 2801 | if (!ret && write) { |
| 2802 | sched_rr_timeslice = |
| 2803 | sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : |
| 2804 | msecs_to_jiffies(sysctl_sched_rr_timeslice); |
| 2805 | } |
| 2806 | mutex_unlock(&mutex); |
| 2807 | |
| 2808 | return ret; |
| 2809 | } |
| 2810 | |
| 2811 | #ifdef CONFIG_SCHED_DEBUG |
| 2812 | void print_rt_stats(struct seq_file *m, int cpu) |
| 2813 | { |
| 2814 | rt_rq_iter_t iter; |
| 2815 | struct rt_rq *rt_rq; |
| 2816 | |
| 2817 | rcu_read_lock(); |
| 2818 | for_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) |
| 2819 | print_rt_rq(m, cpu, rt_rq); |
| 2820 | rcu_read_unlock(); |
| 2821 | } |
| 2822 | #endif /* CONFIG_SCHED_DEBUG */ |