Blame - kernel/sched/fair.c - hafnium/third_party/linux

blob: 87d9fad9d01d6e25e75370d134bb6c082633d558 [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
				4	*
				5	* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
				6	*
				7	* Interactivity improvements by Mike Galbraith
				8	* (C) 2007 Mike Galbraith <efault@gmx.de>
				9	*
				10	* Various enhancements by Dmitry Adamushko.
				11	* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
				12	*
				13	* Group scheduling enhancements by Srivatsa Vaddagiri
				14	* Copyright IBM Corporation, 2007
				15	* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
				16	*
				17	* Scaled math optimizations by Thomas Gleixner
				18	* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
				19	*
				20	* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
				21	* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
				22	*/
				23	#include "sched.h"
				24
				25	#include <trace/events/sched.h>
				26
				27	/*
				28	* Targeted preemption latency for CPU-bound tasks:
				29	*
				30	* NOTE: this latency value is not the same as the concept of
				31	* 'timeslice length' - timeslices in CFS are of variable length
				32	* and have no persistent notion like in traditional, time-slice
				33	* based scheduling concepts.
				34	*
				35	* (to see the precise effective timeslice length of your workload,
				36	* run vmstat and monitor the context-switches (cs) field)
				37	*
				38	* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
				39	*/
				40	unsigned int sysctl_sched_latency = 6000000ULL;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	41	static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	42
				43	/*
				44	* The initial- and re-scaling of tunables is configurable
				45	*
				46	* Options are:
				47	*
				48	* SCHED_TUNABLESCALING_NONE - unscaled, always *1
				49	* SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
				50	* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
				51	*
				52	* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
				53	*/
				54	enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
				55
				56	/*
				57	* Minimal preemption granularity for CPU-bound tasks:
				58	*
				59	* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
				60	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	61	unsigned int sysctl_sched_min_granularity = 750000ULL;
				62	static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	63
				64	/*
				65	* This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
				66	*/
				67	static unsigned int sched_nr_latency = 8;
				68
				69	/*
				70	* After fork, child runs first. If set to 0 (default) then
				71	* parent will (try to) run first.
				72	*/
				73	unsigned int sysctl_sched_child_runs_first __read_mostly;
				74
				75	/*
				76	* SCHED_OTHER wake-up granularity.
				77	*
				78	* This option delays the preemption effects of decoupled workloads
				79	* and reduces their over-scheduling. Synchronous workloads will still
				80	* have immediate wakeup/sleep latencies.
				81	*
				82	* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
				83	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	84	unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
				85	static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	86
				87	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
				88
				89	#ifdef CONFIG_SMP
				90	/*
				91	* For asym packing, by default the lower numbered CPU has higher priority.
				92	*/
				93	int __weak arch_asym_cpu_priority(int cpu)
				94	{
				95	return -cpu;
				96	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	97
				98	/*
				99	* The margin used when comparing utilization with CPU capacity.
				100	*
				101	* (default: ~20%)
				102	*/
				103	#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
				104
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	105	#endif
				106
				107	#ifdef CONFIG_CFS_BANDWIDTH
				108	/*
				109	* Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
				110	* each time a cfs_rq requests quota.
				111	*
				112	* Note: in the case that the slice exceeds the runtime remaining (either due
				113	* to consumption or the quota being specified to be smaller than the slice)
				114	* we will always only issue the remaining available time.
				115	*
				116	* (default: 5 msec, units: microseconds)
				117	*/
				118	unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
				119	#endif
				120
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	121	static inline void update_load_add(struct load_weight *lw, unsigned long inc)
				122	{
				123	lw->weight += inc;
				124	lw->inv_weight = 0;
				125	}
				126
				127	static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
				128	{
				129	lw->weight -= dec;
				130	lw->inv_weight = 0;
				131	}
				132
				133	static inline void update_load_set(struct load_weight *lw, unsigned long w)
				134	{
				135	lw->weight = w;
				136	lw->inv_weight = 0;
				137	}
				138
				139	/*
				140	* Increase the granularity value when there are more CPUs,
				141	* because with more CPUs the 'effective latency' as visible
				142	* to users decreases. But the relationship is not linear,
				143	* so pick a second-best guess by going with the log2 of the
				144	* number of CPUs.
				145	*
				146	* This idea comes from the SD scheduler of Con Kolivas:
				147	*/
				148	static unsigned int get_update_sysctl_factor(void)
				149	{
				150	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
				151	unsigned int factor;
				152
				153	switch (sysctl_sched_tunable_scaling) {
				154	case SCHED_TUNABLESCALING_NONE:
				155	factor = 1;
				156	break;
				157	case SCHED_TUNABLESCALING_LINEAR:
				158	factor = cpus;
				159	break;
				160	case SCHED_TUNABLESCALING_LOG:
				161	default:
				162	factor = 1 + ilog2(cpus);
				163	break;
				164	}
				165
				166	return factor;
				167	}
				168
				169	static void update_sysctl(void)
				170	{
				171	unsigned int factor = get_update_sysctl_factor();
				172
				173	#define SET_SYSCTL(name) \
				174	(sysctl_##name = (factor) * normalized_sysctl_##name)
				175	SET_SYSCTL(sched_min_granularity);
				176	SET_SYSCTL(sched_latency);
				177	SET_SYSCTL(sched_wakeup_granularity);
				178	#undef SET_SYSCTL
				179	}
				180
				181	void sched_init_granularity(void)
				182	{
				183	update_sysctl();
				184	}
				185
				186	#define WMULT_CONST (~0U)
				187	#define WMULT_SHIFT 32
				188
				189	static void __update_inv_weight(struct load_weight *lw)
				190	{
				191	unsigned long w;
				192
				193	if (likely(lw->inv_weight))
				194	return;
				195
				196	w = scale_load_down(lw->weight);
				197
				198	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
				199	lw->inv_weight = 1;
				200	else if (unlikely(!w))
				201	lw->inv_weight = WMULT_CONST;
				202	else
				203	lw->inv_weight = WMULT_CONST / w;
				204	}
				205
				206	/*
				207	* delta_exec * weight / lw.weight
				208	* OR
				209	* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
				210	*
				211	* Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
				212	* we're guaranteed shift stays positive because inv_weight is guaranteed to
				213	* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
				214	*
				215	* Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
				216	* weight/lw.weight <= 1, and therefore our shift will also be positive.
				217	*/
				218	static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
				219	{
				220	u64 fact = scale_load_down(weight);
				221	int shift = WMULT_SHIFT;
				222
				223	__update_inv_weight(lw);
				224
				225	if (unlikely(fact >> 32)) {
				226	while (fact >> 32) {
				227	fact >>= 1;
				228	shift--;
				229	}
				230	}
				231
				232	/* hint to use a 32x32->64 mul */
				233	fact = (u64)(u32)fact * lw->inv_weight;
				234
				235	while (fact >> 32) {
				236	fact >>= 1;
				237	shift--;
				238	}
				239
				240	return mul_u64_u32_shr(delta_exec, fact, shift);
				241	}
				242
				243
				244	const struct sched_class fair_sched_class;
				245
				246	/**************************************************************
				247	* CFS operations on generic schedulable entities:
				248	*/
				249
				250	#ifdef CONFIG_FAIR_GROUP_SCHED
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	251	static inline struct task_struct task_of(struct sched_entity se)
				252	{
				253	SCHED_WARN_ON(!entity_is_task(se));
				254	return container_of(se, struct task_struct, se);
				255	}
				256
				257	/* Walk up scheduling entities hierarchy */
				258	#define for_each_sched_entity(se) \
				259	for (; se; se = se->parent)
				260
				261	static inline struct cfs_rq task_cfs_rq(struct task_struct p)
				262	{
				263	return p->se.cfs_rq;
				264	}
				265
				266	/* runqueue on which this entity is (to be) queued */
				267	static inline struct cfs_rq cfs_rq_of(struct sched_entity se)
				268	{
				269	return se->cfs_rq;
				270	}
				271
				272	/* runqueue "owned" by this group */
				273	static inline struct cfs_rq group_cfs_rq(struct sched_entity grp)
				274	{
				275	return grp->my_q;
				276	}
				277
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	278	static inline void cfs_rq_tg_path(struct cfs_rq cfs_rq, char path, int len)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	279	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	280	if (!path)
				281	return;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	282
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	283	if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
				284	autogroup_path(cfs_rq->tg, path, len);
				285	else if (cfs_rq && cfs_rq->tg->css.cgroup)
				286	cgroup_path(cfs_rq->tg->css.cgroup, path, len);
				287	else
				288	strlcpy(path, "(null)", len);
				289	}
				290
				291	static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				292	{
				293	struct rq *rq = rq_of(cfs_rq);
				294	int cpu = cpu_of(rq);
				295
				296	if (cfs_rq->on_list)
				297	return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
				298
				299	cfs_rq->on_list = 1;
				300
				301	/*
				302	* Ensure we either appear before our parent (if already
				303	* enqueued) or force our parent to appear after us when it is
				304	* enqueued. The fact that we always enqueue bottom-up
				305	* reduces this to two cases and a special case for the root
				306	* cfs_rq. Furthermore, it also means that we will always reset
				307	* tmp_alone_branch either when the branch is connected
				308	* to a tree or when we reach the top of the tree
				309	*/
				310	if (cfs_rq->tg->parent &&
				311	cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
				312	/*
				313	* If parent is already on the list, we add the child
				314	* just before. Thanks to circular linked property of
				315	* the list, this means to put the child at the tail
				316	* of the list that starts by parent.
				317	*/
				318	list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				319	&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
				320	/*
				321	* The branch is now connected to its tree so we can
				322	* reset tmp_alone_branch to the beginning of the
				323	* list.
				324	*/
				325	rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
				326	return true;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	327	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	328
				329	if (!cfs_rq->tg->parent) {
				330	/*
				331	* cfs rq without parent should be put
				332	* at the tail of the list.
				333	*/
				334	list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				335	&rq->leaf_cfs_rq_list);
				336	/*
				337	* We have reach the top of a tree so we can reset
				338	* tmp_alone_branch to the beginning of the list.
				339	*/
				340	rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
				341	return true;
				342	}
				343
				344	/*
				345	* The parent has not already been added so we want to
				346	* make sure that it will be put after us.
				347	* tmp_alone_branch points to the begin of the branch
				348	* where we will add parent.
				349	*/
				350	list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
				351	/*
				352	* update tmp_alone_branch to points to the new begin
				353	* of the branch
				354	*/
				355	rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
				356	return false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	357	}
				358
				359	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				360	{
				361	if (cfs_rq->on_list) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	362	struct rq *rq = rq_of(cfs_rq);
				363
				364	/*
				365	* With cfs_rq being unthrottled/throttled during an enqueue,
				366	* it can happen the tmp_alone_branch points the a leaf that
				367	* we finally want to del. In this case, tmp_alone_branch moves
				368	* to the prev element but it will point to rq->leaf_cfs_rq_list
				369	* at the end of the enqueue.
				370	*/
				371	if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
				372	rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
				373
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	374	list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
				375	cfs_rq->on_list = 0;
				376	}
				377	}
				378
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	379	static inline void assert_list_leaf_cfs_rq(struct rq *rq)
				380	{
				381	SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
				382	}
				383
/*
 * Iterate over all leaf cfs_rq's on a runqueue, using the _safe list
 * iterator with @pos as its extra cursor.
 */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
				 leaf_cfs_rq_list)
				388
				389	/* Do the two (enqueued) entities belong to the same group ? */
				390	static inline struct cfs_rq *
				391	is_same_group(struct sched_entity se, struct sched_entity pse)
				392	{
				393	if (se->cfs_rq == pse->cfs_rq)
				394	return se->cfs_rq;
				395
				396	return NULL;
				397	}
				398
				399	static inline struct sched_entity parent_entity(struct sched_entity se)
				400	{
				401	return se->parent;
				402	}
				403
				404	static void
				405	find_matching_se(struct sched_entity se, struct sched_entity pse)
				406	{
				407	int se_depth, pse_depth;
				408
				409	/*
				410	* preemption test can be made between sibling entities who are in the
				411	* same cfs_rq i.e who have a common parent. Walk up the hierarchy of
				412	* both tasks until we find their ancestors who are siblings of common
				413	* parent.
				414	*/
				415
				416	/* First walk up until both entities are at same depth */
				417	se_depth = (*se)->depth;
				418	pse_depth = (*pse)->depth;
				419
				420	while (se_depth > pse_depth) {
				421	se_depth--;
				422	se = parent_entity(se);
				423	}
				424
				425	while (pse_depth > se_depth) {
				426	pse_depth--;
				427	pse = parent_entity(pse);
				428	}
				429
				430	while (!is_same_group(se, pse)) {
				431	se = parent_entity(se);
				432	pse = parent_entity(pse);
				433	}
				434	}
				435
				436	#else /* !CONFIG_FAIR_GROUP_SCHED */
				437
				438	static inline struct task_struct task_of(struct sched_entity se)
				439	{
				440	return container_of(se, struct task_struct, se);
				441	}
				442
/*
 * Single-iteration form: the body runs once for a non-NULL @se, after
 * which @se is set to NULL.
 */
#define for_each_sched_entity(se) \
		for (; se; se = NULL)
				445
				446	static inline struct cfs_rq task_cfs_rq(struct task_struct p)
				447	{
				448	return &task_rq(p)->cfs;
				449	}
				450
				451	static inline struct cfs_rq cfs_rq_of(struct sched_entity se)
				452	{
				453	struct task_struct *p = task_of(se);
				454	struct rq *rq = task_rq(p);
				455
				456	return &rq->cfs;
				457	}
				458
				459	/* runqueue "owned" by this group */
				460	static inline struct cfs_rq group_cfs_rq(struct sched_entity grp)
				461	{
				462	return NULL;
				463	}
				464
/*
 * Write the literal string "(null)" into @path, using at most @len
 * bytes. Does nothing when @path is NULL. @cfs_rq is not used.
 */
static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
{
	if (path)
		strlcpy(path, "(null)", len);
}
				470
				471	static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				472	{
				473	return true;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	474	}
				475
				476	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				477	{
				478	}
				479
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	480	static inline void assert_list_leaf_cfs_rq(struct rq *rq)
				481	{
				482	}
				483
/*
 * Visit the single cfs_rq embedded in @rq exactly once: @pos starts out
 * NULL and becomes the next value of @cfs_rq, which ends the loop.
 */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
				486
				487	static inline struct sched_entity parent_entity(struct sched_entity se)
				488	{
				489	return NULL;
				490	}
				491
				492	static inline void
				493	find_matching_se(struct sched_entity se, struct sched_entity pse)
				494	{
				495	}
				496
				497	#endif /* CONFIG_FAIR_GROUP_SCHED */
				498
				499	static __always_inline
				500	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
				501
				502	/**************************************************************
				503	* Scheduling class tree data structure manipulation methods:
				504	*/
				505
				506	static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
				507	{
				508	s64 delta = (s64)(vruntime - max_vruntime);
				509	if (delta > 0)
				510	max_vruntime = vruntime;
				511
				512	return max_vruntime;
				513	}
				514
				515	static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
				516	{
				517	s64 delta = (s64)(vruntime - min_vruntime);
				518	if (delta < 0)
				519	min_vruntime = vruntime;
				520
				521	return min_vruntime;
				522	}
				523
				524	static inline int entity_before(struct sched_entity *a,
				525	struct sched_entity *b)
				526	{
				527	return (s64)(a->vruntime - b->vruntime) < 0;
				528	}
				529
				530	static void update_min_vruntime(struct cfs_rq *cfs_rq)
				531	{
				532	struct sched_entity *curr = cfs_rq->curr;
				533	struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
				534
				535	u64 vruntime = cfs_rq->min_vruntime;
				536
				537	if (curr) {
				538	if (curr->on_rq)
				539	vruntime = curr->vruntime;
				540	else
				541	curr = NULL;
				542	}
				543
				544	if (leftmost) { /* non-empty tree */
				545	struct sched_entity *se;
				546	se = rb_entry(leftmost, struct sched_entity, run_node);
				547
				548	if (!curr)
				549	vruntime = se->vruntime;
				550	else
				551	vruntime = min_vruntime(vruntime, se->vruntime);
				552	}
				553
				554	/* ensure we never gain time by being placed backwards. */
				555	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
				556	#ifndef CONFIG_64BIT
				557	smp_wmb();
				558	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				559	#endif
				560	}
				561
				562	/*
				563	* Enqueue an entity into the rb-tree:
				564	*/
				565	static void __enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
				566	{
				567	struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
				568	struct rb_node *parent = NULL;
				569	struct sched_entity *entry;
				570	bool leftmost = true;
				571
				572	/*
				573	* Find the right place in the rbtree:
				574	*/
				575	while (*link) {
				576	parent = *link;
				577	entry = rb_entry(parent, struct sched_entity, run_node);
				578	/*
				579	* We dont care about collisions. Nodes with
				580	* the same key stay together.
				581	*/
				582	if (entity_before(se, entry)) {
				583	link = &parent->rb_left;
				584	} else {
				585	link = &parent->rb_right;
				586	leftmost = false;
				587	}
				588	}
				589
				590	rb_link_node(&se->run_node, parent, link);
				591	rb_insert_color_cached(&se->run_node,
				592	&cfs_rq->tasks_timeline, leftmost);
				593	}
				594
				595	static void __dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
				596	{
				597	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
				598	}
				599
				600	struct sched_entity __pick_first_entity(struct cfs_rq cfs_rq)
				601	{
				602	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
				603
				604	if (!left)
				605	return NULL;
				606
				607	return rb_entry(left, struct sched_entity, run_node);
				608	}
				609
				610	static struct sched_entity __pick_next_entity(struct sched_entity se)
				611	{
				612	struct rb_node *next = rb_next(&se->run_node);
				613
				614	if (!next)
				615	return NULL;
				616
				617	return rb_entry(next, struct sched_entity, run_node);
				618	}
				619
				620	#ifdef CONFIG_SCHED_DEBUG
				621	struct sched_entity __pick_last_entity(struct cfs_rq cfs_rq)
				622	{
				623	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
				624
				625	if (!last)
				626	return NULL;
				627
				628	return rb_entry(last, struct sched_entity, run_node);
				629	}
				630
				631	/**************************************************************
				632	* Scheduling class statistics methods:
				633	*/
				634
				635	int sched_proc_update_handler(struct ctl_table *table, int write,
				636	void __user buffer, size_t lenp,
				637	loff_t *ppos)
				638	{
				639	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
				640	unsigned int factor = get_update_sysctl_factor();
				641
				642	if (ret \|\| !write)
				643	return ret;
				644
				645	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
				646	sysctl_sched_min_granularity);
				647
				648	#define WRT_SYSCTL(name) \
				649	(normalized_sysctl_##name = sysctl_##name / (factor))
				650	WRT_SYSCTL(sched_min_granularity);
				651	WRT_SYSCTL(sched_latency);
				652	WRT_SYSCTL(sched_wakeup_granularity);
				653	#undef WRT_SYSCTL
				654
				655	return 0;
				656	}
				657	#endif
				658
				659	/*
				660	* delta /= w
				661	*/
				662	static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
				663	{
				664	if (unlikely(se->load.weight != NICE_0_LOAD))
				665	delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
				666
				667	return delta;
				668	}
				669
				670	/*
				671	* The idea is to set a period in which each task runs once.
				672	*
				673	* When there are too many tasks (sched_nr_latency) we have to stretch
				674	* this period because otherwise the slices get too small.
				675	*
				676	* p = (nr <= nl) ? l : l*nr/nl
				677	*/
				678	static u64 __sched_period(unsigned long nr_running)
				679	{
				680	if (unlikely(nr_running > sched_nr_latency))
				681	return nr_running * sysctl_sched_min_granularity;
				682	else
				683	return sysctl_sched_latency;
				684	}
				685
				686	/*
				687	* We calculate the wall-time slice from the period by taking a part
				688	* proportional to the weight.
				689	*
				690	* s = p*P[w/rw]
				691	*/
				692	static u64 sched_slice(struct cfs_rq cfs_rq, struct sched_entity se)
				693	{
				694	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
				695
				696	for_each_sched_entity(se) {
				697	struct load_weight *load;
				698	struct load_weight lw;
				699
				700	cfs_rq = cfs_rq_of(se);
				701	load = &cfs_rq->load;
				702
				703	if (unlikely(!se->on_rq)) {
				704	lw = cfs_rq->load;
				705
				706	update_load_add(&lw, se->load.weight);
				707	load = &lw;
				708	}
				709	slice = __calc_delta(slice, se->load.weight, load);
				710	}
				711	return slice;
				712	}
				713
				714	/*
				715	* We calculate the vruntime slice of a to-be-inserted task.
				716	*
				717	* vs = s/w
				718	*/
				719	static u64 sched_vslice(struct cfs_rq cfs_rq, struct sched_entity se)
				720	{
				721	return calc_delta_fair(sched_slice(cfs_rq, se), se);
				722	}
				723
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	724	#include "pelt.h"
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	725	#ifdef CONFIG_SMP
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	726
				727	static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
				728	static unsigned long task_h_load(struct task_struct *p);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	729	static unsigned long capacity_of(int cpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	730
				731	/* Give new sched_entity start runnable values to heavy its load in infant time */
				732	void init_entity_runnable_average(struct sched_entity *se)
				733	{
				734	struct sched_avg *sa = &se->avg;
				735
				736	memset(sa, 0, sizeof(*sa));
				737
				738	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	739	* Tasks are initialized with full load to be seen as heavy tasks until
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	740	* they get a chance to stabilize to their real load level.
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	741	* Group entities are initialized with zero load to reflect the fact that
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	742	* nothing has been attached to the task group yet.
				743	*/
				744	if (entity_is_task(se))
				745	sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
				746
				747	se->runnable_weight = se->load.weight;
				748
				749	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
				750	}
				751
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	752	static void attach_entity_cfs_rq(struct sched_entity *se);
				753
				754	/*
				755	* With new tasks being created, their initial util_avgs are extrapolated
				756	* based on the cfs_rq's current util_avg:
				757	*
				758	* util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
				759	*
				760	* However, in many cases, the above util_avg does not give a desired
				761	* value. Moreover, the sum of the util_avgs may be divergent, such
				762	* as when the series is a harmonic series.
				763	*
				764	* To solve this problem, we also cap the util_avg of successive tasks to
				765	* only 1/2 of the left utilization budget:
				766	*
				767	* util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
				768	*
				769	* where n denotes the nth task and cpu_scale the CPU capacity.
				770	*
				771	* For example, for a CPU with 1024 of capacity, a simplest series from
				772	* the beginning would be like:
				773	*
				774	* task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
				775	* cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
				776	*
				777	* Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
				778	* if util_avg > util_avg_cap.
				779	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	780	void post_init_entity_util_avg(struct task_struct *p)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	781	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	782	struct sched_entity *se = &p->se;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	783	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				784	struct sched_avg *sa = &se->avg;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	785	long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	786	long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
				787
				788	if (cap > 0) {
				789	if (cfs_rq->avg.util_avg != 0) {
				790	sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
				791	sa->util_avg /= (cfs_rq->avg.load_avg + 1);
				792
				793	if (sa->util_avg > cap)
				794	sa->util_avg = cap;
				795	} else {
				796	sa->util_avg = cap;
				797	}
				798	}
				799
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	800	if (p->sched_class != &fair_sched_class) {
				801	/*
				802	* For !fair tasks do:
				803	*
				804	update_cfs_rq_load_avg(now, cfs_rq);
				805	attach_entity_load_avg(cfs_rq, se, 0);
				806	switched_from_fair(rq, p);
				807	*
				808	* such that the next switched_to_fair() has the
				809	* expected state.
				810	*/
				811	se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
				812	return;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	813	}
				814
				815	attach_entity_cfs_rq(se);
				816	}
				817
				818	#else /* !CONFIG_SMP */
				819	void init_entity_runnable_average(struct sched_entity *se)
				820	{
				821	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	822	void post_init_entity_util_avg(struct task_struct *p)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	823	{
				824	}
				825	static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
				826	{
				827	}
				828	#endif /* CONFIG_SMP */
				829
				830	/*
				831	* Update the current task's runtime statistics.
				832	*/
				833	static void update_curr(struct cfs_rq *cfs_rq)
				834	{
				835	struct sched_entity *curr = cfs_rq->curr;
				836	u64 now = rq_clock_task(rq_of(cfs_rq));
				837	u64 delta_exec;
				838
				839	if (unlikely(!curr))
				840	return;
				841
				842	delta_exec = now - curr->exec_start;
				843	if (unlikely((s64)delta_exec <= 0))
				844	return;
				845
				846	curr->exec_start = now;
				847
				848	schedstat_set(curr->statistics.exec_max,
				849	max(delta_exec, curr->statistics.exec_max));
				850
				851	curr->sum_exec_runtime += delta_exec;
				852	schedstat_add(cfs_rq->exec_clock, delta_exec);
				853
				854	curr->vruntime += calc_delta_fair(delta_exec, curr);
				855	update_min_vruntime(cfs_rq);
				856
				857	if (entity_is_task(curr)) {
				858	struct task_struct *curtask = task_of(curr);
				859
				860	trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
				861	cgroup_account_cputime(curtask, delta_exec);
				862	account_group_exec_runtime(curtask, delta_exec);
				863	}
				864
				865	account_cfs_rq_runtime(cfs_rq, delta_exec);
				866	}
				867
				868	static void update_curr_fair(struct rq *rq)
				869	{
				870	update_curr(cfs_rq_of(&rq->curr->se));
				871	}
				872
				873	static inline void
				874	update_stats_wait_start(struct cfs_rq cfs_rq, struct sched_entity se)
				875	{
				876	u64 wait_start, prev_wait_start;
				877
				878	if (!schedstat_enabled())
				879	return;
				880
				881	wait_start = rq_clock(rq_of(cfs_rq));
				882	prev_wait_start = schedstat_val(se->statistics.wait_start);
				883
				884	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
				885	likely(wait_start > prev_wait_start))
				886	wait_start -= prev_wait_start;
				887
				888	__schedstat_set(se->statistics.wait_start, wait_start);
				889	}
				890
				891	static inline void
				892	update_stats_wait_end(struct cfs_rq cfs_rq, struct sched_entity se)
				893	{
				894	struct task_struct *p;
				895	u64 delta;
				896
				897	if (!schedstat_enabled())
				898	return;
				899
				900	delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
				901
				902	if (entity_is_task(se)) {
				903	p = task_of(se);
				904	if (task_on_rq_migrating(p)) {
				905	/*
				906	* Preserve migrating task's wait time so wait_start
				907	* time stamp can be adjusted to accumulate wait time
				908	* prior to migration.
				909	*/
				910	__schedstat_set(se->statistics.wait_start, delta);
				911	return;
				912	}
				913	trace_sched_stat_wait(p, delta);
				914	}
				915
				916	__schedstat_set(se->statistics.wait_max,
				917	max(schedstat_val(se->statistics.wait_max), delta));
				918	__schedstat_inc(se->statistics.wait_count);
				919	__schedstat_add(se->statistics.wait_sum, delta);
				920	__schedstat_set(se->statistics.wait_start, 0);
				921	}
				922
				923	static inline void
				924	update_stats_enqueue_sleeper(struct cfs_rq cfs_rq, struct sched_entity se)
				925	{
				926	struct task_struct *tsk = NULL;
				927	u64 sleep_start, block_start;
				928
				929	if (!schedstat_enabled())
				930	return;
				931
				932	sleep_start = schedstat_val(se->statistics.sleep_start);
				933	block_start = schedstat_val(se->statistics.block_start);
				934
				935	if (entity_is_task(se))
				936	tsk = task_of(se);
				937
				938	if (sleep_start) {
				939	u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
				940
				941	if ((s64)delta < 0)
				942	delta = 0;
				943
				944	if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
				945	__schedstat_set(se->statistics.sleep_max, delta);
				946
				947	__schedstat_set(se->statistics.sleep_start, 0);
				948	__schedstat_add(se->statistics.sum_sleep_runtime, delta);
				949
				950	if (tsk) {
				951	account_scheduler_latency(tsk, delta >> 10, 1);
				952	trace_sched_stat_sleep(tsk, delta);
				953	}
				954	}
				955	if (block_start) {
				956	u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
				957
				958	if ((s64)delta < 0)
				959	delta = 0;
				960
				961	if (unlikely(delta > schedstat_val(se->statistics.block_max)))
				962	__schedstat_set(se->statistics.block_max, delta);
				963
				964	__schedstat_set(se->statistics.block_start, 0);
				965	__schedstat_add(se->statistics.sum_sleep_runtime, delta);
				966
				967	if (tsk) {
				968	if (tsk->in_iowait) {
				969	__schedstat_add(se->statistics.iowait_sum, delta);
				970	__schedstat_inc(se->statistics.iowait_count);
				971	trace_sched_stat_iowait(tsk, delta);
				972	}
				973
				974	trace_sched_stat_blocked(tsk, delta);
				975
				976	/*
				977	* Blocking time is in units of nanosecs, so shift by
				978	* 20 to get a milliseconds-range estimation of the
				979	* amount of time that the task spent sleeping:
				980	*/
				981	if (unlikely(prof_on == SLEEP_PROFILING)) {
				982	profile_hits(SLEEP_PROFILING,
				983	(void *)get_wchan(tsk),
				984	delta >> 20);
				985	}
				986	account_scheduler_latency(tsk, delta >> 10, 0);
				987	}
				988	}
				989	}
				990
				991	/*
				992	* Task is being enqueued - update stats:
				993	*/
				994	static inline void
				995	update_stats_enqueue(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
				996	{
				997	if (!schedstat_enabled())
				998	return;
				999
				1000	/*
				1001	* Are we enqueueing a waiting task? (for current tasks
				1002	* a dequeue/enqueue event is a NOP)
				1003	*/
				1004	if (se != cfs_rq->curr)
				1005	update_stats_wait_start(cfs_rq, se);
				1006
				1007	if (flags & ENQUEUE_WAKEUP)
				1008	update_stats_enqueue_sleeper(cfs_rq, se);
				1009	}
				1010
				1011	static inline void
				1012	update_stats_dequeue(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
				1013	{
				1014
				1015	if (!schedstat_enabled())
				1016	return;
				1017
				1018	/*
				1019	* Mark the end of the wait period if dequeueing a
				1020	* waiting task:
				1021	*/
				1022	if (se != cfs_rq->curr)
				1023	update_stats_wait_end(cfs_rq, se);
				1024
				1025	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
				1026	struct task_struct *tsk = task_of(se);
				1027
				1028	if (tsk->state & TASK_INTERRUPTIBLE)
				1029	__schedstat_set(se->statistics.sleep_start,
				1030	rq_clock(rq_of(cfs_rq)));
				1031	if (tsk->state & TASK_UNINTERRUPTIBLE)
				1032	__schedstat_set(se->statistics.block_start,
				1033	rq_clock(rq_of(cfs_rq)));
				1034	}
				1035	}
				1036
				1037	/*
				1038	* We are picking a new current task - update its stats:
				1039	*/
				1040	static inline void
				1041	update_stats_curr_start(struct cfs_rq cfs_rq, struct sched_entity se)
				1042	{
				1043	/*
				1044	* We are starting a new run period:
				1045	*/
				1046	se->exec_start = rq_clock_task(rq_of(cfs_rq));
				1047	}
				1048
				1049	/**************************************************
				1050	* Scheduling class queueing methods:
				1051	*/
				1052
				1053	#ifdef CONFIG_NUMA_BALANCING
				1054	/*
				1055	* Approximate time to scan a full NUMA task in ms. The task scan period is
				1056	* calculated based on the tasks virtual memory size and
				1057	* numa_balancing_scan_size.
				1058	*/
				1059	unsigned int sysctl_numa_balancing_scan_period_min = 1000;
				1060	unsigned int sysctl_numa_balancing_scan_period_max = 60000;
				1061
				1062	/* Portion of address space to scan in MB */
				1063	unsigned int sysctl_numa_balancing_scan_size = 256;
				1064
				1065	/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
				1066	unsigned int sysctl_numa_balancing_scan_delay = 1000;
				1067
				1068	struct numa_group {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1069	refcount_t refcount;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1070
				1071	spinlock_t lock; /* nr_tasks, tasks */
				1072	int nr_tasks;
				1073	pid_t gid;
				1074	int active_nodes;
				1075
				1076	struct rcu_head rcu;
				1077	unsigned long total_faults;
				1078	unsigned long max_faults_cpu;
				1079	/*
				1080	* Faults_cpu is used to decide whether memory should move
				1081	* towards the CPU. As a consequence, these stats are weighted
				1082	* more by CPU use than by memory faults.
				1083	*/
				1084	unsigned long *faults_cpu;
				1085	unsigned long faults[0];
				1086	};
				1087
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1088	/*
				1089	* For functions that can be called in multiple contexts that permit reading
				1090	* ->numa_group (see struct task_struct for locking rules).
				1091	*/
				1092	static struct numa_group deref_task_numa_group(struct task_struct p)
				1093	{
				1094	return rcu_dereference_check(p->numa_group, p == current \|\|
				1095	(lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
				1096	}
				1097
				1098	static struct numa_group deref_curr_numa_group(struct task_struct p)
				1099	{
				1100	return rcu_dereference_protected(p->numa_group, p == current);
				1101	}
				1102
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1103	static inline unsigned long group_faults_priv(struct numa_group *ng);
				1104	static inline unsigned long group_faults_shared(struct numa_group *ng);
				1105
				1106	static unsigned int task_nr_scan_windows(struct task_struct *p)
				1107	{
				1108	unsigned long rss = 0;
				1109	unsigned long nr_scan_pages;
				1110
				1111	/*
				1112	* Calculations based on RSS as non-present and empty pages are skipped
				1113	* by the PTE scanner and NUMA hinting faults should be trapped based
				1114	* on resident pages
				1115	*/
				1116	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
				1117	rss = get_mm_rss(p->mm);
				1118	if (!rss)
				1119	rss = nr_scan_pages;
				1120
				1121	rss = round_up(rss, nr_scan_pages);
				1122	return rss / nr_scan_pages;
				1123	}
				1124
				1125	/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
				1126	#define MAX_SCAN_WINDOW 2560
				1127
				1128	static unsigned int task_scan_min(struct task_struct *p)
				1129	{
				1130	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
				1131	unsigned int scan, floor;
				1132	unsigned int windows = 1;
				1133
				1134	if (scan_size < MAX_SCAN_WINDOW)
				1135	windows = MAX_SCAN_WINDOW / scan_size;
				1136	floor = 1000 / windows;
				1137
				1138	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
				1139	return max_t(unsigned int, floor, scan);
				1140	}
				1141
				1142	static unsigned int task_scan_start(struct task_struct *p)
				1143	{
				1144	unsigned long smin = task_scan_min(p);
				1145	unsigned long period = smin;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1146	struct numa_group *ng;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1147
				1148	/* Scale the maximum scan period with the amount of shared memory. */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1149	rcu_read_lock();
				1150	ng = rcu_dereference(p->numa_group);
				1151	if (ng) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1152	unsigned long shared = group_faults_shared(ng);
				1153	unsigned long private = group_faults_priv(ng);
				1154
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1155	period *= refcount_read(&ng->refcount);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1156	period *= shared + 1;
				1157	period /= private + shared + 1;
				1158	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1159	rcu_read_unlock();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1160
				1161	return max(smin, period);
				1162	}
				1163
				1164	static unsigned int task_scan_max(struct task_struct *p)
				1165	{
				1166	unsigned long smin = task_scan_min(p);
				1167	unsigned long smax;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1168	struct numa_group *ng;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1169
				1170	/* Watch for min being lower than max due to floor calculations */
				1171	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
				1172
				1173	/* Scale the maximum scan period with the amount of shared memory. */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1174	ng = deref_curr_numa_group(p);
				1175	if (ng) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1176	unsigned long shared = group_faults_shared(ng);
				1177	unsigned long private = group_faults_priv(ng);
				1178	unsigned long period = smax;
				1179
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1180	period *= refcount_read(&ng->refcount);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1181	period *= shared + 1;
				1182	period /= private + shared + 1;
				1183
				1184	smax = max(smax, period);
				1185	}
				1186
				1187	return max(smin, smax);
				1188	}
				1189
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1190	static void account_numa_enqueue(struct rq rq, struct task_struct p)
				1191	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1192	rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1193	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
				1194	}
				1195
				1196	static void account_numa_dequeue(struct rq rq, struct task_struct p)
				1197	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1198	rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1199	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
				1200	}
				1201
				1202	/* Shared or private faults. */
				1203	#define NR_NUMA_HINT_FAULT_TYPES 2
				1204
				1205	/* Memory and CPU locality */
				1206	#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
				1207
				1208	/* Averaged statistics, and temporary buffers. */
				1209	#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
				1210
				1211	pid_t task_numa_group_id(struct task_struct *p)
				1212	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1213	struct numa_group *ng;
				1214	pid_t gid = 0;
				1215
				1216	rcu_read_lock();
				1217	ng = rcu_dereference(p->numa_group);
				1218	if (ng)
				1219	gid = ng->gid;
				1220	rcu_read_unlock();
				1221
				1222	return gid;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1223	}
				1224
				1225	/*
				1226	* The averaged statistics, shared & private, memory & CPU,
				1227	* occupy the first half of the array. The second half of the
				1228	* array is for current counters, which are averaged into the
				1229	* first set by task_numa_placement.
				1230	*/
				1231	static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
				1232	{
				1233	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
				1234	}
				1235
				1236	static inline unsigned long task_faults(struct task_struct *p, int nid)
				1237	{
				1238	if (!p->numa_faults)
				1239	return 0;
				1240
				1241	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
				1242	p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
				1243	}
				1244
				1245	static inline unsigned long group_faults(struct task_struct *p, int nid)
				1246	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1247	struct numa_group *ng = deref_task_numa_group(p);
				1248
				1249	if (!ng)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1250	return 0;
				1251
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1252	return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
				1253	ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1254	}
				1255
				1256	static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
				1257	{
				1258	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
				1259	group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
				1260	}
				1261
				1262	static inline unsigned long group_faults_priv(struct numa_group *ng)
				1263	{
				1264	unsigned long faults = 0;
				1265	int node;
				1266
				1267	for_each_online_node(node) {
				1268	faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
				1269	}
				1270
				1271	return faults;
				1272	}
				1273
				1274	static inline unsigned long group_faults_shared(struct numa_group *ng)
				1275	{
				1276	unsigned long faults = 0;
				1277	int node;
				1278
				1279	for_each_online_node(node) {
				1280	faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
				1281	}
				1282
				1283	return faults;
				1284	}
				1285
				1286	/*
				1287	* A node triggering more than 1/3 as many NUMA faults as the maximum is
				1288	* considered part of a numa group's pseudo-interleaving set. Migrations
				1289	* between these nodes are slowed down, to allow things to settle down.
				1290	*/
				1291	#define ACTIVE_NODE_FRACTION 3
				1292
				1293	static bool numa_is_active_node(int nid, struct numa_group *ng)
				1294	{
				1295	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
				1296	}
				1297
				1298	/* Handle placement on systems where not all nodes are directly connected. */
				1299	static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
				1300	int maxdist, bool task)
				1301	{
				1302	unsigned long score = 0;
				1303	int node;
				1304
				1305	/*
				1306	* All nodes are directly connected, and the same distance
				1307	* from each other. No need for fancy placement algorithms.
				1308	*/
				1309	if (sched_numa_topology_type == NUMA_DIRECT)
				1310	return 0;
				1311
				1312	/*
				1313	* This code is called for each node, introducing N^2 complexity,
				1314	* which should be ok given the number of nodes rarely exceeds 8.
				1315	*/
				1316	for_each_online_node(node) {
				1317	unsigned long faults;
				1318	int dist = node_distance(nid, node);
				1319
				1320	/*
				1321	* The furthest away nodes in the system are not interesting
				1322	* for placement; nid was already counted.
				1323	*/
				1324	if (dist == sched_max_numa_distance \|\| node == nid)
				1325	continue;
				1326
				1327	/*
				1328	* On systems with a backplane NUMA topology, compare groups
				1329	* of nodes, and move tasks towards the group with the most
				1330	* memory accesses. When comparing two nodes at distance
				1331	* "hoplimit", only nodes closer by than "hoplimit" are part
				1332	* of each group. Skip other nodes.
				1333	*/
				1334	if (sched_numa_topology_type == NUMA_BACKPLANE &&
				1335	dist >= maxdist)
				1336	continue;
				1337
				1338	/* Add up the faults from nearby nodes. */
				1339	if (task)
				1340	faults = task_faults(p, node);
				1341	else
				1342	faults = group_faults(p, node);
				1343
				1344	/*
				1345	* On systems with a glueless mesh NUMA topology, there are
				1346	* no fixed "groups of nodes". Instead, nodes that are not
				1347	* directly connected bounce traffic through intermediate
				1348	* nodes; a numa_group can occupy any set of nodes.
				1349	* The further away a node is, the less the faults count.
				1350	* This seems to result in good task placement.
				1351	*/
				1352	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
				1353	faults *= (sched_max_numa_distance - dist);
				1354	faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
				1355	}
				1356
				1357	score += faults;
				1358	}
				1359
				1360	return score;
				1361	}
				1362
				1363	/*
				1364	* These return the fraction of accesses done by a particular task, or
				1365	* task group, on a particular numa node. The group weight is given a
				1366	* larger multiplier, in order to group tasks together that are almost
				1367	* evenly spread out between numa nodes.
				1368	*/
				1369	static inline unsigned long task_weight(struct task_struct *p, int nid,
				1370	int dist)
				1371	{
				1372	unsigned long faults, total_faults;
				1373
				1374	if (!p->numa_faults)
				1375	return 0;
				1376
				1377	total_faults = p->total_numa_faults;
				1378
				1379	if (!total_faults)
				1380	return 0;
				1381
				1382	faults = task_faults(p, nid);
				1383	faults += score_nearby_nodes(p, nid, dist, true);
				1384
				1385	return 1000 * faults / total_faults;
				1386	}
				1387
				1388	static inline unsigned long group_weight(struct task_struct *p, int nid,
				1389	int dist)
				1390	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1391	struct numa_group *ng = deref_task_numa_group(p);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1392	unsigned long faults, total_faults;
				1393
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1394	if (!ng)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1395	return 0;
				1396
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1397	total_faults = ng->total_faults;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1398
				1399	if (!total_faults)
				1400	return 0;
				1401
				1402	faults = group_faults(p, nid);
				1403	faults += score_nearby_nodes(p, nid, dist, false);
				1404
				1405	return 1000 * faults / total_faults;
				1406	}
				1407
				1408	bool should_numa_migrate_memory(struct task_struct p, struct page page,
				1409	int src_nid, int dst_cpu)
				1410	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1411	struct numa_group *ng = deref_curr_numa_group(p);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1412	int dst_nid = cpu_to_node(dst_cpu);
				1413	int last_cpupid, this_cpupid;
				1414
				1415	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
				1416	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
				1417
				1418	/*
				1419	* Allow first faults or private faults to migrate immediately early in
				1420	* the lifetime of a task. The magic number 4 is based on waiting for
				1421	* two full passes of the "multi-stage node selection" test that is
				1422	* executed below.
				1423	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1424	if ((p->numa_preferred_nid == NUMA_NO_NODE \|\| p->numa_scan_seq <= 4) &&
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1425	(cpupid_pid_unset(last_cpupid) \|\| cpupid_match_pid(p, last_cpupid)))
				1426	return true;
				1427
				1428	/*
				1429	* Multi-stage node selection is used in conjunction with a periodic
				1430	* migration fault to build a temporal task<->page relation. By using
				1431	* a two-stage filter we remove short/unlikely relations.
				1432	*
				1433	* Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
				1434	* a task's usage of a particular page (n_p) per total usage of this
				1435	* page (n_t) (in a given time-span) to a probability.
				1436	*
				1437	* Our periodic faults will sample this probability and getting the
				1438	* same result twice in a row, given these samples are fully
				1439	* independent, is then given by P(n)^2, provided our sample period
				1440	* is sufficiently short compared to the usage pattern.
				1441	*
				1442	* This quadric squishes small probabilities, making it less likely we
				1443	* act on an unlikely task<->page relation.
				1444	*/
				1445	if (!cpupid_pid_unset(last_cpupid) &&
				1446	cpupid_to_nid(last_cpupid) != dst_nid)
				1447	return false;
				1448
				1449	/* Always allow migrate on private faults */
				1450	if (cpupid_match_pid(p, last_cpupid))
				1451	return true;
				1452
				1453	/* A shared fault, but p->numa_group has not been set up yet. */
				1454	if (!ng)
				1455	return true;
				1456
				1457	/*
				1458	* Destination node is much more heavily used than the source
				1459	* node? Allow migration.
				1460	*/
				1461	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
				1462	ACTIVE_NODE_FRACTION)
				1463	return true;
				1464
				1465	/*
				1466	* Distribute memory according to CPU & memory use on each node,
				1467	* with 3/4 hysteresis to avoid unnecessary memory migrations:
				1468	*
				1469	* faults_cpu(dst) 3 faults_cpu(src)
				1470	* --------------- * - > ---------------
				1471	* faults_mem(dst) 4 faults_mem(src)
				1472	*/
				1473	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
				1474	group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
				1475	}
				1476
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1477	static unsigned long cpu_runnable_load(struct rq *rq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1478
				1479	/* Cached statistics for all CPUs within a node */
				1480	struct numa_stats {
				1481	unsigned long load;
				1482
				1483	/* Total compute capacity of CPUs on a node */
				1484	unsigned long compute_capacity;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1485	};
				1486
				1487	/*
				1488	* XXX borrowed from update_sg_lb_stats
				1489	*/
				1490	static void update_numa_stats(struct numa_stats *ns, int nid)
				1491	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1492	int cpu;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1493
				1494	memset(ns, 0, sizeof(*ns));
				1495	for_each_cpu(cpu, cpumask_of_node(nid)) {
				1496	struct rq *rq = cpu_rq(cpu);
				1497
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1498	ns->load += cpu_runnable_load(rq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1499	ns->compute_capacity += capacity_of(cpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1500	}
				1501
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1502	}
				1503
				1504	struct task_numa_env {
				1505	struct task_struct *p;
				1506
				1507	int src_cpu, src_nid;
				1508	int dst_cpu, dst_nid;
				1509
				1510	struct numa_stats src_stats, dst_stats;
				1511
				1512	int imbalance_pct;
				1513	int dist;
				1514
				1515	struct task_struct *best_task;
				1516	long best_imp;
				1517	int best_cpu;
				1518	};
				1519
				1520	static void task_numa_assign(struct task_numa_env *env,
				1521	struct task_struct *p, long imp)
				1522	{
				1523	struct rq *rq = cpu_rq(env->dst_cpu);
				1524
				1525	/* Bail out if run-queue part of active NUMA balance. */
				1526	if (xchg(&rq->numa_migrate_on, 1))
				1527	return;
				1528
				1529	/*
				1530	* Clear previous best_cpu/rq numa-migrate flag, since task now
				1531	* found a better CPU to move/swap.
				1532	*/
				1533	if (env->best_cpu != -1) {
				1534	rq = cpu_rq(env->best_cpu);
				1535	WRITE_ONCE(rq->numa_migrate_on, 0);
				1536	}
				1537
				1538	if (env->best_task)
				1539	put_task_struct(env->best_task);
				1540	if (p)
				1541	get_task_struct(p);
				1542
				1543	env->best_task = p;
				1544	env->best_imp = imp;
				1545	env->best_cpu = env->dst_cpu;
				1546	}
				1547
				1548	static bool load_too_imbalanced(long src_load, long dst_load,
				1549	struct task_numa_env *env)
				1550	{
				1551	long imb, old_imb;
				1552	long orig_src_load, orig_dst_load;
				1553	long src_capacity, dst_capacity;
				1554
				1555	/*
				1556	* The load is corrected for the CPU capacity available on each node.
				1557	*
				1558	* src_load dst_load
				1559	* ------------ vs ---------
				1560	* src_capacity dst_capacity
				1561	*/
				1562	src_capacity = env->src_stats.compute_capacity;
				1563	dst_capacity = env->dst_stats.compute_capacity;
				1564
				1565	imb = abs(dst_load * src_capacity - src_load * dst_capacity);
				1566
				1567	orig_src_load = env->src_stats.load;
				1568	orig_dst_load = env->dst_stats.load;
				1569
				1570	old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
				1571
				1572	/* Would this change make things worse? */
				1573	return (imb > old_imb);
				1574	}
				1575
				1576	/*
				1577	* Maximum NUMA importance can be 1998 (2*999);
				1578	* SMALLIMP @ 30 would be close to 1998/64.
				1579	* Used to deter task migration.
				1580	*/
				1581	#define SMALLIMP 30
				1582
				1583	/*
				1584	* This checks if the overall compute and NUMA accesses of the system would
				1585	* be improved if the source tasks was migrated to the target dst_cpu taking
				1586	* into account that it might be best if task running on the dst_cpu should
				1587	* be exchanged with the source task
				1588	*/
				1589	static void task_numa_compare(struct task_numa_env *env,
				1590	long taskimp, long groupimp, bool maymove)
				1591	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1592	struct numa_group cur_ng, p_ng = deref_curr_numa_group(env->p);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1593	struct rq *dst_rq = cpu_rq(env->dst_cpu);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1594	long imp = p_ng ? groupimp : taskimp;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1595	struct task_struct *cur;
				1596	long src_load, dst_load;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1597	int dist = env->dist;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1598	long moveimp = imp;
				1599	long load;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1600
				1601	if (READ_ONCE(dst_rq->numa_migrate_on))
				1602	return;
				1603
				1604	rcu_read_lock();
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1605	cur = rcu_dereference(dst_rq->curr);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1606	if (cur && ((cur->flags & PF_EXITING) \|\| is_idle_task(cur)))
				1607	cur = NULL;
				1608
				1609	/*
				1610	* Because we have preemption enabled we can get migrated around and
				1611	* end try selecting ourselves (current == env->p) as a swap candidate.
				1612	*/
				1613	if (cur == env->p)
				1614	goto unlock;
				1615
				1616	if (!cur) {
				1617	if (maymove && moveimp >= env->best_imp)
				1618	goto assign;
				1619	else
				1620	goto unlock;
				1621	}
				1622
				1623	/*
				1624	* "imp" is the fault differential for the source task between the
				1625	* source and destination node. Calculate the total differential for
				1626	* the source task and potential destination task. The more negative
				1627	* the value is, the more remote accesses that would be expected to
				1628	* be incurred if the tasks were swapped.
				1629	*/
				1630	/* Skip this swap candidate if cannot move to the source cpu */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1631	if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1632	goto unlock;
				1633
				1634	/*
				1635	* If dst and source tasks are in the same NUMA group, or not
				1636	* in any group then look only at task weights.
				1637	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1638	cur_ng = rcu_dereference(cur->numa_group);
				1639	if (cur_ng == p_ng) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1640	imp = taskimp + task_weight(cur, env->src_nid, dist) -
				1641	task_weight(cur, env->dst_nid, dist);
				1642	/*
				1643	* Add some hysteresis to prevent swapping the
				1644	* tasks within a group over tiny differences.
				1645	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1646	if (cur_ng)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1647	imp -= imp / 16;
				1648	} else {
				1649	/*
				1650	* Compare the group weights. If a task is all by itself
				1651	* (not part of a group), use the task weight instead.
				1652	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1653	if (cur_ng && p_ng)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1654	imp += group_weight(cur, env->src_nid, dist) -
				1655	group_weight(cur, env->dst_nid, dist);
				1656	else
				1657	imp += task_weight(cur, env->src_nid, dist) -
				1658	task_weight(cur, env->dst_nid, dist);
				1659	}
				1660
				1661	if (maymove && moveimp > imp && moveimp > env->best_imp) {
				1662	imp = moveimp;
				1663	cur = NULL;
				1664	goto assign;
				1665	}
				1666
				1667	/*
				1668	* If the NUMA importance is less than SMALLIMP,
				1669	* task migration might only result in ping pong
				1670	* of tasks and also hurt performance due to cache
				1671	* misses.
				1672	*/
				1673	if (imp < SMALLIMP \|\| imp <= env->best_imp + SMALLIMP / 2)
				1674	goto unlock;
				1675
				1676	/*
				1677	* In the overloaded case, try and keep the load balanced.
				1678	*/
				1679	load = task_h_load(env->p) - task_h_load(cur);
				1680	if (!load)
				1681	goto assign;
				1682
				1683	dst_load = env->dst_stats.load + load;
				1684	src_load = env->src_stats.load - load;
				1685
				1686	if (load_too_imbalanced(src_load, dst_load, env))
				1687	goto unlock;
				1688
				1689	assign:
				1690	/*
				1691	* One idle CPU per node is evaluated for a task numa move.
				1692	* Call select_idle_sibling to maybe find a better one.
				1693	*/
				1694	if (!cur) {
				1695	/*
				1696	* select_idle_siblings() uses an per-CPU cpumask that
				1697	* can be used from IRQ context.
				1698	*/
				1699	local_irq_disable();
				1700	env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
				1701	env->dst_cpu);
				1702	local_irq_enable();
				1703	}
				1704
				1705	task_numa_assign(env, cur, imp);
				1706	unlock:
				1707	rcu_read_unlock();
				1708	}
				1709
				1710	static void task_numa_find_cpu(struct task_numa_env *env,
				1711	long taskimp, long groupimp)
				1712	{
				1713	long src_load, dst_load, load;
				1714	bool maymove = false;
				1715	int cpu;
				1716
				1717	load = task_h_load(env->p);
				1718	dst_load = env->dst_stats.load + load;
				1719	src_load = env->src_stats.load - load;
				1720
				1721	/*
				1722	* If the improvement from just moving env->p direction is better
				1723	* than swapping tasks around, check if a move is possible.
				1724	*/
				1725	maymove = !load_too_imbalanced(src_load, dst_load, env);
				1726
				1727	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
				1728	/* Skip this CPU if the source task cannot migrate */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1729	if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1730	continue;
				1731
				1732	env->dst_cpu = cpu;
				1733	task_numa_compare(env, taskimp, groupimp, maymove);
				1734	}
				1735	}
				1736
				1737	static int task_numa_migrate(struct task_struct *p)
				1738	{
				1739	struct task_numa_env env = {
				1740	.p = p,
				1741
				1742	.src_cpu = task_cpu(p),
				1743	.src_nid = task_node(p),
				1744
				1745	.imbalance_pct = 112,
				1746
				1747	.best_task = NULL,
				1748	.best_imp = 0,
				1749	.best_cpu = -1,
				1750	};
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1751	unsigned long taskweight, groupweight;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1752	struct sched_domain *sd;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1753	long taskimp, groupimp;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1754	struct numa_group *ng;
				1755	struct rq *best_rq;
				1756	int nid, ret, dist;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1757
				1758	/*
				1759	* Pick the lowest SD_NUMA domain, as that would have the smallest
				1760	* imbalance and would be the first to start moving tasks about.
				1761	*
				1762	* And we want to avoid any moving of tasks about, as that would create
				1763	* random movement of tasks -- counter the numa conditions we're trying
				1764	* to satisfy here.
				1765	*/
				1766	rcu_read_lock();
				1767	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
				1768	if (sd)
				1769	env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
				1770	rcu_read_unlock();
				1771
				1772	/*
				1773	* Cpusets can break the scheduler domain tree into smaller
				1774	* balance domains, some of which do not cross NUMA boundaries.
				1775	* Tasks that are "trapped" in such domains cannot be migrated
				1776	* elsewhere, so there is no point in (re)trying.
				1777	*/
				1778	if (unlikely(!sd)) {
				1779	sched_setnuma(p, task_node(p));
				1780	return -EINVAL;
				1781	}
				1782
				1783	env.dst_nid = p->numa_preferred_nid;
				1784	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
				1785	taskweight = task_weight(p, env.src_nid, dist);
				1786	groupweight = group_weight(p, env.src_nid, dist);
				1787	update_numa_stats(&env.src_stats, env.src_nid);
				1788	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
				1789	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
				1790	update_numa_stats(&env.dst_stats, env.dst_nid);
				1791
				1792	/* Try to find a spot on the preferred nid. */
				1793	task_numa_find_cpu(&env, taskimp, groupimp);
				1794
				1795	/*
				1796	* Look at other nodes in these cases:
				1797	* - there is no space available on the preferred_nid
				1798	* - the task is part of a numa_group that is interleaved across
				1799	* multiple NUMA nodes; in order to better consolidate the group,
				1800	* we need to check other locations.
				1801	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1802	ng = deref_curr_numa_group(p);
				1803	if (env.best_cpu == -1 \|\| (ng && ng->active_nodes > 1)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1804	for_each_online_node(nid) {
				1805	if (nid == env.src_nid \|\| nid == p->numa_preferred_nid)
				1806	continue;
				1807
				1808	dist = node_distance(env.src_nid, env.dst_nid);
				1809	if (sched_numa_topology_type == NUMA_BACKPLANE &&
				1810	dist != env.dist) {
				1811	taskweight = task_weight(p, env.src_nid, dist);
				1812	groupweight = group_weight(p, env.src_nid, dist);
				1813	}
				1814
				1815	/* Only consider nodes where both task and groups benefit */
				1816	taskimp = task_weight(p, nid, dist) - taskweight;
				1817	groupimp = group_weight(p, nid, dist) - groupweight;
				1818	if (taskimp < 0 && groupimp < 0)
				1819	continue;
				1820
				1821	env.dist = dist;
				1822	env.dst_nid = nid;
				1823	update_numa_stats(&env.dst_stats, env.dst_nid);
				1824	task_numa_find_cpu(&env, taskimp, groupimp);
				1825	}
				1826	}
				1827
				1828	/*
				1829	* If the task is part of a workload that spans multiple NUMA nodes,
				1830	* and is migrating into one of the workload's active nodes, remember
				1831	* this node as the task's preferred numa node, so the workload can
				1832	* settle down.
				1833	* A task that migrated to a second choice node will be better off
				1834	* trying for a better one later. Do not set the preferred node here.
				1835	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1836	if (ng) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1837	if (env.best_cpu == -1)
				1838	nid = env.src_nid;
				1839	else
				1840	nid = cpu_to_node(env.best_cpu);
				1841
				1842	if (nid != p->numa_preferred_nid)
				1843	sched_setnuma(p, nid);
				1844	}
				1845
				1846	/* No better CPU than the current one was found. */
				1847	if (env.best_cpu == -1)
				1848	return -EAGAIN;
				1849
				1850	best_rq = cpu_rq(env.best_cpu);
				1851	if (env.best_task == NULL) {
				1852	ret = migrate_task_to(p, env.best_cpu);
				1853	WRITE_ONCE(best_rq->numa_migrate_on, 0);
				1854	if (ret != 0)
				1855	trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
				1856	return ret;
				1857	}
				1858
				1859	ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
				1860	WRITE_ONCE(best_rq->numa_migrate_on, 0);
				1861
				1862	if (ret != 0)
				1863	trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
				1864	put_task_struct(env.best_task);
				1865	return ret;
				1866	}
				1867
				1868	/* Attempt to migrate a task to a CPU on the preferred node. */
				1869	static void numa_migrate_preferred(struct task_struct *p)
				1870	{
				1871	unsigned long interval = HZ;
				1872
				1873	/* This task has no NUMA fault statistics yet */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1874	if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE \|\| !p->numa_faults))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1875	return;
				1876
				1877	/* Periodically retry migrating the task to the preferred node */
				1878	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
				1879	p->numa_migrate_retry = jiffies + interval;
				1880
				1881	/* Success if task is already running on preferred CPU */
				1882	if (task_node(p) == p->numa_preferred_nid)
				1883	return;
				1884
				1885	/* Otherwise, try migrate to a CPU on the preferred node */
				1886	task_numa_migrate(p);
				1887	}
				1888
				1889	/*
				1890	* Find out how many nodes on the workload is actively running on. Do this by
				1891	* tracking the nodes from which NUMA hinting faults are triggered. This can
				1892	* be different from the set of nodes where the workload's memory is currently
				1893	* located.
				1894	*/
				1895	static void numa_group_count_active_nodes(struct numa_group *numa_group)
				1896	{
				1897	unsigned long faults, max_faults = 0;
				1898	int nid, active_nodes = 0;
				1899
				1900	for_each_online_node(nid) {
				1901	faults = group_faults_cpu(numa_group, nid);
				1902	if (faults > max_faults)
				1903	max_faults = faults;
				1904	}
				1905
				1906	for_each_online_node(nid) {
				1907	faults = group_faults_cpu(numa_group, nid);
				1908	if (faults * ACTIVE_NODE_FRACTION > max_faults)
				1909	active_nodes++;
				1910	}
				1911
				1912	numa_group->max_faults_cpu = max_faults;
				1913	numa_group->active_nodes = active_nodes;
				1914	}
				1915
				1916	/*
				1917	* When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
				1918	* increments. The more local the fault statistics are, the higher the scan
				1919	* period will be for the next scan window. If local/(local+remote) ratio is
				1920	* below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
				1921	* the scan period will decrease. Aim for 70% local accesses.
				1922	*/
				1923	#define NUMA_PERIOD_SLOTS 10
				1924	#define NUMA_PERIOD_THRESHOLD 7
				1925
				1926	/*
				1927	* Increase the scan period (slow down scanning) if the majority of
				1928	* our memory is already on our local node, or if the majority of
				1929	* the page accesses are shared with other processes.
				1930	* Otherwise, decrease the scan period.
				1931	*/
				1932	static void update_task_scan_period(struct task_struct *p,
				1933	unsigned long shared, unsigned long private)
				1934	{
				1935	unsigned int period_slot;
				1936	int lr_ratio, ps_ratio;
				1937	int diff;
				1938
				1939	unsigned long remote = p->numa_faults_locality[0];
				1940	unsigned long local = p->numa_faults_locality[1];
				1941
				1942	/*
				1943	* If there were no record hinting faults then either the task is
				1944	* completely idle or all activity is areas that are not of interest
				1945	* to automatic numa balancing. Related to that, if there were failed
				1946	* migration then it implies we are migrating too quickly or the local
				1947	* node is overloaded. In either case, scan slower
				1948	*/
				1949	if (local + shared == 0 \|\| p->numa_faults_locality[2]) {
				1950	p->numa_scan_period = min(p->numa_scan_period_max,
				1951	p->numa_scan_period << 1);
				1952
				1953	p->mm->numa_next_scan = jiffies +
				1954	msecs_to_jiffies(p->numa_scan_period);
				1955
				1956	return;
				1957	}
				1958
				1959	/*
				1960	* Prepare to scale scan period relative to the current period.
				1961	* == NUMA_PERIOD_THRESHOLD scan period stays the same
				1962	* < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
				1963	* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
				1964	*/
				1965	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
				1966	lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
				1967	ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
				1968
				1969	if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
				1970	/*
				1971	* Most memory accesses are local. There is no need to
				1972	* do fast NUMA scanning, since memory is already local.
				1973	*/
				1974	int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
				1975	if (!slot)
				1976	slot = 1;
				1977	diff = slot * period_slot;
				1978	} else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
				1979	/*
				1980	* Most memory accesses are shared with other tasks.
				1981	* There is no point in continuing fast NUMA scanning,
				1982	* since other tasks may just move the memory elsewhere.
				1983	*/
				1984	int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
				1985	if (!slot)
				1986	slot = 1;
				1987	diff = slot * period_slot;
				1988	} else {
				1989	/*
				1990	* Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
				1991	* yet they are not on the local NUMA node. Speed up
				1992	* NUMA scanning to get the memory moved over.
				1993	*/
				1994	int ratio = max(lr_ratio, ps_ratio);
				1995	diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
				1996	}
				1997
				1998	p->numa_scan_period = clamp(p->numa_scan_period + diff,
				1999	task_scan_min(p), task_scan_max(p));
				2000	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
				2001	}
				2002
				2003	/*
				2004	* Get the fraction of time the task has been running since the last
				2005	* NUMA placement cycle. The scheduler keeps similar statistics, but
				2006	* decays those on a 32ms period, which is orders of magnitude off
				2007	* from the dozens-of-seconds NUMA balancing period. Use the scheduler
				2008	* stats only if the task is so new there are no NUMA statistics yet.
				2009	*/
				2010	static u64 numa_get_avg_runtime(struct task_struct p, u64 period)
				2011	{
				2012	u64 runtime, delta, now;
				2013	/* Use the start of this time slice to avoid calculations. */
				2014	now = p->se.exec_start;
				2015	runtime = p->se.sum_exec_runtime;
				2016
				2017	if (p->last_task_numa_placement) {
				2018	delta = runtime - p->last_sum_exec_runtime;
				2019	*period = now - p->last_task_numa_placement;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2020
				2021	/* Avoid time going backwards, prevent potential divide error: */
				2022	if (unlikely((s64)*period < 0))
				2023	*period = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2024	} else {
				2025	delta = p->se.avg.load_sum;
				2026	*period = LOAD_AVG_MAX;
				2027	}
				2028
				2029	p->last_sum_exec_runtime = runtime;
				2030	p->last_task_numa_placement = now;
				2031
				2032	return delta;
				2033	}
				2034
				2035	/*
				2036	* Determine the preferred nid for a task in a numa_group. This needs to
				2037	* be done in a way that produces consistent results with group_weight,
				2038	* otherwise workloads might not converge.
				2039	*/
				2040	static int preferred_group_nid(struct task_struct *p, int nid)
				2041	{
				2042	nodemask_t nodes;
				2043	int dist;
				2044
				2045	/* Direct connections between all NUMA nodes. */
				2046	if (sched_numa_topology_type == NUMA_DIRECT)
				2047	return nid;
				2048
				2049	/*
				2050	* On a system with glueless mesh NUMA topology, group_weight
				2051	* scores nodes according to the number of NUMA hinting faults on
				2052	* both the node itself, and on nearby nodes.
				2053	*/
				2054	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
				2055	unsigned long score, max_score = 0;
				2056	int node, max_node = nid;
				2057
				2058	dist = sched_max_numa_distance;
				2059
				2060	for_each_online_node(node) {
				2061	score = group_weight(p, node, dist);
				2062	if (score > max_score) {
				2063	max_score = score;
				2064	max_node = node;
				2065	}
				2066	}
				2067	return max_node;
				2068	}
				2069
				2070	/*
				2071	* Finding the preferred nid in a system with NUMA backplane
				2072	* interconnect topology is more involved. The goal is to locate
				2073	* tasks from numa_groups near each other in the system, and
				2074	* untangle workloads from different sides of the system. This requires
				2075	* searching down the hierarchy of node groups, recursively searching
				2076	* inside the highest scoring group of nodes. The nodemask tricks
				2077	* keep the complexity of the search down.
				2078	*/
				2079	nodes = node_online_map;
				2080	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
				2081	unsigned long max_faults = 0;
				2082	nodemask_t max_group = NODE_MASK_NONE;
				2083	int a, b;
				2084
				2085	/* Are there nodes at this distance from each other? */
				2086	if (!find_numa_distance(dist))
				2087	continue;
				2088
				2089	for_each_node_mask(a, nodes) {
				2090	unsigned long faults = 0;
				2091	nodemask_t this_group;
				2092	nodes_clear(this_group);
				2093
				2094	/* Sum group's NUMA faults; includes a==b case. */
				2095	for_each_node_mask(b, nodes) {
				2096	if (node_distance(a, b) < dist) {
				2097	faults += group_faults(p, b);
				2098	node_set(b, this_group);
				2099	node_clear(b, nodes);
				2100	}
				2101	}
				2102
				2103	/* Remember the top group. */
				2104	if (faults > max_faults) {
				2105	max_faults = faults;
				2106	max_group = this_group;
				2107	/*
				2108	* subtle: at the smallest distance there is
				2109	* just one node left in each "group", the
				2110	* winner is the preferred nid.
				2111	*/
				2112	nid = a;
				2113	}
				2114	}
				2115	/* Next round, evaluate the nodes within max_group. */
				2116	if (!max_faults)
				2117	break;
				2118	nodes = max_group;
				2119	}
				2120	return nid;
				2121	}
				2122
				2123	static void task_numa_placement(struct task_struct *p)
				2124	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2125	int seq, nid, max_nid = NUMA_NO_NODE;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2126	unsigned long max_faults = 0;
				2127	unsigned long fault_types[2] = { 0, 0 };
				2128	unsigned long total_faults;
				2129	u64 runtime, period;
				2130	spinlock_t *group_lock = NULL;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2131	struct numa_group *ng;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2132
				2133	/*
				2134	* The p->mm->numa_scan_seq field gets updated without
				2135	* exclusive access. Use READ_ONCE() here to ensure
				2136	* that the field is read in a single access:
				2137	*/
				2138	seq = READ_ONCE(p->mm->numa_scan_seq);
				2139	if (p->numa_scan_seq == seq)
				2140	return;
				2141	p->numa_scan_seq = seq;
				2142	p->numa_scan_period_max = task_scan_max(p);
				2143
				2144	total_faults = p->numa_faults_locality[0] +
				2145	p->numa_faults_locality[1];
				2146	runtime = numa_get_avg_runtime(p, &period);
				2147
				2148	/* If the task is part of a group prevent parallel updates to group stats */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2149	ng = deref_curr_numa_group(p);
				2150	if (ng) {
				2151	group_lock = &ng->lock;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2152	spin_lock_irq(group_lock);
				2153	}
				2154
				2155	/* Find the node with the highest number of faults */
				2156	for_each_online_node(nid) {
				2157	/* Keep track of the offsets in numa_faults array */
				2158	int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
				2159	unsigned long faults = 0, group_faults = 0;
				2160	int priv;
				2161
				2162	for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
				2163	long diff, f_diff, f_weight;
				2164
				2165	mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
				2166	membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
				2167	cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
				2168	cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
				2169
				2170	/* Decay existing window, copy faults since last scan */
				2171	diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
				2172	fault_types[priv] += p->numa_faults[membuf_idx];
				2173	p->numa_faults[membuf_idx] = 0;
				2174
				2175	/*
				2176	* Normalize the faults_from, so all tasks in a group
				2177	* count according to CPU use, instead of by the raw
				2178	* number of faults. Tasks with little runtime have
				2179	* little over-all impact on throughput, and thus their
				2180	* faults are less important.
				2181	*/
				2182	f_weight = div64_u64(runtime << 16, period + 1);
				2183	f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
				2184	(total_faults + 1);
				2185	f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
				2186	p->numa_faults[cpubuf_idx] = 0;
				2187
				2188	p->numa_faults[mem_idx] += diff;
				2189	p->numa_faults[cpu_idx] += f_diff;
				2190	faults += p->numa_faults[mem_idx];
				2191	p->total_numa_faults += diff;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2192	if (ng) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2193	/*
				2194	* safe because we can only change our own group
				2195	*
				2196	* mem_idx represents the offset for a given
				2197	* nid and priv in a specific region because it
				2198	* is at the beginning of the numa_faults array.
				2199	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2200	ng->faults[mem_idx] += diff;
				2201	ng->faults_cpu[mem_idx] += f_diff;
				2202	ng->total_faults += diff;
				2203	group_faults += ng->faults[mem_idx];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2204	}
				2205	}
				2206
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2207	if (!ng) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2208	if (faults > max_faults) {
				2209	max_faults = faults;
				2210	max_nid = nid;
				2211	}
				2212	} else if (group_faults > max_faults) {
				2213	max_faults = group_faults;
				2214	max_nid = nid;
				2215	}
				2216	}
				2217
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2218	if (ng) {
				2219	numa_group_count_active_nodes(ng);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2220	spin_unlock_irq(group_lock);
				2221	max_nid = preferred_group_nid(p, max_nid);
				2222	}
				2223
				2224	if (max_faults) {
				2225	/* Set the new preferred node */
				2226	if (max_nid != p->numa_preferred_nid)
				2227	sched_setnuma(p, max_nid);
				2228	}
				2229
				2230	update_task_scan_period(p, fault_types[0], fault_types[1]);
				2231	}
				2232
				2233	static inline int get_numa_group(struct numa_group *grp)
				2234	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2235	return refcount_inc_not_zero(&grp->refcount);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2236	}
				2237
				2238	static inline void put_numa_group(struct numa_group *grp)
				2239	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2240	if (refcount_dec_and_test(&grp->refcount))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2241	kfree_rcu(grp, rcu);
				2242	}
				2243
				2244	static void task_numa_group(struct task_struct *p, int cpupid, int flags,
				2245	int *priv)
				2246	{
				2247	struct numa_group grp, my_grp;
				2248	struct task_struct *tsk;
				2249	bool join = false;
				2250	int cpu = cpupid_to_cpu(cpupid);
				2251	int i;
				2252
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2253	if (unlikely(!deref_curr_numa_group(p))) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2254	unsigned int size = sizeof(struct numa_group) +
				2255	4nr_node_idssizeof(unsigned long);
				2256
				2257	grp = kzalloc(size, GFP_KERNEL \| __GFP_NOWARN);
				2258	if (!grp)
				2259	return;
				2260
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2261	refcount_set(&grp->refcount, 1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2262	grp->active_nodes = 1;
				2263	grp->max_faults_cpu = 0;
				2264	spin_lock_init(&grp->lock);
				2265	grp->gid = p->pid;
				2266	/* Second half of the array tracks nids where faults happen */
				2267	grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
				2268	nr_node_ids;
				2269
				2270	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
				2271	grp->faults[i] = p->numa_faults[i];
				2272
				2273	grp->total_faults = p->total_numa_faults;
				2274
				2275	grp->nr_tasks++;
				2276	rcu_assign_pointer(p->numa_group, grp);
				2277	}
				2278
				2279	rcu_read_lock();
				2280	tsk = READ_ONCE(cpu_rq(cpu)->curr);
				2281
				2282	if (!cpupid_match_pid(tsk, cpupid))
				2283	goto no_join;
				2284
				2285	grp = rcu_dereference(tsk->numa_group);
				2286	if (!grp)
				2287	goto no_join;
				2288
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2289	my_grp = deref_curr_numa_group(p);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2290	if (grp == my_grp)
				2291	goto no_join;
				2292
				2293	/*
				2294	* Only join the other group if its bigger; if we're the bigger group,
				2295	* the other task will join us.
				2296	*/
				2297	if (my_grp->nr_tasks > grp->nr_tasks)
				2298	goto no_join;
				2299
				2300	/*
				2301	* Tie-break on the grp address.
				2302	*/
				2303	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
				2304	goto no_join;
				2305
				2306	/* Always join threads in the same process. */
				2307	if (tsk->mm == current->mm)
				2308	join = true;
				2309
				2310	/* Simple filter to avoid false positives due to PID collisions */
				2311	if (flags & TNF_SHARED)
				2312	join = true;
				2313
				2314	/* Update priv based on whether false sharing was detected */
				2315	*priv = !join;
				2316
				2317	if (join && !get_numa_group(grp))
				2318	goto no_join;
				2319
				2320	rcu_read_unlock();
				2321
				2322	if (!join)
				2323	return;
				2324
				2325	BUG_ON(irqs_disabled());
				2326	double_lock_irq(&my_grp->lock, &grp->lock);
				2327
				2328	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
				2329	my_grp->faults[i] -= p->numa_faults[i];
				2330	grp->faults[i] += p->numa_faults[i];
				2331	}
				2332	my_grp->total_faults -= p->total_numa_faults;
				2333	grp->total_faults += p->total_numa_faults;
				2334
				2335	my_grp->nr_tasks--;
				2336	grp->nr_tasks++;
				2337
				2338	spin_unlock(&my_grp->lock);
				2339	spin_unlock_irq(&grp->lock);
				2340
				2341	rcu_assign_pointer(p->numa_group, grp);
				2342
				2343	put_numa_group(my_grp);
				2344	return;
				2345
				2346	no_join:
				2347	rcu_read_unlock();
				2348	return;
				2349	}
				2350
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2351	/*
				2352	* Get rid of NUMA staticstics associated with a task (either current or dead).
				2353	* If @final is set, the task is dead and has reached refcount zero, so we can
				2354	* safely free all relevant data structures. Otherwise, there might be
				2355	* concurrent reads from places like load balancing and procfs, and we should
				2356	* reset the data back to default state without freeing ->numa_faults.
				2357	*/
				2358	void task_numa_free(struct task_struct *p, bool final)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2359	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2360	/* safe: p either is current or is being freed by current */
				2361	struct numa_group *grp = rcu_dereference_raw(p->numa_group);
				2362	unsigned long *numa_faults = p->numa_faults;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2363	unsigned long flags;
				2364	int i;
				2365
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2366	if (!numa_faults)
				2367	return;
				2368
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2369	if (grp) {
				2370	spin_lock_irqsave(&grp->lock, flags);
				2371	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
				2372	grp->faults[i] -= p->numa_faults[i];
				2373	grp->total_faults -= p->total_numa_faults;
				2374
				2375	grp->nr_tasks--;
				2376	spin_unlock_irqrestore(&grp->lock, flags);
				2377	RCU_INIT_POINTER(p->numa_group, NULL);
				2378	put_numa_group(grp);
				2379	}
				2380
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2381	if (final) {
				2382	p->numa_faults = NULL;
				2383	kfree(numa_faults);
				2384	} else {
				2385	p->total_numa_faults = 0;
				2386	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
				2387	numa_faults[i] = 0;
				2388	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2389	}
				2390
				2391	/*
				2392	* Got a PROT_NONE fault for a page on @node.
				2393	*/
				2394	void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
				2395	{
				2396	struct task_struct *p = current;
				2397	bool migrated = flags & TNF_MIGRATED;
				2398	int cpu_node = task_node(current);
				2399	int local = !!(flags & TNF_FAULT_LOCAL);
				2400	struct numa_group *ng;
				2401	int priv;
				2402
				2403	if (!static_branch_likely(&sched_numa_balancing))
				2404	return;
				2405
				2406	/* for example, ksmd faulting in a user's mm */
				2407	if (!p->mm)
				2408	return;
				2409
				2410	/* Allocate buffer to track faults on a per-node basis */
				2411	if (unlikely(!p->numa_faults)) {
				2412	int size = sizeof(p->numa_faults)
				2413	NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
				2414
				2415	p->numa_faults = kzalloc(size, GFP_KERNEL\|__GFP_NOWARN);
				2416	if (!p->numa_faults)
				2417	return;
				2418
				2419	p->total_numa_faults = 0;
				2420	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
				2421	}
				2422
				2423	/*
				2424	* First accesses are treated as private, otherwise consider accesses
				2425	* to be private if the accessing pid has not changed
				2426	*/
				2427	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
				2428	priv = 1;
				2429	} else {
				2430	priv = cpupid_match_pid(p, last_cpupid);
				2431	if (!priv && !(flags & TNF_NO_GROUP))
				2432	task_numa_group(p, last_cpupid, flags, &priv);
				2433	}
				2434
				2435	/*
				2436	* If a workload spans multiple NUMA nodes, a shared fault that
				2437	* occurs wholly within the set of nodes that the workload is
				2438	* actively using should be counted as local. This allows the
				2439	* scan rate to slow down when a workload has settled down.
				2440	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2441	ng = deref_curr_numa_group(p);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2442	if (!priv && !local && ng && ng->active_nodes > 1 &&
				2443	numa_is_active_node(cpu_node, ng) &&
				2444	numa_is_active_node(mem_node, ng))
				2445	local = 1;
				2446
				2447	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2448	* Retry to migrate task to preferred node periodically, in case it
				2449	* previously failed, or the scheduler moved us.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2450	*/
				2451	if (time_after(jiffies, p->numa_migrate_retry)) {
				2452	task_numa_placement(p);
				2453	numa_migrate_preferred(p);
				2454	}
				2455
				2456	if (migrated)
				2457	p->numa_pages_migrated += pages;
				2458	if (flags & TNF_MIGRATE_FAIL)
				2459	p->numa_faults_locality[2] += pages;
				2460
				2461	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
				2462	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
				2463	p->numa_faults_locality[local] += pages;
				2464	}
				2465
				2466	static void reset_ptenuma_scan(struct task_struct *p)
				2467	{
				2468	/*
				2469	* We only did a read acquisition of the mmap sem, so
				2470	* p->mm->numa_scan_seq is written to without exclusive access
				2471	* and the update is not guaranteed to be atomic. That's not
				2472	* much of an issue though, since this is just used for
				2473	* statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
				2474	* expensive, to avoid any form of compiler optimizations:
				2475	*/
				2476	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
				2477	p->mm->numa_scan_offset = 0;
				2478	}
				2479
				2480	/*
				2481	* The expensive part of numa migration is done from task_work context.
				2482	* Triggered from task_tick_numa().
				2483	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2484	static void task_numa_work(struct callback_head *work)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2485	{
				2486	unsigned long migrate, next_scan, now = jiffies;
				2487	struct task_struct *p = current;
				2488	struct mm_struct *mm = p->mm;
				2489	u64 runtime = p->se.sum_exec_runtime;
				2490	struct vm_area_struct *vma;
				2491	unsigned long start, end;
				2492	unsigned long nr_pte_updates = 0;
				2493	long pages, virtpages;
				2494
				2495	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
				2496
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2497	work->next = work;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2498	/*
				2499	* Who cares about NUMA placement when they're dying.
				2500	*
				2501	* NOTE: make sure not to dereference p->mm before this check,
				2502	* exit_task_work() happens _after_ exit_mm() so we could be called
				2503	* without p->mm even though we still had it when we enqueued this
				2504	* work.
				2505	*/
				2506	if (p->flags & PF_EXITING)
				2507	return;
				2508
				2509	if (!mm->numa_next_scan) {
				2510	mm->numa_next_scan = now +
				2511	msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
				2512	}
				2513
				2514	/*
				2515	* Enforce maximal scan/migration frequency..
				2516	*/
				2517	migrate = mm->numa_next_scan;
				2518	if (time_before(now, migrate))
				2519	return;
				2520
				2521	if (p->numa_scan_period == 0) {
				2522	p->numa_scan_period_max = task_scan_max(p);
				2523	p->numa_scan_period = task_scan_start(p);
				2524	}
				2525
				2526	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
				2527	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
				2528	return;
				2529
				2530	/*
				2531	* Delay this task enough that another task of this mm will likely win
				2532	* the next time around.
				2533	*/
				2534	p->node_stamp += 2 * TICK_NSEC;
				2535
				2536	start = mm->numa_scan_offset;
				2537	pages = sysctl_numa_balancing_scan_size;
				2538	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
				2539	virtpages = pages * 8; /* Scan up to this much virtual space */
				2540	if (!pages)
				2541	return;
				2542
				2543
				2544	if (!down_read_trylock(&mm->mmap_sem))
				2545	return;
				2546	vma = find_vma(mm, start);
				2547	if (!vma) {
				2548	reset_ptenuma_scan(p);
				2549	start = 0;
				2550	vma = mm->mmap;
				2551	}
				2552	for (; vma; vma = vma->vm_next) {
				2553	if (!vma_migratable(vma) \|\| !vma_policy_mof(vma) \|\|
				2554	is_vm_hugetlb_page(vma) \|\| (vma->vm_flags & VM_MIXEDMAP)) {
				2555	continue;
				2556	}
				2557
				2558	/*
				2559	* Shared library pages mapped by multiple processes are not
				2560	* migrated as it is expected they are cache replicated. Avoid
				2561	* hinting faults in read-only file-backed mappings or the vdso
				2562	* as migrating the pages will be of marginal benefit.
				2563	*/
				2564	if (!vma->vm_mm \|\|
				2565	(vma->vm_file && (vma->vm_flags & (VM_READ\|VM_WRITE)) == (VM_READ)))
				2566	continue;
				2567
				2568	/*
				2569	* Skip inaccessible VMAs to avoid any confusion between
				2570	* PROT_NONE and NUMA hinting ptes
				2571	*/
				2572	if (!(vma->vm_flags & (VM_READ \| VM_EXEC \| VM_WRITE)))
				2573	continue;
				2574
				2575	do {
				2576	start = max(start, vma->vm_start);
				2577	end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
				2578	end = min(end, vma->vm_end);
				2579	nr_pte_updates = change_prot_numa(vma, start, end);
				2580
				2581	/*
				2582	* Try to scan sysctl_numa_balancing_size worth of
				2583	* hpages that have at least one present PTE that
				2584	* is not already pte-numa. If the VMA contains
				2585	* areas that are unused or already full of prot_numa
				2586	* PTEs, scan up to virtpages, to skip through those
				2587	* areas faster.
				2588	*/
				2589	if (nr_pte_updates)
				2590	pages -= (end - start) >> PAGE_SHIFT;
				2591	virtpages -= (end - start) >> PAGE_SHIFT;
				2592
				2593	start = end;
				2594	if (pages <= 0 \|\| virtpages <= 0)
				2595	goto out;
				2596
				2597	cond_resched();
				2598	} while (end != vma->vm_end);
				2599	}
				2600
				2601	out:
				2602	/*
				2603	* It is possible to reach the end of the VMA list but the last few
				2604	* VMAs are not guaranteed to the vma_migratable. If they are not, we
				2605	* would find the !migratable VMA on the next scan but not reset the
				2606	* scanner to the start so check it now.
				2607	*/
				2608	if (vma)
				2609	mm->numa_scan_offset = start;
				2610	else
				2611	reset_ptenuma_scan(p);
				2612	up_read(&mm->mmap_sem);
				2613
				2614	/*
				2615	* Make sure tasks use at least 32x as much time to run other code
				2616	* than they used here, to limit NUMA PTE scanning overhead to 3% max.
				2617	* Usually update_task_scan_period slows down scanning enough; on an
				2618	* overloaded system we need to limit overhead on a per task basis.
				2619	*/
				2620	if (unlikely(p->se.sum_exec_runtime != runtime)) {
				2621	u64 diff = p->se.sum_exec_runtime - runtime;
				2622	p->node_stamp += 32 * diff;
				2623	}
				2624	}
				2625
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2626	void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
				2627	{
				2628	int mm_users = 0;
				2629	struct mm_struct *mm = p->mm;
				2630
				2631	if (mm) {
				2632	mm_users = atomic_read(&mm->mm_users);
				2633	if (mm_users == 1) {
				2634	mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
				2635	mm->numa_scan_seq = 0;
				2636	}
				2637	}
				2638	p->node_stamp = 0;
				2639	p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
				2640	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
				2641	/* Protect against double add, see task_tick_numa and task_numa_work */
				2642	p->numa_work.next = &p->numa_work;
				2643	p->numa_faults = NULL;
				2644	RCU_INIT_POINTER(p->numa_group, NULL);
				2645	p->last_task_numa_placement = 0;
				2646	p->last_sum_exec_runtime = 0;
				2647
				2648	init_task_work(&p->numa_work, task_numa_work);
				2649
				2650	/* New address space, reset the preferred nid */
				2651	if (!(clone_flags & CLONE_VM)) {
				2652	p->numa_preferred_nid = NUMA_NO_NODE;
				2653	return;
				2654	}
				2655
				2656	/*
				2657	* New thread, keep existing numa_preferred_nid which should be copied
				2658	* already by arch_dup_task_struct but stagger when scans start.
				2659	*/
				2660	if (mm) {
				2661	unsigned int delay;
				2662
				2663	delay = min_t(unsigned int, task_scan_max(current),
				2664	current->numa_scan_period * mm_users * NSEC_PER_MSEC);
				2665	delay += 2 * TICK_NSEC;
				2666	p->node_stamp = delay;
				2667	}
				2668	}
				2669
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2670	/*
				2671	* Drive the periodic memory faults..
				2672	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2673	static void task_tick_numa(struct rq rq, struct task_struct curr)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2674	{
				2675	struct callback_head *work = &curr->numa_work;
				2676	u64 period, now;
				2677
				2678	/*
				2679	* We don't care about NUMA placement if we don't have memory.
				2680	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2681	if ((curr->flags & (PF_EXITING \| PF_KTHREAD)) \|\| work->next != work)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2682	return;
				2683
				2684	/*
				2685	* Using runtime rather than walltime has the dual advantage that
				2686	* we (mostly) drive the selection from busy threads and that the
				2687	* task needs to have done some actual work before we bother with
				2688	* NUMA placement.
				2689	*/
				2690	now = curr->se.sum_exec_runtime;
				2691	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
				2692
				2693	if (now > curr->node_stamp + period) {
				2694	if (!curr->node_stamp)
				2695	curr->numa_scan_period = task_scan_start(curr);
				2696	curr->node_stamp += period;
				2697
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2698	if (!time_before(jiffies, curr->mm->numa_next_scan))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2699	task_work_add(curr, work, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2700	}
				2701	}
				2702
				2703	static void update_scan_period(struct task_struct *p, int new_cpu)
				2704	{
				2705	int src_nid = cpu_to_node(task_cpu(p));
				2706	int dst_nid = cpu_to_node(new_cpu);
				2707
				2708	if (!static_branch_likely(&sched_numa_balancing))
				2709	return;
				2710
				2711	if (!p->mm \|\| !p->numa_faults \|\| (p->flags & PF_EXITING))
				2712	return;
				2713
				2714	if (src_nid == dst_nid)
				2715	return;
				2716
				2717	/*
				2718	* Allow resets if faults have been trapped before one scan
				2719	* has completed. This is most likely due to a new task that
				2720	* is pulled cross-node due to wakeups or load balancing.
				2721	*/
				2722	if (p->numa_scan_seq) {
				2723	/*
				2724	* Avoid scan adjustments if moving to the preferred
				2725	* node or if the task was not previously running on
				2726	* the preferred node.
				2727	*/
				2728	if (dst_nid == p->numa_preferred_nid \|\|
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2729	(p->numa_preferred_nid != NUMA_NO_NODE &&
				2730	src_nid != p->numa_preferred_nid))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2731	return;
				2732	}
				2733
				2734	p->numa_scan_period = task_scan_start(p);
				2735	}
				2736
				2737	#else
				/* No-op variant of task_tick_numa() for the #else branch of CONFIG_NUMA_BALANCING. */
				static void task_tick_numa(struct rq *rq, struct task_struct *curr)
				{
				}
				2741
				/* No-op variant of account_numa_enqueue() for the #else branch of CONFIG_NUMA_BALANCING. */
				static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
				{
				}
				2745
				/* No-op variant of account_numa_dequeue() for the #else branch of CONFIG_NUMA_BALANCING. */
				static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
				{
				}
				2749
				/* No-op variant of update_scan_period() for the #else branch of CONFIG_NUMA_BALANCING. */
				static inline void update_scan_period(struct task_struct *p, int new_cpu) { }
				2753
				2754	#endif /* CONFIG_NUMA_BALANCING */
				2755
				2756	static void
				2757	account_entity_enqueue(struct cfs_rq cfs_rq, struct sched_entity se)
				2758	{
				2759	update_load_add(&cfs_rq->load, se->load.weight);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2760	#ifdef CONFIG_SMP
				2761	if (entity_is_task(se)) {
				2762	struct rq *rq = rq_of(cfs_rq);
				2763
				2764	account_numa_enqueue(rq, task_of(se));
				2765	list_add(&se->group_node, &rq->cfs_tasks);
				2766	}
				2767	#endif
				2768	cfs_rq->nr_running++;
				2769	}
				2770
				2771	static void
				2772	account_entity_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
				2773	{
				2774	update_load_sub(&cfs_rq->load, se->load.weight);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2775	#ifdef CONFIG_SMP
				2776	if (entity_is_task(se)) {
				2777	account_numa_dequeue(rq_of(cfs_rq), task_of(se));
				2778	list_del_init(&se->group_node);
				2779	}
				2780	#endif
				2781	cfs_rq->nr_running--;
				2782	}
				2783
				/*
				 * Signed add and clamp on underflow.
				 *
				 * @_ptr: pointer to the (unsigned) value to update.
				 * @_val: signed amount to add.
				 *
				 * Explicitly do a load-store to ensure the intermediate value never hits
				 * memory. This allows lockless observations without ever seeing the negative
				 * values.
				 *
				 * res and var must have the pointed-to type (typeof(*ptr)), not the
				 * pointer type: a negative val that makes the sum wrap past zero is
				 * detected by res > var and clamped to 0.
				 */
				#define add_positive(_ptr, _val) do {				\
					typeof(_ptr) ptr = (_ptr);				\
					typeof(_val) val = (_val);				\
					typeof(*ptr) res, var = READ_ONCE(*ptr);		\
										\
					res = var + val;					\
										\
					if (val < 0 && res > var)				\
						res = 0;					\
										\
					WRITE_ONCE(*ptr, res);					\
				} while (0)
				2803
				/*
				 * Unsigned subtract and clamp on underflow.
				 *
				 * @_ptr: pointer to the unsigned value to update.
				 * @_val: amount to subtract (converted to the pointed-to type).
				 *
				 * Explicitly do a load-store to ensure the intermediate value never hits
				 * memory. This allows lockless observations without ever seeing the negative
				 * values.
				 *
				 * res and var must have the pointed-to type (typeof(*ptr)); a wrapped
				 * difference shows up as res > var and is clamped to 0.
				 */
				#define sub_positive(_ptr, _val) do {				\
					typeof(_ptr) ptr = (_ptr);				\
					typeof(*ptr) val = (_val);				\
					typeof(*ptr) res, var = READ_ONCE(*ptr);		\
					res = var - val;					\
					if (res > var)						\
						res = 0;					\
					WRITE_ONCE(*ptr, res);					\
				} while (0)
				2820
/*
 * Remove and clamp on negative, from a local variable.
 *
 * @_ptr: pointer to the local value to update.
 * @_val: amount to remove.
 *
 * A variant of sub_positive(), which does not use explicit load-store
 * and is thus optimized for local variable updates.
 *
 * The subtraction applies to the pointed-to value (*ptr) and never removes
 * more than *ptr currently holds, so the result bottoms out at 0.
 */
#define lsub_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	*ptr -= min_t(typeof(*ptr), *ptr, _val);		\
} while (0)
				2831
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2832	#ifdef CONFIG_SMP
				2833	static inline void
				2834	enqueue_runnable_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				2835	{
				2836	cfs_rq->runnable_weight += se->runnable_weight;
				2837
				2838	cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
				2839	cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
				2840	}
				2841
				2842	static inline void
				2843	dequeue_runnable_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				2844	{
				2845	cfs_rq->runnable_weight -= se->runnable_weight;
				2846
				2847	sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
				2848	sub_positive(&cfs_rq->avg.runnable_load_sum,
				2849	se_runnable(se) * se->avg.runnable_load_sum);
				2850	}
				2851
				2852	static inline void
				2853	enqueue_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				2854	{
				2855	cfs_rq->avg.load_avg += se->avg.load_avg;
				2856	cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
				2857	}
				2858
				2859	static inline void
				2860	dequeue_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				2861	{
				2862	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
				2863	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
				2864	}
				2865	#else
				/* !CONFIG_SMP: no runnable load tracking, nothing to add. */
				static inline void
				enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
				/* !CONFIG_SMP: no runnable load tracking, nothing to remove. */
				static inline void
				dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
				/* !CONFIG_SMP: no load average tracking, nothing to add. */
				static inline void
				enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
				/* !CONFIG_SMP: no load average tracking, nothing to remove. */
				static inline void
				dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
				2874	#endif
				2875
				2876	static void reweight_entity(struct cfs_rq cfs_rq, struct sched_entity se,
				2877	unsigned long weight, unsigned long runnable)
				2878	{
				2879	if (se->on_rq) {
				2880	/* commit outstanding execution time */
				2881	if (cfs_rq->curr == se)
				2882	update_curr(cfs_rq);
				2883	account_entity_dequeue(cfs_rq, se);
				2884	dequeue_runnable_load_avg(cfs_rq, se);
				2885	}
				2886	dequeue_load_avg(cfs_rq, se);
				2887
				2888	se->runnable_weight = runnable;
				2889	update_load_set(&se->load, weight);
				2890
				2891	#ifdef CONFIG_SMP
				2892	do {
				2893	u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
				2894
				2895	se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
				2896	se->avg.runnable_load_avg =
				2897	div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
				2898	} while (0);
				2899	#endif
				2900
				2901	enqueue_load_avg(cfs_rq, se);
				2902	if (se->on_rq) {
				2903	account_entity_enqueue(cfs_rq, se);
				2904	enqueue_runnable_load_avg(cfs_rq, se);
				2905	}
				2906	}
				2907
				2908	void reweight_task(struct task_struct *p, int prio)
				2909	{
				2910	struct sched_entity *se = &p->se;
				2911	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				2912	struct load_weight *load = &se->load;
				2913	unsigned long weight = scale_load(sched_prio_to_weight[prio]);
				2914
				2915	reweight_entity(cfs_rq, se, weight, weight);
				2916	load->inv_weight = sched_prio_to_wmult[prio];
				2917	}
				2918
				2919	#ifdef CONFIG_FAIR_GROUP_SCHED
				2920	#ifdef CONFIG_SMP
				2921	/*
				2922	* All this does is approximate the hierarchical proportion which includes that
				2923	* global sum we all love to hate.
				2924	*
				2925	* That is, the weight of a group entity, is the proportional share of the
				2926	* group weight based on the group runqueue weights. That is:
				2927	*
				2928	* tg->weight * grq->load.weight
				2929	* ge->load.weight = ----------------------------- (1)
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2930	* \Sum grq->load.weight
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2931	*
				2932	* Now, because computing that sum is prohibitively expensive to compute (been
				2933	* there, done that) we approximate it with this average stuff. The average
				2934	* moves slower and therefore the approximation is cheaper and more stable.
				2935	*
				2936	* So instead of the above, we substitute:
				2937	*
				2938	* grq->load.weight -> grq->avg.load_avg (2)
				2939	*
				2940	* which yields the following:
				2941	*
				2942	* tg->weight * grq->avg.load_avg
				2943	* ge->load.weight = ------------------------------ (3)
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2944	* tg->load_avg
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2945	*
				2946	* Where: tg->load_avg ~= \Sum grq->avg.load_avg
				2947	*
				2948	* That is shares_avg, and it is right (given the approximation (2)).
				2949	*
				2950	* The problem with it is that because the average is slow -- it was designed
				2951	* to be exactly that of course -- this leads to transients in boundary
				2952	* conditions. In specific, the case where the group was idle and we start the
				2953	* one task. It takes time for our CPU's grq->avg.load_avg to build up,
				2954	* yielding bad latency etc..
				2955	*
				2956	* Now, in that special case (1) reduces to:
				2957	*
				2958	* tg->weight * grq->load.weight
				2959	* ge->load.weight = ----------------------------- = tg->weight (4)
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2960	* grq->load.weight
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2961	*
				2962	* That is, the sum collapses because all other CPUs are idle; the UP scenario.
				2963	*
				2964	* So what we do is modify our approximation (3) to approach (4) in the (near)
				2965	* UP case, like:
				2966	*
				2967	* ge->load.weight =
				2968	*
				2969	* tg->weight * grq->load.weight
				2970	* --------------------------------------------------- (5)
				2971	* tg->load_avg - grq->avg.load_avg + grq->load.weight
				2972	*
				2973	* But because grq->load.weight can drop to 0, resulting in a divide by zero,
				2974	* we need to use grq->avg.load_avg as its lower bound, which then gives:
				2975	*
				2976	*
				2977	* tg->weight * grq->load.weight
				2978	* ge->load.weight = ----------------------------- (6)
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2979	* tg_load_avg'
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2980	*
				2981	* Where:
				2982	*
				2983	* tg_load_avg' = tg->load_avg - grq->avg.load_avg +
				2984	* max(grq->load.weight, grq->avg.load_avg)
				2985	*
				2986	* And that is shares_weight and is icky. In the (near) UP case it approaches
				2987	* (4) while in the normal case it approaches (3). It consistently
				2988	* overestimates the ge->load.weight and therefore:
				2989	*
				2990	* \Sum ge->load.weight >= tg->weight
				2991	*
				2992	* hence icky!
				2993	*/
				2994	static long calc_group_shares(struct cfs_rq *cfs_rq)
				2995	{
				2996	long tg_weight, tg_shares, load, shares;
				2997	struct task_group *tg = cfs_rq->tg;
				2998
				2999	tg_shares = READ_ONCE(tg->shares);
				3000
				3001	load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
				3002
				3003	tg_weight = atomic_long_read(&tg->load_avg);
				3004
				3005	/* Ensure tg_weight >= load */
				3006	tg_weight -= cfs_rq->tg_load_avg_contrib;
				3007	tg_weight += load;
				3008
				3009	shares = (tg_shares * load);
				3010	if (tg_weight)
				3011	shares /= tg_weight;
				3012
				3013	/*
				3014	* MIN_SHARES has to be unscaled here to support per-CPU partitioning
				3015	* of a group with small tg->shares value. It is a floor value which is
				3016	* assigned as a minimum load.weight to the sched_entity representing
				3017	* the group on a CPU.
				3018	*
				3019	* E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
				3020	* on an 8-core system with 8 tasks each runnable on one CPU shares has
				3021	* to be 1510241/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
				3022	* case no task is runnable on a CPU MIN_SHARES=2 should be returned
				3023	* instead of 0.
				3024	*/
				3025	return clamp_t(long, shares, MIN_SHARES, tg_shares);
				3026	}
				3027
				3028	/*
				3029	* This calculates the effective runnable weight for a group entity based on
				3030	* the group entity weight calculated above.
				3031	*
				3032	* Because of the above approximation (2), our group entity weight is
				3033	* an load_avg based ratio (3). This means that it includes blocked load and
				3034	* does not represent the runnable weight.
				3035	*
				3036	* Approximate the group entity's runnable weight per ratio from the group
				3037	* runqueue:
				3038	*
				3039	* grq->avg.runnable_load_avg
				3040	* ge->runnable_weight = ge->load.weight * -------------------------- (7)
				3041	* grq->avg.load_avg
				3042	*
				3043	* However, analogous to above, since the avg numbers are slow, this leads to
				3044	* transients in the from-idle case. Instead we use:
				3045	*
				3046	* ge->runnable_weight = ge->load.weight *
				3047	*
				3048	* max(grq->avg.runnable_load_avg, grq->runnable_weight)
				3049	* ----------------------------------------------------- (8)
				3050	* max(grq->avg.load_avg, grq->load.weight)
				3051	*
				3052	* Where these max() serve both to use the 'instant' values to fix the slow
				3053	* from-idle and avoid the /0 on to-idle, similar to (6).
				3054	*/
				3055	static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
				3056	{
				3057	long runnable, load_avg;
				3058
				3059	load_avg = max(cfs_rq->avg.load_avg,
				3060	scale_load_down(cfs_rq->load.weight));
				3061
				3062	runnable = max(cfs_rq->avg.runnable_load_avg,
				3063	scale_load_down(cfs_rq->runnable_weight));
				3064
				3065	runnable *= shares;
				3066	if (load_avg)
				3067	runnable /= load_avg;
				3068
				3069	return clamp_t(long, runnable, MIN_SHARES, shares);
				3070	}
				3071	#endif /* CONFIG_SMP */
				3072
				3073	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
				3074
				3075	/*
				3076	* Recomputes the group entity based on the current state of its group
				3077	* runqueue.
				3078	*/
				3079	static void update_cfs_group(struct sched_entity *se)
				3080	{
				3081	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
				3082	long shares, runnable;
				3083
				3084	if (!gcfs_rq)
				3085	return;
				3086
				3087	if (throttled_hierarchy(gcfs_rq))
				3088	return;
				3089
				3090	#ifndef CONFIG_SMP
				3091	runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
				3092
				3093	if (likely(se->load.weight == shares))
				3094	return;
				3095	#else
				3096	shares = calc_group_shares(gcfs_rq);
				3097	runnable = calc_group_runnable(gcfs_rq, shares);
				3098	#endif
				3099
				3100	reweight_entity(cfs_rq_of(se), se, shares, runnable);
				3101	}
				3102
				3103	#else /* CONFIG_FAIR_GROUP_SCHED */
				/* !CONFIG_FAIR_GROUP_SCHED: there are no group entities to reweight. */
				static inline void update_cfs_group(struct sched_entity *se) { }
				3107	#endif /* CONFIG_FAIR_GROUP_SCHED */
				3108
				3109	static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
				3110	{
				3111	struct rq *rq = rq_of(cfs_rq);
				3112
				3113	if (&rq->cfs == cfs_rq \|\| (flags & SCHED_CPUFREQ_MIGRATION)) {
				3114	/*
				3115	* There are a few boundary cases this might miss but it should
				3116	* get called often enough that that should (hopefully) not be
				3117	* a real problem.
				3118	*
				3119	* It will not get called when we go idle, because the idle
				3120	* thread is a different class (!fair), nor will the utilization
				3121	* number include things like RT tasks.
				3122	*
				3123	* As is, the util number is not freq-invariant (we'd have to
				3124	* implement arch_scale_freq_capacity() for that).
				3125	*
				3126	* See cpu_util().
				3127	*/
				3128	cpufreq_update_util(rq, flags);
				3129	}
				3130	}
				3131
				3132	#ifdef CONFIG_SMP
				3133	#ifdef CONFIG_FAIR_GROUP_SCHED
				3134	/**
				3135	* update_tg_load_avg - update the tg's load avg
				3136	* @cfs_rq: the cfs_rq whose avg changed
				3137	* @force: update regardless of how small the difference
				3138	*
				3139	* This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
				3140	* However, because tg->load_avg is a global value there are performance
				3141	* considerations.
				3142	*
				3143	* In order to avoid having to look at the other cfs_rq's, we use a
				3144	* differential update where we store the last value we propagated. This in
				3145	* turn allows skipping updates if the differential is 'small'.
				3146	*
				3147	* Updating tg's load_avg is necessary before update_cfs_share().
				3148	*/
				3149	static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
				3150	{
				3151	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
				3152
				3153	/*
				3154	* No need to update load_avg for root_task_group as it is not used.
				3155	*/
				3156	if (cfs_rq->tg == &root_task_group)
				3157	return;
				3158
				3159	if (force \|\| abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
				3160	atomic_long_add(delta, &cfs_rq->tg->load_avg);
				3161	cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
				3162	}
				3163	}
				3164
				3165	/*
				3166	* Called within set_task_rq() right before setting a task's CPU. The
				3167	* caller only guarantees p->pi_lock is held; no other assumptions,
				3168	* including the state of rq->lock, should be made.
				3169	*/
				3170	void set_task_rq_fair(struct sched_entity *se,
				3171	struct cfs_rq prev, struct cfs_rq next)
				3172	{
				3173	u64 p_last_update_time;
				3174	u64 n_last_update_time;
				3175
				3176	if (!sched_feat(ATTACH_AGE_LOAD))
				3177	return;
				3178
				3179	/*
				3180	* We are supposed to update the task to "current" time, then its up to
				3181	* date and ready to go to new CPU/cfs_rq. But we have difficulty in
				3182	* getting what current time is, so simply throw away the out-of-date
				3183	* time. This will result in the wakee task is less decayed, but giving
				3184	* the wakee more load sounds not bad.
				3185	*/
				3186	if (!(se->avg.last_update_time && prev))
				3187	return;
				3188
				3189	#ifndef CONFIG_64BIT
				3190	{
				3191	u64 p_last_update_time_copy;
				3192	u64 n_last_update_time_copy;
				3193
				3194	do {
				3195	p_last_update_time_copy = prev->load_last_update_time_copy;
				3196	n_last_update_time_copy = next->load_last_update_time_copy;
				3197
				3198	smp_rmb();
				3199
				3200	p_last_update_time = prev->avg.last_update_time;
				3201	n_last_update_time = next->avg.last_update_time;
				3202
				3203	} while (p_last_update_time != p_last_update_time_copy \|\|
				3204	n_last_update_time != n_last_update_time_copy);
				3205	}
				3206	#else
				3207	p_last_update_time = prev->avg.last_update_time;
				3208	n_last_update_time = next->avg.last_update_time;
				3209	#endif
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3210	__update_load_avg_blocked_se(p_last_update_time, se);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3211	se->avg.last_update_time = n_last_update_time;
				3212	}
				3213
				3214
				3215	/*
				3216	* When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
				3217	* propagate its contribution. The key to this propagation is the invariant
				3218	* that for each group:
				3219	*
				3220	* ge->avg == grq->avg (1)
				3221	*
				3222	* _IFF_ we look at the pure running and runnable sums. Because they
				3223	* represent the very same entity, just at different points in the hierarchy.
				3224	*
				3225	* Per the above update_tg_cfs_util() is trivial and simply copies the running
				3226	* sum over (but still wrong, because the group entity and group rq do not have
				3227	* their PELT windows aligned).
				3228	*
				3229	* However, update_tg_cfs_runnable() is more complex. So we have:
				3230	*
				3231	* ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
				3232	*
				3233	* And since, like util, the runnable part should be directly transferable,
				3234	* the following would _appear_ to be the straight forward approach:
				3235	*
				3236	* grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
				3237	*
				3238	* And per (1) we have:
				3239	*
				3240	* ge->avg.runnable_avg == grq->avg.runnable_avg
				3241	*
				3242	* Which gives:
				3243	*
				3244	* ge->load.weight * grq->avg.load_avg
				3245	* ge->avg.load_avg = ----------------------------------- (4)
				3246	* grq->load.weight
				3247	*
				3248	* Except that is wrong!
				3249	*
				3250	* Because while for entities historical weight is not important and we
				3251	* really only care about our future and therefore can consider a pure
				3252	* runnable sum, runqueues can NOT do this.
				3253	*
				3254	* We specifically want runqueues to have a load_avg that includes
				3255	* historical weights. Those represent the blocked load, the load we expect
				3256	* to (shortly) return to us. This only works by keeping the weights as
				3257	* integral part of the sum. We therefore cannot decompose as per (3).
				3258	*
				3259	* Another reason this doesn't work is that runnable isn't a 0-sum entity.
				3260	* Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
				3261	* rq itself is runnable anywhere between 2/3 and 1 depending on how the
				3262	* runnable section of these tasks overlap (or not). If they were to perfectly
				3263	* align the rq as a whole would be runnable 2/3 of the time. If however we
				3264	* always have at least 1 runnable task, the rq as a whole is always runnable.
				3265	*
				3266	* So we'll have to approximate.. :/
				3267	*
				3268	* Given the constraint:
				3269	*
				3270	* ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
				3271	*
				3272	* We can construct a rule that adds runnable to a rq by assuming minimal
				3273	* overlap.
				3274	*
				3275	* On removal, we'll assume each task is equally runnable; which yields:
				3276	*
				3277	* grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
				3278	*
				3279	* XXX: only do this for the part of runnable > running ?
				3280	*
				3281	*/
				3282
				3283	static inline void
				3284	update_tg_cfs_util(struct cfs_rq cfs_rq, struct sched_entity se, struct cfs_rq *gcfs_rq)
				3285	{
				3286	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
				3287
				3288	/* Nothing to update */
				3289	if (!delta)
				3290	return;
				3291
				3292	/*
				3293	* The relation between sum and avg is:
				3294	*
				3295	* LOAD_AVG_MAX - 1024 + sa->period_contrib
				3296	*
				3297	* however, the PELT windows are not aligned between grq and gse.
				3298	*/
				3299
				3300	/* Set new sched_entity's utilization */
				3301	se->avg.util_avg = gcfs_rq->avg.util_avg;
				3302	se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
				3303
				3304	/* Update parent cfs_rq utilization */
				3305	add_positive(&cfs_rq->avg.util_avg, delta);
				3306	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
				3307	}
				3308
				3309	static inline void
				3310	update_tg_cfs_runnable(struct cfs_rq cfs_rq, struct sched_entity se, struct cfs_rq *gcfs_rq)
				3311	{
				3312	long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
				3313	unsigned long runnable_load_avg, load_avg;
				3314	u64 runnable_load_sum, load_sum = 0;
				3315	s64 delta_sum;
				3316
				3317	if (!runnable_sum)
				3318	return;
				3319
				3320	gcfs_rq->prop_runnable_sum = 0;
				3321
				3322	if (runnable_sum >= 0) {
				3323	/*
				3324	* Add runnable; clip at LOAD_AVG_MAX. Reflects that until
				3325	* the CPU is saturated running == runnable.
				3326	*/
				3327	runnable_sum += se->avg.load_sum;
				3328	runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
				3329	} else {
				3330	/*
				3331	* Estimate the new unweighted runnable_sum of the gcfs_rq by
				3332	* assuming all tasks are equally runnable.
				3333	*/
				3334	if (scale_load_down(gcfs_rq->load.weight)) {
				3335	load_sum = div_s64(gcfs_rq->avg.load_sum,
				3336	scale_load_down(gcfs_rq->load.weight));
				3337	}
				3338
				3339	/* But make sure to not inflate se's runnable */
				3340	runnable_sum = min(se->avg.load_sum, load_sum);
				3341	}
				3342
				3343	/*
				3344	* runnable_sum can't be lower than running_sum
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3345	* Rescale running sum to be in the same range as runnable sum
				3346	* running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
				3347	* runnable_sum is in [0 : LOAD_AVG_MAX]
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3348	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3349	running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3350	runnable_sum = max(runnable_sum, running_sum);
				3351
				3352	load_sum = (s64)se_weight(se) * runnable_sum;
				3353	load_avg = div_s64(load_sum, LOAD_AVG_MAX);
				3354
				3355	delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
				3356	delta_avg = load_avg - se->avg.load_avg;
				3357
				3358	se->avg.load_sum = runnable_sum;
				3359	se->avg.load_avg = load_avg;
				3360	add_positive(&cfs_rq->avg.load_avg, delta_avg);
				3361	add_positive(&cfs_rq->avg.load_sum, delta_sum);
				3362
				3363	runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
				3364	runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
				3365	delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
				3366	delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
				3367
				3368	se->avg.runnable_load_sum = runnable_sum;
				3369	se->avg.runnable_load_avg = runnable_load_avg;
				3370
				3371	if (se->on_rq) {
				3372	add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
				3373	add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
				3374	}
				3375	}
				3376
				3377	static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
				3378	{
				3379	cfs_rq->propagate = 1;
				3380	cfs_rq->prop_runnable_sum += runnable_sum;
				3381	}
				3382
				3383	/* Update task and its cfs_rq load average */
				3384	static inline int propagate_entity_load_avg(struct sched_entity *se)
				3385	{
				3386	struct cfs_rq cfs_rq, gcfs_rq;
				3387
				3388	if (entity_is_task(se))
				3389	return 0;
				3390
				3391	gcfs_rq = group_cfs_rq(se);
				3392	if (!gcfs_rq->propagate)
				3393	return 0;
				3394
				3395	gcfs_rq->propagate = 0;
				3396
				3397	cfs_rq = cfs_rq_of(se);
				3398
				3399	add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
				3400
				3401	update_tg_cfs_util(cfs_rq, se, gcfs_rq);
				3402	update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
				3403
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3404	trace_pelt_cfs_tp(cfs_rq);
				3405	trace_pelt_se_tp(se);
				3406
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3407	return 1;
				3408	}
				3409
				3410	/*
				3411	* Check if we need to update the load and the utilization of a blocked
				3412	* group_entity:
				3413	*/
				3414	static inline bool skip_blocked_update(struct sched_entity *se)
				3415	{
				3416	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
				3417
				3418	/*
				3419	* If sched_entity still have not zero load or utilization, we have to
				3420	* decay it:
				3421	*/
				3422	if (se->avg.load_avg \|\| se->avg.util_avg)
				3423	return false;
				3424
				3425	/*
				3426	* If there is a pending propagation, we have to update the load and
				3427	* the utilization of the sched_entity:
				3428	*/
				3429	if (gcfs_rq->propagate)
				3430	return false;
				3431
				3432	/*
				3433	* Otherwise, the load and the utilization of the sched_entity is
				3434	* already zero and there is no pending propagation, so it will be a
				3435	* waste of time to try to decay it:
				3436	*/
				3437	return true;
				3438	}
				3439
				3440	#else /* CONFIG_FAIR_GROUP_SCHED */
				3441
				3442	static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
				3443
				3444	static inline int propagate_entity_load_avg(struct sched_entity *se)
				3445	{
				3446	return 0;
				3447	}
				3448
				3449	static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
				3450
				3451	#endif /* CONFIG_FAIR_GROUP_SCHED */
				3452
				3453	/**
				3454	* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3455	* @now: current time, as per cfs_rq_clock_pelt()
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3456	* @cfs_rq: cfs_rq to update
				3457	*
				3458	* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
				3459	* avg. The immediate corollary is that all (fair) tasks must be attached, see
				3460	* post_init_entity_util_avg().
				3461	*
				3462	* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
				3463	*
				3464	* Returns true if the load decayed or we removed load.
				3465	*
				3466	* Since both these conditions indicate a changed cfs_rq->avg.load we should
				3467	* call update_tg_load_avg() when this function returns true.
				3468	*/
				3469	static inline int
				3470	update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
				3471	{
				3472	unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
				3473	struct sched_avg *sa = &cfs_rq->avg;
				3474	int decayed = 0;
				3475
				3476	if (cfs_rq->removed.nr) {
				3477	unsigned long r;
				3478	u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
				3479
				3480	raw_spin_lock(&cfs_rq->removed.lock);
				3481	swap(cfs_rq->removed.util_avg, removed_util);
				3482	swap(cfs_rq->removed.load_avg, removed_load);
				3483	swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
				3484	cfs_rq->removed.nr = 0;
				3485	raw_spin_unlock(&cfs_rq->removed.lock);
				3486
				3487	r = removed_load;
				3488	sub_positive(&sa->load_avg, r);
				3489	sub_positive(&sa->load_sum, r * divider);
				3490
				3491	r = removed_util;
				3492	sub_positive(&sa->util_avg, r);
				3493	sub_positive(&sa->util_sum, r * divider);
				3494
				3495	add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
				3496
				3497	decayed = 1;
				3498	}
				3499
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3500	decayed \|= __update_load_avg_cfs_rq(now, cfs_rq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3501
				3502	#ifndef CONFIG_64BIT
				3503	smp_wmb();
				3504	cfs_rq->load_last_update_time_copy = sa->last_update_time;
				3505	#endif
				3506
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3507	return decayed;
				3508	}
				3509
				3510	/**
				3511	* attach_entity_load_avg - attach this entity to its cfs_rq load avg
				3512	* @cfs_rq: cfs_rq to attach to
				3513	* @se: sched_entity to attach
				3514	* @flags: migration hints
				3515	*
				3516	* Must call update_cfs_rq_load_avg() before this, since we rely on
				3517	* cfs_rq->avg.last_update_time being current.
				3518	*/
				3519	static void attach_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
				3520	{
				3521	u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
				3522
				3523	/*
				3524	* When we attach the @se to the @cfs_rq, we must align the decay
				3525	* window because without that, really weird and wonderful things can
				3526	* happen.
				3527	*
				3528	* XXX illustrate
				3529	*/
				3530	se->avg.last_update_time = cfs_rq->avg.last_update_time;
				3531	se->avg.period_contrib = cfs_rq->avg.period_contrib;
				3532
				3533	/*
				3534	* Hell(o) Nasty stuff.. we need to recompute _sum based on the new
				3535	* period_contrib. This isn't strictly correct, but since we're
				3536	* entirely outside of the PELT hierarchy, nobody cares if we truncate
				3537	* _sum a little.
				3538	*/
				3539	se->avg.util_sum = se->avg.util_avg * divider;
				3540
				3541	se->avg.load_sum = divider;
				3542	if (se_weight(se)) {
				3543	se->avg.load_sum =
				3544	div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
				3545	}
				3546
				3547	se->avg.runnable_load_sum = se->avg.load_sum;
				3548
				3549	enqueue_load_avg(cfs_rq, se);
				3550	cfs_rq->avg.util_avg += se->avg.util_avg;
				3551	cfs_rq->avg.util_sum += se->avg.util_sum;
				3552
				3553	add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
				3554
				3555	cfs_rq_util_change(cfs_rq, flags);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3556
				3557	trace_pelt_cfs_tp(cfs_rq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3558	}
				3559
				3560	/**
				3561	* detach_entity_load_avg - detach this entity from its cfs_rq load avg
				3562	* @cfs_rq: cfs_rq to detach from
				3563	* @se: sched_entity to detach
				3564	*
				3565	* Must call update_cfs_rq_load_avg() before this, since we rely on
				3566	* cfs_rq->avg.last_update_time being current.
				3567	*/
				3568	static void detach_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				3569	{
				3570	dequeue_load_avg(cfs_rq, se);
				3571	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
				3572	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
				3573
				3574	add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
				3575
				3576	cfs_rq_util_change(cfs_rq, 0);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3577
				3578	trace_pelt_cfs_tp(cfs_rq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3579	}
				3580
				3581	/*
				3582	* Optional action to be done while updating the load average
				3583	*/
				3584	#define UPDATE_TG 0x1
				3585	#define SKIP_AGE_LOAD 0x2
				3586	#define DO_ATTACH 0x4
				3587
				3588	/* Update task and its cfs_rq load average */
				3589	static inline void update_load_avg(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
				3590	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3591	u64 now = cfs_rq_clock_pelt(cfs_rq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3592	int decayed;
				3593
				3594	/*
				3595	* Track task load average for carrying it to new CPU after migrated, and
				3596	* track group sched_entity load average for task_h_load calc in migration
				3597	*/
				3598	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3599	__update_load_avg_se(now, cfs_rq, se);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3600
				3601	decayed = update_cfs_rq_load_avg(now, cfs_rq);
				3602	decayed \|= propagate_entity_load_avg(se);
				3603
				3604	if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
				3605
				3606	/*
				3607	* DO_ATTACH means we're here from enqueue_entity().
				3608	* !last_update_time means we've passed through
				3609	* migrate_task_rq_fair() indicating we migrated.
				3610	*
				3611	* IOW we're enqueueing a task on a new CPU.
				3612	*/
				3613	attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
				3614	update_tg_load_avg(cfs_rq, 0);
				3615
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3616	} else if (decayed) {
				3617	cfs_rq_util_change(cfs_rq, 0);
				3618
				3619	if (flags & UPDATE_TG)
				3620	update_tg_load_avg(cfs_rq, 0);
				3621	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3622	}
				3623
				3624	#ifndef CONFIG_64BIT
				3625	static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
				3626	{
				3627	u64 last_update_time_copy;
				3628	u64 last_update_time;
				3629
				3630	do {
				3631	last_update_time_copy = cfs_rq->load_last_update_time_copy;
				3632	smp_rmb();
				3633	last_update_time = cfs_rq->avg.last_update_time;
				3634	} while (last_update_time != last_update_time_copy);
				3635
				3636	return last_update_time;
				3637	}
				3638	#else
				3639	static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
				3640	{
				3641	return cfs_rq->avg.last_update_time;
				3642	}
				3643	#endif
				3644
				3645	/*
				3646	* Synchronize entity load avg of dequeued entity without locking
				3647	* the previous rq.
				3648	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3649	static void sync_entity_load_avg(struct sched_entity *se)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3650	{
				3651	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				3652	u64 last_update_time;
				3653
				3654	last_update_time = cfs_rq_last_update_time(cfs_rq);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3655	__update_load_avg_blocked_se(last_update_time, se);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3656	}
				3657
				3658	/*
				3659	* Task first catches up with cfs_rq, and then subtract
				3660	* itself from the cfs_rq (task must be off the queue now).
				3661	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3662	static void remove_entity_load_avg(struct sched_entity *se)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3663	{
				3664	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				3665	unsigned long flags;
				3666
				3667	/*
				3668	* tasks cannot exit without having gone through wake_up_new_task() ->
				3669	* post_init_entity_util_avg() which will have added things to the
				3670	* cfs_rq, so we can remove unconditionally.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3671	*/
				3672
				3673	sync_entity_load_avg(se);
				3674
				3675	raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
				3676	++cfs_rq->removed.nr;
				3677	cfs_rq->removed.util_avg += se->avg.util_avg;
				3678	cfs_rq->removed.load_avg += se->avg.load_avg;
				3679	cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
				3680	raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
				3681	}
				3682
				3683	static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
				3684	{
				3685	return cfs_rq->avg.runnable_load_avg;
				3686	}
				3687
				3688	static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
				3689	{
				3690	return cfs_rq->avg.load_avg;
				3691	}
				3692
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3693	static inline unsigned long task_util(struct task_struct *p)
				3694	{
				3695	return READ_ONCE(p->se.avg.util_avg);
				3696	}
				3697
				3698	static inline unsigned long _task_util_est(struct task_struct *p)
				3699	{
				3700	struct util_est ue = READ_ONCE(p->se.avg.util_est);
				3701
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3702	return (max(ue.ewma, ue.enqueued) \| UTIL_AVG_UNCHANGED);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3703	}
				3704
				3705	static inline unsigned long task_util_est(struct task_struct *p)
				3706	{
				3707	return max(task_util(p), _task_util_est(p));
				3708	}
				3709
				3710	static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
				3711	struct task_struct *p)
				3712	{
				3713	unsigned int enqueued;
				3714
				3715	if (!sched_feat(UTIL_EST))
				3716	return;
				3717
				3718	/* Update root cfs_rq's estimated utilization */
				3719	enqueued = cfs_rq->avg.util_est.enqueued;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3720	enqueued += _task_util_est(p);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3721	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
				3722	}
				3723
				3724	/*
				3725	* Check if a (signed) value is within a specified (unsigned) margin,
				3726	* based on the observation that:
				3727	*
				3728	* abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
				3729	*
				3730	* NOTE: this only works when value + maring < INT_MAX.
				3731	*/
				3732	static inline bool within_margin(int value, int margin)
				3733	{
				3734	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
				3735	}
				3736
				3737	static void
				3738	util_est_dequeue(struct cfs_rq cfs_rq, struct task_struct p, bool task_sleep)
				3739	{
				3740	long last_ewma_diff;
				3741	struct util_est ue;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3742	int cpu;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3743
				3744	if (!sched_feat(UTIL_EST))
				3745	return;
				3746
				3747	/* Update root cfs_rq's estimated utilization */
				3748	ue.enqueued = cfs_rq->avg.util_est.enqueued;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3749	ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3750	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
				3751
				3752	/*
				3753	* Skip update of task's estimated utilization when the task has not
				3754	* yet completed an activation, e.g. being migrated.
				3755	*/
				3756	if (!task_sleep)
				3757	return;
				3758
				3759	/*
				3760	* If the PELT values haven't changed since enqueue time,
				3761	* skip the util_est update.
				3762	*/
				3763	ue = p->se.avg.util_est;
				3764	if (ue.enqueued & UTIL_AVG_UNCHANGED)
				3765	return;
				3766
				3767	/*
				3768	* Skip update of task's estimated utilization when its EWMA is
				3769	* already ~1% close to its last activation value.
				3770	*/
				3771	ue.enqueued = (task_util(p) \| UTIL_AVG_UNCHANGED);
				3772	last_ewma_diff = ue.enqueued - ue.ewma;
				3773	if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
				3774	return;
				3775
				3776	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3777	* To avoid overestimation of actual task utilization, skip updates if
				3778	* we cannot grant there is idle time in this CPU.
				3779	*/
				3780	cpu = cpu_of(rq_of(cfs_rq));
				3781	if (task_util(p) > capacity_orig_of(cpu))
				3782	return;
				3783
				3784	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3785	* Update Task's estimated utilization
				3786	*
				3787	* When *p completes an activation we can consolidate another sample
				3788	* of the task size. This is done by storing the current PELT value
				3789	* as ue.enqueued and by using this value to update the Exponential
				3790	* Weighted Moving Average (EWMA):
				3791	*
				3792	* ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
				3793	* = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
				3794	* = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
				3795	* = w * ( last_ewma_diff ) + ewma(t-1)
				3796	* = w * (last_ewma_diff + ewma(t-1) / w)
				3797	*
				3798	* Where 'w' is the weight of new samples, which is configured to be
				3799	* 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
				3800	*/
				3801	ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
				3802	ue.ewma += last_ewma_diff;
				3803	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
				3804	WRITE_ONCE(p->se.avg.util_est, ue);
				3805	}
				3806
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3807	static inline int task_fits_capacity(struct task_struct *p, long capacity)
				3808	{
				3809	return fits_capacity(task_util_est(p), capacity);
				3810	}
				3811
				3812	static inline void update_misfit_status(struct task_struct p, struct rq rq)
				3813	{
				3814	if (!static_branch_unlikely(&sched_asym_cpucapacity))
				3815	return;
				3816
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3817	if (!p \|\| p->nr_cpus_allowed == 1) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3818	rq->misfit_task_load = 0;
				3819	return;
				3820	}
				3821
				3822	if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
				3823	rq->misfit_task_load = 0;
				3824	return;
				3825	}
				3826
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3827	/*
				3828	* Make sure that misfit_task_load will not be null even if
				3829	* task_h_load() returns 0.
				3830	*/
				3831	rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3832	}
				3833
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3834	#else /* CONFIG_SMP */
				3835
				3836	#define UPDATE_TG 0x0
				3837	#define SKIP_AGE_LOAD 0x0
				3838	#define DO_ATTACH 0x0
				3839
				3840	static inline void update_load_avg(struct cfs_rq cfs_rq, struct sched_entity se, int not_used1)
				3841	{
				3842	cfs_rq_util_change(cfs_rq, 0);
				3843	}
				3844
				3845	static inline void remove_entity_load_avg(struct sched_entity *se) {}
				3846
				3847	static inline void
				3848	attach_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se, int flags) {}
				3849	static inline void
				3850	detach_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se) {}
				3851
				3852	static inline int idle_balance(struct rq rq, struct rq_flags rf)
				3853	{
				3854	return 0;
				3855	}
				3856
				3857	static inline void
				3858	util_est_enqueue(struct cfs_rq cfs_rq, struct task_struct p) {}
				3859
				3860	static inline void
				3861	util_est_dequeue(struct cfs_rq cfs_rq, struct task_struct p,
				3862	bool task_sleep) {}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3863	static inline void update_misfit_status(struct task_struct p, struct rq rq) {}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3864
				3865	#endif /* CONFIG_SMP */
				3866
				3867	static void check_spread(struct cfs_rq cfs_rq, struct sched_entity se)
				3868	{
				3869	#ifdef CONFIG_SCHED_DEBUG
				3870	s64 d = se->vruntime - cfs_rq->min_vruntime;
				3871
				3872	if (d < 0)
				3873	d = -d;
				3874
				3875	if (d > 3*sysctl_sched_latency)
				3876	schedstat_inc(cfs_rq->nr_spread_over);
				3877	#endif
				3878	}
				3879
				3880	static void
				3881	place_entity(struct cfs_rq cfs_rq, struct sched_entity se, int initial)
				3882	{
				3883	u64 vruntime = cfs_rq->min_vruntime;
				3884
				3885	/*
				3886	* The 'current' period is already promised to the current tasks,
				3887	* however the extra weight of the new task will slow them down a
				3888	* little, place the new task so that it fits in the slot that
				3889	* stays open at the end.
				3890	*/
				3891	if (initial && sched_feat(START_DEBIT))
				3892	vruntime += sched_vslice(cfs_rq, se);
				3893
				3894	/* sleeps up to a single latency don't count. */
				3895	if (!initial) {
				3896	unsigned long thresh = sysctl_sched_latency;
				3897
				3898	/*
				3899	* Halve their sleep time's effect, to allow
				3900	* for a gentler effect of sleepers:
				3901	*/
				3902	if (sched_feat(GENTLE_FAIR_SLEEPERS))
				3903	thresh >>= 1;
				3904
				3905	vruntime -= thresh;
				3906	}
				3907
				3908	/* ensure we never gain time by being placed backwards. */
				3909	se->vruntime = max_vruntime(se->vruntime, vruntime);
				3910	}
				3911
				3912	static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
				3913
				3914	static inline void check_schedstat_required(void)
				3915	{
				3916	#ifdef CONFIG_SCHEDSTATS
				3917	if (schedstat_enabled())
				3918	return;
				3919
				3920	/* Force schedstat enabled if a dependent tracepoint is active */
				3921	if (trace_sched_stat_wait_enabled() \|\|
				3922	trace_sched_stat_sleep_enabled() \|\|
				3923	trace_sched_stat_iowait_enabled() \|\|
				3924	trace_sched_stat_blocked_enabled() \|\|
				3925	trace_sched_stat_runtime_enabled()) {
				3926	printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
				3927	"stat_blocked and stat_runtime require the "
				3928	"kernel parameter schedstats=enable or "
				3929	"kernel.sched_schedstats=1\n");
				3930	}
				3931	#endif
				3932	}
				3933
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3934	static inline bool cfs_bandwidth_used(void);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3935
				3936	/*
				3937	* MIGRATION
				3938	*
				3939	* dequeue
				3940	* update_curr()
				3941	* update_min_vruntime()
				3942	* vruntime -= min_vruntime
				3943	*
				3944	* enqueue
				3945	* update_curr()
				3946	* update_min_vruntime()
				3947	* vruntime += min_vruntime
				3948	*
				3949	* this way the vruntime transition between RQs is done when both
				3950	* min_vruntime are up-to-date.
				3951	*
				3952	* WAKEUP (remote)
				3953	*
				3954	* ->migrate_task_rq_fair() (p->state == TASK_WAKING)
				3955	* vruntime -= min_vruntime
				3956	*
				3957	* enqueue
				3958	* update_curr()
				3959	* update_min_vruntime()
				3960	* vruntime += min_vruntime
				3961	*
				3962	* this way we don't have the most up-to-date min_vruntime on the originating
				3963	* CPU and an up-to-date min_vruntime on the destination CPU.
				3964	*/
				3965
				3966	static void
				3967	enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
				3968	{
				3969	bool renorm = !(flags & ENQUEUE_WAKEUP) \|\| (flags & ENQUEUE_MIGRATED);
				3970	bool curr = cfs_rq->curr == se;
				3971
				3972	/*
				3973	* If we're the current task, we must renormalise before calling
				3974	* update_curr().
				3975	*/
				3976	if (renorm && curr)
				3977	se->vruntime += cfs_rq->min_vruntime;
				3978
				3979	update_curr(cfs_rq);
				3980
				3981	/*
				3982	* Otherwise, renormalise after, such that we're placed at the current
				3983	* moment in time, instead of some random moment in the past. Being
				3984	* placed in the past could significantly boost this task to the
				3985	* fairness detriment of existing tasks.
				3986	*/
				3987	if (renorm && !curr)
				3988	se->vruntime += cfs_rq->min_vruntime;
				3989
				3990	/*
				3991	* When enqueuing a sched_entity, we must:
				3992	* - Update loads to have both entity and cfs_rq synced with now.
				3993	* - Add its load to cfs_rq->runnable_avg
				3994	* - For group_entity, update its weight to reflect the new share of
				3995	* its group cfs_rq
				3996	* - Add its new weight to cfs_rq->load.weight
				3997	*/
				3998	update_load_avg(cfs_rq, se, UPDATE_TG \| DO_ATTACH);
				3999	update_cfs_group(se);
				4000	enqueue_runnable_load_avg(cfs_rq, se);
				4001	account_entity_enqueue(cfs_rq, se);
				4002
				4003	if (flags & ENQUEUE_WAKEUP)
				4004	place_entity(cfs_rq, se, 0);
				4005
				4006	check_schedstat_required();
				4007	update_stats_enqueue(cfs_rq, se, flags);
				4008	check_spread(cfs_rq, se);
				4009	if (!curr)
				4010	__enqueue_entity(cfs_rq, se);
				4011	se->on_rq = 1;
				4012
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4013	/*
				4014	* When bandwidth control is enabled, cfs might have been removed
				4015	* because of a parent been throttled but cfs->nr_running > 1. Try to
				4016	* add it unconditionnally.
				4017	*/
				4018	if (cfs_rq->nr_running == 1 \|\| cfs_bandwidth_used())
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4019	list_add_leaf_cfs_rq(cfs_rq);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4020
				4021	if (cfs_rq->nr_running == 1)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4022	check_enqueue_throttle(cfs_rq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4023	}
				4024
				4025	static void __clear_buddies_last(struct sched_entity *se)
				4026	{
				4027	for_each_sched_entity(se) {
				4028	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				4029	if (cfs_rq->last != se)
				4030	break;
				4031
				4032	cfs_rq->last = NULL;
				4033	}
				4034	}
				4035
				4036	static void __clear_buddies_next(struct sched_entity *se)
				4037	{
				4038	for_each_sched_entity(se) {
				4039	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				4040	if (cfs_rq->next != se)
				4041	break;
				4042
				4043	cfs_rq->next = NULL;
				4044	}
				4045	}
				4046
				4047	static void __clear_buddies_skip(struct sched_entity *se)
				4048	{
				4049	for_each_sched_entity(se) {
				4050	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				4051	if (cfs_rq->skip != se)
				4052	break;
				4053
				4054	cfs_rq->skip = NULL;
				4055	}
				4056	}
				4057
				4058	static void clear_buddies(struct cfs_rq cfs_rq, struct sched_entity se)
				4059	{
				4060	if (cfs_rq->last == se)
				4061	__clear_buddies_last(se);
				4062
				4063	if (cfs_rq->next == se)
				4064	__clear_buddies_next(se);
				4065
				4066	if (cfs_rq->skip == se)
				4067	__clear_buddies_skip(se);
				4068	}
				4069
				4070	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
				4071
				4072	static void
				4073	dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
				4074	{
				4075	/*
				4076	* Update run-time statistics of the 'current'.
				4077	*/
				4078	update_curr(cfs_rq);
				4079
				4080	/*
				4081	* When dequeuing a sched_entity, we must:
				4082	* - Update loads to have both entity and cfs_rq synced with now.
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4083	* - Subtract its load from the cfs_rq->runnable_avg.
				4084	* - Subtract its previous weight from cfs_rq->load.weight.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4085	* - For group entity, update its weight to reflect the new share
				4086	* of its group cfs_rq.
				4087	*/
				4088	update_load_avg(cfs_rq, se, UPDATE_TG);
				4089	dequeue_runnable_load_avg(cfs_rq, se);
				4090
				4091	update_stats_dequeue(cfs_rq, se, flags);
				4092
				4093	clear_buddies(cfs_rq, se);
				4094
				4095	if (se != cfs_rq->curr)
				4096	__dequeue_entity(cfs_rq, se);
				4097	se->on_rq = 0;
				4098	account_entity_dequeue(cfs_rq, se);
				4099
				4100	/*
				4101	* Normalize after update_curr(); which will also have moved
				4102	* min_vruntime if @se is the one holding it back. But before doing
				4103	* update_min_vruntime() again, which will discount @se's position and
				4104	* can move min_vruntime forward still more.
				4105	*/
				4106	if (!(flags & DEQUEUE_SLEEP))
				4107	se->vruntime -= cfs_rq->min_vruntime;
				4108
				4109	/* return excess runtime on last dequeue */
				4110	return_cfs_rq_runtime(cfs_rq);
				4111
				4112	update_cfs_group(se);
				4113
				4114	/*
				4115	* Now advance min_vruntime if @se was the entity holding it back,
				4116	* except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
				4117	* put back on, and if we advance min_vruntime, we'll be placed back
				4118	* further than we started -- ie. we'll be penalized.
				4119	*/
				4120	if ((flags & (DEQUEUE_SAVE \| DEQUEUE_MOVE)) != DEQUEUE_SAVE)
				4121	update_min_vruntime(cfs_rq);
				4122	}
				4123
				4124	/*
				4125	* Preempt the current task with a newly woken task if needed:
				4126	*/
				4127	static void
				4128	check_preempt_tick(struct cfs_rq cfs_rq, struct sched_entity curr)
				4129	{
				4130	unsigned long ideal_runtime, delta_exec;
				4131	struct sched_entity *se;
				4132	s64 delta;
				4133
				4134	ideal_runtime = sched_slice(cfs_rq, curr);
				4135	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
				4136	if (delta_exec > ideal_runtime) {
				4137	resched_curr(rq_of(cfs_rq));
				4138	/*
				4139	* The current task ran long enough, ensure it doesn't get
				4140	* re-elected due to buddy favours.
				4141	*/
				4142	clear_buddies(cfs_rq, curr);
				4143	return;
				4144	}
				4145
				4146	/*
				4147	* Ensure that a task that missed wakeup preemption by a
				4148	* narrow margin doesn't have to wait for a full slice.
				4149	* This also mitigates buddy induced latencies under load.
				4150	*/
				4151	if (delta_exec < sysctl_sched_min_granularity)
				4152	return;
				4153
				4154	se = __pick_first_entity(cfs_rq);
				4155	delta = curr->vruntime - se->vruntime;
				4156
				4157	if (delta < 0)
				4158	return;
				4159
				4160	if (delta > ideal_runtime)
				4161	resched_curr(rq_of(cfs_rq));
				4162	}
				4163
					/*
					* Install @se as the running entity of @cfs_rq.
					*
					* If @se is on the runqueue its wait statistics are closed, it is taken
					* out of the tree and its load average is updated. Afterwards
					* cfs_rq->curr points at @se and se->prev_sum_exec_runtime is reset to
					* the current se->sum_exec_runtime.
					*/
				4164	static void
				4165	set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
				4166	{
				4167	/* 'current' is not kept within the tree. */
				4168	if (se->on_rq) {
				4169	/*
				4170	* Any task has to be enqueued before it gets to execute on
				4171	* a CPU. So account for the time it spent waiting on the
				4172	* runqueue.
				4173	*/
				4174	update_stats_wait_end(cfs_rq, se);
				4175	__dequeue_entity(cfs_rq, se);
				4176	update_load_avg(cfs_rq, se, UPDATE_TG);
				4177	}
				4178
				4179	update_stats_curr_start(cfs_rq, se);
				4180	cfs_rq->curr = se;
				4181
				4182	/*
				4183	* Track our maximum slice length, if the CPU's load is at
				4184	* least twice that of our own weight (i.e. don't track it
				4185	* when there are only lesser-weight tasks around):
				4186	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4187	if (schedstat_enabled() &&
				4188	rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4189	schedstat_set(se->statistics.slice_max,
				4190	max((u64)schedstat_val(se->statistics.slice_max),
				4191	se->sum_exec_runtime - se->prev_sum_exec_runtime));
				4192	}
				4193
					/* start measuring the new slice from here */
				4194	se->prev_sum_exec_runtime = se->sum_exec_runtime;
				4195	}
				4196
					/* Forward declaration: pick_next_entity() calls this before it is defined. */
				4197	static int
				4198	wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
				4199
				4200	/*
				4201	* Pick the next process, keeping these things in mind, in this order:
				4202	* 1) keep things fair between processes/task groups
				4203	* 2) pick the "next" process, since someone really wants that to run
				4204	* 3) pick the "last" process, for cache locality
				4205	* 4) do not run the "skip" process, if something else is available
					*
					* @curr may be NULL; it is checked before every use. The chosen entity is
					* passed to clear_buddies() and then returned.
				4206	*/
				4207	static struct sched_entity *
				4208	pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
				4209	{
				4210	struct sched_entity *left = __pick_first_entity(cfs_rq);
				4211	struct sched_entity *se;
				4212
				4213	/*
				4214	* If curr is set we have to see if it's left of the leftmost entity
				4215	* still in the tree, provided there was anything in the tree at all.
				4216	*/
				4217	if (!left || (curr && entity_before(curr, left)))
				4218	left = curr;
				4219
				4220	se = left; /* ideally we run the leftmost entity */
				4221
				4222	/*
				4223	* Avoid running the skip buddy, if running something else can
				4224	* be done without getting too unfair.
				4225	*/
				4226	if (cfs_rq->skip == se) {
				4227	struct sched_entity *second;
				4228
				4229	if (se == curr) {
				4230	second = __pick_first_entity(cfs_rq);
				4231	} else {
				4232	second = __pick_next_entity(se);
				4233	if (!second || (curr && entity_before(curr, second)))
				4234	second = curr;
				4235	}
				4236
				4237	if (second && wakeup_preempt_entity(second, left) < 1)
				4238	se = second;
				4239	}
				4240
				4241	/*
				4242	* Prefer last buddy, try to return the CPU to a preempted task.
				4243	*/
				4244	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
				4245	se = cfs_rq->last;
				4246
				4247	/*
				4248	* Someone really wants this to run. If it's not unfair, run it.
					* Checked after "last", so a qualifying "next" buddy overrides it.
				4249	*/
				4250	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
				4251	se = cfs_rq->next;
				4252
				4253	clear_buddies(cfs_rq, se);
				4254
				4255	return se;
				4256	}
				4257
				4258	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
				4259
					/*
					* Bookkeeping for @prev when it stops being the running entity of @cfs_rq.
					*
					* If @prev is still on the runqueue it is accounted via update_curr(),
					* put back into the tree and its wait statistics restarted. In every case
					* the bandwidth check runs and cfs_rq->curr is cleared.
					*/
				4260	static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
				4261	{
				4262	/*
				4263	* If still on the runqueue then deactivate_task()
				4264	* was not called and update_curr() has to be done:
				4265	*/
				4266	if (prev->on_rq)
				4267	update_curr(cfs_rq);
				4268
				4269	/* throttle cfs_rqs exceeding runtime */
				4270	check_cfs_rq_runtime(cfs_rq);
				4271
				4272	check_spread(cfs_rq, prev);
				4273
				4274	if (prev->on_rq) {
				4275	update_stats_wait_start(cfs_rq, prev);
				4276	/* Put 'current' back into the tree. */
				4277	__enqueue_entity(cfs_rq, prev);
				4278	/* in !on_rq case, update occurred at dequeue */
				4279	update_load_avg(cfs_rq, prev, 0);
				4280	}
				4281	cfs_rq->curr = NULL;
				4282	}
				4283
					/*
					* Tick handling for the running entity @curr of @cfs_rq.
					*
					* @queued: when non-zero (CONFIG_SCHED_HRTICK only) a reschedule is
					* requested immediately and the slice check is skipped.
					*
					* Otherwise check_preempt_tick() is consulted, but only when more than
					* one entity is runnable on @cfs_rq.
					*/
				4284	static void
				4285	entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
				4286	{
				4287	/*
				4288	* Update run-time statistics of the 'current'.
				4289	*/
				4290	update_curr(cfs_rq);
				4291
				4292	/*
				4293	* Ensure that runnable average is periodically updated.
				4294	*/
				4295	update_load_avg(cfs_rq, curr, UPDATE_TG);
				4296	update_cfs_group(curr);
				4297
				4298	#ifdef CONFIG_SCHED_HRTICK
				4299	/*
				4300	* queued ticks are scheduled to match the slice, so don't bother
				4301	* validating it and just reschedule.
				4302	*/
				4303	if (queued) {
				4304	resched_curr(rq_of(cfs_rq));
				4305	return;
				4306	}
				4307	/*
				4308	* don't let the period tick interfere with the hrtick preemption
				4309	*/
				4310	if (!sched_feat(DOUBLE_TICK) &&
				4311	hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
				4312	return;
				4313	#endif
				4314
				4315	if (cfs_rq->nr_running > 1)
				4316	check_preempt_tick(cfs_rq, curr);
				4317	}
				4318
				4319
				4320	/**************************************************
				4321	* CFS bandwidth control machinery
				4322	*/
				4323
				4324	#ifdef CONFIG_CFS_BANDWIDTH
				4325
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4326	#ifdef CONFIG_JUMP_LABEL
					/*
					* With jump labels, "is bandwidth control in use" is a static key:
					* cfs_bandwidth_used() tests it with static_key_false(), and the
					* usage_inc/usage_dec helpers raise and lower it.
					*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4327	static struct static_key __cfs_bandwidth_used;
				4328
				4329	static inline bool cfs_bandwidth_used(void)
				4330	{
				4331	return static_key_false(&__cfs_bandwidth_used);
				4332	}
				4333
				4334	void cfs_bandwidth_usage_inc(void)
				4335	{
				4336	static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
				4337	}
				4338
				4339	void cfs_bandwidth_usage_dec(void)
				4340	{
				4341	static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
				4342	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4343	#else /* CONFIG_JUMP_LABEL */
					/*
					* Without jump labels bandwidth control is always reported as in use and
					* the usage counters are empty functions.
					*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4344	static bool cfs_bandwidth_used(void)
				4345	{
				4346	return true;
				4347	}
				4348
				4349	void cfs_bandwidth_usage_inc(void) {}
				4350	void cfs_bandwidth_usage_dec(void) {}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4351	#endif /* CONFIG_JUMP_LABEL */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4352
				4353	/*
				4354	* default period for cfs group bandwidth.
				4355	* default: 0.1s, units: nanoseconds
					*
					* Returns the constant 100000000 (0.1s expressed in ns) as a u64.
				4356	*/
				4357	static inline u64 default_cfs_period(void)
				4358	{
				4359	return 100000000ULL;
				4360	}
				4361
					/*
					* Returns sysctl_sched_cfs_bandwidth_slice multiplied by NSEC_PER_USEC.
					* The value is widened to u64 before the multiplication.
					* NOTE(review): the multiplier implies the sysctl is in microseconds and
					* the result in nanoseconds - confirm against the sysctl definition.
					*/
				4362	static inline u64 sched_cfs_bandwidth_slice(void)
				4363	{
				4364	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
				4365	}
				4366
				4367	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4368	* Replenish runtime according to assigned quota. We use sched_clock_cpu
				4369	* directly instead of rq->clock to avoid adding additional synchronization
				4370	* around rq->lock.
					*
					* NOTE(review): nothing in this body reads a clock; it only copies
					* cfs_b->quota into cfs_b->runtime when the quota is not RUNTIME_INF, so
					* the sched_clock_cpu remark above looks stale - confirm before relying
					* on it.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4371	*
				4372	* requires cfs_b->lock
				4373	*/
				4374	void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
				4375	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4376	if (cfs_b->quota != RUNTIME_INF)
				4377	cfs_b->runtime = cfs_b->quota;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4378	}
				4379
					/* Returns the address of the cfs_bandwidth embedded in task group @tg. */
				4380	static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
				4381	{
				4382	return &tg->cfs_bandwidth;
				4383	}
				4384
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4385	/* returns 0 on failure to allocate runtime */
					/*
					* Try to raise cfs_rq->runtime_remaining to @target_runtime.
					*
					* Caller must hold cfs_b->lock (checked with lockdep_assert_held()).
					* With an RUNTIME_INF quota the whole shortfall is granted. Otherwise
					* start_cfs_bandwidth() is called and at most cfs_b->runtime is moved
					* from the pool to the cfs_rq, which also clears cfs_b->idle.
					* The return value is non-zero iff runtime_remaining ends up > 0.
					*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4386	static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
				4387	struct cfs_rq *cfs_rq, u64 target_runtime)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4388	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4389	u64 min_amount, amount = 0;
				4390
				4391	lockdep_assert_held(&cfs_b->lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4392
				4393	/* note: this is a positive sum as runtime_remaining <= 0 */
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4394	min_amount = target_runtime - cfs_rq->runtime_remaining;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4395
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4396	if (cfs_b->quota == RUNTIME_INF)
				4397	amount = min_amount;
				4398	else {
				4399	start_cfs_bandwidth(cfs_b);
				4400
				4401	if (cfs_b->runtime > 0) {
					/* take what is needed, capped by what the pool still holds */
				4402	amount = min(cfs_b->runtime, min_amount);
				4403	cfs_b->runtime -= amount;
				4404	cfs_b->idle = 0;
				4405	}
				4406	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4407
				4408	cfs_rq->runtime_remaining += amount;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4409
				4410	return cfs_rq->runtime_remaining > 0;
				4411	}
				4412
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4413	/* returns 0 on failure to allocate runtime */
					/*
					* Locked wrapper: takes cfs_b->lock of @cfs_rq's task group and asks
					* __assign_cfs_rq_runtime() for one sched_cfs_bandwidth_slice() of runtime.
					*/
				4414	static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				4415	{
				4416	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				4417	int ret;
				4418
				4419	raw_spin_lock(&cfs_b->lock);
				4420	ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
				4421	raw_spin_unlock(&cfs_b->lock);
				4422
				4423	return ret;
				4424	}
				4425
					/*
					* Charge @delta_exec against cfs_rq->runtime_remaining.
					*
					* If that leaves no runtime and the cfs_rq is not already throttled, more
					* is requested via assign_cfs_rq_runtime(); when that fails and an entity
					* is running (cfs_rq->curr), a reschedule is requested.
					*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4426	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
				4427	{
				4428	/* dock delta_exec before expiring quota (as it could span periods) */
				4429	cfs_rq->runtime_remaining -= delta_exec;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4430
				4431	if (likely(cfs_rq->runtime_remaining > 0))
				4432	return;
				4433
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4434	if (cfs_rq->throttled)
				4435	return;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4436	/*
				4437	* if we're unable to extend our runtime we resched so that the active
				4438	* hierarchy can be throttled
				4439	*/
				4440	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
				4441	resched_curr(rq_of(cfs_rq));
				4442	}
				4443
					/*
					* Charge @delta_exec to @cfs_rq, but only when bandwidth control is in use
					* and runtime accounting is enabled for this cfs_rq; otherwise do nothing.
					*/
				4444	static __always_inline
				4445	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
				4446	{
				4447	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
				4448	return;
				4449
				4450	__account_cfs_rq_runtime(cfs_rq, delta_exec);
				4451	}
				4452
					/* non-zero iff bandwidth control is in use and cfs_rq->throttled is set */
				4453	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				4454	{
				4455	return cfs_bandwidth_used() && cfs_rq->throttled;
				4456	}
				4457
				4458	/* check whether cfs_rq, or any parent, is throttled */
					/* (non-zero iff bandwidth control is in use and throttle_count != 0) */
				4459	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				4460	{
				4461	return cfs_bandwidth_used() && cfs_rq->throttle_count;
				4462	}
				4463
				4464	/*
				4465	* Ensure that neither of the group entities corresponding to src_cpu or
				4466	* dest_cpu are members of a throttled hierarchy when performing group
				4467	* load-balance operations.
					*
					* Returns non-zero if @tg's cfs_rq on either @src_cpu or @dest_cpu reports
					* throttled_hierarchy().
				4468	*/
				4469	static inline int throttled_lb_pair(struct task_group *tg,
				4470	int src_cpu, int dest_cpu)
				4471	{
				4472	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
				4473
				4474	src_cfs_rq = tg->cfs_rq[src_cpu];
				4475	dest_cfs_rq = tg->cfs_rq[dest_cpu];
				4476
				4477	return throttled_hierarchy(src_cfs_rq) ||
				4478	throttled_hierarchy(dest_cfs_rq);
				4479	}
				4480
					/*
					* Task-group tree callback used when unthrottling.
					*
					* @tg: group being visited.
					* @data: the struct rq whose CPU selects which of @tg's cfs_rqs to update.
					*
					* Decrements that cfs_rq's throttle_count. When it reaches zero the time
					* since throttled_clock_task is added to throttled_clock_task_time and a
					* cfs_rq that has runnable entities is put back on the leaf list.
					* Always returns 0.
					*/
				4481	static int tg_unthrottle_up(struct task_group *tg, void *data)
				4482	{
				4483	struct rq *rq = data;
				4484	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				4485
				4486	cfs_rq->throttle_count--;
				4487	if (!cfs_rq->throttle_count) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4488	cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
				4489	cfs_rq->throttled_clock_task;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4490
				4491	/* Add cfs_rq with already running entity in the list */
				4492	if (cfs_rq->nr_running >= 1)
				4493	list_add_leaf_cfs_rq(cfs_rq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4494	}
				4495
				4496	return 0;
				4497	}
				4498
					/*
					* Task-group tree callback used when throttling.
					*
					* @tg: group being visited.
					* @data: the struct rq whose CPU selects which of @tg's cfs_rqs to update.
					*
					* On the 0 -> 1 transition of throttle_count the current task clock is
					* saved in throttled_clock_task and the cfs_rq is removed from the leaf
					* list; the count is then incremented. Always returns 0.
					*/
				4499	static int tg_throttle_down(struct task_group *tg, void *data)
				4500	{
				4501	struct rq *rq = data;
				4502	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				4503
				4504	/* group is entering throttled state, stop time */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4505	if (!cfs_rq->throttle_count) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4506	cfs_rq->throttled_clock_task = rq_clock_task(rq);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4507	list_del_leaf_cfs_rq(cfs_rq);
				4508	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4509	cfs_rq->throttle_count++;
				4510
				4511	return 0;
				4512	}
				4513
					/*
					* Throttle @cfs_rq unless runtime has become available in the meantime.
					*
					* Returns false when 1ns of runtime could still be assigned (nothing is
					* throttled), true after the cfs_rq has been added to the throttled list,
					* its entities dequeued up the hierarchy and cfs_rq->throttled set.
					*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4514	static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4515	{
				4516	struct rq *rq = rq_of(cfs_rq);
				4517	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				4518	struct sched_entity *se;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4519	long task_delta, idle_task_delta, dequeue = 1;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4520
				4521	raw_spin_lock(&cfs_b->lock);
				4522	/* This will start the period timer if necessary */
				4523	if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
				4524	/*
				4525	* We have raced with bandwidth becoming available, and if we
				4526	* actually throttled the timer might not unthrottle us for an
				4527	* entire period. We additionally needed to make sure that any
				4528	* subsequent check_cfs_rq_runtime calls agree not to throttle
				4529	* us, as we may commit to do cfs put_prev+pick_next, so we ask
				4530	* for 1ns of runtime rather than just check cfs_b.
				4531	*/
				4532	dequeue = 0;
				4533	} else {
				4534	list_add_tail_rcu(&cfs_rq->throttled_list,
				4535	&cfs_b->throttled_cfs_rq);
				4536	}
				4537	raw_spin_unlock(&cfs_b->lock);
				4538
				4539	if (!dequeue)
				4540	return false; /* Throttle no longer required. */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4541
				4542	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
				4543
				4544	/* freeze hierarchy runnable averages while throttled */
				4545	rcu_read_lock();
				4546	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
				4547	rcu_read_unlock();
				4548
					/* counts removed from every level visited by the loop below */
				4549	task_delta = cfs_rq->h_nr_running;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4550	idle_task_delta = cfs_rq->idle_h_nr_running;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4551	for_each_sched_entity(se) {
				4552	struct cfs_rq *qcfs_rq = cfs_rq_of(se);
				4553	/* throttled entity or throttle-on-deactivate */
				4554	if (!se->on_rq)
				4555	break;
				4556
				4557	if (dequeue)
				4558	dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
				4559	qcfs_rq->h_nr_running -= task_delta;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4560	qcfs_rq->idle_h_nr_running -= idle_task_delta;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4561
					/* this level still has weight: stop dequeueing, keep fixing counts */
				4562	if (qcfs_rq->load.weight)
				4563	dequeue = 0;
				4564	}
				4565
					/* loop finished without the !on_rq break: adjust the rq-wide count too */
				4566	if (!se)
				4567	sub_nr_running(rq, task_delta);
				4568
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4569	/*
				4570	* Note: distribution will already see us throttled via the
				4571	* throttled-list. rq->lock protects completion.
				4572	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4573	cfs_rq->throttled = 1;
				4574	cfs_rq->throttled_clock = rq_clock(rq);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4575	return true;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4576	}
				4577
					/*
					* Undo the throttling of @cfs_rq.
					*
					* Clears cfs_rq->throttled, adds the throttled interval to
					* cfs_b->throttled_time, removes the cfs_rq from the throttled list and
					* runs tg_unthrottle_up over the group tree. If the cfs_rq carries load,
					* its entities are re-enqueued and the h_nr_running counters restored up
					* the hierarchy, stopping early at a level that is itself throttled.
					*/
				4578	void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
				4579	{
				4580	struct rq *rq = rq_of(cfs_rq);
				4581	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				4582	struct sched_entity *se;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4583	long task_delta, idle_task_delta;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4584
				4585	se = cfs_rq->tg->se[cpu_of(rq)];
				4586
				4587	cfs_rq->throttled = 0;
				4588
				4589	update_rq_clock(rq);
				4590
				4591	raw_spin_lock(&cfs_b->lock);
				4592	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
				4593	list_del_rcu(&cfs_rq->throttled_list);
				4594	raw_spin_unlock(&cfs_b->lock);
				4595
				4596	/* update hierarchical throttle state */
				4597	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
				4598
					/* nothing queued below this cfs_rq: no entities to put back */
				4599	if (!cfs_rq->load.weight)
				4600	return;
				4601
				4602	task_delta = cfs_rq->h_nr_running;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4603	idle_task_delta = cfs_rq->idle_h_nr_running;
					/* first pass: enqueue entities until one is found already on its rq */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4604	for_each_sched_entity(se) {
				4605	if (se->on_rq)
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4606	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4607	cfs_rq = cfs_rq_of(se);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4608	enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
				4609
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4610	cfs_rq->h_nr_running += task_delta;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4611	cfs_rq->idle_h_nr_running += idle_task_delta;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4612
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4613	/* end evaluation on encountering a throttled cfs_rq */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4614	if (cfs_rq_throttled(cfs_rq))
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4615	goto unthrottle_throttle;
				4616	}
				4617
					/* second pass: remaining levels only need their counters restored */
				4618	for_each_sched_entity(se) {
				4619	cfs_rq = cfs_rq_of(se);
				4620
				4621	cfs_rq->h_nr_running += task_delta;
				4622	cfs_rq->idle_h_nr_running += idle_task_delta;
				4623
				4624
				4625	/* end evaluation on encountering a throttled cfs_rq */
				4626	if (cfs_rq_throttled(cfs_rq))
				4627	goto unthrottle_throttle;
				4628
				4629	/*
				4630	* One parent has been throttled and cfs_rq removed from the
				4631	* list. Add it back to not break the leaf list.
				4632	*/
				4633	if (throttled_hierarchy(cfs_rq))
				4634	list_add_leaf_cfs_rq(cfs_rq);
				4635	}
				4636
				4637	/* At this point se is NULL and we are at root level */
				4638	add_nr_running(rq, task_delta);
				4639
				4640	unthrottle_throttle:
				4641	/*
				4642	* The cfs_rq_throttled() breaks in the above iteration can result in
				4643	* incomplete leaf list maintenance, resulting in triggering the
				4644	* assertion below.
				4645	*/
				4646	for_each_sched_entity(se) {
				4647	cfs_rq = cfs_rq_of(se);
				4648
				4649	if (list_add_leaf_cfs_rq(cfs_rq))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4650	break;
				4651	}
				4652
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4653	assert_list_leaf_cfs_rq(rq);
				4654
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4655	/* Determine whether we need to wake up potentially idle CPU: */
				4656	if (rq->curr == rq->idle && rq->cfs.nr_running)
				4657	resched_curr(rq);
				4658	}
				4659
					/*
					* Hand out up to @remaining runtime to the cfs_rqs on
					* cfs_b->throttled_cfs_rq.
					*
					* Each throttled cfs_rq is visited under its rq lock and receives just
					* enough to bring runtime_remaining to 1, or whatever is left if that is
					* less; it is unthrottled when the result is positive. The walk stops
					* once nothing is left. Returns the amount actually handed out.
					*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4660	static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4661	{
				4662	struct cfs_rq *cfs_rq;
				4663	u64 runtime;
				4664	u64 starting_runtime = remaining;
				4665
				4666	rcu_read_lock();
				4667	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
				4668	throttled_list) {
				4669	struct rq *rq = rq_of(cfs_rq);
				4670	struct rq_flags rf;
				4671
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4672	rq_lock_irqsave(rq, &rf);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4673	if (!cfs_rq_throttled(cfs_rq))
				4674	goto next;
				4675
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4676	/* By the above check, this should never be true */
				4677	SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
				4678
					/* deficit plus one, so that runtime_remaining ends up at 1 */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4679	runtime = -cfs_rq->runtime_remaining + 1;
				4680	if (runtime > remaining)
				4681	runtime = remaining;
				4682	remaining -= runtime;
				4683
				4684	cfs_rq->runtime_remaining += runtime;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4685
				4686	/* we check whether we're throttled above */
				4687	if (cfs_rq->runtime_remaining > 0)
				4688	unthrottle_cfs_rq(cfs_rq);
				4689
				4690	next:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4691	rq_unlock_irqrestore(rq, &rf);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4692
				4693	if (!remaining)
				4694	break;
				4695	}
				4696	rcu_read_unlock();
				4697
				4698	return starting_runtime - remaining;
				4699	}
				4700
				4701	/*
				4702	* Responsible for refilling a task_group's bandwidth and unthrottling its
				4703	* cfs_rqs as appropriate. If there has been no activity within the last
				4704	* period the timer is deactivated until scheduling resumes; cfs_b->idle is
				4705	* used to track this state.
					*
					* @overrun: added to cfs_b->nr_periods, and to cfs_b->nr_throttled when
					* throttled cfs_rqs exist.
					* @flags: IRQ state used when cfs_b->lock is dropped and retaken around
					* distribute_cfs_runtime().
					*
					* Returns 1 via out_deactivate (unlimited quota, or idle with nothing
					* throttled) and 0 otherwise.
				4706	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4707	static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4708	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4709	u64 runtime;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4710	int throttled;
				4711
				4712	/* no need to continue the timer with no bandwidth constraint */
				4713	if (cfs_b->quota == RUNTIME_INF)
				4714	goto out_deactivate;
				4715
				4716	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
				4717	cfs_b->nr_periods += overrun;
				4718
				4719	/*
				4720	* idle depends on !throttled (for the case of a large deficit), and if
				4721	* we're going inactive then everything else can be deferred
				4722	*/
				4723	if (cfs_b->idle && !throttled)
				4724	goto out_deactivate;
				4725
				4726	__refill_cfs_bandwidth_runtime(cfs_b);
				4727
				4728	if (!throttled) {
				4729	/* mark as potentially idle for the upcoming period */
				4730	cfs_b->idle = 1;
				4731	return 0;
				4732	}
				4733
				4734	/* account preceding periods in which throttling occurred */
				4735	cfs_b->nr_throttled += overrun;
				4736
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4737	/*
				4738	* This check is repeated as we are holding onto the new bandwidth while
				4739	* we unthrottle. This can potentially race with an unthrottled group
				4740	* trying to acquire new bandwidth from the global pool. This can result
				4741	* in us over-using our runtime if it is all used during this loop, but
				4742	* only by limited amounts in that extreme case.
				4743	*/
				4744	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
				4745	runtime = cfs_b->runtime;
				4746	cfs_b->distribute_running = 1;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4747	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4748	/* we can't nest cfs_b->lock while distributing bandwidth */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4749	runtime = distribute_cfs_runtime(cfs_b, runtime);
				4750	raw_spin_lock_irqsave(&cfs_b->lock, flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4751
				4752	cfs_b->distribute_running = 0;
				4753	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
				4754
					/* deduct what was handed out from the pool */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4755	lsub_positive(&cfs_b->runtime, runtime);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4756	}
				4757
				4758	/*
				4759	* While we are ensured activity in the period following an
				4760	* unthrottle, this also covers the case in which the new bandwidth is
				4761	* insufficient to cover the existing bandwidth deficit. (Forcing the
				4762	* timer to remain active while there are any throttled entities.)
				4763	*/
				4764	cfs_b->idle = 0;
				4765
				4766	return 0;
				4767
				4768	out_deactivate:
				4769	return 1;
				4770	}
				4771
				4772	/* a cfs_rq won't donate quota below this amount */
				4773	static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
				4774	/* minimum remaining period time to redistribute slack quota */
				4775	static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
				4776	/* how long we wait to gather additional slack before distributing */
				4777	static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
					/* all three values above are multiples of NSEC_PER_MSEC (1, 2 and 5) */
				4778
				4779	/*
				4780	* Are we near the end of the current quota period?
				4781	*
				4782	* Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
				4783	* hrtimer base being cleared by hrtimer_start. In the case of
				4784	* migrate_hrtimers, base is never cleared, so we are fine.
					*
					* Returns 1 if the period timer's callback is currently running or the
					* timer expires in fewer than @min_expire nanoseconds, 0 otherwise.
				4785	*/
				4786	static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
				4787	{
				4788	struct hrtimer *refresh_timer = &cfs_b->period_timer;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4789	s64 remaining;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4790
				4791	/* if the call-back is running a quota refresh is already occurring */
				4792	if (hrtimer_callback_running(refresh_timer))
				4793	return 1;
				4794
				4795	/* is a quota refresh about to occur? */
				4796	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
					/* signed compare: a negative remaining time also counts as "soon" */
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4797	if (remaining < (s64)min_expire)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4798	return 1;
				4799
				4800	return 0;
				4801	}
				4802
					/*
					* Arm cfs_b->slack_timer to fire cfs_bandwidth_slack_period from now.
					*
					* Nothing is done when a quota refresh is due within the slack period
					* plus min_bandwidth_expiration, or when cfs_b->slack_started is already
					* set; otherwise slack_started is set before the timer is started.
					*/
				4803	static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
				4804	{
				4805	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
				4806
				4807	/* if there's a quota refresh soon don't bother with slack */
				4808	if (runtime_refresh_within(cfs_b, min_left))
				4809	return;
				4810
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4811	/* don't push forwards an existing deferred unthrottle */
				4812	if (cfs_b->slack_started)
				4813	return;
				4814	cfs_b->slack_started = true;
				4815
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4816	hrtimer_start(&cfs_b->slack_timer,
				4817	ns_to_ktime(cfs_bandwidth_slack_period),
				4818	HRTIMER_MODE_REL);
				4819	}
				4820
				4821	/* we know any runtime found here is valid as update_curr() precedes return */
					/*
					* Give back the part of cfs_rq->runtime_remaining that exceeds
					* min_cfs_rq_runtime. With a finite quota that slack is added to
					* cfs_b->runtime, and the slack timer is started when the pool then
					* exceeds one slice while throttled cfs_rqs exist. The slack is removed
					* from runtime_remaining whether or not the pool accepted it.
					*/
				4822	static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				4823	{
				4824	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				4825	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
				4826
				4827	if (slack_runtime <= 0)
				4828	return;
				4829
				4830	raw_spin_lock(&cfs_b->lock);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4831	if (cfs_b->quota != RUNTIME_INF) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4832	cfs_b->runtime += slack_runtime;
				4833
				4834	/* we are under rq->lock, defer unthrottling using a timer */
				4835	if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
				4836	!list_empty(&cfs_b->throttled_cfs_rq))
				4837	start_cfs_slack_bandwidth(cfs_b);
				4838	}
				4839	raw_spin_unlock(&cfs_b->lock);
				4840
				4841	/* even if it's not valid for return we don't want to try again */
				4842	cfs_rq->runtime_remaining -= slack_runtime;
				4843	}
				4844
					/*
					* Return unused runtime of @cfs_rq to its bandwidth pool, but only when
					* bandwidth control is in use, runtime accounting is enabled for this
					* cfs_rq and it has no runnable entities left (nr_running == 0).
					*/
				4845	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				4846	{
				4847	if (!cfs_bandwidth_used())
				4848	return;
				4849
				4850	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
				4851	return;
				4852
				4853	__return_cfs_rq_runtime(cfs_rq);
				4854	}
				4855
				4856	/*
				4857	* This is done with a timer (instead of inline with bandwidth return) since
				4858	* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
					*
					* Clears cfs_b->slack_started, then returns early if a distribution is
					* already running or a refresh is due within min_bandwidth_expiration.
					* Otherwise, when the quota is finite and the pool holds more than one
					* slice, the whole pool is offered to distribute_cfs_runtime() and the
					* amount it reports as used is subtracted from cfs_b->runtime.
				4859	*/
				4860	static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
				4861	{
				4862	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4863	unsigned long flags;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4864
				4865	/* confirm we're still not at a refresh boundary */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4866	raw_spin_lock_irqsave(&cfs_b->lock, flags);
				4867	cfs_b->slack_started = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4868	if (cfs_b->distribute_running) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4869	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4870	return;
				4871	}
				4872
				4873	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4874	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4875	return;
				4876	}
				4877
				4878	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
				4879	runtime = cfs_b->runtime;
				4880
					/* claim the distribution while still holding the lock */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4881	if (runtime)
				4882	cfs_b->distribute_running = 1;
				4883
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4884	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4885
				4886	if (!runtime)
				4887	return;
				4888
					/* called with cfs_b->lock released */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4889	runtime = distribute_cfs_runtime(cfs_b, runtime);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4890
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4891	raw_spin_lock_irqsave(&cfs_b->lock, flags);
				4892	lsub_positive(&cfs_b->runtime, runtime);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4893	cfs_b->distribute_running = 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4894	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4895	}
				4896
				4897	/*
				4898	* When a group wakes up we want to make sure that its quota is not already
				4899	* expired/exceeded, otherwise it may be allowed to steal additional ticks of
				4900	* runtime as update_curr() throttling cannot trigger until it's on-rq.
					*
					* Does nothing when bandwidth control is unused, runtime accounting is
					* disabled for @cfs_rq, an entity is currently running on it, or it is
					* already throttled.
				4901	*/
				4902	static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
				4903	{
				4904	if (!cfs_bandwidth_used())
				4905	return;
				4906
				4907	/* an active group must be handled by the update_curr()->put() path */
				4908	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
				4909	return;
				4910
				4911	/* ensure the group is not already throttled */
				4912	if (cfs_rq_throttled(cfs_rq))
				4913	return;
				4914
				4915	/* update runtime allocation */
					/* (a zero charge still requests more runtime when none is left) */
				4916	account_cfs_rq_runtime(cfs_rq, 0);
				4917	if (cfs_rq->runtime_remaining <= 0)
				4918	throttle_cfs_rq(cfs_rq);
				4919	}
				4920
					/*
					* Bring @tg's cfs_rq on @cpu in line with its parent's throttle state.
					*
					* Copies the parent's throttle_count and stamps throttled_clock_task
					* with the current task clock of that CPU's rq. Does nothing when
					* bandwidth control is unused or @tg has no parent.
					*/
				4921	static void sync_throttle(struct task_group *tg, int cpu)
				4922	{
				4923	struct cfs_rq *pcfs_rq, *cfs_rq;
				4924
				4925	if (!cfs_bandwidth_used())
				4926	return;
				4927
				4928	if (!tg->parent)
				4929	return;
				4930
				4931	cfs_rq = tg->cfs_rq[cpu];
				4932	pcfs_rq = tg->parent->cfs_rq[cpu];
				4933
				4934	cfs_rq->throttle_count = pcfs_rq->throttle_count;
				4935	cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
				4936	}
				4937
				4938	/* conditionally throttle active cfs_rq's from put_prev_entity() */
				4939	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				4940	{
				4941	if (!cfs_bandwidth_used())
				4942	return false;
				4943
				4944	if (likely(!cfs_rq->runtime_enabled \|\| cfs_rq->runtime_remaining > 0))
				4945	return false;
				4946
				4947	/*
				4948	* it's possible for a throttled entity to be forced into a running
				4949	* state (e.g. set_curr_task), in this case we're finished.
				4950	*/
				4951	if (cfs_rq_throttled(cfs_rq))
				4952	return true;
				4953
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4954	return throttle_cfs_rq(cfs_rq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4955	}
				4956
				4957	static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
				4958	{
				4959	struct cfs_bandwidth *cfs_b =
				4960	container_of(timer, struct cfs_bandwidth, slack_timer);
				4961
				4962	do_sched_cfs_slack_timer(cfs_b);
				4963
				4964	return HRTIMER_NORESTART;
				4965	}
				4966
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4967	extern const u64 max_cfs_quota_period;
				4968
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4969	static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
				4970	{
				4971	struct cfs_bandwidth *cfs_b =
				4972	container_of(timer, struct cfs_bandwidth, period_timer);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4973	unsigned long flags;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4974	int overrun;
				4975	int idle = 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4976	int count = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4977
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4978	raw_spin_lock_irqsave(&cfs_b->lock, flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4979	for (;;) {
				4980	overrun = hrtimer_forward_now(timer, cfs_b->period);
				4981	if (!overrun)
				4982	break;
				4983
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4984	idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
				4985
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4986	if (++count > 3) {
				4987	u64 new, old = ktime_to_ns(cfs_b->period);
				4988
				4989	/*
				4990	* Grow period by a factor of 2 to avoid losing precision.
				4991	* Precision loss in the quota/period ratio can cause __cfs_schedulable
				4992	* to fail.
				4993	*/
				4994	new = old * 2;
				4995	if (new < max_cfs_quota_period) {
				4996	cfs_b->period = ns_to_ktime(new);
				4997	cfs_b->quota *= 2;
				4998
				4999	pr_warn_ratelimited(
				5000	"cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
				5001	smp_processor_id(),
				5002	div_u64(new, NSEC_PER_USEC),
				5003	div_u64(cfs_b->quota, NSEC_PER_USEC));
				5004	} else {
				5005	pr_warn_ratelimited(
				5006	"cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
				5007	smp_processor_id(),
				5008	div_u64(old, NSEC_PER_USEC),
				5009	div_u64(cfs_b->quota, NSEC_PER_USEC));
				5010	}
				5011
				5012	/* reset count so we don't come right back in here */
				5013	count = 0;
				5014	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5015	}
				5016	if (idle)
				5017	cfs_b->period_active = 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5018	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5019
				5020	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
				5021	}
				5022
				5023	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				5024	{
				5025	raw_spin_lock_init(&cfs_b->lock);
				5026	cfs_b->runtime = 0;
				5027	cfs_b->quota = RUNTIME_INF;
				5028	cfs_b->period = ns_to_ktime(default_cfs_period());
				5029
				5030	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
				5031	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
				5032	cfs_b->period_timer.function = sched_cfs_period_timer;
				5033	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
				5034	cfs_b->slack_timer.function = sched_cfs_slack_timer;
				5035	cfs_b->distribute_running = 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5036	cfs_b->slack_started = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5037	}
				5038
				5039	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				5040	{
				5041	cfs_rq->runtime_enabled = 0;
				5042	INIT_LIST_HEAD(&cfs_rq->throttled_list);
				5043	}
				5044
				5045	void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				5046	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5047	lockdep_assert_held(&cfs_b->lock);
				5048
				5049	if (cfs_b->period_active)
				5050	return;
				5051
				5052	cfs_b->period_active = 1;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5053	hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5054	hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
				5055	}
				5056
				5057	static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				5058	{
				5059	/* init_cfs_bandwidth() was not called */
				5060	if (!cfs_b->throttled_cfs_rq.next)
				5061	return;
				5062
				5063	hrtimer_cancel(&cfs_b->period_timer);
				5064	hrtimer_cancel(&cfs_b->slack_timer);
				5065	}
				5066
				5067	/*
				5068	* Both these CPU hotplug callbacks race against unregister_fair_sched_group()
				5069	*
				5070	* The race is harmless, since modifying bandwidth settings of unhooked group
				5071	* bits doesn't do much.
				5072	*/
				5073
				5074	/* cpu online calback */
				5075	static void __maybe_unused update_runtime_enabled(struct rq *rq)
				5076	{
				5077	struct task_group *tg;
				5078
				5079	lockdep_assert_held(&rq->lock);
				5080
				5081	rcu_read_lock();
				5082	list_for_each_entry_rcu(tg, &task_groups, list) {
				5083	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
				5084	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				5085
				5086	raw_spin_lock(&cfs_b->lock);
				5087	cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
				5088	raw_spin_unlock(&cfs_b->lock);
				5089	}
				5090	rcu_read_unlock();
				5091	}
				5092
				5093	/* cpu offline callback */
				5094	static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
				5095	{
				5096	struct task_group *tg;
				5097
				5098	lockdep_assert_held(&rq->lock);
				5099
				5100	rcu_read_lock();
				5101	list_for_each_entry_rcu(tg, &task_groups, list) {
				5102	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				5103
				5104	if (!cfs_rq->runtime_enabled)
				5105	continue;
				5106
				5107	/*
				5108	* clock_task is not advancing so we just need to make sure
				5109	* there's some valid quota amount
				5110	*/
				5111	cfs_rq->runtime_remaining = 1;
				5112	/*
				5113	* Offline rq is schedulable till CPU is completely disabled
				5114	* in take_cpu_down(), so we prevent new cfs throttling here.
				5115	*/
				5116	cfs_rq->runtime_enabled = 0;
				5117
				5118	if (cfs_rq_throttled(cfs_rq))
				5119	unthrottle_cfs_rq(cfs_rq);
				5120	}
				5121	rcu_read_unlock();
				5122	}
				5123
				5124	#else /* CONFIG_CFS_BANDWIDTH */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5125
				5126	static inline bool cfs_bandwidth_used(void)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5127	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5128	return false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5129	}
				5130
				5131	static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
				5132	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
				5133	static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
				5134	static inline void sync_throttle(struct task_group *tg, int cpu) {}
				5135	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
				5136
				5137	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				5138	{
				5139	return 0;
				5140	}
				5141
				5142	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				5143	{
				5144	return 0;
				5145	}
				5146
				5147	static inline int throttled_lb_pair(struct task_group *tg,
				5148	int src_cpu, int dest_cpu)
				5149	{
				5150	return 0;
				5151	}
				5152
				5153	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
				5154
				5155	#ifdef CONFIG_FAIR_GROUP_SCHED
				5156	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
				5157	#endif
				5158
				5159	static inline struct cfs_bandwidth tg_cfs_bandwidth(struct task_group tg)
				5160	{
				5161	return NULL;
				5162	}
				5163	static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
				5164	static inline void update_runtime_enabled(struct rq *rq) {}
				5165	static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
				5166
				5167	#endif /* CONFIG_CFS_BANDWIDTH */
				5168
				5169	/**************************************************
				5170	* CFS operations on tasks:
				5171	*/
				5172
				5173	#ifdef CONFIG_SCHED_HRTICK
				5174	static void hrtick_start_fair(struct rq rq, struct task_struct p)
				5175	{
				5176	struct sched_entity *se = &p->se;
				5177	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				5178
				5179	SCHED_WARN_ON(task_rq(p) != rq);
				5180
				5181	if (rq->cfs.h_nr_running > 1) {
				5182	u64 slice = sched_slice(cfs_rq, se);
				5183	u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
				5184	s64 delta = slice - ran;
				5185
				5186	if (delta < 0) {
				5187	if (rq->curr == p)
				5188	resched_curr(rq);
				5189	return;
				5190	}
				5191	hrtick_start(rq, delta);
				5192	}
				5193	}
				5194
				5195	/*
				5196	* called from enqueue/dequeue and updates the hrtick when the
				5197	* current task is from our class and nr_running is low enough
				5198	* to matter.
				5199	*/
				5200	static void hrtick_update(struct rq *rq)
				5201	{
				5202	struct task_struct *curr = rq->curr;
				5203
				5204	if (!hrtick_enabled(rq) \|\| curr->sched_class != &fair_sched_class)
				5205	return;
				5206
				5207	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
				5208	hrtick_start_fair(rq, curr);
				5209	}
				5210	#else /* !CONFIG_SCHED_HRTICK */
				5211	static inline void
				5212	hrtick_start_fair(struct rq rq, struct task_struct p)
				5213	{
				5214	}
				5215
				5216	static inline void hrtick_update(struct rq *rq)
				5217	{
				5218	}
				5219	#endif
				5220
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5221	#ifdef CONFIG_SMP
				5222	static inline unsigned long cpu_util(int cpu);
				5223
				5224	static inline bool cpu_overutilized(int cpu)
				5225	{
				5226	return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
				5227	}
				5228
				5229	static inline void update_overutilized_status(struct rq *rq)
				5230	{
				5231	if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
				5232	WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
				5233	trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
				5234	}
				5235	}
				5236	#else
				5237	static inline void update_overutilized_status(struct rq *rq) { }
				5238	#endif
				5239
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5240	/*
				5241	* The enqueue_task method is called before nr_running is
				5242	* increased. Here we update the fair scheduling stats and
				5243	* then put the task into the rbtree:
				5244	*/
				5245	static void
				5246	enqueue_task_fair(struct rq rq, struct task_struct p, int flags)
				5247	{
				5248	struct cfs_rq *cfs_rq;
				5249	struct sched_entity *se = &p->se;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5250	int idle_h_nr_running = task_has_idle_policy(p);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5251	int task_new = !(flags & ENQUEUE_WAKEUP);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5252
				5253	/*
				5254	* The code below (indirectly) updates schedutil which looks at
				5255	* the cfs_rq utilization to select a frequency.
				5256	* Let's add the task's estimated utilization to the cfs_rq's
				5257	* estimated utilization, before we update schedutil.
				5258	*/
				5259	util_est_enqueue(&rq->cfs, p);
				5260
				5261	/*
				5262	* If in_iowait is set, the code below may not trigger any cpufreq
				5263	* utilization updates, so do it here explicitly with the IOWAIT flag
				5264	* passed.
				5265	*/
				5266	if (p->in_iowait)
				5267	cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
				5268
				5269	for_each_sched_entity(se) {
				5270	if (se->on_rq)
				5271	break;
				5272	cfs_rq = cfs_rq_of(se);
				5273	enqueue_entity(cfs_rq, se, flags);
				5274
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5275	cfs_rq->h_nr_running++;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5276	cfs_rq->idle_h_nr_running += idle_h_nr_running;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5277
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5278	/* end evaluation on encountering a throttled cfs_rq */
				5279	if (cfs_rq_throttled(cfs_rq))
				5280	goto enqueue_throttle;
				5281
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5282	flags = ENQUEUE_WAKEUP;
				5283	}
				5284
				5285	for_each_sched_entity(se) {
				5286	cfs_rq = cfs_rq_of(se);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5287
				5288	update_load_avg(cfs_rq, se, UPDATE_TG);
				5289	update_cfs_group(se);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5290
				5291	cfs_rq->h_nr_running++;
				5292	cfs_rq->idle_h_nr_running += idle_h_nr_running;
				5293
				5294	/* end evaluation on encountering a throttled cfs_rq */
				5295	if (cfs_rq_throttled(cfs_rq))
				5296	goto enqueue_throttle;
				5297
				5298	/*
				5299	* One parent has been throttled and cfs_rq removed from the
				5300	* list. Add it back to not break the leaf list.
				5301	*/
				5302	if (throttled_hierarchy(cfs_rq))
				5303	list_add_leaf_cfs_rq(cfs_rq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5304	}
				5305
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5306	enqueue_throttle:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5307	if (!se) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5308	add_nr_running(rq, 1);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5309	/*
				5310	* Since new tasks are assigned an initial util_avg equal to
				5311	* half of the spare capacity of their CPU, tiny tasks have the
				5312	* ability to cross the overutilized threshold, which will
				5313	* result in the load balancer ruining all the task placement
				5314	* done by EAS. As a way to mitigate that effect, do not account
				5315	* for the first enqueue operation of new tasks during the
				5316	* overutilized flag detection.
				5317	*
				5318	* A better way of solving this problem would be to wait for
				5319	* the PELT signals of tasks to converge before taking them
				5320	* into account, but that is not straightforward to implement,
				5321	* and the following generally works well enough in practice.
				5322	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5323	if (!task_new)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5324	update_overutilized_status(rq);
				5325
				5326	}
				5327
				5328	if (cfs_bandwidth_used()) {
				5329	/*
				5330	* When bandwidth control is enabled; the cfs_rq_throttled()
				5331	* breaks in the above iteration can result in incomplete
				5332	* leaf list maintenance, resulting in triggering the assertion
				5333	* below.
				5334	*/
				5335	for_each_sched_entity(se) {
				5336	cfs_rq = cfs_rq_of(se);
				5337
				5338	if (list_add_leaf_cfs_rq(cfs_rq))
				5339	break;
				5340	}
				5341	}
				5342
				5343	assert_list_leaf_cfs_rq(rq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5344
				5345	hrtick_update(rq);
				5346	}
				5347
				5348	static void set_next_buddy(struct sched_entity *se);
				5349
				5350	/*
				5351	* The dequeue_task method is called before nr_running is
				5352	* decreased. We remove the task from the rbtree and
				5353	* update the fair scheduling stats:
				5354	*/
				5355	static void dequeue_task_fair(struct rq rq, struct task_struct p, int flags)
				5356	{
				5357	struct cfs_rq *cfs_rq;
				5358	struct sched_entity *se = &p->se;
				5359	int task_sleep = flags & DEQUEUE_SLEEP;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5360	int idle_h_nr_running = task_has_idle_policy(p);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5361
				5362	for_each_sched_entity(se) {
				5363	cfs_rq = cfs_rq_of(se);
				5364	dequeue_entity(cfs_rq, se, flags);
				5365
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5366	cfs_rq->h_nr_running--;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5367	cfs_rq->idle_h_nr_running -= idle_h_nr_running;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5368
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5369	/* end evaluation on encountering a throttled cfs_rq */
				5370	if (cfs_rq_throttled(cfs_rq))
				5371	goto dequeue_throttle;
				5372
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5373	/* Don't dequeue parent if it has other entities besides us */
				5374	if (cfs_rq->load.weight) {
				5375	/* Avoid re-evaluating load for this entity: */
				5376	se = parent_entity(se);
				5377	/*
				5378	* Bias pick_next to pick a task from this cfs_rq, as
				5379	* p is sleeping when it is within its sched_slice.
				5380	*/
				5381	if (task_sleep && se && !throttled_hierarchy(cfs_rq))
				5382	set_next_buddy(se);
				5383	break;
				5384	}
				5385	flags \|= DEQUEUE_SLEEP;
				5386	}
				5387
				5388	for_each_sched_entity(se) {
				5389	cfs_rq = cfs_rq_of(se);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5390
				5391	update_load_avg(cfs_rq, se, UPDATE_TG);
				5392	update_cfs_group(se);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5393
				5394	cfs_rq->h_nr_running--;
				5395	cfs_rq->idle_h_nr_running -= idle_h_nr_running;
				5396
				5397	/* end evaluation on encountering a throttled cfs_rq */
				5398	if (cfs_rq_throttled(cfs_rq))
				5399	goto dequeue_throttle;
				5400
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5401	}
				5402
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5403	dequeue_throttle:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5404	if (!se)
				5405	sub_nr_running(rq, 1);
				5406
				5407	util_est_dequeue(&rq->cfs, p, task_sleep);
				5408	hrtick_update(rq);
				5409	}
				5410
				5411	#ifdef CONFIG_SMP
				5412
				5413	/* Working cpumask for: load_balance, load_balance_newidle. */
				5414	DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
				5415	DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
				5416
				5417	#ifdef CONFIG_NO_HZ_COMMON
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5418
/* File-local bookkeeping that exists only under CONFIG_NO_HZ_COMMON. */
static struct {
	cpumask_var_t idle_cpus_mask;
	atomic_t nr_cpus;
	int has_blocked;		/* Idle CPUs have blocked load */
	unsigned long next_balance;	/* in jiffy units */
	unsigned long next_blocked;	/* Next update of blocked load in jiffies */
} nohz ____cacheline_aligned;
				5426
				5427	#endif /* CONFIG_NO_HZ_COMMON */
				5428
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5429	/* CPU only has SCHED_IDLE tasks enqueued */
				5430	static int sched_idle_cpu(int cpu)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5431	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5432	struct rq *rq = cpu_rq(cpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5433
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5434	return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
				5435	rq->nr_running);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5436	}
				5437
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5438	static unsigned long cpu_runnable_load(struct rq *rq)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5439	{
				5440	return cfs_rq_runnable_load_avg(&rq->cfs);
				5441	}
				5442
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5443	static unsigned long capacity_of(int cpu)
				5444	{
				5445	return cpu_rq(cpu)->cpu_capacity;
				5446	}
				5447
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5448	static unsigned long cpu_avg_load_per_task(int cpu)
				5449	{
				5450	struct rq *rq = cpu_rq(cpu);
				5451	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5452	unsigned long load_avg = cpu_runnable_load(rq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5453
				5454	if (nr_running)
				5455	return load_avg / nr_running;
				5456
				5457	return 0;
				5458	}
				5459
				5460	static void record_wakee(struct task_struct *p)
				5461	{
				5462	/*
				5463	* Only decay a single time; tasks that have less then 1 wakeup per
				5464	* jiffy will not have built up many flips.
				5465	*/
				5466	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
				5467	current->wakee_flips >>= 1;
				5468	current->wakee_flip_decay_ts = jiffies;
				5469	}
				5470
				5471	if (current->last_wakee != p) {
				5472	current->last_wakee = p;
				5473	current->wakee_flips++;
				5474	}
				5475	}
				5476
				5477	/*
				5478	* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
				5479	*
				5480	* A waker of many should wake a different task than the one last awakened
				5481	* at a frequency roughly N times higher than one of its wakees.
				5482	*
				5483	* In order to determine whether we should let the load spread vs consolidating
				5484	* to shared cache, we look for a minimum 'flip' frequency of llc_size in one
				5485	* partner, and a factor of lls_size higher frequency in the other.
				5486	*
				5487	* With both conditions met, we can be relatively sure that the relationship is
				5488	* non-monogamous, with partner count exceeding socket size.
				5489	*
				5490	* Waker/wakee being client/server, worker/dispatcher, interrupt source or
				5491	* whatever is irrelevant, spread criteria is apparent partner count exceeds
				5492	* socket size.
				5493	*/
				5494	static int wake_wide(struct task_struct *p)
				5495	{
				5496	unsigned int master = current->wakee_flips;
				5497	unsigned int slave = p->wakee_flips;
				5498	int factor = this_cpu_read(sd_llc_size);
				5499
				5500	if (master < slave)
				5501	swap(master, slave);
				5502	if (slave < factor \|\| master < slave * factor)
				5503	return 0;
				5504	return 1;
				5505	}
				5506
				5507	/*
				5508	* The purpose of wake_affine() is to quickly determine on which CPU we can run
				5509	* soonest. For the purpose of speed we only consider the waking and previous
				5510	* CPU.
				5511	*
				5512	* wake_affine_idle() - only considers 'now', it check if the waking CPU is
				5513	* cache-affine and is (or will be) idle.
				5514	*
				5515	* wake_affine_weight() - considers the weight to reflect the average
				5516	* scheduling latency of the CPUs. This seems to work
				5517	* for the overloaded case.
				5518	*/
				5519	static int
				5520	wake_affine_idle(int this_cpu, int prev_cpu, int sync)
				5521	{
				5522	/*
				5523	* If this_cpu is idle, it implies the wakeup is from interrupt
				5524	* context. Only allow the move if cache is shared. Otherwise an
				5525	* interrupt intensive workload could force all tasks onto one
				5526	* node depending on the IO topology or IRQ affinity settings.
				5527	*
				5528	* If the prev_cpu is idle and cache affine then avoid a migration.
				5529	* There is no guarantee that the cache hot data from an interrupt
				5530	* is more important than cache hot data on the prev_cpu and from
				5531	* a cpufreq perspective, it's better to have higher utilisation
				5532	* on one CPU.
				5533	*/
				5534	if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
				5535	return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
				5536
				5537	if (sync && cpu_rq(this_cpu)->nr_running == 1)
				5538	return this_cpu;
				5539
				5540	return nr_cpumask_bits;
				5541	}
				5542
				5543	static int
				5544	wake_affine_weight(struct sched_domain sd, struct task_struct p,
				5545	int this_cpu, int prev_cpu, int sync)
				5546	{
				5547	s64 this_eff_load, prev_eff_load;
				5548	unsigned long task_load;
				5549
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5550	this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5551
				5552	if (sync) {
				5553	unsigned long current_load = task_h_load(current);
				5554
				5555	if (current_load > this_eff_load)
				5556	return this_cpu;
				5557
				5558	this_eff_load -= current_load;
				5559	}
				5560
				5561	task_load = task_h_load(p);
				5562
				5563	this_eff_load += task_load;
				5564	if (sched_feat(WA_BIAS))
				5565	this_eff_load *= 100;
				5566	this_eff_load *= capacity_of(prev_cpu);
				5567
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5568	prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5569	prev_eff_load -= task_load;
				5570	if (sched_feat(WA_BIAS))
				5571	prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
				5572	prev_eff_load *= capacity_of(this_cpu);
				5573
				5574	/*
				5575	* If sync, adjust the weight of prev_eff_load such that if
				5576	* prev_eff == this_eff that select_idle_sibling() will consider
				5577	* stacking the wakee on top of the waker if no other CPU is
				5578	* idle.
				5579	*/
				5580	if (sync)
				5581	prev_eff_load += 1;
				5582
				5583	return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
				5584	}
				5585
				5586	static int wake_affine(struct sched_domain sd, struct task_struct p,
				5587	int this_cpu, int prev_cpu, int sync)
				5588	{
				5589	int target = nr_cpumask_bits;
				5590
				5591	if (sched_feat(WA_IDLE))
				5592	target = wake_affine_idle(this_cpu, prev_cpu, sync);
				5593
				5594	if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
				5595	target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
				5596
				5597	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
				5598	if (target == nr_cpumask_bits)
				5599	return prev_cpu;
				5600
				5601	schedstat_inc(sd->ttwu_move_affine);
				5602	schedstat_inc(p->se.statistics.nr_wakeups_affine);
				5603	return target;
				5604	}
				5605
				5606	static unsigned long cpu_util_without(int cpu, struct task_struct *p);
				5607
				5608	static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
				5609	{
				5610	return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
				5611	}
				5612
				5613	/*
				5614	* find_idlest_group finds and returns the least busy CPU group within the
				5615	* domain.
				5616	*
				5617	* Assumes p is allowed on at least one CPU in sd.
				5618	*/
				5619	static struct sched_group *
				5620	find_idlest_group(struct sched_domain sd, struct task_struct p,
				5621	int this_cpu, int sd_flag)
				5622	{
				5623	struct sched_group idlest = NULL, group = sd->groups;
				5624	struct sched_group *most_spare_sg = NULL;
				5625	unsigned long min_runnable_load = ULONG_MAX;
				5626	unsigned long this_runnable_load = ULONG_MAX;
				5627	unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
				5628	unsigned long most_spare = 0, this_spare = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5629	int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
				5630	unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
				5631	(sd->imbalance_pct-100) / 100;
				5632
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5633	do {
				5634	unsigned long load, avg_load, runnable_load;
				5635	unsigned long spare_cap, max_spare_cap;
				5636	int local_group;
				5637	int i;
				5638
				5639	/* Skip over this group if it has no CPUs allowed */
				5640	if (!cpumask_intersects(sched_group_span(group),
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5641	p->cpus_ptr))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5642	continue;
				5643
				5644	local_group = cpumask_test_cpu(this_cpu,
				5645	sched_group_span(group));
				5646
				5647	/*
				5648	* Tally up the load of all CPUs in the group and find
				5649	* the group containing the CPU with most spare capacity.
				5650	*/
				5651	avg_load = 0;
				5652	runnable_load = 0;
				5653	max_spare_cap = 0;
				5654
				5655	for_each_cpu(i, sched_group_span(group)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5656	load = cpu_runnable_load(cpu_rq(i));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5657	runnable_load += load;
				5658
				5659	avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
				5660
				5661	spare_cap = capacity_spare_without(i, p);
				5662
				5663	if (spare_cap > max_spare_cap)
				5664	max_spare_cap = spare_cap;
				5665	}
				5666
				5667	/* Adjust by relative CPU capacity of the group */
				5668	avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
				5669	group->sgc->capacity;
				5670	runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
				5671	group->sgc->capacity;
				5672
				5673	if (local_group) {
				5674	this_runnable_load = runnable_load;
				5675	this_avg_load = avg_load;
				5676	this_spare = max_spare_cap;
				5677	} else {
				5678	if (min_runnable_load > (runnable_load + imbalance)) {
				5679	/*
				5680	* The runnable load is significantly smaller
				5681	* so we can pick this new CPU:
				5682	*/
				5683	min_runnable_load = runnable_load;
				5684	min_avg_load = avg_load;
				5685	idlest = group;
				5686	} else if ((runnable_load < (min_runnable_load + imbalance)) &&
				5687	(100min_avg_load > imbalance_scaleavg_load)) {
				5688	/*
				5689	* The runnable loads are close so take the
				5690	* blocked load into account through avg_load:
				5691	*/
				5692	min_avg_load = avg_load;
				5693	idlest = group;
				5694	}
				5695
				5696	if (most_spare < max_spare_cap) {
				5697	most_spare = max_spare_cap;
				5698	most_spare_sg = group;
				5699	}
				5700	}
				5701	} while (group = group->next, group != sd->groups);
				5702
				5703	/*
				5704	* The cross-over point between using spare capacity or least load
				5705	* is too conservative for high utilization tasks on partially
				5706	* utilized systems if we require spare_capacity > task_util(p),
				5707	* so we allow for some task stuffing by using
				5708	* spare_capacity > task_util(p)/2.
				5709	*
				5710	* Spare capacity can't be used for fork because the utilization has
				5711	* not been set yet, we must first select a rq to compute the initial
				5712	* utilization.
				5713	*/
				5714	if (sd_flag & SD_BALANCE_FORK)
				5715	goto skip_spare;
				5716
				5717	if (this_spare > task_util(p) / 2 &&
				5718	imbalance_scalethis_spare > 100most_spare)
				5719	return NULL;
				5720
				5721	if (most_spare > task_util(p) / 2)
				5722	return most_spare_sg;
				5723
				5724	skip_spare:
				5725	if (!idlest)
				5726	return NULL;
				5727
				5728	/*
				5729	* When comparing groups across NUMA domains, it's possible for the
				5730	* local domain to be very lightly loaded relative to the remote
				5731	* domains but "imbalance" skews the comparison making remote CPUs
				5732	* look much more favourable. When considering cross-domain, add
				5733	* imbalance to the runnable load on the remote node and consider
				5734	* staying local.
				5735	*/
				5736	if ((sd->flags & SD_NUMA) &&
				5737	min_runnable_load + imbalance >= this_runnable_load)
				5738	return NULL;
				5739
				5740	if (min_runnable_load > (this_runnable_load + imbalance))
				5741	return NULL;
				5742
				5743	if ((this_runnable_load < (min_runnable_load + imbalance)) &&
				5744	(100this_avg_load < imbalance_scalemin_avg_load))
				5745	return NULL;
				5746
				5747	return idlest;
				5748	}
				5749
				5750	/*
				5751	* find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
				5752	*/
				5753	static int
				5754	find_idlest_group_cpu(struct sched_group group, struct task_struct p, int this_cpu)
				5755	{
				5756	unsigned long load, min_load = ULONG_MAX;
				5757	unsigned int min_exit_latency = UINT_MAX;
				5758	u64 latest_idle_timestamp = 0;
				5759	int least_loaded_cpu = this_cpu;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5760	int shallowest_idle_cpu = -1, si_cpu = -1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5761	int i;
				5762
				5763	/* Check if we have any choice: */
				5764	if (group->group_weight == 1)
				5765	return cpumask_first(sched_group_span(group));
				5766
				5767	/* Traverse only the allowed CPUs */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5768	for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5769	if (available_idle_cpu(i)) {
				5770	struct rq *rq = cpu_rq(i);
				5771	struct cpuidle_state *idle = idle_get_state(rq);
				5772	if (idle && idle->exit_latency < min_exit_latency) {
				5773	/*
				5774	* We give priority to a CPU whose idle state
				5775	* has the smallest exit latency irrespective
				5776	* of any idle timestamp.
				5777	*/
				5778	min_exit_latency = idle->exit_latency;
				5779	latest_idle_timestamp = rq->idle_stamp;
				5780	shallowest_idle_cpu = i;
				5781	} else if ((!idle \|\| idle->exit_latency == min_exit_latency) &&
				5782	rq->idle_stamp > latest_idle_timestamp) {
				5783	/*
				5784	* If equal or no active idle state, then
				5785	* the most recently idled CPU might have
				5786	* a warmer cache.
				5787	*/
				5788	latest_idle_timestamp = rq->idle_stamp;
				5789	shallowest_idle_cpu = i;
				5790	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5791	} else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
				5792	if (sched_idle_cpu(i)) {
				5793	si_cpu = i;
				5794	continue;
				5795	}
				5796
				5797	load = cpu_runnable_load(cpu_rq(i));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5798	if (load < min_load) {
				5799	min_load = load;
				5800	least_loaded_cpu = i;
				5801	}
				5802	}
				5803	}
				5804
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5805	if (shallowest_idle_cpu != -1)
				5806	return shallowest_idle_cpu;
				5807	if (si_cpu != -1)
				5808	return si_cpu;
				5809	return least_loaded_cpu;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5810	}
				5811
				5812	static inline int find_idlest_cpu(struct sched_domain sd, struct task_struct p,
				5813	int cpu, int prev_cpu, int sd_flag)
				5814	{
				5815	int new_cpu = cpu;
				5816
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5817	if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5818	return prev_cpu;
				5819
				5820	/*
				5821	* We need task's util for capacity_spare_without, sync it up to
				5822	* prev_cpu's last_update_time.
				5823	*/
				5824	if (!(sd_flag & SD_BALANCE_FORK))
				5825	sync_entity_load_avg(&p->se);
				5826
				5827	while (sd) {
				5828	struct sched_group *group;
				5829	struct sched_domain *tmp;
				5830	int weight;
				5831
				5832	if (!(sd->flags & sd_flag)) {
				5833	sd = sd->child;
				5834	continue;
				5835	}
				5836
				5837	group = find_idlest_group(sd, p, cpu, sd_flag);
				5838	if (!group) {
				5839	sd = sd->child;
				5840	continue;
				5841	}
				5842
				5843	new_cpu = find_idlest_group_cpu(group, p, cpu);
				5844	if (new_cpu == cpu) {
				5845	/* Now try balancing at a lower domain level of 'cpu': */
				5846	sd = sd->child;
				5847	continue;
				5848	}
				5849
				5850	/* Now try balancing at a lower domain level of 'new_cpu': */
				5851	cpu = new_cpu;
				5852	weight = sd->span_weight;
				5853	sd = NULL;
				5854	for_each_domain(cpu, tmp) {
				5855	if (weight <= tmp->span_weight)
				5856	break;
				5857	if (tmp->flags & sd_flag)
				5858	sd = tmp;
				5859	}
				5860	}
				5861
				5862	return new_cpu;
				5863	}
				5864
				5865	#ifdef CONFIG_SCHED_SMT
/*
 * Static key, initially false, tested with static_branch_likely() by
 * select_idle_core() and select_idle_smt() before they scan SMT siblings.
 * NOTE(review): presumably flipped elsewhere once SMT siblings are
 * detected - confirm against the code that enables it.
 */
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
EXPORT_SYMBOL_GPL(sched_smt_present);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5868
				5869	static inline void set_idle_cores(int cpu, int val)
				5870	{
				5871	struct sched_domain_shared *sds;
				5872
				5873	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
				5874	if (sds)
				5875	WRITE_ONCE(sds->has_idle_cores, val);
				5876	}
				5877
				5878	static inline bool test_idle_cores(int cpu, bool def)
				5879	{
				5880	struct sched_domain_shared *sds;
				5881
				5882	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
				5883	if (sds)
				5884	return READ_ONCE(sds->has_idle_cores);
				5885
				5886	return def;
				5887	}
				5888
				5889	/*
				5890	* Scans the local SMT mask to see if the entire core is idle, and records this
				5891	* information in sd_llc_shared->has_idle_cores.
				5892	*
				5893	* Since SMT siblings share all cache levels, inspecting this limited remote
				5894	* state should be fairly cheap.
				5895	*/
				5896	void __update_idle_core(struct rq *rq)
				5897	{
				5898	int core = cpu_of(rq);
				5899	int cpu;
				5900
				5901	rcu_read_lock();
				5902	if (test_idle_cores(core, true))
				5903	goto unlock;
				5904
				5905	for_each_cpu(cpu, cpu_smt_mask(core)) {
				5906	if (cpu == core)
				5907	continue;
				5908
				5909	if (!available_idle_cpu(cpu))
				5910	goto unlock;
				5911	}
				5912
				5913	set_idle_cores(core, 1);
				5914	unlock:
				5915	rcu_read_unlock();
				5916	}
				5917
				5918	/*
				5919	* Scan the entire LLC domain for idle cores; this dynamically switches off if
				5920	* there are no idle cores left in the system; tracked through
				5921	* sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
				5922	*/
				5923	static int select_idle_core(struct task_struct p, struct sched_domain sd, int target)
				5924	{
				5925	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
				5926	int core, cpu;
				5927
				5928	if (!static_branch_likely(&sched_smt_present))
				5929	return -1;
				5930
				5931	if (!test_idle_cores(target, false))
				5932	return -1;
				5933
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5934	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5935
				5936	for_each_cpu_wrap(core, cpus, target) {
				5937	bool idle = true;
				5938
				5939	for_each_cpu(cpu, cpu_smt_mask(core)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5940	__cpumask_clear_cpu(cpu, cpus);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5941	if (!available_idle_cpu(cpu))
				5942	idle = false;
				5943	}
				5944
				5945	if (idle)
				5946	return core;
				5947	}
				5948
				5949	/*
				5950	* Failed to find an idle core; stop looking for one.
				5951	*/
				5952	set_idle_cores(target, 0);
				5953
				5954	return -1;
				5955	}
				5956
				5957	/*
				5958	* Scan the local SMT mask for idle CPUs.
				5959	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5960	static int select_idle_smt(struct task_struct p, struct sched_domain sd, int target)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5961	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5962	int cpu, si_cpu = -1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5963
				5964	if (!static_branch_likely(&sched_smt_present))
				5965	return -1;
				5966
				5967	for_each_cpu(cpu, cpu_smt_mask(target)) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5968	if (!cpumask_test_cpu(cpu, p->cpus_ptr) \|\|
				5969	!cpumask_test_cpu(cpu, sched_domain_span(sd)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5970	continue;
				5971	if (available_idle_cpu(cpu))
				5972	return cpu;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5973	if (si_cpu == -1 && sched_idle_cpu(cpu))
				5974	si_cpu = cpu;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5975	}
				5976
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5977	return si_cpu;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5978	}
				5979
				5980	#else /* CONFIG_SCHED_SMT */
				5981
				5982	static inline int select_idle_core(struct task_struct p, struct sched_domain sd, int target)
				5983	{
				5984	return -1;
				5985	}
				5986
/* !CONFIG_SCHED_SMT stub: there are no SMT siblings to scan, always report -1. */
static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
	return -1;
}
				5991
				5992	#endif /* CONFIG_SCHED_SMT */
				5993
				5994	/*
				5995	* Scan the LLC domain for idle CPUs; this is dynamically regulated by
				5996	* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
				5997	* average idle time for this rq (as found in rq->avg_idle).
				5998	*/
				5999	static int select_idle_cpu(struct task_struct p, struct sched_domain sd, int target)
				6000	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6001	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6002	struct sched_domain *this_sd;
				6003	u64 avg_cost, avg_idle;
				6004	u64 time, cost;
				6005	s64 delta;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6006	int this = smp_processor_id();
				6007	int cpu, nr = INT_MAX, si_cpu = -1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6008
				6009	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
				6010	if (!this_sd)
				6011	return -1;
				6012
				6013	/*
				6014	* Due to large variance we need a large fuzz factor; hackbench in
				6015	* particularly is sensitive here.
				6016	*/
				6017	avg_idle = this_rq()->avg_idle / 512;
				6018	avg_cost = this_sd->avg_scan_cost + 1;
				6019
				6020	if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
				6021	return -1;
				6022
				6023	if (sched_feat(SIS_PROP)) {
				6024	u64 span_avg = sd->span_weight * avg_idle;
				6025	if (span_avg > 4*avg_cost)
				6026	nr = div_u64(span_avg, avg_cost);
				6027	else
				6028	nr = 4;
				6029	}
				6030
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6031	time = cpu_clock(this);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6032
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6033	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
				6034
				6035	for_each_cpu_wrap(cpu, cpus, target) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6036	if (!--nr)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6037	return si_cpu;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6038	if (available_idle_cpu(cpu))
				6039	break;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6040	if (si_cpu == -1 && sched_idle_cpu(cpu))
				6041	si_cpu = cpu;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6042	}
				6043
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6044	time = cpu_clock(this) - time;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6045	cost = this_sd->avg_scan_cost;
				6046	delta = (s64)(time - cost) / 8;
				6047	this_sd->avg_scan_cost += delta;
				6048
				6049	return cpu;
				6050	}
				6051
				6052	/*
				6053	* Try and locate an idle core/thread in the LLC cache domain.
				6054	*/
				6055	static int select_idle_sibling(struct task_struct *p, int prev, int target)
				6056	{
				6057	struct sched_domain *sd;
				6058	int i, recent_used_cpu;
				6059
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6060	if (available_idle_cpu(target) \|\| sched_idle_cpu(target))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6061	return target;
				6062
				6063	/*
				6064	* If the previous CPU is cache affine and idle, don't be stupid:
				6065	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6066	if (prev != target && cpus_share_cache(prev, target) &&
				6067	(available_idle_cpu(prev) \|\| sched_idle_cpu(prev)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6068	return prev;
				6069
				6070	/* Check a recently used CPU as a potential idle candidate: */
				6071	recent_used_cpu = p->recent_used_cpu;
				6072	if (recent_used_cpu != prev &&
				6073	recent_used_cpu != target &&
				6074	cpus_share_cache(recent_used_cpu, target) &&
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6075	(available_idle_cpu(recent_used_cpu) \|\| sched_idle_cpu(recent_used_cpu)) &&
				6076	cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6077	/*
				6078	* Replace recent_used_cpu with prev as it is a potential
				6079	* candidate for the next wake:
				6080	*/
				6081	p->recent_used_cpu = prev;
				6082	return recent_used_cpu;
				6083	}
				6084
				6085	sd = rcu_dereference(per_cpu(sd_llc, target));
				6086	if (!sd)
				6087	return target;
				6088
				6089	i = select_idle_core(p, sd, target);
				6090	if ((unsigned)i < nr_cpumask_bits)
				6091	return i;
				6092
				6093	i = select_idle_cpu(p, sd, target);
				6094	if ((unsigned)i < nr_cpumask_bits)
				6095	return i;
				6096
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6097	i = select_idle_smt(p, sd, target);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6098	if ((unsigned)i < nr_cpumask_bits)
				6099	return i;
				6100
				6101	return target;
				6102	}
				6103
				6104	/**
				6105	* Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
				6106	* @cpu: the CPU to get the utilization of
				6107	*
				6108	* The unit of the return value must be the one of capacity so we can compare
				6109	* the utilization with the capacity of the CPU that is available for CFS task
				6110	* (ie cpu_capacity).
				6111	*
				6112	* cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
				6113	* recent utilization of currently non-runnable tasks on a CPU. It represents
				6114	* the amount of utilization of a CPU in the range [0..capacity_orig] where
				6115	* capacity_orig is the cpu_capacity available at the highest frequency
				6116	* (arch_scale_freq_capacity()).
				6117	* The utilization of a CPU converges towards a sum equal to or less than the
				6118	* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
				6119	* the running time on this CPU scaled by capacity_curr.
				6120	*
				6121	* The estimated utilization of a CPU is defined to be the maximum between its
				6122	* cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
				6123	* currently RUNNABLE on that CPU.
				6124	* This allows to properly represent the expected utilization of a CPU which
				6125	* has just got a big task running since a long sleep period. At the same time
				6126	* however it preserves the benefits of the "blocked utilization" in
				6127	* describing the potential for other tasks waking up on the same CPU.
				6128	*
				6129	* Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
				6130	* higher than capacity_orig because of unfortunate rounding in
				6131	* cfs.avg.util_avg or just after migrating tasks and new task wakeups until
				6132	* the average stabilizes with the new running time. We need to check that the
				6133	* utilization stays within the range of [0..capacity_orig] and cap it if
				6134	* necessary. Without utilization capping, a group could be seen as overloaded
				6135	* (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
				6136	* available capacity. We allow utilization to overshoot capacity_curr (but not
				6137	* capacity_orig) as it useful for predicting the capacity required after task
				6138	* migrations (scheduler-driven DVFS).
				6139	*
				6140	* Return: the (estimated) utilization for the specified CPU
				6141	*/
				6142	static inline unsigned long cpu_util(int cpu)
				6143	{
				6144	struct cfs_rq *cfs_rq;
				6145	unsigned int util;
				6146
				6147	cfs_rq = &cpu_rq(cpu)->cfs;
				6148	util = READ_ONCE(cfs_rq->avg.util_avg);
				6149
				6150	if (sched_feat(UTIL_EST))
				6151	util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
				6152
				6153	return min_t(unsigned long, util, capacity_orig_of(cpu));
				6154	}
				6155
				6156	/*
				6157	* cpu_util_without: compute cpu utilization without any contributions from *p
				6158	* @cpu: the CPU which utilization is requested
				6159	* @p: the task which utilization should be discounted
				6160	*
				6161	* The utilization of a CPU is defined by the utilization of tasks currently
				6162	* enqueued on that CPU as well as tasks which are currently sleeping after an
				6163	* execution on that CPU.
				6164	*
				6165	* This method returns the utilization of the specified CPU by discounting the
				6166	* utilization of the specified task, whenever the task is currently
				6167	* contributing to the CPU utilization.
				6168	*/
				6169	static unsigned long cpu_util_without(int cpu, struct task_struct *p)
				6170	{
				6171	struct cfs_rq *cfs_rq;
				6172	unsigned int util;
				6173
				6174	/* Task has no contribution or is new */
				6175	if (cpu != task_cpu(p) \|\| !READ_ONCE(p->se.avg.last_update_time))
				6176	return cpu_util(cpu);
				6177
				6178	cfs_rq = &cpu_rq(cpu)->cfs;
				6179	util = READ_ONCE(cfs_rq->avg.util_avg);
				6180
				6181	/* Discount task's util from CPU's util */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6182	lsub_positive(&util, task_util(p));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6183
				6184	/*
				6185	* Covered cases:
				6186	*
				6187	* a) if *p is the only task sleeping on this CPU, then:
				6188	* cpu_util (== task_util) > util_est (== 0)
				6189	* and thus we return:
				6190	* cpu_util_without = (cpu_util - task_util) = 0
				6191	*
				6192	* b) if other tasks are SLEEPING on this CPU, which is now exiting
				6193	* IDLE, then:
				6194	* cpu_util >= task_util
				6195	* cpu_util > util_est (== 0)
				6196	* and thus we discount *p's blocked utilization to return:
				6197	* cpu_util_without = (cpu_util - task_util) >= 0
				6198	*
				6199	* c) if other tasks are RUNNABLE on that CPU and
				6200	* util_est > cpu_util
				6201	* then we use util_est since it returns a more restrictive
				6202	* estimation of the spare capacity on that CPU, by just
				6203	* considering the expected utilization of tasks already
				6204	* runnable on that CPU.
				6205	*
				6206	* Cases a) and b) are covered by the above code, while case c) is
				6207	* covered by the following code when estimated utilization is
				6208	* enabled.
				6209	*/
				6210	if (sched_feat(UTIL_EST)) {
				6211	unsigned int estimated =
				6212	READ_ONCE(cfs_rq->avg.util_est.enqueued);
				6213
				6214	/*
				6215	* Despite the following checks we still have a small window
				6216	* for a possible race, when an execl's select_task_rq_fair()
				6217	* races with LB's detach_task():
				6218	*
				6219	* detach_task()
				6220	* p->on_rq = TASK_ON_RQ_MIGRATING;
				6221	* ---------------------------------- A
				6222	* deactivate_task() \
				6223	* dequeue_task() + RaceTime
				6224	* util_est_dequeue() /
				6225	* ---------------------------------- B
				6226	*
				6227	* The additional check on "current == p" it's required to
				6228	* properly fix the execl regression and it helps in further
				6229	* reducing the chances for the above race.
				6230	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6231	if (unlikely(task_on_rq_queued(p) \|\| current == p))
				6232	lsub_positive(&estimated, _task_util_est(p));
				6233
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6234	util = max(util, estimated);
				6235	}
				6236
				6237	/*
				6238	* Utilization (estimated) can exceed the CPU capacity, thus let's
				6239	* clamp to the maximum CPU capacity to ensure consistency with
				6240	* the cpu_util call.
				6241	*/
				6242	return min_t(unsigned long, util, capacity_orig_of(cpu));
				6243	}
				6244
				6245	/*
				6246	* Disable WAKE_AFFINE in the case where task @p doesn't fit in the
				6247	* capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
				6248	*
				6249	* In that case WAKE_AFFINE doesn't make sense and we'll let
				6250	* BALANCE_WAKE sort things out.
				6251	*/
				6252	static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
				6253	{
				6254	long min_cap, max_cap;
				6255
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6256	if (!static_branch_unlikely(&sched_asym_cpucapacity))
				6257	return 0;
				6258
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6259	min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
				6260	max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
				6261
				6262	/* Minimum capacity is close to max, no need to abort wake_affine */
				6263	if (max_cap - min_cap < max_cap >> 3)
				6264	return 0;
				6265
				6266	/* Bring task utilization in sync with prev_cpu */
				6267	sync_entity_load_avg(&p->se);
				6268
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6269	return !task_fits_capacity(p, min_cap);
				6270	}
				6271
				6272	/*
				6273	* Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
				6274	* to @dst_cpu.
				6275	*/
				6276	static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
				6277	{
				6278	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
				6279	unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
				6280
				6281	/*
				6282	* If @p migrates from @cpu to another, remove its contribution. Or,
				6283	* if @p migrates from another CPU to @cpu, add its contribution. In
				6284	* the other cases, @cpu is not impacted by the migration, so the
				6285	* util_avg should already be correct.
				6286	*/
				6287	if (task_cpu(p) == cpu && dst_cpu != cpu)
				6288	sub_positive(&util, task_util(p));
				6289	else if (task_cpu(p) != cpu && dst_cpu == cpu)
				6290	util += task_util(p);
				6291
				6292	if (sched_feat(UTIL_EST)) {
				6293	util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
				6294
				6295	/*
				6296	* During wake-up, the task isn't enqueued yet and doesn't
				6297	* appear in the cfs_rq->avg.util_est.enqueued of any rq,
				6298	* so just add it (if needed) to "simulate" what will be
				6299	* cpu_util() after the task has been enqueued.
				6300	*/
				6301	if (dst_cpu == cpu)
				6302	util_est += _task_util_est(p);
				6303
				6304	util = max(util, util_est);
				6305	}
				6306
				6307	return min(util, capacity_orig_of(cpu));
				6308	}
				6309
				6310	/*
				6311	* compute_energy(): Estimates the energy that @pd would consume if @p was
				6312	* migrated to @dst_cpu. compute_energy() predicts what will be the utilization
				6313	* landscape of @pd's CPUs after the task migration, and uses the Energy Model
				6314	* to compute what would be the energy if we decided to actually migrate that
				6315	* task.
				6316	*/
				6317	static long
				6318	compute_energy(struct task_struct p, int dst_cpu, struct perf_domain pd)
				6319	{
				6320	struct cpumask *pd_mask = perf_domain_span(pd);
				6321	unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
				6322	unsigned long max_util = 0, sum_util = 0;
				6323	int cpu;
				6324
				6325	/*
				6326	* The capacity state of CPUs of the current rd can be driven by CPUs
				6327	* of another rd if they belong to the same pd. So, account for the
				6328	* utilization of these CPUs too by masking pd with cpu_online_mask
				6329	* instead of the rd span.
				6330	*
				6331	* If an entire pd is outside of the current rd, it will not appear in
				6332	* its pd list and will not be accounted by compute_energy().
				6333	*/
				6334	for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
				6335	unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
				6336	struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
				6337
				6338	/*
				6339	* Busy time computation: utilization clamping is not
				6340	* required since the ratio (sum_util / cpu_capacity)
				6341	* is already enough to scale the EM reported power
				6342	* consumption at the (eventually clamped) cpu_capacity.
				6343	*/
				6344	sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
				6345	ENERGY_UTIL, NULL);
				6346
				6347	/*
				6348	* Performance domain frequency: utilization clamping
				6349	* must be considered since it affects the selection
				6350	* of the performance domain frequency.
				6351	* NOTE: in case RT tasks are running, by default the
				6352	* FREQUENCY_UTIL's utilization can be max OPP.
				6353	*/
				6354	cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
				6355	FREQUENCY_UTIL, tsk);
				6356	max_util = max(max_util, cpu_util);
				6357	}
				6358
				6359	return em_pd_energy(pd->em_pd, max_util, sum_util);
				6360	}
				6361
				6362	/*
				6363	* find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
				6364	* waking task. find_energy_efficient_cpu() looks for the CPU with maximum
				6365	* spare capacity in each performance domain and uses it as a potential
				6366	* candidate to execute the task. Then, it uses the Energy Model to figure
				6367	* out which of the CPU candidates is the most energy-efficient.
				6368	*
				6369	* The rationale for this heuristic is as follows. In a performance domain,
				6370	* all the most energy efficient CPU candidates (according to the Energy
				6371	* Model) are those for which we'll request a low frequency. When there are
				6372	* several CPUs for which the frequency request will be the same, we don't
				6373	* have enough data to break the tie between them, because the Energy Model
				6374	* only includes active power costs. With this model, if we assume that
				6375	* frequency requests follow utilization (e.g. using schedutil), the CPU with
				6376	* the maximum spare capacity in a performance domain is guaranteed to be among
				6377	* the best candidates of the performance domain.
				6378	*
				6379	* In practice, it could be preferable from an energy standpoint to pack
				6380	* small tasks on a CPU in order to let other CPUs go in deeper idle states,
				6381	* but that could also hurt our chances to go cluster idle, and we have no
				6382	* ways to tell with the current Energy Model if this is actually a good
				6383	* idea or not. So, find_energy_efficient_cpu() basically favors
				6384	* cluster-packing, and spreading inside a cluster. That should at least be
				6385	* a good thing for latency, and this is consistent with the idea that most
				6386	* of the energy savings of EAS come from the asymmetry of the system, and
				6387	* not so much from breaking the tie between identical CPUs. That's also the
				6388	* reason why EAS is enabled in the topology code only for systems where
				6389	* SD_ASYM_CPUCAPACITY is set.
				6390	*
				6391	* NOTE: Forkees are not accepted in the energy-aware wake-up path because
				6392	* they don't have any useful utilization data yet and it's not possible to
				6393	* forecast their impact on energy consumption. Consequently, they will be
				6394	* placed by find_idlest_cpu() on the least loaded CPU, which might turn out
				6395	* to be energy-inefficient in some use-cases. The alternative would be to
				6396	* bias new tasks towards specific types of CPUs first, or to try to infer
				6397	* their util_avg from the parent task, but those heuristics could hurt
				6398	* other use-cases too. So, until someone finds a better way to solve this,
				6399	* let's keep things simple by re-using the existing slow path.
				6400	*/
				6401	static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
				6402	{
				6403	unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
				6404	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
				6405	unsigned long cpu_cap, util, base_energy = 0;
				6406	int cpu, best_energy_cpu = prev_cpu;
				6407	struct sched_domain *sd;
				6408	struct perf_domain *pd;
				6409
				6410	rcu_read_lock();
				6411	pd = rcu_dereference(rd->pd);
				6412	if (!pd \|\| READ_ONCE(rd->overutilized))
				6413	goto fail;
				6414
				6415	/*
				6416	* Energy-aware wake-up happens on the lowest sched_domain starting
				6417	* from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
				6418	*/
				6419	sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
				6420	while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
				6421	sd = sd->parent;
				6422	if (!sd)
				6423	goto fail;
				6424
				6425	sync_entity_load_avg(&p->se);
				6426	if (!task_util_est(p))
				6427	goto unlock;
				6428
				6429	for (; pd; pd = pd->next) {
				6430	unsigned long cur_delta, spare_cap, max_spare_cap = 0;
				6431	unsigned long base_energy_pd;
				6432	int max_spare_cap_cpu = -1;
				6433
				6434	/* Compute the 'base' energy of the pd, without @p */
				6435	base_energy_pd = compute_energy(p, -1, pd);
				6436	base_energy += base_energy_pd;
				6437
				6438	for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
				6439	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
				6440	continue;
				6441
				6442	/* Skip CPUs that will be overutilized. */
				6443	util = cpu_util_next(cpu, p, cpu);
				6444	cpu_cap = capacity_of(cpu);
				6445	if (!fits_capacity(util, cpu_cap))
				6446	continue;
				6447
				6448	/* Always use prev_cpu as a candidate. */
				6449	if (cpu == prev_cpu) {
				6450	prev_delta = compute_energy(p, prev_cpu, pd);
				6451	prev_delta -= base_energy_pd;
				6452	best_delta = min(best_delta, prev_delta);
				6453	}
				6454
				6455	/*
				6456	* Find the CPU with the maximum spare capacity in
				6457	* the performance domain
				6458	*/
				6459	spare_cap = cpu_cap - util;
				6460	if (spare_cap > max_spare_cap) {
				6461	max_spare_cap = spare_cap;
				6462	max_spare_cap_cpu = cpu;
				6463	}
				6464	}
				6465
				6466	/* Evaluate the energy impact of using this CPU. */
				6467	if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
				6468	cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
				6469	cur_delta -= base_energy_pd;
				6470	if (cur_delta < best_delta) {
				6471	best_delta = cur_delta;
				6472	best_energy_cpu = max_spare_cap_cpu;
				6473	}
				6474	}
				6475	}
				6476	unlock:
				6477	rcu_read_unlock();
				6478
				6479	/*
				6480	* Pick the best CPU if prev_cpu cannot be used, or if it saves at
				6481	* least 6% of the energy used by prev_cpu.
				6482	*/
				6483	if (prev_delta == ULONG_MAX)
				6484	return best_energy_cpu;
				6485
				6486	if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
				6487	return best_energy_cpu;
				6488
				6489	return prev_cpu;
				6490
				6491	fail:
				6492	rcu_read_unlock();
				6493
				6494	return -1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6495	}
				6496
				6497	/*
				6498	* select_task_rq_fair: Select target runqueue for the waking task in domains
				6499	* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
				6500	* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
				6501	*
				6502	* Balances load by selecting the idlest CPU in the idlest group, or under
				6503	* certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
				6504	*
				6505	* Returns the target CPU number.
				6506	*
				6507	* preempt must be disabled.
				6508	*/
				6509	static int
				6510	select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
				6511	{
				6512	struct sched_domain tmp, sd = NULL;
				6513	int cpu = smp_processor_id();
				6514	int new_cpu = prev_cpu;
				6515	int want_affine = 0;
				6516	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
				6517
				6518	if (sd_flag & SD_BALANCE_WAKE) {
				6519	record_wakee(p);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6520
				6521	if (sched_energy_enabled()) {
				6522	new_cpu = find_energy_efficient_cpu(p, prev_cpu);
				6523	if (new_cpu >= 0)
				6524	return new_cpu;
				6525	new_cpu = prev_cpu;
				6526	}
				6527
				6528	want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
				6529	cpumask_test_cpu(cpu, p->cpus_ptr);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6530	}
				6531
				6532	rcu_read_lock();
				6533	for_each_domain(cpu, tmp) {
				6534	if (!(tmp->flags & SD_LOAD_BALANCE))
				6535	break;
				6536
				6537	/*
				6538	* If both 'cpu' and 'prev_cpu' are part of this domain,
				6539	* cpu is a valid SD_WAKE_AFFINE target.
				6540	*/
				6541	if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
				6542	cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
				6543	if (cpu != prev_cpu)
				6544	new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
				6545
				6546	sd = NULL; /* Prefer wake_affine over balance flags */
				6547	break;
				6548	}
				6549
				6550	if (tmp->flags & sd_flag)
				6551	sd = tmp;
				6552	else if (!want_affine)
				6553	break;
				6554	}
				6555
				6556	if (unlikely(sd)) {
				6557	/* Slow path */
				6558	new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
				6559	} else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
				6560	/* Fast path */
				6561
				6562	new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
				6563
				6564	if (want_affine)
				6565	current->recent_used_cpu = cpu;
				6566	}
				6567	rcu_read_unlock();
				6568
				6569	return new_cpu;
				6570	}
				6571
				6572	static void detach_entity_cfs_rq(struct sched_entity *se);
				6573
				6574	/*
				6575	* Called immediately before a task is migrated to a new CPU; task_cpu(p) and
				6576	* cfs_rq_of(p) references at time of call are still valid and identify the
				6577	* previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
				6578	*/
				6579	static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
				6580	{
				6581	/*
				6582	* As blocked tasks retain absolute vruntime the migration needs to
				6583	* deal with this by subtracting the old and adding the new
				6584	* min_vruntime -- the latter is done by enqueue_entity() when placing
				6585	* the task on the new runqueue.
				6586	*/
				6587	if (p->state == TASK_WAKING) {
				6588	struct sched_entity *se = &p->se;
				6589	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				6590	u64 min_vruntime;
				6591
				6592	#ifndef CONFIG_64BIT
				6593	u64 min_vruntime_copy;
				6594
				6595	do {
				6596	min_vruntime_copy = cfs_rq->min_vruntime_copy;
				6597	smp_rmb();
				6598	min_vruntime = cfs_rq->min_vruntime;
				6599	} while (min_vruntime != min_vruntime_copy);
				6600	#else
				6601	min_vruntime = cfs_rq->min_vruntime;
				6602	#endif
				6603
				6604	se->vruntime -= min_vruntime;
				6605	}
				6606
				6607	if (p->on_rq == TASK_ON_RQ_MIGRATING) {
				6608	/*
				6609	* In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
				6610	* rq->lock and can modify state directly.
				6611	*/
				6612	lockdep_assert_held(&task_rq(p)->lock);
				6613	detach_entity_cfs_rq(&p->se);
				6614
				6615	} else {
				6616	/*
				6617	* We are supposed to update the task to "current" time, then
				6618	* its up to date and ready to go to new CPU/cfs_rq. But we
				6619	* have difficulty in getting what current time is, so simply
				6620	* throw away the out-of-date time. This will result in the
				6621	* wakee task is less decayed, but giving the wakee more load
				6622	* sounds not bad.
				6623	*/
				6624	remove_entity_load_avg(&p->se);
				6625	}
				6626
				6627	/* Tell new CPU we are migrated */
				6628	p->se.avg.last_update_time = 0;
				6629
				6630	/* We have migrated, no longer consider this task hot */
				6631	p->se.exec_start = 0;
				6632
				6633	update_scan_period(p, new_cpu);
				6634	}
				6635
				6636	static void task_dead_fair(struct task_struct *p)
				6637	{
				6638	remove_entity_load_avg(&p->se);
				6639	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6640
				6641	static int
				6642	balance_fair(struct rq rq, struct task_struct prev, struct rq_flags *rf)
				6643	{
				6644	if (rq->nr_running)
				6645	return 1;
				6646
				6647	return newidle_balance(rq, rf) != 0;
				6648	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6649	#endif /* CONFIG_SMP */
				6650
				6651	static unsigned long wakeup_gran(struct sched_entity *se)
				6652	{
				6653	unsigned long gran = sysctl_sched_wakeup_granularity;
				6654
				6655	/*
				6656	* Since its curr running now, convert the gran from real-time
				6657	* to virtual-time in his units.
				6658	*
				6659	* By using 'se' instead of 'curr' we penalize light tasks, so
				6660	* they get preempted easier. That is, if 'se' < 'curr' then
				6661	* the resulting gran will be larger, therefore penalizing the
				6662	* lighter, if otoh 'se' > 'curr' then the resulting gran will
				6663	* be smaller, again penalizing the lighter task.
				6664	*
				6665	* This is especially important for buddies when the leftmost
				6666	* task is higher priority than the buddy.
				6667	*/
				6668	return calc_delta_fair(gran, se);
				6669	}
				6670
				6671	/*
				6672	* Should 'se' preempt 'curr'.
				6673	*
				6674	* \|s1
				6675	* \|s2
				6676	* \|s3
				6677	* g
				6678	* \|<--->\|c
				6679	*
				6680	* w(c, s1) = -1
				6681	* w(c, s2) = 0
				6682	* w(c, s3) = 1
				6683	*
				6684	*/
				6685	static int
				6686	wakeup_preempt_entity(struct sched_entity curr, struct sched_entity se)
				6687	{
				6688	s64 gran, vdiff = curr->vruntime - se->vruntime;
				6689
				6690	if (vdiff <= 0)
				6691	return -1;
				6692
				6693	gran = wakeup_gran(se);
				6694	if (vdiff > gran)
				6695	return 1;
				6696
				6697	return 0;
				6698	}
				6699
				6700	static void set_last_buddy(struct sched_entity *se)
				6701	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6702	if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6703	return;
				6704
				6705	for_each_sched_entity(se) {
				6706	if (SCHED_WARN_ON(!se->on_rq))
				6707	return;
				6708	cfs_rq_of(se)->last = se;
				6709	}
				6710	}
				6711
				6712	static void set_next_buddy(struct sched_entity *se)
				6713	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6714	if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6715	return;
				6716
				6717	for_each_sched_entity(se) {
				6718	if (SCHED_WARN_ON(!se->on_rq))
				6719	return;
				6720	cfs_rq_of(se)->next = se;
				6721	}
				6722	}
				6723
				6724	static void set_skip_buddy(struct sched_entity *se)
				6725	{
				6726	for_each_sched_entity(se)
				6727	cfs_rq_of(se)->skip = se;
				6728	}
				6729
				6730	/*
				6731	* Preempt the current task with a newly woken task if needed:
				6732	*/
				6733	static void check_preempt_wakeup(struct rq rq, struct task_struct p, int wake_flags)
				6734	{
				6735	struct task_struct *curr = rq->curr;
				6736	struct sched_entity se = &curr->se, pse = &p->se;
				6737	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
				6738	int scale = cfs_rq->nr_running >= sched_nr_latency;
				6739	int next_buddy_marked = 0;
				6740
				6741	if (unlikely(se == pse))
				6742	return;
				6743
				6744	/*
				6745	* This is possible from callers such as attach_tasks(), in which we
				6746	* unconditionally check_prempt_curr() after an enqueue (which may have
				6747	* lead to a throttle). This both saves work and prevents false
				6748	* next-buddy nomination below.
				6749	*/
				6750	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
				6751	return;
				6752
				6753	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
				6754	set_next_buddy(pse);
				6755	next_buddy_marked = 1;
				6756	}
				6757
				6758	/*
				6759	* We can come here with TIF_NEED_RESCHED already set from new task
				6760	* wake up path.
				6761	*
				6762	* Note: this also catches the edge-case of curr being in a throttled
				6763	* group (e.g. via set_curr_task), since update_curr() (in the
				6764	* enqueue of curr) will have resulted in resched being set. This
				6765	* prevents us from potentially nominating it as a false LAST_BUDDY
				6766	* below.
				6767	*/
				6768	if (test_tsk_need_resched(curr))
				6769	return;
				6770
				6771	/* Idle tasks are by definition preempted by non-idle tasks. */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6772	if (unlikely(task_has_idle_policy(curr)) &&
				6773	likely(!task_has_idle_policy(p)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6774	goto preempt;
				6775
				6776	/*
				6777	* Batch and idle tasks do not preempt non-idle tasks (their preemption
				6778	* is driven by the tick):
				6779	*/
				6780	if (unlikely(p->policy != SCHED_NORMAL) \|\| !sched_feat(WAKEUP_PREEMPTION))
				6781	return;
				6782
				6783	find_matching_se(&se, &pse);
				6784	update_curr(cfs_rq_of(se));
				6785	BUG_ON(!pse);
				6786	if (wakeup_preempt_entity(se, pse) == 1) {
				6787	/*
				6788	* Bias pick_next to pick the sched entity that is
				6789	* triggering this preemption.
				6790	*/
				6791	if (!next_buddy_marked)
				6792	set_next_buddy(pse);
				6793	goto preempt;
				6794	}
				6795
				6796	return;
				6797
				6798	preempt:
				6799	resched_curr(rq);
				6800	/*
				6801	* Only set the backward buddy when the current task is still
				6802	* on the rq. This can happen when a wakeup gets interleaved
				6803	* with schedule on the ->pre_schedule() or idle_balance()
				6804	* point, either of which can * drop the rq lock.
				6805	*
				6806	* Also, during early boot the idle thread is in the fair class,
				6807	* for obvious reasons its a bad idea to schedule back to it.
				6808	*/
				6809	if (unlikely(!se->on_rq \|\| curr == rq->idle))
				6810	return;
				6811
				6812	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
				6813	set_last_buddy(se);
				6814	}
				6815
				6816	static struct task_struct *
				6817	pick_next_task_fair(struct rq rq, struct task_struct prev, struct rq_flags *rf)
				6818	{
				6819	struct cfs_rq *cfs_rq = &rq->cfs;
				6820	struct sched_entity *se;
				6821	struct task_struct *p;
				6822	int new_tasks;
				6823
				6824	again:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6825	if (!sched_fair_runnable(rq))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6826	goto idle;
				6827
				6828	#ifdef CONFIG_FAIR_GROUP_SCHED
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6829	if (!prev \|\| prev->sched_class != &fair_sched_class)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6830	goto simple;
				6831
				6832	/*
				6833	* Because of the set_next_buddy() in dequeue_task_fair() it is rather
				6834	* likely that a next task is from the same cgroup as the current.
				6835	*
				6836	* Therefore attempt to avoid putting and setting the entire cgroup
				6837	* hierarchy, only change the part that actually changes.
				6838	*/
				6839
				6840	do {
				6841	struct sched_entity *curr = cfs_rq->curr;
				6842
				6843	/*
				6844	* Since we got here without doing put_prev_entity() we also
				6845	* have to consider cfs_rq->curr. If it is still a runnable
				6846	* entity, update_curr() will update its vruntime, otherwise
				6847	* forget we've ever seen it.
				6848	*/
				6849	if (curr) {
				6850	if (curr->on_rq)
				6851	update_curr(cfs_rq);
				6852	else
				6853	curr = NULL;
				6854
				6855	/*
				6856	* This call to check_cfs_rq_runtime() will do the
				6857	* throttle and dequeue its entity in the parent(s).
				6858	* Therefore the nr_running test will indeed
				6859	* be correct.
				6860	*/
				6861	if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
				6862	cfs_rq = &rq->cfs;
				6863
				6864	if (!cfs_rq->nr_running)
				6865	goto idle;
				6866
				6867	goto simple;
				6868	}
				6869	}
				6870
				6871	se = pick_next_entity(cfs_rq, curr);
				6872	cfs_rq = group_cfs_rq(se);
				6873	} while (cfs_rq);
				6874
				6875	p = task_of(se);
				6876
				6877	/*
				6878	* Since we haven't yet done put_prev_entity and if the selected task
				6879	* is a different task than we started out with, try and touch the
				6880	* least amount of cfs_rqs.
				6881	*/
				6882	if (prev != p) {
				6883	struct sched_entity *pse = &prev->se;
				6884
				6885	while (!(cfs_rq = is_same_group(se, pse))) {
				6886	int se_depth = se->depth;
				6887	int pse_depth = pse->depth;
				6888
				6889	if (se_depth <= pse_depth) {
				6890	put_prev_entity(cfs_rq_of(pse), pse);
				6891	pse = parent_entity(pse);
				6892	}
				6893	if (se_depth >= pse_depth) {
				6894	set_next_entity(cfs_rq_of(se), se);
				6895	se = parent_entity(se);
				6896	}
				6897	}
				6898
				6899	put_prev_entity(cfs_rq, pse);
				6900	set_next_entity(cfs_rq, se);
				6901	}
				6902
				6903	goto done;
				6904	simple:
				6905	#endif
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6906	if (prev)
				6907	put_prev_task(rq, prev);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6908
				6909	do {
				6910	se = pick_next_entity(cfs_rq, NULL);
				6911	set_next_entity(cfs_rq, se);
				6912	cfs_rq = group_cfs_rq(se);
				6913	} while (cfs_rq);
				6914
				6915	p = task_of(se);
				6916
				6917	done: __maybe_unused;
				6918	#ifdef CONFIG_SMP
				6919	/*
				6920	* Move the next running task to the front of
				6921	* the list, so our cfs_tasks list becomes MRU
				6922	* one.
				6923	*/
				6924	list_move(&p->se.group_node, &rq->cfs_tasks);
				6925	#endif
				6926
				6927	if (hrtick_enabled(rq))
				6928	hrtick_start_fair(rq, p);
				6929
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6930	update_misfit_status(p, rq);
				6931
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6932	return p;
				6933
				6934	idle:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6935	if (!rf)
				6936	return NULL;
				6937
				6938	new_tasks = newidle_balance(rq, rf);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6939
				6940	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6941	* Because newidle_balance() releases (and re-acquires) rq->lock, it is
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6942	* possible for any higher priority task to appear. In that case we
				6943	* must re-start the pick_next_entity() loop.
				6944	*/
				6945	if (new_tasks < 0)
				6946	return RETRY_TASK;
				6947
				6948	if (new_tasks > 0)
				6949	goto again;
				6950
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6951	/*
				6952	* rq is about to be idle, check if we need to update the
				6953	* lost_idle_time of clock_pelt
				6954	*/
				6955	update_idle_rq_clock_pelt(rq);
				6956
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6957	return NULL;
				6958	}
				6959
				6960	/*
				6961	* Account for a descheduled task:
				6962	*/
				6963	static void put_prev_task_fair(struct rq rq, struct task_struct prev)
				6964	{
				6965	struct sched_entity *se = &prev->se;
				6966	struct cfs_rq *cfs_rq;
				6967
				6968	for_each_sched_entity(se) {
				6969	cfs_rq = cfs_rq_of(se);
				6970	put_prev_entity(cfs_rq, se);
				6971	}
				6972	}
				6973
				6974	/*
				6975	* sched_yield() is very simple
				6976	*
				6977	* The magic of dealing with the ->skip buddy is in pick_next_entity.
				6978	*/
				6979	static void yield_task_fair(struct rq *rq)
				6980	{
				6981	struct task_struct *curr = rq->curr;
				6982	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
				6983	struct sched_entity *se = &curr->se;
				6984
				6985	/*
				6986	* Are we the only task in the tree?
				6987	*/
				6988	if (unlikely(rq->nr_running == 1))
				6989	return;
				6990
				6991	clear_buddies(cfs_rq, se);
				6992
				6993	if (curr->policy != SCHED_BATCH) {
				6994	update_rq_clock(rq);
				6995	/*
				6996	* Update run-time statistics of the 'current'.
				6997	*/
				6998	update_curr(cfs_rq);
				6999	/*
				7000	* Tell update_rq_clock() that we've just updated,
				7001	* so we don't do microscopic update in schedule()
				7002	* and double the fastpath cost.
				7003	*/
				7004	rq_clock_skip_update(rq);
				7005	}
				7006
				7007	set_skip_buddy(se);
				7008	}
				7009
				7010	static bool yield_to_task_fair(struct rq rq, struct task_struct p, bool preempt)
				7011	{
				7012	struct sched_entity *se = &p->se;
				7013
				7014	/* throttled hierarchies are not runnable */
				7015	if (!se->on_rq \|\| throttled_hierarchy(cfs_rq_of(se)))
				7016	return false;
				7017
				7018	/* Tell the scheduler that we'd really like pse to run next. */
				7019	set_next_buddy(se);
				7020
				7021	yield_task_fair(rq);
				7022
				7023	return true;
				7024	}
				7025
				7026	#ifdef CONFIG_SMP
				7027	/**************************************************
				7028	* Fair scheduling class load-balancing methods.
				7029	*
				7030	* BASICS
				7031	*
				7032	* The purpose of load-balancing is to achieve the same basic fairness the
				7033	* per-CPU scheduler provides, namely provide a proportional amount of compute
				7034	* time to each task. This is expressed in the following equation:
				7035	*
				7036	* W_i,n/P_i == W_j,n/P_j for all i,j (1)
				7037	*
				7038	* Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
				7039	* W_i,0 is defined as:
				7040	*
				7041	* W_i,0 = \Sum_j w_i,j (2)
				7042	*
				7043	* Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
				7044	* is derived from the nice value as per sched_prio_to_weight[].
				7045	*
				7046	* The weight average is an exponential decay average of the instantaneous
				7047	* weight:
				7048	*
				7049	* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
				7050	*
				7051	* C_i is the compute capacity of CPU i, typically it is the
				7052	* fraction of 'recent' time available for SCHED_OTHER task execution. But it
				7053	* can also include other factors [XXX].
				7054	*
				7055	* To achieve this balance we define a measure of imbalance which follows
				7056	* directly from (1):
				7057	*
				7058	* imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
				7059	*
				7060	* We then move tasks around to minimize the imbalance. In the continuous
				7061	* function space it is obvious this converges, in the discrete case we get
				7062	* a few fun cases generally called infeasible weight scenarios.
				7063	*
				7064	* [XXX expand on:
				7065	* - infeasible weights;
				7066	* - local vs global optima in the discrete case. ]
				7067	*
				7068	*
				7069	* SCHED DOMAINS
				7070	*
				7071	* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
				7072	* for all i,j solution, we create a tree of CPUs that follows the hardware
				7073	* topology where each level pairs two lower groups (or better). This results
				7074	* in O(log n) layers. Furthermore we reduce the number of CPUs going up the
				7075	* tree to only the first of the previous level and we decrease the frequency
				7076	* of load-balance at each level inv. proportional to the number of CPUs in
				7077	* the groups.
				7078	*
				7079	* This yields:
				7080	*
				7081	* log_2 n     1     n
				7082	*  \Sum    { --- * --- * 2^i } = O(n)                            (5)
				7083	*  i = 0     2^i   2^i
				7084	*                         `- size of each group
				7085	*   |         |     `- number of CPUs doing load-balance
				7086	*   |         `- freq
				7087	*   `- sum over all levels
				7088	*
				7089	* Coupled with a limit on how many tasks we can migrate every balance pass,
				7090	* this makes (5) the runtime complexity of the balancer.
				7091	*
				7092	* An important property here is that each CPU is still (indirectly) connected
				7093	* to every other CPU in at most O(log n) steps:
				7094	*
				7095	* The adjacency matrix of the resulting graph is given by:
				7096	*
				7097	*         log_2 n
				7098	* A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
				7099	*          k = 0
				7100	*
				7101	* And you'll find that:
				7102	*
				7103	* A^(log_2 n)_i,j != 0 for all i,j (7)
				7104	*
				7105	* Showing there's indeed a path between every CPU in at most O(log n) steps.
				7106	* The task movement gives a factor of O(m), giving a convergence complexity
				7107	* of:
				7108	*
				7109	* O(nm log n), n := nr_cpus, m := nr_tasks (8)
				7110	*
				7111	*
				7112	* WORK CONSERVING
				7113	*
				7114	* In order to avoid CPUs going idle while there's still work to do, new idle
				7115	* balancing is more aggressive and has the newly idle CPU iterate up the domain
				7116	* tree itself instead of relying on other CPUs to bring it work.
				7117	*
				7118	* This adds some complexity to both (5) and (8) but it reduces the total idle
				7119	* time.
				7120	*
				7121	* [XXX more?]
				7122	*
				7123	*
				7124	* CGROUPS
				7125	*
				7126	* Cgroups make a horror show out of (2), instead of a simple sum we get:
				7127	*
				7128	*                              s_k,i
				7129	* W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
				7130	*                               S_k
				7131	*
				7132	* Where
				7133	*
				7134	* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
				7135	*
				7136	* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
				7137	*
				7138	* The big problem is S_k, its a global sum needed to compute a local (W_i)
				7139	* property.
				7140	*
				7141	* [XXX write more on how we solve this.. _after_ merging pjt's patches that
				7142	* rewrite all of this once again.]
				7143	*/
				7144
				7145	static unsigned long __read_mostly max_load_balance_interval = HZ/10;
				7146
				7147	enum fbq_type { regular, remote, all };
				7148
/*
 * Group classification stored in lb_env::src_grp_type. The numeric values
 * increase from group_other to group_overloaded.
 */
enum group_type {
	group_other		= 0,
	group_misfit_task	= 1,
	group_imbalanced	= 2,
	group_overloaded	= 3,
};
				7155
/*
 * Bits for lb_env::flags.
 *
 * LBF_ALL_PINNED:  cleared by can_migrate_task() once a task that may run
 *                  on dst_cpu is seen.
 * LBF_NEED_BREAK:  set by detach_tasks() when loop exceeds loop_break.
 * LBF_DST_PINNED:  set by can_migrate_task() when an alternative
 *                  destination was stored in new_dst_cpu.
 * LBF_SOME_PINNED: set by can_migrate_task() when a task's cpus_ptr
 *                  excludes dst_cpu.
 * LBF_NOHZ_STATS,
 * LBF_NOHZ_AGAIN:  not used in this section.
 */
#define LBF_ALL_PINNED	0x01
#define LBF_NEED_BREAK	0x02
#define LBF_DST_PINNED	0x04
#define LBF_SOME_PINNED	0x08
#define LBF_NOHZ_STATS	0x10
#define LBF_NOHZ_AGAIN	0x20
				7162
				7163	struct lb_env {
				7164	struct sched_domain *sd;
				7165
				7166	struct rq *src_rq;
				7167	int src_cpu;
				7168
				7169	int dst_cpu;
				7170	struct rq *dst_rq;
				7171
				7172	struct cpumask *dst_grpmask;
				7173	int new_dst_cpu;
				7174	enum cpu_idle_type idle;
				7175	long imbalance;
				7176	/* The set of CPUs under consideration for load-balancing */
				7177	struct cpumask *cpus;
				7178
				7179	unsigned int flags;
				7180
				7181	unsigned int loop;
				7182	unsigned int loop_break;
				7183	unsigned int loop_max;
				7184
				7185	enum fbq_type fbq_type;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7186	enum group_type src_grp_type;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7187	struct list_head tasks;
				7188	};
				7189
				7190	/*
				7191	* Is this task likely cache-hot:
				7192	*/
				7193	static int task_hot(struct task_struct p, struct lb_env env)
				7194	{
				7195	s64 delta;
				7196
				7197	lockdep_assert_held(&env->src_rq->lock);
				7198
				7199	if (p->sched_class != &fair_sched_class)
				7200	return 0;
				7201
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7202	if (unlikely(task_has_idle_policy(p)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7203	return 0;
				7204
				7205	/*
				7206	* Buddy candidates are cache hot:
				7207	*/
				7208	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
				7209	(&p->se == cfs_rq_of(&p->se)->next \|\|
				7210	&p->se == cfs_rq_of(&p->se)->last))
				7211	return 1;
				7212
				7213	if (sysctl_sched_migration_cost == -1)
				7214	return 1;
				7215	if (sysctl_sched_migration_cost == 0)
				7216	return 0;
				7217
				7218	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
				7219
				7220	return delta < (s64)sysctl_sched_migration_cost;
				7221	}
				7222
				7223	#ifdef CONFIG_NUMA_BALANCING
				7224	/*
				7225	* Returns 1, if task migration degrades locality
				7226	* Returns 0, if task migration improves locality i.e migration preferred.
				7227	* Returns -1, if task migration is not affected by locality.
				7228	*/
				7229	static int migrate_degrades_locality(struct task_struct p, struct lb_env env)
				7230	{
				7231	struct numa_group *numa_group = rcu_dereference(p->numa_group);
				7232	unsigned long src_weight, dst_weight;
				7233	int src_nid, dst_nid, dist;
				7234
				7235	if (!static_branch_likely(&sched_numa_balancing))
				7236	return -1;
				7237
				7238	if (!p->numa_faults \|\| !(env->sd->flags & SD_NUMA))
				7239	return -1;
				7240
				7241	src_nid = cpu_to_node(env->src_cpu);
				7242	dst_nid = cpu_to_node(env->dst_cpu);
				7243
				7244	if (src_nid == dst_nid)
				7245	return -1;
				7246
				7247	/* Migrating away from the preferred node is always bad. */
				7248	if (src_nid == p->numa_preferred_nid) {
				7249	if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
				7250	return 1;
				7251	else
				7252	return -1;
				7253	}
				7254
				7255	/* Encourage migration to the preferred node. */
				7256	if (dst_nid == p->numa_preferred_nid)
				7257	return 0;
				7258
				7259	/* Leaving a core idle is often worse than degrading locality. */
				7260	if (env->idle == CPU_IDLE)
				7261	return -1;
				7262
				7263	dist = node_distance(src_nid, dst_nid);
				7264	if (numa_group) {
				7265	src_weight = group_weight(p, src_nid, dist);
				7266	dst_weight = group_weight(p, dst_nid, dist);
				7267	} else {
				7268	src_weight = task_weight(p, src_nid, dist);
				7269	dst_weight = task_weight(p, dst_nid, dist);
				7270	}
				7271
				7272	return dst_weight < src_weight;
				7273	}
				7274
				7275	#else
				7276	static inline int migrate_degrades_locality(struct task_struct *p,
				7277	struct lb_env *env)
				7278	{
				7279	return -1;
				7280	}
				7281	#endif
				7282
				7283	/*
				7284	* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
				7285	*/
				7286	static
				7287	int can_migrate_task(struct task_struct p, struct lb_env env)
				7288	{
				7289	int tsk_cache_hot;
				7290
				7291	lockdep_assert_held(&env->src_rq->lock);
				7292
				7293	/*
				7294	* We do not migrate tasks that are:
				7295	* 1) throttled_lb_pair, or
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7296	* 2) cannot be migrated to this CPU due to cpus_ptr, or
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7297	* 3) running (obviously), or
				7298	* 4) are cache-hot on their current CPU.
				7299	*/
				7300	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
				7301	return 0;
				7302
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7303	/* Disregard pcpu kthreads; they are where they need to be. */
				7304	if (kthread_is_per_cpu(p))
				7305	return 0;
				7306
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7307	if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7308	int cpu;
				7309
				7310	schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
				7311
				7312	env->flags \|= LBF_SOME_PINNED;
				7313
				7314	/*
				7315	* Remember if this task can be migrated to any other CPU in
				7316	* our sched_group. We may want to revisit it if we couldn't
				7317	* meet load balance goals by pulling other tasks on src_cpu.
				7318	*
				7319	* Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
				7320	* already computed one in current iteration.
				7321	*/
				7322	if (env->idle == CPU_NEWLY_IDLE \|\| (env->flags & LBF_DST_PINNED))
				7323	return 0;
				7324
				7325	/* Prevent to re-select dst_cpu via env's CPUs: */
				7326	for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7327	if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7328	env->flags \|= LBF_DST_PINNED;
				7329	env->new_dst_cpu = cpu;
				7330	break;
				7331	}
				7332	}
				7333
				7334	return 0;
				7335	}
				7336
				7337	/* Record that we found atleast one task that could run on dst_cpu */
				7338	env->flags &= ~LBF_ALL_PINNED;
				7339
				7340	if (task_running(env->src_rq, p)) {
				7341	schedstat_inc(p->se.statistics.nr_failed_migrations_running);
				7342	return 0;
				7343	}
				7344
				7345	/*
				7346	* Aggressive migration if:
				7347	* 1) destination numa is preferred
				7348	* 2) task is cache cold, or
				7349	* 3) too many balance attempts have failed.
				7350	*/
				7351	tsk_cache_hot = migrate_degrades_locality(p, env);
				7352	if (tsk_cache_hot == -1)
				7353	tsk_cache_hot = task_hot(p, env);
				7354
				7355	if (tsk_cache_hot <= 0 \|\|
				7356	env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
				7357	if (tsk_cache_hot == 1) {
				7358	schedstat_inc(env->sd->lb_hot_gained[env->idle]);
				7359	schedstat_inc(p->se.statistics.nr_forced_migrations);
				7360	}
				7361	return 1;
				7362	}
				7363
				7364	schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
				7365	return 0;
				7366	}
				7367
				7368	/*
				7369	* detach_task() -- detach the task for the migration specified in env
				7370	*/
				7371	static void detach_task(struct task_struct p, struct lb_env env)
				7372	{
				7373	lockdep_assert_held(&env->src_rq->lock);
				7374
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7375	deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
				7376	set_task_cpu(p, env->dst_cpu);
				7377	}
				7378
				7379	/*
				7380	* detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
				7381	* part of active balancing operations within "domain".
				7382	*
				7383	* Returns a task if successful and NULL otherwise.
				7384	*/
				7385	static struct task_struct detach_one_task(struct lb_env env)
				7386	{
				7387	struct task_struct *p;
				7388
				7389	lockdep_assert_held(&env->src_rq->lock);
				7390
				7391	list_for_each_entry_reverse(p,
				7392	&env->src_rq->cfs_tasks, se.group_node) {
				7393	if (!can_migrate_task(p, env))
				7394	continue;
				7395
				7396	detach_task(p, env);
				7397
				7398	/*
				7399	* Right now, this is only the second place where
				7400	* lb_gained[env->idle] is updated (other is detach_tasks)
				7401	* so we can safely collect stats here rather than
				7402	* inside detach_tasks().
				7403	*/
				7404	schedstat_inc(env->sd->lb_gained[env->idle]);
				7405	return p;
				7406	}
				7407	return NULL;
				7408	}
				7409
				7410	static const unsigned int sched_nr_migrate_break = 32;
				7411
				7412	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7413	* detach_tasks() -- tries to detach up to imbalance runnable load from
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7414	* busiest_rq, as part of a balancing operation within domain "sd".
				7415	*
				7416	* Returns number of detached tasks if successful and 0 otherwise.
				7417	*/
				7418	static int detach_tasks(struct lb_env *env)
				7419	{
				7420	struct list_head *tasks = &env->src_rq->cfs_tasks;
				7421	struct task_struct *p;
				7422	unsigned long load;
				7423	int detached = 0;
				7424
				7425	lockdep_assert_held(&env->src_rq->lock);
				7426
				7427	if (env->imbalance <= 0)
				7428	return 0;
				7429
				7430	while (!list_empty(tasks)) {
				7431	/*
				7432	* We don't want to steal all, otherwise we may be treated likewise,
				7433	* which could at worst lead to a livelock crash.
				7434	*/
				7435	if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
				7436	break;
				7437
				7438	p = list_last_entry(tasks, struct task_struct, se.group_node);
				7439
				7440	env->loop++;
				7441	/* We've more or less seen every task there is, call it quits */
				7442	if (env->loop > env->loop_max)
				7443	break;
				7444
				7445	/* take a breather every nr_migrate tasks */
				7446	if (env->loop > env->loop_break) {
				7447	env->loop_break += sched_nr_migrate_break;
				7448	env->flags \|= LBF_NEED_BREAK;
				7449	break;
				7450	}
				7451
				7452	if (!can_migrate_task(p, env))
				7453	goto next;
				7454
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7455	/*
				7456	* Depending of the number of CPUs and tasks and the
				7457	* cgroup hierarchy, task_h_load() can return a null
				7458	* value. Make sure that env->imbalance decreases
				7459	* otherwise detach_tasks() will stop only after
				7460	* detaching up to loop_max tasks.
				7461	*/
				7462	load = max_t(unsigned long, task_h_load(p), 1);
				7463
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7464
				7465	if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
				7466	goto next;
				7467
				7468	if ((load / 2) > env->imbalance)
				7469	goto next;
				7470
				7471	detach_task(p, env);
				7472	list_add(&p->se.group_node, &env->tasks);
				7473
				7474	detached++;
				7475	env->imbalance -= load;
				7476
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7477	#ifdef CONFIG_PREEMPTION
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7478	/*
				7479	* NEWIDLE balancing is a source of latency, so preemptible
				7480	* kernels will stop after the first task is detached to minimize
				7481	* the critical section.
				7482	*/
				7483	if (env->idle == CPU_NEWLY_IDLE)
				7484	break;
				7485	#endif
				7486
				7487	/*
				7488	* We only want to steal up to the prescribed amount of
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7489	* runnable load.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7490	*/
				7491	if (env->imbalance <= 0)
				7492	break;
				7493
				7494	continue;
				7495	next:
				7496	list_move(&p->se.group_node, tasks);
				7497	}
				7498
				7499	/*
				7500	* Right now, this is one of only two places we collect this stat
				7501	* so we can safely collect detach_one_task() stats here rather
				7502	* than inside detach_one_task().
				7503	*/
				7504	schedstat_add(env->sd->lb_gained[env->idle], detached);
				7505
				7506	return detached;
				7507	}
				7508
				7509	/*
				7510	* attach_task() -- attach the task detached by detach_task() to its new rq.
				7511	*/
				7512	static void attach_task(struct rq rq, struct task_struct p)
				7513	{
				7514	lockdep_assert_held(&rq->lock);
				7515
				7516	BUG_ON(task_rq(p) != rq);
				7517	activate_task(rq, p, ENQUEUE_NOCLOCK);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7518	check_preempt_curr(rq, p, 0);
				7519	}
				7520
				7521	/*
				7522	* attach_one_task() -- attaches the task returned from detach_one_task() to
				7523	* its new rq.
				7524	*/
				7525	static void attach_one_task(struct rq rq, struct task_struct p)
				7526	{
				7527	struct rq_flags rf;
				7528
				7529	rq_lock(rq, &rf);
				7530	update_rq_clock(rq);
				7531	attach_task(rq, p);
				7532	rq_unlock(rq, &rf);
				7533	}
				7534
				7535	/*
				7536	* attach_tasks() -- attaches all tasks detached by detach_tasks() to their
				7537	* new rq.
				7538	*/
				7539	static void attach_tasks(struct lb_env *env)
				7540	{
				7541	struct list_head *tasks = &env->tasks;
				7542	struct task_struct *p;
				7543	struct rq_flags rf;
				7544
				7545	rq_lock(env->dst_rq, &rf);
				7546	update_rq_clock(env->dst_rq);
				7547
				7548	while (!list_empty(tasks)) {
				7549	p = list_first_entry(tasks, struct task_struct, se.group_node);
				7550	list_del_init(&p->se.group_node);
				7551
				7552	attach_task(env->dst_rq, p);
				7553	}
				7554
				7555	rq_unlock(env->dst_rq, &rf);
				7556	}
				7557
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7558	#ifdef CONFIG_NO_HZ_COMMON
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7559	static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
				7560	{
				7561	if (cfs_rq->avg.load_avg)
				7562	return true;
				7563
				7564	if (cfs_rq->avg.util_avg)
				7565	return true;
				7566
				7567	return false;
				7568	}
				7569
				7570	static inline bool others_have_blocked(struct rq *rq)
				7571	{
				7572	if (READ_ONCE(rq->avg_rt.util_avg))
				7573	return true;
				7574
				7575	if (READ_ONCE(rq->avg_dl.util_avg))
				7576	return true;
				7577
				7578	#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
				7579	if (READ_ONCE(rq->avg_irq.util_avg))
				7580	return true;
				7581	#endif
				7582
				7583	return false;
				7584	}
				7585
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7586	static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
				7587	{
				7588	rq->last_blocked_load_update_tick = jiffies;
				7589
				7590	if (!has_blocked)
				7591	rq->has_blocked_load = 0;
				7592	}
				7593	#else
				7594	static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
				7595	static inline bool others_have_blocked(struct rq *rq) { return false; }
				7596	static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
				7597	#endif
				7598
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7599	static bool __update_blocked_others(struct rq rq, bool done)
				7600	{
				7601	const struct sched_class *curr_class;
				7602	u64 now = rq_clock_pelt(rq);
				7603	bool decayed;
				7604
				7605	/*
				7606	* update_load_avg() can call cpufreq_update_util(). Make sure that RT,
				7607	* DL and IRQ signals have been updated before updating CFS.
				7608	*/
				7609	curr_class = rq->curr->sched_class;
				7610
				7611	decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) \|
				7612	update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) \|
				7613	update_irq_load_avg(rq, 0);
				7614
				7615	if (others_have_blocked(rq))
				7616	*done = false;
				7617
				7618	return decayed;
				7619	}
				7620
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7621	#ifdef CONFIG_FAIR_GROUP_SCHED
				7622
				7623	static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
				7624	{
				7625	if (cfs_rq->load.weight)
				7626	return false;
				7627
				7628	if (cfs_rq->avg.load_sum)
				7629	return false;
				7630
				7631	if (cfs_rq->avg.util_sum)
				7632	return false;
				7633
				7634	if (cfs_rq->avg.runnable_load_sum)
				7635	return false;
				7636
				7637	return true;
				7638	}
				7639
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7640	static bool __update_blocked_fair(struct rq rq, bool done)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7641	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7642	struct cfs_rq cfs_rq, pos;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7643	bool decayed = false;
				7644	int cpu = cpu_of(rq);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7645
				7646	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7647	* Iterates the task_group tree in a bottom up fashion, see
				7648	* list_add_leaf_cfs_rq() for details.
				7649	*/
				7650	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
				7651	struct sched_entity *se;
				7652
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7653	if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7654	update_tg_load_avg(cfs_rq, 0);
				7655
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7656	if (cfs_rq == &rq->cfs)
				7657	decayed = true;
				7658	}
				7659
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7660	/* Propagate pending load changes to the parent, if any: */
				7661	se = cfs_rq->tg->se[cpu];
				7662	if (se && !skip_blocked_update(se))
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7663	update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7664
				7665	/*
				7666	* There can be a lot of idle CPU cgroups. Don't let fully
				7667	* decayed cfs_rqs linger on the list.
				7668	*/
				7669	if (cfs_rq_is_decayed(cfs_rq))
				7670	list_del_leaf_cfs_rq(cfs_rq);
				7671
				7672	/* Don't need periodic decay once load/util_avg are null */
				7673	if (cfs_rq_has_blocked(cfs_rq))
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7674	*done = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7675	}
				7676
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7677	return decayed;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7678	}
				7679
				7680	/*
				7681	* Compute the hierarchical load factor for cfs_rq and all its ascendants.
				7682	* This needs to be done in a top-down fashion because the load of a child
				7683	* group is a fraction of its parents load.
				7684	*/
				7685	static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
				7686	{
				7687	struct rq *rq = rq_of(cfs_rq);
				7688	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
				7689	unsigned long now = jiffies;
				7690	unsigned long load;
				7691
				7692	if (cfs_rq->last_h_load_update == now)
				7693	return;
				7694
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7695	WRITE_ONCE(cfs_rq->h_load_next, NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7696	for_each_sched_entity(se) {
				7697	cfs_rq = cfs_rq_of(se);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7698	WRITE_ONCE(cfs_rq->h_load_next, se);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7699	if (cfs_rq->last_h_load_update == now)
				7700	break;
				7701	}
				7702
				7703	if (!se) {
				7704	cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
				7705	cfs_rq->last_h_load_update = now;
				7706	}
				7707
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7708	while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7709	load = cfs_rq->h_load;
				7710	load = div64_ul(load * se->avg.load_avg,
				7711	cfs_rq_load_avg(cfs_rq) + 1);
				7712	cfs_rq = group_cfs_rq(se);
				7713	cfs_rq->h_load = load;
				7714	cfs_rq->last_h_load_update = now;
				7715	}
				7716	}
				7717
				7718	static unsigned long task_h_load(struct task_struct *p)
				7719	{
				7720	struct cfs_rq *cfs_rq = task_cfs_rq(p);
				7721
				7722	update_cfs_rq_h_load(cfs_rq);
				7723	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
				7724	cfs_rq_load_avg(cfs_rq) + 1);
				7725	}
				7726	#else
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7727	static bool __update_blocked_fair(struct rq rq, bool done)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7728	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7729	struct cfs_rq *cfs_rq = &rq->cfs;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7730	bool decayed;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7731
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7732	decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
				7733	if (cfs_rq_has_blocked(cfs_rq))
				7734	*done = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7735
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7736	return decayed;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7737	}
				7738
				7739	static unsigned long task_h_load(struct task_struct *p)
				7740	{
				7741	return p->se.avg.load_avg;
				7742	}
				7743	#endif
				7744
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7745	static void update_blocked_averages(int cpu)
				7746	{
				7747	bool decayed = false, done = true;
				7748	struct rq *rq = cpu_rq(cpu);
				7749	struct rq_flags rf;
				7750
				7751	rq_lock_irqsave(rq, &rf);
				7752	update_rq_clock(rq);
				7753
				7754	decayed \|= __update_blocked_others(rq, &done);
				7755	decayed \|= __update_blocked_fair(rq, &done);
				7756
				7757	update_blocked_load_status(rq, !done);
				7758	if (decayed)
				7759	cpufreq_update_util(rq, 0);
				7760	rq_unlock_irqrestore(rq, &rf);
				7761	}
				7762
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7763	/******** Helpers for find_busiest_group **********************/
				7764
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7765	/*
				7766	* sg_lb_stats - stats of a sched_group required for load_balancing
				7767	*/
				7768	struct sg_lb_stats {
				7769	unsigned long avg_load; /Avg load across the CPUs of the group /
				7770	unsigned long group_load; /* Total load over the CPUs of the group */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7771	unsigned long load_per_task;
				7772	unsigned long group_capacity;
				7773	unsigned long group_util; /* Total utilization of the group */
				7774	unsigned int sum_nr_running; /* Nr tasks running in the group */
				7775	unsigned int idle_cpus;
				7776	unsigned int group_weight;
				7777	enum group_type group_type;
				7778	int group_no_capacity;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7779	unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7780	#ifdef CONFIG_NUMA_BALANCING
				7781	unsigned int nr_numa_running;
				7782	unsigned int nr_preferred_running;
				7783	#endif
				7784	};
				7785
				7786	/*
				7787	* sd_lb_stats - Structure to store the statistics of a sched_domain
				7788	* during load balancing.
				7789	*/
				7790	struct sd_lb_stats {
				7791	struct sched_group busiest; / Busiest group in this sd */
				7792	struct sched_group local; / Local group in this sd */
				7793	unsigned long total_running;
				7794	unsigned long total_load; /* Total load of all groups in sd */
				7795	unsigned long total_capacity; /* Total capacity of all groups in sd */
				7796	unsigned long avg_load; /* Average load across all groups in sd */
				7797
				7798	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
				7799	struct sg_lb_stats local_stat; /* Statistics of the local group */
				7800	};
				7801
				7802	static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
				7803	{
				7804	/*
				7805	* Skimp on the clearing to avoid duplicate work. We can avoid clearing
				7806	* local_stat because update_sg_lb_stats() does a full clear/assignment.
				7807	* We must however clear busiest_stat::avg_load because
				7808	* update_sd_pick_busiest() reads this before assignment.
				7809	*/
				7810	*sds = (struct sd_lb_stats){
				7811	.busiest = NULL,
				7812	.local = NULL,
				7813	.total_running = 0UL,
				7814	.total_load = 0UL,
				7815	.total_capacity = 0UL,
				7816	.busiest_stat = {
				7817	.avg_load = 0UL,
				7818	.sum_nr_running = 0,
				7819	.group_type = group_other,
				7820	},
				7821	};
				7822	}
				7823
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7824	static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
				7825	{
				7826	struct rq *rq = cpu_rq(cpu);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7827	unsigned long max = arch_scale_cpu_capacity(cpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7828	unsigned long used, free;
				7829	unsigned long irq;
				7830
				7831	irq = cpu_util_irq(rq);
				7832
				7833	if (unlikely(irq >= max))
				7834	return 1;
				7835
				7836	used = READ_ONCE(rq->avg_rt.util_avg);
				7837	used += READ_ONCE(rq->avg_dl.util_avg);
				7838
				7839	if (unlikely(used >= max))
				7840	return 1;
				7841
				7842	free = max - used;
				7843
				7844	return scale_irq_capacity(free, irq, max);
				7845	}
				7846
				7847	static void update_cpu_capacity(struct sched_domain *sd, int cpu)
				7848	{
				7849	unsigned long capacity = scale_rt_capacity(sd, cpu);
				7850	struct sched_group *sdg = sd->groups;
				7851
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7852	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7853
				7854	if (!capacity)
				7855	capacity = 1;
				7856
				7857	cpu_rq(cpu)->cpu_capacity = capacity;
				7858	sdg->sgc->capacity = capacity;
				7859	sdg->sgc->min_capacity = capacity;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7860	sdg->sgc->max_capacity = capacity;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7861	}
				7862
				7863	void update_group_capacity(struct sched_domain *sd, int cpu)
				7864	{
				7865	struct sched_domain *child = sd->child;
				7866	struct sched_group group, sdg = sd->groups;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7867	unsigned long capacity, min_capacity, max_capacity;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7868	unsigned long interval;
				7869
				7870	interval = msecs_to_jiffies(sd->balance_interval);
				7871	interval = clamp(interval, 1UL, max_load_balance_interval);
				7872	sdg->sgc->next_update = jiffies + interval;
				7873
				7874	if (!child) {
				7875	update_cpu_capacity(sd, cpu);
				7876	return;
				7877	}
				7878
				7879	capacity = 0;
				7880	min_capacity = ULONG_MAX;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7881	max_capacity = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7882
				7883	if (child->flags & SD_OVERLAP) {
				7884	/*
				7885	* SD_OVERLAP domains cannot assume that child groups
				7886	* span the current group.
				7887	*/
				7888
				7889	for_each_cpu(cpu, sched_group_span(sdg)) {
				7890	struct sched_group_capacity *sgc;
				7891	struct rq *rq = cpu_rq(cpu);
				7892
				7893	/*
				7894	* build_sched_domains() -> init_sched_groups_capacity()
				7895	* gets here before we've attached the domains to the
				7896	* runqueues.
				7897	*
				7898	* Use capacity_of(), which is set irrespective of domains
				7899	* in update_cpu_capacity().
				7900	*
				7901	* This avoids capacity from being 0 and
				7902	* causing divide-by-zero issues on boot.
				7903	*/
				7904	if (unlikely(!rq->sd)) {
				7905	capacity += capacity_of(cpu);
				7906	} else {
				7907	sgc = rq->sd->groups->sgc;
				7908	capacity += sgc->capacity;
				7909	}
				7910
				7911	min_capacity = min(capacity, min_capacity);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7912	max_capacity = max(capacity, max_capacity);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7913	}
				7914	} else {
				7915	/*
				7916	* !SD_OVERLAP domains can assume that child groups
				7917	* span the current group.
				7918	*/
				7919
				7920	group = child->groups;
				7921	do {
				7922	struct sched_group_capacity *sgc = group->sgc;
				7923
				7924	capacity += sgc->capacity;
				7925	min_capacity = min(sgc->min_capacity, min_capacity);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7926	max_capacity = max(sgc->max_capacity, max_capacity);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7927	group = group->next;
				7928	} while (group != child->groups);
				7929	}
				7930
				7931	sdg->sgc->capacity = capacity;
				7932	sdg->sgc->min_capacity = min_capacity;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7933	sdg->sgc->max_capacity = max_capacity;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7934	}
				7935
				7936	/*
				7937	* Check whether the capacity of the rq has been noticeably reduced by side
				7938	* activity. The imbalance_pct is used for the threshold.
				7939	* Return true is the capacity is reduced
				7940	*/
				7941	static inline int
				7942	check_cpu_capacity(struct rq rq, struct sched_domain sd)
				7943	{
				7944	return ((rq->cpu_capacity * sd->imbalance_pct) <
				7945	(rq->cpu_capacity_orig * 100));
				7946	}
				7947
				7948	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7949	* Check whether a rq has a misfit task and if it looks like we can actually
				7950	* help that task: we can migrate the task to a CPU of higher capacity, or
				7951	* the task's current CPU is heavily pressured.
				7952	*/
				7953	static inline int check_misfit_status(struct rq rq, struct sched_domain sd)
				7954	{
				7955	return rq->misfit_task_load &&
				7956	(rq->cpu_capacity_orig < rq->rd->max_cpu_capacity \|\|
				7957	check_cpu_capacity(rq, sd));
				7958	}
				7959
				7960	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7961	* Group imbalance indicates (and tries to solve) the problem where balancing
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7962	* groups is inadequate due to ->cpus_ptr constraints.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7963	*
				7964	* Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
				7965	* cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
				7966	* Something like:
				7967	*
				7968	* { 0 1 2 3 } { 4 5 6 7 }
				7969	* * * * *
				7970	*
				7971	* If we were to balance group-wise we'd place two tasks in the first group and
				7972	* two tasks in the second group. Clearly this is undesired as it will overload
				7973	* cpu 3 and leave one of the CPUs in the second group unused.
				7974	*
				7975	* The current solution to this issue is detecting the skew in the first group
				7976	* by noticing the lower domain failed to reach balance and had difficulty
				7977	* moving tasks due to affinity constraints.
				7978	*
				7979	* When this is so detected; this group becomes a candidate for busiest; see
				7980	* update_sd_pick_busiest(). And calculate_imbalance() and
				7981	* find_busiest_group() avoid some of the usual balance conditions to allow it
				7982	* to create an effective group imbalance.
				7983	*
				7984	* This is a somewhat tricky proposition since the next run might not find the
				7985	* group imbalance and decide the groups need to be balanced again. A most
				7986	* subtle and fragile situation.
				7987	*/
				7988
				7989	static inline int sg_imbalanced(struct sched_group *group)
				7990	{
				7991	return group->sgc->imbalance;
				7992	}
				7993
				7994	/*
				7995	* group_has_capacity returns true if the group has spare capacity that could
				7996	* be used by some tasks.
				7997	* We consider that a group has spare capacity if the * number of task is
				7998	* smaller than the number of CPUs or if the utilization is lower than the
				7999	* available capacity for CFS tasks.
				8000	* For the latter, we use a threshold to stabilize the state, to take into
				8001	* account the variance of the tasks' load and to return true if the available
				8002	* capacity in meaningful for the load balancer.
				8003	* As an example, an available capacity of 1% can appear but it doesn't make
				8004	* any benefit for the load balance.
				8005	*/
				8006	static inline bool
				8007	group_has_capacity(struct lb_env env, struct sg_lb_stats sgs)
				8008	{
				8009	if (sgs->sum_nr_running < sgs->group_weight)
				8010	return true;
				8011
				8012	if ((sgs->group_capacity * 100) >
				8013	(sgs->group_util * env->sd->imbalance_pct))
				8014	return true;
				8015
				8016	return false;
				8017	}
				8018
				8019	/*
				8020	* group_is_overloaded returns true if the group has more tasks than it can
				8021	* handle.
				8022	* group_is_overloaded is not equals to !group_has_capacity because a group
				8023	* with the exact right number of tasks, has no more spare capacity but is not
				8024	* overloaded so both group_has_capacity and group_is_overloaded return
				8025	* false.
				8026	*/
				8027	static inline bool
				8028	group_is_overloaded(struct lb_env env, struct sg_lb_stats sgs)
				8029	{
				8030	if (sgs->sum_nr_running <= sgs->group_weight)
				8031	return false;
				8032
				8033	if ((sgs->group_capacity * 100) <
				8034	(sgs->group_util * env->sd->imbalance_pct))
				8035	return true;
				8036
				8037	return false;
				8038	}
				8039
				8040	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8041	* group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8042	* per-CPU capacity than sched_group ref.
				8043	*/
				8044	static inline bool
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8045	group_smaller_min_cpu_capacity(struct sched_group sg, struct sched_group ref)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8046	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8047	return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
				8048	}
				8049
				8050	/*
				8051	* group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
				8052	* per-CPU capacity_orig than sched_group ref.
				8053	*/
				8054	static inline bool
				8055	group_smaller_max_cpu_capacity(struct sched_group sg, struct sched_group ref)
				8056	{
				8057	return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8058	}
				8059
				8060	static inline enum
				8061	group_type group_classify(struct sched_group *group,
				8062	struct sg_lb_stats *sgs)
				8063	{
				8064	if (sgs->group_no_capacity)
				8065	return group_overloaded;
				8066
				8067	if (sg_imbalanced(group))
				8068	return group_imbalanced;
				8069
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8070	if (sgs->group_misfit_task_load)
				8071	return group_misfit_task;
				8072
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8073	return group_other;
				8074	}
				8075
				8076	static bool update_nohz_stats(struct rq *rq, bool force)
				8077	{
				8078	#ifdef CONFIG_NO_HZ_COMMON
				8079	unsigned int cpu = rq->cpu;
				8080
				8081	if (!rq->has_blocked_load)
				8082	return false;
				8083
				8084	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
				8085	return false;
				8086
				8087	if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
				8088	return true;
				8089
				8090	update_blocked_averages(cpu);
				8091
				8092	return rq->has_blocked_load;
				8093	#else
				8094	return false;
				8095	#endif
				8096	}
				8097
				8098	/**
				8099	* update_sg_lb_stats - Update sched_group's statistics for load balancing.
				8100	* @env: The load balancing environment.
				8101	* @group: sched_group whose statistics are to be updated.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8102	* @sgs: variable to hold the statistics for this group.
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8103	* @sg_status: Holds flag indicating the status of the sched_group
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8104	*/
				8105	static inline void update_sg_lb_stats(struct lb_env *env,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8106	struct sched_group *group,
				8107	struct sg_lb_stats *sgs,
				8108	int *sg_status)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8109	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8110	int i, nr_running;
				8111
				8112	memset(sgs, 0, sizeof(*sgs));
				8113
				8114	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
				8115	struct rq *rq = cpu_rq(i);
				8116
				8117	if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
				8118	env->flags \|= LBF_NOHZ_AGAIN;
				8119
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8120	sgs->group_load += cpu_runnable_load(rq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8121	sgs->group_util += cpu_util(i);
				8122	sgs->sum_nr_running += rq->cfs.h_nr_running;
				8123
				8124	nr_running = rq->nr_running;
				8125	if (nr_running > 1)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8126	*sg_status \|= SG_OVERLOAD;
				8127
				8128	if (cpu_overutilized(i))
				8129	*sg_status \|= SG_OVERUTILIZED;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8130
				8131	#ifdef CONFIG_NUMA_BALANCING
				8132	sgs->nr_numa_running += rq->nr_numa_running;
				8133	sgs->nr_preferred_running += rq->nr_preferred_running;
				8134	#endif
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8135	/*
				8136	* No need to call idle_cpu() if nr_running is not 0
				8137	*/
				8138	if (!nr_running && idle_cpu(i))
				8139	sgs->idle_cpus++;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8140
				8141	if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
				8142	sgs->group_misfit_task_load < rq->misfit_task_load) {
				8143	sgs->group_misfit_task_load = rq->misfit_task_load;
				8144	*sg_status \|= SG_OVERLOAD;
				8145	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8146	}
				8147
				8148	/* Adjust by relative CPU capacity of the group */
				8149	sgs->group_capacity = group->sgc->capacity;
				8150	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
				8151
				8152	if (sgs->sum_nr_running)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8153	sgs->load_per_task = sgs->group_load / sgs->sum_nr_running;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8154
				8155	sgs->group_weight = group->group_weight;
				8156
				8157	sgs->group_no_capacity = group_is_overloaded(env, sgs);
				8158	sgs->group_type = group_classify(group, sgs);
				8159	}
				8160
				8161	/**
				8162	* update_sd_pick_busiest - return 1 on busiest group
				8163	* @env: The load balancing environment.
				8164	* @sds: sched_domain statistics
				8165	* @sg: sched_group candidate to be checked for being the busiest
				8166	* @sgs: sched_group statistics
				8167	*
				8168	* Determine if @sg is a busier group than the previously selected
				8169	* busiest group.
				8170	*
				8171	* Return: %true if @sg is a busier group than the previously selected
				8172	* busiest group. %false otherwise.
				8173	*/
				8174	static bool update_sd_pick_busiest(struct lb_env *env,
				8175	struct sd_lb_stats *sds,
				8176	struct sched_group *sg,
				8177	struct sg_lb_stats *sgs)
				8178	{
				8179	struct sg_lb_stats *busiest = &sds->busiest_stat;
				8180
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8181	/*
				8182	* Don't try to pull misfit tasks we can't help.
				8183	* We can use max_capacity here as reduction in capacity on some
				8184	* CPUs in the group should either be possible to resolve
				8185	* internally or be covered by avg_load imbalance (eventually).
				8186	*/
				8187	if (sgs->group_type == group_misfit_task &&
				8188	(!group_smaller_max_cpu_capacity(sg, sds->local) \|\|
				8189	!group_has_capacity(env, &sds->local_stat)))
				8190	return false;
				8191
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8192	if (sgs->group_type > busiest->group_type)
				8193	return true;
				8194
				8195	if (sgs->group_type < busiest->group_type)
				8196	return false;
				8197
				8198	if (sgs->avg_load <= busiest->avg_load)
				8199	return false;
				8200
				8201	if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
				8202	goto asym_packing;
				8203
				8204	/*
				8205	* Candidate sg has no more than one task per CPU and
				8206	* has higher per-CPU capacity. Migrating tasks to less
				8207	* capable CPUs may harm throughput. Maximize throughput,
				8208	* power/energy consequences are not considered.
				8209	*/
				8210	if (sgs->sum_nr_running <= sgs->group_weight &&
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8211	group_smaller_min_cpu_capacity(sds->local, sg))
				8212	return false;
				8213
				8214	/*
				8215	* If we have more than one misfit sg go with the biggest misfit.
				8216	*/
				8217	if (sgs->group_type == group_misfit_task &&
				8218	sgs->group_misfit_task_load < busiest->group_misfit_task_load)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8219	return false;
				8220
				8221	asym_packing:
				8222	/* This is the busiest node in its class. */
				8223	if (!(env->sd->flags & SD_ASYM_PACKING))
				8224	return true;
				8225
				8226	/* No ASYM_PACKING if target CPU is already busy */
				8227	if (env->idle == CPU_NOT_IDLE)
				8228	return true;
				8229	/*
				8230	* ASYM_PACKING needs to move all the work to the highest
				8231	* prority CPUs in the group, therefore mark all groups
				8232	* of lower priority than ourself as busy.
				8233	*/
				8234	if (sgs->sum_nr_running &&
				8235	sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
				8236	if (!sds->busiest)
				8237	return true;
				8238
				8239	/* Prefer to move from lowest priority CPU's work */
				8240	if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
				8241	sg->asym_prefer_cpu))
				8242	return true;
				8243	}
				8244
				8245	return false;
				8246	}
				8247
				8248	#ifdef CONFIG_NUMA_BALANCING
				8249	static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
				8250	{
				8251	if (sgs->sum_nr_running > sgs->nr_numa_running)
				8252	return regular;
				8253	if (sgs->sum_nr_running > sgs->nr_preferred_running)
				8254	return remote;
				8255	return all;
				8256	}
				8257
				8258	static inline enum fbq_type fbq_classify_rq(struct rq *rq)
				8259	{
				8260	if (rq->nr_running > rq->nr_numa_running)
				8261	return regular;
				8262	if (rq->nr_running > rq->nr_preferred_running)
				8263	return remote;
				8264	return all;
				8265	}
				8266	#else
				8267	static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
				8268	{
				8269	return all;
				8270	}
				8271
				8272	static inline enum fbq_type fbq_classify_rq(struct rq *rq)
				8273	{
				8274	return regular;
				8275	}
				8276	#endif /* CONFIG_NUMA_BALANCING */
				8277
				8278	/**
				8279	* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
				8280	* @env: The load balancing environment.
				8281	* @sds: variable to hold the statistics for this sched_domain.
				8282	*/
				8283	static inline void update_sd_lb_stats(struct lb_env env, struct sd_lb_stats sds)
				8284	{
				8285	struct sched_domain *child = env->sd->child;
				8286	struct sched_group *sg = env->sd->groups;
				8287	struct sg_lb_stats *local = &sds->local_stat;
				8288	struct sg_lb_stats tmp_sgs;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8289	bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
				8290	int sg_status = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8291
				8292	#ifdef CONFIG_NO_HZ_COMMON
				8293	if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
				8294	env->flags \|= LBF_NOHZ_STATS;
				8295	#endif
				8296
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8297	do {
				8298	struct sg_lb_stats *sgs = &tmp_sgs;
				8299	int local_group;
				8300
				8301	local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
				8302	if (local_group) {
				8303	sds->local = sg;
				8304	sgs = local;
				8305
				8306	if (env->idle != CPU_NEWLY_IDLE \|\|
				8307	time_after_eq(jiffies, sg->sgc->next_update))
				8308	update_group_capacity(env->sd, env->dst_cpu);
				8309	}
				8310
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8311	update_sg_lb_stats(env, sg, sgs, &sg_status);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8312
				8313	if (local_group)
				8314	goto next_group;
				8315
				8316	/*
				8317	* In case the child domain prefers tasks go to siblings
				8318	* first, lower the sg capacity so that we'll try
				8319	* and move all the excess tasks away. We lower the capacity
				8320	* of a group only if the local group has the capacity to fit
				8321	* these excess tasks. The extra check prevents the case where
				8322	* you always pull from the heaviest group when it is already
				8323	* under-utilized (possible with a large weight task outweighs
				8324	* the tasks on the system).
				8325	*/
				8326	if (prefer_sibling && sds->local &&
				8327	group_has_capacity(env, local) &&
				8328	(sgs->sum_nr_running > local->sum_nr_running + 1)) {
				8329	sgs->group_no_capacity = 1;
				8330	sgs->group_type = group_classify(sg, sgs);
				8331	}
				8332
				8333	if (update_sd_pick_busiest(env, sds, sg, sgs)) {
				8334	sds->busiest = sg;
				8335	sds->busiest_stat = *sgs;
				8336	}
				8337
				8338	next_group:
				8339	/* Now, start updating sd_lb_stats */
				8340	sds->total_running += sgs->sum_nr_running;
				8341	sds->total_load += sgs->group_load;
				8342	sds->total_capacity += sgs->group_capacity;
				8343
				8344	sg = sg->next;
				8345	} while (sg != env->sd->groups);
				8346
				8347	#ifdef CONFIG_NO_HZ_COMMON
				8348	if ((env->flags & LBF_NOHZ_AGAIN) &&
				8349	cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
				8350
				8351	WRITE_ONCE(nohz.next_blocked,
				8352	jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
				8353	}
				8354	#endif
				8355
				8356	if (env->sd->flags & SD_NUMA)
				8357	env->fbq_type = fbq_classify_group(&sds->busiest_stat);
				8358
				8359	if (!env->sd->parent) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8360	struct root_domain *rd = env->dst_rq->rd;
				8361
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8362	/* update overload indicator if we are at root domain */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8363	WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
				8364
				8365	/* Update over-utilization (tipping point, U >= 0) indicator */
				8366	WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
				8367	trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
				8368	} else if (sg_status & SG_OVERUTILIZED) {
				8369	struct root_domain *rd = env->dst_rq->rd;
				8370
				8371	WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
				8372	trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8373	}
				8374	}
				8375
				8376	/**
				8377	* check_asym_packing - Check to see if the group is packed into the
				8378	* sched domain.
				8379	*
				8380	* This is primarily intended to used at the sibling level. Some
				8381	* cores like POWER7 prefer to use lower numbered SMT threads. In the
				8382	* case of POWER7, it can move to lower SMT modes only when higher
				8383	* threads are idle. When in lower SMT modes, the threads will
				8384	* perform better since they share less core resources. Hence when we
				8385	* have idle threads, we want them to be the higher ones.
				8386	*
				8387	* This packing function is run on idle threads. It checks to see if
				8388	* the busiest CPU in this domain (core in the P7 case) has a higher
				8389	* CPU number than the packing function is being run on. Here we are
				8390	* assuming lower CPU number will be equivalent to lower a SMT thread
				8391	* number.
				8392	*
				8393	* Return: 1 when packing is required and a task should be moved to
				8394	* this CPU. The amount of the imbalance is returned in env->imbalance.
				8395	*
				8396	* @env: The load balancing environment.
				8397	* @sds: Statistics of the sched_domain which is to be packed
				8398	*/
				8399	static int check_asym_packing(struct lb_env env, struct sd_lb_stats sds)
				8400	{
				8401	int busiest_cpu;
				8402
				8403	if (!(env->sd->flags & SD_ASYM_PACKING))
				8404	return 0;
				8405
				8406	if (env->idle == CPU_NOT_IDLE)
				8407	return 0;
				8408
				8409	if (!sds->busiest)
				8410	return 0;
				8411
				8412	busiest_cpu = sds->busiest->asym_prefer_cpu;
				8413	if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
				8414	return 0;
				8415
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8416	env->imbalance = sds->busiest_stat.group_load;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8417
				8418	return 1;
				8419	}
				8420
				8421	/**
				8422	* fix_small_imbalance - Calculate the minor imbalance that exists
				8423	* amongst the groups of a sched_domain, during
				8424	* load balancing.
				8425	* @env: The load balancing environment.
				8426	* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
				8427	*/
				8428	static inline
				8429	void fix_small_imbalance(struct lb_env env, struct sd_lb_stats sds)
				8430	{
				8431	unsigned long tmp, capa_now = 0, capa_move = 0;
				8432	unsigned int imbn = 2;
				8433	unsigned long scaled_busy_load_per_task;
				8434	struct sg_lb_stats local, busiest;
				8435
				8436	local = &sds->local_stat;
				8437	busiest = &sds->busiest_stat;
				8438
				8439	if (!local->sum_nr_running)
				8440	local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
				8441	else if (busiest->load_per_task > local->load_per_task)
				8442	imbn = 1;
				8443
				8444	scaled_busy_load_per_task =
				8445	(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
				8446	busiest->group_capacity;
				8447
				8448	if (busiest->avg_load + scaled_busy_load_per_task >=
				8449	local->avg_load + (scaled_busy_load_per_task * imbn)) {
				8450	env->imbalance = busiest->load_per_task;
				8451	return;
				8452	}
				8453
				8454	/*
				8455	* OK, we don't have enough imbalance to justify moving tasks,
				8456	* however we may be able to increase total CPU capacity used by
				8457	* moving them.
				8458	*/
				8459
				8460	capa_now += busiest->group_capacity *
				8461	min(busiest->load_per_task, busiest->avg_load);
				8462	capa_now += local->group_capacity *
				8463	min(local->load_per_task, local->avg_load);
				8464	capa_now /= SCHED_CAPACITY_SCALE;
				8465
				8466	/* Amount of load we'd subtract */
				8467	if (busiest->avg_load > scaled_busy_load_per_task) {
				8468	capa_move += busiest->group_capacity *
				8469	min(busiest->load_per_task,
				8470	busiest->avg_load - scaled_busy_load_per_task);
				8471	}
				8472
				8473	/* Amount of load we'd add */
				8474	if (busiest->avg_load * busiest->group_capacity <
				8475	busiest->load_per_task * SCHED_CAPACITY_SCALE) {
				8476	tmp = (busiest->avg_load * busiest->group_capacity) /
				8477	local->group_capacity;
				8478	} else {
				8479	tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
				8480	local->group_capacity;
				8481	}
				8482	capa_move += local->group_capacity *
				8483	min(local->load_per_task, local->avg_load + tmp);
				8484	capa_move /= SCHED_CAPACITY_SCALE;
				8485
				8486	/* Move if we gain throughput */
				8487	if (capa_move > capa_now)
				8488	env->imbalance = busiest->load_per_task;
				8489	}
				8490
				8491	/**
				8492	* calculate_imbalance - Calculate the amount of imbalance present within the
				8493	* groups of a given sched_domain during load balance.
				8494	* @env: load balance environment
				8495	* @sds: statistics of the sched_domain whose imbalance is to be calculated.
				8496	*/
				8497	static inline void calculate_imbalance(struct lb_env env, struct sd_lb_stats sds)
				8498	{
				8499	unsigned long max_pull, load_above_capacity = ~0UL;
				8500	struct sg_lb_stats local, busiest;
				8501
				8502	local = &sds->local_stat;
				8503	busiest = &sds->busiest_stat;
				8504
				8505	if (busiest->group_type == group_imbalanced) {
				8506	/*
				8507	* In the group_imb case we cannot rely on group-wide averages
				8508	* to ensure CPU-load equilibrium, look at wider averages. XXX
				8509	*/
				8510	busiest->load_per_task =
				8511	min(busiest->load_per_task, sds->avg_load);
				8512	}
				8513
				8514	/*
				8515	* Avg load of busiest sg can be less and avg load of local sg can
				8516	* be greater than avg load across all sgs of sd because avg load
				8517	* factors in sg capacity and sgs with smaller group_type are
				8518	* skipped when updating the busiest sg:
				8519	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8520	if (busiest->group_type != group_misfit_task &&
				8521	(busiest->avg_load <= sds->avg_load \|\|
				8522	local->avg_load >= sds->avg_load)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8523	env->imbalance = 0;
				8524	return fix_small_imbalance(env, sds);
				8525	}
				8526
				8527	/*
				8528	* If there aren't any idle CPUs, avoid creating some.
				8529	*/
				8530	if (busiest->group_type == group_overloaded &&
				8531	local->group_type == group_overloaded) {
				8532	load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
				8533	if (load_above_capacity > busiest->group_capacity) {
				8534	load_above_capacity -= busiest->group_capacity;
				8535	load_above_capacity *= scale_load_down(NICE_0_LOAD);
				8536	load_above_capacity /= busiest->group_capacity;
				8537	} else
				8538	load_above_capacity = ~0UL;
				8539	}
				8540
				8541	/*
				8542	* We're trying to get all the CPUs to the average_load, so we don't
				8543	* want to push ourselves above the average load, nor do we wish to
				8544	* reduce the max loaded CPU below the average load. At the same time,
				8545	* we also don't want to reduce the group load below the group
				8546	* capacity. Thus we look for the minimum possible imbalance.
				8547	*/
				8548	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
				8549
				8550	/* How much load to actually move to equalise the imbalance */
				8551	env->imbalance = min(
				8552	max_pull * busiest->group_capacity,
				8553	(sds->avg_load - local->avg_load) * local->group_capacity
				8554	) / SCHED_CAPACITY_SCALE;
				8555
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8556	/* Boost imbalance to allow misfit task to be balanced. */
				8557	if (busiest->group_type == group_misfit_task) {
				8558	env->imbalance = max_t(long, env->imbalance,
				8559	busiest->group_misfit_task_load);
				8560	}
				8561
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8562	/*
				8563	* if *imbalance is less than the average load per runnable task
				8564	* there is no guarantee that any tasks will be moved so we'll have
				8565	* a think about bumping its value to force at least one task to be
				8566	* moved
				8567	*/
				8568	if (env->imbalance < busiest->load_per_task)
				8569	return fix_small_imbalance(env, sds);
				8570	}
				8571
				8572	/***** find_busiest_group() helpers end here *******************/
				8573
				8574	/**
				8575	* find_busiest_group - Returns the busiest group within the sched_domain
				8576	* if there is an imbalance.
				8577	*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8578	* Also calculates the amount of runnable load which should be moved
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8579	* to restore balance.
				8580	*
				8581	* @env: The load balancing environment.
				8582	*
				8583	* Return: - The busiest group if imbalance exists.
				8584	*/
				8585	static struct sched_group find_busiest_group(struct lb_env env)
				8586	{
				8587	struct sg_lb_stats local, busiest;
				8588	struct sd_lb_stats sds;
				8589
				8590	init_sd_lb_stats(&sds);
				8591
				8592	/*
				8593	* Compute the various statistics relavent for load balancing at
				8594	* this level.
				8595	*/
				8596	update_sd_lb_stats(env, &sds);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8597
				8598	if (sched_energy_enabled()) {
				8599	struct root_domain *rd = env->dst_rq->rd;
				8600
				8601	if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
				8602	goto out_balanced;
				8603	}
				8604
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8605	local = &sds.local_stat;
				8606	busiest = &sds.busiest_stat;
				8607
				8608	/* ASYM feature bypasses nice load balance check */
				8609	if (check_asym_packing(env, &sds))
				8610	return sds.busiest;
				8611
				8612	/* There is no busy sibling group to pull tasks from */
				8613	if (!sds.busiest \|\| busiest->sum_nr_running == 0)
				8614	goto out_balanced;
				8615
				8616	/* XXX broken for overlapping NUMA groups */
				8617	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
				8618	/ sds.total_capacity;
				8619
				8620	/*
				8621	* If the busiest group is imbalanced the below checks don't
				8622	* work because they assume all things are equal, which typically
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8623	* isn't true due to cpus_ptr constraints and the like.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8624	*/
				8625	if (busiest->group_type == group_imbalanced)
				8626	goto force_balance;
				8627
				8628	/*
				8629	* When dst_cpu is idle, prevent SMP nice and/or asymmetric group
				8630	* capacities from resulting in underutilization due to avg_load.
				8631	*/
				8632	if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
				8633	busiest->group_no_capacity)
				8634	goto force_balance;
				8635
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8636	/* Misfit tasks should be dealt with regardless of the avg load */
				8637	if (busiest->group_type == group_misfit_task)
				8638	goto force_balance;
				8639
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8640	/*
				8641	* If the local group is busier than the selected busiest group
				8642	* don't try and pull any tasks.
				8643	*/
				8644	if (local->avg_load >= busiest->avg_load)
				8645	goto out_balanced;
				8646
				8647	/*
				8648	* Don't pull any tasks if this group is already above the domain
				8649	* average load.
				8650	*/
				8651	if (local->avg_load >= sds.avg_load)
				8652	goto out_balanced;
				8653
				8654	if (env->idle == CPU_IDLE) {
				8655	/*
				8656	* This CPU is idle. If the busiest group is not overloaded
				8657	* and there is no imbalance between this and busiest group
				8658	* wrt idle CPUs, it is balanced. The imbalance becomes
				8659	* significant if the diff is greater than 1 otherwise we
				8660	* might end up to just move the imbalance on another group
				8661	*/
				8662	if ((busiest->group_type != group_overloaded) &&
				8663	(local->idle_cpus <= (busiest->idle_cpus + 1)))
				8664	goto out_balanced;
				8665	} else {
				8666	/*
				8667	* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
				8668	* imbalance_pct to be conservative.
				8669	*/
				8670	if (100 * busiest->avg_load <=
				8671	env->sd->imbalance_pct * local->avg_load)
				8672	goto out_balanced;
				8673	}
				8674
				8675	force_balance:
				8676	/* Looks like there is an imbalance. Compute it */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8677	env->src_grp_type = busiest->group_type;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8678	calculate_imbalance(env, &sds);
				8679	return env->imbalance ? sds.busiest : NULL;
				8680
				8681	out_balanced:
				8682	env->imbalance = 0;
				8683	return NULL;
				8684	}
				8685
				8686	/*
				8687	* find_busiest_queue - find the busiest runqueue among the CPUs in the group.
				8688	*/
				8689	static struct rq find_busiest_queue(struct lb_env env,
				8690	struct sched_group *group)
				8691	{
				8692	struct rq busiest = NULL, rq;
				8693	unsigned long busiest_load = 0, busiest_capacity = 1;
				8694	int i;
				8695
				8696	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8697	unsigned long capacity, load;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8698	enum fbq_type rt;
				8699
				8700	rq = cpu_rq(i);
				8701	rt = fbq_classify_rq(rq);
				8702
				8703	/*
				8704	* We classify groups/runqueues into three groups:
				8705	* - regular: there are !numa tasks
				8706	* - remote: there are numa tasks that run on the 'wrong' node
				8707	* - all: there is no distinction
				8708	*
				8709	* In order to avoid migrating ideally placed numa tasks,
				8710	* ignore those when there's better options.
				8711	*
				8712	* If we ignore the actual busiest queue to migrate another
				8713	* task, the next balance pass can still reduce the busiest
				8714	* queue by moving tasks around inside the node.
				8715	*
				8716	* If we cannot move enough load due to this classification
				8717	* the next pass will adjust the group classification and
				8718	* allow migration of more tasks.
				8719	*
				8720	* Both cases only affect the total convergence complexity.
				8721	*/
				8722	if (rt > env->fbq_type)
				8723	continue;
				8724
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8725	/*
				8726	* For ASYM_CPUCAPACITY domains with misfit tasks we simply
				8727	* seek the "biggest" misfit task.
				8728	*/
				8729	if (env->src_grp_type == group_misfit_task) {
				8730	if (rq->misfit_task_load > busiest_load) {
				8731	busiest_load = rq->misfit_task_load;
				8732	busiest = rq;
				8733	}
				8734
				8735	continue;
				8736	}
				8737
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8738	capacity = capacity_of(i);
				8739
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8740	/*
				8741	* For ASYM_CPUCAPACITY domains, don't pick a CPU that could
				8742	* eventually lead to active_balancing high->low capacity.
				8743	* Higher per-CPU capacity is considered better than balancing
				8744	* average load.
				8745	*/
				8746	if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
				8747	capacity_of(env->dst_cpu) < capacity &&
				8748	rq->nr_running == 1)
				8749	continue;
				8750
				8751	load = cpu_runnable_load(rq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8752
				8753	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8754	* When comparing with imbalance, use cpu_runnable_load()
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8755	* which is not scaled with the CPU capacity.
				8756	*/
				8757
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8758	if (rq->nr_running == 1 && load > env->imbalance &&
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8759	!check_cpu_capacity(rq, env->sd))
				8760	continue;
				8761
				8762	/*
				8763	* For the load comparisons with the other CPU's, consider
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8764	* the cpu_runnable_load() scaled with the CPU capacity, so
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8765	* that the load can be moved away from the CPU that is
				8766	* potentially running at a lower capacity.
				8767	*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8768	* Thus we're looking for max(load_i / capacity_i), crosswise
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8769	* multiplication to rid ourselves of the division works out
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8770	* to: load_i * capacity_j > load_j * capacity_i; where j is
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8771	* our previous maximum.
				8772	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8773	if (load * busiest_capacity > busiest_load * capacity) {
				8774	busiest_load = load;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8775	busiest_capacity = capacity;
				8776	busiest = rq;
				8777	}
				8778	}
				8779
				8780	return busiest;
				8781	}
				8782
				8783	/*
				8784	* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
				8785	* so long as it is large enough.
				8786	*/
				8787	#define MAX_PINNED_INTERVAL 512
				8788
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8789	static inline bool
				8790	asym_active_balance(struct lb_env *env)
				8791	{
				8792	/*
				8793	* ASYM_PACKING needs to force migrate tasks from busy but
				8794	* lower priority CPUs in order to pack all tasks in the
				8795	* highest priority CPUs.
				8796	*/
				8797	return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
				8798	sched_asym_prefer(env->dst_cpu, env->src_cpu);
				8799	}
				8800
				8801	static inline bool
				8802	voluntary_active_balance(struct lb_env *env)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8803	{
				8804	struct sched_domain *sd = env->sd;
				8805
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8806	if (asym_active_balance(env))
				8807	return 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8808
				8809	/*
				8810	* The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
				8811	* It's worth migrating the task if the src_cpu's capacity is reduced
				8812	* because of other sched_class or IRQs if more capacity stays
				8813	* available on dst_cpu.
				8814	*/
				8815	if ((env->idle != CPU_NOT_IDLE) &&
				8816	(env->src_rq->cfs.h_nr_running == 1)) {
				8817	if ((check_cpu_capacity(env->src_rq, sd)) &&
				8818	(capacity_of(env->src_cpu)sd->imbalance_pct < capacity_of(env->dst_cpu)100))
				8819	return 1;
				8820	}
				8821
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8822	if (env->src_grp_type == group_misfit_task)
				8823	return 1;
				8824
				8825	return 0;
				8826	}
				8827
				8828	static int need_active_balance(struct lb_env *env)
				8829	{
				8830	struct sched_domain *sd = env->sd;
				8831
				8832	if (voluntary_active_balance(env))
				8833	return 1;
				8834
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8835	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
				8836	}
				8837
				8838	static int active_load_balance_cpu_stop(void *data);
				8839
				8840	static int should_we_balance(struct lb_env *env)
				8841	{
				8842	struct sched_group *sg = env->sd->groups;
				8843	int cpu, balance_cpu = -1;
				8844
				8845	/*
				8846	* Ensure the balancing environment is consistent; can happen
				8847	* when the softirq triggers 'during' hotplug.
				8848	*/
				8849	if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
				8850	return 0;
				8851
				8852	/*
				8853	* In the newly idle case, we will allow all the CPUs
				8854	* to do the newly idle load balance.
				8855	*/
				8856	if (env->idle == CPU_NEWLY_IDLE)
				8857	return 1;
				8858
				8859	/* Try to find first idle CPU */
				8860	for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
				8861	if (!idle_cpu(cpu))
				8862	continue;
				8863
				8864	balance_cpu = cpu;
				8865	break;
				8866	}
				8867
				8868	if (balance_cpu == -1)
				8869	balance_cpu = group_balance_cpu(sg);
				8870
				8871	/*
				8872	* First idle CPU or the first CPU(busiest) in this sched group
				8873	* is eligible for doing load balancing at this and above domains.
				8874	*/
				8875	return balance_cpu == env->dst_cpu;
				8876	}
				8877
				8878	/*
				8879	* Check this_cpu to ensure it is balanced within domain. Attempt to move
				8880	* tasks if there is an imbalance.
				8881	*/
				8882	static int load_balance(int this_cpu, struct rq *this_rq,
				8883	struct sched_domain *sd, enum cpu_idle_type idle,
				8884	int *continue_balancing)
				8885	{
				8886	int ld_moved, cur_ld_moved, active_balance = 0;
				8887	struct sched_domain *sd_parent = sd->parent;
				8888	struct sched_group *group;
				8889	struct rq *busiest;
				8890	struct rq_flags rf;
				8891	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
				8892
				8893	struct lb_env env = {
				8894	.sd = sd,
				8895	.dst_cpu = this_cpu,
				8896	.dst_rq = this_rq,
				8897	.dst_grpmask = sched_group_span(sd->groups),
				8898	.idle = idle,
				8899	.loop_break = sched_nr_migrate_break,
				8900	.cpus = cpus,
				8901	.fbq_type = all,
				8902	.tasks = LIST_HEAD_INIT(env.tasks),
				8903	};
				8904
				8905	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
				8906
				8907	schedstat_inc(sd->lb_count[idle]);
				8908
				8909	redo:
				8910	if (!should_we_balance(&env)) {
				8911	*continue_balancing = 0;
				8912	goto out_balanced;
				8913	}
				8914
				8915	group = find_busiest_group(&env);
				8916	if (!group) {
				8917	schedstat_inc(sd->lb_nobusyg[idle]);
				8918	goto out_balanced;
				8919	}
				8920
				8921	busiest = find_busiest_queue(&env, group);
				8922	if (!busiest) {
				8923	schedstat_inc(sd->lb_nobusyq[idle]);
				8924	goto out_balanced;
				8925	}
				8926
				8927	BUG_ON(busiest == env.dst_rq);
				8928
				8929	schedstat_add(sd->lb_imbalance[idle], env.imbalance);
				8930
				8931	env.src_cpu = busiest->cpu;
				8932	env.src_rq = busiest;
				8933
				8934	ld_moved = 0;
				8935	if (busiest->nr_running > 1) {
				8936	/*
				8937	* Attempt to move tasks. If find_busiest_group has found
				8938	* an imbalance but busiest->nr_running <= 1, the group is
				8939	* still unbalanced. ld_moved simply stays zero, so it is
				8940	* correctly treated as an imbalance.
				8941	*/
				8942	env.flags \|= LBF_ALL_PINNED;
				8943	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
				8944
				8945	more_balance:
				8946	rq_lock_irqsave(busiest, &rf);
				8947	update_rq_clock(busiest);
				8948
				8949	/*
				8950	* cur_ld_moved - load moved in current iteration
				8951	* ld_moved - cumulative load moved across iterations
				8952	*/
				8953	cur_ld_moved = detach_tasks(&env);
				8954
				8955	/*
				8956	* We've detached some tasks from busiest_rq. Every
				8957	* task is masked "TASK_ON_RQ_MIGRATING", so we can safely
				8958	* unlock busiest->lock, and we are able to be sure
				8959	* that nobody can manipulate the tasks in parallel.
				8960	* See task_rq_lock() family for the details.
				8961	*/
				8962
				8963	rq_unlock(busiest, &rf);
				8964
				8965	if (cur_ld_moved) {
				8966	attach_tasks(&env);
				8967	ld_moved += cur_ld_moved;
				8968	}
				8969
				8970	local_irq_restore(rf.flags);
				8971
				8972	if (env.flags & LBF_NEED_BREAK) {
				8973	env.flags &= ~LBF_NEED_BREAK;
				8974	goto more_balance;
				8975	}
				8976
				8977	/*
				8978	* Revisit (affine) tasks on src_cpu that couldn't be moved to
				8979	* us and move them to an alternate dst_cpu in our sched_group
				8980	* where they can run. The upper limit on how many times we
				8981	* iterate on same src_cpu is dependent on number of CPUs in our
				8982	* sched_group.
				8983	*
				8984	* This changes load balance semantics a bit on who can move
				8985	* load to a given_cpu. In addition to the given_cpu itself
				8986	* (or a ilb_cpu acting on its behalf where given_cpu is
				8987	* nohz-idle), we now have balance_cpu in a position to move
				8988	* load to given_cpu. In rare situations, this may cause
				8989	* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
				8990	* _independently_ and at _same_ time to move some load to
				8991	* given_cpu) causing excess load to be moved to given_cpu.
				8992	* This however should not happen so much in practice and
				8993	* moreover subsequent load balance cycles should correct the
				8994	* excess load moved.
				8995	*/
				8996	if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
				8997
				8998	/* Prevent to re-select dst_cpu via env's CPUs */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8999	__cpumask_clear_cpu(env.dst_cpu, env.cpus);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9000
				9001	env.dst_rq = cpu_rq(env.new_dst_cpu);
				9002	env.dst_cpu = env.new_dst_cpu;
				9003	env.flags &= ~LBF_DST_PINNED;
				9004	env.loop = 0;
				9005	env.loop_break = sched_nr_migrate_break;
				9006
				9007	/*
				9008	* Go back to "more_balance" rather than "redo" since we
				9009	* need to continue with same src_cpu.
				9010	*/
				9011	goto more_balance;
				9012	}
				9013
				9014	/*
				9015	* We failed to reach balance because of affinity.
				9016	*/
				9017	if (sd_parent) {
				9018	int *group_imbalance = &sd_parent->groups->sgc->imbalance;
				9019
				9020	if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
				9021	*group_imbalance = 1;
				9022	}
				9023
				9024	/* All tasks on this runqueue were pinned by CPU affinity */
				9025	if (unlikely(env.flags & LBF_ALL_PINNED)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9026	__cpumask_clear_cpu(cpu_of(busiest), cpus);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9027	/*
				9028	* Attempting to continue load balancing at the current
				9029	* sched_domain level only makes sense if there are
				9030	* active CPUs remaining as possible busiest CPUs to
				9031	* pull load from which are not contained within the
				9032	* destination group that is receiving any migrated
				9033	* load.
				9034	*/
				9035	if (!cpumask_subset(cpus, env.dst_grpmask)) {
				9036	env.loop = 0;
				9037	env.loop_break = sched_nr_migrate_break;
				9038	goto redo;
				9039	}
				9040	goto out_all_pinned;
				9041	}
				9042	}
				9043
				9044	if (!ld_moved) {
				9045	schedstat_inc(sd->lb_failed[idle]);
				9046	/*
				9047	* Increment the failure counter only on periodic balance.
				9048	* We do not want newidle balance, which can be very
				9049	* frequent, pollute the failure counter causing
				9050	* excessive cache_hot migrations and active balances.
				9051	*/
				9052	if (idle != CPU_NEWLY_IDLE)
				9053	sd->nr_balance_failed++;
				9054
				9055	if (need_active_balance(&env)) {
				9056	unsigned long flags;
				9057
				9058	raw_spin_lock_irqsave(&busiest->lock, flags);
				9059
				9060	/*
				9061	* Don't kick the active_load_balance_cpu_stop,
				9062	* if the curr task on busiest CPU can't be
				9063	* moved to this_cpu:
				9064	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9065	if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9066	raw_spin_unlock_irqrestore(&busiest->lock,
				9067	flags);
				9068	env.flags \|= LBF_ALL_PINNED;
				9069	goto out_one_pinned;
				9070	}
				9071
				9072	/*
				9073	* ->active_balance synchronizes accesses to
				9074	* ->active_balance_work. Once set, it's cleared
				9075	* only after active load balance is finished.
				9076	*/
				9077	if (!busiest->active_balance) {
				9078	busiest->active_balance = 1;
				9079	busiest->push_cpu = this_cpu;
				9080	active_balance = 1;
				9081	}
				9082	raw_spin_unlock_irqrestore(&busiest->lock, flags);
				9083
				9084	if (active_balance) {
				9085	stop_one_cpu_nowait(cpu_of(busiest),
				9086	active_load_balance_cpu_stop, busiest,
				9087	&busiest->active_balance_work);
				9088	}
				9089
				9090	/* We've kicked active balancing, force task migration. */
				9091	sd->nr_balance_failed = sd->cache_nice_tries+1;
				9092	}
				9093	} else
				9094	sd->nr_balance_failed = 0;
				9095
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9096	if (likely(!active_balance) \|\| voluntary_active_balance(&env)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9097	/* We were unbalanced, so reset the balancing interval */
				9098	sd->balance_interval = sd->min_interval;
				9099	} else {
				9100	/*
				9101	* If we've begun active balancing, start to back off. This
				9102	* case may not be covered by the all_pinned logic if there
				9103	* is only 1 task on the busy runqueue (because we don't call
				9104	* detach_tasks).
				9105	*/
				9106	if (sd->balance_interval < sd->max_interval)
				9107	sd->balance_interval *= 2;
				9108	}
				9109
				9110	goto out;
				9111
				9112	out_balanced:
				9113	/*
				9114	* We reach balance although we may have faced some affinity
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9115	* constraints. Clear the imbalance flag only if other tasks got
				9116	* a chance to move and fix the imbalance.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9117	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9118	if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9119	int *group_imbalance = &sd_parent->groups->sgc->imbalance;
				9120
				9121	if (*group_imbalance)
				9122	*group_imbalance = 0;
				9123	}
				9124
				9125	out_all_pinned:
				9126	/*
				9127	* We reach balance because all tasks are pinned at this level so
				9128	* we can't migrate them. Let the imbalance flag set so parent level
				9129	* can try to migrate them.
				9130	*/
				9131	schedstat_inc(sd->lb_balanced[idle]);
				9132
				9133	sd->nr_balance_failed = 0;
				9134
				9135	out_one_pinned:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9136	ld_moved = 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9137
				9138	/*
				9139	* newidle_balance() disregards balance intervals, so we could
				9140	* repeatedly reach this code, which would lead to balance_interval
				9141	* skyrocketing in a short amount of time. Skip the balance_interval
				9142	* increase logic to avoid that.
				9143	*/
				9144	if (env.idle == CPU_NEWLY_IDLE)
				9145	goto out;
				9146
				9147	/* tune up the balancing interval */
				9148	if ((env.flags & LBF_ALL_PINNED &&
				9149	sd->balance_interval < MAX_PINNED_INTERVAL) \|\|
				9150	sd->balance_interval < sd->max_interval)
				9151	sd->balance_interval *= 2;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9152	out:
				9153	return ld_moved;
				9154	}
				9155
				9156	static inline unsigned long
				9157	get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
				9158	{
				9159	unsigned long interval = sd->balance_interval;
				9160
				9161	if (cpu_busy)
				9162	interval *= sd->busy_factor;
				9163
				9164	/* scale ms to jiffies */
				9165	interval = msecs_to_jiffies(interval);
				9166	interval = clamp(interval, 1UL, max_load_balance_interval);
				9167
				9168	return interval;
				9169	}
				9170
				9171	static inline void
				9172	update_next_balance(struct sched_domain sd, unsigned long next_balance)
				9173	{
				9174	unsigned long interval, next;
				9175
				9176	/* used by idle balance, so cpu_busy = 0 */
				9177	interval = get_sd_balance_interval(sd, 0);
				9178	next = sd->last_balance + interval;
				9179
				9180	if (time_after(*next_balance, next))
				9181	*next_balance = next;
				9182	}
				9183
				9184	/*
				9185	* active_load_balance_cpu_stop is run by the CPU stopper. It pushes
				9186	* running tasks off the busiest CPU onto idle CPUs. It requires at
				9187	* least 1 task to be running on each physical CPU where possible, and
				9188	* avoids physical / logical imbalances.
				9189	*/
				9190	static int active_load_balance_cpu_stop(void *data)
				9191	{
				9192	struct rq *busiest_rq = data;
				9193	int busiest_cpu = cpu_of(busiest_rq);
				9194	int target_cpu = busiest_rq->push_cpu;
				9195	struct rq *target_rq = cpu_rq(target_cpu);
				9196	struct sched_domain *sd;
				9197	struct task_struct *p = NULL;
				9198	struct rq_flags rf;
				9199
				9200	rq_lock_irq(busiest_rq, &rf);
				9201	/*
				9202	* Between queueing the stop-work and running it is a hole in which
				9203	* CPUs can become inactive. We should not move tasks from or to
				9204	* inactive CPUs.
				9205	*/
				9206	if (!cpu_active(busiest_cpu) \|\| !cpu_active(target_cpu))
				9207	goto out_unlock;
				9208
				9209	/* Make sure the requested CPU hasn't gone down in the meantime: */
				9210	if (unlikely(busiest_cpu != smp_processor_id() \|\|
				9211	!busiest_rq->active_balance))
				9212	goto out_unlock;
				9213
				9214	/* Is there any task to move? */
				9215	if (busiest_rq->nr_running <= 1)
				9216	goto out_unlock;
				9217
				9218	/*
				9219	* This condition is "impossible", if it occurs
				9220	* we need to fix it. Originally reported by
				9221	* Bjorn Helgaas on a 128-CPU setup.
				9222	*/
				9223	BUG_ON(busiest_rq == target_rq);
				9224
				9225	/* Search for an sd spanning us and the target CPU. */
				9226	rcu_read_lock();
				9227	for_each_domain(target_cpu, sd) {
				9228	if ((sd->flags & SD_LOAD_BALANCE) &&
				9229	cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
				9230	break;
				9231	}
				9232
				9233	if (likely(sd)) {
				9234	struct lb_env env = {
				9235	.sd = sd,
				9236	.dst_cpu = target_cpu,
				9237	.dst_rq = target_rq,
				9238	.src_cpu = busiest_rq->cpu,
				9239	.src_rq = busiest_rq,
				9240	.idle = CPU_IDLE,
				9241	/*
				9242	* can_migrate_task() doesn't need to compute new_dst_cpu
				9243	* for active balancing. Since we have CPU_IDLE, but no
				9244	* @dst_grpmask we need to make that test go away with lying
				9245	* about DST_PINNED.
				9246	*/
				9247	.flags = LBF_DST_PINNED,
				9248	};
				9249
				9250	schedstat_inc(sd->alb_count);
				9251	update_rq_clock(busiest_rq);
				9252
				9253	p = detach_one_task(&env);
				9254	if (p) {
				9255	schedstat_inc(sd->alb_pushed);
				9256	/* Active balancing done, reset the failure counter. */
				9257	sd->nr_balance_failed = 0;
				9258	} else {
				9259	schedstat_inc(sd->alb_failed);
				9260	}
				9261	}
				9262	rcu_read_unlock();
				9263	out_unlock:
				9264	busiest_rq->active_balance = 0;
				9265	rq_unlock(busiest_rq, &rf);
				9266
				9267	if (p)
				9268	attach_one_task(target_rq, p);
				9269
				9270	local_irq_enable();
				9271
				9272	return 0;
				9273	}
				9274
/*
 * Taken with spin_trylock() by rebalance_domains() around the balancing of
 * domains flagged SD_SERIALIZE.
 */
static DEFINE_SPINLOCK(balancing);
				9276
				9277	/*
				9278	* Scale the max load_balance interval with the number of CPUs in the system.
				9279	* This trades load-balance latency on larger machines for less cross talk.
				9280	*/
				9281	void update_max_interval(void)
				9282	{
				9283	max_load_balance_interval = HZ*num_online_cpus()/10;
				9284	}
				9285
				9286	/*
				9287	* It checks each scheduling domain to see if it is due to be balanced,
				9288	* and initiates a balancing operation if so.
				9289	*
				9290	* Balancing parameters are set up in init_sched_domains.
				9291	*/
				9292	static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
				9293	{
				9294	int continue_balancing = 1;
				9295	int cpu = rq->cpu;
				9296	unsigned long interval;
				9297	struct sched_domain *sd;
				9298	/* Earliest time when we have to do rebalance again */
				9299	unsigned long next_balance = jiffies + 60*HZ;
				9300	int update_next_balance = 0;
				9301	int need_serialize, need_decay = 0;
				9302	u64 max_cost = 0;
				9303
				9304	rcu_read_lock();
				9305	for_each_domain(cpu, sd) {
				9306	/*
				9307	* Decay the newidle max times here because this is a regular
				9308	* visit to all the domains. Decay ~1% per second.
				9309	*/
				9310	if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
				9311	sd->max_newidle_lb_cost =
				9312	(sd->max_newidle_lb_cost * 253) / 256;
				9313	sd->next_decay_max_lb_cost = jiffies + HZ;
				9314	need_decay = 1;
				9315	}
				9316	max_cost += sd->max_newidle_lb_cost;
				9317
				9318	if (!(sd->flags & SD_LOAD_BALANCE))
				9319	continue;
				9320
				9321	/*
				9322	* Stop the load balance at this level. There is another
				9323	* CPU in our sched group which is doing load balancing more
				9324	* actively.
				9325	*/
				9326	if (!continue_balancing) {
				9327	if (need_decay)
				9328	continue;
				9329	break;
				9330	}
				9331
				9332	interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
				9333
				9334	need_serialize = sd->flags & SD_SERIALIZE;
				9335	if (need_serialize) {
				9336	if (!spin_trylock(&balancing))
				9337	goto out;
				9338	}
				9339
				9340	if (time_after_eq(jiffies, sd->last_balance + interval)) {
				9341	if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
				9342	/*
				9343	* The LBF_DST_PINNED logic could have changed
				9344	* env->dst_cpu, so we can't know our idle
				9345	* state even if we migrated tasks. Update it.
				9346	*/
				9347	idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
				9348	}
				9349	sd->last_balance = jiffies;
				9350	interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
				9351	}
				9352	if (need_serialize)
				9353	spin_unlock(&balancing);
				9354	out:
				9355	if (time_after(next_balance, sd->last_balance + interval)) {
				9356	next_balance = sd->last_balance + interval;
				9357	update_next_balance = 1;
				9358	}
				9359	}
				9360	if (need_decay) {
				9361	/*
				9362	* Ensure the rq-wide value also decays but keep it at a
				9363	* reasonable floor to avoid funnies with rq->avg_idle.
				9364	*/
				9365	rq->max_idle_balance_cost =
				9366	max((u64)sysctl_sched_migration_cost, max_cost);
				9367	}
				9368	rcu_read_unlock();
				9369
				9370	/*
				9371	* next_balance will be updated only when there is a need.
				9372	* When the cpu is attached to null domain for ex, it will not be
				9373	* updated.
				9374	*/
				9375	if (likely(update_next_balance)) {
				9376	rq->next_balance = next_balance;
				9377
				9378	#ifdef CONFIG_NO_HZ_COMMON
				9379	/*
				9380	* If this CPU has been elected to perform the nohz idle
				9381	* balance. Other idle CPUs have already rebalanced with
				9382	* nohz_idle_balance() and nohz.next_balance has been
				9383	* updated accordingly. This CPU is now running the idle load
				9384	* balance for itself and we need to update the
				9385	* nohz.next_balance accordingly.
				9386	*/
				9387	if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
				9388	nohz.next_balance = rq->next_balance;
				9389	#endif
				9390	}
				9391	}
				9392
				9393	static inline int on_null_domain(struct rq *rq)
				9394	{
				9395	return unlikely(!rcu_dereference_sched(rq->sd));
				9396	}
				9397
				9398	#ifdef CONFIG_NO_HZ_COMMON
				9399	/*
				9400	* idle load balancing details
				9401	* - When one of the busy CPUs notice that there may be an idle rebalancing
				9402	* needed, they will kick the idle load balancer, which then does idle
				9403	* load balancing for all the idle CPUs.
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9404	* - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set
				9405	* anywhere yet.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9406	*/
				9407
				9408	static inline int find_new_ilb(void)
				9409	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9410	int ilb;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9411
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9412	for_each_cpu_and(ilb, nohz.idle_cpus_mask,
				9413	housekeeping_cpumask(HK_FLAG_MISC)) {
				9414	if (idle_cpu(ilb))
				9415	return ilb;
				9416	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9417
				9418	return nr_cpu_ids;
				9419	}
				9420
				9421	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9422	* Kick a CPU to do the nohz balancing, if it is time for it. We pick any
				9423	* idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9424	*/
				9425	static void kick_ilb(unsigned int flags)
				9426	{
				9427	int ilb_cpu;
				9428
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	9429	/*
				9430	* Increase nohz.next_balance only when if full ilb is triggered but
				9431	* not if we only update stats.
				9432	*/
				9433	if (flags & NOHZ_BALANCE_KICK)
				9434	nohz.next_balance = jiffies+1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9435
				9436	ilb_cpu = find_new_ilb();
				9437
				9438	if (ilb_cpu >= nr_cpu_ids)
				9439	return;
				9440
				9441	flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
				9442	if (flags & NOHZ_KICK_MASK)
				9443	return;
				9444
				9445	/*
				9446	* Use smp_send_reschedule() instead of resched_cpu().
				9447	* This way we generate a sched IPI on the target CPU which
				9448	* is idle. And the softirq performing nohz idle load balance
				9449	* will be run before returning from the IPI.
				9450	*/
				9451	smp_send_reschedule(ilb_cpu);
				9452	}
				9453
				9454	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9455	* Current decision point for kicking the idle load balancer in the presence
				9456	* of idle CPUs in the system.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9457	*/
				9458	static void nohz_balancer_kick(struct rq *rq)
				9459	{
				9460	unsigned long now = jiffies;
				9461	struct sched_domain_shared *sds;
				9462	struct sched_domain *sd;
				9463	int nr_busy, i, cpu = rq->cpu;
				9464	unsigned int flags = 0;
				9465
				9466	if (unlikely(rq->idle_balance))
				9467	return;
				9468
				9469	/*
				9470	* We may be recently in ticked or tickless idle mode. At the first
				9471	* busy tick after returning from idle, we will update the busy stats.
				9472	*/
				9473	nohz_balance_exit_idle(rq);
				9474
				9475	/*
				9476	* None are in tickless mode and hence no need for NOHZ idle load
				9477	* balancing.
				9478	*/
				9479	if (likely(!atomic_read(&nohz.nr_cpus)))
				9480	return;
				9481
				9482	if (READ_ONCE(nohz.has_blocked) &&
				9483	time_after(now, READ_ONCE(nohz.next_blocked)))
				9484	flags = NOHZ_STATS_KICK;
				9485
				9486	if (time_before(now, nohz.next_balance))
				9487	goto out;
				9488
				9489	if (rq->nr_running >= 2) {
				9490	flags = NOHZ_KICK_MASK;
				9491	goto out;
				9492	}
				9493
				9494	rcu_read_lock();
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9495
				9496	sd = rcu_dereference(rq->sd);
				9497	if (sd) {
				9498	/*
				9499	* If there's a CFS task and the current CPU has reduced
				9500	* capacity; kick the ILB to see if there's a better CPU to run
				9501	* on.
				9502	*/
				9503	if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
				9504	flags = NOHZ_KICK_MASK;
				9505	goto unlock;
				9506	}
				9507	}
				9508
				9509	sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
				9510	if (sd) {
				9511	/*
				9512	* When ASYM_PACKING; see if there's a more preferred CPU
				9513	* currently idle; in which case, kick the ILB to move tasks
				9514	* around.
				9515	*/
				9516	for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
				9517	if (sched_asym_prefer(i, cpu)) {
				9518	flags = NOHZ_KICK_MASK;
				9519	goto unlock;
				9520	}
				9521	}
				9522	}
				9523
				9524	sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
				9525	if (sd) {
				9526	/*
				9527	* When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
				9528	* to run the misfit task on.
				9529	*/
				9530	if (check_misfit_status(rq, sd)) {
				9531	flags = NOHZ_KICK_MASK;
				9532	goto unlock;
				9533	}
				9534
				9535	/*
				9536	* For asymmetric systems, we do not want to nicely balance
				9537	* cache use, instead we want to embrace asymmetry and only
				9538	* ensure tasks have enough CPU capacity.
				9539	*
				9540	* Skip the LLC logic because it's not relevant in that case.
				9541	*/
				9542	goto unlock;
				9543	}
				9544
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9545	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
				9546	if (sds) {
				9547	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9548	* If there is an imbalance between LLC domains (IOW we could
				9549	* increase the overall cache use), we need some less-loaded LLC
				9550	* domain to pull some load. Likewise, we may need to spread
				9551	* load within the current LLC domain (e.g. packed SMT cores but
				9552	* other CPUs are idle). We can't really know from here how busy
				9553	* the others are - so just get a nohz balance going if it looks
				9554	* like this LLC domain has tasks we could move.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9555	*/
				9556	nr_busy = atomic_read(&sds->nr_busy_cpus);
				9557	if (nr_busy > 1) {
				9558	flags = NOHZ_KICK_MASK;
				9559	goto unlock;
				9560	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9561	}
				9562	unlock:
				9563	rcu_read_unlock();
				9564	out:
				9565	if (flags)
				9566	kick_ilb(flags);
				9567	}
				9568
				9569	static void set_cpu_sd_state_busy(int cpu)
				9570	{
				9571	struct sched_domain *sd;
				9572
				9573	rcu_read_lock();
				9574	sd = rcu_dereference(per_cpu(sd_llc, cpu));
				9575
				9576	if (!sd \|\| !sd->nohz_idle)
				9577	goto unlock;
				9578	sd->nohz_idle = 0;
				9579
				9580	atomic_inc(&sd->shared->nr_busy_cpus);
				9581	unlock:
				9582	rcu_read_unlock();
				9583	}
				9584
				9585	void nohz_balance_exit_idle(struct rq *rq)
				9586	{
				9587	SCHED_WARN_ON(rq != this_rq());
				9588
				9589	if (likely(!rq->nohz_tick_stopped))
				9590	return;
				9591
				9592	rq->nohz_tick_stopped = 0;
				9593	cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
				9594	atomic_dec(&nohz.nr_cpus);
				9595
				9596	set_cpu_sd_state_busy(rq->cpu);
				9597	}
				9598
				9599	static void set_cpu_sd_state_idle(int cpu)
				9600	{
				9601	struct sched_domain *sd;
				9602
				9603	rcu_read_lock();
				9604	sd = rcu_dereference(per_cpu(sd_llc, cpu));
				9605
				9606	if (!sd \|\| sd->nohz_idle)
				9607	goto unlock;
				9608	sd->nohz_idle = 1;
				9609
				9610	atomic_dec(&sd->shared->nr_busy_cpus);
				9611	unlock:
				9612	rcu_read_unlock();
				9613	}
				9614
				9615	/*
				9616	* This routine will record that the CPU is going idle with tick stopped.
				9617	* This info will be used in performing idle load balancing in the future.
				9618	*/
				9619	void nohz_balance_enter_idle(int cpu)
				9620	{
				9621	struct rq *rq = cpu_rq(cpu);
				9622
				9623	SCHED_WARN_ON(cpu != smp_processor_id());
				9624
				9625	/* If this CPU is going down, then nothing needs to be done: */
				9626	if (!cpu_active(cpu))
				9627	return;
				9628
				9629	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
				9630	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
				9631	return;
				9632
				9633	/*
				9634	* Can be set safely without rq->lock held
				9635	* If a clear happens, it will have evaluated last additions because
				9636	* rq->lock is held during the check and the clear
				9637	*/
				9638	rq->has_blocked_load = 1;
				9639
				9640	/*
				9641	* The tick is still stopped but load could have been added in the
				9642	* meantime. We set the nohz.has_blocked flag to trig a check of the
				9643	* *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
				9644	* of nohz.has_blocked can only happen after checking the new load
				9645	*/
				9646	if (rq->nohz_tick_stopped)
				9647	goto out;
				9648
				9649	/* If we're a completely isolated CPU, we don't play: */
				9650	if (on_null_domain(rq))
				9651	return;
				9652
				9653	rq->nohz_tick_stopped = 1;
				9654
				9655	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
				9656	atomic_inc(&nohz.nr_cpus);
				9657
				9658	/*
				9659	* Ensures that if nohz_idle_balance() fails to observe our
				9660	* @idle_cpus_mask store, it must observe the @has_blocked
				9661	* store.
				9662	*/
				9663	smp_mb__after_atomic();
				9664
				9665	set_cpu_sd_state_idle(cpu);
				9666
				9667	out:
				9668	/*
				9669	* Each time a cpu enter idle, we assume that it has blocked load and
				9670	* enable the periodic update of the load of idle cpus
				9671	*/
				9672	WRITE_ONCE(nohz.has_blocked, 1);
				9673	}
				9674
				9675	/*
				9676	* Internal function that runs load balance for all idle cpus. The load balance
				9677	* can be a simple update of blocked load or a complete load balance with
				9678	* tasks movement depending of flags.
				9679	* The function returns false if the loop has stopped before running
				9680	* through all idle CPUs.
				9681	*/
				9682	static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
				9683	enum cpu_idle_type idle)
				9684	{
				9685	/* Earliest time when we have to do rebalance again */
				9686	unsigned long now = jiffies;
				9687	unsigned long next_balance = now + 60*HZ;
				9688	bool has_blocked_load = false;
				9689	int update_next_balance = 0;
				9690	int this_cpu = this_rq->cpu;
				9691	int balance_cpu;
				9692	int ret = false;
				9693	struct rq *rq;
				9694
				9695	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
				9696
				9697	/*
				9698	* We assume there will be no idle load after this update and clear
				9699	* the has_blocked flag. If a cpu enters idle in the mean time, it will
				9700	* set the has_blocked flag and trig another update of idle load.
				9701	* Because a cpu that becomes idle, is added to idle_cpus_mask before
				9702	* setting the flag, we are sure to not clear the state and not
				9703	* check the load of an idle cpu.
				9704	*/
				9705	WRITE_ONCE(nohz.has_blocked, 0);
				9706
				9707	/*
				9708	* Ensures that if we miss the CPU, we must see the has_blocked
				9709	* store from nohz_balance_enter_idle().
				9710	*/
				9711	smp_mb();
				9712
				9713	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
				9714	if (balance_cpu == this_cpu \|\| !idle_cpu(balance_cpu))
				9715	continue;
				9716
				9717	/*
				9718	* If this CPU gets work to do, stop the load balancing
				9719	* work being done for other CPUs. Next load
				9720	* balancing owner will pick it up.
				9721	*/
				9722	if (need_resched()) {
				9723	has_blocked_load = true;
				9724	goto abort;
				9725	}
				9726
				9727	rq = cpu_rq(balance_cpu);
				9728
				9729	has_blocked_load \|= update_nohz_stats(rq, true);
				9730
				9731	/*
				9732	* If time for next balance is due,
				9733	* do the balance.
				9734	*/
				9735	if (time_after_eq(jiffies, rq->next_balance)) {
				9736	struct rq_flags rf;
				9737
				9738	rq_lock_irqsave(rq, &rf);
				9739	update_rq_clock(rq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9740	rq_unlock_irqrestore(rq, &rf);
				9741
				9742	if (flags & NOHZ_BALANCE_KICK)
				9743	rebalance_domains(rq, CPU_IDLE);
				9744	}
				9745
				9746	if (time_after(next_balance, rq->next_balance)) {
				9747	next_balance = rq->next_balance;
				9748	update_next_balance = 1;
				9749	}
				9750	}
				9751
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	9752	/*
				9753	* next_balance will be updated only when there is a need.
				9754	* When the CPU is attached to null domain for ex, it will not be
				9755	* updated.
				9756	*/
				9757	if (likely(update_next_balance))
				9758	nohz.next_balance = next_balance;
				9759
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9760	/* Newly idle CPU doesn't need an update */
				9761	if (idle != CPU_NEWLY_IDLE) {
				9762	update_blocked_averages(this_cpu);
				9763	has_blocked_load \|= this_rq->has_blocked_load;
				9764	}
				9765
				9766	if (flags & NOHZ_BALANCE_KICK)
				9767	rebalance_domains(this_rq, CPU_IDLE);
				9768
				9769	WRITE_ONCE(nohz.next_blocked,
				9770	now + msecs_to_jiffies(LOAD_AVG_PERIOD));
				9771
				9772	/* The full idle balance loop has been done */
				9773	ret = true;
				9774
				9775	abort:
				9776	/* There is still blocked load, enable periodic update */
				9777	if (has_blocked_load)
				9778	WRITE_ONCE(nohz.has_blocked, 1);
				9779
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9780	return ret;
				9781	}
				9782
				9783	/*
				9784	* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
				9785	* rebalancing for all the cpus for whom scheduler ticks are stopped.
				9786	*/
				9787	static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
				9788	{
				9789	int this_cpu = this_rq->cpu;
				9790	unsigned int flags;
				9791
				9792	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
				9793	return false;
				9794
				9795	if (idle != CPU_IDLE) {
				9796	atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
				9797	return false;
				9798	}
				9799
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9800	/* could be _relaxed() */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9801	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
				9802	if (!(flags & NOHZ_KICK_MASK))
				9803	return false;
				9804
				9805	_nohz_idle_balance(this_rq, flags, idle);
				9806
				9807	return true;
				9808	}
				9809
				9810	static void nohz_newidle_balance(struct rq *this_rq)
				9811	{
				9812	int this_cpu = this_rq->cpu;
				9813
				9814	/*
				9815	* This CPU doesn't want to be disturbed by scheduler
				9816	* housekeeping
				9817	*/
				9818	if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
				9819	return;
				9820
				9821	/* Will wake up very soon. No time for doing anything else*/
				9822	if (this_rq->avg_idle < sysctl_sched_migration_cost)
				9823	return;
				9824
				9825	/* Don't need to update blocked load of idle CPUs*/
				9826	if (!READ_ONCE(nohz.has_blocked) \|\|
				9827	time_before(jiffies, READ_ONCE(nohz.next_blocked)))
				9828	return;
				9829
				9830	raw_spin_unlock(&this_rq->lock);
				9831	/*
				9832	* This CPU is going to be idle and blocked load of idle CPUs
				9833	* need to be updated. Run the ilb locally as it is a good
				9834	* candidate for ilb instead of waking up another idle CPU.
				9835	* Kick an normal ilb if we failed to do the update.
				9836	*/
				9837	if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
				9838	kick_ilb(NOHZ_STATS_KICK);
				9839	raw_spin_lock(&this_rq->lock);
				9840	}
				9841
				9842	#else /* !CONFIG_NO_HZ_COMMON */
				9843	static inline void nohz_balancer_kick(struct rq *rq) { }
				9844
				9845	static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
				9846	{
				9847	return false;
				9848	}
				9849
				9850	static inline void nohz_newidle_balance(struct rq *this_rq) { }
				9851	#endif /* CONFIG_NO_HZ_COMMON */
				9852
				9853	/*
				9854	* idle_balance is called by schedule() if this_cpu is about to become
				9855	* idle. Attempts to pull tasks from other CPUs.
				9856	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9857	int newidle_balance(struct rq this_rq, struct rq_flags rf)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9858	{
				9859	unsigned long next_balance = jiffies + HZ;
				9860	int this_cpu = this_rq->cpu;
				9861	struct sched_domain *sd;
				9862	int pulled_task = 0;
				9863	u64 curr_cost = 0;
				9864
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9865	update_misfit_status(NULL, this_rq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9866	/*
				9867	* We must set idle_stamp _before_ calling idle_balance(), such that we
				9868	* measure the duration of idle_balance() as idle time.
				9869	*/
				9870	this_rq->idle_stamp = rq_clock(this_rq);
				9871
				9872	/*
				9873	* Do not pull tasks towards !active CPUs...
				9874	*/
				9875	if (!cpu_active(this_cpu))
				9876	return 0;
				9877
				9878	/*
				9879	* This is OK, because current is on_cpu, which avoids it being picked
				9880	* for load-balance and preemption/IRQs are still disabled avoiding
				9881	* further scheduler activity on it and we're being very careful to
				9882	* re-start the picking loop.
				9883	*/
				9884	rq_unpin_lock(this_rq, rf);
				9885
				9886	if (this_rq->avg_idle < sysctl_sched_migration_cost \|\|
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9887	!READ_ONCE(this_rq->rd->overload)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9888
				9889	rcu_read_lock();
				9890	sd = rcu_dereference_check_sched_domain(this_rq->sd);
				9891	if (sd)
				9892	update_next_balance(sd, &next_balance);
				9893	rcu_read_unlock();
				9894
				9895	nohz_newidle_balance(this_rq);
				9896
				9897	goto out;
				9898	}
				9899
				9900	raw_spin_unlock(&this_rq->lock);
				9901
				9902	update_blocked_averages(this_cpu);
				9903	rcu_read_lock();
				9904	for_each_domain(this_cpu, sd) {
				9905	int continue_balancing = 1;
				9906	u64 t0, domain_cost;
				9907
				9908	if (!(sd->flags & SD_LOAD_BALANCE))
				9909	continue;
				9910
				9911	if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
				9912	update_next_balance(sd, &next_balance);
				9913	break;
				9914	}
				9915
				9916	if (sd->flags & SD_BALANCE_NEWIDLE) {
				9917	t0 = sched_clock_cpu(this_cpu);
				9918
				9919	pulled_task = load_balance(this_cpu, this_rq,
				9920	sd, CPU_NEWLY_IDLE,
				9921	&continue_balancing);
				9922
				9923	domain_cost = sched_clock_cpu(this_cpu) - t0;
				9924	if (domain_cost > sd->max_newidle_lb_cost)
				9925	sd->max_newidle_lb_cost = domain_cost;
				9926
				9927	curr_cost += domain_cost;
				9928	}
				9929
				9930	update_next_balance(sd, &next_balance);
				9931
				9932	/*
				9933	* Stop searching for tasks to pull if there are
				9934	* now runnable tasks on this rq.
				9935	*/
				9936	if (pulled_task \|\| this_rq->nr_running > 0)
				9937	break;
				9938	}
				9939	rcu_read_unlock();
				9940
				9941	raw_spin_lock(&this_rq->lock);
				9942
				9943	if (curr_cost > this_rq->max_idle_balance_cost)
				9944	this_rq->max_idle_balance_cost = curr_cost;
				9945
				9946	out:
				9947	/*
				9948	* While browsing the domains, we released the rq lock, a task could
				9949	* have been enqueued in the meantime. Since we're not going idle,
				9950	* pretend we pulled a task.
				9951	*/
				9952	if (this_rq->cfs.h_nr_running && !pulled_task)
				9953	pulled_task = 1;
				9954
				9955	/* Move the next balance forward */
				9956	if (time_after(this_rq->next_balance, next_balance))
				9957	this_rq->next_balance = next_balance;
				9958
				9959	/* Is there a task of a high priority class? */
				9960	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
				9961	pulled_task = -1;
				9962
				9963	if (pulled_task)
				9964	this_rq->idle_stamp = 0;
				9965
				9966	rq_repin_lock(this_rq, rf);
				9967
				9968	return pulled_task;
				9969	}
				9970
				9971	/*
				9972	* run_rebalance_domains is triggered when needed from the scheduler tick.
				9973	* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
				9974	*/
				9975	static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
				9976	{
				9977	struct rq *this_rq = this_rq();
				9978	enum cpu_idle_type idle = this_rq->idle_balance ?
				9979	CPU_IDLE : CPU_NOT_IDLE;
				9980
				9981	/*
				9982	* If this CPU has a pending nohz_balance_kick, then do the
				9983	* balancing on behalf of the other idle CPUs whose ticks are
				9984	* stopped. Do nohz_idle_balance before rebalance_domains to
				9985	* give the idle CPUs a chance to load balance. Else we may
				9986	* load balance only within the local sched_domain hierarchy
				9987	* and abort nohz_idle_balance altogether if we pull some load.
				9988	*/
				9989	if (nohz_idle_balance(this_rq, idle))
				9990	return;
				9991
				9992	/* normal load balance */
				9993	update_blocked_averages(this_rq->cpu);
				9994	rebalance_domains(this_rq, idle);
				9995	}
				9996
				9997	/*
				9998	* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
				9999	*/
				10000	void trigger_load_balance(struct rq *rq)
				10001	{
				10002	/* Don't need to rebalance while attached to NULL domain */
				10003	if (unlikely(on_null_domain(rq)))
				10004	return;
				10005
				10006	if (time_after_eq(jiffies, rq->next_balance))
				10007	raise_softirq(SCHED_SOFTIRQ);
				10008
				10009	nohz_balancer_kick(rq);
				10010	}
				10011
				10012	static void rq_online_fair(struct rq *rq)
				10013	{
				10014	update_sysctl();
				10015
				10016	update_runtime_enabled(rq);
				10017	}
				10018
				10019	static void rq_offline_fair(struct rq *rq)
				10020	{
				10021	update_sysctl();
				10022
				10023	/* Ensure any throttled groups are reachable by pick_next_task */
				10024	unthrottle_offline_cfs_rqs(rq);
				10025	}
				10026
				10027	#endif /* CONFIG_SMP */
				10028
				10029	/*
				10030	* scheduler tick hitting a task of our scheduling class.
				10031	*
				10032	* NOTE: This function can be called remotely by the tick offload that
				10033	* goes along full dynticks. Therefore no local assumption can be made
				10034	* and everything must be accessed through the @rq and @curr passed in
				10035	* parameters.
				10036	*/
				10037	static void task_tick_fair(struct rq rq, struct task_struct curr, int queued)
				10038	{
				10039	struct cfs_rq *cfs_rq;
				10040	struct sched_entity *se = &curr->se;
				10041
				10042	for_each_sched_entity(se) {
				10043	cfs_rq = cfs_rq_of(se);
				10044	entity_tick(cfs_rq, se, queued);
				10045	}
				10046
				10047	if (static_branch_unlikely(&sched_numa_balancing))
				10048	task_tick_numa(rq, curr);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10049
				10050	update_misfit_status(curr, rq);
				10051	update_overutilized_status(task_rq(curr));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10052	}
				10053
				10054	/*
				10055	* called on fork with the child task as argument from the parent's context
				10056	* - child not yet on the tasklist
				10057	* - preemption disabled
				10058	*/
				10059	static void task_fork_fair(struct task_struct *p)
				10060	{
				10061	struct cfs_rq *cfs_rq;
				10062	struct sched_entity se = &p->se, curr;
				10063	struct rq *rq = this_rq();
				10064	struct rq_flags rf;
				10065
				10066	rq_lock(rq, &rf);
				10067	update_rq_clock(rq);
				10068
				10069	cfs_rq = task_cfs_rq(current);
				10070	curr = cfs_rq->curr;
				10071	if (curr) {
				10072	update_curr(cfs_rq);
				10073	se->vruntime = curr->vruntime;
				10074	}
				10075	place_entity(cfs_rq, se, 1);
				10076
				10077	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
				10078	/*
				10079	* Upon rescheduling, sched_class::put_prev_task() will place
				10080	* 'current' within the tree based on its new key value.
				10081	*/
				10082	swap(curr->vruntime, se->vruntime);
				10083	resched_curr(rq);
				10084	}
				10085
				10086	se->vruntime -= cfs_rq->min_vruntime;
				10087	rq_unlock(rq, &rf);
				10088	}
				10089
				10090	/*
				10091	* Priority of the task has changed. Check to see if we preempt
				10092	* the current task.
				10093	*/
				10094	static void
				10095	prio_changed_fair(struct rq rq, struct task_struct p, int oldprio)
				10096	{
				10097	if (!task_on_rq_queued(p))
				10098	return;
				10099
				10100	/*
				10101	* Reschedule if we are currently running on this runqueue and
				10102	* our priority decreased, or if we are not currently running on
				10103	* this runqueue and our priority is higher than the current's
				10104	*/
				10105	if (rq->curr == p) {
				10106	if (p->prio > oldprio)
				10107	resched_curr(rq);
				10108	} else
				10109	check_preempt_curr(rq, p, 0);
				10110	}
				10111
				10112	static inline bool vruntime_normalized(struct task_struct *p)
				10113	{
				10114	struct sched_entity *se = &p->se;
				10115
				10116	/*
				10117	* In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
				10118	* the dequeue_entity(.flags=0) will already have normalized the
				10119	* vruntime.
				10120	*/
				10121	if (p->on_rq)
				10122	return true;
				10123
				10124	/*
				10125	* When !on_rq, vruntime of the task has usually NOT been normalized.
				10126	* But there are some cases where it has already been normalized:
				10127	*
				10128	* - A forked child which is waiting for being woken up by
				10129	* wake_up_new_task().
				10130	* - A task which has been woken up by try_to_wake_up() and
				10131	* waiting for actually being woken up by sched_ttwu_pending().
				10132	*/
				10133	if (!se->sum_exec_runtime \|\|
				10134	(p->state == TASK_WAKING && p->sched_remote_wakeup))
				10135	return true;
				10136
				10137	return false;
				10138	}
				10139
				10140	#ifdef CONFIG_FAIR_GROUP_SCHED
				10141	/*
				10142	* Propagate the changes of the sched_entity across the tg tree to make it
				10143	* visible to the root
				10144	*/
				10145	static void propagate_entity_cfs_rq(struct sched_entity *se)
				10146	{
				10147	struct cfs_rq *cfs_rq;
				10148
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	10149	list_add_leaf_cfs_rq(cfs_rq_of(se));
				10150
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10151	/* Start to propagate at parent */
				10152	se = se->parent;
				10153
				10154	for_each_sched_entity(se) {
				10155	cfs_rq = cfs_rq_of(se);
				10156
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	10157	if (!cfs_rq_throttled(cfs_rq)){
				10158	update_load_avg(cfs_rq, se, UPDATE_TG);
				10159	list_add_leaf_cfs_rq(cfs_rq);
				10160	continue;
				10161	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10162
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	10163	if (list_add_leaf_cfs_rq(cfs_rq))
				10164	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10165	}
				10166	}
				10167	#else
				10168	static void propagate_entity_cfs_rq(struct sched_entity *se) { }
				10169	#endif
				10170
				10171	static void detach_entity_cfs_rq(struct sched_entity *se)
				10172	{
				10173	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				10174
				10175	/* Catch up with the cfs_rq and remove our load when we leave */
				10176	update_load_avg(cfs_rq, se, 0);
				10177	detach_entity_load_avg(cfs_rq, se);
				10178	update_tg_load_avg(cfs_rq, false);
				10179	propagate_entity_cfs_rq(se);
				10180	}
				10181
				10182	static void attach_entity_cfs_rq(struct sched_entity *se)
				10183	{
				10184	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				10185
				10186	#ifdef CONFIG_FAIR_GROUP_SCHED
				10187	/*
				10188	* Since the real-depth could have been changed (only FAIR
				10189	* class maintain depth value), reset depth properly.
				10190	*/
				10191	se->depth = se->parent ? se->parent->depth + 1 : 0;
				10192	#endif
				10193
				10194	/* Synchronize entity with its cfs_rq */
				10195	update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
				10196	attach_entity_load_avg(cfs_rq, se, 0);
				10197	update_tg_load_avg(cfs_rq, false);
				10198	propagate_entity_cfs_rq(se);
				10199	}
				10200
				10201	static void detach_task_cfs_rq(struct task_struct *p)
				10202	{
				10203	struct sched_entity *se = &p->se;
				10204	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				10205
				10206	if (!vruntime_normalized(p)) {
				10207	/*
				10208	* Fix up our vruntime so that the current sleep doesn't
				10209	* cause 'unlimited' sleep bonus.
				10210	*/
				10211	place_entity(cfs_rq, se, 0);
				10212	se->vruntime -= cfs_rq->min_vruntime;
				10213	}
				10214
				10215	detach_entity_cfs_rq(se);
				10216	}
				10217
				10218	static void attach_task_cfs_rq(struct task_struct *p)
				10219	{
				10220	struct sched_entity *se = &p->se;
				10221	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				10222
				10223	attach_entity_cfs_rq(se);
				10224
				10225	if (!vruntime_normalized(p))
				10226	se->vruntime += cfs_rq->min_vruntime;
				10227	}
				10228
				10229	static void switched_from_fair(struct rq rq, struct task_struct p)
				10230	{
				10231	detach_task_cfs_rq(p);
				10232	}
				10233
				10234	static void switched_to_fair(struct rq rq, struct task_struct p)
				10235	{
				10236	attach_task_cfs_rq(p);
				10237
				10238	if (task_on_rq_queued(p)) {
				10239	/*
				10240	* We were most likely switched from sched_rt, so
				10241	* kick off the schedule if running, otherwise just see
				10242	* if we can still preempt the current task.
				10243	*/
				10244	if (rq->curr == p)
				10245	resched_curr(rq);
				10246	else
				10247	check_preempt_curr(rq, p, 0);
				10248	}
				10249	}
				10250
				10251	/* Account for a task changing its policy or group.
				10252	*
				10253	* This routine is mostly called to set cfs_rq->curr field when a task
				10254	* migrates between groups/classes.
				10255	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	10256	static void set_next_task_fair(struct rq rq, struct task_struct p, bool first)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10257	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10258	struct sched_entity *se = &p->se;
				10259
				10260	#ifdef CONFIG_SMP
				10261	if (task_on_rq_queued(p)) {
				10262	/*
				10263	* Move the next running task to the front of the list, so our
				10264	* cfs_tasks list becomes MRU one.
				10265	*/
				10266	list_move(&se->group_node, &rq->cfs_tasks);
				10267	}
				10268	#endif
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10269
				10270	for_each_sched_entity(se) {
				10271	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				10272
				10273	set_next_entity(cfs_rq, se);
				10274	/* ensure bandwidth has been allocated on our new cfs_rq */
				10275	account_cfs_rq_runtime(cfs_rq, 0);
				10276	}
				10277	}
				10278
				10279	void init_cfs_rq(struct cfs_rq *cfs_rq)
				10280	{
				10281	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
				10282	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
				10283	#ifndef CONFIG_64BIT
				10284	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				10285	#endif
				10286	#ifdef CONFIG_SMP
				10287	raw_spin_lock_init(&cfs_rq->removed.lock);
				10288	#endif
				10289	}
				10290
				10291	#ifdef CONFIG_FAIR_GROUP_SCHED
				10292	static void task_set_group_fair(struct task_struct *p)
				10293	{
				10294	struct sched_entity *se = &p->se;
				10295
				10296	set_task_rq(p, task_cpu(p));
				10297	se->depth = se->parent ? se->parent->depth + 1 : 0;
				10298	}
				10299
				10300	static void task_move_group_fair(struct task_struct *p)
				10301	{
				10302	detach_task_cfs_rq(p);
				10303	set_task_rq(p, task_cpu(p));
				10304
				10305	#ifdef CONFIG_SMP
				10306	/* Tell se's cfs_rq has been changed -- migrated */
				10307	p->se.avg.last_update_time = 0;
				10308	#endif
				10309	attach_task_cfs_rq(p);
				10310	}
				10311
				10312	static void task_change_group_fair(struct task_struct *p, int type)
				10313	{
				10314	switch (type) {
				10315	case TASK_SET_GROUP:
				10316	task_set_group_fair(p);
				10317	break;
				10318
				10319	case TASK_MOVE_GROUP:
				10320	task_move_group_fair(p);
				10321	break;
				10322	}
				10323	}
				10324
				10325	void free_fair_sched_group(struct task_group *tg)
				10326	{
				10327	int i;
				10328
				10329	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
				10330
				10331	for_each_possible_cpu(i) {
				10332	if (tg->cfs_rq)
				10333	kfree(tg->cfs_rq[i]);
				10334	if (tg->se)
				10335	kfree(tg->se[i]);
				10336	}
				10337
				10338	kfree(tg->cfs_rq);
				10339	kfree(tg->se);
				10340	}
				10341
				10342	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				10343	{
				10344	struct sched_entity *se;
				10345	struct cfs_rq *cfs_rq;
				10346	int i;
				10347
				10348	tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
				10349	if (!tg->cfs_rq)
				10350	goto err;
				10351	tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
				10352	if (!tg->se)
				10353	goto err;
				10354
				10355	tg->shares = NICE_0_LOAD;
				10356
				10357	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
				10358
				10359	for_each_possible_cpu(i) {
				10360	cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				10361	GFP_KERNEL, cpu_to_node(i));
				10362	if (!cfs_rq)
				10363	goto err;
				10364
				10365	se = kzalloc_node(sizeof(struct sched_entity),
				10366	GFP_KERNEL, cpu_to_node(i));
				10367	if (!se)
				10368	goto err_free_rq;
				10369
				10370	init_cfs_rq(cfs_rq);
				10371	init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
				10372	init_entity_runnable_average(se);
				10373	}
				10374
				10375	return 1;
				10376
				10377	err_free_rq:
				10378	kfree(cfs_rq);
				10379	err:
				10380	return 0;
				10381	}
				10382
				10383	void online_fair_sched_group(struct task_group *tg)
				10384	{
				10385	struct sched_entity *se;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10386	struct rq_flags rf;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10387	struct rq *rq;
				10388	int i;
				10389
				10390	for_each_possible_cpu(i) {
				10391	rq = cpu_rq(i);
				10392	se = tg->se[i];
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10393	rq_lock_irq(rq, &rf);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10394	update_rq_clock(rq);
				10395	attach_entity_cfs_rq(se);
				10396	sync_throttle(tg, i);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10397	rq_unlock_irq(rq, &rf);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10398	}
				10399	}
				10400
				10401	void unregister_fair_sched_group(struct task_group *tg)
				10402	{
				10403	unsigned long flags;
				10404	struct rq *rq;
				10405	int cpu;
				10406
				10407	for_each_possible_cpu(cpu) {
				10408	if (tg->se[cpu])
				10409	remove_entity_load_avg(tg->se[cpu]);
				10410
				10411	/*
				10412	* Only empty task groups can be destroyed; so we can speculatively
				10413	* check on_list without danger of it being re-added.
				10414	*/
				10415	if (!tg->cfs_rq[cpu]->on_list)
				10416	continue;
				10417
				10418	rq = cpu_rq(cpu);
				10419
				10420	raw_spin_lock_irqsave(&rq->lock, flags);
				10421	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
				10422	raw_spin_unlock_irqrestore(&rq->lock, flags);
				10423	}
				10424	}
				10425
				10426	void init_tg_cfs_entry(struct task_group tg, struct cfs_rq cfs_rq,
				10427	struct sched_entity *se, int cpu,
				10428	struct sched_entity *parent)
				10429	{
				10430	struct rq *rq = cpu_rq(cpu);
				10431
				10432	cfs_rq->tg = tg;
				10433	cfs_rq->rq = rq;
				10434	init_cfs_rq_runtime(cfs_rq);
				10435
				10436	tg->cfs_rq[cpu] = cfs_rq;
				10437	tg->se[cpu] = se;
				10438
				10439	/* se could be NULL for root_task_group */
				10440	if (!se)
				10441	return;
				10442
				10443	if (!parent) {
				10444	se->cfs_rq = &rq->cfs;
				10445	se->depth = 0;
				10446	} else {
				10447	se->cfs_rq = parent->my_q;
				10448	se->depth = parent->depth + 1;
				10449	}
				10450
				10451	se->my_q = cfs_rq;
				10452	/* guarantee group entities always have weight */
				10453	update_load_set(&se->load, NICE_0_LOAD);
				10454	se->parent = parent;
				10455	}
				10456
				10457	static DEFINE_MUTEX(shares_mutex);
				10458
				10459	int sched_group_set_shares(struct task_group *tg, unsigned long shares)
				10460	{
				10461	int i;
				10462
				10463	/*
				10464	* We can't change the weight of the root cgroup.
				10465	*/
				10466	if (!tg->se[0])
				10467	return -EINVAL;
				10468
				10469	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
				10470
				10471	mutex_lock(&shares_mutex);
				10472	if (tg->shares == shares)
				10473	goto done;
				10474
				10475	tg->shares = shares;
				10476	for_each_possible_cpu(i) {
				10477	struct rq *rq = cpu_rq(i);
				10478	struct sched_entity *se = tg->se[i];
				10479	struct rq_flags rf;
				10480
				10481	/* Propagate contribution to hierarchy */
				10482	rq_lock_irqsave(rq, &rf);
				10483	update_rq_clock(rq);
				10484	for_each_sched_entity(se) {
				10485	update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
				10486	update_cfs_group(se);
				10487	}
				10488	rq_unlock_irqrestore(rq, &rf);
				10489	}
				10490
				10491	done:
				10492	mutex_unlock(&shares_mutex);
				10493	return 0;
				10494	}
				10495	#else /* CONFIG_FAIR_GROUP_SCHED */
				10496
				10497	void free_fair_sched_group(struct task_group *tg) { }
				10498
				10499	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				10500	{
				10501	return 1;
				10502	}
				10503
				10504	void online_fair_sched_group(struct task_group *tg) { }
				10505
				10506	void unregister_fair_sched_group(struct task_group *tg) { }
				10507
				10508	#endif /* CONFIG_FAIR_GROUP_SCHED */
				10509
				10510
				10511	static unsigned int get_rr_interval_fair(struct rq rq, struct task_struct task)
				10512	{
				10513	struct sched_entity *se = &task->se;
				10514	unsigned int rr_interval = 0;
				10515
				10516	/*
				10517	* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
				10518	* idle runqueue:
				10519	*/
				10520	if (rq->cfs.load.weight)
				10521	rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
				10522
				10523	return rr_interval;
				10524	}
				10525
				10526	/*
				10527	* All the scheduling class methods:
				10528	*/
				10529	const struct sched_class fair_sched_class = {
				10530	.next = &idle_sched_class,
				10531	.enqueue_task = enqueue_task_fair,
				10532	.dequeue_task = dequeue_task_fair,
				10533	.yield_task = yield_task_fair,
				10534	.yield_to_task = yield_to_task_fair,
				10535
				10536	.check_preempt_curr = check_preempt_wakeup,
				10537
				10538	.pick_next_task = pick_next_task_fair,
				10539	.put_prev_task = put_prev_task_fair,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10540	.set_next_task = set_next_task_fair,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10541
				10542	#ifdef CONFIG_SMP
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10543	.balance = balance_fair,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10544	.select_task_rq = select_task_rq_fair,
				10545	.migrate_task_rq = migrate_task_rq_fair,
				10546
				10547	.rq_online = rq_online_fair,
				10548	.rq_offline = rq_offline_fair,
				10549
				10550	.task_dead = task_dead_fair,
				10551	.set_cpus_allowed = set_cpus_allowed_common,
				10552	#endif
				10553
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10554	.task_tick = task_tick_fair,
				10555	.task_fork = task_fork_fair,
				10556
				10557	.prio_changed = prio_changed_fair,
				10558	.switched_from = switched_from_fair,
				10559	.switched_to = switched_to_fair,
				10560
				10561	.get_rr_interval = get_rr_interval_fair,
				10562
				10563	.update_curr = update_curr_fair,
				10564
				10565	#ifdef CONFIG_FAIR_GROUP_SCHED
				10566	.task_change_group = task_change_group_fair,
				10567	#endif
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10568
				10569	#ifdef CONFIG_UCLAMP_TASK
				10570	.uclamp_enabled = 1,
				10571	#endif
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10572	};
				10573
				10574	#ifdef CONFIG_SCHED_DEBUG
				10575	void print_cfs_stats(struct seq_file *m, int cpu)
				10576	{
				10577	struct cfs_rq cfs_rq, pos;
				10578
				10579	rcu_read_lock();
				10580	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
				10581	print_cfs_rq(m, cpu, cfs_rq);
				10582	rcu_read_unlock();
				10583	}
				10584
				10585	#ifdef CONFIG_NUMA_BALANCING
				10586	void show_numa_stats(struct task_struct p, struct seq_file m)
				10587	{
				10588	int node;
				10589	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10590	struct numa_group *ng;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10591
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10592	rcu_read_lock();
				10593	ng = rcu_dereference(p->numa_group);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10594	for_each_online_node(node) {
				10595	if (p->numa_faults) {
				10596	tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
				10597	tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
				10598	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10599	if (ng) {
				10600	gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
				10601	gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10602	}
				10603	print_numa_stats(m, node, tsf, tpf, gsf, gpf);
				10604	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10605	rcu_read_unlock();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10606	}
				10607	#endif /* CONFIG_NUMA_BALANCING */
				10608	#endif /* CONFIG_SCHED_DEBUG */
				10609
				10610	__init void init_sched_fair_class(void)
				10611	{
				10612	#ifdef CONFIG_SMP
				10613	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
				10614
				10615	#ifdef CONFIG_NO_HZ_COMMON
				10616	nohz.next_balance = jiffies;
				10617	nohz.next_blocked = jiffies;
				10618	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
				10619	#endif
				10620	#endif /* SMP */
				10621
				10622	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10623
				10624	/*
				10625	* Helper functions to facilitate extracting info from tracepoints.
				10626	*/
				10627
				10628	const struct sched_avg sched_trace_cfs_rq_avg(struct cfs_rq cfs_rq)
				10629	{
				10630	#ifdef CONFIG_SMP
				10631	return cfs_rq ? &cfs_rq->avg : NULL;
				10632	#else
				10633	return NULL;
				10634	#endif
				10635	}
				10636	EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
				10637
				10638	char sched_trace_cfs_rq_path(struct cfs_rq cfs_rq, char *str, int len)
				10639	{
				10640	if (!cfs_rq) {
				10641	if (str)
				10642	strlcpy(str, "(null)", len);
				10643	else
				10644	return NULL;
				10645	}
				10646
				10647	cfs_rq_tg_path(cfs_rq, str, len);
				10648	return str;
				10649	}
				10650	EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
				10651
				10652	int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
				10653	{
				10654	return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
				10655	}
				10656	EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
				10657
				10658	const struct sched_avg sched_trace_rq_avg_rt(struct rq rq)
				10659	{
				10660	#ifdef CONFIG_SMP
				10661	return rq ? &rq->avg_rt : NULL;
				10662	#else
				10663	return NULL;
				10664	#endif
				10665	}
				10666	EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
				10667
				10668	const struct sched_avg sched_trace_rq_avg_dl(struct rq rq)
				10669	{
				10670	#ifdef CONFIG_SMP
				10671	return rq ? &rq->avg_dl : NULL;
				10672	#else
				10673	return NULL;
				10674	#endif
				10675	}
				10676	EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
				10677
				10678	const struct sched_avg sched_trace_rq_avg_irq(struct rq rq)
				10679	{
				10680	#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
				10681	return rq ? &rq->avg_irq : NULL;
				10682	#else
				10683	return NULL;
				10684	#endif
				10685	}
				10686	EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
				10687
				10688	int sched_trace_rq_cpu(struct rq *rq)
				10689	{
				10690	return rq ? cpu_of(rq) : -1;
				10691	}
				10692	EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
				10693
				10694	const struct cpumask sched_trace_rd_span(struct root_domain rd)
				10695	{
				10696	#ifdef CONFIG_SMP
				10697	return rd ? rd->span : NULL;
				10698	#else
				10699	return NULL;
				10700	#endif
				10701	}
				10702	EXPORT_SYMBOL_GPL(sched_trace_rd_span);