Blame - kernel/cgroup/cpuset.c - hafnium/third_party/linux

blob: badfa8f153599dedd8841f80f8a5fb75456a1e97 [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	/*
				2	* kernel/cpuset.c
				3	*
				4	* Processor and Memory placement constraints for sets of tasks.
				5	*
				6	* Copyright (C) 2003 BULL SA.
				7	* Copyright (C) 2004-2007 Silicon Graphics, Inc.
				8	* Copyright (C) 2006 Google, Inc
				9	*
				10	* Portions derived from Patrick Mochel's sysfs code.
				11	* sysfs is Copyright (c) 2001-3 Patrick Mochel
				12	*
				13	* 2003-10-10 Written by Simon Derr.
				14	* 2003-10-22 Updates by Stephen Hemminger.
				15	* 2004 May-July Rework by Paul Jackson.
				16	* 2006 Rework by Paul Menage to use generic cgroups
				17	* 2008 Rework of the scheduler domains and CPU hotplug handling
				18	* by Max Krasnyansky
				19	*
				20	* This file is subject to the terms and conditions of the GNU General Public
				21	* License. See the file COPYING in the main directory of the Linux
				22	* distribution for more details.
				23	*/
				24
				25	#include <linux/cpu.h>
				26	#include <linux/cpumask.h>
				27	#include <linux/cpuset.h>
				28	#include <linux/err.h>
				29	#include <linux/errno.h>
				30	#include <linux/file.h>
				31	#include <linux/fs.h>
				32	#include <linux/init.h>
				33	#include <linux/interrupt.h>
				34	#include <linux/kernel.h>
				35	#include <linux/kmod.h>
				36	#include <linux/list.h>
				37	#include <linux/mempolicy.h>
				38	#include <linux/mm.h>
				39	#include <linux/memory.h>
				40	#include <linux/export.h>
				41	#include <linux/mount.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	42	#include <linux/fs_context.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	43	#include <linux/namei.h>
				44	#include <linux/pagemap.h>
				45	#include <linux/proc_fs.h>
				46	#include <linux/rcupdate.h>
				47	#include <linux/sched.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	48	#include <linux/sched/deadline.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	49	#include <linux/sched/mm.h>
				50	#include <linux/sched/task.h>
				51	#include <linux/seq_file.h>
				52	#include <linux/security.h>
				53	#include <linux/slab.h>
				54	#include <linux/spinlock.h>
				55	#include <linux/stat.h>
				56	#include <linux/string.h>
				57	#include <linux/time.h>
				58	#include <linux/time64.h>
				59	#include <linux/backing-dev.h>
				60	#include <linux/sort.h>
				61	#include <linux/oom.h>
				62	#include <linux/sched/isolation.h>
				63	#include <linux/uaccess.h>
				64	#include <linux/atomic.h>
				65	#include <linux/mutex.h>
				66	#include <linux/cgroup.h>
				67	#include <linux/wait.h>
				68
				69	DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
				70	DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
				71
				72	/* See "Frequency meter" comments, below. */
				73
				74	struct fmeter {
				75	int cnt; /* unprocessed events count */
				76	int val; /* most recent output value */
				77	time64_t time; /* clock (secs) when val computed */
				78	spinlock_t lock; /* guards read or write of above */
				79	};
				80
				81	struct cpuset {
				82	struct cgroup_subsys_state css;
				83
				84	unsigned long flags; /* "unsigned long" so bitops work */
				85
				86	/*
				87	* On default hierarchy:
				88	*
				89	* The user-configured masks can only be changed by writing to
				90	* cpuset.cpus and cpuset.mems, and won't be limited by the
				91	* parent masks.
				92	*
				93	* The effective masks is the real masks that apply to the tasks
				94	* in the cpuset. They may be changed if the configured masks are
				95	* changed or hotplug happens.
				96	*
				97	* effective_mask == configured_mask & parent's effective_mask,
				98	* and if it ends up empty, it will inherit the parent's mask.
				99	*
				100	*
				101	* On legacy hierachy:
				102	*
				103	* The user-configured masks are always the same with effective masks.
				104	*/
				105
				106	/* user-configured CPUs and Memory Nodes allow to tasks */
				107	cpumask_var_t cpus_allowed;
				108	nodemask_t mems_allowed;
				109
				110	/* effective CPUs and Memory Nodes allow to tasks */
				111	cpumask_var_t effective_cpus;
				112	nodemask_t effective_mems;
				113
				114	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	115	* CPUs allocated to child sub-partitions (default hierarchy only)
				116	* - CPUs granted by the parent = effective_cpus U subparts_cpus
				117	* - effective_cpus and subparts_cpus are mutually exclusive.
				118	*
				119	* effective_cpus contains only onlined CPUs, but subparts_cpus
				120	* may have offlined ones.
				121	*/
				122	cpumask_var_t subparts_cpus;
				123
				124	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	125	* This is old Memory Nodes tasks took on.
				126	*
				127	* - top_cpuset.old_mems_allowed is initialized to mems_allowed.
				128	* - A new cpuset's old_mems_allowed is initialized when some
				129	* task is moved into it.
				130	* - old_mems_allowed is used in cpuset_migrate_mm() when we change
				131	* cpuset.mems_allowed and have tasks' nodemask updated, and
				132	* then old_mems_allowed is updated to mems_allowed.
				133	*/
				134	nodemask_t old_mems_allowed;
				135
				136	struct fmeter fmeter; /* memory_pressure filter */
				137
				138	/*
				139	* Tasks are being attached to this cpuset. Used to prevent
				140	* zeroing cpus/mems_allowed between ->can_attach() and ->attach().
				141	*/
				142	int attach_in_progress;
				143
				144	/* partition number for rebuild_sched_domains() */
				145	int pn;
				146
				147	/* for custom sched domain */
				148	int relax_domain_level;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	149
				150	/* number of CPUs in subparts_cpus */
				151	int nr_subparts_cpus;
				152
				153	/* partition root state */
				154	int partition_root_state;
				155
				156	/*
				157	* Default hierarchy only:
				158	* use_parent_ecpus - set if using parent's effective_cpus
				159	* child_ecpus_count - # of children with use_parent_ecpus set
				160	*/
				161	int use_parent_ecpus;
				162	int child_ecpus_count;
				163	};
				164
				165	/*
				166	* Partition root states:
				167	*
				168	* 0 - not a partition root
				169	*
				170	* 1 - partition root
				171	*
				172	* -1 - invalid partition root
				173	* None of the cpus in cpus_allowed can be put into the parent's
				174	* subparts_cpus. In this case, the cpuset is not a real partition
				175	* root anymore. However, the CPU_EXCLUSIVE bit will still be set
				176	* and the cpuset can be restored back to a partition root if the
				177	* parent cpuset can give more CPUs back to this child cpuset.
				178	*/
				179	#define PRS_DISABLED 0
				180	#define PRS_ENABLED 1
				181	#define PRS_ERROR -1
				182
				183	/*
				184	* Temporary cpumasks for working with partitions that are passed among
				185	* functions to avoid memory allocation in inner functions.
				186	*/
				187	struct tmpmasks {
				188	cpumask_var_t addmask, delmask; /* For partition root */
				189	cpumask_var_t new_cpus; /* For update_cpumasks_hier() */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	190	};
				191
				192	static inline struct cpuset css_cs(struct cgroup_subsys_state css)
				193	{
				194	return css ? container_of(css, struct cpuset, css) : NULL;
				195	}
				196
				197	/* Retrieve the cpuset for a task */
				198	static inline struct cpuset task_cs(struct task_struct task)
				199	{
				200	return css_cs(task_css(task, cpuset_cgrp_id));
				201	}
				202
				203	static inline struct cpuset parent_cs(struct cpuset cs)
				204	{
				205	return css_cs(cs->css.parent);
				206	}
				207
/*
 * Bits in struct cpuset flags field.  The enumerators are bit numbers
 * (used with test_bit() and as shift counts), not masks.
 */
typedef enum {
	CS_ONLINE,
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;
				219
				220	/* convenient tests for these bits */
				221	static inline bool is_cpuset_online(struct cpuset *cs)
				222	{
				223	return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
				224	}
				225
				226	static inline int is_cpu_exclusive(const struct cpuset *cs)
				227	{
				228	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
				229	}
				230
				231	static inline int is_mem_exclusive(const struct cpuset *cs)
				232	{
				233	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
				234	}
				235
				236	static inline int is_mem_hardwall(const struct cpuset *cs)
				237	{
				238	return test_bit(CS_MEM_HARDWALL, &cs->flags);
				239	}
				240
				241	static inline int is_sched_load_balance(const struct cpuset *cs)
				242	{
				243	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
				244	}
				245
				246	static inline int is_memory_migrate(const struct cpuset *cs)
				247	{
				248	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
				249	}
				250
				251	static inline int is_spread_page(const struct cpuset *cs)
				252	{
				253	return test_bit(CS_SPREAD_PAGE, &cs->flags);
				254	}
				255
				256	static inline int is_spread_slab(const struct cpuset *cs)
				257	{
				258	return test_bit(CS_SPREAD_SLAB, &cs->flags);
				259	}
				260
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	261	static inline int is_partition_root(const struct cpuset *cs)
				262	{
				263	return cs->partition_root_state > 0;
				264	}
				265
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	266	static struct cpuset top_cpuset = {
				267	.flags = ((1 << CS_ONLINE) \| (1 << CS_CPU_EXCLUSIVE) \|
				268	(1 << CS_MEM_EXCLUSIVE)),
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	269	.partition_root_state = PRS_ENABLED,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	270	};
				271
				272	/**
				273	* cpuset_for_each_child - traverse online children of a cpuset
				274	* @child_cs: loop cursor pointing to the current child
				275	* @pos_css: used for iteration
				276	* @parent_cs: target cpuset to walk children of
				277	*
				278	* Walk @child_cs through the online children of @parent_cs. Must be used
				279	* with RCU read locked.
				280	*/
				281	#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
				282	css_for_each_child((pos_css), &(parent_cs)->css) \
				283	if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
				284
				285	/**
				286	* cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
				287	* @des_cs: loop cursor pointing to the current descendant
				288	* @pos_css: used for iteration
				289	* @root_cs: target cpuset to walk ancestor of
				290	*
				291	* Walk @des_cs through the online descendants of @root_cs. Must be used
				292	* with RCU read locked. The caller may modify @pos_css by calling
				293	* css_rightmost_descendant() to skip subtree. @root_cs is included in the
				294	* iteration and the first node to be visited.
				295	*/
				296	#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
				297	css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
				298	if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
				299
				300	/*
				301	* There are two global locks guarding cpuset structures - cpuset_mutex and
				302	* callback_lock. We also require taking task_lock() when dereferencing a
				303	* task's cpuset pointer. See "The task_lock() exception", at the end of this
				304	* comment.
				305	*
				306	* A task must hold both locks to modify cpusets. If a task holds
				307	* cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
				308	* is the only task able to also acquire callback_lock and be able to
				309	* modify cpusets. It can perform various checks on the cpuset structure
				310	* first, knowing nothing will change. It can also allocate memory while
				311	* just holding cpuset_mutex. While it is performing these checks, various
				312	* callback routines can briefly acquire callback_lock to query cpusets.
				313	* Once it is ready to make the changes, it takes callback_lock, blocking
				314	* everyone else.
				315	*
				316	* Calls to the kernel memory allocator can not be made while holding
				317	* callback_lock, as that would risk double tripping on callback_lock
				318	* from one of the callbacks into the cpuset code from within
				319	* __alloc_pages().
				320	*
				321	* If a task is only holding callback_lock, then it has read-only
				322	* access to cpusets.
				323	*
				324	* Now, the task_struct fields mems_allowed and mempolicy may be changed
				325	* by other task, we use alloc_lock in the task_struct fields to protect
				326	* them.
				327	*
				328	* The cpuset_common_file_read() handlers only hold callback_lock across
				329	* small pieces of code, such as when reading out possibly multi-word
				330	* cpumasks and nodemasks.
				331	*
				332	* Accessing a task's cpuset should be done in accordance with the
				333	* guidelines for accessing subsystem state in kernel/cgroup.c
				334	*/
				335
/* Per-CPU rw-semaphore taken for reading by cpuset_read_lock(). */
DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
				337
				338	void cpuset_read_lock(void)
				339	{
				340	percpu_down_read(&cpuset_rwsem);
				341	}
				342
				343	void cpuset_read_unlock(void)
				344	{
				345	percpu_up_read(&cpuset_rwsem);
				346	}
				347
/* Spinlock described in the locking overview comment above. */
static DEFINE_SPINLOCK(callback_lock);

static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * CPU / memory hotplug is handled asynchronously.
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
				359
				360	/*
				361	* Cgroup v2 behavior is used when on default hierarchy or the
				362	* cgroup_v2_mode flag is set.
				363	*/
				364	static inline bool is_in_v2_mode(void)
				365	{
				366	return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) \|\|
				367	(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
				368	}
				369
				370	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	371	* Return in pmask the portion of a cpusets's cpus_allowed that
				372	* are online. If none are online, walk up the cpuset hierarchy
				373	* until we find one that does have some online cpus.
				374	*
				375	* One way or another, we guarantee to return some non-empty subset
				376	* of cpu_online_mask.
				377	*
				378	* Call with callback_lock or cpuset_mutex held.
				379	*/
				380	static void guarantee_online_cpus(struct cpuset cs, struct cpumask pmask)
				381	{
				382	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
				383	cs = parent_cs(cs);
				384	if (unlikely(!cs)) {
				385	/*
				386	* The top cpuset doesn't have any online cpu as a
				387	* consequence of a race between cpuset_hotplug_work
				388	* and cpu hotplug notifier. But we know the top
				389	* cpuset's effective_cpus is on its way to to be
				390	* identical to cpu_online_mask.
				391	*/
				392	cpumask_copy(pmask, cpu_online_mask);
				393	return;
				394	}
				395	}
				396	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
				397	}
				398
				399	/*
				400	* Return in *pmask the portion of a cpusets's mems_allowed that
				401	* are online, with memory. If none are online with memory, walk
				402	* up the cpuset hierarchy until we find one that does have some
				403	* online mems. The top cpuset always has some mems online.
				404	*
				405	* One way or another, we guarantee to return some non-empty subset
				406	* of node_states[N_MEMORY].
				407	*
				408	* Call with callback_lock or cpuset_mutex held.
				409	*/
				410	static void guarantee_online_mems(struct cpuset cs, nodemask_t pmask)
				411	{
				412	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
				413	cs = parent_cs(cs);
				414	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
				415	}
				416
				417	/*
				418	* update task's spread flag if cpuset's page/slab spread flag is set
				419	*
				420	* Call with callback_lock or cpuset_mutex held.
				421	*/
				422	static void cpuset_update_task_spread_flag(struct cpuset *cs,
				423	struct task_struct *tsk)
				424	{
				425	if (is_spread_page(cs))
				426	task_set_spread_page(tsk);
				427	else
				428	task_clear_spread_page(tsk);
				429
				430	if (is_spread_slab(cs))
				431	task_set_spread_slab(tsk);
				432	else
				433	task_clear_spread_slab(tsk);
				434	}
				435
				436	/*
				437	* is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
				438	*
				439	* One cpuset is a subset of another if all its allowed CPUs and
				440	* Memory Nodes are a subset of the other, and its exclusive flags
				441	* are only set if the other's are set. Call holding cpuset_mutex.
				442	*/
				443
				444	static int is_cpuset_subset(const struct cpuset p, const struct cpuset q)
				445	{
				446	return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
				447	nodes_subset(p->mems_allowed, q->mems_allowed) &&
				448	is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
				449	is_mem_exclusive(p) <= is_mem_exclusive(q);
				450	}
				451
				452	/**
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	453	* alloc_cpumasks - allocate three cpumasks for cpuset
				454	* @cs: the cpuset that have cpumasks to be allocated.
				455	* @tmp: the tmpmasks structure pointer
				456	* Return: 0 if successful, -ENOMEM otherwise.
				457	*
				458	* Only one of the two input arguments should be non-NULL.
				459	*/
				460	static inline int alloc_cpumasks(struct cpuset cs, struct tmpmasks tmp)
				461	{
				462	cpumask_var_t pmask1, pmask2, *pmask3;
				463
				464	if (cs) {
				465	pmask1 = &cs->cpus_allowed;
				466	pmask2 = &cs->effective_cpus;
				467	pmask3 = &cs->subparts_cpus;
				468	} else {
				469	pmask1 = &tmp->new_cpus;
				470	pmask2 = &tmp->addmask;
				471	pmask3 = &tmp->delmask;
				472	}
				473
				474	if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
				475	return -ENOMEM;
				476
				477	if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
				478	goto free_one;
				479
				480	if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
				481	goto free_two;
				482
				483	return 0;
				484
				485	free_two:
				486	free_cpumask_var(*pmask2);
				487	free_one:
				488	free_cpumask_var(*pmask1);
				489	return -ENOMEM;
				490	}
				491
				492	/**
				493	* free_cpumasks - free cpumasks in a tmpmasks structure
				494	* @cs: the cpuset that have cpumasks to be free.
				495	* @tmp: the tmpmasks structure pointer
				496	*/
				497	static inline void free_cpumasks(struct cpuset cs, struct tmpmasks tmp)
				498	{
				499	if (cs) {
				500	free_cpumask_var(cs->cpus_allowed);
				501	free_cpumask_var(cs->effective_cpus);
				502	free_cpumask_var(cs->subparts_cpus);
				503	}
				504	if (tmp) {
				505	free_cpumask_var(tmp->new_cpus);
				506	free_cpumask_var(tmp->addmask);
				507	free_cpumask_var(tmp->delmask);
				508	}
				509	}
				510
				511	/**
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	512	* alloc_trial_cpuset - allocate a trial cpuset
				513	* @cs: the cpuset that the trial cpuset duplicates
				514	*/
				515	static struct cpuset alloc_trial_cpuset(struct cpuset cs)
				516	{
				517	struct cpuset *trial;
				518
				519	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
				520	if (!trial)
				521	return NULL;
				522
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	523	if (alloc_cpumasks(trial, NULL)) {
				524	kfree(trial);
				525	return NULL;
				526	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	527
				528	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
				529	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
				530	return trial;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	531	}
				532
				533	/**
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	534	* free_cpuset - free the cpuset
				535	* @cs: the cpuset to be freed
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	536	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	537	static inline void free_cpuset(struct cpuset *cs)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	538	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	539	free_cpumasks(cs, NULL);
				540	kfree(cs);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	541	}
				542
				543	/*
				544	* validate_change() - Used to validate that any proposed cpuset change
				545	* follows the structural rules for cpusets.
				546	*
				547	* If we replaced the flag and mask values of the current cpuset
				548	* (cur) with those values in the trial cpuset (trial), would
				549	* our various subset and exclusive rules still be valid? Presumes
				550	* cpuset_mutex held.
				551	*
				552	* 'cur' is the address of an actual, in-use cpuset. Operations
				553	* such as list traversal that depend on the actual address of the
				554	* cpuset in the list must use cur below, not trial.
				555	*
				556	* 'trial' is the address of bulk structure copy of cur, with
				557	* perhaps one or more of the fields cpus_allowed, mems_allowed,
				558	* or flags changed to new, trial values.
				559	*
				560	* Return 0 if valid, -errno if not.
				561	*/
				562
				563	static int validate_change(struct cpuset cur, struct cpuset trial)
				564	{
				565	struct cgroup_subsys_state *css;
				566	struct cpuset c, par;
				567	int ret;
				568
				569	rcu_read_lock();
				570
				571	/* Each of our child cpusets must be a subset of us */
				572	ret = -EBUSY;
				573	cpuset_for_each_child(c, css, cur)
				574	if (!is_cpuset_subset(c, trial))
				575	goto out;
				576
				577	/* Remaining checks don't apply to root cpuset */
				578	ret = 0;
				579	if (cur == &top_cpuset)
				580	goto out;
				581
				582	par = parent_cs(cur);
				583
				584	/* On legacy hiearchy, we must be a subset of our parent cpuset. */
				585	ret = -EACCES;
				586	if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
				587	goto out;
				588
				589	/*
				590	* If either I or some sibling (!= me) is exclusive, we can't
				591	* overlap
				592	*/
				593	ret = -EINVAL;
				594	cpuset_for_each_child(c, css, par) {
				595	if ((is_cpu_exclusive(trial) \|\| is_cpu_exclusive(c)) &&
				596	c != cur &&
				597	cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
				598	goto out;
				599	if ((is_mem_exclusive(trial) \|\| is_mem_exclusive(c)) &&
				600	c != cur &&
				601	nodes_intersects(trial->mems_allowed, c->mems_allowed))
				602	goto out;
				603	}
				604
				605	/*
				606	* Cpusets with tasks - existing or newly being attached - can't
				607	* be changed to have empty cpus_allowed or mems_allowed.
				608	*/
				609	ret = -ENOSPC;
				610	if ((cgroup_is_populated(cur->css.cgroup) \|\| cur->attach_in_progress)) {
				611	if (!cpumask_empty(cur->cpus_allowed) &&
				612	cpumask_empty(trial->cpus_allowed))
				613	goto out;
				614	if (!nodes_empty(cur->mems_allowed) &&
				615	nodes_empty(trial->mems_allowed))
				616	goto out;
				617	}
				618
				619	/*
				620	* We can't shrink if we won't have enough room for SCHED_DEADLINE
				621	* tasks.
				622	*/
				623	ret = -EBUSY;
				624	if (is_cpu_exclusive(cur) &&
				625	!cpuset_cpumask_can_shrink(cur->cpus_allowed,
				626	trial->cpus_allowed))
				627	goto out;
				628
				629	ret = 0;
				630	out:
				631	rcu_read_unlock();
				632	return ret;
				633	}
				634
				635	#ifdef CONFIG_SMP
				636	/*
				637	* Helper routine for generate_sched_domains().
				638	* Do cpusets a, b have overlapping effective cpus_allowed masks?
				639	*/
				640	static int cpusets_overlap(struct cpuset a, struct cpuset b)
				641	{
				642	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
				643	}
				644
				645	static void
				646	update_domain_attr(struct sched_domain_attr dattr, struct cpuset c)
				647	{
				648	if (dattr->relax_domain_level < c->relax_domain_level)
				649	dattr->relax_domain_level = c->relax_domain_level;
				650	return;
				651	}
				652
				653	static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				654	struct cpuset *root_cs)
				655	{
				656	struct cpuset *cp;
				657	struct cgroup_subsys_state *pos_css;
				658
				659	rcu_read_lock();
				660	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
				661	/* skip the whole subtree if @cp doesn't have any CPU */
				662	if (cpumask_empty(cp->cpus_allowed)) {
				663	pos_css = css_rightmost_descendant(pos_css);
				664	continue;
				665	}
				666
				667	if (is_sched_load_balance(cp))
				668	update_domain_attr(dattr, cp);
				669	}
				670	rcu_read_unlock();
				671	}
				672
				673	/* Must be called with cpuset_mutex held. */
				674	static inline int nr_cpusets(void)
				675	{
				676	/* jump label reference count + the top-level cpuset */
				677	return static_key_count(&cpusets_enabled_key.key) + 1;
				678	}
				679
				680	/*
				681	* generate_sched_domains()
				682	*
				683	* This function builds a partial partition of the systems CPUs
				684	* A 'partial partition' is a set of non-overlapping subsets whose
				685	* union is a subset of that set.
				686	* The output of this function needs to be passed to kernel/sched/core.c
				687	* partition_sched_domains() routine, which will rebuild the scheduler's
				688	* load balancing domains (sched domains) as specified by that partial
				689	* partition.
				690	*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	691	* See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	692	* for a background explanation of this.
				693	*
				694	* Does not return errors, on the theory that the callers of this
				695	* routine would rather not worry about failures to rebuild sched
				696	* domains when operating in the severe memory shortage situations
				697	* that could cause allocation failures below.
				698	*
				699	* Must be called with cpuset_mutex held.
				700	*
				701	* The three key local variables below are:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	702	* cp - cpuset pointer, used (together with pos_css) to perform a
				703	* top-down scan of all cpusets. For our purposes, rebuilding
				704	* the schedulers sched domains, we can ignore !is_sched_load_
				705	* balance cpusets.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	706	* csa - (for CpuSet Array) Array of pointers to all the cpusets
				707	* that need to be load balanced, for convenient iterative
				708	* access by the subsequent code that finds the best partition,
				709	* i.e the set of domains (subsets) of CPUs such that the
				710	* cpus_allowed of every cpuset marked is_sched_load_balance
				711	* is a subset of one of these domains, while there are as
				712	* many such domains as possible, each as small as possible.
				713	* doms - Conversion of 'csa' to an array of cpumasks, for passing to
				714	* the kernel/sched/core.c routine partition_sched_domains() in a
				715	* convenient format, that can be easily compared to the prior
				716	* value to determine what partition elements (sched domains)
				717	* were changed (added or removed.)
				718	*
				719	* Finding the best partition (set of domains):
				720	* The triple nested loops below over i, j, k scan over the
				721	* load balanced cpusets (using the array of cpuset pointers in
				722	* csa[]) looking for pairs of cpusets that have overlapping
				723	* cpus_allowed, but which don't have the same 'pn' partition
				724	* number and gives them in the same partition number. It keeps
				725	* looping on the 'restart' label until it can no longer find
				726	* any such pairs.
				727	*
				728	* The union of the cpus_allowed masks from the set of
				729	* all cpusets having the same 'pn' value then form the one
				730	* element of the partition (one sched domain) to be passed to
				731	* partition_sched_domains().
				732	*/
				733	static int generate_sched_domains(cpumask_var_t **domains,
				734	struct sched_domain_attr **attributes)
				735	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	736	struct cpuset cp; / top-down scan of cpusets */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	737	struct cpuset *csa; / array of all cpuset ptrs */
				738	int csn; /* how many cpuset ptrs in csa so far */
				739	int i, j, k; /* indices for partition finding loops */
				740	cpumask_var_t doms; / resulting partition; i.e. sched domains */
				741	struct sched_domain_attr dattr; / attributes for custom domains */
				742	int ndoms = 0; /* number of sched domains in result */
				743	int nslot; /* next empty doms[] struct cpumask slot */
				744	struct cgroup_subsys_state *pos_css;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	745	bool root_load_balance = is_sched_load_balance(&top_cpuset);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	746
				747	doms = NULL;
				748	dattr = NULL;
				749	csa = NULL;
				750
				751	/* Special case for the 99% of systems with one, full, sched domain */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	752	if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	753	ndoms = 1;
				754	doms = alloc_sched_domains(ndoms);
				755	if (!doms)
				756	goto done;
				757
				758	dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
				759	if (dattr) {
				760	*dattr = SD_ATTR_INIT;
				761	update_domain_attr_tree(dattr, &top_cpuset);
				762	}
				763	cpumask_and(doms[0], top_cpuset.effective_cpus,
				764	housekeeping_cpumask(HK_FLAG_DOMAIN));
				765
				766	goto done;
				767	}
				768
				769	csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
				770	if (!csa)
				771	goto done;
				772	csn = 0;
				773
				774	rcu_read_lock();
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	775	if (root_load_balance)
				776	csa[csn++] = &top_cpuset;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	777	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
				778	if (cp == &top_cpuset)
				779	continue;
				780	/*
				781	* Continue traversing beyond @cp iff @cp has some CPUs and
				782	* isn't load balancing. The former is obvious. The
				783	* latter: All child cpusets contain a subset of the
				784	* parent's cpus, so just skip them, and then we call
				785	* update_domain_attr_tree() to calc relax_domain_level of
				786	* the corresponding sched domain.
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	787	*
				788	* If root is load-balancing, we can skip @cp if it
				789	* is a subset of the root's effective_cpus.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	790	*/
				791	if (!cpumask_empty(cp->cpus_allowed) &&
				792	!(is_sched_load_balance(cp) &&
				793	cpumask_intersects(cp->cpus_allowed,
				794	housekeeping_cpumask(HK_FLAG_DOMAIN))))
				795	continue;
				796
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	797	if (root_load_balance &&
				798	cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
				799	continue;
				800
				801	if (is_sched_load_balance(cp) &&
				802	!cpumask_empty(cp->effective_cpus))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	803	csa[csn++] = cp;
				804
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	805	/* skip @cp's subtree if not a partition root */
				806	if (!is_partition_root(cp))
				807	pos_css = css_rightmost_descendant(pos_css);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	808	}
				809	rcu_read_unlock();
				810
				811	for (i = 0; i < csn; i++)
				812	csa[i]->pn = i;
				813	ndoms = csn;
				814
				815	restart:
				816	/* Find the best partition (set of sched domains) */
				817	for (i = 0; i < csn; i++) {
				818	struct cpuset *a = csa[i];
				819	int apn = a->pn;
				820
				821	for (j = 0; j < csn; j++) {
				822	struct cpuset *b = csa[j];
				823	int bpn = b->pn;
				824
				825	if (apn != bpn && cpusets_overlap(a, b)) {
				826	for (k = 0; k < csn; k++) {
				827	struct cpuset *c = csa[k];
				828
				829	if (c->pn == bpn)
				830	c->pn = apn;
				831	}
				832	ndoms--; /* one less element */
				833	goto restart;
				834	}
				835	}
				836	}
				837
				838	/*
				839	* Now we know how many domains to create.
				840	* Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
				841	*/
				842	doms = alloc_sched_domains(ndoms);
				843	if (!doms)
				844	goto done;
				845
				846	/*
				847	* The rest of the code, including the scheduler, can deal with
				848	* dattr==NULL case. No need to abort if alloc fails.
				849	*/
				850	dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
				851	GFP_KERNEL);
				852
				853	for (nslot = 0, i = 0; i < csn; i++) {
				854	struct cpuset *a = csa[i];
				855	struct cpumask *dp;
				856	int apn = a->pn;
				857
				858	if (apn < 0) {
				859	/* Skip completed partitions */
				860	continue;
				861	}
				862
				863	dp = doms[nslot];
				864
				865	if (nslot == ndoms) {
				866	static int warnings = 10;
				867	if (warnings) {
				868	pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
				869	nslot, ndoms, csn, i, apn);
				870	warnings--;
				871	}
				872	continue;
				873	}
				874
				875	cpumask_clear(dp);
				876	if (dattr)
				877	*(dattr + nslot) = SD_ATTR_INIT;
				878	for (j = i; j < csn; j++) {
				879	struct cpuset *b = csa[j];
				880
				881	if (apn == b->pn) {
				882	cpumask_or(dp, dp, b->effective_cpus);
				883	cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
				884	if (dattr)
				885	update_domain_attr_tree(dattr + nslot, b);
				886
				887	/* Done with this partition */
				888	b->pn = -1;
				889	}
				890	}
				891	nslot++;
				892	}
				893	BUG_ON(nslot != ndoms);
				894
				895	done:
				896	kfree(csa);
				897
				898	/*
				899	* Fallback to the default domain if kmalloc() failed.
				900	* See comments in partition_sched_domains().
				901	*/
				902	if (doms == NULL)
				903	ndoms = 1;
				904
				905	*domains = doms;
				906	*attributes = dattr;
				907	return ndoms;
				908	}
				909
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	910	static void update_tasks_root_domain(struct cpuset *cs)
				911	{
				912	struct css_task_iter it;
				913	struct task_struct *task;
				914
				915	css_task_iter_start(&cs->css, 0, &it);
				916
				917	while ((task = css_task_iter_next(&it)))
				918	dl_add_task_root_domain(task);
				919
				920	css_task_iter_end(&it);
				921	}
				922
				923	static void rebuild_root_domains(void)
				924	{
				925	struct cpuset *cs = NULL;
				926	struct cgroup_subsys_state *pos_css;
				927
				928	percpu_rwsem_assert_held(&cpuset_rwsem);
				929	lockdep_assert_cpus_held();
				930	lockdep_assert_held(&sched_domains_mutex);
				931
				932	cgroup_enable_task_cg_lists();
				933
				934	rcu_read_lock();
				935
				936	/*
				937	* Clear default root domain DL accounting, it will be computed again
				938	* if a task belongs to it.
				939	*/
				940	dl_clear_root_domain(&def_root_domain);
				941
				942	cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
				943
				944	if (cpumask_empty(cs->effective_cpus)) {
				945	pos_css = css_rightmost_descendant(pos_css);
				946	continue;
				947	}
				948
				949	css_get(&cs->css);
				950
				951	rcu_read_unlock();
				952
				953	update_tasks_root_domain(cs);
				954
				955	rcu_read_lock();
				956	css_put(&cs->css);
				957	}
				958	rcu_read_unlock();
				959	}
				960
				961	static void
				962	partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
				963	struct sched_domain_attr *dattr_new)
				964	{
				965	mutex_lock(&sched_domains_mutex);
				966	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
				967	rebuild_root_domains();
				968	mutex_unlock(&sched_domains_mutex);
				969	}
				970
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	971	/*
				972	* Rebuild scheduler domains.
				973	*
				974	* If the flag 'sched_load_balance' of any cpuset with non-empty
				975	* 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
				976	* which has that flag enabled, or if any cpuset with a non-empty
				977	* 'cpus' is removed, then call this routine to rebuild the
				978	* scheduler's dynamic sched domains.
				979	*
				980	* Call with cpuset_mutex held. Takes get_online_cpus().
				981	*/
				982	static void rebuild_sched_domains_locked(void)
				983	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	984	struct cgroup_subsys_state *pos_css;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	985	struct sched_domain_attr *attr;
				986	cpumask_var_t *doms;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	987	struct cpuset *cs;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	988	int ndoms;
				989
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	990	lockdep_assert_cpus_held();
				991	percpu_rwsem_assert_held(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	992
				993	/*
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	994	* If we have raced with CPU hotplug, return early to avoid
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	995	* passing doms with offlined cpu to partition_sched_domains().
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	996	* Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
				997	*
				998	* With no CPUs in any subpartitions, top_cpuset's effective CPUs
				999	* should be the same as the active CPUs, so checking only top_cpuset
				1000	* is enough to detect racing CPU offlines.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1001	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1002	if (!top_cpuset.nr_subparts_cpus &&
				1003	!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
				1004	return;
				1005
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1006	/*
				1007	* With subpartition CPUs, however, the effective CPUs of a partition
				1008	* root should be only a subset of the active CPUs. Since a CPU in any
				1009	* partition root could be offlined, all must be checked.
				1010	*/
				1011	if (top_cpuset.nr_subparts_cpus) {
				1012	rcu_read_lock();
				1013	cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
				1014	if (!is_partition_root(cs)) {
				1015	pos_css = css_rightmost_descendant(pos_css);
				1016	continue;
				1017	}
				1018	if (!cpumask_subset(cs->effective_cpus,
				1019	cpu_active_mask)) {
				1020	rcu_read_unlock();
				1021	return;
				1022	}
				1023	}
				1024	rcu_read_unlock();
				1025	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1026
				1027	/* Generate domain masks and attrs */
				1028	ndoms = generate_sched_domains(&doms, &attr);
				1029
				1030	/* Have scheduler rebuild the domains */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1031	partition_and_rebuild_sched_domains(ndoms, doms, attr);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1032	}
				1033	#else /* !CONFIG_SMP */
				1034	static void rebuild_sched_domains_locked(void)
				1035	{
				1036	}
				1037	#endif /* CONFIG_SMP */
				1038
				1039	void rebuild_sched_domains(void)
				1040	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1041	get_online_cpus();
				1042	percpu_down_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1043	rebuild_sched_domains_locked();
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1044	percpu_up_write(&cpuset_rwsem);
				1045	put_online_cpus();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1046	}
				1047
				1048	/**
				1049	* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
				1050	* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
				1051	*
				1052	* Iterate through each task of @cs updating its cpus_allowed to the
				1053	* effective cpuset's. As this function is called with cpuset_mutex held,
				1054	* cpuset membership stays stable.
				1055	*/
				1056	static void update_tasks_cpumask(struct cpuset *cs)
				1057	{
				1058	struct css_task_iter it;
				1059	struct task_struct *task;
				1060
				1061	css_task_iter_start(&cs->css, 0, &it);
				1062	while ((task = css_task_iter_next(&it)))
				1063	set_cpus_allowed_ptr(task, cs->effective_cpus);
				1064	css_task_iter_end(&it);
				1065	}
				1066
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1067	/**
				1068	* compute_effective_cpumask - Compute the effective cpumask of the cpuset
				1069	* @new_cpus: the temp variable for the new effective_cpus mask
				1070	* @cs: the cpuset the need to recompute the new effective_cpus mask
				1071	* @parent: the parent cpuset
				1072	*
				1073	* If the parent has subpartition CPUs, include them in the list of
				1074	* allowable CPUs in computing the new effective_cpus mask. Since offlined
				1075	* CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
				1076	* to mask those out.
				1077	*/
				1078	static void compute_effective_cpumask(struct cpumask *new_cpus,
				1079	struct cpuset cs, struct cpuset parent)
				1080	{
				1081	if (parent->nr_subparts_cpus) {
				1082	cpumask_or(new_cpus, parent->effective_cpus,
				1083	parent->subparts_cpus);
				1084	cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
				1085	cpumask_and(new_cpus, new_cpus, cpu_active_mask);
				1086	} else {
				1087	cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
				1088	}
				1089	}
				1090
				1091	/*
				1092	* Commands for update_parent_subparts_cpumask
				1093	*/
				1094	enum subparts_cmd {
				1095	partcmd_enable, /* Enable partition root */
				1096	partcmd_disable, /* Disable partition root */
				1097	partcmd_update, /* Update parent's subparts_cpus */
				1098	};
				1099
				1100	/**
				1101	* update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
				1102	* @cpuset: The cpuset that requests change in partition root state
				1103	* @cmd: Partition root state change command
				1104	* @newmask: Optional new cpumask for partcmd_update
				1105	* @tmp: Temporary addmask and delmask
				1106	* Return: 0, 1 or an error code
				1107	*
				1108	* For partcmd_enable, the cpuset is being transformed from a non-partition
				1109	* root to a partition root. The cpus_allowed mask of the given cpuset will
				1110	* be put into parent's subparts_cpus and taken away from parent's
				1111	* effective_cpus. The function will return 0 if all the CPUs listed in
				1112	* cpus_allowed can be granted or an error code will be returned.
				1113	*
				1114	* For partcmd_disable, the cpuset is being transofrmed from a partition
				1115	* root back to a non-partition root. any CPUs in cpus_allowed that are in
				1116	* parent's subparts_cpus will be taken away from that cpumask and put back
				1117	* into parent's effective_cpus. 0 should always be returned.
				1118	*
				1119	* For partcmd_update, if the optional newmask is specified, the cpu
				1120	* list is to be changed from cpus_allowed to newmask. Otherwise,
				1121	* cpus_allowed is assumed to remain the same. The cpuset should either
				1122	* be a partition root or an invalid partition root. The partition root
				1123	* state may change if newmask is NULL and none of the requested CPUs can
				1124	* be granted by the parent. The function will return 1 if changes to
				1125	* parent's subparts_cpus and effective_cpus happen or 0 otherwise.
				1126	* Error code should only be returned when newmask is non-NULL.
				1127	*
				1128	* The partcmd_enable and partcmd_disable commands are used by
				1129	* update_prstate(). The partcmd_update command is used by
				1130	* update_cpumasks_hier() with newmask NULL and update_cpumask() with
				1131	* newmask set.
				1132	*
				1133	* The checking is more strict when enabling partition root than the
				1134	* other two commands.
				1135	*
				1136	* Because of the implicit cpu exclusive nature of a partition root,
				1137	* cpumask changes that violates the cpu exclusivity rule will not be
				1138	* permitted when checked by validate_change(). The validate_change()
				1139	* function will also prevent any changes to the cpu list if it is not
				1140	* a superset of children's cpu lists.
				1141	*/
				1142	static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
				1143	struct cpumask *newmask,
				1144	struct tmpmasks *tmp)
				1145	{
				1146	struct cpuset *parent = parent_cs(cpuset);
				1147	int adding; /* Moving cpus from effective_cpus to subparts_cpus */
				1148	int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
				1149	bool part_error = false; /* Partition error? */
				1150
				1151	percpu_rwsem_assert_held(&cpuset_rwsem);
				1152
				1153	/*
				1154	* The parent must be a partition root.
				1155	* The new cpumask, if present, or the current cpus_allowed must
				1156	* not be empty.
				1157	*/
				1158	if (!is_partition_root(parent) \|\|
				1159	(newmask && cpumask_empty(newmask)) \|\|
				1160	(!newmask && cpumask_empty(cpuset->cpus_allowed)))
				1161	return -EINVAL;
				1162
				1163	/*
				1164	* Enabling/disabling partition root is not allowed if there are
				1165	* online children.
				1166	*/
				1167	if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
				1168	return -EBUSY;
				1169
				1170	/*
				1171	* Enabling partition root is not allowed if not all the CPUs
				1172	* can be granted from parent's effective_cpus or at least one
				1173	* CPU will be left after that.
				1174	*/
				1175	if ((cmd == partcmd_enable) &&
				1176	(!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) \|\|
				1177	cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
				1178	return -EINVAL;
				1179
				1180	/*
				1181	* A cpumask update cannot make parent's effective_cpus become empty.
				1182	*/
				1183	adding = deleting = false;
				1184	if (cmd == partcmd_enable) {
				1185	cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
				1186	adding = true;
				1187	} else if (cmd == partcmd_disable) {
				1188	deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
				1189	parent->subparts_cpus);
				1190	} else if (newmask) {
				1191	/*
				1192	* partcmd_update with newmask:
				1193	*
				1194	* delmask = cpus_allowed & ~newmask & parent->subparts_cpus
				1195	* addmask = newmask & parent->effective_cpus
				1196	* & ~parent->subparts_cpus
				1197	*/
				1198	cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
				1199	deleting = cpumask_and(tmp->delmask, tmp->delmask,
				1200	parent->subparts_cpus);
				1201
				1202	cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
				1203	adding = cpumask_andnot(tmp->addmask, tmp->addmask,
				1204	parent->subparts_cpus);
				1205	/*
				1206	* Return error if the new effective_cpus could become empty.
				1207	*/
				1208	if (adding &&
				1209	cpumask_equal(parent->effective_cpus, tmp->addmask)) {
				1210	if (!deleting)
				1211	return -EINVAL;
				1212	/*
				1213	* As some of the CPUs in subparts_cpus might have
				1214	* been offlined, we need to compute the real delmask
				1215	* to confirm that.
				1216	*/
				1217	if (!cpumask_and(tmp->addmask, tmp->delmask,
				1218	cpu_active_mask))
				1219	return -EINVAL;
				1220	cpumask_copy(tmp->addmask, parent->effective_cpus);
				1221	}
				1222	} else {
				1223	/*
				1224	* partcmd_update w/o newmask:
				1225	*
				1226	* addmask = cpus_allowed & parent->effectiveb_cpus
				1227	*
				1228	* Note that parent's subparts_cpus may have been
				1229	* pre-shrunk in case there is a change in the cpu list.
				1230	* So no deletion is needed.
				1231	*/
				1232	adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
				1233	parent->effective_cpus);
				1234	part_error = cpumask_equal(tmp->addmask,
				1235	parent->effective_cpus);
				1236	}
				1237
				1238	if (cmd == partcmd_update) {
				1239	int prev_prs = cpuset->partition_root_state;
				1240
				1241	/*
				1242	* Check for possible transition between PRS_ENABLED
				1243	* and PRS_ERROR.
				1244	*/
				1245	switch (cpuset->partition_root_state) {
				1246	case PRS_ENABLED:
				1247	if (part_error)
				1248	cpuset->partition_root_state = PRS_ERROR;
				1249	break;
				1250	case PRS_ERROR:
				1251	if (!part_error)
				1252	cpuset->partition_root_state = PRS_ENABLED;
				1253	break;
				1254	}
				1255	/*
				1256	* Set part_error if previously in invalid state.
				1257	*/
				1258	part_error = (prev_prs == PRS_ERROR);
				1259	}
				1260
				1261	if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
				1262	return 0; /* Nothing need to be done */
				1263
				1264	if (cpuset->partition_root_state == PRS_ERROR) {
				1265	/*
				1266	* Remove all its cpus from parent's subparts_cpus.
				1267	*/
				1268	adding = false;
				1269	deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
				1270	parent->subparts_cpus);
				1271	}
				1272
				1273	if (!adding && !deleting)
				1274	return 0;
				1275
				1276	/*
				1277	* Change the parent's subparts_cpus.
				1278	* Newly added CPUs will be removed from effective_cpus and
				1279	* newly deleted ones will be added back to effective_cpus.
				1280	*/
				1281	spin_lock_irq(&callback_lock);
				1282	if (adding) {
				1283	cpumask_or(parent->subparts_cpus,
				1284	parent->subparts_cpus, tmp->addmask);
				1285	cpumask_andnot(parent->effective_cpus,
				1286	parent->effective_cpus, tmp->addmask);
				1287	}
				1288	if (deleting) {
				1289	cpumask_andnot(parent->subparts_cpus,
				1290	parent->subparts_cpus, tmp->delmask);
				1291	/*
				1292	* Some of the CPUs in subparts_cpus might have been offlined.
				1293	*/
				1294	cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
				1295	cpumask_or(parent->effective_cpus,
				1296	parent->effective_cpus, tmp->delmask);
				1297	}
				1298
				1299	parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
				1300	spin_unlock_irq(&callback_lock);
				1301
				1302	return cmd == partcmd_update;
				1303	}
				1304
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1305	/*
				1306	* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1307	* @cs: the cpuset to consider
				1308	* @tmp: temp variables for calculating effective_cpus & partition setup
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1309	*
				1310	* When congifured cpumask is changed, the effective cpumasks of this cpuset
				1311	* and all its descendants need to be updated.
				1312	*
				1313	* On legacy hierachy, effective_cpus will be the same with cpu_allowed.
				1314	*
				1315	* Called with cpuset_mutex held
				1316	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1317	static void update_cpumasks_hier(struct cpuset cs, struct tmpmasks tmp)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1318	{
				1319	struct cpuset *cp;
				1320	struct cgroup_subsys_state *pos_css;
				1321	bool need_rebuild_sched_domains = false;
				1322
				1323	rcu_read_lock();
				1324	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
				1325	struct cpuset *parent = parent_cs(cp);
				1326
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1327	compute_effective_cpumask(tmp->new_cpus, cp, parent);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1328
				1329	/*
				1330	* If it becomes empty, inherit the effective mask of the
				1331	* parent, which is guaranteed to have some CPUs.
				1332	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1333	if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
				1334	cpumask_copy(tmp->new_cpus, parent->effective_cpus);
				1335	if (!cp->use_parent_ecpus) {
				1336	cp->use_parent_ecpus = true;
				1337	parent->child_ecpus_count++;
				1338	}
				1339	} else if (cp->use_parent_ecpus) {
				1340	cp->use_parent_ecpus = false;
				1341	WARN_ON_ONCE(!parent->child_ecpus_count);
				1342	parent->child_ecpus_count--;
				1343	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1344
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1345	/*
				1346	* Skip the whole subtree if the cpumask remains the same
				1347	* and has no partition root state.
				1348	*/
				1349	if (!cp->partition_root_state &&
				1350	cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1351	pos_css = css_rightmost_descendant(pos_css);
				1352	continue;
				1353	}
				1354
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1355	/*
				1356	* update_parent_subparts_cpumask() should have been called
				1357	* for cs already in update_cpumask(). We should also call
				1358	* update_tasks_cpumask() again for tasks in the parent
				1359	* cpuset if the parent's subparts_cpus changes.
				1360	*/
				1361	if ((cp != cs) && cp->partition_root_state) {
				1362	switch (parent->partition_root_state) {
				1363	case PRS_DISABLED:
				1364	/*
				1365	* If parent is not a partition root or an
				1366	* invalid partition root, clear the state
				1367	* state and the CS_CPU_EXCLUSIVE flag.
				1368	*/
				1369	WARN_ON_ONCE(cp->partition_root_state
				1370	!= PRS_ERROR);
				1371	cp->partition_root_state = 0;
				1372
				1373	/*
				1374	* clear_bit() is an atomic operation and
				1375	* readers aren't interested in the state
				1376	* of CS_CPU_EXCLUSIVE anyway. So we can
				1377	* just update the flag without holding
				1378	* the callback_lock.
				1379	*/
				1380	clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
				1381	break;
				1382
				1383	case PRS_ENABLED:
				1384	if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
				1385	update_tasks_cpumask(parent);
				1386	break;
				1387
				1388	case PRS_ERROR:
				1389	/*
				1390	* When parent is invalid, it has to be too.
				1391	*/
				1392	cp->partition_root_state = PRS_ERROR;
				1393	if (cp->nr_subparts_cpus) {
				1394	cp->nr_subparts_cpus = 0;
				1395	cpumask_clear(cp->subparts_cpus);
				1396	}
				1397	break;
				1398	}
				1399	}
				1400
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1401	if (!css_tryget_online(&cp->css))
				1402	continue;
				1403	rcu_read_unlock();
				1404
				1405	spin_lock_irq(&callback_lock);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1406
				1407	cpumask_copy(cp->effective_cpus, tmp->new_cpus);
				1408	if (cp->nr_subparts_cpus &&
				1409	(cp->partition_root_state != PRS_ENABLED)) {
				1410	cp->nr_subparts_cpus = 0;
				1411	cpumask_clear(cp->subparts_cpus);
				1412	} else if (cp->nr_subparts_cpus) {
				1413	/*
				1414	* Make sure that effective_cpus & subparts_cpus
				1415	* are mutually exclusive.
				1416	*
				1417	* In the unlikely event that effective_cpus
				1418	* becomes empty. we clear cp->nr_subparts_cpus and
				1419	* let its child partition roots to compete for
				1420	* CPUs again.
				1421	*/
				1422	cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
				1423	cp->subparts_cpus);
				1424	if (cpumask_empty(cp->effective_cpus)) {
				1425	cpumask_copy(cp->effective_cpus, tmp->new_cpus);
				1426	cpumask_clear(cp->subparts_cpus);
				1427	cp->nr_subparts_cpus = 0;
				1428	} else if (!cpumask_subset(cp->subparts_cpus,
				1429	tmp->new_cpus)) {
				1430	cpumask_andnot(cp->subparts_cpus,
				1431	cp->subparts_cpus, tmp->new_cpus);
				1432	cp->nr_subparts_cpus
				1433	= cpumask_weight(cp->subparts_cpus);
				1434	}
				1435	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1436	spin_unlock_irq(&callback_lock);
				1437
				1438	WARN_ON(!is_in_v2_mode() &&
				1439	!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
				1440
				1441	update_tasks_cpumask(cp);
				1442
				1443	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1444	* On legacy hierarchy, if the effective cpumask of any non-
				1445	* empty cpuset is changed, we need to rebuild sched domains.
				1446	* On default hierarchy, the cpuset needs to be a partition
				1447	* root as well.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1448	*/
				1449	if (!cpumask_empty(cp->cpus_allowed) &&
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1450	is_sched_load_balance(cp) &&
				1451	(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) \|\|
				1452	is_partition_root(cp)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1453	need_rebuild_sched_domains = true;
				1454
				1455	rcu_read_lock();
				1456	css_put(&cp->css);
				1457	}
				1458	rcu_read_unlock();
				1459
				1460	if (need_rebuild_sched_domains)
				1461	rebuild_sched_domains_locked();
				1462	}
				1463
				1464	/**
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1465	* update_sibling_cpumasks - Update siblings cpumasks
				1466	* @parent: Parent cpuset
				1467	* @cs: Current cpuset
				1468	* @tmp: Temp variables
				1469	*/
				1470	static void update_sibling_cpumasks(struct cpuset parent, struct cpuset cs,
				1471	struct tmpmasks *tmp)
				1472	{
				1473	struct cpuset *sibling;
				1474	struct cgroup_subsys_state *pos_css;
				1475
				1476	/*
				1477	* Check all its siblings and call update_cpumasks_hier()
				1478	* if their use_parent_ecpus flag is set in order for them
				1479	* to use the right effective_cpus value.
				1480	*/
				1481	rcu_read_lock();
				1482	cpuset_for_each_child(sibling, pos_css, parent) {
				1483	if (sibling == cs)
				1484	continue;
				1485	if (!sibling->use_parent_ecpus)
				1486	continue;
				1487
				1488	update_cpumasks_hier(sibling, tmp);
				1489	}
				1490	rcu_read_unlock();
				1491	}
				1492
				1493	/**
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1494	* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
				1495	* @cs: the cpuset to consider
				1496	* @trialcs: trial cpuset
				1497	* @buf: buffer of cpu numbers written to this cpuset
				1498	*/
				1499	static int update_cpumask(struct cpuset cs, struct cpuset trialcs,
				1500	const char *buf)
				1501	{
				1502	int retval;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1503	struct tmpmasks tmp;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1504
				1505	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
				1506	if (cs == &top_cpuset)
				1507	return -EACCES;
				1508
				1509	/*
				1510	* An empty cpus_allowed is ok only if the cpuset has no tasks.
				1511	* Since cpulist_parse() fails on an empty mask, we special case
				1512	* that parsing. The validate_change() call ensures that cpusets
				1513	* with tasks have cpus.
				1514	*/
				1515	if (!*buf) {
				1516	cpumask_clear(trialcs->cpus_allowed);
				1517	} else {
				1518	retval = cpulist_parse(buf, trialcs->cpus_allowed);
				1519	if (retval < 0)
				1520	return retval;
				1521
				1522	if (!cpumask_subset(trialcs->cpus_allowed,
				1523	top_cpuset.cpus_allowed))
				1524	return -EINVAL;
				1525	}
				1526
				1527	/* Nothing to do if the cpus didn't change */
				1528	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
				1529	return 0;
				1530
				1531	retval = validate_change(cs, trialcs);
				1532	if (retval < 0)
				1533	return retval;
				1534
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1535	#ifdef CONFIG_CPUMASK_OFFSTACK
				1536	/*
				1537	* Use the cpumasks in trialcs for tmpmasks when they are pointers
				1538	* to allocated cpumasks.
				1539	*/
				1540	tmp.addmask = trialcs->subparts_cpus;
				1541	tmp.delmask = trialcs->effective_cpus;
				1542	tmp.new_cpus = trialcs->cpus_allowed;
				1543	#endif
				1544
				1545	if (cs->partition_root_state) {
				1546	/* Cpumask of a partition root cannot be empty */
				1547	if (cpumask_empty(trialcs->cpus_allowed))
				1548	return -EINVAL;
				1549	if (update_parent_subparts_cpumask(cs, partcmd_update,
				1550	trialcs->cpus_allowed, &tmp) < 0)
				1551	return -EINVAL;
				1552	}
				1553
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1554	spin_lock_irq(&callback_lock);
				1555	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1556
				1557	/*
				1558	* Make sure that subparts_cpus is a subset of cpus_allowed.
				1559	*/
				1560	if (cs->nr_subparts_cpus) {
				1561	cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
				1562	cs->cpus_allowed);
				1563	cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
				1564	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1565	spin_unlock_irq(&callback_lock);
				1566
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1567	update_cpumasks_hier(cs, &tmp);
				1568
				1569	if (cs->partition_root_state) {
				1570	struct cpuset *parent = parent_cs(cs);
				1571
				1572	/*
				1573	* For partition root, update the cpumasks of sibling
				1574	* cpusets if they use parent's effective_cpus.
				1575	*/
				1576	if (parent->child_ecpus_count)
				1577	update_sibling_cpumasks(parent, cs, &tmp);
				1578	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1579	return 0;
				1580	}
				1581
				1582	/*
				1583	* Migrate memory region from one set of nodes to another. This is
				1584	* performed asynchronously as it can be called from process migration path
				1585	* holding locks involved in process management. All mm migrations are
				1586	* performed in the queued order and can be waited for by flushing
				1587	* cpuset_migrate_mm_wq.
				1588	*/
				1589
				1590	struct cpuset_migrate_mm_work {
				1591	struct work_struct work;
				1592	struct mm_struct *mm;
				1593	nodemask_t from;
				1594	nodemask_t to;
				1595	};
				1596
				1597	static void cpuset_migrate_mm_workfn(struct work_struct *work)
				1598	{
				1599	struct cpuset_migrate_mm_work *mwork =
				1600	container_of(work, struct cpuset_migrate_mm_work, work);
				1601
				1602	/* on a wq worker, no need to worry about %current's mems_allowed */
				1603	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
				1604	mmput(mwork->mm);
				1605	kfree(mwork);
				1606	}
				1607
				1608	static void cpuset_migrate_mm(struct mm_struct mm, const nodemask_t from,
				1609	const nodemask_t *to)
				1610	{
				1611	struct cpuset_migrate_mm_work *mwork;
				1612
				1613	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
				1614	if (mwork) {
				1615	mwork->mm = mm;
				1616	mwork->from = *from;
				1617	mwork->to = *to;
				1618	INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
				1619	queue_work(cpuset_migrate_mm_wq, &mwork->work);
				1620	} else {
				1621	mmput(mm);
				1622	}
				1623	}
				1624
				1625	static void cpuset_post_attach(void)
				1626	{
				1627	flush_workqueue(cpuset_migrate_mm_wq);
				1628	}
				1629
				1630	/*
				1631	* cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
				1632	* @tsk: the task to change
				1633	* @newmems: new nodes that the task will be set
				1634	*
				1635	* We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
				1636	* and rebind an eventual tasks' mempolicy. If the task is allocating in
				1637	* parallel, it might temporarily see an empty intersection, which results in
				1638	* a seqlock check and retry before OOM or allocation failure.
				1639	*/
				1640	static void cpuset_change_task_nodemask(struct task_struct *tsk,
				1641	nodemask_t *newmems)
				1642	{
				1643	task_lock(tsk);
				1644
				1645	local_irq_disable();
				1646	write_seqcount_begin(&tsk->mems_allowed_seq);
				1647
				1648	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
				1649	mpol_rebind_task(tsk, newmems);
				1650	tsk->mems_allowed = *newmems;
				1651
				1652	write_seqcount_end(&tsk->mems_allowed_seq);
				1653	local_irq_enable();
				1654
				1655	task_unlock(tsk);
				1656	}
				1657
				1658	static void *cpuset_being_rebound;
				1659
				1660	/**
				1661	* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
				1662	* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
				1663	*
				1664	* Iterate through each task of @cs updating its mems_allowed to the
				1665	* effective cpuset's. As this function is called with cpuset_mutex held,
				1666	* cpuset membership stays stable.
				1667	*/
				1668	static void update_tasks_nodemask(struct cpuset *cs)
				1669	{
				1670	static nodemask_t newmems; /* protected by cpuset_mutex */
				1671	struct css_task_iter it;
				1672	struct task_struct *task;
				1673
				1674	cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
				1675
				1676	guarantee_online_mems(cs, &newmems);
				1677
				1678	/*
				1679	* The mpol_rebind_mm() call takes mmap_sem, which we couldn't
				1680	* take while holding tasklist_lock. Forks can happen - the
				1681	* mpol_dup() cpuset_being_rebound check will catch such forks,
				1682	* and rebind their vma mempolicies too. Because we still hold
				1683	* the global cpuset_mutex, we know that no other rebind effort
				1684	* will be contending for the global variable cpuset_being_rebound.
				1685	* It's ok if we rebind the same mm twice; mpol_rebind_mm()
				1686	* is idempotent. Also migrate pages in each mm to new nodes.
				1687	*/
				1688	css_task_iter_start(&cs->css, 0, &it);
				1689	while ((task = css_task_iter_next(&it))) {
				1690	struct mm_struct *mm;
				1691	bool migrate;
				1692
				1693	cpuset_change_task_nodemask(task, &newmems);
				1694
				1695	mm = get_task_mm(task);
				1696	if (!mm)
				1697	continue;
				1698
				1699	migrate = is_memory_migrate(cs);
				1700
				1701	mpol_rebind_mm(mm, &cs->mems_allowed);
				1702	if (migrate)
				1703	cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
				1704	else
				1705	mmput(mm);
				1706	}
				1707	css_task_iter_end(&it);
				1708
				1709	/*
				1710	* All the tasks' nodemasks have been updated, update
				1711	* cs->old_mems_allowed.
				1712	*/
				1713	cs->old_mems_allowed = newmems;
				1714
				1715	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
				1716	cpuset_being_rebound = NULL;
				1717	}
				1718
				1719	/*
				1720	* update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
				1721	* @cs: the cpuset to consider
				1722	* @new_mems: a temp variable for calculating new effective_mems
				1723	*
				1724	* When configured nodemask is changed, the effective nodemasks of this cpuset
				1725	* and all its descendants need to be updated.
				1726	*
				1727	* On legacy hiearchy, effective_mems will be the same with mems_allowed.
				1728	*
				1729	* Called with cpuset_mutex held
				1730	*/
				1731	static void update_nodemasks_hier(struct cpuset cs, nodemask_t new_mems)
				1732	{
				1733	struct cpuset *cp;
				1734	struct cgroup_subsys_state *pos_css;
				1735
				1736	rcu_read_lock();
				1737	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
				1738	struct cpuset *parent = parent_cs(cp);
				1739
				1740	nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
				1741
				1742	/*
				1743	* If it becomes empty, inherit the effective mask of the
				1744	* parent, which is guaranteed to have some MEMs.
				1745	*/
				1746	if (is_in_v2_mode() && nodes_empty(*new_mems))
				1747	*new_mems = parent->effective_mems;
				1748
				1749	/* Skip the whole subtree if the nodemask remains the same. */
				1750	if (nodes_equal(*new_mems, cp->effective_mems)) {
				1751	pos_css = css_rightmost_descendant(pos_css);
				1752	continue;
				1753	}
				1754
				1755	if (!css_tryget_online(&cp->css))
				1756	continue;
				1757	rcu_read_unlock();
				1758
				1759	spin_lock_irq(&callback_lock);
				1760	cp->effective_mems = *new_mems;
				1761	spin_unlock_irq(&callback_lock);
				1762
				1763	WARN_ON(!is_in_v2_mode() &&
				1764	!nodes_equal(cp->mems_allowed, cp->effective_mems));
				1765
				1766	update_tasks_nodemask(cp);
				1767
				1768	rcu_read_lock();
				1769	css_put(&cp->css);
				1770	}
				1771	rcu_read_unlock();
				1772	}
				1773
				1774	/*
				1775	* Handle user request to change the 'mems' memory placement
				1776	* of a cpuset. Needs to validate the request, update the
				1777	* cpusets mems_allowed, and for each task in the cpuset,
				1778	* update mems_allowed and rebind task's mempolicy and any vma
				1779	* mempolicies and if the cpuset is marked 'memory_migrate',
				1780	* migrate the tasks pages to the new memory.
				1781	*
				1782	* Call with cpuset_mutex held. May take callback_lock during call.
				1783	* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
				1784	* lock each such tasks mm->mmap_sem, scan its vma's and rebind
				1785	* their mempolicies to the cpusets new mems_allowed.
				1786	*/
				1787	static int update_nodemask(struct cpuset cs, struct cpuset trialcs,
				1788	const char *buf)
				1789	{
				1790	int retval;
				1791
				1792	/*
				1793	* top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
				1794	* it's read-only
				1795	*/
				1796	if (cs == &top_cpuset) {
				1797	retval = -EACCES;
				1798	goto done;
				1799	}
				1800
				1801	/*
				1802	* An empty mems_allowed is ok iff there are no tasks in the cpuset.
				1803	* Since nodelist_parse() fails on an empty mask, we special case
				1804	* that parsing. The validate_change() call ensures that cpusets
				1805	* with tasks have memory.
				1806	*/
				1807	if (!*buf) {
				1808	nodes_clear(trialcs->mems_allowed);
				1809	} else {
				1810	retval = nodelist_parse(buf, trialcs->mems_allowed);
				1811	if (retval < 0)
				1812	goto done;
				1813
				1814	if (!nodes_subset(trialcs->mems_allowed,
				1815	top_cpuset.mems_allowed)) {
				1816	retval = -EINVAL;
				1817	goto done;
				1818	}
				1819	}
				1820
				1821	if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
				1822	retval = 0; /* Too easy - nothing to do */
				1823	goto done;
				1824	}
				1825	retval = validate_change(cs, trialcs);
				1826	if (retval < 0)
				1827	goto done;
				1828
				1829	spin_lock_irq(&callback_lock);
				1830	cs->mems_allowed = trialcs->mems_allowed;
				1831	spin_unlock_irq(&callback_lock);
				1832
				1833	/* use trialcs->mems_allowed as a temp variable */
				1834	update_nodemasks_hier(cs, &trialcs->mems_allowed);
				1835	done:
				1836	return retval;
				1837	}
				1838
				1839	bool current_cpuset_is_being_rebound(void)
				1840	{
				1841	bool ret;
				1842
				1843	rcu_read_lock();
				1844	ret = task_cs(current) == cpuset_being_rebound;
				1845	rcu_read_unlock();
				1846
				1847	return ret;
				1848	}
				1849
				1850	static int update_relax_domain_level(struct cpuset *cs, s64 val)
				1851	{
				1852	#ifdef CONFIG_SMP
				1853	if (val < -1 \|\| val >= sched_domain_level_max)
				1854	return -EINVAL;
				1855	#endif
				1856
				1857	if (val != cs->relax_domain_level) {
				1858	cs->relax_domain_level = val;
				1859	if (!cpumask_empty(cs->cpus_allowed) &&
				1860	is_sched_load_balance(cs))
				1861	rebuild_sched_domains_locked();
				1862	}
				1863
				1864	return 0;
				1865	}
				1866
				1867	/**
				1868	* update_tasks_flags - update the spread flags of tasks in the cpuset.
				1869	* @cs: the cpuset in which each task's spread flags needs to be changed
				1870	*
				1871	* Iterate through each task of @cs updating its spread flags. As this
				1872	* function is called with cpuset_mutex held, cpuset membership stays
				1873	* stable.
				1874	*/
				1875	static void update_tasks_flags(struct cpuset *cs)
				1876	{
				1877	struct css_task_iter it;
				1878	struct task_struct *task;
				1879
				1880	css_task_iter_start(&cs->css, 0, &it);
				1881	while ((task = css_task_iter_next(&it)))
				1882	cpuset_update_task_spread_flag(cs, task);
				1883	css_task_iter_end(&it);
				1884	}
				1885
				1886	/*
				1887	* update_flag - read a 0 or a 1 in a file and update associated flag
				1888	* bit: the bit to update (see cpuset_flagbits_t)
				1889	* cs: the cpuset to update
				1890	* turning_on: whether the flag is being set or cleared
				1891	*
				1892	* Call with cpuset_mutex held.
				1893	*/
				1894
				1895	static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
				1896	int turning_on)
				1897	{
				1898	struct cpuset *trialcs;
				1899	int balance_flag_changed;
				1900	int spread_flag_changed;
				1901	int err;
				1902
				1903	trialcs = alloc_trial_cpuset(cs);
				1904	if (!trialcs)
				1905	return -ENOMEM;
				1906
				1907	if (turning_on)
				1908	set_bit(bit, &trialcs->flags);
				1909	else
				1910	clear_bit(bit, &trialcs->flags);
				1911
				1912	err = validate_change(cs, trialcs);
				1913	if (err < 0)
				1914	goto out;
				1915
				1916	balance_flag_changed = (is_sched_load_balance(cs) !=
				1917	is_sched_load_balance(trialcs));
				1918
				1919	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
				1920	\|\| (is_spread_page(cs) != is_spread_page(trialcs)));
				1921
				1922	spin_lock_irq(&callback_lock);
				1923	cs->flags = trialcs->flags;
				1924	spin_unlock_irq(&callback_lock);
				1925
				1926	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
				1927	rebuild_sched_domains_locked();
				1928
				1929	if (spread_flag_changed)
				1930	update_tasks_flags(cs);
				1931	out:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1932	free_cpuset(trialcs);
				1933	return err;
				1934	}
				1935
				1936	/*
				1937	* update_prstate - update partititon_root_state
				1938	* cs: the cpuset to update
				1939	* val: 0 - disabled, 1 - enabled
				1940	*
				1941	* Call with cpuset_mutex held.
				1942	*/
				1943	static int update_prstate(struct cpuset *cs, int val)
				1944	{
				1945	int err;
				1946	struct cpuset *parent = parent_cs(cs);
				1947	struct tmpmasks tmp;
				1948
				1949	if ((val != 0) && (val != 1))
				1950	return -EINVAL;
				1951	if (val == cs->partition_root_state)
				1952	return 0;
				1953
				1954	/*
				1955	* Cannot force a partial or invalid partition root to a full
				1956	* partition root.
				1957	*/
				1958	if (val && cs->partition_root_state)
				1959	return -EINVAL;
				1960
				1961	if (alloc_cpumasks(NULL, &tmp))
				1962	return -ENOMEM;
				1963
				1964	err = -EINVAL;
				1965	if (!cs->partition_root_state) {
				1966	/*
				1967	* Turning on partition root requires setting the
				1968	* CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
				1969	* cannot be NULL.
				1970	*/
				1971	if (cpumask_empty(cs->cpus_allowed))
				1972	goto out;
				1973
				1974	err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
				1975	if (err)
				1976	goto out;
				1977
				1978	err = update_parent_subparts_cpumask(cs, partcmd_enable,
				1979	NULL, &tmp);
				1980	if (err) {
				1981	update_flag(CS_CPU_EXCLUSIVE, cs, 0);
				1982	goto out;
				1983	}
				1984	cs->partition_root_state = PRS_ENABLED;
				1985	} else {
				1986	/*
				1987	* Turning off partition root will clear the
				1988	* CS_CPU_EXCLUSIVE bit.
				1989	*/
				1990	if (cs->partition_root_state == PRS_ERROR) {
				1991	cs->partition_root_state = 0;
				1992	update_flag(CS_CPU_EXCLUSIVE, cs, 0);
				1993	err = 0;
				1994	goto out;
				1995	}
				1996
				1997	err = update_parent_subparts_cpumask(cs, partcmd_disable,
				1998	NULL, &tmp);
				1999	if (err)
				2000	goto out;
				2001
				2002	cs->partition_root_state = 0;
				2003
				2004	/* Turning off CS_CPU_EXCLUSIVE will not return error */
				2005	update_flag(CS_CPU_EXCLUSIVE, cs, 0);
				2006	}
				2007
				2008	/*
				2009	* Update cpumask of parent's tasks except when it is the top
				2010	* cpuset as some system daemons cannot be mapped to other CPUs.
				2011	*/
				2012	if (parent != &top_cpuset)
				2013	update_tasks_cpumask(parent);
				2014
				2015	if (parent->child_ecpus_count)
				2016	update_sibling_cpumasks(parent, cs, &tmp);
				2017
				2018	rebuild_sched_domains_locked();
				2019	out:
				2020	free_cpumasks(NULL, &tmp);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2021	return err;
				2022	}
				2023
				2024	/*
				2025	* Frequency meter - How fast is some event occurring?
				2026	*
				2027	* These routines manage a digitally filtered, constant time based,
				2028	* event frequency meter. There are four routines:
				2029	* fmeter_init() - initialize a frequency meter.
				2030	* fmeter_markevent() - called each time the event happens.
				2031	* fmeter_getrate() - returns the recent rate of such events.
				2032	* fmeter_update() - internal routine used to update fmeter.
				2033	*
				2034	* A common data structure is passed to each of these routines,
				2035	* which is used to keep track of the state required to manage the
				2036	* frequency meter and its digital filter.
				2037	*
				2038	* The filter works on the number of events marked per unit time.
				2039	* The filter is single-pole low-pass recursive (IIR). The time unit
				2040	* is 1 second. Arithmetic is done using 32-bit integers scaled to
				2041	* simulate 3 decimal digits of precision (multiplied by 1000).
				2042	*
				2043	* With an FM_COEF of 933, and a time base of 1 second, the filter
				2044	* has a half-life of 10 seconds, meaning that if the events quit
				2045	* happening, then the rate returned from the fmeter_getrate()
				2046	* will be cut in half each 10 seconds, until it converges to zero.
				2047	*
				2048	* It is not worth doing a real infinitely recursive filter. If more
				2049	* than FM_MAXTICKS ticks have elapsed since the last filter event,
				2050	* just compute FM_MAXTICKS ticks worth, by which point the level
				2051	* will be stable.
				2052	*
				2053	* Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
				2054	* arithmetic overflow in the fmeter_update() routine.
				2055	*
				2056	* Given the simple 32 bit integer arithmetic used, this meter works
				2057	* best for reporting rates between one per millisecond (msec) and
				2058	* one per 32 (approx) seconds. At constant rates faster than one
				2059	* per msec it maxes out at values just under 1,000,000. At constant
				2060	* rates between one per msec, and one per second it will stabilize
				2061	* to a value N*1000, where N is the rate of events per second.
				2062	* At constant rates between one per second and one per 32 seconds,
				2063	* it will be choppy, moving up on the seconds that have an event,
				2064	* and then decaying until the next event. At rates slower than
				2065	* about one in 32 seconds, it decays all the way back to zero between
				2066	* each event.
				2067	*/
				2068
				2069	#define FM_COEF 933 /* coefficient for half-life of 10 secs */
				2070	#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
				2071	#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
				2072	#define FM_SCALE 1000 /* faux fixed point scale */
				2073
				2074	/* Initialize a frequency meter */
				2075	static void fmeter_init(struct fmeter *fmp)
				2076	{
				2077	fmp->cnt = 0;
				2078	fmp->val = 0;
				2079	fmp->time = 0;
				2080	spin_lock_init(&fmp->lock);
				2081	}
				2082
				2083	/* Internal meter update - process cnt events and update value */
				2084	static void fmeter_update(struct fmeter *fmp)
				2085	{
				2086	time64_t now;
				2087	u32 ticks;
				2088
				2089	now = ktime_get_seconds();
				2090	ticks = now - fmp->time;
				2091
				2092	if (ticks == 0)
				2093	return;
				2094
				2095	ticks = min(FM_MAXTICKS, ticks);
				2096	while (ticks-- > 0)
				2097	fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
				2098	fmp->time = now;
				2099
				2100	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
				2101	fmp->cnt = 0;
				2102	}
				2103
				2104	/* Process any previous ticks, then bump cnt by one (times scale). */
				2105	static void fmeter_markevent(struct fmeter *fmp)
				2106	{
				2107	spin_lock(&fmp->lock);
				2108	fmeter_update(fmp);
				2109	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
				2110	spin_unlock(&fmp->lock);
				2111	}
				2112
				2113	/* Process any previous ticks, then return current value. */
				2114	static int fmeter_getrate(struct fmeter *fmp)
				2115	{
				2116	int val;
				2117
				2118	spin_lock(&fmp->lock);
				2119	fmeter_update(fmp);
				2120	val = fmp->val;
				2121	spin_unlock(&fmp->lock);
				2122	return val;
				2123	}
				2124
				2125	static struct cpuset *cpuset_attach_old_cs;
				2126
				2127	/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
				2128	static int cpuset_can_attach(struct cgroup_taskset *tset)
				2129	{
				2130	struct cgroup_subsys_state *css;
				2131	struct cpuset *cs;
				2132	struct task_struct *task;
				2133	int ret;
				2134
				2135	/* used later by cpuset_attach() */
				2136	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
				2137	cs = css_cs(css);
				2138
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2139	percpu_down_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2140
				2141	/* allow moving tasks into an empty cpuset if on default hierarchy */
				2142	ret = -ENOSPC;
				2143	if (!is_in_v2_mode() &&
				2144	(cpumask_empty(cs->cpus_allowed) \|\| nodes_empty(cs->mems_allowed)))
				2145	goto out_unlock;
				2146
				2147	cgroup_taskset_for_each(task, css, tset) {
				2148	ret = task_can_attach(task, cs->cpus_allowed);
				2149	if (ret)
				2150	goto out_unlock;
				2151	ret = security_task_setscheduler(task);
				2152	if (ret)
				2153	goto out_unlock;
				2154	}
				2155
				2156	/*
				2157	* Mark attach is in progress. This makes validate_change() fail
				2158	* changes which zero cpus/mems_allowed.
				2159	*/
				2160	cs->attach_in_progress++;
				2161	ret = 0;
				2162	out_unlock:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2163	percpu_up_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2164	return ret;
				2165	}
				2166
				2167	static void cpuset_cancel_attach(struct cgroup_taskset *tset)
				2168	{
				2169	struct cgroup_subsys_state *css;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2170
				2171	cgroup_taskset_first(tset, &css);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2172
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2173	percpu_down_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2174	css_cs(css)->attach_in_progress--;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2175	percpu_up_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2176	}
				2177
				2178	/*
				2179	* Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
				2180	* but we can't allocate it dynamically there. Define it global and
				2181	* allocate from cpuset_init().
				2182	*/
				2183	static cpumask_var_t cpus_attach;
				2184
				2185	static void cpuset_attach(struct cgroup_taskset *tset)
				2186	{
				2187	/* static buf protected by cpuset_mutex */
				2188	static nodemask_t cpuset_attach_nodemask_to;
				2189	struct task_struct *task;
				2190	struct task_struct *leader;
				2191	struct cgroup_subsys_state *css;
				2192	struct cpuset *cs;
				2193	struct cpuset *oldcs = cpuset_attach_old_cs;
				2194
				2195	cgroup_taskset_first(tset, &css);
				2196	cs = css_cs(css);
				2197
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2198	percpu_down_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2199
				2200	/* prepare for attach */
				2201	if (cs == &top_cpuset)
				2202	cpumask_copy(cpus_attach, cpu_possible_mask);
				2203	else
				2204	guarantee_online_cpus(cs, cpus_attach);
				2205
				2206	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
				2207
				2208	cgroup_taskset_for_each(task, css, tset) {
				2209	/*
				2210	* can_attach beforehand should guarantee that this doesn't
				2211	* fail. TODO: have a better way to handle failure here
				2212	*/
				2213	WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
				2214
				2215	cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
				2216	cpuset_update_task_spread_flag(cs, task);
				2217	}
				2218
				2219	/*
				2220	* Change mm for all threadgroup leaders. This is expensive and may
				2221	* sleep and should be moved outside migration path proper.
				2222	*/
				2223	cpuset_attach_nodemask_to = cs->effective_mems;
				2224	cgroup_taskset_for_each_leader(leader, css, tset) {
				2225	struct mm_struct *mm = get_task_mm(leader);
				2226
				2227	if (mm) {
				2228	mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
				2229
				2230	/*
				2231	* old_mems_allowed is the same with mems_allowed
				2232	* here, except if this task is being moved
				2233	* automatically due to hotplug. In that case
				2234	* @mems_allowed has been updated and is empty, so
				2235	* @old_mems_allowed is the right nodesets that we
				2236	* migrate mm from.
				2237	*/
				2238	if (is_memory_migrate(cs))
				2239	cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
				2240	&cpuset_attach_nodemask_to);
				2241	else
				2242	mmput(mm);
				2243	}
				2244	}
				2245
				2246	cs->old_mems_allowed = cpuset_attach_nodemask_to;
				2247
				2248	cs->attach_in_progress--;
				2249	if (!cs->attach_in_progress)
				2250	wake_up(&cpuset_attach_wq);
				2251
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2252	percpu_up_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2253	}
				2254
				2255	/* The various types of files and directories in a cpuset file system */
				2256
				2257	typedef enum {
				2258	FILE_MEMORY_MIGRATE,
				2259	FILE_CPULIST,
				2260	FILE_MEMLIST,
				2261	FILE_EFFECTIVE_CPULIST,
				2262	FILE_EFFECTIVE_MEMLIST,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2263	FILE_SUBPARTS_CPULIST,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2264	FILE_CPU_EXCLUSIVE,
				2265	FILE_MEM_EXCLUSIVE,
				2266	FILE_MEM_HARDWALL,
				2267	FILE_SCHED_LOAD_BALANCE,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2268	FILE_PARTITION_ROOT,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2269	FILE_SCHED_RELAX_DOMAIN_LEVEL,
				2270	FILE_MEMORY_PRESSURE_ENABLED,
				2271	FILE_MEMORY_PRESSURE,
				2272	FILE_SPREAD_PAGE,
				2273	FILE_SPREAD_SLAB,
				2274	} cpuset_filetype_t;
				2275
				2276	static int cpuset_write_u64(struct cgroup_subsys_state css, struct cftype cft,
				2277	u64 val)
				2278	{
				2279	struct cpuset *cs = css_cs(css);
				2280	cpuset_filetype_t type = cft->private;
				2281	int retval = 0;
				2282
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2283	get_online_cpus();
				2284	percpu_down_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2285	if (!is_cpuset_online(cs)) {
				2286	retval = -ENODEV;
				2287	goto out_unlock;
				2288	}
				2289
				2290	switch (type) {
				2291	case FILE_CPU_EXCLUSIVE:
				2292	retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
				2293	break;
				2294	case FILE_MEM_EXCLUSIVE:
				2295	retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
				2296	break;
				2297	case FILE_MEM_HARDWALL:
				2298	retval = update_flag(CS_MEM_HARDWALL, cs, val);
				2299	break;
				2300	case FILE_SCHED_LOAD_BALANCE:
				2301	retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
				2302	break;
				2303	case FILE_MEMORY_MIGRATE:
				2304	retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
				2305	break;
				2306	case FILE_MEMORY_PRESSURE_ENABLED:
				2307	cpuset_memory_pressure_enabled = !!val;
				2308	break;
				2309	case FILE_SPREAD_PAGE:
				2310	retval = update_flag(CS_SPREAD_PAGE, cs, val);
				2311	break;
				2312	case FILE_SPREAD_SLAB:
				2313	retval = update_flag(CS_SPREAD_SLAB, cs, val);
				2314	break;
				2315	default:
				2316	retval = -EINVAL;
				2317	break;
				2318	}
				2319	out_unlock:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2320	percpu_up_write(&cpuset_rwsem);
				2321	put_online_cpus();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2322	return retval;
				2323	}
				2324
				2325	static int cpuset_write_s64(struct cgroup_subsys_state css, struct cftype cft,
				2326	s64 val)
				2327	{
				2328	struct cpuset *cs = css_cs(css);
				2329	cpuset_filetype_t type = cft->private;
				2330	int retval = -ENODEV;
				2331
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2332	get_online_cpus();
				2333	percpu_down_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2334	if (!is_cpuset_online(cs))
				2335	goto out_unlock;
				2336
				2337	switch (type) {
				2338	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
				2339	retval = update_relax_domain_level(cs, val);
				2340	break;
				2341	default:
				2342	retval = -EINVAL;
				2343	break;
				2344	}
				2345	out_unlock:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2346	percpu_up_write(&cpuset_rwsem);
				2347	put_online_cpus();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2348	return retval;
				2349	}
				2350
				2351	/*
				2352	* Common handling for a write to a "cpus" or "mems" file.
				2353	*/
				2354	static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
				2355	char *buf, size_t nbytes, loff_t off)
				2356	{
				2357	struct cpuset *cs = css_cs(of_css(of));
				2358	struct cpuset *trialcs;
				2359	int retval = -ENODEV;
				2360
				2361	buf = strstrip(buf);
				2362
				2363	/*
				2364	* CPU or memory hotunplug may leave @cs w/o any execution
				2365	* resources, in which case the hotplug code asynchronously updates
				2366	* configuration and transfers all tasks to the nearest ancestor
				2367	* which can execute.
				2368	*
				2369	* As writes to "cpus" or "mems" may restore @cs's execution
				2370	* resources, wait for the previously scheduled operations before
				2371	* proceeding, so that we don't end up keep removing tasks added
				2372	* after execution capability is restored.
				2373	*
				2374	* cpuset_hotplug_work calls back into cgroup core via
				2375	* cgroup_transfer_tasks() and waiting for it from a cgroupfs
				2376	* operation like this one can lead to a deadlock through kernfs
				2377	* active_ref protection. Let's break the protection. Losing the
				2378	* protection is okay as we check whether @cs is online after
				2379	* grabbing cpuset_mutex anyway. This only happens on the legacy
				2380	* hierarchies.
				2381	*/
				2382	css_get(&cs->css);
				2383	kernfs_break_active_protection(of->kn);
				2384	flush_work(&cpuset_hotplug_work);
				2385
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2386	get_online_cpus();
				2387	percpu_down_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2388	if (!is_cpuset_online(cs))
				2389	goto out_unlock;
				2390
				2391	trialcs = alloc_trial_cpuset(cs);
				2392	if (!trialcs) {
				2393	retval = -ENOMEM;
				2394	goto out_unlock;
				2395	}
				2396
				2397	switch (of_cft(of)->private) {
				2398	case FILE_CPULIST:
				2399	retval = update_cpumask(cs, trialcs, buf);
				2400	break;
				2401	case FILE_MEMLIST:
				2402	retval = update_nodemask(cs, trialcs, buf);
				2403	break;
				2404	default:
				2405	retval = -EINVAL;
				2406	break;
				2407	}
				2408
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2409	free_cpuset(trialcs);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2410	out_unlock:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2411	percpu_up_write(&cpuset_rwsem);
				2412	put_online_cpus();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2413	kernfs_unbreak_active_protection(of->kn);
				2414	css_put(&cs->css);
				2415	flush_workqueue(cpuset_migrate_mm_wq);
				2416	return retval ?: nbytes;
				2417	}
				2418
				2419	/*
				2420	* These ascii lists should be read in a single call, by using a user
				2421	* buffer large enough to hold the entire map. If read in smaller
				2422	* chunks, there is no guarantee of atomicity. Since the display format
				2423	* used, list of ranges of sequential numbers, is variable length,
				2424	* and since these maps can change value dynamically, one could read
				2425	* gibberish by doing partial reads while a list was changing.
				2426	*/
				2427	static int cpuset_common_seq_show(struct seq_file sf, void v)
				2428	{
				2429	struct cpuset *cs = css_cs(seq_css(sf));
				2430	cpuset_filetype_t type = seq_cft(sf)->private;
				2431	int ret = 0;
				2432
				2433	spin_lock_irq(&callback_lock);
				2434
				2435	switch (type) {
				2436	case FILE_CPULIST:
				2437	seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
				2438	break;
				2439	case FILE_MEMLIST:
				2440	seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
				2441	break;
				2442	case FILE_EFFECTIVE_CPULIST:
				2443	seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
				2444	break;
				2445	case FILE_EFFECTIVE_MEMLIST:
				2446	seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
				2447	break;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2448	case FILE_SUBPARTS_CPULIST:
				2449	seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
				2450	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2451	default:
				2452	ret = -EINVAL;
				2453	}
				2454
				2455	spin_unlock_irq(&callback_lock);
				2456	return ret;
				2457	}
				2458
				2459	static u64 cpuset_read_u64(struct cgroup_subsys_state css, struct cftype cft)
				2460	{
				2461	struct cpuset *cs = css_cs(css);
				2462	cpuset_filetype_t type = cft->private;
				2463	switch (type) {
				2464	case FILE_CPU_EXCLUSIVE:
				2465	return is_cpu_exclusive(cs);
				2466	case FILE_MEM_EXCLUSIVE:
				2467	return is_mem_exclusive(cs);
				2468	case FILE_MEM_HARDWALL:
				2469	return is_mem_hardwall(cs);
				2470	case FILE_SCHED_LOAD_BALANCE:
				2471	return is_sched_load_balance(cs);
				2472	case FILE_MEMORY_MIGRATE:
				2473	return is_memory_migrate(cs);
				2474	case FILE_MEMORY_PRESSURE_ENABLED:
				2475	return cpuset_memory_pressure_enabled;
				2476	case FILE_MEMORY_PRESSURE:
				2477	return fmeter_getrate(&cs->fmeter);
				2478	case FILE_SPREAD_PAGE:
				2479	return is_spread_page(cs);
				2480	case FILE_SPREAD_SLAB:
				2481	return is_spread_slab(cs);
				2482	default:
				2483	BUG();
				2484	}
				2485
				2486	/* Unreachable but makes gcc happy */
				2487	return 0;
				2488	}
				2489
				2490	static s64 cpuset_read_s64(struct cgroup_subsys_state css, struct cftype cft)
				2491	{
				2492	struct cpuset *cs = css_cs(css);
				2493	cpuset_filetype_t type = cft->private;
				2494	switch (type) {
				2495	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
				2496	return cs->relax_domain_level;
				2497	default:
				2498	BUG();
				2499	}
				2500
				2501	/* Unrechable but makes gcc happy */
				2502	return 0;
				2503	}
				2504
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2505	static int sched_partition_show(struct seq_file seq, void v)
				2506	{
				2507	struct cpuset *cs = css_cs(seq_css(seq));
				2508
				2509	switch (cs->partition_root_state) {
				2510	case PRS_ENABLED:
				2511	seq_puts(seq, "root\n");
				2512	break;
				2513	case PRS_DISABLED:
				2514	seq_puts(seq, "member\n");
				2515	break;
				2516	case PRS_ERROR:
				2517	seq_puts(seq, "root invalid\n");
				2518	break;
				2519	}
				2520	return 0;
				2521	}
				2522
				2523	static ssize_t sched_partition_write(struct kernfs_open_file of, char buf,
				2524	size_t nbytes, loff_t off)
				2525	{
				2526	struct cpuset *cs = css_cs(of_css(of));
				2527	int val;
				2528	int retval = -ENODEV;
				2529
				2530	buf = strstrip(buf);
				2531
				2532	/*
				2533	* Convert "root" to ENABLED, and convert "member" to DISABLED.
				2534	*/
				2535	if (!strcmp(buf, "root"))
				2536	val = PRS_ENABLED;
				2537	else if (!strcmp(buf, "member"))
				2538	val = PRS_DISABLED;
				2539	else
				2540	return -EINVAL;
				2541
				2542	css_get(&cs->css);
				2543	get_online_cpus();
				2544	percpu_down_write(&cpuset_rwsem);
				2545	if (!is_cpuset_online(cs))
				2546	goto out_unlock;
				2547
				2548	retval = update_prstate(cs, val);
				2549	out_unlock:
				2550	percpu_up_write(&cpuset_rwsem);
				2551	put_online_cpus();
				2552	css_put(&cs->css);
				2553	return retval ?: nbytes;
				2554	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2555
				2556	/*
				2557	* for the common functions, 'private' gives the type of file
				2558	*/
				2559
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2560	static struct cftype legacy_files[] = {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2561	{
				2562	.name = "cpus",
				2563	.seq_show = cpuset_common_seq_show,
				2564	.write = cpuset_write_resmask,
				2565	.max_write_len = (100U + 6 * NR_CPUS),
				2566	.private = FILE_CPULIST,
				2567	},
				2568
				2569	{
				2570	.name = "mems",
				2571	.seq_show = cpuset_common_seq_show,
				2572	.write = cpuset_write_resmask,
				2573	.max_write_len = (100U + 6 * MAX_NUMNODES),
				2574	.private = FILE_MEMLIST,
				2575	},
				2576
				2577	{
				2578	.name = "effective_cpus",
				2579	.seq_show = cpuset_common_seq_show,
				2580	.private = FILE_EFFECTIVE_CPULIST,
				2581	},
				2582
				2583	{
				2584	.name = "effective_mems",
				2585	.seq_show = cpuset_common_seq_show,
				2586	.private = FILE_EFFECTIVE_MEMLIST,
				2587	},
				2588
				2589	{
				2590	.name = "cpu_exclusive",
				2591	.read_u64 = cpuset_read_u64,
				2592	.write_u64 = cpuset_write_u64,
				2593	.private = FILE_CPU_EXCLUSIVE,
				2594	},
				2595
				2596	{
				2597	.name = "mem_exclusive",
				2598	.read_u64 = cpuset_read_u64,
				2599	.write_u64 = cpuset_write_u64,
				2600	.private = FILE_MEM_EXCLUSIVE,
				2601	},
				2602
				2603	{
				2604	.name = "mem_hardwall",
				2605	.read_u64 = cpuset_read_u64,
				2606	.write_u64 = cpuset_write_u64,
				2607	.private = FILE_MEM_HARDWALL,
				2608	},
				2609
				2610	{
				2611	.name = "sched_load_balance",
				2612	.read_u64 = cpuset_read_u64,
				2613	.write_u64 = cpuset_write_u64,
				2614	.private = FILE_SCHED_LOAD_BALANCE,
				2615	},
				2616
				2617	{
				2618	.name = "sched_relax_domain_level",
				2619	.read_s64 = cpuset_read_s64,
				2620	.write_s64 = cpuset_write_s64,
				2621	.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
				2622	},
				2623
				2624	{
				2625	.name = "memory_migrate",
				2626	.read_u64 = cpuset_read_u64,
				2627	.write_u64 = cpuset_write_u64,
				2628	.private = FILE_MEMORY_MIGRATE,
				2629	},
				2630
				2631	{
				2632	.name = "memory_pressure",
				2633	.read_u64 = cpuset_read_u64,
				2634	.private = FILE_MEMORY_PRESSURE,
				2635	},
				2636
				2637	{
				2638	.name = "memory_spread_page",
				2639	.read_u64 = cpuset_read_u64,
				2640	.write_u64 = cpuset_write_u64,
				2641	.private = FILE_SPREAD_PAGE,
				2642	},
				2643
				2644	{
				2645	.name = "memory_spread_slab",
				2646	.read_u64 = cpuset_read_u64,
				2647	.write_u64 = cpuset_write_u64,
				2648	.private = FILE_SPREAD_SLAB,
				2649	},
				2650
				2651	{
				2652	.name = "memory_pressure_enabled",
				2653	.flags = CFTYPE_ONLY_ON_ROOT,
				2654	.read_u64 = cpuset_read_u64,
				2655	.write_u64 = cpuset_write_u64,
				2656	.private = FILE_MEMORY_PRESSURE_ENABLED,
				2657	},
				2658
				2659	{ } /* terminate */
				2660	};
				2661
				2662	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2663	* This is currently a minimal set for the default hierarchy. It can be
				2664	* expanded later on by migrating more features and control files from v1.
				2665	*/
				2666	static struct cftype dfl_files[] = {
				2667	{
				2668	.name = "cpus",
				2669	.seq_show = cpuset_common_seq_show,
				2670	.write = cpuset_write_resmask,
				2671	.max_write_len = (100U + 6 * NR_CPUS),
				2672	.private = FILE_CPULIST,
				2673	.flags = CFTYPE_NOT_ON_ROOT,
				2674	},
				2675
				2676	{
				2677	.name = "mems",
				2678	.seq_show = cpuset_common_seq_show,
				2679	.write = cpuset_write_resmask,
				2680	.max_write_len = (100U + 6 * MAX_NUMNODES),
				2681	.private = FILE_MEMLIST,
				2682	.flags = CFTYPE_NOT_ON_ROOT,
				2683	},
				2684
				2685	{
				2686	.name = "cpus.effective",
				2687	.seq_show = cpuset_common_seq_show,
				2688	.private = FILE_EFFECTIVE_CPULIST,
				2689	},
				2690
				2691	{
				2692	.name = "mems.effective",
				2693	.seq_show = cpuset_common_seq_show,
				2694	.private = FILE_EFFECTIVE_MEMLIST,
				2695	},
				2696
				2697	{
				2698	.name = "cpus.partition",
				2699	.seq_show = sched_partition_show,
				2700	.write = sched_partition_write,
				2701	.private = FILE_PARTITION_ROOT,
				2702	.flags = CFTYPE_NOT_ON_ROOT,
				2703	},
				2704
				2705	{
				2706	.name = "cpus.subpartitions",
				2707	.seq_show = cpuset_common_seq_show,
				2708	.private = FILE_SUBPARTS_CPULIST,
				2709	.flags = CFTYPE_DEBUG,
				2710	},
				2711
				2712	{ } /* terminate */
				2713	};
				2714
				2715
				2716	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2717	* cpuset_css_alloc - allocate a cpuset css
				2718	* cgrp: control group that the new cpuset will be part of
				2719	*/
				2720
				2721	static struct cgroup_subsys_state *
				2722	cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
				2723	{
				2724	struct cpuset *cs;
				2725
				2726	if (!parent_css)
				2727	return &top_cpuset.css;
				2728
				2729	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
				2730	if (!cs)
				2731	return ERR_PTR(-ENOMEM);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2732
				2733	if (alloc_cpumasks(cs, NULL)) {
				2734	kfree(cs);
				2735	return ERR_PTR(-ENOMEM);
				2736	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2737
				2738	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2739	nodes_clear(cs->mems_allowed);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2740	nodes_clear(cs->effective_mems);
				2741	fmeter_init(&cs->fmeter);
				2742	cs->relax_domain_level = -1;
				2743
				2744	return &cs->css;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2745	}
				2746
				2747	static int cpuset_css_online(struct cgroup_subsys_state *css)
				2748	{
				2749	struct cpuset *cs = css_cs(css);
				2750	struct cpuset *parent = parent_cs(cs);
				2751	struct cpuset *tmp_cs;
				2752	struct cgroup_subsys_state *pos_css;
				2753
				2754	if (!parent)
				2755	return 0;
				2756
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2757	get_online_cpus();
				2758	percpu_down_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2759
				2760	set_bit(CS_ONLINE, &cs->flags);
				2761	if (is_spread_page(parent))
				2762	set_bit(CS_SPREAD_PAGE, &cs->flags);
				2763	if (is_spread_slab(parent))
				2764	set_bit(CS_SPREAD_SLAB, &cs->flags);
				2765
				2766	cpuset_inc();
				2767
				2768	spin_lock_irq(&callback_lock);
				2769	if (is_in_v2_mode()) {
				2770	cpumask_copy(cs->effective_cpus, parent->effective_cpus);
				2771	cs->effective_mems = parent->effective_mems;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2772	cs->use_parent_ecpus = true;
				2773	parent->child_ecpus_count++;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2774	}
				2775	spin_unlock_irq(&callback_lock);
				2776
				2777	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
				2778	goto out_unlock;
				2779
				2780	/*
				2781	* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
				2782	* set. This flag handling is implemented in cgroup core for
				2783	* histrical reasons - the flag may be specified during mount.
				2784	*
				2785	* Currently, if any sibling cpusets have exclusive cpus or mem, we
				2786	* refuse to clone the configuration - thereby refusing the task to
				2787	* be entered, and as a result refusing the sys_unshare() or
				2788	* clone() which initiated it. If this becomes a problem for some
				2789	* users who wish to allow that scenario, then this could be
				2790	* changed to grant parent->cpus_allowed-sibling_cpus_exclusive
				2791	* (and likewise for mems) to the new cgroup.
				2792	*/
				2793	rcu_read_lock();
				2794	cpuset_for_each_child(tmp_cs, pos_css, parent) {
				2795	if (is_mem_exclusive(tmp_cs) \|\| is_cpu_exclusive(tmp_cs)) {
				2796	rcu_read_unlock();
				2797	goto out_unlock;
				2798	}
				2799	}
				2800	rcu_read_unlock();
				2801
				2802	spin_lock_irq(&callback_lock);
				2803	cs->mems_allowed = parent->mems_allowed;
				2804	cs->effective_mems = parent->mems_allowed;
				2805	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
				2806	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
				2807	spin_unlock_irq(&callback_lock);
				2808	out_unlock:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2809	percpu_up_write(&cpuset_rwsem);
				2810	put_online_cpus();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2811	return 0;
				2812	}
				2813
				2814	/*
				2815	* If the cpuset being removed has its flag 'sched_load_balance'
				2816	* enabled, then simulate turning sched_load_balance off, which
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2817	* will call rebuild_sched_domains_locked(). That is not needed
				2818	* in the default hierarchy where only changes in partition
				2819	* will cause repartitioning.
				2820	*
				2821	* If the cpuset has the 'sched.partition' flag enabled, simulate
				2822	* turning 'sched.partition" off.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2823	*/
				2824
				2825	static void cpuset_css_offline(struct cgroup_subsys_state *css)
				2826	{
				2827	struct cpuset *cs = css_cs(css);
				2828
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2829	get_online_cpus();
				2830	percpu_down_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2831
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2832	if (is_partition_root(cs))
				2833	update_prstate(cs, 0);
				2834
				2835	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
				2836	is_sched_load_balance(cs))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2837	update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
				2838
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2839	if (cs->use_parent_ecpus) {
				2840	struct cpuset *parent = parent_cs(cs);
				2841
				2842	cs->use_parent_ecpus = false;
				2843	parent->child_ecpus_count--;
				2844	}
				2845
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2846	cpuset_dec();
				2847	clear_bit(CS_ONLINE, &cs->flags);
				2848
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2849	percpu_up_write(&cpuset_rwsem);
				2850	put_online_cpus();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2851	}
				2852
				2853	static void cpuset_css_free(struct cgroup_subsys_state *css)
				2854	{
				2855	struct cpuset *cs = css_cs(css);
				2856
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2857	free_cpuset(cs);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2858	}
				2859
				2860	static void cpuset_bind(struct cgroup_subsys_state *root_css)
				2861	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2862	percpu_down_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2863	spin_lock_irq(&callback_lock);
				2864
				2865	if (is_in_v2_mode()) {
				2866	cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
				2867	top_cpuset.mems_allowed = node_possible_map;
				2868	} else {
				2869	cpumask_copy(top_cpuset.cpus_allowed,
				2870	top_cpuset.effective_cpus);
				2871	top_cpuset.mems_allowed = top_cpuset.effective_mems;
				2872	}
				2873
				2874	spin_unlock_irq(&callback_lock);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2875	percpu_up_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2876	}
				2877
				2878	/*
				2879	* Make sure the new task conform to the current state of its parent,
				2880	* which could have been changed by cpuset just after it inherits the
				2881	* state from the parent and before it sits on the cgroup's task list.
				2882	*/
				2883	static void cpuset_fork(struct task_struct *task)
				2884	{
				2885	if (task_css_is_root(task, cpuset_cgrp_id))
				2886	return;
				2887
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2888	set_cpus_allowed_ptr(task, current->cpus_ptr);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2889	task->mems_allowed = current->mems_allowed;
				2890	}
				2891
				2892	struct cgroup_subsys cpuset_cgrp_subsys = {
				2893	.css_alloc = cpuset_css_alloc,
				2894	.css_online = cpuset_css_online,
				2895	.css_offline = cpuset_css_offline,
				2896	.css_free = cpuset_css_free,
				2897	.can_attach = cpuset_can_attach,
				2898	.cancel_attach = cpuset_cancel_attach,
				2899	.attach = cpuset_attach,
				2900	.post_attach = cpuset_post_attach,
				2901	.bind = cpuset_bind,
				2902	.fork = cpuset_fork,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2903	.legacy_cftypes = legacy_files,
				2904	.dfl_cftypes = dfl_files,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2905	.early_init = true,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2906	.threaded = true,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2907	};
				2908
				2909	/**
				2910	* cpuset_init - initialize cpusets at system boot
				2911	*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2912	* Description: Initialize top_cpuset
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2913	**/
				2914
				2915	int __init cpuset_init(void)
				2916	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2917	BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2918
				2919	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
				2920	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2921	BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2922
				2923	cpumask_setall(top_cpuset.cpus_allowed);
				2924	nodes_setall(top_cpuset.mems_allowed);
				2925	cpumask_setall(top_cpuset.effective_cpus);
				2926	nodes_setall(top_cpuset.effective_mems);
				2927
				2928	fmeter_init(&top_cpuset.fmeter);
				2929	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
				2930	top_cpuset.relax_domain_level = -1;
				2931
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2932	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
				2933
				2934	return 0;
				2935	}
				2936
				2937	/*
				2938	* If CPU and/or memory hotplug handlers, below, unplug any CPUs
				2939	* or memory nodes, we need to walk over the cpuset hierarchy,
				2940	* removing that CPU or node from all cpusets. If this removes the
				2941	* last CPU or node from a cpuset, then move the tasks in the empty
				2942	* cpuset to its next-highest non-empty parent.
				2943	*/
				2944	static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
				2945	{
				2946	struct cpuset *parent;
				2947
				2948	/*
				2949	* Find its next-highest non-empty parent, (top cpuset
				2950	* has online cpus, so can't be empty).
				2951	*/
				2952	parent = parent_cs(cs);
				2953	while (cpumask_empty(parent->cpus_allowed) \|\|
				2954	nodes_empty(parent->mems_allowed))
				2955	parent = parent_cs(parent);
				2956
				2957	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
				2958	pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
				2959	pr_cont_cgroup_name(cs->css.cgroup);
				2960	pr_cont("\n");
				2961	}
				2962	}
				2963
				2964	static void
				2965	hotplug_update_tasks_legacy(struct cpuset *cs,
				2966	struct cpumask new_cpus, nodemask_t new_mems,
				2967	bool cpus_updated, bool mems_updated)
				2968	{
				2969	bool is_empty;
				2970
				2971	spin_lock_irq(&callback_lock);
				2972	cpumask_copy(cs->cpus_allowed, new_cpus);
				2973	cpumask_copy(cs->effective_cpus, new_cpus);
				2974	cs->mems_allowed = *new_mems;
				2975	cs->effective_mems = *new_mems;
				2976	spin_unlock_irq(&callback_lock);
				2977
				2978	/*
				2979	* Don't call update_tasks_cpumask() if the cpuset becomes empty,
				2980	* as the tasks will be migratecd to an ancestor.
				2981	*/
				2982	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
				2983	update_tasks_cpumask(cs);
				2984	if (mems_updated && !nodes_empty(cs->mems_allowed))
				2985	update_tasks_nodemask(cs);
				2986
				2987	is_empty = cpumask_empty(cs->cpus_allowed) \|\|
				2988	nodes_empty(cs->mems_allowed);
				2989
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2990	percpu_up_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2991
				2992	/*
				2993	* Move tasks to the nearest ancestor with execution resources,
				2994	* This is full cgroup operation which will also call back into
				2995	* cpuset. Should be done outside any lock.
				2996	*/
				2997	if (is_empty)
				2998	remove_tasks_in_empty_cpuset(cs);
				2999
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3000	percpu_down_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3001	}
				3002
				3003	static void
				3004	hotplug_update_tasks(struct cpuset *cs,
				3005	struct cpumask new_cpus, nodemask_t new_mems,
				3006	bool cpus_updated, bool mems_updated)
				3007	{
				3008	if (cpumask_empty(new_cpus))
				3009	cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
				3010	if (nodes_empty(*new_mems))
				3011	*new_mems = parent_cs(cs)->effective_mems;
				3012
				3013	spin_lock_irq(&callback_lock);
				3014	cpumask_copy(cs->effective_cpus, new_cpus);
				3015	cs->effective_mems = *new_mems;
				3016	spin_unlock_irq(&callback_lock);
				3017
				3018	if (cpus_updated)
				3019	update_tasks_cpumask(cs);
				3020	if (mems_updated)
				3021	update_tasks_nodemask(cs);
				3022	}
				3023
/*
 * Set when a sched domain rebuild must happen on the next hotplug work
 * run even if the cpu mask itself did not change; the hotplug work
 * function clears it before rebuilding.
 */
static bool force_rebuild;

/* cpuset_force_rebuild - request a sched domain rebuild by the hotplug work. */
void cpuset_force_rebuild(void)
{
	force_rebuild = true;
}
				3030
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3031	/**
				3032	* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
				3033	* @cs: cpuset in interest
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3034	* @tmp: the tmpmasks structure pointer
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3035	*
				3036	* Compare @cs's cpu and mem masks against top_cpuset and if some have gone
				3037	* offline, update @cs accordingly. If @cs ends up with no CPU or memory,
				3038	* all its tasks are moved to the nearest ancestor with both resources.
				3039	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3040	static void cpuset_hotplug_update_tasks(struct cpuset cs, struct tmpmasks tmp)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3041	{
				3042	static cpumask_t new_cpus;
				3043	static nodemask_t new_mems;
				3044	bool cpus_updated;
				3045	bool mems_updated;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3046	struct cpuset *parent;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3047	retry:
				3048	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
				3049
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3050	percpu_down_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3051
				3052	/*
				3053	* We have raced with task attaching. We wait until attaching
				3054	* is finished, so we won't attach a task to an empty cpuset.
				3055	*/
				3056	if (cs->attach_in_progress) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3057	percpu_up_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3058	goto retry;
				3059	}
				3060
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3061	parent = parent_cs(cs);
				3062	compute_effective_cpumask(&new_cpus, cs, parent);
				3063	nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3064
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3065	if (cs->nr_subparts_cpus)
				3066	/*
				3067	* Make sure that CPUs allocated to child partitions
				3068	* do not show up in effective_cpus.
				3069	*/
				3070	cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
				3071
				3072	if (!tmp \|\| !cs->partition_root_state)
				3073	goto update_tasks;
				3074
				3075	/*
				3076	* In the unlikely event that a partition root has empty
				3077	* effective_cpus or its parent becomes erroneous, we have to
				3078	* transition it to the erroneous state.
				3079	*/
				3080	if (is_partition_root(cs) && (cpumask_empty(&new_cpus) \|\|
				3081	(parent->partition_root_state == PRS_ERROR))) {
				3082	if (cs->nr_subparts_cpus) {
				3083	cs->nr_subparts_cpus = 0;
				3084	cpumask_clear(cs->subparts_cpus);
				3085	compute_effective_cpumask(&new_cpus, cs, parent);
				3086	}
				3087
				3088	/*
				3089	* If the effective_cpus is empty because the child
				3090	* partitions take away all the CPUs, we can keep
				3091	* the current partition and let the child partitions
				3092	* fight for available CPUs.
				3093	*/
				3094	if ((parent->partition_root_state == PRS_ERROR) \|\|
				3095	cpumask_empty(&new_cpus)) {
				3096	update_parent_subparts_cpumask(cs, partcmd_disable,
				3097	NULL, tmp);
				3098	cs->partition_root_state = PRS_ERROR;
				3099	}
				3100	cpuset_force_rebuild();
				3101	}
				3102
				3103	/*
				3104	* On the other hand, an erroneous partition root may be transitioned
				3105	* back to a regular one or a partition root with no CPU allocated
				3106	* from the parent may change to erroneous.
				3107	*/
				3108	if (is_partition_root(parent) &&
				3109	((cs->partition_root_state == PRS_ERROR) \|\|
				3110	!cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
				3111	update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
				3112	cpuset_force_rebuild();
				3113
				3114	update_tasks:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3115	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
				3116	mems_updated = !nodes_equal(new_mems, cs->effective_mems);
				3117
				3118	if (is_in_v2_mode())
				3119	hotplug_update_tasks(cs, &new_cpus, &new_mems,
				3120	cpus_updated, mems_updated);
				3121	else
				3122	hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
				3123	cpus_updated, mems_updated);
				3124
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3125	percpu_up_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3126	}
				3127
				3128	/**
				3129	* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
				3130	*
				3131	* This function is called after either CPU or memory configuration has
				3132	* changed and updates cpuset accordingly. The top_cpuset is always
				3133	* synchronized to cpu_active_mask and N_MEMORY, which is necessary in
				3134	* order to make cpusets transparent (of no affect) on systems that are
				3135	* actively using CPU hotplug but making no active use of cpusets.
				3136	*
				3137	* Non-root cpusets are only affected by offlining. If any CPUs or memory
				3138	* nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
				3139	* all descendants.
				3140	*
				3141	* Note that CPU offlining during suspend is ignored. We don't modify
				3142	* cpusets across suspend/resume cycles at all.
				3143	*/
				3144	static void cpuset_hotplug_workfn(struct work_struct *work)
				3145	{
				3146	static cpumask_t new_cpus;
				3147	static nodemask_t new_mems;
				3148	bool cpus_updated, mems_updated;
				3149	bool on_dfl = is_in_v2_mode();
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3150	struct tmpmasks tmp, *ptmp = NULL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3151
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3152	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
				3153	ptmp = &tmp;
				3154
				3155	percpu_down_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3156
				3157	/* fetch the available cpus/mems and find out which changed how */
				3158	cpumask_copy(&new_cpus, cpu_active_mask);
				3159	new_mems = node_states[N_MEMORY];
				3160
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3161	/*
				3162	* If subparts_cpus is populated, it is likely that the check below
				3163	* will produce a false positive on cpus_updated when the cpu list
				3164	* isn't changed. It is extra work, but it is better to be safe.
				3165	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3166	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
				3167	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
				3168
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3169	/*
				3170	* In the rare case that hotplug removes all the cpus in subparts_cpus,
				3171	* we assumed that cpus are updated.
				3172	*/
				3173	if (!cpus_updated && top_cpuset.nr_subparts_cpus)
				3174	cpus_updated = true;
				3175
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3176	/* synchronize cpus_allowed to cpu_active_mask */
				3177	if (cpus_updated) {
				3178	spin_lock_irq(&callback_lock);
				3179	if (!on_dfl)
				3180	cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3181	/*
				3182	* Make sure that CPUs allocated to child partitions
				3183	* do not show up in effective_cpus. If no CPU is left,
				3184	* we clear the subparts_cpus & let the child partitions
				3185	* fight for the CPUs again.
				3186	*/
				3187	if (top_cpuset.nr_subparts_cpus) {
				3188	if (cpumask_subset(&new_cpus,
				3189	top_cpuset.subparts_cpus)) {
				3190	top_cpuset.nr_subparts_cpus = 0;
				3191	cpumask_clear(top_cpuset.subparts_cpus);
				3192	} else {
				3193	cpumask_andnot(&new_cpus, &new_cpus,
				3194	top_cpuset.subparts_cpus);
				3195	}
				3196	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3197	cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
				3198	spin_unlock_irq(&callback_lock);
				3199	/* we don't mess with cpumasks of tasks in top_cpuset */
				3200	}
				3201
				3202	/* synchronize mems_allowed to N_MEMORY */
				3203	if (mems_updated) {
				3204	spin_lock_irq(&callback_lock);
				3205	if (!on_dfl)
				3206	top_cpuset.mems_allowed = new_mems;
				3207	top_cpuset.effective_mems = new_mems;
				3208	spin_unlock_irq(&callback_lock);
				3209	update_tasks_nodemask(&top_cpuset);
				3210	}
				3211
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3212	percpu_up_write(&cpuset_rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3213
				3214	/* if cpus or mems changed, we need to propagate to descendants */
				3215	if (cpus_updated \|\| mems_updated) {
				3216	struct cpuset *cs;
				3217	struct cgroup_subsys_state *pos_css;
				3218
				3219	rcu_read_lock();
				3220	cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
				3221	if (cs == &top_cpuset \|\| !css_tryget_online(&cs->css))
				3222	continue;
				3223	rcu_read_unlock();
				3224
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3225	cpuset_hotplug_update_tasks(cs, ptmp);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3226
				3227	rcu_read_lock();
				3228	css_put(&cs->css);
				3229	}
				3230	rcu_read_unlock();
				3231	}
				3232
				3233	/* rebuild sched domains if cpus_allowed has changed */
				3234	if (cpus_updated \|\| force_rebuild) {
				3235	force_rebuild = false;
				3236	rebuild_sched_domains();
				3237	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3238
				3239	free_cpumasks(NULL, ptmp);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3240	}
				3241
				3242	void cpuset_update_active_cpus(void)
				3243	{
				3244	/*
				3245	* We're inside cpu hotplug critical region which usually nests
				3246	* inside cgroup synchronization. Bounce actual hotplug processing
				3247	* to a work item to avoid reverse locking order.
				3248	*/
				3249	schedule_work(&cpuset_hotplug_work);
				3250	}
				3251
				3252	void cpuset_wait_for_hotplug(void)
				3253	{
				3254	flush_work(&cpuset_hotplug_work);
				3255	}
				3256
				3257	/*
				3258	* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
				3259	* Call this routine anytime after node_states[N_MEMORY] changes.
				3260	* See cpuset_update_active_cpus() for CPU hotplug handling.
				3261	*/
				3262	static int cpuset_track_online_nodes(struct notifier_block *self,
				3263	unsigned long action, void *arg)
				3264	{
				3265	schedule_work(&cpuset_hotplug_work);
				3266	return NOTIFY_OK;
				3267	}
				3268
				3269	static struct notifier_block cpuset_track_online_nodes_nb = {
				3270	.notifier_call = cpuset_track_online_nodes,
				3271	.priority = 10, /* ??! */
				3272	};
				3273
				3274	/**
				3275	* cpuset_init_smp - initialize cpus_allowed
				3276	*
				3277	* Description: Finish top cpuset after cpu, node maps are initialized
				3278	*/
				3279	void __init cpuset_init_smp(void)
				3280	{
				3281	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
				3282	top_cpuset.mems_allowed = node_states[N_MEMORY];
				3283	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
				3284
				3285	cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
				3286	top_cpuset.effective_mems = node_states[N_MEMORY];
				3287
				3288	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
				3289
				3290	cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
				3291	BUG_ON(!cpuset_migrate_mm_wq);
				3292	}
				3293
				3294	/**
				3295	* cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
				3296	* @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
				3297	* @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
				3298	*
				3299	* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
				3300	* attached to the specified @tsk. Guaranteed to return some non-empty
				3301	* subset of cpu_online_mask, even if this means going outside the
				3302	* tasks cpuset.
				3303	**/
				3304
				3305	void cpuset_cpus_allowed(struct task_struct tsk, struct cpumask pmask)
				3306	{
				3307	unsigned long flags;
				3308
				3309	spin_lock_irqsave(&callback_lock, flags);
				3310	rcu_read_lock();
				3311	guarantee_online_cpus(task_cs(tsk), pmask);
				3312	rcu_read_unlock();
				3313	spin_unlock_irqrestore(&callback_lock, flags);
				3314	}
				3315
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3316	/**
				3317	* cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
				3318	* @tsk: pointer to task_struct with which the scheduler is struggling
				3319	*
				3320	* Description: In the case that the scheduler cannot find an allowed cpu in
				3321	* tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
				3322	* mode however, this value is the same as task_cs(tsk)->effective_cpus,
				3323	* which will not contain a sane cpumask during cases such as cpu hotplugging.
				3324	* This is the absolute last resort for the scheduler and it is only used if
				3325	* _every_ other avenue has been traveled.
				3326	**/
				3327
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3328	void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
				3329	{
				3330	rcu_read_lock();
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3331	do_set_cpus_allowed(tsk, is_in_v2_mode() ?
				3332	task_cs(tsk)->cpus_allowed : cpu_possible_mask);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3333	rcu_read_unlock();
				3334
				3335	/*
				3336	* We own tsk->cpus_allowed, nobody can change it under us.
				3337	*
				3338	* But we used cs && cs->cpus_allowed lockless and thus can
				3339	* race with cgroup_attach_task() or update_cpumask() and get
				3340	* the wrong tsk->cpus_allowed. However, both cases imply the
				3341	* subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
				3342	* which takes task_rq_lock().
				3343	*
				3344	* If we are called after it dropped the lock we must see all
				3345	* changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
				3346	* set any mask even if it is not right from task_cs() pov,
				3347	* the pending set_cpus_allowed_ptr() will fix things.
				3348	*
				3349	* select_fallback_rq() will fix things ups and set cpu_possible_mask
				3350	* if required.
				3351	*/
				3352	}
				3353
				3354	void __init cpuset_init_current_mems_allowed(void)
				3355	{
				3356	nodes_setall(current->mems_allowed);
				3357	}
				3358
				3359	/**
				3360	* cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
				3361	* @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
				3362	*
				3363	* Description: Returns the nodemask_t mems_allowed of the cpuset
				3364	* attached to the specified @tsk. Guaranteed to return some non-empty
				3365	* subset of node_states[N_MEMORY], even if this means going outside the
				3366	* tasks cpuset.
				3367	**/
				3368
				3369	nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
				3370	{
				3371	nodemask_t mask;
				3372	unsigned long flags;
				3373
				3374	spin_lock_irqsave(&callback_lock, flags);
				3375	rcu_read_lock();
				3376	guarantee_online_mems(task_cs(tsk), &mask);
				3377	rcu_read_unlock();
				3378	spin_unlock_irqrestore(&callback_lock, flags);
				3379
				3380	return mask;
				3381	}
				3382
				3383	/**
				3384	* cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
				3385	* @nodemask: the nodemask to be checked
				3386	*
				3387	* Are any of the nodes in the nodemask allowed in current->mems_allowed?
				3388	*/
				3389	int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
				3390	{
				3391	return nodes_intersects(*nodemask, current->mems_allowed);
				3392	}
				3393
				3394	/*
				3395	* nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
				3396	* mem_hardwall ancestor to the specified cpuset. Call holding
				3397	* callback_lock. If no ancestor is mem_exclusive or mem_hardwall
				3398	* (an unusual configuration), then returns the root cpuset.
				3399	*/
				3400	static struct cpuset nearest_hardwall_ancestor(struct cpuset cs)
				3401	{
				3402	while (!(is_mem_exclusive(cs) \|\| is_mem_hardwall(cs)) && parent_cs(cs))
				3403	cs = parent_cs(cs);
				3404	return cs;
				3405	}
				3406
				3407	/**
				3408	* cpuset_node_allowed - Can we allocate on a memory node?
				3409	* @node: is this an allowed node?
				3410	* @gfp_mask: memory allocation flags
				3411	*
				3412	* If we're in interrupt, yes, we can always allocate. If @node is set in
				3413	* current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
				3414	* node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
				3415	* yes. If current has access to memory reserves as an oom victim, yes.
				3416	* Otherwise, no.
				3417	*
				3418	* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
				3419	* and do not allow allocations outside the current tasks cpuset
				3420	* unless the task has been OOM killed.
				3421	* GFP_KERNEL allocations are not so marked, so can escape to the
				3422	* nearest enclosing hardwalled ancestor cpuset.
				3423	*
				3424	* Scanning up parent cpusets requires callback_lock. The
				3425	* __alloc_pages() routine only calls here with __GFP_HARDWALL bit
				3426	* _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
				3427	* current tasks mems_allowed came up empty on the first pass over
				3428	* the zonelist. So only GFP_KERNEL allocations, if all nodes in the
				3429	* cpuset are short of memory, might require taking the callback_lock.
				3430	*
				3431	* The first call here from mm/page_alloc:get_page_from_freelist()
				3432	* has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
				3433	* so no allocation on a node outside the cpuset is allowed (unless
				3434	* in interrupt, of course).
				3435	*
				3436	* The second pass through get_page_from_freelist() doesn't even call
				3437	* here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
				3438	* variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
				3439	* in alloc_flags. That logic and the checks below have the combined
				3440	* affect that:
				3441	* in_interrupt - any node ok (current task context irrelevant)
				3442	* GFP_ATOMIC - any node ok
				3443	* tsk_is_oom_victim - any node ok
				3444	* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
				3445	* GFP_USER - only nodes in current tasks mems allowed ok.
				3446	*/
				3447	bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
				3448	{
				3449	struct cpuset cs; / current cpuset ancestors */
				3450	int allowed; /* is allocation in zone z allowed? */
				3451	unsigned long flags;
				3452
				3453	if (in_interrupt())
				3454	return true;
				3455	if (node_isset(node, current->mems_allowed))
				3456	return true;
				3457	/*
				3458	* Allow tasks that have access to memory reserves because they have
				3459	* been OOM killed to get memory anywhere.
				3460	*/
				3461	if (unlikely(tsk_is_oom_victim(current)))
				3462	return true;
				3463	if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
				3464	return false;
				3465
				3466	if (current->flags & PF_EXITING) /* Let dying task have memory */
				3467	return true;
				3468
				3469	/* Not hardwall and node outside mems_allowed: scan up cpusets */
				3470	spin_lock_irqsave(&callback_lock, flags);
				3471
				3472	rcu_read_lock();
				3473	cs = nearest_hardwall_ancestor(task_cs(current));
				3474	allowed = node_isset(node, cs->mems_allowed);
				3475	rcu_read_unlock();
				3476
				3477	spin_unlock_irqrestore(&callback_lock, flags);
				3478	return allowed;
				3479	}
				3480
				3481	/**
				3482	* cpuset_mem_spread_node() - On which node to begin search for a file page
				3483	* cpuset_slab_spread_node() - On which node to begin search for a slab page
				3484	*
				3485	* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
				3486	* tasks in a cpuset with is_spread_page or is_spread_slab set),
				3487	* and if the memory allocation used cpuset_mem_spread_node()
				3488	* to determine on which node to start looking, as it will for
				3489	* certain page cache or slab cache pages such as used for file
				3490	* system buffers and inode caches, then instead of starting on the
				3491	* local node to look for a free page, rather spread the starting
				3492	* node around the tasks mems_allowed nodes.
				3493	*
				3494	* We don't have to worry about the returned node being offline
				3495	* because "it can't happen", and even if it did, it would be ok.
				3496	*
				3497	* The routines calling guarantee_online_mems() are careful to
				3498	* only set nodes in task->mems_allowed that are online. So it
				3499	* should not be possible for the following code to return an
				3500	* offline node. But if it did, that would be ok, as this routine
				3501	* is not returning the node where the allocation must be, only
				3502	* the node where the search should start. The zonelist passed to
				3503	* __alloc_pages() will include all nodes. If the slab allocator
				3504	* is passed an offline node, it will fall back to the local node.
				3505	* See kmem_cache_alloc_node().
				3506	*/
				3507
				3508	static int cpuset_spread_node(int *rotor)
				3509	{
				3510	return rotor = next_node_in(rotor, current->mems_allowed);
				3511	}
				3512
				3513	int cpuset_mem_spread_node(void)
				3514	{
				3515	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
				3516	current->cpuset_mem_spread_rotor =
				3517	node_random(&current->mems_allowed);
				3518
				3519	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
				3520	}
				3521
				3522	int cpuset_slab_spread_node(void)
				3523	{
				3524	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
				3525	current->cpuset_slab_spread_rotor =
				3526	node_random(&current->mems_allowed);
				3527
				3528	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
				3529	}
				3530
				3531	EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
				3532
				3533	/**
				3534	* cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
				3535	* @tsk1: pointer to task_struct of some task.
				3536	* @tsk2: pointer to task_struct of some other task.
				3537	*
				3538	* Description: Return true if @tsk1's mems_allowed intersects the
				3539	* mems_allowed of @tsk2. Used by the OOM killer to determine if
				3540	* one of the task's memory usage might impact the memory available
				3541	* to the other.
				3542	**/
				3543
				3544	int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				3545	const struct task_struct *tsk2)
				3546	{
				3547	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
				3548	}
				3549
				3550	/**
				3551	* cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
				3552	*
				3553	* Description: Prints current's name, cpuset name, and cached copy of its
				3554	* mems_allowed to the kernel log.
				3555	*/
				3556	void cpuset_print_current_mems_allowed(void)
				3557	{
				3558	struct cgroup *cgrp;
				3559
				3560	rcu_read_lock();
				3561
				3562	cgrp = task_cs(current)->css.cgroup;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3563	pr_cont(",cpuset=");
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3564	pr_cont_cgroup_name(cgrp);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3565	pr_cont(",mems_allowed=%*pbl",
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3566	nodemask_pr_args(&current->mems_allowed));
				3567
				3568	rcu_read_unlock();
				3569	}
				3570
				3571	/*
				3572	* Collection of memory_pressure is suppressed unless
				3573	* this flag is enabled by writing "1" to the special
				3574	* cpuset file 'memory_pressure_enabled' in the root cpuset.
				3575	*/
				3576
				3577	int cpuset_memory_pressure_enabled __read_mostly;
				3578
				3579	/**
				3580	* cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
				3581	*
				3582	* Keep a running average of the rate of synchronous (direct)
				3583	* page reclaim efforts initiated by tasks in each cpuset.
				3584	*
				3585	* This represents the rate at which some task in the cpuset
				3586	* ran low on memory on all nodes it was allowed to use, and
				3587	* had to enter the kernels page reclaim code in an effort to
				3588	* create more free memory by tossing clean pages or swapping
				3589	* or writing dirty pages.
				3590	*
				3591	* Display to user space in the per-cpuset read-only file
				3592	* "memory_pressure". Value displayed is an integer
				3593	* representing the recent rate of entry into the synchronous
				3594	* (direct) page reclaim by any task attached to the cpuset.
				3595	**/
				3596
				3597	void __cpuset_memory_pressure_bump(void)
				3598	{
				3599	rcu_read_lock();
				3600	fmeter_markevent(&task_cs(current)->fmeter);
				3601	rcu_read_unlock();
				3602	}
				3603
				3604	#ifdef CONFIG_PROC_PID_CPUSET
				3605	/*
				3606	* proc_cpuset_show()
				3607	* - Print tasks cpuset path into seq_file.
				3608	* - Used for /proc/<pid>/cpuset.
				3609	* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
				3610	* doesn't really matter if tsk->cpuset changes after we read it,
				3611	* and we take cpuset_mutex, keeping cpuset_attach() from changing it
				3612	* anyway.
				3613	*/
				3614	int proc_cpuset_show(struct seq_file m, struct pid_namespace ns,
				3615	struct pid pid, struct task_struct tsk)
				3616	{
				3617	char *buf;
				3618	struct cgroup_subsys_state *css;
				3619	int retval;
				3620
				3621	retval = -ENOMEM;
				3622	buf = kmalloc(PATH_MAX, GFP_KERNEL);
				3623	if (!buf)
				3624	goto out;
				3625
				3626	css = task_get_css(tsk, cpuset_cgrp_id);
				3627	retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
				3628	current->nsproxy->cgroup_ns);
				3629	css_put(css);
				3630	if (retval >= PATH_MAX)
				3631	retval = -ENAMETOOLONG;
				3632	if (retval < 0)
				3633	goto out_free;
				3634	seq_puts(m, buf);
				3635	seq_putc(m, '\n');
				3636	retval = 0;
				3637	out_free:
				3638	kfree(buf);
				3639	out:
				3640	return retval;
				3641	}
				3642	#endif /* CONFIG_PROC_PID_CPUSET */
				3643
				3644	/* Display task mems_allowed in /proc/<pid>/status file. */
				3645	void cpuset_task_status_allowed(struct seq_file m, struct task_struct task)
				3646	{
				3647	seq_printf(m, "Mems_allowed:\t%*pb\n",
				3648	nodemask_pr_args(&task->mems_allowed));
				3649	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
				3650	nodemask_pr_args(&task->mems_allowed));
				3651	}