Blame - include/linux/cgroup-defs.h - hafnium/third_party/linux.git

blob: c9fafca1c30c5247e87cba6e3f45b92e4f25193a [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	/* SPDX-License-Identifier: GPL-2.0 */
				2	/*
				3	* linux/cgroup-defs.h - basic definitions for cgroup
				4	*
				5	* This file provides basic type and interface. Include this file directly
				6	* only if necessary to avoid cyclic dependencies.
				7	*/
				8	#ifndef _LINUX_CGROUP_DEFS_H
				9	#define _LINUX_CGROUP_DEFS_H
				10
				11	#include <linux/limits.h>
				12	#include <linux/list.h>
				13	#include <linux/idr.h>
				14	#include <linux/wait.h>
				15	#include <linux/mutex.h>
				16	#include <linux/rcupdate.h>
				17	#include <linux/refcount.h>
				18	#include <linux/percpu-refcount.h>
				19	#include <linux/percpu-rwsem.h>
				20	#include <linux/u64_stats_sync.h>
				21	#include <linux/workqueue.h>
				22	#include <linux/bpf-cgroup.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	23	#include <linux/psi_types.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	24
				25	#ifdef CONFIG_CGROUPS
				26
				27	struct cgroup;
				28	struct cgroup_root;
				29	struct cgroup_subsys;
				30	struct cgroup_taskset;
				31	struct kernfs_node;
				32	struct kernfs_ops;
				33	struct kernfs_open_file;
				34	struct seq_file;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	35	struct poll_table_struct;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	36
				37	#define MAX_CGROUP_TYPE_NAMELEN 32
				38	#define MAX_CGROUP_ROOT_NAMELEN 64
				39	#define MAX_CFTYPE_NAME 64
				40
				41	/* define the enumeration of all cgroup subsystems */
				42	#define SUBSYS(_x) _x ## _cgrp_id,
				43	enum cgroup_subsys_id {
				44	#include <linux/cgroup_subsys.h>
				45	CGROUP_SUBSYS_COUNT,
				46	};
				47	#undef SUBSYS
				48
				49	/* bits in struct cgroup_subsys_state flags field */
				50	enum {
				51	CSS_NO_REF = (1 << 0), /* no reference counting for this css */
				52	CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */
				53	CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */
				54	CSS_VISIBLE = (1 << 3), /* css is visible to userland */
				55	CSS_DYING = (1 << 4), /* css is dying */
				56	};
				57
				58	/* bits in struct cgroup flags field */
				59	enum {
				60	/* Control Group requires release notifications to userspace */
				61	CGRP_NOTIFY_ON_RELEASE,
				62	/*
				63	* Clone the parent's configuration when creating a new child
				64	* cpuset cgroup. For historical reasons, this option can be
				65	* specified at mount time and thus is implemented here.
				66	*/
				67	CGRP_CPUSET_CLONE_CHILDREN,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	68
				69	/* Control group has to be frozen. */
				70	CGRP_FREEZE,
				71
				72	/* Cgroup is frozen. */
				73	CGRP_FROZEN,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	74	};
				75
				76	/* cgroup_root->flags */
				77	enum {
				78	CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */
				79	CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */
				80
				81	/*
				82	* Consider namespaces as delegation boundaries. If this flag is
				83	* set, controller specific interface files in a namespace root
				84	* aren't writeable from inside the namespace.
				85	*/
				86	CGRP_ROOT_NS_DELEGATE = (1 << 3),
				87
				88	/*
				89	* Enable cpuset controller in v1 cgroup to use v2 behavior.
				90	*/
				91	CGRP_ROOT_CPUSET_V2_MODE = (1 << 4),
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	92
				93	/*
				94	* Enable legacy local memory.events.
				95	*/
				96	CGRP_ROOT_MEMORY_LOCAL_EVENTS = (1 << 5),
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	97
				98	/*
				99	* Enable recursive subtree protection
				100	*/
				101	CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 6),
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	102	};
				103
				104	/* cftype->flags */
				105	enum {
				106	CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */
				107	CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */
				108	CFTYPE_NS_DELEGATABLE = (1 << 2), /* writeable beyond delegation boundaries */
				109
				110	CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */
				111	CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	112	CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	113
				114	/* internal flags, do not use outside cgroup core proper */
				115	__CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */
				116	__CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */
				117	};
				118
				119	/*
				120	* cgroup_file is the handle for a file instance created in a cgroup which
				121	* is used, for example, to generate file changed notifications. This can
				122	* be obtained by setting cftype->file_offset.
				123	*/
				124	struct cgroup_file {
				125	/* do not access any fields from outside cgroup core */
				126	struct kernfs_node *kn;
				127	unsigned long notified_at;
				128	struct timer_list notify_timer;
				129	};
				130
				131	/*
				132	* Per-subsystem/per-cgroup state maintained by the system. This is the
				133	* fundamental structural building block that controllers deal with.
				134	*
				135	* Fields marked with "PI:" are public and immutable and may be accessed
				136	* directly without synchronization.
				137	*/
				138	struct cgroup_subsys_state {
				139	/* PI: the cgroup that this css is attached to */
				140	struct cgroup *cgroup;
				141
				142	/* PI: the cgroup subsystem that this css is attached to */
				143	struct cgroup_subsys *ss;
				144
				145	/* reference count - access via css_[try]get() and css_put() */
				146	struct percpu_ref refcnt;
				147
				148	/* siblings list anchored at the parent's ->children */
				149	struct list_head sibling;
				150	struct list_head children;
				151
				152	/* flush target list anchored at cgrp->rstat_css_list */
				153	struct list_head rstat_css_node;
				154
				155	/*
				156	* PI: Subsys-unique ID. 0 is unused and root is always 1. The
				157	* matching css can be looked up using css_from_id().
				158	*/
				159	int id;
				160
				161	unsigned int flags;
				162
				163	/*
				164	* Monotonically increasing unique serial number which defines a
				165	* uniform order among all csses. It's guaranteed that all
				166	* ->children lists are in the ascending order of ->serial_nr and
				167	* used to allow interrupting and resuming iterations.
				168	*/
				169	u64 serial_nr;
				170
				171	/*
				172	* Incremented by online self and children. Used to guarantee that
				173	* parents are not offlined before their children.
				174	*/
				175	atomic_t online_cnt;
				176
				177	/* percpu_ref killing and RCU release */
				178	struct work_struct destroy_work;
				179	struct rcu_work destroy_rwork;
				180
				181	/*
				182	* PI: the parent css. Placed here for cache proximity to following
				183	* fields of the containing structure.
				184	*/
				185	struct cgroup_subsys_state *parent;
				186	};
				187
				188	/*
				189	* A css_set is a structure holding pointers to a set of
				190	* cgroup_subsys_state objects. This saves space in the task struct
				191	* object and speeds up fork()/exit(), since a single inc/dec and a
				192	* list_add()/del() can bump the reference count on the entire cgroup
				193	* set for a task.
				194	*/
				195	struct css_set {
				196	/*
				197	* Set of subsystem states, one for each subsystem. This array is
				198	* immutable after creation apart from the init_css_set during
				199	* subsystem registration (at boot time).
				200	*/
				201	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
				202
				203	/* reference count */
				204	refcount_t refcount;
				205
				206	/*
				207	* For a domain cgroup, the following points to self. If threaded,
				208	* to the matching cset of the nearest domain ancestor. The
				209	* dom_cset provides access to the domain cgroup and its csses to
				210	* which domain level resource consumptions should be charged.
				211	*/
				212	struct css_set *dom_cset;
				213
				214	/* the default cgroup associated with this css_set */
				215	struct cgroup *dfl_cgrp;
				216
				217	/* internal task count, protected by css_set_lock */
				218	int nr_tasks;
				219
				220	/*
				221	* Lists running through all tasks using this cgroup group.
				222	* mg_tasks lists tasks which belong to this cset but are in the
				223	* process of being migrated out or in. Protected by
				224	* css_set_rwsem, but, during migration, once tasks are moved to
				225	* mg_tasks, it can be read safely while holding cgroup_mutex.
				226	*/
				227	struct list_head tasks;
				228	struct list_head mg_tasks;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	229	struct list_head dying_tasks;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	230
				231	/* all css_task_iters currently walking this cset */
				232	struct list_head task_iters;
				233
				234	/*
				235	* On the default hierarchy, ->subsys[ssid] may point to a css
				236	* attached to an ancestor instead of the cgroup this css_set is
				237	* associated with. The following node is anchored at
				238	* ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
				239	* iterate through all css's attached to a given cgroup.
				240	*/
				241	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
				242
				243	/* all threaded csets whose ->dom_cset points to this cset */
				244	struct list_head threaded_csets;
				245	struct list_head threaded_csets_node;
				246
				247	/*
				248	* List running through all cgroup groups in the same hash
				249	* slot. Protected by css_set_lock
				250	*/
				251	struct hlist_node hlist;
				252
				253	/*
				254	* List of cgrp_cset_links pointing at cgroups referenced from this
				255	* css_set. Protected by css_set_lock.
				256	*/
				257	struct list_head cgrp_links;
				258
				259	/*
				260	* List of csets participating in the on-going migration either as
				261	* source or destination. Protected by cgroup_mutex.
				262	*/
Olivier Deprez	92d4c21	2022-12-06 15:05:30 +0100	[diff] [blame]	263	struct list_head mg_src_preload_node;
				264	struct list_head mg_dst_preload_node;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	265	struct list_head mg_node;
				266
				267	/*
				268	* If this cset is acting as the source of migration the following
				269	* two fields are set. mg_src_cgrp and mg_dst_cgrp are
				270	* respectively the source and destination cgroups of the on-going
				271	* migration. mg_dst_cset is the destination cset the target tasks
				272	* on this cset should be migrated to. Protected by cgroup_mutex.
				273	*/
				274	struct cgroup *mg_src_cgrp;
				275	struct cgroup *mg_dst_cgrp;
				276	struct css_set *mg_dst_cset;
				277
				278	/* dead and being drained, ignore for migration */
				279	bool dead;
				280
				281	/* For RCU-protected deletion */
				282	struct rcu_head rcu_head;
				283	};
				284
				285	struct cgroup_base_stat {
				286	struct task_cputime cputime;
				287	};
				288
				289	/*
				290	* rstat - cgroup scalable recursive statistics. Accounting is done
				291	* per-cpu in cgroup_rstat_cpu which is then lazily propagated up the
				292	* hierarchy on reads.
				293	*
				294	* When a stat gets updated, the cgroup_rstat_cpu and its ancestors are
				295	* linked into the updated tree. On the following read, propagation only
				296	* considers and consumes the updated tree. This makes reading O(the
				297	* number of descendants which have been active since last read) instead of
				298	* O(the total number of descendants).
				299	*
				300	* This is important because there can be a lot of (draining) cgroups which
				301	* aren't active and stat may be read frequently. The combination can
				302	* become very expensive. By propagating selectively, increasing reading
				303	* frequency decreases the cost of each read.
				304	*
				305	* This struct hosts both the fields which implement the above -
				306	* updated_children and updated_next - and the fields which track basic
				307	* resource statistics on top of it - bsync, bstat and last_bstat.
				308	*/
				309	struct cgroup_rstat_cpu {
				310	/*
				311	* ->bsync protects ->bstat. These are the only fields which get
				312	* updated in the hot path.
				313	*/
				314	struct u64_stats_sync bsync;
				315	struct cgroup_base_stat bstat;
				316
				317	/*
				318	* Snapshots at the last reading. These are used to calculate the
				319	* deltas to propagate to the global counters.
				320	*/
				321	struct cgroup_base_stat last_bstat;
				322
				323	/*
				324	* Child cgroups with stat updates on this cpu since the last read
				325	* are linked on the parent's ->updated_children through
				326	* ->updated_next.
				327	*
				328	* In addition to being more compact, singly-linked list pointing
				329	* to the cgroup makes it unnecessary for each per-cpu struct to
				330	* point back to the associated cgroup.
				331	*
				332	* Protected by per-cpu cgroup_rstat_cpu_lock.
				333	*/
				334	struct cgroup *updated_children; /* terminated by self cgroup */
				335	struct cgroup *updated_next; /* NULL iff not on the list */
				336	};
				337
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	338	struct cgroup_freezer_state {
				339	/* Should the cgroup and its descendants be frozen. */
				340	bool freeze;
				341
				342	/* Should the cgroup actually be frozen? */
				343	int e_freeze;
				344
				345	/* Fields below are protected by css_set_lock */
				346
				347	/* Number of frozen descendant cgroups */
				348	int nr_frozen_descendants;
				349
				350	/*
				351	* Number of tasks, which are counted as frozen:
				352	* frozen, SIGSTOPped, and PTRACEd.
				353	*/
				354	int nr_frozen_tasks;
				355	};
				356
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	357	struct cgroup {
				358	/* self css with NULL ->ss, points back to this cgroup */
				359	struct cgroup_subsys_state self;
				360
				361	unsigned long flags; /* "unsigned long" so bitops work */
				362
				363	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	364	* The depth this cgroup is at. The root is at depth zero and each
				365	* step down the hierarchy increments the level. This along with
				366	* ancestor_ids[] can determine whether a given cgroup is a
				367	* descendant of another without traversing the hierarchy.
				368	*/
				369	int level;
				370
				371	/* Maximum allowed descent tree depth */
				372	int max_depth;
				373
				374	/*
				375	* Keep track of total numbers of visible and dying descent cgroups.
				376	* Dying cgroups are cgroups which were deleted by a user,
				377	* but are still existing because someone else is holding a reference.
				378	* max_descendants is a maximum allowed number of descent cgroups.
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	379	*
				380	* nr_descendants and nr_dying_descendants are protected
				381	* by cgroup_mutex and css_set_lock. It's fine to read them holding
				382	* any of cgroup_mutex and css_set_lock; for writing both locks
				383	* should be held.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	384	*/
				385	int nr_descendants;
				386	int nr_dying_descendants;
				387	int max_descendants;
				388
				389	/*
				390	* Each non-empty css_set associated with this cgroup contributes
				391	* one to nr_populated_csets. The counter is zero iff this cgroup
				392	* doesn't have any tasks.
				393	*
				394	* All children which have non-zero nr_populated_csets and/or
				395	* nr_populated_children of their own contribute one to either
				396	* nr_populated_domain_children or nr_populated_threaded_children
				397	* depending on their type. Each counter is zero iff all cgroups
				398	* of the type in the subtree proper don't have any tasks.
				399	*/
				400	int nr_populated_csets;
				401	int nr_populated_domain_children;
				402	int nr_populated_threaded_children;
				403
				404	int nr_threaded_children; /* # of live threaded child cgroups */
				405
				406	struct kernfs_node *kn; /* cgroup kernfs entry */
				407	struct cgroup_file procs_file; /* handle for "cgroup.procs" */
				408	struct cgroup_file events_file; /* handle for "cgroup.events" */
				409
				410	/*
				411	* The bitmask of subsystems enabled on the child cgroups.
				412	* ->subtree_control is the one configured through
				413	* "cgroup.subtree_control" while ->child_ss_mask is the effective
				414	* one which may have more subsystems enabled. Controller knobs
				415	* are made available iff it's enabled in ->subtree_control.
				416	*/
				417	u16 subtree_control;
				418	u16 subtree_ss_mask;
				419	u16 old_subtree_control;
				420	u16 old_subtree_ss_mask;
				421
				422	/* Private pointers for each registered subsystem */
				423	struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
				424
				425	struct cgroup_root *root;
				426
				427	/*
				428	* List of cgrp_cset_links pointing at css_sets with tasks in this
				429	* cgroup. Protected by css_set_lock.
				430	*/
				431	struct list_head cset_links;
				432
				433	/*
				434	* On the default hierarchy, a css_set for a cgroup with some
				435	* subsys disabled will point to css's which are associated with
				436	* the closest ancestor which has the subsys enabled. The
				437	* following lists all css_sets which point to this cgroup's css
				438	* for the given subsystem.
				439	*/
				440	struct list_head e_csets[CGROUP_SUBSYS_COUNT];
				441
				442	/*
				443	* If !threaded, self. If threaded, it points to the nearest
				444	* domain ancestor. Inside a threaded subtree, cgroups are exempt
				445	* from process granularity and no-internal-task constraint.
				446	* Domain level resource consumptions which aren't tied to a
				447	* specific task are charged to the dom_cgrp.
				448	*/
				449	struct cgroup *dom_cgrp;
				450	struct cgroup *old_dom_cgrp; /* used while enabling threaded */
				451
				452	/* per-cpu recursive resource statistics */
				453	struct cgroup_rstat_cpu __percpu *rstat_cpu;
				454	struct list_head rstat_css_list;
				455
				456	/* cgroup basic resource statistics */
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	457	struct cgroup_base_stat last_bstat;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	458	struct cgroup_base_stat bstat;
				459	struct prev_cputime prev_cputime; /* for printing out cputime */
				460
				461	/*
				462	* list of pidlists, up to two for each namespace (one for procs, one
				463	* for tasks); created on demand.
				464	*/
				465	struct list_head pidlists;
				466	struct mutex pidlist_mutex;
				467
				468	/* used to wait for offlining of csses */
				469	wait_queue_head_t offline_waitq;
				470
				471	/* used to schedule release agent */
				472	struct work_struct release_agent_work;
				473
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	474	/* used to track pressure stalls */
				475	struct psi_group psi;
				476
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	477	/* used to store eBPF programs */
				478	struct cgroup_bpf bpf;
				479
				480	/* If there is block congestion on this cgroup. */
				481	atomic_t congestion_count;
				482
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	483	/* Used to store internal freezer state */
				484	struct cgroup_freezer_state freezer;
				485
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	486	/* ids of the ancestors at each level including self */
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	487	u64 ancestor_ids[];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	488	};
				489
				490	/*
				491	* A cgroup_root represents the root of a cgroup hierarchy, and may be
				492	* associated with a kernfs_root to form an active hierarchy. This is
				493	* internal to cgroup core. Don't access directly from controllers.
				494	*/
				495	struct cgroup_root {
				496	struct kernfs_root *kf_root;
				497
				498	/* The bitmask of subsystems attached to this hierarchy */
				499	unsigned int subsys_mask;
				500
				501	/* Unique id for this hierarchy. */
				502	int hierarchy_id;
				503
				504	/* The root cgroup. Root is destroyed on its release. */
				505	struct cgroup cgrp;
				506
				507	/* for cgrp->ancestor_ids[0] */
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	508	u64 cgrp_ancestor_id_storage;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	509
				510	/* Number of cgroups in the hierarchy, used only for /proc/cgroups */
				511	atomic_t nr_cgrps;
				512
				513	/* A list running through the active hierarchies */
				514	struct list_head root_list;
				515
				516	/* Hierarchy-specific flags */
				517	unsigned int flags;
				518
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	519	/* The path to use for release notifications. */
				520	char release_agent_path[PATH_MAX];
				521
				522	/* The name for this hierarchy - may be empty */
				523	char name[MAX_CGROUP_ROOT_NAMELEN];
				524	};
				525
				526	/*
				527	* struct cftype: handler definitions for cgroup control files
				528	*
				529	* When reading/writing to a file:
				530	* - the cgroup to use is file->f_path.dentry->d_parent->d_fsdata
				531	* - the 'cftype' of the file is file->f_path.dentry->d_fsdata
				532	*/
				533	struct cftype {
				534	/*
				535	* By convention, the name should begin with the name of the
				536	* subsystem, followed by a period. Zero length string indicates
				537	* end of cftype array.
				538	*/
				539	char name[MAX_CFTYPE_NAME];
				540	unsigned long private;
				541
				542	/*
				543	* The maximum length of string, excluding trailing nul, that can
				544	* be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed.
				545	*/
				546	size_t max_write_len;
				547
				548	/* CFTYPE_* flags */
				549	unsigned int flags;
				550
				551	/*
				552	* If non-zero, should contain the offset from the start of css to
				553	* a struct cgroup_file field. cgroup will record the handle of
				554	* the created file into it. The recorded handle can be used as
				555	* long as the containing css remains accessible.
				556	*/
				557	unsigned int file_offset;
				558
				559	/*
				560	* Fields used for internal bookkeeping. Initialized automatically
				561	* during registration.
				562	*/
				563	struct cgroup_subsys *ss; /* NULL for cgroup core files */
				564	struct list_head node; /* anchored at ss->cfts */
				565	struct kernfs_ops *kf_ops;
				566
				567	int (*open)(struct kernfs_open_file *of);
				568	void (*release)(struct kernfs_open_file *of);
				569
				570	/*
				571	* read_u64() is a shortcut for the common case of returning a
				572	* single integer. Use it in place of read()
				573	*/
				574	u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft);
				575	/*
				576	* read_s64() is a signed version of read_u64()
				577	*/
				578	s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
				579
				580	/* generic seq_file read interface */
				581	int (*seq_show)(struct seq_file *sf, void *v);
				582
				583	/* optional ops, implement all or none */
				584	void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
				585	void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
				586	void (*seq_stop)(struct seq_file *sf, void *v);
				587
				588	/*
				589	* write_u64() is a shortcut for the common case of accepting
				590	* a single integer (as parsed by simple_strtoull) from
				591	* userspace. Use in place of write(); return 0 or error.
				592	*/
				593	int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft,
				594	u64 val);
				595	/*
				596	* write_s64() is a signed version of write_u64()
				597	*/
				598	int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft,
				599	s64 val);
				600
				601	/*
				602	* write() is the generic write callback which maps directly to
				603	* kernfs write operation and overrides all other operations.
				604	* Maximum write size is determined by ->max_write_len. Use
				605	* of_css/cft() to access the associated css and cft.
				606	*/
				607	ssize_t (*write)(struct kernfs_open_file *of,
				608	char *buf, size_t nbytes, loff_t off);
				609
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	610	__poll_t (*poll)(struct kernfs_open_file *of,
				611	struct poll_table_struct *pt);
				612
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	613	#ifdef CONFIG_DEBUG_LOCK_ALLOC
				614	struct lock_class_key lockdep_key;
				615	#endif
				616	};
				617
				618	/*
				619	* Control Group subsystem type.
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	620	* See Documentation/admin-guide/cgroup-v1/cgroups.rst for details
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	621	*/
				622	struct cgroup_subsys {
				623	struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
				624	int (*css_online)(struct cgroup_subsys_state *css);
				625	void (*css_offline)(struct cgroup_subsys_state *css);
				626	void (*css_released)(struct cgroup_subsys_state *css);
				627	void (*css_free)(struct cgroup_subsys_state *css);
				628	void (*css_reset)(struct cgroup_subsys_state *css);
				629	void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu);
				630	int (*css_extra_stat_show)(struct seq_file *seq,
				631	struct cgroup_subsys_state *css);
				632
				633	int (*can_attach)(struct cgroup_taskset *tset);
				634	void (*cancel_attach)(struct cgroup_taskset *tset);
				635	void (*attach)(struct cgroup_taskset *tset);
				636	void (*post_attach)(void);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	637	int (*can_fork)(struct task_struct *task,
				638	struct css_set *cset);
				639	void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	640	void (*fork)(struct task_struct *task);
				641	void (*exit)(struct task_struct *task);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	642	void (*release)(struct task_struct *task);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	643	void (*bind)(struct cgroup_subsys_state *root_css);
				644
				645	bool early_init:1;
				646
				647	/*
				648	* If %true, the controller, on the default hierarchy, doesn't show
				649	* up in "cgroup.controllers" or "cgroup.subtree_control", is
				650	* implicitly enabled on all cgroups on the default hierarchy, and
				651	* bypasses the "no internal process" constraint. This is for
				652	* utility type controllers which are transparent to userland.
				653	*
				654	* An implicit controller can be stolen from the default hierarchy
				655	* anytime and thus must be okay with offline csses from previous
				656	* hierarchies coexisting with csses for the current one.
				657	*/
				658	bool implicit_on_dfl:1;
				659
				660	/*
				661	* If %true, the controller supports threaded mode on the default
				662	* hierarchy. In a threaded subtree, both process granularity and
				663	* no-internal-process constraint are ignored and a threaded
				664	* controller should be able to handle that.
				665	*
				666	* Note that as an implicit controller is automatically enabled on
				667	* all cgroups on the default hierarchy, it should also be
				668	* threaded. implicit && !threaded is not supported.
				669	*/
				670	bool threaded:1;
				671
				672	/*
				673	* If %false, this subsystem is properly hierarchical -
				674	* configuration, resource accounting and restriction on a parent
				675	* cgroup cover those of its children. If %true, hierarchy support
				676	* is broken in some ways - some subsystems ignore hierarchy
				677	* completely while others are only implemented half-way.
				678	*
				679	* It's now disallowed to create nested cgroups if the subsystem is
				680	* broken and cgroup core will emit a warning message on such
				681	* cases. Eventually, all subsystems will be made properly
				682	* hierarchical and this will go away.
				683	*/
				684	bool broken_hierarchy:1;
				685	bool warned_broken_hierarchy:1;
				686
				687	/* the following two fields are initialized automatically during boot */
				688	int id;
				689	const char *name;
				690
				691	/* optional, initialized automatically during boot if not set */
				692	const char *legacy_name;
				693
				694	/* link to parent, protected by cgroup_lock() */
				695	struct cgroup_root *root;
				696
				697	/* idr for css->id */
				698	struct idr css_idr;
				699
				700	/*
				701	* List of cftypes. Each entry is the first entry of an array
				702	* terminated by zero length name.
				703	*/
				704	struct list_head cfts;
				705
				706	/*
				707	* Base cftypes which are automatically registered. The two can
				708	* point to the same array.
				709	*/
				710	struct cftype *dfl_cftypes; /* for the default hierarchy */
				711	struct cftype *legacy_cftypes; /* for the legacy hierarchies */
				712
				713	/*
				714	* A subsystem may depend on other subsystems. When such subsystem
				715	* is enabled on a cgroup, the depended-upon subsystems are enabled
				716	* together if available. Subsystems enabled due to dependency are
				717	* not visible to userland until explicitly enabled. The following
				718	* specifies the mask of subsystems that this one depends on.
				719	*/
				720	unsigned int depends_on;
				721	};
				722
				723	extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
				724
				725	/**
				726	* cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
				727	* @tsk: target task
				728	*
				729	* Allows cgroup operations to synchronize against threadgroup changes
				730	* using a percpu_rw_semaphore.
				731	*/
				732	static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
				733	{
				734	percpu_down_read(&cgroup_threadgroup_rwsem);
				735	}
				736
				737	/**
				738	* cgroup_threadgroup_change_end - threadgroup exclusion for cgroups
				739	* @tsk: target task
				740	*
				741	* Counterpart of cgroup_threadgroup_change_begin().
				742	*/
				743	static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
				744	{
				745	percpu_up_read(&cgroup_threadgroup_rwsem);
				746	}
				747
				748	#else /* CONFIG_CGROUPS */
				749
				750	#define CGROUP_SUBSYS_COUNT 0
				751
				752	static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
				753	{
				754	might_sleep();
				755	}
				756
				757	static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
				758
				759	#endif /* CONFIG_CGROUPS */
				760
				761	#ifdef CONFIG_SOCK_CGROUP_DATA
				762
				763	/*
				764	* sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
				765	* per-socket cgroup information except for memcg association.
				766	*
				767	* On legacy hierarchies, net_prio and net_cls controllers directly set
				768	* attributes on each sock which can then be tested by the network layer.
				769	* On the default hierarchy, each sock is associated with the cgroup it was
				770	* created in and the networking layer can match the cgroup directly.
				771	*
				772	* To avoid carrying all three cgroup related fields separately in sock,
				773	* sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer.
				774	* On boot, sock_cgroup_data records the cgroup that the sock was created
				775	* in so that cgroup2 matches can be made; however, once either net_prio or
				776	* net_cls starts being used, the area is overridden to carry prioidx and/or
				777	* classid. The two modes are distinguished by whether the lowest bit is
				778	* set. Clear bit indicates cgroup pointer while set bit prioidx and
				779	* classid.
				780	*
				781	* While userland may start using net_prio or net_cls at any time, once
				782	* either is used, cgroup2 matching no longer works. There is no reason to
				783	* mix the two and this is in line with how legacy and v2 compatibility is
				784	* handled. On mode switch, cgroup references which are already being
				785	* pointed to by socks may be leaked. While this can be remedied by adding
				786	* synchronization around sock_cgroup_data, given that the number of leaked
				787	* cgroups is bound and highly unlikely to be high, this seems to be the
				788	* better trade-off.
				789	*/
				790	struct sock_cgroup_data {
				791	union {
				792	#ifdef __LITTLE_ENDIAN
				793	struct {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	794	u8 is_data : 1;
				795	u8 no_refcnt : 1;
				796	u8 unused : 6;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	797	u8 padding;
				798	u16 prioidx;
				799	u32 classid;
				800	} __packed;
				801	#else
				802	struct {
				803	u32 classid;
				804	u16 prioidx;
				805	u8 padding;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	806	u8 unused : 6;
				807	u8 no_refcnt : 1;
				808	u8 is_data : 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	809	} __packed;
				810	#endif
				811	u64 val;
				812	};
				813	};
				814
				815	/*
				816	* There's a theoretical window where the following accessors race with
				817	* updaters and return part of the previous pointer as the prioidx or
				818	* classid. Such races are short-lived and the result isn't critical.
				819	*/
				820	static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
				821	{
				822	/* fallback to 1 which is always the ID of the root cgroup */
				823	return (skcd->is_data & 1) ? skcd->prioidx : 1;
				824	}
				825
				826	static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
				827	{
				828	/* fallback to 0 which is the unconfigured default classid */
				829	return (skcd->is_data & 1) ? skcd->classid : 0;
				830	}
				831
				832	/*
				833	* If invoked concurrently, the updaters may clobber each other. The
				834	* caller is responsible for synchronization.
				835	*/
				836	static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
				837	u16 prioidx)
				838	{
				839	struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
				840
				841	if (sock_cgroup_prioidx(&skcd_buf) == prioidx)
				842	return;
				843
				844	if (!(skcd_buf.is_data & 1)) {
				845	skcd_buf.val = 0;
				846	skcd_buf.is_data = 1;
				847	}
				848
				849	skcd_buf.prioidx = prioidx;
				850	WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
				851	}
				852
				853	static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
				854	u32 classid)
				855	{
				856	struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
				857
				858	if (sock_cgroup_classid(&skcd_buf) == classid)
				859	return;
				860
				861	if (!(skcd_buf.is_data & 1)) {
				862	skcd_buf.val = 0;
				863	skcd_buf.is_data = 1;
				864	}
				865
				866	skcd_buf.classid = classid;
				867	WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
				868	}
				869
				870	#else /* CONFIG_SOCK_CGROUP_DATA */
				871
				872	struct sock_cgroup_data {
				873	};
				874
				875	#endif /* CONFIG_SOCK_CGROUP_DATA */
				876
				877	#endif /* _LINUX_CGROUP_DEFS_H */